#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. The trigger for the modification was
# the observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress the
# AVX512F capability flag [at least on Skylake-X], the conversion serves
# as a kind of "investment protection". Note that the next *lake
# processor, Cannon Lake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors; in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than the integer-only code only on older Intel P4 and
#	Core processors, by 30-50% (less so the newer the processor), while
#	being slower on contemporary ones, for example almost 2x slower on
#	Atom; as the former are naturally disappearing, SSE2 is deemed
#	unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;
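# For readers cross-checking the assembly below, here is a minimal
# reference model of what poly1305_init/_blocks/_emit compute together.
# This is a hedged sketch in plain Perl, relying on the core Math::BigInt
# module (a version providing from_bytes/to_bytes); the sub and all names
# in it are illustrative only and are never called by this generator.

sub poly1305_reference_sketch {
	my ($key,$msg,$nonce) = @_;	# 16-byte r half of key, message, 16-byte nonce
	require Math::BigInt;

	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);		# 2^130-5
	my $r = Math::BigInt->from_bytes(scalar reverse $key);		# little-endian
	$r->band(Math::BigInt->from_hex("0ffffffc0ffffffc0ffffffc0fffffff"));	# clamp
	my $h = Math::BigInt->bzero();

	for (my $i=0; $i<length($msg); $i+=16) {
		my $blk = substr($msg,$i,16);
		my $m   = Math::BigInt->from_bytes(scalar reverse $blk);
		$m->badd(Math::BigInt->bone()->blsft(8*length($blk)));	# pad bit
		$h = ($h+$m)*$r % $p;					# one poly1305_blocks step
	}

	$h->badd(Math::BigInt->from_bytes(scalar reverse $nonce));	# poly1305_emit
	my $tag = ("\0" x 16).$h->to_bytes();				# big-endian, padded
	return scalar reverse substr($tag,-16);				# 16-byte little-endian tag
}
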
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
	$avx += 2 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");

sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	 mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	 mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	 mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	 mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	 mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}
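# In other words, the iteration above multiplies the 130-bit hash by the
# 128-bit key modulo 2^130-5, keeping everything in three 64-bit limbs.
# A hedged reference sketch in plain Perl with Math::BigInt (names
# illustrative, limbs passed as Math::BigInt objects, never called here):
#
#	sub iteration_ref {
#	    my ($h0,$h1,$h2,$r0,$r1) = @_;
#	    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
#	    my $h = Math::BigInt->new($h2)->blsft(64)->badd($h1)->blsft(64)->badd($h0);
#	    my $r = Math::BigInt->new($r1)->blsft(64)->badd($r0);
#	    return $h->bmul($r)->bmod($p);	# what $h0-$h2 hold afterwards
#	}
#
# The precomputed s1 = r1 + (r1 >> 2) exploits 2^130 = 5 (mod 2^130-5):
# the clamped r1 is divisible by 4, so r1*2^128 = r1*5/4 = s1 (mod p),
# and partial products that would land at weight 2^128 can be folded back
# to weight 2^0 by multiplying with s1 instead of r1.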

########################################################################
# The layout of the opaque area is as follows.
#
# unsigned __int64 h[3];		# current hash value base 2^64
# unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,\@function,3
.align	32
poly1305_init:
.cfi_startproc
	xor	%rax,%rax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key

	lea	poly1305_blocks(%rip),%r10
	lea	poly1305_emit(%rip),%r11
___
$code.=<<___	if ($avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___	if ($avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___	if ($avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___	if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	ret
.cfi_endproc
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,\@function,4
.align	32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___
	&poly1305_iteration();
$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbp
.cfi_restore	%rbp
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	ret
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,\@function,3
.align	32
poly1305_emit:
.cfi_startproc
.Lemit:
	mov	0($ctx),%r8		# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8			# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10		# did 130-bit value overflow?
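	# What the cmovnz pair that follows implements (a hedged sketch,
	# pseudo-code only): compute t = h + 5; if t carries into bit 130
	# then h >= 2^130-5, so the reduced value is t with that bit dropped
	# (i.e. h - (2^130-5)); otherwise it is h unchanged. Either way only
	# the low 128 bits enter the tag:
	#
	#   t = h + 5
	#   h = (t >> 130) ? t : h		# h mod 2^130-5
	#   tag = (h + nonce) mod 2^128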
315 cmovnz %r8,%rax 316 cmovnz %r9,%rcx 317 318 add 0($nonce),%rax # accumulate nonce 319 adc 8($nonce),%rcx 320 mov %rax,0($mac) # write result 321 mov %rcx,8($mac) 322 323 ret 324.cfi_endproc 325.size poly1305_emit,.-poly1305_emit 326___ 327if ($avx) { 328 329######################################################################## 330# Layout of opaque area is following. 331# 332# unsigned __int32 h[5]; # current hash value base 2^26 333# unsigned __int32 is_base2_26; 334# unsigned __int64 r[2]; # key value base 2^64 335# unsigned __int64 pad; 336# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; 337# 338# where r^n are base 2^26 digits of degrees of multiplier key. There are 339# 5 digits, but last four are interleaved with multiples of 5, totalling 340# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 341 342my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = 343 map("%xmm$_",(0..15)); 344 345$code.=<<___; 346.type __poly1305_block,\@abi-omnipotent 347.align 32 348__poly1305_block: 349.cfi_startproc 350___ 351 &poly1305_iteration(); 352$code.=<<___; 353 ret 354.cfi_endproc 355.size __poly1305_block,.-__poly1305_block 356 357.type __poly1305_init_avx,\@abi-omnipotent 358.align 32 359__poly1305_init_avx: 360.cfi_startproc 361 mov $r0,$h0 362 mov $r1,$h1 363 xor $h2,$h2 364 365 lea 48+64($ctx),$ctx # size optimization 366 367 mov $r1,%rax 368 call __poly1305_block # r^2 369 370 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 371 mov \$0x3ffffff,%edx 372 mov $h0,$d1 373 and $h0#d,%eax 374 mov $r0,$d2 375 and $r0#d,%edx 376 mov %eax,`16*0+0-64`($ctx) 377 shr \$26,$d1 378 mov %edx,`16*0+4-64`($ctx) 379 shr \$26,$d2 380 381 mov \$0x3ffffff,%eax 382 mov \$0x3ffffff,%edx 383 and $d1#d,%eax 384 and $d2#d,%edx 385 mov %eax,`16*1+0-64`($ctx) 386 lea (%rax,%rax,4),%eax # *5 387 mov %edx,`16*1+4-64`($ctx) 388 lea (%rdx,%rdx,4),%edx # *5 389 mov %eax,`16*2+0-64`($ctx) 390 shr \$26,$d1 391 mov %edx,`16*2+4-64`($ctx) 392 shr \$26,$d2 393 394 mov $h1,%rax 395 mov $r1,%rdx 396 shl \$12,%rax 397 shl \$12,%rdx 398 or $d1,%rax 399 or $d2,%rdx 400 and \$0x3ffffff,%eax 401 and \$0x3ffffff,%edx 402 mov %eax,`16*3+0-64`($ctx) 403 lea (%rax,%rax,4),%eax # *5 404 mov %edx,`16*3+4-64`($ctx) 405 lea (%rdx,%rdx,4),%edx # *5 406 mov %eax,`16*4+0-64`($ctx) 407 mov $h1,$d1 408 mov %edx,`16*4+4-64`($ctx) 409 mov $r1,$d2 410 411 mov \$0x3ffffff,%eax 412 mov \$0x3ffffff,%edx 413 shr \$14,$d1 414 shr \$14,$d2 415 and $d1#d,%eax 416 and $d2#d,%edx 417 mov %eax,`16*5+0-64`($ctx) 418 lea (%rax,%rax,4),%eax # *5 419 mov %edx,`16*5+4-64`($ctx) 420 lea (%rdx,%rdx,4),%edx # *5 421 mov %eax,`16*6+0-64`($ctx) 422 shr \$26,$d1 423 mov %edx,`16*6+4-64`($ctx) 424 shr \$26,$d2 425 426 mov $h2,%rax 427 shl \$24,%rax 428 or %rax,$d1 429 mov $d1#d,`16*7+0-64`($ctx) 430 lea ($d1,$d1,4),$d1 # *5 431 mov $d2#d,`16*7+4-64`($ctx) 432 lea ($d2,$d2,4),$d2 # *5 433 mov $d1#d,`16*8+0-64`($ctx) 434 mov $d2#d,`16*8+4-64`($ctx) 435 436 mov $r1,%rax 437 call __poly1305_block # r^3 438 439 mov \$0x3ffffff,%eax # save r^3 base 2^26 440 mov $h0,$d1 441 and $h0#d,%eax 442 shr \$26,$d1 443 mov %eax,`16*0+12-64`($ctx) 444 445 mov \$0x3ffffff,%edx 446 and $d1#d,%edx 447 mov %edx,`16*1+12-64`($ctx) 448 lea (%rdx,%rdx,4),%edx # *5 449 shr \$26,$d1 450 mov %edx,`16*2+12-64`($ctx) 451 452 mov $h1,%rax 453 shl \$12,%rax 454 or $d1,%rax 455 and \$0x3ffffff,%eax 456 mov %eax,`16*3+12-64`($ctx) 457 lea (%rax,%rax,4),%eax # *5 458 mov $h1,$d1 459 mov %eax,`16*4+12-64`($ctx) 460 461 mov \$0x3ffffff,%edx 462 shr 
\$14,$d1 463 and $d1#d,%edx 464 mov %edx,`16*5+12-64`($ctx) 465 lea (%rdx,%rdx,4),%edx # *5 466 shr \$26,$d1 467 mov %edx,`16*6+12-64`($ctx) 468 469 mov $h2,%rax 470 shl \$24,%rax 471 or %rax,$d1 472 mov $d1#d,`16*7+12-64`($ctx) 473 lea ($d1,$d1,4),$d1 # *5 474 mov $d1#d,`16*8+12-64`($ctx) 475 476 mov $r1,%rax 477 call __poly1305_block # r^4 478 479 mov \$0x3ffffff,%eax # save r^4 base 2^26 480 mov $h0,$d1 481 and $h0#d,%eax 482 shr \$26,$d1 483 mov %eax,`16*0+8-64`($ctx) 484 485 mov \$0x3ffffff,%edx 486 and $d1#d,%edx 487 mov %edx,`16*1+8-64`($ctx) 488 lea (%rdx,%rdx,4),%edx # *5 489 shr \$26,$d1 490 mov %edx,`16*2+8-64`($ctx) 491 492 mov $h1,%rax 493 shl \$12,%rax 494 or $d1,%rax 495 and \$0x3ffffff,%eax 496 mov %eax,`16*3+8-64`($ctx) 497 lea (%rax,%rax,4),%eax # *5 498 mov $h1,$d1 499 mov %eax,`16*4+8-64`($ctx) 500 501 mov \$0x3ffffff,%edx 502 shr \$14,$d1 503 and $d1#d,%edx 504 mov %edx,`16*5+8-64`($ctx) 505 lea (%rdx,%rdx,4),%edx # *5 506 shr \$26,$d1 507 mov %edx,`16*6+8-64`($ctx) 508 509 mov $h2,%rax 510 shl \$24,%rax 511 or %rax,$d1 512 mov $d1#d,`16*7+8-64`($ctx) 513 lea ($d1,$d1,4),$d1 # *5 514 mov $d1#d,`16*8+8-64`($ctx) 515 516 lea -48-64($ctx),$ctx # size [de-]optimization 517 ret 518.cfi_endproc 519.size __poly1305_init_avx,.-__poly1305_init_avx 520 521.type poly1305_blocks_avx,\@function,4 522.align 32 523poly1305_blocks_avx: 524.cfi_startproc 525 mov 20($ctx),%r8d # is_base2_26 526 cmp \$128,$len 527 jae .Lblocks_avx 528 test %r8d,%r8d 529 jz .Lblocks 530 531.Lblocks_avx: 532 and \$-16,$len 533 jz .Lno_data_avx 534 535 vzeroupper 536 537 test %r8d,%r8d 538 jz .Lbase2_64_avx 539 540 test \$31,$len 541 jz .Leven_avx 542 543 push %rbx 544.cfi_push %rbx 545 push %rbp 546.cfi_push %rbp 547 push %r12 548.cfi_push %r12 549 push %r13 550.cfi_push %r13 551 push %r14 552.cfi_push %r14 553 push %r15 554.cfi_push %r15 555.Lblocks_avx_body: 556 557 mov $len,%r15 # reassign $len 558 559 mov 0($ctx),$d1 # load hash value 560 mov 8($ctx),$d2 561 mov 16($ctx),$h2#d 562 563 mov 24($ctx),$r0 # load r 564 mov 32($ctx),$s1 565 566 ################################# base 2^26 -> base 2^64 567 mov $d1#d,$h0#d 568 and \$`-1*(1<<31)`,$d1 569 mov $d2,$r1 # borrow $r1 570 mov $d2#d,$h1#d 571 and \$`-1*(1<<31)`,$d2 572 573 shr \$6,$d1 574 shl \$52,$r1 575 add $d1,$h0 576 shr \$12,$h1 577 shr \$18,$d2 578 add $r1,$h0 579 adc $d2,$h1 580 581 mov $h2,$d1 582 shl \$40,$d1 583 shr \$24,$h2 584 add $d1,$h1 585 adc \$0,$h2 # can be partially reduced... 586 587 mov \$-4,$d2 # ... 
so reduce 588 mov $h2,$d1 589 and $h2,$d2 590 shr \$2,$d1 591 and \$3,$h2 592 add $d2,$d1 # =*5 593 add $d1,$h0 594 adc \$0,$h1 595 adc \$0,$h2 596 597 mov $s1,$r1 598 mov $s1,%rax 599 shr \$2,$s1 600 add $r1,$s1 # s1 = r1 + (r1 >> 2) 601 602 add 0($inp),$h0 # accumulate input 603 adc 8($inp),$h1 604 lea 16($inp),$inp 605 adc $padbit,$h2 606 607 call __poly1305_block 608 609 test $padbit,$padbit # if $padbit is zero, 610 jz .Lstore_base2_64_avx # store hash in base 2^64 format 611 612 ################################# base 2^64 -> base 2^26 613 mov $h0,%rax 614 mov $h0,%rdx 615 shr \$52,$h0 616 mov $h1,$r0 617 mov $h1,$r1 618 shr \$26,%rdx 619 and \$0x3ffffff,%rax # h[0] 620 shl \$12,$r0 621 and \$0x3ffffff,%rdx # h[1] 622 shr \$14,$h1 623 or $r0,$h0 624 shl \$24,$h2 625 and \$0x3ffffff,$h0 # h[2] 626 shr \$40,$r1 627 and \$0x3ffffff,$h1 # h[3] 628 or $r1,$h2 # h[4] 629 630 sub \$16,%r15 631 jz .Lstore_base2_26_avx 632 633 vmovd %rax#d,$H0 634 vmovd %rdx#d,$H1 635 vmovd $h0#d,$H2 636 vmovd $h1#d,$H3 637 vmovd $h2#d,$H4 638 jmp .Lproceed_avx 639 640.align 32 641.Lstore_base2_64_avx: 642 mov $h0,0($ctx) 643 mov $h1,8($ctx) 644 mov $h2,16($ctx) # note that is_base2_26 is zeroed 645 jmp .Ldone_avx 646 647.align 16 648.Lstore_base2_26_avx: 649 mov %rax#d,0($ctx) # store hash value base 2^26 650 mov %rdx#d,4($ctx) 651 mov $h0#d,8($ctx) 652 mov $h1#d,12($ctx) 653 mov $h2#d,16($ctx) 654.align 16 655.Ldone_avx: 656 mov 0(%rsp),%r15 657.cfi_restore %r15 658 mov 8(%rsp),%r14 659.cfi_restore %r14 660 mov 16(%rsp),%r13 661.cfi_restore %r13 662 mov 24(%rsp),%r12 663.cfi_restore %r12 664 mov 32(%rsp),%rbp 665.cfi_restore %rbp 666 mov 40(%rsp),%rbx 667.cfi_restore %rbx 668 lea 48(%rsp),%rsp 669.cfi_adjust_cfa_offset -48 670.Lno_data_avx: 671.Lblocks_avx_epilogue: 672 ret 673.cfi_endproc 674 675.align 32 676.Lbase2_64_avx: 677.cfi_startproc 678 push %rbx 679.cfi_push %rbx 680 push %rbp 681.cfi_push %rbp 682 push %r12 683.cfi_push %r12 684 push %r13 685.cfi_push %r13 686 push %r14 687.cfi_push %r14 688 push %r15 689.cfi_push %r15 690.Lbase2_64_avx_body: 691 692 mov $len,%r15 # reassign $len 693 694 mov 24($ctx),$r0 # load r 695 mov 32($ctx),$s1 696 697 mov 0($ctx),$h0 # load hash value 698 mov 8($ctx),$h1 699 mov 16($ctx),$h2#d 700 701 mov $s1,$r1 702 mov $s1,%rax 703 shr \$2,$s1 704 add $r1,$s1 # s1 = r1 + (r1 >> 2) 705 706 test \$31,$len 707 jz .Linit_avx 708 709 add 0($inp),$h0 # accumulate input 710 adc 8($inp),$h1 711 lea 16($inp),$inp 712 adc $padbit,$h2 713 sub \$16,%r15 714 715 call __poly1305_block 716 717.Linit_avx: 718 ################################# base 2^64 -> base 2^26 719 mov $h0,%rax 720 mov $h0,%rdx 721 shr \$52,$h0 722 mov $h1,$d1 723 mov $h1,$d2 724 shr \$26,%rdx 725 and \$0x3ffffff,%rax # h[0] 726 shl \$12,$d1 727 and \$0x3ffffff,%rdx # h[1] 728 shr \$14,$h1 729 or $d1,$h0 730 shl \$24,$h2 731 and \$0x3ffffff,$h0 # h[2] 732 shr \$40,$d2 733 and \$0x3ffffff,$h1 # h[3] 734 or $d2,$h2 # h[4] 735 736 vmovd %rax#d,$H0 737 vmovd %rdx#d,$H1 738 vmovd $h0#d,$H2 739 vmovd $h1#d,$H3 740 vmovd $h2#d,$H4 741 movl \$1,20($ctx) # set is_base2_26 742 743 call __poly1305_init_avx 744 745.Lproceed_avx: 746 mov %r15,$len 747 748 mov 0(%rsp),%r15 749.cfi_restore %r15 750 mov 8(%rsp),%r14 751.cfi_restore %r14 752 mov 16(%rsp),%r13 753.cfi_restore %r13 754 mov 24(%rsp),%r12 755.cfi_restore %r12 756 mov 32(%rsp),%rbp 757.cfi_restore %rbp 758 mov 40(%rsp),%rbx 759.cfi_restore %rbx 760 lea 48(%rsp),%rax 761 lea 48(%rsp),%rsp 762.cfi_adjust_cfa_offset -48 763.Lbase2_64_avx_epilogue: 764 jmp .Ldo_avx 
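	# The base 2^64 -> base 2^26 conversion above (.Linit_avx) splits the
	# 130-bit value h2:h1:h0 into five 26-bit limbs (a hedged sketch,
	# pseudo-code only):
	#
	#   h[0] =  h0        & 0x3ffffff
	#   h[1] = (h0 >> 26) & 0x3ffffff
	#   h[2] = (h0 >> 52 | h1 << 12) & 0x3ffffff
	#   h[3] = (h1 >> 14) & 0x3ffffff
	#   h[4] =  h1 >> 40  | h2 << 24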
765.cfi_endproc 766 767.align 32 768.Leven_avx: 769.cfi_startproc 770 vmovd 4*0($ctx),$H0 # load hash value 771 vmovd 4*1($ctx),$H1 772 vmovd 4*2($ctx),$H2 773 vmovd 4*3($ctx),$H3 774 vmovd 4*4($ctx),$H4 775 776.Ldo_avx: 777___ 778$code.=<<___ if (!$win64); 779 lea -0x58(%rsp),%r11 780.cfi_def_cfa %r11,0x60 781 sub \$0x178,%rsp 782___ 783$code.=<<___ if ($win64); 784 lea -0xf8(%rsp),%r11 785 sub \$0x218,%rsp 786 vmovdqa %xmm6,0x50(%r11) 787 vmovdqa %xmm7,0x60(%r11) 788 vmovdqa %xmm8,0x70(%r11) 789 vmovdqa %xmm9,0x80(%r11) 790 vmovdqa %xmm10,0x90(%r11) 791 vmovdqa %xmm11,0xa0(%r11) 792 vmovdqa %xmm12,0xb0(%r11) 793 vmovdqa %xmm13,0xc0(%r11) 794 vmovdqa %xmm14,0xd0(%r11) 795 vmovdqa %xmm15,0xe0(%r11) 796.Ldo_avx_body: 797___ 798$code.=<<___; 799 sub \$64,$len 800 lea -32($inp),%rax 801 cmovc %rax,$inp 802 803 vmovdqu `16*3`($ctx),$D4 # preload r0^2 804 lea `16*3+64`($ctx),$ctx # size optimization 805 lea .Lconst(%rip),%rcx 806 807 ################################################################ 808 # load input 809 vmovdqu 16*2($inp),$T0 810 vmovdqu 16*3($inp),$T1 811 vmovdqa 64(%rcx),$MASK # .Lmask26 812 813 vpsrldq \$6,$T0,$T2 # splat input 814 vpsrldq \$6,$T1,$T3 815 vpunpckhqdq $T1,$T0,$T4 # 4 816 vpunpcklqdq $T1,$T0,$T0 # 0:1 817 vpunpcklqdq $T3,$T2,$T3 # 2:3 818 819 vpsrlq \$40,$T4,$T4 # 4 820 vpsrlq \$26,$T0,$T1 821 vpand $MASK,$T0,$T0 # 0 822 vpsrlq \$4,$T3,$T2 823 vpand $MASK,$T1,$T1 # 1 824 vpsrlq \$30,$T3,$T3 825 vpand $MASK,$T2,$T2 # 2 826 vpand $MASK,$T3,$T3 # 3 827 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 828 829 jbe .Lskip_loop_avx 830 831 # expand and copy pre-calculated table to stack 832 vmovdqu `16*1-64`($ctx),$D1 833 vmovdqu `16*2-64`($ctx),$D2 834 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 835 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 836 vmovdqa $D3,-0x90(%r11) 837 vmovdqa $D0,0x00(%rsp) 838 vpshufd \$0xEE,$D1,$D4 839 vmovdqu `16*3-64`($ctx),$D0 840 vpshufd \$0x44,$D1,$D1 841 vmovdqa $D4,-0x80(%r11) 842 vmovdqa $D1,0x10(%rsp) 843 vpshufd \$0xEE,$D2,$D3 844 vmovdqu `16*4-64`($ctx),$D1 845 vpshufd \$0x44,$D2,$D2 846 vmovdqa $D3,-0x70(%r11) 847 vmovdqa $D2,0x20(%rsp) 848 vpshufd \$0xEE,$D0,$D4 849 vmovdqu `16*5-64`($ctx),$D2 850 vpshufd \$0x44,$D0,$D0 851 vmovdqa $D4,-0x60(%r11) 852 vmovdqa $D0,0x30(%rsp) 853 vpshufd \$0xEE,$D1,$D3 854 vmovdqu `16*6-64`($ctx),$D0 855 vpshufd \$0x44,$D1,$D1 856 vmovdqa $D3,-0x50(%r11) 857 vmovdqa $D1,0x40(%rsp) 858 vpshufd \$0xEE,$D2,$D4 859 vmovdqu `16*7-64`($ctx),$D1 860 vpshufd \$0x44,$D2,$D2 861 vmovdqa $D4,-0x40(%r11) 862 vmovdqa $D2,0x50(%rsp) 863 vpshufd \$0xEE,$D0,$D3 864 vmovdqu `16*8-64`($ctx),$D2 865 vpshufd \$0x44,$D0,$D0 866 vmovdqa $D3,-0x30(%r11) 867 vmovdqa $D0,0x60(%rsp) 868 vpshufd \$0xEE,$D1,$D4 869 vpshufd \$0x44,$D1,$D1 870 vmovdqa $D4,-0x20(%r11) 871 vmovdqa $D1,0x70(%rsp) 872 vpshufd \$0xEE,$D2,$D3 873 vmovdqa 0x00(%rsp),$D4 # preload r0^2 874 vpshufd \$0x44,$D2,$D2 875 vmovdqa $D3,-0x10(%r11) 876 vmovdqa $D2,0x80(%rsp) 877 878 jmp .Loop_avx 879 880.align 32 881.Loop_avx: 882 ################################################################ 883 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 884 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 885 # \___________________/ 886 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 887 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 888 # \___________________/ \____________________/ 889 # 890 # Note that we start with inp[2:3]*r^2. This is because it 891 # doesn't depend on reduction in previous iteration. 
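	# Structurally (a hedged sketch in scalar pseudo-code, illustrative
	# only): the loop keeps two interleaved Horner accumulators, one over
	# even-numbered blocks and one over odd-numbered ones; per iteration
	# each accumulator is scaled by r^4 and absorbs its two new blocks
	# with weights r^2 and 1,
	#
	#   acc_even = acc_even*r^4 + m_even1*r^2 + m_even2
	#   acc_odd  = acc_odd *r^4 + m_odd1 *r^2 + m_odd2
	#
	# and in the tail the lanes are given decreasing final weights
	# (e.g. r^2 for the even lane and r for the odd one) so that their
	# sum equals the sequential Horner result.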
892 ################################################################ 893 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 894 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 895 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 896 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 897 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 898 # 899 # though note that $Tx and $Hx are "reversed" in this section, 900 # and $D4 is preloaded with r0^2... 901 902 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 903 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 904 vmovdqa $H2,0x20(%r11) # offload hash 905 vpmuludq $T2,$D4,$D2 # d3 = h2*r0 906 vmovdqa 0x10(%rsp),$H2 # r1^2 907 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 908 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 909 910 vmovdqa $H0,0x00(%r11) # 911 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 912 vmovdqa $H1,0x10(%r11) # 913 vpmuludq $T3,$H2,$H1 # h3*r1 914 vpaddq $H0,$D0,$D0 # d0 += h4*s1 915 vpaddq $H1,$D4,$D4 # d4 += h3*r1 916 vmovdqa $H3,0x30(%r11) # 917 vpmuludq $T2,$H2,$H0 # h2*r1 918 vpmuludq $T1,$H2,$H1 # h1*r1 919 vpaddq $H0,$D3,$D3 # d3 += h2*r1 920 vmovdqa 0x30(%rsp),$H3 # r2^2 921 vpaddq $H1,$D2,$D2 # d2 += h1*r1 922 vmovdqa $H4,0x40(%r11) # 923 vpmuludq $T0,$H2,$H2 # h0*r1 924 vpmuludq $T2,$H3,$H0 # h2*r2 925 vpaddq $H2,$D1,$D1 # d1 += h0*r1 926 927 vmovdqa 0x40(%rsp),$H4 # s2^2 928 vpaddq $H0,$D4,$D4 # d4 += h2*r2 929 vpmuludq $T1,$H3,$H1 # h1*r2 930 vpmuludq $T0,$H3,$H3 # h0*r2 931 vpaddq $H1,$D3,$D3 # d3 += h1*r2 932 vmovdqa 0x50(%rsp),$H2 # r3^2 933 vpaddq $H3,$D2,$D2 # d2 += h0*r2 934 vpmuludq $T4,$H4,$H0 # h4*s2 935 vpmuludq $T3,$H4,$H4 # h3*s2 936 vpaddq $H0,$D1,$D1 # d1 += h4*s2 937 vmovdqa 0x60(%rsp),$H3 # s3^2 938 vpaddq $H4,$D0,$D0 # d0 += h3*s2 939 940 vmovdqa 0x80(%rsp),$H4 # s4^2 941 vpmuludq $T1,$H2,$H1 # h1*r3 942 vpmuludq $T0,$H2,$H2 # h0*r3 943 vpaddq $H1,$D4,$D4 # d4 += h1*r3 944 vpaddq $H2,$D3,$D3 # d3 += h0*r3 945 vpmuludq $T4,$H3,$H0 # h4*s3 946 vpmuludq $T3,$H3,$H1 # h3*s3 947 vpaddq $H0,$D2,$D2 # d2 += h4*s3 948 vmovdqu 16*0($inp),$H0 # load input 949 vpaddq $H1,$D1,$D1 # d1 += h3*s3 950 vpmuludq $T2,$H3,$H3 # h2*s3 951 vpmuludq $T2,$H4,$T2 # h2*s4 952 vpaddq $H3,$D0,$D0 # d0 += h2*s3 953 954 vmovdqu 16*1($inp),$H1 # 955 vpaddq $T2,$D1,$D1 # d1 += h2*s4 956 vpmuludq $T3,$H4,$T3 # h3*s4 957 vpmuludq $T4,$H4,$T4 # h4*s4 958 vpsrldq \$6,$H0,$H2 # splat input 959 vpaddq $T3,$D2,$D2 # d2 += h3*s4 960 vpaddq $T4,$D3,$D3 # d3 += h4*s4 961 vpsrldq \$6,$H1,$H3 # 962 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 963 vpmuludq $T1,$H4,$T0 # h1*s4 964 vpunpckhqdq $H1,$H0,$H4 # 4 965 vpaddq $T4,$D4,$D4 # d4 += h0*r4 966 vmovdqa -0x90(%r11),$T4 # r0^4 967 vpaddq $T0,$D0,$D0 # d0 += h1*s4 968 969 vpunpcklqdq $H1,$H0,$H0 # 0:1 970 vpunpcklqdq $H3,$H2,$H3 # 2:3 971 972 #vpsrlq \$40,$H4,$H4 # 4 973 vpsrldq \$`40/8`,$H4,$H4 # 4 974 vpsrlq \$26,$H0,$H1 975 vpand $MASK,$H0,$H0 # 0 976 vpsrlq \$4,$H3,$H2 977 vpand $MASK,$H1,$H1 # 1 978 vpand 0(%rcx),$H4,$H4 # .Lmask24 979 vpsrlq \$30,$H3,$H3 980 vpand $MASK,$H2,$H2 # 2 981 vpand $MASK,$H3,$H3 # 3 982 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 983 984 vpaddq 0x00(%r11),$H0,$H0 # add hash value 985 vpaddq 0x10(%r11),$H1,$H1 986 vpaddq 0x20(%r11),$H2,$H2 987 vpaddq 0x30(%r11),$H3,$H3 988 vpaddq 0x40(%r11),$H4,$H4 989 990 lea 16*2($inp),%rax 991 lea 16*4($inp),$inp 992 sub \$64,$len 993 cmovc %rax,$inp 994 995 ################################################################ 996 # Now we accumulate (inp[0:1]+hash)*r^4 997 ################################################################ 998 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 999 # d3 = h3*r0 + 
h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1000 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1001 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1002 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1003 1004 vpmuludq $H0,$T4,$T0 # h0*r0 1005 vpmuludq $H1,$T4,$T1 # h1*r0 1006 vpaddq $T0,$D0,$D0 1007 vpaddq $T1,$D1,$D1 1008 vmovdqa -0x80(%r11),$T2 # r1^4 1009 vpmuludq $H2,$T4,$T0 # h2*r0 1010 vpmuludq $H3,$T4,$T1 # h3*r0 1011 vpaddq $T0,$D2,$D2 1012 vpaddq $T1,$D3,$D3 1013 vpmuludq $H4,$T4,$T4 # h4*r0 1014 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1015 vpaddq $T4,$D4,$D4 1016 1017 vpaddq $T0,$D0,$D0 # d0 += h4*s1 1018 vpmuludq $H2,$T2,$T1 # h2*r1 1019 vpmuludq $H3,$T2,$T0 # h3*r1 1020 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1021 vmovdqa -0x60(%r11),$T3 # r2^4 1022 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1023 vpmuludq $H1,$T2,$T1 # h1*r1 1024 vpmuludq $H0,$T2,$T2 # h0*r1 1025 vpaddq $T1,$D2,$D2 # d2 += h1*r1 1026 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1027 1028 vmovdqa -0x50(%r11),$T4 # s2^4 1029 vpmuludq $H2,$T3,$T0 # h2*r2 1030 vpmuludq $H1,$T3,$T1 # h1*r2 1031 vpaddq $T0,$D4,$D4 # d4 += h2*r2 1032 vpaddq $T1,$D3,$D3 # d3 += h1*r2 1033 vmovdqa -0x40(%r11),$T2 # r3^4 1034 vpmuludq $H0,$T3,$T3 # h0*r2 1035 vpmuludq $H4,$T4,$T0 # h4*s2 1036 vpaddq $T3,$D2,$D2 # d2 += h0*r2 1037 vpaddq $T0,$D1,$D1 # d1 += h4*s2 1038 vmovdqa -0x30(%r11),$T3 # s3^4 1039 vpmuludq $H3,$T4,$T4 # h3*s2 1040 vpmuludq $H1,$T2,$T1 # h1*r3 1041 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1042 1043 vmovdqa -0x10(%r11),$T4 # s4^4 1044 vpaddq $T1,$D4,$D4 # d4 += h1*r3 1045 vpmuludq $H0,$T2,$T2 # h0*r3 1046 vpmuludq $H4,$T3,$T0 # h4*s3 1047 vpaddq $T2,$D3,$D3 # d3 += h0*r3 1048 vpaddq $T0,$D2,$D2 # d2 += h4*s3 1049 vmovdqu 16*2($inp),$T0 # load input 1050 vpmuludq $H3,$T3,$T2 # h3*s3 1051 vpmuludq $H2,$T3,$T3 # h2*s3 1052 vpaddq $T2,$D1,$D1 # d1 += h3*s3 1053 vmovdqu 16*3($inp),$T1 # 1054 vpaddq $T3,$D0,$D0 # d0 += h2*s3 1055 1056 vpmuludq $H2,$T4,$H2 # h2*s4 1057 vpmuludq $H3,$T4,$H3 # h3*s4 1058 vpsrldq \$6,$T0,$T2 # splat input 1059 vpaddq $H2,$D1,$D1 # d1 += h2*s4 1060 vpmuludq $H4,$T4,$H4 # h4*s4 1061 vpsrldq \$6,$T1,$T3 # 1062 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1063 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1064 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1065 vpmuludq $H1,$T4,$H0 1066 vpunpckhqdq $T1,$T0,$T4 # 4 1067 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1068 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1069 1070 vpunpcklqdq $T1,$T0,$T0 # 0:1 1071 vpunpcklqdq $T3,$T2,$T3 # 2:3 1072 1073 #vpsrlq \$40,$T4,$T4 # 4 1074 vpsrldq \$`40/8`,$T4,$T4 # 4 1075 vpsrlq \$26,$T0,$T1 1076 vmovdqa 0x00(%rsp),$D4 # preload r0^2 1077 vpand $MASK,$T0,$T0 # 0 1078 vpsrlq \$4,$T3,$T2 1079 vpand $MASK,$T1,$T1 # 1 1080 vpand 0(%rcx),$T4,$T4 # .Lmask24 1081 vpsrlq \$30,$T3,$T3 1082 vpand $MASK,$T2,$T2 # 2 1083 vpand $MASK,$T3,$T3 # 3 1084 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1085 1086 ################################################################ 1087 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1088 # and P. 
Schwabe 1089 1090 vpsrlq \$26,$H3,$D3 1091 vpand $MASK,$H3,$H3 1092 vpaddq $D3,$H4,$H4 # h3 -> h4 1093 1094 vpsrlq \$26,$H0,$D0 1095 vpand $MASK,$H0,$H0 1096 vpaddq $D0,$D1,$H1 # h0 -> h1 1097 1098 vpsrlq \$26,$H4,$D0 1099 vpand $MASK,$H4,$H4 1100 1101 vpsrlq \$26,$H1,$D1 1102 vpand $MASK,$H1,$H1 1103 vpaddq $D1,$H2,$H2 # h1 -> h2 1104 1105 vpaddq $D0,$H0,$H0 1106 vpsllq \$2,$D0,$D0 1107 vpaddq $D0,$H0,$H0 # h4 -> h0 1108 1109 vpsrlq \$26,$H2,$D2 1110 vpand $MASK,$H2,$H2 1111 vpaddq $D2,$H3,$H3 # h2 -> h3 1112 1113 vpsrlq \$26,$H0,$D0 1114 vpand $MASK,$H0,$H0 1115 vpaddq $D0,$H1,$H1 # h0 -> h1 1116 1117 vpsrlq \$26,$H3,$D3 1118 vpand $MASK,$H3,$H3 1119 vpaddq $D3,$H4,$H4 # h3 -> h4 1120 1121 ja .Loop_avx 1122 1123.Lskip_loop_avx: 1124 ################################################################ 1125 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1126 1127 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1128 add \$32,$len 1129 jnz .Long_tail_avx 1130 1131 vpaddq $H2,$T2,$T2 1132 vpaddq $H0,$T0,$T0 1133 vpaddq $H1,$T1,$T1 1134 vpaddq $H3,$T3,$T3 1135 vpaddq $H4,$T4,$T4 1136 1137.Long_tail_avx: 1138 vmovdqa $H2,0x20(%r11) 1139 vmovdqa $H0,0x00(%r11) 1140 vmovdqa $H1,0x10(%r11) 1141 vmovdqa $H3,0x30(%r11) 1142 vmovdqa $H4,0x40(%r11) 1143 1144 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1145 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1146 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1147 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1148 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1149 1150 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1151 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1152 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1153 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1154 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1155 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1156 1157 vpmuludq $T3,$H2,$H0 # h3*r1 1158 vpaddq $H0,$D4,$D4 # d4 += h3*r1 1159 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1160 vpmuludq $T2,$H2,$H1 # h2*r1 1161 vpaddq $H1,$D3,$D3 # d3 += h2*r1 1162 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1163 vpmuludq $T1,$H2,$H0 # h1*r1 1164 vpaddq $H0,$D2,$D2 # d2 += h1*r1 1165 vpmuludq $T0,$H2,$H2 # h0*r1 1166 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1167 vpmuludq $T4,$H3,$H3 # h4*s1 1168 vpaddq $H3,$D0,$D0 # d0 += h4*s1 1169 1170 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1171 vpmuludq $T2,$H4,$H1 # h2*r2 1172 vpaddq $H1,$D4,$D4 # d4 += h2*r2 1173 vpmuludq $T1,$H4,$H0 # h1*r2 1174 vpaddq $H0,$D3,$D3 # d3 += h1*r2 1175 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1176 vpmuludq $T0,$H4,$H4 # h0*r2 1177 vpaddq $H4,$D2,$D2 # d2 += h0*r2 1178 vpmuludq $T4,$H2,$H1 # h4*s2 1179 vpaddq $H1,$D1,$D1 # d1 += h4*s2 1180 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1181 vpmuludq $T3,$H2,$H2 # h3*s2 1182 vpaddq $H2,$D0,$D0 # d0 += h3*s2 1183 1184 vpmuludq $T1,$H3,$H0 # h1*r3 1185 vpaddq $H0,$D4,$D4 # d4 += h1*r3 1186 vpmuludq $T0,$H3,$H3 # h0*r3 1187 vpaddq $H3,$D3,$D3 # d3 += h0*r3 1188 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1189 vpmuludq $T4,$H4,$H1 # h4*s3 1190 vpaddq $H1,$D2,$D2 # d2 += h4*s3 1191 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1192 vpmuludq $T3,$H4,$H0 # h3*s3 1193 vpaddq $H0,$D1,$D1 # d1 += h3*s3 1194 vpmuludq $T2,$H4,$H4 # h2*s3 1195 vpaddq $H4,$D0,$D0 # d0 += h2*s3 1196 1197 vpmuludq $T0,$H2,$H2 # h0*r4 1198 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1199 vpmuludq $T4,$H3,$H1 # h4*s4 1200 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 1201 vpmuludq $T3,$H3,$H0 # h3*s4 1202 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1203 vpmuludq $T2,$H3,$H1 # h2*s4 1204 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1205 vpmuludq $T1,$H3,$H3 # h1*s4 1206 vpaddq 
$H3,$D0,$D0 # h0 = d0 + h1*s4 1207 1208 jz .Lshort_tail_avx 1209 1210 vmovdqu 16*0($inp),$H0 # load input 1211 vmovdqu 16*1($inp),$H1 1212 1213 vpsrldq \$6,$H0,$H2 # splat input 1214 vpsrldq \$6,$H1,$H3 1215 vpunpckhqdq $H1,$H0,$H4 # 4 1216 vpunpcklqdq $H1,$H0,$H0 # 0:1 1217 vpunpcklqdq $H3,$H2,$H3 # 2:3 1218 1219 vpsrlq \$40,$H4,$H4 # 4 1220 vpsrlq \$26,$H0,$H1 1221 vpand $MASK,$H0,$H0 # 0 1222 vpsrlq \$4,$H3,$H2 1223 vpand $MASK,$H1,$H1 # 1 1224 vpsrlq \$30,$H3,$H3 1225 vpand $MASK,$H2,$H2 # 2 1226 vpand $MASK,$H3,$H3 # 3 1227 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1228 1229 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1230 vpaddq 0x00(%r11),$H0,$H0 1231 vpaddq 0x10(%r11),$H1,$H1 1232 vpaddq 0x20(%r11),$H2,$H2 1233 vpaddq 0x30(%r11),$H3,$H3 1234 vpaddq 0x40(%r11),$H4,$H4 1235 1236 ################################################################ 1237 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1238 1239 vpmuludq $H0,$T4,$T0 # h0*r0 1240 vpaddq $T0,$D0,$D0 # d0 += h0*r0 1241 vpmuludq $H1,$T4,$T1 # h1*r0 1242 vpaddq $T1,$D1,$D1 # d1 += h1*r0 1243 vpmuludq $H2,$T4,$T0 # h2*r0 1244 vpaddq $T0,$D2,$D2 # d2 += h2*r0 1245 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1246 vpmuludq $H3,$T4,$T1 # h3*r0 1247 vpaddq $T1,$D3,$D3 # d3 += h3*r0 1248 vpmuludq $H4,$T4,$T4 # h4*r0 1249 vpaddq $T4,$D4,$D4 # d4 += h4*r0 1250 1251 vpmuludq $H3,$T2,$T0 # h3*r1 1252 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1253 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1254 vpmuludq $H2,$T2,$T1 # h2*r1 1255 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1256 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1257 vpmuludq $H1,$T2,$T0 # h1*r1 1258 vpaddq $T0,$D2,$D2 # d2 += h1*r1 1259 vpmuludq $H0,$T2,$T2 # h0*r1 1260 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1261 vpmuludq $H4,$T3,$T3 # h4*s1 1262 vpaddq $T3,$D0,$D0 # d0 += h4*s1 1263 1264 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1265 vpmuludq $H2,$T4,$T1 # h2*r2 1266 vpaddq $T1,$D4,$D4 # d4 += h2*r2 1267 vpmuludq $H1,$T4,$T0 # h1*r2 1268 vpaddq $T0,$D3,$D3 # d3 += h1*r2 1269 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1270 vpmuludq $H0,$T4,$T4 # h0*r2 1271 vpaddq $T4,$D2,$D2 # d2 += h0*r2 1272 vpmuludq $H4,$T2,$T1 # h4*s2 1273 vpaddq $T1,$D1,$D1 # d1 += h4*s2 1274 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1275 vpmuludq $H3,$T2,$T2 # h3*s2 1276 vpaddq $T2,$D0,$D0 # d0 += h3*s2 1277 1278 vpmuludq $H1,$T3,$T0 # h1*r3 1279 vpaddq $T0,$D4,$D4 # d4 += h1*r3 1280 vpmuludq $H0,$T3,$T3 # h0*r3 1281 vpaddq $T3,$D3,$D3 # d3 += h0*r3 1282 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1283 vpmuludq $H4,$T4,$T1 # h4*s3 1284 vpaddq $T1,$D2,$D2 # d2 += h4*s3 1285 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1286 vpmuludq $H3,$T4,$T0 # h3*s3 1287 vpaddq $T0,$D1,$D1 # d1 += h3*s3 1288 vpmuludq $H2,$T4,$T4 # h2*s3 1289 vpaddq $T4,$D0,$D0 # d0 += h2*s3 1290 1291 vpmuludq $H0,$T2,$T2 # h0*r4 1292 vpaddq $T2,$D4,$D4 # d4 += h0*r4 1293 vpmuludq $H4,$T3,$T1 # h4*s4 1294 vpaddq $T1,$D3,$D3 # d3 += h4*s4 1295 vpmuludq $H3,$T3,$T0 # h3*s4 1296 vpaddq $T0,$D2,$D2 # d2 += h3*s4 1297 vpmuludq $H2,$T3,$T1 # h2*s4 1298 vpaddq $T1,$D1,$D1 # d1 += h2*s4 1299 vpmuludq $H1,$T3,$T3 # h1*s4 1300 vpaddq $T3,$D0,$D0 # d0 += h1*s4 1301 1302.Lshort_tail_avx: 1303 ################################################################ 1304 # horizontal addition 1305 1306 vpsrldq \$8,$D4,$T4 1307 vpsrldq \$8,$D3,$T3 1308 vpsrldq \$8,$D1,$T1 1309 vpsrldq \$8,$D0,$T0 1310 vpsrldq \$8,$D2,$T2 1311 vpaddq $T3,$D3,$D3 1312 vpaddq $T4,$D4,$D4 1313 vpaddq $T0,$D0,$D0 1314 vpaddq $T1,$D1,$D1 1315 vpaddq $T2,$D2,$D2 1316 1317 
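	# At this point each of d0..d4 holds two 64-bit partial sums, one per
	# lane; the vpsrldq/vpaddq pairs above fold them together (a hedged
	# sketch, pseudo-code only):
	#
	#   for each limb i in 0..4:  d[i] = low64(d[i]) + high64(d[i])
	#
	# leaving a single base 2^26 result (with carries pending) that the
	# lazy reduction below normalizes.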
################################################################ 1318 # lazy reduction 1319 1320 vpsrlq \$26,$D3,$H3 1321 vpand $MASK,$D3,$D3 1322 vpaddq $H3,$D4,$D4 # h3 -> h4 1323 1324 vpsrlq \$26,$D0,$H0 1325 vpand $MASK,$D0,$D0 1326 vpaddq $H0,$D1,$D1 # h0 -> h1 1327 1328 vpsrlq \$26,$D4,$H4 1329 vpand $MASK,$D4,$D4 1330 1331 vpsrlq \$26,$D1,$H1 1332 vpand $MASK,$D1,$D1 1333 vpaddq $H1,$D2,$D2 # h1 -> h2 1334 1335 vpaddq $H4,$D0,$D0 1336 vpsllq \$2,$H4,$H4 1337 vpaddq $H4,$D0,$D0 # h4 -> h0 1338 1339 vpsrlq \$26,$D2,$H2 1340 vpand $MASK,$D2,$D2 1341 vpaddq $H2,$D3,$D3 # h2 -> h3 1342 1343 vpsrlq \$26,$D0,$H0 1344 vpand $MASK,$D0,$D0 1345 vpaddq $H0,$D1,$D1 # h0 -> h1 1346 1347 vpsrlq \$26,$D3,$H3 1348 vpand $MASK,$D3,$D3 1349 vpaddq $H3,$D4,$D4 # h3 -> h4 1350 1351 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 1352 vmovd $D1,`4*1-48-64`($ctx) 1353 vmovd $D2,`4*2-48-64`($ctx) 1354 vmovd $D3,`4*3-48-64`($ctx) 1355 vmovd $D4,`4*4-48-64`($ctx) 1356___ 1357$code.=<<___ if ($win64); 1358 vmovdqa 0x50(%r11),%xmm6 1359 vmovdqa 0x60(%r11),%xmm7 1360 vmovdqa 0x70(%r11),%xmm8 1361 vmovdqa 0x80(%r11),%xmm9 1362 vmovdqa 0x90(%r11),%xmm10 1363 vmovdqa 0xa0(%r11),%xmm11 1364 vmovdqa 0xb0(%r11),%xmm12 1365 vmovdqa 0xc0(%r11),%xmm13 1366 vmovdqa 0xd0(%r11),%xmm14 1367 vmovdqa 0xe0(%r11),%xmm15 1368 lea 0xf8(%r11),%rsp 1369.Ldo_avx_epilogue: 1370___ 1371$code.=<<___ if (!$win64); 1372 lea 0x58(%r11),%rsp 1373.cfi_def_cfa %rsp,8 1374___ 1375$code.=<<___; 1376 vzeroupper 1377 ret 1378.cfi_endproc 1379.size poly1305_blocks_avx,.-poly1305_blocks_avx 1380 1381.type poly1305_emit_avx,\@function,3 1382.align 32 1383poly1305_emit_avx: 1384.cfi_startproc 1385 cmpl \$0,20($ctx) # is_base2_26? 1386 je .Lemit 1387 1388 mov 0($ctx),%eax # load hash value base 2^26 1389 mov 4($ctx),%ecx 1390 mov 8($ctx),%r8d 1391 mov 12($ctx),%r11d 1392 mov 16($ctx),%r10d 1393 1394 shl \$26,%rcx # base 2^26 -> base 2^64 1395 mov %r8,%r9 1396 shl \$52,%r8 1397 add %rcx,%rax 1398 shr \$12,%r9 1399 add %rax,%r8 # h0 1400 adc \$0,%r9 1401 1402 shl \$14,%r11 1403 mov %r10,%rax 1404 shr \$24,%r10 1405 add %r11,%r9 1406 shl \$40,%rax 1407 add %rax,%r9 # h1 1408 adc \$0,%r10 # h2 1409 1410 mov %r10,%rax # could be partially reduced, so reduce 1411 mov %r10,%rcx 1412 and \$3,%r10 1413 shr \$2,%rax 1414 and \$-4,%rcx 1415 add %rcx,%rax 1416 add %rax,%r8 1417 adc \$0,%r9 1418 adc \$0,%r10 1419 1420 mov %r8,%rax 1421 add \$5,%r8 # compare to modulus 1422 mov %r9,%rcx 1423 adc \$0,%r9 1424 adc \$0,%r10 1425 shr \$2,%r10 # did 130-bit value overflow? 
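	# For reference, the recombination performed above when is_base2_26 is
	# set packs the five 26-bit limbs l0..l4 back into a 130-bit value
	# before the usual final reduction (a hedged sketch, pseudo-code only):
	#
	#   h0 = l0 | l1 << 26 | l2 << 52		# low 64 bits
	#   h1 = l2 >> 12 | l3 << 14 | l4 << 40		# next 64 bits
	#   h2 = l4 >> 24				# top bits, folded mod 2^130-5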
1426 cmovnz %r8,%rax 1427 cmovnz %r9,%rcx 1428 1429 add 0($nonce),%rax # accumulate nonce 1430 adc 8($nonce),%rcx 1431 mov %rax,0($mac) # write result 1432 mov %rcx,8($mac) 1433 1434 ret 1435.cfi_endproc 1436.size poly1305_emit_avx,.-poly1305_emit_avx 1437___ 1438 1439if ($avx>1) { 1440my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 1441 map("%ymm$_",(0..15)); 1442my $S4=$MASK; 1443 1444$code.=<<___; 1445.type poly1305_blocks_avx2,\@function,4 1446.align 32 1447poly1305_blocks_avx2: 1448.cfi_startproc 1449 mov 20($ctx),%r8d # is_base2_26 1450 cmp \$128,$len 1451 jae .Lblocks_avx2 1452 test %r8d,%r8d 1453 jz .Lblocks 1454 1455.Lblocks_avx2: 1456 and \$-16,$len 1457 jz .Lno_data_avx2 1458 1459 vzeroupper 1460 1461 test %r8d,%r8d 1462 jz .Lbase2_64_avx2 1463 1464 test \$63,$len 1465 jz .Leven_avx2 1466 1467 push %rbx 1468.cfi_push %rbx 1469 push %rbp 1470.cfi_push %rbp 1471 push %r12 1472.cfi_push %r12 1473 push %r13 1474.cfi_push %r13 1475 push %r14 1476.cfi_push %r14 1477 push %r15 1478.cfi_push %r15 1479.Lblocks_avx2_body: 1480 1481 mov $len,%r15 # reassign $len 1482 1483 mov 0($ctx),$d1 # load hash value 1484 mov 8($ctx),$d2 1485 mov 16($ctx),$h2#d 1486 1487 mov 24($ctx),$r0 # load r 1488 mov 32($ctx),$s1 1489 1490 ################################# base 2^26 -> base 2^64 1491 mov $d1#d,$h0#d 1492 and \$`-1*(1<<31)`,$d1 1493 mov $d2,$r1 # borrow $r1 1494 mov $d2#d,$h1#d 1495 and \$`-1*(1<<31)`,$d2 1496 1497 shr \$6,$d1 1498 shl \$52,$r1 1499 add $d1,$h0 1500 shr \$12,$h1 1501 shr \$18,$d2 1502 add $r1,$h0 1503 adc $d2,$h1 1504 1505 mov $h2,$d1 1506 shl \$40,$d1 1507 shr \$24,$h2 1508 add $d1,$h1 1509 adc \$0,$h2 # can be partially reduced... 1510 1511 mov \$-4,$d2 # ... so reduce 1512 mov $h2,$d1 1513 and $h2,$d2 1514 shr \$2,$d1 1515 and \$3,$h2 1516 add $d2,$d1 # =*5 1517 add $d1,$h0 1518 adc \$0,$h1 1519 adc \$0,$h2 1520 1521 mov $s1,$r1 1522 mov $s1,%rax 1523 shr \$2,$s1 1524 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1525 1526.Lbase2_26_pre_avx2: 1527 add 0($inp),$h0 # accumulate input 1528 adc 8($inp),$h1 1529 lea 16($inp),$inp 1530 adc $padbit,$h2 1531 sub \$16,%r15 1532 1533 call __poly1305_block 1534 mov $r1,%rax 1535 1536 test \$63,%r15 1537 jnz .Lbase2_26_pre_avx2 1538 1539 test $padbit,$padbit # if $padbit is zero, 1540 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format 1541 1542 ################################# base 2^64 -> base 2^26 1543 mov $h0,%rax 1544 mov $h0,%rdx 1545 shr \$52,$h0 1546 mov $h1,$r0 1547 mov $h1,$r1 1548 shr \$26,%rdx 1549 and \$0x3ffffff,%rax # h[0] 1550 shl \$12,$r0 1551 and \$0x3ffffff,%rdx # h[1] 1552 shr \$14,$h1 1553 or $r0,$h0 1554 shl \$24,$h2 1555 and \$0x3ffffff,$h0 # h[2] 1556 shr \$40,$r1 1557 and \$0x3ffffff,$h1 # h[3] 1558 or $r1,$h2 # h[4] 1559 1560 test %r15,%r15 1561 jz .Lstore_base2_26_avx2 1562 1563 vmovd %rax#d,%x#$H0 1564 vmovd %rdx#d,%x#$H1 1565 vmovd $h0#d,%x#$H2 1566 vmovd $h1#d,%x#$H3 1567 vmovd $h2#d,%x#$H4 1568 jmp .Lproceed_avx2 1569 1570.align 32 1571.Lstore_base2_64_avx2: 1572 mov $h0,0($ctx) 1573 mov $h1,8($ctx) 1574 mov $h2,16($ctx) # note that is_base2_26 is zeroed 1575 jmp .Ldone_avx2 1576 1577.align 16 1578.Lstore_base2_26_avx2: 1579 mov %rax#d,0($ctx) # store hash value base 2^26 1580 mov %rdx#d,4($ctx) 1581 mov $h0#d,8($ctx) 1582 mov $h1#d,12($ctx) 1583 mov $h2#d,16($ctx) 1584.align 16 1585.Ldone_avx2: 1586 mov 0(%rsp),%r15 1587.cfi_restore %r15 1588 mov 8(%rsp),%r14 1589.cfi_restore %r14 1590 mov 16(%rsp),%r13 1591.cfi_restore %r13 1592 mov 24(%rsp),%r12 1593.cfi_restore %r12 1594 mov 
32(%rsp),%rbp 1595.cfi_restore %rbp 1596 mov 40(%rsp),%rbx 1597.cfi_restore %rbx 1598 lea 48(%rsp),%rsp 1599.cfi_adjust_cfa_offset -48 1600.Lno_data_avx2: 1601.Lblocks_avx2_epilogue: 1602 ret 1603.cfi_endproc 1604 1605.align 32 1606.Lbase2_64_avx2: 1607.cfi_startproc 1608 push %rbx 1609.cfi_push %rbx 1610 push %rbp 1611.cfi_push %rbp 1612 push %r12 1613.cfi_push %r12 1614 push %r13 1615.cfi_push %r13 1616 push %r14 1617.cfi_push %r14 1618 push %r15 1619.cfi_push %r15 1620.Lbase2_64_avx2_body: 1621 1622 mov $len,%r15 # reassign $len 1623 1624 mov 24($ctx),$r0 # load r 1625 mov 32($ctx),$s1 1626 1627 mov 0($ctx),$h0 # load hash value 1628 mov 8($ctx),$h1 1629 mov 16($ctx),$h2#d 1630 1631 mov $s1,$r1 1632 mov $s1,%rax 1633 shr \$2,$s1 1634 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1635 1636 test \$63,$len 1637 jz .Linit_avx2 1638 1639.Lbase2_64_pre_avx2: 1640 add 0($inp),$h0 # accumulate input 1641 adc 8($inp),$h1 1642 lea 16($inp),$inp 1643 adc $padbit,$h2 1644 sub \$16,%r15 1645 1646 call __poly1305_block 1647 mov $r1,%rax 1648 1649 test \$63,%r15 1650 jnz .Lbase2_64_pre_avx2 1651 1652.Linit_avx2: 1653 ################################# base 2^64 -> base 2^26 1654 mov $h0,%rax 1655 mov $h0,%rdx 1656 shr \$52,$h0 1657 mov $h1,$d1 1658 mov $h1,$d2 1659 shr \$26,%rdx 1660 and \$0x3ffffff,%rax # h[0] 1661 shl \$12,$d1 1662 and \$0x3ffffff,%rdx # h[1] 1663 shr \$14,$h1 1664 or $d1,$h0 1665 shl \$24,$h2 1666 and \$0x3ffffff,$h0 # h[2] 1667 shr \$40,$d2 1668 and \$0x3ffffff,$h1 # h[3] 1669 or $d2,$h2 # h[4] 1670 1671 vmovd %rax#d,%x#$H0 1672 vmovd %rdx#d,%x#$H1 1673 vmovd $h0#d,%x#$H2 1674 vmovd $h1#d,%x#$H3 1675 vmovd $h2#d,%x#$H4 1676 movl \$1,20($ctx) # set is_base2_26 1677 1678 call __poly1305_init_avx 1679 1680.Lproceed_avx2: 1681 mov %r15,$len # restore $len 1682 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1683 mov \$`(1<<31|1<<30|1<<16)`,%r11d 1684 1685 mov 0(%rsp),%r15 1686.cfi_restore %r15 1687 mov 8(%rsp),%r14 1688.cfi_restore %r14 1689 mov 16(%rsp),%r13 1690.cfi_restore %r13 1691 mov 24(%rsp),%r12 1692.cfi_restore %r12 1693 mov 32(%rsp),%rbp 1694.cfi_restore %rbp 1695 mov 40(%rsp),%rbx 1696.cfi_restore %rbx 1697 lea 48(%rsp),%rax 1698 lea 48(%rsp),%rsp 1699.cfi_adjust_cfa_offset -48 1700.Lbase2_64_avx2_epilogue: 1701 jmp .Ldo_avx2 1702.cfi_endproc 1703 1704.align 32 1705.Leven_avx2: 1706.cfi_startproc 1707 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1708 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 1709 vmovd 4*1($ctx),%x#$H1 1710 vmovd 4*2($ctx),%x#$H2 1711 vmovd 4*3($ctx),%x#$H3 1712 vmovd 4*4($ctx),%x#$H4 1713 1714.Ldo_avx2: 1715___ 1716$code.=<<___ if ($avx>2); 1717 cmp \$512,$len 1718 jb .Lskip_avx512 1719 and %r11d,%r10d 1720 test \$`1<<16`,%r10d # check for AVX512F 1721 jnz .Lblocks_avx512 1722.Lskip_avx512: 1723___ 1724$code.=<<___ if (!$win64); 1725 lea -8(%rsp),%r11 1726.cfi_def_cfa %r11,16 1727 sub \$0x128,%rsp 1728___ 1729$code.=<<___ if ($win64); 1730 lea -0xf8(%rsp),%r11 1731 sub \$0x1c8,%rsp 1732 vmovdqa %xmm6,0x50(%r11) 1733 vmovdqa %xmm7,0x60(%r11) 1734 vmovdqa %xmm8,0x70(%r11) 1735 vmovdqa %xmm9,0x80(%r11) 1736 vmovdqa %xmm10,0x90(%r11) 1737 vmovdqa %xmm11,0xa0(%r11) 1738 vmovdqa %xmm12,0xb0(%r11) 1739 vmovdqa %xmm13,0xc0(%r11) 1740 vmovdqa %xmm14,0xd0(%r11) 1741 vmovdqa %xmm15,0xe0(%r11) 1742.Ldo_avx2_body: 1743___ 1744$code.=<<___; 1745 lea .Lconst(%rip),%rcx 1746 lea 48+64($ctx),$ctx # size optimization 1747 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 1748 1749 # expand and copy pre-calculated table to stack 1750 vmovdqu `16*0-64`($ctx),%x#$T2 1751 and \$-512,%rsp 1752 vmovdqu 
`16*1-64`($ctx),%x#$T3 1753 vmovdqu `16*2-64`($ctx),%x#$T4 1754 vmovdqu `16*3-64`($ctx),%x#$D0 1755 vmovdqu `16*4-64`($ctx),%x#$D1 1756 vmovdqu `16*5-64`($ctx),%x#$D2 1757 lea 0x90(%rsp),%rax # size optimization 1758 vmovdqu `16*6-64`($ctx),%x#$D3 1759 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 1760 vmovdqu `16*7-64`($ctx),%x#$D4 1761 vpermd $T3,$T0,$T3 1762 vmovdqu `16*8-64`($ctx),%x#$MASK 1763 vpermd $T4,$T0,$T4 1764 vmovdqa $T2,0x00(%rsp) 1765 vpermd $D0,$T0,$D0 1766 vmovdqa $T3,0x20-0x90(%rax) 1767 vpermd $D1,$T0,$D1 1768 vmovdqa $T4,0x40-0x90(%rax) 1769 vpermd $D2,$T0,$D2 1770 vmovdqa $D0,0x60-0x90(%rax) 1771 vpermd $D3,$T0,$D3 1772 vmovdqa $D1,0x80-0x90(%rax) 1773 vpermd $D4,$T0,$D4 1774 vmovdqa $D2,0xa0-0x90(%rax) 1775 vpermd $MASK,$T0,$MASK 1776 vmovdqa $D3,0xc0-0x90(%rax) 1777 vmovdqa $D4,0xe0-0x90(%rax) 1778 vmovdqa $MASK,0x100-0x90(%rax) 1779 vmovdqa 64(%rcx),$MASK # .Lmask26 1780 1781 ################################################################ 1782 # load input 1783 vmovdqu 16*0($inp),%x#$T0 1784 vmovdqu 16*1($inp),%x#$T1 1785 vinserti128 \$1,16*2($inp),$T0,$T0 1786 vinserti128 \$1,16*3($inp),$T1,$T1 1787 lea 16*4($inp),$inp 1788 1789 vpsrldq \$6,$T0,$T2 # splat input 1790 vpsrldq \$6,$T1,$T3 1791 vpunpckhqdq $T1,$T0,$T4 # 4 1792 vpunpcklqdq $T3,$T2,$T2 # 2:3 1793 vpunpcklqdq $T1,$T0,$T0 # 0:1 1794 1795 vpsrlq \$30,$T2,$T3 1796 vpsrlq \$4,$T2,$T2 1797 vpsrlq \$26,$T0,$T1 1798 vpsrlq \$40,$T4,$T4 # 4 1799 vpand $MASK,$T2,$T2 # 2 1800 vpand $MASK,$T0,$T0 # 0 1801 vpand $MASK,$T1,$T1 # 1 1802 vpand $MASK,$T3,$T3 # 3 1803 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1804 1805 vpaddq $H2,$T2,$H2 # accumulate input 1806 sub \$64,$len 1807 jz .Ltail_avx2 1808 jmp .Loop_avx2 1809 1810.align 32 1811.Loop_avx2: 1812 ################################################################ 1813 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1814 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1815 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1816 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1817 # \________/\__________/ 1818 ################################################################ 1819 #vpaddq $H2,$T2,$H2 # accumulate input 1820 vpaddq $H0,$T0,$H0 1821 vmovdqa `32*0`(%rsp),$T0 # r0^4 1822 vpaddq $H1,$T1,$H1 1823 vmovdqa `32*1`(%rsp),$T1 # r1^4 1824 vpaddq $H3,$T3,$H3 1825 vmovdqa `32*3`(%rsp),$T2 # r2^4 1826 vpaddq $H4,$T4,$H4 1827 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1828 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 1829 1830 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1831 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1832 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1833 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1834 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1835 # 1836 # however, as h2 is "chronologically" first one available pull 1837 # corresponding operations up, so it's 1838 # 1839 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1840 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1841 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1842 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1843 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1844 1845 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1846 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1847 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1848 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1849 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1850 1851 vpmuludq $H0,$T1,$T4 # h0*r1 1852 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1853 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1854 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1855 vpmuludq $H3,$T1,$T4 # h3*r1 1856 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1857 
vpaddq $T4,$D4,$D4 # d4 += h3*r1 1858 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1859 vmovdqa `32*4-0x90`(%rax),$T1 # s2 1860 1861 vpmuludq $H0,$T0,$T4 # h0*r0 1862 vpmuludq $H1,$T0,$H2 # h1*r0 1863 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1864 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1865 vpmuludq $H3,$T0,$T4 # h3*r0 1866 vpmuludq $H4,$T0,$H2 # h4*r0 1867 vmovdqu 16*0($inp),%x#$T0 # load input 1868 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1869 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1870 vinserti128 \$1,16*2($inp),$T0,$T0 1871 1872 vpmuludq $H3,$T1,$T4 # h3*s2 1873 vpmuludq $H4,$T1,$H2 # h4*s2 1874 vmovdqu 16*1($inp),%x#$T1 1875 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1876 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1877 vmovdqa `32*5-0x90`(%rax),$H2 # r3 1878 vpmuludq $H1,$T2,$T4 # h1*r2 1879 vpmuludq $H0,$T2,$T2 # h0*r2 1880 vpaddq $T4,$D3,$D3 # d3 += h1*r2 1881 vpaddq $T2,$D2,$D2 # d2 += h0*r2 1882 vinserti128 \$1,16*3($inp),$T1,$T1 1883 lea 16*4($inp),$inp 1884 1885 vpmuludq $H1,$H2,$T4 # h1*r3 1886 vpmuludq $H0,$H2,$H2 # h0*r3 1887 vpsrldq \$6,$T0,$T2 # splat input 1888 vpaddq $T4,$D4,$D4 # d4 += h1*r3 1889 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1890 vpmuludq $H3,$T3,$T4 # h3*s3 1891 vpmuludq $H4,$T3,$H2 # h4*s3 1892 vpsrldq \$6,$T1,$T3 1893 vpaddq $T4,$D1,$D1 # d1 += h3*s3 1894 vpaddq $H2,$D2,$D2 # d2 += h4*s3 1895 vpunpckhqdq $T1,$T0,$T4 # 4 1896 1897 vpmuludq $H3,$S4,$H3 # h3*s4 1898 vpmuludq $H4,$S4,$H4 # h4*s4 1899 vpunpcklqdq $T1,$T0,$T0 # 0:1 1900 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 1901 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 1902 vpunpcklqdq $T3,$T2,$T3 # 2:3 1903 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 1904 vpmuludq $H1,$S4,$H0 # h1*s4 1905 vmovdqa 64(%rcx),$MASK # .Lmask26 1906 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1907 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1908 1909 ################################################################ 1910 # lazy reduction (interleaved with tail of input splat) 1911 1912 vpsrlq \$26,$H3,$D3 1913 vpand $MASK,$H3,$H3 1914 vpaddq $D3,$H4,$H4 # h3 -> h4 1915 1916 vpsrlq \$26,$H0,$D0 1917 vpand $MASK,$H0,$H0 1918 vpaddq $D0,$D1,$H1 # h0 -> h1 1919 1920 vpsrlq \$26,$H4,$D4 1921 vpand $MASK,$H4,$H4 1922 1923 vpsrlq \$4,$T3,$T2 1924 1925 vpsrlq \$26,$H1,$D1 1926 vpand $MASK,$H1,$H1 1927 vpaddq $D1,$H2,$H2 # h1 -> h2 1928 1929 vpaddq $D4,$H0,$H0 1930 vpsllq \$2,$D4,$D4 1931 vpaddq $D4,$H0,$H0 # h4 -> h0 1932 1933 vpand $MASK,$T2,$T2 # 2 1934 vpsrlq \$26,$T0,$T1 1935 1936 vpsrlq \$26,$H2,$D2 1937 vpand $MASK,$H2,$H2 1938 vpaddq $D2,$H3,$H3 # h2 -> h3 1939 1940 vpaddq $T2,$H2,$H2 # modulo-scheduled 1941 vpsrlq \$30,$T3,$T3 1942 1943 vpsrlq \$26,$H0,$D0 1944 vpand $MASK,$H0,$H0 1945 vpaddq $D0,$H1,$H1 # h0 -> h1 1946 1947 vpsrlq \$40,$T4,$T4 # 4 1948 1949 vpsrlq \$26,$H3,$D3 1950 vpand $MASK,$H3,$H3 1951 vpaddq $D3,$H4,$H4 # h3 -> h4 1952 1953 vpand $MASK,$T0,$T0 # 0 1954 vpand $MASK,$T1,$T1 # 1 1955 vpand $MASK,$T3,$T3 # 3 1956 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1957 1958 sub \$64,$len 1959 jnz .Loop_avx2 1960 1961 .byte 0x66,0x90 1962.Ltail_avx2: 1963 ################################################################ 1964 # while above multiplications were by r^4 in all lanes, in last 1965 # iteration we multiply least significant lane by r^4 and most 1966 # significant one by r, so copy of above except that references 1967 # to the precomputed table are displaced by 4... 
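	# In other words (a hedged sketch, pseudo-code only): in the bulk loop
	# every lane i computes
	#
	#   h_i = (h_i + m_i) * r^4
	#
	# whereas in this tail lane i computes
	#
	#   h_i = (h_i + m_i) * r^(4-i)		# i = 0..3, i.e. r^4, r^3, r^2, r
	#
	# so that the horizontal addition further below collapses the four
	# lanes into the single sequential Horner sum.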
1968 1969 #vpaddq $H2,$T2,$H2 # accumulate input 1970 vpaddq $H0,$T0,$H0 1971 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 1972 vpaddq $H1,$T1,$H1 1973 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 1974 vpaddq $H3,$T3,$H3 1975 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 1976 vpaddq $H4,$T4,$H4 1977 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 1978 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 1979 1980 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1981 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1982 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1983 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1984 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1985 1986 vpmuludq $H0,$T1,$T4 # h0*r1 1987 vpmuludq $H1,$T1,$H2 # h1*r1 1988 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1989 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1990 vpmuludq $H3,$T1,$T4 # h3*r1 1991 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 1992 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1993 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1994 1995 vpmuludq $H0,$T0,$T4 # h0*r0 1996 vpmuludq $H1,$T0,$H2 # h1*r0 1997 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1998 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 1999 vpaddq $H2,$D1,$D1 # d1 += h1*r0 2000 vpmuludq $H3,$T0,$T4 # h3*r0 2001 vpmuludq $H4,$T0,$H2 # h4*r0 2002 vpaddq $T4,$D3,$D3 # d3 += h3*r0 2003 vpaddq $H2,$D4,$D4 # d4 += h4*r0 2004 2005 vpmuludq $H3,$T1,$T4 # h3*s2 2006 vpmuludq $H4,$T1,$H2 # h4*s2 2007 vpaddq $T4,$D0,$D0 # d0 += h3*s2 2008 vpaddq $H2,$D1,$D1 # d1 += h4*s2 2009 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2010 vpmuludq $H1,$T2,$T4 # h1*r2 2011 vpmuludq $H0,$T2,$T2 # h0*r2 2012 vpaddq $T4,$D3,$D3 # d3 += h1*r2 2013 vpaddq $T2,$D2,$D2 # d2 += h0*r2 2014 2015 vpmuludq $H1,$H2,$T4 # h1*r3 2016 vpmuludq $H0,$H2,$H2 # h0*r3 2017 vpaddq $T4,$D4,$D4 # d4 += h1*r3 2018 vpaddq $H2,$D3,$D3 # d3 += h0*r3 2019 vpmuludq $H3,$T3,$T4 # h3*s3 2020 vpmuludq $H4,$T3,$H2 # h4*s3 2021 vpaddq $T4,$D1,$D1 # d1 += h3*s3 2022 vpaddq $H2,$D2,$D2 # d2 += h4*s3 2023 2024 vpmuludq $H3,$S4,$H3 # h3*s4 2025 vpmuludq $H4,$S4,$H4 # h4*s4 2026 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 2027 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 2028 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 2029 vpmuludq $H1,$S4,$H0 # h1*s4 2030 vmovdqa 64(%rcx),$MASK # .Lmask26 2031 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2032 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2033 2034 ################################################################ 2035 # horizontal addition 2036 2037 vpsrldq \$8,$D1,$T1 2038 vpsrldq \$8,$H2,$T2 2039 vpsrldq \$8,$H3,$T3 2040 vpsrldq \$8,$H4,$T4 2041 vpsrldq \$8,$H0,$T0 2042 vpaddq $T1,$D1,$D1 2043 vpaddq $T2,$H2,$H2 2044 vpaddq $T3,$H3,$H3 2045 vpaddq $T4,$H4,$H4 2046 vpaddq $T0,$H0,$H0 2047 2048 vpermq \$0x2,$H3,$T3 2049 vpermq \$0x2,$H4,$T4 2050 vpermq \$0x2,$H0,$T0 2051 vpermq \$0x2,$D1,$T1 2052 vpermq \$0x2,$H2,$T2 2053 vpaddq $T3,$H3,$H3 2054 vpaddq $T4,$H4,$H4 2055 vpaddq $T0,$H0,$H0 2056 vpaddq $T1,$D1,$D1 2057 vpaddq $T2,$H2,$H2 2058 2059 ################################################################ 2060 # lazy reduction 2061 2062 vpsrlq \$26,$H3,$D3 2063 vpand $MASK,$H3,$H3 2064 vpaddq $D3,$H4,$H4 # h3 -> h4 2065 2066 vpsrlq \$26,$H0,$D0 2067 vpand $MASK,$H0,$H0 2068 vpaddq $D0,$D1,$H1 # h0 -> h1 2069 2070 vpsrlq \$26,$H4,$D4 2071 vpand $MASK,$H4,$H4 2072 2073 vpsrlq \$26,$H1,$D1 2074 vpand $MASK,$H1,$H1 2075 vpaddq $D1,$H2,$H2 # h1 -> h2 2076 2077 vpaddq $D4,$H0,$H0 2078 vpsllq \$2,$D4,$D4 2079 vpaddq $D4,$H0,$H0 # h4 -> h0 2080 2081 vpsrlq \$26,$H2,$D2 2082 vpand $MASK,$H2,$H2 2083 vpaddq $D2,$H3,$H3 # h2 -> h3 2084 2085 vpsrlq \$26,$H0,$D0 2086 vpand $MASK,$H0,$H0 2087 vpaddq $D0,$H1,$H1 # h0 -> h1 2088 2089 vpsrlq \$26,$H3,$D3 2090 vpand $MASK,$H3,$H3 2091 vpaddq 
$D3,$H4,$H4 # h3 -> h4 2092 2093 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2094 vmovd %x#$H1,`4*1-48-64`($ctx) 2095 vmovd %x#$H2,`4*2-48-64`($ctx) 2096 vmovd %x#$H3,`4*3-48-64`($ctx) 2097 vmovd %x#$H4,`4*4-48-64`($ctx) 2098___ 2099$code.=<<___ if ($win64); 2100 vmovdqa 0x50(%r11),%xmm6 2101 vmovdqa 0x60(%r11),%xmm7 2102 vmovdqa 0x70(%r11),%xmm8 2103 vmovdqa 0x80(%r11),%xmm9 2104 vmovdqa 0x90(%r11),%xmm10 2105 vmovdqa 0xa0(%r11),%xmm11 2106 vmovdqa 0xb0(%r11),%xmm12 2107 vmovdqa 0xc0(%r11),%xmm13 2108 vmovdqa 0xd0(%r11),%xmm14 2109 vmovdqa 0xe0(%r11),%xmm15 2110 lea 0xf8(%r11),%rsp 2111.Ldo_avx2_epilogue: 2112___ 2113$code.=<<___ if (!$win64); 2114 lea 8(%r11),%rsp 2115.cfi_def_cfa %rsp,8 2116___ 2117$code.=<<___; 2118 vzeroupper 2119 ret 2120.cfi_endproc 2121.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 2122___ 2123####################################################################### 2124if ($avx>2) { 2125# On entry we have input length divisible by 64. But since inner loop 2126# processes 128 bytes per iteration, cases when length is not divisible 2127# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 2128# reason stack layout is kept identical to poly1305_blocks_avx2. If not 2129# for this tail, we wouldn't have to even allocate stack frame... 2130 2131my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2132my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2133my $PADBIT="%zmm30"; 2134 2135map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2136map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2137map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2138map(s/%y/%z/,($MASK)); 2139 2140$code.=<<___; 2141.type poly1305_blocks_avx512,\@function,4 2142.align 32 2143poly1305_blocks_avx512: 2144.cfi_startproc 2145.Lblocks_avx512: 2146 mov \$15,%eax 2147 kmovw %eax,%k2 2148___ 2149$code.=<<___ if (!$win64); 2150 lea -8(%rsp),%r11 2151.cfi_def_cfa %r11,16 2152 sub \$0x128,%rsp 2153___ 2154$code.=<<___ if ($win64); 2155 lea -0xf8(%rsp),%r11 2156 sub \$0x1c8,%rsp 2157 vmovdqa %xmm6,0x50(%r11) 2158 vmovdqa %xmm7,0x60(%r11) 2159 vmovdqa %xmm8,0x70(%r11) 2160 vmovdqa %xmm9,0x80(%r11) 2161 vmovdqa %xmm10,0x90(%r11) 2162 vmovdqa %xmm11,0xa0(%r11) 2163 vmovdqa %xmm12,0xb0(%r11) 2164 vmovdqa %xmm13,0xc0(%r11) 2165 vmovdqa %xmm14,0xd0(%r11) 2166 vmovdqa %xmm15,0xe0(%r11) 2167.Ldo_avx512_body: 2168___ 2169$code.=<<___; 2170 lea .Lconst(%rip),%rcx 2171 lea 48+64($ctx),$ctx # size optimization 2172 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2173 2174 # expand pre-calculated table 2175 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2176 and \$-512,%rsp 2177 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2178 mov \$0x20,%rax 2179 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2180 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2181 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2182 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2183 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2184 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2185 vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} 2186 vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2187 vpbroadcastq 64(%rcx),$MASK # .Lmask26 2188 vpermd $D1,$T2,$R1 2189 vpermd $T0,$T2,$S1 2190 vpermd $D2,$T2,$R2 2191 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2192 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2193 vpermd $T1,$T2,$S2 2194 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2195 vpsrlq \$32,$R1,$T1 2196 vpermd $D3,$T2,$R3 2197 vmovdqa64 $S1,0x40(%rsp){%k2} 2198 vpermd $T3,$T2,$S3 2199 vpermd $D4,$T2,$R4 2200 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2201 vpermd $T4,$T2,$S4 2202 vmovdqa64 $S2,0x80(%rsp){%k2} 2203 vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2204 vmovdqa64 $S3,0xc0(%rsp){%k2} 2205 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2206 vmovdqa64 $S4,0x100(%rsp){%k2} 2207 2208 ################################################################ 2209 # calculate 5th through 8th powers of the key 2210 # 2211 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2212 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2213 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2214 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2215 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2216 2217 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2218 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2219 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2220 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2221 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 2222 vpsrlq \$32,$R2,$T2 2223 2224 vpmuludq $T1,$S4,$M0 2225 vpmuludq $T1,$R0,$M1 2226 vpmuludq $T1,$R1,$M2 2227 vpmuludq $T1,$R2,$M3 2228 vpmuludq $T1,$R3,$M4 2229 vpsrlq \$32,$R3,$T3 2230 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 2231 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 2232 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 2233 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 2234 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 2235 2236 vpmuludq $T2,$S3,$M0 2237 vpmuludq $T2,$S4,$M1 2238 vpmuludq $T2,$R1,$M3 2239 vpmuludq $T2,$R2,$M4 2240 vpmuludq $T2,$R0,$M2 2241 vpsrlq \$32,$R4,$T4 2242 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 2243 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 2244 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 2245 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 2246 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 2247 2248 vpmuludq $T3,$S2,$M0 2249 vpmuludq $T3,$R0,$M3 2250 vpmuludq $T3,$R1,$M4 2251 vpmuludq $T3,$S3,$M1 2252 vpmuludq $T3,$S4,$M2 2253 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 2254 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 2255 vpaddq $M4,$D4,$D4 # d4 += r3'*r1 2256 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 2257 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 2258 2259 vpmuludq $T4,$S4,$M3 2260 vpmuludq $T4,$R0,$M4 2261 vpmuludq $T4,$S1,$M0 2262 vpmuludq $T4,$S2,$M1 2263 vpmuludq $T4,$S3,$M2 2264 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 2265 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 2266 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 2267 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 2268 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 2269 2270 ################################################################ 2271 # load input 2272 vmovdqu64 16*0($inp),%z#$T3 2273 vmovdqu64 16*4($inp),%z#$T4 2274 lea 16*8($inp),$inp 2275 2276 ################################################################ 2277 # lazy reduction 2278 2279 vpsrlq \$26,$D3,$M3 2280 vpandq $MASK,$D3,$D3 2281 vpaddq $M3,$D4,$D4 # d3 -> d4 2282 2283 vpsrlq \$26,$D0,$M0 2284 vpandq $MASK,$D0,$D0 2285 vpaddq $M0,$D1,$D1 # d0 -> d1 2286 2287 vpsrlq \$26,$D4,$M4 2288 vpandq $MASK,$D4,$D4 2289 2290 vpsrlq \$26,$D1,$M1 2291 vpandq $MASK,$D1,$D1 2292 vpaddq $M1,$D2,$D2 # d1 -> d2 2293 2294 vpaddq $M4,$D0,$D0 2295 vpsllq \$2,$M4,$M4 2296 vpaddq $M4,$D0,$D0 # d4 -> d0 2297 2298 vpsrlq \$26,$D2,$M2 2299 vpandq $MASK,$D2,$D2 2300 vpaddq $M2,$D3,$D3 # 
d2 -> d3 2301 2302 vpsrlq \$26,$D0,$M0 2303 vpandq $MASK,$D0,$D0 2304 vpaddq $M0,$D1,$D1 # d0 -> d1 2305 2306 vpsrlq \$26,$D3,$M3 2307 vpandq $MASK,$D3,$D3 2308 vpaddq $M3,$D4,$D4 # d3 -> d4 2309 2310 ################################################################ 2311 # at this point we have 14243444 in $R0-$S4 and 05060708 in 2312 # $D0-$D4, ... 2313 2314 vpunpcklqdq $T4,$T3,$T0 # transpose input 2315 vpunpckhqdq $T4,$T3,$T4 2316 2317 # ... since input 64-bit lanes are ordered as 73625140, we could 2318 # "vperm" it to 76543210 (here and in each loop iteration), *or* 2319 # we could just flow along, hence the goal for $R0-$S4 is 2320 # 1858286838784888 ... 2321 2322 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2323 mov \$0x7777,%eax 2324 kmovw %eax,%k1 2325 2326 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2327 vpermd $R1,$M0,$R1 2328 vpermd $R2,$M0,$R2 2329 vpermd $R3,$M0,$R3 2330 vpermd $R4,$M0,$R4 2331 2332 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2333 vpermd $D1,$M0,${R1}{%k1} 2334 vpermd $D2,$M0,${R2}{%k1} 2335 vpermd $D3,$M0,${R3}{%k1} 2336 vpermd $D4,$M0,${R4}{%k1} 2337 2338 vpslld \$2,$R1,$S1 # *5 2339 vpslld \$2,$R2,$S2 2340 vpslld \$2,$R3,$S3 2341 vpslld \$2,$R4,$S4 2342 vpaddd $R1,$S1,$S1 2343 vpaddd $R2,$S2,$S2 2344 vpaddd $R3,$S3,$S3 2345 vpaddd $R4,$S4,$S4 2346 2347 vpbroadcastq 32(%rcx),$PADBIT # .L129 2348 2349 vpsrlq \$52,$T0,$T2 # splat input 2350 vpsllq \$12,$T4,$T3 2351 vporq $T3,$T2,$T2 2352 vpsrlq \$26,$T0,$T1 2353 vpsrlq \$14,$T4,$T3 2354 vpsrlq \$40,$T4,$T4 # 4 2355 vpandq $MASK,$T2,$T2 # 2 2356 vpandq $MASK,$T0,$T0 # 0 2357 #vpandq $MASK,$T1,$T1 # 1 2358 #vpandq $MASK,$T3,$T3 # 3 2359 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2360 2361 vpaddq $H2,$T2,$H2 # accumulate input 2362 sub \$192,$len 2363 jbe .Ltail_avx512 2364 jmp .Loop_avx512 2365 2366.align 32 2367.Loop_avx512: 2368 ################################################################ 2369 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2370 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2371 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2372 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2373 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2374 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2375 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2376 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2377 # \________/\___________/ 2378 ################################################################ 2379 #vpaddq $H2,$T2,$H2 # accumulate input 2380 2381 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2382 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2383 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2384 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2385 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2386 # 2387 # however, as h2 is "chronologically" first one available pull 2388 # corresponding operations up, so it's 2389 # 2390 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2391 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2392 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2393 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2394 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2395 2396 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2397 vpaddq $H0,$T0,$H0 2398 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2399 vpandq $MASK,$T1,$T1 # 1 2400 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2401 vpandq $MASK,$T3,$T3 # 3 2402 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2403 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2404 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2405 vpaddq $H1,$T1,$H1 # accumulate input 2406 vpaddq $H3,$T3,$H3 2407 vpaddq $H4,$T4,$H4 2408 2409 vmovdqu64 
16*0($inp),$T3 # load input 2410 vmovdqu64 16*4($inp),$T4 2411 lea 16*8($inp),$inp 2412 vpmuludq $H0,$R3,$M3 2413 vpmuludq $H0,$R4,$M4 2414 vpmuludq $H0,$R0,$M0 2415 vpmuludq $H0,$R1,$M1 2416 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2417 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2418 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2419 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2420 2421 vpmuludq $H1,$R2,$M3 2422 vpmuludq $H1,$R3,$M4 2423 vpmuludq $H1,$S4,$M0 2424 vpmuludq $H0,$R2,$M2 2425 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2426 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2427 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2428 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2429 2430 vpunpcklqdq $T4,$T3,$T0 # transpose input 2431 vpunpckhqdq $T4,$T3,$T4 2432 2433 vpmuludq $H3,$R0,$M3 2434 vpmuludq $H3,$R1,$M4 2435 vpmuludq $H1,$R0,$M1 2436 vpmuludq $H1,$R1,$M2 2437 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2438 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2439 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2440 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2441 2442 vpmuludq $H4,$S4,$M3 2443 vpmuludq $H4,$R0,$M4 2444 vpmuludq $H3,$S2,$M0 2445 vpmuludq $H3,$S3,$M1 2446 vpaddq $M3,$D3,$D3 # d3 += h4*s4 2447 vpmuludq $H3,$S4,$M2 2448 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2449 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2450 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2451 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2452 2453 vpmuludq $H4,$S1,$M0 2454 vpmuludq $H4,$S2,$M1 2455 vpmuludq $H4,$S3,$M2 2456 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2457 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2458 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2459 2460 ################################################################ 2461 # lazy reduction (interleaved with input splat) 2462 2463 vpsrlq \$52,$T0,$T2 # splat input 2464 vpsllq \$12,$T4,$T3 2465 2466 vpsrlq \$26,$D3,$H3 2467 vpandq $MASK,$D3,$D3 2468 vpaddq $H3,$D4,$H4 # h3 -> h4 2469 2470 vporq $T3,$T2,$T2 2471 2472 vpsrlq \$26,$H0,$D0 2473 vpandq $MASK,$H0,$H0 2474 vpaddq $D0,$H1,$H1 # h0 -> h1 2475 2476 vpandq $MASK,$T2,$T2 # 2 2477 2478 vpsrlq \$26,$H4,$D4 2479 vpandq $MASK,$H4,$H4 2480 2481 vpsrlq \$26,$H1,$D1 2482 vpandq $MASK,$H1,$H1 2483 vpaddq $D1,$H2,$H2 # h1 -> h2 2484 2485 vpaddq $D4,$H0,$H0 2486 vpsllq \$2,$D4,$D4 2487 vpaddq $D4,$H0,$H0 # h4 -> h0 2488 2489 vpaddq $T2,$H2,$H2 # modulo-scheduled 2490 vpsrlq \$26,$T0,$T1 2491 2492 vpsrlq \$26,$H2,$D2 2493 vpandq $MASK,$H2,$H2 2494 vpaddq $D2,$D3,$H3 # h2 -> h3 2495 2496 vpsrlq \$14,$T4,$T3 2497 2498 vpsrlq \$26,$H0,$D0 2499 vpandq $MASK,$H0,$H0 2500 vpaddq $D0,$H1,$H1 # h0 -> h1 2501 2502 vpsrlq \$40,$T4,$T4 # 4 2503 2504 vpsrlq \$26,$H3,$D3 2505 vpandq $MASK,$H3,$H3 2506 vpaddq $D3,$H4,$H4 # h3 -> h4 2507 2508 vpandq $MASK,$T0,$T0 # 0 2509 #vpandq $MASK,$T1,$T1 # 1 2510 #vpandq $MASK,$T3,$T3 # 3 2511 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2512 2513 sub \$128,$len 2514 ja .Loop_avx512 2515 2516.Ltail_avx512: 2517 ################################################################ 2518 # while above multiplications were by r^8 in all lanes, in last 2519 # iteration we multiply least significant lane by r^8 and most 2520 # significant one by r, that's why table gets shifted... 
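	# (each 64-bit lane of the table registers keeps r^8 in its low 32-bit
	#  half, the one vpmuludq actually multiplies by, while the high half
	#  holds that lane's own power; the 32-bit right shifts below move the
	#  per-lane powers, r^8 for the least significant lane down to r for
	#  the most significant, into the low halves for this last iteration)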
2521 2522 vpsrlq \$32,$R0,$R0 # 0105020603070408 2523 vpsrlq \$32,$R1,$R1 2524 vpsrlq \$32,$R2,$R2 2525 vpsrlq \$32,$S3,$S3 2526 vpsrlq \$32,$S4,$S4 2527 vpsrlq \$32,$R3,$R3 2528 vpsrlq \$32,$R4,$R4 2529 vpsrlq \$32,$S1,$S1 2530 vpsrlq \$32,$S2,$S2 2531 2532 ################################################################ 2533 # load either next or last 64 byte of input 2534 lea ($inp,$len),$inp 2535 2536 #vpaddq $H2,$T2,$H2 # accumulate input 2537 vpaddq $H0,$T0,$H0 2538 2539 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2540 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2541 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2542 vpandq $MASK,$T1,$T1 # 1 2543 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2544 vpandq $MASK,$T3,$T3 # 3 2545 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2546 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2547 vpaddq $H1,$T1,$H1 # accumulate input 2548 vpaddq $H3,$T3,$H3 2549 vpaddq $H4,$T4,$H4 2550 2551 vmovdqu 16*0($inp),%x#$T0 2552 vpmuludq $H0,$R3,$M3 2553 vpmuludq $H0,$R4,$M4 2554 vpmuludq $H0,$R0,$M0 2555 vpmuludq $H0,$R1,$M1 2556 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2557 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2558 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2559 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2560 2561 vmovdqu 16*1($inp),%x#$T1 2562 vpmuludq $H1,$R2,$M3 2563 vpmuludq $H1,$R3,$M4 2564 vpmuludq $H1,$S4,$M0 2565 vpmuludq $H0,$R2,$M2 2566 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2567 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2568 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2569 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2570 2571 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2572 vpmuludq $H3,$R0,$M3 2573 vpmuludq $H3,$R1,$M4 2574 vpmuludq $H1,$R0,$M1 2575 vpmuludq $H1,$R1,$M2 2576 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2577 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2578 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2579 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2580 2581 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2582 vpmuludq $H4,$S4,$M3 2583 vpmuludq $H4,$R0,$M4 2584 vpmuludq $H3,$S2,$M0 2585 vpmuludq $H3,$S3,$M1 2586 vpmuludq $H3,$S4,$M2 2587 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2588 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2589 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2590 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2591 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2592 2593 vpmuludq $H4,$S1,$M0 2594 vpmuludq $H4,$S2,$M1 2595 vpmuludq $H4,$S3,$M2 2596 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2597 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2598 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2599 2600 ################################################################ 2601 # horizontal addition 2602 2603 mov \$1,%eax 2604 vpermq \$0xb1,$H3,$D3 2605 vpermq \$0xb1,$D4,$H4 2606 vpermq \$0xb1,$H0,$D0 2607 vpermq \$0xb1,$H1,$D1 2608 vpermq \$0xb1,$H2,$D2 2609 vpaddq $D3,$H3,$H3 2610 vpaddq $D4,$H4,$H4 2611 vpaddq $D0,$H0,$H0 2612 vpaddq $D1,$H1,$H1 2613 vpaddq $D2,$H2,$H2 2614 2615 kmovw %eax,%k3 2616 vpermq \$0x2,$H3,$D3 2617 vpermq \$0x2,$H4,$D4 2618 vpermq \$0x2,$H0,$D0 2619 vpermq \$0x2,$H1,$D1 2620 vpermq \$0x2,$H2,$D2 2621 vpaddq $D3,$H3,$H3 2622 vpaddq $D4,$H4,$H4 2623 vpaddq $D0,$H0,$H0 2624 vpaddq $D1,$H1,$H1 2625 vpaddq $D2,$H2,$H2 2626 2627 vextracti64x4 \$0x1,$H3,%y#$D3 2628 vextracti64x4 \$0x1,$H4,%y#$D4 2629 vextracti64x4 \$0x1,$H0,%y#$D0 2630 vextracti64x4 \$0x1,$H1,%y#$D1 2631 vextracti64x4 \$0x1,$H2,%y#$D2 2632 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2633 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2634 vpaddq $D0,$H0,${H0}{%k3}{z} 2635 vpaddq $D1,$H1,${H1}{%k3}{z} 2636 vpaddq $D2,$H2,${H2}{%k3}{z} 2637___ 2638map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2639map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 
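#
# For reference only: a minimal scalar sketch, in plain Perl, of the
# radix-2^26 multiply-and-carry step that the vector code above performs in
# every lane. It is never called by this generator and the sub name is
# illustrative; it mirrors the d0..d4 formulas and the "lazy reduction"
# carry chain commented in the code, and assumes a 64-bit perl (26-bit limbs
# give products below 2^55, so all intermediate sums stay within 64 bits).
#
sub poly1305_mul_base2_26_ref {
	my ($h, $r) = @_;		# refs to five 26-bit limbs each
	my @h = @$h;
	my @r = @$r;
	my @s = map { 5*$_ } @r;	# s[i] = 5*r[i], folds 2^130 = 5 mod p

	my @d;				# d[i] = sum of h[j]*r[k] with j+k = i,
					# wrapped terms pick up the factor 5
	$d[0] = $h[0]*$r[0] + $h[1]*$s[4] + $h[2]*$s[3] + $h[3]*$s[2] + $h[4]*$s[1];
	$d[1] = $h[0]*$r[1] + $h[1]*$r[0] + $h[2]*$s[4] + $h[3]*$s[3] + $h[4]*$s[2];
	$d[2] = $h[0]*$r[2] + $h[1]*$r[1] + $h[2]*$r[0] + $h[3]*$s[4] + $h[4]*$s[3];
	$d[3] = $h[0]*$r[3] + $h[1]*$r[2] + $h[2]*$r[1] + $h[3]*$r[0] + $h[4]*$s[4];
	$d[4] = $h[0]*$r[4] + $h[1]*$r[3] + $h[2]*$r[2] + $h[3]*$r[1] + $h[4]*$r[0];

	my $mask26 = (1<<26)-1;		# one "lazy" pass of carries, x*5 done
	my ($c,$c4);			# as x + 4*x like the vpsllq/vpaddq pair
	$c  = $d[3]>>26; $d[3] &= $mask26; $d[4] += $c;		# h3 -> h4
	$c  = $d[0]>>26; $d[0] &= $mask26; $d[1] += $c;		# h0 -> h1
	$c4 = $d[4]>>26; $d[4] &= $mask26;
	$c  = $d[1]>>26; $d[1] &= $mask26; $d[2] += $c;		# h1 -> h2
	$d[0] += $c4 + ($c4<<2);				# h4 -> h0 (*5)
	$c  = $d[2]>>26; $d[2] &= $mask26; $d[3] += $c;		# h2 -> h3
	$c  = $d[0]>>26; $d[0] &= $mask26; $d[1] += $c;		# h0 -> h1
	$c  = $d[3]>>26; $d[3] &= $mask26; $d[4] += $c;		# h3 -> h4

	return @d;			# partially reduced: limbs may slightly
					# exceed 26 bits, as in the vector code
}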
2640$code.=<<___; 2641 ################################################################ 2642 # lazy reduction (interleaved with input splat) 2643 2644 vpsrlq \$26,$H3,$D3 2645 vpand $MASK,$H3,$H3 2646 vpsrldq \$6,$T0,$T2 # splat input 2647 vpsrldq \$6,$T1,$T3 2648 vpunpckhqdq $T1,$T0,$T4 # 4 2649 vpaddq $D3,$H4,$H4 # h3 -> h4 2650 2651 vpsrlq \$26,$H0,$D0 2652 vpand $MASK,$H0,$H0 2653 vpunpcklqdq $T3,$T2,$T2 # 2:3 2654 vpunpcklqdq $T1,$T0,$T0 # 0:1 2655 vpaddq $D0,$H1,$H1 # h0 -> h1 2656 2657 vpsrlq \$26,$H4,$D4 2658 vpand $MASK,$H4,$H4 2659 2660 vpsrlq \$26,$H1,$D1 2661 vpand $MASK,$H1,$H1 2662 vpsrlq \$30,$T2,$T3 2663 vpsrlq \$4,$T2,$T2 2664 vpaddq $D1,$H2,$H2 # h1 -> h2 2665 2666 vpaddq $D4,$H0,$H0 2667 vpsllq \$2,$D4,$D4 2668 vpsrlq \$26,$T0,$T1 2669 vpsrlq \$40,$T4,$T4 # 4 2670 vpaddq $D4,$H0,$H0 # h4 -> h0 2671 2672 vpsrlq \$26,$H2,$D2 2673 vpand $MASK,$H2,$H2 2674 vpand $MASK,$T2,$T2 # 2 2675 vpand $MASK,$T0,$T0 # 0 2676 vpaddq $D2,$H3,$H3 # h2 -> h3 2677 2678 vpsrlq \$26,$H0,$D0 2679 vpand $MASK,$H0,$H0 2680 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2681 vpand $MASK,$T1,$T1 # 1 2682 vpaddq $D0,$H1,$H1 # h0 -> h1 2683 2684 vpsrlq \$26,$H3,$D3 2685 vpand $MASK,$H3,$H3 2686 vpand $MASK,$T3,$T3 # 3 2687 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2688 vpaddq $D3,$H4,$H4 # h3 -> h4 2689 2690 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2691 add \$64,$len 2692 jnz .Ltail_avx2 2693 2694 vpsubq $T2,$H2,$H2 # undo input accumulation 2695 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2696 vmovd %x#$H1,`4*1-48-64`($ctx) 2697 vmovd %x#$H2,`4*2-48-64`($ctx) 2698 vmovd %x#$H3,`4*3-48-64`($ctx) 2699 vmovd %x#$H4,`4*4-48-64`($ctx) 2700 vzeroall 2701___ 2702$code.=<<___ if ($win64); 2703 movdqa 0x50(%r11),%xmm6 2704 movdqa 0x60(%r11),%xmm7 2705 movdqa 0x70(%r11),%xmm8 2706 movdqa 0x80(%r11),%xmm9 2707 movdqa 0x90(%r11),%xmm10 2708 movdqa 0xa0(%r11),%xmm11 2709 movdqa 0xb0(%r11),%xmm12 2710 movdqa 0xc0(%r11),%xmm13 2711 movdqa 0xd0(%r11),%xmm14 2712 movdqa 0xe0(%r11),%xmm15 2713 lea 0xf8(%r11),%rsp 2714.Ldo_avx512_epilogue: 2715___ 2716$code.=<<___ if (!$win64); 2717 lea 8(%r11),%rsp 2718.cfi_def_cfa %rsp,8 2719___ 2720$code.=<<___; 2721 ret 2722.cfi_endproc 2723.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 2724___ 2725if ($avx>3) { 2726######################################################################## 2727# VPMADD52 version using 2^44 radix. 2728# 2729# One can argue that base 2^52 would be more natural. Well, even though 2730# some operations would be more natural, one has to recognize couple of 2731# things. Base 2^52 doesn't provide advantage over base 2^44 if you look 2732# at amount of multiply-n-accumulate operations. Secondly, it makes it 2733# impossible to pre-compute multiples of 5 [referred to as s[]/sN in 2734# reference implementations], which means that more such operations 2735# would have to be performed in inner loop, which in turn makes critical 2736# path longer. In other words, even though base 2^44 reduction might 2737# look less elegant, overall critical path is actually shorter... 2738 2739######################################################################## 2740# Layout of opaque area is following. 
2741# 2742# unsigned __int64 h[3]; # current hash value base 2^44 2743# unsigned __int64 s[2]; # key value*20 base 2^44 2744# unsigned __int64 r[3]; # key value base 2^44 2745# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; 2746# # r^n positions reflect 2747# # placement in register, not 2748# # memory, R[3] is R[1]*20 2749 2750$code.=<<___; 2751.type poly1305_init_base2_44,\@function,3 2752.align 32 2753poly1305_init_base2_44: 2754.cfi_startproc 2755 xor %rax,%rax 2756 mov %rax,0($ctx) # initialize hash value 2757 mov %rax,8($ctx) 2758 mov %rax,16($ctx) 2759 2760.Linit_base2_44: 2761 lea poly1305_blocks_vpmadd52(%rip),%r10 2762 lea poly1305_emit_base2_44(%rip),%r11 2763 2764 mov \$0x0ffffffc0fffffff,%rax 2765 mov \$0x0ffffffc0ffffffc,%rcx 2766 and 0($inp),%rax 2767 mov \$0x00000fffffffffff,%r8 2768 and 8($inp),%rcx 2769 mov \$0x00000fffffffffff,%r9 2770 and %rax,%r8 2771 shrd \$44,%rcx,%rax 2772 mov %r8,40($ctx) # r0 2773 and %r9,%rax 2774 shr \$24,%rcx 2775 mov %rax,48($ctx) # r1 2776 lea (%rax,%rax,4),%rax # *5 2777 mov %rcx,56($ctx) # r2 2778 shl \$2,%rax # magic <<2 2779 lea (%rcx,%rcx,4),%rcx # *5 2780 shl \$2,%rcx # magic <<2 2781 mov %rax,24($ctx) # s1 2782 mov %rcx,32($ctx) # s2 2783 movq \$-1,64($ctx) # write impossible value 2784___ 2785$code.=<<___ if ($flavour !~ /elf32/); 2786 mov %r10,0(%rdx) 2787 mov %r11,8(%rdx) 2788___ 2789$code.=<<___ if ($flavour =~ /elf32/); 2790 mov %r10d,0(%rdx) 2791 mov %r11d,4(%rdx) 2792___ 2793$code.=<<___; 2794 mov \$1,%eax 2795 ret 2796.cfi_endproc 2797.size poly1305_init_base2_44,.-poly1305_init_base2_44 2798___ 2799{ 2800my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); 2801my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); 2802my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); 2803 2804$code.=<<___; 2805.type poly1305_blocks_vpmadd52,\@function,4 2806.align 32 2807poly1305_blocks_vpmadd52: 2808.cfi_startproc 2809 shr \$4,$len 2810 jz .Lno_data_vpmadd52 # too short 2811 2812 shl \$40,$padbit 2813 mov 64($ctx),%r8 # peek on power of the key 2814 2815 # if powers of the key are not calculated yet, process up to 3 2816 # blocks with this single-block subroutine, otherwise ensure that 2817 # length is divisible by 2 blocks and pass the rest down to next 2818 # subroutine... 2819 2820 mov \$3,%rax 2821 mov \$1,%r10 2822 cmp \$4,$len # is input long 2823 cmovae %r10,%rax 2824 test %r8,%r8 # is power value impossible? 2825 cmovns %r10,%rax 2826 2827 and $len,%rax # is input of favourable length? 
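	# (64($ctx) still holds the -1 written by poly1305_init_base2_44 while
	#  the key powers have not been computed, hence the sign test above;
	#  %rax now holds the number of 16-byte blocks to peel off with the
	#  one-block loop below: all of them if the powers are missing and
	#  there are fewer than 4 blocks, otherwise just $len%2 so that an
	#  even number of blocks is left for the 2x/4x code)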
2828 jz .Lblocks_vpmadd52_4x 2829 2830 sub %rax,$len 2831 mov \$7,%r10d 2832 mov \$1,%r11d 2833 kmovw %r10d,%k7 2834 lea .L2_44_inp_permd(%rip),%r10 2835 kmovw %r11d,%k1 2836 2837 vmovq $padbit,%x#$PAD 2838 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd 2839 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift 2840 vpermq \$0xcf,$PAD,$PAD 2841 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask 2842 2843 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value 2844 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys 2845 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} 2846 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} 2847 2848 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt 2849 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft 2850 2851 jmp .Loop_vpmadd52 2852 2853.align 32 2854.Loop_vpmadd52: 2855 vmovdqu32 0($inp),%x#$T0 # load input as ----3210 2856 lea 16($inp),$inp 2857 2858 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 2859 vpsrlvq $inp_shift,$T0,$T0 2860 vpandq $reduc_mask,$T0,$T0 2861 vporq $PAD,$T0,$T0 2862 2863 vpaddq $T0,$Dlo,$Dlo # accumulate input 2864 2865 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value 2866 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} 2867 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} 2868 2869 vpxord $Dlo,$Dlo,$Dlo 2870 vpxord $Dhi,$Dhi,$Dhi 2871 2872 vpmadd52luq $r2r1r0,$H0,$Dlo 2873 vpmadd52huq $r2r1r0,$H0,$Dhi 2874 2875 vpmadd52luq $r1r0s2,$H1,$Dlo 2876 vpmadd52huq $r1r0s2,$H1,$Dhi 2877 2878 vpmadd52luq $r0s2s1,$H2,$Dlo 2879 vpmadd52huq $r0s2s1,$H2,$Dhi 2880 2881 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword 2882 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword 2883 vpandq $reduc_mask,$Dlo,$Dlo 2884 2885 vpaddq $T0,$Dhi,$Dhi 2886 2887 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword 2888 2889 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) 2890 2891 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word 2892 vpandq $reduc_mask,$Dlo,$Dlo 2893 2894 vpermq \$0b10010011,$T0,$T0 2895 2896 vpaddq $T0,$Dlo,$Dlo 2897 2898 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} 2899 2900 vpaddq $T0,$Dlo,$Dlo 2901 vpsllq \$2,$T0,$T0 2902 2903 vpaddq $T0,$Dlo,$Dlo 2904 2905 dec %rax # len-=16 2906 jnz .Loop_vpmadd52 2907 2908 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value 2909 2910 test $len,$len 2911 jnz .Lblocks_vpmadd52_4x 2912 2913.Lno_data_vpmadd52: 2914 ret 2915.cfi_endproc 2916.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 2917___ 2918} 2919{ 2920######################################################################## 2921# As implied by its name 4x subroutine processes 4 blocks in parallel 2922# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power 2923# and is handled in 256-bit %ymm registers. 2924 2925my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 2926my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 2927my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 2928 2929$code.=<<___; 2930.type poly1305_blocks_vpmadd52_4x,\@function,4 2931.align 32 2932poly1305_blocks_vpmadd52_4x: 2933.cfi_startproc 2934 shr \$4,$len 2935 jz .Lno_data_vpmadd52_4x # too short 2936 2937 shl \$40,$padbit 2938 mov 64($ctx),%r8 # peek on power of the key 2939 2940.Lblocks_vpmadd52_4x: 2941 vpbroadcastq $padbit,$PAD 2942 2943 vmovdqa64 .Lx_mask44(%rip),$mask44 2944 mov \$5,%eax 2945 vmovdqa64 .Lx_mask42(%rip),$mask42 2946 kmovw %eax,%k1 # used in 2x path 2947 2948 test %r8,%r8 # is power value impossible? 
2949 js .Linit_vpmadd52 # if it is, then init R[4] 2950 2951 vmovq 0($ctx),%x#$H0 # load current hash value 2952 vmovq 8($ctx),%x#$H1 2953 vmovq 16($ctx),%x#$H2 2954 2955 test \$3,$len # is length 4*n+2? 2956 jnz .Lblocks_vpmadd52_2x_do 2957 2958.Lblocks_vpmadd52_4x_do: 2959 vpbroadcastq 64($ctx),$R0 # load 4th power of the key 2960 vpbroadcastq 96($ctx),$R1 2961 vpbroadcastq 128($ctx),$R2 2962 vpbroadcastq 160($ctx),$S1 2963 2964.Lblocks_vpmadd52_4x_key_loaded: 2965 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 2966 vpaddq $R2,$S2,$S2 2967 vpsllq \$2,$S2,$S2 2968 2969 test \$7,$len # is len 8*n? 2970 jz .Lblocks_vpmadd52_8x 2971 2972 vmovdqu64 16*0($inp),$T2 # load data 2973 vmovdqu64 16*2($inp),$T3 2974 lea 16*4($inp),$inp 2975 2976 vpunpcklqdq $T3,$T2,$T1 # transpose data 2977 vpunpckhqdq $T3,$T2,$T3 2978 2979 # at this point 64-bit lanes are ordered as 3-1-2-0 2980 2981 vpsrlq \$24,$T3,$T2 # splat the data 2982 vporq $PAD,$T2,$T2 2983 vpaddq $T2,$H2,$H2 # accumulate input 2984 vpandq $mask44,$T1,$T0 2985 vpsrlq \$44,$T1,$T1 2986 vpsllq \$20,$T3,$T3 2987 vporq $T3,$T1,$T1 2988 vpandq $mask44,$T1,$T1 2989 2990 sub \$4,$len 2991 jz .Ltail_vpmadd52_4x 2992 jmp .Loop_vpmadd52_4x 2993 ud2 2994 2995.align 32 2996.Linit_vpmadd52: 2997 vmovq 24($ctx),%x#$S1 # load key 2998 vmovq 56($ctx),%x#$H2 2999 vmovq 32($ctx),%x#$S2 3000 vmovq 40($ctx),%x#$R0 3001 vmovq 48($ctx),%x#$R1 3002 3003 vmovdqa $R0,$H0 3004 vmovdqa $R1,$H1 3005 vmovdqa $H2,$R2 3006 3007 mov \$2,%eax 3008 3009.Lmul_init_vpmadd52: 3010 vpxorq $D0lo,$D0lo,$D0lo 3011 vpmadd52luq $H2,$S1,$D0lo 3012 vpxorq $D0hi,$D0hi,$D0hi 3013 vpmadd52huq $H2,$S1,$D0hi 3014 vpxorq $D1lo,$D1lo,$D1lo 3015 vpmadd52luq $H2,$S2,$D1lo 3016 vpxorq $D1hi,$D1hi,$D1hi 3017 vpmadd52huq $H2,$S2,$D1hi 3018 vpxorq $D2lo,$D2lo,$D2lo 3019 vpmadd52luq $H2,$R0,$D2lo 3020 vpxorq $D2hi,$D2hi,$D2hi 3021 vpmadd52huq $H2,$R0,$D2hi 3022 3023 vpmadd52luq $H0,$R0,$D0lo 3024 vpmadd52huq $H0,$R0,$D0hi 3025 vpmadd52luq $H0,$R1,$D1lo 3026 vpmadd52huq $H0,$R1,$D1hi 3027 vpmadd52luq $H0,$R2,$D2lo 3028 vpmadd52huq $H0,$R2,$D2hi 3029 3030 vpmadd52luq $H1,$S2,$D0lo 3031 vpmadd52huq $H1,$S2,$D0hi 3032 vpmadd52luq $H1,$R0,$D1lo 3033 vpmadd52huq $H1,$R0,$D1hi 3034 vpmadd52luq $H1,$R1,$D2lo 3035 vpmadd52huq $H1,$R1,$D2hi 3036 3037 ################################################################ 3038 # partial reduction 3039 vpsrlq \$44,$D0lo,$tmp 3040 vpsllq \$8,$D0hi,$D0hi 3041 vpandq $mask44,$D0lo,$H0 3042 vpaddq $tmp,$D0hi,$D0hi 3043 3044 vpaddq $D0hi,$D1lo,$D1lo 3045 3046 vpsrlq \$44,$D1lo,$tmp 3047 vpsllq \$8,$D1hi,$D1hi 3048 vpandq $mask44,$D1lo,$H1 3049 vpaddq $tmp,$D1hi,$D1hi 3050 3051 vpaddq $D1hi,$D2lo,$D2lo 3052 3053 vpsrlq \$42,$D2lo,$tmp 3054 vpsllq \$10,$D2hi,$D2hi 3055 vpandq $mask42,$D2lo,$H2 3056 vpaddq $tmp,$D2hi,$D2hi 3057 3058 vpaddq $D2hi,$H0,$H0 3059 vpsllq \$2,$D2hi,$D2hi 3060 3061 vpaddq $D2hi,$H0,$H0 3062 3063 vpsrlq \$44,$H0,$tmp # additional step 3064 vpandq $mask44,$H0,$H0 3065 3066 vpaddq $tmp,$H1,$H1 3067 3068 dec %eax 3069 jz .Ldone_init_vpmadd52 3070 3071 vpunpcklqdq $R1,$H1,$R1 # 1,2 3072 vpbroadcastq %x#$H1,%x#$H1 # 2,2 3073 vpunpcklqdq $R2,$H2,$R2 3074 vpbroadcastq %x#$H2,%x#$H2 3075 vpunpcklqdq $R0,$H0,$R0 3076 vpbroadcastq %x#$H0,%x#$H0 3077 3078 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3079 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3080 vpaddq $R1,$S1,$S1 3081 vpaddq $R2,$S2,$S2 3082 vpsllq \$2,$S1,$S1 3083 vpsllq \$2,$S2,$S2 3084 3085 jmp .Lmul_init_vpmadd52 3086 ud2 3087 3088.align 32 3089.Ldone_init_vpmadd52: 3090 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 3091 vinserti128 
\$1,%x#$R2,$H2,$R2 3092 vinserti128 \$1,%x#$R0,$H0,$R0 3093 3094 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 3095 vpermq \$0b11011000,$R2,$R2 3096 vpermq \$0b11011000,$R0,$R0 3097 3098 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3099 vpaddq $R1,$S1,$S1 3100 vpsllq \$2,$S1,$S1 3101 3102 vmovq 0($ctx),%x#$H0 # load current hash value 3103 vmovq 8($ctx),%x#$H1 3104 vmovq 16($ctx),%x#$H2 3105 3106 test \$3,$len # is length 4*n+2? 3107 jnz .Ldone_init_vpmadd52_2x 3108 3109 vmovdqu64 $R0,64($ctx) # save key powers 3110 vpbroadcastq %x#$R0,$R0 # broadcast 4th power 3111 vmovdqu64 $R1,96($ctx) 3112 vpbroadcastq %x#$R1,$R1 3113 vmovdqu64 $R2,128($ctx) 3114 vpbroadcastq %x#$R2,$R2 3115 vmovdqu64 $S1,160($ctx) 3116 vpbroadcastq %x#$S1,$S1 3117 3118 jmp .Lblocks_vpmadd52_4x_key_loaded 3119 ud2 3120 3121.align 32 3122.Ldone_init_vpmadd52_2x: 3123 vmovdqu64 $R0,64($ctx) # save key powers 3124 vpsrldq \$8,$R0,$R0 # 0-1-0-2 3125 vmovdqu64 $R1,96($ctx) 3126 vpsrldq \$8,$R1,$R1 3127 vmovdqu64 $R2,128($ctx) 3128 vpsrldq \$8,$R2,$R2 3129 vmovdqu64 $S1,160($ctx) 3130 vpsrldq \$8,$S1,$S1 3131 jmp .Lblocks_vpmadd52_2x_key_loaded 3132 ud2 3133 3134.align 32 3135.Lblocks_vpmadd52_2x_do: 3136 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 3137 vmovdqu64 160+8($ctx),${S1}{%k1}{z} 3138 vmovdqu64 64+8($ctx),${R0}{%k1}{z} 3139 vmovdqu64 96+8($ctx),${R1}{%k1}{z} 3140 3141.Lblocks_vpmadd52_2x_key_loaded: 3142 vmovdqu64 16*0($inp),$T2 # load data 3143 vpxorq $T3,$T3,$T3 3144 lea 16*2($inp),$inp 3145 3146 vpunpcklqdq $T3,$T2,$T1 # transpose data 3147 vpunpckhqdq $T3,$T2,$T3 3148 3149 # at this point 64-bit lanes are ordered as x-1-x-0 3150 3151 vpsrlq \$24,$T3,$T2 # splat the data 3152 vporq $PAD,$T2,$T2 3153 vpaddq $T2,$H2,$H2 # accumulate input 3154 vpandq $mask44,$T1,$T0 3155 vpsrlq \$44,$T1,$T1 3156 vpsllq \$20,$T3,$T3 3157 vporq $T3,$T1,$T1 3158 vpandq $mask44,$T1,$T1 3159 3160 jmp .Ltail_vpmadd52_2x 3161 ud2 3162 3163.align 32 3164.Loop_vpmadd52_4x: 3165 #vpaddq $T2,$H2,$H2 # accumulate input 3166 vpaddq $T0,$H0,$H0 3167 vpaddq $T1,$H1,$H1 3168 3169 vpxorq $D0lo,$D0lo,$D0lo 3170 vpmadd52luq $H2,$S1,$D0lo 3171 vpxorq $D0hi,$D0hi,$D0hi 3172 vpmadd52huq $H2,$S1,$D0hi 3173 vpxorq $D1lo,$D1lo,$D1lo 3174 vpmadd52luq $H2,$S2,$D1lo 3175 vpxorq $D1hi,$D1hi,$D1hi 3176 vpmadd52huq $H2,$S2,$D1hi 3177 vpxorq $D2lo,$D2lo,$D2lo 3178 vpmadd52luq $H2,$R0,$D2lo 3179 vpxorq $D2hi,$D2hi,$D2hi 3180 vpmadd52huq $H2,$R0,$D2hi 3181 3182 vmovdqu64 16*0($inp),$T2 # load data 3183 vmovdqu64 16*2($inp),$T3 3184 lea 16*4($inp),$inp 3185 vpmadd52luq $H0,$R0,$D0lo 3186 vpmadd52huq $H0,$R0,$D0hi 3187 vpmadd52luq $H0,$R1,$D1lo 3188 vpmadd52huq $H0,$R1,$D1hi 3189 vpmadd52luq $H0,$R2,$D2lo 3190 vpmadd52huq $H0,$R2,$D2hi 3191 3192 vpunpcklqdq $T3,$T2,$T1 # transpose data 3193 vpunpckhqdq $T3,$T2,$T3 3194 vpmadd52luq $H1,$S2,$D0lo 3195 vpmadd52huq $H1,$S2,$D0hi 3196 vpmadd52luq $H1,$R0,$D1lo 3197 vpmadd52huq $H1,$R0,$D1hi 3198 vpmadd52luq $H1,$R1,$D2lo 3199 vpmadd52huq $H1,$R1,$D2hi 3200 3201 ################################################################ 3202 # partial reduction (interleaved with data splat) 3203 vpsrlq \$44,$D0lo,$tmp 3204 vpsllq \$8,$D0hi,$D0hi 3205 vpandq $mask44,$D0lo,$H0 3206 vpaddq $tmp,$D0hi,$D0hi 3207 3208 vpsrlq \$24,$T3,$T2 3209 vporq $PAD,$T2,$T2 3210 vpaddq $D0hi,$D1lo,$D1lo 3211 3212 vpsrlq \$44,$D1lo,$tmp 3213 vpsllq \$8,$D1hi,$D1hi 3214 vpandq $mask44,$D1lo,$H1 3215 vpaddq $tmp,$D1hi,$D1hi 3216 3217 vpandq $mask44,$T1,$T0 3218 vpsrlq \$44,$T1,$T1 3219 vpsllq \$20,$T3,$T3 3220 vpaddq $D1hi,$D2lo,$D2lo 3221 3222 vpsrlq 
\$42,$D2lo,$tmp 3223 vpsllq \$10,$D2hi,$D2hi 3224 vpandq $mask42,$D2lo,$H2 3225 vpaddq $tmp,$D2hi,$D2hi 3226 3227 vpaddq $T2,$H2,$H2 # accumulate input 3228 vpaddq $D2hi,$H0,$H0 3229 vpsllq \$2,$D2hi,$D2hi 3230 3231 vpaddq $D2hi,$H0,$H0 3232 vporq $T3,$T1,$T1 3233 vpandq $mask44,$T1,$T1 3234 3235 vpsrlq \$44,$H0,$tmp # additional step 3236 vpandq $mask44,$H0,$H0 3237 3238 vpaddq $tmp,$H1,$H1 3239 3240 sub \$4,$len # len-=64 3241 jnz .Loop_vpmadd52_4x 3242 3243.Ltail_vpmadd52_4x: 3244 vmovdqu64 128($ctx),$R2 # load all key powers 3245 vmovdqu64 160($ctx),$S1 3246 vmovdqu64 64($ctx),$R0 3247 vmovdqu64 96($ctx),$R1 3248 3249.Ltail_vpmadd52_2x: 3250 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3251 vpaddq $R2,$S2,$S2 3252 vpsllq \$2,$S2,$S2 3253 3254 #vpaddq $T2,$H2,$H2 # accumulate input 3255 vpaddq $T0,$H0,$H0 3256 vpaddq $T1,$H1,$H1 3257 3258 vpxorq $D0lo,$D0lo,$D0lo 3259 vpmadd52luq $H2,$S1,$D0lo 3260 vpxorq $D0hi,$D0hi,$D0hi 3261 vpmadd52huq $H2,$S1,$D0hi 3262 vpxorq $D1lo,$D1lo,$D1lo 3263 vpmadd52luq $H2,$S2,$D1lo 3264 vpxorq $D1hi,$D1hi,$D1hi 3265 vpmadd52huq $H2,$S2,$D1hi 3266 vpxorq $D2lo,$D2lo,$D2lo 3267 vpmadd52luq $H2,$R0,$D2lo 3268 vpxorq $D2hi,$D2hi,$D2hi 3269 vpmadd52huq $H2,$R0,$D2hi 3270 3271 vpmadd52luq $H0,$R0,$D0lo 3272 vpmadd52huq $H0,$R0,$D0hi 3273 vpmadd52luq $H0,$R1,$D1lo 3274 vpmadd52huq $H0,$R1,$D1hi 3275 vpmadd52luq $H0,$R2,$D2lo 3276 vpmadd52huq $H0,$R2,$D2hi 3277 3278 vpmadd52luq $H1,$S2,$D0lo 3279 vpmadd52huq $H1,$S2,$D0hi 3280 vpmadd52luq $H1,$R0,$D1lo 3281 vpmadd52huq $H1,$R0,$D1hi 3282 vpmadd52luq $H1,$R1,$D2lo 3283 vpmadd52huq $H1,$R1,$D2hi 3284 3285 ################################################################ 3286 # horizontal addition 3287 3288 mov \$1,%eax 3289 kmovw %eax,%k1 3290 vpsrldq \$8,$D0lo,$T0 3291 vpsrldq \$8,$D0hi,$H0 3292 vpsrldq \$8,$D1lo,$T1 3293 vpsrldq \$8,$D1hi,$H1 3294 vpaddq $T0,$D0lo,$D0lo 3295 vpaddq $H0,$D0hi,$D0hi 3296 vpsrldq \$8,$D2lo,$T2 3297 vpsrldq \$8,$D2hi,$H2 3298 vpaddq $T1,$D1lo,$D1lo 3299 vpaddq $H1,$D1hi,$D1hi 3300 vpermq \$0x2,$D0lo,$T0 3301 vpermq \$0x2,$D0hi,$H0 3302 vpaddq $T2,$D2lo,$D2lo 3303 vpaddq $H2,$D2hi,$D2hi 3304 3305 vpermq \$0x2,$D1lo,$T1 3306 vpermq \$0x2,$D1hi,$H1 3307 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3308 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3309 vpermq \$0x2,$D2lo,$T2 3310 vpermq \$0x2,$D2hi,$H2 3311 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3312 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3313 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3314 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3315 3316 ################################################################ 3317 # partial reduction 3318 vpsrlq \$44,$D0lo,$tmp 3319 vpsllq \$8,$D0hi,$D0hi 3320 vpandq $mask44,$D0lo,$H0 3321 vpaddq $tmp,$D0hi,$D0hi 3322 3323 vpaddq $D0hi,$D1lo,$D1lo 3324 3325 vpsrlq \$44,$D1lo,$tmp 3326 vpsllq \$8,$D1hi,$D1hi 3327 vpandq $mask44,$D1lo,$H1 3328 vpaddq $tmp,$D1hi,$D1hi 3329 3330 vpaddq $D1hi,$D2lo,$D2lo 3331 3332 vpsrlq \$42,$D2lo,$tmp 3333 vpsllq \$10,$D2hi,$D2hi 3334 vpandq $mask42,$D2lo,$H2 3335 vpaddq $tmp,$D2hi,$D2hi 3336 3337 vpaddq $D2hi,$H0,$H0 3338 vpsllq \$2,$D2hi,$D2hi 3339 3340 vpaddq $D2hi,$H0,$H0 3341 3342 vpsrlq \$44,$H0,$tmp # additional step 3343 vpandq $mask44,$H0,$H0 3344 3345 vpaddq $tmp,$H1,$H1 3346 # at this point $len is 3347 # either 4*n+2 or 0... 
3348 sub \$2,$len # len-=32 3349 ja .Lblocks_vpmadd52_4x_do 3350 3351 vmovq %x#$H0,0($ctx) 3352 vmovq %x#$H1,8($ctx) 3353 vmovq %x#$H2,16($ctx) 3354 vzeroall 3355 3356.Lno_data_vpmadd52_4x: 3357 ret 3358.cfi_endproc 3359.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 3360___ 3361} 3362{ 3363######################################################################## 3364# As implied by its name 8x subroutine processes 8 blocks in parallel... 3365# This is intermediate version, as it's used only in cases when input 3366# length is either 8*n, 8*n+1 or 8*n+2... 3367 3368my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3369my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3370my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3371my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 3372 3373$code.=<<___; 3374.type poly1305_blocks_vpmadd52_8x,\@function,4 3375.align 32 3376poly1305_blocks_vpmadd52_8x: 3377.cfi_startproc 3378 shr \$4,$len 3379 jz .Lno_data_vpmadd52_8x # too short 3380 3381 shl \$40,$padbit 3382 mov 64($ctx),%r8 # peek on power of the key 3383 3384 vmovdqa64 .Lx_mask44(%rip),$mask44 3385 vmovdqa64 .Lx_mask42(%rip),$mask42 3386 3387 test %r8,%r8 # is power value impossible? 3388 js .Linit_vpmadd52 # if it is, then init R[4] 3389 3390 vmovq 0($ctx),%x#$H0 # load current hash value 3391 vmovq 8($ctx),%x#$H1 3392 vmovq 16($ctx),%x#$H2 3393 3394.Lblocks_vpmadd52_8x: 3395 ################################################################ 3396 # fist we calculate more key powers 3397 3398 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3399 vmovdqu64 160($ctx),$S1 3400 vmovdqu64 64($ctx),$R0 3401 vmovdqu64 96($ctx),$R1 3402 3403 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3404 vpaddq $R2,$S2,$S2 3405 vpsllq \$2,$S2,$S2 3406 3407 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3408 vpbroadcastq %x#$R0,$RR0 3409 vpbroadcastq %x#$R1,$RR1 3410 3411 vpxorq $D0lo,$D0lo,$D0lo 3412 vpmadd52luq $RR2,$S1,$D0lo 3413 vpxorq $D0hi,$D0hi,$D0hi 3414 vpmadd52huq $RR2,$S1,$D0hi 3415 vpxorq $D1lo,$D1lo,$D1lo 3416 vpmadd52luq $RR2,$S2,$D1lo 3417 vpxorq $D1hi,$D1hi,$D1hi 3418 vpmadd52huq $RR2,$S2,$D1hi 3419 vpxorq $D2lo,$D2lo,$D2lo 3420 vpmadd52luq $RR2,$R0,$D2lo 3421 vpxorq $D2hi,$D2hi,$D2hi 3422 vpmadd52huq $RR2,$R0,$D2hi 3423 3424 vpmadd52luq $RR0,$R0,$D0lo 3425 vpmadd52huq $RR0,$R0,$D0hi 3426 vpmadd52luq $RR0,$R1,$D1lo 3427 vpmadd52huq $RR0,$R1,$D1hi 3428 vpmadd52luq $RR0,$R2,$D2lo 3429 vpmadd52huq $RR0,$R2,$D2hi 3430 3431 vpmadd52luq $RR1,$S2,$D0lo 3432 vpmadd52huq $RR1,$S2,$D0hi 3433 vpmadd52luq $RR1,$R0,$D1lo 3434 vpmadd52huq $RR1,$R0,$D1hi 3435 vpmadd52luq $RR1,$R1,$D2lo 3436 vpmadd52huq $RR1,$R1,$D2hi 3437 3438 ################################################################ 3439 # partial reduction 3440 vpsrlq \$44,$D0lo,$tmp 3441 vpsllq \$8,$D0hi,$D0hi 3442 vpandq $mask44,$D0lo,$RR0 3443 vpaddq $tmp,$D0hi,$D0hi 3444 3445 vpaddq $D0hi,$D1lo,$D1lo 3446 3447 vpsrlq \$44,$D1lo,$tmp 3448 vpsllq \$8,$D1hi,$D1hi 3449 vpandq $mask44,$D1lo,$RR1 3450 vpaddq $tmp,$D1hi,$D1hi 3451 3452 vpaddq $D1hi,$D2lo,$D2lo 3453 3454 vpsrlq \$42,$D2lo,$tmp 3455 vpsllq \$10,$D2hi,$D2hi 3456 vpandq $mask42,$D2lo,$RR2 3457 vpaddq $tmp,$D2hi,$D2hi 3458 3459 vpaddq $D2hi,$RR0,$RR0 3460 vpsllq \$2,$D2hi,$D2hi 3461 3462 vpaddq $D2hi,$RR0,$RR0 3463 3464 vpsrlq \$44,$RR0,$tmp # additional step 3465 vpandq $mask44,$RR0,$RR0 3466 3467 vpaddq $tmp,$RR1,$RR1 3468 3469 ################################################################ 3470 # At this point Rx holds 1324 powers, RRx 
- 5768, and the goal 3471 # is 15263748, which reflects how data is loaded... 3472 3473 vpunpcklqdq $R2,$RR2,$T2 # 3748 3474 vpunpckhqdq $R2,$RR2,$R2 # 1526 3475 vpunpcklqdq $R0,$RR0,$T0 3476 vpunpckhqdq $R0,$RR0,$R0 3477 vpunpcklqdq $R1,$RR1,$T1 3478 vpunpckhqdq $R1,$RR1,$R1 3479___ 3480######## switch to %zmm 3481map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3482map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3483map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3484map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3485 3486$code.=<<___; 3487 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3488 vshufi64x2 \$0x44,$R0,$T0,$RR0 3489 vshufi64x2 \$0x44,$R1,$T1,$RR1 3490 3491 vmovdqu64 16*0($inp),$T2 # load data 3492 vmovdqu64 16*4($inp),$T3 3493 lea 16*8($inp),$inp 3494 3495 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3496 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3497 vpaddq $RR2,$SS2,$SS2 3498 vpaddq $RR1,$SS1,$SS1 3499 vpsllq \$2,$SS2,$SS2 3500 vpsllq \$2,$SS1,$SS1 3501 3502 vpbroadcastq $padbit,$PAD 3503 vpbroadcastq %x#$mask44,$mask44 3504 vpbroadcastq %x#$mask42,$mask42 3505 3506 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3507 vpbroadcastq %x#$SS2,$S2 3508 vpbroadcastq %x#$RR0,$R0 3509 vpbroadcastq %x#$RR1,$R1 3510 vpbroadcastq %x#$RR2,$R2 3511 3512 vpunpcklqdq $T3,$T2,$T1 # transpose data 3513 vpunpckhqdq $T3,$T2,$T3 3514 3515 # at this point 64-bit lanes are ordered as 73625140 3516 3517 vpsrlq \$24,$T3,$T2 # splat the data 3518 vporq $PAD,$T2,$T2 3519 vpaddq $T2,$H2,$H2 # accumulate input 3520 vpandq $mask44,$T1,$T0 3521 vpsrlq \$44,$T1,$T1 3522 vpsllq \$20,$T3,$T3 3523 vporq $T3,$T1,$T1 3524 vpandq $mask44,$T1,$T1 3525 3526 sub \$8,$len 3527 jz .Ltail_vpmadd52_8x 3528 jmp .Loop_vpmadd52_8x 3529 3530.align 32 3531.Loop_vpmadd52_8x: 3532 #vpaddq $T2,$H2,$H2 # accumulate input 3533 vpaddq $T0,$H0,$H0 3534 vpaddq $T1,$H1,$H1 3535 3536 vpxorq $D0lo,$D0lo,$D0lo 3537 vpmadd52luq $H2,$S1,$D0lo 3538 vpxorq $D0hi,$D0hi,$D0hi 3539 vpmadd52huq $H2,$S1,$D0hi 3540 vpxorq $D1lo,$D1lo,$D1lo 3541 vpmadd52luq $H2,$S2,$D1lo 3542 vpxorq $D1hi,$D1hi,$D1hi 3543 vpmadd52huq $H2,$S2,$D1hi 3544 vpxorq $D2lo,$D2lo,$D2lo 3545 vpmadd52luq $H2,$R0,$D2lo 3546 vpxorq $D2hi,$D2hi,$D2hi 3547 vpmadd52huq $H2,$R0,$D2hi 3548 3549 vmovdqu64 16*0($inp),$T2 # load data 3550 vmovdqu64 16*4($inp),$T3 3551 lea 16*8($inp),$inp 3552 vpmadd52luq $H0,$R0,$D0lo 3553 vpmadd52huq $H0,$R0,$D0hi 3554 vpmadd52luq $H0,$R1,$D1lo 3555 vpmadd52huq $H0,$R1,$D1hi 3556 vpmadd52luq $H0,$R2,$D2lo 3557 vpmadd52huq $H0,$R2,$D2hi 3558 3559 vpunpcklqdq $T3,$T2,$T1 # transpose data 3560 vpunpckhqdq $T3,$T2,$T3 3561 vpmadd52luq $H1,$S2,$D0lo 3562 vpmadd52huq $H1,$S2,$D0hi 3563 vpmadd52luq $H1,$R0,$D1lo 3564 vpmadd52huq $H1,$R0,$D1hi 3565 vpmadd52luq $H1,$R1,$D2lo 3566 vpmadd52huq $H1,$R1,$D2hi 3567 3568 ################################################################ 3569 # partial reduction (interleaved with data splat) 3570 vpsrlq \$44,$D0lo,$tmp 3571 vpsllq \$8,$D0hi,$D0hi 3572 vpandq $mask44,$D0lo,$H0 3573 vpaddq $tmp,$D0hi,$D0hi 3574 3575 vpsrlq \$24,$T3,$T2 3576 vporq $PAD,$T2,$T2 3577 vpaddq $D0hi,$D1lo,$D1lo 3578 3579 vpsrlq \$44,$D1lo,$tmp 3580 vpsllq \$8,$D1hi,$D1hi 3581 vpandq $mask44,$D1lo,$H1 3582 vpaddq $tmp,$D1hi,$D1hi 3583 3584 vpandq $mask44,$T1,$T0 3585 vpsrlq \$44,$T1,$T1 3586 vpsllq \$20,$T3,$T3 3587 vpaddq $D1hi,$D2lo,$D2lo 3588 3589 vpsrlq \$42,$D2lo,$tmp 3590 vpsllq \$10,$D2hi,$D2hi 3591 vpandq $mask42,$D2lo,$H2 3592 vpaddq $tmp,$D2hi,$D2hi 3593 3594 vpaddq $T2,$H2,$H2 # accumulate input 3595 vpaddq $D2hi,$H0,$H0 3596 vpsllq 
\$2,$D2hi,$D2hi 3597 3598 vpaddq $D2hi,$H0,$H0 3599 vporq $T3,$T1,$T1 3600 vpandq $mask44,$T1,$T1 3601 3602 vpsrlq \$44,$H0,$tmp # additional step 3603 vpandq $mask44,$H0,$H0 3604 3605 vpaddq $tmp,$H1,$H1 3606 3607 sub \$8,$len # len-=128 3608 jnz .Loop_vpmadd52_8x 3609 3610.Ltail_vpmadd52_8x: 3611 #vpaddq $T2,$H2,$H2 # accumulate input 3612 vpaddq $T0,$H0,$H0 3613 vpaddq $T1,$H1,$H1 3614 3615 vpxorq $D0lo,$D0lo,$D0lo 3616 vpmadd52luq $H2,$SS1,$D0lo 3617 vpxorq $D0hi,$D0hi,$D0hi 3618 vpmadd52huq $H2,$SS1,$D0hi 3619 vpxorq $D1lo,$D1lo,$D1lo 3620 vpmadd52luq $H2,$SS2,$D1lo 3621 vpxorq $D1hi,$D1hi,$D1hi 3622 vpmadd52huq $H2,$SS2,$D1hi 3623 vpxorq $D2lo,$D2lo,$D2lo 3624 vpmadd52luq $H2,$RR0,$D2lo 3625 vpxorq $D2hi,$D2hi,$D2hi 3626 vpmadd52huq $H2,$RR0,$D2hi 3627 3628 vpmadd52luq $H0,$RR0,$D0lo 3629 vpmadd52huq $H0,$RR0,$D0hi 3630 vpmadd52luq $H0,$RR1,$D1lo 3631 vpmadd52huq $H0,$RR1,$D1hi 3632 vpmadd52luq $H0,$RR2,$D2lo 3633 vpmadd52huq $H0,$RR2,$D2hi 3634 3635 vpmadd52luq $H1,$SS2,$D0lo 3636 vpmadd52huq $H1,$SS2,$D0hi 3637 vpmadd52luq $H1,$RR0,$D1lo 3638 vpmadd52huq $H1,$RR0,$D1hi 3639 vpmadd52luq $H1,$RR1,$D2lo 3640 vpmadd52huq $H1,$RR1,$D2hi 3641 3642 ################################################################ 3643 # horizontal addition 3644 3645 mov \$1,%eax 3646 kmovw %eax,%k1 3647 vpsrldq \$8,$D0lo,$T0 3648 vpsrldq \$8,$D0hi,$H0 3649 vpsrldq \$8,$D1lo,$T1 3650 vpsrldq \$8,$D1hi,$H1 3651 vpaddq $T0,$D0lo,$D0lo 3652 vpaddq $H0,$D0hi,$D0hi 3653 vpsrldq \$8,$D2lo,$T2 3654 vpsrldq \$8,$D2hi,$H2 3655 vpaddq $T1,$D1lo,$D1lo 3656 vpaddq $H1,$D1hi,$D1hi 3657 vpermq \$0x2,$D0lo,$T0 3658 vpermq \$0x2,$D0hi,$H0 3659 vpaddq $T2,$D2lo,$D2lo 3660 vpaddq $H2,$D2hi,$D2hi 3661 3662 vpermq \$0x2,$D1lo,$T1 3663 vpermq \$0x2,$D1hi,$H1 3664 vpaddq $T0,$D0lo,$D0lo 3665 vpaddq $H0,$D0hi,$D0hi 3666 vpermq \$0x2,$D2lo,$T2 3667 vpermq \$0x2,$D2hi,$H2 3668 vpaddq $T1,$D1lo,$D1lo 3669 vpaddq $H1,$D1hi,$D1hi 3670 vextracti64x4 \$1,$D0lo,%y#$T0 3671 vextracti64x4 \$1,$D0hi,%y#$H0 3672 vpaddq $T2,$D2lo,$D2lo 3673 vpaddq $H2,$D2hi,$D2hi 3674 3675 vextracti64x4 \$1,$D1lo,%y#$T1 3676 vextracti64x4 \$1,$D1hi,%y#$H1 3677 vextracti64x4 \$1,$D2lo,%y#$T2 3678 vextracti64x4 \$1,$D2hi,%y#$H2 3679___ 3680######## switch back to %ymm 3681map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3682map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3683map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3684 3685$code.=<<___; 3686 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3687 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3688 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3689 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3690 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3691 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3692 3693 ################################################################ 3694 # partial reduction 3695 vpsrlq \$44,$D0lo,$tmp 3696 vpsllq \$8,$D0hi,$D0hi 3697 vpandq $mask44,$D0lo,$H0 3698 vpaddq $tmp,$D0hi,$D0hi 3699 3700 vpaddq $D0hi,$D1lo,$D1lo 3701 3702 vpsrlq \$44,$D1lo,$tmp 3703 vpsllq \$8,$D1hi,$D1hi 3704 vpandq $mask44,$D1lo,$H1 3705 vpaddq $tmp,$D1hi,$D1hi 3706 3707 vpaddq $D1hi,$D2lo,$D2lo 3708 3709 vpsrlq \$42,$D2lo,$tmp 3710 vpsllq \$10,$D2hi,$D2hi 3711 vpandq $mask42,$D2lo,$H2 3712 vpaddq $tmp,$D2hi,$D2hi 3713 3714 vpaddq $D2hi,$H0,$H0 3715 vpsllq \$2,$D2hi,$D2hi 3716 3717 vpaddq $D2hi,$H0,$H0 3718 3719 vpsrlq \$44,$H0,$tmp # additional step 3720 vpandq $mask44,$H0,$H0 3721 3722 vpaddq $tmp,$H1,$H1 3723 3724 ################################################################ 3725 3726 vmovq %x#$H0,0($ctx) 3727 vmovq %x#$H1,8($ctx) 3728 vmovq %x#$H2,16($ctx) 3729 vzeroall 
3730 3731.Lno_data_vpmadd52_8x: 3732 ret 3733.cfi_endproc 3734.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3735___ 3736} 3737$code.=<<___; 3738.type poly1305_emit_base2_44,\@function,3 3739.align 32 3740poly1305_emit_base2_44: 3741.cfi_startproc 3742 mov 0($ctx),%r8 # load hash value 3743 mov 8($ctx),%r9 3744 mov 16($ctx),%r10 3745 3746 mov %r9,%rax 3747 shr \$20,%r9 3748 shl \$44,%rax 3749 mov %r10,%rcx 3750 shr \$40,%r10 3751 shl \$24,%rcx 3752 3753 add %rax,%r8 3754 adc %rcx,%r9 3755 adc \$0,%r10 3756 3757 mov %r8,%rax 3758 add \$5,%r8 # compare to modulus 3759 mov %r9,%rcx 3760 adc \$0,%r9 3761 adc \$0,%r10 3762 shr \$2,%r10 # did 130-bit value overflow? 3763 cmovnz %r8,%rax 3764 cmovnz %r9,%rcx 3765 3766 add 0($nonce),%rax # accumulate nonce 3767 adc 8($nonce),%rcx 3768 mov %rax,0($mac) # write result 3769 mov %rcx,8($mac) 3770 3771 ret 3772.cfi_endproc 3773.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 3774___ 3775} } } 3776$code.=<<___; 3777.align 64 3778.Lconst: 3779.Lmask24: 3780.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 3781.L129: 3782.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 3783.Lmask26: 3784.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 3785.Lpermd_avx2: 3786.long 2,2,2,3,2,0,2,1 3787.Lpermd_avx512: 3788.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 3789 3790.L2_44_inp_permd: 3791.long 0,1,1,2,2,3,7,7 3792.L2_44_inp_shift: 3793.quad 0,12,24,64 3794.L2_44_mask: 3795.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff 3796.L2_44_shift_rgt: 3797.quad 44,44,42,64 3798.L2_44_shift_lft: 3799.quad 8,8,10,64 3800 3801.align 64 3802.Lx_mask44: 3803.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3804.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3805.Lx_mask42: 3806.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3807.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3808___ 3809} 3810$code.=<<___; 3811.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3812.align 16 3813___ 3814 3815{ # chacha20-poly1305 helpers 3816my ($out,$inp,$otp,$len)=$win64 ? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order 3817 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 3818$code.=<<___; 3819.globl xor128_encrypt_n_pad 3820.type xor128_encrypt_n_pad,\@abi-omnipotent 3821.align 16 3822xor128_encrypt_n_pad: 3823.cfi_startproc 3824 sub $otp,$inp 3825 sub $otp,$out 3826 mov $len,%r10 # put len aside 3827 shr \$4,$len # len / 16 3828 jz .Ltail_enc 3829 nop 3830.Loop_enc_xmm: 3831 movdqu ($inp,$otp),%xmm0 3832 pxor ($otp),%xmm0 3833 movdqu %xmm0,($out,$otp) 3834 movdqa %xmm0,($otp) 3835 lea 16($otp),$otp 3836 dec $len 3837 jnz .Loop_enc_xmm 3838 3839 and \$15,%r10 # len % 16 3840 jz .Ldone_enc 3841 3842.Ltail_enc: 3843 mov \$16,$len 3844 sub %r10,$len 3845 xor %eax,%eax 3846.Loop_enc_byte: 3847 mov ($inp,$otp),%al 3848 xor ($otp),%al 3849 mov %al,($out,$otp) 3850 mov %al,($otp) 3851 lea 1($otp),$otp 3852 dec %r10 3853 jnz .Loop_enc_byte 3854 3855 xor %eax,%eax 3856.Loop_enc_pad: 3857 mov %al,($otp) 3858 lea 1($otp),$otp 3859 dec $len 3860 jnz .Loop_enc_pad 3861 3862.Ldone_enc: 3863 mov $otp,%rax 3864 ret 3865.cfi_endproc 3866.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 3867 3868.globl xor128_decrypt_n_pad 3869.type xor128_decrypt_n_pad,\@abi-omnipotent 3870.align 16 3871xor128_decrypt_n_pad: 3872.cfi_startproc 3873 sub $otp,$inp 3874 sub $otp,$out 3875 mov $len,%r10 # put len aside 3876 shr \$4,$len # len / 16 3877 jz .Ltail_dec 3878 nop 3879.Loop_dec_xmm: 3880 movdqu ($inp,$otp),%xmm0 3881 movdqa ($otp),%xmm1 3882 pxor %xmm0,%xmm1 3883 movdqu %xmm1,($out,$otp) 3884 movdqa %xmm0,($otp) 3885 lea 16($otp),$otp 3886 dec $len 3887 jnz .Loop_dec_xmm 3888 3889 pxor %xmm1,%xmm1 3890 and \$15,%r10 # len % 16 3891 jz .Ldone_dec 3892 3893.Ltail_dec: 3894 mov \$16,$len 3895 sub %r10,$len 3896 xor %eax,%eax 3897 xor %r11,%r11 3898.Loop_dec_byte: 3899 mov ($inp,$otp),%r11b 3900 mov ($otp),%al 3901 xor %r11b,%al 3902 mov %al,($out,$otp) 3903 mov %r11b,($otp) 3904 lea 1($otp),$otp 3905 dec %r10 3906 jnz .Loop_dec_byte 3907 3908 xor %eax,%eax 3909.Loop_dec_pad: 3910 mov %al,($otp) 3911 lea 1($otp),$otp 3912 dec $len 3913 jnz .Loop_dec_pad 3914 3915.Ldone_dec: 3916 mov $otp,%rax 3917 ret 3918.cfi_endproc 3919.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 3920___ 3921} 3922 3923# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3924# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3925if ($win64) { 3926$rec="%rcx"; 3927$frame="%rdx"; 3928$context="%r8"; 3929$disp="%r9"; 3930 3931$code.=<<___; 3932.extern __imp_RtlVirtualUnwind 3933.type se_handler,\@abi-omnipotent 3934.align 16 3935se_handler: 3936 push %rsi 3937 push %rdi 3938 push %rbx 3939 push %rbp 3940 push %r12 3941 push %r13 3942 push %r14 3943 push %r15 3944 pushfq 3945 sub \$64,%rsp 3946 3947 mov 120($context),%rax # pull context->Rax 3948 mov 248($context),%rbx # pull context->Rip 3949 3950 mov 8($disp),%rsi # disp->ImageBase 3951 mov 56($disp),%r11 # disp->HandlerData 3952 3953 mov 0(%r11),%r10d # HandlerData[0] 3954 lea (%rsi,%r10),%r10 # prologue label 3955 cmp %r10,%rbx # context->Rip<.Lprologue 3956 jb .Lcommon_seh_tail 3957 3958 mov 152($context),%rax # pull context->Rsp 3959 3960 mov 4(%r11),%r10d # HandlerData[1] 3961 lea (%rsi,%r10),%r10 # epilogue label 3962 cmp %r10,%rbx # context->Rip>=.Lepilogue 3963 jae .Lcommon_seh_tail 3964 3965 lea 48(%rax),%rax 3966 3967 mov -8(%rax),%rbx 3968 mov -16(%rax),%rbp 3969 mov -24(%rax),%r12 3970 mov -32(%rax),%r13 3971 mov -40(%rax),%r14 3972 mov -48(%rax),%r15 3973 mov %rbx,144($context) # restore context->Rbx 3974 mov %rbp,160($context) # restore 
context->Rbp 3975 mov %r12,216($context) # restore context->R12 3976 mov %r13,224($context) # restore context->R13 3977 mov %r14,232($context) # restore context->R14 3978 mov %r15,240($context) # restore context->R14 3979 3980 jmp .Lcommon_seh_tail 3981.size se_handler,.-se_handler 3982 3983.type avx_handler,\@abi-omnipotent 3984.align 16 3985avx_handler: 3986 push %rsi 3987 push %rdi 3988 push %rbx 3989 push %rbp 3990 push %r12 3991 push %r13 3992 push %r14 3993 push %r15 3994 pushfq 3995 sub \$64,%rsp 3996 3997 mov 120($context),%rax # pull context->Rax 3998 mov 248($context),%rbx # pull context->Rip 3999 4000 mov 8($disp),%rsi # disp->ImageBase 4001 mov 56($disp),%r11 # disp->HandlerData 4002 4003 mov 0(%r11),%r10d # HandlerData[0] 4004 lea (%rsi,%r10),%r10 # prologue label 4005 cmp %r10,%rbx # context->Rip<prologue label 4006 jb .Lcommon_seh_tail 4007 4008 mov 152($context),%rax # pull context->Rsp 4009 4010 mov 4(%r11),%r10d # HandlerData[1] 4011 lea (%rsi,%r10),%r10 # epilogue label 4012 cmp %r10,%rbx # context->Rip>=epilogue label 4013 jae .Lcommon_seh_tail 4014 4015 mov 208($context),%rax # pull context->R11 4016 4017 lea 0x50(%rax),%rsi 4018 lea 0xf8(%rax),%rax 4019 lea 512($context),%rdi # &context.Xmm6 4020 mov \$20,%ecx 4021 .long 0xa548f3fc # cld; rep movsq 4022 4023.Lcommon_seh_tail: 4024 mov 8(%rax),%rdi 4025 mov 16(%rax),%rsi 4026 mov %rax,152($context) # restore context->Rsp 4027 mov %rsi,168($context) # restore context->Rsi 4028 mov %rdi,176($context) # restore context->Rdi 4029 4030 mov 40($disp),%rdi # disp->ContextRecord 4031 mov $context,%rsi # context 4032 mov \$154,%ecx # sizeof(CONTEXT) 4033 .long 0xa548f3fc # cld; rep movsq 4034 4035 mov $disp,%rsi 4036 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4037 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4038 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4039 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4040 mov 40(%rsi),%r10 # disp->ContextRecord 4041 lea 56(%rsi),%r11 # &disp->HandlerData 4042 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4043 mov %r10,32(%rsp) # arg5 4044 mov %r11,40(%rsp) # arg6 4045 mov %r12,48(%rsp) # arg7 4046 mov %rcx,56(%rsp) # arg8, (NULL) 4047 call *__imp_RtlVirtualUnwind(%rip) 4048 4049 mov \$1,%eax # ExceptionContinueSearch 4050 add \$64,%rsp 4051 popfq 4052 pop %r15 4053 pop %r14 4054 pop %r13 4055 pop %r12 4056 pop %rbp 4057 pop %rbx 4058 pop %rdi 4059 pop %rsi 4060 ret 4061.size avx_handler,.-avx_handler 4062 4063.section .pdata 4064.align 4 4065 .rva .LSEH_begin_poly1305_init 4066 .rva .LSEH_end_poly1305_init 4067 .rva .LSEH_info_poly1305_init 4068 4069 .rva .LSEH_begin_poly1305_blocks 4070 .rva .LSEH_end_poly1305_blocks 4071 .rva .LSEH_info_poly1305_blocks 4072 4073 .rva .LSEH_begin_poly1305_emit 4074 .rva .LSEH_end_poly1305_emit 4075 .rva .LSEH_info_poly1305_emit 4076___ 4077$code.=<<___ if ($avx); 4078 .rva .LSEH_begin_poly1305_blocks_avx 4079 .rva .Lbase2_64_avx 4080 .rva .LSEH_info_poly1305_blocks_avx_1 4081 4082 .rva .Lbase2_64_avx 4083 .rva .Leven_avx 4084 .rva .LSEH_info_poly1305_blocks_avx_2 4085 4086 .rva .Leven_avx 4087 .rva .LSEH_end_poly1305_blocks_avx 4088 .rva .LSEH_info_poly1305_blocks_avx_3 4089 4090 .rva .LSEH_begin_poly1305_emit_avx 4091 .rva .LSEH_end_poly1305_emit_avx 4092 .rva .LSEH_info_poly1305_emit_avx 4093___ 4094$code.=<<___ if ($avx>1); 4095 .rva .LSEH_begin_poly1305_blocks_avx2 4096 .rva .Lbase2_64_avx2 4097 .rva .LSEH_info_poly1305_blocks_avx2_1 4098 4099 .rva .Lbase2_64_avx2 4100 .rva .Leven_avx2 4101 .rva .LSEH_info_poly1305_blocks_avx2_2 4102 4103 .rva .Leven_avx2 4104 
.rva .LSEH_end_poly1305_blocks_avx2 4105 .rva .LSEH_info_poly1305_blocks_avx2_3 4106___ 4107$code.=<<___ if ($avx>2); 4108 .rva .LSEH_begin_poly1305_blocks_avx512 4109 .rva .LSEH_end_poly1305_blocks_avx512 4110 .rva .LSEH_info_poly1305_blocks_avx512 4111___ 4112$code.=<<___; 4113.section .xdata 4114.align 8 4115.LSEH_info_poly1305_init: 4116 .byte 9,0,0,0 4117 .rva se_handler 4118 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init 4119 4120.LSEH_info_poly1305_blocks: 4121 .byte 9,0,0,0 4122 .rva se_handler 4123 .rva .Lblocks_body,.Lblocks_epilogue 4124 4125.LSEH_info_poly1305_emit: 4126 .byte 9,0,0,0 4127 .rva se_handler 4128 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit 4129___ 4130$code.=<<___ if ($avx); 4131.LSEH_info_poly1305_blocks_avx_1: 4132 .byte 9,0,0,0 4133 .rva se_handler 4134 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] 4135 4136.LSEH_info_poly1305_blocks_avx_2: 4137 .byte 9,0,0,0 4138 .rva se_handler 4139 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] 4140 4141.LSEH_info_poly1305_blocks_avx_3: 4142 .byte 9,0,0,0 4143 .rva avx_handler 4144 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] 4145 4146.LSEH_info_poly1305_emit_avx: 4147 .byte 9,0,0,0 4148 .rva se_handler 4149 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 4150___ 4151$code.=<<___ if ($avx>1); 4152.LSEH_info_poly1305_blocks_avx2_1: 4153 .byte 9,0,0,0 4154 .rva se_handler 4155 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 4156 4157.LSEH_info_poly1305_blocks_avx2_2: 4158 .byte 9,0,0,0 4159 .rva se_handler 4160 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 4161 4162.LSEH_info_poly1305_blocks_avx2_3: 4163 .byte 9,0,0,0 4164 .rva avx_handler 4165 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 4166___ 4167$code.=<<___ if ($avx>2); 4168.LSEH_info_poly1305_blocks_avx512: 4169 .byte 9,0,0,0 4170 .rva avx_handler 4171 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 4172___ 4173} 4174 4175foreach (split('\n',$code)) { 4176 s/\`([^\`]*)\`/eval($1)/ge; 4177 s/%r([a-z]+)#d/%e$1/g; 4178 s/%r([0-9]+)#d/%r$1d/g; 4179 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 4180 4181 print $_,"\n"; 4182} 4183close STDOUT or die "error closing STDOUT: $!"; 4184
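# The substitutions in the output loop above expand the register-naming
# shorthands used while composing $code; a few illustrative instances:
#
#	%rax#d		->	%eax	(32-bit alias of a legacy register)
#	%r10#d		->	%r10d	(32-bit alias of a numbered register)
#	%x#%ymm3	->	%xmm3	(128-bit form of a vector register)
#	%y#%zmm7	->	%ymm7	(256-bit form)
#	%z#%ymm1	->	%zmm1	(512-bit form)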