1#! /usr/bin/env perl 2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements Poly1305 hash for x86_64. 18# 19# March 2015 20# 21# Initial release. 22# 23# December 2016 24# 25# Add AVX512F+VL+BW code path. 26# 27# November 2017 28# 29# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be 30# executed even on Knights Landing. Trigger for modification was 31# observation that AVX512 code paths can negatively affect overall 32# Skylake-X system performance. Since we are likely to suppress 33# AVX512F capability flag [at least on Skylake-X], conversion serves 34# as kind of "investment protection". Note that next *lake processor, 35# Cannolake, has AVX512IFMA code path to execute... 36# 37# Numbers are cycles per processed byte with poly1305_blocks alone, 38# measured with rdtsc at fixed clock frequency. 39# 40# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 41# P4 4.46/+120% - 42# Core 2 2.41/+90% - 43# Westmere 1.88/+120% - 44# Sandy Bridge 1.39/+140% 1.10 45# Haswell 1.14/+175% 1.11 0.65 46# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] 47# Silvermont 2.83/+95% - 48# Knights L 3.60/? 1.65 1.10 0.41(***) 49# Goldmont 1.70/+180% - 50# VIA Nano 1.82/+150% - 51# Sledgehammer 1.38/+160% - 52# Bulldozer 2.30/+130% 0.97 53# Ryzen 1.15/+200% 1.08 1.18 54# 55# (*) improvement coefficients relative to clang are more modest and 56# are ~50% on most processors, in both cases we are comparing to 57# __int128 code; 58# (**) SSE2 implementation was attempted, but among non-AVX processors 59# it was faster than integer-only code only on older Intel P4 and 60# Core processors, 50-30%, less newer processor is, but slower on 61# contemporary ones, for example almost 2x slower on Atom, and as 62# former are naturally disappearing, SSE2 is deemed unnecessary; 63# (***) strangely enough performance seems to vary from core to core, 64# listed result is best case; 65 66# $output is the last argument if it looks like a file (it has an extension) 67# $flavour is the first argument if it doesn't look like a file 68$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 69$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 70 71$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 72 73$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 74( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 75( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 76die "can't locate x86_64-xlate.pl"; 77 78if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 79 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 80 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); 81} 82 83if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 84 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { 85 $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); 86 $avx += 2 if ($1==2.11 && $2>=8); 87} 88 89if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 90 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 91 $avx = ($1>=10) + ($1>=12); 92} 93 94if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { 95 $avx = ($2>=3.0) + ($2>3.0); 96} 97 98open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" 99 or die "can't call $xlate: $!"; 100*STDOUT=*OUT; 101 102my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); 103my ($mac,$nonce)=($inp,$len); # *_emit arguments 104my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); 105my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); 106 107sub poly1305_iteration { 108# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 109# output: $h0-$h2 *= $r0-$r1 110$code.=<<___; 111 mulq $h0 # h0*r1 112 mov %rax,$d2 113 mov $r0,%rax 114 mov %rdx,$d3 115 116 mulq $h0 # h0*r0 117 mov %rax,$h0 # future $h0 118 mov $r0,%rax 119 mov %rdx,$d1 120 121 mulq $h1 # h1*r0 122 add %rax,$d2 123 mov $s1,%rax 124 adc %rdx,$d3 125 126 mulq $h1 # h1*s1 127 mov $h2,$h1 # borrow $h1 128 add %rax,$h0 129 adc %rdx,$d1 130 131 imulq $s1,$h1 # h2*s1 132 add $h1,$d2 133 mov $d1,$h1 134 adc \$0,$d3 135 136 imulq $r0,$h2 # h2*r0 137 add $d2,$h1 138 mov \$-4,%rax # mask value 139 adc $h2,$d3 140 141 and $d3,%rax # last reduction step 142 mov $d3,$h2 143 shr \$2,$d3 144 and \$3,$h2 145 add $d3,%rax 146 add %rax,$h0 147 adc \$0,$h1 148 adc \$0,$h2 149___ 150} 151 152######################################################################## 153# Layout of opaque area is following. 154# 155# unsigned __int64 h[3]; # current hash value base 2^64 156# unsigned __int64 r[2]; # key value base 2^64 157 158$code.=<<___; 159.text 160 161.extern OPENSSL_ia32cap_P 162 163.globl poly1305_init 164.hidden poly1305_init 165.globl poly1305_blocks 166.hidden poly1305_blocks 167.globl poly1305_emit 168.hidden poly1305_emit 169 170.type poly1305_init,\@function,3 171.align 32 172poly1305_init: 173.cfi_startproc 174 xor %rax,%rax 175 mov %rax,0($ctx) # initialize hash value 176 mov %rax,8($ctx) 177 mov %rax,16($ctx) 178 179 cmp \$0,$inp 180 je .Lno_key 181 182 lea poly1305_blocks(%rip),%r10 183 lea poly1305_emit(%rip),%r11 184___ 185$code.=<<___ if ($avx); 186 mov OPENSSL_ia32cap_P+4(%rip),%r9 187 lea poly1305_blocks_avx(%rip),%rax 188 lea poly1305_emit_avx(%rip),%rcx 189 bt \$`60-32`,%r9 # AVX? 190 cmovc %rax,%r10 191 cmovc %rcx,%r11 192___ 193$code.=<<___ if ($avx>1); 194 lea poly1305_blocks_avx2(%rip),%rax 195 bt \$`5+32`,%r9 # AVX2? 196 cmovc %rax,%r10 197___ 198$code.=<<___ if ($avx>3); 199 mov \$`(1<<31|1<<21|1<<16)`,%rax 200 shr \$32,%r9 201 and %rax,%r9 202 cmp %rax,%r9 203 je .Linit_base2_44 204___ 205$code.=<<___; 206 mov \$0x0ffffffc0fffffff,%rax 207 mov \$0x0ffffffc0ffffffc,%rcx 208 and 0($inp),%rax 209 and 8($inp),%rcx 210 mov %rax,24($ctx) 211 mov %rcx,32($ctx) 212___ 213$code.=<<___ if ($flavour !~ /elf32/); 214 mov %r10,0(%rdx) 215 mov %r11,8(%rdx) 216___ 217$code.=<<___ if ($flavour =~ /elf32/); 218 mov %r10d,0(%rdx) 219 mov %r11d,4(%rdx) 220___ 221$code.=<<___; 222 mov \$1,%eax 223.Lno_key: 224 ret 225.cfi_endproc 226.size poly1305_init,.-poly1305_init 227 228.type poly1305_blocks,\@function,4 229.align 32 230poly1305_blocks: 231.cfi_startproc 232.Lblocks: 233 shr \$4,$len 234 jz .Lno_data # too short 235 236 push %rbx 237.cfi_push %rbx 238 push %rbp 239.cfi_push %rbp 240 push %r12 241.cfi_push %r12 242 push %r13 243.cfi_push %r13 244 push %r14 245.cfi_push %r14 246 push %r15 247.cfi_push %r15 248.Lblocks_body: 249 250 mov $len,%r15 # reassign $len 251 252 mov 24($ctx),$r0 # load r 253 mov 32($ctx),$s1 254 255 mov 0($ctx),$h0 # load hash value 256 mov 8($ctx),$h1 257 mov 16($ctx),$h2 258 259 mov $s1,$r1 260 shr \$2,$s1 261 mov $r1,%rax 262 add $r1,$s1 # s1 = r1 + (r1 >> 2) 263 jmp .Loop 264 265.align 32 266.Loop: 267 add 0($inp),$h0 # accumulate input 268 adc 8($inp),$h1 269 lea 16($inp),$inp 270 adc $padbit,$h2 271___ 272 &poly1305_iteration(); 273$code.=<<___; 274 mov $r1,%rax 275 dec %r15 # len-=16 276 jnz .Loop 277 278 mov $h0,0($ctx) # store hash value 279 mov $h1,8($ctx) 280 mov $h2,16($ctx) 281 282 mov 0(%rsp),%r15 283.cfi_restore %r15 284 mov 8(%rsp),%r14 285.cfi_restore %r14 286 mov 16(%rsp),%r13 287.cfi_restore %r13 288 mov 24(%rsp),%r12 289.cfi_restore %r12 290 mov 32(%rsp),%rbp 291.cfi_restore %rbp 292 mov 40(%rsp),%rbx 293.cfi_restore %rbx 294 lea 48(%rsp),%rsp 295.cfi_adjust_cfa_offset -48 296.Lno_data: 297.Lblocks_epilogue: 298 ret 299.cfi_endproc 300.size poly1305_blocks,.-poly1305_blocks 301 302.type poly1305_emit,\@function,3 303.align 32 304poly1305_emit: 305.cfi_startproc 306.Lemit: 307 mov 0($ctx),%r8 # load hash value 308 mov 8($ctx),%r9 309 mov 16($ctx),%r10 310 311 mov %r8,%rax 312 add \$5,%r8 # compare to modulus 313 mov %r9,%rcx 314 adc \$0,%r9 315 adc \$0,%r10 316 shr \$2,%r10 # did 130-bit value overflow? 317 cmovnz %r8,%rax 318 cmovnz %r9,%rcx 319 320 add 0($nonce),%rax # accumulate nonce 321 adc 8($nonce),%rcx 322 mov %rax,0($mac) # write result 323 mov %rcx,8($mac) 324 325 ret 326.cfi_endproc 327.size poly1305_emit,.-poly1305_emit 328___ 329if ($avx) { 330 331######################################################################## 332# Layout of opaque area is following. 333# 334# unsigned __int32 h[5]; # current hash value base 2^26 335# unsigned __int32 is_base2_26; 336# unsigned __int64 r[2]; # key value base 2^64 337# unsigned __int64 pad; 338# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; 339# 340# where r^n are base 2^26 digits of degrees of multiplier key. There are 341# 5 digits, but last four are interleaved with multiples of 5, totalling 342# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 343 344my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = 345 map("%xmm$_",(0..15)); 346 347$code.=<<___; 348.type __poly1305_block,\@abi-omnipotent 349.align 32 350__poly1305_block: 351.cfi_startproc 352___ 353 &poly1305_iteration(); 354$code.=<<___; 355 ret 356.cfi_endproc 357.size __poly1305_block,.-__poly1305_block 358 359.type __poly1305_init_avx,\@abi-omnipotent 360.align 32 361__poly1305_init_avx: 362.cfi_startproc 363 mov $r0,$h0 364 mov $r1,$h1 365 xor $h2,$h2 366 367 lea 48+64($ctx),$ctx # size optimization 368 369 mov $r1,%rax 370 call __poly1305_block # r^2 371 372 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 373 mov \$0x3ffffff,%edx 374 mov $h0,$d1 375 and $h0#d,%eax 376 mov $r0,$d2 377 and $r0#d,%edx 378 mov %eax,`16*0+0-64`($ctx) 379 shr \$26,$d1 380 mov %edx,`16*0+4-64`($ctx) 381 shr \$26,$d2 382 383 mov \$0x3ffffff,%eax 384 mov \$0x3ffffff,%edx 385 and $d1#d,%eax 386 and $d2#d,%edx 387 mov %eax,`16*1+0-64`($ctx) 388 lea (%rax,%rax,4),%eax # *5 389 mov %edx,`16*1+4-64`($ctx) 390 lea (%rdx,%rdx,4),%edx # *5 391 mov %eax,`16*2+0-64`($ctx) 392 shr \$26,$d1 393 mov %edx,`16*2+4-64`($ctx) 394 shr \$26,$d2 395 396 mov $h1,%rax 397 mov $r1,%rdx 398 shl \$12,%rax 399 shl \$12,%rdx 400 or $d1,%rax 401 or $d2,%rdx 402 and \$0x3ffffff,%eax 403 and \$0x3ffffff,%edx 404 mov %eax,`16*3+0-64`($ctx) 405 lea (%rax,%rax,4),%eax # *5 406 mov %edx,`16*3+4-64`($ctx) 407 lea (%rdx,%rdx,4),%edx # *5 408 mov %eax,`16*4+0-64`($ctx) 409 mov $h1,$d1 410 mov %edx,`16*4+4-64`($ctx) 411 mov $r1,$d2 412 413 mov \$0x3ffffff,%eax 414 mov \$0x3ffffff,%edx 415 shr \$14,$d1 416 shr \$14,$d2 417 and $d1#d,%eax 418 and $d2#d,%edx 419 mov %eax,`16*5+0-64`($ctx) 420 lea (%rax,%rax,4),%eax # *5 421 mov %edx,`16*5+4-64`($ctx) 422 lea (%rdx,%rdx,4),%edx # *5 423 mov %eax,`16*6+0-64`($ctx) 424 shr \$26,$d1 425 mov %edx,`16*6+4-64`($ctx) 426 shr \$26,$d2 427 428 mov $h2,%rax 429 shl \$24,%rax 430 or %rax,$d1 431 mov $d1#d,`16*7+0-64`($ctx) 432 lea ($d1,$d1,4),$d1 # *5 433 mov $d2#d,`16*7+4-64`($ctx) 434 lea ($d2,$d2,4),$d2 # *5 435 mov $d1#d,`16*8+0-64`($ctx) 436 mov $d2#d,`16*8+4-64`($ctx) 437 438 mov $r1,%rax 439 call __poly1305_block # r^3 440 441 mov \$0x3ffffff,%eax # save r^3 base 2^26 442 mov $h0,$d1 443 and $h0#d,%eax 444 shr \$26,$d1 445 mov %eax,`16*0+12-64`($ctx) 446 447 mov \$0x3ffffff,%edx 448 and $d1#d,%edx 449 mov %edx,`16*1+12-64`($ctx) 450 lea (%rdx,%rdx,4),%edx # *5 451 shr \$26,$d1 452 mov %edx,`16*2+12-64`($ctx) 453 454 mov $h1,%rax 455 shl \$12,%rax 456 or $d1,%rax 457 and \$0x3ffffff,%eax 458 mov %eax,`16*3+12-64`($ctx) 459 lea (%rax,%rax,4),%eax # *5 460 mov $h1,$d1 461 mov %eax,`16*4+12-64`($ctx) 462 463 mov \$0x3ffffff,%edx 464 shr \$14,$d1 465 and $d1#d,%edx 466 mov %edx,`16*5+12-64`($ctx) 467 lea (%rdx,%rdx,4),%edx # *5 468 shr \$26,$d1 469 mov %edx,`16*6+12-64`($ctx) 470 471 mov $h2,%rax 472 shl \$24,%rax 473 or %rax,$d1 474 mov $d1#d,`16*7+12-64`($ctx) 475 lea ($d1,$d1,4),$d1 # *5 476 mov $d1#d,`16*8+12-64`($ctx) 477 478 mov $r1,%rax 479 call __poly1305_block # r^4 480 481 mov \$0x3ffffff,%eax # save r^4 base 2^26 482 mov $h0,$d1 483 and $h0#d,%eax 484 shr \$26,$d1 485 mov %eax,`16*0+8-64`($ctx) 486 487 mov \$0x3ffffff,%edx 488 and $d1#d,%edx 489 mov %edx,`16*1+8-64`($ctx) 490 lea (%rdx,%rdx,4),%edx # *5 491 shr \$26,$d1 492 mov %edx,`16*2+8-64`($ctx) 493 494 mov $h1,%rax 495 shl \$12,%rax 496 or $d1,%rax 497 and \$0x3ffffff,%eax 498 mov %eax,`16*3+8-64`($ctx) 499 lea (%rax,%rax,4),%eax # *5 500 mov $h1,$d1 501 mov %eax,`16*4+8-64`($ctx) 502 503 mov \$0x3ffffff,%edx 504 shr \$14,$d1 505 and $d1#d,%edx 506 mov %edx,`16*5+8-64`($ctx) 507 lea (%rdx,%rdx,4),%edx # *5 508 shr \$26,$d1 509 mov %edx,`16*6+8-64`($ctx) 510 511 mov $h2,%rax 512 shl \$24,%rax 513 or %rax,$d1 514 mov $d1#d,`16*7+8-64`($ctx) 515 lea ($d1,$d1,4),$d1 # *5 516 mov $d1#d,`16*8+8-64`($ctx) 517 518 lea -48-64($ctx),$ctx # size [de-]optimization 519 ret 520.cfi_endproc 521.size __poly1305_init_avx,.-__poly1305_init_avx 522 523.type poly1305_blocks_avx,\@function,4 524.align 32 525poly1305_blocks_avx: 526.cfi_startproc 527 mov 20($ctx),%r8d # is_base2_26 528 cmp \$128,$len 529 jae .Lblocks_avx 530 test %r8d,%r8d 531 jz .Lblocks 532 533.Lblocks_avx: 534 and \$-16,$len 535 jz .Lno_data_avx 536 537 vzeroupper 538 539 test %r8d,%r8d 540 jz .Lbase2_64_avx 541 542 test \$31,$len 543 jz .Leven_avx 544 545 push %rbx 546.cfi_push %rbx 547 push %rbp 548.cfi_push %rbp 549 push %r12 550.cfi_push %r12 551 push %r13 552.cfi_push %r13 553 push %r14 554.cfi_push %r14 555 push %r15 556.cfi_push %r15 557.Lblocks_avx_body: 558 559 mov $len,%r15 # reassign $len 560 561 mov 0($ctx),$d1 # load hash value 562 mov 8($ctx),$d2 563 mov 16($ctx),$h2#d 564 565 mov 24($ctx),$r0 # load r 566 mov 32($ctx),$s1 567 568 ################################# base 2^26 -> base 2^64 569 mov $d1#d,$h0#d 570 and \$`-1*(1<<31)`,$d1 571 mov $d2,$r1 # borrow $r1 572 mov $d2#d,$h1#d 573 and \$`-1*(1<<31)`,$d2 574 575 shr \$6,$d1 576 shl \$52,$r1 577 add $d1,$h0 578 shr \$12,$h1 579 shr \$18,$d2 580 add $r1,$h0 581 adc $d2,$h1 582 583 mov $h2,$d1 584 shl \$40,$d1 585 shr \$24,$h2 586 add $d1,$h1 587 adc \$0,$h2 # can be partially reduced... 588 589 mov \$-4,$d2 # ... so reduce 590 mov $h2,$d1 591 and $h2,$d2 592 shr \$2,$d1 593 and \$3,$h2 594 add $d2,$d1 # =*5 595 add $d1,$h0 596 adc \$0,$h1 597 adc \$0,$h2 598 599 mov $s1,$r1 600 mov $s1,%rax 601 shr \$2,$s1 602 add $r1,$s1 # s1 = r1 + (r1 >> 2) 603 604 add 0($inp),$h0 # accumulate input 605 adc 8($inp),$h1 606 lea 16($inp),$inp 607 adc $padbit,$h2 608 609 call __poly1305_block 610 611 test $padbit,$padbit # if $padbit is zero, 612 jz .Lstore_base2_64_avx # store hash in base 2^64 format 613 614 ################################# base 2^64 -> base 2^26 615 mov $h0,%rax 616 mov $h0,%rdx 617 shr \$52,$h0 618 mov $h1,$r0 619 mov $h1,$r1 620 shr \$26,%rdx 621 and \$0x3ffffff,%rax # h[0] 622 shl \$12,$r0 623 and \$0x3ffffff,%rdx # h[1] 624 shr \$14,$h1 625 or $r0,$h0 626 shl \$24,$h2 627 and \$0x3ffffff,$h0 # h[2] 628 shr \$40,$r1 629 and \$0x3ffffff,$h1 # h[3] 630 or $r1,$h2 # h[4] 631 632 sub \$16,%r15 633 jz .Lstore_base2_26_avx 634 635 vmovd %rax#d,$H0 636 vmovd %rdx#d,$H1 637 vmovd $h0#d,$H2 638 vmovd $h1#d,$H3 639 vmovd $h2#d,$H4 640 jmp .Lproceed_avx 641 642.align 32 643.Lstore_base2_64_avx: 644 mov $h0,0($ctx) 645 mov $h1,8($ctx) 646 mov $h2,16($ctx) # note that is_base2_26 is zeroed 647 jmp .Ldone_avx 648 649.align 16 650.Lstore_base2_26_avx: 651 mov %rax#d,0($ctx) # store hash value base 2^26 652 mov %rdx#d,4($ctx) 653 mov $h0#d,8($ctx) 654 mov $h1#d,12($ctx) 655 mov $h2#d,16($ctx) 656.align 16 657.Ldone_avx: 658 mov 0(%rsp),%r15 659.cfi_restore %r15 660 mov 8(%rsp),%r14 661.cfi_restore %r14 662 mov 16(%rsp),%r13 663.cfi_restore %r13 664 mov 24(%rsp),%r12 665.cfi_restore %r12 666 mov 32(%rsp),%rbp 667.cfi_restore %rbp 668 mov 40(%rsp),%rbx 669.cfi_restore %rbx 670 lea 48(%rsp),%rsp 671.cfi_adjust_cfa_offset -48 672.Lno_data_avx: 673.Lblocks_avx_epilogue: 674 ret 675.cfi_endproc 676 677.align 32 678.Lbase2_64_avx: 679.cfi_startproc 680 push %rbx 681.cfi_push %rbx 682 push %rbp 683.cfi_push %rbp 684 push %r12 685.cfi_push %r12 686 push %r13 687.cfi_push %r13 688 push %r14 689.cfi_push %r14 690 push %r15 691.cfi_push %r15 692.Lbase2_64_avx_body: 693 694 mov $len,%r15 # reassign $len 695 696 mov 24($ctx),$r0 # load r 697 mov 32($ctx),$s1 698 699 mov 0($ctx),$h0 # load hash value 700 mov 8($ctx),$h1 701 mov 16($ctx),$h2#d 702 703 mov $s1,$r1 704 mov $s1,%rax 705 shr \$2,$s1 706 add $r1,$s1 # s1 = r1 + (r1 >> 2) 707 708 test \$31,$len 709 jz .Linit_avx 710 711 add 0($inp),$h0 # accumulate input 712 adc 8($inp),$h1 713 lea 16($inp),$inp 714 adc $padbit,$h2 715 sub \$16,%r15 716 717 call __poly1305_block 718 719.Linit_avx: 720 ################################# base 2^64 -> base 2^26 721 mov $h0,%rax 722 mov $h0,%rdx 723 shr \$52,$h0 724 mov $h1,$d1 725 mov $h1,$d2 726 shr \$26,%rdx 727 and \$0x3ffffff,%rax # h[0] 728 shl \$12,$d1 729 and \$0x3ffffff,%rdx # h[1] 730 shr \$14,$h1 731 or $d1,$h0 732 shl \$24,$h2 733 and \$0x3ffffff,$h0 # h[2] 734 shr \$40,$d2 735 and \$0x3ffffff,$h1 # h[3] 736 or $d2,$h2 # h[4] 737 738 vmovd %rax#d,$H0 739 vmovd %rdx#d,$H1 740 vmovd $h0#d,$H2 741 vmovd $h1#d,$H3 742 vmovd $h2#d,$H4 743 movl \$1,20($ctx) # set is_base2_26 744 745 call __poly1305_init_avx 746 747.Lproceed_avx: 748 mov %r15,$len 749 750 mov 0(%rsp),%r15 751.cfi_restore %r15 752 mov 8(%rsp),%r14 753.cfi_restore %r14 754 mov 16(%rsp),%r13 755.cfi_restore %r13 756 mov 24(%rsp),%r12 757.cfi_restore %r12 758 mov 32(%rsp),%rbp 759.cfi_restore %rbp 760 mov 40(%rsp),%rbx 761.cfi_restore %rbx 762 lea 48(%rsp),%rax 763 lea 48(%rsp),%rsp 764.cfi_adjust_cfa_offset -48 765.Lbase2_64_avx_epilogue: 766 jmp .Ldo_avx 767.cfi_endproc 768 769.align 32 770.Leven_avx: 771.cfi_startproc 772 vmovd 4*0($ctx),$H0 # load hash value 773 vmovd 4*1($ctx),$H1 774 vmovd 4*2($ctx),$H2 775 vmovd 4*3($ctx),$H3 776 vmovd 4*4($ctx),$H4 777 778.Ldo_avx: 779___ 780$code.=<<___ if (!$win64); 781 lea -0x58(%rsp),%r11 782.cfi_def_cfa %r11,0x60 783 sub \$0x178,%rsp 784___ 785$code.=<<___ if ($win64); 786 lea -0xf8(%rsp),%r11 787 sub \$0x218,%rsp 788 vmovdqa %xmm6,0x50(%r11) 789 vmovdqa %xmm7,0x60(%r11) 790 vmovdqa %xmm8,0x70(%r11) 791 vmovdqa %xmm9,0x80(%r11) 792 vmovdqa %xmm10,0x90(%r11) 793 vmovdqa %xmm11,0xa0(%r11) 794 vmovdqa %xmm12,0xb0(%r11) 795 vmovdqa %xmm13,0xc0(%r11) 796 vmovdqa %xmm14,0xd0(%r11) 797 vmovdqa %xmm15,0xe0(%r11) 798.Ldo_avx_body: 799___ 800$code.=<<___; 801 sub \$64,$len 802 lea -32($inp),%rax 803 cmovc %rax,$inp 804 805 vmovdqu `16*3`($ctx),$D4 # preload r0^2 806 lea `16*3+64`($ctx),$ctx # size optimization 807 lea .Lconst(%rip),%rcx 808 809 ################################################################ 810 # load input 811 vmovdqu 16*2($inp),$T0 812 vmovdqu 16*3($inp),$T1 813 vmovdqa 64(%rcx),$MASK # .Lmask26 814 815 vpsrldq \$6,$T0,$T2 # splat input 816 vpsrldq \$6,$T1,$T3 817 vpunpckhqdq $T1,$T0,$T4 # 4 818 vpunpcklqdq $T1,$T0,$T0 # 0:1 819 vpunpcklqdq $T3,$T2,$T3 # 2:3 820 821 vpsrlq \$40,$T4,$T4 # 4 822 vpsrlq \$26,$T0,$T1 823 vpand $MASK,$T0,$T0 # 0 824 vpsrlq \$4,$T3,$T2 825 vpand $MASK,$T1,$T1 # 1 826 vpsrlq \$30,$T3,$T3 827 vpand $MASK,$T2,$T2 # 2 828 vpand $MASK,$T3,$T3 # 3 829 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 830 831 jbe .Lskip_loop_avx 832 833 # expand and copy pre-calculated table to stack 834 vmovdqu `16*1-64`($ctx),$D1 835 vmovdqu `16*2-64`($ctx),$D2 836 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 837 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 838 vmovdqa $D3,-0x90(%r11) 839 vmovdqa $D0,0x00(%rsp) 840 vpshufd \$0xEE,$D1,$D4 841 vmovdqu `16*3-64`($ctx),$D0 842 vpshufd \$0x44,$D1,$D1 843 vmovdqa $D4,-0x80(%r11) 844 vmovdqa $D1,0x10(%rsp) 845 vpshufd \$0xEE,$D2,$D3 846 vmovdqu `16*4-64`($ctx),$D1 847 vpshufd \$0x44,$D2,$D2 848 vmovdqa $D3,-0x70(%r11) 849 vmovdqa $D2,0x20(%rsp) 850 vpshufd \$0xEE,$D0,$D4 851 vmovdqu `16*5-64`($ctx),$D2 852 vpshufd \$0x44,$D0,$D0 853 vmovdqa $D4,-0x60(%r11) 854 vmovdqa $D0,0x30(%rsp) 855 vpshufd \$0xEE,$D1,$D3 856 vmovdqu `16*6-64`($ctx),$D0 857 vpshufd \$0x44,$D1,$D1 858 vmovdqa $D3,-0x50(%r11) 859 vmovdqa $D1,0x40(%rsp) 860 vpshufd \$0xEE,$D2,$D4 861 vmovdqu `16*7-64`($ctx),$D1 862 vpshufd \$0x44,$D2,$D2 863 vmovdqa $D4,-0x40(%r11) 864 vmovdqa $D2,0x50(%rsp) 865 vpshufd \$0xEE,$D0,$D3 866 vmovdqu `16*8-64`($ctx),$D2 867 vpshufd \$0x44,$D0,$D0 868 vmovdqa $D3,-0x30(%r11) 869 vmovdqa $D0,0x60(%rsp) 870 vpshufd \$0xEE,$D1,$D4 871 vpshufd \$0x44,$D1,$D1 872 vmovdqa $D4,-0x20(%r11) 873 vmovdqa $D1,0x70(%rsp) 874 vpshufd \$0xEE,$D2,$D3 875 vmovdqa 0x00(%rsp),$D4 # preload r0^2 876 vpshufd \$0x44,$D2,$D2 877 vmovdqa $D3,-0x10(%r11) 878 vmovdqa $D2,0x80(%rsp) 879 880 jmp .Loop_avx 881 882.align 32 883.Loop_avx: 884 ################################################################ 885 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 886 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 887 # \___________________/ 888 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 889 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 890 # \___________________/ \____________________/ 891 # 892 # Note that we start with inp[2:3]*r^2. This is because it 893 # doesn't depend on reduction in previous iteration. 894 ################################################################ 895 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 896 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 897 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 898 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 899 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 900 # 901 # though note that $Tx and $Hx are "reversed" in this section, 902 # and $D4 is preloaded with r0^2... 903 904 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 905 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 906 vmovdqa $H2,0x20(%r11) # offload hash 907 vpmuludq $T2,$D4,$D2 # d3 = h2*r0 908 vmovdqa 0x10(%rsp),$H2 # r1^2 909 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 910 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 911 912 vmovdqa $H0,0x00(%r11) # 913 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 914 vmovdqa $H1,0x10(%r11) # 915 vpmuludq $T3,$H2,$H1 # h3*r1 916 vpaddq $H0,$D0,$D0 # d0 += h4*s1 917 vpaddq $H1,$D4,$D4 # d4 += h3*r1 918 vmovdqa $H3,0x30(%r11) # 919 vpmuludq $T2,$H2,$H0 # h2*r1 920 vpmuludq $T1,$H2,$H1 # h1*r1 921 vpaddq $H0,$D3,$D3 # d3 += h2*r1 922 vmovdqa 0x30(%rsp),$H3 # r2^2 923 vpaddq $H1,$D2,$D2 # d2 += h1*r1 924 vmovdqa $H4,0x40(%r11) # 925 vpmuludq $T0,$H2,$H2 # h0*r1 926 vpmuludq $T2,$H3,$H0 # h2*r2 927 vpaddq $H2,$D1,$D1 # d1 += h0*r1 928 929 vmovdqa 0x40(%rsp),$H4 # s2^2 930 vpaddq $H0,$D4,$D4 # d4 += h2*r2 931 vpmuludq $T1,$H3,$H1 # h1*r2 932 vpmuludq $T0,$H3,$H3 # h0*r2 933 vpaddq $H1,$D3,$D3 # d3 += h1*r2 934 vmovdqa 0x50(%rsp),$H2 # r3^2 935 vpaddq $H3,$D2,$D2 # d2 += h0*r2 936 vpmuludq $T4,$H4,$H0 # h4*s2 937 vpmuludq $T3,$H4,$H4 # h3*s2 938 vpaddq $H0,$D1,$D1 # d1 += h4*s2 939 vmovdqa 0x60(%rsp),$H3 # s3^2 940 vpaddq $H4,$D0,$D0 # d0 += h3*s2 941 942 vmovdqa 0x80(%rsp),$H4 # s4^2 943 vpmuludq $T1,$H2,$H1 # h1*r3 944 vpmuludq $T0,$H2,$H2 # h0*r3 945 vpaddq $H1,$D4,$D4 # d4 += h1*r3 946 vpaddq $H2,$D3,$D3 # d3 += h0*r3 947 vpmuludq $T4,$H3,$H0 # h4*s3 948 vpmuludq $T3,$H3,$H1 # h3*s3 949 vpaddq $H0,$D2,$D2 # d2 += h4*s3 950 vmovdqu 16*0($inp),$H0 # load input 951 vpaddq $H1,$D1,$D1 # d1 += h3*s3 952 vpmuludq $T2,$H3,$H3 # h2*s3 953 vpmuludq $T2,$H4,$T2 # h2*s4 954 vpaddq $H3,$D0,$D0 # d0 += h2*s3 955 956 vmovdqu 16*1($inp),$H1 # 957 vpaddq $T2,$D1,$D1 # d1 += h2*s4 958 vpmuludq $T3,$H4,$T3 # h3*s4 959 vpmuludq $T4,$H4,$T4 # h4*s4 960 vpsrldq \$6,$H0,$H2 # splat input 961 vpaddq $T3,$D2,$D2 # d2 += h3*s4 962 vpaddq $T4,$D3,$D3 # d3 += h4*s4 963 vpsrldq \$6,$H1,$H3 # 964 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 965 vpmuludq $T1,$H4,$T0 # h1*s4 966 vpunpckhqdq $H1,$H0,$H4 # 4 967 vpaddq $T4,$D4,$D4 # d4 += h0*r4 968 vmovdqa -0x90(%r11),$T4 # r0^4 969 vpaddq $T0,$D0,$D0 # d0 += h1*s4 970 971 vpunpcklqdq $H1,$H0,$H0 # 0:1 972 vpunpcklqdq $H3,$H2,$H3 # 2:3 973 974 #vpsrlq \$40,$H4,$H4 # 4 975 vpsrldq \$`40/8`,$H4,$H4 # 4 976 vpsrlq \$26,$H0,$H1 977 vpand $MASK,$H0,$H0 # 0 978 vpsrlq \$4,$H3,$H2 979 vpand $MASK,$H1,$H1 # 1 980 vpand 0(%rcx),$H4,$H4 # .Lmask24 981 vpsrlq \$30,$H3,$H3 982 vpand $MASK,$H2,$H2 # 2 983 vpand $MASK,$H3,$H3 # 3 984 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 985 986 vpaddq 0x00(%r11),$H0,$H0 # add hash value 987 vpaddq 0x10(%r11),$H1,$H1 988 vpaddq 0x20(%r11),$H2,$H2 989 vpaddq 0x30(%r11),$H3,$H3 990 vpaddq 0x40(%r11),$H4,$H4 991 992 lea 16*2($inp),%rax 993 lea 16*4($inp),$inp 994 sub \$64,$len 995 cmovc %rax,$inp 996 997 ################################################################ 998 # Now we accumulate (inp[0:1]+hash)*r^4 999 ################################################################ 1000 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1001 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1002 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1003 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1004 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1005 1006 vpmuludq $H0,$T4,$T0 # h0*r0 1007 vpmuludq $H1,$T4,$T1 # h1*r0 1008 vpaddq $T0,$D0,$D0 1009 vpaddq $T1,$D1,$D1 1010 vmovdqa -0x80(%r11),$T2 # r1^4 1011 vpmuludq $H2,$T4,$T0 # h2*r0 1012 vpmuludq $H3,$T4,$T1 # h3*r0 1013 vpaddq $T0,$D2,$D2 1014 vpaddq $T1,$D3,$D3 1015 vpmuludq $H4,$T4,$T4 # h4*r0 1016 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1017 vpaddq $T4,$D4,$D4 1018 1019 vpaddq $T0,$D0,$D0 # d0 += h4*s1 1020 vpmuludq $H2,$T2,$T1 # h2*r1 1021 vpmuludq $H3,$T2,$T0 # h3*r1 1022 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1023 vmovdqa -0x60(%r11),$T3 # r2^4 1024 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1025 vpmuludq $H1,$T2,$T1 # h1*r1 1026 vpmuludq $H0,$T2,$T2 # h0*r1 1027 vpaddq $T1,$D2,$D2 # d2 += h1*r1 1028 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1029 1030 vmovdqa -0x50(%r11),$T4 # s2^4 1031 vpmuludq $H2,$T3,$T0 # h2*r2 1032 vpmuludq $H1,$T3,$T1 # h1*r2 1033 vpaddq $T0,$D4,$D4 # d4 += h2*r2 1034 vpaddq $T1,$D3,$D3 # d3 += h1*r2 1035 vmovdqa -0x40(%r11),$T2 # r3^4 1036 vpmuludq $H0,$T3,$T3 # h0*r2 1037 vpmuludq $H4,$T4,$T0 # h4*s2 1038 vpaddq $T3,$D2,$D2 # d2 += h0*r2 1039 vpaddq $T0,$D1,$D1 # d1 += h4*s2 1040 vmovdqa -0x30(%r11),$T3 # s3^4 1041 vpmuludq $H3,$T4,$T4 # h3*s2 1042 vpmuludq $H1,$T2,$T1 # h1*r3 1043 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1044 1045 vmovdqa -0x10(%r11),$T4 # s4^4 1046 vpaddq $T1,$D4,$D4 # d4 += h1*r3 1047 vpmuludq $H0,$T2,$T2 # h0*r3 1048 vpmuludq $H4,$T3,$T0 # h4*s3 1049 vpaddq $T2,$D3,$D3 # d3 += h0*r3 1050 vpaddq $T0,$D2,$D2 # d2 += h4*s3 1051 vmovdqu 16*2($inp),$T0 # load input 1052 vpmuludq $H3,$T3,$T2 # h3*s3 1053 vpmuludq $H2,$T3,$T3 # h2*s3 1054 vpaddq $T2,$D1,$D1 # d1 += h3*s3 1055 vmovdqu 16*3($inp),$T1 # 1056 vpaddq $T3,$D0,$D0 # d0 += h2*s3 1057 1058 vpmuludq $H2,$T4,$H2 # h2*s4 1059 vpmuludq $H3,$T4,$H3 # h3*s4 1060 vpsrldq \$6,$T0,$T2 # splat input 1061 vpaddq $H2,$D1,$D1 # d1 += h2*s4 1062 vpmuludq $H4,$T4,$H4 # h4*s4 1063 vpsrldq \$6,$T1,$T3 # 1064 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1065 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1066 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1067 vpmuludq $H1,$T4,$H0 1068 vpunpckhqdq $T1,$T0,$T4 # 4 1069 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1070 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1071 1072 vpunpcklqdq $T1,$T0,$T0 # 0:1 1073 vpunpcklqdq $T3,$T2,$T3 # 2:3 1074 1075 #vpsrlq \$40,$T4,$T4 # 4 1076 vpsrldq \$`40/8`,$T4,$T4 # 4 1077 vpsrlq \$26,$T0,$T1 1078 vmovdqa 0x00(%rsp),$D4 # preload r0^2 1079 vpand $MASK,$T0,$T0 # 0 1080 vpsrlq \$4,$T3,$T2 1081 vpand $MASK,$T1,$T1 # 1 1082 vpand 0(%rcx),$T4,$T4 # .Lmask24 1083 vpsrlq \$30,$T3,$T3 1084 vpand $MASK,$T2,$T2 # 2 1085 vpand $MASK,$T3,$T3 # 3 1086 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1087 1088 ################################################################ 1089 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1090 # and P. Schwabe 1091 1092 vpsrlq \$26,$H3,$D3 1093 vpand $MASK,$H3,$H3 1094 vpaddq $D3,$H4,$H4 # h3 -> h4 1095 1096 vpsrlq \$26,$H0,$D0 1097 vpand $MASK,$H0,$H0 1098 vpaddq $D0,$D1,$H1 # h0 -> h1 1099 1100 vpsrlq \$26,$H4,$D0 1101 vpand $MASK,$H4,$H4 1102 1103 vpsrlq \$26,$H1,$D1 1104 vpand $MASK,$H1,$H1 1105 vpaddq $D1,$H2,$H2 # h1 -> h2 1106 1107 vpaddq $D0,$H0,$H0 1108 vpsllq \$2,$D0,$D0 1109 vpaddq $D0,$H0,$H0 # h4 -> h0 1110 1111 vpsrlq \$26,$H2,$D2 1112 vpand $MASK,$H2,$H2 1113 vpaddq $D2,$H3,$H3 # h2 -> h3 1114 1115 vpsrlq \$26,$H0,$D0 1116 vpand $MASK,$H0,$H0 1117 vpaddq $D0,$H1,$H1 # h0 -> h1 1118 1119 vpsrlq \$26,$H3,$D3 1120 vpand $MASK,$H3,$H3 1121 vpaddq $D3,$H4,$H4 # h3 -> h4 1122 1123 ja .Loop_avx 1124 1125.Lskip_loop_avx: 1126 ################################################################ 1127 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1128 1129 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1130 add \$32,$len 1131 jnz .Long_tail_avx 1132 1133 vpaddq $H2,$T2,$T2 1134 vpaddq $H0,$T0,$T0 1135 vpaddq $H1,$T1,$T1 1136 vpaddq $H3,$T3,$T3 1137 vpaddq $H4,$T4,$T4 1138 1139.Long_tail_avx: 1140 vmovdqa $H2,0x20(%r11) 1141 vmovdqa $H0,0x00(%r11) 1142 vmovdqa $H1,0x10(%r11) 1143 vmovdqa $H3,0x30(%r11) 1144 vmovdqa $H4,0x40(%r11) 1145 1146 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1147 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1148 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1149 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1150 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1151 1152 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1153 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1154 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1155 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1156 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1157 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1158 1159 vpmuludq $T3,$H2,$H0 # h3*r1 1160 vpaddq $H0,$D4,$D4 # d4 += h3*r1 1161 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1162 vpmuludq $T2,$H2,$H1 # h2*r1 1163 vpaddq $H1,$D3,$D3 # d3 += h2*r1 1164 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1165 vpmuludq $T1,$H2,$H0 # h1*r1 1166 vpaddq $H0,$D2,$D2 # d2 += h1*r1 1167 vpmuludq $T0,$H2,$H2 # h0*r1 1168 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1169 vpmuludq $T4,$H3,$H3 # h4*s1 1170 vpaddq $H3,$D0,$D0 # d0 += h4*s1 1171 1172 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1173 vpmuludq $T2,$H4,$H1 # h2*r2 1174 vpaddq $H1,$D4,$D4 # d4 += h2*r2 1175 vpmuludq $T1,$H4,$H0 # h1*r2 1176 vpaddq $H0,$D3,$D3 # d3 += h1*r2 1177 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1178 vpmuludq $T0,$H4,$H4 # h0*r2 1179 vpaddq $H4,$D2,$D2 # d2 += h0*r2 1180 vpmuludq $T4,$H2,$H1 # h4*s2 1181 vpaddq $H1,$D1,$D1 # d1 += h4*s2 1182 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1183 vpmuludq $T3,$H2,$H2 # h3*s2 1184 vpaddq $H2,$D0,$D0 # d0 += h3*s2 1185 1186 vpmuludq $T1,$H3,$H0 # h1*r3 1187 vpaddq $H0,$D4,$D4 # d4 += h1*r3 1188 vpmuludq $T0,$H3,$H3 # h0*r3 1189 vpaddq $H3,$D3,$D3 # d3 += h0*r3 1190 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1191 vpmuludq $T4,$H4,$H1 # h4*s3 1192 vpaddq $H1,$D2,$D2 # d2 += h4*s3 1193 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1194 vpmuludq $T3,$H4,$H0 # h3*s3 1195 vpaddq $H0,$D1,$D1 # d1 += h3*s3 1196 vpmuludq $T2,$H4,$H4 # h2*s3 1197 vpaddq $H4,$D0,$D0 # d0 += h2*s3 1198 1199 vpmuludq $T0,$H2,$H2 # h0*r4 1200 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1201 vpmuludq $T4,$H3,$H1 # h4*s4 1202 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 1203 vpmuludq $T3,$H3,$H0 # h3*s4 1204 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1205 vpmuludq $T2,$H3,$H1 # h2*s4 1206 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1207 vpmuludq $T1,$H3,$H3 # h1*s4 1208 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 1209 1210 jz .Lshort_tail_avx 1211 1212 vmovdqu 16*0($inp),$H0 # load input 1213 vmovdqu 16*1($inp),$H1 1214 1215 vpsrldq \$6,$H0,$H2 # splat input 1216 vpsrldq \$6,$H1,$H3 1217 vpunpckhqdq $H1,$H0,$H4 # 4 1218 vpunpcklqdq $H1,$H0,$H0 # 0:1 1219 vpunpcklqdq $H3,$H2,$H3 # 2:3 1220 1221 vpsrlq \$40,$H4,$H4 # 4 1222 vpsrlq \$26,$H0,$H1 1223 vpand $MASK,$H0,$H0 # 0 1224 vpsrlq \$4,$H3,$H2 1225 vpand $MASK,$H1,$H1 # 1 1226 vpsrlq \$30,$H3,$H3 1227 vpand $MASK,$H2,$H2 # 2 1228 vpand $MASK,$H3,$H3 # 3 1229 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1230 1231 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1232 vpaddq 0x00(%r11),$H0,$H0 1233 vpaddq 0x10(%r11),$H1,$H1 1234 vpaddq 0x20(%r11),$H2,$H2 1235 vpaddq 0x30(%r11),$H3,$H3 1236 vpaddq 0x40(%r11),$H4,$H4 1237 1238 ################################################################ 1239 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1240 1241 vpmuludq $H0,$T4,$T0 # h0*r0 1242 vpaddq $T0,$D0,$D0 # d0 += h0*r0 1243 vpmuludq $H1,$T4,$T1 # h1*r0 1244 vpaddq $T1,$D1,$D1 # d1 += h1*r0 1245 vpmuludq $H2,$T4,$T0 # h2*r0 1246 vpaddq $T0,$D2,$D2 # d2 += h2*r0 1247 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1248 vpmuludq $H3,$T4,$T1 # h3*r0 1249 vpaddq $T1,$D3,$D3 # d3 += h3*r0 1250 vpmuludq $H4,$T4,$T4 # h4*r0 1251 vpaddq $T4,$D4,$D4 # d4 += h4*r0 1252 1253 vpmuludq $H3,$T2,$T0 # h3*r1 1254 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1255 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1256 vpmuludq $H2,$T2,$T1 # h2*r1 1257 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1258 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1259 vpmuludq $H1,$T2,$T0 # h1*r1 1260 vpaddq $T0,$D2,$D2 # d2 += h1*r1 1261 vpmuludq $H0,$T2,$T2 # h0*r1 1262 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1263 vpmuludq $H4,$T3,$T3 # h4*s1 1264 vpaddq $T3,$D0,$D0 # d0 += h4*s1 1265 1266 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1267 vpmuludq $H2,$T4,$T1 # h2*r2 1268 vpaddq $T1,$D4,$D4 # d4 += h2*r2 1269 vpmuludq $H1,$T4,$T0 # h1*r2 1270 vpaddq $T0,$D3,$D3 # d3 += h1*r2 1271 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1272 vpmuludq $H0,$T4,$T4 # h0*r2 1273 vpaddq $T4,$D2,$D2 # d2 += h0*r2 1274 vpmuludq $H4,$T2,$T1 # h4*s2 1275 vpaddq $T1,$D1,$D1 # d1 += h4*s2 1276 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1277 vpmuludq $H3,$T2,$T2 # h3*s2 1278 vpaddq $T2,$D0,$D0 # d0 += h3*s2 1279 1280 vpmuludq $H1,$T3,$T0 # h1*r3 1281 vpaddq $T0,$D4,$D4 # d4 += h1*r3 1282 vpmuludq $H0,$T3,$T3 # h0*r3 1283 vpaddq $T3,$D3,$D3 # d3 += h0*r3 1284 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1285 vpmuludq $H4,$T4,$T1 # h4*s3 1286 vpaddq $T1,$D2,$D2 # d2 += h4*s3 1287 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1288 vpmuludq $H3,$T4,$T0 # h3*s3 1289 vpaddq $T0,$D1,$D1 # d1 += h3*s3 1290 vpmuludq $H2,$T4,$T4 # h2*s3 1291 vpaddq $T4,$D0,$D0 # d0 += h2*s3 1292 1293 vpmuludq $H0,$T2,$T2 # h0*r4 1294 vpaddq $T2,$D4,$D4 # d4 += h0*r4 1295 vpmuludq $H4,$T3,$T1 # h4*s4 1296 vpaddq $T1,$D3,$D3 # d3 += h4*s4 1297 vpmuludq $H3,$T3,$T0 # h3*s4 1298 vpaddq $T0,$D2,$D2 # d2 += h3*s4 1299 vpmuludq $H2,$T3,$T1 # h2*s4 1300 vpaddq $T1,$D1,$D1 # d1 += h2*s4 1301 vpmuludq $H1,$T3,$T3 # h1*s4 1302 vpaddq $T3,$D0,$D0 # d0 += h1*s4 1303 1304.Lshort_tail_avx: 1305 ################################################################ 1306 # horizontal addition 1307 1308 vpsrldq \$8,$D4,$T4 1309 vpsrldq \$8,$D3,$T3 1310 vpsrldq \$8,$D1,$T1 1311 vpsrldq \$8,$D0,$T0 1312 vpsrldq \$8,$D2,$T2 1313 vpaddq $T3,$D3,$D3 1314 vpaddq $T4,$D4,$D4 1315 vpaddq $T0,$D0,$D0 1316 vpaddq $T1,$D1,$D1 1317 vpaddq $T2,$D2,$D2 1318 1319 ################################################################ 1320 # lazy reduction 1321 1322 vpsrlq \$26,$D3,$H3 1323 vpand $MASK,$D3,$D3 1324 vpaddq $H3,$D4,$D4 # h3 -> h4 1325 1326 vpsrlq \$26,$D0,$H0 1327 vpand $MASK,$D0,$D0 1328 vpaddq $H0,$D1,$D1 # h0 -> h1 1329 1330 vpsrlq \$26,$D4,$H4 1331 vpand $MASK,$D4,$D4 1332 1333 vpsrlq \$26,$D1,$H1 1334 vpand $MASK,$D1,$D1 1335 vpaddq $H1,$D2,$D2 # h1 -> h2 1336 1337 vpaddq $H4,$D0,$D0 1338 vpsllq \$2,$H4,$H4 1339 vpaddq $H4,$D0,$D0 # h4 -> h0 1340 1341 vpsrlq \$26,$D2,$H2 1342 vpand $MASK,$D2,$D2 1343 vpaddq $H2,$D3,$D3 # h2 -> h3 1344 1345 vpsrlq \$26,$D0,$H0 1346 vpand $MASK,$D0,$D0 1347 vpaddq $H0,$D1,$D1 # h0 -> h1 1348 1349 vpsrlq \$26,$D3,$H3 1350 vpand $MASK,$D3,$D3 1351 vpaddq $H3,$D4,$D4 # h3 -> h4 1352 1353 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 1354 vmovd $D1,`4*1-48-64`($ctx) 1355 vmovd $D2,`4*2-48-64`($ctx) 1356 vmovd $D3,`4*3-48-64`($ctx) 1357 vmovd $D4,`4*4-48-64`($ctx) 1358___ 1359$code.=<<___ if ($win64); 1360 vmovdqa 0x50(%r11),%xmm6 1361 vmovdqa 0x60(%r11),%xmm7 1362 vmovdqa 0x70(%r11),%xmm8 1363 vmovdqa 0x80(%r11),%xmm9 1364 vmovdqa 0x90(%r11),%xmm10 1365 vmovdqa 0xa0(%r11),%xmm11 1366 vmovdqa 0xb0(%r11),%xmm12 1367 vmovdqa 0xc0(%r11),%xmm13 1368 vmovdqa 0xd0(%r11),%xmm14 1369 vmovdqa 0xe0(%r11),%xmm15 1370 lea 0xf8(%r11),%rsp 1371.Ldo_avx_epilogue: 1372___ 1373$code.=<<___ if (!$win64); 1374 lea 0x58(%r11),%rsp 1375.cfi_def_cfa %rsp,8 1376___ 1377$code.=<<___; 1378 vzeroupper 1379 ret 1380.cfi_endproc 1381.size poly1305_blocks_avx,.-poly1305_blocks_avx 1382 1383.type poly1305_emit_avx,\@function,3 1384.align 32 1385poly1305_emit_avx: 1386.cfi_startproc 1387 cmpl \$0,20($ctx) # is_base2_26? 1388 je .Lemit 1389 1390 mov 0($ctx),%eax # load hash value base 2^26 1391 mov 4($ctx),%ecx 1392 mov 8($ctx),%r8d 1393 mov 12($ctx),%r11d 1394 mov 16($ctx),%r10d 1395 1396 shl \$26,%rcx # base 2^26 -> base 2^64 1397 mov %r8,%r9 1398 shl \$52,%r8 1399 add %rcx,%rax 1400 shr \$12,%r9 1401 add %rax,%r8 # h0 1402 adc \$0,%r9 1403 1404 shl \$14,%r11 1405 mov %r10,%rax 1406 shr \$24,%r10 1407 add %r11,%r9 1408 shl \$40,%rax 1409 add %rax,%r9 # h1 1410 adc \$0,%r10 # h2 1411 1412 mov %r10,%rax # could be partially reduced, so reduce 1413 mov %r10,%rcx 1414 and \$3,%r10 1415 shr \$2,%rax 1416 and \$-4,%rcx 1417 add %rcx,%rax 1418 add %rax,%r8 1419 adc \$0,%r9 1420 adc \$0,%r10 1421 1422 mov %r8,%rax 1423 add \$5,%r8 # compare to modulus 1424 mov %r9,%rcx 1425 adc \$0,%r9 1426 adc \$0,%r10 1427 shr \$2,%r10 # did 130-bit value overflow? 1428 cmovnz %r8,%rax 1429 cmovnz %r9,%rcx 1430 1431 add 0($nonce),%rax # accumulate nonce 1432 adc 8($nonce),%rcx 1433 mov %rax,0($mac) # write result 1434 mov %rcx,8($mac) 1435 1436 ret 1437.cfi_endproc 1438.size poly1305_emit_avx,.-poly1305_emit_avx 1439___ 1440 1441if ($avx>1) { 1442my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 1443 map("%ymm$_",(0..15)); 1444my $S4=$MASK; 1445 1446$code.=<<___; 1447.type poly1305_blocks_avx2,\@function,4 1448.align 32 1449poly1305_blocks_avx2: 1450.cfi_startproc 1451 mov 20($ctx),%r8d # is_base2_26 1452 cmp \$128,$len 1453 jae .Lblocks_avx2 1454 test %r8d,%r8d 1455 jz .Lblocks 1456 1457.Lblocks_avx2: 1458 and \$-16,$len 1459 jz .Lno_data_avx2 1460 1461 vzeroupper 1462 1463 test %r8d,%r8d 1464 jz .Lbase2_64_avx2 1465 1466 test \$63,$len 1467 jz .Leven_avx2 1468 1469 push %rbx 1470.cfi_push %rbx 1471 push %rbp 1472.cfi_push %rbp 1473 push %r12 1474.cfi_push %r12 1475 push %r13 1476.cfi_push %r13 1477 push %r14 1478.cfi_push %r14 1479 push %r15 1480.cfi_push %r15 1481.Lblocks_avx2_body: 1482 1483 mov $len,%r15 # reassign $len 1484 1485 mov 0($ctx),$d1 # load hash value 1486 mov 8($ctx),$d2 1487 mov 16($ctx),$h2#d 1488 1489 mov 24($ctx),$r0 # load r 1490 mov 32($ctx),$s1 1491 1492 ################################# base 2^26 -> base 2^64 1493 mov $d1#d,$h0#d 1494 and \$`-1*(1<<31)`,$d1 1495 mov $d2,$r1 # borrow $r1 1496 mov $d2#d,$h1#d 1497 and \$`-1*(1<<31)`,$d2 1498 1499 shr \$6,$d1 1500 shl \$52,$r1 1501 add $d1,$h0 1502 shr \$12,$h1 1503 shr \$18,$d2 1504 add $r1,$h0 1505 adc $d2,$h1 1506 1507 mov $h2,$d1 1508 shl \$40,$d1 1509 shr \$24,$h2 1510 add $d1,$h1 1511 adc \$0,$h2 # can be partially reduced... 1512 1513 mov \$-4,$d2 # ... so reduce 1514 mov $h2,$d1 1515 and $h2,$d2 1516 shr \$2,$d1 1517 and \$3,$h2 1518 add $d2,$d1 # =*5 1519 add $d1,$h0 1520 adc \$0,$h1 1521 adc \$0,$h2 1522 1523 mov $s1,$r1 1524 mov $s1,%rax 1525 shr \$2,$s1 1526 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1527 1528.Lbase2_26_pre_avx2: 1529 add 0($inp),$h0 # accumulate input 1530 adc 8($inp),$h1 1531 lea 16($inp),$inp 1532 adc $padbit,$h2 1533 sub \$16,%r15 1534 1535 call __poly1305_block 1536 mov $r1,%rax 1537 1538 test \$63,%r15 1539 jnz .Lbase2_26_pre_avx2 1540 1541 test $padbit,$padbit # if $padbit is zero, 1542 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format 1543 1544 ################################# base 2^64 -> base 2^26 1545 mov $h0,%rax 1546 mov $h0,%rdx 1547 shr \$52,$h0 1548 mov $h1,$r0 1549 mov $h1,$r1 1550 shr \$26,%rdx 1551 and \$0x3ffffff,%rax # h[0] 1552 shl \$12,$r0 1553 and \$0x3ffffff,%rdx # h[1] 1554 shr \$14,$h1 1555 or $r0,$h0 1556 shl \$24,$h2 1557 and \$0x3ffffff,$h0 # h[2] 1558 shr \$40,$r1 1559 and \$0x3ffffff,$h1 # h[3] 1560 or $r1,$h2 # h[4] 1561 1562 test %r15,%r15 1563 jz .Lstore_base2_26_avx2 1564 1565 vmovd %rax#d,%x#$H0 1566 vmovd %rdx#d,%x#$H1 1567 vmovd $h0#d,%x#$H2 1568 vmovd $h1#d,%x#$H3 1569 vmovd $h2#d,%x#$H4 1570 jmp .Lproceed_avx2 1571 1572.align 32 1573.Lstore_base2_64_avx2: 1574 mov $h0,0($ctx) 1575 mov $h1,8($ctx) 1576 mov $h2,16($ctx) # note that is_base2_26 is zeroed 1577 jmp .Ldone_avx2 1578 1579.align 16 1580.Lstore_base2_26_avx2: 1581 mov %rax#d,0($ctx) # store hash value base 2^26 1582 mov %rdx#d,4($ctx) 1583 mov $h0#d,8($ctx) 1584 mov $h1#d,12($ctx) 1585 mov $h2#d,16($ctx) 1586.align 16 1587.Ldone_avx2: 1588 mov 0(%rsp),%r15 1589.cfi_restore %r15 1590 mov 8(%rsp),%r14 1591.cfi_restore %r14 1592 mov 16(%rsp),%r13 1593.cfi_restore %r13 1594 mov 24(%rsp),%r12 1595.cfi_restore %r12 1596 mov 32(%rsp),%rbp 1597.cfi_restore %rbp 1598 mov 40(%rsp),%rbx 1599.cfi_restore %rbx 1600 lea 48(%rsp),%rsp 1601.cfi_adjust_cfa_offset -48 1602.Lno_data_avx2: 1603.Lblocks_avx2_epilogue: 1604 ret 1605.cfi_endproc 1606 1607.align 32 1608.Lbase2_64_avx2: 1609.cfi_startproc 1610 push %rbx 1611.cfi_push %rbx 1612 push %rbp 1613.cfi_push %rbp 1614 push %r12 1615.cfi_push %r12 1616 push %r13 1617.cfi_push %r13 1618 push %r14 1619.cfi_push %r14 1620 push %r15 1621.cfi_push %r15 1622.Lbase2_64_avx2_body: 1623 1624 mov $len,%r15 # reassign $len 1625 1626 mov 24($ctx),$r0 # load r 1627 mov 32($ctx),$s1 1628 1629 mov 0($ctx),$h0 # load hash value 1630 mov 8($ctx),$h1 1631 mov 16($ctx),$h2#d 1632 1633 mov $s1,$r1 1634 mov $s1,%rax 1635 shr \$2,$s1 1636 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1637 1638 test \$63,$len 1639 jz .Linit_avx2 1640 1641.Lbase2_64_pre_avx2: 1642 add 0($inp),$h0 # accumulate input 1643 adc 8($inp),$h1 1644 lea 16($inp),$inp 1645 adc $padbit,$h2 1646 sub \$16,%r15 1647 1648 call __poly1305_block 1649 mov $r1,%rax 1650 1651 test \$63,%r15 1652 jnz .Lbase2_64_pre_avx2 1653 1654.Linit_avx2: 1655 ################################# base 2^64 -> base 2^26 1656 mov $h0,%rax 1657 mov $h0,%rdx 1658 shr \$52,$h0 1659 mov $h1,$d1 1660 mov $h1,$d2 1661 shr \$26,%rdx 1662 and \$0x3ffffff,%rax # h[0] 1663 shl \$12,$d1 1664 and \$0x3ffffff,%rdx # h[1] 1665 shr \$14,$h1 1666 or $d1,$h0 1667 shl \$24,$h2 1668 and \$0x3ffffff,$h0 # h[2] 1669 shr \$40,$d2 1670 and \$0x3ffffff,$h1 # h[3] 1671 or $d2,$h2 # h[4] 1672 1673 vmovd %rax#d,%x#$H0 1674 vmovd %rdx#d,%x#$H1 1675 vmovd $h0#d,%x#$H2 1676 vmovd $h1#d,%x#$H3 1677 vmovd $h2#d,%x#$H4 1678 movl \$1,20($ctx) # set is_base2_26 1679 1680 call __poly1305_init_avx 1681 1682.Lproceed_avx2: 1683 mov %r15,$len # restore $len 1684 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1685 mov \$`(1<<31|1<<30|1<<16)`,%r11d 1686 1687 mov 0(%rsp),%r15 1688.cfi_restore %r15 1689 mov 8(%rsp),%r14 1690.cfi_restore %r14 1691 mov 16(%rsp),%r13 1692.cfi_restore %r13 1693 mov 24(%rsp),%r12 1694.cfi_restore %r12 1695 mov 32(%rsp),%rbp 1696.cfi_restore %rbp 1697 mov 40(%rsp),%rbx 1698.cfi_restore %rbx 1699 lea 48(%rsp),%rax 1700 lea 48(%rsp),%rsp 1701.cfi_adjust_cfa_offset -48 1702.Lbase2_64_avx2_epilogue: 1703 jmp .Ldo_avx2 1704.cfi_endproc 1705 1706.align 32 1707.Leven_avx2: 1708.cfi_startproc 1709 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1710 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 1711 vmovd 4*1($ctx),%x#$H1 1712 vmovd 4*2($ctx),%x#$H2 1713 vmovd 4*3($ctx),%x#$H3 1714 vmovd 4*4($ctx),%x#$H4 1715 1716.Ldo_avx2: 1717___ 1718$code.=<<___ if ($avx>2); 1719 cmp \$512,$len 1720 jb .Lskip_avx512 1721 and %r11d,%r10d 1722 test \$`1<<16`,%r10d # check for AVX512F 1723 jnz .Lblocks_avx512 1724.Lskip_avx512: 1725___ 1726$code.=<<___ if (!$win64); 1727 lea -8(%rsp),%r11 1728.cfi_def_cfa %r11,16 1729 sub \$0x128,%rsp 1730___ 1731$code.=<<___ if ($win64); 1732 lea -0xf8(%rsp),%r11 1733 sub \$0x1c8,%rsp 1734 vmovdqa %xmm6,0x50(%r11) 1735 vmovdqa %xmm7,0x60(%r11) 1736 vmovdqa %xmm8,0x70(%r11) 1737 vmovdqa %xmm9,0x80(%r11) 1738 vmovdqa %xmm10,0x90(%r11) 1739 vmovdqa %xmm11,0xa0(%r11) 1740 vmovdqa %xmm12,0xb0(%r11) 1741 vmovdqa %xmm13,0xc0(%r11) 1742 vmovdqa %xmm14,0xd0(%r11) 1743 vmovdqa %xmm15,0xe0(%r11) 1744.Ldo_avx2_body: 1745___ 1746$code.=<<___; 1747 lea .Lconst(%rip),%rcx 1748 lea 48+64($ctx),$ctx # size optimization 1749 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 1750 1751 # expand and copy pre-calculated table to stack 1752 vmovdqu `16*0-64`($ctx),%x#$T2 1753 and \$-512,%rsp 1754 vmovdqu `16*1-64`($ctx),%x#$T3 1755 vmovdqu `16*2-64`($ctx),%x#$T4 1756 vmovdqu `16*3-64`($ctx),%x#$D0 1757 vmovdqu `16*4-64`($ctx),%x#$D1 1758 vmovdqu `16*5-64`($ctx),%x#$D2 1759 lea 0x90(%rsp),%rax # size optimization 1760 vmovdqu `16*6-64`($ctx),%x#$D3 1761 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 1762 vmovdqu `16*7-64`($ctx),%x#$D4 1763 vpermd $T3,$T0,$T3 1764 vmovdqu `16*8-64`($ctx),%x#$MASK 1765 vpermd $T4,$T0,$T4 1766 vmovdqa $T2,0x00(%rsp) 1767 vpermd $D0,$T0,$D0 1768 vmovdqa $T3,0x20-0x90(%rax) 1769 vpermd $D1,$T0,$D1 1770 vmovdqa $T4,0x40-0x90(%rax) 1771 vpermd $D2,$T0,$D2 1772 vmovdqa $D0,0x60-0x90(%rax) 1773 vpermd $D3,$T0,$D3 1774 vmovdqa $D1,0x80-0x90(%rax) 1775 vpermd $D4,$T0,$D4 1776 vmovdqa $D2,0xa0-0x90(%rax) 1777 vpermd $MASK,$T0,$MASK 1778 vmovdqa $D3,0xc0-0x90(%rax) 1779 vmovdqa $D4,0xe0-0x90(%rax) 1780 vmovdqa $MASK,0x100-0x90(%rax) 1781 vmovdqa 64(%rcx),$MASK # .Lmask26 1782 1783 ################################################################ 1784 # load input 1785 vmovdqu 16*0($inp),%x#$T0 1786 vmovdqu 16*1($inp),%x#$T1 1787 vinserti128 \$1,16*2($inp),$T0,$T0 1788 vinserti128 \$1,16*3($inp),$T1,$T1 1789 lea 16*4($inp),$inp 1790 1791 vpsrldq \$6,$T0,$T2 # splat input 1792 vpsrldq \$6,$T1,$T3 1793 vpunpckhqdq $T1,$T0,$T4 # 4 1794 vpunpcklqdq $T3,$T2,$T2 # 2:3 1795 vpunpcklqdq $T1,$T0,$T0 # 0:1 1796 1797 vpsrlq \$30,$T2,$T3 1798 vpsrlq \$4,$T2,$T2 1799 vpsrlq \$26,$T0,$T1 1800 vpsrlq \$40,$T4,$T4 # 4 1801 vpand $MASK,$T2,$T2 # 2 1802 vpand $MASK,$T0,$T0 # 0 1803 vpand $MASK,$T1,$T1 # 1 1804 vpand $MASK,$T3,$T3 # 3 1805 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1806 1807 vpaddq $H2,$T2,$H2 # accumulate input 1808 sub \$64,$len 1809 jz .Ltail_avx2 1810 jmp .Loop_avx2 1811 1812.align 32 1813.Loop_avx2: 1814 ################################################################ 1815 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1816 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1817 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1818 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1819 # \________/\__________/ 1820 ################################################################ 1821 #vpaddq $H2,$T2,$H2 # accumulate input 1822 vpaddq $H0,$T0,$H0 1823 vmovdqa `32*0`(%rsp),$T0 # r0^4 1824 vpaddq $H1,$T1,$H1 1825 vmovdqa `32*1`(%rsp),$T1 # r1^4 1826 vpaddq $H3,$T3,$H3 1827 vmovdqa `32*3`(%rsp),$T2 # r2^4 1828 vpaddq $H4,$T4,$H4 1829 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1830 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 1831 1832 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1833 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1834 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1835 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1836 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1837 # 1838 # however, as h2 is "chronologically" first one available pull 1839 # corresponding operations up, so it's 1840 # 1841 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1842 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1843 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1844 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1845 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1846 1847 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1848 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1849 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1850 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1851 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1852 1853 vpmuludq $H0,$T1,$T4 # h0*r1 1854 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1855 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1856 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1857 vpmuludq $H3,$T1,$T4 # h3*r1 1858 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1859 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1860 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1861 vmovdqa `32*4-0x90`(%rax),$T1 # s2 1862 1863 vpmuludq $H0,$T0,$T4 # h0*r0 1864 vpmuludq $H1,$T0,$H2 # h1*r0 1865 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1866 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1867 vpmuludq $H3,$T0,$T4 # h3*r0 1868 vpmuludq $H4,$T0,$H2 # h4*r0 1869 vmovdqu 16*0($inp),%x#$T0 # load input 1870 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1871 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1872 vinserti128 \$1,16*2($inp),$T0,$T0 1873 1874 vpmuludq $H3,$T1,$T4 # h3*s2 1875 vpmuludq $H4,$T1,$H2 # h4*s2 1876 vmovdqu 16*1($inp),%x#$T1 1877 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1878 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1879 vmovdqa `32*5-0x90`(%rax),$H2 # r3 1880 vpmuludq $H1,$T2,$T4 # h1*r2 1881 vpmuludq $H0,$T2,$T2 # h0*r2 1882 vpaddq $T4,$D3,$D3 # d3 += h1*r2 1883 vpaddq $T2,$D2,$D2 # d2 += h0*r2 1884 vinserti128 \$1,16*3($inp),$T1,$T1 1885 lea 16*4($inp),$inp 1886 1887 vpmuludq $H1,$H2,$T4 # h1*r3 1888 vpmuludq $H0,$H2,$H2 # h0*r3 1889 vpsrldq \$6,$T0,$T2 # splat input 1890 vpaddq $T4,$D4,$D4 # d4 += h1*r3 1891 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1892 vpmuludq $H3,$T3,$T4 # h3*s3 1893 vpmuludq $H4,$T3,$H2 # h4*s3 1894 vpsrldq \$6,$T1,$T3 1895 vpaddq $T4,$D1,$D1 # d1 += h3*s3 1896 vpaddq $H2,$D2,$D2 # d2 += h4*s3 1897 vpunpckhqdq $T1,$T0,$T4 # 4 1898 1899 vpmuludq $H3,$S4,$H3 # h3*s4 1900 vpmuludq $H4,$S4,$H4 # h4*s4 1901 vpunpcklqdq $T1,$T0,$T0 # 0:1 1902 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 1903 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 1904 vpunpcklqdq $T3,$T2,$T3 # 2:3 1905 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 1906 vpmuludq $H1,$S4,$H0 # h1*s4 1907 vmovdqa 64(%rcx),$MASK # .Lmask26 1908 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1909 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1910 1911 ################################################################ 1912 # lazy reduction (interleaved with tail of input splat) 1913 1914 vpsrlq \$26,$H3,$D3 1915 vpand $MASK,$H3,$H3 1916 vpaddq $D3,$H4,$H4 # h3 -> h4 1917 1918 vpsrlq \$26,$H0,$D0 1919 vpand $MASK,$H0,$H0 1920 vpaddq $D0,$D1,$H1 # h0 -> h1 1921 1922 vpsrlq \$26,$H4,$D4 1923 vpand $MASK,$H4,$H4 1924 1925 vpsrlq \$4,$T3,$T2 1926 1927 vpsrlq \$26,$H1,$D1 1928 vpand $MASK,$H1,$H1 1929 vpaddq $D1,$H2,$H2 # h1 -> h2 1930 1931 vpaddq $D4,$H0,$H0 1932 vpsllq \$2,$D4,$D4 1933 vpaddq $D4,$H0,$H0 # h4 -> h0 1934 1935 vpand $MASK,$T2,$T2 # 2 1936 vpsrlq \$26,$T0,$T1 1937 1938 vpsrlq \$26,$H2,$D2 1939 vpand $MASK,$H2,$H2 1940 vpaddq $D2,$H3,$H3 # h2 -> h3 1941 1942 vpaddq $T2,$H2,$H2 # modulo-scheduled 1943 vpsrlq \$30,$T3,$T3 1944 1945 vpsrlq \$26,$H0,$D0 1946 vpand $MASK,$H0,$H0 1947 vpaddq $D0,$H1,$H1 # h0 -> h1 1948 1949 vpsrlq \$40,$T4,$T4 # 4 1950 1951 vpsrlq \$26,$H3,$D3 1952 vpand $MASK,$H3,$H3 1953 vpaddq $D3,$H4,$H4 # h3 -> h4 1954 1955 vpand $MASK,$T0,$T0 # 0 1956 vpand $MASK,$T1,$T1 # 1 1957 vpand $MASK,$T3,$T3 # 3 1958 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1959 1960 sub \$64,$len 1961 jnz .Loop_avx2 1962 1963 .byte 0x66,0x90 1964.Ltail_avx2: 1965 ################################################################ 1966 # while above multiplications were by r^4 in all lanes, in last 1967 # iteration we multiply least significant lane by r^4 and most 1968 # significant one by r, so copy of above except that references 1969 # to the precomputed table are displaced by 4... 1970 1971 #vpaddq $H2,$T2,$H2 # accumulate input 1972 vpaddq $H0,$T0,$H0 1973 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 1974 vpaddq $H1,$T1,$H1 1975 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 1976 vpaddq $H3,$T3,$H3 1977 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 1978 vpaddq $H4,$T4,$H4 1979 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 1980 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 1981 1982 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1983 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1984 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1985 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1986 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1987 1988 vpmuludq $H0,$T1,$T4 # h0*r1 1989 vpmuludq $H1,$T1,$H2 # h1*r1 1990 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1991 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1992 vpmuludq $H3,$T1,$T4 # h3*r1 1993 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 1994 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1995 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1996 1997 vpmuludq $H0,$T0,$T4 # h0*r0 1998 vpmuludq $H1,$T0,$H2 # h1*r0 1999 vpaddq $T4,$D0,$D0 # d0 += h0*r0 2000 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 2001 vpaddq $H2,$D1,$D1 # d1 += h1*r0 2002 vpmuludq $H3,$T0,$T4 # h3*r0 2003 vpmuludq $H4,$T0,$H2 # h4*r0 2004 vpaddq $T4,$D3,$D3 # d3 += h3*r0 2005 vpaddq $H2,$D4,$D4 # d4 += h4*r0 2006 2007 vpmuludq $H3,$T1,$T4 # h3*s2 2008 vpmuludq $H4,$T1,$H2 # h4*s2 2009 vpaddq $T4,$D0,$D0 # d0 += h3*s2 2010 vpaddq $H2,$D1,$D1 # d1 += h4*s2 2011 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2012 vpmuludq $H1,$T2,$T4 # h1*r2 2013 vpmuludq $H0,$T2,$T2 # h0*r2 2014 vpaddq $T4,$D3,$D3 # d3 += h1*r2 2015 vpaddq $T2,$D2,$D2 # d2 += h0*r2 2016 2017 vpmuludq $H1,$H2,$T4 # h1*r3 2018 vpmuludq $H0,$H2,$H2 # h0*r3 2019 vpaddq $T4,$D4,$D4 # d4 += h1*r3 2020 vpaddq $H2,$D3,$D3 # d3 += h0*r3 2021 vpmuludq $H3,$T3,$T4 # h3*s3 2022 vpmuludq $H4,$T3,$H2 # h4*s3 2023 vpaddq $T4,$D1,$D1 # d1 += h3*s3 2024 vpaddq $H2,$D2,$D2 # d2 += h4*s3 2025 2026 vpmuludq $H3,$S4,$H3 # h3*s4 2027 vpmuludq $H4,$S4,$H4 # h4*s4 2028 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 2029 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 2030 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 2031 vpmuludq $H1,$S4,$H0 # h1*s4 2032 vmovdqa 64(%rcx),$MASK # .Lmask26 2033 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2034 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2035 2036 ################################################################ 2037 # horizontal addition 2038 2039 vpsrldq \$8,$D1,$T1 2040 vpsrldq \$8,$H2,$T2 2041 vpsrldq \$8,$H3,$T3 2042 vpsrldq \$8,$H4,$T4 2043 vpsrldq \$8,$H0,$T0 2044 vpaddq $T1,$D1,$D1 2045 vpaddq $T2,$H2,$H2 2046 vpaddq $T3,$H3,$H3 2047 vpaddq $T4,$H4,$H4 2048 vpaddq $T0,$H0,$H0 2049 2050 vpermq \$0x2,$H3,$T3 2051 vpermq \$0x2,$H4,$T4 2052 vpermq \$0x2,$H0,$T0 2053 vpermq \$0x2,$D1,$T1 2054 vpermq \$0x2,$H2,$T2 2055 vpaddq $T3,$H3,$H3 2056 vpaddq $T4,$H4,$H4 2057 vpaddq $T0,$H0,$H0 2058 vpaddq $T1,$D1,$D1 2059 vpaddq $T2,$H2,$H2 2060 2061 ################################################################ 2062 # lazy reduction 2063 2064 vpsrlq \$26,$H3,$D3 2065 vpand $MASK,$H3,$H3 2066 vpaddq $D3,$H4,$H4 # h3 -> h4 2067 2068 vpsrlq \$26,$H0,$D0 2069 vpand $MASK,$H0,$H0 2070 vpaddq $D0,$D1,$H1 # h0 -> h1 2071 2072 vpsrlq \$26,$H4,$D4 2073 vpand $MASK,$H4,$H4 2074 2075 vpsrlq \$26,$H1,$D1 2076 vpand $MASK,$H1,$H1 2077 vpaddq $D1,$H2,$H2 # h1 -> h2 2078 2079 vpaddq $D4,$H0,$H0 2080 vpsllq \$2,$D4,$D4 2081 vpaddq $D4,$H0,$H0 # h4 -> h0 2082 2083 vpsrlq \$26,$H2,$D2 2084 vpand $MASK,$H2,$H2 2085 vpaddq $D2,$H3,$H3 # h2 -> h3 2086 2087 vpsrlq \$26,$H0,$D0 2088 vpand $MASK,$H0,$H0 2089 vpaddq $D0,$H1,$H1 # h0 -> h1 2090 2091 vpsrlq \$26,$H3,$D3 2092 vpand $MASK,$H3,$H3 2093 vpaddq $D3,$H4,$H4 # h3 -> h4 2094 2095 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2096 vmovd %x#$H1,`4*1-48-64`($ctx) 2097 vmovd %x#$H2,`4*2-48-64`($ctx) 2098 vmovd %x#$H3,`4*3-48-64`($ctx) 2099 vmovd %x#$H4,`4*4-48-64`($ctx) 2100___ 2101$code.=<<___ if ($win64); 2102 vmovdqa 0x50(%r11),%xmm6 2103 vmovdqa 0x60(%r11),%xmm7 2104 vmovdqa 0x70(%r11),%xmm8 2105 vmovdqa 0x80(%r11),%xmm9 2106 vmovdqa 0x90(%r11),%xmm10 2107 vmovdqa 0xa0(%r11),%xmm11 2108 vmovdqa 0xb0(%r11),%xmm12 2109 vmovdqa 0xc0(%r11),%xmm13 2110 vmovdqa 0xd0(%r11),%xmm14 2111 vmovdqa 0xe0(%r11),%xmm15 2112 lea 0xf8(%r11),%rsp 2113.Ldo_avx2_epilogue: 2114___ 2115$code.=<<___ if (!$win64); 2116 lea 8(%r11),%rsp 2117.cfi_def_cfa %rsp,8 2118___ 2119$code.=<<___; 2120 vzeroupper 2121 ret 2122.cfi_endproc 2123.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 2124___ 2125####################################################################### 2126if ($avx>2) { 2127# On entry we have input length divisible by 64. But since inner loop 2128# processes 128 bytes per iteration, cases when length is not divisible 2129# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 2130# reason stack layout is kept identical to poly1305_blocks_avx2. If not 2131# for this tail, we wouldn't have to even allocate stack frame... 2132 2133my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2134my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2135my $PADBIT="%zmm30"; 2136 2137map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2138map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2139map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2140map(s/%y/%z/,($MASK)); 2141 2142$code.=<<___; 2143.type poly1305_blocks_avx512,\@function,4 2144.align 32 2145poly1305_blocks_avx512: 2146.cfi_startproc 2147.Lblocks_avx512: 2148 mov \$15,%eax 2149 kmovw %eax,%k2 2150___ 2151$code.=<<___ if (!$win64); 2152 lea -8(%rsp),%r11 2153.cfi_def_cfa %r11,16 2154 sub \$0x128,%rsp 2155___ 2156$code.=<<___ if ($win64); 2157 lea -0xf8(%rsp),%r11 2158 sub \$0x1c8,%rsp 2159 vmovdqa %xmm6,0x50(%r11) 2160 vmovdqa %xmm7,0x60(%r11) 2161 vmovdqa %xmm8,0x70(%r11) 2162 vmovdqa %xmm9,0x80(%r11) 2163 vmovdqa %xmm10,0x90(%r11) 2164 vmovdqa %xmm11,0xa0(%r11) 2165 vmovdqa %xmm12,0xb0(%r11) 2166 vmovdqa %xmm13,0xc0(%r11) 2167 vmovdqa %xmm14,0xd0(%r11) 2168 vmovdqa %xmm15,0xe0(%r11) 2169.Ldo_avx512_body: 2170___ 2171$code.=<<___; 2172 lea .Lconst(%rip),%rcx 2173 lea 48+64($ctx),$ctx # size optimization 2174 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2175 2176 # expand pre-calculated table 2177 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2178 and \$-512,%rsp 2179 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2180 mov \$0x20,%rax 2181 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2182 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2183 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2184 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2185 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2186 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2187 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} 2188 vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2189 vpbroadcastq 64(%rcx),$MASK # .Lmask26 2190 vpermd $D1,$T2,$R1 2191 vpermd $T0,$T2,$S1 2192 vpermd $D2,$T2,$R2 2193 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2194 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2195 vpermd $T1,$T2,$S2 2196 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2197 vpsrlq \$32,$R1,$T1 2198 vpermd $D3,$T2,$R3 2199 vmovdqa64 $S1,0x40(%rsp){%k2} 2200 vpermd $T3,$T2,$S3 2201 vpermd $D4,$T2,$R4 2202 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2203 vpermd $T4,$T2,$S4 2204 vmovdqa64 $S2,0x80(%rsp){%k2} 2205 vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2206 vmovdqa64 $S3,0xc0(%rsp){%k2} 2207 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2208 vmovdqa64 $S4,0x100(%rsp){%k2} 2209 2210 ################################################################ 2211 # calculate 5th through 8th powers of the key 2212 # 2213 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2214 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2215 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2216 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2217 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2218 2219 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2220 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2221 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2222 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2223 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 2224 vpsrlq \$32,$R2,$T2 2225 2226 vpmuludq $T1,$S4,$M0 2227 vpmuludq $T1,$R0,$M1 2228 vpmuludq $T1,$R1,$M2 2229 vpmuludq $T1,$R2,$M3 2230 vpmuludq $T1,$R3,$M4 2231 vpsrlq \$32,$R3,$T3 2232 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 2233 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 2234 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 2235 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 2236 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 2237 2238 vpmuludq $T2,$S3,$M0 2239 vpmuludq $T2,$S4,$M1 2240 vpmuludq $T2,$R1,$M3 2241 vpmuludq $T2,$R2,$M4 2242 vpmuludq $T2,$R0,$M2 2243 vpsrlq \$32,$R4,$T4 2244 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 2245 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 2246 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 2247 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 2248 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 2249 2250 vpmuludq $T3,$S2,$M0 2251 vpmuludq $T3,$R0,$M3 2252 vpmuludq $T3,$R1,$M4 2253 vpmuludq $T3,$S3,$M1 2254 vpmuludq $T3,$S4,$M2 2255 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 2256 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 2257 vpaddq $M4,$D4,$D4 # d4 += r3'*r1 2258 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 2259 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 2260 2261 vpmuludq $T4,$S4,$M3 2262 vpmuludq $T4,$R0,$M4 2263 vpmuludq $T4,$S1,$M0 2264 vpmuludq $T4,$S2,$M1 2265 vpmuludq $T4,$S3,$M2 2266 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 2267 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 2268 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 2269 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 2270 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 2271 2272 ################################################################ 2273 # load input 2274 vmovdqu64 16*0($inp),%z#$T3 2275 vmovdqu64 16*4($inp),%z#$T4 2276 lea 16*8($inp),$inp 2277 2278 ################################################################ 2279 # lazy reduction 2280 2281 vpsrlq \$26,$D3,$M3 2282 vpandq $MASK,$D3,$D3 2283 vpaddq $M3,$D4,$D4 # d3 -> d4 2284 2285 vpsrlq \$26,$D0,$M0 2286 vpandq $MASK,$D0,$D0 2287 vpaddq $M0,$D1,$D1 # d0 -> d1 2288 2289 vpsrlq \$26,$D4,$M4 2290 vpandq $MASK,$D4,$D4 2291 2292 vpsrlq \$26,$D1,$M1 2293 vpandq $MASK,$D1,$D1 2294 vpaddq $M1,$D2,$D2 # d1 -> d2 2295 2296 vpaddq $M4,$D0,$D0 2297 vpsllq \$2,$M4,$M4 2298 vpaddq $M4,$D0,$D0 # d4 -> d0 2299 2300 vpsrlq \$26,$D2,$M2 2301 vpandq $MASK,$D2,$D2 2302 vpaddq $M2,$D3,$D3 # d2 -> d3 2303 2304 vpsrlq \$26,$D0,$M0 2305 vpandq $MASK,$D0,$D0 2306 vpaddq $M0,$D1,$D1 # d0 -> d1 2307 2308 vpsrlq \$26,$D3,$M3 2309 vpandq $MASK,$D3,$D3 2310 vpaddq $M3,$D4,$D4 # d3 -> d4 2311 2312 ################################################################ 2313 # at this point we have 14243444 in $R0-$S4 and 05060708 in 2314 # $D0-$D4, ... 2315 2316 vpunpcklqdq $T4,$T3,$T0 # transpose input 2317 vpunpckhqdq $T4,$T3,$T4 2318 2319 # ... since input 64-bit lanes are ordered as 73625140, we could 2320 # "vperm" it to 76543210 (here and in each loop iteration), *or* 2321 # we could just flow along, hence the goal for $R0-$S4 is 2322 # 1858286838784888 ... 2323 2324 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2325 mov \$0x7777,%eax 2326 kmovw %eax,%k1 2327 2328 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2329 vpermd $R1,$M0,$R1 2330 vpermd $R2,$M0,$R2 2331 vpermd $R3,$M0,$R3 2332 vpermd $R4,$M0,$R4 2333 2334 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2335 vpermd $D1,$M0,${R1}{%k1} 2336 vpermd $D2,$M0,${R2}{%k1} 2337 vpermd $D3,$M0,${R3}{%k1} 2338 vpermd $D4,$M0,${R4}{%k1} 2339 2340 vpslld \$2,$R1,$S1 # *5 2341 vpslld \$2,$R2,$S2 2342 vpslld \$2,$R3,$S3 2343 vpslld \$2,$R4,$S4 2344 vpaddd $R1,$S1,$S1 2345 vpaddd $R2,$S2,$S2 2346 vpaddd $R3,$S3,$S3 2347 vpaddd $R4,$S4,$S4 2348 2349 vpbroadcastq 32(%rcx),$PADBIT # .L129 2350 2351 vpsrlq \$52,$T0,$T2 # splat input 2352 vpsllq \$12,$T4,$T3 2353 vporq $T3,$T2,$T2 2354 vpsrlq \$26,$T0,$T1 2355 vpsrlq \$14,$T4,$T3 2356 vpsrlq \$40,$T4,$T4 # 4 2357 vpandq $MASK,$T2,$T2 # 2 2358 vpandq $MASK,$T0,$T0 # 0 2359 #vpandq $MASK,$T1,$T1 # 1 2360 #vpandq $MASK,$T3,$T3 # 3 2361 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2362 2363 vpaddq $H2,$T2,$H2 # accumulate input 2364 sub \$192,$len 2365 jbe .Ltail_avx512 2366 jmp .Loop_avx512 2367 2368.align 32 2369.Loop_avx512: 2370 ################################################################ 2371 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2372 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2373 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2374 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2375 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2376 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2377 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2378 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2379 # \________/\___________/ 2380 ################################################################ 2381 #vpaddq $H2,$T2,$H2 # accumulate input 2382 2383 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2384 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2385 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2386 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2387 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2388 # 2389 # however, as h2 is "chronologically" first one available pull 2390 # corresponding operations up, so it's 2391 # 2392 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2393 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2394 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2395 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2396 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2397 2398 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2399 vpaddq $H0,$T0,$H0 2400 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2401 vpandq $MASK,$T1,$T1 # 1 2402 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2403 vpandq $MASK,$T3,$T3 # 3 2404 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2405 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2406 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2407 vpaddq $H1,$T1,$H1 # accumulate input 2408 vpaddq $H3,$T3,$H3 2409 vpaddq $H4,$T4,$H4 2410 2411 vmovdqu64 16*0($inp),$T3 # load input 2412 vmovdqu64 16*4($inp),$T4 2413 lea 16*8($inp),$inp 2414 vpmuludq $H0,$R3,$M3 2415 vpmuludq $H0,$R4,$M4 2416 vpmuludq $H0,$R0,$M0 2417 vpmuludq $H0,$R1,$M1 2418 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2419 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2420 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2421 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2422 2423 vpmuludq $H1,$R2,$M3 2424 vpmuludq $H1,$R3,$M4 2425 vpmuludq $H1,$S4,$M0 2426 vpmuludq $H0,$R2,$M2 2427 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2428 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2429 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2430 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2431 2432 vpunpcklqdq $T4,$T3,$T0 # transpose input 2433 vpunpckhqdq $T4,$T3,$T4 2434 2435 vpmuludq $H3,$R0,$M3 2436 vpmuludq $H3,$R1,$M4 2437 vpmuludq $H1,$R0,$M1 2438 vpmuludq $H1,$R1,$M2 2439 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2440 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2441 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2442 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2443 2444 vpmuludq $H4,$S4,$M3 2445 vpmuludq $H4,$R0,$M4 2446 vpmuludq $H3,$S2,$M0 2447 vpmuludq $H3,$S3,$M1 2448 vpaddq $M3,$D3,$D3 # d3 += h4*s4 2449 vpmuludq $H3,$S4,$M2 2450 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2451 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2452 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2453 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2454 2455 vpmuludq $H4,$S1,$M0 2456 vpmuludq $H4,$S2,$M1 2457 vpmuludq $H4,$S3,$M2 2458 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2459 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2460 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2461 2462 ################################################################ 2463 # lazy reduction (interleaved with input splat) 2464 2465 vpsrlq \$52,$T0,$T2 # splat input 2466 vpsllq \$12,$T4,$T3 2467 2468 vpsrlq \$26,$D3,$H3 2469 vpandq $MASK,$D3,$D3 2470 vpaddq $H3,$D4,$H4 # h3 -> h4 2471 2472 vporq $T3,$T2,$T2 2473 2474 vpsrlq \$26,$H0,$D0 2475 vpandq $MASK,$H0,$H0 2476 vpaddq $D0,$H1,$H1 # h0 -> h1 2477 2478 vpandq $MASK,$T2,$T2 # 2 2479 2480 vpsrlq \$26,$H4,$D4 2481 vpandq $MASK,$H4,$H4 2482 2483 vpsrlq \$26,$H1,$D1 2484 vpandq $MASK,$H1,$H1 2485 vpaddq $D1,$H2,$H2 # h1 -> h2 2486 2487 vpaddq $D4,$H0,$H0 2488 vpsllq \$2,$D4,$D4 2489 vpaddq $D4,$H0,$H0 # h4 -> h0 2490 2491 vpaddq $T2,$H2,$H2 # modulo-scheduled 2492 vpsrlq \$26,$T0,$T1 2493 2494 vpsrlq \$26,$H2,$D2 2495 vpandq $MASK,$H2,$H2 2496 vpaddq $D2,$D3,$H3 # h2 -> h3 2497 2498 vpsrlq \$14,$T4,$T3 2499 2500 vpsrlq \$26,$H0,$D0 2501 vpandq $MASK,$H0,$H0 2502 vpaddq $D0,$H1,$H1 # h0 -> h1 2503 2504 vpsrlq \$40,$T4,$T4 # 4 2505 2506 vpsrlq \$26,$H3,$D3 2507 vpandq $MASK,$H3,$H3 2508 vpaddq $D3,$H4,$H4 # h3 -> h4 2509 2510 vpandq $MASK,$T0,$T0 # 0 2511 #vpandq $MASK,$T1,$T1 # 1 2512 #vpandq $MASK,$T3,$T3 # 3 2513 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2514 2515 sub \$128,$len 2516 ja .Loop_avx512 2517 2518.Ltail_avx512: 2519 ################################################################ 2520 # while above multiplications were by r^8 in all lanes, in last 2521 # iteration we multiply least significant lane by r^8 and most 2522 # significant one by r, that's why table gets shifted... 2523 2524 vpsrlq \$32,$R0,$R0 # 0105020603070408 2525 vpsrlq \$32,$R1,$R1 2526 vpsrlq \$32,$R2,$R2 2527 vpsrlq \$32,$S3,$S3 2528 vpsrlq \$32,$S4,$S4 2529 vpsrlq \$32,$R3,$R3 2530 vpsrlq \$32,$R4,$R4 2531 vpsrlq \$32,$S1,$S1 2532 vpsrlq \$32,$S2,$S2 2533 2534 ################################################################ 2535 # load either next or last 64 byte of input 2536 lea ($inp,$len),$inp 2537 2538 #vpaddq $H2,$T2,$H2 # accumulate input 2539 vpaddq $H0,$T0,$H0 2540 2541 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2542 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2543 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2544 vpandq $MASK,$T1,$T1 # 1 2545 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2546 vpandq $MASK,$T3,$T3 # 3 2547 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2548 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2549 vpaddq $H1,$T1,$H1 # accumulate input 2550 vpaddq $H3,$T3,$H3 2551 vpaddq $H4,$T4,$H4 2552 2553 vmovdqu 16*0($inp),%x#$T0 2554 vpmuludq $H0,$R3,$M3 2555 vpmuludq $H0,$R4,$M4 2556 vpmuludq $H0,$R0,$M0 2557 vpmuludq $H0,$R1,$M1 2558 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2559 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2560 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2561 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2562 2563 vmovdqu 16*1($inp),%x#$T1 2564 vpmuludq $H1,$R2,$M3 2565 vpmuludq $H1,$R3,$M4 2566 vpmuludq $H1,$S4,$M0 2567 vpmuludq $H0,$R2,$M2 2568 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2569 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2570 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2571 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2572 2573 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2574 vpmuludq $H3,$R0,$M3 2575 vpmuludq $H3,$R1,$M4 2576 vpmuludq $H1,$R0,$M1 2577 vpmuludq $H1,$R1,$M2 2578 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2579 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2580 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2581 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2582 2583 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2584 vpmuludq $H4,$S4,$M3 2585 vpmuludq $H4,$R0,$M4 2586 vpmuludq $H3,$S2,$M0 2587 vpmuludq $H3,$S3,$M1 2588 vpmuludq $H3,$S4,$M2 2589 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2590 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2591 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2592 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2593 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2594 2595 vpmuludq $H4,$S1,$M0 2596 vpmuludq $H4,$S2,$M1 2597 vpmuludq $H4,$S3,$M2 2598 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2599 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2600 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2601 2602 ################################################################ 2603 # horizontal addition 2604 2605 mov \$1,%eax 2606 vpermq \$0xb1,$H3,$D3 2607 vpermq \$0xb1,$D4,$H4 2608 vpermq \$0xb1,$H0,$D0 2609 vpermq \$0xb1,$H1,$D1 2610 vpermq \$0xb1,$H2,$D2 2611 vpaddq $D3,$H3,$H3 2612 vpaddq $D4,$H4,$H4 2613 vpaddq $D0,$H0,$H0 2614 vpaddq $D1,$H1,$H1 2615 vpaddq $D2,$H2,$H2 2616 2617 kmovw %eax,%k3 2618 vpermq \$0x2,$H3,$D3 2619 vpermq \$0x2,$H4,$D4 2620 vpermq \$0x2,$H0,$D0 2621 vpermq \$0x2,$H1,$D1 2622 vpermq \$0x2,$H2,$D2 2623 vpaddq $D3,$H3,$H3 2624 vpaddq $D4,$H4,$H4 2625 vpaddq $D0,$H0,$H0 2626 vpaddq $D1,$H1,$H1 2627 vpaddq $D2,$H2,$H2 2628 2629 vextracti64x4 \$0x1,$H3,%y#$D3 2630 vextracti64x4 \$0x1,$H4,%y#$D4 2631 vextracti64x4 \$0x1,$H0,%y#$D0 2632 vextracti64x4 \$0x1,$H1,%y#$D1 2633 vextracti64x4 \$0x1,$H2,%y#$D2 2634 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2635 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2636 vpaddq $D0,$H0,${H0}{%k3}{z} 2637 vpaddq $D1,$H1,${H1}{%k3}{z} 2638 vpaddq $D2,$H2,${H2}{%k3}{z} 2639___ 2640map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2641map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 2642$code.=<<___; 2643 ################################################################ 2644 # lazy reduction (interleaved with input splat) 2645 2646 vpsrlq \$26,$H3,$D3 2647 vpand $MASK,$H3,$H3 2648 vpsrldq \$6,$T0,$T2 # splat input 2649 vpsrldq \$6,$T1,$T3 2650 vpunpckhqdq $T1,$T0,$T4 # 4 2651 vpaddq $D3,$H4,$H4 # h3 -> h4 2652 2653 vpsrlq \$26,$H0,$D0 2654 vpand $MASK,$H0,$H0 2655 vpunpcklqdq $T3,$T2,$T2 # 2:3 2656 vpunpcklqdq $T1,$T0,$T0 # 0:1 2657 vpaddq $D0,$H1,$H1 # h0 -> h1 2658 2659 vpsrlq \$26,$H4,$D4 2660 vpand $MASK,$H4,$H4 2661 2662 vpsrlq \$26,$H1,$D1 2663 vpand $MASK,$H1,$H1 2664 vpsrlq \$30,$T2,$T3 2665 vpsrlq \$4,$T2,$T2 2666 vpaddq $D1,$H2,$H2 # h1 -> h2 2667 2668 vpaddq $D4,$H0,$H0 2669 vpsllq \$2,$D4,$D4 2670 vpsrlq \$26,$T0,$T1 2671 vpsrlq \$40,$T4,$T4 # 4 2672 vpaddq $D4,$H0,$H0 # h4 -> h0 2673 2674 vpsrlq \$26,$H2,$D2 2675 vpand $MASK,$H2,$H2 2676 vpand $MASK,$T2,$T2 # 2 2677 vpand $MASK,$T0,$T0 # 0 2678 vpaddq $D2,$H3,$H3 # h2 -> h3 2679 2680 vpsrlq \$26,$H0,$D0 2681 vpand $MASK,$H0,$H0 2682 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2683 vpand $MASK,$T1,$T1 # 1 2684 vpaddq $D0,$H1,$H1 # h0 -> h1 2685 2686 vpsrlq \$26,$H3,$D3 2687 vpand $MASK,$H3,$H3 2688 vpand $MASK,$T3,$T3 # 3 2689 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2690 vpaddq $D3,$H4,$H4 # h3 -> h4 2691 2692 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2693 add \$64,$len 2694 jnz .Ltail_avx2 2695 2696 vpsubq $T2,$H2,$H2 # undo input accumulation 2697 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2698 vmovd %x#$H1,`4*1-48-64`($ctx) 2699 vmovd %x#$H2,`4*2-48-64`($ctx) 2700 vmovd %x#$H3,`4*3-48-64`($ctx) 2701 vmovd %x#$H4,`4*4-48-64`($ctx) 2702 vzeroall 2703___ 2704$code.=<<___ if ($win64); 2705 movdqa 0x50(%r11),%xmm6 2706 movdqa 0x60(%r11),%xmm7 2707 movdqa 0x70(%r11),%xmm8 2708 movdqa 0x80(%r11),%xmm9 2709 movdqa 0x90(%r11),%xmm10 2710 movdqa 0xa0(%r11),%xmm11 2711 movdqa 0xb0(%r11),%xmm12 2712 movdqa 0xc0(%r11),%xmm13 2713 movdqa 0xd0(%r11),%xmm14 2714 movdqa 0xe0(%r11),%xmm15 2715 lea 0xf8(%r11),%rsp 2716.Ldo_avx512_epilogue: 2717___ 2718$code.=<<___ if (!$win64); 2719 lea 8(%r11),%rsp 2720.cfi_def_cfa %rsp,8 2721___ 2722$code.=<<___; 2723 ret 2724.cfi_endproc 2725.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 2726___ 2727if ($avx>3) { 2728######################################################################## 2729# VPMADD52 version using 2^44 radix. 2730# 2731# One can argue that base 2^52 would be more natural. Well, even though 2732# some operations would be more natural, one has to recognize couple of 2733# things. Base 2^52 doesn't provide advantage over base 2^44 if you look 2734# at amount of multiply-n-accumulate operations. Secondly, it makes it 2735# impossible to pre-compute multiples of 5 [referred to as s[]/sN in 2736# reference implementations], which means that more such operations 2737# would have to be performed in inner loop, which in turn makes critical 2738# path longer. In other words, even though base 2^44 reduction might 2739# look less elegant, overall critical path is actually shorter... 2740 2741######################################################################## 2742# Layout of opaque area is following. 2743# 2744# unsigned __int64 h[3]; # current hash value base 2^44 2745# unsigned __int64 s[2]; # key value*20 base 2^44 2746# unsigned __int64 r[3]; # key value base 2^44 2747# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; 2748# # r^n positions reflect 2749# # placement in register, not 2750# # memory, R[3] is R[1]*20 2751 2752$code.=<<___; 2753.type poly1305_init_base2_44,\@function,3 2754.align 32 2755poly1305_init_base2_44: 2756.cfi_startproc 2757 xor %rax,%rax 2758 mov %rax,0($ctx) # initialize hash value 2759 mov %rax,8($ctx) 2760 mov %rax,16($ctx) 2761 2762.Linit_base2_44: 2763 lea poly1305_blocks_vpmadd52(%rip),%r10 2764 lea poly1305_emit_base2_44(%rip),%r11 2765 2766 mov \$0x0ffffffc0fffffff,%rax 2767 mov \$0x0ffffffc0ffffffc,%rcx 2768 and 0($inp),%rax 2769 mov \$0x00000fffffffffff,%r8 2770 and 8($inp),%rcx 2771 mov \$0x00000fffffffffff,%r9 2772 and %rax,%r8 2773 shrd \$44,%rcx,%rax 2774 mov %r8,40($ctx) # r0 2775 and %r9,%rax 2776 shr \$24,%rcx 2777 mov %rax,48($ctx) # r1 2778 lea (%rax,%rax,4),%rax # *5 2779 mov %rcx,56($ctx) # r2 2780 shl \$2,%rax # magic <<2 2781 lea (%rcx,%rcx,4),%rcx # *5 2782 shl \$2,%rcx # magic <<2 2783 mov %rax,24($ctx) # s1 2784 mov %rcx,32($ctx) # s2 2785 movq \$-1,64($ctx) # write impossible value 2786___ 2787$code.=<<___ if ($flavour !~ /elf32/); 2788 mov %r10,0(%rdx) 2789 mov %r11,8(%rdx) 2790___ 2791$code.=<<___ if ($flavour =~ /elf32/); 2792 mov %r10d,0(%rdx) 2793 mov %r11d,4(%rdx) 2794___ 2795$code.=<<___; 2796 mov \$1,%eax 2797 ret 2798.cfi_endproc 2799.size poly1305_init_base2_44,.-poly1305_init_base2_44 2800___ 2801{ 2802my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); 2803my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); 2804my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); 2805 2806$code.=<<___; 2807.type poly1305_blocks_vpmadd52,\@function,4 2808.align 32 2809poly1305_blocks_vpmadd52: 2810.cfi_startproc 2811 endbranch 2812 shr \$4,$len 2813 jz .Lno_data_vpmadd52 # too short 2814 2815 shl \$40,$padbit 2816 mov 64($ctx),%r8 # peek on power of the key 2817 2818 # if powers of the key are not calculated yet, process up to 3 2819 # blocks with this single-block subroutine, otherwise ensure that 2820 # length is divisible by 2 blocks and pass the rest down to next 2821 # subroutine... 2822 2823 mov \$3,%rax 2824 mov \$1,%r10 2825 cmp \$4,$len # is input long 2826 cmovae %r10,%rax 2827 test %r8,%r8 # is power value impossible? 2828 cmovns %r10,%rax 2829 2830 and $len,%rax # is input of favourable length? 2831 jz .Lblocks_vpmadd52_4x 2832 2833 sub %rax,$len 2834 mov \$7,%r10d 2835 mov \$1,%r11d 2836 kmovw %r10d,%k7 2837 lea .L2_44_inp_permd(%rip),%r10 2838 kmovw %r11d,%k1 2839 2840 vmovq $padbit,%x#$PAD 2841 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd 2842 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift 2843 vpermq \$0xcf,$PAD,$PAD 2844 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask 2845 2846 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value 2847 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys 2848 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} 2849 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} 2850 2851 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt 2852 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft 2853 2854 jmp .Loop_vpmadd52 2855 2856.align 32 2857.Loop_vpmadd52: 2858 vmovdqu32 0($inp),%x#$T0 # load input as ----3210 2859 lea 16($inp),$inp 2860 2861 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 2862 vpsrlvq $inp_shift,$T0,$T0 2863 vpandq $reduc_mask,$T0,$T0 2864 vporq $PAD,$T0,$T0 2865 2866 vpaddq $T0,$Dlo,$Dlo # accumulate input 2867 2868 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value 2869 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} 2870 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} 2871 2872 vpxord $Dlo,$Dlo,$Dlo 2873 vpxord $Dhi,$Dhi,$Dhi 2874 2875 vpmadd52luq $r2r1r0,$H0,$Dlo 2876 vpmadd52huq $r2r1r0,$H0,$Dhi 2877 2878 vpmadd52luq $r1r0s2,$H1,$Dlo 2879 vpmadd52huq $r1r0s2,$H1,$Dhi 2880 2881 vpmadd52luq $r0s2s1,$H2,$Dlo 2882 vpmadd52huq $r0s2s1,$H2,$Dhi 2883 2884 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword 2885 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword 2886 vpandq $reduc_mask,$Dlo,$Dlo 2887 2888 vpaddq $T0,$Dhi,$Dhi 2889 2890 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword 2891 2892 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) 2893 2894 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word 2895 vpandq $reduc_mask,$Dlo,$Dlo 2896 2897 vpermq \$0b10010011,$T0,$T0 2898 2899 vpaddq $T0,$Dlo,$Dlo 2900 2901 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} 2902 2903 vpaddq $T0,$Dlo,$Dlo 2904 vpsllq \$2,$T0,$T0 2905 2906 vpaddq $T0,$Dlo,$Dlo 2907 2908 dec %rax # len-=16 2909 jnz .Loop_vpmadd52 2910 2911 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value 2912 2913 test $len,$len 2914 jnz .Lblocks_vpmadd52_4x 2915 2916.Lno_data_vpmadd52: 2917 ret 2918.cfi_endproc 2919.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 2920___ 2921} 2922{ 2923######################################################################## 2924# As implied by its name 4x subroutine processes 4 blocks in parallel 2925# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power 2926# and is handled in 256-bit %ymm registers. 2927 2928my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 2929my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 2930my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 2931 2932$code.=<<___; 2933.type poly1305_blocks_vpmadd52_4x,\@function,4 2934.align 32 2935poly1305_blocks_vpmadd52_4x: 2936.cfi_startproc 2937 shr \$4,$len 2938 jz .Lno_data_vpmadd52_4x # too short 2939 2940 shl \$40,$padbit 2941 mov 64($ctx),%r8 # peek on power of the key 2942 2943.Lblocks_vpmadd52_4x: 2944 vpbroadcastq $padbit,$PAD 2945 2946 vmovdqa64 .Lx_mask44(%rip),$mask44 2947 mov \$5,%eax 2948 vmovdqa64 .Lx_mask42(%rip),$mask42 2949 kmovw %eax,%k1 # used in 2x path 2950 2951 test %r8,%r8 # is power value impossible? 2952 js .Linit_vpmadd52 # if it is, then init R[4] 2953 2954 vmovq 0($ctx),%x#$H0 # load current hash value 2955 vmovq 8($ctx),%x#$H1 2956 vmovq 16($ctx),%x#$H2 2957 2958 test \$3,$len # is length 4*n+2? 2959 jnz .Lblocks_vpmadd52_2x_do 2960 2961.Lblocks_vpmadd52_4x_do: 2962 vpbroadcastq 64($ctx),$R0 # load 4th power of the key 2963 vpbroadcastq 96($ctx),$R1 2964 vpbroadcastq 128($ctx),$R2 2965 vpbroadcastq 160($ctx),$S1 2966 2967.Lblocks_vpmadd52_4x_key_loaded: 2968 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 2969 vpaddq $R2,$S2,$S2 2970 vpsllq \$2,$S2,$S2 2971 2972 test \$7,$len # is len 8*n? 2973 jz .Lblocks_vpmadd52_8x 2974 2975 vmovdqu64 16*0($inp),$T2 # load data 2976 vmovdqu64 16*2($inp),$T3 2977 lea 16*4($inp),$inp 2978 2979 vpunpcklqdq $T3,$T2,$T1 # transpose data 2980 vpunpckhqdq $T3,$T2,$T3 2981 2982 # at this point 64-bit lanes are ordered as 3-1-2-0 2983 2984 vpsrlq \$24,$T3,$T2 # splat the data 2985 vporq $PAD,$T2,$T2 2986 vpaddq $T2,$H2,$H2 # accumulate input 2987 vpandq $mask44,$T1,$T0 2988 vpsrlq \$44,$T1,$T1 2989 vpsllq \$20,$T3,$T3 2990 vporq $T3,$T1,$T1 2991 vpandq $mask44,$T1,$T1 2992 2993 sub \$4,$len 2994 jz .Ltail_vpmadd52_4x 2995 jmp .Loop_vpmadd52_4x 2996 ud2 2997 2998.align 32 2999.Linit_vpmadd52: 3000 vmovq 24($ctx),%x#$S1 # load key 3001 vmovq 56($ctx),%x#$H2 3002 vmovq 32($ctx),%x#$S2 3003 vmovq 40($ctx),%x#$R0 3004 vmovq 48($ctx),%x#$R1 3005 3006 vmovdqa $R0,$H0 3007 vmovdqa $R1,$H1 3008 vmovdqa $H2,$R2 3009 3010 mov \$2,%eax 3011 3012.Lmul_init_vpmadd52: 3013 vpxorq $D0lo,$D0lo,$D0lo 3014 vpmadd52luq $H2,$S1,$D0lo 3015 vpxorq $D0hi,$D0hi,$D0hi 3016 vpmadd52huq $H2,$S1,$D0hi 3017 vpxorq $D1lo,$D1lo,$D1lo 3018 vpmadd52luq $H2,$S2,$D1lo 3019 vpxorq $D1hi,$D1hi,$D1hi 3020 vpmadd52huq $H2,$S2,$D1hi 3021 vpxorq $D2lo,$D2lo,$D2lo 3022 vpmadd52luq $H2,$R0,$D2lo 3023 vpxorq $D2hi,$D2hi,$D2hi 3024 vpmadd52huq $H2,$R0,$D2hi 3025 3026 vpmadd52luq $H0,$R0,$D0lo 3027 vpmadd52huq $H0,$R0,$D0hi 3028 vpmadd52luq $H0,$R1,$D1lo 3029 vpmadd52huq $H0,$R1,$D1hi 3030 vpmadd52luq $H0,$R2,$D2lo 3031 vpmadd52huq $H0,$R2,$D2hi 3032 3033 vpmadd52luq $H1,$S2,$D0lo 3034 vpmadd52huq $H1,$S2,$D0hi 3035 vpmadd52luq $H1,$R0,$D1lo 3036 vpmadd52huq $H1,$R0,$D1hi 3037 vpmadd52luq $H1,$R1,$D2lo 3038 vpmadd52huq $H1,$R1,$D2hi 3039 3040 ################################################################ 3041 # partial reduction 3042 vpsrlq \$44,$D0lo,$tmp 3043 vpsllq \$8,$D0hi,$D0hi 3044 vpandq $mask44,$D0lo,$H0 3045 vpaddq $tmp,$D0hi,$D0hi 3046 3047 vpaddq $D0hi,$D1lo,$D1lo 3048 3049 vpsrlq \$44,$D1lo,$tmp 3050 vpsllq \$8,$D1hi,$D1hi 3051 vpandq $mask44,$D1lo,$H1 3052 vpaddq $tmp,$D1hi,$D1hi 3053 3054 vpaddq $D1hi,$D2lo,$D2lo 3055 3056 vpsrlq \$42,$D2lo,$tmp 3057 vpsllq \$10,$D2hi,$D2hi 3058 vpandq $mask42,$D2lo,$H2 3059 vpaddq $tmp,$D2hi,$D2hi 3060 3061 vpaddq $D2hi,$H0,$H0 3062 vpsllq \$2,$D2hi,$D2hi 3063 3064 vpaddq $D2hi,$H0,$H0 3065 3066 vpsrlq \$44,$H0,$tmp # additional step 3067 vpandq $mask44,$H0,$H0 3068 3069 vpaddq $tmp,$H1,$H1 3070 3071 dec %eax 3072 jz .Ldone_init_vpmadd52 3073 3074 vpunpcklqdq $R1,$H1,$R1 # 1,2 3075 vpbroadcastq %x#$H1,%x#$H1 # 2,2 3076 vpunpcklqdq $R2,$H2,$R2 3077 vpbroadcastq %x#$H2,%x#$H2 3078 vpunpcklqdq $R0,$H0,$R0 3079 vpbroadcastq %x#$H0,%x#$H0 3080 3081 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3082 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3083 vpaddq $R1,$S1,$S1 3084 vpaddq $R2,$S2,$S2 3085 vpsllq \$2,$S1,$S1 3086 vpsllq \$2,$S2,$S2 3087 3088 jmp .Lmul_init_vpmadd52 3089 ud2 3090 3091.align 32 3092.Ldone_init_vpmadd52: 3093 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 3094 vinserti128 \$1,%x#$R2,$H2,$R2 3095 vinserti128 \$1,%x#$R0,$H0,$R0 3096 3097 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 3098 vpermq \$0b11011000,$R2,$R2 3099 vpermq \$0b11011000,$R0,$R0 3100 3101 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3102 vpaddq $R1,$S1,$S1 3103 vpsllq \$2,$S1,$S1 3104 3105 vmovq 0($ctx),%x#$H0 # load current hash value 3106 vmovq 8($ctx),%x#$H1 3107 vmovq 16($ctx),%x#$H2 3108 3109 test \$3,$len # is length 4*n+2? 3110 jnz .Ldone_init_vpmadd52_2x 3111 3112 vmovdqu64 $R0,64($ctx) # save key powers 3113 vpbroadcastq %x#$R0,$R0 # broadcast 4th power 3114 vmovdqu64 $R1,96($ctx) 3115 vpbroadcastq %x#$R1,$R1 3116 vmovdqu64 $R2,128($ctx) 3117 vpbroadcastq %x#$R2,$R2 3118 vmovdqu64 $S1,160($ctx) 3119 vpbroadcastq %x#$S1,$S1 3120 3121 jmp .Lblocks_vpmadd52_4x_key_loaded 3122 ud2 3123 3124.align 32 3125.Ldone_init_vpmadd52_2x: 3126 vmovdqu64 $R0,64($ctx) # save key powers 3127 vpsrldq \$8,$R0,$R0 # 0-1-0-2 3128 vmovdqu64 $R1,96($ctx) 3129 vpsrldq \$8,$R1,$R1 3130 vmovdqu64 $R2,128($ctx) 3131 vpsrldq \$8,$R2,$R2 3132 vmovdqu64 $S1,160($ctx) 3133 vpsrldq \$8,$S1,$S1 3134 jmp .Lblocks_vpmadd52_2x_key_loaded 3135 ud2 3136 3137.align 32 3138.Lblocks_vpmadd52_2x_do: 3139 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 3140 vmovdqu64 160+8($ctx),${S1}{%k1}{z} 3141 vmovdqu64 64+8($ctx),${R0}{%k1}{z} 3142 vmovdqu64 96+8($ctx),${R1}{%k1}{z} 3143 3144.Lblocks_vpmadd52_2x_key_loaded: 3145 vmovdqu64 16*0($inp),$T2 # load data 3146 vpxorq $T3,$T3,$T3 3147 lea 16*2($inp),$inp 3148 3149 vpunpcklqdq $T3,$T2,$T1 # transpose data 3150 vpunpckhqdq $T3,$T2,$T3 3151 3152 # at this point 64-bit lanes are ordered as x-1-x-0 3153 3154 vpsrlq \$24,$T3,$T2 # splat the data 3155 vporq $PAD,$T2,$T2 3156 vpaddq $T2,$H2,$H2 # accumulate input 3157 vpandq $mask44,$T1,$T0 3158 vpsrlq \$44,$T1,$T1 3159 vpsllq \$20,$T3,$T3 3160 vporq $T3,$T1,$T1 3161 vpandq $mask44,$T1,$T1 3162 3163 jmp .Ltail_vpmadd52_2x 3164 ud2 3165 3166.align 32 3167.Loop_vpmadd52_4x: 3168 #vpaddq $T2,$H2,$H2 # accumulate input 3169 vpaddq $T0,$H0,$H0 3170 vpaddq $T1,$H1,$H1 3171 3172 vpxorq $D0lo,$D0lo,$D0lo 3173 vpmadd52luq $H2,$S1,$D0lo 3174 vpxorq $D0hi,$D0hi,$D0hi 3175 vpmadd52huq $H2,$S1,$D0hi 3176 vpxorq $D1lo,$D1lo,$D1lo 3177 vpmadd52luq $H2,$S2,$D1lo 3178 vpxorq $D1hi,$D1hi,$D1hi 3179 vpmadd52huq $H2,$S2,$D1hi 3180 vpxorq $D2lo,$D2lo,$D2lo 3181 vpmadd52luq $H2,$R0,$D2lo 3182 vpxorq $D2hi,$D2hi,$D2hi 3183 vpmadd52huq $H2,$R0,$D2hi 3184 3185 vmovdqu64 16*0($inp),$T2 # load data 3186 vmovdqu64 16*2($inp),$T3 3187 lea 16*4($inp),$inp 3188 vpmadd52luq $H0,$R0,$D0lo 3189 vpmadd52huq $H0,$R0,$D0hi 3190 vpmadd52luq $H0,$R1,$D1lo 3191 vpmadd52huq $H0,$R1,$D1hi 3192 vpmadd52luq $H0,$R2,$D2lo 3193 vpmadd52huq $H0,$R2,$D2hi 3194 3195 vpunpcklqdq $T3,$T2,$T1 # transpose data 3196 vpunpckhqdq $T3,$T2,$T3 3197 vpmadd52luq $H1,$S2,$D0lo 3198 vpmadd52huq $H1,$S2,$D0hi 3199 vpmadd52luq $H1,$R0,$D1lo 3200 vpmadd52huq $H1,$R0,$D1hi 3201 vpmadd52luq $H1,$R1,$D2lo 3202 vpmadd52huq $H1,$R1,$D2hi 3203 3204 ################################################################ 3205 # partial reduction (interleaved with data splat) 3206 vpsrlq \$44,$D0lo,$tmp 3207 vpsllq \$8,$D0hi,$D0hi 3208 vpandq $mask44,$D0lo,$H0 3209 vpaddq $tmp,$D0hi,$D0hi 3210 3211 vpsrlq \$24,$T3,$T2 3212 vporq $PAD,$T2,$T2 3213 vpaddq $D0hi,$D1lo,$D1lo 3214 3215 vpsrlq \$44,$D1lo,$tmp 3216 vpsllq \$8,$D1hi,$D1hi 3217 vpandq $mask44,$D1lo,$H1 3218 vpaddq $tmp,$D1hi,$D1hi 3219 3220 vpandq $mask44,$T1,$T0 3221 vpsrlq \$44,$T1,$T1 3222 vpsllq \$20,$T3,$T3 3223 vpaddq $D1hi,$D2lo,$D2lo 3224 3225 vpsrlq \$42,$D2lo,$tmp 3226 vpsllq \$10,$D2hi,$D2hi 3227 vpandq $mask42,$D2lo,$H2 3228 vpaddq $tmp,$D2hi,$D2hi 3229 3230 vpaddq $T2,$H2,$H2 # accumulate input 3231 vpaddq $D2hi,$H0,$H0 3232 vpsllq \$2,$D2hi,$D2hi 3233 3234 vpaddq $D2hi,$H0,$H0 3235 vporq $T3,$T1,$T1 3236 vpandq $mask44,$T1,$T1 3237 3238 vpsrlq \$44,$H0,$tmp # additional step 3239 vpandq $mask44,$H0,$H0 3240 3241 vpaddq $tmp,$H1,$H1 3242 3243 sub \$4,$len # len-=64 3244 jnz .Loop_vpmadd52_4x 3245 3246.Ltail_vpmadd52_4x: 3247 vmovdqu64 128($ctx),$R2 # load all key powers 3248 vmovdqu64 160($ctx),$S1 3249 vmovdqu64 64($ctx),$R0 3250 vmovdqu64 96($ctx),$R1 3251 3252.Ltail_vpmadd52_2x: 3253 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3254 vpaddq $R2,$S2,$S2 3255 vpsllq \$2,$S2,$S2 3256 3257 #vpaddq $T2,$H2,$H2 # accumulate input 3258 vpaddq $T0,$H0,$H0 3259 vpaddq $T1,$H1,$H1 3260 3261 vpxorq $D0lo,$D0lo,$D0lo 3262 vpmadd52luq $H2,$S1,$D0lo 3263 vpxorq $D0hi,$D0hi,$D0hi 3264 vpmadd52huq $H2,$S1,$D0hi 3265 vpxorq $D1lo,$D1lo,$D1lo 3266 vpmadd52luq $H2,$S2,$D1lo 3267 vpxorq $D1hi,$D1hi,$D1hi 3268 vpmadd52huq $H2,$S2,$D1hi 3269 vpxorq $D2lo,$D2lo,$D2lo 3270 vpmadd52luq $H2,$R0,$D2lo 3271 vpxorq $D2hi,$D2hi,$D2hi 3272 vpmadd52huq $H2,$R0,$D2hi 3273 3274 vpmadd52luq $H0,$R0,$D0lo 3275 vpmadd52huq $H0,$R0,$D0hi 3276 vpmadd52luq $H0,$R1,$D1lo 3277 vpmadd52huq $H0,$R1,$D1hi 3278 vpmadd52luq $H0,$R2,$D2lo 3279 vpmadd52huq $H0,$R2,$D2hi 3280 3281 vpmadd52luq $H1,$S2,$D0lo 3282 vpmadd52huq $H1,$S2,$D0hi 3283 vpmadd52luq $H1,$R0,$D1lo 3284 vpmadd52huq $H1,$R0,$D1hi 3285 vpmadd52luq $H1,$R1,$D2lo 3286 vpmadd52huq $H1,$R1,$D2hi 3287 3288 ################################################################ 3289 # horizontal addition 3290 3291 mov \$1,%eax 3292 kmovw %eax,%k1 3293 vpsrldq \$8,$D0lo,$T0 3294 vpsrldq \$8,$D0hi,$H0 3295 vpsrldq \$8,$D1lo,$T1 3296 vpsrldq \$8,$D1hi,$H1 3297 vpaddq $T0,$D0lo,$D0lo 3298 vpaddq $H0,$D0hi,$D0hi 3299 vpsrldq \$8,$D2lo,$T2 3300 vpsrldq \$8,$D2hi,$H2 3301 vpaddq $T1,$D1lo,$D1lo 3302 vpaddq $H1,$D1hi,$D1hi 3303 vpermq \$0x2,$D0lo,$T0 3304 vpermq \$0x2,$D0hi,$H0 3305 vpaddq $T2,$D2lo,$D2lo 3306 vpaddq $H2,$D2hi,$D2hi 3307 3308 vpermq \$0x2,$D1lo,$T1 3309 vpermq \$0x2,$D1hi,$H1 3310 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3311 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3312 vpermq \$0x2,$D2lo,$T2 3313 vpermq \$0x2,$D2hi,$H2 3314 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3315 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3316 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3317 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3318 3319 ################################################################ 3320 # partial reduction 3321 vpsrlq \$44,$D0lo,$tmp 3322 vpsllq \$8,$D0hi,$D0hi 3323 vpandq $mask44,$D0lo,$H0 3324 vpaddq $tmp,$D0hi,$D0hi 3325 3326 vpaddq $D0hi,$D1lo,$D1lo 3327 3328 vpsrlq \$44,$D1lo,$tmp 3329 vpsllq \$8,$D1hi,$D1hi 3330 vpandq $mask44,$D1lo,$H1 3331 vpaddq $tmp,$D1hi,$D1hi 3332 3333 vpaddq $D1hi,$D2lo,$D2lo 3334 3335 vpsrlq \$42,$D2lo,$tmp 3336 vpsllq \$10,$D2hi,$D2hi 3337 vpandq $mask42,$D2lo,$H2 3338 vpaddq $tmp,$D2hi,$D2hi 3339 3340 vpaddq $D2hi,$H0,$H0 3341 vpsllq \$2,$D2hi,$D2hi 3342 3343 vpaddq $D2hi,$H0,$H0 3344 3345 vpsrlq \$44,$H0,$tmp # additional step 3346 vpandq $mask44,$H0,$H0 3347 3348 vpaddq $tmp,$H1,$H1 3349 # at this point $len is 3350 # either 4*n+2 or 0... 3351 sub \$2,$len # len-=32 3352 ja .Lblocks_vpmadd52_4x_do 3353 3354 vmovq %x#$H0,0($ctx) 3355 vmovq %x#$H1,8($ctx) 3356 vmovq %x#$H2,16($ctx) 3357 vzeroall 3358 3359.Lno_data_vpmadd52_4x: 3360 ret 3361.cfi_endproc 3362.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 3363___ 3364} 3365{ 3366######################################################################## 3367# As implied by its name 8x subroutine processes 8 blocks in parallel... 3368# This is intermediate version, as it's used only in cases when input 3369# length is either 8*n, 8*n+1 or 8*n+2... 3370 3371my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3372my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3373my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3374my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 3375 3376$code.=<<___; 3377.type poly1305_blocks_vpmadd52_8x,\@function,4 3378.align 32 3379poly1305_blocks_vpmadd52_8x: 3380.cfi_startproc 3381 shr \$4,$len 3382 jz .Lno_data_vpmadd52_8x # too short 3383 3384 shl \$40,$padbit 3385 mov 64($ctx),%r8 # peek on power of the key 3386 3387 vmovdqa64 .Lx_mask44(%rip),$mask44 3388 vmovdqa64 .Lx_mask42(%rip),$mask42 3389 3390 test %r8,%r8 # is power value impossible? 3391 js .Linit_vpmadd52 # if it is, then init R[4] 3392 3393 vmovq 0($ctx),%x#$H0 # load current hash value 3394 vmovq 8($ctx),%x#$H1 3395 vmovq 16($ctx),%x#$H2 3396 3397.Lblocks_vpmadd52_8x: 3398 ################################################################ 3399 # fist we calculate more key powers 3400 3401 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3402 vmovdqu64 160($ctx),$S1 3403 vmovdqu64 64($ctx),$R0 3404 vmovdqu64 96($ctx),$R1 3405 3406 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3407 vpaddq $R2,$S2,$S2 3408 vpsllq \$2,$S2,$S2 3409 3410 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3411 vpbroadcastq %x#$R0,$RR0 3412 vpbroadcastq %x#$R1,$RR1 3413 3414 vpxorq $D0lo,$D0lo,$D0lo 3415 vpmadd52luq $RR2,$S1,$D0lo 3416 vpxorq $D0hi,$D0hi,$D0hi 3417 vpmadd52huq $RR2,$S1,$D0hi 3418 vpxorq $D1lo,$D1lo,$D1lo 3419 vpmadd52luq $RR2,$S2,$D1lo 3420 vpxorq $D1hi,$D1hi,$D1hi 3421 vpmadd52huq $RR2,$S2,$D1hi 3422 vpxorq $D2lo,$D2lo,$D2lo 3423 vpmadd52luq $RR2,$R0,$D2lo 3424 vpxorq $D2hi,$D2hi,$D2hi 3425 vpmadd52huq $RR2,$R0,$D2hi 3426 3427 vpmadd52luq $RR0,$R0,$D0lo 3428 vpmadd52huq $RR0,$R0,$D0hi 3429 vpmadd52luq $RR0,$R1,$D1lo 3430 vpmadd52huq $RR0,$R1,$D1hi 3431 vpmadd52luq $RR0,$R2,$D2lo 3432 vpmadd52huq $RR0,$R2,$D2hi 3433 3434 vpmadd52luq $RR1,$S2,$D0lo 3435 vpmadd52huq $RR1,$S2,$D0hi 3436 vpmadd52luq $RR1,$R0,$D1lo 3437 vpmadd52huq $RR1,$R0,$D1hi 3438 vpmadd52luq $RR1,$R1,$D2lo 3439 vpmadd52huq $RR1,$R1,$D2hi 3440 3441 ################################################################ 3442 # partial reduction 3443 vpsrlq \$44,$D0lo,$tmp 3444 vpsllq \$8,$D0hi,$D0hi 3445 vpandq $mask44,$D0lo,$RR0 3446 vpaddq $tmp,$D0hi,$D0hi 3447 3448 vpaddq $D0hi,$D1lo,$D1lo 3449 3450 vpsrlq \$44,$D1lo,$tmp 3451 vpsllq \$8,$D1hi,$D1hi 3452 vpandq $mask44,$D1lo,$RR1 3453 vpaddq $tmp,$D1hi,$D1hi 3454 3455 vpaddq $D1hi,$D2lo,$D2lo 3456 3457 vpsrlq \$42,$D2lo,$tmp 3458 vpsllq \$10,$D2hi,$D2hi 3459 vpandq $mask42,$D2lo,$RR2 3460 vpaddq $tmp,$D2hi,$D2hi 3461 3462 vpaddq $D2hi,$RR0,$RR0 3463 vpsllq \$2,$D2hi,$D2hi 3464 3465 vpaddq $D2hi,$RR0,$RR0 3466 3467 vpsrlq \$44,$RR0,$tmp # additional step 3468 vpandq $mask44,$RR0,$RR0 3469 3470 vpaddq $tmp,$RR1,$RR1 3471 3472 ################################################################ 3473 # At this point Rx holds 1324 powers, RRx - 5768, and the goal 3474 # is 15263748, which reflects how data is loaded... 3475 3476 vpunpcklqdq $R2,$RR2,$T2 # 3748 3477 vpunpckhqdq $R2,$RR2,$R2 # 1526 3478 vpunpcklqdq $R0,$RR0,$T0 3479 vpunpckhqdq $R0,$RR0,$R0 3480 vpunpcklqdq $R1,$RR1,$T1 3481 vpunpckhqdq $R1,$RR1,$R1 3482___ 3483######## switch to %zmm 3484map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3485map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3486map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3487map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3488 3489$code.=<<___; 3490 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3491 vshufi64x2 \$0x44,$R0,$T0,$RR0 3492 vshufi64x2 \$0x44,$R1,$T1,$RR1 3493 3494 vmovdqu64 16*0($inp),$T2 # load data 3495 vmovdqu64 16*4($inp),$T3 3496 lea 16*8($inp),$inp 3497 3498 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3499 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3500 vpaddq $RR2,$SS2,$SS2 3501 vpaddq $RR1,$SS1,$SS1 3502 vpsllq \$2,$SS2,$SS2 3503 vpsllq \$2,$SS1,$SS1 3504 3505 vpbroadcastq $padbit,$PAD 3506 vpbroadcastq %x#$mask44,$mask44 3507 vpbroadcastq %x#$mask42,$mask42 3508 3509 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3510 vpbroadcastq %x#$SS2,$S2 3511 vpbroadcastq %x#$RR0,$R0 3512 vpbroadcastq %x#$RR1,$R1 3513 vpbroadcastq %x#$RR2,$R2 3514 3515 vpunpcklqdq $T3,$T2,$T1 # transpose data 3516 vpunpckhqdq $T3,$T2,$T3 3517 3518 # at this point 64-bit lanes are ordered as 73625140 3519 3520 vpsrlq \$24,$T3,$T2 # splat the data 3521 vporq $PAD,$T2,$T2 3522 vpaddq $T2,$H2,$H2 # accumulate input 3523 vpandq $mask44,$T1,$T0 3524 vpsrlq \$44,$T1,$T1 3525 vpsllq \$20,$T3,$T3 3526 vporq $T3,$T1,$T1 3527 vpandq $mask44,$T1,$T1 3528 3529 sub \$8,$len 3530 jz .Ltail_vpmadd52_8x 3531 jmp .Loop_vpmadd52_8x 3532 3533.align 32 3534.Loop_vpmadd52_8x: 3535 #vpaddq $T2,$H2,$H2 # accumulate input 3536 vpaddq $T0,$H0,$H0 3537 vpaddq $T1,$H1,$H1 3538 3539 vpxorq $D0lo,$D0lo,$D0lo 3540 vpmadd52luq $H2,$S1,$D0lo 3541 vpxorq $D0hi,$D0hi,$D0hi 3542 vpmadd52huq $H2,$S1,$D0hi 3543 vpxorq $D1lo,$D1lo,$D1lo 3544 vpmadd52luq $H2,$S2,$D1lo 3545 vpxorq $D1hi,$D1hi,$D1hi 3546 vpmadd52huq $H2,$S2,$D1hi 3547 vpxorq $D2lo,$D2lo,$D2lo 3548 vpmadd52luq $H2,$R0,$D2lo 3549 vpxorq $D2hi,$D2hi,$D2hi 3550 vpmadd52huq $H2,$R0,$D2hi 3551 3552 vmovdqu64 16*0($inp),$T2 # load data 3553 vmovdqu64 16*4($inp),$T3 3554 lea 16*8($inp),$inp 3555 vpmadd52luq $H0,$R0,$D0lo 3556 vpmadd52huq $H0,$R0,$D0hi 3557 vpmadd52luq $H0,$R1,$D1lo 3558 vpmadd52huq $H0,$R1,$D1hi 3559 vpmadd52luq $H0,$R2,$D2lo 3560 vpmadd52huq $H0,$R2,$D2hi 3561 3562 vpunpcklqdq $T3,$T2,$T1 # transpose data 3563 vpunpckhqdq $T3,$T2,$T3 3564 vpmadd52luq $H1,$S2,$D0lo 3565 vpmadd52huq $H1,$S2,$D0hi 3566 vpmadd52luq $H1,$R0,$D1lo 3567 vpmadd52huq $H1,$R0,$D1hi 3568 vpmadd52luq $H1,$R1,$D2lo 3569 vpmadd52huq $H1,$R1,$D2hi 3570 3571 ################################################################ 3572 # partial reduction (interleaved with data splat) 3573 vpsrlq \$44,$D0lo,$tmp 3574 vpsllq \$8,$D0hi,$D0hi 3575 vpandq $mask44,$D0lo,$H0 3576 vpaddq $tmp,$D0hi,$D0hi 3577 3578 vpsrlq \$24,$T3,$T2 3579 vporq $PAD,$T2,$T2 3580 vpaddq $D0hi,$D1lo,$D1lo 3581 3582 vpsrlq \$44,$D1lo,$tmp 3583 vpsllq \$8,$D1hi,$D1hi 3584 vpandq $mask44,$D1lo,$H1 3585 vpaddq $tmp,$D1hi,$D1hi 3586 3587 vpandq $mask44,$T1,$T0 3588 vpsrlq \$44,$T1,$T1 3589 vpsllq \$20,$T3,$T3 3590 vpaddq $D1hi,$D2lo,$D2lo 3591 3592 vpsrlq \$42,$D2lo,$tmp 3593 vpsllq \$10,$D2hi,$D2hi 3594 vpandq $mask42,$D2lo,$H2 3595 vpaddq $tmp,$D2hi,$D2hi 3596 3597 vpaddq $T2,$H2,$H2 # accumulate input 3598 vpaddq $D2hi,$H0,$H0 3599 vpsllq \$2,$D2hi,$D2hi 3600 3601 vpaddq $D2hi,$H0,$H0 3602 vporq $T3,$T1,$T1 3603 vpandq $mask44,$T1,$T1 3604 3605 vpsrlq \$44,$H0,$tmp # additional step 3606 vpandq $mask44,$H0,$H0 3607 3608 vpaddq $tmp,$H1,$H1 3609 3610 sub \$8,$len # len-=128 3611 jnz .Loop_vpmadd52_8x 3612 3613.Ltail_vpmadd52_8x: 3614 #vpaddq $T2,$H2,$H2 # accumulate input 3615 vpaddq $T0,$H0,$H0 3616 vpaddq $T1,$H1,$H1 3617 3618 vpxorq $D0lo,$D0lo,$D0lo 3619 vpmadd52luq $H2,$SS1,$D0lo 3620 vpxorq $D0hi,$D0hi,$D0hi 3621 vpmadd52huq $H2,$SS1,$D0hi 3622 vpxorq $D1lo,$D1lo,$D1lo 3623 vpmadd52luq $H2,$SS2,$D1lo 3624 vpxorq $D1hi,$D1hi,$D1hi 3625 vpmadd52huq $H2,$SS2,$D1hi 3626 vpxorq $D2lo,$D2lo,$D2lo 3627 vpmadd52luq $H2,$RR0,$D2lo 3628 vpxorq $D2hi,$D2hi,$D2hi 3629 vpmadd52huq $H2,$RR0,$D2hi 3630 3631 vpmadd52luq $H0,$RR0,$D0lo 3632 vpmadd52huq $H0,$RR0,$D0hi 3633 vpmadd52luq $H0,$RR1,$D1lo 3634 vpmadd52huq $H0,$RR1,$D1hi 3635 vpmadd52luq $H0,$RR2,$D2lo 3636 vpmadd52huq $H0,$RR2,$D2hi 3637 3638 vpmadd52luq $H1,$SS2,$D0lo 3639 vpmadd52huq $H1,$SS2,$D0hi 3640 vpmadd52luq $H1,$RR0,$D1lo 3641 vpmadd52huq $H1,$RR0,$D1hi 3642 vpmadd52luq $H1,$RR1,$D2lo 3643 vpmadd52huq $H1,$RR1,$D2hi 3644 3645 ################################################################ 3646 # horizontal addition 3647 3648 mov \$1,%eax 3649 kmovw %eax,%k1 3650 vpsrldq \$8,$D0lo,$T0 3651 vpsrldq \$8,$D0hi,$H0 3652 vpsrldq \$8,$D1lo,$T1 3653 vpsrldq \$8,$D1hi,$H1 3654 vpaddq $T0,$D0lo,$D0lo 3655 vpaddq $H0,$D0hi,$D0hi 3656 vpsrldq \$8,$D2lo,$T2 3657 vpsrldq \$8,$D2hi,$H2 3658 vpaddq $T1,$D1lo,$D1lo 3659 vpaddq $H1,$D1hi,$D1hi 3660 vpermq \$0x2,$D0lo,$T0 3661 vpermq \$0x2,$D0hi,$H0 3662 vpaddq $T2,$D2lo,$D2lo 3663 vpaddq $H2,$D2hi,$D2hi 3664 3665 vpermq \$0x2,$D1lo,$T1 3666 vpermq \$0x2,$D1hi,$H1 3667 vpaddq $T0,$D0lo,$D0lo 3668 vpaddq $H0,$D0hi,$D0hi 3669 vpermq \$0x2,$D2lo,$T2 3670 vpermq \$0x2,$D2hi,$H2 3671 vpaddq $T1,$D1lo,$D1lo 3672 vpaddq $H1,$D1hi,$D1hi 3673 vextracti64x4 \$1,$D0lo,%y#$T0 3674 vextracti64x4 \$1,$D0hi,%y#$H0 3675 vpaddq $T2,$D2lo,$D2lo 3676 vpaddq $H2,$D2hi,$D2hi 3677 3678 vextracti64x4 \$1,$D1lo,%y#$T1 3679 vextracti64x4 \$1,$D1hi,%y#$H1 3680 vextracti64x4 \$1,$D2lo,%y#$T2 3681 vextracti64x4 \$1,$D2hi,%y#$H2 3682___ 3683######## switch back to %ymm 3684map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3685map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3686map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3687 3688$code.=<<___; 3689 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3690 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3691 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3692 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3693 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3694 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3695 3696 ################################################################ 3697 # partial reduction 3698 vpsrlq \$44,$D0lo,$tmp 3699 vpsllq \$8,$D0hi,$D0hi 3700 vpandq $mask44,$D0lo,$H0 3701 vpaddq $tmp,$D0hi,$D0hi 3702 3703 vpaddq $D0hi,$D1lo,$D1lo 3704 3705 vpsrlq \$44,$D1lo,$tmp 3706 vpsllq \$8,$D1hi,$D1hi 3707 vpandq $mask44,$D1lo,$H1 3708 vpaddq $tmp,$D1hi,$D1hi 3709 3710 vpaddq $D1hi,$D2lo,$D2lo 3711 3712 vpsrlq \$42,$D2lo,$tmp 3713 vpsllq \$10,$D2hi,$D2hi 3714 vpandq $mask42,$D2lo,$H2 3715 vpaddq $tmp,$D2hi,$D2hi 3716 3717 vpaddq $D2hi,$H0,$H0 3718 vpsllq \$2,$D2hi,$D2hi 3719 3720 vpaddq $D2hi,$H0,$H0 3721 3722 vpsrlq \$44,$H0,$tmp # additional step 3723 vpandq $mask44,$H0,$H0 3724 3725 vpaddq $tmp,$H1,$H1 3726 3727 ################################################################ 3728 3729 vmovq %x#$H0,0($ctx) 3730 vmovq %x#$H1,8($ctx) 3731 vmovq %x#$H2,16($ctx) 3732 vzeroall 3733 3734.Lno_data_vpmadd52_8x: 3735 ret 3736.cfi_endproc 3737.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3738___ 3739} 3740$code.=<<___; 3741.type poly1305_emit_base2_44,\@function,3 3742.align 32 3743poly1305_emit_base2_44: 3744.cfi_startproc 3745 endbranch 3746 mov 0($ctx),%r8 # load hash value 3747 mov 8($ctx),%r9 3748 mov 16($ctx),%r10 3749 3750 mov %r9,%rax 3751 shr \$20,%r9 3752 shl \$44,%rax 3753 mov %r10,%rcx 3754 shr \$40,%r10 3755 shl \$24,%rcx 3756 3757 add %rax,%r8 3758 adc %rcx,%r9 3759 adc \$0,%r10 3760 3761 mov %r8,%rax 3762 add \$5,%r8 # compare to modulus 3763 mov %r9,%rcx 3764 adc \$0,%r9 3765 adc \$0,%r10 3766 shr \$2,%r10 # did 130-bit value overflow? 3767 cmovnz %r8,%rax 3768 cmovnz %r9,%rcx 3769 3770 add 0($nonce),%rax # accumulate nonce 3771 adc 8($nonce),%rcx 3772 mov %rax,0($mac) # write result 3773 mov %rcx,8($mac) 3774 3775 ret 3776.cfi_endproc 3777.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 3778___ 3779} } } 3780$code.=<<___; 3781.align 64 3782.Lconst: 3783.Lmask24: 3784.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 3785.L129: 3786.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 3787.Lmask26: 3788.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 3789.Lpermd_avx2: 3790.long 2,2,2,3,2,0,2,1 3791.Lpermd_avx512: 3792.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 3793 3794.L2_44_inp_permd: 3795.long 0,1,1,2,2,3,7,7 3796.L2_44_inp_shift: 3797.quad 0,12,24,64 3798.L2_44_mask: 3799.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff 3800.L2_44_shift_rgt: 3801.quad 44,44,42,64 3802.L2_44_shift_lft: 3803.quad 8,8,10,64 3804 3805.align 64 3806.Lx_mask44: 3807.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3808.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3809.Lx_mask42: 3810.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3811.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3812___ 3813} 3814$code.=<<___; 3815.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3816.align 16 3817___ 3818 3819{ # chacha20-poly1305 helpers 3820my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 3821 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 3822$code.=<<___; 3823.globl xor128_encrypt_n_pad 3824.type xor128_encrypt_n_pad,\@abi-omnipotent 3825.align 16 3826xor128_encrypt_n_pad: 3827.cfi_startproc 3828 sub $otp,$inp 3829 sub $otp,$out 3830 mov $len,%r10 # put len aside 3831 shr \$4,$len # len / 16 3832 jz .Ltail_enc 3833 nop 3834.Loop_enc_xmm: 3835 movdqu ($inp,$otp),%xmm0 3836 pxor ($otp),%xmm0 3837 movdqu %xmm0,($out,$otp) 3838 movdqa %xmm0,($otp) 3839 lea 16($otp),$otp 3840 dec $len 3841 jnz .Loop_enc_xmm 3842 3843 and \$15,%r10 # len % 16 3844 jz .Ldone_enc 3845 3846.Ltail_enc: 3847 mov \$16,$len 3848 sub %r10,$len 3849 xor %eax,%eax 3850.Loop_enc_byte: 3851 mov ($inp,$otp),%al 3852 xor ($otp),%al 3853 mov %al,($out,$otp) 3854 mov %al,($otp) 3855 lea 1($otp),$otp 3856 dec %r10 3857 jnz .Loop_enc_byte 3858 3859 xor %eax,%eax 3860.Loop_enc_pad: 3861 mov %al,($otp) 3862 lea 1($otp),$otp 3863 dec $len 3864 jnz .Loop_enc_pad 3865 3866.Ldone_enc: 3867 mov $otp,%rax 3868 ret 3869.cfi_endproc 3870.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 3871 3872.globl xor128_decrypt_n_pad 3873.type xor128_decrypt_n_pad,\@abi-omnipotent 3874.align 16 3875xor128_decrypt_n_pad: 3876.cfi_startproc 3877 sub $otp,$inp 3878 sub $otp,$out 3879 mov $len,%r10 # put len aside 3880 shr \$4,$len # len / 16 3881 jz .Ltail_dec 3882 nop 3883.Loop_dec_xmm: 3884 movdqu ($inp,$otp),%xmm0 3885 movdqa ($otp),%xmm1 3886 pxor %xmm0,%xmm1 3887 movdqu %xmm1,($out,$otp) 3888 movdqa %xmm0,($otp) 3889 lea 16($otp),$otp 3890 dec $len 3891 jnz .Loop_dec_xmm 3892 3893 pxor %xmm1,%xmm1 3894 and \$15,%r10 # len % 16 3895 jz .Ldone_dec 3896 3897.Ltail_dec: 3898 mov \$16,$len 3899 sub %r10,$len 3900 xor %eax,%eax 3901 xor %r11,%r11 3902.Loop_dec_byte: 3903 mov ($inp,$otp),%r11b 3904 mov ($otp),%al 3905 xor %r11b,%al 3906 mov %al,($out,$otp) 3907 mov %r11b,($otp) 3908 lea 1($otp),$otp 3909 dec %r10 3910 jnz .Loop_dec_byte 3911 3912 xor %eax,%eax 3913.Loop_dec_pad: 3914 mov %al,($otp) 3915 lea 1($otp),$otp 3916 dec $len 3917 jnz .Loop_dec_pad 3918 3919.Ldone_dec: 3920 mov $otp,%rax 3921 ret 3922.cfi_endproc 3923.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 3924___ 3925} 3926 3927# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3928# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3929if ($win64) { 3930$rec="%rcx"; 3931$frame="%rdx"; 3932$context="%r8"; 3933$disp="%r9"; 3934 3935$code.=<<___; 3936.extern __imp_RtlVirtualUnwind 3937.type se_handler,\@abi-omnipotent 3938.align 16 3939se_handler: 3940 push %rsi 3941 push %rdi 3942 push %rbx 3943 push %rbp 3944 push %r12 3945 push %r13 3946 push %r14 3947 push %r15 3948 pushfq 3949 sub \$64,%rsp 3950 3951 mov 120($context),%rax # pull context->Rax 3952 mov 248($context),%rbx # pull context->Rip 3953 3954 mov 8($disp),%rsi # disp->ImageBase 3955 mov 56($disp),%r11 # disp->HandlerData 3956 3957 mov 0(%r11),%r10d # HandlerData[0] 3958 lea (%rsi,%r10),%r10 # prologue label 3959 cmp %r10,%rbx # context->Rip<.Lprologue 3960 jb .Lcommon_seh_tail 3961 3962 mov 152($context),%rax # pull context->Rsp 3963 3964 mov 4(%r11),%r10d # HandlerData[1] 3965 lea (%rsi,%r10),%r10 # epilogue label 3966 cmp %r10,%rbx # context->Rip>=.Lepilogue 3967 jae .Lcommon_seh_tail 3968 3969 lea 48(%rax),%rax 3970 3971 mov -8(%rax),%rbx 3972 mov -16(%rax),%rbp 3973 mov -24(%rax),%r12 3974 mov -32(%rax),%r13 3975 mov -40(%rax),%r14 3976 mov -48(%rax),%r15 3977 mov %rbx,144($context) # restore context->Rbx 3978 mov %rbp,160($context) # restore context->Rbp 3979 mov %r12,216($context) # restore context->R12 3980 mov %r13,224($context) # restore context->R13 3981 mov %r14,232($context) # restore context->R14 3982 mov %r15,240($context) # restore context->R14 3983 3984 jmp .Lcommon_seh_tail 3985.size se_handler,.-se_handler 3986 3987.type avx_handler,\@abi-omnipotent 3988.align 16 3989avx_handler: 3990 push %rsi 3991 push %rdi 3992 push %rbx 3993 push %rbp 3994 push %r12 3995 push %r13 3996 push %r14 3997 push %r15 3998 pushfq 3999 sub \$64,%rsp 4000 4001 mov 120($context),%rax # pull context->Rax 4002 mov 248($context),%rbx # pull context->Rip 4003 4004 mov 8($disp),%rsi # disp->ImageBase 4005 mov 56($disp),%r11 # disp->HandlerData 4006 4007 mov 0(%r11),%r10d # HandlerData[0] 4008 lea (%rsi,%r10),%r10 # prologue label 4009 cmp %r10,%rbx # context->Rip<prologue label 4010 jb .Lcommon_seh_tail 4011 4012 mov 152($context),%rax # pull context->Rsp 4013 4014 mov 4(%r11),%r10d # HandlerData[1] 4015 lea (%rsi,%r10),%r10 # epilogue label 4016 cmp %r10,%rbx # context->Rip>=epilogue label 4017 jae .Lcommon_seh_tail 4018 4019 mov 208($context),%rax # pull context->R11 4020 4021 lea 0x50(%rax),%rsi 4022 lea 0xf8(%rax),%rax 4023 lea 512($context),%rdi # &context.Xmm6 4024 mov \$20,%ecx 4025 .long 0xa548f3fc # cld; rep movsq 4026 4027.Lcommon_seh_tail: 4028 mov 8(%rax),%rdi 4029 mov 16(%rax),%rsi 4030 mov %rax,152($context) # restore context->Rsp 4031 mov %rsi,168($context) # restore context->Rsi 4032 mov %rdi,176($context) # restore context->Rdi 4033 4034 mov 40($disp),%rdi # disp->ContextRecord 4035 mov $context,%rsi # context 4036 mov \$154,%ecx # sizeof(CONTEXT) 4037 .long 0xa548f3fc # cld; rep movsq 4038 4039 mov $disp,%rsi 4040 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4041 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4042 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4043 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4044 mov 40(%rsi),%r10 # disp->ContextRecord 4045 lea 56(%rsi),%r11 # &disp->HandlerData 4046 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4047 mov %r10,32(%rsp) # arg5 4048 mov %r11,40(%rsp) # arg6 4049 mov %r12,48(%rsp) # arg7 4050 mov %rcx,56(%rsp) # arg8, (NULL) 4051 call *__imp_RtlVirtualUnwind(%rip) 4052 4053 mov \$1,%eax # ExceptionContinueSearch 4054 add \$64,%rsp 4055 popfq 4056 pop %r15 4057 pop %r14 4058 pop %r13 4059 pop %r12 4060 pop %rbp 4061 pop %rbx 4062 pop %rdi 4063 pop %rsi 4064 ret 4065.size avx_handler,.-avx_handler 4066 4067.section .pdata 4068.align 4 4069 .rva .LSEH_begin_poly1305_init 4070 .rva .LSEH_end_poly1305_init 4071 .rva .LSEH_info_poly1305_init 4072 4073 .rva .LSEH_begin_poly1305_blocks 4074 .rva .LSEH_end_poly1305_blocks 4075 .rva .LSEH_info_poly1305_blocks 4076 4077 .rva .LSEH_begin_poly1305_emit 4078 .rva .LSEH_end_poly1305_emit 4079 .rva .LSEH_info_poly1305_emit 4080___ 4081$code.=<<___ if ($avx); 4082 .rva .LSEH_begin_poly1305_blocks_avx 4083 .rva .Lbase2_64_avx 4084 .rva .LSEH_info_poly1305_blocks_avx_1 4085 4086 .rva .Lbase2_64_avx 4087 .rva .Leven_avx 4088 .rva .LSEH_info_poly1305_blocks_avx_2 4089 4090 .rva .Leven_avx 4091 .rva .LSEH_end_poly1305_blocks_avx 4092 .rva .LSEH_info_poly1305_blocks_avx_3 4093 4094 .rva .LSEH_begin_poly1305_emit_avx 4095 .rva .LSEH_end_poly1305_emit_avx 4096 .rva .LSEH_info_poly1305_emit_avx 4097___ 4098$code.=<<___ if ($avx>1); 4099 .rva .LSEH_begin_poly1305_blocks_avx2 4100 .rva .Lbase2_64_avx2 4101 .rva .LSEH_info_poly1305_blocks_avx2_1 4102 4103 .rva .Lbase2_64_avx2 4104 .rva .Leven_avx2 4105 .rva .LSEH_info_poly1305_blocks_avx2_2 4106 4107 .rva .Leven_avx2 4108 .rva .LSEH_end_poly1305_blocks_avx2 4109 .rva .LSEH_info_poly1305_blocks_avx2_3 4110___ 4111$code.=<<___ if ($avx>2); 4112 .rva .LSEH_begin_poly1305_blocks_avx512 4113 .rva .LSEH_end_poly1305_blocks_avx512 4114 .rva .LSEH_info_poly1305_blocks_avx512 4115___ 4116$code.=<<___; 4117.section .xdata 4118.align 8 4119.LSEH_info_poly1305_init: 4120 .byte 9,0,0,0 4121 .rva se_handler 4122 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init 4123 4124.LSEH_info_poly1305_blocks: 4125 .byte 9,0,0,0 4126 .rva se_handler 4127 .rva .Lblocks_body,.Lblocks_epilogue 4128 4129.LSEH_info_poly1305_emit: 4130 .byte 9,0,0,0 4131 .rva se_handler 4132 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit 4133___ 4134$code.=<<___ if ($avx); 4135.LSEH_info_poly1305_blocks_avx_1: 4136 .byte 9,0,0,0 4137 .rva se_handler 4138 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] 4139 4140.LSEH_info_poly1305_blocks_avx_2: 4141 .byte 9,0,0,0 4142 .rva se_handler 4143 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] 4144 4145.LSEH_info_poly1305_blocks_avx_3: 4146 .byte 9,0,0,0 4147 .rva avx_handler 4148 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] 4149 4150.LSEH_info_poly1305_emit_avx: 4151 .byte 9,0,0,0 4152 .rva se_handler 4153 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 4154___ 4155$code.=<<___ if ($avx>1); 4156.LSEH_info_poly1305_blocks_avx2_1: 4157 .byte 9,0,0,0 4158 .rva se_handler 4159 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 4160 4161.LSEH_info_poly1305_blocks_avx2_2: 4162 .byte 9,0,0,0 4163 .rva se_handler 4164 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 4165 4166.LSEH_info_poly1305_blocks_avx2_3: 4167 .byte 9,0,0,0 4168 .rva avx_handler 4169 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 4170___ 4171$code.=<<___ if ($avx>2); 4172.LSEH_info_poly1305_blocks_avx512: 4173 .byte 9,0,0,0 4174 .rva avx_handler 4175 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 4176___ 4177} 4178 4179foreach (split('\n',$code)) { 4180 s/\`([^\`]*)\`/eval($1)/ge; 4181 s/%r([a-z]+)#d/%e$1/g; 4182 s/%r([0-9]+)#d/%r$1d/g; 4183 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 4184 4185 print $_,"\n"; 4186} 4187close STDOUT or die "error closing STDOUT: $!"; 4188