#! /usr/bin/env perl
# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# December 2017
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of a large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42	    1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31	    1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40	    2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is the code path optimized specifically for the 128 bytes
#	used by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes the 4xXOP code path, which delivers
#	2.20 and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever is best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in single thread, the corresponding capability is suppressed;
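#
# Typical perlasm invocation (an assumed example, matching the argument
# parsing below):  perl chacha-x86_64.pl elf chacha-x86_64.S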
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider the order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of 'c's
	# is invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see a
	# bunch of 'c' stores and loads in the middle, but none at
	# the beginning or end.
	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are a dying breed, and old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...

	(
	"&add	(@x[$a0],@x[$b0])",	# Q1
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],16)",
	"&add	(@x[$a1],@x[$b1])",	# Q2
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],16)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],12)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],12)",

	"&add	(@x[$a0],@x[$b0])",
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],8)",
	"&add	(@x[$a1],@x[$b1])",
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],8)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],7)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],7)",

	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
	"&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	"&mov	($xc_,\"4*$c3(%rsp)\")",

	"&add	(@x[$a2],@x[$b2])",	# Q3
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],16)",
	"&add	(@x[$a3],@x[$b3])",	# Q4
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],16)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],12)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],12)",

	"&add	(@x[$a2],@x[$b2])",
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],8)",
	"&add	(@x[$a3],@x[$b3])",
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],8)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],7)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],7)"
	);
}
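# For reference, a plain-Perl model of the quarter-round that each column of
# ROUND above computes on 32-bit words (an illustrative helper following the
# usual ChaCha definition; it is not called anywhere by this generator):
sub quarter_round_ref {
my ($a,$b,$c,$d)=@_;
my $rotl = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n)))&0xffffffff };
	$a=($a+$b)&0xffffffff;	$d=$rotl->($d^$a,16);
	$c=($c+$d)&0xffffffff;	$b=$rotl->($b^$c,12);
	$a=($a+$b)&0xffffffff;	$d=$rotl->($d^$a,8);
	$c=($c+$d)&0xffffffff;	$b=$rotl->($b^$c,7);
	($a,$b,$c,$d);
}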
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
___
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]	# 'expa'
	mov	\$0x3320646e,@x[1]	# 'nd 3'
	mov	\$0x79622d32,@x[2]	# '2-by'
	mov	\$0x6b206574,@x[3]	# 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND	(0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]	# 'expa'
	add	\$0x3320646e,@x[1]	# 'nd 3'
	add	\$0x79622d32,@x[2]	# '2-by'
	add	\$0x6b206574,@x[3]	# 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone
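	# Fewer than 64 bytes remain: stash the final keystream block on
	# the stack and xor it in byte by byte (see .Loop_tail below).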
.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

########################################################################
# SSSE3 code path that handles shorter lengths
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}

my $xframe = $win64 ? 160+8 : 8;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
___
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}

########################################################################
# SSSE3 code path that handles 128-byte inputs
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	&paddd	($a1,$b1);
	&pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	&pshufb	($d1,$rot16);

	&paddd	($c,$d);
	&paddd	($c1,$d1);
	&pxor	($b,$c);
	&pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&movdqa	($t1,$b1);
	&pslld	($t,12);
	&psrld	($b1,20);
	&por	($b,$t);
	&pslld	($t1,12);
	&por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&paddd	($a1,$b1);
	&pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	&pshufb	($d1,$rot24);

	&paddd	($c,$d);
	&paddd	($c1,$d1);
	&pxor	($b,$c);
	&pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&movdqa	($t1,$b1);
	&pslld	($t,7);
	&psrld	($b1,25);
	&por	($b,$t);
	&pslld	($t1,7);
	&por	($b1,$t1);
}
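# Note that SSSE3ROUND_2x interleaves two independent block computations,
# so adjacent instructions carry no data dependency on each other and the
# latency of the add-xor-rotate chain is hidden; this is what makes the
# dedicated 128-byte path pay off for chacha20_poly1305_tls_cipher-sized
# inputs.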
my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}

########################################################################
# SSSE3 code path that handles longer messages.
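# The 4x path keeps the state "transposed": each XMM register holds the
# same 32-bit state word from four consecutive blocks, with the four block
# counters differing by .Linc's 0,1,2,3. All four blocks then advance under
# a single instruction stream, and the results are transposed back before
# being xored against the input.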
778{ 779# assign variables to favor Atom front-end 780my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, 781 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); 782my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 783 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 784 785sub SSSE3_lane_ROUND { 786my ($a0,$b0,$c0,$d0)=@_; 787my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 788my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 789my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 790my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 791my @x=map("\"$_\"",@xx); 792 793 # Consider order in which variables are addressed by their 794 # index: 795 # 796 # a b c d 797 # 798 # 0 4 8 12 < even round 799 # 1 5 9 13 800 # 2 6 10 14 801 # 3 7 11 15 802 # 0 5 10 15 < odd round 803 # 1 6 11 12 804 # 2 7 8 13 805 # 3 4 9 14 806 # 807 # 'a', 'b' and 'd's are permanently allocated in registers, 808 # @x[0..7,12..15], while 'c's are maintained in memory. If 809 # you observe 'c' column, you'll notice that pair of 'c's is 810 # invariant between rounds. This means that we have to reload 811 # them once per round, in the middle. This is why you'll see 812 # bunch of 'c' stores and loads in the middle, but none in 813 # the beginning or end. 814 815 ( 816 "&paddd (@x[$a0],@x[$b0])", # Q1 817 "&paddd (@x[$a1],@x[$b1])", # Q2 818 "&pxor (@x[$d0],@x[$a0])", 819 "&pxor (@x[$d1],@x[$a1])", 820 "&pshufb (@x[$d0],$t1)", 821 "&pshufb (@x[$d1],$t1)", 822 823 "&paddd ($xc,@x[$d0])", 824 "&paddd ($xc_,@x[$d1])", 825 "&pxor (@x[$b0],$xc)", 826 "&pxor (@x[$b1],$xc_)", 827 "&movdqa ($t0,@x[$b0])", 828 "&pslld (@x[$b0],12)", 829 "&psrld ($t0,20)", 830 "&movdqa ($t1,@x[$b1])", 831 "&pslld (@x[$b1],12)", 832 "&por (@x[$b0],$t0)", 833 "&psrld ($t1,20)", 834 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 835 "&por (@x[$b1],$t1)", 836 837 "&paddd (@x[$a0],@x[$b0])", 838 "&paddd (@x[$a1],@x[$b1])", 839 "&pxor (@x[$d0],@x[$a0])", 840 "&pxor (@x[$d1],@x[$a1])", 841 "&pshufb (@x[$d0],$t0)", 842 "&pshufb (@x[$d1],$t0)", 843 844 "&paddd ($xc,@x[$d0])", 845 "&paddd ($xc_,@x[$d1])", 846 "&pxor (@x[$b0],$xc)", 847 "&pxor (@x[$b1],$xc_)", 848 "&movdqa ($t1,@x[$b0])", 849 "&pslld (@x[$b0],7)", 850 "&psrld ($t1,25)", 851 "&movdqa ($t0,@x[$b1])", 852 "&pslld (@x[$b1],7)", 853 "&por (@x[$b0],$t1)", 854 "&psrld ($t0,25)", 855 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 856 "&por (@x[$b1],$t0)", 857 858 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 859 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", 860 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", 861 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", 862 863 "&paddd (@x[$a2],@x[$b2])", # Q3 864 "&paddd (@x[$a3],@x[$b3])", # Q4 865 "&pxor (@x[$d2],@x[$a2])", 866 "&pxor (@x[$d3],@x[$a3])", 867 "&pshufb (@x[$d2],$t1)", 868 "&pshufb (@x[$d3],$t1)", 869 870 "&paddd ($xc,@x[$d2])", 871 "&paddd ($xc_,@x[$d3])", 872 "&pxor (@x[$b2],$xc)", 873 "&pxor (@x[$b3],$xc_)", 874 "&movdqa ($t0,@x[$b2])", 875 "&pslld (@x[$b2],12)", 876 "&psrld ($t0,20)", 877 "&movdqa ($t1,@x[$b3])", 878 "&pslld (@x[$b3],12)", 879 "&por (@x[$b2],$t0)", 880 "&psrld ($t1,20)", 881 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 882 "&por (@x[$b3],$t1)", 883 884 "&paddd (@x[$a2],@x[$b2])", 885 "&paddd (@x[$a3],@x[$b3])", 886 "&pxor (@x[$d2],@x[$a2])", 887 "&pxor (@x[$d3],@x[$a3])", 888 "&pshufb (@x[$d2],$t0)", 889 "&pshufb (@x[$d3],$t0)", 890 891 "&paddd ($xc,@x[$d2])", 892 "&paddd ($xc_,@x[$d3])", 893 "&pxor (@x[$b2],$xc)", 894 "&pxor (@x[$b3],$xc_)", 895 "&movdqa ($t1,@x[$b2])", 896 "&pslld 
(@x[$b2],7)", 897 "&psrld ($t1,25)", 898 "&movdqa ($t0,@x[$b3])", 899 "&pslld (@x[$b3],7)", 900 "&por (@x[$b2],$t1)", 901 "&psrld ($t0,25)", 902 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 903 "&por (@x[$b3],$t0)" 904 ); 905} 906 907my $xframe = $win64 ? 0xa8 : 8; 908 909$code.=<<___; 910.type ChaCha20_4x,\@function,5 911.align 32 912ChaCha20_4x: 913.cfi_startproc 914.LChaCha20_4x: 915 mov %rsp,%r9 # frame pointer 916.cfi_def_cfa_register %r9 917 mov %r10,%r11 918___ 919$code.=<<___ if ($avx>1); 920 shr \$32,%r10 # OPENSSL_ia32cap_P+8 921 test \$`1<<5`,%r10 # test AVX2 922 jnz .LChaCha20_8x 923___ 924$code.=<<___; 925 cmp \$192,$len 926 ja .Lproceed4x 927 928 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE 929 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE 930 je .Ldo_sse3_after_all # to detect Atom 931 932.Lproceed4x: 933 sub \$0x140+$xframe,%rsp 934___ 935 ################ stack layout 936 # +0x00 SIMD equivalent of @x[8-12] 937 # ... 938 # +0x40 constant copy of key[0-2] smashed by lanes 939 # ... 940 # +0x100 SIMD counters (with nonce smashed by lanes) 941 # ... 942 # +0x140 943$code.=<<___ if ($win64); 944 movaps %xmm6,-0xa8(%r9) 945 movaps %xmm7,-0x98(%r9) 946 movaps %xmm8,-0x88(%r9) 947 movaps %xmm9,-0x78(%r9) 948 movaps %xmm10,-0x68(%r9) 949 movaps %xmm11,-0x58(%r9) 950 movaps %xmm12,-0x48(%r9) 951 movaps %xmm13,-0x38(%r9) 952 movaps %xmm14,-0x28(%r9) 953 movaps %xmm15,-0x18(%r9) 954.L4x_body: 955___ 956$code.=<<___; 957 movdqa .Lsigma(%rip),$xa3 # key[0] 958 movdqu ($key),$xb3 # key[1] 959 movdqu 16($key),$xt3 # key[2] 960 movdqu ($counter),$xd3 # key[3] 961 lea 0x100(%rsp),%rcx # size optimization 962 lea .Lrot16(%rip),%r10 963 lea .Lrot24(%rip),%r11 964 965 pshufd \$0x00,$xa3,$xa0 # smash key by lanes... 966 pshufd \$0x55,$xa3,$xa1 967 movdqa $xa0,0x40(%rsp) # ... 
	pshufd	\$0xaa,$xa3,$xa2
	movdqa	$xa1,0x50(%rsp)
	pshufd	\$0xff,$xa3,$xa3
	movdqa	$xa2,0x60(%rsp)
	movdqa	$xa3,0x70(%rsp)

	pshufd	\$0x00,$xb3,$xb0
	pshufd	\$0x55,$xb3,$xb1
	movdqa	$xb0,0x80-0x100(%rcx)
	pshufd	\$0xaa,$xb3,$xb2
	movdqa	$xb1,0x90-0x100(%rcx)
	pshufd	\$0xff,$xb3,$xb3
	movdqa	$xb2,0xa0-0x100(%rcx)
	movdqa	$xb3,0xb0-0x100(%rcx)

	pshufd	\$0x00,$xt3,$xt0	# "$xc0"
	pshufd	\$0x55,$xt3,$xt1	# "$xc1"
	movdqa	$xt0,0xc0-0x100(%rcx)
	pshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa	$xt1,0xd0-0x100(%rcx)
	pshufd	\$0xff,$xt3,$xt3	# "$xc3"
	movdqa	$xt2,0xe0-0x100(%rcx)
	movdqa	$xt3,0xf0-0x100(%rcx)

	pshufd	\$0x00,$xd3,$xd0
	pshufd	\$0x55,$xd3,$xd1
	paddd	.Linc(%rip),$xd0	# don't save counters yet
	pshufd	\$0xaa,$xd3,$xd2
	movdqa	$xd1,0x110-0x100(%rcx)
	pshufd	\$0xff,$xd3,$xd3
	movdqa	$xd2,0x120-0x100(%rcx)
	movdqa	$xd3,0x130-0x100(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	0x40(%rsp),$xa0		# re-load smashed key
	movdqa	0x50(%rsp),$xa1
	movdqa	0x60(%rsp),$xa2
	movdqa	0x70(%rsp),$xa3
	movdqa	0x80-0x100(%rcx),$xb0
	movdqa	0x90-0x100(%rcx),$xb1
	movdqa	0xa0-0x100(%rcx),$xb2
	movdqa	0xb0-0x100(%rcx),$xb3
	movdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa	0x100-0x100(%rcx),$xd0
	movdqa	0x110-0x100(%rcx),$xd1
	movdqa	0x120-0x100(%rcx),$xd2
	movdqa	0x130-0x100(%rcx),$xd3
	paddd	.Lfour(%rip),$xd0	# next SIMD counters

.Loop_enter4x:
	movdqa	$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa	$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa	(%r10),$xt3		# .Lrot16(%rip)
	mov	\$10,%eax
	movdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp	.Loop4x

.align	32
.Loop4x:
___
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4x

	paddd	0x40(%rsp),$xa0		# accumulate key material
	paddd	0x50(%rsp),$xa1
	paddd	0x60(%rsp),$xa2
	paddd	0x70(%rsp),$xa3

	movdqa	$xa0,$xt2		# "de-interlace" data
	punpckldq	$xa1,$xa0
	movdqa	$xa2,$xt3
	punpckldq	$xa3,$xa2
	punpckhdq	$xa1,$xt2
	punpckhdq	$xa3,$xt3
	movdqa	$xa0,$xa1
	punpcklqdq	$xa2,$xa0		# "a0"
	movdqa	$xt2,$xa3
	punpcklqdq	$xt3,$xt2		# "a2"
	punpckhqdq	$xa2,$xa1		# "a1"
	punpckhqdq	$xt3,$xa3		# "a3"
___
	($xa2,$xt2)=($xt2,$xa2);
$code.=<<___;
	paddd	0x80-0x100(%rcx),$xb0
	paddd	0x90-0x100(%rcx),$xb1
	paddd	0xa0-0x100(%rcx),$xb2
	paddd	0xb0-0x100(%rcx),$xb3

	movdqa	$xa0,0x00(%rsp)		# offload $xaN
	movdqa	$xa1,0x10(%rsp)
	movdqa	0x20(%rsp),$xa0		# "xc2"
	movdqa	0x30(%rsp),$xa1		# "xc3"

	movdqa	$xb0,$xt2
	punpckldq	$xb1,$xb0
	movdqa	$xb2,$xt3
	punpckldq	$xb3,$xb2
	punpckhdq	$xb1,$xt2
	punpckhdq	$xb3,$xt3
	movdqa	$xb0,$xb1
	punpcklqdq	$xb2,$xb0		# "b0"
	movdqa	$xt2,$xb3
	punpcklqdq	$xt3,$xt2		# "b2"
	punpckhqdq	$xb2,$xb1		# "b1"
	punpckhqdq	$xt3,$xb3		# "b3"
___
	($xb2,$xt2)=($xt2,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
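# Each punpckldq/punpckhdq/punpcklqdq/punpckhqdq group above and below is
# a 4x4 transpose of 32-bit words: registers that held one state word from
# each of blocks 0-3 are rearranged into registers holding four consecutive
# state words of a single block, so whole 64-byte blocks can be xored
# against the input with straight loads and stores.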
$code.=<<___;
	paddd	0xc0-0x100(%rcx),$xc0
	paddd	0xd0-0x100(%rcx),$xc1
	paddd	0xe0-0x100(%rcx),$xc2
	paddd	0xf0-0x100(%rcx),$xc3

	movdqa	$xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa	$xa3,0x30(%rsp)

	movdqa	$xc0,$xt2
	punpckldq	$xc1,$xc0
	movdqa	$xc2,$xt3
	punpckldq	$xc3,$xc2
	punpckhdq	$xc1,$xt2
	punpckhdq	$xc3,$xt3
	movdqa	$xc0,$xc1
	punpcklqdq	$xc2,$xc0		# "c0"
	movdqa	$xt2,$xc3
	punpcklqdq	$xt3,$xt2		# "c2"
	punpckhqdq	$xc2,$xc1		# "c1"
	punpckhqdq	$xt3,$xc3		# "c3"
___
	($xc2,$xt2)=($xt2,$xc2);
	($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
$code.=<<___;
	paddd	0x100-0x100(%rcx),$xd0
	paddd	0x110-0x100(%rcx),$xd1
	paddd	0x120-0x100(%rcx),$xd2
	paddd	0x130-0x100(%rcx),$xd3

	movdqa	$xd0,$xt2
	punpckldq	$xd1,$xd0
	movdqa	$xd2,$xt3
	punpckldq	$xd3,$xd2
	punpckhdq	$xd1,$xt2
	punpckhdq	$xd3,$xt3
	movdqa	$xd0,$xd1
	punpcklqdq	$xd2,$xd0		# "d0"
	movdqa	$xt2,$xd3
	punpcklqdq	$xt3,$xt2		# "d2"
	punpckhqdq	$xd2,$xd1		# "d1"
	punpckhqdq	$xt3,$xd3		# "d3"
___
	($xd2,$xt2)=($xt2,$xd2);
$code.=<<___;
	cmp	\$64*4,$len
	jb	.Ltail4x

	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3

	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# inp+=64*4
	pxor	0x30(%rsp),$xt0
	pxor	$xb3,$xt1
	pxor	$xc3,$xt2
	pxor	$xd3,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# out+=64*4

	sub	\$64*4,$len
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmp	\$192,$len
	jae	.L192_or_more4x
	cmp	\$128,$len
	jae	.L128_or_more4x
	cmp	\$64,$len
	jae	.L64_or_more4x

	#movdqa	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	xor	%r10,%r10
	#movdqa	$xt0,0x00(%rsp)
	movdqa	$xb0,0x10(%rsp)
	movdqa	$xc0,0x20(%rsp)
	movdqa	$xd0,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x

	movdqa	0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*1
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	movdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	movdqa	$xd1,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	je	.Ldone4x

	movdqa	0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x80($inp),$inp		# inp+=64*2
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	movdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	movdqa	$xd2,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3

	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x

	movdqa	0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*3
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	movdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	movdqa	$xd3,0x30(%rsp)
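	# The last (partial) block of keystream now sits at 0x00-0x30(%rsp);
	# xor it into the remaining (at most 63) bytes one byte at a time.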
.Loop_tail4x:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail4x

.Ldone4x:
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
___
}

########################################################################
# XOP code path that handles all lengths.
if ($avx) {
# There is some "anomaly" observed depending on instructions' size or
# alignment. If you look closely at the code below you'll notice that
# sometimes the argument order varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5% performance
# improvement. This is on FX-4100...

my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);

sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);

	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],16)",
	"&vprotd	(@x[$d1],@x[$d1],16)",
	"&vprotd	(@x[$d2],@x[$d2],16)",
	"&vprotd	(@x[$d3],@x[$d3],16)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],12)",
	"&vprotd	(@x[$b1],@x[$b1],12)",
	"&vprotd	(@x[$b2],@x[$b2],12)",
	"&vprotd	(@x[$b3],@x[$b3],12)",

	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],8)",
	"&vprotd	(@x[$d1],@x[$d1],8)",
	"&vprotd	(@x[$d2],@x[$d2],8)",
	"&vprotd	(@x[$d3],@x[$d3],8)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
(@x[$c2],@x[$c2],@x[$d2])", 1428 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 1429 "&vpxor (@x[$b0],@x[$c0],@x[$b0])", 1430 "&vpxor (@x[$b1],@x[$c1],@x[$b1])", 1431 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip 1432 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip 1433 "&vprotd (@x[$b0],@x[$b0],7)", 1434 "&vprotd (@x[$b1],@x[$b1],7)", 1435 "&vprotd (@x[$b2],@x[$b2],7)", 1436 "&vprotd (@x[$b3],@x[$b3],7)" 1437 ); 1438} 1439 1440my $xframe = $win64 ? 0xa8 : 8; 1441 1442$code.=<<___; 1443.type ChaCha20_4xop,\@function,5 1444.align 32 1445ChaCha20_4xop: 1446.cfi_startproc 1447.LChaCha20_4xop: 1448 mov %rsp,%r9 # frame pointer 1449.cfi_def_cfa_register %r9 1450 sub \$0x140+$xframe,%rsp 1451___ 1452 ################ stack layout 1453 # +0x00 SIMD equivalent of @x[8-12] 1454 # ... 1455 # +0x40 constant copy of key[0-2] smashed by lanes 1456 # ... 1457 # +0x100 SIMD counters (with nonce smashed by lanes) 1458 # ... 1459 # +0x140 1460$code.=<<___ if ($win64); 1461 movaps %xmm6,-0xa8(%r9) 1462 movaps %xmm7,-0x98(%r9) 1463 movaps %xmm8,-0x88(%r9) 1464 movaps %xmm9,-0x78(%r9) 1465 movaps %xmm10,-0x68(%r9) 1466 movaps %xmm11,-0x58(%r9) 1467 movaps %xmm12,-0x48(%r9) 1468 movaps %xmm13,-0x38(%r9) 1469 movaps %xmm14,-0x28(%r9) 1470 movaps %xmm15,-0x18(%r9) 1471.L4xop_body: 1472___ 1473$code.=<<___; 1474 vzeroupper 1475 1476 vmovdqa .Lsigma(%rip),$xa3 # key[0] 1477 vmovdqu ($key),$xb3 # key[1] 1478 vmovdqu 16($key),$xt3 # key[2] 1479 vmovdqu ($counter),$xd3 # key[3] 1480 lea 0x100(%rsp),%rcx # size optimization 1481 1482 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1483 vpshufd \$0x55,$xa3,$xa1 1484 vmovdqa $xa0,0x40(%rsp) # ... and offload 1485 vpshufd \$0xaa,$xa3,$xa2 1486 vmovdqa $xa1,0x50(%rsp) 1487 vpshufd \$0xff,$xa3,$xa3 1488 vmovdqa $xa2,0x60(%rsp) 1489 vmovdqa $xa3,0x70(%rsp) 1490 1491 vpshufd \$0x00,$xb3,$xb0 1492 vpshufd \$0x55,$xb3,$xb1 1493 vmovdqa $xb0,0x80-0x100(%rcx) 1494 vpshufd \$0xaa,$xb3,$xb2 1495 vmovdqa $xb1,0x90-0x100(%rcx) 1496 vpshufd \$0xff,$xb3,$xb3 1497 vmovdqa $xb2,0xa0-0x100(%rcx) 1498 vmovdqa $xb3,0xb0-0x100(%rcx) 1499 1500 vpshufd \$0x00,$xt3,$xt0 # "$xc0" 1501 vpshufd \$0x55,$xt3,$xt1 # "$xc1" 1502 vmovdqa $xt0,0xc0-0x100(%rcx) 1503 vpshufd \$0xaa,$xt3,$xt2 # "$xc2" 1504 vmovdqa $xt1,0xd0-0x100(%rcx) 1505 vpshufd \$0xff,$xt3,$xt3 # "$xc3" 1506 vmovdqa $xt2,0xe0-0x100(%rcx) 1507 vmovdqa $xt3,0xf0-0x100(%rcx) 1508 1509 vpshufd \$0x00,$xd3,$xd0 1510 vpshufd \$0x55,$xd3,$xd1 1511 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet 1512 vpshufd \$0xaa,$xd3,$xd2 1513 vmovdqa $xd1,0x110-0x100(%rcx) 1514 vpshufd \$0xff,$xd3,$xd3 1515 vmovdqa $xd2,0x120-0x100(%rcx) 1516 vmovdqa $xd3,0x130-0x100(%rcx) 1517 1518 jmp .Loop_enter4xop 1519 1520.align 32 1521.Loop_outer4xop: 1522 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key 1523 vmovdqa 0x50(%rsp),$xa1 1524 vmovdqa 0x60(%rsp),$xa2 1525 vmovdqa 0x70(%rsp),$xa3 1526 vmovdqa 0x80-0x100(%rcx),$xb0 1527 vmovdqa 0x90-0x100(%rcx),$xb1 1528 vmovdqa 0xa0-0x100(%rcx),$xb2 1529 vmovdqa 0xb0-0x100(%rcx),$xb3 1530 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 1531 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 1532 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 1533 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 1534 vmovdqa 0x100-0x100(%rcx),$xd0 1535 vmovdqa 0x110-0x100(%rcx),$xd1 1536 vmovdqa 0x120-0x100(%rcx),$xd2 1537 vmovdqa 0x130-0x100(%rcx),$xd3 1538 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters 1539 1540.Loop_enter4xop: 1541 mov \$10,%eax 1542 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 1543 jmp .Loop4xop 1544 1545.align 32 1546.Loop4xop: 1547___ 1548 foreach 
	foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4xop

	vpaddd	0x40(%rsp),$xa0,$xa0	# accumulate key material
	vpaddd	0x50(%rsp),$xa1,$xa1
	vpaddd	0x60(%rsp),$xa2,$xa2
	vpaddd	0x70(%rsp),$xa3,$xa3

	vmovdqa	$xt2,0x20(%rsp)		# offload $xc2,3
	vmovdqa	$xt3,0x30(%rsp)

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd	0x80-0x100(%rcx),$xb0,$xb0
	vpaddd	0x90-0x100(%rcx),$xb1,$xb1
	vpaddd	0xa0-0x100(%rcx),$xb2,$xb2
	vpaddd	0xb0-0x100(%rcx),$xb3,$xb3

	vmovdqa	$xa0,0x00(%rsp)		# offload $xa0,1
	vmovdqa	$xa1,0x10(%rsp)
	vmovdqa	0x20(%rsp),$xa0		# "xc2"
	vmovdqa	0x30(%rsp),$xa1		# "xc3"

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	vpaddd	0xc0-0x100(%rcx),$xc0,$xc0
	vpaddd	0xd0-0x100(%rcx),$xc1,$xc1
	vpaddd	0xe0-0x100(%rcx),$xc2,$xc2
	vpaddd	0xf0-0x100(%rcx),$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd	0x100-0x100(%rcx),$xd0,$xd0
	vpaddd	0x110-0x100(%rcx),$xd1,$xd1
	vpaddd	0x120-0x100(%rcx),$xd2,$xd2
	vpaddd	0x130-0x100(%rcx),$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
	($xa0,$xa1)=($xt2,$xt3);
$code.=<<___;
	vmovdqa	0x00(%rsp),$xa0		# restore $xa0,1
	vmovdqa	0x10(%rsp),$xa1

	cmp	\$64*4,$len
	jb	.Ltail4xop

	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2
	vpxor	0x40($inp),$xa3,$xa3
	vpxor	0x50($inp),$xb3,$xb3
	vpxor	0x60($inp),$xc3,$xc3
	vpxor	0x70($inp),$xd3,$xd3
	lea	0x80($inp),$inp		# inp+=64*4

	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
	vmovdqu	$xd2,0x30($out)
	vmovdqu	$xa3,0x40($out)
	vmovdqu	$xb3,0x50($out)
	vmovdqu	$xc3,0x60($out)
	vmovdqu	$xd3,0x70($out)
	lea	0x80($out),$out		# out+=64*4

	sub	\$64*4,$len
	jnz	.Loop_outer4xop

	jmp	.Ldone4xop

.align	32
.Ltail4xop:
	cmp	\$192,$len
	jae	.L192_or_more4xop
	cmp	\$128,$len
	jae	.L128_or_more4xop
	cmp	\$64,$len
	jae	.L64_or_more4xop

	xor	%r10,%r10
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x10(%rsp)
	vmovdqa	$xc0,0x20(%rsp)
	vmovdqa	$xd0,0x30(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L64_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	je	.Ldone4xop

	lea	0x40($inp),$inp		# inp+=64*1
	vmovdqa	$xa1,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	vmovdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	vmovdqa	$xd1,0x30(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L128_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1

	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	je	.Ldone4xop

	lea	0x80($inp),$inp		# inp+=64*2
	vmovdqa	$xa2,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	vmovdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	vmovdqa	$xd2,0x30(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L192_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2

	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
	vmovdqu	$xd2,0x30($out)
	je	.Ldone4xop

	lea	0x40($inp),$inp		# inp+=64*3
	vmovdqa	$xa3,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	vmovdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	vmovdqa	$xd3,0x30(%rsp)

.Loop_tail4xop:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail4xop

.Ldone4xop:
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
___
}

########################################################################
# AVX2 code path
if ($avx>1) {
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider the order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of 'c's
	# is invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see a
	# bunch of 'c' stores and loads in the middle, but none at
	# the beginning or end.
	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t1)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor	(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t0,@x[$b0],12)",
	"&vpsrld	(@x[$b0],@x[$b0],20)",
	"&vpor	(@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor	(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t1,@x[$b1],12)",
	"&vpsrld	(@x[$b1],@x[$b1],20)",
	"&vpor	(@x[$b1],$t1,@x[$b1])",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t0)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor	(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t1,@x[$b0],7)",
	"&vpsrld	(@x[$b0],@x[$b0],25)",
	"&vpor	(@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor	(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t0,@x[$b1],7)",
	"&vpsrld	(@x[$b1],@x[$b1],25)",
	"&vpor	(@x[$b1],$t0,@x[$b1])",

	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
	"&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t1)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor	(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t0,@x[$b2],12)",
	"&vpsrld	(@x[$b2],@x[$b2],20)",
	"&vpor	(@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor	(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t1,@x[$b3],12)",
	"&vpsrld	(@x[$b3],@x[$b3],20)",
	"&vpor	(@x[$b3],$t1,@x[$b3])",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t0)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor	(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t1,@x[$b2],7)",
	"&vpsrld	(@x[$b2],@x[$b2],25)",
	"&vpor	(@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor	(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t0,@x[$b3],7)",
	"&vpsrld	(@x[$b3],@x[$b3],25)",
	"&vpor	(@x[$b3],$t0,@x[$b3])"
	);
}
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_8x,\@function,5
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	mov	%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$0x280+$xframe,%rsp
	and	\$-32,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L8x_body:
___
$code.=<<___;
	vzeroupper

	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x80		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x200	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x280

	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xt3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	0x200(%rsp),%rax	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vmovdqa	$xa0,0x80-0x100(%rcx)	# ... and offload
	vpshufd	\$0xaa,$xa3,$xa2
	vmovdqa	$xa1,0xa0-0x100(%rcx)
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa	$xa2,0xc0-0x100(%rcx)
	vmovdqa	$xa3,0xe0-0x100(%rcx)

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vmovdqa	$xb0,0x100-0x100(%rcx)
	vpshufd	\$0xaa,$xb3,$xb2
	vmovdqa	$xb1,0x120-0x100(%rcx)
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa	$xb2,0x140-0x100(%rcx)
	vmovdqa	$xb3,0x160-0x100(%rcx)

	vpshufd	\$0x00,$xt3,$xt0	# "xc0"
	vpshufd	\$0x55,$xt3,$xt1	# "xc1"
	vmovdqa	$xt0,0x180-0x200(%rax)
	vpshufd	\$0xaa,$xt3,$xt2	# "xc2"
	vmovdqa	$xt1,0x1a0-0x200(%rax)
	vpshufd	\$0xff,$xt3,$xt3	# "xc3"
	vmovdqa	$xt2,0x1c0-0x200(%rax)
	vmovdqa	$xt3,0x1e0-0x200(%rax)

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpaddd	.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd	\$0xaa,$xd3,$xd2
	vmovdqa	$xd1,0x220-0x200(%rax)
	vpshufd	\$0xff,$xd3,$xd3
	vmovdqa	$xd2,0x240-0x200(%rax)
	vmovdqa	$xd3,0x260-0x200(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	0x80-0x100(%rcx),$xa0	# re-load smashed key
	vmovdqa	0xa0-0x100(%rcx),$xa1
	vmovdqa	0xc0-0x100(%rcx),$xa2
	vmovdqa	0xe0-0x100(%rcx),$xa3
	vmovdqa	0x100-0x100(%rcx),$xb0
	vmovdqa	0x120-0x100(%rcx),$xb1
	vmovdqa	0x140-0x100(%rcx),$xb2
	vmovdqa	0x160-0x100(%rcx),$xb3
	vmovdqa	0x180-0x200(%rax),$xt0	# "xc0"
	vmovdqa	0x1a0-0x200(%rax),$xt1	# "xc1"
	vmovdqa	0x1c0-0x200(%rax),$xt2	# "xc2"
	vmovdqa	0x1e0-0x200(%rax),$xt3	# "xc3"
	vmovdqa	0x200-0x200(%rax),$xd0
	vmovdqa	0x220-0x200(%rax),$xd1
	vmovdqa	0x240-0x200(%rax),$xd2
	vmovdqa	0x260-0x200(%rax),$xd3
	vpaddd	.Leight(%rip),$xd0,$xd0	# next SIMD counters

.Loop_enter8x:
	vmovdqa	$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
	vmovdqa	$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
	vbroadcasti128	(%r10),$xt3
	vmovdqa	$xd0,0x200-0x200(%rax)	# save SIMD counters
	mov	\$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
___
	foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
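	# The two invocations above emit one ChaCha "double round": the first
	# works on columns of the 4x4 state (quartet 0,4,8,12 and friends),
	# the second on diagonals (0,5,10,15 and friends). Inside the
	# *_lane_ROUND subs the other three quartets are derived with the
	# index map ($_&~3)+(($_+1)&3), which rotates an index within its
	# row, e.g. (0,5,10,15) -> (1,6,11,12) -> (2,7,8,13) -> (3,4,9,14).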
2060$code.=<<___; 2061 dec %eax 2062 jnz .Loop8x 2063 2064 lea 0x200(%rsp),%rax # size optimization 2065 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key 2066 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 2067 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 2068 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 2069 2070 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 2071 vpunpckldq $xa3,$xa2,$xt3 2072 vpunpckhdq $xa1,$xa0,$xa0 2073 vpunpckhdq $xa3,$xa2,$xa2 2074 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 2075 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 2076 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 2077 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 2078___ 2079 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 2080$code.=<<___; 2081 vpaddd 0x100-0x100(%rcx),$xb0,$xb0 2082 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 2083 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 2084 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 2085 2086 vpunpckldq $xb1,$xb0,$xt2 2087 vpunpckldq $xb3,$xb2,$xt3 2088 vpunpckhdq $xb1,$xb0,$xb0 2089 vpunpckhdq $xb3,$xb2,$xb2 2090 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 2091 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 2092 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 2093 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 2094___ 2095 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 2096$code.=<<___; 2097 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further 2098 vperm2i128 \$0x31,$xb0,$xa0,$xb0 2099 vperm2i128 \$0x20,$xb1,$xa1,$xa0 2100 vperm2i128 \$0x31,$xb1,$xa1,$xb1 2101 vperm2i128 \$0x20,$xb2,$xa2,$xa1 2102 vperm2i128 \$0x31,$xb2,$xa2,$xb2 2103 vperm2i128 \$0x20,$xb3,$xa3,$xa2 2104 vperm2i128 \$0x31,$xb3,$xa3,$xb3 2105___ 2106 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 2107 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 2108$code.=<<___; 2109 vmovdqa $xa0,0x00(%rsp) # offload $xaN 2110 vmovdqa $xa1,0x20(%rsp) 2111 vmovdqa 0x40(%rsp),$xc2 # $xa0 2112 vmovdqa 0x60(%rsp),$xc3 # $xa1 2113 2114 vpaddd 0x180-0x200(%rax),$xc0,$xc0 2115 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 2116 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 2117 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 2118 2119 vpunpckldq $xc1,$xc0,$xt2 2120 vpunpckldq $xc3,$xc2,$xt3 2121 vpunpckhdq $xc1,$xc0,$xc0 2122 vpunpckhdq $xc3,$xc2,$xc2 2123 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 2124 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 2125 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 2126 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 2127___ 2128 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 2129$code.=<<___; 2130 vpaddd 0x200-0x200(%rax),$xd0,$xd0 2131 vpaddd 0x220-0x200(%rax),$xd1,$xd1 2132 vpaddd 0x240-0x200(%rax),$xd2,$xd2 2133 vpaddd 0x260-0x200(%rax),$xd3,$xd3 2134 2135 vpunpckldq $xd1,$xd0,$xt2 2136 vpunpckldq $xd3,$xd2,$xt3 2137 vpunpckhdq $xd1,$xd0,$xd0 2138 vpunpckhdq $xd3,$xd2,$xd2 2139 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 2140 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 2141 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 2142 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 2143___ 2144 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 2145$code.=<<___; 2146 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 2147 vperm2i128 \$0x31,$xd0,$xc0,$xd0 2148 vperm2i128 \$0x20,$xd1,$xc1,$xc0 2149 vperm2i128 \$0x31,$xd1,$xc1,$xd1 2150 vperm2i128 \$0x20,$xd2,$xc2,$xc1 2151 vperm2i128 \$0x31,$xd2,$xc2,$xd2 2152 vperm2i128 \$0x20,$xd3,$xc3,$xc2 2153 vperm2i128 \$0x31,$xd3,$xc3,$xd3 2154___ 2155 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 2156 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 2157 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 2158 ($xa0,$xa1)=($xt2,$xt3); 2159$code.=<<___; 2160 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
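#
# Transpose note: throughout the rounds each ymm register held one state
# word for all eight blocks ("vertical" layout), so the vpunpck and
# vperm2i128 sequences above rearrange sixteen such registers back into
# eight contiguous 64-byte key-stream blocks. A hedged intrinsics sketch
# of the 4x4 dword stage (illustrative only; the code above renames
# registers between stages instead of copying):
#
#	#include <immintrin.h>
#
#	/* Transpose four rows of packed dwords, one 128-bit lane at a time. */
#	static inline void transpose4x4_epi32(__m256i r[4])
#	{
#	    __m256i t0 = _mm256_unpacklo_epi32(r[0], r[1]);
#	    __m256i t1 = _mm256_unpacklo_epi32(r[2], r[3]);
#	    __m256i t2 = _mm256_unpackhi_epi32(r[0], r[1]);
#	    __m256i t3 = _mm256_unpackhi_epi32(r[2], r[3]);
#	    r[0] = _mm256_unpacklo_epi64(t0, t1);
#	    r[1] = _mm256_unpackhi_epi64(t0, t1);
#	    r[2] = _mm256_unpacklo_epi64(t2, t3);
#	    r[3] = _mm256_unpackhi_epi64(t2, t3);
#	    /* vperm2i128 with imm 0x20 and 0x31 then exchanges the 128-bit
#	       halves between register pairs to finish the 8-wide transpose. */
#	}
#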
2161 vmovdqa 0x20(%rsp),$xa1 2162 2163 cmp \$64*8,$len 2164 jb .Ltail8x 2165 2166 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2167 vpxor 0x20($inp),$xb0,$xb0 2168 vpxor 0x40($inp),$xc0,$xc0 2169 vpxor 0x60($inp),$xd0,$xd0 2170 lea 0x80($inp),$inp # size optimization 2171 vmovdqu $xa0,0x00($out) 2172 vmovdqu $xb0,0x20($out) 2173 vmovdqu $xc0,0x40($out) 2174 vmovdqu $xd0,0x60($out) 2175 lea 0x80($out),$out # size optimization 2176 2177 vpxor 0x00($inp),$xa1,$xa1 2178 vpxor 0x20($inp),$xb1,$xb1 2179 vpxor 0x40($inp),$xc1,$xc1 2180 vpxor 0x60($inp),$xd1,$xd1 2181 lea 0x80($inp),$inp # size optimization 2182 vmovdqu $xa1,0x00($out) 2183 vmovdqu $xb1,0x20($out) 2184 vmovdqu $xc1,0x40($out) 2185 vmovdqu $xd1,0x60($out) 2186 lea 0x80($out),$out # size optimization 2187 2188 vpxor 0x00($inp),$xa2,$xa2 2189 vpxor 0x20($inp),$xb2,$xb2 2190 vpxor 0x40($inp),$xc2,$xc2 2191 vpxor 0x60($inp),$xd2,$xd2 2192 lea 0x80($inp),$inp # size optimization 2193 vmovdqu $xa2,0x00($out) 2194 vmovdqu $xb2,0x20($out) 2195 vmovdqu $xc2,0x40($out) 2196 vmovdqu $xd2,0x60($out) 2197 lea 0x80($out),$out # size optimization 2198 2199 vpxor 0x00($inp),$xa3,$xa3 2200 vpxor 0x20($inp),$xb3,$xb3 2201 vpxor 0x40($inp),$xc3,$xc3 2202 vpxor 0x60($inp),$xd3,$xd3 2203 lea 0x80($inp),$inp # size optimization 2204 vmovdqu $xa3,0x00($out) 2205 vmovdqu $xb3,0x20($out) 2206 vmovdqu $xc3,0x40($out) 2207 vmovdqu $xd3,0x60($out) 2208 lea 0x80($out),$out # size optimization 2209 2210 sub \$64*8,$len 2211 jnz .Loop_outer8x 2212 2213 jmp .Ldone8x 2214 2215.Ltail8x: 2216 cmp \$448,$len 2217 jae .L448_or_more8x 2218 cmp \$384,$len 2219 jae .L384_or_more8x 2220 cmp \$320,$len 2221 jae .L320_or_more8x 2222 cmp \$256,$len 2223 jae .L256_or_more8x 2224 cmp \$192,$len 2225 jae .L192_or_more8x 2226 cmp \$128,$len 2227 jae .L128_or_more8x 2228 cmp \$64,$len 2229 jae .L64_or_more8x 2230 2231 xor %r10,%r10 2232 vmovdqa $xa0,0x00(%rsp) 2233 vmovdqa $xb0,0x20(%rsp) 2234 jmp .Loop_tail8x 2235 2236.align 32 2237.L64_or_more8x: 2238 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2239 vpxor 0x20($inp),$xb0,$xb0 2240 vmovdqu $xa0,0x00($out) 2241 vmovdqu $xb0,0x20($out) 2242 je .Ldone8x 2243 2244 lea 0x40($inp),$inp # inp+=64*1 2245 xor %r10,%r10 2246 vmovdqa $xc0,0x00(%rsp) 2247 lea 0x40($out),$out # out+=64*1 2248 sub \$64,$len # len-=64*1 2249 vmovdqa $xd0,0x20(%rsp) 2250 jmp .Loop_tail8x 2251 2252.align 32 2253.L128_or_more8x: 2254 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2255 vpxor 0x20($inp),$xb0,$xb0 2256 vpxor 0x40($inp),$xc0,$xc0 2257 vpxor 0x60($inp),$xd0,$xd0 2258 vmovdqu $xa0,0x00($out) 2259 vmovdqu $xb0,0x20($out) 2260 vmovdqu $xc0,0x40($out) 2261 vmovdqu $xd0,0x60($out) 2262 je .Ldone8x 2263 2264 lea 0x80($inp),$inp # inp+=64*2 2265 xor %r10,%r10 2266 vmovdqa $xa1,0x00(%rsp) 2267 lea 0x80($out),$out # out+=64*2 2268 sub \$128,$len # len-=64*2 2269 vmovdqa $xb1,0x20(%rsp) 2270 jmp .Loop_tail8x 2271 2272.align 32 2273.L192_or_more8x: 2274 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2275 vpxor 0x20($inp),$xb0,$xb0 2276 vpxor 0x40($inp),$xc0,$xc0 2277 vpxor 0x60($inp),$xd0,$xd0 2278 vpxor 0x80($inp),$xa1,$xa1 2279 vpxor 0xa0($inp),$xb1,$xb1 2280 vmovdqu $xa0,0x00($out) 2281 vmovdqu $xb0,0x20($out) 2282 vmovdqu $xc0,0x40($out) 2283 vmovdqu $xd0,0x60($out) 2284 vmovdqu $xa1,0x80($out) 2285 vmovdqu $xb1,0xa0($out) 2286 je .Ldone8x 2287 2288 lea 0xc0($inp),$inp # inp+=64*3 2289 xor %r10,%r10 2290 vmovdqa $xc1,0x00(%rsp) 2291 lea 0xc0($out),$out # out+=64*3 2292 sub \$192,$len # len-=64*3 2293 vmovdqa $xd1,0x20(%rsp) 2294 jmp .Loop_tail8x 2295 2296.align 32 
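#
# Tail dispatch: the .Ltail8x ladder above compares the residual length
# against descending multiples of 64 and branches to the largest
# whole-block handler that fits. Each .LNNN_or_more8x handler xors and
# stores complete 64-byte blocks, then parks the next 64 bytes of key
# stream at (%rsp) so the final partial block can be finished byte by
# byte in .Loop_tail8x. A hedged C sketch of that byte loop (helper name
# is illustrative):
#
#	#include <stddef.h>
#	#include <stdint.h>
#
#	/* len is the residue modulo 64; stream holds unused key stream. */
#	static void xor_tail(uint8_t *out, const uint8_t *in,
#	                     const uint8_t stream[64], size_t len)
#	{
#	    for (size_t i = 0; i < len; i++)
#	        out[i] = in[i] ^ stream[i]; /* movzb/xor/mov in .Loop_tail8x */
#	}
#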
2297.L256_or_more8x: 2298 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2299 vpxor 0x20($inp),$xb0,$xb0 2300 vpxor 0x40($inp),$xc0,$xc0 2301 vpxor 0x60($inp),$xd0,$xd0 2302 vpxor 0x80($inp),$xa1,$xa1 2303 vpxor 0xa0($inp),$xb1,$xb1 2304 vpxor 0xc0($inp),$xc1,$xc1 2305 vpxor 0xe0($inp),$xd1,$xd1 2306 vmovdqu $xa0,0x00($out) 2307 vmovdqu $xb0,0x20($out) 2308 vmovdqu $xc0,0x40($out) 2309 vmovdqu $xd0,0x60($out) 2310 vmovdqu $xa1,0x80($out) 2311 vmovdqu $xb1,0xa0($out) 2312 vmovdqu $xc1,0xc0($out) 2313 vmovdqu $xd1,0xe0($out) 2314 je .Ldone8x 2315 2316 lea 0x100($inp),$inp # inp+=64*4 2317 xor %r10,%r10 2318 vmovdqa $xa2,0x00(%rsp) 2319 lea 0x100($out),$out # out+=64*4 2320 sub \$256,$len # len-=64*4 2321 vmovdqa $xb2,0x20(%rsp) 2322 jmp .Loop_tail8x 2323 2324.align 32 2325.L320_or_more8x: 2326 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2327 vpxor 0x20($inp),$xb0,$xb0 2328 vpxor 0x40($inp),$xc0,$xc0 2329 vpxor 0x60($inp),$xd0,$xd0 2330 vpxor 0x80($inp),$xa1,$xa1 2331 vpxor 0xa0($inp),$xb1,$xb1 2332 vpxor 0xc0($inp),$xc1,$xc1 2333 vpxor 0xe0($inp),$xd1,$xd1 2334 vpxor 0x100($inp),$xa2,$xa2 2335 vpxor 0x120($inp),$xb2,$xb2 2336 vmovdqu $xa0,0x00($out) 2337 vmovdqu $xb0,0x20($out) 2338 vmovdqu $xc0,0x40($out) 2339 vmovdqu $xd0,0x60($out) 2340 vmovdqu $xa1,0x80($out) 2341 vmovdqu $xb1,0xa0($out) 2342 vmovdqu $xc1,0xc0($out) 2343 vmovdqu $xd1,0xe0($out) 2344 vmovdqu $xa2,0x100($out) 2345 vmovdqu $xb2,0x120($out) 2346 je .Ldone8x 2347 2348 lea 0x140($inp),$inp # inp+=64*5 2349 xor %r10,%r10 2350 vmovdqa $xc2,0x00(%rsp) 2351 lea 0x140($out),$out # out+=64*5 2352 sub \$320,$len # len-=64*5 2353 vmovdqa $xd2,0x20(%rsp) 2354 jmp .Loop_tail8x 2355 2356.align 32 2357.L384_or_more8x: 2358 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2359 vpxor 0x20($inp),$xb0,$xb0 2360 vpxor 0x40($inp),$xc0,$xc0 2361 vpxor 0x60($inp),$xd0,$xd0 2362 vpxor 0x80($inp),$xa1,$xa1 2363 vpxor 0xa0($inp),$xb1,$xb1 2364 vpxor 0xc0($inp),$xc1,$xc1 2365 vpxor 0xe0($inp),$xd1,$xd1 2366 vpxor 0x100($inp),$xa2,$xa2 2367 vpxor 0x120($inp),$xb2,$xb2 2368 vpxor 0x140($inp),$xc2,$xc2 2369 vpxor 0x160($inp),$xd2,$xd2 2370 vmovdqu $xa0,0x00($out) 2371 vmovdqu $xb0,0x20($out) 2372 vmovdqu $xc0,0x40($out) 2373 vmovdqu $xd0,0x60($out) 2374 vmovdqu $xa1,0x80($out) 2375 vmovdqu $xb1,0xa0($out) 2376 vmovdqu $xc1,0xc0($out) 2377 vmovdqu $xd1,0xe0($out) 2378 vmovdqu $xa2,0x100($out) 2379 vmovdqu $xb2,0x120($out) 2380 vmovdqu $xc2,0x140($out) 2381 vmovdqu $xd2,0x160($out) 2382 je .Ldone8x 2383 2384 lea 0x180($inp),$inp # inp+=64*6 2385 xor %r10,%r10 2386 vmovdqa $xa3,0x00(%rsp) 2387 lea 0x180($out),$out # out+=64*6 2388 sub \$384,$len # len-=64*6 2389 vmovdqa $xb3,0x20(%rsp) 2390 jmp .Loop_tail8x 2391 2392.align 32 2393.L448_or_more8x: 2394 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2395 vpxor 0x20($inp),$xb0,$xb0 2396 vpxor 0x40($inp),$xc0,$xc0 2397 vpxor 0x60($inp),$xd0,$xd0 2398 vpxor 0x80($inp),$xa1,$xa1 2399 vpxor 0xa0($inp),$xb1,$xb1 2400 vpxor 0xc0($inp),$xc1,$xc1 2401 vpxor 0xe0($inp),$xd1,$xd1 2402 vpxor 0x100($inp),$xa2,$xa2 2403 vpxor 0x120($inp),$xb2,$xb2 2404 vpxor 0x140($inp),$xc2,$xc2 2405 vpxor 0x160($inp),$xd2,$xd2 2406 vpxor 0x180($inp),$xa3,$xa3 2407 vpxor 0x1a0($inp),$xb3,$xb3 2408 vmovdqu $xa0,0x00($out) 2409 vmovdqu $xb0,0x20($out) 2410 vmovdqu $xc0,0x40($out) 2411 vmovdqu $xd0,0x60($out) 2412 vmovdqu $xa1,0x80($out) 2413 vmovdqu $xb1,0xa0($out) 2414 vmovdqu $xc1,0xc0($out) 2415 vmovdqu $xd1,0xe0($out) 2416 vmovdqu $xa2,0x100($out) 2417 vmovdqu $xb2,0x120($out) 2418 vmovdqu $xc2,0x140($out) 2419 vmovdqu $xd2,0x160($out) 2420 
vmovdqu $xa3,0x180($out) 2421 vmovdqu $xb3,0x1a0($out) 2422 je .Ldone8x 2423 2424 lea 0x1c0($inp),$inp # inp+=64*7 2425 xor %r10,%r10 2426 vmovdqa $xc3,0x00(%rsp) 2427 lea 0x1c0($out),$out # out+=64*7 2428 sub \$448,$len # len-=64*7 2429 vmovdqa $xd3,0x20(%rsp) 2430 2431.Loop_tail8x: 2432 movzb ($inp,%r10),%eax 2433 movzb (%rsp,%r10),%ecx 2434 lea 1(%r10),%r10 2435 xor %ecx,%eax 2436 mov %al,-1($out,%r10) 2437 dec $len 2438 jnz .Loop_tail8x 2439 2440.Ldone8x: 2441 vzeroall 2442___ 2443$code.=<<___ if ($win64); 2444 movaps -0xa8(%r9),%xmm6 2445 movaps -0x98(%r9),%xmm7 2446 movaps -0x88(%r9),%xmm8 2447 movaps -0x78(%r9),%xmm9 2448 movaps -0x68(%r9),%xmm10 2449 movaps -0x58(%r9),%xmm11 2450 movaps -0x48(%r9),%xmm12 2451 movaps -0x38(%r9),%xmm13 2452 movaps -0x28(%r9),%xmm14 2453 movaps -0x18(%r9),%xmm15 2454___ 2455$code.=<<___; 2456 lea (%r9),%rsp 2457.cfi_def_cfa_register %rsp 2458.L8x_epilogue: 2459 ret 2460.cfi_endproc 2461.size ChaCha20_8x,.-ChaCha20_8x 2462___ 2463} 2464 2465######################################################################## 2466# AVX512 code paths 2467if ($avx>2) { 2468# This one handles shorter inputs... 2469 2470my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); 2471my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 2472 2473sub vpxord() # size optimization 2474{ my $opcode = "vpxor"; # adhere to vpxor when possible 2475 2476 foreach (@_) { 2477 if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { 2478 $opcode = "vpxord"; 2479 last; 2480 } 2481 } 2482 2483 $code .= "\t$opcode\t".join(',',reverse @_)."\n"; 2484} 2485 2486sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round 2487 &vpaddd ($a,$a,$b); 2488 &vpxord ($d,$d,$a); 2489 &vprold ($d,$d,16); 2490 2491 &vpaddd ($c,$c,$d); 2492 &vpxord ($b,$b,$c); 2493 &vprold ($b,$b,12); 2494 2495 &vpaddd ($a,$a,$b); 2496 &vpxord ($d,$d,$a); 2497 &vprold ($d,$d,8); 2498 2499 &vpaddd ($c,$c,$d); 2500 &vpxord ($b,$b,$c); 2501 &vprold ($b,$b,7); 2502} 2503 2504my $xframe = $win64 ? 
160+8 : 8; 2505 2506$code.=<<___; 2507.type ChaCha20_avx512,\@function,5 2508.align 32 2509ChaCha20_avx512: 2510.cfi_startproc 2511.LChaCha20_avx512: 2512 mov %rsp,%r9 # frame pointer 2513.cfi_def_cfa_register %r9 2514 cmp \$512,$len 2515 ja .LChaCha20_16x 2516 2517 sub \$64+$xframe,%rsp 2518___ 2519$code.=<<___ if ($win64); 2520 movaps %xmm6,-0xa8(%r9) 2521 movaps %xmm7,-0x98(%r9) 2522 movaps %xmm8,-0x88(%r9) 2523 movaps %xmm9,-0x78(%r9) 2524 movaps %xmm10,-0x68(%r9) 2525 movaps %xmm11,-0x58(%r9) 2526 movaps %xmm12,-0x48(%r9) 2527 movaps %xmm13,-0x38(%r9) 2528 movaps %xmm14,-0x28(%r9) 2529 movaps %xmm15,-0x18(%r9) 2530.Lavx512_body: 2531___ 2532$code.=<<___; 2533 vbroadcasti32x4 .Lsigma(%rip),$a 2534 vbroadcasti32x4 ($key),$b 2535 vbroadcasti32x4 16($key),$c 2536 vbroadcasti32x4 ($counter),$d 2537 2538 vmovdqa32 $a,$a_ 2539 vmovdqa32 $b,$b_ 2540 vmovdqa32 $c,$c_ 2541 vpaddd .Lzeroz(%rip),$d,$d 2542 vmovdqa32 .Lfourz(%rip),$fourz 2543 mov \$10,$counter # reuse $counter 2544 vmovdqa32 $d,$d_ 2545 jmp .Loop_avx512 2546 2547.align 16 2548.Loop_outer_avx512: 2549 vmovdqa32 $a_,$a 2550 vmovdqa32 $b_,$b 2551 vmovdqa32 $c_,$c 2552 vpaddd $fourz,$d_,$d 2553 mov \$10,$counter 2554 vmovdqa32 $d,$d_ 2555 jmp .Loop_avx512 2556 2557.align 32 2558.Loop_avx512: 2559___ 2560 &AVX512ROUND(); 2561 &vpshufd ($c,$c,0b01001110); 2562 &vpshufd ($b,$b,0b00111001); 2563 &vpshufd ($d,$d,0b10010011); 2564 2565 &AVX512ROUND(); 2566 &vpshufd ($c,$c,0b01001110); 2567 &vpshufd ($b,$b,0b10010011); 2568 &vpshufd ($d,$d,0b00111001); 2569 2570 &dec ($counter); 2571 &jnz (".Loop_avx512"); 2572 2573$code.=<<___; 2574 vpaddd $a_,$a,$a 2575 vpaddd $b_,$b,$b 2576 vpaddd $c_,$c,$c 2577 vpaddd $d_,$d,$d 2578 2579 sub \$64,$len 2580 jb .Ltail64_avx512 2581 2582 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2583 vpxor 0x10($inp),%x#$b,$t1 2584 vpxor 0x20($inp),%x#$c,$t2 2585 vpxor 0x30($inp),%x#$d,$t3 2586 lea 0x40($inp),$inp # inp+=64 2587 2588 vmovdqu $t0,0x00($out) # write output 2589 vmovdqu $t1,0x10($out) 2590 vmovdqu $t2,0x20($out) 2591 vmovdqu $t3,0x30($out) 2592 lea 0x40($out),$out # out+=64 2593 2594 jz .Ldone_avx512 2595 2596 vextracti32x4 \$1,$a,$t0 2597 vextracti32x4 \$1,$b,$t1 2598 vextracti32x4 \$1,$c,$t2 2599 vextracti32x4 \$1,$d,$t3 2600 2601 sub \$64,$len 2602 jb .Ltail_avx512 2603 2604 vpxor 0x00($inp),$t0,$t0 # xor with input 2605 vpxor 0x10($inp),$t1,$t1 2606 vpxor 0x20($inp),$t2,$t2 2607 vpxor 0x30($inp),$t3,$t3 2608 lea 0x40($inp),$inp # inp+=64 2609 2610 vmovdqu $t0,0x00($out) # write output 2611 vmovdqu $t1,0x10($out) 2612 vmovdqu $t2,0x20($out) 2613 vmovdqu $t3,0x30($out) 2614 lea 0x40($out),$out # out+=64 2615 2616 jz .Ldone_avx512 2617 2618 vextracti32x4 \$2,$a,$t0 2619 vextracti32x4 \$2,$b,$t1 2620 vextracti32x4 \$2,$c,$t2 2621 vextracti32x4 \$2,$d,$t3 2622 2623 sub \$64,$len 2624 jb .Ltail_avx512 2625 2626 vpxor 0x00($inp),$t0,$t0 # xor with input 2627 vpxor 0x10($inp),$t1,$t1 2628 vpxor 0x20($inp),$t2,$t2 2629 vpxor 0x30($inp),$t3,$t3 2630 lea 0x40($inp),$inp # inp+=64 2631 2632 vmovdqu $t0,0x00($out) # write output 2633 vmovdqu $t1,0x10($out) 2634 vmovdqu $t2,0x20($out) 2635 vmovdqu $t3,0x30($out) 2636 lea 0x40($out),$out # out+=64 2637 2638 jz .Ldone_avx512 2639 2640 vextracti32x4 \$3,$a,$t0 2641 vextracti32x4 \$3,$b,$t1 2642 vextracti32x4 \$3,$c,$t2 2643 vextracti32x4 \$3,$d,$t3 2644 2645 sub \$64,$len 2646 jb .Ltail_avx512 2647 2648 vpxor 0x00($inp),$t0,$t0 # xor with input 2649 vpxor 0x10($inp),$t1,$t1 2650 vpxor 0x20($inp),$t2,$t2 2651 vpxor 0x30($inp),$t3,$t3 2652 lea 0x40($inp),$inp # inp+=64 
2653 2654 vmovdqu $t0,0x00($out) # write output 2655 vmovdqu $t1,0x10($out) 2656 vmovdqu $t2,0x20($out) 2657 vmovdqu $t3,0x30($out) 2658 lea 0x40($out),$out # out+=64 2659 2660 jnz .Loop_outer_avx512 2661 2662 jmp .Ldone_avx512 2663 2664.align 16 2665.Ltail64_avx512: 2666 vmovdqa %x#$a,0x00(%rsp) 2667 vmovdqa %x#$b,0x10(%rsp) 2668 vmovdqa %x#$c,0x20(%rsp) 2669 vmovdqa %x#$d,0x30(%rsp) 2670 add \$64,$len 2671 jmp .Loop_tail_avx512 2672 2673.align 16 2674.Ltail_avx512: 2675 vmovdqa $t0,0x00(%rsp) 2676 vmovdqa $t1,0x10(%rsp) 2677 vmovdqa $t2,0x20(%rsp) 2678 vmovdqa $t3,0x30(%rsp) 2679 add \$64,$len 2680 2681.Loop_tail_avx512: 2682 movzb ($inp,$counter),%eax 2683 movzb (%rsp,$counter),%ecx 2684 lea 1($counter),$counter 2685 xor %ecx,%eax 2686 mov %al,-1($out,$counter) 2687 dec $len 2688 jnz .Loop_tail_avx512 2689 2690 vmovdqu32 $a_,0x00(%rsp) 2691 2692.Ldone_avx512: 2693 vzeroall 2694___ 2695$code.=<<___ if ($win64); 2696 movaps -0xa8(%r9),%xmm6 2697 movaps -0x98(%r9),%xmm7 2698 movaps -0x88(%r9),%xmm8 2699 movaps -0x78(%r9),%xmm9 2700 movaps -0x68(%r9),%xmm10 2701 movaps -0x58(%r9),%xmm11 2702 movaps -0x48(%r9),%xmm12 2703 movaps -0x38(%r9),%xmm13 2704 movaps -0x28(%r9),%xmm14 2705 movaps -0x18(%r9),%xmm15 2706___ 2707$code.=<<___; 2708 lea (%r9),%rsp 2709.cfi_def_cfa_register %rsp 2710.Lavx512_epilogue: 2711 ret 2712.cfi_endproc 2713.size ChaCha20_avx512,.-ChaCha20_avx512 2714___ 2715 2716map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); 2717 2718$code.=<<___; 2719.type ChaCha20_avx512vl,\@function,5 2720.align 32 2721ChaCha20_avx512vl: 2722.cfi_startproc 2723.LChaCha20_avx512vl: 2724 mov %rsp,%r9 # frame pointer 2725.cfi_def_cfa_register %r9 2726 cmp \$128,$len 2727 ja .LChaCha20_8xvl 2728 2729 sub \$64+$xframe,%rsp 2730___ 2731$code.=<<___ if ($win64); 2732 movaps %xmm6,-0xa8(%r9) 2733 movaps %xmm7,-0x98(%r9) 2734 movaps %xmm8,-0x88(%r9) 2735 movaps %xmm9,-0x78(%r9) 2736 movaps %xmm10,-0x68(%r9) 2737 movaps %xmm11,-0x58(%r9) 2738 movaps %xmm12,-0x48(%r9) 2739 movaps %xmm13,-0x38(%r9) 2740 movaps %xmm14,-0x28(%r9) 2741 movaps %xmm15,-0x18(%r9) 2742.Lavx512vl_body: 2743___ 2744$code.=<<___; 2745 vbroadcasti128 .Lsigma(%rip),$a 2746 vbroadcasti128 ($key),$b 2747 vbroadcasti128 16($key),$c 2748 vbroadcasti128 ($counter),$d 2749 2750 vmovdqa32 $a,$a_ 2751 vmovdqa32 $b,$b_ 2752 vmovdqa32 $c,$c_ 2753 vpaddd .Lzeroz(%rip),$d,$d 2754 vmovdqa32 .Ltwoy(%rip),$fourz 2755 mov \$10,$counter # reuse $counter 2756 vmovdqa32 $d,$d_ 2757 jmp .Loop_avx512vl 2758 2759.align 16 2760.Loop_outer_avx512vl: 2761 vmovdqa32 $c_,$c 2762 vpaddd $fourz,$d_,$d 2763 mov \$10,$counter 2764 vmovdqa32 $d,$d_ 2765 jmp .Loop_avx512vl 2766 2767.align 32 2768.Loop_avx512vl: 2769___ 2770 &AVX512ROUND(); 2771 &vpshufd ($c,$c,0b01001110); 2772 &vpshufd ($b,$b,0b00111001); 2773 &vpshufd ($d,$d,0b10010011); 2774 2775 &AVX512ROUND(); 2776 &vpshufd ($c,$c,0b01001110); 2777 &vpshufd ($b,$b,0b10010011); 2778 &vpshufd ($d,$d,0b00111001); 2779 2780 &dec ($counter); 2781 &jnz (".Loop_avx512vl"); 2782 2783$code.=<<___; 2784 vpaddd $a_,$a,$a 2785 vpaddd $b_,$b,$b 2786 vpaddd $c_,$c,$c 2787 vpaddd $d_,$d,$d 2788 2789 sub \$64,$len 2790 jb .Ltail64_avx512vl 2791 2792 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2793 vpxor 0x10($inp),%x#$b,$t1 2794 vpxor 0x20($inp),%x#$c,$t2 2795 vpxor 0x30($inp),%x#$d,$t3 2796 lea 0x40($inp),$inp # inp+=64 2797 2798 vmovdqu $t0,0x00($out) # write output 2799 vmovdqu $t1,0x10($out) 2800 vmovdqu $t2,0x20($out) 2801 vmovdqu $t3,0x30($out) 2802 lea 0x40($out),$out # out+=64 2803 2804 jz .Ldone_avx512vl 2805 2806 
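#
# Register layout reminder: in this path every ymm register carries the
# same state row for two consecutive blocks, with the low 128 bits
# belonging to block 0 and the high 128 bits to block 1 (their block
# counters differ by one via .Lzeroz, and .Ltwoy advances both by two per
# outer iteration). Block 0 was just written from the low halves; the
# vextracti128 group below pulls block 1 out of the high halves. Hedged
# intrinsics equivalent (illustrative only):
#
#	#include <immintrin.h>
#
#	static inline __m128i high_block_row(__m256i row)
#	{
#	    return _mm256_extracti128_si256(row, 1); /* vextracti128 imm 1 */
#	}
#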
vextracti128 \$1,$a,$t0 2807 vextracti128 \$1,$b,$t1 2808 vextracti128 \$1,$c,$t2 2809 vextracti128 \$1,$d,$t3 2810 2811 sub \$64,$len 2812 jb .Ltail_avx512vl 2813 2814 vpxor 0x00($inp),$t0,$t0 # xor with input 2815 vpxor 0x10($inp),$t1,$t1 2816 vpxor 0x20($inp),$t2,$t2 2817 vpxor 0x30($inp),$t3,$t3 2818 lea 0x40($inp),$inp # inp+=64 2819 2820 vmovdqu $t0,0x00($out) # write output 2821 vmovdqu $t1,0x10($out) 2822 vmovdqu $t2,0x20($out) 2823 vmovdqu $t3,0x30($out) 2824 lea 0x40($out),$out # out+=64 2825 2826 vmovdqa32 $a_,$a 2827 vmovdqa32 $b_,$b 2828 jnz .Loop_outer_avx512vl 2829 2830 jmp .Ldone_avx512vl 2831 2832.align 16 2833.Ltail64_avx512vl: 2834 vmovdqa %x#$a,0x00(%rsp) 2835 vmovdqa %x#$b,0x10(%rsp) 2836 vmovdqa %x#$c,0x20(%rsp) 2837 vmovdqa %x#$d,0x30(%rsp) 2838 add \$64,$len 2839 jmp .Loop_tail_avx512vl 2840 2841.align 16 2842.Ltail_avx512vl: 2843 vmovdqa $t0,0x00(%rsp) 2844 vmovdqa $t1,0x10(%rsp) 2845 vmovdqa $t2,0x20(%rsp) 2846 vmovdqa $t3,0x30(%rsp) 2847 add \$64,$len 2848 2849.Loop_tail_avx512vl: 2850 movzb ($inp,$counter),%eax 2851 movzb (%rsp,$counter),%ecx 2852 lea 1($counter),$counter 2853 xor %ecx,%eax 2854 mov %al,-1($out,$counter) 2855 dec $len 2856 jnz .Loop_tail_avx512vl 2857 2858 vmovdqu32 $a_,0x00(%rsp) 2859 vmovdqu32 $a_,0x20(%rsp) 2860 2861.Ldone_avx512vl: 2862 vzeroall 2863___ 2864$code.=<<___ if ($win64); 2865 movaps -0xa8(%r9),%xmm6 2866 movaps -0x98(%r9),%xmm7 2867 movaps -0x88(%r9),%xmm8 2868 movaps -0x78(%r9),%xmm9 2869 movaps -0x68(%r9),%xmm10 2870 movaps -0x58(%r9),%xmm11 2871 movaps -0x48(%r9),%xmm12 2872 movaps -0x38(%r9),%xmm13 2873 movaps -0x28(%r9),%xmm14 2874 movaps -0x18(%r9),%xmm15 2875___ 2876$code.=<<___; 2877 lea (%r9),%rsp 2878.cfi_def_cfa_register %rsp 2879.Lavx512vl_epilogue: 2880 ret 2881.cfi_endproc 2882.size ChaCha20_avx512vl,.-ChaCha20_avx512vl 2883___ 2884} 2885if ($avx>2) { 2886# This one handles longer inputs... 
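# Layout sketch: %zmm0..%zmm15 hold the sixteen ChaCha state words
# "vertically", i.e. register i carries word i of sixteen independent
# blocks, while %zmm16..%zmm31 (@key) keep the broadcast key material live
# across outer iterations. With AVX512F the rotates are single vprold
# instructions, so the quarter-round needs neither shuffle masks nor
# scratch registers. A hedged intrinsics sketch of one quarter-round step
# across sixteen lanes (illustrative only, not part of this module):
#
#	#include <immintrin.h>
#
#	static inline void qround_16way(__m512i *a, __m512i *b,
#	                                __m512i *c, __m512i *d)
#	{
#	    *a = _mm512_add_epi32(*a, *b);                       /* vpaddd */
#	    *d = _mm512_rol_epi32(_mm512_xor_si512(*d, *a), 16); /* vpxord+vprold */
#	    *c = _mm512_add_epi32(*c, *d);
#	    *b = _mm512_rol_epi32(_mm512_xor_si512(*b, *c), 12);
#	    *a = _mm512_add_epi32(*a, *b);
#	    *d = _mm512_rol_epi32(_mm512_xor_si512(*d, *a), 8);
#	    *c = _mm512_add_epi32(*c, *d);
#	    *b = _mm512_rol_epi32(_mm512_xor_si512(*b, *c), 7);
#	}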
2887 2888my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2889 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); 2890my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2891 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 2892my @key=map("%zmm$_",(16..31)); 2893my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 2894 2895sub AVX512_lane_ROUND { 2896my ($a0,$b0,$c0,$d0)=@_; 2897my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 2898my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 2899my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 2900my @x=map("\"$_\"",@xx); 2901 2902 ( 2903 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 2904 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 2905 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 2906 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 2907 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2908 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2909 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2910 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2911 "&vprold (@x[$d0],@x[$d0],16)", 2912 "&vprold (@x[$d1],@x[$d1],16)", 2913 "&vprold (@x[$d2],@x[$d2],16)", 2914 "&vprold (@x[$d3],@x[$d3],16)", 2915 2916 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2917 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2918 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2919 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2920 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2921 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2922 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2923 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2924 "&vprold (@x[$b0],@x[$b0],12)", 2925 "&vprold (@x[$b1],@x[$b1],12)", 2926 "&vprold (@x[$b2],@x[$b2],12)", 2927 "&vprold (@x[$b3],@x[$b3],12)", 2928 2929 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 2930 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 2931 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 2932 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 2933 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2934 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2935 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2936 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2937 "&vprold (@x[$d0],@x[$d0],8)", 2938 "&vprold (@x[$d1],@x[$d1],8)", 2939 "&vprold (@x[$d2],@x[$d2],8)", 2940 "&vprold (@x[$d3],@x[$d3],8)", 2941 2942 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2943 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2944 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2945 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2946 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2947 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2948 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2949 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2950 "&vprold (@x[$b0],@x[$b0],7)", 2951 "&vprold (@x[$b1],@x[$b1],7)", 2952 "&vprold (@x[$b2],@x[$b2],7)", 2953 "&vprold (@x[$b3],@x[$b3],7)" 2954 ); 2955} 2956 2957my $xframe = $win64 ? 0xa8 : 8; 2958 2959$code.=<<___; 2960.type ChaCha20_16x,\@function,5 2961.align 32 2962ChaCha20_16x: 2963.cfi_startproc 2964.LChaCha20_16x: 2965 mov %rsp,%r9 # frame register 2966.cfi_def_cfa_register %r9 2967 sub \$64+$xframe,%rsp 2968 and \$-64,%rsp 2969___ 2970$code.=<<___ if ($win64); 2971 movaps %xmm6,-0xa8(%r9) 2972 movaps %xmm7,-0x98(%r9) 2973 movaps %xmm8,-0x88(%r9) 2974 movaps %xmm9,-0x78(%r9) 2975 movaps %xmm10,-0x68(%r9) 2976 movaps %xmm11,-0x58(%r9) 2977 movaps %xmm12,-0x48(%r9) 2978 movaps %xmm13,-0x38(%r9) 2979 movaps %xmm14,-0x28(%r9) 2980 movaps %xmm15,-0x18(%r9) 2981.L16x_body: 2982___ 2983$code.=<<___; 2984 vzeroupper 2985 2986 lea .Lsigma(%rip),%r10 2987 vbroadcasti32x4 (%r10),$xa3 # key[0] 2988 vbroadcasti32x4 ($key),$xb3 # key[1] 2989 vbroadcasti32x4 16($key),$xc3 # key[2] 2990 vbroadcasti32x4 ($counter),$xd3 # key[3] 2991 2992 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
2993 vpshufd \$0x55,$xa3,$xa1 2994 vpshufd \$0xaa,$xa3,$xa2 2995 vpshufd \$0xff,$xa3,$xa3 2996 vmovdqa64 $xa0,@key[0] 2997 vmovdqa64 $xa1,@key[1] 2998 vmovdqa64 $xa2,@key[2] 2999 vmovdqa64 $xa3,@key[3] 3000 3001 vpshufd \$0x00,$xb3,$xb0 3002 vpshufd \$0x55,$xb3,$xb1 3003 vpshufd \$0xaa,$xb3,$xb2 3004 vpshufd \$0xff,$xb3,$xb3 3005 vmovdqa64 $xb0,@key[4] 3006 vmovdqa64 $xb1,@key[5] 3007 vmovdqa64 $xb2,@key[6] 3008 vmovdqa64 $xb3,@key[7] 3009 3010 vpshufd \$0x00,$xc3,$xc0 3011 vpshufd \$0x55,$xc3,$xc1 3012 vpshufd \$0xaa,$xc3,$xc2 3013 vpshufd \$0xff,$xc3,$xc3 3014 vmovdqa64 $xc0,@key[8] 3015 vmovdqa64 $xc1,@key[9] 3016 vmovdqa64 $xc2,@key[10] 3017 vmovdqa64 $xc3,@key[11] 3018 3019 vpshufd \$0x00,$xd3,$xd0 3020 vpshufd \$0x55,$xd3,$xd1 3021 vpshufd \$0xaa,$xd3,$xd2 3022 vpshufd \$0xff,$xd3,$xd3 3023 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet 3024 vmovdqa64 $xd0,@key[12] 3025 vmovdqa64 $xd1,@key[13] 3026 vmovdqa64 $xd2,@key[14] 3027 vmovdqa64 $xd3,@key[15] 3028 3029 mov \$10,%eax 3030 jmp .Loop16x 3031 3032.align 32 3033.Loop_outer16x: 3034 vpbroadcastd 0(%r10),$xa0 # reload key 3035 vpbroadcastd 4(%r10),$xa1 3036 vpbroadcastd 8(%r10),$xa2 3037 vpbroadcastd 12(%r10),$xa3 3038 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters 3039 vmovdqa64 @key[4],$xb0 3040 vmovdqa64 @key[5],$xb1 3041 vmovdqa64 @key[6],$xb2 3042 vmovdqa64 @key[7],$xb3 3043 vmovdqa64 @key[8],$xc0 3044 vmovdqa64 @key[9],$xc1 3045 vmovdqa64 @key[10],$xc2 3046 vmovdqa64 @key[11],$xc3 3047 vmovdqa64 @key[12],$xd0 3048 vmovdqa64 @key[13],$xd1 3049 vmovdqa64 @key[14],$xd2 3050 vmovdqa64 @key[15],$xd3 3051 3052 vmovdqa64 $xa0,@key[0] 3053 vmovdqa64 $xa1,@key[1] 3054 vmovdqa64 $xa2,@key[2] 3055 vmovdqa64 $xa3,@key[3] 3056 3057 mov \$10,%eax 3058 jmp .Loop16x 3059 3060.align 32 3061.Loop16x: 3062___ 3063 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3064 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3065$code.=<<___; 3066 dec %eax 3067 jnz .Loop16x 3068 3069 vpaddd @key[0],$xa0,$xa0 # accumulate key 3070 vpaddd @key[1],$xa1,$xa1 3071 vpaddd @key[2],$xa2,$xa2 3072 vpaddd @key[3],$xa3,$xa3 3073 3074 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3075 vpunpckldq $xa3,$xa2,$xt3 3076 vpunpckhdq $xa1,$xa0,$xa0 3077 vpunpckhdq $xa3,$xa2,$xa2 3078 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 3079 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3080 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3081 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3082___ 3083 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3084$code.=<<___; 3085 vpaddd @key[4],$xb0,$xb0 3086 vpaddd @key[5],$xb1,$xb1 3087 vpaddd @key[6],$xb2,$xb2 3088 vpaddd @key[7],$xb3,$xb3 3089 3090 vpunpckldq $xb1,$xb0,$xt2 3091 vpunpckldq $xb3,$xb2,$xt3 3092 vpunpckhdq $xb1,$xb0,$xb0 3093 vpunpckhdq $xb3,$xb2,$xb2 3094 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3095 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3096 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3097 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3098___ 3099 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3100$code.=<<___; 3101 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further 3102 vshufi32x4 \$0xee,$xb0,$xa0,$xb0 3103 vshufi32x4 \$0x44,$xb1,$xa1,$xa0 3104 vshufi32x4 \$0xee,$xb1,$xa1,$xb1 3105 vshufi32x4 \$0x44,$xb2,$xa2,$xa1 3106 vshufi32x4 \$0xee,$xb2,$xa2,$xb2 3107 vshufi32x4 \$0x44,$xb3,$xa3,$xa2 3108 vshufi32x4 \$0xee,$xb3,$xa3,$xb3 3109___ 3110 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3111$code.=<<___; 3112 vpaddd @key[8],$xc0,$xc0 3113 vpaddd @key[9],$xc1,$xc1 3114 vpaddd @key[10],$xc2,$xc2 3115 vpaddd @key[11],$xc3,$xc3 3116 3117 vpunpckldq 
$xc1,$xc0,$xt2 3118 vpunpckldq $xc3,$xc2,$xt3 3119 vpunpckhdq $xc1,$xc0,$xc0 3120 vpunpckhdq $xc3,$xc2,$xc2 3121 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3122 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3123 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3124 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3125___ 3126 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3127$code.=<<___; 3128 vpaddd @key[12],$xd0,$xd0 3129 vpaddd @key[13],$xd1,$xd1 3130 vpaddd @key[14],$xd2,$xd2 3131 vpaddd @key[15],$xd3,$xd3 3132 3133 vpunpckldq $xd1,$xd0,$xt2 3134 vpunpckldq $xd3,$xd2,$xt3 3135 vpunpckhdq $xd1,$xd0,$xd0 3136 vpunpckhdq $xd3,$xd2,$xd2 3137 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3138 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3139 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3140 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3141___ 3142 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3143$code.=<<___; 3144 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further 3145 vshufi32x4 \$0xee,$xd0,$xc0,$xd0 3146 vshufi32x4 \$0x44,$xd1,$xc1,$xc0 3147 vshufi32x4 \$0xee,$xd1,$xc1,$xd1 3148 vshufi32x4 \$0x44,$xd2,$xc2,$xc1 3149 vshufi32x4 \$0xee,$xd2,$xc2,$xd2 3150 vshufi32x4 \$0x44,$xd3,$xc3,$xc2 3151 vshufi32x4 \$0xee,$xd3,$xc3,$xd3 3152___ 3153 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3154$code.=<<___; 3155 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further 3156 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 3157 vshufi32x4 \$0x88,$xd0,$xb0,$xc0 3158 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 3159 vshufi32x4 \$0x88,$xc1,$xa1,$xt1 3160 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 3161 vshufi32x4 \$0x88,$xd1,$xb1,$xc1 3162 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 3163 vshufi32x4 \$0x88,$xc2,$xa2,$xt2 3164 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 3165 vshufi32x4 \$0x88,$xd2,$xb2,$xc2 3166 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 3167 vshufi32x4 \$0x88,$xc3,$xa3,$xt3 3168 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 3169 vshufi32x4 \$0x88,$xd3,$xb3,$xc3 3170 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 3171___ 3172 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= 3173 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); 3174 3175 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, 3176 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = 3177 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3178 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3179$code.=<<___; 3180 cmp \$64*16,$len 3181 jb .Ltail16x 3182 3183 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3184 vpxord 0x40($inp),$xb0,$xb0 3185 vpxord 0x80($inp),$xc0,$xc0 3186 vpxord 0xc0($inp),$xd0,$xd0 3187 vmovdqu32 $xa0,0x00($out) 3188 vmovdqu32 $xb0,0x40($out) 3189 vmovdqu32 $xc0,0x80($out) 3190 vmovdqu32 $xd0,0xc0($out) 3191 3192 vpxord 0x100($inp),$xa1,$xa1 3193 vpxord 0x140($inp),$xb1,$xb1 3194 vpxord 0x180($inp),$xc1,$xc1 3195 vpxord 0x1c0($inp),$xd1,$xd1 3196 vmovdqu32 $xa1,0x100($out) 3197 vmovdqu32 $xb1,0x140($out) 3198 vmovdqu32 $xc1,0x180($out) 3199 vmovdqu32 $xd1,0x1c0($out) 3200 3201 vpxord 0x200($inp),$xa2,$xa2 3202 vpxord 0x240($inp),$xb2,$xb2 3203 vpxord 0x280($inp),$xc2,$xc2 3204 vpxord 0x2c0($inp),$xd2,$xd2 3205 vmovdqu32 $xa2,0x200($out) 3206 vmovdqu32 $xb2,0x240($out) 3207 vmovdqu32 $xc2,0x280($out) 3208 vmovdqu32 $xd2,0x2c0($out) 3209 3210 vpxord 0x300($inp),$xa3,$xa3 3211 vpxord 0x340($inp),$xb3,$xb3 3212 vpxord 0x380($inp),$xc3,$xc3 3213 vpxord 0x3c0($inp),$xd3,$xd3 3214 lea 0x400($inp),$inp 3215 vmovdqu32 $xa3,0x300($out) 3216 vmovdqu32 $xb3,0x340($out) 3217 vmovdqu32 $xc3,0x380($out) 3218 vmovdqu32 $xd3,0x3c0($out) 3219 lea 0x400($out),$out 3220 3221 sub \$64*16,$len 3222 jnz .Loop_outer16x 3223 3224 jmp .Ldone16x 3225 3226.align 32 3227.Ltail16x: 3228 xor %r10,%r10 3229 sub $inp,$out 3230 cmp 
\$64*1,$len 3231 jb .Less_than_64_16x 3232 vpxord ($inp),$xa0,$xa0 # xor with input 3233 vmovdqu32 $xa0,($out,$inp) 3234 je .Ldone16x 3235 vmovdqa32 $xb0,$xa0 3236 lea 64($inp),$inp 3237 3238 cmp \$64*2,$len 3239 jb .Less_than_64_16x 3240 vpxord ($inp),$xb0,$xb0 3241 vmovdqu32 $xb0,($out,$inp) 3242 je .Ldone16x 3243 vmovdqa32 $xc0,$xa0 3244 lea 64($inp),$inp 3245 3246 cmp \$64*3,$len 3247 jb .Less_than_64_16x 3248 vpxord ($inp),$xc0,$xc0 3249 vmovdqu32 $xc0,($out,$inp) 3250 je .Ldone16x 3251 vmovdqa32 $xd0,$xa0 3252 lea 64($inp),$inp 3253 3254 cmp \$64*4,$len 3255 jb .Less_than_64_16x 3256 vpxord ($inp),$xd0,$xd0 3257 vmovdqu32 $xd0,($out,$inp) 3258 je .Ldone16x 3259 vmovdqa32 $xa1,$xa0 3260 lea 64($inp),$inp 3261 3262 cmp \$64*5,$len 3263 jb .Less_than_64_16x 3264 vpxord ($inp),$xa1,$xa1 3265 vmovdqu32 $xa1,($out,$inp) 3266 je .Ldone16x 3267 vmovdqa32 $xb1,$xa0 3268 lea 64($inp),$inp 3269 3270 cmp \$64*6,$len 3271 jb .Less_than_64_16x 3272 vpxord ($inp),$xb1,$xb1 3273 vmovdqu32 $xb1,($out,$inp) 3274 je .Ldone16x 3275 vmovdqa32 $xc1,$xa0 3276 lea 64($inp),$inp 3277 3278 cmp \$64*7,$len 3279 jb .Less_than_64_16x 3280 vpxord ($inp),$xc1,$xc1 3281 vmovdqu32 $xc1,($out,$inp) 3282 je .Ldone16x 3283 vmovdqa32 $xd1,$xa0 3284 lea 64($inp),$inp 3285 3286 cmp \$64*8,$len 3287 jb .Less_than_64_16x 3288 vpxord ($inp),$xd1,$xd1 3289 vmovdqu32 $xd1,($out,$inp) 3290 je .Ldone16x 3291 vmovdqa32 $xa2,$xa0 3292 lea 64($inp),$inp 3293 3294 cmp \$64*9,$len 3295 jb .Less_than_64_16x 3296 vpxord ($inp),$xa2,$xa2 3297 vmovdqu32 $xa2,($out,$inp) 3298 je .Ldone16x 3299 vmovdqa32 $xb2,$xa0 3300 lea 64($inp),$inp 3301 3302 cmp \$64*10,$len 3303 jb .Less_than_64_16x 3304 vpxord ($inp),$xb2,$xb2 3305 vmovdqu32 $xb2,($out,$inp) 3306 je .Ldone16x 3307 vmovdqa32 $xc2,$xa0 3308 lea 64($inp),$inp 3309 3310 cmp \$64*11,$len 3311 jb .Less_than_64_16x 3312 vpxord ($inp),$xc2,$xc2 3313 vmovdqu32 $xc2,($out,$inp) 3314 je .Ldone16x 3315 vmovdqa32 $xd2,$xa0 3316 lea 64($inp),$inp 3317 3318 cmp \$64*12,$len 3319 jb .Less_than_64_16x 3320 vpxord ($inp),$xd2,$xd2 3321 vmovdqu32 $xd2,($out,$inp) 3322 je .Ldone16x 3323 vmovdqa32 $xa3,$xa0 3324 lea 64($inp),$inp 3325 3326 cmp \$64*13,$len 3327 jb .Less_than_64_16x 3328 vpxord ($inp),$xa3,$xa3 3329 vmovdqu32 $xa3,($out,$inp) 3330 je .Ldone16x 3331 vmovdqa32 $xb3,$xa0 3332 lea 64($inp),$inp 3333 3334 cmp \$64*14,$len 3335 jb .Less_than_64_16x 3336 vpxord ($inp),$xb3,$xb3 3337 vmovdqu32 $xb3,($out,$inp) 3338 je .Ldone16x 3339 vmovdqa32 $xc3,$xa0 3340 lea 64($inp),$inp 3341 3342 cmp \$64*15,$len 3343 jb .Less_than_64_16x 3344 vpxord ($inp),$xc3,$xc3 3345 vmovdqu32 $xc3,($out,$inp) 3346 je .Ldone16x 3347 vmovdqa32 $xd3,$xa0 3348 lea 64($inp),$inp 3349 3350.Less_than_64_16x: 3351 vmovdqa32 $xa0,0x00(%rsp) 3352 lea ($out,$inp),$out 3353 and \$63,$len 3354 3355.Loop_tail16x: 3356 movzb ($inp,%r10),%eax 3357 movzb (%rsp,%r10),%ecx 3358 lea 1(%r10),%r10 3359 xor %ecx,%eax 3360 mov %al,-1($out,%r10) 3361 dec $len 3362 jnz .Loop_tail16x 3363 3364 vpxord $xa0,$xa0,$xa0 3365 vmovdqa32 $xa0,0(%rsp) 3366 3367.Ldone16x: 3368 vzeroall 3369___ 3370$code.=<<___ if ($win64); 3371 movaps -0xa8(%r9),%xmm6 3372 movaps -0x98(%r9),%xmm7 3373 movaps -0x88(%r9),%xmm8 3374 movaps -0x78(%r9),%xmm9 3375 movaps -0x68(%r9),%xmm10 3376 movaps -0x58(%r9),%xmm11 3377 movaps -0x48(%r9),%xmm12 3378 movaps -0x38(%r9),%xmm13 3379 movaps -0x28(%r9),%xmm14 3380 movaps -0x18(%r9),%xmm15 3381___ 3382$code.=<<___; 3383 lea (%r9),%rsp 3384.cfi_def_cfa_register %rsp 3385.L16x_epilogue: 3386 ret 3387.cfi_endproc 3388.size 
ChaCha20_16x,.-ChaCha20_16x 3389___ 3390 3391# switch to %ymm domain 3392($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3393 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); 3394@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3395 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3396@key=map("%ymm$_",(16..31)); 3397($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 3398 3399$code.=<<___; 3400.type ChaCha20_8xvl,\@function,5 3401.align 32 3402ChaCha20_8xvl: 3403.cfi_startproc 3404.LChaCha20_8xvl: 3405 mov %rsp,%r9 # frame register 3406.cfi_def_cfa_register %r9 3407 sub \$64+$xframe,%rsp 3408 and \$-64,%rsp 3409___ 3410$code.=<<___ if ($win64); 3411 movaps %xmm6,-0xa8(%r9) 3412 movaps %xmm7,-0x98(%r9) 3413 movaps %xmm8,-0x88(%r9) 3414 movaps %xmm9,-0x78(%r9) 3415 movaps %xmm10,-0x68(%r9) 3416 movaps %xmm11,-0x58(%r9) 3417 movaps %xmm12,-0x48(%r9) 3418 movaps %xmm13,-0x38(%r9) 3419 movaps %xmm14,-0x28(%r9) 3420 movaps %xmm15,-0x18(%r9) 3421.L8xvl_body: 3422___ 3423$code.=<<___; 3424 vzeroupper 3425 3426 lea .Lsigma(%rip),%r10 3427 vbroadcasti128 (%r10),$xa3 # key[0] 3428 vbroadcasti128 ($key),$xb3 # key[1] 3429 vbroadcasti128 16($key),$xc3 # key[2] 3430 vbroadcasti128 ($counter),$xd3 # key[3] 3431 3432 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 3433 vpshufd \$0x55,$xa3,$xa1 3434 vpshufd \$0xaa,$xa3,$xa2 3435 vpshufd \$0xff,$xa3,$xa3 3436 vmovdqa64 $xa0,@key[0] 3437 vmovdqa64 $xa1,@key[1] 3438 vmovdqa64 $xa2,@key[2] 3439 vmovdqa64 $xa3,@key[3] 3440 3441 vpshufd \$0x00,$xb3,$xb0 3442 vpshufd \$0x55,$xb3,$xb1 3443 vpshufd \$0xaa,$xb3,$xb2 3444 vpshufd \$0xff,$xb3,$xb3 3445 vmovdqa64 $xb0,@key[4] 3446 vmovdqa64 $xb1,@key[5] 3447 vmovdqa64 $xb2,@key[6] 3448 vmovdqa64 $xb3,@key[7] 3449 3450 vpshufd \$0x00,$xc3,$xc0 3451 vpshufd \$0x55,$xc3,$xc1 3452 vpshufd \$0xaa,$xc3,$xc2 3453 vpshufd \$0xff,$xc3,$xc3 3454 vmovdqa64 $xc0,@key[8] 3455 vmovdqa64 $xc1,@key[9] 3456 vmovdqa64 $xc2,@key[10] 3457 vmovdqa64 $xc3,@key[11] 3458 3459 vpshufd \$0x00,$xd3,$xd0 3460 vpshufd \$0x55,$xd3,$xd1 3461 vpshufd \$0xaa,$xd3,$xd2 3462 vpshufd \$0xff,$xd3,$xd3 3463 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 3464 vmovdqa64 $xd0,@key[12] 3465 vmovdqa64 $xd1,@key[13] 3466 vmovdqa64 $xd2,@key[14] 3467 vmovdqa64 $xd3,@key[15] 3468 3469 mov \$10,%eax 3470 jmp .Loop8xvl 3471 3472.align 32 3473.Loop_outer8xvl: 3474 #vpbroadcastd 0(%r10),$xa0 # reload key 3475 #vpbroadcastd 4(%r10),$xa1 3476 vpbroadcastd 8(%r10),$xa2 3477 vpbroadcastd 12(%r10),$xa3 3478 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters 3479 vmovdqa64 @key[4],$xb0 3480 vmovdqa64 @key[5],$xb1 3481 vmovdqa64 @key[6],$xb2 3482 vmovdqa64 @key[7],$xb3 3483 vmovdqa64 @key[8],$xc0 3484 vmovdqa64 @key[9],$xc1 3485 vmovdqa64 @key[10],$xc2 3486 vmovdqa64 @key[11],$xc3 3487 vmovdqa64 @key[12],$xd0 3488 vmovdqa64 @key[13],$xd1 3489 vmovdqa64 @key[14],$xd2 3490 vmovdqa64 @key[15],$xd3 3491 3492 vmovdqa64 $xa0,@key[0] 3493 vmovdqa64 $xa1,@key[1] 3494 vmovdqa64 $xa2,@key[2] 3495 vmovdqa64 $xa3,@key[3] 3496 3497 mov \$10,%eax 3498 jmp .Loop8xvl 3499 3500.align 32 3501.Loop8xvl: 3502___ 3503 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3504 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3505$code.=<<___; 3506 dec %eax 3507 jnz .Loop8xvl 3508 3509 vpaddd @key[0],$xa0,$xa0 # accumulate key 3510 vpaddd @key[1],$xa1,$xa1 3511 vpaddd @key[2],$xa2,$xa2 3512 vpaddd @key[3],$xa3,$xa3 3513 3514 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3515 vpunpckldq $xa3,$xa2,$xt3 3516 vpunpckhdq $xa1,$xa0,$xa0 3517 vpunpckhdq $xa3,$xa2,$xa2 3518 vpunpcklqdq 
$xt3,$xt2,$xa1 # "a0" 3519 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3520 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3521 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3522___ 3523 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3524$code.=<<___; 3525 vpaddd @key[4],$xb0,$xb0 3526 vpaddd @key[5],$xb1,$xb1 3527 vpaddd @key[6],$xb2,$xb2 3528 vpaddd @key[7],$xb3,$xb3 3529 3530 vpunpckldq $xb1,$xb0,$xt2 3531 vpunpckldq $xb3,$xb2,$xt3 3532 vpunpckhdq $xb1,$xb0,$xb0 3533 vpunpckhdq $xb3,$xb2,$xb2 3534 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3535 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3536 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3537 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3538___ 3539 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3540$code.=<<___; 3541 vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further 3542 vshufi32x4 \$3,$xb0,$xa0,$xb0 3543 vshufi32x4 \$0,$xb1,$xa1,$xa0 3544 vshufi32x4 \$3,$xb1,$xa1,$xb1 3545 vshufi32x4 \$0,$xb2,$xa2,$xa1 3546 vshufi32x4 \$3,$xb2,$xa2,$xb2 3547 vshufi32x4 \$0,$xb3,$xa3,$xa2 3548 vshufi32x4 \$3,$xb3,$xa3,$xb3 3549___ 3550 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3551$code.=<<___; 3552 vpaddd @key[8],$xc0,$xc0 3553 vpaddd @key[9],$xc1,$xc1 3554 vpaddd @key[10],$xc2,$xc2 3555 vpaddd @key[11],$xc3,$xc3 3556 3557 vpunpckldq $xc1,$xc0,$xt2 3558 vpunpckldq $xc3,$xc2,$xt3 3559 vpunpckhdq $xc1,$xc0,$xc0 3560 vpunpckhdq $xc3,$xc2,$xc2 3561 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3562 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3563 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3564 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3565___ 3566 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3567$code.=<<___; 3568 vpaddd @key[12],$xd0,$xd0 3569 vpaddd @key[13],$xd1,$xd1 3570 vpaddd @key[14],$xd2,$xd2 3571 vpaddd @key[15],$xd3,$xd3 3572 3573 vpunpckldq $xd1,$xd0,$xt2 3574 vpunpckldq $xd3,$xd2,$xt3 3575 vpunpckhdq $xd1,$xd0,$xd0 3576 vpunpckhdq $xd3,$xd2,$xd2 3577 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3578 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3579 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3580 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3581___ 3582 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3583$code.=<<___; 3584 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 3585 vperm2i128 \$0x31,$xd0,$xc0,$xd0 3586 vperm2i128 \$0x20,$xd1,$xc1,$xc0 3587 vperm2i128 \$0x31,$xd1,$xc1,$xd1 3588 vperm2i128 \$0x20,$xd2,$xc2,$xc1 3589 vperm2i128 \$0x31,$xd2,$xc2,$xd2 3590 vperm2i128 \$0x20,$xd3,$xc3,$xc2 3591 vperm2i128 \$0x31,$xd3,$xc3,$xd3 3592___ 3593 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3594 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 3595 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 3596$code.=<<___; 3597 cmp \$64*8,$len 3598 jb .Ltail8xvl 3599 3600 mov \$0x80,%eax # size optimization 3601 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3602 vpxor 0x20($inp),$xb0,$xb0 3603 vpxor 0x40($inp),$xc0,$xc0 3604 vpxor 0x60($inp),$xd0,$xd0 3605 lea ($inp,%rax),$inp # size optimization 3606 vmovdqu32 $xa0,0x00($out) 3607 vmovdqu $xb0,0x20($out) 3608 vmovdqu $xc0,0x40($out) 3609 vmovdqu $xd0,0x60($out) 3610 lea ($out,%rax),$out # size optimization 3611 3612 vpxor 0x00($inp),$xa1,$xa1 3613 vpxor 0x20($inp),$xb1,$xb1 3614 vpxor 0x40($inp),$xc1,$xc1 3615 vpxor 0x60($inp),$xd1,$xd1 3616 lea ($inp,%rax),$inp # size optimization 3617 vmovdqu $xa1,0x00($out) 3618 vmovdqu $xb1,0x20($out) 3619 vmovdqu $xc1,0x40($out) 3620 vmovdqu $xd1,0x60($out) 3621 lea ($out,%rax),$out # size optimization 3622 3623 vpxord 0x00($inp),$xa2,$xa2 3624 vpxor 0x20($inp),$xb2,$xb2 3625 vpxor 0x40($inp),$xc2,$xc2 3626 vpxor 0x60($inp),$xd2,$xd2 3627 lea ($inp,%rax),$inp # 
size optimization 3628 vmovdqu32 $xa2,0x00($out) 3629 vmovdqu $xb2,0x20($out) 3630 vmovdqu $xc2,0x40($out) 3631 vmovdqu $xd2,0x60($out) 3632 lea ($out,%rax),$out # size optimization 3633 3634 vpxor 0x00($inp),$xa3,$xa3 3635 vpxor 0x20($inp),$xb3,$xb3 3636 vpxor 0x40($inp),$xc3,$xc3 3637 vpxor 0x60($inp),$xd3,$xd3 3638 lea ($inp,%rax),$inp # size optimization 3639 vmovdqu $xa3,0x00($out) 3640 vmovdqu $xb3,0x20($out) 3641 vmovdqu $xc3,0x40($out) 3642 vmovdqu $xd3,0x60($out) 3643 lea ($out,%rax),$out # size optimization 3644 3645 vpbroadcastd 0(%r10),%ymm0 # reload key 3646 vpbroadcastd 4(%r10),%ymm1 3647 3648 sub \$64*8,$len 3649 jnz .Loop_outer8xvl 3650 3651 jmp .Ldone8xvl 3652 3653.align 32 3654.Ltail8xvl: 3655 vmovdqa64 $xa0,%ymm8 # size optimization 3656___ 3657$xa0 = "%ymm8"; 3658$code.=<<___; 3659 xor %r10,%r10 3660 sub $inp,$out 3661 cmp \$64*1,$len 3662 jb .Less_than_64_8xvl 3663 vpxor 0x00($inp),$xa0,$xa0 # xor with input 3664 vpxor 0x20($inp),$xb0,$xb0 3665 vmovdqu $xa0,0x00($out,$inp) 3666 vmovdqu $xb0,0x20($out,$inp) 3667 je .Ldone8xvl 3668 vmovdqa $xc0,$xa0 3669 vmovdqa $xd0,$xb0 3670 lea 64($inp),$inp 3671 3672 cmp \$64*2,$len 3673 jb .Less_than_64_8xvl 3674 vpxor 0x00($inp),$xc0,$xc0 3675 vpxor 0x20($inp),$xd0,$xd0 3676 vmovdqu $xc0,0x00($out,$inp) 3677 vmovdqu $xd0,0x20($out,$inp) 3678 je .Ldone8xvl 3679 vmovdqa $xa1,$xa0 3680 vmovdqa $xb1,$xb0 3681 lea 64($inp),$inp 3682 3683 cmp \$64*3,$len 3684 jb .Less_than_64_8xvl 3685 vpxor 0x00($inp),$xa1,$xa1 3686 vpxor 0x20($inp),$xb1,$xb1 3687 vmovdqu $xa1,0x00($out,$inp) 3688 vmovdqu $xb1,0x20($out,$inp) 3689 je .Ldone8xvl 3690 vmovdqa $xc1,$xa0 3691 vmovdqa $xd1,$xb0 3692 lea 64($inp),$inp 3693 3694 cmp \$64*4,$len 3695 jb .Less_than_64_8xvl 3696 vpxor 0x00($inp),$xc1,$xc1 3697 vpxor 0x20($inp),$xd1,$xd1 3698 vmovdqu $xc1,0x00($out,$inp) 3699 vmovdqu $xd1,0x20($out,$inp) 3700 je .Ldone8xvl 3701 vmovdqa32 $xa2,$xa0 3702 vmovdqa $xb2,$xb0 3703 lea 64($inp),$inp 3704 3705 cmp \$64*5,$len 3706 jb .Less_than_64_8xvl 3707 vpxord 0x00($inp),$xa2,$xa2 3708 vpxor 0x20($inp),$xb2,$xb2 3709 vmovdqu32 $xa2,0x00($out,$inp) 3710 vmovdqu $xb2,0x20($out,$inp) 3711 je .Ldone8xvl 3712 vmovdqa $xc2,$xa0 3713 vmovdqa $xd2,$xb0 3714 lea 64($inp),$inp 3715 3716 cmp \$64*6,$len 3717 jb .Less_than_64_8xvl 3718 vpxor 0x00($inp),$xc2,$xc2 3719 vpxor 0x20($inp),$xd2,$xd2 3720 vmovdqu $xc2,0x00($out,$inp) 3721 vmovdqu $xd2,0x20($out,$inp) 3722 je .Ldone8xvl 3723 vmovdqa $xa3,$xa0 3724 vmovdqa $xb3,$xb0 3725 lea 64($inp),$inp 3726 3727 cmp \$64*7,$len 3728 jb .Less_than_64_8xvl 3729 vpxor 0x00($inp),$xa3,$xa3 3730 vpxor 0x20($inp),$xb3,$xb3 3731 vmovdqu $xa3,0x00($out,$inp) 3732 vmovdqu $xb3,0x20($out,$inp) 3733 je .Ldone8xvl 3734 vmovdqa $xc3,$xa0 3735 vmovdqa $xd3,$xb0 3736 lea 64($inp),$inp 3737 3738.Less_than_64_8xvl: 3739 vmovdqa $xa0,0x00(%rsp) 3740 vmovdqa $xb0,0x20(%rsp) 3741 lea ($out,$inp),$out 3742 and \$63,$len 3743 3744.Loop_tail8xvl: 3745 movzb ($inp,%r10),%eax 3746 movzb (%rsp,%r10),%ecx 3747 lea 1(%r10),%r10 3748 xor %ecx,%eax 3749 mov %al,-1($out,%r10) 3750 dec $len 3751 jnz .Loop_tail8xvl 3752 3753 vpxor $xa0,$xa0,$xa0 3754 vmovdqa $xa0,0x00(%rsp) 3755 vmovdqa $xa0,0x20(%rsp) 3756 3757.Ldone8xvl: 3758 vzeroall 3759___ 3760$code.=<<___ if ($win64); 3761 movaps -0xa8(%r9),%xmm6 3762 movaps -0x98(%r9),%xmm7 3763 movaps -0x88(%r9),%xmm8 3764 movaps -0x78(%r9),%xmm9 3765 movaps -0x68(%r9),%xmm10 3766 movaps -0x58(%r9),%xmm11 3767 movaps -0x48(%r9),%xmm12 3768 movaps -0x38(%r9),%xmm13 3769 movaps -0x28(%r9),%xmm14 3770 movaps -0x18(%r9),%xmm15 
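#
# vzeroall above zeroed the full xmm6-xmm15 set, which the Win64 ABI
# treats as nonvolatile; the movaps loads restore them from the save area
# the prologue carved out below the frame pointer.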
3771___ 3772$code.=<<___; 3773 lea (%r9),%rsp 3774.cfi_def_cfa_register %rsp 3775.L8xvl_epilogue: 3776 ret 3777.cfi_endproc 3778.size ChaCha20_8xvl,.-ChaCha20_8xvl 3779___ 3780} 3781 3782# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3783# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3784if ($win64) { 3785$rec="%rcx"; 3786$frame="%rdx"; 3787$context="%r8"; 3788$disp="%r9"; 3789 3790$code.=<<___; 3791.extern __imp_RtlVirtualUnwind 3792.type se_handler,\@abi-omnipotent 3793.align 16 3794se_handler: 3795 push %rsi 3796 push %rdi 3797 push %rbx 3798 push %rbp 3799 push %r12 3800 push %r13 3801 push %r14 3802 push %r15 3803 pushfq 3804 sub \$64,%rsp 3805 3806 mov 120($context),%rax # pull context->Rax 3807 mov 248($context),%rbx # pull context->Rip 3808 3809 mov 8($disp),%rsi # disp->ImageBase 3810 mov 56($disp),%r11 # disp->HandlerData 3811 3812 lea .Lctr32_body(%rip),%r10 3813 cmp %r10,%rbx # context->Rip<.Lprologue 3814 jb .Lcommon_seh_tail 3815 3816 mov 152($context),%rax # pull context->Rsp 3817 3818 lea .Lno_data(%rip),%r10 # epilogue label 3819 cmp %r10,%rbx # context->Rip>=.Lepilogue 3820 jae .Lcommon_seh_tail 3821 3822 lea 64+24+48(%rax),%rax 3823 3824 mov -8(%rax),%rbx 3825 mov -16(%rax),%rbp 3826 mov -24(%rax),%r12 3827 mov -32(%rax),%r13 3828 mov -40(%rax),%r14 3829 mov -48(%rax),%r15 3830 mov %rbx,144($context) # restore context->Rbx 3831 mov %rbp,160($context) # restore context->Rbp 3832 mov %r12,216($context) # restore context->R12 3833 mov %r13,224($context) # restore context->R13 3834 mov %r14,232($context) # restore context->R14 3835 mov %r15,240($context) # restore context->R14 3836 3837.Lcommon_seh_tail: 3838 mov 8(%rax),%rdi 3839 mov 16(%rax),%rsi 3840 mov %rax,152($context) # restore context->Rsp 3841 mov %rsi,168($context) # restore context->Rsi 3842 mov %rdi,176($context) # restore context->Rdi 3843 3844 mov 40($disp),%rdi # disp->ContextRecord 3845 mov $context,%rsi # context 3846 mov \$154,%ecx # sizeof(CONTEXT) 3847 .long 0xa548f3fc # cld; rep movsq 3848 3849 mov $disp,%rsi 3850 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3851 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3852 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3853 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3854 mov 40(%rsi),%r10 # disp->ContextRecord 3855 lea 56(%rsi),%r11 # &disp->HandlerData 3856 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3857 mov %r10,32(%rsp) # arg5 3858 mov %r11,40(%rsp) # arg6 3859 mov %r12,48(%rsp) # arg7 3860 mov %rcx,56(%rsp) # arg8, (NULL) 3861 call *__imp_RtlVirtualUnwind(%rip) 3862 3863 mov \$1,%eax # ExceptionContinueSearch 3864 add \$64,%rsp 3865 popfq 3866 pop %r15 3867 pop %r14 3868 pop %r13 3869 pop %r12 3870 pop %rbp 3871 pop %rbx 3872 pop %rdi 3873 pop %rsi 3874 ret 3875.size se_handler,.-se_handler 3876 3877.type simd_handler,\@abi-omnipotent 3878.align 16 3879simd_handler: 3880 push %rsi 3881 push %rdi 3882 push %rbx 3883 push %rbp 3884 push %r12 3885 push %r13 3886 push %r14 3887 push %r15 3888 pushfq 3889 sub \$64,%rsp 3890 3891 mov 120($context),%rax # pull context->Rax 3892 mov 248($context),%rbx # pull context->Rip 3893 3894 mov 8($disp),%rsi # disp->ImageBase 3895 mov 56($disp),%r11 # disp->HandlerData 3896 3897 mov 0(%r11),%r10d # HandlerData[0] 3898 lea (%rsi,%r10),%r10 # prologue label 3899 cmp %r10,%rbx # context->Rip<prologue label 3900 jb .Lcommon_seh_tail 3901 3902 mov 192($context),%rax # pull context->R9 3903 3904 mov 4(%r11),%r10d # HandlerData[1] 3905 mov 8(%r11),%ecx # HandlerData[2] 3906 lea (%rsi,%r10),%r10 # epilogue label 3907 cmp 
%r10,%rbx # context->Rip>=epilogue label 3908 jae .Lcommon_seh_tail 3909 3910 neg %rcx 3911 lea -8(%rax,%rcx),%rsi 3912 lea 512($context),%rdi # &context.Xmm6 3913 neg %ecx 3914 shr \$3,%ecx 3915 .long 0xa548f3fc # cld; rep movsq 3916 3917 jmp .Lcommon_seh_tail 3918.size simd_handler,.-simd_handler 3919 3920.section .pdata 3921.align 4 3922 .rva .LSEH_begin_ChaCha20_ctr32 3923 .rva .LSEH_end_ChaCha20_ctr32 3924 .rva .LSEH_info_ChaCha20_ctr32 3925 3926 .rva .LSEH_begin_ChaCha20_ssse3 3927 .rva .LSEH_end_ChaCha20_ssse3 3928 .rva .LSEH_info_ChaCha20_ssse3 3929 3930 .rva .LSEH_begin_ChaCha20_128 3931 .rva .LSEH_end_ChaCha20_128 3932 .rva .LSEH_info_ChaCha20_128 3933 3934 .rva .LSEH_begin_ChaCha20_4x 3935 .rva .LSEH_end_ChaCha20_4x 3936 .rva .LSEH_info_ChaCha20_4x 3937___ 3938$code.=<<___ if ($avx); 3939 .rva .LSEH_begin_ChaCha20_4xop 3940 .rva .LSEH_end_ChaCha20_4xop 3941 .rva .LSEH_info_ChaCha20_4xop 3942___ 3943$code.=<<___ if ($avx>1); 3944 .rva .LSEH_begin_ChaCha20_8x 3945 .rva .LSEH_end_ChaCha20_8x 3946 .rva .LSEH_info_ChaCha20_8x 3947___ 3948$code.=<<___ if ($avx>2); 3949 .rva .LSEH_begin_ChaCha20_avx512 3950 .rva .LSEH_end_ChaCha20_avx512 3951 .rva .LSEH_info_ChaCha20_avx512 3952 3953 .rva .LSEH_begin_ChaCha20_avx512vl 3954 .rva .LSEH_end_ChaCha20_avx512vl 3955 .rva .LSEH_info_ChaCha20_avx512vl 3956 3957 .rva .LSEH_begin_ChaCha20_16x 3958 .rva .LSEH_end_ChaCha20_16x 3959 .rva .LSEH_info_ChaCha20_16x 3960 3961 .rva .LSEH_begin_ChaCha20_8xvl 3962 .rva .LSEH_end_ChaCha20_8xvl 3963 .rva .LSEH_info_ChaCha20_8xvl 3964___ 3965$code.=<<___; 3966.section .xdata 3967.align 8 3968.LSEH_info_ChaCha20_ctr32: 3969 .byte 9,0,0,0 3970 .rva se_handler 3971 3972.LSEH_info_ChaCha20_ssse3: 3973 .byte 9,0,0,0 3974 .rva simd_handler 3975 .rva .Lssse3_body,.Lssse3_epilogue 3976 .long 0x20,0 3977 3978.LSEH_info_ChaCha20_128: 3979 .byte 9,0,0,0 3980 .rva simd_handler 3981 .rva .L128_body,.L128_epilogue 3982 .long 0x60,0 3983 3984.LSEH_info_ChaCha20_4x: 3985 .byte 9,0,0,0 3986 .rva simd_handler 3987 .rva .L4x_body,.L4x_epilogue 3988 .long 0xa0,0 3989___ 3990$code.=<<___ if ($avx); 3991.LSEH_info_ChaCha20_4xop: 3992 .byte 9,0,0,0 3993 .rva simd_handler 3994 .rva .L4xop_body,.L4xop_epilogue # HandlerData[] 3995 .long 0xa0,0 3996___ 3997$code.=<<___ if ($avx>1); 3998.LSEH_info_ChaCha20_8x: 3999 .byte 9,0,0,0 4000 .rva simd_handler 4001 .rva .L8x_body,.L8x_epilogue # HandlerData[] 4002 .long 0xa0,0 4003___ 4004$code.=<<___ if ($avx>2); 4005.LSEH_info_ChaCha20_avx512: 4006 .byte 9,0,0,0 4007 .rva simd_handler 4008 .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] 4009 .long 0x20,0 4010 4011.LSEH_info_ChaCha20_avx512vl: 4012 .byte 9,0,0,0 4013 .rva simd_handler 4014 .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] 4015 .long 0x20,0 4016 4017.LSEH_info_ChaCha20_16x: 4018 .byte 9,0,0,0 4019 .rva simd_handler 4020 .rva .L16x_body,.L16x_epilogue # HandlerData[] 4021 .long 0xa0,0 4022 4023.LSEH_info_ChaCha20_8xvl: 4024 .byte 9,0,0,0 4025 .rva simd_handler 4026 .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] 4027 .long 0xa0,0 4028___ 4029} 4030 4031foreach (split("\n",$code)) { 4032 s/\`([^\`]*)\`/eval $1/ge; 4033 4034 s/%x#%[yz]/%x/g; # "down-shift" 4035 4036 print $_,"\n"; 4037} 4038 4039close STDOUT or die "error closing STDOUT: $!"; 4040
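# A note on the output pass above: each line of $code first has backquoted
# Perl expressions evaluated in place (folding constant stack offsets such
# as the 32*($c0-8) forms), after which the "down-shift" substitution
# s/%x#%[yz]/%x/g rewrites a reference like "%x#%zmm0" into "%xmm0", so a
# single template can name the xmm part of a wider register. For example,
# in the short AVX512 path, where $a is %zmm0, $t0 is %xmm4 and $inp is
# %rsi, the template line
#
#	vpxor	0x00($inp),%x#$a,$t0
#
# interpolates to "vpxor 0x00(%rsi),%x#%zmm0,%xmm4" and is emitted as
#
#	vpxor	0x00(%rsi),%xmm0,%xmm4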