#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# December 2017
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42        1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31        1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40        2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is code path optimized specifically for 128 bytes used
#	by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20
#	and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in single thread, the corresponding capability is suppressed;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# input parameter block
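# The five names assigned below follow the exported routine's C prototype
# (as the routine is declared elsewhere in OpenSSL; reproduced here only
# for orientation, not generated by this file):
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#	                    size_t len, const unsigned int key[8],
#	                    const unsigned int counter[4]);
#
# i.e. out/inp/len/key/counter arrive in %rdi/%rsi/%rdx/%rcx/%r8 per the
# SysV AMD64 calling convention; the Win64 argument translation is left
# to x86_64-xlate.pl.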
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider the order in which the state words are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' words are permanently allocated in registers,
	# @x[0..7,12..15], while the 'c' words are maintained in memory. If
	# you look at the 'c' column, you'll notice that the pair of 'c's in
	# use at the end of one round is the same pair needed at the start
	# of the next, so they only have to be reloaded once per round, in
	# the middle. This is why you'll see a bunch of 'c' stores and loads
	# in the middle, but none at the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Out-of-order cores generally manage it gracefully, but
	# not this time for some reason. As in-order cores are a dying breed
	# and old Atom is the only one still around, the instructions are
	# left uninterleaved. Besides, Atom is better off executing the
	# 1xSSSE3 code anyway...
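	# For reference, each add/xor/rol triplet emitted below is one step
	# of the standard ChaCha20 quarter-round (cf. RFC 7539), in rough
	# pseudo-code:
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<=  8;
	#	c += d; b ^= c; b <<<=  7;
	#
	# Two quarter-rounds (Q1/Q2, then Q3/Q4) are processed side by side,
	# because only the two spare registers in @t are available to hold
	# their 'c' words.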
183 184 ( 185 "&add (@x[$a0],@x[$b0])", # Q1 186 "&xor (@x[$d0],@x[$a0])", 187 "&rol (@x[$d0],16)", 188 "&add (@x[$a1],@x[$b1])", # Q2 189 "&xor (@x[$d1],@x[$a1])", 190 "&rol (@x[$d1],16)", 191 192 "&add ($xc,@x[$d0])", 193 "&xor (@x[$b0],$xc)", 194 "&rol (@x[$b0],12)", 195 "&add ($xc_,@x[$d1])", 196 "&xor (@x[$b1],$xc_)", 197 "&rol (@x[$b1],12)", 198 199 "&add (@x[$a0],@x[$b0])", 200 "&xor (@x[$d0],@x[$a0])", 201 "&rol (@x[$d0],8)", 202 "&add (@x[$a1],@x[$b1])", 203 "&xor (@x[$d1],@x[$a1])", 204 "&rol (@x[$d1],8)", 205 206 "&add ($xc,@x[$d0])", 207 "&xor (@x[$b0],$xc)", 208 "&rol (@x[$b0],7)", 209 "&add ($xc_,@x[$d1])", 210 "&xor (@x[$b1],$xc_)", 211 "&rol (@x[$b1],7)", 212 213 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's 214 "&mov (\"4*$c1(%rsp)\",$xc_)", 215 "&mov ($xc,\"4*$c2(%rsp)\")", 216 "&mov ($xc_,\"4*$c3(%rsp)\")", 217 218 "&add (@x[$a2],@x[$b2])", # Q3 219 "&xor (@x[$d2],@x[$a2])", 220 "&rol (@x[$d2],16)", 221 "&add (@x[$a3],@x[$b3])", # Q4 222 "&xor (@x[$d3],@x[$a3])", 223 "&rol (@x[$d3],16)", 224 225 "&add ($xc,@x[$d2])", 226 "&xor (@x[$b2],$xc)", 227 "&rol (@x[$b2],12)", 228 "&add ($xc_,@x[$d3])", 229 "&xor (@x[$b3],$xc_)", 230 "&rol (@x[$b3],12)", 231 232 "&add (@x[$a2],@x[$b2])", 233 "&xor (@x[$d2],@x[$a2])", 234 "&rol (@x[$d2],8)", 235 "&add (@x[$a3],@x[$b3])", 236 "&xor (@x[$d3],@x[$a3])", 237 "&rol (@x[$d3],8)", 238 239 "&add ($xc,@x[$d2])", 240 "&xor (@x[$b2],$xc)", 241 "&rol (@x[$b2],7)", 242 "&add ($xc_,@x[$d3])", 243 "&xor (@x[$b3],$xc_)", 244 "&rol (@x[$b3],7)" 245 ); 246} 247 248######################################################################## 249# Generic code path that handles all lengths on pre-SSSE3 processors. 250$code.=<<___; 251.globl ChaCha20_ctr32 252.type ChaCha20_ctr32,\@function,5 253.align 64 254ChaCha20_ctr32: 255.cfi_startproc 256 cmp \$0,$len 257 je .Lno_data 258 mov OPENSSL_ia32cap_P+4(%rip),%r10 259___ 260$code.=<<___ if ($avx>2); 261 bt \$48,%r10 # check for AVX512F 262 jc .LChaCha20_avx512 263 test %r10,%r10 # check for AVX512VL 264 js .LChaCha20_avx512vl 265___ 266$code.=<<___; 267 test \$`1<<(41-32)`,%r10d 268 jnz .LChaCha20_ssse3 269 270 push %rbx 271.cfi_push %rbx 272 push %rbp 273.cfi_push %rbp 274 push %r12 275.cfi_push %r12 276 push %r13 277.cfi_push %r13 278 push %r14 279.cfi_push %r14 280 push %r15 281.cfi_push %r15 282 sub \$64+24,%rsp 283.cfi_adjust_cfa_offset 64+24 284.Lctr32_body: 285 286 #movdqa .Lsigma(%rip),%xmm0 287 movdqu ($key),%xmm1 288 movdqu 16($key),%xmm2 289 movdqu ($counter),%xmm3 290 movdqa .Lone(%rip),%xmm4 291 292 #movdqa %xmm0,4*0(%rsp) # key[0] 293 movdqa %xmm1,4*4(%rsp) # key[1] 294 movdqa %xmm2,4*8(%rsp) # key[2] 295 movdqa %xmm3,4*12(%rsp) # key[3] 296 mov $len,%rbp # reassign $len 297 jmp .Loop_outer 298 299.align 32 300.Loop_outer: 301 mov \$0x61707865,@x[0] # 'expa' 302 mov \$0x3320646e,@x[1] # 'nd 3' 303 mov \$0x79622d32,@x[2] # '2-by' 304 mov \$0x6b206574,@x[3] # 'te k' 305 mov 4*4(%rsp),@x[4] 306 mov 4*5(%rsp),@x[5] 307 mov 4*6(%rsp),@x[6] 308 mov 4*7(%rsp),@x[7] 309 movd %xmm3,@x[12] 310 mov 4*13(%rsp),@x[13] 311 mov 4*14(%rsp),@x[14] 312 mov 4*15(%rsp),@x[15] 313 314 mov %rbp,64+0(%rsp) # save len 315 mov \$10,%ebp 316 mov $inp,64+8(%rsp) # save inp 317 movq %xmm2,%rsi # "@x[8]" 318 mov $out,64+16(%rsp) # save out 319 mov %rsi,%rdi 320 shr \$32,%rdi # "@x[9]" 321 jmp .Loop 322 323.align 32 324.Loop: 325___ 326 foreach (&ROUND (0, 4, 8,12)) { eval; } 327 foreach (&ROUND (0, 5,10,15)) { eval; } 328 &dec ("%ebp"); 329 &jnz (".Loop"); 330 331$code.=<<___; 332 mov @t[1],4*9(%rsp) # 
modulo-scheduled 333 mov @t[0],4*8(%rsp) 334 mov 64(%rsp),%rbp # load len 335 movdqa %xmm2,%xmm1 336 mov 64+8(%rsp),$inp # load inp 337 paddd %xmm4,%xmm3 # increment counter 338 mov 64+16(%rsp),$out # load out 339 340 add \$0x61707865,@x[0] # 'expa' 341 add \$0x3320646e,@x[1] # 'nd 3' 342 add \$0x79622d32,@x[2] # '2-by' 343 add \$0x6b206574,@x[3] # 'te k' 344 add 4*4(%rsp),@x[4] 345 add 4*5(%rsp),@x[5] 346 add 4*6(%rsp),@x[6] 347 add 4*7(%rsp),@x[7] 348 add 4*12(%rsp),@x[12] 349 add 4*13(%rsp),@x[13] 350 add 4*14(%rsp),@x[14] 351 add 4*15(%rsp),@x[15] 352 paddd 4*8(%rsp),%xmm1 353 354 cmp \$64,%rbp 355 jb .Ltail 356 357 xor 4*0($inp),@x[0] # xor with input 358 xor 4*1($inp),@x[1] 359 xor 4*2($inp),@x[2] 360 xor 4*3($inp),@x[3] 361 xor 4*4($inp),@x[4] 362 xor 4*5($inp),@x[5] 363 xor 4*6($inp),@x[6] 364 xor 4*7($inp),@x[7] 365 movdqu 4*8($inp),%xmm0 366 xor 4*12($inp),@x[12] 367 xor 4*13($inp),@x[13] 368 xor 4*14($inp),@x[14] 369 xor 4*15($inp),@x[15] 370 lea 4*16($inp),$inp # inp+=64 371 pxor %xmm1,%xmm0 372 373 movdqa %xmm2,4*8(%rsp) 374 movd %xmm3,4*12(%rsp) 375 376 mov @x[0],4*0($out) # write output 377 mov @x[1],4*1($out) 378 mov @x[2],4*2($out) 379 mov @x[3],4*3($out) 380 mov @x[4],4*4($out) 381 mov @x[5],4*5($out) 382 mov @x[6],4*6($out) 383 mov @x[7],4*7($out) 384 movdqu %xmm0,4*8($out) 385 mov @x[12],4*12($out) 386 mov @x[13],4*13($out) 387 mov @x[14],4*14($out) 388 mov @x[15],4*15($out) 389 lea 4*16($out),$out # out+=64 390 391 sub \$64,%rbp 392 jnz .Loop_outer 393 394 jmp .Ldone 395 396.align 16 397.Ltail: 398 mov @x[0],4*0(%rsp) 399 mov @x[1],4*1(%rsp) 400 xor %rbx,%rbx 401 mov @x[2],4*2(%rsp) 402 mov @x[3],4*3(%rsp) 403 mov @x[4],4*4(%rsp) 404 mov @x[5],4*5(%rsp) 405 mov @x[6],4*6(%rsp) 406 mov @x[7],4*7(%rsp) 407 movdqa %xmm1,4*8(%rsp) 408 mov @x[12],4*12(%rsp) 409 mov @x[13],4*13(%rsp) 410 mov @x[14],4*14(%rsp) 411 mov @x[15],4*15(%rsp) 412 413.Loop_tail: 414 movzb ($inp,%rbx),%eax 415 movzb (%rsp,%rbx),%edx 416 lea 1(%rbx),%rbx 417 xor %edx,%eax 418 mov %al,-1($out,%rbx) 419 dec %rbp 420 jnz .Loop_tail 421 422.Ldone: 423 lea 64+24+48(%rsp),%rsi 424.cfi_def_cfa %rsi,8 425 mov -48(%rsi),%r15 426.cfi_restore %r15 427 mov -40(%rsi),%r14 428.cfi_restore %r14 429 mov -32(%rsi),%r13 430.cfi_restore %r13 431 mov -24(%rsi),%r12 432.cfi_restore %r12 433 mov -16(%rsi),%rbp 434.cfi_restore %rbp 435 mov -8(%rsi),%rbx 436.cfi_restore %rbx 437 lea (%rsi),%rsp 438.cfi_def_cfa_register %rsp 439.Lno_data: 440 ret 441.cfi_endproc 442.size ChaCha20_ctr32,.-ChaCha20_ctr32 443___ 444 445######################################################################## 446# SSSE3 code path that handles shorter lengths 447{ 448my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); 449 450sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round 451 &paddd ($a,$b); 452 &pxor ($d,$a); 453 &pshufb ($d,$rot16); 454 455 &paddd ($c,$d); 456 &pxor ($b,$c); 457 &movdqa ($t,$b); 458 &psrld ($b,20); 459 &pslld ($t,12); 460 &por ($b,$t); 461 462 &paddd ($a,$b); 463 &pxor ($d,$a); 464 &pshufb ($d,$rot24); 465 466 &paddd ($c,$d); 467 &pxor ($b,$c); 468 &movdqa ($t,$b); 469 &psrld ($b,25); 470 &pslld ($t,7); 471 &por ($b,$t); 472} 473 474my $xframe = $win64 ? 
32+8 : 8; 475 476$code.=<<___; 477.type ChaCha20_ssse3,\@function,5 478.align 32 479ChaCha20_ssse3: 480.cfi_startproc 481.LChaCha20_ssse3: 482 mov %rsp,%r9 # frame pointer 483.cfi_def_cfa_register %r9 484___ 485$code.=<<___ if ($avx); 486 test \$`1<<(43-32)`,%r10d 487 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4 488___ 489$code.=<<___; 490 cmp \$128,$len # we might throw away some data, 491 je .LChaCha20_128 492 ja .LChaCha20_4x # but overall it won't be slower 493 494.Ldo_sse3_after_all: 495 sub \$64+$xframe,%rsp 496___ 497$code.=<<___ if ($win64); 498 movaps %xmm6,-0x28(%r9) 499 movaps %xmm7,-0x18(%r9) 500.Lssse3_body: 501___ 502$code.=<<___; 503 movdqa .Lsigma(%rip),$a 504 movdqu ($key),$b 505 movdqu 16($key),$c 506 movdqu ($counter),$d 507 movdqa .Lrot16(%rip),$rot16 508 movdqa .Lrot24(%rip),$rot24 509 510 movdqa $a,0x00(%rsp) 511 movdqa $b,0x10(%rsp) 512 movdqa $c,0x20(%rsp) 513 movdqa $d,0x30(%rsp) 514 mov \$10,$counter # reuse $counter 515 jmp .Loop_ssse3 516 517.align 32 518.Loop_outer_ssse3: 519 movdqa .Lone(%rip),$d 520 movdqa 0x00(%rsp),$a 521 movdqa 0x10(%rsp),$b 522 movdqa 0x20(%rsp),$c 523 paddd 0x30(%rsp),$d 524 mov \$10,$counter 525 movdqa $d,0x30(%rsp) 526 jmp .Loop_ssse3 527 528.align 32 529.Loop_ssse3: 530___ 531 &SSSE3ROUND(); 532 &pshufd ($c,$c,0b01001110); 533 &pshufd ($b,$b,0b00111001); 534 &pshufd ($d,$d,0b10010011); 535 &nop (); 536 537 &SSSE3ROUND(); 538 &pshufd ($c,$c,0b01001110); 539 &pshufd ($b,$b,0b10010011); 540 &pshufd ($d,$d,0b00111001); 541 542 &dec ($counter); 543 &jnz (".Loop_ssse3"); 544 545$code.=<<___; 546 paddd 0x00(%rsp),$a 547 paddd 0x10(%rsp),$b 548 paddd 0x20(%rsp),$c 549 paddd 0x30(%rsp),$d 550 551 cmp \$64,$len 552 jb .Ltail_ssse3 553 554 movdqu 0x00($inp),$t 555 movdqu 0x10($inp),$t1 556 pxor $t,$a # xor with input 557 movdqu 0x20($inp),$t 558 pxor $t1,$b 559 movdqu 0x30($inp),$t1 560 lea 0x40($inp),$inp # inp+=64 561 pxor $t,$c 562 pxor $t1,$d 563 564 movdqu $a,0x00($out) # write output 565 movdqu $b,0x10($out) 566 movdqu $c,0x20($out) 567 movdqu $d,0x30($out) 568 lea 0x40($out),$out # out+=64 569 570 sub \$64,$len 571 jnz .Loop_outer_ssse3 572 573 jmp .Ldone_ssse3 574 575.align 16 576.Ltail_ssse3: 577 movdqa $a,0x00(%rsp) 578 movdqa $b,0x10(%rsp) 579 movdqa $c,0x20(%rsp) 580 movdqa $d,0x30(%rsp) 581 xor $counter,$counter 582 583.Loop_tail_ssse3: 584 movzb ($inp,$counter),%eax 585 movzb (%rsp,$counter),%ecx 586 lea 1($counter),$counter 587 xor %ecx,%eax 588 mov %al,-1($out,$counter) 589 dec $len 590 jnz .Loop_tail_ssse3 591 592.Ldone_ssse3: 593___ 594$code.=<<___ if ($win64); 595 movaps -0x28(%r9),%xmm6 596 movaps -0x18(%r9),%xmm7 597___ 598$code.=<<___; 599 lea (%r9),%rsp 600.cfi_def_cfa_register %rsp 601.Lssse3_epilogue: 602 ret 603.cfi_endproc 604.size ChaCha20_ssse3,.-ChaCha20_ssse3 605___ 606} 607 608######################################################################## 609# SSSE3 code path that handles 128-byte inputs 610{ 611my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7)); 612my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1)); 613 614sub SSSE3ROUND_2x { 615 &paddd ($a,$b); 616 &pxor ($d,$a); 617 &paddd ($a1,$b1); 618 &pxor ($d1,$a1); 619 &pshufb ($d,$rot16); 620 &pshufb($d1,$rot16); 621 622 &paddd ($c,$d); 623 &paddd ($c1,$d1); 624 &pxor ($b,$c); 625 &pxor ($b1,$c1); 626 &movdqa ($t,$b); 627 &psrld ($b,20); 628 &movdqa($t1,$b1); 629 &pslld ($t,12); 630 &psrld ($b1,20); 631 &por ($b,$t); 632 &pslld ($t1,12); 633 &por ($b1,$t1); 634 635 &paddd ($a,$b); 636 &pxor ($d,$a); 637 &paddd ($a1,$b1); 638 &pxor ($d1,$a1); 
639 &pshufb ($d,$rot24); 640 &pshufb($d1,$rot24); 641 642 &paddd ($c,$d); 643 &paddd ($c1,$d1); 644 &pxor ($b,$c); 645 &pxor ($b1,$c1); 646 &movdqa ($t,$b); 647 &psrld ($b,25); 648 &movdqa($t1,$b1); 649 &pslld ($t,7); 650 &psrld ($b1,25); 651 &por ($b,$t); 652 &pslld ($t1,7); 653 &por ($b1,$t1); 654} 655 656my $xframe = $win64 ? 0x68 : 8; 657 658$code.=<<___; 659.type ChaCha20_128,\@function,5 660.align 32 661ChaCha20_128: 662.cfi_startproc 663.LChaCha20_128: 664 mov %rsp,%r9 # frame pointer 665.cfi_def_cfa_register %r9 666 sub \$64+$xframe,%rsp 667___ 668$code.=<<___ if ($win64); 669 movaps %xmm6,-0x68(%r9) 670 movaps %xmm7,-0x58(%r9) 671 movaps %xmm8,-0x48(%r9) 672 movaps %xmm9,-0x38(%r9) 673 movaps %xmm10,-0x28(%r9) 674 movaps %xmm11,-0x18(%r9) 675.L128_body: 676___ 677$code.=<<___; 678 movdqa .Lsigma(%rip),$a 679 movdqu ($key),$b 680 movdqu 16($key),$c 681 movdqu ($counter),$d 682 movdqa .Lone(%rip),$d1 683 movdqa .Lrot16(%rip),$rot16 684 movdqa .Lrot24(%rip),$rot24 685 686 movdqa $a,$a1 687 movdqa $a,0x00(%rsp) 688 movdqa $b,$b1 689 movdqa $b,0x10(%rsp) 690 movdqa $c,$c1 691 movdqa $c,0x20(%rsp) 692 paddd $d,$d1 693 movdqa $d,0x30(%rsp) 694 mov \$10,$counter # reuse $counter 695 jmp .Loop_128 696 697.align 32 698.Loop_128: 699___ 700 &SSSE3ROUND_2x(); 701 &pshufd ($c,$c,0b01001110); 702 &pshufd ($b,$b,0b00111001); 703 &pshufd ($d,$d,0b10010011); 704 &pshufd ($c1,$c1,0b01001110); 705 &pshufd ($b1,$b1,0b00111001); 706 &pshufd ($d1,$d1,0b10010011); 707 708 &SSSE3ROUND_2x(); 709 &pshufd ($c,$c,0b01001110); 710 &pshufd ($b,$b,0b10010011); 711 &pshufd ($d,$d,0b00111001); 712 &pshufd ($c1,$c1,0b01001110); 713 &pshufd ($b1,$b1,0b10010011); 714 &pshufd ($d1,$d1,0b00111001); 715 716 &dec ($counter); 717 &jnz (".Loop_128"); 718 719$code.=<<___; 720 paddd 0x00(%rsp),$a 721 paddd 0x10(%rsp),$b 722 paddd 0x20(%rsp),$c 723 paddd 0x30(%rsp),$d 724 paddd .Lone(%rip),$d1 725 paddd 0x00(%rsp),$a1 726 paddd 0x10(%rsp),$b1 727 paddd 0x20(%rsp),$c1 728 paddd 0x30(%rsp),$d1 729 730 movdqu 0x00($inp),$t 731 movdqu 0x10($inp),$t1 732 pxor $t,$a # xor with input 733 movdqu 0x20($inp),$t 734 pxor $t1,$b 735 movdqu 0x30($inp),$t1 736 pxor $t,$c 737 movdqu 0x40($inp),$t 738 pxor $t1,$d 739 movdqu 0x50($inp),$t1 740 pxor $t,$a1 741 movdqu 0x60($inp),$t 742 pxor $t1,$b1 743 movdqu 0x70($inp),$t1 744 pxor $t,$c1 745 pxor $t1,$d1 746 747 movdqu $a,0x00($out) # write output 748 movdqu $b,0x10($out) 749 movdqu $c,0x20($out) 750 movdqu $d,0x30($out) 751 movdqu $a1,0x40($out) 752 movdqu $b1,0x50($out) 753 movdqu $c1,0x60($out) 754 movdqu $d1,0x70($out) 755___ 756$code.=<<___ if ($win64); 757 movaps -0x68(%r9),%xmm6 758 movaps -0x58(%r9),%xmm7 759 movaps -0x48(%r9),%xmm8 760 movaps -0x38(%r9),%xmm9 761 movaps -0x28(%r9),%xmm10 762 movaps -0x18(%r9),%xmm11 763___ 764$code.=<<___; 765 lea (%r9),%rsp 766.cfi_def_cfa_register %rsp 767.L128_epilogue: 768 ret 769.cfi_endproc 770.size ChaCha20_128,.-ChaCha20_128 771___ 772} 773 774######################################################################## 775# SSSE3 code path that handles longer messages. 
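# Before the 4x code path below, a minimal plain-Perl sketch of the
# quarter-round that every SIMD path in this file implements lane by lane.
# It is illustrative only and is never called from this module; the name
# quarter_round_ref and the explicit 32-bit masking are this sketch's own,
# not part of the generated code.
sub quarter_round_ref {
my ($a,$b,$c,$d)=@_;
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8)|($d>>24))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7)|($b>>25))&0xffffffff;
	return ($a,$b,$c,$d);
}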
776{ 777# assign variables to favor Atom front-end 778my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, 779 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); 780my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 781 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 782 783sub SSSE3_lane_ROUND { 784my ($a0,$b0,$c0,$d0)=@_; 785my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 786my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 787my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 788my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 789my @x=map("\"$_\"",@xx); 790 791 # Consider order in which variables are addressed by their 792 # index: 793 # 794 # a b c d 795 # 796 # 0 4 8 12 < even round 797 # 1 5 9 13 798 # 2 6 10 14 799 # 3 7 11 15 800 # 0 5 10 15 < odd round 801 # 1 6 11 12 802 # 2 7 8 13 803 # 3 4 9 14 804 # 805 # 'a', 'b' and 'd's are permanently allocated in registers, 806 # @x[0..7,12..15], while 'c's are maintained in memory. If 807 # you observe 'c' column, you'll notice that pair of 'c's is 808 # invariant between rounds. This means that we have to reload 809 # them once per round, in the middle. This is why you'll see 810 # bunch of 'c' stores and loads in the middle, but none in 811 # the beginning or end. 812 813 ( 814 "&paddd (@x[$a0],@x[$b0])", # Q1 815 "&paddd (@x[$a1],@x[$b1])", # Q2 816 "&pxor (@x[$d0],@x[$a0])", 817 "&pxor (@x[$d1],@x[$a1])", 818 "&pshufb (@x[$d0],$t1)", 819 "&pshufb (@x[$d1],$t1)", 820 821 "&paddd ($xc,@x[$d0])", 822 "&paddd ($xc_,@x[$d1])", 823 "&pxor (@x[$b0],$xc)", 824 "&pxor (@x[$b1],$xc_)", 825 "&movdqa ($t0,@x[$b0])", 826 "&pslld (@x[$b0],12)", 827 "&psrld ($t0,20)", 828 "&movdqa ($t1,@x[$b1])", 829 "&pslld (@x[$b1],12)", 830 "&por (@x[$b0],$t0)", 831 "&psrld ($t1,20)", 832 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 833 "&por (@x[$b1],$t1)", 834 835 "&paddd (@x[$a0],@x[$b0])", 836 "&paddd (@x[$a1],@x[$b1])", 837 "&pxor (@x[$d0],@x[$a0])", 838 "&pxor (@x[$d1],@x[$a1])", 839 "&pshufb (@x[$d0],$t0)", 840 "&pshufb (@x[$d1],$t0)", 841 842 "&paddd ($xc,@x[$d0])", 843 "&paddd ($xc_,@x[$d1])", 844 "&pxor (@x[$b0],$xc)", 845 "&pxor (@x[$b1],$xc_)", 846 "&movdqa ($t1,@x[$b0])", 847 "&pslld (@x[$b0],7)", 848 "&psrld ($t1,25)", 849 "&movdqa ($t0,@x[$b1])", 850 "&pslld (@x[$b1],7)", 851 "&por (@x[$b0],$t1)", 852 "&psrld ($t0,25)", 853 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 854 "&por (@x[$b1],$t0)", 855 856 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 857 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", 858 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", 859 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", 860 861 "&paddd (@x[$a2],@x[$b2])", # Q3 862 "&paddd (@x[$a3],@x[$b3])", # Q4 863 "&pxor (@x[$d2],@x[$a2])", 864 "&pxor (@x[$d3],@x[$a3])", 865 "&pshufb (@x[$d2],$t1)", 866 "&pshufb (@x[$d3],$t1)", 867 868 "&paddd ($xc,@x[$d2])", 869 "&paddd ($xc_,@x[$d3])", 870 "&pxor (@x[$b2],$xc)", 871 "&pxor (@x[$b3],$xc_)", 872 "&movdqa ($t0,@x[$b2])", 873 "&pslld (@x[$b2],12)", 874 "&psrld ($t0,20)", 875 "&movdqa ($t1,@x[$b3])", 876 "&pslld (@x[$b3],12)", 877 "&por (@x[$b2],$t0)", 878 "&psrld ($t1,20)", 879 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 880 "&por (@x[$b3],$t1)", 881 882 "&paddd (@x[$a2],@x[$b2])", 883 "&paddd (@x[$a3],@x[$b3])", 884 "&pxor (@x[$d2],@x[$a2])", 885 "&pxor (@x[$d3],@x[$a3])", 886 "&pshufb (@x[$d2],$t0)", 887 "&pshufb (@x[$d3],$t0)", 888 889 "&paddd ($xc,@x[$d2])", 890 "&paddd ($xc_,@x[$d3])", 891 "&pxor (@x[$b2],$xc)", 892 "&pxor (@x[$b3],$xc_)", 893 "&movdqa ($t1,@x[$b2])", 894 "&pslld 
(@x[$b2],7)", 895 "&psrld ($t1,25)", 896 "&movdqa ($t0,@x[$b3])", 897 "&pslld (@x[$b3],7)", 898 "&por (@x[$b2],$t1)", 899 "&psrld ($t0,25)", 900 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 901 "&por (@x[$b3],$t0)" 902 ); 903} 904 905my $xframe = $win64 ? 0xa8 : 8; 906 907$code.=<<___; 908.type ChaCha20_4x,\@function,5 909.align 32 910ChaCha20_4x: 911.cfi_startproc 912.LChaCha20_4x: 913 mov %rsp,%r9 # frame pointer 914.cfi_def_cfa_register %r9 915 mov %r10,%r11 916___ 917$code.=<<___ if ($avx>1); 918 shr \$32,%r10 # OPENSSL_ia32cap_P+8 919 test \$`1<<5`,%r10 # test AVX2 920 jnz .LChaCha20_8x 921___ 922$code.=<<___; 923 cmp \$192,$len 924 ja .Lproceed4x 925 926 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE 927 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE 928 je .Ldo_sse3_after_all # to detect Atom 929 930.Lproceed4x: 931 sub \$0x140+$xframe,%rsp 932___ 933 ################ stack layout 934 # +0x00 SIMD equivalent of @x[8-12] 935 # ... 936 # +0x40 constant copy of key[0-2] smashed by lanes 937 # ... 938 # +0x100 SIMD counters (with nonce smashed by lanes) 939 # ... 940 # +0x140 941$code.=<<___ if ($win64); 942 movaps %xmm6,-0xa8(%r9) 943 movaps %xmm7,-0x98(%r9) 944 movaps %xmm8,-0x88(%r9) 945 movaps %xmm9,-0x78(%r9) 946 movaps %xmm10,-0x68(%r9) 947 movaps %xmm11,-0x58(%r9) 948 movaps %xmm12,-0x48(%r9) 949 movaps %xmm13,-0x38(%r9) 950 movaps %xmm14,-0x28(%r9) 951 movaps %xmm15,-0x18(%r9) 952.L4x_body: 953___ 954$code.=<<___; 955 movdqa .Lsigma(%rip),$xa3 # key[0] 956 movdqu ($key),$xb3 # key[1] 957 movdqu 16($key),$xt3 # key[2] 958 movdqu ($counter),$xd3 # key[3] 959 lea 0x100(%rsp),%rcx # size optimization 960 lea .Lrot16(%rip),%r10 961 lea .Lrot24(%rip),%r11 962 963 pshufd \$0x00,$xa3,$xa0 # smash key by lanes... 964 pshufd \$0x55,$xa3,$xa1 965 movdqa $xa0,0x40(%rsp) # ... 
and offload 966 pshufd \$0xaa,$xa3,$xa2 967 movdqa $xa1,0x50(%rsp) 968 pshufd \$0xff,$xa3,$xa3 969 movdqa $xa2,0x60(%rsp) 970 movdqa $xa3,0x70(%rsp) 971 972 pshufd \$0x00,$xb3,$xb0 973 pshufd \$0x55,$xb3,$xb1 974 movdqa $xb0,0x80-0x100(%rcx) 975 pshufd \$0xaa,$xb3,$xb2 976 movdqa $xb1,0x90-0x100(%rcx) 977 pshufd \$0xff,$xb3,$xb3 978 movdqa $xb2,0xa0-0x100(%rcx) 979 movdqa $xb3,0xb0-0x100(%rcx) 980 981 pshufd \$0x00,$xt3,$xt0 # "$xc0" 982 pshufd \$0x55,$xt3,$xt1 # "$xc1" 983 movdqa $xt0,0xc0-0x100(%rcx) 984 pshufd \$0xaa,$xt3,$xt2 # "$xc2" 985 movdqa $xt1,0xd0-0x100(%rcx) 986 pshufd \$0xff,$xt3,$xt3 # "$xc3" 987 movdqa $xt2,0xe0-0x100(%rcx) 988 movdqa $xt3,0xf0-0x100(%rcx) 989 990 pshufd \$0x00,$xd3,$xd0 991 pshufd \$0x55,$xd3,$xd1 992 paddd .Linc(%rip),$xd0 # don't save counters yet 993 pshufd \$0xaa,$xd3,$xd2 994 movdqa $xd1,0x110-0x100(%rcx) 995 pshufd \$0xff,$xd3,$xd3 996 movdqa $xd2,0x120-0x100(%rcx) 997 movdqa $xd3,0x130-0x100(%rcx) 998 999 jmp .Loop_enter4x 1000 1001.align 32 1002.Loop_outer4x: 1003 movdqa 0x40(%rsp),$xa0 # re-load smashed key 1004 movdqa 0x50(%rsp),$xa1 1005 movdqa 0x60(%rsp),$xa2 1006 movdqa 0x70(%rsp),$xa3 1007 movdqa 0x80-0x100(%rcx),$xb0 1008 movdqa 0x90-0x100(%rcx),$xb1 1009 movdqa 0xa0-0x100(%rcx),$xb2 1010 movdqa 0xb0-0x100(%rcx),$xb3 1011 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 1012 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 1013 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 1014 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 1015 movdqa 0x100-0x100(%rcx),$xd0 1016 movdqa 0x110-0x100(%rcx),$xd1 1017 movdqa 0x120-0x100(%rcx),$xd2 1018 movdqa 0x130-0x100(%rcx),$xd3 1019 paddd .Lfour(%rip),$xd0 # next SIMD counters 1020 1021.Loop_enter4x: 1022 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" 1023 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" 1024 movdqa (%r10),$xt3 # .Lrot16(%rip) 1025 mov \$10,%eax 1026 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 1027 jmp .Loop4x 1028 1029.align 32 1030.Loop4x: 1031___ 1032 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } 1033 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } 1034$code.=<<___; 1035 dec %eax 1036 jnz .Loop4x 1037 1038 paddd 0x40(%rsp),$xa0 # accumulate key material 1039 paddd 0x50(%rsp),$xa1 1040 paddd 0x60(%rsp),$xa2 1041 paddd 0x70(%rsp),$xa3 1042 1043 movdqa $xa0,$xt2 # "de-interlace" data 1044 punpckldq $xa1,$xa0 1045 movdqa $xa2,$xt3 1046 punpckldq $xa3,$xa2 1047 punpckhdq $xa1,$xt2 1048 punpckhdq $xa3,$xt3 1049 movdqa $xa0,$xa1 1050 punpcklqdq $xa2,$xa0 # "a0" 1051 movdqa $xt2,$xa3 1052 punpcklqdq $xt3,$xt2 # "a2" 1053 punpckhqdq $xa2,$xa1 # "a1" 1054 punpckhqdq $xt3,$xa3 # "a3" 1055___ 1056 ($xa2,$xt2)=($xt2,$xa2); 1057$code.=<<___; 1058 paddd 0x80-0x100(%rcx),$xb0 1059 paddd 0x90-0x100(%rcx),$xb1 1060 paddd 0xa0-0x100(%rcx),$xb2 1061 paddd 0xb0-0x100(%rcx),$xb3 1062 1063 movdqa $xa0,0x00(%rsp) # offload $xaN 1064 movdqa $xa1,0x10(%rsp) 1065 movdqa 0x20(%rsp),$xa0 # "xc2" 1066 movdqa 0x30(%rsp),$xa1 # "xc3" 1067 1068 movdqa $xb0,$xt2 1069 punpckldq $xb1,$xb0 1070 movdqa $xb2,$xt3 1071 punpckldq $xb3,$xb2 1072 punpckhdq $xb1,$xt2 1073 punpckhdq $xb3,$xt3 1074 movdqa $xb0,$xb1 1075 punpcklqdq $xb2,$xb0 # "b0" 1076 movdqa $xt2,$xb3 1077 punpcklqdq $xt3,$xt2 # "b2" 1078 punpckhqdq $xb2,$xb1 # "b1" 1079 punpckhqdq $xt3,$xb3 # "b3" 1080___ 1081 ($xb2,$xt2)=($xt2,$xb2); 1082 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 1083$code.=<<___; 1084 paddd 0xc0-0x100(%rcx),$xc0 1085 paddd 0xd0-0x100(%rcx),$xc1 1086 paddd 0xe0-0x100(%rcx),$xc2 1087 paddd 0xf0-0x100(%rcx),$xc3 1088 1089 movdqa $xa2,0x20(%rsp) # keep 
offloading $xaN 1090 movdqa $xa3,0x30(%rsp) 1091 1092 movdqa $xc0,$xt2 1093 punpckldq $xc1,$xc0 1094 movdqa $xc2,$xt3 1095 punpckldq $xc3,$xc2 1096 punpckhdq $xc1,$xt2 1097 punpckhdq $xc3,$xt3 1098 movdqa $xc0,$xc1 1099 punpcklqdq $xc2,$xc0 # "c0" 1100 movdqa $xt2,$xc3 1101 punpcklqdq $xt3,$xt2 # "c2" 1102 punpckhqdq $xc2,$xc1 # "c1" 1103 punpckhqdq $xt3,$xc3 # "c3" 1104___ 1105 ($xc2,$xt2)=($xt2,$xc2); 1106 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary 1107$code.=<<___; 1108 paddd 0x100-0x100(%rcx),$xd0 1109 paddd 0x110-0x100(%rcx),$xd1 1110 paddd 0x120-0x100(%rcx),$xd2 1111 paddd 0x130-0x100(%rcx),$xd3 1112 1113 movdqa $xd0,$xt2 1114 punpckldq $xd1,$xd0 1115 movdqa $xd2,$xt3 1116 punpckldq $xd3,$xd2 1117 punpckhdq $xd1,$xt2 1118 punpckhdq $xd3,$xt3 1119 movdqa $xd0,$xd1 1120 punpcklqdq $xd2,$xd0 # "d0" 1121 movdqa $xt2,$xd3 1122 punpcklqdq $xt3,$xt2 # "d2" 1123 punpckhqdq $xd2,$xd1 # "d1" 1124 punpckhqdq $xt3,$xd3 # "d3" 1125___ 1126 ($xd2,$xt2)=($xt2,$xd2); 1127$code.=<<___; 1128 cmp \$64*4,$len 1129 jb .Ltail4x 1130 1131 movdqu 0x00($inp),$xt0 # xor with input 1132 movdqu 0x10($inp),$xt1 1133 movdqu 0x20($inp),$xt2 1134 movdqu 0x30($inp),$xt3 1135 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1136 pxor $xb0,$xt1 1137 pxor $xc0,$xt2 1138 pxor $xd0,$xt3 1139 1140 movdqu $xt0,0x00($out) 1141 movdqu 0x40($inp),$xt0 1142 movdqu $xt1,0x10($out) 1143 movdqu 0x50($inp),$xt1 1144 movdqu $xt2,0x20($out) 1145 movdqu 0x60($inp),$xt2 1146 movdqu $xt3,0x30($out) 1147 movdqu 0x70($inp),$xt3 1148 lea 0x80($inp),$inp # size optimization 1149 pxor 0x10(%rsp),$xt0 1150 pxor $xb1,$xt1 1151 pxor $xc1,$xt2 1152 pxor $xd1,$xt3 1153 1154 movdqu $xt0,0x40($out) 1155 movdqu 0x00($inp),$xt0 1156 movdqu $xt1,0x50($out) 1157 movdqu 0x10($inp),$xt1 1158 movdqu $xt2,0x60($out) 1159 movdqu 0x20($inp),$xt2 1160 movdqu $xt3,0x70($out) 1161 lea 0x80($out),$out # size optimization 1162 movdqu 0x30($inp),$xt3 1163 pxor 0x20(%rsp),$xt0 1164 pxor $xb2,$xt1 1165 pxor $xc2,$xt2 1166 pxor $xd2,$xt3 1167 1168 movdqu $xt0,0x00($out) 1169 movdqu 0x40($inp),$xt0 1170 movdqu $xt1,0x10($out) 1171 movdqu 0x50($inp),$xt1 1172 movdqu $xt2,0x20($out) 1173 movdqu 0x60($inp),$xt2 1174 movdqu $xt3,0x30($out) 1175 movdqu 0x70($inp),$xt3 1176 lea 0x80($inp),$inp # inp+=64*4 1177 pxor 0x30(%rsp),$xt0 1178 pxor $xb3,$xt1 1179 pxor $xc3,$xt2 1180 pxor $xd3,$xt3 1181 movdqu $xt0,0x40($out) 1182 movdqu $xt1,0x50($out) 1183 movdqu $xt2,0x60($out) 1184 movdqu $xt3,0x70($out) 1185 lea 0x80($out),$out # out+=64*4 1186 1187 sub \$64*4,$len 1188 jnz .Loop_outer4x 1189 1190 jmp .Ldone4x 1191 1192.Ltail4x: 1193 cmp \$192,$len 1194 jae .L192_or_more4x 1195 cmp \$128,$len 1196 jae .L128_or_more4x 1197 cmp \$64,$len 1198 jae .L64_or_more4x 1199 1200 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1201 xor %r10,%r10 1202 #movdqa $xt0,0x00(%rsp) 1203 movdqa $xb0,0x10(%rsp) 1204 movdqa $xc0,0x20(%rsp) 1205 movdqa $xd0,0x30(%rsp) 1206 jmp .Loop_tail4x 1207 1208.align 32 1209.L64_or_more4x: 1210 movdqu 0x00($inp),$xt0 # xor with input 1211 movdqu 0x10($inp),$xt1 1212 movdqu 0x20($inp),$xt2 1213 movdqu 0x30($inp),$xt3 1214 pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? 1215 pxor $xb0,$xt1 1216 pxor $xc0,$xt2 1217 pxor $xd0,$xt3 1218 movdqu $xt0,0x00($out) 1219 movdqu $xt1,0x10($out) 1220 movdqu $xt2,0x20($out) 1221 movdqu $xt3,0x30($out) 1222 je .Ldone4x 1223 1224 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? 
1225 lea 0x40($inp),$inp # inp+=64*1 1226 xor %r10,%r10 1227 movdqa $xt0,0x00(%rsp) 1228 movdqa $xb1,0x10(%rsp) 1229 lea 0x40($out),$out # out+=64*1 1230 movdqa $xc1,0x20(%rsp) 1231 sub \$64,$len # len-=64*1 1232 movdqa $xd1,0x30(%rsp) 1233 jmp .Loop_tail4x 1234 1235.align 32 1236.L128_or_more4x: 1237 movdqu 0x00($inp),$xt0 # xor with input 1238 movdqu 0x10($inp),$xt1 1239 movdqu 0x20($inp),$xt2 1240 movdqu 0x30($inp),$xt3 1241 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1242 pxor $xb0,$xt1 1243 pxor $xc0,$xt2 1244 pxor $xd0,$xt3 1245 1246 movdqu $xt0,0x00($out) 1247 movdqu 0x40($inp),$xt0 1248 movdqu $xt1,0x10($out) 1249 movdqu 0x50($inp),$xt1 1250 movdqu $xt2,0x20($out) 1251 movdqu 0x60($inp),$xt2 1252 movdqu $xt3,0x30($out) 1253 movdqu 0x70($inp),$xt3 1254 pxor 0x10(%rsp),$xt0 1255 pxor $xb1,$xt1 1256 pxor $xc1,$xt2 1257 pxor $xd1,$xt3 1258 movdqu $xt0,0x40($out) 1259 movdqu $xt1,0x50($out) 1260 movdqu $xt2,0x60($out) 1261 movdqu $xt3,0x70($out) 1262 je .Ldone4x 1263 1264 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? 1265 lea 0x80($inp),$inp # inp+=64*2 1266 xor %r10,%r10 1267 movdqa $xt0,0x00(%rsp) 1268 movdqa $xb2,0x10(%rsp) 1269 lea 0x80($out),$out # out+=64*2 1270 movdqa $xc2,0x20(%rsp) 1271 sub \$128,$len # len-=64*2 1272 movdqa $xd2,0x30(%rsp) 1273 jmp .Loop_tail4x 1274 1275.align 32 1276.L192_or_more4x: 1277 movdqu 0x00($inp),$xt0 # xor with input 1278 movdqu 0x10($inp),$xt1 1279 movdqu 0x20($inp),$xt2 1280 movdqu 0x30($inp),$xt3 1281 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1282 pxor $xb0,$xt1 1283 pxor $xc0,$xt2 1284 pxor $xd0,$xt3 1285 1286 movdqu $xt0,0x00($out) 1287 movdqu 0x40($inp),$xt0 1288 movdqu $xt1,0x10($out) 1289 movdqu 0x50($inp),$xt1 1290 movdqu $xt2,0x20($out) 1291 movdqu 0x60($inp),$xt2 1292 movdqu $xt3,0x30($out) 1293 movdqu 0x70($inp),$xt3 1294 lea 0x80($inp),$inp # size optimization 1295 pxor 0x10(%rsp),$xt0 1296 pxor $xb1,$xt1 1297 pxor $xc1,$xt2 1298 pxor $xd1,$xt3 1299 1300 movdqu $xt0,0x40($out) 1301 movdqu 0x00($inp),$xt0 1302 movdqu $xt1,0x50($out) 1303 movdqu 0x10($inp),$xt1 1304 movdqu $xt2,0x60($out) 1305 movdqu 0x20($inp),$xt2 1306 movdqu $xt3,0x70($out) 1307 lea 0x80($out),$out # size optimization 1308 movdqu 0x30($inp),$xt3 1309 pxor 0x20(%rsp),$xt0 1310 pxor $xb2,$xt1 1311 pxor $xc2,$xt2 1312 pxor $xd2,$xt3 1313 movdqu $xt0,0x00($out) 1314 movdqu $xt1,0x10($out) 1315 movdqu $xt2,0x20($out) 1316 movdqu $xt3,0x30($out) 1317 je .Ldone4x 1318 1319 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? 
1320 lea 0x40($inp),$inp # inp+=64*3 1321 xor %r10,%r10 1322 movdqa $xt0,0x00(%rsp) 1323 movdqa $xb3,0x10(%rsp) 1324 lea 0x40($out),$out # out+=64*3 1325 movdqa $xc3,0x20(%rsp) 1326 sub \$192,$len # len-=64*3 1327 movdqa $xd3,0x30(%rsp) 1328 1329.Loop_tail4x: 1330 movzb ($inp,%r10),%eax 1331 movzb (%rsp,%r10),%ecx 1332 lea 1(%r10),%r10 1333 xor %ecx,%eax 1334 mov %al,-1($out,%r10) 1335 dec $len 1336 jnz .Loop_tail4x 1337 1338.Ldone4x: 1339___ 1340$code.=<<___ if ($win64); 1341 movaps -0xa8(%r9),%xmm6 1342 movaps -0x98(%r9),%xmm7 1343 movaps -0x88(%r9),%xmm8 1344 movaps -0x78(%r9),%xmm9 1345 movaps -0x68(%r9),%xmm10 1346 movaps -0x58(%r9),%xmm11 1347 movaps -0x48(%r9),%xmm12 1348 movaps -0x38(%r9),%xmm13 1349 movaps -0x28(%r9),%xmm14 1350 movaps -0x18(%r9),%xmm15 1351___ 1352$code.=<<___; 1353 lea (%r9),%rsp 1354.cfi_def_cfa_register %rsp 1355.L4x_epilogue: 1356 ret 1357.cfi_endproc 1358.size ChaCha20_4x,.-ChaCha20_4x 1359___ 1360} 1361 1362######################################################################## 1363# XOP code path that handles all lengths. 1364if ($avx) { 1365# There is some "anomaly" observed depending on instructions' size or 1366# alignment. If you look closely at below code you'll notice that 1367# sometimes argument order varies. The order affects instruction 1368# encoding by making it larger, and such fiddling gives 5% performance 1369# improvement. This is on FX-4100... 1370 1371my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, 1372 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); 1373my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 1374 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); 1375 1376sub XOP_lane_ROUND { 1377my ($a0,$b0,$c0,$d0)=@_; 1378my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 1379my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 1380my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 1381my @x=map("\"$_\"",@xx); 1382 1383 ( 1384 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 1385 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 1386 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 1387 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 1388 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1389 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1390 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1391 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1392 "&vprotd (@x[$d0],@x[$d0],16)", 1393 "&vprotd (@x[$d1],@x[$d1],16)", 1394 "&vprotd (@x[$d2],@x[$d2],16)", 1395 "&vprotd (@x[$d3],@x[$d3],16)", 1396 1397 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 1398 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 1399 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 1400 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 1401 "&vpxor (@x[$b0],@x[$c0],@x[$b0])", 1402 "&vpxor (@x[$b1],@x[$c1],@x[$b1])", 1403 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip 1404 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip 1405 "&vprotd (@x[$b0],@x[$b0],12)", 1406 "&vprotd (@x[$b1],@x[$b1],12)", 1407 "&vprotd (@x[$b2],@x[$b2],12)", 1408 "&vprotd (@x[$b3],@x[$b3],12)", 1409 1410 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip 1411 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip 1412 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 1413 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 1414 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1415 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1416 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1417 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1418 "&vprotd (@x[$d0],@x[$d0],8)", 1419 "&vprotd (@x[$d1],@x[$d1],8)", 1420 "&vprotd (@x[$d2],@x[$d2],8)", 1421 "&vprotd (@x[$d3],@x[$d3],8)", 1422 1423 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 1424 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 1425 "&vpaddd 
(@x[$c2],@x[$c2],@x[$d2])", 1426 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 1427 "&vpxor (@x[$b0],@x[$c0],@x[$b0])", 1428 "&vpxor (@x[$b1],@x[$c1],@x[$b1])", 1429 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip 1430 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip 1431 "&vprotd (@x[$b0],@x[$b0],7)", 1432 "&vprotd (@x[$b1],@x[$b1],7)", 1433 "&vprotd (@x[$b2],@x[$b2],7)", 1434 "&vprotd (@x[$b3],@x[$b3],7)" 1435 ); 1436} 1437 1438my $xframe = $win64 ? 0xa8 : 8; 1439 1440$code.=<<___; 1441.type ChaCha20_4xop,\@function,5 1442.align 32 1443ChaCha20_4xop: 1444.cfi_startproc 1445.LChaCha20_4xop: 1446 mov %rsp,%r9 # frame pointer 1447.cfi_def_cfa_register %r9 1448 sub \$0x140+$xframe,%rsp 1449___ 1450 ################ stack layout 1451 # +0x00 SIMD equivalent of @x[8-12] 1452 # ... 1453 # +0x40 constant copy of key[0-2] smashed by lanes 1454 # ... 1455 # +0x100 SIMD counters (with nonce smashed by lanes) 1456 # ... 1457 # +0x140 1458$code.=<<___ if ($win64); 1459 movaps %xmm6,-0xa8(%r9) 1460 movaps %xmm7,-0x98(%r9) 1461 movaps %xmm8,-0x88(%r9) 1462 movaps %xmm9,-0x78(%r9) 1463 movaps %xmm10,-0x68(%r9) 1464 movaps %xmm11,-0x58(%r9) 1465 movaps %xmm12,-0x48(%r9) 1466 movaps %xmm13,-0x38(%r9) 1467 movaps %xmm14,-0x28(%r9) 1468 movaps %xmm15,-0x18(%r9) 1469.L4xop_body: 1470___ 1471$code.=<<___; 1472 vzeroupper 1473 1474 vmovdqa .Lsigma(%rip),$xa3 # key[0] 1475 vmovdqu ($key),$xb3 # key[1] 1476 vmovdqu 16($key),$xt3 # key[2] 1477 vmovdqu ($counter),$xd3 # key[3] 1478 lea 0x100(%rsp),%rcx # size optimization 1479 1480 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1481 vpshufd \$0x55,$xa3,$xa1 1482 vmovdqa $xa0,0x40(%rsp) # ... and offload 1483 vpshufd \$0xaa,$xa3,$xa2 1484 vmovdqa $xa1,0x50(%rsp) 1485 vpshufd \$0xff,$xa3,$xa3 1486 vmovdqa $xa2,0x60(%rsp) 1487 vmovdqa $xa3,0x70(%rsp) 1488 1489 vpshufd \$0x00,$xb3,$xb0 1490 vpshufd \$0x55,$xb3,$xb1 1491 vmovdqa $xb0,0x80-0x100(%rcx) 1492 vpshufd \$0xaa,$xb3,$xb2 1493 vmovdqa $xb1,0x90-0x100(%rcx) 1494 vpshufd \$0xff,$xb3,$xb3 1495 vmovdqa $xb2,0xa0-0x100(%rcx) 1496 vmovdqa $xb3,0xb0-0x100(%rcx) 1497 1498 vpshufd \$0x00,$xt3,$xt0 # "$xc0" 1499 vpshufd \$0x55,$xt3,$xt1 # "$xc1" 1500 vmovdqa $xt0,0xc0-0x100(%rcx) 1501 vpshufd \$0xaa,$xt3,$xt2 # "$xc2" 1502 vmovdqa $xt1,0xd0-0x100(%rcx) 1503 vpshufd \$0xff,$xt3,$xt3 # "$xc3" 1504 vmovdqa $xt2,0xe0-0x100(%rcx) 1505 vmovdqa $xt3,0xf0-0x100(%rcx) 1506 1507 vpshufd \$0x00,$xd3,$xd0 1508 vpshufd \$0x55,$xd3,$xd1 1509 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet 1510 vpshufd \$0xaa,$xd3,$xd2 1511 vmovdqa $xd1,0x110-0x100(%rcx) 1512 vpshufd \$0xff,$xd3,$xd3 1513 vmovdqa $xd2,0x120-0x100(%rcx) 1514 vmovdqa $xd3,0x130-0x100(%rcx) 1515 1516 jmp .Loop_enter4xop 1517 1518.align 32 1519.Loop_outer4xop: 1520 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key 1521 vmovdqa 0x50(%rsp),$xa1 1522 vmovdqa 0x60(%rsp),$xa2 1523 vmovdqa 0x70(%rsp),$xa3 1524 vmovdqa 0x80-0x100(%rcx),$xb0 1525 vmovdqa 0x90-0x100(%rcx),$xb1 1526 vmovdqa 0xa0-0x100(%rcx),$xb2 1527 vmovdqa 0xb0-0x100(%rcx),$xb3 1528 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 1529 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 1530 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 1531 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 1532 vmovdqa 0x100-0x100(%rcx),$xd0 1533 vmovdqa 0x110-0x100(%rcx),$xd1 1534 vmovdqa 0x120-0x100(%rcx),$xd2 1535 vmovdqa 0x130-0x100(%rcx),$xd3 1536 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters 1537 1538.Loop_enter4xop: 1539 mov \$10,%eax 1540 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 1541 jmp .Loop4xop 1542 1543.align 32 1544.Loop4xop: 1545___ 1546 foreach 
(&XOP_lane_ROUND(0, 4, 8,12)) { eval; } 1547 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } 1548$code.=<<___; 1549 dec %eax 1550 jnz .Loop4xop 1551 1552 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material 1553 vpaddd 0x50(%rsp),$xa1,$xa1 1554 vpaddd 0x60(%rsp),$xa2,$xa2 1555 vpaddd 0x70(%rsp),$xa3,$xa3 1556 1557 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 1558 vmovdqa $xt3,0x30(%rsp) 1559 1560 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 1561 vpunpckldq $xa3,$xa2,$xt3 1562 vpunpckhdq $xa1,$xa0,$xa0 1563 vpunpckhdq $xa3,$xa2,$xa2 1564 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 1565 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 1566 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 1567 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 1568___ 1569 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 1570$code.=<<___; 1571 vpaddd 0x80-0x100(%rcx),$xb0,$xb0 1572 vpaddd 0x90-0x100(%rcx),$xb1,$xb1 1573 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 1574 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 1575 1576 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 1577 vmovdqa $xa1,0x10(%rsp) 1578 vmovdqa 0x20(%rsp),$xa0 # "xc2" 1579 vmovdqa 0x30(%rsp),$xa1 # "xc3" 1580 1581 vpunpckldq $xb1,$xb0,$xt2 1582 vpunpckldq $xb3,$xb2,$xt3 1583 vpunpckhdq $xb1,$xb0,$xb0 1584 vpunpckhdq $xb3,$xb2,$xb2 1585 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 1586 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 1587 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 1588 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 1589___ 1590 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 1591 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 1592$code.=<<___; 1593 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 1594 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 1595 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 1596 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 1597 1598 vpunpckldq $xc1,$xc0,$xt2 1599 vpunpckldq $xc3,$xc2,$xt3 1600 vpunpckhdq $xc1,$xc0,$xc0 1601 vpunpckhdq $xc3,$xc2,$xc2 1602 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 1603 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 1604 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 1605 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 1606___ 1607 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 1608$code.=<<___; 1609 vpaddd 0x100-0x100(%rcx),$xd0,$xd0 1610 vpaddd 0x110-0x100(%rcx),$xd1,$xd1 1611 vpaddd 0x120-0x100(%rcx),$xd2,$xd2 1612 vpaddd 0x130-0x100(%rcx),$xd3,$xd3 1613 1614 vpunpckldq $xd1,$xd0,$xt2 1615 vpunpckldq $xd3,$xd2,$xt3 1616 vpunpckhdq $xd1,$xd0,$xd0 1617 vpunpckhdq $xd3,$xd2,$xd2 1618 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 1619 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 1620 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 1621 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 1622___ 1623 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 1624 ($xa0,$xa1)=($xt2,$xt3); 1625$code.=<<___; 1626 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 1627 vmovdqa 0x10(%rsp),$xa1 1628 1629 cmp \$64*4,$len 1630 jb .Ltail4xop 1631 1632 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1633 vpxor 0x10($inp),$xb0,$xb0 1634 vpxor 0x20($inp),$xc0,$xc0 1635 vpxor 0x30($inp),$xd0,$xd0 1636 vpxor 0x40($inp),$xa1,$xa1 1637 vpxor 0x50($inp),$xb1,$xb1 1638 vpxor 0x60($inp),$xc1,$xc1 1639 vpxor 0x70($inp),$xd1,$xd1 1640 lea 0x80($inp),$inp # size optimization 1641 vpxor 0x00($inp),$xa2,$xa2 1642 vpxor 0x10($inp),$xb2,$xb2 1643 vpxor 0x20($inp),$xc2,$xc2 1644 vpxor 0x30($inp),$xd2,$xd2 1645 vpxor 0x40($inp),$xa3,$xa3 1646 vpxor 0x50($inp),$xb3,$xb3 1647 vpxor 0x60($inp),$xc3,$xc3 1648 vpxor 0x70($inp),$xd3,$xd3 1649 lea 0x80($inp),$inp # inp+=64*4 1650 1651 vmovdqu $xa0,0x00($out) 1652 vmovdqu $xb0,0x10($out) 1653 vmovdqu $xc0,0x20($out) 1654 vmovdqu $xd0,0x30($out) 1655 vmovdqu $xa1,0x40($out) 1656 vmovdqu $xb1,0x50($out) 1657 vmovdqu $xc1,0x60($out) 1658 
vmovdqu $xd1,0x70($out) 1659 lea 0x80($out),$out # size optimization 1660 vmovdqu $xa2,0x00($out) 1661 vmovdqu $xb2,0x10($out) 1662 vmovdqu $xc2,0x20($out) 1663 vmovdqu $xd2,0x30($out) 1664 vmovdqu $xa3,0x40($out) 1665 vmovdqu $xb3,0x50($out) 1666 vmovdqu $xc3,0x60($out) 1667 vmovdqu $xd3,0x70($out) 1668 lea 0x80($out),$out # out+=64*4 1669 1670 sub \$64*4,$len 1671 jnz .Loop_outer4xop 1672 1673 jmp .Ldone4xop 1674 1675.align 32 1676.Ltail4xop: 1677 cmp \$192,$len 1678 jae .L192_or_more4xop 1679 cmp \$128,$len 1680 jae .L128_or_more4xop 1681 cmp \$64,$len 1682 jae .L64_or_more4xop 1683 1684 xor %r10,%r10 1685 vmovdqa $xa0,0x00(%rsp) 1686 vmovdqa $xb0,0x10(%rsp) 1687 vmovdqa $xc0,0x20(%rsp) 1688 vmovdqa $xd0,0x30(%rsp) 1689 jmp .Loop_tail4xop 1690 1691.align 32 1692.L64_or_more4xop: 1693 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1694 vpxor 0x10($inp),$xb0,$xb0 1695 vpxor 0x20($inp),$xc0,$xc0 1696 vpxor 0x30($inp),$xd0,$xd0 1697 vmovdqu $xa0,0x00($out) 1698 vmovdqu $xb0,0x10($out) 1699 vmovdqu $xc0,0x20($out) 1700 vmovdqu $xd0,0x30($out) 1701 je .Ldone4xop 1702 1703 lea 0x40($inp),$inp # inp+=64*1 1704 vmovdqa $xa1,0x00(%rsp) 1705 xor %r10,%r10 1706 vmovdqa $xb1,0x10(%rsp) 1707 lea 0x40($out),$out # out+=64*1 1708 vmovdqa $xc1,0x20(%rsp) 1709 sub \$64,$len # len-=64*1 1710 vmovdqa $xd1,0x30(%rsp) 1711 jmp .Loop_tail4xop 1712 1713.align 32 1714.L128_or_more4xop: 1715 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1716 vpxor 0x10($inp),$xb0,$xb0 1717 vpxor 0x20($inp),$xc0,$xc0 1718 vpxor 0x30($inp),$xd0,$xd0 1719 vpxor 0x40($inp),$xa1,$xa1 1720 vpxor 0x50($inp),$xb1,$xb1 1721 vpxor 0x60($inp),$xc1,$xc1 1722 vpxor 0x70($inp),$xd1,$xd1 1723 1724 vmovdqu $xa0,0x00($out) 1725 vmovdqu $xb0,0x10($out) 1726 vmovdqu $xc0,0x20($out) 1727 vmovdqu $xd0,0x30($out) 1728 vmovdqu $xa1,0x40($out) 1729 vmovdqu $xb1,0x50($out) 1730 vmovdqu $xc1,0x60($out) 1731 vmovdqu $xd1,0x70($out) 1732 je .Ldone4xop 1733 1734 lea 0x80($inp),$inp # inp+=64*2 1735 vmovdqa $xa2,0x00(%rsp) 1736 xor %r10,%r10 1737 vmovdqa $xb2,0x10(%rsp) 1738 lea 0x80($out),$out # out+=64*2 1739 vmovdqa $xc2,0x20(%rsp) 1740 sub \$128,$len # len-=64*2 1741 vmovdqa $xd2,0x30(%rsp) 1742 jmp .Loop_tail4xop 1743 1744.align 32 1745.L192_or_more4xop: 1746 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1747 vpxor 0x10($inp),$xb0,$xb0 1748 vpxor 0x20($inp),$xc0,$xc0 1749 vpxor 0x30($inp),$xd0,$xd0 1750 vpxor 0x40($inp),$xa1,$xa1 1751 vpxor 0x50($inp),$xb1,$xb1 1752 vpxor 0x60($inp),$xc1,$xc1 1753 vpxor 0x70($inp),$xd1,$xd1 1754 lea 0x80($inp),$inp # size optimization 1755 vpxor 0x00($inp),$xa2,$xa2 1756 vpxor 0x10($inp),$xb2,$xb2 1757 vpxor 0x20($inp),$xc2,$xc2 1758 vpxor 0x30($inp),$xd2,$xd2 1759 1760 vmovdqu $xa0,0x00($out) 1761 vmovdqu $xb0,0x10($out) 1762 vmovdqu $xc0,0x20($out) 1763 vmovdqu $xd0,0x30($out) 1764 vmovdqu $xa1,0x40($out) 1765 vmovdqu $xb1,0x50($out) 1766 vmovdqu $xc1,0x60($out) 1767 vmovdqu $xd1,0x70($out) 1768 lea 0x80($out),$out # size optimization 1769 vmovdqu $xa2,0x00($out) 1770 vmovdqu $xb2,0x10($out) 1771 vmovdqu $xc2,0x20($out) 1772 vmovdqu $xd2,0x30($out) 1773 je .Ldone4xop 1774 1775 lea 0x40($inp),$inp # inp+=64*3 1776 vmovdqa $xa3,0x00(%rsp) 1777 xor %r10,%r10 1778 vmovdqa $xb3,0x10(%rsp) 1779 lea 0x40($out),$out # out+=64*3 1780 vmovdqa $xc3,0x20(%rsp) 1781 sub \$192,$len # len-=64*3 1782 vmovdqa $xd3,0x30(%rsp) 1783 1784.Loop_tail4xop: 1785 movzb ($inp,%r10),%eax 1786 movzb (%rsp,%r10),%ecx 1787 lea 1(%r10),%r10 1788 xor %ecx,%eax 1789 mov %al,-1($out,%r10) 1790 dec $len 1791 jnz .Loop_tail4xop 1792 1793.Ldone4xop: 1794 
vzeroupper 1795___ 1796$code.=<<___ if ($win64); 1797 movaps -0xa8(%r9),%xmm6 1798 movaps -0x98(%r9),%xmm7 1799 movaps -0x88(%r9),%xmm8 1800 movaps -0x78(%r9),%xmm9 1801 movaps -0x68(%r9),%xmm10 1802 movaps -0x58(%r9),%xmm11 1803 movaps -0x48(%r9),%xmm12 1804 movaps -0x38(%r9),%xmm13 1805 movaps -0x28(%r9),%xmm14 1806 movaps -0x18(%r9),%xmm15 1807___ 1808$code.=<<___; 1809 lea (%r9),%rsp 1810.cfi_def_cfa_register %rsp 1811.L4xop_epilogue: 1812 ret 1813.cfi_endproc 1814.size ChaCha20_4xop,.-ChaCha20_4xop 1815___ 1816} 1817 1818######################################################################## 1819# AVX2 code path 1820if ($avx>1) { 1821my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, 1822 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); 1823my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 1824 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 1825 1826sub AVX2_lane_ROUND { 1827my ($a0,$b0,$c0,$d0)=@_; 1828my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 1829my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 1830my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 1831my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 1832my @x=map("\"$_\"",@xx); 1833 1834 # Consider order in which variables are addressed by their 1835 # index: 1836 # 1837 # a b c d 1838 # 1839 # 0 4 8 12 < even round 1840 # 1 5 9 13 1841 # 2 6 10 14 1842 # 3 7 11 15 1843 # 0 5 10 15 < odd round 1844 # 1 6 11 12 1845 # 2 7 8 13 1846 # 3 4 9 14 1847 # 1848 # 'a', 'b' and 'd's are permanently allocated in registers, 1849 # @x[0..7,12..15], while 'c's are maintained in memory. If 1850 # you observe 'c' column, you'll notice that pair of 'c's is 1851 # invariant between rounds. This means that we have to reload 1852 # them once per round, in the middle. This is why you'll see 1853 # bunch of 'c' stores and loads in the middle, but none in 1854 # the beginning or end. 
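	# Rotations are handled two different ways below: the 16- and 8-bit
	# rotates are done as byte shuffles (vpshufb with the .Lrot16 and
	# .Lrot24 masks kept at (%r10) and (%r11)), while the 12- and 7-bit
	# rotates have to be composed from vpslld/vpsrld/vpor, since AVX2
	# has no packed-rotate instruction.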
1855 1856 ( 1857 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 1858 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1859 "&vpshufb (@x[$d0],@x[$d0],$t1)", 1860 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 1861 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1862 "&vpshufb (@x[$d1],@x[$d1],$t1)", 1863 1864 "&vpaddd ($xc,$xc,@x[$d0])", 1865 "&vpxor (@x[$b0],$xc,@x[$b0])", 1866 "&vpslld ($t0,@x[$b0],12)", 1867 "&vpsrld (@x[$b0],@x[$b0],20)", 1868 "&vpor (@x[$b0],$t0,@x[$b0])", 1869 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1870 "&vpaddd ($xc_,$xc_,@x[$d1])", 1871 "&vpxor (@x[$b1],$xc_,@x[$b1])", 1872 "&vpslld ($t1,@x[$b1],12)", 1873 "&vpsrld (@x[$b1],@x[$b1],20)", 1874 "&vpor (@x[$b1],$t1,@x[$b1])", 1875 1876 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 1877 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1878 "&vpshufb (@x[$d0],@x[$d0],$t0)", 1879 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 1880 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1881 "&vpshufb (@x[$d1],@x[$d1],$t0)", 1882 1883 "&vpaddd ($xc,$xc,@x[$d0])", 1884 "&vpxor (@x[$b0],$xc,@x[$b0])", 1885 "&vpslld ($t1,@x[$b0],7)", 1886 "&vpsrld (@x[$b0],@x[$b0],25)", 1887 "&vpor (@x[$b0],$t1,@x[$b0])", 1888 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1889 "&vpaddd ($xc_,$xc_,@x[$d1])", 1890 "&vpxor (@x[$b1],$xc_,@x[$b1])", 1891 "&vpslld ($t0,@x[$b1],7)", 1892 "&vpsrld (@x[$b1],@x[$b1],25)", 1893 "&vpor (@x[$b1],$t0,@x[$b1])", 1894 1895 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 1896 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", 1897 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", 1898 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", 1899 1900 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 1901 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1902 "&vpshufb (@x[$d2],@x[$d2],$t1)", 1903 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 1904 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1905 "&vpshufb (@x[$d3],@x[$d3],$t1)", 1906 1907 "&vpaddd ($xc,$xc,@x[$d2])", 1908 "&vpxor (@x[$b2],$xc,@x[$b2])", 1909 "&vpslld ($t0,@x[$b2],12)", 1910 "&vpsrld (@x[$b2],@x[$b2],20)", 1911 "&vpor (@x[$b2],$t0,@x[$b2])", 1912 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1913 "&vpaddd ($xc_,$xc_,@x[$d3])", 1914 "&vpxor (@x[$b3],$xc_,@x[$b3])", 1915 "&vpslld ($t1,@x[$b3],12)", 1916 "&vpsrld (@x[$b3],@x[$b3],20)", 1917 "&vpor (@x[$b3],$t1,@x[$b3])", 1918 1919 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 1920 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1921 "&vpshufb (@x[$d2],@x[$d2],$t0)", 1922 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 1923 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1924 "&vpshufb (@x[$d3],@x[$d3],$t0)", 1925 1926 "&vpaddd ($xc,$xc,@x[$d2])", 1927 "&vpxor (@x[$b2],$xc,@x[$b2])", 1928 "&vpslld ($t1,@x[$b2],7)", 1929 "&vpsrld (@x[$b2],@x[$b2],25)", 1930 "&vpor (@x[$b2],$t1,@x[$b2])", 1931 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1932 "&vpaddd ($xc_,$xc_,@x[$d3])", 1933 "&vpxor (@x[$b3],$xc_,@x[$b3])", 1934 "&vpslld ($t0,@x[$b3],7)", 1935 "&vpsrld (@x[$b3],@x[$b3],25)", 1936 "&vpor (@x[$b3],$t0,@x[$b3])" 1937 ); 1938} 1939 1940my $xframe = $win64 ? 
0xa8 : 8; 1941 1942$code.=<<___; 1943.type ChaCha20_8x,\@function,5 1944.align 32 1945ChaCha20_8x: 1946.cfi_startproc 1947.LChaCha20_8x: 1948 mov %rsp,%r9 # frame register 1949.cfi_def_cfa_register %r9 1950 sub \$0x280+$xframe,%rsp 1951 and \$-32,%rsp 1952___ 1953$code.=<<___ if ($win64); 1954 movaps %xmm6,-0xa8(%r9) 1955 movaps %xmm7,-0x98(%r9) 1956 movaps %xmm8,-0x88(%r9) 1957 movaps %xmm9,-0x78(%r9) 1958 movaps %xmm10,-0x68(%r9) 1959 movaps %xmm11,-0x58(%r9) 1960 movaps %xmm12,-0x48(%r9) 1961 movaps %xmm13,-0x38(%r9) 1962 movaps %xmm14,-0x28(%r9) 1963 movaps %xmm15,-0x18(%r9) 1964.L8x_body: 1965___ 1966$code.=<<___; 1967 vzeroupper 1968 1969 ################ stack layout 1970 # +0x00 SIMD equivalent of @x[8-12] 1971 # ... 1972 # +0x80 constant copy of key[0-2] smashed by lanes 1973 # ... 1974 # +0x200 SIMD counters (with nonce smashed by lanes) 1975 # ... 1976 # +0x280 1977 1978 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] 1979 vbroadcasti128 ($key),$xb3 # key[1] 1980 vbroadcasti128 16($key),$xt3 # key[2] 1981 vbroadcasti128 ($counter),$xd3 # key[3] 1982 lea 0x100(%rsp),%rcx # size optimization 1983 lea 0x200(%rsp),%rax # size optimization 1984 lea .Lrot16(%rip),%r10 1985 lea .Lrot24(%rip),%r11 1986 1987 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1988 vpshufd \$0x55,$xa3,$xa1 1989 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload 1990 vpshufd \$0xaa,$xa3,$xa2 1991 vmovdqa $xa1,0xa0-0x100(%rcx) 1992 vpshufd \$0xff,$xa3,$xa3 1993 vmovdqa $xa2,0xc0-0x100(%rcx) 1994 vmovdqa $xa3,0xe0-0x100(%rcx) 1995 1996 vpshufd \$0x00,$xb3,$xb0 1997 vpshufd \$0x55,$xb3,$xb1 1998 vmovdqa $xb0,0x100-0x100(%rcx) 1999 vpshufd \$0xaa,$xb3,$xb2 2000 vmovdqa $xb1,0x120-0x100(%rcx) 2001 vpshufd \$0xff,$xb3,$xb3 2002 vmovdqa $xb2,0x140-0x100(%rcx) 2003 vmovdqa $xb3,0x160-0x100(%rcx) 2004 2005 vpshufd \$0x00,$xt3,$xt0 # "xc0" 2006 vpshufd \$0x55,$xt3,$xt1 # "xc1" 2007 vmovdqa $xt0,0x180-0x200(%rax) 2008 vpshufd \$0xaa,$xt3,$xt2 # "xc2" 2009 vmovdqa $xt1,0x1a0-0x200(%rax) 2010 vpshufd \$0xff,$xt3,$xt3 # "xc3" 2011 vmovdqa $xt2,0x1c0-0x200(%rax) 2012 vmovdqa $xt3,0x1e0-0x200(%rax) 2013 2014 vpshufd \$0x00,$xd3,$xd0 2015 vpshufd \$0x55,$xd3,$xd1 2016 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 2017 vpshufd \$0xaa,$xd3,$xd2 2018 vmovdqa $xd1,0x220-0x200(%rax) 2019 vpshufd \$0xff,$xd3,$xd3 2020 vmovdqa $xd2,0x240-0x200(%rax) 2021 vmovdqa $xd3,0x260-0x200(%rax) 2022 2023 jmp .Loop_enter8x 2024 2025.align 32 2026.Loop_outer8x: 2027 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key 2028 vmovdqa 0xa0-0x100(%rcx),$xa1 2029 vmovdqa 0xc0-0x100(%rcx),$xa2 2030 vmovdqa 0xe0-0x100(%rcx),$xa3 2031 vmovdqa 0x100-0x100(%rcx),$xb0 2032 vmovdqa 0x120-0x100(%rcx),$xb1 2033 vmovdqa 0x140-0x100(%rcx),$xb2 2034 vmovdqa 0x160-0x100(%rcx),$xb3 2035 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" 2036 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" 2037 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" 2038 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" 2039 vmovdqa 0x200-0x200(%rax),$xd0 2040 vmovdqa 0x220-0x200(%rax),$xd1 2041 vmovdqa 0x240-0x200(%rax),$xd2 2042 vmovdqa 0x260-0x200(%rax),$xd3 2043 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters 2044 2045.Loop_enter8x: 2046 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" 2047 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" 2048 vbroadcasti128 (%r10),$xt3 2049 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters 2050 mov \$10,%eax 2051 jmp .Loop8x 2052 2053.align 32 2054.Loop8x: 2055___ 2056 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } 2057 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } 
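# The two foreach loops above emit the body of .Loop8x: one "column" round
# on lanes (0,4,8,12) plus one "diagonal" round on lanes (0,5,10,15) per
# iteration. With %eax counting down from 10, that amounts to the 20 rounds
# of ChaCha20 before the key material is added back in below.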
2058$code.=<<___; 2059 dec %eax 2060 jnz .Loop8x 2061 2062 lea 0x200(%rsp),%rax # size optimization 2063 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key 2064 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 2065 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 2066 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 2067 2068 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 2069 vpunpckldq $xa3,$xa2,$xt3 2070 vpunpckhdq $xa1,$xa0,$xa0 2071 vpunpckhdq $xa3,$xa2,$xa2 2072 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 2073 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 2074 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 2075 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 2076___ 2077 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 2078$code.=<<___; 2079 vpaddd 0x100-0x100(%rcx),$xb0,$xb0 2080 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 2081 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 2082 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 2083 2084 vpunpckldq $xb1,$xb0,$xt2 2085 vpunpckldq $xb3,$xb2,$xt3 2086 vpunpckhdq $xb1,$xb0,$xb0 2087 vpunpckhdq $xb3,$xb2,$xb2 2088 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 2089 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 2090 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 2091 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 2092___ 2093 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 2094$code.=<<___; 2095 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further 2096 vperm2i128 \$0x31,$xb0,$xa0,$xb0 2097 vperm2i128 \$0x20,$xb1,$xa1,$xa0 2098 vperm2i128 \$0x31,$xb1,$xa1,$xb1 2099 vperm2i128 \$0x20,$xb2,$xa2,$xa1 2100 vperm2i128 \$0x31,$xb2,$xa2,$xb2 2101 vperm2i128 \$0x20,$xb3,$xa3,$xa2 2102 vperm2i128 \$0x31,$xb3,$xa3,$xb3 2103___ 2104 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 2105 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 2106$code.=<<___; 2107 vmovdqa $xa0,0x00(%rsp) # offload $xaN 2108 vmovdqa $xa1,0x20(%rsp) 2109 vmovdqa 0x40(%rsp),$xc2 # $xa0 2110 vmovdqa 0x60(%rsp),$xc3 # $xa1 2111 2112 vpaddd 0x180-0x200(%rax),$xc0,$xc0 2113 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 2114 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 2115 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 2116 2117 vpunpckldq $xc1,$xc0,$xt2 2118 vpunpckldq $xc3,$xc2,$xt3 2119 vpunpckhdq $xc1,$xc0,$xc0 2120 vpunpckhdq $xc3,$xc2,$xc2 2121 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 2122 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 2123 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 2124 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 2125___ 2126 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 2127$code.=<<___; 2128 vpaddd 0x200-0x200(%rax),$xd0,$xd0 2129 vpaddd 0x220-0x200(%rax),$xd1,$xd1 2130 vpaddd 0x240-0x200(%rax),$xd2,$xd2 2131 vpaddd 0x260-0x200(%rax),$xd3,$xd3 2132 2133 vpunpckldq $xd1,$xd0,$xt2 2134 vpunpckldq $xd3,$xd2,$xt3 2135 vpunpckhdq $xd1,$xd0,$xd0 2136 vpunpckhdq $xd3,$xd2,$xd2 2137 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 2138 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 2139 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 2140 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 2141___ 2142 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 2143$code.=<<___; 2144 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 2145 vperm2i128 \$0x31,$xd0,$xc0,$xd0 2146 vperm2i128 \$0x20,$xd1,$xc1,$xc0 2147 vperm2i128 \$0x31,$xd1,$xc1,$xd1 2148 vperm2i128 \$0x20,$xd2,$xc2,$xc1 2149 vperm2i128 \$0x31,$xd2,$xc2,$xd2 2150 vperm2i128 \$0x20,$xd3,$xc3,$xc2 2151 vperm2i128 \$0x31,$xd3,$xc3,$xd3 2152___ 2153 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 2154 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 2155 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 2156 ($xa0,$xa1)=($xt2,$xt3); 2157$code.=<<___; 2158 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
2159 vmovdqa 0x20(%rsp),$xa1 2160 2161 cmp \$64*8,$len 2162 jb .Ltail8x 2163 2164 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2165 vpxor 0x20($inp),$xb0,$xb0 2166 vpxor 0x40($inp),$xc0,$xc0 2167 vpxor 0x60($inp),$xd0,$xd0 2168 lea 0x80($inp),$inp # size optimization 2169 vmovdqu $xa0,0x00($out) 2170 vmovdqu $xb0,0x20($out) 2171 vmovdqu $xc0,0x40($out) 2172 vmovdqu $xd0,0x60($out) 2173 lea 0x80($out),$out # size optimization 2174 2175 vpxor 0x00($inp),$xa1,$xa1 2176 vpxor 0x20($inp),$xb1,$xb1 2177 vpxor 0x40($inp),$xc1,$xc1 2178 vpxor 0x60($inp),$xd1,$xd1 2179 lea 0x80($inp),$inp # size optimization 2180 vmovdqu $xa1,0x00($out) 2181 vmovdqu $xb1,0x20($out) 2182 vmovdqu $xc1,0x40($out) 2183 vmovdqu $xd1,0x60($out) 2184 lea 0x80($out),$out # size optimization 2185 2186 vpxor 0x00($inp),$xa2,$xa2 2187 vpxor 0x20($inp),$xb2,$xb2 2188 vpxor 0x40($inp),$xc2,$xc2 2189 vpxor 0x60($inp),$xd2,$xd2 2190 lea 0x80($inp),$inp # size optimization 2191 vmovdqu $xa2,0x00($out) 2192 vmovdqu $xb2,0x20($out) 2193 vmovdqu $xc2,0x40($out) 2194 vmovdqu $xd2,0x60($out) 2195 lea 0x80($out),$out # size optimization 2196 2197 vpxor 0x00($inp),$xa3,$xa3 2198 vpxor 0x20($inp),$xb3,$xb3 2199 vpxor 0x40($inp),$xc3,$xc3 2200 vpxor 0x60($inp),$xd3,$xd3 2201 lea 0x80($inp),$inp # size optimization 2202 vmovdqu $xa3,0x00($out) 2203 vmovdqu $xb3,0x20($out) 2204 vmovdqu $xc3,0x40($out) 2205 vmovdqu $xd3,0x60($out) 2206 lea 0x80($out),$out # size optimization 2207 2208 sub \$64*8,$len 2209 jnz .Loop_outer8x 2210 2211 jmp .Ldone8x 2212 2213.Ltail8x: 2214 cmp \$448,$len 2215 jae .L448_or_more8x 2216 cmp \$384,$len 2217 jae .L384_or_more8x 2218 cmp \$320,$len 2219 jae .L320_or_more8x 2220 cmp \$256,$len 2221 jae .L256_or_more8x 2222 cmp \$192,$len 2223 jae .L192_or_more8x 2224 cmp \$128,$len 2225 jae .L128_or_more8x 2226 cmp \$64,$len 2227 jae .L64_or_more8x 2228 2229 xor %r10,%r10 2230 vmovdqa $xa0,0x00(%rsp) 2231 vmovdqa $xb0,0x20(%rsp) 2232 jmp .Loop_tail8x 2233 2234.align 32 2235.L64_or_more8x: 2236 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2237 vpxor 0x20($inp),$xb0,$xb0 2238 vmovdqu $xa0,0x00($out) 2239 vmovdqu $xb0,0x20($out) 2240 je .Ldone8x 2241 2242 lea 0x40($inp),$inp # inp+=64*1 2243 xor %r10,%r10 2244 vmovdqa $xc0,0x00(%rsp) 2245 lea 0x40($out),$out # out+=64*1 2246 sub \$64,$len # len-=64*1 2247 vmovdqa $xd0,0x20(%rsp) 2248 jmp .Loop_tail8x 2249 2250.align 32 2251.L128_or_more8x: 2252 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2253 vpxor 0x20($inp),$xb0,$xb0 2254 vpxor 0x40($inp),$xc0,$xc0 2255 vpxor 0x60($inp),$xd0,$xd0 2256 vmovdqu $xa0,0x00($out) 2257 vmovdqu $xb0,0x20($out) 2258 vmovdqu $xc0,0x40($out) 2259 vmovdqu $xd0,0x60($out) 2260 je .Ldone8x 2261 2262 lea 0x80($inp),$inp # inp+=64*2 2263 xor %r10,%r10 2264 vmovdqa $xa1,0x00(%rsp) 2265 lea 0x80($out),$out # out+=64*2 2266 sub \$128,$len # len-=64*2 2267 vmovdqa $xb1,0x20(%rsp) 2268 jmp .Loop_tail8x 2269 2270.align 32 2271.L192_or_more8x: 2272 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2273 vpxor 0x20($inp),$xb0,$xb0 2274 vpxor 0x40($inp),$xc0,$xc0 2275 vpxor 0x60($inp),$xd0,$xd0 2276 vpxor 0x80($inp),$xa1,$xa1 2277 vpxor 0xa0($inp),$xb1,$xb1 2278 vmovdqu $xa0,0x00($out) 2279 vmovdqu $xb0,0x20($out) 2280 vmovdqu $xc0,0x40($out) 2281 vmovdqu $xd0,0x60($out) 2282 vmovdqu $xa1,0x80($out) 2283 vmovdqu $xb1,0xa0($out) 2284 je .Ldone8x 2285 2286 lea 0xc0($inp),$inp # inp+=64*3 2287 xor %r10,%r10 2288 vmovdqa $xc1,0x00(%rsp) 2289 lea 0xc0($out),$out # out+=64*3 2290 sub \$192,$len # len-=64*3 2291 vmovdqa $xd1,0x20(%rsp) 2292 jmp .Loop_tail8x 2293 2294.align 32 
2295.L256_or_more8x: 2296 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2297 vpxor 0x20($inp),$xb0,$xb0 2298 vpxor 0x40($inp),$xc0,$xc0 2299 vpxor 0x60($inp),$xd0,$xd0 2300 vpxor 0x80($inp),$xa1,$xa1 2301 vpxor 0xa0($inp),$xb1,$xb1 2302 vpxor 0xc0($inp),$xc1,$xc1 2303 vpxor 0xe0($inp),$xd1,$xd1 2304 vmovdqu $xa0,0x00($out) 2305 vmovdqu $xb0,0x20($out) 2306 vmovdqu $xc0,0x40($out) 2307 vmovdqu $xd0,0x60($out) 2308 vmovdqu $xa1,0x80($out) 2309 vmovdqu $xb1,0xa0($out) 2310 vmovdqu $xc1,0xc0($out) 2311 vmovdqu $xd1,0xe0($out) 2312 je .Ldone8x 2313 2314 lea 0x100($inp),$inp # inp+=64*4 2315 xor %r10,%r10 2316 vmovdqa $xa2,0x00(%rsp) 2317 lea 0x100($out),$out # out+=64*4 2318 sub \$256,$len # len-=64*4 2319 vmovdqa $xb2,0x20(%rsp) 2320 jmp .Loop_tail8x 2321 2322.align 32 2323.L320_or_more8x: 2324 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2325 vpxor 0x20($inp),$xb0,$xb0 2326 vpxor 0x40($inp),$xc0,$xc0 2327 vpxor 0x60($inp),$xd0,$xd0 2328 vpxor 0x80($inp),$xa1,$xa1 2329 vpxor 0xa0($inp),$xb1,$xb1 2330 vpxor 0xc0($inp),$xc1,$xc1 2331 vpxor 0xe0($inp),$xd1,$xd1 2332 vpxor 0x100($inp),$xa2,$xa2 2333 vpxor 0x120($inp),$xb2,$xb2 2334 vmovdqu $xa0,0x00($out) 2335 vmovdqu $xb0,0x20($out) 2336 vmovdqu $xc0,0x40($out) 2337 vmovdqu $xd0,0x60($out) 2338 vmovdqu $xa1,0x80($out) 2339 vmovdqu $xb1,0xa0($out) 2340 vmovdqu $xc1,0xc0($out) 2341 vmovdqu $xd1,0xe0($out) 2342 vmovdqu $xa2,0x100($out) 2343 vmovdqu $xb2,0x120($out) 2344 je .Ldone8x 2345 2346 lea 0x140($inp),$inp # inp+=64*5 2347 xor %r10,%r10 2348 vmovdqa $xc2,0x00(%rsp) 2349 lea 0x140($out),$out # out+=64*5 2350 sub \$320,$len # len-=64*5 2351 vmovdqa $xd2,0x20(%rsp) 2352 jmp .Loop_tail8x 2353 2354.align 32 2355.L384_or_more8x: 2356 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2357 vpxor 0x20($inp),$xb0,$xb0 2358 vpxor 0x40($inp),$xc0,$xc0 2359 vpxor 0x60($inp),$xd0,$xd0 2360 vpxor 0x80($inp),$xa1,$xa1 2361 vpxor 0xa0($inp),$xb1,$xb1 2362 vpxor 0xc0($inp),$xc1,$xc1 2363 vpxor 0xe0($inp),$xd1,$xd1 2364 vpxor 0x100($inp),$xa2,$xa2 2365 vpxor 0x120($inp),$xb2,$xb2 2366 vpxor 0x140($inp),$xc2,$xc2 2367 vpxor 0x160($inp),$xd2,$xd2 2368 vmovdqu $xa0,0x00($out) 2369 vmovdqu $xb0,0x20($out) 2370 vmovdqu $xc0,0x40($out) 2371 vmovdqu $xd0,0x60($out) 2372 vmovdqu $xa1,0x80($out) 2373 vmovdqu $xb1,0xa0($out) 2374 vmovdqu $xc1,0xc0($out) 2375 vmovdqu $xd1,0xe0($out) 2376 vmovdqu $xa2,0x100($out) 2377 vmovdqu $xb2,0x120($out) 2378 vmovdqu $xc2,0x140($out) 2379 vmovdqu $xd2,0x160($out) 2380 je .Ldone8x 2381 2382 lea 0x180($inp),$inp # inp+=64*6 2383 xor %r10,%r10 2384 vmovdqa $xa3,0x00(%rsp) 2385 lea 0x180($out),$out # out+=64*6 2386 sub \$384,$len # len-=64*6 2387 vmovdqa $xb3,0x20(%rsp) 2388 jmp .Loop_tail8x 2389 2390.align 32 2391.L448_or_more8x: 2392 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2393 vpxor 0x20($inp),$xb0,$xb0 2394 vpxor 0x40($inp),$xc0,$xc0 2395 vpxor 0x60($inp),$xd0,$xd0 2396 vpxor 0x80($inp),$xa1,$xa1 2397 vpxor 0xa0($inp),$xb1,$xb1 2398 vpxor 0xc0($inp),$xc1,$xc1 2399 vpxor 0xe0($inp),$xd1,$xd1 2400 vpxor 0x100($inp),$xa2,$xa2 2401 vpxor 0x120($inp),$xb2,$xb2 2402 vpxor 0x140($inp),$xc2,$xc2 2403 vpxor 0x160($inp),$xd2,$xd2 2404 vpxor 0x180($inp),$xa3,$xa3 2405 vpxor 0x1a0($inp),$xb3,$xb3 2406 vmovdqu $xa0,0x00($out) 2407 vmovdqu $xb0,0x20($out) 2408 vmovdqu $xc0,0x40($out) 2409 vmovdqu $xd0,0x60($out) 2410 vmovdqu $xa1,0x80($out) 2411 vmovdqu $xb1,0xa0($out) 2412 vmovdqu $xc1,0xc0($out) 2413 vmovdqu $xd1,0xe0($out) 2414 vmovdqu $xa2,0x100($out) 2415 vmovdqu $xb2,0x120($out) 2416 vmovdqu $xc2,0x140($out) 2417 vmovdqu $xd2,0x160($out) 2418 
vmovdqu $xa3,0x180($out) 2419 vmovdqu $xb3,0x1a0($out) 2420 je .Ldone8x 2421 2422 lea 0x1c0($inp),$inp # inp+=64*7 2423 xor %r10,%r10 2424 vmovdqa $xc3,0x00(%rsp) 2425 lea 0x1c0($out),$out # out+=64*7 2426 sub \$448,$len # len-=64*7 2427 vmovdqa $xd3,0x20(%rsp) 2428 2429.Loop_tail8x: 2430 movzb ($inp,%r10),%eax 2431 movzb (%rsp,%r10),%ecx 2432 lea 1(%r10),%r10 2433 xor %ecx,%eax 2434 mov %al,-1($out,%r10) 2435 dec $len 2436 jnz .Loop_tail8x 2437 2438.Ldone8x: 2439 vzeroall 2440___ 2441$code.=<<___ if ($win64); 2442 movaps -0xa8(%r9),%xmm6 2443 movaps -0x98(%r9),%xmm7 2444 movaps -0x88(%r9),%xmm8 2445 movaps -0x78(%r9),%xmm9 2446 movaps -0x68(%r9),%xmm10 2447 movaps -0x58(%r9),%xmm11 2448 movaps -0x48(%r9),%xmm12 2449 movaps -0x38(%r9),%xmm13 2450 movaps -0x28(%r9),%xmm14 2451 movaps -0x18(%r9),%xmm15 2452___ 2453$code.=<<___; 2454 lea (%r9),%rsp 2455.cfi_def_cfa_register %rsp 2456.L8x_epilogue: 2457 ret 2458.cfi_endproc 2459.size ChaCha20_8x,.-ChaCha20_8x 2460___ 2461} 2462 2463######################################################################## 2464# AVX512 code paths 2465if ($avx>2) { 2466# This one handles shorter inputs... 2467 2468my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); 2469my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 2470 2471sub vpxord() # size optimization 2472{ my $opcode = "vpxor"; # adhere to vpxor when possible 2473 2474 foreach (@_) { 2475 if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { 2476 $opcode = "vpxord"; 2477 last; 2478 } 2479 } 2480 2481 $code .= "\t$opcode\t".join(',',reverse @_)."\n"; 2482} 2483 2484sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round 2485 &vpaddd ($a,$a,$b); 2486 &vpxord ($d,$d,$a); 2487 &vprold ($d,$d,16); 2488 2489 &vpaddd ($c,$c,$d); 2490 &vpxord ($b,$b,$c); 2491 &vprold ($b,$b,12); 2492 2493 &vpaddd ($a,$a,$b); 2494 &vpxord ($d,$d,$a); 2495 &vprold ($d,$d,8); 2496 2497 &vpaddd ($c,$c,$d); 2498 &vpxord ($b,$b,$c); 2499 &vprold ($b,$b,7); 2500} 2501 2502my $xframe = $win64 ? 
32+8 : 8; 2503 2504$code.=<<___; 2505.type ChaCha20_avx512,\@function,5 2506.align 32 2507ChaCha20_avx512: 2508.cfi_startproc 2509.LChaCha20_avx512: 2510 mov %rsp,%r9 # frame pointer 2511.cfi_def_cfa_register %r9 2512 cmp \$512,$len 2513 ja .LChaCha20_16x 2514 2515 sub \$64+$xframe,%rsp 2516___ 2517$code.=<<___ if ($win64); 2518 movaps %xmm6,-0x28(%r9) 2519 movaps %xmm7,-0x18(%r9) 2520.Lavx512_body: 2521___ 2522$code.=<<___; 2523 vbroadcasti32x4 .Lsigma(%rip),$a 2524 vbroadcasti32x4 ($key),$b 2525 vbroadcasti32x4 16($key),$c 2526 vbroadcasti32x4 ($counter),$d 2527 2528 vmovdqa32 $a,$a_ 2529 vmovdqa32 $b,$b_ 2530 vmovdqa32 $c,$c_ 2531 vpaddd .Lzeroz(%rip),$d,$d 2532 vmovdqa32 .Lfourz(%rip),$fourz 2533 mov \$10,$counter # reuse $counter 2534 vmovdqa32 $d,$d_ 2535 jmp .Loop_avx512 2536 2537.align 16 2538.Loop_outer_avx512: 2539 vmovdqa32 $a_,$a 2540 vmovdqa32 $b_,$b 2541 vmovdqa32 $c_,$c 2542 vpaddd $fourz,$d_,$d 2543 mov \$10,$counter 2544 vmovdqa32 $d,$d_ 2545 jmp .Loop_avx512 2546 2547.align 32 2548.Loop_avx512: 2549___ 2550 &AVX512ROUND(); 2551 &vpshufd ($c,$c,0b01001110); 2552 &vpshufd ($b,$b,0b00111001); 2553 &vpshufd ($d,$d,0b10010011); 2554 2555 &AVX512ROUND(); 2556 &vpshufd ($c,$c,0b01001110); 2557 &vpshufd ($b,$b,0b10010011); 2558 &vpshufd ($d,$d,0b00111001); 2559 2560 &dec ($counter); 2561 &jnz (".Loop_avx512"); 2562 2563$code.=<<___; 2564 vpaddd $a_,$a,$a 2565 vpaddd $b_,$b,$b 2566 vpaddd $c_,$c,$c 2567 vpaddd $d_,$d,$d 2568 2569 sub \$64,$len 2570 jb .Ltail64_avx512 2571 2572 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2573 vpxor 0x10($inp),%x#$b,$t1 2574 vpxor 0x20($inp),%x#$c,$t2 2575 vpxor 0x30($inp),%x#$d,$t3 2576 lea 0x40($inp),$inp # inp+=64 2577 2578 vmovdqu $t0,0x00($out) # write output 2579 vmovdqu $t1,0x10($out) 2580 vmovdqu $t2,0x20($out) 2581 vmovdqu $t3,0x30($out) 2582 lea 0x40($out),$out # out+=64 2583 2584 jz .Ldone_avx512 2585 2586 vextracti32x4 \$1,$a,$t0 2587 vextracti32x4 \$1,$b,$t1 2588 vextracti32x4 \$1,$c,$t2 2589 vextracti32x4 \$1,$d,$t3 2590 2591 sub \$64,$len 2592 jb .Ltail_avx512 2593 2594 vpxor 0x00($inp),$t0,$t0 # xor with input 2595 vpxor 0x10($inp),$t1,$t1 2596 vpxor 0x20($inp),$t2,$t2 2597 vpxor 0x30($inp),$t3,$t3 2598 lea 0x40($inp),$inp # inp+=64 2599 2600 vmovdqu $t0,0x00($out) # write output 2601 vmovdqu $t1,0x10($out) 2602 vmovdqu $t2,0x20($out) 2603 vmovdqu $t3,0x30($out) 2604 lea 0x40($out),$out # out+=64 2605 2606 jz .Ldone_avx512 2607 2608 vextracti32x4 \$2,$a,$t0 2609 vextracti32x4 \$2,$b,$t1 2610 vextracti32x4 \$2,$c,$t2 2611 vextracti32x4 \$2,$d,$t3 2612 2613 sub \$64,$len 2614 jb .Ltail_avx512 2615 2616 vpxor 0x00($inp),$t0,$t0 # xor with input 2617 vpxor 0x10($inp),$t1,$t1 2618 vpxor 0x20($inp),$t2,$t2 2619 vpxor 0x30($inp),$t3,$t3 2620 lea 0x40($inp),$inp # inp+=64 2621 2622 vmovdqu $t0,0x00($out) # write output 2623 vmovdqu $t1,0x10($out) 2624 vmovdqu $t2,0x20($out) 2625 vmovdqu $t3,0x30($out) 2626 lea 0x40($out),$out # out+=64 2627 2628 jz .Ldone_avx512 2629 2630 vextracti32x4 \$3,$a,$t0 2631 vextracti32x4 \$3,$b,$t1 2632 vextracti32x4 \$3,$c,$t2 2633 vextracti32x4 \$3,$d,$t3 2634 2635 sub \$64,$len 2636 jb .Ltail_avx512 2637 2638 vpxor 0x00($inp),$t0,$t0 # xor with input 2639 vpxor 0x10($inp),$t1,$t1 2640 vpxor 0x20($inp),$t2,$t2 2641 vpxor 0x30($inp),$t3,$t3 2642 lea 0x40($inp),$inp # inp+=64 2643 2644 vmovdqu $t0,0x00($out) # write output 2645 vmovdqu $t1,0x10($out) 2646 vmovdqu $t2,0x20($out) 2647 vmovdqu $t3,0x30($out) 2648 lea 0x40($out),$out # out+=64 2649 2650 jnz .Loop_outer_avx512 2651 2652 jmp .Ldone_avx512 2653 
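
################################################################
# Tail handling: $len has already been decremented by 64 above,
# so at this point it is negative by the number of bytes still
# missing.  The not-yet-consumed 64 bytes of keystream are
# parked on the stack, \$64 is added back to $len, and
# .Loop_tail_avx512 XORs the remaining input one byte at a
# time, indexing with $counter, which is zero after the round
# loop.  The keystream copy on the stack is overwritten with
# constants before returning.
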
2654.align 16 2655.Ltail64_avx512: 2656 vmovdqa %x#$a,0x00(%rsp) 2657 vmovdqa %x#$b,0x10(%rsp) 2658 vmovdqa %x#$c,0x20(%rsp) 2659 vmovdqa %x#$d,0x30(%rsp) 2660 add \$64,$len 2661 jmp .Loop_tail_avx512 2662 2663.align 16 2664.Ltail_avx512: 2665 vmovdqa $t0,0x00(%rsp) 2666 vmovdqa $t1,0x10(%rsp) 2667 vmovdqa $t2,0x20(%rsp) 2668 vmovdqa $t3,0x30(%rsp) 2669 add \$64,$len 2670 2671.Loop_tail_avx512: 2672 movzb ($inp,$counter),%eax 2673 movzb (%rsp,$counter),%ecx 2674 lea 1($counter),$counter 2675 xor %ecx,%eax 2676 mov %al,-1($out,$counter) 2677 dec $len 2678 jnz .Loop_tail_avx512 2679 2680 vmovdqu32 $a_,0x00(%rsp) 2681 2682.Ldone_avx512: 2683 vzeroall 2684___ 2685$code.=<<___ if ($win64); 2686 movaps -0x28(%r9),%xmm6 2687 movaps -0x18(%r9),%xmm7 2688___ 2689$code.=<<___; 2690 lea (%r9),%rsp 2691.cfi_def_cfa_register %rsp 2692.Lavx512_epilogue: 2693 ret 2694.cfi_endproc 2695.size ChaCha20_avx512,.-ChaCha20_avx512 2696___ 2697 2698map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); 2699 2700$code.=<<___; 2701.type ChaCha20_avx512vl,\@function,5 2702.align 32 2703ChaCha20_avx512vl: 2704.cfi_startproc 2705.LChaCha20_avx512vl: 2706 mov %rsp,%r9 # frame pointer 2707.cfi_def_cfa_register %r9 2708 cmp \$128,$len 2709 ja .LChaCha20_8xvl 2710 2711 sub \$64+$xframe,%rsp 2712___ 2713$code.=<<___ if ($win64); 2714 movaps %xmm6,-0x28(%r9) 2715 movaps %xmm7,-0x18(%r9) 2716.Lavx512vl_body: 2717___ 2718$code.=<<___; 2719 vbroadcasti128 .Lsigma(%rip),$a 2720 vbroadcasti128 ($key),$b 2721 vbroadcasti128 16($key),$c 2722 vbroadcasti128 ($counter),$d 2723 2724 vmovdqa32 $a,$a_ 2725 vmovdqa32 $b,$b_ 2726 vmovdqa32 $c,$c_ 2727 vpaddd .Lzeroz(%rip),$d,$d 2728 vmovdqa32 .Ltwoy(%rip),$fourz 2729 mov \$10,$counter # reuse $counter 2730 vmovdqa32 $d,$d_ 2731 jmp .Loop_avx512vl 2732 2733.align 16 2734.Loop_outer_avx512vl: 2735 vmovdqa32 $c_,$c 2736 vpaddd $fourz,$d_,$d 2737 mov \$10,$counter 2738 vmovdqa32 $d,$d_ 2739 jmp .Loop_avx512vl 2740 2741.align 32 2742.Loop_avx512vl: 2743___ 2744 &AVX512ROUND(); 2745 &vpshufd ($c,$c,0b01001110); 2746 &vpshufd ($b,$b,0b00111001); 2747 &vpshufd ($d,$d,0b10010011); 2748 2749 &AVX512ROUND(); 2750 &vpshufd ($c,$c,0b01001110); 2751 &vpshufd ($b,$b,0b10010011); 2752 &vpshufd ($d,$d,0b00111001); 2753 2754 &dec ($counter); 2755 &jnz (".Loop_avx512vl"); 2756 2757$code.=<<___; 2758 vpaddd $a_,$a,$a 2759 vpaddd $b_,$b,$b 2760 vpaddd $c_,$c,$c 2761 vpaddd $d_,$d,$d 2762 2763 sub \$64,$len 2764 jb .Ltail64_avx512vl 2765 2766 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2767 vpxor 0x10($inp),%x#$b,$t1 2768 vpxor 0x20($inp),%x#$c,$t2 2769 vpxor 0x30($inp),%x#$d,$t3 2770 lea 0x40($inp),$inp # inp+=64 2771 2772 vmovdqu $t0,0x00($out) # write output 2773 vmovdqu $t1,0x10($out) 2774 vmovdqu $t2,0x20($out) 2775 vmovdqu $t3,0x30($out) 2776 lea 0x40($out),$out # out+=64 2777 2778 jz .Ldone_avx512vl 2779 2780 vextracti128 \$1,$a,$t0 2781 vextracti128 \$1,$b,$t1 2782 vextracti128 \$1,$c,$t2 2783 vextracti128 \$1,$d,$t3 2784 2785 sub \$64,$len 2786 jb .Ltail_avx512vl 2787 2788 vpxor 0x00($inp),$t0,$t0 # xor with input 2789 vpxor 0x10($inp),$t1,$t1 2790 vpxor 0x20($inp),$t2,$t2 2791 vpxor 0x30($inp),$t3,$t3 2792 lea 0x40($inp),$inp # inp+=64 2793 2794 vmovdqu $t0,0x00($out) # write output 2795 vmovdqu $t1,0x10($out) 2796 vmovdqu $t2,0x20($out) 2797 vmovdqu $t3,0x30($out) 2798 lea 0x40($out),$out # out+=64 2799 2800 vmovdqa32 $a_,$a 2801 vmovdqa32 $b_,$b 2802 jnz .Loop_outer_avx512vl 2803 2804 jmp .Ldone_avx512vl 2805 2806.align 16 2807.Ltail64_avx512vl: 2808 vmovdqa %x#$a,0x00(%rsp) 2809 vmovdqa 
%x#$b,0x10(%rsp) 2810 vmovdqa %x#$c,0x20(%rsp) 2811 vmovdqa %x#$d,0x30(%rsp) 2812 add \$64,$len 2813 jmp .Loop_tail_avx512vl 2814 2815.align 16 2816.Ltail_avx512vl: 2817 vmovdqa $t0,0x00(%rsp) 2818 vmovdqa $t1,0x10(%rsp) 2819 vmovdqa $t2,0x20(%rsp) 2820 vmovdqa $t3,0x30(%rsp) 2821 add \$64,$len 2822 2823.Loop_tail_avx512vl: 2824 movzb ($inp,$counter),%eax 2825 movzb (%rsp,$counter),%ecx 2826 lea 1($counter),$counter 2827 xor %ecx,%eax 2828 mov %al,-1($out,$counter) 2829 dec $len 2830 jnz .Loop_tail_avx512vl 2831 2832 vmovdqu32 $a_,0x00(%rsp) 2833 vmovdqu32 $a_,0x20(%rsp) 2834 2835.Ldone_avx512vl: 2836 vzeroall 2837___ 2838$code.=<<___ if ($win64); 2839 movaps -0x28(%r9),%xmm6 2840 movaps -0x18(%r9),%xmm7 2841___ 2842$code.=<<___; 2843 lea (%r9),%rsp 2844.cfi_def_cfa_register %rsp 2845.Lavx512vl_epilogue: 2846 ret 2847.cfi_endproc 2848.size ChaCha20_avx512vl,.-ChaCha20_avx512vl 2849___ 2850} 2851if ($avx>2) { 2852# This one handles longer inputs... 2853 2854my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2855 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); 2856my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2857 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 2858my @key=map("%zmm$_",(16..31)); 2859my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 2860 2861sub AVX512_lane_ROUND { 2862my ($a0,$b0,$c0,$d0)=@_; 2863my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 2864my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 2865my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 2866my @x=map("\"$_\"",@xx); 2867 2868 ( 2869 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 2870 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 2871 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 2872 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 2873 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2874 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2875 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2876 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2877 "&vprold (@x[$d0],@x[$d0],16)", 2878 "&vprold (@x[$d1],@x[$d1],16)", 2879 "&vprold (@x[$d2],@x[$d2],16)", 2880 "&vprold (@x[$d3],@x[$d3],16)", 2881 2882 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2883 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2884 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2885 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2886 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2887 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2888 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2889 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2890 "&vprold (@x[$b0],@x[$b0],12)", 2891 "&vprold (@x[$b1],@x[$b1],12)", 2892 "&vprold (@x[$b2],@x[$b2],12)", 2893 "&vprold (@x[$b3],@x[$b3],12)", 2894 2895 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 2896 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 2897 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 2898 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 2899 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2900 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2901 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2902 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2903 "&vprold (@x[$d0],@x[$d0],8)", 2904 "&vprold (@x[$d1],@x[$d1],8)", 2905 "&vprold (@x[$d2],@x[$d2],8)", 2906 "&vprold (@x[$d3],@x[$d3],8)", 2907 2908 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2909 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2910 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2911 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2912 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2913 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2914 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2915 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2916 "&vprold (@x[$b0],@x[$b0],7)", 2917 "&vprold (@x[$b1],@x[$b1],7)", 2918 "&vprold (@x[$b2],@x[$b2],7)", 2919 "&vprold (@x[$b3],@x[$b3],7)" 2920 ); 2921} 2922 
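
# Note that on AVX512 every rotation above is a single vprold, both here and
# in AVX512ROUND, whereas the AVX2 path needs either a byte shuffle (through
# .Lrot16/.Lrot24, for the 16- and 8-bit rotations) or a vpslld/vpsrld/vpor
# triplet (for the 12- and 7-bit ones).  Schematically, per 32-bit lane:
#
#	t = b << 12; b >>= 20; b |= t;		# AVX2, three instructions
#	b = ROL32(b,12);			# AVX512, one vprold
#
# which helps keep the per-round critical path at the 14 "SIMD ticks"
# quoted at AVX512ROUND.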
2923my $xframe = $win64 ? 0xa8 : 8; 2924 2925$code.=<<___; 2926.type ChaCha20_16x,\@function,5 2927.align 32 2928ChaCha20_16x: 2929.cfi_startproc 2930.LChaCha20_16x: 2931 mov %rsp,%r9 # frame register 2932.cfi_def_cfa_register %r9 2933 sub \$64+$xframe,%rsp 2934 and \$-64,%rsp 2935___ 2936$code.=<<___ if ($win64); 2937 movaps %xmm6,-0xa8(%r9) 2938 movaps %xmm7,-0x98(%r9) 2939 movaps %xmm8,-0x88(%r9) 2940 movaps %xmm9,-0x78(%r9) 2941 movaps %xmm10,-0x68(%r9) 2942 movaps %xmm11,-0x58(%r9) 2943 movaps %xmm12,-0x48(%r9) 2944 movaps %xmm13,-0x38(%r9) 2945 movaps %xmm14,-0x28(%r9) 2946 movaps %xmm15,-0x18(%r9) 2947.L16x_body: 2948___ 2949$code.=<<___; 2950 vzeroupper 2951 2952 lea .Lsigma(%rip),%r10 2953 vbroadcasti32x4 (%r10),$xa3 # key[0] 2954 vbroadcasti32x4 ($key),$xb3 # key[1] 2955 vbroadcasti32x4 16($key),$xc3 # key[2] 2956 vbroadcasti32x4 ($counter),$xd3 # key[3] 2957 2958 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 2959 vpshufd \$0x55,$xa3,$xa1 2960 vpshufd \$0xaa,$xa3,$xa2 2961 vpshufd \$0xff,$xa3,$xa3 2962 vmovdqa64 $xa0,@key[0] 2963 vmovdqa64 $xa1,@key[1] 2964 vmovdqa64 $xa2,@key[2] 2965 vmovdqa64 $xa3,@key[3] 2966 2967 vpshufd \$0x00,$xb3,$xb0 2968 vpshufd \$0x55,$xb3,$xb1 2969 vpshufd \$0xaa,$xb3,$xb2 2970 vpshufd \$0xff,$xb3,$xb3 2971 vmovdqa64 $xb0,@key[4] 2972 vmovdqa64 $xb1,@key[5] 2973 vmovdqa64 $xb2,@key[6] 2974 vmovdqa64 $xb3,@key[7] 2975 2976 vpshufd \$0x00,$xc3,$xc0 2977 vpshufd \$0x55,$xc3,$xc1 2978 vpshufd \$0xaa,$xc3,$xc2 2979 vpshufd \$0xff,$xc3,$xc3 2980 vmovdqa64 $xc0,@key[8] 2981 vmovdqa64 $xc1,@key[9] 2982 vmovdqa64 $xc2,@key[10] 2983 vmovdqa64 $xc3,@key[11] 2984 2985 vpshufd \$0x00,$xd3,$xd0 2986 vpshufd \$0x55,$xd3,$xd1 2987 vpshufd \$0xaa,$xd3,$xd2 2988 vpshufd \$0xff,$xd3,$xd3 2989 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet 2990 vmovdqa64 $xd0,@key[12] 2991 vmovdqa64 $xd1,@key[13] 2992 vmovdqa64 $xd2,@key[14] 2993 vmovdqa64 $xd3,@key[15] 2994 2995 mov \$10,%eax 2996 jmp .Loop16x 2997 2998.align 32 2999.Loop_outer16x: 3000 vpbroadcastd 0(%r10),$xa0 # reload key 3001 vpbroadcastd 4(%r10),$xa1 3002 vpbroadcastd 8(%r10),$xa2 3003 vpbroadcastd 12(%r10),$xa3 3004 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters 3005 vmovdqa64 @key[4],$xb0 3006 vmovdqa64 @key[5],$xb1 3007 vmovdqa64 @key[6],$xb2 3008 vmovdqa64 @key[7],$xb3 3009 vmovdqa64 @key[8],$xc0 3010 vmovdqa64 @key[9],$xc1 3011 vmovdqa64 @key[10],$xc2 3012 vmovdqa64 @key[11],$xc3 3013 vmovdqa64 @key[12],$xd0 3014 vmovdqa64 @key[13],$xd1 3015 vmovdqa64 @key[14],$xd2 3016 vmovdqa64 @key[15],$xd3 3017 3018 vmovdqa64 $xa0,@key[0] 3019 vmovdqa64 $xa1,@key[1] 3020 vmovdqa64 $xa2,@key[2] 3021 vmovdqa64 $xa3,@key[3] 3022 3023 mov \$10,%eax 3024 jmp .Loop16x 3025 3026.align 32 3027.Loop16x: 3028___ 3029 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3030 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3031$code.=<<___; 3032 dec %eax 3033 jnz .Loop16x 3034 3035 vpaddd @key[0],$xa0,$xa0 # accumulate key 3036 vpaddd @key[1],$xa1,$xa1 3037 vpaddd @key[2],$xa2,$xa2 3038 vpaddd @key[3],$xa3,$xa3 3039 3040 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3041 vpunpckldq $xa3,$xa2,$xt3 3042 vpunpckhdq $xa1,$xa0,$xa0 3043 vpunpckhdq $xa3,$xa2,$xa2 3044 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 3045 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3046 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3047 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3048___ 3049 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3050$code.=<<___; 3051 vpaddd @key[4],$xb0,$xb0 3052 vpaddd @key[5],$xb1,$xb1 3053 vpaddd @key[6],$xb2,$xb2 3054 vpaddd 
@key[7],$xb3,$xb3 3055 3056 vpunpckldq $xb1,$xb0,$xt2 3057 vpunpckldq $xb3,$xb2,$xt3 3058 vpunpckhdq $xb1,$xb0,$xb0 3059 vpunpckhdq $xb3,$xb2,$xb2 3060 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3061 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3062 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3063 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3064___ 3065 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3066$code.=<<___; 3067 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further 3068 vshufi32x4 \$0xee,$xb0,$xa0,$xb0 3069 vshufi32x4 \$0x44,$xb1,$xa1,$xa0 3070 vshufi32x4 \$0xee,$xb1,$xa1,$xb1 3071 vshufi32x4 \$0x44,$xb2,$xa2,$xa1 3072 vshufi32x4 \$0xee,$xb2,$xa2,$xb2 3073 vshufi32x4 \$0x44,$xb3,$xa3,$xa2 3074 vshufi32x4 \$0xee,$xb3,$xa3,$xb3 3075___ 3076 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3077$code.=<<___; 3078 vpaddd @key[8],$xc0,$xc0 3079 vpaddd @key[9],$xc1,$xc1 3080 vpaddd @key[10],$xc2,$xc2 3081 vpaddd @key[11],$xc3,$xc3 3082 3083 vpunpckldq $xc1,$xc0,$xt2 3084 vpunpckldq $xc3,$xc2,$xt3 3085 vpunpckhdq $xc1,$xc0,$xc0 3086 vpunpckhdq $xc3,$xc2,$xc2 3087 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3088 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3089 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3090 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3091___ 3092 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3093$code.=<<___; 3094 vpaddd @key[12],$xd0,$xd0 3095 vpaddd @key[13],$xd1,$xd1 3096 vpaddd @key[14],$xd2,$xd2 3097 vpaddd @key[15],$xd3,$xd3 3098 3099 vpunpckldq $xd1,$xd0,$xt2 3100 vpunpckldq $xd3,$xd2,$xt3 3101 vpunpckhdq $xd1,$xd0,$xd0 3102 vpunpckhdq $xd3,$xd2,$xd2 3103 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3104 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3105 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3106 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3107___ 3108 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3109$code.=<<___; 3110 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further 3111 vshufi32x4 \$0xee,$xd0,$xc0,$xd0 3112 vshufi32x4 \$0x44,$xd1,$xc1,$xc0 3113 vshufi32x4 \$0xee,$xd1,$xc1,$xd1 3114 vshufi32x4 \$0x44,$xd2,$xc2,$xc1 3115 vshufi32x4 \$0xee,$xd2,$xc2,$xd2 3116 vshufi32x4 \$0x44,$xd3,$xc3,$xc2 3117 vshufi32x4 \$0xee,$xd3,$xc3,$xd3 3118___ 3119 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3120$code.=<<___; 3121 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further 3122 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 3123 vshufi32x4 \$0x88,$xd0,$xb0,$xc0 3124 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 3125 vshufi32x4 \$0x88,$xc1,$xa1,$xt1 3126 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 3127 vshufi32x4 \$0x88,$xd1,$xb1,$xc1 3128 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 3129 vshufi32x4 \$0x88,$xc2,$xa2,$xt2 3130 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 3131 vshufi32x4 \$0x88,$xd2,$xb2,$xc2 3132 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 3133 vshufi32x4 \$0x88,$xc3,$xa3,$xt3 3134 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 3135 vshufi32x4 \$0x88,$xd3,$xb3,$xc3 3136 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 3137___ 3138 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= 3139 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); 3140 3141 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, 3142 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = 3143 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3144 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3145$code.=<<___; 3146 cmp \$64*16,$len 3147 jb .Ltail16x 3148 3149 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3150 vpxord 0x40($inp),$xb0,$xb0 3151 vpxord 0x80($inp),$xc0,$xc0 3152 vpxord 0xc0($inp),$xd0,$xd0 3153 vmovdqu32 $xa0,0x00($out) 3154 vmovdqu32 $xb0,0x40($out) 3155 vmovdqu32 $xc0,0x80($out) 3156 vmovdqu32 $xd0,0xc0($out) 3157 3158 vpxord 0x100($inp),$xa1,$xa1 3159 vpxord 0x140($inp),$xb1,$xb1 
3160 vpxord 0x180($inp),$xc1,$xc1 3161 vpxord 0x1c0($inp),$xd1,$xd1 3162 vmovdqu32 $xa1,0x100($out) 3163 vmovdqu32 $xb1,0x140($out) 3164 vmovdqu32 $xc1,0x180($out) 3165 vmovdqu32 $xd1,0x1c0($out) 3166 3167 vpxord 0x200($inp),$xa2,$xa2 3168 vpxord 0x240($inp),$xb2,$xb2 3169 vpxord 0x280($inp),$xc2,$xc2 3170 vpxord 0x2c0($inp),$xd2,$xd2 3171 vmovdqu32 $xa2,0x200($out) 3172 vmovdqu32 $xb2,0x240($out) 3173 vmovdqu32 $xc2,0x280($out) 3174 vmovdqu32 $xd2,0x2c0($out) 3175 3176 vpxord 0x300($inp),$xa3,$xa3 3177 vpxord 0x340($inp),$xb3,$xb3 3178 vpxord 0x380($inp),$xc3,$xc3 3179 vpxord 0x3c0($inp),$xd3,$xd3 3180 lea 0x400($inp),$inp 3181 vmovdqu32 $xa3,0x300($out) 3182 vmovdqu32 $xb3,0x340($out) 3183 vmovdqu32 $xc3,0x380($out) 3184 vmovdqu32 $xd3,0x3c0($out) 3185 lea 0x400($out),$out 3186 3187 sub \$64*16,$len 3188 jnz .Loop_outer16x 3189 3190 jmp .Ldone16x 3191 3192.align 32 3193.Ltail16x: 3194 xor %r10,%r10 3195 sub $inp,$out 3196 cmp \$64*1,$len 3197 jb .Less_than_64_16x 3198 vpxord ($inp),$xa0,$xa0 # xor with input 3199 vmovdqu32 $xa0,($out,$inp) 3200 je .Ldone16x 3201 vmovdqa32 $xb0,$xa0 3202 lea 64($inp),$inp 3203 3204 cmp \$64*2,$len 3205 jb .Less_than_64_16x 3206 vpxord ($inp),$xb0,$xb0 3207 vmovdqu32 $xb0,($out,$inp) 3208 je .Ldone16x 3209 vmovdqa32 $xc0,$xa0 3210 lea 64($inp),$inp 3211 3212 cmp \$64*3,$len 3213 jb .Less_than_64_16x 3214 vpxord ($inp),$xc0,$xc0 3215 vmovdqu32 $xc0,($out,$inp) 3216 je .Ldone16x 3217 vmovdqa32 $xd0,$xa0 3218 lea 64($inp),$inp 3219 3220 cmp \$64*4,$len 3221 jb .Less_than_64_16x 3222 vpxord ($inp),$xd0,$xd0 3223 vmovdqu32 $xd0,($out,$inp) 3224 je .Ldone16x 3225 vmovdqa32 $xa1,$xa0 3226 lea 64($inp),$inp 3227 3228 cmp \$64*5,$len 3229 jb .Less_than_64_16x 3230 vpxord ($inp),$xa1,$xa1 3231 vmovdqu32 $xa1,($out,$inp) 3232 je .Ldone16x 3233 vmovdqa32 $xb1,$xa0 3234 lea 64($inp),$inp 3235 3236 cmp \$64*6,$len 3237 jb .Less_than_64_16x 3238 vpxord ($inp),$xb1,$xb1 3239 vmovdqu32 $xb1,($out,$inp) 3240 je .Ldone16x 3241 vmovdqa32 $xc1,$xa0 3242 lea 64($inp),$inp 3243 3244 cmp \$64*7,$len 3245 jb .Less_than_64_16x 3246 vpxord ($inp),$xc1,$xc1 3247 vmovdqu32 $xc1,($out,$inp) 3248 je .Ldone16x 3249 vmovdqa32 $xd1,$xa0 3250 lea 64($inp),$inp 3251 3252 cmp \$64*8,$len 3253 jb .Less_than_64_16x 3254 vpxord ($inp),$xd1,$xd1 3255 vmovdqu32 $xd1,($out,$inp) 3256 je .Ldone16x 3257 vmovdqa32 $xa2,$xa0 3258 lea 64($inp),$inp 3259 3260 cmp \$64*9,$len 3261 jb .Less_than_64_16x 3262 vpxord ($inp),$xa2,$xa2 3263 vmovdqu32 $xa2,($out,$inp) 3264 je .Ldone16x 3265 vmovdqa32 $xb2,$xa0 3266 lea 64($inp),$inp 3267 3268 cmp \$64*10,$len 3269 jb .Less_than_64_16x 3270 vpxord ($inp),$xb2,$xb2 3271 vmovdqu32 $xb2,($out,$inp) 3272 je .Ldone16x 3273 vmovdqa32 $xc2,$xa0 3274 lea 64($inp),$inp 3275 3276 cmp \$64*11,$len 3277 jb .Less_than_64_16x 3278 vpxord ($inp),$xc2,$xc2 3279 vmovdqu32 $xc2,($out,$inp) 3280 je .Ldone16x 3281 vmovdqa32 $xd2,$xa0 3282 lea 64($inp),$inp 3283 3284 cmp \$64*12,$len 3285 jb .Less_than_64_16x 3286 vpxord ($inp),$xd2,$xd2 3287 vmovdqu32 $xd2,($out,$inp) 3288 je .Ldone16x 3289 vmovdqa32 $xa3,$xa0 3290 lea 64($inp),$inp 3291 3292 cmp \$64*13,$len 3293 jb .Less_than_64_16x 3294 vpxord ($inp),$xa3,$xa3 3295 vmovdqu32 $xa3,($out,$inp) 3296 je .Ldone16x 3297 vmovdqa32 $xb3,$xa0 3298 lea 64($inp),$inp 3299 3300 cmp \$64*14,$len 3301 jb .Less_than_64_16x 3302 vpxord ($inp),$xb3,$xb3 3303 vmovdqu32 $xb3,($out,$inp) 3304 je .Ldone16x 3305 vmovdqa32 $xc3,$xa0 3306 lea 64($inp),$inp 3307 3308 cmp \$64*15,$len 3309 jb .Less_than_64_16x 3310 vpxord ($inp),$xc3,$xc3 3311 vmovdqu32 
$xc3,($out,$inp) 3312 je .Ldone16x 3313 vmovdqa32 $xd3,$xa0 3314 lea 64($inp),$inp 3315 3316.Less_than_64_16x: 3317 vmovdqa32 $xa0,0x00(%rsp) 3318 lea ($out,$inp),$out 3319 and \$63,$len 3320 3321.Loop_tail16x: 3322 movzb ($inp,%r10),%eax 3323 movzb (%rsp,%r10),%ecx 3324 lea 1(%r10),%r10 3325 xor %ecx,%eax 3326 mov %al,-1($out,%r10) 3327 dec $len 3328 jnz .Loop_tail16x 3329 3330 vpxord $xa0,$xa0,$xa0 3331 vmovdqa32 $xa0,0(%rsp) 3332 3333.Ldone16x: 3334 vzeroall 3335___ 3336$code.=<<___ if ($win64); 3337 movaps -0xa8(%r9),%xmm6 3338 movaps -0x98(%r9),%xmm7 3339 movaps -0x88(%r9),%xmm8 3340 movaps -0x78(%r9),%xmm9 3341 movaps -0x68(%r9),%xmm10 3342 movaps -0x58(%r9),%xmm11 3343 movaps -0x48(%r9),%xmm12 3344 movaps -0x38(%r9),%xmm13 3345 movaps -0x28(%r9),%xmm14 3346 movaps -0x18(%r9),%xmm15 3347___ 3348$code.=<<___; 3349 lea (%r9),%rsp 3350.cfi_def_cfa_register %rsp 3351.L16x_epilogue: 3352 ret 3353.cfi_endproc 3354.size ChaCha20_16x,.-ChaCha20_16x 3355___ 3356 3357# switch to %ymm domain 3358($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3359 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); 3360@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3361 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3362@key=map("%ymm$_",(16..31)); 3363($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 3364 3365$code.=<<___; 3366.type ChaCha20_8xvl,\@function,5 3367.align 32 3368ChaCha20_8xvl: 3369.cfi_startproc 3370.LChaCha20_8xvl: 3371 mov %rsp,%r9 # frame register 3372.cfi_def_cfa_register %r9 3373 sub \$64+$xframe,%rsp 3374 and \$-64,%rsp 3375___ 3376$code.=<<___ if ($win64); 3377 movaps %xmm6,-0xa8(%r9) 3378 movaps %xmm7,-0x98(%r9) 3379 movaps %xmm8,-0x88(%r9) 3380 movaps %xmm9,-0x78(%r9) 3381 movaps %xmm10,-0x68(%r9) 3382 movaps %xmm11,-0x58(%r9) 3383 movaps %xmm12,-0x48(%r9) 3384 movaps %xmm13,-0x38(%r9) 3385 movaps %xmm14,-0x28(%r9) 3386 movaps %xmm15,-0x18(%r9) 3387.L8xvl_body: 3388___ 3389$code.=<<___; 3390 vzeroupper 3391 3392 lea .Lsigma(%rip),%r10 3393 vbroadcasti128 (%r10),$xa3 # key[0] 3394 vbroadcasti128 ($key),$xb3 # key[1] 3395 vbroadcasti128 16($key),$xc3 # key[2] 3396 vbroadcasti128 ($counter),$xd3 # key[3] 3397 3398 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
3399 vpshufd \$0x55,$xa3,$xa1 3400 vpshufd \$0xaa,$xa3,$xa2 3401 vpshufd \$0xff,$xa3,$xa3 3402 vmovdqa64 $xa0,@key[0] 3403 vmovdqa64 $xa1,@key[1] 3404 vmovdqa64 $xa2,@key[2] 3405 vmovdqa64 $xa3,@key[3] 3406 3407 vpshufd \$0x00,$xb3,$xb0 3408 vpshufd \$0x55,$xb3,$xb1 3409 vpshufd \$0xaa,$xb3,$xb2 3410 vpshufd \$0xff,$xb3,$xb3 3411 vmovdqa64 $xb0,@key[4] 3412 vmovdqa64 $xb1,@key[5] 3413 vmovdqa64 $xb2,@key[6] 3414 vmovdqa64 $xb3,@key[7] 3415 3416 vpshufd \$0x00,$xc3,$xc0 3417 vpshufd \$0x55,$xc3,$xc1 3418 vpshufd \$0xaa,$xc3,$xc2 3419 vpshufd \$0xff,$xc3,$xc3 3420 vmovdqa64 $xc0,@key[8] 3421 vmovdqa64 $xc1,@key[9] 3422 vmovdqa64 $xc2,@key[10] 3423 vmovdqa64 $xc3,@key[11] 3424 3425 vpshufd \$0x00,$xd3,$xd0 3426 vpshufd \$0x55,$xd3,$xd1 3427 vpshufd \$0xaa,$xd3,$xd2 3428 vpshufd \$0xff,$xd3,$xd3 3429 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 3430 vmovdqa64 $xd0,@key[12] 3431 vmovdqa64 $xd1,@key[13] 3432 vmovdqa64 $xd2,@key[14] 3433 vmovdqa64 $xd3,@key[15] 3434 3435 mov \$10,%eax 3436 jmp .Loop8xvl 3437 3438.align 32 3439.Loop_outer8xvl: 3440 #vpbroadcastd 0(%r10),$xa0 # reload key 3441 #vpbroadcastd 4(%r10),$xa1 3442 vpbroadcastd 8(%r10),$xa2 3443 vpbroadcastd 12(%r10),$xa3 3444 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters 3445 vmovdqa64 @key[4],$xb0 3446 vmovdqa64 @key[5],$xb1 3447 vmovdqa64 @key[6],$xb2 3448 vmovdqa64 @key[7],$xb3 3449 vmovdqa64 @key[8],$xc0 3450 vmovdqa64 @key[9],$xc1 3451 vmovdqa64 @key[10],$xc2 3452 vmovdqa64 @key[11],$xc3 3453 vmovdqa64 @key[12],$xd0 3454 vmovdqa64 @key[13],$xd1 3455 vmovdqa64 @key[14],$xd2 3456 vmovdqa64 @key[15],$xd3 3457 3458 vmovdqa64 $xa0,@key[0] 3459 vmovdqa64 $xa1,@key[1] 3460 vmovdqa64 $xa2,@key[2] 3461 vmovdqa64 $xa3,@key[3] 3462 3463 mov \$10,%eax 3464 jmp .Loop8xvl 3465 3466.align 32 3467.Loop8xvl: 3468___ 3469 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3470 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3471$code.=<<___; 3472 dec %eax 3473 jnz .Loop8xvl 3474 3475 vpaddd @key[0],$xa0,$xa0 # accumulate key 3476 vpaddd @key[1],$xa1,$xa1 3477 vpaddd @key[2],$xa2,$xa2 3478 vpaddd @key[3],$xa3,$xa3 3479 3480 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3481 vpunpckldq $xa3,$xa2,$xt3 3482 vpunpckhdq $xa1,$xa0,$xa0 3483 vpunpckhdq $xa3,$xa2,$xa2 3484 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 3485 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3486 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3487 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3488___ 3489 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3490$code.=<<___; 3491 vpaddd @key[4],$xb0,$xb0 3492 vpaddd @key[5],$xb1,$xb1 3493 vpaddd @key[6],$xb2,$xb2 3494 vpaddd @key[7],$xb3,$xb3 3495 3496 vpunpckldq $xb1,$xb0,$xt2 3497 vpunpckldq $xb3,$xb2,$xt3 3498 vpunpckhdq $xb1,$xb0,$xb0 3499 vpunpckhdq $xb3,$xb2,$xb2 3500 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3501 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3502 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3503 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3504___ 3505 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3506$code.=<<___; 3507 vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further 3508 vshufi32x4 \$3,$xb0,$xa0,$xb0 3509 vshufi32x4 \$0,$xb1,$xa1,$xa0 3510 vshufi32x4 \$3,$xb1,$xa1,$xb1 3511 vshufi32x4 \$0,$xb2,$xa2,$xa1 3512 vshufi32x4 \$3,$xb2,$xa2,$xb2 3513 vshufi32x4 \$0,$xb3,$xa3,$xa2 3514 vshufi32x4 \$3,$xb3,$xa3,$xb3 3515___ 3516 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3517$code.=<<___; 3518 vpaddd @key[8],$xc0,$xc0 3519 vpaddd @key[9],$xc1,$xc1 3520 vpaddd @key[10],$xc2,$xc2 3521 vpaddd @key[11],$xc3,$xc3 3522 3523 vpunpckldq $xc1,$xc0,$xt2 3524 
vpunpckldq $xc3,$xc2,$xt3 3525 vpunpckhdq $xc1,$xc0,$xc0 3526 vpunpckhdq $xc3,$xc2,$xc2 3527 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3528 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3529 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3530 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3531___ 3532 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3533$code.=<<___; 3534 vpaddd @key[12],$xd0,$xd0 3535 vpaddd @key[13],$xd1,$xd1 3536 vpaddd @key[14],$xd2,$xd2 3537 vpaddd @key[15],$xd3,$xd3 3538 3539 vpunpckldq $xd1,$xd0,$xt2 3540 vpunpckldq $xd3,$xd2,$xt3 3541 vpunpckhdq $xd1,$xd0,$xd0 3542 vpunpckhdq $xd3,$xd2,$xd2 3543 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3544 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3545 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3546 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3547___ 3548 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3549$code.=<<___; 3550 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 3551 vperm2i128 \$0x31,$xd0,$xc0,$xd0 3552 vperm2i128 \$0x20,$xd1,$xc1,$xc0 3553 vperm2i128 \$0x31,$xd1,$xc1,$xd1 3554 vperm2i128 \$0x20,$xd2,$xc2,$xc1 3555 vperm2i128 \$0x31,$xd2,$xc2,$xd2 3556 vperm2i128 \$0x20,$xd3,$xc3,$xc2 3557 vperm2i128 \$0x31,$xd3,$xc3,$xd3 3558___ 3559 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3560 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 3561 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 3562$code.=<<___; 3563 cmp \$64*8,$len 3564 jb .Ltail8xvl 3565 3566 mov \$0x80,%eax # size optimization 3567 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3568 vpxor 0x20($inp),$xb0,$xb0 3569 vpxor 0x40($inp),$xc0,$xc0 3570 vpxor 0x60($inp),$xd0,$xd0 3571 lea ($inp,%rax),$inp # size optimization 3572 vmovdqu32 $xa0,0x00($out) 3573 vmovdqu $xb0,0x20($out) 3574 vmovdqu $xc0,0x40($out) 3575 vmovdqu $xd0,0x60($out) 3576 lea ($out,%rax),$out # size optimization 3577 3578 vpxor 0x00($inp),$xa1,$xa1 3579 vpxor 0x20($inp),$xb1,$xb1 3580 vpxor 0x40($inp),$xc1,$xc1 3581 vpxor 0x60($inp),$xd1,$xd1 3582 lea ($inp,%rax),$inp # size optimization 3583 vmovdqu $xa1,0x00($out) 3584 vmovdqu $xb1,0x20($out) 3585 vmovdqu $xc1,0x40($out) 3586 vmovdqu $xd1,0x60($out) 3587 lea ($out,%rax),$out # size optimization 3588 3589 vpxord 0x00($inp),$xa2,$xa2 3590 vpxor 0x20($inp),$xb2,$xb2 3591 vpxor 0x40($inp),$xc2,$xc2 3592 vpxor 0x60($inp),$xd2,$xd2 3593 lea ($inp,%rax),$inp # size optimization 3594 vmovdqu32 $xa2,0x00($out) 3595 vmovdqu $xb2,0x20($out) 3596 vmovdqu $xc2,0x40($out) 3597 vmovdqu $xd2,0x60($out) 3598 lea ($out,%rax),$out # size optimization 3599 3600 vpxor 0x00($inp),$xa3,$xa3 3601 vpxor 0x20($inp),$xb3,$xb3 3602 vpxor 0x40($inp),$xc3,$xc3 3603 vpxor 0x60($inp),$xd3,$xd3 3604 lea ($inp,%rax),$inp # size optimization 3605 vmovdqu $xa3,0x00($out) 3606 vmovdqu $xb3,0x20($out) 3607 vmovdqu $xc3,0x40($out) 3608 vmovdqu $xd3,0x60($out) 3609 lea ($out,%rax),$out # size optimization 3610 3611 vpbroadcastd 0(%r10),%ymm0 # reload key 3612 vpbroadcastd 4(%r10),%ymm1 3613 3614 sub \$64*8,$len 3615 jnz .Loop_outer8xvl 3616 3617 jmp .Ldone8xvl 3618 3619.align 32 3620.Ltail8xvl: 3621 vmovdqa64 $xa0,%ymm8 # size optimization 3622___ 3623$xa0 = "%ymm8"; 3624$code.=<<___; 3625 xor %r10,%r10 3626 sub $inp,$out 3627 cmp \$64*1,$len 3628 jb .Less_than_64_8xvl 3629 vpxor 0x00($inp),$xa0,$xa0 # xor with input 3630 vpxor 0x20($inp),$xb0,$xb0 3631 vmovdqu $xa0,0x00($out,$inp) 3632 vmovdqu $xb0,0x20($out,$inp) 3633 je .Ldone8xvl 3634 vmovdqa $xc0,$xa0 3635 vmovdqa $xd0,$xb0 3636 lea 64($inp),$inp 3637 3638 cmp \$64*2,$len 3639 jb .Less_than_64_8xvl 3640 vpxor 0x00($inp),$xc0,$xc0 3641 vpxor 0x20($inp),$xd0,$xd0 3642 vmovdqu 
$xc0,0x00($out,$inp) 3643 vmovdqu $xd0,0x20($out,$inp) 3644 je .Ldone8xvl 3645 vmovdqa $xa1,$xa0 3646 vmovdqa $xb1,$xb0 3647 lea 64($inp),$inp 3648 3649 cmp \$64*3,$len 3650 jb .Less_than_64_8xvl 3651 vpxor 0x00($inp),$xa1,$xa1 3652 vpxor 0x20($inp),$xb1,$xb1 3653 vmovdqu $xa1,0x00($out,$inp) 3654 vmovdqu $xb1,0x20($out,$inp) 3655 je .Ldone8xvl 3656 vmovdqa $xc1,$xa0 3657 vmovdqa $xd1,$xb0 3658 lea 64($inp),$inp 3659 3660 cmp \$64*4,$len 3661 jb .Less_than_64_8xvl 3662 vpxor 0x00($inp),$xc1,$xc1 3663 vpxor 0x20($inp),$xd1,$xd1 3664 vmovdqu $xc1,0x00($out,$inp) 3665 vmovdqu $xd1,0x20($out,$inp) 3666 je .Ldone8xvl 3667 vmovdqa32 $xa2,$xa0 3668 vmovdqa $xb2,$xb0 3669 lea 64($inp),$inp 3670 3671 cmp \$64*5,$len 3672 jb .Less_than_64_8xvl 3673 vpxord 0x00($inp),$xa2,$xa2 3674 vpxor 0x20($inp),$xb2,$xb2 3675 vmovdqu32 $xa2,0x00($out,$inp) 3676 vmovdqu $xb2,0x20($out,$inp) 3677 je .Ldone8xvl 3678 vmovdqa $xc2,$xa0 3679 vmovdqa $xd2,$xb0 3680 lea 64($inp),$inp 3681 3682 cmp \$64*6,$len 3683 jb .Less_than_64_8xvl 3684 vpxor 0x00($inp),$xc2,$xc2 3685 vpxor 0x20($inp),$xd2,$xd2 3686 vmovdqu $xc2,0x00($out,$inp) 3687 vmovdqu $xd2,0x20($out,$inp) 3688 je .Ldone8xvl 3689 vmovdqa $xa3,$xa0 3690 vmovdqa $xb3,$xb0 3691 lea 64($inp),$inp 3692 3693 cmp \$64*7,$len 3694 jb .Less_than_64_8xvl 3695 vpxor 0x00($inp),$xa3,$xa3 3696 vpxor 0x20($inp),$xb3,$xb3 3697 vmovdqu $xa3,0x00($out,$inp) 3698 vmovdqu $xb3,0x20($out,$inp) 3699 je .Ldone8xvl 3700 vmovdqa $xc3,$xa0 3701 vmovdqa $xd3,$xb0 3702 lea 64($inp),$inp 3703 3704.Less_than_64_8xvl: 3705 vmovdqa $xa0,0x00(%rsp) 3706 vmovdqa $xb0,0x20(%rsp) 3707 lea ($out,$inp),$out 3708 and \$63,$len 3709 3710.Loop_tail8xvl: 3711 movzb ($inp,%r10),%eax 3712 movzb (%rsp,%r10),%ecx 3713 lea 1(%r10),%r10 3714 xor %ecx,%eax 3715 mov %al,-1($out,%r10) 3716 dec $len 3717 jnz .Loop_tail8xvl 3718 3719 vpxor $xa0,$xa0,$xa0 3720 vmovdqa $xa0,0x00(%rsp) 3721 vmovdqa $xa0,0x20(%rsp) 3722 3723.Ldone8xvl: 3724 vzeroall 3725___ 3726$code.=<<___ if ($win64); 3727 movaps -0xa8(%r9),%xmm6 3728 movaps -0x98(%r9),%xmm7 3729 movaps -0x88(%r9),%xmm8 3730 movaps -0x78(%r9),%xmm9 3731 movaps -0x68(%r9),%xmm10 3732 movaps -0x58(%r9),%xmm11 3733 movaps -0x48(%r9),%xmm12 3734 movaps -0x38(%r9),%xmm13 3735 movaps -0x28(%r9),%xmm14 3736 movaps -0x18(%r9),%xmm15 3737___ 3738$code.=<<___; 3739 lea (%r9),%rsp 3740.cfi_def_cfa_register %rsp 3741.L8xvl_epilogue: 3742 ret 3743.cfi_endproc 3744.size ChaCha20_8xvl,.-ChaCha20_8xvl 3745___ 3746} 3747 3748# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3749# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3750if ($win64) { 3751$rec="%rcx"; 3752$frame="%rdx"; 3753$context="%r8"; 3754$disp="%r9"; 3755 3756$code.=<<___; 3757.extern __imp_RtlVirtualUnwind 3758.type se_handler,\@abi-omnipotent 3759.align 16 3760se_handler: 3761 push %rsi 3762 push %rdi 3763 push %rbx 3764 push %rbp 3765 push %r12 3766 push %r13 3767 push %r14 3768 push %r15 3769 pushfq 3770 sub \$64,%rsp 3771 3772 mov 120($context),%rax # pull context->Rax 3773 mov 248($context),%rbx # pull context->Rip 3774 3775 mov 8($disp),%rsi # disp->ImageBase 3776 mov 56($disp),%r11 # disp->HandlerData 3777 3778 lea .Lctr32_body(%rip),%r10 3779 cmp %r10,%rbx # context->Rip<.Lprologue 3780 jb .Lcommon_seh_tail 3781 3782 mov 152($context),%rax # pull context->Rsp 3783 3784 lea .Lno_data(%rip),%r10 # epilogue label 3785 cmp %r10,%rbx # context->Rip>=.Lepilogue 3786 jae .Lcommon_seh_tail 3787 3788 lea 64+24+48(%rax),%rax 3789 3790 mov -8(%rax),%rbx 3791 mov -16(%rax),%rbp 3792 mov -24(%rax),%r12 
3793 mov -32(%rax),%r13 3794 mov -40(%rax),%r14 3795 mov -48(%rax),%r15 3796 mov %rbx,144($context) # restore context->Rbx 3797 mov %rbp,160($context) # restore context->Rbp 3798 mov %r12,216($context) # restore context->R12 3799 mov %r13,224($context) # restore context->R13 3800 mov %r14,232($context) # restore context->R14 3801 mov %r15,240($context) # restore context->R14 3802 3803.Lcommon_seh_tail: 3804 mov 8(%rax),%rdi 3805 mov 16(%rax),%rsi 3806 mov %rax,152($context) # restore context->Rsp 3807 mov %rsi,168($context) # restore context->Rsi 3808 mov %rdi,176($context) # restore context->Rdi 3809 3810 mov 40($disp),%rdi # disp->ContextRecord 3811 mov $context,%rsi # context 3812 mov \$154,%ecx # sizeof(CONTEXT) 3813 .long 0xa548f3fc # cld; rep movsq 3814 3815 mov $disp,%rsi 3816 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3817 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3818 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3819 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3820 mov 40(%rsi),%r10 # disp->ContextRecord 3821 lea 56(%rsi),%r11 # &disp->HandlerData 3822 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3823 mov %r10,32(%rsp) # arg5 3824 mov %r11,40(%rsp) # arg6 3825 mov %r12,48(%rsp) # arg7 3826 mov %rcx,56(%rsp) # arg8, (NULL) 3827 call *__imp_RtlVirtualUnwind(%rip) 3828 3829 mov \$1,%eax # ExceptionContinueSearch 3830 add \$64,%rsp 3831 popfq 3832 pop %r15 3833 pop %r14 3834 pop %r13 3835 pop %r12 3836 pop %rbp 3837 pop %rbx 3838 pop %rdi 3839 pop %rsi 3840 ret 3841.size se_handler,.-se_handler 3842 3843.type simd_handler,\@abi-omnipotent 3844.align 16 3845simd_handler: 3846 push %rsi 3847 push %rdi 3848 push %rbx 3849 push %rbp 3850 push %r12 3851 push %r13 3852 push %r14 3853 push %r15 3854 pushfq 3855 sub \$64,%rsp 3856 3857 mov 120($context),%rax # pull context->Rax 3858 mov 248($context),%rbx # pull context->Rip 3859 3860 mov 8($disp),%rsi # disp->ImageBase 3861 mov 56($disp),%r11 # disp->HandlerData 3862 3863 mov 0(%r11),%r10d # HandlerData[0] 3864 lea (%rsi,%r10),%r10 # prologue label 3865 cmp %r10,%rbx # context->Rip<prologue label 3866 jb .Lcommon_seh_tail 3867 3868 mov 192($context),%rax # pull context->R9 3869 3870 mov 4(%r11),%r10d # HandlerData[1] 3871 mov 8(%r11),%ecx # HandlerData[2] 3872 lea (%rsi,%r10),%r10 # epilogue label 3873 cmp %r10,%rbx # context->Rip>=epilogue label 3874 jae .Lcommon_seh_tail 3875 3876 neg %rcx 3877 lea -8(%rax,%rcx),%rsi 3878 lea 512($context),%rdi # &context.Xmm6 3879 neg %ecx 3880 shr \$3,%ecx 3881 .long 0xa548f3fc # cld; rep movsq 3882 3883 jmp .Lcommon_seh_tail 3884.size simd_handler,.-simd_handler 3885 3886.section .pdata 3887.align 4 3888 .rva .LSEH_begin_ChaCha20_ctr32 3889 .rva .LSEH_end_ChaCha20_ctr32 3890 .rva .LSEH_info_ChaCha20_ctr32 3891 3892 .rva .LSEH_begin_ChaCha20_ssse3 3893 .rva .LSEH_end_ChaCha20_ssse3 3894 .rva .LSEH_info_ChaCha20_ssse3 3895 3896 .rva .LSEH_begin_ChaCha20_128 3897 .rva .LSEH_end_ChaCha20_128 3898 .rva .LSEH_info_ChaCha20_128 3899 3900 .rva .LSEH_begin_ChaCha20_4x 3901 .rva .LSEH_end_ChaCha20_4x 3902 .rva .LSEH_info_ChaCha20_4x 3903___ 3904$code.=<<___ if ($avx); 3905 .rva .LSEH_begin_ChaCha20_4xop 3906 .rva .LSEH_end_ChaCha20_4xop 3907 .rva .LSEH_info_ChaCha20_4xop 3908___ 3909$code.=<<___ if ($avx>1); 3910 .rva .LSEH_begin_ChaCha20_8x 3911 .rva .LSEH_end_ChaCha20_8x 3912 .rva .LSEH_info_ChaCha20_8x 3913___ 3914$code.=<<___ if ($avx>2); 3915 .rva .LSEH_begin_ChaCha20_avx512 3916 .rva .LSEH_end_ChaCha20_avx512 3917 .rva .LSEH_info_ChaCha20_avx512 3918 3919 .rva .LSEH_begin_ChaCha20_avx512vl 3920 .rva 
.LSEH_end_ChaCha20_avx512vl 3921 .rva .LSEH_info_ChaCha20_avx512vl 3922 3923 .rva .LSEH_begin_ChaCha20_16x 3924 .rva .LSEH_end_ChaCha20_16x 3925 .rva .LSEH_info_ChaCha20_16x 3926 3927 .rva .LSEH_begin_ChaCha20_8xvl 3928 .rva .LSEH_end_ChaCha20_8xvl 3929 .rva .LSEH_info_ChaCha20_8xvl 3930___ 3931$code.=<<___; 3932.section .xdata 3933.align 8 3934.LSEH_info_ChaCha20_ctr32: 3935 .byte 9,0,0,0 3936 .rva se_handler 3937 3938.LSEH_info_ChaCha20_ssse3: 3939 .byte 9,0,0,0 3940 .rva simd_handler 3941 .rva .Lssse3_body,.Lssse3_epilogue 3942 .long 0x20,0 3943 3944.LSEH_info_ChaCha20_128: 3945 .byte 9,0,0,0 3946 .rva simd_handler 3947 .rva .L128_body,.L128_epilogue 3948 .long 0x60,0 3949 3950.LSEH_info_ChaCha20_4x: 3951 .byte 9,0,0,0 3952 .rva simd_handler 3953 .rva .L4x_body,.L4x_epilogue 3954 .long 0xa0,0 3955___ 3956$code.=<<___ if ($avx); 3957.LSEH_info_ChaCha20_4xop: 3958 .byte 9,0,0,0 3959 .rva simd_handler 3960 .rva .L4xop_body,.L4xop_epilogue # HandlerData[] 3961 .long 0xa0,0 3962___ 3963$code.=<<___ if ($avx>1); 3964.LSEH_info_ChaCha20_8x: 3965 .byte 9,0,0,0 3966 .rva simd_handler 3967 .rva .L8x_body,.L8x_epilogue # HandlerData[] 3968 .long 0xa0,0 3969___ 3970$code.=<<___ if ($avx>2); 3971.LSEH_info_ChaCha20_avx512: 3972 .byte 9,0,0,0 3973 .rva simd_handler 3974 .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] 3975 .long 0x20,0 3976 3977.LSEH_info_ChaCha20_avx512vl: 3978 .byte 9,0,0,0 3979 .rva simd_handler 3980 .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] 3981 .long 0x20,0 3982 3983.LSEH_info_ChaCha20_16x: 3984 .byte 9,0,0,0 3985 .rva simd_handler 3986 .rva .L16x_body,.L16x_epilogue # HandlerData[] 3987 .long 0xa0,0 3988 3989.LSEH_info_ChaCha20_8xvl: 3990 .byte 9,0,0,0 3991 .rva simd_handler 3992 .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] 3993 .long 0xa0,0 3994___ 3995} 3996 3997foreach (split("\n",$code)) { 3998 s/\`([^\`]*)\`/eval $1/ge; 3999 4000 s/%x#%[yz]/%x/g; # "down-shift" 4001 4002 print $_,"\n"; 4003} 4004 4005close STDOUT; 4006
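
# The output loop above post-processes the generated text: backtick-quoted
# Perl expressions (such as the 32*($c0-8)(%rsp) stack offsets emitted by
# AVX2_lane_ROUND) are evaluated in place, and the "%x#" prefix is
# "down-shifted" so that, for example, "%x#%zmm0" becomes "%xmm0", the idiom
# the AVX512 paths use to touch only the low 128 bits of a wider register.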