#! /usr/bin/env perl
# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# December 2017
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42	    1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31	    1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40	    2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is code path optimized specifically for 128 bytes used
#	by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20
#	and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in single thread, the corresponding capability is suppressed;

# Command line: [flavour] output-file.  Perlasm convention: a single
# dotted argument is the output file, flavour defaults to undef.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator next to this script or in the
# canonical ../../perlasm location.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler so AVX/AVX2/AVX512 code paths are emitted only
# when the toolchain can encode them ($avx: 0..3).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe all generated code through the translator; fail loudly instead
# of silently producing an empty output file if it cannot be spawned.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Any &mnemonic(args) call not defined below is turned into a single
# "mnemonic\targs" line of assembly (32-bit perlasm style thunk).
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# State words @x[0..15]; slots 8..11 ("%nox") live in memory, the rest
# in GPRs.  @t holds the two registers cycled through the 'c' column.
@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are dying breed, old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...

	(
	"&add	(@x[$a0],@x[$b0])",	# Q1
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],16)",
	 "&add	(@x[$a1],@x[$b1])",	# Q2
	 "&xor	(@x[$d1],@x[$a1])",
	 "&rol	(@x[$d1],16)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],12)",
	 "&add	($xc_,@x[$d1])",
	 "&xor	(@x[$b1],$xc_)",
	 "&rol	(@x[$b1],12)",

	"&add	(@x[$a0],@x[$b0])",
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],8)",
	 "&add	(@x[$a1],@x[$b1])",
	 "&xor	(@x[$d1],@x[$a1])",
	 "&rol	(@x[$d1],8)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],7)",
	 "&add	($xc_,@x[$d1])",
	 "&xor	(@x[$b1],$xc_)",
	 "&rol	(@x[$b1],7)",

	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
	 "&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	 "&mov	($xc_,\"4*$c3(%rsp)\")",

	"&add	(@x[$a2],@x[$b2])",	# Q3
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],16)",
	 "&add	(@x[$a3],@x[$b3])",	# Q4
	 "&xor	(@x[$d3],@x[$a3])",
	 "&rol	(@x[$d3],16)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],12)",
	 "&add	($xc_,@x[$d3])",
	 "&xor	(@x[$b3],$xc_)",
	 "&rol	(@x[$b3],12)",

	"&add	(@x[$a2],@x[$b2])",
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],8)",
	 "&add	(@x[$a3],@x[$b3])",
	 "&xor	(@x[$d3],@x[$a3])",
	 "&rol	(@x[$d3],8)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],7)",
	 "&add	($xc_,@x[$d3])",
	 "&xor	(@x[$b3],$xc_)",
	 "&rol	(@x[$b3],7)"
	);
}

########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
___
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]      # 'expa'
	mov	\$0x3320646e,@x[1]      # 'nd 3'
	mov	\$0x79622d32,@x[2]      # '2-by'
	mov	\$0x6b206574,@x[3]      # 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND	(0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]      # 'expa'
	add	\$0x3320646e,@x[1]      # 'nd 3'
	add	\$0x79622d32,@x[2]      # '2-by'
	add	\$0x6b206574,@x[3]      # 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

########################################################################
# SSSE3 code path that handles shorter lengths
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}

my $xframe = $win64 ? 160+8 : 8;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
___
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}

########################################################################
# SSSE3 code path that handles 128-byte inputs
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	 &pshufb($d1,$rot16);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	 &movdqa($t1,$b1);
	&pslld	($t,12);
	 &psrld	($b1,20);
	&por	($b,$t);
	 &pslld	($t1,12);
	 &por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	 &pshufb($d1,$rot24);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	 &movdqa($t1,$b1);
	&pslld	($t,7);
	 &psrld	($b1,25);
	&por	($b,$t);
	 &pslld	($t1,7);
	 &por	($b1,$t1);
}

my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}

########################################################################
# SSSE3 code path that handles longer messages.
{
# assign variables to favor Atom front-end
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&paddd	(@x[$a0],@x[$b0])",	# Q1
	 "&paddd	(@x[$a1],@x[$b1])",	# Q2
	"&pxor	(@x[$d0],@x[$a0])",
	 "&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t1)",
	 "&pshufb	(@x[$d1],$t1)",

	"&paddd	($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	 "&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t0,@x[$b0])",
	"&pslld	(@x[$b0],12)",
	"&psrld	($t0,20)",
	 "&movdqa	($t1,@x[$b1])",
	 "&pslld	(@x[$b1],12)",
	"&por	(@x[$b0],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por	(@x[$b1],$t1)",

	"&paddd	(@x[$a0],@x[$b0])",
	 "&paddd	(@x[$a1],@x[$b1])",
	"&pxor	(@x[$d0],@x[$a0])",
	 "&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t0)",
	 "&pshufb	(@x[$d1],$t0)",

	"&paddd	($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	 "&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t1,@x[$b0])",
	"&pslld	(@x[$b0],7)",
	"&psrld	($t1,25)",
	 "&movdqa	($t0,@x[$b1])",
	 "&pslld	(@x[$b1],7)",
	"&por	(@x[$b0],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por	(@x[$b1],$t0)",

	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",

	"&paddd	(@x[$a2],@x[$b2])",	# Q3
	 "&paddd	(@x[$a3],@x[$b3])",	# Q4
	"&pxor	(@x[$d2],@x[$a2])",
	 "&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t1)",
	 "&pshufb	(@x[$d3],$t1)",

	"&paddd	($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	 "&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t0,@x[$b2])",
	"&pslld	(@x[$b2],12)",
	"&psrld	($t0,20)",
	 "&movdqa	($t1,@x[$b3])",
	 "&pslld	(@x[$b3],12)",
	"&por	(@x[$b2],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por	(@x[$b3],$t1)",

	"&paddd	(@x[$a2],@x[$b2])",
	 "&paddd	(@x[$a3],@x[$b3])",
	"&pxor	(@x[$d2],@x[$a2])",
	 "&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t0)",
	 "&pshufb	(@x[$d3],$t0)",

	"&paddd	($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	 "&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t1,@x[$b2])",
	"&pslld	(@x[$b2],7)",
	"&psrld	($t1,25)",
	 "&movdqa	($t0,@x[$b3])",
	 "&pslld	(@x[$b3],7)",
	"&por	(@x[$b2],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por	(@x[$b3],$t0)"
	);
}

my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_4x,\@function,5
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	mov	%r10,%r11
___
$code.=<<___	if ($avx>1);
	shr	\$32,%r10		# OPENSSL_ia32cap_P+8
	test	\$`1<<5`,%r10		# test AVX2
	jnz	.LChaCha20_8x
___
$code.=<<___;
	cmp	\$192,$len
	ja	.Lproceed4x

	and	\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp	\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je	.Ldo_sse3_after_all	# to detect Atom

.Lproceed4x:
	sub	\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L4x_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$xa3	# key[0]
	movdqu	($key),$xb3		# key[1]
	movdqu	16($key),$xt3		# key[2]
	movdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11

	pshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	pshufd	\$0x55,$xa3,$xa1
	movdqa	$xa0,0x40(%rsp)		# ... and offload
	pshufd	\$0xaa,$xa3,$xa2
	movdqa	$xa1,0x50(%rsp)
	pshufd	\$0xff,$xa3,$xa3
	movdqa	$xa2,0x60(%rsp)
	movdqa	$xa3,0x70(%rsp)

	pshufd	\$0x00,$xb3,$xb0
	pshufd	\$0x55,$xb3,$xb1
	movdqa	$xb0,0x80-0x100(%rcx)
	pshufd	\$0xaa,$xb3,$xb2
	movdqa	$xb1,0x90-0x100(%rcx)
	pshufd	\$0xff,$xb3,$xb3
	movdqa	$xb2,0xa0-0x100(%rcx)
	movdqa	$xb3,0xb0-0x100(%rcx)

	pshufd	\$0x00,$xt3,$xt0	# "$xc0"
	pshufd	\$0x55,$xt3,$xt1	# "$xc1"
	movdqa	$xt0,0xc0-0x100(%rcx)
	pshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa	$xt1,0xd0-0x100(%rcx)
	pshufd	\$0xff,$xt3,$xt3	# "$xc3"
	movdqa	$xt2,0xe0-0x100(%rcx)
	movdqa	$xt3,0xf0-0x100(%rcx)

	pshufd	\$0x00,$xd3,$xd0
	pshufd	\$0x55,$xd3,$xd1
	paddd	.Linc(%rip),$xd0	# don't save counters yet
	pshufd	\$0xaa,$xd3,$xd2
	movdqa	$xd1,0x110-0x100(%rcx)
	pshufd	\$0xff,$xd3,$xd3
	movdqa	$xd2,0x120-0x100(%rcx)
	movdqa	$xd3,0x130-0x100(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	0x40(%rsp),$xa0		# re-load smashed key
	movdqa	0x50(%rsp),$xa1
	movdqa	0x60(%rsp),$xa2
	movdqa	0x70(%rsp),$xa3
	movdqa	0x80-0x100(%rcx),$xb0
	movdqa	0x90-0x100(%rcx),$xb1
	movdqa	0xa0-0x100(%rcx),$xb2
	movdqa	0xb0-0x100(%rcx),$xb3
	movdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa	0x100-0x100(%rcx),$xd0
	movdqa	0x110-0x100(%rcx),$xd1
	movdqa	0x120-0x100(%rcx),$xd2
	movdqa	0x130-0x100(%rcx),$xd3
	paddd	.Lfour(%rip),$xd0	# next SIMD counters

.Loop_enter4x:
	movdqa	$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa	$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa	(%r10),$xt3		# .Lrot16(%rip)
	mov	\$10,%eax
	movdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp	.Loop4x

.align	32
.Loop4x:
___
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4x

	paddd	0x40(%rsp),$xa0		# accumulate key material
	paddd	0x50(%rsp),$xa1
	paddd	0x60(%rsp),$xa2
	paddd	0x70(%rsp),$xa3

	movdqa	$xa0,$xt2		# "de-interlace" data
	punpckldq	$xa1,$xa0
	movdqa	$xa2,$xt3
	punpckldq	$xa3,$xa2
	punpckhdq	$xa1,$xt2
	punpckhdq	$xa3,$xt3
	movdqa	$xa0,$xa1
	punpcklqdq	$xa2,$xa0	# "a0"
	movdqa	$xt2,$xa3
	punpcklqdq	$xt3,$xt2	# "a2"
	punpckhqdq	$xa2,$xa1	# "a1"
	punpckhqdq	$xt3,$xa3	# "a3"
___
	($xa2,$xt2)=($xt2,$xa2);
$code.=<<___;
	paddd	0x80-0x100(%rcx),$xb0
	paddd	0x90-0x100(%rcx),$xb1
	paddd	0xa0-0x100(%rcx),$xb2
	paddd	0xb0-0x100(%rcx),$xb3

	movdqa	$xa0,0x00(%rsp)		# offload $xaN
	movdqa	$xa1,0x10(%rsp)
	movdqa	0x20(%rsp),$xa0		# "xc2"
	movdqa	0x30(%rsp),$xa1		# "xc3"

	movdqa	$xb0,$xt2
	punpckldq	$xb1,$xb0
	movdqa	$xb2,$xt3
	punpckldq	$xb3,$xb2
	punpckhdq	$xb1,$xt2
	punpckhdq	$xb3,$xt3
	movdqa	$xb0,$xb1
	punpcklqdq	$xb2,$xb0	# "b0"
	movdqa	$xt2,$xb3
	punpcklqdq	$xt3,$xt2	# "b2"
	punpckhqdq	$xb2,$xb1	# "b1"
	punpckhqdq	$xt3,$xb3	# "b3"
___
	($xb2,$xt2)=($xt2,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	paddd	0xc0-0x100(%rcx),$xc0
	paddd	0xd0-0x100(%rcx),$xc1
	paddd	0xe0-0x100(%rcx),$xc2
	paddd	0xf0-0x100(%rcx),$xc3

	movdqa	$xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa	$xa3,0x30(%rsp)

	movdqa	$xc0,$xt2
	punpckldq	$xc1,$xc0
	movdqa	$xc2,$xt3
	punpckldq	$xc3,$xc2
	punpckhdq	$xc1,$xt2
	punpckhdq	$xc3,$xt3
	movdqa	$xc0,$xc1
	punpcklqdq	$xc2,$xc0	# "c0"
	movdqa	$xt2,$xc3
	punpcklqdq	$xt3,$xt2	# "c2"
	punpckhqdq	$xc2,$xc1	# "c1"
	punpckhqdq	$xt3,$xc3	# "c3"
___
	($xc2,$xt2)=($xt2,$xc2);
	($xt0,$xt1)=($xa2,$xa3);	# use $xaN as temporary
$code.=<<___;
	paddd	0x100-0x100(%rcx),$xd0
	paddd	0x110-0x100(%rcx),$xd1
	paddd	0x120-0x100(%rcx),$xd2
	paddd	0x130-0x100(%rcx),$xd3

	movdqa	$xd0,$xt2
	punpckldq	$xd1,$xd0
	movdqa	$xd2,$xt3
	punpckldq	$xd3,$xd2
	punpckhdq	$xd1,$xt2
	punpckhdq	$xd3,$xt3
	movdqa	$xd0,$xd1
	punpcklqdq	$xd2,$xd0	# "d0"
	movdqa	$xt2,$xd3
	punpcklqdq	$xt3,$xt2	# "d2"
	punpckhqdq	$xd2,$xd1	# "d1"
	punpckhqdq	$xt3,$xd3	# "d3"
___
	($xd2,$xt2)=($xt2,$xd2);
$code.=<<___;
	cmp	\$64*4,$len
	jb	.Ltail4x

	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3

	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# inp+=64*4
	pxor	0x30(%rsp),$xt0
	pxor	$xb3,$xt1
	pxor	$xc3,$xt2
	pxor	$xd3,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# out+=64*4

	sub	\$64*4,$len
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmp	\$192,$len
	jae	.L192_or_more4x
	cmp	\$128,$len
	jae	.L128_or_more4x
	cmp	\$64,$len
	jae	.L64_or_more4x

	#movdqa	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	xor	%r10,%r10
	#movdqa	$xt0,0x00(%rsp)
	movdqa	$xb0,0x10(%rsp)
	movdqa	$xc0,0x20(%rsp)
	movdqa	$xd0,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x

	movdqa	0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*1
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	movdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	movdqa	$xd1,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	je	.Ldone4x

	movdqa	0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x80($inp),$inp		# inp+=64*2
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	movdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	movdqa	$xd2,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3

	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x

	movdqa	0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*3
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	movdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	movdqa	$xd3,0x30(%rsp)

.Loop_tail4x:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail4x

.Ldone4x:
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
___
}

########################################################################
# XOP code path that handles all lengths.
if ($avx) {
# There is some "anomaly" observed depending on instructions' size or
# alignment. If you look closely at below code you'll notice that
# sometimes argument order varies. The order affects instruction
# encoding by making it larger, and such fiddling gives 5% performance
# improvement. This is on FX-4100...

# Register allocation for the 4-way XOP path: all sixteen ChaCha state
# words live in %xmm0-%xmm15.  @xx maps state-word index 0..15 to its
# register; note that indices 8..11 (the 'c' row) are carried in the
# $xt* registers here.
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);

# Emit one ChaCha "lane round": four quarter-rounds (Q1..Q4) interleaved
# across the four SIMD lanes.  Takes the state-word indices of the first
# quarter-round; the remaining three are derived by rotating the index
# within each group of four ((idx & ~3) + ((idx+1) & 3)).  Returns a list
# of strings, each an instruction-emitting Perl expression to be eval'ed
# by the caller.  Rotates use XOP's vprotd (rotate left by immediate).
sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);

	# a += b; d ^= a; d <<<= 16
	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],16)",
	"&vprotd	(@x[$d1],@x[$d1],16)",
	"&vprotd	(@x[$d2],@x[$d2],16)",
	"&vprotd	(@x[$d3],@x[$d3],16)",

	# c += d; b ^= c; b <<<= 12
	# ("flip" marks the argument-order variation described above,
	# which changes only encoding size, not the result)
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],12)",
	"&vprotd	(@x[$b1],@x[$b1],12)",
	"&vprotd	(@x[$b2],@x[$b2],12)",
	"&vprotd	(@x[$b3],@x[$b3],12)",

	# a += b; d ^= a; d <<<= 8
	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],8)",
	"&vprotd	(@x[$d1],@x[$d1],8)",
	"&vprotd	(@x[$d2],@x[$d2],8)",
	"&vprotd	(@x[$d3],@x[$d3],8)",

	# c += d; b ^= c; b <<<= 7
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],7)",
	"&vprotd	(@x[$b1],@x[$b1],7)",
	"&vprotd	(@x[$b2],@x[$b2],7)",
	"&vprotd	(@x[$b3],@x[$b3],7)"
	);
}

# Extra frame: 0xa8 bytes on Win64 to preserve xmm6-xmm15, else just 8.
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_4xop,\@function,5
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	mov		%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00	SIMD equivalent of @x[8-12]
	# ...
	# +0x40	constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
1457 # +0x140 1458$code.=<<___ if ($win64); 1459 movaps %xmm6,-0xa8(%r9) 1460 movaps %xmm7,-0x98(%r9) 1461 movaps %xmm8,-0x88(%r9) 1462 movaps %xmm9,-0x78(%r9) 1463 movaps %xmm10,-0x68(%r9) 1464 movaps %xmm11,-0x58(%r9) 1465 movaps %xmm12,-0x48(%r9) 1466 movaps %xmm13,-0x38(%r9) 1467 movaps %xmm14,-0x28(%r9) 1468 movaps %xmm15,-0x18(%r9) 1469.L4xop_body: 1470___ 1471$code.=<<___; 1472 vzeroupper 1473 1474 vmovdqa .Lsigma(%rip),$xa3 # key[0] 1475 vmovdqu ($key),$xb3 # key[1] 1476 vmovdqu 16($key),$xt3 # key[2] 1477 vmovdqu ($counter),$xd3 # key[3] 1478 lea 0x100(%rsp),%rcx # size optimization 1479 1480 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1481 vpshufd \$0x55,$xa3,$xa1 1482 vmovdqa $xa0,0x40(%rsp) # ... and offload 1483 vpshufd \$0xaa,$xa3,$xa2 1484 vmovdqa $xa1,0x50(%rsp) 1485 vpshufd \$0xff,$xa3,$xa3 1486 vmovdqa $xa2,0x60(%rsp) 1487 vmovdqa $xa3,0x70(%rsp) 1488 1489 vpshufd \$0x00,$xb3,$xb0 1490 vpshufd \$0x55,$xb3,$xb1 1491 vmovdqa $xb0,0x80-0x100(%rcx) 1492 vpshufd \$0xaa,$xb3,$xb2 1493 vmovdqa $xb1,0x90-0x100(%rcx) 1494 vpshufd \$0xff,$xb3,$xb3 1495 vmovdqa $xb2,0xa0-0x100(%rcx) 1496 vmovdqa $xb3,0xb0-0x100(%rcx) 1497 1498 vpshufd \$0x00,$xt3,$xt0 # "$xc0" 1499 vpshufd \$0x55,$xt3,$xt1 # "$xc1" 1500 vmovdqa $xt0,0xc0-0x100(%rcx) 1501 vpshufd \$0xaa,$xt3,$xt2 # "$xc2" 1502 vmovdqa $xt1,0xd0-0x100(%rcx) 1503 vpshufd \$0xff,$xt3,$xt3 # "$xc3" 1504 vmovdqa $xt2,0xe0-0x100(%rcx) 1505 vmovdqa $xt3,0xf0-0x100(%rcx) 1506 1507 vpshufd \$0x00,$xd3,$xd0 1508 vpshufd \$0x55,$xd3,$xd1 1509 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet 1510 vpshufd \$0xaa,$xd3,$xd2 1511 vmovdqa $xd1,0x110-0x100(%rcx) 1512 vpshufd \$0xff,$xd3,$xd3 1513 vmovdqa $xd2,0x120-0x100(%rcx) 1514 vmovdqa $xd3,0x130-0x100(%rcx) 1515 1516 jmp .Loop_enter4xop 1517 1518.align 32 1519.Loop_outer4xop: 1520 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key 1521 vmovdqa 0x50(%rsp),$xa1 1522 vmovdqa 0x60(%rsp),$xa2 1523 vmovdqa 0x70(%rsp),$xa3 1524 vmovdqa 0x80-0x100(%rcx),$xb0 1525 vmovdqa 
0x90-0x100(%rcx),$xb1 1526 vmovdqa 0xa0-0x100(%rcx),$xb2 1527 vmovdqa 0xb0-0x100(%rcx),$xb3 1528 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 1529 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 1530 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 1531 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 1532 vmovdqa 0x100-0x100(%rcx),$xd0 1533 vmovdqa 0x110-0x100(%rcx),$xd1 1534 vmovdqa 0x120-0x100(%rcx),$xd2 1535 vmovdqa 0x130-0x100(%rcx),$xd3 1536 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters 1537 1538.Loop_enter4xop: 1539 mov \$10,%eax 1540 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 1541 jmp .Loop4xop 1542 1543.align 32 1544.Loop4xop: 1545___ 1546 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } 1547 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } 1548$code.=<<___; 1549 dec %eax 1550 jnz .Loop4xop 1551 1552 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material 1553 vpaddd 0x50(%rsp),$xa1,$xa1 1554 vpaddd 0x60(%rsp),$xa2,$xa2 1555 vpaddd 0x70(%rsp),$xa3,$xa3 1556 1557 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 1558 vmovdqa $xt3,0x30(%rsp) 1559 1560 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 1561 vpunpckldq $xa3,$xa2,$xt3 1562 vpunpckhdq $xa1,$xa0,$xa0 1563 vpunpckhdq $xa3,$xa2,$xa2 1564 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 1565 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 1566 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 1567 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 1568___ 1569 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 1570$code.=<<___; 1571 vpaddd 0x80-0x100(%rcx),$xb0,$xb0 1572 vpaddd 0x90-0x100(%rcx),$xb1,$xb1 1573 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 1574 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 1575 1576 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 1577 vmovdqa $xa1,0x10(%rsp) 1578 vmovdqa 0x20(%rsp),$xa0 # "xc2" 1579 vmovdqa 0x30(%rsp),$xa1 # "xc3" 1580 1581 vpunpckldq $xb1,$xb0,$xt2 1582 vpunpckldq $xb3,$xb2,$xt3 1583 vpunpckhdq $xb1,$xb0,$xb0 1584 vpunpckhdq $xb3,$xb2,$xb2 1585 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 1586 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 1587 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 1588 vpunpckhqdq 
$xb2,$xb0,$xb0 # "b3" 1589___ 1590 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 1591 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 1592$code.=<<___; 1593 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 1594 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 1595 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 1596 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 1597 1598 vpunpckldq $xc1,$xc0,$xt2 1599 vpunpckldq $xc3,$xc2,$xt3 1600 vpunpckhdq $xc1,$xc0,$xc0 1601 vpunpckhdq $xc3,$xc2,$xc2 1602 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 1603 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 1604 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 1605 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 1606___ 1607 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 1608$code.=<<___; 1609 vpaddd 0x100-0x100(%rcx),$xd0,$xd0 1610 vpaddd 0x110-0x100(%rcx),$xd1,$xd1 1611 vpaddd 0x120-0x100(%rcx),$xd2,$xd2 1612 vpaddd 0x130-0x100(%rcx),$xd3,$xd3 1613 1614 vpunpckldq $xd1,$xd0,$xt2 1615 vpunpckldq $xd3,$xd2,$xt3 1616 vpunpckhdq $xd1,$xd0,$xd0 1617 vpunpckhdq $xd3,$xd2,$xd2 1618 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 1619 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 1620 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 1621 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 1622___ 1623 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 1624 ($xa0,$xa1)=($xt2,$xt3); 1625$code.=<<___; 1626 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 1627 vmovdqa 0x10(%rsp),$xa1 1628 1629 cmp \$64*4,$len 1630 jb .Ltail4xop 1631 1632 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1633 vpxor 0x10($inp),$xb0,$xb0 1634 vpxor 0x20($inp),$xc0,$xc0 1635 vpxor 0x30($inp),$xd0,$xd0 1636 vpxor 0x40($inp),$xa1,$xa1 1637 vpxor 0x50($inp),$xb1,$xb1 1638 vpxor 0x60($inp),$xc1,$xc1 1639 vpxor 0x70($inp),$xd1,$xd1 1640 lea 0x80($inp),$inp # size optimization 1641 vpxor 0x00($inp),$xa2,$xa2 1642 vpxor 0x10($inp),$xb2,$xb2 1643 vpxor 0x20($inp),$xc2,$xc2 1644 vpxor 0x30($inp),$xd2,$xd2 1645 vpxor 0x40($inp),$xa3,$xa3 1646 vpxor 0x50($inp),$xb3,$xb3 1647 vpxor 0x60($inp),$xc3,$xc3 1648 vpxor 0x70($inp),$xd3,$xd3 1649 lea 0x80($inp),$inp # inp+=64*4 1650 1651 
vmovdqu $xa0,0x00($out) 1652 vmovdqu $xb0,0x10($out) 1653 vmovdqu $xc0,0x20($out) 1654 vmovdqu $xd0,0x30($out) 1655 vmovdqu $xa1,0x40($out) 1656 vmovdqu $xb1,0x50($out) 1657 vmovdqu $xc1,0x60($out) 1658 vmovdqu $xd1,0x70($out) 1659 lea 0x80($out),$out # size optimization 1660 vmovdqu $xa2,0x00($out) 1661 vmovdqu $xb2,0x10($out) 1662 vmovdqu $xc2,0x20($out) 1663 vmovdqu $xd2,0x30($out) 1664 vmovdqu $xa3,0x40($out) 1665 vmovdqu $xb3,0x50($out) 1666 vmovdqu $xc3,0x60($out) 1667 vmovdqu $xd3,0x70($out) 1668 lea 0x80($out),$out # out+=64*4 1669 1670 sub \$64*4,$len 1671 jnz .Loop_outer4xop 1672 1673 jmp .Ldone4xop 1674 1675.align 32 1676.Ltail4xop: 1677 cmp \$192,$len 1678 jae .L192_or_more4xop 1679 cmp \$128,$len 1680 jae .L128_or_more4xop 1681 cmp \$64,$len 1682 jae .L64_or_more4xop 1683 1684 xor %r10,%r10 1685 vmovdqa $xa0,0x00(%rsp) 1686 vmovdqa $xb0,0x10(%rsp) 1687 vmovdqa $xc0,0x20(%rsp) 1688 vmovdqa $xd0,0x30(%rsp) 1689 jmp .Loop_tail4xop 1690 1691.align 32 1692.L64_or_more4xop: 1693 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1694 vpxor 0x10($inp),$xb0,$xb0 1695 vpxor 0x20($inp),$xc0,$xc0 1696 vpxor 0x30($inp),$xd0,$xd0 1697 vmovdqu $xa0,0x00($out) 1698 vmovdqu $xb0,0x10($out) 1699 vmovdqu $xc0,0x20($out) 1700 vmovdqu $xd0,0x30($out) 1701 je .Ldone4xop 1702 1703 lea 0x40($inp),$inp # inp+=64*1 1704 vmovdqa $xa1,0x00(%rsp) 1705 xor %r10,%r10 1706 vmovdqa $xb1,0x10(%rsp) 1707 lea 0x40($out),$out # out+=64*1 1708 vmovdqa $xc1,0x20(%rsp) 1709 sub \$64,$len # len-=64*1 1710 vmovdqa $xd1,0x30(%rsp) 1711 jmp .Loop_tail4xop 1712 1713.align 32 1714.L128_or_more4xop: 1715 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1716 vpxor 0x10($inp),$xb0,$xb0 1717 vpxor 0x20($inp),$xc0,$xc0 1718 vpxor 0x30($inp),$xd0,$xd0 1719 vpxor 0x40($inp),$xa1,$xa1 1720 vpxor 0x50($inp),$xb1,$xb1 1721 vpxor 0x60($inp),$xc1,$xc1 1722 vpxor 0x70($inp),$xd1,$xd1 1723 1724 vmovdqu $xa0,0x00($out) 1725 vmovdqu $xb0,0x10($out) 1726 vmovdqu $xc0,0x20($out) 1727 vmovdqu $xd0,0x30($out) 1728 vmovdqu 
$xa1,0x40($out) 1729 vmovdqu $xb1,0x50($out) 1730 vmovdqu $xc1,0x60($out) 1731 vmovdqu $xd1,0x70($out) 1732 je .Ldone4xop 1733 1734 lea 0x80($inp),$inp # inp+=64*2 1735 vmovdqa $xa2,0x00(%rsp) 1736 xor %r10,%r10 1737 vmovdqa $xb2,0x10(%rsp) 1738 lea 0x80($out),$out # out+=64*2 1739 vmovdqa $xc2,0x20(%rsp) 1740 sub \$128,$len # len-=64*2 1741 vmovdqa $xd2,0x30(%rsp) 1742 jmp .Loop_tail4xop 1743 1744.align 32 1745.L192_or_more4xop: 1746 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1747 vpxor 0x10($inp),$xb0,$xb0 1748 vpxor 0x20($inp),$xc0,$xc0 1749 vpxor 0x30($inp),$xd0,$xd0 1750 vpxor 0x40($inp),$xa1,$xa1 1751 vpxor 0x50($inp),$xb1,$xb1 1752 vpxor 0x60($inp),$xc1,$xc1 1753 vpxor 0x70($inp),$xd1,$xd1 1754 lea 0x80($inp),$inp # size optimization 1755 vpxor 0x00($inp),$xa2,$xa2 1756 vpxor 0x10($inp),$xb2,$xb2 1757 vpxor 0x20($inp),$xc2,$xc2 1758 vpxor 0x30($inp),$xd2,$xd2 1759 1760 vmovdqu $xa0,0x00($out) 1761 vmovdqu $xb0,0x10($out) 1762 vmovdqu $xc0,0x20($out) 1763 vmovdqu $xd0,0x30($out) 1764 vmovdqu $xa1,0x40($out) 1765 vmovdqu $xb1,0x50($out) 1766 vmovdqu $xc1,0x60($out) 1767 vmovdqu $xd1,0x70($out) 1768 lea 0x80($out),$out # size optimization 1769 vmovdqu $xa2,0x00($out) 1770 vmovdqu $xb2,0x10($out) 1771 vmovdqu $xc2,0x20($out) 1772 vmovdqu $xd2,0x30($out) 1773 je .Ldone4xop 1774 1775 lea 0x40($inp),$inp # inp+=64*3 1776 vmovdqa $xa3,0x00(%rsp) 1777 xor %r10,%r10 1778 vmovdqa $xb3,0x10(%rsp) 1779 lea 0x40($out),$out # out+=64*3 1780 vmovdqa $xc3,0x20(%rsp) 1781 sub \$192,$len # len-=64*3 1782 vmovdqa $xd3,0x30(%rsp) 1783 1784.Loop_tail4xop: 1785 movzb ($inp,%r10),%eax 1786 movzb (%rsp,%r10),%ecx 1787 lea 1(%r10),%r10 1788 xor %ecx,%eax 1789 mov %al,-1($out,%r10) 1790 dec $len 1791 jnz .Loop_tail4xop 1792 1793.Ldone4xop: 1794 vzeroupper 1795___ 1796$code.=<<___ if ($win64); 1797 movaps -0xa8(%r9),%xmm6 1798 movaps -0x98(%r9),%xmm7 1799 movaps -0x88(%r9),%xmm8 1800 movaps -0x78(%r9),%xmm9 1801 movaps -0x68(%r9),%xmm10 1802 movaps -0x58(%r9),%xmm11 1803 movaps 
-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
___
}

########################################################################
# AVX2 code path
if ($avx>1) {
# Register allocation for the 8-way AVX2 path: state words 0..7 and
# 12..15 live in %ymm registers; the 'c' row (indices 8..11) is kept in
# memory, hence the "%nox" placeholders in @xx (see the comment inside
# AVX2_lane_ROUND below).
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

# Emit one ChaCha "lane round": four quarter-rounds (Q1..Q4) across the
# eight SIMD lanes.  Takes the state-word indices of the first quarter-
# round; the other three are derived by rotating the index within each
# group of four.  Returns a list of instruction-emitting strings to be
# eval'ed by the caller.  16-bit and 8-bit rotates use vpshufb with the
# masks at (%r10)/.Lrot16 and (%r11)/.Lrot24; 12-bit and 7-bit rotates
# use the shift/shift/or sequence.
sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t1)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor	(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t0,@x[$b0],12)",
	"&vpsrld	(@x[$b0],@x[$b0],20)",
	"&vpor	(@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor	(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t1,@x[$b1],12)",
	"&vpsrld	(@x[$b1],@x[$b1],20)",
	"&vpor	(@x[$b1],$t1,@x[$b1])",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t0)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor	(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t1,@x[$b0],7)",
	"&vpsrld	(@x[$b0],@x[$b0],25)",
	"&vpor	(@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor	(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t0,@x[$b1],7)",
	"&vpsrld	(@x[$b1],@x[$b1],25)",
	"&vpor	(@x[$b1],$t0,@x[$b1])",

	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
	"&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t1)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor	(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t0,@x[$b2],12)",
	"&vpsrld	(@x[$b2],@x[$b2],20)",
	"&vpor	(@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor	(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t1,@x[$b3],12)",
	"&vpsrld	(@x[$b3],@x[$b3],20)",
	"&vpor	(@x[$b3],$t1,@x[$b3])",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t0)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor	(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t1,@x[$b2],7)",
	"&vpsrld	(@x[$b2],@x[$b2],25)",
	"&vpor	(@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor	(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t0,@x[$b3],7)",
	"&vpsrld	(@x[$b3],@x[$b3],25)",
	"&vpor	(@x[$b3],$t0,@x[$b3])"
	);
}

# Extra frame: 0xa8 bytes on Win64 to preserve xmm6-xmm15, else just 8.
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_8x,\@function,5
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	mov		%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$0x280+$xframe,%rsp
	and	\$-32,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L8x_body:
___
$code.=<<___;
	vzeroupper

	################ stack layout
	# +0x00	SIMD equivalent of @x[8-12]
	# ...
	# +0x80	constant copy of key[0-2] smashed by lanes
	# ...
	# +0x200	SIMD counters (with nonce smashed by lanes)
	# ...
1976 # +0x280 1977 1978 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] 1979 vbroadcasti128 ($key),$xb3 # key[1] 1980 vbroadcasti128 16($key),$xt3 # key[2] 1981 vbroadcasti128 ($counter),$xd3 # key[3] 1982 lea 0x100(%rsp),%rcx # size optimization 1983 lea 0x200(%rsp),%rax # size optimization 1984 lea .Lrot16(%rip),%r10 1985 lea .Lrot24(%rip),%r11 1986 1987 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1988 vpshufd \$0x55,$xa3,$xa1 1989 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload 1990 vpshufd \$0xaa,$xa3,$xa2 1991 vmovdqa $xa1,0xa0-0x100(%rcx) 1992 vpshufd \$0xff,$xa3,$xa3 1993 vmovdqa $xa2,0xc0-0x100(%rcx) 1994 vmovdqa $xa3,0xe0-0x100(%rcx) 1995 1996 vpshufd \$0x00,$xb3,$xb0 1997 vpshufd \$0x55,$xb3,$xb1 1998 vmovdqa $xb0,0x100-0x100(%rcx) 1999 vpshufd \$0xaa,$xb3,$xb2 2000 vmovdqa $xb1,0x120-0x100(%rcx) 2001 vpshufd \$0xff,$xb3,$xb3 2002 vmovdqa $xb2,0x140-0x100(%rcx) 2003 vmovdqa $xb3,0x160-0x100(%rcx) 2004 2005 vpshufd \$0x00,$xt3,$xt0 # "xc0" 2006 vpshufd \$0x55,$xt3,$xt1 # "xc1" 2007 vmovdqa $xt0,0x180-0x200(%rax) 2008 vpshufd \$0xaa,$xt3,$xt2 # "xc2" 2009 vmovdqa $xt1,0x1a0-0x200(%rax) 2010 vpshufd \$0xff,$xt3,$xt3 # "xc3" 2011 vmovdqa $xt2,0x1c0-0x200(%rax) 2012 vmovdqa $xt3,0x1e0-0x200(%rax) 2013 2014 vpshufd \$0x00,$xd3,$xd0 2015 vpshufd \$0x55,$xd3,$xd1 2016 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 2017 vpshufd \$0xaa,$xd3,$xd2 2018 vmovdqa $xd1,0x220-0x200(%rax) 2019 vpshufd \$0xff,$xd3,$xd3 2020 vmovdqa $xd2,0x240-0x200(%rax) 2021 vmovdqa $xd3,0x260-0x200(%rax) 2022 2023 jmp .Loop_enter8x 2024 2025.align 32 2026.Loop_outer8x: 2027 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key 2028 vmovdqa 0xa0-0x100(%rcx),$xa1 2029 vmovdqa 0xc0-0x100(%rcx),$xa2 2030 vmovdqa 0xe0-0x100(%rcx),$xa3 2031 vmovdqa 0x100-0x100(%rcx),$xb0 2032 vmovdqa 0x120-0x100(%rcx),$xb1 2033 vmovdqa 0x140-0x100(%rcx),$xb2 2034 vmovdqa 0x160-0x100(%rcx),$xb3 2035 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" 2036 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" 2037 vmovdqa 
0x1c0-0x200(%rax),$xt2 # "xc2" 2038 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" 2039 vmovdqa 0x200-0x200(%rax),$xd0 2040 vmovdqa 0x220-0x200(%rax),$xd1 2041 vmovdqa 0x240-0x200(%rax),$xd2 2042 vmovdqa 0x260-0x200(%rax),$xd3 2043 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters 2044 2045.Loop_enter8x: 2046 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" 2047 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" 2048 vbroadcasti128 (%r10),$xt3 2049 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters 2050 mov \$10,%eax 2051 jmp .Loop8x 2052 2053.align 32 2054.Loop8x: 2055___ 2056 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } 2057 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } 2058$code.=<<___; 2059 dec %eax 2060 jnz .Loop8x 2061 2062 lea 0x200(%rsp),%rax # size optimization 2063 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key 2064 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 2065 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 2066 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 2067 2068 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 2069 vpunpckldq $xa3,$xa2,$xt3 2070 vpunpckhdq $xa1,$xa0,$xa0 2071 vpunpckhdq $xa3,$xa2,$xa2 2072 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 2073 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 2074 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 2075 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 2076___ 2077 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 2078$code.=<<___; 2079 vpaddd 0x100-0x100(%rcx),$xb0,$xb0 2080 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 2081 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 2082 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 2083 2084 vpunpckldq $xb1,$xb0,$xt2 2085 vpunpckldq $xb3,$xb2,$xt3 2086 vpunpckhdq $xb1,$xb0,$xb0 2087 vpunpckhdq $xb3,$xb2,$xb2 2088 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 2089 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 2090 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 2091 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 2092___ 2093 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 2094$code.=<<___; 2095 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further 2096 vperm2i128 \$0x31,$xb0,$xa0,$xb0 2097 
vperm2i128 \$0x20,$xb1,$xa1,$xa0 2098 vperm2i128 \$0x31,$xb1,$xa1,$xb1 2099 vperm2i128 \$0x20,$xb2,$xa2,$xa1 2100 vperm2i128 \$0x31,$xb2,$xa2,$xb2 2101 vperm2i128 \$0x20,$xb3,$xa3,$xa2 2102 vperm2i128 \$0x31,$xb3,$xa3,$xb3 2103___ 2104 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 2105 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 2106$code.=<<___; 2107 vmovdqa $xa0,0x00(%rsp) # offload $xaN 2108 vmovdqa $xa1,0x20(%rsp) 2109 vmovdqa 0x40(%rsp),$xc2 # $xa0 2110 vmovdqa 0x60(%rsp),$xc3 # $xa1 2111 2112 vpaddd 0x180-0x200(%rax),$xc0,$xc0 2113 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 2114 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 2115 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 2116 2117 vpunpckldq $xc1,$xc0,$xt2 2118 vpunpckldq $xc3,$xc2,$xt3 2119 vpunpckhdq $xc1,$xc0,$xc0 2120 vpunpckhdq $xc3,$xc2,$xc2 2121 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 2122 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 2123 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 2124 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 2125___ 2126 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 2127$code.=<<___; 2128 vpaddd 0x200-0x200(%rax),$xd0,$xd0 2129 vpaddd 0x220-0x200(%rax),$xd1,$xd1 2130 vpaddd 0x240-0x200(%rax),$xd2,$xd2 2131 vpaddd 0x260-0x200(%rax),$xd3,$xd3 2132 2133 vpunpckldq $xd1,$xd0,$xt2 2134 vpunpckldq $xd3,$xd2,$xt3 2135 vpunpckhdq $xd1,$xd0,$xd0 2136 vpunpckhdq $xd3,$xd2,$xd2 2137 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 2138 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 2139 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 2140 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 2141___ 2142 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 2143$code.=<<___; 2144 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 2145 vperm2i128 \$0x31,$xd0,$xc0,$xd0 2146 vperm2i128 \$0x20,$xd1,$xc1,$xc0 2147 vperm2i128 \$0x31,$xd1,$xc1,$xd1 2148 vperm2i128 \$0x20,$xd2,$xc2,$xc1 2149 vperm2i128 \$0x31,$xd2,$xc2,$xd2 2150 vperm2i128 \$0x20,$xd3,$xc3,$xc2 2151 vperm2i128 \$0x31,$xd3,$xc3,$xd3 2152___ 2153 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 2154 
($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 2155 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 2156 ($xa0,$xa1)=($xt2,$xt3); 2157$code.=<<___; 2158 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 2159 vmovdqa 0x20(%rsp),$xa1 2160 2161 cmp \$64*8,$len 2162 jb .Ltail8x 2163 2164 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2165 vpxor 0x20($inp),$xb0,$xb0 2166 vpxor 0x40($inp),$xc0,$xc0 2167 vpxor 0x60($inp),$xd0,$xd0 2168 lea 0x80($inp),$inp # size optimization 2169 vmovdqu $xa0,0x00($out) 2170 vmovdqu $xb0,0x20($out) 2171 vmovdqu $xc0,0x40($out) 2172 vmovdqu $xd0,0x60($out) 2173 lea 0x80($out),$out # size optimization 2174 2175 vpxor 0x00($inp),$xa1,$xa1 2176 vpxor 0x20($inp),$xb1,$xb1 2177 vpxor 0x40($inp),$xc1,$xc1 2178 vpxor 0x60($inp),$xd1,$xd1 2179 lea 0x80($inp),$inp # size optimization 2180 vmovdqu $xa1,0x00($out) 2181 vmovdqu $xb1,0x20($out) 2182 vmovdqu $xc1,0x40($out) 2183 vmovdqu $xd1,0x60($out) 2184 lea 0x80($out),$out # size optimization 2185 2186 vpxor 0x00($inp),$xa2,$xa2 2187 vpxor 0x20($inp),$xb2,$xb2 2188 vpxor 0x40($inp),$xc2,$xc2 2189 vpxor 0x60($inp),$xd2,$xd2 2190 lea 0x80($inp),$inp # size optimization 2191 vmovdqu $xa2,0x00($out) 2192 vmovdqu $xb2,0x20($out) 2193 vmovdqu $xc2,0x40($out) 2194 vmovdqu $xd2,0x60($out) 2195 lea 0x80($out),$out # size optimization 2196 2197 vpxor 0x00($inp),$xa3,$xa3 2198 vpxor 0x20($inp),$xb3,$xb3 2199 vpxor 0x40($inp),$xc3,$xc3 2200 vpxor 0x60($inp),$xd3,$xd3 2201 lea 0x80($inp),$inp # size optimization 2202 vmovdqu $xa3,0x00($out) 2203 vmovdqu $xb3,0x20($out) 2204 vmovdqu $xc3,0x40($out) 2205 vmovdqu $xd3,0x60($out) 2206 lea 0x80($out),$out # size optimization 2207 2208 sub \$64*8,$len 2209 jnz .Loop_outer8x 2210 2211 jmp .Ldone8x 2212 2213.Ltail8x: 2214 cmp \$448,$len 2215 jae .L448_or_more8x 2216 cmp \$384,$len 2217 jae .L384_or_more8x 2218 cmp \$320,$len 2219 jae .L320_or_more8x 2220 cmp \$256,$len 2221 jae .L256_or_more8x 2222 cmp \$192,$len 2223 jae .L192_or_more8x 2224 cmp \$128,$len 2225 jae 
.L128_or_more8x 2226 cmp \$64,$len 2227 jae .L64_or_more8x 2228 2229 xor %r10,%r10 2230 vmovdqa $xa0,0x00(%rsp) 2231 vmovdqa $xb0,0x20(%rsp) 2232 jmp .Loop_tail8x 2233 2234.align 32 2235.L64_or_more8x: 2236 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2237 vpxor 0x20($inp),$xb0,$xb0 2238 vmovdqu $xa0,0x00($out) 2239 vmovdqu $xb0,0x20($out) 2240 je .Ldone8x 2241 2242 lea 0x40($inp),$inp # inp+=64*1 2243 xor %r10,%r10 2244 vmovdqa $xc0,0x00(%rsp) 2245 lea 0x40($out),$out # out+=64*1 2246 sub \$64,$len # len-=64*1 2247 vmovdqa $xd0,0x20(%rsp) 2248 jmp .Loop_tail8x 2249 2250.align 32 2251.L128_or_more8x: 2252 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2253 vpxor 0x20($inp),$xb0,$xb0 2254 vpxor 0x40($inp),$xc0,$xc0 2255 vpxor 0x60($inp),$xd0,$xd0 2256 vmovdqu $xa0,0x00($out) 2257 vmovdqu $xb0,0x20($out) 2258 vmovdqu $xc0,0x40($out) 2259 vmovdqu $xd0,0x60($out) 2260 je .Ldone8x 2261 2262 lea 0x80($inp),$inp # inp+=64*2 2263 xor %r10,%r10 2264 vmovdqa $xa1,0x00(%rsp) 2265 lea 0x80($out),$out # out+=64*2 2266 sub \$128,$len # len-=64*2 2267 vmovdqa $xb1,0x20(%rsp) 2268 jmp .Loop_tail8x 2269 2270.align 32 2271.L192_or_more8x: 2272 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2273 vpxor 0x20($inp),$xb0,$xb0 2274 vpxor 0x40($inp),$xc0,$xc0 2275 vpxor 0x60($inp),$xd0,$xd0 2276 vpxor 0x80($inp),$xa1,$xa1 2277 vpxor 0xa0($inp),$xb1,$xb1 2278 vmovdqu $xa0,0x00($out) 2279 vmovdqu $xb0,0x20($out) 2280 vmovdqu $xc0,0x40($out) 2281 vmovdqu $xd0,0x60($out) 2282 vmovdqu $xa1,0x80($out) 2283 vmovdqu $xb1,0xa0($out) 2284 je .Ldone8x 2285 2286 lea 0xc0($inp),$inp # inp+=64*3 2287 xor %r10,%r10 2288 vmovdqa $xc1,0x00(%rsp) 2289 lea 0xc0($out),$out # out+=64*3 2290 sub \$192,$len # len-=64*3 2291 vmovdqa $xd1,0x20(%rsp) 2292 jmp .Loop_tail8x 2293 2294.align 32 2295.L256_or_more8x: 2296 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2297 vpxor 0x20($inp),$xb0,$xb0 2298 vpxor 0x40($inp),$xc0,$xc0 2299 vpxor 0x60($inp),$xd0,$xd0 2300 vpxor 0x80($inp),$xa1,$xa1 2301 vpxor 0xa0($inp),$xb1,$xb1 2302 
vpxor 0xc0($inp),$xc1,$xc1 2303 vpxor 0xe0($inp),$xd1,$xd1 2304 vmovdqu $xa0,0x00($out) 2305 vmovdqu $xb0,0x20($out) 2306 vmovdqu $xc0,0x40($out) 2307 vmovdqu $xd0,0x60($out) 2308 vmovdqu $xa1,0x80($out) 2309 vmovdqu $xb1,0xa0($out) 2310 vmovdqu $xc1,0xc0($out) 2311 vmovdqu $xd1,0xe0($out) 2312 je .Ldone8x 2313 2314 lea 0x100($inp),$inp # inp+=64*4 2315 xor %r10,%r10 2316 vmovdqa $xa2,0x00(%rsp) 2317 lea 0x100($out),$out # out+=64*4 2318 sub \$256,$len # len-=64*4 2319 vmovdqa $xb2,0x20(%rsp) 2320 jmp .Loop_tail8x 2321 2322.align 32 2323.L320_or_more8x: 2324 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2325 vpxor 0x20($inp),$xb0,$xb0 2326 vpxor 0x40($inp),$xc0,$xc0 2327 vpxor 0x60($inp),$xd0,$xd0 2328 vpxor 0x80($inp),$xa1,$xa1 2329 vpxor 0xa0($inp),$xb1,$xb1 2330 vpxor 0xc0($inp),$xc1,$xc1 2331 vpxor 0xe0($inp),$xd1,$xd1 2332 vpxor 0x100($inp),$xa2,$xa2 2333 vpxor 0x120($inp),$xb2,$xb2 2334 vmovdqu $xa0,0x00($out) 2335 vmovdqu $xb0,0x20($out) 2336 vmovdqu $xc0,0x40($out) 2337 vmovdqu $xd0,0x60($out) 2338 vmovdqu $xa1,0x80($out) 2339 vmovdqu $xb1,0xa0($out) 2340 vmovdqu $xc1,0xc0($out) 2341 vmovdqu $xd1,0xe0($out) 2342 vmovdqu $xa2,0x100($out) 2343 vmovdqu $xb2,0x120($out) 2344 je .Ldone8x 2345 2346 lea 0x140($inp),$inp # inp+=64*5 2347 xor %r10,%r10 2348 vmovdqa $xc2,0x00(%rsp) 2349 lea 0x140($out),$out # out+=64*5 2350 sub \$320,$len # len-=64*5 2351 vmovdqa $xd2,0x20(%rsp) 2352 jmp .Loop_tail8x 2353 2354.align 32 2355.L384_or_more8x: 2356 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2357 vpxor 0x20($inp),$xb0,$xb0 2358 vpxor 0x40($inp),$xc0,$xc0 2359 vpxor 0x60($inp),$xd0,$xd0 2360 vpxor 0x80($inp),$xa1,$xa1 2361 vpxor 0xa0($inp),$xb1,$xb1 2362 vpxor 0xc0($inp),$xc1,$xc1 2363 vpxor 0xe0($inp),$xd1,$xd1 2364 vpxor 0x100($inp),$xa2,$xa2 2365 vpxor 0x120($inp),$xb2,$xb2 2366 vpxor 0x140($inp),$xc2,$xc2 2367 vpxor 0x160($inp),$xd2,$xd2 2368 vmovdqu $xa0,0x00($out) 2369 vmovdqu $xb0,0x20($out) 2370 vmovdqu $xc0,0x40($out) 2371 vmovdqu $xd0,0x60($out) 2372 vmovdqu 
$xa1,0x80($out) 2373 vmovdqu $xb1,0xa0($out) 2374 vmovdqu $xc1,0xc0($out) 2375 vmovdqu $xd1,0xe0($out) 2376 vmovdqu $xa2,0x100($out) 2377 vmovdqu $xb2,0x120($out) 2378 vmovdqu $xc2,0x140($out) 2379 vmovdqu $xd2,0x160($out) 2380 je .Ldone8x 2381 2382 lea 0x180($inp),$inp # inp+=64*6 2383 xor %r10,%r10 2384 vmovdqa $xa3,0x00(%rsp) 2385 lea 0x180($out),$out # out+=64*6 2386 sub \$384,$len # len-=64*6 2387 vmovdqa $xb3,0x20(%rsp) 2388 jmp .Loop_tail8x 2389 2390.align 32 2391.L448_or_more8x: 2392 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2393 vpxor 0x20($inp),$xb0,$xb0 2394 vpxor 0x40($inp),$xc0,$xc0 2395 vpxor 0x60($inp),$xd0,$xd0 2396 vpxor 0x80($inp),$xa1,$xa1 2397 vpxor 0xa0($inp),$xb1,$xb1 2398 vpxor 0xc0($inp),$xc1,$xc1 2399 vpxor 0xe0($inp),$xd1,$xd1 2400 vpxor 0x100($inp),$xa2,$xa2 2401 vpxor 0x120($inp),$xb2,$xb2 2402 vpxor 0x140($inp),$xc2,$xc2 2403 vpxor 0x160($inp),$xd2,$xd2 2404 vpxor 0x180($inp),$xa3,$xa3 2405 vpxor 0x1a0($inp),$xb3,$xb3 2406 vmovdqu $xa0,0x00($out) 2407 vmovdqu $xb0,0x20($out) 2408 vmovdqu $xc0,0x40($out) 2409 vmovdqu $xd0,0x60($out) 2410 vmovdqu $xa1,0x80($out) 2411 vmovdqu $xb1,0xa0($out) 2412 vmovdqu $xc1,0xc0($out) 2413 vmovdqu $xd1,0xe0($out) 2414 vmovdqu $xa2,0x100($out) 2415 vmovdqu $xb2,0x120($out) 2416 vmovdqu $xc2,0x140($out) 2417 vmovdqu $xd2,0x160($out) 2418 vmovdqu $xa3,0x180($out) 2419 vmovdqu $xb3,0x1a0($out) 2420 je .Ldone8x 2421 2422 lea 0x1c0($inp),$inp # inp+=64*7 2423 xor %r10,%r10 2424 vmovdqa $xc3,0x00(%rsp) 2425 lea 0x1c0($out),$out # out+=64*7 2426 sub \$448,$len # len-=64*7 2427 vmovdqa $xd3,0x20(%rsp) 2428 2429.Loop_tail8x: 2430 movzb ($inp,%r10),%eax 2431 movzb (%rsp,%r10),%ecx 2432 lea 1(%r10),%r10 2433 xor %ecx,%eax 2434 mov %al,-1($out,%r10) 2435 dec $len 2436 jnz .Loop_tail8x 2437 2438.Ldone8x: 2439 vzeroall 2440___ 2441$code.=<<___ if ($win64); 2442 movaps -0xa8(%r9),%xmm6 2443 movaps -0x98(%r9),%xmm7 2444 movaps -0x88(%r9),%xmm8 2445 movaps -0x78(%r9),%xmm9 2446 movaps -0x68(%r9),%xmm10 2447 movaps 
-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
___
}

########################################################################
# AVX512 code paths
if ($avx>2) {
# This one handles shorter inputs...

# Register assignment for the short-input AVX512 path: $a..$d are the four
# ChaCha state rows, each 128-bit row broadcast across a 512-bit register
# (see vbroadcasti32x4 below), so one iteration covers several blocks at
# once; $a_..$d_ preserve the pristine input state for the final
# feed-forward addition, and $fourz holds the per-iteration counter
# increment loaded from .Lfourz.
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
# $t0..$t3 are xmm scratch used when xoring key stream with the input.
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

# Emit "vpxor" or "vpxord" for the given operands (destination last, AT&T
# order produced by the reverse below).  Plain vpxor has the shorter VEX
# encoding, but only vpxord's EVEX encoding can reach zmm registers or
# registers numbered 16 and above — hence the scan of the operand list.
sub vpxord()		# size optimization
{ my $opcode = "vpxor";	# adhere to vpxor when possible

    foreach (@_) {
	if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
	    $opcode = "vpxord";
	    last;
	}
    }

    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
}

# Emit one full ChaCha double-quarter-round step over ($a,$b,$c,$d),
# using AVX512 vprold for the 16/12/8/7-bit rotations, so no
# shift-and-or rotation emulation is required on this path.
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,16);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,12);

	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,8);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,7);
}

# Stack frame size: on Win64 it includes the xmm6-xmm15 save area
# (value is completed on the following source line).
my $xframe = $win64 ?
160+8 : 8; 2503 2504$code.=<<___; 2505.type ChaCha20_avx512,\@function,5 2506.align 32 2507ChaCha20_avx512: 2508.cfi_startproc 2509.LChaCha20_avx512: 2510 mov %rsp,%r9 # frame pointer 2511.cfi_def_cfa_register %r9 2512 cmp \$512,$len 2513 ja .LChaCha20_16x 2514 2515 sub \$64+$xframe,%rsp 2516___ 2517$code.=<<___ if ($win64); 2518 movaps %xmm6,-0xa8(%r9) 2519 movaps %xmm7,-0x98(%r9) 2520 movaps %xmm8,-0x88(%r9) 2521 movaps %xmm9,-0x78(%r9) 2522 movaps %xmm10,-0x68(%r9) 2523 movaps %xmm11,-0x58(%r9) 2524 movaps %xmm12,-0x48(%r9) 2525 movaps %xmm13,-0x38(%r9) 2526 movaps %xmm14,-0x28(%r9) 2527 movaps %xmm15,-0x18(%r9) 2528.Lavx512_body: 2529___ 2530$code.=<<___; 2531 vbroadcasti32x4 .Lsigma(%rip),$a 2532 vbroadcasti32x4 ($key),$b 2533 vbroadcasti32x4 16($key),$c 2534 vbroadcasti32x4 ($counter),$d 2535 2536 vmovdqa32 $a,$a_ 2537 vmovdqa32 $b,$b_ 2538 vmovdqa32 $c,$c_ 2539 vpaddd .Lzeroz(%rip),$d,$d 2540 vmovdqa32 .Lfourz(%rip),$fourz 2541 mov \$10,$counter # reuse $counter 2542 vmovdqa32 $d,$d_ 2543 jmp .Loop_avx512 2544 2545.align 16 2546.Loop_outer_avx512: 2547 vmovdqa32 $a_,$a 2548 vmovdqa32 $b_,$b 2549 vmovdqa32 $c_,$c 2550 vpaddd $fourz,$d_,$d 2551 mov \$10,$counter 2552 vmovdqa32 $d,$d_ 2553 jmp .Loop_avx512 2554 2555.align 32 2556.Loop_avx512: 2557___ 2558 &AVX512ROUND(); 2559 &vpshufd ($c,$c,0b01001110); 2560 &vpshufd ($b,$b,0b00111001); 2561 &vpshufd ($d,$d,0b10010011); 2562 2563 &AVX512ROUND(); 2564 &vpshufd ($c,$c,0b01001110); 2565 &vpshufd ($b,$b,0b10010011); 2566 &vpshufd ($d,$d,0b00111001); 2567 2568 &dec ($counter); 2569 &jnz (".Loop_avx512"); 2570 2571$code.=<<___; 2572 vpaddd $a_,$a,$a 2573 vpaddd $b_,$b,$b 2574 vpaddd $c_,$c,$c 2575 vpaddd $d_,$d,$d 2576 2577 sub \$64,$len 2578 jb .Ltail64_avx512 2579 2580 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2581 vpxor 0x10($inp),%x#$b,$t1 2582 vpxor 0x20($inp),%x#$c,$t2 2583 vpxor 0x30($inp),%x#$d,$t3 2584 lea 0x40($inp),$inp # inp+=64 2585 2586 vmovdqu $t0,0x00($out) # write output 2587 vmovdqu 
$t1,0x10($out) 2588 vmovdqu $t2,0x20($out) 2589 vmovdqu $t3,0x30($out) 2590 lea 0x40($out),$out # out+=64 2591 2592 jz .Ldone_avx512 2593 2594 vextracti32x4 \$1,$a,$t0 2595 vextracti32x4 \$1,$b,$t1 2596 vextracti32x4 \$1,$c,$t2 2597 vextracti32x4 \$1,$d,$t3 2598 2599 sub \$64,$len 2600 jb .Ltail_avx512 2601 2602 vpxor 0x00($inp),$t0,$t0 # xor with input 2603 vpxor 0x10($inp),$t1,$t1 2604 vpxor 0x20($inp),$t2,$t2 2605 vpxor 0x30($inp),$t3,$t3 2606 lea 0x40($inp),$inp # inp+=64 2607 2608 vmovdqu $t0,0x00($out) # write output 2609 vmovdqu $t1,0x10($out) 2610 vmovdqu $t2,0x20($out) 2611 vmovdqu $t3,0x30($out) 2612 lea 0x40($out),$out # out+=64 2613 2614 jz .Ldone_avx512 2615 2616 vextracti32x4 \$2,$a,$t0 2617 vextracti32x4 \$2,$b,$t1 2618 vextracti32x4 \$2,$c,$t2 2619 vextracti32x4 \$2,$d,$t3 2620 2621 sub \$64,$len 2622 jb .Ltail_avx512 2623 2624 vpxor 0x00($inp),$t0,$t0 # xor with input 2625 vpxor 0x10($inp),$t1,$t1 2626 vpxor 0x20($inp),$t2,$t2 2627 vpxor 0x30($inp),$t3,$t3 2628 lea 0x40($inp),$inp # inp+=64 2629 2630 vmovdqu $t0,0x00($out) # write output 2631 vmovdqu $t1,0x10($out) 2632 vmovdqu $t2,0x20($out) 2633 vmovdqu $t3,0x30($out) 2634 lea 0x40($out),$out # out+=64 2635 2636 jz .Ldone_avx512 2637 2638 vextracti32x4 \$3,$a,$t0 2639 vextracti32x4 \$3,$b,$t1 2640 vextracti32x4 \$3,$c,$t2 2641 vextracti32x4 \$3,$d,$t3 2642 2643 sub \$64,$len 2644 jb .Ltail_avx512 2645 2646 vpxor 0x00($inp),$t0,$t0 # xor with input 2647 vpxor 0x10($inp),$t1,$t1 2648 vpxor 0x20($inp),$t2,$t2 2649 vpxor 0x30($inp),$t3,$t3 2650 lea 0x40($inp),$inp # inp+=64 2651 2652 vmovdqu $t0,0x00($out) # write output 2653 vmovdqu $t1,0x10($out) 2654 vmovdqu $t2,0x20($out) 2655 vmovdqu $t3,0x30($out) 2656 lea 0x40($out),$out # out+=64 2657 2658 jnz .Loop_outer_avx512 2659 2660 jmp .Ldone_avx512 2661 2662.align 16 2663.Ltail64_avx512: 2664 vmovdqa %x#$a,0x00(%rsp) 2665 vmovdqa %x#$b,0x10(%rsp) 2666 vmovdqa %x#$c,0x20(%rsp) 2667 vmovdqa %x#$d,0x30(%rsp) 2668 add \$64,$len 2669 jmp .Loop_tail_avx512 
2670 2671.align 16 2672.Ltail_avx512: 2673 vmovdqa $t0,0x00(%rsp) 2674 vmovdqa $t1,0x10(%rsp) 2675 vmovdqa $t2,0x20(%rsp) 2676 vmovdqa $t3,0x30(%rsp) 2677 add \$64,$len 2678 2679.Loop_tail_avx512: 2680 movzb ($inp,$counter),%eax 2681 movzb (%rsp,$counter),%ecx 2682 lea 1($counter),$counter 2683 xor %ecx,%eax 2684 mov %al,-1($out,$counter) 2685 dec $len 2686 jnz .Loop_tail_avx512 2687 2688 vmovdqu32 $a_,0x00(%rsp) 2689 2690.Ldone_avx512: 2691 vzeroall 2692___ 2693$code.=<<___ if ($win64); 2694 movaps -0xa8(%r9),%xmm6 2695 movaps -0x98(%r9),%xmm7 2696 movaps -0x88(%r9),%xmm8 2697 movaps -0x78(%r9),%xmm9 2698 movaps -0x68(%r9),%xmm10 2699 movaps -0x58(%r9),%xmm11 2700 movaps -0x48(%r9),%xmm12 2701 movaps -0x38(%r9),%xmm13 2702 movaps -0x28(%r9),%xmm14 2703 movaps -0x18(%r9),%xmm15 2704___ 2705$code.=<<___; 2706 lea (%r9),%rsp 2707.cfi_def_cfa_register %rsp 2708.Lavx512_epilogue: 2709 ret 2710.cfi_endproc 2711.size ChaCha20_avx512,.-ChaCha20_avx512 2712___ 2713 2714map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); 2715 2716$code.=<<___; 2717.type ChaCha20_avx512vl,\@function,5 2718.align 32 2719ChaCha20_avx512vl: 2720.cfi_startproc 2721.LChaCha20_avx512vl: 2722 mov %rsp,%r9 # frame pointer 2723.cfi_def_cfa_register %r9 2724 cmp \$128,$len 2725 ja .LChaCha20_8xvl 2726 2727 sub \$64+$xframe,%rsp 2728___ 2729$code.=<<___ if ($win64); 2730 movaps %xmm6,-0xa8(%r9) 2731 movaps %xmm7,-0x98(%r9) 2732 movaps %xmm8,-0x88(%r9) 2733 movaps %xmm9,-0x78(%r9) 2734 movaps %xmm10,-0x68(%r9) 2735 movaps %xmm11,-0x58(%r9) 2736 movaps %xmm12,-0x48(%r9) 2737 movaps %xmm13,-0x38(%r9) 2738 movaps %xmm14,-0x28(%r9) 2739 movaps %xmm15,-0x18(%r9) 2740.Lavx512vl_body: 2741___ 2742$code.=<<___; 2743 vbroadcasti128 .Lsigma(%rip),$a 2744 vbroadcasti128 ($key),$b 2745 vbroadcasti128 16($key),$c 2746 vbroadcasti128 ($counter),$d 2747 2748 vmovdqa32 $a,$a_ 2749 vmovdqa32 $b,$b_ 2750 vmovdqa32 $c,$c_ 2751 vpaddd .Lzeroz(%rip),$d,$d 2752 vmovdqa32 .Ltwoy(%rip),$fourz 2753 mov \$10,$counter # reuse 
$counter 2754 vmovdqa32 $d,$d_ 2755 jmp .Loop_avx512vl 2756 2757.align 16 2758.Loop_outer_avx512vl: 2759 vmovdqa32 $c_,$c 2760 vpaddd $fourz,$d_,$d 2761 mov \$10,$counter 2762 vmovdqa32 $d,$d_ 2763 jmp .Loop_avx512vl 2764 2765.align 32 2766.Loop_avx512vl: 2767___ 2768 &AVX512ROUND(); 2769 &vpshufd ($c,$c,0b01001110); 2770 &vpshufd ($b,$b,0b00111001); 2771 &vpshufd ($d,$d,0b10010011); 2772 2773 &AVX512ROUND(); 2774 &vpshufd ($c,$c,0b01001110); 2775 &vpshufd ($b,$b,0b10010011); 2776 &vpshufd ($d,$d,0b00111001); 2777 2778 &dec ($counter); 2779 &jnz (".Loop_avx512vl"); 2780 2781$code.=<<___; 2782 vpaddd $a_,$a,$a 2783 vpaddd $b_,$b,$b 2784 vpaddd $c_,$c,$c 2785 vpaddd $d_,$d,$d 2786 2787 sub \$64,$len 2788 jb .Ltail64_avx512vl 2789 2790 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2791 vpxor 0x10($inp),%x#$b,$t1 2792 vpxor 0x20($inp),%x#$c,$t2 2793 vpxor 0x30($inp),%x#$d,$t3 2794 lea 0x40($inp),$inp # inp+=64 2795 2796 vmovdqu $t0,0x00($out) # write output 2797 vmovdqu $t1,0x10($out) 2798 vmovdqu $t2,0x20($out) 2799 vmovdqu $t3,0x30($out) 2800 lea 0x40($out),$out # out+=64 2801 2802 jz .Ldone_avx512vl 2803 2804 vextracti128 \$1,$a,$t0 2805 vextracti128 \$1,$b,$t1 2806 vextracti128 \$1,$c,$t2 2807 vextracti128 \$1,$d,$t3 2808 2809 sub \$64,$len 2810 jb .Ltail_avx512vl 2811 2812 vpxor 0x00($inp),$t0,$t0 # xor with input 2813 vpxor 0x10($inp),$t1,$t1 2814 vpxor 0x20($inp),$t2,$t2 2815 vpxor 0x30($inp),$t3,$t3 2816 lea 0x40($inp),$inp # inp+=64 2817 2818 vmovdqu $t0,0x00($out) # write output 2819 vmovdqu $t1,0x10($out) 2820 vmovdqu $t2,0x20($out) 2821 vmovdqu $t3,0x30($out) 2822 lea 0x40($out),$out # out+=64 2823 2824 vmovdqa32 $a_,$a 2825 vmovdqa32 $b_,$b 2826 jnz .Loop_outer_avx512vl 2827 2828 jmp .Ldone_avx512vl 2829 2830.align 16 2831.Ltail64_avx512vl: 2832 vmovdqa %x#$a,0x00(%rsp) 2833 vmovdqa %x#$b,0x10(%rsp) 2834 vmovdqa %x#$c,0x20(%rsp) 2835 vmovdqa %x#$d,0x30(%rsp) 2836 add \$64,$len 2837 jmp .Loop_tail_avx512vl 2838 2839.align 16 2840.Ltail_avx512vl: 2841 vmovdqa 
$t0,0x00(%rsp) 2842 vmovdqa $t1,0x10(%rsp) 2843 vmovdqa $t2,0x20(%rsp) 2844 vmovdqa $t3,0x30(%rsp) 2845 add \$64,$len 2846 2847.Loop_tail_avx512vl: 2848 movzb ($inp,$counter),%eax 2849 movzb (%rsp,$counter),%ecx 2850 lea 1($counter),$counter 2851 xor %ecx,%eax 2852 mov %al,-1($out,$counter) 2853 dec $len 2854 jnz .Loop_tail_avx512vl 2855 2856 vmovdqu32 $a_,0x00(%rsp) 2857 vmovdqu32 $a_,0x20(%rsp) 2858 2859.Ldone_avx512vl: 2860 vzeroall 2861___ 2862$code.=<<___ if ($win64); 2863 movaps -0xa8(%r9),%xmm6 2864 movaps -0x98(%r9),%xmm7 2865 movaps -0x88(%r9),%xmm8 2866 movaps -0x78(%r9),%xmm9 2867 movaps -0x68(%r9),%xmm10 2868 movaps -0x58(%r9),%xmm11 2869 movaps -0x48(%r9),%xmm12 2870 movaps -0x38(%r9),%xmm13 2871 movaps -0x28(%r9),%xmm14 2872 movaps -0x18(%r9),%xmm15 2873___ 2874$code.=<<___; 2875 lea (%r9),%rsp 2876.cfi_def_cfa_register %rsp 2877.Lavx512vl_epilogue: 2878 ret 2879.cfi_endproc 2880.size ChaCha20_avx512vl,.-ChaCha20_avx512vl 2881___ 2882} 2883if ($avx>2) { 2884# This one handles longer inputs... 

# Register assignment for the long-input (16-blocks-per-iteration) AVX512F
# path: the 16 ChaCha state words live in zmm0-zmm15, one word per
# register with 16 independent block lanes each.
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
# zmm16-zmm31 cache the key/input state across iterations; the first four
# double as temporaries during the de-interleave phase.
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];

# Return (as a list of strings, later eval'ed one by one so that the
# four quarter-rounds end up software-interleaved) the instruction
# sequence for four parallel ChaCha quarter-rounds.  Only the first
# column's indices are passed in; the other three columns are derived by
# rotating the low two bits within each 4-aligned group of state words —
# ($_&~3) keeps the row base, (($_+1)&3) steps to the next column — which
# works for both the column rounds (0,4,8,12) and the diagonal rounds
# (0,5,10,15).
sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);

	(
	# a += b; d ^= a; d <<<= 16
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
	"&vprold	(@x[$d0],@x[$d0],16)",
	"&vprold	(@x[$d1],@x[$d1],16)",
	"&vprold	(@x[$d2],@x[$d2],16)",
	"&vprold	(@x[$d3],@x[$d3],16)",

	# c += d; b ^= c; b <<<= 12
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
	"&vprold	(@x[$b0],@x[$b0],12)",
	"&vprold	(@x[$b1],@x[$b1],12)",
	"&vprold	(@x[$b2],@x[$b2],12)",
	"&vprold	(@x[$b3],@x[$b3],12)",

	# a += b; d ^= a; d <<<= 8
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
	"&vprold	(@x[$d0],@x[$d0],8)",
	"&vprold	(@x[$d1],@x[$d1],8)",
	"&vprold	(@x[$d2],@x[$d2],8)",
	"&vprold	(@x[$d3],@x[$d3],8)",

	# c += d; b ^= c; b <<<= 7
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
	"&vprold	(@x[$b0],@x[$b0],7)",
	"&vprold	(@x[$b1],@x[$b1],7)",
	"&vprold	(@x[$b2],@x[$b2],7)",
	"&vprold	(@x[$b3],@x[$b3],7)"
	);
}

# Frame: 0xa8 bytes on Win64 for the xmm6-xmm15 save area, 8 otherwise.
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_16x,\@function,5
.align	32
ChaCha20_16x:
.cfi_startproc
.LChaCha20_16x:
	mov	%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
	and	\$-64,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L16x_body:
___
$code.=<<___;
	vzeroupper

	lea	.Lsigma(%rip),%r10
	vbroadcasti32x4	(%r10),$xa3		# key[0]
	vbroadcasti32x4	($key),$xb3		# key[1]
	vbroadcasti32x4	16($key),$xc3		# key[2]
	vbroadcasti32x4	($counter),$xd3		# key[3]

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
2991 vpshufd \$0x55,$xa3,$xa1 2992 vpshufd \$0xaa,$xa3,$xa2 2993 vpshufd \$0xff,$xa3,$xa3 2994 vmovdqa64 $xa0,@key[0] 2995 vmovdqa64 $xa1,@key[1] 2996 vmovdqa64 $xa2,@key[2] 2997 vmovdqa64 $xa3,@key[3] 2998 2999 vpshufd \$0x00,$xb3,$xb0 3000 vpshufd \$0x55,$xb3,$xb1 3001 vpshufd \$0xaa,$xb3,$xb2 3002 vpshufd \$0xff,$xb3,$xb3 3003 vmovdqa64 $xb0,@key[4] 3004 vmovdqa64 $xb1,@key[5] 3005 vmovdqa64 $xb2,@key[6] 3006 vmovdqa64 $xb3,@key[7] 3007 3008 vpshufd \$0x00,$xc3,$xc0 3009 vpshufd \$0x55,$xc3,$xc1 3010 vpshufd \$0xaa,$xc3,$xc2 3011 vpshufd \$0xff,$xc3,$xc3 3012 vmovdqa64 $xc0,@key[8] 3013 vmovdqa64 $xc1,@key[9] 3014 vmovdqa64 $xc2,@key[10] 3015 vmovdqa64 $xc3,@key[11] 3016 3017 vpshufd \$0x00,$xd3,$xd0 3018 vpshufd \$0x55,$xd3,$xd1 3019 vpshufd \$0xaa,$xd3,$xd2 3020 vpshufd \$0xff,$xd3,$xd3 3021 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet 3022 vmovdqa64 $xd0,@key[12] 3023 vmovdqa64 $xd1,@key[13] 3024 vmovdqa64 $xd2,@key[14] 3025 vmovdqa64 $xd3,@key[15] 3026 3027 mov \$10,%eax 3028 jmp .Loop16x 3029 3030.align 32 3031.Loop_outer16x: 3032 vpbroadcastd 0(%r10),$xa0 # reload key 3033 vpbroadcastd 4(%r10),$xa1 3034 vpbroadcastd 8(%r10),$xa2 3035 vpbroadcastd 12(%r10),$xa3 3036 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters 3037 vmovdqa64 @key[4],$xb0 3038 vmovdqa64 @key[5],$xb1 3039 vmovdqa64 @key[6],$xb2 3040 vmovdqa64 @key[7],$xb3 3041 vmovdqa64 @key[8],$xc0 3042 vmovdqa64 @key[9],$xc1 3043 vmovdqa64 @key[10],$xc2 3044 vmovdqa64 @key[11],$xc3 3045 vmovdqa64 @key[12],$xd0 3046 vmovdqa64 @key[13],$xd1 3047 vmovdqa64 @key[14],$xd2 3048 vmovdqa64 @key[15],$xd3 3049 3050 vmovdqa64 $xa0,@key[0] 3051 vmovdqa64 $xa1,@key[1] 3052 vmovdqa64 $xa2,@key[2] 3053 vmovdqa64 $xa3,@key[3] 3054 3055 mov \$10,%eax 3056 jmp .Loop16x 3057 3058.align 32 3059.Loop16x: 3060___ 3061 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3062 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3063$code.=<<___; 3064 dec %eax 3065 jnz .Loop16x 3066 3067 vpaddd 
@key[0],$xa0,$xa0 # accumulate key 3068 vpaddd @key[1],$xa1,$xa1 3069 vpaddd @key[2],$xa2,$xa2 3070 vpaddd @key[3],$xa3,$xa3 3071 3072 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3073 vpunpckldq $xa3,$xa2,$xt3 3074 vpunpckhdq $xa1,$xa0,$xa0 3075 vpunpckhdq $xa3,$xa2,$xa2 3076 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 3077 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3078 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3079 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3080___ 3081 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3082$code.=<<___; 3083 vpaddd @key[4],$xb0,$xb0 3084 vpaddd @key[5],$xb1,$xb1 3085 vpaddd @key[6],$xb2,$xb2 3086 vpaddd @key[7],$xb3,$xb3 3087 3088 vpunpckldq $xb1,$xb0,$xt2 3089 vpunpckldq $xb3,$xb2,$xt3 3090 vpunpckhdq $xb1,$xb0,$xb0 3091 vpunpckhdq $xb3,$xb2,$xb2 3092 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3093 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3094 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3095 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3096___ 3097 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3098$code.=<<___; 3099 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further 3100 vshufi32x4 \$0xee,$xb0,$xa0,$xb0 3101 vshufi32x4 \$0x44,$xb1,$xa1,$xa0 3102 vshufi32x4 \$0xee,$xb1,$xa1,$xb1 3103 vshufi32x4 \$0x44,$xb2,$xa2,$xa1 3104 vshufi32x4 \$0xee,$xb2,$xa2,$xb2 3105 vshufi32x4 \$0x44,$xb3,$xa3,$xa2 3106 vshufi32x4 \$0xee,$xb3,$xa3,$xb3 3107___ 3108 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3109$code.=<<___; 3110 vpaddd @key[8],$xc0,$xc0 3111 vpaddd @key[9],$xc1,$xc1 3112 vpaddd @key[10],$xc2,$xc2 3113 vpaddd @key[11],$xc3,$xc3 3114 3115 vpunpckldq $xc1,$xc0,$xt2 3116 vpunpckldq $xc3,$xc2,$xt3 3117 vpunpckhdq $xc1,$xc0,$xc0 3118 vpunpckhdq $xc3,$xc2,$xc2 3119 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3120 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3121 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3122 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3123___ 3124 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3125$code.=<<___; 3126 vpaddd @key[12],$xd0,$xd0 3127 vpaddd @key[13],$xd1,$xd1 3128 vpaddd 
@key[14],$xd2,$xd2 3129 vpaddd @key[15],$xd3,$xd3 3130 3131 vpunpckldq $xd1,$xd0,$xt2 3132 vpunpckldq $xd3,$xd2,$xt3 3133 vpunpckhdq $xd1,$xd0,$xd0 3134 vpunpckhdq $xd3,$xd2,$xd2 3135 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3136 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3137 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3138 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3139___ 3140 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3141$code.=<<___; 3142 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further 3143 vshufi32x4 \$0xee,$xd0,$xc0,$xd0 3144 vshufi32x4 \$0x44,$xd1,$xc1,$xc0 3145 vshufi32x4 \$0xee,$xd1,$xc1,$xd1 3146 vshufi32x4 \$0x44,$xd2,$xc2,$xc1 3147 vshufi32x4 \$0xee,$xd2,$xc2,$xd2 3148 vshufi32x4 \$0x44,$xd3,$xc3,$xc2 3149 vshufi32x4 \$0xee,$xd3,$xc3,$xd3 3150___ 3151 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3152$code.=<<___; 3153 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further 3154 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 3155 vshufi32x4 \$0x88,$xd0,$xb0,$xc0 3156 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 3157 vshufi32x4 \$0x88,$xc1,$xa1,$xt1 3158 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 3159 vshufi32x4 \$0x88,$xd1,$xb1,$xc1 3160 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 3161 vshufi32x4 \$0x88,$xc2,$xa2,$xt2 3162 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 3163 vshufi32x4 \$0x88,$xd2,$xb2,$xc2 3164 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 3165 vshufi32x4 \$0x88,$xc3,$xa3,$xt3 3166 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 3167 vshufi32x4 \$0x88,$xd3,$xb3,$xc3 3168 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 3169___ 3170 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= 3171 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); 3172 3173 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, 3174 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = 3175 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3176 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3177$code.=<<___; 3178 cmp \$64*16,$len 3179 jb .Ltail16x 3180 3181 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3182 vpxord 0x40($inp),$xb0,$xb0 3183 vpxord 0x80($inp),$xc0,$xc0 3184 vpxord 0xc0($inp),$xd0,$xd0 3185 vmovdqu32 
$xa0,0x00($out) 3186 vmovdqu32 $xb0,0x40($out) 3187 vmovdqu32 $xc0,0x80($out) 3188 vmovdqu32 $xd0,0xc0($out) 3189 3190 vpxord 0x100($inp),$xa1,$xa1 3191 vpxord 0x140($inp),$xb1,$xb1 3192 vpxord 0x180($inp),$xc1,$xc1 3193 vpxord 0x1c0($inp),$xd1,$xd1 3194 vmovdqu32 $xa1,0x100($out) 3195 vmovdqu32 $xb1,0x140($out) 3196 vmovdqu32 $xc1,0x180($out) 3197 vmovdqu32 $xd1,0x1c0($out) 3198 3199 vpxord 0x200($inp),$xa2,$xa2 3200 vpxord 0x240($inp),$xb2,$xb2 3201 vpxord 0x280($inp),$xc2,$xc2 3202 vpxord 0x2c0($inp),$xd2,$xd2 3203 vmovdqu32 $xa2,0x200($out) 3204 vmovdqu32 $xb2,0x240($out) 3205 vmovdqu32 $xc2,0x280($out) 3206 vmovdqu32 $xd2,0x2c0($out) 3207 3208 vpxord 0x300($inp),$xa3,$xa3 3209 vpxord 0x340($inp),$xb3,$xb3 3210 vpxord 0x380($inp),$xc3,$xc3 3211 vpxord 0x3c0($inp),$xd3,$xd3 3212 lea 0x400($inp),$inp 3213 vmovdqu32 $xa3,0x300($out) 3214 vmovdqu32 $xb3,0x340($out) 3215 vmovdqu32 $xc3,0x380($out) 3216 vmovdqu32 $xd3,0x3c0($out) 3217 lea 0x400($out),$out 3218 3219 sub \$64*16,$len 3220 jnz .Loop_outer16x 3221 3222 jmp .Ldone16x 3223 3224.align 32 3225.Ltail16x: 3226 xor %r10,%r10 3227 sub $inp,$out 3228 cmp \$64*1,$len 3229 jb .Less_than_64_16x 3230 vpxord ($inp),$xa0,$xa0 # xor with input 3231 vmovdqu32 $xa0,($out,$inp) 3232 je .Ldone16x 3233 vmovdqa32 $xb0,$xa0 3234 lea 64($inp),$inp 3235 3236 cmp \$64*2,$len 3237 jb .Less_than_64_16x 3238 vpxord ($inp),$xb0,$xb0 3239 vmovdqu32 $xb0,($out,$inp) 3240 je .Ldone16x 3241 vmovdqa32 $xc0,$xa0 3242 lea 64($inp),$inp 3243 3244 cmp \$64*3,$len 3245 jb .Less_than_64_16x 3246 vpxord ($inp),$xc0,$xc0 3247 vmovdqu32 $xc0,($out,$inp) 3248 je .Ldone16x 3249 vmovdqa32 $xd0,$xa0 3250 lea 64($inp),$inp 3251 3252 cmp \$64*4,$len 3253 jb .Less_than_64_16x 3254 vpxord ($inp),$xd0,$xd0 3255 vmovdqu32 $xd0,($out,$inp) 3256 je .Ldone16x 3257 vmovdqa32 $xa1,$xa0 3258 lea 64($inp),$inp 3259 3260 cmp \$64*5,$len 3261 jb .Less_than_64_16x 3262 vpxord ($inp),$xa1,$xa1 3263 vmovdqu32 $xa1,($out,$inp) 3264 je .Ldone16x 3265 vmovdqa32 $xb1,$xa0 
3266 lea 64($inp),$inp 3267 3268 cmp \$64*6,$len 3269 jb .Less_than_64_16x 3270 vpxord ($inp),$xb1,$xb1 3271 vmovdqu32 $xb1,($out,$inp) 3272 je .Ldone16x 3273 vmovdqa32 $xc1,$xa0 3274 lea 64($inp),$inp 3275 3276 cmp \$64*7,$len 3277 jb .Less_than_64_16x 3278 vpxord ($inp),$xc1,$xc1 3279 vmovdqu32 $xc1,($out,$inp) 3280 je .Ldone16x 3281 vmovdqa32 $xd1,$xa0 3282 lea 64($inp),$inp 3283 3284 cmp \$64*8,$len 3285 jb .Less_than_64_16x 3286 vpxord ($inp),$xd1,$xd1 3287 vmovdqu32 $xd1,($out,$inp) 3288 je .Ldone16x 3289 vmovdqa32 $xa2,$xa0 3290 lea 64($inp),$inp 3291 3292 cmp \$64*9,$len 3293 jb .Less_than_64_16x 3294 vpxord ($inp),$xa2,$xa2 3295 vmovdqu32 $xa2,($out,$inp) 3296 je .Ldone16x 3297 vmovdqa32 $xb2,$xa0 3298 lea 64($inp),$inp 3299 3300 cmp \$64*10,$len 3301 jb .Less_than_64_16x 3302 vpxord ($inp),$xb2,$xb2 3303 vmovdqu32 $xb2,($out,$inp) 3304 je .Ldone16x 3305 vmovdqa32 $xc2,$xa0 3306 lea 64($inp),$inp 3307 3308 cmp \$64*11,$len 3309 jb .Less_than_64_16x 3310 vpxord ($inp),$xc2,$xc2 3311 vmovdqu32 $xc2,($out,$inp) 3312 je .Ldone16x 3313 vmovdqa32 $xd2,$xa0 3314 lea 64($inp),$inp 3315 3316 cmp \$64*12,$len 3317 jb .Less_than_64_16x 3318 vpxord ($inp),$xd2,$xd2 3319 vmovdqu32 $xd2,($out,$inp) 3320 je .Ldone16x 3321 vmovdqa32 $xa3,$xa0 3322 lea 64($inp),$inp 3323 3324 cmp \$64*13,$len 3325 jb .Less_than_64_16x 3326 vpxord ($inp),$xa3,$xa3 3327 vmovdqu32 $xa3,($out,$inp) 3328 je .Ldone16x 3329 vmovdqa32 $xb3,$xa0 3330 lea 64($inp),$inp 3331 3332 cmp \$64*14,$len 3333 jb .Less_than_64_16x 3334 vpxord ($inp),$xb3,$xb3 3335 vmovdqu32 $xb3,($out,$inp) 3336 je .Ldone16x 3337 vmovdqa32 $xc3,$xa0 3338 lea 64($inp),$inp 3339 3340 cmp \$64*15,$len 3341 jb .Less_than_64_16x 3342 vpxord ($inp),$xc3,$xc3 3343 vmovdqu32 $xc3,($out,$inp) 3344 je .Ldone16x 3345 vmovdqa32 $xd3,$xa0 3346 lea 64($inp),$inp 3347 3348.Less_than_64_16x: 3349 vmovdqa32 $xa0,0x00(%rsp) 3350 lea ($out,$inp),$out 3351 and \$63,$len 3352 3353.Loop_tail16x: 3354 movzb ($inp,%r10),%eax 3355 movzb 
(%rsp,%r10),%ecx 3356 lea 1(%r10),%r10 3357 xor %ecx,%eax 3358 mov %al,-1($out,%r10) 3359 dec $len 3360 jnz .Loop_tail16x 3361 3362 vpxord $xa0,$xa0,$xa0 3363 vmovdqa32 $xa0,0(%rsp) 3364 3365.Ldone16x: 3366 vzeroall 3367___ 3368$code.=<<___ if ($win64); 3369 movaps -0xa8(%r9),%xmm6 3370 movaps -0x98(%r9),%xmm7 3371 movaps -0x88(%r9),%xmm8 3372 movaps -0x78(%r9),%xmm9 3373 movaps -0x68(%r9),%xmm10 3374 movaps -0x58(%r9),%xmm11 3375 movaps -0x48(%r9),%xmm12 3376 movaps -0x38(%r9),%xmm13 3377 movaps -0x28(%r9),%xmm14 3378 movaps -0x18(%r9),%xmm15 3379___ 3380$code.=<<___; 3381 lea (%r9),%rsp 3382.cfi_def_cfa_register %rsp 3383.L16x_epilogue: 3384 ret 3385.cfi_endproc 3386.size ChaCha20_16x,.-ChaCha20_16x 3387___ 3388 3389# switch to %ymm domain 3390($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3391 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); 3392@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3393 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3394@key=map("%ymm$_",(16..31)); 3395($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 3396 3397$code.=<<___; 3398.type ChaCha20_8xvl,\@function,5 3399.align 32 3400ChaCha20_8xvl: 3401.cfi_startproc 3402.LChaCha20_8xvl: 3403 mov %rsp,%r9 # frame register 3404.cfi_def_cfa_register %r9 3405 sub \$64+$xframe,%rsp 3406 and \$-64,%rsp 3407___ 3408$code.=<<___ if ($win64); 3409 movaps %xmm6,-0xa8(%r9) 3410 movaps %xmm7,-0x98(%r9) 3411 movaps %xmm8,-0x88(%r9) 3412 movaps %xmm9,-0x78(%r9) 3413 movaps %xmm10,-0x68(%r9) 3414 movaps %xmm11,-0x58(%r9) 3415 movaps %xmm12,-0x48(%r9) 3416 movaps %xmm13,-0x38(%r9) 3417 movaps %xmm14,-0x28(%r9) 3418 movaps %xmm15,-0x18(%r9) 3419.L8xvl_body: 3420___ 3421$code.=<<___; 3422 vzeroupper 3423 3424 lea .Lsigma(%rip),%r10 3425 vbroadcasti128 (%r10),$xa3 # key[0] 3426 vbroadcasti128 ($key),$xb3 # key[1] 3427 vbroadcasti128 16($key),$xc3 # key[2] 3428 vbroadcasti128 ($counter),$xd3 # key[3] 3429 3430 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
3431 vpshufd \$0x55,$xa3,$xa1 3432 vpshufd \$0xaa,$xa3,$xa2 3433 vpshufd \$0xff,$xa3,$xa3 3434 vmovdqa64 $xa0,@key[0] 3435 vmovdqa64 $xa1,@key[1] 3436 vmovdqa64 $xa2,@key[2] 3437 vmovdqa64 $xa3,@key[3] 3438 3439 vpshufd \$0x00,$xb3,$xb0 3440 vpshufd \$0x55,$xb3,$xb1 3441 vpshufd \$0xaa,$xb3,$xb2 3442 vpshufd \$0xff,$xb3,$xb3 3443 vmovdqa64 $xb0,@key[4] 3444 vmovdqa64 $xb1,@key[5] 3445 vmovdqa64 $xb2,@key[6] 3446 vmovdqa64 $xb3,@key[7] 3447 3448 vpshufd \$0x00,$xc3,$xc0 3449 vpshufd \$0x55,$xc3,$xc1 3450 vpshufd \$0xaa,$xc3,$xc2 3451 vpshufd \$0xff,$xc3,$xc3 3452 vmovdqa64 $xc0,@key[8] 3453 vmovdqa64 $xc1,@key[9] 3454 vmovdqa64 $xc2,@key[10] 3455 vmovdqa64 $xc3,@key[11] 3456 3457 vpshufd \$0x00,$xd3,$xd0 3458 vpshufd \$0x55,$xd3,$xd1 3459 vpshufd \$0xaa,$xd3,$xd2 3460 vpshufd \$0xff,$xd3,$xd3 3461 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 3462 vmovdqa64 $xd0,@key[12] 3463 vmovdqa64 $xd1,@key[13] 3464 vmovdqa64 $xd2,@key[14] 3465 vmovdqa64 $xd3,@key[15] 3466 3467 mov \$10,%eax 3468 jmp .Loop8xvl 3469 3470.align 32 3471.Loop_outer8xvl: 3472 #vpbroadcastd 0(%r10),$xa0 # reload key 3473 #vpbroadcastd 4(%r10),$xa1 3474 vpbroadcastd 8(%r10),$xa2 3475 vpbroadcastd 12(%r10),$xa3 3476 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters 3477 vmovdqa64 @key[4],$xb0 3478 vmovdqa64 @key[5],$xb1 3479 vmovdqa64 @key[6],$xb2 3480 vmovdqa64 @key[7],$xb3 3481 vmovdqa64 @key[8],$xc0 3482 vmovdqa64 @key[9],$xc1 3483 vmovdqa64 @key[10],$xc2 3484 vmovdqa64 @key[11],$xc3 3485 vmovdqa64 @key[12],$xd0 3486 vmovdqa64 @key[13],$xd1 3487 vmovdqa64 @key[14],$xd2 3488 vmovdqa64 @key[15],$xd3 3489 3490 vmovdqa64 $xa0,@key[0] 3491 vmovdqa64 $xa1,@key[1] 3492 vmovdqa64 $xa2,@key[2] 3493 vmovdqa64 $xa3,@key[3] 3494 3495 mov \$10,%eax 3496 jmp .Loop8xvl 3497 3498.align 32 3499.Loop8xvl: 3500___ 3501 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3502 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3503$code.=<<___; 3504 dec %eax 3505 jnz .Loop8xvl 3506 3507 vpaddd 
@key[0],$xa0,$xa0 # accumulate key 3508 vpaddd @key[1],$xa1,$xa1 3509 vpaddd @key[2],$xa2,$xa2 3510 vpaddd @key[3],$xa3,$xa3 3511 3512 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3513 vpunpckldq $xa3,$xa2,$xt3 3514 vpunpckhdq $xa1,$xa0,$xa0 3515 vpunpckhdq $xa3,$xa2,$xa2 3516 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 3517 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3518 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3519 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3520___ 3521 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3522$code.=<<___; 3523 vpaddd @key[4],$xb0,$xb0 3524 vpaddd @key[5],$xb1,$xb1 3525 vpaddd @key[6],$xb2,$xb2 3526 vpaddd @key[7],$xb3,$xb3 3527 3528 vpunpckldq $xb1,$xb0,$xt2 3529 vpunpckldq $xb3,$xb2,$xt3 3530 vpunpckhdq $xb1,$xb0,$xb0 3531 vpunpckhdq $xb3,$xb2,$xb2 3532 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3533 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3534 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3535 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3536___ 3537 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3538$code.=<<___; 3539 vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further 3540 vshufi32x4 \$3,$xb0,$xa0,$xb0 3541 vshufi32x4 \$0,$xb1,$xa1,$xa0 3542 vshufi32x4 \$3,$xb1,$xa1,$xb1 3543 vshufi32x4 \$0,$xb2,$xa2,$xa1 3544 vshufi32x4 \$3,$xb2,$xa2,$xb2 3545 vshufi32x4 \$0,$xb3,$xa3,$xa2 3546 vshufi32x4 \$3,$xb3,$xa3,$xb3 3547___ 3548 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3549$code.=<<___; 3550 vpaddd @key[8],$xc0,$xc0 3551 vpaddd @key[9],$xc1,$xc1 3552 vpaddd @key[10],$xc2,$xc2 3553 vpaddd @key[11],$xc3,$xc3 3554 3555 vpunpckldq $xc1,$xc0,$xt2 3556 vpunpckldq $xc3,$xc2,$xt3 3557 vpunpckhdq $xc1,$xc0,$xc0 3558 vpunpckhdq $xc3,$xc2,$xc2 3559 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3560 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3561 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3562 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3563___ 3564 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3565$code.=<<___; 3566 vpaddd @key[12],$xd0,$xd0 3567 vpaddd @key[13],$xd1,$xd1 3568 vpaddd @key[14],$xd2,$xd2 3569 vpaddd 
@key[15],$xd3,$xd3 3570 3571 vpunpckldq $xd1,$xd0,$xt2 3572 vpunpckldq $xd3,$xd2,$xt3 3573 vpunpckhdq $xd1,$xd0,$xd0 3574 vpunpckhdq $xd3,$xd2,$xd2 3575 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3576 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3577 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3578 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3579___ 3580 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3581$code.=<<___; 3582 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 3583 vperm2i128 \$0x31,$xd0,$xc0,$xd0 3584 vperm2i128 \$0x20,$xd1,$xc1,$xc0 3585 vperm2i128 \$0x31,$xd1,$xc1,$xd1 3586 vperm2i128 \$0x20,$xd2,$xc2,$xc1 3587 vperm2i128 \$0x31,$xd2,$xc2,$xd2 3588 vperm2i128 \$0x20,$xd3,$xc3,$xc2 3589 vperm2i128 \$0x31,$xd3,$xc3,$xd3 3590___ 3591 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3592 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 3593 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 3594$code.=<<___; 3595 cmp \$64*8,$len 3596 jb .Ltail8xvl 3597 3598 mov \$0x80,%eax # size optimization 3599 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3600 vpxor 0x20($inp),$xb0,$xb0 3601 vpxor 0x40($inp),$xc0,$xc0 3602 vpxor 0x60($inp),$xd0,$xd0 3603 lea ($inp,%rax),$inp # size optimization 3604 vmovdqu32 $xa0,0x00($out) 3605 vmovdqu $xb0,0x20($out) 3606 vmovdqu $xc0,0x40($out) 3607 vmovdqu $xd0,0x60($out) 3608 lea ($out,%rax),$out # size optimization 3609 3610 vpxor 0x00($inp),$xa1,$xa1 3611 vpxor 0x20($inp),$xb1,$xb1 3612 vpxor 0x40($inp),$xc1,$xc1 3613 vpxor 0x60($inp),$xd1,$xd1 3614 lea ($inp,%rax),$inp # size optimization 3615 vmovdqu $xa1,0x00($out) 3616 vmovdqu $xb1,0x20($out) 3617 vmovdqu $xc1,0x40($out) 3618 vmovdqu $xd1,0x60($out) 3619 lea ($out,%rax),$out # size optimization 3620 3621 vpxord 0x00($inp),$xa2,$xa2 3622 vpxor 0x20($inp),$xb2,$xb2 3623 vpxor 0x40($inp),$xc2,$xc2 3624 vpxor 0x60($inp),$xd2,$xd2 3625 lea ($inp,%rax),$inp # size optimization 3626 vmovdqu32 $xa2,0x00($out) 3627 vmovdqu $xb2,0x20($out) 3628 vmovdqu $xc2,0x40($out) 3629 vmovdqu $xd2,0x60($out) 3630 lea 
($out,%rax),$out # size optimization 3631 3632 vpxor 0x00($inp),$xa3,$xa3 3633 vpxor 0x20($inp),$xb3,$xb3 3634 vpxor 0x40($inp),$xc3,$xc3 3635 vpxor 0x60($inp),$xd3,$xd3 3636 lea ($inp,%rax),$inp # size optimization 3637 vmovdqu $xa3,0x00($out) 3638 vmovdqu $xb3,0x20($out) 3639 vmovdqu $xc3,0x40($out) 3640 vmovdqu $xd3,0x60($out) 3641 lea ($out,%rax),$out # size optimization 3642 3643 vpbroadcastd 0(%r10),%ymm0 # reload key 3644 vpbroadcastd 4(%r10),%ymm1 3645 3646 sub \$64*8,$len 3647 jnz .Loop_outer8xvl 3648 3649 jmp .Ldone8xvl 3650 3651.align 32 3652.Ltail8xvl: 3653 vmovdqa64 $xa0,%ymm8 # size optimization 3654___ 3655$xa0 = "%ymm8"; 3656$code.=<<___; 3657 xor %r10,%r10 3658 sub $inp,$out 3659 cmp \$64*1,$len 3660 jb .Less_than_64_8xvl 3661 vpxor 0x00($inp),$xa0,$xa0 # xor with input 3662 vpxor 0x20($inp),$xb0,$xb0 3663 vmovdqu $xa0,0x00($out,$inp) 3664 vmovdqu $xb0,0x20($out,$inp) 3665 je .Ldone8xvl 3666 vmovdqa $xc0,$xa0 3667 vmovdqa $xd0,$xb0 3668 lea 64($inp),$inp 3669 3670 cmp \$64*2,$len 3671 jb .Less_than_64_8xvl 3672 vpxor 0x00($inp),$xc0,$xc0 3673 vpxor 0x20($inp),$xd0,$xd0 3674 vmovdqu $xc0,0x00($out,$inp) 3675 vmovdqu $xd0,0x20($out,$inp) 3676 je .Ldone8xvl 3677 vmovdqa $xa1,$xa0 3678 vmovdqa $xb1,$xb0 3679 lea 64($inp),$inp 3680 3681 cmp \$64*3,$len 3682 jb .Less_than_64_8xvl 3683 vpxor 0x00($inp),$xa1,$xa1 3684 vpxor 0x20($inp),$xb1,$xb1 3685 vmovdqu $xa1,0x00($out,$inp) 3686 vmovdqu $xb1,0x20($out,$inp) 3687 je .Ldone8xvl 3688 vmovdqa $xc1,$xa0 3689 vmovdqa $xd1,$xb0 3690 lea 64($inp),$inp 3691 3692 cmp \$64*4,$len 3693 jb .Less_than_64_8xvl 3694 vpxor 0x00($inp),$xc1,$xc1 3695 vpxor 0x20($inp),$xd1,$xd1 3696 vmovdqu $xc1,0x00($out,$inp) 3697 vmovdqu $xd1,0x20($out,$inp) 3698 je .Ldone8xvl 3699 vmovdqa32 $xa2,$xa0 3700 vmovdqa $xb2,$xb0 3701 lea 64($inp),$inp 3702 3703 cmp \$64*5,$len 3704 jb .Less_than_64_8xvl 3705 vpxord 0x00($inp),$xa2,$xa2 3706 vpxor 0x20($inp),$xb2,$xb2 3707 vmovdqu32 $xa2,0x00($out,$inp) 3708 vmovdqu $xb2,0x20($out,$inp) 3709 
	je	.Ldone8xvl
	vmovdqa	$xc2,$xa0
	vmovdqa	$xd2,$xb0
	lea	64($inp),$inp

	cmp	\$64*6,$len
	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xc2,$xc2
	vpxor	0x20($inp),$xd2,$xd2
	vmovdqu	$xc2,0x00($out,$inp)
	vmovdqu	$xd2,0x20($out,$inp)
	je	.Ldone8xvl
	vmovdqa	$xa3,$xa0
	vmovdqa	$xb3,$xb0
	lea	64($inp),$inp

	cmp	\$64*7,$len
	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xa3,$xa3
	vpxor	0x20($inp),$xb3,$xb3
	vmovdqu	$xa3,0x00($out,$inp)
	vmovdqu	$xb3,0x20($out,$inp)
	je	.Ldone8xvl
	vmovdqa	$xc3,$xa0
	vmovdqa	$xd3,$xb0
	lea	64($inp),$inp

.Less_than_64_8xvl:
	vmovdqa	$xa0,0x00(%rsp)		# stash last 64 bytes of key stream
	vmovdqa	$xb0,0x20(%rsp)
	lea	($out,$inp),$out
	and	\$63,$len		# remaining byte count, 0..63

.Loop_tail8xvl:				# byte-wise xor for the final partial block
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail8xvl

	vpxor	$xa0,$xa0,$xa0		# wipe key-stream material off the stack
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xa0,0x20(%rsp)

.Ldone8xvl:
	vzeroall			# also clears key material left in %ymm regs
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6	# restore non-volatile XMM regs (Win64 ABI)
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp		# %r9 holds the frame pointer saved in prologue
.cfi_def_cfa_register	%rsp
.L8xvl_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_8xvl,.-ChaCha20_8xvl
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";	# Win64 calling convention: 1st..4th integer arguments
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
# se_handler: SEH unwind handler for ChaCha20_ctr32, whose frame holds saved
# general-purpose registers only (no XMM registers).
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lctr32_body, i.e. still in prologue?
	jb	.Lcommon_seh_tail	# nothing saved yet

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lno_data(%rip),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lno_data (past epilogue)?
	jae	.Lcommon_seh_tail

	lea	64+24+48(%rax),%rax	# step over the local frame to the saved GPRs

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quad-words
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

# simd_handler: SEH unwind handler shared by all SIMD code paths.  Its
# HandlerData[] (see .xdata below) supplies: [0] prologue label rva,
# [1] epilogue label rva, [2] byte count of XMM registers saved below -8(%r9).
.type	simd_handler,\@abi-omnipotent
.align	16
simd_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%ecx		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	neg	%rcx
	lea	-8(%rax,%rcx),%rsi	# bottom of the saved-XMM area
	lea	512($context),%rdi	# &context.Xmm6
	neg	%ecx
	shr	\$3,%ecx		# byte count -> quad-word count
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	simd_handler,.-simd_handler

.section	.pdata			# function table: begin/end/unwind-info rvas
.align	4
	.rva	.LSEH_begin_ChaCha20_ctr32
	.rva	.LSEH_end_ChaCha20_ctr32
	.rva	.LSEH_info_ChaCha20_ctr32

	.rva	.LSEH_begin_ChaCha20_ssse3
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_128
	.rva	.LSEH_end_ChaCha20_128
	.rva	.LSEH_info_ChaCha20_128

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_ChaCha20_4xop
	.rva	.LSEH_end_ChaCha20_4xop
	.rva	.LSEH_info_ChaCha20_4xop
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_ChaCha20_8x
	.rva	.LSEH_end_ChaCha20_8x
	.rva	.LSEH_info_ChaCha20_8x
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_ChaCha20_avx512
	.rva	.LSEH_end_ChaCha20_avx512
	.rva	.LSEH_info_ChaCha20_avx512

	.rva	.LSEH_begin_ChaCha20_avx512vl
	.rva	.LSEH_end_ChaCha20_avx512vl
	.rva	.LSEH_info_ChaCha20_avx512vl

	.rva	.LSEH_begin_ChaCha20_16x
	.rva	.LSEH_end_ChaCha20_16x
	.rva	.LSEH_info_ChaCha20_16x

	.rva	.LSEH_begin_ChaCha20_8xvl
	.rva	.LSEH_end_ChaCha20_8xvl
	.rva	.LSEH_info_ChaCha20_8xvl
___
$code.=<<___;
.section	.xdata			# unwind info referenced from .pdata above
.align	8
.LSEH_info_ChaCha20_ctr32:
	.byte	9,0,0,0			# UNW_FLAG_EHANDLER, handler follows
	.rva	se_handler

.LSEH_info_ChaCha20_ssse3:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.Lssse3_body,.Lssse3_epilogue
	.long	0x20,0			# 0x20 bytes of saved XMM registers

.LSEH_info_ChaCha20_128:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L128_body,.L128_epilogue
	.long	0x60,0

.LSEH_info_ChaCha20_4x:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L4x_body,.L4x_epilogue
	.long	0xa0,0
___
$code.=<<___ if ($avx);
.LSEH_info_ChaCha20_4xop:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L4xop_body,.L4xop_epilogue	# HandlerData[]
	.long	0xa0,0
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L8x_body,.L8x_epilogue		# HandlerData[]
	.long	0xa0,0
___
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.Lavx512_body,.Lavx512_epilogue	# HandlerData[]
	.long	0x20,0

.LSEH_info_ChaCha20_avx512vl:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.Lavx512vl_body,.Lavx512vl_epilogue	# HandlerData[]
	.long	0x20,0

.LSEH_info_ChaCha20_16x:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L16x_body,.L16x_epilogue	# HandlerData[]
	.long	0xa0,0

.LSEH_info_ChaCha20_8xvl:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L8xvl_body,.L8xvl_epilogue
# HandlerData[]
	.long	0xa0,0
___
}

# Post-process the accumulated code and emit it: expand `...` (backtick)
# expressions via eval, and "down-shift" register names tagged "%x#%y"/"%x#%z"
# to their %xmm form, printing one line at a time.
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	$line =~ s/%x#%[yz]/%x/g;	# "down-shift"

	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!";