#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#		1xIALU/gcc	4xSSSE3
# Pentium	17.5/+80%
# PIII		14.2/+60%
# P4		18.6/+84%
# Core2		9.56/+89%	4.83
# Westmere	9.50/+45%	3.35
# Sandy Bridge	10.5/+47%	3.20
# Haswell	8.15/+50%	2.83
# Skylake	7.53/+22%	2.75
# Silvermont	17.4/+36%	8.35
# Goldmont	13.4/+40%	4.36
# Sledgehammer	10.2/+54%
# Bulldozer	13.4/+50%	4.38(*)
#
# (*)	Bulldozer actually executes 4xXOP code path that delivers 3.55;

# Locate the perlasm helpers relative to this script and load the x86
# assembler back-end.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; everything the
# perlasm helpers print goes there.  Fail loudly if it cannot be opened
# instead of silently generating nothing.  (The 2-argument form is kept
# so the file name is interpreted exactly as before.)
$output=pop;
open STDOUT,">$output" or die "can't open $output: $!";

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# $xmm enables the SSSE3 code path, $ymm additionally enables the XOP code
# path.  $xmm follows -DOPENSSL_IA32_SSE2 on the command line; $ymm is set
# when one of the probed assemblers reports a version that supports AVX.
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
		($gasver=$1)>=2.19);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
		$1>=2.03);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
		`ml 2>&1` =~ /Version ([0-9]+)\./ &&
		$1>=10);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm &&
		`$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ &&
		$2>=3.0);	# first version supporting AVX

# Register assignment for the scalar (integer-only) code path.  "a" has a
# single register; "b", "c" and "d" each have a primary and an alternate
# ("_") register which QUARTERROUND swaps after every call.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

# QUARTERROUND($ai,$bi,$ci,$di,$i)
#
# Emits one scalar ChaCha quarter-round over state words $ai,$bi,$ci,$di,
# which index the 16-dword working state kept at 4*k("esp").  $i (0..7) is
# the position of this call within the eight-call sequence issued per loop
# iteration: calls 0..3 are the "even round" rows of the table below, calls
# 4..7 the "odd round" rows.
#
# Besides the arithmetic, the routine interleaves stores of the values
# left over from the previous call ($ap,$bp,$cp,$dp indices) and loads of
# the values needed by the next call ($an,$bn,$cn,$dn indices) into the
# alternate registers.  The initial "add a,b" of each quarter-round is not
# emitted here: it is issued by the previous call (or by the caller before
# the first call), hence the "elsewhere" remarks.
#
# Side effect: the global ($b,$b_), ($c,$c_) and ($d,$d_) pairs are swapped
# on return, so the registers preloaded here become the operands of the
# next call.  After eight calls the pairs are back in their initial order.
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
# Neighbouring indices within the same group of four (wrap-around).
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the seams between the two halves (calls 0, 3, 4 and 7) the
	# previous/next indices are shifted by a per-word amount ($j steps
	# once for each of a, b, c, d) rather than by the plain +/-1 above.
	if ($i==0) {
	    my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
	    my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
	    my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
	    my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&add	($a,$b);			# see elsewhere
	&xor	($d,$a);
	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
	&rol	($d,16);
	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
	&add	($c,$d);
	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
	&xor	($b,$c);
	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
	&rol	($b,12);
	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
	&add	($a,$b);
	&xor	($d,$a);
	&mov	(&DWP(4*$ai,"esp"),$a);
	&rol	($d,8);
	&mov	($a,&DWP(4*$an,"esp"));
	&add	($c,$d);
	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
	&mov	($d_,$d)			if ($di==$dn);
	&xor	($b,$c);
	 &add	($a,$b_)			if ($i<7);	# elsewhere
	&rol	($b,7);

	# Swap primary and alternate registers for the next call.
	($b,$b_)=($b_,$b);
	($c,$c_)=($c_,$c);
	($d,$d_)=($d_,$d);
}

&static_label("ssse3_shortcut");
&static_label("xop_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

########################################################################
# ChaCha20_ctr32: integer-only code path and run-time dispatcher.
#
# Arguments as used below: wparam(0) output pointer, wparam(1) input
# pointer, wparam(2) length in bytes, wparam(3) key, wparam(4) counter
# and nonce.
#
# Frame reserved by stack_push(33), as addressed by the code:
#	4*k("esp"),    k=0..15	working state
#	64+4*k("esp"), k=4..15	copy of key, counter and nonce
#	128("esp")		round loop counter
&function_begin("ChaCha20_ctr32");
	&xor	("eax","eax");
	&cmp	("eax",&wparam(2));		# len==0?
	&je	(&label("no_data"));
if ($xmm) {
	# Fetch the address of pic_point into eax and use it to reach
	# OPENSSL_ia32cap_P; hand over to the SSSE3 code when both the
	# FXSR and SSSE3 capability bits are set.
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
	&jz	(&label("x86"));
	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
	&jz	(&label("x86"));
	&jmp	(&label("ssse3_shortcut"));
&set_label("x86");
}
	&mov	("esi",&wparam(3));		# key
	&mov	("edi",&wparam(4));		# counter and nonce

	&stack_push(33);

	&mov	("eax",&DWP(4*0,"esi"));	# copy key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&mov	(&DWP(64+4*4,"esp"),"eax");
	&mov	(&DWP(64+4*5,"esp"),"ebx");
	&mov	(&DWP(64+4*6,"esp"),"ecx");
	&mov	(&DWP(64+4*7,"esp"),"edx");
	&mov	("eax",&DWP(4*4,"esi"));
	&mov	("ebx",&DWP(4*5,"esi"));
	&mov	("ecx",&DWP(4*6,"esi"));
	&mov	("edx",&DWP(4*7,"esi"));
	&mov	(&DWP(64+4*8,"esp"),"eax");
	&mov	(&DWP(64+4*9,"esp"),"ebx");
	&mov	(&DWP(64+4*10,"esp"),"ecx");
	&mov	(&DWP(64+4*11,"esp"),"edx");
	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("edx",&DWP(4*3,"edi"));
	# The counter is stored decremented because "entry" increments it
	# before every block, including the first one.
	&sub	("eax",1);
	&mov	(&DWP(64+4*12,"esp"),"eax");
	&mov	(&DWP(64+4*13,"esp"),"ebx");
	&mov	(&DWP(64+4*14,"esp"),"ecx");
	&mov	(&DWP(64+4*15,"esp"),"edx");
	&jmp	(&label("entry"));

&set_label("outer_loop",16);
	&mov	(&wparam(1),$b);		# save input
	&mov	(&wparam(0),$a);		# save output
	&mov	(&wparam(2),$c);		# save len
&set_label("entry");
	# Initialise the working state for the next 64-byte block: sigma
	# constants plus key, counter and nonce from the copy at 64+...
	&mov	($a,0x61707865);
	&mov	(&DWP(4*1,"esp"),0x3320646e);
	&mov	(&DWP(4*2,"esp"),0x79622d32);
	&mov	(&DWP(4*3,"esp"),0x6b206574);

	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
	&mov	($b_,&DWP(64+4*6,"esp"));
	&mov	($c, &DWP(64+4*10,"esp"));
	&mov	($c_,&DWP(64+4*11,"esp"));
	&mov	($d, &DWP(64+4*13,"esp"));
	&mov	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*5,"esp"),$b);
	&mov	(&DWP(4*6,"esp"),$b_);
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	(&DWP(4*11,"esp"),$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b, &DWP(64+4*7,"esp"));
	&mov	($d_,&DWP(64+4*15,"esp"));
	&mov	($d, &DWP(64+4*12,"esp"));
	&mov	($b_,&DWP(64+4*4,"esp"));
	&mov	($c, &DWP(64+4*8,"esp"));
	&mov	($c_,&DWP(64+4*9,"esp"));
	&add	($d,1);				# counter value
	&mov	(&DWP(4*7,"esp"),$b);
	&mov	(&DWP(4*15,"esp"),$d_);
	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value

	&mov	($b,10);			# loop counter
	&jmp	(&label("loop"));

&set_label("loop",16);
	# One iteration = eight quarter-rounds; $b doubles as the loop
	# counter, so it is parked at 128("esp") while the rounds run and
	# reloaded by the last QUARTERROUND call.
	&add	($a,$b_);			# elsewhere
	&mov	(&DWP(128,"esp"),$b);		# save loop counter
	&mov	($b,$b_);
	&QUARTERROUND(0, 4, 8, 12, 0);
	&QUARTERROUND(1, 5, 9, 13, 1);
	&QUARTERROUND(2, 6,10, 14, 2);
	&QUARTERROUND(3, 7,11, 15, 3);
	&QUARTERROUND(0, 5,10, 15, 4);
	&QUARTERROUND(1, 6,11, 12, 5);
	&QUARTERROUND(2, 7, 8, 13, 6);
	&QUARTERROUND(3, 4, 9, 14, 7);
	&dec	($b);
	&jnz	(&label("loop"));

	&mov	($b,&wparam(2));		# load len

	&add	($a,0x61707865);		# accumulate key material
	&add	($b_,&DWP(64+4*4,"esp"));
	&add	($c, &DWP(64+4*8,"esp"));
	&add	($c_,&DWP(64+4*9,"esp"));

	&cmp	($b,64);
	&jb	(&label("tail"));

	# Full 64-byte block: add the input state, xor with the input and
	# write to the output, five words at a time.
	&mov	($b,&wparam(1));		# load input pointer
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));

	&xor	($a, &DWP(4*0,$b));		# xor with input
	&xor	($b_,&DWP(4*4,$b));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	($a,&wparam(0));		# load output pointer
	&xor	($c, &DWP(4*8,$b));
	&xor	($c_,&DWP(4*9,$b));
	&xor	($d, &DWP(4*12,$b));
	&xor	($d_,&DWP(4*14,$b));
	&mov	(&DWP(4*4,$a),$b_);		# write output
	&mov	(&DWP(4*8,$a),$c);
	&mov	(&DWP(4*9,$a),$c_);
	&mov	(&DWP(4*12,$a),$d);
	&mov	(&DWP(4*14,$a),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&xor	($b_,&DWP(4*1,$b));
	&xor	($c, &DWP(4*2,$b));
	&xor	($c_,&DWP(4*3,$b));
	&xor	($d, &DWP(4*5,$b));
	&xor	($d_,&DWP(4*6,$b));
	&mov	(&DWP(4*1,$a),$b_);
	&mov	(&DWP(4*2,$a),$c);
	&mov	(&DWP(4*3,$a),$c_);
	&mov	(&DWP(4*5,$a),$d);
	&mov	(&DWP(4*6,$a),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&xor	($b_,&DWP(4*7,$b));
	&xor	($c, &DWP(4*10,$b));
	&xor	($c_,&DWP(4*11,$b));
	&xor	($d, &DWP(4*13,$b));
	&xor	($d_,&DWP(4*15,$b));
	&lea	($b,&DWP(4*16,$b));
	&mov	(&DWP(4*7,$a),$b_);
	&mov	($b_,&DWP(4*0,"esp"));
	&mov	(&DWP(4*10,$a),$c);
	&mov	($c,&wparam(2));		# len
	&mov	(&DWP(4*11,$a),$c_);
	&mov	(&DWP(4*13,$a),$d);
	&mov	(&DWP(4*15,$a),$d_);
	&mov	(&DWP(4*0,$a),$b_);
	&lea	($a,&DWP(4*16,$a));
	&sub	($c,64);
	&jnz	(&label("outer_loop"));

	&jmp	(&label("done"));

&set_label("tail");
	# Fewer than 64 bytes left: materialise the whole key-stream block
	# at 0("esp") and xor it with the input byte by byte.
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	(&DWP(4*4,"esp"),$b_);
	&mov	(&DWP(4*8,"esp"),$c);
	&mov	(&DWP(4*9,"esp"),$c_);
	&mov	(&DWP(4*12,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&mov	(&DWP(4*1,"esp"),$b_);
	&mov	(&DWP(4*2,"esp"),$c);
	&mov	(&DWP(4*3,"esp"),$c_);
	&mov	(&DWP(4*5,"esp"),$d);
	&mov	(&DWP(4*6,"esp"),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&mov	(&DWP(4*7,"esp"),$b_);
	&mov	($b_,&wparam(1));		# load input
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	($c,&wparam(0));		# load output
	&mov	(&DWP(4*11,"esp"),$c_);
	&xor	($c_,$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*15,"esp"),$d_);

	&xor	("eax","eax");
	&xor	("edx","edx");
&set_label("tail_loop");
	# $c_ is the byte index, $b the remaining length.
	&movb	("al",&BP(0,$c_,$b_));
	&movb	("dl",&BP(0,"esp",$c_));
	&lea	($c_,&DWP(1,$c_));
	&xor	("al","dl");
	&mov	(&BP(-1,$c,$c_),"al");
	&dec	($b);
	&jnz	(&label("tail_loop"));

&set_label("done");
	&stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

if ($xmm) {
########################################################################
# SSSE3 code path.
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# QUARTERROUND_SSSE3($ai,$bi,$ci,$di,$i)
#
# Vector counterpart of QUARTERROUND: same index bookkeeping, but every
# state "word" is a 16-byte vector kept at 16*k-128("ebx").  Rotations by
# 16 and 8 use pshufb with the masks at 0("eax") and 16("eax"); rotations
# by 12 and 7 use a shift pair with a borrowed temporary.  The leading
# paddd/pxor of each quarter-round are issued by the previous call or by
# the caller ("elsewhere").  On return every primary/alternate register
# pair, including ($xa,$xa_), is swapped.
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	if ($i==0) {
	    my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
	    my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
	    my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
	    my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&paddd	($xa,$xb);			# see elsewhere
	#&pxor	($xd,$xa);			# see elsewhere
	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&pshufb	($xd,&QWP(0,"eax"));		# rot16
	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&paddd	($xc,$xd);
	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&pxor	($xb,$xc);
	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&movdqa	($xa_,$xb);			# borrow as temporary
	&pslld	($xb,12);
	&psrld	($xa_,20);
	&por	($xb,$xa_);
	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
	&paddd	($xa,$xb);
	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&pxor	($xd,$xa);
	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&pshufb	($xd,&QWP(16,"eax"));		# rot8
	&paddd	($xc,$xd);
	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	&movdqa	($xd_,$xd)			if ($di==$dn);
	&pxor	($xb,$xc);
	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
	&movdqa	($xa,$xb);			# borrow as temporary
	&pslld	($xb,7);
	&psrld	($xa,25);
	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
	&por	($xb,$xa);

	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}

# ChaCha20_ssse3 is entered through ssse3_shortcut from ChaCha20_ctr32,
# which leaves the address of pic_point in eax and the address of
# OPENSSL_ia32cap_P in ebp.
#
# Frame, as addressed by the code (esp is aligned to 64 bytes):
#	16*k("esp")		1x path: input block / tail key stream
#	16*k-128("ebx")		4x path: working state   (ebx=esp+128)
#	16*k-128("ebp")		4x path: key material    (ebp=esp+256+128)
#	512("esp")		caller's esp
#	512+4, 512+8("esp")	offloaded key and counter pointers
&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
if ($ymm) {
	&test		(&DWP(4,"ebp"),1<<11);		# test XOP bit
	&jnz		(&label("xop_shortcut"));
}

	&mov		($out,&wparam(0));
	&mov		($inp,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		("edx",&wparam(3));		# key
	&mov		("ebx",&wparam(4));		# counter and nonce

	&mov		("ebp","esp");
	&stack_push	(131);
	&and		("esp",-64);
	&mov		(&DWP(512,"esp"),"ebp");

	# eax: pic_point -> ssse3_data
	&lea		("eax",&DWP(&label("ssse3_data")."-".
				    &label("pic_point"),"eax"));
	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce

if (defined($gasver) && $gasver>=2.17) {		# even though we encode
							# pshufb manually, we
							# handle only register
							# operands, while this
							# segment uses memory
							# operand...
	# 4x path: four blocks (64*4 bytes) per outer iteration, one block
	# per 32-bit lane.
	&cmp		($len,64*4);
	&jb		(&label("1x"));

	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov		(&DWP(512+8,"esp"),"ebx");
	&sub		($len,64*4);			# bias len
	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization

	# Broadcast each key/counter/nonce/sigma word across a vector.
	&movdqu		("xmm7",&QWP(0,"edx"));		# key
	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	 &psubd		("xmm0",&QWP(16*4,"eax"));
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	 &lea		("ebx",&DWP(128,"esp"));	# size optimization

	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");

	&lea		($inp,&DWP(128,$inp));		# size optimization
	&lea		($out,&DWP(128,$out));		# size optimization
	&jmp		(&label("outer_loop"));

&set_label("outer_loop",16);
	# Copy key material into the working state; the commented-out
	# moves are for words that are loaded straight into registers
	# further down.
	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&movdqa		($xa, &QWP(16*0-128,"ebp"));
	&movdqa		($xd, "xmm4");
	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
	&movdqa		($xc, &QWP(16*8-128,"ebp"));
	&movdqa		($xc_,&QWP(16*9-128,"ebp"));

	&mov		("edx",10);			# loop counter
	&nop		();

&set_label("loop",16);
	&paddd		($xa,$xb_);			# elsewhere
	&movdqa		($xb,$xb_);
	&pxor		($xd,$xa);			# elsewhere
	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
	&dec		("edx");
	&jnz		(&label("loop"));

	# Flush the words still held in registers to the working state.
	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
	&movdqa		($xa3,&QWP(16*3-128,"ebx"));

    # Four passes, one per group of four state words ($i is the byte
    # offset of the group within the 256-byte state).
    for($i=0;$i<256;$i+=64) {
	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));

	&movdqa		($xt2,$xa0);		# "de-interlace" data
	&punpckldq	($xa0,$xa1);
	&movdqa		($xt3,$xa2);
	&punpckldq	($xa2,$xa3);
	&punpckhdq	($xt2,$xa1);
	&punpckhdq	($xt3,$xa3);
	&movdqa		($xa1,$xa0);
	&punpcklqdq	($xa0,$xa2);		# "a0"
	&movdqa		($xa3,$xt2);
	&punpcklqdq	($xt2,$xt3);		# "a2"
	&punpckhqdq	($xa1,$xa2);		# "a1"
	&punpckhqdq	($xa3,$xt3);		# "a3"

	#($xa2,$xt2)=($xt2,$xa2);

	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
	&movdqu		($xt1,&QWP(64*1-128,$inp));
	&movdqu		($xa2,&QWP(64*2-128,$inp));
	&movdqu		($xt3,&QWP(64*3-128,$inp));
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&pxor		($xt0,$xa0);
	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&pxor		($xt1,$xa1);
	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&pxor		($xt2,$xa2);
	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&pxor		($xt3,$xa3);
	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
	&movdqu		(&QWP(64*1-128,$out),$xt1);
	&movdqu		(&QWP(64*2-128,$out),$xt2);
	&movdqu		(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	&sub		($len,64*4);
	&jnc		(&label("outer_loop"));

	&add		($len,64*4);
	&jz		(&label("done"));

	# Leftover (<256 bytes): undo the pointer bias and rebuild a
	# single-block counter for the 1x path.
	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea		($inp,&DWP(-128,$inp));
	&mov		("edx",&DWP(512+4,"esp"));
	&lea		($out,&DWP(-128,$out));

	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&movdqu		("xmm3",&QWP(0,"ebx"));
	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
	&pand		("xmm3",&QWP(16*7,"eax"));
	&por		("xmm3","xmm2");		# counter value
}
{
# 1x path: one 64-byte block at a time, each xmm register holding one
# row of four state words.
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# Emits one quarter-round applied to all four columns at once.
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot16);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,20);
	&pslld		($t,12);
	&por		($b,$t);

	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot24);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,25);
	&pslld		($t,7);
	&por		($b,$t);
}

&set_label("1x");
	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
	&movdqu		($b,&QWP(0,"edx"));
	&movdqu		($c,&QWP(16,"edx"));
	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
	&movdqa		($rot16,&QWP(0,"eax"));
	&movdqa		($rot24,&QWP(16,"eax"));
	&mov		(&DWP(16*3,"esp"),"ebp");

	# Keep the input block at 16*k("esp") for the final addition.
	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&mov		("edx",10);
	&jmp		(&label("loop1x"));

&set_label("outer1x",16);
	&movdqa		($d,&QWP(16*5,"eax"));		# one
	&movdqa		($a,&QWP(16*0,"esp"));
	&movdqa		($b,&QWP(16*1,"esp"));
	&movdqa		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));
	&mov		("edx",10);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&jmp		(&label("loop1x"));

&set_label("loop1x",16);
	# The pshufd's rotate rows b, c, d between the two half-rounds and
	# rotate them back afterwards.
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec		("edx");
	&jnz		(&label("loop1x"));

	&paddd		($a,&QWP(16*0,"esp"));
	&paddd		($b,&QWP(16*1,"esp"));
	&paddd		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));

	&cmp		($len,64);
	&jb		(&label("tail"));

	&movdqu		($t,&QWP(16*0,$inp));
	&movdqu		($t1,&QWP(16*1,$inp));
	&pxor		($a,$t);		# xor with input
	&movdqu		($t,&QWP(16*2,$inp));
	&pxor		($b,$t1);
	&movdqu		($t1,&QWP(16*3,$inp));
	&pxor		($c,$t);
	&pxor		($d,$t1);
	&lea		($inp,&DWP(16*4,$inp));	# inp+=64

	&movdqu		(&QWP(16*0,$out),$a);	# write output
	&movdqu		(&QWP(16*1,$out),$b);
	&movdqu		(&QWP(16*2,$out),$c);
	&movdqu		(&QWP(16*3,$out),$d);
	&lea		($out,&DWP(16*4,$out));	# out+=64

	&sub		($len,64);
	&jnz		(&label("outer1x"));

	&jmp		(&label("done"));

&set_label("tail");
	# Spill the key-stream block and xor the remaining $len bytes one
	# at a time; ebp is the byte index.
	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);

	&xor		("eax","eax");
	&xor		("edx","edx");
	&xor		("ebp","ebp");

&set_label("tail_loop");
	&movb		("al",&BP(0,"esp","ebp"));
	&movb		("dl",&BP(0,$inp,"ebp"));
	&lea		("ebp",&DWP(1,"ebp"));
	&xor		("al","dl");
	&movb		(&BP(-1,$out,"ebp"),"al");
	&dec		($len);
	&jnz		(&label("tail_loop"));
}
&set_label("done");
	&mov		("esp",&DWP(512,"esp"));
&function_end("ChaCha20_ssse3");

# Constant table shared by the SSSE3 and XOP paths, addressed off eax:
#	16*0	pshufb mask, rotate by 16
#	16*1	pshufb mask, rotate by 8
#	16*2	sigma
#	16*3	per-lane counter offsets 0,1,2,3
#	16*4	4,4,4,4 (counter step for the 4x path)
#	16*5	"one"
#	16*6	"four"
#	16*7	mask that clears the first (counter) dword
&align	(64);
&set_label("ssse3_data");
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
&data_word(0,1,2,3);
&data_word(4,4,4,4);
&data_word(1,0,0,0);
&data_word(4,0,0,0);
&data_word(0,-1,-1,-1);
&align	(64);
}
&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

if ($ymm) {
########################################################################
# XOP code path.  Same structure and frame layout as the SSSE3 path, but
# uses three-operand AVX encodings and vprotd for every rotation.
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# QUARTERROUND_XOP($ai,$bi,$ci,$di,$i)
#
# XOP counterpart of QUARTERROUND_SSSE3: identical index bookkeeping and
# register swapping.  With three-operand forms the "b" value can be taken
# straight from $xb_ on the first call ($i==0), and the "d" value for the
# next call straight from $xd when $di==$dn, so no register copies are
# emitted for those cases.
sub QUARTERROUND_XOP {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	if ($i==0) {
	    my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
	    my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
	    my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
	    my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&vpaddd	($xa,$xa,$xb);			# see elsewhere
	#&vpxor		($xd,$xd,$xa);			# see elsewhere
	 &vmovdqa	(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&vprotd		($xd,$xd,16);
	 &vmovdqa	(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&vpaddd		($xc,$xc,$xd);
	 &vmovdqa	($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&vpxor		($xb,$i!=0?$xb:$xb_,$xc);
	 &vmovdqa	($xa_,&QWP(16*$an-128,"ebx"));
	&vprotd		($xb,$xb,12);
	 &vmovdqa	($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&vpaddd		($xa,$xa,$xb);
	 &vmovdqa	($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&vpxor		($xd,$xd,$xa);
	 &vpaddd	($xa_,$xa_,$xb_)		if ($i<7);	# elsewhere
	&vprotd		($xd,$xd,8);
	&vmovdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&vpaddd		($xc,$xc,$xd);
	&vmovdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	&vpxor		($xb,$xb,$xc);
	 &vpxor		($xd_,$di==$dn?$xd:$xd_,$xa_)	if ($i<7);	# elsewhere
	&vprotd		($xb,$xb,7);

	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}

# ChaCha20_xop is entered through xop_shortcut from ChaCha20_ssse3, with
# eax still holding the address of pic_point.
&function_begin("ChaCha20_xop");
&set_label("xop_shortcut");
	&mov		($out,&wparam(0));
	&mov		($inp,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		("edx",&wparam(3));		# key
	&mov		("ebx",&wparam(4));		# counter and nonce
	&vzeroupper	();

	&mov		("ebp","esp");
	&stack_push	(131);
	&and		("esp",-64);
	&mov		(&DWP(512,"esp"),"ebp");

	# eax: pic_point -> ssse3_data
	&lea		("eax",&DWP(&label("ssse3_data")."-".
				    &label("pic_point"),"eax"));
	&vmovdqu	("xmm3",&QWP(0,"ebx"));		# counter and nonce

	&cmp		($len,64*4);
	&jb		(&label("1x"));

	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov		(&DWP(512+8,"esp"),"ebx");
	&sub		($len,64*4);			# bias len
	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization

	&vmovdqu	("xmm7",&QWP(0,"edx"));		# key
	&vpshufd	("xmm0","xmm3",0x00);
	&vpshufd	("xmm1","xmm3",0x55);
	&vpshufd	("xmm2","xmm3",0xaa);
	&vpshufd	("xmm3","xmm3",0xff);
	 &vpaddd	("xmm0","xmm0",&QWP(16*3,"eax"));	# fix counters
	&vpshufd	("xmm4","xmm7",0x00);
	&vpshufd	("xmm5","xmm7",0x55);
	 &vpsubd	("xmm0","xmm0",&QWP(16*4,"eax"));
	&vpshufd	("xmm6","xmm7",0xaa);
	&vpshufd	("xmm7","xmm7",0xff);
	&vmovdqa	(&QWP(16*12-128,"ebp"),"xmm0");
	&vmovdqa	(&QWP(16*13-128,"ebp"),"xmm1");
	&vmovdqa	(&QWP(16*14-128,"ebp"),"xmm2");
	&vmovdqa	(&QWP(16*15-128,"ebp"),"xmm3");
	 &vmovdqu	("xmm3",&QWP(16,"edx"));	# key
	&vmovdqa	(&QWP(16*4-128,"ebp"),"xmm4");
	&vmovdqa	(&QWP(16*5-128,"ebp"),"xmm5");
	&vmovdqa	(&QWP(16*6-128,"ebp"),"xmm6");
	&vmovdqa	(&QWP(16*7-128,"ebp"),"xmm7");
	 &vmovdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	 &lea		("ebx",&DWP(128,"esp"));	# size optimization

	&vpshufd	("xmm0","xmm3",0x00);
	&vpshufd	("xmm1","xmm3",0x55);
	&vpshufd	("xmm2","xmm3",0xaa);
	&vpshufd	("xmm3","xmm3",0xff);
	&vpshufd	("xmm4","xmm7",0x00);
	&vpshufd	("xmm5","xmm7",0x55);
	&vpshufd	("xmm6","xmm7",0xaa);
	&vpshufd	("xmm7","xmm7",0xff);
	&vmovdqa	(&QWP(16*8-128,"ebp"),"xmm0");
	&vmovdqa	(&QWP(16*9-128,"ebp"),"xmm1");
	&vmovdqa	(&QWP(16*10-128,"ebp"),"xmm2");
	&vmovdqa	(&QWP(16*11-128,"ebp"),"xmm3");
	&vmovdqa	(&QWP(16*0-128,"ebp"),"xmm4");
	&vmovdqa	(&QWP(16*1-128,"ebp"),"xmm5");
	&vmovdqa	(&QWP(16*2-128,"ebp"),"xmm6");
	&vmovdqa	(&QWP(16*3-128,"ebp"),"xmm7");

	&lea		($inp,&DWP(128,$inp));		# size optimization
	&lea		($out,&DWP(128,$out));		# size optimization
	&jmp		(&label("outer_loop"));

&set_label("outer_loop",32);
	#&vmovdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&vmovdqa	("xmm1",&QWP(16*1-128,"ebp"));
	&vmovdqa	("xmm2",&QWP(16*2-128,"ebp"));
	&vmovdqa	("xmm3",&QWP(16*3-128,"ebp"));
	#&vmovdqa	("xmm4",&QWP(16*4-128,"ebp"));
	&vmovdqa	("xmm5",&QWP(16*5-128,"ebp"));
	&vmovdqa	("xmm6",&QWP(16*6-128,"ebp"));
	&vmovdqa	("xmm7",&QWP(16*7-128,"ebp"));
	#&vmovdqa	(&QWP(16*0-128,"ebx"),"xmm0");
	&vmovdqa	(&QWP(16*1-128,"ebx"),"xmm1");
	&vmovdqa	(&QWP(16*2-128,"ebx"),"xmm2");
	&vmovdqa	(&QWP(16*3-128,"ebx"),"xmm3");
	#&vmovdqa	(&QWP(16*4-128,"ebx"),"xmm4");
	&vmovdqa	(&QWP(16*5-128,"ebx"),"xmm5");
	&vmovdqa	(&QWP(16*6-128,"ebx"),"xmm6");
	&vmovdqa	(&QWP(16*7-128,"ebx"),"xmm7");
	#&vmovdqa	("xmm0",&QWP(16*8-128,"ebp"));
	#&vmovdqa	("xmm1",&QWP(16*9-128,"ebp"));
	&vmovdqa	("xmm2",&QWP(16*10-128,"ebp"));
	&vmovdqa	("xmm3",&QWP(16*11-128,"ebp"));
	&vmovdqa	("xmm4",&QWP(16*12-128,"ebp"));
	&vmovdqa	("xmm5",&QWP(16*13-128,"ebp"));
	&vmovdqa	("xmm6",&QWP(16*14-128,"ebp"));
	&vmovdqa	("xmm7",&QWP(16*15-128,"ebp"));
	&vpaddd		("xmm4","xmm4",&QWP(16*4,"eax"));	# counter value
	#&vmovdqa	(&QWP(16*8-128,"ebx"),"xmm0");
	#&vmovdqa	(&QWP(16*9-128,"ebx"),"xmm1");
	&vmovdqa	(&QWP(16*10-128,"ebx"),"xmm2");
	&vmovdqa	(&QWP(16*11-128,"ebx"),"xmm3");
	&vmovdqa	(&QWP(16*12-128,"ebx"),"xmm4");
	&vmovdqa	(&QWP(16*13-128,"ebx"),"xmm5");
	&vmovdqa	(&QWP(16*14-128,"ebx"),"xmm6");
	&vmovdqa	(&QWP(16*15-128,"ebx"),"xmm7");
	&vmovdqa	(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&vmovdqa	($xa, &QWP(16*0-128,"ebp"));
	&vmovdqa	($xd, "xmm4");
	&vmovdqa	($xb_,&QWP(16*4-128,"ebp"));
	&vmovdqa	($xc, &QWP(16*8-128,"ebp"));
	&vmovdqa	($xc_,&QWP(16*9-128,"ebp"));

	&mov		("edx",10);			# loop counter
	&nop		();

&set_label("loop",32);
	&vpaddd		($xa,$xa,$xb_);			# elsewhere
	&vpxor		($xd,$xd,$xa);			# elsewhere
	&QUARTERROUND_XOP(0, 4, 8, 12, 0);
	&QUARTERROUND_XOP(1, 5, 9, 13, 1);
	&QUARTERROUND_XOP(2, 6,10, 14, 2);
	&QUARTERROUND_XOP(3, 7,11, 15, 3);
	&QUARTERROUND_XOP(0, 5,10, 15, 4);
	&QUARTERROUND_XOP(1, 6,11, 12, 5);
	&QUARTERROUND_XOP(2, 7, 8, 13, 6);
	&QUARTERROUND_XOP(3, 4, 9, 14, 7);
	&dec		("edx");
	&jnz		(&label("loop"));

	&vmovdqa	(&QWP(16*4-128,"ebx"),$xb_);
	&vmovdqa	(&QWP(16*8-128,"ebx"),$xc);
	&vmovdqa	(&QWP(16*9-128,"ebx"),$xc_);
	&vmovdqa	(&QWP(16*12-128,"ebx"),$xd);
	&vmovdqa	(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&vmovdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&vmovdqa	($xa1,&QWP(16*1-128,"ebx"));
	&vmovdqa	($xa2,&QWP(16*2-128,"ebx"));
	&vmovdqa	($xa3,&QWP(16*3-128,"ebx"));

    for($i=0;$i<256;$i+=64) {
	&vpaddd		($xa0,$xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&vpaddd		($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
	&vpaddd		($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
	&vpaddd		($xa3,$xa3,&QWP($i+16*3-128,"ebp"));

	&vpunpckldq	($xt2,$xa0,$xa1);	# "de-interlace" data
	&vpunpckldq	($xt3,$xa2,$xa3);
	&vpunpckhdq	($xa0,$xa0,$xa1);
	&vpunpckhdq	($xa2,$xa2,$xa3);
	&vpunpcklqdq	($xa1,$xt2,$xt3);	# "a0"
	&vpunpckhqdq	($xt2,$xt2,$xt3);	# "a1"
	&vpunpcklqdq	($xt3,$xa0,$xa2);	# "a2"
	&vpunpckhqdq	($xa3,$xa0,$xa2);	# "a3"

	&vpxor		($xt0,$xa1,&QWP(64*0-128,$inp));
	&vpxor		($xt1,$xt2,&QWP(64*1-128,$inp));
	&vpxor		($xt2,$xt3,&QWP(64*2-128,$inp));
	&vpxor		($xt3,$xa3,&QWP(64*3-128,$inp));
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&vmovdqa	($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&vmovdqa	($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&vmovdqa	($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&vmovdqa	($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&vmovdqu	(&QWP(64*0-128,$out),$xt0);	# store output
	&vmovdqu	(&QWP(64*1-128,$out),$xt1);
	&vmovdqu	(&QWP(64*2-128,$out),$xt2);
	&vmovdqu	(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	&sub		($len,64*4);
	&jnc		(&label("outer_loop"));

	&add		($len,64*4);
	&jz		(&label("done"));

	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea		($inp,&DWP(-128,$inp));
	&mov		("edx",&DWP(512+4,"esp"));
	&lea		($out,&DWP(-128,$out));

	&vmovd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&vmovdqu	("xmm3",&QWP(0,"ebx"));
	&vpaddd		("xmm2","xmm2",&QWP(16*6,"eax"));# +four
	&vpand		("xmm3","xmm3",&QWP(16*7,"eax"));
	&vpor		("xmm3","xmm3","xmm2");		# counter value
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# Emits one quarter-round applied to all four columns at once.
sub XOPROUND {
	&vpaddd		($a,$a,$b);
	&vpxor		($d,$d,$a);
	&vprotd		($d,$d,16);

	&vpaddd		($c,$c,$d);
	&vpxor		($b,$b,$c);
	&vprotd		($b,$b,12);

	&vpaddd		($a,$a,$b);
	&vpxor		($d,$d,$a);
	&vprotd		($d,$d,8);

	&vpaddd		($c,$c,$d);
	&vpxor		($b,$b,$c);
	&vprotd		($b,$b,7);
}

&set_label("1x");
	&vmovdqa	($a,&QWP(16*2,"eax"));		# sigma
	&vmovdqu	($b,&QWP(0,"edx"));
	&vmovdqu	($c,&QWP(16,"edx"));
	#&vmovdqu	($d,&QWP(0,"ebx"));		# already loaded
	&vmovdqa	($rot16,&QWP(0,"eax"));
	&vmovdqa	($rot24,&QWP(16,"eax"));
	&mov		(&DWP(16*3,"esp"),"ebp");

	&vmovdqa	(&QWP(16*0,"esp"),$a);
	&vmovdqa	(&QWP(16*1,"esp"),$b);
	&vmovdqa	(&QWP(16*2,"esp"),$c);
	&vmovdqa	(&QWP(16*3,"esp"),$d);
	&mov		("edx",10);
	&jmp		(&label("loop1x"));

&set_label("outer1x",16);
	&vmovdqa	($d,&QWP(16*5,"eax"));		# one
	&vmovdqa	($a,&QWP(16*0,"esp"));
	&vmovdqa	($b,&QWP(16*1,"esp"));
	&vmovdqa	($c,&QWP(16*2,"esp"));
	&vpaddd		($d,$d,&QWP(16*3,"esp"));
	&mov		("edx",10);
	&vmovdqa	(&QWP(16*3,"esp"),$d);
	&jmp		(&label("loop1x"));

&set_label("loop1x",16);
	&XOPROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&XOPROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec		("edx");
	&jnz		(&label("loop1x"));

	&vpaddd		($a,$a,&QWP(16*0,"esp"));
	&vpaddd		($b,$b,&QWP(16*1,"esp"));
	&vpaddd		($c,$c,&QWP(16*2,"esp"));
	&vpaddd		($d,$d,&QWP(16*3,"esp"));

	&cmp		($len,64);
	&jb		(&label("tail"));

	&vpxor		($a,$a,&QWP(16*0,$inp));	# xor with input
	&vpxor		($b,$b,&QWP(16*1,$inp));
	&vpxor		($c,$c,&QWP(16*2,$inp));
	&vpxor		($d,$d,&QWP(16*3,$inp));
	&lea		($inp,&DWP(16*4,$inp));		# inp+=64

	&vmovdqu	(&QWP(16*0,$out),$a);		# write output
	&vmovdqu	(&QWP(16*1,$out),$b);
	&vmovdqu	(&QWP(16*2,$out),$c);
	&vmovdqu	(&QWP(16*3,$out),$d);
	&lea		($out,&DWP(16*4,$out));		# out+=64

	&sub		($len,64);
	&jnz		(&label("outer1x"));

	&jmp		(&label("done"));

&set_label("tail");
	&vmovdqa	(&QWP(16*0,"esp"),$a);
	&vmovdqa	(&QWP(16*1,"esp"),$b);
	&vmovdqa	(&QWP(16*2,"esp"),$c);
	&vmovdqa	(&QWP(16*3,"esp"),$d);

	&xor		("eax","eax");
	&xor		("edx","edx");
	&xor		("ebp","ebp");

&set_label("tail_loop");
	&movb		("al",&BP(0,"esp","ebp"));
	&movb		("dl",&BP(0,$inp,"ebp"));
	&lea		("ebp",&DWP(1,"ebp"));
	&xor		("al","dl");
	&movb		(&BP(-1,$out,"ebp"),"al");
	&dec		($len);
	&jnz		(&label("tail_loop"));
}
&set_label("done");
	&vzeroupper	();
	&mov		("esp",&DWP(512,"esp"));
&function_end("ChaCha20_xop");
}

&asm_finish();

# Buffered write errors (e.g. disk full) only surface at close time, so a
# failure here must abort the build rather than leave a truncated file.
close STDOUT or die "error closing STDOUT: $!";