#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# The code below is a [lane complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
# instead of actually unrolling the loop pair-wise, I simply flip the
# pointers to T[][] and A[][] at the end of each round (sketched below).
# Since the number of rounds is even, the last round writes to A[][] and
# everything works out. How does it compare to the x86_64 assembly module
# in the Keccak Code Package? Depending on the processor it's either as
# fast or faster by up to 15%...
#
########################################################################
# Numbers are cycles per processed byte for a large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. Improvement over compiler-generated
#	code varies a lot; the most common coefficient is 15% in
#	comparison to gcc-5.x, 50% for gcc-4.x, and 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can
#	be improved by 14% by replacing rotates with a double-precision
#	shift using the same register as source and destination.
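#
# A minimal sketch of the round scheduling described above, in C-like
# pseudocode (the names Round() and T[][] are illustrative only and are
# not symbols emitted by this module):
#
#	uint64_t A[5][5], T[5][5];
#
#	for (i = 0; i < 24; i += 2) {
#		Round(T, A, iotas[i]);		/* read A[][], write T[][] */
#		Round(A, T, iotas[i+1]);	/* read T[][], write A[][] */
#	}
#
# Rather than emitting two distinct round bodies, the code below keeps a
# single round body and swaps the source and destination pointers (%rsi
# and %rdi) with xchg at the end of each round; since 24 is even, the
# final result lands back in A[][].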

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
	      8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
		[ 36, 44,  6, 55, 20 ],
		[  3, 10, 43, 25, 39 ],
		[ 41, 45, 15, 21,  8 ],
		[ 18,  2, 61, 56, 14 ]);

$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
.cfi_startproc
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],@C[0]
	xor	$A[0][1](%rdi),@C[1]
	xor	$A[1][2](%rdi),@C[2]
	xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],@C[2]
	xor	$A[2][0](%rdi),@C[0]
	xor	$A[1][3](%rdi),@C[3]
	xor	@D[1],@C[1]
	xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	xor	$A[2][3](%rdi),@C[3]
	xor	$A[2][1](%rdi),@C[1]
	xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	xor	@D[3],@C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
	(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	or	@C[2],@C[1]
	xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	xor	($iotas),@C[1]
	lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
	mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
	mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])


	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	mov	@C[0],@T[0]
	or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		#           C[3] ^ (C[0] | C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] | C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		#           C[4] ^ (C[1] & C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] & C[0])

	or	@C[3],@C[4]
	mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		#           C[1] ^ (C[3] & C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] & C[2])

	or	@C[2],@T[1]
	mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		#           C[0] ^ (C[1] | C[2])
	mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] | C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	mov	@C[2],@T[0]
	and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#           C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] = C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		#           C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] = C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		#           C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] = C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		#           C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		#          ~C[3] ^ ( C[0] | C[4])
	mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	mov	@C[2],@T[0]
	or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#           C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] = C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		#           C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] = C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		#           C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] = C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		#           C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] = C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		#          ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
	@C = @D[2..4,0,1];
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		#           C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] = C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		#           C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] = C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		#           C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] = C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		#           C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] = C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		#          ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.cfi_endproc
.size	__KeccakF1600,.-__KeccakF1600

.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___
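
# The exported entry points below are written against the calling
# convention used by OpenSSL's generic Keccak code (see sha/keccak1600.c).
# As a rough, assumed C view of these interfaces:
#
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#	                   size_t len, size_t r);
#	void   SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#	                    size_t len, size_t r);
#
# where r is the rate in bytes. SHA3_absorb XORs full r-byte blocks into
# A[][], permuting the state after each block, and returns the number of
# trailing bytes (less than r) left unprocessed for the caller to buffer.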

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze

.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8, %rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	# The replacement below results in 11.2 on Sandy Bridge and 9.4 on
	# Haswell, but it hurts other processors by up to 2-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# The replacement below results in 9.3 on Haswell [as well as
	# on Ryzen, i.e. it *hurts* Ryzen]...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

close STDOUT or die "error closing STDOUT: $!";