#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# The code below is a [lane-complementing] KECCAK_2X implementation
# (see sha/keccak1600.c) with C[5] and D[5] held in the register bank.
# Though instead of actually unrolling the loop pair-wise, I simply
# flip the pointers to T[][] and A[][] at the end of each round. Since
# the number of rounds is even, the last round writes to A[][] and
# everything works out. How does it compare to the x86_64 assembly
# module in the Keccak Code Package? Depending on processor it's
# either as fast or faster by up to 15%...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. The improvement over compiler-generated
#	code varies a lot; the most common coefficient is 15% in
#	comparison to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can be
#	improved by 14% by replacing the rotates with a double-precision
#	shift with the same register as source and destination.
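#
# For orientation, the textbook chi step that each round applies is,
# per row (cf. sha/keccak1600.c; rough C illustration only, not part
# of the generated code):
#
#	for (x = 0; x < 5; x++)
#		R[x] = C[x] ^ (~C[(x + 1) % 5] & C[(x + 2) % 5]);
#
# Lane complementing keeps selected lanes bit-inverted across rounds,
# so most of the ~C[x+1] & C[x+2] terms can be computed with plain
# and/or on the (partly complemented) lanes, saving standalone not
# instructions; that is why the R[][] store comments below mix |, &
# and ~.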

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],@C[0]
	xor	$A[0][1](%rdi),@C[1]
	xor	$A[1][2](%rdi),@C[2]
	xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],@C[2]
	xor	$A[2][0](%rdi),@C[0]
	xor	$A[1][3](%rdi),@C[3]
	xor	@D[1],@C[1]
	xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	xor	$A[2][3](%rdi),@C[3]
	xor	$A[2][1](%rdi),@C[1]
	xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	xor	@D[3],@C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
	(@D[0..4], @C) = (@C[1..4,0], @D);
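	# The swap above renames registers: @D[0..4] now hold the theta
	# values D[0..4] just computed, and the old @D registers are
	# recycled as the new @C. In C terms the quantities above are
	# (rough illustration only, not part of the generated code):
	#
	#	for (x = 0; x < 5; x++)	/* C[x] = A[0][x]^...^A[4][x] */
	#		D[x] = ROL64(C[(x + 1) % 5], 1) ^ C[(x + 4) % 5];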
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	or	@C[2],@C[1]
	xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	xor	($iotas),@C[1]
	lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
	mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
	mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])


	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	mov	@C[0],@T[0]
	or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		#           C[3] ^ ( C[0] | C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ ( C[0] | C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		#           C[4] ^ ( C[1] & C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ ( C[1] & C[0])

	or	@C[3],@C[4]
	mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		#           C[1] ^ ( C[3] & C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ ( C[3] & C[2])

	or	@C[2],@T[1]
	mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		#           C[0] ^ ( C[1] | C[2])
	mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ ( C[1] | C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	mov	@C[2],@T[0]
	and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
	mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	mov	@C[2],@T[0]
	or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
	@C = @D[2..4,0,1];
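	# The xchg above flipped the A[][] (%rdi) and T[][] (%rsi)
	# pointers, so the last row computed below is stored into what
	# the next round reads as input; the remap just above points @C
	# at the @D registers that already hold the rho-rotated lanes
	# for that row's chi step.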
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		#           C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] = C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		#           C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] = C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		#           C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] = C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		#           C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] = C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		#          ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.size	__KeccakF1600,.-__KeccakF1600

.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232
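
# A rough C model of SHA3_absorb above (illustration only; load64() is
# a hypothetical little-endian 8-byte load, and the complementing of
# selected A[][] lanes on entry/exit is omitted):
#
#	size_t SHA3_absorb(uint64_t A_flat[25], const unsigned char *inp,
#	                   size_t len, size_t bsz)
#	{
#		while (len >= bsz) {
#			for (size_t i = 0; i < bsz / 8; i++)
#				A_flat[i] ^= load64(inp + 8 * i);
#			inp += bsz;
#			len -= bsz;
#			KeccakF1600(A_flat);
#		}
#		return len;	/* residual bytes, less than bsz */
#	}
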

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze

.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8,%rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep	movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	# The replacement below results in 11.2 on Sandy Bridge and 9.4
	# on Haswell, but it hurts other processors by up to 2-3-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# The replacement below results in 9.3 on Haswell [as well as
	# on Ryzen, i.e. it *hurts* Ryzen]...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

close STDOUT;
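
# For reference, a rough C model of SHA3_squeeze above (illustration
# only; store64() is a hypothetical little-endian 8-byte store):
#
#	void SHA3_squeeze(uint64_t A_flat[25], unsigned char *out,
#	                  size_t len, size_t bsz)
#	{
#		size_t i = 0;
#
#		while (len >= 8) {
#			store64(out, A_flat[i]);
#			out += 8, len -= 8;
#			if (len == 0)
#				return;
#			if (++i == bsz / 8) {
#				KeccakF1600(A_flat);
#				i = 0;
#			}
#		}
#		memcpy(out, (unsigned char *)&A_flat[i], len);
#	}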