#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*)	Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#	in-order, i.e. load instruction has to complete prior next
#	instruction in given thread is executed, even if the latter is
#	not dependent on load result! This means that on T1 two 32-bit
#	loads are always slower than one 64-bit load. Once again this
#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#	2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
58 59$output=pop; 60open STDOUT,">$output"; 61 62if ($output =~ /512/) { 63 $label="512"; 64 $SZ=8; 65 $LD="ldx"; # load from memory 66 $ST="stx"; # store to memory 67 $SLL="sllx"; # shift left logical 68 $SRL="srlx"; # shift right logical 69 @Sigma0=(28,34,39); 70 @Sigma1=(14,18,41); 71 @sigma0=( 7, 1, 8); # right shift first 72 @sigma1=( 6,19,61); # right shift first 73 $lastK=0x817; 74 $rounds=80; 75 $align=4; 76 77 $locals=16*$SZ; # X[16] 78 79 $A="%o0"; 80 $B="%o1"; 81 $C="%o2"; 82 $D="%o3"; 83 $E="%o4"; 84 $F="%o5"; 85 $G="%g1"; 86 $H="%o7"; 87 @V=($A,$B,$C,$D,$E,$F,$G,$H); 88} else { 89 $label="256"; 90 $SZ=4; 91 $LD="ld"; # load from memory 92 $ST="st"; # store to memory 93 $SLL="sll"; # shift left logical 94 $SRL="srl"; # shift right logical 95 @Sigma0=( 2,13,22); 96 @Sigma1=( 6,11,25); 97 @sigma0=( 3, 7,18); # right shift first 98 @sigma1=(10,17,19); # right shift first 99 $lastK=0x8f2; 100 $rounds=64; 101 $align=8; 102 103 $locals=0; # X[16] is register resident 104 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7"); 105 106 $A="%l0"; 107 $B="%l1"; 108 $C="%l2"; 109 $D="%l3"; 110 $E="%l4"; 111 $F="%l5"; 112 $G="%l6"; 113 $H="%l7"; 114 @V=($A,$B,$C,$D,$E,$F,$G,$H); 115} 116$T1="%g2"; 117$tmp0="%g3"; 118$tmp1="%g4"; 119$tmp2="%g5"; 120 121$ctx="%i0"; 122$inp="%i1"; 123$len="%i2"; 124$Ktbl="%i3"; 125$tmp31="%i4"; 126$tmp32="%i5"; 127 128########### SHA256 129$Xload = sub { 130my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 131 132 if ($i==0) { 133$code.=<<___; 134 ldx [$inp+0],@X[0] 135 ldx [$inp+16],@X[2] 136 ldx [$inp+32],@X[4] 137 ldx [$inp+48],@X[6] 138 ldx [$inp+8],@X[1] 139 ldx [$inp+24],@X[3] 140 subcc %g0,$tmp31,$tmp32 ! 
should be 64-$tmp31, but -$tmp31 works too 141 ldx [$inp+40],@X[5] 142 bz,pt %icc,.Laligned 143 ldx [$inp+56],@X[7] 144 145 sllx @X[0],$tmp31,@X[0] 146 ldx [$inp+64],$T1 147___ 148for($j=0;$j<7;$j++) 149{ $code.=<<___; 150 srlx @X[$j+1],$tmp32,$tmp1 151 sllx @X[$j+1],$tmp31,@X[$j+1] 152 or $tmp1,@X[$j],@X[$j] 153___ 154} 155$code.=<<___; 156 srlx $T1,$tmp32,$T1 157 or $T1,@X[7],@X[7] 158.Laligned: 159___ 160 } 161 162 if ($i&1) { 163 $code.="\tadd @X[$i/2],$h,$T1\n"; 164 } else { 165 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n"; 166 } 167} if ($SZ==4); 168 169########### SHA512 170$Xload = sub { 171my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 172my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8)); 173 174$code.=<<___ if ($i==0); 175 ld [$inp+0],%l0 176 ld [$inp+4],%l1 177 ld [$inp+8],%l2 178 ld [$inp+12],%l3 179 ld [$inp+16],%l4 180 ld [$inp+20],%l5 181 ld [$inp+24],%l6 182 cmp $tmp31,0 183 ld [$inp+28],%l7 184___ 185$code.=<<___ if ($i<15); 186 sllx @pair[1],$tmp31,$tmp2 ! Xload($i) 187 add $tmp31,32,$tmp0 188 sllx @pair[0],$tmp0,$tmp1 189 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)` 190 srlx @pair[2],$tmp32,@pair[1] 191 or $tmp1,$tmp2,$tmp2 192 or @pair[1],$tmp2,$tmp2 193 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)` 194 add $h,$tmp2,$T1 195 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] 196___ 197$code.=<<___ if ($i==12); 198 bnz,a,pn %icc,.+8 199 ld [$inp+128],%l0 200___ 201$code.=<<___ if ($i==15); 202 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 203 sllx @pair[1],$tmp31,$tmp2 ! 
Xload($i) 204 add $tmp31,32,$tmp0 205 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 206 sllx @pair[0],$tmp0,$tmp1 207 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 208 srlx @pair[2],$tmp32,@pair[1] 209 or $tmp1,$tmp2,$tmp2 210 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 211 or @pair[1],$tmp2,$tmp2 212 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 213 add $h,$tmp2,$T1 214 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] 215 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 216 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 217 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 218___ 219} if ($SZ==8); 220 221########### common 222sub BODY_00_15 { 223my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 224 225 if ($i<16) { 226 &$Xload(@_); 227 } else { 228 $code.="\tadd $h,$T1,$T1\n"; 229 } 230 231$code.=<<___; 232 $SRL $e,@Sigma1[0],$h !! $i 233 xor $f,$g,$tmp2 234 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1 235 and $e,$tmp2,$tmp2 236 $SRL $e,@Sigma1[1],$tmp0 237 xor $tmp1,$h,$h 238 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1 239 xor $tmp0,$h,$h 240 $SRL $e,@Sigma1[2],$tmp0 241 xor $tmp1,$h,$h 242 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1 243 xor $tmp0,$h,$h 244 xor $g,$tmp2,$tmp2 ! Ch(e,f,g) 245 xor $tmp1,$h,$tmp0 ! Sigma1(e) 246 247 $SRL $a,@Sigma0[0],$h 248 add $tmp2,$T1,$T1 249 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i] 250 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1 251 add $tmp0,$T1,$T1 252 $SRL $a,@Sigma0[1],$tmp0 253 xor $tmp1,$h,$h 254 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1 255 xor $tmp0,$h,$h 256 $SRL $a,@Sigma0[2],$tmp0 257 xor $tmp1,$h,$h 258 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1 259 xor $tmp0,$h,$h 260 xor $tmp1,$h,$h ! Sigma0(a) 261 262 or $a,$b,$tmp0 263 and $a,$b,$tmp1 264 and $c,$tmp0,$tmp0 265 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c) 266 add $tmp2,$T1,$T1 ! 
+=K[$i] 267 add $tmp1,$h,$h 268 269 add $T1,$d,$d 270 add $T1,$h,$h 271___ 272} 273 274########### SHA256 275$BODY_16_XX = sub { 276my $i=@_[0]; 277my $xi; 278 279 if ($i&1) { 280 $xi=$tmp32; 281 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n"; 282 } else { 283 $xi=@X[(($i+1)/2)%8]; 284 } 285$code.=<<___; 286 srl $xi,@sigma0[0],$T1 !! Xupdate($i) 287 sll $xi,`32-@sigma0[2]`,$tmp1 288 srl $xi,@sigma0[1],$tmp0 289 xor $tmp1,$T1,$T1 290 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1 291 xor $tmp0,$T1,$T1 292 srl $xi,@sigma0[2],$tmp0 293 xor $tmp1,$T1,$T1 294___ 295 if ($i&1) { 296 $xi=@X[(($i+14)/2)%8]; 297 } else { 298 $xi=$tmp32; 299 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n"; 300 } 301$code.=<<___; 302 srl $xi,@sigma1[0],$tmp2 303 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1]) 304 sll $xi,`32-@sigma1[2]`,$tmp1 305 srl $xi,@sigma1[1],$tmp0 306 xor $tmp1,$tmp2,$tmp2 307 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1 308 xor $tmp0,$tmp2,$tmp2 309 srl $xi,@sigma1[2],$tmp0 310 xor $tmp1,$tmp2,$tmp2 311___ 312 if ($i&1) { 313 $xi=@X[($i/2)%8]; 314$code.=<<___; 315 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] 316 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) 317 srl @X[($i/2)%8],0,$tmp0 318 add $tmp2,$tmp1,$tmp1 319 add $xi,$T1,$T1 ! +=X[i] 320 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] 321 add $tmp1,$T1,$T1 322 323 srl $T1,0,$T1 324 or $T1,@X[($i/2)%8],@X[($i/2)%8] 325___ 326 } else { 327 $xi=@X[(($i+9)/2)%8]; 328$code.=<<___; 329 srlx @X[($i/2)%8],32,$tmp1 ! X[i] 330 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) 331 add $xi,$T1,$T1 ! +=X[i+9] 332 add $tmp2,$tmp1,$tmp1 333 srl @X[($i/2)%8],0,@X[($i/2)%8] 334 add $tmp1,$T1,$T1 335 336 sllx $T1,32,$tmp0 337 or $tmp0,@X[($i/2)%8],@X[($i/2)%8] 338___ 339 } 340 &BODY_00_15(@_); 341} if ($SZ==4); 342 343########### SHA512 344$BODY_16_XX = sub { 345my $i=@_[0]; 346my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1)); 347 348$code.=<<___; 349 sllx %l2,32,$tmp0 !! 
Xupdate($i) 350 or %l3,$tmp0,$tmp0 351 352 srlx $tmp0,@sigma0[0],$T1 353 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 354 sllx $tmp0,`64-@sigma0[2]`,$tmp1 355 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 356 srlx $tmp0,@sigma0[1],$tmp0 357 xor $tmp1,$T1,$T1 358 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1 359 xor $tmp0,$T1,$T1 360 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0 361 xor $tmp1,$T1,$T1 362 sllx %l6,32,$tmp2 363 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1]) 364 or %l7,$tmp2,$tmp2 365 366 srlx $tmp2,@sigma1[0],$tmp1 367 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 368 sllx $tmp2,`64-@sigma1[2]`,$tmp0 369 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 370 srlx $tmp2,@sigma1[1],$tmp2 371 xor $tmp0,$tmp1,$tmp1 372 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0 373 xor $tmp2,$tmp1,$tmp1 374 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2 375 xor $tmp0,$tmp1,$tmp1 376 sllx %l4,32,$tmp0 377 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14]) 378 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 379 or %l5,$tmp0,$tmp0 380 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 381 382 sllx %l0,32,$tmp2 383 add $tmp1,$T1,$T1 384 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 385 or %l1,$tmp2,$tmp2 386 add $tmp0,$T1,$T1 ! +=X[$i+9] 387 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 388 add $tmp2,$T1,$T1 ! 
+=X[$i] 389 $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`] 390___ 391 &BODY_00_15(@_); 392} if ($SZ==8); 393 394$code.=<<___; 395#include "sparc_arch.h" 396 397#ifdef __arch64__ 398.register %g2,#scratch 399.register %g3,#scratch 400#endif 401 402.section ".text",#alloc,#execinstr 403 404.align 64 405K${label}: 406.type K${label},#object 407___ 408if ($SZ==4) { 409$code.=<<___; 410 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 411 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 412 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 413 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 414 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc 415 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da 416 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 417 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 418 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 419 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 420 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 421 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 422 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 423 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 424 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 425 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 426___ 427} else { 428$code.=<<___; 429 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd 430 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc 431 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 432 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 433 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe 434 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 435 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 436 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 437 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 438 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 439 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 440 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 441 
.long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 442 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 443 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 444 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 445 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 446 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df 447 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 448 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b 449 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 450 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 451 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 452 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 453 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 454 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 455 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb 456 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 457 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 458 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec 459 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 460 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b 461 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 462 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 463 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 464 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b 465 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 466 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c 467 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a 468 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 469___ 470} 471$code.=<<___; 472.size K${label},.-K${label} 473 474#ifdef __PIC__ 475SPARC_PIC_THUNK(%g1) 476#endif 477 478.globl sha${label}_block_data_order 479.align 32 480sha${label}_block_data_order: 481 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) 482 ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1] 483 484 andcc %g1, CFR_SHA${label}, %g0 485 be .Lsoftware 486 nop 487___ 488$code.=<<___ if ($SZ==8); # SHA512 489 ldd [%o0 + 0x00], %f0 ! 
load context 490 ldd [%o0 + 0x08], %f2 491 ldd [%o0 + 0x10], %f4 492 ldd [%o0 + 0x18], %f6 493 ldd [%o0 + 0x20], %f8 494 ldd [%o0 + 0x28], %f10 495 andcc %o1, 0x7, %g0 496 ldd [%o0 + 0x30], %f12 497 bne,pn %icc, .Lhwunaligned 498 ldd [%o0 + 0x38], %f14 499 500.Lhwaligned_loop: 501 ldd [%o1 + 0x00], %f16 502 ldd [%o1 + 0x08], %f18 503 ldd [%o1 + 0x10], %f20 504 ldd [%o1 + 0x18], %f22 505 ldd [%o1 + 0x20], %f24 506 ldd [%o1 + 0x28], %f26 507 ldd [%o1 + 0x30], %f28 508 ldd [%o1 + 0x38], %f30 509 ldd [%o1 + 0x40], %f32 510 ldd [%o1 + 0x48], %f34 511 ldd [%o1 + 0x50], %f36 512 ldd [%o1 + 0x58], %f38 513 ldd [%o1 + 0x60], %f40 514 ldd [%o1 + 0x68], %f42 515 ldd [%o1 + 0x70], %f44 516 subcc %o2, 1, %o2 ! done yet? 517 ldd [%o1 + 0x78], %f46 518 add %o1, 0x80, %o1 519 prefetch [%o1 + 63], 20 520 prefetch [%o1 + 64+63], 20 521 522 .word 0x81b02860 ! SHA512 523 524 bne,pt SIZE_T_CC, .Lhwaligned_loop 525 nop 526 527.Lhwfinish: 528 std %f0, [%o0 + 0x00] ! store context 529 std %f2, [%o0 + 0x08] 530 std %f4, [%o0 + 0x10] 531 std %f6, [%o0 + 0x18] 532 std %f8, [%o0 + 0x20] 533 std %f10, [%o0 + 0x28] 534 std %f12, [%o0 + 0x30] 535 retl 536 std %f14, [%o0 + 0x38] 537 538.align 16 539.Lhwunaligned: 540 alignaddr %o1, %g0, %o1 541 542 ldd [%o1 + 0x00], %f18 543.Lhwunaligned_loop: 544 ldd [%o1 + 0x08], %f20 545 ldd [%o1 + 0x10], %f22 546 ldd [%o1 + 0x18], %f24 547 ldd [%o1 + 0x20], %f26 548 ldd [%o1 + 0x28], %f28 549 ldd [%o1 + 0x30], %f30 550 ldd [%o1 + 0x38], %f32 551 ldd [%o1 + 0x40], %f34 552 ldd [%o1 + 0x48], %f36 553 ldd [%o1 + 0x50], %f38 554 ldd [%o1 + 0x58], %f40 555 ldd [%o1 + 0x60], %f42 556 ldd [%o1 + 0x68], %f44 557 ldd [%o1 + 0x70], %f46 558 ldd [%o1 + 0x78], %f48 559 subcc %o2, 1, %o2 ! done yet? 
560 ldd [%o1 + 0x80], %f50 561 add %o1, 0x80, %o1 562 prefetch [%o1 + 63], 20 563 prefetch [%o1 + 64+63], 20 564 565 faligndata %f18, %f20, %f16 566 faligndata %f20, %f22, %f18 567 faligndata %f22, %f24, %f20 568 faligndata %f24, %f26, %f22 569 faligndata %f26, %f28, %f24 570 faligndata %f28, %f30, %f26 571 faligndata %f30, %f32, %f28 572 faligndata %f32, %f34, %f30 573 faligndata %f34, %f36, %f32 574 faligndata %f36, %f38, %f34 575 faligndata %f38, %f40, %f36 576 faligndata %f40, %f42, %f38 577 faligndata %f42, %f44, %f40 578 faligndata %f44, %f46, %f42 579 faligndata %f46, %f48, %f44 580 faligndata %f48, %f50, %f46 581 582 .word 0x81b02860 ! SHA512 583 584 bne,pt SIZE_T_CC, .Lhwunaligned_loop 585 for %f50, %f50, %f18 ! %f18=%f50 586 587 ba .Lhwfinish 588 nop 589___ 590$code.=<<___ if ($SZ==4); # SHA256 591 ld [%o0 + 0x00], %f0 592 ld [%o0 + 0x04], %f1 593 ld [%o0 + 0x08], %f2 594 ld [%o0 + 0x0c], %f3 595 ld [%o0 + 0x10], %f4 596 ld [%o0 + 0x14], %f5 597 andcc %o1, 0x7, %g0 598 ld [%o0 + 0x18], %f6 599 bne,pn %icc, .Lhwunaligned 600 ld [%o0 + 0x1c], %f7 601 602.Lhwloop: 603 ldd [%o1 + 0x00], %f8 604 ldd [%o1 + 0x08], %f10 605 ldd [%o1 + 0x10], %f12 606 ldd [%o1 + 0x18], %f14 607 ldd [%o1 + 0x20], %f16 608 ldd [%o1 + 0x28], %f18 609 ldd [%o1 + 0x30], %f20 610 subcc %o2, 1, %o2 ! done yet? 611 ldd [%o1 + 0x38], %f22 612 add %o1, 0x40, %o1 613 prefetch [%o1 + 63], 20 614 615 .word 0x81b02840 ! SHA256 616 617 bne,pt SIZE_T_CC, .Lhwloop 618 nop 619 620.Lhwfinish: 621 st %f0, [%o0 + 0x00] ! 
store context 622 st %f1, [%o0 + 0x04] 623 st %f2, [%o0 + 0x08] 624 st %f3, [%o0 + 0x0c] 625 st %f4, [%o0 + 0x10] 626 st %f5, [%o0 + 0x14] 627 st %f6, [%o0 + 0x18] 628 retl 629 st %f7, [%o0 + 0x1c] 630 631.align 8 632.Lhwunaligned: 633 alignaddr %o1, %g0, %o1 634 635 ldd [%o1 + 0x00], %f10 636.Lhwunaligned_loop: 637 ldd [%o1 + 0x08], %f12 638 ldd [%o1 + 0x10], %f14 639 ldd [%o1 + 0x18], %f16 640 ldd [%o1 + 0x20], %f18 641 ldd [%o1 + 0x28], %f20 642 ldd [%o1 + 0x30], %f22 643 ldd [%o1 + 0x38], %f24 644 subcc %o2, 1, %o2 ! done yet? 645 ldd [%o1 + 0x40], %f26 646 add %o1, 0x40, %o1 647 prefetch [%o1 + 63], 20 648 649 faligndata %f10, %f12, %f8 650 faligndata %f12, %f14, %f10 651 faligndata %f14, %f16, %f12 652 faligndata %f16, %f18, %f14 653 faligndata %f18, %f20, %f16 654 faligndata %f20, %f22, %f18 655 faligndata %f22, %f24, %f20 656 faligndata %f24, %f26, %f22 657 658 .word 0x81b02840 ! SHA256 659 660 bne,pt SIZE_T_CC, .Lhwunaligned_loop 661 for %f26, %f26, %f10 ! %f10=%f26 662 663 ba .Lhwfinish 664 nop 665___ 666$code.=<<___; 667.align 16 668.Lsoftware: 669 save %sp,-STACK_FRAME-$locals,%sp 670 and $inp,`$align-1`,$tmp31 671 sllx $len,`log(16*$SZ)/log(2)`,$len 672 andn $inp,`$align-1`,$inp 673 sll $tmp31,3,$tmp31 674 add $inp,$len,$len 675___ 676$code.=<<___ if ($SZ==8); # SHA512 677 mov 32,$tmp32 678 sub $tmp32,$tmp31,$tmp32 679___ 680$code.=<<___; 681.Lpic: call .+8 682 add %o7,K${label}-.Lpic,$Ktbl 683 684 $LD [$ctx+`0*$SZ`],$A 685 $LD [$ctx+`1*$SZ`],$B 686 $LD [$ctx+`2*$SZ`],$C 687 $LD [$ctx+`3*$SZ`],$D 688 $LD [$ctx+`4*$SZ`],$E 689 $LD [$ctx+`5*$SZ`],$F 690 $LD [$ctx+`6*$SZ`],$G 691 $LD [$ctx+`7*$SZ`],$H 692 693.Lloop: 694___ 695for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } 696$code.=".L16_xx:\n"; 697for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); } 698$code.=<<___; 699 and $tmp2,0xfff,$tmp2 700 cmp $tmp2,$lastK 701 bne .L16_xx 702 add $Ktbl,`16*$SZ`,$Ktbl ! 
Ktbl+=16 703 704___ 705$code.=<<___ if ($SZ==4); # SHA256 706 $LD [$ctx+`0*$SZ`],@X[0] 707 $LD [$ctx+`1*$SZ`],@X[1] 708 $LD [$ctx+`2*$SZ`],@X[2] 709 $LD [$ctx+`3*$SZ`],@X[3] 710 $LD [$ctx+`4*$SZ`],@X[4] 711 $LD [$ctx+`5*$SZ`],@X[5] 712 $LD [$ctx+`6*$SZ`],@X[6] 713 $LD [$ctx+`7*$SZ`],@X[7] 714 715 add $A,@X[0],$A 716 $ST $A,[$ctx+`0*$SZ`] 717 add $B,@X[1],$B 718 $ST $B,[$ctx+`1*$SZ`] 719 add $C,@X[2],$C 720 $ST $C,[$ctx+`2*$SZ`] 721 add $D,@X[3],$D 722 $ST $D,[$ctx+`3*$SZ`] 723 add $E,@X[4],$E 724 $ST $E,[$ctx+`4*$SZ`] 725 add $F,@X[5],$F 726 $ST $F,[$ctx+`5*$SZ`] 727 add $G,@X[6],$G 728 $ST $G,[$ctx+`6*$SZ`] 729 add $H,@X[7],$H 730 $ST $H,[$ctx+`7*$SZ`] 731___ 732$code.=<<___ if ($SZ==8); # SHA512 733 ld [$ctx+`0*$SZ+0`],%l0 734 ld [$ctx+`0*$SZ+4`],%l1 735 ld [$ctx+`1*$SZ+0`],%l2 736 ld [$ctx+`1*$SZ+4`],%l3 737 ld [$ctx+`2*$SZ+0`],%l4 738 ld [$ctx+`2*$SZ+4`],%l5 739 ld [$ctx+`3*$SZ+0`],%l6 740 741 sllx %l0,32,$tmp0 742 ld [$ctx+`3*$SZ+4`],%l7 743 sllx %l2,32,$tmp1 744 or %l1,$tmp0,$tmp0 745 or %l3,$tmp1,$tmp1 746 add $tmp0,$A,$A 747 add $tmp1,$B,$B 748 $ST $A,[$ctx+`0*$SZ`] 749 sllx %l4,32,$tmp2 750 $ST $B,[$ctx+`1*$SZ`] 751 sllx %l6,32,$T1 752 or %l5,$tmp2,$tmp2 753 or %l7,$T1,$T1 754 add $tmp2,$C,$C 755 $ST $C,[$ctx+`2*$SZ`] 756 add $T1,$D,$D 757 $ST $D,[$ctx+`3*$SZ`] 758 759 ld [$ctx+`4*$SZ+0`],%l0 760 ld [$ctx+`4*$SZ+4`],%l1 761 ld [$ctx+`5*$SZ+0`],%l2 762 ld [$ctx+`5*$SZ+4`],%l3 763 ld [$ctx+`6*$SZ+0`],%l4 764 ld [$ctx+`6*$SZ+4`],%l5 765 ld [$ctx+`7*$SZ+0`],%l6 766 767 sllx %l0,32,$tmp0 768 ld [$ctx+`7*$SZ+4`],%l7 769 sllx %l2,32,$tmp1 770 or %l1,$tmp0,$tmp0 771 or %l3,$tmp1,$tmp1 772 add $tmp0,$E,$E 773 add $tmp1,$F,$F 774 $ST $E,[$ctx+`4*$SZ`] 775 sllx %l4,32,$tmp2 776 $ST $F,[$ctx+`5*$SZ`] 777 sllx %l6,32,$T1 778 or %l5,$tmp2,$tmp2 779 or %l7,$T1,$T1 780 add $tmp2,$G,$G 781 $ST $G,[$ctx+`6*$SZ`] 782 add $T1,$H,$H 783 $ST $H,[$ctx+`7*$SZ`] 784___ 785$code.=<<___; 786 add $inp,`16*$SZ`,$inp ! 
advance inp 787 cmp $inp,$len 788 bne SIZE_T_CC,.Lloop 789 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl 790 791 ret 792 restore 793.type sha${label}_block_data_order,#function 794.size sha${label}_block_data_order,(.-sha${label}_block_data_order) 795.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 796.align 4 797___ 798 799# Purpose of these subroutines is to explicitly encode VIS instructions, 800# so that one can compile the module without having to specify VIS 801# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. 802# Idea is to reserve for option to produce "universal" binary and let 803# programmer detect if current CPU is VIS capable at run-time. 804sub unvis { 805my ($mnemonic,$rs1,$rs2,$rd)=@_; 806my $ref,$opf; 807my %visopf = ( "faligndata" => 0x048, 808 "for" => 0x07c ); 809 810 $ref = "$mnemonic\t$rs1,$rs2,$rd"; 811 812 if ($opf=$visopf{$mnemonic}) { 813 foreach ($rs1,$rs2,$rd) { 814 return $ref if (!/%f([0-9]{1,2})/); 815 $_=$1; 816 if ($1>=32) { 817 return $ref if ($1&1); 818 # re-encode for upper double register addressing 819 $_=($1|$1>>5)&31; 820 } 821 } 822 823 return sprintf ".word\t0x%08x !%s", 824 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, 825 $ref; 826 } else { 827 return $ref; 828 } 829} 830sub unalignaddr { 831my ($mnemonic,$rs1,$rs2,$rd)=@_; 832my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); 833my $ref="$mnemonic\t$rs1,$rs2,$rd"; 834 835 foreach ($rs1,$rs2,$rd) { 836 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; } 837 else { return $ref; } 838 } 839 return sprintf ".word\t0x%08x !%s", 840 0x81b00300|$rd<<25|$rs1<<14|$rs2, 841 $ref; 842} 843 844foreach (split("\n",$code)) { 845 s/\`([^\`]*)\`/eval $1/ge; 846 847 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ 848 &unvis($1,$2,$3,$4) 849 /ge; 850 s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ 851 &unalignaddr($1,$2,$3,$4) 852 /ge; 853 854 print $_,"\n"; 855} 856 857close STDOUT; 858