#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
# (*)	Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#	in-order, i.e. load instruction has to complete prior next
#	instruction in given thread is executed, even if the latter is
#	not dependent on load result! This means that on T1 two 32-bit
#	loads are always slower than one 64-bit load. Once again this
#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#	2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.

# The output file name comes from the command line; whether SHA256 or
# SHA512 code is generated is keyed off "512" appearing in that name.
$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";	# was unchecked

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
# Scratch registers shared by both flavours.
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

# Incoming arguments of sha${label}_block_data_order (SPARC %i regs).
$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
# $Xload emits code for rounds 0..15: it leaves h+X[i] in $T1 for
# BODY_00_15. At $i==0 it also emits the bulk load of the 64-byte input
# block via eight 64-bit ldx-es; when the input was not 8-byte aligned
# ($tmp31 = misalignment in bits, set up in .Lsoftware) a shift-and-merge
# fixup sequence runs before .Laligned. Two 32-bit X words are packed per
# 64-bit register, hence the even/odd ($i&1) handling at the bottom.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

	if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
	}

	if ($i&1) {
		$code.="\tadd	@X[$i/2],$h,$T1\n";
	} else {
		$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
	}
} if ($SZ==4);

########### SHA512
# SHA512 counterpart of $Xload: X[16] lives on the stack frame and each
# 64-bit X word is assembled from two 32-bit halves staged through
# %l0-%l7 (the 32-/64-bit ABI duality trade-off discussed in the header
# comments). $tmp31/$tmp32 again carry the misalignment shift amounts;
# the $i==12 case conditionally pre-loads the word past the block for
# the unaligned path (bnz tests the cmp $tmp31,0 from $i==0).
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
# One SHA round, shared by both flavours ($SRL/$SLL/$LD expand to the
# width-appropriate mnemonics). Expects $T1 to hold h plus the message
# word contribution (prepared by $Xload for rounds <16, or by the "add"
# below for rounds >=16); computes Sigma1(e), Ch(e,f,g), Sigma0(a),
# Maj(a,b,c), adds K[$i] and updates d and h per FIPS 180.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

	if ($i<16) {
		&$Xload(@_);
	} else {
		$code.="\tadd	$h,$T1,$T1\n";
	}

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
# Message schedule update for rounds 16..63, then falls through to the
# common round body. Two 32-bit X words are packed per 64-bit register,
# hence the even/odd ($i&1) half selection and the srlx/sllx extraction
# around the sigma0/sigma1 computations.
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

	if ($i&1) {
		$xi=$tmp32;
		$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
	} else {
		$xi=@X[(($i+1)/2)%8];
	}
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
	if ($i&1) {
		$xi=@X[(($i+14)/2)%8];
	} else {
		$xi=$tmp32;
		$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
	}
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
	if ($i&1) {
		$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
	} else {
		$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
	}
	&BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
# Message schedule update for rounds 16..79: X[i] = sigma0(X[i+1]) +
# sigma1(X[i+14]) + X[i+9] + X[i], operating on stack-resident X[]
# staged through the 32-bit halves in %l0-%l7, then falls through to
# the common round body.
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
	&BODY_00_15(@_);
} if ($SZ==8);

# Start of the generated module: arch header, scratch-register
# directives and the K${label} constant table (FIPS 180 round constants).
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
# Function entry: probe OPENSSL_sparcv9cap_P[1] for the CFR_SHA${label}
# hardware-capability bit and branch to .Lsoftware when it is absent.
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
# SPARC T4 hardware path, SHA512 flavour: context is held in %f0-%f14
# and each loop iteration runs one 128-byte block through the SHA512
# instruction (emitted as ".word 0x81b02860"); unaligned input is
# handled with alignaddr/faligndata below.
$code.=<<___ if ($SZ==8); # SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	std	%f14, [%o0 + 0x38]

.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
553 ldd [%o1 + 0x80], %f50 554 add %o1, 0x80, %o1 555 prefetch [%o1 + 63], 20 556 prefetch [%o1 + 64+63], 20 557 558 faligndata %f18, %f20, %f16 559 faligndata %f20, %f22, %f18 560 faligndata %f22, %f24, %f20 561 faligndata %f24, %f26, %f22 562 faligndata %f26, %f28, %f24 563 faligndata %f28, %f30, %f26 564 faligndata %f30, %f32, %f28 565 faligndata %f32, %f34, %f30 566 faligndata %f34, %f36, %f32 567 faligndata %f36, %f38, %f34 568 faligndata %f38, %f40, %f36 569 faligndata %f40, %f42, %f38 570 faligndata %f42, %f44, %f40 571 faligndata %f44, %f46, %f42 572 faligndata %f46, %f48, %f44 573 faligndata %f48, %f50, %f46 574 575 .word 0x81b02860 ! SHA512 576 577 bne,pt SIZE_T_CC, .Lhwunaligned_loop 578 for %f50, %f50, %f18 ! %f18=%f50 579 580 ba .Lhwfinish 581 nop 582___ 583$code.=<<___ if ($SZ==4); # SHA256 584 ld [%o0 + 0x00], %f0 585 ld [%o0 + 0x04], %f1 586 ld [%o0 + 0x08], %f2 587 ld [%o0 + 0x0c], %f3 588 ld [%o0 + 0x10], %f4 589 ld [%o0 + 0x14], %f5 590 andcc %o1, 0x7, %g0 591 ld [%o0 + 0x18], %f6 592 bne,pn %icc, .Lhwunaligned 593 ld [%o0 + 0x1c], %f7 594 595.Lhwloop: 596 ldd [%o1 + 0x00], %f8 597 ldd [%o1 + 0x08], %f10 598 ldd [%o1 + 0x10], %f12 599 ldd [%o1 + 0x18], %f14 600 ldd [%o1 + 0x20], %f16 601 ldd [%o1 + 0x28], %f18 602 ldd [%o1 + 0x30], %f20 603 subcc %o2, 1, %o2 ! done yet? 604 ldd [%o1 + 0x38], %f22 605 add %o1, 0x40, %o1 606 prefetch [%o1 + 63], 20 607 608 .word 0x81b02840 ! SHA256 609 610 bne,pt SIZE_T_CC, .Lhwloop 611 nop 612 613.Lhwfinish: 614 st %f0, [%o0 + 0x00] ! 
store context 615 st %f1, [%o0 + 0x04] 616 st %f2, [%o0 + 0x08] 617 st %f3, [%o0 + 0x0c] 618 st %f4, [%o0 + 0x10] 619 st %f5, [%o0 + 0x14] 620 st %f6, [%o0 + 0x18] 621 retl 622 st %f7, [%o0 + 0x1c] 623 624.align 8 625.Lhwunaligned: 626 alignaddr %o1, %g0, %o1 627 628 ldd [%o1 + 0x00], %f10 629.Lhwunaligned_loop: 630 ldd [%o1 + 0x08], %f12 631 ldd [%o1 + 0x10], %f14 632 ldd [%o1 + 0x18], %f16 633 ldd [%o1 + 0x20], %f18 634 ldd [%o1 + 0x28], %f20 635 ldd [%o1 + 0x30], %f22 636 ldd [%o1 + 0x38], %f24 637 subcc %o2, 1, %o2 ! done yet? 638 ldd [%o1 + 0x40], %f26 639 add %o1, 0x40, %o1 640 prefetch [%o1 + 63], 20 641 642 faligndata %f10, %f12, %f8 643 faligndata %f12, %f14, %f10 644 faligndata %f14, %f16, %f12 645 faligndata %f16, %f18, %f14 646 faligndata %f18, %f20, %f16 647 faligndata %f20, %f22, %f18 648 faligndata %f22, %f24, %f20 649 faligndata %f24, %f26, %f22 650 651 .word 0x81b02840 ! SHA256 652 653 bne,pt SIZE_T_CC, .Lhwunaligned_loop 654 for %f26, %f26, %f10 ! %f10=%f26 655 656 ba .Lhwfinish 657 nop 658___ 659$code.=<<___; 660.align 16 661.Lsoftware: 662 save %sp,-STACK_FRAME-$locals,%sp 663 and $inp,`$align-1`,$tmp31 664 sllx $len,`log(16*$SZ)/log(2)`,$len 665 andn $inp,`$align-1`,$inp 666 sll $tmp31,3,$tmp31 667 add $inp,$len,$len 668___ 669$code.=<<___ if ($SZ==8); # SHA512 670 mov 32,$tmp32 671 sub $tmp32,$tmp31,$tmp32 672___ 673$code.=<<___; 674.Lpic: call .+8 675 add %o7,K${label}-.Lpic,$Ktbl 676 677 $LD [$ctx+`0*$SZ`],$A 678 $LD [$ctx+`1*$SZ`],$B 679 $LD [$ctx+`2*$SZ`],$C 680 $LD [$ctx+`3*$SZ`],$D 681 $LD [$ctx+`4*$SZ`],$E 682 $LD [$ctx+`5*$SZ`],$F 683 $LD [$ctx+`6*$SZ`],$G 684 $LD [$ctx+`7*$SZ`],$H 685 686.Lloop: 687___ 688for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } 689$code.=".L16_xx:\n"; 690for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); } 691$code.=<<___; 692 and $tmp2,0xfff,$tmp2 693 cmp $tmp2,$lastK 694 bne .L16_xx 695 add $Ktbl,`16*$SZ`,$Ktbl ! 
Ktbl+=16 696 697___ 698$code.=<<___ if ($SZ==4); # SHA256 699 $LD [$ctx+`0*$SZ`],@X[0] 700 $LD [$ctx+`1*$SZ`],@X[1] 701 $LD [$ctx+`2*$SZ`],@X[2] 702 $LD [$ctx+`3*$SZ`],@X[3] 703 $LD [$ctx+`4*$SZ`],@X[4] 704 $LD [$ctx+`5*$SZ`],@X[5] 705 $LD [$ctx+`6*$SZ`],@X[6] 706 $LD [$ctx+`7*$SZ`],@X[7] 707 708 add $A,@X[0],$A 709 $ST $A,[$ctx+`0*$SZ`] 710 add $B,@X[1],$B 711 $ST $B,[$ctx+`1*$SZ`] 712 add $C,@X[2],$C 713 $ST $C,[$ctx+`2*$SZ`] 714 add $D,@X[3],$D 715 $ST $D,[$ctx+`3*$SZ`] 716 add $E,@X[4],$E 717 $ST $E,[$ctx+`4*$SZ`] 718 add $F,@X[5],$F 719 $ST $F,[$ctx+`5*$SZ`] 720 add $G,@X[6],$G 721 $ST $G,[$ctx+`6*$SZ`] 722 add $H,@X[7],$H 723 $ST $H,[$ctx+`7*$SZ`] 724___ 725$code.=<<___ if ($SZ==8); # SHA512 726 ld [$ctx+`0*$SZ+0`],%l0 727 ld [$ctx+`0*$SZ+4`],%l1 728 ld [$ctx+`1*$SZ+0`],%l2 729 ld [$ctx+`1*$SZ+4`],%l3 730 ld [$ctx+`2*$SZ+0`],%l4 731 ld [$ctx+`2*$SZ+4`],%l5 732 ld [$ctx+`3*$SZ+0`],%l6 733 734 sllx %l0,32,$tmp0 735 ld [$ctx+`3*$SZ+4`],%l7 736 sllx %l2,32,$tmp1 737 or %l1,$tmp0,$tmp0 738 or %l3,$tmp1,$tmp1 739 add $tmp0,$A,$A 740 add $tmp1,$B,$B 741 $ST $A,[$ctx+`0*$SZ`] 742 sllx %l4,32,$tmp2 743 $ST $B,[$ctx+`1*$SZ`] 744 sllx %l6,32,$T1 745 or %l5,$tmp2,$tmp2 746 or %l7,$T1,$T1 747 add $tmp2,$C,$C 748 $ST $C,[$ctx+`2*$SZ`] 749 add $T1,$D,$D 750 $ST $D,[$ctx+`3*$SZ`] 751 752 ld [$ctx+`4*$SZ+0`],%l0 753 ld [$ctx+`4*$SZ+4`],%l1 754 ld [$ctx+`5*$SZ+0`],%l2 755 ld [$ctx+`5*$SZ+4`],%l3 756 ld [$ctx+`6*$SZ+0`],%l4 757 ld [$ctx+`6*$SZ+4`],%l5 758 ld [$ctx+`7*$SZ+0`],%l6 759 760 sllx %l0,32,$tmp0 761 ld [$ctx+`7*$SZ+4`],%l7 762 sllx %l2,32,$tmp1 763 or %l1,$tmp0,$tmp0 764 or %l3,$tmp1,$tmp1 765 add $tmp0,$E,$E 766 add $tmp1,$F,$F 767 $ST $E,[$ctx+`4*$SZ`] 768 sllx %l4,32,$tmp2 769 $ST $F,[$ctx+`5*$SZ`] 770 sllx %l6,32,$T1 771 or %l5,$tmp2,$tmp2 772 or %l7,$T1,$T1 773 add $tmp2,$G,$G 774 $ST $G,[$ctx+`6*$SZ`] 775 add $T1,$H,$H 776 $ST $H,[$ctx+`7*$SZ`] 777___ 778$code.=<<___; 779 add $inp,`16*$SZ`,$inp ! 
advance inp 780 cmp $inp,$len 781 bne SIZE_T_CC,.Lloop 782 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl 783 784 ret 785 restore 786.type sha${label}_block_data_order,#function 787.size sha${label}_block_data_order,(.-sha${label}_block_data_order) 788.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 789.align 4 790___ 791 792# Purpose of these subroutines is to explicitly encode VIS instructions, 793# so that one can compile the module without having to specify VIS 794# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. 795# Idea is to reserve for option to produce "universal" binary and let 796# programmer detect if current CPU is VIS capable at run-time. 797sub unvis { 798my ($mnemonic,$rs1,$rs2,$rd)=@_; 799my $ref,$opf; 800my %visopf = ( "faligndata" => 0x048, 801 "for" => 0x07c ); 802 803 $ref = "$mnemonic\t$rs1,$rs2,$rd"; 804 805 if ($opf=$visopf{$mnemonic}) { 806 foreach ($rs1,$rs2,$rd) { 807 return $ref if (!/%f([0-9]{1,2})/); 808 $_=$1; 809 if ($1>=32) { 810 return $ref if ($1&1); 811 # re-encode for upper double register addressing 812 $_=($1|$1>>5)&31; 813 } 814 } 815 816 return sprintf ".word\t0x%08x !%s", 817 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, 818 $ref; 819 } else { 820 return $ref; 821 } 822} 823sub unalignaddr { 824my ($mnemonic,$rs1,$rs2,$rd)=@_; 825my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); 826my $ref="$mnemonic\t$rs1,$rs2,$rd"; 827 828 foreach ($rs1,$rs2,$rd) { 829 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; } 830 else { return $ref; } 831 } 832 return sprintf ".word\t0x%08x !%s", 833 0x81b00300|$rd<<25|$rs1<<14|$rs2, 834 $ref; 835} 836 837foreach (split("\n",$code)) { 838 s/\`([^\`]*)\`/eval $1/ge; 839 840 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ 841 &unvis($1,$2,$3,$4) 842 /ge; 843 s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ 844 &unalignaddr($1,$2,$3,$4) 845 /ge; 846 847 print $_,"\n"; 848} 849 850close STDOUT; 851