#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 for PowerISA v2.07.
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This module is
# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is degree of computational resources' utilization. POWER8 is
# "massively multi-threaded chip" and difference between single- and
# maximum multi-process benchmark results tells that utilization is
# whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
# to single-process one, given that all threads end up on the same
# physical core.

# Usage: <this script> <flavour> <output>
#
# Both arguments are taken from the command line and handed on verbatim
# to ppc-xlate.pl, through which all generated code is piped.
#   <flavour>  must contain "64" or "32" (pointer size); "le" selects the
#              little-endian code paths, "osx" changes how $x00 is spelled.
#   <output>   if it contains "512" the SHA-512 variant is generated,
#              otherwise SHA-256.
$flavour=shift;
$output =shift;

# Pointer size, link-register save offset and the matching
# store-with-update/load/store mnemonics for the selected flavour.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

$LENDIAN=($flavour=~/le/);

# Look for ppc-xlate.pl next to this script, then in ../../perlasm
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Low-precedence "or" is required here: "||" binds tighter than the list
# operator open(), so "... || die" would test the (always true) command
# string and the die could never fire.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Per-variant parameters: word size in bytes, mnemonic suffix and the
# number of rounds.
if ($output =~ /512/) {
	$bits=512;
	$SZ=8;
	$sz="d";
	$rounds=80;
} else {
	$bits=256;
	$SZ=4;
	$sz="w";
	$rounds=64;
}

$func="sha${bits}_block_p8";
$FRAME=8*$SIZE_T;

# General-purpose register assignment.
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$Tbl="r6";
$idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
# Registers used as index operands; loaded below with 0x10..0x70.
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
   $x00=0 if ($flavour =~ /osx/);

# Vector register assignment: working variables, 16-entry message
# window and temporaries.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..23));
($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));

# ROUND($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the code for round $i to $code, with $a..$h naming the vector
# registers that hold the working variables in this round.
#
# For $i<16 the input words are brought into @X on the fly: a fresh
# 16-byte load every 16/$SZ rounds, vsldoi to derive the words in
# between, and a vperm through $lemask on little-endian flavours.
# For $i>=15 the backtick-quoted fragments additionally update
# @X[($i+1)%16] from @X entries at offsets +1, +9 and +14 (mod 16).
# Those fragments are Perl expressions; they are evaluated by the
# substitution at the very end of this script.
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;

$code.=<<___		if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
	addi		$inp,$inp,16
___
$code.=<<___		if ($i<16 && ($i%(16/$SZ)));
	vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
___
$code.=<<___		if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
	vperm		@X[$i],@X[$i],@X[$i],$lemask
___
$code.=<<___;
	`"vshasigma${sz}	$s0,@X[($j+1)%16],0,0"		if ($i>=15)`
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vshasigma${sz}	$S1,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
	vshasigma${sz}	$S0,$a,1,0		; Sigma0(a)
	`"vshasigma${sz}	$s1,@X[($j+14)%16],0,15"	if ($i>=15)`
	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
	vxor		$Func,$a,$b
	`"vaddu${sz}m		@X[$j],@X[$j],@X[($j+9)%16]"	if ($i>=15)`
	vaddu${sz}m	$h,$h,$S1		; h+=Sigma1(e)
	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$d,$d,$h		; d+=h
	vaddu${sz}m	$S0,$S0,$Func		; Sigma0(a)+Maj(a,b,c)
	`"vaddu${sz}m		@X[$j],@X[$j],$s0"		if ($i>=15)`
	lvx		$Ki,$idx,$Tbl		; load next K[i]
	addi		$idx,$idx,16
	vaddu${sz}m	$h,$h,$S0		; h+=Sigma0(a)+Maj(a,b,c)
	`"vaddu${sz}m		@X[$j],@X[$j],$s1"		if ($i>=15)`
___
}

# Function prologue: allocate the frame, save v20-v31, vrsave, r26-r31
# and the link register, load the index constants and fetch the table
# address through LPICmeup.
$code=<<___;
.machine	"any"
.text

.globl	$func
.align	6
$func:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	mflr		$lrsave
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	mfspr		$vrsave,256
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r11,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	$PUSH		$lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
	mtspr		256,r11

	bl		LPICmeup
	addi		$offload,$sp,$FRAME+15
___
# Little-endian only: build the permutation mask that ROUND applies to
# freshly loaded input vectors.
$code.=<<___		if ($LENDIAN);
	li		$idx,8
	lvsl		$lemask,0,$idx
	vspltisb	$Ki,0x0f
	vxor		$lemask,$lemask,$Ki
___
# Load the hash state from $ctx and spread it over $A..$H.
$code.=<<___		if ($SZ==4);
	lvx_4w		$A,$x00,$ctx
	lvx_4w		$E,$x10,$ctx
	vsldoi		$B,$A,$A,4		# unpack
	vsldoi		$C,$A,$A,8
	vsldoi		$D,$A,$A,12
	vsldoi		$F,$E,$E,4
	vsldoi		$G,$E,$E,8
	vsldoi		$H,$E,$E,12
___
$code.=<<___		if ($SZ==8);
	lvx_u		$A,$x00,$ctx
	lvx_u		$C,$x10,$ctx
	lvx_u		$E,$x20,$ctx
	vsldoi		$B,$A,$A,8		# unpack
	lvx_u		$G,$x30,$ctx
	vsldoi		$D,$C,$C,8
	vsldoi		$F,$E,$E,8
	vsldoi		$H,$G,$G,8
___
# Per-block loop head: save the incoming $A..$H to the stack area at
# $offload and prime $Ki with the first two table entries.
$code.=<<___;
	li		r0,`($rounds-16)/16`	# inner loop counter
	b		Loop
.align	5
Loop:
	lvx		$Ki,$x00,$Tbl
	li		$idx,16
	lvx_u		@X[0],0,$inp
	addi		$inp,$inp,16
	stvx		$A,$x00,$offload	# offload $A-$H
	stvx		$B,$x10,$offload
	stvx		$C,$x20,$offload
	stvx		$D,$x30,$offload
	stvx		$E,$x40,$offload
	stvx		$F,$x50,$offload
	stvx		$G,$x60,$offload
	stvx		$H,$x70,$offload
	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
	lvx		$Ki,$idx,$Tbl
	addi		$idx,$idx,16
___
# Rounds 0..15 are emitted straight-line. Rotating @V after every round
# renames the registers so that each ROUND call sees its own a..h.
for ($i=0;$i<16;$i++)	{ ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mtctr		r0
	b		L16_xx
.align	5
L16_xx:
___
# Rounds 16..31 form the body of the counted loop L16_xx, which the
# generated code executes ($rounds-16)/16 times.
for ($i=16;$i<32;$i++)	{ ROUND($i,@V); unshift(@V,pop(@V)); }
# Add the values saved at $offload back into $A..$H and loop while
# $num is not exhausted.
$code.=<<___;
	bdnz		L16_xx

	lvx		@X[2],$x00,$offload
	subic.		$num,$num,1
	lvx		@X[3],$x10,$offload
	vaddu${sz}m	$A,$A,@X[2]
	lvx		@X[4],$x20,$offload
	vaddu${sz}m	$B,$B,@X[3]
	lvx		@X[5],$x30,$offload
	vaddu${sz}m	$C,$C,@X[4]
	lvx		@X[6],$x40,$offload
	vaddu${sz}m	$D,$D,@X[5]
	lvx		@X[7],$x50,$offload
	vaddu${sz}m	$E,$E,@X[6]
	lvx		@X[8],$x60,$offload
	vaddu${sz}m	$F,$F,@X[7]
	lvx		@X[9],$x70,$offload
	vaddu${sz}m	$G,$G,@X[8]
	vaddu${sz}m	$H,$H,@X[9]
	bne		Loop
___
# Merge $A..$H back into the in-memory layout and store to $ctx. The
# vperm masks are the entries that follow the round constants in the
# table emitted further down.
$code.=<<___		if ($SZ==4);
	lvx		@X[0],$idx,$Tbl
	addi		$idx,$idx,16
	vperm		$A,$A,$B,$Ki		# pack the answer
	lvx		@X[1],$idx,$Tbl
	vperm		$E,$E,$F,$Ki
	vperm		$A,$A,$C,@X[0]
	vperm		$E,$E,$G,@X[0]
	vperm		$A,$A,$D,@X[1]
	vperm		$E,$E,$H,@X[1]
	stvx_4w		$A,$x00,$ctx
	stvx_4w		$E,$x10,$ctx
___
$code.=<<___		if ($SZ==8);
	vperm		$A,$A,$B,$Ki		# pack the answer
	vperm		$C,$C,$D,$Ki
	vperm		$E,$E,$F,$Ki
	vperm		$G,$G,$H,$Ki
	stvx_u		$A,$x00,$ctx
	stvx_u		$C,$x10,$ctx
	stvx_u		$E,$x20,$ctx
	stvx_u		$G,$x30,$ctx
___
# Function epilogue: restore everything the prologue saved and return.
$code.=<<___;
	li		r10,`$FRAME+8*16+15`
	mtlr		$lrsave
	li		r11,`$FRAME+8*16+31`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,4,1,0x80,6,3,0
	.long		0
.size	$func,.-$func
___

# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
# LPICmeup leaves in $Tbl the address of the first table entry, which
# is emitted immediately after this routine.
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
	addi	$Tbl,$Tbl,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
___

# Constant table. Every constant is replicated to fill a whole 16-byte
# entry, a zero entry follows the last one, and the permutation mask(s)
# used for packing the result come last.
if ($SZ==8) {
    local *table = sub {
	foreach(@_) { $code.=".quad $_,$_\n"; }
    };
    table(
	"0x428a2f98d728ae22","0x7137449123ef65cd",
	"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
	"0x3956c25bf348b538","0x59f111f1b605d019",
	"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
	"0xd807aa98a3030242","0x12835b0145706fbe",
	"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
	"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
	"0x9bdc06a725c71235","0xc19bf174cf692694",
	"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
	"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
	"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
	"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
	"0x983e5152ee66dfab","0xa831c66d2db43210",
	"0xb00327c898fb213f","0xbf597fc7beef0ee4",
	"0xc6e00bf33da88fc2","0xd5a79147930aa725",
	"0x06ca6351e003826f","0x142929670a0e6e70",
	"0x27b70a8546d22ffc","0x2e1b21385c26c926",
	"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
	"0x650a73548baf63de","0x766a0abb3c77b2a8",
	"0x81c2c92e47edaee6","0x92722c851482353b",
	"0xa2bfe8a14cf10364","0xa81a664bbc423001",
	"0xc24b8b70d0f89791","0xc76c51a30654be30",
	"0xd192e819d6ef5218","0xd69906245565a910",
	"0xf40e35855771202a","0x106aa07032bbd1b8",
	"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
	"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
	"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
	"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
	"0x748f82ee5defb2fc","0x78a5636f43172f60",
	"0x84c87814a1f0ab72","0x8cc702081a6439ec",
	"0x90befffa23631e28","0xa4506cebde82bde9",
	"0xbef9a3f7b2c67915","0xc67178f2e372532b",
	"0xca273eceea26619c","0xd186b8c721c0c207",
	"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
	"0x06f067aa72176fba","0x0a637dc5a2c898a6",
	"0x113f9804bef90dae","0x1b710b35131c471b",
	"0x28db77f523047d84","0x32caab7b40c72493",
	"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
	"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
	"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
$code.=<<___	if (!$LENDIAN);
.quad	0x0001020304050607,0x1011121314151617
___
$code.=<<___	if ($LENDIAN);	# quad-swapped
.quad	0x1011121314151617,0x0001020304050607
___
} else {
    local *table = sub {
	foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
    };
    table(
	"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
	"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
	"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
	"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
	"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
	"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
	"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
	"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
	"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
	"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
	"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
	"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
	"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
	"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
	"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
	"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
$code.=<<___	if (!$LENDIAN);
.long	0x00010203,0x10111213,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x08090a0b,0x10111213
___
$code.=<<___	if ($LENDIAN);	# word-swapped
.long	0x10111213,0x10111213,0x10111213,0x00010203
.long	0x10111213,0x10111213,0x04050607,0x00010203
.long	0x10111213,0x08090a0b,0x04050607,0x00010203
___
}
$code.=<<___;
.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Evaluate every backtick-quoted fragment as a Perl expression and
# splice the result into the text (constant folding of offsets and the
# conditional instructions emitted by ROUND).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# close() on a pipe flushes the remaining output and waits for the
# child, so this is where a write error or a non-zero exit status of
# ppc-xlate.pl becomes visible. Do not let it pass silently.
close STDOUT
	or die "error closing pipe to $xlate: ".($! ? "$!" : "wait status $?")."\n";