#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA2 block procedures for MIPS.

# October 2010.
#
# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
# for now can only be compiled for MIPS64 ISA] improvement is modest
# ~17%, but it comes for free, because it's same instruction sequence.
# Improvement coefficients are for aligned input.

######################################################################
# There are a number of MIPS ABIs in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift;	# supported flavours are o32,n32,64,nubi32,nubi64

if ($flavour =~ /64/i) {
	$LA="dla";
} else {
	$LA="la";
}

if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
	$PTR_SUB="dsub";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$PTR_SLL="dsll";	# incidentally works even on n32
	$SZREG=8;
} else {
	$PTR_ADD="add";
	$PTR_SUB="sub";
	$REG_S="sw";
	$REG_L="lw";
	$PTR_SLL="sll";
	$SZREG=4;
}
$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
# <appro@openssl.org>
#
######################################################################

# Ask the target compiler for its endianness: on a little-endian target
# the preprocessor replaces the MIPSEL token, so the token surviving means
# big-endian. Only probe when CC is actually set; otherwise leave
# $big_endian undefined so the host-byte-order fallback below can run
# (previously the probe always produced 0 or 1 and the fallback was dead).
$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0 if ($ENV{CC});

# The output file is the first-and-only argument that looks like
# "name.ext"; its name also selects SHA-512 vs SHA-256 below.
for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
# Redirect STDOUT only when an output file was named, and fail loudly
# instead of silently losing the generated code.
if (defined($output)) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# No compiler to ask: fall back to the byte order of the host running
# this script.
if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ld";		# load from memory
	$ST="sd";		# store to memory
	$SLL="dsll";		# shift left logical
	$SRL="dsrl";		# shift right logical
	$ADDU="daddu";
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;		# low 12 bits of the last K512 entry
	$rounds=80;
} else {
	$label="256";
	$SZ=4;
	$LD="lw";		# load from memory
	$ST="sw";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	$ADDU="addu";
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;		# low 12 bits of the last K256 entry
	$rounds=64;
}

# Byte offsets used by the load-left/load-right pair that fetches each
# input word.
$MSB = $big_endian ? 0 : ($SZ-1);
$LSB = ($SZ-1)&~$MSB;

# Working variables a..h and the 16-entry message schedule window.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
@X=map("\$$_",(8..23));

$ctx=$a0;
$inp=$a1;
$len=$a2;	$Ktbl=$len;	# $len is consumed in the prologue, then reused

# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for round $i to $code. The eight register names
# are the working variables in their current rotation; the caller rotates
# @V and @X after every call, so @X[0] is always the current schedule
# word. The emitted code:
#   - for $i<15, starts loading the next input word into @X[1];
#   - on little-endian targets, for $i<16, byte-swaps @X[0];
#   - computes T1 = X[0] + h + Sigma1(e) + Ch(e,f,g) + K[$i],
#     then h = Sigma0(a) + Maj(a,b,c) + T1 and d += T1;
#   - stores @X[0] into the ring buffer at the bottom of the frame;
#   - for $i>=13, reloads @X[3] from the ring buffer, because @X[4..7]
#     double as the temporaries $T1/$tmp0..$tmp2 and get clobbered.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);

$code.=<<___ if ($i<15);
	${LD}l	@X[1],`($i+1)*$SZ+$MSB`($inp)
	${LD}r	@X[1],`($i+1)*$SZ+$LSB`($inp)
___
$code.=<<___	if (!$big_endian && $i<16 && $SZ==4);
	srl	$tmp0,@X[0],24		# byte swap($i)
	srl	$tmp1,@X[0],8
	andi	$tmp2,@X[0],0xFF00
	sll	@X[0],@X[0],24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	@X[0],$tmp0
	or	$tmp1,$tmp2
	or	@X[0],$tmp1
___
$code.=<<___	if (!$big_endian && $i<16 && $SZ==8);
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF
	and	$tmp1,@X[0],$tmp0	# byte swap($i)
	dsrl	$tmp2,@X[0],24
	dsll	$tmp1,24
	and	$tmp2,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	and	$tmp2,@X[0],$tmp0
	dsrl	@X[0],8
	dsll	$tmp2,8
	and	@X[0],$tmp0
	or	$tmp1,$tmp2
	or	@X[0],$tmp1
	dsrl	$tmp1,@X[0],32
	dsll	@X[0],32
	or	@X[0],$tmp1
___
$code.=<<___;
	$ADDU	$T1,$X[0],$h			# $i
	$SRL	$h,$e,@Sigma1[0]
	xor	$tmp2,$f,$g
	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[2]`
	and	$tmp2,$e
	$SRL	$tmp0,$e,@Sigma1[1]
	xor	$h,$tmp1
	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[1]`
	xor	$h,$tmp0
	$SRL	$tmp0,$e,@Sigma1[2]
	xor	$h,$tmp1
	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[0]`
	xor	$h,$tmp0
	xor	$tmp2,$g			# Ch(e,f,g)
	xor	$tmp0,$tmp1,$h			# Sigma1(e)

	$SRL	$h,$a,@Sigma0[0]
	$ADDU	$T1,$tmp2
	$LD	$tmp2,`$i*$SZ`($Ktbl)		# K[$i]
	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[2]`
	$ADDU	$T1,$tmp0
	$SRL	$tmp0,$a,@Sigma0[1]
	xor	$h,$tmp1
	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[1]`
	xor	$h,$tmp0
	$SRL	$tmp0,$a,@Sigma0[2]
	xor	$h,$tmp1
	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[0]`
	xor	$h,$tmp0
	$ST	@X[0],`($i%16)*$SZ`($sp)	# offload to ring buffer
	xor	$h,$tmp1			# Sigma0(a)

	or	$tmp0,$a,$b
	and	$tmp1,$a,$b
	and	$tmp0,$c
	or	$tmp1,$tmp0			# Maj(a,b,c)
	$ADDU	$T1,$tmp2			# +=K[$i]
	$ADDU	$h,$tmp1

	$ADDU	$d,$T1
	$ADDU	$h,$T1
___
$code.=<<___ if ($i>=13);
	$LD	@X[3],`(($i+3)%16)*$SZ`($sp)	# prefetch from ring buffer
___
}

# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for a round with $i>=16: first the message
# schedule update X[0] += sigma0(X[1]) + X[9] + sigma1(X[14]), then the
# common round body via BODY_00_15 with the same arguments.
sub BODY_16_XX {
my $i=@_[0];
my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);

$code.=<<___;
	$SRL	$tmp2,@X[1],@sigma0[0]		# Xupdate($i)
	$ADDU	@X[0],@X[9]			# +=X[i+9]
	$SLL	$tmp1,@X[1],`$SZ*8-@sigma0[2]`
	$SRL	$tmp0,@X[1],@sigma0[1]
	xor	$tmp2,$tmp1
	$SLL	$tmp1,`@sigma0[2]-@sigma0[1]`
	xor	$tmp2,$tmp0
	$SRL	$tmp0,@X[1],@sigma0[2]
	xor	$tmp2,$tmp1

	$SRL	$tmp3,@X[14],@sigma1[0]
	xor	$tmp2,$tmp0			# sigma0(X[i+1])
	$SLL	$tmp1,@X[14],`$SZ*8-@sigma1[2]`
	$ADDU	@X[0],$tmp2
	$SRL	$tmp0,@X[14],@sigma1[1]
	xor	$tmp3,$tmp1
	$SLL	$tmp1,`@sigma1[2]-@sigma1[1]`
	xor	$tmp3,$tmp0
	$SRL	$tmp0,@X[14],@sigma1[2]
	xor	$tmp3,$tmp1

	xor	$tmp3,$tmp0			# sigma1(X[i+14])
	$ADDU	@X[0],$tmp3
___
	&BODY_00_15(@_);
}

# Frame layout, from $sp upwards: 16*$SZ bytes of schedule ring buffer,
# then 16 register-sized slots. The lowest of those slots (offset
# 16*$SZ) holds the end-of-input pointer; the remaining ones hold the
# saved registers, addressed downwards from $FRAMESIZE.
$FRAMESIZE=16*$SZ+16*$SZREG;
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;

$code.=<<___;
.text
.set	noat
#if !defined(__vxworks) || defined(__pic__)
.option	pic2
#endif

.align	5
.globl	sha${label}_block_data_order
.ent	sha${label}_block_data_order
sha${label}_block_data_order:
	.frame	$sp,$FRAMESIZE,$ra
	.mask	$SAVED_REGS_MASK,-$SZREG
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
	.cpload	$pf
___
$code.=<<___;
	$PTR_SUB $sp,$FRAMESIZE
	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	$s3,$FRAMESIZE-11*$SZREG($sp)
	$REG_S	$s2,$FRAMESIZE-12*$SZREG($sp)
	$REG_S	$s1,$FRAMESIZE-13*$SZREG($sp)
	$REG_S	$s0,$FRAMESIZE-14*$SZREG($sp)
	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
	$PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
	.cplocal	$Ktbl
	.cpsetup	$pf,$zero,sha${label}_block_data_order
___
$code.=<<___;
	.set	reorder
	$LA	$Ktbl,K${label}		# PIC-ified 'load address'

	$LD	$A,0*$SZ($ctx)		# load context
	$LD	$B,1*$SZ($ctx)
	$LD	$C,2*$SZ($ctx)
	$LD	$D,3*$SZ($ctx)
	$LD	$E,4*$SZ($ctx)
	$LD	$F,5*$SZ($ctx)
	$LD	$G,6*$SZ($ctx)
	$LD	$H,7*$SZ($ctx)

	$PTR_ADD @X[15],$inp		# pointer to the end of input
	$REG_S	@X[15],16*$SZ($sp)
	b	.Loop

.align	5
.Loop:
	${LD}l	@X[0],$MSB($inp)
	${LD}r	@X[0],$LSB($inp)
___
# Rounds 0..15, rotating the working variables and the schedule window
# after each round.
for ($i=0;$i<16;$i++)
{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
$code.=<<___;
	b	.L16_xx
.align	4
.L16_xx:
___
# One 16-round pass of the schedule-updating rounds; the emitted loop
# repeats it, advancing $Ktbl by 16 entries each time, until the K value
# fetched by the last round matches $lastK in its low 12 bits.
for (;$i<32;$i++)
{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
$code.=<<___;
	and	@X[6],0xfff
	li	@X[7],$lastK
	.set	noreorder
	bne	@X[6],@X[7],.L16_xx
	$PTR_ADD $Ktbl,16*$SZ		# Ktbl+=16

	$REG_L	@X[15],16*$SZ($sp)	# restore pointer to the end of input
	$LD	@X[0],0*$SZ($ctx)
	$LD	@X[1],1*$SZ($ctx)
	$LD	@X[2],2*$SZ($ctx)
	$PTR_ADD $inp,16*$SZ
	$LD	@X[3],3*$SZ($ctx)
	$ADDU	$A,@X[0]
	$LD	@X[4],4*$SZ($ctx)
	$ADDU	$B,@X[1]
	$LD	@X[5],5*$SZ($ctx)
	$ADDU	$C,@X[2]
	$LD	@X[6],6*$SZ($ctx)
	$ADDU	$D,@X[3]
	$LD	@X[7],7*$SZ($ctx)
	$ADDU	$E,@X[4]
	$ST	$A,0*$SZ($ctx)
	$ADDU	$F,@X[5]
	$ST	$B,1*$SZ($ctx)
	$ADDU	$G,@X[6]
	$ST	$C,2*$SZ($ctx)
	$ADDU	$H,@X[7]
	$ST	$D,3*$SZ($ctx)
	$ST	$E,4*$SZ($ctx)
	$ST	$F,5*$SZ($ctx)
	$ST	$G,6*$SZ($ctx)
	$ST	$H,7*$SZ($ctx)

	bne	$inp,@X[15],.Loop
	$PTR_SUB $Ktbl,`($rounds-16)*$SZ`	# rewind $Ktbl

	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE
.end	sha${label}_block_data_order

.rdata
.align	5
K${label}:
___
if ($SZ==4) {
$code.=<<___;
	.word	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.word	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.word	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.word	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.word	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.word	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.word	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.word	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.word	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.word	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.word	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.word	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.word	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.word	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.word	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.word	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.dword	0x428a2f98d728ae22, 0x7137449123ef65cd
	.dword	0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
	.dword	0x3956c25bf348b538, 0x59f111f1b605d019
	.dword	0x923f82a4af194f9b, 0xab1c5ed5da6d8118
	.dword	0xd807aa98a3030242, 0x12835b0145706fbe
	.dword	0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
	.dword	0x72be5d74f27b896f, 0x80deb1fe3b1696b1
	.dword	0x9bdc06a725c71235, 0xc19bf174cf692694
	.dword	0xe49b69c19ef14ad2, 0xefbe4786384f25e3
	.dword	0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
	.dword	0x2de92c6f592b0275, 0x4a7484aa6ea6e483
	.dword	0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
	.dword	0x983e5152ee66dfab, 0xa831c66d2db43210
	.dword	0xb00327c898fb213f, 0xbf597fc7beef0ee4
	.dword	0xc6e00bf33da88fc2, 0xd5a79147930aa725
	.dword	0x06ca6351e003826f, 0x142929670a0e6e70
	.dword	0x27b70a8546d22ffc, 0x2e1b21385c26c926
	.dword	0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
	.dword	0x650a73548baf63de, 0x766a0abb3c77b2a8
	.dword	0x81c2c92e47edaee6, 0x92722c851482353b
	.dword	0xa2bfe8a14cf10364, 0xa81a664bbc423001
	.dword	0xc24b8b70d0f89791, 0xc76c51a30654be30
	.dword	0xd192e819d6ef5218, 0xd69906245565a910
	.dword	0xf40e35855771202a, 0x106aa07032bbd1b8
	.dword	0x19a4c116b8d2d0c8, 0x1e376c085141ab53
	.dword	0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
	.dword	0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
	.dword	0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
	.dword	0x748f82ee5defb2fc, 0x78a5636f43172f60
	.dword	0x84c87814a1f0ab72, 0x8cc702081a6439ec
	.dword	0x90befffa23631e28, 0xa4506cebde82bde9
	.dword	0xbef9a3f7b2c67915, 0xc67178f2e372532b
	.dword	0xca273eceea26619c, 0xd186b8c721c0c207
	.dword	0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
	.dword	0x06f067aa72176fba, 0x0a637dc5a2c898a6
	.dword	0x113f9804bef90dae, 0x1b710b35131c471b
	.dword	0x28db77f523047d84, 0x32caab7b40c72493
	.dword	0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
	.dword	0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
	.dword	0x5fcb6fab3ad6faec, 0x6c44198c4a475817
___
}
$code.=<<___;
.asciiz	"SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

___

# Evaluate the `...` arithmetic snippets embedded in the generated code.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";