#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for MIPS.

# Performance improvement is 30% on unaligned input. The "secret" is
# to deploy lwl/lwr pair to load unaligned input. One could have
# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
# compatible subroutine. There is room for minor optimization on
# little-endian platforms...

######################################################################
# There are a number of MIPS ABIs in use, O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64

# Pick pointer-sized arithmetic and load/store mnemonics for the ABI.
if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
	$PTR_SUB="dsub";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$PTR_SLL="dsll";	# incidentally works even on n32
	$SZREG=8;
} else {
	$PTR_ADD="add";
	$PTR_SUB="sub";
	$REG_S="sw";
	$REG_L="lw";
	$PTR_SLL="sll";
	$SZREG=4;
}
#
# <appro@openssl.org>
#
######################################################################

# Ask the target compiler about endianness: if its preprocessor leaves
# the token MIPSEL untouched, the macro is not defined and the target
# is treated as big-endian. The probe only makes sense when CC is set;
# otherwise $big_endian stays undefined and the host-endianness
# fallback below is used.
$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0 if ($ENV{CC});

# The output file is the last argument that looks like "name.ext".
# Without such an argument the code is written to the inherited STDOUT.
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
if (defined($output)) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# No compiler verdict: fall back to the endianness of the build host.
if (!defined($big_endian))
	{ $big_endian=(unpack('L',pack('N',1))==1); }

# offsets of the Most and Least Significant Bytes
$MSB=$big_endian?0:3;
$LSB=3&~$MSB;

@X=map("\$$_",(8..23));	# a4-a7,s0-s11

# Register assignment for the generated routine. Note that $t0-$t2
# are deliberately re-bound here, overriding the NUBI names above.
$ctx=$a0;
$inp=$a1;
$num=$a2;
$A="\$1";
$B="\$2";
$C="\$3";
$D="\$7";
$E="\$24";	@V=($A,$B,$C,$D,$E);
$t0="\$25";
$t1=$num;	# $num is offloaded to stack
$t2="\$30";	# fp
$K="\$31";	# ra

# Emit round $i for 0 <= $i <= 14.
#   ($i,$a,$b,$c,$d,$e) - round number and the registers currently
#                         playing the roles of the five working
#                         variables.
# On little-endian targets X[$i] is byte-swapped first. The next input
# word X[$i+1] is fetched with an lwl/lwr pair interleaved with the
# round arithmetic:
#   e += K + (a<<5) + (a>>27) + X[i] + (((c^d)&b)^d);  b = rol(b,30)
sub BODY_00_14 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___	if (!$big_endian);
	srl	$t0,@X[$i],24	# byte swap($i)
	srl	$t1,@X[$i],8
	andi	$t2,@X[$i],0xFF00
	sll	@X[$i],@X[$i],24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	@X[$i],$t0
	or	$t1,$t2
	or	@X[$i],$t1
___
$code.=<<___;
	 lwl	@X[$j],$j*4+$MSB($inp)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	 lwr	@X[$j],$j*4+$LSB($inp)
	srl	$t1,$a,27
	addu	$e,$t0
	xor	$t0,$c,$d
	addu	$e,$t1
	sll	$t2,$b,30
	and	$t0,$b
	srl	$b,$b,2
	xor	$t0,$d
	addu	$e,@X[$i]
	or	$b,$t2
	addu	$e,$t0
___
}

# Emit round $i for 15 <= $i <= 19. Same round function as
# BODY_00_14, but instead of loading input the next schedule word is
# computed in place (the 16 X registers form a ring):
#   X[j] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1), indices mod 16.
# The last input word, X[15], still needs its byte swap on
# little-endian targets, hence the $i==15 prefix.
sub BODY_15_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___	if (!$big_endian && $i==15);
	srl	$t0,@X[$i],24	# byte swap($i)
	srl	$t1,@X[$i],8
	andi	$t2,@X[$i],0xFF00
	sll	@X[$i],@X[$i],24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	@X[$i],$t0
	or	@X[$i],$t1
	or	@X[$i],$t2
___
$code.=<<___;
	 xor	@X[$j%16],@X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	 xor	@X[$j%16],@X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	 xor	@X[$j%16],@X[($j+13)%16]
	sll	$t2,$b,30
	and	$t0,$b
	 srl	$t1,@X[$j%16],31
	 addu	@X[$j%16],@X[$j%16]
	srl	$b,$b,2
	xor	$t0,$d
	 or	@X[$j%16],$t1
	addu	$e,@X[$i%16]
	or	$b,$t2
	addu	$e,$t0
___
}

# Emit round $i for 20 <= $i <= 39 and, with a different $K loaded by
# the caller, for 60 <= $i <= 79. The round function is b^c^d.
# The very last round ($i==79) has no further schedule word to
# compute, so those slots are used to preload the five chaining
# values from the context into X[0]..X[4] for the final additions.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	 xor	@X[$j%16],@X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	 xor	@X[$j%16],@X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	 xor	@X[$j%16],@X[($j+13)%16]
	sll	$t2,$b,30
	xor	$t0,$b
	 srl	$t1,@X[$j%16],31
	 addu	@X[$j%16],@X[$j%16]
	srl	$b,$b,2
	addu	$e,@X[$i%16]
	 or	@X[$j%16],$t1
	or	$b,$t2
	addu	$e,$t0
___
$code.=<<___ if ($i==79);
	 lw	@X[0],0($ctx)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	 lw	@X[1],4($ctx)
	srl	$t1,$a,27
	addu	$e,$t0
	 lw	@X[2],8($ctx)
	xor	$t0,$c,$d
	addu	$e,$t1
	 lw	@X[3],12($ctx)
	sll	$t2,$b,30
	xor	$t0,$b
	 lw	@X[4],16($ctx)
	srl	$b,$b,2
	addu	$e,@X[$i%16]
	or	$b,$t2
	addu	$e,$t0
___
}

# Emit round $i for 40 <= $i <= 59. The round function is accumulated
# into e in two steps, (c&d) and then ((c^d)&b), interleaved with the
# same in-place schedule update as the other rounds.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	 xor	@X[$j%16],@X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	 xor	@X[$j%16],@X[($j+8)%16]
	and	$t0,$c,$d
	addu	$e,$t1
	 xor	@X[$j%16],@X[($j+13)%16]
	sll	$t2,$b,30
	addu	$e,$t0
	 srl	$t1,@X[$j%16],31
	xor	$t0,$c,$d
	 addu	@X[$j%16],@X[$j%16]
	and	$t0,$b
	srl	$b,$b,2
	 or	@X[$j%16],$t1
	addu	$e,@X[$i%16]
	or	$b,$t2
	addu	$e,$t0
___
}

$FRAMESIZE=16;	# large enough to accommodate NUBI saved registers
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;

# Prologue: allocate the frame and save the registers the routine
# clobbers ($ra and $fp are reused as $K and $t2).
$code=<<___;
.text

.set	noat
.set	noreorder
.align	5
.globl	sha1_block_data_order
.ent	sha1_block_data_order
sha1_block_data_order:
	.frame	$sp,$FRAMESIZE*$SZREG,$ra
	.mask	$SAVED_REGS_MASK,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$ra,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$fp,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-9)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	$s3,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-13)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-14)*$SZREG($sp)
	$REG_S	$gp,($FRAMESIZE-15)*$SZREG($sp)
___
# Turn the block count into an end-of-input pointer ($inp + $num*64)
# and park it at 0($sp): the $num register doubles as $t1 inside the
# rounds. Then load the five chaining values and start the loop.
$code.=<<___;
	$PTR_SLL $num,6
	$PTR_ADD $num,$inp
	$REG_S	$num,0($sp)
	lw	$A,0($ctx)
	lw	$B,4($ctx)
	lw	$C,8($ctx)
	lw	$D,12($ctx)
	b	.Loop
	lw	$E,16($ctx)
.align	4
.Loop:
	.set	reorder
	lwl	@X[0],$MSB($inp)
	lui	$K,0x5a82
	lwr	@X[0],$LSB($inp)
	ori	$K,0x7999	# K_00_19
___
# After every round the working-variable registers rotate by one
# position, so no register moves are needed in the generated code.
for ($i=0;$i<15;$i++)	{ BODY_00_14($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ BODY_15_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	lui	$K,0x6ed9
	ori	$K,0xeba1	# K_20_39
___
for (;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	lui	$K,0x8f1b
	ori	$K,0xbcdc	# K_40_59
___
for (;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	lui	$K,0xca62
	ori	$K,0xc1d6	# K_60_79
___
for (;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Loop tail: advance the input pointer, reload the end pointer, add
# the previous chaining values (preloaded into X[0..4] by round 79)
# and store the result back. Epilogue restores the saved registers.
$code.=<<___;
	$PTR_ADD $inp,64
	$REG_L	$num,0($sp)

	addu	$A,$X[0]
	addu	$B,$X[1]
	sw	$A,0($ctx)
	addu	$C,$X[2]
	addu	$D,$X[3]
	sw	$B,4($ctx)
	addu	$E,$X[4]
	sw	$C,8($ctx)
	sw	$D,12($ctx)
	sw	$E,16($ctx)
	.set	noreorder
	bne	$inp,$num,.Loop
	nop

	.set	noreorder
	$REG_L	$ra,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$fp,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-9)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-13)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-14)*$SZREG($sp)
	$REG_L	$gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	sha1_block_data_order
.rdata
.asciiz	"SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
# Buffered write errors only surface at close; do not let a truncated
# assembly file go unnoticed.
close STDOUT or die "error closing STDOUT: $!";