#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for MIPS.

# Performance improvement is 30% on unaligned input. The "secret" is
# to deploy the lwl/lwr pair to load unaligned input. One could have
# vectorized Xupdate on MIPSIII/IV, but the goal was to code a MIPS32-
# compatible subroutine. There is room for minor optimization on
# little-endian platforms...

######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference, here is the register layout for the N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift;	# supported flavours are o32,n32,64,nubi32,nubi64

if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
	$PTR_SUB="dsub";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$PTR_SLL="dsll";	# incidentally works even on n32
	$SZREG=8;
} else {
	$PTR_ADD="add";
	$PTR_SUB="sub";
	$REG_S="sw";
	$REG_L="lw";
	$PTR_SLL="sll";
	$SZREG=4;
}
#
# <appro@openssl.org>
#
######################################################################

$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;

for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
open STDOUT,">$output";

if (!defined($big_endian))
	{   $big_endian=(unpack('L',pack('N',1))==1);   }

# offsets of the Most and Least Significant Bytes
$MSB=$big_endian?0:3;
$LSB=3&~$MSB;

@X=map("\$$_",(8..23));	# a4-a7,s0-s11

$ctx=$a0;
$inp=$a1;
$num=$a2;
$A="\$1";
$B="\$2";
$C="\$3";
$D="\$7";
$E="\$24";	@V=($A,$B,$C,$D,$E);
$t0="\$25";
$t1=$num;	# $num is offloaded to stack
$t2="\$30";	# fp
$K="\$31";	# ra

sub BODY_00_14 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___	if (!$big_endian);
	srl	$t0,@X[$i],24	# byte swap($i)
	srl	$t1,@X[$i],8
	andi	$t2,@X[$i],0xFF00
	sll	@X[$i],@X[$i],24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	@X[$i],$t0
	or	$t1,$t2
	or	@X[$i],$t1
___
$code.=<<___;
	lwl	@X[$j],$j*4+$MSB($inp)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	lwr	@X[$j],$j*4+$LSB($inp)
	srl	$t1,$a,27
	addu	$e,$t0
	xor	$t0,$c,$d
	addu	$e,$t1
	sll	$t2,$b,30
	and	$t0,$b
	srl	$b,$b,2
	xor	$t0,$d
	addu	$e,@X[$i]
	or	$b,$t2
	addu	$e,$t0
___
}
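
# Editorial aid: a minimal pure-Perl reference sketch of the round
# update that BODY_00_14 emits above. The xor/and/xor sequence on $t0
# computes Ch(b,c,d) = ((c ^ d) & b) ^ d, the sll/srl/or pairs are
# 32-bit rotates, and $K holds 0x5a827999 for rounds 0..19. The helper
# names below (rol32, sha1_round_00_19) are hypothetical, are never
# called by this generator, and play no part in the emitted assembly.
sub rol32 { my ($x,$n)=@_; return (($x<<$n)|($x>>(32-$n)))&0xffffffff; }

sub sha1_round_00_19 {
	my ($a,$b,$c,$d,$e,$w)=@_;
	my $f = (($c ^ $d) & $b) ^ $d;				# Ch(b,c,d)
	$e = ($e + rol32($a,5) + $f + $w + 0x5a827999) & 0xffffffff;
	return ($e,$a,rol32($b,30),$c,$d);			# rotated working variables
}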

sub BODY_15_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___	if (!$big_endian && $i==15);
	srl	$t0,@X[$i],24	# byte swap($i)
	srl	$t1,@X[$i],8
	andi	$t2,@X[$i],0xFF00
	sll	@X[$i],@X[$i],24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	@X[$i],$t0
	or	@X[$i],$t1
	or	@X[$i],$t2
___
$code.=<<___;
	xor	@X[$j%16],@X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	xor	@X[$j%16],@X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	xor	@X[$j%16],@X[($j+13)%16]
	sll	$t2,$b,30
	and	$t0,$b
	srl	$t1,@X[$j%16],31
	addu	@X[$j%16],@X[$j%16]
	srl	$b,$b,2
	xor	$t0,$d
	or	@X[$j%16],$t1
	addu	$e,@X[$i%16]
	or	$b,$t2
	addu	$e,$t0
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	xor	@X[$j%16],@X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	xor	@X[$j%16],@X[($j+8)%16]
	xor	$t0,$c,$d
	addu	$e,$t1
	xor	@X[$j%16],@X[($j+13)%16]
	sll	$t2,$b,30
	xor	$t0,$b
	srl	$t1,@X[$j%16],31
	addu	@X[$j%16],@X[$j%16]
	srl	$b,$b,2
	addu	$e,@X[$i%16]
	or	@X[$j%16],$t1
	or	$b,$t2
	addu	$e,$t0
___
$code.=<<___ if ($i==79);
	lw	@X[0],0($ctx)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	lw	@X[1],4($ctx)
	srl	$t1,$a,27
	addu	$e,$t0
	lw	@X[2],8($ctx)
	xor	$t0,$c,$d
	addu	$e,$t1
	lw	@X[3],12($ctx)
	sll	$t2,$b,30
	xor	$t0,$b
	lw	@X[4],16($ctx)
	srl	$b,$b,2
	addu	$e,@X[$i%16]
	or	$b,$t2
	addu	$e,$t0
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	xor	@X[$j%16],@X[($j+2)%16]
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	xor	@X[$j%16],@X[($j+8)%16]
	and	$t0,$c,$d
	addu	$e,$t1
	xor	@X[$j%16],@X[($j+13)%16]
	sll	$t2,$b,30
	addu	$e,$t0
	srl	$t1,@X[$j%16],31
	xor	$t0,$c,$d
	addu	@X[$j%16],@X[$j%16]
	and	$t0,$b
	srl	$b,$b,2
	or	@X[$j%16],$t1
	addu	$e,@X[$i%16]
	or	$b,$t2
	addu	$e,$t0
___
}

$FRAMESIZE=16;	# large enough to accommodate NUBI saved registers
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
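# Editorial note on the .mask values, derived from the prologue below:
# bit N set means register $N is saved in the frame. 0xc0ff0000 covers
# $31/$30 ($ra/$fp) plus $16-$23, the callee-saved registers common to
# O32 and N32/64; the NUBI variant 0xc0fff008 additionally covers
# $12-$15 and $3, i.e. NUBI's $s0-$s3 and $gp, which only the
# nubi-conditional prologue/epilogue sections save and restore.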

$code=<<___;
.text

.set	noat
.set	noreorder
.align	5
.globl	sha1_block_data_order
.ent	sha1_block_data_order
sha1_block_data_order:
	.frame	$sp,$FRAMESIZE*$SZREG,$ra
	.mask	$SAVED_REGS_MASK,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$ra,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$fp,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-9)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	$s3,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-13)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-14)*$SZREG($sp)
	$REG_S	$gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
	$PTR_SLL $num,6
	$PTR_ADD $num,$inp
	$REG_S	$num,0($sp)
	lw	$A,0($ctx)
	lw	$B,4($ctx)
	lw	$C,8($ctx)
	lw	$D,12($ctx)
	b	.Loop
	lw	$E,16($ctx)
.align	4
.Loop:
	.set	reorder
	lwl	@X[0],$MSB($inp)
	lui	$K,0x5a82
	lwr	@X[0],$LSB($inp)
	ori	$K,0x7999	# K_00_19
___
for ($i=0;$i<15;$i++)	{ &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	lui	$K,0x6ed9
	ori	$K,0xeba1	# K_20_39
___
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	lui	$K,0x8f1b
	ori	$K,0xbcdc	# K_40_59
___
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	lui	$K,0xca62
	ori	$K,0xc1d6	# K_60_79
___
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	$PTR_ADD $inp,64
	$REG_L	$num,0($sp)

	addu	$A,$X[0]
	addu	$B,$X[1]
	sw	$A,0($ctx)
	addu	$C,$X[2]
	addu	$D,$X[3]
	sw	$B,4($ctx)
	addu	$E,$X[4]
	sw	$C,8($ctx)
	sw	$D,12($ctx)
	sw	$E,16($ctx)
	.set	noreorder
	bne	$inp,$num,.Loop
	nop

	.set	noreorder
	$REG_L	$ra,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$fp,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-9)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-13)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-14)*$SZREG($sp)
	$REG_L	$gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	sha1_block_data_order
.rdata
.asciiz	"SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;
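
# Usage sketch (the script name and toolchain prefixes below are
# illustrative assumptions, not mandated by the code above):
#
#	CC=mips-linux-gnu-gcc   perl sha1-mips.pl o32 sha1-o32.S
#	CC=mips64-linux-gnu-gcc perl sha1-mips.pl 64  sha1-n64.S
#
# The first argument selects the ABI flavour (o32, n32, 64, nubi32,
# nubi64); any argument that looks like a file name is taken as the
# output file, otherwise the assembly goes to the existing STDOUT.
# The endianness probe runs "echo MIPSEL | $ENV{CC} -E -P -": if the
# literal MIPSEL survives preprocessing the target is treated as
# big-endian; with CC unset the probe yields 0 and the little-endian
# byte-swap path is emitted.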