#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank. The module is big-endian [which is
# not big deal as there're no little-endian targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?

# 			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

# Command line: the first argument is the "flavour" string; it must
# contain either "64" or "32". The second argument (optional) is handed
# verbatim to ppc-xlate.pl (presumably the output file name; confirm
# against ppc-xlate.pl).
$flavour = shift;

# Pick word size and the matching compare/store/load mnemonics.
if ($flavour =~ /64/) {
    $SIZE_T = 8;
    $LRSAVE = 2*$SIZE_T;
    $UCMP   = "cmpld";
    $STU    = "stdu";
    $POP    = "ld";
    $PUSH   = "std";
} elsif ($flavour =~ /32/) {
    $SIZE_T = 4;
    $LRSAVE = $SIZE_T;
    $UCMP   = "cmplw";
    $STU    = "stwu";
    $POP    = "lwz";
    $PUSH   = "stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# The command string is built exactly as before (a missing second
# argument contributes an empty string). Low-precedence "or" is
# required: with "||" the die would attach to the always-true command
# string and a failed open would pass silently.
$xlate_arg = shift;
$xlate_arg = "" unless defined $xlate_arg;
open STDOUT,"| $^X $xlate $flavour $xlate_arg"
    or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T+64;   # stack frame size used by the generated function
$LOCALS=6*$SIZE_T;      # frame offset of the 64-byte bounce buffer

# Register assignment.
$K  ="r0";      # round constant
$sp ="r1";
$toc="r2";
$ctx="r3";      # state words are read from/written to 0..16($ctx)
$inp="r4";      # input pointer
$num="r5";      # number of 64-byte blocks
$t0 ="r15";     # scratch
$t1 ="r6";      # scratch

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

# @V is rotated by one position after every round, so the register
# playing each role changes from round to round instead of the values
# being moved between registers.
@V=($A,$B,$C,$D,$E,$T);
# Sixteen-word message window, indexed modulo 16.
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

# BODY_00_19($i,$a,$b,$c,$d,$e,$f) - append round $i (0..19) to $code.
#
# Emits f = K + e + rotl(a,5) + X[i] + ((c & b) | (d & ~b)), rotates b
# left by 30 and leaves rotl(a,5) in e. For i<15 the next input word
# X[i+1] is loaded from $inp (X[0] as well when i==0); from i==15 on
# X[(i+1)%16] is instead updated by xor-ing in the words at offsets
# +2, +8 and +13 (mod 16) and rotating left by one.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e,$f) - append a round using b^c^d to
# $code. Called for rounds 20..39 and again for rounds 60..79.
#
# Emits f = K + e + rotl(a,5) + X[i%16] + (b ^ c ^ d) and rotates b left
# by 30. For i<79 the message window is advanced as in BODY_00_19; in
# round 79 no further word is needed, so the slots are used to reload
# the five state words from $ctx into r16..r20 instead.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e,$f) - append round $i (40..59) to $code.
#
# Emits f = K + e + rotl(a,5) + X[i%16] + ((b & c) | ((b | c) & d)),
# rotates b left by 30 and advances the message window.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

# Public entry point: sets up the frame, saves r15..r31 and LR, loads
# the five state words from $ctx and dispatches to the private block
# function, taking the bounce-buffer path for blocks of unaligned
# input that straddle a page boundary.
$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___

# This is private block function, which uses tailored calling
# interface, namely upon entry SHA_CTX is pre-loaded to given
# registers and counter register contains amount of chunks to
# digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;		# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
# $i deliberately carries over from one loop to the next (0..79).
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# After 80 single-step rotations of the six-element @V (80 % 6 == 2)
# the five working variables sit in $E,$T,$A,$B,$C, in that order.
# Round 79 has already reloaded the saved state into r16..r20, so add,
# store back to $ctx and copy into $A..$E for the next block.
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
___
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

# Evaluate the arithmetic written between backticks, then emit.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe: close waits for the translator and returns false if
# it exited non-zero (with $! == 0 in that case), so failures there must
# not be ignored.
close STDOUT
    or die( $! ? "error closing pipe to $xlate: $!"
               : "$xlate exited with status $?" );