#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small' but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is twice smaller, they are not as
#	diverse as ARM ones: e.g., there are only two arithmetic
#	instructions with 3 arguments, no [fixed] rotate, addressing
#	modes are limited. As result it takes more instructions to do
#	the same job in Thumb, therefore the code is never twice as
#	small and always slower.
# [***]	which is also ~35% better than compiler generated code.

use strict;
use warnings;

# Emitted assembly goes to the file named by the first command-line
# argument, or to the current STDOUT when no argument is given.
my $output = shift;
if (defined $output) {
	# 3-arg open: avoids mode injection via the filename and is
	# checked, unlike the historical `open STDOUT,">$output"`.
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Register allocation for the generated code:
#   r0..r2  - sha1_block_data_order(ctx, inp, len) arguments
#   r3..r7  - SHA-1 working variables A..E
#   r8      - round constant K
#   r9..r12 - scratch/temporaries
#   r14     - pointer walking down the X[] schedule on the stack
my $ctx = "r0";
my $inp = "r1";
my $len = "r2";
my $a   = "r3";
my $b   = "r4";
my $c   = "r5";
my $d   = "r6";
my $e   = "r7";
my $K   = "r8";
my $t0  = "r9";
my $t1  = "r10";
my $t2  = "r11";
my $t3  = "r12";
my $Xi  = "r14";
my @V = ($a, $b, $c, $d, $e);

my $code;	# accumulates the generated assembly text

# One can optimize this for aligned access on big-endian architecture,
# but code's endian neutrality makes it too pretty:-)
#
# Xload: emit code that loads the next 4 input bytes one at a time
# (endian-neutral), merges them into one big-endian word, pushes it
# onto the X[] schedule at [$Xi,#-4]! and interleaves the common part
# of the round update of E (E += K + ROR(A,27) + X[i]) plus the
# C^D term shared by the F functions.
sub Xload {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
	ldrb	$t0,[$inp],#4
	ldrb	$t1,[$inp,#-3]
	ldrb	$t2,[$inp,#-2]
	ldrb	$t3,[$inp,#-1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	orr	$t0,$t1,$t0,lsl#8
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	orr	$t0,$t2,$t0,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t3,$t0,lsl#8
	add	$e,$e,$t0			@ E+=X[i]
	str	$t0,[$Xi,#-4]!
___
}

# Xupdate: emit code that computes the next schedule word as the
# XOR of four earlier X[] words rotated left by one (ror#31),
# stores it at [$Xi,#-4]! and interleaves the common E update.
# When $flag is set (40..59 rounds) the shared C^D term is skipped,
# because BODY_40_59 computes its Maj-style F function itself.
sub Xupdate {
my ($a,$b,$c,$d,$e,$flag)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	ldr	$t3,[$Xi,#2*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	eor	$t0,$t0,$t1
	eor	$t0,$t0,$t2
	eor	$t0,$t0,$t3
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
___
$code.=<<___ if (!defined($flag));
	eor	$t1,$c,$d			@ F_xx_xx, but not in 40_59
___
$code.=<<___;
	mov	$t0,$t0,ror#31
	add	$e,$e,$t0			@ E+=X[i]
	str	$t0,[$Xi,#-4]!
___
}

# Rounds 0..15: load input words, F = Ch(B,C,D) via (B&(C^D))^D.
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
Xload(@_);
$code.=<<___;
	and	$t1,$b,$t1,ror#2
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

# Rounds 16..19: same F as above, but X[] comes from the schedule.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
Xupdate(@_);
$code.=<<___;
	and	$t1,$b,$t1,ror#2
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

# Rounds 20..39 and 60..79: F = Parity(B,C,D) = B^C^D.
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
Xupdate(@_);
$code.=<<___;
	eor	$t1,$b,$t1,ror#2		@ F_20_39(B,C,D)
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}

# Rounds 40..59: F = Maj(B,C,D) = (B&C)|((B|C)&D); Xupdate is told
# (via the extra argument) not to emit the unused C^D term.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
Xupdate(@_,1);
$code.=<<___;
	and	$t1,$b,$c,ror#2
	orr	$t2,$b,$c,ror#2
	and	$t2,$t2,$d,ror#2
	orr	$t1,$t1,$t2			@ F_40_59(B,C,D)
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
___
}

# ====================================================================
# Assemble the output: prologue, then the 5x-unrolled loops. Each
# loop body advances $Xi downward and terminates on $Xi==sp; the
# carry flag distinguishes the shared 20_39/60_79 loop (cmn clears
# carry for 20_39, cmp sets it for 60_79).
# ====================================================================

$code=<<___;
.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
for (1 .. 5) {
	BODY_00_15(@V); unshift(@V, pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
___
	BODY_00_15(@V);	unshift(@V, pop(@V));
	BODY_16_19(@V);	unshift(@V, pop(@V));
	BODY_16_19(@V);	unshift(@V, pop(@V));
	BODY_16_19(@V);	unshift(@V, pop(@V));
	BODY_16_19(@V);	unshift(@V, pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	sub	sp,sp,#25*4
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for (1 .. 5) {
	BODY_20_39(@V); unshift(@V, pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for (1 .. 5) {
	BODY_40_59(@V); unshift(@V, pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT or die "error closing STDOUT: $!";	# enforce flush