1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# April 2006 11 12# "Teaser" Montgomery multiplication module for PowerPC. It's possible 13# to gain a bit more by modulo-scheduling outer loop, then dedicated 14# squaring procedure should give further 20% and code can be adapted 15# for 32-bit application running on 64-bit CPU. As for the latter. 16# It won't be able to achieve "native" 64-bit performance, because in 17# 32-bit application context every addc instruction will have to be 18# expanded as addc, twice right shift by 32 and finally adde, etc. 19# So far RSA *sign* performance improvement over pre-bn_mul_mont asm 20# for 64-bit application running on PPC970/G5 is: 21# 22# 512-bit +65% 23# 1024-bit +35% 24# 2048-bit +18% 25# 4096-bit +4% 26 27$flavour = shift; 28 29if ($flavour =~ /32/) { 30 $BITS= 32; 31 $BNSZ= $BITS/8; 32 $SIZE_T=4; 33 $RZONE= 224; 34 $FRAME= $SIZE_T*16; 35 36 $LD= "lwz"; # load 37 $LDU= "lwzu"; # load and update 38 $LDX= "lwzx"; # load indexed 39 $ST= "stw"; # store 40 $STU= "stwu"; # store and update 41 $STX= "stwx"; # store indexed 42 $STUX= "stwux"; # store indexed and update 43 $UMULL= "mullw"; # unsigned multiply low 44 $UMULH= "mulhwu"; # unsigned multiply high 45 $UCMP= "cmplw"; # unsigned compare 46 $SHRI= "srwi"; # unsigned shift right by immediate 47 $PUSH= $ST; 48 $POP= $LD; 49} elsif ($flavour =~ /64/) { 50 $BITS= 64; 51 $BNSZ= $BITS/8; 52 $SIZE_T=8; 53 $RZONE= 288; 54 $FRAME= $SIZE_T*16; 55 56 # same as above, but 64-bit mnemonics... 57 $LD= "ld"; # load 58 $LDU= "ldu"; # load and update 59 $LDX= "ldx"; # load indexed 60 $ST= "std"; # store 61 $STU= "stdu"; # store and update 62 $STX= "stdx"; # store indexed 63 $STUX= "stdux"; # store indexed and update 64 $UMULL= "mulld"; # unsigned multiply low 65 $UMULH= "mulhdu"; # unsigned multiply high 66 $UCMP= "cmpld"; # unsigned compare 67 $SHRI= "srdi"; # unsigned shift right by immediate 68 $PUSH= $ST; 69 $POP= $LD; 70} else { die "nonsense $flavour"; } 71 72$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 73( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 74( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 75die "can't locate ppc-xlate.pl"; 76 77open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 78 79$sp="r1"; 80$toc="r2"; 81$rp="r3"; $ovf="r3"; 82$ap="r4"; 83$bp="r5"; 84$np="r6"; 85$n0="r7"; 86$num="r8"; 87$rp="r9"; # $rp is reassigned 88$aj="r10"; 89$nj="r11"; 90$tj="r12"; 91# non-volatile registers 92$i="r14"; 93$j="r15"; 94$tp="r16"; 95$m0="r17"; 96$m1="r18"; 97$lo0="r19"; 98$hi0="r20"; 99$lo1="r21"; 100$hi1="r22"; 101$alo="r23"; 102$ahi="r24"; 103$nlo="r25"; 104# 105$nhi="r0"; 106 107$code=<<___; 108.machine "any" 109.text 110 111.globl .bn_mul_mont 112.align 4 113.bn_mul_mont: 114 cmpwi $num,4 115 mr $rp,r3 ; $rp is reassigned 116 li r3,0 117 bltlr 118 119 slwi $num,$num,`log($BNSZ)/log(2)` 120 li $tj,-4096 121 addi $ovf,$num,`$FRAME+$RZONE` 122 subf $ovf,$ovf,$sp ; $sp-$ovf 123 and $ovf,$ovf,$tj ; minimize TLB usage 124 subf $ovf,$sp,$ovf ; $ovf-$sp 125 srwi $num,$num,`log($BNSZ)/log(2)` 126 $STUX $sp,$sp,$ovf 127 128 $PUSH r14,`4*$SIZE_T`($sp) 129 $PUSH r15,`5*$SIZE_T`($sp) 130 $PUSH r16,`6*$SIZE_T`($sp) 131 $PUSH r17,`7*$SIZE_T`($sp) 132 $PUSH r18,`8*$SIZE_T`($sp) 133 $PUSH r19,`9*$SIZE_T`($sp) 134 $PUSH r20,`10*$SIZE_T`($sp) 135 $PUSH r21,`11*$SIZE_T`($sp) 136 $PUSH r22,`12*$SIZE_T`($sp) 137 $PUSH r23,`13*$SIZE_T`($sp) 138 $PUSH r24,`14*$SIZE_T`($sp) 139 $PUSH r25,`15*$SIZE_T`($sp) 140 141 $LD $n0,0($n0) ; pull n0[0] value 142 addi $num,$num,-2 ; adjust $num for counter register 143 144 $LD $m0,0($bp) ; m0=bp[0] 145 $LD $aj,0($ap) ; ap[0] 146 addi $tp,$sp,$FRAME 147 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] 148 $UMULH $hi0,$aj,$m0 149 150 $LD $aj,$BNSZ($ap) ; ap[1] 151 $LD $nj,0($np) ; np[0] 152 153 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 154 155 $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] 156 $UMULH $ahi,$aj,$m0 157 158 $UMULL $lo1,$nj,$m1 ; np[0]*m1 159 $UMULH $hi1,$nj,$m1 160 $LD $nj,$BNSZ($np) ; np[1] 161 addc $lo1,$lo1,$lo0 162 addze $hi1,$hi1 163 164 $UMULL $nlo,$nj,$m1 ; np[1]*m1 165 $UMULH $nhi,$nj,$m1 166 167 mtctr $num 168 li $j,`2*$BNSZ` 169.align 4 170L1st: 171 $LDX $aj,$ap,$j ; ap[j] 172 addc $lo0,$alo,$hi0 173 $LDX $nj,$np,$j ; np[j] 174 addze $hi0,$ahi 175 $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] 176 addc $lo1,$nlo,$hi1 177 $UMULH $ahi,$aj,$m0 178 addze $hi1,$nhi 179 $UMULL $nlo,$nj,$m1 ; np[j]*m1 180 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] 181 $UMULH $nhi,$nj,$m1 182 addze $hi1,$hi1 183 $ST $lo1,0($tp) ; tp[j-1] 184 185 addi $j,$j,$BNSZ ; j++ 186 addi $tp,$tp,$BNSZ ; tp++ 187 bdnz- L1st 188;L1st 189 addc $lo0,$alo,$hi0 190 addze $hi0,$ahi 191 192 addc $lo1,$nlo,$hi1 193 addze $hi1,$nhi 194 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] 195 addze $hi1,$hi1 196 $ST $lo1,0($tp) ; tp[j-1] 197 198 li $ovf,0 199 addc $hi1,$hi1,$hi0 200 addze $ovf,$ovf ; upmost overflow bit 201 $ST $hi1,$BNSZ($tp) 202 203 li $i,$BNSZ 204.align 4 205Louter: 206 $LDX $m0,$bp,$i ; m0=bp[i] 207 $LD $aj,0($ap) ; ap[0] 208 addi $tp,$sp,$FRAME 209 $LD $tj,$FRAME($sp) ; tp[0] 210 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] 211 $UMULH $hi0,$aj,$m0 212 $LD $aj,$BNSZ($ap) ; ap[1] 213 $LD $nj,0($np) ; np[0] 214 addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] 215 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] 216 addze $hi0,$hi0 217 $UMULL $m1,$lo0,$n0 ; tp[0]*n0 218 $UMULH $ahi,$aj,$m0 219 $UMULL $lo1,$nj,$m1 ; np[0]*m1 220 $UMULH $hi1,$nj,$m1 221 $LD $nj,$BNSZ($np) ; np[1] 222 addc $lo1,$lo1,$lo0 223 $UMULL $nlo,$nj,$m1 ; np[1]*m1 224 addze $hi1,$hi1 225 $UMULH $nhi,$nj,$m1 226 227 mtctr $num 228 li $j,`2*$BNSZ` 229.align 4 230Linner: 231 $LDX $aj,$ap,$j ; ap[j] 232 addc $lo0,$alo,$hi0 233 $LD $tj,$BNSZ($tp) ; tp[j] 234 addze $hi0,$ahi 235 $LDX $nj,$np,$j ; np[j] 236 addc $lo1,$nlo,$hi1 237 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] 238 addze $hi1,$nhi 239 $UMULH $ahi,$aj,$m0 240 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] 241 $UMULL $nlo,$nj,$m1 ; np[j]*m1 242 addze $hi0,$hi0 243 $UMULH $nhi,$nj,$m1 244 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] 245 addi $j,$j,$BNSZ ; j++ 246 addze $hi1,$hi1 247 $ST $lo1,0($tp) ; tp[j-1] 248 addi $tp,$tp,$BNSZ ; tp++ 249 bdnz- Linner 250;Linner 251 $LD $tj,$BNSZ($tp) ; tp[j] 252 addc $lo0,$alo,$hi0 253 addze $hi0,$ahi 254 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] 255 addze $hi0,$hi0 256 257 addc $lo1,$nlo,$hi1 258 addze $hi1,$nhi 259 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] 260 addze $hi1,$hi1 261 $ST $lo1,0($tp) ; tp[j-1] 262 263 addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] 264 li $ovf,0 265 adde $hi1,$hi1,$hi0 266 addze $ovf,$ovf 267 $ST $hi1,$BNSZ($tp) 268; 269 slwi $tj,$num,`log($BNSZ)/log(2)` 270 $UCMP $i,$tj 271 addi $i,$i,$BNSZ 272 ble- Louter 273 274 addi $num,$num,2 ; restore $num 275 subfc $j,$j,$j ; j=0 and "clear" XER[CA] 276 addi $tp,$sp,$FRAME 277 mtctr $num 278 279.align 4 280Lsub: $LDX $tj,$tp,$j 281 $LDX $nj,$np,$j 282 subfe $aj,$nj,$tj ; tp[j]-np[j] 283 $STX $aj,$rp,$j 284 addi $j,$j,$BNSZ 285 bdnz- Lsub 286 287 li $j,0 288 mtctr $num 289 subfe $ovf,$j,$ovf ; handle upmost overflow bit 290 and $ap,$tp,$ovf 291 andc $np,$rp,$ovf 292 or $ap,$ap,$np ; ap=borrow?tp:rp 293 294.align 4 295Lcopy: ; copy or in-place refresh 296 $LDX $tj,$ap,$j 297 $STX $tj,$rp,$j 298 $STX $j,$tp,$j ; zap at once 299 addi $j,$j,$BNSZ 300 bdnz- Lcopy 301 302 $POP r14,`4*$SIZE_T`($sp) 303 $POP r15,`5*$SIZE_T`($sp) 304 $POP r16,`6*$SIZE_T`($sp) 305 $POP r17,`7*$SIZE_T`($sp) 306 $POP r18,`8*$SIZE_T`($sp) 307 $POP r19,`9*$SIZE_T`($sp) 308 $POP r20,`10*$SIZE_T`($sp) 309 $POP r21,`11*$SIZE_T`($sp) 310 $POP r22,`12*$SIZE_T`($sp) 311 $POP r23,`13*$SIZE_T`($sp) 312 $POP r24,`14*$SIZE_T`($sp) 313 $POP r25,`15*$SIZE_T`($sp) 314 $POP $sp,0($sp) 315 li r3,1 316 blr 317 .long 0 318.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" 319___ 320 321$code =~ s/\`([^\`]*)\`/eval $1/gem; 322print $code; 323close STDOUT; 324