#!/usr/bin/env perl
use strict;
use warnings;
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.

# This is a code generator: the Perl variables below map symbolic
# names onto Alpha registers, and are interpolated into the assembly
# heredoc that follows.  The generated text is written to STDOUT.

# int bn_mul_mont(
my $rp="a0";	# BN_ULONG *rp,
my $ap="a1";	# const BN_ULONG *ap,
my $bp="a2";	# const BN_ULONG *bp,
my $np="a3";	# const BN_ULONG *np,
my $n0="a4";	# const BN_ULONG *n0,
my $num="a5";	# int num);

# Temporaries used by the inner loops (Alpha t0-t12).
my $lo0="t0";
my $hi0="t1";
my $lo1="t2";
my $hi1="t3";
my $aj="t4";
my $bi="t5";
my $nj="t6";
my $tp="t7";
my $alo="t8";
my $ahi="t9";
my $nlo="t10";
my $nhi="t11";
my $tj="t12";
# Callee-saved registers for the outer-loop counters and m1.
my $i="s3";
my $j="s4";
my $m1="s5";

my $code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-40(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-40
	.frame	fp,40,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,($aj)
	addl	$j,1,$j
	ldq	$nj,($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,($ap)
	ldq	$aj,8($ap)
	ldq	$bi,($bi)
	ldq	$hi1,($np)
	ldq	$nj,8($np)
	ldq	$tj,(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,($tp)
	ldq	$lo1,($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,40(sp)
	ret	(ra)
.end	bn_mul_mont
.rdata
.asciiz	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
# Check close: the assembly is usually piped to the assembler/build, and
# a buffered write error would otherwise be silently lost.
close STDOUT or die "error closing STDOUT: $!";