#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0
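
	# At this point $lo0 holds (tp[0] + ap[0]*bp[i]) mod 2^64, so the
	# mulq below forms the Montgomery factor m1 = $lo0*n0 mod 2^64,
	# where n0 = -np^(-1) mod 2^64; adding m1*np to the running sum
	# then zeroes its low word, which is discarded as the tail of the
	# inner loop stores results one word down.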
	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT;
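
# The code above implements word-serial Montgomery multiplication:
# rp[] = ap[]*bp[]*R^(-1) mod np[], where R = 2^(64*num) and
# n0 = -np^(-1) mod 2^64, with the final conditional subtraction done
# by the .Lsub/.Lcopy tail. The reference model below is a minimal
# big-integer sketch of that arithmetic, for exposition only: it is
# never called by this generator, and ref_mul_mont with Math::BigInt
# operands is an illustrative assumption, not part of the OpenSSL API.
# Note that the assembly fuses multiplication and reduction, retiring
# ap[j]*bp[i] and np[j]*m1 in the same inner-loop pass, while the
# model multiplies up front and reduces afterwards.
sub ref_mul_mont {
	require Math::BigInt;
	my ($a, $b, $n, $n0, $num) = @_; # $a,$b,$n,$n0: Math::BigInt; $num: word count
	my $w = Math::BigInt->new(1)->blsft(64);	# word base, 2^64
	my $t = $a->copy()->bmul($b);			# t = a*b
	for (1 .. $num) {				# retire one 64-bit word per pass
		my $m1 = $t->copy()->bmod($w)->bmul($n0)->bmod($w); # m1 = t*n0 mod 2^64
		$t->badd($n->copy()->bmul($m1))->brsft(64); # t = (t + m1*n)/2^64, exact
	}
	$t->bsub($n) if $t->bcmp($n) >= 0;	# conditional subtraction, cf. .Lsub
	return $t;				# a*b*R^(-1) mod n
}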