#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On a 21264, RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096-bit key lengths. The baseline is code generated
# by the vendor compiler with '-tune host' and in-line assembler.
# Other benchmarks improve by 15-20%. To anchor this to something
# else: the code provides approximately the same performance per GHz
# as AMD64, i.e. comparing a 1GHz 21264 to a 2GHz Opteron shows a ~2x
# difference.

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit	# return 0 if num<4

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp	# alloca(8*num+16)
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	$n0,0($n0)	# n0 is passed by reference
	and	sp,AT,sp	# align tp=sp at 4KB boundary

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)	# bp[i]
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1
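	# m1 = lo0*n0 mod 2^64; because n0 == -np[0]^-1 mod 2^64,
	# lo0 + m1*np[0] is divisible by 2^64, so .Linner can shift
	# tp[] down one word while accumulating bi*ap[] and m1*np[].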

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT or die "error closing STDOUT: $!";
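
# For reference, bn_mul_mont computes rp[] = ap[]*bp[]*R^-1 mod np[],
# where R = 2^(64*num) and n0 = -np[0]^-1 mod 2^64 as prepared by
# BN_MONT_CTX_set. Below is a minimal Perl model of that result for
# cross-checking the assembler output; the name mont_ref and the use
# of Math::BigInt are this note's own choices, not part of the
# CRYPTOGAMS interface.
#
#	use Math::BigInt;
#
#	sub mont_ref {	# (a * b * R^-1) mod n, arguments as Math::BigInt
#	    my ($a, $b, $n, $num) = @_;
#	    # R is invertible mod n because n is odd
#	    my $rinv = Math::BigInt->new(2)->bpow(64*$num)->bmodinv($n);
#	    return $a->copy->bmul($b)->bmul($rinv)->bmod($n);
#	}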