#! /usr/bin/env perl
# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.
# Code generator: emits Alpha assembly implementing bn_mul_mont()
# (word-by-word Montgomery multiplication).  The generated text goes to
# the file named by the last command-line argument, or to the inherited
# STDOUT when no argument is given.

use strict;
use warnings;

my $output = pop @ARGV;
if (defined $output) {
	# 3-arg open, checked: the original 2-arg unchecked open would
	# silently discard output on failure.
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Argument registers per the Alpha calling convention:
# int bn_mul_mont(
my $rp="a0";	# BN_ULONG *rp,
my $ap="a1";	# const BN_ULONG *ap,
my $bp="a2";	# const BN_ULONG *bp,
my $np="a3";	# const BN_ULONG *np,
my $n0="a4";	# const BN_ULONG *n0,
my $num="a5";	# int num);

# Scratch registers used by the multiplication kernel.
my $lo0="t0";
my $hi0="t1";
my $lo1="t2";
my $hi1="t3";
my $aj="t4";
my $bi="t5";
my $nj="t6";
my $tp="t7";
my $alo="t8";
my $ahi="t9";
my $nlo="t10";
my $nhi="t11";
my $tj="t12";
# Callee-saved registers (preserved in the prologue below).
my $i="s3";
my $j="s4";
my $m1="s5";

my $code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

.align	4
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
# Check close on the write handle: buffered write errors surface here.
close STDOUT or die "error closing STDOUT: $!";