#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.

# Usage: perl <this script> [flavour [other args ...] output-file]
#
# The first argument selects the ABI flavour (default "o32"). The
# remaining arguments are scanned until one looks like a file name
# ("name.ext"); the generated assembly is written there. If no such
# argument is given, the assembly goes to the inherited STDOUT.

use strict;
use warnings;

######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
my ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
my ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is the register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
my $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

# Pointer arithmetic and register save/restore mnemonics, plus the size
# of one saved register in bytes.
my ($PTR_ADD,$PTR_SUB,$REG_S,$REG_L,$SZREG);
if ($flavour =~ /64|n32/i) {
    $PTR_ADD="daddu";   # incidentally works even on n32
    $PTR_SUB="dsubu";   # incidentally works even on n32
    $REG_S="sd";
    $REG_L="ld";
    $SZREG=8;
} else {
    $PTR_ADD="addu";
    $PTR_SUB="subu";
    $REG_S="sw";
    $REG_L="lw";
    $SZREG=4;
}
# Bit mask of saved registers for the .mask directive; NUBI flavours
# additionally save $s0-$s3 (registers 12..15).
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################

# Skip arguments until one looks like "name.ext"; that is the output file.
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually supplied, and
# fail loudly instead of silently emitting the assembly somewhere else.
if (defined $output && length $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# BN_ULONG load/store/arithmetic mnemonics and BN_ULONG size in bytes.
my ($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ);
if ($flavour =~ /64|n32/i) {
    $LD="ld";
    $ST="sd";
    $MULTU="dmultu";
    $ADDU="daddu";
    $SUBU="dsubu";
    $BNSZ=8;
} else {
    $LD="lw";
    $ST="sw";
    $MULTU="multu";
    $ADDU="addu";
    $SUBU="subu";
    $BNSZ=4;
}

# int bn_mul_mont(
my $rp=$a0;     # BN_ULONG *rp,
my $ap=$a1;     # const BN_ULONG *ap,
my $bp=$a2;     # const BN_ULONG *bp,
my $np=$a3;     # const BN_ULONG *np,
my $n0=$a4;     # const BN_ULONG *n0,
my $num=$a5;    # int num);

# Working registers used by the generated code.
my $lo0=$a6;
my $hi0=$a7;
my $lo1=$t1;
my $hi1=$t2;
my $aj=$s0;
my $bi=$s1;
my $nj=$s2;
# NOTE: from here on the Perl variable $tp names the temporary-vector
# pointer kept in $s3; the real "thread pointer" register is never
# referenced by the generated code.
$tp=$s3;
my $alo=$s4;
my $ahi=$s5;
my $nlo=$s6;
my $nhi=$s7;
my $tj=$s8;
my $i=$s9;
my $j=$s10;
my $m1=$s11;

# Number of register-sized slots in the stack frame of
# bn_mul_mont_internal.
my $FRAMESIZE=14;

# Public entry point.
my $code=<<___;
#include "mips_arch.h"

.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___
# On O32 the 5th and 6th arguments are passed on the stack.
$code.=<<___ if ($flavour =~ /o32/i);
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
# bn_mul_mont returns 0 without doing any work unless 4 <= num < 17;
# otherwise it branches to bn_mul_mont_internal, whose prologue
# allocates the frame and saves $fp and $s4-$s11.
$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# NUBI flavours additionally save $s0-$s3.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
# Set-up and first pass (.L1st): load n0, bp[0], ap[0], np[0]; carve a
# temporary vector of num+2 words out of the stack, aligned down to
# 4096 bytes; convert num to a byte count; compute
# m1 = lo(ap[0]*bp[0])*n0 and store the ap[]*bp[0] + np[]*m1 words to
# the temporary vector.
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

	$MULTU	($aj,$bi)
	$LD	$ahi,$BNSZ($ap)
	$LD	$nhi,$BNSZ($np)
	mflo	($lo0,$aj,$bi)
	mfhi	($hi0,$aj,$bi)
	$MULTU	($lo0,$n0)
	mflo	($m1,$lo0,$n0)

	$MULTU	($ahi,$bi)
	mflo	($alo,$ahi,$bi)
	mfhi	($ahi,$ahi,$bi)

	$MULTU	($nj,$m1)
	mflo	($lo1,$nj,$m1)
	mfhi	($hi1,$nj,$m1)
	$MULTU	($nhi,$m1)
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	($nlo,$nhi,$m1)
	mfhi	($nhi,$nhi,$m1)

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	($aj,$bi)
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	($alo,$aj,$bi)
	mfhi	($ahi,$aj,$bi)

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	($nj,$m1)
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	($nlo,$nj,$m1)
	mfhi	($nhi,$nj,$m1)

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

___
# Outer loop (.Louter) over the remaining words of bp[]; $i is a byte
# offset into bp. Each pass recomputes m1 from (ap[0]*bp[i] + tp[0])*n0
# and the inner loop (.Linner) accumulates ap[]*bp[i] + np[]*m1 on top
# of the temporary vector.
$code.=<<___;
	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$ahi,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	($aj,$bi)
	$LD	$nj,($np)
	$LD	$nhi,$BNSZ($np)
	mflo	($lo0,$aj,$bi)
	mfhi	($hi0,$aj,$bi)
	$ADDU	$lo0,$tj
	$MULTU	($lo0,$n0)
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	($m1,$lo0,$n0)

	$MULTU	($ahi,$bi)
	mflo	($alo,$ahi,$bi)
	mfhi	($ahi,$ahi,$bi)

	$MULTU	($nj,$m1)
	mflo	($lo1,$nj,$m1)
	mfhi	($hi1,$nj,$m1)

	$MULTU	($nhi,$m1)
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	($nlo,$nhi,$m1)
	mfhi	($nhi,$nhi,$m1)

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	($aj,$bi)
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	($alo,$aj,$bi)
	mfhi	($ahi,$aj,$bi)

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	($nj,$m1)
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	($nlo,$nj,$m1)
	mfhi	($nhi,$nj,$m1)
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

___
# Final reduction: .Lsub stores tp[]-np[] into rp[] while tracking the
# borrow; .Lcopy then selects, via and/or masks rather than a branch,
# between the tp[] word and the rp[] word, and overwrites every tp[]
# word with zero. The function returns 1 and restores the registers
# saved in the prologue.
$code.=<<___;
	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

.Lcopy:	$LD	$nj,($tp)	# conditional move
	$LD	$aj,($rp)
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	and	$nj,$hi0
	and	$aj,$hi1
	or	$aj,$nj
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# NUBI flavours restore the additional callee-saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate `...` expressions embedded in the template (e.g. the shift
# count derived from $BNSZ) and substitute their results.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close; do not let a truncated
# assembly file pass silently.
close STDOUT or die "error closing STDOUT: $!";