#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...

# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify don't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.

# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still have hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

# Output file is taken from the last command-line argument; when no
# argument is given, the generated assembly goes to the existing STDOUT.
# Checked three-argument open avoids silent failure of the original
# unchecked two-argument form.
$output = pop;
$output and (open STDOUT, ">", $output
	or die "can't open $output: $!");

# Perl variables below name the SPARC registers used by the generated
# code; they interpolate into the assembly heredocs that follow.

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

# Multiplication path: .L1st computes ap[]*bp[0] plus the Montgomery
# reduction pass, .Louter/.Linner accumulate the remaining bp[i] rows,
# .Ltail/.Lsub/.Lcopy perform the final conditional subtraction.
$code=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2		! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1		! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]		! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
########  .Lbn_sqr_mont gives up to 20% *overall* improvement over
########  code without following dedicated squaring procedure.
########
$sbit="%o5";

$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0		!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj			!prologue!

	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1		!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj			!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0			!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j				! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp			! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0		! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0			! ap[1]
	ld	[$ap+8],$apj			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj			! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp			! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj			! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj			! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp			! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i				! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0			! ap[j]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num			! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Expand `...` constructs (compile-time arithmetic) and emit the result.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print	$code;
close STDOUT or die "error closing STDOUT: $!";