#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by multi-op subroutine, while RSA verify and
# DSA - by single-op. Special note about 4096-bit RSA verify result.
# Operands are too long for dedicated hardware and it's handled by
# VIS3 code, which is why you don't see any improvement. It's surely
# possible to improve it [by deploying 'mpmul' instruction], maybe in
# the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#			sign	 verify	  sign/s  verify/s
# rsa 1024 bits	0.000628s 0.000028s  1592.4  35434.4
# rsa 2048 bits	0.003282s 0.000106s   304.7   9438.3
# rsa 4096 bits	0.025866s 0.000340s    38.7   2940.9
# dsa 1024 bits	0.000301s 0.000332s  3323.7   3013.9
# dsa 2048 bits	0.001056s 0.001233s   946.9    810.8
#
# 64-bit process, this module:
#			sign	 verify	  sign/s  verify/s
# rsa 1024 bits	0.000256s 0.000016s  3904.4  61411.9
# rsa 2048 bits	0.000946s 0.000029s  1056.8  34292.7
# rsa 4096 bits	0.005061s 0.000340s   197.6   2940.5
# dsa 1024 bits	0.000176s 0.000195s  5674.7   5130.5
# dsa 2048 bits	0.000296s 0.000354s  3383.2   2827.6
#
######################################################################
# 32-bit process, VIS3:
#			sign	 verify	  sign/s  verify/s
# rsa 1024 bits	0.000665s 0.000028s  1504.8  35233.3
# rsa 2048 bits	0.003349s 0.000106s   298.6   9433.4
# rsa 4096 bits	0.025959s 0.000341s    38.5   2934.8
# dsa 1024 bits	0.000320s 0.000341s  3123.3   2929.6
# dsa 2048 bits	0.001101s 0.001260s   908.2    793.4
#
# 32-bit process, this module:
#			sign	 verify	  sign/s  verify/s
# rsa 1024 bits	0.000301s 0.000017s  3317.1  60240.0
# rsa 2048 bits	0.001034s 0.000030s   966.9  33812.7
# rsa 4096 bits	0.005244s 0.000341s   190.7   2935.4
# dsa 1024 bits	0.000201s 0.000205s  4976.1   4879.2
# dsa 2048 bits	0.000328s 0.000360s  3051.1   2774.2
#
# 32-bit code is prone to performance degradation as interrupt rate
# dispatched to CPU executing the code grows. This is because in
# standard process of handling interrupt in 32-bit process context
# upper halves of most integer registers used as input or output are
# zeroed. This renders result invalid, and operation has to be re-run.
# If CPU is "bothered" with timer interrupts only, the penalty is
# hardly measurable.
# But in order to mitigate this problem for higher
# interrupt rates contemporary Linux kernel recognizes biased stack
# even in 32-bit process context and preserves full register contents.
# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop;
open STDOUT,">$output";

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
# @R - floating-point result registers; @N - modulus (np) registers;
# @A - multiplicand (ap) registers (integer then FP halves);
# @B - multiplier (bp) registers.  @N and @B are replicated because the
# montmul/montsqr register convention wraps around the window registers.
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5)));	@N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7)));	@B=(@B,@B,map("%o$_",(0..3)));

########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
# Emits one "single-op" Montgomery multiply/square routine for $NUM
# 64-bit limbs.  $sentinel is 0 on 64-bit ABI and all-ones in the upper
# half on 32-bit ABI; it is OR-ed into every %fp so that a 32-bit
# context switch that zeroes upper register halves can be detected
# afterwards (see header comment), in which case the routine returns 0
# and the caller must fall back / retry.
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
# First 14 limbs go to integer registers (merged from two 32-bit
# loads), the rest to FP registers via fsrc2.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
# Modulus limbs are spread across successive register windows; a new
# 'save' is emitted each time the window's registers are exhausted.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
# If ap==bp this is a squaring; branch to the montsqr entry.
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
# 0x81b02920+$NUM-1 is the raw encoding of the T4 'montmul' hardware
# instruction (not known to older assemblers, hence .word).
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
# Move the integer result limbs into FP registers so they survive the
# remaining window restores, then store everything to rp.
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word   0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

# Emit single-op routines for 512-, 1024-, 1536- and 2048-bit operands.
for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}

########################################################################
#
# Derive the %ccr condition-code value that load_b[_pair] below uses to
# select one of the table entries, from power index $pwr; also advances
# $ptbl to the right offset within the first cache line.
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr,	2, %o4
	and	$pwr,	3, %o5
	and	%o4,	7, %o4
	sll	%o5,	3, %o5	! offset within first cache line
	add	%o5,	$ptbl, $ptbl	! of the pwrtbl
	or	%g0,	1, %o5
	sll	%o5,	%o4, $ccr
___
$code.=<<___	if (!$skip_wr);
	wr	$ccr,	%g0, %ccr
___
}
# Gather a pair of 64-bit values from the power table.  Every one of 16
# candidate entries is loaded and the wanted pair is selected with
# conditional moves keyed off %ccr (set by load_ccr) — presumably so
# that the memory access pattern is independent of the secret power
# index (cache-timing defence); TODO confirm against callers.
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	movvs	%icc, %o4, $B0
	ldx	[$pwrtbl+2*32],	%o4
	movvs	%icc, %o5, $B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc, %o4, $B0
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc, %o5, $B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc, %o4, $B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc, %o5, $B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc, %o4, $B0
	ldx	[$pwrtbl+5*32],%o4
	movcs	%xcc, %o5, $B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc, %o4, $B0
	ldx	[$pwrtbl+6*32],	%o4
	movvs	%xcc, %o5, $B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc, %o4, $B0
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc, %o5, $B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc, %o4, $B0
	add	$pwrtbl,16*32, $pwrtbl
	movneg	%xcc, %o5, $B1
___
}
# Same as load_b_pair but gathers a single value from 8 candidates.
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	movvs	%icc, %o4, $Bi
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc, %o5, $Bi
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc, %o4, $Bi
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc, %o5, $Bi
	ldx	[$pwrtbl+6*32],	%o5
	movvs	%xcc, %o4, $Bi
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc, %o5, $Bi
	add	$pwrtbl,8*32, $pwrtbl
	movneg	%xcc, %o4, $Bi
___
}

########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
# Emits one "multi-op" routine: repeated blocks of 5 Montgomery
# squarings followed by 1 multiplication by a value gathered from
# pwrtbl (classic fixed-window exponentiation inner loop).  Same
# $sentinel clobber-detection scheme as generate_bn_mul_mont_t4.
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
# Modulus limbs spread across successive register windows, as in the
# single-op generator above.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32, %o4		! unpack $pwr
	srl	$pwr,	%g0, %o5
	sub	%o4,	5, %o4
	mov	$pwrtbl, %o7
	sllx	%o4,	32, $pwr	! re-pack $pwr
	or	%o5,	$pwr, $pwr
	srl	%o5,	%o4, %o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr,	32, %o4		! unpack $pwr
	srl	$pwr,	%g0, %o5
	sub	%o4,	5, %o4
	mov	$pwrtbl, %i7
	sllx	%o4,	32, $pwr	! re-pack $pwr
	or	%o5,	$pwr, $pwr
	srl	%o5,	%o4, %o5
___
	&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
# 5 hardware Montgomery squarings followed by one multiplication by
# the gathered table entry; each is checked for abort via %fcc3 and,
# on 32-bit, via the $sentinel register-clobber test.
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4, %g0, %ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32, %o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

# Emit multi-op routines for 512-, 1024-, 1536- and 2048-bit operands.
for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}

{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]
	sllx	$t1,	32,	$n0
	add	$bp,	8,	$bp
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0],	$m0	! m0=bp[i]
	add	$bp,	8,	$bp

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],	$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___

# Same VIS3 fall-back, but the multiplier is gathered from a scattered
# power table (see bn_flip_n_scatter5_t4 below) via load_b/load_ccr.
# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");	# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1,	32,	$n0
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st_g5
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--
!.L1st_g5
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter_g5
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter_g5:
	wr	$ccr,	%g0,	%ccr
___
	&load_b($bp,$m0);		# m0=bp[i]
$code.=<<___;
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner_g5
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner_g5
	sub	$cnt,	8,	$cnt
!.Linner_g5
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter_g5
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub_g5
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp],	$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub_g5
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy_g5
	sub	$num,	8,	$cnt

.align	16
.Lcopy_g5:				! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy_g5
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}

# Helper: byte-order flip of a vector of 64-bit values (swap 32-bit
# halves while copying %o1 -> %o0, %o2 = count).
$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0],	%o4
	sub	%o2,	1,	%o2
	ld	[%o1+4],	%o5
	add	%o1,	8,	%o1
	st	%o5,	[%o0+0]
	st	%o4,	[%o0+4]
	brnz	%o2,	.Loop_flip
	add	%o0,	8,	%o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3,	3,	%o3
	srl	%o1,	1,	%o1
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
	sub	%o1,	1,	%o1
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	ld	[%o0+4],	%o5
	add	%o0,	8,	%o0
	sllx	%o5,	32,	%o5
	or	%o4,	%o5,	%o5
	stx	%o5,	[%o2]
	add	%o2,	32*8,	%o2	! entries are 32 slots apart
	brnz	%o1,	.Loop_flip_n_scatter5
	sub	%o1,	1,	%o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1,	1,	%o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1,	[%o0]
	add	%o0,	8,	%o0
	brnz	%o1,	.Loop_gather5
	sub	%o1,	1,	%o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

# emit_assembler() is provided by sparcv9_modes.pl; it post-processes
# $code (instruction substitutions) and prints the final assembly.
&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";