#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter,
# it won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
#	512-bit		+65%
#	1024-bit	+35%
#	2048-bit	+18%
#	4096-bit	+4%

# September 2016
#
# Add multiplication procedure operating on lengths divisible by 4
# and squaring procedure operating on lengths divisible by 8. Length
# is expressed in number of limbs. RSA private key operations are
# ~35-50% faster (more for longer keys) on contemporary high-end POWER
# processors in 64-bit builds, [mysteriously enough] more in 32-bit
# builds. On low-end 32-bit processors the performance improvement
# turned out to be marginal...

$flavour = shift;

# Select limb size and the matching 32-/64-bit mnemonics.  Everything
# downstream is written in terms of these macros so the same code body
# serves both ABIs.
if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;	# bytes per bignum limb
	$SIZE_T=4;		# sizeof(size_t) for stack slots
	$RZONE=	224;		# ABI red-zone size below the stack pointer

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$SIZE_T=8;
	$RZONE=	288;

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

$FRAME=8*$SIZE_T+$RZONE;	# fixed part of the stack frame
$LOCALS=8*$SIZE_T;		# offset of tp[] scratch area within the frame

# Locate the ppc-xlate.pl assembler translator next to this script or in
# the perlasm directory, and pipe our output through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# NOTE(review): "||" binds tighter than the list operator, so it attaches
# to shift() rather than to open(); a failed open is not actually caught
# here.  "or die" would check open's own result — verify before changing.
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";

# Argument registers per the PowerPC calling convention.  The names mirror
# the usual bn_mul_mont signature (rp, ap, bp, np, n0, num) — rp receives
# the result, ap/bp the multiplicands, np the modulus, n0 the Montgomery
# constant, num the limb count.
$sp="r1";
$toc="r2";
$rp="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";

{
# Scope for bn_mul_mont_int's register map.  r3 is recycled: $ovf keeps
# the top overflow bit while the result pointer moves to r9.
my $ovf=$rp;
my $rp="r9";	# $rp is reassigned
my $aj="r10";
my $nj="r11";
my $tj="r12";
# non-volatile registers
my $i="r20";	# outer-loop byte index into bp[]
my $j="r21";	# inner-loop byte index into ap[]/np[]
my $tp="r22";	# pointer into tp[] scratch on stack
my $m0="r23";	# current b[i]
my $m1="r24";	# Montgomery multiplier tp[0]*n0
my $lo0="r25";	# running lo/hi of the ap-side product
my $hi0="r26";
my $lo1="r27";	# running lo/hi of the np-side product
my $hi1="r28";
my $alo="r29";	# pipelined ap[j]*m0 partial product
my $ahi="r30";
my $nlo="r31";	# pipelined np[j]*m1 partial product
#
my $nhi="r0";

# Generic Montgomery multiplication, one b[i] limb per outer iteration.
# Returns 0 in r3 (caller should fall back to C code) when it declines
# the input, 1 on success.
$code=<<___;
.machine	"any"
.text

.globl	.bn_mul_mont_int
.align	5
.bn_mul_mont_int:
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
___
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit

.align	4
Lcopy:				; conditional copy
	$LDX	$tj,$tp,$j
	$LDX	$aj,$rp,$j
	and	$tj,$tj,$ovf
	andc	$aj,$aj,$ovf
	$STX	$j,$tp,$j	; zap at once
	or	$aj,$aj,$tj
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0
.size	.bn_mul_mont_int,.-.bn_mul_mont_int
___
}
if (1) {
# Register map for the 4x multiplication procedure: four a-limbs, four
# n-limbs, four running products, five accumulators, plus loop state —
# 22 volatile/non-volatile registers mapped from r9..r12,r14..r31.
my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
my ($carry,$zero) = ($rp,"r0");

# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
# +8*size_t	+-------------------------------+
#		| 4 "n0*t0"			|
#		.				.
#		.				.
# +12*size_t	+-------------------------------+
#		| size_t tmp[num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		| topmost carry			|
#		.				.
376# -18*size_t +-------------------------------+ 377# | 18 saved gpr, r14-r31 | 378# . . 379# . . 380# +-------------------------------+ 381$code.=<<___; 382.globl .bn_mul4x_mont_int 383.align 5 384.bn_mul4x_mont_int: 385 andi. r0,$num,7 386 bne .Lmul4x_do 387 $UCMP $ap,$bp 388 bne .Lmul4x_do 389 b .Lsqr8x_do 390.Lmul4x_do: 391 slwi $num,$num,`log($SIZE_T)/log(2)` 392 mr $a0,$sp 393 li $a1,-32*$SIZE_T 394 sub $a1,$a1,$num 395 $STUX $sp,$sp,$a1 # alloca 396 397 $PUSH r14,-$SIZE_T*18($a0) 398 $PUSH r15,-$SIZE_T*17($a0) 399 $PUSH r16,-$SIZE_T*16($a0) 400 $PUSH r17,-$SIZE_T*15($a0) 401 $PUSH r18,-$SIZE_T*14($a0) 402 $PUSH r19,-$SIZE_T*13($a0) 403 $PUSH r20,-$SIZE_T*12($a0) 404 $PUSH r21,-$SIZE_T*11($a0) 405 $PUSH r22,-$SIZE_T*10($a0) 406 $PUSH r23,-$SIZE_T*9($a0) 407 $PUSH r24,-$SIZE_T*8($a0) 408 $PUSH r25,-$SIZE_T*7($a0) 409 $PUSH r26,-$SIZE_T*6($a0) 410 $PUSH r27,-$SIZE_T*5($a0) 411 $PUSH r28,-$SIZE_T*4($a0) 412 $PUSH r29,-$SIZE_T*3($a0) 413 $PUSH r30,-$SIZE_T*2($a0) 414 $PUSH r31,-$SIZE_T*1($a0) 415 416 subi $ap,$ap,$SIZE_T # bias by -1 417 subi $np,$np,$SIZE_T # bias by -1 418 subi $rp,$rp,$SIZE_T # bias by -1 419 $LD $n0,0($n0) # *n0 420 421 add $t0,$bp,$num 422 add $ap_end,$ap,$num 423 subi $t0,$t0,$SIZE_T*4 # &b[num-4] 424 425 $LD $bi,$SIZE_T*0($bp) # b[0] 426 li $acc0,0 427 $LD $a0,$SIZE_T*1($ap) # a[0..3] 428 li $acc1,0 429 $LD $a1,$SIZE_T*2($ap) 430 li $acc2,0 431 $LD $a2,$SIZE_T*3($ap) 432 li $acc3,0 433 $LDU $a3,$SIZE_T*4($ap) 434 $LD $m0,$SIZE_T*1($np) # n[0..3] 435 $LD $m1,$SIZE_T*2($np) 436 $LD $m2,$SIZE_T*3($np) 437 $LDU $m3,$SIZE_T*4($np) 438 439 $PUSH $rp,$SIZE_T*6($sp) # offload rp and &b[num-4] 440 $PUSH $t0,$SIZE_T*7($sp) 441 li $carry,0 442 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 443 li $cnt,0 444 li $zero,0 445 b .Loop_mul4x_1st_reduction 446 447.align 5 448.Loop_mul4x_1st_reduction: 449 $UMULL $t0,$a0,$bi # lo(a[0..3]*b[0]) 450 addze $carry,$carry # modulo-scheduled 451 $UMULL $t1,$a1,$bi 452 addi $cnt,$cnt,$SIZE_T 453 $UMULL 
$t2,$a2,$bi 454 andi. $cnt,$cnt,$SIZE_T*4-1 455 $UMULL $t3,$a3,$bi 456 addc $acc0,$acc0,$t0 457 $UMULH $t0,$a0,$bi # hi(a[0..3]*b[0]) 458 adde $acc1,$acc1,$t1 459 $UMULH $t1,$a1,$bi 460 adde $acc2,$acc2,$t2 461 $UMULL $mi,$acc0,$n0 # t[0]*n0 462 adde $acc3,$acc3,$t3 463 $UMULH $t2,$a2,$bi 464 addze $acc4,$zero 465 $UMULH $t3,$a3,$bi 466 $LDX $bi,$bp,$cnt # next b[i] (or b[0]) 467 addc $acc1,$acc1,$t0 468 # (*) mul $t0,$m0,$mi # lo(n[0..3]*t[0]*n0) 469 $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 470 adde $acc2,$acc2,$t1 471 $UMULL $t1,$m1,$mi 472 adde $acc3,$acc3,$t2 473 $UMULL $t2,$m2,$mi 474 adde $acc4,$acc4,$t3 # can't overflow 475 $UMULL $t3,$m3,$mi 476 # (*) addc $acc0,$acc0,$t0 477 # (*) As for removal of first multiplication and addition 478 # instructions. The outcome of first addition is 479 # guaranteed to be zero, which leaves two computationally 480 # significant outcomes: it either carries or not. Then 481 # question is when does it carry? Is there alternative 482 # way to deduce it? If you follow operations, you can 483 # observe that condition for carry is quite simple: 484 # $acc0 being non-zero. So that carry can be calculated 485 # by adding -1 to $acc0. That's what next instruction does. 
486 addic $acc0,$acc0,-1 # (*), discarded 487 $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0) 488 adde $acc0,$acc1,$t1 489 $UMULH $t1,$m1,$mi 490 adde $acc1,$acc2,$t2 491 $UMULH $t2,$m2,$mi 492 adde $acc2,$acc3,$t3 493 $UMULH $t3,$m3,$mi 494 adde $acc3,$acc4,$carry 495 addze $carry,$zero 496 addc $acc0,$acc0,$t0 497 adde $acc1,$acc1,$t1 498 adde $acc2,$acc2,$t2 499 adde $acc3,$acc3,$t3 500 #addze $carry,$carry 501 bne .Loop_mul4x_1st_reduction 502 503 $UCMP $ap_end,$ap 504 beq .Lmul4x4_post_condition 505 506 $LD $a0,$SIZE_T*1($ap) # a[4..7] 507 $LD $a1,$SIZE_T*2($ap) 508 $LD $a2,$SIZE_T*3($ap) 509 $LDU $a3,$SIZE_T*4($ap) 510 $LD $mi,$SIZE_T*8($sp) # a[0]*n0 511 $LD $m0,$SIZE_T*1($np) # n[4..7] 512 $LD $m1,$SIZE_T*2($np) 513 $LD $m2,$SIZE_T*3($np) 514 $LDU $m3,$SIZE_T*4($np) 515 b .Loop_mul4x_1st_tail 516 517.align 5 518.Loop_mul4x_1st_tail: 519 $UMULL $t0,$a0,$bi # lo(a[4..7]*b[i]) 520 addze $carry,$carry # modulo-scheduled 521 $UMULL $t1,$a1,$bi 522 addi $cnt,$cnt,$SIZE_T 523 $UMULL $t2,$a2,$bi 524 andi. 
$cnt,$cnt,$SIZE_T*4-1 525 $UMULL $t3,$a3,$bi 526 addc $acc0,$acc0,$t0 527 $UMULH $t0,$a0,$bi # hi(a[4..7]*b[i]) 528 adde $acc1,$acc1,$t1 529 $UMULH $t1,$a1,$bi 530 adde $acc2,$acc2,$t2 531 $UMULH $t2,$a2,$bi 532 adde $acc3,$acc3,$t3 533 $UMULH $t3,$a3,$bi 534 addze $acc4,$zero 535 $LDX $bi,$bp,$cnt # next b[i] (or b[0]) 536 addc $acc1,$acc1,$t0 537 $UMULL $t0,$m0,$mi # lo(n[4..7]*a[0]*n0) 538 adde $acc2,$acc2,$t1 539 $UMULL $t1,$m1,$mi 540 adde $acc3,$acc3,$t2 541 $UMULL $t2,$m2,$mi 542 adde $acc4,$acc4,$t3 # can't overflow 543 $UMULL $t3,$m3,$mi 544 addc $acc0,$acc0,$t0 545 $UMULH $t0,$m0,$mi # hi(n[4..7]*a[0]*n0) 546 adde $acc1,$acc1,$t1 547 $UMULH $t1,$m1,$mi 548 adde $acc2,$acc2,$t2 549 $UMULH $t2,$m2,$mi 550 adde $acc3,$acc3,$t3 551 adde $acc4,$acc4,$carry 552 $UMULH $t3,$m3,$mi 553 addze $carry,$zero 554 addi $mi,$sp,$SIZE_T*8 555 $LDX $mi,$mi,$cnt # next t[0]*n0 556 $STU $acc0,$SIZE_T($tp) # word of result 557 addc $acc0,$acc1,$t0 558 adde $acc1,$acc2,$t1 559 adde $acc2,$acc3,$t2 560 adde $acc3,$acc4,$t3 561 #addze $carry,$carry 562 bne .Loop_mul4x_1st_tail 563 564 sub $t1,$ap_end,$num # rewinded $ap 565 $UCMP $ap_end,$ap # done yet? 
566 beq .Lmul4x_proceed 567 568 $LD $a0,$SIZE_T*1($ap) 569 $LD $a1,$SIZE_T*2($ap) 570 $LD $a2,$SIZE_T*3($ap) 571 $LDU $a3,$SIZE_T*4($ap) 572 $LD $m0,$SIZE_T*1($np) 573 $LD $m1,$SIZE_T*2($np) 574 $LD $m2,$SIZE_T*3($np) 575 $LDU $m3,$SIZE_T*4($np) 576 b .Loop_mul4x_1st_tail 577 578.align 5 579.Lmul4x_proceed: 580 $LDU $bi,$SIZE_T*4($bp) # *++b 581 addze $carry,$carry # topmost carry 582 $LD $a0,$SIZE_T*1($t1) 583 $LD $a1,$SIZE_T*2($t1) 584 $LD $a2,$SIZE_T*3($t1) 585 $LD $a3,$SIZE_T*4($t1) 586 addi $ap,$t1,$SIZE_T*4 587 sub $np,$np,$num # rewind np 588 589 $ST $acc0,$SIZE_T*1($tp) # result 590 $ST $acc1,$SIZE_T*2($tp) 591 $ST $acc2,$SIZE_T*3($tp) 592 $ST $acc3,$SIZE_T*4($tp) 593 $ST $carry,$SIZE_T*5($tp) # save topmost carry 594 $LD $acc0,$SIZE_T*12($sp) # t[0..3] 595 $LD $acc1,$SIZE_T*13($sp) 596 $LD $acc2,$SIZE_T*14($sp) 597 $LD $acc3,$SIZE_T*15($sp) 598 599 $LD $m0,$SIZE_T*1($np) # n[0..3] 600 $LD $m1,$SIZE_T*2($np) 601 $LD $m2,$SIZE_T*3($np) 602 $LDU $m3,$SIZE_T*4($np) 603 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 604 li $carry,0 605 b .Loop_mul4x_reduction 606 607.align 5 608.Loop_mul4x_reduction: 609 $UMULL $t0,$a0,$bi # lo(a[0..3]*b[4]) 610 addze $carry,$carry # modulo-scheduled 611 $UMULL $t1,$a1,$bi 612 addi $cnt,$cnt,$SIZE_T 613 $UMULL $t2,$a2,$bi 614 andi. 
$cnt,$cnt,$SIZE_T*4-1 615 $UMULL $t3,$a3,$bi 616 addc $acc0,$acc0,$t0 617 $UMULH $t0,$a0,$bi # hi(a[0..3]*b[4]) 618 adde $acc1,$acc1,$t1 619 $UMULH $t1,$a1,$bi 620 adde $acc2,$acc2,$t2 621 $UMULL $mi,$acc0,$n0 # t[0]*n0 622 adde $acc3,$acc3,$t3 623 $UMULH $t2,$a2,$bi 624 addze $acc4,$zero 625 $UMULH $t3,$a3,$bi 626 $LDX $bi,$bp,$cnt # next b[i] 627 addc $acc1,$acc1,$t0 628 # (*) mul $t0,$m0,$mi 629 $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 630 adde $acc2,$acc2,$t1 631 $UMULL $t1,$m1,$mi # lo(n[0..3]*t[0]*n0 632 adde $acc3,$acc3,$t2 633 $UMULL $t2,$m2,$mi 634 adde $acc4,$acc4,$t3 # can't overflow 635 $UMULL $t3,$m3,$mi 636 # (*) addc $acc0,$acc0,$t0 637 addic $acc0,$acc0,-1 # (*), discarded 638 $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0 639 adde $acc0,$acc1,$t1 640 $UMULH $t1,$m1,$mi 641 adde $acc1,$acc2,$t2 642 $UMULH $t2,$m2,$mi 643 adde $acc2,$acc3,$t3 644 $UMULH $t3,$m3,$mi 645 adde $acc3,$acc4,$carry 646 addze $carry,$zero 647 addc $acc0,$acc0,$t0 648 adde $acc1,$acc1,$t1 649 adde $acc2,$acc2,$t2 650 adde $acc3,$acc3,$t3 651 #addze $carry,$carry 652 bne .Loop_mul4x_reduction 653 654 $LD $t0,$SIZE_T*5($tp) # t[4..7] 655 addze $carry,$carry 656 $LD $t1,$SIZE_T*6($tp) 657 $LD $t2,$SIZE_T*7($tp) 658 $LD $t3,$SIZE_T*8($tp) 659 $LD $a0,$SIZE_T*1($ap) # a[4..7] 660 $LD $a1,$SIZE_T*2($ap) 661 $LD $a2,$SIZE_T*3($ap) 662 $LDU $a3,$SIZE_T*4($ap) 663 addc $acc0,$acc0,$t0 664 adde $acc1,$acc1,$t1 665 adde $acc2,$acc2,$t2 666 adde $acc3,$acc3,$t3 667 #addze $carry,$carry 668 669 $LD $mi,$SIZE_T*8($sp) # t[0]*n0 670 $LD $m0,$SIZE_T*1($np) # n[4..7] 671 $LD $m1,$SIZE_T*2($np) 672 $LD $m2,$SIZE_T*3($np) 673 $LDU $m3,$SIZE_T*4($np) 674 b .Loop_mul4x_tail 675 676.align 5 677.Loop_mul4x_tail: 678 $UMULL $t0,$a0,$bi # lo(a[4..7]*b[4]) 679 addze $carry,$carry # modulo-scheduled 680 $UMULL $t1,$a1,$bi 681 addi $cnt,$cnt,$SIZE_T 682 $UMULL $t2,$a2,$bi 683 andi. 
$cnt,$cnt,$SIZE_T*4-1 684 $UMULL $t3,$a3,$bi 685 addc $acc0,$acc0,$t0 686 $UMULH $t0,$a0,$bi # hi(a[4..7]*b[4]) 687 adde $acc1,$acc1,$t1 688 $UMULH $t1,$a1,$bi 689 adde $acc2,$acc2,$t2 690 $UMULH $t2,$a2,$bi 691 adde $acc3,$acc3,$t3 692 $UMULH $t3,$a3,$bi 693 addze $acc4,$zero 694 $LDX $bi,$bp,$cnt # next b[i] 695 addc $acc1,$acc1,$t0 696 $UMULL $t0,$m0,$mi # lo(n[4..7]*t[0]*n0) 697 adde $acc2,$acc2,$t1 698 $UMULL $t1,$m1,$mi 699 adde $acc3,$acc3,$t2 700 $UMULL $t2,$m2,$mi 701 adde $acc4,$acc4,$t3 # can't overflow 702 $UMULL $t3,$m3,$mi 703 addc $acc0,$acc0,$t0 704 $UMULH $t0,$m0,$mi # hi(n[4..7]*t[0]*n0) 705 adde $acc1,$acc1,$t1 706 $UMULH $t1,$m1,$mi 707 adde $acc2,$acc2,$t2 708 $UMULH $t2,$m2,$mi 709 adde $acc3,$acc3,$t3 710 $UMULH $t3,$m3,$mi 711 adde $acc4,$acc4,$carry 712 addi $mi,$sp,$SIZE_T*8 713 $LDX $mi,$mi,$cnt # next a[0]*n0 714 addze $carry,$zero 715 $STU $acc0,$SIZE_T($tp) # word of result 716 addc $acc0,$acc1,$t0 717 adde $acc1,$acc2,$t1 718 adde $acc2,$acc3,$t2 719 adde $acc3,$acc4,$t3 720 #addze $carry,$carry 721 bne .Loop_mul4x_tail 722 723 $LD $t0,$SIZE_T*5($tp) # next t[i] or topmost carry 724 sub $t1,$np,$num # rewinded np? 725 addze $carry,$carry 726 $UCMP $ap_end,$ap # done yet? 
727 beq .Loop_mul4x_break 728 729 $LD $t1,$SIZE_T*6($tp) 730 $LD $t2,$SIZE_T*7($tp) 731 $LD $t3,$SIZE_T*8($tp) 732 $LD $a0,$SIZE_T*1($ap) 733 $LD $a1,$SIZE_T*2($ap) 734 $LD $a2,$SIZE_T*3($ap) 735 $LDU $a3,$SIZE_T*4($ap) 736 addc $acc0,$acc0,$t0 737 adde $acc1,$acc1,$t1 738 adde $acc2,$acc2,$t2 739 adde $acc3,$acc3,$t3 740 #addze $carry,$carry 741 742 $LD $m0,$SIZE_T*1($np) # n[4..7] 743 $LD $m1,$SIZE_T*2($np) 744 $LD $m2,$SIZE_T*3($np) 745 $LDU $m3,$SIZE_T*4($np) 746 b .Loop_mul4x_tail 747 748.align 5 749.Loop_mul4x_break: 750 $POP $t2,$SIZE_T*6($sp) # pull rp and &b[num-4] 751 $POP $t3,$SIZE_T*7($sp) 752 addc $a0,$acc0,$t0 # accumulate topmost carry 753 $LD $acc0,$SIZE_T*12($sp) # t[0..3] 754 addze $a1,$acc1 755 $LD $acc1,$SIZE_T*13($sp) 756 addze $a2,$acc2 757 $LD $acc2,$SIZE_T*14($sp) 758 addze $a3,$acc3 759 $LD $acc3,$SIZE_T*15($sp) 760 addze $carry,$carry # topmost carry 761 $ST $a0,$SIZE_T*1($tp) # result 762 sub $ap,$ap_end,$num # rewind ap 763 $ST $a1,$SIZE_T*2($tp) 764 $ST $a2,$SIZE_T*3($tp) 765 $ST $a3,$SIZE_T*4($tp) 766 $ST $carry,$SIZE_T*5($tp) # store topmost carry 767 768 $LD $m0,$SIZE_T*1($t1) # n[0..3] 769 $LD $m1,$SIZE_T*2($t1) 770 $LD $m2,$SIZE_T*3($t1) 771 $LD $m3,$SIZE_T*4($t1) 772 addi $np,$t1,$SIZE_T*4 773 $UCMP $bp,$t3 # done yet? 774 beq .Lmul4x_post 775 776 $LDU $bi,$SIZE_T*4($bp) 777 $LD $a0,$SIZE_T*1($ap) # a[0..3] 778 $LD $a1,$SIZE_T*2($ap) 779 $LD $a2,$SIZE_T*3($ap) 780 $LDU $a3,$SIZE_T*4($ap) 781 li $carry,0 782 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 783 b .Loop_mul4x_reduction 784 785.align 5 786.Lmul4x_post: 787 # Final step. We see if result is larger than modulus, and 788 # if it is, subtract the modulus. But comparison implies 789 # subtraction. So we subtract modulus, see if it borrowed, 790 # and conditionally copy original value. 
791 srwi $cnt,$num,`log($SIZE_T)/log(2)+2` 792 mr $bp,$t2 # &rp[-1] 793 subi $cnt,$cnt,1 794 mr $ap_end,$t2 # &rp[-1] copy 795 subfc $t0,$m0,$acc0 796 addi $tp,$sp,$SIZE_T*15 797 subfe $t1,$m1,$acc1 798 799 mtctr $cnt 800.Lmul4x_sub: 801 $LD $m0,$SIZE_T*1($np) 802 $LD $acc0,$SIZE_T*1($tp) 803 subfe $t2,$m2,$acc2 804 $LD $m1,$SIZE_T*2($np) 805 $LD $acc1,$SIZE_T*2($tp) 806 subfe $t3,$m3,$acc3 807 $LD $m2,$SIZE_T*3($np) 808 $LD $acc2,$SIZE_T*3($tp) 809 $LDU $m3,$SIZE_T*4($np) 810 $LDU $acc3,$SIZE_T*4($tp) 811 $ST $t0,$SIZE_T*1($bp) 812 $ST $t1,$SIZE_T*2($bp) 813 subfe $t0,$m0,$acc0 814 $ST $t2,$SIZE_T*3($bp) 815 $STU $t3,$SIZE_T*4($bp) 816 subfe $t1,$m1,$acc1 817 bdnz .Lmul4x_sub 818 819 $LD $a0,$SIZE_T*1($ap_end) 820 $ST $t0,$SIZE_T*1($bp) 821 $LD $t0,$SIZE_T*12($sp) 822 subfe $t2,$m2,$acc2 823 $LD $a1,$SIZE_T*2($ap_end) 824 $ST $t1,$SIZE_T*2($bp) 825 $LD $t1,$SIZE_T*13($sp) 826 subfe $t3,$m3,$acc3 827 subfe $carry,$zero,$carry # did it borrow? 828 addi $tp,$sp,$SIZE_T*12 829 $LD $a2,$SIZE_T*3($ap_end) 830 $ST $t2,$SIZE_T*3($bp) 831 $LD $t2,$SIZE_T*14($sp) 832 $LD $a3,$SIZE_T*4($ap_end) 833 $ST $t3,$SIZE_T*4($bp) 834 $LD $t3,$SIZE_T*15($sp) 835 836 mtctr $cnt 837.Lmul4x_cond_copy: 838 and $t0,$t0,$carry 839 andc $a0,$a0,$carry 840 $ST $zero,$SIZE_T*0($tp) # wipe stack clean 841 and $t1,$t1,$carry 842 andc $a1,$a1,$carry 843 $ST $zero,$SIZE_T*1($tp) 844 and $t2,$t2,$carry 845 andc $a2,$a2,$carry 846 $ST $zero,$SIZE_T*2($tp) 847 and $t3,$t3,$carry 848 andc $a3,$a3,$carry 849 $ST $zero,$SIZE_T*3($tp) 850 or $acc0,$t0,$a0 851 $LD $a0,$SIZE_T*5($ap_end) 852 $LD $t0,$SIZE_T*4($tp) 853 or $acc1,$t1,$a1 854 $LD $a1,$SIZE_T*6($ap_end) 855 $LD $t1,$SIZE_T*5($tp) 856 or $acc2,$t2,$a2 857 $LD $a2,$SIZE_T*7($ap_end) 858 $LD $t2,$SIZE_T*6($tp) 859 or $acc3,$t3,$a3 860 $LD $a3,$SIZE_T*8($ap_end) 861 $LD $t3,$SIZE_T*7($tp) 862 addi $tp,$tp,$SIZE_T*4 863 $ST $acc0,$SIZE_T*1($ap_end) 864 $ST $acc1,$SIZE_T*2($ap_end) 865 $ST $acc2,$SIZE_T*3($ap_end) 866 $STU $acc3,$SIZE_T*4($ap_end) 
867 bdnz .Lmul4x_cond_copy 868 869 $POP $bp,0($sp) # pull saved sp 870 and $t0,$t0,$carry 871 andc $a0,$a0,$carry 872 $ST $zero,$SIZE_T*0($tp) 873 and $t1,$t1,$carry 874 andc $a1,$a1,$carry 875 $ST $zero,$SIZE_T*1($tp) 876 and $t2,$t2,$carry 877 andc $a2,$a2,$carry 878 $ST $zero,$SIZE_T*2($tp) 879 and $t3,$t3,$carry 880 andc $a3,$a3,$carry 881 $ST $zero,$SIZE_T*3($tp) 882 or $acc0,$t0,$a0 883 or $acc1,$t1,$a1 884 $ST $zero,$SIZE_T*4($tp) 885 or $acc2,$t2,$a2 886 or $acc3,$t3,$a3 887 $ST $acc0,$SIZE_T*1($ap_end) 888 $ST $acc1,$SIZE_T*2($ap_end) 889 $ST $acc2,$SIZE_T*3($ap_end) 890 $ST $acc3,$SIZE_T*4($ap_end) 891 892 b .Lmul4x_done 893 894.align 4 895.Lmul4x4_post_condition: 896 $POP $ap,$SIZE_T*6($sp) # pull &rp[-1] 897 $POP $bp,0($sp) # pull saved sp 898 addze $carry,$carry # modulo-scheduled 899 # $acc0-3,$carry hold result, $m0-3 hold modulus 900 subfc $a0,$m0,$acc0 901 subfe $a1,$m1,$acc1 902 subfe $a2,$m2,$acc2 903 subfe $a3,$m3,$acc3 904 subfe $carry,$zero,$carry # did it borrow? 905 906 and $m0,$m0,$carry 907 and $m1,$m1,$carry 908 addc $a0,$a0,$m0 909 and $m2,$m2,$carry 910 adde $a1,$a1,$m1 911 and $m3,$m3,$carry 912 adde $a2,$a2,$m2 913 adde $a3,$a3,$m3 914 915 $ST $a0,$SIZE_T*1($ap) # write result 916 $ST $a1,$SIZE_T*2($ap) 917 $ST $a2,$SIZE_T*3($ap) 918 $ST $a3,$SIZE_T*4($ap) 919 920.Lmul4x_done: 921 $ST $zero,$SIZE_T*8($sp) # wipe stack clean 922 $ST $zero,$SIZE_T*9($sp) 923 $ST $zero,$SIZE_T*10($sp) 924 $ST $zero,$SIZE_T*11($sp) 925 li r3,1 # signal "done" 926 $POP r14,-$SIZE_T*18($bp) 927 $POP r15,-$SIZE_T*17($bp) 928 $POP r16,-$SIZE_T*16($bp) 929 $POP r17,-$SIZE_T*15($bp) 930 $POP r18,-$SIZE_T*14($bp) 931 $POP r19,-$SIZE_T*13($bp) 932 $POP r20,-$SIZE_T*12($bp) 933 $POP r21,-$SIZE_T*11($bp) 934 $POP r22,-$SIZE_T*10($bp) 935 $POP r23,-$SIZE_T*9($bp) 936 $POP r24,-$SIZE_T*8($bp) 937 $POP r25,-$SIZE_T*7($bp) 938 $POP r26,-$SIZE_T*6($bp) 939 $POP r27,-$SIZE_T*5($bp) 940 $POP r28,-$SIZE_T*4($bp) 941 $POP r29,-$SIZE_T*3($bp) 942 $POP r30,-$SIZE_T*2($bp) 943 
$POP r31,-$SIZE_T*1($bp) 944 mr $sp,$bp 945 blr 946 .long 0 947 .byte 0,12,4,0x20,0x80,18,6,0 948 .long 0 949.size .bn_mul4x_mont_int,.-.bn_mul4x_mont_int 950___ 951} 952 953if (1) { 954######################################################################## 955# Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module. 956 957my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17)); 958my ($t0,$t1,$t2,$t3)=map("r$_",(18..21)); 959my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29)); 960my ($cnt,$carry,$zero)=("r30","r31","r0"); 961my ($tp,$ap_end,$na0)=($bp,$np,$carry); 962 963# sp----------->+-------------------------------+ 964# | saved sp | 965# +-------------------------------+ 966# . . 967# +12*size_t +-------------------------------+ 968# | size_t tmp[2*num] | 969# . . 970# . . 971# . . 972# +-------------------------------+ 973# . . 974# -18*size_t +-------------------------------+ 975# | 18 saved gpr, r14-r31 | 976# . . 977# . . 978# +-------------------------------+ 979$code.=<<___; 980.align 5 981__bn_sqr8x_mont: 982.Lsqr8x_do: 983 mr $a0,$sp 984 slwi $a1,$num,`log($SIZE_T)/log(2)+1` 985 li $a2,-32*$SIZE_T 986 sub $a1,$a2,$a1 987 slwi $num,$num,`log($SIZE_T)/log(2)` 988 $STUX $sp,$sp,$a1 # alloca 989 990 $PUSH r14,-$SIZE_T*18($a0) 991 $PUSH r15,-$SIZE_T*17($a0) 992 $PUSH r16,-$SIZE_T*16($a0) 993 $PUSH r17,-$SIZE_T*15($a0) 994 $PUSH r18,-$SIZE_T*14($a0) 995 $PUSH r19,-$SIZE_T*13($a0) 996 $PUSH r20,-$SIZE_T*12($a0) 997 $PUSH r21,-$SIZE_T*11($a0) 998 $PUSH r22,-$SIZE_T*10($a0) 999 $PUSH r23,-$SIZE_T*9($a0) 1000 $PUSH r24,-$SIZE_T*8($a0) 1001 $PUSH r25,-$SIZE_T*7($a0) 1002 $PUSH r26,-$SIZE_T*6($a0) 1003 $PUSH r27,-$SIZE_T*5($a0) 1004 $PUSH r28,-$SIZE_T*4($a0) 1005 $PUSH r29,-$SIZE_T*3($a0) 1006 $PUSH r30,-$SIZE_T*2($a0) 1007 $PUSH r31,-$SIZE_T*1($a0) 1008 1009 subi $ap,$ap,$SIZE_T # bias by -1 1010 subi $t0,$np,$SIZE_T # bias by -1 1011 subi $rp,$rp,$SIZE_T # bias by -1 1012 $LD $n0,0($n0) # *n0 1013 li $zero,0 1014 1015 
add $ap_end,$ap,$num 1016 $LD $a0,$SIZE_T*1($ap) 1017 #li $acc0,0 1018 $LD $a1,$SIZE_T*2($ap) 1019 li $acc1,0 1020 $LD $a2,$SIZE_T*3($ap) 1021 li $acc2,0 1022 $LD $a3,$SIZE_T*4($ap) 1023 li $acc3,0 1024 $LD $a4,$SIZE_T*5($ap) 1025 li $acc4,0 1026 $LD $a5,$SIZE_T*6($ap) 1027 li $acc5,0 1028 $LD $a6,$SIZE_T*7($ap) 1029 li $acc6,0 1030 $LDU $a7,$SIZE_T*8($ap) 1031 li $acc7,0 1032 1033 addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1034 subic. $cnt,$num,$SIZE_T*8 1035 b .Lsqr8x_zero_start 1036 1037.align 5 1038.Lsqr8x_zero: 1039 subic. $cnt,$cnt,$SIZE_T*8 1040 $ST $zero,$SIZE_T*1($tp) 1041 $ST $zero,$SIZE_T*2($tp) 1042 $ST $zero,$SIZE_T*3($tp) 1043 $ST $zero,$SIZE_T*4($tp) 1044 $ST $zero,$SIZE_T*5($tp) 1045 $ST $zero,$SIZE_T*6($tp) 1046 $ST $zero,$SIZE_T*7($tp) 1047 $ST $zero,$SIZE_T*8($tp) 1048.Lsqr8x_zero_start: 1049 $ST $zero,$SIZE_T*9($tp) 1050 $ST $zero,$SIZE_T*10($tp) 1051 $ST $zero,$SIZE_T*11($tp) 1052 $ST $zero,$SIZE_T*12($tp) 1053 $ST $zero,$SIZE_T*13($tp) 1054 $ST $zero,$SIZE_T*14($tp) 1055 $ST $zero,$SIZE_T*15($tp) 1056 $STU $zero,$SIZE_T*16($tp) 1057 bne .Lsqr8x_zero 1058 1059 $PUSH $rp,$SIZE_T*6($sp) # offload &rp[-1] 1060 $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1] 1061 $PUSH $n0,$SIZE_T*8($sp) # offload n0 1062 $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1] 1063 $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry 1064 addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1065 1066 # Multiply everything but a[i]*a[i] 1067.align 5 1068.Lsqr8x_outer_loop: 1069 # a[1]a[0] (i) 1070 # a[2]a[0] 1071 # a[3]a[0] 1072 # a[4]a[0] 1073 # a[5]a[0] 1074 # a[6]a[0] 1075 # a[7]a[0] 1076 # a[2]a[1] (ii) 1077 # a[3]a[1] 1078 # a[4]a[1] 1079 # a[5]a[1] 1080 # a[6]a[1] 1081 # a[7]a[1] 1082 # a[3]a[2] (iii) 1083 # a[4]a[2] 1084 # a[5]a[2] 1085 # a[6]a[2] 1086 # a[7]a[2] 1087 # a[4]a[3] (iv) 1088 # a[5]a[3] 1089 # a[6]a[3] 1090 # a[7]a[3] 1091 # a[5]a[4] (v) 1092 # a[6]a[4] 1093 # a[7]a[4] 1094 # a[6]a[5] (vi) 1095 # a[7]a[5] 1096 # a[7]a[6] (vii) 1097 1098 $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i) 1099 $UMULL 
$t1,$a2,$a0 1100 $UMULL $t2,$a3,$a0 1101 $UMULL $t3,$a4,$a0 1102 addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0]) 1103 $UMULL $t0,$a5,$a0 1104 adde $acc2,$acc2,$t1 1105 $UMULL $t1,$a6,$a0 1106 adde $acc3,$acc3,$t2 1107 $UMULL $t2,$a7,$a0 1108 adde $acc4,$acc4,$t3 1109 $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0]) 1110 adde $acc5,$acc5,$t0 1111 $UMULH $t0,$a2,$a0 1112 adde $acc6,$acc6,$t1 1113 $UMULH $t1,$a3,$a0 1114 adde $acc7,$acc7,$t2 1115 $UMULH $t2,$a4,$a0 1116 $ST $acc0,$SIZE_T*1($tp) # t[0] 1117 addze $acc0,$zero # t[8] 1118 $ST $acc1,$SIZE_T*2($tp) # t[1] 1119 addc $acc2,$acc2,$t3 # t[2]+lo(a[1]*a[0]) 1120 $UMULH $t3,$a5,$a0 1121 adde $acc3,$acc3,$t0 1122 $UMULH $t0,$a6,$a0 1123 adde $acc4,$acc4,$t1 1124 $UMULH $t1,$a7,$a0 1125 adde $acc5,$acc5,$t2 1126 $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii) 1127 adde $acc6,$acc6,$t3 1128 $UMULL $t3,$a3,$a1 1129 adde $acc7,$acc7,$t0 1130 $UMULL $t0,$a4,$a1 1131 adde $acc0,$acc0,$t1 1132 1133 $UMULL $t1,$a5,$a1 1134 addc $acc3,$acc3,$t2 1135 $UMULL $t2,$a6,$a1 1136 adde $acc4,$acc4,$t3 1137 $UMULL $t3,$a7,$a1 1138 adde $acc5,$acc5,$t0 1139 $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1]) 1140 adde $acc6,$acc6,$t1 1141 $UMULH $t1,$a3,$a1 1142 adde $acc7,$acc7,$t2 1143 $UMULH $t2,$a4,$a1 1144 adde $acc0,$acc0,$t3 1145 $UMULH $t3,$a5,$a1 1146 $ST $acc2,$SIZE_T*3($tp) # t[2] 1147 addze $acc1,$zero # t[9] 1148 $ST $acc3,$SIZE_T*4($tp) # t[3] 1149 addc $acc4,$acc4,$t0 1150 $UMULH $t0,$a6,$a1 1151 adde $acc5,$acc5,$t1 1152 $UMULH $t1,$a7,$a1 1153 adde $acc6,$acc6,$t2 1154 $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii) 1155 adde $acc7,$acc7,$t3 1156 $UMULL $t3,$a4,$a2 1157 adde $acc0,$acc0,$t0 1158 $UMULL $t0,$a5,$a2 1159 adde $acc1,$acc1,$t1 1160 1161 $UMULL $t1,$a6,$a2 1162 addc $acc5,$acc5,$t2 1163 $UMULL $t2,$a7,$a2 1164 adde $acc6,$acc6,$t3 1165 $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2]) 1166 adde $acc7,$acc7,$t0 1167 $UMULH $t0,$a4,$a2 1168 adde $acc0,$acc0,$t1 1169 $UMULH $t1,$a5,$a2 1170 adde $acc1,$acc1,$t2 1171 $UMULH $t2,$a6,$a2 1172 $ST 
$acc4,$SIZE_T*5($tp) # t[4] 1173 addze $acc2,$zero # t[10] 1174 $ST $acc5,$SIZE_T*6($tp) # t[5] 1175 addc $acc6,$acc6,$t3 1176 $UMULH $t3,$a7,$a2 1177 adde $acc7,$acc7,$t0 1178 $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv) 1179 adde $acc0,$acc0,$t1 1180 $UMULL $t1,$a5,$a3 1181 adde $acc1,$acc1,$t2 1182 $UMULL $t2,$a6,$a3 1183 adde $acc2,$acc2,$t3 1184 1185 $UMULL $t3,$a7,$a3 1186 addc $acc7,$acc7,$t0 1187 $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3]) 1188 adde $acc0,$acc0,$t1 1189 $UMULH $t1,$a5,$a3 1190 adde $acc1,$acc1,$t2 1191 $UMULH $t2,$a6,$a3 1192 adde $acc2,$acc2,$t3 1193 $UMULH $t3,$a7,$a3 1194 $ST $acc6,$SIZE_T*7($tp) # t[6] 1195 addze $acc3,$zero # t[11] 1196 $STU $acc7,$SIZE_T*8($tp) # t[7] 1197 addc $acc0,$acc0,$t0 1198 $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v) 1199 adde $acc1,$acc1,$t1 1200 $UMULL $t1,$a6,$a4 1201 adde $acc2,$acc2,$t2 1202 $UMULL $t2,$a7,$a4 1203 adde $acc3,$acc3,$t3 1204 1205 $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4]) 1206 addc $acc1,$acc1,$t0 1207 $UMULH $t0,$a6,$a4 1208 adde $acc2,$acc2,$t1 1209 $UMULH $t1,$a7,$a4 1210 adde $acc3,$acc3,$t2 1211 $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi) 1212 addze $acc4,$zero # t[12] 1213 addc $acc2,$acc2,$t3 1214 $UMULL $t3,$a7,$a5 1215 adde $acc3,$acc3,$t0 1216 $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5]) 1217 adde $acc4,$acc4,$t1 1218 1219 $UMULH $t1,$a7,$a5 1220 addc $acc3,$acc3,$t2 1221 $UMULL $t2,$a7,$a6 # lo(a[7]*a[6]) (vii) 1222 adde $acc4,$acc4,$t3 1223 $UMULH $t3,$a7,$a6 # hi(a[7]*a[6]) 1224 addze $acc5,$zero # t[13] 1225 addc $acc4,$acc4,$t0 1226 $UCMP $ap_end,$ap # done yet? 
# Finish the (vii) column, then either leave for .Lsqr8x_outer_break or
# fold the previous window's t[] back in and run .Lsqr8x_mul, which
# multiplies the 8 cached source limbs in $a0-$a7 by successive
# multiplier limbs (fetched with $LDX $n0,$rp,$cnt as $cnt walks the
# 8-word window; andi. updates CR0 and steers the bne at loop bottom).
	adde	$acc5,$acc5,$t1

	addc	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	# rewinded ap
	addze	$acc6,$zero		# t[14]
	add	$acc6,$acc6,$t3

	beq	.Lsqr8x_outer_break

	mr	$n0,$a0
	$LD	$a0,$SIZE_T*1($tp)
	$LD	$a1,$SIZE_T*2($tp)
	$LD	$a2,$SIZE_T*3($tp)
	$LD	$a3,$SIZE_T*4($tp)
	$LD	$a4,$SIZE_T*5($tp)
	$LD	$a5,$SIZE_T*6($tp)
	$LD	$a6,$SIZE_T*7($tp)
	$LD	$a7,$SIZE_T*8($tp)
	addc	$acc0,$acc0,$a0
	$LD	$a0,$SIZE_T*1($ap)
	adde	$acc1,$acc1,$a1
	$LD	$a1,$SIZE_T*2($ap)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($ap)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($ap)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($ap)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($ap)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($ap)
	subi	$rp,$ap,$SIZE_T*7	# $rp = base for $LDX multiplier fetch
	addze	$acc7,$a7
	$LDU	$a7,$SIZE_T*8($ap)
	#addze	$carry,$zero		# moved below
	li	$cnt,0
	b	.Lsqr8x_mul

	#                                                          a[8]a[0]
	#                                                      a[9]a[0]
	#                                                  a[a]a[0]
	#                                              a[b]a[0]
	#                                          a[c]a[0]
	#                                      a[d]a[0]
	#                                  a[e]a[0]
	#                              a[f]a[0]
	#                                                      a[8]a[1]
	#                              a[f]a[1]........................
	#                                                  a[8]a[2]
	#                              a[f]a[2]........................
	#                                              a[8]a[3]
	#                              a[f]a[3]........................
	#                                          a[8]a[4]
	#                              a[f]a[4]........................
	#                                      a[8]a[5]
	#                              a[f]a[5]........................
	#                                  a[8]a[6]
	#                              a[f]a[6]........................
	#                              a[8]a[7]
	#                              a[f]a[7]........................
.align	5
.Lsqr8x_mul:
	$UMULL	$t0,$a0,$n0
	addze	$carry,$zero		# carry bit, modulo-scheduled
	$UMULL	$t1,$a1,$n0
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$n0
	andi.	$cnt,$cnt,$SIZE_T*8-1
	$UMULL	$t3,$a3,$n0
	addc	$acc0,$acc0,$t0
	$UMULL	$t0,$a4,$n0
	adde	$acc1,$acc1,$t1
	$UMULL	$t1,$a5,$n0
	adde	$acc2,$acc2,$t2
	$UMULL	$t2,$a6,$n0
	adde	$acc3,$acc3,$t3
	$UMULL	$t3,$a7,$n0
	adde	$acc4,$acc4,$t0
	$UMULH	$t0,$a0,$n0
	adde	$acc5,$acc5,$t1
	$UMULH	$t1,$a1,$n0
	adde	$acc6,$acc6,$t2
	$UMULH	$t2,$a2,$n0
	adde	$acc7,$acc7,$t3
	$UMULH	$t3,$a3,$n0
	addze	$carry,$carry
	$STU	$acc0,$SIZE_T($tp)
	addc	$acc0,$acc1,$t0
	$UMULH	$t0,$a4,$n0
	adde	$acc1,$acc2,$t1
	$UMULH	$t1,$a5,$n0
	adde	$acc2,$acc3,$t2
	$UMULH	$t2,$a6,$n0
	adde	$acc3,$acc4,$t3
	$UMULH	$t3,$a7,$n0
	$LDX	$n0,$rp,$cnt		# next multiplier limb
	adde	$acc4,$acc5,$t0
	adde	$acc5,$acc6,$t1
	adde	$acc6,$acc7,$t2
	adde	$acc7,$carry,$t3
	#addze	$carry,$zero		# moved above
	bne	.Lsqr8x_mul
					# note that carry flag is guaranteed
					# to be zero at this point
	$UCMP	$ap,$ap_end		# done yet?
	beq	.Lsqr8x_break

	$LD	$a0,$SIZE_T*1($tp)
	$LD	$a1,$SIZE_T*2($tp)
	$LD	$a2,$SIZE_T*3($tp)
	$LD	$a3,$SIZE_T*4($tp)
	$LD	$a4,$SIZE_T*5($tp)
	$LD	$a5,$SIZE_T*6($tp)
	$LD	$a6,$SIZE_T*7($tp)
	$LD	$a7,$SIZE_T*8($tp)
	addc	$acc0,$acc0,$a0
	$LD	$a0,$SIZE_T*1($ap)
	adde	$acc1,$acc1,$a1
	$LD	$a1,$SIZE_T*2($ap)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($ap)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($ap)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($ap)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($ap)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($ap)
	adde	$acc7,$acc7,$a7
	$LDU	$a7,$SIZE_T*8($ap)
	#addze	$carry,$zero		# moved above
	b	.Lsqr8x_mul

.align	5
.Lsqr8x_break:
	$LD	$a0,$SIZE_T*8($rp)
	addi	$ap,$rp,$SIZE_T*15
	$LD	$a1,$SIZE_T*9($rp)
	sub.	$t0,$ap_end,$ap		# is it last iteration?
	$LD	$a2,$SIZE_T*10($rp)
	sub	$t1,$tp,$t0
	$LD	$a3,$SIZE_T*11($rp)
	$LD	$a4,$SIZE_T*12($rp)
	$LD	$a5,$SIZE_T*13($rp)
	$LD	$a6,$SIZE_T*14($rp)
	$LD	$a7,$SIZE_T*15($rp)
	beq	.Lsqr8x_outer_loop

	$ST	$acc0,$SIZE_T*1($tp)
	$LD	$acc0,$SIZE_T*1($t1)
	$ST	$acc1,$SIZE_T*2($tp)
	$LD	$acc1,$SIZE_T*2($t1)
	$ST	$acc2,$SIZE_T*3($tp)
	$LD	$acc2,$SIZE_T*3($t1)
	$ST	$acc3,$SIZE_T*4($tp)
	$LD	$acc3,$SIZE_T*4($t1)
	$ST	$acc4,$SIZE_T*5($tp)
	$LD	$acc4,$SIZE_T*5($t1)
	$ST	$acc5,$SIZE_T*6($tp)
	$LD	$acc5,$SIZE_T*6($t1)
	$ST	$acc6,$SIZE_T*7($tp)
	$LD	$acc6,$SIZE_T*7($t1)
	$ST	$acc7,$SIZE_T*8($tp)
	$LD	$acc7,$SIZE_T*8($t1)
	mr	$tp,$t1
	b	.Lsqr8x_outer_loop

.align	5
.Lsqr8x_outer_break:
	####################################################################
	# Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	$LD	$a1,$SIZE_T*1($t0)	# recall that $t0 is &a[-1]
	$LD	$a3,$SIZE_T*2($t0)
	$LD	$a5,$SIZE_T*3($t0)
	$LD	$a7,$SIZE_T*4($t0)
	addi	$ap,$t0,$SIZE_T*4
				# "tp[x]" comments are for num==8 case
	$LD	$t1,$SIZE_T*13($sp)	# =tp[1], t[0] is not interesting
	$LD	$t2,$SIZE_T*14($sp)
	$LD	$t3,$SIZE_T*15($sp)
	$LD	$t0,$SIZE_T*16($sp)

	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
	$ST	$acc1,$SIZE_T*2($tp)
	subi	$cnt,$cnt,1
	$ST	$acc2,$SIZE_T*3($tp)
	$ST	$acc3,$SIZE_T*4($tp)
	$ST	$acc4,$SIZE_T*5($tp)
	$ST	$acc5,$SIZE_T*6($tp)
	$ST	$acc6,$SIZE_T*7($tp)
	#$ST	$acc7,$SIZE_T*8($tp)	# tp[15] is not interesting
	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
	$UMULL	$acc0,$a1,$a1
	$UMULH	$a1,$a1,$a1
	add	$acc1,$t1,$t1		# <<1
	$SHRI	$t1,$t1,$BITS-1
	$UMULL	$a2,$a3,$a3
	$UMULH	$a3,$a3,$a3
	addc	$acc1,$acc1,$a1
	add	$acc2,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	add	$acc3,$t3,$t3
	$SHRI	$t3,$t3,$BITS-1
	or	$acc2,$acc2,$t1

# .Lsqr4x_shift_n_add doubles the accumulated off-diagonal sum and folds
# in the diagonal squares a[i]*a[i] (lo half into an even limb, hi half
# into the next odd one).  The <<1 is done as add-to-self, with an `or`
# bringing in the bit shifted down ($SHRI ...,$BITS-1) from the limb
# below, while adde threads the carry from the square additions.
	mtctr	$cnt
.Lsqr4x_shift_n_add:
	$UMULL	$a4,$a5,$a5
	$UMULH	$a5,$a5,$a5
	$LD	$t1,$SIZE_T*6($tp)	# =tp[5]
	$LD	$a1,$SIZE_T*1($ap)
	adde	$acc2,$acc2,$a2
	add	$acc4,$t0,$t0
	$SHRI	$t0,$t0,$BITS-1
	or	$acc3,$acc3,$t2
	$LD	$t2,$SIZE_T*7($tp)	# =tp[6]
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*2($ap)
	add	$acc5,$t1,$t1
	$SHRI	$t1,$t1,$BITS-1
	or	$acc4,$acc4,$t3
	$LD	$t3,$SIZE_T*8($tp)	# =tp[7]
	$UMULL	$a6,$a7,$a7
	$UMULH	$a7,$a7,$a7
	adde	$acc4,$acc4,$a4
	add	$acc6,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	or	$acc5,$acc5,$t0
	$LD	$t0,$SIZE_T*9($tp)	# =tp[8]
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*3($ap)
	add	$acc7,$t3,$t3
	$SHRI	$t3,$t3,$BITS-1
	or	$acc6,$acc6,$t1
	$LD	$t1,$SIZE_T*10($tp)	# =tp[9]
	$UMULL	$a0,$a1,$a1
	$UMULH	$a1,$a1,$a1
	adde	$acc6,$acc6,$a6
	$ST	$acc0,$SIZE_T*1($tp)	# tp[0]=
	add	$acc0,$t0,$t0
	$SHRI	$t0,$t0,$BITS-1
	or	$acc7,$acc7,$t2
	$LD	$t2,$SIZE_T*11($tp)	# =tp[10]
	adde	$acc7,$acc7,$a7
	$LDU	$a7,$SIZE_T*4($ap)
	$ST	$acc1,$SIZE_T*2($tp)	# tp[1]=
	add	$acc1,$t1,$t1
	$SHRI	$t1,$t1,$BITS-1
	or	$acc0,$acc0,$t3
	$LD	$t3,$SIZE_T*12($tp)	# =tp[11]
	$UMULL	$a2,$a3,$a3
	$UMULH	$a3,$a3,$a3
	adde	$acc0,$acc0,$a0
	$ST	$acc2,$SIZE_T*3($tp)	# tp[2]=
	add	$acc2,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	or	$acc1,$acc1,$t0
	$LD	$t0,$SIZE_T*13($tp)	# =tp[12]
	adde	$acc1,$acc1,$a1
	$ST	$acc3,$SIZE_T*4($tp)	# tp[3]=
	$ST	$acc4,$SIZE_T*5($tp)	# tp[4]=
	$ST	$acc5,$SIZE_T*6($tp)	# tp[5]=
	$ST	$acc6,$SIZE_T*7($tp)	# tp[6]=
	$STU	$acc7,$SIZE_T*8($tp)	# tp[7]=
	add	$acc3,$t3,$t3
	$SHRI	$t3,$t3,$BITS-1
	or	$acc2,$acc2,$t1
	bdnz	.Lsqr4x_shift_n_add
___
# From here on the registers that tracked a[] track the modulus n[]:
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	$POP	$np,$SIZE_T*7($sp)	# pull &np[-1] and n0
	$POP	$n0,$SIZE_T*8($sp)

	$UMULL	$a4,$a5,$a5
	$UMULH	$a5,$a5,$a5
	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
	$LD	$acc0,$SIZE_T*12($sp)	# =tp[0]
	$LD	$t1,$SIZE_T*6($tp)	# =tp[13]
	adde	$acc2,$acc2,$a2
	add	$acc4,$t0,$t0
	$SHRI	$t0,$t0,$BITS-1
	or	$acc3,$acc3,$t2
	$LD	$t2,$SIZE_T*7($tp)	# =tp[14]
	adde	$acc3,$acc3,$a3
	add	$acc5,$t1,$t1
	$SHRI	$t1,$t1,$BITS-1
	or	$acc4,$acc4,$t3
	$UMULL	$a6,$a7,$a7
	$UMULH	$a7,$a7,$a7
	adde	$acc4,$acc4,$a4
	add	$acc6,$t2,$t2
	$SHRI	$t2,$t2,$BITS-1
	or	$acc5,$acc5,$t0
	$ST	$acc1,$SIZE_T*2($tp)	# tp[9]=
	$LD	$acc1,$SIZE_T*13($sp)	# =tp[1]
	adde	$acc5,$acc5,$a5
	or	$acc6,$acc6,$t1
	$LD	$a0,$SIZE_T*1($np)
	$LD	$a1,$SIZE_T*2($np)
	adde	$acc6,$acc6,$a6
	$LD	$a2,$SIZE_T*3($np)
	$LD	$a3,$SIZE_T*4($np)
	adde	$acc7,$a7,$t2
	$LD	$a4,$SIZE_T*5($np)
	$LD	$a5,$SIZE_T*6($np)

	################################################################
	# Reduce by 8 limbs per iteration
	$UMULL	$na0,$n0,$acc0		# t[0]*n0
	li	$cnt,8
	$LD	$a6,$SIZE_T*7($np)
	add	$np_end,$np,$num
	$LDU	$a7,$SIZE_T*8($np)
	$ST	$acc2,$SIZE_T*3($tp)	# tp[10]=
	$LD	$acc2,$SIZE_T*14($sp)
	$ST	$acc3,$SIZE_T*4($tp)	# tp[11]=
	$LD	$acc3,$SIZE_T*15($sp)
	$ST	$acc4,$SIZE_T*5($tp)	# tp[12]=
	$LD	$acc4,$SIZE_T*16($sp)
	$ST	$acc5,$SIZE_T*6($tp)	# tp[13]=
	$LD	$acc5,$SIZE_T*17($sp)
	$ST	$acc6,$SIZE_T*7($tp)	# tp[14]=
	$LD	$acc6,$SIZE_T*18($sp)
	$ST	$acc7,$SIZE_T*8($tp)	# tp[15]=
	$LD	$acc7,$SIZE_T*19($sp)
	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
	mtctr	$cnt
	b	.Lsqr8x_reduction

.align	5
.Lsqr8x_reduction:
	# (*)	$UMULL	$t0,$a0,$na0	# lo(n[0-7])*lo(t[0]*n0)
	$UMULL	$t1,$a1,$na0
	$UMULL	$t2,$a2,$na0
	$STU	$na0,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
	$UMULL	$t3,$a3,$na0
	# (*)	addc	$acc0,$acc0,$t0
	# NOTE(review): the (*) lines rely on t[0]+lo(n[0]*na0) being 0 mod
	# 2^$BITS (Montgomery choice of n0), so only the carry is needed;
	# addic -1 produces that carry iff $acc0!=0 — confirm against
	# bn_mul_mont contract.
	addic	$acc0,$acc0,-1		# (*)
	$UMULL	$t0,$a4,$na0
	adde	$acc0,$acc1,$t1
	$UMULL	$t1,$a5,$na0
	adde	$acc1,$acc2,$t2
	$UMULL	$t2,$a6,$na0
	adde	$acc2,$acc3,$t3
	$UMULL	$t3,$a7,$na0
	adde	$acc3,$acc4,$t0
	$UMULH	$t0,$a0,$na0		# hi(n[0-7])*lo(t[0]*n0)
	adde	$acc4,$acc5,$t1
	$UMULH	$t1,$a1,$na0
	adde	$acc5,$acc6,$t2
	$UMULH	$t2,$a2,$na0
	adde	$acc6,$acc7,$t3
	$UMULH	$t3,$a3,$na0
	addze	$acc7,$zero
	addc	$acc0,$acc0,$t0
	$UMULH	$t0,$a4,$na0
	adde	$acc1,$acc1,$t1
	$UMULH	$t1,$a5,$na0
	adde	$acc2,$acc2,$t2
	$UMULH	$t2,$a6,$na0
	adde	$acc3,$acc3,$t3
	$UMULH	$t3,$a7,$na0
	$UMULL	$na0,$n0,$acc0		# next t[0]*n0
	adde	$acc4,$acc4,$t0
	adde	$acc5,$acc5,$t1
	adde	$acc6,$acc6,$t2
	adde	$acc7,$acc7,$t3
	bdnz	.Lsqr8x_reduction

	$LD	$t0,$SIZE_T*1($tp)
	$LD	$t1,$SIZE_T*2($tp)
	$LD	$t2,$SIZE_T*3($tp)
	$LD	$t3,$SIZE_T*4($tp)
	subi	$rp,$tp,$SIZE_T*7
	$UCMP	$np_end,$np		# done yet?
	addc	$acc0,$acc0,$t0
	$LD	$t0,$SIZE_T*5($tp)
	adde	$acc1,$acc1,$t1
	$LD	$t1,$SIZE_T*6($tp)
	adde	$acc2,$acc2,$t2
	$LD	$t2,$SIZE_T*7($tp)
	adde	$acc3,$acc3,$t3
	$LD	$t3,$SIZE_T*8($tp)
	adde	$acc4,$acc4,$t0
	adde	$acc5,$acc5,$t1
	adde	$acc6,$acc6,$t2
	adde	$acc7,$acc7,$t3
	#addze	$carry,$zero		# moved below
	beq	.Lsqr8x8_post_condition

	$LD	$n0,$SIZE_T*0($rp)
	$LD	$a0,$SIZE_T*1($np)
	$LD	$a1,$SIZE_T*2($np)
	$LD	$a2,$SIZE_T*3($np)
	$LD	$a3,$SIZE_T*4($np)
	$LD	$a4,$SIZE_T*5($np)
	$LD	$a5,$SIZE_T*6($np)
	$LD	$a6,$SIZE_T*7($np)
	$LDU	$a7,$SIZE_T*8($np)
	li	$cnt,0

.align	5
.Lsqr8x_tail:
	$UMULL	$t0,$a0,$n0
	addze	$carry,$zero		# carry bit, modulo-scheduled
	$UMULL	$t1,$a1,$n0
	addi	$cnt,$cnt,$SIZE_T
	$UMULL	$t2,$a2,$n0
	andi.	$cnt,$cnt,$SIZE_T*8-1
	$UMULL	$t3,$a3,$n0
	addc	$acc0,$acc0,$t0
	$UMULL	$t0,$a4,$n0
	adde	$acc1,$acc1,$t1
	$UMULL	$t1,$a5,$n0
	adde	$acc2,$acc2,$t2
	$UMULL	$t2,$a6,$n0
	adde	$acc3,$acc3,$t3
	$UMULL	$t3,$a7,$n0
	adde	$acc4,$acc4,$t0
	$UMULH	$t0,$a0,$n0
	adde	$acc5,$acc5,$t1
	$UMULH	$t1,$a1,$n0
	adde	$acc6,$acc6,$t2
	$UMULH	$t2,$a2,$n0
	adde	$acc7,$acc7,$t3
	$UMULH	$t3,$a3,$n0
	addze	$carry,$carry
	$STU	$acc0,$SIZE_T($tp)
	addc	$acc0,$acc1,$t0
	$UMULH	$t0,$a4,$n0
	adde	$acc1,$acc2,$t1
	$UMULH	$t1,$a5,$n0
	adde	$acc2,$acc3,$t2
	$UMULH	$t2,$a6,$n0
	adde	$acc3,$acc4,$t3
	$UMULH	$t3,$a7,$n0
	$LDX	$n0,$rp,$cnt
	adde	$acc4,$acc5,$t0
	adde	$acc5,$acc6,$t1
	adde	$acc6,$acc7,$t2
	adde	$acc7,$carry,$t3
	#addze	$carry,$zero		# moved above
	bne	.Lsqr8x_tail
					# note that carry flag is guaranteed
					# to be zero at this point
	$LD	$a0,$SIZE_T*1($tp)
	$POP	$carry,$SIZE_T*10($sp)	# pull top-most carry in case we break
	$UCMP	$np_end,$np		# done yet?
	$LD	$a1,$SIZE_T*2($tp)
	sub	$t2,$np_end,$num	# rewinded np
	$LD	$a2,$SIZE_T*3($tp)
	$LD	$a3,$SIZE_T*4($tp)
	$LD	$a4,$SIZE_T*5($tp)
	$LD	$a5,$SIZE_T*6($tp)
	$LD	$a6,$SIZE_T*7($tp)
	$LD	$a7,$SIZE_T*8($tp)
	beq	.Lsqr8x_tail_break

	addc	$acc0,$acc0,$a0
	$LD	$a0,$SIZE_T*1($np)
	adde	$acc1,$acc1,$a1
	$LD	$a1,$SIZE_T*2($np)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($np)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($np)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($np)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($np)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($np)
	adde	$acc7,$acc7,$a7
	$LDU	$a7,$SIZE_T*8($np)
	#addze	$carry,$zero		# moved above
	b	.Lsqr8x_tail

.align	5
.Lsqr8x_tail_break:
	$POP	$n0,$SIZE_T*8($sp)	# pull n0
	$POP	$t3,$SIZE_T*9($sp)	# &tp[2*num-1]
	addi	$cnt,$tp,$SIZE_T*8	# end of current t[num] window

	addic	$carry,$carry,-1	# "move" top-most carry to carry bit
	adde	$t0,$acc0,$a0
	$LD	$acc0,$SIZE_T*8($rp)
	$LD	$a0,$SIZE_T*1($t2)	# recall that $t2 is &n[-1]
	adde	$t1,$acc1,$a1
	$LD	$acc1,$SIZE_T*9($rp)
	$LD	$a1,$SIZE_T*2($t2)
	adde	$acc2,$acc2,$a2
	$LD	$a2,$SIZE_T*3($t2)
	adde	$acc3,$acc3,$a3
	$LD	$a3,$SIZE_T*4($t2)
	adde	$acc4,$acc4,$a4
	$LD	$a4,$SIZE_T*5($t2)
	adde	$acc5,$acc5,$a5
	$LD	$a5,$SIZE_T*6($t2)
	adde	$acc6,$acc6,$a6
	$LD	$a6,$SIZE_T*7($t2)
	adde	$acc7,$acc7,$a7
	$LD	$a7,$SIZE_T*8($t2)
	addi	$np,$t2,$SIZE_T*8
	addze	$t2,$zero		# top-most carry
	$UMULL	$na0,$n0,$acc0
	$ST	$t0,$SIZE_T*1($tp)
	$UCMP	$cnt,$t3		# did we hit the bottom?
	$ST	$t1,$SIZE_T*2($tp)
	li	$cnt,8
	$ST	$acc2,$SIZE_T*3($tp)
	$LD	$acc2,$SIZE_T*10($rp)
	$ST	$acc3,$SIZE_T*4($tp)
	$LD	$acc3,$SIZE_T*11($rp)
	$ST	$acc4,$SIZE_T*5($tp)
	$LD	$acc4,$SIZE_T*12($rp)
	$ST	$acc5,$SIZE_T*6($tp)
	$LD	$acc5,$SIZE_T*13($rp)
	$ST	$acc6,$SIZE_T*7($tp)
	$LD	$acc6,$SIZE_T*14($rp)
	$ST	$acc7,$SIZE_T*8($tp)
	$LD	$acc7,$SIZE_T*15($rp)
	$PUSH	$t2,$SIZE_T*10($sp)	# off-load top-most carry
	addi	$tp,$rp,$SIZE_T*7	# slide the window
	mtctr	$cnt
	bne	.Lsqr8x_reduction

	################################################################
	# Final step. We see if result is larger than modulus, and
	# if it is, subtract the modulus. But comparison implies
	# subtraction. So we subtract modulus, see if it borrowed,
	# and conditionally copy original value.
	$POP	$rp,$SIZE_T*6($sp)	# pull &rp[-1]
	srwi	$cnt,$num,`log($SIZE_T)/log(2)+3`
	mr	$n0,$tp			# put tp aside
	addi	$tp,$tp,$SIZE_T*8
	subi	$cnt,$cnt,1
	subfc	$t0,$a0,$acc0
	subfe	$t1,$a1,$acc1
	mr	$carry,$t2
	mr	$ap_end,$rp		# $rp copy

	mtctr	$cnt
	b	.Lsqr8x_sub

.align	5
.Lsqr8x_sub:
	$LD	$a0,$SIZE_T*1($np)
	$LD	$acc0,$SIZE_T*1($tp)
	$LD	$a1,$SIZE_T*2($np)
	$LD	$acc1,$SIZE_T*2($tp)
	subfe	$t2,$a2,$acc2
	$LD	$a2,$SIZE_T*3($np)
	$LD	$acc2,$SIZE_T*3($tp)
	subfe	$t3,$a3,$acc3
	$LD	$a3,$SIZE_T*4($np)
	$LD	$acc3,$SIZE_T*4($tp)
	$ST	$t0,$SIZE_T*1($rp)
	subfe	$t0,$a4,$acc4
	$LD	$a4,$SIZE_T*5($np)
	$LD	$acc4,$SIZE_T*5($tp)
	$ST	$t1,$SIZE_T*2($rp)
	subfe	$t1,$a5,$acc5
	$LD	$a5,$SIZE_T*6($np)
	$LD	$acc5,$SIZE_T*6($tp)
	$ST	$t2,$SIZE_T*3($rp)
	subfe	$t2,$a6,$acc6
	$LD	$a6,$SIZE_T*7($np)
	$LD	$acc6,$SIZE_T*7($tp)
	$ST	$t3,$SIZE_T*4($rp)
	subfe	$t3,$a7,$acc7
	$LDU	$a7,$SIZE_T*8($np)
	$LDU	$acc7,$SIZE_T*8($tp)
	$ST	$t0,$SIZE_T*5($rp)
	subfe	$t0,$a0,$acc0
# Tail of .Lsqr8x_sub, then the conditional copy-back: after the full
# subtraction, subfe $carry,$zero,$carry leaves $carry all-ones iff the
# subtraction borrowed.  .Lsqr4x_cond_copy then selects, 4 limbs per
# iteration, the pre-subtraction value from tp (and $acc*) when $carry
# is all-ones, else the reduced value already in rp ($a*), while wiping
# the tp scratch area on the stack.
	$ST	$t1,$SIZE_T*6($rp)
	subfe	$t1,$a1,$acc1
	$ST	$t2,$SIZE_T*7($rp)
	$STU	$t3,$SIZE_T*8($rp)
	bdnz	.Lsqr8x_sub

	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
	$LD	$a0,$SIZE_T*1($ap_end)	# original $rp
	$LD	$acc0,$SIZE_T*1($n0)	# original $tp
	subi	$cnt,$cnt,1
	$LD	$a1,$SIZE_T*2($ap_end)
	$LD	$acc1,$SIZE_T*2($n0)
	subfe	$t2,$a2,$acc2
	$LD	$a2,$SIZE_T*3($ap_end)
	$LD	$acc2,$SIZE_T*3($n0)
	subfe	$t3,$a3,$acc3
	$LD	$a3,$SIZE_T*4($ap_end)
	$LDU	$acc3,$SIZE_T*4($n0)
	$ST	$t0,$SIZE_T*1($rp)
	subfe	$t0,$a4,$acc4
	$ST	$t1,$SIZE_T*2($rp)
	subfe	$t1,$a5,$acc5
	$ST	$t2,$SIZE_T*3($rp)
	subfe	$t2,$a6,$acc6
	$ST	$t3,$SIZE_T*4($rp)
	subfe	$t3,$a7,$acc7
	$ST	$t0,$SIZE_T*5($rp)
	subfe	$carry,$zero,$carry	# did it borrow?
	$ST	$t1,$SIZE_T*6($rp)
	$ST	$t2,$SIZE_T*7($rp)
	$ST	$t3,$SIZE_T*8($rp)

	addi	$tp,$sp,$SIZE_T*11
	mtctr	$cnt

.Lsqr4x_cond_copy:
	andc	$a0,$a0,$carry		# keep reduced value if no borrow
	$ST	$zero,-$SIZE_T*3($n0)	# wipe stack clean
	and	$acc0,$acc0,$carry	# keep unreduced tp value if borrow
	$ST	$zero,-$SIZE_T*2($n0)
	andc	$a1,$a1,$carry
	$ST	$zero,-$SIZE_T*1($n0)
	and	$acc1,$acc1,$carry
	$ST	$zero,-$SIZE_T*0($n0)
	andc	$a2,$a2,$carry
	$ST	$zero,$SIZE_T*1($tp)
	and	$acc2,$acc2,$carry
	$ST	$zero,$SIZE_T*2($tp)
	andc	$a3,$a3,$carry
	$ST	$zero,$SIZE_T*3($tp)
	and	$acc3,$acc3,$carry
	$STU	$zero,$SIZE_T*4($tp)
	or	$t0,$a0,$acc0		# merge the two masked halves
	$LD	$a0,$SIZE_T*5($ap_end)
	$LD	$acc0,$SIZE_T*1($n0)
	or	$t1,$a1,$acc1
	$LD	$a1,$SIZE_T*6($ap_end)
	$LD	$acc1,$SIZE_T*2($n0)
	or	$t2,$a2,$acc2
	$LD	$a2,$SIZE_T*7($ap_end)
	$LD	$acc2,$SIZE_T*3($n0)
	or	$t3,$a3,$acc3
	$LD	$a3,$SIZE_T*8($ap_end)
	$LDU	$acc3,$SIZE_T*4($n0)
	$ST	$t0,$SIZE_T*1($ap_end)
	$ST	$t1,$SIZE_T*2($ap_end)
	$ST	$t2,$SIZE_T*3($ap_end)
	$STU	$t3,$SIZE_T*4($ap_end)
	bdnz	.Lsqr4x_cond_copy

	$POP	$ap,0($sp)		# pull saved sp
	andc	$a0,$a0,$carry		# select/store the final 4 limbs
	and	$acc0,$acc0,$carry
	andc	$a1,$a1,$carry
	and	$acc1,$acc1,$carry
	andc	$a2,$a2,$carry
	and	$acc2,$acc2,$carry
	andc	$a3,$a3,$carry
	and	$acc3,$acc3,$carry
	or	$t0,$a0,$acc0
	or	$t1,$a1,$acc1
	or	$t2,$a2,$acc2
	or	$t3,$a3,$acc3
	$ST	$t0,$SIZE_T*1($ap_end)
	$ST	$t1,$SIZE_T*2($ap_end)
	$ST	$t2,$SIZE_T*3($ap_end)
	$ST	$t3,$SIZE_T*4($ap_end)

	b	.Lsqr8x_done

.align	5
.Lsqr8x8_post_condition:
	# num==8 special case: whole result lives in registers.
	$POP	$rp,$SIZE_T*6($sp)	# pull rp
	$POP	$ap,0($sp)		# pull saved sp
	addze	$carry,$zero

	# $acc0-7,$carry hold result, $a0-7 hold modulus
	subfc	$acc0,$a0,$acc0
	subfe	$acc1,$a1,$acc1
	$ST	$zero,$SIZE_T*12($sp)	# wipe stack clean
	$ST	$zero,$SIZE_T*13($sp)
	subfe	$acc2,$a2,$acc2
	$ST	$zero,$SIZE_T*14($sp)
	$ST	$zero,$SIZE_T*15($sp)
	subfe	$acc3,$a3,$acc3
	$ST	$zero,$SIZE_T*16($sp)
	$ST	$zero,$SIZE_T*17($sp)
	subfe	$acc4,$a4,$acc4
	$ST	$zero,$SIZE_T*18($sp)
	$ST	$zero,$SIZE_T*19($sp)
	subfe	$acc5,$a5,$acc5
	$ST	$zero,$SIZE_T*20($sp)
	$ST	$zero,$SIZE_T*21($sp)
	subfe	$acc6,$a6,$acc6
	$ST	$zero,$SIZE_T*22($sp)
	$ST	$zero,$SIZE_T*23($sp)
	subfe	$acc7,$a7,$acc7
	$ST	$zero,$SIZE_T*24($sp)
	$ST	$zero,$SIZE_T*25($sp)
	subfe	$carry,$zero,$carry	# did it borrow?
	$ST	$zero,$SIZE_T*26($sp)
	$ST	$zero,$SIZE_T*27($sp)

	and	$a0,$a0,$carry		# $carry is all-ones iff it borrowed
	and	$a1,$a1,$carry
	addc	$acc0,$acc0,$a0		# add modulus back if borrowed
	and	$a2,$a2,$carry
	adde	$acc1,$acc1,$a1
	and	$a3,$a3,$carry
	adde	$acc2,$acc2,$a2
	and	$a4,$a4,$carry
	adde	$acc3,$acc3,$a3
	and	$a5,$a5,$carry
	adde	$acc4,$acc4,$a4
	and	$a6,$a6,$carry
	adde	$acc5,$acc5,$a5
	and	$a7,$a7,$carry
	adde	$acc6,$acc6,$a6
	adde	$acc7,$acc7,$a7
	$ST	$acc0,$SIZE_T*1($rp)
	$ST	$acc1,$SIZE_T*2($rp)
	$ST	$acc2,$SIZE_T*3($rp)
	$ST	$acc3,$SIZE_T*4($rp)
	$ST	$acc4,$SIZE_T*5($rp)
	$ST	$acc5,$SIZE_T*6($rp)
	$ST	$acc6,$SIZE_T*7($rp)
	$ST	$acc7,$SIZE_T*8($rp)

.Lsqr8x_done:
	# Zero the saved-n0/top-carry slots, restore non-volatile r14-r31
	# from the frame (addressed off the pulled original sp in $ap),
	# and return 1 ("done").
	$PUSH	$zero,$SIZE_T*8($sp)
	$PUSH	$zero,$SIZE_T*10($sp)

	$POP	r14,-$SIZE_T*18($ap)
	li	r3,1			# signal "done"
	$POP	r15,-$SIZE_T*17($ap)
	$POP	r16,-$SIZE_T*16($ap)
	$POP	r17,-$SIZE_T*15($ap)
	$POP	r18,-$SIZE_T*14($ap)
	$POP	r19,-$SIZE_T*13($ap)
	$POP	r20,-$SIZE_T*12($ap)
	$POP	r21,-$SIZE_T*11($ap)
	$POP	r22,-$SIZE_T*10($ap)
	$POP	r23,-$SIZE_T*9($ap)
	$POP	r24,-$SIZE_T*8($ap)
	$POP	r25,-$SIZE_T*7($ap)
	$POP	r26,-$SIZE_T*6($ap)
	$POP	r27,-$SIZE_T*5($ap)
	$POP	r28,-$SIZE_T*4($ap)
	$POP	r29,-$SIZE_T*3($ap)
	$POP	r30,-$SIZE_T*2($ap)
	$POP	r31,-$SIZE_T*1($ap)
	mr	$sp,$ap
	blr
	.long	0
	.byte	0,12,4,0x20,0x80,18,6,0
	.long	0
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Expand `...` constant expressions (e.g. log($SIZE_T)/log(2)+2) and emit.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";