#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2007

# The reason for undertaken effort is basically following. Even though
# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
# performance was observed to be less than impressive, essentially as
# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
# Well, it's not surprising that IBM had to make some sacrifices to
# boost the clock frequency that much, but no overall improvement?
# Having observed how much difference did switching to FPU make on
# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
# Unfortunately the resulting performance improvement is not as
# impressive, ~30%, and in absolute terms is still very far from what
# one would expect from 4.7GHz CPU. There is a chance that I'm doing
# something wrong, but in the lack of assembler level micro-profiling
# data or at least decent platform guide I can't tell... Or better
# results might be achieved with VMX... Anyway, this module provides
# *worse* performance on other PowerPC implementations, ~40-15% slower
# on PPC970 depending on key length and ~40% slower on Power 5 for all
# key lengths.
# As it's obviously inappropriate as "best all-round"
# alternative, it has to be complemented with run-time CPU family
# detection. Oh! It should also be noted that unlike other PowerPC
# implementations IALU ppc-mont.pl module performs *suboptimally* on
# >=1024-bit key lengths on Power 6. It should also be noted that
# *everything* said so far applies to 64-bit builds! As far as 32-bit
# application executed on 64-bit CPU goes, this module is likely to
# become preferred choice, because it's easy to adapt it for such
# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.

# February 2008

# Micro-profiling assisted optimization results in ~15% improvement
# over original ppc64-mont.pl version, or overall ~50% improvement
# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
# Power 6 CPU, this module is 5-150% faster depending on key length,
# [hereafter] more for longer keys. But if compared to ppc-mont.pl
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is...

# December 2009

# Adapted for 32-bit build this module delivers 25-120%, yes, more
# than *twice* for longer keys, performance improvement over 32-bit
# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
# even 64-bit integer operations and the trouble is that most PPC
# operating systems don't preserve upper halves of general purpose
# registers upon 32-bit signal delivery. They do preserve them upon
# context switch, but not signalling:-( This means that asynchronous
# signals have to be blocked upon entry to this subroutine. Signal
# masking (and of course complementary unmasking) has quite an impact
# on performance, naturally larger for shorter keys. It's so severe
# that 512-bit key performance can be as low as 1/3 of expected one.
# This is why this routine can be engaged for longer key operations
# only on these OSes, see crypto/ppccap.c for further details. MacOS X
# is an exception from this and doesn't require signal masking, and
# that's where above improvement coefficients were collected. For
# others alternative would be to break dependence on upper halves of
# GPRs by sticking to 32-bit integer operations...

# December 2012

# Remove above mentioned dependence on GPRs' upper halves in 32-bit
# build. No signal masking overhead, but integer instructions are
# *more* numerous... It's still "universally" faster than 32-bit
# ppc-mont.pl, but improvement coefficient is not as impressive
# for longer keys...

# First argument selects the ABI flavour ("linux32", "linux64le",
# "osx64", ...); it controls pointer size, red-zone size and the
# load/store mnemonics used for BN_ULONG-sized data.
$flavour = shift;

if ($flavour =~ /32/) {
	$SIZE_T=4;			# sizeof(size_t)/BN_ULONG on 32-bit ABIs
	$RZONE=	224;			# red zone below sp we must stay clear of
	$fname=	"bn_mul_mont_fpu64";

	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

# Byte offset XORed into 32-bit sub-word accesses of 64-bit slots:
# 4 on little-endian flavours, 0 on big-endian.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

# Locate the ppc-xlate.pl assembler translator next to this script or
# in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe all generated code through the translator.  NOTE: this must use
# low-precedence "or"; with "||" the die binds to the (always-true)
# command string and can never fire when open() fails.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=64;	# padded frame header
$TRANSFER=16*8;	# 16 8-byte gpr<->fpr transfer slots

# GPR aliases.  r3 is both the rp argument and the overflow/return
# slot; rp itself is immediately moved to r9 in the prologue.
$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22";	# interleaved ap and np in double format
$a0="r23";	# ap[0]
$t0="r24";	# temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";

# PPC offers enough register bank capacity to unroll inner loops twice
#
#	..A3A2A1A0
#	dcba
#    -----------
#	A0a
#	A0b
#	A0c
#	A0d
#	 A1a
#	 A1b
#	 A1c
#	 A1d
#	  A2a
#	  A2b
#	  A2c
#	  A2d
#	   A3a
#	   A3b
#	   A3c
#	   A3d
#	    ..a
#	    ..b
#
# FPR aliases: b*/n* hold the 16-bit limbs of bp[i] and the Montgomery
# multiplier, A*/N* the converted a[]/n[] words, T*/dot* the partial
# products.
$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
$dota="f8";	$dotb="f9";
$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
$T0a="f24";	$T0b="f25";
$T1a="f26";	$T1b="f27";
$T2a="f28";	$T2b="f29";
$T3a="f30";	$T3b="f31";

# Stack frame layout (grows downward from caller's sp):
# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
# +64		+-------------------------------+
#		| 16 gpr<->fpr transfer zone	|
#		.				.
#		.				.
# +16*8		+-------------------------------+
#		| __int64 tmp[-1]		|
#		+-------------------------------+
#		| __int64 tmp[num]		|
#		.				.
#		.				.
#		.				.
# +(num+1)*8	+-------------------------------+
#		| padding to 64 byte boundary	|
#		.				.
# +X		+-------------------------------+
#		| double nap_d[4*num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		.				.
# -13*size_t	+-------------------------------+
#		| 13 saved gpr, r19-r31		|
#		.				.
#		.				.
# -12*8		+-------------------------------+
#		| 12 saved fpr, f20-f31		|
#		.				.
#		.				.
#		+-------------------------------+

# Function prologue: reject num below 3*8/SIZE_T or not a multiple of
# 16/SIZE_T (r3 is preset to 0, the "not handled" return code, before
# the early bltlr-/bnelr- exits), carve a 4096-byte-aligned stack area
# big enough for the transfer zone, tp[num+1] and nap_d[4*num] below
# the red zone, then save the non-volatile registers this routine
# clobbers: r19-r31 and f20-f31.
$code=<<___;
.machine "any"
.text

.globl	.$fname
.align	5
.$fname:
	cmpwi	$num,`3*8/$SIZE_T`
	mr	$rp,r3		; $rp is reassigned
	li	r3,0		; possible "not handled" return code
	bltlr-
	andi.	r0,$num,`16/$SIZE_T-1`	; $num has to be "even"
	bnelr-

	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
	li	$i,-4096
	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
	add	$tp,$tp,$num	; place for tp[num+1]
	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
	subf	$tp,$tp,$sp	; $sp-$tp
	and	$tp,$tp,$i	; minimize TLB usage
	subf	$tp,$sp,$tp	; $tp-$sp
	mr	$i,$sp
	$STUX	$sp,$sp,$tp	; alloca

	$PUSH	r19,`-12*8-13*$SIZE_T`($i)
	$PUSH	r20,`-12*8-12*$SIZE_T`($i)
	$PUSH	r21,`-12*8-11*$SIZE_T`($i)
	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
	stfd	f20,`-12*8`($i)
	stfd	f21,`-11*8`($i)
	stfd	f22,`-10*8`($i)
	stfd	f23,`-9*8`($i)
	stfd	f24,`-8*8`($i)
	stfd	f25,`-7*8`($i)
	stfd	f26,`-6*8`($i)
	stfd	f27,`-5*8`($i)
	stfd	f28,`-4*8`($i)
	stfd	f29,`-3*8`($i)
	stfd	f30,`-2*8`($i)
	stfd	f31,`-1*8`($i)

	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
	li	$i,-64
	add	$nap_d,$tp,$num
	and	$nap_d,$nap_d,$i	; align to 64 bytes
	; nap_d is off by 1, because it's used with stfdu/lfdu
269 addi $nap_d,$nap_d,-8 270 srwi $j,$num,`3+1` ; counter register, num/2 271 addi $j,$j,-1 272 addi $tp,$sp,`$FRAME+$TRANSFER-8` 273 li $carry,0 274 mtctr $j 275___ 276 277$code.=<<___ if ($SIZE_T==8); 278 ld $a0,0($ap) ; pull ap[0] value 279 ld $t3,0($bp) ; bp[0] 280 ld $n0,0($n0) ; pull n0[0] value 281 282 mulld $t7,$a0,$t3 ; ap[0]*bp[0] 283 ; transfer bp[0] to FPU as 4x16-bit values 284 extrdi $t0,$t3,16,48 285 extrdi $t1,$t3,16,32 286 extrdi $t2,$t3,16,16 287 extrdi $t3,$t3,16,0 288 std $t0,`$FRAME+0`($sp) 289 std $t1,`$FRAME+8`($sp) 290 std $t2,`$FRAME+16`($sp) 291 std $t3,`$FRAME+24`($sp) 292 293 mulld $t7,$t7,$n0 ; tp[0]*n0 294 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 295 extrdi $t4,$t7,16,48 296 extrdi $t5,$t7,16,32 297 extrdi $t6,$t7,16,16 298 extrdi $t7,$t7,16,0 299 std $t4,`$FRAME+32`($sp) 300 std $t5,`$FRAME+40`($sp) 301 std $t6,`$FRAME+48`($sp) 302 std $t7,`$FRAME+56`($sp) 303 304 extrdi $t0,$a0,32,32 ; lwz $t0,4($ap) 305 extrdi $t1,$a0,32,0 ; lwz $t1,0($ap) 306 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair 307 lwz $t3,`8^$LITTLE_ENDIAN`($ap) 308 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair 309 lwz $t5,`0^$LITTLE_ENDIAN`($np) 310 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair 311 lwz $t7,`8^$LITTLE_ENDIAN`($np) 312___ 313$code.=<<___ if ($SIZE_T==4); 314 lwz $a0,0($ap) ; pull ap[0,1] value 315 mr $n1,$n0 316 lwz $a1,4($ap) 317 li $c1,0 318 lwz $t1,0($bp) ; bp[0,1] 319 lwz $t3,4($bp) 320 lwz $n0,0($n1) ; pull n0[0,1] value 321 lwz $n1,4($n1) 322 323 mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0] 324 mulhwu $t5,$a0,$t1 325 mullw $t6,$a1,$t1 326 mullw $t7,$a0,$t3 327 add $t5,$t5,$t6 328 add $t5,$t5,$t7 329 ; transfer bp[0] to FPU as 4x16-bit values 330 extrwi $t0,$t1,16,16 331 extrwi $t1,$t1,16,0 332 extrwi $t2,$t3,16,16 333 extrwi $t3,$t3,16,0 334 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build 335 std $t1,`$FRAME+8`($sp) 336 std $t2,`$FRAME+16`($sp) 337 std $t3,`$FRAME+24`($sp) 338 339 
mullw $t0,$t4,$n0 ; mulld tp[0]*n0 340 mulhwu $t1,$t4,$n0 341 mullw $t2,$t5,$n0 342 mullw $t3,$t4,$n1 343 add $t1,$t1,$t2 344 add $t1,$t1,$t3 345 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 346 extrwi $t4,$t0,16,16 347 extrwi $t5,$t0,16,0 348 extrwi $t6,$t1,16,16 349 extrwi $t7,$t1,16,0 350 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build 351 std $t5,`$FRAME+40`($sp) 352 std $t6,`$FRAME+48`($sp) 353 std $t7,`$FRAME+56`($sp) 354 355 mr $t0,$a0 ; lwz $t0,0($ap) 356 mr $t1,$a1 ; lwz $t1,4($ap) 357 lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs 358 lwz $t3,12($ap) 359 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 360 lwz $t5,4($np) 361 lwz $t6,8($np) 362 lwz $t7,12($np) 363___ 364$code.=<<___; 365 lfd $ba,`$FRAME+0`($sp) 366 lfd $bb,`$FRAME+8`($sp) 367 lfd $bc,`$FRAME+16`($sp) 368 lfd $bd,`$FRAME+24`($sp) 369 lfd $na,`$FRAME+32`($sp) 370 lfd $nb,`$FRAME+40`($sp) 371 lfd $nc,`$FRAME+48`($sp) 372 lfd $nd,`$FRAME+56`($sp) 373 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build 374 std $t1,`$FRAME+72`($sp) 375 std $t2,`$FRAME+80`($sp) 376 std $t3,`$FRAME+88`($sp) 377 std $t4,`$FRAME+96`($sp) 378 std $t5,`$FRAME+104`($sp) 379 std $t6,`$FRAME+112`($sp) 380 std $t7,`$FRAME+120`($sp) 381 fcfid $ba,$ba 382 fcfid $bb,$bb 383 fcfid $bc,$bc 384 fcfid $bd,$bd 385 fcfid $na,$na 386 fcfid $nb,$nb 387 fcfid $nc,$nc 388 fcfid $nd,$nd 389 390 lfd $A0,`$FRAME+64`($sp) 391 lfd $A1,`$FRAME+72`($sp) 392 lfd $A2,`$FRAME+80`($sp) 393 lfd $A3,`$FRAME+88`($sp) 394 lfd $N0,`$FRAME+96`($sp) 395 lfd $N1,`$FRAME+104`($sp) 396 lfd $N2,`$FRAME+112`($sp) 397 lfd $N3,`$FRAME+120`($sp) 398 fcfid $A0,$A0 399 fcfid $A1,$A1 400 fcfid $A2,$A2 401 fcfid $A3,$A3 402 fcfid $N0,$N0 403 fcfid $N1,$N1 404 fcfid $N2,$N2 405 fcfid $N3,$N3 406 addi $ap,$ap,16 407 addi $np,$np,16 408 409 fmul $T1a,$A1,$ba 410 fmul $T1b,$A1,$bb 411 stfd $A0,8($nap_d) ; save a[j] in double format 412 stfd $A1,16($nap_d) 413 fmul $T2a,$A2,$ba 414 fmul $T2b,$A2,$bb 415 stfd $A2,24($nap_d) ; save a[j+1] in 
double format 416 stfd $A3,32($nap_d) 417 fmul $T3a,$A3,$ba 418 fmul $T3b,$A3,$bb 419 stfd $N0,40($nap_d) ; save n[j] in double format 420 stfd $N1,48($nap_d) 421 fmul $T0a,$A0,$ba 422 fmul $T0b,$A0,$bb 423 stfd $N2,56($nap_d) ; save n[j+1] in double format 424 stfdu $N3,64($nap_d) 425 426 fmadd $T1a,$A0,$bc,$T1a 427 fmadd $T1b,$A0,$bd,$T1b 428 fmadd $T2a,$A1,$bc,$T2a 429 fmadd $T2b,$A1,$bd,$T2b 430 fmadd $T3a,$A2,$bc,$T3a 431 fmadd $T3b,$A2,$bd,$T3b 432 fmul $dota,$A3,$bc 433 fmul $dotb,$A3,$bd 434 435 fmadd $T1a,$N1,$na,$T1a 436 fmadd $T1b,$N1,$nb,$T1b 437 fmadd $T2a,$N2,$na,$T2a 438 fmadd $T2b,$N2,$nb,$T2b 439 fmadd $T3a,$N3,$na,$T3a 440 fmadd $T3b,$N3,$nb,$T3b 441 fmadd $T0a,$N0,$na,$T0a 442 fmadd $T0b,$N0,$nb,$T0b 443 444 fmadd $T1a,$N0,$nc,$T1a 445 fmadd $T1b,$N0,$nd,$T1b 446 fmadd $T2a,$N1,$nc,$T2a 447 fmadd $T2b,$N1,$nd,$T2b 448 fmadd $T3a,$N2,$nc,$T3a 449 fmadd $T3b,$N2,$nd,$T3b 450 fmadd $dota,$N3,$nc,$dota 451 fmadd $dotb,$N3,$nd,$dotb 452 453 fctid $T0a,$T0a 454 fctid $T0b,$T0b 455 fctid $T1a,$T1a 456 fctid $T1b,$T1b 457 fctid $T2a,$T2a 458 fctid $T2b,$T2b 459 fctid $T3a,$T3a 460 fctid $T3b,$T3b 461 462 stfd $T0a,`$FRAME+0`($sp) 463 stfd $T0b,`$FRAME+8`($sp) 464 stfd $T1a,`$FRAME+16`($sp) 465 stfd $T1b,`$FRAME+24`($sp) 466 stfd $T2a,`$FRAME+32`($sp) 467 stfd $T2b,`$FRAME+40`($sp) 468 stfd $T3a,`$FRAME+48`($sp) 469 stfd $T3b,`$FRAME+56`($sp) 470 471.align 5 472L1st: 473___ 474$code.=<<___ if ($SIZE_T==8); 475 lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair 476 lwz $t1,`0^$LITTLE_ENDIAN`($ap) 477 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair 478 lwz $t3,`8^$LITTLE_ENDIAN`($ap) 479 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair 480 lwz $t5,`0^$LITTLE_ENDIAN`($np) 481 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair 482 lwz $t7,`8^$LITTLE_ENDIAN`($np) 483___ 484$code.=<<___ if ($SIZE_T==4); 485 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 486 lwz $t1,4($ap) 487 lwz $t2,8($ap) 
488 lwz $t3,12($ap) 489 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 490 lwz $t5,4($np) 491 lwz $t6,8($np) 492 lwz $t7,12($np) 493___ 494$code.=<<___; 495 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build 496 std $t1,`$FRAME+72`($sp) 497 std $t2,`$FRAME+80`($sp) 498 std $t3,`$FRAME+88`($sp) 499 std $t4,`$FRAME+96`($sp) 500 std $t5,`$FRAME+104`($sp) 501 std $t6,`$FRAME+112`($sp) 502 std $t7,`$FRAME+120`($sp) 503___ 504if ($SIZE_T==8 or $flavour =~ /osx/) { 505$code.=<<___; 506 ld $t0,`$FRAME+0`($sp) 507 ld $t1,`$FRAME+8`($sp) 508 ld $t2,`$FRAME+16`($sp) 509 ld $t3,`$FRAME+24`($sp) 510 ld $t4,`$FRAME+32`($sp) 511 ld $t5,`$FRAME+40`($sp) 512 ld $t6,`$FRAME+48`($sp) 513 ld $t7,`$FRAME+56`($sp) 514___ 515} else { 516$code.=<<___; 517 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp) 518 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp) 519 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp) 520 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp) 521 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp) 522 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp) 523 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp) 524 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp) 525___ 526} 527$code.=<<___; 528 lfd $A0,`$FRAME+64`($sp) 529 lfd $A1,`$FRAME+72`($sp) 530 lfd $A2,`$FRAME+80`($sp) 531 lfd $A3,`$FRAME+88`($sp) 532 lfd $N0,`$FRAME+96`($sp) 533 lfd $N1,`$FRAME+104`($sp) 534 lfd $N2,`$FRAME+112`($sp) 535 lfd $N3,`$FRAME+120`($sp) 536 fcfid $A0,$A0 537 fcfid $A1,$A1 538 fcfid $A2,$A2 539 fcfid $A3,$A3 540 fcfid $N0,$N0 541 fcfid $N1,$N1 542 fcfid $N2,$N2 543 fcfid $N3,$N3 544 addi $ap,$ap,16 545 addi $np,$np,16 546 547 fmul $T1a,$A1,$ba 548 fmul $T1b,$A1,$bb 549 fmul $T2a,$A2,$ba 550 fmul $T2b,$A2,$bb 551 stfd $A0,8($nap_d) ; save a[j] in double format 552 stfd $A1,16($nap_d) 553 fmul $T3a,$A3,$ba 554 fmul $T3b,$A3,$bb 555 fmadd $T0a,$A0,$ba,$dota 556 fmadd $T0b,$A0,$bb,$dotb 557 stfd $A2,24($nap_d) ; save a[j+1] in double format 558 stfd $A3,32($nap_d) 559___ 560if ($SIZE_T==8 or $flavour =~ /osx/) { 561$code.=<<___; 562 fmadd $T1a,$A0,$bc,$T1a 563 fmadd 
$T1b,$A0,$bd,$T1b 564 fmadd $T2a,$A1,$bc,$T2a 565 fmadd $T2b,$A1,$bd,$T2b 566 stfd $N0,40($nap_d) ; save n[j] in double format 567 stfd $N1,48($nap_d) 568 fmadd $T3a,$A2,$bc,$T3a 569 fmadd $T3b,$A2,$bd,$T3b 570 add $t0,$t0,$carry ; can not overflow 571 fmul $dota,$A3,$bc 572 fmul $dotb,$A3,$bd 573 stfd $N2,56($nap_d) ; save n[j+1] in double format 574 stfdu $N3,64($nap_d) 575 srdi $carry,$t0,16 576 add $t1,$t1,$carry 577 srdi $carry,$t1,16 578 579 fmadd $T1a,$N1,$na,$T1a 580 fmadd $T1b,$N1,$nb,$T1b 581 insrdi $t0,$t1,16,32 582 fmadd $T2a,$N2,$na,$T2a 583 fmadd $T2b,$N2,$nb,$T2b 584 add $t2,$t2,$carry 585 fmadd $T3a,$N3,$na,$T3a 586 fmadd $T3b,$N3,$nb,$T3b 587 srdi $carry,$t2,16 588 fmadd $T0a,$N0,$na,$T0a 589 fmadd $T0b,$N0,$nb,$T0b 590 insrdi $t0,$t2,16,16 591 add $t3,$t3,$carry 592 srdi $carry,$t3,16 593 594 fmadd $T1a,$N0,$nc,$T1a 595 fmadd $T1b,$N0,$nd,$T1b 596 insrdi $t0,$t3,16,0 ; 0..63 bits 597 fmadd $T2a,$N1,$nc,$T2a 598 fmadd $T2b,$N1,$nd,$T2b 599 add $t4,$t4,$carry 600 fmadd $T3a,$N2,$nc,$T3a 601 fmadd $T3b,$N2,$nd,$T3b 602 srdi $carry,$t4,16 603 fmadd $dota,$N3,$nc,$dota 604 fmadd $dotb,$N3,$nd,$dotb 605 add $t5,$t5,$carry 606 srdi $carry,$t5,16 607 insrdi $t4,$t5,16,32 608 609 fctid $T0a,$T0a 610 fctid $T0b,$T0b 611 add $t6,$t6,$carry 612 fctid $T1a,$T1a 613 fctid $T1b,$T1b 614 srdi $carry,$t6,16 615 fctid $T2a,$T2a 616 fctid $T2b,$T2b 617 insrdi $t4,$t6,16,16 618 fctid $T3a,$T3a 619 fctid $T3b,$T3b 620 add $t7,$t7,$carry 621 insrdi $t4,$t7,16,0 ; 64..127 bits 622 srdi $carry,$t7,16 ; upper 33 bits 623 624 stfd $T0a,`$FRAME+0`($sp) 625 stfd $T0b,`$FRAME+8`($sp) 626 stfd $T1a,`$FRAME+16`($sp) 627 stfd $T1b,`$FRAME+24`($sp) 628 stfd $T2a,`$FRAME+32`($sp) 629 stfd $T2b,`$FRAME+40`($sp) 630 stfd $T3a,`$FRAME+48`($sp) 631 stfd $T3b,`$FRAME+56`($sp) 632 std $t0,8($tp) ; tp[j-1] 633 stdu $t4,16($tp) ; tp[j] 634___ 635} else { 636$code.=<<___; 637 fmadd $T1a,$A0,$bc,$T1a 638 fmadd $T1b,$A0,$bd,$T1b 639 addc $t0,$t0,$carry 640 adde $t1,$t1,$c1 641 srwi 
$carry,$t0,16 642 fmadd $T2a,$A1,$bc,$T2a 643 fmadd $T2b,$A1,$bd,$T2b 644 stfd $N0,40($nap_d) ; save n[j] in double format 645 stfd $N1,48($nap_d) 646 srwi $c1,$t1,16 647 insrwi $carry,$t1,16,0 648 fmadd $T3a,$A2,$bc,$T3a 649 fmadd $T3b,$A2,$bd,$T3b 650 addc $t2,$t2,$carry 651 adde $t3,$t3,$c1 652 srwi $carry,$t2,16 653 fmul $dota,$A3,$bc 654 fmul $dotb,$A3,$bd 655 stfd $N2,56($nap_d) ; save n[j+1] in double format 656 stfdu $N3,64($nap_d) 657 insrwi $t0,$t2,16,0 ; 0..31 bits 658 srwi $c1,$t3,16 659 insrwi $carry,$t3,16,0 660 661 fmadd $T1a,$N1,$na,$T1a 662 fmadd $T1b,$N1,$nb,$T1b 663 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1 664 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0 665 addc $t4,$t4,$carry 666 adde $t5,$t5,$c1 667 srwi $carry,$t4,16 668 fmadd $T2a,$N2,$na,$T2a 669 fmadd $T2b,$N2,$nb,$T2b 670 srwi $c1,$t5,16 671 insrwi $carry,$t5,16,0 672 fmadd $T3a,$N3,$na,$T3a 673 fmadd $T3b,$N3,$nb,$T3b 674 addc $t6,$t6,$carry 675 adde $t7,$t7,$c1 676 srwi $carry,$t6,16 677 fmadd $T0a,$N0,$na,$T0a 678 fmadd $T0b,$N0,$nb,$T0b 679 insrwi $t4,$t6,16,0 ; 32..63 bits 680 srwi $c1,$t7,16 681 insrwi $carry,$t7,16,0 682 683 fmadd $T1a,$N0,$nc,$T1a 684 fmadd $T1b,$N0,$nd,$T1b 685 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3 686 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2 687 addc $t2,$t2,$carry 688 adde $t3,$t3,$c1 689 srwi $carry,$t2,16 690 fmadd $T2a,$N1,$nc,$T2a 691 fmadd $T2b,$N1,$nd,$T2b 692 stw $t0,12($tp) ; tp[j-1] 693 stw $t4,8($tp) 694 srwi $c1,$t3,16 695 insrwi $carry,$t3,16,0 696 fmadd $T3a,$N2,$nc,$T3a 697 fmadd $T3b,$N2,$nd,$T3b 698 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5 699 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4 700 addc $t6,$t6,$carry 701 adde $t7,$t7,$c1 702 srwi $carry,$t6,16 703 fmadd $dota,$N3,$nc,$dota 704 fmadd $dotb,$N3,$nd,$dotb 705 insrwi $t2,$t6,16,0 ; 64..95 bits 706 srwi $c1,$t7,16 707 insrwi $carry,$t7,16,0 708 709 fctid $T0a,$T0a 710 fctid $T0b,$T0b 711 lwz 
$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7 712 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6 713 addc $t0,$t0,$carry 714 adde $t1,$t1,$c1 715 srwi $carry,$t0,16 716 fctid $T1a,$T1a 717 fctid $T1b,$T1b 718 srwi $c1,$t1,16 719 insrwi $carry,$t1,16,0 720 fctid $T2a,$T2a 721 fctid $T2b,$T2b 722 addc $t4,$t4,$carry 723 adde $t5,$t5,$c1 724 srwi $carry,$t4,16 725 fctid $T3a,$T3a 726 fctid $T3b,$T3b 727 insrwi $t0,$t4,16,0 ; 96..127 bits 728 srwi $c1,$t5,16 729 insrwi $carry,$t5,16,0 730 731 stfd $T0a,`$FRAME+0`($sp) 732 stfd $T0b,`$FRAME+8`($sp) 733 stfd $T1a,`$FRAME+16`($sp) 734 stfd $T1b,`$FRAME+24`($sp) 735 stfd $T2a,`$FRAME+32`($sp) 736 stfd $T2b,`$FRAME+40`($sp) 737 stfd $T3a,`$FRAME+48`($sp) 738 stfd $T3b,`$FRAME+56`($sp) 739 stw $t2,20($tp) ; tp[j] 740 stwu $t0,16($tp) 741___ 742} 743$code.=<<___; 744 bdnz L1st 745 746 fctid $dota,$dota 747 fctid $dotb,$dotb 748___ 749if ($SIZE_T==8 or $flavour =~ /osx/) { 750$code.=<<___; 751 ld $t0,`$FRAME+0`($sp) 752 ld $t1,`$FRAME+8`($sp) 753 ld $t2,`$FRAME+16`($sp) 754 ld $t3,`$FRAME+24`($sp) 755 ld $t4,`$FRAME+32`($sp) 756 ld $t5,`$FRAME+40`($sp) 757 ld $t6,`$FRAME+48`($sp) 758 ld $t7,`$FRAME+56`($sp) 759 stfd $dota,`$FRAME+64`($sp) 760 stfd $dotb,`$FRAME+72`($sp) 761 762 add $t0,$t0,$carry ; can not overflow 763 srdi $carry,$t0,16 764 add $t1,$t1,$carry 765 srdi $carry,$t1,16 766 insrdi $t0,$t1,16,32 767 add $t2,$t2,$carry 768 srdi $carry,$t2,16 769 insrdi $t0,$t2,16,16 770 add $t3,$t3,$carry 771 srdi $carry,$t3,16 772 insrdi $t0,$t3,16,0 ; 0..63 bits 773 add $t4,$t4,$carry 774 srdi $carry,$t4,16 775 add $t5,$t5,$carry 776 srdi $carry,$t5,16 777 insrdi $t4,$t5,16,32 778 add $t6,$t6,$carry 779 srdi $carry,$t6,16 780 insrdi $t4,$t6,16,16 781 add $t7,$t7,$carry 782 insrdi $t4,$t7,16,0 ; 64..127 bits 783 srdi $carry,$t7,16 ; upper 33 bits 784 ld $t6,`$FRAME+64`($sp) 785 ld $t7,`$FRAME+72`($sp) 786 787 std $t0,8($tp) ; tp[j-1] 788 stdu $t4,16($tp) ; tp[j] 789 790 add $t6,$t6,$carry ; can not overflow 791 srdi 
$carry,$t6,16 792 add $t7,$t7,$carry 793 insrdi $t6,$t7,48,0 794 srdi $ovf,$t7,48 795 std $t6,8($tp) ; tp[num-1] 796___ 797} else { 798$code.=<<___; 799 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp) 800 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp) 801 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp) 802 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp) 803 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp) 804 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp) 805 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp) 806 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp) 807 stfd $dota,`$FRAME+64`($sp) 808 stfd $dotb,`$FRAME+72`($sp) 809 810 addc $t0,$t0,$carry 811 adde $t1,$t1,$c1 812 srwi $carry,$t0,16 813 insrwi $carry,$t1,16,0 814 srwi $c1,$t1,16 815 addc $t2,$t2,$carry 816 adde $t3,$t3,$c1 817 srwi $carry,$t2,16 818 insrwi $t0,$t2,16,0 ; 0..31 bits 819 insrwi $carry,$t3,16,0 820 srwi $c1,$t3,16 821 addc $t4,$t4,$carry 822 adde $t5,$t5,$c1 823 srwi $carry,$t4,16 824 insrwi $carry,$t5,16,0 825 srwi $c1,$t5,16 826 addc $t6,$t6,$carry 827 adde $t7,$t7,$c1 828 srwi $carry,$t6,16 829 insrwi $t4,$t6,16,0 ; 32..63 bits 830 insrwi $carry,$t7,16,0 831 srwi $c1,$t7,16 832 stw $t0,12($tp) ; tp[j-1] 833 stw $t4,8($tp) 834 835 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1 836 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0 837 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3 838 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2 839 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5 840 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4 841 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7 842 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6 843 844 addc $t2,$t2,$carry 845 adde $t3,$t3,$c1 846 srwi $carry,$t2,16 847 insrwi $carry,$t3,16,0 848 srwi $c1,$t3,16 849 addc $t6,$t6,$carry 850 adde $t7,$t7,$c1 851 srwi $carry,$t6,16 852 insrwi $t2,$t6,16,0 ; 64..95 bits 853 insrwi $carry,$t7,16,0 854 srwi $c1,$t7,16 855 addc $t0,$t0,$carry 856 adde $t1,$t1,$c1 857 srwi $carry,$t0,16 858 insrwi $carry,$t1,16,0 859 
srwi $c1,$t1,16 860 addc $t4,$t4,$carry 861 adde $t5,$t5,$c1 862 srwi $carry,$t4,16 863 insrwi $t0,$t4,16,0 ; 96..127 bits 864 insrwi $carry,$t5,16,0 865 srwi $c1,$t5,16 866 stw $t2,20($tp) ; tp[j] 867 stwu $t0,16($tp) 868 869 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp) 870 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp) 871 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp) 872 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp) 873 874 addc $t6,$t6,$carry 875 adde $t7,$t7,$c1 876 srwi $carry,$t6,16 877 insrwi $carry,$t7,16,0 878 srwi $c1,$t7,16 879 addc $t4,$t4,$carry 880 adde $t5,$t5,$c1 881 882 insrwi $t6,$t4,16,0 883 srwi $t4,$t4,16 884 insrwi $t4,$t5,16,0 885 srwi $ovf,$t5,16 886 stw $t6,12($tp) ; tp[num-1] 887 stw $t4,8($tp) 888___ 889} 890$code.=<<___; 891 slwi $t7,$num,2 892 subf $nap_d,$t7,$nap_d ; rewind pointer 893 894 li $i,8 ; i=1 895.align 5 896Louter: 897 addi $tp,$sp,`$FRAME+$TRANSFER` 898 li $carry,0 899 mtctr $j 900___ 901$code.=<<___ if ($SIZE_T==8); 902 ldx $t3,$bp,$i ; bp[i] 903 904 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 905 mulld $t7,$a0,$t3 ; ap[0]*bp[i] 906 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] 907 ; transfer bp[i] to FPU as 4x16-bit values 908 extrdi $t0,$t3,16,48 909 extrdi $t1,$t3,16,32 910 extrdi $t2,$t3,16,16 911 extrdi $t3,$t3,16,0 912 std $t0,`$FRAME+0`($sp) 913 std $t1,`$FRAME+8`($sp) 914 std $t2,`$FRAME+16`($sp) 915 std $t3,`$FRAME+24`($sp) 916 917 mulld $t7,$t7,$n0 ; tp[0]*n0 918 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 919 extrdi $t4,$t7,16,48 920 extrdi $t5,$t7,16,32 921 extrdi $t6,$t7,16,16 922 extrdi $t7,$t7,16,0 923 std $t4,`$FRAME+32`($sp) 924 std $t5,`$FRAME+40`($sp) 925 std $t6,`$FRAME+48`($sp) 926 std $t7,`$FRAME+56`($sp) 927___ 928$code.=<<___ if ($SIZE_T==4); 929 add $t0,$bp,$i 930 li $c1,0 931 lwz $t1,0($t0) ; bp[i,i+1] 932 lwz $t3,4($t0) 933 934 mullw $t4,$a0,$t1 ; ap[0]*bp[i] 935 lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0] 936 mulhwu $t5,$a0,$t1 937 lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 938 mullw $t6,$a1,$t1 939 mullw 
$t7,$a0,$t3 940 add $t5,$t5,$t6 941 add $t5,$t5,$t7 942 addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0] 943 adde $t5,$t5,$t2 944 ; transfer bp[i] to FPU as 4x16-bit values 945 extrwi $t0,$t1,16,16 946 extrwi $t1,$t1,16,0 947 extrwi $t2,$t3,16,16 948 extrwi $t3,$t3,16,0 949 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build 950 std $t1,`$FRAME+8`($sp) 951 std $t2,`$FRAME+16`($sp) 952 std $t3,`$FRAME+24`($sp) 953 954 mullw $t0,$t4,$n0 ; mulld tp[0]*n0 955 mulhwu $t1,$t4,$n0 956 mullw $t2,$t5,$n0 957 mullw $t3,$t4,$n1 958 add $t1,$t1,$t2 959 add $t1,$t1,$t3 960 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 961 extrwi $t4,$t0,16,16 962 extrwi $t5,$t0,16,0 963 extrwi $t6,$t1,16,16 964 extrwi $t7,$t1,16,0 965 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build 966 std $t5,`$FRAME+40`($sp) 967 std $t6,`$FRAME+48`($sp) 968 std $t7,`$FRAME+56`($sp) 969___ 970$code.=<<___; 971 lfd $A0,8($nap_d) ; load a[j] in double format 972 lfd $A1,16($nap_d) 973 lfd $A2,24($nap_d) ; load a[j+1] in double format 974 lfd $A3,32($nap_d) 975 lfd $N0,40($nap_d) ; load n[j] in double format 976 lfd $N1,48($nap_d) 977 lfd $N2,56($nap_d) ; load n[j+1] in double format 978 lfdu $N3,64($nap_d) 979 980 lfd $ba,`$FRAME+0`($sp) 981 lfd $bb,`$FRAME+8`($sp) 982 lfd $bc,`$FRAME+16`($sp) 983 lfd $bd,`$FRAME+24`($sp) 984 lfd $na,`$FRAME+32`($sp) 985 lfd $nb,`$FRAME+40`($sp) 986 lfd $nc,`$FRAME+48`($sp) 987 lfd $nd,`$FRAME+56`($sp) 988 989 fcfid $ba,$ba 990 fcfid $bb,$bb 991 fcfid $bc,$bc 992 fcfid $bd,$bd 993 fcfid $na,$na 994 fcfid $nb,$nb 995 fcfid $nc,$nc 996 fcfid $nd,$nd 997 998 fmul $T1a,$A1,$ba 999 fmul $T1b,$A1,$bb 1000 fmul $T2a,$A2,$ba 1001 fmul $T2b,$A2,$bb 1002 fmul $T3a,$A3,$ba 1003 fmul $T3b,$A3,$bb 1004 fmul $T0a,$A0,$ba 1005 fmul $T0b,$A0,$bb 1006 1007 fmadd $T1a,$A0,$bc,$T1a 1008 fmadd $T1b,$A0,$bd,$T1b 1009 fmadd $T2a,$A1,$bc,$T2a 1010 fmadd $T2b,$A1,$bd,$T2b 1011 fmadd $T3a,$A2,$bc,$T3a 1012 fmadd $T3b,$A2,$bd,$T3b 1013 fmul $dota,$A3,$bc 1014 fmul $dotb,$A3,$bd 1015 1016 fmadd 
$T1a,$N1,$na,$T1a 1017 fmadd $T1b,$N1,$nb,$T1b 1018 lfd $A0,8($nap_d) ; load a[j] in double format 1019 lfd $A1,16($nap_d) 1020 fmadd $T2a,$N2,$na,$T2a 1021 fmadd $T2b,$N2,$nb,$T2b 1022 lfd $A2,24($nap_d) ; load a[j+1] in double format 1023 lfd $A3,32($nap_d) 1024 fmadd $T3a,$N3,$na,$T3a 1025 fmadd $T3b,$N3,$nb,$T3b 1026 fmadd $T0a,$N0,$na,$T0a 1027 fmadd $T0b,$N0,$nb,$T0b 1028 1029 fmadd $T1a,$N0,$nc,$T1a 1030 fmadd $T1b,$N0,$nd,$T1b 1031 fmadd $T2a,$N1,$nc,$T2a 1032 fmadd $T2b,$N1,$nd,$T2b 1033 fmadd $T3a,$N2,$nc,$T3a 1034 fmadd $T3b,$N2,$nd,$T3b 1035 fmadd $dota,$N3,$nc,$dota 1036 fmadd $dotb,$N3,$nd,$dotb 1037 1038 fctid $T0a,$T0a 1039 fctid $T0b,$T0b 1040 fctid $T1a,$T1a 1041 fctid $T1b,$T1b 1042 fctid $T2a,$T2a 1043 fctid $T2b,$T2b 1044 fctid $T3a,$T3a 1045 fctid $T3b,$T3b 1046 1047 stfd $T0a,`$FRAME+0`($sp) 1048 stfd $T0b,`$FRAME+8`($sp) 1049 stfd $T1a,`$FRAME+16`($sp) 1050 stfd $T1b,`$FRAME+24`($sp) 1051 stfd $T2a,`$FRAME+32`($sp) 1052 stfd $T2b,`$FRAME+40`($sp) 1053 stfd $T3a,`$FRAME+48`($sp) 1054 stfd $T3b,`$FRAME+56`($sp) 1055 1056.align 5 1057Linner: 1058 fmul $T1a,$A1,$ba 1059 fmul $T1b,$A1,$bb 1060 fmul $T2a,$A2,$ba 1061 fmul $T2b,$A2,$bb 1062 lfd $N0,40($nap_d) ; load n[j] in double format 1063 lfd $N1,48($nap_d) 1064 fmul $T3a,$A3,$ba 1065 fmul $T3b,$A3,$bb 1066 fmadd $T0a,$A0,$ba,$dota 1067 fmadd $T0b,$A0,$bb,$dotb 1068 lfd $N2,56($nap_d) ; load n[j+1] in double format 1069 lfdu $N3,64($nap_d) 1070 1071 fmadd $T1a,$A0,$bc,$T1a 1072 fmadd $T1b,$A0,$bd,$T1b 1073 fmadd $T2a,$A1,$bc,$T2a 1074 fmadd $T2b,$A1,$bd,$T2b 1075 lfd $A0,8($nap_d) ; load a[j] in double format 1076 lfd $A1,16($nap_d) 1077 fmadd $T3a,$A2,$bc,$T3a 1078 fmadd $T3b,$A2,$bd,$T3b 1079 fmul $dota,$A3,$bc 1080 fmul $dotb,$A3,$bd 1081 lfd $A2,24($nap_d) ; load a[j+1] in double format 1082 lfd $A3,32($nap_d) 1083___ 1084if ($SIZE_T==8 or $flavour =~ /osx/) { 1085$code.=<<___; 1086 fmadd $T1a,$N1,$na,$T1a 1087 fmadd $T1b,$N1,$nb,$T1b 1088 ld $t0,`$FRAME+0`($sp) 1089 ld $t1,`$FRAME+8`($sp) 
1090 fmadd $T2a,$N2,$na,$T2a 1091 fmadd $T2b,$N2,$nb,$T2b 1092 ld $t2,`$FRAME+16`($sp) 1093 ld $t3,`$FRAME+24`($sp) 1094 fmadd $T3a,$N3,$na,$T3a 1095 fmadd $T3b,$N3,$nb,$T3b 1096 add $t0,$t0,$carry ; can not overflow 1097 ld $t4,`$FRAME+32`($sp) 1098 ld $t5,`$FRAME+40`($sp) 1099 fmadd $T0a,$N0,$na,$T0a 1100 fmadd $T0b,$N0,$nb,$T0b 1101 srdi $carry,$t0,16 1102 add $t1,$t1,$carry 1103 srdi $carry,$t1,16 1104 ld $t6,`$FRAME+48`($sp) 1105 ld $t7,`$FRAME+56`($sp) 1106 1107 fmadd $T1a,$N0,$nc,$T1a 1108 fmadd $T1b,$N0,$nd,$T1b 1109 insrdi $t0,$t1,16,32 1110 ld $t1,8($tp) ; tp[j] 1111 fmadd $T2a,$N1,$nc,$T2a 1112 fmadd $T2b,$N1,$nd,$T2b 1113 add $t2,$t2,$carry 1114 fmadd $T3a,$N2,$nc,$T3a 1115 fmadd $T3b,$N2,$nd,$T3b 1116 srdi $carry,$t2,16 1117 insrdi $t0,$t2,16,16 1118 fmadd $dota,$N3,$nc,$dota 1119 fmadd $dotb,$N3,$nd,$dotb 1120 add $t3,$t3,$carry 1121 ldu $t2,16($tp) ; tp[j+1] 1122 srdi $carry,$t3,16 1123 insrdi $t0,$t3,16,0 ; 0..63 bits 1124 add $t4,$t4,$carry 1125 1126 fctid $T0a,$T0a 1127 fctid $T0b,$T0b 1128 srdi $carry,$t4,16 1129 fctid $T1a,$T1a 1130 fctid $T1b,$T1b 1131 add $t5,$t5,$carry 1132 fctid $T2a,$T2a 1133 fctid $T2b,$T2b 1134 srdi $carry,$t5,16 1135 insrdi $t4,$t5,16,32 1136 fctid $T3a,$T3a 1137 fctid $T3b,$T3b 1138 add $t6,$t6,$carry 1139 srdi $carry,$t6,16 1140 insrdi $t4,$t6,16,16 1141 1142 stfd $T0a,`$FRAME+0`($sp) 1143 stfd $T0b,`$FRAME+8`($sp) 1144 add $t7,$t7,$carry 1145 addc $t3,$t0,$t1 1146___ 1147$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 1148 extrdi $t0,$t0,32,0 1149 extrdi $t1,$t1,32,0 1150 adde $t0,$t0,$t1 1151___ 1152$code.=<<___; 1153 stfd $T1a,`$FRAME+16`($sp) 1154 stfd $T1b,`$FRAME+24`($sp) 1155 insrdi $t4,$t7,16,0 ; 64..127 bits 1156 srdi $carry,$t7,16 ; upper 33 bits 1157 stfd $T2a,`$FRAME+32`($sp) 1158 stfd $T2b,`$FRAME+40`($sp) 1159 adde $t5,$t4,$t2 1160___ 1161$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 1162 extrdi $t4,$t4,32,0 1163 extrdi $t2,$t2,32,0 1164 adde $t4,$t4,$t2 1165___ 1166$code.=<<___; 1167 stfd 
$T3a,`$FRAME+48`($sp) 1168 stfd $T3b,`$FRAME+56`($sp) 1169 addze $carry,$carry 1170 std $t3,-16($tp) ; tp[j-1] 1171 std $t5,-8($tp) ; tp[j] 1172___ 1173} else { 1174$code.=<<___; 1175 fmadd $T1a,$N1,$na,$T1a 1176 fmadd $T1b,$N1,$nb,$T1b 1177 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp) 1178 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp) 1179 fmadd $T2a,$N2,$na,$T2a 1180 fmadd $T2b,$N2,$nb,$T2b 1181 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp) 1182 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp) 1183 fmadd $T3a,$N3,$na,$T3a 1184 fmadd $T3b,$N3,$nb,$T3b 1185 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp) 1186 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp) 1187 addc $t0,$t0,$carry 1188 adde $t1,$t1,$c1 1189 srwi $carry,$t0,16 1190 fmadd $T0a,$N0,$na,$T0a 1191 fmadd $T0b,$N0,$nb,$T0b 1192 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp) 1193 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp) 1194 srwi $c1,$t1,16 1195 insrwi $carry,$t1,16,0 1196 1197 fmadd $T1a,$N0,$nc,$T1a 1198 fmadd $T1b,$N0,$nd,$T1b 1199 addc $t2,$t2,$carry 1200 adde $t3,$t3,$c1 1201 srwi $carry,$t2,16 1202 fmadd $T2a,$N1,$nc,$T2a 1203 fmadd $T2b,$N1,$nd,$T2b 1204 insrwi $t0,$t2,16,0 ; 0..31 bits 1205 srwi $c1,$t3,16 1206 insrwi $carry,$t3,16,0 1207 fmadd $T3a,$N2,$nc,$T3a 1208 fmadd $T3b,$N2,$nd,$T3b 1209 lwz $t2,12($tp) ; tp[j] 1210 lwz $t3,8($tp) 1211 addc $t4,$t4,$carry 1212 adde $t5,$t5,$c1 1213 srwi $carry,$t4,16 1214 fmadd $dota,$N3,$nc,$dota 1215 fmadd $dotb,$N3,$nd,$dotb 1216 srwi $c1,$t5,16 1217 insrwi $carry,$t5,16,0 1218 1219 fctid $T0a,$T0a 1220 addc $t6,$t6,$carry 1221 adde $t7,$t7,$c1 1222 srwi $carry,$t6,16 1223 fctid $T0b,$T0b 1224 insrwi $t4,$t6,16,0 ; 32..63 bits 1225 srwi $c1,$t7,16 1226 insrwi $carry,$t7,16,0 1227 fctid $T1a,$T1a 1228 addc $t0,$t0,$t2 1229 adde $t4,$t4,$t3 1230 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1 1231 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0 1232 fctid $T1b,$T1b 1233 addze $carry,$carry 1234 addze $c1,$c1 1235 stw $t0,4($tp) ; tp[j-1] 1236 stw $t4,0($tp) 1237 fctid $T2a,$T2a 1238 addc 
$t2,$t2,$carry 1239 adde $t3,$t3,$c1 1240 srwi $carry,$t2,16 1241 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3 1242 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2 1243 fctid $T2b,$T2b 1244 srwi $c1,$t3,16 1245 insrwi $carry,$t3,16,0 1246 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5 1247 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4 1248 fctid $T3a,$T3a 1249 addc $t6,$t6,$carry 1250 adde $t7,$t7,$c1 1251 srwi $carry,$t6,16 1252 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7 1253 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6 1254 fctid $T3b,$T3b 1255 1256 insrwi $t2,$t6,16,0 ; 64..95 bits 1257 insrwi $carry,$t7,16,0 1258 srwi $c1,$t7,16 1259 lwz $t6,20($tp) 1260 lwzu $t7,16($tp) 1261 addc $t0,$t0,$carry 1262 stfd $T0a,`$FRAME+0`($sp) 1263 adde $t1,$t1,$c1 1264 srwi $carry,$t0,16 1265 stfd $T0b,`$FRAME+8`($sp) 1266 insrwi $carry,$t1,16,0 1267 srwi $c1,$t1,16 1268 addc $t4,$t4,$carry 1269 stfd $T1a,`$FRAME+16`($sp) 1270 adde $t5,$t5,$c1 1271 srwi $carry,$t4,16 1272 insrwi $t0,$t4,16,0 ; 96..127 bits 1273 stfd $T1b,`$FRAME+24`($sp) 1274 insrwi $carry,$t5,16,0 1275 srwi $c1,$t5,16 1276 1277 addc $t2,$t2,$t6 1278 stfd $T2a,`$FRAME+32`($sp) 1279 adde $t0,$t0,$t7 1280 stfd $T2b,`$FRAME+40`($sp) 1281 addze $carry,$carry 1282 stfd $T3a,`$FRAME+48`($sp) 1283 addze $c1,$c1 1284 stfd $T3b,`$FRAME+56`($sp) 1285 stw $t2,-4($tp) ; tp[j] 1286 stw $t0,-8($tp) 1287___ 1288} 1289$code.=<<___; 1290 bdnz Linner 1291 1292 fctid $dota,$dota 1293 fctid $dotb,$dotb 1294___ 1295if ($SIZE_T==8 or $flavour =~ /osx/) { 1296$code.=<<___; 1297 ld $t0,`$FRAME+0`($sp) 1298 ld $t1,`$FRAME+8`($sp) 1299 ld $t2,`$FRAME+16`($sp) 1300 ld $t3,`$FRAME+24`($sp) 1301 ld $t4,`$FRAME+32`($sp) 1302 ld $t5,`$FRAME+40`($sp) 1303 ld $t6,`$FRAME+48`($sp) 1304 ld $t7,`$FRAME+56`($sp) 1305 stfd $dota,`$FRAME+64`($sp) 1306 stfd $dotb,`$FRAME+72`($sp) 1307 1308 add $t0,$t0,$carry ; can not overflow 1309 srdi $carry,$t0,16 1310 add $t1,$t1,$carry 1311 srdi 
$carry,$t1,16 1312 insrdi $t0,$t1,16,32 1313 add $t2,$t2,$carry 1314 ld $t1,8($tp) ; tp[j] 1315 srdi $carry,$t2,16 1316 insrdi $t0,$t2,16,16 1317 add $t3,$t3,$carry 1318 ldu $t2,16($tp) ; tp[j+1] 1319 srdi $carry,$t3,16 1320 insrdi $t0,$t3,16,0 ; 0..63 bits 1321 add $t4,$t4,$carry 1322 srdi $carry,$t4,16 1323 add $t5,$t5,$carry 1324 srdi $carry,$t5,16 1325 insrdi $t4,$t5,16,32 1326 add $t6,$t6,$carry 1327 srdi $carry,$t6,16 1328 insrdi $t4,$t6,16,16 1329 add $t7,$t7,$carry 1330 insrdi $t4,$t7,16,0 ; 64..127 bits 1331 srdi $carry,$t7,16 ; upper 33 bits 1332 ld $t6,`$FRAME+64`($sp) 1333 ld $t7,`$FRAME+72`($sp) 1334 1335 addc $t3,$t0,$t1 1336___ 1337$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 1338 extrdi $t0,$t0,32,0 1339 extrdi $t1,$t1,32,0 1340 adde $t0,$t0,$t1 1341___ 1342$code.=<<___; 1343 adde $t5,$t4,$t2 1344___ 1345$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 1346 extrdi $t4,$t4,32,0 1347 extrdi $t2,$t2,32,0 1348 adde $t4,$t4,$t2 1349___ 1350$code.=<<___; 1351 addze $carry,$carry 1352 1353 std $t3,-16($tp) ; tp[j-1] 1354 std $t5,-8($tp) ; tp[j] 1355 1356 add $carry,$carry,$ovf ; consume upmost overflow 1357 add $t6,$t6,$carry ; can not overflow 1358 srdi $carry,$t6,16 1359 add $t7,$t7,$carry 1360 insrdi $t6,$t7,48,0 1361 srdi $ovf,$t7,48 1362 std $t6,0($tp) ; tp[num-1] 1363___ 1364} else { 1365$code.=<<___; 1366 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp) 1367 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp) 1368 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp) 1369 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp) 1370 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp) 1371 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp) 1372 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp) 1373 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp) 1374 stfd $dota,`$FRAME+64`($sp) 1375 stfd $dotb,`$FRAME+72`($sp) 1376 1377 addc $t0,$t0,$carry 1378 adde $t1,$t1,$c1 1379 srwi $carry,$t0,16 1380 insrwi $carry,$t1,16,0 1381 srwi $c1,$t1,16 1382 addc $t2,$t2,$carry 1383 adde $t3,$t3,$c1 1384 srwi $carry,$t2,16 1385 insrwi $t0,$t2,16,0 ; 0..31 bits 1386 
lwz $t2,12($tp) ; tp[j] 1387 insrwi $carry,$t3,16,0 1388 srwi $c1,$t3,16 1389 lwz $t3,8($tp) 1390 addc $t4,$t4,$carry 1391 adde $t5,$t5,$c1 1392 srwi $carry,$t4,16 1393 insrwi $carry,$t5,16,0 1394 srwi $c1,$t5,16 1395 addc $t6,$t6,$carry 1396 adde $t7,$t7,$c1 1397 srwi $carry,$t6,16 1398 insrwi $t4,$t6,16,0 ; 32..63 bits 1399 insrwi $carry,$t7,16,0 1400 srwi $c1,$t7,16 1401 1402 addc $t0,$t0,$t2 1403 adde $t4,$t4,$t3 1404 addze $carry,$carry 1405 addze $c1,$c1 1406 stw $t0,4($tp) ; tp[j-1] 1407 stw $t4,0($tp) 1408 1409 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1 1410 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0 1411 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3 1412 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2 1413 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5 1414 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4 1415 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7 1416 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6 1417 1418 addc $t2,$t2,$carry 1419 adde $t3,$t3,$c1 1420 srwi $carry,$t2,16 1421 insrwi $carry,$t3,16,0 1422 srwi $c1,$t3,16 1423 addc $t6,$t6,$carry 1424 adde $t7,$t7,$c1 1425 srwi $carry,$t6,16 1426 insrwi $t2,$t6,16,0 ; 64..95 bits 1427 lwz $t6,20($tp) 1428 insrwi $carry,$t7,16,0 1429 srwi $c1,$t7,16 1430 lwzu $t7,16($tp) 1431 addc $t0,$t0,$carry 1432 adde $t1,$t1,$c1 1433 srwi $carry,$t0,16 1434 insrwi $carry,$t1,16,0 1435 srwi $c1,$t1,16 1436 addc $t4,$t4,$carry 1437 adde $t5,$t5,$c1 1438 srwi $carry,$t4,16 1439 insrwi $t0,$t4,16,0 ; 96..127 bits 1440 insrwi $carry,$t5,16,0 1441 srwi $c1,$t5,16 1442 1443 addc $t2,$t2,$t6 1444 adde $t0,$t0,$t7 1445 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp) 1446 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp) 1447 addze $carry,$carry 1448 addze $c1,$c1 1449 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp) 1450 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp) 1451 1452 addc $t6,$t6,$carry 1453 adde $t7,$t7,$c1 1454 stw $t2,-4($tp) ; tp[j] 1455 stw $t0,-8($tp) 1456 addc 
$t6,$t6,$ovf 1457 addze $t7,$t7 1458 srwi $carry,$t6,16 1459 insrwi $carry,$t7,16,0 1460 srwi $c1,$t7,16 1461 addc $t4,$t4,$carry 1462 adde $t5,$t5,$c1 1463 1464 insrwi $t6,$t4,16,0 1465 srwi $t4,$t4,16 1466 insrwi $t4,$t5,16,0 1467 srwi $ovf,$t5,16 1468 stw $t6,4($tp) ; tp[num-1] 1469 stw $t4,0($tp) 1470___ 1471} 1472$code.=<<___; 1473 slwi $t7,$num,2 1474 addi $i,$i,8 1475 subf $nap_d,$t7,$nap_d ; rewind pointer 1476 cmpw $i,$num 1477 blt- Louter 1478___ 1479 1480$code.=<<___ if ($SIZE_T==8); 1481 subf $np,$num,$np ; rewind np 1482 addi $j,$j,1 ; restore counter 1483 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 1484 addi $tp,$sp,`$FRAME+$TRANSFER+8` 1485 addi $t4,$sp,`$FRAME+$TRANSFER+16` 1486 addi $t5,$np,8 1487 addi $t6,$rp,8 1488 mtctr $j 1489 1490.align 4 1491Lsub: ldx $t0,$tp,$i 1492 ldx $t1,$np,$i 1493 ldx $t2,$t4,$i 1494 ldx $t3,$t5,$i 1495 subfe $t0,$t1,$t0 ; tp[j]-np[j] 1496 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] 1497 stdx $t0,$rp,$i 1498 stdx $t2,$t6,$i 1499 addi $i,$i,16 1500 bdnz Lsub 1501 1502 li $i,0 1503 subfe $ovf,$i,$ovf ; handle upmost overflow bit 1504 mtctr $j 1505 1506.align 4 1507Lcopy: ; conditional copy 1508 ldx $t0,$tp,$i 1509 ldx $t1,$t4,$i 1510 ldx $t2,$rp,$i 1511 ldx $t3,$t6,$i 1512 std $i,8($nap_d) ; zap nap_d 1513 std $i,16($nap_d) 1514 std $i,24($nap_d) 1515 std $i,32($nap_d) 1516 std $i,40($nap_d) 1517 std $i,48($nap_d) 1518 std $i,56($nap_d) 1519 stdu $i,64($nap_d) 1520 and $t0,$t0,$ovf 1521 and $t1,$t1,$ovf 1522 andc $t2,$t2,$ovf 1523 andc $t3,$t3,$ovf 1524 or $t0,$t0,$t2 1525 or $t1,$t1,$t3 1526 stdx $t0,$rp,$i 1527 stdx $t1,$t6,$i 1528 stdx $i,$tp,$i ; zap tp at once 1529 stdx $i,$t4,$i 1530 addi $i,$i,16 1531 bdnz Lcopy 1532___ 1533$code.=<<___ if ($SIZE_T==4); 1534 subf $np,$num,$np ; rewind np 1535 addi $j,$j,1 ; restore counter 1536 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 1537 addi $tp,$sp,`$FRAME+$TRANSFER` 1538 addi $np,$np,-4 1539 addi $rp,$rp,-4 1540 addi $ap,$sp,`$FRAME+$TRANSFER+4` 1541 mtctr $j 1542 1543.align 4 1544Lsub: 
lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order 1545 lwz $t1,8($tp) 1546 lwz $t2,20($tp) 1547 lwzu $t3,16($tp) 1548 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order 1549 lwz $t5,8($np) 1550 lwz $t6,12($np) 1551 lwzu $t7,16($np) 1552 subfe $t4,$t4,$t0 ; tp[j]-np[j] 1553 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order 1554 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] 1555 stw $t1,8($ap) 1556 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] 1557 stw $t2,12($ap) 1558 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] 1559 stwu $t3,16($ap) 1560 stw $t4,4($rp) 1561 stw $t5,8($rp) 1562 stw $t6,12($rp) 1563 stwu $t7,16($rp) 1564 bdnz Lsub 1565 1566 li $i,0 1567 subfe $ovf,$i,$ovf ; handle upmost overflow bit 1568 addi $ap,$sp,`$FRAME+$TRANSFER+4` 1569 subf $rp,$num,$rp ; rewind rp 1570 addi $tp,$sp,`$FRAME+$TRANSFER` 1571 mtctr $j 1572 1573.align 4 1574Lcopy: ; conditional copy 1575 lwz $t0,4($ap) 1576 lwz $t1,8($ap) 1577 lwz $t2,12($ap) 1578 lwzu $t3,16($ap) 1579 lwz $t4,4($rp) 1580 lwz $t5,8($rp) 1581 lwz $t6,12($rp) 1582 lwz $t7,16($rp) 1583 std $i,8($nap_d) ; zap nap_d 1584 std $i,16($nap_d) 1585 std $i,24($nap_d) 1586 std $i,32($nap_d) 1587 std $i,40($nap_d) 1588 std $i,48($nap_d) 1589 std $i,56($nap_d) 1590 stdu $i,64($nap_d) 1591 and $t0,$t0,$ovf 1592 and $t1,$t1,$ovf 1593 and $t2,$t2,$ovf 1594 and $t3,$t3,$ovf 1595 andc $t4,$t4,$ovf 1596 andc $t5,$t5,$ovf 1597 andc $t6,$t6,$ovf 1598 andc $t7,$t7,$ovf 1599 or $t0,$t0,$t4 1600 or $t1,$t1,$t5 1601 or $t2,$t2,$t6 1602 or $t3,$t3,$t7 1603 stw $t0,4($rp) 1604 stw $t1,8($rp) 1605 stw $t2,12($rp) 1606 stwu $t3,16($rp) 1607 std $i,8($tp) ; zap tp at once 1608 stdu $i,16($tp) 1609 bdnz Lcopy 1610___ 1611 1612$code.=<<___; 1613 $POP $i,0($sp) 1614 li r3,1 ; signal "handled" 1615 $POP r19,`-12*8-13*$SIZE_T`($i) 1616 $POP r20,`-12*8-12*$SIZE_T`($i) 1617 $POP r21,`-12*8-11*$SIZE_T`($i) 1618 $POP r22,`-12*8-10*$SIZE_T`($i) 1619 $POP r23,`-12*8-9*$SIZE_T`($i) 1620 $POP r24,`-12*8-8*$SIZE_T`($i) 1621 $POP r25,`-12*8-7*$SIZE_T`($i) 1622 $POP 
r26,`-12*8-6*$SIZE_T`($i) 1623 $POP r27,`-12*8-5*$SIZE_T`($i) 1624 $POP r28,`-12*8-4*$SIZE_T`($i) 1625 $POP r29,`-12*8-3*$SIZE_T`($i) 1626 $POP r30,`-12*8-2*$SIZE_T`($i) 1627 $POP r31,`-12*8-1*$SIZE_T`($i) 1628 lfd f20,`-12*8`($i) 1629 lfd f21,`-11*8`($i) 1630 lfd f22,`-10*8`($i) 1631 lfd f23,`-9*8`($i) 1632 lfd f24,`-8*8`($i) 1633 lfd f25,`-7*8`($i) 1634 lfd f26,`-6*8`($i) 1635 lfd f27,`-5*8`($i) 1636 lfd f28,`-4*8`($i) 1637 lfd f29,`-3*8`($i) 1638 lfd f30,`-2*8`($i) 1639 lfd f31,`-1*8`($i) 1640 mr $sp,$i 1641 blr 1642 .long 0 1643 .byte 0,12,4,0,0x8c,13,6,0 1644 .long 0 1645.size .$fname,.-.$fname 1646 1647.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" 1648___ 1649 1650$code =~ s/\`([^\`]*)\`/eval $1/gem; 1651print $code; 1652close STDOUT; 1653