/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Incoming-argument registers.  Which GPR carries each argument     */
/* depends on the OS and word size selected below; BUFFER is always  */
/* r14 and is filled from the caller's stack by the routine itself.  */
/*   M      : dimension, tested against zero on entry                */
/*   IS     : second integer argument; the routine replaces it with  */
/*            M - IS and then uses it as the running column index    */
/*   A, LDA : matrix base and leading dimension (in elements)        */
/*   X,INCX : input vector and its stride (in elements)              */
/*   Y,INCY : updated vector and its stride (in elements)            */

#ifdef linux
#ifndef __64BIT__
#define M	r3
#define IS	r4
#define A	r5
#define LDA	r6
#define X	r7
#define INCX	r8
#define Y	r9
#define INCY	r10
#define BUFFER	r14
#else
#define M	r3
#define IS	r4
#define A	r6
#define LDA	r7
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#define BUFFER	r14
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define IS	r4
#define A	r7
#define LDA	r8
#define X	r9
#define INCX	r10
#define Y	r5
#define INCY	r6
#define BUFFER	r14
#else
#define M	r3
#define IS	r4
#define A	r6
#define LDA	r7
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#define BUFFER	r14
#endif
#endif

/* Scratch integer registers. */
#define I	r11
#define J	r12

/* Work pointers: AO1..AO4 walk four adjacent columns of A, XX/YY    */
/* walk the (possibly packed) x and y vectors, NEW_Y is the y area   */
/* the kernel actually accumulates into, PREA is the prefetch offset.*/
#define AO1	r15
#define AO2	r16
#define AO3	r17
#define AO4	r18
#define XX	r19
#define YY	r20
#define NEW_Y	r21
#define TEMP	r22
#define PREA	r24

/* Four y elements currently being updated. */
#define y01	f0
#define y02	f1
#define y03	f2
#define y04	f3

/* alpha * x[IS .. IS+3] for the four columns being processed. */
#define atemp1	f4
#define atemp2	f5
#define atemp3	f6
#define atemp4	f7

/* Four consecutive x elements of the current row group. */
#define xtemp1	f8
#define xtemp2	f9
#define xtemp3	f10
#define xtemp4	f11

/* Per-column dot-product accumulators. */
#define xsum1	f12
#define xsum2	f13
#define xsum3	f14
#define xsum4	f15

/* 4x4 tile of A: a1..a4 from AO1, a5..a8 from AO2, a9..a12 from     */
/* AO3, a13..a16 from AO4.                                           */
#define a1	f16
#define a2	f17
#define a3	f18
#define a4	f19
#define a5	f20
#define a6	f21
#define a7	f22
#define a8	f23
#define a9	f24
#define a10	f25
#define a11	f26
#define a12	f27
#define a13	f28
#define a14	f29
#define a15	f30
#define a16	f31

/* alpha arrives in f1, the same register as y02; the routine spills */
/* it to its ALPHA stack slot before y02 is first loaded.            */
#define alpha	f1

/* Prefetch distance (in elements) for the columns of A, per CPU. */
#if defined(PPCG4)
#define PREFETCHSIZE_A  24
#endif

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A  24
#endif

#ifdef PPC970
#define PREFETCHSIZE_A  64
#endif

#ifdef CELL
#define PREFETCHSIZE_A  72
#endif

#ifdef POWER4
#define PREFETCHSIZE_A  16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A  96
#endif

#ifdef POWER6
#define PREFETCHSIZE_A  40
#endif

/* Fallback so the file still assembles when none of the CPU macros  */
/* above is defined.  The value only feeds dcbt/dcbtst prefetch      */
/* hints; it is an untuned default, not a measured optimum.          */
#ifndef PREFETCHSIZE_A
#define PREFETCHSIZE_A  24
#endif

/* NOP1/NOP2 are scheduling padding: empty on the POWER4/5/6 and     */
/* PPC970 builds, register self-moves everywhere else.               */
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else
#define NOP1   mr	LDA, LDA
#define NOP2   mr	INCX, INCX
#endif

#ifndef NEEDPARAM

/* Private frame layout:                                             */
/*   0..143        f14-f31                                           */
/*   144..         r14-r27 (4 bytes each on 32-bit, 8 on 64-bit)     */
/*   ALPHA, FZERO  spilled alpha and a floating-point zero           */
#ifndef __64BIT__
#define STACKSIZE 224
#define ALPHA     200(SP)
#define FZERO	  208(SP)
#else
#define STACKSIZE 280
#define ALPHA     256(SP)
#define FZERO	  264(SP)
#endif

/*-------------------------------------------------------------------*/
/* Symmetric matrix-vector update kernel, four columns at a time.    */
/* NOTE(review): the diagonal handling at LL(18)/LL(28)/LL(38) only  */
/* reads rows 0..j of column j, which looks like the upper-triangle  */
/* SYMV kernel -- confirm against the build rules for this file.     */
/*                                                                   */
/* In:   M, IS, alpha (f1), A, LDA, X, INCX, Y, INCY, BUFFER         */
/*       (register / stack assignment per the defines above)         */
/* Out:  r3 = 0; y is updated in place                               */
/* Flow:                                                             */
/*   LL(01)-LL(04)  gather a strided x into BUFFER                   */
/*   LL(05)-LL(06)  for a strided y, accumulate into a zeroed        */
/*                  scratch area in BUFFER instead of y itself       */
/*   LL(11)-LL(18)  process four columns per pass                    */
/*   LL(20)-LL(38)  two-column and one-column remainders             */
/*   LL(990)-LL(997) add the scratch result back into the strided y  */
/*   LL(999)        restore registers and return                     */
/*-------------------------------------------------------------------*/

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0, 0

	/* Save the FP registers this routine overwrites. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)
	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)
	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)
	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)
	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Write a zero to FZERO (r0 == 0) and save r14-r27. */
#ifdef __64BIT__
	std	r0,   FZERO
	std	r14,  144(SP)
	std	r15,  152(SP)
	std	r16,  160(SP)
	std	r17,  168(SP)
	std	r18,  176(SP)
	std	r19,  184(SP)
	std	r20,  192(SP)
	std	r21,  200(SP)
	std	r22,  208(SP)
	std	r23,  216(SP)
	std	r24,  224(SP)
	std	r25,  232(SP)
	std	r26,  240(SP)
	std	r27,  248(SP)
#else
	stw	r0,   0 + FZERO
	stw	r0,   4 + FZERO
	stw	r14,  144(SP)
	stw	r15,  148(SP)
	stw	r16,  152(SP)
	stw	r17,  156(SP)
	stw	r18,  160(SP)
	stw	r19,  164(SP)
	stw	r20,  168(SP)
	stw	r21,  172(SP)
	stw	r22,  176(SP)
	stw	r23,  180(SP)
	stw	r24,  184(SP)
	stw	r25,  188(SP)
	stw	r26,  192(SP)
	stw	r27,  196(SP)
#endif

	/* Fetch the arguments that were passed on the caller's stack.   */
	/* NOTE(review): the 32-bit Linux offset (56) equals the         */
	/* AIX/Darwin one; the 32-bit SysV ABI normally places the first */
	/* stack argument at 8 above the caller's SP -- confirm.         */
#ifdef linux
#ifndef __64BIT__
	lwz	BUFFER,  56 + STACKSIZE(SP)
#else
	ld	INCY,    112 + STACKSIZE(SP)
	ld	BUFFER,  120 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	Y,       56 + STACKSIZE(SP)
	lwz	INCY,    60 + STACKSIZE(SP)
	lwz	BUFFER,  64 + STACKSIZE(SP)
#else
	lwz	INCY,    56 + STACKSIZE(SP)
	lwz	BUFFER,  60 + STACKSIZE(SP)
#endif
#else
	ld	INCY,    112 + STACKSIZE(SP)
	ld	BUFFER,  120 + STACKSIZE(SP)
#endif
#endif

	/* Spill alpha: f1 is reused as y02 below. */
	STFD	alpha, ALPHA

	/* Scale strides by BASE_SHIFT (presumably elements -> bytes;    */
	/* BASE_SHIFT comes from common.h).                              */
	slwi	LDA,  LDA,  BASE_SHIFT
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

	li	PREA, PREFETCHSIZE_A * SIZE
	sub	IS, M, IS			/* IS = first column to process */

	cmpwi	cr0, M, 0
	ble-	LL(999)				/* nothing to do for M <= 0 */

	mullw	TEMP, IS, LDA
	add	A, A, TEMP			/* A -> column IS */

	cmpwi	cr0, INCX, SIZE
	beq	LL(05)				/* x already contiguous */

	/* Gather strided x into BUFFER; X then points at the packed copy. */
	mr	XX, X
	mr	X, BUFFER

	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(03)
	.align 4

LL(01):
	/* Copy eight elements of x per iteration. */
	LFD	a1, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a2, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a3, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a4, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a5, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a6, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a7, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a8, 0 * SIZE(XX)
	add	XX, XX, INCX

	dcbt	XX, PREA
	dcbtst	BUFFER, PREA

	STFD	a1, 0 * SIZE(BUFFER)
	STFD	a2, 1 * SIZE(BUFFER)
	STFD	a3, 2 * SIZE(BUFFER)
	STFD	a4, 3 * SIZE(BUFFER)
	STFD	a5, 4 * SIZE(BUFFER)
	STFD	a6, 5 * SIZE(BUFFER)
	STFD	a7, 6 * SIZE(BUFFER)
	STFD	a8, 7 * SIZE(BUFFER)

	addi	BUFFER, BUFFER, 8 * SIZE
	bdnz	LL(01)
	.align 4

LL(03):
	/* Remaining M & 7 elements of x, one at a time. */
	andi.	r0, M, 7
	mtspr	CTR, r0
	ble	LL(05)
	.align 4

LL(04):
	LFD	a1, 0 * SIZE(XX)
	add	XX, XX, INCX

	STFD	a1, 0 * SIZE(BUFFER)
	addi	BUFFER, BUFFER, 1 * SIZE
	bdnz	LL(04)
	.align 4

LL(05):
	/* Contiguous y is updated in place; a strided y is replaced by  */
	/* a zero-filled scratch area starting at the current BUFFER.    */
	mr	NEW_Y, Y
	lfd	f0, FZERO

	cmpwi	cr0, INCY, SIZE
	beq	LL(10)

	mr	NEW_Y, BUFFER

	addi	r0, M, 7
	srawi.	r0, r0, 3			/* ceil(M / 8) groups of eight */
	mtspr	CTR, r0
	.align 4

LL(06):
	STFD	f0, 0 * SIZE(BUFFER)
	STFD	f0, 1 * SIZE(BUFFER)
	STFD	f0, 2 * SIZE(BUFFER)
	STFD	f0, 3 * SIZE(BUFFER)
	STFD	f0, 4 * SIZE(BUFFER)
	STFD	f0, 5 * SIZE(BUFFER)
	STFD	f0, 6 * SIZE(BUFFER)
	STFD	f0, 7 * SIZE(BUFFER)
	addi	BUFFER, BUFFER, 8 * SIZE
	bdnz	LL(06)
	.align 4

LL(10):
	/* Skip the four-column pass unless IS + 4 <= M. */
	addi	TEMP, IS, 4
	cmpw	cr0, TEMP, M
	bgt	LL(20)
	.align 4

LL(11):
	/* Set up one pass over columns IS .. IS+3. */
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A,   AO4, LDA

	slwi	TEMP, IS, BASE_SHIFT
	add	TEMP, X, TEMP			/* TEMP -> x[IS] */

	LFD	a16, ALPHA			/* a16 holds alpha only until reloaded below */
	lfd	xsum1, FZERO

	LFD	atemp1, 0 * SIZE(TEMP)
	LFD	atemp2, 1 * SIZE(TEMP)
	LFD	atemp3, 2 * SIZE(TEMP)
	LFD	atemp4, 3 * SIZE(TEMP)

	LFD	xtemp1, 0 * SIZE(X)
	LFD	xtemp2, 1 * SIZE(X)
	LFD	xtemp3, 2 * SIZE(X)
	LFD	xtemp4, 3 * SIZE(X)

	LFD	y01, 0 * SIZE(NEW_Y)
	LFD	y02, 1 * SIZE(NEW_Y)
	LFD	y03, 2 * SIZE(NEW_Y)
	LFD	y04, 3 * SIZE(NEW_Y)

	/* atemp = alpha * x[IS..IS+3], interleaved with the first tile load. */
	LFD	a1, 0 * SIZE(AO1)
	FMUL	atemp1, a16, atemp1
	LFD	a2, 1 * SIZE(AO1)
	FMUL	atemp2, a16, atemp2
	LFD	a3, 2 * SIZE(AO1)
	FMUL	atemp3, a16, atemp3
	LFD	a4, 3 * SIZE(AO1)
	FMUL	atemp4, a16, atemp4

	LFD	a5, 0 * SIZE(AO2)
	fmr	xsum2, xsum1
	LFD	a6, 1 * SIZE(AO2)
	fmr	xsum3, xsum1
	LFD	a7, 2 * SIZE(AO2)
	fmr	xsum4, xsum1
	LFD	a8, 3 * SIZE(AO2)

	LFD	a9,  0 * SIZE(AO3)
	LFD	a10, 1 * SIZE(AO3)
	LFD	a11, 2 * SIZE(AO3)
	LFD	a12, 3 * SIZE(AO3)

	LFD	a13, 0 * SIZE(AO4)
	LFD	a14, 1 * SIZE(AO4)
	LFD	a15, 2 * SIZE(AO4)
	LFD	a16, 3 * SIZE(AO4)

	mr	XX, X
	mr	YY, NEW_Y

	srawi.	r0, IS, 4			/* 16 rows per LL(12) iteration */
	mtspr	CTR, r0
	ble	LL(14)
	.align 4

LL(12):
	/* Rows +0..+3: xsum += x * A(tile), y += atemp * A(tile);       */
	/* the next tile (offsets 4..7) is loaded as registers free up.  */
	FMADD	xsum1, xtemp1, a1, xsum1
	DCBT(AO1, PREA)
	FMADD	y01, atemp1, a1, y01
	LFD	a1, 4 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5, xsum2
	NOP1
	FMADD	y02, atemp1, a2, y02
	NOP2

	FMADD	xsum3, xtemp1, a9, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 4 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp2, a2, xsum1
	LFD	a2, 5 * SIZE(AO1)
	FMADD	y01, atemp2, a5, y01
	LFD	a5, 4 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6, xsum2
	NOP1
	FMADD	y02, atemp2, a6, y02
	LFD	a6, 5 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7, y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 5 * SIZE(XX)
	FMADD	y04, atemp2, a8, y04
#	DCBT(X, PREX)
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y01, atemp3, a9, y01
	LFD	a9, 4 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7, xsum2
	LFD	a7, 6 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 5 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 6 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 6 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4, xsum1
	LFD	a4, 7 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 4 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8, xsum2
	LFD	a8, 7 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 5 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 7 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 6 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 7 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 7 * SIZE(AO4)

	STFD	y01, 0 * SIZE(YY)
	LFD	y01, 4 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)
	LFD	y02, 5 * SIZE(YY)

	STFD	y03, 2 * SIZE(YY)
	LFD	y03, 6 * SIZE(YY)
	STFD	y04, 3 * SIZE(YY)
	LFD	y04, 7 * SIZE(YY)

	/* Rows +4..+7; next tile comes from offsets 8..11. */
	FMADD	xsum1, xtemp1, a1, xsum1
	DCBT(AO2, PREA)
	FMADD	y01, atemp1, a1, y01
	LFD	a1, 8 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5, xsum2
	NOP1
	FMADD	y02, atemp1, a2, y02
	NOP2

	FMADD	xsum3, xtemp1, a9, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 8 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp2, a2, xsum1
	LFD	a2, 9 * SIZE(AO1)
	FMADD	y01, atemp2, a5, y01
	LFD	a5, 8 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6, xsum2
	NOP1
	FMADD	y02, atemp2, a6, y02
	LFD	a6, 9 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7, y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 9 * SIZE(XX)
	FMADD	y04, atemp2, a8, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y01, atemp3, a9, y01
	LFD	a9, 8 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7, xsum2
	LFD	a7, 10 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 9 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 10 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 10 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4, xsum1
	LFD	a4, 11 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 8 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8, xsum2
	LFD	a8, 11 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 9 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 11 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 10 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 11 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 11 * SIZE(AO4)

	STFD	y01, 4 * SIZE(YY)
	LFD	y01, 8 * SIZE(YY)
	STFD	y02, 5 * SIZE(YY)
	LFD	y02, 9 * SIZE(YY)

	STFD	y03, 6 * SIZE(YY)
	LFD	y03, 10 * SIZE(YY)
	STFD	y04, 7 * SIZE(YY)
	LFD	y04, 11 * SIZE(YY)


	/* Rows +8..+11; next tile comes from offsets 12..15. */
	FMADD	xsum1, xtemp1, a1, xsum1
	DCBT(AO3, PREA)
	FMADD	y01, atemp1, a1, y01
	LFD	a1, 12 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5, xsum2
	NOP1
	FMADD	y02, atemp1, a2, y02
	NOP2

	FMADD	xsum3, xtemp1, a9, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 12 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp2, a2, xsum1
	LFD	a2, 13 * SIZE(AO1)
	FMADD	y01, atemp2, a5, y01
	LFD	a5, 12 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6, xsum2
	NOP1
	FMADD	y02, atemp2, a6, y02
	LFD	a6, 13 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7, y03
#	DCBT(Y1, PREY)
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 13 * SIZE(XX)
	FMADD	y04, atemp2, a8, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	a3, 14 * SIZE(AO1)
	FMADD	y01, atemp3, a9, y01
	LFD	a9, 12 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7, xsum2
	LFD	a7, 14 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 13 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 14 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 14 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4, xsum1
	LFD	a4, 15 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 12 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8, xsum2
	LFD	a8, 15 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 13 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 15 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 14 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 15 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 15 * SIZE(AO4)

	STFD	y01, 8 * SIZE(YY)
	LFD	y01, 12 * SIZE(YY)
	STFD	y02, 9 * SIZE(YY)
	LFD	y02, 13 * SIZE(YY)

	STFD	y03, 10 * SIZE(YY)
	LFD	y03, 14 * SIZE(YY)
	STFD	y04, 11 * SIZE(YY)
	LFD	y04, 15 * SIZE(YY)

	/* Rows +12..+15.  The pointers are advanced by 16 elements in   */
	/* the middle of this group, so loads issued before an addi use  */
	/* offsets 16.. and loads issued after it use offsets 0..3.      */
	FMADD	xsum1, xtemp1, a1, xsum1
	DCBT(AO4, PREA)
	FMADD	y01, atemp1, a1, y01
	LFD	a1, 16 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5, xsum2
	NOP1
	FMADD	y02, atemp1, a2, y02
	NOP2

	FMADD	xsum3, xtemp1, a9, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 16 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	addi	YY, YY, 16 * SIZE

	FMADD	xsum1, xtemp2, a2, xsum1
	LFD	a2, 17 * SIZE(AO1)
	FMADD	y01, atemp2, a5, y01
	LFD	a5, 16 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6, xsum2
	addi	AO3, AO3, 16 * SIZE
	FMADD	y02, atemp2, a6, y02
	LFD	a6, 17 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	addi	AO1, AO1, 16 * SIZE
	FMADD	y03, atemp2, a7, y03
	addi	AO2, AO2, 16 * SIZE

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 17 * SIZE(XX)
	FMADD	y04, atemp2, a8, y04
	addi	AO4, AO4, 16 * SIZE

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	a3, 2 * SIZE(AO1)
	FMADD	y01, atemp3, a9, y01
	LFD	a9, 0 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7, xsum2
	LFD	a7, 2 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 1 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 2 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 18 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	addi	XX, XX, 16 * SIZE

	FMADD	xsum1, xtemp4, a4, xsum1
	LFD	a4, 3 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 0 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8, xsum2
	LFD	a8, 3 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 1 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 3 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 2 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 3 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 3 * SIZE(AO4)

	STFD	y01, -4 * SIZE(YY)
	LFD	y01, 0 * SIZE(YY)
	STFD	y02, -3 * SIZE(YY)
	LFD	y02, 1 * SIZE(YY)

	STFD	y03, -2 * SIZE(YY)
	LFD	y03, 2 * SIZE(YY)
	STFD	y04, -1 * SIZE(YY)
	LFD	y04, 3 * SIZE(YY)
	bdnz	LL(12)
	.align 4

LL(14):
	/* Eight more rows when bit 3 of IS is set (andi. never yields   */
	/* a negative result, so ble branches only on zero).             */
	andi.	r0, IS, 8
	ble	LL(15)

	FMADD	xsum1, xtemp1, a1, xsum1
	NOP1
	FMADD	y01, atemp1, a1, y01
	LFD	a1, 4 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5, xsum2
	NOP1
	FMADD	y02, atemp1, a2, y02
	NOP2

	FMADD	xsum3, xtemp1, a9, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 4 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp2, a2, xsum1
	LFD	a2, 5 * SIZE(AO1)
	FMADD	y01, atemp2, a5, y01
	LFD	a5, 4 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6, xsum2
	NOP1
	FMADD	y02, atemp2, a6, y02
	LFD	a6, 5 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7, y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 5 * SIZE(XX)
	FMADD	y04, atemp2, a8, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y01, atemp3, a9, y01
	LFD	a9, 4 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7, xsum2
	LFD	a7, 6 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 5 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 6 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 6 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4, xsum1
	LFD	a4, 7 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 4 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8, xsum2
	LFD	a8, 7 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 5 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 7 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 6 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 7 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 7 * SIZE(AO4)

	STFD	y01, 0 * SIZE(YY)
	LFD	y01, 4 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)
	LFD	y02, 5 * SIZE(YY)

	STFD	y03, 2 * SIZE(YY)
	LFD	y03, 6 * SIZE(YY)
	STFD	y04, 3 * SIZE(YY)
	LFD	y04, 7 * SIZE(YY)

	FMADD	xsum1, xtemp1, a1, xsum1
	NOP1
	FMADD	y01, atemp1, a1, y01
	LFD	a1, 8 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5, xsum2
	NOP1
	FMADD	y02, atemp1, a2, y02
	NOP2

	FMADD	xsum3, xtemp1, a9, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 8 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp2, a2, xsum1
	LFD	a2, 9 * SIZE(AO1)
	FMADD	y01, atemp2, a5, y01
	LFD	a5, 8 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6, xsum2
	NOP1
	FMADD	y02, atemp2, a6, y02
	LFD	a6, 9 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7, y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 9 * SIZE(XX)
	FMADD	y04, atemp2, a8, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y01, atemp3, a9, y01
	LFD	a9, 8 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7, xsum2
	LFD	a7, 10 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 9 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 10 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 10 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4, xsum1
	LFD	a4, 11 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 8 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8, xsum2
	LFD	a8, 11 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 9 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 11 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 10 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 11 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 11 * SIZE(AO4)

	addi	AO1, AO1, 8 * SIZE
	addi	AO2, AO2, 8 * SIZE
	addi	AO3, AO3, 8 * SIZE
	addi	AO4, AO4, 8 * SIZE

	STFD	y01, 4 * SIZE(YY)
	LFD	y01, 8 * SIZE(YY)
	STFD	y02, 5 * SIZE(YY)
	LFD	y02, 9 * SIZE(YY)

	STFD	y03, 6 * SIZE(YY)
	LFD	y03, 10 * SIZE(YY)
	STFD	y04, 7 * SIZE(YY)
	LFD	y04, 11 * SIZE(YY)

	addi	XX, XX, 8 * SIZE
	addi	YY, YY, 8 * SIZE
	.align 4

LL(15):
	/* Four more rows when bit 2 of IS is set. */
	andi.	r0, IS, 4
	ble	LL(18)

	FMADD	xsum1, xtemp1, a1, xsum1
	NOP1
	FMADD	y01, atemp1, a1, y01
	LFD	a1, 4 * SIZE(AO1)

	FMADD	xsum2, xtemp1, a5, xsum2
	NOP1
	FMADD	y02, atemp1, a2, y02
	NOP2

	FMADD	xsum3, xtemp1, a9, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp1, a13, xsum4
	LFD	xtemp1, 4 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp2, a2, xsum1
	LFD	a2, 5 * SIZE(AO1)
	FMADD	y01, atemp2, a5, y01
	LFD	a5, 4 * SIZE(AO2)

	FMADD	xsum2, xtemp2, a6, xsum2
	NOP1
	FMADD	y02, atemp2, a6, y02
	LFD	a6, 5 * SIZE(AO2)

	FMADD	xsum3, xtemp2, a10, xsum3
	NOP1
	FMADD	y03, atemp2, a7, y03
	NOP2

	FMADD	xsum4, xtemp2, a14, xsum4
	LFD	xtemp2, 5 * SIZE(XX)
	FMADD	y04, atemp2, a8, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y01, atemp3, a9, y01
	LFD	a9, 4 * SIZE(AO3)

	FMADD	xsum2, xtemp3, a7, xsum2
	LFD	a7, 6 * SIZE(AO2)
	FMADD	y02, atemp3, a10, y02
	LFD	a10, 5 * SIZE(AO3)

	FMADD	xsum3, xtemp3, a11, xsum3
	NOP1
	FMADD	y03, atemp3, a11, y03
	LFD	a11, 6 * SIZE(AO3)

	FMADD	xsum4, xtemp3, a15, xsum4
	LFD	xtemp3, 6 * SIZE(XX)
	FMADD	y04, atemp3, a12, y04
	NOP2

	FMADD	xsum1, xtemp4, a4, xsum1
	LFD	a4, 7 * SIZE(AO1)
	FMADD	y01, atemp4, a13, y01
	LFD	a13, 4 * SIZE(AO4)

	FMADD	xsum2, xtemp4, a8, xsum2
	LFD	a8, 7 * SIZE(AO2)
	FMADD	y02, atemp4, a14, y02
	LFD	a14, 5 * SIZE(AO4)

	FMADD	xsum3, xtemp4, a12, xsum3
	LFD	a12, 7 * SIZE(AO3)
	FMADD	y03, atemp4, a15, y03
	LFD	a15, 6 * SIZE(AO4)

	FMADD	xsum4, xtemp4, a16, xsum4
	LFD	xtemp4, 7 * SIZE(XX)
	FMADD	y04, atemp4, a16, y04
	LFD	a16, 7 * SIZE(AO4)

	addi	AO1, AO1, 4 * SIZE
	addi	AO2, AO2, 4 * SIZE
	addi	AO3, AO3, 4 * SIZE
	addi	AO4, AO4, 4 * SIZE

	STFD	y01, 0 * SIZE(YY)
	LFD	y01, 4 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)
	LFD	y02, 5 * SIZE(YY)

	STFD	y03, 2 * SIZE(YY)
	LFD	y03, 6 * SIZE(YY)
	STFD	y04, 3 * SIZE(YY)
	LFD	y04, 7 * SIZE(YY)

	addi	XX, XX, 4 * SIZE
	addi	YY, YY, 4 * SIZE
	.align 4

LL(18):
	/* Diagonal 4x4 block.  The column sums are scaled by alpha,     */
	/* then the block is applied symmetrically using only a1, a5-a6, */
	/* a9-a11 and a13-a16 (rows 0..j of column j).                   */
	LFD	xtemp1, ALPHA

	FMUL	xsum1, xtemp1, xsum1
	FMUL	xsum2, xtemp1, xsum2
	FMUL	xsum3, xtemp1, xsum3
	FMUL	xsum4, xtemp1, xsum4

	FMADD	xsum1, atemp1, a1,  xsum1
	FMADD	xsum2, atemp1, a5,  xsum2
	FMADD	xsum3, atemp1, a9,  xsum3
	FMADD	xsum4, atemp1, a13, xsum4

	FMADD	xsum1, atemp2, a5,  xsum1
	FMADD	xsum2, atemp2, a6,  xsum2
	FMADD	xsum3, atemp2, a10, xsum3
	FMADD	xsum4, atemp2, a14, xsum4

	FMADD	xsum1, atemp3, a9,  xsum1
	FMADD	xsum2, atemp3, a10, xsum2
	FMADD	xsum3, atemp3, a11, xsum3
	FMADD	xsum4, atemp3, a15, xsum4

	FMADD	xsum1, atemp4, a13, xsum1
	FMADD	xsum2, atemp4, a14, xsum2
	FMADD	xsum3, atemp4, a15, xsum3
	FMADD	xsum4, atemp4, a16, xsum4

	FADD	y01, y01, xsum1
	FADD	y02, y02, xsum2
	FADD	y03, y03, xsum3
	FADD	y04, y04, xsum4

	STFD	y01, 0 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)
	STFD	y03, 2 * SIZE(YY)
	STFD	y04, 3 * SIZE(YY)

	/* Next group of four columns while (IS + 4) + 4 <= M. */
	addi	TEMP, IS, 8
	addi	IS,   IS, 4
	cmpw	cr0, TEMP, M
	ble	LL(11)
	.align 4

LL(20):
	/* Two-column remainder.                                          */
	/* NOTE(review): the remainder tests use bits of M, not of       */
	/* M - IS; that is only equivalent if the starting column is a   */
	/* multiple of 4 -- verify against the callers.                  */
	andi.	TEMP, M, 2
	ble	LL(30)

	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	slwi	TEMP, IS, BASE_SHIFT
	add	TEMP, X, TEMP

	LFD	atemp1, 0 * SIZE(TEMP)
	LFD	atemp2, 1 * SIZE(TEMP)

	LFD	a1, ALPHA

	FMUL	atemp1, a1, atemp1
	FMUL	atemp2, a1, atemp2

	lfd	xsum1, FZERO
	fmr	xsum2, xsum1

	mr	XX, X
	mr	YY, NEW_Y

	LFD	xtemp1, 0 * SIZE(XX)
	LFD	xtemp2, 1 * SIZE(XX)

	LFD	y01, 0 * SIZE(YY)
	LFD	y02, 1 * SIZE(YY)

	LFD	a1, 0 * SIZE(AO1)
	LFD	a2, 1 * SIZE(AO1)

	LFD	a5, 0 * SIZE(AO2)
	LFD	a6, 1 * SIZE(AO2)

	srawi.	r0, IS, 1			/* two rows per iteration */
	mtspr	CTR, r0
	ble	LL(28)
	.align 4

LL(22):
	FMADD	xsum1, xtemp1, a1, xsum1
	FMADD	xsum2, xtemp1, a5, xsum2

	FMADD	xsum1, xtemp2, a2, xsum1
	FMADD	xsum2, xtemp2, a6, xsum2

	FMADD	y01, atemp1, a1, y01
	FMADD	y02, atemp1, a2, y02
	FMADD	y01, atemp2, a5, y01
	FMADD	y02, atemp2, a6, y02

	LFD	xtemp1, 2 * SIZE(XX)
	LFD	xtemp2, 3 * SIZE(XX)

	LFD	a1, 2 * SIZE(AO1)
	LFD	a2, 3 * SIZE(AO1)

	LFD	a5, 2 * SIZE(AO2)
	LFD	a6, 3 * SIZE(AO2)

	STFD	y01, 0 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)

	LFD	y01, 2 * SIZE(YY)
	LFD	y02, 3 * SIZE(YY)

	addi	AO1, AO1, 2 * SIZE
	addi	AO2, AO2, 2 * SIZE

	addi	XX, XX, 2 * SIZE
	addi	YY, YY, 2 * SIZE

	bdnz	LL(22)
	.align 4

LL(28):
	/* Diagonal 2x2 block: uses a1, a5 and a6 only. */
	LFD	xtemp1, ALPHA

	FMUL	xsum1, xtemp1, xsum1
	FMUL	xsum2, xtemp1, xsum2

	FMADD	xsum1, atemp1, a1, xsum1
	FMADD	xsum2, atemp1, a5, xsum2
	FMADD	xsum1, atemp2, a5, xsum1
	FMADD	xsum2, atemp2, a6, xsum2

	FADD	y01, y01, xsum1
	FADD	y02, y02, xsum2

	STFD	y01, 0 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)

	addi	IS, IS, 2
	.align 4

LL(30):
	/* Single-column remainder. */
	andi.	TEMP, M, 1
	ble	LL(990)

	mr	AO1, A

	slwi	TEMP, IS, BASE_SHIFT
	add	TEMP, X, TEMP

	LFD	atemp1, 0 * SIZE(TEMP)

	LFD	a1, ALPHA

	FMUL	atemp1, a1, atemp1

	lfd	xsum1, FZERO

	mr	XX, X
	mr	YY, NEW_Y

	LFD	xtemp1, 0 * SIZE(XX)
	LFD	y01,    0 * SIZE(YY)

	LFD	a1, 0 * SIZE(AO1)

	mtspr	CTR, IS				/* one row per iteration */
	cmpwi	cr0, IS, 0
	ble	LL(38)
	.align 4

LL(32):
	FMADD	xsum1, xtemp1, a1, xsum1

	FMADD	y01, atemp1, a1, y01

	LFD	xtemp1, 1 * SIZE(XX)

	LFD	a1, 1 * SIZE(AO1)

	STFD	y01, 0 * SIZE(YY)

	LFD	y01, 1 * SIZE(YY)

	addi	AO1, AO1, 1 * SIZE

	addi	XX, XX, 1 * SIZE
	addi	YY, YY, 1 * SIZE

	bdnz	LL(32)
	.align 4

LL(38):
	/* Diagonal element. */
	LFD	xtemp1, ALPHA

	FMUL	xsum1, xtemp1, xsum1

	FMADD	xsum1, atemp1, a1, xsum1

	FADD	y01, y01, xsum1

	STFD	y01, 0 * SIZE(YY)
	.align 4

LL(990):
	/* For a strided y, add the scratch result (NEW_Y) to the        */
	/* caller's y: Y is the strided read cursor, YY the strided      */
	/* write cursor over the same vector.                            */
	cmpwi	cr0, INCY, SIZE
	beq	LL(999)

	mr	YY, Y

	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(995)
	.align 4

LL(991):
	/* Eight elements per iteration. */
	LFD	f0, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f1, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f2, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f3, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f4, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f5, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f6, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f7, 0 * SIZE(Y)
	add	Y, Y, INCY

	LFD	f8,  0 * SIZE(NEW_Y)
	LFD	f9,  1 * SIZE(NEW_Y)
	LFD	f10, 2 * SIZE(NEW_Y)
	LFD	f11, 3 * SIZE(NEW_Y)
	LFD	f12, 4 * SIZE(NEW_Y)
	LFD	f13, 5 * SIZE(NEW_Y)
	LFD	f14, 6 * SIZE(NEW_Y)
	LFD	f15, 7 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 8 * SIZE

	FADD	f8,  f8,  f0
	FADD	f9,  f9,  f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3
	FADD	f12, f12, f4
	FADD	f13, f13, f5
	FADD	f14, f14, f6
	FADD	f15, f15, f7

	STFD	f8,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f9,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f10, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f11, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f12, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f13, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f14, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f15, 0 * SIZE(YY)
	add	YY, YY, INCY
	bdnz	LL(991)
	.align 4

LL(995):
	/* Four elements when bit 2 of M is set. */
	andi.	J, M, 4
	ble	LL(996)

	LFD	f0, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f1, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f2, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f3, 0 * SIZE(Y)
	add	Y, Y, INCY

	LFD	f8,  0 * SIZE(NEW_Y)
	LFD	f9,  1 * SIZE(NEW_Y)
	LFD	f10, 2 * SIZE(NEW_Y)
	LFD	f11, 3 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 4 * SIZE

	FADD	f8,  f8,  f0
	FADD	f9,  f9,  f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3

	STFD	f8,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f9,  0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f10, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f11, 0 * SIZE(YY)
	add	YY, YY, INCY
	.align 4

LL(996):
	/* Two elements when bit 1 of M is set. */
	andi.	J, M, 2
	ble	LL(997)

	LFD	f0, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f1, 0 * SIZE(Y)
	add	Y, Y, INCY

	LFD	f8, 0 * SIZE(NEW_Y)
	LFD	f9, 1 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 2 * SIZE

	FADD	f8, f8, f0
	FADD	f9, f9, f1

	STFD	f8, 0 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f9, 0 * SIZE(YY)
	add	YY, YY, INCY
	.align 4

LL(997):
	/* Last element when M is odd. */
	andi.	J, M, 1
	ble	LL(999)

	LFD	f0, 0 * SIZE(Y)
	LFD	f8, 0 * SIZE(NEW_Y)

	FADD	f8, f8, f0

	STFD	f8, 0 * SIZE(YY)
	.align 4

LL(999):
	/* Return 0 and restore everything saved on entry. */
	li	r3, 0

	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)
	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)
	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)
	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)
	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

#ifdef __64BIT__
	ld	r14,  144(SP)
	ld	r15,  152(SP)
	ld	r16,  160(SP)
	ld	r17,  168(SP)
	ld	r18,  176(SP)
	ld	r19,  184(SP)
	ld	r20,  192(SP)
	ld	r21,  200(SP)
	ld	r22,  208(SP)
	ld	r23,  216(SP)
	ld	r24,  224(SP)
	ld	r25,  232(SP)
	ld	r26,  240(SP)
	ld	r27,  248(SP)
#else
	lwz	r14,  144(SP)
	lwz	r15,  148(SP)
	lwz	r16,  152(SP)
	lwz	r17,  156(SP)
	lwz	r18,  160(SP)
	lwz	r19,  164(SP)
	lwz	r20,  168(SP)
	lwz	r21,  172(SP)
	lwz	r22,  176(SP)
	lwz	r23,  180(SP)
	lwz	r24,  184(SP)
	lwz	r25,  188(SP)
	lwz	r26,  192(SP)
	lwz	r27,  196(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif