/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Complex matrix-vector multiply-accumulate kernel for PowerPC.
 *
 * The routine walks the columns of A two at a time starting at column
 * IS (after IS has been replaced by M - IS).  For each column pair it
 * streams the IS rows above the diagonal, then handles the 2x2 diagonal
 * block separately in LL(18).  A single leftover column is processed in
 * LL(20)..LL(28) when M is odd.
 *
 * With HEMV defined, FMADD1/FMADD2 swap sign and the imaginary part of
 * each diagonal element is skipped in LL(18)/LL(28).
 * NOTE(review): this looks like the upper-triangle ZSYMV/ZHEMV kernel;
 * the file name and the symbol emitted by PROLOGUE are not visible
 * here, so confirm against the build rules.
 *
 * Inputs (register assignment depends on OS / ABI, see below):
 *   M, IS, A, LDA, X, INCX, Y, INCY, BUFFER, and alpha in f1/f2.
 * Output: the result is accumulated into Y; r3 is set to 0 on return.
 *
 * All the macros used below that are not defined in this file (SIZE,
 * ZBASE_SHIFT, LFD, STFD, FMADD, FNMSUB, FMUL, FADD, DCBT, LL, SP, CTR,
 * PROLOGUE, PROFCODE, EPILOGUE) are expected to come from common.h.
 */

#define ASSEMBLER
#include "common.h"

/* Argument registers, Linux. */
#ifdef linux
#ifndef __64BIT__
#define M	r3
#define IS	r4
#define A	r5
#define LDA	r6
#define X	r7
#define INCX	r8
#define Y	r9
#define INCY	r10
#define BUFFER	r14
#else
#define M	r3
#define IS	r4
#define A	r7
#define LDA	r8
#define X	r9
#define INCX	r10
#define Y	r5
#define INCY	r6
#define BUFFER	r14
#endif
#endif

/* Argument registers, AIX and Darwin. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define IS	r4
#define A	r9
#define LDA	r10
#define X	r5
#define INCX	r6
#define Y	r7
#define INCY	r8
#define BUFFER	r14
#else
#define M	r3
#define IS	r4
#define A	r7
#define LDA	r8
#define X	r9
#define INCX	r10
#define Y	r5
#define INCY	r6
#define BUFFER	r14
#endif
#endif

/* Scratch integer registers. */
#define I	r11
#define J	r12

/* AO1/AO2: current column pair of A.  XX/YY: running X / Y pointers. */
/* NEW_Y: contiguous Y work area (Y itself, or a zeroed part of BUFFER). */
#define AO1	r15
#define AO2	r16
#define XX	r19
#define YY	r20
#define NEW_Y	r21
#define TEMP	r22
#define PREA	r24

/* y01..y08: four complex Y elements in flight (re, im pairs). */
#define y01	f0
#define y02	f1
#define y03	f2
#define y04	f3
#define y05	f4
#define y06	f5
#define y07	f6
#define y08	f7

/* xtemp1..4: two complex X elements in flight. */
#define xtemp1	f8
#define xtemp2	f9
#define xtemp3	f10
#define xtemp4	f11
#define xtemp5	f12
#define xtemp6	f13
#define xtemp7	f14
#define xtemp8	f15

/* atemp1..4: alpha * x[IS] and alpha * x[IS + 1] (re, im pairs). */
#define atemp1	f16
#define atemp2	f17
#define atemp3	f18
#define atemp4	f19

/* xsum1..4: dot-product accumulators for the two current columns. */
#define xsum1	f20
#define xsum2	f21
#define xsum3	f22
#define xsum4	f23

/* a1..a4: two complex elements of column AO1; a5..a8: same for AO2. */
#define a1	f24
#define a2	f25
#define a3	f26
#define a4	f27
#define a5	f28
#define a6	f29
#define a7	f30
#define a8	f31

/* alpha arrives in f1/f2; it is spilled to the stack because f1/f2 */
/* are reused as y02/y03. */
#define alpha_r	f1
#define alpha_i	f2

/* Prefetch distance for A, in elements, per target CPU. */
#if defined(PPCG4)
#define PREFETCHSIZE_A  24
#endif

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A  24
#endif

#ifdef PPC970
#define PREFETCHSIZE_A  32
#endif

#ifdef CELL
#define PREFETCHSIZE_A  72
#endif

#ifdef POWER4
#define PREFETCHSIZE_A  16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A  96
#endif

#ifdef POWER6
#define PREFETCHSIZE_A  112
#endif

/* PREFETCHSIZE_A is used unconditionally when PREA is loaded below. */
/* Provide a fallback so that targets not listed above still assemble; */
/* the value only feeds prefetch hints, not the computed result. */
#ifndef PREFETCHSIZE_A
#define PREFETCHSIZE_A  24
#endif

/* NOP1/NOP2 expand to nothing on POWER4/5/6/PPC970; elsewhere they are */
/* register self-moves that change no state. */
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else
#define NOP1   mr	LDA, LDA
#define NOP2   mr	INCX, INCX
#endif

#ifndef NEEDPARAM

/* Frame layout: f14-f31 at 0..143, r14-r27 next, then alpha and a */
/* zero constant (FZERO) in the last three doublewords. */
#ifndef __64BIT__
#define STACKSIZE 224
#define ALPHA_R	  200(SP)
#define ALPHA_I	  208(SP)
#define FZERO	  216(SP)
#else
#define STACKSIZE 280
#define ALPHA_R	  256(SP)
#define ALPHA_I	  264(SP)
#define FZERO	  272(SP)
#endif

/* Sign selection for the a_imag * x_imag style terms of the xsum */
/* accumulation; HEMV swaps the two. */
#ifndef HEMV
#define FMADD1	FNMSUB
#define FMADD2	FMADD
#else
#define FMADD1	FMADD
#define FMADD2	FNMSUB
#endif

	PROLOGUE
	PROFCODE

	/* Allocate the frame and save f14-f31. */
	addi	SP, SP, -STACKSIZE
	li	r0, 0

	stfd	f14, 0(SP)
	stfd	f15, 8(SP)
	stfd	f16, 16(SP)
	stfd	f17, 24(SP)
	stfd	f18, 32(SP)
	stfd	f19, 40(SP)
	stfd	f20, 48(SP)
	stfd	f21, 56(SP)
	stfd	f22, 64(SP)
	stfd	f23, 72(SP)
	stfd	f24, 80(SP)
	stfd	f25, 88(SP)
	stfd	f26, 96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)
	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/* Write the zero constant and save r14-r27. */
#ifdef __64BIT__
	std	r0, FZERO
	std	r14, 144(SP)
	std	r15, 152(SP)
	std	r16, 160(SP)
	std	r17, 168(SP)
	std	r18, 176(SP)
	std	r19, 184(SP)
	std	r20, 192(SP)
	std	r21, 200(SP)
	std	r22, 208(SP)
	std	r23, 216(SP)
	std	r24, 224(SP)
	std	r25, 232(SP)
	std	r26, 240(SP)
	std	r27, 248(SP)
#else
	stw	r0, 0 + FZERO
	stw	r0, 4 + FZERO
	stw	r14, 144(SP)
	stw	r15, 148(SP)
	stw	r16, 152(SP)
	stw	r17, 156(SP)
	stw	r18, 160(SP)
	stw	r19, 164(SP)
	stw	r20, 168(SP)
	stw	r21, 172(SP)
	stw	r22, 176(SP)
	stw	r23, 180(SP)
	stw	r24, 184(SP)
	stw	r25, 188(SP)
	stw	r26, 192(SP)
	stw	r27, 196(SP)
#endif

	/* Fetch the arguments that were passed on the caller stack. */
#ifdef linux
#ifndef __64BIT__
	lwz	BUFFER, 56 + STACKSIZE(SP)
#else
	ld	Y, 112 + STACKSIZE(SP)
	ld	INCY, 120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	X, 56 + STACKSIZE(SP)
	lwz	INCX, 60 + STACKSIZE(SP)
	lwz	Y, 64 + STACKSIZE(SP)
	lwz	INCY, 68 + STACKSIZE(SP)
	lwz	BUFFER, 72 + STACKSIZE(SP)
#else
	lwz	Y, 56 + STACKSIZE(SP)
	lwz	INCY, 60 + STACKSIZE(SP)
	lwz	BUFFER, 64 + STACKSIZE(SP)
#endif
#else
	ld	Y, 112 + STACKSIZE(SP)
	ld	INCY, 120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

	/* Spill alpha; f1/f2 are reused as y02/y03 below. */
	STFD	alpha_r, ALPHA_R
	STFD	alpha_i, ALPHA_I

	/* Scale the strides by 1 << ZBASE_SHIFT (presumably elements to */
	/* bytes for one complex element; ZBASE_SHIFT is defined elsewhere). */
	slwi	LDA, LDA, ZBASE_SHIFT
	slwi	INCX, INCX, ZBASE_SHIFT
	slwi	INCY, INCY, ZBASE_SHIFT

	li	PREA, PREFETCHSIZE_A * SIZE
	/* IS = M - IS: index of the first column handled by this call. */
	sub	IS, M, IS

	/* Nothing to do for M <= 0. */
	cmpwi	cr0, M, 0
	ble-	LL(999)

	/* A += IS * LDA: advance to column IS. */
	mullw	TEMP, IS, LDA
	add	A, A, TEMP

	/* If X is already contiguous, skip the gather. */
	cmpwi	cr0, INCX, 2 * SIZE
	beq	LL(05)

	/* Gather strided X into BUFFER; X then refers to the packed copy. */
	mr	XX, X
	mr	X, BUFFER

	srawi.	r0, M, 2
	mtspr	CTR, r0
	ble	LL(03)
	.align 4

/* Gather loop: four complex elements of X per iteration. */
LL(01):
	LFD	a1, 0 * SIZE(XX)
	LFD	a2, 1 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a3, 0 * SIZE(XX)
	LFD	a4, 1 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a5, 0 * SIZE(XX)
	LFD	a6, 1 * SIZE(XX)
	add	XX, XX, INCX
	LFD	a7, 0 * SIZE(XX)
	LFD	a8, 1 * SIZE(XX)
	add	XX, XX, INCX

	dcbt	XX, PREA
	dcbtst	BUFFER, PREA

	STFD	a1, 0 * SIZE(BUFFER)
	STFD	a2, 1 * SIZE(BUFFER)
	STFD	a3, 2 * SIZE(BUFFER)
	STFD	a4, 3 * SIZE(BUFFER)
	STFD	a5, 4 * SIZE(BUFFER)
	STFD	a6, 5 * SIZE(BUFFER)
	STFD	a7, 6 * SIZE(BUFFER)
	STFD	a8, 7 * SIZE(BUFFER)

	addi	BUFFER, BUFFER, 8 * SIZE
	bdnz	LL(01)
	.align 4

/* Gather tail: remaining M & 3 elements, one per iteration. */
LL(03):
	andi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(05)
	.align 4

LL(04):
	LFD	a1, 0 * SIZE(XX)
	LFD	a2, 1 * SIZE(XX)
	add	XX, XX, INCX

	STFD	a1, 0 * SIZE(BUFFER)
	STFD	a2, 1 * SIZE(BUFFER)

	addi	BUFFER, BUFFER, 2 * SIZE
	bdnz	LL(04)
	.align 4

/* Choose the Y work area: Y itself when contiguous, otherwise a */
/* zero-filled region of BUFFER that is added back into Y at LL(990). */
LL(05):
	mr	NEW_Y, Y
	lfd	f0, FZERO

	cmpwi	cr0, INCY, 2 * SIZE
	beq	LL(10)

	mr	NEW_Y, BUFFER

	/* (M + 3) / 4 iterations, four complex elements each. */
	addi	r0, M, 3
	srawi.	r0, r0, 2
	mtspr	CTR, r0
	.align 4

LL(06):
	STFD	f0, 0 * SIZE(BUFFER)
	STFD	f0, 1 * SIZE(BUFFER)
	STFD	f0, 2 * SIZE(BUFFER)
	STFD	f0, 3 * SIZE(BUFFER)
	STFD	f0, 4 * SIZE(BUFFER)
	STFD	f0, 5 * SIZE(BUFFER)
	STFD	f0, 6 * SIZE(BUFFER)
	STFD	f0, 7 * SIZE(BUFFER)
	addi	BUFFER, BUFFER, 8 * SIZE
	bdnz	LL(06)
	.align 4

/* Enter the two-column loop only while IS + 2 <= M. */
LL(10):
	addi	TEMP, IS, 2
	cmpw	cr0, TEMP, M
	bgt	LL(20)
	.align 4

/* ---- Two columns per pass: AO1 = column IS, AO2 = column IS + 1 ---- */
LL(11):
	mr	AO1, A
	add	AO2, A, LDA
	add	A, AO2, LDA

	/* TEMP = &x[IS] */
	slwi	TEMP, IS, ZBASE_SHIFT
	add	TEMP, X, TEMP

	LFD	y05, ALPHA_R
	LFD	y06, ALPHA_I

	LFD	xtemp1, 0 * SIZE(TEMP)
	LFD	xtemp2, 1 * SIZE(TEMP)
	LFD	xtemp3, 2 * SIZE(TEMP)
	LFD	xtemp4, 3 * SIZE(TEMP)

	/* atemp = alpha * x[IS], alpha * x[IS + 1] (complex products, */
	/* assuming FMADD/FNMSUB follow the fmadd/fnmsub operand order). */
	FMUL	atemp1, y05, xtemp1
	FMUL	atemp2, y06, xtemp1
	FMUL	atemp3, y05, xtemp3
	FMUL	atemp4, y06, xtemp3

	FNMSUB	atemp1, y06, xtemp2, atemp1
	FMADD	atemp2, y05, xtemp2, atemp2
	FNMSUB	atemp3, y06, xtemp4, atemp3
	FMADD	atemp4, y05, xtemp4, atemp4

	/* Clear the dot-product accumulators. */
	lfd	xsum1, FZERO
	fmr	xsum2, xsum1
	fmr	xsum3, xsum1
	fmr	xsum4, xsum1

	mr	XX, X
	mr	YY, NEW_Y

	/* Preload the first two rows of both columns, of X and of Y. */
	LFD	a1, 0 * SIZE(AO1)
	LFD	a2, 1 * SIZE(AO1)
	LFD	a3, 2 * SIZE(AO1)
	LFD	a4, 3 * SIZE(AO1)

	LFD	a5, 0 * SIZE(AO2)
	LFD	a6, 1 * SIZE(AO2)
	LFD	a7, 2 * SIZE(AO2)
	LFD	a8, 3 * SIZE(AO2)

	LFD	xtemp1, 0 * SIZE(XX)
	LFD	xtemp2, 1 * SIZE(XX)
	LFD	xtemp3, 2 * SIZE(XX)
	LFD	xtemp4, 3 * SIZE(XX)

	LFD	y01, 0 * SIZE(YY)
	LFD	y02, 1 * SIZE(YY)
	LFD	y03, 2 * SIZE(YY)
	LFD	y04, 3 * SIZE(YY)

	/* CTR = IS / 8: number of 8-row groups above the diagonal. */
	srawi.	r0, IS, 3
	mtspr	CTR, r0
	ble	LL(15)

	/* Pipeline fill: first part of the first 8-row group.  The rest of */
	/* the group is finished in LL(12) (more groups) or LL(13) (last). */
	FMADD	xsum1, xtemp1, a1, xsum1
	DCBT(AO1, PREA)
	FMADD	y01, atemp1, a1, y01
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	NOP1
	FMADD	y02, atemp2, a1, y02
	LFD	a1, 4 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	NOP1
	FMADD	y04, atemp2, a3, y04
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y05, 4 * SIZE(YY)
	FNMSUB	y01, atemp2, a2, y01
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y06, 5 * SIZE(YY)
	FMADD	y02, atemp1, a2, y02
	LFD	a2, 5 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 5 * SIZE(XX)
	FNMSUB	y03, atemp2, a4, y03
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 4 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y07, 6 * SIZE(YY)
	FMADD	y01, atemp3, a5, y01
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y02, atemp4, a5, y02
	LFD	a5, 4 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y08, 7 * SIZE(YY)
	FMADD	y03, atemp3, a7, y03
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y04, atemp4, a7, y04
	LFD	a7, 6 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y01, atemp4, a6, y01
#	DCBT(X, PREX)
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 7 * SIZE(AO1)
	FMADD	y02, atemp3, a6, y02
	LFD	a6, 5 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 7 * SIZE(XX)
	FNMSUB	y03, atemp4, a8, y03
	NOP2

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 6 * SIZE(XX)
	FMADD	y04, atemp3, a8, y04
	LFD	a8, 7 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y01, 0 * SIZE(YY)
	FMADD	y05, atemp1, a1, y05
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y02, 1 * SIZE(YY)
	FMADD	y06, atemp2, a1, y06
	LFD	a1, 8 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y03, 2 * SIZE(YY)
	FMADD	y07, atemp1, a3, y07
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y04, 3 * SIZE(YY)
	FMADD	y08, atemp2, a3, y08
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y01, 8 * SIZE(YY)
	FNMSUB	y05, atemp2, a2, y05
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y02, 9 * SIZE(YY)
	FMADD	y06, atemp1, a2, y06
	LFD	a2, 9 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 9 * SIZE(XX)
	FNMSUB	y07, atemp2, a4, y07
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 8 * SIZE(XX)
	FMADD	y08, atemp1, a4, y08
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y03, 10 * SIZE(YY)
	FMADD	y05, atemp3, a5, y05
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y06, atemp4, a5, y06
	LFD	a5, 8 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y04, 11 * SIZE(YY)
	FMADD	y07, atemp3, a7, y07
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y08, atemp4, a7, y08
	LFD	a7, 10 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y05, atemp4, a6, y05
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 11 * SIZE(AO1)
	FMADD	y06, atemp3, a6, y06
	LFD	a6, 9 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 11 * SIZE(XX)
	FNMSUB	y07, atemp4, a8, y07
	/* Only one 8-row group: go straight to the drain code. */
	bdz	LL(13)
	.align 4

/* Steady state: finishes the current 8-row group, advances AO1, AO2, */
/* XX and YY by 16 * SIZE, and starts the next group. */
LL(12):
	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 10 * SIZE(XX)
	FMADD	y08, atemp3, a8, y08
	LFD	a8, 11 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y05, 4 * SIZE(YY)
	FMADD	y01, atemp1, a1, y01
	DCBT(AO2, PREA)

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y06, 5 * SIZE(YY)
	FMADD	y02, atemp2, a1, y02
	LFD	a1, 12 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y07, 6 * SIZE(YY)
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y08, 7 * SIZE(YY)
	FMADD	y04, atemp2, a3, y04
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y05, 12 * SIZE(YY)
	FNMSUB	y01, atemp2, a2, y01
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y06, 13 * SIZE(YY)
	FMADD	y02, atemp1, a2, y02
	LFD	a2, 13 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 13 * SIZE(XX)
	FNMSUB	y03, atemp2, a4, y03
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 12 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y07, 14 * SIZE(YY)
	FMADD	y01, atemp3, a5, y01
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 14 * SIZE(AO1)
	FMADD	y02, atemp4, a5, y02
	LFD	a5, 12 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y08, 15 * SIZE(YY)
	FMADD	y03, atemp3, a7, y03
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y04, atemp4, a7, y04
	LFD	a7, 14 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y01, atemp4, a6, y01
#	DCBT(Y1, PREY)
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 15 * SIZE(AO1)
	FMADD	y02, atemp3, a6, y02
	LFD	a6, 13 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 15 * SIZE(XX)
	FNMSUB	y03, atemp4, a8, y03
	NOP2

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 14 * SIZE(XX)
	FMADD	y04, atemp3, a8, y04
	LFD	a8, 15 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y01, 8 * SIZE(YY)
	FMADD	y05, atemp1, a1, y05
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y02, 9 * SIZE(YY)
	FMADD	y06, atemp2, a1, y06
	LFD	a1, 16 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y03, 10 * SIZE(YY)
	FMADD	y07, atemp1, a3, y07
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y04, 11 * SIZE(YY)
	FMADD	y08, atemp2, a3, y08
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y01, 16 * SIZE(YY)
	FNMSUB	y05, atemp2, a2, y05
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y02, 17 * SIZE(YY)
	FMADD	y06, atemp1, a2, y06
	LFD	a2, 17 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 17 * SIZE(XX)
	FNMSUB	y07, atemp2, a4, y07
	NOP2

	/* From here on the pointers are bumped one by one; loads that */
	/* follow a bump use offsets relative to the new base. */
	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 16 * SIZE(XX)
	FMADD	y08, atemp1, a4, y08
	addi	AO2, AO2, 16 * SIZE

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y03, 18 * SIZE(YY)
	FMADD	y05, atemp3, a5, y05
	addi	XX, XX, 16 * SIZE

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 18 * SIZE(AO1)
	FMADD	y06, atemp4, a5, y06
	LFD	a5, 0 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y04, 19 * SIZE(YY)
	FMADD	y07, atemp3, a7, y07
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	addi	AO1, AO1, 16 * SIZE
	FMADD	y08, atemp4, a7, y08
	LFD	a7, 2 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	addi	YY, YY, 16 * SIZE
	FNMSUB	y05, atemp4, a6, y05
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 3 * SIZE(AO1)
	FMADD	y06, atemp3, a6, y06
	LFD	a6, 1 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 3 * SIZE(XX)
	FNMSUB	y07, atemp4, a8, y07
	NOP2

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 2 * SIZE(XX)
	FMADD	y08, atemp3, a8, y08
	LFD	a8, 3 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y05, -4 * SIZE(YY)
	FMADD	y01, atemp1, a1, y01
	DCBT(AO1, PREA)

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y06, -3 * SIZE(YY)
	FMADD	y02, atemp2, a1, y02
	LFD	a1, 4 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y07, -2 * SIZE(YY)
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y08, -1 * SIZE(YY)
	FMADD	y04, atemp2, a3, y04
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y05, 4 * SIZE(YY)
	FNMSUB	y01, atemp2, a2, y01
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y06, 5 * SIZE(YY)
	FMADD	y02, atemp1, a2, y02
	LFD	a2, 5 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 5 * SIZE(XX)
	FNMSUB	y03, atemp2, a4, y03
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 4 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y07, 6 * SIZE(YY)
	FMADD	y01, atemp3, a5, y01
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y02, atemp4, a5, y02
	LFD	a5, 4 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y08, 7 * SIZE(YY)
	FMADD	y03, atemp3, a7, y03
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y04, atemp4, a7, y04
	LFD	a7, 6 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y01, atemp4, a6, y01
#	DCBT(X, PREX)
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 7 * SIZE(AO1)
	FMADD	y02, atemp3, a6, y02
	LFD	a6, 5 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 7 * SIZE(XX)
	FNMSUB	y03, atemp4, a8, y03
	NOP2

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 6 * SIZE(XX)
	FMADD	y04, atemp3, a8, y04
	LFD	a8, 7 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y01, 0 * SIZE(YY)
	FMADD	y05, atemp1, a1, y05
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y02, 1 * SIZE(YY)
	FMADD	y06, atemp2, a1, y06
	LFD	a1, 8 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y03, 2 * SIZE(YY)
	FMADD	y07, atemp1, a3, y07
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y04, 3 * SIZE(YY)
	FMADD	y08, atemp2, a3, y08
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y01, 8 * SIZE(YY)
	FNMSUB	y05, atemp2, a2, y05
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y02, 9 * SIZE(YY)
	FMADD	y06, atemp1, a2, y06
	LFD	a2, 9 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 9 * SIZE(XX)
	FNMSUB	y07, atemp2, a4, y07
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 8 * SIZE(XX)
	FMADD	y08, atemp1, a4, y08
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y03, 10 * SIZE(YY)
	FMADD	y05, atemp3, a5, y05
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y06, atemp4, a5, y06
	LFD	a5, 8 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y04, 11 * SIZE(YY)
	FMADD	y07, atemp3, a7, y07
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y08, atemp4, a7, y08
	LFD	a7, 10 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y05, atemp4, a6, y05
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 11 * SIZE(AO1)
	FMADD	y06, atemp3, a6, y06
	LFD	a6, 9 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 11 * SIZE(XX)
	FNMSUB	y07, atemp4, a8, y07
	bdnz	LL(12)
	.align 4

/* Pipeline drain: finishes the last 8-row group and leaves the */
/* registers preloaded for whatever follows (LL(15) onwards). */
LL(13):
	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 10 * SIZE(XX)
	FMADD	y08, atemp3, a8, y08
	LFD	a8, 11 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y05, 4 * SIZE(YY)
	FMADD	y01, atemp1, a1, y01
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y06, 5 * SIZE(YY)
	FMADD	y02, atemp2, a1, y02
	LFD	a1, 12 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y07, 6 * SIZE(YY)
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y08, 7 * SIZE(YY)
	FMADD	y04, atemp2, a3, y04
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y05, 12 * SIZE(YY)
	FNMSUB	y01, atemp2, a2, y01
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y06, 13 * SIZE(YY)
	FMADD	y02, atemp1, a2, y02
	LFD	a2, 13 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 13 * SIZE(XX)
	FNMSUB	y03, atemp2, a4, y03
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 12 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y07, 14 * SIZE(YY)
	FMADD	y01, atemp3, a5, y01
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 14 * SIZE(AO1)
	FMADD	y02, atemp4, a5, y02
	LFD	a5, 12 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y08, 15 * SIZE(YY)
	FMADD	y03, atemp3, a7, y03
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y04, atemp4, a7, y04
	LFD	a7, 14 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y01, atemp4, a6, y01
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 15 * SIZE(AO1)
	FMADD	y02, atemp3, a6, y02
	LFD	a6, 13 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 15 * SIZE(XX)
	FNMSUB	y03, atemp4, a8, y03
	NOP2

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 14 * SIZE(XX)
	FMADD	y04, atemp3, a8, y04
	LFD	a8, 15 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y01, 8 * SIZE(YY)
	FMADD	y05, atemp1, a1, y05
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y02, 9 * SIZE(YY)
	FMADD	y06, atemp2, a1, y06
	LFD	a1, 16 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y03, 10 * SIZE(YY)
	FMADD	y07, atemp1, a3, y07
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y04, 11 * SIZE(YY)
	FMADD	y08, atemp2, a3, y08
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y01, 16 * SIZE(YY)
	FNMSUB	y05, atemp2, a2, y05
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y02, 17 * SIZE(YY)
	FMADD	y06, atemp1, a2, y06
	LFD	a2, 17 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 17 * SIZE(XX)
	FNMSUB	y07, atemp2, a4, y07
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 16 * SIZE(XX)
	FMADD	y08, atemp1, a4, y08
	addi	AO2, AO2, 16 * SIZE

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y03, 18 * SIZE(YY)
	FMADD	y05, atemp3, a5, y05
	addi	XX, XX, 16 * SIZE

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 18 * SIZE(AO1)
	FMADD	y06, atemp4, a5, y06
	LFD	a5, 0 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y04, 19 * SIZE(YY)
	FMADD	y07, atemp3, a7, y07
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	addi	AO1, AO1, 16 * SIZE
	FMADD	y08, atemp4, a7, y08
	LFD	a7, 2 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	addi	YY, YY, 16 * SIZE
	FNMSUB	y05, atemp4, a6, y05
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 3 * SIZE(AO1)
	FMADD	y06, atemp3, a6, y06
	LFD	a6, 1 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 3 * SIZE(XX)
	FNMSUB	y07, atemp4, a8, y07
	NOP2

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 2 * SIZE(XX)
	FMADD	y08, atemp3, a8, y08
	LFD	a8, 3 * SIZE(AO2)

	STFD	y05, -4 * SIZE(YY)
	STFD	y06, -3 * SIZE(YY)
	STFD	y07, -2 * SIZE(YY)
	STFD	y08, -1 * SIZE(YY)
	.align 4

/* Remainder: four more rows above the diagonal when bit 2 of IS is set. */
LL(15):
	andi.	r0, IS, 4
	ble	LL(16)

	FMADD	xsum1, xtemp1, a1, xsum1
	NOP1
	FMADD	y01, atemp1, a1, y01
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	NOP1
	FMADD	y02, atemp2, a1, y02
	LFD	a1, 4 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	NOP1
	FMADD	y03, atemp1, a3, y03
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	NOP1
	FMADD	y04, atemp2, a3, y04
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y05, 4 * SIZE(YY)
	FNMSUB	y01, atemp2, a2, y01
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y06, 5 * SIZE(YY)
	FMADD	y02, atemp1, a2, y02
	LFD	a2, 5 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 5 * SIZE(XX)
	FNMSUB	y03, atemp2, a4, y03
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 4 * SIZE(XX)
	FMADD	y04, atemp1, a4, y04
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y07, 6 * SIZE(YY)
	FMADD	y01, atemp3, a5, y01
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y02, atemp4, a5, y02
	LFD	a5, 4 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y08, 7 * SIZE(YY)
	FMADD	y03, atemp3, a7, y03
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y04, atemp4, a7, y04
	LFD	a7, 6 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y01, atemp4, a6, y01
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 7 * SIZE(AO1)
	FMADD	y02, atemp3, a6, y02
	LFD	a6, 5 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 7 * SIZE(XX)
	FNMSUB	y03, atemp4, a8, y03
	NOP2

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 6 * SIZE(XX)
	FMADD	y04, atemp3, a8, y04
	LFD	a8, 7 * SIZE(AO2)

	FMADD	xsum1, xtemp1, a1, xsum1
	STFD	y01, 0 * SIZE(YY)
	FMADD	y05, atemp1, a1, y05
	NOP2

	FMADD	xsum2, xtemp2, a1, xsum2
	STFD	y02, 1 * SIZE(YY)
	FMADD	y06, atemp2, a1, y06
	LFD	a1, 8 * SIZE(AO1)

	FMADD	xsum3, xtemp1, a5, xsum3
	STFD	y03, 2 * SIZE(YY)
	FMADD	y07, atemp1, a3, y07
	NOP2

	FMADD	xsum4, xtemp2, a5, xsum4
	STFD	y04, 3 * SIZE(YY)
	FMADD	y08, atemp2, a3, y08
	NOP2

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	y01, 8 * SIZE(YY)
	FNMSUB	y05, atemp2, a2, y05
	NOP2

	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	y02, 9 * SIZE(YY)
	FMADD	y06, atemp1, a2, y06
	LFD	a2, 9 * SIZE(AO1)

	FMADD1	xsum3, xtemp2, a6, xsum3
	LFD	xtemp2, 9 * SIZE(XX)
	FNMSUB	y07, atemp2, a4, y07
	NOP2

	FMADD2	xsum4, xtemp1, a6, xsum4
	LFD	xtemp1, 8 * SIZE(XX)
	FMADD	y08, atemp1, a4, y08
	NOP2

	FMADD	xsum1, xtemp3, a3, xsum1
	LFD	y03, 10 * SIZE(YY)
	FMADD	y05, atemp3, a5, y05
	NOP2

	FMADD	xsum2, xtemp4, a3, xsum2
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y06, atemp4, a5, y06
	LFD	a5, 8 * SIZE(AO2)

	FMADD	xsum3, xtemp3, a7, xsum3
	LFD	y04, 11 * SIZE(YY)
	FMADD	y07, atemp3, a7, y07
	NOP2

	FMADD	xsum4, xtemp4, a7, xsum4
	NOP1
	FMADD	y08, atemp4, a7, y08
	LFD	a7, 10 * SIZE(AO2)

	FMADD1	xsum1, xtemp4, a4, xsum1
	NOP1
	FNMSUB	y05, atemp4, a6, y05
	NOP2

	FMADD2	xsum2, xtemp3, a4, xsum2
	LFD	a4, 11 * SIZE(AO1)
	FMADD	y06, atemp3, a6, y06
	LFD	a6, 9 * SIZE(AO2)

	FMADD1	xsum3, xtemp4, a8, xsum3
	LFD	xtemp4, 11 * SIZE(XX)
	FNMSUB	y07, atemp4, a8, y07

	FMADD2	xsum4, xtemp3, a8, xsum4
	LFD	xtemp3, 10 * SIZE(XX)
	FMADD	y08, atemp3, a8, y08
	LFD	a8, 11 * SIZE(AO2)

	STFD	y05, 4 * SIZE(YY)
	STFD	y06, 5 * SIZE(YY)
	STFD	y07, 6 * SIZE(YY)
	STFD	y08, 7 * SIZE(YY)

	addi	AO1, AO1, 8 * SIZE
	addi	AO2, AO2, 8 * SIZE

	addi	XX, XX, 8 * SIZE
	addi	YY, YY, 8 * SIZE
	.align 4

/* Remainder: two more rows above the diagonal when bit 1 of IS is set. */
LL(16):
	andi.	r0, IS, 2
	ble	LL(18)

	FMADD	xsum1, xtemp1, a1, xsum1
	FMADD	y01, atemp1, a1, y01
	FMADD	xsum2, xtemp2, a1, xsum2
	FMADD	y02, atemp2, a1, y02
	FMADD	xsum3, xtemp1, a5, xsum3
	FMADD	y03, atemp1, a3, y03
	FMADD	xsum4, xtemp2, a5, xsum4
	FMADD	y04, atemp2, a3, y04

	FMADD1	xsum1, xtemp2, a2, xsum1
	FNMSUB	y01, atemp2, a2, y01
	FMADD2	xsum2, xtemp1, a2, xsum2
	FMADD	y02, atemp1, a2, y02
	FMADD1	xsum3, xtemp2, a6, xsum3
	FNMSUB	y03, atemp2, a4, y03
	FMADD2	xsum4, xtemp1, a6, xsum4
	FMADD	y04, atemp1, a4, y04

	FMADD	xsum1, xtemp3, a3, xsum1
	FMADD	y01, atemp3, a5, y01
	FMADD	xsum2, xtemp4, a3, xsum2
	FMADD	y02, atemp4, a5, y02
	FMADD	xsum3, xtemp3, a7, xsum3
	FMADD	y03, atemp3, a7, y03
	FMADD	xsum4, xtemp4, a7, xsum4
	FMADD	y04, atemp4, a7, y04

	FMADD1	xsum1, xtemp4, a4, xsum1
	FNMSUB	y01, atemp4, a6, y01
	FMADD2	xsum2, xtemp3, a4, xsum2
	FMADD	y02, atemp3, a6, y02
	FMADD1	xsum3, xtemp4, a8, xsum3
	FNMSUB	y03, atemp4, a8, y03
	FMADD2	xsum4, xtemp3, a8, xsum4
	FMADD	y04, atemp3, a8, y04

	STFD	y01, 0 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)
	STFD	y03, 2 * SIZE(YY)
	STFD	y04, 3 * SIZE(YY)

	/* Reload the operands the diagonal block in LL(18) reads. */
	LFD	a1, 4 * SIZE(AO1)
	LFD	a2, 5 * SIZE(AO1)

	LFD	a5, 4 * SIZE(AO2)
	LFD	a6, 5 * SIZE(AO2)
	LFD	a7, 6 * SIZE(AO2)
	LFD	a8, 7 * SIZE(AO2)

	LFD	y01, 4 * SIZE(YY)
	LFD	y02, 5 * SIZE(YY)
	LFD	y03, 6 * SIZE(YY)
	LFD	y04, 7 * SIZE(YY)

	addi	YY, YY, 4 * SIZE
	.align 4

/* 2x2 diagonal block: scale the dot products by alpha, add the */
/* diagonal-block contributions and accumulate into y[IS], y[IS + 1]. */
LL(18):
	LFD	y05, ALPHA_R
	LFD	y06, ALPHA_I

	/* xsum = alpha * xsum (complex). */
	FMUL	xtemp1, y05, xsum1
	FMUL	xtemp2, y06, xsum1
	FMUL	xtemp3, y05, xsum3
	FMUL	xtemp4, y06, xsum3

	FNMSUB	xsum1, y06, xsum2, xtemp1
	FMADD	xsum2, y05, xsum2, xtemp2
	FNMSUB	xsum3, y06, xsum4, xtemp3
	FMADD	xsum4, y05, xsum4, xtemp4

	FMADD	xsum1, atemp1, a1, xsum1
	FMADD	xsum2, atemp2, a1, xsum2
	FMADD	xsum3, atemp1, a5, xsum3
	FMADD	xsum4, atemp2, a5, xsum4

	/* a2 is the imaginary part of the first diagonal element; it is */
	/* not used when HEMV is defined. */
#ifndef HEMV
	FMADD1	xsum1, atemp2, a2, xsum1
	FMADD2	xsum2, atemp1, a2, xsum2
#endif
	FMADD1	xsum3, atemp2, a6, xsum3
	FMADD2	xsum4, atemp1, a6, xsum4

	FMADD	xsum1, atemp3, a5, xsum1
	FMADD	xsum2, atemp4, a5, xsum2
	FMADD	xsum3, atemp3, a7, xsum3
	FMADD	xsum4, atemp4, a7, xsum4

	FNMSUB	xsum1, atemp4, a6, xsum1
	FMADD	xsum2, atemp3, a6, xsum2
	/* a8 is the imaginary part of the second diagonal element. */
#ifndef HEMV
	FNMSUB	xsum3, atemp4, a8, xsum3
	FMADD	xsum4, atemp3, a8, xsum4
#endif

	FADD	y01, y01, xsum1
	FADD	y02, y02, xsum2
	FADD	y03, y03, xsum3
	FADD	y04, y04, xsum4

	/* Store the result and loop while (new IS) + 2 <= M. */
	STFD	y01, 0 * SIZE(YY)
	addi	TEMP, IS, 4
	STFD	y02, 1 * SIZE(YY)
	addi	IS, IS, 2
	STFD	y03, 2 * SIZE(YY)
	cmpw	cr0, TEMP, M
	STFD	y04, 3 * SIZE(YY)
	ble	LL(11)
	.align 4

/* ---- Single trailing column, taken only when M is odd ---- */
LL(20):
	andi.	TEMP, M, 1
	ble	LL(990)

	mr	AO1, A

	/* TEMP = &x[IS] */
	slwi	TEMP, IS, ZBASE_SHIFT
	add	TEMP, X, TEMP

	LFD	y05, ALPHA_R
	LFD	y06, ALPHA_I

	LFD	xtemp1, 0 * SIZE(TEMP)
	LFD	xtemp2, 1 * SIZE(TEMP)

	/* atemp = alpha * x[IS] */
	FMUL	atemp1, y05, xtemp1
	FMUL	atemp2, y06, xtemp1

	FNMSUB	atemp1, y06, xtemp2, atemp1
	FMADD	atemp2, y05, xtemp2, atemp2

	lfd	xsum1, FZERO
	fmr	xsum2, xsum1

	mr	XX, X
	mr	YY, NEW_Y

	LFD	a1, 0 * SIZE(AO1)
	LFD	a2, 1 * SIZE(AO1)

	LFD	xtemp1, 0 * SIZE(XX)
	LFD	xtemp2, 1 * SIZE(XX)

	LFD	y01, 0 * SIZE(YY)
	LFD	y02, 1 * SIZE(YY)

	/* One row per iteration, IS rows above the diagonal. */
	mtspr	CTR, IS
	cmpwi	cr0, IS, 0
	ble	LL(28)
	.align 4

LL(22):
	FMADD	xsum1, xtemp1, a1, xsum1
	FMADD	y01, atemp1, a1, y01
	FMADD	xsum2, xtemp2, a1, xsum2
	FMADD	y02, atemp2, a1, y02
	LFD	a1, 2 * SIZE(AO1)

	FMADD1	xsum1, xtemp2, a2, xsum1
	LFD	xtemp2, 3 * SIZE(XX)
	FNMSUB	y01, atemp2, a2, y01
	FMADD2	xsum2, xtemp1, a2, xsum2
	LFD	xtemp1, 2 * SIZE(XX)
	FMADD	y02, atemp1, a2, y02
	LFD	a2, 3 * SIZE(AO1)

	addi	AO1, AO1, 2 * SIZE
	addi	XX, XX, 2 * SIZE
	addi	YY, YY, 2 * SIZE

	STFD	y01, -2 * SIZE(YY)
	LFD	y01, 0 * SIZE(YY)
	STFD	y02, -1 * SIZE(YY)
	LFD	y02, 1 * SIZE(YY)
	bdnz	LL(22)
	.align 4

/* Diagonal element of the trailing column. */
LL(28):
	LFD	y05, ALPHA_R
	LFD	y06, ALPHA_I

	/* xsum = alpha * xsum (complex). */
	FMUL	xtemp1, y05, xsum1
	FMUL	xtemp2, y06, xsum1

	FNMSUB	xsum1, y06, xsum2, xtemp1
	FMADD	xsum2, y05, xsum2, xtemp2

	FMADD	xsum1, atemp1, a1, xsum1
	FMADD	xsum2, atemp2, a1, xsum2

	/* Imaginary part of the diagonal element; unused under HEMV. */
#ifndef HEMV
	FNMSUB	xsum1, atemp2, a2, xsum1
	FMADD	xsum2, atemp1, a2, xsum2
#endif

	FADD	y01, y01, xsum1
	FADD	y02, y02, xsum2

	STFD	y01, 0 * SIZE(YY)
	STFD	y02, 1 * SIZE(YY)
	.align 4

/* If Y was strided, add the contiguous work area back: y += NEW_Y. */
LL(990):
	cmpwi	cr0, INCY, 2 * SIZE
	beq	LL(999)

	/* Y is the read cursor, YY the write cursor over the same data. */
	mr	YY, Y

	srawi.	r0, M, 2
	mtspr	CTR, r0
	ble	LL(995)
	.align 4

/* Four complex elements per iteration. */
LL(991):
	LFD	f0, 0 * SIZE(Y)
	LFD	f1, 1 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f2, 0 * SIZE(Y)
	LFD	f3, 1 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f4, 0 * SIZE(Y)
	LFD	f5, 1 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f6, 0 * SIZE(Y)
	LFD	f7, 1 * SIZE(Y)
	add	Y, Y, INCY

	LFD	f8, 0 * SIZE(NEW_Y)
	LFD	f9, 1 * SIZE(NEW_Y)
	LFD	f10, 2 * SIZE(NEW_Y)
	LFD	f11, 3 * SIZE(NEW_Y)
	LFD	f12, 4 * SIZE(NEW_Y)
	LFD	f13, 5 * SIZE(NEW_Y)
	LFD	f14, 6 * SIZE(NEW_Y)
	LFD	f15, 7 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 8 * SIZE

	FADD	f8, f8, f0
	FADD	f9, f9, f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3
	FADD	f12, f12, f4
	FADD	f13, f13, f5
	FADD	f14, f14, f6
	FADD	f15, f15, f7

	STFD	f8, 0 * SIZE(YY)
	STFD	f9, 1 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f10, 0 * SIZE(YY)
	STFD	f11, 1 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f12, 0 * SIZE(YY)
	STFD	f13, 1 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f14, 0 * SIZE(YY)
	STFD	f15, 1 * SIZE(YY)
	add	YY, YY, INCY
	bdnz	LL(991)
	.align 4

/* Two more elements when bit 1 of M is set. */
LL(995):
	andi.	J, M, 2
	ble	LL(996)

	LFD	f0, 0 * SIZE(Y)
	LFD	f1, 1 * SIZE(Y)
	add	Y, Y, INCY
	LFD	f2, 0 * SIZE(Y)
	LFD	f3, 1 * SIZE(Y)
	add	Y, Y, INCY

	LFD	f8, 0 * SIZE(NEW_Y)
	LFD	f9, 1 * SIZE(NEW_Y)
	LFD	f10, 2 * SIZE(NEW_Y)
	LFD	f11, 3 * SIZE(NEW_Y)
	addi	NEW_Y, NEW_Y, 4 * SIZE

	FADD	f8, f8, f0
	FADD	f9, f9, f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3

	STFD	f8, 0 * SIZE(YY)
	STFD	f9, 1 * SIZE(YY)
	add	YY, YY, INCY
	STFD	f10, 0 * SIZE(YY)
	STFD	f11, 1 * SIZE(YY)
	add	YY, YY, INCY
	.align 4

/* One last element when M is odd. */
LL(996):
	andi.	J, M, 1
	ble	LL(999)

	LFD	f0, 0 * SIZE(Y)
	LFD	f1, 1 * SIZE(Y)

	LFD	f8, 0 * SIZE(NEW_Y)
	LFD	f9, 1 * SIZE(NEW_Y)

	FADD	f8, f8, f0
	FADD	f9, f9, f1

	STFD	f8, 0 * SIZE(YY)
	STFD	f9, 1 * SIZE(YY)
	.align 4

/* Epilogue: return 0, restore f14-f31 and r14-r27, release the frame. */
LL(999):
	li	r3, 0

	lfd	f14, 0(SP)
	lfd	f15, 8(SP)
	lfd	f16, 16(SP)
	lfd	f17, 24(SP)
	lfd	f18, 32(SP)
	lfd	f19, 40(SP)
	lfd	f20, 48(SP)
	lfd	f21, 56(SP)
	lfd	f22, 64(SP)
	lfd	f23, 72(SP)
	lfd	f24, 80(SP)
	lfd	f25, 88(SP)
	lfd	f26, 96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)
	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

#ifdef __64BIT__
	ld	r14, 144(SP)
	ld	r15, 152(SP)
	ld	r16, 160(SP)
	ld	r17, 168(SP)
	ld	r18, 176(SP)
	ld	r19, 184(SP)
	ld	r20, 192(SP)
	ld	r21, 200(SP)
	ld	r22, 208(SP)
	ld	r23, 216(SP)
	ld	r24, 224(SP)
	ld	r25, 232(SP)
	ld	r26, 240(SP)
	ld	r27, 248(SP)
#else
	lwz	r14, 144(SP)
	lwz	r15, 148(SP)
	lwz	r16, 152(SP)
	lwz	r17, 156(SP)
	lwz	r18, 160(SP)
	lwz	r19, 164(SP)
	lwz	r20, 168(SP)
	lwz	r21, 172(SP)
	lwz	r22, 176(SP)
	lwz	r23, 180(SP)
	lwz	r24, 184(SP)
	lwz	r25, 188(SP)
	lwz	r26, 192(SP)
	lwz	r27, 196(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif