/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Rank-1 update kernel for PowerPC, assembled through the C preprocessor.
 *
 * For every column j (0 <= j < N) the code performs
 *
 *     A[i + j * LDA] += (alpha * Y[j * INCY]) * X[i * INCX],  0 <= i < M
 *
 * Inputs (register assignment depends on the ABI, see the macros below):
 *   M, N        row and column counts; nothing is done if either is <= 0
 *   alpha       scalar, received in f1
 *   X, INCX     vector of M elements and its stride (in elements)
 *   Y, INCY     vector of N elements and its stride (in elements)
 *   A, LDA      matrix updated in place and its column stride (in elements)
 *   BUFFER      scratch area; receives a packed copy of X when INCX != 1
 *
 * Returns 0 in r3.
 *
 * NOTE(review): PROLOGUE/EPILOGUE/PROFCODE, LL(), SP, CTR, SIZE, BASE_SHIFT,
 * DCBT and the LFD/STFD/FMADD/FMUL mnemonics are not defined in this file;
 * presumably they come from common.h and select the single or double
 * precision forms - confirm there.  The comments below assume
 * FMADD d, a, b, c computes d = a * b + c.
 */

#define ASSEMBLER
#include "common.h"

#ifndef NEEDPARAM
#ifndef DOUBLE
#include "sparam.h"
#else
#include "dparam.h"
#endif
#endif

/* Argument registers, Linux. */
#ifdef linux
#ifndef __64BIT__
#define M	r3
#define N	r4
#define X	r6
#define INCX	r7
#define Y	r8
#define INCY	r9
#define A	r10
#define LDA	r5
#else
#define M	r3
#define N	r4
#define X	r7
#define INCX	r8
#define Y	r9
#define INCY	r10
#define A	r5
#define LDA	r6
#endif
#endif

/* Argument registers, AIX and Darwin. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define N	r4
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#define A	r6
#define LDA	r7
#else
#define M	r3
#define N	r4
#define X	r7
#define INCX	r8
#define Y	r9
#define INCY	r10
#define A	r5
#define LDA	r6
#endif
#endif

/* Loop counters. */
#define I	r11
#define J	r12

/* Column pointers into A (only AO1 and AO2 are used below). */
#define AO1	r14
#define AO2	r15
#define AO3	r16
#define AO4	r17
#define AO5	r18
#define AO6	r19
#define AO7	r20
#define AO8	r21

#define X1	r22	/* running pointer into the unit-stride X data */
#define PREA	r23	/* prefetch distance (bytes) for the A columns */
#define PREC	r24	/* prefetch distance (bytes) for the X stream  */
#define XX	r25	/* base of the unit-stride X data              */
#define BUFFER	r26

/* Despite the names, y01..y08 hold elements of X. */
#define y01	f0
#define y02	f1
#define y03	f2
#define y04	f3
#define y05	f4
#define y06	f5
#define y07	f6
#define y08	f7

/* alpha * Y[j] for the one or two columns currently being updated. */
#define alpha1	f8
#define alpha2	f9

/* Elements of A: a1..a8 for column AO1, a9..a16 for column AO2. */
#define a1	f12
#define a2	f13
#define a3	f14
#define a4	f15
#define a5	f16
#define a6	f17
#define a7	f18
#define a8	f19
#define a9	f20
#define a10	f21
#define a11	f22
#define a12	f23
#define a13	f24
#define a14	f25
#define a15	f26
#define a16	f27

#define alpha	f31

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A	24
#define PREFETCHSIZE_C	16
#endif

#ifdef PPC970
#define PREFETCHSIZE_A	16
#define PREFETCHSIZE_C	16
#endif

#ifdef POWER4
#define PREFETCHSIZE_A	16
#define PREFETCHSIZE_C	16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A	16
#define PREFETCHSIZE_C	16
#endif

/*
 * Fallback for targets not listed above, so that the "li PREA/PREC"
 * instructions below always have a value.  Any definition supplied by
 * the blocks above or by the included headers takes precedence.
 */
#ifndef PREFETCHSIZE_A
#define PREFETCHSIZE_A	16
#endif

#ifndef PREFETCHSIZE_C
#define PREFETCHSIZE_C	16
#endif

#ifndef NEEDPARAM

#ifndef __64BIT__
#define STACKSIZE 224
#else
#define STACKSIZE 280
#endif

	PROLOGUE
	PROFCODE

	/* Frame layout: f14-f31 at 0..136(SP), r14-r27 from 144(SP) upwards. */
	addi	SP, SP, -STACKSIZE

	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)
	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)
	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)
	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)
	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

#ifdef __64BIT__
	std	r14,  144(SP)
	std	r15,  152(SP)
	std	r16,  160(SP)
	std	r17,  168(SP)
	std	r18,  176(SP)
	std	r19,  184(SP)
	std	r20,  192(SP)
	std	r21,  200(SP)
	std	r22,  208(SP)
	std	r23,  216(SP)
	std	r24,  224(SP)
	std	r25,  232(SP)
	std	r26,  240(SP)
	std	r27,  248(SP)
#else
	stw	r14,  144(SP)
	stw	r15,  148(SP)
	stw	r16,  152(SP)
	stw	r17,  156(SP)
	stw	r18,  160(SP)
	stw	r19,  164(SP)
	stw	r20,  168(SP)
	stw	r21,  172(SP)
	stw	r22,  176(SP)
	stw	r23,  180(SP)
	stw	r24,  184(SP)
	stw	r25,  188(SP)
	stw	r26,  192(SP)
	stw	r27,  196(SP)
#endif

	/* Remaining arguments are read from above this function's frame. */
#ifdef linux
#ifndef __64BIT__
	lwz	LDA,      8 + STACKSIZE(SP)
	lwz	BUFFER,  12 + STACKSIZE(SP)
#else
	ld	A,      112 + STACKSIZE(SP)
	ld	LDA,    120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	INCY,    56 + STACKSIZE(SP)
	lwz	A,       60 + STACKSIZE(SP)
	lwz	LDA,     64 + STACKSIZE(SP)
	lwz	BUFFER,  68 + STACKSIZE(SP)
#else
	lwz	A,       56 + STACKSIZE(SP)
	lwz	LDA,     60 + STACKSIZE(SP)
	lwz	BUFFER,  64 + STACKSIZE(SP)
#endif
#else
	ld	A,      112 + STACKSIZE(SP)
	ld	LDA,    120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

	/* Keep alpha in f31; f1 is reused below as y02. */
	fmr	alpha, f1

	/* Scale the strides by the element size (presumably elements -> bytes). */
	slwi	LDA,  LDA,  BASE_SHIFT
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

	li	PREA, PREFETCHSIZE_A * SIZE
	li	PREC, PREFETCHSIZE_C * SIZE

	/* Nothing to do for an empty matrix. */
	cmpwi	cr0, M, 0
	ble-	LL(999)

	cmpwi	cr0, N, 0
	ble-	LL(999)

	/* X with unit stride is used in place ... */
	mr	XX, X

	cmpi	cr0, 0, INCX, SIZE
	beq	LL(10)

	/* ... otherwise it is packed into BUFFER first. */
	mr	XX, BUFFER
	mr	X1, BUFFER

	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(05)
	.align 4

/* Pack X into BUFFER, 8 elements per iteration. */
LL(01):
	LFD	a1, 0 * SIZE(X)
	add	X, X, INCX
	LFD	a2, 0 * SIZE(X)
	add	X, X, INCX
	LFD	a3, 0 * SIZE(X)
	add	X, X, INCX
	LFD	a4, 0 * SIZE(X)
	add	X, X, INCX
	LFD	a5, 0 * SIZE(X)
	add	X, X, INCX
	LFD	a6, 0 * SIZE(X)
	add	X, X, INCX
	LFD	a7, 0 * SIZE(X)
	add	X, X, INCX
	LFD	a8, 0 * SIZE(X)
	add	X, X, INCX

	STFD	a1, 0 * SIZE(X1)
	STFD	a2, 1 * SIZE(X1)
	STFD	a3, 2 * SIZE(X1)
	STFD	a4, 3 * SIZE(X1)
	STFD	a5, 4 * SIZE(X1)
	STFD	a6, 5 * SIZE(X1)
	STFD	a7, 6 * SIZE(X1)
	STFD	a8, 7 * SIZE(X1)

	addi	X1, X1, 8 * SIZE
	bdnz+	LL(01)
	.align 4

/* Pack the remaining M mod 8 elements of X one at a time. */
LL(05):
	andi.	r0, M, 7
	mtspr	CTR, r0
	ble	LL(10)
	.align 4

LL(06):
	LFD	a1, 0 * SIZE(X)
	add	X, X, INCX
	STFD	a1, 0 * SIZE(X1)
	addi	X1, X1, SIZE
	bdnz+	LL(06)
	.align 4

/* Main part: columns of A are processed two at a time, J = N / 2 pairs. */
LL(10):
	srawi.	J, N, 1
	ble	LL(20)
	.align 4

LL(11):
	/* alpha1 = alpha * Y[j], alpha2 = alpha * Y[j + 1] */
	LFD	alpha1, 0 * SIZE(Y)
	add	Y, Y, INCY
	LFD	alpha2, 0 * SIZE(Y)
	add	Y, Y, INCY

	FMUL	alpha1, alpha, alpha1
	FMUL	alpha2, alpha, alpha2

	/* AO1/AO2 point at the two columns; A advances past both. */
	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	mr	X1, XX

	/* CTR = number of 16-row blocks. */
	srawi.	r0, M, 4
	mtspr	CTR, r0
	ble	LL(15)

	/* Preload the first 8 rows of both columns and of X. */
	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)
	LFD	a3,  2 * SIZE(AO1)
	LFD	a4,  3 * SIZE(AO1)

	LFD	a5,  4 * SIZE(AO1)
	LFD	a6,  5 * SIZE(AO1)
	LFD	a7,  6 * SIZE(AO1)
	LFD	a8,  7 * SIZE(AO1)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)
	LFD	y03, 2 * SIZE(X1)
	LFD	y04, 3 * SIZE(X1)

	LFD	y05, 4 * SIZE(X1)
	LFD	y06, 5 * SIZE(X1)
	LFD	y07, 6 * SIZE(X1)
	LFD	y08, 7 * SIZE(X1)

	LFD	a9,  0 * SIZE(AO2)
	LFD	a10, 1 * SIZE(AO2)
	LFD	a11, 2 * SIZE(AO2)
	LFD	a12, 3 * SIZE(AO2)

	LFD	a13, 4 * SIZE(AO2)
	LFD	a14, 5 * SIZE(AO2)
	LFD	a15, 6 * SIZE(AO2)
	LFD	a16, 7 * SIZE(AO2)

	/* A single 16-row block goes straight to the loop tail. */
	bdz	LL(13)
	.align 4

/*
 * Steady state: updates 16 rows of both columns per iteration while
 * loading the first 8 rows of the next block.
 */
LL(12):
	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	LFD	a1,  8 * SIZE(AO1)
	LFD	a2,  9 * SIZE(AO1)
	LFD	a3, 10 * SIZE(AO1)
	LFD	a4, 11 * SIZE(AO1)

	STFD	a5,  4 * SIZE(AO1)
	STFD	a6,  5 * SIZE(AO1)
	STFD	a7,  6 * SIZE(AO1)
	STFD	a8,  7 * SIZE(AO1)

	LFD	a5, 12 * SIZE(AO1)
	LFD	a6, 13 * SIZE(AO1)
	LFD	a7, 14 * SIZE(AO1)
	LFD	a8, 15 * SIZE(AO1)

	FMADD	a9,  alpha2, y01, a9
	FMADD	a10, alpha2, y02, a10
	FMADD	a11, alpha2, y03, a11
	FMADD	a12, alpha2, y04, a12

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)

	FMADD	a13, alpha2, y05, a13
	FMADD	a14, alpha2, y06, a14
	FMADD	a15, alpha2, y07, a15
	FMADD	a16, alpha2, y08, a16

	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	STFD	a9,  0 * SIZE(AO2)
	STFD	a10, 1 * SIZE(AO2)
	STFD	a11, 2 * SIZE(AO2)
	STFD	a12, 3 * SIZE(AO2)

	LFD	a9,   8 * SIZE(AO2)
	LFD	a10,  9 * SIZE(AO2)
	LFD	a11, 10 * SIZE(AO2)
	LFD	a12, 11 * SIZE(AO2)

	STFD	a13, 4 * SIZE(AO2)
	STFD	a14, 5 * SIZE(AO2)
	STFD	a15, 6 * SIZE(AO2)
	STFD	a16, 7 * SIZE(AO2)

	LFD	a13, 12 * SIZE(AO2)
	LFD	a14, 13 * SIZE(AO2)
	LFD	a15, 14 * SIZE(AO2)
	LFD	a16, 15 * SIZE(AO2)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  8 * SIZE(AO1)
	STFD	a2,  9 * SIZE(AO1)
	STFD	a3, 10 * SIZE(AO1)
	STFD	a4, 11 * SIZE(AO1)

	LFD	a1, 16 * SIZE(AO1)
	LFD	a2, 17 * SIZE(AO1)
	LFD	a3, 18 * SIZE(AO1)
	LFD	a4, 19 * SIZE(AO1)

	STFD	a5, 12 * SIZE(AO1)
	STFD	a6, 13 * SIZE(AO1)
	STFD	a7, 14 * SIZE(AO1)
	STFD	a8, 15 * SIZE(AO1)

	LFD	a5, 20 * SIZE(AO1)
	LFD	a6, 21 * SIZE(AO1)
	LFD	a7, 22 * SIZE(AO1)
	LFD	a8, 23 * SIZE(AO1)

	FMADD	a9,  alpha2, y01, a9
	FMADD	a10, alpha2, y02, a10
	FMADD	a11, alpha2, y03, a11
	FMADD	a12, alpha2, y04, a12

	LFD	y01, 16 * SIZE(X1)
	LFD	y02, 17 * SIZE(X1)
	LFD	y03, 18 * SIZE(X1)
	LFD	y04, 19 * SIZE(X1)

	FMADD	a13, alpha2, y05, a13
	FMADD	a14, alpha2, y06, a14
	FMADD	a15, alpha2, y07, a15
	FMADD	a16, alpha2, y08, a16

	LFD	y05, 20 * SIZE(X1)
	LFD	y06, 21 * SIZE(X1)
	LFD	y07, 22 * SIZE(X1)
	LFD	y08, 23 * SIZE(X1)

	STFD	a9,   8 * SIZE(AO2)
	STFD	a10,  9 * SIZE(AO2)
	STFD	a11, 10 * SIZE(AO2)
	STFD	a12, 11 * SIZE(AO2)

	LFD	a9,  16 * SIZE(AO2)
	LFD	a10, 17 * SIZE(AO2)
	LFD	a11, 18 * SIZE(AO2)
	LFD	a12, 19 * SIZE(AO2)

	STFD	a13, 12 * SIZE(AO2)
	STFD	a14, 13 * SIZE(AO2)
	STFD	a15, 14 * SIZE(AO2)
	STFD	a16, 15 * SIZE(AO2)

	LFD	a13, 20 * SIZE(AO2)
	LFD	a14, 21 * SIZE(AO2)
	LFD	a15, 22 * SIZE(AO2)
	LFD	a16, 23 * SIZE(AO2)

	addi	AO1, AO1, 16 * SIZE
	addi	AO2, AO2, 16 * SIZE
	addi	X1,  X1,  16 * SIZE

	/*
	 * Prefetch ahead in both columns and in the X stream.  The third
	 * touch previously named Y1/PREY, which this file never defines;
	 * X1 is the streamed vector pointer and PREC its distance register.
	 */
	DCBT(AO1, PREA)
	DCBT(AO2, PREA)
	DCBT(X1, PREC)

	bdnz+	LL(12)
	.align 4

/*
 * Loop tail: finishes the last 16-row block.  Nothing beyond row 15 of
 * the block is loaded here: every path that follows reloads a1..a8
 * before using them, so such loads would be dead and could read past
 * the rows being processed.
 */
LL(13):
	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	LFD	a1,  8 * SIZE(AO1)
	LFD	a2,  9 * SIZE(AO1)
	LFD	a3, 10 * SIZE(AO1)
	LFD	a4, 11 * SIZE(AO1)

	STFD	a5,  4 * SIZE(AO1)
	STFD	a6,  5 * SIZE(AO1)
	STFD	a7,  6 * SIZE(AO1)
	STFD	a8,  7 * SIZE(AO1)

	LFD	a5, 12 * SIZE(AO1)
	LFD	a6, 13 * SIZE(AO1)
	LFD	a7, 14 * SIZE(AO1)
	LFD	a8, 15 * SIZE(AO1)

	FMADD	a9,  alpha2, y01, a9
	FMADD	a10, alpha2, y02, a10
	FMADD	a11, alpha2, y03, a11
	FMADD	a12, alpha2, y04, a12

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)

	FMADD	a13, alpha2, y05, a13
	FMADD	a14, alpha2, y06, a14
	FMADD	a15, alpha2, y07, a15
	FMADD	a16, alpha2, y08, a16

	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	STFD	a9,  0 * SIZE(AO2)
	STFD	a10, 1 * SIZE(AO2)
	STFD	a11, 2 * SIZE(AO2)
	STFD	a12, 3 * SIZE(AO2)

	LFD	a9,   8 * SIZE(AO2)
	LFD	a10,  9 * SIZE(AO2)
	LFD	a11, 10 * SIZE(AO2)
	LFD	a12, 11 * SIZE(AO2)

	STFD	a13, 4 * SIZE(AO2)
	STFD	a14, 5 * SIZE(AO2)
	STFD	a15, 6 * SIZE(AO2)
	STFD	a16, 7 * SIZE(AO2)

	LFD	a13, 12 * SIZE(AO2)
	LFD	a14, 13 * SIZE(AO2)
	LFD	a15, 14 * SIZE(AO2)
	LFD	a16, 15 * SIZE(AO2)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  8 * SIZE(AO1)
	STFD	a2,  9 * SIZE(AO1)
	STFD	a3, 10 * SIZE(AO1)
	STFD	a4, 11 * SIZE(AO1)

	STFD	a5, 12 * SIZE(AO1)
	STFD	a6, 13 * SIZE(AO1)
	STFD	a7, 14 * SIZE(AO1)
	STFD	a8, 15 * SIZE(AO1)

	FMADD	a9,  alpha2, y01, a9
	FMADD	a10, alpha2, y02, a10
	FMADD	a11, alpha2, y03, a11
	FMADD	a12, alpha2, y04, a12

	FMADD	a13, alpha2, y05, a13
	FMADD	a14, alpha2, y06, a14
	FMADD	a15, alpha2, y07, a15
	FMADD	a16, alpha2, y08, a16

	STFD	a9,   8 * SIZE(AO2)
	STFD	a10,  9 * SIZE(AO2)
	STFD	a11, 10 * SIZE(AO2)
	STFD	a12, 11 * SIZE(AO2)

	STFD	a13, 12 * SIZE(AO2)
	STFD	a14, 13 * SIZE(AO2)
	STFD	a15, 14 * SIZE(AO2)
	STFD	a16, 15 * SIZE(AO2)

	addi	AO1, AO1, 16 * SIZE
	addi	AO2, AO2, 16 * SIZE
	addi	X1,  X1,  16 * SIZE
	.align 4

/* Remaining M mod 16 rows of the column pair, by set bits of M: 8, 4, 2, 1. */
LL(15):
	andi.	r0, M, 15
	ble	LL(19)

	andi.	r0, M, 8
	ble	LL(16)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)
	LFD	y03, 2 * SIZE(X1)
	LFD	y04, 3 * SIZE(X1)
	LFD	y05, 4 * SIZE(X1)
	LFD	y06, 5 * SIZE(X1)
	LFD	y07, 6 * SIZE(X1)
	LFD	y08, 7 * SIZE(X1)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)
	LFD	a3,  2 * SIZE(AO1)
	LFD	a4,  3 * SIZE(AO1)
	LFD	a5,  4 * SIZE(AO1)
	LFD	a6,  5 * SIZE(AO1)
	LFD	a7,  6 * SIZE(AO1)
	LFD	a8,  7 * SIZE(AO1)

	LFD	a9,  0 * SIZE(AO2)
	LFD	a10, 1 * SIZE(AO2)
	LFD	a11, 2 * SIZE(AO2)
	LFD	a12, 3 * SIZE(AO2)
	LFD	a13, 4 * SIZE(AO2)
	LFD	a14, 5 * SIZE(AO2)
	LFD	a15, 6 * SIZE(AO2)
	LFD	a16, 7 * SIZE(AO2)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a5,  4 * SIZE(AO1)
	STFD	a6,  5 * SIZE(AO1)
	STFD	a7,  6 * SIZE(AO1)
	STFD	a8,  7 * SIZE(AO1)

	FMADD	a9,  alpha2, y01, a9
	FMADD	a10, alpha2, y02, a10
	FMADD	a11, alpha2, y03, a11
	FMADD	a12, alpha2, y04, a12

	STFD	a9,  0 * SIZE(AO2)
	STFD	a10, 1 * SIZE(AO2)
	STFD	a11, 2 * SIZE(AO2)
	STFD	a12, 3 * SIZE(AO2)

	FMADD	a13, alpha2, y05, a13
	FMADD	a14, alpha2, y06, a14
	FMADD	a15, alpha2, y07, a15
	FMADD	a16, alpha2, y08, a16

	STFD	a13, 4 * SIZE(AO2)
	STFD	a14, 5 * SIZE(AO2)
	STFD	a15, 6 * SIZE(AO2)
	STFD	a16, 7 * SIZE(AO2)

	addi	AO1, AO1, 8 * SIZE
	addi	AO2, AO2, 8 * SIZE
	addi	X1,  X1,  8 * SIZE
	.align 4

/* 4 rows; here a5..a8 hold the second column. */
LL(16):
	andi.	r0, M, 4
	ble	LL(17)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)
	LFD	a3,  2 * SIZE(AO1)
	LFD	a4,  3 * SIZE(AO1)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)
	LFD	y03, 2 * SIZE(X1)
	LFD	y04, 3 * SIZE(X1)

	LFD	a5,  0 * SIZE(AO2)
	LFD	a6,  1 * SIZE(AO2)
	LFD	a7,  2 * SIZE(AO2)
	LFD	a8,  3 * SIZE(AO2)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	FMADD	a5,  alpha2, y01, a5
	FMADD	a6,  alpha2, y02, a6
	FMADD	a7,  alpha2, y03, a7
	FMADD	a8,  alpha2, y04, a8

	STFD	a5,  0 * SIZE(AO2)
	STFD	a6,  1 * SIZE(AO2)
	STFD	a7,  2 * SIZE(AO2)
	STFD	a8,  3 * SIZE(AO2)

	addi	AO1, AO1, 4 * SIZE
	addi	AO2, AO2, 4 * SIZE
	addi	X1,  X1,  4 * SIZE
	.align 4

/* 2 rows; a3/a4 hold the second column. */
LL(17):
	andi.	r0, M, 2
	ble	LL(18)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)
	LFD	a3,  0 * SIZE(AO2)
	LFD	a4,  1 * SIZE(AO2)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha2, y01, a3
	FMADD	a4,  alpha2, y02, a4

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  0 * SIZE(AO2)
	STFD	a4,  1 * SIZE(AO2)

	addi	AO1, AO1, 2 * SIZE
	addi	AO2, AO2, 2 * SIZE

	addi	X1,  X1,  2 * SIZE
	.align 4

/* Last row; a2 holds the second column. */
LL(18):
	andi.	r0, M, 1
	ble	LL(19)

	LFD	y01, 0 * SIZE(X1)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  0 * SIZE(AO2)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha2, y01, a2

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  0 * SIZE(AO2)
	.align 4

/* Next column pair. */
LL(19):
	addi	J, J, -1
	cmpi	cr0, 0, J, 0
	bgt	LL(11)
	.align 4

/* Odd N: one column is left. */
LL(20):
	andi.	J, N, 1
	ble	LL(999)
	.align 4

LL(21):
	/* alpha1 = alpha * Y[N - 1] */
	LFD	alpha1, 0 * SIZE(Y)
	FMUL	alpha1, alpha, alpha1

	mr	AO1, A
	mr	X1, XX

	/* CTR = number of 16-row blocks. */
	srawi.	r0, M, 4
	mtspr	CTR, r0
	ble	LL(25)

	/* Preload the first 8 rows of the column and of X. */
	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)
	LFD	a3,  2 * SIZE(AO1)
	LFD	a4,  3 * SIZE(AO1)

	LFD	a5,  4 * SIZE(AO1)
	LFD	a6,  5 * SIZE(AO1)
	LFD	a7,  6 * SIZE(AO1)
	LFD	a8,  7 * SIZE(AO1)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)
	LFD	y03, 2 * SIZE(X1)
	LFD	y04, 3 * SIZE(X1)

	LFD	y05, 4 * SIZE(X1)
	LFD	y06, 5 * SIZE(X1)
	LFD	y07, 6 * SIZE(X1)
	LFD	y08, 7 * SIZE(X1)

	/* A single 16-row block goes straight to the loop tail. */
	bdz	LL(23)
	.align 4

/* Steady state for the single column, 16 rows per iteration. */
LL(22):
	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	LFD	a1,  8 * SIZE(AO1)
	LFD	a2,  9 * SIZE(AO1)
	LFD	a3, 10 * SIZE(AO1)
	LFD	a4, 11 * SIZE(AO1)

	STFD	a5,  4 * SIZE(AO1)
	STFD	a6,  5 * SIZE(AO1)
	STFD	a7,  6 * SIZE(AO1)
	STFD	a8,  7 * SIZE(AO1)

	LFD	a5, 12 * SIZE(AO1)
	LFD	a6, 13 * SIZE(AO1)
	LFD	a7, 14 * SIZE(AO1)
	LFD	a8, 15 * SIZE(AO1)

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)

	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  8 * SIZE(AO1)
	STFD	a2,  9 * SIZE(AO1)
	STFD	a3, 10 * SIZE(AO1)
	STFD	a4, 11 * SIZE(AO1)

	LFD	a1, 16 * SIZE(AO1)
	LFD	a2, 17 * SIZE(AO1)
	LFD	a3, 18 * SIZE(AO1)
	LFD	a4, 19 * SIZE(AO1)

	STFD	a5, 12 * SIZE(AO1)
	STFD	a6, 13 * SIZE(AO1)
	STFD	a7, 14 * SIZE(AO1)
	STFD	a8, 15 * SIZE(AO1)

	LFD	a5, 20 * SIZE(AO1)
	LFD	a6, 21 * SIZE(AO1)
	LFD	a7, 22 * SIZE(AO1)
	LFD	a8, 23 * SIZE(AO1)

	LFD	y01, 16 * SIZE(X1)
	LFD	y02, 17 * SIZE(X1)
	LFD	y03, 18 * SIZE(X1)
	LFD	y04, 19 * SIZE(X1)

	LFD	y05, 20 * SIZE(X1)
	LFD	y06, 21 * SIZE(X1)
	LFD	y07, 22 * SIZE(X1)
	LFD	y08, 23 * SIZE(X1)

	addi	AO1, AO1, 16 * SIZE
	addi	X1,  X1,  16 * SIZE

	/* Prefetch ahead in the column and in the X stream (was Y1/PREY, undefined). */
	DCBT(AO1, PREA)
	DCBT(X1, PREC)

	bdnz+	LL(22)
	.align 4

/*
 * Loop tail for the single column.  As in the two-column tail, nothing
 * beyond row 15 of the block is loaded: the remainder code reloads
 * a1..a8 before use.
 */
LL(23):
	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	LFD	a1,  8 * SIZE(AO1)
	LFD	a2,  9 * SIZE(AO1)
	LFD	a3, 10 * SIZE(AO1)
	LFD	a4, 11 * SIZE(AO1)

	STFD	a5,  4 * SIZE(AO1)
	STFD	a6,  5 * SIZE(AO1)
	STFD	a7,  6 * SIZE(AO1)
	STFD	a8,  7 * SIZE(AO1)

	LFD	a5, 12 * SIZE(AO1)
	LFD	a6, 13 * SIZE(AO1)
	LFD	a7, 14 * SIZE(AO1)
	LFD	a8, 15 * SIZE(AO1)

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)

	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a1,  8 * SIZE(AO1)
	STFD	a2,  9 * SIZE(AO1)
	STFD	a3, 10 * SIZE(AO1)
	STFD	a4, 11 * SIZE(AO1)

	STFD	a5, 12 * SIZE(AO1)
	STFD	a6, 13 * SIZE(AO1)
	STFD	a7, 14 * SIZE(AO1)
	STFD	a8, 15 * SIZE(AO1)

	addi	AO1, AO1, 16 * SIZE
	addi	X1,  X1,  16 * SIZE
	.align 4

/* Remaining M mod 16 rows of the single column: 8, 4, 2, 1. */
LL(25):
	andi.	r0, M, 15
	ble	LL(999)

	andi.	r0, M, 8
	ble	LL(26)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)
	LFD	y03, 2 * SIZE(X1)
	LFD	y04, 3 * SIZE(X1)
	LFD	y05, 4 * SIZE(X1)
	LFD	y06, 5 * SIZE(X1)
	LFD	y07, 6 * SIZE(X1)
	LFD	y08, 7 * SIZE(X1)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)
	LFD	a3,  2 * SIZE(AO1)
	LFD	a4,  3 * SIZE(AO1)
	LFD	a5,  4 * SIZE(AO1)
	LFD	a6,  5 * SIZE(AO1)
	LFD	a7,  6 * SIZE(AO1)
	LFD	a8,  7 * SIZE(AO1)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	FMADD	a5,  alpha1, y05, a5
	FMADD	a6,  alpha1, y06, a6
	FMADD	a7,  alpha1, y07, a7
	FMADD	a8,  alpha1, y08, a8

	STFD	a5,  4 * SIZE(AO1)
	STFD	a6,  5 * SIZE(AO1)
	STFD	a7,  6 * SIZE(AO1)
	STFD	a8,  7 * SIZE(AO1)

	addi	AO1, AO1, 8 * SIZE
	addi	X1,  X1,  8 * SIZE
	.align 4

LL(26):
	andi.	r0, M, 4
	ble	LL(27)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)
	LFD	a3,  2 * SIZE(AO1)
	LFD	a4,  3 * SIZE(AO1)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)
	LFD	y03, 2 * SIZE(X1)
	LFD	y04, 3 * SIZE(X1)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2
	FMADD	a3,  alpha1, y03, a3
	FMADD	a4,  alpha1, y04, a4

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)
	STFD	a3,  2 * SIZE(AO1)
	STFD	a4,  3 * SIZE(AO1)

	addi	AO1, AO1, 4 * SIZE
	addi	X1,  X1,  4 * SIZE
	.align 4

LL(27):
	andi.	r0, M, 2
	ble	LL(28)

	LFD	a1,  0 * SIZE(AO1)
	LFD	a2,  1 * SIZE(AO1)

	LFD	y01, 0 * SIZE(X1)
	LFD	y02, 1 * SIZE(X1)

	FMADD	a1,  alpha1, y01, a1
	FMADD	a2,  alpha1, y02, a2

	STFD	a1,  0 * SIZE(AO1)
	STFD	a2,  1 * SIZE(AO1)

	addi	AO1, AO1, 2 * SIZE
	addi	X1,  X1,  2 * SIZE
	.align 4

LL(28):
	andi.	r0, M, 1
	ble	LL(999)

	LFD	y01, 0 * SIZE(X1)
	LFD	a1,  0 * SIZE(AO1)

	FMADD	a1,  alpha1, y01, a1

	STFD	a1,  0 * SIZE(AO1)
	.align 4

/* Common exit: return 0, restore the saved registers, release the frame. */
LL(999):
	li	r3, 0

	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)
	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)
	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)
	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)
	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

#ifdef __64BIT__
	ld	r14,  144(SP)
	ld	r15,  152(SP)
	ld	r16,  160(SP)
	ld	r17,  168(SP)
	ld	r18,  176(SP)
	ld	r19,  184(SP)
	ld	r20,  192(SP)
	ld	r21,  200(SP)
	ld	r22,  208(SP)
	ld	r23,  216(SP)
	ld	r24,  224(SP)
	ld	r25,  232(SP)
	ld	r26,  240(SP)
	ld	r27,  248(SP)
#else
	lwz	r14,  144(SP)
	lwz	r15,  148(SP)
	lwz	r16,  152(SP)
	lwz	r17,  156(SP)
	lwz	r18,  160(SP)
	lwz	r19,  164(SP)
	lwz	r20,  168(SP)
	lwz	r21,  172(SP)
	lwz	r22,  176(SP)
	lwz	r23,  180(SP)
	lwz	r24,  184(SP)
	lwz	r25,  188(SP)
	lwz	r26,  192(SP)
	lwz	r27,  196(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif