1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#ifndef __64BIT__ 43#define LOAD lwz 44#else 45#define LOAD ld 46#endif 47 48#ifdef __64BIT__ 49#define STACKSIZE 320 50#define ALPHA 296(SP) 51#define FZERO 304(SP) 52#else 53#define STACKSIZE 240 54#define ALPHA 224(SP) 55#define FZERO 232(SP) 56#endif 57 58#define M r3 59#define N r4 60#define K r5 61 62#ifdef linux 63#ifndef __64BIT__ 64#define A r6 65#define B r7 66#define C r8 67#define LDC r9 68#define OFFSET r10 69#else 70#define A r7 71#define B r8 72#define C r9 73#define LDC r10 74#define OFFSET r6 75#endif 76#endif 77 78#if defined(_AIX) || defined(__APPLE__) 79#if !defined(__64BIT__) && defined(DOUBLE) 80#define A r8 81#define B r9 82#define C r10 83#define LDC r7 84#define OFFSET r6 85#else 86#define A r7 87#define B r8 88#define C r9 89#define LDC r10 90#define OFFSET r6 91#endif 92#endif 93 94#define TEMP r18 95#define KK r19 96#define BB r20 97#define I r21 98#define J r22 99#define AO r23 100#define BO r24 101#define CO1 r25 102#define CO2 r26 103#define CO3 r27 104#define CO4 r28 105 106#define PREA r29 107#define PREB r30 108#define PREC r31 109 110#ifndef NEEDPARAM 111 112 PROLOGUE 113 PROFCODE 114 115 addi SP, SP, -STACKSIZE 116 li r0, 0 117 118 stfd f14, 0(SP) 119 stfd f15, 8(SP) 120 stfd f16, 16(SP) 121 stfd f17, 24(SP) 122 123 stfd f18, 32(SP) 124 stfd f19, 40(SP) 125 stfd f20, 48(SP) 126 stfd f21, 56(SP) 127 128 stfd f22, 64(SP) 129 stfd f23, 72(SP) 130 stfd f24, 80(SP) 131 stfd f25, 88(SP) 132 133 stfd f26, 96(SP) 134 stfd f27, 104(SP) 135 stfd f28, 112(SP) 136 stfd f29, 120(SP) 137 138 stfd f30, 128(SP) 139 stfd f31, 136(SP) 140 141#ifdef __64BIT__ 142 std r31, 144(SP) 143 std r30, 152(SP) 144 std r29, 160(SP) 145 std r28, 168(SP) 146 std r27, 176(SP) 147 std r26, 184(SP) 148 std r25, 192(SP) 149 std r24, 200(SP) 150 std r23, 208(SP) 151 std r22, 216(SP) 152 std r21, 224(SP) 153 std r20, 232(SP) 154#if 
defined(TRMMKERNEL) 155 std r19, 240(SP) 156 std r18, 248(SP) 157#endif 158#else 159 stw r31, 144(SP) 160 stw r30, 148(SP) 161 stw r29, 152(SP) 162 stw r28, 156(SP) 163 stw r27, 160(SP) 164 stw r26, 164(SP) 165 stw r25, 168(SP) 166 stw r24, 172(SP) 167 stw r23, 176(SP) 168 stw r22, 180(SP) 169 stw r21, 184(SP) 170 stw r20, 188(SP) 171#if defined(TRMMKERNEL) 172 stw r19, 192(SP) 173 stw r18, 196(SP) 174#endif 175#endif 176 177 stfd f1, ALPHA 178 stw r0, FZERO 179 180#if defined(_AIX) || defined(__APPLE__) 181#if !defined(__64BIT__) && defined(DOUBLE) 182 lwz LDC, 56 + STACKSIZE(SP) 183#endif 184#endif 185 186 slwi LDC, LDC, BASE_SHIFT 187 188#if defined(TRMMKERNEL) 189#if defined(linux) && defined(__64BIT__) 190 ld OFFSET, 112 + STACKSIZE(SP) 191#endif 192 193#if defined(_AIX) || defined(__APPLE__) 194#ifdef __64BIT__ 195 ld OFFSET, 112 + STACKSIZE(SP) 196#else 197#ifdef DOUBLE 198 lwz OFFSET, 60 + STACKSIZE(SP) 199#else 200 lwz OFFSET, 56 + STACKSIZE(SP) 201#endif 202#endif 203#endif 204#endif 205 206#if defined(TRMMKERNEL) && !defined(LEFT) 207 neg KK, OFFSET 208#endif 209 210 cmpwi cr0, M, 0 211 ble LL(999) 212 cmpwi cr0, N, 0 213 ble LL(999) 214 cmpwi cr0, K, 0 215 ble LL(999) 216 217#ifndef PREFETCHTEST 218/* Normal prefetch */ 219#ifdef PPC970 220 li PREC, 4 * SIZE 221#endif 222#ifdef POWER4 223 li PREC, 4 * SIZE /* is 12 best? 
*/ 224#endif 225#ifdef POWER5 226 li PREC, 3 * SIZE 227#endif 228 229#else 230 231#ifdef linux 232#ifndef __64BIT__ 233 mr PREA, r10 234 lwz PREB, 8 + STACKSIZE(SP) 235 lwz PREC, 12 + STACKSIZE(SP) 236#else 237 ld PREA, 112 + STACKSIZE(SP) 238 ld PREB, 120 + STACKSIZE(SP) 239 ld PREC, 128 + STACKSIZE(SP) 240#endif 241#endif 242 243#if defined(_AIX) || defined(__APPLE__) 244#ifdef __64BIT__ 245 ld PREA, 112 + STACKSIZE(SP) 246 ld PREB, 120 + STACKSIZE(SP) 247 ld PREC, 128 + STACKSIZE(SP) 248#else 249#ifdef DOUBLE 250 lwz PREA, 60 + STACKSIZE(SP) 251 lwz PREB, 64 + STACKSIZE(SP) 252 lwz PREC, 68 + STACKSIZE(SP) 253#else 254 lwz PREA, 56 + STACKSIZE(SP) 255 lwz PREB, 60 + STACKSIZE(SP) 256 lwz PREC, 64 + STACKSIZE(SP) 257#endif 258#endif 259#endif 260 261#endif 262 263#ifndef PREFETCHTEST 264#ifdef PPC970 265#ifdef ALLOC_HUGETLB 266 li PREA, (16 * 1 * SIZE) 267 li PREB, (16 * 5 * SIZE) 268#else 269 li PREA, (16 * 19 * SIZE) 270 li PREB, (16 * 8 * SIZE) 271#endif 272#endif 273#ifdef POWER4 274#ifdef ALLOC_HUGETLB 275 li PREA, (16 * 1 * SIZE) 276 li PREB, (16 * 1 * SIZE) 277#else 278 li PREA, (16 * 2 * SIZE) 279 li PREB, (16 * 2 * SIZE) 280#endif 281#endif 282#ifdef POWER5 283#ifdef ALLOC_HUGETLB 284 li PREA, (16 * 7 * SIZE) 285 li PREB, (16 * 7 * SIZE) 286#else 287 li PREA, (16 * 12 * SIZE) 288 li PREB, (16 * 6 * SIZE) 289#endif 290#endif 291#endif 292 293 srawi. J, N, 2 294 ble LL(40) 295 .align 4 296 297LL(10): 298 mr CO1, C 299 add CO2, C, LDC 300 add CO3, CO2, LDC 301 add CO4, CO3, LDC 302 303#if defined(TRMMKERNEL) && defined(LEFT) 304 mr KK, OFFSET 305#endif 306 307 slwi BB, K, BASE_SHIFT + 2 308 309 lfs f0, FZERO 310 fmr f1, f0 311 fmr f2, f0 312 fmr f3, f0 313 fmr f4, f0 314 fmr f5, f0 315 fmr f6, f0 316 fmr f7, f0 317 fmr f8, f0 318 fmr f9, f0 319 fmr f10, f0 320 fmr f11, f0 321 fmr f12, f0 322 fmr f13, f0 323 fmr f14, f0 324 fmr f15, f0 325 326 srawi. 
I, M, 2 327 mr AO, A 328 add C, CO4, LDC 329 ble LL(20) 330 .align 4 331 332LL(11): 333#if defined(TRMMKERNEL) 334#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 335 LFD f16, 0 * SIZE(AO) 336 LFD f17, 1 * SIZE(AO) 337 LFD f18, 2 * SIZE(AO) 338 LFD f19, 3 * SIZE(AO) 339 340 LFD f20, 0 * SIZE(B) 341 LFD f21, 1 * SIZE(B) 342 LFD f22, 2 * SIZE(B) 343 LFD f23, 3 * SIZE(B) 344 345#ifdef POWER5 346 LFD f28, 4 * SIZE(B) 347 LFD f29, 5 * SIZE(B) 348 LFD f30, 6 * SIZE(B) 349 LFD f31, 7 * SIZE(B) 350#endif 351 mr BO, B 352#else 353 slwi r0, KK, 2 + BASE_SHIFT 354 add AO, AO, r0 355 add BO, B, r0 356 357 LFD f16, 0 * SIZE(AO) 358 LFD f17, 1 * SIZE(AO) 359 LFD f18, 2 * SIZE(AO) 360 LFD f19, 3 * SIZE(AO) 361 362 LFD f20, 0 * SIZE(BO) 363 LFD f21, 1 * SIZE(BO) 364 LFD f22, 2 * SIZE(BO) 365 LFD f23, 3 * SIZE(BO) 366 367#ifdef POWER5 368 LFD f28, 4 * SIZE(BO) 369 LFD f29, 5 * SIZE(BO) 370 LFD f30, 6 * SIZE(BO) 371 LFD f31, 7 * SIZE(BO) 372#endif 373#endif 374 375 DCBTST(CO1, PREC) 376 DCBTST(CO2, PREC) 377 DCBTST(CO3, PREC) 378 DCBTST(CO4, PREC) 379 380 dcbt B, BB 381 addi BB, BB, 16 * SIZE 382 383#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 384 sub TEMP, K, KK 385#elif defined(LEFT) 386 addi TEMP, KK, 4 387#else 388 addi TEMP, KK, 4 389#endif 390 srawi. TEMP, TEMP, 2 391 mtspr CTR, TEMP 392 ble LL(15) 393 394#else 395 396 LFD f16, 0 * SIZE(AO) 397 LFD f17, 1 * SIZE(AO) 398 LFD f18, 2 * SIZE(AO) 399 LFD f19, 3 * SIZE(AO) 400 401 LFD f20, 0 * SIZE(B) 402 LFD f21, 1 * SIZE(B) 403 LFD f22, 2 * SIZE(B) 404 LFD f23, 3 * SIZE(B) 405 406#ifdef POWER5 407 LFD f28, 4 * SIZE(B) 408 LFD f29, 5 * SIZE(B) 409 LFD f30, 6 * SIZE(B) 410 LFD f31, 7 * SIZE(B) 411#endif 412 413 DCBTST(CO1, PREC) 414 DCBTST(CO2, PREC) 415 DCBTST(CO3, PREC) 416 DCBTST(CO4, PREC) 417 418 dcbt B, BB 419 addi BB, BB, 16 * SIZE 420 421 srawi. 
r0, K, 2 422 mtspr CTR, r0 423 mr BO, B 424 ble LL(15) 425#endif 426 .align 4 427 428LL(12): 429 FMADD f0, f16, f20, f0 430 FMADD f5, f17, f21, f5 431 FMADD f10, f18, f22, f10 432 FMADD f15, f19, f23, f15 433 434#if defined(ALLOC_HUGETLB) && !defined(POWER5) 435 LFD f28, 4 * SIZE(BO) 436 LFD f29, 5 * SIZE(BO) 437 LFD f30, 6 * SIZE(BO) 438 LFD f31, 7 * SIZE(BO) 439#endif 440 441 FMADD f1, f17, f20, f1 442 FMADD f2, f18, f20, f2 443 FMADD f3, f19, f20, f3 444 FMADD f4, f16, f21, f4 445 446#if !defined(ALLOC_HUGETLB) && !defined(POWER5) 447 LFD f28, 4 * SIZE(BO) 448 LFD f29, 5 * SIZE(BO) 449 LFD f30, 6 * SIZE(BO) 450 LFD f31, 7 * SIZE(BO) 451#endif 452 453 LFD f24, 4 * SIZE(AO) 454 LFD f25, 5 * SIZE(AO) 455 LFD f26, 6 * SIZE(AO) 456 LFD f27, 7 * SIZE(AO) 457 458 FMADD f6, f18, f21, f6 459 FMADD f7, f19, f21, f7 460 FMADD f8, f16, f22, f8 461 FMADD f9, f17, f22, f9 462 463 FMADD f11, f19, f22, f11 464 FMADD f12, f16, f23, f12 465 FMADD f13, f17, f23, f13 466 FMADD f14, f18, f23, f14 467 468 LFD f20, 8 * SIZE(BO) 469 LFD f21, 9 * SIZE(BO) 470 LFD f22, 10 * SIZE(BO) 471 LFD f23, 11 * SIZE(BO) 472 473 FMADD f0, f24, f28, f0 474 FMADD f5, f25, f29, f5 475 FMADD f10, f26, f30, f10 476 FMADD f15, f27, f31, f15 477 478 LFD f16, 8 * SIZE(AO) 479 LFD f17, 9 * SIZE(AO) 480 LFD f18, 10 * SIZE(AO) 481 LFD f19, 11 * SIZE(AO) 482 483 FMADD f1, f25, f28, f1 484 FMADD f2, f26, f28, f2 485 FMADD f3, f27, f28, f3 486 FMADD f4, f24, f29, f4 487 488 FMADD f6, f26, f29, f6 489 FMADD f7, f27, f29, f7 490 FMADD f8, f24, f30, f8 491 FMADD f9, f25, f30, f9 492 493 FMADD f11, f27, f30, f11 494 FMADD f12, f24, f31, f12 495 FMADD f13, f25, f31, f13 496 FMADD f14, f26, f31, f14 497 498 LFD f28, 12 * SIZE(BO) 499 LFD f29, 13 * SIZE(BO) 500 LFD f30, 14 * SIZE(BO) 501 LFD f31, 15 * SIZE(BO) 502 503 FMADD f0, f16, f20, f0 504 FMADD f5, f17, f21, f5 505 FMADD f10, f18, f22, f10 506 FMADD f15, f19, f23, f15 507 508 LFD f24, 12 * SIZE(AO) 509 LFD f25, 13 * SIZE(AO) 510 LFD f26, 14 * SIZE(AO) 511 LFD f27, 
15 * SIZE(AO) 512 513 FMADD f1, f17, f20, f1 514 FMADD f2, f18, f20, f2 515 FMADD f3, f19, f20, f3 516 FMADD f4, f16, f21, f4 517 518 FMADD f6, f18, f21, f6 519 FMADD f7, f19, f21, f7 520 FMADD f8, f16, f22, f8 521 FMADD f9, f17, f22, f9 522 523 FMADD f11, f19, f22, f11 524 FMADD f12, f16, f23, f12 525 FMADD f13, f17, f23, f13 526 FMADD f14, f18, f23, f14 527 528#ifndef POWER5 529 LFD f16, 16 * SIZE(AO) 530 LFD f17, 17 * SIZE(AO) 531 LFD f18, 18 * SIZE(AO) 532 LFD f19, 19 * SIZE(AO) 533#else 534 LFD f20, 16 * SIZE(BO) 535 LFD f21, 17 * SIZE(BO) 536 LFD f22, 18 * SIZE(BO) 537 LFD f23, 19 * SIZE(BO) 538#endif 539 540 FMADD f0, f24, f28, f0 541 FMADD f5, f25, f29, f5 542 FMADD f10, f26, f30, f10 543 FMADD f15, f27, f31, f15 544 545#ifndef POWER5 546 LFD f20, 16 * SIZE(BO) 547 LFD f21, 17 * SIZE(BO) 548 LFD f22, 18 * SIZE(BO) 549 LFD f23, 19 * SIZE(BO) 550#else 551 LFD f16, 16 * SIZE(AO) 552 LFD f17, 17 * SIZE(AO) 553 LFD f18, 18 * SIZE(AO) 554 LFD f19, 19 * SIZE(AO) 555#endif 556 557 FMADD f1, f25, f28, f1 558 FMADD f2, f26, f28, f2 559 FMADD f3, f27, f28, f3 560 FMADD f4, f24, f29, f4 561 562 FMADD f6, f26, f29, f6 563 FMADD f7, f27, f29, f7 564 FMADD f8, f24, f30, f8 565 FMADD f9, f25, f30, f9 566 567 FMADD f11, f27, f30, f11 568 FMADD f12, f24, f31, f12 569 FMADD f13, f25, f31, f13 570 FMADD f14, f26, f31, f14 571 572#if (L2_SIZE == 1024976) && defined (ALLOC_HUGETLB) 573 nop 574 nop 575 nop 576 nop 577#endif 578 579#ifdef POWER5 580 LFD f28, 20 * SIZE(BO) 581 LFD f29, 21 * SIZE(BO) 582 LFD f30, 22 * SIZE(BO) 583 LFD f31, 23 * SIZE(BO) 584#endif 585 586 addi AO, AO, 16 * SIZE 587 addi BO, BO, 16 * SIZE 588 589#ifdef PPC970 590#ifndef ALLOC_HUGETLB 591 DCBT(AO, PREA) 592#endif 593 DCBT(BO, PREB) 594#endif 595 596#ifdef POWER4 597#ifndef ALLOC_HUGETLB 598 DCBT(AO, PREA) 599#endif 600 DCBT(BO, PREB) 601#endif 602 603#ifdef POWER5 604#ifndef ALLOC_HUGETLB 605 DCBT(BO, PREB) 606 DCBT(AO, PREA) 607#endif 608#endif 609 bdnz LL(12) 610 .align 4 611 612LL(15): 613 lfd f30, 
ALPHA 614 615#if defined(TRMMKERNEL) 616 617#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 618 sub TEMP, K, KK 619#elif defined(LEFT) 620 addi TEMP, KK, 4 621#else 622 addi TEMP, KK, 4 623#endif 624 625 andi. TEMP, TEMP, 3 626 mtspr CTR, TEMP 627#else 628 629 andi. r0, K, 3 630 mtspr CTR, r0 631 632#endif 633 ble+ LL(18) 634 .align 4 635 636LL(16): 637 FMADD f0, f16, f20, f0 638 FMADD f5, f17, f21, f5 639 FMADD f10, f18, f22, f10 640 FMADD f15, f19, f23, f15 641 642 FMADD f1, f17, f20, f1 643 FMADD f2, f18, f20, f2 644 FMADD f3, f19, f20, f3 645 FMADD f4, f16, f21, f4 646 647 FMADD f6, f18, f21, f6 648 FMADD f7, f19, f21, f7 649 FMADD f8, f16, f22, f8 650 FMADD f9, f17, f22, f9 651 652 FMADD f11, f19, f22, f11 653 FMADD f12, f16, f23, f12 654 FMADD f13, f17, f23, f13 655 FMADD f14, f18, f23, f14 656 657 LFD f16, 4 * SIZE(AO) 658 LFD f17, 5 * SIZE(AO) 659 LFD f18, 6 * SIZE(AO) 660 LFD f19, 7 * SIZE(AO) 661 662 LFD f20, 4 * SIZE(BO) 663 LFD f21, 5 * SIZE(BO) 664 LFD f22, 6 * SIZE(BO) 665 LFD f23, 7 * SIZE(BO) 666 667 addi BO, BO, 4 * SIZE 668 addi AO, AO, 4 * SIZE 669 bdnz LL(16) 670 .align 4 671 672LL(18): 673#ifndef TRMMKERNEL 674 LFD f16, 0 * SIZE(CO1) 675 LFD f17, 1 * SIZE(CO1) 676 LFD f18, 2 * SIZE(CO1) 677 LFD f19, 3 * SIZE(CO1) 678 679 LFD f20, 0 * SIZE(CO2) 680 LFD f21, 1 * SIZE(CO2) 681 LFD f22, 2 * SIZE(CO2) 682 LFD f23, 3 * SIZE(CO2) 683 684 FMADD f0, f0, f30, f16 685 FMADD f1, f1, f30, f17 686 FMADD f2, f2, f30, f18 687 FMADD f3, f3, f30, f19 688 689 FMADD f4, f4, f30, f20 690 FMADD f5, f5, f30, f21 691 FMADD f6, f6, f30, f22 692 FMADD f7, f7, f30, f23 693 694 LFD f16, 0 * SIZE(CO3) 695 LFD f17, 1 * SIZE(CO3) 696 LFD f18, 2 * SIZE(CO3) 697 LFD f19, 3 * SIZE(CO3) 698 699 LFD f20, 0 * SIZE(CO4) 700 LFD f21, 1 * SIZE(CO4) 701 LFD f22, 2 * SIZE(CO4) 702 LFD f23, 3 * SIZE(CO4) 703 704 FMADD f8, f8, f30, f16 705 FMADD f9, f9, f30, f17 706 FMADD f10, f10, f30, f18 707 FMADD f11, f11, f30, f19 708 709 FMADD f12, f12, f30, f20 710 
FMADD f13, f13, f30, f21 711 FMADD f14, f14, f30, f22 712 FMADD f15, f15, f30, f23 713 714#else 715 716 FMUL f0, f0, f30 717 FMUL f1, f1, f30 718 FMUL f2, f2, f30 719 FMUL f3, f3, f30 720 721 FMUL f4, f4, f30 722 FMUL f5, f5, f30 723 FMUL f6, f6, f30 724 FMUL f7, f7, f30 725 726 FMUL f8, f8, f30 727 FMUL f9, f9, f30 728 FMUL f10, f10, f30 729 FMUL f11, f11, f30 730 731 FMUL f12, f12, f30 732 FMUL f13, f13, f30 733 FMUL f14, f14, f30 734 FMUL f15, f15, f30 735#endif 736 737 STFD f0, 0 * SIZE(CO1) 738 STFD f1, 1 * SIZE(CO1) 739 STFD f2, 2 * SIZE(CO1) 740 STFD f3, 3 * SIZE(CO1) 741 742 lfs f0, FZERO 743 fmr f1, f0 744 fmr f2, f0 745 fmr f3, f0 746 747 STFD f4, 0 * SIZE(CO2) 748 STFD f5, 1 * SIZE(CO2) 749 STFD f6, 2 * SIZE(CO2) 750 STFD f7, 3 * SIZE(CO2) 751 752 fmr f4, f0 753 fmr f5, f0 754 fmr f6, f0 755 fmr f7, f0 756 757 STFD f8, 0 * SIZE(CO3) 758 STFD f9, 1 * SIZE(CO3) 759 STFD f10, 2 * SIZE(CO3) 760 STFD f11, 3 * SIZE(CO3) 761 762 fmr f8, f0 763 fmr f9, f0 764 fmr f10, f0 765 fmr f11, f0 766 767 STFD f12, 0 * SIZE(CO4) 768 STFD f13, 1 * SIZE(CO4) 769 STFD f14, 2 * SIZE(CO4) 770 STFD f15, 3 * SIZE(CO4) 771 772 fmr f12, f0 773 fmr f13, f0 774 fmr f14, f0 775 fmr f15, f0 776 777 addi CO1, CO1, 4 * SIZE 778 addi CO2, CO2, 4 * SIZE 779 addi CO3, CO3, 4 * SIZE 780 addi CO4, CO4, 4 * SIZE 781 782#ifdef TRMMKERNEL 783#if ( defined(LEFT) && defined(TRANSA)) || \ 784 (!defined(LEFT) && !defined(TRANSA)) 785 sub TEMP, K, KK 786#ifdef LEFT 787 addi TEMP, TEMP, -4 788#else 789 addi TEMP, TEMP, -4 790#endif 791 slwi TEMP, TEMP, 2 + BASE_SHIFT 792 add AO, AO, TEMP 793 add BO, BO, TEMP 794#endif 795 796#ifdef LEFT 797 addi KK, KK, 4 798#endif 799#endif 800 801 addic. I, I, -1 802 bgt+ LL(11) 803 .align 4 804 805LL(20): 806 andi. 
I, M, 2 807 ble LL(30) 808 809#if defined(TRMMKERNEL) 810#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 811 812 LFD f16, 0 * SIZE(AO) 813 LFD f17, 1 * SIZE(AO) 814 LFD f18, 2 * SIZE(AO) 815 LFD f19, 3 * SIZE(AO) 816 817 LFD f20, 0 * SIZE(B) 818 LFD f21, 1 * SIZE(B) 819 LFD f22, 2 * SIZE(B) 820 LFD f23, 3 * SIZE(B) 821 822 LFD f24, 4 * SIZE(B) 823 LFD f25, 5 * SIZE(B) 824 LFD f26, 6 * SIZE(B) 825 LFD f27, 7 * SIZE(B) 826 827 mr BO, B 828#else 829 slwi r0, KK, 1 + BASE_SHIFT 830 slwi TEMP, KK, 2 + BASE_SHIFT 831 add AO, AO, r0 832 add BO, B, TEMP 833 834 LFD f16, 0 * SIZE(AO) 835 LFD f17, 1 * SIZE(AO) 836 LFD f18, 2 * SIZE(AO) 837 LFD f19, 3 * SIZE(AO) 838 839 LFD f20, 0 * SIZE(BO) 840 LFD f21, 1 * SIZE(BO) 841 LFD f22, 2 * SIZE(BO) 842 LFD f23, 3 * SIZE(BO) 843 844 LFD f24, 4 * SIZE(BO) 845 LFD f25, 5 * SIZE(BO) 846 LFD f26, 6 * SIZE(BO) 847 LFD f27, 7 * SIZE(BO) 848#endif 849 850#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 851 sub TEMP, K, KK 852#elif defined(LEFT) 853 addi TEMP, KK, 2 854#else 855 addi TEMP, KK, 4 856#endif 857 srawi. TEMP, TEMP, 2 858 mtspr CTR, TEMP 859 860#else 861 LFD f16, 0 * SIZE(AO) 862 LFD f17, 1 * SIZE(AO) 863 LFD f18, 2 * SIZE(AO) 864 LFD f19, 3 * SIZE(AO) 865 866 LFD f20, 0 * SIZE(B) 867 LFD f21, 1 * SIZE(B) 868 LFD f22, 2 * SIZE(B) 869 LFD f23, 3 * SIZE(B) 870 871 LFD f24, 4 * SIZE(B) 872 LFD f25, 5 * SIZE(B) 873 LFD f26, 6 * SIZE(B) 874 LFD f27, 7 * SIZE(B) 875 876 srawi. 
r0, K, 2 877 mtspr CTR, r0 878 mr BO, B 879#endif 880 ble LL(25) 881 .align 5 882 883LL(22): 884 FMADD f0, f16, f20, f0 885 FMADD f1, f17, f20, f1 886 FMADD f4, f16, f21, f4 887 FMADD f5, f17, f21, f5 888 889 FMADD f8, f16, f22, f8 890 FMADD f9, f17, f22, f9 891 FMADD f12, f16, f23, f12 892 FMADD f13, f17, f23, f13 893 894 LFD f20, 8 * SIZE(BO) 895 LFD f21, 9 * SIZE(BO) 896 LFD f22, 10 * SIZE(BO) 897 LFD f23, 11 * SIZE(BO) 898 899 FMADD f2, f18, f24, f2 900 FMADD f3, f19, f24, f3 901 FMADD f6, f18, f25, f6 902 FMADD f7, f19, f25, f7 903 904 FMADD f10, f18, f26, f10 905 FMADD f11, f19, f26, f11 906 FMADD f14, f18, f27, f14 907 FMADD f15, f19, f27, f15 908 909 LFD f16, 4 * SIZE(AO) 910 LFD f17, 5 * SIZE(AO) 911 LFD f18, 6 * SIZE(AO) 912 LFD f19, 7 * SIZE(AO) 913 914 FMADD f0, f16, f20, f0 915 FMADD f1, f17, f20, f1 916 FMADD f4, f16, f21, f4 917 FMADD f5, f17, f21, f5 918 919 LFD f24, 12 * SIZE(BO) 920 LFD f25, 13 * SIZE(BO) 921 LFD f26, 14 * SIZE(BO) 922 LFD f27, 15 * SIZE(BO) 923 924 FMADD f8, f16, f22, f8 925 FMADD f9, f17, f22, f9 926 FMADD f12, f16, f23, f12 927 FMADD f13, f17, f23, f13 928 929 LFD f20, 16 * SIZE(BO) 930 LFD f21, 17 * SIZE(BO) 931 LFD f22, 18 * SIZE(BO) 932 LFD f23, 19 * SIZE(BO) 933 934 FMADD f2, f18, f24, f2 935 FMADD f3, f19, f24, f3 936 FMADD f6, f18, f25, f6 937 FMADD f7, f19, f25, f7 938 939 FMADD f10, f18, f26, f10 940 FMADD f11, f19, f26, f11 941 FMADD f14, f18, f27, f14 942 FMADD f15, f19, f27, f15 943 944 LFD f16, 8 * SIZE(AO) 945 LFD f17, 9 * SIZE(AO) 946 LFD f18, 10 * SIZE(AO) 947 LFD f19, 11 * SIZE(AO) 948 949 LFD f24, 20 * SIZE(BO) 950 LFD f25, 21 * SIZE(BO) 951 LFD f26, 22 * SIZE(BO) 952 LFD f27, 23 * SIZE(BO) 953 954 addi AO, AO, 8 * SIZE 955 addi BO, BO, 16 * SIZE 956 DCBT(BO, PREB) 957 bdnz LL(22) 958 959 fadd f0, f2, f0 960 fadd f1, f3, f1 961 fadd f4, f6, f4 962 fadd f5, f7, f5 963 fadd f8, f10, f8 964 fadd f9, f11, f9 965 fadd f12, f14, f12 966 fadd f13, f15, f13 967 .align 4 968 969LL(25): 970 lfd f30, ALPHA 971 972#if 
defined(TRMMKERNEL) 973 974#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 975 sub TEMP, K, KK 976#elif defined(LEFT) 977 addi TEMP, KK, 2 978#else 979 addi TEMP, KK, 4 980#endif 981 andi. TEMP, TEMP, 3 982 mtspr CTR, TEMP 983 984#else 985 986 andi. r0, K, 3 987 mtspr CTR, r0 988 989#endif 990 ble+ LL(28) 991 .align 4 992 993LL(26): 994 FMADD f0, f16, f20, f0 995 FMADD f1, f17, f20, f1 996 FMADD f4, f16, f21, f4 997 FMADD f5, f17, f21, f5 998 999 FMADD f8, f16, f22, f8 1000 FMADD f9, f17, f22, f9 1001 FMADD f12, f16, f23, f12 1002 FMADD f13, f17, f23, f13 1003 1004 LFD f16, 2 * SIZE(AO) 1005 LFD f17, 3 * SIZE(AO) 1006 1007 LFD f20, 4 * SIZE(BO) 1008 LFD f21, 5 * SIZE(BO) 1009 LFD f22, 6 * SIZE(BO) 1010 LFD f23, 7 * SIZE(BO) 1011 1012 addi BO, BO, 4 * SIZE 1013 addi AO, AO, 2 * SIZE 1014 bdnz LL(26) 1015 .align 4 1016 1017LL(28): 1018#ifndef TRMMKERNEL 1019 LFD f16, 0 * SIZE(CO1) 1020 LFD f17, 1 * SIZE(CO1) 1021 LFD f18, 0 * SIZE(CO2) 1022 LFD f19, 1 * SIZE(CO2) 1023 1024 FMADD f0, f0, f30, f16 1025 FMADD f1, f1, f30, f17 1026 FMADD f4, f4, f30, f18 1027 FMADD f5, f5, f30, f19 1028 1029 LFD f20, 0 * SIZE(CO3) 1030 LFD f21, 1 * SIZE(CO3) 1031 LFD f22, 0 * SIZE(CO4) 1032 LFD f23, 1 * SIZE(CO4) 1033 1034 FMADD f8, f8, f30, f20 1035 FMADD f9, f9, f30, f21 1036 FMADD f12, f12, f30, f22 1037 FMADD f13, f13, f30, f23 1038#else 1039 FMUL f0, f0, f30 1040 FMUL f1, f1, f30 1041 FMUL f4, f4, f30 1042 FMUL f5, f5, f30 1043 1044 FMUL f8, f8, f30 1045 FMUL f9, f9, f30 1046 FMUL f12, f12, f30 1047 FMUL f13, f13, f30 1048#endif 1049 1050 STFD f0, 0 * SIZE(CO1) 1051 STFD f1, 1 * SIZE(CO1) 1052 STFD f4, 0 * SIZE(CO2) 1053 STFD f5, 1 * SIZE(CO2) 1054 1055 lfs f0, FZERO 1056 fmr f1, f0 1057 fmr f2, f0 1058 fmr f3, f0 1059 1060 STFD f8, 0 * SIZE(CO3) 1061 STFD f9, 1 * SIZE(CO3) 1062 STFD f12, 0 * SIZE(CO4) 1063 STFD f13, 1 * SIZE(CO4) 1064 1065 fmr f4, f0 1066 fmr f5, f0 1067 fmr f6, f0 1068 fmr f7, f0 1069 1070 fmr f8, f0 1071 fmr f9, f0 1072 fmr f10, f0 
1073 fmr f11, f0 1074 1075 fmr f12, f0 1076 fmr f13, f0 1077 fmr f14, f0 1078 fmr f15, f0 1079 1080 addi CO1, CO1, 2 * SIZE 1081 addi CO2, CO2, 2 * SIZE 1082 addi CO3, CO3, 2 * SIZE 1083 addi CO4, CO4, 2 * SIZE 1084 1085#ifdef TRMMKERNEL 1086#if ( defined(LEFT) && defined(TRANSA)) || \ 1087 (!defined(LEFT) && !defined(TRANSA)) 1088 sub TEMP, K, KK 1089#ifdef LEFT 1090 addi TEMP, TEMP, -2 1091#else 1092 addi TEMP, TEMP, -4 1093#endif 1094 slwi r0, TEMP, 1 + BASE_SHIFT 1095 slwi TEMP, TEMP, 2 + BASE_SHIFT 1096 add AO, AO, r0 1097 add BO, BO, TEMP 1098#endif 1099 1100#ifdef LEFT 1101 addi KK, KK, 2 1102#endif 1103#endif 1104 .align 4 1105 1106LL(30): 1107 andi. I, M, 1 1108 ble LL(39) 1109 1110#if defined(TRMMKERNEL) 1111 1112#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1113 1114 LFD f16, 0 * SIZE(AO) 1115 LFD f17, 1 * SIZE(AO) 1116 LFD f18, 2 * SIZE(AO) 1117 LFD f19, 3 * SIZE(AO) 1118 1119 LFD f20, 0 * SIZE(B) 1120 LFD f21, 1 * SIZE(B) 1121 LFD f22, 2 * SIZE(B) 1122 LFD f23, 3 * SIZE(B) 1123 1124 LFD f24, 4 * SIZE(B) 1125 LFD f25, 5 * SIZE(B) 1126 LFD f26, 6 * SIZE(B) 1127 LFD f27, 7 * SIZE(B) 1128 1129 mr BO, B 1130#else 1131 slwi r0, KK, 0 + BASE_SHIFT 1132 slwi TEMP, KK, 2 + BASE_SHIFT 1133 add AO, AO, r0 1134 add BO, B, TEMP 1135 1136 LFD f16, 0 * SIZE(AO) 1137 LFD f17, 1 * SIZE(AO) 1138 LFD f18, 2 * SIZE(AO) 1139 LFD f19, 3 * SIZE(AO) 1140 1141 LFD f20, 0 * SIZE(BO) 1142 LFD f21, 1 * SIZE(BO) 1143 LFD f22, 2 * SIZE(BO) 1144 LFD f23, 3 * SIZE(BO) 1145 1146 LFD f24, 4 * SIZE(BO) 1147 LFD f25, 5 * SIZE(BO) 1148 LFD f26, 6 * SIZE(BO) 1149 LFD f27, 7 * SIZE(BO) 1150#endif 1151 1152#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1153 sub TEMP, K, KK 1154#elif defined(LEFT) 1155 addi TEMP, KK, 1 1156#else 1157 addi TEMP, KK, 4 1158#endif 1159 1160 srawi. 
TEMP, TEMP, 2 1161 mtspr CTR, TEMP 1162 1163#else 1164 LFD f16, 0 * SIZE(AO) 1165 LFD f17, 1 * SIZE(AO) 1166 LFD f18, 2 * SIZE(AO) 1167 LFD f19, 3 * SIZE(AO) 1168 1169 LFD f20, 0 * SIZE(B) 1170 LFD f21, 1 * SIZE(B) 1171 LFD f22, 2 * SIZE(B) 1172 LFD f23, 3 * SIZE(B) 1173 1174 LFD f24, 4 * SIZE(B) 1175 LFD f25, 5 * SIZE(B) 1176 LFD f26, 6 * SIZE(B) 1177 LFD f27, 7 * SIZE(B) 1178 1179 srawi. r0, K, 2 1180 mtspr CTR, r0 1181 mr BO, B 1182#endif 1183 ble LL(35) 1184 .align 5 1185 1186LL(32): 1187 FMADD f0, f16, f20, f0 1188 FMADD f4, f16, f21, f4 1189 FMADD f8, f16, f22, f8 1190 FMADD f12, f16, f23, f12 1191 1192 LFD f20, 8 * SIZE(BO) 1193 LFD f21, 9 * SIZE(BO) 1194 LFD f22, 10 * SIZE(BO) 1195 LFD f23, 11 * SIZE(BO) 1196 1197 FMADD f1, f17, f24, f1 1198 FMADD f5, f17, f25, f5 1199 FMADD f9, f17, f26, f9 1200 FMADD f13, f17, f27, f13 1201 1202 LFD f24, 12 * SIZE(BO) 1203 LFD f25, 13 * SIZE(BO) 1204 LFD f26, 14 * SIZE(BO) 1205 LFD f27, 15 * SIZE(BO) 1206 1207 FMADD f0, f18, f20, f0 1208 FMADD f4, f18, f21, f4 1209 FMADD f8, f18, f22, f8 1210 FMADD f12, f18, f23, f12 1211 1212 LFD f20, 16 * SIZE(BO) 1213 LFD f21, 17 * SIZE(BO) 1214 LFD f22, 18 * SIZE(BO) 1215 LFD f23, 19 * SIZE(BO) 1216 1217 FMADD f1, f19, f24, f1 1218 FMADD f5, f19, f25, f5 1219 FMADD f9, f19, f26, f9 1220 FMADD f13, f19, f27, f13 1221 1222 LFD f16, 4 * SIZE(AO) 1223 LFD f17, 5 * SIZE(AO) 1224 LFD f18, 6 * SIZE(AO) 1225 LFD f19, 7 * SIZE(AO) 1226 1227 LFD f24, 20 * SIZE(BO) 1228 LFD f25, 21 * SIZE(BO) 1229 LFD f26, 22 * SIZE(BO) 1230 LFD f27, 23 * SIZE(BO) 1231 1232 addi AO, AO, 4 * SIZE 1233 addi BO, BO, 16 * SIZE 1234 DCBT(BO, PREB) 1235 bdnz LL(32) 1236 1237 fadd f0, f1, f0 1238 fadd f4, f5, f4 1239 fadd f8, f9, f8 1240 fadd f12, f13, f12 1241 .align 4 1242 1243LL(35): 1244 lfd f30, ALPHA 1245#if defined(TRMMKERNEL) 1246 1247#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1248 sub TEMP, K, KK 1249#elif defined(LEFT) 1250 addi TEMP, KK, 1 1251#else 1252 addi TEMP, KK, 4 
#endif
	/* TRMM: TEMP holds this tile's effective k-count; the main loop   */
	/* LL(32) consumed it four steps at a time, so CTR = count & 3.    */
	andi.	TEMP, TEMP, 3
	mtspr	CTR, TEMP

#else
	/* Plain GEMM: leftover k-steps after the 4x-unrolled loop = K & 3. */
	andi.	r0, K, 3
	mtspr	CTR, r0

#endif
	ble+	LL(38)		/* nothing left over: go straight to write-back */
	.align 4

/* ------------------------------------------------------------------ */
/* LL(36): remainder k-loop of the 1-row x 4-column tile.              */
/*   f16        current A element                                      */
/*   f20..f23   four B elements of the current k-step                  */
/*   f0/f4/f8/f12  accumulators for columns 1..4                       */
/* One k-step per iteration: AO advances 1 element, BO advances 4.     */
/* (FMADD/LFD/STFD/FMUL are macros from common.h; presumably the       */
/*  precision-dependent fmadd/load/store/multiply - confirm there.)    */
/* ------------------------------------------------------------------ */
LL(36):
	FMADD	f0,  f16, f20, f0
	FMADD	f4,  f16, f21, f4
	FMADD	f8,  f16, f22, f8
	FMADD	f12, f16, f23, f12

	LFD	f16,  1 * SIZE(AO)

	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)

	addi	BO, BO,  4 * SIZE
	addi	AO, AO,  1 * SIZE
	bdnz	LL(36)
	.align 4

/* ------------------------------------------------------------------ */
/* LL(38): write-back of the 1x4 tile. f30 = alpha (loaded at LL(35)). */
/* ------------------------------------------------------------------ */
LL(38):
#ifndef TRMMKERNEL
	/* GEMM: C = alpha * acc + C */
	LFD	f16, 0 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f20, 0 * SIZE(CO3)
	LFD	f22, 0 * SIZE(CO4)

	FMADD	f0,  f0,  f30, f16
	FMADD	f4,  f4,  f30, f18
	FMADD	f8,  f8,  f30, f20
	FMADD	f12, f12, f30, f22
#else
	/* TRMM: C = alpha * acc (C is not read) */
	FMUL	f0,  f0,  f30
	FMUL	f4,  f4,  f30
	FMUL	f8,  f8,  f30
	FMUL	f12, f12, f30
#endif

	STFD	f0,  0 * SIZE(CO1)
	STFD	f4,  0 * SIZE(CO2)
	STFD	f8,  0 * SIZE(CO3)
	STFD	f12, 0 * SIZE(CO4)

	/* Re-zero the accumulators this tile used (FZERO is the zero word */
	/* stored on the stack in the prologue).                           */
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f4,  f0
	fmr	f5,  f0

	fmr	f8,  f0
	fmr	f9,  f0
	fmr	f12, f0
	fmr	f13, f0

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the part of the packed panels this tile did not walk:      */
	/* TEMP = K - KK - (rows or columns of the tile);                  */
	/* AO += TEMP * 1 element, BO += TEMP * 4 elements.                */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -1
#else
	addi	TEMP, TEMP, -4
#endif
	slwi	r0,   TEMP, 0 + BASE_SHIFT
	slwi	TEMP, TEMP, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	/* This tile covers ONE row of A, so KK advances by 1. The previous */
	/* value of 2 was copied from the 2-row tile; it disagreed with the */
	/* "KK + 1" / "TEMP - 1" arithmetic used everywhere else in this    */
	/* tile. (In the visible code KK is reloaded from OFFSET at LL(10)  */
	/* and LL(40), so the wrong step had no effect there, but it left   */
	/* KK inconsistent with the rows actually processed.)               */
	addi	KK, KK, 1
#endif
#endif
	.align 4


/* ------------------------------------------------------------------ */
/* LL(39): end of one 4-column pass.                                    */
/* ------------------------------------------------------------------ */
LL(39):
#if defined(TRMMKERNEL) && !defined(LEFT)
	addi	KK, KK, 4	/* right-side TRMM: four columns consumed */
#endif

	mr	B, BO		/* next pass starts where BO stopped */
	addic.	J, J, -1
	bgt	LL(10)
	.align 4

/* ------------------------------------------------------------------ */
/* LL(40): start of the two-column (N & 2) pass: CO1/CO2 = the two      */
/* output columns.                                                      */
/* ------------------------------------------------------------------ */
LL(40):
	mr	CO1, C
	add	CO2, C,  LDC
	andi.
J, N, 2 1352 ble LL(70) 1353 1354#if defined(TRMMKERNEL) && defined(LEFT) 1355 mr KK, OFFSET 1356#endif 1357 1358 1359 lfs f0, FZERO 1360 fmr f1, f0 1361 fmr f2, f0 1362 fmr f3, f0 1363 fmr f4, f0 1364 fmr f5, f0 1365 fmr f6, f0 1366 fmr f7, f0 1367 1368 srawi. I, M, 2 1369 add C, CO2, LDC 1370 mr AO, A 1371 ble LL(50) 1372 .align 4 1373 1374LL(41): 1375#if defined(TRMMKERNEL) 1376#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1377 1378 LFD f16, 0 * SIZE(AO) 1379 LFD f17, 1 * SIZE(AO) 1380 LFD f18, 2 * SIZE(AO) 1381 LFD f19, 3 * SIZE(AO) 1382 1383 LFD f20, 0 * SIZE(B) 1384 LFD f21, 1 * SIZE(B) 1385 LFD f22, 2 * SIZE(B) 1386 LFD f23, 3 * SIZE(B) 1387 1388 mr BO, B 1389#else 1390 slwi r0, KK, 2 + BASE_SHIFT 1391 slwi TEMP, KK, 1 + BASE_SHIFT 1392 add AO, AO, r0 1393 add BO, B, TEMP 1394 1395 LFD f16, 0 * SIZE(AO) 1396 LFD f17, 1 * SIZE(AO) 1397 LFD f18, 2 * SIZE(AO) 1398 LFD f19, 3 * SIZE(AO) 1399 1400 LFD f20, 0 * SIZE(BO) 1401 LFD f21, 1 * SIZE(BO) 1402 LFD f22, 2 * SIZE(BO) 1403 LFD f23, 3 * SIZE(BO) 1404#endif 1405 1406 DCBTST(CO1, PREC) 1407 DCBTST(CO2, PREC) 1408 1409#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1410 sub TEMP, K, KK 1411#elif defined(LEFT) 1412 addi TEMP, KK, 4 1413#else 1414 addi TEMP, KK, 2 1415#endif 1416 srawi. TEMP, TEMP, 2 1417 mtspr CTR, TEMP 1418 1419#else 1420 1421 LFD f16, 0 * SIZE(AO) 1422 LFD f17, 1 * SIZE(AO) 1423 LFD f18, 2 * SIZE(AO) 1424 LFD f19, 3 * SIZE(AO) 1425 1426 LFD f20, 0 * SIZE(B) 1427 LFD f21, 1 * SIZE(B) 1428 LFD f22, 2 * SIZE(B) 1429 LFD f23, 3 * SIZE(B) 1430 1431 DCBTST(CO1, PREC) 1432 DCBTST(CO2, PREC) 1433 1434 srawi. 
r0, K, 2 1435 mtspr CTR, r0 1436 mr BO, B 1437#endif 1438 ble LL(45) 1439 .align 5 1440 1441LL(42): 1442 FMADD f0, f16, f20, f0 1443 FMADD f1, f17, f20, f1 1444 FMADD f2, f18, f20, f2 1445 FMADD f3, f19, f20, f3 1446 1447 FMADD f4, f16, f21, f4 1448 FMADD f5, f17, f21, f5 1449 FMADD f6, f18, f21, f6 1450 FMADD f7, f19, f21, f7 1451 1452 LFD f16, 4 * SIZE(AO) 1453 LFD f17, 5 * SIZE(AO) 1454 LFD f18, 6 * SIZE(AO) 1455 LFD f19, 7 * SIZE(AO) 1456 1457 FMADD f0, f16, f22, f0 1458 FMADD f1, f17, f22, f1 1459 FMADD f2, f18, f22, f2 1460 FMADD f3, f19, f22, f3 1461 1462 FMADD f4, f16, f23, f4 1463 FMADD f5, f17, f23, f5 1464 FMADD f6, f18, f23, f6 1465 FMADD f7, f19, f23, f7 1466 1467 LFD f16, 8 * SIZE(AO) 1468 LFD f17, 9 * SIZE(AO) 1469 LFD f18, 10 * SIZE(AO) 1470 LFD f19, 11 * SIZE(AO) 1471 1472 LFD f20, 4 * SIZE(BO) 1473 LFD f21, 5 * SIZE(BO) 1474 LFD f22, 6 * SIZE(BO) 1475 LFD f23, 7 * SIZE(BO) 1476 1477 FMADD f0, f16, f20, f0 1478 FMADD f1, f17, f20, f1 1479 FMADD f2, f18, f20, f2 1480 FMADD f3, f19, f20, f3 1481 1482 FMADD f4, f16, f21, f4 1483 FMADD f5, f17, f21, f5 1484 FMADD f6, f18, f21, f6 1485 FMADD f7, f19, f21, f7 1486 1487 LFD f16, 12 * SIZE(AO) 1488 LFD f17, 13 * SIZE(AO) 1489 LFD f18, 14 * SIZE(AO) 1490 LFD f19, 15 * SIZE(AO) 1491 1492 FMADD f0, f16, f22, f0 1493 FMADD f1, f17, f22, f1 1494 FMADD f2, f18, f22, f2 1495 FMADD f3, f19, f22, f3 1496 1497 FMADD f4, f16, f23, f4 1498 FMADD f5, f17, f23, f5 1499 FMADD f6, f18, f23, f6 1500 FMADD f7, f19, f23, f7 1501 1502 LFD f16, 16 * SIZE(AO) 1503 LFD f17, 17 * SIZE(AO) 1504 LFD f18, 18 * SIZE(AO) 1505 LFD f19, 19 * SIZE(AO) 1506 1507 LFD f20, 8 * SIZE(BO) 1508 LFD f21, 9 * SIZE(BO) 1509 LFD f22, 10 * SIZE(BO) 1510 LFD f23, 11 * SIZE(BO) 1511 1512 addi AO, AO, 16 * SIZE 1513 addi BO, BO, 8 * SIZE 1514 DCBT(BO, PREB) 1515 bdnz LL(42) 1516 .align 4 1517 1518LL(45): 1519 lfd f30, ALPHA 1520#if defined(TRMMKERNEL) 1521 1522#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1523 sub TEMP, 
K, KK 1524#elif defined(LEFT) 1525 addi TEMP, KK, 4 1526#else 1527 addi TEMP, KK, 2 1528#endif 1529 andi. TEMP, TEMP, 3 1530 mtspr CTR, TEMP 1531#else 1532 andi. r0, K, 3 1533 mtspr CTR, r0 1534#endif 1535 ble+ LL(48) 1536 .align 4 1537 1538LL(46): 1539 FMADD f0, f16, f20, f0 1540 FMADD f1, f17, f20, f1 1541 FMADD f2, f18, f20, f2 1542 FMADD f3, f19, f20, f3 1543 1544 FMADD f4, f16, f21, f4 1545 FMADD f5, f17, f21, f5 1546 FMADD f6, f18, f21, f6 1547 FMADD f7, f19, f21, f7 1548 1549 LFD f16, 4 * SIZE(AO) 1550 LFD f17, 5 * SIZE(AO) 1551 LFD f18, 6 * SIZE(AO) 1552 LFD f19, 7 * SIZE(AO) 1553 1554 LFD f20, 2 * SIZE(BO) 1555 LFD f21, 3 * SIZE(BO) 1556 1557 addi BO, BO, 2 * SIZE 1558 addi AO, AO, 4 * SIZE 1559 bdnz LL(46) 1560 .align 4 1561 1562LL(48): 1563#ifndef TRMMKERNEL 1564 LFD f16, 0 * SIZE(CO1) 1565 LFD f17, 1 * SIZE(CO1) 1566 LFD f18, 2 * SIZE(CO1) 1567 LFD f19, 3 * SIZE(CO1) 1568 1569 LFD f20, 0 * SIZE(CO2) 1570 LFD f21, 1 * SIZE(CO2) 1571 LFD f22, 2 * SIZE(CO2) 1572 LFD f23, 3 * SIZE(CO2) 1573 1574 FMADD f0, f0, f30, f16 1575 FMADD f1, f1, f30, f17 1576 FMADD f2, f2, f30, f18 1577 FMADD f3, f3, f30, f19 1578 1579 FMADD f4, f4, f30, f20 1580 FMADD f5, f5, f30, f21 1581 FMADD f6, f6, f30, f22 1582 FMADD f7, f7, f30, f23 1583#else 1584 FMUL f0, f0, f30 1585 FMUL f1, f1, f30 1586 FMUL f2, f2, f30 1587 FMUL f3, f3, f30 1588 1589 FMUL f4, f4, f30 1590 FMUL f5, f5, f30 1591 FMUL f6, f6, f30 1592 FMUL f7, f7, f30 1593#endif 1594 1595 STFD f0, 0 * SIZE(CO1) 1596 STFD f1, 1 * SIZE(CO1) 1597 STFD f2, 2 * SIZE(CO1) 1598 STFD f3, 3 * SIZE(CO1) 1599 1600 lfs f0, FZERO 1601 fmr f1, f0 1602 fmr f2, f0 1603 fmr f3, f0 1604 1605 STFD f4, 0 * SIZE(CO2) 1606 STFD f5, 1 * SIZE(CO2) 1607 STFD f6, 2 * SIZE(CO2) 1608 STFD f7, 3 * SIZE(CO2) 1609 1610 fmr f4, f0 1611 fmr f5, f0 1612 fmr f6, f0 1613 fmr f7, f0 1614 1615 addi CO1, CO1, 4 * SIZE 1616 addi CO2, CO2, 4 * SIZE 1617 1618#ifdef TRMMKERNEL 1619#if ( defined(LEFT) && defined(TRANSA)) || \ 1620 (!defined(LEFT) && 
!defined(TRANSA)) 1621 sub TEMP, K, KK 1622#ifdef LEFT 1623 addi TEMP, TEMP, -4 1624#else 1625 addi TEMP, TEMP, -2 1626#endif 1627 slwi r0, TEMP, 2 + BASE_SHIFT 1628 slwi TEMP, TEMP, 1 + BASE_SHIFT 1629 add AO, AO, r0 1630 add BO, BO, TEMP 1631#endif 1632 1633#ifdef LEFT 1634 addi KK, KK, 4 1635#endif 1636#endif 1637 1638 addic. I, I, -1 1639 bgt+ LL(41) 1640 .align 4 1641 1642LL(50): 1643 andi. I, M, 2 1644 ble LL(60) 1645 1646#if defined(TRMMKERNEL) 1647 1648#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1649 1650 LFD f16, 0 * SIZE(AO) 1651 LFD f17, 1 * SIZE(AO) 1652 LFD f18, 2 * SIZE(AO) 1653 LFD f19, 3 * SIZE(AO) 1654 1655 LFD f20, 0 * SIZE(B) 1656 LFD f21, 1 * SIZE(B) 1657 LFD f22, 2 * SIZE(B) 1658 LFD f23, 3 * SIZE(B) 1659 1660 LFD f24, 4 * SIZE(B) 1661 LFD f25, 5 * SIZE(B) 1662 LFD f26, 6 * SIZE(B) 1663 LFD f27, 7 * SIZE(B) 1664 1665 mr BO, B 1666#else 1667 slwi r0, KK, 1 + BASE_SHIFT 1668 slwi TEMP, KK, 1 + BASE_SHIFT 1669 add AO, AO, r0 1670 add BO, B, TEMP 1671 1672 LFD f16, 0 * SIZE(AO) 1673 LFD f17, 1 * SIZE(AO) 1674 LFD f18, 2 * SIZE(AO) 1675 LFD f19, 3 * SIZE(AO) 1676 1677 LFD f20, 0 * SIZE(BO) 1678 LFD f21, 1 * SIZE(BO) 1679 LFD f22, 2 * SIZE(BO) 1680 LFD f23, 3 * SIZE(BO) 1681 1682 LFD f24, 4 * SIZE(BO) 1683 LFD f25, 5 * SIZE(BO) 1684 LFD f26, 6 * SIZE(BO) 1685 LFD f27, 7 * SIZE(BO) 1686#endif 1687 1688#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1689 sub TEMP, K, KK 1690#elif defined(LEFT) 1691 addi TEMP, KK, 2 1692#else 1693 addi TEMP, KK, 2 1694#endif 1695 srawi. TEMP, TEMP, 2 1696 mtspr CTR, TEMP 1697 1698#else 1699 LFD f16, 0 * SIZE(AO) 1700 LFD f17, 1 * SIZE(AO) 1701 LFD f18, 2 * SIZE(AO) 1702 LFD f19, 3 * SIZE(AO) 1703 1704 LFD f20, 0 * SIZE(B) 1705 LFD f21, 1 * SIZE(B) 1706 LFD f22, 2 * SIZE(B) 1707 LFD f23, 3 * SIZE(B) 1708 1709 LFD f24, 4 * SIZE(B) 1710 LFD f25, 5 * SIZE(B) 1711 LFD f26, 6 * SIZE(B) 1712 LFD f27, 7 * SIZE(B) 1713 1714 srawi. 
r0, K, 2 1715 mtspr CTR, r0 1716 mr BO, B 1717#endif 1718 ble LL(55) 1719 .align 5 1720 1721LL(52): 1722 FMADD f0, f16, f20, f0 1723 FMADD f1, f17, f20, f1 1724 FMADD f2, f16, f21, f2 1725 FMADD f3, f17, f21, f3 1726 1727 FMADD f4, f18, f22, f4 1728 FMADD f5, f19, f22, f5 1729 FMADD f6, f18, f23, f6 1730 FMADD f7, f19, f23, f7 1731 1732 LFD f16, 4 * SIZE(AO) 1733 LFD f17, 5 * SIZE(AO) 1734 LFD f18, 6 * SIZE(AO) 1735 LFD f19, 7 * SIZE(AO) 1736 1737 LFD f20, 8 * SIZE(BO) 1738 LFD f21, 9 * SIZE(BO) 1739 LFD f22, 10 * SIZE(BO) 1740 LFD f23, 11 * SIZE(BO) 1741 1742 FMADD f0, f16, f24, f0 1743 FMADD f1, f17, f24, f1 1744 FMADD f2, f16, f25, f2 1745 FMADD f3, f17, f25, f3 1746 1747 FMADD f4, f18, f26, f4 1748 FMADD f5, f19, f26, f5 1749 FMADD f6, f18, f27, f6 1750 FMADD f7, f19, f27, f7 1751 1752 LFD f16, 8 * SIZE(AO) 1753 LFD f17, 9 * SIZE(AO) 1754 LFD f18, 10 * SIZE(AO) 1755 LFD f19, 11 * SIZE(AO) 1756 1757 LFD f24, 12 * SIZE(BO) 1758 LFD f25, 13 * SIZE(BO) 1759 LFD f26, 14 * SIZE(BO) 1760 LFD f27, 15 * SIZE(BO) 1761 1762 addi AO, AO, 8 * SIZE 1763 addi BO, BO, 8 * SIZE 1764 DCBT(BO, PREB) 1765 bdnz LL(52) 1766 .align 4 1767 1768LL(55): 1769 lfd f30, ALPHA 1770#if defined(TRMMKERNEL) 1771 1772#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1773 sub TEMP, K, KK 1774#elif defined(LEFT) 1775 addi TEMP, KK, 2 1776#else 1777 addi TEMP, KK, 2 1778#endif 1779 andi. TEMP, TEMP, 3 1780 mtspr CTR, TEMP 1781 1782#else 1783 andi. 
r0, K, 3 1784 mtspr CTR, r0 1785#endif 1786 ble+ LL(58) 1787 .align 4 1788 1789LL(56): 1790 FMADD f0, f16, f20, f0 1791 FMADD f1, f17, f20, f1 1792 FMADD f2, f16, f21, f2 1793 FMADD f3, f17, f21, f3 1794 1795 LFD f16, 2 * SIZE(AO) 1796 LFD f17, 3 * SIZE(AO) 1797 LFD f20, 2 * SIZE(BO) 1798 LFD f21, 3 * SIZE(BO) 1799 1800 addi BO, BO, 2 * SIZE 1801 addi AO, AO, 2 * SIZE 1802 bdnz LL(56) 1803 .align 4 1804 1805LL(58): 1806#ifndef TRMMKERNEL 1807 LFD f16, 0 * SIZE(CO1) 1808 LFD f17, 1 * SIZE(CO1) 1809 LFD f18, 0 * SIZE(CO2) 1810 LFD f19, 1 * SIZE(CO2) 1811 1812 FADD f0, f4, f0 1813 FADD f1, f5, f1 1814 FADD f2, f6, f2 1815 FADD f3, f7, f3 1816 1817 FMADD f0, f0, f30, f16 1818 FMADD f1, f1, f30, f17 1819 FMADD f2, f2, f30, f18 1820 FMADD f3, f3, f30, f19 1821#else 1822 FADD f0, f4, f0 1823 FADD f1, f5, f1 1824 FADD f2, f6, f2 1825 FADD f3, f7, f3 1826 1827 FMUL f0, f0, f30 1828 FMUL f1, f1, f30 1829 FMUL f2, f2, f30 1830 FMUL f3, f3, f30 1831#endif 1832 1833 STFD f0, 0 * SIZE(CO1) 1834 STFD f1, 1 * SIZE(CO1) 1835 STFD f2, 0 * SIZE(CO2) 1836 STFD f3, 1 * SIZE(CO2) 1837 1838 lfs f0, FZERO 1839 fmr f1, f0 1840 fmr f2, f0 1841 fmr f3, f0 1842 1843 fmr f4, f0 1844 fmr f5, f0 1845 fmr f6, f0 1846 fmr f7, f0 1847 1848 addi CO1, CO1, 2 * SIZE 1849 addi CO2, CO2, 2 * SIZE 1850 1851#ifdef TRMMKERNEL 1852#if ( defined(LEFT) && defined(TRANSA)) || \ 1853 (!defined(LEFT) && !defined(TRANSA)) 1854 sub TEMP, K, KK 1855#ifdef LEFT 1856 addi TEMP, TEMP, -2 1857#else 1858 addi TEMP, TEMP, -2 1859#endif 1860 slwi r0, TEMP, 1 + BASE_SHIFT 1861 slwi TEMP, TEMP, 1 + BASE_SHIFT 1862 add AO, AO, r0 1863 add BO, BO, TEMP 1864#endif 1865 1866#ifdef LEFT 1867 addi KK, KK, 2 1868#endif 1869#endif 1870 .align 4 1871 1872LL(60): 1873 andi. 
I, M, 1 1874 ble LL(69) 1875 1876#if defined(TRMMKERNEL) 1877 1878#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1879 1880 LFD f16, 0 * SIZE(AO) 1881 LFD f17, 1 * SIZE(AO) 1882 LFD f18, 2 * SIZE(AO) 1883 LFD f19, 3 * SIZE(AO) 1884 1885 LFD f20, 0 * SIZE(B) 1886 LFD f21, 1 * SIZE(B) 1887 LFD f22, 2 * SIZE(B) 1888 LFD f23, 3 * SIZE(B) 1889 1890 LFD f24, 4 * SIZE(B) 1891 LFD f25, 5 * SIZE(B) 1892 LFD f26, 6 * SIZE(B) 1893 LFD f27, 7 * SIZE(B) 1894 1895 mr BO, B 1896#else 1897 slwi r0, KK, 0 + BASE_SHIFT 1898 slwi TEMP, KK, 1 + BASE_SHIFT 1899 add AO, AO, r0 1900 add BO, B, TEMP 1901 1902 LFD f16, 0 * SIZE(AO) 1903 LFD f17, 1 * SIZE(AO) 1904 LFD f18, 2 * SIZE(AO) 1905 LFD f19, 3 * SIZE(AO) 1906 1907 LFD f20, 0 * SIZE(BO) 1908 LFD f21, 1 * SIZE(BO) 1909 LFD f22, 2 * SIZE(BO) 1910 LFD f23, 3 * SIZE(BO) 1911 1912 LFD f24, 4 * SIZE(BO) 1913 LFD f25, 5 * SIZE(BO) 1914 LFD f26, 6 * SIZE(BO) 1915 LFD f27, 7 * SIZE(BO) 1916#endif 1917 1918#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1919 sub TEMP, K, KK 1920#elif defined(LEFT) 1921 addi TEMP, KK, 1 1922#else 1923 addi TEMP, KK, 2 1924#endif 1925 srawi. TEMP, TEMP, 2 1926 mtspr CTR, TEMP 1927#else 1928 LFD f16, 0 * SIZE(AO) 1929 LFD f17, 1 * SIZE(AO) 1930 LFD f18, 2 * SIZE(AO) 1931 LFD f19, 3 * SIZE(AO) 1932 1933 LFD f20, 0 * SIZE(B) 1934 LFD f21, 1 * SIZE(B) 1935 LFD f22, 2 * SIZE(B) 1936 LFD f23, 3 * SIZE(B) 1937 1938 LFD f24, 4 * SIZE(B) 1939 LFD f25, 5 * SIZE(B) 1940 LFD f26, 6 * SIZE(B) 1941 LFD f27, 7 * SIZE(B) 1942 1943 srawi. 
r0, K, 2 1944 mtspr CTR, r0 1945 mr BO, B 1946#endif 1947 ble LL(65) 1948 .align 5 1949 1950LL(62): 1951 FMADD f0, f16, f20, f0 1952 FMADD f1, f16, f21, f1 1953 FMADD f2, f17, f22, f2 1954 FMADD f3, f17, f23, f3 1955 1956 LFD f20, 8 * SIZE(BO) 1957 LFD f21, 9 * SIZE(BO) 1958 LFD f22, 10 * SIZE(BO) 1959 LFD f23, 11 * SIZE(BO) 1960 1961 FMADD f0, f18, f24, f0 1962 FMADD f1, f18, f25, f1 1963 FMADD f2, f19, f26, f2 1964 FMADD f3, f19, f27, f3 1965 1966 LFD f16, 4 * SIZE(AO) 1967 LFD f17, 5 * SIZE(AO) 1968 LFD f18, 6 * SIZE(AO) 1969 LFD f19, 7 * SIZE(AO) 1970 1971 LFD f24, 12 * SIZE(BO) 1972 LFD f25, 13 * SIZE(BO) 1973 LFD f26, 14 * SIZE(BO) 1974 LFD f27, 15 * SIZE(BO) 1975 1976 addi AO, AO, 4 * SIZE 1977 addi BO, BO, 8 * SIZE 1978 bdnz LL(62) 1979 .align 4 1980 1981LL(65): 1982 lfd f30, ALPHA 1983 1984#if defined(TRMMKERNEL) 1985 1986#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1987 sub TEMP, K, KK 1988#elif defined(LEFT) 1989 addi TEMP, KK, 1 1990#else 1991 addi TEMP, KK, 2 1992#endif 1993 andi. TEMP, TEMP, 3 1994 mtspr CTR, TEMP 1995 1996#else 1997 andi. 
r0, K, 3 1998 mtspr CTR, r0 1999 2000#endif 2001 ble+ LL(68) 2002 .align 4 2003 2004LL(66): 2005 FMADD f0, f16, f20, f0 2006 FMADD f1, f16, f21, f1 2007 2008 LFD f16, 1 * SIZE(AO) 2009 2010 LFD f20, 2 * SIZE(BO) 2011 LFD f21, 3 * SIZE(BO) 2012 2013 addi BO, BO, 2 * SIZE 2014 addi AO, AO, 1 * SIZE 2015 bdnz LL(66) 2016 .align 4 2017 2018LL(68): 2019#ifndef TRMMKERNEL 2020 LFD f16, 0 * SIZE(CO1) 2021 LFD f18, 0 * SIZE(CO2) 2022 2023 FADD f0, f2, f0 2024 FADD f1, f3, f1 2025 2026 FMADD f0, f0, f30, f16 2027 FMADD f1, f1, f30, f18 2028#else 2029 FADD f0, f2, f0 2030 FADD f1, f3, f1 2031 2032 FMUL f0, f0, f30 2033 FMUL f1, f1, f30 2034#endif 2035 2036 STFD f0, 0 * SIZE(CO1) 2037 STFD f1, 0 * SIZE(CO2) 2038 2039 lfs f0, FZERO 2040 fmr f1, f0 2041 fmr f4, f0 2042 fmr f5, f0 2043 2044 2045#ifdef TRMMKERNEL 2046#if ( defined(LEFT) && defined(TRANSA)) || \ 2047 (!defined(LEFT) && !defined(TRANSA)) 2048 sub TEMP, K, KK 2049#ifdef LEFT 2050 addi TEMP, TEMP, -1 2051#else 2052 addi TEMP, TEMP, -2 2053#endif 2054 slwi r0, TEMP, 0 + BASE_SHIFT 2055 slwi TEMP, TEMP, 1 + BASE_SHIFT 2056 add AO, AO, r0 2057 add BO, BO, TEMP 2058#endif 2059 2060#ifdef LEFT 2061 addi KK, KK, 1 2062#endif 2063#endif 2064 .align 4 2065 2066LL(69): 2067#if defined(TRMMKERNEL) && !defined(LEFT) 2068 addi KK, KK, 2 2069#endif 2070 2071 mr B, BO 2072 .align 4 2073 2074LL(70): 2075 mr CO1, C 2076 andi. J, N, 1 2077 ble LL(999) 2078 2079#if defined(TRMMKERNEL) && defined(LEFT) 2080 mr KK, OFFSET 2081#endif 2082 2083 lfs f0, FZERO 2084 fmr f1, f0 2085 fmr f2, f0 2086 fmr f3, f0 2087 2088 srawi. 
I, M, 2 2089 mr AO, A 2090 ble LL(80) 2091 .align 4 2092 2093LL(71): 2094#if defined(TRMMKERNEL) 2095 2096#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2097 2098 LFD f16, 0 * SIZE(AO) 2099 LFD f17, 1 * SIZE(AO) 2100 LFD f18, 2 * SIZE(AO) 2101 LFD f19, 3 * SIZE(AO) 2102 2103 LFD f20, 0 * SIZE(B) 2104 LFD f21, 1 * SIZE(B) 2105 LFD f22, 2 * SIZE(B) 2106 LFD f23, 3 * SIZE(B) 2107 2108 mr BO, B 2109#else 2110 slwi r0, KK, 2 + BASE_SHIFT 2111 slwi TEMP, KK, 0 + BASE_SHIFT 2112 add AO, AO, r0 2113 add BO, B, TEMP 2114 2115 LFD f16, 0 * SIZE(AO) 2116 LFD f17, 1 * SIZE(AO) 2117 LFD f18, 2 * SIZE(AO) 2118 LFD f19, 3 * SIZE(AO) 2119 2120 LFD f20, 0 * SIZE(BO) 2121 LFD f21, 1 * SIZE(BO) 2122 LFD f22, 2 * SIZE(BO) 2123 LFD f23, 3 * SIZE(BO) 2124#endif 2125 2126 DCBTST(CO1, PREC) 2127 2128#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2129 sub TEMP, K, KK 2130#elif defined(LEFT) 2131 addi TEMP, KK, 4 2132#else 2133 addi TEMP, KK, 1 2134#endif 2135 srawi. TEMP, TEMP, 2 2136 mtspr CTR, TEMP 2137#else 2138 LFD f16, 0 * SIZE(AO) 2139 LFD f17, 1 * SIZE(AO) 2140 LFD f18, 2 * SIZE(AO) 2141 LFD f19, 3 * SIZE(AO) 2142 2143 LFD f20, 0 * SIZE(B) 2144 LFD f21, 1 * SIZE(B) 2145 LFD f22, 2 * SIZE(B) 2146 LFD f23, 3 * SIZE(B) 2147 2148 DCBTST(CO1, PREC) 2149 2150 srawi. 
r0, K, 2 2151 mtspr CTR, r0 2152 mr BO, B 2153#endif 2154 ble LL(75) 2155 .align 5 2156 2157LL(72): 2158 FMADD f0, f16, f20, f0 2159 FMADD f1, f17, f20, f1 2160 FMADD f2, f18, f20, f2 2161 FMADD f3, f19, f20, f3 2162 2163 LFD f16, 4 * SIZE(AO) 2164 LFD f17, 5 * SIZE(AO) 2165 LFD f18, 6 * SIZE(AO) 2166 LFD f19, 7 * SIZE(AO) 2167 2168 FMADD f0, f16, f21, f0 2169 FMADD f1, f17, f21, f1 2170 FMADD f2, f18, f21, f2 2171 FMADD f3, f19, f21, f3 2172 2173 LFD f16, 8 * SIZE(AO) 2174 LFD f17, 9 * SIZE(AO) 2175 LFD f18, 10 * SIZE(AO) 2176 LFD f19, 11 * SIZE(AO) 2177 2178 FMADD f0, f16, f22, f0 2179 FMADD f1, f17, f22, f1 2180 FMADD f2, f18, f22, f2 2181 FMADD f3, f19, f22, f3 2182 2183 LFD f16, 12 * SIZE(AO) 2184 LFD f17, 13 * SIZE(AO) 2185 LFD f18, 14 * SIZE(AO) 2186 LFD f19, 15 * SIZE(AO) 2187 2188 FMADD f0, f16, f23, f0 2189 FMADD f1, f17, f23, f1 2190 FMADD f2, f18, f23, f2 2191 FMADD f3, f19, f23, f3 2192 2193 LFD f16, 16 * SIZE(AO) 2194 LFD f17, 17 * SIZE(AO) 2195 LFD f18, 18 * SIZE(AO) 2196 LFD f19, 19 * SIZE(AO) 2197 2198 LFD f20, 4 * SIZE(BO) 2199 LFD f21, 5 * SIZE(BO) 2200 LFD f22, 6 * SIZE(BO) 2201 LFD f23, 7 * SIZE(BO) 2202 2203 addi AO, AO, 16 * SIZE 2204 addi BO, BO, 4 * SIZE 2205 DCBT(BO, PREB) 2206 bdnz LL(72) 2207 .align 4 2208 2209LL(75): 2210 lfd f30, ALPHA 2211#if defined(TRMMKERNEL) 2212 2213#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2214 sub TEMP, K, KK 2215#elif defined(LEFT) 2216 addi TEMP, KK, 4 2217#else 2218 addi TEMP, KK, 1 2219#endif 2220 andi. TEMP, TEMP, 3 2221 mtspr CTR, TEMP 2222 2223#else 2224 andi. 
r0, K, 3 2225 mtspr CTR, r0 2226 2227#endif 2228 ble+ LL(78) 2229 .align 4 2230 2231LL(76): 2232 FMADD f0, f16, f20, f0 2233 FMADD f1, f17, f20, f1 2234 FMADD f2, f18, f20, f2 2235 FMADD f3, f19, f20, f3 2236 2237 LFD f16, 4 * SIZE(AO) 2238 LFD f17, 5 * SIZE(AO) 2239 LFD f18, 6 * SIZE(AO) 2240 LFD f19, 7 * SIZE(AO) 2241 2242 LFD f20, 1 * SIZE(BO) 2243 2244 addi BO, BO, 1 * SIZE 2245 addi AO, AO, 4 * SIZE 2246 bdnz LL(76) 2247 .align 4 2248 2249LL(78): 2250#ifndef TRMMKERNEL 2251 LFD f16, 0 * SIZE(CO1) 2252 LFD f17, 1 * SIZE(CO1) 2253 LFD f18, 2 * SIZE(CO1) 2254 LFD f19, 3 * SIZE(CO1) 2255 2256 FMADD f0, f0, f30, f16 2257 FMADD f1, f1, f30, f17 2258 FMADD f2, f2, f30, f18 2259 FMADD f3, f3, f30, f19 2260#else 2261 FMUL f0, f0, f30 2262 FMUL f1, f1, f30 2263 FMUL f2, f2, f30 2264 FMUL f3, f3, f30 2265#endif 2266 2267 STFD f0, 0 * SIZE(CO1) 2268 STFD f1, 1 * SIZE(CO1) 2269 STFD f2, 2 * SIZE(CO1) 2270 STFD f3, 3 * SIZE(CO1) 2271 2272 lfs f0, FZERO 2273 fmr f1, f0 2274 fmr f2, f0 2275 fmr f3, f0 2276 2277#ifdef TRMMKERNEL 2278#if ( defined(LEFT) && defined(TRANSA)) || \ 2279 (!defined(LEFT) && !defined(TRANSA)) 2280 sub TEMP, K, KK 2281#ifdef LEFT 2282 addi TEMP, TEMP, -4 2283#else 2284 addi TEMP, TEMP, -1 2285#endif 2286 slwi r0 , TEMP, 2 + BASE_SHIFT 2287 slwi TEMP, TEMP, 0 + BASE_SHIFT 2288 add AO, AO, r0 2289 add BO, BO, TEMP 2290#endif 2291 2292#ifdef LEFT 2293 addi KK, KK, 4 2294#endif 2295#endif 2296 2297 addi CO1, CO1, 4 * SIZE 2298 addic. I, I, -1 2299 bgt+ LL(71) 2300 .align 4 2301 2302LL(80): 2303 andi. 
I, M, 2 2304 ble LL(90) 2305 2306#if defined(TRMMKERNEL) 2307 2308#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2309 2310 LFD f16, 0 * SIZE(AO) 2311 LFD f17, 1 * SIZE(AO) 2312 LFD f18, 2 * SIZE(AO) 2313 LFD f19, 3 * SIZE(AO) 2314 2315 LFD f20, 0 * SIZE(B) 2316 LFD f21, 1 * SIZE(B) 2317 LFD f22, 2 * SIZE(B) 2318 LFD f23, 3 * SIZE(B) 2319 2320 mr BO, B 2321#else 2322 slwi r0, KK, 1 + BASE_SHIFT 2323 slwi TEMP, KK, 0 + BASE_SHIFT 2324 add AO, AO, r0 2325 add BO, B, TEMP 2326 2327 LFD f16, 0 * SIZE(AO) 2328 LFD f17, 1 * SIZE(AO) 2329 LFD f18, 2 * SIZE(AO) 2330 LFD f19, 3 * SIZE(AO) 2331 2332 LFD f20, 0 * SIZE(BO) 2333 LFD f21, 1 * SIZE(BO) 2334 LFD f22, 2 * SIZE(BO) 2335 LFD f23, 3 * SIZE(BO) 2336#endif 2337 2338#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2339 sub TEMP, K, KK 2340#elif defined(LEFT) 2341 addi TEMP, KK, 2 2342#else 2343 addi TEMP, KK, 1 2344#endif 2345 srawi. TEMP, TEMP, 2 2346 mtspr CTR, TEMP 2347 2348#else 2349 LFD f16, 0 * SIZE(AO) 2350 LFD f17, 1 * SIZE(AO) 2351 LFD f18, 2 * SIZE(AO) 2352 LFD f19, 3 * SIZE(AO) 2353 2354 LFD f20, 0 * SIZE(B) 2355 LFD f21, 1 * SIZE(B) 2356 LFD f22, 2 * SIZE(B) 2357 LFD f23, 3 * SIZE(B) 2358 2359 srawi. 
r0, K, 2 2360 mtspr CTR, r0 2361 mr BO, B 2362 2363#endif 2364 ble LL(85) 2365 .align 5 2366 2367LL(82): 2368 FMADD f0, f16, f20, f0 2369 FMADD f1, f17, f20, f1 2370 FMADD f2, f18, f21, f2 2371 FMADD f3, f19, f21, f3 2372 2373 LFD f16, 4 * SIZE(AO) 2374 LFD f17, 5 * SIZE(AO) 2375 LFD f18, 6 * SIZE(AO) 2376 LFD f19, 7 * SIZE(AO) 2377 2378 FMADD f0, f16, f22, f0 2379 FMADD f1, f17, f22, f1 2380 FMADD f2, f18, f23, f2 2381 FMADD f3, f19, f23, f3 2382 2383 LFD f16, 8 * SIZE(AO) 2384 LFD f17, 9 * SIZE(AO) 2385 LFD f18, 10 * SIZE(AO) 2386 LFD f19, 11 * SIZE(AO) 2387 2388 LFD f20, 4 * SIZE(BO) 2389 LFD f21, 5 * SIZE(BO) 2390 LFD f22, 6 * SIZE(BO) 2391 LFD f23, 7 * SIZE(BO) 2392 2393 addi AO, AO, 8 * SIZE 2394 addi BO, BO, 4 * SIZE 2395 DCBT(BO, PREB) 2396 bdnz LL(82) 2397 .align 4 2398 2399LL(85): 2400 lfd f30, ALPHA 2401#if defined(TRMMKERNEL) 2402 2403#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2404 sub TEMP, K, KK 2405#elif defined(LEFT) 2406 addi TEMP, KK, 2 2407#else 2408 addi TEMP, KK, 1 2409#endif 2410 andi. TEMP, TEMP, 3 2411 mtspr CTR, TEMP 2412 2413#else 2414 2415 andi. 
r0, K, 3 2416 mtspr CTR, r0 2417 2418#endif 2419 ble+ LL(88) 2420 .align 4 2421 2422LL(86): 2423 FMADD f0, f16, f20, f0 2424 FMADD f1, f17, f20, f1 2425 2426 LFD f16, 2 * SIZE(AO) 2427 LFD f17, 3 * SIZE(AO) 2428 LFD f20, 1 * SIZE(BO) 2429 2430 addi BO, BO, 1 * SIZE 2431 addi AO, AO, 2 * SIZE 2432 bdnz LL(86) 2433 .align 4 2434 2435LL(88): 2436#ifndef TRMMKERNEL 2437 LFD f16, 0 * SIZE(CO1) 2438 LFD f17, 1 * SIZE(CO1) 2439 2440 FADD f0, f2, f0 2441 FADD f1, f3, f1 2442 2443 FMADD f0, f0, f30, f16 2444 FMADD f1, f1, f30, f17 2445#else 2446 FADD f0, f2, f0 2447 FADD f1, f3, f1 2448 2449 FMUL f0, f0, f30 2450 FMUL f1, f1, f30 2451#endif 2452 2453 STFD f0, 0 * SIZE(CO1) 2454 STFD f1, 1 * SIZE(CO1) 2455 2456 lfs f0, FZERO 2457 fmr f1, f0 2458 fmr f2, f0 2459 fmr f3, f0 2460 2461 addi CO1, CO1, 2 * SIZE 2462 2463#ifdef TRMMKERNEL 2464#if ( defined(LEFT) && defined(TRANSA)) || \ 2465 (!defined(LEFT) && !defined(TRANSA)) 2466 sub TEMP, K, KK 2467#ifdef LEFT 2468 addi TEMP, TEMP, -2 2469#else 2470 addi TEMP, TEMP, -1 2471#endif 2472 slwi r0 , TEMP, 1 + BASE_SHIFT 2473 slwi TEMP, TEMP, 0 + BASE_SHIFT 2474 add AO, AO, r0 2475 add BO, BO, TEMP 2476#endif 2477 2478#ifdef LEFT 2479 addi KK, KK, 2 2480#endif 2481#endif 2482 .align 4 2483 2484LL(90): 2485 andi. 
I, M, 1 2486 ble LL(999) 2487 2488 2489#if defined(TRMMKERNEL) 2490 2491#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2492 2493 LFD f16, 0 * SIZE(AO) 2494 LFD f17, 1 * SIZE(AO) 2495 LFD f18, 2 * SIZE(AO) 2496 LFD f19, 3 * SIZE(AO) 2497 2498 LFD f20, 0 * SIZE(B) 2499 LFD f21, 1 * SIZE(B) 2500 LFD f22, 2 * SIZE(B) 2501 LFD f23, 3 * SIZE(B) 2502 2503 mr BO, B 2504#else 2505 slwi r0, KK, 0 + BASE_SHIFT 2506 slwi TEMP, KK, 0 + BASE_SHIFT 2507 add AO, AO, r0 2508 add BO, B, TEMP 2509 2510 LFD f16, 0 * SIZE(AO) 2511 LFD f17, 1 * SIZE(AO) 2512 LFD f18, 2 * SIZE(AO) 2513 LFD f19, 3 * SIZE(AO) 2514 2515 LFD f20, 0 * SIZE(BO) 2516 LFD f21, 1 * SIZE(BO) 2517 LFD f22, 2 * SIZE(BO) 2518 LFD f23, 3 * SIZE(BO) 2519#endif 2520 2521#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2522 sub TEMP, K, KK 2523#elif defined(LEFT) 2524 addi TEMP, KK, 1 2525#else 2526 addi TEMP, KK, 1 2527#endif 2528 srawi. TEMP, TEMP, 3 2529 mtspr CTR, TEMP 2530 2531#else 2532 LFD f16, 0 * SIZE(AO) 2533 LFD f17, 1 * SIZE(AO) 2534 LFD f18, 2 * SIZE(AO) 2535 LFD f19, 3 * SIZE(AO) 2536 2537 LFD f20, 0 * SIZE(B) 2538 LFD f21, 1 * SIZE(B) 2539 LFD f22, 2 * SIZE(B) 2540 LFD f23, 3 * SIZE(B) 2541 2542 srawi. 
r0, K, 3 2543 mtspr CTR, r0 2544 mr BO, B 2545#endif 2546 ble LL(95) 2547 .align 5 2548 2549LL(92): 2550 FMADD f0, f16, f20, f0 2551 FMADD f1, f17, f21, f1 2552 FMADD f2, f18, f22, f2 2553 FMADD f3, f19, f23, f3 2554 2555 LFD f16, 4 * SIZE(AO) 2556 LFD f17, 5 * SIZE(AO) 2557 LFD f18, 6 * SIZE(AO) 2558 LFD f19, 7 * SIZE(AO) 2559 2560 LFD f20, 4 * SIZE(BO) 2561 LFD f21, 5 * SIZE(BO) 2562 LFD f22, 6 * SIZE(BO) 2563 LFD f23, 7 * SIZE(BO) 2564 2565 FMADD f0, f16, f20, f0 2566 FMADD f1, f17, f21, f1 2567 FMADD f2, f18, f22, f2 2568 FMADD f3, f19, f23, f3 2569 2570 LFD f16, 8 * SIZE(AO) 2571 LFD f17, 9 * SIZE(AO) 2572 LFD f18, 10 * SIZE(AO) 2573 LFD f19, 11 * SIZE(AO) 2574 2575 LFD f20, 8 * SIZE(BO) 2576 LFD f21, 9 * SIZE(BO) 2577 LFD f22, 10 * SIZE(BO) 2578 LFD f23, 11 * SIZE(BO) 2579 2580 addi AO, AO, 8 * SIZE 2581 addi BO, BO, 8 * SIZE 2582 bdnz LL(92) 2583 .align 4 2584 2585LL(95): 2586 lfd f30, ALPHA 2587 2588#if defined(TRMMKERNEL) 2589 2590#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2591 sub TEMP, K, KK 2592#elif defined(LEFT) 2593 addi TEMP, KK, 1 2594#else 2595 addi TEMP, KK, 1 2596#endif 2597 andi. TEMP, TEMP, 7 2598 mtspr CTR, TEMP 2599 2600#else 2601 2602 andi. 
r0, K, 7 2603 mtspr CTR, r0 2604 2605#endif 2606 ble+ LL(98) 2607 .align 4 2608 2609LL(96): 2610 FMADD f0, f16, f20, f0 2611 LFD f16, 1 * SIZE(AO) 2612 LFD f20, 1 * SIZE(BO) 2613 addi BO, BO, 1 * SIZE 2614 addi AO, AO, 1 * SIZE 2615 bdnz LL(96) 2616 .align 4 2617 2618LL(98): 2619#ifndef TRMMKERNEL 2620 LFD f16, 0 * SIZE(CO1) 2621 2622 FADD f0, f1, f0 2623 FADD f2, f3, f2 2624 FADD f0, f2, f0 2625 2626 FMADD f0, f0, f30, f16 2627#else 2628 FADD f0, f1, f0 2629 FADD f2, f3, f2 2630 FADD f0, f2, f0 2631 2632 FMUL f0, f0, f30 2633#endif 2634 2635 STFD f0, 0 * SIZE(CO1) 2636 .align 4 2637 2638LL(999): 2639 addi r3, 0, 0 2640 2641 lfd f14, 0(SP) 2642 lfd f15, 8(SP) 2643 lfd f16, 16(SP) 2644 lfd f17, 24(SP) 2645 2646 lfd f18, 32(SP) 2647 lfd f19, 40(SP) 2648 lfd f20, 48(SP) 2649 lfd f21, 56(SP) 2650 2651 lfd f22, 64(SP) 2652 lfd f23, 72(SP) 2653 lfd f24, 80(SP) 2654 lfd f25, 88(SP) 2655 2656 lfd f26, 96(SP) 2657 lfd f27, 104(SP) 2658 lfd f28, 112(SP) 2659 lfd f29, 120(SP) 2660 2661 lfd f30, 128(SP) 2662 lfd f31, 136(SP) 2663 2664#ifdef __64BIT__ 2665 ld r31, 144(SP) 2666 ld r30, 152(SP) 2667 ld r29, 160(SP) 2668 ld r28, 168(SP) 2669 ld r27, 176(SP) 2670 ld r26, 184(SP) 2671 ld r25, 192(SP) 2672 ld r24, 200(SP) 2673 ld r23, 208(SP) 2674 ld r22, 216(SP) 2675 ld r21, 224(SP) 2676 ld r20, 232(SP) 2677#if defined(TRMMKERNEL) || defined(TRSMKERNEL) 2678 ld r19, 240(SP) 2679 ld r18, 248(SP) 2680#endif 2681#else 2682 lwz r31, 144(SP) 2683 lwz r30, 148(SP) 2684 lwz r29, 152(SP) 2685 lwz r28, 156(SP) 2686 lwz r27, 160(SP) 2687 lwz r26, 164(SP) 2688 lwz r25, 168(SP) 2689 lwz r24, 172(SP) 2690 lwz r23, 176(SP) 2691 lwz r22, 180(SP) 2692 lwz r21, 184(SP) 2693 lwz r20, 188(SP) 2694#if defined(TRMMKERNEL) || defined(TRSMKERNEL) 2695 lwz r19, 192(SP) 2696 lwz r18, 196(SP) 2697#endif 2698#endif 2699 2700 addi SP, SP, STACKSIZE 2701 2702 blr 2703 2704 EPILOGUE 2705#endif 2706