1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#ifndef __64BIT__ 43#define LOAD lwz 44#else 45#define LOAD ld 46#endif 47 48#ifdef __64BIT__ 49#define STACKSIZE 320 50#define ALPHA 296(SP) 51#define FZERO 304(SP) 52#else 53#define STACKSIZE 240 54#define ALPHA 224(SP) 55#define FZERO 232(SP) 56#endif 57 58#define M r3 59#define N r4 60#define K r5 61 62#ifdef linux 63#ifndef __64BIT__ 64#define A r6 65#define B r7 66#define C r8 67#define LDC r9 68#define OFFSET r10 69#else 70#define A r7 71#define B r8 72#define C r9 73#define LDC r10 74#define OFFSET r6 75#endif 76#endif 77 78#if defined(_AIX) || defined(__APPLE__) 79#if !defined(__64BIT__) && defined(DOUBLE) 80#define A r8 81#define B r9 82#define C r10 83#define LDC r7 84#define OFFSET r6 85#else 86#define A r7 87#define B r8 88#define C r9 89#define LDC r10 90#define OFFSET r6 91#endif 92#endif 93 94#define AORIG r18 95#define TEMP r19 96#define KK r20 97#define I r21 98#define J r22 99#define AO r23 100#define BO r24 101#define CO1 r25 102#define CO2 r26 103#define CO3 r27 104#define CO4 r28 105 106#define A1 f16 107#define A2 f17 108#define A3 f18 109#define A4 f19 110#define A5 f20 111#define A6 f21 112#define B1 f22 113#define B2 f23 114#define B3 f24 115#define B4 f25 116#define B5 f26 117#define B6 f27 118#define B7 f28 119#define B8 f29 120#define B9 f30 121#define B10 f31 122 123 PROLOGUE 124 PROFCODE 125 126 addi SP, SP, -STACKSIZE 127 li r0, 0 128 129 stfd f14, 0(SP) 130 stfd f15, 8(SP) 131 stfd f16, 16(SP) 132 stfd f17, 24(SP) 133 134 stfd f18, 32(SP) 135 stfd f19, 40(SP) 136 stfd f20, 48(SP) 137 stfd f21, 56(SP) 138 139 stfd f22, 64(SP) 140 stfd f23, 72(SP) 141 stfd f24, 80(SP) 142 stfd f25, 88(SP) 143 144 stfd f26, 96(SP) 145 stfd f27, 104(SP) 146 stfd f28, 112(SP) 147 stfd f29, 120(SP) 148 149 stfd f30, 128(SP) 150 stfd f31, 136(SP) 151 152#ifdef __64BIT__ 153 std r31, 144(SP) 154 std r30, 152(SP) 155 std r29, 160(SP) 
156 std r28, 168(SP) 157 std r27, 176(SP) 158 std r26, 184(SP) 159 std r25, 192(SP) 160 std r24, 200(SP) 161 std r23, 208(SP) 162 std r22, 216(SP) 163 std r21, 224(SP) 164 std r20, 232(SP) 165 std r19, 240(SP) 166 std r18, 248(SP) 167#else 168 stw r31, 144(SP) 169 stw r30, 148(SP) 170 stw r29, 152(SP) 171 stw r28, 156(SP) 172 stw r27, 160(SP) 173 stw r26, 164(SP) 174 stw r25, 168(SP) 175 stw r24, 172(SP) 176 stw r23, 176(SP) 177 stw r22, 180(SP) 178 stw r21, 184(SP) 179 stw r20, 188(SP) 180 stw r19, 192(SP) 181 stw r18, 196(SP) 182#endif 183 184 stw r0, FZERO 185 186#if defined(_AIX) || defined(__APPLE__) 187#if !defined(__64BIT__) && defined(DOUBLE) 188 lwz LDC, 56 + STACKSIZE(SP) 189#endif 190#endif 191 192 slwi LDC, LDC, BASE_SHIFT 193 194#if defined(linux) && defined(__64BIT__) 195 ld OFFSET, 112 + STACKSIZE(SP) 196#endif 197 198#if defined(_AIX) || defined(__APPLE__) 199#ifdef __64BIT__ 200 ld OFFSET, 112 + STACKSIZE(SP) 201#else 202#ifdef DOUBLE 203 lwz OFFSET, 60 + STACKSIZE(SP) 204#else 205 lwz OFFSET, 56 + STACKSIZE(SP) 206#endif 207#endif 208#endif 209 210#ifdef LN 211 mullw r0, M, K 212 slwi r0, r0, BASE_SHIFT 213 add A, A, r0 214 215 slwi r0, M, BASE_SHIFT 216 add C, C, r0 217#endif 218 219#ifdef RN 220 neg KK, OFFSET 221#endif 222 223#ifdef RT 224 mullw r0, N, K 225 slwi r0, r0, BASE_SHIFT 226 add B, B, r0 227 228 mullw r0, N, LDC 229 add C, C, r0 230 231 sub KK, N, OFFSET 232#endif 233 234 cmpwi cr0, M, 0 235 ble .L999 236 cmpwi cr0, N, 0 237 ble .L999 238 cmpwi cr0, K, 0 239 ble .L999 240 241 lfs f0, FZERO 242 243 srawi. 
J, N, 2 244 ble .L40 245 .align 4 246 247.L10: 248 249#ifdef RT 250 slwi r0, K, 2 + BASE_SHIFT 251 sub B, B, r0 252 253 slwi r0, LDC, 2 254 sub C, C, r0 255#endif 256 257 mr CO1, C 258 add CO2, C, LDC 259 add CO3, CO2, LDC 260 add CO4, CO3, LDC 261 262#ifdef LN 263 add KK, M, OFFSET 264#endif 265 266#ifdef LT 267 mr KK, OFFSET 268#endif 269 270 fmr f1, f0 271 fmr f2, f0 272 fmr f3, f0 273 fmr f4, f0 274 fmr f5, f0 275 fmr f6, f0 276 fmr f7, f0 277 fmr f8, f0 278 fmr f9, f0 279 fmr f10, f0 280 fmr f11, f0 281 fmr f12, f0 282 fmr f13, f0 283 fmr f14, f0 284 fmr f15, f0 285 286#if defined(LN) || defined(RT) 287 mr AORIG, A 288#else 289 mr AO, A 290#endif 291#ifndef RT 292 add C, CO4, LDC 293#endif 294 295.L30: 296 andi. I, M, 1 297 ble .L20 298 299#if defined(LT) || defined(RN) 300 LFD f16, 0 * SIZE(AO) 301 LFD f17, 1 * SIZE(AO) 302 LFD f18, 2 * SIZE(AO) 303 LFD f19, 3 * SIZE(AO) 304 305 LFD f20, 0 * SIZE(B) 306 LFD f21, 1 * SIZE(B) 307 LFD f22, 2 * SIZE(B) 308 LFD f23, 3 * SIZE(B) 309 310 LFD f24, 4 * SIZE(B) 311 LFD f25, 5 * SIZE(B) 312 LFD f26, 6 * SIZE(B) 313 LFD f27, 7 * SIZE(B) 314 315 srawi. r0, KK, 2 316 mtspr CTR, r0 317 mr BO, B 318#else 319 320#ifdef LN 321 slwi r0, K, BASE_SHIFT 322 sub AORIG, AORIG, r0 323#endif 324 325 slwi r0, KK, 0 + BASE_SHIFT 326 slwi TEMP, KK, 2 + BASE_SHIFT 327 add AO, AORIG, r0 328 add BO, B, TEMP 329 330 sub TEMP, K, KK 331 332 LFD f16, 0 * SIZE(AO) 333 LFD f17, 1 * SIZE(AO) 334 LFD f18, 2 * SIZE(AO) 335 LFD f19, 3 * SIZE(AO) 336 337 LFD f20, 0 * SIZE(BO) 338 LFD f21, 1 * SIZE(BO) 339 LFD f22, 2 * SIZE(BO) 340 LFD f23, 3 * SIZE(BO) 341 342 LFD f24, 4 * SIZE(BO) 343 LFD f25, 5 * SIZE(BO) 344 LFD f26, 6 * SIZE(BO) 345 LFD f27, 7 * SIZE(BO) 346 347 srawi. 
r0, TEMP, 2 348 mtspr CTR, r0 349#endif 350 ble .L35 351 .align 5 352 353.L32: 354 FMADD f0, f16, f20, f0 355 LFD f20, 8 * SIZE(BO) 356 FMADD f4, f16, f21, f4 357 LFD f21, 9 * SIZE(BO) 358 FMADD f8, f16, f22, f8 359 LFD f22, 10 * SIZE(BO) 360 FMADD f12, f16, f23, f12 361 LFD f23, 11 * SIZE(BO) 362 LFDU f16, 4 * SIZE(AO) 363 364 FMADD f1, f17, f24, f1 365 LFD f24, 12 * SIZE(BO) 366 FMADD f5, f17, f25, f5 367 LFD f25, 13 * SIZE(BO) 368 FMADD f9, f17, f26, f9 369 LFD f26, 14 * SIZE(BO) 370 FMADD f13, f17, f27, f13 371 LFD f27, 15 * SIZE(BO) 372 LFD f17, 1 * SIZE(AO) 373 374 FMADD f0, f18, f20, f0 375 LFDU f20, 16 * SIZE(BO) 376 FMADD f4, f18, f21, f4 377 LFD f21, 1 * SIZE(BO) 378 FMADD f8, f18, f22, f8 379 LFD f22, 2 * SIZE(BO) 380 FMADD f12, f18, f23, f12 381 LFD f23, 3 * SIZE(BO) 382 LFD f18, 2 * SIZE(AO) 383 384 FMADD f1, f19, f24, f1 385 LFD f24, 4 * SIZE(BO) 386 FMADD f5, f19, f25, f5 387 LFD f25, 5 * SIZE(BO) 388 FMADD f9, f19, f26, f9 389 LFD f26, 6 * SIZE(BO) 390 FMADD f13, f19, f27, f13 391 LFD f27, 7 * SIZE(BO) 392 LFD f19, 3 * SIZE(AO) 393 bdnz .L32 394 395 fadd f0, f1, f0 396 fadd f4, f5, f4 397 fadd f8, f9, f8 398 fadd f12, f13, f12 399 .align 4 400 401.L35: 402#if defined(LT) || defined(RN) 403 andi. r0, KK, 3 404#else 405 andi. 
r0, TEMP, 3 406#endif 407 mtspr CTR, r0 408 ble+ .L38 409 .align 4 410 411.L36: 412 FMADD f0, f16, f20, f0 413 LFDU f20, 4 * SIZE(BO) 414 FMADD f4, f16, f21, f4 415 LFD f21, 1 * SIZE(BO) 416 FMADD f8, f16, f22, f8 417 LFD f22, 2 * SIZE(BO) 418 FMADD f12, f16, f23, f12 419 LFDU f16, 1 * SIZE(AO) 420 LFD f23, 3 * SIZE(BO) 421 bdnz .L36 422 .align 4 423 424.L38: 425#if defined(LN) || defined(RT) 426#ifdef LN 427 subi r0, KK, 1 428#else 429 subi r0, KK, 4 430#endif 431 slwi TEMP, r0, 0 + BASE_SHIFT 432 slwi r0, r0, 2 + BASE_SHIFT 433 add AO, AORIG, TEMP 434 add BO, B, r0 435#endif 436 437#if defined(LN) || defined(LT) 438 LFD f16, 0 * SIZE(BO) 439 LFD f17, 1 * SIZE(BO) 440 LFD f18, 2 * SIZE(BO) 441 LFD f19, 3 * SIZE(BO) 442 443 FSUB f0, f16, f0 444 FSUB f4, f17, f4 445 FSUB f8, f18, f8 446 FSUB f12, f19, f12 447#else 448 LFD f16, 0 * SIZE(AO) 449 LFD f20, 1 * SIZE(AO) 450 LFD f24, 2 * SIZE(AO) 451 LFD f28, 3 * SIZE(AO) 452 453 FSUB f0, f16, f0 454 FSUB f4, f20, f4 455 FSUB f8, f24, f8 456 FSUB f12, f28, f12 457#endif 458 459#ifdef LN 460 LFD f21, 0 * SIZE(AO) 461 462 FMUL f0, f21, f0 463 FMUL f4, f21, f4 464 FMUL f8, f21, f8 465 FMUL f12, f21, f12 466#endif 467 468#ifdef LT 469 LFD f16, 0 * SIZE(AO) 470 471 FMUL f0, f16, f0 472 FMUL f4, f16, f4 473 FMUL f8, f16, f8 474 FMUL f12, f16, f12 475#endif 476 477#ifdef RN 478 LFD f16, 0 * SIZE(BO) 479 LFD f17, 1 * SIZE(BO) 480 LFD f18, 2 * SIZE(BO) 481 LFD f19, 3 * SIZE(BO) 482 483 FMUL f0, f16, f0 484 FNMSUB f4, f17, f0, f4 485 FNMSUB f8, f18, f0, f8 486 FNMSUB f12, f19, f0, f12 487 488 LFD f16, 5 * SIZE(BO) 489 LFD f17, 6 * SIZE(BO) 490 LFD f18, 7 * SIZE(BO) 491 LFD f19, 10 * SIZE(BO) 492 493 LFD f20, 11 * SIZE(BO) 494 LFD f21, 15 * SIZE(BO) 495 496 FMUL f4, f16, f4 497 FNMSUB f8, f17, f4, f8 498 FNMSUB f12, f18, f4, f12 499 FMUL f8, f19, f8 500 FNMSUB f12, f20, f8, f12 501 FMUL f12, f21, f12 502#endif 503 504#ifdef RT 505 LFD f16, 15 * SIZE(BO) 506 LFD f17, 14 * SIZE(BO) 507 LFD f18, 13 * SIZE(BO) 508 LFD f19, 12 * SIZE(BO) 
509 510 FMUL f12, f16, f12 511 FNMSUB f8, f17, f12, f8 512 FNMSUB f4, f18, f12, f4 513 FNMSUB f0, f19, f12, f0 514 515 LFD f16, 10 * SIZE(BO) 516 LFD f17, 9 * SIZE(BO) 517 LFD f18, 8 * SIZE(BO) 518 LFD f19, 5 * SIZE(BO) 519 520 FMUL f8, f16, f8 521 522 LFD f20, 4 * SIZE(BO) 523 LFD f21, 0 * SIZE(BO) 524 525 FNMSUB f4, f17, f8, f4 526 FNMSUB f0, f18, f8, f0 527 528 FMUL f4, f19, f4 529 FNMSUB f0, f20, f4, f0 530 FMUL f0, f21, f0 531#endif 532 533#ifdef LN 534 subi CO1, CO1, 1 * SIZE 535 subi CO2, CO2, 1 * SIZE 536 subi CO3, CO3, 1 * SIZE 537 subi CO4, CO4, 1 * SIZE 538#endif 539 540#if defined(LN) || defined(LT) 541 STFD f0, 0 * SIZE(BO) 542 STFD f4, 1 * SIZE(BO) 543 STFD f8, 2 * SIZE(BO) 544 STFD f12, 3 * SIZE(BO) 545#else 546 STFD f0, 0 * SIZE(AO) 547 STFD f4, 1 * SIZE(AO) 548 STFD f8, 2 * SIZE(AO) 549 STFD f12, 3 * SIZE(AO) 550#endif 551 552 STFD f0, 0 * SIZE(CO1) 553 STFD f4, 0 * SIZE(CO2) 554 STFD f8, 0 * SIZE(CO3) 555 STFD f12, 0 * SIZE(CO4) 556 557 lfs f0, FZERO 558 fmr f1, f0 559 fmr f4, f0 560 fmr f5, f0 561 562 fmr f8, f0 563 fmr f9, f0 564 fmr f12, f0 565 fmr f13, f0 566 567#ifndef LN 568 addi CO1, CO1, 1 * SIZE 569 addi CO2, CO2, 1 * SIZE 570 addi CO3, CO3, 1 * SIZE 571 addi CO4, CO4, 1 * SIZE 572#endif 573 574#ifdef RT 575 slwi r0, K, 0 + BASE_SHIFT 576 add AORIG, AORIG, r0 577#endif 578 579#if defined(LT) || defined(RN) 580 sub TEMP, K, KK 581 slwi r0, TEMP, 0 + BASE_SHIFT 582 slwi TEMP, TEMP, 2 + BASE_SHIFT 583 add AO, AO, r0 584 add BO, BO, TEMP 585#endif 586 587#ifdef LN 588 subi KK, KK, 1 589#endif 590 591#ifdef LT 592 addi KK, KK, 1 593#endif 594 .align 4 595 596.L20: 597 andi. 
I, M, 2 598 ble .L09 599 600#if defined(LT) || defined(RN) 601 LFD f16, 0 * SIZE(AO) 602 LFD f17, 1 * SIZE(AO) 603 LFD f18, 2 * SIZE(AO) 604 LFD f19, 3 * SIZE(AO) 605 606 LFD f20, 0 * SIZE(B) 607 LFD f21, 1 * SIZE(B) 608 LFD f22, 2 * SIZE(B) 609 LFD f23, 3 * SIZE(B) 610 611 LFD f24, 4 * SIZE(B) 612 LFD f25, 5 * SIZE(B) 613 LFD f26, 6 * SIZE(B) 614 LFD f27, 7 * SIZE(B) 615 616 srawi. r0, KK, 2 617 mtspr CTR, r0 618 mr BO, B 619#else 620 621#ifdef LN 622 slwi r0, K, 1 + BASE_SHIFT 623 sub AORIG, AORIG, r0 624#endif 625 626 slwi r0, KK, 1 + BASE_SHIFT 627 slwi TEMP, KK, 2 + BASE_SHIFT 628 add AO, AORIG, r0 629 add BO, B, TEMP 630 631 sub TEMP, K, KK 632 633 LFD f16, 0 * SIZE(AO) 634 LFD f17, 1 * SIZE(AO) 635 LFD f18, 2 * SIZE(AO) 636 LFD f19, 3 * SIZE(AO) 637 638 LFD f20, 0 * SIZE(BO) 639 LFD f21, 1 * SIZE(BO) 640 LFD f22, 2 * SIZE(BO) 641 LFD f23, 3 * SIZE(BO) 642 643 LFD f24, 4 * SIZE(BO) 644 LFD f25, 5 * SIZE(BO) 645 LFD f26, 6 * SIZE(BO) 646 LFD f27, 7 * SIZE(BO) 647 648 srawi. r0, TEMP, 2 649 mtspr CTR, r0 650#endif 651 ble .L25 652 .align 5 653 654.L22: 655 FMADD f0, f16, f20, f0 656 nop 657 FMADD f1, f17, f20, f1 658 LFD f20, 8 * SIZE(BO) 659 FMADD f4, f16, f21, f4 660 nop 661 FMADD f5, f17, f21, f5 662 LFD f21, 9 * SIZE(BO) 663 664 FMADD f8, f16, f22, f8 665 nop 666 FMADD f9, f17, f22, f9 667 LFD f22, 10 * SIZE(BO) 668 FMADD f12, f16, f23, f12 669 LFD f16, 4 * SIZE(AO) 670 FMADD f13, f17, f23, f13 671 LFD f23, 11 * SIZE(BO) 672 673 FMADD f2, f18, f24, f2 674 LFD f17, 5 * SIZE(AO) 675 FMADD f3, f19, f24, f3 676 LFD f24, 12 * SIZE(BO) 677 FMADD f6, f18, f25, f6 678 nop 679 FMADD f7, f19, f25, f7 680 LFD f25, 13 * SIZE(BO) 681 682 FMADD f10, f18, f26, f10 683 nop 684 FMADD f11, f19, f26, f11 685 LFD f26, 14 * SIZE(BO) 686 FMADD f14, f18, f27, f14 687 LFD f18, 6 * SIZE(AO) 688 FMADD f15, f19, f27, f15 689 LFD f27, 15 * SIZE(BO) 690 691 FMADD f0, f16, f20, f0 692 LFD f19, 7 * SIZE(AO) 693 FMADD f1, f17, f20, f1 694 LFDU f20, 16 * SIZE(BO) 695 FMADD f4, f16, f21, f4 
696 nop 697 FMADD f5, f17, f21, f5 698 LFD f21, 1 * SIZE(BO) 699 700 FMADD f8, f16, f22, f8 701 nop 702 FMADD f9, f17, f22, f9 703 LFD f22, 2 * SIZE(BO) 704 FMADD f12, f16, f23, f12 705 LFDU f16, 8 * SIZE(AO) 706 FMADD f13, f17, f23, f13 707 LFD f23, 3 * SIZE(BO) 708 709 FMADD f2, f18, f24, f2 710 LFD f17, 1 * SIZE(AO) 711 FMADD f3, f19, f24, f3 712 LFD f24, 4 * SIZE(BO) 713 FMADD f6, f18, f25, f6 714 nop 715 FMADD f7, f19, f25, f7 716 LFD f25, 5 * SIZE(BO) 717 718 FMADD f10, f18, f26, f10 719 nop 720 FMADD f11, f19, f26, f11 721 LFD f26, 6 * SIZE(BO) 722 FMADD f14, f18, f27, f14 723 LFD f18, 2 * SIZE(AO) 724 FMADD f15, f19, f27, f15 725 LFD f19, 3 * SIZE(AO) 726 LFD f27, 7 * SIZE(BO) 727 bdnz .L22 728 729 fadd f0, f2, f0 730 fadd f1, f3, f1 731 fadd f4, f6, f4 732 fadd f5, f7, f5 733 fadd f8, f10, f8 734 fadd f9, f11, f9 735 fadd f12, f14, f12 736 fadd f13, f15, f13 737 .align 4 738 739.L25: 740#if defined(LT) || defined(RN) 741 andi. r0, KK, 3 742#else 743 andi. r0, TEMP, 3 744#endif 745 mtspr CTR, r0 746 ble+ .L28 747 .align 4 748 749.L26: 750 FMADD f0, f16, f20, f0 751 nop 752 FMADD f1, f17, f20, f1 753 LFDU f20, 4 * SIZE(BO) 754 FMADD f4, f16, f21, f4 755 nop 756 FMADD f5, f17, f21, f5 757 LFD f21, 1 * SIZE(BO) 758 759 FMADD f8, f16, f22, f8 760 nop 761 FMADD f9, f17, f22, f9 762 LFD f22, 2 * SIZE(BO) 763 FMADD f12, f16, f23, f12 764 LFDU f16, 2 * SIZE(AO) 765 FMADD f13, f17, f23, f13 766 LFD f17, 1 * SIZE(AO) 767 LFD f23, 3 * SIZE(BO) 768 bdnz .L26 769 .align 4 770 771.L28: 772#if defined(LN) || defined(RT) 773#ifdef LN 774 subi r0, KK, 2 775#else 776 subi r0, KK, 4 777#endif 778 slwi TEMP, r0, 1 + BASE_SHIFT 779 slwi r0, r0, 2 + BASE_SHIFT 780 add AO, AORIG, TEMP 781 add BO, B, r0 782#endif 783 784#if defined(LN) || defined(LT) 785 LFD f16, 0 * SIZE(BO) 786 LFD f17, 1 * SIZE(BO) 787 LFD f18, 2 * SIZE(BO) 788 LFD f19, 3 * SIZE(BO) 789 790 LFD f20, 4 * SIZE(BO) 791 LFD f21, 5 * SIZE(BO) 792 LFD f22, 6 * SIZE(BO) 793 LFD f23, 7 * SIZE(BO) 794 795 FSUB f0, f16, 
f0 796 FSUB f4, f17, f4 797 FSUB f8, f18, f8 798 FSUB f12, f19, f12 799 800 FSUB f1, f20, f1 801 FSUB f5, f21, f5 802 FSUB f9, f22, f9 803 FSUB f13, f23, f13 804#else 805 LFD f16, 0 * SIZE(AO) 806 LFD f17, 1 * SIZE(AO) 807 LFD f20, 2 * SIZE(AO) 808 LFD f21, 3 * SIZE(AO) 809 810 LFD f24, 4 * SIZE(AO) 811 LFD f25, 5 * SIZE(AO) 812 LFD f28, 6 * SIZE(AO) 813 LFD f29, 7 * SIZE(AO) 814 815 FSUB f0, f16, f0 816 FSUB f1, f17, f1 817 FSUB f4, f20, f4 818 FSUB f5, f21, f5 819 820 FSUB f8, f24, f8 821 FSUB f9, f25, f9 822 FSUB f12, f28, f12 823 FSUB f13, f29, f13 824#endif 825 826#ifdef LN 827 LFD f19, 3 * SIZE(AO) 828 LFD f20, 2 * SIZE(AO) 829 LFD f21, 0 * SIZE(AO) 830 831 FMUL f1, f19, f1 832 FMUL f5, f19, f5 833 FMUL f9, f19, f9 834 FMUL f13, f19, f13 835 836 FNMSUB f0, f20, f1, f0 837 FNMSUB f4, f20, f5, f4 838 FNMSUB f8, f20, f9, f8 839 FNMSUB f12, f20, f13, f12 840 841 FMUL f0, f21, f0 842 FMUL f4, f21, f4 843 FMUL f8, f21, f8 844 FMUL f12, f21, f12 845#endif 846 847#ifdef LT 848 LFD f16, 0 * SIZE(AO) 849 LFD f17, 1 * SIZE(AO) 850 851 FMUL f0, f16, f0 852 FMUL f4, f16, f4 853 FMUL f8, f16, f8 854 FMUL f12, f16, f12 855 856 FNMSUB f1, f17, f0, f1 857 FNMSUB f5, f17, f4, f5 858 FNMSUB f9, f17, f8, f9 859 FNMSUB f13, f17, f12, f13 860 861 LFD f17, 3 * SIZE(AO) 862 863 FMUL f1, f17, f1 864 FMUL f5, f17, f5 865 FMUL f9, f17, f9 866 FMUL f13, f17, f13 867#endif 868 869#ifdef RN 870 LFD f16, 0 * SIZE(BO) 871 LFD f17, 1 * SIZE(BO) 872 LFD f18, 2 * SIZE(BO) 873 LFD f19, 3 * SIZE(BO) 874 875 FMUL f0, f16, f0 876 FMUL f1, f16, f1 877 FNMSUB f4, f17, f0, f4 878 FNMSUB f5, f17, f1, f5 879 FNMSUB f8, f18, f0, f8 880 FNMSUB f9, f18, f1, f9 881 FNMSUB f12, f19, f0, f12 882 FNMSUB f13, f19, f1, f13 883 884 LFD f16, 5 * SIZE(BO) 885 LFD f17, 6 * SIZE(BO) 886 LFD f18, 7 * SIZE(BO) 887 LFD f19, 10 * SIZE(BO) 888 889 LFD f20, 11 * SIZE(BO) 890 LFD f21, 15 * SIZE(BO) 891 892 FMUL f4, f16, f4 893 FMUL f5, f16, f5 894 FNMSUB f8, f17, f4, f8 895 FNMSUB f9, f17, f5, f9 896 FNMSUB f12, f18, f4, 
f12 897 FNMSUB f13, f18, f5, f13 898 899 FMUL f8, f19, f8 900 FMUL f9, f19, f9 901 FNMSUB f12, f20, f8, f12 902 FNMSUB f13, f20, f9, f13 903 FMUL f12, f21, f12 904 FMUL f13, f21, f13 905#endif 906 907#ifdef RT 908 LFD f16, 15 * SIZE(BO) 909 LFD f17, 14 * SIZE(BO) 910 LFD f18, 13 * SIZE(BO) 911 LFD f19, 12 * SIZE(BO) 912 913 FMUL f12, f16, f12 914 FMUL f13, f16, f13 915 FNMSUB f8, f17, f12, f8 916 FNMSUB f9, f17, f13, f9 917 FNMSUB f4, f18, f12, f4 918 FNMSUB f5, f18, f13, f5 919 FNMSUB f0, f19, f12, f0 920 FNMSUB f1, f19, f13, f1 921 922 LFD f16, 10 * SIZE(BO) 923 LFD f17, 9 * SIZE(BO) 924 LFD f18, 8 * SIZE(BO) 925 LFD f19, 5 * SIZE(BO) 926 LFD f20, 4 * SIZE(BO) 927 LFD f21, 0 * SIZE(BO) 928 929 FMUL f8, f16, f8 930 FMUL f9, f16, f9 931 FNMSUB f4, f17, f8, f4 932 FNMSUB f5, f17, f9, f5 933 FNMSUB f0, f18, f8, f0 934 FNMSUB f1, f18, f9, f1 935 936 FMUL f4, f19, f4 937 FMUL f5, f19, f5 938 FNMSUB f0, f20, f4, f0 939 FNMSUB f1, f20, f5, f1 940 941 FMUL f0, f21, f0 942 FMUL f1, f21, f1 943#endif 944 945#ifdef LN 946 subi CO1, CO1, 2 * SIZE 947 subi CO2, CO2, 2 * SIZE 948 subi CO3, CO3, 2 * SIZE 949 subi CO4, CO4, 2 * SIZE 950#endif 951 952#if defined(LN) || defined(LT) 953 STFD f0, 0 * SIZE(BO) 954 STFD f4, 1 * SIZE(BO) 955 STFD f8, 2 * SIZE(BO) 956 STFD f12, 3 * SIZE(BO) 957 958 STFD f1, 4 * SIZE(BO) 959 STFD f5, 5 * SIZE(BO) 960 STFD f9, 6 * SIZE(BO) 961 STFD f13, 7 * SIZE(BO) 962#else 963 STFD f0, 0 * SIZE(AO) 964 STFD f1, 1 * SIZE(AO) 965 STFD f4, 2 * SIZE(AO) 966 STFD f5, 3 * SIZE(AO) 967 968 STFD f8, 4 * SIZE(AO) 969 STFD f9, 5 * SIZE(AO) 970 STFD f12, 6 * SIZE(AO) 971 STFD f13, 7 * SIZE(AO) 972#endif 973 974 STFD f0, 0 * SIZE(CO1) 975 STFD f1, 1 * SIZE(CO1) 976 STFD f4, 0 * SIZE(CO2) 977 STFD f5, 1 * SIZE(CO2) 978 979 STFD f8, 0 * SIZE(CO3) 980 STFD f9, 1 * SIZE(CO3) 981 STFD f12, 0 * SIZE(CO4) 982 STFD f13, 1 * SIZE(CO4) 983 984 lfs f0, FZERO 985 fmr f1, f0 986 fmr f2, f0 987 fmr f3, f0 988 989 fmr f4, f0 990 fmr f5, f0 991 fmr f6, f0 992 fmr f7, f0 993 994 fmr 
f8, f0 995 fmr f9, f0 996 fmr f10, f0 997 fmr f11, f0 998 999 fmr f12, f0 1000 fmr f13, f0 1001 fmr f14, f0 1002 fmr f15, f0 1003 1004#ifndef LN 1005 addi CO1, CO1, 2 * SIZE 1006 addi CO2, CO2, 2 * SIZE 1007 addi CO3, CO3, 2 * SIZE 1008 addi CO4, CO4, 2 * SIZE 1009#endif 1010 1011#ifdef RT 1012 slwi r0, K, 1 + BASE_SHIFT 1013 add AORIG, AORIG, r0 1014#endif 1015 1016#if defined(LT) || defined(RN) 1017 sub TEMP, K, KK 1018 slwi r0, TEMP, 1 + BASE_SHIFT 1019 slwi TEMP, TEMP, 2 + BASE_SHIFT 1020 add AO, AO, r0 1021 add BO, BO, TEMP 1022#endif 1023 1024#ifdef LN 1025 subi KK, KK, 2 1026#endif 1027 1028#ifdef LT 1029 addi KK, KK, 2 1030#endif 1031 .align 4 1032 1033.L09: 1034 srawi. I, M, 2 1035 ble .L39 1036 .align 4 1037 1038.L11: 1039#if defined(LT) || defined(RN) 1040 LFD A1, 0 * SIZE(AO) 1041 LFD A2, 1 * SIZE(AO) 1042 LFD A4, 4 * SIZE(AO) 1043 LFD A5, 8 * SIZE(AO) 1044 1045 LFD B1, 0 * SIZE(B) 1046 LFD B2, 1 * SIZE(B) 1047 LFD B3, 2 * SIZE(B) 1048 LFD B4, 3 * SIZE(B) 1049 LFD B5, 4 * SIZE(B) 1050 LFD B6, 8 * SIZE(B) 1051 LFD B7, 12 * SIZE(B) 1052 1053 srawi. r0, KK, 2 1054 mtspr CTR, r0 1055 mr BO, B 1056#else 1057 1058#ifdef LN 1059 slwi r0, K, 2 + BASE_SHIFT 1060 sub AORIG, AORIG, r0 1061#endif 1062 1063 slwi TEMP, KK, 2 + BASE_SHIFT 1064 add AO, AORIG, TEMP 1065 add BO, B, TEMP 1066 1067 sub TEMP, K, KK 1068 1069 LFD A1, 0 * SIZE(AO) 1070 LFD A2, 1 * SIZE(AO) 1071 LFD A4, 4 * SIZE(AO) 1072 LFD A5, 8 * SIZE(AO) 1073 1074 LFD B1, 0 * SIZE(BO) 1075 LFD B2, 1 * SIZE(BO) 1076 LFD B3, 2 * SIZE(BO) 1077 LFD B4, 3 * SIZE(BO) 1078 LFD B5, 4 * SIZE(BO) 1079 LFD B6, 8 * SIZE(BO) 1080 LFD B7, 12 * SIZE(BO) 1081 1082 srawi. 
r0, TEMP, 2 1083 mtspr CTR, r0 1084#endif 1085 ble .L15 1086 .align 4 1087 1088.L12: 1089 FMADD f0, A1, B1, f0 1090 LFD A3, 2 * SIZE(AO) 1091 FMADD f4, A1, B2, f4 1092 LFD A6, 12 * SIZE(AO) 1093 FMADD f8, A1, B3, f8 1094 nop 1095 FMADD f12, A1, B4, f12 1096 nop 1097 1098 FMADD f1, A2, B1, f1 1099 LFD A1, 3 * SIZE(AO) 1100 FMADD f5, A2, B2, f5 1101 nop 1102 FMADD f9, A2, B3, f9 1103 nop 1104 FMADD f13, A2, B4, f13 1105 nop 1106 1107 FMADD f2, A3, B1, f2 1108 nop 1109 FMADD f6, A3, B2, f6 1110 LFD B8, 5 * SIZE(BO) 1111 FMADD f10, A3, B3, f10 1112 LFD B9, 6 * SIZE(BO) 1113 FMADD f14, A3, B4, f14 1114 LFD B10, 7 * SIZE(BO) 1115 1116 FMADD f3, A1, B1, f3 1117 LFD A2, 5 * SIZE(AO) 1118 FMADD f7, A1, B2, f7 1119 LFD B1, 16 * SIZE(BO) 1120 FMADD f11, A1, B3, f11 1121 nop 1122 FMADD f15, A1, B4, f15 1123 nop 1124 1125 FMADD f0, A4, B5, f0 1126 LFD A3, 6 * SIZE(AO) 1127 FMADD f4, A4, B8, f4 1128 LFD A1, 16 * SIZE(AO) 1129 FMADD f8, A4, B9, f8 1130 nop 1131 FMADD f12, A4, B10, f12 1132 nop 1133 1134 FMADD f1, A2, B5, f1 1135 LFD A4, 7 * SIZE(AO) 1136 FMADD f5, A2, B8, f5 1137 nop 1138 FMADD f9, A2, B9, f9 1139 nop 1140 FMADD f13, A2, B10, f13 1141 nop 1142 1143 FMADD f2, A3, B5, f2 1144 nop 1145 FMADD f6, A3, B8, f6 1146 LFD B2, 9 * SIZE(BO) 1147 FMADD f10, A3, B9, f10 1148 LFD B3, 10 * SIZE(BO) 1149 FMADD f14, A3, B10, f14 1150 LFD B4, 11 * SIZE(BO) 1151 1152 FMADD f3, A4, B5, f3 1153 LFD A2, 9 * SIZE(AO) 1154 FMADD f7, A4, B8, f7 1155 LFD B5, 20 * SIZE(BO) 1156 FMADD f11, A4, B9, f11 1157 nop 1158 FMADD f15, A4, B10, f15 1159 nop 1160 1161 FMADD f0, A5, B6, f0 1162 LFD A3, 10 * SIZE(AO) 1163 FMADD f4, A5, B2, f4 1164 LFD A4, 20 * SIZE(AO) 1165 FMADD f8, A5, B3, f8 1166 nop 1167 FMADD f12, A5, B4, f12 1168 nop 1169 1170 FMADD f1, A2, B6, f1 1171 LFD A5, 11 * SIZE(AO) 1172 FMADD f5, A2, B2, f5 1173 nop 1174 FMADD f9, A2, B3, f9 1175 nop 1176 FMADD f13, A2, B4, f13 1177 nop 1178 1179 FMADD f2, A3, B6, f2 1180 nop 1181 FMADD f6, A3, B2, f6 1182 LFD B8, 13 * SIZE(BO) 1183 FMADD 
f10, A3, B3, f10 1184 LFD B9, 14 * SIZE(BO) 1185 FMADD f14, A3, B4, f14 1186 LFD B10,15 * SIZE(BO) 1187 1188 FMADD f3, A5, B6, f3 1189 LFD A2, 13 * SIZE(AO) 1190 FMADD f7, A5, B2, f7 1191 LFD B6, 24 * SIZE(BO) 1192 FMADD f11, A5, B3, f11 1193 nop 1194 FMADD f15, A5, B4, f15 1195 nop 1196 1197 1198 FMADD f0, A6, B7, f0 1199 LFD A3, 14 * SIZE(AO) 1200 FMADD f4, A6, B8, f4 1201 LFD A5, 24 * SIZE(AO) 1202 FMADD f8, A6, B9, f8 1203 nop 1204 FMADD f12, A6, B10, f12 1205 nop 1206 1207 FMADD f1, A2, B7, f1 1208 LFD A6, 15 * SIZE(AO) 1209 FMADD f5, A2, B8, f5 1210 nop 1211 FMADD f9, A2, B9, f9 1212 nop 1213 FMADD f13, A2, B10, f13 1214 nop 1215 1216 FMADD f2, A3, B7, f2 1217 addi AO, AO, 16 * SIZE 1218 FMADD f6, A3, B8, f6 1219 LFD B2, 17 * SIZE(BO) 1220 FMADD f10, A3, B9, f10 1221 LFD B3, 18 * SIZE(BO) 1222 FMADD f14, A3, B10, f14 1223 LFD B4, 19 * SIZE(BO) 1224 1225 FMADD f3, A6, B7, f3 1226 LFD A2, 1 * SIZE(AO) 1227 FMADD f7, A6, B8, f7 1228 LFD B7, 28 * SIZE(BO) 1229 FMADD f11, A6, B9, f11 1230 addi BO, BO, 16 * SIZE 1231 FMADD f15, A6, B10, f15 1232 bdnz .L12 1233 .align 4 1234 1235.L15: 1236#if defined(LT) || defined(RN) 1237 andi. r0, KK, 3 1238#else 1239 andi. 
r0, TEMP, 3 1240#endif 1241 mtspr CTR, r0 1242 ble+ .L18 1243 .align 4 1244 1245.L16: 1246 FMADD f0, A1, B1, f0 1247 LFD A3, 2 * SIZE(AO) 1248 FMADD f4, A1, B2, f4 1249 FMADD f8, A1, B3, f8 1250 FMADD f12, A1, B4, f12 1251 LFD A4, 3 * SIZE(AO) 1252 1253 FMADD f1, A2, B1, f1 1254 FMADD f5, A2, B2, f5 1255 FMADD f9, A2, B3, f9 1256 FMADD f13, A2, B4, f13 1257 LFDU A1, 4 * SIZE(AO) 1258 1259 FMADD f2, A3, B1, f2 1260 FMADD f6, A3, B2, f6 1261 FMADD f10, A3, B3, f10 1262 FMADD f14, A3, B4, f14 1263 LFD A2, 1 * SIZE(AO) 1264 1265 FMADD f3, A4, B1, f3 1266 LFDU B1, 4 * SIZE(BO) 1267 FMADD f7, A4, B2, f7 1268 LFD B2, 1 * SIZE(BO) 1269 FMADD f11, A4, B3, f11 1270 LFD B3, 2 * SIZE(BO) 1271 FMADD f15, A4, B4, f15 1272 LFD B4, 3 * SIZE(BO) 1273 bdnz .L16 1274 .align 4 1275 1276.L18: 1277#if defined(LN) || defined(RT) 1278 subi r0, KK, 4 1279 slwi r0, r0, 2 + BASE_SHIFT 1280 add AO, AORIG, r0 1281 add BO, B, r0 1282#endif 1283 1284#if defined(LN) || defined(LT) 1285 LFD f16, 0 * SIZE(BO) 1286 LFD f17, 1 * SIZE(BO) 1287 LFD f18, 2 * SIZE(BO) 1288 LFD f19, 3 * SIZE(BO) 1289 1290 LFD f20, 4 * SIZE(BO) 1291 LFD f21, 5 * SIZE(BO) 1292 LFD f22, 6 * SIZE(BO) 1293 LFD f23, 7 * SIZE(BO) 1294 1295 LFD f24, 8 * SIZE(BO) 1296 LFD f25, 9 * SIZE(BO) 1297 LFD f26, 10 * SIZE(BO) 1298 LFD f27, 11 * SIZE(BO) 1299 1300 LFD f28, 12 * SIZE(BO) 1301 LFD f29, 13 * SIZE(BO) 1302 LFD f30, 14 * SIZE(BO) 1303 LFD f31, 15 * SIZE(BO) 1304 1305 FSUB f0, f16, f0 1306 FSUB f4, f17, f4 1307 FSUB f8, f18, f8 1308 FSUB f12, f19, f12 1309 1310 FSUB f1, f20, f1 1311 FSUB f5, f21, f5 1312 FSUB f9, f22, f9 1313 FSUB f13, f23, f13 1314 1315 FSUB f2, f24, f2 1316 FSUB f6, f25, f6 1317 FSUB f10, f26, f10 1318 FSUB f14, f27, f14 1319 1320 FSUB f3, f28, f3 1321 FSUB f7, f29, f7 1322 FSUB f11, f30, f11 1323 FSUB f15, f31, f15 1324#else 1325 LFD f16, 0 * SIZE(AO) 1326 LFD f17, 1 * SIZE(AO) 1327 LFD f18, 2 * SIZE(AO) 1328 LFD f19, 3 * SIZE(AO) 1329 1330 LFD f20, 4 * SIZE(AO) 1331 LFD f21, 5 * SIZE(AO) 1332 LFD f22, 6 * 
SIZE(AO) 1333 LFD f23, 7 * SIZE(AO) 1334 1335 LFD f24, 8 * SIZE(AO) 1336 LFD f25, 9 * SIZE(AO) 1337 LFD f26, 10 * SIZE(AO) 1338 LFD f27, 11 * SIZE(AO) 1339 1340 LFD f28, 12 * SIZE(AO) 1341 LFD f29, 13 * SIZE(AO) 1342 LFD f30, 14 * SIZE(AO) 1343 LFD f31, 15 * SIZE(AO) 1344 1345 FSUB f0, f16, f0 1346 FSUB f1, f17, f1 1347 FSUB f2, f18, f2 1348 FSUB f3, f19, f3 1349 1350 FSUB f4, f20, f4 1351 FSUB f5, f21, f5 1352 FSUB f6, f22, f6 1353 FSUB f7, f23, f7 1354 1355 FSUB f8, f24, f8 1356 FSUB f9, f25, f9 1357 FSUB f10, f26, f10 1358 FSUB f11, f27, f11 1359 1360 FSUB f12, f28, f12 1361 FSUB f13, f29, f13 1362 FSUB f14, f30, f14 1363 FSUB f15, f31, f15 1364#endif 1365 1366#ifdef LN 1367 LFD f16, 15 * SIZE(AO) 1368 LFD f17, 14 * SIZE(AO) 1369 LFD f18, 13 * SIZE(AO) 1370 LFD f19, 12 * SIZE(AO) 1371 1372 FMUL f3, f16, f3 1373 FMUL f7, f16, f7 1374 FMUL f11, f16, f11 1375 FMUL f15, f16, f15 1376 1377 FNMSUB f2, f17, f3, f2 1378 FNMSUB f6, f17, f7, f6 1379 FNMSUB f10, f17, f11, f10 1380 FNMSUB f14, f17, f15, f14 1381 1382 FNMSUB f1, f18, f3, f1 1383 FNMSUB f5, f18, f7, f5 1384 FNMSUB f9, f18, f11, f9 1385 FNMSUB f13, f18, f15, f13 1386 1387 FNMSUB f0, f19, f3, f0 1388 FNMSUB f4, f19, f7, f4 1389 FNMSUB f8, f19, f11, f8 1390 FNMSUB f12, f19, f15, f12 1391 1392 LFD f16, 10 * SIZE(AO) 1393 LFD f17, 9 * SIZE(AO) 1394 LFD f18, 8 * SIZE(AO) 1395 LFD f19, 5 * SIZE(AO) 1396 1397 FMUL f2, f16, f2 1398 FMUL f6, f16, f6 1399 FMUL f10, f16, f10 1400 FMUL f14, f16, f14 1401 1402 LFD f20, 4 * SIZE(AO) 1403 LFD f21, 0 * SIZE(AO) 1404 1405 FNMSUB f1, f17, f2, f1 1406 FNMSUB f5, f17, f6, f5 1407 FNMSUB f9, f17, f10, f9 1408 FNMSUB f13, f17, f14, f13 1409 1410 FNMSUB f0, f18, f2, f0 1411 FNMSUB f4, f18, f6, f4 1412 FNMSUB f8, f18, f10, f8 1413 FNMSUB f12, f18, f14, f12 1414 1415 FMUL f1, f19, f1 1416 FMUL f5, f19, f5 1417 FMUL f9, f19, f9 1418 FMUL f13, f19, f13 1419 1420 FNMSUB f0, f20, f1, f0 1421 FNMSUB f4, f20, f5, f4 1422 FNMSUB f8, f20, f9, f8 1423 FNMSUB f12, f20, f13, f12 1424 1425 FMUL 
f0, f21, f0 1426 FMUL f4, f21, f4 1427 FMUL f8, f21, f8 1428 FMUL f12, f21, f12 1429#endif 1430 1431#ifdef LT 1432 LFD f16, 0 * SIZE(AO) 1433 LFD f17, 1 * SIZE(AO) 1434 LFD f18, 2 * SIZE(AO) 1435 LFD f19, 3 * SIZE(AO) 1436 1437 FMUL f0, f16, f0 1438 FMUL f4, f16, f4 1439 FMUL f8, f16, f8 1440 FMUL f12, f16, f12 1441 1442 FNMSUB f1, f17, f0, f1 1443 FNMSUB f5, f17, f4, f5 1444 FNMSUB f9, f17, f8, f9 1445 FNMSUB f13, f17, f12, f13 1446 1447 FNMSUB f2, f18, f0, f2 1448 FNMSUB f6, f18, f4, f6 1449 FNMSUB f10, f18, f8, f10 1450 FNMSUB f14, f18, f12, f14 1451 1452 FNMSUB f3, f19, f0, f3 1453 FNMSUB f7, f19, f4, f7 1454 FNMSUB f11, f19, f8, f11 1455 FNMSUB f15, f19, f12, f15 1456 1457 LFD f16, 5 * SIZE(AO) 1458 LFD f17, 6 * SIZE(AO) 1459 LFD f18, 7 * SIZE(AO) 1460 LFD f19, 10 * SIZE(AO) 1461 1462 FMUL f1, f16, f1 1463 FMUL f5, f16, f5 1464 FMUL f9, f16, f9 1465 FMUL f13, f16, f13 1466 1467 LFD f20, 11 * SIZE(AO) 1468 LFD f21, 15 * SIZE(AO) 1469 1470 FNMSUB f2, f17, f1, f2 1471 FNMSUB f6, f17, f5, f6 1472 FNMSUB f10, f17, f9, f10 1473 FNMSUB f14, f17, f13, f14 1474 1475 FNMSUB f3, f18, f1, f3 1476 FNMSUB f7, f18, f5, f7 1477 FNMSUB f11, f18, f9, f11 1478 FNMSUB f15, f18, f13, f15 1479 1480 FMUL f2, f19, f2 1481 FMUL f6, f19, f6 1482 FMUL f10, f19, f10 1483 FMUL f14, f19, f14 1484 1485 FNMSUB f3, f20, f2, f3 1486 FNMSUB f7, f20, f6, f7 1487 FNMSUB f11, f20, f10, f11 1488 FNMSUB f15, f20, f14, f15 1489 1490 FMUL f3, f21, f3 1491 FMUL f7, f21, f7 1492 FMUL f11, f21, f11 1493 FMUL f15, f21, f15 1494#endif 1495 1496#ifdef RN 1497 LFD f16, 0 * SIZE(BO) 1498 LFD f17, 1 * SIZE(BO) 1499 LFD f18, 2 * SIZE(BO) 1500 LFD f19, 3 * SIZE(BO) 1501 1502 FMUL f0, f16, f0 1503 FMUL f1, f16, f1 1504 FMUL f2, f16, f2 1505 FMUL f3, f16, f3 1506 1507 FNMSUB f4, f17, f0, f4 1508 FNMSUB f5, f17, f1, f5 1509 FNMSUB f6, f17, f2, f6 1510 FNMSUB f7, f17, f3, f7 1511 1512 FNMSUB f8, f18, f0, f8 1513 FNMSUB f9, f18, f1, f9 1514 FNMSUB f10, f18, f2, f10 1515 FNMSUB f11, f18, f3, f11 1516 1517 FNMSUB f12, 
f19, f0, f12 1518 FNMSUB f13, f19, f1, f13 1519 FNMSUB f14, f19, f2, f14 1520 FNMSUB f15, f19, f3, f15 1521 1522 LFD f16, 5 * SIZE(BO) 1523 LFD f17, 6 * SIZE(BO) 1524 LFD f18, 7 * SIZE(BO) 1525 LFD f19, 10 * SIZE(BO) 1526 1527 FMUL f4, f16, f4 1528 FMUL f5, f16, f5 1529 FMUL f6, f16, f6 1530 FMUL f7, f16, f7 1531 1532 LFD f20, 11 * SIZE(BO) 1533 LFD f21, 15 * SIZE(BO) 1534 1535 FNMSUB f8, f17, f4, f8 1536 FNMSUB f9, f17, f5, f9 1537 FNMSUB f10, f17, f6, f10 1538 FNMSUB f11, f17, f7, f11 1539 1540 FNMSUB f12, f18, f4, f12 1541 FNMSUB f13, f18, f5, f13 1542 FNMSUB f14, f18, f6, f14 1543 FNMSUB f15, f18, f7, f15 1544 1545 FMUL f8, f19, f8 1546 FMUL f9, f19, f9 1547 FMUL f10, f19, f10 1548 FMUL f11, f19, f11 1549 1550 FNMSUB f12, f20, f8, f12 1551 FNMSUB f13, f20, f9, f13 1552 FNMSUB f14, f20, f10, f14 1553 FNMSUB f15, f20, f11, f15 1554 1555 FMUL f12, f21, f12 1556 FMUL f13, f21, f13 1557 FMUL f14, f21, f14 1558 FMUL f15, f21, f15 1559#endif 1560 1561#ifdef RT 1562 LFD f16, 15 * SIZE(BO) 1563 LFD f17, 14 * SIZE(BO) 1564 LFD f18, 13 * SIZE(BO) 1565 LFD f19, 12 * SIZE(BO) 1566 1567 FMUL f12, f16, f12 1568 FMUL f13, f16, f13 1569 FMUL f14, f16, f14 1570 FMUL f15, f16, f15 1571 1572 FNMSUB f8, f17, f12, f8 1573 FNMSUB f9, f17, f13, f9 1574 FNMSUB f10, f17, f14, f10 1575 FNMSUB f11, f17, f15, f11 1576 1577 FNMSUB f4, f18, f12, f4 1578 FNMSUB f5, f18, f13, f5 1579 FNMSUB f6, f18, f14, f6 1580 FNMSUB f7, f18, f15, f7 1581 1582 FNMSUB f0, f19, f12, f0 1583 FNMSUB f1, f19, f13, f1 1584 FNMSUB f2, f19, f14, f2 1585 FNMSUB f3, f19, f15, f3 1586 1587 LFD f16, 10 * SIZE(BO) 1588 LFD f17, 9 * SIZE(BO) 1589 LFD f18, 8 * SIZE(BO) 1590 LFD f19, 5 * SIZE(BO) 1591 1592 FMUL f8, f16, f8 1593 FMUL f9, f16, f9 1594 FMUL f10, f16, f10 1595 FMUL f11, f16, f11 1596 1597 LFD f20, 4 * SIZE(BO) 1598 LFD f21, 0 * SIZE(BO) 1599 1600 FNMSUB f4, f17, f8, f4 1601 FNMSUB f5, f17, f9, f5 1602 FNMSUB f6, f17, f10, f6 1603 FNMSUB f7, f17, f11, f7 1604 1605 FNMSUB f0, f18, f8, f0 1606 FNMSUB f1, f18, f9, 
f1 1607 FNMSUB f2, f18, f10, f2 1608 FNMSUB f3, f18, f11, f3 1609 1610 FMUL f4, f19, f4 1611 FMUL f5, f19, f5 1612 FMUL f6, f19, f6 1613 FMUL f7, f19, f7 1614 1615 FNMSUB f0, f20, f4, f0 1616 FNMSUB f1, f20, f5, f1 1617 FNMSUB f2, f20, f6, f2 1618 FNMSUB f3, f20, f7, f3 1619 1620 FMUL f0, f21, f0 1621 FMUL f1, f21, f1 1622 FMUL f2, f21, f2 1623 FMUL f3, f21, f3 1624#endif 1625 1626#ifdef LN 1627 subi CO1, CO1, 4 * SIZE 1628 subi CO2, CO2, 4 * SIZE 1629 subi CO3, CO3, 4 * SIZE 1630 subi CO4, CO4, 4 * SIZE 1631#endif 1632 1633#if defined(LN) || defined(LT) 1634 STFD f0, 0 * SIZE(BO) 1635 STFD f4, 1 * SIZE(BO) 1636 STFD f8, 2 * SIZE(BO) 1637 STFD f12, 3 * SIZE(BO) 1638 1639 STFD f1, 4 * SIZE(BO) 1640 STFD f5, 5 * SIZE(BO) 1641 STFD f9, 6 * SIZE(BO) 1642 STFD f13, 7 * SIZE(BO) 1643 1644 STFD f2, 8 * SIZE(BO) 1645 STFD f6, 9 * SIZE(BO) 1646 STFD f10, 10 * SIZE(BO) 1647 STFD f14, 11 * SIZE(BO) 1648 1649 STFD f3, 12 * SIZE(BO) 1650 STFD f7, 13 * SIZE(BO) 1651 STFD f11, 14 * SIZE(BO) 1652 STFD f15, 15 * SIZE(BO) 1653#else 1654 STFD f0, 0 * SIZE(AO) 1655 STFD f1, 1 * SIZE(AO) 1656 STFD f2, 2 * SIZE(AO) 1657 STFD f3, 3 * SIZE(AO) 1658 1659 STFD f4, 4 * SIZE(AO) 1660 STFD f5, 5 * SIZE(AO) 1661 STFD f6, 6 * SIZE(AO) 1662 STFD f7, 7 * SIZE(AO) 1663 1664 STFD f8, 8 * SIZE(AO) 1665 STFD f9, 9 * SIZE(AO) 1666 STFD f10, 10 * SIZE(AO) 1667 STFD f11, 11 * SIZE(AO) 1668 1669 STFD f12, 12 * SIZE(AO) 1670 STFD f13, 13 * SIZE(AO) 1671 STFD f14, 14 * SIZE(AO) 1672 STFD f15, 15 * SIZE(AO) 1673#endif 1674 1675 STFD f0, 0 * SIZE(CO1) 1676 STFD f1, 1 * SIZE(CO1) 1677 STFD f2, 2 * SIZE(CO1) 1678 STFD f3, 3 * SIZE(CO1) 1679 1680 STFD f4, 0 * SIZE(CO2) 1681 STFD f5, 1 * SIZE(CO2) 1682 STFD f6, 2 * SIZE(CO2) 1683 STFD f7, 3 * SIZE(CO2) 1684 1685 STFD f8, 0 * SIZE(CO3) 1686 STFD f9, 1 * SIZE(CO3) 1687 STFD f10, 2 * SIZE(CO3) 1688 STFD f11, 3 * SIZE(CO3) 1689 1690 STFD f12, 0 * SIZE(CO4) 1691 STFD f13, 1 * SIZE(CO4) 1692 STFD f14, 2 * SIZE(CO4) 1693 STFD f15, 3 * SIZE(CO4) 1694 1695 lfs f0, FZERO 
1696 fmr f1, f0 1697 fmr f2, f0 1698 fmr f3, f0 1699 1700 fmr f4, f0 1701 fmr f5, f0 1702 fmr f6, f0 1703 fmr f7, f0 1704 1705 fmr f8, f0 1706 fmr f9, f0 1707 fmr f10, f0 1708 fmr f11, f0 1709 1710 fmr f12, f0 1711 fmr f13, f0 1712 fmr f14, f0 1713 fmr f15, f0 1714 1715#ifndef LN 1716 addi CO1, CO1, 4 * SIZE 1717 addi CO2, CO2, 4 * SIZE 1718 addi CO3, CO3, 4 * SIZE 1719 addi CO4, CO4, 4 * SIZE 1720#endif 1721 1722#ifdef RT 1723 slwi r0, K, 2 + BASE_SHIFT 1724 add AORIG, AORIG, r0 1725#endif 1726 1727#if defined(LT) || defined(RN) 1728 sub TEMP, K, KK 1729 slwi TEMP, TEMP, 2 + BASE_SHIFT 1730 add AO, AO, TEMP 1731 add BO, BO, TEMP 1732#endif 1733 1734#ifdef LT 1735 addi KK, KK, 4 1736#endif 1737 1738#ifdef LN 1739 subi KK, KK, 4 1740#endif 1741 1742 addic. I, I, -1 1743 bgt+ .L11 1744 .align 4 1745 1746 1747.L39: 1748#ifdef LN 1749 slwi r0, K, 2 + BASE_SHIFT 1750 add B, B, r0 1751#endif 1752 1753#if defined(LT) || defined(RN) 1754 mr B, BO 1755#endif 1756 1757#ifdef RN 1758 addi KK, KK, 4 1759#endif 1760 1761#ifdef RT 1762 subi KK, KK, 4 1763#endif 1764 1765 addic. J, J, -1 1766 lfs f0, FZERO 1767 bgt .L10 1768 .align 4 1769 1770.L40: 1771 andi. J, N, 2 1772 ble .L70 1773 1774#ifdef RT 1775 slwi r0, K, 1 + BASE_SHIFT 1776 sub B, B, r0 1777 1778 slwi r0, LDC, 1 1779 sub C, C, r0 1780#endif 1781 1782 mr CO1, C 1783 add CO2, C, LDC 1784 1785#ifdef LN 1786 add KK, M, OFFSET 1787#endif 1788 1789#ifdef LT 1790 mr KK, OFFSET 1791#endif 1792 1793 fmr f1, f0 1794 fmr f2, f0 1795 fmr f3, f0 1796 fmr f4, f0 1797 fmr f5, f0 1798 fmr f6, f0 1799 fmr f7, f0 1800 1801#if defined(LN) || defined(RT) 1802 mr AORIG, A 1803#else 1804 mr AO, A 1805#endif 1806#ifndef RT 1807 add C, CO2, LDC 1808#endif 1809 1810.L60: 1811 andi. 
I, M, 1 1812 ble .L50 1813 1814#if defined(LT) || defined(RN) 1815 LFD f16, 0 * SIZE(AO) 1816 LFD f17, 1 * SIZE(AO) 1817 LFD f18, 2 * SIZE(AO) 1818 LFD f19, 3 * SIZE(AO) 1819 1820 LFD f20, 0 * SIZE(B) 1821 LFD f21, 1 * SIZE(B) 1822 LFD f22, 2 * SIZE(B) 1823 LFD f23, 3 * SIZE(B) 1824 1825 LFD f24, 4 * SIZE(B) 1826 LFD f25, 5 * SIZE(B) 1827 LFD f26, 6 * SIZE(B) 1828 LFD f27, 7 * SIZE(B) 1829 1830 srawi. r0, KK, 2 1831 mtspr CTR, r0 1832 mr BO, B 1833#else 1834 1835#ifdef LN 1836 slwi r0, K, BASE_SHIFT 1837 sub AORIG, AORIG, r0 1838#endif 1839 1840 slwi r0, KK, 0 + BASE_SHIFT 1841 slwi TEMP, KK, 1 + BASE_SHIFT 1842 add AO, AORIG, r0 1843 add BO, B, TEMP 1844 1845 sub TEMP, K, KK 1846 1847 LFD f16, 0 * SIZE(AO) 1848 LFD f17, 1 * SIZE(AO) 1849 LFD f18, 2 * SIZE(AO) 1850 LFD f19, 3 * SIZE(AO) 1851 1852 LFD f20, 0 * SIZE(BO) 1853 LFD f21, 1 * SIZE(BO) 1854 LFD f22, 2 * SIZE(BO) 1855 LFD f23, 3 * SIZE(BO) 1856 1857 LFD f24, 4 * SIZE(BO) 1858 LFD f25, 5 * SIZE(BO) 1859 LFD f26, 6 * SIZE(BO) 1860 LFD f27, 7 * SIZE(BO) 1861 1862 srawi. r0, TEMP, 2 1863 mtspr CTR, r0 1864#endif 1865 ble .L65 1866 .align 5 1867 1868.L62: 1869 FMADD f0, f16, f20, f0 1870 LFDU f20, 8 * SIZE(BO) 1871 FMADD f1, f16, f21, f1 1872 LFDU f16, 4 * SIZE(AO) 1873 LFD f21, 1 * SIZE(BO) 1874 FMADD f2, f17, f22, f2 1875 LFD f22, 2 * SIZE(BO) 1876 FMADD f3, f17, f23, f3 1877 LFD f17, 1 * SIZE(AO) 1878 LFD f23, 3 * SIZE(BO) 1879 1880 FMADD f0, f18, f24, f0 1881 LFD f24, 4 * SIZE(BO) 1882 FMADD f1, f18, f25, f1 1883 LFD f18, 2 * SIZE(AO) 1884 LFD f25, 5 * SIZE(BO) 1885 FMADD f2, f19, f26, f2 1886 LFD f26, 6 * SIZE(BO) 1887 FMADD f3, f19, f27, f3 1888 LFD f19, 3 * SIZE(AO) 1889 LFD f27, 7 * SIZE(BO) 1890 bdnz .L62 1891 .align 4 1892 1893.L65: 1894#if defined(LT) || defined(RN) 1895 andi. r0, KK, 3 1896#else 1897 andi. 
r0, TEMP, 3
#endif
/*
 * 1x2 micro-tile (one row, columns CO1/CO2), second half: leftover
 * k-steps, accumulator fold, solve step, write-back and pointer
 * bookkeeping.
 *
 * r0 holds (count & 3) from the andi. whose operands straddle the
 * preceding line; CR0 from that andi. decides whether .L66 runs.
 * FMADD/FNMSUB/FADD/FSUB/FMUL/LFD/LFDU/STFD are macros from common.h
 * (presumably the single/double forms of the PowerPC instructions of
 * the same name -- confirm there).
 */
	mtspr	CTR, r0
	ble+	.L68
	.align 4

/* One k-step per trip: f0 += f16 * f20, f1 += f16 * f21, then step
   AO by one element and BO by two. */
.L66:
	FMADD	f0, f16, f20, f0
	LFDU	f20, 2 * SIZE(BO)
	FMADD	f1, f16, f21, f1
	LFDU	f16, 1 * SIZE(AO)
	LFD	f21, 1 * SIZE(BO)
	bdnz	.L66
	.align 4

/* Fold the second accumulator pair (filled by the unrolled loop .L62)
   into f0/f1.  f2/f3 themselves are left unchanged here. */
.L68:
	FADD	f0, f2, f0
	FADD	f1, f3, f1

#if defined(LN) || defined(RT)
	/* Re-point AO/BO at the block used by the solve step:
	   r0 = KK - 1 (LN) or KK - 2 (RT); AO is offset by r0 elements,
	   BO by 2 * r0 elements. */
#ifdef LN
	subi	r0, KK, 1
#else
	subi	r0, KK, 2
#endif
	slwi	TEMP, r0, 0 + BASE_SHIFT
	slwi	r0, r0, 1 + BASE_SHIFT
	add	AO, AORIG, TEMP
	add	BO, B, r0
#endif

	/* f0/f1 = (stored right-hand side) - (accumulated product). */
#if defined(LN) || defined(LT)
	LFD	f16, 0 * SIZE(BO)
	LFD	f17, 1 * SIZE(BO)

	FSUB	f0, f16, f0
	FSUB	f1, f17, f1
#else
	LFD	f16, 0 * SIZE(AO)
	LFD	f20, 1 * SIZE(AO)

	FSUB	f0, f16, f0
	FSUB	f1, f20, f1
#endif

	/* Solve step.  The coefficients are multiplied in, never divided
	   by -- presumably the packed diagonal is stored pre-inverted;
	   confirm against the packing routine. */
#ifdef LN
	LFD	f21, 0 * SIZE(AO)

	FMUL	f0, f21, f0
	FMUL	f1, f21, f1
#endif

#ifdef LT
	LFD	f16, 0 * SIZE(AO)

	FMUL	f0, f16, f0
	FMUL	f1, f16, f1
#endif

#ifdef RN
	/* Forward order over the 2x2 block in BO: entries 0, 1, 3. */
	LFD	f16, 0 * SIZE(BO)
	LFD	f17, 1 * SIZE(BO)
	LFD	f18, 3 * SIZE(BO)

	FMUL	f0, f16, f0
	FNMSUB	f1, f17, f0, f1
	FMUL	f1, f18, f1
#endif

#ifdef RT
	/* Reverse order over the 2x2 block in BO: entries 3, 2, 0. */
	LFD	f19, 3 * SIZE(BO)
	LFD	f20, 2 * SIZE(BO)
	LFD	f21, 0 * SIZE(BO)

	FMUL	f1, f19, f1
	FNMSUB	f0, f20, f1, f0
	FMUL	f0, f21, f0
#endif

#ifdef LN
	/* LN walks C backwards: step the column pointers down first. */
	subi	CO1, CO1, 1 * SIZE
	subi	CO2, CO2, 1 * SIZE
#endif

	/* Write the solved pair back into the packed buffer ... */
#if defined(LN) || defined(LT)
	STFD	f0, 0 * SIZE(BO)
	STFD	f1, 1 * SIZE(BO)
#else
	STFD	f0, 0 * SIZE(AO)
	STFD	f1, 1 * SIZE(AO)
#endif

	/* ... and into C, one element per column. */
	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 0 * SIZE(CO2)

	/*
	 * Re-zero the accumulators for the next tile.  f2/f3 must be
	 * cleared as well: .L62 accumulates into them and .L68 only reads
	 * them, so without this the 2x2 tile at .L50 (which accumulates
	 * into f0..f7) would start from stale partial sums whenever the
	 * unrolled loop ran.  The f4/f5 clears are kept from the previous
	 * version; they are harmless.
	 */
	lfs	f0, FZERO
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	fmr	f4, f0
	fmr	f5, f0

#ifndef LN
	addi	CO1, CO1, 1 * SIZE
	addi	CO2, CO2, 1 * SIZE
#endif

#ifdef RT
	/* Advance AORIG by K elements (one packed row of A). */
	slwi	r0, K, 0 + BASE_SHIFT
	add	AORIG, AORIG, r0
#endif

#if defined(LT) || 
defined(RN) 2008 sub TEMP, K, KK 2009 slwi r0, TEMP, 0 + BASE_SHIFT 2010 slwi TEMP, TEMP, 1 + BASE_SHIFT 2011 add AO, AO, r0 2012 add BO, BO, TEMP 2013#endif 2014 2015#ifdef LN 2016 subi KK, KK, 1 2017#endif 2018 2019#ifdef LT 2020 addi KK, KK, 1 2021#endif 2022 .align 4 2023 2024.L50: 2025 andi. I, M, 2 2026 ble .L41 2027 2028#if defined(LT) || defined(RN) 2029 LFD f16, 0 * SIZE(AO) 2030 LFD f17, 1 * SIZE(AO) 2031 LFD f18, 2 * SIZE(AO) 2032 LFD f19, 3 * SIZE(AO) 2033 2034 LFD f20, 0 * SIZE(B) 2035 LFD f21, 1 * SIZE(B) 2036 LFD f22, 2 * SIZE(B) 2037 LFD f23, 3 * SIZE(B) 2038 2039 LFD f24, 4 * SIZE(B) 2040 LFD f25, 5 * SIZE(B) 2041 LFD f26, 6 * SIZE(B) 2042 LFD f27, 7 * SIZE(B) 2043 2044 srawi. r0, KK, 2 2045 mtspr CTR, r0 2046 mr BO, B 2047#else 2048 2049#ifdef LN 2050 slwi r0, K, 1 + BASE_SHIFT 2051 sub AORIG, AORIG, r0 2052#endif 2053 2054 slwi r0, KK, 1 + BASE_SHIFT 2055 slwi TEMP, KK, 1 + BASE_SHIFT 2056 add AO, AORIG, r0 2057 add BO, B, TEMP 2058 2059 sub TEMP, K, KK 2060 2061 LFD f16, 0 * SIZE(AO) 2062 LFD f17, 1 * SIZE(AO) 2063 LFD f18, 2 * SIZE(AO) 2064 LFD f19, 3 * SIZE(AO) 2065 2066 LFD f20, 0 * SIZE(BO) 2067 LFD f21, 1 * SIZE(BO) 2068 LFD f22, 2 * SIZE(BO) 2069 LFD f23, 3 * SIZE(BO) 2070 2071 LFD f24, 4 * SIZE(BO) 2072 LFD f25, 5 * SIZE(BO) 2073 LFD f26, 6 * SIZE(BO) 2074 LFD f27, 7 * SIZE(BO) 2075 2076 srawi. 
r0, TEMP, 2 2077 mtspr CTR, r0 2078#endif 2079 ble .L55 2080 .align 5 2081 2082.L52: 2083 FMADD f0, f16, f20, f0 2084 FMADD f1, f17, f20, f1 2085 LFDU f20, 8 * SIZE(BO) 2086 FMADD f2, f16, f21, f2 2087 LFD f16, 4 * SIZE(AO) 2088 FMADD f3, f17, f21, f3 2089 LFD f17, 5 * SIZE(AO) 2090 2091 FMADD f4, f18, f22, f4 2092 LFD f21, 1 * SIZE(BO) 2093 FMADD f5, f19, f22, f5 2094 LFD f22, 2 * SIZE(BO) 2095 FMADD f6, f18, f23, f6 2096 LFD f18, 6 * SIZE(AO) 2097 FMADD f7, f19, f23, f7 2098 LFD f19, 7 * SIZE(AO) 2099 2100 FMADD f0, f16, f24, f0 2101 LFD f23, 3 * SIZE(BO) 2102 FMADD f1, f17, f24, f1 2103 LFD f24, 4 * SIZE(BO) 2104 FMADD f2, f16, f25, f2 2105 LFDU f16, 8 * SIZE(AO) 2106 FMADD f3, f17, f25, f3 2107 LFD f17, 1 * SIZE(AO) 2108 2109 FMADD f4, f18, f26, f4 2110 LFD f25, 5 * SIZE(BO) 2111 FMADD f5, f19, f26, f5 2112 LFD f26, 6 * SIZE(BO) 2113 FMADD f6, f18, f27, f6 2114 LFD f18, 2 * SIZE(AO) 2115 FMADD f7, f19, f27, f7 2116 LFD f19, 3 * SIZE(AO) 2117 2118 LFD f27, 7 * SIZE(BO) 2119 bdnz .L52 2120 .align 4 2121 2122.L55: 2123#if defined(LT) || defined(RN) 2124 andi. r0, KK, 3 2125#else 2126 andi. 
r0, TEMP, 3 2127#endif 2128 mtspr CTR, r0 2129 ble+ .L58 2130 .align 4 2131 2132.L56: 2133 FMADD f0, f16, f20, f0 2134 FMADD f1, f17, f20, f1 2135 LFDU f20, 2 * SIZE(BO) 2136 FMADD f2, f16, f21, f2 2137 LFDU f16, 2 * SIZE(AO) 2138 FMADD f3, f17, f21, f3 2139 LFD f17, 1 * SIZE(AO) 2140 LFD f21, 1 * SIZE(BO) 2141 bdnz .L56 2142 .align 4 2143 2144.L58: 2145 FADD f0, f4, f0 2146 FADD f1, f5, f1 2147 FADD f2, f6, f2 2148 FADD f3, f7, f3 2149 2150#if defined(LN) || defined(RT) 2151#ifdef LN 2152 subi r0, KK, 2 2153#else 2154 subi r0, KK, 2 2155#endif 2156 slwi TEMP, r0, 1 + BASE_SHIFT 2157 slwi r0, r0, 1 + BASE_SHIFT 2158 add AO, AORIG, TEMP 2159 add BO, B, r0 2160#endif 2161 2162#if defined(LN) || defined(LT) 2163 LFD f16, 0 * SIZE(BO) 2164 LFD f17, 1 * SIZE(BO) 2165 LFD f20, 2 * SIZE(BO) 2166 LFD f21, 3 * SIZE(BO) 2167 2168 FSUB f0, f16, f0 2169 FSUB f2, f17, f2 2170 FSUB f1, f20, f1 2171 FSUB f3, f21, f3 2172#else 2173 LFD f16, 0 * SIZE(AO) 2174 LFD f17, 1 * SIZE(AO) 2175 LFD f20, 2 * SIZE(AO) 2176 LFD f21, 3 * SIZE(AO) 2177 2178 FSUB f0, f16, f0 2179 FSUB f1, f17, f1 2180 FSUB f2, f20, f2 2181 FSUB f3, f21, f3 2182#endif 2183 2184#ifdef LN 2185 LFD f19, 3 * SIZE(AO) 2186 LFD f20, 2 * SIZE(AO) 2187 LFD f21, 0 * SIZE(AO) 2188 2189 FMUL f1, f19, f1 2190 FMUL f3, f19, f3 2191 2192 FNMSUB f0, f20, f1, f0 2193 FNMSUB f2, f20, f3, f2 2194 2195 FMUL f0, f21, f0 2196 FMUL f2, f21, f2 2197#endif 2198 2199#ifdef LT 2200 LFD f16, 0 * SIZE(AO) 2201 LFD f17, 1 * SIZE(AO) 2202 2203 FMUL f0, f16, f0 2204 FMUL f2, f16, f2 2205 FNMSUB f1, f17, f0, f1 2206 FNMSUB f3, f17, f2, f3 2207 2208 LFD f17, 3 * SIZE(AO) 2209 2210 FMUL f1, f17, f1 2211 FMUL f3, f17, f3 2212#endif 2213 2214#ifdef RN 2215 LFD f16, 0 * SIZE(BO) 2216 LFD f17, 1 * SIZE(BO) 2217 LFD f18, 3 * SIZE(BO) 2218 2219 FMUL f0, f16, f0 2220 FMUL f1, f16, f1 2221 2222 FNMSUB f2, f17, f0, f2 2223 FNMSUB f3, f17, f1, f3 2224 FMUL f2, f18, f2 2225 FMUL f3, f18, f3 2226#endif 2227 2228#ifdef RT 2229 LFD f19, 3 * SIZE(BO) 2230 LFD 
f20, 2 * SIZE(BO) 2231 LFD f21, 0 * SIZE(BO) 2232 2233 FMUL f2, f19, f2 2234 FMUL f3, f19, f3 2235 FNMSUB f0, f20, f2, f0 2236 FNMSUB f1, f20, f3, f1 2237 FMUL f0, f21, f0 2238 FMUL f1, f21, f1 2239#endif 2240 2241#ifdef LN 2242 subi CO1, CO1, 2 * SIZE 2243 subi CO2, CO2, 2 * SIZE 2244#endif 2245 2246#if defined(LN) || defined(LT) 2247 STFD f0, 0 * SIZE(BO) 2248 STFD f2, 1 * SIZE(BO) 2249 STFD f1, 2 * SIZE(BO) 2250 STFD f3, 3 * SIZE(BO) 2251#else 2252 STFD f0, 0 * SIZE(AO) 2253 STFD f1, 1 * SIZE(AO) 2254 STFD f2, 2 * SIZE(AO) 2255 STFD f3, 3 * SIZE(AO) 2256#endif 2257 2258 STFD f0, 0 * SIZE(CO1) 2259 STFD f1, 1 * SIZE(CO1) 2260 STFD f2, 0 * SIZE(CO2) 2261 STFD f3, 1 * SIZE(CO2) 2262 2263 lfs f0, FZERO 2264 fmr f1, f0 2265 fmr f2, f0 2266 fmr f3, f0 2267 2268 fmr f4, f0 2269 fmr f5, f0 2270 fmr f6, f0 2271 fmr f7, f0 2272 2273#ifndef LN 2274 addi CO1, CO1, 2 * SIZE 2275 addi CO2, CO2, 2 * SIZE 2276#endif 2277 2278#ifdef RT 2279 slwi r0, K, 1 + BASE_SHIFT 2280 add AORIG, AORIG, r0 2281#endif 2282 2283#if defined(LT) || defined(RN) 2284 sub TEMP, K, KK 2285 slwi r0, TEMP, 1 + BASE_SHIFT 2286 slwi TEMP, TEMP, 1 + BASE_SHIFT 2287 add AO, AO, r0 2288 add BO, BO, TEMP 2289#endif 2290 2291#ifdef LN 2292 subi KK, KK, 2 2293#endif 2294 2295#ifdef LT 2296 addi KK, KK, 2 2297#endif 2298 .align 4 2299 2300.L41: 2301 srawi. I, M, 2 2302 ble .L69 2303 .align 4 2304 2305.L42: 2306#if defined(LT) || defined(RN) 2307 LFD f16, 0 * SIZE(AO) 2308 LFD f17, 1 * SIZE(AO) 2309 LFD f18, 2 * SIZE(AO) 2310 LFD f19, 3 * SIZE(AO) 2311 2312 LFD f20, 0 * SIZE(B) 2313 LFD f21, 1 * SIZE(B) 2314 LFD f22, 2 * SIZE(B) 2315 LFD f23, 3 * SIZE(B) 2316 2317 srawi. 
r0, KK, 2 2318 mtspr CTR, r0 2319 mr BO, B 2320#else 2321 2322#ifdef LN 2323 slwi r0, K, 2 + BASE_SHIFT 2324 sub AORIG, AORIG, r0 2325#endif 2326 2327 slwi r0, KK, 2 + BASE_SHIFT 2328 slwi TEMP, KK, 1 + BASE_SHIFT 2329 add AO, AORIG, r0 2330 add BO, B, TEMP 2331 2332 sub TEMP, K, KK 2333 2334 LFD f16, 0 * SIZE(AO) 2335 LFD f17, 1 * SIZE(AO) 2336 LFD f18, 2 * SIZE(AO) 2337 LFD f19, 3 * SIZE(AO) 2338 2339 LFD f20, 0 * SIZE(BO) 2340 LFD f21, 1 * SIZE(BO) 2341 LFD f22, 2 * SIZE(BO) 2342 LFD f23, 3 * SIZE(BO) 2343 2344 srawi. r0, TEMP, 2 2345 mtspr CTR, r0 2346#endif 2347 ble .L45 2348 .align 5 2349 2350.L43: 2351 FMADD f0, f16, f20, f0 2352 FMADD f1, f17, f20, f1 2353 FMADD f2, f18, f20, f2 2354 FMADD f3, f19, f20, f3 2355 LFD f20, 4 * SIZE(BO) 2356 2357 FMADD f4, f16, f21, f4 2358 LFD f16, 4 * SIZE(AO) 2359 FMADD f5, f17, f21, f5 2360 LFD f17, 5 * SIZE(AO) 2361 FMADD f6, f18, f21, f6 2362 LFD f18, 6 * SIZE(AO) 2363 FMADD f7, f19, f21, f7 2364 LFD f19, 7 * SIZE(AO) 2365 2366 FMADD f0, f16, f22, f0 2367 LFD f21, 5 * SIZE(BO) 2368 FMADD f1, f17, f22, f1 2369 FMADD f2, f18, f22, f2 2370 FMADD f3, f19, f22, f3 2371 LFD f22, 6 * SIZE(BO) 2372 2373 FMADD f4, f16, f23, f4 2374 LFD f16, 8 * SIZE(AO) 2375 FMADD f5, f17, f23, f5 2376 LFD f17, 9 * SIZE(AO) 2377 FMADD f6, f18, f23, f6 2378 LFD f18, 10 * SIZE(AO) 2379 FMADD f7, f19, f23, f7 2380 LFD f19, 11 * SIZE(AO) 2381 2382 FMADD f0, f16, f20, f0 2383 LFD f23, 7 * SIZE(BO) 2384 FMADD f1, f17, f20, f1 2385 FMADD f2, f18, f20, f2 2386 FMADD f3, f19, f20, f3 2387 LFDU f20, 8 * SIZE(BO) 2388 2389 FMADD f4, f16, f21, f4 2390 LFD f16, 12 * SIZE(AO) 2391 FMADD f5, f17, f21, f5 2392 LFD f17, 13 * SIZE(AO) 2393 FMADD f6, f18, f21, f6 2394 LFD f18, 14 * SIZE(AO) 2395 FMADD f7, f19, f21, f7 2396 LFD f19, 15 * SIZE(AO) 2397 2398 FMADD f0, f16, f22, f0 2399 LFD f21, 1 * SIZE(BO) 2400 FMADD f1, f17, f22, f1 2401 FMADD f2, f18, f22, f2 2402 FMADD f3, f19, f22, f3 2403 LFD f22, 2 * SIZE(BO) 2404 2405 FMADD f4, f16, f23, f4 2406 LFDU f16, 16 * 
SIZE(AO) 2407 FMADD f5, f17, f23, f5 2408 LFD f17, 1 * SIZE(AO) 2409 FMADD f6, f18, f23, f6 2410 LFD f18, 2 * SIZE(AO) 2411 FMADD f7, f19, f23, f7 2412 LFD f19, 3 * SIZE(AO) 2413 2414 LFD f23, 3 * SIZE(BO) 2415 bdnz .L43 2416 .align 4 2417 2418.L45: 2419#if defined(LT) || defined(RN) 2420 andi. r0, KK, 3 2421#else 2422 andi. r0, TEMP, 3 2423#endif 2424 mtspr CTR, r0 2425 ble+ .L48 2426 .align 4 2427 2428.L46: 2429 FMADD f0, f16, f20, f0 2430 FMADD f1, f17, f20, f1 2431 FMADD f2, f18, f20, f2 2432 FMADD f3, f19, f20, f3 2433 LFDU f20, 2 * SIZE(BO) 2434 2435 FMADD f4, f16, f21, f4 2436 LFDU f16, 4 * SIZE(AO) 2437 FMADD f5, f17, f21, f5 2438 LFD f17, 1 * SIZE(AO) 2439 FMADD f6, f18, f21, f6 2440 LFD f18, 2 * SIZE(AO) 2441 FMADD f7, f19, f21, f7 2442 LFD f19, 3 * SIZE(AO) 2443 LFD f21, 1 * SIZE(BO) 2444 bdnz .L46 2445 .align 4 2446 2447.L48: 2448#if defined(LN) || defined(RT) 2449#ifdef LN 2450 subi r0, KK, 4 2451#else 2452 subi r0, KK, 2 2453#endif 2454 slwi TEMP, r0, 2 + BASE_SHIFT 2455 slwi r0, r0, 1 + BASE_SHIFT 2456 add AO, AORIG, TEMP 2457 add BO, B, r0 2458#endif 2459 2460#if defined(LN) || defined(LT) 2461 LFD f16, 0 * SIZE(BO) 2462 LFD f17, 1 * SIZE(BO) 2463 LFD f20, 2 * SIZE(BO) 2464 LFD f21, 3 * SIZE(BO) 2465 2466 LFD f24, 4 * SIZE(BO) 2467 LFD f25, 5 * SIZE(BO) 2468 LFD f28, 6 * SIZE(BO) 2469 LFD f29, 7 * SIZE(BO) 2470 2471 FSUB f0, f16, f0 2472 FSUB f4, f17, f4 2473 FSUB f1, f20, f1 2474 FSUB f5, f21, f5 2475 2476 FSUB f2, f24, f2 2477 FSUB f6, f25, f6 2478 FSUB f3, f28, f3 2479 FSUB f7, f29, f7 2480#else 2481 LFD f16, 0 * SIZE(AO) 2482 LFD f17, 1 * SIZE(AO) 2483 LFD f18, 2 * SIZE(AO) 2484 LFD f19, 3 * SIZE(AO) 2485 2486 LFD f20, 4 * SIZE(AO) 2487 LFD f21, 5 * SIZE(AO) 2488 LFD f22, 6 * SIZE(AO) 2489 LFD f23, 7 * SIZE(AO) 2490 2491 FSUB f0, f16, f0 2492 FSUB f1, f17, f1 2493 FSUB f2, f18, f2 2494 FSUB f3, f19, f3 2495 2496 FSUB f4, f20, f4 2497 FSUB f5, f21, f5 2498 FSUB f6, f22, f6 2499 FSUB f7, f23, f7 2500#endif 2501 2502#ifdef LN 2503 LFD f16, 15 * 
SIZE(AO) 2504 LFD f17, 14 * SIZE(AO) 2505 LFD f18, 13 * SIZE(AO) 2506 LFD f19, 12 * SIZE(AO) 2507 2508 FMUL f3, f16, f3 2509 FMUL f7, f16, f7 2510 FNMSUB f2, f17, f3, f2 2511 FNMSUB f6, f17, f7, f6 2512 FNMSUB f1, f18, f3, f1 2513 FNMSUB f5, f18, f7, f5 2514 FNMSUB f0, f19, f3, f0 2515 FNMSUB f4, f19, f7, f4 2516 2517 LFD f16, 10 * SIZE(AO) 2518 LFD f17, 9 * SIZE(AO) 2519 LFD f18, 8 * SIZE(AO) 2520 LFD f19, 5 * SIZE(AO) 2521 2522 LFD f20, 4 * SIZE(AO) 2523 LFD f21, 0 * SIZE(AO) 2524 2525 FMUL f2, f16, f2 2526 FMUL f6, f16, f6 2527 FNMSUB f1, f17, f2, f1 2528 FNMSUB f5, f17, f6, f5 2529 FNMSUB f0, f18, f2, f0 2530 FNMSUB f4, f18, f6, f4 2531 2532 FMUL f1, f19, f1 2533 FMUL f5, f19, f5 2534 FNMSUB f0, f20, f1, f0 2535 FNMSUB f4, f20, f5, f4 2536 FMUL f0, f21, f0 2537 FMUL f4, f21, f4 2538#endif 2539 2540#ifdef LT 2541 LFD f16, 0 * SIZE(AO) 2542 LFD f17, 1 * SIZE(AO) 2543 LFD f18, 2 * SIZE(AO) 2544 LFD f19, 3 * SIZE(AO) 2545 2546 FMUL f0, f16, f0 2547 FMUL f4, f16, f4 2548 FNMSUB f1, f17, f0, f1 2549 FNMSUB f5, f17, f4, f5 2550 2551 FNMSUB f2, f18, f0, f2 2552 FNMSUB f6, f18, f4, f6 2553 FNMSUB f3, f19, f0, f3 2554 FNMSUB f7, f19, f4, f7 2555 2556 LFD f17, 5 * SIZE(AO) 2557 LFD f18, 6 * SIZE(AO) 2558 LFD f19, 7 * SIZE(AO) 2559 2560 FMUL f1, f17, f1 2561 FMUL f5, f17, f5 2562 2563 FNMSUB f2, f18, f1, f2 2564 FNMSUB f6, f18, f5, f6 2565 2566 FNMSUB f3, f19, f1, f3 2567 FNMSUB f7, f19, f5, f7 2568 2569 LFD f18, 10 * SIZE(AO) 2570 LFD f19, 11 * SIZE(AO) 2571 2572 FMUL f2, f18, f2 2573 FMUL f6, f18, f6 2574 2575 FNMSUB f3, f19, f2, f3 2576 FNMSUB f7, f19, f6, f7 2577 2578 LFD f19, 15 * SIZE(AO) 2579 2580 FMUL f3, f19, f3 2581 FMUL f7, f19, f7 2582#endif 2583 2584#ifdef RN 2585 LFD f16, 0 * SIZE(BO) 2586 LFD f17, 1 * SIZE(BO) 2587 LFD f18, 3 * SIZE(BO) 2588 2589 FMUL f0, f16, f0 2590 FMUL f1, f16, f1 2591 FMUL f2, f16, f2 2592 FMUL f3, f16, f3 2593 2594 FNMSUB f4, f17, f0, f4 2595 FNMSUB f5, f17, f1, f5 2596 FNMSUB f6, f17, f2, f6 2597 FNMSUB f7, f17, f3, f7 2598 2599 FMUL 
f4, f18, f4 2600 FMUL f5, f18, f5 2601 FMUL f6, f18, f6 2602 FMUL f7, f18, f7 2603#endif 2604 2605#ifdef RT 2606 LFD f19, 3 * SIZE(BO) 2607 LFD f20, 2 * SIZE(BO) 2608 LFD f21, 0 * SIZE(BO) 2609 2610 FMUL f4, f19, f4 2611 FMUL f5, f19, f5 2612 FMUL f6, f19, f6 2613 FMUL f7, f19, f7 2614 2615 FNMSUB f0, f20, f4, f0 2616 FNMSUB f1, f20, f5, f1 2617 FNMSUB f2, f20, f6, f2 2618 FNMSUB f3, f20, f7, f3 2619 2620 FMUL f0, f21, f0 2621 FMUL f1, f21, f1 2622 FMUL f2, f21, f2 2623 FMUL f3, f21, f3 2624#endif 2625 2626#ifdef LN 2627 subi CO1, CO1, 4 * SIZE 2628 subi CO2, CO2, 4 * SIZE 2629#endif 2630 2631#if defined(LN) || defined(LT) 2632 STFD f0, 0 * SIZE(BO) 2633 STFD f4, 1 * SIZE(BO) 2634 STFD f1, 2 * SIZE(BO) 2635 STFD f5, 3 * SIZE(BO) 2636 2637 STFD f2, 4 * SIZE(BO) 2638 STFD f6, 5 * SIZE(BO) 2639 STFD f3, 6 * SIZE(BO) 2640 STFD f7, 7 * SIZE(BO) 2641#else 2642 STFD f0, 0 * SIZE(AO) 2643 STFD f1, 1 * SIZE(AO) 2644 STFD f2, 2 * SIZE(AO) 2645 STFD f3, 3 * SIZE(AO) 2646 2647 STFD f4, 4 * SIZE(AO) 2648 STFD f5, 5 * SIZE(AO) 2649 STFD f6, 6 * SIZE(AO) 2650 STFD f7, 7 * SIZE(AO) 2651#endif 2652 2653 STFD f0, 0 * SIZE(CO1) 2654 STFD f1, 1 * SIZE(CO1) 2655 STFD f2, 2 * SIZE(CO1) 2656 STFD f3, 3 * SIZE(CO1) 2657 2658 STFD f4, 0 * SIZE(CO2) 2659 STFD f5, 1 * SIZE(CO2) 2660 STFD f6, 2 * SIZE(CO2) 2661 STFD f7, 3 * SIZE(CO2) 2662 2663 lfs f0, FZERO 2664 fmr f1, f0 2665 fmr f2, f0 2666 fmr f3, f0 2667 2668 fmr f4, f0 2669 fmr f5, f0 2670 fmr f6, f0 2671 fmr f7, f0 2672 2673#ifndef LN 2674 addi CO1, CO1, 4 * SIZE 2675 addi CO2, CO2, 4 * SIZE 2676#endif 2677 2678#ifdef RT 2679 slwi r0, K, 2 + BASE_SHIFT 2680 add AORIG, AORIG, r0 2681#endif 2682 2683#if defined(LT) || defined(RN) 2684 sub TEMP, K, KK 2685 slwi r0, TEMP, 2 + BASE_SHIFT 2686 slwi TEMP, TEMP, 1 + BASE_SHIFT 2687 add AO, AO, r0 2688 add BO, BO, TEMP 2689#endif 2690 2691#ifdef LN 2692 subi KK, KK, 4 2693#endif 2694 2695#ifdef LT 2696 addi KK, KK, 4 2697#endif 2698 2699 addic. 
I, I, -1 2700 bgt+ .L42 2701 .align 4 2702 2703.L69: 2704#ifdef LN 2705 slwi r0, K, 1 + BASE_SHIFT 2706 add B, B, r0 2707#endif 2708 2709#if defined(LT) || defined(RN) 2710 mr B, BO 2711#endif 2712 2713#ifdef RN 2714 addi KK, KK, 2 2715#endif 2716 2717#ifdef RT 2718 subi KK, KK, 2 2719#endif 2720 lfs f0, FZERO 2721 .align 4 2722 2723.L70: 2724 andi. J, N, 1 2725 ble .L999 2726 2727#ifdef RT 2728 slwi r0, K, 0 + BASE_SHIFT 2729 sub B, B, r0 2730 2731 sub C, C, LDC 2732#endif 2733 2734 mr CO1, C 2735 2736#ifdef LN 2737 add KK, M, OFFSET 2738#endif 2739 2740#ifdef LT 2741 mr KK, OFFSET 2742#endif 2743 2744 fmr f1, f0 2745 fmr f2, f0 2746 fmr f3, f0 2747 2748#if defined(LN) || defined(RT) 2749 mr AORIG, A 2750#else 2751 mr AO, A 2752#endif 2753#ifndef RT 2754 add C, CO1, LDC 2755#endif 2756 .align 4 2757 2758.L90: 2759 andi. I, M, 1 2760 ble .L80 2761 2762#if defined(LT) || defined(RN) 2763 LFD f16, 0 * SIZE(AO) 2764 LFD f17, 1 * SIZE(AO) 2765 LFD f18, 2 * SIZE(AO) 2766 LFD f19, 3 * SIZE(AO) 2767 2768 LFD f20, 0 * SIZE(B) 2769 LFD f21, 1 * SIZE(B) 2770 LFD f22, 2 * SIZE(B) 2771 LFD f23, 3 * SIZE(B) 2772 2773 srawi. r0, KK, 3 2774 mtspr CTR, r0 2775 mr BO, B 2776#else 2777 2778#ifdef LN 2779 slwi r0, K, BASE_SHIFT 2780 sub AORIG, AORIG, r0 2781#endif 2782 2783 slwi r0, KK, 0 + BASE_SHIFT 2784 slwi TEMP, KK, 0 + BASE_SHIFT 2785 add AO, AORIG, r0 2786 add BO, B, TEMP 2787 2788 sub TEMP, K, KK 2789 2790 LFD f16, 0 * SIZE(AO) 2791 LFD f17, 1 * SIZE(AO) 2792 LFD f18, 2 * SIZE(AO) 2793 LFD f19, 3 * SIZE(AO) 2794 2795 LFD f20, 0 * SIZE(BO) 2796 LFD f21, 1 * SIZE(BO) 2797 LFD f22, 2 * SIZE(BO) 2798 LFD f23, 3 * SIZE(BO) 2799 2800 srawi. 
r0, TEMP, 3 2801 mtspr CTR, r0 2802#endif 2803 ble .L95 2804 .align 5 2805 2806.L92: 2807 FMADD f0, f16, f20, f0 2808 LFD f16, 4 * SIZE(AO) 2809 LFD f20, 4 * SIZE(BO) 2810 FMADD f1, f17, f21, f1 2811 LFD f17, 5 * SIZE(AO) 2812 LFD f21, 5 * SIZE(BO) 2813 FMADD f2, f18, f22, f2 2814 LFD f18, 6 * SIZE(AO) 2815 LFD f22, 6 * SIZE(BO) 2816 FMADD f3, f19, f23, f3 2817 LFD f19, 7 * SIZE(AO) 2818 LFD f23, 7 * SIZE(BO) 2819 2820 FMADD f0, f16, f20, f0 2821 LFDU f16, 8 * SIZE(AO) 2822 LFDU f20, 8 * SIZE(BO) 2823 FMADD f1, f17, f21, f1 2824 LFD f17, 1 * SIZE(AO) 2825 LFD f21, 1 * SIZE(BO) 2826 FMADD f2, f18, f22, f2 2827 LFD f18, 2 * SIZE(AO) 2828 LFD f22, 2 * SIZE(BO) 2829 FMADD f3, f19, f23, f3 2830 LFD f19, 3 * SIZE(AO) 2831 LFD f23, 3 * SIZE(BO) 2832 bdnz .L92 2833 .align 4 2834 2835.L95: 2836#if defined(LT) || defined(RN) 2837 andi. r0, KK, 7 2838#else 2839 andi. r0, TEMP, 7 2840#endif 2841 mtspr CTR, r0 2842 ble+ .L98 2843 .align 4 2844 2845.L96: 2846 FMADD f0, f16, f20, f0 2847 LFDU f16, 1 * SIZE(AO) 2848 LFDU f20, 1 * SIZE(BO) 2849 bdnz .L96 2850 .align 4 2851 2852.L98: 2853 FADD f0, f1, f0 2854 FADD f2, f3, f2 2855 FADD f0, f2, f0 2856 2857#if defined(LN) || defined(RT) 2858#ifdef LN 2859 subi r0, KK, 1 2860#else 2861 subi r0, KK, 1 2862#endif 2863 slwi TEMP, r0, 0 + BASE_SHIFT 2864 slwi r0, r0, 0 + BASE_SHIFT 2865 add AO, AORIG, TEMP 2866 add BO, B, r0 2867#endif 2868 2869#if defined(LN) || defined(LT) 2870 LFD f16, 0 * SIZE(BO) 2871 FSUB f0, f16, f0 2872#else 2873 LFD f16, 0 * SIZE(AO) 2874 FSUB f0, f16, f0 2875#endif 2876 2877#ifdef LN 2878 LFD f21, 0 * SIZE(AO) 2879 FMUL f0, f21, f0 2880#endif 2881 2882#ifdef LT 2883 LFD f16, 0 * SIZE(AO) 2884 FMUL f0, f16, f0 2885#endif 2886 2887#ifdef RN 2888 LFD f16, 0 * SIZE(BO) 2889 FMUL f0, f16, f0 2890#endif 2891 2892#ifdef RT 2893 LFD f21, 0 * SIZE(BO) 2894 FMUL f0, f21, f0 2895#endif 2896 2897#ifdef LN 2898 subi CO1, CO1, 1 * SIZE 2899#endif 2900 2901#if defined(LN) || defined(LT) 2902 STFD f0, 0 * SIZE(BO) 2903#else 2904 
STFD f0, 0 * SIZE(AO) 2905#endif 2906 2907 STFD f0, 0 * SIZE(CO1) 2908 2909 lfs f0, FZERO 2910 fmr f1, f0 2911 fmr f2, f0 2912 fmr f3, f0 2913 2914#ifndef LN 2915 addi CO1, CO1, 1 * SIZE 2916#endif 2917 2918#ifdef RT 2919 slwi r0, K, 0 + BASE_SHIFT 2920 add AORIG, AORIG, r0 2921#endif 2922 2923#if defined(LT) || defined(RN) 2924 sub TEMP, K, KK 2925 slwi r0, TEMP, 0 + BASE_SHIFT 2926 slwi TEMP, TEMP, 0 + BASE_SHIFT 2927 add AO, AO, r0 2928 add BO, BO, TEMP 2929#endif 2930 2931#ifdef LN 2932 subi KK, KK, 1 2933#endif 2934 2935#ifdef LT 2936 addi KK, KK, 1 2937#endif 2938 .align 4 2939 2940.L80: 2941 andi. I, M, 2 2942 ble .L71 2943 2944#if defined(LT) || defined(RN) 2945 LFD f16, 0 * SIZE(AO) 2946 LFD f17, 1 * SIZE(AO) 2947 LFD f18, 2 * SIZE(AO) 2948 LFD f19, 3 * SIZE(AO) 2949 2950 LFD f20, 0 * SIZE(B) 2951 LFD f21, 1 * SIZE(B) 2952 LFD f22, 2 * SIZE(B) 2953 LFD f23, 3 * SIZE(B) 2954 2955 srawi. r0, KK, 2 2956 mtspr CTR, r0 2957 mr BO, B 2958#else 2959 2960#ifdef LN 2961 slwi r0, K, 1 + BASE_SHIFT 2962 sub AORIG, AORIG, r0 2963#endif 2964 2965 slwi r0, KK, 1 + BASE_SHIFT 2966 slwi TEMP, KK, 0 + BASE_SHIFT 2967 add AO, AORIG, r0 2968 add BO, B, TEMP 2969 2970 sub TEMP, K, KK 2971 2972 LFD f16, 0 * SIZE(AO) 2973 LFD f17, 1 * SIZE(AO) 2974 LFD f18, 2 * SIZE(AO) 2975 LFD f19, 3 * SIZE(AO) 2976 2977 LFD f20, 0 * SIZE(BO) 2978 LFD f21, 1 * SIZE(BO) 2979 LFD f22, 2 * SIZE(BO) 2980 LFD f23, 3 * SIZE(BO) 2981 2982 srawi. 
r0, TEMP, 2 2983 mtspr CTR, r0 2984#endif 2985 ble .L85 2986 .align 5 2987 2988.L82: 2989 FMADD f0, f16, f20, f0 2990 LFD f16, 4 * SIZE(AO) 2991 FMADD f1, f17, f20, f1 2992 LFDU f20, 4 * SIZE(BO) 2993 LFD f17, 5 * SIZE(AO) 2994 FMADD f2, f18, f21, f2 2995 LFD f18, 6 * SIZE(AO) 2996 FMADD f3, f19, f21, f3 2997 LFD f21, 1 * SIZE(BO) 2998 LFD f19, 7 * SIZE(AO) 2999 3000 FMADD f0, f16, f22, f0 3001 LFDU f16, 8 * SIZE(AO) 3002 FMADD f1, f17, f22, f1 3003 LFD f22, 2 * SIZE(BO) 3004 LFD f17, 1 * SIZE(AO) 3005 FMADD f2, f18, f23, f2 3006 LFD f18, 2 * SIZE(AO) 3007 FMADD f3, f19, f23, f3 3008 LFD f23, 3 * SIZE(BO) 3009 LFD f19, 3 * SIZE(AO) 3010 bdnz .L82 3011 .align 4 3012 3013.L85: 3014#if defined(LT) || defined(RN) 3015 andi. r0, KK, 3 3016#else 3017 andi. r0, TEMP, 3 3018#endif 3019 mtspr CTR, r0 3020 ble+ .L88 3021 .align 4 3022 3023.L86: 3024 FMADD f0, f16, f20, f0 3025 LFDU f16, 2 * SIZE(AO) 3026 FMADD f1, f17, f20, f1 3027 LFDU f20, 1 * SIZE(BO) 3028 LFD f17, 1 * SIZE(AO) 3029 bdnz .L86 3030 .align 4 3031 3032.L88: 3033 FADD f0, f2, f0 3034 FADD f1, f3, f1 3035 3036#if defined(LN) || defined(RT) 3037#ifdef LN 3038 subi r0, KK, 2 3039#else 3040 subi r0, KK, 1 3041#endif 3042 slwi TEMP, r0, 1 + BASE_SHIFT 3043 slwi r0, r0, 0 + BASE_SHIFT 3044 add AO, AORIG, TEMP 3045 add BO, B, r0 3046#endif 3047 3048#if defined(LN) || defined(LT) 3049 LFD f16, 0 * SIZE(BO) 3050 LFD f20, 1 * SIZE(BO) 3051 3052 FSUB f0, f16, f0 3053 FSUB f1, f20, f1 3054#else 3055 LFD f16, 0 * SIZE(AO) 3056 LFD f17, 1 * SIZE(AO) 3057 3058 FSUB f0, f16, f0 3059 FSUB f1, f17, f1 3060#endif 3061 3062#ifdef LN 3063 LFD f19, 3 * SIZE(AO) 3064 LFD f20, 2 * SIZE(AO) 3065 LFD f21, 0 * SIZE(AO) 3066 3067 FMUL f1, f19, f1 3068 FNMSUB f0, f20, f1, f0 3069 FMUL f0, f21, f0 3070#endif 3071 3072#ifdef LT 3073 LFD f16, 0 * SIZE(AO) 3074 LFD f17, 1 * SIZE(AO) 3075 3076 FMUL f0, f16, f0 3077 FNMSUB f1, f17, f0, f1 3078 3079 LFD f17, 3 * SIZE(AO) 3080 FMUL f1, f17, f1 3081#endif 3082 3083#ifdef RN 3084 LFD f16, 0 * 
SIZE(BO) 3085 3086 FMUL f0, f16, f0 3087 FMUL f1, f16, f1 3088#endif 3089 3090#ifdef RT 3091 LFD f21, 0 * SIZE(BO) 3092 3093 FMUL f0, f21, f0 3094 FMUL f1, f21, f1 3095#endif 3096 3097#ifdef LN 3098 subi CO1, CO1, 2 * SIZE 3099#endif 3100 3101#if defined(LN) || defined(LT) 3102 STFD f0, 0 * SIZE(BO) 3103 STFD f1, 1 * SIZE(BO) 3104#else 3105 STFD f0, 0 * SIZE(AO) 3106 STFD f1, 1 * SIZE(AO) 3107#endif 3108 3109 STFD f0, 0 * SIZE(CO1) 3110 STFD f1, 1 * SIZE(CO1) 3111 3112 lfs f0, FZERO 3113 fmr f1, f0 3114 fmr f2, f0 3115 fmr f3, f0 3116 3117#ifndef LN 3118 addi CO1, CO1, 2 * SIZE 3119#endif 3120 3121#ifdef RT 3122 slwi r0, K, 1 + BASE_SHIFT 3123 add AORIG, AORIG, r0 3124#endif 3125 3126#if defined(LT) || defined(RN) 3127 sub TEMP, K, KK 3128 slwi r0, TEMP, 1 + BASE_SHIFT 3129 slwi TEMP, TEMP, 0 + BASE_SHIFT 3130 add AO, AO, r0 3131 add BO, BO, TEMP 3132#endif 3133 3134#ifdef LN 3135 subi KK, KK, 2 3136#endif 3137 3138#ifdef LT 3139 addi KK, KK, 2 3140#endif 3141 .align 4 3142 3143.L71: 3144 srawi. I, M, 2 3145 ble .L999 3146 .align 4 3147 3148.L72: 3149#if defined(LT) || defined(RN) 3150 LFD f16, 0 * SIZE(AO) 3151 LFD f17, 1 * SIZE(AO) 3152 LFD f18, 2 * SIZE(AO) 3153 LFD f19, 3 * SIZE(AO) 3154 3155 LFD f20, 0 * SIZE(B) 3156 LFD f21, 1 * SIZE(B) 3157 LFD f22, 2 * SIZE(B) 3158 LFD f23, 3 * SIZE(B) 3159 3160 srawi. r0, KK, 2 3161 mtspr CTR, r0 3162 mr BO, B 3163#else 3164 3165#ifdef LN 3166 slwi r0, K, 2 + BASE_SHIFT 3167 sub AORIG, AORIG, r0 3168#endif 3169 3170 slwi r0, KK, 2 + BASE_SHIFT 3171 slwi TEMP, KK, 0 + BASE_SHIFT 3172 add AO, AORIG, r0 3173 add BO, B, TEMP 3174 3175 sub TEMP, K, KK 3176 3177 LFD f16, 0 * SIZE(AO) 3178 LFD f17, 1 * SIZE(AO) 3179 LFD f18, 2 * SIZE(AO) 3180 LFD f19, 3 * SIZE(AO) 3181 3182 LFD f20, 0 * SIZE(BO) 3183 LFD f21, 1 * SIZE(BO) 3184 LFD f22, 2 * SIZE(BO) 3185 LFD f23, 3 * SIZE(BO) 3186 3187 srawi. 
r0, TEMP, 2 3188 mtspr CTR, r0 3189#endif 3190 ble .L75 3191 .align 5 3192 3193.L73: 3194 FMADD f0, f16, f20, f0 3195 LFD f16, 4 * SIZE(AO) 3196 FMADD f1, f17, f20, f1 3197 LFD f17, 5 * SIZE(AO) 3198 FMADD f2, f18, f20, f2 3199 LFD f18, 6 * SIZE(AO) 3200 FMADD f3, f19, f20, f3 3201 LFD f19, 7 * SIZE(AO) 3202 LFDU f20, 4 * SIZE(BO) 3203 3204 FMADD f0, f16, f21, f0 3205 LFD f16, 8 * SIZE(AO) 3206 FMADD f1, f17, f21, f1 3207 LFD f17, 9 * SIZE(AO) 3208 FMADD f2, f18, f21, f2 3209 LFD f18, 10 * SIZE(AO) 3210 FMADD f3, f19, f21, f3 3211 LFD f19, 11 * SIZE(AO) 3212 LFD f21, 1 * SIZE(BO) 3213 3214 FMADD f0, f16, f22, f0 3215 LFD f16, 12 * SIZE(AO) 3216 FMADD f1, f17, f22, f1 3217 LFD f17, 13 * SIZE(AO) 3218 FMADD f2, f18, f22, f2 3219 LFD f18, 14 * SIZE(AO) 3220 FMADD f3, f19, f22, f3 3221 LFD f19, 15 * SIZE(AO) 3222 LFD f22, 2 * SIZE(BO) 3223 3224 FMADD f0, f16, f23, f0 3225 LFDU f16, 16 * SIZE(AO) 3226 FMADD f1, f17, f23, f1 3227 LFD f17, 1 * SIZE(AO) 3228 FMADD f2, f18, f23, f2 3229 LFD f18, 2 * SIZE(AO) 3230 FMADD f3, f19, f23, f3 3231 LFD f19, 3 * SIZE(AO) 3232 LFD f23, 3 * SIZE(BO) 3233 bdnz .L73 3234 .align 4 3235 3236.L75: 3237#if defined(LT) || defined(RN) 3238 andi. r0, KK, 3 3239#else 3240 andi. 
r0, TEMP, 3 3241#endif 3242 mtspr CTR, r0 3243 ble+ .L78 3244 .align 4 3245 /* .L76: loop counted by CTR (loaded from r0 just above, closed by bdnz). Each pass accumulates f0..f3 += {f16, f17, f18, f19} * f20, then reloads f16..f19 from AO and f20 from BO for the next pass. NOTE(review): LFDU is a macro defined outside this view, presumably the load-with-update form that also advances AO / BO - confirm in common.h. */ 3246.L76: 3247 FMADD f0, f16, f20, f0 3248 LFDU f16, 4 * SIZE(AO) 3249 FMADD f1, f17, f20, f1 3250 LFD f17, 1 * SIZE(AO) 3251 FMADD f2, f18, f20, f2 3252 LFD f18, 2 * SIZE(AO) 3253 FMADD f3, f19, f20, f3 3254 LFDU f20, 1 * SIZE(BO) 3255 LFD f19, 3 * SIZE(AO) 3256 bdnz .L76 3257 .align 4 3258 /* .L78: reached when the loop finishes, or directly through the ble+ above. For LN / RT the operand pointers are re-derived: r0 = KK - 4 (LN) or KK - 1 (otherwise), AO = AORIG + (r0 << (2 + BASE_SHIFT)), BO = B + (r0 << BASE_SHIFT). */ 3259.L78: 3260#if defined(LN) || defined(RT) 3261#ifdef LN 3262 subi r0, KK, 4 3263#else 3264 subi r0, KK, 1 3265#endif 3266 slwi TEMP, r0, 2 + BASE_SHIFT 3267 slwi r0, r0, 0 + BASE_SHIFT 3268 add AO, AORIG, TEMP 3269 add BO, B, r0 3270#endif 3271 /* Load four values from BO (LN / LT) or from AO (otherwise) and replace each accumulator with loaded - accumulated. Assumes the FSUB macro keeps the PowerPC fsub operand order (frD = frA - frB) - TODO confirm. */ 3272#if defined(LN) || defined(LT) 3273 LFD f16, 0 * SIZE(BO) 3274 LFD f20, 1 * SIZE(BO) 3275 LFD f24, 2 * SIZE(BO) 3276 LFD f28, 3 * SIZE(BO) 3277 3278 FSUB f0, f16, f0 3279 FSUB f1, f20, f1 3280 FSUB f2, f24, f2 3281 FSUB f3, f28, f3 3282#else 3283 LFD f16, 0 * SIZE(AO) 3284 LFD f17, 1 * SIZE(AO) 3285 LFD f18, 2 * SIZE(AO) 3286 LFD f19, 3 * SIZE(AO) 3287 3288 FSUB f0, f16, f0 3289 FSUB f1, f17, f1 3290 FSUB f2, f18, f2 3291 FSUB f3, f19, f3 3292#endif 3293 /* LN: solve f3 first and work back to f0, reading AO elements 15,14,13,12 then 10,9,8 then 5,4 then 0. Each FMUL scales by the element at index 15 / 10 / 5 / 0 (these would be the diagonal of a 4x4 tile, presumably stored pre-inverted since it is multiplied rather than divided - verify against the packing code). Each FNMSUB removes the just-solved value's contribution from a remaining one (x = x - a * solved, assuming the macro maps to fnmsub). */ 3294#ifdef LN 3295 LFD f16, 15 * SIZE(AO) 3296 LFD f17, 14 * SIZE(AO) 3297 LFD f18, 13 * SIZE(AO) 3298 LFD f19, 12 * SIZE(AO) 3299 3300 FMUL f3, f16, f3 3301 FNMSUB f2, f17, f3, f2 3302 FNMSUB f1, f18, f3, f1 3303 FNMSUB f0, f19, f3, f0 3304 3305 LFD f16, 10 * SIZE(AO) 3306 LFD f17, 9 * SIZE(AO) 3307 LFD f18, 8 * SIZE(AO) 3308 LFD f19, 5 * SIZE(AO) 3309 3310 LFD f20, 4 * SIZE(AO) 3311 LFD f21, 0 * SIZE(AO) 3312 3313 FMUL f2, f16, f2 3314 FNMSUB f1, f17, f2, f1 3315 FNMSUB f0, f18, f2, f0 3316 3317 FMUL f1, f19, f1 3318 FNMSUB f0, f20, f1, f0 3319 FMUL f0, f21, f0 3320#endif 3321 /* LT: the same elimination in the opposite order, f0 first and f3 last, reading AO elements 0..3 then 5..7 then 10,11 then 15. */ 3322#ifdef LT 3323 LFD f16, 0 * SIZE(AO) 3324 LFD f17, 1 * SIZE(AO) 3325 LFD f18, 2 * SIZE(AO) 3326 LFD f19, 3 * SIZE(AO) 3327 3328 FMUL f0, f16, f0 3329 FNMSUB f1, f17, f0, f1 3330 FNMSUB f2, f18, f0, f2 3331 FNMSUB f3, f19, f0, f3 3332 3333 LFD f17, 5 * SIZE(AO) 3334 LFD f18, 6 * SIZE(AO) 3335 LFD f19, 7 * SIZE(AO) 3336 3337 FMUL 
f1, f17, f1 3338 FNMSUB f2, f18, f1, f2 3339 FNMSUB f3, f19, f1, f3 3340 3341 LFD f18, 10 * SIZE(AO) 3342 LFD f19, 11 * SIZE(AO) 3343 3344 FMUL f2, f18, f2 3345 FNMSUB f3, f19, f2, f3 3346 3347 LFD f19, 15 * SIZE(AO) 3348 3349 FMUL f3, f19, f3 3350#endif 3351 /* RN / RT: scale all four results by the single value loaded from 0 * SIZE(BO). */ 3352#ifdef RN 3353 LFD f16, 0 * SIZE(BO) 3354 3355 FMUL f0, f16, f0 3356 FMUL f1, f16, f1 3357 FMUL f2, f16, f2 3358 FMUL f3, f16, f3 3359#endif 3360 3361#ifdef RT 3362 LFD f21, 0 * SIZE(BO) 3363 3364 FMUL f0, f21, f0 3365 FMUL f1, f21, f1 3366 FMUL f2, f21, f2 3367 FMUL f3, f21, f3 3368#endif 3369 /* LN moves CO1 back by four elements before the store; every other variant advances CO1 by four elements after it. */ 3370#ifdef LN 3371 subi CO1, CO1, 4 * SIZE 3372#endif 3373 /* Write the four solved values back into the buffer they were loaded from (BO for LN / LT, AO otherwise) ... */ 3374#if defined(LN) || defined(LT) 3375 STFD f0, 0 * SIZE(BO) 3376 STFD f1, 1 * SIZE(BO) 3377 STFD f2, 2 * SIZE(BO) 3378 STFD f3, 3 * SIZE(BO) 3379#else 3380 STFD f0, 0 * SIZE(AO) 3381 STFD f1, 1 * SIZE(AO) 3382 STFD f2, 2 * SIZE(AO) 3383 STFD f3, 3 * SIZE(AO) 3384#endif 3385 /* ... and into the four consecutive elements at CO1. */ 3386 STFD f0, 0 * SIZE(CO1) 3387 STFD f1, 1 * SIZE(CO1) 3388 STFD f2, 2 * SIZE(CO1) 3389 STFD f3, 3 * SIZE(CO1) 3390 /* Reload f0 from the FZERO stack slot and copy it into f1..f3, resetting the accumulators. FZERO is written outside this view; presumably it holds 0.0 - confirm in the prologue. */ 3391 lfs f0, FZERO 3392 fmr f1, f0 3393 fmr f2, f0 3394 fmr f3, f0 3395 3396#ifndef LN 3397 addi CO1, CO1, 4 * SIZE 3398#endif 3399 /* Per-variant pointer bookkeeping. RT: AORIG += K << (2 + BASE_SHIFT). LT / RN: AO += (K - KK) << (2 + BASE_SHIFT) and BO += (K - KK) << BASE_SHIFT. LN: KK -= 4. LT: KK += 4. */ 3400#ifdef RT 3401 slwi r0, K, 2 + BASE_SHIFT 3402 add AORIG, AORIG, r0 3403#endif 3404 3405#if defined(LT) || defined(RN) 3406 sub TEMP, K, KK 3407 slwi r0, TEMP, 2 + BASE_SHIFT 3408 slwi TEMP, TEMP, 0 + BASE_SHIFT 3409 add AO, AO, r0 3410 add BO, BO, TEMP 3411#endif 3412 3413#ifdef LN 3414 subi KK, KK, 4 3415#endif 3416 3417#ifdef LT 3418 addi KK, KK, 4 3419#endif 3420 /* Decrement the counter I with the recording form of addic, then branch back to .L72 while the result is still positive. */ 3421 addic. 
I, I, -1 3422 bgt+ .L72 3423 .align 4 3424 /* .L999: function exit. Sets r3 to 0 (addi with a literal 0 base), reloads the saved floating-point and integer registers from the stack frame, releases the STACKSIZE-byte frame and returns with blr. The offsets used here match the prologue stores visible at the top of the file. */ 3425.L999: 3426 addi r3, 0, 0 3427 /* f14..f31 are reloaded from 8-byte slots at 0..136(SP). */ 3428 lfd f14, 0(SP) 3429 lfd f15, 8(SP) 3430 lfd f16, 16(SP) 3431 lfd f17, 24(SP) 3432 3433 lfd f18, 32(SP) 3434 lfd f19, 40(SP) 3435 lfd f20, 48(SP) 3436 lfd f21, 56(SP) 3437 3438 lfd f22, 64(SP) 3439 lfd f23, 72(SP) 3440 lfd f24, 80(SP) 3441 lfd f25, 88(SP) 3442 3443 lfd f26, 96(SP) 3444 lfd f27, 104(SP) 3445 lfd f28, 112(SP) 3446 lfd f29, 120(SP) 3447 3448 lfd f30, 128(SP) 3449 lfd f31, 136(SP) 3450 /* r31 down to r18 are reloaded from 8-byte slots at 144..248(SP) on 64-bit builds, or from 4-byte slots at 144..196(SP) otherwise. */ 3451#ifdef __64BIT__ 3452 ld r31, 144(SP) 3453 ld r30, 152(SP) 3454 ld r29, 160(SP) 3455 ld r28, 168(SP) 3456 ld r27, 176(SP) 3457 ld r26, 184(SP) 3458 ld r25, 192(SP) 3459 ld r24, 200(SP) 3460 ld r23, 208(SP) 3461 ld r22, 216(SP) 3462 ld r21, 224(SP) 3463 ld r20, 232(SP) 3464 ld r19, 240(SP) 3465 ld r18, 248(SP) 3466#else 3467 lwz r31, 144(SP) 3468 lwz r30, 148(SP) 3469 lwz r29, 152(SP) 3470 lwz r28, 156(SP) 3471 lwz r27, 160(SP) 3472 lwz r26, 164(SP) 3473 lwz r25, 168(SP) 3474 lwz r24, 172(SP) 3475 lwz r23, 176(SP) 3476 lwz r22, 180(SP) 3477 lwz r21, 184(SP) 3478 lwz r20, 188(SP) 3479 lwz r19, 192(SP) 3480 lwz r18, 196(SP) 3481#endif 3482 /* Undo the frame allocation made by the prologue (addi SP, SP, -STACKSIZE) and return. */ 3483 addi SP, SP, STACKSIZE 3484 3485 blr 3486 3487 EPILOGUE 3488