/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
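
/* Complex single-precision GEMM kernel for PowerPC with AltiVec.    */
/* Two columns of C are produced per pass over N; M is processed    */
/* in vector tiles of 8, 4 and 2 complex elements, with a scalar    */
/* FPU tail for an odd final element.                               */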
#define ASSEMBLER
#include "common.h"

#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

#define ALIGN_SIZE 0xffff
#define SWAP      0
#define NEG      16
#define ALPHA_R  32
#define ALPHA_I  48
#define FZERO    64

#define M       r3
#define N       r4
#define K       r5

#ifdef linux
#ifndef __64BIT__
#define A       r6
#define B       r7
#define C       r8
#define LDC     r9
#else
#define A       r8
#define B       r9
#define C       r10
#define LDC     r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A       r10
#define B       r6
#define C       r7
#define LDC     r8
#else
#define A       r8
#define B       r9
#define C       r10
#define LDC     r6
#endif
#endif

#define STACK   r11

#define I       r21
#define J       r22
#define AO      r23
#define BO      r24
#define CO1     r25
#define CO2     r26

#define PREA    r29
#define PREB    r29
#define PREC    r30
#define VREG    r31

#define LOAD_A  lvx
#define LOAD_B  lvx

#define OFFSET_0   0
#define OFFSET_1 r14
#define OFFSET_2 r15
#define OFFSET_3 r16
#define OFFSET_4 r17
#define OFFSET_5 r18
#define OFFSET_6 r19
#define OFFSET_7 r20

#define c01     v0
#define c02     v1
#define c03     v2
#define c04     v3
#define c05     v4
#define c06     v5
#define c07     v6
#define c08     v7
#define c09     v8
#define c10     v9
#define c11     v10
#define c12     v11
#define c13     v12
#define c14     v13
#define c15     v14
#define c16     v15

#define a1      v16
#define a2      v17
#define a3      v18
#define a4      v19
#define a5      v20
#define a6      v21
#define a7      v22
#define a8      v23

#define b1      v24
#define b2      v25
#define bp1     v26
#define bp2     v27

#define C1      v16
#define C2      v17
#define C3      v18
#define C4      v19
#define C5      v20

#define c00     v24

#define VZERO       v25
#define PERMRSHIFT1 v26
#define PERMRSHIFT2 v27

#define swap    v28
#define neg     v29
#define alpha_r v30
#define alpha_i v31

#ifndef NEEDPARAM

        PROLOGUE
        PROFCODE

        addi    SP, SP, -STACKSIZE
        mr      STACK, SP

        li      r0,  0 * 16
        stvx    v20, SP, r0
        li      r0,  1 * 16
        stvx    v21, SP, r0
        li      r0,  2 * 16
        stvx    v22, SP, r0
        li      r0,  3 * 16
        stvx    v23, SP, r0
        li      r0,  4 * 16
        stvx    v24, SP, r0
        li      r0,  5 * 16
        stvx    v25, SP, r0
        li      r0,  6 * 16
        stvx    v26, SP, r0
        li      r0,  7 * 16
        stvx    v27, SP, r0
        li      r0,  8 * 16
        stvx    v28, SP, r0
        li      r0,  9 * 16
        stvx    v29, SP, r0
        li      r0, 10 * 16
        stvx    v30, SP, r0
        li      r0, 11 * 16
        stvx    v31, SP, r0

#ifdef __64BIT__
        std     r31, 192(SP)
        std     r30, 200(SP)
        std     r29, 208(SP)
        std     r28, 216(SP)
        std     r27, 224(SP)
        std     r26, 232(SP)
        std     r25, 240(SP)
        std     r24, 248(SP)
        std     r23, 256(SP)
        std     r22, 264(SP)
        std     r21, 272(SP)
        std     r20, 280(SP)
        std     r19, 288(SP)
        std     r18, 296(SP)
        std     r17, 304(SP)
        std     r16, 312(SP)
        std     r15, 320(SP)
        std     r14, 328(SP)
#else
        stw     r31, 192(SP)
        stw     r30, 196(SP)
        stw     r29, 200(SP)
        stw     r28, 204(SP)
        stw     r27, 208(SP)
        stw     r26, 212(SP)
        stw     r25, 216(SP)
        stw     r24, 220(SP)
        stw     r23, 224(SP)
        stw     r22, 228(SP)
        stw     r21, 232(SP)
        stw     r20, 236(SP)
        stw     r19, 240(SP)
        stw     r18, 244(SP)
        stw     r17, 248(SP)
        stw     r16, 252(SP)
        stw     r15, 256(SP)
        stw     r14, 260(SP)
#endif
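
/* Pick up the arguments that arrive on the stack; which ones do    */
/* depends on the OS ABI, the address size, and DOUBLE.             */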
#ifdef linux
#ifdef __64BIT__
        ld      LDC, 112 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
        ld      LDC, 112 + STACKSIZE(SP)
#else
#ifdef DOUBLE
        lwz     B,   56 + STACKSIZE(SP)
        lwz     C,   60 + STACKSIZE(SP)
        lwz     LDC, 64 + STACKSIZE(SP)
#else
        lwz     LDC, 56 + STACKSIZE(SP)
#endif
#endif
#endif

#ifndef PREFETCHTEST
#ifdef PPC970
        li      PREC, 16 * SIZE
#endif
#else

#ifdef linux
#ifndef __64BIT__
        lwz     PREB, 16 + STACKSIZE(SP)
        lwz     PREC, 20 + STACKSIZE(SP)
#else
        ld      PREB, 136 + STACKSIZE(SP)
        ld      PREC, 144 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
        ld      PREB, 136 + STACKSIZE(SP)
        ld      PREC, 144 + STACKSIZE(SP)
#else
#ifdef DOUBLE
        lwz     PREB, 72 + STACKSIZE(SP)
        lwz     PREC, 76 + STACKSIZE(SP)
#else
        lwz     PREB, 68 + STACKSIZE(SP)
        lwz     PREC, 72 + STACKSIZE(SP)
#endif
#endif
#endif

#endif

#ifndef PREFETCHTEST
#ifdef CELL
        li      PREB, (3 * 32 * SIZE)
#else
        li      PREB, (5 * 32 * SIZE)
#endif
#endif

        li      r0, -1
        mfspr   VREG, VRsave

        mtspr   VRsave, r0

        addi    SP, SP, -128
        li      r0, -8192

        and     SP, SP, r0

        fneg    f3, f1
        fneg    f4, f2

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NC) || defined(TC) || defined(NR) || defined(TR)
        stfs    f1, ALPHA_R +  0(SP)
        stfs    f1, ALPHA_R +  4(SP)
        stfs    f1, ALPHA_R +  8(SP)
        stfs    f1, ALPHA_R + 12(SP)

        stfs    f4, ALPHA_I +  0(SP)
        stfs    f2, ALPHA_I +  4(SP)
        stfs    f4, ALPHA_I +  8(SP)
        stfs    f2, ALPHA_I + 12(SP)
#else
        stfs    f1, ALPHA_R +  0(SP)
        stfs    f3, ALPHA_R +  4(SP)
        stfs    f1, ALPHA_R +  8(SP)
        stfs    f3, ALPHA_R + 12(SP)

        stfs    f2, ALPHA_I +  0(SP)
        stfs    f2, ALPHA_I +  4(SP)
        stfs    f2, ALPHA_I +  8(SP)
        stfs    f2, ALPHA_I + 12(SP)
#endif

        li      I, Address_L(0x04050607)
        addis   I, I, Address_H(0x04050607)
        stw     I, SWAP +  0(SP)
        li      I, Address_L(0x00010203)
        addis   I, I, Address_H(0x00010203)
        stw     I, SWAP +  4(SP)
        li      I, Address_L(0x0c0d0e0f)
        addis   I, I, Address_H(0x0c0d0e0f)
        stw     I, SWAP +  8(SP)
        li      I, Address_L(0x08090a0b)
        addis   I, I, Address_H(0x08090a0b)
        stw     I, SWAP + 12(SP)

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        lis     I, 0x8000
        stw     I, NEG +  0(SP)
        stw     I, NEG +  8(SP)
        li      I, 0
        stw     I, NEG +  4(SP)
        stw     I, NEG + 12(SP)
#else
        li      I, 0
        stw     I, NEG +  0(SP)
        stw     I, NEG +  8(SP)
        lis     I, 0x8000
        stw     I, NEG +  4(SP)
        stw     I, NEG + 12(SP)
#endif

        li      r0, 0
        stw     r0, FZERO(SP)

        slwi    LDC, LDC, ZBASE_SHIFT

        li      OFFSET_1,  4 * SIZE
        li      OFFSET_2,  8 * SIZE
        li      OFFSET_3, 12 * SIZE
        li      OFFSET_4, 16 * SIZE
        li      OFFSET_5, 20 * SIZE
        li      OFFSET_6, 24 * SIZE
        li      OFFSET_7, 28 * SIZE

        cmpwi   cr0, M, 0
        ble     LL(999)
        cmpwi   cr0, N, 0
        ble     LL(999)
        cmpwi   cr0, K, 0
        ble     LL(999)

        srawi.  J, N, 1
        ble     LL(50)
        .align 4

LL(01):
        mr      CO1, C
        add     CO2, C, LDC
        add     C, CO2, LDC

        mr      AO, A
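
/* M in blocks of 8 complex elements: sixteen vector accumulators,  */
/* with the K loop unrolled by two.                                 */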
        srawi.  I, M, 3
        ble     LL(20)
        .align 4

LL(11):
        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        LOAD_B  b2, OFFSET_1, B
        vxor    c03, c03, c03
        LOAD_A  a1, OFFSET_0, AO
        vxor    c04, c04, c04
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c07, c07, c07
        LOAD_A  a5, OFFSET_4, AO
        vxor    c08, c08, c08

        vxor    c09, c09, c09
        dcbtst  CO1, PREC
        vxor    c10, c10, c10
        dcbtst  CO2, PREC
        vxor    c11, c11, c11
        vxor    c12, c12, c12
        vxor    c13, c13, c13
        mr      BO, B
        vxor    c14, c14, c14
        srawi.  r0, K, 1
        vxor    c15, c15, c15
        mtspr   CTR, r0
        vxor    c16, c16, c16
        vspltw  bp1, b1, 0
        ble     LL(15)
        .align 4

LL(12):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        DCBT(BO, PREB)
        vmaddfp c03, a3, bp1, c03
        nop
        vmaddfp c04, a4, bp1, c04
        LOAD_A  a6, OFFSET_5, AO

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
#ifdef CELL
        DCBT(AO, PREA)
#else
        nop
#endif
        vmaddfp c07, a3, bp2, c07
        nop
        vmaddfp c08, a4, bp2, c08
        LOAD_A  a7, OFFSET_6, AO

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_B  b1, OFFSET_2, BO
        vmaddfp c11, a3, bp1, c11
        nop
        vmaddfp c12, a4, bp1, c12
        LOAD_A  a8, OFFSET_7, AO

        vmaddfp c13, a1, bp2, c13
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        addi    AO, AO, 32 * SIZE
        vmaddfp c15, a3, bp2, c15
        nop
        vmaddfp c16, a4, bp2, c16
        LOAD_A  a1, OFFSET_0, AO

        vmaddfp c01, a5, bp1, c01
        vspltw  bp2, b2, 1
        vmaddfp c02, a6, bp1, c02
        nop
        vmaddfp c03, a7, bp1, c03
        nop
        vmaddfp c04, a8, bp1, c04
        LOAD_A  a2, OFFSET_1, AO

        vmaddfp c05, a5, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        nop
        vmaddfp c07, a7, bp2, c07
        nop
        vmaddfp c08, a8, bp2, c08
        LOAD_A  a3, OFFSET_2, AO

        vmaddfp c09, a5, bp1, c09
        vspltw  bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_B  b2, OFFSET_3, BO
        vmaddfp c11, a7, bp1, c11
        nop
        vmaddfp c12, a8, bp1, c12
        LOAD_A  a4, OFFSET_3, AO

        vmaddfp c13, a5, bp2, c13
        vspltw  bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        addi    BO, BO, 8 * SIZE
        vmaddfp c15, a7, bp2, c15
        LOAD_A  a5, OFFSET_4, AO
        vmaddfp c16, a8, bp2, c16
        bdnz+   LL(12)
        .align 4

LL(15):
        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP
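
/* If K is odd, fold in the one remaining update before writeback.  */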
        andi.   r0, K, 1
        ble+    LL(18)
        .align 4

LL(16):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop
        vmaddfp c03, a3, bp1, c03
        nop
        vmaddfp c04, a4, bp1, c04
        nop

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c07, a3, bp2, c07
        nop
        vmaddfp c08, a4, bp2, c08
        nop

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi    AO, AO, 16 * SIZE
        vmaddfp c11, a3, bp1, c11
        addi    BO, BO,  4 * SIZE
        vmaddfp c12, a4, bp1, c12
        nop

        vmaddfp c13, a1, bp2, c13
        vmaddfp c14, a2, bp2, c14
        vmaddfp c15, a3, bp2, c15
        vmaddfp c16, a4, bp2, c16
        .align 4

LL(18):
        vxor    VZERO, VZERO, VZERO

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap
        vperm   c07, c07, c07, swap
        vperm   c08, c08, c08, swap

        vperm   c13, c13, c13, swap
        vperm   c14, c14, c14, swap
        vperm   c15, c15, c15, swap
        vperm   c16, c16, c16, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg
        vxor    c07, c07, neg
        vxor    c08, c08, neg

        vxor    c13, c13, neg
        vxor    c14, c14, neg
        vxor    c15, c15, neg
        vxor    c16, c16, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06
        vaddfp  c03, c03, c07
        vaddfp  c04, c04, c08

        vaddfp  c09, c09, c13
        vaddfp  c10, c10, c14
        vaddfp  c11, c11, c15
        vaddfp  c12, c12, c16

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap
        vperm   c07, c03, c03, swap
        vperm   c08, c04, c04, swap

        vperm   c13, c09, c09, swap
        vperm   c14, c10, c10, swap
        vperm   c15, c11, c11, swap
        vperm   c16, c12, c12, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c03, alpha_r, c03, VZERO
        vmaddfp c04, alpha_r, c04, VZERO

        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        vmaddfp c03, alpha_i, c07, c03
        vmaddfp c04, alpha_i, c08, c04

        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c10, alpha_r, c10, VZERO
        vmaddfp c11, alpha_r, c11, VZERO
        vmaddfp c12, alpha_r, c12, VZERO

        vmaddfp c09, alpha_i, c13, c09
        vmaddfp c10, alpha_i, c14, c10
        vmaddfp c11, alpha_i, c15, c11
        vmaddfp c12, alpha_i, c16, c12

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1
        lvx     C4, OFFSET_3, CO1
        lvx     C5, OFFSET_4, CO1

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, c02, PERMRSHIFT1
        vperm   c02, c02, c03, PERMRSHIFT1
        vperm   c03, c03, c04, PERMRSHIFT1
        vperm   c04, c04, VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3
        vaddfp  c03, c03, C4
        vaddfp  c04, c04, C5

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1
        stvx    c03, OFFSET_3, CO1
        stvx    c04, OFFSET_4, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2
        lvx     C3, OFFSET_2, CO2
        lvx     C4, OFFSET_3, CO2
        lvx     C5, OFFSET_4, CO2

        vperm   c00, VZERO, c09, PERMRSHIFT2
        vperm   c09, c09, c10, PERMRSHIFT2
        vperm   c10, c10, c11, PERMRSHIFT2
        vperm   c11, c11, c12, PERMRSHIFT2
        vperm   c12, c12, VZERO, PERMRSHIFT2

        vaddfp  c00, c00, C1
        vaddfp  c09, c09, C2
        vaddfp  c10, c10, C3
        vaddfp  c11, c11, C4
        vaddfp  c12, c12, C5

        stvx    c00, OFFSET_0, CO2
        stvx    c09, OFFSET_1, CO2
        stvx    c10, OFFSET_2, CO2
        stvx    c11, OFFSET_3, CO2
        stvx    c12, OFFSET_4, CO2

        addi    CO1, CO1, 16 * SIZE
        addi    CO2, CO2, 16 * SIZE
        addic.  I, I, -1
        bgt+    LL(11)
        .align 4
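
/* M % 8 >= 4: one 4-complex-wide vector tile.                      */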
LL(20):
        andi.   I, M, 4
        ble     LL(30)

        vxor    c01, c01, c01
        LOAD_A  a1, OFFSET_0, AO
        vxor    c02, c02, c02
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c09, c09, c09
        LOAD_B  b1, OFFSET_0, B
        vxor    c10, c10, c10
        LOAD_B  b2, OFFSET_1, B
        vxor    c13, c13, c13
        vxor    c14, c14, c14
        mr      BO, B
        vspltw  bp1, b1, 0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(25)
        .align 4

LL(22):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        addi    AO, AO, 16 * SIZE
        vmaddfp c02, a2, bp1, c02
        addi    BO, BO, 8 * SIZE

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        LOAD_B  b1, OFFSET_0, BO
        vmaddfp c10, a2, bp1, c10

        vmaddfp c13, a1, bp2, c13
        LOAD_A  a1, OFFSET_0, AO
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a2, OFFSET_1, AO

        vmaddfp c01, a3, bp1, c01
        vspltw  bp2, b2, 1
        vmaddfp c02, a4, bp1, c02

        vmaddfp c05, a3, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a4, bp2, c06

        vmaddfp c09, a3, bp1, c09
        vspltw  bp2, b2, 3
        LOAD_B  b2, OFFSET_1, BO
        vmaddfp c10, a4, bp1, c10

        vmaddfp c13, a3, bp2, c13
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c14, a4, bp2, c14
        LOAD_A  a4, OFFSET_3, AO
        vspltw  bp1, b1, 0
        bdnz    LL(22)
        .align 4

LL(25):
        andi.   r0, K, 1
        ble+    LL(28)
        .align 4

LL(26):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi    AO, AO, 8 * SIZE

        vmaddfp c13, a1, bp2, c13
        addi    BO, BO, 4 * SIZE
        vmaddfp c14, a2, bp2, c14
        nop
        .align 4

LL(28):
        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap
        vperm   c13, c13, c13, swap
        vperm   c14, c14, c14, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg
        vxor    c13, c13, neg
        vxor    c14, c14, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06
        vaddfp  c09, c09, c13
        vaddfp  c10, c10, c14

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap
        vperm   c13, c09, c09, swap
        vperm   c14, c10, c10, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02

        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c10, alpha_r, c10, VZERO
        vmaddfp c09, alpha_i, c13, c09
        vmaddfp c10, alpha_i, c14, c10

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, c02, PERMRSHIFT1
        vperm   c02, c02, VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2
        lvx     C3, OFFSET_2, CO2

        vperm   c00, VZERO, c09, PERMRSHIFT2
        vperm   c09, c09, c10, PERMRSHIFT2
        vperm   c10, c10, VZERO, PERMRSHIFT2

        vaddfp  c00, c00, C1
        vaddfp  c09, c09, C2
        vaddfp  c10, c10, C3

        stvx    c00, OFFSET_0, CO2
        stvx    c09, OFFSET_1, CO2
        stvx    c10, OFFSET_2, CO2

        addi    CO1, CO1, 8 * SIZE
        addi    CO2, CO2, 8 * SIZE
        .align 4
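
/* M % 4 >= 2: one 2-complex-wide vector tile.                      */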
LL(30):
        andi.   I, M, 2
        ble     LL(40)

        vxor    c01, c01, c01
        LOAD_A  a1, OFFSET_0, AO
        vxor    c02, c02, c02
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_B  b1, OFFSET_0, B
        vxor    c06, c06, c06
        LOAD_B  b2, OFFSET_1, B
        vxor    c09, c09, c09
        vxor    c10, c10, c10
        vxor    c13, c13, c13
        vxor    c14, c14, c14

        vspltw  bp1, b1, 0
        mr      BO, B

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(35)
        .align 4

LL(32):
        vmaddfp c01, a1, bp1, c01
        addi    AO, AO, 8 * SIZE
        vspltw  bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        addi    BO, BO, 8 * SIZE
        vspltw  bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        LOAD_A  a1, OFFSET_0, AO
        vspltw  bp1, b2, 0
        LOAD_B  b1, OFFSET_0, BO

        vmaddfp c02, a2, bp1, c02
        vspltw  bp2, b2, 1
        vmaddfp c06, a2, bp2, c06
        vspltw  bp1, b2, 2
        vmaddfp c10, a2, bp1, c10
        vspltw  bp2, b2, 3
        LOAD_B  b2, OFFSET_1, BO
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a2, OFFSET_1, AO

        vspltw  bp1, b1, 0
        bdnz    LL(32)
        .align 4

LL(35):
        andi.   r0, K, 1
        ble+    LL(38)
        .align 4

LL(36):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 4 * SIZE
        .align 4

LL(38):
        vaddfp  c01, c01, c02
        vaddfp  c05, c05, c06
        vaddfp  c09, c09, c10
        vaddfp  c13, c13, c14

        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c13, c13, c13, swap

        vxor    c05, c05, neg
        vxor    c13, c13, neg

        vaddfp  c01, c01, c05
        vaddfp  c09, c09, c13

        vperm   c05, c01, c01, swap
        vperm   c13, c09, c09, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c01, alpha_i, c05, c01

        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c09, alpha_i, c13, c09

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2

        vperm   c00, VZERO, c09, PERMRSHIFT2
        vperm   c09, c09, VZERO, PERMRSHIFT2

        vaddfp  c00, c00, C1
        vaddfp  c09, c09, C2

        stvx    c00, OFFSET_0, CO2
        stvx    c09, OFFSET_1, CO2

        addi    CO1, CO1, 4 * SIZE
        addi    CO2, CO2, 4 * SIZE
        .align 4
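
/* M odd: the last complex element of both columns, in scalar code. */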
LL(40):
        andi.   I, M, 1
        ble     LL(49)

        mr      BO, B

        LFD     f8,  0 * SIZE(AO)
        LFD     f9,  1 * SIZE(AO)

        LFD     f10, 0 * SIZE(BO)
        LFD     f11, 1 * SIZE(BO)
        LFD     f12, 2 * SIZE(BO)
        LFD     f13, 3 * SIZE(BO)

        lfs     f0, FZERO(SP)
        fmr     f1, f0
        fmr     f2, f0
        fmr     f3, f0

        fmr     f4, f0
        fmr     f5, f0
        fmr     f6, f0
        fmr     f7, f0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(45)
        .align 4

LL(42):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f4, f8, f12, f4
        fmadd   f6, f8, f13, f6

        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        fmadd   f5, f9, f12, f5
        fmadd   f7, f9, f13, f7

        LFD     f8,  2 * SIZE(AO)
        LFD     f9,  3 * SIZE(AO)

        LFD     f10, 4 * SIZE(BO)
        LFD     f11, 5 * SIZE(BO)
        LFD     f12, 6 * SIZE(BO)
        LFD     f13, 7 * SIZE(BO)

        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f4, f8, f12, f4
        fmadd   f6, f8, f13, f6

        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        fmadd   f5, f9, f12, f5
        fmadd   f7, f9, f13, f7

        LFD     f8,  4 * SIZE(AO)
        LFD     f9,  5 * SIZE(AO)

        LFD     f10,  8 * SIZE(BO)
        LFD     f11,  9 * SIZE(BO)
        LFD     f12, 10 * SIZE(BO)
        LFD     f13, 11 * SIZE(BO)

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 8 * SIZE
        bdnz    LL(42)
        .align 4

LL(45):
        andi.   r0, K, 1
        ble     LL(48)
        .align 4

LL(46):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f4, f8, f12, f4
        fmadd   f6, f8, f13, f6

        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        fmadd   f5, f9, f12, f5
        fmadd   f7, f9, f13, f7

        addi    AO, AO, 2 * SIZE
        addi    BO, BO, 4 * SIZE
        .align 4

LL(48):
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
        fsub    f0, f0, f3
        fadd    f1, f1, f2
        fsub    f4, f4, f7
        fadd    f5, f5, f6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        fadd    f0, f0, f3
        fsub    f1, f1, f2
        fadd    f4, f4, f7
        fsub    f5, f5, f6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        fadd    f0, f0, f3
        fsub    f1, f2, f1
        fadd    f4, f4, f7
        fsub    f5, f6, f5
#else  /* RR, RC, CR, CC */
        fsub    f0, f0, f3
        fadd    f1, f1, f2
        fsub    f4, f4, f7
        fadd    f5, f5, f6
#endif

        LFD     f8,  0 * SIZE(CO1)
        LFD     f9,  1 * SIZE(CO1)
        LFD     f10, 0 * SIZE(CO2)
        LFD     f11, 1 * SIZE(CO2)

        lfs     f12, ALPHA_R + 0(SP)
        lfs     f13, ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
        fmadd   f8,  f12, f0, f8
        fnmsub  f9,  f12, f1, f9
        fmadd   f10, f12, f4, f10
        fnmsub  f11, f12, f5, f11

        fmadd   f8,  f13, f1, f8
        fmadd   f9,  f13, f0, f9
        fmadd   f10, f13, f5, f10
        fmadd   f11, f13, f4, f11
#else
        fmadd   f8,  f12, f0, f8
        fmadd   f9,  f12, f1, f9
        fmadd   f10, f12, f4, f10
        fmadd   f11, f12, f5, f11

        fnmsub  f8,  f13, f1, f8
        fmadd   f9,  f13, f0, f9
        fnmsub  f10, f13, f5, f10
        fmadd   f11, f13, f4, f11
#endif

        STFD    f8,  0 * SIZE(CO1)
        STFD    f9,  1 * SIZE(CO1)
        STFD    f10, 0 * SIZE(CO2)
        STFD    f11, 1 * SIZE(CO2)

LL(49):
        mr      B, BO

        addic.  J, J, -1
        bgt     LL(01)
        .align 4

LL(50):
        andi.   J, N, 1
        ble     LL(999)

        mr      CO1, C
        mr      AO, A

        srawi.  I, M, 3
        ble     LL(70)
        .align 4
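
/* N odd: one remaining column of C, M again in blocks of 8.        */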
LL(61):
        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        vxor    c03, c03, c03
        LOAD_A  a1, OFFSET_0, AO
        vxor    c04, c04, c04
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c07, c07, c07
        vxor    c08, c08, c08

        mr      BO, B
        dcbtst  CO1, PREC
        dcbtst  CO2, PREC

        vspltw  bp1, b1, 0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(65)
        .align 4

LL(62):
        LOAD_A  a5, OFFSET_4, AO
        LOAD_A  a6, OFFSET_5, AO
        LOAD_A  a7, OFFSET_6, AO
        LOAD_A  a8, OFFSET_7, AO

        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08

        vmaddfp c01, a5, bp1, c01
        vspltw  bp2, b1, 3
        vmaddfp c02, a6, bp1, c02
        vmaddfp c03, a7, bp1, c03
        vmaddfp c04, a8, bp1, c04

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c05, a5, bp2, c05
        vmaddfp c06, a6, bp2, c06
        vmaddfp c07, a7, bp2, c07
        vmaddfp c08, a8, bp2, c08

        addi    AO, AO, 32 * SIZE
        addi    BO, BO,  4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        bdnz    LL(62)
        .align 4

LL(65):
        andi.   r0, K, 1
        ble+    LL(68)
        .align 4

LL(66):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi    AO, AO, 16 * SIZE
        vmaddfp c03, a3, bp1, c03
        addi    BO, BO,  2 * SIZE
        vmaddfp c04, a4, bp1, c04
        nop

        vmaddfp c05, a1, bp2, c05
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08
        .align 4

LL(68):
        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap
        vperm   c07, c07, c07, swap
        vperm   c08, c08, c08, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg
        vxor    c07, c07, neg
        vxor    c08, c08, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06
        vaddfp  c03, c03, c07
        vaddfp  c04, c04, c08

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap
        vperm   c07, c03, c03, swap
        vperm   c08, c04, c04, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c03, alpha_r, c03, VZERO
        vmaddfp c04, alpha_r, c04, VZERO

        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        vmaddfp c03, alpha_i, c07, c03
        vmaddfp c04, alpha_i, c08, c04

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1
        lvx     C4, OFFSET_3, CO1
        lvx     C5, OFFSET_4, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, c02, PERMRSHIFT1
        vperm   c02, c02, c03, PERMRSHIFT1
        vperm   c03, c03, c04, PERMRSHIFT1
        vperm   c04, c04, VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3
        vaddfp  c03, c03, C4
        vaddfp  c04, c04, C5

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1
        stvx    c03, OFFSET_3, CO1
        stvx    c04, OFFSET_4, CO1

        addi    CO1, CO1, 16 * SIZE
        addic.  I, I, -1
        bgt+    LL(61)
        .align 4
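
/* Remaining column, M % 8 >= 4: 4-complex-wide tile.               */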
LL(70):
        andi.   I, M, 4
        ble     LL(80)

        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        vxor    c03, c03, c03
        LOAD_A  a1, OFFSET_0, AO
        vxor    c04, c04, c04
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c07, c07, c07
        vxor    c08, c08, c08

        mr      BO, B

        vspltw  bp1, b1, 0
        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(75)
        .align 4

LL(72):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06

        vmaddfp c03, a3, bp1, c03
        vspltw  bp2, b1, 3
        vmaddfp c04, a4, bp1, c04

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08

        addi    AO, AO, 16 * SIZE
        addi    BO, BO,  4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        bdnz    LL(72)
        .align 4

LL(75):
        andi.   r0, K, 1
        ble+    LL(78)
        .align 4

LL(76):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi    AO, AO, 8 * SIZE
        vmaddfp c05, a1, bp2, c05
        addi    BO, BO, 2 * SIZE
        vmaddfp c06, a2, bp2, c06
        .align 4

LL(78):
        vaddfp  c01, c01, c03
        vaddfp  c02, c02, c04
        vaddfp  c05, c05, c07
        vaddfp  c06, c06, c08

        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, c02, PERMRSHIFT1
        vperm   c02, c02, VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1

        addi    CO1, CO1, 8 * SIZE
        .align 4
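
/* Remaining column, M % 4 >= 2: 2-complex-wide tile.               */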
LL(80):
        andi.   I, M, 2
        ble     LL(90)

        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        vxor    c06, c06, c06

        mr      BO, B

        vspltw  bp1, b1, 0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(85)
        .align 4

LL(82):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2

        vmaddfp c02, a2, bp1, c02
        vspltw  bp2, b1, 3

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c06, a2, bp2, c06

        addi    AO, AO, 8 * SIZE
        addi    BO, BO, 4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        bdnz    LL(82)
        .align 4

LL(85):
        andi.   r0, K, 1
        ble+    LL(88)
        .align 4

LL(86):
        vspltw  bp2, b1, 1
        vmaddfp c01, a1, bp1, c01
        vmaddfp c05, a1, bp2, c05
        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 2 * SIZE
        .align 4

LL(88):
        vaddfp  c01, c01, c02
        vaddfp  c05, c05, c06
        vaddfp  c09, c09, c10
        vaddfp  c13, c13, c14

        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap

        vxor    c05, c05, neg

        vaddfp  c01, c01, c05

        vperm   c05, c01, c01, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c01, alpha_i, c05, c01

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1

        addi    CO1, CO1, 4 * SIZE
        .align 4
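
/* Remaining column, M odd: final complex element in scalar code.   */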
LL(90):
        andi.   I, M, 1
        ble     LL(999)

        mr      BO, B

        LFD     f8,  0 * SIZE(AO)
        LFD     f9,  1 * SIZE(AO)

        LFD     f10, 0 * SIZE(BO)
        LFD     f11, 1 * SIZE(BO)
        LFD     f12, 2 * SIZE(BO)
        LFD     f13, 3 * SIZE(BO)

        lfs     f0, FZERO(SP)
        fmr     f1, f0
        fmr     f2, f0
        fmr     f3, f0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(95)
        .align 4

LL(92):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3

        LFD     f8,  2 * SIZE(AO)
        LFD     f9,  3 * SIZE(AO)
        LFD     f10, 4 * SIZE(BO)
        LFD     f11, 5 * SIZE(BO)

        fmadd   f0, f8, f12, f0
        fmadd   f2, f8, f13, f2
        fmadd   f1, f9, f12, f1
        fmadd   f3, f9, f13, f3

        LFD     f8,  4 * SIZE(AO)
        LFD     f9,  5 * SIZE(AO)
        LFD     f12, 6 * SIZE(BO)
        LFD     f13, 7 * SIZE(BO)

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 4 * SIZE
        bdnz    LL(92)
        .align 4

LL(95):
        andi.   r0, K, 1
        ble     LL(98)
        .align 4

LL(96):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        .align 4

LL(98):
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
        fsub    f0, f0, f3
        fadd    f1, f1, f2
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        fadd    f0, f0, f3
        fsub    f1, f1, f2
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        fadd    f0, f0, f3
        fsub    f1, f2, f1
#else  /* RR, RC, CR, CC */
        fsub    f0, f0, f3
        fadd    f1, f1, f2
#endif

        LFD     f8, 0 * SIZE(CO1)
        LFD     f9, 1 * SIZE(CO1)

        lfs     f12, ALPHA_R + 0(SP)
        lfs     f13, ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
        fmadd   f8, f12, f0, f8
        fnmsub  f9, f12, f1, f9

        fmadd   f8, f13, f1, f8
        fmadd   f9, f13, f0, f9
#else
        fmadd   f8, f12, f0, f8
        fmadd   f9, f12, f1, f9

        fnmsub  f8, f13, f1, f8
        fmadd   f9, f13, f0, f9
#endif

        STFD    f8, 0 * SIZE(CO1)
        STFD    f9, 1 * SIZE(CO1)
        .align 4

LL(999):
        mr      SP, STACK

        li      r0,  0 * 16
        lvx     v20, SP, r0
        li      r0,  1 * 16
        lvx     v21, SP, r0
        li      r0,  2 * 16
        lvx     v22, SP, r0
        li      r0,  3 * 16
        lvx     v23, SP, r0
        li      r0,  4 * 16
        lvx     v24, SP, r0
        li      r0,  5 * 16
        lvx     v25, SP, r0
        li      r0,  6 * 16
        lvx     v26, SP, r0
        li      r0,  7 * 16
        lvx     v27, SP, r0
        li      r0,  8 * 16
        lvx     v28, SP, r0
        li      r0,  9 * 16
        lvx     v29, SP, r0
        li      r0, 10 * 16
        lvx     v30, SP, r0
        li      r0, 11 * 16
        lvx     v31, SP, r0

        mtspr   VRsave, VREG

#ifdef __64BIT__
        ld      r31, 192(SP)
        ld      r30, 200(SP)
        ld      r29, 208(SP)
        ld      r28, 216(SP)
        ld      r27, 224(SP)
        ld      r26, 232(SP)
        ld      r25, 240(SP)
        ld      r24, 248(SP)
        ld      r23, 256(SP)
        ld      r22, 264(SP)
        ld      r21, 272(SP)
        ld      r20, 280(SP)
        ld      r19, 288(SP)
        ld      r18, 296(SP)
        ld      r17, 304(SP)
        ld      r16, 312(SP)
        ld      r15, 320(SP)
        ld      r14, 328(SP)
#else
        lwz     r31, 192(SP)
        lwz     r30, 196(SP)
        lwz     r29, 200(SP)
        lwz     r28, 204(SP)
        lwz     r27, 208(SP)
        lwz     r26, 212(SP)
        lwz     r25, 216(SP)
        lwz     r24, 220(SP)
        lwz     r23, 224(SP)
        lwz     r22, 228(SP)
        lwz     r21, 232(SP)
        lwz     r20, 236(SP)
        lwz     r19, 240(SP)
        lwz     r18, 244(SP)
        lwz     r17, 248(SP)
        lwz     r16, 252(SP)
        lwz     r15, 256(SP)
        lwz     r14, 260(SP)
#endif

        addi    SP, SP, STACKSIZE

        blr

        EPILOGUE
#endif