/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
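
/* PowerPC AltiVec (VMX) kernel for single-precision complex GEMM,
   judging from the vmaddfp arithmetic and the cparam.h include:
   it accumulates C += alpha * A * B, walking N two columns at a
   time (CO1/CO2) and M in blocks of 8, 4, 2 and 1 complex
   elements; the one-element tail is done in the scalar FPU.  The
   SWAP/NEG constants built on the stack below drive the
   real/imaginary shuffles and sign flips for the conjugation
   variants (NN, NC, CN, ...).                                       */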

#define ASSEMBLER
#include "common.h"

#ifndef __64BIT__
#define LOAD    lwz
#else
#define LOAD    ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

#define ALIGN_SIZE      0xffff
#define SWAP              0
#define NEG              16
#define ALPHA_R          32
#define ALPHA_I          48
#define FZERO            64

#define M       r3
#define N       r4
#define K       r5

#ifdef linux
#ifndef __64BIT__
#define A       r6
#define B       r7
#define C       r8
#define LDC     r9
#else
#define A       r8
#define B       r9
#define C       r10
#define LDC     r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A       r10
#define B       r6
#define C       r7
#define LDC     r8
#else
#define A       r8
#define B       r9
#define C       r10
#define LDC     r6
#endif
#endif

#define STACK   r11

#define I       r21
#define J       r22
#define AO      r23
#define BO      r24
#define CO1     r25
#define CO2     r26

#define PREA    r29
#define PREB    r29
#define PREC    r30
#define VREG    r31

#define LOAD_A  lvx
#define LOAD_B  lvx

#define OFFSET_0          0
#define OFFSET_1        r14
#define OFFSET_2        r15
#define OFFSET_3        r16
#define OFFSET_4        r17
#define OFFSET_5        r18
#define OFFSET_6        r19
#define OFFSET_7        r20

#define c01     v0
#define c02     v1
#define c03     v2
#define c04     v3
#define c05     v4
#define c06     v5
#define c07     v6
#define c08     v7
#define c09     v8
#define c10     v9
#define c11     v10
#define c12     v11
#define c13     v12
#define c14     v13
#define c15     v14
#define c16     v15

#define a1      v16
#define a2      v17
#define a3      v18
#define a4      v19
#define a5      v20
#define a6      v21
#define a7      v22
#define a8      v23

#define b1      v24
#define b2      v25
#define bp1     v26
#define bp2     v27

#define C1      v16
#define C2      v17
#define C3      v18
#define C4      v19
#define C5      v20

#define c00     v24

#define VZERO           v25
#define PERMRSHIFT1     v26
#define PERMRSHIFT2     v27

#define swap            v28
#define neg             v29
#define alpha_r         v30
#define alpha_i         v31

#ifndef NEEDPARAM

#ifndef DOUBLE
#include "../cparam.h"
#else
#include "../zparam.h"
#endif

        PROLOGUE
        PROFCODE

        addi    SP, SP, -STACKSIZE
        mr      STACK, SP

        li      r0,  0 * 16
        stvx    v20, SP, r0
        li      r0,  1 * 16
        stvx    v21, SP, r0
        li      r0,  2 * 16
        stvx    v22, SP, r0
        li      r0,  3 * 16
        stvx    v23, SP, r0
        li      r0,  4 * 16
        stvx    v24, SP, r0
        li      r0,  5 * 16
        stvx    v25, SP, r0
        li      r0,  6 * 16
        stvx    v26, SP, r0
        li      r0,  7 * 16
        stvx    v27, SP, r0
        li      r0,  8 * 16
        stvx    v28, SP, r0
        li      r0,  9 * 16
        stvx    v29, SP, r0
        li      r0, 10 * 16
        stvx    v30, SP, r0
        li      r0, 11 * 16
        stvx    v31, SP, r0

#ifdef __64BIT__
        std     r31,  192(SP)
        std     r30,  200(SP)
        std     r29,  208(SP)
        std     r28,  216(SP)
        std     r27,  224(SP)
        std     r26,  232(SP)
        std     r25,  240(SP)
        std     r24,  248(SP)
        std     r23,  256(SP)
        std     r22,  264(SP)
        std     r21,  272(SP)
        std     r20,  280(SP)
        std     r19,  288(SP)
        std     r18,  296(SP)
        std     r17,  304(SP)
        std     r16,  312(SP)
        std     r15,  320(SP)
        std     r14,  328(SP)
#else
        stw     r31,  192(SP)
        stw     r30,  196(SP)
        stw     r29,  200(SP)
        stw     r28,  204(SP)
        stw     r27,  208(SP)
        stw     r26,  212(SP)
        stw     r25,  216(SP)
        stw     r24,  220(SP)
        stw     r23,  224(SP)
        stw     r22,  228(SP)
        stw     r21,  232(SP)
        stw     r20,  236(SP)
        stw     r19,  240(SP)
        stw     r18,  244(SP)
        stw     r17,  248(SP)
        stw     r16,  252(SP)
        stw     r15,  256(SP)
        stw     r14,  260(SP)
#endif
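
/* Depending on ABI and word size, the trailing arguments (B, C, LDC
   and, with -DPREFETCHTEST, the prefetch distances) arrive on the
   caller's stack rather than in registers, so they are fetched
   relative to SP after the frame above has been allocated.          */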

#ifdef linux
#ifdef __64BIT__
        ld      LDC,   112 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
        ld      LDC,   112 + STACKSIZE(SP)
#else
#ifdef DOUBLE
        lwz     B,      56 + STACKSIZE(SP)
        lwz     C,      60 + STACKSIZE(SP)
        lwz     LDC,    64 + STACKSIZE(SP)
#else
        lwz     LDC,    56 + STACKSIZE(SP)
#endif
#endif
#endif

#ifndef PREFETCHTEST
#ifdef PPC970
        li      PREC,   16 * SIZE
#endif
#else

#ifdef linux
#ifndef __64BIT__
        lwz     PREB,   16 + STACKSIZE(SP)
        lwz     PREC,   20 + STACKSIZE(SP)
#else
        ld      PREB,  136 + STACKSIZE(SP)
        ld      PREC,  144 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
        ld      PREB,  136 + STACKSIZE(SP)
        ld      PREC,  144 + STACKSIZE(SP)
#else
#ifdef DOUBLE
        lwz     PREB,   72 + STACKSIZE(SP)
        lwz     PREC,   76 + STACKSIZE(SP)
#else
        lwz     PREB,   68 + STACKSIZE(SP)
        lwz     PREC,   72 + STACKSIZE(SP)
#endif
#endif
#endif

#endif

#ifndef PREFETCHTEST
#ifdef CELL
        li      PREB,   (3 * 32 * SIZE)
#else
        li      PREB,   (5 * 32 * SIZE)
#endif
#endif

        li      r0, -1
        mfspr   VREG, VRsave

        mtspr   VRsave, r0

        addi    SP, SP, -128
        li      r0, -8192

        and     SP, SP, r0

        fneg    f3, f1
        fneg    f4, f2

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NC) || defined(TC) || defined(NR) || defined(TR)
        stfs    f1,  ALPHA_R +  0(SP)
        stfs    f1,  ALPHA_R +  4(SP)
        stfs    f1,  ALPHA_R +  8(SP)
        stfs    f1,  ALPHA_R + 12(SP)

        stfs    f4,  ALPHA_I +  0(SP)
        stfs    f2,  ALPHA_I +  4(SP)
        stfs    f4,  ALPHA_I +  8(SP)
        stfs    f2,  ALPHA_I + 12(SP)
#else
        stfs    f1,  ALPHA_R +  0(SP)
        stfs    f3,  ALPHA_R +  4(SP)
        stfs    f1,  ALPHA_R +  8(SP)
        stfs    f3,  ALPHA_R + 12(SP)

        stfs    f2,  ALPHA_I +  0(SP)
        stfs    f2,  ALPHA_I +  4(SP)
        stfs    f2,  ALPHA_I +  8(SP)
        stfs    f2,  ALPHA_I + 12(SP)
#endif

        li      I,    Address_L(0x04050607)
        addis   I, I, Address_H(0x04050607)
        stw     I, SWAP +  0(SP)
        li      I,    Address_L(0x00010203)
        addis   I, I, Address_H(0x00010203)
        stw     I, SWAP +  4(SP)
        li      I,    Address_L(0x0c0d0e0f)
        addis   I, I, Address_H(0x0c0d0e0f)
        stw     I, SWAP +  8(SP)
        li      I,    Address_L(0x08090a0b)
        addis   I, I, Address_H(0x08090a0b)
        stw     I, SWAP + 12(SP)

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        lis     I, 0x8000
        stw     I, NEG +  0(SP)
        stw     I, NEG +  8(SP)
        li      I, 0
        stw     I, NEG +  4(SP)
        stw     I, NEG + 12(SP)
#else
        li      I, 0
        stw     I, NEG +  0(SP)
        stw     I, NEG +  8(SP)
        lis     I, 0x8000
        stw     I, NEG +  4(SP)
        stw     I, NEG + 12(SP)
#endif

        li      r0, 0
        stw     r0, FZERO(SP)

        slwi    LDC, LDC, ZBASE_SHIFT

        li      OFFSET_1,  4 * SIZE
        li      OFFSET_2,  8 * SIZE
        li      OFFSET_3, 12 * SIZE
        li      OFFSET_4, 16 * SIZE
        li      OFFSET_5, 20 * SIZE
        li      OFFSET_6, 24 * SIZE
        li      OFFSET_7, 28 * SIZE

        cmpwi   cr0, M, 0
        ble     LL(999)
        cmpwi   cr0, N, 0
        ble     LL(999)
        cmpwi   cr0, K, 0
        ble     LL(999)

        srawi.  J, N, 1
        ble     LL(50)
        .align 4

LL(01):
        mr      CO1, C
        add     CO2, C,   LDC
        add     C,   CO2, LDC

        mr      AO, A
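
/* Outer loop: J counts pairs of columns of C (CO1/CO2).  The inner
   I loop below covers M in blocks of eight complex elements; the
   M & 4, M & 2 and M & 1 tails follow at LL(20), LL(30) and LL(40). */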
        srawi.  I, M, 3
        ble     LL(20)
        .align 4

LL(11):
        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        LOAD_A  a1, OFFSET_0, AO
        vxor    c03, c03, c03
        LOAD_A  a2, OFFSET_1, AO
        vxor    c04, c04, c04
        LOAD_A  a3, OFFSET_2, AO

        vxor    c05, c05, c05
        vxor    c06, c06, c06
        vxor    c07, c07, c07
        vxor    c08, c08, c08

        vxor    c09, c09, c09
        dcbtst  CO1, PREC
        vxor    c10, c10, c10
        dcbtst  CO2, PREC
        vxor    c11, c11, c11
        vxor    c12, c12, c12
        vxor    c13, c13, c13
        mr      BO, B
        vxor    c14, c14, c14
        srawi.  r0, K, 2
        vxor    c15, c15, c15
        mtspr   CTR, r0
        vxor    c16, c16, c16
        vspltw  bp1, b1, 0
        ble     LL(13)
        .align 4

#define NOP1    mr      r3, r3
#define NOP2    mr      r4, r4
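
/* Main 8x2 inner loop, unrolled four K iterations deep.  Loads of
   the next A/B vectors are interleaved with the multiply-adds, and
   the NOP1/NOP2 fillers apparently pad the instruction groups for
   the in-order dispatch of the target cores; dcbt/dcbtst touch
   ahead by PREA/PREB/PREC to keep the streams in cache.             */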

LL(12):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        LOAD_A  a4, OFFSET_3, AO
        vmaddfp c03, a3, bp1, c03
        dcbt    AO, PREA
        vmaddfp c04, a4, bp1, c04
        NOP2

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        dcbt    BO, PREB

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_B  b2, OFFSET_1, BO
        vmaddfp c11, a3, bp1, c11
        addi    BO, BO,  8 * SIZE
        vmaddfp c12, a4, bp1, c12
        NOP1

        vmaddfp c13, a1, bp2, c13
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a5, OFFSET_4, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A  a6, OFFSET_5, AO
        vmaddfp c16, a4, bp2, c16
        vspltw  bp2, b2, 1

        vmaddfp c01, a5, bp1, c01
        LOAD_A  a7, OFFSET_6, AO
        vmaddfp c02, a6, bp1, c02
        LOAD_A  a8, OFFSET_7, AO
        vmaddfp c03, a7, bp1, c03
        NOP1
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        addi    AO, AO, 32 * SIZE
        vmaddfp c07, a7, bp2, c07
        LOAD_B  b1, OFFSET_0, BO
        vmaddfp c08, a8, bp2, c08
        NOP1

        vmaddfp c09, a5, bp1, c09
        vspltw  bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        NOP2
        vmaddfp c11, a7, bp1, c11
        NOP1
        vmaddfp c12, a8, bp1, c12
        dcbt    AO, PREA

        vmaddfp c13, a5, bp2, c13
        vspltw  bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A  a1, OFFSET_0, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A  a2, OFFSET_1, AO
        vmaddfp c16, a8, bp2, c16
        vspltw  bp2, b1, 1

        vmaddfp c01, a1, bp1, c01
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c02, a2, bp1, c02
        LOAD_A  a4, OFFSET_3, AO
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        NOP2

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        LOAD_B  b2, OFFSET_1, BO

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        NOP2
        vmaddfp c11, a3, bp1, c11
        NOP1
        vmaddfp c12, a4, bp1, c12
        addi    BO, BO,  8 * SIZE

        vmaddfp c13, a1, bp2, c13
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a5, OFFSET_4, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A  a6, OFFSET_5, AO
        vmaddfp c16, a4, bp2, c16
        vspltw  bp2, b2, 1

        vmaddfp c01, a5, bp1, c01
        LOAD_A  a7, OFFSET_6, AO
        vmaddfp c02, a6, bp1, c02
        LOAD_A  a8, OFFSET_7, AO
        vmaddfp c03, a7, bp1, c03
        addi    AO, AO, 32 * SIZE
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        NOP2
        vmaddfp c07, a7, bp2, c07
        NOP1
        vmaddfp c08, a8, bp2, c08
        LOAD_B  b1, OFFSET_0, BO

        vmaddfp c09, a5, bp1, c09
        vspltw  bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_A  a1, OFFSET_0, AO
        vmaddfp c11, a7, bp1, c11
        NOP2
        vmaddfp c12, a8, bp1, c12
        vspltw  bp1, b1, 0

        vmaddfp c13, a5, bp2, c13
        LOAD_A  a2, OFFSET_1, AO
        vmaddfp c14, a6, bp2, c14
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c15, a7, bp2, c15
        NOP1
        vmaddfp c16, a8, bp2, c16
        bdnz+   LL(12)
        .align 4

LL(13):
        andi.   r0, K, 2
        nop
        nop
        ble+    LL(15)
        .align 4

        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        LOAD_A  a4, OFFSET_3, AO
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        NOP2

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        LOAD_B  b2, OFFSET_1, BO

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_A  a5, OFFSET_4, AO
        vmaddfp c11, a3, bp1, c11
        LOAD_A  a6, OFFSET_5, AO
        vmaddfp c12, a4, bp1, c12
        addi    BO, BO,  8 * SIZE

        vmaddfp c13, a1, bp2, c13
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a7, OFFSET_6, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A  a8, OFFSET_7, AO
        vmaddfp c16, a4, bp2, c16
        addi    AO, AO, 32 * SIZE

        vmaddfp c01, a5, bp1, c01
        vspltw  bp2, b2, 1
        vmaddfp c02, a6, bp1, c02
        NOP2
        vmaddfp c03, a7, bp1, c03
        NOP1
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        NOP2
        vmaddfp c07, a7, bp2, c07
        NOP1
        vmaddfp c08, a8, bp2, c08
        LOAD_B  b1, OFFSET_0, BO

        vmaddfp c09, a5, bp1, c09
        vspltw  bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_A  a1, OFFSET_0, AO
        vmaddfp c11, a7, bp1, c11
        LOAD_A  a2, OFFSET_1, AO
        vmaddfp c12, a8, bp1, c12
        NOP2

        vmaddfp c13, a5, bp2, c13
        vspltw  bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c15, a7, bp2, c15
        vmaddfp c16, a8, bp2, c16
        .align 4

LL(15):
        andi.   r0, K, 1
        vxor    VZERO, VZERO, VZERO
        ble+    LL(18)
        .align 4

        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        LOAD_A  a4, OFFSET_3, AO
        vmaddfp c03, a3, bp1, c03
        nop
        vmaddfp c04, a4, bp1, c04
        nop

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c07, a3, bp2, c07
        nop
        vmaddfp c08, a4, bp2, c08
        nop

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi    AO, AO, 16 * SIZE
        vmaddfp c11, a3, bp1, c11
        addi    BO, BO,  4 * SIZE
        vmaddfp c12, a4, bp1, c12
        nop

        vmaddfp c13, a1, bp2, c13
        vmaddfp c14, a2, bp2, c14
        vmaddfp c15, a3, bp2, c15
        vmaddfp c16, a4, bp2, c16
        .align 4
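
/* Merge the partial products into complex results.  Each pair of
   accumulators appears to hold A*Re(b) and A*Im(b); `swap` exchanges
   the real and imaginary lanes of the A*Im(b) part and `neg` applies
   the sign pattern required by the conjugation variant, so one
   vperm/vxor/vaddfp sequence per accumulator assembles the complex
   product, which is then scaled by the splatted alpha_r/alpha_i
   vectors using the same swap trick for the imaginary part.         */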

LL(18):
        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vxor    VZERO, VZERO, VZERO

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap
        vperm   c07, c07, c07, swap
        vperm   c08, c08, c08, swap

        vperm   c13, c13, c13, swap
        vperm   c14, c14, c14, swap
        vperm   c15, c15, c15, swap
        vperm   c16, c16, c16, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg
        vxor    c07, c07, neg
        vxor    c08, c08, neg

        vxor    c13, c13, neg
        vxor    c14, c14, neg
        vxor    c15, c15, neg
        vxor    c16, c16, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06
        vaddfp  c03, c03, c07
        vaddfp  c04, c04, c08

        vaddfp  c09, c09, c13
        vaddfp  c10, c10, c14
        vaddfp  c11, c11, c15
        vaddfp  c12, c12, c16

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap
        vperm   c07, c03, c03, swap
        vperm   c08, c04, c04, swap

        vperm   c13, c09, c09, swap
        vperm   c14, c10, c10, swap
        vperm   c15, c11, c11, swap
        vperm   c16, c12, c12, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c03, alpha_r, c03, VZERO
        vmaddfp c04, alpha_r, c04, VZERO

        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        vmaddfp c03, alpha_i, c07, c03
        vmaddfp c04, alpha_i, c08, c04

        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c10, alpha_r, c10, VZERO
        vmaddfp c11, alpha_r, c11, VZERO
        vmaddfp c12, alpha_r, c12, VZERO

        vmaddfp c09, alpha_i, c13, c09
        vmaddfp c10, alpha_i, c14, c10
        vmaddfp c11, alpha_i, c15, c11
        vmaddfp c12, alpha_i, c16, c12
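
/* C is updated through an unaligned read-modify-write: lvsr builds a
   right-shift permute from the low bits of the C pointer, the results
   are realigned through vperm (padded with VZERO at both ends so the
   head and tail add zero to untouched elements), added to the vectors
   loaded from C, and stored back with stvx.                          */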

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1
        lvx     C4, OFFSET_3, CO1
        lvx     C5, OFFSET_4, CO1

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2

        vperm   c00, VZERO, c01,   PERMRSHIFT1
        vperm   c01, c01,   c02,   PERMRSHIFT1
        vperm   c02, c02,   c03,   PERMRSHIFT1
        vperm   c03, c03,   c04,   PERMRSHIFT1
        vperm   c04, c04,   VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3
        vaddfp  c03, c03, C4
        vaddfp  c04, c04, C5

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1
        stvx    c03, OFFSET_3, CO1
        stvx    c04, OFFSET_4, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2
        lvx     C3, OFFSET_2, CO2
        lvx     C4, OFFSET_3, CO2
        lvx     C5, OFFSET_4, CO2

        vperm   c00, VZERO, c09,   PERMRSHIFT2
        vperm   c09, c09,   c10,   PERMRSHIFT2
        vperm   c10, c10,   c11,   PERMRSHIFT2
        vperm   c11, c11,   c12,   PERMRSHIFT2
        vperm   c12, c12,   VZERO, PERMRSHIFT2

        vaddfp  c00, c00, C1
        vaddfp  c09, c09, C2
        vaddfp  c10, c10, C3
        vaddfp  c11, c11, C4
        vaddfp  c12, c12, C5

        stvx    c00, OFFSET_0, CO2
        stvx    c09, OFFSET_1, CO2
        stvx    c10, OFFSET_2, CO2
        stvx    c11, OFFSET_3, CO2
        stvx    c12, OFFSET_4, CO2

        addi    CO1, CO1, 16 * SIZE
        addi    CO2, CO2, 16 * SIZE
        addic.  I, I, -1
        bgt+    LL(11)
        .align 4

LL(20):
        andi.   I, M, 4
        ble     LL(30)

        vxor    c01, c01, c01
        LOAD_A  a1, OFFSET_0, AO
        vxor    c02, c02, c02
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c09, c09, c09
        LOAD_B  b1, OFFSET_0, B
        vxor    c10, c10, c10
        LOAD_B  b2, OFFSET_1, B
        vxor    c13, c13, c13
        vxor    c14, c14, c14
        mr      BO, B
        vspltw  bp1, b1, 0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(25)
        .align 4

LL(22):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        addi    AO, AO, 16 * SIZE
        vmaddfp c02, a2, bp1, c02
        addi    BO, BO,  8 * SIZE

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        LOAD_B  b1, OFFSET_0, BO
        vmaddfp c10, a2, bp1, c10

        vmaddfp c13, a1, bp2, c13
        LOAD_A  a1, OFFSET_0, AO
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a2, OFFSET_1, AO

        vmaddfp c01, a3, bp1, c01
        vspltw  bp2, b2, 1
        vmaddfp c02, a4, bp1, c02

        vmaddfp c05, a3, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a4, bp2, c06

        vmaddfp c09, a3, bp1, c09
        vspltw  bp2, b2, 3
        LOAD_B  b2, OFFSET_1, BO
        vmaddfp c10, a4, bp1, c10

        vmaddfp c13, a3, bp2, c13
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c14, a4, bp2, c14
        LOAD_A  a4, OFFSET_3, AO
        vspltw  bp1, b1, 0
        bdnz    LL(22)
        .align 4

LL(25):
        andi.   r0, K, 1
        ble+    LL(28)
        .align 4

LL(26):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi    AO, AO, 8 * SIZE

        vmaddfp c13, a1, bp2, c13
        addi    BO, BO, 4 * SIZE
        vmaddfp c14, a2, bp2, c14
        nop
        .align 4

LL(28):
        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap
        vperm   c13, c13, c13, swap
        vperm   c14, c14, c14, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg
        vxor    c13, c13, neg
        vxor    c14, c14, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06
        vaddfp  c09, c09, c13
        vaddfp  c10, c10, c14

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap
        vperm   c13, c09, c09, swap
        vperm   c14, c10, c10, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02

        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c10, alpha_r, c10, VZERO
        vmaddfp c09, alpha_i, c13, c09
        vmaddfp c10, alpha_i, c14, c10

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2

        vperm   c00, VZERO, c01,   PERMRSHIFT1
        vperm   c01, c01,   c02,   PERMRSHIFT1
        vperm   c02, c02,   VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2
        lvx     C3, OFFSET_2, CO2

        vperm   c00, VZERO, c09,   PERMRSHIFT2
        vperm   c09, c09,   c10,   PERMRSHIFT2
        vperm   c10, c10,   VZERO, PERMRSHIFT2

        vaddfp  c00, c00, C1
        vaddfp  c09, c09, C2
        vaddfp  c10, c10, C3

        stvx    c00, OFFSET_0, CO2
        stvx    c09, OFFSET_1, CO2
        stvx    c10, OFFSET_2, CO2

        addi    CO1, CO1, 8 * SIZE
        addi    CO2, CO2, 8 * SIZE
        .align 4

LL(30):
        andi.   I, M, 2
        ble     LL(40)

        vxor    c01, c01, c01
        LOAD_A  a1, OFFSET_0, AO
        vxor    c02, c02, c02
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_B  b1, OFFSET_0, B
        vxor    c06, c06, c06
        LOAD_B  b2, OFFSET_1, B
        vxor    c09, c09, c09
        vxor    c10, c10, c10
        vxor    c13, c13, c13
        vxor    c14, c14, c14

        vspltw  bp1, b1, 0
        mr      BO, B

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(35)
        .align 4

LL(32):
        vmaddfp c01, a1, bp1, c01
        addi    AO, AO, 8 * SIZE
        vspltw  bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        addi    BO, BO, 8 * SIZE
        vspltw  bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        LOAD_A  a1, OFFSET_0, AO
        vspltw  bp1, b2, 0
        LOAD_B  b1, OFFSET_0, BO

        vmaddfp c02, a2, bp1, c02
        vspltw  bp2, b2, 1
        vmaddfp c06, a2, bp2, c06
        vspltw  bp1, b2, 2
        vmaddfp c10, a2, bp1, c10
        vspltw  bp2, b2, 3
        LOAD_B  b2, OFFSET_1, BO
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a2, OFFSET_1, AO

        vspltw  bp1, b1, 0
        bdnz    LL(32)
        .align 4

LL(35):
        andi.   r0, K, 1
        ble+    LL(38)
        .align 4

LL(36):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c13, a1, bp2, c13
        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 4 * SIZE
        .align 4

LL(38):
        vaddfp  c01, c01, c02
        vaddfp  c05, c05, c06
        vaddfp  c09, c09, c10
        vaddfp  c13, c13, c14

        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c13, c13, c13, swap

        vxor    c05, c05, neg
        vxor    c13, c13, neg

        vaddfp  c01, c01, c05
        vaddfp  c09, c09, c13

        vperm   c05, c01, c01, swap
        vperm   c13, c09, c09, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c01, alpha_i, c05, c01

        vmaddfp c09, alpha_r, c09, VZERO
        vmaddfp c09, alpha_i, c13, c09

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2

        vperm   c00, VZERO, c01,   PERMRSHIFT1
        vperm   c01, c01,   VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2

        vperm   c00, VZERO, c09,   PERMRSHIFT2
        vperm   c09, c09,   VZERO, PERMRSHIFT2

        vaddfp  c00, c00, C1
        vaddfp  c09, c09, C2

        stvx    c00, OFFSET_0, CO2
        stvx    c09, OFFSET_1, CO2

        addi    CO1, CO1, 4 * SIZE
        addi    CO2, CO2, 4 * SIZE
        .align 4
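
/* M & 1 tail: a single complex element does not fill a vector, so it
   is handled with scalar FPU multiply-adds, and the conjugation
   variants are resolved by the #if blocks at LL(48).                */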

LL(40):
        andi.   I, M, 1
        ble     LL(49)

        mr      BO, B

        LFD     f8,   0 * SIZE(AO)
        LFD     f9,   1 * SIZE(AO)

        LFD     f10,  0 * SIZE(BO)
        LFD     f11,  1 * SIZE(BO)
        LFD     f12,  2 * SIZE(BO)
        LFD     f13,  3 * SIZE(BO)

        lfs     f0, FZERO(SP)
        fmr     f1, f0
        fmr     f2, f0
        fmr     f3, f0

        fmr     f4, f0
        fmr     f5, f0
        fmr     f6, f0
        fmr     f7, f0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(45)
        .align 4

LL(42):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f4, f8, f12, f4
        fmadd   f6, f8, f13, f6

        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        fmadd   f5, f9, f12, f5
        fmadd   f7, f9, f13, f7

        LFD     f8,   2 * SIZE(AO)
        LFD     f9,   3 * SIZE(AO)

        LFD     f10,  4 * SIZE(BO)
        LFD     f11,  5 * SIZE(BO)
        LFD     f12,  6 * SIZE(BO)
        LFD     f13,  7 * SIZE(BO)

        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f4, f8, f12, f4
        fmadd   f6, f8, f13, f6

        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        fmadd   f5, f9, f12, f5
        fmadd   f7, f9, f13, f7

        LFD     f8,   4 * SIZE(AO)
        LFD     f9,   5 * SIZE(AO)

        LFD     f10,  8 * SIZE(BO)
        LFD     f11,  9 * SIZE(BO)
        LFD     f12, 10 * SIZE(BO)
        LFD     f13, 11 * SIZE(BO)

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 8 * SIZE
        bdnz    LL(42)
        .align 4

LL(45):
        andi.   r0, K, 1
        ble     LL(48)
        .align 4

LL(46):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f4, f8, f12, f4
        fmadd   f6, f8, f13, f6

        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        fmadd   f5, f9, f12, f5
        fmadd   f7, f9, f13, f7

        addi    AO, AO, 2 * SIZE
        addi    BO, BO, 4 * SIZE
        .align 4

LL(48):
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
        fsub    f0, f0, f3
        fadd    f1, f1, f2
        fsub    f4, f4, f7
        fadd    f5, f5, f6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        fadd    f0, f0, f3
        fsub    f1, f1, f2
        fadd    f4, f4, f7
        fsub    f5, f5, f6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        fadd    f0, f0, f3
        fsub    f1, f2, f1
        fadd    f4, f4, f7
        fsub    f5, f6, f5
#else  /* RR, RC, CR, CC */
        fsub    f0, f0, f3
        fadd    f1, f1, f2
        fsub    f4, f4, f7
        fadd    f5, f5, f6
#endif

        LFD     f8,  0 * SIZE(CO1)
        LFD     f9,  1 * SIZE(CO1)
        LFD     f10, 0 * SIZE(CO2)
        LFD     f11, 1 * SIZE(CO2)

        lfs     f12, ALPHA_R + 0(SP)
        lfs     f13, ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
        fmadd   f8,  f12, f0, f8
        fnmsub  f9,  f12, f1, f9
        fmadd   f10, f12, f4, f10
        fnmsub  f11, f12, f5, f11

        fmadd   f8,  f13, f1, f8
        fmadd   f9,  f13, f0, f9
        fmadd   f10, f13, f5, f10
        fmadd   f11, f13, f4, f11
#else
        fmadd   f8,  f12, f0, f8
        fmadd   f9,  f12, f1, f9
        fmadd   f10, f12, f4, f10
        fmadd   f11, f12, f5, f11

        fnmsub  f8,  f13, f1, f8
        fmadd   f9,  f13, f0, f9
        fnmsub  f10, f13, f5, f10
        fmadd   f11, f13, f4, f11
#endif

        STFD    f8,  0 * SIZE(CO1)
        STFD    f9,  1 * SIZE(CO1)
        STFD    f10, 0 * SIZE(CO2)
        STFD    f11, 1 * SIZE(CO2)

LL(49):
        mr      B, BO

        addic.  J, J, -1
        bgt     LL(01)
        .align 4

LL(50):
        andi.   J, N, 1
        ble     LL(999)

        mr      CO1, C
        mr      AO, A
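
/* Last column when N is odd: the same M blocking as above, but with
   only one C pointer (CO1) and half as many live accumulators.      */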
        srawi.  I, M, 3
        ble     LL(70)
        .align 4

LL(61):
        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        vxor    c03, c03, c03
        LOAD_A  a1, OFFSET_0, AO
        vxor    c04, c04, c04
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c07, c07, c07
        vxor    c08, c08, c08

        mr      BO, B
        dcbtst  CO1, PREC
        dcbtst  CO2, PREC

        vspltw  bp1, b1, 0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(65)
        .align 4

LL(62):
        LOAD_A  a5, OFFSET_4, AO
        LOAD_A  a6, OFFSET_5, AO
        LOAD_A  a7, OFFSET_6, AO
        LOAD_A  a8, OFFSET_7, AO

        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08

        vmaddfp c01, a5, bp1, c01
        vspltw  bp2, b1, 3
        vmaddfp c02, a6, bp1, c02
        vmaddfp c03, a7, bp1, c03
        vmaddfp c04, a8, bp1, c04

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c05, a5, bp2, c05
        vmaddfp c06, a6, bp2, c06
        vmaddfp c07, a7, bp2, c07
        vmaddfp c08, a8, bp2, c08

        addi    AO, AO, 32 * SIZE
        addi    BO, BO,  4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        bdnz    LL(62)
        .align 4

LL(65):
        andi.   r0, K, 1
        ble+    LL(68)
        .align 4

LL(66):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi    AO, AO, 16 * SIZE
        vmaddfp c03, a3, bp1, c03
        addi    BO, BO,  2 * SIZE
        vmaddfp c04, a4, bp1, c04
        nop

        vmaddfp c05, a1, bp2, c05
        vmaddfp c06, a2, bp2, c06
        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08
        .align 4

LL(68):
        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap
        vperm   c07, c07, c07, swap
        vperm   c08, c08, c08, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg
        vxor    c07, c07, neg
        vxor    c08, c08, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06
        vaddfp  c03, c03, c07
        vaddfp  c04, c04, c08

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap
        vperm   c07, c03, c03, swap
        vperm   c08, c04, c04, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c03, alpha_r, c03, VZERO
        vmaddfp c04, alpha_r, c04, VZERO

        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02
        vmaddfp c03, alpha_i, c07, c03
        vmaddfp c04, alpha_i, c08, c04

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1
        lvx     C4, OFFSET_3, CO1
        lvx     C5, OFFSET_4, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01,   PERMRSHIFT1
        vperm   c01, c01,   c02,   PERMRSHIFT1
        vperm   c02, c02,   c03,   PERMRSHIFT1
        vperm   c03, c03,   c04,   PERMRSHIFT1
        vperm   c04, c04,   VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3
        vaddfp  c03, c03, C4
        vaddfp  c04, c04, C5

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1
        stvx    c03, OFFSET_3, CO1
        stvx    c04, OFFSET_4, CO1

        addi    CO1, CO1, 16 * SIZE
        addic.  I, I, -1
        bgt+    LL(61)
        .align 4

LL(70):
        andi.   I, M, 4
        ble     LL(80)

        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        vxor    c03, c03, c03
        LOAD_A  a1, OFFSET_0, AO
        vxor    c04, c04, c04
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c07, c07, c07
        vxor    c08, c08, c08

        mr      BO, B

        vspltw  bp1, b1, 0
        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(75)
        .align 4

LL(72):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06

        vmaddfp c03, a3, bp1, c03
        vspltw  bp2, b1, 3
        vmaddfp c04, a4, bp1, c04

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08

        addi    AO, AO, 16 * SIZE
        addi    BO, BO,  4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        bdnz    LL(72)
        .align 4

LL(75):
        andi.   r0, K, 1
        ble+    LL(78)
        .align 4

LL(76):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi    AO, AO, 8 * SIZE
        vmaddfp c05, a1, bp2, c05
        addi    BO, BO, 2 * SIZE
        vmaddfp c06, a2, bp2, c06
        .align 4

LL(78):
        vaddfp  c01, c01, c03
        vaddfp  c02, c02, c04
        vaddfp  c05, c05, c07
        vaddfp  c06, c06, c08

        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap
        vperm   c06, c06, c06, swap

        vxor    c05, c05, neg
        vxor    c06, c06, neg

        vaddfp  c01, c01, c05
        vaddfp  c02, c02, c06

        vperm   c05, c01, c01, swap
        vperm   c06, c02, c02, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c02, alpha_r, c02, VZERO
        vmaddfp c01, alpha_i, c05, c01
        vmaddfp c02, alpha_i, c06, c02

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01,   PERMRSHIFT1
        vperm   c01, c01,   c02,   PERMRSHIFT1
        vperm   c02, c02,   VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2
        vaddfp  c02, c02, C3

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1

        addi    CO1, CO1, 8 * SIZE
        .align 4

LL(80):
        andi.   I, M, 2
        ble     LL(90)

        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        vxor    c06, c06, c06

        mr      BO, B

        vspltw  bp1, b1, 0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(85)
        .align 4

LL(82):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2

        vmaddfp c02, a2, bp1, c02
        vspltw  bp2, b1, 3

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c06, a2, bp2, c06

        addi    AO, AO, 8 * SIZE
        addi    BO, BO, 4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        bdnz    LL(82)
        .align 4

LL(85):
        andi.   r0, K, 1
        ble+    LL(88)
        .align 4

LL(86):
        vspltw  bp2, b1, 1
        vmaddfp c01, a1, bp1, c01
        vmaddfp c05, a1, bp2, c05
        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 2 * SIZE
        .align 4

LL(88):
        vaddfp  c01, c01, c02
        vaddfp  c05, c05, c06
        vaddfp  c09, c09, c10
        vaddfp  c13, c13, c14

        vxor    VZERO, VZERO, VZERO

        lvx     swap,    OFFSET_0, SP
        lvx     neg,     OFFSET_1, SP
        lvx     alpha_r, OFFSET_2, SP
        lvx     alpha_i, OFFSET_3, SP

        vperm   c05, c05, c05, swap

        vxor    c05, c05, neg

        vaddfp  c01, c01, c05

        vperm   c05, c01, c01, swap

        vmaddfp c01, alpha_r, c01, VZERO
        vmaddfp c01, alpha_i, c05, c01

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01,   PERMRSHIFT1
        vperm   c01, c01,   VZERO, PERMRSHIFT1

        vaddfp  c00, c00, C1
        vaddfp  c01, c01, C2

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1

        addi    CO1, CO1, 4 * SIZE
        .align 4

LL(90):
        andi.   I, M, 1
        ble     LL(999)

        mr      BO, B

        LFD     f8,  0 * SIZE(AO)
        LFD     f9,  1 * SIZE(AO)

        LFD     f10, 0 * SIZE(BO)
        LFD     f11, 1 * SIZE(BO)
        LFD     f12, 2 * SIZE(BO)
        LFD     f13, 3 * SIZE(BO)

        lfs     f0, FZERO(SP)
        fmr     f1, f0
        fmr     f2, f0
        fmr     f3, f0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(95)
        .align 4

LL(92):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3

        LFD     f8,  2 * SIZE(AO)
        LFD     f9,  3 * SIZE(AO)
        LFD     f10, 4 * SIZE(BO)
        LFD     f11, 5 * SIZE(BO)

        fmadd   f0, f8, f12, f0
        fmadd   f2, f8, f13, f2
        fmadd   f1, f9, f12, f1
        fmadd   f3, f9, f13, f3

        LFD     f8,  4 * SIZE(AO)
        LFD     f9,  5 * SIZE(AO)
        LFD     f12, 6 * SIZE(BO)
        LFD     f13, 7 * SIZE(BO)

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 4 * SIZE
        bdnz    LL(92)
        .align 4

LL(95):
        andi.   r0, K, 1
        ble     LL(98)
        .align 4

LL(96):
        fmadd   f0, f8, f10, f0
        fmadd   f2, f8, f11, f2
        fmadd   f1, f9, f10, f1
        fmadd   f3, f9, f11, f3
        .align 4

LL(98):
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
        fsub    f0, f0, f3
        fadd    f1, f1, f2
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        fadd    f0, f0, f3
        fsub    f1, f1, f2
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        fadd    f0, f0, f3
        fsub    f1, f2, f1
#else  /* RR, RC, CR, CC */
        fsub    f0, f0, f3
        fadd    f1, f1, f2
#endif

        LFD     f8, 0 * SIZE(CO1)
        LFD     f9, 1 * SIZE(CO1)

        lfs     f12, ALPHA_R + 0(SP)
        lfs     f13, ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
        fmadd   f8, f12, f0, f8
        fnmsub  f9, f12, f1, f9

        fmadd   f8, f13, f1, f8
        fmadd   f9, f13, f0, f9
#else
        fmadd   f8, f12, f0, f8
        fmadd   f9, f12, f1, f9

        fnmsub  f8, f13, f1, f8
        fmadd   f9, f13, f0, f9
#endif

        STFD    f8, 0 * SIZE(CO1)
        STFD    f9, 1 * SIZE(CO1)
        .align 4

LL(999):
        mr      SP, STACK

        li      r0,  0 * 16
        lvx     v20, SP, r0
        li      r0,  1 * 16
        lvx     v21, SP, r0
        li      r0,  2 * 16
        lvx     v22, SP, r0
        li      r0,  3 * 16
        lvx     v23, SP, r0
        li      r0,  4 * 16
        lvx     v24, SP, r0
        li      r0,  5 * 16
        lvx     v25, SP, r0
        li      r0,  6 * 16
        lvx     v26, SP, r0
        li      r0,  7 * 16
        lvx     v27, SP, r0
        li      r0,  8 * 16
        lvx     v28, SP, r0
        li      r0,  9 * 16
        lvx     v29, SP, r0
        li      r0, 10 * 16
        lvx     v30, SP, r0
        li      r0, 11 * 16
        lvx     v31, SP, r0

        mtspr   VRsave, VREG

#ifdef __64BIT__
        ld      r31,  192(SP)
        ld      r30,  200(SP)
        ld      r29,  208(SP)
        ld      r28,  216(SP)
        ld      r27,  224(SP)
        ld      r26,  232(SP)
        ld      r25,  240(SP)
        ld      r24,  248(SP)
        ld      r23,  256(SP)
        ld      r22,  264(SP)
        ld      r21,  272(SP)
        ld      r20,  280(SP)
        ld      r19,  288(SP)
        ld      r18,  296(SP)
        ld      r17,  304(SP)
        ld      r16,  312(SP)
        ld      r15,  320(SP)
        ld      r14,  328(SP)
#else
        lwz     r31,  192(SP)
        lwz     r30,  196(SP)
        lwz     r29,  200(SP)
        lwz     r28,  204(SP)
        lwz     r27,  208(SP)
        lwz     r26,  212(SP)
        lwz     r25,  216(SP)
        lwz     r24,  220(SP)
        lwz     r23,  224(SP)
        lwz     r22,  228(SP)
        lwz     r21,  232(SP)
        lwz     r20,  236(SP)
        lwz     r19,  240(SP)
        lwz     r18,  244(SP)
        lwz     r17,  248(SP)
        lwz     r16,  252(SP)
        lwz     r15,  256(SP)
        lwz     r14,  260(SP)
#endif

        addi    SP, SP, STACKSIZE

        blr

        EPILOGUE
#endif