/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind. 
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#ifndef __64BIT__ 26#define LOAD lwz 27#else 28#define LOAD ld 29#endif 30 31#ifdef __64BIT__ 32#define STACKSIZE 360 33#else 34#define STACKSIZE 272 35#endif 36 37#define ALPHA 0 38#define FZERO 16 39 40#define M r3 41#define N r4 42#define K r5 43 44#ifdef linux 45#ifndef __64BIT__ 46#define A r6 47#define B r7 48#define C r8 49#define LDC r9 50#else 51#define A r7 52#define B r8 53#define C r9 54#define LDC r10 55#endif 56#endif 57 58#if defined(_AIX) || defined(__APPLE__) 59#if !defined(__64BIT__) && defined(DOUBLE) 60#define A r8 61#define B r9 62#define C r10 63#define LDC r7 64#else 65#define A r7 66#define B r8 67#define C r9 68#define LDC r10 69#endif 70#endif 71 72#define STACK r11 73 74#define I r21 75#define J r22 76#define AO r23 77#define BO r24 78#define CO1 r25 79#define CO2 r26 80#define CO3 r27 81#define CO4 r28 82 83#define PREA r29 84#define PREB r29 85#define PREC r30 86#define VREG r31 87 88#define LOAD_A lvx 89#define LOAD_B lvx 90 91#define OFFSET_0 0 92#define OFFSET_1 r14 93#define OFFSET_2 r15 94#define OFFSET_3 r16 95#define OFFSET_4 r17 96#define OFFSET_5 r18 97#define OFFSET_6 r19 98#define OFFSET_7 r20 99 100#define c01 v0 101#define c02 v1 102#define c03 v2 103#define c04 v3 104#define c05 v4 105#define c06 v5 106#define c07 v6 107#define c08 v7 108#define c09 v8 109#define c10 v9 110#define c11 v10 111#define c12 v11 112#define c13 v12 113#define c14 v13 114#define c15 v14 115#define c16 v15 116 117#define a1 v16 118#define a2 v17 119#define a3 v18 120#define a4 v19 121#define a5 v20 122#define a6 v21 123#define a7 v22 124#define a8 v23 125 126#define b1 v24 127#define b2 v25 128#define bp1 v26 129#define bp2 v27 130 131#define C1 v16 132#define C2 v17 133#define C3 v18 134#define C4 v19 135#define C5 v20 136#define C6 v21 137#define C7 v22 138#define C8 v23 139#define C9 v24 140 141#define c00 v25 142 
143#define PERMRSHIFT1 v26 144#define PERMRSHIFT2 v27 145#define PERMRSHIFT3 v28 146#define PERMRSHIFT4 v29 147 148#define VZERO v30 149#define alpha v31 150 151#ifndef NEEDPARAM 152 153#ifndef DOUBLE 154#include "../sparam.h" 155#else 156#include "../dparam.h" 157#endif 158 159 PROLOGUE 160 PROFCODE 161 162 addi SP, SP, -STACKSIZE 163 mr STACK, SP 164 165 li r0, 0 * 16 166 stvx v20, SP, r0 167 li r0, 1 * 16 168 stvx v21, SP, r0 169 li r0, 2 * 16 170 stvx v22, SP, r0 171 li r0, 3 * 16 172 stvx v23, SP, r0 173 li r0, 4 * 16 174 stvx v24, SP, r0 175 li r0, 5 * 16 176 stvx v25, SP, r0 177 li r0, 6 * 16 178 stvx v26, SP, r0 179 li r0, 7 * 16 180 stvx v27, SP, r0 181 li r0, 8 * 16 182 stvx v28, SP, r0 183 li r0, 9 * 16 184 stvx v29, SP, r0 185 li r0, 10 * 16 186 stvx v30, SP, r0 187 li r0, 11 * 16 188 stvx v31, SP, r0 189 190#ifdef __64BIT__ 191 std r31, 192(SP) 192 std r30, 200(SP) 193 std r29, 208(SP) 194 std r28, 216(SP) 195 std r27, 224(SP) 196 std r26, 232(SP) 197 std r25, 240(SP) 198 std r24, 248(SP) 199 std r23, 256(SP) 200 std r22, 264(SP) 201 std r21, 272(SP) 202 std r20, 280(SP) 203 std r19, 288(SP) 204 std r18, 296(SP) 205 std r17, 304(SP) 206 std r16, 312(SP) 207 std r15, 320(SP) 208 std r14, 328(SP) 209#else 210 stw r31, 192(SP) 211 stw r30, 196(SP) 212 stw r29, 200(SP) 213 stw r28, 204(SP) 214 stw r27, 208(SP) 215 stw r26, 212(SP) 216 stw r25, 216(SP) 217 stw r24, 220(SP) 218 stw r23, 224(SP) 219 stw r22, 228(SP) 220 stw r21, 232(SP) 221 stw r20, 236(SP) 222 stw r19, 240(SP) 223 stw r18, 244(SP) 224 stw r17, 248(SP) 225 stw r16, 252(SP) 226 stw r15, 256(SP) 227 stw r14, 260(SP) 228#endif 229 230 231#if defined(_AIX) || defined(__APPLE__) 232#if !defined(__64BIT__) && defined(DOUBLE) 233 lwz LDC, 56 + STACKSIZE(SP) 234#endif 235#endif 236 237 li r0, -1 238 239 mfspr VREG, VRsave 240 mtspr VRsave, r0 241 242 addi SP, SP, -128 243 li r0, -128 244 and SP, SP, r0 245 246 li OFFSET_1, 4 * SIZE 247 li OFFSET_2, 8 * SIZE 248 li OFFSET_3, 12 * SIZE 249 li OFFSET_4, 
16 * SIZE 250 li OFFSET_5, 20 * SIZE 251 li OFFSET_6, 24 * SIZE 252 li OFFSET_7, 28 * SIZE 253 254 stfs f1, ALPHA + 0(SP) 255 stfs f1, ALPHA + 4(SP) 256 stfs f1, ALPHA + 8(SP) 257 stfs f1, ALPHA + 12(SP) 258 259 li r29, 0 260 stw r29, FZERO(SP) 261 262 slwi LDC, LDC, BASE_SHIFT 263 264 li PREC, (15 * SIZE) 265#ifdef CELL 266 li PREB, (5 * 32 * SIZE) 267#else 268 li PREB, (5 * 32 * SIZE) 269#endif 270 271 cmpwi cr0, M, 0 272 ble LL(999) 273 cmpwi cr0, N, 0 274 ble LL(999) 275 cmpwi cr0, K, 0 276 ble LL(999) 277 278 srawi. J, N, 2 279 ble LL(60) 280 .align 4 281 282LL(01): 283 mr CO1, C 284 add CO2, C, LDC 285 add CO3, CO2, LDC 286 add CO4, CO3, LDC 287 add C, CO4, LDC 288 289 mr AO, A 290 srawi. I, M, 4 291 ble LL(20) 292 .align 4 293 294LL(11): 295 vxor c01, c01, c01 296 LOAD_B b1, OFFSET_0, B 297 vxor c02, c02, c02 298 LOAD_A a1, OFFSET_0, AO 299 vxor c03, c03, c03 300 LOAD_A a2, OFFSET_1, AO 301 vxor c04, c04, c04 302 LOAD_A a3, OFFSET_2, AO 303 304 vxor c05, c05, c05 305 vxor c06, c06, c06 306 vxor c07, c07, c07 307 vxor c08, c08, c08 308 309 vxor c09, c09, c09 310 dcbtst CO1, PREC 311 vxor c10, c10, c10 312 dcbtst CO2, PREC 313 vxor c11, c11, c11 314 dcbtst CO3, PREC 315 vxor c12, c12, c12 316 dcbtst CO4, PREC 317 vxor c13, c13, c13 318 mr BO, B 319 vxor c14, c14, c14 320 srawi. 
r0, K, 2 321 vxor c15, c15, c15 322 mtspr CTR, r0 323 vxor c16, c16, c16 324 vspltw bp1, b1, 0 325 ble LL(13) 326 .align 4 327 328#define NOP1 mr r3, r3 329#define NOP2 mr r4, r4 330 331LL(12): 332 vmaddfp c01, a1, bp1, c01 333 vspltw bp2, b1, 1 334 vmaddfp c02, a2, bp1, c02 335 LOAD_A a4, OFFSET_3, AO 336 vmaddfp c03, a3, bp1, c03 337 dcbt AO, PREA 338 vmaddfp c04, a4, bp1, c04 339 NOP2 340 341 vmaddfp c05, a1, bp2, c05 342 vspltw bp1, b1, 2 343 vmaddfp c06, a2, bp2, c06 344 NOP2 345 vmaddfp c07, a3, bp2, c07 346 NOP1 347 vmaddfp c08, a4, bp2, c08 348 dcbt BO, PREB 349 350 vmaddfp c09, a1, bp1, c09 351 vspltw bp2, b1, 3 352 vmaddfp c10, a2, bp1, c10 353 LOAD_B b2, OFFSET_1, BO 354 vmaddfp c11, a3, bp1, c11 355 addi BO, BO, 8 * SIZE 356 vmaddfp c12, a4, bp1, c12 357 NOP1 358 359 vmaddfp c13, a1, bp2, c13 360 vspltw bp1, b2, 0 361 vmaddfp c14, a2, bp2, c14 362 LOAD_A a5, OFFSET_4, AO 363 vmaddfp c15, a3, bp2, c15 364 LOAD_A a6, OFFSET_5, AO 365 vmaddfp c16, a4, bp2, c16 366 vspltw bp2, b2, 1 367 368 vmaddfp c01, a5, bp1, c01 369 LOAD_A a7, OFFSET_6, AO 370 vmaddfp c02, a6, bp1, c02 371 LOAD_A a8, OFFSET_7, AO 372 vmaddfp c03, a7, bp1, c03 373 NOP1 374 vmaddfp c04, a8, bp1, c04 375 NOP2 376 377 vmaddfp c05, a5, bp2, c05 378 vspltw bp1, b2, 2 379 vmaddfp c06, a6, bp2, c06 380 addi AO, AO, 32 * SIZE 381 vmaddfp c07, a7, bp2, c07 382 LOAD_B b1, OFFSET_0, BO 383 vmaddfp c08, a8, bp2, c08 384 NOP1 385 386 vmaddfp c09, a5, bp1, c09 387 vspltw bp2, b2, 3 388 vmaddfp c10, a6, bp1, c10 389 NOP2 390 vmaddfp c11, a7, bp1, c11 391 NOP1 392 vmaddfp c12, a8, bp1, c12 393 dcbt AO, PREA 394 395 vmaddfp c13, a5, bp2, c13 396 vspltw bp1, b1, 0 397 vmaddfp c14, a6, bp2, c14 398 LOAD_A a1, OFFSET_0, AO // 399 vmaddfp c15, a7, bp2, c15 400 LOAD_A a2, OFFSET_1, AO 401 vmaddfp c16, a8, bp2, c16 402 vspltw bp2, b1, 1 403 404 vmaddfp c01, a1, bp1, c01 405 LOAD_A a3, OFFSET_2, AO 406 vmaddfp c02, a2, bp1, c02 407 LOAD_A a4, OFFSET_3, AO 408 vmaddfp c03, a3, bp1, c03 409 NOP1 410 vmaddfp c04, 
a4, bp1, c04 411 NOP2 412 413 vmaddfp c05, a1, bp2, c05 414 vspltw bp1, b1, 2 415 vmaddfp c06, a2, bp2, c06 416 NOP2 417 vmaddfp c07, a3, bp2, c07 418 NOP1 419 vmaddfp c08, a4, bp2, c08 420 LOAD_B b2, OFFSET_1, BO 421 422 vmaddfp c09, a1, bp1, c09 423 vspltw bp2, b1, 3 424 vmaddfp c10, a2, bp1, c10 425 NOP2 426 vmaddfp c11, a3, bp1, c11 427 NOP1 428 vmaddfp c12, a4, bp1, c12 429 addi BO, BO, 8 * SIZE 430 431 vmaddfp c13, a1, bp2, c13 432 vspltw bp1, b2, 0 433 vmaddfp c14, a2, bp2, c14 434 LOAD_A a5, OFFSET_4, AO 435 vmaddfp c15, a3, bp2, c15 436 LOAD_A a6, OFFSET_5, AO 437 vmaddfp c16, a4, bp2, c16 438 vspltw bp2, b2, 1 439 440 vmaddfp c01, a5, bp1, c01 441 LOAD_A a7, OFFSET_6, AO 442 vmaddfp c02, a6, bp1, c02 443 LOAD_A a8, OFFSET_7, AO 444 vmaddfp c03, a7, bp1, c03 445 addi AO, AO, 32 * SIZE 446 vmaddfp c04, a8, bp1, c04 447 NOP2 448 449 vmaddfp c05, a5, bp2, c05 450 vspltw bp1, b2, 2 451 vmaddfp c06, a6, bp2, c06 452 NOP2 453 vmaddfp c07, a7, bp2, c07 454 NOP1 455 vmaddfp c08, a8, bp2, c08 456 LOAD_B b1, OFFSET_0, BO 457 458 vmaddfp c09, a5, bp1, c09 459 vspltw bp2, b2, 3 460 vmaddfp c10, a6, bp1, c10 461 LOAD_A a1, OFFSET_0, AO // 462 vmaddfp c11, a7, bp1, c11 463 NOP2 464 vmaddfp c12, a8, bp1, c12 465 vspltw bp1, b1, 0 466 467 vmaddfp c13, a5, bp2, c13 468 LOAD_A a2, OFFSET_1, AO 469 vmaddfp c14, a6, bp2, c14 470 LOAD_A a3, OFFSET_2, AO 471 vmaddfp c15, a7, bp2, c15 472 NOP1 473 vmaddfp c16, a8, bp2, c16 474 bdnz+ LL(12) 475 .align 4 476 477LL(13): 478 andi. 
r0, K, 2 479 nop 480 nop 481 ble+ LL(15) 482 .align 4 483 484 vmaddfp c01, a1, bp1, c01 485 vspltw bp2, b1, 1 486 vmaddfp c02, a2, bp1, c02 487 LOAD_A a4, OFFSET_3, AO 488 vmaddfp c03, a3, bp1, c03 489 NOP1 490 vmaddfp c04, a4, bp1, c04 491 NOP2 492 493 vmaddfp c05, a1, bp2, c05 494 vspltw bp1, b1, 2 495 vmaddfp c06, a2, bp2, c06 496 NOP2 497 vmaddfp c07, a3, bp2, c07 498 NOP1 499 vmaddfp c08, a4, bp2, c08 500 LOAD_B b2, OFFSET_1, BO 501 502 vmaddfp c09, a1, bp1, c09 503 vspltw bp2, b1, 3 504 vmaddfp c10, a2, bp1, c10 505 LOAD_A a5, OFFSET_4, AO 506 vmaddfp c11, a3, bp1, c11 507 LOAD_A a6, OFFSET_5, AO 508 vmaddfp c12, a4, bp1, c12 509 addi BO, BO, 8 * SIZE 510 511 vmaddfp c13, a1, bp2, c13 512 vspltw bp1, b2, 0 513 vmaddfp c14, a2, bp2, c14 514 LOAD_A a7, OFFSET_6, AO 515 vmaddfp c15, a3, bp2, c15 516 LOAD_A a8, OFFSET_7, AO 517 vmaddfp c16, a4, bp2, c16 518 addi AO, AO, 32 * SIZE 519 520 vmaddfp c01, a5, bp1, c01 521 vspltw bp2, b2, 1 522 vmaddfp c02, a6, bp1, c02 523 NOP2 524 vmaddfp c03, a7, bp1, c03 525 NOP1 526 vmaddfp c04, a8, bp1, c04 527 NOP2 528 529 vmaddfp c05, a5, bp2, c05 530 vspltw bp1, b2, 2 531 vmaddfp c06, a6, bp2, c06 532 NOP2 533 vmaddfp c07, a7, bp2, c07 534 NOP1 535 vmaddfp c08, a8, bp2, c08 536 LOAD_B b1, OFFSET_0, BO 537 538 vmaddfp c09, a5, bp1, c09 539 vspltw bp2, b2, 3 540 vmaddfp c10, a6, bp1, c10 541 LOAD_A a1, OFFSET_0, AO 542 vmaddfp c11, a7, bp1, c11 543 LOAD_A a2, OFFSET_1, AO 544 vmaddfp c12, a8, bp1, c12 545 NOP2 546 547 vmaddfp c13, a5, bp2, c13 548 vspltw bp1, b1, 0 549 vmaddfp c14, a6, bp2, c14 550 LOAD_A a3, OFFSET_2, AO 551 vmaddfp c15, a7, bp2, c15 552 vmaddfp c16, a8, bp2, c16 553 .align 4 554 555LL(15): 556 andi. 
r0, K, 1 557 lvx alpha, OFFSET_0, SP 558 vxor VZERO, VZERO, VZERO 559 ble+ LL(18) 560 .align 4 561 562 vmaddfp c01, a1, bp1, c01 563 vspltw bp2, b1, 1 564 vmaddfp c02, a2, bp1, c02 565 LOAD_A a4, OFFSET_3, AO 566 vmaddfp c03, a3, bp1, c03 567 nop 568 vmaddfp c04, a4, bp1, c04 569 nop 570 571 vmaddfp c05, a1, bp2, c05 572 vspltw bp1, b1, 2 573 vmaddfp c06, a2, bp2, c06 574 nop 575 vmaddfp c07, a3, bp2, c07 576 nop 577 vmaddfp c08, a4, bp2, c08 578 nop 579 580 vmaddfp c09, a1, bp1, c09 581 vspltw bp2, b1, 3 582 vmaddfp c10, a2, bp1, c10 583 addi AO, AO, 16 * SIZE 584 vmaddfp c11, a3, bp1, c11 585 addi BO, BO, 4 * SIZE 586 vmaddfp c12, a4, bp1, c12 587 nop 588 589 vmaddfp c13, a1, bp2, c13 590 vmaddfp c14, a2, bp2, c14 591 vmaddfp c15, a3, bp2, c15 592 vmaddfp c16, a4, bp2, c16 593 .align 4 594 595LL(18): 596 lvx C1, OFFSET_0, CO1 597 cmpwi cr0, LDC, 32 * SIZE 598 lvx C2, OFFSET_1, CO1 599 lvsr PERMRSHIFT1, 0, CO1 600 lvx C3, OFFSET_2, CO1 601 lvsr PERMRSHIFT2, 0, CO2 602 lvx C4, OFFSET_3, CO1 603 lvsr PERMRSHIFT3, 0, CO3 604 lvx C5, OFFSET_4, CO1 605 lvsr PERMRSHIFT4, 0, CO4 606 ble LL(19) 607 608 vperm c00, VZERO, c01, PERMRSHIFT1 609 vperm c01, c01, c02, PERMRSHIFT1 610 vperm c02, c02, c03, PERMRSHIFT1 611 vperm c03, c03, c04, PERMRSHIFT1 612 vperm c04, c04, VZERO, PERMRSHIFT1 613 614 vmaddfp c00, alpha, c00, C1 615 lvx C1, OFFSET_0, CO2 616 vmaddfp c01, alpha, c01, C2 617 lvx C6, OFFSET_1, CO2 618 vmaddfp c02, alpha, c02, C3 619 lvx C7, OFFSET_2, CO2 620 vmaddfp c03, alpha, c03, C4 621 lvx C8, OFFSET_3, CO2 622 vmaddfp c04, alpha, c04, C5 623 lvx C9, OFFSET_4, CO2 624 625 stvx c00, OFFSET_0, CO1 626 vperm c00, VZERO, c05, PERMRSHIFT2 627 stvx c01, OFFSET_1, CO1 628 vperm c05, c05, c06, PERMRSHIFT2 629 stvx c02, OFFSET_2, CO1 630 vperm c06, c06, c07, PERMRSHIFT2 631 stvx c03, OFFSET_3, CO1 632 vperm c07, c07, c08, PERMRSHIFT2 633 stvx c04, OFFSET_4, CO1 634 vperm c08, c08, VZERO, PERMRSHIFT2 635 636 vmaddfp c00, alpha, c00, C1 637 lvx C1, OFFSET_0, CO3 638 vmaddfp 
c05, alpha, c05, C6 639 lvx C2, OFFSET_1, CO3 640 vmaddfp c06, alpha, c06, C7 641 lvx C3, OFFSET_2, CO3 642 vmaddfp c07, alpha, c07, C8 643 lvx C4, OFFSET_3, CO3 644 vmaddfp c08, alpha, c08, C9 645 lvx C5, OFFSET_4, CO3 646 647 stvx c00, OFFSET_0, CO2 648 vperm c00, VZERO, c09, PERMRSHIFT3 649 stvx c05, OFFSET_1, CO2 650 vperm c09, c09, c10, PERMRSHIFT3 651 stvx c06, OFFSET_2, CO2 652 vperm c10, c10, c11, PERMRSHIFT3 653 stvx c07, OFFSET_3, CO2 654 vperm c11, c11, c12, PERMRSHIFT3 655 stvx c08, OFFSET_4, CO2 656 vperm c12, c12, VZERO, PERMRSHIFT3 657 658 vmaddfp c00, alpha, c00, C1 659 lvx C9, OFFSET_4, CO4 660 vmaddfp c09, alpha, c09, C2 661 lvx C1, OFFSET_0, CO4 662 vmaddfp c10, alpha, c10, C3 663 lvx C6, OFFSET_1, CO4 664 vmaddfp c11, alpha, c11, C4 665 lvx C7, OFFSET_2, CO4 666 vmaddfp c12, alpha, c12, C5 667 lvx C8, OFFSET_3, CO4 668 669 stvx c00, OFFSET_0, CO3 670 vperm c00, VZERO, c13, PERMRSHIFT4 671 stvx c09, OFFSET_1, CO3 672 vperm c13, c13, c14, PERMRSHIFT4 673 stvx c10, OFFSET_2, CO3 674 vperm c14, c14, c15, PERMRSHIFT4 675 stvx c11, OFFSET_3, CO3 676 vperm c15, c15, c16, PERMRSHIFT4 677 stvx c12, OFFSET_4, CO3 678 vperm c16, c16, VZERO, PERMRSHIFT4 679 680 vmaddfp c00, alpha, c00, C1 681 vmaddfp c13, alpha, c13, C6 682 vmaddfp c14, alpha, c14, C7 683 vmaddfp c15, alpha, c15, C8 684 vmaddfp c16, alpha, c16, C9 685 686 stvx c00, OFFSET_0, CO4 687 stvx c13, OFFSET_1, CO4 688 stvx c14, OFFSET_2, CO4 689 stvx c15, OFFSET_3, CO4 690 stvx c16, OFFSET_4, CO4 691 692 addi CO1, CO1, 16 * SIZE 693 addi CO2, CO2, 16 * SIZE 694 addi CO3, CO3, 16 * SIZE 695 addi CO4, CO4, 16 * SIZE 696 697 addic. 
I, I, -1 698 bgt+ LL(11) 699 b LL(20) 700 .align 4 701 702LL(19): 703 lvx C6, OFFSET_1, CO2 704 lvx C7, OFFSET_2, CO2 705 lvx C8, OFFSET_3, CO2 706 lvx C9, OFFSET_4, CO2 707 708 vperm c00, VZERO, c01, PERMRSHIFT1 709 vperm c01, c01, c02, PERMRSHIFT1 710 vperm c02, c02, c03, PERMRSHIFT1 711 vperm c03, c03, c04, PERMRSHIFT1 712 vperm c04, c04, VZERO, PERMRSHIFT1 713 714 vmaddfp c00, alpha, c00, C1 715 vmaddfp c01, alpha, c01, C2 716 lvx C2, OFFSET_1, CO3 717 vmaddfp c02, alpha, c02, C3 718 lvx C3, OFFSET_2, CO3 719 vmaddfp c03, alpha, c03, C4 720 lvx C4, OFFSET_3, CO3 721 vmaddfp c04, alpha, c04, C5 722 lvx C5, OFFSET_4, CO3 723 724 stvx c00, OFFSET_0, CO1 725 stvx c01, OFFSET_1, CO1 726 stvx c02, OFFSET_2, CO1 727 stvx c03, OFFSET_3, CO1 728 stvx c04, OFFSET_4, CO1 729 730 lvx C1, OFFSET_0, CO2 731 732 vperm c00, VZERO, c05, PERMRSHIFT2 733 vperm c05, c05, c06, PERMRSHIFT2 734 vperm c06, c06, c07, PERMRSHIFT2 735 vperm c07, c07, c08, PERMRSHIFT2 736 vperm c08, c08, VZERO, PERMRSHIFT2 737 738 vmaddfp c00, alpha, c00, C1 739 vmaddfp c05, alpha, c05, C6 740 lvx C6, OFFSET_1, CO4 741 vmaddfp c06, alpha, c06, C7 742 lvx C7, OFFSET_2, CO4 743 vmaddfp c07, alpha, c07, C8 744 lvx C8, OFFSET_3, CO4 745 vmaddfp c08, alpha, c08, C9 746 lvx C9, OFFSET_4, CO4 747 748 stvx c00, OFFSET_0, CO2 749 stvx c05, OFFSET_1, CO2 750 stvx c06, OFFSET_2, CO2 751 stvx c07, OFFSET_3, CO2 752 stvx c08, OFFSET_4, CO2 753 754 lvx C1, OFFSET_0, CO3 755 756 vperm c00, VZERO, c09, PERMRSHIFT3 757 vperm c09, c09, c10, PERMRSHIFT3 758 vperm c10, c10, c11, PERMRSHIFT3 759 vperm c11, c11, c12, PERMRSHIFT3 760 vperm c12, c12, VZERO, PERMRSHIFT3 761 762 vmaddfp c00, alpha, c00, C1 763 vmaddfp c09, alpha, c09, C2 764 vmaddfp c10, alpha, c10, C3 765 vmaddfp c11, alpha, c11, C4 766 vmaddfp c12, alpha, c12, C5 767 768 stvx c00, OFFSET_0, CO3 769 stvx c09, OFFSET_1, CO3 770 stvx c10, OFFSET_2, CO3 771 stvx c11, OFFSET_3, CO3 772 stvx c12, OFFSET_4, CO3 773 774 lvx C1, OFFSET_0, CO4 775 776 vperm c00, VZERO, 
c13, PERMRSHIFT4 777 vperm c13, c13, c14, PERMRSHIFT4 778 vperm c14, c14, c15, PERMRSHIFT4 779 vperm c15, c15, c16, PERMRSHIFT4 780 vperm c16, c16, VZERO, PERMRSHIFT4 781 782 vmaddfp c00, alpha, c00, C1 783 vmaddfp c13, alpha, c13, C6 784 vmaddfp c14, alpha, c14, C7 785 vmaddfp c15, alpha, c15, C8 786 vmaddfp c16, alpha, c16, C9 787 788 stvx c00, OFFSET_0, CO4 789 stvx c13, OFFSET_1, CO4 790 stvx c14, OFFSET_2, CO4 791 stvx c15, OFFSET_3, CO4 792 stvx c16, OFFSET_4, CO4 793 794 addi CO1, CO1, 16 * SIZE 795 addi CO2, CO2, 16 * SIZE 796 addi CO3, CO3, 16 * SIZE 797 addi CO4, CO4, 16 * SIZE 798 799 addic. I, I, -1 800 bgt+ LL(11) 801 .align 4 802 803LL(20): 804 andi. I, M, 8 805 ble LL(30) 806 807 vxor c01, c01, c01 808 LOAD_A a1, OFFSET_0, AO 809 vxor c02, c02, c02 810 LOAD_A a2, OFFSET_1, AO 811 vxor c05, c05, c05 812 LOAD_A a3, OFFSET_2, AO 813 vxor c06, c06, c06 814 LOAD_A a4, OFFSET_3, AO 815 vxor c09, c09, c09 816 LOAD_B b1, OFFSET_0, B 817 vxor c10, c10, c10 818 LOAD_B b2, OFFSET_1, B 819 vxor c13, c13, c13 820 vxor c14, c14, c14 821 mr BO, B 822 vspltw bp1, b1, 0 823 824 srawi. 
r0, K, 1 825 mtspr CTR, r0 826 ble LL(25) 827 .align 4 828 829LL(22): 830 vmaddfp c01, a1, bp1, c01 831 vspltw bp2, b1, 1 832 addi AO, AO, 16 * SIZE 833 vmaddfp c02, a2, bp1, c02 834 addi BO, BO, 8 * SIZE 835 836 vmaddfp c05, a1, bp2, c05 837 vspltw bp1, b1, 2 838 vmaddfp c06, a2, bp2, c06 839 840 vmaddfp c09, a1, bp1, c09 841 vspltw bp2, b1, 3 842 LOAD_B b1, OFFSET_0, BO 843 vmaddfp c10, a2, bp1, c10 844 845 vmaddfp c13, a1, bp2, c13 846 LOAD_A a1, OFFSET_0, AO 847 vspltw bp1, b2, 0 848 vmaddfp c14, a2, bp2, c14 849 LOAD_A a2, OFFSET_1, AO 850 851 vmaddfp c01, a3, bp1, c01 852 vspltw bp2, b2, 1 853 vmaddfp c02, a4, bp1, c02 854 855 vmaddfp c05, a3, bp2, c05 856 vspltw bp1, b2, 2 857 vmaddfp c06, a4, bp2, c06 858 859 vmaddfp c09, a3, bp1, c09 860 vspltw bp2, b2, 3 861 LOAD_B b2, OFFSET_1, BO 862 vmaddfp c10, a4, bp1, c10 863 864 vmaddfp c13, a3, bp2, c13 865 LOAD_A a3, OFFSET_2, AO 866 vmaddfp c14, a4, bp2, c14 867 LOAD_A a4, OFFSET_3, AO 868 vspltw bp1, b1, 0 869 bdnz LL(22) 870 .align 4 871 872LL(25): 873 andi. 
r0, K, 1 874 lvx alpha, OFFSET_0, SP 875 vxor VZERO, VZERO, VZERO 876 ble+ LL(28) 877 .align 4 878 879LL(26): 880 vmaddfp c01, a1, bp1, c01 881 vspltw bp2, b1, 1 882 vmaddfp c02, a2, bp1, c02 883 nop 884 885 vmaddfp c05, a1, bp2, c05 886 vspltw bp1, b1, 2 887 vmaddfp c06, a2, bp2, c06 888 nop 889 890 vmaddfp c09, a1, bp1, c09 891 vspltw bp2, b1, 3 892 vmaddfp c10, a2, bp1, c10 893 addi AO, AO, 8 * SIZE 894 895 vmaddfp c13, a1, bp2, c13 896 addi BO, BO, 4 * SIZE 897 vmaddfp c14, a2, bp2, c14 898 nop 899 .align 4 900 901LL(28): 902 lvx C1, OFFSET_0, CO1 903 lvx C2, OFFSET_1, CO1 904 lvx C3, OFFSET_2, CO1 905 906 lvsr PERMRSHIFT1, 0, CO1 907 lvsr PERMRSHIFT2, 0, CO2 908 lvsr PERMRSHIFT3, 0, CO3 909 lvsr PERMRSHIFT4, 0, CO4 910 911 vperm c00, VZERO, c01, PERMRSHIFT1 912 vperm c01, c01, c02, PERMRSHIFT1 913 vperm c02, c02, VZERO, PERMRSHIFT1 914 915 vmaddfp c00, alpha, c00, C1 916 vmaddfp c01, alpha, c01, C2 917 vmaddfp c02, alpha, c02, C3 918 919 stvx c00, OFFSET_0, CO1 920 stvx c01, OFFSET_1, CO1 921 stvx c02, OFFSET_2, CO1 922 923 lvx C1, OFFSET_0, CO2 924 lvx C2, OFFSET_1, CO2 925 lvx C3, OFFSET_2, CO2 926 927 vperm c00, VZERO, c05, PERMRSHIFT2 928 vperm c05, c05, c06, PERMRSHIFT2 929 vperm c06, c06, VZERO, PERMRSHIFT2 930 931 vmaddfp c00, alpha, c00, C1 932 vmaddfp c05, alpha, c05, C2 933 vmaddfp c06, alpha, c06, C3 934 935 stvx c00, OFFSET_0, CO2 936 stvx c05, OFFSET_1, CO2 937 stvx c06, OFFSET_2, CO2 938 939 lvx C1, OFFSET_0, CO3 940 lvx C2, OFFSET_1, CO3 941 lvx C3, OFFSET_2, CO3 942 943 vperm c00, VZERO, c09, PERMRSHIFT3 944 vperm c09, c09, c10, PERMRSHIFT3 945 vperm c10, c10, VZERO, PERMRSHIFT3 946 947 vmaddfp c00, alpha, c00, C1 948 vmaddfp c09, alpha, c09, C2 949 vmaddfp c10, alpha, c10, C3 950 951 stvx c00, OFFSET_0, CO3 952 stvx c09, OFFSET_1, CO3 953 stvx c10, OFFSET_2, CO3 954 955 lvx C1, OFFSET_0, CO4 956 lvx C2, OFFSET_1, CO4 957 lvx C3, OFFSET_2, CO4 958 959 vperm c00, VZERO, c13, PERMRSHIFT4 960 vperm c13, c13, c14, PERMRSHIFT4 961 vperm c14, c14, 
VZERO, PERMRSHIFT4 962 963 vmaddfp c00, alpha, c00, C1 964 vmaddfp c13, alpha, c13, C2 965 vmaddfp c14, alpha, c14, C3 966 967 stvx c00, OFFSET_0, CO4 968 stvx c13, OFFSET_1, CO4 969 stvx c14, OFFSET_2, CO4 970 971 addi CO1, CO1, 8 * SIZE 972 addi CO2, CO2, 8 * SIZE 973 addi CO3, CO3, 8 * SIZE 974 addi CO4, CO4, 8 * SIZE 975 .align 4 976 977LL(30): 978 andi. I, M, 4 979 ble LL(40) 980 981 vxor c01, c01, c01 982 LOAD_A a1, OFFSET_0, AO 983 vxor c02, c02, c02 984 LOAD_A a2, OFFSET_1, AO 985 vxor c05, c05, c05 986 LOAD_B b1, OFFSET_0, B 987 vxor c06, c06, c06 988 LOAD_B b2, OFFSET_1, B 989 vxor c09, c09, c09 990 vxor c10, c10, c10 991 vxor c13, c13, c13 992 vxor c14, c14, c14 993 994 vspltw bp1, b1, 0 995 mr BO, B 996 997 srawi. r0, K, 1 998 mtspr CTR, r0 999 ble LL(35) 1000 .align 4 1001 1002LL(32): 1003 vmaddfp c01, a1, bp1, c01 1004 addi AO, AO, 8 * SIZE 1005 vspltw bp2, b1, 1 1006 vmaddfp c05, a1, bp2, c05 1007 addi BO, BO, 8 * SIZE 1008 vspltw bp1, b1, 2 1009 vmaddfp c09, a1, bp1, c09 1010 vspltw bp2, b1, 3 1011 vmaddfp c13, a1, bp2, c13 1012 LOAD_A a1, OFFSET_0, AO 1013 vspltw bp1, b2, 0 1014 LOAD_B b1, OFFSET_0, BO 1015 1016 vmaddfp c02, a2, bp1, c02 1017 vspltw bp2, b2, 1 1018 vmaddfp c06, a2, bp2, c06 1019 vspltw bp1, b2, 2 1020 vmaddfp c10, a2, bp1, c10 1021 vspltw bp2, b2, 3 1022 LOAD_B b2, OFFSET_1, BO 1023 vmaddfp c14, a2, bp2, c14 1024 LOAD_A a2, OFFSET_1, AO 1025 1026 vspltw bp1, b1, 0 1027 bdnz LL(32) 1028 .align 4 1029 1030LL(35): 1031 andi. 
r0, K, 1 1032 lvx alpha, OFFSET_0, SP 1033 vxor VZERO, VZERO, VZERO 1034 ble+ LL(38) 1035 .align 4 1036 1037LL(36): 1038 vmaddfp c01, a1, bp1, c01 1039 vspltw bp2, b1, 1 1040 vmaddfp c05, a1, bp2, c05 1041 vspltw bp1, b1, 2 1042 vmaddfp c09, a1, bp1, c09 1043 vspltw bp2, b1, 3 1044 vmaddfp c13, a1, bp2, c13 1045 addi AO, AO, 4 * SIZE 1046 addi BO, BO, 4 * SIZE 1047 .align 4 1048 1049LL(38): 1050 vaddfp c01, c01, c02 1051 vaddfp c05, c05, c06 1052 vaddfp c09, c09, c10 1053 vaddfp c13, c13, c14 1054 1055 lvx C1, OFFSET_0, CO1 1056 lvx C2, OFFSET_1, CO1 1057 1058 lvsr PERMRSHIFT1, 0, CO1 1059 lvsr PERMRSHIFT2, 0, CO2 1060 lvsr PERMRSHIFT3, 0, CO3 1061 lvsr PERMRSHIFT4, 0, CO4 1062 1063 vperm c00, VZERO, c01, PERMRSHIFT1 1064 vperm c01, c01, VZERO, PERMRSHIFT1 1065 1066 vmaddfp c00, alpha, c00, C1 1067 vmaddfp c01, alpha, c01, C2 1068 1069 stvx c00, OFFSET_0, CO1 1070 stvx c01, OFFSET_1, CO1 1071 1072 lvx C1, OFFSET_0, CO2 1073 lvx C2, OFFSET_1, CO2 1074 1075 vperm c00, VZERO, c05, PERMRSHIFT2 1076 vperm c05, c05, VZERO, PERMRSHIFT2 1077 1078 vmaddfp c00, alpha, c00, C1 1079 vmaddfp c05, alpha, c05, C2 1080 1081 stvx c00, OFFSET_0, CO2 1082 stvx c05, OFFSET_1, CO2 1083 1084 lvx C1, OFFSET_0, CO3 1085 lvx C2, OFFSET_1, CO3 1086 1087 vperm c00, VZERO, c09, PERMRSHIFT3 1088 vperm c09, c09, VZERO, PERMRSHIFT3 1089 1090 vmaddfp c00, alpha, c00, C1 1091 vmaddfp c09, alpha, c09, C2 1092 1093 stvx c00, OFFSET_0, CO3 1094 stvx c09, OFFSET_1, CO3 1095 1096 lvx C1, OFFSET_0, CO4 1097 lvx C2, OFFSET_1, CO4 1098 1099 vperm c00, VZERO, c13, PERMRSHIFT4 1100 vperm c13, c13, VZERO, PERMRSHIFT4 1101 1102 vmaddfp c00, alpha, c00, C1 1103 vmaddfp c13, alpha, c13, C2 1104 1105 stvx c00, OFFSET_0, CO4 1106 stvx c13, OFFSET_1, CO4 1107 1108 addi CO1, CO1, 4 * SIZE 1109 addi CO2, CO2, 4 * SIZE 1110 addi CO3, CO3, 4 * SIZE 1111 addi CO4, CO4, 4 * SIZE 1112 .align 4 1113 1114LL(40): 1115 andi. 
I, M, 2 1116 ble LL(50) 1117 1118 mr BO, B 1119 1120 LFD f8, 0 * SIZE(AO) 1121 LFD f9, 1 * SIZE(AO) 1122 1123 LFD f10, 0 * SIZE(B) 1124 LFD f11, 1 * SIZE(B) 1125 LFD f12, 2 * SIZE(B) 1126 LFD f13, 3 * SIZE(B) 1127 1128 lfs f0, FZERO(SP) 1129 fmr f1, f0 1130 fmr f2, f0 1131 fmr f3, f0 1132 1133 fmr f4, f0 1134 fmr f5, f0 1135 fmr f6, f0 1136 fmr f7, f0 1137 1138 srawi. r0, K, 1 1139 mtspr CTR, r0 1140 ble LL(45) 1141 .align 4 1142 1143LL(42): 1144 FMADD f0, f8, f10, f0 1145 FMADD f2, f8, f11, f2 1146 FMADD f4, f8, f12, f4 1147 FMADD f6, f8, f13, f6 1148 1149 FMADD f1, f9, f10, f1 1150 FMADD f3, f9, f11, f3 1151 FMADD f5, f9, f12, f5 1152 FMADD f7, f9, f13, f7 1153 1154 LFD f8, 2 * SIZE(AO) 1155 LFD f9, 3 * SIZE(AO) 1156 1157 LFD f10, 4 * SIZE(BO) 1158 LFD f11, 5 * SIZE(BO) 1159 LFD f12, 6 * SIZE(BO) 1160 LFD f13, 7 * SIZE(BO) 1161 1162 FMADD f0, f8, f10, f0 1163 FMADD f2, f8, f11, f2 1164 FMADD f4, f8, f12, f4 1165 FMADD f6, f8, f13, f6 1166 1167 FMADD f1, f9, f10, f1 1168 FMADD f3, f9, f11, f3 1169 FMADD f5, f9, f12, f5 1170 FMADD f7, f9, f13, f7 1171 1172 LFD f8, 4 * SIZE(AO) 1173 LFD f9, 5 * SIZE(AO) 1174 1175 LFD f10, 8 * SIZE(BO) 1176 LFD f11, 9 * SIZE(BO) 1177 LFD f12, 10 * SIZE(BO) 1178 LFD f13, 11 * SIZE(BO) 1179 1180 addi AO, AO, 4 * SIZE 1181 addi BO, BO, 8 * SIZE 1182 bdnz LL(42) 1183 .align 4 1184 1185LL(45): 1186 andi. 
r0, K, 1 1187 ble LL(48) 1188 .align 4 1189 1190LL(46): 1191 FMADD f0, f8, f10, f0 1192 FMADD f2, f8, f11, f2 1193 FMADD f4, f8, f12, f4 1194 FMADD f6, f8, f13, f6 1195 1196 FMADD f1, f9, f10, f1 1197 FMADD f3, f9, f11, f3 1198 FMADD f5, f9, f12, f5 1199 FMADD f7, f9, f13, f7 1200 1201 LFD f8, 2 * SIZE(AO) 1202 LFD f9, 3 * SIZE(AO) 1203 1204 LFD f10, 4 * SIZE(BO) 1205 LFD f11, 5 * SIZE(BO) 1206 LFD f12, 6 * SIZE(BO) 1207 LFD f13, 7 * SIZE(BO) 1208 1209 addi AO, AO, 2 * SIZE 1210 addi BO, BO, 4 * SIZE 1211 .align 4 1212 1213LL(48): 1214 lfs f13, ALPHA(SP) 1215 1216 LFD f8, 0 * SIZE(CO1) 1217 LFD f9, 1 * SIZE(CO1) 1218 LFD f10, 0 * SIZE(CO2) 1219 LFD f11, 1 * SIZE(CO2) 1220 1221 FMADD f0, f0, f13, f8 1222 FMADD f1, f1, f13, f9 1223 FMADD f2, f2, f13, f10 1224 FMADD f3, f3, f13, f11 1225 1226 LFD f8, 0 * SIZE(CO3) 1227 LFD f9, 1 * SIZE(CO3) 1228 LFD f10, 0 * SIZE(CO4) 1229 LFD f11, 1 * SIZE(CO4) 1230 1231 FMADD f4, f4, f13, f8 1232 FMADD f5, f5, f13, f9 1233 FMADD f6, f6, f13, f10 1234 FMADD f7, f7, f13, f11 1235 1236 STFD f0, 0 * SIZE(CO1) 1237 STFD f1, 1 * SIZE(CO1) 1238 STFD f2, 0 * SIZE(CO2) 1239 STFD f3, 1 * SIZE(CO2) 1240 1241 STFD f4, 0 * SIZE(CO3) 1242 STFD f5, 1 * SIZE(CO3) 1243 STFD f6, 0 * SIZE(CO4) 1244 STFD f7, 1 * SIZE(CO4) 1245 1246 addi CO1, CO1, 2 * SIZE 1247 addi CO2, CO2, 2 * SIZE 1248 addi CO3, CO3, 2 * SIZE 1249 addi CO4, CO4, 2 * SIZE 1250 .align 4 1251 1252LL(50): 1253 andi. I, M, 1 1254 ble LL(59) 1255 1256 mr BO, B 1257 1258 LFD f8, 0 * SIZE(AO) 1259 LFD f9, 1 * SIZE(AO) 1260 1261 LFD f10, 0 * SIZE(B) 1262 LFD f11, 1 * SIZE(B) 1263 LFD f12, 2 * SIZE(B) 1264 LFD f13, 3 * SIZE(B) 1265 1266 lfs f0, FZERO(SP) 1267 fmr f1, f0 1268 fmr f2, f0 1269 fmr f3, f0 1270 1271 srawi. 
r0, K, 1 1272 mtspr CTR, r0 1273 ble LL(55) 1274 .align 4 1275 1276LL(52): 1277 FMADD f0, f8, f10, f0 1278 FMADD f1, f8, f11, f1 1279 FMADD f2, f8, f12, f2 1280 FMADD f3, f8, f13, f3 1281 1282 LFD f8, 2 * SIZE(AO) 1283 1284 LFD f10, 4 * SIZE(BO) 1285 LFD f11, 5 * SIZE(BO) 1286 LFD f12, 6 * SIZE(BO) 1287 LFD f13, 7 * SIZE(BO) 1288 1289 FMADD f0, f9, f10, f0 1290 FMADD f1, f9, f11, f1 1291 FMADD f2, f9, f12, f2 1292 FMADD f3, f9, f13, f3 1293 1294 LFD f9, 3 * SIZE(AO) 1295 1296 LFD f10, 8 * SIZE(BO) 1297 LFD f11, 9 * SIZE(BO) 1298 LFD f12, 10 * SIZE(BO) 1299 LFD f13, 11 * SIZE(BO) 1300 1301 addi AO, AO, 2 * SIZE 1302 addi BO, BO, 8 * SIZE 1303 bdnz LL(52) 1304 .align 4 1305 1306LL(55): 1307 andi. r0, K, 1 1308 ble LL(58) 1309 .align 4 1310 1311LL(56): 1312 FMADD f0, f8, f10, f0 1313 FMADD f1, f8, f11, f1 1314 FMADD f2, f8, f12, f2 1315 FMADD f3, f8, f13, f3 1316 1317 LFD f8, 2 * SIZE(AO) 1318 1319 LFD f10, 4 * SIZE(BO) 1320 LFD f11, 5 * SIZE(BO) 1321 LFD f12, 6 * SIZE(BO) 1322 LFD f13, 7 * SIZE(BO) 1323 1324 addi AO, AO, 1 * SIZE 1325 addi BO, BO, 4 * SIZE 1326 .align 4 1327 1328LL(58): 1329 lfs f13, ALPHA(SP) 1330 1331 LFD f8, 0 * SIZE(CO1) 1332 LFD f9, 0 * SIZE(CO2) 1333 LFD f10, 0 * SIZE(CO3) 1334 LFD f11, 0 * SIZE(CO4) 1335 1336 FMADD f0, f0, f13, f8 1337 FMADD f1, f1, f13, f9 1338 FMADD f2, f2, f13, f10 1339 FMADD f3, f3, f13, f11 1340 1341 STFD f0, 0 * SIZE(CO1) 1342 STFD f1, 0 * SIZE(CO2) 1343 STFD f2, 0 * SIZE(CO3) 1344 STFD f3, 0 * SIZE(CO4) 1345 .align 4 1346 1347LL(59): 1348 mr B, BO 1349 1350 addic. J, J, -1 1351 bgt LL(01) 1352 .align 4 1353 1354LL(60): 1355 andi. r0, N, 2 1356 ble LL(120) 1357 1358 mr CO1, C 1359 add CO2, C, LDC 1360 add C, CO2, LDC 1361 1362 mr AO, A 1363 srawi. 
I, M, 4 1364 ble LL(80) 1365 .align 4 1366 1367LL(71): 1368 vxor c01, c01, c01 1369 LOAD_B b1, OFFSET_0, B 1370 vxor c02, c02, c02 1371 vxor c03, c03, c03 1372 LOAD_A a1, OFFSET_0, AO 1373 vxor c04, c04, c04 1374 LOAD_A a2, OFFSET_1, AO 1375 vxor c05, c05, c05 1376 LOAD_A a3, OFFSET_2, AO 1377 vxor c06, c06, c06 1378 LOAD_A a4, OFFSET_3, AO 1379 vxor c07, c07, c07 1380 vxor c08, c08, c08 1381 1382 mr BO, B 1383 dcbtst CO1, PREC 1384 dcbtst CO2, PREC 1385 1386 vspltw bp1, b1, 0 1387 1388 srawi. r0, K, 1 1389 mtspr CTR, r0 1390 ble LL(75) 1391 .align 4 1392 1393LL(72): 1394 LOAD_A a5, OFFSET_4, AO 1395 LOAD_A a6, OFFSET_5, AO 1396 LOAD_A a7, OFFSET_6, AO 1397 LOAD_A a8, OFFSET_7, AO 1398 1399 vmaddfp c01, a1, bp1, c01 1400 vspltw bp2, b1, 1 1401 vmaddfp c02, a2, bp1, c02 1402 vmaddfp c03, a3, bp1, c03 1403 vmaddfp c04, a4, bp1, c04 1404 1405 vmaddfp c05, a1, bp2, c05 1406 vspltw bp1, b1, 2 1407 vmaddfp c06, a2, bp2, c06 1408 vmaddfp c07, a3, bp2, c07 1409 vmaddfp c08, a4, bp2, c08 1410 1411 vmaddfp c01, a5, bp1, c01 1412 vspltw bp2, b1, 3 1413 vmaddfp c02, a6, bp1, c02 1414 vmaddfp c03, a7, bp1, c03 1415 vmaddfp c04, a8, bp1, c04 1416 1417 LOAD_B b1, OFFSET_1, BO 1418 vspltw bp1, b1, 0 1419 1420 vmaddfp c05, a5, bp2, c05 1421 vmaddfp c06, a6, bp2, c06 1422 vmaddfp c07, a7, bp2, c07 1423 vmaddfp c08, a8, bp2, c08 1424 1425 addi AO, AO, 32 * SIZE 1426 addi BO, BO, 4 * SIZE 1427 1428 LOAD_A a1, OFFSET_0, AO 1429 LOAD_A a2, OFFSET_1, AO 1430 LOAD_A a3, OFFSET_2, AO 1431 LOAD_A a4, OFFSET_3, AO 1432 bdnz LL(72) 1433 .align 4 1434 1435LL(75): 1436 andi. 
r0, K, 1 1437 lvx alpha, OFFSET_0, SP 1438 vxor VZERO, VZERO, VZERO 1439 ble+ LL(78) 1440 .align 4 1441 1442LL(76): 1443 vmaddfp c01, a1, bp1, c01 1444 vspltw bp2, b1, 1 1445 vmaddfp c02, a2, bp1, c02 1446 addi AO, AO, 16 * SIZE 1447 vmaddfp c03, a3, bp1, c03 1448 addi BO, BO, 2 * SIZE 1449 vmaddfp c04, a4, bp1, c04 1450 nop 1451 1452 vmaddfp c05, a1, bp2, c05 1453 vmaddfp c06, a2, bp2, c06 1454 vmaddfp c07, a3, bp2, c07 1455 vmaddfp c08, a4, bp2, c08 1456 .align 4 1457 1458LL(78): 1459 lvx C1, OFFSET_0, CO1 1460 lvx C2, OFFSET_1, CO1 1461 lvx C3, OFFSET_2, CO1 1462 lvx C4, OFFSET_3, CO1 1463 lvx C5, OFFSET_4, CO1 1464 1465 lvsr PERMRSHIFT1, 0, CO1 1466 lvsr PERMRSHIFT2, 0, CO2 1467 lvsr PERMRSHIFT3, 0, CO3 1468 lvsr PERMRSHIFT4, 0, CO4 1469 1470 vperm c00, VZERO, c01, PERMRSHIFT1 1471 vperm c01, c01, c02, PERMRSHIFT1 1472 vperm c02, c02, c03, PERMRSHIFT1 1473 vperm c03, c03, c04, PERMRSHIFT1 1474 vperm c04, c04, VZERO, PERMRSHIFT1 1475 1476 vmaddfp c00, alpha, c00, C1 1477 vmaddfp c01, alpha, c01, C2 1478 vmaddfp c02, alpha, c02, C3 1479 vmaddfp c03, alpha, c03, C4 1480 vmaddfp c04, alpha, c04, C5 1481 1482 stvx c00, OFFSET_0, CO1 1483 stvx c01, OFFSET_1, CO1 1484 stvx c02, OFFSET_2, CO1 1485 stvx c03, OFFSET_3, CO1 1486 stvx c04, OFFSET_4, CO1 1487 1488 lvx C1, OFFSET_0, CO2 1489 lvx C2, OFFSET_1, CO2 1490 lvx C3, OFFSET_2, CO2 1491 lvx C4, OFFSET_3, CO2 1492 lvx C5, OFFSET_4, CO2 1493 1494 vperm c00, VZERO, c05, PERMRSHIFT2 1495 vperm c05, c05, c06, PERMRSHIFT2 1496 vperm c06, c06, c07, PERMRSHIFT2 1497 vperm c07, c07, c08, PERMRSHIFT2 1498 vperm c08, c08, VZERO, PERMRSHIFT2 1499 1500 vmaddfp c00, alpha, c00, C1 1501 vmaddfp c05, alpha, c05, C2 1502 vmaddfp c06, alpha, c06, C3 1503 vmaddfp c07, alpha, c07, C4 1504 vmaddfp c08, alpha, c08, C5 1505 1506 stvx c00, OFFSET_0, CO2 1507 stvx c05, OFFSET_1, CO2 1508 stvx c06, OFFSET_2, CO2 1509 stvx c07, OFFSET_3, CO2 1510 stvx c08, OFFSET_4, CO2 1511 1512 addi CO1, CO1, 16 * SIZE 1513 addi CO2, CO2, 16 * SIZE 1514 
        addic.  I, I, -1                /* next 16-row panel of the 2-column path */
        bgt+    LL(71)
        .align 4

/* M-remainder of the 2-column path: 8 rows (two vectors per column).
   c01/c02 pair with c03/c04, c05/c06 with c07/c08; the pairs are
   summed before write-back (see LL(88)). */
LL(80):
        andi.   I, M, 8
        ble     LL(90)

        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        vxor    c03, c03, c03
        LOAD_A  a1, OFFSET_0, AO
        vxor    c04, c04, c04
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        LOAD_A  a3, OFFSET_2, AO
        vxor    c06, c06, c06
        LOAD_A  a4, OFFSET_3, AO
        vxor    c07, c07, c07
        vxor    c08, c08, c08

        mr      BO, B

        vspltw  bp1, b1, 0
        srawi.  r0, K, 1                /* unrolled 2x over K */
        mtspr   CTR, r0
        ble     LL(85)
        .align 4

/* k+0 uses a1/a2 into c01/c02 (col 1) and c05/c06 (col 2);
   k+1 uses a3/a4 into c03/c04 and c07/c08. */
LL(82):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06

        vmaddfp c03, a3, bp1, c03
        vspltw  bp2, b1, 3
        vmaddfp c04, a4, bp1, c04

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c07, a3, bp2, c07
        vmaddfp c08, a4, bp2, c08

        addi    AO, AO, 16 * SIZE       /* 2 K-steps x 8 rows */
        addi    BO, BO, 4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        bdnz    LL(82)
        .align 4

LL(85):
        andi.   r0, K, 1                /* odd-K remainder */
        lvx     alpha, OFFSET_0, SP
        vxor    VZERO, VZERO, VZERO
        ble+    LL(88)
        .align 4

LL(86):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        addi    AO, AO, 8 * SIZE
        vmaddfp c05, a1, bp2, c05
        addi    BO, BO, 2 * SIZE
        vmaddfp c06, a2, bp2, c06
        .align 4

/* Fold the two accumulator halves, then unaligned merge-store as in LL(78). */
LL(88):
        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1

        vaddfp  c01, c01, c03           /* combine k+0 and k+1 partial sums */
        vaddfp  c02, c02, c04
        vaddfp  c05, c05, c07
        vaddfp  c06, c06, c08

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2
        lvsr    PERMRSHIFT3, 0, CO3
        lvsr    PERMRSHIFT4, 0, CO4

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, c02, PERMRSHIFT1
        vperm   c02, c02, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2
        vmaddfp c02, alpha, c02, C3

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2
        lvx     C3, OFFSET_2, CO2

        vperm   c00, VZERO, c05, PERMRSHIFT2
        vperm   c05, c05, c06, PERMRSHIFT2
        vperm   c06, c06, VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C2
        vmaddfp c06, alpha, c06, C3

        stvx    c00, OFFSET_0, CO2
        stvx    c05, OFFSET_1, CO2
        stvx    c06, OFFSET_2, CO2

        addi    CO1, CO1, 8 * SIZE
        addi    CO2, CO2, 8 * SIZE
        .align 4

/* 4 rows x 2 columns: one vector per column, halves c01/c02 and c05/c06. */
LL(90):
        andi.   I, M, 4
        ble     LL(100)

        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        vxor    c05, c05, c05
        vxor    c06, c06, c06

        mr      BO, B

        vspltw  bp1, b1, 0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(95)
        .align 4

LL(92):
        vmaddfp c01, a1, bp1, c01       /* col 1, k+0 */
        vspltw  bp2, b1, 1

        vmaddfp c05, a1, bp2, c05       /* col 2, k+0 */
        vspltw  bp1, b1, 2

        vmaddfp c02, a2, bp1, c02       /* col 1, k+1 */
        vspltw  bp2, b1, 3

        LOAD_B  b1, OFFSET_1, BO
        vspltw  bp1, b1, 0

        vmaddfp c06, a2, bp2, c06       /* col 2, k+1 */

        addi    AO, AO, 8 * SIZE
        addi    BO, BO, 4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        bdnz    LL(92)
        .align 4

LL(95):
        andi.   r0, K, 1
        lvx     alpha, OFFSET_0, SP
        vxor    VZERO, VZERO, VZERO
        ble+    LL(98)
        .align 4

LL(96):
        vspltw  bp2, b1, 1
        vmaddfp c01, a1, bp1, c01
        vmaddfp c05, a1, bp2, c05
        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 2 * SIZE
        .align 4

LL(98):
        vaddfp  c01, c01, c02           /* fold the two K halves */
        vaddfp  c05, c05, c06
        vaddfp  c09, c09, c10           /* NOTE(review): c09/c10 and c13/c14 are never
                                           written in this path; these two adds appear to
                                           be dead leftovers from a wider kernel and their
                                           results are unused below. */
        vaddfp  c13, c13, c14

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1

        lvsr    PERMRSHIFT1, 0, CO1
        lvsr    PERMRSHIFT2, 0, CO2
        lvsr    PERMRSHIFT3, 0, CO3
        lvsr    PERMRSHIFT4, 0, CO4

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1

        lvx     C1, OFFSET_0, CO2
        lvx     C2, OFFSET_1, CO2

        vperm   c00, VZERO, c05, PERMRSHIFT2
        vperm   c05, c05, VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        vmaddfp c05, alpha, c05, C2

        stvx    c00, OFFSET_0, CO2
        stvx    c05, OFFSET_1, CO2

        addi    CO1, CO1, 4 * SIZE
        addi    CO2, CO2, 4 * SIZE
        .align 4

/* 2 rows x 2 columns: scalar FPU code (too narrow for AltiVec).
   f0..f3 and f4..f7 are the two unrolled K halves. */
LL(100):
        andi.   I, M, 2
        ble     LL(110)

        mr      BO, B

        LFD     f8, 0 * SIZE(AO)
        LFD     f9, 1 * SIZE(AO)

        LFD     f10, 0 * SIZE(B)
        LFD     f11, 1 * SIZE(B)
        LFD     f12, 2 * SIZE(B)
        LFD     f13, 3 * SIZE(B)

        lfs     f0, FZERO(SP)           /* zero the accumulators from the 0.0 stored at FZERO */
        fmr     f1, f0
        fmr     f2, f0
        fmr     f3, f0

        fmr     f4, f0
        fmr     f5, f0
        fmr     f6, f0
        fmr     f7, f0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(105)
        .align 4

LL(102):
        FMADD   f0, f8, f10, f0         /* k+0: 2 rows x 2 cols */
        FMADD   f1, f9, f10, f1
        FMADD   f2, f8, f11, f2
        FMADD   f3, f9, f11, f3

        LFD     f8, 2 * SIZE(AO)
        LFD     f9, 3 * SIZE(AO)

        FMADD   f4, f8, f12, f4         /* k+1 into the second accumulator set */
        FMADD   f5, f9, f12, f5
        FMADD   f6, f8, f13, f6
        FMADD   f7, f9, f13, f7

        LFD     f8, 4 * SIZE(AO)
        LFD     f9, 5 * SIZE(AO)

        LFD     f10, 4 * SIZE(BO)
        LFD     f11, 5 * SIZE(BO)
        LFD     f12, 6 * SIZE(BO)
        LFD     f13, 7 * SIZE(BO)

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 4 * SIZE
        bdnz    LL(102)
        .align 4

LL(105):
        andi.   r0, K, 1
        lfs     f13, ALPHA(SP)          /* scalar alpha for the FPU write-back */
        ble     LL(108)
        .align 4

LL(106):
        FMADD   f0, f8, f10, f0
        FMADD   f1, f9, f10, f1
        FMADD   f2, f8, f11, f2
        FMADD   f3, f9, f11, f3

        LFD     f8, 2 * SIZE(AO)        /* NOTE(review): results unused after the final
                                           K step — dead preloads kept for scheduling */
        LFD     f9, 3 * SIZE(AO)

        LFD     f10, 2 * SIZE(BO)
        LFD     f11, 3 * SIZE(BO)

        addi    AO, AO, 2 * SIZE
        addi    BO, BO, 2 * SIZE
        .align 4

LL(108):
        LFD     f8, 0 * SIZE(CO1)
        LFD     f9, 1 * SIZE(CO1)
        LFD     f10, 0 * SIZE(CO2)
        LFD     f11, 1 * SIZE(CO2)

        FADD    f0, f0, f4              /* fold the two K halves */
        FADD    f1, f1, f5
        FADD    f2, f2, f6
        FADD    f3, f3, f7

        FMADD   f0, f0, f13, f8         /* C += alpha * acc */
        FMADD   f1, f1, f13, f9
        FMADD   f2, f2, f13, f10
        FMADD   f3, f3, f13, f11

        STFD    f0, 0 * SIZE(CO1)
        STFD    f1, 1 * SIZE(CO1)
        STFD    f2, 0 * SIZE(CO2)
        STFD    f3, 1 * SIZE(CO2)

        addi    CO1, CO1, 2 * SIZE
        addi    CO2, CO2, 2 * SIZE
        .align 4

/* 1 row x 2 columns, scalar FPU; f0/f1 and f2/f3 are the unrolled halves. */
LL(110):
        andi.   I, M, 1
        ble     LL(119)

        mr      BO, B

        LFD     f8, 0 * SIZE(AO)
        LFD     f9, 1 * SIZE(AO)

        LFD     f10, 0 * SIZE(B)
        LFD     f11, 1 * SIZE(B)
        LFD     f12, 2 * SIZE(B)
        LFD     f13, 3 * SIZE(B)

        lfs     f0, FZERO(SP)
        fmr     f1, f0
        fmr     f2, f0
        fmr     f3, f0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(115)
        .align 4

LL(112):
        FMADD   f0, f8, f10, f0         /* k+0: col 1 / col 2 */
        FMADD   f1, f8, f11, f1
        FMADD   f2, f9, f12, f2         /* k+1: col 1 / col 2 */
        FMADD   f3, f9, f13, f3

        LFD     f8, 2 * SIZE(AO)
        LFD     f9, 3 * SIZE(AO)

        LFD     f10, 4 * SIZE(BO)
        LFD     f11, 5 * SIZE(BO)
        LFD     f12, 6 * SIZE(BO)
        LFD     f13, 7 * SIZE(BO)

        addi    AO, AO, 2 * SIZE
        addi    BO, BO, 4 * SIZE
        bdnz    LL(112)
        .align 4

LL(115):
        andi.   r0, K, 1
        lfs     f13, ALPHA(SP)
        ble     LL(118)
        .align 4

LL(116):
        FMADD   f0, f8, f10, f0
        FMADD   f1, f8, f11, f1

        LFD     f8, 1 * SIZE(AO)        /* NOTE(review): dead trailing preloads, as in LL(106) */

        LFD     f10, 2 * SIZE(BO)
        LFD     f11, 3 * SIZE(BO)

        addi    AO, AO, 1 * SIZE
        addi    BO, BO, 2 * SIZE
        .align 4

LL(118):
        LFD     f8, 0 * SIZE(CO1)
        LFD     f9, 0 * SIZE(CO2)

        FADD    f0, f0, f2
        FADD    f1, f1, f3

        FMADD   f0, f0, f13, f8
        FMADD   f1, f1, f13, f9

        STFD    f0, 0 * SIZE(CO1)
        STFD    f1, 0 * SIZE(CO2)
        .align 4

LL(119):
        mr      B, BO                   /* B now points past the 2 columns just consumed */
        .align 4

/* ---- N-remainder: last single column of C ---------------------------- */
LL(120):
        andi.   r0, N, 1
        ble     LL(999)

        mr      CO1, C
        mr      AO, A
        srawi.  I, M, 4                 /* 16-row panels of the 1-column path */
        ble     LL(140)
        .align 4

/* 16 rows x 1 column; c01..c04 accumulate the four vectors of CO1. */
LL(130):
        vxor    c01, c01, c01
        vxor    c02, c02, c02
        vxor    c03, c03, c03
        vxor    c04, c04, c04

        mr      BO, B

        dcbtst  CO1, PREC

        mr      J, K                    /* J counts remaining K steps by hand here */

        andi.   r0, B, 15               /* B not 16-byte aligned? peel up to two K steps. */
        ble+    LL(131)

        /* Peeling: lvx truncates the address to 16 bytes, so with B
           8-byte aligned the first two b values sit in lanes 2 and 3
           of the loaded vector — presumably why lanes 2/3 are splatted
           here; TODO confirm against the packing code. */
        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        LOAD_B  b1, OFFSET_0, BO
        vspltw  bp1, b1, 2
        vspltw  bp2, b1, 3

        addi    AO, AO, 16 * SIZE
        addi    BO, BO, SIZE

        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04
        subi    J, J, 1
        cmpwi   cr0, J, 0
        ble     LL(138)                 /* K exhausted during peeling */

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO

        addi    AO, AO, 16 * SIZE
        addi    BO, BO, SIZE

        vmaddfp c01, a1, bp2, c01
        vmaddfp c02, a2, bp2, c02
        vmaddfp c03, a3, bp2, c03
        vmaddfp c04, a4, bp2, c04
        subi    J, J, 1
        cmpwi   cr0, J, 0
        ble     LL(138)
        .align 4


/* Aligned main path: 4 K steps per iteration (all four lanes of b1). */
LL(131):
        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        LOAD_A  a5, OFFSET_4, AO
        LOAD_A  a6, OFFSET_5, AO
        LOAD_A  a7, OFFSET_6, AO
        LOAD_A  a8, OFFSET_7, AO

        LOAD_B  b1, OFFSET_0, BO

        srawi.  r0, J, 2                /* J / 4 full iterations */
        mtspr   CTR, r0
        ble     LL(135)
        .align 4

LL(133):
        vspltw  bp1, b1, 0              /* K step 0 */
        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        vspltw  bp2, b1, 1              /* K step 1 */
        vmaddfp c01, a5, bp2, c01
        vmaddfp c02, a6, bp2, c02
        vmaddfp c03, a7, bp2, c03
        vmaddfp c04, a8, bp2, c04

        addi    AO, AO, 32 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO

        vspltw  bp1, b1, 2              /* K step 2 */
        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        LOAD_A  a5, OFFSET_4, AO
        LOAD_A  a6, OFFSET_5, AO
        LOAD_A  a7, OFFSET_6, AO
        LOAD_A  a8, OFFSET_7, AO

        vspltw  bp2, b1, 3              /* K step 3 */
        vmaddfp c01, a5, bp2, c01
        vmaddfp c02, a6, bp2, c02
        vmaddfp c03, a7, bp2, c03
        vmaddfp c04, a8, bp2, c04

        addi    AO, AO, 32 * SIZE
        addi    BO, BO, 4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO

        LOAD_A  a5, OFFSET_4, AO
        LOAD_A  a6, OFFSET_5, AO
        LOAD_A  a7, OFFSET_6, AO
        LOAD_A  a8, OFFSET_7, AO

        LOAD_B  b1, OFFSET_0, BO

        bdnz    LL(133)
        .align 4

/* K-remainder (J mod 4): one branch per residue count 3/2/1. */
LL(135):
        andi.   r0, J, 3
        ble+    LL(138)

        cmpwi   cr0, r0, 3
        bne     LL(136)

        vspltw  bp1, b1, 0              /* residue 3: lanes 0, 1, 2 */
        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        addi    AO, AO, 16 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO

        vspltw  bp2, b1, 1
        vmaddfp c01, a1, bp2, c01
        vmaddfp c02, a2, bp2, c02
        vmaddfp c03, a3, bp2, c03
        vmaddfp c04, a4, bp2, c04

        addi    AO, AO, 16 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO

        vspltw  bp1, b1, 2
        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        addi    AO, AO, 16 * SIZE
        addi    BO, BO, 3 * SIZE
        b       LL(138)
        .align 4

LL(136):
        cmpwi   cr0, r0, 2
        bne     LL(137)

        vspltw  bp1, b1, 0              /* residue 2: lanes 0, 1 */
        vspltw  bp2, b1, 1

        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        LOAD_A  a1, OFFSET_4, AO
        LOAD_A  a2, OFFSET_5, AO
        LOAD_A  a3, OFFSET_6, AO
        LOAD_A  a4, OFFSET_7, AO

        vmaddfp c01, a1, bp2, c01
        vmaddfp c02, a2, bp2, c02
        vmaddfp c03, a3, bp2, c03
        vmaddfp c04, a4, bp2, c04

        addi    AO, AO, 32 * SIZE
        addi    BO, BO, 2 * SIZE
        b       LL(138)
        .align 4

LL(137):
        cmpwi   cr0, r0, 1
        bne     LL(138)

        vspltw  bp1, b1, 0              /* residue 1: lane 0 only */

        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        vmaddfp c03, a3, bp1, c03
        vmaddfp c04, a4, bp1, c04

        addi    AO, AO, 16 * SIZE
        addi    BO, BO, 1 * SIZE
        .align 4

/* Unaligned merge write-back of the single column (same scheme as LL(78)). */
LL(138):
        lvx     alpha, OFFSET_0, SP
        vxor    VZERO, VZERO, VZERO

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1
        lvx     C3, OFFSET_2, CO1
        lvx     C4, OFFSET_3, CO1
        lvx     C5, OFFSET_4, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, c02, PERMRSHIFT1
        vperm   c02, c02, c03, PERMRSHIFT1
        vperm   c03, c03, c04, PERMRSHIFT1
        vperm   c04, c04, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1     /* C += alpha * acc */
        vmaddfp c01, alpha, c01, C2
        vmaddfp c02, alpha, c02, C3
        vmaddfp c03, alpha, c03, C4
        vmaddfp c04, alpha, c04, C5

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1
        stvx    c03, OFFSET_3, CO1
        stvx    c04, OFFSET_4, CO1

        addi    CO1, CO1, 16 * SIZE
        addic.  I, I, -1
        bgt+    LL(130)
        .align 4

/* 8 rows x 1 column; c01/c02 accumulate, same B-alignment peeling. */
LL(140):
        andi.   I, M, 8
        ble     LL(150)

        vxor    c01, c01, c01
        vxor    c02, c02, c02

        mr      BO, B

        mr      J, K

        andi.   r0, B, 15               /* unaligned-B peel, as at LL(130) */
        ble+    LL(141)

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_B  b1, OFFSET_0, BO
        vspltw  bp1, b1, 2
        vspltw  bp2, b1, 3

        addi    AO, AO, 8 * SIZE
        addi    BO, BO, SIZE

        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02
        subi    J, J, 1
        cmpwi   cr0, J, 0
        ble     LL(148)

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO

        addi    AO, AO, 8 * SIZE
        addi    BO, BO, SIZE

        vmaddfp c01, a1, bp2, c01
        vmaddfp c02, a2, bp2, c02
        subi    J, J, 1
        cmpwi   cr0, J, 0
        ble     LL(148)
        .align 4


LL(141):
        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        LOAD_A  a5, OFFSET_4, AO
        LOAD_A  a6, OFFSET_5, AO
        LOAD_A  a7, OFFSET_6, AO
        LOAD_A  a8, OFFSET_7, AO

        LOAD_B  b1, OFFSET_0, BO

        srawi.  r0, J, 2                /* 4 K steps per iteration */
        mtspr   CTR, r0
        ble     LL(145)
        .align 4

LL(143):
        vspltw  bp1, b1, 0
        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02

        vspltw  bp2, b1, 1
        vmaddfp c01, a3, bp2, c01
        vmaddfp c02, a4, bp2, c02

        vspltw  bp1, b1, 2
        vmaddfp c01, a5, bp1, c01
        vmaddfp c02, a6, bp1, c02

        vspltw  bp2, b1, 3
        vmaddfp c01, a7, bp2, c01
        vmaddfp c02, a8, bp2, c02

        addi    AO, AO, 32 * SIZE
        addi    BO, BO, 4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO

        LOAD_A  a5, OFFSET_4, AO
        LOAD_A  a6, OFFSET_5, AO
        LOAD_A  a7, OFFSET_6, AO
        LOAD_A  a8, OFFSET_7, AO

        LOAD_B  b1, OFFSET_0, BO

        bdnz    LL(143)
        .align 4

LL(145):
        andi.   r0, J, 3                /* K residue 3/2/1 */
        ble+    LL(148)

        cmpwi   cr0, r0, 3
        bne     LL(146)

        vspltw  bp1, b1, 0
        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02

        vspltw  bp2, b1, 1
        vmaddfp c01, a3, bp2, c01
        vmaddfp c02, a4, bp2, c02

        LOAD_A  a1, OFFSET_4, AO
        LOAD_A  a2, OFFSET_5, AO

        vspltw  bp1, b1, 2
        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02


        addi    AO, AO, 24 * SIZE
        addi    BO, BO, 3 * SIZE
        b       LL(148)
        .align 4

LL(146):
        cmpwi   cr0, r0, 2
        bne     LL(147)

        vspltw  bp1, b1, 0
        vspltw  bp2, b1, 1

        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02

        vmaddfp c01, a3, bp2, c01
        vmaddfp c02, a4, bp2, c02

        addi    AO, AO, 16 * SIZE
        addi    BO, BO, 2 * SIZE
        b       LL(148)
        .align 4

LL(147):
        cmpwi   cr0, r0, 1
        bne     LL(148)

        vspltw  bp1, b1, 0

        vmaddfp c01, a1, bp1, c01
        vmaddfp c02, a2, bp1, c02

        addi    AO, AO, 8 * SIZE
        addi    BO, BO, 1 * SIZE
        .align 4

LL(148):
        lvx     alpha, OFFSET_0, SP
        vxor    VZERO, VZERO, VZERO

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1       /* continue LL(148): merge-store 8 rows of CO1 */
        lvx     C3, OFFSET_2, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, c02, PERMRSHIFT1
        vperm   c02, c02, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1     /* C += alpha * acc */
        vmaddfp c01, alpha, c01, C2
        vmaddfp c02, alpha, c02, C3

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        stvx    c02, OFFSET_2, CO1
        addi    CO1, CO1, 8 * SIZE
        .align 4

/* 4 rows x 1 column; single accumulator c01, same B-alignment peeling. */
LL(150):
        andi.   I, M, 4
        ble     LL(160)

        vxor    c01, c01, c01

        mr      BO, B

        mr      J, K

        andi.   r0, B, 15               /* unaligned-B peel (see LL(130)) */
        ble+    LL(151)

        LOAD_A  a1, OFFSET_0, AO
        LOAD_B  b1, OFFSET_0, BO
        vspltw  bp1, b1, 2
        vspltw  bp2, b1, 3

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, SIZE

        vmaddfp c01, a1, bp1, c01
        subi    J, J, 1
        cmpwi   cr0, J, 0
        ble     LL(158)

        LOAD_A  a1, OFFSET_0, AO
        addi    AO, AO, 4 * SIZE
        addi    BO, BO, SIZE

        vmaddfp c01, a1, bp2, c01
        subi    J, J, 1
        cmpwi   cr0, J, 0
        ble     LL(158)
        .align 4


LL(151):
        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO
        LOAD_B  b1, OFFSET_0, BO

        srawi.  r0, J, 2                /* 4 K steps per iteration */
        mtspr   CTR, r0
        ble     LL(155)
        .align 4

LL(153):
        vspltw  bp1, b1, 0
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c01, a2, bp2, c01
        vspltw  bp1, b1, 2
        vmaddfp c01, a3, bp1, c01
        vspltw  bp2, b1, 3
        vmaddfp c01, a4, bp2, c01

        addi    AO, AO, 16 * SIZE
        addi    BO, BO, 4 * SIZE

        LOAD_A  a1, OFFSET_0, AO
        LOAD_A  a2, OFFSET_1, AO
        LOAD_A  a3, OFFSET_2, AO
        LOAD_A  a4, OFFSET_3, AO

        LOAD_B  b1, OFFSET_0, BO

        bdnz    LL(153)
        .align 4

LL(155):
        andi.   r0, J, 3                /* K residue 3/2/1 */
        ble+    LL(158)

        cmpwi   cr0, r0, 3
        bne     LL(156)

        vspltw  bp1, b1, 0
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c01, a2, bp2, c01
        vspltw  bp1, b1, 2
        vmaddfp c01, a3, bp1, c01

        addi    AO, AO, 12 * SIZE
        addi    BO, BO, 3 * SIZE
        b       LL(158)
        .align 4

LL(156):
        cmpwi   cr0, r0, 2
        bne     LL(157)

        vspltw  bp1, b1, 0
        vspltw  bp2, b1, 1

        vmaddfp c01, a1, bp1, c01
        vmaddfp c01, a2, bp2, c01

        addi    AO, AO, 8 * SIZE
        addi    BO, BO, 2 * SIZE
        b       LL(158)
        .align 4

LL(157):
        cmpwi   cr0, r0, 1
        bne     LL(158)

        vspltw  bp1, b1, 0

        vmaddfp c01, a1, bp1, c01

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 1 * SIZE
        .align 4

LL(158):
        lvx     alpha, OFFSET_0, SP
        vxor    VZERO, VZERO, VZERO

        lvx     C1, OFFSET_0, CO1
        lvx     C2, OFFSET_1, CO1

        lvsr    PERMRSHIFT1, 0, CO1

        vperm   c00, VZERO, c01, PERMRSHIFT1
        vperm   c01, c01, VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        vmaddfp c01, alpha, c01, C2

        stvx    c00, OFFSET_0, CO1
        stvx    c01, OFFSET_1, CO1
        addi    CO1, CO1, 4 * SIZE
        .align 4

/* 2 rows x 1 column: scalar FPU; f0/f1 and f2/f3 are the unrolled halves. */
LL(160):
        andi.   I, M, 2
        ble     LL(170)

        mr      BO, B

        LFD     f8, 0 * SIZE(AO)
        LFD     f9, 1 * SIZE(AO)
        LFD     f10, 2 * SIZE(AO)
        LFD     f11, 3 * SIZE(AO)

        LFD     f12, 0 * SIZE(B)
        LFD     f13, 1 * SIZE(B)

        lfs     f0, FZERO(SP)           /* zero accumulators from the 0.0 at FZERO */
        fmr     f1, f0
        fmr     f2, f0
        fmr     f3, f0

        srawi.  r0, K, 1                /* unrolled 2x over K */
        mtspr   CTR, r0
        ble     LL(165)
        .align 4

LL(162):
        FMADD   f0, f8, f12, f0         /* k+0 */
        FMADD   f1, f9, f12, f1
        FMADD   f2, f10, f13, f2        /* k+1 */
        FMADD   f3, f11, f13, f3

        LFD     f8, 4 * SIZE(AO)
        LFD     f9, 5 * SIZE(AO)
        LFD     f10, 6 * SIZE(AO)
        LFD     f11, 7 * SIZE(AO)

        LFD     f12, 2 * SIZE(BO)
        LFD     f13, 3 * SIZE(BO)

        addi    AO, AO, 4 * SIZE
        addi    BO, BO, 2 * SIZE
        bdnz    LL(162)
        .align 4

LL(165):
        andi.   r0, K, 1
        lfs     f13, ALPHA(SP)          /* scalar alpha for write-back */
        ble     LL(168)
        .align 4

LL(166):
        FMADD   f0, f8, f12, f0
        FMADD   f1, f9, f12, f1

        addi    AO, AO, 2 * SIZE
        addi    BO, BO, 1 * SIZE
        .align 4

LL(168):
        LFD     f8, 0 * SIZE(CO1)
        LFD     f9, 1 * SIZE(CO1)

        FADD    f0, f0, f2              /* fold the two K halves */
        FADD    f1, f1, f3

        FMADD   f0, f0, f13, f8         /* C += alpha * acc */
        FMADD   f1, f1, f13, f9

        STFD    f0, 0 * SIZE(CO1)
        STFD    f1, 1 * SIZE(CO1)

        addi    CO1, CO1, 2 * SIZE
        .align 4

/* 1 row x 1 column: scalar dot product, f0/f1 are the unrolled halves. */
LL(170):
        andi.   I, M, 1
        ble     LL(999)

        mr      BO, B

        LFD     f8, 0 * SIZE(AO)
        LFD     f9, 1 * SIZE(AO)

        LFD     f10, 0 * SIZE(B)
        LFD     f11, 1 * SIZE(B)

        lfs     f0, FZERO(SP)
        fmr     f1, f0

        srawi.  r0, K, 1
        mtspr   CTR, r0
        ble     LL(175)
        .align 4

LL(172):
        FMADD   f0, f8, f10, f0         /* k+0 */
        FMADD   f1, f9, f11, f1         /* k+1 */

        LFD     f8, 2 * SIZE(AO)
        LFD     f9, 3 * SIZE(AO)
        LFD     f10, 2 * SIZE(BO)
        LFD     f11, 3 * SIZE(BO)

        addi    AO, AO, 2 * SIZE
        addi    BO, BO, 2 * SIZE
        bdnz    LL(172)
        .align 4

LL(175):
        andi.   r0, K, 1
        lfs     f13, ALPHA(SP)
        ble     LL(178)
        .align 4

LL(176):
        FMADD   f0, f8, f10, f0

        addi    AO, AO, 1 * SIZE
        addi    BO, BO, 1 * SIZE
        .align 4

LL(178):
        LFD     f8, 0 * SIZE(CO1)

        FADD    f0, f0, f1              /* fold halves */

        FMADD   f0, f0, f13, f8         /* C += alpha * acc */

        STFD    f0, 0 * SIZE(CO1)
        .align 4

/* Epilogue: restore the callee-saved state spilled by the (unseen) prologue:
   nonvolatile vector registers v20..v31 from [STACK+0 .. STACK+176],
   VRsave, then GPRs r14..r31, then pop the frame and return. */
LL(999):
        mr      SP, STACK               /* STACK holds the original frame base (r11) */

        li      r0, 0 * 16
        lvx     v20, SP, r0
        li      r0, 1 * 16
        lvx     v21, SP, r0
        li      r0, 2 * 16
        lvx     v22, SP, r0
        li      r0, 3 * 16
        lvx     v23, SP, r0
        li      r0, 4 * 16
        lvx     v24, SP, r0
        li      r0, 5 * 16
        lvx     v25, SP, r0
        li      r0, 6 * 16
        lvx     v26, SP, r0
        li      r0, 7 * 16
        lvx     v27, SP, r0
        li      r0, 8 * 16
        lvx     v28, SP, r0
        li      r0, 9 * 16
        lvx     v29, SP, r0
        li      r0, 10 * 16
        lvx     v30, SP, r0
        li      r0, 11 * 16
        lvx     v31, SP, r0

        mtspr   VRsave, VREG            /* restore caller's VRsave (saved in VREG) */

#ifdef __64BIT__
        ld      r31, 192(SP)            /* restore nonvolatile GPRs r31..r14 */
        ld      r30, 200(SP)
        ld      r29, 208(SP)
        ld      r28, 216(SP)
        ld      r27, 224(SP)
        ld      r26, 232(SP)
        ld      r25, 240(SP)
        ld      r24, 248(SP)
        ld      r23, 256(SP)
        ld      r22, 264(SP)
        ld      r21, 272(SP)
        ld      r20, 280(SP)
        ld      r19, 288(SP)
        ld      r18, 296(SP)
        ld      r17, 304(SP)
        ld      r16, 312(SP)
        ld      r15, 320(SP)
        ld      r14, 328(SP)
#else
        lwz     r31, 192(SP)
        lwz     r30, 196(SP)
        lwz     r29, 200(SP)
        lwz     r28, 204(SP)
        lwz     r27, 208(SP)
        lwz     r26, 212(SP)
        lwz     r25, 216(SP)
        lwz     r24, 220(SP)
        lwz     r23, 224(SP)
        lwz     r22, 228(SP)
        lwz     r21, 232(SP)
        lwz     r20, 236(SP)
        lwz     r19, 240(SP)
        lwz     r18, 244(SP)
        lwz     r17, 248(SP)
        lwz     r16, 252(SP)
        lwz     r15, 256(SP)
        lwz     r14, 260(SP)
#endif

        addi    SP, SP, STACKSIZE       /* pop the frame (360 / 272 bytes) */

        blr

        EPILOGUE
#endif                                  /* matches a conditional opened earlier in the file */