1/*********************************************************************/ 2/* Copyright 2005-2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. */ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define APREFETCHSIZE 24 43#define APREFETCH_CATEGORY 0 44 45#define M %i0 46#define N %i1 47#define K %i2 48 49#if defined(DOUBLE) && !defined(__64BIT__) 50#define A %i5 51#define B %i4 52#else 53#define A %i4 54#define B %i5 55#endif 56 57#define C %o4 58#define LDC %o5 59 60#define AO %l0 61#define BO %l1 62#define I %l2 63#define J %l3 64#define L %l4 65 66#define BB %o7 67 68#define C1 %o0 69#define C2 %o1 70#define C3 %o2 71#define C4 %o3 72 73#define C5 %l5 74#define C6 %l6 75#define C7 %l7 76#define C8 %i3 77 78#define OFFSET %g1 79#define KK %g2 80#define TEMP1 %g3 81#define TEMP2 %g4 82 83#ifdef DOUBLE 84#define c01 %f0 85#define c02 %f2 86#define c03 %f4 87#define c04 %f6 88#define c05 %f8 89#define c06 %f10 90#define c07 %f12 91#define c08 %f14 92#define c09 %f16 93#define c10 %f18 94#define c11 %f20 95#define c12 %f22 96#define c13 %f24 97#define c14 %f26 98#define c15 %f28 99#define c16 %f30 100 101#define a1 %f32 102#define a2 %f34 103#define a3 %f36 104#define a4 %f38 105#define a5 %f40 106 107#define b1 %f42 108#define b2 %f44 109#define b3 %f46 110#define b4 %f48 111#define b5 %f50 112#define b6 %f52 113#define b7 %f54 114#define b8 %f56 115#define b9 %f58 116 117#define ALPHA %f62 118 119#define cc01 0 120#define cc02 2 121#define cc03 4 122#define cc04 6 123#define cc05 8 124#define cc06 10 125#define cc07 12 126#define cc08 14 127#define cc09 16 128#define cc10 18 129#define cc11 20 130#define cc12 22 131#define cc13 24 132#define cc14 26 133#define cc15 28 134#define cc16 30 135 136#define aa1 1 137#define aa2 3 138#define aa3 5 139#define aa4 7 140#define aa5 9 141 142#define bb1 11 143#define bb2 13 144#define bb3 15 145#define bb4 17 146#define bb5 19 147#define bb6 21 148#define bb7 23 149#define bb8 25 150#define bb9 27 151 152#define alpha 31 153#else 154#define c01 %f0 155#define c02 %f1 156#define c03 %f2 157#define c04 %f3 158#define c05 %f4 159#define c06 %f5 160#define c07 %f6 161#define c08 %f7 162#define c09 %f8 163#define c10 %f9 164#define c11 %f10 165#define c12 %f11 166#define c13 %f12 167#define c14 %f13 168#define c15 %f14 169#define c16 %f15 170 171#define a1 %f16 172#define a2 %f17 173#define a3 %f18 174#define a4 %f19 175#define a5 %f20 176 177#define b1 %f21 178#define b2 %f22 179#define b3 %f23 180#define b4 %f24 181#define b5 %f25 182#define b6 %f26 183#define b7 %f27 184#define b8 %f28 185#define b9 %f29 186 187#define ALPHA %f31 188 189#define cc01 0 190#define cc02 1 191#define cc03 2 192#define cc04 3 193#define cc05 4 194#define cc06 5 195#define cc07 6 196#define cc08 7 197#define cc09 8 198#define cc10 9 199#define cc11 10 200#define cc12 11 201#define cc13 12 202#define cc14 13 203#define cc15 14 204#define cc16 15 205 206#define aa1 16 207#define aa2 17 208#define aa3 18 209#define aa4 19 210#define aa5 20 211 212#define bb1 21 213#define bb2 22 214#define bb3 23 215#define bb4 24 216#define bb5 25 217#define bb6 26 218#define bb7 27 219#define bb8 28 220#define bb9 29 221 222#define alpha 31 223 224#endif 225 226 .register %g2, #scratch 227 .register %g3, #scratch 228 229 PROLOGUE 230 SAVESP 231 nop 232 233#ifndef __64BIT__ 234 235#ifdef DOUBLE 236 st %i3, [%sp + STACK_START + 16] 237 st %i4, [%sp + STACK_START + 20] 238 239 ld [%sp + STACK_START + 28], B 240 ld [%sp + STACK_START + 32], C 241 ld [%sp + STACK_START + 36], LDC 242#ifdef TRMMKERNEL 243 ld [%sp + STACK_START + 40], OFFSET 244#endif 245#else 246 st %i3, [%sp + STACK_START + 16] 247 248 ld [%sp + STACK_START + 28], C 249 ld [%sp + STACK_START + 32], LDC 250#ifdef TRMMKERNEL 251 ld [%sp + STACK_START + 36], OFFSET 252#endif 253#endif 254 LDF [%sp + STACK_START + 16], ALPHA 255#ifdef TRMMKERNEL 256 st %g1, [%sp + STACK_START + 8] 257 st %g2, [%sp + STACK_START + 12] 258 st %g3, [%sp + STACK_START + 16] 259 st %g4, [%sp + STACK_START + 20] 260#endif 261#else 262 263 ldx [%sp+ STACK_START + 56], C 264 ldx [%sp+ STACK_START + 64], LDC 265#ifdef TRMMKERNEL 266 ldx [%sp+ STACK_START + 72], OFFSET 267#endif 268 269#ifdef DOUBLE 270 FMOV %f6, ALPHA 271#else 272 FMOV %f7, ALPHA 273#endif 274 275#ifdef TRMMKERNEL 276 stx %g1, [%sp + STACK_START + 32] 277 stx %g2, [%sp + STACK_START + 40] 278 stx %g3, [%sp + STACK_START + 48] 279 stx %g4, [%sp + STACK_START + 56] 280#endif 281 282#endif 283 284#if defined(TRMMKERNEL) && !defined(LEFT) 285 neg OFFSET, KK 286#endif 287 288 sra N, 3, J 289 cmp J, 0 290 ble,pn %icc, .LL30 291 sll LDC, BASE_SHIFT, LDC 292 293.LL11: 294 mov C, C1 295 add C, LDC, C2 296 add C2, LDC, C3 297 add C3, LDC, C4 298 add C4, LDC, C5 299 add C5, LDC, C6 300 add C6, LDC, C7 301 add C7, LDC, C8 302 add C8, LDC, C 303 304 sll K, BASE_SHIFT + 3, BB 305 306#if defined(TRMMKERNEL) && defined(LEFT) 307 mov OFFSET, KK 308#endif 309 310 mov A, AO 311 312 sra M, 1, I 313 cmp I, 0 314 ble,pn %icc, .LL20 315 add B, BB, BB 316 .align 4 317 318.LL12: 319 prefetch [BB + 0 * SIZE], 1 320 321#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 322 mov B, BO 323#else 324 sll KK, BASE_SHIFT + 1, TEMP1 325 sll KK, BASE_SHIFT + 3, TEMP2 326 327 add AO, TEMP1, AO 328 add B, TEMP2, BO 329#endif 330 331 LDF [AO + 0 * SIZE], a1 332 LDF [AO + 1 * SIZE], a2 333 LDF [AO + 8 * SIZE], a5 334 335 LDF [BO + 0 * SIZE], b1 336 337 LDF [BO + 1 * SIZE], b2 338 FCLR (cc01) 339 LDF [BO + 2 * SIZE], b3 340 FCLR (cc05) 341 LDF [BO + 3 * SIZE], b4 342 FCLR (cc09) 343 LDF [BO + 4 * SIZE], b5 344 FCLR (cc13) 345 346 LDF [BO + 5 * SIZE], b6 347 FCLR (cc02) 348 LDF [BO + 6 * SIZE], b7 349 FCLR (cc06) 350 LDF [BO + 7 * SIZE], b8 351 FCLR (cc10) 352 LDF [BO + 8 * SIZE], b9 353 FCLR (cc14) 354 355 prefetch [C1 + 1 * SIZE], 3 356 FCLR (cc03) 357 prefetch [C2 + 2 * SIZE], 3 358 FCLR (cc07) 359 prefetch [C3 + 1 * SIZE], 3 360 FCLR (cc11) 361 prefetch [C4 + 2 * SIZE], 3 362 FCLR (cc15) 363 364 prefetch [C5 + 1 * SIZE], 3 365 FCLR (cc04) 366 prefetch [C6 + 2 * SIZE], 3 367 FCLR (cc08) 368 prefetch [C7 + 1 * SIZE], 3 369 FCLR (cc12) 370 prefetch [C8 + 2 * SIZE], 3 371 FCLR (cc16) 372 373#ifndef TRMMKERNEL 374 sra K, 3, L 375#else 376#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 377 sub K, KK, L 378#elif defined(LEFT) 379 add KK, 2, L 380#else 381 add KK, 8, L 382#endif 383 sra L, 3, L 384#endif 385 cmp L, 0 386 ble,pn %icc, .LL15 387 add BB, 32 * SIZE, BB 388 .align 4 389 390.LL13: 391 FMADD (aa1, bb1, cc01, cc01) 392 FMADD (aa2, bb1, cc02, cc02) 393 FMADD (aa1, bb2, cc03, cc03) 394 FMADD (aa2, bb2, cc04, cc04) 395 396 FMADD (aa1, bb3, cc05, cc05) 397 LDF [BO + 16 * SIZE], b1 398 FMADD (aa2, bb3, cc06, cc06) 399 LDF [BO + 9 * SIZE], b2 400 401 FMADD (aa1, bb4, cc07, cc07) 402 LDF [BO + 10 * SIZE], b3 403 FMADD (aa2, bb4, cc08, cc08) 404 LDF [BO + 11 * SIZE], b4 405 406 FMADD (aa1, bb5, cc09, cc09) 407 LDF [AO + 2 * SIZE], a3 408 FMADD (aa2, bb5, cc10, cc10) 409 LDF [AO + 3 * SIZE], a4 410 411 FMADD (aa1, bb6, cc11, cc11) 412 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 413 FMADD (aa2, bb6, cc12, cc12) 414 nop 415 416 FMADD (aa1, bb7, cc13, cc13) 417 LDF [BO + 12 * SIZE], b5 418 FMADD (aa2, bb7, cc14, cc14) 419 LDF [BO + 13 * SIZE], b6 420 421 FMADD (aa1, bb8, cc15, cc15) 422 LDF [BO + 14 * SIZE], b7 423 FMADD (aa2, bb8, cc16, cc16) 424 LDF [BO + 15 * SIZE], b8 425 426 FMADD (aa3, bb9, cc01, cc01) 427 FMADD (aa4, bb9, cc02, cc02) 428 FMADD (aa3, bb2, cc03, cc03) 429 FMADD (aa4, bb2, cc04, cc04) 430 431 FMADD (aa3, bb3, cc05, cc05) 432 LDF [BO + 24 * SIZE], b9 433 FMADD (aa4, bb3, cc06, cc06) 434 LDF [BO + 17 * SIZE], b2 435 436 FMADD (aa3, bb4, cc07, cc07) 437 LDF [BO + 18 * SIZE], b3 438 FMADD (aa4, bb4, cc08, cc08) 439 LDF [BO + 19 * SIZE], b4 440 441 FMADD (aa3, bb5, cc09, cc09) 442 LDF [AO + 4 * SIZE], a1 443 FMADD (aa4, bb5, cc10, cc10) 444 LDF [AO + 5 * SIZE], a2 445 446 FMADD (aa3, bb6, cc11, cc11) 447 add L, -1, L 448 FMADD (aa4, bb6, cc12, cc12) 449 nop 450 451 FMADD (aa3, bb7, cc13, cc13) 452 LDF [BO + 20 * SIZE], b5 453 FMADD (aa4, bb7, cc14, cc14) 454 LDF [BO + 21 * SIZE], b6 455 456 FMADD (aa3, bb8, cc15, cc15) 457 LDF [BO + 22 * SIZE], b7 458 FMADD (aa4, bb8, cc16, cc16) 459 LDF [BO + 23 * SIZE], b8 460 461 FMADD (aa1, bb1, cc01, cc01) 462 FMADD (aa2, bb1, cc02, cc02) 463 FMADD (aa1, bb2, cc03, cc03) 464 FMADD (aa2, bb2, cc04, cc04) 465 466 FMADD (aa1, bb3, cc05, cc05) 467 LDF [BO + 32 * SIZE], b1 468 FMADD (aa2, bb3, cc06, cc06) 469 LDF [BO + 25 * SIZE], b2 470 471 FMADD (aa1, bb4, cc07, cc07) 472 LDF [BO + 26 * SIZE], b3 473 FMADD (aa2, bb4, cc08, cc08) 474 LDF [BO + 27 * SIZE], b4 475 476 FMADD (aa1, bb5, cc09, cc09) 477 LDF [AO + 6 * SIZE], a3 478 FMADD (aa2, bb5, cc10, cc10) 479 LDF [AO + 7 * SIZE], a4 480 481 FMADD (aa1, bb6, cc11, cc11) 482 nop 483 FMADD (aa2, bb6, cc12, cc12) 484 nop 485 486 FMADD (aa1, bb7, cc13, cc13) 487 LDF [BO + 28 * SIZE], b5 488 FMADD (aa2, bb7, cc14, cc14) 489 LDF [BO + 29 * SIZE], b6 490 491 FMADD (aa1, bb8, cc15, cc15) 492 LDF [BO + 30 * SIZE], b7 493 FMADD (aa2, bb8, cc16, cc16) 494 LDF [BO + 31 * SIZE], b8 495 496 FMADD (aa3, bb9, cc01, cc01) 497 FMADD (aa4, bb9, cc02, cc02) 498 FMADD (aa3, bb2, cc03, cc03) 499 FMADD (aa4, bb2, cc04, cc04) 500 501 FMADD (aa3, bb3, cc05, cc05) 502 LDF [BO + 40 * SIZE], b9 503 FMADD (aa4, bb3, cc06, cc06) 504 LDF [BO + 33 * SIZE], b2 505 506 FMADD (aa3, bb4, cc07, cc07) 507 LDF [BO + 34 * SIZE], b3 508 FMADD (aa4, bb4, cc08, cc08) 509 LDF [BO + 35 * SIZE], b4 510 511 FMADD (aa3, bb5, cc09, cc09) 512 LDF [AO + 16 * SIZE], a1 /****/ 513 FMADD (aa4, bb5, cc10, cc10) 514 LDF [AO + 9 * SIZE], a2 515 516 FMADD (aa3, bb6, cc11, cc11) 517 nop 518 FMADD (aa4, bb6, cc12, cc12) 519 nop 520 521 FMADD (aa3, bb7, cc13, cc13) 522 LDF [BO + 36 * SIZE], b5 523 FMADD (aa4, bb7, cc14, cc14) 524 LDF [BO + 37 * SIZE], b6 525 526 FMADD (aa3, bb8, cc15, cc15) 527 LDF [BO + 38 * SIZE], b7 528 FMADD (aa4, bb8, cc16, cc16) 529 LDF [BO + 39 * SIZE], b8 530 531 FMADD (aa5, bb1, cc01, cc01) 532 FMADD (aa2, bb1, cc02, cc02) 533 FMADD (aa5, bb2, cc03, cc03) 534 FMADD (aa2, bb2, cc04, cc04) 535 536 FMADD (aa5, bb3, cc05, cc05) 537 LDF [BO + 48 * SIZE], b1 538 FMADD (aa2, bb3, cc06, cc06) 539 LDF [BO + 41 * SIZE], b2 540 541 FMADD (aa5, bb4, cc07, cc07) 542 LDF [BO + 42 * SIZE], b3 543 FMADD (aa2, bb4, cc08, cc08) 544 LDF [BO + 43 * SIZE], b4 545 546 FMADD (aa5, bb5, cc09, cc09) 547 LDF [AO + 10 * SIZE], a3 548 FMADD (aa2, bb5, cc10, cc10) 549 LDF [AO + 11 * SIZE], a4 550 551 FMADD (aa5, bb6, cc11, cc11) 552 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 553 FMADD (aa2, bb6, cc12, cc12) 554 nop 555 556 FMADD (aa5, bb7, cc13, cc13) 557 LDF [BO + 44 * SIZE], b5 558 FMADD (aa2, bb7, cc14, cc14) 559 LDF [BO + 45 * SIZE], b6 560 561 FMADD (aa5, bb8, cc15, cc15) 562 LDF [BO + 46 * SIZE], b7 563 FMADD (aa2, bb8, cc16, cc16) 564 LDF [BO + 47 * SIZE], b8 565 566 FMADD (aa3, bb9, cc01, cc01) 567 FMADD (aa4, bb9, cc02, cc02) 568 FMADD (aa3, bb2, cc03, cc03) 569 FMADD (aa4, bb2, cc04, cc04) 570 571 FMADD (aa3, bb3, cc05, cc05) 572 LDF [BO + 56 * SIZE], b9 573 FMADD (aa4, bb3, cc06, cc06) 574 LDF [BO + 49 * SIZE], b2 575 576 FMADD (aa3, bb4, cc07, cc07) 577 LDF [BO + 50 * SIZE], b3 578 FMADD (aa4, bb4, cc08, cc08) 579 LDF [BO + 51 * SIZE], b4 580 581 FMADD (aa3, bb5, cc09, cc09) 582 LDF [AO + 12 * SIZE], a5 583 FMADD (aa4, bb5, cc10, cc10) 584 LDF [AO + 13 * SIZE], a2 585 586 FMADD (aa3, bb6, cc11, cc11) 587 cmp L, 0 588 FMADD (aa4, bb6, cc12, cc12) 589 nop 590 591 FMADD (aa3, bb7, cc13, cc13) 592 LDF [BO + 52 * SIZE], b5 593 FMADD (aa4, bb7, cc14, cc14) 594 LDF [BO + 53 * SIZE], b6 595 596 FMADD (aa3, bb8, cc15, cc15) 597 LDF [BO + 54 * SIZE], b7 598 FMADD (aa4, bb8, cc16, cc16) 599 LDF [BO + 55 * SIZE], b8 600 601 FMADD (aa5, bb1, cc01, cc01) 602 FMADD (aa2, bb1, cc02, cc02) 603 FMADD (aa5, bb2, cc03, cc03) 604 FMADD (aa2, bb2, cc04, cc04) 605 606 FMADD (aa5, bb3, cc05, cc05) 607 LDF [BO + 64 * SIZE], b1 608 FMADD (aa2, bb3, cc06, cc06) 609 LDF [BO + 57 * SIZE], b2 610 611 FMADD (aa5, bb4, cc07, cc07) 612 LDF [BO + 58 * SIZE], b3 613 FMADD (aa2, bb4, cc08, cc08) 614 LDF [BO + 59 * SIZE], b4 615 616 FMADD (aa5, bb5, cc09, cc09) 617 LDF [AO + 14 * SIZE], a3 618 FMADD (aa2, bb5, cc10, cc10) 619 LDF [AO + 15 * SIZE], a4 620 621 FMADD (aa5, bb6, cc11, cc11) 622 add BO, 64 * SIZE, BO 623 FMADD (aa2, bb6, cc12, cc12) 624 add AO, 16 * SIZE, AO 625 626 FMADD (aa5, bb7, cc13, cc13) 627 LDF [BO - 4 * SIZE], b5 628 FMADD (aa2, bb7, cc14, cc14) 629 LDF [BO - 3 * SIZE], b6 630 631 FMADD (aa5, bb8, cc15, cc15) 632 LDF [BO - 2 * SIZE], b7 633 FMADD (aa2, bb8, cc16, cc16) 634 LDF [BO - 1 * SIZE], b8 635 636 FMADD (aa3, bb9, cc01, cc01) 637 FMADD (aa4, bb9, cc02, cc02) 638 FMADD (aa3, bb2, cc03, cc03) 639 FMADD (aa4, bb2, cc04, cc04) 640 641 FMADD (aa3, bb3, cc05, cc05) 642 LDF [BO + 8 * SIZE], b9 643 FMADD (aa4, bb3, cc06, cc06) 644 LDF [BO + 1 * SIZE], b2 645 646 FMADD (aa3, bb4, cc07, cc07) 647 LDF [BO + 2 * SIZE], b3 648 FMADD (aa4, bb4, cc08, cc08) 649 LDF [BO + 3 * SIZE], b4 650 651 FMADD (aa3, bb5, cc09, cc09) 652 LDF [AO + 8 * SIZE], a5 /****/ 653 FMADD (aa4, bb5, cc10, cc10) 654 LDF [AO + 1 * SIZE], a2 655 656 FMADD (aa3, bb6, cc11, cc11) 657 FMADD (aa4, bb6, cc12, cc12) 658 659 FMADD (aa3, bb7, cc13, cc13) 660 LDF [BO + 4 * SIZE], b5 661 FMADD (aa4, bb7, cc14, cc14) 662 LDF [BO + 5 * SIZE], b6 663 664 FMADD (aa3, bb8, cc15, cc15) 665 LDF [BO + 6 * SIZE], b7 666 FMADD (aa4, bb8, cc16, cc16) 667 ble,pn %icc, .LL15 668 LDF [BO + 7 * SIZE], b8 669 670 FMADD (aa1, bb1, cc01, cc01) 671 FMADD (aa2, bb1, cc02, cc02) 672 FMADD (aa1, bb2, cc03, cc03) 673 FMADD (aa2, bb2, cc04, cc04) 674 675 FMADD (aa1, bb3, cc05, cc05) 676 LDF [BO + 16 * SIZE], b1 677 FMADD (aa2, bb3, cc06, cc06) 678 LDF [BO + 9 * SIZE], b2 679 680 FMADD (aa1, bb4, cc07, cc07) 681 LDF [BO + 10 * SIZE], b3 682 FMADD (aa2, bb4, cc08, cc08) 683 LDF [BO + 11 * SIZE], b4 684 685 FMADD (aa1, bb5, cc09, cc09) 686 LDF [AO + 2 * SIZE], a3 687 FMADD (aa2, bb5, cc10, cc10) 688 LDF [AO + 3 * SIZE], a4 689 690 FMADD (aa1, bb6, cc11, cc11) 691 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 692 FMADD (aa2, bb6, cc12, cc12) 693 nop 694 695 FMADD (aa1, bb7, cc13, cc13) 696 LDF [BO + 12 * SIZE], b5 697 FMADD (aa2, bb7, cc14, cc14) 698 LDF [BO + 13 * SIZE], b6 699 700 FMADD (aa1, bb8, cc15, cc15) 701 LDF [BO + 14 * SIZE], b7 702 FMADD (aa2, bb8, cc16, cc16) 703 LDF [BO + 15 * SIZE], b8 704 705 FMADD (aa3, bb9, cc01, cc01) 706 FMADD (aa4, bb9, cc02, cc02) 707 FMADD (aa3, bb2, cc03, cc03) 708 FMADD (aa4, bb2, cc04, cc04) 709 710 FMADD (aa3, bb3, cc05, cc05) 711 LDF [BO + 24 * SIZE], b9 712 FMADD (aa4, bb3, cc06, cc06) 713 LDF [BO + 17 * SIZE], b2 714 715 FMADD (aa3, bb4, cc07, cc07) 716 LDF [BO + 18 * SIZE], b3 717 FMADD (aa4, bb4, cc08, cc08) 718 LDF [BO + 19 * SIZE], b4 719 720 FMADD (aa3, bb5, cc09, cc09) 721 LDF [AO + 4 * SIZE], a1 722 FMADD (aa4, bb5, cc10, cc10) 723 LDF [AO + 5 * SIZE], a2 724 725 FMADD (aa3, bb6, cc11, cc11) 726 add L, -1, L 727 FMADD (aa4, bb6, cc12, cc12) 728 nop 729 730 FMADD (aa3, bb7, cc13, cc13) 731 LDF [BO + 20 * SIZE], b5 732 FMADD (aa4, bb7, cc14, cc14) 733 LDF [BO + 21 * SIZE], b6 734 735 FMADD (aa3, bb8, cc15, cc15) 736 LDF [BO + 22 * SIZE], b7 737 FMADD (aa4, bb8, cc16, cc16) 738 LDF [BO + 23 * SIZE], b8 739 740 FMADD (aa1, bb1, cc01, cc01) 741 FMADD (aa2, bb1, cc02, cc02) 742 FMADD (aa1, bb2, cc03, cc03) 743 FMADD (aa2, bb2, cc04, cc04) 744 745 FMADD (aa1, bb3, cc05, cc05) 746 LDF [BO + 32 * SIZE], b1 747 FMADD (aa2, bb3, cc06, cc06) 748 LDF [BO + 25 * SIZE], b2 749 750 FMADD (aa1, bb4, cc07, cc07) 751 LDF [BO + 26 * SIZE], b3 752 FMADD (aa2, bb4, cc08, cc08) 753 LDF [BO + 27 * SIZE], b4 754 755 FMADD (aa1, bb5, cc09, cc09) 756 LDF [AO + 6 * SIZE], a3 757 FMADD (aa2, bb5, cc10, cc10) 758 LDF [AO + 7 * SIZE], a4 759 760 FMADD (aa1, bb6, cc11, cc11) 761 nop 762 FMADD (aa2, bb6, cc12, cc12) 763 nop 764 765 FMADD (aa1, bb7, cc13, cc13) 766 LDF [BO + 28 * SIZE], b5 767 FMADD (aa2, bb7, cc14, cc14) 768 LDF [BO + 29 * SIZE], b6 769 770 FMADD (aa1, bb8, cc15, cc15) 771 LDF [BO + 30 * SIZE], b7 772 FMADD (aa2, bb8, cc16, cc16) 773 LDF [BO + 31 * SIZE], b8 774 775 FMADD (aa3, bb9, cc01, cc01) 776 FMADD (aa4, bb9, cc02, cc02) 777 FMADD (aa3, bb2, cc03, cc03) 778 FMADD (aa4, bb2, cc04, cc04) 779 780 FMADD (aa3, bb3, cc05, cc05) 781 LDF [BO + 40 * SIZE], b9 782 FMADD (aa4, bb3, cc06, cc06) 783 LDF [BO + 33 * SIZE], b2 784 785 FMADD (aa3, bb4, cc07, cc07) 786 LDF [BO + 34 * SIZE], b3 787 FMADD (aa4, bb4, cc08, cc08) 788 LDF [BO + 35 * SIZE], b4 789 790 FMADD (aa3, bb5, cc09, cc09) 791 LDF [AO + 16 * SIZE], a1 /****/ 792 FMADD (aa4, bb5, cc10, cc10) 793 LDF [AO + 9 * SIZE], a2 794 795 FMADD (aa3, bb6, cc11, cc11) 796 nop 797 FMADD (aa4, bb6, cc12, cc12) 798 nop 799 800 FMADD (aa3, bb7, cc13, cc13) 801 LDF [BO + 36 * SIZE], b5 802 FMADD (aa4, bb7, cc14, cc14) 803 LDF [BO + 37 * SIZE], b6 804 805 FMADD (aa3, bb8, cc15, cc15) 806 LDF [BO + 38 * SIZE], b7 807 FMADD (aa4, bb8, cc16, cc16) 808 LDF [BO + 39 * SIZE], b8 809 810 FMADD (aa5, bb1, cc01, cc01) 811 FMADD (aa2, bb1, cc02, cc02) 812 FMADD (aa5, bb2, cc03, cc03) 813 FMADD (aa2, bb2, cc04, cc04) 814 815 FMADD (aa5, bb3, cc05, cc05) 816 LDF [BO + 48 * SIZE], b1 817 FMADD (aa2, bb3, cc06, cc06) 818 LDF [BO + 41 * SIZE], b2 819 820 FMADD (aa5, bb4, cc07, cc07) 821 LDF [BO + 42 * SIZE], b3 822 FMADD (aa2, bb4, cc08, cc08) 823 LDF [BO + 43 * SIZE], b4 824 825 FMADD (aa5, bb5, cc09, cc09) 826 LDF [AO + 10 * SIZE], a3 827 FMADD (aa2, bb5, cc10, cc10) 828 LDF [AO + 11 * SIZE], a4 829 830 FMADD (aa5, bb6, cc11, cc11) 831 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 832 FMADD (aa2, bb6, cc12, cc12) 833 nop 834 835 FMADD (aa5, bb7, cc13, cc13) 836 LDF [BO + 44 * SIZE], b5 837 FMADD (aa2, bb7, cc14, cc14) 838 LDF [BO + 45 * SIZE], b6 839 840 FMADD (aa5, bb8, cc15, cc15) 841 LDF [BO + 46 * SIZE], b7 842 FMADD (aa2, bb8, cc16, cc16) 843 LDF [BO + 47 * SIZE], b8 844 845 FMADD (aa3, bb9, cc01, cc01) 846 FMADD (aa4, bb9, cc02, cc02) 847 FMADD (aa3, bb2, cc03, cc03) 848 FMADD (aa4, bb2, cc04, cc04) 849 850 FMADD (aa3, bb3, cc05, cc05) 851 LDF [BO + 56 * SIZE], b9 852 FMADD (aa4, bb3, cc06, cc06) 853 LDF [BO + 49 * SIZE], b2 854 855 FMADD (aa3, bb4, cc07, cc07) 856 LDF [BO + 50 * SIZE], b3 857 FMADD (aa4, bb4, cc08, cc08) 858 LDF [BO + 51 * SIZE], b4 859 860 FMADD (aa3, bb5, cc09, cc09) 861 LDF [AO + 12 * SIZE], a5 862 FMADD (aa4, bb5, cc10, cc10) 863 LDF [AO + 13 * SIZE], a2 864 865 FMADD (aa3, bb6, cc11, cc11) 866 cmp L, 0 867 FMADD (aa4, bb6, cc12, cc12) 868 nop 869 870 FMADD (aa3, bb7, cc13, cc13) 871 LDF [BO + 52 * SIZE], b5 872 FMADD (aa4, bb7, cc14, cc14) 873 LDF [BO + 53 * SIZE], b6 874 875 FMADD (aa3, bb8, cc15, cc15) 876 LDF [BO + 54 * SIZE], b7 877 FMADD (aa4, bb8, cc16, cc16) 878 LDF [BO + 55 * SIZE], b8 879 880 FMADD (aa5, bb1, cc01, cc01) 881 FMADD (aa2, bb1, cc02, cc02) 882 FMADD (aa5, bb2, cc03, cc03) 883 FMADD (aa2, bb2, cc04, cc04) 884 885 FMADD (aa5, bb3, cc05, cc05) 886 LDF [BO + 64 * SIZE], b1 887 FMADD (aa2, bb3, cc06, cc06) 888 LDF [BO + 57 * SIZE], b2 889 890 FMADD (aa5, bb4, cc07, cc07) 891 LDF [BO + 58 * SIZE], b3 892 FMADD (aa2, bb4, cc08, cc08) 893 LDF [BO + 59 * SIZE], b4 894 895 FMADD (aa5, bb5, cc09, cc09) 896 LDF [AO + 14 * SIZE], a3 897 FMADD (aa2, bb5, cc10, cc10) 898 LDF [AO + 15 * SIZE], a4 899 900 FMADD (aa5, bb6, cc11, cc11) 901 add BO, 64 * SIZE, BO 902 FMADD (aa2, bb6, cc12, cc12) 903 add AO, 16 * SIZE, AO 904 905 FMADD (aa5, bb7, cc13, cc13) 906 LDF [BO - 4 * SIZE], b5 907 FMADD (aa2, bb7, cc14, cc14) 908 LDF [BO - 3 * SIZE], b6 909 910 FMADD (aa5, bb8, cc15, cc15) 911 LDF [BO - 2 * SIZE], b7 912 FMADD (aa2, bb8, cc16, cc16) 913 LDF [BO - 1 * SIZE], b8 914 915 FMADD (aa3, bb9, cc01, cc01) 916 FMADD (aa4, bb9, cc02, cc02) 917 FMADD (aa3, bb2, cc03, cc03) 918 FMADD (aa4, bb2, cc04, cc04) 919 920 FMADD (aa3, bb3, cc05, cc05) 921 LDF [BO + 8 * SIZE], b9 922 FMADD (aa4, bb3, cc06, cc06) 923 LDF [BO + 1 * SIZE], b2 924 925 FMADD (aa3, bb4, cc07, cc07) 926 LDF [BO + 2 * SIZE], b3 927 FMADD (aa4, bb4, cc08, cc08) 928 LDF [BO + 3 * SIZE], b4 929 930 FMADD (aa3, bb5, cc09, cc09) 931 LDF [AO + 8 * SIZE], a5 /****/ 932 FMADD (aa4, bb5, cc10, cc10) 933 LDF [AO + 1 * SIZE], a2 934 935 FMADD (aa3, bb6, cc11, cc11) 936 FMADD (aa4, bb6, cc12, cc12) 937 938 FMADD (aa3, bb7, cc13, cc13) 939 LDF [BO + 4 * SIZE], b5 940 FMADD (aa4, bb7, cc14, cc14) 941 LDF [BO + 5 * SIZE], b6 942 943 FMADD (aa3, bb8, cc15, cc15) 944 LDF [BO + 6 * SIZE], b7 945 FMADD (aa4, bb8, cc16, cc16) 946 bg,pt %icc, .LL13 947 LDF [BO + 7 * SIZE], b8 948 .align 4 949 950.LL15: 951#ifndef TRMMKERNEL 952 and K, 7, L 953#else 954#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 955 sub K, KK, L 956#elif defined(LEFT) 957 add KK, 2, L 958#else 959 add KK, 8, L 960#endif 961 and L, 7, L 962#endif 963 cmp L, 0 964 ble,a,pn %icc, .LL18 965 nop 966 .align 4 967 968.LL17: 969 FMADD (aa1, bb1, cc01, cc01) 970 add L, -1, L 971 FMADD (aa2, bb1, cc02, cc02) 972 nop 973 974 FMADD (aa1, bb2, cc03, cc03) 975 LDF [BO + 8 * SIZE], b1 976 FMADD (aa2, bb2, cc04, cc04) 977 LDF [BO + 9 * SIZE], b2 978 979 FMADD (aa1, bb3, cc05, cc05) 980 cmp L, 0 981 FMADD (aa2, bb3, cc06, cc06) 982 nop 983 984 FMADD (aa1, bb4, cc07, cc07) 985 LDF [BO + 10 * SIZE], b3 986 FMADD (aa2, bb4, cc08, cc08) 987 LDF [BO + 11 * SIZE], b4 988 989 FMADD (aa1, bb5, cc09, cc09) 990 nop 991 FMADD (aa2, bb5, cc10, cc10) 992 nop 993 994 FMADD (aa1, bb6, cc11, cc11) 995 LDF [BO + 12 * SIZE], b5 996 FMADD (aa2, bb6, cc12, cc12) 997 LDF [BO + 13 * SIZE], b6 998 999 FMADD (aa1, bb7, cc13, cc13) 1000 add AO, 2 * SIZE, AO 1001 FMADD (aa2, bb7, cc14, cc14) 1002 add BO, 8 * SIZE, BO 1003 1004 FMADD (aa1, bb8, cc15, cc15) 1005 LDF [AO + 0 * SIZE], a1 1006 FMADD (aa2, bb8, cc16, cc16) 1007 LDF [AO + 1 * SIZE], a2 1008 1009 LDF [BO + 6 * SIZE], b7 1010 bg,pt %icc, .LL17 1011 LDF [BO + 7 * SIZE], b8 1012 nop 1013 .align 4 1014 1015.LL18: 1016#ifndef TRMMKERNEL 1017 LDF [C1 + 0 * SIZE], a1 1018 LDF [C1 + 1 * SIZE], a2 1019 LDF [C2 + 0 * SIZE], a3 1020 LDF [C2 + 1 * SIZE], a4 1021 1022 LDF [C3 + 0 * SIZE], b1 1023 LDF [C3 + 1 * SIZE], b2 1024 LDF [C4 + 0 * SIZE], b3 1025 LDF [C4 + 1 * SIZE], b4 1026 1027 FMADD (alpha, cc01, aa1, cc01) 1028 LDF [C5 + 0 * SIZE], a1 1029 FMADD (alpha, cc02, aa2, cc02) 1030 LDF [C5 + 1 * SIZE], a2 1031 FMADD (alpha, cc03, aa3, cc03) 1032 LDF [C6 + 0 * SIZE], a3 1033 FMADD (alpha, cc04, aa4, cc04) 1034 LDF [C6 + 1 * SIZE], a4 1035 1036 FMADD (alpha, cc05, bb1, cc05) 1037 LDF [C7 + 0 * SIZE], b1 1038 FMADD (alpha, cc06, bb2, cc06) 1039 LDF [C7 + 1 * SIZE], b2 1040 FMADD (alpha, cc07, bb3, cc07) 1041 LDF [C8 + 0 * SIZE], b3 1042 FMADD (alpha, cc08, bb4, cc08) 1043 LDF [C8 + 1 * SIZE], b4 1044 1045 FMADD (alpha, cc09, aa1, cc09) 1046 STF c01, [C1 + 0 * SIZE] 1047 FMADD (alpha, cc10, aa2, cc10) 1048 STF c02, [C1 + 1 * SIZE] 1049 FMADD (alpha, cc11, aa3, cc11) 1050 STF c03, [C2 + 0 * SIZE] 1051 FMADD (alpha, cc12, aa4, cc12) 1052 STF c04, [C2 + 1 * SIZE] 1053 1054 FMADD (alpha, cc13, bb1, cc13) 1055 STF c05, [C3 + 0 * SIZE] 1056 FMADD (alpha, cc14, bb2, cc14) 1057 STF c06, [C3 + 1 * SIZE] 1058 FMADD (alpha, cc15, bb3, cc15) 1059 STF c07, [C4 + 0 * SIZE] 1060 FMADD (alpha, cc16, bb4, cc16) 1061 STF c08, [C4 + 1 * SIZE] 1062 1063#else 1064 FMUL ALPHA, c01, c01 1065 FMUL ALPHA, c02, c02 1066 FMUL ALPHA, c03, c03 1067 FMUL ALPHA, c04, c04 1068 1069 FMUL ALPHA, c05, c05 1070 FMUL ALPHA, c06, c06 1071 FMUL ALPHA, c07, c07 1072 FMUL ALPHA, c08, c08 1073 1074 FMUL ALPHA, c09, c09 1075 STF c01, [C1 + 0 * SIZE] 1076 FMUL ALPHA, c10, c10 1077 STF c02, [C1 + 1 * SIZE] 1078 FMUL ALPHA, c11, c11 1079 STF c03, [C2 + 0 * SIZE] 1080 FMUL ALPHA, c12, c12 1081 STF c04, [C2 + 1 * SIZE] 1082 1083 FMUL ALPHA, c13, c13 1084 STF c05, [C3 + 0 * SIZE] 1085 FMUL ALPHA, c14, c14 1086 STF c06, [C3 + 1 * SIZE] 1087 FMUL ALPHA, c15, c15 1088 STF c07, [C4 + 0 * SIZE] 1089 FMUL ALPHA, c16, c16 1090 STF c08, [C4 + 1 * SIZE] 1091#endif 1092 1093 STF c09, [C5 + 0 * SIZE] 1094 add C1, 2 * SIZE, C1 1095 STF c10, [C5 + 1 * SIZE] 1096 add C2, 2 * SIZE, C2 1097 STF c11, [C6 + 0 * SIZE] 1098 add C3, 2 * SIZE, C3 1099 STF c12, [C6 + 1 * SIZE] 1100 add C4, 2 * SIZE, C4 1101 1102 STF c13, [C7 + 0 * SIZE] 1103 add C5, 2 * SIZE, C5 1104 STF c14, [C7 + 1 * SIZE] 1105 add C6, 2 * SIZE, C6 1106 STF c15, [C8 + 0 * SIZE] 1107 add C7, 2 * SIZE, C7 1108 STF c16, [C8 + 1 * SIZE] 1109 add C8, 2 * SIZE, C8 1110 1111#ifdef TRMMKERNEL 1112#if ( defined(LEFT) && defined(TRANSA)) || \ 1113 (!defined(LEFT) && !defined(TRANSA)) 1114 sub K, KK, TEMP1 1115#ifdef LEFT 1116 add TEMP1, -2, TEMP1 1117#else 1118 add TEMP1, -8, TEMP1 1119#endif 1120 sll TEMP1, BASE_SHIFT + 1, TEMP2 1121 sll TEMP1, BASE_SHIFT + 3, TEMP1 1122 1123 add AO, TEMP2, AO 1124 add BO, TEMP1, BO 1125#endif 1126 1127#ifdef LEFT 1128 add KK, 2, KK 1129#endif 1130#endif 1131 1132 add I, -1, I 1133 cmp I, 0 1134 bg,pt %icc, .LL12 1135 nop 1136 .align 4 1137 1138.LL20: 1139 and M, 1, I 1140 cmp I, 0 1141 ble,pn %icc, .LL29 1142 nop 1143 1144#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 1145 mov B, BO 1146#else 1147 sll KK, BASE_SHIFT + 0, TEMP1 1148 sll KK, BASE_SHIFT + 3, TEMP2 1149 1150 add AO, TEMP1, AO 1151 add B, TEMP2, BO 1152#endif 1153 1154 LDF [AO + 0 * SIZE], a1 1155 LDF [AO + 1 * SIZE], a2 1156 LDF [AO + 2 * SIZE], a3 1157 LDF [AO + 3 * SIZE], a4 1158 1159 LDF [BO + 0 * SIZE], b1 1160 FCLR (cc01) 1161 LDF [BO + 1 * SIZE], b2 1162 FCLR (cc03) 1163 LDF [BO + 2 * SIZE], b3 1164 FCLR (cc05) 1165 LDF [BO + 3 * SIZE], b4 1166 FCLR (cc07) 1167 LDF [BO + 4 * SIZE], b5 1168 FCLR (cc09) 1169 LDF [BO + 5 * SIZE], b6 1170 FCLR (cc11) 1171 LDF [BO + 6 * SIZE], b7 1172 FCLR (cc13) 1173 LDF [BO + 7 * SIZE], b8 1174 FCLR (cc15) 1175 1176#ifndef TRMMKERNEL 1177 sra K, 2, L 1178#else 1179#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1180 sub K, KK, L 1181#elif defined(LEFT) 1182 add KK, 1, L 1183#else 1184 add KK, 8, L 1185#endif 1186 sra L, 2, L 1187#endif 1188 cmp L, 0 1189 ble,pn %icc, .LL25 1190 LDF [BO + 8 * SIZE], b9 1191 .align 4 1192 1193.LL23: 1194 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1195 add L, -1, L 1196 1197 FMADD (aa1, bb1, cc01, cc01) 1198 LDF [BO + 16 * SIZE], b1 1199 FMADD (aa1, bb2, cc03, cc03) 1200 LDF [BO + 9 * SIZE], b2 1201 1202 FMADD (aa1, bb3, cc05, cc05) 1203 LDF [BO + 10 * SIZE], b3 1204 FMADD (aa1, bb4, cc07, cc07) 1205 LDF [BO + 11 * SIZE], b4 1206 1207 FMADD (aa1, bb5, cc09, cc09) 1208 LDF [BO + 12 * SIZE], b5 1209 FMADD (aa1, bb6, cc11, cc11) 1210 LDF [BO + 13 * SIZE], b6 1211 1212 FMADD (aa1, bb7, cc13, cc13) 1213 LDF [BO + 14 * SIZE], b7 1214 FMADD (aa1, bb8, cc15, cc15) 1215 LDF [BO + 15 * SIZE], b8 1216 1217 FMADD (aa2, bb9, cc01, cc01) 1218 LDF [BO + 24 * SIZE], b9 1219 FMADD (aa2, bb2, cc03, cc03) 1220 LDF [BO + 17 * SIZE], b2 1221 1222 FMADD (aa2, bb3, cc05, cc05) 1223 LDF [BO + 18 * SIZE], b3 1224 FMADD (aa2, bb4, cc07, cc07) 1225 LDF [BO + 19 * SIZE], b4 1226 1227 FMADD (aa2, bb5, cc09, cc09) 1228 LDF [BO + 20 * SIZE], b5 1229 FMADD (aa2, bb6, cc11, cc11) 1230 LDF [BO + 21 * SIZE], b6 1231 1232 FMADD (aa2, bb7, cc13, cc13) 1233 LDF [BO + 22 * SIZE], b7 1234 FMADD (aa2, bb8, cc15, cc15) 1235 LDF [BO + 23 * SIZE], b8 1236 1237 LDF [AO + 4 * SIZE], a1 1238 LDF [AO + 5 * SIZE], a2 1239 1240 FMADD (aa3, bb1, cc01, cc01) 1241 LDF [BO + 32 * SIZE], b1 1242 FMADD (aa3, bb2, cc03, cc03) 1243 LDF [BO + 25 * SIZE], b2 1244 1245 FMADD (aa3, bb3, cc05, cc05) 1246 LDF [BO + 26 * SIZE], b3 1247 FMADD (aa3, bb4, cc07, cc07) 1248 LDF [BO + 27 * SIZE], b4 1249 1250 FMADD (aa3, bb5, cc09, cc09) 1251 LDF [BO + 28 * SIZE], b5 1252 FMADD (aa3, bb6, cc11, cc11) 1253 LDF [BO + 29 * SIZE], b6 1254 1255 FMADD (aa3, bb7, cc13, cc13) 1256 LDF [BO + 30 * SIZE], b7 1257 FMADD (aa3, bb8, cc15, cc15) 1258 LDF [BO + 31 * SIZE], b8 1259 1260 FMADD (aa4, bb9, cc01, cc01) 1261 LDF [BO + 40 * SIZE], b9 1262 FMADD (aa4, bb2, cc03, cc03) 1263 LDF [BO + 33 * SIZE], b2 1264 1265 FMADD (aa4, bb3, cc05, cc05) 1266 LDF [BO + 34 * SIZE], b3 1267 FMADD (aa4, bb4, cc07, cc07) 1268 LDF [BO + 35 * SIZE], b4 1269 1270 FMADD (aa4, bb5, cc09, cc09) 1271 LDF [BO + 36 * SIZE], b5 1272 FMADD (aa4, bb6, cc11, cc11) 1273 LDF [BO + 37 * SIZE], b6 1274 1275 FMADD (aa4, bb7, cc13, cc13) 1276 LDF [BO + 38 * SIZE], b7 1277 FMADD (aa4, bb8, cc15, cc15) 1278 LDF [BO + 39 * SIZE], b8 1279 1280 LDF [AO + 6 * SIZE], a3 1281 LDF [AO + 7 * SIZE], a4 1282 1283 add AO, 4 * SIZE, AO 1284 cmp L, 0 1285 bg,pt %icc, .LL23 1286 add BO, 32 * SIZE, BO 1287 .align 4 1288 1289.LL25: 1290#ifndef TRMMKERNEL 1291 and K, 3, L 1292#else 1293#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1294 sub K, KK, L 1295#elif defined(LEFT) 1296 add KK, 1, L 1297#else 1298 add KK, 8, L 1299#endif 1300 and L, 3, L 1301#endif 1302 cmp L, 0 1303 ble,a,pn %icc, .LL28 1304 nop 1305 .align 4 1306 1307.LL27: 1308 FMADD (aa1, bb1, cc01, cc01) 1309 LDF [BO + 8 * SIZE], b1 1310 FMADD (aa1, bb2, cc03, cc03) 1311 LDF [BO + 9 * SIZE], b2 1312 1313 FMADD (aa1, bb3, cc05, cc05) 1314 LDF [BO + 10 * SIZE], b3 1315 FMADD (aa1, bb4, cc07, cc07) 1316 LDF [BO + 11 * SIZE], b4 1317 1318 FMADD (aa1, bb5, cc09, cc09) 1319 LDF [BO + 12 * SIZE], b5 1320 FMADD (aa1, bb6, cc11, cc11) 1321 LDF [BO + 13 * SIZE], b6 1322 1323 FMADD (aa1, bb7, cc13, cc13) 1324 LDF [BO + 14 * SIZE], b7 1325 FMADD (aa1, bb8, cc15, cc15) 1326 LDF [BO + 15 * SIZE], b8 1327 1328 LDF [AO + 1 * SIZE], a1 1329 add AO, 1 * SIZE, AO 1330 1331 add L, -1, L 1332 cmp L, 0 1333 bg,pt %icc, .LL27 1334 add BO, 8 * SIZE, BO 1335 .align 4 1336 1337.LL28: 1338#ifndef TRMMKERNEL 1339 LDF [C1 + 0 * SIZE], a1 1340 LDF [C2 + 0 * SIZE], a2 1341 LDF [C3 + 0 * SIZE], a3 1342 LDF [C4 + 0 * SIZE], a4 1343 1344 FMADD (alpha, cc01, aa1, cc01) 1345 LDF [C5 + 0 * SIZE], b1 1346 FMADD (alpha, cc03, aa2, cc03) 1347 LDF [C6 + 0 * SIZE], b2 1348 1349 FMADD (alpha, cc05, aa3, cc05) 1350 LDF [C7 + 0 * SIZE], b3 1351 FMADD (alpha, cc07, aa4, cc07) 1352 LDF [C8 + 0 * SIZE], b4 1353 1354 FMADD (alpha, cc09, bb1, cc09) 1355 STF c01, [C1 + 0 * SIZE] 1356 FMADD (alpha, cc11, bb2, cc11) 1357 STF c03, [C2 + 0 * SIZE] 1358 FMADD (alpha, cc13, bb3, cc13) 1359 STF c05, [C3 + 0 * SIZE] 1360 FMADD (alpha, cc15, bb4, cc15) 1361 STF c07, [C4 + 0 * SIZE] 1362#else 1363 FMUL ALPHA, c01, c01 1364 FMUL ALPHA, c03, c03 1365 FMUL ALPHA, c05, c05 1366 FMUL ALPHA, c07, c07 1367 1368 FMUL ALPHA, c09, c09 1369 STF c01, [C1 + 0 * SIZE] 1370 FMUL ALPHA, c11, c11 1371 STF c03, [C2 + 0 * SIZE] 1372 1373 FMUL ALPHA, c13, c13 1374 STF c05, [C3 + 0 * SIZE] 1375 FMUL ALPHA, c15, c15 1376 STF c07, [C4 + 0 * SIZE] 1377#endif 1378 1379 STF c09, [C5 + 0 * SIZE] 1380 STF c11, [C6 + 0 * SIZE] 1381 STF c13, [C7 + 0 * SIZE] 1382 STF c15, [C8 + 0 * SIZE] 1383 1384#ifdef TRMMKERNEL 1385#if ( defined(LEFT) && defined(TRANSA)) || \ 1386 (!defined(LEFT) && !defined(TRANSA)) 1387 sub K, KK, TEMP1 1388#ifdef LEFT 1389 add TEMP1, -1, TEMP1 1390#else 1391 add TEMP1, -8, TEMP1 1392#endif 1393 sll TEMP1, BASE_SHIFT + 0, TEMP2 1394 sll TEMP1, BASE_SHIFT + 3, TEMP1 1395 1396 add AO, TEMP2, AO 1397 add BO, TEMP1, BO 1398#endif 1399 1400#ifdef LEFT 1401 add KK, 1, KK 1402#endif 1403#endif 1404 .align 4 1405 1406.LL29: 1407#if defined(TRMMKERNEL) && !defined(LEFT) 1408 add KK, 8, KK 1409#endif 1410 1411 add J, -1, J 1412 cmp J, 0 1413 bg,pt %icc, .LL11 1414 mov BO, B 1415 .align 4 1416 1417.LL30: 1418 and N, 4, J 1419 cmp J, 0 1420 ble,pn %icc, .LL50 1421 mov C, C1 1422 1423 add C, LDC, C2 1424 add C2, LDC, C3 1425 add C3, LDC, C4 1426 add C4, LDC, C 1427 1428#if defined(TRMMKERNEL) && defined(LEFT) 1429 mov OFFSET, KK 1430#endif 1431 1432 sra M, 1, I 1433 cmp I, 0 1434 ble,pn %icc, .LL40 1435 mov A, AO 1436 .align 4 1437 1438.LL32: 1439#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 1440 mov B, BO 1441#else 1442 sll KK, BASE_SHIFT + 1, TEMP1 1443 sll KK, BASE_SHIFT + 2, TEMP2 1444 1445 add AO, TEMP1, AO 1446 add B, TEMP2, BO 1447#endif 1448 1449 LDF [AO + 0 * SIZE], a1 1450 LDF [AO + 1 * SIZE], a2 1451 1452 LDF [BO + 0 * SIZE], b1 1453 LDF [BO + 1 * SIZE], b2 1454 LDF [BO + 2 * SIZE], b3 1455 LDF [BO + 3 * SIZE], b4 1456 LDF [BO + 4 * SIZE], b5 1457 1458 LDF [BO + 5 * SIZE], b6 1459 FCLR (cc01) 1460 LDF [BO + 6 * SIZE], b7 1461 FCLR (cc02) 1462 LDF [BO + 7 * SIZE], b8 1463 FCLR (cc03) 1464 LDF [BO + 8 * SIZE], b9 1465 FCLR (cc04) 1466 1467 prefetch [C1 + 2 * SIZE], 3 1468 FCLR (cc05) 1469 prefetch [C2 + 2 * SIZE], 3 1470 FCLR (cc06) 1471 prefetch [C3 + 2 * SIZE], 3 1472 FCLR (cc07) 1473 prefetch [C4 + 2 * SIZE], 3 1474 FCLR (cc08) 1475 1476#ifndef TRMMKERNEL 1477 sra K, 2, L 1478#else 1479#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1480 sub K, KK, L 1481#elif defined(LEFT) 1482 add KK, 2, L 1483#else 1484 add KK, 4, L 1485#endif 1486 sra L, 2, L 1487#endif 1488 cmp L, 0 1489 ble,pn %icc, .LL35 1490 nop 1491 .align 4 1492 1493.LL33: 1494 FMADD (aa1, bb1, cc01, cc01) 1495 LDF [AO + 2 * SIZE], a3 1496 FMADD (aa2, bb1, cc02, cc02) 1497 LDF [AO + 3 * SIZE], a4 1498 1499 FMADD (aa1, bb2, cc03, cc03) 1500 LDF [BO + 16 * SIZE], b1 1501 FMADD (aa2, bb2, cc04, cc04) 1502 LDF [BO + 9 * SIZE], b2 1503 1504 FMADD (aa1, bb3, cc05, cc05) 1505 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1506 FMADD (aa2, bb3, cc06, cc06) 1507 add L, -1, L 1508 1509 FMADD (aa1, bb4, cc07, cc07) 1510 LDF [BO + 10 * SIZE], b3 1511 FMADD (aa2, bb4, cc08, cc08) 1512 LDF [BO + 11 * SIZE], b4 1513 1514 FMADD (aa3, bb5, cc01, cc01) 1515 LDF [AO + 4 * SIZE], a1 1516 FMADD (aa4, bb5, cc02, cc02) 1517 LDF [AO + 5 * SIZE], a2 1518 1519 FMADD (aa3, bb6, cc03, cc03) 1520 LDF [BO + 12 * SIZE], b5 1521 FMADD (aa4, bb6, cc04, cc04) 1522 LDF [BO + 13 * SIZE], b6 1523 1524 FMADD (aa3, bb7, cc05, cc05) 1525 cmp L, 0 1526 FMADD (aa4, bb7, cc06, cc06) 1527 add AO, 8 * SIZE, AO 1528 1529 FMADD (aa3, bb8, cc07, cc07) 1530 LDF [BO + 14 * SIZE], b7 1531 FMADD (aa4, bb8, cc08, cc08) 1532 LDF [BO + 15 * SIZE], b8 1533 1534 FMADD (aa1, bb9, cc01, cc01) 1535 LDF [AO - 2 * SIZE], a3 1536 FMADD (aa2, bb9, cc02, cc02) 1537 LDF [AO - 1 * SIZE], a4 1538 1539 FMADD (aa1, bb2, cc03, cc03) 1540 LDF [BO + 24 * SIZE], b9 1541 FMADD (aa2, bb2, cc04, cc04) 1542 LDF [BO + 17 * SIZE], b2 1543 1544 FMADD (aa1, bb3, cc05, cc05) 1545 add BO, 16 * SIZE, BO 1546 FMADD (aa2, bb3, cc06, cc06) 1547 nop 1548 1549 FMADD (aa1, bb4, cc07, cc07) 1550 LDF [BO + 2 * SIZE], b3 1551 FMADD (aa2, bb4, cc08, cc08) 1552 LDF [BO + 3 * SIZE], b4 1553 1554 FMADD (aa3, bb5, cc01, cc01) 1555 LDF [AO + 0 * SIZE], a1 1556 FMADD (aa4, bb5, cc02, cc02) 1557 LDF [AO + 1 * SIZE], a2 1558 FMADD (aa3, bb6, cc03, cc03) 1559 LDF [BO + 4 * SIZE], b5 1560 FMADD (aa4, bb6, cc04, cc04) 1561 LDF [BO + 5 * SIZE], b6 1562 1563 FMADD (aa3, bb7, cc05, cc05) 1564 nop 1565 FMADD (aa4, bb7, cc06, cc06) 1566 LDF [BO + 6 * SIZE], b7 1567 1568 FMADD (aa3, bb8, cc07, cc07) 1569 FMADD (aa4, bb8, cc08, cc08) 1570 bg,pt %icc, .LL33 1571 LDF [BO + 7 * SIZE], b8 1572 .align 4 1573 1574.LL35: 1575#ifndef TRMMKERNEL 1576 and K, 3, L 1577#else 1578#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1579 sub K, KK, L 1580#elif defined(LEFT) 1581 add KK, 2, L 1582#else 1583 add KK, 4, L 1584#endif 1585 and L, 3, L 1586#endif 1587 cmp L, 0 1588 ble,a,pn %icc, .LL38 1589 nop 1590 .align 4 1591 1592.LL37: 1593 1594 FMADD (aa1, bb1, cc01, cc01) 1595 add L, -1, L 1596 FMADD (aa2, bb1, cc02, cc02) 1597 LDF [BO + 4 * SIZE], b1 1598 1599 FMADD (aa1, bb2, cc03, cc03) 1600 add AO, 2 * SIZE, AO 1601 FMADD (aa2, bb2, cc04, cc04) 1602 LDF [BO + 5 * SIZE], b2 1603 1604 FMADD (aa1, bb3, cc05, cc05) 1605 cmp L, 0 1606 FMADD (aa2, bb3, cc06, cc06) 1607 LDF [BO + 6 * SIZE], b3 1608 1609 FMADD (aa1, bb4, cc07, cc07) 1610 LDF [AO + 0 * SIZE], a1 1611 FMADD (aa2, bb4, cc08, cc08) 1612 LDF [AO + 1 * SIZE], a2 1613 1614 LDF [BO + 7 * SIZE], b4 1615 bg,pt %icc, .LL37 1616 add BO, 4 * SIZE, BO 1617 .align 4 1618 1619.LL38: 1620#ifndef TRMMKERNEL 1621 LDF [C1 + 0 * SIZE], a1 1622 LDF [C1 + 1 * SIZE], a2 1623 LDF [C2 + 0 * SIZE], a3 1624 LDF [C2 + 1 * SIZE], a4 1625 1626 FMADD (alpha, cc01, aa1, cc01) 1627 LDF [C3 + 0 * SIZE], b1 1628 FMADD (alpha, cc02, aa2, cc02) 1629 LDF [C3 + 1 * SIZE], b2 1630 FMADD (alpha, cc03, aa3, cc03) 1631 LDF [C4 + 0 * SIZE], b3 1632 FMADD (alpha, cc04, aa4, cc04) 1633 LDF [C4 + 1 * SIZE], b4 1634 1635 FMADD (alpha, cc05, bb1, cc05) 1636 STF c01, [C1 + 0 * SIZE] 1637 FMADD (alpha, cc06, bb2, cc06) 1638 STF c02, [C1 + 1 * SIZE] 1639 FMADD (alpha, cc07, bb3, cc07) 1640 STF c03, [C2 + 0 * SIZE] 1641 FMADD (alpha, cc08, bb4, cc08) 1642 STF c04, [C2 + 1 * SIZE] 1643#else 1644 1645 FMUL ALPHA, c01, c01 1646 FMUL ALPHA, c02, c02 1647 FMUL ALPHA, c03, c03 1648 FMUL ALPHA, c04, c04 1649 1650 FMUL ALPHA, c05, c05 1651 STF c01, [C1 + 0 * SIZE] 1652 FMUL ALPHA, c06, c06 1653 STF c02, [C1 + 1 * SIZE] 1654 FMUL ALPHA, c07, c07 1655 STF c03, [C2 + 0 * SIZE] 1656 FMUL ALPHA, c08, c08 1657 STF c04, [C2 + 1 * SIZE] 1658#endif 1659 1660 STF c05, [C3 + 0 * SIZE] 1661 add C1, 2 * SIZE, C1 1662 STF c06, [C3 + 1 * SIZE] 1663 add C2, 2 * SIZE, C2 1664 STF c07, [C4 + 0 * SIZE] 1665 add C3, 2 * SIZE, C3 1666 STF c08, [C4 + 1 * SIZE] 1667 add C4, 2 * SIZE, C4 1668 1669#ifdef TRMMKERNEL 1670#if ( defined(LEFT) && defined(TRANSA)) || \ 1671 (!defined(LEFT) && !defined(TRANSA)) 1672 sub K, KK, TEMP1 1673#ifdef LEFT 1674 add TEMP1, -2, TEMP1 1675#else 1676 add TEMP1, -4, TEMP1 1677#endif 1678 sll TEMP1, BASE_SHIFT + 1, TEMP2 1679 sll TEMP1, BASE_SHIFT + 2, TEMP1 1680 1681 add AO, TEMP2, AO 1682 add BO, TEMP1, BO 1683#endif 1684 1685#ifdef LEFT 1686 add KK, 2, KK 1687#endif 1688#endif 1689 1690 add I, -1, I 1691 cmp I, 0 1692 bg,pt %icc, .LL32 1693 nop 1694 1695.LL40: 1696 and M, 1, I 1697 cmp I, 0 1698 ble,pn %icc, .LL49 1699 nop 1700 1701#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 1702 mov B, BO 1703#else 1704 sll KK, BASE_SHIFT + 0, TEMP1 1705 sll KK, BASE_SHIFT + 2, TEMP2 1706 1707 add AO, TEMP1, AO 1708 add B, TEMP2, BO 1709#endif 1710 1711 LDF [AO + 0 * SIZE], a1 1712 LDF [AO + 1 * SIZE], a2 1713 LDF [AO + 2 * SIZE], a3 1714 LDF [AO + 3 * SIZE], a4 1715 1716 LDF [BO + 0 * SIZE], b1 1717 LDF [BO + 1 * SIZE], b2 1718 LDF [BO + 2 * SIZE], b3 1719 LDF [BO + 3 * SIZE], b4 1720 LDF [BO + 4 * SIZE], b5 1721 LDF [BO + 5 * SIZE], b6 1722 FCLR (cc01) 1723 LDF [BO + 6 * SIZE], b7 1724 FCLR (cc03) 1725 LDF [BO + 7 * SIZE], b8 1726 FCLR (cc05) 1727 LDF [BO + 8 * SIZE], b9 1728 FCLR (cc07) 1729 1730#ifndef TRMMKERNEL 1731 sra K, 2, L 1732#else 1733#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1734 sub K, KK, L 1735#elif defined(LEFT) 1736 add KK, 1, L 1737#else 1738 add KK, 4, L 1739#endif 1740 sra L, 2, L 1741#endif 1742 cmp L, 0 1743 ble,pn %icc, .LL45 1744 nop 1745 1746.LL43: 1747 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1748 add L, -1, L 1749 1750 FMADD (aa1, bb1, cc01, cc01) 1751 LDF [BO + 16 * SIZE], b1 1752 FMADD (aa1, bb2, cc03, cc03) 1753 LDF [BO + 9 * SIZE], b2 1754 FMADD (aa1, bb3, cc05, cc05) 1755 LDF [BO + 10 * SIZE], b3 1756 FMADD (aa1, bb4, cc07, cc07) 1757 LDF [BO + 11 * SIZE], b4 1758 1759 LDF [AO + 4 * SIZE], a1 1760 cmp L, 0 1761 1762 FMADD (aa2, bb5, cc01, cc01) 1763 LDF [BO + 12 * SIZE], b5 1764 FMADD (aa2, bb6, cc03, cc03) 1765 LDF [BO + 13 * SIZE], b6 1766 FMADD (aa2, bb7, cc05, cc05) 1767 LDF [BO + 14 * SIZE], b7 1768 FMADD (aa2, bb8, cc07, cc07) 1769 LDF [BO + 15 * SIZE], b8 1770 1771 LDF [AO + 5 * SIZE], a2 1772 add AO, 4 * SIZE, AO 1773 1774 FMADD (aa3, bb9, cc01, cc01) 1775 LDF [BO + 24 * SIZE], b9 1776 FMADD (aa3, bb2, cc03, cc03) 1777 LDF [BO + 17 * SIZE], b2 1778 FMADD (aa3, bb3, cc05, cc05) 1779 LDF [BO + 18 * SIZE], b3 1780 FMADD (aa3, bb4, cc07, cc07) 1781 LDF [BO + 19 * SIZE], b4 1782 1783 LDF [AO + 2 * SIZE], a3 1784 add BO, 16 * SIZE, BO 1785 1786 FMADD (aa4, bb5, cc01, cc01) 1787 LDF [BO + 4 * SIZE], b5 1788 FMADD (aa4, bb6, cc03, cc03) 1789 LDF [BO + 5 * SIZE], b6 1790 FMADD (aa4, bb7, cc05, cc05) 1791 LDF [BO + 6 * SIZE], b7 1792 FMADD (aa4, bb8, cc07, cc07) 1793 LDF [BO + 7 * SIZE], b8 1794 1795 bg,pt %icc, .LL43 1796 LDF [AO + 3 * SIZE], a4 1797 .align 4 1798 1799.LL45: 1800#ifndef TRMMKERNEL 1801 and K, 3, L 1802#else 1803#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1804 sub K, KK, L 1805#elif defined(LEFT) 1806 add KK, 1, L 1807#else 1808 add KK, 4, L 1809#endif 1810 and L, 3, L 1811#endif 1812 cmp L, 0 1813 ble,a,pn %icc, .LL48 1814 nop 1815 .align 4 1816 1817.LL47: 1818 FMADD (aa1, bb1, cc01, cc01) 1819 LDF [BO + 4 * SIZE], b1 1820 add L, -1, L 1821 FMADD (aa1, bb2, cc03, cc03) 1822 LDF [BO + 5 * SIZE], b2 1823 add AO, 1 * SIZE, AO 1824 1825 FMADD (aa1, bb3, cc05, cc05) 1826 LDF [BO + 6 * SIZE], b3 1827 cmp L, 0 1828 FMADD (aa1, bb4, cc07, cc07) 1829 LDF [BO + 7 * SIZE], b4 1830 add BO, 4 * SIZE, BO 1831 1832 bg,pt %icc, .LL47 1833 LDF [AO + 0 * SIZE], a1 1834 .align 4 1835 1836.LL48: 1837#ifndef TRMMKERNEL 1838 LDF [C1 + 0 * SIZE], a1 1839 LDF [C2 + 0 * SIZE], a2 1840 LDF [C3 + 0 * SIZE], a3 1841 LDF [C4 + 0 * SIZE], a4 1842 1843 FMADD (alpha, cc01, aa1, cc01) 1844 FMADD (alpha, cc03, aa2, cc03) 1845 FMADD (alpha, cc05, aa3, cc05) 1846 FMADD (alpha, cc07, aa4, cc07) 1847#else 1848 FMUL ALPHA, c01, c01 1849 FMUL ALPHA, c03, c03 1850 FMUL ALPHA, c05, c05 1851 FMUL ALPHA, c07, c07 1852#endif 1853 1854 STF c01, [C1 + 0 * SIZE] 1855 STF c03, [C2 + 0 * SIZE] 1856 STF c05, [C3 + 0 * SIZE] 1857 STF c07, [C4 + 0 * SIZE] 1858 1859#ifdef TRMMKERNEL 1860#if ( defined(LEFT) && defined(TRANSA)) || \ 1861 (!defined(LEFT) && !defined(TRANSA)) 1862 sub K, KK, TEMP1 1863#ifdef LEFT 1864 add TEMP1, -1, TEMP1 1865#else 1866 add TEMP1, -4, TEMP1 1867#endif 1868 sll TEMP1, BASE_SHIFT + 0, TEMP2 1869 sll TEMP1, BASE_SHIFT + 2, TEMP1 1870 1871 add AO, TEMP2, AO 1872 add BO, TEMP1, BO 1873#endif 1874 1875#ifdef LEFT 1876 add KK, 1, KK 1877#endif 1878#endif 1879 .align 4 1880 1881.LL49: 1882#if defined(TRMMKERNEL) && !defined(LEFT) 1883 add KK, 4, KK 1884#endif 1885 mov BO, B 1886 .align 4 1887 1888.LL50: 1889 and N, 2, J 1890 cmp J, 0 1891 ble,pn %icc, .LL70 1892 mov C, C1 1893 1894 add C, LDC, C2 1895 add C2, LDC, C 1896 1897#if defined(TRMMKERNEL) && defined(LEFT) 1898 mov OFFSET, KK 1899#endif 1900 1901 sra M, 1, I 1902 cmp I, 0 1903 ble,pn %icc, .LL60 1904 mov A, AO 1905 .align 4 1906 1907.LL52: 1908#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 1909 mov B, BO 1910#else 1911 sll KK, BASE_SHIFT + 1, TEMP1 1912 sll KK, BASE_SHIFT + 1, TEMP2 1913 1914 add AO, TEMP1, AO 1915 add B, TEMP2, BO 1916#endif 1917 1918 LDF [AO + 0 * SIZE], a1 1919 LDF [AO + 1 * SIZE], a2 1920 LDF [AO + 2 * SIZE], a3 1921 LDF [AO + 3 * SIZE], a4 1922 1923 LDF [BO + 0 * SIZE], b1 1924 LDF [BO + 1 * SIZE], b2 1925 LDF [BO + 2 * SIZE], b3 1926 FCLR (cc01) 1927 LDF [BO + 3 * SIZE], b4 1928 FCLR (cc02) 1929 1930 LDF [BO + 4 * SIZE], b5 1931 FCLR (cc03) 1932 LDF [BO + 5 * SIZE], b6 1933 FCLR (cc04) 1934 LDF [BO + 6 * SIZE], b7 1935 FCLR (cc05) 1936 LDF [BO + 7 * SIZE], b8 1937 FCLR (cc06) 1938 1939 prefetch [C1 + 2 * SIZE], 3 1940 FCLR (cc07) 1941 prefetch [C2 + 2 * SIZE], 3 1942 FCLR (cc08) 1943 1944#ifndef TRMMKERNEL 1945 sra K, 2, L 1946#else 1947#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1948 sub K, KK, L 1949#elif defined(LEFT) 1950 add KK, 2, L 1951#else 1952 add KK, 2, L 1953#endif 1954 sra L, 2, L 1955#endif 1956 cmp L, 0 1957 ble,pn %icc, .LL55 1958 nop 1959 .align 4 1960 1961.LL53: 1962 FMADD (aa1, bb1, cc01, cc01) 1963 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1964 FMADD (aa2, bb1, cc02, cc02) 1965 LDF [BO + 8 * SIZE], b1 1966 1967 FMADD (aa1, bb2, cc03, cc03) 1968 LDF [AO + 4 * SIZE], a1 1969 FMADD (aa2, bb2, cc04, cc04) 1970 LDF [AO + 5 * SIZE], a2 1971 1972 FMADD (aa3, bb3, cc01, cc01) 1973 LDF [BO + 9 * SIZE], b2 1974 FMADD (aa4, bb3, cc02, cc02) 1975 LDF [BO + 10 * SIZE], b3 1976 1977 FMADD (aa3, bb4, cc03, cc03) 1978 LDF [AO + 6 * SIZE], a3 1979 FMADD (aa4, bb4, cc04, cc04) 1980 LDF [AO + 7 * SIZE], a4 1981 1982 FMADD (aa1, bb5, cc01, cc01) 1983 LDF [BO + 11 * SIZE], b4 1984 FMADD (aa2, bb5, cc02, cc02) 1985 LDF [BO + 12 * SIZE], b5 1986 1987 FMADD (aa1, bb6, cc03, cc03) 1988 LDF [AO + 8 * SIZE], a1 1989 FMADD (aa2, bb6, cc04, cc04) 1990 LDF [AO + 9 * SIZE], a2 1991 1992 FMADD (aa3, bb7, cc01, cc01) 1993 LDF [BO + 13 * SIZE], b6 1994 1995 FMADD (aa4, bb7, cc02, cc02) 1996 LDF [BO + 14 * SIZE], b7 1997 1998 FMADD (aa3, bb8, cc03, cc03) 1999 LDF [AO + 10 * SIZE], a3 2000 FMADD (aa4, bb8, cc04, cc04) 2001 LDF [AO + 11 * SIZE], a4 2002 2003 add AO, 8 * SIZE, AO 2004 add L, -1, L 2005 add BO, 8 * SIZE, BO 2006 cmp L, 0 2007 2008 bg,pt %icc, .LL53 2009 LDF [BO + 7 * SIZE], b8 2010 .align 4 2011 2012.LL55: 2013#ifndef TRMMKERNEL 2014 and K, 3, L 2015#else 2016#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2017 sub K, KK, L 2018#elif defined(LEFT) 2019 add KK, 2, L 2020#else 2021 add KK, 2, L 2022#endif 2023 and L, 3, L 2024#endif 2025 cmp L, 0 2026 ble,a,pn %icc, .LL58 2027 nop 2028 .align 4 2029 2030.LL57: 2031 FMADD (aa1, bb1, cc01, cc01) 2032 add L, -1, L 2033 FMADD (aa2, bb1, cc02, cc02) 2034 LDF [BO + 2 * SIZE], b1 2035 2036 FMADD (aa1, bb2, cc03, cc03) 2037 LDF [AO + 2 * SIZE], a1 2038 FMADD (aa2, bb2, cc04, cc04) 2039 LDF [AO + 3 * SIZE], a2 2040 2041 add AO, 2 * SIZE, AO 2042 cmp L, 0 2043 add BO, 2 * SIZE, BO 2044 bg,pt %icc, .LL57 2045 LDF [BO + 1 * SIZE], b2 2046 .align 4 2047 2048.LL58: 2049#ifndef TRMMKERNEL 2050 LDF [C1 + 0 * SIZE], a1 2051 LDF [C1 + 1 * SIZE], a2 2052 LDF [C2 + 0 * SIZE], a3 2053 LDF [C2 + 1 * SIZE], a4 2054 2055 FMADD (alpha, cc01, aa1, cc01) 2056 FMADD (alpha, cc02, aa2, cc02) 2057 FMADD (alpha, cc03, aa3, cc03) 2058 FMADD (alpha, cc04, aa4, cc04) 2059#else 2060 FMUL ALPHA, c01, c01 2061 FMUL ALPHA, c02, c02 2062 FMUL ALPHA, c03, c03 2063 FMUL ALPHA, c04, c04 2064#endif 2065 2066 STF c01, [C1 + 0 * SIZE] 2067 add I, -1, I 2068 STF c02, [C1 + 1 * SIZE] 2069 add C1, 2 * SIZE, C1 2070 2071 STF c03, [C2 + 0 * SIZE] 2072 cmp I, 0 2073 STF c04, [C2 + 1 * SIZE] 2074 add C2, 2 * SIZE, C2 2075 2076#ifdef TRMMKERNEL 2077#if ( defined(LEFT) && defined(TRANSA)) || \ 2078 (!defined(LEFT) && !defined(TRANSA)) 2079 sub K, KK, TEMP1 2080#ifdef LEFT 2081 add TEMP1, -2, TEMP1 2082#else 2083 add TEMP1, -2, TEMP1 2084#endif 2085 sll TEMP1, BASE_SHIFT + 1, TEMP2 2086 sll TEMP1, BASE_SHIFT + 1, TEMP1 2087 2088 add AO, TEMP2, AO 2089 add BO, TEMP1, BO 2090#endif 2091 2092#ifdef LEFT 2093 add KK, 2, KK 2094#endif 2095#endif 2096 2097 bg,pt %icc, .LL52 2098 nop 2099 .align 4 2100 2101.LL60: 2102 and M, 1, I 2103 cmp I, 0 2104 ble,pn %icc, .LL69 2105 nop 2106 2107#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 2108 mov B, BO 2109#else 2110 sll KK, BASE_SHIFT + 0, TEMP1 2111 sll KK, BASE_SHIFT + 1, TEMP2 2112 2113 add AO, TEMP1, AO 2114 add B, TEMP2, BO 2115#endif 2116 2117 LDF [AO + 0 * SIZE], a1 2118 LDF [AO + 1 * SIZE], a2 2119 LDF [AO + 2 * SIZE], a3 2120 LDF [AO + 3 * SIZE], a4 2121 2122 LDF [BO + 0 * SIZE], b1 2123 LDF [BO + 1 * SIZE], b2 2124 LDF [BO + 2 * SIZE], b3 2125 LDF [BO + 3 * SIZE], b4 2126 LDF [BO + 4 * SIZE], b5 2127 LDF [BO + 5 * SIZE], b6 2128 LDF [BO + 6 * SIZE], b7 2129 FCLR (cc01) 2130 LDF [BO + 7 * SIZE], b8 2131 FCLR (cc03) 2132 2133#ifndef TRMMKERNEL 2134 sra K, 2, L 2135#else 2136#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2137 sub K, KK, L 2138#elif defined(LEFT) 2139 add KK, 1, L 2140#else 2141 add KK, 2, L 2142#endif 2143 sra L, 2, L 2144#endif 2145 cmp L, 0 2146 ble,pn %icc, .LL65 2147 nop 2148 .align 4 2149 2150.LL63: 2151 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2152 add L, -1, L 2153 2154 FMADD (aa1, bb1, cc01, cc01) 2155 LDF [BO + 8 * SIZE], b1 2156 FMADD (aa1, bb2, cc03, cc03) 2157 LDF [BO + 9 * SIZE], b2 2158 2159 LDF [AO + 4 * SIZE], a1 2160 cmp L, 0 2161 2162 FMADD (aa2, bb3, cc01, cc01) 2163 LDF [BO + 10 * SIZE], b3 2164 FMADD (aa2, bb4, cc03, cc03) 2165 LDF [BO + 11 * SIZE], b4 2166 2167 LDF [AO + 5 * SIZE], a2 2168 add AO, 4 * SIZE, AO 2169 2170 FMADD (aa3, bb5, cc01, cc01) 2171 LDF [BO + 12 * SIZE], b5 2172 FMADD (aa3, bb6, cc03, cc03) 2173 LDF [BO + 13 * SIZE], b6 2174 2175 LDF [AO + 2 * SIZE], a3 2176 add BO, 8 * SIZE, BO 2177 2178 FMADD (aa4, bb7, cc01, cc01) 2179 LDF [BO + 6 * SIZE], b7 2180 FMADD (aa4, bb8, cc03, cc03) 2181 LDF [BO + 7 * SIZE], b8 2182 2183 bg,pt %icc, .LL63 2184 LDF [AO + 3 * SIZE], a4 2185 .align 4 2186 2187.LL65: 2188#ifndef TRMMKERNEL 2189 and K, 3, L 2190#else 2191#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2192 sub K, KK, L 2193#elif defined(LEFT) 2194 add KK, 1, L 2195#else 2196 add KK, 2, L 2197#endif 2198 and L, 3, L 2199#endif 2200 cmp L, 0 2201 ble,a,pn %icc, .LL68 2202 nop 2203 .align 4 2204 2205.LL67: 2206 FMADD (aa1, bb1, cc01, cc01) 2207 LDF [BO + 2 * SIZE], b1 2208 FMADD (aa1, bb2, cc03, cc03) 2209 LDF [BO + 3 * SIZE], b2 2210 2211 LDF [AO + 1 * SIZE], a1 2212 add L, -1, L 2213 add AO, 1 * SIZE, AO 2214 cmp L, 0 2215 2216 bg,pt %icc, .LL67 2217 add BO, 2 * SIZE, BO 2218 .align 4 2219 2220.LL68: 2221#ifndef TRMMKERNEL 2222 LDF [C1 + 0 * SIZE], a1 2223 LDF [C2 + 0 * SIZE], a2 2224 2225 FMADD (alpha, cc01, aa1, cc01) 2226 FMADD (alpha, cc03, aa2, cc03) 2227#else 2228 FMUL ALPHA, c01, c01 2229 FMUL ALPHA, c03, c03 2230#endif 2231 2232 STF c01, [C1 + 0 * SIZE] 2233 STF c03, [C2 + 0 * SIZE] 2234 2235#ifdef TRMMKERNEL 2236#if ( defined(LEFT) && defined(TRANSA)) || \ 2237 (!defined(LEFT) && !defined(TRANSA)) 2238 sub K, KK, TEMP1 2239#ifdef LEFT 2240 add TEMP1, -1, TEMP1 2241#else 2242 add TEMP1, -2, TEMP1 2243#endif 2244 sll TEMP1, BASE_SHIFT + 0, TEMP2 2245 sll TEMP1, BASE_SHIFT + 1, TEMP1 2246 2247 add AO, TEMP2, AO 2248 add BO, TEMP1, BO 2249#endif 2250 2251#ifdef LEFT 2252 add KK, 1, KK 2253#endif 2254#endif 2255 .align 4 2256 2257.LL69: 2258#if defined(TRMMKERNEL) && !defined(LEFT) 2259 add KK, 2, KK 2260#endif 2261 mov BO, B 2262 .align 4 2263 2264.LL70: 2265 and N, 1, J 2266 cmp J, 0 2267 ble,pn %icc, .LL999 2268 mov C, C1 2269 2270#if defined(TRMMKERNEL) && defined(LEFT) 2271 mov OFFSET, KK 2272#endif 2273 2274 sra M, 1, I 2275 cmp I, 0 2276 ble,pn %icc, .LL80 2277 mov A, AO 2278 .align 4 2279 2280.LL72: 2281#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 2282 mov B, BO 2283#else 2284 sll KK, BASE_SHIFT + 1, TEMP1 2285 sll KK, BASE_SHIFT + 0, TEMP2 2286 2287 add AO, TEMP1, AO 2288 add B, TEMP2, BO 2289#endif 2290 2291 LDF [AO + 0 * SIZE], a1 2292 LDF [AO + 1 * SIZE], a2 2293 LDF [AO + 2 * SIZE], a3 2294 LDF [AO + 3 * SIZE], a4 2295 2296 LDF [BO + 0 * SIZE], b1 2297 LDF [BO + 1 * SIZE], b2 2298 LDF [BO + 2 * SIZE], b3 2299 FCLR (cc01) 2300 LDF [BO + 3 * SIZE], b4 2301 FCLR (cc02) 2302 2303 prefetch [C1 + 2 * SIZE], 3 2304 2305#ifndef TRMMKERNEL 2306 sra K, 2, L 2307#else 2308#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2309 sub K, KK, L 2310#elif defined(LEFT) 2311 add KK, 2, L 2312#else 2313 add KK, 1, L 2314#endif 2315 sra L, 2, L 2316#endif 2317 cmp L, 0 2318 ble,pn %icc, .LL75 2319 nop 2320 2321.LL73: 2322 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2323 add L, -1, L 2324 2325 FMADD (aa1, bb1, cc01, cc01) 2326 LDF [AO + 4 * SIZE], a1 2327 FMADD (aa2, bb1, cc02, cc02) 2328 LDF [AO + 5 * SIZE], a2 2329 2330 LDF [BO + 4 * SIZE], b1 2331 cmp L, 0 2332 2333 FMADD (aa3, bb2, cc01, cc01) 2334 LDF [AO + 6 * SIZE], a3 2335 FMADD (aa4, bb2, cc02, cc02) 2336 LDF [AO + 7 * SIZE], a4 2337 2338 LDF [BO + 5 * SIZE], b2 2339 add BO, 4 * SIZE, BO 2340 2341 FMADD (aa1, bb3, cc01, cc01) 2342 LDF [AO + 8 * SIZE], a1 2343 FMADD (aa2, bb3, cc02, cc02) 2344 LDF [AO + 9 * SIZE], a2 2345 2346 LDF [BO + 2 * SIZE], b3 2347 add AO, 8 * SIZE, AO 2348 2349 FMADD (aa3, bb4, cc01, cc01) 2350 LDF [AO + 2 * SIZE], a3 2351 FMADD (aa4, bb4, cc02, cc02) 2352 LDF [AO + 3 * SIZE], a4 2353 2354 bg,pt %icc, .LL73 2355 LDF [BO + 3 * SIZE], b4 2356 .align 4 2357 2358.LL75: 2359#ifndef TRMMKERNEL 2360 and K, 3, L 2361#else 2362#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2363 sub K, KK, L 2364#elif defined(LEFT) 2365 add KK, 2, L 2366#else 2367 add KK, 1, L 2368#endif 2369 and L, 3, L 2370#endif 2371 cmp L, 0 2372 ble,a,pn %icc, .LL78 2373 nop 2374 .align 4 2375 2376.LL77: 2377 FMADD (aa1, bb1, cc01, cc01) 2378 LDF [AO + 2 * SIZE], a1 2379 FMADD (aa2, bb1, cc02, cc02) 2380 LDF [AO + 3 * SIZE], a2 2381 2382 LDF [BO + 1 * SIZE], b1 2383 add L, -1, L 2384 add AO, 2 * SIZE, AO 2385 cmp L, 0 2386 bg,pt %icc, .LL77 2387 add BO, 1 * SIZE, BO 2388 .align 4 2389 2390.LL78: 2391#ifndef TRMMKERNEL 2392 LDF [C1 + 0 * SIZE], a1 2393 LDF [C1 + 1 * SIZE], a2 2394 2395 FMADD (alpha, cc01, aa1, cc01) 2396 FMADD (alpha, cc02, aa2, cc02) 2397#else 2398 FMUL ALPHA, c01, c01 2399 FMUL ALPHA, c02, c02 2400#endif 2401 2402 STF c01, [C1 + 0 * SIZE] 2403 add I, -1, I 2404 STF c02, [C1 + 1 * SIZE] 2405 cmp I, 0 2406 2407#ifdef TRMMKERNEL 2408#if ( defined(LEFT) && defined(TRANSA)) || \ 2409 (!defined(LEFT) && !defined(TRANSA)) 2410 sub K, KK, TEMP1 2411#ifdef LEFT 2412 add TEMP1, -2, TEMP1 2413#else 2414 add TEMP1, -1, TEMP1 2415#endif 2416 sll TEMP1, BASE_SHIFT + 1, TEMP2 2417 sll TEMP1, BASE_SHIFT + 0, TEMP1 2418 2419 add AO, TEMP2, AO 2420 add BO, TEMP1, BO 2421#endif 2422 2423#ifdef LEFT 2424 add KK, 2, KK 2425#endif 2426#endif 2427 2428 bg,pt %icc, .LL72 2429 add C1, 2 * SIZE, C1 2430 .align 4 2431 2432.LL80: 2433 and M, 1, I 2434 cmp I, 0 2435 ble,pn %icc, .LL999 2436 nop 2437 2438#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) 2439 mov B, BO 2440#else 2441 sll KK, BASE_SHIFT + 0, TEMP1 2442 sll KK, BASE_SHIFT + 0, TEMP2 2443 2444 add AO, TEMP1, AO 2445 add B, TEMP2, BO 2446#endif 2447 2448 LDF [AO + 0 * SIZE], a1 2449 LDF [BO + 0 * SIZE], b1 2450 LDF [AO + 1 * SIZE], a2 2451 LDF [BO + 1 * SIZE], b2 2452 LDF [AO + 2 * SIZE], a3 2453 LDF [BO + 2 * SIZE], b3 2454 LDF [AO + 3 * SIZE], a4 2455 LDF [BO + 3 * SIZE], b4 2456 2457#ifndef TRMMKERNEL 2458 sra K, 2, L 2459#else 2460#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2461 sub K, KK, L 2462#elif defined(LEFT) 2463 add KK, 1, L 2464#else 2465 add KK, 1, L 2466#endif 2467 sra L, 2, L 2468#endif 2469 cmp L, 0 2470 ble,pn %icc, .LL85 2471 FCLR (cc01) 2472 .align 4 2473 2474.LL83: 2475 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2476 add L, -1, L 2477 2478 FMADD (aa1, bb1, cc01, cc01) 2479 LDF [AO + 4 * SIZE], a1 2480 LDF [BO + 4 * SIZE], b1 2481 2482 FMADD (aa2, bb2, cc01, cc01) 2483 LDF [AO + 5 * SIZE], a2 2484 LDF [BO + 5 * SIZE], b2 2485 2486 FMADD (aa3, bb3, cc01, cc01) 2487 LDF [AO + 6 * SIZE], a3 2488 LDF [BO + 6 * SIZE], b3 2489 2490 FMADD (aa4, bb4, cc01, cc01) 2491 LDF [AO + 7 * SIZE], a4 2492 LDF [BO + 7 * SIZE], b4 2493 2494 add AO, 4 * SIZE, AO 2495 cmp L, 0 2496 2497 bg,pt %icc, .LL83 2498 add BO, 4 * SIZE, BO 2499 .align 4 2500 2501.LL85: 2502#ifndef TRMMKERNEL 2503 and K, 3, L 2504#else 2505#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2506 sub K, KK, L 2507#elif defined(LEFT) 2508 add KK, 1, L 2509#else 2510 add KK, 1, L 2511#endif 2512 and L, 3, L 2513#endif 2514 cmp L, 0 2515 ble,a,pn %icc, .LL88 2516 nop 2517 .align 4 2518 2519.LL87: 2520 FMADD (aa1, bb1, cc01, cc01) 2521 LDF [AO + 1 * SIZE], a1 2522 LDF [BO + 1 * SIZE], b1 2523 2524 add AO, 1 * SIZE, AO 2525 add L, -1, L 2526 cmp L, 0 2527 bg,pt %icc, .LL87 2528 add BO, 1 * SIZE, BO 2529 .align 4 2530 2531.LL88: 2532#ifndef TRMMKERNEL 2533 LDF [C1 + 0 * SIZE], a1 2534 2535 FMADD (alpha, cc01, aa1, cc01) 2536#else 2537 FMUL ALPHA, c01, c01 2538#endif 2539 2540 STF c01, [C1 + 0 * SIZE] 2541 .align 4 2542 2543.LL999: 2544#ifdef TRMMKERNEL 2545#ifndef __64BIT__ 2546 ld [%sp + STACK_START + 8], %g1 2547 ld [%sp + STACK_START + 12], %g2 2548 ld [%sp + STACK_START + 16], %g3 2549 ld [%sp + STACK_START + 20], %g4 2550#else 2551 ldx [%sp + STACK_START + 32], %g1 2552 ldx [%sp + STACK_START + 40], %g2 2553 ldx [%sp + STACK_START + 48], %g3 2554 ldx [%sp + STACK_START + 56], %g4 2555#endif 2556#endif 2557 2558 return %i7 + 8 2559 clr %o0 2560 2561 EPILOGUE 2562