1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. */ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define M %i0 43#define N %i1 44#define K %i2 45 46#if defined(DOUBLE) && !defined(__64BIT__) 47#define A %i5 48#define B %i4 49#else 50#define A %i4 51#define B %i5 52#endif 53 54#define C %o4 55#define LDC %o5 56 57#define AO %l0 58#define BO %l1 59#define I %l2 60#define J %l3 61#define L %l4 62 63#define C1 %o0 64#define C2 %o1 65#define C3 %o2 66#define C4 %o3 67 68#define OFFSET %l5 69#define KK %l6 70#define TEMP1 %l7 71#define TEMP2 %i3 72 73#ifdef DOUBLE 74#define c01 %f0 75#define c02 %f2 76#define c03 %f4 77#define c04 %f6 78#define c05 %f8 79#define c06 %f10 80#define c07 %f12 81#define c08 %f14 82#define c09 %f16 83#define c10 %f18 84#define c11 %f20 85#define c12 %f22 86#define c13 %f24 87#define c14 %f26 88#define c15 %f28 89#define c16 %f30 90 91#define t1 %f32 92#define t2 %f34 93#define t3 %f36 94#define t4 %f38 95 96#define a1 %f40 97#define a2 %f42 98#define a3 %f44 99#define a4 %f46 100#define a5 %f58 101 102#define b1 %f48 103#define b2 %f50 104#define b3 %f52 105#define b4 %f54 106#define b5 %f56 107 108#define FZERO %f60 109#define ALPHA %f62 110#else 111#define c01 %f0 112#define c02 %f1 113#define c03 %f2 114#define c04 %f3 115#define c05 %f4 116#define c06 %f5 117#define c07 %f6 118#define c08 %f7 119#define c09 %f8 120#define c10 %f9 121#define c11 %f10 122#define c12 %f11 123#define c13 %f12 124#define c14 %f13 125#define c15 %f14 126#define c16 %f15 127 128#define t1 %f16 129#define t2 %f17 130#define t3 %f18 131#define t4 %f19 132 133#define a1 %f20 134#define a2 %f21 135#define a3 %f22 136#define a4 %f23 137#define a5 %f31 138 139#define b1 %f24 140#define b2 %f25 141#define b3 %f26 142#define b4 %f27 143#define b5 %f28 144 145#define FZERO %f29 146#define ALPHA %f30 147#endif 148 149 PROLOGUE 150 SAVESP 151 nop 152 153#ifndef __64BIT__ 154 155#ifdef DOUBLE 156 st %i3, [%sp + STACK_START + 16] /* ALPHA */ 157 st %i4, [%sp + STACK_START + 20] 158 159 ld [%sp + STACK_START + 28], B 160 ld [%sp + STACK_START + 32], C 161 ld [%sp + STACK_START + 36], LDC 162#ifdef TRMMKERNEL 163 ld [%sp + STACK_START + 40], OFFSET 164#endif 165#else 166 st %i3, [%sp + STACK_START + 16] /* ALPHA */ 167 168 ld [%sp + STACK_START + 28], C 169 ld [%sp + STACK_START + 32], LDC 170#ifdef TRMMKERNEL 171 ld [%sp + STACK_START + 36], OFFSET 172#endif 173#endif 174 LDF [%sp + STACK_START + 16], ALPHA 175#else 176 ldx [%sp+ STACK_START + 56], C 177 ldx [%sp+ STACK_START + 64], LDC 178#ifdef TRMMKERNEL 179 ldx [%sp+ STACK_START + 72], OFFSET 180#endif 181#ifdef DOUBLE 182 FMOV %f6, ALPHA 183#else 184 FMOV %f7, ALPHA 185#endif 186#endif 187 188 FCLR(29) 189 190#if defined(TRMMKERNEL) && !defined(LEFT) 191 neg OFFSET, KK 192#endif 193 194 sra N, 2, J 195 cmp J, 0 196 ble,pn %icc, .LL100 197 sll LDC, BASE_SHIFT, LDC 198 199.LL11: 200 add C, LDC, C2 201 FMOV FZERO, t1 202 nop 203 mov C, C1 204 205 add C2, LDC, C3 206 FMOV FZERO, t2 207 sra K, 2, L 208 mov A, AO 209 210 sra M, 2, I 211 add C3, LDC, C4 212 FMOV FZERO, t3 213 214#if defined(TRMMKERNEL) && defined(LEFT) 215 mov OFFSET, KK 216#endif 217 218 cmp I, 0 219 add C4, LDC, C 220 FMOV FZERO, t4 221 222 ble,pn %icc, .LL50 223 FMOV FZERO, c01 224 225.LL21: 226#if !defined(TRMMKERNEL) 227 FMOV FZERO, c02 228 mov B, BO 229 230 FMOV FZERO, c03 231 cmp L, 0 232#else 233 FMOV FZERO, c02 234 FMOV FZERO, c03 235 236#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 237 mov B, BO 238#else 239 sll KK, 2 + BASE_SHIFT, TEMP1 240 241 add AO, TEMP1, AO 242 add B, TEMP1, BO 243#endif 244 245#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 246 sub K, KK, L 247#elif defined(LEFT) 248 add KK, 4, L 249#else 250 add KK, 4, L 251#endif 252 sra L, 2, L 253 cmp L, 0 254#endif 255 256 LDF [AO + 0 * SIZE], a1 257 FMOV FZERO, c04 258 LDF [BO + 0 * SIZE], b1 259 FMOV FZERO, c05 260 LDF [AO + 1 * SIZE], a2 261 FMOV FZERO, c06 262 LDF [BO + 1 * SIZE], b2 263 FMOV FZERO, c07 264 265 LDF [AO + 2 * SIZE], a3 266 FMOV FZERO, c08 267 LDF [BO + 2 * SIZE], b3 268 FMOV FZERO, c09 269 LDF [AO + 3 * SIZE], a4 270 FMOV FZERO, c10 271 LDF [BO + 3 * SIZE], b4 272 FMOV FZERO, c11 273 LDF [BO + 4 * SIZE], b5 /* ***** */ 274 275 LDF [AO + 4 * SIZE], a5 /* ***** */ 276 277 prefetch [C1 + 3 * SIZE], 3 278 FMOV FZERO, c12 279 prefetch [C2 + 3 * SIZE], 3 280 FMOV FZERO, c13 281 prefetch [C3 + 3 * SIZE], 3 282 FMOV FZERO, c14 283 prefetch [C4 + 3 * SIZE], 3 284 FMOV FZERO, c15 285 286 ble,pn %icc, .LL25 287 FMOV FZERO, c16 288 289 290#define APREFETCHSIZE 40 291#define BPREFETCHSIZE 40 292 293#define APREFETCH_CATEGORY 0 294#define BPREFETCH_CATEGORY 0 295 296.LL22: 297 FADD c04, t1, c04 298 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY 299 FMUL a1, b1, t1 300 nop 301 302 FADD c08, t2, c08 303 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY 304 FMUL a1, b2, t2 305 add AO, 16 * SIZE, AO 306 307 FADD c12, t3, c12 308 LDF [AO - 13 * SIZE], a4 309 FMUL a1, b3, t3 310 add BO, 16 * SIZE, BO 311 312 FADD c16, t4, c16 313 nop 314 FMUL a1, b4, t4 315 LDF [AO - 8 * SIZE], a1 316 317 FADD c01, t1, c01 318 nop 319 FMUL a2, b1, t1 320 nop 321 322 FADD c05, t2, c05 323 nop 324 FMUL a2, b2, t2 325 nop 326 327 FADD c09, t3, c09 328 nop 329 FMUL a2, b3, t3 330 nop 331 332 FADD c13, t4, c13 333 add L, -1, L 334 FMUL a2, b4, t4 335 LDF [AO - 11 * SIZE], a2 336 337 FADD c02, t1, c02 338 nop 339 FMUL a3, b1, t1 340 nop 341 342 FADD c06, t2, c06 343 nop 344 FMUL a3, b2, t2 345 nop 346 347 FADD c10, t3, c10 348 nop 349 FMUL a3, b3, t3 350 nop 351 352 FADD c14, t4, c14 353 nop 354 FMUL a3, b4, t4 355 LDF [AO - 10 * SIZE], a3 356 357 FADD c03, t1, c03 358 nop 359 FMUL a4, b1, t1 360 LDF [BO - 8 * SIZE], b1 361 362 FADD c07, t2, c07 363 nop 364 FMUL a4, b2, t2 365 LDF [BO - 11 * SIZE], b2 366 367 FADD c11, t3, c11 368 nop 369 FMUL a4, b3, t3 370 LDF [BO - 10 * SIZE], b3 371 372 FADD c15, t4, c15 373 nop 374 FMUL a4, b4, t4 375 LDF [BO - 9 * SIZE], b4 376 377 FADD c04, t1, c04 378 nop 379 FMUL a5, b5, t1 380 LDF [AO - 9 * SIZE], a4 381 382 FADD c08, t2, c08 383 nop 384 FMUL a5, b2, t2 385 nop 386 387 FADD c12, t3, c12 388 nop 389 FMUL a5, b3, t3 390 nop 391 392 FADD c16, t4, c16 393 nop 394 FMUL a5, b4, t4 395 LDF [AO - 4 * SIZE], a5 396 397 FADD c01, t1, c01 398 nop 399 FMUL a2, b5, t1 400 nop 401 402 FADD c05, t2, c05 403 nop 404 FMUL a2, b2, t2 405 nop 406 407 FADD c09, t3, c09 408 nop 409 FMUL a2, b3, t3 410 nop 411 412 FADD c13, t4, c13 413 nop 414 FMUL a2, b4, t4 415 LDF [AO - 7 * SIZE], a2 416 417 FADD c02, t1, c02 418 nop 419 FMUL a3, b5, t1 420 nop 421 422 FADD c06, t2, c06 423 nop 424 FMUL a3, b2, t2 425 nop 426 427 FADD c10, t3, c10 428 nop 429 FMUL a3, b3, t3 430 nop 431 432 FADD c14, t4, c14 433 nop 434 FMUL a3, b4, t4 435 LDF [AO - 6 * SIZE], a3 436 437 FADD c03, t1, c03 438 nop 439 FMUL a4, b5, t1 440 LDF [BO - 4 * SIZE], b5 441 442 FADD c07, t2, c07 443 nop 444 FMUL a4, b2, t2 445 LDF [BO - 7 * SIZE], b2 446 447 FADD c11, t3, c11 448 nop 449 FMUL a4, b3, t3 450 LDF [BO - 6 * SIZE], b3 451 452 FADD c15, t4, c15 453 nop 454 FMUL a4, b4, t4 455 LDF [BO - 5 * SIZE], b4 456 457 FADD c04, t1, c04 458 nop 459 FMUL a1, b1, t1 460 LDF [AO - 5 * SIZE], a4 461 462 FADD c08, t2, c08 463 nop 464 FMUL a1, b2, t2 465 nop 466 467 FADD c12, t3, c12 468 nop 469 FMUL a1, b3, t3 470 nop 471 472 FADD c16, t4, c16 473 nop 474 FMUL a1, b4, t4 475 LDF [AO - 0 * SIZE], a1 476 477 FADD c01, t1, c01 478 nop 479 FMUL a2, b1, t1 480 nop 481 482#ifdef DOUBLE 483 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 484#else 485 nop 486#endif 487 FADD c05, t2, c05 488 nop 489 FMUL a2, b2, t2 490 491 FADD c09, t3, c09 492 nop 493 FMUL a2, b3, t3 494 nop 495 496 FADD c13, t4, c13 497 nop 498 FMUL a2, b4, t4 499 nop 500 501 FADD c02, t1, c02 502 nop 503 FMUL a3, b1, t1 504 LDF [AO - 3 * SIZE], a2 505 506 FADD c06, t2, c06 507#ifdef DOUBLE 508 prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY 509#else 510 nop 511#endif 512 FMUL a3, b2, t2 513 nop 514 515 FADD c10, t3, c10 516 nop 517 FMUL a3, b3, t3 518 nop 519 520 FADD c14, t4, c14 521 nop 522 FMUL a3, b4, t4 523 LDF [AO - 2 * SIZE], a3 524 525 FADD c03, t1, c03 526 nop 527 FMUL a4, b1, t1 528 LDF [BO - 0 * SIZE], b1 529 530 FADD c07, t2, c07 531 nop 532 FMUL a4, b2, t2 533 LDF [BO - 3 * SIZE], b2 534 535 FADD c11, t3, c11 536 nop 537 FMUL a4, b3, t3 538 LDF [BO - 2 * SIZE], b3 539 540 FADD c15, t4, c15 541 nop 542 FMUL a4, b4, t4 543 LDF [BO - 1 * SIZE], b4 544 545 FADD c04, t1, c04 546 nop 547 FMUL a5, b5, t1 548 LDF [AO - 1 * SIZE], a4 549 550 FADD c08, t2, c08 551 FMUL a5, b2, t2 552 FADD c12, t3, c12 553 FMUL a5, b3, t3 554 555 FADD c16, t4, c16 556 nop 557 FMUL a5, b4, t4 558 LDF [AO + 4 * SIZE], a5 559 560 FADD c01, t1, c01 561 nop 562 FMUL a2, b5, t1 563 nop 564 565 FADD c05, t2, c05 566 nop 567 FMUL a2, b2, t2 568 nop 569 570 FADD c09, t3, c09 571 nop 572 FMUL a2, b3, t3 573 nop 574 575 FADD c13, t4, c13 576 nop 577 FMUL a2, b4, t4 578 LDF [AO + 1 * SIZE], a2 579 580 FADD c02, t1, c02 581 nop 582 FMUL a3, b5, t1 583 nop 584 585 FADD c06, t2, c06 586 nop 587 FMUL a3, b2, t2 588 nop 589 590 FADD c10, t3, c10 591 nop 592 FMUL a3, b3, t3 593 nop 594 595 FADD c14, t4, c14 596 nop 597 FMUL a3, b4, t4 598 LDF [AO + 2 * SIZE], a3 599 600 FADD c03, t1, c03 601 cmp L, 0 602 FMUL a4, b5, t1 603 LDF [BO + 4 * SIZE], b5 604 605 FADD c07, t2, c07 606 nop 607 FMUL a4, b2, t2 608 LDF [BO + 1 * SIZE], b2 609 610 FADD c11, t3, c11 611 nop 612 FMUL a4, b3, t3 613 LDF [BO + 2 * SIZE], b3 614 615 FADD c15, t4, c15 616 FMUL a4, b4, t4 617 bg,pt %icc, .LL22 618 LDF [BO + 3 * SIZE], b4 619 620.LL25: 621#ifndef TRMMKERNEL 622 and K, 3, L 623#else 624#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 625 sub K, KK, L 626#elif defined(LEFT) 627 add KK, 4, L 628#else 629 add KK, 4, L 630#endif 631 and L, 3, L 632#endif 633 cmp L, 0 634 ble,a,pn %icc, .LL29 635 nop 636 637.LL26: 638 FADD c04, t1, c04 639 LDF [AO + 3 * SIZE], a4 640 FMUL a1, b1, t1 641 add AO, 4 * SIZE, AO 642 643 FADD c08, t2, c08 644 add BO, 4 * SIZE, BO 645 FMUL a1, b2, t2 646 add L, -1, L 647 648 FADD c12, t3, c12 649 nop 650 FMUL a1, b3, t3 651 cmp L, 0 652 653 FADD c16, t4, c16 654 nop 655 FMUL a1, b4, t4 656 LDF [AO + 0 * SIZE], a1 657 658 FADD c01, t1, c01 659 nop 660 FMUL a2, b1, t1 661 nop 662 663 FADD c05, t2, c05 664 nop 665 FMUL a2, b2, t2 666 nop 667 668 FADD c09, t3, c09 669 nop 670 FMUL a2, b3, t3 671 nop 672 673 FADD c13, t4, c13 674 nop 675 FMUL a2, b4, t4 676 LDF [AO + 1 * SIZE], a2 677 678 FADD c02, t1, c02 679 nop 680 FMUL a3, b1, t1 681 nop 682 683 FADD c06, t2, c06 684 nop 685 FMUL a3, b2, t2 686 nop 687 688 FADD c10, t3, c10 689 nop 690 FMUL a3, b3, t3 691 nop 692 693 FADD c14, t4, c14 694 nop 695 FMUL a3, b4, t4 696 LDF [AO + 2 * SIZE], a3 697 698 FADD c03, t1, c03 699 nop 700 FMUL a4, b1, t1 701 LDF [BO + 0 * SIZE], b1 702 703 FADD c07, t2, c07 704 nop 705 FMUL a4, b2, t2 706 LDF [BO + 1 * SIZE], b2 707 708 FADD c11, t3, c11 709 nop 710 FMUL a4, b3, t3 711 LDF [BO + 2 * SIZE], b3 712 713 FADD c15, t4, c15 714 FMUL a4, b4, t4 715 bg,pt %icc, .LL26 716 LDF [BO + 3 * SIZE], b4 717 718.LL29: 719#ifndef TRMMKERNEL 720 FADD c04, t1, c04 721 add I, -1, I 722 FMUL c01, ALPHA, c01 723 LDF [C1 + 0 * SIZE], a1 724 725 FADD c08, t2, c08 726 cmp I, 0 727 FMUL c02, ALPHA, c02 728 LDF [C1 + 1 * SIZE], a2 729 730 FADD c12, t3, c12 731 nop 732 FMUL c03, ALPHA, c03 733 LDF [C1 + 2 * SIZE], a3 734 735 FADD c16, t4, c16 736 nop 737 FMUL c04, ALPHA, c04 738 LDF [C1 + 3 * SIZE], a4 739 740 FMUL c05, ALPHA, c05 741 LDF [C2 + 0 * SIZE], b1 742 FMUL c06, ALPHA, c06 743 LDF [C2 + 1 * SIZE], b2 744 745 FMUL c07, ALPHA, c07 746 LDF [C2 + 2 * SIZE], b3 747 FMUL c08, ALPHA, c08 748 LDF [C2 + 3 * SIZE], b4 749 750 FMUL c09, ALPHA, c09 751 LDF [C3 + 0 * SIZE], t1 752 FMUL c10, ALPHA, c10 753 LDF [C3 + 1 * SIZE], t2 754 755 FMUL c11, ALPHA, c11 756 LDF [C3 + 2 * SIZE], t3 757 FMUL c12, ALPHA, c12 758 LDF [C3 + 3 * SIZE], t4 759 760 FMUL c13, ALPHA, c13 761 add C1, 4 * SIZE, C1 762 FADD c01, a1, c01 763 LDF [C4 + 0 * SIZE], a1 764 765 FMUL c14, ALPHA, c14 766 add C2, 4 * SIZE, C2 767 FADD c02, a2, c02 768 LDF [C4 + 1 * SIZE], a2 769 770 FMUL c15, ALPHA, c15 771 add C3, 4 * SIZE, C3 772 FADD c03, a3, c03 773 LDF [C4 + 2 * SIZE], a3 774 775 FMUL c16, ALPHA, c16 776 nop 777 FADD c04, a4, c04 778 LDF [C4 + 3 * SIZE], a4 779 780 STF c01, [C1 - 4 * SIZE] 781 FADD c05, b1, c05 782 STF c02, [C1 - 3 * SIZE] 783 FADD c06, b2, c06 784 785 STF c03, [C1 - 2 * SIZE] 786 FADD c07, b3, c07 787 STF c04, [C1 - 1 * SIZE] 788 FADD c08, b4, c08 789 790 STF c05, [C2 - 4 * SIZE] 791 FADD c09, t1, c09 792 STF c06, [C2 - 3 * SIZE] 793 FADD c10, t2, c10 794 795 STF c07, [C2 - 2 * SIZE] 796 FADD c11, t3, c11 797 STF c08, [C2 - 1 * SIZE] 798 FADD c12, t4, c12 799 800 STF c09, [C3 - 4 * SIZE] 801 FADD c13, a1, c13 802 STF c10, [C3 - 3 * SIZE] 803 FADD c14, a2, c14 804 805 STF c11, [C3 - 2 * SIZE] 806 FADD c15, a3, c15 807 STF c12, [C3 - 1 * SIZE] 808 FADD c16, a4, c16 809 810 STF c13, [C4 + 0 * SIZE] 811 FMOV FZERO, t1 812 STF c14, [C4 + 1 * SIZE] 813 FMOV FZERO, t2 814 815 STF c15, [C4 + 2 * SIZE] 816 FMOV FZERO, t3 817 STF c16, [C4 + 3 * SIZE] 818 FMOV FZERO, t4 819 820 add C4, 4 * SIZE, C4 821#else 822 823 FADD c04, t1, c04 824 FMUL c01, ALPHA, c01 825 FADD c08, t2, c08 826 FMUL c02, ALPHA, c02 827 FADD c12, t3, c12 828 FMUL c03, ALPHA, c03 829 FADD c16, t4, c16 830 FMUL c04, ALPHA, c04 831 832 STF c01, [C1 + 0 * SIZE] 833 FMUL c05, ALPHA, c05 834 STF c02, [C1 + 1 * SIZE] 835 FMUL c06, ALPHA, c06 836 STF c03, [C1 + 2 * SIZE] 837 FMUL c07, ALPHA, c07 838 STF c04, [C1 + 3 * SIZE] 839 FMUL c08, ALPHA, c08 840 841 STF c05, [C2 + 0 * SIZE] 842 FMUL c09, ALPHA, c09 843 STF c06, [C2 + 1 * SIZE] 844 FMUL c10, ALPHA, c10 845 STF c07, [C2 + 2 * SIZE] 846 FMUL c11, ALPHA, c11 847 STF c08, [C2 + 3 * SIZE] 848 FMUL c12, ALPHA, c12 849 850 STF c09, [C3 + 0 * SIZE] 851 FMUL c13, ALPHA, c13 852 STF c10, [C3 + 1 * SIZE] 853 FMUL c14, ALPHA, c14 854 STF c11, [C3 + 2 * SIZE] 855 FMUL c15, ALPHA, c15 856 STF c12, [C3 + 3 * SIZE] 857 FMUL c16, ALPHA, c16 858 859 STF c13, [C4 + 0 * SIZE] 860 STF c14, [C4 + 1 * SIZE] 861 STF c15, [C4 + 2 * SIZE] 862 STF c16, [C4 + 3 * SIZE] 863 864 FMOV FZERO, t1 865 FMOV FZERO, t2 866 FMOV FZERO, t3 867 FMOV FZERO, t4 868 869 add C1, 4 * SIZE, C1 870 add C2, 4 * SIZE, C2 871 add C3, 4 * SIZE, C3 872 add C4, 4 * SIZE, C4 873 874#if ( defined(LEFT) && defined(TRANSA)) || \ 875 (!defined(LEFT) && !defined(TRANSA)) 876 sub K, KK, TEMP1 877#ifdef LEFT 878 add TEMP1, -4, TEMP1 879#else 880 add TEMP1, -4, TEMP1 881#endif 882 sll TEMP1, 2 + BASE_SHIFT, TEMP1 883 884 add AO, TEMP1, AO 885 add BO, TEMP1, BO 886#endif 887 888#ifdef LEFT 889 add KK, 4, KK 890#endif 891 892 add I, -1, I 893 cmp I, 0 894 895#endif 896 897 sra K, 2, L 898 bg,pt %icc, .LL21 899 FMOV FZERO, c01 900 901.LL50: 902 and M, 2, I 903 FMOV FZERO, c02 904 cmp I, 0 905 906 FMOV FZERO, t1 907 ble,pn %icc, .LL70 908 FMOV FZERO, c04 909 910#if !defined(TRMMKERNEL) 911 LDF [AO + 0 * SIZE], a1 912 sra K, 2, L 913 FMOV FZERO, t2 914 LDF [B + 0 * SIZE], b1 915 mov B, BO 916 FMOV FZERO, c06 917 LDF [AO + 1 * SIZE], a2 918 cmp L, 0 919 FMOV FZERO, t3 920 LDF [B + 1 * SIZE], b2 921 FMOV FZERO, c08 922 LDF [AO + 2 * SIZE], a3 923 FMOV FZERO, t4 924 LDF [B + 2 * SIZE], b3 925 FMOV FZERO, c01 926 LDF [AO + 3 * SIZE], a4 927 FMOV FZERO, c03 928 LDF [B + 3 * SIZE], b4 929 FMOV FZERO, c05 930#else 931 932#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 933 mov B, BO 934#else 935 sll KK, 1 + BASE_SHIFT, TEMP1 936 sll KK, 2 + BASE_SHIFT, TEMP2 937 938 add AO, TEMP1, AO 939 add B, TEMP2, BO 940#endif 941 942#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 943 sub K, KK, L 944#elif defined(LEFT) 945 add KK, 2, L 946#else 947 add KK, 4, L 948#endif 949 sra L, 2, L 950 cmp L, 0 951 952 LDF [AO + 0 * SIZE], a1 953 FMOV FZERO, t2 954 LDF [BO + 0 * SIZE], b1 955 FMOV FZERO, c06 956 957 LDF [AO + 1 * SIZE], a2 958 FMOV FZERO, t3 959 LDF [BO + 1 * SIZE], b2 960 FMOV FZERO, c08 961 962 LDF [AO + 2 * SIZE], a3 963 FMOV FZERO, t4 964 LDF [BO + 2 * SIZE], b3 965 FMOV FZERO, c01 966 967 LDF [AO + 3 * SIZE], a4 968 FMOV FZERO, c03 969 LDF [BO + 3 * SIZE], b4 970 FMOV FZERO, c05 971 972#endif 973 ble,pn %icc, .LL55 974 FMOV FZERO, c07 975 976.LL52: 977 FADD c02, t1, c02 978 add AO, 8 * SIZE, AO 979 prefetch [AO + APREFETCHSIZE * SIZE], 0 980 981 FMUL a1, b1, t1 982 add BO, 16 * SIZE, BO 983 984 FADD c04, t2, c04 985 add L, -1, L 986 FMUL a1, b2, t2 987 988 FADD c06, t3, c06 989 cmp L, 0 990 FMUL a1, b3, t3 991 992 FADD c08, t4, c08 993 FMUL a1, b4, t4 994 LDF [AO - 4 * SIZE], a1 995 996 FADD c01, t1, c01 997 FMUL a2, b1, t1 998 LDF [BO - 12 * SIZE], b1 999 FADD c03, t2, c03 1000 FMUL a2, b2, t2 1001 LDF [BO - 11 * SIZE], b2 1002 1003 FADD c05, t3, c05 1004 FMUL a2, b3, t3 1005 LDF [BO - 10 * SIZE], b3 1006 FADD c07, t4, c07 1007 FMUL a2, b4, t4 1008 LDF [BO - 9 * SIZE], b4 1009 1010 FADD c02, t1, c02 1011 FMUL a3, b1, t1 1012 LDF [AO - 3 * SIZE], a2 1013 FADD c04, t2, c04 1014 FMUL a3, b2, t2 1015 1016 FADD c06, t3, c06 1017 FMUL a3, b3, t3 1018 FADD c08, t4, c08 1019 FMUL a3, b4, t4 1020 LDF [AO - 2 * SIZE], a3 1021 1022 FADD c01, t1, c01 1023 FMUL a4, b1, t1 1024 LDF [BO - 8 * SIZE], b1 1025 FADD c03, t2, c03 1026 FMUL a4, b2, t2 1027 LDF [BO - 7 * SIZE], b2 1028 1029 FADD c05, t3, c05 1030 FMUL a4, b3, t3 1031 LDF [BO - 6 * SIZE], b3 1032 FADD c07, t4, c07 1033 FMUL a4, b4, t4 1034 LDF [BO - 5 * SIZE], b4 1035 1036 FADD c02, t1, c02 1037 FMUL a1, b1, t1 1038 LDF [AO - 1 * SIZE], a4 1039 FADD c04, t2, c04 1040 FMUL a1, b2, t2 1041 1042 FADD c06, t3, c06 1043 FMUL a1, b3, t3 1044 FADD c08, t4, c08 1045 FMUL a1, b4, t4 1046 LDF [AO + 0 * SIZE], a1 1047 1048 FADD c01, t1, c01 1049 FMUL a2, b1, t1 1050 LDF [BO - 4 * SIZE], b1 1051 1052 FADD c03, t2, c03 1053 FMUL a2, b2, t2 1054 LDF [BO - 3 * SIZE], b2 1055 1056 FADD c05, t3, c05 1057 FMUL a2, b3, t3 1058 LDF [BO - 2 * SIZE], b3 1059 FADD c07, t4, c07 1060 FMUL a2, b4, t4 1061 LDF [BO - 1 * SIZE], b4 1062 1063 FADD c02, t1, c02 1064 FMUL a3, b1, t1 1065 LDF [AO + 1 * SIZE], a2 1066 FADD c04, t2, c04 1067 FMUL a3, b2, t2 1068 1069 FADD c06, t3, c06 1070 FMUL a3, b3, t3 1071 FADD c08, t4, c08 1072 FMUL a3, b4, t4 1073 LDF [AO + 2 * SIZE], a3 1074 1075 FADD c01, t1, c01 1076 FMUL a4, b1, t1 1077 LDF [BO + 0 * SIZE], b1 1078 FADD c03, t2, c03 1079 FMUL a4, b2, t2 1080 LDF [BO + 1 * SIZE], b2 1081 1082 FADD c05, t3, c05 1083 FMUL a4, b3, t3 1084 LDF [BO + 2 * SIZE], b3 1085 FADD c07, t4, c07 1086 FMUL a4, b4, t4 1087 LDF [BO + 3 * SIZE], b4 1088 1089 bg,pt %icc, .LL52 1090 LDF [AO + 3 * SIZE], a4 1091 1092.LL55: 1093#ifndef TRMMKERNEL 1094 and K, 3, L 1095#else 1096#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1097 sub K, KK, L 1098#elif defined(LEFT) 1099 add KK, 2, L 1100#else 1101 add KK, 4, L 1102#endif 1103 and L, 3, L 1104#endif 1105 cmp L, 0 1106 ble,a,pn %icc, .LL59 1107 nop 1108 1109.LL56: 1110 FADD c02, t1, c02 1111 add AO, 2 * SIZE, AO 1112 FMUL a1, b1, t1 1113 add L, -1, L 1114 1115 add BO, 4 * SIZE, BO 1116 FADD c04, t2, c04 1117 cmp L, 0 1118 FMUL a1, b2, t2 1119 1120 FADD c06, t3, c06 1121 FMUL a1, b3, t3 1122 FADD c08, t4, c08 1123 FMUL a1, b4, t4 1124 LDF [AO + 0 * SIZE], a1 1125 1126 FADD c01, t1, c01 1127 FMUL a2, b1, t1 1128 LDF [BO + 0 * SIZE], b1 1129 FADD c03, t2, c03 1130 FMUL a2, b2, t2 1131 LDF [BO + 1 * SIZE], b2 1132 1133 FADD c05, t3, c05 1134 FMUL a2, b3, t3 1135 LDF [BO + 2 * SIZE], b3 1136 FADD c07, t4, c07 1137 FMUL a2, b4, t4 1138 LDF [BO + 3 * SIZE], b4 1139 1140 bg,pt %icc, .LL56 1141 LDF [AO + 1 * SIZE], a2 1142 1143.LL59: 1144#ifndef TRMMKERNEL 1145 FADD c02, t1, c02 1146 FMUL c01, ALPHA, c01 1147 LDF [C1 + 0 * SIZE], a1 1148 FADD c04, t2, c04 1149 FMUL c03, ALPHA, c03 1150 LDF [C1 + 1 * SIZE], a2 1151 FADD c06, t3, c06 1152 FMUL c05, ALPHA, c05 1153 LDF [C2 + 0 * SIZE], a3 1154 FADD c08, t4, c08 1155 FMUL c07, ALPHA, c07 1156 LDF [C2 + 1 * SIZE], a4 1157 1158 FMUL c02, ALPHA, c02 1159 FADD c01, a1, c01 1160 LDF [C3 + 0 * SIZE], b1 1161 1162 FMUL c04, ALPHA, c04 1163 FADD c02, a2, c02 1164 LDF [C3 + 1 * SIZE], b2 1165 1166 FMUL c06, ALPHA, c06 1167 FADD c03, a3, c03 1168 LDF [C4 + 0 * SIZE], b3 1169 1170 FMUL c08, ALPHA, c08 1171 FADD c04, a4, c04 1172 LDF [C4 + 1 * SIZE], b4 1173 1174 STF c01, [C1 + 0 * SIZE] 1175 FADD c05, b1, c05 1176 STF c02, [C1 + 1 * SIZE] 1177 FADD c06, b2, c06 1178 add C1, 2 * SIZE, C1 1179 1180 STF c03, [C2 + 0 * SIZE] 1181 FADD c07, b3, c07 1182 STF c04, [C2 + 1 * SIZE] 1183 FADD c08, b4, c08 1184 add C2, 2 * SIZE, C2 1185 1186 STF c05, [C3 + 0 * SIZE] 1187 STF c06, [C3 + 1 * SIZE] 1188 add C3, 2 * SIZE, C3 1189 1190 STF c07, [C4 + 0 * SIZE] 1191 STF c08, [C4 + 1 * SIZE] 1192 add C4, 2 * SIZE, C4 1193#else 1194 1195 FADD c02, t1, c02 1196 FADD c04, t2, c04 1197 FADD c06, t3, c06 1198 FADD c08, t4, c08 1199 1200 FMUL c01, ALPHA, c01 1201 FMUL c03, ALPHA, c03 1202 FMUL c05, ALPHA, c05 1203 FMUL c07, ALPHA, c07 1204 1205 FMUL c02, ALPHA, c02 1206 FMUL c04, ALPHA, c04 1207 FMUL c06, ALPHA, c06 1208 FMUL c08, ALPHA, c08 1209 1210 STF c01, [C1 + 0 * SIZE] 1211 STF c02, [C1 + 1 * SIZE] 1212 1213 STF c03, [C2 + 0 * SIZE] 1214 STF c04, [C2 + 1 * SIZE] 1215 1216 STF c05, [C3 + 0 * SIZE] 1217 STF c06, [C3 + 1 * SIZE] 1218 1219 STF c07, [C4 + 0 * SIZE] 1220 STF c08, [C4 + 1 * SIZE] 1221 1222 add C1, 2 * SIZE, C1 1223 add C2, 2 * SIZE, C2 1224 add C3, 2 * SIZE, C3 1225 add C4, 2 * SIZE, C4 1226 1227#if ( defined(LEFT) && defined(TRANSA)) || \ 1228 (!defined(LEFT) && !defined(TRANSA)) 1229 sub K, KK, TEMP1 1230#ifdef LEFT 1231 add TEMP1, -2, TEMP1 1232#else 1233 add TEMP1, -4, TEMP1 1234#endif 1235 sll TEMP1, 1 + BASE_SHIFT, TEMP2 1236 sll TEMP1, 2 + BASE_SHIFT, TEMP1 1237 1238 add AO, TEMP2, AO 1239 add BO, TEMP1, BO 1240#endif 1241 1242#ifdef LEFT 1243 add KK, 2, KK 1244#endif 1245#endif 1246 1247.LL70: 1248 and M, 1, I 1249 cmp I, 0 1250 ble,pn %icc, .LL99 1251 nop 1252 1253.LL71: 1254#if !defined(TRMMKERNEL) 1255 LDF [AO + 0 * SIZE], a1 1256 sra K, 2, L 1257 FMOV FZERO, c01 1258 LDF [B + 0 * SIZE], b1 1259 mov B, BO 1260 FMOV FZERO, t1 1261 LDF [AO + 1 * SIZE], a2 1262 cmp L, 0 1263 FMOV FZERO, c02 1264 LDF [B + 1 * SIZE], b2 1265 FMOV FZERO, t2 1266 LDF [AO + 2 * SIZE], a3 1267 FMOV FZERO, c03 1268 LDF [B + 2 * SIZE], b3 1269 FMOV FZERO, t3 1270 LDF [AO + 3 * SIZE], a4 1271 FMOV FZERO, c04 1272 LDF [B + 3 * SIZE], b4 1273 FMOV FZERO, t4 1274#else 1275 1276#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1277 mov B, BO 1278#else 1279 sll KK, 0 + BASE_SHIFT, TEMP1 1280 sll KK, 2 + BASE_SHIFT, TEMP2 1281 1282 add AO, TEMP1, AO 1283 add B, TEMP2, BO 1284#endif 1285 1286#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1287 sub K, KK, L 1288#elif defined(LEFT) 1289 add KK, 1, L 1290#else 1291 add KK, 4, L 1292#endif 1293 sra L, 2, L 1294 cmp L, 0 1295 1296 LDF [AO + 0 * SIZE], a1 1297 FMOV FZERO, c01 1298 LDF [BO + 0 * SIZE], b1 1299 FMOV FZERO, t1 1300 1301 LDF [AO + 1 * SIZE], a2 1302 FMOV FZERO, c02 1303 LDF [BO + 1 * SIZE], b2 1304 FMOV FZERO, t2 1305 1306 LDF [AO + 2 * SIZE], a3 1307 FMOV FZERO, c03 1308 LDF [BO + 2 * SIZE], b3 1309 FMOV FZERO, t3 1310 1311 LDF [AO + 3 * SIZE], a4 1312 FMOV FZERO, c04 1313 LDF [BO + 3 * SIZE], b4 1314 FMOV FZERO, t4 1315#endif 1316 1317 ble,pn %icc, .LL75 1318 nop 1319 1320.LL72: 1321 FADD c01, t1, c01 1322 add L, -1, L 1323 FMUL a1, b1, t1 1324 LDF [BO + 4 * SIZE], b1 1325 1326 FADD c02, t2, c02 1327 cmp L, 0 1328 FMUL a1, b2, t2 1329 LDF [BO + 5 * SIZE], b2 1330 1331 FADD c03, t3, c03 1332 FMUL a1, b3, t3 1333 LDF [BO + 6 * SIZE], b3 1334 1335 FADD c04, t4, c04 1336 FMUL a1, b4, t4 1337 LDF [BO + 7 * SIZE], b4 1338 LDF [AO + 4 * SIZE], a1 1339 1340 FADD c01, t1, c01 1341 add AO, 4 * SIZE, AO 1342 FMUL a2, b1, t1 1343 LDF [BO + 8 * SIZE], b1 1344 1345 FADD c02, t2, c02 1346 FMUL a2, b2, t2 1347 LDF [BO + 9 * SIZE], b2 1348 1349 FADD c03, t3, c03 1350 FMUL a2, b3, t3 1351 LDF [BO + 10 * SIZE], b3 1352 1353 FADD c04, t4, c04 1354 FMUL a2, b4, t4 1355 LDF [BO + 11 * SIZE], b4 1356 LDF [AO + 1 * SIZE], a2 1357 1358 FADD c01, t1, c01 1359 FMUL a3, b1, t1 1360 LDF [BO + 12 * SIZE], b1 1361 1362 FADD c02, t2, c02 1363 FMUL a3, b2, t2 1364 LDF [BO + 13 * SIZE], b2 1365 1366 FADD c03, t3, c03 1367 FMUL a3, b3, t3 1368 LDF [BO + 14 * SIZE], b3 1369 1370 FADD c04, t4, c04 1371 FMUL a3, b4, t4 1372 LDF [BO + 15 * SIZE], b4 1373 LDF [AO + 2 * SIZE], a3 1374 1375 FADD c01, t1, c01 1376 FMUL a4, b1, t1 1377 LDF [BO + 16 * SIZE], b1 1378 1379 FADD c02, t2, c02 1380 FMUL a4, b2, t2 1381 LDF [BO + 17 * SIZE], b2 1382 1383 FADD c03, t3, c03 1384 FMUL a4, b3, t3 1385 LDF [BO + 18 * SIZE], b3 1386 1387 FADD c04, t4, c04 1388 FMUL a4, b4, t4 1389 LDF [BO + 19 * SIZE], b4 1390 1391 add BO, 16 * SIZE, BO 1392 bg,pt %icc, .LL72 1393 LDF [AO + 3 * SIZE], a4 1394 1395.LL75: 1396#ifndef TRMMKERNEL 1397 and K, 3, L 1398#else 1399#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1400 sub K, KK, L 1401#elif defined(LEFT) 1402 add KK, 1, L 1403#else 1404 add KK, 4, L 1405#endif 1406 and L, 3, L 1407#endif 1408 cmp L, 0 1409 ble,a,pn %icc, .LL79 1410 nop 1411 1412.LL76: 1413 FADD c01, t1, c01 1414 add AO, 1 * SIZE, AO 1415 FMUL a1, b1, t1 1416 LDF [BO + 4 * SIZE], b1 1417 1418 FADD c02, t2, c02 1419 add L, -1, L 1420 FMUL a1, b2, t2 1421 LDF [BO + 5 * SIZE], b2 1422 1423 FADD c03, t3, c03 1424 cmp L, 0 1425 FMUL a1, b3, t3 1426 LDF [BO + 6 * SIZE], b3 1427 1428 FADD c04, t4, c04 1429 add BO, 4 * SIZE, BO 1430 FMUL a1, b4, t4 1431 LDF [AO + 0 * SIZE], a1 1432 1433 bg,pt %icc, .LL76 1434 LDF [BO + 3 * SIZE], b4 1435 1436 1437.LL79: 1438#ifndef TRMMKERNEL 1439 FADD c01, t1, c01 1440 LDF [C1 + 0 * SIZE], a1 1441 FADD c02, t2, c02 1442 LDF [C2 + 0 * SIZE], a2 1443 FADD c03, t3, c03 1444 LDF [C3 + 0 * SIZE], a3 1445 FADD c04, t4, c04 1446 LDF [C4 + 0 * SIZE], a4 1447 1448 FMUL c01, ALPHA, c01 1449 FMUL c02, ALPHA, c02 1450 FMUL c03, ALPHA, c03 1451 FMUL c04, ALPHA, c04 1452 1453 FADD c01, a1, c01 1454 FADD c02, a2, c02 1455 FADD c03, a3, c03 1456 FADD c04, a4, c04 1457 1458 STF c01, [C1 + 0 * SIZE] 1459 STF c02, [C2 + 0 * SIZE] 1460 STF c03, [C3 + 0 * SIZE] 1461 STF c04, [C4 + 0 * SIZE] 1462#else 1463 FADD c01, t1, c01 1464 FADD c02, t2, c02 1465 FADD c03, t3, c03 1466 FADD c04, t4, c04 1467 1468 FMUL c01, ALPHA, c01 1469 FMUL c02, ALPHA, c02 1470 FMUL c03, ALPHA, c03 1471 FMUL c04, ALPHA, c04 1472 1473 STF c01, [C1 + 0 * SIZE] 1474 STF c02, [C2 + 0 * SIZE] 1475 STF c03, [C3 + 0 * SIZE] 1476 STF c04, [C4 + 0 * SIZE] 1477 1478#if ( defined(LEFT) && defined(TRANSA)) || \ 1479 (!defined(LEFT) && !defined(TRANSA)) 1480 sub K, KK, TEMP1 1481#ifdef LEFT 1482 add TEMP1, -1, TEMP1 1483#else 1484 add TEMP1, -4, TEMP1 1485#endif 1486 sll TEMP1, 0 + BASE_SHIFT, TEMP2 1487 sll TEMP1, 2 + BASE_SHIFT, TEMP1 1488 1489 add AO, TEMP2, AO 1490 add BO, TEMP1, BO 1491#endif 1492 1493#ifdef LEFT 1494 add KK, 1, KK 1495#endif 1496#endif 1497 1498.LL99: 1499 add J, -1, J 1500 mov BO, B 1501 cmp J, 0 1502 bg,pt %icc, .LL11 1503#if defined(TRMMKERNEL) && !defined(LEFT) 1504 add KK, 4, KK 1505#else 1506 nop 1507#endif 1508 1509.LL100: /* n & 2 */ 1510 sra M, 2, I 1511 and N, 2, J 1512 1513 cmp J, 0 1514 add C, LDC, C2 1515 ble,pn %icc, .LL200 1516 mov A, AO 1517 1518#if defined(TRMMKERNEL) && defined(LEFT) 1519 mov OFFSET, KK 1520#endif 1521 1522 mov C, C1 1523 add C2, LDC, C 1524 1525 cmp I, 0 1526 ble,pn %icc, .LL150 1527 FMOV FZERO, c03 1528 1529.LL121: 1530#if !defined(TRMMKERNEL) 1531 LDF [AO + 0 * SIZE], a1 1532 sra K, 2, L 1533 FMOV FZERO, t1 1534 LDF [B + 0 * SIZE], b1 1535 mov B, BO 1536 FMOV FZERO, c07 1537 1538 LDF [AO + 1 * SIZE], a2 1539 cmp L, 0 1540 FMOV FZERO, t2 1541 LDF [B + 1 * SIZE], b2 1542 FMOV FZERO, c04 1543 1544 LDF [AO + 2 * SIZE], a3 1545 FMOV FZERO, t3 1546 LDF [B + 2 * SIZE], b3 1547 FMOV FZERO, c08 1548 1549 LDF [AO + 3 * SIZE], a4 1550 FMOV FZERO, t4 1551 LDF [B + 3 * SIZE], b4 1552 FMOV FZERO, c01 1553 1554 prefetch [C1 + 3 * SIZE], 2 1555 FMOV FZERO, c05 1556 prefetch [C2 + 3 * SIZE], 2 1557 FMOV FZERO, c02 1558#else 1559 1560#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1561 mov B, BO 1562#else 1563 sll KK, 2 + BASE_SHIFT, TEMP1 1564 sll KK, 1 + BASE_SHIFT, TEMP2 1565 1566 add AO, TEMP1, AO 1567 add B, TEMP2, BO 1568#endif 1569 1570#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1571 sub K, KK, L 1572#elif defined(LEFT) 1573 add KK, 4, L 1574#else 1575 add KK, 2, L 1576#endif 1577 sra L, 2, L 1578 cmp L, 0 1579 1580 LDF [AO + 0 * SIZE], a1 1581 FMOV FZERO, t1 1582 LDF [BO + 0 * SIZE], b1 1583 FMOV FZERO, c07 1584 1585 LDF [AO + 1 * SIZE], a2 1586 FMOV FZERO, t2 1587 LDF [BO + 1 * SIZE], b2 1588 FMOV FZERO, c04 1589 1590 LDF [AO + 2 * SIZE], a3 1591 FMOV FZERO, t3 1592 LDF [BO + 2 * SIZE], b3 1593 FMOV FZERO, c08 1594 1595 LDF [AO + 3 * SIZE], a4 1596 FMOV FZERO, t4 1597 LDF [BO + 3 * SIZE], b4 1598 FMOV FZERO, c01 1599 1600 prefetch [C1 + 3 * SIZE], 2 1601 FMOV FZERO, c05 1602 prefetch [C2 + 3 * SIZE], 2 1603 FMOV FZERO, c02 1604#endif 1605 1606 ble,pn %icc, .LL125 1607 FMOV FZERO, c06 1608 1609.LL122: 1610 FADD c03, t1, c03 1611 add L, -1, L 1612 FMUL a1, b1, t1 1613 prefetch [AO + APREFETCHSIZE * SIZE], 0 1614 1615 FADD c07, t2, c07 1616 add BO, 8 * SIZE, BO 1617 FMUL a1, b2, t2 1618 LDF [AO + 4 * SIZE], a1 1619 1620 FADD c04, t3, c04 1621 add AO, 16 * SIZE, AO 1622 FMUL a2, b1, t3 1623 cmp L, 0 1624 1625 FADD c08, t4, c08 1626 nop 1627 FMUL a2, b2, t4 1628 LDF [AO - 11 * SIZE], a2 1629 1630 FADD c01, t1, c01 1631 nop 1632 FMUL a3, b1, t1 1633 nop 1634 1635 FADD c05, t2, c05 1636 nop 1637 FMUL a3, b2, t2 1638 LDF [AO - 10 * SIZE], a3 1639 1640 FADD c02, t3, c02 1641 nop 1642 FMUL a4, b1, t3 1643 LDF [BO - 4 * SIZE], b1 1644 1645 FADD c06, t4, c06 1646 nop 1647 FMUL a4, b2, t4 1648 LDF [BO - 3 * SIZE], b2 1649 1650 FADD c03, t1, c03 1651 nop 1652 FMUL a1, b3, t1 1653 LDF [AO - 9 * SIZE], a4 1654 1655 FADD c07, t2, c07 1656 nop 1657 FMUL a1, b4, t2 1658 LDF [AO - 8 * SIZE], a1 1659 1660 FADD c04, t3, c04 1661 nop 1662 FMUL a2, b3, t3 1663 nop 1664 1665 FADD c08, t4, c08 1666 nop 1667 FMUL a2, b4, t4 1668 LDF [AO - 7 * SIZE], a2 1669 1670 FADD c01, t1, c01 1671 nop 1672 FMUL a3, b3, t1 1673 nop 1674 1675 FADD c05, t2, c05 1676 nop 1677 FMUL a3, b4, t2 1678 LDF [AO - 6 * SIZE], a3 1679 1680 FADD c02, t3, c02 1681 nop 1682 FMUL a4, b3, t3 1683 LDF [BO - 2 * SIZE], b3 1684 1685 FADD c06, t4, c06 1686 nop 1687 FMUL a4, b4, t4 1688 LDF [BO - 1 * SIZE], b4 1689 1690 FADD c03, t1, c03 1691 nop 1692 FMUL a1, b1, t1 1693 LDF [AO - 5 * SIZE], a4 1694 1695 FADD c07, t2, c07 1696 nop 1697 FMUL a1, b2, t2 1698 LDF [AO - 4 * SIZE], a1 1699 1700 FADD c04, t3, c04 1701 nop 1702 FMUL a2, b1, t3 1703 nop 1704 1705 FADD c08, t4, c08 1706 nop 1707 FMUL a2, b2, t4 1708 LDF [AO - 3 * SIZE], a2 1709 1710 FADD c01, t1, c01 1711 nop 1712 FMUL a3, b1, t1 1713 nop 1714 1715 FADD c05, t2, c05 1716 nop 1717 FMUL a3, b2, t2 1718 LDF [AO - 2 * SIZE], a3 1719 1720 FADD c02, t3, c02 1721 nop 1722 FMUL a4, b1, t3 1723 LDF [BO + 0 * SIZE], b1 1724 1725 FADD c06, t4, c06 1726 nop 1727 FMUL a4, b2, t4 1728 LDF [BO + 1 * SIZE], b2 1729 1730 FADD c03, t1, c03 1731 nop 1732 FMUL a1, b3, t1 1733 LDF [AO - 1 * SIZE], a4 1734 1735 FADD c07, t2, c07 1736 nop 1737 FMUL a1, b4, t2 1738 LDF [AO + 0 * SIZE], a1 1739 1740 FADD c04, t3, c04 1741 nop 1742 FMUL a2, b3, t3 1743 nop 1744 1745 FADD c08, t4, c08 1746 nop 1747 FMUL a2, b4, t4 1748 LDF [AO + 1 * SIZE], a2 1749 1750 FADD c01, t1, c01 1751 nop 1752 FMUL a3, b3, t1 1753 nop 1754 1755 FADD c05, t2, c05 1756 nop 1757 FMUL a3, b4, t2 1758 LDF [AO + 2 * SIZE], a3 1759 1760 FADD c02, t3, c02 1761 nop 1762 FMUL a4, b3, t3 1763 LDF [BO + 2 * SIZE], b3 1764 1765 FADD c06, t4, c06 1766 FMUL a4, b4, t4 1767 LDF [AO + 3 * SIZE], a4 1768 1769 bg,pt %icc, .LL122 1770 LDF [BO + 3 * SIZE], b4 1771 1772.LL125: 1773#ifndef TRMMKERNEL 1774 and K, 3, L 1775#else 1776#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1777 sub K, KK, L 1778#elif defined(LEFT) 1779 add KK, 4, L 1780#else 1781 add KK, 2, L 1782#endif 1783 and L, 3, L 1784#endif 1785 cmp L, 0 1786 ble,a,pn %icc, .LL129 1787 nop 1788 1789.LL126: 1790 FADD c03, t1, c03 1791 add AO, 4 * SIZE, AO 1792 FMUL a1, b1, t1 1793 add BO, 2 * SIZE, BO 1794 1795 FADD c07, t2, c07 1796 add L, -1, L 1797 FMUL a1, b2, t2 1798 LDF [AO + 0 * SIZE], a1 1799 1800 FADD c04, t3, c04 1801 cmp L, 0 1802 FMUL a2, b1, t3 1803 1804 FADD c08, t4, c08 1805 FMUL a2, b2, t4 1806 LDF [AO + 1 * SIZE], a2 1807 1808 FADD c01, t1, c01 1809 FMUL a3, b1, t1 1810 FADD c05, t2, c05 1811 FMUL a3, b2, t2 1812 LDF [AO + 2 * SIZE], a3 1813 1814 FADD c02, t3, c02 1815 FMUL a4, b1, t3 1816 LDF [BO + 0 * SIZE], b1 1817 FADD c06, t4, c06 1818 FMUL a4, b2, t4 1819 LDF [BO + 1 * SIZE], b2 1820 bg,pt %icc, .LL126 1821 LDF [AO + 3 * SIZE], a4 1822 1823.LL129: 1824#ifndef TRMMKERNEL 1825 FADD c03, t1, c03 1826 add I, -1, I 1827 LDF [C1 + 0 * SIZE], a1 1828 FADD c07, t2, c07 1829 cmp I, 0 1830 LDF [C1 + 1 * SIZE], a2 1831 FADD c04, t3, c04 1832 LDF [C1 + 2 * SIZE], a3 1833 FADD c08, t4, c08 1834 LDF [C1 + 3 * SIZE], a4 1835 1836 LDF [C2 + 0 * SIZE], b1 1837 FMUL c01, ALPHA, c01 1838 LDF [C2 + 1 * SIZE], b2 1839 FMUL c02, ALPHA, c02 1840 LDF [C2 + 2 * SIZE], b3 1841 FMUL c03, ALPHA, c03 1842 LDF [C2 + 3 * SIZE], b4 1843 FMUL c04, ALPHA, c04 1844 1845 FMUL c05, ALPHA, c05 1846 FADD c01, a1, c01 1847 FMUL c06, ALPHA, c06 1848 FADD c02, a2, c02 1849 FMUL c07, ALPHA, c07 1850 FADD c03, a3, c03 1851 FMUL c08, ALPHA, c08 1852 FADD c04, a4, c04 1853 1854 STF c01, [C1 + 0 * SIZE] 1855 FADD c05, b1, c05 1856 STF c02, [C1 + 1 * SIZE] 1857 FADD c06, b2, c06 1858 STF c03, [C1 + 2 * SIZE] 1859 FADD c07, b3, c07 1860 STF c04, [C1 + 3 * SIZE] 1861 add C1, 4 * SIZE, C1 1862 FADD c08, b4, c08 1863 1864 STF c05, [C2 + 0 * SIZE] 1865 STF c06, [C2 + 1 * SIZE] 1866 STF c07, [C2 + 2 * SIZE] 1867 STF c08, [C2 + 3 * SIZE] 1868 add C2, 4 * SIZE, C2 1869#else 1870 FADD c03, t1, c03 1871 FADD c07, t2, c07 1872 FADD c04, t3, c04 1873 FADD c08, t4, c08 1874 1875 FMUL c01, ALPHA, c01 1876 FMUL c02, ALPHA, c02 1877 FMUL c03, ALPHA, c03 1878 FMUL c04, ALPHA, c04 1879 1880 FMUL c05, ALPHA, c05 1881 FMUL c06, ALPHA, c06 1882 FMUL c07, ALPHA, c07 1883 FMUL c08, ALPHA, c08 1884 1885 STF c01, [C1 + 0 * SIZE] 1886 STF c02, [C1 + 1 * SIZE] 1887 STF c03, [C1 + 2 * SIZE] 1888 STF c04, [C1 + 3 * SIZE] 1889 1890 STF c05, [C2 + 0 * SIZE] 1891 STF c06, [C2 + 1 * SIZE] 1892 STF c07, [C2 + 2 * SIZE] 1893 STF c08, [C2 + 3 * SIZE] 1894 add C1, 4 * SIZE, C1 1895 add C2, 4 * SIZE, C2 1896 1897#if ( defined(LEFT) && defined(TRANSA)) || \ 1898 (!defined(LEFT) && !defined(TRANSA)) 1899 sub K, KK, TEMP1 1900#ifdef LEFT 1901 add TEMP1, -4, TEMP1 1902#else 1903 add TEMP1, -2, TEMP1 1904#endif 1905 sll TEMP1, 2 + BASE_SHIFT, TEMP2 1906 sll TEMP1, 1 + BASE_SHIFT, TEMP1 1907 1908 add AO, TEMP2, AO 1909 add BO, TEMP1, BO 1910#endif 1911 1912#ifdef LEFT 1913 add KK, 4, KK 1914#endif 1915 1916 add I, -1, I 1917 cmp I, 0 1918#endif 1919 1920 bg,pt %icc, .LL121 1921 FMOV FZERO, c03 1922 1923.LL150: 1924 and M, 2, I 1925 cmp I, 0 1926 ble,pn %icc, .LL170 1927 nop 1928 1929.LL151: 1930#if !defined(TRMMKERNEL) 1931 LDF [AO + 0 * SIZE], a1 1932 sra K, 2, L 1933 FMOV FZERO, c01 1934 1935 LDF [B + 0 * SIZE], b1 1936 mov B, BO 1937 FMOV FZERO, t1 1938 1939 LDF [AO + 1 * SIZE], a2 1940 cmp L, 0 1941 FMOV FZERO, c02 1942 LDF [B + 1 * SIZE], b2 1943 FMOV FZERO, t2 1944 1945 LDF [AO + 2 * SIZE], a3 1946 FMOV FZERO, c03 1947 LDF [B + 2 * SIZE], b3 1948 FMOV FZERO, t3 1949 1950 LDF [AO + 3 * SIZE], a4 1951 FMOV FZERO, c04 1952 LDF [B + 3 * SIZE], b4 1953 FMOV FZERO, t4 1954#else 1955 1956#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1957 mov B, BO 1958#else 1959 sll KK, 1 + BASE_SHIFT, TEMP1 1960 sll KK, 1 + BASE_SHIFT, TEMP2 1961 1962 add AO, TEMP1, AO 1963 add B, TEMP2, BO 1964#endif 1965 1966#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1967 sub K, KK, L 1968#elif defined(LEFT) 1969 add KK, 2, L 1970#else 1971 add KK, 2, L 1972#endif 1973 sra L, 2, L 1974 cmp L, 0 1975 1976 LDF [AO + 0 * SIZE], a1 1977 FMOV FZERO, c01 1978 LDF [BO + 0 * SIZE], b1 1979 FMOV FZERO, t1 1980 1981 LDF [AO + 1 * SIZE], a2 1982 FMOV FZERO, c02 1983 LDF [BO + 1 * SIZE], b2 1984 FMOV FZERO, t2 1985 1986 LDF [AO + 2 * SIZE], a3 1987 FMOV FZERO, c03 1988 LDF [BO + 2 * SIZE], b3 1989 FMOV FZERO, t3 1990 1991 LDF [AO + 3 * SIZE], a4 1992 FMOV FZERO, c04 1993 LDF [BO + 3 * SIZE], b4 1994 FMOV FZERO, t4 1995#endif 1996 1997 ble,pn %icc, .LL155 1998 nop 1999 2000.LL152: 2001 FADD c01, t1, c01 2002 add L, -1, L 2003 FMUL a1, b1, t1 2004 prefetch [AO + APREFETCHSIZE * SIZE], 0 2005 2006 FADD c02, t2, c02 2007 add BO, 8 * SIZE, BO 2008 FMUL a1, b2, t2 2009 LDF [AO + 4 * SIZE], a1 2010 2011 FADD c03, t3, c03 2012 cmp L, 0 2013 FMUL a2, b1, t3 2014 LDF [BO - 4 * SIZE], b1 2015 2016 FADD c04, t4, c04 2017 nop 2018 FMUL a2, b2, t4 2019 LDF [AO + 5 * SIZE], a2 2020 2021 FADD c01, t1, c01 2022 nop 2023 FMUL a3, b3, t1 2024 LDF [BO - 3 * SIZE], b2 2025 2026 FADD c02, t2, c02 2027 nop 2028 FMUL a3, b4, t2 2029 LDF [AO + 6 * SIZE], a3 2030 2031 FADD c03, t3, c03 2032 nop 2033 FMUL a4, b3, t3 2034 LDF [BO - 2 * SIZE], b3 2035 2036 FADD c04, t4, c04 2037 nop 2038 FMUL a4, b4, t4 2039 LDF [AO + 7 * SIZE], a4 2040 2041 FADD c01, t1, c01 2042 nop 2043 FMUL a1, b1, t1 2044 LDF [BO - 1 * SIZE], b4 2045 2046 FADD c02, t2, c02 2047 FMUL a1, b2, t2 2048 LDF [AO + 8 * SIZE], a1 2049 2050 FADD c03, t3, c03 2051 FMUL a2, b1, t3 2052 LDF [BO + 0 * SIZE], b1 2053 2054 FADD c04, t4, c04 2055 FMUL a2, b2, t4 2056 LDF [AO + 9 * SIZE], a2 2057 2058 FADD c01, t1, c01 2059 FMUL a3, b3, t1 2060 LDF [BO + 1 * SIZE], b2 2061 2062 FADD c02, t2, c02 2063 FMUL a3, b4, t2 2064 LDF [AO + 10 * SIZE], a3 2065 2066 FADD c03, t3, c03 2067 FMUL a4, b3, t3 2068 LDF [BO + 2 * SIZE], b3 2069 2070 FADD c04, t4, c04 2071 FMUL a4, b4, t4 2072 LDF [AO + 11 * SIZE], a4 2073 2074 add AO, 8 * SIZE, AO 2075 bg,pt %icc, .LL152 2076 LDF [BO + 3 * SIZE], b4 2077 2078.LL155: 2079#ifndef TRMMKERNEL 2080 and K, 3, L 2081#else 2082#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2083 sub K, KK, L 2084#elif defined(LEFT) 2085 add KK, 2, L 2086#else 2087 add KK, 2, L 2088#endif 2089 and L, 3, L 2090#endif 2091 cmp L, 0 2092 ble,a,pn %icc, .LL159 2093 nop 2094 2095.LL156: 2096 LDF [AO + 0 * SIZE], a1 2097 LDF [AO + 1 * SIZE], a2 2098 2099 LDF [BO + 0 * SIZE], b1 2100 LDF [BO + 1 * SIZE], b2 2101 2102 FADD c01, t1, c01 2103 FADD c02, t2, c02 2104 FADD c03, t3, c03 2105 FADD c04, t4, c04 2106 2107 FMUL a1, b1, t1 2108 FMUL a1, b2, t2 2109 FMUL a2, b1, t3 2110 FMUL a2, b2, t4 2111 2112 add AO, 2 * SIZE, AO 2113 add BO, 2 * SIZE, BO 2114 2115 add L, -1, L 2116 cmp L, 0 2117 bg,pt %icc, .LL156 2118 nop 2119 2120.LL159: 2121#ifndef TRMMKERNEL 2122 LDF [C1 + 0 * SIZE], a1 2123 LDF [C2 + 0 * SIZE], a2 2124 LDF [C1 + 1 * SIZE], a3 2125 LDF [C2 + 1 * SIZE], a4 2126 2127 FADD c01, t1, c01 2128 FADD c02, t2, c02 2129 FADD c03, t3, c03 2130 FADD c04, t4, c04 2131 2132 FMUL c01, ALPHA, c01 2133 FMUL c02, ALPHA, c02 2134 FMUL c03, ALPHA, c03 2135 FMUL c04, ALPHA, c04 2136 2137 FADD c01, a1, c01 2138 FADD c02, a2, c02 2139 FADD c03, a3, c03 2140 FADD c04, a4, c04 2141 2142 STF c01, [C1 + 0 * SIZE] 2143 STF c02, [C2 + 0 * SIZE] 2144 STF c03, [C1 + 1 * SIZE] 2145 add C1, 2 * SIZE, C1 2146 STF c04, [C2 + 1 * SIZE] 2147 add C2, 2 * SIZE, C2 2148#else 2149 FADD c01, t1, c01 2150 FADD c02, t2, c02 2151 FADD c03, t3, c03 2152 FADD c04, t4, c04 2153 2154 FMUL c01, ALPHA, c01 2155 FMUL c02, ALPHA, c02 2156 FMUL c03, ALPHA, c03 2157 FMUL c04, ALPHA, c04 2158 2159 STF c01, [C1 + 0 * SIZE] 2160 STF c02, [C2 + 0 * SIZE] 2161 STF c03, [C1 + 1 * SIZE] 2162 STF c04, [C2 + 1 * SIZE] 2163 add C1, 2 * SIZE, C1 2164 add C2, 2 * SIZE, C2 2165 2166#if ( defined(LEFT) && defined(TRANSA)) || \ 2167 (!defined(LEFT) && !defined(TRANSA)) 2168 sub K, KK, TEMP1 2169#ifdef LEFT 2170 add TEMP1, -2, TEMP1 2171#else 2172 add TEMP1, -2, TEMP1 2173#endif 2174 sll TEMP1, 1 + BASE_SHIFT, TEMP2 2175 sll TEMP1, 1 + BASE_SHIFT, TEMP1 2176 2177 add AO, TEMP2, AO 2178 add BO, TEMP1, BO 2179#endif 2180 2181#ifdef LEFT 2182 add KK, 2, KK 2183#endif 2184#endif 2185 2186.LL170: 2187 and M, 1, I 2188 cmp I, 0 2189 ble,pn %icc, .LL199 2190 nop 2191 2192.LL171: 2193#if !defined(TRMMKERNEL) 2194 LDF [AO + 0 * SIZE], a1 2195 sra K, 2, L 2196 FMOV FZERO, c01 2197 LDF [B + 0 * SIZE], b1 2198 mov B, BO 2199 FMOV FZERO, t1 2200 2201 LDF [AO + 1 * SIZE], a2 2202 cmp L, 0 2203 FMOV FZERO, c02 2204 LDF [B + 1 * SIZE], b2 2205 FMOV FZERO, t2 2206 2207 LDF [AO + 2 * SIZE], a3 2208 FMOV FZERO, c03 2209 2210 LDF [B + 2 * SIZE], b3 2211 FMOV FZERO, t3 2212 2213 LDF [AO + 3 * SIZE], a4 2214 FMOV FZERO, c04 2215 LDF [B + 3 * SIZE], b4 2216 FMOV FZERO, t4 2217#else 2218#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2219 mov B, BO 2220#else 2221 sll KK, 0 + BASE_SHIFT, TEMP1 2222 sll KK, 1 + BASE_SHIFT, TEMP2 2223 2224 add AO, TEMP1, AO 2225 add B, TEMP2, BO 2226#endif 2227 2228#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2229 sub K, KK, L 2230#elif defined(LEFT) 2231 add KK, 1, L 2232#else 2233 add KK, 2, L 2234#endif 2235 sra L, 2, L 2236 cmp L, 0 2237 2238 LDF [AO + 0 * SIZE], a1 2239 FMOV FZERO, c01 2240 LDF [BO + 0 * SIZE], b1 2241 FMOV FZERO, t1 2242 2243 LDF [AO + 1 * SIZE], a2 2244 FMOV FZERO, c02 2245 LDF [BO + 1 * SIZE], b2 2246 FMOV FZERO, t2 2247 2248 LDF [AO + 2 * SIZE], a3 2249 FMOV FZERO, c03 2250 LDF [BO + 2 * SIZE], b3 2251 FMOV FZERO, t3 2252 2253 LDF [AO + 3 * SIZE], a4 2254 FMOV FZERO, c04 2255 LDF [BO + 3 * SIZE], b4 2256 FMOV FZERO, t4 2257#endif 2258 2259 ble,pn %icc, .LL175 2260 nop 2261 2262.LL172: 2263 FADD c01, t1, c01 2264 add AO, 4 * SIZE, AO 2265 FMUL a1, b1, t1 2266 LDF [BO + 4 * SIZE], b1 2267 2268 FADD c02, t2, c02 2269 FMUL a1, b2, t2 2270 LDF [BO + 5 * SIZE], b2 2271 2272 add L, -1, L 2273 LDF [AO + 0 * SIZE], a1 2274 2275 FADD c03, t3, c03 2276 cmp L, 0 2277 FMUL a2, b3, t3 2278 LDF [BO + 6 * SIZE], b3 2279 2280 FADD c04, t4, c04 2281 FMUL a2, b4, t4 2282 LDF [BO + 7 * SIZE], b4 2283 LDF [AO + 1 * SIZE], a2 2284 2285 FADD c01, t1, c01 2286 FMUL a3, b1, t1 2287 LDF [BO + 8 * SIZE], b1 2288 2289 FADD c02, t2, c02 2290 FMUL a3, b2, t2 2291 LDF [BO + 9 * SIZE], b2 2292 LDF [AO + 2 * SIZE], a3 2293 2294 FADD c03, t3, c03 2295 FMUL a4, b3, t3 2296 LDF [BO + 10 * SIZE], b3 2297 FADD c04, t4, c04 2298 FMUL a4, b4, t4 2299 LDF [BO + 11 * SIZE], b4 2300 add BO, 8 * SIZE, BO 2301 2302 bg,pt %icc, .LL172 2303 LDF [AO + 3 * SIZE], a4 2304 2305.LL175: 2306#ifndef TRMMKERNEL 2307 and K, 3, L 2308#else 2309#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2310 sub K, KK, L 2311#elif defined(LEFT) 2312 add KK, 1, L 2313#else 2314 add KK, 2, L 2315#endif 2316 and L, 3, L 2317#endif 2318 cmp L, 0 2319 ble,a,pn %icc, .LL179 2320 nop 2321 2322.LL176: 2323 FADD c01, t1, c01 2324 add L, -1, L 2325 FMUL a1, b1, t1 2326 add AO, 1 * SIZE, AO 2327 LDF [BO + 2 * SIZE], b1 2328 FADD c02, t2, c02 2329 cmp L, 0 2330 FMUL a1, b2, t2 2331 LDF [BO + 3 * SIZE], b2 2332 2333 add BO, 2 * SIZE, BO 2334 bg,pt %icc, .LL176 2335 LDF [AO + 0 * SIZE], a1 2336 2337.LL179: 2338#ifndef TRMMKERNEL 2339 FADD c01, t1, c01 2340 LDF [C1 + 0 * SIZE], a1 2341 FADD c02, t2, c02 2342 LDF [C2 + 0 * SIZE], a2 2343 FADD c03, t3, c03 2344 FADD c04, t4, c04 2345 2346 FADD c01, c03, c01 2347 FADD c02, c04, c02 2348 2349 FMUL c01, ALPHA, c01 2350 FMUL c02, ALPHA, c02 2351 2352 FADD c01, a1, c01 2353 FADD c02, a2, c02 2354 2355 STF c01, [C1 + 0 * SIZE] 2356 STF c02, [C2 + 0 * SIZE] 2357#else 2358 2359 FADD c01, t1, c01 2360 FADD c02, t2, c02 2361 FADD c03, t3, c03 2362 FADD c04, t4, c04 2363 2364 FADD c01, c03, c01 2365 FADD c02, c04, c02 2366 2367 FMUL c01, ALPHA, c01 2368 FMUL c02, ALPHA, c02 2369 2370 STF c01, [C1 + 0 * SIZE] 2371 STF c02, [C2 + 0 * SIZE] 2372 2373#if ( defined(LEFT) && defined(TRANSA)) || \ 2374 (!defined(LEFT) && !defined(TRANSA)) 2375 sub K, KK, TEMP1 2376#ifdef LEFT 2377 add TEMP1, -1, TEMP1 2378#else 2379 add TEMP1, -2, TEMP1 2380#endif 2381 sll TEMP1, 0 + BASE_SHIFT, TEMP2 2382 sll TEMP1, 1 + BASE_SHIFT, TEMP1 2383 2384 add AO, TEMP2, AO 2385 add BO, TEMP1, BO 2386#endif 2387 2388#ifdef LEFT 2389 add KK, 1, KK 2390#endif 2391#endif 2392 2393.LL199: 2394 mov BO, B 2395#if defined(TRMMKERNEL) && !defined(LEFT) 2396 add KK, 2, KK 2397#else 2398 nop 2399#endif 2400 2401.LL200: 2402 and N, 1, J 2403 sra M, 2, I 2404 2405 cmp J, 0 2406 ble,pn %icc, .LL999 2407 mov A, AO 2408 2409#if defined(TRMMKERNEL) && defined(LEFT) 2410 mov OFFSET, KK 2411#endif 2412 2413 cmp I, 0 2414 ble,pn %icc, .LL250 2415 mov C, C1 2416 2417.LL221: 2418#if !defined(TRMMKERNEL) 2419 LDF [AO + 0 * SIZE], a1 2420 sra K, 2, L 2421 FMOV FZERO, c01 2422 LDF [B + 0 * SIZE], b1 2423 mov B, BO 2424 FMOV FZERO, t1 2425 2426 LDF [AO + 1 * SIZE], a2 2427 cmp L, 0 2428 FMOV FZERO, c02 2429 LDF [B + 1 * SIZE], b2 2430 FMOV FZERO, t2 2431 2432 LDF [AO + 2 * SIZE], a3 2433 FMOV FZERO, c03 2434 LDF [B + 2 * SIZE], b3 2435 FMOV FZERO, t3 2436 2437 LDF [AO + 3 * SIZE], a4 2438 FMOV FZERO, c04 2439 LDF [B + 3 * SIZE], b4 2440 FMOV FZERO, t4 2441#else 2442#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2443 mov B, BO 2444#else 2445 sll KK, 2 + BASE_SHIFT, TEMP1 2446 sll KK, 0 + BASE_SHIFT, TEMP2 2447 2448 add AO, TEMP1, AO 2449 add B, TEMP2, BO 2450#endif 2451 2452#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2453 sub K, KK, L 2454#elif defined(LEFT) 2455 add KK, 4, L 2456#else 2457 add KK, 1, L 2458#endif 2459 sra L, 2, L 2460 cmp L, 0 2461 2462 LDF [AO + 0 * SIZE], a1 2463 FMOV FZERO, c01 2464 LDF [BO + 0 * SIZE], b1 2465 FMOV FZERO, t1 2466 2467 LDF [AO + 1 * SIZE], a2 2468 FMOV FZERO, c02 2469 LDF [BO + 1 * SIZE], b2 2470 FMOV FZERO, t2 2471 2472 LDF [AO + 2 * SIZE], a3 2473 FMOV FZERO, c03 2474 LDF [BO + 2 * SIZE], b3 2475 FMOV FZERO, t3 2476 2477 LDF [AO + 3 * SIZE], a4 2478 FMOV FZERO, c04 2479 LDF [BO + 3 * SIZE], b4 2480 FMOV FZERO, t4 2481#endif 2482 2483 ble,pn %icc, .LL225 2484 prefetch [C1 + 4 * SIZE], 2 2485 2486.LL222: 2487 FADD c01, t1, c01 2488 add BO, 4 * SIZE, BO 2489 FMUL a1, b1, t1 2490 LDF [AO + 4 * SIZE], a1 2491 2492 FADD c02, t2, c02 2493 FMUL a2, b1, t2 2494 LDF [AO + 5 * SIZE], a2 2495 2496 FADD c03, t3, c03 2497 add L, -1, L 2498 FMUL a3, b1, t3 2499 LDF [AO + 6 * SIZE], a3 2500 2501 FADD c04, t4, c04 2502 FMUL a4, b1, t4 2503 LDF [AO + 7 * SIZE], a4 2504 LDF [BO + 0 * SIZE], b1 2505 2506 FADD c01, t1, c01 2507 cmp L, 0 2508 FMUL a1, b2, t1 2509 LDF [AO + 8 * SIZE], a1 2510 2511 FADD c02, t2, c02 2512 FMUL a2, b2, t2 2513 LDF [AO + 9 * SIZE], a2 2514 2515 FADD c03, t3, c03 2516 FMUL a3, b2, t3 2517 LDF [AO + 10 * SIZE], a3 2518 2519 FADD c04, t4, c04 2520 FMUL a4, b2, t4 2521 LDF [AO + 11 * SIZE], a4 2522 LDF [BO + 1 * SIZE], b2 2523 2524 FADD c01, t1, c01 2525 FMUL a1, b3, t1 2526 LDF [AO + 12 * SIZE], a1 2527 2528 FADD c02, t2, c02 2529 FMUL a2, b3, t2 2530 LDF [AO + 13 * SIZE], a2 2531 2532 FADD c03, t3, c03 2533 FMUL a3, b3, t3 2534 LDF [AO + 14 * SIZE], a3 2535 2536 FADD c04, t4, c04 2537 FMUL a4, b3, t4 2538 LDF [AO + 15 * SIZE], a4 2539 LDF [BO + 2 * SIZE], b3 2540 2541 FADD c01, t1, c01 2542 FMUL a1, b4, t1 2543 LDF [AO + 16 * SIZE], a1 2544 2545 FADD c02, t2, c02 2546 FMUL a2, b4, t2 2547 LDF [AO + 17 * SIZE], a2 2548 2549 FADD c03, t3, c03 2550 FMUL a3, b4, t3 2551 LDF [AO + 18 * SIZE], a3 2552 2553 FADD c04, t4, c04 2554 FMUL a4, b4, t4 2555 LDF [AO + 19 * SIZE], a4 2556 add AO, 16 * SIZE, AO 2557 2558 bg,pt %icc, .LL222 2559 LDF [BO + 3 * SIZE], b4 2560 2561.LL225: 2562#ifndef TRMMKERNEL 2563 and K, 3, L 2564#else 2565#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2566 sub K, KK, L 2567#elif defined(LEFT) 2568 add KK, 4, L 2569#else 2570 add KK, 1, L 2571#endif 2572 and L, 3, L 2573#endif 2574 cmp L, 0 2575 ble,a,pn %icc, .LL229 2576 nop 2577 2578.LL226: 2579 FADD c01, t1, c01 2580 add BO, 1 * SIZE, BO 2581 FMUL a1, b1, t1 2582 LDF [AO + 4 * SIZE], a1 2583 2584 FADD c02, t2, c02 2585 add L, -1, L 2586 FMUL a2, b1, t2 2587 LDF [AO + 5 * SIZE], a2 2588 2589 FADD c03, t3, c03 2590 cmp L, 0 2591 FMUL a3, b1, t3 2592 LDF [AO + 6 * SIZE], a3 2593 2594 FADD c04, t4, c04 2595 FMUL a4, b1, t4 2596 LDF [AO + 7 * SIZE], a4 2597 add AO, 4 * SIZE, AO 2598 2599 bg,pt %icc, .LL226 2600 LDF [BO + 0 * SIZE], b1 2601 2602.LL229: 2603#ifndef TRMMKERNEL 2604 FADD c01, t1, c01 2605 add I, -1, I 2606 FADD c02, t2, c02 2607 cmp I, 0 2608 FADD c03, t3, c03 2609 FADD c04, t4, c04 2610 2611 FMUL c01, ALPHA, c01 2612 FMUL c02, ALPHA, c02 2613 FMUL c03, ALPHA, c03 2614 FMUL c04, ALPHA, c04 2615 2616 LDF [C1 + 0 * SIZE], a1 2617 LDF [C1 + 1 * SIZE], a2 2618 LDF [C1 + 2 * SIZE], a3 2619 LDF [C1 + 3 * SIZE], a4 2620 2621 FADD c01, a1, c01 2622 FADD c02, a2, c02 2623 FADD c03, a3, c03 2624 FADD c04, a4, c04 2625 2626 STF c01, [C1 + 0 * SIZE] 2627 STF c02, [C1 + 1 * SIZE] 2628 STF c03, [C1 + 2 * SIZE] 2629 STF c04, [C1 + 3 * SIZE] 2630 add C1, 4 * SIZE, C1 2631#else 2632 FADD c01, t1, c01 2633 FADD c02, t2, c02 2634 FADD c03, t3, c03 2635 FADD c04, t4, c04 2636 2637 FMUL c01, ALPHA, c01 2638 FMUL c02, ALPHA, c02 2639 FMUL c03, ALPHA, c03 2640 FMUL c04, ALPHA, c04 2641 2642 STF c01, [C1 + 0 * SIZE] 2643 STF c02, [C1 + 1 * SIZE] 2644 STF c03, [C1 + 2 * SIZE] 2645 STF c04, [C1 + 3 * SIZE] 2646 add C1, 4 * SIZE, C1 2647 2648#if ( defined(LEFT) && defined(TRANSA)) || \ 2649 (!defined(LEFT) && !defined(TRANSA)) 2650 sub K, KK, TEMP1 2651#ifdef LEFT 2652 add TEMP1, -4, TEMP1 2653#else 2654 add TEMP1, -1, TEMP1 2655#endif 2656 sll TEMP1, 2 + BASE_SHIFT, TEMP2 2657 sll TEMP1, 0 + BASE_SHIFT, TEMP1 2658 2659 add AO, TEMP2, AO 2660 add BO, TEMP1, BO 2661#endif 2662 2663#ifdef LEFT 2664 add KK, 4, KK 2665#endif 2666 2667 add I, -1, I 2668 cmp I, 0 2669#endif 2670 2671 bg,pt %icc, .LL221 2672 nop 2673 2674.LL250: 2675 and M, 2, I 2676 cmp I, 0 2677 ble,pn %icc, .LL270 2678 nop 2679 2680.LL251: 2681#if !defined(TRMMKERNEL) 2682 LDF [AO + 0 * SIZE], a1 2683 sra K, 2, L 2684 FMOV FZERO, c01 2685 LDF [B + 0 * SIZE], b1 2686 mov B, BO 2687 FMOV FZERO, t1 2688 2689 LDF [AO + 1 * SIZE], a2 2690 cmp L, 0 2691 FMOV FZERO, c02 2692 LDF [B + 1 * SIZE], b2 2693 FMOV FZERO, t2 2694 2695 LDF [AO + 2 * SIZE], a3 2696 FMOV FZERO, c03 2697 LDF [B + 2 * SIZE], b3 2698 FMOV FZERO, t3 2699 2700 LDF [AO + 3 * SIZE], a4 2701 FMOV FZERO, c04 2702 LDF [B + 3 * SIZE], b4 2703 FMOV FZERO, t4 2704#else 2705#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2706 mov B, BO 2707#else 2708 sll KK, 1 + BASE_SHIFT, TEMP1 2709 sll KK, 0 + BASE_SHIFT, TEMP2 2710 2711 add AO, TEMP1, AO 2712 add B, TEMP2, BO 2713#endif 2714 2715#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2716 sub K, KK, L 2717#elif defined(LEFT) 2718 add KK, 2, L 2719#else 2720 add KK, 1, L 2721#endif 2722 sra L, 2, L 2723 cmp L, 0 2724 2725 LDF [AO + 0 * SIZE], a1 2726 FMOV FZERO, c01 2727 LDF [BO + 0 * SIZE], b1 2728 FMOV FZERO, t1 2729 2730 LDF [AO + 1 * SIZE], a2 2731 FMOV FZERO, c02 2732 LDF [BO + 1 * SIZE], b2 2733 FMOV FZERO, t2 2734 2735 LDF [AO + 2 * SIZE], a3 2736 FMOV FZERO, c03 2737 LDF [BO + 2 * SIZE], b3 2738 FMOV FZERO, t3 2739 2740 LDF [AO + 3 * SIZE], a4 2741 FMOV FZERO, c04 2742 LDF [BO + 3 * SIZE], b4 2743 FMOV FZERO, t4 2744#endif 2745 2746 ble,pn %icc, .LL255 2747 nop 2748 2749.LL252: 2750 FADD c01, t1, c01 2751 add L, -1, L 2752 FMUL a1, b1, t1 2753 LDF [AO + 4 * SIZE], a1 2754 2755 FADD c02, t2, c02 2756 FMUL a2, b1, t2 2757 LDF [AO + 5 * SIZE], a2 2758 LDF [BO + 4 * SIZE], b1 2759 2760 FADD c03, t3, c03 2761 cmp L, 0 2762 FMUL a3, b2, t3 2763 LDF [AO + 6 * SIZE], a3 2764 2765 FADD c04, t4, c04 2766 FMUL a4, b2, t4 2767 LDF [AO + 7 * SIZE], a4 2768 LDF [BO + 5 * SIZE], b2 2769 2770 FADD c01, t1, c01 2771 FMUL a1, b3, t1 2772 LDF [AO + 8 * SIZE], a1 2773 2774 FADD c02, t2, c02 2775 FMUL a2, b3, t2 2776 LDF [AO + 9 * SIZE], a2 2777 LDF [BO + 6 * SIZE], b3 2778 2779 FADD c03, t3, c03 2780 FMUL a3, b4, t3 2781 LDF [AO + 10 * SIZE], a3 2782 2783 FADD c04, t4, c04 2784 FMUL a4, b4, t4 2785 LDF [AO + 11 * SIZE], a4 2786 add AO, 8 * SIZE, AO 2787 2788 LDF [BO + 7 * SIZE], b4 2789 bg,pt %icc, .LL252 2790 add BO, 4 * SIZE, BO 2791 2792.LL255: 2793#ifndef TRMMKERNEL 2794 and K, 3, L 2795#else 2796#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2797 sub K, KK, L 2798#elif defined(LEFT) 2799 add KK, 2, L 2800#else 2801 add KK, 1, L 2802#endif 2803 and L, 3, L 2804#endif 2805 cmp L, 0 2806 ble,a,pn %icc, .LL259 2807 nop 2808 2809.LL256: 2810 2811 FADD c01, t1, c01 2812 add L, -1, L 2813 FMUL a1, b1, t1 2814 LDF [AO + 2 * SIZE], a1 2815 2816 FADD c02, t2, c02 2817 cmp L, 0 2818 FMUL a2, b1, t2 2819 LDF [AO + 3 * SIZE], a2 2820 2821 LDF [BO + 1 * SIZE], b1 2822 add AO, 2 * SIZE, AO 2823 2824 bg,pt %icc, .LL256 2825 add BO, 1 * SIZE, BO 2826 2827.LL259: 2828#ifndef TRMMKERNEL 2829 FADD c01, t1, c01 2830 LDF [C1 + 0 * SIZE], a1 2831 FADD c02, t2, c02 2832 LDF [C1 + 1 * SIZE], a2 2833 FADD c03, t3, c03 2834 FADD c04, t4, c04 2835 2836 FADD c01, c03, c01 2837 FADD c02, c04, c02 2838 FMUL c01, ALPHA, c01 2839 FMUL c02, ALPHA, c02 2840 FADD c01, a1, c01 2841 FADD c02, a2, c02 2842 2843 STF c01, [C1 + 0 * SIZE] 2844 STF c02, [C1 + 1 * SIZE] 2845 add C1, 2 * SIZE, C1 2846#else 2847 FADD c01, t1, c01 2848 FADD c02, t2, c02 2849 FADD c03, t3, c03 2850 FADD c04, t4, c04 2851 2852 FADD c01, c03, c01 2853 FADD c02, c04, c02 2854 FMUL c01, ALPHA, c01 2855 FMUL c02, ALPHA, c02 2856 2857 STF c01, [C1 + 0 * SIZE] 2858 STF c02, [C1 + 1 * SIZE] 2859 add C1, 2 * SIZE, C1 2860 2861#if ( defined(LEFT) && defined(TRANSA)) || \ 2862 (!defined(LEFT) && !defined(TRANSA)) 2863 sub K, KK, TEMP1 2864#ifdef LEFT 2865 add TEMP1, -2, TEMP1 2866#else 2867 add TEMP1, -1, TEMP1 2868#endif 2869 sll TEMP1, 1 + BASE_SHIFT, TEMP2 2870 sll TEMP1, 0 + BASE_SHIFT, TEMP1 2871 2872 add AO, TEMP2, AO 2873 add BO, TEMP1, BO 2874#endif 2875 2876#ifdef LEFT 2877 add KK, 2, KK 2878#endif 2879#endif 2880 2881.LL270: 2882 and M, 1, I 2883 cmp I, 0 2884 ble,pn %icc, .LL999 2885 nop 2886 2887.LL271: 2888#if !defined(TRMMKERNEL) 2889 LDF [AO + 0 * SIZE], a1 2890 sra K, 2, L 2891 FMOV FZERO, t1 2892 2893 LDF [AO + 1 * SIZE], a2 2894 mov B, BO 2895 FMOV FZERO, c01 2896 2897 LDF [AO + 2 * SIZE], a3 2898 cmp L, 0 2899 FMOV FZERO, t2 2900 2901 LDF [AO + 3 * SIZE], a4 2902 FMOV FZERO, c02 2903 2904 LDF [BO + 0 * SIZE], b1 2905 FMOV FZERO, t3 2906 LDF [BO + 1 * SIZE], b2 2907 FMOV FZERO, t4 2908 LDF [BO + 2 * SIZE], b3 2909#else 2910#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2911 mov B, BO 2912#else 2913 sll KK, 0 + BASE_SHIFT, TEMP1 2914 sll KK, 0 + BASE_SHIFT, TEMP2 2915 2916 add AO, TEMP1, AO 2917 add B, TEMP2, BO 2918#endif 2919 2920#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2921 sub K, KK, L 2922#elif defined(LEFT) 2923 add KK, 1, L 2924#else 2925 add KK, 1, L 2926#endif 2927 sra L, 2, L 2928 cmp L, 0 2929 2930 LDF [AO + 0 * SIZE], a1 2931 FMOV FZERO, t1 2932 LDF [AO + 1 * SIZE], a2 2933 FMOV FZERO, c01 2934 2935 LDF [AO + 2 * SIZE], a3 2936 FMOV FZERO, t2 2937 LDF [AO + 3 * SIZE], a4 2938 FMOV FZERO, c02 2939 2940 LDF [BO + 0 * SIZE], b1 2941 FMOV FZERO, t3 2942 LDF [BO + 1 * SIZE], b2 2943 FMOV FZERO, t4 2944 LDF [BO + 2 * SIZE], b3 2945#endif 2946 2947 ble,pn %icc, .LL275 2948 LDF [BO + 3 * SIZE], b4 2949 2950.LL272: 2951 FADD c01, t1, c01 2952 add L, -1, L 2953 add AO, 4 * SIZE, AO 2954 2955 FMUL a1, b1, t1 2956 add BO, 4 * SIZE, BO 2957 LDF [AO + 0 * SIZE], a1 2958 2959 FADD c02, t2, c02 2960 cmp L, 0 2961 LDF [BO + 0 * SIZE], b1 2962 FMUL a2, b2, t2 2963 2964 LDF [AO + 1 * SIZE], a2 2965 FADD c01, t3, c01 2966 LDF [BO + 1 * SIZE], b2 2967 FMUL a3, b3, t3 2968 2969 LDF [AO + 2 * SIZE], a3 2970 FADD c02, t4, c02 2971 LDF [BO + 2 * SIZE], b3 2972 FMUL a4, b4, t4 2973 LDF [AO + 3 * SIZE], a4 2974 2975 bg,pt %icc, .LL272 2976 LDF [BO + 3 * SIZE], b4 2977 2978.LL275: 2979#ifndef TRMMKERNEL 2980 and K, 3, L 2981#else 2982#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2983 sub K, KK, L 2984#elif defined(LEFT) 2985 add KK, 1, L 2986#else 2987 add KK, 1, L 2988#endif 2989 and L, 3, L 2990#endif 2991 cmp L, 0 2992 ble,a,pn %icc, .LL279 2993 nop 2994 2995.LL276: 2996 FADD c01, t1, c01 2997 add L, -1, L 2998 FMUL a1, b1, t1 2999 LDF [AO + 1 * SIZE], a1 3000 3001 LDF [BO + 1 * SIZE], b1 3002 add BO, 1 * SIZE, BO 3003 cmp L, 0 3004 bg,pt %icc, .LL276 3005 add AO, 1 * SIZE, AO 3006 3007.LL279: 3008#ifndef TRMMKERNEL 3009 FADD c01, t1, c01 3010 3011 LDF [C1 + 0 * SIZE], a1 3012 FADD c02, t2, c02 3013 FADD c01, t3, c01 3014 FADD c02, t4, c02 3015 FADD c01, c02, c01 3016 3017 FMUL c01, ALPHA, c01 3018 FADD c01, a1, c01 3019 STF c01, [C1 + 0 * SIZE] 3020#else 3021 FADD c01, t1, c01 3022 FADD c02, t2, c02 3023 FADD c01, t3, c01 3024 FADD c02, t4, c02 3025 FADD c01, c02, c01 3026 3027 FMUL c01, ALPHA, c01 3028 STF c01, [C1 + 0 * SIZE] 3029 3030#if ( defined(LEFT) && defined(TRANSA)) || \ 3031 (!defined(LEFT) && !defined(TRANSA)) 3032 sub K, KK, TEMP1 3033#ifdef LEFT 3034 add TEMP1, -1, TEMP1 3035#else 3036 add TEMP1, -1, TEMP1 3037#endif 3038 sll TEMP1, 0 + BASE_SHIFT, TEMP2 3039 sll TEMP1, 0 + BASE_SHIFT, TEMP1 3040 3041 add AO, TEMP2, AO 3042 add BO, TEMP1, BO 3043#endif 3044 3045#ifdef LEFT 3046 add KK, 1, KK 3047#endif 3048#endif 3049 3050.LL999: 3051 return %i7 + 8 3052 clr %o0 3053 3054 EPILOGUE 3055