1/*********************************************************************/ 2/* Copyright 2005-2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41
/* Operands of the `prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY` */
/* instructions in the inner loops: look-ahead distance into A, counted in        */
/* elements (scaled by SIZE at the use site), and the second prefetch operand.    */
42#define APREFETCHSIZE 24 43#define APREFETCH_CATEGORY 0 44
/* Dimension arguments. In the code below M is walked in pairs plus a one-row     */
/* remainder (sra M, 1 / and M, 1), the low bits of N select the 1-, 2- and       */
/* 4-column passes (and N, 1 / 2 / 4), and K bounds the inner-product loops.      */
45#define M %i0 46#define N %i1 47#define K %i2 48
/* Base pointers of the A and B operands. The 32-bit DOUBLE build swaps the two   */
/* registers, and its prologue reloads B from the stack.                          */
/* NOTE(review): presumably the double scalar argument takes an extra argument    */
/* slot in that ABI and shifts the later arguments - confirm against the caller.  */
49#if defined(DOUBLE) && !defined(__64BIT__) 50#define A %i5 51#define B %i4 52#else 53#define A %i4 54#define B %i5 55#endif 56
/* Output pointer and the stride between the C1, C2, ... pointers. Both are       */
/* loaded from the stack in the prologue; LDC is then scaled by BASE_SHIFT.       */
57#define C %o4 58#define LDC %o5 59
/* AO / BO: running pointers into A and B.                                         */
/* I: row-tile counter, J: holds the tested bit of N, L: K-loop trip counter.     */
60#define AO %l0 61#define BO %l1 62#define I %l2 63#define J %l3 64#define L %l4 65
/* Output pointers, one per column of the current pass, each LDC apart. */
66#define C1 %o0 67#define C2 %o1 68#define C3 %o2 69#define C4 %o3 70
/* NOTE(review): C5..C8 are not referenced in the part of the file reviewed here; */
/* presumably they serve an 8-column pass further down - TODO confirm.            */
71#define C5 %l5 72#define C6 %l6 73#define C7 %l7 74#define C8 %i3 75
/* OFFSET: argument loaded from the stack in the prologue.                        */
/* KK: running offset derived from OFFSET, updated after every tile.              */
/* TEMP1 / TEMP2: scratch for address arithmetic.                                 */
/* AORIG: copy of A used as the base for AO in the LN and RT variants.            */
/* The prologue stores %g1..%g4 to the stack.                                     */
76#define OFFSET %g1 77#define KK %g2 78#define TEMP1 %g3 79#define TEMP2 %g4 80#define AORIG %o7 81
/* Floating-point register map, DOUBLE build first.                               */
/* c01.. are result accumulators (c01..c08 are the ones used in the code reviewed */
/* here); a1.. and b1.. hold elements loaded from AO / BO (or, in the solve       */
/* steps, from whichever of the two holds the triangular factor).                 */
82#ifdef DOUBLE 83#define c01 %f0 84#define c02 %f2 85#define c03 %f4 86#define c04 %f6 87#define c05 %f8 88#define c06 %f10 89#define c07 %f12 90#define c08 %f14 91#define c09 %f16 92#define c10 %f18 93#define c11 %f20 94#define c12 %f22 95#define c13 %f24 96#define c14 %f26 97#define c15 %f28 98#define c16 %f30 99 100#define a1 %f32 101#define a2 %f34 102#define a3 %f36 103#define a4 %f38 104#define a5 %f40 105 106#define b1 %f42 107#define b2 %f44 108#define b3 %f46 109#define b4 %f48 110#define b5 %f50 111#define b6 %f52 112#define b7 %f54 113#define b8 %f56 114#define b9 %f58 115
/* ccNN / aaN / bbN are plain integers naming the same registers as cNN / aN /    */
/* bN; they are what the FCLR, FMADD and FNMSUB macros (defined outside this      */
/* part of the file) receive. In the DOUBLE build the values are not the register */
/* numbers themselves: %f32 maps to 1, %f34 to 3, %f42 to 11, and so on.          */
/* NOTE(review): this matches the SPARC V9 5-bit encoding of double registers,    */
/* which folds bit 5 of the register number into bit 0 - confirm against the      */
/* macro definitions. In the single-precision branch the integer equals the       */
/* register number.                                                               */
116#define cc01 0 117#define cc02 2 118#define cc03 4 119#define cc04 6 120#define cc05 8 121#define cc06 10 122#define cc07 12 123#define cc08 14 124#define cc09 16 125#define cc10 18 126#define cc11 20 127#define cc12 22 128#define cc13 24 129#define cc14 26 130#define cc15 28 131#define cc16 30 132 133#define aa1 1 134#define aa2 3 135#define aa3 5 136#define aa4 7 137#define aa5 9 138 139#define bb1 11 140#define bb2 13 141#define bb3 15 142#define bb4 17 143#define bb5 19 144#define bb6 21 145#define bb7 23 146#define bb8 25 147#define bb9 27 148
/* Single-precision build: consecutive %f registers, same roles as above. */
149#else 150#define c01 %f0 151#define c02 %f1 152#define c03 %f2 153#define c04 %f3 154#define c05 %f4 155#define c06
%f5 156#define c07 %f6 157#define c08 %f7 158#define c09 %f8 159#define c10 %f9 160#define c11 %f10 161#define c12 %f11 162#define c13 %f12 163#define c14 %f13 164#define c15 %f14 165#define c16 %f15 166 167#define a1 %f16 168#define a2 %f17 169#define a3 %f18 170#define a4 %f19 171#define a5 %f20 172 173#define b1 %f21 174#define b2 %f22 175#define b3 %f23 176#define b4 %f24 177#define b5 %f25 178#define b6 %f26 179#define b7 %f27 180#define b8 %f28 181#define b9 %f29 182 183#define cc01 0 184#define cc02 1 185#define cc03 2 186#define cc04 3 187#define cc05 4 188#define cc06 5 189#define cc07 6 190#define cc08 7 191#define cc09 8 192#define cc10 9 193#define cc11 10 194#define cc12 11 195#define cc13 12 196#define cc14 13 197#define cc15 14 198#define cc16 15 199 200#define aa1 16 201#define aa2 17 202#define aa3 18 203#define aa4 19 204#define aa5 20 205 206#define bb1 21 207#define bb2 22 208#define bb3 23 209#define bb4 24 210#define bb5 25 211#define bb6 26 212#define bb7 27 213#define bb8 28 214#define bb9 29 215 216#endif 217 218 .register %g2, #scratch 219 .register %g3, #scratch 220 221 PROLOGUE 222 SAVESP 223 nop 224 225#ifndef __64BIT__ 226 227#ifdef DOUBLE 228 ld [%sp + STACK_START + 28], B 229 ld [%sp + STACK_START + 32], C 230 ld [%sp + STACK_START + 36], LDC 231 ld [%sp + STACK_START + 40], OFFSET 232#else 233 ld [%sp + STACK_START + 28], C 234 ld [%sp + STACK_START + 32], LDC 235 ld [%sp + STACK_START + 36], OFFSET 236#endif 237 st %g1, [%sp + STACK_START + 8] 238 st %g2, [%sp + STACK_START + 12] 239 st %g3, [%sp + STACK_START + 16] 240 st %g4, [%sp + STACK_START + 20] 241#else 242 243 ldx [%sp+ STACK_START + 56], C 244 ldx [%sp+ STACK_START + 64], LDC 245 ldx [%sp+ STACK_START + 72], OFFSET 246 247 stx %g1, [%sp + STACK_START + 32] 248 stx %g2, [%sp + STACK_START + 40] 249 stx %g3, [%sp + STACK_START + 48] 250 stx %g4, [%sp + STACK_START + 56] 251#endif 252 253#if defined(TRMMKERNEL) && !defined(LEFT) 254 neg OFFSET, KK 255#endif 256 257 sll LDC, 
BASE_SHIFT, LDC 258 259#ifdef LN 260 smul M, K, TEMP1 261 sll TEMP1, BASE_SHIFT, TEMP1 262 add A, TEMP1, A 263 264 sll M, BASE_SHIFT, TEMP1 265 add C, TEMP1, C 266#endif 267 268#ifdef RN 269 neg OFFSET, KK 270#endif 271 272#ifdef RT 273 smul N, K, TEMP1 274 sll TEMP1, BASE_SHIFT, TEMP1 275 add B, TEMP1, B 276 277 smul N, LDC, TEMP1 278 add C, TEMP1, C 279 280 sub N, OFFSET, KK 281#endif 282 283 and N, 1, J 284 cmp J, 0 285 ble,pn %icc, .LL50 286 nop 287 288#ifdef RT 289 sll K, BASE_SHIFT, TEMP1 290 sub B, TEMP1, B 291#endif 292 293#ifndef RT 294 mov C, C1 295 add C1, LDC, C 296#else 297 sub C, LDC, C1 298 sub C, LDC, C 299#endif 300 301#ifdef LN 302 add M, OFFSET, KK 303#endif 304 305#ifdef LT 306 mov OFFSET, KK 307#endif 308 309#if defined(LN) || defined(RT) 310 mov A, AORIG 311#else 312 mov A, AO 313#endif 314 315 sra M, 1, I 316 cmp I, 0 317 ble,pn %icc, .LL80 318 nop 319 .align 4 320 321.LL72: 322#if defined(LT) || defined(RN) 323 mov B, BO 324#else 325#ifdef LN 326 sll K, BASE_SHIFT + 1, TEMP1 327 sub AORIG, TEMP1, AORIG 328#endif 329 330 sll KK, BASE_SHIFT + 1, TEMP1 331 sll KK, BASE_SHIFT + 0, TEMP2 332 333 add AORIG, TEMP1, AO 334 add B, TEMP2, BO 335#endif 336 337 LDF [AO + 0 * SIZE], a1 338 LDF [AO + 1 * SIZE], a2 339 LDF [AO + 2 * SIZE], a3 340 LDF [AO + 3 * SIZE], a4 341 342 LDF [BO + 0 * SIZE], b1 343 LDF [BO + 1 * SIZE], b2 344 LDF [BO + 2 * SIZE], b3 345 FCLR (cc01) 346 LDF [BO + 3 * SIZE], b4 347 FCLR (cc02) 348 349 prefetch [C1 + 2 * SIZE], 3 350 351#if defined(LT) || defined(RN) 352 sra KK, 2, L 353#else 354 sub K, KK, L 355 sra L, 2, L 356#endif 357 cmp L, 0 358 ble,pn %icc, .LL75 359 nop 360 361.LL73: 362 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 363 add L, -1, L 364 365 FMADD (aa1, bb1, cc01, cc01) 366 LDF [AO + 4 * SIZE], a1 367 FMADD (aa2, bb1, cc02, cc02) 368 LDF [AO + 5 * SIZE], a2 369 370 LDF [BO + 4 * SIZE], b1 371 cmp L, 0 372 373 FMADD (aa3, bb2, cc01, cc01) 374 LDF [AO + 6 * SIZE], a3 375 FMADD (aa4, bb2, cc02, 
cc02) 376 LDF [AO + 7 * SIZE], a4 377 378 LDF [BO + 5 * SIZE], b2 379 add BO, 4 * SIZE, BO 380 381 FMADD (aa1, bb3, cc01, cc01) 382 LDF [AO + 8 * SIZE], a1 383 FMADD (aa2, bb3, cc02, cc02) 384 LDF [AO + 9 * SIZE], a2 385 386 LDF [BO + 2 * SIZE], b3 387 add AO, 8 * SIZE, AO 388 389 FMADD (aa3, bb4, cc01, cc01) 390 LDF [AO + 2 * SIZE], a3 391 FMADD (aa4, bb4, cc02, cc02) 392 LDF [AO + 3 * SIZE], a4 393 394 bg,pt %icc, .LL73 395 LDF [BO + 3 * SIZE], b4 396 .align 4 397 398.LL75: 399#if defined(LT) || defined(RN) 400 and KK, 3, L 401#else 402 sub K, KK, L 403 and L, 3, L 404#endif 405 cmp L, 0 406 ble,a,pn %icc, .LL78 407 nop 408 .align 4 409 410.LL77: 411 FMADD (aa1, bb1, cc01, cc01) 412 LDF [AO + 2 * SIZE], a1 413 FMADD (aa2, bb1, cc02, cc02) 414 LDF [AO + 3 * SIZE], a2 415 416 LDF [BO + 1 * SIZE], b1 417 add L, -1, L 418 add AO, 2 * SIZE, AO 419 cmp L, 0 420 bg,pt %icc, .LL77 421 add BO, 1 * SIZE, BO 422 .align 4 423 424.LL78: 425#if defined(LN) || defined(RT) 426#ifdef LN 427 sub KK, 2, TEMP1 428#else 429 sub KK, 1, TEMP1 430#endif 431 sll TEMP1, BASE_SHIFT + 1, TEMP2 432 sll TEMP1, BASE_SHIFT + 0, TEMP1 433 434 add AORIG, TEMP2, AO 435 add B, TEMP1, BO 436#endif 437 438#if defined(LN) || defined(LT) 439 LDF [BO + 0 * SIZE], a1 440 LDF [BO + 1 * SIZE], a2 441 442 FSUB a1, c01, c01 443 FSUB a2, c02, c02 444#else 445 LDF [AO + 0 * SIZE], a1 446 LDF [AO + 1 * SIZE], a2 447 448 FSUB a1, c01, c01 449 FSUB a2, c02, c02 450#endif 451 452#ifdef LN 453 LDF [AO + 3 * SIZE], a1 454 LDF [AO + 2 * SIZE], a2 455 LDF [AO + 0 * SIZE], a3 456 457 FMUL a1, c02, c02 458 459 FNMSUB (aa2, cc02, cc01, cc01) 460 461 FMUL a3, c01, c01 462#endif 463 464#ifdef LT 465 LDF [AO + 0 * SIZE], a1 466 LDF [AO + 1 * SIZE], a2 467 LDF [AO + 3 * SIZE], a3 468 469 FMUL a1, c01, c01 470 471 FNMSUB (aa2, cc01, cc02, cc02) 472 473 FMUL a3, c02, c02 474#endif 475 476#if defined(RN) || defined(RT) 477 LDF [BO + 0 * SIZE], a1 478 479 FMUL a1, c01, c01 480 FMUL a1, c02, c02 481#endif 482 483#ifdef LN 484 add 
C1, -2 * SIZE, C1 485#endif 486 487#if defined(LN) || defined(LT) 488 STF c01, [BO + 0 * SIZE] 489 STF c02, [BO + 1 * SIZE] 490#else 491 STF c01, [AO + 0 * SIZE] 492 STF c02, [AO + 1 * SIZE] 493#endif 494 495 STF c01, [C1 + 0 * SIZE] 496 STF c02, [C1 + 1 * SIZE] 497 498#ifndef LN 499 add C1, 2 * SIZE, C1 500#endif 501 502#ifdef RT 503 sll K, BASE_SHIFT + 1, TEMP1 504 add AORIG, TEMP1, AORIG 505#endif 506 507#if defined(LT) || defined(RN) 508 sub K, KK, TEMP1 509 sll TEMP1, BASE_SHIFT + 1, TEMP2 510 sll TEMP1, BASE_SHIFT + 0, TEMP1 511 add AO, TEMP2, AO 512 add BO, TEMP1, BO 513#endif 514 515#ifdef LT 516 add KK, 2, KK 517#endif 518 519#ifdef LN 520 sub KK, 2, KK 521#endif 522 523 add I, -1, I 524 cmp I, 0 525 bg,pt %icc, .LL72 526 nop 527 .align 4 528 529.LL80: 530 and M, 1, I 531 cmp I, 0 532 ble,pn %icc, .LL89 533 nop 534 535#if defined(LT) || defined(RN) 536 mov B, BO 537#else 538#ifdef LN 539 sll K, BASE_SHIFT + 0, TEMP1 540 sub AORIG, TEMP1, AORIG 541#endif 542 543 sll KK, BASE_SHIFT + 0, TEMP1 544 sll KK, BASE_SHIFT + 0, TEMP2 545 546 add AORIG, TEMP1, AO 547 add B, TEMP2, BO 548#endif 549 550 LDF [AO + 0 * SIZE], a1 551 LDF [BO + 0 * SIZE], b1 552 LDF [AO + 1 * SIZE], a2 553 LDF [BO + 1 * SIZE], b2 554 LDF [AO + 2 * SIZE], a3 555 LDF [BO + 2 * SIZE], b3 556 LDF [AO + 3 * SIZE], a4 557 LDF [BO + 3 * SIZE], b4 558 559#if defined(LT) || defined(RN) 560 sra KK, 2, L 561#else 562 sub K, KK, L 563 sra L, 2, L 564#endif 565 cmp L, 0 566 ble,pn %icc, .LL85 567 FCLR (cc01) 568 .align 4 569 570.LL83: 571 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 572 add L, -1, L 573 574 FMADD (aa1, bb1, cc01, cc01) 575 LDF [AO + 4 * SIZE], a1 576 LDF [BO + 4 * SIZE], b1 577 578 FMADD (aa2, bb2, cc01, cc01) 579 LDF [AO + 5 * SIZE], a2 580 LDF [BO + 5 * SIZE], b2 581 582 FMADD (aa3, bb3, cc01, cc01) 583 LDF [AO + 6 * SIZE], a3 584 LDF [BO + 6 * SIZE], b3 585 586 FMADD (aa4, bb4, cc01, cc01) 587 LDF [AO + 7 * SIZE], a4 588 LDF [BO + 7 * SIZE], b4 589 590 add AO, 4 * 
SIZE, AO 591 cmp L, 0 592 593 bg,pt %icc, .LL83 594 add BO, 4 * SIZE, BO 595 .align 4 596 597.LL85: 598#if defined(LT) || defined(RN) 599 and KK, 3, L 600#else 601 sub K, KK, L 602 and L, 3, L 603#endif 604 cmp L, 0 605 ble,a,pn %icc, .LL88 606 nop 607 .align 4 608 609.LL87: 610 FMADD (aa1, bb1, cc01, cc01) 611 LDF [AO + 1 * SIZE], a1 612 LDF [BO + 1 * SIZE], b1 613 614 add AO, 1 * SIZE, AO 615 add L, -1, L 616 cmp L, 0 617 bg,pt %icc, .LL87 618 add BO, 1 * SIZE, BO 619 .align 4 620 621.LL88: 622#if defined(LN) || defined(RT) 623#ifdef LN 624 sub KK, 1, TEMP1 625#else 626 sub KK, 1, TEMP1 627#endif 628 sll TEMP1, BASE_SHIFT + 0, TEMP2 629 sll TEMP1, BASE_SHIFT + 0, TEMP1 630 631 add AORIG, TEMP2, AO 632 add B, TEMP1, BO 633#endif 634 635#if defined(LN) || defined(LT) 636 LDF [BO + 0 * SIZE], a1 637 638 FSUB a1, c01, c01 639#else 640 LDF [AO + 0 * SIZE], a1 641 642 FSUB a1, c01, c01 643#endif 644 645#if defined(LN) || defined(LT) 646 LDF [AO + 0 * SIZE], a1 647 648 FMUL a1, c01, c01 649#endif 650 651#if defined(RN) || defined(RT) 652 LDF [BO + 0 * SIZE], a1 653 654 FMUL a1, c01, c01 655#endif 656 657#ifdef LN 658 add C1, -1 * SIZE, C1 659#endif 660 661#if defined(LN) || defined(LT) 662 STF c01, [BO + 0 * SIZE] 663#else 664 STF c01, [AO + 0 * SIZE] 665#endif 666 667 STF c01, [C1 + 0 * SIZE] 668 669#ifdef RT 670 sll K, BASE_SHIFT + 0, TEMP1 671 add AORIG, TEMP1, AORIG 672#endif 673 674#if defined(LT) || defined(RN) 675 sub K, KK, TEMP1 676 sll TEMP1, BASE_SHIFT + 0, TEMP2 677 sll TEMP1, BASE_SHIFT + 0, TEMP1 678 add AO, TEMP2, AO 679 add BO, TEMP1, BO 680#endif 681 682#ifdef LT 683 add KK, 1, KK 684#endif 685 686#ifdef LN 687 sub KK, 1, KK 688#endif 689 .align 4 690 691.LL89: 692#ifdef LN 693 sll K, BASE_SHIFT, TEMP1 694 add B, TEMP1, B 695#endif 696 697#if defined(LT) || defined(RN) 698 mov BO, B 699#endif 700 701#ifdef RN 702 add KK, 1, KK 703#endif 704 705#ifdef RT 706 sub KK, 1, KK 707#endif 708 .align 4 709 710.LL50: 711 and N, 2, J 712 cmp J, 0 713 ble,pn %icc, 
.LL30 714 nop 715 716#ifdef RT 717 sll K, BASE_SHIFT + 1, TEMP1 718 sub B, TEMP1, B 719#endif 720 721#ifndef RT 722 mov C, C1 723 add C, LDC, C2 724 add C2, LDC, C 725#else 726 sub C, LDC, C2 727 sub C2, LDC, C1 728 sub C2, LDC, C 729#endif 730 731#ifdef LN 732 add M, OFFSET, KK 733#endif 734 735#ifdef LT 736 mov OFFSET, KK 737#endif 738 739#if defined(LN) || defined(RT) 740 mov A, AORIG 741#else 742 mov A, AO 743#endif 744 745 sra M, 1, I 746 cmp I, 0 747 ble,pn %icc, .LL60 748 nop 749 .align 4 750 751.LL52: 752#if defined(LT) || defined(RN) 753 mov B, BO 754#else 755#ifdef LN 756 sll K, BASE_SHIFT + 1, TEMP1 757 sub AORIG, TEMP1, AORIG 758#endif 759 760 sll KK, BASE_SHIFT + 1, TEMP1 761 sll KK, BASE_SHIFT + 1, TEMP2 762 763 add AORIG, TEMP1, AO 764 add B, TEMP2, BO 765#endif 766 767 LDF [AO + 0 * SIZE], a1 768 LDF [AO + 1 * SIZE], a2 769 LDF [AO + 2 * SIZE], a3 770 LDF [AO + 3 * SIZE], a4 771 772 LDF [BO + 0 * SIZE], b1 773 LDF [BO + 1 * SIZE], b2 774 LDF [BO + 2 * SIZE], b3 775 FCLR (cc01) 776 LDF [BO + 3 * SIZE], b4 777 FCLR (cc02) 778 779 LDF [BO + 4 * SIZE], b5 780 FCLR (cc03) 781 LDF [BO + 5 * SIZE], b6 782 FCLR (cc04) 783 LDF [BO + 6 * SIZE], b7 784 FCLR (cc05) 785 LDF [BO + 7 * SIZE], b8 786 FCLR (cc06) 787 788 prefetch [C1 + 2 * SIZE], 3 789 FCLR (cc07) 790 prefetch [C2 + 2 * SIZE], 3 791 FCLR (cc08) 792 793#if defined(LT) || defined(RN) 794 sra KK, 2, L 795#else 796 sub K, KK, L 797 sra L, 2, L 798#endif 799 cmp L, 0 800 ble,pn %icc, .LL55 801 nop 802 .align 4 803 804.LL53: 805 FMADD (aa1, bb1, cc01, cc01) 806 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 807 FMADD (aa2, bb1, cc02, cc02) 808 LDF [BO + 8 * SIZE], b1 809 810 FMADD (aa1, bb2, cc03, cc03) 811 LDF [AO + 4 * SIZE], a1 812 FMADD (aa2, bb2, cc04, cc04) 813 LDF [AO + 5 * SIZE], a2 814 815 FMADD (aa3, bb3, cc01, cc01) 816 LDF [BO + 9 * SIZE], b2 817 FMADD (aa4, bb3, cc02, cc02) 818 LDF [BO + 10 * SIZE], b3 819 820 FMADD (aa3, bb4, cc03, cc03) 821 LDF [AO + 6 * SIZE], a3 822 FMADD 
(aa4, bb4, cc04, cc04) 823 LDF [AO + 7 * SIZE], a4 824 825 FMADD (aa1, bb5, cc01, cc01) 826 LDF [BO + 11 * SIZE], b4 827 FMADD (aa2, bb5, cc02, cc02) 828 LDF [BO + 12 * SIZE], b5 829 830 FMADD (aa1, bb6, cc03, cc03) 831 LDF [AO + 8 * SIZE], a1 832 FMADD (aa2, bb6, cc04, cc04) 833 LDF [AO + 9 * SIZE], a2 834 835 FMADD (aa3, bb7, cc01, cc01) 836 LDF [BO + 13 * SIZE], b6 837 838 FMADD (aa4, bb7, cc02, cc02) 839 LDF [BO + 14 * SIZE], b7 840 841 FMADD (aa3, bb8, cc03, cc03) 842 LDF [AO + 10 * SIZE], a3 843 FMADD (aa4, bb8, cc04, cc04) 844 LDF [AO + 11 * SIZE], a4 845 846 add AO, 8 * SIZE, AO 847 add L, -1, L 848 add BO, 8 * SIZE, BO 849 cmp L, 0 850 851 bg,pt %icc, .LL53 852 LDF [BO + 7 * SIZE], b8 853 .align 4 854 855.LL55: 856#if defined(LT) || defined(RN) 857 and KK, 3, L 858#else 859 sub K, KK, L 860 and L, 3, L 861#endif 862 cmp L, 0 863 ble,a,pn %icc, .LL58 864 nop 865 .align 4 866 867.LL57: 868 FMADD (aa1, bb1, cc01, cc01) 869 add L, -1, L 870 FMADD (aa2, bb1, cc02, cc02) 871 LDF [BO + 2 * SIZE], b1 872 873 FMADD (aa1, bb2, cc03, cc03) 874 LDF [AO + 2 * SIZE], a1 875 FMADD (aa2, bb2, cc04, cc04) 876 LDF [AO + 3 * SIZE], a2 877 878 add AO, 2 * SIZE, AO 879 cmp L, 0 880 add BO, 2 * SIZE, BO 881 bg,pt %icc, .LL57 882 LDF [BO + 1 * SIZE], b2 883 .align 4 884 885.LL58: 886#if defined(LN) || defined(RT) 887#ifdef LN 888 sub KK, 2, TEMP1 889#else 890 sub KK, 2, TEMP1 891#endif 892 sll TEMP1, BASE_SHIFT + 1, TEMP2 893 sll TEMP1, BASE_SHIFT + 1, TEMP1 894 895 add AORIG, TEMP2, AO 896 add B, TEMP1, BO 897#endif 898 899#if defined(LN) || defined(LT) 900 LDF [BO + 0 * SIZE], a1 901 LDF [BO + 1 * SIZE], a2 902 LDF [BO + 2 * SIZE], a3 903 LDF [BO + 3 * SIZE], a4 904 905 FSUB a1, c01, c01 906 FSUB a2, c03, c03 907 FSUB a3, c02, c02 908 FSUB a4, c04, c04 909#else 910 LDF [AO + 0 * SIZE], a1 911 LDF [AO + 1 * SIZE], a2 912 LDF [AO + 2 * SIZE], a3 913 LDF [AO + 3 * SIZE], a4 914 915 FSUB a1, c01, c01 916 FSUB a2, c02, c02 917 FSUB a3, c03, c03 918 FSUB a4, c04, c04 919#endif 920 
921#ifdef LN 922 LDF [AO + 3 * SIZE], a1 923 LDF [AO + 2 * SIZE], a2 924 LDF [AO + 0 * SIZE], a3 925 926 FMUL a1, c02, c02 927 FMUL a1, c04, c04 928 929 FNMSUB (aa2, cc02, cc01, cc01) 930 FNMSUB (aa2, cc04, cc03, cc03) 931 932 FMUL a3, c01, c01 933 FMUL a3, c03, c03 934#endif 935 936#ifdef LT 937 LDF [AO + 0 * SIZE], a1 938 LDF [AO + 1 * SIZE], a2 939 LDF [AO + 3 * SIZE], a3 940 941 FMUL a1, c01, c01 942 FMUL a1, c03, c03 943 944 FNMSUB (aa2, cc01, cc02, cc02) 945 FNMSUB (aa2, cc03, cc04, cc04) 946 947 FMUL a3, c02, c02 948 FMUL a3, c04, c04 949#endif 950 951#ifdef RN 952 LDF [BO + 0 * SIZE], a1 953 LDF [BO + 1 * SIZE], a2 954 955 FMUL a1, c01, c01 956 FMUL a1, c02, c02 957 958 FNMSUB (aa2, cc01, cc03, cc03) 959 FNMSUB (aa2, cc02, cc04, cc04) 960 961 LDF [BO + 3 * SIZE], a1 962 963 FMUL a1, c03, c03 964 FMUL a1, c04, c04 965#endif 966 967#ifdef RT 968 LDF [BO + 3 * SIZE], a1 969 LDF [BO + 2 * SIZE], a2 970 971 FMUL a1, c04, c04 972 FMUL a1, c03, c03 973 974 FNMSUB (aa2, cc04, cc02, cc02) 975 FNMSUB (aa2, cc03, cc01, cc01) 976 977 LDF [BO + 0 * SIZE], a1 978 979 FMUL a1, c02, c02 980 FMUL a1, c01, c01 981#endif 982 983#ifdef LN 984 add C1, -2 * SIZE, C1 985 add C2, -2 * SIZE, C2 986#endif 987 988#if defined(LN) || defined(LT) 989 STF c01, [BO + 0 * SIZE] 990 STF c03, [BO + 1 * SIZE] 991 STF c02, [BO + 2 * SIZE] 992 STF c04, [BO + 3 * SIZE] 993#else 994 STF c01, [AO + 0 * SIZE] 995 STF c02, [AO + 1 * SIZE] 996 STF c03, [AO + 2 * SIZE] 997 STF c04, [AO + 3 * SIZE] 998#endif 999 1000 STF c01, [C1 + 0 * SIZE] 1001 STF c02, [C1 + 1 * SIZE] 1002 STF c03, [C2 + 0 * SIZE] 1003 STF c04, [C2 + 1 * SIZE] 1004 1005#ifndef LN 1006 add C1, 2 * SIZE, C1 1007 add C2, 2 * SIZE, C2 1008#endif 1009 1010#ifdef RT 1011 sll K, BASE_SHIFT + 1, TEMP1 1012 add AORIG, TEMP1, AORIG 1013#endif 1014 1015#if defined(LT) || defined(RN) 1016 sub K, KK, TEMP1 1017 sll TEMP1, BASE_SHIFT + 1, TEMP2 1018 sll TEMP1, BASE_SHIFT + 1, TEMP1 1019 add AO, TEMP2, AO 1020 add BO, TEMP1, BO 1021#endif 1022 
1023#ifdef LT 1024 add KK, 2, KK 1025#endif 1026 1027#ifdef LN 1028 sub KK, 2, KK 1029#endif 1030 1031 add I, -1, I 1032 cmp I, 0 1033 bg,pt %icc, .LL52 1034 nop 1035 .align 4 1036 1037.LL60: 1038 and M, 1, I 1039 cmp I, 0 1040 ble,pn %icc, .LL69 1041 nop 1042 1043#if defined(LT) || defined(RN) 1044 mov B, BO 1045#else 1046#ifdef LN 1047 sll K, BASE_SHIFT + 0, TEMP1 1048 sub AORIG, TEMP1, AORIG 1049#endif 1050 1051 sll KK, BASE_SHIFT + 0, TEMP1 1052 sll KK, BASE_SHIFT + 1, TEMP2 1053 1054 add AORIG, TEMP1, AO 1055 add B, TEMP2, BO 1056#endif 1057 1058 LDF [AO + 0 * SIZE], a1 1059 LDF [AO + 1 * SIZE], a2 1060 LDF [AO + 2 * SIZE], a3 1061 LDF [AO + 3 * SIZE], a4 1062 1063 LDF [BO + 0 * SIZE], b1 1064 LDF [BO + 1 * SIZE], b2 1065 LDF [BO + 2 * SIZE], b3 1066 LDF [BO + 3 * SIZE], b4 1067 LDF [BO + 4 * SIZE], b5 1068 LDF [BO + 5 * SIZE], b6 1069 LDF [BO + 6 * SIZE], b7 1070 FCLR (cc01) 1071 LDF [BO + 7 * SIZE], b8 1072 FCLR (cc03) 1073 1074#if defined(LT) || defined(RN) 1075 sra KK, 2, L 1076#else 1077 sub K, KK, L 1078 sra L, 2, L 1079#endif 1080 cmp L, 0 1081 ble,pn %icc, .LL65 1082 nop 1083 .align 4 1084 1085.LL63: 1086 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1087 add L, -1, L 1088 1089 FMADD (aa1, bb1, cc01, cc01) 1090 LDF [BO + 8 * SIZE], b1 1091 FMADD (aa1, bb2, cc03, cc03) 1092 LDF [BO + 9 * SIZE], b2 1093 1094 LDF [AO + 4 * SIZE], a1 1095 cmp L, 0 1096 1097 FMADD (aa2, bb3, cc01, cc01) 1098 LDF [BO + 10 * SIZE], b3 1099 FMADD (aa2, bb4, cc03, cc03) 1100 LDF [BO + 11 * SIZE], b4 1101 1102 LDF [AO + 5 * SIZE], a2 1103 add AO, 4 * SIZE, AO 1104 1105 FMADD (aa3, bb5, cc01, cc01) 1106 LDF [BO + 12 * SIZE], b5 1107 FMADD (aa3, bb6, cc03, cc03) 1108 LDF [BO + 13 * SIZE], b6 1109 1110 LDF [AO + 2 * SIZE], a3 1111 add BO, 8 * SIZE, BO 1112 1113 FMADD (aa4, bb7, cc01, cc01) 1114 LDF [BO + 6 * SIZE], b7 1115 FMADD (aa4, bb8, cc03, cc03) 1116 LDF [BO + 7 * SIZE], b8 1117 1118 bg,pt %icc, .LL63 1119 LDF [AO + 3 * SIZE], a4 1120 .align 4 1121 1122.LL65: 
1123#if defined(LT) || defined(RN) 1124 and KK, 3, L 1125#else 1126 sub K, KK, L 1127 and L, 3, L 1128#endif 1129 cmp L, 0 1130 ble,a,pn %icc, .LL68 1131 nop 1132 .align 4 1133 1134.LL67: 1135 FMADD (aa1, bb1, cc01, cc01) 1136 LDF [BO + 2 * SIZE], b1 1137 FMADD (aa1, bb2, cc03, cc03) 1138 LDF [BO + 3 * SIZE], b2 1139 1140 LDF [AO + 1 * SIZE], a1 1141 add L, -1, L 1142 add AO, 1 * SIZE, AO 1143 cmp L, 0 1144 1145 bg,pt %icc, .LL67 1146 add BO, 2 * SIZE, BO 1147 .align 4 1148 1149.LL68: 1150#if defined(LN) || defined(RT) 1151#ifdef LN 1152 sub KK, 1, TEMP1 1153#else 1154 sub KK, 2, TEMP1 1155#endif 1156 sll TEMP1, BASE_SHIFT + 0, TEMP2 1157 sll TEMP1, BASE_SHIFT + 1, TEMP1 1158 1159 add AORIG, TEMP2, AO 1160 add B, TEMP1, BO 1161#endif 1162 1163#if defined(LN) || defined(LT) 1164 LDF [BO + 0 * SIZE], a1 1165 LDF [BO + 1 * SIZE], a2 1166 1167 FSUB a1, c01, c01 1168 FSUB a2, c03, c03 1169#else 1170 LDF [AO + 0 * SIZE], a1 1171 LDF [AO + 1 * SIZE], a2 1172 1173 FSUB a1, c01, c01 1174 FSUB a2, c03, c03 1175#endif 1176 1177#if defined(LN) || defined(LT) 1178 LDF [AO + 0 * SIZE], a1 1179 1180 FMUL a1, c01, c01 1181 FMUL a1, c03, c03 1182#endif 1183 1184#ifdef RN 1185 LDF [BO + 0 * SIZE], a1 1186 LDF [BO + 1 * SIZE], a2 1187 1188 FMUL a1, c01, c01 1189 1190 FNMSUB (aa2, cc01, cc03, cc03) 1191 1192 LDF [BO + 3 * SIZE], a1 1193 1194 FMUL a1, c03, c03 1195#endif 1196 1197#ifdef RT 1198 LDF [BO + 3 * SIZE], a1 1199 LDF [BO + 2 * SIZE], a2 1200 1201 FMUL a1, c03, c03 1202 1203 FNMSUB (aa2, cc03, cc01, cc01) 1204 1205 LDF [BO + 0 * SIZE], a1 1206 1207 FMUL a1, c01, c01 1208#endif 1209 1210#ifdef LN 1211 add C1, -1 * SIZE, C1 1212 add C2, -1 * SIZE, C2 1213#endif 1214 1215#if defined(LN) || defined(LT) 1216 STF c01, [BO + 0 * SIZE] 1217 STF c03, [BO + 1 * SIZE] 1218#else 1219 STF c01, [AO + 0 * SIZE] 1220 STF c03, [AO + 1 * SIZE] 1221#endif 1222 1223 STF c01, [C1 + 0 * SIZE] 1224 STF c03, [C2 + 0 * SIZE] 1225 1226#ifdef RT 1227 sll K, BASE_SHIFT + 0, TEMP1 1228 add AORIG, TEMP1, 
AORIG 1229#endif 1230 1231#if defined(LT) || defined(RN) 1232 sub K, KK, TEMP1 1233 sll TEMP1, BASE_SHIFT + 0, TEMP2 1234 sll TEMP1, BASE_SHIFT + 1, TEMP1 1235 add AO, TEMP2, AO 1236 add BO, TEMP1, BO 1237#endif 1238 1239#ifdef LT 1240 add KK, 1, KK 1241#endif 1242 1243#ifdef LN 1244 sub KK, 1, KK 1245#endif 1246 .align 4 1247 1248.LL69: 1249#ifdef LN 1250 sll K, BASE_SHIFT + 1, TEMP1 1251 add B, TEMP1, B 1252#endif 1253 1254#if defined(LT) || defined(RN) 1255 mov BO, B 1256#endif 1257 1258#ifdef RN 1259 add KK, 2, KK 1260#endif 1261 1262#ifdef RT 1263 sub KK, 2, KK 1264#endif 1265 .align 4 1266 1267.LL30: 1268 and N, 4, J 1269 cmp J, 0 1270 ble,pn %icc, .LL10 1271 nop 1272 1273#ifdef RT 1274 sll K, BASE_SHIFT + 2, TEMP1 1275 sub B, TEMP1, B 1276#endif 1277 1278#ifndef RT 1279 mov C, C1 1280 add C, LDC, C2 1281 add C2, LDC, C3 1282 add C3, LDC, C4 1283 add C4, LDC, C 1284#else 1285 sub C, LDC, C4 1286 sub C4, LDC, C3 1287 sub C3, LDC, C2 1288 sub C2, LDC, C1 1289 sub C2, LDC, C 1290#endif 1291 1292#ifdef LN 1293 add M, OFFSET, KK 1294#endif 1295 1296#ifdef LT 1297 mov OFFSET, KK 1298#endif 1299 1300#if defined(LN) || defined(RT) 1301 mov A, AORIG 1302#else 1303 mov A, AO 1304#endif 1305 1306 sra M, 1, I 1307 cmp I, 0 1308 ble,pn %icc, .LL40 1309 nop 1310 .align 4 1311 1312.LL32: 1313#if defined(LT) || defined(RN) 1314 mov B, BO 1315#else 1316#ifdef LN 1317 sll K, BASE_SHIFT + 1, TEMP1 1318 sub AORIG, TEMP1, AORIG 1319#endif 1320 1321 sll KK, BASE_SHIFT + 1, TEMP1 1322 sll KK, BASE_SHIFT + 2, TEMP2 1323 1324 add AORIG, TEMP1, AO 1325 add B, TEMP2, BO 1326#endif 1327 1328 LDF [AO + 0 * SIZE], a1 1329 LDF [AO + 1 * SIZE], a2 1330 1331 LDF [BO + 0 * SIZE], b1 1332 LDF [BO + 1 * SIZE], b2 1333 LDF [BO + 2 * SIZE], b3 1334 LDF [BO + 3 * SIZE], b4 1335 LDF [BO + 4 * SIZE], b5 1336 1337 LDF [BO + 5 * SIZE], b6 1338 FCLR (cc01) 1339 LDF [BO + 6 * SIZE], b7 1340 FCLR (cc02) 1341 LDF [BO + 7 * SIZE], b8 1342 FCLR (cc03) 1343 LDF [BO + 8 * SIZE], b9 1344 FCLR (cc04) 1345 1346 
prefetch [C1 + 2 * SIZE], 3 1347 FCLR (cc05) 1348 prefetch [C2 + 2 * SIZE], 3 1349 FCLR (cc06) 1350 prefetch [C3 + 2 * SIZE], 3 1351 FCLR (cc07) 1352 prefetch [C4 + 2 * SIZE], 3 1353 FCLR (cc08) 1354 1355#if defined(LT) || defined(RN) 1356 sra KK, 2, L 1357#else 1358 sub K, KK, L 1359 sra L, 2, L 1360#endif 1361 cmp L, 0 1362 ble,pn %icc, .LL35 1363 nop 1364 .align 4 1365 1366.LL33: 1367 FMADD (aa1, bb1, cc01, cc01) 1368 LDF [AO + 2 * SIZE], a3 1369 FMADD (aa2, bb1, cc02, cc02) 1370 LDF [AO + 3 * SIZE], a4 1371 1372 FMADD (aa1, bb2, cc03, cc03) 1373 LDF [BO + 16 * SIZE], b1 1374 FMADD (aa2, bb2, cc04, cc04) 1375 LDF [BO + 9 * SIZE], b2 1376 1377 FMADD (aa1, bb3, cc05, cc05) 1378 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1379 FMADD (aa2, bb3, cc06, cc06) 1380 add L, -1, L 1381 1382 FMADD (aa1, bb4, cc07, cc07) 1383 LDF [BO + 10 * SIZE], b3 1384 FMADD (aa2, bb4, cc08, cc08) 1385 LDF [BO + 11 * SIZE], b4 1386 1387 FMADD (aa3, bb5, cc01, cc01) 1388 LDF [AO + 4 * SIZE], a1 1389 FMADD (aa4, bb5, cc02, cc02) 1390 LDF [AO + 5 * SIZE], a2 1391 1392 FMADD (aa3, bb6, cc03, cc03) 1393 LDF [BO + 12 * SIZE], b5 1394 FMADD (aa4, bb6, cc04, cc04) 1395 LDF [BO + 13 * SIZE], b6 1396 1397 FMADD (aa3, bb7, cc05, cc05) 1398 cmp L, 0 1399 FMADD (aa4, bb7, cc06, cc06) 1400 add AO, 8 * SIZE, AO 1401 1402 FMADD (aa3, bb8, cc07, cc07) 1403 LDF [BO + 14 * SIZE], b7 1404 FMADD (aa4, bb8, cc08, cc08) 1405 LDF [BO + 15 * SIZE], b8 1406 1407 FMADD (aa1, bb9, cc01, cc01) 1408 LDF [AO - 2 * SIZE], a3 1409 FMADD (aa2, bb9, cc02, cc02) 1410 LDF [AO - 1 * SIZE], a4 1411 1412 FMADD (aa1, bb2, cc03, cc03) 1413 LDF [BO + 24 * SIZE], b9 1414 FMADD (aa2, bb2, cc04, cc04) 1415 LDF [BO + 17 * SIZE], b2 1416 1417 FMADD (aa1, bb3, cc05, cc05) 1418 add BO, 16 * SIZE, BO 1419 FMADD (aa2, bb3, cc06, cc06) 1420 nop 1421 1422 FMADD (aa1, bb4, cc07, cc07) 1423 LDF [BO + 2 * SIZE], b3 1424 FMADD (aa2, bb4, cc08, cc08) 1425 LDF [BO + 3 * SIZE], b4 1426 1427 FMADD (aa3, bb5, cc01, cc01) 1428 LDF 
[AO + 0 * SIZE], a1 1429 FMADD (aa4, bb5, cc02, cc02) 1430 LDF [AO + 1 * SIZE], a2 1431 FMADD (aa3, bb6, cc03, cc03) 1432 LDF [BO + 4 * SIZE], b5 1433 FMADD (aa4, bb6, cc04, cc04) 1434 LDF [BO + 5 * SIZE], b6 1435 1436 FMADD (aa3, bb7, cc05, cc05) 1437 nop 1438 FMADD (aa4, bb7, cc06, cc06) 1439 LDF [BO + 6 * SIZE], b7 1440 1441 FMADD (aa3, bb8, cc07, cc07) 1442 FMADD (aa4, bb8, cc08, cc08) 1443 bg,pt %icc, .LL33 1444 LDF [BO + 7 * SIZE], b8 1445 .align 4 1446 1447.LL35: 1448#if defined(LT) || defined(RN) 1449 and KK, 3, L 1450#else 1451 sub K, KK, L 1452 and L, 3, L 1453#endif 1454 cmp L, 0 1455 ble,a,pn %icc, .LL38 1456 nop 1457 .align 4 1458 1459.LL37: 1460 FMADD (aa1, bb1, cc01, cc01) 1461 add L, -1, L 1462 FMADD (aa2, bb1, cc02, cc02) 1463 LDF [BO + 4 * SIZE], b1 1464 1465 FMADD (aa1, bb2, cc03, cc03) 1466 add AO, 2 * SIZE, AO 1467 FMADD (aa2, bb2, cc04, cc04) 1468 LDF [BO + 5 * SIZE], b2 1469 1470 FMADD (aa1, bb3, cc05, cc05) 1471 cmp L, 0 1472 FMADD (aa2, bb3, cc06, cc06) 1473 LDF [BO + 6 * SIZE], b3 1474 1475 FMADD (aa1, bb4, cc07, cc07) 1476 LDF [AO + 0 * SIZE], a1 1477 FMADD (aa2, bb4, cc08, cc08) 1478 LDF [AO + 1 * SIZE], a2 1479 1480 LDF [BO + 7 * SIZE], b4 1481 bg,pt %icc, .LL37 1482 add BO, 4 * SIZE, BO 1483 .align 4 1484 1485.LL38: 1486#if defined(LN) || defined(RT) 1487#ifdef LN 1488 sub KK, 2, TEMP1 1489#else 1490 sub KK, 4, TEMP1 1491#endif 1492 sll TEMP1, BASE_SHIFT + 1, TEMP2 1493 sll TEMP1, BASE_SHIFT + 2, TEMP1 1494 1495 add AORIG, TEMP2, AO 1496 add B, TEMP1, BO 1497#endif 1498 1499#if defined(LN) || defined(LT) 1500 LDF [BO + 0 * SIZE], a1 1501 LDF [BO + 1 * SIZE], a2 1502 LDF [BO + 2 * SIZE], a3 1503 LDF [BO + 3 * SIZE], a4 1504 1505 LDF [BO + 4 * SIZE], b1 1506 LDF [BO + 5 * SIZE], b2 1507 LDF [BO + 6 * SIZE], b3 1508 LDF [BO + 7 * SIZE], b4 1509 1510 FSUB a1, c01, c01 1511 FSUB a2, c03, c03 1512 FSUB a3, c05, c05 1513 FSUB a4, c07, c07 1514 1515 FSUB b1, c02, c02 1516 FSUB b2, c04, c04 1517 FSUB b3, c06, c06 1518 FSUB b4, c08, c08 
1519#else 1520 LDF [AO + 0 * SIZE], a1 1521 LDF [AO + 1 * SIZE], a2 1522 LDF [AO + 2 * SIZE], a3 1523 LDF [AO + 3 * SIZE], a4 1524 1525 LDF [AO + 4 * SIZE], b1 1526 LDF [AO + 5 * SIZE], b2 1527 LDF [AO + 6 * SIZE], b3 1528 LDF [AO + 7 * SIZE], b4 1529 1530 FSUB a1, c01, c01 1531 FSUB a2, c02, c02 1532 FSUB a3, c03, c03 1533 FSUB a4, c04, c04 1534 1535 FSUB b1, c05, c05 1536 FSUB b2, c06, c06 1537 FSUB b3, c07, c07 1538 FSUB b4, c08, c08 1539 1540#endif 1541 1542#ifdef LN 1543 LDF [AO + 3 * SIZE], a1 1544 LDF [AO + 2 * SIZE], a2 1545 LDF [AO + 0 * SIZE], a3 1546 1547 FMUL a1, c02, c02 1548 FMUL a1, c04, c04 1549 FMUL a1, c06, c06 1550 FMUL a1, c08, c08 1551 1552 FNMSUB (aa2, cc02, cc01, cc01) 1553 FNMSUB (aa2, cc04, cc03, cc03) 1554 FNMSUB (aa2, cc06, cc05, cc05) 1555 FNMSUB (aa2, cc08, cc07, cc07) 1556 1557 FMUL a3, c01, c01 1558 FMUL a3, c03, c03 1559 FMUL a3, c05, c05 1560 FMUL a3, c07, c07 1561#endif 1562 1563#ifdef LT 1564 LDF [AO + 0 * SIZE], a1 1565 LDF [AO + 1 * SIZE], a2 1566 LDF [AO + 3 * SIZE], a3 1567 1568 FMUL a1, c01, c01 1569 FMUL a1, c03, c03 1570 FMUL a1, c05, c05 1571 FMUL a1, c07, c07 1572 1573 FNMSUB (aa2, cc01, cc02, cc02) 1574 FNMSUB (aa2, cc03, cc04, cc04) 1575 FNMSUB (aa2, cc05, cc06, cc06) 1576 FNMSUB (aa2, cc07, cc08, cc08) 1577 1578 FMUL a3, c02, c02 1579 FMUL a3, c04, c04 1580 FMUL a3, c06, c06 1581 FMUL a3, c08, c08 1582#endif 1583 1584#ifdef RN 1585 LDF [BO + 0 * SIZE], a1 1586 LDF [BO + 1 * SIZE], a2 1587 LDF [BO + 2 * SIZE], a3 1588 LDF [BO + 3 * SIZE], a4 1589 1590 FMUL a1, c01, c01 1591 FMUL a1, c02, c02 1592 1593 FNMSUB (aa2, cc01, cc03, cc03) 1594 FNMSUB (aa2, cc02, cc04, cc04) 1595 FNMSUB (aa3, cc01, cc05, cc05) 1596 FNMSUB (aa3, cc02, cc06, cc06) 1597 FNMSUB (aa4, cc01, cc07, cc07) 1598 FNMSUB (aa4, cc02, cc08, cc08) 1599 1600 LDF [BO + 5 * SIZE], a1 1601 LDF [BO + 6 * SIZE], a2 1602 LDF [BO + 7 * SIZE], a3 1603 1604 FMUL a1, c03, c03 1605 FMUL a1, c04, c04 1606 1607 FNMSUB (aa2, cc03, cc05, cc05) 1608 FNMSUB (aa2, cc04, cc06, 
cc06) 1609 FNMSUB (aa3, cc03, cc07, cc07) 1610 FNMSUB (aa3, cc04, cc08, cc08) 1611 1612 LDF [BO + 10 * SIZE], a1 1613 LDF [BO + 11 * SIZE], a2 1614 1615 FMUL a1, c05, c05 1616 FMUL a1, c06, c06 1617 1618 FNMSUB (aa2, cc05, cc07, cc07) 1619 FNMSUB (aa2, cc06, cc08, cc08) 1620 1621 LDF [BO + 15 * SIZE], a1 1622 1623 FMUL a1, c07, c07 1624 FMUL a1, c08, c08 1625#endif 1626 1627#ifdef RT 1628 LDF [BO + 15 * SIZE], a1 1629 LDF [BO + 14 * SIZE], a2 1630 LDF [BO + 13 * SIZE], a3 1631 LDF [BO + 12 * SIZE], a4 1632 1633 FMUL a1, c08, c08 1634 FMUL a1, c07, c07 1635 1636 FNMSUB (aa2, cc08, cc06, cc06) 1637 FNMSUB (aa2, cc07, cc05, cc05) 1638 FNMSUB (aa3, cc08, cc04, cc04) 1639 FNMSUB (aa3, cc07, cc03, cc03) 1640 FNMSUB (aa4, cc08, cc02, cc02) 1641 FNMSUB (aa4, cc07, cc01, cc01) 1642 1643 LDF [BO + 10 * SIZE], a1 1644 LDF [BO + 9 * SIZE], a2 1645 LDF [BO + 8 * SIZE], a3 1646 1647 FMUL a1, c06, c06 1648 FMUL a1, c05, c05 1649 1650 FNMSUB (aa2, cc06, cc04, cc04) 1651 FNMSUB (aa2, cc05, cc03, cc03) 1652 FNMSUB (aa3, cc06, cc02, cc02) 1653 FNMSUB (aa3, cc05, cc01, cc01) 1654 1655 LDF [BO + 5 * SIZE], a1 1656 LDF [BO + 4 * SIZE], a2 1657 1658 FMUL a1, c04, c04 1659 FMUL a1, c03, c03 1660 1661 FNMSUB (aa2, cc04, cc02, cc02) 1662 FNMSUB (aa2, cc03, cc01, cc01) 1663 1664 LDF [BO + 0 * SIZE], a1 1665 1666 FMUL a1, c02, c02 1667 FMUL a1, c01, c01 1668#endif 1669 1670#ifdef LN 1671 add C1, -2 * SIZE, C1 1672 add C2, -2 * SIZE, C2 1673 add C3, -2 * SIZE, C3 1674 add C4, -2 * SIZE, C4 1675#endif 1676 1677#if defined(LN) || defined(LT) 1678 STF c01, [BO + 0 * SIZE] 1679 STF c03, [BO + 1 * SIZE] 1680 STF c05, [BO + 2 * SIZE] 1681 STF c07, [BO + 3 * SIZE] 1682 1683 STF c02, [BO + 4 * SIZE] 1684 STF c04, [BO + 5 * SIZE] 1685 STF c06, [BO + 6 * SIZE] 1686 STF c08, [BO + 7 * SIZE] 1687#else 1688 STF c01, [AO + 0 * SIZE] 1689 STF c02, [AO + 1 * SIZE] 1690 STF c03, [AO + 2 * SIZE] 1691 STF c04, [AO + 3 * SIZE] 1692 1693 STF c05, [AO + 4 * SIZE] 1694 STF c06, [AO + 5 * SIZE] 1695 STF c07, [AO + 6 
* SIZE] 1696 STF c08, [AO + 7 * SIZE] 1697#endif 1698 /* store the 2x4 result to C: two elements through each of C1..C4 */ 1699 STF c01, [C1 + 0 * SIZE] 1700 STF c02, [C1 + 1 * SIZE] 1701 STF c03, [C2 + 0 * SIZE] 1702 STF c04, [C2 + 1 * SIZE] 1703 1704 STF c05, [C3 + 0 * SIZE] 1705 STF c06, [C3 + 1 * SIZE] 1706 STF c07, [C4 + 0 * SIZE] 1707 STF c08, [C4 + 1 * SIZE] 1708 /* every variant except LN moves C1..C4 forward by two elements */ 1709#ifndef LN 1710 add C1, 2 * SIZE, C1 1711 add C2, 2 * SIZE, C2 1712 add C3, 2 * SIZE, C3 1713 add C4, 2 * SIZE, C4 1714#endif 1715 /* RT: AORIG += K << (BASE_SHIFT + 1) */ 1716#ifdef RT 1717 sll K, BASE_SHIFT + 1, TEMP1 1718 add AORIG, TEMP1, AORIG 1719#endif 1720 /* LT/RN: skip the K - KK steps not consumed here; AO is scaled by BASE_SHIFT + 1, BO by BASE_SHIFT + 2 */ 1721#if defined(LT) || defined(RN) 1722 sub K, KK, TEMP1 1723 sll TEMP1, BASE_SHIFT + 1, TEMP2 1724 sll TEMP1, BASE_SHIFT + 2, TEMP1 1725 add AO, TEMP2, AO 1726 add BO, TEMP1, BO 1727#endif 1728 /* KK moves by two: +2 for LT, -2 for LN */ 1729#ifdef LT 1730 add KK, 2, KK 1731#endif 1732 1733#ifdef LN 1734 sub KK, 2, KK 1735#endif 1736 /* decrement I and loop back to .LL32 while it is still positive */ 1737 add I, -1, I 1738 cmp I, 0 1739 bg,pt %icc, .LL32 1740 nop 1741 /* .LL40: I = M & 1; when it is zero branch to .LL49, otherwise process one more row */ 1742.LL40: 1743 and M, 1, I 1744 cmp I, 0 1745 ble,pn %icc, .LL49 1746 nop 1747 /* operand setup: LT/RN restart BO at B; LN/RT place AO and BO KK steps into AORIG and B (shift BASE_SHIFT + 0 for AO, BASE_SHIFT + 2 for BO), LN after moving AORIG back by K << BASE_SHIFT */ 1748#if defined(LT) || defined(RN) 1749 mov B, BO 1750#else 1751#ifdef LN 1752 sll K, BASE_SHIFT + 0, TEMP1 1753 sub AORIG, TEMP1, AORIG 1754#endif 1755 1756 sll KK, BASE_SHIFT + 0, TEMP1 1757 sll KK, BASE_SHIFT + 2, TEMP2 1758 1759 add AORIG, TEMP1, AO 1760 add B, TEMP2, BO 1761#endif 1762 /* preload a1..a4 and b1..b9; FCLR is applied to the four accumulators c01, c03, c05, c07 in between the loads */ 1763 LDF [AO + 0 * SIZE], a1 1764 LDF [AO + 1 * SIZE], a2 1765 LDF [AO + 2 * SIZE], a3 1766 LDF [AO + 3 * SIZE], a4 1767 1768 LDF [BO + 0 * SIZE], b1 1769 LDF [BO + 1 * SIZE], b2 1770 LDF [BO + 2 * SIZE], b3 1771 LDF [BO + 3 * SIZE], b4 1772 LDF [BO + 4 * SIZE], b5 1773 LDF [BO + 5 * SIZE], b6 1774 FCLR (cc01) 1775 LDF [BO + 6 * SIZE], b7 1776 FCLR (cc03) 1777 LDF [BO + 7 * SIZE], b8 1778 FCLR (cc05) 1779 LDF [BO + 8 * SIZE], b9 1780 FCLR (cc07) 1781 /* L = KK >> 2 (LT/RN) or (K - KK) >> 2 (LN/RT): number of passes of the four-step loop; when not positive go to .LL45 */ 1782#if defined(LT) || defined(RN) 1783 sra KK, 2, L 1784#else 1785 sub K, KK, L 1786 sra L, 2, L 1787#endif 1788 cmp L, 0 1789 ble,pn %icc, .LL45 1790 nop 1791 /* .LL43: each pass runs four k-steps, using a1, a2, a3, a4 in turn against four B values each */ 1792.LL43: 1793 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1794 add L, -1, L 1795 1796 FMADD
(aa1, bb1, cc01, cc01) 1797 LDF [BO + 16 * SIZE], b1 1798 FMADD (aa1, bb2, cc03, cc03) 1799 LDF [BO + 9 * SIZE], b2 1800 FMADD (aa1, bb3, cc05, cc05) 1801 LDF [BO + 10 * SIZE], b3 1802 FMADD (aa1, bb4, cc07, cc07) 1803 LDF [BO + 11 * SIZE], b4 1804 /* reload a1 for the next pass; the cmp sets the flags tested by the bg at the bottom of the loop (the add instructions in between do not modify them) */ 1805 LDF [AO + 4 * SIZE], a1 1806 cmp L, 0 1807 /* k-step 2: a2 with b5..b8 */ 1808 FMADD (aa2, bb5, cc01, cc01) 1809 LDF [BO + 12 * SIZE], b5 1810 FMADD (aa2, bb6, cc03, cc03) 1811 LDF [BO + 13 * SIZE], b6 1812 FMADD (aa2, bb7, cc05, cc05) 1813 LDF [BO + 14 * SIZE], b7 1814 FMADD (aa2, bb8, cc07, cc07) 1815 LDF [BO + 15 * SIZE], b8 1816 1817 LDF [AO + 5 * SIZE], a2 1818 add AO, 4 * SIZE, AO 1819 /* k-step 3: a3 with b9, b2, b3, b4 */ 1820 FMADD (aa3, bb9, cc01, cc01) 1821 LDF [BO + 24 * SIZE], b9 1822 FMADD (aa3, bb2, cc03, cc03) 1823 LDF [BO + 17 * SIZE], b2 1824 FMADD (aa3, bb3, cc05, cc05) 1825 LDF [BO + 18 * SIZE], b3 1826 FMADD (aa3, bb4, cc07, cc07) 1827 LDF [BO + 19 * SIZE], b4 1828 1829 LDF [AO + 2 * SIZE], a3 1830 add BO, 16 * SIZE, BO 1831 /* k-step 4: a4 with b5..b8 as reloaded during k-step 2; the loads below use offsets relative to the advanced BO */ 1832 FMADD (aa4, bb5, cc01, cc01) 1833 LDF [BO + 4 * SIZE], b5 1834 FMADD (aa4, bb6, cc03, cc03) 1835 LDF [BO + 5 * SIZE], b6 1836 FMADD (aa4, bb7, cc05, cc05) 1837 LDF [BO + 6 * SIZE], b7 1838 FMADD (aa4, bb8, cc07, cc07) 1839 LDF [BO + 7 * SIZE], b8 1840 /* repeat while L > 0; the a4 reload sits in the branch delay slot */ 1841 bg,pt %icc, .LL43 1842 LDF [AO + 3 * SIZE], a4 1843 .align 4 1844 /* .LL45: L = leftover k-steps, KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT); when zero go to .LL48 */ 1845.LL45: 1846#if defined(LT) || defined(RN) 1847 and KK, 3, L 1848#else 1849 sub K, KK, L 1850 and L, 3, L 1851#endif 1852 cmp L, 0 1853 ble,a,pn %icc, .LL48 1854 nop 1855 .align 4 1856 /* .LL47: one k-step per pass; AO advances by one element and BO by four */ 1857.LL47: 1858 FMADD (aa1, bb1, cc01, cc01) 1859 LDF [BO + 4 * SIZE], b1 1860 add L, -1, L 1861 FMADD (aa1, bb2, cc03, cc03) 1862 LDF [BO + 5 * SIZE], b2 1863 add AO, 1 * SIZE, AO 1864 1865 FMADD (aa1, bb3, cc05, cc05) 1866 LDF [BO + 6 * SIZE], b3 1867 cmp L, 0 1868 FMADD (aa1, bb4, cc07, cc07) 1869 LDF [BO + 7 * SIZE], b4 1870 add BO, 4 * SIZE, BO 1871 1872 bg,pt %icc, .LL47 1873 LDF [AO + 0 * SIZE], a1 1874 .align 4 1875 /* .LL48: for LN/RT recompute AO and BO from AORIG and B at KK - 1 (LN) or KK - 4 (RT) steps */ 1876.LL48: 1877#if defined(LN) || defined(RT) 1878#ifdef LN 1879 sub KK, 1, TEMP1 1880#else 1881 sub KK, 4, TEMP1
1882#endif 1883 sll TEMP1, BASE_SHIFT + 0, TEMP2 1884 sll TEMP1, BASE_SHIFT + 2, TEMP1 1885 1886 add AORIG, TEMP2, AO 1887 add B, TEMP1, BO 1888#endif 1889 /* load the four stored values (from BO for LN/LT, from AO for RN/RT) and subtract the accumulated sums from them. NOTE(review): assuming the FSUB macro keeps the SPARC fsub operand order, FSUB a1, c01, c01 gives c01 = a1 - c01 -- confirm */ 1890#if defined(LN) || defined(LT) 1891 LDF [BO + 0 * SIZE], a1 1892 LDF [BO + 1 * SIZE], a2 1893 LDF [BO + 2 * SIZE], a3 1894 LDF [BO + 3 * SIZE], a4 1895 1896 FSUB a1, c01, c01 1897 FSUB a2, c03, c03 1898 FSUB a3, c05, c05 1899 FSUB a4, c07, c07 1900#else 1901 LDF [AO + 0 * SIZE], a1 1902 LDF [AO + 1 * SIZE], a2 1903 LDF [AO + 2 * SIZE], a3 1904 LDF [AO + 3 * SIZE], a4 1905 1906 FSUB a1, c01, c01 1907 FSUB a2, c03, c03 1908 FSUB a3, c05, c05 1909 FSUB a4, c07, c07 1910#endif 1911 /* LN/LT: a single factor AO[0] multiplies all four results */ 1912#if defined(LN) || defined(LT) 1913 LDF [AO + 0 * SIZE], a1 1914 1915 FMUL a1, c01, c01 1916 FMUL a1, c03, c03 1917 FMUL a1, c05, c05 1918 FMUL a1, c07, c07 1919#endif 1920 /* RN: eliminate forward through c01, c03, c05, c07 using BO[0..3], BO[5..7], BO[10..11], BO[15]; the first entry of each group is applied with FMUL, the others with FNMSUB */ 1921#ifdef RN 1922 LDF [BO + 0 * SIZE], a1 1923 LDF [BO + 1 * SIZE], a2 1924 LDF [BO + 2 * SIZE], a3 1925 LDF [BO + 3 * SIZE], a4 1926 1927 FMUL a1, c01, c01 1928 1929 FNMSUB (aa2, cc01, cc03, cc03) 1930 FNMSUB (aa3, cc01, cc05, cc05) 1931 FNMSUB (aa4, cc01, cc07, cc07) 1932 1933 LDF [BO + 5 * SIZE], a1 1934 LDF [BO + 6 * SIZE], a2 1935 LDF [BO + 7 * SIZE], a3 1936 1937 FMUL a1, c03, c03 1938 1939 FNMSUB (aa2, cc03, cc05, cc05) 1940 FNMSUB (aa3, cc03, cc07, cc07) 1941 1942 LDF [BO + 10 * SIZE], a1 1943 LDF [BO + 11 * SIZE], a2 1944 1945 FMUL a1, c05, c05 1946 1947 FNMSUB (aa2, cc05, cc07, cc07) 1948 1949 LDF [BO + 15 * SIZE], a1 1950 1951 FMUL a1, c07, c07 1952#endif 1953 /* RT: eliminate backward through c07, c05, c03, c01 using BO[15..12], BO[10..8], BO[5..4], BO[0] */ 1954#ifdef RT 1955 LDF [BO + 15 * SIZE], a1 1956 LDF [BO + 14 * SIZE], a2 1957 LDF [BO + 13 * SIZE], a3 1958 LDF [BO + 12 * SIZE], a4 1959 1960 FMUL a1, c07, c07 1961 1962 FNMSUB (aa2, cc07, cc05, cc05) 1963 FNMSUB (aa3, cc07, cc03, cc03) 1964 FNMSUB (aa4, cc07, cc01, cc01) 1965 1966 LDF [BO + 10 * SIZE], a1 1967 LDF [BO + 9 * SIZE], a2 1968 LDF [BO + 8 * SIZE], a3 1969 1970 FMUL a1, c05, c05 1971 1972 FNMSUB (aa2, cc05, cc03, cc03) 1973 FNMSUB (aa3, cc05, cc01, cc01) 1974 1975 LDF [BO + 5 *
SIZE], a1 1976 LDF [BO + 4 * SIZE], a2 1977 1978 FMUL a1, c03, c03 1979 1980 FNMSUB (aa2, cc03, cc01, cc01) 1981 1982 LDF [BO + 0 * SIZE], a1 1983 1984 FMUL a1, c01, c01 1985#endif 1986 /* LN: step C1..C4 back by one element before the stores to C */ 1987#ifdef LN 1988 add C1, -1 * SIZE, C1 1989 add C2, -1 * SIZE, C2 1990 add C3, -1 * SIZE, C3 1991 add C4, -1 * SIZE, C4 1992#endif 1993 /* write the four solved values back into the packed operand (BO for LN/LT, AO for RN/RT) */ 1994#if defined(LN) || defined(LT) 1995 STF c01, [BO + 0 * SIZE] 1996 STF c03, [BO + 1 * SIZE] 1997 STF c05, [BO + 2 * SIZE] 1998 STF c07, [BO + 3 * SIZE] 1999#else 2000 STF c01, [AO + 0 * SIZE] 2001 STF c03, [AO + 1 * SIZE] 2002 STF c05, [AO + 2 * SIZE] 2003 STF c07, [AO + 3 * SIZE] 2004#endif 2005 /* and into C, one element through each of C1..C4 */ 2006 STF c01, [C1 + 0 * SIZE] 2007 STF c03, [C2 + 0 * SIZE] 2008 STF c05, [C3 + 0 * SIZE] 2009 STF c07, [C4 + 0 * SIZE] 2010 /* RT: AORIG += K << BASE_SHIFT */ 2011#ifdef RT 2012 sll K, BASE_SHIFT + 0, TEMP1 2013 add AORIG, TEMP1, AORIG 2014#endif 2015 /* LT/RN: skip the K - KK steps not consumed here; AO is scaled by BASE_SHIFT + 0, BO by BASE_SHIFT + 2 */ 2016#if defined(LT) || defined(RN) 2017 sub K, KK, TEMP1 2018 sll TEMP1, BASE_SHIFT + 0, TEMP2 2019 sll TEMP1, BASE_SHIFT + 2, TEMP1 2020 add AO, TEMP2, AO 2021 add BO, TEMP1, BO 2022#endif 2023 /* KK moves by one: +1 for LT, -1 for LN */ 2024#ifdef LT 2025 add KK, 1, KK 2026#endif 2027 2028#ifdef LN 2029 sub KK, 1, KK 2030#endif 2031 .align 4 2032 /* .LL49: close the four-column panel: LN adds K << (BASE_SHIFT + 2) to B, LT/RN take the new B from BO, RN adds 4 to KK and RT subtracts 4 */ 2033.LL49: 2034#ifdef LN 2035 sll K, BASE_SHIFT + 2, TEMP1 2036 add B, TEMP1, B 2037#endif 2038 2039#if defined(LT) || defined(RN) 2040 mov BO, B 2041#endif 2042 2043#ifdef RN 2044 add KK, 4, KK 2045#endif 2046 2047#ifdef RT 2048 sub KK, 4, KK 2049#endif 2050 .align 4 2051 /* .LL10: J = N >> 3; branch to .LL999 when it is not positive */ 2052.LL10: 2053 sra N, 3, J 2054 cmp J, 0 2055 ble,pn %icc, .LL999 2056 nop 2057 .align 4 2058 /* .LL11: setup for an eight-column panel; RT first moves B back by K << (BASE_SHIFT + 3) */ 2059.LL11: 2060#ifdef RT 2061 sll K, BASE_SHIFT + 3, TEMP1 2062 sub B, TEMP1, B 2063#endif 2064 /* C1..C8 are spaced LDC apart: the non-RT path counts up from C and leaves C = C8 + LDC, the RT path counts down from C */ 2065#ifndef RT 2066 mov C, C1 2067 add C, LDC, C2 2068 add C2, LDC, C3 2069 add C3, LDC, C4 2070 add C4, LDC, C5 2071 add C5, LDC, C6 2072 add C6, LDC, C7 2073 add C7, LDC, C8 2074 add C8, LDC, C 2075#else 2076 sub C, LDC, C8 2077 sub C8, LDC, C7 2078 sub C7, LDC, C6 2079 sub C6, LDC, C5 2080 sub C5, LDC, C4 2081 sub C4, LDC, C3 2082 sub C3, LDC, C2 2083 sub C2, LDC, C1 2084 sub
C2, LDC, C /* RT: C = C2 - LDC, the same address that was just placed in C1 */ 2085#endif 2086 /* KK starts at M + OFFSET for LN and at OFFSET for LT */ 2087#ifdef LN 2088 add M, OFFSET, KK 2089#endif 2090 2091#ifdef LT 2092 mov OFFSET, KK 2093#endif 2094 /* LN/RT keep the A base in AORIG; LT/RN read A directly through AO */ 2095#if defined(LN) || defined(RT) 2096 mov A, AORIG 2097#else 2098 mov A, AO 2099#endif 2100 /* I = M >> 1 two-row tiles; when not positive go to .LL20 */ 2101 sra M, 1, I 2102 cmp I, 0 2103 ble,pn %icc, .LL20 2104 nop 2105 .align 4 2106 /* .LL12: tile setup. LT/RN restart BO at B; LN/RT place AO and BO KK steps into AORIG and B (shift BASE_SHIFT + 1 for AO, BASE_SHIFT + 3 for BO), LN after moving AORIG back by K << (BASE_SHIFT + 1) */ 2107.LL12: 2108#if defined(LT) || defined(RN) 2109 mov B, BO 2110#else 2111#ifdef LN 2112 sll K, BASE_SHIFT + 1, TEMP1 2113 sub AORIG, TEMP1, AORIG 2114#endif 2115 2116 sll KK, BASE_SHIFT + 1, TEMP1 2117 sll KK, BASE_SHIFT + 3, TEMP2 2118 2119 add AORIG, TEMP1, AO 2120 add B, TEMP2, BO 2121#endif 2122 /* preload a1, a2 and b1..b9; a5 is loaded from AO[8] and stands in for a1 in the second half of the unrolled loop. FCLR is applied to c01..c16 in between the loads and the prefetches of C1..C8 */ 2123 LDF [AO + 0 * SIZE], a1 2124 LDF [AO + 1 * SIZE], a2 2125 LDF [AO + 8 * SIZE], a5 2126 2127 LDF [BO + 0 * SIZE], b1 2128 2129 LDF [BO + 1 * SIZE], b2 2130 FCLR (cc01) 2131 LDF [BO + 2 * SIZE], b3 2132 FCLR (cc05) 2133 LDF [BO + 3 * SIZE], b4 2134 FCLR (cc09) 2135 LDF [BO + 4 * SIZE], b5 2136 FCLR (cc13) 2137 2138 LDF [BO + 5 * SIZE], b6 2139 FCLR (cc02) 2140 LDF [BO + 6 * SIZE], b7 2141 FCLR (cc06) 2142 LDF [BO + 7 * SIZE], b8 2143 FCLR (cc10) 2144 LDF [BO + 8 * SIZE], b9 2145 FCLR (cc14) 2146 2147 prefetch [C1 + 1 * SIZE], 3 2148 FCLR (cc03) 2149 prefetch [C2 + 2 * SIZE], 3 2150 FCLR (cc07) 2151 prefetch [C3 + 1 * SIZE], 3 2152 FCLR (cc11) 2153 prefetch [C4 + 2 * SIZE], 3 2154 FCLR (cc15) 2155 2156 prefetch [C5 + 1 * SIZE], 3 2157 FCLR (cc04) 2158 prefetch [C6 + 2 * SIZE], 3 2159 FCLR (cc08) 2160 prefetch [C7 + 1 * SIZE], 3 2161 FCLR (cc12) 2162 prefetch [C8 + 2 * SIZE], 3 2163 FCLR (cc16) 2164 /* L = KK >> 3 (LT/RN) or (K - KK) >> 3 (LN/RT); when not positive go to .LL15 */ 2165#if defined(LT) || defined(RN) 2166 sra KK, 3, L 2167#else 2168 sub K, KK, L 2169 sra L, 3, L 2170#endif 2171 cmp L, 0 2172 ble,pn %icc, .LL15 2173 nop 2174 .align 4 2175 /* .LL13: the loop body is eight k-steps written out twice, with an exit to .LL15 between the two copies; each k-step multiplies two A values by eight B values into c01..c16. k-step 1 uses a1/a2 with b1..b8 */ 2176.LL13: 2177 FMADD (aa1, bb1, cc01, cc01) 2178 FMADD (aa2, bb1, cc02, cc02) 2179 FMADD (aa1, bb2, cc03, cc03) 2180 FMADD (aa2, bb2, cc04, cc04) 2181 2182 FMADD (aa1, bb3, cc05, cc05) 2183 LDF [BO + 16 * SIZE], b1 2184 FMADD (aa2, bb3, cc06, cc06) 2185 LDF [BO + 9 * SIZE], b2 2186 2187 FMADD (aa1,
bb4, cc07, cc07) 2188 LDF [BO + 10 * SIZE], b3 2189 FMADD (aa2, bb4, cc08, cc08) 2190 LDF [BO + 11 * SIZE], b4 2191 2192 FMADD (aa1, bb5, cc09, cc09) 2193 LDF [AO + 2 * SIZE], a3 2194 FMADD (aa2, bb5, cc10, cc10) 2195 LDF [AO + 3 * SIZE], a4 2196 2197 FMADD (aa1, bb6, cc11, cc11) 2198 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2199 FMADD (aa2, bb6, cc12, cc12) 2200 nop 2201 2202 FMADD (aa1, bb7, cc13, cc13) 2203 LDF [BO + 12 * SIZE], b5 2204 FMADD (aa2, bb7, cc14, cc14) 2205 LDF [BO + 13 * SIZE], b6 2206 2207 FMADD (aa1, bb8, cc15, cc15) 2208 LDF [BO + 14 * SIZE], b7 2209 FMADD (aa2, bb8, cc16, cc16) 2210 LDF [BO + 15 * SIZE], b8 2211 /* k-step 2: a3/a4 (AO[2], AO[3]) with b9, b2..b8 */ 2212 FMADD (aa3, bb9, cc01, cc01) 2213 FMADD (aa4, bb9, cc02, cc02) 2214 FMADD (aa3, bb2, cc03, cc03) 2215 FMADD (aa4, bb2, cc04, cc04) 2216 2217 FMADD (aa3, bb3, cc05, cc05) 2218 LDF [BO + 24 * SIZE], b9 2219 FMADD (aa4, bb3, cc06, cc06) 2220 LDF [BO + 17 * SIZE], b2 2221 2222 FMADD (aa3, bb4, cc07, cc07) 2223 LDF [BO + 18 * SIZE], b3 2224 FMADD (aa4, bb4, cc08, cc08) 2225 LDF [BO + 19 * SIZE], b4 2226 2227 FMADD (aa3, bb5, cc09, cc09) 2228 LDF [AO + 4 * SIZE], a1 2229 FMADD (aa4, bb5, cc10, cc10) 2230 LDF [AO + 5 * SIZE], a2 2231 2232 FMADD (aa3, bb6, cc11, cc11) /* L is decremented once per group of eight k-steps */ 2233 add L, -1, L 2234 FMADD (aa4, bb6, cc12, cc12) 2235 nop 2236 2237 FMADD (aa3, bb7, cc13, cc13) 2238 LDF [BO + 20 * SIZE], b5 2239 FMADD (aa4, bb7, cc14, cc14) 2240 LDF [BO + 21 * SIZE], b6 2241 2242 FMADD (aa3, bb8, cc15, cc15) 2243 LDF [BO + 22 * SIZE], b7 2244 FMADD (aa4, bb8, cc16, cc16) 2245 LDF [BO + 23 * SIZE], b8 2246 /* k-step 3: a1/a2 now hold AO[4], AO[5] */ 2247 FMADD (aa1, bb1, cc01, cc01) 2248 FMADD (aa2, bb1, cc02, cc02) 2249 FMADD (aa1, bb2, cc03, cc03) 2250 FMADD (aa2, bb2, cc04, cc04) 2251 2252 FMADD (aa1, bb3, cc05, cc05) 2253 LDF [BO + 32 * SIZE], b1 2254 FMADD (aa2, bb3, cc06, cc06) 2255 LDF [BO + 25 * SIZE], b2 2256 2257 FMADD (aa1, bb4, cc07, cc07) 2258 LDF [BO + 26 * SIZE], b3 2259 FMADD (aa2, bb4, cc08, cc08) 2260 LDF [BO + 27 * SIZE], b4 2261 2262 FMADD (aa1,
cc09, cc09) 2263 LDF [AO + 6 * SIZE], a3 2264 FMADD (aa2, bb5, cc10, cc10) 2265 LDF [AO + 7 * SIZE], a4 2266 2267 FMADD (aa1, bb6, cc11, cc11) 2268 nop 2269 FMADD (aa2, bb6, cc12, cc12) 2270 nop 2271 2272 FMADD (aa1, bb7, cc13, cc13) 2273 LDF [BO + 28 * SIZE], b5 2274 FMADD (aa2, bb7, cc14, cc14) 2275 LDF [BO + 29 * SIZE], b6 2276 2277 FMADD (aa1, bb8, cc15, cc15) 2278 LDF [BO + 30 * SIZE], b7 2279 FMADD (aa2, bb8, cc16, cc16) 2280 LDF [BO + 31 * SIZE], b8 2281 /* k-step 4: a3/a4 = AO[6], AO[7] */ 2282 FMADD (aa3, bb9, cc01, cc01) 2283 FMADD (aa4, bb9, cc02, cc02) 2284 FMADD (aa3, bb2, cc03, cc03) 2285 FMADD (aa4, bb2, cc04, cc04) 2286 2287 FMADD (aa3, bb3, cc05, cc05) 2288 LDF [BO + 40 * SIZE], b9 2289 FMADD (aa4, bb3, cc06, cc06) 2290 LDF [BO + 33 * SIZE], b2 2291 2292 FMADD (aa3, bb4, cc07, cc07) 2293 LDF [BO + 34 * SIZE], b3 2294 FMADD (aa4, bb4, cc08, cc08) 2295 LDF [BO + 35 * SIZE], b4 2296 2297 FMADD (aa3, bb5, cc09, cc09) 2298 LDF [AO + 16 * SIZE], a1 /****/ /* a1 is not used again in this copy of the body (a5 stands in for it), so it is already loaded from AO[16], the first element after the AO advance of 16 */ 2299 FMADD (aa4, bb5, cc10, cc10) 2300 LDF [AO + 9 * SIZE], a2 2301 2302 FMADD (aa3, bb6, cc11, cc11) 2303 nop 2304 FMADD (aa4, bb6, cc12, cc12) 2305 nop 2306 2307 FMADD (aa3, bb7, cc13, cc13) 2308 LDF [BO + 36 * SIZE], b5 2309 FMADD (aa4, bb7, cc14, cc14) 2310 LDF [BO + 37 * SIZE], b6 2311 2312 FMADD (aa3, bb8, cc15, cc15) 2313 LDF [BO + 38 * SIZE], b7 2314 FMADD (aa4, bb8, cc16, cc16) 2315 LDF [BO + 39 * SIZE], b8 2316 /* k-step 5: a5 (AO[8]) and a2 (AO[9]) */ 2317 FMADD (aa5, bb1, cc01, cc01) 2318 FMADD (aa2, bb1, cc02, cc02) 2319 FMADD (aa5, bb2, cc03, cc03) 2320 FMADD (aa2, bb2, cc04, cc04) 2321 2322 FMADD (aa5, bb3, cc05, cc05) 2323 LDF [BO + 48 * SIZE], b1 2324 FMADD (aa2, bb3, cc06, cc06) 2325 LDF [BO + 41 * SIZE], b2 2326 2327 FMADD (aa5, bb4, cc07, cc07) 2328 LDF [BO + 42 * SIZE], b3 2329 FMADD (aa2, bb4, cc08, cc08) 2330 LDF [BO + 43 * SIZE], b4 2331 2332 FMADD (aa5, bb5, cc09, cc09) 2333 LDF [AO + 10 * SIZE], a3 2334 FMADD (aa2, bb5, cc10, cc10) 2335 LDF [AO + 11 * SIZE], a4 2336 2337 FMADD (aa5, bb6, cc11, cc11) 2338 prefetch [AO + (APREFETCHSIZE + 8) * SIZE],
APREFETCH_CATEGORY 2339 FMADD (aa2, bb6, cc12, cc12) 2340 nop 2341 2342 FMADD (aa5, bb7, cc13, cc13) 2343 LDF [BO + 44 * SIZE], b5 2344 FMADD (aa2, bb7, cc14, cc14) 2345 LDF [BO + 45 * SIZE], b6 2346 2347 FMADD (aa5, bb8, cc15, cc15) 2348 LDF [BO + 46 * SIZE], b7 2349 FMADD (aa2, bb8, cc16, cc16) 2350 LDF [BO + 47 * SIZE], b8 2351 /* k-step 6: a3/a4 = AO[10], AO[11] */ 2352 FMADD (aa3, bb9, cc01, cc01) 2353 FMADD (aa4, bb9, cc02, cc02) 2354 FMADD (aa3, bb2, cc03, cc03) 2355 FMADD (aa4, bb2, cc04, cc04) 2356 2357 FMADD (aa3, bb3, cc05, cc05) 2358 LDF [BO + 56 * SIZE], b9 2359 FMADD (aa4, bb3, cc06, cc06) 2360 LDF [BO + 49 * SIZE], b2 2361 2362 FMADD (aa3, bb4, cc07, cc07) 2363 LDF [BO + 50 * SIZE], b3 2364 FMADD (aa4, bb4, cc08, cc08) 2365 LDF [BO + 51 * SIZE], b4 2366 2367 FMADD (aa3, bb5, cc09, cc09) 2368 LDF [AO + 12 * SIZE], a5 2369 FMADD (aa4, bb5, cc10, cc10) 2370 LDF [AO + 13 * SIZE], a2 2371 2372 FMADD (aa3, bb6, cc11, cc11) /* sets the flags for the ble,pn that ends this copy of the body; the add instructions in between leave them untouched */ 2373 cmp L, 0 2374 FMADD (aa4, bb6, cc12, cc12) 2375 nop 2376 2377 FMADD (aa3, bb7, cc13, cc13) 2378 LDF [BO + 52 * SIZE], b5 2379 FMADD (aa4, bb7, cc14, cc14) 2380 LDF [BO + 53 * SIZE], b6 2381 2382 FMADD (aa3, bb8, cc15, cc15) 2383 LDF [BO + 54 * SIZE], b7 2384 FMADD (aa4, bb8, cc16, cc16) 2385 LDF [BO + 55 * SIZE], b8 2386 /* k-step 7: a5/a2 = AO[12], AO[13] */ 2387 FMADD (aa5, bb1, cc01, cc01) 2388 FMADD (aa2, bb1, cc02, cc02) 2389 FMADD (aa5, bb2, cc03, cc03) 2390 FMADD (aa2, bb2, cc04, cc04) 2391 2392 FMADD (aa5, bb3, cc05, cc05) 2393 LDF [BO + 64 * SIZE], b1 2394 FMADD (aa2, bb3, cc06, cc06) 2395 LDF [BO + 57 * SIZE], b2 2396 2397 FMADD (aa5, bb4, cc07, cc07) 2398 LDF [BO + 58 * SIZE], b3 2399 FMADD (aa2, bb4, cc08, cc08) 2400 LDF [BO + 59 * SIZE], b4 2401 2402 FMADD (aa5, bb5, cc09, cc09) 2403 LDF [AO + 14 * SIZE], a3 2404 FMADD (aa2, bb5, cc10, cc10) 2405 LDF [AO + 15 * SIZE], a4 2406 2407 FMADD (aa5, bb6, cc11, cc11) /* BO advances by 64 elements and AO by 16 here, so the loads that follow use offsets relative to the new pointers */ 2408 add BO, 64 * SIZE, BO 2409 FMADD (aa2, bb6, cc12, cc12) 2410 add AO, 16 * SIZE, AO 2411 2412 FMADD (aa5, bb7, cc13, cc13) 2413 LDF [BO - 4 * SIZE], b5 2414 FMADD (aa2, bb7, cc14, cc14)
2415 LDF [BO - 3 * SIZE], b6 2416 2417 FMADD (aa5, bb8, cc15, cc15) 2418 LDF [BO - 2 * SIZE], b7 2419 FMADD (aa2, bb8, cc16, cc16) 2420 LDF [BO - 1 * SIZE], b8 2421 /* k-step 8: a3/a4 with b9, b2..b8; the loads interleaved with it rebuild the loop-entry register state (a2, a5, b2..b9) relative to the advanced AO and BO */ 2422 FMADD (aa3, bb9, cc01, cc01) 2423 FMADD (aa4, bb9, cc02, cc02) 2424 FMADD (aa3, bb2, cc03, cc03) 2425 FMADD (aa4, bb2, cc04, cc04) 2426 2427 FMADD (aa3, bb3, cc05, cc05) 2428 LDF [BO + 8 * SIZE], b9 2429 FMADD (aa4, bb3, cc06, cc06) 2430 LDF [BO + 1 * SIZE], b2 2431 2432 FMADD (aa3, bb4, cc07, cc07) 2433 LDF [BO + 2 * SIZE], b3 2434 FMADD (aa4, bb4, cc08, cc08) 2435 LDF [BO + 3 * SIZE], b4 2436 2437 FMADD (aa3, bb5, cc09, cc09) 2438 LDF [AO + 8 * SIZE], a5 /****/ 2439 FMADD (aa4, bb5, cc10, cc10) 2440 LDF [AO + 1 * SIZE], a2 2441 2442 FMADD (aa3, bb6, cc11, cc11) 2443 FMADD (aa4, bb6, cc12, cc12) 2444 2445 FMADD (aa3, bb7, cc13, cc13) 2446 LDF [BO + 4 * SIZE], b5 2447 FMADD (aa4, bb7, cc14, cc14) 2448 LDF [BO + 5 * SIZE], b6 2449 2450 FMADD (aa3, bb8, cc15, cc15) 2451 LDF [BO + 6 * SIZE], b7 2452 FMADD (aa4, bb8, cc16, cc16) /* leave for .LL15 when L has reached zero (flags from the cmp earlier in this copy); the b8 reload is in the delay slot of a non-annulled branch, so it executes on both paths */ 2453 ble,pn %icc, .LL15 2454 LDF [BO + 7 * SIZE], b8 2455 /* second copy of the eight k-steps: k-step 1 again, a1/a2 with b1..b8 */ 2456 FMADD (aa1, bb1, cc01, cc01) 2457 FMADD (aa2, bb1, cc02, cc02) 2458 FMADD (aa1, bb2, cc03, cc03) 2459 FMADD (aa2, bb2, cc04, cc04) 2460 2461 FMADD (aa1, bb3, cc05, cc05) 2462 LDF [BO + 16 * SIZE], b1 2463 FMADD (aa2, bb3, cc06, cc06) 2464 LDF [BO + 9 * SIZE], b2 2465 2466 FMADD (aa1, bb4, cc07, cc07) 2467 LDF [BO + 10 * SIZE], b3 2468 FMADD (aa2, bb4, cc08, cc08) 2469 LDF [BO + 11 * SIZE], b4 2470 2471 FMADD (aa1, bb5, cc09, cc09) 2472 LDF [AO + 2 * SIZE], a3 2473 FMADD (aa2, bb5, cc10, cc10) 2474 LDF [AO + 3 * SIZE], a4 2475 2476 FMADD (aa1, bb6, cc11, cc11) 2477 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2478 FMADD (aa2, bb6, cc12, cc12) 2479 nop 2480 2481 FMADD (aa1, bb7, cc13, cc13) 2482 LDF [BO + 12 * SIZE], b5 2483 FMADD (aa2, bb7, cc14, cc14) 2484 LDF [BO + 13 * SIZE], b6 2485 2486 FMADD (aa1, bb8, cc15, cc15) 2487 LDF [BO + 14 * SIZE], b7 2488 FMADD (aa2, bb8, cc16, cc16) 2489 LDF [BO +
15 * SIZE], b8 2490 /* second copy, k-step 2: a3/a4 = AO[2], AO[3] with b9, b2..b8 */ 2491 FMADD (aa3, bb9, cc01, cc01) 2492 FMADD (aa4, bb9, cc02, cc02) 2493 FMADD (aa3, bb2, cc03, cc03) 2494 FMADD (aa4, bb2, cc04, cc04) 2495 2496 FMADD (aa3, bb3, cc05, cc05) 2497 LDF [BO + 24 * SIZE], b9 2498 FMADD (aa4, bb3, cc06, cc06) 2499 LDF [BO + 17 * SIZE], b2 2500 2501 FMADD (aa3, bb4, cc07, cc07) 2502 LDF [BO + 18 * SIZE], b3 2503 FMADD (aa4, bb4, cc08, cc08) 2504 LDF [BO + 19 * SIZE], b4 2505 2506 FMADD (aa3, bb5, cc09, cc09) 2507 LDF [AO + 4 * SIZE], a1 2508 FMADD (aa4, bb5, cc10, cc10) 2509 LDF [AO + 5 * SIZE], a2 2510 2511 FMADD (aa3, bb6, cc11, cc11) /* L is decremented once in each copy of the body */ 2512 add L, -1, L 2513 FMADD (aa4, bb6, cc12, cc12) 2514 nop 2515 2516 FMADD (aa3, bb7, cc13, cc13) 2517 LDF [BO + 20 * SIZE], b5 2518 FMADD (aa4, bb7, cc14, cc14) 2519 LDF [BO + 21 * SIZE], b6 2520 2521 FMADD (aa3, bb8, cc15, cc15) 2522 LDF [BO + 22 * SIZE], b7 2523 FMADD (aa4, bb8, cc16, cc16) 2524 LDF [BO + 23 * SIZE], b8 2525 /* k-step 3: a1/a2 = AO[4], AO[5] */ 2526 FMADD (aa1, bb1, cc01, cc01) 2527 FMADD (aa2, bb1, cc02, cc02) 2528 FMADD (aa1, bb2, cc03, cc03) 2529 FMADD (aa2, bb2, cc04, cc04) 2530 2531 FMADD (aa1, bb3, cc05, cc05) 2532 LDF [BO + 32 * SIZE], b1 2533 FMADD (aa2, bb3, cc06, cc06) 2534 LDF [BO + 25 * SIZE], b2 2535 2536 FMADD (aa1, bb4, cc07, cc07) 2537 LDF [BO + 26 * SIZE], b3 2538 FMADD (aa2, bb4, cc08, cc08) 2539 LDF [BO + 27 * SIZE], b4 2540 2541 FMADD (aa1, bb5, cc09, cc09) 2542 LDF [AO + 6 * SIZE], a3 2543 FMADD (aa2, bb5, cc10, cc10) 2544 LDF [AO + 7 * SIZE], a4 2545 2546 FMADD (aa1, bb6, cc11, cc11) 2547 nop 2548 FMADD (aa2, bb6, cc12, cc12) 2549 nop 2550 2551 FMADD (aa1, bb7, cc13, cc13) 2552 LDF [BO + 28 * SIZE], b5 2553 FMADD (aa2, bb7, cc14, cc14) 2554 LDF [BO + 29 * SIZE], b6 2555 2556 FMADD (aa1, bb8, cc15, cc15) 2557 LDF [BO + 30 * SIZE], b7 2558 FMADD (aa2, bb8, cc16, cc16) 2559 LDF [BO + 31 * SIZE], b8 2560 /* k-step 4: a3/a4 = AO[6], AO[7] */ 2561 FMADD (aa3, bb9, cc01, cc01) 2562 FMADD (aa4, bb9, cc02, cc02) 2563 FMADD (aa3, bb2, cc03, cc03) 2564 FMADD (aa4, bb2, cc04, cc04) 2565 2566 FMADD (aa3, bb3, cc05, cc05) 2567
LDF [BO + 40 * SIZE], b9 2568 FMADD (aa4, bb3, cc06, cc06) 2569 LDF [BO + 33 * SIZE], b2 2570 2571 FMADD (aa3, bb4, cc07, cc07) 2572 LDF [BO + 34 * SIZE], b3 2573 FMADD (aa4, bb4, cc08, cc08) 2574 LDF [BO + 35 * SIZE], b4 2575 2576 FMADD (aa3, bb5, cc09, cc09) 2577 LDF [AO + 16 * SIZE], a1 /****/ /* a1 for the start of the next eight k-steps; a5 covers k-steps 5 and 7 in the meantime */ 2578 FMADD (aa4, bb5, cc10, cc10) 2579 LDF [AO + 9 * SIZE], a2 2580 2581 FMADD (aa3, bb6, cc11, cc11) 2582 nop 2583 FMADD (aa4, bb6, cc12, cc12) 2584 nop 2585 2586 FMADD (aa3, bb7, cc13, cc13) 2587 LDF [BO + 36 * SIZE], b5 2588 FMADD (aa4, bb7, cc14, cc14) 2589 LDF [BO + 37 * SIZE], b6 2590 2591 FMADD (aa3, bb8, cc15, cc15) 2592 LDF [BO + 38 * SIZE], b7 2593 FMADD (aa4, bb8, cc16, cc16) 2594 LDF [BO + 39 * SIZE], b8 2595 /* k-step 5: a5/a2 = AO[8], AO[9] */ 2596 FMADD (aa5, bb1, cc01, cc01) 2597 FMADD (aa2, bb1, cc02, cc02) 2598 FMADD (aa5, bb2, cc03, cc03) 2599 FMADD (aa2, bb2, cc04, cc04) 2600 2601 FMADD (aa5, bb3, cc05, cc05) 2602 LDF [BO + 48 * SIZE], b1 2603 FMADD (aa2, bb3, cc06, cc06) 2604 LDF [BO + 41 * SIZE], b2 2605 2606 FMADD (aa5, bb4, cc07, cc07) 2607 LDF [BO + 42 * SIZE], b3 2608 FMADD (aa2, bb4, cc08, cc08) 2609 LDF [BO + 43 * SIZE], b4 2610 2611 FMADD (aa5, bb5, cc09, cc09) 2612 LDF [AO + 10 * SIZE], a3 2613 FMADD (aa2, bb5, cc10, cc10) 2614 LDF [AO + 11 * SIZE], a4 2615 2616 FMADD (aa5, bb6, cc11, cc11) 2617 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 2618 FMADD (aa2, bb6, cc12, cc12) 2619 nop 2620 2621 FMADD (aa5, bb7, cc13, cc13) 2622 LDF [BO + 44 * SIZE], b5 2623 FMADD (aa2, bb7, cc14, cc14) 2624 LDF [BO + 45 * SIZE], b6 2625 2626 FMADD (aa5, bb8, cc15, cc15) 2627 LDF [BO + 46 * SIZE], b7 2628 FMADD (aa2, bb8, cc16, cc16) 2629 LDF [BO + 47 * SIZE], b8 2630 /* k-step 6: a3/a4 = AO[10], AO[11] */ 2631 FMADD (aa3, bb9, cc01, cc01) 2632 FMADD (aa4, bb9, cc02, cc02) 2633 FMADD (aa3, bb2, cc03, cc03) 2634 FMADD (aa4, bb2, cc04, cc04) 2635 2636 FMADD (aa3, bb3, cc05, cc05) 2637 LDF [BO + 56 * SIZE], b9 2638 FMADD (aa4, bb3, cc06, cc06) 2639 LDF [BO + 49 * SIZE], b2 2640 2641 FMADD (aa3, bb4, cc07, cc07) 2642 LDF [BO
+ 50 * SIZE], b3 2643 FMADD (aa4, bb4, cc08, cc08) 2644 LDF [BO + 51 * SIZE], b4 2645 2646 FMADD (aa3, bb5, cc09, cc09) 2647 LDF [AO + 12 * SIZE], a5 2648 FMADD (aa4, bb5, cc10, cc10) 2649 LDF [AO + 13 * SIZE], a2 2650 2651 FMADD (aa3, bb6, cc11, cc11) /* sets the flags for the bg,pt that closes the loop; the add instructions in between leave them untouched */ 2652 cmp L, 0 2653 FMADD (aa4, bb6, cc12, cc12) 2654 nop 2655 2656 FMADD (aa3, bb7, cc13, cc13) 2657 LDF [BO + 52 * SIZE], b5 2658 FMADD (aa4, bb7, cc14, cc14) 2659 LDF [BO + 53 * SIZE], b6 2660 2661 FMADD (aa3, bb8, cc15, cc15) 2662 LDF [BO + 54 * SIZE], b7 2663 FMADD (aa4, bb8, cc16, cc16) 2664 LDF [BO + 55 * SIZE], b8 2665 /* k-step 7: a5/a2 = AO[12], AO[13] */ 2666 FMADD (aa5, bb1, cc01, cc01) 2667 FMADD (aa2, bb1, cc02, cc02) 2668 FMADD (aa5, bb2, cc03, cc03) 2669 FMADD (aa2, bb2, cc04, cc04) 2670 2671 FMADD (aa5, bb3, cc05, cc05) 2672 LDF [BO + 64 * SIZE], b1 2673 FMADD (aa2, bb3, cc06, cc06) 2674 LDF [BO + 57 * SIZE], b2 2675 2676 FMADD (aa5, bb4, cc07, cc07) 2677 LDF [BO + 58 * SIZE], b3 2678 FMADD (aa2, bb4, cc08, cc08) 2679 LDF [BO + 59 * SIZE], b4 2680 2681 FMADD (aa5, bb5, cc09, cc09) 2682 LDF [AO + 14 * SIZE], a3 2683 FMADD (aa2, bb5, cc10, cc10) 2684 LDF [AO + 15 * SIZE], a4 2685 2686 FMADD (aa5, bb6, cc11, cc11) /* BO advances by 64 elements and AO by 16; the offsets that follow are relative to the new pointers */ 2687 add BO, 64 * SIZE, BO 2688 FMADD (aa2, bb6, cc12, cc12) 2689 add AO, 16 * SIZE, AO 2690 2691 FMADD (aa5, bb7, cc13, cc13) 2692 LDF [BO - 4 * SIZE], b5 2693 FMADD (aa2, bb7, cc14, cc14) 2694 LDF [BO - 3 * SIZE], b6 2695 2696 FMADD (aa5, bb8, cc15, cc15) 2697 LDF [BO - 2 * SIZE], b7 2698 FMADD (aa2, bb8, cc16, cc16) 2699 LDF [BO - 1 * SIZE], b8 2700 /* k-step 8: a3/a4 with b9, b2..b8; the interleaved loads refill b9, b2..b4 and a5 for the next pass */ 2701 FMADD (aa3, bb9, cc01, cc01) 2702 FMADD (aa4, bb9, cc02, cc02) 2703 FMADD (aa3, bb2, cc03, cc03) 2704 FMADD (aa4, bb2, cc04, cc04) 2705 2706 FMADD (aa3, bb3, cc05, cc05) 2707 LDF [BO + 8 * SIZE], b9 2708 FMADD (aa4, bb3, cc06, cc06) 2709 LDF [BO + 1 * SIZE], b2 2710 2711 FMADD (aa3, bb4, cc07, cc07) 2712 LDF [BO + 2 * SIZE], b3 2713 FMADD (aa4, bb4, cc08, cc08) 2714 LDF [BO + 3 * SIZE], b4 2715 2716 FMADD (aa3, bb5, cc09, cc09) 2717 LDF [AO + 8 * SIZE], a5 /****/ 2718 FMADD (aa4,
bb5, cc10, cc10) 2719 LDF [AO + 1 * SIZE], a2 2720 2721 FMADD (aa3, bb6, cc11, cc11) 2722 FMADD (aa4, bb6, cc12, cc12) 2723 2724 FMADD (aa3, bb7, cc13, cc13) 2725 LDF [BO + 4 * SIZE], b5 2726 FMADD (aa4, bb7, cc14, cc14) 2727 LDF [BO + 5 * SIZE], b6 2728 2729 FMADD (aa3, bb8, cc15, cc15) 2730 LDF [BO + 6 * SIZE], b7 2731 FMADD (aa4, bb8, cc16, cc16) /* back to .LL13 while L > 0; the b8 reload sits in the branch delay slot */ 2732 bg,pt %icc, .LL13 2733 LDF [BO + 7 * SIZE], b8 2734 .align 4 2735 /* .LL15: L = leftover k-steps, KK & 7 (LT/RN) or (K - KK) & 7 (LN/RT); when zero go to .LL18 */ 2736.LL15: 2737#if defined(LT) || defined(RN) 2738 and KK, 7, L 2739#else 2740 sub K, KK, L 2741 and L, 7, L 2742#endif 2743 cmp L, 0 2744 ble,a,pn %icc, .LL18 2745 nop 2746 .align 4 2747 /* .LL17: one k-step per pass (a1, a2 against b1..b8); AO advances by two elements and BO by eight, and b1..b6 are reloaded before the BO advance, b7/b8 after it */ 2748.LL17: 2749 FMADD (aa1, bb1, cc01, cc01) 2750 add L, -1, L 2751 FMADD (aa2, bb1, cc02, cc02) 2752 nop 2753 2754 FMADD (aa1, bb2, cc03, cc03) 2755 LDF [BO + 8 * SIZE], b1 2756 FMADD (aa2, bb2, cc04, cc04) 2757 LDF [BO + 9 * SIZE], b2 2758 2759 FMADD (aa1, bb3, cc05, cc05) 2760 cmp L, 0 2761 FMADD (aa2, bb3, cc06, cc06) 2762 nop 2763 2764 FMADD (aa1, bb4, cc07, cc07) 2765 LDF [BO + 10 * SIZE], b3 2766 FMADD (aa2, bb4, cc08, cc08) 2767 LDF [BO + 11 * SIZE], b4 2768 2769 FMADD (aa1, bb5, cc09, cc09) 2770 nop 2771 FMADD (aa2, bb5, cc10, cc10) 2772 nop 2773 2774 FMADD (aa1, bb6, cc11, cc11) 2775 LDF [BO + 12 * SIZE], b5 2776 FMADD (aa2, bb6, cc12, cc12) 2777 LDF [BO + 13 * SIZE], b6 2778 2779 FMADD (aa1, bb7, cc13, cc13) 2780 add AO, 2 * SIZE, AO 2781 FMADD (aa2, bb7, cc14, cc14) 2782 add BO, 8 * SIZE, BO 2783 2784 FMADD (aa1, bb8, cc15, cc15) 2785 LDF [AO + 0 * SIZE], a1 2786 FMADD (aa2, bb8, cc16, cc16) 2787 LDF [AO + 1 * SIZE], a2 2788 2789 LDF [BO + 6 * SIZE], b7 2790 bg,pt %icc, .LL17 2791 LDF [BO + 7 * SIZE], b8 2792 nop 2793 .align 4 2794 /* .LL18: solve phase for the 2x8 tile. LN/RT first recompute AO and BO from AORIG and B at KK - 2 (LN) or KK - 8 (RT) steps */ 2795.LL18: 2796#if defined(LN) || defined(RT) 2797#ifdef LN 2798 sub KK, 2, TEMP1 2799#else 2800 sub KK, 8, TEMP1 2801#endif 2802 sll TEMP1, BASE_SHIFT + 1, TEMP2 2803 sll TEMP1, BASE_SHIFT + 3, TEMP1 2804 2805 add AORIG, TEMP2, AO 2806 add B, TEMP1, BO 2807#endif 2808 /* LN/LT: the stored values are read from BO */ 2809#if defined(LN) || defined(LT) 2810 LDF [BO + 0 *
SIZE], a1 2811 LDF [BO + 1 * SIZE], a2 2812 LDF [BO + 2 * SIZE], a3 2813 LDF [BO + 3 * SIZE], a4 2814 2815 LDF [BO + 4 * SIZE], b1 2816 LDF [BO + 5 * SIZE], b2 2817 LDF [BO + 6 * SIZE], b3 2818 LDF [BO + 7 * SIZE], b4 2819 /* BO[0..7] pair with the odd-numbered accumulators c01, c03, ..., c15. NOTE(review): assuming the FSUB macro keeps the SPARC fsub operand order, each FSUB leaves loaded value minus accumulator in the accumulator -- confirm */ 2820 FSUB a1, c01, c01 2821 FSUB a2, c03, c03 2822 FSUB a3, c05, c05 2823 FSUB a4, c07, c07 2824 2825 FSUB b1, c09, c09 2826 FSUB b2, c11, c11 2827 FSUB b3, c13, c13 2828 FSUB b4, c15, c15 2829 /* BO[8..15] pair with the even-numbered accumulators c02, c04, ..., c16 */ 2830 LDF [BO + 8 * SIZE], a1 2831 LDF [BO + 9 * SIZE], a2 2832 LDF [BO + 10 * SIZE], a3 2833 LDF [BO + 11 * SIZE], a4 2834 2835 LDF [BO + 12 * SIZE], b1 2836 LDF [BO + 13 * SIZE], b2 2837 LDF [BO + 14 * SIZE], b3 2838 LDF [BO + 15 * SIZE], b4 2839 2840 FSUB a1, c02, c02 2841 FSUB a2, c04, c04 2842 FSUB a3, c06, c06 2843 FSUB a4, c08, c08 2844 2845 FSUB b1, c10, c10 2846 FSUB b2, c12, c12 2847 FSUB b3, c14, c14 2848 FSUB b4, c16, c16 2849#else /* RN/RT: the stored values come from AO[0..15] and pair with c01..c16 in numeric order */ 2850 LDF [AO + 0 * SIZE], a1 2851 LDF [AO + 1 * SIZE], a2 2852 LDF [AO + 2 * SIZE], a3 2853 LDF [AO + 3 * SIZE], a4 2854 2855 LDF [AO + 4 * SIZE], b1 2856 LDF [AO + 5 * SIZE], b2 2857 LDF [AO + 6 * SIZE], b3 2858 LDF [AO + 7 * SIZE], b4 2859 2860 FSUB a1, c01, c01 2861 FSUB a2, c02, c02 2862 FSUB a3, c03, c03 2863 FSUB a4, c04, c04 2864 2865 FSUB b1, c05, c05 2866 FSUB b2, c06, c06 2867 FSUB b3, c07, c07 2868 FSUB b4, c08, c08 2869 2870 LDF [AO + 8 * SIZE], a1 2871 LDF [AO + 9 * SIZE], a2 2872 LDF [AO + 10 * SIZE], a3 2873 LDF [AO + 11 * SIZE], a4 2874 2875 LDF [AO + 12 * SIZE], b1 2876 LDF [AO + 13 * SIZE], b2 2877 LDF [AO + 14 * SIZE], b3 2878 LDF [AO + 15 * SIZE], b4 2879 2880 FSUB a1, c09, c09 2881 FSUB a2, c10, c10 2882 FSUB a3, c11, c11 2883 FSUB a4, c12, c12 2884 2885 FSUB b1, c13, c13 2886 FSUB b2, c14, c14 2887 FSUB b3, c15, c15 2888 FSUB b4, c16, c16 2889#endif 2890 /* LN: factor entries AO[3], AO[2], AO[0]: scale the even-numbered accumulators by AO[3], fold them into the odd-numbered ones with AO[2], then scale those by AO[0] */ 2891#ifdef LN 2892 LDF [AO + 3 * SIZE], a1 2893 LDF [AO + 2 * SIZE], a2 2894 LDF [AO + 0 * SIZE], a3 2895 2896 FMUL a1, c02, c02 2897 FMUL a1, c04, c04 2898 FMUL a1, c06, c06 2899 FMUL a1, c08, c08 2900 FMUL a1, c10, c10 2901 FMUL a1, c12, c12
2902 FMUL a1, c14, c14 2903 FMUL a1, c16, c16 2904 /* fold the scaled even-numbered accumulators into the odd-numbered ones using a2 */ 2905 FNMSUB (aa2, cc02, cc01, cc01) 2906 FNMSUB (aa2, cc04, cc03, cc03) 2907 FNMSUB (aa2, cc06, cc05, cc05) 2908 FNMSUB (aa2, cc08, cc07, cc07) 2909 FNMSUB (aa2, cc10, cc09, cc09) 2910 FNMSUB (aa2, cc12, cc11, cc11) 2911 FNMSUB (aa2, cc14, cc13, cc13) 2912 FNMSUB (aa2, cc16, cc15, cc15) 2913 /* then scale the odd-numbered accumulators by a3 */ 2914 FMUL a3, c01, c01 2915 FMUL a3, c03, c03 2916 FMUL a3, c05, c05 2917 FMUL a3, c07, c07 2918 FMUL a3, c09, c09 2919 FMUL a3, c11, c11 2920 FMUL a3, c13, c13 2921 FMUL a3, c15, c15 2922#endif 2923 /* LT: the mirror image: scale the odd-numbered accumulators by AO[0], fold them into the even-numbered ones with AO[1], then scale those by AO[3] */ 2924#ifdef LT 2925 LDF [AO + 0 * SIZE], a1 2926 LDF [AO + 1 * SIZE], a2 2927 LDF [AO + 3 * SIZE], a3 2928 2929 FMUL a1, c01, c01 2930 FMUL a1, c03, c03 2931 FMUL a1, c05, c05 2932 FMUL a1, c07, c07 2933 FMUL a1, c09, c09 2934 FMUL a1, c11, c11 2935 FMUL a1, c13, c13 2936 FMUL a1, c15, c15 2937 2938 FNMSUB (aa2, cc01, cc02, cc02) 2939 FNMSUB (aa2, cc03, cc04, cc04) 2940 FNMSUB (aa2, cc05, cc06, cc06) 2941 FNMSUB (aa2, cc07, cc08, cc08) 2942 FNMSUB (aa2, cc09, cc10, cc10) 2943 FNMSUB (aa2, cc11, cc12, cc12) 2944 FNMSUB (aa2, cc13, cc14, cc14) 2945 FNMSUB (aa2, cc15, cc16, cc16) 2946 2947 FMUL a3, c02, c02 2948 FMUL a3, c04, c04 2949 FMUL a3, c06, c06 2950 FMUL a3, c08, c08 2951 FMUL a3, c10, c10 2952 FMUL a3, c12, c12 2953 FMUL a3, c14, c14 2954 FMUL a3, c16, c16 2955#endif 2956 /* RN: forward elimination over the eight accumulator pairs c01/c02, c03/c04, ..., c15/c16. Step j (0..7) reads factor entries starting at BO[9*j]: the first scales pair j with FMUL, the rest update the later pairs with FNMSUB */ 2957#ifdef RN 2958 LDF [BO + 0 * SIZE], a1 2959 LDF [BO + 1 * SIZE], a2 2960 LDF [BO + 2 * SIZE], a3 2961 LDF [BO + 3 * SIZE], a4 2962 LDF [BO + 4 * SIZE], b1 2963 LDF [BO + 5 * SIZE], b2 2964 LDF [BO + 6 * SIZE], b3 2965 LDF [BO + 7 * SIZE], b4 2966 2967 FMUL a1, c01, c01 2968 FMUL a1, c02, c02 2969 2970 FNMSUB (aa2, cc01, cc03, cc03) 2971 FNMSUB (aa2, cc02, cc04, cc04) 2972 FNMSUB (aa3, cc01, cc05, cc05) 2973 FNMSUB (aa3, cc02, cc06, cc06) 2974 FNMSUB (aa4, cc01, cc07, cc07) 2975 FNMSUB (aa4, cc02, cc08, cc08) 2976 FNMSUB (bb1, cc01, cc09, cc09) 2977 FNMSUB (bb1, cc02, cc10, cc10) 2978 FNMSUB (bb2, cc01, cc11, cc11) 2979 FNMSUB (bb2, cc02, cc12, cc12) 2980
FNMSUB (bb3, cc01, cc13, cc13) 2981 FNMSUB (bb3, cc02, cc14, cc14) 2982 FNMSUB (bb4, cc01, cc15, cc15) 2983 FNMSUB (bb4, cc02, cc16, cc16) 2984 /* RN step for c03/c04: factor entries BO[9..15] */ 2985 LDF [BO + 9 * SIZE], a1 2986 LDF [BO + 10 * SIZE], a2 2987 LDF [BO + 11 * SIZE], a3 2988 LDF [BO + 12 * SIZE], a4 2989 LDF [BO + 13 * SIZE], b1 2990 LDF [BO + 14 * SIZE], b2 2991 LDF [BO + 15 * SIZE], b3 2992 2993 FMUL a1, c03, c03 2994 FMUL a1, c04, c04 2995 2996 FNMSUB (aa2, cc03, cc05, cc05) 2997 FNMSUB (aa2, cc04, cc06, cc06) 2998 FNMSUB (aa3, cc03, cc07, cc07) 2999 FNMSUB (aa3, cc04, cc08, cc08) 3000 FNMSUB (aa4, cc03, cc09, cc09) 3001 FNMSUB (aa4, cc04, cc10, cc10) 3002 FNMSUB (bb1, cc03, cc11, cc11) 3003 FNMSUB (bb1, cc04, cc12, cc12) 3004 FNMSUB (bb2, cc03, cc13, cc13) 3005 FNMSUB (bb2, cc04, cc14, cc14) 3006 FNMSUB (bb3, cc03, cc15, cc15) 3007 FNMSUB (bb3, cc04, cc16, cc16) 3008 /* c05/c06: BO[18..23] */ 3009 LDF [BO + 18 * SIZE], a1 3010 LDF [BO + 19 * SIZE], a2 3011 LDF [BO + 20 * SIZE], a3 3012 LDF [BO + 21 * SIZE], a4 3013 LDF [BO + 22 * SIZE], b1 3014 LDF [BO + 23 * SIZE], b2 3015 3016 FMUL a1, c05, c05 3017 FMUL a1, c06, c06 3018 3019 FNMSUB (aa2, cc05, cc07, cc07) 3020 FNMSUB (aa2, cc06, cc08, cc08) 3021 FNMSUB (aa3, cc05, cc09, cc09) 3022 FNMSUB (aa3, cc06, cc10, cc10) 3023 FNMSUB (aa4, cc05, cc11, cc11) 3024 FNMSUB (aa4, cc06, cc12, cc12) 3025 FNMSUB (bb1, cc05, cc13, cc13) 3026 FNMSUB (bb1, cc06, cc14, cc14) 3027 FNMSUB (bb2, cc05, cc15, cc15) 3028 FNMSUB (bb2, cc06, cc16, cc16) 3029 /* c07/c08: BO[27..31] */ 3030 LDF [BO + 27 * SIZE], a1 3031 LDF [BO + 28 * SIZE], a2 3032 LDF [BO + 29 * SIZE], a3 3033 LDF [BO + 30 * SIZE], a4 3034 LDF [BO + 31 * SIZE], b1 3035 3036 FMUL a1, c07, c07 3037 FMUL a1, c08, c08 3038 3039 FNMSUB (aa2, cc07, cc09, cc09) 3040 FNMSUB (aa2, cc08, cc10, cc10) 3041 FNMSUB (aa3, cc07, cc11, cc11) 3042 FNMSUB (aa3, cc08, cc12, cc12) 3043 FNMSUB (aa4, cc07, cc13, cc13) 3044 FNMSUB (aa4, cc08, cc14, cc14) 3045 FNMSUB (bb1, cc07, cc15, cc15) 3046 FNMSUB (bb1, cc08, cc16, cc16) 3047 /* c09/c10: BO[36..39] */ 3048 LDF [BO + 36 * SIZE], a1 3049 LDF [BO + 37 *
SIZE], a2 3050 LDF [BO + 38 * SIZE], a3 3051 LDF [BO + 39 * SIZE], a4 3052 3053 FMUL a1, c09, c09 3054 FMUL a1, c10, c10 3055 3056 FNMSUB (aa2, cc09, cc11, cc11) 3057 FNMSUB (aa2, cc10, cc12, cc12) 3058 FNMSUB (aa3, cc09, cc13, cc13) 3059 FNMSUB (aa3, cc10, cc14, cc14) 3060 FNMSUB (aa4, cc09, cc15, cc15) 3061 FNMSUB (aa4, cc10, cc16, cc16) 3062 /* c11/c12: BO[45..47] */ 3063 LDF [BO + 45 * SIZE], a1 3064 LDF [BO + 46 * SIZE], a2 3065 LDF [BO + 47 * SIZE], a3 3066 3067 FMUL a1, c11, c11 3068 FMUL a1, c12, c12 3069 3070 FNMSUB (aa2, cc11, cc13, cc13) 3071 FNMSUB (aa2, cc12, cc14, cc14) 3072 FNMSUB (aa3, cc11, cc15, cc15) 3073 FNMSUB (aa3, cc12, cc16, cc16) 3074 /* c13/c14: BO[54..55] */ 3075 LDF [BO + 54 * SIZE], a1 3076 LDF [BO + 55 * SIZE], a2 3077 3078 FMUL a1, c13, c13 3079 FMUL a1, c14, c14 3080 3081 FNMSUB (aa2, cc13, cc15, cc15) 3082 FNMSUB (aa2, cc14, cc16, cc16) 3083 /* c15/c16: only the scale by BO[63] remains */ 3084 LDF [BO + 63 * SIZE], a1 3085 3086 FMUL a1, c15, c15 3087 FMUL a1, c16, c16 3088#endif 3089 /* RT: the same elimination run from the last accumulator pair to the first. Each step starts at one of BO[63], BO[54], BO[45], ... (stride 9) and reads downwards; the first entry scales with FMUL, the rest update the earlier pairs with FNMSUB */ 3090#ifdef RT 3091 LDF [BO + 63 * SIZE], a1 3092 LDF [BO + 62 * SIZE], a2 3093 LDF [BO + 61 * SIZE], a3 3094 LDF [BO + 60 * SIZE], a4 3095 LDF [BO + 59 * SIZE], b1 3096 LDF [BO + 58 * SIZE], b2 3097 LDF [BO + 57 * SIZE], b3 3098 LDF [BO + 56 * SIZE], b4 3099 3100 FMUL a1, c16, c16 3101 FMUL a1, c15, c15 3102 3103 FNMSUB (aa2, cc16, cc14, cc14) 3104 FNMSUB (aa2, cc15, cc13, cc13) 3105 FNMSUB (aa3, cc16, cc12, cc12) 3106 FNMSUB (aa3, cc15, cc11, cc11) 3107 FNMSUB (aa4, cc16, cc10, cc10) 3108 FNMSUB (aa4, cc15, cc09, cc09) 3109 FNMSUB (bb1, cc16, cc08, cc08) 3110 FNMSUB (bb1, cc15, cc07, cc07) 3111 FNMSUB (bb2, cc16, cc06, cc06) 3112 FNMSUB (bb2, cc15, cc05, cc05) 3113 FNMSUB (bb3, cc16, cc04, cc04) 3114 FNMSUB (bb3, cc15, cc03, cc03) 3115 FNMSUB (bb4, cc16, cc02, cc02) 3116 FNMSUB (bb4, cc15, cc01, cc01) 3117 /* c13/c14: BO[54] down to BO[48] */ 3118 LDF [BO + 54 * SIZE], a1 3119 LDF [BO + 53 * SIZE], a2 3120 LDF [BO + 52 * SIZE], a3 3121 LDF [BO + 51 * SIZE], a4 3122 LDF [BO + 50 * SIZE], b1 3123 LDF [BO + 49 * SIZE], b2 3124 LDF [BO + 48 * SIZE], b3 3125 3126 FMUL a1, c14, c14 3127 FMUL
a1, c13, c13 3128 3129 FNMSUB (aa2, cc14, cc12, cc12) 3130 FNMSUB (aa2, cc13, cc11, cc11) 3131 FNMSUB (aa3, cc14, cc10, cc10) 3132 FNMSUB (aa3, cc13, cc09, cc09) 3133 FNMSUB (aa4, cc14, cc08, cc08) 3134 FNMSUB (aa4, cc13, cc07, cc07) 3135 FNMSUB (bb1, cc14, cc06, cc06) 3136 FNMSUB (bb1, cc13, cc05, cc05) 3137 FNMSUB (bb2, cc14, cc04, cc04) 3138 FNMSUB (bb2, cc13, cc03, cc03) 3139 FNMSUB (bb3, cc14, cc02, cc02) 3140 FNMSUB (bb3, cc13, cc01, cc01) 3141 3142 LDF [BO + 45 * SIZE], a1 3143 LDF [BO + 44 * SIZE], a2 3144 LDF [BO + 43 * SIZE], a3 3145 LDF [BO + 42 * SIZE], a4 3146 LDF [BO + 41 * SIZE], b1 3147 LDF [BO + 40 * SIZE], b2 3148 3149 FMUL a1, c12, c12 3150 FMUL a1, c11, c11 3151 3152 FNMSUB (aa2, cc12, cc10, cc10) 3153 FNMSUB (aa2, cc11, cc09, cc09) 3154 FNMSUB (aa3, cc12, cc08, cc08) 3155 FNMSUB (aa3, cc11, cc07, cc07) 3156 FNMSUB (aa4, cc12, cc06, cc06) 3157 FNMSUB (aa4, cc11, cc05, cc05) 3158 FNMSUB (bb1, cc12, cc04, cc04) 3159 FNMSUB (bb1, cc11, cc03, cc03) 3160 FNMSUB (bb2, cc12, cc02, cc02) 3161 FNMSUB (bb2, cc11, cc01, cc01) 3162 3163 LDF [BO + 36 * SIZE], a1 3164 LDF [BO + 35 * SIZE], a2 3165 LDF [BO + 34 * SIZE], a3 3166 LDF [BO + 33 * SIZE], a4 3167 LDF [BO + 32 * SIZE], b1 3168 3169 FMUL a1, c10, c10 3170 FMUL a1, c09, c09 3171 3172 FNMSUB (aa2, cc10, cc08, cc08) 3173 FNMSUB (aa2, cc09, cc07, cc07) 3174 FNMSUB (aa3, cc10, cc06, cc06) 3175 FNMSUB (aa3, cc09, cc05, cc05) 3176 FNMSUB (aa4, cc10, cc04, cc04) 3177 FNMSUB (aa4, cc09, cc03, cc03) 3178 FNMSUB (bb1, cc10, cc02, cc02) 3179 FNMSUB (bb1, cc09, cc01, cc01) 3180 3181 LDF [BO + 27 * SIZE], a1 3182 LDF [BO + 26 * SIZE], a2 3183 LDF [BO + 25 * SIZE], a3 3184 LDF [BO + 24 * SIZE], a4 3185 3186 FMUL a1, c08, c08 3187 FMUL a1, c07, c07 3188 3189 FNMSUB (aa2, cc08, cc06, cc06) 3190 FNMSUB (aa2, cc07, cc05, cc05) 3191 FNMSUB (aa3, cc08, cc04, cc04) 3192 FNMSUB (aa3, cc07, cc03, cc03) 3193 FNMSUB (aa4, cc08, cc02, cc02) 3194 FNMSUB (aa4, cc07, cc01, cc01) 3195 3196 LDF [BO + 18 * SIZE], a1 3197 LDF [BO + 17 
* SIZE], a2 3198 LDF [BO + 16 * SIZE], a3 3199 3200 FMUL a1, c06, c06 3201 FMUL a1, c05, c05 3202 3203 FNMSUB (aa2, cc06, cc04, cc04) 3204 FNMSUB (aa2, cc05, cc03, cc03) 3205 FNMSUB (aa3, cc06, cc02, cc02) 3206 FNMSUB (aa3, cc05, cc01, cc01) 3207 3208 LDF [BO + 9 * SIZE], a1 3209 LDF [BO + 8 * SIZE], a2 3210 3211 FMUL a1, c04, c04 3212 FMUL a1, c03, c03 3213 3214 FNMSUB (aa2, cc04, cc02, cc02) 3215 FNMSUB (aa2, cc03, cc01, cc01) 3216 3217 LDF [BO + 0 * SIZE], a1 3218 3219 FMUL a1, c02, c02 3220 FMUL a1, c01, c01 3221#endif 3222 3223#ifdef LN 3224 add C1, -2 * SIZE, C1 3225 add C2, -2 * SIZE, C2 3226 add C3, -2 * SIZE, C3 3227 add C4, -2 * SIZE, C4 3228 add C5, -2 * SIZE, C5 3229 add C6, -2 * SIZE, C6 3230 add C7, -2 * SIZE, C7 3231 add C8, -2 * SIZE, C8 3232#endif 3233 3234#if defined(LN) || defined(LT) 3235 STF c01, [BO + 0 * SIZE] 3236 STF c03, [BO + 1 * SIZE] 3237 STF c05, [BO + 2 * SIZE] 3238 STF c07, [BO + 3 * SIZE] 3239 3240 STF c09, [BO + 4 * SIZE] 3241 STF c11, [BO + 5 * SIZE] 3242 STF c13, [BO + 6 * SIZE] 3243 STF c15, [BO + 7 * SIZE] 3244 3245 STF c02, [BO + 8 * SIZE] 3246 STF c04, [BO + 9 * SIZE] 3247 STF c06, [BO + 10 * SIZE] 3248 STF c08, [BO + 11 * SIZE] 3249 3250 STF c10, [BO + 12 * SIZE] 3251 STF c12, [BO + 13 * SIZE] 3252 STF c14, [BO + 14 * SIZE] 3253 STF c16, [BO + 15 * SIZE] 3254#else 3255 STF c01, [AO + 0 * SIZE] 3256 STF c02, [AO + 1 * SIZE] 3257 STF c03, [AO + 2 * SIZE] 3258 STF c04, [AO + 3 * SIZE] 3259 3260 STF c05, [AO + 4 * SIZE] 3261 STF c06, [AO + 5 * SIZE] 3262 STF c07, [AO + 6 * SIZE] 3263 STF c08, [AO + 7 * SIZE] 3264 3265 STF c09, [AO + 8 * SIZE] 3266 STF c10, [AO + 9 * SIZE] 3267 STF c11, [AO + 10 * SIZE] 3268 STF c12, [AO + 11 * SIZE] 3269 3270 STF c13, [AO + 12 * SIZE] 3271 STF c14, [AO + 13 * SIZE] 3272 STF c15, [AO + 14 * SIZE] 3273 STF c16, [AO + 15 * SIZE] 3274#endif 3275 3276 STF c01, [C1 + 0 * SIZE] 3277 STF c02, [C1 + 1 * SIZE] 3278 STF c03, [C2 + 0 * SIZE] 3279 STF c04, [C2 + 1 * SIZE] 3280 3281 STF c05, [C3 + 0 * SIZE] 
3282 STF c06, [C3 + 1 * SIZE] 3283 STF c07, [C4 + 0 * SIZE] 3284 STF c08, [C4 + 1 * SIZE] 3285 3286 STF c09, [C5 + 0 * SIZE] 3287 STF c10, [C5 + 1 * SIZE] 3288 STF c11, [C6 + 0 * SIZE] 3289 STF c12, [C6 + 1 * SIZE] 3290 3291 STF c13, [C7 + 0 * SIZE] 3292 STF c14, [C7 + 1 * SIZE] 3293 STF c15, [C8 + 0 * SIZE] 3294 STF c16, [C8 + 1 * SIZE] 3295 /* Every variant except LN advances C1..C8 by 2 * SIZE after the store. */ 3296#ifndef LN 3297 add C1, 2 * SIZE, C1 3298 add C2, 2 * SIZE, C2 3299 add C3, 2 * SIZE, C3 3300 add C4, 2 * SIZE, C4 3301 add C5, 2 * SIZE, C5 3302 add C6, 2 * SIZE, C6 3303 add C7, 2 * SIZE, C7 3304 add C8, 2 * SIZE, C8 3305#endif 3306 /* RT: advance AORIG by K << (BASE_SHIFT + 1). */ 3307#ifdef RT 3308 sll K, BASE_SHIFT + 1, TEMP1 3309 add AORIG, TEMP1, AORIG 3310#endif 3311 /* LT/RN: skip the remaining K - KK steps: AO moves by (K - KK) << (BASE_SHIFT + 1), BO by (K - KK) << (BASE_SHIFT + 3). */ 3312#if defined(LT) || defined(RN) 3313 sub K, KK, TEMP1 3314 sll TEMP1, BASE_SHIFT + 1, TEMP2 3315 sll TEMP1, BASE_SHIFT + 3, TEMP1 3316 add AO, TEMP2, AO 3317 add BO, TEMP1, BO 3318#endif 3319 /* KK moves by two: +2 for LT, -2 for LN. */ 3320#ifdef LT 3321 add KK, 2, KK 3322#endif 3323 3324#ifdef LN 3325 sub KK, 2, KK 3326#endif 3327 /* Count I down and repeat from .LL12 while it is still positive. */ 3328 add I, -1, I 3329 cmp I, 0 3330 bg,pt %icc, .LL12 3331 nop 3332 .align 4 3333 /* .LL20: executed only when M is odd (M & 1 != 0); otherwise branch straight to .LL29. */ 3334.LL20: 3335 and M, 1, I 3336 cmp I, 0 3337 ble,pn %icc, .LL29 3338 nop 3339 /* Position the operand pointers. LT/RN restart BO at B. The other variants set AO = AORIG + (KK << BASE_SHIFT) and BO = B + (KK << (BASE_SHIFT + 3)); LN first moves AORIG back by K << BASE_SHIFT. */ 3340#if defined(LT) || defined(RN) 3341 mov B, BO 3342#else 3343#ifdef LN 3344 sll K, BASE_SHIFT + 0, TEMP1 3345 sub AORIG, TEMP1, AORIG 3346#endif 3347 3348 sll KK, BASE_SHIFT + 0, TEMP1 3349 sll KK, BASE_SHIFT + 3, TEMP2 3350 3351 add AORIG, TEMP1, AO 3352 add B, TEMP2, BO 3353#endif 3354 /* Preload a1..a4 from AO and b1..b8 from BO, interleaved with FCLR on the eight accumulators cc01, cc03, ..., cc15 (FCLR presumably zeroes the register; macro defined outside this chunk). */ 3355 LDF [AO + 0 * SIZE], a1 3356 LDF [AO + 1 * SIZE], a2 3357 LDF [AO + 2 * SIZE], a3 3358 LDF [AO + 3 * SIZE], a4 3359 3360 LDF [BO + 0 * SIZE], b1 3361 FCLR (cc01) 3362 LDF [BO + 1 * SIZE], b2 3363 FCLR (cc03) 3364 LDF [BO + 2 * SIZE], b3 3365 FCLR (cc05) 3366 LDF [BO + 3 * SIZE], b4 3367 FCLR (cc07) 3368 LDF [BO + 4 * SIZE], b5 3369 FCLR (cc09) 3370 LDF [BO + 5 * SIZE], b6 3371 FCLR (cc11) 3372 LDF [BO + 6 * SIZE], b7 3373 FCLR (cc13) 3374 LDF [BO + 7 * SIZE], b8 3375 FCLR (cc15) 3376 /* Trip count L for the unrolled loop: KK >> 2 for LT/RN; the other variants start from K - KK. */ 3377#if defined(LT) || defined(RN) 3378 sra KK, 2, L 3379#else 3380 sub K, KK, L
3381 sra L, 2, L 3382#endif /* Skip the unrolled loop when L <= 0. The LDF of b9 that follows the branch occupies its delay slot, so it runs on both paths. */ 3383 cmp L, 0 3384 ble,pn %icc, .LL25 3385 LDF [BO + 8 * SIZE], b9 3386 .align 4 3387 /* .LL23: four k-steps per pass. a1, a2, a3 and a4 each multiply one group of eight B values (BO offsets 0-7, 8-15, 16-23, 24-31) into cc01, cc03, ..., cc15, while the interleaved LDFs refill b1..b9 for later steps. b1 and b9 alternate as the first B value so the next group's head can be loaded early. NOTE(review): FMADD (x, y, z, z) presumably yields z + x*y; macro defined outside this chunk - confirm. */ 3388.LL23: 3389 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3390 add L, -1, L 3391 3392 FMADD (aa1, bb1, cc01, cc01) 3393 LDF [BO + 16 * SIZE], b1 3394 FMADD (aa1, bb2, cc03, cc03) 3395 LDF [BO + 9 * SIZE], b2 3396 3397 FMADD (aa1, bb3, cc05, cc05) 3398 LDF [BO + 10 * SIZE], b3 3399 FMADD (aa1, bb4, cc07, cc07) 3400 LDF [BO + 11 * SIZE], b4 3401 3402 FMADD (aa1, bb5, cc09, cc09) 3403 LDF [BO + 12 * SIZE], b5 3404 FMADD (aa1, bb6, cc11, cc11) 3405 LDF [BO + 13 * SIZE], b6 3406 3407 FMADD (aa1, bb7, cc13, cc13) 3408 LDF [BO + 14 * SIZE], b7 3409 FMADD (aa1, bb8, cc15, cc15) 3410 LDF [BO + 15 * SIZE], b8 3411 3412 FMADD (aa2, bb9, cc01, cc01) 3413 LDF [BO + 24 * SIZE], b9 3414 FMADD (aa2, bb2, cc03, cc03) 3415 LDF [BO + 17 * SIZE], b2 3416 3417 FMADD (aa2, bb3, cc05, cc05) 3418 LDF [BO + 18 * SIZE], b3 3419 FMADD (aa2, bb4, cc07, cc07) 3420 LDF [BO + 19 * SIZE], b4 3421 3422 FMADD (aa2, bb5, cc09, cc09) 3423 LDF [BO + 20 * SIZE], b5 3424 FMADD (aa2, bb6, cc11, cc11) 3425 LDF [BO + 21 * SIZE], b6 3426 3427 FMADD (aa2, bb7, cc13, cc13) 3428 LDF [BO + 22 * SIZE], b7 3429 FMADD (aa2, bb8, cc15, cc15) 3430 LDF [BO + 23 * SIZE], b8 3431 /* a1/a2 have been consumed for this pass; reload them for the next one. */ 3432 LDF [AO + 4 * SIZE], a1 3433 LDF [AO + 5 * SIZE], a2 3434 3435 FMADD (aa3, bb1, cc01, cc01) 3436 LDF [BO + 32 * SIZE], b1 3437 FMADD (aa3, bb2, cc03, cc03) 3438 LDF [BO + 25 * SIZE], b2 3439 3440 FMADD (aa3, bb3, cc05, cc05) 3441 LDF [BO + 26 * SIZE], b3 3442 FMADD (aa3, bb4, cc07, cc07) 3443 LDF [BO + 27 * SIZE], b4 3444 3445 FMADD (aa3, bb5, cc09, cc09) 3446 LDF [BO + 28 * SIZE], b5 3447 FMADD (aa3, bb6, cc11, cc11) 3448 LDF [BO + 29 * SIZE], b6 3449 3450 FMADD (aa3, bb7, cc13, cc13) 3451 LDF [BO + 30 * SIZE], b7 3452 FMADD (aa3, bb8, cc15, cc15) 3453 LDF [BO + 31 * SIZE], b8 3454 3455 FMADD (aa4, bb9, cc01, cc01) 3456 LDF [BO + 40 * SIZE], b9 3457 FMADD (aa4, bb2, cc03, cc03) 3458 LDF [BO + 33 * SIZE],
b2 3459 3460 FMADD (aa4, bb3, cc05, cc05) 3461 LDF [BO + 34 * SIZE], b3 3462 FMADD (aa4, bb4, cc07, cc07) 3463 LDF [BO + 35 * SIZE], b4 3464 3465 FMADD (aa4, bb5, cc09, cc09) 3466 LDF [BO + 36 * SIZE], b5 3467 FMADD (aa4, bb6, cc11, cc11) 3468 LDF [BO + 37 * SIZE], b6 3469 3470 FMADD (aa4, bb7, cc13, cc13) 3471 LDF [BO + 38 * SIZE], b7 3472 FMADD (aa4, bb8, cc15, cc15) 3473 LDF [BO + 39 * SIZE], b8 3474 /* Reload a3/a4 for the next pass, then advance AO by 4 * SIZE and BO by 32 * SIZE; the BO add sits in the delay slot of the loop branch. */ 3475 LDF [AO + 6 * SIZE], a3 3476 LDF [AO + 7 * SIZE], a4 3477 3478 add AO, 4 * SIZE, AO 3479 cmp L, 0 3480 bg,pt %icc, .LL23 3481 add BO, 32 * SIZE, BO 3482 .align 4 3483 /* .LL25: leftover k-steps. L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; go to .LL28 when there are none. */ 3484.LL25: 3485#if defined(LT) || defined(RN) 3486 and KK, 3, L 3487#else 3488 sub K, KK, L 3489 and L, 3, L 3490#endif 3491 cmp L, 0 3492 ble,a,pn %icc, .LL28 3493 nop 3494 .align 4 3495 /* .LL27: one k-step per pass: a1 times b1..b8 accumulated into cc01, cc03, ..., cc15, with b1..b8 and a1 reloaded from the next positions. AO advances by 1 * SIZE, BO by 8 * SIZE (delay slot). */ 3496.LL27: 3497 FMADD (aa1, bb1, cc01, cc01) 3498 LDF [BO + 8 * SIZE], b1 3499 FMADD (aa1, bb2, cc03, cc03) 3500 LDF [BO + 9 * SIZE], b2 3501 3502 FMADD (aa1, bb3, cc05, cc05) 3503 LDF [BO + 10 * SIZE], b3 3504 FMADD (aa1, bb4, cc07, cc07) 3505 LDF [BO + 11 * SIZE], b4 3506 3507 FMADD (aa1, bb5, cc09, cc09) 3508 LDF [BO + 12 * SIZE], b5 3509 FMADD (aa1, bb6, cc11, cc11) 3510 LDF [BO + 13 * SIZE], b6 3511 3512 FMADD (aa1, bb7, cc13, cc13) 3513 LDF [BO + 14 * SIZE], b7 3514 FMADD (aa1, bb8, cc15, cc15) 3515 LDF [BO + 15 * SIZE], b8 3516 3517 LDF [AO + 1 * SIZE], a1 3518 add AO, 1 * SIZE, AO 3519 3520 add L, -1, L 3521 cmp L, 0 3522 bg,pt %icc, .LL27 3523 add BO, 8 * SIZE, BO 3524 .align 4 3525 /* .LL28: for LN/RT, re-point AO/BO before the solve: TEMP1 = KK - 1 (LN) or KK - 8 (RT); AO = AORIG + (TEMP1 << BASE_SHIFT), BO = B + (TEMP1 << (BASE_SHIFT + 3)). */ 3526.LL28: 3527#if defined(LN) || defined(RT) 3528#ifdef LN 3529 sub KK, 1, TEMP1 3530#else 3531 sub KK, 8, TEMP1 3532#endif 3533 sll TEMP1, BASE_SHIFT + 0, TEMP2 3534 sll TEMP1, BASE_SHIFT + 3, TEMP1 3535 3536 add AORIG, TEMP2, AO 3537 add B, TEMP1, BO 3538#endif 3539 /* LN/LT: fetch eight values from BO into a1..a4 and b1..b4. */ 3540#if defined(LN) || defined(LT) 3541 LDF [BO + 0 * SIZE], a1 3542 LDF [BO + 1 * SIZE], a2 3543 LDF [BO + 2 * SIZE], a3 3544 LDF [BO + 3 * SIZE], a4 3545 3546 LDF [BO + 4 * SIZE], b1 3547 LDF [BO + 5 * SIZE], b2 3548 LDF [BO + 6 * SIZE], b3 3549 LDF [BO + 7 *
SIZE], b4 3550 /* Each accumulator becomes (loaded value - accumulator), reading FSUB a, c, c as the SPARC fsub form rs1 - rs2 -> rd (FSUB itself is a macro defined outside this chunk). */ 3551 FSUB a1, c01, c01 3552 FSUB a2, c03, c03 3553 FSUB a3, c05, c05 3554 FSUB a4, c07, c07 3555 3556 FSUB b1, c09, c09 3557 FSUB b2, c11, c11 3558 FSUB b3, c13, c13 3559 FSUB b4, c15, c15 3560#else /* Other variants (neither LN nor LT): same subtraction, but the eight values are read from AO. */ 3561 LDF [AO + 0 * SIZE], a1 3562 LDF [AO + 1 * SIZE], a2 3563 LDF [AO + 2 * SIZE], a3 3564 LDF [AO + 3 * SIZE], a4 3565 3566 LDF [AO + 4 * SIZE], b1 3567 LDF [AO + 5 * SIZE], b2 3568 LDF [AO + 6 * SIZE], b3 3569 LDF [AO + 7 * SIZE], b4 3570 3571 FSUB a1, c01, c01 3572 FSUB a2, c03, c03 3573 FSUB a3, c05, c05 3574 FSUB a4, c07, c07 3575 3576 FSUB b1, c09, c09 3577 FSUB b2, c11, c11 3578 FSUB b3, c13, c13 3579 FSUB b4, c15, c15 3580#endif 3581 /* LN/LT: one factor loaded from AO scales all eight results. NOTE(review): multiplying (rather than dividing) suggests the stored factor is already a reciprocal - confirm against the packing code. */ 3582#if defined(LN) || defined(LT) 3583 LDF [AO + 0 * SIZE], a1 3584 3585 FMUL a1, c01, c01 3586 FMUL a1, c03, c03 3587 FMUL a1, c05, c05 3588 FMUL a1, c07, c07 3589 FMUL a1, c09, c09 3590 FMUL a1, c11, c11 3591 FMUL a1, c13, c13 3592 FMUL a1, c15, c15 3593#endif 3594 /* RN: forward pass over the 64 entries at BO. Step i loads the entry at BO + 9*i (0, 9, 18, ..., 63) plus the entries after it up to the next multiple of 8, scales one result by the first, and uses the rest to update the later results. NOTE(review): FNMSUB (x, y, z, z) presumably yields z - x*y; macro defined outside this chunk - confirm. */ 3595#ifdef RN 3596 LDF [BO + 0 * SIZE], a1 3597 LDF [BO + 1 * SIZE], a2 3598 LDF [BO + 2 * SIZE], a3 3599 LDF [BO + 3 * SIZE], a4 3600 LDF [BO + 4 * SIZE], b1 3601 LDF [BO + 5 * SIZE], b2 3602 LDF [BO + 6 * SIZE], b3 3603 LDF [BO + 7 * SIZE], b4 3604 3605 FMUL a1, c01, c01 3606 3607 FNMSUB (aa2, cc01, cc03, cc03) 3608 FNMSUB (aa3, cc01, cc05, cc05) 3609 FNMSUB (aa4, cc01, cc07, cc07) 3610 FNMSUB (bb1, cc01, cc09, cc09) 3611 FNMSUB (bb2, cc01, cc11, cc11) 3612 FNMSUB (bb3, cc01, cc13, cc13) 3613 FNMSUB (bb4, cc01, cc15, cc15) 3614 /* c03 step (BO+9..15). */ 3615 LDF [BO + 9 * SIZE], a1 3616 LDF [BO + 10 * SIZE], a2 3617 LDF [BO + 11 * SIZE], a3 3618 LDF [BO + 12 * SIZE], a4 3619 LDF [BO + 13 * SIZE], b1 3620 LDF [BO + 14 * SIZE], b2 3621 LDF [BO + 15 * SIZE], b3 3622 3623 FMUL a1, c03, c03 3624 3625 FNMSUB (aa2, cc03, cc05, cc05) 3626 FNMSUB (aa3, cc03, cc07, cc07) 3627 FNMSUB (aa4, cc03, cc09, cc09) 3628 FNMSUB (bb1, cc03, cc11, cc11) 3629 FNMSUB (bb2, cc03, cc13, cc13) 3630 FNMSUB (bb3, cc03, cc15, cc15) 3631 /* c05 step (BO+18..23). */ 3632 LDF [BO + 18 * SIZE], a1 3633 LDF [BO + 19 * SIZE], a2 3634 LDF
[BO + 20 * SIZE], a3 3635 LDF [BO + 21 * SIZE], a4 3636 LDF [BO + 22 * SIZE], b1 3637 LDF [BO + 23 * SIZE], b2 3638 3639 FMUL a1, c05, c05 3640 3641 FNMSUB (aa2, cc05, cc07, cc07) 3642 FNMSUB (aa3, cc05, cc09, cc09) 3643 FNMSUB (aa4, cc05, cc11, cc11) 3644 FNMSUB (bb1, cc05, cc13, cc13) 3645 FNMSUB (bb2, cc05, cc15, cc15) 3646 /* c07 step (BO+27..31). */ 3647 LDF [BO + 27 * SIZE], a1 3648 LDF [BO + 28 * SIZE], a2 3649 LDF [BO + 29 * SIZE], a3 3650 LDF [BO + 30 * SIZE], a4 3651 LDF [BO + 31 * SIZE], b1 3652 3653 FMUL a1, c07, c07 3654 3655 FNMSUB (aa2, cc07, cc09, cc09) 3656 FNMSUB (aa3, cc07, cc11, cc11) 3657 FNMSUB (aa4, cc07, cc13, cc13) 3658 FNMSUB (bb1, cc07, cc15, cc15) 3659 /* c09 step (BO+36..39). */ 3660 LDF [BO + 36 * SIZE], a1 3661 LDF [BO + 37 * SIZE], a2 3662 LDF [BO + 38 * SIZE], a3 3663 LDF [BO + 39 * SIZE], a4 3664 3665 FMUL a1, c09, c09 3666 3667 FNMSUB (aa2, cc09, cc11, cc11) 3668 FNMSUB (aa3, cc09, cc13, cc13) 3669 FNMSUB (aa4, cc09, cc15, cc15) 3670 /* c11 step (BO+45..47). */ 3671 LDF [BO + 45 * SIZE], a1 3672 LDF [BO + 46 * SIZE], a2 3673 LDF [BO + 47 * SIZE], a3 3674 3675 FMUL a1, c11, c11 3676 3677 FNMSUB (aa2, cc11, cc13, cc13) 3678 FNMSUB (aa3, cc11, cc15, cc15) 3679 /* c13 step (BO+54..55). */ 3680 LDF [BO + 54 * SIZE], a1 3681 LDF [BO + 55 * SIZE], a2 3682 3683 FMUL a1, c13, c13 3684 3685 FNMSUB (aa2, cc13, cc15, cc15) 3686 /* c15 step (BO+63): scale only. */ 3687 LDF [BO + 63 * SIZE], a1 3688 3689 FMUL a1, c15, c15 3690#endif 3691 /* RT: the mirrored pass. It starts at BO+63 and finishes at BO+0, scaling c15 first and eliminating toward c01, reading each group of entries in descending offset order. */ 3692#ifdef RT 3693 LDF [BO + 63 * SIZE], a1 3694 LDF [BO + 62 * SIZE], a2 3695 LDF [BO + 61 * SIZE], a3 3696 LDF [BO + 60 * SIZE], a4 3697 LDF [BO + 59 * SIZE], b1 3698 LDF [BO + 58 * SIZE], b2 3699 LDF [BO + 57 * SIZE], b3 3700 LDF [BO + 56 * SIZE], b4 3701 3702 FMUL a1, c15, c15 3703 3704 FNMSUB (aa2, cc15, cc13, cc13) 3705 FNMSUB (aa3, cc15, cc11, cc11) 3706 FNMSUB (aa4, cc15, cc09, cc09) 3707 FNMSUB (bb1, cc15, cc07, cc07) 3708 FNMSUB (bb2, cc15, cc05, cc05) 3709 FNMSUB (bb3, cc15, cc03, cc03) 3710 FNMSUB (bb4, cc15, cc01, cc01) 3711 /* c13 step (BO+54 down to BO+48). */ 3712 LDF [BO + 54 * SIZE], a1 3713 LDF [BO + 53 * SIZE], a2 3714 LDF [BO + 52 * SIZE], a3 3715 LDF [BO + 51 * SIZE], a4
3716 LDF [BO + 50 * SIZE], b1 3717 LDF [BO + 49 * SIZE], b2 3718 LDF [BO + 48 * SIZE], b3 3719 3720 FMUL a1, c13, c13 3721 3722 FNMSUB (aa2, cc13, cc11, cc11) 3723 FNMSUB (aa3, cc13, cc09, cc09) 3724 FNMSUB (aa4, cc13, cc07, cc07) 3725 FNMSUB (bb1, cc13, cc05, cc05) 3726 FNMSUB (bb2, cc13, cc03, cc03) 3727 FNMSUB (bb3, cc13, cc01, cc01) 3728 /* c11 step (BO+45 down to BO+40). */ 3729 LDF [BO + 45 * SIZE], a1 3730 LDF [BO + 44 * SIZE], a2 3731 LDF [BO + 43 * SIZE], a3 3732 LDF [BO + 42 * SIZE], a4 3733 LDF [BO + 41 * SIZE], b1 3734 LDF [BO + 40 * SIZE], b2 3735 3736 FMUL a1, c11, c11 3737 3738 FNMSUB (aa2, cc11, cc09, cc09) 3739 FNMSUB (aa3, cc11, cc07, cc07) 3740 FNMSUB (aa4, cc11, cc05, cc05) 3741 FNMSUB (bb1, cc11, cc03, cc03) 3742 FNMSUB (bb2, cc11, cc01, cc01) 3743 /* c09 step (BO+36 down to BO+32). */ 3744 LDF [BO + 36 * SIZE], a1 3745 LDF [BO + 35 * SIZE], a2 3746 LDF [BO + 34 * SIZE], a3 3747 LDF [BO + 33 * SIZE], a4 3748 LDF [BO + 32 * SIZE], b1 3749 3750 FMUL a1, c09, c09 3751 3752 FNMSUB (aa2, cc09, cc07, cc07) 3753 FNMSUB (aa3, cc09, cc05, cc05) 3754 FNMSUB (aa4, cc09, cc03, cc03) 3755 FNMSUB (bb1, cc09, cc01, cc01) 3756 /* c07 step (BO+27 down to BO+24). */ 3757 LDF [BO + 27 * SIZE], a1 3758 LDF [BO + 26 * SIZE], a2 3759 LDF [BO + 25 * SIZE], a3 3760 LDF [BO + 24 * SIZE], a4 3761 3762 FMUL a1, c07, c07 3763 3764 FNMSUB (aa2, cc07, cc05, cc05) 3765 FNMSUB (aa3, cc07, cc03, cc03) 3766 FNMSUB (aa4, cc07, cc01, cc01) 3767 /* c05 step (BO+18 down to BO+16). */ 3768 LDF [BO + 18 * SIZE], a1 3769 LDF [BO + 17 * SIZE], a2 3770 LDF [BO + 16 * SIZE], a3 3771 3772 FMUL a1, c05, c05 3773 3774 FNMSUB (aa2, cc05, cc03, cc03) 3775 FNMSUB (aa3, cc05, cc01, cc01) 3776 /* c03 step (BO+9, BO+8). */ 3777 LDF [BO + 9 * SIZE], a1 3778 LDF [BO + 8 * SIZE], a2 3779 3780 FMUL a1, c03, c03 3781 3782 FNMSUB (aa2, cc03, cc01, cc01) 3783 /* c01 step (BO+0): scale only. */ 3784 LDF [BO + 0 * SIZE], a1 3785 3786 FMUL a1, c01, c01 3787#endif 3788 /* LN only: step C1..C8 back by 1 * SIZE before the results are stored. */ 3789#ifdef LN 3790 add C1, -1 * SIZE, C1 3791 add C2, -1 * SIZE, C2 3792 add C3, -1 * SIZE, C3 3793 add C4, -1 * SIZE, C4 3794 add C5, -1 * SIZE, C5 3795 add C6, -1 * SIZE, C6 3796 add C7, -1 * SIZE, C7 3797 add C8, -1 * SIZE, C8 3798#endif 3799 3800#if
defined(LN) || defined(LT) 3801 STF c01, [BO + 0 * SIZE] 3802 STF c03, [BO + 1 * SIZE] 3803 STF c05, [BO + 2 * SIZE] 3804 STF c07, [BO + 3 * SIZE] 3805 3806 STF c09, [BO + 4 * SIZE] 3807 STF c11, [BO + 5 * SIZE] 3808 STF c13, [BO + 6 * SIZE] 3809 STF c15, [BO + 7 * SIZE] 3810#else /* The stores before this point (LN/LT) write the eight results back to BO; the stores after it (other variants) write the same eight registers to AO. */ 3811 STF c01, [AO + 0 * SIZE] 3812 STF c03, [AO + 1 * SIZE] 3813 STF c05, [AO + 2 * SIZE] 3814 STF c07, [AO + 3 * SIZE] 3815 3816 STF c09, [AO + 4 * SIZE] 3817 STF c11, [AO + 5 * SIZE] 3818 STF c13, [AO + 6 * SIZE] 3819 STF c15, [AO + 7 * SIZE] 3820#endif 3821 /* Store one element to each of C1..C8. Unlike the LN case, which stepped the pointers back beforehand, no variant advances them here. */ 3822 STF c01, [C1 + 0 * SIZE] 3823 STF c03, [C2 + 0 * SIZE] 3824 STF c05, [C3 + 0 * SIZE] 3825 STF c07, [C4 + 0 * SIZE] 3826 3827 STF c09, [C5 + 0 * SIZE] 3828 STF c11, [C6 + 0 * SIZE] 3829 STF c13, [C7 + 0 * SIZE] 3830 STF c15, [C8 + 0 * SIZE] 3831 /* RT: advance AORIG by K << BASE_SHIFT. */ 3832#ifdef RT 3833 sll K, BASE_SHIFT + 0, TEMP1 3834 add AORIG, TEMP1, AORIG 3835#endif 3836 /* LT/RN: skip the remaining K - KK steps: AO moves by (K - KK) << BASE_SHIFT, BO by (K - KK) << (BASE_SHIFT + 3). */ 3837#if defined(LT) || defined(RN) 3838 sub K, KK, TEMP1 3839 sll TEMP1, BASE_SHIFT + 0, TEMP2 3840 sll TEMP1, BASE_SHIFT + 3, TEMP1 3841 add AO, TEMP2, AO 3842 add BO, TEMP1, BO 3843#endif 3844 /* KK moves by one: +1 for LT, -1 for LN. */ 3845#ifdef LT 3846 add KK, 1, KK 3847#endif 3848 3849#ifdef LN 3850 sub KK, 1, KK 3851#endif 3852 .align 4 3853 /* .LL29: bottom of the J loop. LN advances B by K << (BASE_SHIFT + 3); LT/RN take the new B from BO; RN adds 8 to KK and RT subtracts 8. */ 3854.LL29: 3855#ifdef LN 3856 sll K, BASE_SHIFT + 3, TEMP1 3857 add B, TEMP1, B 3858#endif 3859 3860#if defined(LT) || defined(RN) 3861 mov BO, B 3862#endif 3863 3864#ifdef RN 3865 add KK, 8, KK 3866#endif 3867 3868#ifdef RT 3869 sub KK, 8, KK 3870#endif 3871 /* Count J down and loop to .LL11 while it is still positive. */ 3872 add J, -1, J 3873 cmp J, 0 3874 bg,pt %icc, .LL11 3875 nop 3876 .align 4 3877 /* .LL999: exit path. Under TRMMKERNEL, %g1..%g4 are reloaded from the stack (ld for 32-bit builds, ldx for 64-bit). NOTE(review): this file maps OFFSET/KK/TEMP1/TEMP2 onto %g1..%g4 unconditionally, yet the reload is guarded by TRMMKERNEL - confirm the matching save in the prologue uses the same guard. The clr %o0 after return sits in its delay slot. */ 3878.LL999: 3879#ifdef TRMMKERNEL 3880#ifndef __64BIT__ 3881 ld [%sp + STACK_START + 8], %g1 3882 ld [%sp + STACK_START + 12], %g2 3883 ld [%sp + STACK_START + 16], %g3 3884 ld [%sp + STACK_START + 20], %g4 3885#else 3886 ldx [%sp + STACK_START + 32], %g1 3887 ldx [%sp + STACK_START + 40], %g2 3888 ldx [%sp + STACK_START + 48], %g3 3889 ldx [%sp + STACK_START + 56], %g4 3890#endif 3891#endif 3892 3893 return %i7 + 8 3894 clr %o0 3895 3896 EPILOGUE 3897