1/*********************************************************************/ 2/* Copyright 2005-2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 /* Preamble. ASSEMBLER is defined before common.h is included. Names used later that are not defined in the visible text (PROLOGUE, SAVESP, LDF, STF, FCLR, FMADD, FNMSUB, FSUB, FMUL, SIZE, BASE_SHIFT, STACK_START) presumably come from that header - TODO confirm. */ 39#define ASSEMBLER 40#include "common.h" 41 /* Used only by the "prefetch [AO + (APREFETCHSIZE + n) * SIZE], APREFETCH_CATEGORY" instructions in the unrolled loop: APREFETCHSIZE is the distance ahead of AO in units of SIZE, APREFETCH_CATEGORY is the prefetch function code. */ 42#define APREFETCHSIZE 24 43#define APREFETCH_CATEGORY 0 44 /* M, N, K live in input registers %i0-%i2. Further down, N shifted right by 3 seeds J and M shifted right by 1 seeds I. */ 45#define M %i0 46#define N %i1 47#define K %i2 48 /* A and B swap registers when DOUBLE is built without __64BIT__; in that configuration the prologue also reloads B from the stack frame. */ 49#if defined(DOUBLE) && !defined(__64BIT__) 50#define A %i5 51#define B %i4 52#else 53#define A %i4 54#define B %i5 55#endif 56 /* C and LDC are kept in output registers; both are loaded from the stack frame in the prologue. */ 57#define C %o4 58#define LDC %o5 59 /* AO and BO are the working pointers that the LDF loads index from. L is the inner-loop trip counter (decremented and compared against 0 in the loop); I and J receive the M- and N-derived counts. */ 60#define AO %l0 61#define BO %l1 62#define I %l2 63#define J %l3 64#define L %l4 65 /* C1..C8: eight pointers spaced LDC apart, derived from C at .LL11 (ascending with add, or descending with sub when RT is defined). */ 66#define C1 %o0 67#define C2 %o1 68#define C3 %o2 69#define C4 %o3 70 71#define C5 %l5 72#define C6 %l6 73#define C7 %l7 74#define C8 %i3 75 /* OFFSET, KK and the two temporaries occupy global registers %g1-%g4; the prologue stores those four registers into the stack frame. AORIG receives a copy of A when LN or RT is defined. */ 76#define OFFSET %g1 77#define KK %g2 78#define TEMP1 %g3 79#define TEMP2 %g4 80#define AORIG %o7 81 /* DOUBLE build: c01..c16 are the sixteen accumulators on even registers %f0..%f30, a1..a5 sit on %f32..%f40 and b1..b9 on %f42..%f58. */ 82#ifdef DOUBLE 83#define c01 %f0 84#define c02 %f2 85#define c03 %f4 86#define c04 %f6 87#define c05 %f8 88#define c06 %f10 89#define c07 %f12 90#define c08 %f14 91#define c09 %f16 92#define c10 %f18 93#define c11 %f20 94#define c12 %f22 95#define c13 %f24 96#define c14 %f26 97#define c15 %f28 98#define c16 %f30 99 100#define a1 %f32 101#define a2 %f34 102#define a3 %f36 103#define a4 %f38 104#define a5 %f40 105 106#define b1 %f42 107#define b2 %f44 108#define b3 %f46 109#define b4 %f48 110#define b5 %f50 111#define b6 %f52 112#define b7 %f54 113#define b8 %f56 114#define b9 %f58 115 /* ccNN / aaN / bbN are plain integers handed to the FCLR, FMADD and FNMSUB macros in place of register names. For the accumulators the integer equals the register number. For a and b the integers are odd (1..27) although the registers are %f32..%f58 - presumably the 5-bit V9 encoding of the upper double registers; NOTE(review): confirm against the macro definitions. */ 116#define cc01 0 117#define cc02 2 118#define cc03 4 119#define cc04 6 120#define cc05 8 121#define cc06 10 122#define cc07 12 123#define cc08 14 124#define cc09 16 125#define cc10 18 126#define cc11 20 127#define cc12 22 128#define cc13 24 129#define cc14 26 130#define cc15 28 131#define cc16 30 132 133#define aa1 1 134#define aa2 3 135#define aa3 5 136#define aa4 7 137#define aa5 9 138 139#define bb1 11 140#define bb2 13 141#define bb3 15 142#define bb4 17 143#define bb5 19 144#define bb6 21 145#define bb7 23 146#define bb8 25 147#define bb9 27 148 /* Non-DOUBLE build: the same thirty names are packed onto consecutive registers %f0..%f29. */ 149#else 150#define c01 %f0 151#define c02 %f1 152#define c03 %f2 153#define c04 %f3 154#define c05 %f4 155#define c06 
%f5 156#define c07 %f6 157#define c08 %f7 158#define c09 %f8 159#define c10 %f9 160#define c11 %f10 161#define c12 %f11 162#define c13 %f12 163#define c14 %f13 164#define c15 %f14 165#define c16 %f15 166 167#define a1 %f16 168#define a2 %f17 169#define a3 %f18 170#define a4 %f19 171#define a5 %f20 172 173#define b1 %f21 174#define b2 %f22 175#define b3 %f23 176#define b4 %f24 177#define b5 %f25 178#define b6 %f26 179#define b7 %f27 180#define b8 %f28 181#define b9 %f29 182 /* Non-DOUBLE macro operands: here every ccNN / aaN / bbN integer equals the register number of the matching name above. */ 183#define cc01 0 184#define cc02 1 185#define cc03 2 186#define cc04 3 187#define cc05 4 188#define cc06 5 189#define cc07 6 190#define cc08 7 191#define cc09 8 192#define cc10 9 193#define cc11 10 194#define cc12 11 195#define cc13 12 196#define cc14 13 197#define cc15 14 198#define cc16 15 199 200#define aa1 16 201#define aa2 17 202#define aa3 18 203#define aa4 19 204#define aa5 20 205 206#define bb1 21 207#define bb2 22 208#define bb3 23 209#define bb4 24 210#define bb5 25 211#define bb6 26 212#define bb7 27 213#define bb8 28 214#define bb9 29 215 216#endif 217 /* Declare %g2 and %g3 (the registers behind KK and TEMP1) as scratch to the assembler. */ 218 .register %g2, #scratch 219 .register %g3, #scratch 220 /* Kernel entry. PROLOGUE and SAVESP are macros defined outside the visible text. */ 221 PROLOGUE 222 SAVESP 223 nop 224 /* Non-64-bit build: the remaining arguments are fetched from the stack frame with 32-bit loads. With DOUBLE the slots start with B at offset 28 and C, LDC, OFFSET follow one word later than in the non-DOUBLE layout. */ 225#ifndef __64BIT__ 226 227#ifdef DOUBLE 228 ld [%sp + STACK_START + 28], B 229 ld [%sp + STACK_START + 32], C 230 ld [%sp + STACK_START + 36], LDC 231 ld [%sp + STACK_START + 40], OFFSET 232#else 233 ld [%sp + STACK_START + 28], C 234 ld [%sp + STACK_START + 32], LDC 235 ld [%sp + STACK_START + 36], OFFSET 236#endif /* Store %g1-%g4 into the frame. NOTE(review): OFFSET is %g1 and has already been loaded above, so the first slot receives OFFSET rather than the incoming %g1 value - confirm this is intended. */ 237 st %g1, [%sp + STACK_START + 8] 238 st %g2, [%sp + STACK_START + 12] 239 st %g3, [%sp + STACK_START + 16] 240 st %g4, [%sp + STACK_START + 20] 241#else 242 /* 64-bit build: same sequence using ldx/stx and 8-byte slots. The %g4 store below reuses offset 56, the slot C was just read from. */ 243 ldx [%sp+ STACK_START + 56], C 244 ldx [%sp+ STACK_START + 64], LDC 245 ldx [%sp+ STACK_START + 72], OFFSET 246 247 stx %g1, [%sp + STACK_START + 32] 248 stx %g2, [%sp + STACK_START + 40] 249 stx %g3, [%sp + STACK_START + 48] 250 stx %g4, [%sp + STACK_START + 56] 251#endif 252 /* TRMMKERNEL without LEFT: start with KK = -OFFSET. */ 253#if defined(TRMMKERNEL) && !defined(LEFT) 254 neg OFFSET, KK 255#endif 256 /* LDC is shifted left by BASE_SHIFT in place (presumably converting an element count to a byte stride - confirm). */ 257 sll LDC, 
BASE_SHIFT, LDC 258 259#ifdef LN 260 smul M, K, TEMP1 261 sll TEMP1, BASE_SHIFT, TEMP1 262 add A, TEMP1, A 263 264 sll M, BASE_SHIFT, TEMP1 265 add C, TEMP1, C 266#endif 267 268#ifdef RN 269 neg OFFSET, KK 270#endif 271 272#ifdef RT 273 smul N, K, TEMP1 274 sll TEMP1, BASE_SHIFT, TEMP1 275 add B, TEMP1, B 276 277 smul N, LDC, TEMP1 278 add C, TEMP1, C 279 280 sub N, OFFSET, KK 281#endif 282 283 sra N, 3, J 284 cmp J, 0 285 ble,pn %icc, .LL30 286 nop 287 .align 4 288 289.LL11: 290#ifdef RT 291 sll K, BASE_SHIFT + 3, TEMP1 292 sub B, TEMP1, B 293#endif 294 295#ifndef RT 296 mov C, C1 297 add C, LDC, C2 298 add C2, LDC, C3 299 add C3, LDC, C4 300 add C4, LDC, C5 301 add C5, LDC, C6 302 add C6, LDC, C7 303 add C7, LDC, C8 304 add C8, LDC, C 305#else 306 sub C, LDC, C8 307 sub C8, LDC, C7 308 sub C7, LDC, C6 309 sub C6, LDC, C5 310 sub C5, LDC, C4 311 sub C4, LDC, C3 312 sub C3, LDC, C2 313 sub C2, LDC, C1 314 sub C2, LDC, C 315#endif 316 317#ifdef LN 318 add M, OFFSET, KK 319#endif 320 321#ifdef LT 322 mov OFFSET, KK 323#endif 324 325#if defined(LN) || defined(RT) 326 mov A, AORIG 327#else 328 mov A, AO 329#endif 330 331 sra M, 1, I 332 cmp I, 0 333 ble,pn %icc, .LL20 334 nop 335 .align 4 336 337.LL12: 338#if defined(LT) || defined(RN) 339 mov B, BO 340#else 341#ifdef LN 342 sll K, BASE_SHIFT + 1, TEMP1 343 sub AORIG, TEMP1, AORIG 344#endif 345 346 sll KK, BASE_SHIFT + 1, TEMP1 347 sll KK, BASE_SHIFT + 3, TEMP2 348 349 add AORIG, TEMP1, AO 350 add B, TEMP2, BO 351#endif 352 353 LDF [AO + 0 * SIZE], a1 354 LDF [AO + 1 * SIZE], a2 355 LDF [AO + 8 * SIZE], a5 356 357 LDF [BO + 0 * SIZE], b1 358 359 LDF [BO + 1 * SIZE], b2 360 FCLR (cc01) 361 LDF [BO + 2 * SIZE], b3 362 FCLR (cc05) 363 LDF [BO + 3 * SIZE], b4 364 FCLR (cc09) 365 LDF [BO + 4 * SIZE], b5 366 FCLR (cc13) 367 368 LDF [BO + 5 * SIZE], b6 369 FCLR (cc02) 370 LDF [BO + 6 * SIZE], b7 371 FCLR (cc06) 372 LDF [BO + 7 * SIZE], b8 373 FCLR (cc10) 374 LDF [BO + 8 * SIZE], b9 375 FCLR (cc14) 376 377 prefetch [C1 + 1 * 
SIZE], 3 378 FCLR (cc03) 379 prefetch [C2 + 2 * SIZE], 3 380 FCLR (cc07) 381 prefetch [C3 + 1 * SIZE], 3 382 FCLR (cc11) 383 prefetch [C4 + 2 * SIZE], 3 384 FCLR (cc15) 385 386 prefetch [C5 + 1 * SIZE], 3 387 FCLR (cc04) 388 prefetch [C6 + 2 * SIZE], 3 389 FCLR (cc08) 390 prefetch [C7 + 1 * SIZE], 3 391 FCLR (cc12) 392 prefetch [C8 + 2 * SIZE], 3 393 FCLR (cc16) 394 395#if defined(LT) || defined(RN) 396 sra KK, 3, L 397#else 398 sub K, KK, L 399 sra L, 3, L 400#endif 401 cmp L, 0 402 ble,pn %icc, .LL15 403 nop 404 .align 4 405 406.LL13: 407 FMADD (aa1, bb1, cc01, cc01) 408 FMADD (aa2, bb1, cc02, cc02) 409 FMADD (aa1, bb2, cc03, cc03) 410 FMADD (aa2, bb2, cc04, cc04) 411 412 FMADD (aa1, bb3, cc05, cc05) 413 LDF [BO + 16 * SIZE], b1 414 FMADD (aa2, bb3, cc06, cc06) 415 LDF [BO + 9 * SIZE], b2 416 417 FMADD (aa1, bb4, cc07, cc07) 418 LDF [BO + 10 * SIZE], b3 419 FMADD (aa2, bb4, cc08, cc08) 420 LDF [BO + 11 * SIZE], b4 421 422 FMADD (aa1, bb5, cc09, cc09) 423 LDF [AO + 2 * SIZE], a3 424 FMADD (aa2, bb5, cc10, cc10) 425 LDF [AO + 3 * SIZE], a4 426 427 FMADD (aa1, bb6, cc11, cc11) 428 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 429 FMADD (aa2, bb6, cc12, cc12) 430 nop 431 432 FMADD (aa1, bb7, cc13, cc13) 433 LDF [BO + 12 * SIZE], b5 434 FMADD (aa2, bb7, cc14, cc14) 435 LDF [BO + 13 * SIZE], b6 436 437 FMADD (aa1, bb8, cc15, cc15) 438 LDF [BO + 14 * SIZE], b7 439 FMADD (aa2, bb8, cc16, cc16) 440 LDF [BO + 15 * SIZE], b8 441 442 FMADD (aa3, bb9, cc01, cc01) 443 FMADD (aa4, bb9, cc02, cc02) 444 FMADD (aa3, bb2, cc03, cc03) 445 FMADD (aa4, bb2, cc04, cc04) 446 447 FMADD (aa3, bb3, cc05, cc05) 448 LDF [BO + 24 * SIZE], b9 449 FMADD (aa4, bb3, cc06, cc06) 450 LDF [BO + 17 * SIZE], b2 451 452 FMADD (aa3, bb4, cc07, cc07) 453 LDF [BO + 18 * SIZE], b3 454 FMADD (aa4, bb4, cc08, cc08) 455 LDF [BO + 19 * SIZE], b4 456 457 FMADD (aa3, bb5, cc09, cc09) 458 LDF [AO + 4 * SIZE], a1 459 FMADD (aa4, bb5, cc10, cc10) 460 LDF [AO + 5 * SIZE], a2 461 462 FMADD (aa3, bb6, 
cc11, cc11) 463 add L, -1, L 464 FMADD (aa4, bb6, cc12, cc12) 465 nop 466 467 FMADD (aa3, bb7, cc13, cc13) 468 LDF [BO + 20 * SIZE], b5 469 FMADD (aa4, bb7, cc14, cc14) 470 LDF [BO + 21 * SIZE], b6 471 472 FMADD (aa3, bb8, cc15, cc15) 473 LDF [BO + 22 * SIZE], b7 474 FMADD (aa4, bb8, cc16, cc16) 475 LDF [BO + 23 * SIZE], b8 476 477 FMADD (aa1, bb1, cc01, cc01) 478 FMADD (aa2, bb1, cc02, cc02) 479 FMADD (aa1, bb2, cc03, cc03) 480 FMADD (aa2, bb2, cc04, cc04) 481 482 FMADD (aa1, bb3, cc05, cc05) 483 LDF [BO + 32 * SIZE], b1 484 FMADD (aa2, bb3, cc06, cc06) 485 LDF [BO + 25 * SIZE], b2 486 487 FMADD (aa1, bb4, cc07, cc07) 488 LDF [BO + 26 * SIZE], b3 489 FMADD (aa2, bb4, cc08, cc08) 490 LDF [BO + 27 * SIZE], b4 491 492 FMADD (aa1, bb5, cc09, cc09) 493 LDF [AO + 6 * SIZE], a3 494 FMADD (aa2, bb5, cc10, cc10) 495 LDF [AO + 7 * SIZE], a4 496 497 FMADD (aa1, bb6, cc11, cc11) 498 nop 499 FMADD (aa2, bb6, cc12, cc12) 500 nop 501 502 FMADD (aa1, bb7, cc13, cc13) 503 LDF [BO + 28 * SIZE], b5 504 FMADD (aa2, bb7, cc14, cc14) 505 LDF [BO + 29 * SIZE], b6 506 507 FMADD (aa1, bb8, cc15, cc15) 508 LDF [BO + 30 * SIZE], b7 509 FMADD (aa2, bb8, cc16, cc16) 510 LDF [BO + 31 * SIZE], b8 511 512 FMADD (aa3, bb9, cc01, cc01) 513 FMADD (aa4, bb9, cc02, cc02) 514 FMADD (aa3, bb2, cc03, cc03) 515 FMADD (aa4, bb2, cc04, cc04) 516 517 FMADD (aa3, bb3, cc05, cc05) 518 LDF [BO + 40 * SIZE], b9 519 FMADD (aa4, bb3, cc06, cc06) 520 LDF [BO + 33 * SIZE], b2 521 522 FMADD (aa3, bb4, cc07, cc07) 523 LDF [BO + 34 * SIZE], b3 524 FMADD (aa4, bb4, cc08, cc08) 525 LDF [BO + 35 * SIZE], b4 526 527 FMADD (aa3, bb5, cc09, cc09) 528 LDF [AO + 16 * SIZE], a1 /****/ 529 FMADD (aa4, bb5, cc10, cc10) 530 LDF [AO + 9 * SIZE], a2 531 532 FMADD (aa3, bb6, cc11, cc11) 533 nop 534 FMADD (aa4, bb6, cc12, cc12) 535 nop 536 537 FMADD (aa3, bb7, cc13, cc13) 538 LDF [BO + 36 * SIZE], b5 539 FMADD (aa4, bb7, cc14, cc14) 540 LDF [BO + 37 * SIZE], b6 541 542 FMADD (aa3, bb8, cc15, cc15) 543 LDF [BO + 38 * SIZE], b7 544 
FMADD (aa4, bb8, cc16, cc16) 545 LDF [BO + 39 * SIZE], b8 546 547 FMADD (aa5, bb1, cc01, cc01) 548 FMADD (aa2, bb1, cc02, cc02) 549 FMADD (aa5, bb2, cc03, cc03) 550 FMADD (aa2, bb2, cc04, cc04) 551 552 FMADD (aa5, bb3, cc05, cc05) 553 LDF [BO + 48 * SIZE], b1 554 FMADD (aa2, bb3, cc06, cc06) 555 LDF [BO + 41 * SIZE], b2 556 557 FMADD (aa5, bb4, cc07, cc07) 558 LDF [BO + 42 * SIZE], b3 559 FMADD (aa2, bb4, cc08, cc08) 560 LDF [BO + 43 * SIZE], b4 561 562 FMADD (aa5, bb5, cc09, cc09) 563 LDF [AO + 10 * SIZE], a3 564 FMADD (aa2, bb5, cc10, cc10) 565 LDF [AO + 11 * SIZE], a4 566 567 FMADD (aa5, bb6, cc11, cc11) 568 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 569 FMADD (aa2, bb6, cc12, cc12) 570 nop 571 572 FMADD (aa5, bb7, cc13, cc13) 573 LDF [BO + 44 * SIZE], b5 574 FMADD (aa2, bb7, cc14, cc14) 575 LDF [BO + 45 * SIZE], b6 576 577 FMADD (aa5, bb8, cc15, cc15) 578 LDF [BO + 46 * SIZE], b7 579 FMADD (aa2, bb8, cc16, cc16) 580 LDF [BO + 47 * SIZE], b8 581 582 FMADD (aa3, bb9, cc01, cc01) 583 FMADD (aa4, bb9, cc02, cc02) 584 FMADD (aa3, bb2, cc03, cc03) 585 FMADD (aa4, bb2, cc04, cc04) 586 587 FMADD (aa3, bb3, cc05, cc05) 588 LDF [BO + 56 * SIZE], b9 589 FMADD (aa4, bb3, cc06, cc06) 590 LDF [BO + 49 * SIZE], b2 591 592 FMADD (aa3, bb4, cc07, cc07) 593 LDF [BO + 50 * SIZE], b3 594 FMADD (aa4, bb4, cc08, cc08) 595 LDF [BO + 51 * SIZE], b4 596 597 FMADD (aa3, bb5, cc09, cc09) 598 LDF [AO + 12 * SIZE], a5 599 FMADD (aa4, bb5, cc10, cc10) 600 LDF [AO + 13 * SIZE], a2 601 602 FMADD (aa3, bb6, cc11, cc11) 603 cmp L, 0 604 FMADD (aa4, bb6, cc12, cc12) 605 nop 606 607 FMADD (aa3, bb7, cc13, cc13) 608 LDF [BO + 52 * SIZE], b5 609 FMADD (aa4, bb7, cc14, cc14) 610 LDF [BO + 53 * SIZE], b6 611 612 FMADD (aa3, bb8, cc15, cc15) 613 LDF [BO + 54 * SIZE], b7 614 FMADD (aa4, bb8, cc16, cc16) 615 LDF [BO + 55 * SIZE], b8 616 617 FMADD (aa5, bb1, cc01, cc01) 618 FMADD (aa2, bb1, cc02, cc02) 619 FMADD (aa5, bb2, cc03, cc03) 620 FMADD (aa2, bb2, cc04, cc04) 621 622 FMADD 
(aa5, bb3, cc05, cc05) 623 LDF [BO + 64 * SIZE], b1 624 FMADD (aa2, bb3, cc06, cc06) 625 LDF [BO + 57 * SIZE], b2 626 627 FMADD (aa5, bb4, cc07, cc07) 628 LDF [BO + 58 * SIZE], b3 629 FMADD (aa2, bb4, cc08, cc08) 630 LDF [BO + 59 * SIZE], b4 631 632 FMADD (aa5, bb5, cc09, cc09) 633 LDF [AO + 14 * SIZE], a3 634 FMADD (aa2, bb5, cc10, cc10) 635 LDF [AO + 15 * SIZE], a4 636 637 FMADD (aa5, bb6, cc11, cc11) 638 add BO, 64 * SIZE, BO 639 FMADD (aa2, bb6, cc12, cc12) 640 add AO, 16 * SIZE, AO 641 642 FMADD (aa5, bb7, cc13, cc13) 643 LDF [BO - 4 * SIZE], b5 644 FMADD (aa2, bb7, cc14, cc14) 645 LDF [BO - 3 * SIZE], b6 646 647 FMADD (aa5, bb8, cc15, cc15) 648 LDF [BO - 2 * SIZE], b7 649 FMADD (aa2, bb8, cc16, cc16) 650 LDF [BO - 1 * SIZE], b8 651 652 FMADD (aa3, bb9, cc01, cc01) 653 FMADD (aa4, bb9, cc02, cc02) 654 FMADD (aa3, bb2, cc03, cc03) 655 FMADD (aa4, bb2, cc04, cc04) 656 657 FMADD (aa3, bb3, cc05, cc05) 658 LDF [BO + 8 * SIZE], b9 659 FMADD (aa4, bb3, cc06, cc06) 660 LDF [BO + 1 * SIZE], b2 661 662 FMADD (aa3, bb4, cc07, cc07) 663 LDF [BO + 2 * SIZE], b3 664 FMADD (aa4, bb4, cc08, cc08) 665 LDF [BO + 3 * SIZE], b4 666 667 FMADD (aa3, bb5, cc09, cc09) 668 LDF [AO + 8 * SIZE], a5 /****/ 669 FMADD (aa4, bb5, cc10, cc10) 670 LDF [AO + 1 * SIZE], a2 671 672 FMADD (aa3, bb6, cc11, cc11) 673 FMADD (aa4, bb6, cc12, cc12) 674 675 FMADD (aa3, bb7, cc13, cc13) 676 LDF [BO + 4 * SIZE], b5 677 FMADD (aa4, bb7, cc14, cc14) 678 LDF [BO + 5 * SIZE], b6 679 680 FMADD (aa3, bb8, cc15, cc15) 681 LDF [BO + 6 * SIZE], b7 682 FMADD (aa4, bb8, cc16, cc16) 683 ble,pn %icc, .LL15 684 LDF [BO + 7 * SIZE], b8 685 686 FMADD (aa1, bb1, cc01, cc01) 687 FMADD (aa2, bb1, cc02, cc02) 688 FMADD (aa1, bb2, cc03, cc03) 689 FMADD (aa2, bb2, cc04, cc04) 690 691 FMADD (aa1, bb3, cc05, cc05) 692 LDF [BO + 16 * SIZE], b1 693 FMADD (aa2, bb3, cc06, cc06) 694 LDF [BO + 9 * SIZE], b2 695 696 FMADD (aa1, bb4, cc07, cc07) 697 LDF [BO + 10 * SIZE], b3 698 FMADD (aa2, bb4, cc08, cc08) 699 LDF [BO + 11 * SIZE], 
b4 700 701 FMADD (aa1, bb5, cc09, cc09) 702 LDF [AO + 2 * SIZE], a3 703 FMADD (aa2, bb5, cc10, cc10) 704 LDF [AO + 3 * SIZE], a4 705 706 FMADD (aa1, bb6, cc11, cc11) 707 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 708 FMADD (aa2, bb6, cc12, cc12) 709 nop 710 711 FMADD (aa1, bb7, cc13, cc13) 712 LDF [BO + 12 * SIZE], b5 713 FMADD (aa2, bb7, cc14, cc14) 714 LDF [BO + 13 * SIZE], b6 715 716 FMADD (aa1, bb8, cc15, cc15) 717 LDF [BO + 14 * SIZE], b7 718 FMADD (aa2, bb8, cc16, cc16) 719 LDF [BO + 15 * SIZE], b8 720 721 FMADD (aa3, bb9, cc01, cc01) 722 FMADD (aa4, bb9, cc02, cc02) 723 FMADD (aa3, bb2, cc03, cc03) 724 FMADD (aa4, bb2, cc04, cc04) 725 726 FMADD (aa3, bb3, cc05, cc05) 727 LDF [BO + 24 * SIZE], b9 728 FMADD (aa4, bb3, cc06, cc06) 729 LDF [BO + 17 * SIZE], b2 730 731 FMADD (aa3, bb4, cc07, cc07) 732 LDF [BO + 18 * SIZE], b3 733 FMADD (aa4, bb4, cc08, cc08) 734 LDF [BO + 19 * SIZE], b4 735 736 FMADD (aa3, bb5, cc09, cc09) 737 LDF [AO + 4 * SIZE], a1 738 FMADD (aa4, bb5, cc10, cc10) 739 LDF [AO + 5 * SIZE], a2 740 741 FMADD (aa3, bb6, cc11, cc11) 742 add L, -1, L 743 FMADD (aa4, bb6, cc12, cc12) 744 nop 745 746 FMADD (aa3, bb7, cc13, cc13) 747 LDF [BO + 20 * SIZE], b5 748 FMADD (aa4, bb7, cc14, cc14) 749 LDF [BO + 21 * SIZE], b6 750 751 FMADD (aa3, bb8, cc15, cc15) 752 LDF [BO + 22 * SIZE], b7 753 FMADD (aa4, bb8, cc16, cc16) 754 LDF [BO + 23 * SIZE], b8 755 756 FMADD (aa1, bb1, cc01, cc01) 757 FMADD (aa2, bb1, cc02, cc02) 758 FMADD (aa1, bb2, cc03, cc03) 759 FMADD (aa2, bb2, cc04, cc04) 760 761 FMADD (aa1, bb3, cc05, cc05) 762 LDF [BO + 32 * SIZE], b1 763 FMADD (aa2, bb3, cc06, cc06) 764 LDF [BO + 25 * SIZE], b2 765 766 FMADD (aa1, bb4, cc07, cc07) 767 LDF [BO + 26 * SIZE], b3 768 FMADD (aa2, bb4, cc08, cc08) 769 LDF [BO + 27 * SIZE], b4 770 771 FMADD (aa1, bb5, cc09, cc09) 772 LDF [AO + 6 * SIZE], a3 773 FMADD (aa2, bb5, cc10, cc10) 774 LDF [AO + 7 * SIZE], a4 775 776 FMADD (aa1, bb6, cc11, cc11) 777 nop 778 FMADD (aa2, bb6, cc12, cc12) 779 
nop 780 781 FMADD (aa1, bb7, cc13, cc13) 782 LDF [BO + 28 * SIZE], b5 783 FMADD (aa2, bb7, cc14, cc14) 784 LDF [BO + 29 * SIZE], b6 785 786 FMADD (aa1, bb8, cc15, cc15) 787 LDF [BO + 30 * SIZE], b7 788 FMADD (aa2, bb8, cc16, cc16) 789 LDF [BO + 31 * SIZE], b8 790 791 FMADD (aa3, bb9, cc01, cc01) 792 FMADD (aa4, bb9, cc02, cc02) 793 FMADD (aa3, bb2, cc03, cc03) 794 FMADD (aa4, bb2, cc04, cc04) 795 796 FMADD (aa3, bb3, cc05, cc05) 797 LDF [BO + 40 * SIZE], b9 798 FMADD (aa4, bb3, cc06, cc06) 799 LDF [BO + 33 * SIZE], b2 800 801 FMADD (aa3, bb4, cc07, cc07) 802 LDF [BO + 34 * SIZE], b3 803 FMADD (aa4, bb4, cc08, cc08) 804 LDF [BO + 35 * SIZE], b4 805 806 FMADD (aa3, bb5, cc09, cc09) 807 LDF [AO + 16 * SIZE], a1 /****/ 808 FMADD (aa4, bb5, cc10, cc10) 809 LDF [AO + 9 * SIZE], a2 810 811 FMADD (aa3, bb6, cc11, cc11) 812 nop 813 FMADD (aa4, bb6, cc12, cc12) 814 nop 815 816 FMADD (aa3, bb7, cc13, cc13) 817 LDF [BO + 36 * SIZE], b5 818 FMADD (aa4, bb7, cc14, cc14) 819 LDF [BO + 37 * SIZE], b6 820 821 FMADD (aa3, bb8, cc15, cc15) 822 LDF [BO + 38 * SIZE], b7 823 FMADD (aa4, bb8, cc16, cc16) 824 LDF [BO + 39 * SIZE], b8 825 826 FMADD (aa5, bb1, cc01, cc01) 827 FMADD (aa2, bb1, cc02, cc02) 828 FMADD (aa5, bb2, cc03, cc03) 829 FMADD (aa2, bb2, cc04, cc04) 830 831 FMADD (aa5, bb3, cc05, cc05) 832 LDF [BO + 48 * SIZE], b1 833 FMADD (aa2, bb3, cc06, cc06) 834 LDF [BO + 41 * SIZE], b2 835 836 FMADD (aa5, bb4, cc07, cc07) 837 LDF [BO + 42 * SIZE], b3 838 FMADD (aa2, bb4, cc08, cc08) 839 LDF [BO + 43 * SIZE], b4 840 841 FMADD (aa5, bb5, cc09, cc09) 842 LDF [AO + 10 * SIZE], a3 843 FMADD (aa2, bb5, cc10, cc10) 844 LDF [AO + 11 * SIZE], a4 845 846 FMADD (aa5, bb6, cc11, cc11) 847 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 848 FMADD (aa2, bb6, cc12, cc12) 849 nop 850 851 FMADD (aa5, bb7, cc13, cc13) 852 LDF [BO + 44 * SIZE], b5 853 FMADD (aa2, bb7, cc14, cc14) 854 LDF [BO + 45 * SIZE], b6 855 856 FMADD (aa5, bb8, cc15, cc15) 857 LDF [BO + 46 * SIZE], b7 858 FMADD 
(aa2, bb8, cc16, cc16) 859 LDF [BO + 47 * SIZE], b8 860 861 FMADD (aa3, bb9, cc01, cc01) 862 FMADD (aa4, bb9, cc02, cc02) 863 FMADD (aa3, bb2, cc03, cc03) 864 FMADD (aa4, bb2, cc04, cc04) 865 866 FMADD (aa3, bb3, cc05, cc05) 867 LDF [BO + 56 * SIZE], b9 868 FMADD (aa4, bb3, cc06, cc06) 869 LDF [BO + 49 * SIZE], b2 870 871 FMADD (aa3, bb4, cc07, cc07) 872 LDF [BO + 50 * SIZE], b3 873 FMADD (aa4, bb4, cc08, cc08) 874 LDF [BO + 51 * SIZE], b4 875 876 FMADD (aa3, bb5, cc09, cc09) 877 LDF [AO + 12 * SIZE], a5 878 FMADD (aa4, bb5, cc10, cc10) 879 LDF [AO + 13 * SIZE], a2 880 881 FMADD (aa3, bb6, cc11, cc11) 882 cmp L, 0 883 FMADD (aa4, bb6, cc12, cc12) 884 nop 885 886 FMADD (aa3, bb7, cc13, cc13) 887 LDF [BO + 52 * SIZE], b5 888 FMADD (aa4, bb7, cc14, cc14) 889 LDF [BO + 53 * SIZE], b6 890 891 FMADD (aa3, bb8, cc15, cc15) 892 LDF [BO + 54 * SIZE], b7 893 FMADD (aa4, bb8, cc16, cc16) 894 LDF [BO + 55 * SIZE], b8 895 896 FMADD (aa5, bb1, cc01, cc01) 897 FMADD (aa2, bb1, cc02, cc02) 898 FMADD (aa5, bb2, cc03, cc03) 899 FMADD (aa2, bb2, cc04, cc04) 900 901 FMADD (aa5, bb3, cc05, cc05) 902 LDF [BO + 64 * SIZE], b1 903 FMADD (aa2, bb3, cc06, cc06) 904 LDF [BO + 57 * SIZE], b2 905 906 FMADD (aa5, bb4, cc07, cc07) 907 LDF [BO + 58 * SIZE], b3 908 FMADD (aa2, bb4, cc08, cc08) 909 LDF [BO + 59 * SIZE], b4 910 911 FMADD (aa5, bb5, cc09, cc09) 912 LDF [AO + 14 * SIZE], a3 913 FMADD (aa2, bb5, cc10, cc10) 914 LDF [AO + 15 * SIZE], a4 915 916 FMADD (aa5, bb6, cc11, cc11) 917 add BO, 64 * SIZE, BO 918 FMADD (aa2, bb6, cc12, cc12) 919 add AO, 16 * SIZE, AO 920 921 FMADD (aa5, bb7, cc13, cc13) 922 LDF [BO - 4 * SIZE], b5 923 FMADD (aa2, bb7, cc14, cc14) 924 LDF [BO - 3 * SIZE], b6 925 926 FMADD (aa5, bb8, cc15, cc15) 927 LDF [BO - 2 * SIZE], b7 928 FMADD (aa2, bb8, cc16, cc16) 929 LDF [BO - 1 * SIZE], b8 930 931 FMADD (aa3, bb9, cc01, cc01) 932 FMADD (aa4, bb9, cc02, cc02) 933 FMADD (aa3, bb2, cc03, cc03) 934 FMADD (aa4, bb2, cc04, cc04) 935 936 FMADD (aa3, bb3, cc05, cc05) 937 LDF [BO + 
8 * SIZE], b9 938 FMADD (aa4, bb3, cc06, cc06) 939 LDF [BO + 1 * SIZE], b2 940 941 FMADD (aa3, bb4, cc07, cc07) 942 LDF [BO + 2 * SIZE], b3 943 FMADD (aa4, bb4, cc08, cc08) 944 LDF [BO + 3 * SIZE], b4 945 946 FMADD (aa3, bb5, cc09, cc09) 947 LDF [AO + 8 * SIZE], a5 /****/ 948 FMADD (aa4, bb5, cc10, cc10) 949 LDF [AO + 1 * SIZE], a2 950 951 FMADD (aa3, bb6, cc11, cc11) 952 FMADD (aa4, bb6, cc12, cc12) 953 954 FMADD (aa3, bb7, cc13, cc13) 955 LDF [BO + 4 * SIZE], b5 956 FMADD (aa4, bb7, cc14, cc14) 957 LDF [BO + 5 * SIZE], b6 958 959 FMADD (aa3, bb8, cc15, cc15) 960 LDF [BO + 6 * SIZE], b7 961 FMADD (aa4, bb8, cc16, cc16) 962 bg,pt %icc, .LL13 963 LDF [BO + 7 * SIZE], b8 964 .align 4 965 966.LL15: 967#if defined(LT) || defined(RN) 968 and KK, 7, L 969#else 970 sub K, KK, L 971 and L, 7, L 972#endif 973 cmp L, 0 974 ble,a,pn %icc, .LL18 975 nop 976 .align 4 977 978.LL17: 979 FMADD (aa1, bb1, cc01, cc01) 980 add L, -1, L 981 FMADD (aa2, bb1, cc02, cc02) 982 nop 983 984 FMADD (aa1, bb2, cc03, cc03) 985 LDF [BO + 8 * SIZE], b1 986 FMADD (aa2, bb2, cc04, cc04) 987 LDF [BO + 9 * SIZE], b2 988 989 FMADD (aa1, bb3, cc05, cc05) 990 cmp L, 0 991 FMADD (aa2, bb3, cc06, cc06) 992 nop 993 994 FMADD (aa1, bb4, cc07, cc07) 995 LDF [BO + 10 * SIZE], b3 996 FMADD (aa2, bb4, cc08, cc08) 997 LDF [BO + 11 * SIZE], b4 998 999 FMADD (aa1, bb5, cc09, cc09) 1000 nop 1001 FMADD (aa2, bb5, cc10, cc10) 1002 nop 1003 1004 FMADD (aa1, bb6, cc11, cc11) 1005 LDF [BO + 12 * SIZE], b5 1006 FMADD (aa2, bb6, cc12, cc12) 1007 LDF [BO + 13 * SIZE], b6 1008 1009 FMADD (aa1, bb7, cc13, cc13) 1010 add AO, 2 * SIZE, AO 1011 FMADD (aa2, bb7, cc14, cc14) 1012 add BO, 8 * SIZE, BO 1013 1014 FMADD (aa1, bb8, cc15, cc15) 1015 LDF [AO + 0 * SIZE], a1 1016 FMADD (aa2, bb8, cc16, cc16) 1017 LDF [AO + 1 * SIZE], a2 1018 1019 LDF [BO + 6 * SIZE], b7 1020 bg,pt %icc, .LL17 1021 LDF [BO + 7 * SIZE], b8 1022 nop 1023 .align 4 1024 1025.LL18: 1026#if defined(LN) || defined(RT) 1027#ifdef LN 1028 sub KK, 2, TEMP1 
1029#else 1030 sub KK, 8, TEMP1 1031#endif 1032 sll TEMP1, BASE_SHIFT + 1, TEMP2 1033 sll TEMP1, BASE_SHIFT + 3, TEMP1 1034 1035 add AORIG, TEMP2, AO 1036 add B, TEMP1, BO 1037#endif 1038 1039#if defined(LN) || defined(LT) 1040 LDF [BO + 0 * SIZE], a1 1041 LDF [BO + 1 * SIZE], a2 1042 LDF [BO + 2 * SIZE], a3 1043 LDF [BO + 3 * SIZE], a4 1044 1045 LDF [BO + 4 * SIZE], b1 1046 LDF [BO + 5 * SIZE], b2 1047 LDF [BO + 6 * SIZE], b3 1048 LDF [BO + 7 * SIZE], b4 1049 1050 FSUB a1, c01, c01 1051 FSUB a2, c03, c03 1052 FSUB a3, c05, c05 1053 FSUB a4, c07, c07 1054 1055 FSUB b1, c09, c09 1056 FSUB b2, c11, c11 1057 FSUB b3, c13, c13 1058 FSUB b4, c15, c15 1059 1060 LDF [BO + 8 * SIZE], a1 1061 LDF [BO + 9 * SIZE], a2 1062 LDF [BO + 10 * SIZE], a3 1063 LDF [BO + 11 * SIZE], a4 1064 1065 LDF [BO + 12 * SIZE], b1 1066 LDF [BO + 13 * SIZE], b2 1067 LDF [BO + 14 * SIZE], b3 1068 LDF [BO + 15 * SIZE], b4 1069 1070 FSUB a1, c02, c02 1071 FSUB a2, c04, c04 1072 FSUB a3, c06, c06 1073 FSUB a4, c08, c08 1074 1075 FSUB b1, c10, c10 1076 FSUB b2, c12, c12 1077 FSUB b3, c14, c14 1078 FSUB b4, c16, c16 1079#else 1080 LDF [AO + 0 * SIZE], a1 1081 LDF [AO + 1 * SIZE], a2 1082 LDF [AO + 2 * SIZE], a3 1083 LDF [AO + 3 * SIZE], a4 1084 1085 LDF [AO + 4 * SIZE], b1 1086 LDF [AO + 5 * SIZE], b2 1087 LDF [AO + 6 * SIZE], b3 1088 LDF [AO + 7 * SIZE], b4 1089 1090 FSUB a1, c01, c01 1091 FSUB a2, c02, c02 1092 FSUB a3, c03, c03 1093 FSUB a4, c04, c04 1094 1095 FSUB b1, c05, c05 1096 FSUB b2, c06, c06 1097 FSUB b3, c07, c07 1098 FSUB b4, c08, c08 1099 1100 LDF [AO + 8 * SIZE], a1 1101 LDF [AO + 9 * SIZE], a2 1102 LDF [AO + 10 * SIZE], a3 1103 LDF [AO + 11 * SIZE], a4 1104 1105 LDF [AO + 12 * SIZE], b1 1106 LDF [AO + 13 * SIZE], b2 1107 LDF [AO + 14 * SIZE], b3 1108 LDF [AO + 15 * SIZE], b4 1109 1110 FSUB a1, c09, c09 1111 FSUB a2, c10, c10 1112 FSUB a3, c11, c11 1113 FSUB a4, c12, c12 1114 1115 FSUB b1, c13, c13 1116 FSUB b2, c14, c14 1117 FSUB b3, c15, c15 1118 FSUB b4, c16, c16 1119#endif 1120 
1121#ifdef LN 1122 LDF [AO + 3 * SIZE], a1 1123 LDF [AO + 2 * SIZE], a2 1124 LDF [AO + 0 * SIZE], a3 1125 1126 FMUL a1, c02, c02 1127 FMUL a1, c04, c04 1128 FMUL a1, c06, c06 1129 FMUL a1, c08, c08 1130 FMUL a1, c10, c10 1131 FMUL a1, c12, c12 1132 FMUL a1, c14, c14 1133 FMUL a1, c16, c16 1134 1135 FNMSUB (aa2, cc02, cc01, cc01) 1136 FNMSUB (aa2, cc04, cc03, cc03) 1137 FNMSUB (aa2, cc06, cc05, cc05) 1138 FNMSUB (aa2, cc08, cc07, cc07) 1139 FNMSUB (aa2, cc10, cc09, cc09) 1140 FNMSUB (aa2, cc12, cc11, cc11) 1141 FNMSUB (aa2, cc14, cc13, cc13) 1142 FNMSUB (aa2, cc16, cc15, cc15) 1143 1144 FMUL a3, c01, c01 1145 FMUL a3, c03, c03 1146 FMUL a3, c05, c05 1147 FMUL a3, c07, c07 1148 FMUL a3, c09, c09 1149 FMUL a3, c11, c11 1150 FMUL a3, c13, c13 1151 FMUL a3, c15, c15 1152#endif 1153 1154#ifdef LT 1155 LDF [AO + 0 * SIZE], a1 1156 LDF [AO + 1 * SIZE], a2 1157 LDF [AO + 3 * SIZE], a3 1158 1159 FMUL a1, c01, c01 1160 FMUL a1, c03, c03 1161 FMUL a1, c05, c05 1162 FMUL a1, c07, c07 1163 FMUL a1, c09, c09 1164 FMUL a1, c11, c11 1165 FMUL a1, c13, c13 1166 FMUL a1, c15, c15 1167 1168 FNMSUB (aa2, cc01, cc02, cc02) 1169 FNMSUB (aa2, cc03, cc04, cc04) 1170 FNMSUB (aa2, cc05, cc06, cc06) 1171 FNMSUB (aa2, cc07, cc08, cc08) 1172 FNMSUB (aa2, cc09, cc10, cc10) 1173 FNMSUB (aa2, cc11, cc12, cc12) 1174 FNMSUB (aa2, cc13, cc14, cc14) 1175 FNMSUB (aa2, cc15, cc16, cc16) 1176 1177 FMUL a3, c02, c02 1178 FMUL a3, c04, c04 1179 FMUL a3, c06, c06 1180 FMUL a3, c08, c08 1181 FMUL a3, c10, c10 1182 FMUL a3, c12, c12 1183 FMUL a3, c14, c14 1184 FMUL a3, c16, c16 1185#endif 1186 1187#ifdef RN 1188 LDF [BO + 0 * SIZE], a1 1189 LDF [BO + 1 * SIZE], a2 1190 LDF [BO + 2 * SIZE], a3 1191 LDF [BO + 3 * SIZE], a4 1192 LDF [BO + 4 * SIZE], b1 1193 LDF [BO + 5 * SIZE], b2 1194 LDF [BO + 6 * SIZE], b3 1195 LDF [BO + 7 * SIZE], b4 1196 1197 FMUL a1, c01, c01 1198 FMUL a1, c02, c02 1199 1200 FNMSUB (aa2, cc01, cc03, cc03) 1201 FNMSUB (aa2, cc02, cc04, cc04) 1202 FNMSUB (aa3, cc01, cc05, cc05) 1203 FNMSUB 
(aa3, cc02, cc06, cc06) 1204 FNMSUB (aa4, cc01, cc07, cc07) 1205 FNMSUB (aa4, cc02, cc08, cc08) 1206 FNMSUB (bb1, cc01, cc09, cc09) 1207 FNMSUB (bb1, cc02, cc10, cc10) 1208 FNMSUB (bb2, cc01, cc11, cc11) 1209 FNMSUB (bb2, cc02, cc12, cc12) 1210 FNMSUB (bb3, cc01, cc13, cc13) 1211 FNMSUB (bb3, cc02, cc14, cc14) 1212 FNMSUB (bb4, cc01, cc15, cc15) 1213 FNMSUB (bb4, cc02, cc16, cc16) 1214 1215 LDF [BO + 9 * SIZE], a1 1216 LDF [BO + 10 * SIZE], a2 1217 LDF [BO + 11 * SIZE], a3 1218 LDF [BO + 12 * SIZE], a4 1219 LDF [BO + 13 * SIZE], b1 1220 LDF [BO + 14 * SIZE], b2 1221 LDF [BO + 15 * SIZE], b3 1222 1223 FMUL a1, c03, c03 1224 FMUL a1, c04, c04 1225 1226 FNMSUB (aa2, cc03, cc05, cc05) 1227 FNMSUB (aa2, cc04, cc06, cc06) 1228 FNMSUB (aa3, cc03, cc07, cc07) 1229 FNMSUB (aa3, cc04, cc08, cc08) 1230 FNMSUB (aa4, cc03, cc09, cc09) 1231 FNMSUB (aa4, cc04, cc10, cc10) 1232 FNMSUB (bb1, cc03, cc11, cc11) 1233 FNMSUB (bb1, cc04, cc12, cc12) 1234 FNMSUB (bb2, cc03, cc13, cc13) 1235 FNMSUB (bb2, cc04, cc14, cc14) 1236 FNMSUB (bb3, cc03, cc15, cc15) 1237 FNMSUB (bb3, cc04, cc16, cc16) 1238 1239 LDF [BO + 18 * SIZE], a1 1240 LDF [BO + 19 * SIZE], a2 1241 LDF [BO + 20 * SIZE], a3 1242 LDF [BO + 21 * SIZE], a4 1243 LDF [BO + 22 * SIZE], b1 1244 LDF [BO + 23 * SIZE], b2 1245 1246 FMUL a1, c05, c05 1247 FMUL a1, c06, c06 1248 1249 FNMSUB (aa2, cc05, cc07, cc07) 1250 FNMSUB (aa2, cc06, cc08, cc08) 1251 FNMSUB (aa3, cc05, cc09, cc09) 1252 FNMSUB (aa3, cc06, cc10, cc10) 1253 FNMSUB (aa4, cc05, cc11, cc11) 1254 FNMSUB (aa4, cc06, cc12, cc12) 1255 FNMSUB (bb1, cc05, cc13, cc13) 1256 FNMSUB (bb1, cc06, cc14, cc14) 1257 FNMSUB (bb2, cc05, cc15, cc15) 1258 FNMSUB (bb2, cc06, cc16, cc16) 1259 1260 LDF [BO + 27 * SIZE], a1 1261 LDF [BO + 28 * SIZE], a2 1262 LDF [BO + 29 * SIZE], a3 1263 LDF [BO + 30 * SIZE], a4 1264 LDF [BO + 31 * SIZE], b1 1265 1266 FMUL a1, c07, c07 1267 FMUL a1, c08, c08 1268 1269 FNMSUB (aa2, cc07, cc09, cc09) 1270 FNMSUB (aa2, cc08, cc10, cc10) 1271 FNMSUB (aa3, cc07, cc11, 
cc11) 1272 FNMSUB (aa3, cc08, cc12, cc12) 1273 FNMSUB (aa4, cc07, cc13, cc13) 1274 FNMSUB (aa4, cc08, cc14, cc14) 1275 FNMSUB (bb1, cc07, cc15, cc15) 1276 FNMSUB (bb1, cc08, cc16, cc16) 1277 1278 LDF [BO + 36 * SIZE], a1 1279 LDF [BO + 37 * SIZE], a2 1280 LDF [BO + 38 * SIZE], a3 1281 LDF [BO + 39 * SIZE], a4 1282 1283 FMUL a1, c09, c09 1284 FMUL a1, c10, c10 1285 1286 FNMSUB (aa2, cc09, cc11, cc11) 1287 FNMSUB (aa2, cc10, cc12, cc12) 1288 FNMSUB (aa3, cc09, cc13, cc13) 1289 FNMSUB (aa3, cc10, cc14, cc14) 1290 FNMSUB (aa4, cc09, cc15, cc15) 1291 FNMSUB (aa4, cc10, cc16, cc16) 1292 1293 LDF [BO + 45 * SIZE], a1 1294 LDF [BO + 46 * SIZE], a2 1295 LDF [BO + 47 * SIZE], a3 1296 1297 FMUL a1, c11, c11 1298 FMUL a1, c12, c12 1299 1300 FNMSUB (aa2, cc11, cc13, cc13) 1301 FNMSUB (aa2, cc12, cc14, cc14) 1302 FNMSUB (aa3, cc11, cc15, cc15) 1303 FNMSUB (aa3, cc12, cc16, cc16) 1304 1305 LDF [BO + 54 * SIZE], a1 1306 LDF [BO + 55 * SIZE], a2 1307 1308 FMUL a1, c13, c13 1309 FMUL a1, c14, c14 1310 1311 FNMSUB (aa2, cc13, cc15, cc15) 1312 FNMSUB (aa2, cc14, cc16, cc16) 1313 1314 LDF [BO + 63 * SIZE], a1 1315 1316 FMUL a1, c15, c15 1317 FMUL a1, c16, c16 1318#endif 1319 1320#ifdef RT 1321 LDF [BO + 63 * SIZE], a1 1322 LDF [BO + 62 * SIZE], a2 1323 LDF [BO + 61 * SIZE], a3 1324 LDF [BO + 60 * SIZE], a4 1325 LDF [BO + 59 * SIZE], b1 1326 LDF [BO + 58 * SIZE], b2 1327 LDF [BO + 57 * SIZE], b3 1328 LDF [BO + 56 * SIZE], b4 1329 1330 FMUL a1, c16, c16 1331 FMUL a1, c15, c15 1332 1333 FNMSUB (aa2, cc16, cc14, cc14) 1334 FNMSUB (aa2, cc15, cc13, cc13) 1335 FNMSUB (aa3, cc16, cc12, cc12) 1336 FNMSUB (aa3, cc15, cc11, cc11) 1337 FNMSUB (aa4, cc16, cc10, cc10) 1338 FNMSUB (aa4, cc15, cc09, cc09) 1339 FNMSUB (bb1, cc16, cc08, cc08) 1340 FNMSUB (bb1, cc15, cc07, cc07) 1341 FNMSUB (bb2, cc16, cc06, cc06) 1342 FNMSUB (bb2, cc15, cc05, cc05) 1343 FNMSUB (bb3, cc16, cc04, cc04) 1344 FNMSUB (bb3, cc15, cc03, cc03) 1345 FNMSUB (bb4, cc16, cc02, cc02) 1346 FNMSUB (bb4, cc15, cc01, cc01) 1347 1348 
LDF [BO + 54 * SIZE], a1 1349 LDF [BO + 53 * SIZE], a2 1350 LDF [BO + 52 * SIZE], a3 1351 LDF [BO + 51 * SIZE], a4 1352 LDF [BO + 50 * SIZE], b1 1353 LDF [BO + 49 * SIZE], b2 1354 LDF [BO + 48 * SIZE], b3 1355 1356 FMUL a1, c14, c14 1357 FMUL a1, c13, c13 1358 1359 FNMSUB (aa2, cc14, cc12, cc12) 1360 FNMSUB (aa2, cc13, cc11, cc11) 1361 FNMSUB (aa3, cc14, cc10, cc10) 1362 FNMSUB (aa3, cc13, cc09, cc09) 1363 FNMSUB (aa4, cc14, cc08, cc08) 1364 FNMSUB (aa4, cc13, cc07, cc07) 1365 FNMSUB (bb1, cc14, cc06, cc06) 1366 FNMSUB (bb1, cc13, cc05, cc05) 1367 FNMSUB (bb2, cc14, cc04, cc04) 1368 FNMSUB (bb2, cc13, cc03, cc03) 1369 FNMSUB (bb3, cc14, cc02, cc02) 1370 FNMSUB (bb3, cc13, cc01, cc01) 1371 1372 LDF [BO + 45 * SIZE], a1 1373 LDF [BO + 44 * SIZE], a2 1374 LDF [BO + 43 * SIZE], a3 1375 LDF [BO + 42 * SIZE], a4 1376 LDF [BO + 41 * SIZE], b1 1377 LDF [BO + 40 * SIZE], b2 1378 1379 FMUL a1, c12, c12 1380 FMUL a1, c11, c11 1381 1382 FNMSUB (aa2, cc12, cc10, cc10) 1383 FNMSUB (aa2, cc11, cc09, cc09) 1384 FNMSUB (aa3, cc12, cc08, cc08) 1385 FNMSUB (aa3, cc11, cc07, cc07) 1386 FNMSUB (aa4, cc12, cc06, cc06) 1387 FNMSUB (aa4, cc11, cc05, cc05) 1388 FNMSUB (bb1, cc12, cc04, cc04) 1389 FNMSUB (bb1, cc11, cc03, cc03) 1390 FNMSUB (bb2, cc12, cc02, cc02) 1391 FNMSUB (bb2, cc11, cc01, cc01) 1392 1393 LDF [BO + 36 * SIZE], a1 1394 LDF [BO + 35 * SIZE], a2 1395 LDF [BO + 34 * SIZE], a3 1396 LDF [BO + 33 * SIZE], a4 1397 LDF [BO + 32 * SIZE], b1 1398 1399 FMUL a1, c10, c10 1400 FMUL a1, c09, c09 1401 1402 FNMSUB (aa2, cc10, cc08, cc08) 1403 FNMSUB (aa2, cc09, cc07, cc07) 1404 FNMSUB (aa3, cc10, cc06, cc06) 1405 FNMSUB (aa3, cc09, cc05, cc05) 1406 FNMSUB (aa4, cc10, cc04, cc04) 1407 FNMSUB (aa4, cc09, cc03, cc03) 1408 FNMSUB (bb1, cc10, cc02, cc02) 1409 FNMSUB (bb1, cc09, cc01, cc01) 1410 1411 LDF [BO + 27 * SIZE], a1 1412 LDF [BO + 26 * SIZE], a2 1413 LDF [BO + 25 * SIZE], a3 1414 LDF [BO + 24 * SIZE], a4 1415 1416 FMUL a1, c08, c08 1417 FMUL a1, c07, c07 1418 1419 FNMSUB (aa2, cc08, 
cc06, cc06) 1420 FNMSUB (aa2, cc07, cc05, cc05) 1421 FNMSUB (aa3, cc08, cc04, cc04) 1422 FNMSUB (aa3, cc07, cc03, cc03) 1423 FNMSUB (aa4, cc08, cc02, cc02) 1424 FNMSUB (aa4, cc07, cc01, cc01) 1425 1426 LDF [BO + 18 * SIZE], a1 1427 LDF [BO + 17 * SIZE], a2 1428 LDF [BO + 16 * SIZE], a3 1429 1430 FMUL a1, c06, c06 1431 FMUL a1, c05, c05 1432 1433 FNMSUB (aa2, cc06, cc04, cc04) 1434 FNMSUB (aa2, cc05, cc03, cc03) 1435 FNMSUB (aa3, cc06, cc02, cc02) 1436 FNMSUB (aa3, cc05, cc01, cc01) 1437 1438 LDF [BO + 9 * SIZE], a1 1439 LDF [BO + 8 * SIZE], a2 1440 1441 FMUL a1, c04, c04 1442 FMUL a1, c03, c03 1443 1444 FNMSUB (aa2, cc04, cc02, cc02) 1445 FNMSUB (aa2, cc03, cc01, cc01) 1446 1447 LDF [BO + 0 * SIZE], a1 1448 1449 FMUL a1, c02, c02 1450 FMUL a1, c01, c01 1451#endif 1452 1453#ifdef LN 1454 add C1, -2 * SIZE, C1 1455 add C2, -2 * SIZE, C2 1456 add C3, -2 * SIZE, C3 1457 add C4, -2 * SIZE, C4 1458 add C5, -2 * SIZE, C5 1459 add C6, -2 * SIZE, C6 1460 add C7, -2 * SIZE, C7 1461 add C8, -2 * SIZE, C8 1462#endif 1463 1464#if defined(LN) || defined(LT) 1465 STF c01, [BO + 0 * SIZE] 1466 STF c03, [BO + 1 * SIZE] 1467 STF c05, [BO + 2 * SIZE] 1468 STF c07, [BO + 3 * SIZE] 1469 1470 STF c09, [BO + 4 * SIZE] 1471 STF c11, [BO + 5 * SIZE] 1472 STF c13, [BO + 6 * SIZE] 1473 STF c15, [BO + 7 * SIZE] 1474 1475 STF c02, [BO + 8 * SIZE] 1476 STF c04, [BO + 9 * SIZE] 1477 STF c06, [BO + 10 * SIZE] 1478 STF c08, [BO + 11 * SIZE] 1479 1480 STF c10, [BO + 12 * SIZE] 1481 STF c12, [BO + 13 * SIZE] 1482 STF c14, [BO + 14 * SIZE] 1483 STF c16, [BO + 15 * SIZE] 1484#else 1485 STF c01, [AO + 0 * SIZE] 1486 STF c02, [AO + 1 * SIZE] 1487 STF c03, [AO + 2 * SIZE] 1488 STF c04, [AO + 3 * SIZE] 1489 1490 STF c05, [AO + 4 * SIZE] 1491 STF c06, [AO + 5 * SIZE] 1492 STF c07, [AO + 6 * SIZE] 1493 STF c08, [AO + 7 * SIZE] 1494 1495 STF c09, [AO + 8 * SIZE] 1496 STF c10, [AO + 9 * SIZE] 1497 STF c11, [AO + 10 * SIZE] 1498 STF c12, [AO + 11 * SIZE] 1499 1500 STF c13, [AO + 12 * SIZE] 1501 STF c14, [AO + 
13 * SIZE] 1502 STF c15, [AO + 14 * SIZE] 1503 STF c16, [AO + 15 * SIZE] 1504#endif 1505 1506 STF c01, [C1 + 0 * SIZE] 1507 STF c02, [C1 + 1 * SIZE] 1508 STF c03, [C2 + 0 * SIZE] 1509 STF c04, [C2 + 1 * SIZE] 1510 1511 STF c05, [C3 + 0 * SIZE] 1512 STF c06, [C3 + 1 * SIZE] 1513 STF c07, [C4 + 0 * SIZE] 1514 STF c08, [C4 + 1 * SIZE] 1515 1516 STF c09, [C5 + 0 * SIZE] 1517 STF c10, [C5 + 1 * SIZE] 1518 STF c11, [C6 + 0 * SIZE] 1519 STF c12, [C6 + 1 * SIZE] 1520 1521 STF c13, [C7 + 0 * SIZE] 1522 STF c14, [C7 + 1 * SIZE] 1523 STF c15, [C8 + 0 * SIZE] 1524 STF c16, [C8 + 1 * SIZE] 1525 1526#ifndef LN 1527 add C1, 2 * SIZE, C1 1528 add C2, 2 * SIZE, C2 1529 add C3, 2 * SIZE, C3 1530 add C4, 2 * SIZE, C4 1531 add C5, 2 * SIZE, C5 1532 add C6, 2 * SIZE, C6 1533 add C7, 2 * SIZE, C7 1534 add C8, 2 * SIZE, C8 1535#endif 1536 1537#ifdef RT 1538 sll K, BASE_SHIFT + 1, TEMP1 1539 add AORIG, TEMP1, AORIG 1540#endif 1541 1542#if defined(LT) || defined(RN) 1543 sub K, KK, TEMP1 1544 sll TEMP1, BASE_SHIFT + 1, TEMP2 1545 sll TEMP1, BASE_SHIFT + 3, TEMP1 1546 add AO, TEMP2, AO 1547 add BO, TEMP1, BO 1548#endif 1549 1550#ifdef LT 1551 add KK, 2, KK 1552#endif 1553 1554#ifdef LN 1555 sub KK, 2, KK 1556#endif 1557 1558 add I, -1, I 1559 cmp I, 0 1560 bg,pt %icc, .LL12 1561 nop 1562 .align 4 1563 1564.LL20: 1565 and M, 1, I 1566 cmp I, 0 1567 ble,pn %icc, .LL29 1568 nop 1569 1570#if defined(LT) || defined(RN) 1571 mov B, BO 1572#else 1573#ifdef LN 1574 sll K, BASE_SHIFT + 0, TEMP1 1575 sub AORIG, TEMP1, AORIG 1576#endif 1577 1578 sll KK, BASE_SHIFT + 0, TEMP1 1579 sll KK, BASE_SHIFT + 3, TEMP2 1580 1581 add AORIG, TEMP1, AO 1582 add B, TEMP2, BO 1583#endif 1584 1585 LDF [AO + 0 * SIZE], a1 1586 LDF [AO + 1 * SIZE], a2 1587 LDF [AO + 2 * SIZE], a3 1588 LDF [AO + 3 * SIZE], a4 1589 1590 LDF [BO + 0 * SIZE], b1 1591 FCLR (cc01) 1592 LDF [BO + 1 * SIZE], b2 1593 FCLR (cc03) 1594 LDF [BO + 2 * SIZE], b3 1595 FCLR (cc05) 1596 LDF [BO + 3 * SIZE], b4 1597 FCLR (cc07) 1598 LDF [BO + 4 * SIZE], 
b5 1599 FCLR (cc09) 1600 LDF [BO + 5 * SIZE], b6 1601 FCLR (cc11) 1602 LDF [BO + 6 * SIZE], b7 1603 FCLR (cc13) 1604 LDF [BO + 7 * SIZE], b8 1605 FCLR (cc15) 1606 1607#if defined(LT) || defined(RN) 1608 sra KK, 2, L 1609#else 1610 sub K, KK, L 1611 sra L, 2, L 1612#endif 1613 cmp L, 0 1614 ble,pn %icc, .LL25 1615 LDF [BO + 8 * SIZE], b9 1616 .align 4 1617 1618.LL23: 1619 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1620 add L, -1, L 1621 1622 FMADD (aa1, bb1, cc01, cc01) 1623 LDF [BO + 16 * SIZE], b1 1624 FMADD (aa1, bb2, cc03, cc03) 1625 LDF [BO + 9 * SIZE], b2 1626 1627 FMADD (aa1, bb3, cc05, cc05) 1628 LDF [BO + 10 * SIZE], b3 1629 FMADD (aa1, bb4, cc07, cc07) 1630 LDF [BO + 11 * SIZE], b4 1631 1632 FMADD (aa1, bb5, cc09, cc09) 1633 LDF [BO + 12 * SIZE], b5 1634 FMADD (aa1, bb6, cc11, cc11) 1635 LDF [BO + 13 * SIZE], b6 1636 1637 FMADD (aa1, bb7, cc13, cc13) 1638 LDF [BO + 14 * SIZE], b7 1639 FMADD (aa1, bb8, cc15, cc15) 1640 LDF [BO + 15 * SIZE], b8 1641 1642 FMADD (aa2, bb9, cc01, cc01) 1643 LDF [BO + 24 * SIZE], b9 1644 FMADD (aa2, bb2, cc03, cc03) 1645 LDF [BO + 17 * SIZE], b2 1646 1647 FMADD (aa2, bb3, cc05, cc05) 1648 LDF [BO + 18 * SIZE], b3 1649 FMADD (aa2, bb4, cc07, cc07) 1650 LDF [BO + 19 * SIZE], b4 1651 1652 FMADD (aa2, bb5, cc09, cc09) 1653 LDF [BO + 20 * SIZE], b5 1654 FMADD (aa2, bb6, cc11, cc11) 1655 LDF [BO + 21 * SIZE], b6 1656 1657 FMADD (aa2, bb7, cc13, cc13) 1658 LDF [BO + 22 * SIZE], b7 1659 FMADD (aa2, bb8, cc15, cc15) 1660 LDF [BO + 23 * SIZE], b8 1661 1662 LDF [AO + 4 * SIZE], a1 1663 LDF [AO + 5 * SIZE], a2 1664 1665 FMADD (aa3, bb1, cc01, cc01) 1666 LDF [BO + 32 * SIZE], b1 1667 FMADD (aa3, bb2, cc03, cc03) 1668 LDF [BO + 25 * SIZE], b2 1669 1670 FMADD (aa3, bb3, cc05, cc05) 1671 LDF [BO + 26 * SIZE], b3 1672 FMADD (aa3, bb4, cc07, cc07) 1673 LDF [BO + 27 * SIZE], b4 1674 1675 FMADD (aa3, bb5, cc09, cc09) 1676 LDF [BO + 28 * SIZE], b5 1677 FMADD (aa3, bb6, cc11, cc11) 1678 LDF [BO + 29 * SIZE], b6 1679 1680 FMADD 
(aa3, bb7, cc13, cc13) 1681 LDF [BO + 30 * SIZE], b7 1682 FMADD (aa3, bb8, cc15, cc15) 1683 LDF [BO + 31 * SIZE], b8 1684 1685 FMADD (aa4, bb9, cc01, cc01) 1686 LDF [BO + 40 * SIZE], b9 1687 FMADD (aa4, bb2, cc03, cc03) 1688 LDF [BO + 33 * SIZE], b2 1689 1690 FMADD (aa4, bb3, cc05, cc05) 1691 LDF [BO + 34 * SIZE], b3 1692 FMADD (aa4, bb4, cc07, cc07) 1693 LDF [BO + 35 * SIZE], b4 1694 1695 FMADD (aa4, bb5, cc09, cc09) 1696 LDF [BO + 36 * SIZE], b5 1697 FMADD (aa4, bb6, cc11, cc11) 1698 LDF [BO + 37 * SIZE], b6 1699 1700 FMADD (aa4, bb7, cc13, cc13) 1701 LDF [BO + 38 * SIZE], b7 1702 FMADD (aa4, bb8, cc15, cc15) 1703 LDF [BO + 39 * SIZE], b8 1704 1705 LDF [AO + 6 * SIZE], a3 1706 LDF [AO + 7 * SIZE], a4 1707 1708 add AO, 4 * SIZE, AO 1709 cmp L, 0 1710 bg,pt %icc, .LL23 1711 add BO, 32 * SIZE, BO 1712 .align 4 1713 1714.LL25: 1715#if defined(LT) || defined(RN) 1716 and KK, 3, L 1717#else 1718 sub K, KK, L 1719 and L, 3, L 1720#endif 1721 cmp L, 0 1722 ble,a,pn %icc, .LL28 1723 nop 1724 .align 4 1725 1726.LL27: 1727 FMADD (aa1, bb1, cc01, cc01) 1728 LDF [BO + 8 * SIZE], b1 1729 FMADD (aa1, bb2, cc03, cc03) 1730 LDF [BO + 9 * SIZE], b2 1731 1732 FMADD (aa1, bb3, cc05, cc05) 1733 LDF [BO + 10 * SIZE], b3 1734 FMADD (aa1, bb4, cc07, cc07) 1735 LDF [BO + 11 * SIZE], b4 1736 1737 FMADD (aa1, bb5, cc09, cc09) 1738 LDF [BO + 12 * SIZE], b5 1739 FMADD (aa1, bb6, cc11, cc11) 1740 LDF [BO + 13 * SIZE], b6 1741 1742 FMADD (aa1, bb7, cc13, cc13) 1743 LDF [BO + 14 * SIZE], b7 1744 FMADD (aa1, bb8, cc15, cc15) 1745 LDF [BO + 15 * SIZE], b8 1746 1747 LDF [AO + 1 * SIZE], a1 1748 add AO, 1 * SIZE, AO 1749 1750 add L, -1, L 1751 cmp L, 0 1752 bg,pt %icc, .LL27 1753 add BO, 8 * SIZE, BO 1754 .align 4 1755 1756.LL28: 1757#if defined(LN) || defined(RT) 1758#ifdef LN 1759 sub KK, 1, TEMP1 1760#else 1761 sub KK, 8, TEMP1 1762#endif 1763 sll TEMP1, BASE_SHIFT + 0, TEMP2 1764 sll TEMP1, BASE_SHIFT + 3, TEMP1 1765 1766 add AORIG, TEMP2, AO 1767 add B, TEMP1, BO 1768#endif 1769 1770#if 
defined(LN) || defined(LT) 1771 LDF [BO + 0 * SIZE], a1 1772 LDF [BO + 1 * SIZE], a2 1773 LDF [BO + 2 * SIZE], a3 1774 LDF [BO + 3 * SIZE], a4 1775 1776 LDF [BO + 4 * SIZE], b1 1777 LDF [BO + 5 * SIZE], b2 1778 LDF [BO + 6 * SIZE], b3 1779 LDF [BO + 7 * SIZE], b4 1780 1781 FSUB a1, c01, c01 1782 FSUB a2, c03, c03 1783 FSUB a3, c05, c05 1784 FSUB a4, c07, c07 1785 1786 FSUB b1, c09, c09 1787 FSUB b2, c11, c11 1788 FSUB b3, c13, c13 1789 FSUB b4, c15, c15 1790#else 1791 LDF [AO + 0 * SIZE], a1 1792 LDF [AO + 1 * SIZE], a2 1793 LDF [AO + 2 * SIZE], a3 1794 LDF [AO + 3 * SIZE], a4 1795 1796 LDF [AO + 4 * SIZE], b1 1797 LDF [AO + 5 * SIZE], b2 1798 LDF [AO + 6 * SIZE], b3 1799 LDF [AO + 7 * SIZE], b4 1800 1801 FSUB a1, c01, c01 1802 FSUB a2, c03, c03 1803 FSUB a3, c05, c05 1804 FSUB a4, c07, c07 1805 1806 FSUB b1, c09, c09 1807 FSUB b2, c11, c11 1808 FSUB b3, c13, c13 1809 FSUB b4, c15, c15 1810#endif 1811 1812#if defined(LN) || defined(LT) 1813 LDF [AO + 0 * SIZE], a1 1814 1815 FMUL a1, c01, c01 1816 FMUL a1, c03, c03 1817 FMUL a1, c05, c05 1818 FMUL a1, c07, c07 1819 FMUL a1, c09, c09 1820 FMUL a1, c11, c11 1821 FMUL a1, c13, c13 1822 FMUL a1, c15, c15 1823#endif 1824 1825#ifdef RN 1826 LDF [BO + 0 * SIZE], a1 1827 LDF [BO + 1 * SIZE], a2 1828 LDF [BO + 2 * SIZE], a3 1829 LDF [BO + 3 * SIZE], a4 1830 LDF [BO + 4 * SIZE], b1 1831 LDF [BO + 5 * SIZE], b2 1832 LDF [BO + 6 * SIZE], b3 1833 LDF [BO + 7 * SIZE], b4 1834 1835 FMUL a1, c01, c01 1836 1837 FNMSUB (aa2, cc01, cc03, cc03) 1838 FNMSUB (aa3, cc01, cc05, cc05) 1839 FNMSUB (aa4, cc01, cc07, cc07) 1840 FNMSUB (bb1, cc01, cc09, cc09) 1841 FNMSUB (bb2, cc01, cc11, cc11) 1842 FNMSUB (bb3, cc01, cc13, cc13) 1843 FNMSUB (bb4, cc01, cc15, cc15) 1844 1845 LDF [BO + 9 * SIZE], a1 1846 LDF [BO + 10 * SIZE], a2 1847 LDF [BO + 11 * SIZE], a3 1848 LDF [BO + 12 * SIZE], a4 1849 LDF [BO + 13 * SIZE], b1 1850 LDF [BO + 14 * SIZE], b2 1851 LDF [BO + 15 * SIZE], b3 1852 1853 FMUL a1, c03, c03 1854 1855 FNMSUB (aa2, cc03, cc05, cc05) 
1856 FNMSUB (aa3, cc03, cc07, cc07) 1857 FNMSUB (aa4, cc03, cc09, cc09) 1858 FNMSUB (bb1, cc03, cc11, cc11) 1859 FNMSUB (bb2, cc03, cc13, cc13) 1860 FNMSUB (bb3, cc03, cc15, cc15) 1861 1862 LDF [BO + 18 * SIZE], a1 1863 LDF [BO + 19 * SIZE], a2 1864 LDF [BO + 20 * SIZE], a3 1865 LDF [BO + 21 * SIZE], a4 1866 LDF [BO + 22 * SIZE], b1 1867 LDF [BO + 23 * SIZE], b2 1868 1869 FMUL a1, c05, c05 1870 1871 FNMSUB (aa2, cc05, cc07, cc07) 1872 FNMSUB (aa3, cc05, cc09, cc09) 1873 FNMSUB (aa4, cc05, cc11, cc11) 1874 FNMSUB (bb1, cc05, cc13, cc13) 1875 FNMSUB (bb2, cc05, cc15, cc15) 1876 1877 LDF [BO + 27 * SIZE], a1 1878 LDF [BO + 28 * SIZE], a2 1879 LDF [BO + 29 * SIZE], a3 1880 LDF [BO + 30 * SIZE], a4 1881 LDF [BO + 31 * SIZE], b1 1882 1883 FMUL a1, c07, c07 1884 1885 FNMSUB (aa2, cc07, cc09, cc09) 1886 FNMSUB (aa3, cc07, cc11, cc11) 1887 FNMSUB (aa4, cc07, cc13, cc13) 1888 FNMSUB (bb1, cc07, cc15, cc15) 1889 1890 LDF [BO + 36 * SIZE], a1 1891 LDF [BO + 37 * SIZE], a2 1892 LDF [BO + 38 * SIZE], a3 1893 LDF [BO + 39 * SIZE], a4 1894 1895 FMUL a1, c09, c09 1896 1897 FNMSUB (aa2, cc09, cc11, cc11) 1898 FNMSUB (aa3, cc09, cc13, cc13) 1899 FNMSUB (aa4, cc09, cc15, cc15) 1900 1901 LDF [BO + 45 * SIZE], a1 1902 LDF [BO + 46 * SIZE], a2 1903 LDF [BO + 47 * SIZE], a3 1904 1905 FMUL a1, c11, c11 1906 1907 FNMSUB (aa2, cc11, cc13, cc13) 1908 FNMSUB (aa3, cc11, cc15, cc15) 1909 1910 LDF [BO + 54 * SIZE], a1 1911 LDF [BO + 55 * SIZE], a2 1912 1913 FMUL a1, c13, c13 1914 1915 FNMSUB (aa2, cc13, cc15, cc15) 1916 1917 LDF [BO + 63 * SIZE], a1 1918 1919 FMUL a1, c15, c15 1920#endif 1921 1922#ifdef RT 1923 LDF [BO + 63 * SIZE], a1 1924 LDF [BO + 62 * SIZE], a2 1925 LDF [BO + 61 * SIZE], a3 1926 LDF [BO + 60 * SIZE], a4 1927 LDF [BO + 59 * SIZE], b1 1928 LDF [BO + 58 * SIZE], b2 1929 LDF [BO + 57 * SIZE], b3 1930 LDF [BO + 56 * SIZE], b4 1931 1932 FMUL a1, c15, c15 1933 1934 FNMSUB (aa2, cc15, cc13, cc13) 1935 FNMSUB (aa3, cc15, cc11, cc11) 1936 FNMSUB (aa4, cc15, cc09, cc09) 1937 FNMSUB 
(bb1, cc15, cc07, cc07) 1938 FNMSUB (bb2, cc15, cc05, cc05) 1939 FNMSUB (bb3, cc15, cc03, cc03) 1940 FNMSUB (bb4, cc15, cc01, cc01) 1941 1942 LDF [BO + 54 * SIZE], a1 1943 LDF [BO + 53 * SIZE], a2 1944 LDF [BO + 52 * SIZE], a3 1945 LDF [BO + 51 * SIZE], a4 1946 LDF [BO + 50 * SIZE], b1 1947 LDF [BO + 49 * SIZE], b2 1948 LDF [BO + 48 * SIZE], b3 1949 1950 FMUL a1, c13, c13 1951 1952 FNMSUB (aa2, cc13, cc11, cc11) 1953 FNMSUB (aa3, cc13, cc09, cc09) 1954 FNMSUB (aa4, cc13, cc07, cc07) 1955 FNMSUB (bb1, cc13, cc05, cc05) 1956 FNMSUB (bb2, cc13, cc03, cc03) 1957 FNMSUB (bb3, cc13, cc01, cc01) 1958 1959 LDF [BO + 45 * SIZE], a1 1960 LDF [BO + 44 * SIZE], a2 1961 LDF [BO + 43 * SIZE], a3 1962 LDF [BO + 42 * SIZE], a4 1963 LDF [BO + 41 * SIZE], b1 1964 LDF [BO + 40 * SIZE], b2 1965 1966 FMUL a1, c11, c11 1967 1968 FNMSUB (aa2, cc11, cc09, cc09) 1969 FNMSUB (aa3, cc11, cc07, cc07) 1970 FNMSUB (aa4, cc11, cc05, cc05) 1971 FNMSUB (bb1, cc11, cc03, cc03) 1972 FNMSUB (bb2, cc11, cc01, cc01) 1973 1974 LDF [BO + 36 * SIZE], a1 1975 LDF [BO + 35 * SIZE], a2 1976 LDF [BO + 34 * SIZE], a3 1977 LDF [BO + 33 * SIZE], a4 1978 LDF [BO + 32 * SIZE], b1 1979 1980 FMUL a1, c09, c09 1981 1982 FNMSUB (aa2, cc09, cc07, cc07) 1983 FNMSUB (aa3, cc09, cc05, cc05) 1984 FNMSUB (aa4, cc09, cc03, cc03) 1985 FNMSUB (bb1, cc09, cc01, cc01) 1986 1987 LDF [BO + 27 * SIZE], a1 1988 LDF [BO + 26 * SIZE], a2 1989 LDF [BO + 25 * SIZE], a3 1990 LDF [BO + 24 * SIZE], a4 1991 1992 FMUL a1, c07, c07 1993 1994 FNMSUB (aa2, cc07, cc05, cc05) 1995 FNMSUB (aa3, cc07, cc03, cc03) 1996 FNMSUB (aa4, cc07, cc01, cc01) 1997 1998 LDF [BO + 18 * SIZE], a1 1999 LDF [BO + 17 * SIZE], a2 2000 LDF [BO + 16 * SIZE], a3 2001 2002 FMUL a1, c05, c05 2003 2004 FNMSUB (aa2, cc05, cc03, cc03) 2005 FNMSUB (aa3, cc05, cc01, cc01) 2006 2007 LDF [BO + 9 * SIZE], a1 2008 LDF [BO + 8 * SIZE], a2 2009 2010 FMUL a1, c03, c03 2011 2012 FNMSUB (aa2, cc03, cc01, cc01) 2013 2014 LDF [BO + 0 * SIZE], a1 2015 2016 FMUL a1, c01, c01 2017#endif 
2018 2019#ifdef LN 2020 add C1, -1 * SIZE, C1 2021 add C2, -1 * SIZE, C2 2022 add C3, -1 * SIZE, C3 2023 add C4, -1 * SIZE, C4 2024 add C5, -1 * SIZE, C5 2025 add C6, -1 * SIZE, C6 2026 add C7, -1 * SIZE, C7 2027 add C8, -1 * SIZE, C8 2028#endif 2029 2030#if defined(LN) || defined(LT) 2031 STF c01, [BO + 0 * SIZE] 2032 STF c03, [BO + 1 * SIZE] 2033 STF c05, [BO + 2 * SIZE] 2034 STF c07, [BO + 3 * SIZE] 2035 2036 STF c09, [BO + 4 * SIZE] 2037 STF c11, [BO + 5 * SIZE] 2038 STF c13, [BO + 6 * SIZE] 2039 STF c15, [BO + 7 * SIZE] 2040#else 2041 STF c01, [AO + 0 * SIZE] 2042 STF c03, [AO + 1 * SIZE] 2043 STF c05, [AO + 2 * SIZE] 2044 STF c07, [AO + 3 * SIZE] 2045 2046 STF c09, [AO + 4 * SIZE] 2047 STF c11, [AO + 5 * SIZE] 2048 STF c13, [AO + 6 * SIZE] 2049 STF c15, [AO + 7 * SIZE] 2050#endif 2051 2052 STF c01, [C1 + 0 * SIZE] 2053 STF c03, [C2 + 0 * SIZE] 2054 STF c05, [C3 + 0 * SIZE] 2055 STF c07, [C4 + 0 * SIZE] 2056 2057 STF c09, [C5 + 0 * SIZE] 2058 STF c11, [C6 + 0 * SIZE] 2059 STF c13, [C7 + 0 * SIZE] 2060 STF c15, [C8 + 0 * SIZE] 2061 2062#ifdef RT 2063 sll K, BASE_SHIFT + 0, TEMP1 2064 add AORIG, TEMP1, AORIG 2065#endif 2066 2067#if defined(LT) || defined(RN) 2068 sub K, KK, TEMP1 2069 sll TEMP1, BASE_SHIFT + 0, TEMP2 2070 sll TEMP1, BASE_SHIFT + 3, TEMP1 2071 add AO, TEMP2, AO 2072 add BO, TEMP1, BO 2073#endif 2074 2075#ifdef LT 2076 add KK, 1, KK 2077#endif 2078 2079#ifdef LN 2080 sub KK, 1, KK 2081#endif 2082 .align 4 2083 2084.LL29: 2085#ifdef LN 2086 sll K, BASE_SHIFT + 3, TEMP1 2087 add B, TEMP1, B 2088#endif 2089 2090#if defined(LT) || defined(RN) 2091 mov BO, B 2092#endif 2093 2094#ifdef RN 2095 add KK, 8, KK 2096#endif 2097 2098#ifdef RT 2099 sub KK, 8, KK 2100#endif 2101 2102 add J, -1, J 2103 cmp J, 0 2104 bg,pt %icc, .LL11 2105 nop 2106 .align 4 2107 2108.LL30: 2109 and N, 4, J 2110 cmp J, 0 2111 ble,pn %icc, .LL50 2112 nop 2113 2114#ifdef RT 2115 sll K, BASE_SHIFT + 2, TEMP1 2116 sub B, TEMP1, B 2117#endif 2118 2119#ifndef RT 2120 mov C, C1 2121 add 
C, LDC, C2 2122 add C2, LDC, C3 2123 add C3, LDC, C4 2124 add C4, LDC, C 2125#else 2126 sub C, LDC, C4 2127 sub C4, LDC, C3 2128 sub C3, LDC, C2 2129 sub C2, LDC, C1 2130 sub C2, LDC, C 2131#endif 2132 2133#ifdef LN 2134 add M, OFFSET, KK 2135#endif 2136 2137#ifdef LT 2138 mov OFFSET, KK 2139#endif 2140 2141#if defined(LN) || defined(RT) 2142 mov A, AORIG 2143#else 2144 mov A, AO 2145#endif 2146 2147 sra M, 1, I 2148 cmp I, 0 2149 ble,pn %icc, .LL40 2150 nop 2151 .align 4 2152 2153.LL32: 2154#if defined(LT) || defined(RN) 2155 mov B, BO 2156#else 2157#ifdef LN 2158 sll K, BASE_SHIFT + 1, TEMP1 2159 sub AORIG, TEMP1, AORIG 2160#endif 2161 2162 sll KK, BASE_SHIFT + 1, TEMP1 2163 sll KK, BASE_SHIFT + 2, TEMP2 2164 2165 add AORIG, TEMP1, AO 2166 add B, TEMP2, BO 2167#endif 2168 2169 LDF [AO + 0 * SIZE], a1 2170 LDF [AO + 1 * SIZE], a2 2171 2172 LDF [BO + 0 * SIZE], b1 2173 LDF [BO + 1 * SIZE], b2 2174 LDF [BO + 2 * SIZE], b3 2175 LDF [BO + 3 * SIZE], b4 2176 LDF [BO + 4 * SIZE], b5 2177 2178 LDF [BO + 5 * SIZE], b6 2179 FCLR (cc01) 2180 LDF [BO + 6 * SIZE], b7 2181 FCLR (cc02) 2182 LDF [BO + 7 * SIZE], b8 2183 FCLR (cc03) 2184 LDF [BO + 8 * SIZE], b9 2185 FCLR (cc04) 2186 2187 prefetch [C1 + 2 * SIZE], 3 2188 FCLR (cc05) 2189 prefetch [C2 + 2 * SIZE], 3 2190 FCLR (cc06) 2191 prefetch [C3 + 2 * SIZE], 3 2192 FCLR (cc07) 2193 prefetch [C4 + 2 * SIZE], 3 2194 FCLR (cc08) 2195 2196#if defined(LT) || defined(RN) 2197 sra KK, 2, L 2198#else 2199 sub K, KK, L 2200 sra L, 2, L 2201#endif 2202 cmp L, 0 2203 ble,pn %icc, .LL35 2204 nop 2205 .align 4 2206 2207.LL33: 2208 FMADD (aa1, bb1, cc01, cc01) 2209 LDF [AO + 2 * SIZE], a3 2210 FMADD (aa2, bb1, cc02, cc02) 2211 LDF [AO + 3 * SIZE], a4 2212 2213 FMADD (aa1, bb2, cc03, cc03) 2214 LDF [BO + 16 * SIZE], b1 2215 FMADD (aa2, bb2, cc04, cc04) 2216 LDF [BO + 9 * SIZE], b2 2217 2218 FMADD (aa1, bb3, cc05, cc05) 2219 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2220 FMADD (aa2, bb3, cc06, cc06) 2221 add L, -1, L 2222 
2223 FMADD (aa1, bb4, cc07, cc07) 2224 LDF [BO + 10 * SIZE], b3 2225 FMADD (aa2, bb4, cc08, cc08) 2226 LDF [BO + 11 * SIZE], b4 2227 2228 FMADD (aa3, bb5, cc01, cc01) 2229 LDF [AO + 4 * SIZE], a1 2230 FMADD (aa4, bb5, cc02, cc02) 2231 LDF [AO + 5 * SIZE], a2 2232 2233 FMADD (aa3, bb6, cc03, cc03) 2234 LDF [BO + 12 * SIZE], b5 2235 FMADD (aa4, bb6, cc04, cc04) 2236 LDF [BO + 13 * SIZE], b6 2237 2238 FMADD (aa3, bb7, cc05, cc05) 2239 cmp L, 0 2240 FMADD (aa4, bb7, cc06, cc06) 2241 add AO, 8 * SIZE, AO 2242 2243 FMADD (aa3, bb8, cc07, cc07) 2244 LDF [BO + 14 * SIZE], b7 2245 FMADD (aa4, bb8, cc08, cc08) 2246 LDF [BO + 15 * SIZE], b8 2247 2248 FMADD (aa1, bb9, cc01, cc01) 2249 LDF [AO - 2 * SIZE], a3 2250 FMADD (aa2, bb9, cc02, cc02) 2251 LDF [AO - 1 * SIZE], a4 2252 2253 FMADD (aa1, bb2, cc03, cc03) 2254 LDF [BO + 24 * SIZE], b9 2255 FMADD (aa2, bb2, cc04, cc04) 2256 LDF [BO + 17 * SIZE], b2 2257 2258 FMADD (aa1, bb3, cc05, cc05) 2259 add BO, 16 * SIZE, BO 2260 FMADD (aa2, bb3, cc06, cc06) 2261 nop 2262 2263 FMADD (aa1, bb4, cc07, cc07) 2264 LDF [BO + 2 * SIZE], b3 2265 FMADD (aa2, bb4, cc08, cc08) 2266 LDF [BO + 3 * SIZE], b4 2267 2268 FMADD (aa3, bb5, cc01, cc01) 2269 LDF [AO + 0 * SIZE], a1 2270 FMADD (aa4, bb5, cc02, cc02) 2271 LDF [AO + 1 * SIZE], a2 2272 FMADD (aa3, bb6, cc03, cc03) 2273 LDF [BO + 4 * SIZE], b5 2274 FMADD (aa4, bb6, cc04, cc04) 2275 LDF [BO + 5 * SIZE], b6 2276 2277 FMADD (aa3, bb7, cc05, cc05) 2278 nop 2279 FMADD (aa4, bb7, cc06, cc06) 2280 LDF [BO + 6 * SIZE], b7 2281 2282 FMADD (aa3, bb8, cc07, cc07) 2283 FMADD (aa4, bb8, cc08, cc08) 2284 bg,pt %icc, .LL33 2285 LDF [BO + 7 * SIZE], b8 2286 .align 4 2287 2288.LL35: 2289#if defined(LT) || defined(RN) 2290 and KK, 3, L 2291#else 2292 sub K, KK, L 2293 and L, 3, L 2294#endif 2295 cmp L, 0 2296 ble,a,pn %icc, .LL38 2297 nop 2298 .align 4 2299 2300.LL37: 2301 FMADD (aa1, bb1, cc01, cc01) 2302 add L, -1, L 2303 FMADD (aa2, bb1, cc02, cc02) 2304 LDF [BO + 4 * SIZE], b1 2305 2306 FMADD (aa1, bb2, 
cc03, cc03) 2307 add AO, 2 * SIZE, AO 2308 FMADD (aa2, bb2, cc04, cc04) 2309 LDF [BO + 5 * SIZE], b2 2310 2311 FMADD (aa1, bb3, cc05, cc05) 2312 cmp L, 0 2313 FMADD (aa2, bb3, cc06, cc06) 2314 LDF [BO + 6 * SIZE], b3 2315 2316 FMADD (aa1, bb4, cc07, cc07) 2317 LDF [AO + 0 * SIZE], a1 2318 FMADD (aa2, bb4, cc08, cc08) 2319 LDF [AO + 1 * SIZE], a2 2320 2321 LDF [BO + 7 * SIZE], b4 2322 bg,pt %icc, .LL37 2323 add BO, 4 * SIZE, BO 2324 .align 4 2325 2326.LL38: 2327#if defined(LN) || defined(RT) 2328#ifdef LN 2329 sub KK, 2, TEMP1 2330#else 2331 sub KK, 4, TEMP1 2332#endif 2333 sll TEMP1, BASE_SHIFT + 1, TEMP2 2334 sll TEMP1, BASE_SHIFT + 2, TEMP1 2335 2336 add AORIG, TEMP2, AO 2337 add B, TEMP1, BO 2338#endif 2339 2340#if defined(LN) || defined(LT) 2341 LDF [BO + 0 * SIZE], a1 2342 LDF [BO + 1 * SIZE], a2 2343 LDF [BO + 2 * SIZE], a3 2344 LDF [BO + 3 * SIZE], a4 2345 2346 LDF [BO + 4 * SIZE], b1 2347 LDF [BO + 5 * SIZE], b2 2348 LDF [BO + 6 * SIZE], b3 2349 LDF [BO + 7 * SIZE], b4 2350 2351 FSUB a1, c01, c01 2352 FSUB a2, c03, c03 2353 FSUB a3, c05, c05 2354 FSUB a4, c07, c07 2355 2356 FSUB b1, c02, c02 2357 FSUB b2, c04, c04 2358 FSUB b3, c06, c06 2359 FSUB b4, c08, c08 2360#else 2361 LDF [AO + 0 * SIZE], a1 2362 LDF [AO + 1 * SIZE], a2 2363 LDF [AO + 2 * SIZE], a3 2364 LDF [AO + 3 * SIZE], a4 2365 2366 LDF [AO + 4 * SIZE], b1 2367 LDF [AO + 5 * SIZE], b2 2368 LDF [AO + 6 * SIZE], b3 2369 LDF [AO + 7 * SIZE], b4 2370 2371 FSUB a1, c01, c01 2372 FSUB a2, c02, c02 2373 FSUB a3, c03, c03 2374 FSUB a4, c04, c04 2375 2376 FSUB b1, c05, c05 2377 FSUB b2, c06, c06 2378 FSUB b3, c07, c07 2379 FSUB b4, c08, c08 2380 2381#endif 2382 2383#ifdef LN 2384 LDF [AO + 3 * SIZE], a1 2385 LDF [AO + 2 * SIZE], a2 2386 LDF [AO + 0 * SIZE], a3 2387 2388 FMUL a1, c02, c02 2389 FMUL a1, c04, c04 2390 FMUL a1, c06, c06 2391 FMUL a1, c08, c08 2392 2393 FNMSUB (aa2, cc02, cc01, cc01) 2394 FNMSUB (aa2, cc04, cc03, cc03) 2395 FNMSUB (aa2, cc06, cc05, cc05) 2396 FNMSUB (aa2, cc08, cc07, cc07) 2397 
2398 FMUL a3, c01, c01 2399 FMUL a3, c03, c03 2400 FMUL a3, c05, c05 2401 FMUL a3, c07, c07 2402#endif 2403 2404#ifdef LT 2405 LDF [AO + 0 * SIZE], a1 2406 LDF [AO + 1 * SIZE], a2 2407 LDF [AO + 3 * SIZE], a3 2408 2409 FMUL a1, c01, c01 2410 FMUL a1, c03, c03 2411 FMUL a1, c05, c05 2412 FMUL a1, c07, c07 2413 2414 FNMSUB (aa2, cc01, cc02, cc02) 2415 FNMSUB (aa2, cc03, cc04, cc04) 2416 FNMSUB (aa2, cc05, cc06, cc06) 2417 FNMSUB (aa2, cc07, cc08, cc08) 2418 2419 FMUL a3, c02, c02 2420 FMUL a3, c04, c04 2421 FMUL a3, c06, c06 2422 FMUL a3, c08, c08 2423#endif 2424 2425#ifdef RN 2426 LDF [BO + 0 * SIZE], a1 2427 LDF [BO + 1 * SIZE], a2 2428 LDF [BO + 2 * SIZE], a3 2429 LDF [BO + 3 * SIZE], a4 2430 2431 FMUL a1, c01, c01 2432 FMUL a1, c02, c02 2433 2434 FNMSUB (aa2, cc01, cc03, cc03) 2435 FNMSUB (aa2, cc02, cc04, cc04) 2436 FNMSUB (aa3, cc01, cc05, cc05) 2437 FNMSUB (aa3, cc02, cc06, cc06) 2438 FNMSUB (aa4, cc01, cc07, cc07) 2439 FNMSUB (aa4, cc02, cc08, cc08) 2440 2441 LDF [BO + 5 * SIZE], a1 2442 LDF [BO + 6 * SIZE], a2 2443 LDF [BO + 7 * SIZE], a3 2444 2445 FMUL a1, c03, c03 2446 FMUL a1, c04, c04 2447 2448 FNMSUB (aa2, cc03, cc05, cc05) 2449 FNMSUB (aa2, cc04, cc06, cc06) 2450 FNMSUB (aa3, cc03, cc07, cc07) 2451 FNMSUB (aa3, cc04, cc08, cc08) 2452 2453 LDF [BO + 10 * SIZE], a1 2454 LDF [BO + 11 * SIZE], a2 2455 2456 FMUL a1, c05, c05 2457 FMUL a1, c06, c06 2458 2459 FNMSUB (aa2, cc05, cc07, cc07) 2460 FNMSUB (aa2, cc06, cc08, cc08) 2461 2462 LDF [BO + 15 * SIZE], a1 2463 2464 FMUL a1, c07, c07 2465 FMUL a1, c08, c08 2466#endif 2467 2468#ifdef RT 2469 LDF [BO + 15 * SIZE], a1 2470 LDF [BO + 14 * SIZE], a2 2471 LDF [BO + 13 * SIZE], a3 2472 LDF [BO + 12 * SIZE], a4 2473 2474 FMUL a1, c08, c08 2475 FMUL a1, c07, c07 2476 2477 FNMSUB (aa2, cc08, cc06, cc06) 2478 FNMSUB (aa2, cc07, cc05, cc05) 2479 FNMSUB (aa3, cc08, cc04, cc04) 2480 FNMSUB (aa3, cc07, cc03, cc03) 2481 FNMSUB (aa4, cc08, cc02, cc02) 2482 FNMSUB (aa4, cc07, cc01, cc01) 2483 2484 LDF [BO + 10 * SIZE], a1 
2485 LDF [BO + 9 * SIZE], a2 2486 LDF [BO + 8 * SIZE], a3 2487 2488 FMUL a1, c06, c06 2489 FMUL a1, c05, c05 2490 2491 FNMSUB (aa2, cc06, cc04, cc04) 2492 FNMSUB (aa2, cc05, cc03, cc03) 2493 FNMSUB (aa3, cc06, cc02, cc02) 2494 FNMSUB (aa3, cc05, cc01, cc01) 2495 2496 LDF [BO + 5 * SIZE], a1 2497 LDF [BO + 4 * SIZE], a2 2498 2499 FMUL a1, c04, c04 2500 FMUL a1, c03, c03 2501 2502 FNMSUB (aa2, cc04, cc02, cc02) 2503 FNMSUB (aa2, cc03, cc01, cc01) 2504 2505 LDF [BO + 0 * SIZE], a1 2506 2507 FMUL a1, c02, c02 2508 FMUL a1, c01, c01 2509#endif 2510 2511#ifdef LN 2512 add C1, -2 * SIZE, C1 2513 add C2, -2 * SIZE, C2 2514 add C3, -2 * SIZE, C3 2515 add C4, -2 * SIZE, C4 2516#endif 2517 2518#if defined(LN) || defined(LT) 2519 STF c01, [BO + 0 * SIZE] 2520 STF c03, [BO + 1 * SIZE] 2521 STF c05, [BO + 2 * SIZE] 2522 STF c07, [BO + 3 * SIZE] 2523 2524 STF c02, [BO + 4 * SIZE] 2525 STF c04, [BO + 5 * SIZE] 2526 STF c06, [BO + 6 * SIZE] 2527 STF c08, [BO + 7 * SIZE] 2528#else 2529 STF c01, [AO + 0 * SIZE] 2530 STF c02, [AO + 1 * SIZE] 2531 STF c03, [AO + 2 * SIZE] 2532 STF c04, [AO + 3 * SIZE] 2533 2534 STF c05, [AO + 4 * SIZE] 2535 STF c06, [AO + 5 * SIZE] 2536 STF c07, [AO + 6 * SIZE] 2537 STF c08, [AO + 7 * SIZE] 2538#endif 2539 2540 STF c01, [C1 + 0 * SIZE] 2541 STF c02, [C1 + 1 * SIZE] 2542 STF c03, [C2 + 0 * SIZE] 2543 STF c04, [C2 + 1 * SIZE] 2544 2545 STF c05, [C3 + 0 * SIZE] 2546 STF c06, [C3 + 1 * SIZE] 2547 STF c07, [C4 + 0 * SIZE] 2548 STF c08, [C4 + 1 * SIZE] 2549 2550#ifndef LN 2551 add C1, 2 * SIZE, C1 2552 add C2, 2 * SIZE, C2 2553 add C3, 2 * SIZE, C3 2554 add C4, 2 * SIZE, C4 2555#endif 2556 2557#ifdef RT 2558 sll K, BASE_SHIFT + 1, TEMP1 2559 add AORIG, TEMP1, AORIG 2560#endif 2561 2562#if defined(LT) || defined(RN) 2563 sub K, KK, TEMP1 2564 sll TEMP1, BASE_SHIFT + 1, TEMP2 2565 sll TEMP1, BASE_SHIFT + 2, TEMP1 2566 add AO, TEMP2, AO 2567 add BO, TEMP1, BO 2568#endif 2569 2570#ifdef LT 2571 add KK, 2, KK 2572#endif 2573 2574#ifdef LN 2575 sub KK, 2, KK 
2576#endif 2577 2578 add I, -1, I 2579 cmp I, 0 2580 bg,pt %icc, .LL32 2581 nop 2582 2583.LL40: 2584 and M, 1, I 2585 cmp I, 0 2586 ble,pn %icc, .LL49 2587 nop 2588 2589#if defined(LT) || defined(RN) 2590 mov B, BO 2591#else 2592#ifdef LN 2593 sll K, BASE_SHIFT + 0, TEMP1 2594 sub AORIG, TEMP1, AORIG 2595#endif 2596 2597 sll KK, BASE_SHIFT + 0, TEMP1 2598 sll KK, BASE_SHIFT + 2, TEMP2 2599 2600 add AORIG, TEMP1, AO 2601 add B, TEMP2, BO 2602#endif 2603 2604 LDF [AO + 0 * SIZE], a1 2605 LDF [AO + 1 * SIZE], a2 2606 LDF [AO + 2 * SIZE], a3 2607 LDF [AO + 3 * SIZE], a4 2608 2609 LDF [BO + 0 * SIZE], b1 2610 LDF [BO + 1 * SIZE], b2 2611 LDF [BO + 2 * SIZE], b3 2612 LDF [BO + 3 * SIZE], b4 2613 LDF [BO + 4 * SIZE], b5 2614 LDF [BO + 5 * SIZE], b6 2615 FCLR (cc01) 2616 LDF [BO + 6 * SIZE], b7 2617 FCLR (cc03) 2618 LDF [BO + 7 * SIZE], b8 2619 FCLR (cc05) 2620 LDF [BO + 8 * SIZE], b9 2621 FCLR (cc07) 2622 2623#if defined(LT) || defined(RN) 2624 sra KK, 2, L 2625#else 2626 sub K, KK, L 2627 sra L, 2, L 2628#endif 2629 cmp L, 0 2630 ble,pn %icc, .LL45 2631 nop 2632 2633.LL43: 2634 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2635 add L, -1, L 2636 2637 FMADD (aa1, bb1, cc01, cc01) 2638 LDF [BO + 16 * SIZE], b1 2639 FMADD (aa1, bb2, cc03, cc03) 2640 LDF [BO + 9 * SIZE], b2 2641 FMADD (aa1, bb3, cc05, cc05) 2642 LDF [BO + 10 * SIZE], b3 2643 FMADD (aa1, bb4, cc07, cc07) 2644 LDF [BO + 11 * SIZE], b4 2645 2646 LDF [AO + 4 * SIZE], a1 2647 cmp L, 0 2648 2649 FMADD (aa2, bb5, cc01, cc01) 2650 LDF [BO + 12 * SIZE], b5 2651 FMADD (aa2, bb6, cc03, cc03) 2652 LDF [BO + 13 * SIZE], b6 2653 FMADD (aa2, bb7, cc05, cc05) 2654 LDF [BO + 14 * SIZE], b7 2655 FMADD (aa2, bb8, cc07, cc07) 2656 LDF [BO + 15 * SIZE], b8 2657 2658 LDF [AO + 5 * SIZE], a2 2659 add AO, 4 * SIZE, AO 2660 2661 FMADD (aa3, bb9, cc01, cc01) 2662 LDF [BO + 24 * SIZE], b9 2663 FMADD (aa3, bb2, cc03, cc03) 2664 LDF [BO + 17 * SIZE], b2 2665 FMADD (aa3, bb3, cc05, cc05) 2666 LDF [BO + 18 * SIZE], b3 
2667 FMADD (aa3, bb4, cc07, cc07) 2668 LDF [BO + 19 * SIZE], b4 2669 2670 LDF [AO + 2 * SIZE], a3 2671 add BO, 16 * SIZE, BO 2672 2673 FMADD (aa4, bb5, cc01, cc01) 2674 LDF [BO + 4 * SIZE], b5 2675 FMADD (aa4, bb6, cc03, cc03) 2676 LDF [BO + 5 * SIZE], b6 2677 FMADD (aa4, bb7, cc05, cc05) 2678 LDF [BO + 6 * SIZE], b7 2679 FMADD (aa4, bb8, cc07, cc07) 2680 LDF [BO + 7 * SIZE], b8 2681 2682 bg,pt %icc, .LL43 2683 LDF [AO + 3 * SIZE], a4 2684 .align 4 2685 2686.LL45: 2687#if defined(LT) || defined(RN) 2688 and KK, 3, L 2689#else 2690 sub K, KK, L 2691 and L, 3, L 2692#endif 2693 cmp L, 0 2694 ble,a,pn %icc, .LL48 2695 nop 2696 .align 4 2697 2698.LL47: 2699 FMADD (aa1, bb1, cc01, cc01) 2700 LDF [BO + 4 * SIZE], b1 2701 add L, -1, L 2702 FMADD (aa1, bb2, cc03, cc03) 2703 LDF [BO + 5 * SIZE], b2 2704 add AO, 1 * SIZE, AO 2705 2706 FMADD (aa1, bb3, cc05, cc05) 2707 LDF [BO + 6 * SIZE], b3 2708 cmp L, 0 2709 FMADD (aa1, bb4, cc07, cc07) 2710 LDF [BO + 7 * SIZE], b4 2711 add BO, 4 * SIZE, BO 2712 2713 bg,pt %icc, .LL47 2714 LDF [AO + 0 * SIZE], a1 2715 .align 4 2716 2717.LL48: 2718#if defined(LN) || defined(RT) 2719#ifdef LN 2720 sub KK, 1, TEMP1 2721#else 2722 sub KK, 4, TEMP1 2723#endif 2724 sll TEMP1, BASE_SHIFT + 0, TEMP2 2725 sll TEMP1, BASE_SHIFT + 2, TEMP1 2726 2727 add AORIG, TEMP2, AO 2728 add B, TEMP1, BO 2729#endif 2730 2731#if defined(LN) || defined(LT) 2732 LDF [BO + 0 * SIZE], a1 2733 LDF [BO + 1 * SIZE], a2 2734 LDF [BO + 2 * SIZE], a3 2735 LDF [BO + 3 * SIZE], a4 2736 2737 FSUB a1, c01, c01 2738 FSUB a2, c03, c03 2739 FSUB a3, c05, c05 2740 FSUB a4, c07, c07 2741#else 2742 LDF [AO + 0 * SIZE], a1 2743 LDF [AO + 1 * SIZE], a2 2744 LDF [AO + 2 * SIZE], a3 2745 LDF [AO + 3 * SIZE], a4 2746 2747 FSUB a1, c01, c01 2748 FSUB a2, c03, c03 2749 FSUB a3, c05, c05 2750 FSUB a4, c07, c07 2751#endif 2752 2753#if defined(LN) || defined(LT) 2754 LDF [AO + 0 * SIZE], a1 2755 2756 FMUL a1, c01, c01 2757 FMUL a1, c03, c03 2758 FMUL a1, c05, c05 2759 FMUL a1, c07, c07 
2760#endif 2761 2762#ifdef RN 2763 LDF [BO + 0 * SIZE], a1 2764 LDF [BO + 1 * SIZE], a2 2765 LDF [BO + 2 * SIZE], a3 2766 LDF [BO + 3 * SIZE], a4 2767 2768 FMUL a1, c01, c01 2769 2770 FNMSUB (aa2, cc01, cc03, cc03) 2771 FNMSUB (aa3, cc01, cc05, cc05) 2772 FNMSUB (aa4, cc01, cc07, cc07) 2773 2774 LDF [BO + 5 * SIZE], a1 2775 LDF [BO + 6 * SIZE], a2 2776 LDF [BO + 7 * SIZE], a3 2777 2778 FMUL a1, c03, c03 2779 2780 FNMSUB (aa2, cc03, cc05, cc05) 2781 FNMSUB (aa3, cc03, cc07, cc07) 2782 2783 LDF [BO + 10 * SIZE], a1 2784 LDF [BO + 11 * SIZE], a2 2785 2786 FMUL a1, c05, c05 2787 2788 FNMSUB (aa2, cc05, cc07, cc07) 2789 2790 LDF [BO + 15 * SIZE], a1 2791 2792 FMUL a1, c07, c07 2793#endif 2794 2795#ifdef RT 2796 LDF [BO + 15 * SIZE], a1 2797 LDF [BO + 14 * SIZE], a2 2798 LDF [BO + 13 * SIZE], a3 2799 LDF [BO + 12 * SIZE], a4 2800 2801 FMUL a1, c07, c07 2802 2803 FNMSUB (aa2, cc07, cc05, cc05) 2804 FNMSUB (aa3, cc07, cc03, cc03) 2805 FNMSUB (aa4, cc07, cc01, cc01) 2806 2807 LDF [BO + 10 * SIZE], a1 2808 LDF [BO + 9 * SIZE], a2 2809 LDF [BO + 8 * SIZE], a3 2810 2811 FMUL a1, c05, c05 2812 2813 FNMSUB (aa2, cc05, cc03, cc03) 2814 FNMSUB (aa3, cc05, cc01, cc01) 2815 2816 LDF [BO + 5 * SIZE], a1 2817 LDF [BO + 4 * SIZE], a2 2818 2819 FMUL a1, c03, c03 2820 2821 FNMSUB (aa2, cc03, cc01, cc01) 2822 2823 LDF [BO + 0 * SIZE], a1 2824 2825 FMUL a1, c01, c01 2826#endif 2827 2828#ifdef LN 2829 add C1, -1 * SIZE, C1 2830 add C2, -1 * SIZE, C2 2831 add C3, -1 * SIZE, C3 2832 add C4, -1 * SIZE, C4 2833#endif 2834 2835#if defined(LN) || defined(LT) 2836 STF c01, [BO + 0 * SIZE] 2837 STF c03, [BO + 1 * SIZE] 2838 STF c05, [BO + 2 * SIZE] 2839 STF c07, [BO + 3 * SIZE] 2840#else 2841 STF c01, [AO + 0 * SIZE] 2842 STF c03, [AO + 1 * SIZE] 2843 STF c05, [AO + 2 * SIZE] 2844 STF c07, [AO + 3 * SIZE] 2845#endif 2846 2847 STF c01, [C1 + 0 * SIZE] 2848 STF c03, [C2 + 0 * SIZE] 2849 STF c05, [C3 + 0 * SIZE] 2850 STF c07, [C4 + 0 * SIZE] 2851 2852#ifdef RT 2853 sll K, BASE_SHIFT + 0, TEMP1 2854 
add AORIG, TEMP1, AORIG 2855#endif 2856 2857#if defined(LT) || defined(RN) 2858 sub K, KK, TEMP1 2859 sll TEMP1, BASE_SHIFT + 0, TEMP2 2860 sll TEMP1, BASE_SHIFT + 2, TEMP1 2861 add AO, TEMP2, AO 2862 add BO, TEMP1, BO 2863#endif 2864 2865#ifdef LT 2866 add KK, 1, KK 2867#endif 2868 2869#ifdef LN 2870 sub KK, 1, KK 2871#endif 2872 .align 4 2873 2874.LL49: 2875#ifdef LN 2876 sll K, BASE_SHIFT + 2, TEMP1 2877 add B, TEMP1, B 2878#endif 2879 2880#if defined(LT) || defined(RN) 2881 mov BO, B 2882#endif 2883 2884#ifdef RN 2885 add KK, 4, KK 2886#endif 2887 2888#ifdef RT 2889 sub KK, 4, KK 2890#endif 2891 .align 4 2892 2893.LL50: 2894 and N, 2, J 2895 cmp J, 0 2896 ble,pn %icc, .LL70 2897 nop 2898 2899#ifdef RT 2900 sll K, BASE_SHIFT + 1, TEMP1 2901 sub B, TEMP1, B 2902#endif 2903 2904#ifndef RT 2905 mov C, C1 2906 add C, LDC, C2 2907 add C2, LDC, C 2908#else 2909 sub C, LDC, C2 2910 sub C2, LDC, C1 2911 sub C2, LDC, C 2912#endif 2913 2914#ifdef LN 2915 add M, OFFSET, KK 2916#endif 2917 2918#ifdef LT 2919 mov OFFSET, KK 2920#endif 2921 2922#if defined(LN) || defined(RT) 2923 mov A, AORIG 2924#else 2925 mov A, AO 2926#endif 2927 2928 sra M, 1, I 2929 cmp I, 0 2930 ble,pn %icc, .LL60 2931 nop 2932 .align 4 2933 2934.LL52: 2935#if defined(LT) || defined(RN) 2936 mov B, BO 2937#else 2938#ifdef LN 2939 sll K, BASE_SHIFT + 1, TEMP1 2940 sub AORIG, TEMP1, AORIG 2941#endif 2942 2943 sll KK, BASE_SHIFT + 1, TEMP1 2944 sll KK, BASE_SHIFT + 1, TEMP2 2945 2946 add AORIG, TEMP1, AO 2947 add B, TEMP2, BO 2948#endif 2949 2950 LDF [AO + 0 * SIZE], a1 2951 LDF [AO + 1 * SIZE], a2 2952 LDF [AO + 2 * SIZE], a3 2953 LDF [AO + 3 * SIZE], a4 2954 2955 LDF [BO + 0 * SIZE], b1 2956 LDF [BO + 1 * SIZE], b2 2957 LDF [BO + 2 * SIZE], b3 2958 FCLR (cc01) 2959 LDF [BO + 3 * SIZE], b4 2960 FCLR (cc02) 2961 2962 LDF [BO + 4 * SIZE], b5 2963 FCLR (cc03) 2964 LDF [BO + 5 * SIZE], b6 2965 FCLR (cc04) 2966 LDF [BO + 6 * SIZE], b7 2967 FCLR (cc05) 2968 LDF [BO + 7 * SIZE], b8 2969 FCLR (cc06) 2970 2971 
/* 2x2 tile of the 2-column panel, continued: end of the tile setup, accumulation loops .LL53 (four k steps per pass)
   and .LL57 (remainder), solve and write-back at .LL58, then the leftover row at .LL60 as a 1x2 tile with loops .LL63 and .LL67.
   FMADD (a, b, c, d) is presumably d = a * b + c and FNMSUB (a, b, c, d) presumably d = c - a * b, both macros are defined outside this part of the file. */
prefetch [C1 + 2 * SIZE], 3 2972 FCLR (cc07) 2973 prefetch [C2 + 2 * SIZE], 3 2974 FCLR (cc08) 2975 /* L = number of unrolled passes: KK / 4 for LT and RN, (K - KK) / 4 otherwise */ 2976#if defined(LT) || defined(RN) 2977 sra KK, 2, L 2978#else 2979 sub K, KK, L 2980 sra L, 2, L 2981#endif 2982 cmp L, 0 2983 ble,pn %icc, .LL55 2984 nop 2985 .align 4 2986 /* .LL53: unrolled loop, 4 k steps per pass, the loads for the following steps are interleaved with the multiply-adds */ 2987.LL53: 2988 FMADD (aa1, bb1, cc01, cc01) 2989 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2990 FMADD (aa2, bb1, cc02, cc02) 2991 LDF [BO + 8 * SIZE], b1 2992 2993 FMADD (aa1, bb2, cc03, cc03) 2994 LDF [AO + 4 * SIZE], a1 2995 FMADD (aa2, bb2, cc04, cc04) 2996 LDF [AO + 5 * SIZE], a2 2997 2998 FMADD (aa3, bb3, cc01, cc01) 2999 LDF [BO + 9 * SIZE], b2 3000 FMADD (aa4, bb3, cc02, cc02) 3001 LDF [BO + 10 * SIZE], b3 3002 3003 FMADD (aa3, bb4, cc03, cc03) 3004 LDF [AO + 6 * SIZE], a3 3005 FMADD (aa4, bb4, cc04, cc04) 3006 LDF [AO + 7 * SIZE], a4 3007 3008 FMADD (aa1, bb5, cc01, cc01) 3009 LDF [BO + 11 * SIZE], b4 3010 FMADD (aa2, bb5, cc02, cc02) 3011 LDF [BO + 12 * SIZE], b5 3012 3013 FMADD (aa1, bb6, cc03, cc03) 3014 LDF [AO + 8 * SIZE], a1 3015 FMADD (aa2, bb6, cc04, cc04) 3016 LDF [AO + 9 * SIZE], a2 3017 3018 FMADD (aa3, bb7, cc01, cc01) 3019 LDF [BO + 13 * SIZE], b6 3020 3021 FMADD (aa4, bb7, cc02, cc02) 3022 LDF [BO + 14 * SIZE], b7 3023 3024 FMADD (aa3, bb8, cc03, cc03) 3025 LDF [AO + 10 * SIZE], a3 3026 FMADD (aa4, bb8, cc04, cc04) 3027 LDF [AO + 11 * SIZE], a4 3028 3029 add AO, 8 * SIZE, AO 3030 add L, -1, L 3031 add BO, 8 * SIZE, BO 3032 cmp L, 0 3033 /* the load after the branch sits in its delay slot and reloads b8 relative to the advanced BO */ 3034 bg,pt %icc, .LL53 3035 LDF [BO + 7 * SIZE], b8 3036 .align 4 3037 /* .LL55: remaining 0-3 k steps */ 3038.LL55: 3039#if defined(LT) || defined(RN) 3040 and KK, 3, L 3041#else 3042 sub K, KK, L 3043 and L, 3, L 3044#endif 3045 cmp L, 0 3046 ble,a,pn %icc, .LL58 3047 nop 3048 .align 4 3049 /* .LL57: one k step per pass */ 3050.LL57: 3051 FMADD (aa1, bb1, cc01, cc01) 3052 add L, -1, L 3053 FMADD (aa2, bb1, cc02, cc02) 3054 LDF [BO + 2 * SIZE], b1 3055 3056 FMADD (aa1, bb2, cc03, cc03) 3057 LDF [AO + 2 * SIZE], a1 3058 FMADD (aa2, bb2, cc04, cc04) 3059 LDF [AO + 3 * SIZE], a2 3060 3061
add AO, 2 * SIZE, AO 3062 cmp L, 0 3063 add BO, 2 * SIZE, BO 3064 bg,pt %icc, .LL57 3065 LDF [BO + 1 * SIZE], b2 3066 .align 4 3067 /* .LL58: solve the 2x2 tile. LN and RT first repoint AO and BO at step KK - 2, both arms of the inner conditional are identical */ 3068.LL58: 3069#if defined(LN) || defined(RT) 3070#ifdef LN 3071 sub KK, 2, TEMP1 3072#else 3073 sub KK, 2, TEMP1 3074#endif 3075 sll TEMP1, BASE_SHIFT + 1, TEMP2 3076 sll TEMP1, BASE_SHIFT + 1, TEMP1 3077 3078 add AORIG, TEMP2, AO 3079 add B, TEMP1, BO 3080#endif 3081 /* replace each accumulator by packed value minus accumulator. LN and LT read from BO in c01 c03 c02 c04 order, RN and RT from AO in c01 c02 c03 c04 order */ 3082#if defined(LN) || defined(LT) 3083 LDF [BO + 0 * SIZE], a1 3084 LDF [BO + 1 * SIZE], a2 3085 LDF [BO + 2 * SIZE], a3 3086 LDF [BO + 3 * SIZE], a4 3087 3088 FSUB a1, c01, c01 3089 FSUB a2, c03, c03 3090 FSUB a3, c02, c02 3091 FSUB a4, c04, c04 3092#else 3093 LDF [AO + 0 * SIZE], a1 3094 LDF [AO + 1 * SIZE], a2 3095 LDF [AO + 2 * SIZE], a3 3096 LDF [AO + 3 * SIZE], a4 3097 3098 FSUB a1, c01, c01 3099 FSUB a2, c02, c02 3100 FSUB a3, c03, c03 3101 FSUB a4, c04, c04 3102#endif 3103 /* LN: factor entries 3, 2, 0 at AO, second row solved first */ 3104#ifdef LN 3105 LDF [AO + 3 * SIZE], a1 3106 LDF [AO + 2 * SIZE], a2 3107 LDF [AO + 0 * SIZE], a3 3108 3109 FMUL a1, c02, c02 3110 FMUL a1, c04, c04 3111 3112 FNMSUB (aa2, cc02, cc01, cc01) 3113 FNMSUB (aa2, cc04, cc03, cc03) 3114 3115 FMUL a3, c01, c01 3116 FMUL a3, c03, c03 3117#endif 3118 /* LT: factor entries 0, 1, 3 at AO, first row solved first */ 3119#ifdef LT 3120 LDF [AO + 0 * SIZE], a1 3121 LDF [AO + 1 * SIZE], a2 3122 LDF [AO + 3 * SIZE], a3 3123 3124 FMUL a1, c01, c01 3125 FMUL a1, c03, c03 3126 3127 FNMSUB (aa2, cc01, cc02, cc02) 3128 FNMSUB (aa2, cc03, cc04, cc04) 3129 3130 FMUL a3, c02, c02 3131 FMUL a3, c04, c04 3132#endif 3133 /* RN: factor entries 0, 1, 3 at BO, first column solved first */ 3134#ifdef RN 3135 LDF [BO + 0 * SIZE], a1 3136 LDF [BO + 1 * SIZE], a2 3137 3138 FMUL a1, c01, c01 3139 FMUL a1, c02, c02 3140 3141 FNMSUB (aa2, cc01, cc03, cc03) 3142 FNMSUB (aa2, cc02, cc04, cc04) 3143 3144 LDF [BO + 3 * SIZE], a1 3145 3146 FMUL a1, c03, c03 3147 FMUL a1, c04, c04 3148#endif 3149 /* RT: factor entries 3, 2, 0 at BO, second column solved first */ 3150#ifdef RT 3151 LDF [BO + 3 * SIZE], a1 3152 LDF [BO + 2 * SIZE], a2 3153 3154 FMUL a1, c04, c04 3155 FMUL a1, c03, c03 3156 3157 FNMSUB (aa2, cc04, cc02, cc02) 3158 FNMSUB (aa2, cc03, cc01, cc01) 3159 3160 LDF [BO
+ 0 * SIZE], a1 3161 3162 FMUL a1, c02, c02 3163 FMUL a1, c01, c01 3164#endif 3165 /* LN: step C1 and C2 back two elements before storing */ 3166#ifdef LN 3167 add C1, -2 * SIZE, C1 3168 add C2, -2 * SIZE, C2 3169#endif 3170 /* store the solved tile into the packed buffer, BO for LN and LT, AO otherwise, and then into C */ 3171#if defined(LN) || defined(LT) 3172 STF c01, [BO + 0 * SIZE] 3173 STF c03, [BO + 1 * SIZE] 3174 STF c02, [BO + 2 * SIZE] 3175 STF c04, [BO + 3 * SIZE] 3176#else 3177 STF c01, [AO + 0 * SIZE] 3178 STF c02, [AO + 1 * SIZE] 3179 STF c03, [AO + 2 * SIZE] 3180 STF c04, [AO + 3 * SIZE] 3181#endif 3182 3183 STF c01, [C1 + 0 * SIZE] 3184 STF c02, [C1 + 1 * SIZE] 3185 STF c03, [C2 + 0 * SIZE] 3186 STF c04, [C2 + 1 * SIZE] 3187 3188#ifndef LN 3189 add C1, 2 * SIZE, C1 3190 add C2, 2 * SIZE, C2 3191#endif 3192 /* bookkeeping for the next tile: AORIG for RT, AO and BO for LT and RN, KK for LT and LN */ 3193#ifdef RT 3194 sll K, BASE_SHIFT + 1, TEMP1 3195 add AORIG, TEMP1, AORIG 3196#endif 3197 3198#if defined(LT) || defined(RN) 3199 sub K, KK, TEMP1 3200 sll TEMP1, BASE_SHIFT + 1, TEMP2 3201 sll TEMP1, BASE_SHIFT + 1, TEMP1 3202 add AO, TEMP2, AO 3203 add BO, TEMP1, BO 3204#endif 3205 3206#ifdef LT 3207 add KK, 2, KK 3208#endif 3209 3210#ifdef LN 3211 sub KK, 2, KK 3212#endif 3213 /* next row pair */ 3214 add I, -1, I 3215 cmp I, 0 3216 bg,pt %icc, .LL52 3217 nop 3218 .align 4 3219 /* .LL60: leftover row when M is odd, handled as a 1x2 tile with accumulators c01 and c03, otherwise continue at .LL69 */ 3220.LL60: 3221 and M, 1, I 3222 cmp I, 0 3223 ble,pn %icc, .LL69 3224 nop 3225 3226#if defined(LT) || defined(RN) 3227 mov B, BO 3228#else 3229#ifdef LN 3230 sll K, BASE_SHIFT + 0, TEMP1 3231 sub AORIG, TEMP1, AORIG 3232#endif 3233 3234 sll KK, BASE_SHIFT + 0, TEMP1 3235 sll KK, BASE_SHIFT + 1, TEMP2 3236 3237 add AORIG, TEMP1, AO 3238 add B, TEMP2, BO 3239#endif 3240 3241 LDF [AO + 0 * SIZE], a1 3242 LDF [AO + 1 * SIZE], a2 3243 LDF [AO + 2 * SIZE], a3 3244 LDF [AO + 3 * SIZE], a4 3245 3246 LDF [BO + 0 * SIZE], b1 3247 LDF [BO + 1 * SIZE], b2 3248 LDF [BO + 2 * SIZE], b3 3249 LDF [BO + 3 * SIZE], b4 3250 LDF [BO + 4 * SIZE], b5 3251 LDF [BO + 5 * SIZE], b6 3252 LDF [BO + 6 * SIZE], b7 3253 FCLR (cc01) 3254 LDF [BO + 7 * SIZE], b8 3255 FCLR (cc03) 3256 3257#if defined(LT) || defined(RN) 3258 sra KK, 2, L 3259#else 3260 sub K, KK, L 3261 sra
L, 2, L 3262#endif 3263 cmp L, 0 3264 ble,pn %icc, .LL65 3265 nop 3266 .align 4 3267 /* .LL63: unrolled loop for the 1x2 tile, 4 k steps per pass. AO and BO are advanced in the middle of the pass, so the later loads use offsets relative to the new pointers */ 3268.LL63: 3269 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3270 add L, -1, L 3271 3272 FMADD (aa1, bb1, cc01, cc01) 3273 LDF [BO + 8 * SIZE], b1 3274 FMADD (aa1, bb2, cc03, cc03) 3275 LDF [BO + 9 * SIZE], b2 3276 3277 LDF [AO + 4 * SIZE], a1 3278 cmp L, 0 3279 3280 FMADD (aa2, bb3, cc01, cc01) 3281 LDF [BO + 10 * SIZE], b3 3282 FMADD (aa2, bb4, cc03, cc03) 3283 LDF [BO + 11 * SIZE], b4 3284 3285 LDF [AO + 5 * SIZE], a2 3286 add AO, 4 * SIZE, AO 3287 3288 FMADD (aa3, bb5, cc01, cc01) 3289 LDF [BO + 12 * SIZE], b5 3290 FMADD (aa3, bb6, cc03, cc03) 3291 LDF [BO + 13 * SIZE], b6 3292 3293 LDF [AO + 2 * SIZE], a3 3294 add BO, 8 * SIZE, BO 3295 3296 FMADD (aa4, bb7, cc01, cc01) 3297 LDF [BO + 6 * SIZE], b7 3298 FMADD (aa4, bb8, cc03, cc03) 3299 LDF [BO + 7 * SIZE], b8 3300 3301 bg,pt %icc, .LL63 3302 LDF [AO + 3 * SIZE], a4 3303 .align 4 3304 /* .LL65: remaining 0-3 k steps */ 3305.LL65: 3306#if defined(LT) || defined(RN) 3307 and KK, 3, L 3308#else 3309 sub K, KK, L 3310 and L, 3, L 3311#endif 3312 cmp L, 0 3313 ble,a,pn %icc, .LL68 3314 nop 3315 .align 4 3316 /* .LL67: one k step per pass, BO is advanced in the branch delay slot */ 3317.LL67: 3318 FMADD (aa1, bb1, cc01, cc01) 3319 LDF [BO + 2 * SIZE], b1 3320 FMADD (aa1, bb2, cc03, cc03) 3321 LDF [BO + 3 * SIZE], b2 3322 3323 LDF [AO + 1 * SIZE], a1 3324 add L, -1, L 3325 add AO, 1 * SIZE, AO 3326 cmp L, 0 3327 3328 bg,pt %icc, .LL67 3329 add BO, 2 * SIZE, BO 3330 .align 4 3331 /* .LL68: solve the 1x2 tile. LN repoints at step KK - 1, RT at step KK - 2 */ 3332.LL68: 3333#if defined(LN) || defined(RT) 3334#ifdef LN 3335 sub KK, 1, TEMP1 3336#else 3337 sub KK, 2, TEMP1 3338#endif 3339 sll TEMP1, BASE_SHIFT + 0, TEMP2 3340 sll TEMP1, BASE_SHIFT + 1, TEMP1 3341 3342 add AORIG, TEMP2, AO 3343 add B, TEMP1, BO 3344#endif 3345 /* replace c01 and c03 by packed value minus accumulator */ 3346#if defined(LN) || defined(LT) 3347 LDF [BO + 0 * SIZE], a1 3348 LDF [BO + 1 * SIZE], a2 3349 3350 FSUB a1, c01, c01 3351 FSUB a2, c03, c03 3352#else 3353 LDF [AO + 0 * SIZE], a1 3354 LDF [AO + 1 * SIZE], a2 3355 3356 FSUB a1, c01, c01 3357 FSUB a2, c03, c03 3358#endif 3359
/* Rest of the 1x2 tile solve and its write-back, end of the 2-column panel (.LL69), then the last single column (.LL70):
   2x1 tiles start at .LL72, accumulate in .LL73 (four k steps per pass) and .LL77 (remainder) and are solved at .LL78.
   FMADD (a, b, c, d) is presumably d = a * b + c and FNMSUB (a, b, c, d) presumably d = c - a * b, both macros are defined outside this part of the file. */
3360#if defined(LN) || defined(LT) /* LN and LT: the factor is the single entry at AO, applied to both columns */ 3361 LDF [AO + 0 * SIZE], a1 3362 3363 FMUL a1, c01, c01 3364 FMUL a1, c03, c03 3365#endif 3366 /* RN: factor entries 0, 1, 3 at BO, c01 solved first */ 3367#ifdef RN 3368 LDF [BO + 0 * SIZE], a1 3369 LDF [BO + 1 * SIZE], a2 3370 3371 FMUL a1, c01, c01 3372 3373 FNMSUB (aa2, cc01, cc03, cc03) 3374 3375 LDF [BO + 3 * SIZE], a1 3376 3377 FMUL a1, c03, c03 3378#endif 3379 /* RT: factor entries 3, 2, 0 at BO, c03 solved first */ 3380#ifdef RT 3381 LDF [BO + 3 * SIZE], a1 3382 LDF [BO + 2 * SIZE], a2 3383 3384 FMUL a1, c03, c03 3385 3386 FNMSUB (aa2, cc03, cc01, cc01) 3387 3388 LDF [BO + 0 * SIZE], a1 3389 3390 FMUL a1, c01, c01 3391#endif 3392 /* LN: step C1 and C2 back one element before storing */ 3393#ifdef LN 3394 add C1, -1 * SIZE, C1 3395 add C2, -1 * SIZE, C2 3396#endif 3397 /* store into the packed buffer, BO for LN and LT, AO otherwise, and then into C */ 3398#if defined(LN) || defined(LT) 3399 STF c01, [BO + 0 * SIZE] 3400 STF c03, [BO + 1 * SIZE] 3401#else 3402 STF c01, [AO + 0 * SIZE] 3403 STF c03, [AO + 1 * SIZE] 3404#endif 3405 3406 STF c01, [C1 + 0 * SIZE] 3407 STF c03, [C2 + 0 * SIZE] 3408 3409#ifdef RT 3410 sll K, BASE_SHIFT + 0, TEMP1 3411 add AORIG, TEMP1, AORIG 3412#endif 3413 /* LT and RN: skip the remaining K - KK steps, 1 element per step in A and 2 per step in B */ 3414#if defined(LT) || defined(RN) 3415 sub K, KK, TEMP1 3416 sll TEMP1, BASE_SHIFT + 0, TEMP2 3417 sll TEMP1, BASE_SHIFT + 1, TEMP1 3418 add AO, TEMP2, AO 3419 add BO, TEMP1, BO 3420#endif 3421 3422#ifdef LT 3423 add KK, 1, KK 3424#endif 3425 3426#ifdef LN 3427 sub KK, 1, KK 3428#endif 3429 .align 4 3430 /* .LL69: end of the 2-column panel, move B and KK past it */ 3431.LL69: 3432#ifdef LN 3433 sll K, BASE_SHIFT + 1, TEMP1 3434 add B, TEMP1, B 3435#endif 3436 3437#if defined(LT) || defined(RN) 3438 mov BO, B 3439#endif 3440 3441#ifdef RN 3442 add KK, 2, KK 3443#endif 3444 3445#ifdef RT 3446 sub KK, 2, KK 3447#endif 3448 .align 4 3449 /* .LL70: last single column, taken when N is odd, otherwise leave through .LL999 */ 3450.LL70: 3451 and N, 1, J 3452 cmp J, 0 3453 ble,pn %icc, .LL999 3454 nop 3455 /* RT: B steps back over one packed column of K elements */ 3456#ifdef RT 3457 sll K, BASE_SHIFT, TEMP1 3458 sub B, TEMP1, B 3459#endif 3460 /* C1 addresses the column, C moves on by one column, backwards for RT */ 3461#ifndef RT 3462 mov C, C1 3463 add C1, LDC, C 3464#else 3465 sub C, LDC, C1 3466 sub C, LDC, C 3467#endif 3468 3469#ifdef LN 3470 add M, OFFSET, KK 3471#endif 3472 3473#ifdef LT 3474 mov OFFSET, KK 3475#endif 3476 3477#if defined(LN) || defined(RT) 3478 mov A,
AORIG 3479#else 3480 mov A, AO 3481#endif 3482 /* I = M / 2 row pairs, none means .LL80 */ 3483 sra M, 1, I 3484 cmp I, 0 3485 ble,pn %icc, .LL80 3486 nop 3487 .align 4 3488 /* .LL72: start one 2x1 tile: position AO and BO, preload a1-a4 and b1-b4, clear c01 and c02 */ 3489.LL72: 3490#if defined(LT) || defined(RN) 3491 mov B, BO 3492#else 3493#ifdef LN 3494 sll K, BASE_SHIFT + 1, TEMP1 3495 sub AORIG, TEMP1, AORIG 3496#endif 3497 3498 sll KK, BASE_SHIFT + 1, TEMP1 3499 sll KK, BASE_SHIFT + 0, TEMP2 3500 3501 add AORIG, TEMP1, AO 3502 add B, TEMP2, BO 3503#endif 3504 3505 LDF [AO + 0 * SIZE], a1 3506 LDF [AO + 1 * SIZE], a2 3507 LDF [AO + 2 * SIZE], a3 3508 LDF [AO + 3 * SIZE], a4 3509 3510 LDF [BO + 0 * SIZE], b1 3511 LDF [BO + 1 * SIZE], b2 3512 LDF [BO + 2 * SIZE], b3 3513 FCLR (cc01) 3514 LDF [BO + 3 * SIZE], b4 3515 FCLR (cc02) 3516 3517 prefetch [C1 + 2 * SIZE], 3 3518 /* L = number of unrolled passes: KK / 4 for LT and RN, (K - KK) / 4 otherwise */ 3519#if defined(LT) || defined(RN) 3520 sra KK, 2, L 3521#else 3522 sub K, KK, L 3523 sra L, 2, L 3524#endif 3525 cmp L, 0 3526 ble,pn %icc, .LL75 3527 nop 3528 /* .LL73: unrolled loop, 4 k steps per pass. BO and AO are advanced in the middle of the pass, so the later loads use offsets relative to the new pointers */ 3529.LL73: 3530 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3531 add L, -1, L 3532 3533 FMADD (aa1, bb1, cc01, cc01) 3534 LDF [AO + 4 * SIZE], a1 3535 FMADD (aa2, bb1, cc02, cc02) 3536 LDF [AO + 5 * SIZE], a2 3537 3538 LDF [BO + 4 * SIZE], b1 3539 cmp L, 0 3540 3541 FMADD (aa3, bb2, cc01, cc01) 3542 LDF [AO + 6 * SIZE], a3 3543 FMADD (aa4, bb2, cc02, cc02) 3544 LDF [AO + 7 * SIZE], a4 3545 3546 LDF [BO + 5 * SIZE], b2 3547 add BO, 4 * SIZE, BO 3548 3549 FMADD (aa1, bb3, cc01, cc01) 3550 LDF [AO + 8 * SIZE], a1 3551 FMADD (aa2, bb3, cc02, cc02) 3552 LDF [AO + 9 * SIZE], a2 3553 3554 LDF [BO + 2 * SIZE], b3 3555 add AO, 8 * SIZE, AO 3556 3557 FMADD (aa3, bb4, cc01, cc01) 3558 LDF [AO + 2 * SIZE], a3 3559 FMADD (aa4, bb4, cc02, cc02) 3560 LDF [AO + 3 * SIZE], a4 3561 3562 bg,pt %icc, .LL73 3563 LDF [BO + 3 * SIZE], b4 3564 .align 4 3565 /* .LL75: remaining 0-3 k steps */ 3566.LL75: 3567#if defined(LT) || defined(RN) 3568 and KK, 3, L 3569#else 3570 sub K, KK, L 3571 and L, 3, L 3572#endif 3573 cmp L, 0 3574 ble,a,pn %icc, .LL78 3575 nop 3576 .align 4 3577 /* .LL77: one k step per pass, BO is advanced in the branch delay slot */ 3578.LL77: 3579 FMADD (aa1, bb1, cc01,
cc01) 3580 LDF [AO + 2 * SIZE], a1 3581 FMADD (aa2, bb1, cc02, cc02) 3582 LDF [AO + 3 * SIZE], a2 3583 3584 LDF [BO + 1 * SIZE], b1 3585 add L, -1, L 3586 add AO, 2 * SIZE, AO 3587 cmp L, 0 3588 bg,pt %icc, .LL77 3589 add BO, 1 * SIZE, BO 3590 .align 4 3591 /* .LL78: solve the 2x1 tile. LN repoints at step KK - 2, RT at step KK - 1 */ 3592.LL78: 3593#if defined(LN) || defined(RT) 3594#ifdef LN 3595 sub KK, 2, TEMP1 3596#else 3597 sub KK, 1, TEMP1 3598#endif 3599 sll TEMP1, BASE_SHIFT + 1, TEMP2 3600 sll TEMP1, BASE_SHIFT + 0, TEMP1 3601 3602 add AORIG, TEMP2, AO 3603 add B, TEMP1, BO 3604#endif 3605 /* replace c01 and c02 by packed value minus accumulator */ 3606#if defined(LN) || defined(LT) 3607 LDF [BO + 0 * SIZE], a1 3608 LDF [BO + 1 * SIZE], a2 3609 3610 FSUB a1, c01, c01 3611 FSUB a2, c02, c02 3612#else 3613 LDF [AO + 0 * SIZE], a1 3614 LDF [AO + 1 * SIZE], a2 3615 3616 FSUB a1, c01, c01 3617 FSUB a2, c02, c02 3618#endif 3619 /* LN: factor entries 3, 2, 0 at AO, c02 solved first */ 3620#ifdef LN 3621 LDF [AO + 3 * SIZE], a1 3622 LDF [AO + 2 * SIZE], a2 3623 LDF [AO + 0 * SIZE], a3 3624 3625 FMUL a1, c02, c02 3626 3627 FNMSUB (aa2, cc02, cc01, cc01) 3628 3629 FMUL a3, c01, c01 3630#endif 3631 /* LT: factor entries 0, 1, 3 at AO, c01 solved first */ 3632#ifdef LT 3633 LDF [AO + 0 * SIZE], a1 3634 LDF [AO + 1 * SIZE], a2 3635 LDF [AO + 3 * SIZE], a3 3636 3637 FMUL a1, c01, c01 3638 3639 FNMSUB (aa2, cc01, cc02, cc02) 3640 3641 FMUL a3, c02, c02 3642#endif 3643 /* RN and RT: the factor is the single entry at BO, applied to both rows */ 3644#if defined(RN) || defined(RT) 3645 LDF [BO + 0 * SIZE], a1 3646 3647 FMUL a1, c01, c01 3648 FMUL a1, c02, c02 3649#endif 3650 /* LN: step C1 back two elements, then store into the packed buffer and into C */ 3651#ifdef LN 3652 add C1, -2 * SIZE, C1 3653#endif 3654 3655#if defined(LN) || defined(LT) 3656 STF c01, [BO + 0 * SIZE] 3657 STF c02, [BO + 1 * SIZE] 3658#else 3659 STF c01, [AO + 0 * SIZE] 3660 STF c02, [AO + 1 * SIZE] 3661#endif 3662 3663 STF c01, [C1 + 0 * SIZE] 3664 STF c02, [C1 + 1 * SIZE] 3665 3666#ifndef LN 3667 add C1, 2 * SIZE, C1 3668#endif 3669 3670#ifdef RT 3671 sll K, BASE_SHIFT + 1, TEMP1 3672 add AORIG, TEMP1, AORIG 3673#endif 3674 /* LT and RN: skip the remaining K - KK steps, 2 elements per step in A and 1 per step in B */ 3675#if defined(LT) || defined(RN) 3676 sub K, KK, TEMP1 3677 sll TEMP1, BASE_SHIFT + 1, TEMP2 3678 sll TEMP1, BASE_SHIFT + 0, TEMP1 3679 add AO, TEMP2, AO 3680 add BO, TEMP1, BO
/* End of the 2x1 tile loop of the single column, then the 1x1 tile (.LL80) with its accumulation loops
   .LL83 (four k steps per pass) and .LL87 (remainder). FMADD (a, b, c, d) is presumably d = a * b + c, the macro is defined outside this part of the file. */
3681#endif 3682 3683#ifdef LT 3684 add KK, 2, KK 3685#endif 3686 3687#ifdef LN 3688 sub KK, 2, KK 3689#endif 3690 /* next row pair */ 3691 add I, -1, I 3692 cmp I, 0 3693 bg,pt %icc, .LL72 3694 nop 3695 .align 4 3696 /* .LL80: 1x1 tile when M is odd, otherwise continue at .LL89 */ 3697.LL80: 3698 and M, 1, I 3699 cmp I, 0 3700 ble,pn %icc, .LL89 3701 nop 3702 3703#if defined(LT) || defined(RN) 3704 mov B, BO 3705#else 3706#ifdef LN 3707 sll K, BASE_SHIFT + 0, TEMP1 3708 sub AORIG, TEMP1, AORIG 3709#endif 3710 3711 sll KK, BASE_SHIFT + 0, TEMP1 3712 sll KK, BASE_SHIFT + 0, TEMP2 3713 3714 add AORIG, TEMP1, AO 3715 add B, TEMP2, BO 3716#endif 3717 /* preload four elements of A and of B */ 3718 LDF [AO + 0 * SIZE], a1 3719 LDF [BO + 0 * SIZE], b1 3720 LDF [AO + 1 * SIZE], a2 3721 LDF [BO + 1 * SIZE], b2 3722 LDF [AO + 2 * SIZE], a3 3723 LDF [BO + 2 * SIZE], b3 3724 LDF [AO + 3 * SIZE], a4 3725 LDF [BO + 3 * SIZE], b4 3726 3727#if defined(LT) || defined(RN) 3728 sra KK, 2, L 3729#else 3730 sub K, KK, L 3731 sra L, 2, L 3732#endif /* the accumulator is cleared in the delay slot of the branch below, which has no annul bit, so c01 is zeroed on both paths */ 3733 cmp L, 0 3734 ble,pn %icc, .LL85 3735 FCLR (cc01) 3736 .align 4 3737 /* .LL83: unrolled dot product, 4 k steps per pass, BO is advanced in the branch delay slot */ 3738.LL83: 3739 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3740 add L, -1, L 3741 3742 FMADD (aa1, bb1, cc01, cc01) 3743 LDF [AO + 4 * SIZE], a1 3744 LDF [BO + 4 * SIZE], b1 3745 3746 FMADD (aa2, bb2, cc01, cc01) 3747 LDF [AO + 5 * SIZE], a2 3748 LDF [BO + 5 * SIZE], b2 3749 3750 FMADD (aa3, bb3, cc01, cc01) 3751 LDF [AO + 6 * SIZE], a3 3752 LDF [BO + 6 * SIZE], b3 3753 3754 FMADD (aa4, bb4, cc01, cc01) 3755 LDF [AO + 7 * SIZE], a4 3756 LDF [BO + 7 * SIZE], b4 3757 3758 add AO, 4 * SIZE, AO 3759 cmp L, 0 3760 3761 bg,pt %icc, .LL83 3762 add BO, 4 * SIZE, BO 3763 .align 4 3764 /* .LL85: remaining 0-3 k steps */ 3765.LL85: 3766#if defined(LT) || defined(RN) 3767 and KK, 3, L 3768#else 3769 sub K, KK, L 3770 and L, 3, L 3771#endif 3772 cmp L, 0 3773 ble,a,pn %icc, .LL88 3774 nop 3775 .align 4 3776 /* .LL87: one k step per pass */ 3777.LL87: 3778 FMADD (aa1, bb1, cc01, cc01) 3779 LDF [AO + 1 * SIZE], a1 3780 LDF [BO + 1 * SIZE], b1 3781 3782 add AO, 1 * SIZE, AO 3783 add L, -1, L 3784 cmp L, 0 3785 bg,pt %icc, .LL87 3786 add BO, 1 * SIZE, BO 3787
/* Solve and write-back of the 1x1 tile (.LL88), end of the single-column panel (.LL89) and the common exit (.LL999). */
.align 4 3788 /* .LL88: solve the 1x1 tile. LN and RT repoint AO and BO at step KK - 1, both arms of the inner conditional are identical */ 3789.LL88: 3790#if defined(LN) || defined(RT) 3791#ifdef LN 3792 sub KK, 1, TEMP1 3793#else 3794 sub KK, 1, TEMP1 3795#endif 3796 sll TEMP1, BASE_SHIFT + 0, TEMP2 3797 sll TEMP1, BASE_SHIFT + 0, TEMP1 3798 3799 add AORIG, TEMP2, AO 3800 add B, TEMP1, BO 3801#endif 3802 /* replace c01 by packed value minus accumulator, the packed value comes from BO for LN and LT and from AO otherwise */ 3803#if defined(LN) || defined(LT) 3804 LDF [BO + 0 * SIZE], a1 3805 3806 FSUB a1, c01, c01 3807#else 3808 LDF [AO + 0 * SIZE], a1 3809 3810 FSUB a1, c01, c01 3811#endif 3812 /* multiply by the single factor entry, taken from AO for LN and LT and from BO for RN and RT. The code multiplies rather than divides, so the entry is presumably stored inverted - confirm against the packing routine */ 3813#if defined(LN) || defined(LT) 3814 LDF [AO + 0 * SIZE], a1 3815 3816 FMUL a1, c01, c01 3817#endif 3818 3819#if defined(RN) || defined(RT) 3820 LDF [BO + 0 * SIZE], a1 3821 3822 FMUL a1, c01, c01 3823#endif 3824 /* LN: step C1 back one element, then store into the packed buffer and into C */ 3825#ifdef LN 3826 add C1, -1 * SIZE, C1 3827#endif 3828 3829#if defined(LN) || defined(LT) 3830 STF c01, [BO + 0 * SIZE] 3831#else 3832 STF c01, [AO + 0 * SIZE] 3833#endif 3834 3835 STF c01, [C1 + 0 * SIZE] 3836 3837#ifdef RT 3838 sll K, BASE_SHIFT + 0, TEMP1 3839 add AORIG, TEMP1, AORIG 3840#endif 3841 /* LT and RN: skip the remaining K - KK steps, 1 element per step in both A and B */ 3842#if defined(LT) || defined(RN) 3843 sub K, KK, TEMP1 3844 sll TEMP1, BASE_SHIFT + 0, TEMP2 3845 sll TEMP1, BASE_SHIFT + 0, TEMP1 3846 add AO, TEMP2, AO 3847 add BO, TEMP1, BO 3848#endif 3849 3850#ifdef LT 3851 add KK, 1, KK 3852#endif 3853 3854#ifdef LN 3855 sub KK, 1, KK 3856#endif 3857 .align 4 3858 /* .LL89: end of the single column, move B and KK past it */ 3859.LL89: 3860#ifdef LN 3861 sll K, BASE_SHIFT, TEMP1 3862 add B, TEMP1, B 3863#endif 3864 3865#if defined(LT) || defined(RN) 3866 mov BO, B 3867#endif 3868 3869#ifdef RN 3870 add KK, 1, KK 3871#endif 3872 3873#ifdef RT 3874 sub KK, 1, KK 3875#endif 3876 .align 4 3877 /* .LL999: common exit. With TRMMKERNEL defined, %g1-%g4 (used above as OFFSET, KK, TEMP1, TEMP2) are reloaded from the stack frame, the matching stores are not in this part of the file - presumably in the prologue */ 3878.LL999: 3879#ifdef TRMMKERNEL 3880#ifndef __64BIT__ 3881 ld [%sp + STACK_START + 8], %g1 3882 ld [%sp + STACK_START + 12], %g2 3883 ld [%sp + STACK_START + 16], %g3 3884 ld [%sp + STACK_START + 20], %g4 3885#else 3886 ldx [%sp + STACK_START + 32], %g1 3887 ldx [%sp + STACK_START + 40], %g2 3888 ldx [%sp + STACK_START + 48], %g3 3889 ldx [%sp + STACK_START + 56], %g4 3890#endif 3891#endif 3892 /* clr sits in the delay slot of return and leaves 0 in %o0, presumably the integer return value of the routine */ 3893 return %i7 + 8 3894 clr %o0 3895
3896 EPILOGUE 3897