/*********************************************************************/
/* Copyright 2005-2010 The University of Texas at Austin.            */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define APREFETCHSIZE 24 43#define APREFETCH_CATEGORY 0 44 45#define M %i0 46#define N %i1 47#define K %i2 48 49#if defined(DOUBLE) && !defined(__64BIT__) 50#define A %i5 51#define B %i4 52#else 53#define A %i4 54#define B %i5 55#endif 56 57#define C %o4 58#define LDC %o5 59 60#define AO %l0 61#define BO %l1 62#define I %l2 63#define J %l3 64#define L %l4 65 66#define C1 %o0 67#define C2 %o1 68#define C3 %o2 69#define C4 %o3 70 71#define C5 %l5 72#define C6 %l6 73#define C7 %l7 74#define C8 %i3 75 76#define OFFSET %g1 77#define KK %g2 78#define TEMP1 %g3 79#define TEMP2 %g4 80#define AORIG %o7 81 82#ifdef DOUBLE 83#define c01 %f0 84#define c02 %f2 85#define c03 %f4 86#define c04 %f6 87#define c05 %f8 88#define c06 %f10 89#define c07 %f12 90#define c08 %f14 91#define c09 %f16 92#define c10 %f18 93#define c11 %f20 94#define c12 %f22 95#define c13 %f24 96#define c14 %f26 97#define c15 %f28 98#define c16 %f30 99 100#define a1 %f32 101#define a2 %f34 102#define a3 %f36 103#define a4 %f38 104#define a5 %f40 105 106#define b1 %f42 107#define b2 %f44 108#define b3 %f46 109#define b4 %f48 110#define b5 %f50 111#define b6 %f52 112#define b7 %f54 113#define b8 %f56 114#define b9 %f58 115 116#define cc01 0 117#define cc02 2 118#define cc03 4 119#define cc04 6 120#define cc05 8 121#define cc06 10 122#define cc07 12 123#define cc08 14 124#define cc09 16 125#define cc10 18 126#define cc11 20 127#define cc12 22 128#define cc13 24 129#define cc14 26 130#define cc15 28 131#define cc16 30 132 133#define aa1 1 134#define aa2 3 135#define aa3 5 136#define aa4 7 137#define aa5 9 138 139#define bb1 11 140#define bb2 13 141#define bb3 15 142#define bb4 17 143#define bb5 19 144#define bb6 21 145#define bb7 23 146#define bb8 25 147#define bb9 27 148 149#else 150#define c01 %f0 151#define c02 %f1 152#define c03 %f2 153#define c04 %f3 154#define c05 %f4 155#define c06 
%f5 156#define c07 %f6 157#define c08 %f7 158#define c09 %f8 159#define c10 %f9 160#define c11 %f10 161#define c12 %f11 162#define c13 %f12 163#define c14 %f13 164#define c15 %f14 165#define c16 %f15 166 167#define a1 %f16 168#define a2 %f17 169#define a3 %f18 170#define a4 %f19 171#define a5 %f20 172 173#define b1 %f21 174#define b2 %f22 175#define b3 %f23 176#define b4 %f24 177#define b5 %f25 178#define b6 %f26 179#define b7 %f27 180#define b8 %f28 181#define b9 %f29 182 183#define cc01 0 184#define cc02 1 185#define cc03 2 186#define cc04 3 187#define cc05 4 188#define cc06 5 189#define cc07 6 190#define cc08 7 191#define cc09 8 192#define cc10 9 193#define cc11 10 194#define cc12 11 195#define cc13 12 196#define cc14 13 197#define cc15 14 198#define cc16 15 199 200#define aa1 16 201#define aa2 17 202#define aa3 18 203#define aa4 19 204#define aa5 20 205 206#define bb1 21 207#define bb2 22 208#define bb3 23 209#define bb4 24 210#define bb5 25 211#define bb6 26 212#define bb7 27 213#define bb8 28 214#define bb9 29 215 216#endif 217 218 .register %g2, #scratch 219 .register %g3, #scratch 220 221 PROLOGUE 222 SAVESP 223 nop 224 225#ifndef __64BIT__ 226 227#ifdef DOUBLE 228 ld [%sp + STACK_START + 28], B 229 ld [%sp + STACK_START + 32], C 230 ld [%sp + STACK_START + 36], LDC 231 ld [%sp + STACK_START + 40], OFFSET 232#else 233 ld [%sp + STACK_START + 28], C 234 ld [%sp + STACK_START + 32], LDC 235 ld [%sp + STACK_START + 36], OFFSET 236#endif 237 238 st %g1, [%sp + STACK_START + 8] 239 st %g2, [%sp + STACK_START + 12] 240 st %g3, [%sp + STACK_START + 16] 241 st %g4, [%sp + STACK_START + 20] 242#else 243 244 ldx [%sp+ STACK_START + 56], C 245 ldx [%sp+ STACK_START + 64], LDC 246 ldx [%sp+ STACK_START + 72], OFFSET 247 248 stx %g1, [%sp + STACK_START + 32] 249 stx %g2, [%sp + STACK_START + 40] 250 stx %g3, [%sp + STACK_START + 48] 251 stx %g4, [%sp + STACK_START + 56] 252#endif 253 254#if defined(TRMMKERNEL) && !defined(LEFT) 255 neg OFFSET, KK 256#endif 257 258 sll 
LDC, BASE_SHIFT, LDC 259 260#ifdef LN 261 smul M, K, TEMP1 262 sll TEMP1, BASE_SHIFT, TEMP1 263 add A, TEMP1, A 264 265 sll M, BASE_SHIFT, TEMP1 266 add C, TEMP1, C 267#endif 268 269#ifdef RN 270 neg OFFSET, KK 271#endif 272 273#ifdef RT 274 smul N, K, TEMP1 275 sll TEMP1, BASE_SHIFT, TEMP1 276 add B, TEMP1, B 277 278 smul N, LDC, TEMP1 279 add C, TEMP1, C 280 281 sub N, OFFSET, KK 282#endif 283 284 sra N, 3, J 285 cmp J, 0 286 ble,pn %icc, .LL30 287 nop 288 .align 4 289 290.LL11: 291#ifdef RT 292 sll K, BASE_SHIFT + 3, TEMP1 293 sub B, TEMP1, B 294#endif 295 296#ifndef RT 297 mov C, C1 298 add C, LDC, C2 299 add C2, LDC, C3 300 add C3, LDC, C4 301 add C4, LDC, C5 302 add C5, LDC, C6 303 add C6, LDC, C7 304 add C7, LDC, C8 305 add C8, LDC, C 306#else 307 sub C, LDC, C8 308 sub C8, LDC, C7 309 sub C7, LDC, C6 310 sub C6, LDC, C5 311 sub C5, LDC, C4 312 sub C4, LDC, C3 313 sub C3, LDC, C2 314 sub C2, LDC, C1 315 sub C2, LDC, C 316#endif 317 318#ifdef LN 319 add M, OFFSET, KK 320#endif 321 322#ifdef LT 323 mov OFFSET, KK 324#endif 325 326#if defined(LN) || defined(RT) 327 mov A, AORIG 328#else 329 mov A, AO 330#endif 331 332 and M, 1, I 333 cmp I, 0 334 ble,pn %icc, .LL20 335 nop 336 337#if defined(LT) || defined(RN) 338 mov B, BO 339#else 340#ifdef LN 341 sll K, BASE_SHIFT + 0, TEMP1 342 sub AORIG, TEMP1, AORIG 343#endif 344 345 sll KK, BASE_SHIFT + 0, TEMP1 346 sll KK, BASE_SHIFT + 3, TEMP2 347 348 add AORIG, TEMP1, AO 349 add B, TEMP2, BO 350#endif 351 352 LDF [AO + 0 * SIZE], a1 353 LDF [AO + 1 * SIZE], a2 354 LDF [AO + 2 * SIZE], a3 355 LDF [AO + 3 * SIZE], a4 356 357 LDF [BO + 0 * SIZE], b1 358 FCLR (cc01) 359 LDF [BO + 1 * SIZE], b2 360 FCLR (cc03) 361 LDF [BO + 2 * SIZE], b3 362 FCLR (cc05) 363 LDF [BO + 3 * SIZE], b4 364 FCLR (cc07) 365 LDF [BO + 4 * SIZE], b5 366 FCLR (cc09) 367 LDF [BO + 5 * SIZE], b6 368 FCLR (cc11) 369 LDF [BO + 6 * SIZE], b7 370 FCLR (cc13) 371 LDF [BO + 7 * SIZE], b8 372 FCLR (cc15) 373 374#if defined(LT) || defined(RN) 375 sra KK, 2, L 
376#else 377 sub K, KK, L 378 sra L, 2, L 379#endif 380 cmp L, 0 381 ble,pn %icc, .LL25 382 LDF [BO + 8 * SIZE], b9 383 .align 4 384 385.LL23: 386 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 387 add L, -1, L 388 389 FMADD (aa1, bb1, cc01, cc01) 390 LDF [BO + 16 * SIZE], b1 391 FMADD (aa1, bb2, cc03, cc03) 392 LDF [BO + 9 * SIZE], b2 393 394 FMADD (aa1, bb3, cc05, cc05) 395 LDF [BO + 10 * SIZE], b3 396 FMADD (aa1, bb4, cc07, cc07) 397 LDF [BO + 11 * SIZE], b4 398 399 FMADD (aa1, bb5, cc09, cc09) 400 LDF [BO + 12 * SIZE], b5 401 FMADD (aa1, bb6, cc11, cc11) 402 LDF [BO + 13 * SIZE], b6 403 404 FMADD (aa1, bb7, cc13, cc13) 405 LDF [BO + 14 * SIZE], b7 406 FMADD (aa1, bb8, cc15, cc15) 407 LDF [BO + 15 * SIZE], b8 408 409 FMADD (aa2, bb9, cc01, cc01) 410 LDF [BO + 24 * SIZE], b9 411 FMADD (aa2, bb2, cc03, cc03) 412 LDF [BO + 17 * SIZE], b2 413 414 FMADD (aa2, bb3, cc05, cc05) 415 LDF [BO + 18 * SIZE], b3 416 FMADD (aa2, bb4, cc07, cc07) 417 LDF [BO + 19 * SIZE], b4 418 419 FMADD (aa2, bb5, cc09, cc09) 420 LDF [BO + 20 * SIZE], b5 421 FMADD (aa2, bb6, cc11, cc11) 422 LDF [BO + 21 * SIZE], b6 423 424 FMADD (aa2, bb7, cc13, cc13) 425 LDF [BO + 22 * SIZE], b7 426 FMADD (aa2, bb8, cc15, cc15) 427 LDF [BO + 23 * SIZE], b8 428 429 LDF [AO + 4 * SIZE], a1 430 LDF [AO + 5 * SIZE], a2 431 432 FMADD (aa3, bb1, cc01, cc01) 433 LDF [BO + 32 * SIZE], b1 434 FMADD (aa3, bb2, cc03, cc03) 435 LDF [BO + 25 * SIZE], b2 436 437 FMADD (aa3, bb3, cc05, cc05) 438 LDF [BO + 26 * SIZE], b3 439 FMADD (aa3, bb4, cc07, cc07) 440 LDF [BO + 27 * SIZE], b4 441 442 FMADD (aa3, bb5, cc09, cc09) 443 LDF [BO + 28 * SIZE], b5 444 FMADD (aa3, bb6, cc11, cc11) 445 LDF [BO + 29 * SIZE], b6 446 447 FMADD (aa3, bb7, cc13, cc13) 448 LDF [BO + 30 * SIZE], b7 449 FMADD (aa3, bb8, cc15, cc15) 450 LDF [BO + 31 * SIZE], b8 451 452 FMADD (aa4, bb9, cc01, cc01) 453 LDF [BO + 40 * SIZE], b9 454 FMADD (aa4, bb2, cc03, cc03) 455 LDF [BO + 33 * SIZE], b2 456 457 FMADD (aa4, bb3, cc05, cc05) 458 LDF [BO 
+ 34 * SIZE], b3 459 FMADD (aa4, bb4, cc07, cc07) 460 LDF [BO + 35 * SIZE], b4 461 462 FMADD (aa4, bb5, cc09, cc09) 463 LDF [BO + 36 * SIZE], b5 464 FMADD (aa4, bb6, cc11, cc11) 465 LDF [BO + 37 * SIZE], b6 466 467 FMADD (aa4, bb7, cc13, cc13) 468 LDF [BO + 38 * SIZE], b7 469 FMADD (aa4, bb8, cc15, cc15) 470 LDF [BO + 39 * SIZE], b8 471 472 LDF [AO + 6 * SIZE], a3 473 LDF [AO + 7 * SIZE], a4 474 475 add AO, 4 * SIZE, AO 476 cmp L, 0 477 bg,pt %icc, .LL23 478 add BO, 32 * SIZE, BO 479 .align 4 480 481.LL25: 482#if defined(LT) || defined(RN) 483 and KK, 3, L 484#else 485 sub K, KK, L 486 and L, 3, L 487#endif 488 cmp L, 0 489 ble,a,pn %icc, .LL28 490 nop 491 .align 4 492 493.LL27: 494 FMADD (aa1, bb1, cc01, cc01) 495 LDF [BO + 8 * SIZE], b1 496 FMADD (aa1, bb2, cc03, cc03) 497 LDF [BO + 9 * SIZE], b2 498 499 FMADD (aa1, bb3, cc05, cc05) 500 LDF [BO + 10 * SIZE], b3 501 FMADD (aa1, bb4, cc07, cc07) 502 LDF [BO + 11 * SIZE], b4 503 504 FMADD (aa1, bb5, cc09, cc09) 505 LDF [BO + 12 * SIZE], b5 506 FMADD (aa1, bb6, cc11, cc11) 507 LDF [BO + 13 * SIZE], b6 508 509 FMADD (aa1, bb7, cc13, cc13) 510 LDF [BO + 14 * SIZE], b7 511 FMADD (aa1, bb8, cc15, cc15) 512 LDF [BO + 15 * SIZE], b8 513 514 LDF [AO + 1 * SIZE], a1 515 add AO, 1 * SIZE, AO 516 517 add L, -1, L 518 cmp L, 0 519 bg,pt %icc, .LL27 520 add BO, 8 * SIZE, BO 521 .align 4 522 523.LL28: 524#if defined(LN) || defined(RT) 525#ifdef LN 526 sub KK, 1, TEMP1 527#else 528 sub KK, 8, TEMP1 529#endif 530 sll TEMP1, BASE_SHIFT + 0, TEMP2 531 sll TEMP1, BASE_SHIFT + 3, TEMP1 532 533 add AORIG, TEMP2, AO 534 add B, TEMP1, BO 535#endif 536 537#if defined(LN) || defined(LT) 538 LDF [BO + 0 * SIZE], a1 539 LDF [BO + 1 * SIZE], a2 540 LDF [BO + 2 * SIZE], a3 541 LDF [BO + 3 * SIZE], a4 542 543 LDF [BO + 4 * SIZE], b1 544 LDF [BO + 5 * SIZE], b2 545 LDF [BO + 6 * SIZE], b3 546 LDF [BO + 7 * SIZE], b4 547 548 FSUB a1, c01, c01 549 FSUB a2, c03, c03 550 FSUB a3, c05, c05 551 FSUB a4, c07, c07 552 553 FSUB b1, c09, c09 554 FSUB b2, 
c11, c11 555 FSUB b3, c13, c13 556 FSUB b4, c15, c15 557#else 558 LDF [AO + 0 * SIZE], a1 559 LDF [AO + 1 * SIZE], a2 560 LDF [AO + 2 * SIZE], a3 561 LDF [AO + 3 * SIZE], a4 562 563 LDF [AO + 4 * SIZE], b1 564 LDF [AO + 5 * SIZE], b2 565 LDF [AO + 6 * SIZE], b3 566 LDF [AO + 7 * SIZE], b4 567 568 FSUB a1, c01, c01 569 FSUB a2, c03, c03 570 FSUB a3, c05, c05 571 FSUB a4, c07, c07 572 573 FSUB b1, c09, c09 574 FSUB b2, c11, c11 575 FSUB b3, c13, c13 576 FSUB b4, c15, c15 577#endif 578 579#if defined(LN) || defined(LT) 580 LDF [AO + 0 * SIZE], a1 581 582 FMUL a1, c01, c01 583 FMUL a1, c03, c03 584 FMUL a1, c05, c05 585 FMUL a1, c07, c07 586 FMUL a1, c09, c09 587 FMUL a1, c11, c11 588 FMUL a1, c13, c13 589 FMUL a1, c15, c15 590#endif 591 592#ifdef RN 593 LDF [BO + 0 * SIZE], a1 594 LDF [BO + 1 * SIZE], a2 595 LDF [BO + 2 * SIZE], a3 596 LDF [BO + 3 * SIZE], a4 597 LDF [BO + 4 * SIZE], b1 598 LDF [BO + 5 * SIZE], b2 599 LDF [BO + 6 * SIZE], b3 600 LDF [BO + 7 * SIZE], b4 601 602 FMUL a1, c01, c01 603 604 FNMSUB (aa2, cc01, cc03, cc03) 605 FNMSUB (aa3, cc01, cc05, cc05) 606 FNMSUB (aa4, cc01, cc07, cc07) 607 FNMSUB (bb1, cc01, cc09, cc09) 608 FNMSUB (bb2, cc01, cc11, cc11) 609 FNMSUB (bb3, cc01, cc13, cc13) 610 FNMSUB (bb4, cc01, cc15, cc15) 611 612 LDF [BO + 9 * SIZE], a1 613 LDF [BO + 10 * SIZE], a2 614 LDF [BO + 11 * SIZE], a3 615 LDF [BO + 12 * SIZE], a4 616 LDF [BO + 13 * SIZE], b1 617 LDF [BO + 14 * SIZE], b2 618 LDF [BO + 15 * SIZE], b3 619 620 FMUL a1, c03, c03 621 622 FNMSUB (aa2, cc03, cc05, cc05) 623 FNMSUB (aa3, cc03, cc07, cc07) 624 FNMSUB (aa4, cc03, cc09, cc09) 625 FNMSUB (bb1, cc03, cc11, cc11) 626 FNMSUB (bb2, cc03, cc13, cc13) 627 FNMSUB (bb3, cc03, cc15, cc15) 628 629 LDF [BO + 18 * SIZE], a1 630 LDF [BO + 19 * SIZE], a2 631 LDF [BO + 20 * SIZE], a3 632 LDF [BO + 21 * SIZE], a4 633 LDF [BO + 22 * SIZE], b1 634 LDF [BO + 23 * SIZE], b2 635 636 FMUL a1, c05, c05 637 638 FNMSUB (aa2, cc05, cc07, cc07) 639 FNMSUB (aa3, cc05, cc09, cc09) 640 FNMSUB (aa4, 
cc05, cc11, cc11) 641 FNMSUB (bb1, cc05, cc13, cc13) 642 FNMSUB (bb2, cc05, cc15, cc15) 643 644 LDF [BO + 27 * SIZE], a1 645 LDF [BO + 28 * SIZE], a2 646 LDF [BO + 29 * SIZE], a3 647 LDF [BO + 30 * SIZE], a4 648 LDF [BO + 31 * SIZE], b1 649 650 FMUL a1, c07, c07 651 652 FNMSUB (aa2, cc07, cc09, cc09) 653 FNMSUB (aa3, cc07, cc11, cc11) 654 FNMSUB (aa4, cc07, cc13, cc13) 655 FNMSUB (bb1, cc07, cc15, cc15) 656 657 LDF [BO + 36 * SIZE], a1 658 LDF [BO + 37 * SIZE], a2 659 LDF [BO + 38 * SIZE], a3 660 LDF [BO + 39 * SIZE], a4 661 662 FMUL a1, c09, c09 663 664 FNMSUB (aa2, cc09, cc11, cc11) 665 FNMSUB (aa3, cc09, cc13, cc13) 666 FNMSUB (aa4, cc09, cc15, cc15) 667 668 LDF [BO + 45 * SIZE], a1 669 LDF [BO + 46 * SIZE], a2 670 LDF [BO + 47 * SIZE], a3 671 672 FMUL a1, c11, c11 673 674 FNMSUB (aa2, cc11, cc13, cc13) 675 FNMSUB (aa3, cc11, cc15, cc15) 676 677 LDF [BO + 54 * SIZE], a1 678 LDF [BO + 55 * SIZE], a2 679 680 FMUL a1, c13, c13 681 682 FNMSUB (aa2, cc13, cc15, cc15) 683 684 LDF [BO + 63 * SIZE], a1 685 686 FMUL a1, c15, c15 687#endif 688 689#ifdef RT 690 LDF [BO + 63 * SIZE], a1 691 LDF [BO + 62 * SIZE], a2 692 LDF [BO + 61 * SIZE], a3 693 LDF [BO + 60 * SIZE], a4 694 LDF [BO + 59 * SIZE], b1 695 LDF [BO + 58 * SIZE], b2 696 LDF [BO + 57 * SIZE], b3 697 LDF [BO + 56 * SIZE], b4 698 699 FMUL a1, c15, c15 700 701 FNMSUB (aa2, cc15, cc13, cc13) 702 FNMSUB (aa3, cc15, cc11, cc11) 703 FNMSUB (aa4, cc15, cc09, cc09) 704 FNMSUB (bb1, cc15, cc07, cc07) 705 FNMSUB (bb2, cc15, cc05, cc05) 706 FNMSUB (bb3, cc15, cc03, cc03) 707 FNMSUB (bb4, cc15, cc01, cc01) 708 709 LDF [BO + 54 * SIZE], a1 710 LDF [BO + 53 * SIZE], a2 711 LDF [BO + 52 * SIZE], a3 712 LDF [BO + 51 * SIZE], a4 713 LDF [BO + 50 * SIZE], b1 714 LDF [BO + 49 * SIZE], b2 715 LDF [BO + 48 * SIZE], b3 716 717 FMUL a1, c13, c13 718 719 FNMSUB (aa2, cc13, cc11, cc11) 720 FNMSUB (aa3, cc13, cc09, cc09) 721 FNMSUB (aa4, cc13, cc07, cc07) 722 FNMSUB (bb1, cc13, cc05, cc05) 723 FNMSUB (bb2, cc13, cc03, cc03) 724 FNMSUB 
(bb3, cc13, cc01, cc01) 725 726 LDF [BO + 45 * SIZE], a1 727 LDF [BO + 44 * SIZE], a2 728 LDF [BO + 43 * SIZE], a3 729 LDF [BO + 42 * SIZE], a4 730 LDF [BO + 41 * SIZE], b1 731 LDF [BO + 40 * SIZE], b2 732 733 FMUL a1, c11, c11 734 735 FNMSUB (aa2, cc11, cc09, cc09) 736 FNMSUB (aa3, cc11, cc07, cc07) 737 FNMSUB (aa4, cc11, cc05, cc05) 738 FNMSUB (bb1, cc11, cc03, cc03) 739 FNMSUB (bb2, cc11, cc01, cc01) 740 741 LDF [BO + 36 * SIZE], a1 742 LDF [BO + 35 * SIZE], a2 743 LDF [BO + 34 * SIZE], a3 744 LDF [BO + 33 * SIZE], a4 745 LDF [BO + 32 * SIZE], b1 746 747 FMUL a1, c09, c09 748 749 FNMSUB (aa2, cc09, cc07, cc07) 750 FNMSUB (aa3, cc09, cc05, cc05) 751 FNMSUB (aa4, cc09, cc03, cc03) 752 FNMSUB (bb1, cc09, cc01, cc01) 753 754 LDF [BO + 27 * SIZE], a1 755 LDF [BO + 26 * SIZE], a2 756 LDF [BO + 25 * SIZE], a3 757 LDF [BO + 24 * SIZE], a4 758 759 FMUL a1, c07, c07 760 761 FNMSUB (aa2, cc07, cc05, cc05) 762 FNMSUB (aa3, cc07, cc03, cc03) 763 FNMSUB (aa4, cc07, cc01, cc01) 764 765 LDF [BO + 18 * SIZE], a1 766 LDF [BO + 17 * SIZE], a2 767 LDF [BO + 16 * SIZE], a3 768 769 FMUL a1, c05, c05 770 771 FNMSUB (aa2, cc05, cc03, cc03) 772 FNMSUB (aa3, cc05, cc01, cc01) 773 774 LDF [BO + 9 * SIZE], a1 775 LDF [BO + 8 * SIZE], a2 776 777 FMUL a1, c03, c03 778 779 FNMSUB (aa2, cc03, cc01, cc01) 780 781 LDF [BO + 0 * SIZE], a1 782 783 FMUL a1, c01, c01 784#endif 785 786#ifdef LN 787 add C1, -1 * SIZE, C1 788 add C2, -1 * SIZE, C2 789 add C3, -1 * SIZE, C3 790 add C4, -1 * SIZE, C4 791 add C5, -1 * SIZE, C5 792 add C6, -1 * SIZE, C6 793 add C7, -1 * SIZE, C7 794 add C8, -1 * SIZE, C8 795#endif 796 797#if defined(LN) || defined(LT) 798 STF c01, [BO + 0 * SIZE] 799 STF c03, [BO + 1 * SIZE] 800 STF c05, [BO + 2 * SIZE] 801 STF c07, [BO + 3 * SIZE] 802 803 STF c09, [BO + 4 * SIZE] 804 STF c11, [BO + 5 * SIZE] 805 STF c13, [BO + 6 * SIZE] 806 STF c15, [BO + 7 * SIZE] 807#else 808 STF c01, [AO + 0 * SIZE] 809 STF c03, [AO + 1 * SIZE] 810 STF c05, [AO + 2 * SIZE] 811 STF c07, [AO + 3 * SIZE] 
812 813 STF c09, [AO + 4 * SIZE] 814 STF c11, [AO + 5 * SIZE] 815 STF c13, [AO + 6 * SIZE] 816 STF c15, [AO + 7 * SIZE] 817#endif 818 819 STF c01, [C1 + 0 * SIZE] 820 STF c03, [C2 + 0 * SIZE] 821 STF c05, [C3 + 0 * SIZE] 822 STF c07, [C4 + 0 * SIZE] 823 824 STF c09, [C5 + 0 * SIZE] 825 STF c11, [C6 + 0 * SIZE] 826 STF c13, [C7 + 0 * SIZE] 827 STF c15, [C8 + 0 * SIZE] 828 829#ifdef RT 830 sll K, BASE_SHIFT + 0, TEMP1 831 add AORIG, TEMP1, AORIG 832#endif 833 834#if defined(LT) || defined(RN) 835 sub K, KK, TEMP1 836 sll TEMP1, BASE_SHIFT + 0, TEMP2 837 sll TEMP1, BASE_SHIFT + 3, TEMP1 838 add AO, TEMP2, AO 839 add BO, TEMP1, BO 840#endif 841 842#ifdef LT 843 add KK, 1, KK 844#endif 845 846#ifdef LN 847 sub KK, 1, KK 848#endif 849 .align 4 850 851.LL20: 852 sra M, 1, I 853 cmp I, 0 854 ble,pn %icc, .LL29 855 nop 856 .align 4 857 858.LL12: 859#if defined(LT) || defined(RN) 860 mov B, BO 861#else 862#ifdef LN 863 sll K, BASE_SHIFT + 1, TEMP1 864 sub AORIG, TEMP1, AORIG 865#endif 866 867 sll KK, BASE_SHIFT + 1, TEMP1 868 sll KK, BASE_SHIFT + 3, TEMP2 869 870 add AORIG, TEMP1, AO 871 add B, TEMP2, BO 872#endif 873 874 LDF [AO + 0 * SIZE], a1 875 LDF [AO + 1 * SIZE], a2 876 LDF [AO + 8 * SIZE], a5 877 878 LDF [BO + 0 * SIZE], b1 879 880 LDF [BO + 1 * SIZE], b2 881 FCLR (cc01) 882 LDF [BO + 2 * SIZE], b3 883 FCLR (cc05) 884 LDF [BO + 3 * SIZE], b4 885 FCLR (cc09) 886 LDF [BO + 4 * SIZE], b5 887 FCLR (cc13) 888 889 LDF [BO + 5 * SIZE], b6 890 FCLR (cc02) 891 LDF [BO + 6 * SIZE], b7 892 FCLR (cc06) 893 LDF [BO + 7 * SIZE], b8 894 FCLR (cc10) 895 LDF [BO + 8 * SIZE], b9 896 FCLR (cc14) 897 898 prefetch [C1 + 1 * SIZE], 3 899 FCLR (cc03) 900 prefetch [C2 + 2 * SIZE], 3 901 FCLR (cc07) 902 prefetch [C3 + 1 * SIZE], 3 903 FCLR (cc11) 904 prefetch [C4 + 2 * SIZE], 3 905 FCLR (cc15) 906 907 prefetch [C5 + 1 * SIZE], 3 908 FCLR (cc04) 909 prefetch [C6 + 2 * SIZE], 3 910 FCLR (cc08) 911 prefetch [C7 + 1 * SIZE], 3 912 FCLR (cc12) 913 prefetch [C8 + 2 * SIZE], 3 914 FCLR (cc16) 915 
916#if defined(LT) || defined(RN) 917 sra KK, 3, L 918#else 919 sub K, KK, L 920 sra L, 3, L 921#endif 922 cmp L, 0 923 ble,pn %icc, .LL15 924 nop 925 .align 4 926 927.LL13: 928 FMADD (aa1, bb1, cc01, cc01) 929 FMADD (aa2, bb1, cc02, cc02) 930 FMADD (aa1, bb2, cc03, cc03) 931 FMADD (aa2, bb2, cc04, cc04) 932 933 FMADD (aa1, bb3, cc05, cc05) 934 LDF [BO + 16 * SIZE], b1 935 FMADD (aa2, bb3, cc06, cc06) 936 LDF [BO + 9 * SIZE], b2 937 938 FMADD (aa1, bb4, cc07, cc07) 939 LDF [BO + 10 * SIZE], b3 940 FMADD (aa2, bb4, cc08, cc08) 941 LDF [BO + 11 * SIZE], b4 942 943 FMADD (aa1, bb5, cc09, cc09) 944 LDF [AO + 2 * SIZE], a3 945 FMADD (aa2, bb5, cc10, cc10) 946 LDF [AO + 3 * SIZE], a4 947 948 FMADD (aa1, bb6, cc11, cc11) 949 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 950 FMADD (aa2, bb6, cc12, cc12) 951 nop 952 953 FMADD (aa1, bb7, cc13, cc13) 954 LDF [BO + 12 * SIZE], b5 955 FMADD (aa2, bb7, cc14, cc14) 956 LDF [BO + 13 * SIZE], b6 957 958 FMADD (aa1, bb8, cc15, cc15) 959 LDF [BO + 14 * SIZE], b7 960 FMADD (aa2, bb8, cc16, cc16) 961 LDF [BO + 15 * SIZE], b8 962 963 FMADD (aa3, bb9, cc01, cc01) 964 FMADD (aa4, bb9, cc02, cc02) 965 FMADD (aa3, bb2, cc03, cc03) 966 FMADD (aa4, bb2, cc04, cc04) 967 968 FMADD (aa3, bb3, cc05, cc05) 969 LDF [BO + 24 * SIZE], b9 970 FMADD (aa4, bb3, cc06, cc06) 971 LDF [BO + 17 * SIZE], b2 972 973 FMADD (aa3, bb4, cc07, cc07) 974 LDF [BO + 18 * SIZE], b3 975 FMADD (aa4, bb4, cc08, cc08) 976 LDF [BO + 19 * SIZE], b4 977 978 FMADD (aa3, bb5, cc09, cc09) 979 LDF [AO + 4 * SIZE], a1 980 FMADD (aa4, bb5, cc10, cc10) 981 LDF [AO + 5 * SIZE], a2 982 983 FMADD (aa3, bb6, cc11, cc11) 984 add L, -1, L 985 FMADD (aa4, bb6, cc12, cc12) 986 nop 987 988 FMADD (aa3, bb7, cc13, cc13) 989 LDF [BO + 20 * SIZE], b5 990 FMADD (aa4, bb7, cc14, cc14) 991 LDF [BO + 21 * SIZE], b6 992 993 FMADD (aa3, bb8, cc15, cc15) 994 LDF [BO + 22 * SIZE], b7 995 FMADD (aa4, bb8, cc16, cc16) 996 LDF [BO + 23 * SIZE], b8 997 998 FMADD (aa1, bb1, cc01, cc01) 999 
FMADD (aa2, bb1, cc02, cc02) 1000 FMADD (aa1, bb2, cc03, cc03) 1001 FMADD (aa2, bb2, cc04, cc04) 1002 1003 FMADD (aa1, bb3, cc05, cc05) 1004 LDF [BO + 32 * SIZE], b1 1005 FMADD (aa2, bb3, cc06, cc06) 1006 LDF [BO + 25 * SIZE], b2 1007 1008 FMADD (aa1, bb4, cc07, cc07) 1009 LDF [BO + 26 * SIZE], b3 1010 FMADD (aa2, bb4, cc08, cc08) 1011 LDF [BO + 27 * SIZE], b4 1012 1013 FMADD (aa1, bb5, cc09, cc09) 1014 LDF [AO + 6 * SIZE], a3 1015 FMADD (aa2, bb5, cc10, cc10) 1016 LDF [AO + 7 * SIZE], a4 1017 1018 FMADD (aa1, bb6, cc11, cc11) 1019 nop 1020 FMADD (aa2, bb6, cc12, cc12) 1021 nop 1022 1023 FMADD (aa1, bb7, cc13, cc13) 1024 LDF [BO + 28 * SIZE], b5 1025 FMADD (aa2, bb7, cc14, cc14) 1026 LDF [BO + 29 * SIZE], b6 1027 1028 FMADD (aa1, bb8, cc15, cc15) 1029 LDF [BO + 30 * SIZE], b7 1030 FMADD (aa2, bb8, cc16, cc16) 1031 LDF [BO + 31 * SIZE], b8 1032 1033 FMADD (aa3, bb9, cc01, cc01) 1034 FMADD (aa4, bb9, cc02, cc02) 1035 FMADD (aa3, bb2, cc03, cc03) 1036 FMADD (aa4, bb2, cc04, cc04) 1037 1038 FMADD (aa3, bb3, cc05, cc05) 1039 LDF [BO + 40 * SIZE], b9 1040 FMADD (aa4, bb3, cc06, cc06) 1041 LDF [BO + 33 * SIZE], b2 1042 1043 FMADD (aa3, bb4, cc07, cc07) 1044 LDF [BO + 34 * SIZE], b3 1045 FMADD (aa4, bb4, cc08, cc08) 1046 LDF [BO + 35 * SIZE], b4 1047 1048 FMADD (aa3, bb5, cc09, cc09) 1049 LDF [AO + 16 * SIZE], a1 /****/ 1050 FMADD (aa4, bb5, cc10, cc10) 1051 LDF [AO + 9 * SIZE], a2 1052 1053 FMADD (aa3, bb6, cc11, cc11) 1054 nop 1055 FMADD (aa4, bb6, cc12, cc12) 1056 nop 1057 1058 FMADD (aa3, bb7, cc13, cc13) 1059 LDF [BO + 36 * SIZE], b5 1060 FMADD (aa4, bb7, cc14, cc14) 1061 LDF [BO + 37 * SIZE], b6 1062 1063 FMADD (aa3, bb8, cc15, cc15) 1064 LDF [BO + 38 * SIZE], b7 1065 FMADD (aa4, bb8, cc16, cc16) 1066 LDF [BO + 39 * SIZE], b8 1067 1068 FMADD (aa5, bb1, cc01, cc01) 1069 FMADD (aa2, bb1, cc02, cc02) 1070 FMADD (aa5, bb2, cc03, cc03) 1071 FMADD (aa2, bb2, cc04, cc04) 1072 1073 FMADD (aa5, bb3, cc05, cc05) 1074 LDF [BO + 48 * SIZE], b1 1075 FMADD (aa2, bb3, cc06, cc06) 
1076 LDF [BO + 41 * SIZE], b2 1077 1078 FMADD (aa5, bb4, cc07, cc07) 1079 LDF [BO + 42 * SIZE], b3 1080 FMADD (aa2, bb4, cc08, cc08) 1081 LDF [BO + 43 * SIZE], b4 1082 1083 FMADD (aa5, bb5, cc09, cc09) 1084 LDF [AO + 10 * SIZE], a3 1085 FMADD (aa2, bb5, cc10, cc10) 1086 LDF [AO + 11 * SIZE], a4 1087 1088 FMADD (aa5, bb6, cc11, cc11) 1089 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 1090 FMADD (aa2, bb6, cc12, cc12) 1091 nop 1092 1093 FMADD (aa5, bb7, cc13, cc13) 1094 LDF [BO + 44 * SIZE], b5 1095 FMADD (aa2, bb7, cc14, cc14) 1096 LDF [BO + 45 * SIZE], b6 1097 1098 FMADD (aa5, bb8, cc15, cc15) 1099 LDF [BO + 46 * SIZE], b7 1100 FMADD (aa2, bb8, cc16, cc16) 1101 LDF [BO + 47 * SIZE], b8 1102 1103 FMADD (aa3, bb9, cc01, cc01) 1104 FMADD (aa4, bb9, cc02, cc02) 1105 FMADD (aa3, bb2, cc03, cc03) 1106 FMADD (aa4, bb2, cc04, cc04) 1107 1108 FMADD (aa3, bb3, cc05, cc05) 1109 LDF [BO + 56 * SIZE], b9 1110 FMADD (aa4, bb3, cc06, cc06) 1111 LDF [BO + 49 * SIZE], b2 1112 1113 FMADD (aa3, bb4, cc07, cc07) 1114 LDF [BO + 50 * SIZE], b3 1115 FMADD (aa4, bb4, cc08, cc08) 1116 LDF [BO + 51 * SIZE], b4 1117 1118 FMADD (aa3, bb5, cc09, cc09) 1119 LDF [AO + 12 * SIZE], a5 1120 FMADD (aa4, bb5, cc10, cc10) 1121 LDF [AO + 13 * SIZE], a2 1122 1123 FMADD (aa3, bb6, cc11, cc11) 1124 cmp L, 0 1125 FMADD (aa4, bb6, cc12, cc12) 1126 nop 1127 1128 FMADD (aa3, bb7, cc13, cc13) 1129 LDF [BO + 52 * SIZE], b5 1130 FMADD (aa4, bb7, cc14, cc14) 1131 LDF [BO + 53 * SIZE], b6 1132 1133 FMADD (aa3, bb8, cc15, cc15) 1134 LDF [BO + 54 * SIZE], b7 1135 FMADD (aa4, bb8, cc16, cc16) 1136 LDF [BO + 55 * SIZE], b8 1137 1138 FMADD (aa5, bb1, cc01, cc01) 1139 FMADD (aa2, bb1, cc02, cc02) 1140 FMADD (aa5, bb2, cc03, cc03) 1141 FMADD (aa2, bb2, cc04, cc04) 1142 1143 FMADD (aa5, bb3, cc05, cc05) 1144 LDF [BO + 64 * SIZE], b1 1145 FMADD (aa2, bb3, cc06, cc06) 1146 LDF [BO + 57 * SIZE], b2 1147 1148 FMADD (aa5, bb4, cc07, cc07) 1149 LDF [BO + 58 * SIZE], b3 1150 FMADD (aa2, bb4, cc08, cc08) 1151 LDF 
[BO + 59 * SIZE], b4 1152 1153 FMADD (aa5, bb5, cc09, cc09) 1154 LDF [AO + 14 * SIZE], a3 1155 FMADD (aa2, bb5, cc10, cc10) 1156 LDF [AO + 15 * SIZE], a4 1157 1158 FMADD (aa5, bb6, cc11, cc11) 1159 add BO, 64 * SIZE, BO 1160 FMADD (aa2, bb6, cc12, cc12) 1161 add AO, 16 * SIZE, AO 1162 1163 FMADD (aa5, bb7, cc13, cc13) 1164 LDF [BO - 4 * SIZE], b5 1165 FMADD (aa2, bb7, cc14, cc14) 1166 LDF [BO - 3 * SIZE], b6 1167 1168 FMADD (aa5, bb8, cc15, cc15) 1169 LDF [BO - 2 * SIZE], b7 1170 FMADD (aa2, bb8, cc16, cc16) 1171 LDF [BO - 1 * SIZE], b8 1172 1173 FMADD (aa3, bb9, cc01, cc01) 1174 FMADD (aa4, bb9, cc02, cc02) 1175 FMADD (aa3, bb2, cc03, cc03) 1176 FMADD (aa4, bb2, cc04, cc04) 1177 1178 FMADD (aa3, bb3, cc05, cc05) 1179 LDF [BO + 8 * SIZE], b9 1180 FMADD (aa4, bb3, cc06, cc06) 1181 LDF [BO + 1 * SIZE], b2 1182 1183 FMADD (aa3, bb4, cc07, cc07) 1184 LDF [BO + 2 * SIZE], b3 1185 FMADD (aa4, bb4, cc08, cc08) 1186 LDF [BO + 3 * SIZE], b4 1187 1188 FMADD (aa3, bb5, cc09, cc09) 1189 LDF [AO + 8 * SIZE], a5 /****/ 1190 FMADD (aa4, bb5, cc10, cc10) 1191 LDF [AO + 1 * SIZE], a2 1192 1193 FMADD (aa3, bb6, cc11, cc11) 1194 FMADD (aa4, bb6, cc12, cc12) 1195 1196 FMADD (aa3, bb7, cc13, cc13) 1197 LDF [BO + 4 * SIZE], b5 1198 FMADD (aa4, bb7, cc14, cc14) 1199 LDF [BO + 5 * SIZE], b6 1200 1201 FMADD (aa3, bb8, cc15, cc15) 1202 LDF [BO + 6 * SIZE], b7 1203 FMADD (aa4, bb8, cc16, cc16) 1204 ble,pn %icc, .LL15 1205 LDF [BO + 7 * SIZE], b8 1206 1207 FMADD (aa1, bb1, cc01, cc01) 1208 FMADD (aa2, bb1, cc02, cc02) 1209 FMADD (aa1, bb2, cc03, cc03) 1210 FMADD (aa2, bb2, cc04, cc04) 1211 1212 FMADD (aa1, bb3, cc05, cc05) 1213 LDF [BO + 16 * SIZE], b1 1214 FMADD (aa2, bb3, cc06, cc06) 1215 LDF [BO + 9 * SIZE], b2 1216 1217 FMADD (aa1, bb4, cc07, cc07) 1218 LDF [BO + 10 * SIZE], b3 1219 FMADD (aa2, bb4, cc08, cc08) 1220 LDF [BO + 11 * SIZE], b4 1221 1222 FMADD (aa1, bb5, cc09, cc09) 1223 LDF [AO + 2 * SIZE], a3 1224 FMADD (aa2, bb5, cc10, cc10) 1225 LDF [AO + 3 * SIZE], a4 1226 1227 FMADD 
(aa1, bb6, cc11, cc11) 1228 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1229 FMADD (aa2, bb6, cc12, cc12) 1230 nop 1231 1232 FMADD (aa1, bb7, cc13, cc13) 1233 LDF [BO + 12 * SIZE], b5 1234 FMADD (aa2, bb7, cc14, cc14) 1235 LDF [BO + 13 * SIZE], b6 1236 1237 FMADD (aa1, bb8, cc15, cc15) 1238 LDF [BO + 14 * SIZE], b7 1239 FMADD (aa2, bb8, cc16, cc16) 1240 LDF [BO + 15 * SIZE], b8 1241 1242 FMADD (aa3, bb9, cc01, cc01) 1243 FMADD (aa4, bb9, cc02, cc02) 1244 FMADD (aa3, bb2, cc03, cc03) 1245 FMADD (aa4, bb2, cc04, cc04) 1246 1247 FMADD (aa3, bb3, cc05, cc05) 1248 LDF [BO + 24 * SIZE], b9 1249 FMADD (aa4, bb3, cc06, cc06) 1250 LDF [BO + 17 * SIZE], b2 1251 1252 FMADD (aa3, bb4, cc07, cc07) 1253 LDF [BO + 18 * SIZE], b3 1254 FMADD (aa4, bb4, cc08, cc08) 1255 LDF [BO + 19 * SIZE], b4 1256 1257 FMADD (aa3, bb5, cc09, cc09) 1258 LDF [AO + 4 * SIZE], a1 1259 FMADD (aa4, bb5, cc10, cc10) 1260 LDF [AO + 5 * SIZE], a2 1261 1262 FMADD (aa3, bb6, cc11, cc11) 1263 add L, -1, L 1264 FMADD (aa4, bb6, cc12, cc12) 1265 nop 1266 1267 FMADD (aa3, bb7, cc13, cc13) 1268 LDF [BO + 20 * SIZE], b5 1269 FMADD (aa4, bb7, cc14, cc14) 1270 LDF [BO + 21 * SIZE], b6 1271 1272 FMADD (aa3, bb8, cc15, cc15) 1273 LDF [BO + 22 * SIZE], b7 1274 FMADD (aa4, bb8, cc16, cc16) 1275 LDF [BO + 23 * SIZE], b8 1276 1277 FMADD (aa1, bb1, cc01, cc01) 1278 FMADD (aa2, bb1, cc02, cc02) 1279 FMADD (aa1, bb2, cc03, cc03) 1280 FMADD (aa2, bb2, cc04, cc04) 1281 1282 FMADD (aa1, bb3, cc05, cc05) 1283 LDF [BO + 32 * SIZE], b1 1284 FMADD (aa2, bb3, cc06, cc06) 1285 LDF [BO + 25 * SIZE], b2 1286 1287 FMADD (aa1, bb4, cc07, cc07) 1288 LDF [BO + 26 * SIZE], b3 1289 FMADD (aa2, bb4, cc08, cc08) 1290 LDF [BO + 27 * SIZE], b4 1291 1292 FMADD (aa1, bb5, cc09, cc09) 1293 LDF [AO + 6 * SIZE], a3 1294 FMADD (aa2, bb5, cc10, cc10) 1295 LDF [AO + 7 * SIZE], a4 1296 1297 FMADD (aa1, bb6, cc11, cc11) 1298 nop 1299 FMADD (aa2, bb6, cc12, cc12) 1300 nop 1301 1302 FMADD (aa1, bb7, cc13, cc13) 1303 LDF [BO + 28 * SIZE], 
b5 1304 FMADD (aa2, bb7, cc14, cc14) 1305 LDF [BO + 29 * SIZE], b6 1306 1307 FMADD (aa1, bb8, cc15, cc15) 1308 LDF [BO + 30 * SIZE], b7 1309 FMADD (aa2, bb8, cc16, cc16) 1310 LDF [BO + 31 * SIZE], b8 1311 1312 FMADD (aa3, bb9, cc01, cc01) 1313 FMADD (aa4, bb9, cc02, cc02) 1314 FMADD (aa3, bb2, cc03, cc03) 1315 FMADD (aa4, bb2, cc04, cc04) 1316 1317 FMADD (aa3, bb3, cc05, cc05) 1318 LDF [BO + 40 * SIZE], b9 1319 FMADD (aa4, bb3, cc06, cc06) 1320 LDF [BO + 33 * SIZE], b2 1321 1322 FMADD (aa3, bb4, cc07, cc07) 1323 LDF [BO + 34 * SIZE], b3 1324 FMADD (aa4, bb4, cc08, cc08) 1325 LDF [BO + 35 * SIZE], b4 1326 1327 FMADD (aa3, bb5, cc09, cc09) 1328 LDF [AO + 16 * SIZE], a1 /****/ 1329 FMADD (aa4, bb5, cc10, cc10) 1330 LDF [AO + 9 * SIZE], a2 1331 1332 FMADD (aa3, bb6, cc11, cc11) 1333 nop 1334 FMADD (aa4, bb6, cc12, cc12) 1335 nop 1336 1337 FMADD (aa3, bb7, cc13, cc13) 1338 LDF [BO + 36 * SIZE], b5 1339 FMADD (aa4, bb7, cc14, cc14) 1340 LDF [BO + 37 * SIZE], b6 1341 1342 FMADD (aa3, bb8, cc15, cc15) 1343 LDF [BO + 38 * SIZE], b7 1344 FMADD (aa4, bb8, cc16, cc16) 1345 LDF [BO + 39 * SIZE], b8 1346 1347 FMADD (aa5, bb1, cc01, cc01) 1348 FMADD (aa2, bb1, cc02, cc02) 1349 FMADD (aa5, bb2, cc03, cc03) 1350 FMADD (aa2, bb2, cc04, cc04) 1351 1352 FMADD (aa5, bb3, cc05, cc05) 1353 LDF [BO + 48 * SIZE], b1 1354 FMADD (aa2, bb3, cc06, cc06) 1355 LDF [BO + 41 * SIZE], b2 1356 1357 FMADD (aa5, bb4, cc07, cc07) 1358 LDF [BO + 42 * SIZE], b3 1359 FMADD (aa2, bb4, cc08, cc08) 1360 LDF [BO + 43 * SIZE], b4 1361 1362 FMADD (aa5, bb5, cc09, cc09) 1363 LDF [AO + 10 * SIZE], a3 1364 FMADD (aa2, bb5, cc10, cc10) 1365 LDF [AO + 11 * SIZE], a4 1366 1367 FMADD (aa5, bb6, cc11, cc11) 1368 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 1369 FMADD (aa2, bb6, cc12, cc12) 1370 nop 1371 1372 FMADD (aa5, bb7, cc13, cc13) 1373 LDF [BO + 44 * SIZE], b5 1374 FMADD (aa2, bb7, cc14, cc14) 1375 LDF [BO + 45 * SIZE], b6 1376 1377 FMADD (aa5, bb8, cc15, cc15) 1378 LDF [BO + 46 * SIZE], b7 1379 
FMADD (aa2, bb8, cc16, cc16) 1380 LDF [BO + 47 * SIZE], b8 1381 1382 FMADD (aa3, bb9, cc01, cc01) 1383 FMADD (aa4, bb9, cc02, cc02) 1384 FMADD (aa3, bb2, cc03, cc03) 1385 FMADD (aa4, bb2, cc04, cc04) 1386 1387 FMADD (aa3, bb3, cc05, cc05) 1388 LDF [BO + 56 * SIZE], b9 1389 FMADD (aa4, bb3, cc06, cc06) 1390 LDF [BO + 49 * SIZE], b2 1391 1392 FMADD (aa3, bb4, cc07, cc07) 1393 LDF [BO + 50 * SIZE], b3 1394 FMADD (aa4, bb4, cc08, cc08) 1395 LDF [BO + 51 * SIZE], b4 1396 1397 FMADD (aa3, bb5, cc09, cc09) 1398 LDF [AO + 12 * SIZE], a5 1399 FMADD (aa4, bb5, cc10, cc10) 1400 LDF [AO + 13 * SIZE], a2 1401 1402 FMADD (aa3, bb6, cc11, cc11) 1403 cmp L, 0 1404 FMADD (aa4, bb6, cc12, cc12) 1405 nop 1406 1407 FMADD (aa3, bb7, cc13, cc13) 1408 LDF [BO + 52 * SIZE], b5 1409 FMADD (aa4, bb7, cc14, cc14) 1410 LDF [BO + 53 * SIZE], b6 1411 1412 FMADD (aa3, bb8, cc15, cc15) 1413 LDF [BO + 54 * SIZE], b7 1414 FMADD (aa4, bb8, cc16, cc16) 1415 LDF [BO + 55 * SIZE], b8 1416 1417 FMADD (aa5, bb1, cc01, cc01) 1418 FMADD (aa2, bb1, cc02, cc02) 1419 FMADD (aa5, bb2, cc03, cc03) 1420 FMADD (aa2, bb2, cc04, cc04) 1421 1422 FMADD (aa5, bb3, cc05, cc05) 1423 LDF [BO + 64 * SIZE], b1 1424 FMADD (aa2, bb3, cc06, cc06) 1425 LDF [BO + 57 * SIZE], b2 1426 1427 FMADD (aa5, bb4, cc07, cc07) 1428 LDF [BO + 58 * SIZE], b3 1429 FMADD (aa2, bb4, cc08, cc08) 1430 LDF [BO + 59 * SIZE], b4 1431 1432 FMADD (aa5, bb5, cc09, cc09) 1433 LDF [AO + 14 * SIZE], a3 1434 FMADD (aa2, bb5, cc10, cc10) 1435 LDF [AO + 15 * SIZE], a4 1436 1437 FMADD (aa5, bb6, cc11, cc11) 1438 add BO, 64 * SIZE, BO 1439 FMADD (aa2, bb6, cc12, cc12) 1440 add AO, 16 * SIZE, AO 1441 1442 FMADD (aa5, bb7, cc13, cc13) 1443 LDF [BO - 4 * SIZE], b5 1444 FMADD (aa2, bb7, cc14, cc14) 1445 LDF [BO - 3 * SIZE], b6 1446 1447 FMADD (aa5, bb8, cc15, cc15) 1448 LDF [BO - 2 * SIZE], b7 1449 FMADD (aa2, bb8, cc16, cc16) 1450 LDF [BO - 1 * SIZE], b8 1451 1452 FMADD (aa3, bb9, cc01, cc01) 1453 FMADD (aa4, bb9, cc02, cc02) 1454 FMADD (aa3, bb2, cc03, cc03) 
1455 FMADD (aa4, bb2, cc04, cc04) 1456 1457 FMADD (aa3, bb3, cc05, cc05) 1458 LDF [BO + 8 * SIZE], b9 1459 FMADD (aa4, bb3, cc06, cc06) 1460 LDF [BO + 1 * SIZE], b2 1461 1462 FMADD (aa3, bb4, cc07, cc07) 1463 LDF [BO + 2 * SIZE], b3 1464 FMADD (aa4, bb4, cc08, cc08) 1465 LDF [BO + 3 * SIZE], b4 1466 1467 FMADD (aa3, bb5, cc09, cc09) 1468 LDF [AO + 8 * SIZE], a5 /****/ 1469 FMADD (aa4, bb5, cc10, cc10) 1470 LDF [AO + 1 * SIZE], a2 1471 1472 FMADD (aa3, bb6, cc11, cc11) 1473 FMADD (aa4, bb6, cc12, cc12) 1474 1475 FMADD (aa3, bb7, cc13, cc13) 1476 LDF [BO + 4 * SIZE], b5 1477 FMADD (aa4, bb7, cc14, cc14) 1478 LDF [BO + 5 * SIZE], b6 1479 1480 FMADD (aa3, bb8, cc15, cc15) 1481 LDF [BO + 6 * SIZE], b7 1482 FMADD (aa4, bb8, cc16, cc16) 1483 bg,pt %icc, .LL13 1484 LDF [BO + 7 * SIZE], b8 1485 .align 4 1486 1487.LL15: 1488#if defined(LT) || defined(RN) 1489 and KK, 7, L 1490#else 1491 sub K, KK, L 1492 and L, 7, L 1493#endif 1494 cmp L, 0 1495 ble,a,pn %icc, .LL18 1496 nop 1497 .align 4 1498 1499.LL17: 1500 FMADD (aa1, bb1, cc01, cc01) 1501 add L, -1, L 1502 FMADD (aa2, bb1, cc02, cc02) 1503 nop 1504 1505 FMADD (aa1, bb2, cc03, cc03) 1506 LDF [BO + 8 * SIZE], b1 1507 FMADD (aa2, bb2, cc04, cc04) 1508 LDF [BO + 9 * SIZE], b2 1509 1510 FMADD (aa1, bb3, cc05, cc05) 1511 cmp L, 0 1512 FMADD (aa2, bb3, cc06, cc06) 1513 nop 1514 1515 FMADD (aa1, bb4, cc07, cc07) 1516 LDF [BO + 10 * SIZE], b3 1517 FMADD (aa2, bb4, cc08, cc08) 1518 LDF [BO + 11 * SIZE], b4 1519 1520 FMADD (aa1, bb5, cc09, cc09) 1521 nop 1522 FMADD (aa2, bb5, cc10, cc10) 1523 nop 1524 1525 FMADD (aa1, bb6, cc11, cc11) 1526 LDF [BO + 12 * SIZE], b5 1527 FMADD (aa2, bb6, cc12, cc12) 1528 LDF [BO + 13 * SIZE], b6 1529 1530 FMADD (aa1, bb7, cc13, cc13) 1531 add AO, 2 * SIZE, AO 1532 FMADD (aa2, bb7, cc14, cc14) 1533 add BO, 8 * SIZE, BO 1534 1535 FMADD (aa1, bb8, cc15, cc15) 1536 LDF [AO + 0 * SIZE], a1 1537 FMADD (aa2, bb8, cc16, cc16) 1538 LDF [AO + 1 * SIZE], a2 1539 1540 LDF [BO + 6 * SIZE], b7 1541 bg,pt %icc, 
.LL17 1542 LDF [BO + 7 * SIZE], b8 1543 nop 1544 .align 4 1545 1546.LL18: 1547#if defined(LN) || defined(RT) 1548#ifdef LN 1549 sub KK, 2, TEMP1 1550#else 1551 sub KK, 8, TEMP1 1552#endif 1553 sll TEMP1, BASE_SHIFT + 1, TEMP2 1554 sll TEMP1, BASE_SHIFT + 3, TEMP1 1555 1556 add AORIG, TEMP2, AO 1557 add B, TEMP1, BO 1558#endif 1559 1560#if defined(LN) || defined(LT) 1561 LDF [BO + 0 * SIZE], a1 1562 LDF [BO + 1 * SIZE], a2 1563 LDF [BO + 2 * SIZE], a3 1564 LDF [BO + 3 * SIZE], a4 1565 1566 LDF [BO + 4 * SIZE], b1 1567 LDF [BO + 5 * SIZE], b2 1568 LDF [BO + 6 * SIZE], b3 1569 LDF [BO + 7 * SIZE], b4 1570 1571 FSUB a1, c01, c01 1572 FSUB a2, c03, c03 1573 FSUB a3, c05, c05 1574 FSUB a4, c07, c07 1575 1576 FSUB b1, c09, c09 1577 FSUB b2, c11, c11 1578 FSUB b3, c13, c13 1579 FSUB b4, c15, c15 1580 1581 LDF [BO + 8 * SIZE], a1 1582 LDF [BO + 9 * SIZE], a2 1583 LDF [BO + 10 * SIZE], a3 1584 LDF [BO + 11 * SIZE], a4 1585 1586 LDF [BO + 12 * SIZE], b1 1587 LDF [BO + 13 * SIZE], b2 1588 LDF [BO + 14 * SIZE], b3 1589 LDF [BO + 15 * SIZE], b4 1590 1591 FSUB a1, c02, c02 1592 FSUB a2, c04, c04 1593 FSUB a3, c06, c06 1594 FSUB a4, c08, c08 1595 1596 FSUB b1, c10, c10 1597 FSUB b2, c12, c12 1598 FSUB b3, c14, c14 1599 FSUB b4, c16, c16 1600#else 1601 LDF [AO + 0 * SIZE], a1 1602 LDF [AO + 1 * SIZE], a2 1603 LDF [AO + 2 * SIZE], a3 1604 LDF [AO + 3 * SIZE], a4 1605 1606 LDF [AO + 4 * SIZE], b1 1607 LDF [AO + 5 * SIZE], b2 1608 LDF [AO + 6 * SIZE], b3 1609 LDF [AO + 7 * SIZE], b4 1610 1611 FSUB a1, c01, c01 1612 FSUB a2, c02, c02 1613 FSUB a3, c03, c03 1614 FSUB a4, c04, c04 1615 1616 FSUB b1, c05, c05 1617 FSUB b2, c06, c06 1618 FSUB b3, c07, c07 1619 FSUB b4, c08, c08 1620 1621 LDF [AO + 8 * SIZE], a1 1622 LDF [AO + 9 * SIZE], a2 1623 LDF [AO + 10 * SIZE], a3 1624 LDF [AO + 11 * SIZE], a4 1625 1626 LDF [AO + 12 * SIZE], b1 1627 LDF [AO + 13 * SIZE], b2 1628 LDF [AO + 14 * SIZE], b3 1629 LDF [AO + 15 * SIZE], b4 1630 1631 FSUB a1, c09, c09 1632 FSUB a2, c10, c10 1633 FSUB a3, c11, 
c11 1634 FSUB a4, c12, c12 1635 1636 FSUB b1, c13, c13 1637 FSUB b2, c14, c14 1638 FSUB b3, c15, c15 1639 FSUB b4, c16, c16 1640#endif 1641 1642#ifdef LN 1643 LDF [AO + 3 * SIZE], a1 1644 LDF [AO + 2 * SIZE], a2 1645 LDF [AO + 0 * SIZE], a3 1646 1647 FMUL a1, c02, c02 1648 FMUL a1, c04, c04 1649 FMUL a1, c06, c06 1650 FMUL a1, c08, c08 1651 FMUL a1, c10, c10 1652 FMUL a1, c12, c12 1653 FMUL a1, c14, c14 1654 FMUL a1, c16, c16 1655 1656 FNMSUB (aa2, cc02, cc01, cc01) 1657 FNMSUB (aa2, cc04, cc03, cc03) 1658 FNMSUB (aa2, cc06, cc05, cc05) 1659 FNMSUB (aa2, cc08, cc07, cc07) 1660 FNMSUB (aa2, cc10, cc09, cc09) 1661 FNMSUB (aa2, cc12, cc11, cc11) 1662 FNMSUB (aa2, cc14, cc13, cc13) 1663 FNMSUB (aa2, cc16, cc15, cc15) 1664 1665 FMUL a3, c01, c01 1666 FMUL a3, c03, c03 1667 FMUL a3, c05, c05 1668 FMUL a3, c07, c07 1669 FMUL a3, c09, c09 1670 FMUL a3, c11, c11 1671 FMUL a3, c13, c13 1672 FMUL a3, c15, c15 1673#endif 1674 1675#ifdef LT 1676 LDF [AO + 0 * SIZE], a1 1677 LDF [AO + 1 * SIZE], a2 1678 LDF [AO + 3 * SIZE], a3 1679 1680 FMUL a1, c01, c01 1681 FMUL a1, c03, c03 1682 FMUL a1, c05, c05 1683 FMUL a1, c07, c07 1684 FMUL a1, c09, c09 1685 FMUL a1, c11, c11 1686 FMUL a1, c13, c13 1687 FMUL a1, c15, c15 1688 1689 FNMSUB (aa2, cc01, cc02, cc02) 1690 FNMSUB (aa2, cc03, cc04, cc04) 1691 FNMSUB (aa2, cc05, cc06, cc06) 1692 FNMSUB (aa2, cc07, cc08, cc08) 1693 FNMSUB (aa2, cc09, cc10, cc10) 1694 FNMSUB (aa2, cc11, cc12, cc12) 1695 FNMSUB (aa2, cc13, cc14, cc14) 1696 FNMSUB (aa2, cc15, cc16, cc16) 1697 1698 FMUL a3, c02, c02 1699 FMUL a3, c04, c04 1700 FMUL a3, c06, c06 1701 FMUL a3, c08, c08 1702 FMUL a3, c10, c10 1703 FMUL a3, c12, c12 1704 FMUL a3, c14, c14 1705 FMUL a3, c16, c16 1706#endif 1707 1708#ifdef RN 1709 LDF [BO + 0 * SIZE], a1 1710 LDF [BO + 1 * SIZE], a2 1711 LDF [BO + 2 * SIZE], a3 1712 LDF [BO + 3 * SIZE], a4 1713 LDF [BO + 4 * SIZE], b1 1714 LDF [BO + 5 * SIZE], b2 1715 LDF [BO + 6 * SIZE], b3 1716 LDF [BO + 7 * SIZE], b4 1717 1718 FMUL a1, c01, c01 1719 FMUL 
a1, c02, c02 1720 1721 FNMSUB (aa2, cc01, cc03, cc03) 1722 FNMSUB (aa2, cc02, cc04, cc04) 1723 FNMSUB (aa3, cc01, cc05, cc05) 1724 FNMSUB (aa3, cc02, cc06, cc06) 1725 FNMSUB (aa4, cc01, cc07, cc07) 1726 FNMSUB (aa4, cc02, cc08, cc08) 1727 FNMSUB (bb1, cc01, cc09, cc09) 1728 FNMSUB (bb1, cc02, cc10, cc10) 1729 FNMSUB (bb2, cc01, cc11, cc11) 1730 FNMSUB (bb2, cc02, cc12, cc12) 1731 FNMSUB (bb3, cc01, cc13, cc13) 1732 FNMSUB (bb3, cc02, cc14, cc14) 1733 FNMSUB (bb4, cc01, cc15, cc15) 1734 FNMSUB (bb4, cc02, cc16, cc16) 1735 1736 LDF [BO + 9 * SIZE], a1 1737 LDF [BO + 10 * SIZE], a2 1738 LDF [BO + 11 * SIZE], a3 1739 LDF [BO + 12 * SIZE], a4 1740 LDF [BO + 13 * SIZE], b1 1741 LDF [BO + 14 * SIZE], b2 1742 LDF [BO + 15 * SIZE], b3 1743 1744 FMUL a1, c03, c03 1745 FMUL a1, c04, c04 1746 1747 FNMSUB (aa2, cc03, cc05, cc05) 1748 FNMSUB (aa2, cc04, cc06, cc06) 1749 FNMSUB (aa3, cc03, cc07, cc07) 1750 FNMSUB (aa3, cc04, cc08, cc08) 1751 FNMSUB (aa4, cc03, cc09, cc09) 1752 FNMSUB (aa4, cc04, cc10, cc10) 1753 FNMSUB (bb1, cc03, cc11, cc11) 1754 FNMSUB (bb1, cc04, cc12, cc12) 1755 FNMSUB (bb2, cc03, cc13, cc13) 1756 FNMSUB (bb2, cc04, cc14, cc14) 1757 FNMSUB (bb3, cc03, cc15, cc15) 1758 FNMSUB (bb3, cc04, cc16, cc16) 1759 1760 LDF [BO + 18 * SIZE], a1 1761 LDF [BO + 19 * SIZE], a2 1762 LDF [BO + 20 * SIZE], a3 1763 LDF [BO + 21 * SIZE], a4 1764 LDF [BO + 22 * SIZE], b1 1765 LDF [BO + 23 * SIZE], b2 1766 1767 FMUL a1, c05, c05 1768 FMUL a1, c06, c06 1769 1770 FNMSUB (aa2, cc05, cc07, cc07) 1771 FNMSUB (aa2, cc06, cc08, cc08) 1772 FNMSUB (aa3, cc05, cc09, cc09) 1773 FNMSUB (aa3, cc06, cc10, cc10) 1774 FNMSUB (aa4, cc05, cc11, cc11) 1775 FNMSUB (aa4, cc06, cc12, cc12) 1776 FNMSUB (bb1, cc05, cc13, cc13) 1777 FNMSUB (bb1, cc06, cc14, cc14) 1778 FNMSUB (bb2, cc05, cc15, cc15) 1779 FNMSUB (bb2, cc06, cc16, cc16) 1780 1781 LDF [BO + 27 * SIZE], a1 1782 LDF [BO + 28 * SIZE], a2 1783 LDF [BO + 29 * SIZE], a3 1784 LDF [BO + 30 * SIZE], a4 1785 LDF [BO + 31 * SIZE], b1 1786 1787 FMUL a1, 
c07, c07 1788 FMUL a1, c08, c08 1789 1790 FNMSUB (aa2, cc07, cc09, cc09) 1791 FNMSUB (aa2, cc08, cc10, cc10) 1792 FNMSUB (aa3, cc07, cc11, cc11) 1793 FNMSUB (aa3, cc08, cc12, cc12) 1794 FNMSUB (aa4, cc07, cc13, cc13) 1795 FNMSUB (aa4, cc08, cc14, cc14) 1796 FNMSUB (bb1, cc07, cc15, cc15) 1797 FNMSUB (bb1, cc08, cc16, cc16) 1798 1799 LDF [BO + 36 * SIZE], a1 1800 LDF [BO + 37 * SIZE], a2 1801 LDF [BO + 38 * SIZE], a3 1802 LDF [BO + 39 * SIZE], a4 1803 1804 FMUL a1, c09, c09 1805 FMUL a1, c10, c10 1806 1807 FNMSUB (aa2, cc09, cc11, cc11) 1808 FNMSUB (aa2, cc10, cc12, cc12) 1809 FNMSUB (aa3, cc09, cc13, cc13) 1810 FNMSUB (aa3, cc10, cc14, cc14) 1811 FNMSUB (aa4, cc09, cc15, cc15) 1812 FNMSUB (aa4, cc10, cc16, cc16) 1813 1814 LDF [BO + 45 * SIZE], a1 1815 LDF [BO + 46 * SIZE], a2 1816 LDF [BO + 47 * SIZE], a3 1817 1818 FMUL a1, c11, c11 1819 FMUL a1, c12, c12 1820 1821 FNMSUB (aa2, cc11, cc13, cc13) 1822 FNMSUB (aa2, cc12, cc14, cc14) 1823 FNMSUB (aa3, cc11, cc15, cc15) 1824 FNMSUB (aa3, cc12, cc16, cc16) 1825 1826 LDF [BO + 54 * SIZE], a1 1827 LDF [BO + 55 * SIZE], a2 1828 1829 FMUL a1, c13, c13 1830 FMUL a1, c14, c14 1831 1832 FNMSUB (aa2, cc13, cc15, cc15) 1833 FNMSUB (aa2, cc14, cc16, cc16) 1834 1835 LDF [BO + 63 * SIZE], a1 1836 1837 FMUL a1, c15, c15 1838 FMUL a1, c16, c16 1839#endif 1840 1841#ifdef RT 1842 LDF [BO + 63 * SIZE], a1 1843 LDF [BO + 62 * SIZE], a2 1844 LDF [BO + 61 * SIZE], a3 1845 LDF [BO + 60 * SIZE], a4 1846 LDF [BO + 59 * SIZE], b1 1847 LDF [BO + 58 * SIZE], b2 1848 LDF [BO + 57 * SIZE], b3 1849 LDF [BO + 56 * SIZE], b4 1850 1851 FMUL a1, c16, c16 1852 FMUL a1, c15, c15 1853 1854 FNMSUB (aa2, cc16, cc14, cc14) 1855 FNMSUB (aa2, cc15, cc13, cc13) 1856 FNMSUB (aa3, cc16, cc12, cc12) 1857 FNMSUB (aa3, cc15, cc11, cc11) 1858 FNMSUB (aa4, cc16, cc10, cc10) 1859 FNMSUB (aa4, cc15, cc09, cc09) 1860 FNMSUB (bb1, cc16, cc08, cc08) 1861 FNMSUB (bb1, cc15, cc07, cc07) 1862 FNMSUB (bb2, cc16, cc06, cc06) 1863 FNMSUB (bb2, cc15, cc05, cc05) 1864 FNMSUB (bb3, 
cc16, cc04, cc04) 1865 FNMSUB (bb3, cc15, cc03, cc03) 1866 FNMSUB (bb4, cc16, cc02, cc02) 1867 FNMSUB (bb4, cc15, cc01, cc01) 1868 1869 LDF [BO + 54 * SIZE], a1 1870 LDF [BO + 53 * SIZE], a2 1871 LDF [BO + 52 * SIZE], a3 1872 LDF [BO + 51 * SIZE], a4 1873 LDF [BO + 50 * SIZE], b1 1874 LDF [BO + 49 * SIZE], b2 1875 LDF [BO + 48 * SIZE], b3 1876 1877 FMUL a1, c14, c14 1878 FMUL a1, c13, c13 1879 1880 FNMSUB (aa2, cc14, cc12, cc12) 1881 FNMSUB (aa2, cc13, cc11, cc11) 1882 FNMSUB (aa3, cc14, cc10, cc10) 1883 FNMSUB (aa3, cc13, cc09, cc09) 1884 FNMSUB (aa4, cc14, cc08, cc08) 1885 FNMSUB (aa4, cc13, cc07, cc07) 1886 FNMSUB (bb1, cc14, cc06, cc06) 1887 FNMSUB (bb1, cc13, cc05, cc05) 1888 FNMSUB (bb2, cc14, cc04, cc04) 1889 FNMSUB (bb2, cc13, cc03, cc03) 1890 FNMSUB (bb3, cc14, cc02, cc02) 1891 FNMSUB (bb3, cc13, cc01, cc01) 1892 1893 LDF [BO + 45 * SIZE], a1 1894 LDF [BO + 44 * SIZE], a2 1895 LDF [BO + 43 * SIZE], a3 1896 LDF [BO + 42 * SIZE], a4 1897 LDF [BO + 41 * SIZE], b1 1898 LDF [BO + 40 * SIZE], b2 1899 1900 FMUL a1, c12, c12 1901 FMUL a1, c11, c11 1902 1903 FNMSUB (aa2, cc12, cc10, cc10) 1904 FNMSUB (aa2, cc11, cc09, cc09) 1905 FNMSUB (aa3, cc12, cc08, cc08) 1906 FNMSUB (aa3, cc11, cc07, cc07) 1907 FNMSUB (aa4, cc12, cc06, cc06) 1908 FNMSUB (aa4, cc11, cc05, cc05) 1909 FNMSUB (bb1, cc12, cc04, cc04) 1910 FNMSUB (bb1, cc11, cc03, cc03) 1911 FNMSUB (bb2, cc12, cc02, cc02) 1912 FNMSUB (bb2, cc11, cc01, cc01) 1913 1914 LDF [BO + 36 * SIZE], a1 1915 LDF [BO + 35 * SIZE], a2 1916 LDF [BO + 34 * SIZE], a3 1917 LDF [BO + 33 * SIZE], a4 1918 LDF [BO + 32 * SIZE], b1 1919 1920 FMUL a1, c10, c10 1921 FMUL a1, c09, c09 1922 1923 FNMSUB (aa2, cc10, cc08, cc08) 1924 FNMSUB (aa2, cc09, cc07, cc07) 1925 FNMSUB (aa3, cc10, cc06, cc06) 1926 FNMSUB (aa3, cc09, cc05, cc05) 1927 FNMSUB (aa4, cc10, cc04, cc04) 1928 FNMSUB (aa4, cc09, cc03, cc03) 1929 FNMSUB (bb1, cc10, cc02, cc02) 1930 FNMSUB (bb1, cc09, cc01, cc01) 1931 1932 LDF [BO + 27 * SIZE], a1 1933 LDF [BO + 26 * SIZE], a2 1934 
LDF [BO + 25 * SIZE], a3 1935 LDF [BO + 24 * SIZE], a4 1936 1937 FMUL a1, c08, c08 1938 FMUL a1, c07, c07 1939 1940 FNMSUB (aa2, cc08, cc06, cc06) 1941 FNMSUB (aa2, cc07, cc05, cc05) 1942 FNMSUB (aa3, cc08, cc04, cc04) 1943 FNMSUB (aa3, cc07, cc03, cc03) 1944 FNMSUB (aa4, cc08, cc02, cc02) 1945 FNMSUB (aa4, cc07, cc01, cc01) 1946 1947 LDF [BO + 18 * SIZE], a1 1948 LDF [BO + 17 * SIZE], a2 1949 LDF [BO + 16 * SIZE], a3 1950 1951 FMUL a1, c06, c06 1952 FMUL a1, c05, c05 1953 1954 FNMSUB (aa2, cc06, cc04, cc04) 1955 FNMSUB (aa2, cc05, cc03, cc03) 1956 FNMSUB (aa3, cc06, cc02, cc02) 1957 FNMSUB (aa3, cc05, cc01, cc01) 1958 1959 LDF [BO + 9 * SIZE], a1 1960 LDF [BO + 8 * SIZE], a2 1961 1962 FMUL a1, c04, c04 1963 FMUL a1, c03, c03 1964 1965 FNMSUB (aa2, cc04, cc02, cc02) 1966 FNMSUB (aa2, cc03, cc01, cc01) 1967 1968 LDF [BO + 0 * SIZE], a1 1969 1970 FMUL a1, c02, c02 1971 FMUL a1, c01, c01 1972#endif 1973 1974#ifdef LN 1975 add C1, -2 * SIZE, C1 1976 add C2, -2 * SIZE, C2 1977 add C3, -2 * SIZE, C3 1978 add C4, -2 * SIZE, C4 1979 add C5, -2 * SIZE, C5 1980 add C6, -2 * SIZE, C6 1981 add C7, -2 * SIZE, C7 1982 add C8, -2 * SIZE, C8 1983#endif 1984 1985#if defined(LN) || defined(LT) 1986 STF c01, [BO + 0 * SIZE] 1987 STF c03, [BO + 1 * SIZE] 1988 STF c05, [BO + 2 * SIZE] 1989 STF c07, [BO + 3 * SIZE] 1990 1991 STF c09, [BO + 4 * SIZE] 1992 STF c11, [BO + 5 * SIZE] 1993 STF c13, [BO + 6 * SIZE] 1994 STF c15, [BO + 7 * SIZE] 1995 1996 STF c02, [BO + 8 * SIZE] 1997 STF c04, [BO + 9 * SIZE] 1998 STF c06, [BO + 10 * SIZE] 1999 STF c08, [BO + 11 * SIZE] 2000 2001 STF c10, [BO + 12 * SIZE] 2002 STF c12, [BO + 13 * SIZE] 2003 STF c14, [BO + 14 * SIZE] 2004 STF c16, [BO + 15 * SIZE] 2005#else 2006 STF c01, [AO + 0 * SIZE] 2007 STF c02, [AO + 1 * SIZE] 2008 STF c03, [AO + 2 * SIZE] 2009 STF c04, [AO + 3 * SIZE] 2010 2011 STF c05, [AO + 4 * SIZE] 2012 STF c06, [AO + 5 * SIZE] 2013 STF c07, [AO + 6 * SIZE] 2014 STF c08, [AO + 7 * SIZE] 2015 2016 STF c09, [AO + 8 * SIZE] 2017 STF c10, 
[AO + 9 * SIZE] 2018 STF c11, [AO + 10 * SIZE] 2019 STF c12, [AO + 11 * SIZE] 2020 2021 STF c13, [AO + 12 * SIZE] 2022 STF c14, [AO + 13 * SIZE] 2023 STF c15, [AO + 14 * SIZE] 2024 STF c16, [AO + 15 * SIZE] 2025#endif 2026 2027 STF c01, [C1 + 0 * SIZE] 2028 STF c02, [C1 + 1 * SIZE] 2029 STF c03, [C2 + 0 * SIZE] 2030 STF c04, [C2 + 1 * SIZE] 2031 2032 STF c05, [C3 + 0 * SIZE] 2033 STF c06, [C3 + 1 * SIZE] 2034 STF c07, [C4 + 0 * SIZE] 2035 STF c08, [C4 + 1 * SIZE] 2036 2037 STF c09, [C5 + 0 * SIZE] 2038 STF c10, [C5 + 1 * SIZE] 2039 STF c11, [C6 + 0 * SIZE] 2040 STF c12, [C6 + 1 * SIZE] 2041 2042 STF c13, [C7 + 0 * SIZE] 2043 STF c14, [C7 + 1 * SIZE] 2044 STF c15, [C8 + 0 * SIZE] 2045 STF c16, [C8 + 1 * SIZE] 2046 2047#ifndef LN 2048 add C1, 2 * SIZE, C1 2049 add C2, 2 * SIZE, C2 2050 add C3, 2 * SIZE, C3 2051 add C4, 2 * SIZE, C4 2052 add C5, 2 * SIZE, C5 2053 add C6, 2 * SIZE, C6 2054 add C7, 2 * SIZE, C7 2055 add C8, 2 * SIZE, C8 2056#endif 2057 2058#ifdef RT 2059 sll K, BASE_SHIFT + 1, TEMP1 2060 add AORIG, TEMP1, AORIG 2061#endif 2062 2063#if defined(LT) || defined(RN) 2064 sub K, KK, TEMP1 2065 sll TEMP1, BASE_SHIFT + 1, TEMP2 2066 sll TEMP1, BASE_SHIFT + 3, TEMP1 2067 add AO, TEMP2, AO 2068 add BO, TEMP1, BO 2069#endif 2070 2071#ifdef LT 2072 add KK, 2, KK 2073#endif 2074 2075#ifdef LN 2076 sub KK, 2, KK 2077#endif 2078 2079 add I, -1, I 2080 cmp I, 0 2081 bg,pt %icc, .LL12 2082 nop 2083 .align 4 2084 2085.LL29: 2086#ifdef LN 2087 sll K, BASE_SHIFT + 3, TEMP1 2088 add B, TEMP1, B 2089#endif 2090 2091#if defined(LT) || defined(RN) 2092 mov BO, B 2093#endif 2094 2095#ifdef RN 2096 add KK, 8, KK 2097#endif 2098 2099#ifdef RT 2100 sub KK, 8, KK 2101#endif 2102 2103 add J, -1, J 2104 cmp J, 0 2105 bg,pt %icc, .LL11 2106 nop 2107 .align 4 2108 2109.LL30: 2110 and N, 4, J 2111 cmp J, 0 2112 ble,pn %icc, .LL50 2113 nop 2114 2115#ifdef RT 2116 sll K, BASE_SHIFT + 2, TEMP1 2117 sub B, TEMP1, B 2118#endif 2119 2120#ifndef RT 2121 mov C, C1 2122 add C, LDC, C2 2123 add 
C2, LDC, C3 2124 add C3, LDC, C4 2125 add C4, LDC, C 2126#else 2127 sub C, LDC, C4 2128 sub C4, LDC, C3 2129 sub C3, LDC, C2 2130 sub C2, LDC, C1 2131 sub C2, LDC, C 2132#endif 2133 2134#ifdef LN 2135 add M, OFFSET, KK 2136#endif 2137 2138#ifdef LT 2139 mov OFFSET, KK 2140#endif 2141 2142#if defined(LN) || defined(RT) 2143 mov A, AORIG 2144#else 2145 mov A, AO 2146#endif 2147 2148 and M, 1, I 2149 cmp I, 0 2150 ble,pn %icc, .LL40 2151 nop 2152 2153#if defined(LT) || defined(RN) 2154 mov B, BO 2155#else 2156#ifdef LN 2157 sll K, BASE_SHIFT + 0, TEMP1 2158 sub AORIG, TEMP1, AORIG 2159#endif 2160 2161 sll KK, BASE_SHIFT + 0, TEMP1 2162 sll KK, BASE_SHIFT + 2, TEMP2 2163 2164 add AORIG, TEMP1, AO 2165 add B, TEMP2, BO 2166#endif 2167 2168 LDF [AO + 0 * SIZE], a1 2169 LDF [AO + 1 * SIZE], a2 2170 LDF [AO + 2 * SIZE], a3 2171 LDF [AO + 3 * SIZE], a4 2172 2173 LDF [BO + 0 * SIZE], b1 2174 LDF [BO + 1 * SIZE], b2 2175 LDF [BO + 2 * SIZE], b3 2176 LDF [BO + 3 * SIZE], b4 2177 LDF [BO + 4 * SIZE], b5 2178 LDF [BO + 5 * SIZE], b6 2179 FCLR (cc01) 2180 LDF [BO + 6 * SIZE], b7 2181 FCLR (cc03) 2182 LDF [BO + 7 * SIZE], b8 2183 FCLR (cc05) 2184 LDF [BO + 8 * SIZE], b9 2185 FCLR (cc07) 2186 2187#if defined(LT) || defined(RN) 2188 sra KK, 2, L 2189#else 2190 sub K, KK, L 2191 sra L, 2, L 2192#endif 2193 cmp L, 0 2194 ble,pn %icc, .LL45 2195 nop 2196 2197.LL43: 2198 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2199 add L, -1, L 2200 2201 FMADD (aa1, bb1, cc01, cc01) 2202 LDF [BO + 16 * SIZE], b1 2203 FMADD (aa1, bb2, cc03, cc03) 2204 LDF [BO + 9 * SIZE], b2 2205 FMADD (aa1, bb3, cc05, cc05) 2206 LDF [BO + 10 * SIZE], b3 2207 FMADD (aa1, bb4, cc07, cc07) 2208 LDF [BO + 11 * SIZE], b4 2209 2210 LDF [AO + 4 * SIZE], a1 2211 cmp L, 0 2212 2213 FMADD (aa2, bb5, cc01, cc01) 2214 LDF [BO + 12 * SIZE], b5 2215 FMADD (aa2, bb6, cc03, cc03) 2216 LDF [BO + 13 * SIZE], b6 2217 FMADD (aa2, bb7, cc05, cc05) 2218 LDF [BO + 14 * SIZE], b7 2219 FMADD (aa2, bb8, cc07, cc07) 2220 
LDF [BO + 15 * SIZE], b8 2221 2222 LDF [AO + 5 * SIZE], a2 2223 add AO, 4 * SIZE, AO 2224 2225 FMADD (aa3, bb9, cc01, cc01) 2226 LDF [BO + 24 * SIZE], b9 2227 FMADD (aa3, bb2, cc03, cc03) 2228 LDF [BO + 17 * SIZE], b2 2229 FMADD (aa3, bb3, cc05, cc05) 2230 LDF [BO + 18 * SIZE], b3 2231 FMADD (aa3, bb4, cc07, cc07) 2232 LDF [BO + 19 * SIZE], b4 2233 2234 LDF [AO + 2 * SIZE], a3 2235 add BO, 16 * SIZE, BO 2236 2237 FMADD (aa4, bb5, cc01, cc01) 2238 LDF [BO + 4 * SIZE], b5 2239 FMADD (aa4, bb6, cc03, cc03) 2240 LDF [BO + 5 * SIZE], b6 2241 FMADD (aa4, bb7, cc05, cc05) 2242 LDF [BO + 6 * SIZE], b7 2243 FMADD (aa4, bb8, cc07, cc07) 2244 LDF [BO + 7 * SIZE], b8 2245 2246 bg,pt %icc, .LL43 2247 LDF [AO + 3 * SIZE], a4 2248 .align 4 2249 2250.LL45: 2251#if defined(LT) || defined(RN) 2252 and KK, 3, L 2253#else 2254 sub K, KK, L 2255 and L, 3, L 2256#endif 2257 cmp L, 0 2258 ble,a,pn %icc, .LL48 2259 nop 2260 .align 4 2261 2262.LL47: 2263 FMADD (aa1, bb1, cc01, cc01) 2264 LDF [BO + 4 * SIZE], b1 2265 add L, -1, L 2266 FMADD (aa1, bb2, cc03, cc03) 2267 LDF [BO + 5 * SIZE], b2 2268 add AO, 1 * SIZE, AO 2269 2270 FMADD (aa1, bb3, cc05, cc05) 2271 LDF [BO + 6 * SIZE], b3 2272 cmp L, 0 2273 FMADD (aa1, bb4, cc07, cc07) 2274 LDF [BO + 7 * SIZE], b4 2275 add BO, 4 * SIZE, BO 2276 2277 bg,pt %icc, .LL47 2278 LDF [AO + 0 * SIZE], a1 2279 .align 4 2280 2281.LL48: 2282#if defined(LN) || defined(RT) 2283#ifdef LN 2284 sub KK, 1, TEMP1 2285#else 2286 sub KK, 4, TEMP1 2287#endif 2288 sll TEMP1, BASE_SHIFT + 0, TEMP2 2289 sll TEMP1, BASE_SHIFT + 2, TEMP1 2290 2291 add AORIG, TEMP2, AO 2292 add B, TEMP1, BO 2293#endif 2294 2295#if defined(LN) || defined(LT) 2296 LDF [BO + 0 * SIZE], a1 2297 LDF [BO + 1 * SIZE], a2 2298 LDF [BO + 2 * SIZE], a3 2299 LDF [BO + 3 * SIZE], a4 2300 2301 FSUB a1, c01, c01 2302 FSUB a2, c03, c03 2303 FSUB a3, c05, c05 2304 FSUB a4, c07, c07 2305#else 2306 LDF [AO + 0 * SIZE], a1 2307 LDF [AO + 1 * SIZE], a2 2308 LDF [AO + 2 * SIZE], a3 2309 LDF [AO + 3 * SIZE], a4 
2310 2311 FSUB a1, c01, c01 2312 FSUB a2, c03, c03 2313 FSUB a3, c05, c05 2314 FSUB a4, c07, c07 2315#endif 2316 2317#if defined(LN) || defined(LT) 2318 LDF [AO + 0 * SIZE], a1 2319 2320 FMUL a1, c01, c01 2321 FMUL a1, c03, c03 2322 FMUL a1, c05, c05 2323 FMUL a1, c07, c07 2324#endif 2325 2326#ifdef RN 2327 LDF [BO + 0 * SIZE], a1 2328 LDF [BO + 1 * SIZE], a2 2329 LDF [BO + 2 * SIZE], a3 2330 LDF [BO + 3 * SIZE], a4 2331 2332 FMUL a1, c01, c01 2333 2334 FNMSUB (aa2, cc01, cc03, cc03) 2335 FNMSUB (aa3, cc01, cc05, cc05) 2336 FNMSUB (aa4, cc01, cc07, cc07) 2337 2338 LDF [BO + 5 * SIZE], a1 2339 LDF [BO + 6 * SIZE], a2 2340 LDF [BO + 7 * SIZE], a3 2341 2342 FMUL a1, c03, c03 2343 2344 FNMSUB (aa2, cc03, cc05, cc05) 2345 FNMSUB (aa3, cc03, cc07, cc07) 2346 2347 LDF [BO + 10 * SIZE], a1 2348 LDF [BO + 11 * SIZE], a2 2349 2350 FMUL a1, c05, c05 2351 2352 FNMSUB (aa2, cc05, cc07, cc07) 2353 2354 LDF [BO + 15 * SIZE], a1 2355 2356 FMUL a1, c07, c07 2357#endif 2358 2359#ifdef RT 2360 LDF [BO + 15 * SIZE], a1 2361 LDF [BO + 14 * SIZE], a2 2362 LDF [BO + 13 * SIZE], a3 2363 LDF [BO + 12 * SIZE], a4 2364 2365 FMUL a1, c07, c07 2366 2367 FNMSUB (aa2, cc07, cc05, cc05) 2368 FNMSUB (aa3, cc07, cc03, cc03) 2369 FNMSUB (aa4, cc07, cc01, cc01) 2370 2371 LDF [BO + 10 * SIZE], a1 2372 LDF [BO + 9 * SIZE], a2 2373 LDF [BO + 8 * SIZE], a3 2374 2375 FMUL a1, c05, c05 2376 2377 FNMSUB (aa2, cc05, cc03, cc03) 2378 FNMSUB (aa3, cc05, cc01, cc01) 2379 2380 LDF [BO + 5 * SIZE], a1 2381 LDF [BO + 4 * SIZE], a2 2382 2383 FMUL a1, c03, c03 2384 2385 FNMSUB (aa2, cc03, cc01, cc01) 2386 2387 LDF [BO + 0 * SIZE], a1 2388 2389 FMUL a1, c01, c01 2390#endif 2391 2392#ifdef LN 2393 add C1, -1 * SIZE, C1 2394 add C2, -1 * SIZE, C2 2395 add C3, -1 * SIZE, C3 2396 add C4, -1 * SIZE, C4 2397#endif 2398 2399#if defined(LN) || defined(LT) 2400 STF c01, [BO + 0 * SIZE] 2401 STF c03, [BO + 1 * SIZE] 2402 STF c05, [BO + 2 * SIZE] 2403 STF c07, [BO + 3 * SIZE] 2404#else 2405 STF c01, [AO + 0 * SIZE] 2406 STF 
c03, [AO + 1 * SIZE] 2407 STF c05, [AO + 2 * SIZE] 2408 STF c07, [AO + 3 * SIZE] 2409#endif 2410 2411 STF c01, [C1 + 0 * SIZE] 2412 STF c03, [C2 + 0 * SIZE] 2413 STF c05, [C3 + 0 * SIZE] 2414 STF c07, [C4 + 0 * SIZE] 2415 2416#ifdef RT 2417 sll K, BASE_SHIFT + 0, TEMP1 2418 add AORIG, TEMP1, AORIG 2419#endif 2420 2421#if defined(LT) || defined(RN) 2422 sub K, KK, TEMP1 2423 sll TEMP1, BASE_SHIFT + 0, TEMP2 2424 sll TEMP1, BASE_SHIFT + 2, TEMP1 2425 add AO, TEMP2, AO 2426 add BO, TEMP1, BO 2427#endif 2428 2429#ifdef LT 2430 add KK, 1, KK 2431#endif 2432 2433#ifdef LN 2434 sub KK, 1, KK 2435#endif 2436 .align 4 2437 2438.LL40: 2439 sra M, 1, I 2440 cmp I, 0 2441 ble,pn %icc, .LL49 2442 nop 2443 .align 4 2444 2445.LL32: 2446#if defined(LT) || defined(RN) 2447 mov B, BO 2448#else 2449#ifdef LN 2450 sll K, BASE_SHIFT + 1, TEMP1 2451 sub AORIG, TEMP1, AORIG 2452#endif 2453 2454 sll KK, BASE_SHIFT + 1, TEMP1 2455 sll KK, BASE_SHIFT + 2, TEMP2 2456 2457 add AORIG, TEMP1, AO 2458 add B, TEMP2, BO 2459#endif 2460 2461 LDF [AO + 0 * SIZE], a1 2462 LDF [AO + 1 * SIZE], a2 2463 2464 LDF [BO + 0 * SIZE], b1 2465 LDF [BO + 1 * SIZE], b2 2466 LDF [BO + 2 * SIZE], b3 2467 LDF [BO + 3 * SIZE], b4 2468 LDF [BO + 4 * SIZE], b5 2469 2470 LDF [BO + 5 * SIZE], b6 2471 FCLR (cc01) 2472 LDF [BO + 6 * SIZE], b7 2473 FCLR (cc02) 2474 LDF [BO + 7 * SIZE], b8 2475 FCLR (cc03) 2476 LDF [BO + 8 * SIZE], b9 2477 FCLR (cc04) 2478 2479 prefetch [C1 + 2 * SIZE], 3 2480 FCLR (cc05) 2481 prefetch [C2 + 2 * SIZE], 3 2482 FCLR (cc06) 2483 prefetch [C3 + 2 * SIZE], 3 2484 FCLR (cc07) 2485 prefetch [C4 + 2 * SIZE], 3 2486 FCLR (cc08) 2487 2488#if defined(LT) || defined(RN) 2489 sra KK, 2, L 2490#else 2491 sub K, KK, L 2492 sra L, 2, L 2493#endif 2494 cmp L, 0 2495 ble,pn %icc, .LL35 2496 nop 2497 .align 4 2498 2499.LL33: 2500 FMADD (aa1, bb1, cc01, cc01) 2501 LDF [AO + 2 * SIZE], a3 2502 FMADD (aa2, bb1, cc02, cc02) 2503 LDF [AO + 3 * SIZE], a4 2504 2505 FMADD (aa1, bb2, cc03, cc03) 2506 LDF [BO + 16 * 
SIZE], b1 2507 FMADD (aa2, bb2, cc04, cc04) 2508 LDF [BO + 9 * SIZE], b2 2509 2510 FMADD (aa1, bb3, cc05, cc05) 2511 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2512 FMADD (aa2, bb3, cc06, cc06) 2513 add L, -1, L 2514 2515 FMADD (aa1, bb4, cc07, cc07) 2516 LDF [BO + 10 * SIZE], b3 2517 FMADD (aa2, bb4, cc08, cc08) 2518 LDF [BO + 11 * SIZE], b4 2519 2520 FMADD (aa3, bb5, cc01, cc01) 2521 LDF [AO + 4 * SIZE], a1 2522 FMADD (aa4, bb5, cc02, cc02) 2523 LDF [AO + 5 * SIZE], a2 2524 2525 FMADD (aa3, bb6, cc03, cc03) 2526 LDF [BO + 12 * SIZE], b5 2527 FMADD (aa4, bb6, cc04, cc04) 2528 LDF [BO + 13 * SIZE], b6 2529 2530 FMADD (aa3, bb7, cc05, cc05) 2531 cmp L, 0 2532 FMADD (aa4, bb7, cc06, cc06) 2533 add AO, 8 * SIZE, AO 2534 2535 FMADD (aa3, bb8, cc07, cc07) 2536 LDF [BO + 14 * SIZE], b7 2537 FMADD (aa4, bb8, cc08, cc08) 2538 LDF [BO + 15 * SIZE], b8 2539 2540 FMADD (aa1, bb9, cc01, cc01) 2541 LDF [AO - 2 * SIZE], a3 2542 FMADD (aa2, bb9, cc02, cc02) 2543 LDF [AO - 1 * SIZE], a4 2544 2545 FMADD (aa1, bb2, cc03, cc03) 2546 LDF [BO + 24 * SIZE], b9 2547 FMADD (aa2, bb2, cc04, cc04) 2548 LDF [BO + 17 * SIZE], b2 2549 2550 FMADD (aa1, bb3, cc05, cc05) 2551 add BO, 16 * SIZE, BO 2552 FMADD (aa2, bb3, cc06, cc06) 2553 nop 2554 2555 FMADD (aa1, bb4, cc07, cc07) 2556 LDF [BO + 2 * SIZE], b3 2557 FMADD (aa2, bb4, cc08, cc08) 2558 LDF [BO + 3 * SIZE], b4 2559 2560 FMADD (aa3, bb5, cc01, cc01) 2561 LDF [AO + 0 * SIZE], a1 2562 FMADD (aa4, bb5, cc02, cc02) 2563 LDF [AO + 1 * SIZE], a2 2564 FMADD (aa3, bb6, cc03, cc03) 2565 LDF [BO + 4 * SIZE], b5 2566 FMADD (aa4, bb6, cc04, cc04) 2567 LDF [BO + 5 * SIZE], b6 2568 2569 FMADD (aa3, bb7, cc05, cc05) 2570 nop 2571 FMADD (aa4, bb7, cc06, cc06) 2572 LDF [BO + 6 * SIZE], b7 2573 2574 FMADD (aa3, bb8, cc07, cc07) 2575 FMADD (aa4, bb8, cc08, cc08) 2576 bg,pt %icc, .LL33 2577 LDF [BO + 7 * SIZE], b8 2578 .align 4 2579 2580.LL35: 2581#if defined(LT) || defined(RN) 2582 and KK, 3, L 2583#else 2584 sub K, KK, L 2585 and L, 3, L 
2586#endif 2587 cmp L, 0 2588 ble,a,pn %icc, .LL38 2589 nop 2590 .align 4 2591 2592.LL37: 2593 FMADD (aa1, bb1, cc01, cc01) 2594 add L, -1, L 2595 FMADD (aa2, bb1, cc02, cc02) 2596 LDF [BO + 4 * SIZE], b1 2597 2598 FMADD (aa1, bb2, cc03, cc03) 2599 add AO, 2 * SIZE, AO 2600 FMADD (aa2, bb2, cc04, cc04) 2601 LDF [BO + 5 * SIZE], b2 2602 2603 FMADD (aa1, bb3, cc05, cc05) 2604 cmp L, 0 2605 FMADD (aa2, bb3, cc06, cc06) 2606 LDF [BO + 6 * SIZE], b3 2607 2608 FMADD (aa1, bb4, cc07, cc07) 2609 LDF [AO + 0 * SIZE], a1 2610 FMADD (aa2, bb4, cc08, cc08) 2611 LDF [AO + 1 * SIZE], a2 2612 2613 LDF [BO + 7 * SIZE], b4 2614 bg,pt %icc, .LL37 2615 add BO, 4 * SIZE, BO 2616 .align 4 2617 2618.LL38: 2619#if defined(LN) || defined(RT) 2620#ifdef LN 2621 sub KK, 2, TEMP1 2622#else 2623 sub KK, 4, TEMP1 2624#endif 2625 sll TEMP1, BASE_SHIFT + 1, TEMP2 2626 sll TEMP1, BASE_SHIFT + 2, TEMP1 2627 2628 add AORIG, TEMP2, AO 2629 add B, TEMP1, BO 2630#endif 2631 2632#if defined(LN) || defined(LT) 2633 LDF [BO + 0 * SIZE], a1 2634 LDF [BO + 1 * SIZE], a2 2635 LDF [BO + 2 * SIZE], a3 2636 LDF [BO + 3 * SIZE], a4 2637 2638 LDF [BO + 4 * SIZE], b1 2639 LDF [BO + 5 * SIZE], b2 2640 LDF [BO + 6 * SIZE], b3 2641 LDF [BO + 7 * SIZE], b4 2642 2643 FSUB a1, c01, c01 2644 FSUB a2, c03, c03 2645 FSUB a3, c05, c05 2646 FSUB a4, c07, c07 2647 2648 FSUB b1, c02, c02 2649 FSUB b2, c04, c04 2650 FSUB b3, c06, c06 2651 FSUB b4, c08, c08 2652#else 2653 LDF [AO + 0 * SIZE], a1 2654 LDF [AO + 1 * SIZE], a2 2655 LDF [AO + 2 * SIZE], a3 2656 LDF [AO + 3 * SIZE], a4 2657 2658 LDF [AO + 4 * SIZE], b1 2659 LDF [AO + 5 * SIZE], b2 2660 LDF [AO + 6 * SIZE], b3 2661 LDF [AO + 7 * SIZE], b4 2662 2663 FSUB a1, c01, c01 2664 FSUB a2, c02, c02 2665 FSUB a3, c03, c03 2666 FSUB a4, c04, c04 2667 2668 FSUB b1, c05, c05 2669 FSUB b2, c06, c06 2670 FSUB b3, c07, c07 2671 FSUB b4, c08, c08 2672 2673#endif 2674 2675#ifdef LN 2676 LDF [AO + 3 * SIZE], a1 2677 LDF [AO + 2 * SIZE], a2 2678 LDF [AO + 0 * SIZE], a3 2679 2680 FMUL a1, 
c02, c02 2681 FMUL a1, c04, c04 2682 FMUL a1, c06, c06 2683 FMUL a1, c08, c08 2684 2685 FNMSUB (aa2, cc02, cc01, cc01) 2686 FNMSUB (aa2, cc04, cc03, cc03) 2687 FNMSUB (aa2, cc06, cc05, cc05) 2688 FNMSUB (aa2, cc08, cc07, cc07) 2689 2690 FMUL a3, c01, c01 2691 FMUL a3, c03, c03 2692 FMUL a3, c05, c05 2693 FMUL a3, c07, c07 2694#endif 2695 2696#ifdef LT 2697 LDF [AO + 0 * SIZE], a1 2698 LDF [AO + 1 * SIZE], a2 2699 LDF [AO + 3 * SIZE], a3 2700 2701 FMUL a1, c01, c01 2702 FMUL a1, c03, c03 2703 FMUL a1, c05, c05 2704 FMUL a1, c07, c07 2705 2706 FNMSUB (aa2, cc01, cc02, cc02) 2707 FNMSUB (aa2, cc03, cc04, cc04) 2708 FNMSUB (aa2, cc05, cc06, cc06) 2709 FNMSUB (aa2, cc07, cc08, cc08) 2710 2711 FMUL a3, c02, c02 2712 FMUL a3, c04, c04 2713 FMUL a3, c06, c06 2714 FMUL a3, c08, c08 2715#endif 2716 2717#ifdef RN 2718 LDF [BO + 0 * SIZE], a1 2719 LDF [BO + 1 * SIZE], a2 2720 LDF [BO + 2 * SIZE], a3 2721 LDF [BO + 3 * SIZE], a4 2722 2723 FMUL a1, c01, c01 2724 FMUL a1, c02, c02 2725 2726 FNMSUB (aa2, cc01, cc03, cc03) 2727 FNMSUB (aa2, cc02, cc04, cc04) 2728 FNMSUB (aa3, cc01, cc05, cc05) 2729 FNMSUB (aa3, cc02, cc06, cc06) 2730 FNMSUB (aa4, cc01, cc07, cc07) 2731 FNMSUB (aa4, cc02, cc08, cc08) 2732 2733 LDF [BO + 5 * SIZE], a1 2734 LDF [BO + 6 * SIZE], a2 2735 LDF [BO + 7 * SIZE], a3 2736 2737 FMUL a1, c03, c03 2738 FMUL a1, c04, c04 2739 2740 FNMSUB (aa2, cc03, cc05, cc05) 2741 FNMSUB (aa2, cc04, cc06, cc06) 2742 FNMSUB (aa3, cc03, cc07, cc07) 2743 FNMSUB (aa3, cc04, cc08, cc08) 2744 2745 LDF [BO + 10 * SIZE], a1 2746 LDF [BO + 11 * SIZE], a2 2747 2748 FMUL a1, c05, c05 2749 FMUL a1, c06, c06 2750 2751 FNMSUB (aa2, cc05, cc07, cc07) 2752 FNMSUB (aa2, cc06, cc08, cc08) 2753 2754 LDF [BO + 15 * SIZE], a1 2755 2756 FMUL a1, c07, c07 2757 FMUL a1, c08, c08 2758#endif 2759 2760#ifdef RT 2761 LDF [BO + 15 * SIZE], a1 2762 LDF [BO + 14 * SIZE], a2 2763 LDF [BO + 13 * SIZE], a3 2764 LDF [BO + 12 * SIZE], a4 2765 2766 FMUL a1, c08, c08 2767 FMUL a1, c07, c07 2768 2769 FNMSUB (aa2, 
cc08, cc06, cc06) 2770 FNMSUB (aa2, cc07, cc05, cc05) 2771 FNMSUB (aa3, cc08, cc04, cc04) 2772 FNMSUB (aa3, cc07, cc03, cc03) 2773 FNMSUB (aa4, cc08, cc02, cc02) 2774 FNMSUB (aa4, cc07, cc01, cc01) 2775 2776 LDF [BO + 10 * SIZE], a1 2777 LDF [BO + 9 * SIZE], a2 2778 LDF [BO + 8 * SIZE], a3 2779 2780 FMUL a1, c06, c06 2781 FMUL a1, c05, c05 2782 2783 FNMSUB (aa2, cc06, cc04, cc04) 2784 FNMSUB (aa2, cc05, cc03, cc03) 2785 FNMSUB (aa3, cc06, cc02, cc02) 2786 FNMSUB (aa3, cc05, cc01, cc01) 2787 2788 LDF [BO + 5 * SIZE], a1 2789 LDF [BO + 4 * SIZE], a2 2790 2791 FMUL a1, c04, c04 2792 FMUL a1, c03, c03 2793 2794 FNMSUB (aa2, cc04, cc02, cc02) 2795 FNMSUB (aa2, cc03, cc01, cc01) 2796 2797 LDF [BO + 0 * SIZE], a1 2798 2799 FMUL a1, c02, c02 2800 FMUL a1, c01, c01 2801#endif 2802 2803#ifdef LN 2804 add C1, -2 * SIZE, C1 2805 add C2, -2 * SIZE, C2 2806 add C3, -2 * SIZE, C3 2807 add C4, -2 * SIZE, C4 2808#endif 2809 2810#if defined(LN) || defined(LT) 2811 STF c01, [BO + 0 * SIZE] 2812 STF c03, [BO + 1 * SIZE] 2813 STF c05, [BO + 2 * SIZE] 2814 STF c07, [BO + 3 * SIZE] 2815 2816 STF c02, [BO + 4 * SIZE] 2817 STF c04, [BO + 5 * SIZE] 2818 STF c06, [BO + 6 * SIZE] 2819 STF c08, [BO + 7 * SIZE] 2820#else 2821 STF c01, [AO + 0 * SIZE] 2822 STF c02, [AO + 1 * SIZE] 2823 STF c03, [AO + 2 * SIZE] 2824 STF c04, [AO + 3 * SIZE] 2825 2826 STF c05, [AO + 4 * SIZE] 2827 STF c06, [AO + 5 * SIZE] 2828 STF c07, [AO + 6 * SIZE] 2829 STF c08, [AO + 7 * SIZE] 2830#endif 2831 2832 STF c01, [C1 + 0 * SIZE] 2833 STF c02, [C1 + 1 * SIZE] 2834 STF c03, [C2 + 0 * SIZE] 2835 STF c04, [C2 + 1 * SIZE] 2836 2837 STF c05, [C3 + 0 * SIZE] 2838 STF c06, [C3 + 1 * SIZE] 2839 STF c07, [C4 + 0 * SIZE] 2840 STF c08, [C4 + 1 * SIZE] 2841 2842#ifndef LN 2843 add C1, 2 * SIZE, C1 2844 add C2, 2 * SIZE, C2 2845 add C3, 2 * SIZE, C3 2846 add C4, 2 * SIZE, C4 2847#endif 2848 2849#ifdef RT 2850 sll K, BASE_SHIFT + 1, TEMP1 2851 add AORIG, TEMP1, AORIG 2852#endif 2853 2854#if defined(LT) || defined(RN) 2855 sub K, 
KK, TEMP1 2856 sll TEMP1, BASE_SHIFT + 1, TEMP2 2857 sll TEMP1, BASE_SHIFT + 2, TEMP1 2858 add AO, TEMP2, AO 2859 add BO, TEMP1, BO 2860#endif 2861 2862#ifdef LT 2863 add KK, 2, KK 2864#endif 2865 2866#ifdef LN 2867 sub KK, 2, KK 2868#endif 2869 2870 add I, -1, I 2871 cmp I, 0 2872 bg,pt %icc, .LL32 2873 nop 2874 2875.LL49: 2876#ifdef LN 2877 sll K, BASE_SHIFT + 2, TEMP1 2878 add B, TEMP1, B 2879#endif 2880 2881#if defined(LT) || defined(RN) 2882 mov BO, B 2883#endif 2884 2885#ifdef RN 2886 add KK, 4, KK 2887#endif 2888 2889#ifdef RT 2890 sub KK, 4, KK 2891#endif 2892 .align 4 2893 2894.LL50: 2895 and N, 2, J 2896 cmp J, 0 2897 ble,pn %icc, .LL70 2898 nop 2899 2900#ifdef RT 2901 sll K, BASE_SHIFT + 1, TEMP1 2902 sub B, TEMP1, B 2903#endif 2904 2905#ifndef RT 2906 mov C, C1 2907 add C, LDC, C2 2908 add C2, LDC, C 2909#else 2910 sub C, LDC, C2 2911 sub C2, LDC, C1 2912 sub C2, LDC, C 2913#endif 2914 2915#ifdef LN 2916 add M, OFFSET, KK 2917#endif 2918 2919#ifdef LT 2920 mov OFFSET, KK 2921#endif 2922 2923#if defined(LN) || defined(RT) 2924 mov A, AORIG 2925#else 2926 mov A, AO 2927#endif 2928 2929 and M, 1, I 2930 cmp I, 0 2931 ble,pn %icc, .LL60 2932 nop 2933 2934#if defined(LT) || defined(RN) 2935 mov B, BO 2936#else 2937#ifdef LN 2938 sll K, BASE_SHIFT + 0, TEMP1 2939 sub AORIG, TEMP1, AORIG 2940#endif 2941 2942 sll KK, BASE_SHIFT + 0, TEMP1 2943 sll KK, BASE_SHIFT + 1, TEMP2 2944 2945 add AORIG, TEMP1, AO 2946 add B, TEMP2, BO 2947#endif 2948 2949 LDF [AO + 0 * SIZE], a1 2950 LDF [AO + 1 * SIZE], a2 2951 LDF [AO + 2 * SIZE], a3 2952 LDF [AO + 3 * SIZE], a4 2953 2954 LDF [BO + 0 * SIZE], b1 2955 LDF [BO + 1 * SIZE], b2 2956 LDF [BO + 2 * SIZE], b3 2957 LDF [BO + 3 * SIZE], b4 2958 LDF [BO + 4 * SIZE], b5 2959 LDF [BO + 5 * SIZE], b6 2960 LDF [BO + 6 * SIZE], b7 2961 FCLR (cc01) 2962 LDF [BO + 7 * SIZE], b8 2963 FCLR (cc03) 2964 2965#if defined(LT) || defined(RN) 2966 sra KK, 2, L 2967#else 2968 sub K, KK, L 2969 sra L, 2, L 2970#endif 2971 cmp L, 0 2972 ble,pn 
%icc, .LL65 2973 nop 2974 .align 4 2975 2976.LL63: 2977 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2978 add L, -1, L 2979 2980 FMADD (aa1, bb1, cc01, cc01) 2981 LDF [BO + 8 * SIZE], b1 2982 FMADD (aa1, bb2, cc03, cc03) 2983 LDF [BO + 9 * SIZE], b2 2984 2985 LDF [AO + 4 * SIZE], a1 2986 cmp L, 0 2987 2988 FMADD (aa2, bb3, cc01, cc01) 2989 LDF [BO + 10 * SIZE], b3 2990 FMADD (aa2, bb4, cc03, cc03) 2991 LDF [BO + 11 * SIZE], b4 2992 2993 LDF [AO + 5 * SIZE], a2 2994 add AO, 4 * SIZE, AO 2995 2996 FMADD (aa3, bb5, cc01, cc01) 2997 LDF [BO + 12 * SIZE], b5 2998 FMADD (aa3, bb6, cc03, cc03) 2999 LDF [BO + 13 * SIZE], b6 3000 3001 LDF [AO + 2 * SIZE], a3 3002 add BO, 8 * SIZE, BO 3003 3004 FMADD (aa4, bb7, cc01, cc01) 3005 LDF [BO + 6 * SIZE], b7 3006 FMADD (aa4, bb8, cc03, cc03) 3007 LDF [BO + 7 * SIZE], b8 3008 3009 bg,pt %icc, .LL63 3010 LDF [AO + 3 * SIZE], a4 3011 .align 4 3012 3013.LL65: 3014#if defined(LT) || defined(RN) 3015 and KK, 3, L 3016#else 3017 sub K, KK, L 3018 and L, 3, L 3019#endif 3020 cmp L, 0 3021 ble,a,pn %icc, .LL68 3022 nop 3023 .align 4 3024 3025.LL67: 3026 FMADD (aa1, bb1, cc01, cc01) 3027 LDF [BO + 2 * SIZE], b1 3028 FMADD (aa1, bb2, cc03, cc03) 3029 LDF [BO + 3 * SIZE], b2 3030 3031 LDF [AO + 1 * SIZE], a1 3032 add L, -1, L 3033 add AO, 1 * SIZE, AO 3034 cmp L, 0 3035 3036 bg,pt %icc, .LL67 3037 add BO, 2 * SIZE, BO 3038 .align 4 3039 3040.LL68: 3041#if defined(LN) || defined(RT) 3042#ifdef LN 3043 sub KK, 1, TEMP1 3044#else 3045 sub KK, 2, TEMP1 3046#endif 3047 sll TEMP1, BASE_SHIFT + 0, TEMP2 3048 sll TEMP1, BASE_SHIFT + 1, TEMP1 3049 3050 add AORIG, TEMP2, AO 3051 add B, TEMP1, BO 3052#endif 3053 3054#if defined(LN) || defined(LT) 3055 LDF [BO + 0 * SIZE], a1 3056 LDF [BO + 1 * SIZE], a2 3057 3058 FSUB a1, c01, c01 3059 FSUB a2, c03, c03 3060#else 3061 LDF [AO + 0 * SIZE], a1 3062 LDF [AO + 1 * SIZE], a2 3063 3064 FSUB a1, c01, c01 3065 FSUB a2, c03, c03 3066#endif 3067 3068#if defined(LN) || defined(LT) 3069 LDF [AO + 0 
* SIZE], a1 3070 3071 FMUL a1, c01, c01 3072 FMUL a1, c03, c03 3073#endif 3074 3075#ifdef RN 3076 LDF [BO + 0 * SIZE], a1 3077 LDF [BO + 1 * SIZE], a2 3078 3079 FMUL a1, c01, c01 3080 3081 FNMSUB (aa2, cc01, cc03, cc03) 3082 3083 LDF [BO + 3 * SIZE], a1 3084 3085 FMUL a1, c03, c03 3086#endif 3087 3088#ifdef RT 3089 LDF [BO + 3 * SIZE], a1 3090 LDF [BO + 2 * SIZE], a2 3091 3092 FMUL a1, c03, c03 3093 3094 FNMSUB (aa2, cc03, cc01, cc01) 3095 3096 LDF [BO + 0 * SIZE], a1 3097 3098 FMUL a1, c01, c01 3099#endif 3100 3101#ifdef LN 3102 add C1, -1 * SIZE, C1 3103 add C2, -1 * SIZE, C2 3104#endif 3105 3106#if defined(LN) || defined(LT) 3107 STF c01, [BO + 0 * SIZE] 3108 STF c03, [BO + 1 * SIZE] 3109#else 3110 STF c01, [AO + 0 * SIZE] 3111 STF c03, [AO + 1 * SIZE] 3112#endif 3113 3114 STF c01, [C1 + 0 * SIZE] 3115 STF c03, [C2 + 0 * SIZE] 3116 3117#ifdef RT 3118 sll K, BASE_SHIFT + 0, TEMP1 3119 add AORIG, TEMP1, AORIG 3120#endif 3121 3122#if defined(LT) || defined(RN) 3123 sub K, KK, TEMP1 3124 sll TEMP1, BASE_SHIFT + 0, TEMP2 3125 sll TEMP1, BASE_SHIFT + 1, TEMP1 3126 add AO, TEMP2, AO 3127 add BO, TEMP1, BO 3128#endif 3129 3130#ifdef LT 3131 add KK, 1, KK 3132#endif 3133 3134#ifdef LN 3135 sub KK, 1, KK 3136#endif 3137 .align 4 3138 3139.LL60: 3140 sra M, 1, I 3141 cmp I, 0 3142 ble,pn %icc, .LL69 3143 nop 3144 .align 4 3145 3146.LL52: 3147#if defined(LT) || defined(RN) 3148 mov B, BO 3149#else 3150#ifdef LN 3151 sll K, BASE_SHIFT + 1, TEMP1 3152 sub AORIG, TEMP1, AORIG 3153#endif 3154 3155 sll KK, BASE_SHIFT + 1, TEMP1 3156 sll KK, BASE_SHIFT + 1, TEMP2 3157 3158 add AORIG, TEMP1, AO 3159 add B, TEMP2, BO 3160#endif 3161 3162 LDF [AO + 0 * SIZE], a1 3163 LDF [AO + 1 * SIZE], a2 3164 LDF [AO + 2 * SIZE], a3 3165 LDF [AO + 3 * SIZE], a4 3166 3167 LDF [BO + 0 * SIZE], b1 3168 LDF [BO + 1 * SIZE], b2 3169 LDF [BO + 2 * SIZE], b3 3170 FCLR (cc01) 3171 LDF [BO + 3 * SIZE], b4 3172 FCLR (cc02) 3173 3174 LDF [BO + 4 * SIZE], b5 3175 FCLR (cc03) 3176 LDF [BO + 5 * SIZE], b6 3177 
FCLR (cc04) 3178 LDF [BO + 6 * SIZE], b7 3179 FCLR (cc05) 3180 LDF [BO + 7 * SIZE], b8 3181 FCLR (cc06) 3182 3183 prefetch [C1 + 2 * SIZE], 3 3184 FCLR (cc07) 3185 prefetch [C2 + 2 * SIZE], 3 3186 FCLR (cc08) 3187 3188#if defined(LT) || defined(RN) 3189 sra KK, 2, L 3190#else 3191 sub K, KK, L 3192 sra L, 2, L 3193#endif 3194 cmp L, 0 3195 ble,pn %icc, .LL55 3196 nop 3197 .align 4 3198 3199.LL53: 3200 FMADD (aa1, bb1, cc01, cc01) 3201 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3202 FMADD (aa2, bb1, cc02, cc02) 3203 LDF [BO + 8 * SIZE], b1 3204 3205 FMADD (aa1, bb2, cc03, cc03) 3206 LDF [AO + 4 * SIZE], a1 3207 FMADD (aa2, bb2, cc04, cc04) 3208 LDF [AO + 5 * SIZE], a2 3209 3210 FMADD (aa3, bb3, cc01, cc01) 3211 LDF [BO + 9 * SIZE], b2 3212 FMADD (aa4, bb3, cc02, cc02) 3213 LDF [BO + 10 * SIZE], b3 3214 3215 FMADD (aa3, bb4, cc03, cc03) 3216 LDF [AO + 6 * SIZE], a3 3217 FMADD (aa4, bb4, cc04, cc04) 3218 LDF [AO + 7 * SIZE], a4 3219 3220 FMADD (aa1, bb5, cc01, cc01) 3221 LDF [BO + 11 * SIZE], b4 3222 FMADD (aa2, bb5, cc02, cc02) 3223 LDF [BO + 12 * SIZE], b5 3224 3225 FMADD (aa1, bb6, cc03, cc03) 3226 LDF [AO + 8 * SIZE], a1 3227 FMADD (aa2, bb6, cc04, cc04) 3228 LDF [AO + 9 * SIZE], a2 3229 3230 FMADD (aa3, bb7, cc01, cc01) 3231 LDF [BO + 13 * SIZE], b6 3232 3233 FMADD (aa4, bb7, cc02, cc02) 3234 LDF [BO + 14 * SIZE], b7 3235 3236 FMADD (aa3, bb8, cc03, cc03) 3237 LDF [AO + 10 * SIZE], a3 3238 FMADD (aa4, bb8, cc04, cc04) 3239 LDF [AO + 11 * SIZE], a4 3240 3241 add AO, 8 * SIZE, AO 3242 add L, -1, L 3243 add BO, 8 * SIZE, BO 3244 cmp L, 0 3245 3246 bg,pt %icc, .LL53 3247 LDF [BO + 7 * SIZE], b8 3248 .align 4 3249 3250.LL55: 3251#if defined(LT) || defined(RN) 3252 and KK, 3, L 3253#else 3254 sub K, KK, L 3255 and L, 3, L 3256#endif 3257 cmp L, 0 3258 ble,a,pn %icc, .LL58 3259 nop 3260 .align 4 3261 3262.LL57: 3263 FMADD (aa1, bb1, cc01, cc01) 3264 add L, -1, L 3265 FMADD (aa2, bb1, cc02, cc02) 3266 LDF [BO + 2 * SIZE], b1 3267 3268 FMADD (aa1, bb2, 
cc03, cc03) 3269 LDF [AO + 2 * SIZE], a1 3270 FMADD (aa2, bb2, cc04, cc04) 3271 LDF [AO + 3 * SIZE], a2 3272 3273 add AO, 2 * SIZE, AO 3274 cmp L, 0 3275 add BO, 2 * SIZE, BO 3276 bg,pt %icc, .LL57 3277 LDF [BO + 1 * SIZE], b2 3278 .align 4 3279 3280.LL58: 3281#if defined(LN) || defined(RT) 3282#ifdef LN 3283 sub KK, 2, TEMP1 3284#else 3285 sub KK, 2, TEMP1 3286#endif 3287 sll TEMP1, BASE_SHIFT + 1, TEMP2 3288 sll TEMP1, BASE_SHIFT + 1, TEMP1 3289 3290 add AORIG, TEMP2, AO 3291 add B, TEMP1, BO 3292#endif 3293 3294#if defined(LN) || defined(LT) 3295 LDF [BO + 0 * SIZE], a1 3296 LDF [BO + 1 * SIZE], a2 3297 LDF [BO + 2 * SIZE], a3 3298 LDF [BO + 3 * SIZE], a4 3299 3300 FSUB a1, c01, c01 3301 FSUB a2, c03, c03 3302 FSUB a3, c02, c02 3303 FSUB a4, c04, c04 3304#else 3305 LDF [AO + 0 * SIZE], a1 3306 LDF [AO + 1 * SIZE], a2 3307 LDF [AO + 2 * SIZE], a3 3308 LDF [AO + 3 * SIZE], a4 3309 3310 FSUB a1, c01, c01 3311 FSUB a2, c02, c02 3312 FSUB a3, c03, c03 3313 FSUB a4, c04, c04 3314#endif 3315 3316#ifdef LN 3317 LDF [AO + 3 * SIZE], a1 3318 LDF [AO + 2 * SIZE], a2 3319 LDF [AO + 0 * SIZE], a3 3320 3321 FMUL a1, c02, c02 3322 FMUL a1, c04, c04 3323 3324 FNMSUB (aa2, cc02, cc01, cc01) 3325 FNMSUB (aa2, cc04, cc03, cc03) 3326 3327 FMUL a3, c01, c01 3328 FMUL a3, c03, c03 3329#endif 3330 3331#ifdef LT 3332 LDF [AO + 0 * SIZE], a1 3333 LDF [AO + 1 * SIZE], a2 3334 LDF [AO + 3 * SIZE], a3 3335 3336 FMUL a1, c01, c01 3337 FMUL a1, c03, c03 3338 3339 FNMSUB (aa2, cc01, cc02, cc02) 3340 FNMSUB (aa2, cc03, cc04, cc04) 3341 3342 FMUL a3, c02, c02 3343 FMUL a3, c04, c04 3344#endif 3345 3346#ifdef RN 3347 LDF [BO + 0 * SIZE], a1 3348 LDF [BO + 1 * SIZE], a2 3349 3350 FMUL a1, c01, c01 3351 FMUL a1, c02, c02 3352 3353 FNMSUB (aa2, cc01, cc03, cc03) 3354 FNMSUB (aa2, cc02, cc04, cc04) 3355 3356 LDF [BO + 3 * SIZE], a1 3357 3358 FMUL a1, c03, c03 3359 FMUL a1, c04, c04 3360#endif 3361 3362#ifdef RT 3363 LDF [BO + 3 * SIZE], a1 3364 LDF [BO + 2 * SIZE], a2 3365 3366 FMUL a1, c04, c04 
3367 FMUL a1, c03, c03 3368 3369 FNMSUB (aa2, cc04, cc02, cc02) 3370 FNMSUB (aa2, cc03, cc01, cc01) 3371 3372 LDF [BO + 0 * SIZE], a1 3373 3374 FMUL a1, c02, c02 3375 FMUL a1, c01, c01 3376#endif 3377 3378#ifdef LN 3379 add C1, -2 * SIZE, C1 3380 add C2, -2 * SIZE, C2 3381#endif 3382 3383#if defined(LN) || defined(LT) 3384 STF c01, [BO + 0 * SIZE] 3385 STF c03, [BO + 1 * SIZE] 3386 STF c02, [BO + 2 * SIZE] 3387 STF c04, [BO + 3 * SIZE] 3388#else 3389 STF c01, [AO + 0 * SIZE] 3390 STF c02, [AO + 1 * SIZE] 3391 STF c03, [AO + 2 * SIZE] 3392 STF c04, [AO + 3 * SIZE] 3393#endif 3394 3395 STF c01, [C1 + 0 * SIZE] 3396 STF c02, [C1 + 1 * SIZE] 3397 STF c03, [C2 + 0 * SIZE] 3398 STF c04, [C2 + 1 * SIZE] 3399 3400#ifndef LN 3401 add C1, 2 * SIZE, C1 3402 add C2, 2 * SIZE, C2 3403#endif 3404 3405#ifdef RT 3406 sll K, BASE_SHIFT + 1, TEMP1 3407 add AORIG, TEMP1, AORIG 3408#endif 3409 3410#if defined(LT) || defined(RN) 3411 sub K, KK, TEMP1 3412 sll TEMP1, BASE_SHIFT + 1, TEMP2 3413 sll TEMP1, BASE_SHIFT + 1, TEMP1 3414 add AO, TEMP2, AO 3415 add BO, TEMP1, BO 3416#endif 3417 3418#ifdef LT 3419 add KK, 2, KK 3420#endif 3421 3422#ifdef LN 3423 sub KK, 2, KK 3424#endif 3425 3426 add I, -1, I 3427 cmp I, 0 3428 bg,pt %icc, .LL52 3429 nop 3430 .align 4 3431 3432.LL69: 3433#ifdef LN 3434 sll K, BASE_SHIFT + 1, TEMP1 3435 add B, TEMP1, B 3436#endif 3437 3438#if defined(LT) || defined(RN) 3439 mov BO, B 3440#endif 3441 3442#ifdef RN 3443 add KK, 2, KK 3444#endif 3445 3446#ifdef RT 3447 sub KK, 2, KK 3448#endif 3449 .align 4 3450 3451.LL70: 3452 and N, 1, J 3453 cmp J, 0 3454 ble,pn %icc, .LL999 3455 nop 3456 3457#ifdef RT 3458 sll K, BASE_SHIFT, TEMP1 3459 sub B, TEMP1, B 3460#endif 3461 3462#ifndef RT 3463 mov C, C1 3464 add C1, LDC, C 3465#else 3466 sub C, LDC, C1 3467 sub C, LDC, C 3468#endif 3469 3470#ifdef LN 3471 add M, OFFSET, KK 3472#endif 3473 3474#ifdef LT 3475 mov OFFSET, KK 3476#endif 3477 3478#if defined(LN) || defined(RT) 3479 mov A, AORIG 3480#else 3481 mov A, AO 
3482#endif 3483 3484 and M, 1, I 3485 cmp I, 0 3486 ble,pn %icc, .LL80 3487 nop 3488 3489#if defined(LT) || defined(RN) 3490 mov B, BO 3491#else 3492#ifdef LN 3493 sll K, BASE_SHIFT + 0, TEMP1 3494 sub AORIG, TEMP1, AORIG 3495#endif 3496 3497 sll KK, BASE_SHIFT + 0, TEMP1 3498 sll KK, BASE_SHIFT + 0, TEMP2 3499 3500 add AORIG, TEMP1, AO 3501 add B, TEMP2, BO 3502#endif 3503 3504 LDF [AO + 0 * SIZE], a1 3505 LDF [BO + 0 * SIZE], b1 3506 LDF [AO + 1 * SIZE], a2 3507 LDF [BO + 1 * SIZE], b2 3508 LDF [AO + 2 * SIZE], a3 3509 LDF [BO + 2 * SIZE], b3 3510 LDF [AO + 3 * SIZE], a4 3511 LDF [BO + 3 * SIZE], b4 3512 3513#if defined(LT) || defined(RN) 3514 sra KK, 2, L 3515#else 3516 sub K, KK, L 3517 sra L, 2, L 3518#endif 3519 cmp L, 0 3520 ble,pn %icc, .LL85 3521 FCLR (cc01) 3522 .align 4 3523 3524.LL83: 3525 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3526 add L, -1, L 3527 3528 FMADD (aa1, bb1, cc01, cc01) 3529 LDF [AO + 4 * SIZE], a1 3530 LDF [BO + 4 * SIZE], b1 3531 3532 FMADD (aa2, bb2, cc01, cc01) 3533 LDF [AO + 5 * SIZE], a2 3534 LDF [BO + 5 * SIZE], b2 3535 3536 FMADD (aa3, bb3, cc01, cc01) 3537 LDF [AO + 6 * SIZE], a3 3538 LDF [BO + 6 * SIZE], b3 3539 3540 FMADD (aa4, bb4, cc01, cc01) 3541 LDF [AO + 7 * SIZE], a4 3542 LDF [BO + 7 * SIZE], b4 3543 3544 add AO, 4 * SIZE, AO 3545 cmp L, 0 3546 3547 bg,pt %icc, .LL83 3548 add BO, 4 * SIZE, BO 3549 .align 4 3550 3551.LL85: 3552#if defined(LT) || defined(RN) 3553 and KK, 3, L 3554#else 3555 sub K, KK, L 3556 and L, 3, L 3557#endif 3558 cmp L, 0 3559 ble,a,pn %icc, .LL88 3560 nop 3561 .align 4 3562 3563.LL87: 3564 FMADD (aa1, bb1, cc01, cc01) 3565 LDF [AO + 1 * SIZE], a1 3566 LDF [BO + 1 * SIZE], b1 3567 3568 add AO, 1 * SIZE, AO 3569 add L, -1, L 3570 cmp L, 0 3571 bg,pt %icc, .LL87 3572 add BO, 1 * SIZE, BO 3573 .align 4 3574 3575.LL88: 3576#if defined(LN) || defined(RT) 3577#ifdef LN 3578 sub KK, 1, TEMP1 3579#else 3580 sub KK, 1, TEMP1 3581#endif 3582 sll TEMP1, BASE_SHIFT + 0, TEMP2 3583 sll 
TEMP1, BASE_SHIFT + 0, TEMP1 3584 3585 add AORIG, TEMP2, AO 3586 add B, TEMP1, BO 3587#endif 3588 3589#if defined(LN) || defined(LT) 3590 LDF [BO + 0 * SIZE], a1 3591 3592 FSUB a1, c01, c01 3593#else 3594 LDF [AO + 0 * SIZE], a1 3595 3596 FSUB a1, c01, c01 3597#endif 3598 3599#if defined(LN) || defined(LT) 3600 LDF [AO + 0 * SIZE], a1 3601 3602 FMUL a1, c01, c01 3603#endif 3604 3605#if defined(RN) || defined(RT) 3606 LDF [BO + 0 * SIZE], a1 3607 3608 FMUL a1, c01, c01 3609#endif 3610 3611#ifdef LN 3612 add C1, -1 * SIZE, C1 3613#endif 3614 3615#if defined(LN) || defined(LT) 3616 STF c01, [BO + 0 * SIZE] 3617#else 3618 STF c01, [AO + 0 * SIZE] 3619#endif 3620 3621 STF c01, [C1 + 0 * SIZE] 3622 3623#ifdef RT 3624 sll K, BASE_SHIFT + 0, TEMP1 3625 add AORIG, TEMP1, AORIG 3626#endif 3627 3628#if defined(LT) || defined(RN) 3629 sub K, KK, TEMP1 3630 sll TEMP1, BASE_SHIFT + 0, TEMP2 3631 sll TEMP1, BASE_SHIFT + 0, TEMP1 3632 add AO, TEMP2, AO 3633 add BO, TEMP1, BO 3634#endif 3635 3636#ifdef LT 3637 add KK, 1, KK 3638#endif 3639 3640#ifdef LN 3641 sub KK, 1, KK 3642#endif 3643 .align 4 3644 3645.LL80: 3646 sra M, 1, I 3647 cmp I, 0 3648 ble,pn %icc, .LL89 3649 nop 3650 .align 4 3651 3652.LL72: 3653#if defined(LT) || defined(RN) 3654 mov B, BO 3655#else 3656#ifdef LN 3657 sll K, BASE_SHIFT + 1, TEMP1 3658 sub AORIG, TEMP1, AORIG 3659#endif 3660 3661 sll KK, BASE_SHIFT + 1, TEMP1 3662 sll KK, BASE_SHIFT + 0, TEMP2 3663 3664 add AORIG, TEMP1, AO 3665 add B, TEMP2, BO 3666#endif 3667 3668 LDF [AO + 0 * SIZE], a1 3669 LDF [AO + 1 * SIZE], a2 3670 LDF [AO + 2 * SIZE], a3 3671 LDF [AO + 3 * SIZE], a4 3672 3673 LDF [BO + 0 * SIZE], b1 3674 LDF [BO + 1 * SIZE], b2 3675 LDF [BO + 2 * SIZE], b3 3676 FCLR (cc01) 3677 LDF [BO + 3 * SIZE], b4 3678 FCLR (cc02) 3679 3680 prefetch [C1 + 2 * SIZE], 3 3681 3682#if defined(LT) || defined(RN) 3683 sra KK, 2, L 3684#else 3685 sub K, KK, L 3686 sra L, 2, L 3687#endif 3688 cmp L, 0 3689 ble,pn %icc, .LL75 3690 nop 3691 3692.LL73: 3693 prefetch 
[AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3694 add L, -1, L 3695 3696 FMADD (aa1, bb1, cc01, cc01) 3697 LDF [AO + 4 * SIZE], a1 3698 FMADD (aa2, bb1, cc02, cc02) 3699 LDF [AO + 5 * SIZE], a2 3700 3701 LDF [BO + 4 * SIZE], b1 3702 cmp L, 0 3703 3704 FMADD (aa3, bb2, cc01, cc01) 3705 LDF [AO + 6 * SIZE], a3 3706 FMADD (aa4, bb2, cc02, cc02) 3707 LDF [AO + 7 * SIZE], a4 3708 3709 LDF [BO + 5 * SIZE], b2 3710 add BO, 4 * SIZE, BO 3711 3712 FMADD (aa1, bb3, cc01, cc01) 3713 LDF [AO + 8 * SIZE], a1 3714 FMADD (aa2, bb3, cc02, cc02) 3715 LDF [AO + 9 * SIZE], a2 3716 3717 LDF [BO + 2 * SIZE], b3 3718 add AO, 8 * SIZE, AO 3719 3720 FMADD (aa3, bb4, cc01, cc01) 3721 LDF [AO + 2 * SIZE], a3 3722 FMADD (aa4, bb4, cc02, cc02) 3723 LDF [AO + 3 * SIZE], a4 3724 3725 bg,pt %icc, .LL73 3726 LDF [BO + 3 * SIZE], b4 3727 .align 4 3728 3729.LL75: 3730#if defined(LT) || defined(RN) 3731 and KK, 3, L 3732#else 3733 sub K, KK, L 3734 and L, 3, L 3735#endif 3736 cmp L, 0 3737 ble,a,pn %icc, .LL78 3738 nop 3739 .align 4 3740 3741.LL77: 3742 FMADD (aa1, bb1, cc01, cc01) 3743 LDF [AO + 2 * SIZE], a1 3744 FMADD (aa2, bb1, cc02, cc02) 3745 LDF [AO + 3 * SIZE], a2 3746 3747 LDF [BO + 1 * SIZE], b1 3748 add L, -1, L 3749 add AO, 2 * SIZE, AO 3750 cmp L, 0 3751 bg,pt %icc, .LL77 3752 add BO, 1 * SIZE, BO 3753 .align 4 3754 3755.LL78: 3756#if defined(LN) || defined(RT) 3757#ifdef LN 3758 sub KK, 2, TEMP1 3759#else 3760 sub KK, 1, TEMP1 3761#endif 3762 sll TEMP1, BASE_SHIFT + 1, TEMP2 3763 sll TEMP1, BASE_SHIFT + 0, TEMP1 3764 3765 add AORIG, TEMP2, AO 3766 add B, TEMP1, BO 3767#endif 3768 3769#if defined(LN) || defined(LT) 3770 LDF [BO + 0 * SIZE], a1 3771 LDF [BO + 1 * SIZE], a2 3772 3773 FSUB a1, c01, c01 3774 FSUB a2, c02, c02 3775#else 3776 LDF [AO + 0 * SIZE], a1 3777 LDF [AO + 1 * SIZE], a2 3778 3779 FSUB a1, c01, c01 3780 FSUB a2, c02, c02 3781#endif 3782 3783#ifdef LN 3784 LDF [AO + 3 * SIZE], a1 3785 LDF [AO + 2 * SIZE], a2 3786 LDF [AO + 0 * SIZE], a3 3787 3788 FMUL a1, c02, 
c02 3789 3790 FNMSUB (aa2, cc02, cc01, cc01) 3791 3792 FMUL a3, c01, c01 3793#endif 3794 3795#ifdef LT 3796 LDF [AO + 0 * SIZE], a1 3797 LDF [AO + 1 * SIZE], a2 3798 LDF [AO + 3 * SIZE], a3 3799 3800 FMUL a1, c01, c01 3801 3802 FNMSUB (aa2, cc01, cc02, cc02) 3803 3804 FMUL a3, c02, c02 3805#endif 3806 3807#if defined(RN) || defined(RT) 3808 LDF [BO + 0 * SIZE], a1 3809 3810 FMUL a1, c01, c01 3811 FMUL a1, c02, c02 3812#endif 3813 3814#ifdef LN 3815 add C1, -2 * SIZE, C1 3816#endif 3817 3818#if defined(LN) || defined(LT) 3819 STF c01, [BO + 0 * SIZE] 3820 STF c02, [BO + 1 * SIZE] 3821#else 3822 STF c01, [AO + 0 * SIZE] 3823 STF c02, [AO + 1 * SIZE] 3824#endif 3825 3826 STF c01, [C1 + 0 * SIZE] 3827 STF c02, [C1 + 1 * SIZE] 3828 3829#ifndef LN 3830 add C1, 2 * SIZE, C1 3831#endif 3832 3833#ifdef RT 3834 sll K, BASE_SHIFT + 1, TEMP1 3835 add AORIG, TEMP1, AORIG 3836#endif 3837 3838#if defined(LT) || defined(RN) 3839 sub K, KK, TEMP1 3840 sll TEMP1, BASE_SHIFT + 1, TEMP2 3841 sll TEMP1, BASE_SHIFT + 0, TEMP1 3842 add AO, TEMP2, AO 3843 add BO, TEMP1, BO 3844#endif 3845 3846#ifdef LT 3847 add KK, 2, KK 3848#endif 3849 3850#ifdef LN 3851 sub KK, 2, KK 3852#endif 3853 3854 add I, -1, I 3855 cmp I, 0 3856 bg,pt %icc, .LL72 3857 nop 3858 .align 4 3859 3860.LL89: 3861#ifdef LN 3862 sll K, BASE_SHIFT, TEMP1 3863 add B, TEMP1, B 3864#endif 3865 3866#if defined(LT) || defined(RN) 3867 mov BO, B 3868#endif 3869 3870#ifdef RN 3871 add KK, 1, KK 3872#endif 3873 3874#ifdef RT 3875 sub KK, 1, KK 3876#endif 3877 .align 4 3878 3879.LL999: 3880#ifdef TRMMKERNEL 3881#ifndef __64BIT__ 3882 ld [%sp + STACK_START + 8], %g1 3883 ld [%sp + STACK_START + 12], %g2 3884 ld [%sp + STACK_START + 16], %g3 3885 ld [%sp + STACK_START + 20], %g4 3886#else 3887 ldx [%sp + STACK_START + 32], %g1 3888 ldx [%sp + STACK_START + 40], %g2 3889 ldx [%sp + STACK_START + 48], %g3 3890 ldx [%sp + STACK_START + 56], %g4 3891#endif 3892#endif 3893 3894 return %i7 + 8 3895 clr %o0 3896 3897 EPILOGUE 3898