/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#define APREFETCHSIZE 24 26#define APREFETCH_CATEGORY 0 27 28#define M %i0 29#define N %i1 30#define K %i2 31 32#if defined(DOUBLE) && !defined(__64BIT__) 33#define A %i5 34#define B %i4 35#else 36#define A %i4 37#define B %i5 38#endif 39 40#define C %o4 41#define LDC %o5 42 43#define AO %l0 44#define BO %l1 45#define I %l2 46#define J %l3 47#define L %l4 48 49#define C1 %o0 50#define C2 %o1 51#define C3 %o2 52#define C4 %o3 53 54#define C5 %l5 55#define C6 %l6 56#define C7 %l7 57#define C8 %i3 58 59#define OFFSET %g1 60#define KK %g2 61#define TEMP1 %g3 62#define TEMP2 %g4 63#define AORIG %o7 64 65#ifdef DOUBLE 66#define c01 %f0 67#define c02 %f2 68#define c03 %f4 69#define c04 %f6 70#define c05 %f8 71#define c06 %f10 72#define c07 %f12 73#define c08 %f14 74#define c09 %f16 75#define c10 %f18 76#define c11 %f20 77#define c12 %f22 78#define c13 %f24 79#define c14 %f26 80#define c15 %f28 81#define c16 %f30 82 83#define a1 %f32 84#define a2 %f34 85#define a3 %f36 86#define a4 %f38 87#define a5 %f40 88 89#define b1 %f42 90#define b2 %f44 91#define b3 %f46 92#define b4 %f48 93#define b5 %f50 94#define b6 %f52 95#define b7 %f54 96#define b8 %f56 97#define b9 %f58 98 99#define cc01 0 100#define cc02 2 101#define cc03 4 102#define cc04 6 103#define cc05 8 104#define cc06 10 105#define cc07 12 106#define cc08 14 107#define cc09 16 108#define cc10 18 109#define cc11 20 110#define cc12 22 111#define cc13 24 112#define cc14 26 113#define cc15 28 114#define cc16 30 115 116#define aa1 1 117#define aa2 3 118#define aa3 5 119#define aa4 7 120#define aa5 9 121 122#define bb1 11 123#define bb2 13 124#define bb3 15 125#define bb4 17 126#define bb5 19 127#define bb6 21 128#define bb7 23 129#define bb8 25 130#define bb9 27 131 132#else 133#define c01 %f0 134#define c02 %f1 135#define c03 %f2 136#define c04 %f3 137#define c05 %f4 138#define c06 %f5 139#define 
c07 %f6 140#define c08 %f7 141#define c09 %f8 142#define c10 %f9 143#define c11 %f10 144#define c12 %f11 145#define c13 %f12 146#define c14 %f13 147#define c15 %f14 148#define c16 %f15 149 150#define a1 %f16 151#define a2 %f17 152#define a3 %f18 153#define a4 %f19 154#define a5 %f20 155 156#define b1 %f21 157#define b2 %f22 158#define b3 %f23 159#define b4 %f24 160#define b5 %f25 161#define b6 %f26 162#define b7 %f27 163#define b8 %f28 164#define b9 %f29 165 166#define cc01 0 167#define cc02 1 168#define cc03 2 169#define cc04 3 170#define cc05 4 171#define cc06 5 172#define cc07 6 173#define cc08 7 174#define cc09 8 175#define cc10 9 176#define cc11 10 177#define cc12 11 178#define cc13 12 179#define cc14 13 180#define cc15 14 181#define cc16 15 182 183#define aa1 16 184#define aa2 17 185#define aa3 18 186#define aa4 19 187#define aa5 20 188 189#define bb1 21 190#define bb2 22 191#define bb3 23 192#define bb4 24 193#define bb5 25 194#define bb6 26 195#define bb7 27 196#define bb8 28 197#define bb9 29 198 199#endif 200 201 .register %g2, #scratch 202 .register %g3, #scratch 203 204 PROLOGUE 205 SAVESP 206 nop 207 208#ifndef __64BIT__ 209 210#ifdef DOUBLE 211 ld [%sp + STACK_START + 28], B 212 ld [%sp + STACK_START + 32], C 213 ld [%sp + STACK_START + 36], LDC 214 ld [%sp + STACK_START + 40], OFFSET 215#else 216 ld [%sp + STACK_START + 28], C 217 ld [%sp + STACK_START + 32], LDC 218 ld [%sp + STACK_START + 36], OFFSET 219#endif 220 221 st %g1, [%sp + STACK_START + 8] 222 st %g2, [%sp + STACK_START + 12] 223 st %g3, [%sp + STACK_START + 16] 224 st %g4, [%sp + STACK_START + 20] 225#else 226 227 ldx [%sp+ STACK_START + 56], C 228 ldx [%sp+ STACK_START + 64], LDC 229 ldx [%sp+ STACK_START + 72], OFFSET 230 231 stx %g1, [%sp + STACK_START + 32] 232 stx %g2, [%sp + STACK_START + 40] 233 stx %g3, [%sp + STACK_START + 48] 234 stx %g4, [%sp + STACK_START + 56] 235#endif 236 237#if defined(TRMMKERNEL) && !defined(LEFT) 238 neg OFFSET, KK 239#endif 240 241 sll LDC, BASE_SHIFT, 
LDC 242 243#ifdef LN 244 smul M, K, TEMP1 245 sll TEMP1, BASE_SHIFT, TEMP1 246 add A, TEMP1, A 247 248 sll M, BASE_SHIFT, TEMP1 249 add C, TEMP1, C 250#endif 251 252#ifdef RN 253 neg OFFSET, KK 254#endif 255 256#ifdef RT 257 smul N, K, TEMP1 258 sll TEMP1, BASE_SHIFT, TEMP1 259 add B, TEMP1, B 260 261 smul N, LDC, TEMP1 262 add C, TEMP1, C 263 264 sub N, OFFSET, KK 265#endif 266 267 sra N, 3, J 268 cmp J, 0 269 ble,pn %icc, .LL30 270 nop 271 .align 4 272 273.LL11: 274#ifdef RT 275 sll K, BASE_SHIFT + 3, TEMP1 276 sub B, TEMP1, B 277#endif 278 279#ifndef RT 280 mov C, C1 281 add C, LDC, C2 282 add C2, LDC, C3 283 add C3, LDC, C4 284 add C4, LDC, C5 285 add C5, LDC, C6 286 add C6, LDC, C7 287 add C7, LDC, C8 288 add C8, LDC, C 289#else 290 sub C, LDC, C8 291 sub C8, LDC, C7 292 sub C7, LDC, C6 293 sub C6, LDC, C5 294 sub C5, LDC, C4 295 sub C4, LDC, C3 296 sub C3, LDC, C2 297 sub C2, LDC, C1 298 sub C2, LDC, C 299#endif 300 301#ifdef LN 302 add M, OFFSET, KK 303#endif 304 305#ifdef LT 306 mov OFFSET, KK 307#endif 308 309#if defined(LN) || defined(RT) 310 mov A, AORIG 311#else 312 mov A, AO 313#endif 314 315 and M, 1, I 316 cmp I, 0 317 ble,pn %icc, .LL20 318 nop 319 320#if defined(LT) || defined(RN) 321 mov B, BO 322#else 323#ifdef LN 324 sll K, BASE_SHIFT + 0, TEMP1 325 sub AORIG, TEMP1, AORIG 326#endif 327 328 sll KK, BASE_SHIFT + 0, TEMP1 329 sll KK, BASE_SHIFT + 3, TEMP2 330 331 add AORIG, TEMP1, AO 332 add B, TEMP2, BO 333#endif 334 335 LDF [AO + 0 * SIZE], a1 336 LDF [AO + 1 * SIZE], a2 337 LDF [AO + 2 * SIZE], a3 338 LDF [AO + 3 * SIZE], a4 339 340 LDF [BO + 0 * SIZE], b1 341 FCLR (cc01) 342 LDF [BO + 1 * SIZE], b2 343 FCLR (cc03) 344 LDF [BO + 2 * SIZE], b3 345 FCLR (cc05) 346 LDF [BO + 3 * SIZE], b4 347 FCLR (cc07) 348 LDF [BO + 4 * SIZE], b5 349 FCLR (cc09) 350 LDF [BO + 5 * SIZE], b6 351 FCLR (cc11) 352 LDF [BO + 6 * SIZE], b7 353 FCLR (cc13) 354 LDF [BO + 7 * SIZE], b8 355 FCLR (cc15) 356 357#if defined(LT) || defined(RN) 358 sra KK, 2, L 359#else 360 sub 
K, KK, L 361 sra L, 2, L 362#endif 363 cmp L, 0 364 ble,pn %icc, .LL25 365 LDF [BO + 8 * SIZE], b9 366 .align 4 367 368.LL23: 369 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 370 add L, -1, L 371 372 FMADD (aa1, bb1, cc01, cc01) 373 LDF [BO + 16 * SIZE], b1 374 FMADD (aa1, bb2, cc03, cc03) 375 LDF [BO + 9 * SIZE], b2 376 377 FMADD (aa1, bb3, cc05, cc05) 378 LDF [BO + 10 * SIZE], b3 379 FMADD (aa1, bb4, cc07, cc07) 380 LDF [BO + 11 * SIZE], b4 381 382 FMADD (aa1, bb5, cc09, cc09) 383 LDF [BO + 12 * SIZE], b5 384 FMADD (aa1, bb6, cc11, cc11) 385 LDF [BO + 13 * SIZE], b6 386 387 FMADD (aa1, bb7, cc13, cc13) 388 LDF [BO + 14 * SIZE], b7 389 FMADD (aa1, bb8, cc15, cc15) 390 LDF [BO + 15 * SIZE], b8 391 392 FMADD (aa2, bb9, cc01, cc01) 393 LDF [BO + 24 * SIZE], b9 394 FMADD (aa2, bb2, cc03, cc03) 395 LDF [BO + 17 * SIZE], b2 396 397 FMADD (aa2, bb3, cc05, cc05) 398 LDF [BO + 18 * SIZE], b3 399 FMADD (aa2, bb4, cc07, cc07) 400 LDF [BO + 19 * SIZE], b4 401 402 FMADD (aa2, bb5, cc09, cc09) 403 LDF [BO + 20 * SIZE], b5 404 FMADD (aa2, bb6, cc11, cc11) 405 LDF [BO + 21 * SIZE], b6 406 407 FMADD (aa2, bb7, cc13, cc13) 408 LDF [BO + 22 * SIZE], b7 409 FMADD (aa2, bb8, cc15, cc15) 410 LDF [BO + 23 * SIZE], b8 411 412 LDF [AO + 4 * SIZE], a1 413 LDF [AO + 5 * SIZE], a2 414 415 FMADD (aa3, bb1, cc01, cc01) 416 LDF [BO + 32 * SIZE], b1 417 FMADD (aa3, bb2, cc03, cc03) 418 LDF [BO + 25 * SIZE], b2 419 420 FMADD (aa3, bb3, cc05, cc05) 421 LDF [BO + 26 * SIZE], b3 422 FMADD (aa3, bb4, cc07, cc07) 423 LDF [BO + 27 * SIZE], b4 424 425 FMADD (aa3, bb5, cc09, cc09) 426 LDF [BO + 28 * SIZE], b5 427 FMADD (aa3, bb6, cc11, cc11) 428 LDF [BO + 29 * SIZE], b6 429 430 FMADD (aa3, bb7, cc13, cc13) 431 LDF [BO + 30 * SIZE], b7 432 FMADD (aa3, bb8, cc15, cc15) 433 LDF [BO + 31 * SIZE], b8 434 435 FMADD (aa4, bb9, cc01, cc01) 436 LDF [BO + 40 * SIZE], b9 437 FMADD (aa4, bb2, cc03, cc03) 438 LDF [BO + 33 * SIZE], b2 439 440 FMADD (aa4, bb3, cc05, cc05) 441 LDF [BO + 34 * SIZE], b3 
442 FMADD (aa4, bb4, cc07, cc07) 443 LDF [BO + 35 * SIZE], b4 444 445 FMADD (aa4, bb5, cc09, cc09) 446 LDF [BO + 36 * SIZE], b5 447 FMADD (aa4, bb6, cc11, cc11) 448 LDF [BO + 37 * SIZE], b6 449 450 FMADD (aa4, bb7, cc13, cc13) 451 LDF [BO + 38 * SIZE], b7 452 FMADD (aa4, bb8, cc15, cc15) 453 LDF [BO + 39 * SIZE], b8 454 455 LDF [AO + 6 * SIZE], a3 456 LDF [AO + 7 * SIZE], a4 457 458 add AO, 4 * SIZE, AO 459 cmp L, 0 460 bg,pt %icc, .LL23 461 add BO, 32 * SIZE, BO 462 .align 4 463 464.LL25: 465#if defined(LT) || defined(RN) 466 and KK, 3, L 467#else 468 sub K, KK, L 469 and L, 3, L 470#endif 471 cmp L, 0 472 ble,a,pn %icc, .LL28 473 nop 474 .align 4 475 476.LL27: 477 FMADD (aa1, bb1, cc01, cc01) 478 LDF [BO + 8 * SIZE], b1 479 FMADD (aa1, bb2, cc03, cc03) 480 LDF [BO + 9 * SIZE], b2 481 482 FMADD (aa1, bb3, cc05, cc05) 483 LDF [BO + 10 * SIZE], b3 484 FMADD (aa1, bb4, cc07, cc07) 485 LDF [BO + 11 * SIZE], b4 486 487 FMADD (aa1, bb5, cc09, cc09) 488 LDF [BO + 12 * SIZE], b5 489 FMADD (aa1, bb6, cc11, cc11) 490 LDF [BO + 13 * SIZE], b6 491 492 FMADD (aa1, bb7, cc13, cc13) 493 LDF [BO + 14 * SIZE], b7 494 FMADD (aa1, bb8, cc15, cc15) 495 LDF [BO + 15 * SIZE], b8 496 497 LDF [AO + 1 * SIZE], a1 498 add AO, 1 * SIZE, AO 499 500 add L, -1, L 501 cmp L, 0 502 bg,pt %icc, .LL27 503 add BO, 8 * SIZE, BO 504 .align 4 505 506.LL28: 507#if defined(LN) || defined(RT) 508#ifdef LN 509 sub KK, 1, TEMP1 510#else 511 sub KK, 8, TEMP1 512#endif 513 sll TEMP1, BASE_SHIFT + 0, TEMP2 514 sll TEMP1, BASE_SHIFT + 3, TEMP1 515 516 add AORIG, TEMP2, AO 517 add B, TEMP1, BO 518#endif 519 520#if defined(LN) || defined(LT) 521 LDF [BO + 0 * SIZE], a1 522 LDF [BO + 1 * SIZE], a2 523 LDF [BO + 2 * SIZE], a3 524 LDF [BO + 3 * SIZE], a4 525 526 LDF [BO + 4 * SIZE], b1 527 LDF [BO + 5 * SIZE], b2 528 LDF [BO + 6 * SIZE], b3 529 LDF [BO + 7 * SIZE], b4 530 531 FSUB a1, c01, c01 532 FSUB a2, c03, c03 533 FSUB a3, c05, c05 534 FSUB a4, c07, c07 535 536 FSUB b1, c09, c09 537 FSUB b2, c11, c11 538 FSUB 
b3, c13, c13 539 FSUB b4, c15, c15 540#else 541 LDF [AO + 0 * SIZE], a1 542 LDF [AO + 1 * SIZE], a2 543 LDF [AO + 2 * SIZE], a3 544 LDF [AO + 3 * SIZE], a4 545 546 LDF [AO + 4 * SIZE], b1 547 LDF [AO + 5 * SIZE], b2 548 LDF [AO + 6 * SIZE], b3 549 LDF [AO + 7 * SIZE], b4 550 551 FSUB a1, c01, c01 552 FSUB a2, c03, c03 553 FSUB a3, c05, c05 554 FSUB a4, c07, c07 555 556 FSUB b1, c09, c09 557 FSUB b2, c11, c11 558 FSUB b3, c13, c13 559 FSUB b4, c15, c15 560#endif 561 562#if defined(LN) || defined(LT) 563 LDF [AO + 0 * SIZE], a1 564 565 FMUL a1, c01, c01 566 FMUL a1, c03, c03 567 FMUL a1, c05, c05 568 FMUL a1, c07, c07 569 FMUL a1, c09, c09 570 FMUL a1, c11, c11 571 FMUL a1, c13, c13 572 FMUL a1, c15, c15 573#endif 574 575#ifdef RN 576 LDF [BO + 0 * SIZE], a1 577 LDF [BO + 1 * SIZE], a2 578 LDF [BO + 2 * SIZE], a3 579 LDF [BO + 3 * SIZE], a4 580 LDF [BO + 4 * SIZE], b1 581 LDF [BO + 5 * SIZE], b2 582 LDF [BO + 6 * SIZE], b3 583 LDF [BO + 7 * SIZE], b4 584 585 FMUL a1, c01, c01 586 587 FNMSUB (aa2, cc01, cc03, cc03) 588 FNMSUB (aa3, cc01, cc05, cc05) 589 FNMSUB (aa4, cc01, cc07, cc07) 590 FNMSUB (bb1, cc01, cc09, cc09) 591 FNMSUB (bb2, cc01, cc11, cc11) 592 FNMSUB (bb3, cc01, cc13, cc13) 593 FNMSUB (bb4, cc01, cc15, cc15) 594 595 LDF [BO + 9 * SIZE], a1 596 LDF [BO + 10 * SIZE], a2 597 LDF [BO + 11 * SIZE], a3 598 LDF [BO + 12 * SIZE], a4 599 LDF [BO + 13 * SIZE], b1 600 LDF [BO + 14 * SIZE], b2 601 LDF [BO + 15 * SIZE], b3 602 603 FMUL a1, c03, c03 604 605 FNMSUB (aa2, cc03, cc05, cc05) 606 FNMSUB (aa3, cc03, cc07, cc07) 607 FNMSUB (aa4, cc03, cc09, cc09) 608 FNMSUB (bb1, cc03, cc11, cc11) 609 FNMSUB (bb2, cc03, cc13, cc13) 610 FNMSUB (bb3, cc03, cc15, cc15) 611 612 LDF [BO + 18 * SIZE], a1 613 LDF [BO + 19 * SIZE], a2 614 LDF [BO + 20 * SIZE], a3 615 LDF [BO + 21 * SIZE], a4 616 LDF [BO + 22 * SIZE], b1 617 LDF [BO + 23 * SIZE], b2 618 619 FMUL a1, c05, c05 620 621 FNMSUB (aa2, cc05, cc07, cc07) 622 FNMSUB (aa3, cc05, cc09, cc09) 623 FNMSUB (aa4, cc05, cc11, cc11) 
624 FNMSUB (bb1, cc05, cc13, cc13) 625 FNMSUB (bb2, cc05, cc15, cc15) 626 627 LDF [BO + 27 * SIZE], a1 628 LDF [BO + 28 * SIZE], a2 629 LDF [BO + 29 * SIZE], a3 630 LDF [BO + 30 * SIZE], a4 631 LDF [BO + 31 * SIZE], b1 632 633 FMUL a1, c07, c07 634 635 FNMSUB (aa2, cc07, cc09, cc09) 636 FNMSUB (aa3, cc07, cc11, cc11) 637 FNMSUB (aa4, cc07, cc13, cc13) 638 FNMSUB (bb1, cc07, cc15, cc15) 639 640 LDF [BO + 36 * SIZE], a1 641 LDF [BO + 37 * SIZE], a2 642 LDF [BO + 38 * SIZE], a3 643 LDF [BO + 39 * SIZE], a4 644 645 FMUL a1, c09, c09 646 647 FNMSUB (aa2, cc09, cc11, cc11) 648 FNMSUB (aa3, cc09, cc13, cc13) 649 FNMSUB (aa4, cc09, cc15, cc15) 650 651 LDF [BO + 45 * SIZE], a1 652 LDF [BO + 46 * SIZE], a2 653 LDF [BO + 47 * SIZE], a3 654 655 FMUL a1, c11, c11 656 657 FNMSUB (aa2, cc11, cc13, cc13) 658 FNMSUB (aa3, cc11, cc15, cc15) 659 660 LDF [BO + 54 * SIZE], a1 661 LDF [BO + 55 * SIZE], a2 662 663 FMUL a1, c13, c13 664 665 FNMSUB (aa2, cc13, cc15, cc15) 666 667 LDF [BO + 63 * SIZE], a1 668 669 FMUL a1, c15, c15 670#endif 671 672#ifdef RT 673 LDF [BO + 63 * SIZE], a1 674 LDF [BO + 62 * SIZE], a2 675 LDF [BO + 61 * SIZE], a3 676 LDF [BO + 60 * SIZE], a4 677 LDF [BO + 59 * SIZE], b1 678 LDF [BO + 58 * SIZE], b2 679 LDF [BO + 57 * SIZE], b3 680 LDF [BO + 56 * SIZE], b4 681 682 FMUL a1, c15, c15 683 684 FNMSUB (aa2, cc15, cc13, cc13) 685 FNMSUB (aa3, cc15, cc11, cc11) 686 FNMSUB (aa4, cc15, cc09, cc09) 687 FNMSUB (bb1, cc15, cc07, cc07) 688 FNMSUB (bb2, cc15, cc05, cc05) 689 FNMSUB (bb3, cc15, cc03, cc03) 690 FNMSUB (bb4, cc15, cc01, cc01) 691 692 LDF [BO + 54 * SIZE], a1 693 LDF [BO + 53 * SIZE], a2 694 LDF [BO + 52 * SIZE], a3 695 LDF [BO + 51 * SIZE], a4 696 LDF [BO + 50 * SIZE], b1 697 LDF [BO + 49 * SIZE], b2 698 LDF [BO + 48 * SIZE], b3 699 700 FMUL a1, c13, c13 701 702 FNMSUB (aa2, cc13, cc11, cc11) 703 FNMSUB (aa3, cc13, cc09, cc09) 704 FNMSUB (aa4, cc13, cc07, cc07) 705 FNMSUB (bb1, cc13, cc05, cc05) 706 FNMSUB (bb2, cc13, cc03, cc03) 707 FNMSUB (bb3, cc13, cc01, 
cc01) 708 709 LDF [BO + 45 * SIZE], a1 710 LDF [BO + 44 * SIZE], a2 711 LDF [BO + 43 * SIZE], a3 712 LDF [BO + 42 * SIZE], a4 713 LDF [BO + 41 * SIZE], b1 714 LDF [BO + 40 * SIZE], b2 715 716 FMUL a1, c11, c11 717 718 FNMSUB (aa2, cc11, cc09, cc09) 719 FNMSUB (aa3, cc11, cc07, cc07) 720 FNMSUB (aa4, cc11, cc05, cc05) 721 FNMSUB (bb1, cc11, cc03, cc03) 722 FNMSUB (bb2, cc11, cc01, cc01) 723 724 LDF [BO + 36 * SIZE], a1 725 LDF [BO + 35 * SIZE], a2 726 LDF [BO + 34 * SIZE], a3 727 LDF [BO + 33 * SIZE], a4 728 LDF [BO + 32 * SIZE], b1 729 730 FMUL a1, c09, c09 731 732 FNMSUB (aa2, cc09, cc07, cc07) 733 FNMSUB (aa3, cc09, cc05, cc05) 734 FNMSUB (aa4, cc09, cc03, cc03) 735 FNMSUB (bb1, cc09, cc01, cc01) 736 737 LDF [BO + 27 * SIZE], a1 738 LDF [BO + 26 * SIZE], a2 739 LDF [BO + 25 * SIZE], a3 740 LDF [BO + 24 * SIZE], a4 741 742 FMUL a1, c07, c07 743 744 FNMSUB (aa2, cc07, cc05, cc05) 745 FNMSUB (aa3, cc07, cc03, cc03) 746 FNMSUB (aa4, cc07, cc01, cc01) 747 748 LDF [BO + 18 * SIZE], a1 749 LDF [BO + 17 * SIZE], a2 750 LDF [BO + 16 * SIZE], a3 751 752 FMUL a1, c05, c05 753 754 FNMSUB (aa2, cc05, cc03, cc03) 755 FNMSUB (aa3, cc05, cc01, cc01) 756 757 LDF [BO + 9 * SIZE], a1 758 LDF [BO + 8 * SIZE], a2 759 760 FMUL a1, c03, c03 761 762 FNMSUB (aa2, cc03, cc01, cc01) 763 764 LDF [BO + 0 * SIZE], a1 765 766 FMUL a1, c01, c01 767#endif 768 769#ifdef LN 770 add C1, -1 * SIZE, C1 771 add C2, -1 * SIZE, C2 772 add C3, -1 * SIZE, C3 773 add C4, -1 * SIZE, C4 774 add C5, -1 * SIZE, C5 775 add C6, -1 * SIZE, C6 776 add C7, -1 * SIZE, C7 777 add C8, -1 * SIZE, C8 778#endif 779 780#if defined(LN) || defined(LT) 781 STF c01, [BO + 0 * SIZE] 782 STF c03, [BO + 1 * SIZE] 783 STF c05, [BO + 2 * SIZE] 784 STF c07, [BO + 3 * SIZE] 785 786 STF c09, [BO + 4 * SIZE] 787 STF c11, [BO + 5 * SIZE] 788 STF c13, [BO + 6 * SIZE] 789 STF c15, [BO + 7 * SIZE] 790#else 791 STF c01, [AO + 0 * SIZE] 792 STF c03, [AO + 1 * SIZE] 793 STF c05, [AO + 2 * SIZE] 794 STF c07, [AO + 3 * SIZE] 795 796 STF c09, 
[AO + 4 * SIZE] 797 STF c11, [AO + 5 * SIZE] 798 STF c13, [AO + 6 * SIZE] 799 STF c15, [AO + 7 * SIZE] 800#endif 801 802 STF c01, [C1 + 0 * SIZE] 803 STF c03, [C2 + 0 * SIZE] 804 STF c05, [C3 + 0 * SIZE] 805 STF c07, [C4 + 0 * SIZE] 806 807 STF c09, [C5 + 0 * SIZE] 808 STF c11, [C6 + 0 * SIZE] 809 STF c13, [C7 + 0 * SIZE] 810 STF c15, [C8 + 0 * SIZE] 811 812#ifdef RT 813 sll K, BASE_SHIFT + 0, TEMP1 814 add AORIG, TEMP1, AORIG 815#endif 816 817#if defined(LT) || defined(RN) 818 sub K, KK, TEMP1 819 sll TEMP1, BASE_SHIFT + 0, TEMP2 820 sll TEMP1, BASE_SHIFT + 3, TEMP1 821 add AO, TEMP2, AO 822 add BO, TEMP1, BO 823#endif 824 825#ifdef LT 826 add KK, 1, KK 827#endif 828 829#ifdef LN 830 sub KK, 1, KK 831#endif 832 .align 4 833 834.LL20: 835 sra M, 1, I 836 cmp I, 0 837 ble,pn %icc, .LL29 838 nop 839 .align 4 840 841.LL12: 842#if defined(LT) || defined(RN) 843 mov B, BO 844#else 845#ifdef LN 846 sll K, BASE_SHIFT + 1, TEMP1 847 sub AORIG, TEMP1, AORIG 848#endif 849 850 sll KK, BASE_SHIFT + 1, TEMP1 851 sll KK, BASE_SHIFT + 3, TEMP2 852 853 add AORIG, TEMP1, AO 854 add B, TEMP2, BO 855#endif 856 857 LDF [AO + 0 * SIZE], a1 858 LDF [AO + 1 * SIZE], a2 859 LDF [AO + 8 * SIZE], a5 860 861 LDF [BO + 0 * SIZE], b1 862 863 LDF [BO + 1 * SIZE], b2 864 FCLR (cc01) 865 LDF [BO + 2 * SIZE], b3 866 FCLR (cc05) 867 LDF [BO + 3 * SIZE], b4 868 FCLR (cc09) 869 LDF [BO + 4 * SIZE], b5 870 FCLR (cc13) 871 872 LDF [BO + 5 * SIZE], b6 873 FCLR (cc02) 874 LDF [BO + 6 * SIZE], b7 875 FCLR (cc06) 876 LDF [BO + 7 * SIZE], b8 877 FCLR (cc10) 878 LDF [BO + 8 * SIZE], b9 879 FCLR (cc14) 880 881 prefetch [C1 + 1 * SIZE], 3 882 FCLR (cc03) 883 prefetch [C2 + 2 * SIZE], 3 884 FCLR (cc07) 885 prefetch [C3 + 1 * SIZE], 3 886 FCLR (cc11) 887 prefetch [C4 + 2 * SIZE], 3 888 FCLR (cc15) 889 890 prefetch [C5 + 1 * SIZE], 3 891 FCLR (cc04) 892 prefetch [C6 + 2 * SIZE], 3 893 FCLR (cc08) 894 prefetch [C7 + 1 * SIZE], 3 895 FCLR (cc12) 896 prefetch [C8 + 2 * SIZE], 3 897 FCLR (cc16) 898 899#if defined(LT) 
|| defined(RN) 900 sra KK, 3, L 901#else 902 sub K, KK, L 903 sra L, 3, L 904#endif 905 cmp L, 0 906 ble,pn %icc, .LL15 907 nop 908 .align 4 909 910.LL13: 911 FMADD (aa1, bb1, cc01, cc01) 912 FMADD (aa2, bb1, cc02, cc02) 913 FMADD (aa1, bb2, cc03, cc03) 914 FMADD (aa2, bb2, cc04, cc04) 915 916 FMADD (aa1, bb3, cc05, cc05) 917 LDF [BO + 16 * SIZE], b1 918 FMADD (aa2, bb3, cc06, cc06) 919 LDF [BO + 9 * SIZE], b2 920 921 FMADD (aa1, bb4, cc07, cc07) 922 LDF [BO + 10 * SIZE], b3 923 FMADD (aa2, bb4, cc08, cc08) 924 LDF [BO + 11 * SIZE], b4 925 926 FMADD (aa1, bb5, cc09, cc09) 927 LDF [AO + 2 * SIZE], a3 928 FMADD (aa2, bb5, cc10, cc10) 929 LDF [AO + 3 * SIZE], a4 930 931 FMADD (aa1, bb6, cc11, cc11) 932 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 933 FMADD (aa2, bb6, cc12, cc12) 934 nop 935 936 FMADD (aa1, bb7, cc13, cc13) 937 LDF [BO + 12 * SIZE], b5 938 FMADD (aa2, bb7, cc14, cc14) 939 LDF [BO + 13 * SIZE], b6 940 941 FMADD (aa1, bb8, cc15, cc15) 942 LDF [BO + 14 * SIZE], b7 943 FMADD (aa2, bb8, cc16, cc16) 944 LDF [BO + 15 * SIZE], b8 945 946 FMADD (aa3, bb9, cc01, cc01) 947 FMADD (aa4, bb9, cc02, cc02) 948 FMADD (aa3, bb2, cc03, cc03) 949 FMADD (aa4, bb2, cc04, cc04) 950 951 FMADD (aa3, bb3, cc05, cc05) 952 LDF [BO + 24 * SIZE], b9 953 FMADD (aa4, bb3, cc06, cc06) 954 LDF [BO + 17 * SIZE], b2 955 956 FMADD (aa3, bb4, cc07, cc07) 957 LDF [BO + 18 * SIZE], b3 958 FMADD (aa4, bb4, cc08, cc08) 959 LDF [BO + 19 * SIZE], b4 960 961 FMADD (aa3, bb5, cc09, cc09) 962 LDF [AO + 4 * SIZE], a1 963 FMADD (aa4, bb5, cc10, cc10) 964 LDF [AO + 5 * SIZE], a2 965 966 FMADD (aa3, bb6, cc11, cc11) 967 add L, -1, L 968 FMADD (aa4, bb6, cc12, cc12) 969 nop 970 971 FMADD (aa3, bb7, cc13, cc13) 972 LDF [BO + 20 * SIZE], b5 973 FMADD (aa4, bb7, cc14, cc14) 974 LDF [BO + 21 * SIZE], b6 975 976 FMADD (aa3, bb8, cc15, cc15) 977 LDF [BO + 22 * SIZE], b7 978 FMADD (aa4, bb8, cc16, cc16) 979 LDF [BO + 23 * SIZE], b8 980 981 FMADD (aa1, bb1, cc01, cc01) 982 FMADD (aa2, bb1, 
cc02, cc02) 983 FMADD (aa1, bb2, cc03, cc03) 984 FMADD (aa2, bb2, cc04, cc04) 985 986 FMADD (aa1, bb3, cc05, cc05) 987 LDF [BO + 32 * SIZE], b1 988 FMADD (aa2, bb3, cc06, cc06) 989 LDF [BO + 25 * SIZE], b2 990 991 FMADD (aa1, bb4, cc07, cc07) 992 LDF [BO + 26 * SIZE], b3 993 FMADD (aa2, bb4, cc08, cc08) 994 LDF [BO + 27 * SIZE], b4 995 996 FMADD (aa1, bb5, cc09, cc09) 997 LDF [AO + 6 * SIZE], a3 998 FMADD (aa2, bb5, cc10, cc10) 999 LDF [AO + 7 * SIZE], a4 1000 1001 FMADD (aa1, bb6, cc11, cc11) 1002 nop 1003 FMADD (aa2, bb6, cc12, cc12) 1004 nop 1005 1006 FMADD (aa1, bb7, cc13, cc13) 1007 LDF [BO + 28 * SIZE], b5 1008 FMADD (aa2, bb7, cc14, cc14) 1009 LDF [BO + 29 * SIZE], b6 1010 1011 FMADD (aa1, bb8, cc15, cc15) 1012 LDF [BO + 30 * SIZE], b7 1013 FMADD (aa2, bb8, cc16, cc16) 1014 LDF [BO + 31 * SIZE], b8 1015 1016 FMADD (aa3, bb9, cc01, cc01) 1017 FMADD (aa4, bb9, cc02, cc02) 1018 FMADD (aa3, bb2, cc03, cc03) 1019 FMADD (aa4, bb2, cc04, cc04) 1020 1021 FMADD (aa3, bb3, cc05, cc05) 1022 LDF [BO + 40 * SIZE], b9 1023 FMADD (aa4, bb3, cc06, cc06) 1024 LDF [BO + 33 * SIZE], b2 1025 1026 FMADD (aa3, bb4, cc07, cc07) 1027 LDF [BO + 34 * SIZE], b3 1028 FMADD (aa4, bb4, cc08, cc08) 1029 LDF [BO + 35 * SIZE], b4 1030 1031 FMADD (aa3, bb5, cc09, cc09) 1032 LDF [AO + 16 * SIZE], a1 /****/ 1033 FMADD (aa4, bb5, cc10, cc10) 1034 LDF [AO + 9 * SIZE], a2 1035 1036 FMADD (aa3, bb6, cc11, cc11) 1037 nop 1038 FMADD (aa4, bb6, cc12, cc12) 1039 nop 1040 1041 FMADD (aa3, bb7, cc13, cc13) 1042 LDF [BO + 36 * SIZE], b5 1043 FMADD (aa4, bb7, cc14, cc14) 1044 LDF [BO + 37 * SIZE], b6 1045 1046 FMADD (aa3, bb8, cc15, cc15) 1047 LDF [BO + 38 * SIZE], b7 1048 FMADD (aa4, bb8, cc16, cc16) 1049 LDF [BO + 39 * SIZE], b8 1050 1051 FMADD (aa5, bb1, cc01, cc01) 1052 FMADD (aa2, bb1, cc02, cc02) 1053 FMADD (aa5, bb2, cc03, cc03) 1054 FMADD (aa2, bb2, cc04, cc04) 1055 1056 FMADD (aa5, bb3, cc05, cc05) 1057 LDF [BO + 48 * SIZE], b1 1058 FMADD (aa2, bb3, cc06, cc06) 1059 LDF [BO + 41 * SIZE], b2 1060 
1061 FMADD (aa5, bb4, cc07, cc07) 1062 LDF [BO + 42 * SIZE], b3 1063 FMADD (aa2, bb4, cc08, cc08) 1064 LDF [BO + 43 * SIZE], b4 1065 1066 FMADD (aa5, bb5, cc09, cc09) 1067 LDF [AO + 10 * SIZE], a3 1068 FMADD (aa2, bb5, cc10, cc10) 1069 LDF [AO + 11 * SIZE], a4 1070 1071 FMADD (aa5, bb6, cc11, cc11) 1072 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 1073 FMADD (aa2, bb6, cc12, cc12) 1074 nop 1075 1076 FMADD (aa5, bb7, cc13, cc13) 1077 LDF [BO + 44 * SIZE], b5 1078 FMADD (aa2, bb7, cc14, cc14) 1079 LDF [BO + 45 * SIZE], b6 1080 1081 FMADD (aa5, bb8, cc15, cc15) 1082 LDF [BO + 46 * SIZE], b7 1083 FMADD (aa2, bb8, cc16, cc16) 1084 LDF [BO + 47 * SIZE], b8 1085 1086 FMADD (aa3, bb9, cc01, cc01) 1087 FMADD (aa4, bb9, cc02, cc02) 1088 FMADD (aa3, bb2, cc03, cc03) 1089 FMADD (aa4, bb2, cc04, cc04) 1090 1091 FMADD (aa3, bb3, cc05, cc05) 1092 LDF [BO + 56 * SIZE], b9 1093 FMADD (aa4, bb3, cc06, cc06) 1094 LDF [BO + 49 * SIZE], b2 1095 1096 FMADD (aa3, bb4, cc07, cc07) 1097 LDF [BO + 50 * SIZE], b3 1098 FMADD (aa4, bb4, cc08, cc08) 1099 LDF [BO + 51 * SIZE], b4 1100 1101 FMADD (aa3, bb5, cc09, cc09) 1102 LDF [AO + 12 * SIZE], a5 1103 FMADD (aa4, bb5, cc10, cc10) 1104 LDF [AO + 13 * SIZE], a2 1105 1106 FMADD (aa3, bb6, cc11, cc11) 1107 cmp L, 0 1108 FMADD (aa4, bb6, cc12, cc12) 1109 nop 1110 1111 FMADD (aa3, bb7, cc13, cc13) 1112 LDF [BO + 52 * SIZE], b5 1113 FMADD (aa4, bb7, cc14, cc14) 1114 LDF [BO + 53 * SIZE], b6 1115 1116 FMADD (aa3, bb8, cc15, cc15) 1117 LDF [BO + 54 * SIZE], b7 1118 FMADD (aa4, bb8, cc16, cc16) 1119 LDF [BO + 55 * SIZE], b8 1120 1121 FMADD (aa5, bb1, cc01, cc01) 1122 FMADD (aa2, bb1, cc02, cc02) 1123 FMADD (aa5, bb2, cc03, cc03) 1124 FMADD (aa2, bb2, cc04, cc04) 1125 1126 FMADD (aa5, bb3, cc05, cc05) 1127 LDF [BO + 64 * SIZE], b1 1128 FMADD (aa2, bb3, cc06, cc06) 1129 LDF [BO + 57 * SIZE], b2 1130 1131 FMADD (aa5, bb4, cc07, cc07) 1132 LDF [BO + 58 * SIZE], b3 1133 FMADD (aa2, bb4, cc08, cc08) 1134 LDF [BO + 59 * SIZE], b4 1135 1136 
FMADD (aa5, bb5, cc09, cc09) 1137 LDF [AO + 14 * SIZE], a3 1138 FMADD (aa2, bb5, cc10, cc10) 1139 LDF [AO + 15 * SIZE], a4 1140 1141 FMADD (aa5, bb6, cc11, cc11) 1142 add BO, 64 * SIZE, BO 1143 FMADD (aa2, bb6, cc12, cc12) 1144 add AO, 16 * SIZE, AO 1145 1146 FMADD (aa5, bb7, cc13, cc13) 1147 LDF [BO - 4 * SIZE], b5 1148 FMADD (aa2, bb7, cc14, cc14) 1149 LDF [BO - 3 * SIZE], b6 1150 1151 FMADD (aa5, bb8, cc15, cc15) 1152 LDF [BO - 2 * SIZE], b7 1153 FMADD (aa2, bb8, cc16, cc16) 1154 LDF [BO - 1 * SIZE], b8 1155 1156 FMADD (aa3, bb9, cc01, cc01) 1157 FMADD (aa4, bb9, cc02, cc02) 1158 FMADD (aa3, bb2, cc03, cc03) 1159 FMADD (aa4, bb2, cc04, cc04) 1160 1161 FMADD (aa3, bb3, cc05, cc05) 1162 LDF [BO + 8 * SIZE], b9 1163 FMADD (aa4, bb3, cc06, cc06) 1164 LDF [BO + 1 * SIZE], b2 1165 1166 FMADD (aa3, bb4, cc07, cc07) 1167 LDF [BO + 2 * SIZE], b3 1168 FMADD (aa4, bb4, cc08, cc08) 1169 LDF [BO + 3 * SIZE], b4 1170 1171 FMADD (aa3, bb5, cc09, cc09) 1172 LDF [AO + 8 * SIZE], a5 /****/ 1173 FMADD (aa4, bb5, cc10, cc10) 1174 LDF [AO + 1 * SIZE], a2 1175 1176 FMADD (aa3, bb6, cc11, cc11) 1177 FMADD (aa4, bb6, cc12, cc12) 1178 1179 FMADD (aa3, bb7, cc13, cc13) 1180 LDF [BO + 4 * SIZE], b5 1181 FMADD (aa4, bb7, cc14, cc14) 1182 LDF [BO + 5 * SIZE], b6 1183 1184 FMADD (aa3, bb8, cc15, cc15) 1185 LDF [BO + 6 * SIZE], b7 1186 FMADD (aa4, bb8, cc16, cc16) 1187 ble,pn %icc, .LL15 1188 LDF [BO + 7 * SIZE], b8 1189 1190 FMADD (aa1, bb1, cc01, cc01) 1191 FMADD (aa2, bb1, cc02, cc02) 1192 FMADD (aa1, bb2, cc03, cc03) 1193 FMADD (aa2, bb2, cc04, cc04) 1194 1195 FMADD (aa1, bb3, cc05, cc05) 1196 LDF [BO + 16 * SIZE], b1 1197 FMADD (aa2, bb3, cc06, cc06) 1198 LDF [BO + 9 * SIZE], b2 1199 1200 FMADD (aa1, bb4, cc07, cc07) 1201 LDF [BO + 10 * SIZE], b3 1202 FMADD (aa2, bb4, cc08, cc08) 1203 LDF [BO + 11 * SIZE], b4 1204 1205 FMADD (aa1, bb5, cc09, cc09) 1206 LDF [AO + 2 * SIZE], a3 1207 FMADD (aa2, bb5, cc10, cc10) 1208 LDF [AO + 3 * SIZE], a4 1209 1210 FMADD (aa1, bb6, cc11, cc11) 1211 
prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1212 FMADD (aa2, bb6, cc12, cc12) 1213 nop 1214 1215 FMADD (aa1, bb7, cc13, cc13) 1216 LDF [BO + 12 * SIZE], b5 1217 FMADD (aa2, bb7, cc14, cc14) 1218 LDF [BO + 13 * SIZE], b6 1219 1220 FMADD (aa1, bb8, cc15, cc15) 1221 LDF [BO + 14 * SIZE], b7 1222 FMADD (aa2, bb8, cc16, cc16) 1223 LDF [BO + 15 * SIZE], b8 1224 1225 FMADD (aa3, bb9, cc01, cc01) 1226 FMADD (aa4, bb9, cc02, cc02) 1227 FMADD (aa3, bb2, cc03, cc03) 1228 FMADD (aa4, bb2, cc04, cc04) 1229 1230 FMADD (aa3, bb3, cc05, cc05) 1231 LDF [BO + 24 * SIZE], b9 1232 FMADD (aa4, bb3, cc06, cc06) 1233 LDF [BO + 17 * SIZE], b2 1234 1235 FMADD (aa3, bb4, cc07, cc07) 1236 LDF [BO + 18 * SIZE], b3 1237 FMADD (aa4, bb4, cc08, cc08) 1238 LDF [BO + 19 * SIZE], b4 1239 1240 FMADD (aa3, bb5, cc09, cc09) 1241 LDF [AO + 4 * SIZE], a1 1242 FMADD (aa4, bb5, cc10, cc10) 1243 LDF [AO + 5 * SIZE], a2 1244 1245 FMADD (aa3, bb6, cc11, cc11) 1246 add L, -1, L 1247 FMADD (aa4, bb6, cc12, cc12) 1248 nop 1249 1250 FMADD (aa3, bb7, cc13, cc13) 1251 LDF [BO + 20 * SIZE], b5 1252 FMADD (aa4, bb7, cc14, cc14) 1253 LDF [BO + 21 * SIZE], b6 1254 1255 FMADD (aa3, bb8, cc15, cc15) 1256 LDF [BO + 22 * SIZE], b7 1257 FMADD (aa4, bb8, cc16, cc16) 1258 LDF [BO + 23 * SIZE], b8 1259 1260 FMADD (aa1, bb1, cc01, cc01) 1261 FMADD (aa2, bb1, cc02, cc02) 1262 FMADD (aa1, bb2, cc03, cc03) 1263 FMADD (aa2, bb2, cc04, cc04) 1264 1265 FMADD (aa1, bb3, cc05, cc05) 1266 LDF [BO + 32 * SIZE], b1 1267 FMADD (aa2, bb3, cc06, cc06) 1268 LDF [BO + 25 * SIZE], b2 1269 1270 FMADD (aa1, bb4, cc07, cc07) 1271 LDF [BO + 26 * SIZE], b3 1272 FMADD (aa2, bb4, cc08, cc08) 1273 LDF [BO + 27 * SIZE], b4 1274 1275 FMADD (aa1, bb5, cc09, cc09) 1276 LDF [AO + 6 * SIZE], a3 1277 FMADD (aa2, bb5, cc10, cc10) 1278 LDF [AO + 7 * SIZE], a4 1279 1280 FMADD (aa1, bb6, cc11, cc11) 1281 nop 1282 FMADD (aa2, bb6, cc12, cc12) 1283 nop 1284 1285 FMADD (aa1, bb7, cc13, cc13) 1286 LDF [BO + 28 * SIZE], b5 1287 FMADD (aa2, bb7, 
cc14, cc14) 1288 LDF [BO + 29 * SIZE], b6 1289 1290 FMADD (aa1, bb8, cc15, cc15) 1291 LDF [BO + 30 * SIZE], b7 1292 FMADD (aa2, bb8, cc16, cc16) 1293 LDF [BO + 31 * SIZE], b8 1294 1295 FMADD (aa3, bb9, cc01, cc01) 1296 FMADD (aa4, bb9, cc02, cc02) 1297 FMADD (aa3, bb2, cc03, cc03) 1298 FMADD (aa4, bb2, cc04, cc04) 1299 1300 FMADD (aa3, bb3, cc05, cc05) 1301 LDF [BO + 40 * SIZE], b9 1302 FMADD (aa4, bb3, cc06, cc06) 1303 LDF [BO + 33 * SIZE], b2 1304 1305 FMADD (aa3, bb4, cc07, cc07) 1306 LDF [BO + 34 * SIZE], b3 1307 FMADD (aa4, bb4, cc08, cc08) 1308 LDF [BO + 35 * SIZE], b4 1309 1310 FMADD (aa3, bb5, cc09, cc09) 1311 LDF [AO + 16 * SIZE], a1 /****/ 1312 FMADD (aa4, bb5, cc10, cc10) 1313 LDF [AO + 9 * SIZE], a2 1314 1315 FMADD (aa3, bb6, cc11, cc11) 1316 nop 1317 FMADD (aa4, bb6, cc12, cc12) 1318 nop 1319 1320 FMADD (aa3, bb7, cc13, cc13) 1321 LDF [BO + 36 * SIZE], b5 1322 FMADD (aa4, bb7, cc14, cc14) 1323 LDF [BO + 37 * SIZE], b6 1324 1325 FMADD (aa3, bb8, cc15, cc15) 1326 LDF [BO + 38 * SIZE], b7 1327 FMADD (aa4, bb8, cc16, cc16) 1328 LDF [BO + 39 * SIZE], b8 1329 1330 FMADD (aa5, bb1, cc01, cc01) 1331 FMADD (aa2, bb1, cc02, cc02) 1332 FMADD (aa5, bb2, cc03, cc03) 1333 FMADD (aa2, bb2, cc04, cc04) 1334 1335 FMADD (aa5, bb3, cc05, cc05) 1336 LDF [BO + 48 * SIZE], b1 1337 FMADD (aa2, bb3, cc06, cc06) 1338 LDF [BO + 41 * SIZE], b2 1339 1340 FMADD (aa5, bb4, cc07, cc07) 1341 LDF [BO + 42 * SIZE], b3 1342 FMADD (aa2, bb4, cc08, cc08) 1343 LDF [BO + 43 * SIZE], b4 1344 1345 FMADD (aa5, bb5, cc09, cc09) 1346 LDF [AO + 10 * SIZE], a3 1347 FMADD (aa2, bb5, cc10, cc10) 1348 LDF [AO + 11 * SIZE], a4 1349 1350 FMADD (aa5, bb6, cc11, cc11) 1351 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 1352 FMADD (aa2, bb6, cc12, cc12) 1353 nop 1354 1355 FMADD (aa5, bb7, cc13, cc13) 1356 LDF [BO + 44 * SIZE], b5 1357 FMADD (aa2, bb7, cc14, cc14) 1358 LDF [BO + 45 * SIZE], b6 1359 1360 FMADD (aa5, bb8, cc15, cc15) 1361 LDF [BO + 46 * SIZE], b7 1362 FMADD (aa2, bb8, cc16, 
cc16) 1363 LDF [BO + 47 * SIZE], b8 1364 1365 FMADD (aa3, bb9, cc01, cc01) 1366 FMADD (aa4, bb9, cc02, cc02) 1367 FMADD (aa3, bb2, cc03, cc03) 1368 FMADD (aa4, bb2, cc04, cc04) 1369 1370 FMADD (aa3, bb3, cc05, cc05) 1371 LDF [BO + 56 * SIZE], b9 1372 FMADD (aa4, bb3, cc06, cc06) 1373 LDF [BO + 49 * SIZE], b2 1374 1375 FMADD (aa3, bb4, cc07, cc07) 1376 LDF [BO + 50 * SIZE], b3 1377 FMADD (aa4, bb4, cc08, cc08) 1378 LDF [BO + 51 * SIZE], b4 1379 1380 FMADD (aa3, bb5, cc09, cc09) 1381 LDF [AO + 12 * SIZE], a5 1382 FMADD (aa4, bb5, cc10, cc10) 1383 LDF [AO + 13 * SIZE], a2 1384 1385 FMADD (aa3, bb6, cc11, cc11) 1386 cmp L, 0 1387 FMADD (aa4, bb6, cc12, cc12) 1388 nop 1389 1390 FMADD (aa3, bb7, cc13, cc13) 1391 LDF [BO + 52 * SIZE], b5 1392 FMADD (aa4, bb7, cc14, cc14) 1393 LDF [BO + 53 * SIZE], b6 1394 1395 FMADD (aa3, bb8, cc15, cc15) 1396 LDF [BO + 54 * SIZE], b7 1397 FMADD (aa4, bb8, cc16, cc16) 1398 LDF [BO + 55 * SIZE], b8 1399 1400 FMADD (aa5, bb1, cc01, cc01) 1401 FMADD (aa2, bb1, cc02, cc02) 1402 FMADD (aa5, bb2, cc03, cc03) 1403 FMADD (aa2, bb2, cc04, cc04) 1404 1405 FMADD (aa5, bb3, cc05, cc05) 1406 LDF [BO + 64 * SIZE], b1 1407 FMADD (aa2, bb3, cc06, cc06) 1408 LDF [BO + 57 * SIZE], b2 1409 1410 FMADD (aa5, bb4, cc07, cc07) 1411 LDF [BO + 58 * SIZE], b3 1412 FMADD (aa2, bb4, cc08, cc08) 1413 LDF [BO + 59 * SIZE], b4 1414 1415 FMADD (aa5, bb5, cc09, cc09) 1416 LDF [AO + 14 * SIZE], a3 1417 FMADD (aa2, bb5, cc10, cc10) 1418 LDF [AO + 15 * SIZE], a4 1419 1420 FMADD (aa5, bb6, cc11, cc11) 1421 add BO, 64 * SIZE, BO 1422 FMADD (aa2, bb6, cc12, cc12) 1423 add AO, 16 * SIZE, AO 1424 1425 FMADD (aa5, bb7, cc13, cc13) 1426 LDF [BO - 4 * SIZE], b5 1427 FMADD (aa2, bb7, cc14, cc14) 1428 LDF [BO - 3 * SIZE], b6 1429 1430 FMADD (aa5, bb8, cc15, cc15) 1431 LDF [BO - 2 * SIZE], b7 1432 FMADD (aa2, bb8, cc16, cc16) 1433 LDF [BO - 1 * SIZE], b8 1434 1435 FMADD (aa3, bb9, cc01, cc01) 1436 FMADD (aa4, bb9, cc02, cc02) 1437 FMADD (aa3, bb2, cc03, cc03) 1438 FMADD (aa4, bb2, 
cc04, cc04) 1439 1440 FMADD (aa3, bb3, cc05, cc05) 1441 LDF [BO + 8 * SIZE], b9 1442 FMADD (aa4, bb3, cc06, cc06) 1443 LDF [BO + 1 * SIZE], b2 1444 1445 FMADD (aa3, bb4, cc07, cc07) 1446 LDF [BO + 2 * SIZE], b3 1447 FMADD (aa4, bb4, cc08, cc08) 1448 LDF [BO + 3 * SIZE], b4 1449 1450 FMADD (aa3, bb5, cc09, cc09) 1451 LDF [AO + 8 * SIZE], a5 /****/ 1452 FMADD (aa4, bb5, cc10, cc10) 1453 LDF [AO + 1 * SIZE], a2 1454 1455 FMADD (aa3, bb6, cc11, cc11) 1456 FMADD (aa4, bb6, cc12, cc12) 1457 1458 FMADD (aa3, bb7, cc13, cc13) 1459 LDF [BO + 4 * SIZE], b5 1460 FMADD (aa4, bb7, cc14, cc14) 1461 LDF [BO + 5 * SIZE], b6 1462 1463 FMADD (aa3, bb8, cc15, cc15) 1464 LDF [BO + 6 * SIZE], b7 1465 FMADD (aa4, bb8, cc16, cc16) 1466 bg,pt %icc, .LL13 1467 LDF [BO + 7 * SIZE], b8 1468 .align 4 1469 1470.LL15: 1471#if defined(LT) || defined(RN) 1472 and KK, 7, L 1473#else 1474 sub K, KK, L 1475 and L, 7, L 1476#endif 1477 cmp L, 0 1478 ble,a,pn %icc, .LL18 1479 nop 1480 .align 4 1481 1482.LL17: 1483 FMADD (aa1, bb1, cc01, cc01) 1484 add L, -1, L 1485 FMADD (aa2, bb1, cc02, cc02) 1486 nop 1487 1488 FMADD (aa1, bb2, cc03, cc03) 1489 LDF [BO + 8 * SIZE], b1 1490 FMADD (aa2, bb2, cc04, cc04) 1491 LDF [BO + 9 * SIZE], b2 1492 1493 FMADD (aa1, bb3, cc05, cc05) 1494 cmp L, 0 1495 FMADD (aa2, bb3, cc06, cc06) 1496 nop 1497 1498 FMADD (aa1, bb4, cc07, cc07) 1499 LDF [BO + 10 * SIZE], b3 1500 FMADD (aa2, bb4, cc08, cc08) 1501 LDF [BO + 11 * SIZE], b4 1502 1503 FMADD (aa1, bb5, cc09, cc09) 1504 nop 1505 FMADD (aa2, bb5, cc10, cc10) 1506 nop 1507 1508 FMADD (aa1, bb6, cc11, cc11) 1509 LDF [BO + 12 * SIZE], b5 1510 FMADD (aa2, bb6, cc12, cc12) 1511 LDF [BO + 13 * SIZE], b6 1512 1513 FMADD (aa1, bb7, cc13, cc13) 1514 add AO, 2 * SIZE, AO 1515 FMADD (aa2, bb7, cc14, cc14) 1516 add BO, 8 * SIZE, BO 1517 1518 FMADD (aa1, bb8, cc15, cc15) 1519 LDF [AO + 0 * SIZE], a1 1520 FMADD (aa2, bb8, cc16, cc16) 1521 LDF [AO + 1 * SIZE], a2 1522 1523 LDF [BO + 6 * SIZE], b7 1524 bg,pt %icc, .LL17 1525 LDF [BO + 7 * 
SIZE], b8 1526 nop 1527 .align 4 1528 1529.LL18: 1530#if defined(LN) || defined(RT) 1531#ifdef LN 1532 sub KK, 2, TEMP1 1533#else 1534 sub KK, 8, TEMP1 1535#endif 1536 sll TEMP1, BASE_SHIFT + 1, TEMP2 1537 sll TEMP1, BASE_SHIFT + 3, TEMP1 1538 1539 add AORIG, TEMP2, AO 1540 add B, TEMP1, BO 1541#endif 1542 1543#if defined(LN) || defined(LT) 1544 LDF [BO + 0 * SIZE], a1 1545 LDF [BO + 1 * SIZE], a2 1546 LDF [BO + 2 * SIZE], a3 1547 LDF [BO + 3 * SIZE], a4 1548 1549 LDF [BO + 4 * SIZE], b1 1550 LDF [BO + 5 * SIZE], b2 1551 LDF [BO + 6 * SIZE], b3 1552 LDF [BO + 7 * SIZE], b4 1553 1554 FSUB a1, c01, c01 1555 FSUB a2, c03, c03 1556 FSUB a3, c05, c05 1557 FSUB a4, c07, c07 1558 1559 FSUB b1, c09, c09 1560 FSUB b2, c11, c11 1561 FSUB b3, c13, c13 1562 FSUB b4, c15, c15 1563 1564 LDF [BO + 8 * SIZE], a1 1565 LDF [BO + 9 * SIZE], a2 1566 LDF [BO + 10 * SIZE], a3 1567 LDF [BO + 11 * SIZE], a4 1568 1569 LDF [BO + 12 * SIZE], b1 1570 LDF [BO + 13 * SIZE], b2 1571 LDF [BO + 14 * SIZE], b3 1572 LDF [BO + 15 * SIZE], b4 1573 1574 FSUB a1, c02, c02 1575 FSUB a2, c04, c04 1576 FSUB a3, c06, c06 1577 FSUB a4, c08, c08 1578 1579 FSUB b1, c10, c10 1580 FSUB b2, c12, c12 1581 FSUB b3, c14, c14 1582 FSUB b4, c16, c16 1583#else 1584 LDF [AO + 0 * SIZE], a1 1585 LDF [AO + 1 * SIZE], a2 1586 LDF [AO + 2 * SIZE], a3 1587 LDF [AO + 3 * SIZE], a4 1588 1589 LDF [AO + 4 * SIZE], b1 1590 LDF [AO + 5 * SIZE], b2 1591 LDF [AO + 6 * SIZE], b3 1592 LDF [AO + 7 * SIZE], b4 1593 1594 FSUB a1, c01, c01 1595 FSUB a2, c02, c02 1596 FSUB a3, c03, c03 1597 FSUB a4, c04, c04 1598 1599 FSUB b1, c05, c05 1600 FSUB b2, c06, c06 1601 FSUB b3, c07, c07 1602 FSUB b4, c08, c08 1603 1604 LDF [AO + 8 * SIZE], a1 1605 LDF [AO + 9 * SIZE], a2 1606 LDF [AO + 10 * SIZE], a3 1607 LDF [AO + 11 * SIZE], a4 1608 1609 LDF [AO + 12 * SIZE], b1 1610 LDF [AO + 13 * SIZE], b2 1611 LDF [AO + 14 * SIZE], b3 1612 LDF [AO + 15 * SIZE], b4 1613 1614 FSUB a1, c09, c09 1615 FSUB a2, c10, c10 1616 FSUB a3, c11, c11 1617 FSUB a4, c12, 
c12 1618 1619 FSUB b1, c13, c13 1620 FSUB b2, c14, c14 1621 FSUB b3, c15, c15 1622 FSUB b4, c16, c16 1623#endif 1624 1625#ifdef LN 1626 LDF [AO + 3 * SIZE], a1 1627 LDF [AO + 2 * SIZE], a2 1628 LDF [AO + 0 * SIZE], a3 1629 1630 FMUL a1, c02, c02 1631 FMUL a1, c04, c04 1632 FMUL a1, c06, c06 1633 FMUL a1, c08, c08 1634 FMUL a1, c10, c10 1635 FMUL a1, c12, c12 1636 FMUL a1, c14, c14 1637 FMUL a1, c16, c16 1638 1639 FNMSUB (aa2, cc02, cc01, cc01) 1640 FNMSUB (aa2, cc04, cc03, cc03) 1641 FNMSUB (aa2, cc06, cc05, cc05) 1642 FNMSUB (aa2, cc08, cc07, cc07) 1643 FNMSUB (aa2, cc10, cc09, cc09) 1644 FNMSUB (aa2, cc12, cc11, cc11) 1645 FNMSUB (aa2, cc14, cc13, cc13) 1646 FNMSUB (aa2, cc16, cc15, cc15) 1647 1648 FMUL a3, c01, c01 1649 FMUL a3, c03, c03 1650 FMUL a3, c05, c05 1651 FMUL a3, c07, c07 1652 FMUL a3, c09, c09 1653 FMUL a3, c11, c11 1654 FMUL a3, c13, c13 1655 FMUL a3, c15, c15 1656#endif 1657 1658#ifdef LT 1659 LDF [AO + 0 * SIZE], a1 1660 LDF [AO + 1 * SIZE], a2 1661 LDF [AO + 3 * SIZE], a3 1662 1663 FMUL a1, c01, c01 1664 FMUL a1, c03, c03 1665 FMUL a1, c05, c05 1666 FMUL a1, c07, c07 1667 FMUL a1, c09, c09 1668 FMUL a1, c11, c11 1669 FMUL a1, c13, c13 1670 FMUL a1, c15, c15 1671 1672 FNMSUB (aa2, cc01, cc02, cc02) 1673 FNMSUB (aa2, cc03, cc04, cc04) 1674 FNMSUB (aa2, cc05, cc06, cc06) 1675 FNMSUB (aa2, cc07, cc08, cc08) 1676 FNMSUB (aa2, cc09, cc10, cc10) 1677 FNMSUB (aa2, cc11, cc12, cc12) 1678 FNMSUB (aa2, cc13, cc14, cc14) 1679 FNMSUB (aa2, cc15, cc16, cc16) 1680 1681 FMUL a3, c02, c02 1682 FMUL a3, c04, c04 1683 FMUL a3, c06, c06 1684 FMUL a3, c08, c08 1685 FMUL a3, c10, c10 1686 FMUL a3, c12, c12 1687 FMUL a3, c14, c14 1688 FMUL a3, c16, c16 1689#endif 1690 1691#ifdef RN 1692 LDF [BO + 0 * SIZE], a1 1693 LDF [BO + 1 * SIZE], a2 1694 LDF [BO + 2 * SIZE], a3 1695 LDF [BO + 3 * SIZE], a4 1696 LDF [BO + 4 * SIZE], b1 1697 LDF [BO + 5 * SIZE], b2 1698 LDF [BO + 6 * SIZE], b3 1699 LDF [BO + 7 * SIZE], b4 1700 1701 FMUL a1, c01, c01 1702 FMUL a1, c02, c02 1703 1704 
FNMSUB (aa2, cc01, cc03, cc03) 1705 FNMSUB (aa2, cc02, cc04, cc04) 1706 FNMSUB (aa3, cc01, cc05, cc05) 1707 FNMSUB (aa3, cc02, cc06, cc06) 1708 FNMSUB (aa4, cc01, cc07, cc07) 1709 FNMSUB (aa4, cc02, cc08, cc08) 1710 FNMSUB (bb1, cc01, cc09, cc09) 1711 FNMSUB (bb1, cc02, cc10, cc10) 1712 FNMSUB (bb2, cc01, cc11, cc11) 1713 FNMSUB (bb2, cc02, cc12, cc12) 1714 FNMSUB (bb3, cc01, cc13, cc13) 1715 FNMSUB (bb3, cc02, cc14, cc14) 1716 FNMSUB (bb4, cc01, cc15, cc15) 1717 FNMSUB (bb4, cc02, cc16, cc16) 1718 1719 LDF [BO + 9 * SIZE], a1 1720 LDF [BO + 10 * SIZE], a2 1721 LDF [BO + 11 * SIZE], a3 1722 LDF [BO + 12 * SIZE], a4 1723 LDF [BO + 13 * SIZE], b1 1724 LDF [BO + 14 * SIZE], b2 1725 LDF [BO + 15 * SIZE], b3 1726 1727 FMUL a1, c03, c03 1728 FMUL a1, c04, c04 1729 1730 FNMSUB (aa2, cc03, cc05, cc05) 1731 FNMSUB (aa2, cc04, cc06, cc06) 1732 FNMSUB (aa3, cc03, cc07, cc07) 1733 FNMSUB (aa3, cc04, cc08, cc08) 1734 FNMSUB (aa4, cc03, cc09, cc09) 1735 FNMSUB (aa4, cc04, cc10, cc10) 1736 FNMSUB (bb1, cc03, cc11, cc11) 1737 FNMSUB (bb1, cc04, cc12, cc12) 1738 FNMSUB (bb2, cc03, cc13, cc13) 1739 FNMSUB (bb2, cc04, cc14, cc14) 1740 FNMSUB (bb3, cc03, cc15, cc15) 1741 FNMSUB (bb3, cc04, cc16, cc16) 1742 1743 LDF [BO + 18 * SIZE], a1 1744 LDF [BO + 19 * SIZE], a2 1745 LDF [BO + 20 * SIZE], a3 1746 LDF [BO + 21 * SIZE], a4 1747 LDF [BO + 22 * SIZE], b1 1748 LDF [BO + 23 * SIZE], b2 1749 1750 FMUL a1, c05, c05 1751 FMUL a1, c06, c06 1752 1753 FNMSUB (aa2, cc05, cc07, cc07) 1754 FNMSUB (aa2, cc06, cc08, cc08) 1755 FNMSUB (aa3, cc05, cc09, cc09) 1756 FNMSUB (aa3, cc06, cc10, cc10) 1757 FNMSUB (aa4, cc05, cc11, cc11) 1758 FNMSUB (aa4, cc06, cc12, cc12) 1759 FNMSUB (bb1, cc05, cc13, cc13) 1760 FNMSUB (bb1, cc06, cc14, cc14) 1761 FNMSUB (bb2, cc05, cc15, cc15) 1762 FNMSUB (bb2, cc06, cc16, cc16) 1763 1764 LDF [BO + 27 * SIZE], a1 1765 LDF [BO + 28 * SIZE], a2 1766 LDF [BO + 29 * SIZE], a3 1767 LDF [BO + 30 * SIZE], a4 1768 LDF [BO + 31 * SIZE], b1 1769 1770 FMUL a1, c07, c07 1771 FMUL a1, 
c08, c08 1772 1773 FNMSUB (aa2, cc07, cc09, cc09) 1774 FNMSUB (aa2, cc08, cc10, cc10) 1775 FNMSUB (aa3, cc07, cc11, cc11) 1776 FNMSUB (aa3, cc08, cc12, cc12) 1777 FNMSUB (aa4, cc07, cc13, cc13) 1778 FNMSUB (aa4, cc08, cc14, cc14) 1779 FNMSUB (bb1, cc07, cc15, cc15) 1780 FNMSUB (bb1, cc08, cc16, cc16) 1781 1782 LDF [BO + 36 * SIZE], a1 1783 LDF [BO + 37 * SIZE], a2 1784 LDF [BO + 38 * SIZE], a3 1785 LDF [BO + 39 * SIZE], a4 1786 1787 FMUL a1, c09, c09 1788 FMUL a1, c10, c10 1789 1790 FNMSUB (aa2, cc09, cc11, cc11) 1791 FNMSUB (aa2, cc10, cc12, cc12) 1792 FNMSUB (aa3, cc09, cc13, cc13) 1793 FNMSUB (aa3, cc10, cc14, cc14) 1794 FNMSUB (aa4, cc09, cc15, cc15) 1795 FNMSUB (aa4, cc10, cc16, cc16) 1796 1797 LDF [BO + 45 * SIZE], a1 1798 LDF [BO + 46 * SIZE], a2 1799 LDF [BO + 47 * SIZE], a3 1800 1801 FMUL a1, c11, c11 1802 FMUL a1, c12, c12 1803 1804 FNMSUB (aa2, cc11, cc13, cc13) 1805 FNMSUB (aa2, cc12, cc14, cc14) 1806 FNMSUB (aa3, cc11, cc15, cc15) 1807 FNMSUB (aa3, cc12, cc16, cc16) 1808 1809 LDF [BO + 54 * SIZE], a1 1810 LDF [BO + 55 * SIZE], a2 1811 1812 FMUL a1, c13, c13 1813 FMUL a1, c14, c14 1814 1815 FNMSUB (aa2, cc13, cc15, cc15) 1816 FNMSUB (aa2, cc14, cc16, cc16) 1817 1818 LDF [BO + 63 * SIZE], a1 1819 1820 FMUL a1, c15, c15 1821 FMUL a1, c16, c16 1822#endif 1823 1824#ifdef RT 1825 LDF [BO + 63 * SIZE], a1 1826 LDF [BO + 62 * SIZE], a2 1827 LDF [BO + 61 * SIZE], a3 1828 LDF [BO + 60 * SIZE], a4 1829 LDF [BO + 59 * SIZE], b1 1830 LDF [BO + 58 * SIZE], b2 1831 LDF [BO + 57 * SIZE], b3 1832 LDF [BO + 56 * SIZE], b4 1833 1834 FMUL a1, c16, c16 1835 FMUL a1, c15, c15 1836 1837 FNMSUB (aa2, cc16, cc14, cc14) 1838 FNMSUB (aa2, cc15, cc13, cc13) 1839 FNMSUB (aa3, cc16, cc12, cc12) 1840 FNMSUB (aa3, cc15, cc11, cc11) 1841 FNMSUB (aa4, cc16, cc10, cc10) 1842 FNMSUB (aa4, cc15, cc09, cc09) 1843 FNMSUB (bb1, cc16, cc08, cc08) 1844 FNMSUB (bb1, cc15, cc07, cc07) 1845 FNMSUB (bb2, cc16, cc06, cc06) 1846 FNMSUB (bb2, cc15, cc05, cc05) 1847 FNMSUB (bb3, cc16, cc04, cc04) 1848 
FNMSUB (bb3, cc15, cc03, cc03) 1849 FNMSUB (bb4, cc16, cc02, cc02) 1850 FNMSUB (bb4, cc15, cc01, cc01) 1851 1852 LDF [BO + 54 * SIZE], a1 1853 LDF [BO + 53 * SIZE], a2 1854 LDF [BO + 52 * SIZE], a3 1855 LDF [BO + 51 * SIZE], a4 1856 LDF [BO + 50 * SIZE], b1 1857 LDF [BO + 49 * SIZE], b2 1858 LDF [BO + 48 * SIZE], b3 1859 1860 FMUL a1, c14, c14 1861 FMUL a1, c13, c13 1862 1863 FNMSUB (aa2, cc14, cc12, cc12) 1864 FNMSUB (aa2, cc13, cc11, cc11) 1865 FNMSUB (aa3, cc14, cc10, cc10) 1866 FNMSUB (aa3, cc13, cc09, cc09) 1867 FNMSUB (aa4, cc14, cc08, cc08) 1868 FNMSUB (aa4, cc13, cc07, cc07) 1869 FNMSUB (bb1, cc14, cc06, cc06) 1870 FNMSUB (bb1, cc13, cc05, cc05) 1871 FNMSUB (bb2, cc14, cc04, cc04) 1872 FNMSUB (bb2, cc13, cc03, cc03) 1873 FNMSUB (bb3, cc14, cc02, cc02) 1874 FNMSUB (bb3, cc13, cc01, cc01) 1875 1876 LDF [BO + 45 * SIZE], a1 1877 LDF [BO + 44 * SIZE], a2 1878 LDF [BO + 43 * SIZE], a3 1879 LDF [BO + 42 * SIZE], a4 1880 LDF [BO + 41 * SIZE], b1 1881 LDF [BO + 40 * SIZE], b2 1882 1883 FMUL a1, c12, c12 1884 FMUL a1, c11, c11 1885 1886 FNMSUB (aa2, cc12, cc10, cc10) 1887 FNMSUB (aa2, cc11, cc09, cc09) 1888 FNMSUB (aa3, cc12, cc08, cc08) 1889 FNMSUB (aa3, cc11, cc07, cc07) 1890 FNMSUB (aa4, cc12, cc06, cc06) 1891 FNMSUB (aa4, cc11, cc05, cc05) 1892 FNMSUB (bb1, cc12, cc04, cc04) 1893 FNMSUB (bb1, cc11, cc03, cc03) 1894 FNMSUB (bb2, cc12, cc02, cc02) 1895 FNMSUB (bb2, cc11, cc01, cc01) 1896 1897 LDF [BO + 36 * SIZE], a1 1898 LDF [BO + 35 * SIZE], a2 1899 LDF [BO + 34 * SIZE], a3 1900 LDF [BO + 33 * SIZE], a4 1901 LDF [BO + 32 * SIZE], b1 1902 1903 FMUL a1, c10, c10 1904 FMUL a1, c09, c09 1905 1906 FNMSUB (aa2, cc10, cc08, cc08) 1907 FNMSUB (aa2, cc09, cc07, cc07) 1908 FNMSUB (aa3, cc10, cc06, cc06) 1909 FNMSUB (aa3, cc09, cc05, cc05) 1910 FNMSUB (aa4, cc10, cc04, cc04) 1911 FNMSUB (aa4, cc09, cc03, cc03) 1912 FNMSUB (bb1, cc10, cc02, cc02) 1913 FNMSUB (bb1, cc09, cc01, cc01) 1914 1915 LDF [BO + 27 * SIZE], a1 1916 LDF [BO + 26 * SIZE], a2 1917 LDF [BO + 25 * SIZE], 
a3 1918 LDF [BO + 24 * SIZE], a4 1919 1920 FMUL a1, c08, c08 1921 FMUL a1, c07, c07 1922 1923 FNMSUB (aa2, cc08, cc06, cc06) 1924 FNMSUB (aa2, cc07, cc05, cc05) 1925 FNMSUB (aa3, cc08, cc04, cc04) 1926 FNMSUB (aa3, cc07, cc03, cc03) 1927 FNMSUB (aa4, cc08, cc02, cc02) 1928 FNMSUB (aa4, cc07, cc01, cc01) 1929 1930 LDF [BO + 18 * SIZE], a1 1931 LDF [BO + 17 * SIZE], a2 1932 LDF [BO + 16 * SIZE], a3 1933 1934 FMUL a1, c06, c06 1935 FMUL a1, c05, c05 1936 1937 FNMSUB (aa2, cc06, cc04, cc04) 1938 FNMSUB (aa2, cc05, cc03, cc03) 1939 FNMSUB (aa3, cc06, cc02, cc02) 1940 FNMSUB (aa3, cc05, cc01, cc01) 1941 1942 LDF [BO + 9 * SIZE], a1 1943 LDF [BO + 8 * SIZE], a2 1944 1945 FMUL a1, c04, c04 1946 FMUL a1, c03, c03 1947 1948 FNMSUB (aa2, cc04, cc02, cc02) 1949 FNMSUB (aa2, cc03, cc01, cc01) 1950 1951 LDF [BO + 0 * SIZE], a1 1952 1953 FMUL a1, c02, c02 1954 FMUL a1, c01, c01 1955#endif 1956 1957#ifdef LN 1958 add C1, -2 * SIZE, C1 1959 add C2, -2 * SIZE, C2 1960 add C3, -2 * SIZE, C3 1961 add C4, -2 * SIZE, C4 1962 add C5, -2 * SIZE, C5 1963 add C6, -2 * SIZE, C6 1964 add C7, -2 * SIZE, C7 1965 add C8, -2 * SIZE, C8 1966#endif 1967 1968#if defined(LN) || defined(LT) 1969 STF c01, [BO + 0 * SIZE] 1970 STF c03, [BO + 1 * SIZE] 1971 STF c05, [BO + 2 * SIZE] 1972 STF c07, [BO + 3 * SIZE] 1973 1974 STF c09, [BO + 4 * SIZE] 1975 STF c11, [BO + 5 * SIZE] 1976 STF c13, [BO + 6 * SIZE] 1977 STF c15, [BO + 7 * SIZE] 1978 1979 STF c02, [BO + 8 * SIZE] 1980 STF c04, [BO + 9 * SIZE] 1981 STF c06, [BO + 10 * SIZE] 1982 STF c08, [BO + 11 * SIZE] 1983 1984 STF c10, [BO + 12 * SIZE] 1985 STF c12, [BO + 13 * SIZE] 1986 STF c14, [BO + 14 * SIZE] 1987 STF c16, [BO + 15 * SIZE] 1988#else 1989 STF c01, [AO + 0 * SIZE] 1990 STF c02, [AO + 1 * SIZE] 1991 STF c03, [AO + 2 * SIZE] 1992 STF c04, [AO + 3 * SIZE] 1993 1994 STF c05, [AO + 4 * SIZE] 1995 STF c06, [AO + 5 * SIZE] 1996 STF c07, [AO + 6 * SIZE] 1997 STF c08, [AO + 7 * SIZE] 1998 1999 STF c09, [AO + 8 * SIZE] 2000 STF c10, [AO + 9 * SIZE] 2001 
STF c11, [AO + 10 * SIZE] 2002 STF c12, [AO + 11 * SIZE] 2003 2004 STF c13, [AO + 12 * SIZE] 2005 STF c14, [AO + 13 * SIZE] 2006 STF c15, [AO + 14 * SIZE] 2007 STF c16, [AO + 15 * SIZE] 2008#endif 2009 2010 STF c01, [C1 + 0 * SIZE] 2011 STF c02, [C1 + 1 * SIZE] 2012 STF c03, [C2 + 0 * SIZE] 2013 STF c04, [C2 + 1 * SIZE] 2014 2015 STF c05, [C3 + 0 * SIZE] 2016 STF c06, [C3 + 1 * SIZE] 2017 STF c07, [C4 + 0 * SIZE] 2018 STF c08, [C4 + 1 * SIZE] 2019 2020 STF c09, [C5 + 0 * SIZE] 2021 STF c10, [C5 + 1 * SIZE] 2022 STF c11, [C6 + 0 * SIZE] 2023 STF c12, [C6 + 1 * SIZE] 2024 2025 STF c13, [C7 + 0 * SIZE] 2026 STF c14, [C7 + 1 * SIZE] 2027 STF c15, [C8 + 0 * SIZE] 2028 STF c16, [C8 + 1 * SIZE] 2029 2030#ifndef LN 2031 add C1, 2 * SIZE, C1 2032 add C2, 2 * SIZE, C2 2033 add C3, 2 * SIZE, C3 2034 add C4, 2 * SIZE, C4 2035 add C5, 2 * SIZE, C5 2036 add C6, 2 * SIZE, C6 2037 add C7, 2 * SIZE, C7 2038 add C8, 2 * SIZE, C8 2039#endif 2040 2041#ifdef RT 2042 sll K, BASE_SHIFT + 1, TEMP1 2043 add AORIG, TEMP1, AORIG 2044#endif 2045 2046#if defined(LT) || defined(RN) 2047 sub K, KK, TEMP1 2048 sll TEMP1, BASE_SHIFT + 1, TEMP2 2049 sll TEMP1, BASE_SHIFT + 3, TEMP1 2050 add AO, TEMP2, AO 2051 add BO, TEMP1, BO 2052#endif 2053 2054#ifdef LT 2055 add KK, 2, KK 2056#endif 2057 2058#ifdef LN 2059 sub KK, 2, KK 2060#endif 2061 2062 add I, -1, I 2063 cmp I, 0 2064 bg,pt %icc, .LL12 2065 nop 2066 .align 4 2067 2068.LL29: 2069#ifdef LN 2070 sll K, BASE_SHIFT + 3, TEMP1 2071 add B, TEMP1, B 2072#endif 2073 2074#if defined(LT) || defined(RN) 2075 mov BO, B 2076#endif 2077 2078#ifdef RN 2079 add KK, 8, KK 2080#endif 2081 2082#ifdef RT 2083 sub KK, 8, KK 2084#endif 2085 2086 add J, -1, J 2087 cmp J, 0 2088 bg,pt %icc, .LL11 2089 nop 2090 .align 4 2091 2092.LL30: 2093 and N, 4, J 2094 cmp J, 0 2095 ble,pn %icc, .LL50 2096 nop 2097 2098#ifdef RT 2099 sll K, BASE_SHIFT + 2, TEMP1 2100 sub B, TEMP1, B 2101#endif 2102 2103#ifndef RT 2104 mov C, C1 2105 add C, LDC, C2 2106 add C2, LDC, C3 2107 add 
C3, LDC, C4 2108 add C4, LDC, C 2109#else 2110 sub C, LDC, C4 2111 sub C4, LDC, C3 2112 sub C3, LDC, C2 2113 sub C2, LDC, C1 2114 sub C2, LDC, C 2115#endif 2116 2117#ifdef LN 2118 add M, OFFSET, KK 2119#endif 2120 2121#ifdef LT 2122 mov OFFSET, KK 2123#endif 2124 2125#if defined(LN) || defined(RT) 2126 mov A, AORIG 2127#else 2128 mov A, AO 2129#endif 2130 2131 and M, 1, I 2132 cmp I, 0 2133 ble,pn %icc, .LL40 2134 nop 2135 2136#if defined(LT) || defined(RN) 2137 mov B, BO 2138#else 2139#ifdef LN 2140 sll K, BASE_SHIFT + 0, TEMP1 2141 sub AORIG, TEMP1, AORIG 2142#endif 2143 2144 sll KK, BASE_SHIFT + 0, TEMP1 2145 sll KK, BASE_SHIFT + 2, TEMP2 2146 2147 add AORIG, TEMP1, AO 2148 add B, TEMP2, BO 2149#endif 2150 2151 LDF [AO + 0 * SIZE], a1 2152 LDF [AO + 1 * SIZE], a2 2153 LDF [AO + 2 * SIZE], a3 2154 LDF [AO + 3 * SIZE], a4 2155 2156 LDF [BO + 0 * SIZE], b1 2157 LDF [BO + 1 * SIZE], b2 2158 LDF [BO + 2 * SIZE], b3 2159 LDF [BO + 3 * SIZE], b4 2160 LDF [BO + 4 * SIZE], b5 2161 LDF [BO + 5 * SIZE], b6 2162 FCLR (cc01) 2163 LDF [BO + 6 * SIZE], b7 2164 FCLR (cc03) 2165 LDF [BO + 7 * SIZE], b8 2166 FCLR (cc05) 2167 LDF [BO + 8 * SIZE], b9 2168 FCLR (cc07) 2169 2170#if defined(LT) || defined(RN) 2171 sra KK, 2, L 2172#else 2173 sub K, KK, L 2174 sra L, 2, L 2175#endif 2176 cmp L, 0 2177 ble,pn %icc, .LL45 2178 nop 2179 2180.LL43: 2181 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2182 add L, -1, L 2183 2184 FMADD (aa1, bb1, cc01, cc01) 2185 LDF [BO + 16 * SIZE], b1 2186 FMADD (aa1, bb2, cc03, cc03) 2187 LDF [BO + 9 * SIZE], b2 2188 FMADD (aa1, bb3, cc05, cc05) 2189 LDF [BO + 10 * SIZE], b3 2190 FMADD (aa1, bb4, cc07, cc07) 2191 LDF [BO + 11 * SIZE], b4 2192 2193 LDF [AO + 4 * SIZE], a1 2194 cmp L, 0 2195 2196 FMADD (aa2, bb5, cc01, cc01) 2197 LDF [BO + 12 * SIZE], b5 2198 FMADD (aa2, bb6, cc03, cc03) 2199 LDF [BO + 13 * SIZE], b6 2200 FMADD (aa2, bb7, cc05, cc05) 2201 LDF [BO + 14 * SIZE], b7 2202 FMADD (aa2, bb8, cc07, cc07) 2203 LDF [BO + 15 * SIZE], 
b8 2204 2205 LDF [AO + 5 * SIZE], a2 2206 add AO, 4 * SIZE, AO 2207 2208 FMADD (aa3, bb9, cc01, cc01) 2209 LDF [BO + 24 * SIZE], b9 2210 FMADD (aa3, bb2, cc03, cc03) 2211 LDF [BO + 17 * SIZE], b2 2212 FMADD (aa3, bb3, cc05, cc05) 2213 LDF [BO + 18 * SIZE], b3 2214 FMADD (aa3, bb4, cc07, cc07) 2215 LDF [BO + 19 * SIZE], b4 2216 2217 LDF [AO + 2 * SIZE], a3 2218 add BO, 16 * SIZE, BO 2219 2220 FMADD (aa4, bb5, cc01, cc01) 2221 LDF [BO + 4 * SIZE], b5 2222 FMADD (aa4, bb6, cc03, cc03) 2223 LDF [BO + 5 * SIZE], b6 2224 FMADD (aa4, bb7, cc05, cc05) 2225 LDF [BO + 6 * SIZE], b7 2226 FMADD (aa4, bb8, cc07, cc07) 2227 LDF [BO + 7 * SIZE], b8 2228 2229 bg,pt %icc, .LL43 2230 LDF [AO + 3 * SIZE], a4 2231 .align 4 2232 2233.LL45: 2234#if defined(LT) || defined(RN) 2235 and KK, 3, L 2236#else 2237 sub K, KK, L 2238 and L, 3, L 2239#endif 2240 cmp L, 0 2241 ble,a,pn %icc, .LL48 2242 nop 2243 .align 4 2244 2245.LL47: 2246 FMADD (aa1, bb1, cc01, cc01) 2247 LDF [BO + 4 * SIZE], b1 2248 add L, -1, L 2249 FMADD (aa1, bb2, cc03, cc03) 2250 LDF [BO + 5 * SIZE], b2 2251 add AO, 1 * SIZE, AO 2252 2253 FMADD (aa1, bb3, cc05, cc05) 2254 LDF [BO + 6 * SIZE], b3 2255 cmp L, 0 2256 FMADD (aa1, bb4, cc07, cc07) 2257 LDF [BO + 7 * SIZE], b4 2258 add BO, 4 * SIZE, BO 2259 2260 bg,pt %icc, .LL47 2261 LDF [AO + 0 * SIZE], a1 2262 .align 4 2263 2264.LL48: 2265#if defined(LN) || defined(RT) 2266#ifdef LN 2267 sub KK, 1, TEMP1 2268#else 2269 sub KK, 4, TEMP1 2270#endif 2271 sll TEMP1, BASE_SHIFT + 0, TEMP2 2272 sll TEMP1, BASE_SHIFT + 2, TEMP1 2273 2274 add AORIG, TEMP2, AO 2275 add B, TEMP1, BO 2276#endif 2277 2278#if defined(LN) || defined(LT) 2279 LDF [BO + 0 * SIZE], a1 2280 LDF [BO + 1 * SIZE], a2 2281 LDF [BO + 2 * SIZE], a3 2282 LDF [BO + 3 * SIZE], a4 2283 2284 FSUB a1, c01, c01 2285 FSUB a2, c03, c03 2286 FSUB a3, c05, c05 2287 FSUB a4, c07, c07 2288#else 2289 LDF [AO + 0 * SIZE], a1 2290 LDF [AO + 1 * SIZE], a2 2291 LDF [AO + 2 * SIZE], a3 2292 LDF [AO + 3 * SIZE], a4 2293 2294 FSUB a1, 
c01, c01 2295 FSUB a2, c03, c03 2296 FSUB a3, c05, c05 2297 FSUB a4, c07, c07 2298#endif 2299 2300#if defined(LN) || defined(LT) 2301 LDF [AO + 0 * SIZE], a1 2302 2303 FMUL a1, c01, c01 2304 FMUL a1, c03, c03 2305 FMUL a1, c05, c05 2306 FMUL a1, c07, c07 2307#endif 2308 2309#ifdef RN 2310 LDF [BO + 0 * SIZE], a1 2311 LDF [BO + 1 * SIZE], a2 2312 LDF [BO + 2 * SIZE], a3 2313 LDF [BO + 3 * SIZE], a4 2314 2315 FMUL a1, c01, c01 2316 2317 FNMSUB (aa2, cc01, cc03, cc03) 2318 FNMSUB (aa3, cc01, cc05, cc05) 2319 FNMSUB (aa4, cc01, cc07, cc07) 2320 2321 LDF [BO + 5 * SIZE], a1 2322 LDF [BO + 6 * SIZE], a2 2323 LDF [BO + 7 * SIZE], a3 2324 2325 FMUL a1, c03, c03 2326 2327 FNMSUB (aa2, cc03, cc05, cc05) 2328 FNMSUB (aa3, cc03, cc07, cc07) 2329 2330 LDF [BO + 10 * SIZE], a1 2331 LDF [BO + 11 * SIZE], a2 2332 2333 FMUL a1, c05, c05 2334 2335 FNMSUB (aa2, cc05, cc07, cc07) 2336 2337 LDF [BO + 15 * SIZE], a1 2338 2339 FMUL a1, c07, c07 2340#endif 2341 2342#ifdef RT 2343 LDF [BO + 15 * SIZE], a1 2344 LDF [BO + 14 * SIZE], a2 2345 LDF [BO + 13 * SIZE], a3 2346 LDF [BO + 12 * SIZE], a4 2347 2348 FMUL a1, c07, c07 2349 2350 FNMSUB (aa2, cc07, cc05, cc05) 2351 FNMSUB (aa3, cc07, cc03, cc03) 2352 FNMSUB (aa4, cc07, cc01, cc01) 2353 2354 LDF [BO + 10 * SIZE], a1 2355 LDF [BO + 9 * SIZE], a2 2356 LDF [BO + 8 * SIZE], a3 2357 2358 FMUL a1, c05, c05 2359 2360 FNMSUB (aa2, cc05, cc03, cc03) 2361 FNMSUB (aa3, cc05, cc01, cc01) 2362 2363 LDF [BO + 5 * SIZE], a1 2364 LDF [BO + 4 * SIZE], a2 2365 2366 FMUL a1, c03, c03 2367 2368 FNMSUB (aa2, cc03, cc01, cc01) 2369 2370 LDF [BO + 0 * SIZE], a1 2371 2372 FMUL a1, c01, c01 2373#endif 2374 2375#ifdef LN 2376 add C1, -1 * SIZE, C1 2377 add C2, -1 * SIZE, C2 2378 add C3, -1 * SIZE, C3 2379 add C4, -1 * SIZE, C4 2380#endif 2381 2382#if defined(LN) || defined(LT) 2383 STF c01, [BO + 0 * SIZE] 2384 STF c03, [BO + 1 * SIZE] 2385 STF c05, [BO + 2 * SIZE] 2386 STF c07, [BO + 3 * SIZE] 2387#else 2388 STF c01, [AO + 0 * SIZE] 2389 STF c03, [AO + 1 * SIZE] 
2390 STF c05, [AO + 2 * SIZE] 2391 STF c07, [AO + 3 * SIZE] 2392#endif 2393 2394 STF c01, [C1 + 0 * SIZE] 2395 STF c03, [C2 + 0 * SIZE] 2396 STF c05, [C3 + 0 * SIZE] 2397 STF c07, [C4 + 0 * SIZE] 2398 2399#ifdef RT 2400 sll K, BASE_SHIFT + 0, TEMP1 2401 add AORIG, TEMP1, AORIG 2402#endif 2403 2404#if defined(LT) || defined(RN) 2405 sub K, KK, TEMP1 2406 sll TEMP1, BASE_SHIFT + 0, TEMP2 2407 sll TEMP1, BASE_SHIFT + 2, TEMP1 2408 add AO, TEMP2, AO 2409 add BO, TEMP1, BO 2410#endif 2411 2412#ifdef LT 2413 add KK, 1, KK 2414#endif 2415 2416#ifdef LN 2417 sub KK, 1, KK 2418#endif 2419 .align 4 2420 2421.LL40: 2422 sra M, 1, I 2423 cmp I, 0 2424 ble,pn %icc, .LL49 2425 nop 2426 .align 4 2427 2428.LL32: 2429#if defined(LT) || defined(RN) 2430 mov B, BO 2431#else 2432#ifdef LN 2433 sll K, BASE_SHIFT + 1, TEMP1 2434 sub AORIG, TEMP1, AORIG 2435#endif 2436 2437 sll KK, BASE_SHIFT + 1, TEMP1 2438 sll KK, BASE_SHIFT + 2, TEMP2 2439 2440 add AORIG, TEMP1, AO 2441 add B, TEMP2, BO 2442#endif 2443 2444 LDF [AO + 0 * SIZE], a1 2445 LDF [AO + 1 * SIZE], a2 2446 2447 LDF [BO + 0 * SIZE], b1 2448 LDF [BO + 1 * SIZE], b2 2449 LDF [BO + 2 * SIZE], b3 2450 LDF [BO + 3 * SIZE], b4 2451 LDF [BO + 4 * SIZE], b5 2452 2453 LDF [BO + 5 * SIZE], b6 2454 FCLR (cc01) 2455 LDF [BO + 6 * SIZE], b7 2456 FCLR (cc02) 2457 LDF [BO + 7 * SIZE], b8 2458 FCLR (cc03) 2459 LDF [BO + 8 * SIZE], b9 2460 FCLR (cc04) 2461 2462 prefetch [C1 + 2 * SIZE], 3 2463 FCLR (cc05) 2464 prefetch [C2 + 2 * SIZE], 3 2465 FCLR (cc06) 2466 prefetch [C3 + 2 * SIZE], 3 2467 FCLR (cc07) 2468 prefetch [C4 + 2 * SIZE], 3 2469 FCLR (cc08) 2470 2471#if defined(LT) || defined(RN) 2472 sra KK, 2, L 2473#else 2474 sub K, KK, L 2475 sra L, 2, L 2476#endif 2477 cmp L, 0 2478 ble,pn %icc, .LL35 2479 nop 2480 .align 4 2481 2482.LL33: 2483 FMADD (aa1, bb1, cc01, cc01) 2484 LDF [AO + 2 * SIZE], a3 2485 FMADD (aa2, bb1, cc02, cc02) 2486 LDF [AO + 3 * SIZE], a4 2487 2488 FMADD (aa1, bb2, cc03, cc03) 2489 LDF [BO + 16 * SIZE], b1 2490 FMADD 
(aa2, bb2, cc04, cc04) 2491 LDF [BO + 9 * SIZE], b2 2492 2493 FMADD (aa1, bb3, cc05, cc05) 2494 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2495 FMADD (aa2, bb3, cc06, cc06) 2496 add L, -1, L 2497 2498 FMADD (aa1, bb4, cc07, cc07) 2499 LDF [BO + 10 * SIZE], b3 2500 FMADD (aa2, bb4, cc08, cc08) 2501 LDF [BO + 11 * SIZE], b4 2502 2503 FMADD (aa3, bb5, cc01, cc01) 2504 LDF [AO + 4 * SIZE], a1 2505 FMADD (aa4, bb5, cc02, cc02) 2506 LDF [AO + 5 * SIZE], a2 2507 2508 FMADD (aa3, bb6, cc03, cc03) 2509 LDF [BO + 12 * SIZE], b5 2510 FMADD (aa4, bb6, cc04, cc04) 2511 LDF [BO + 13 * SIZE], b6 2512 2513 FMADD (aa3, bb7, cc05, cc05) 2514 cmp L, 0 2515 FMADD (aa4, bb7, cc06, cc06) 2516 add AO, 8 * SIZE, AO 2517 2518 FMADD (aa3, bb8, cc07, cc07) 2519 LDF [BO + 14 * SIZE], b7 2520 FMADD (aa4, bb8, cc08, cc08) 2521 LDF [BO + 15 * SIZE], b8 2522 2523 FMADD (aa1, bb9, cc01, cc01) 2524 LDF [AO - 2 * SIZE], a3 2525 FMADD (aa2, bb9, cc02, cc02) 2526 LDF [AO - 1 * SIZE], a4 2527 2528 FMADD (aa1, bb2, cc03, cc03) 2529 LDF [BO + 24 * SIZE], b9 2530 FMADD (aa2, bb2, cc04, cc04) 2531 LDF [BO + 17 * SIZE], b2 2532 2533 FMADD (aa1, bb3, cc05, cc05) 2534 add BO, 16 * SIZE, BO 2535 FMADD (aa2, bb3, cc06, cc06) 2536 nop 2537 2538 FMADD (aa1, bb4, cc07, cc07) 2539 LDF [BO + 2 * SIZE], b3 2540 FMADD (aa2, bb4, cc08, cc08) 2541 LDF [BO + 3 * SIZE], b4 2542 2543 FMADD (aa3, bb5, cc01, cc01) 2544 LDF [AO + 0 * SIZE], a1 2545 FMADD (aa4, bb5, cc02, cc02) 2546 LDF [AO + 1 * SIZE], a2 2547 FMADD (aa3, bb6, cc03, cc03) 2548 LDF [BO + 4 * SIZE], b5 2549 FMADD (aa4, bb6, cc04, cc04) 2550 LDF [BO + 5 * SIZE], b6 2551 2552 FMADD (aa3, bb7, cc05, cc05) 2553 nop 2554 FMADD (aa4, bb7, cc06, cc06) 2555 LDF [BO + 6 * SIZE], b7 2556 2557 FMADD (aa3, bb8, cc07, cc07) 2558 FMADD (aa4, bb8, cc08, cc08) 2559 bg,pt %icc, .LL33 2560 LDF [BO + 7 * SIZE], b8 2561 .align 4 2562 2563.LL35: 2564#if defined(LT) || defined(RN) 2565 and KK, 3, L 2566#else 2567 sub K, KK, L 2568 and L, 3, L 2569#endif 2570 cmp 
L, 0 2571 ble,a,pn %icc, .LL38 2572 nop 2573 .align 4 2574 2575.LL37: 2576 FMADD (aa1, bb1, cc01, cc01) 2577 add L, -1, L 2578 FMADD (aa2, bb1, cc02, cc02) 2579 LDF [BO + 4 * SIZE], b1 2580 2581 FMADD (aa1, bb2, cc03, cc03) 2582 add AO, 2 * SIZE, AO 2583 FMADD (aa2, bb2, cc04, cc04) 2584 LDF [BO + 5 * SIZE], b2 2585 2586 FMADD (aa1, bb3, cc05, cc05) 2587 cmp L, 0 2588 FMADD (aa2, bb3, cc06, cc06) 2589 LDF [BO + 6 * SIZE], b3 2590 2591 FMADD (aa1, bb4, cc07, cc07) 2592 LDF [AO + 0 * SIZE], a1 2593 FMADD (aa2, bb4, cc08, cc08) 2594 LDF [AO + 1 * SIZE], a2 2595 2596 LDF [BO + 7 * SIZE], b4 2597 bg,pt %icc, .LL37 2598 add BO, 4 * SIZE, BO 2599 .align 4 2600 2601.LL38: 2602#if defined(LN) || defined(RT) 2603#ifdef LN 2604 sub KK, 2, TEMP1 2605#else 2606 sub KK, 4, TEMP1 2607#endif 2608 sll TEMP1, BASE_SHIFT + 1, TEMP2 2609 sll TEMP1, BASE_SHIFT + 2, TEMP1 2610 2611 add AORIG, TEMP2, AO 2612 add B, TEMP1, BO 2613#endif 2614 2615#if defined(LN) || defined(LT) 2616 LDF [BO + 0 * SIZE], a1 2617 LDF [BO + 1 * SIZE], a2 2618 LDF [BO + 2 * SIZE], a3 2619 LDF [BO + 3 * SIZE], a4 2620 2621 LDF [BO + 4 * SIZE], b1 2622 LDF [BO + 5 * SIZE], b2 2623 LDF [BO + 6 * SIZE], b3 2624 LDF [BO + 7 * SIZE], b4 2625 2626 FSUB a1, c01, c01 2627 FSUB a2, c03, c03 2628 FSUB a3, c05, c05 2629 FSUB a4, c07, c07 2630 2631 FSUB b1, c02, c02 2632 FSUB b2, c04, c04 2633 FSUB b3, c06, c06 2634 FSUB b4, c08, c08 2635#else 2636 LDF [AO + 0 * SIZE], a1 2637 LDF [AO + 1 * SIZE], a2 2638 LDF [AO + 2 * SIZE], a3 2639 LDF [AO + 3 * SIZE], a4 2640 2641 LDF [AO + 4 * SIZE], b1 2642 LDF [AO + 5 * SIZE], b2 2643 LDF [AO + 6 * SIZE], b3 2644 LDF [AO + 7 * SIZE], b4 2645 2646 FSUB a1, c01, c01 2647 FSUB a2, c02, c02 2648 FSUB a3, c03, c03 2649 FSUB a4, c04, c04 2650 2651 FSUB b1, c05, c05 2652 FSUB b2, c06, c06 2653 FSUB b3, c07, c07 2654 FSUB b4, c08, c08 2655 2656#endif 2657 2658#ifdef LN 2659 LDF [AO + 3 * SIZE], a1 2660 LDF [AO + 2 * SIZE], a2 2661 LDF [AO + 0 * SIZE], a3 2662 2663 FMUL a1, c02, c02 2664 FMUL 
a1, c04, c04 2665 FMUL a1, c06, c06 2666 FMUL a1, c08, c08 2667 2668 FNMSUB (aa2, cc02, cc01, cc01) 2669 FNMSUB (aa2, cc04, cc03, cc03) 2670 FNMSUB (aa2, cc06, cc05, cc05) 2671 FNMSUB (aa2, cc08, cc07, cc07) 2672 2673 FMUL a3, c01, c01 2674 FMUL a3, c03, c03 2675 FMUL a3, c05, c05 2676 FMUL a3, c07, c07 2677#endif 2678 2679#ifdef LT 2680 LDF [AO + 0 * SIZE], a1 2681 LDF [AO + 1 * SIZE], a2 2682 LDF [AO + 3 * SIZE], a3 2683 2684 FMUL a1, c01, c01 2685 FMUL a1, c03, c03 2686 FMUL a1, c05, c05 2687 FMUL a1, c07, c07 2688 2689 FNMSUB (aa2, cc01, cc02, cc02) 2690 FNMSUB (aa2, cc03, cc04, cc04) 2691 FNMSUB (aa2, cc05, cc06, cc06) 2692 FNMSUB (aa2, cc07, cc08, cc08) 2693 2694 FMUL a3, c02, c02 2695 FMUL a3, c04, c04 2696 FMUL a3, c06, c06 2697 FMUL a3, c08, c08 2698#endif 2699 2700#ifdef RN 2701 LDF [BO + 0 * SIZE], a1 2702 LDF [BO + 1 * SIZE], a2 2703 LDF [BO + 2 * SIZE], a3 2704 LDF [BO + 3 * SIZE], a4 2705 2706 FMUL a1, c01, c01 2707 FMUL a1, c02, c02 2708 2709 FNMSUB (aa2, cc01, cc03, cc03) 2710 FNMSUB (aa2, cc02, cc04, cc04) 2711 FNMSUB (aa3, cc01, cc05, cc05) 2712 FNMSUB (aa3, cc02, cc06, cc06) 2713 FNMSUB (aa4, cc01, cc07, cc07) 2714 FNMSUB (aa4, cc02, cc08, cc08) 2715 2716 LDF [BO + 5 * SIZE], a1 2717 LDF [BO + 6 * SIZE], a2 2718 LDF [BO + 7 * SIZE], a3 2719 2720 FMUL a1, c03, c03 2721 FMUL a1, c04, c04 2722 2723 FNMSUB (aa2, cc03, cc05, cc05) 2724 FNMSUB (aa2, cc04, cc06, cc06) 2725 FNMSUB (aa3, cc03, cc07, cc07) 2726 FNMSUB (aa3, cc04, cc08, cc08) 2727 2728 LDF [BO + 10 * SIZE], a1 2729 LDF [BO + 11 * SIZE], a2 2730 2731 FMUL a1, c05, c05 2732 FMUL a1, c06, c06 2733 2734 FNMSUB (aa2, cc05, cc07, cc07) 2735 FNMSUB (aa2, cc06, cc08, cc08) 2736 2737 LDF [BO + 15 * SIZE], a1 2738 2739 FMUL a1, c07, c07 2740 FMUL a1, c08, c08 2741#endif 2742 2743#ifdef RT 2744 LDF [BO + 15 * SIZE], a1 2745 LDF [BO + 14 * SIZE], a2 2746 LDF [BO + 13 * SIZE], a3 2747 LDF [BO + 12 * SIZE], a4 2748 2749 FMUL a1, c08, c08 2750 FMUL a1, c07, c07 2751 2752 FNMSUB (aa2, cc08, cc06, cc06) 
2753 FNMSUB (aa2, cc07, cc05, cc05) 2754 FNMSUB (aa3, cc08, cc04, cc04) 2755 FNMSUB (aa3, cc07, cc03, cc03) 2756 FNMSUB (aa4, cc08, cc02, cc02) 2757 FNMSUB (aa4, cc07, cc01, cc01) 2758 2759 LDF [BO + 10 * SIZE], a1 2760 LDF [BO + 9 * SIZE], a2 2761 LDF [BO + 8 * SIZE], a3 2762 2763 FMUL a1, c06, c06 2764 FMUL a1, c05, c05 2765 2766 FNMSUB (aa2, cc06, cc04, cc04) 2767 FNMSUB (aa2, cc05, cc03, cc03) 2768 FNMSUB (aa3, cc06, cc02, cc02) 2769 FNMSUB (aa3, cc05, cc01, cc01) 2770 2771 LDF [BO + 5 * SIZE], a1 2772 LDF [BO + 4 * SIZE], a2 2773 2774 FMUL a1, c04, c04 2775 FMUL a1, c03, c03 2776 2777 FNMSUB (aa2, cc04, cc02, cc02) 2778 FNMSUB (aa2, cc03, cc01, cc01) 2779 2780 LDF [BO + 0 * SIZE], a1 2781 2782 FMUL a1, c02, c02 2783 FMUL a1, c01, c01 2784#endif 2785 2786#ifdef LN 2787 add C1, -2 * SIZE, C1 2788 add C2, -2 * SIZE, C2 2789 add C3, -2 * SIZE, C3 2790 add C4, -2 * SIZE, C4 2791#endif 2792 2793#if defined(LN) || defined(LT) 2794 STF c01, [BO + 0 * SIZE] 2795 STF c03, [BO + 1 * SIZE] 2796 STF c05, [BO + 2 * SIZE] 2797 STF c07, [BO + 3 * SIZE] 2798 2799 STF c02, [BO + 4 * SIZE] 2800 STF c04, [BO + 5 * SIZE] 2801 STF c06, [BO + 6 * SIZE] 2802 STF c08, [BO + 7 * SIZE] 2803#else 2804 STF c01, [AO + 0 * SIZE] 2805 STF c02, [AO + 1 * SIZE] 2806 STF c03, [AO + 2 * SIZE] 2807 STF c04, [AO + 3 * SIZE] 2808 2809 STF c05, [AO + 4 * SIZE] 2810 STF c06, [AO + 5 * SIZE] 2811 STF c07, [AO + 6 * SIZE] 2812 STF c08, [AO + 7 * SIZE] 2813#endif 2814 2815 STF c01, [C1 + 0 * SIZE] 2816 STF c02, [C1 + 1 * SIZE] 2817 STF c03, [C2 + 0 * SIZE] 2818 STF c04, [C2 + 1 * SIZE] 2819 2820 STF c05, [C3 + 0 * SIZE] 2821 STF c06, [C3 + 1 * SIZE] 2822 STF c07, [C4 + 0 * SIZE] 2823 STF c08, [C4 + 1 * SIZE] 2824 2825#ifndef LN 2826 add C1, 2 * SIZE, C1 2827 add C2, 2 * SIZE, C2 2828 add C3, 2 * SIZE, C3 2829 add C4, 2 * SIZE, C4 2830#endif 2831 2832#ifdef RT 2833 sll K, BASE_SHIFT + 1, TEMP1 2834 add AORIG, TEMP1, AORIG 2835#endif 2836 2837#if defined(LT) || defined(RN) 2838 sub K, KK, TEMP1 2839 sll 
TEMP1, BASE_SHIFT + 1, TEMP2 2840 sll TEMP1, BASE_SHIFT + 2, TEMP1 2841 add AO, TEMP2, AO 2842 add BO, TEMP1, BO 2843#endif 2844 2845#ifdef LT 2846 add KK, 2, KK 2847#endif 2848 2849#ifdef LN 2850 sub KK, 2, KK 2851#endif 2852 2853 add I, -1, I 2854 cmp I, 0 2855 bg,pt %icc, .LL32 2856 nop 2857 2858.LL49: 2859#ifdef LN 2860 sll K, BASE_SHIFT + 2, TEMP1 2861 add B, TEMP1, B 2862#endif 2863 2864#if defined(LT) || defined(RN) 2865 mov BO, B 2866#endif 2867 2868#ifdef RN 2869 add KK, 4, KK 2870#endif 2871 2872#ifdef RT 2873 sub KK, 4, KK 2874#endif 2875 .align 4 2876 2877.LL50: 2878 and N, 2, J 2879 cmp J, 0 2880 ble,pn %icc, .LL70 2881 nop 2882 2883#ifdef RT 2884 sll K, BASE_SHIFT + 1, TEMP1 2885 sub B, TEMP1, B 2886#endif 2887 2888#ifndef RT 2889 mov C, C1 2890 add C, LDC, C2 2891 add C2, LDC, C 2892#else 2893 sub C, LDC, C2 2894 sub C2, LDC, C1 2895 sub C2, LDC, C 2896#endif 2897 2898#ifdef LN 2899 add M, OFFSET, KK 2900#endif 2901 2902#ifdef LT 2903 mov OFFSET, KK 2904#endif 2905 2906#if defined(LN) || defined(RT) 2907 mov A, AORIG 2908#else 2909 mov A, AO 2910#endif 2911 2912 and M, 1, I 2913 cmp I, 0 2914 ble,pn %icc, .LL60 2915 nop 2916 2917#if defined(LT) || defined(RN) 2918 mov B, BO 2919#else 2920#ifdef LN 2921 sll K, BASE_SHIFT + 0, TEMP1 2922 sub AORIG, TEMP1, AORIG 2923#endif 2924 2925 sll KK, BASE_SHIFT + 0, TEMP1 2926 sll KK, BASE_SHIFT + 1, TEMP2 2927 2928 add AORIG, TEMP1, AO 2929 add B, TEMP2, BO 2930#endif 2931 2932 LDF [AO + 0 * SIZE], a1 2933 LDF [AO + 1 * SIZE], a2 2934 LDF [AO + 2 * SIZE], a3 2935 LDF [AO + 3 * SIZE], a4 2936 2937 LDF [BO + 0 * SIZE], b1 2938 LDF [BO + 1 * SIZE], b2 2939 LDF [BO + 2 * SIZE], b3 2940 LDF [BO + 3 * SIZE], b4 2941 LDF [BO + 4 * SIZE], b5 2942 LDF [BO + 5 * SIZE], b6 2943 LDF [BO + 6 * SIZE], b7 2944 FCLR (cc01) 2945 LDF [BO + 7 * SIZE], b8 2946 FCLR (cc03) 2947 2948#if defined(LT) || defined(RN) 2949 sra KK, 2, L 2950#else 2951 sub K, KK, L 2952 sra L, 2, L 2953#endif 2954 cmp L, 0 2955 ble,pn %icc, .LL65 2956 nop 
2957 .align 4 2958 2959.LL63: 2960 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2961 add L, -1, L 2962 2963 FMADD (aa1, bb1, cc01, cc01) 2964 LDF [BO + 8 * SIZE], b1 2965 FMADD (aa1, bb2, cc03, cc03) 2966 LDF [BO + 9 * SIZE], b2 2967 2968 LDF [AO + 4 * SIZE], a1 2969 cmp L, 0 2970 2971 FMADD (aa2, bb3, cc01, cc01) 2972 LDF [BO + 10 * SIZE], b3 2973 FMADD (aa2, bb4, cc03, cc03) 2974 LDF [BO + 11 * SIZE], b4 2975 2976 LDF [AO + 5 * SIZE], a2 2977 add AO, 4 * SIZE, AO 2978 2979 FMADD (aa3, bb5, cc01, cc01) 2980 LDF [BO + 12 * SIZE], b5 2981 FMADD (aa3, bb6, cc03, cc03) 2982 LDF [BO + 13 * SIZE], b6 2983 2984 LDF [AO + 2 * SIZE], a3 2985 add BO, 8 * SIZE, BO 2986 2987 FMADD (aa4, bb7, cc01, cc01) 2988 LDF [BO + 6 * SIZE], b7 2989 FMADD (aa4, bb8, cc03, cc03) 2990 LDF [BO + 7 * SIZE], b8 2991 2992 bg,pt %icc, .LL63 2993 LDF [AO + 3 * SIZE], a4 2994 .align 4 2995 2996.LL65: 2997#if defined(LT) || defined(RN) 2998 and KK, 3, L 2999#else 3000 sub K, KK, L 3001 and L, 3, L 3002#endif 3003 cmp L, 0 3004 ble,a,pn %icc, .LL68 3005 nop 3006 .align 4 3007 3008.LL67: 3009 FMADD (aa1, bb1, cc01, cc01) 3010 LDF [BO + 2 * SIZE], b1 3011 FMADD (aa1, bb2, cc03, cc03) 3012 LDF [BO + 3 * SIZE], b2 3013 3014 LDF [AO + 1 * SIZE], a1 3015 add L, -1, L 3016 add AO, 1 * SIZE, AO 3017 cmp L, 0 3018 3019 bg,pt %icc, .LL67 3020 add BO, 2 * SIZE, BO 3021 .align 4 3022 3023.LL68: 3024#if defined(LN) || defined(RT) 3025#ifdef LN 3026 sub KK, 1, TEMP1 3027#else 3028 sub KK, 2, TEMP1 3029#endif 3030 sll TEMP1, BASE_SHIFT + 0, TEMP2 3031 sll TEMP1, BASE_SHIFT + 1, TEMP1 3032 3033 add AORIG, TEMP2, AO 3034 add B, TEMP1, BO 3035#endif 3036 3037#if defined(LN) || defined(LT) 3038 LDF [BO + 0 * SIZE], a1 3039 LDF [BO + 1 * SIZE], a2 3040 3041 FSUB a1, c01, c01 3042 FSUB a2, c03, c03 3043#else 3044 LDF [AO + 0 * SIZE], a1 3045 LDF [AO + 1 * SIZE], a2 3046 3047 FSUB a1, c01, c01 3048 FSUB a2, c03, c03 3049#endif 3050 3051#if defined(LN) || defined(LT) 3052 LDF [AO + 0 * SIZE], a1 3053 
3054 FMUL a1, c01, c01 3055 FMUL a1, c03, c03 3056#endif 3057 3058#ifdef RN 3059 LDF [BO + 0 * SIZE], a1 3060 LDF [BO + 1 * SIZE], a2 3061 3062 FMUL a1, c01, c01 3063 3064 FNMSUB (aa2, cc01, cc03, cc03) 3065 3066 LDF [BO + 3 * SIZE], a1 3067 3068 FMUL a1, c03, c03 3069#endif 3070 3071#ifdef RT 3072 LDF [BO + 3 * SIZE], a1 3073 LDF [BO + 2 * SIZE], a2 3074 3075 FMUL a1, c03, c03 3076 3077 FNMSUB (aa2, cc03, cc01, cc01) 3078 3079 LDF [BO + 0 * SIZE], a1 3080 3081 FMUL a1, c01, c01 3082#endif 3083 3084#ifdef LN 3085 add C1, -1 * SIZE, C1 3086 add C2, -1 * SIZE, C2 3087#endif 3088 3089#if defined(LN) || defined(LT) 3090 STF c01, [BO + 0 * SIZE] 3091 STF c03, [BO + 1 * SIZE] 3092#else 3093 STF c01, [AO + 0 * SIZE] 3094 STF c03, [AO + 1 * SIZE] 3095#endif 3096 3097 STF c01, [C1 + 0 * SIZE] 3098 STF c03, [C2 + 0 * SIZE] 3099 3100#ifdef RT 3101 sll K, BASE_SHIFT + 0, TEMP1 3102 add AORIG, TEMP1, AORIG 3103#endif 3104 3105#if defined(LT) || defined(RN) 3106 sub K, KK, TEMP1 3107 sll TEMP1, BASE_SHIFT + 0, TEMP2 3108 sll TEMP1, BASE_SHIFT + 1, TEMP1 3109 add AO, TEMP2, AO 3110 add BO, TEMP1, BO 3111#endif 3112 3113#ifdef LT 3114 add KK, 1, KK 3115#endif 3116 3117#ifdef LN 3118 sub KK, 1, KK 3119#endif 3120 .align 4 3121 3122.LL60: 3123 sra M, 1, I 3124 cmp I, 0 3125 ble,pn %icc, .LL69 3126 nop 3127 .align 4 3128 3129.LL52: 3130#if defined(LT) || defined(RN) 3131 mov B, BO 3132#else 3133#ifdef LN 3134 sll K, BASE_SHIFT + 1, TEMP1 3135 sub AORIG, TEMP1, AORIG 3136#endif 3137 3138 sll KK, BASE_SHIFT + 1, TEMP1 3139 sll KK, BASE_SHIFT + 1, TEMP2 3140 3141 add AORIG, TEMP1, AO 3142 add B, TEMP2, BO 3143#endif 3144 3145 LDF [AO + 0 * SIZE], a1 3146 LDF [AO + 1 * SIZE], a2 3147 LDF [AO + 2 * SIZE], a3 3148 LDF [AO + 3 * SIZE], a4 3149 3150 LDF [BO + 0 * SIZE], b1 3151 LDF [BO + 1 * SIZE], b2 3152 LDF [BO + 2 * SIZE], b3 3153 FCLR (cc01) 3154 LDF [BO + 3 * SIZE], b4 3155 FCLR (cc02) 3156 3157 LDF [BO + 4 * SIZE], b5 3158 FCLR (cc03) 3159 LDF [BO + 5 * SIZE], b6 3160 FCLR (cc04) 3161 
LDF [BO + 6 * SIZE], b7 3162 FCLR (cc05) 3163 LDF [BO + 7 * SIZE], b8 3164 FCLR (cc06) 3165 3166 prefetch [C1 + 2 * SIZE], 3 3167 FCLR (cc07) 3168 prefetch [C2 + 2 * SIZE], 3 3169 FCLR (cc08) 3170 3171#if defined(LT) || defined(RN) 3172 sra KK, 2, L 3173#else 3174 sub K, KK, L 3175 sra L, 2, L 3176#endif 3177 cmp L, 0 3178 ble,pn %icc, .LL55 3179 nop 3180 .align 4 3181 3182.LL53: 3183 FMADD (aa1, bb1, cc01, cc01) 3184 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3185 FMADD (aa2, bb1, cc02, cc02) 3186 LDF [BO + 8 * SIZE], b1 3187 3188 FMADD (aa1, bb2, cc03, cc03) 3189 LDF [AO + 4 * SIZE], a1 3190 FMADD (aa2, bb2, cc04, cc04) 3191 LDF [AO + 5 * SIZE], a2 3192 3193 FMADD (aa3, bb3, cc01, cc01) 3194 LDF [BO + 9 * SIZE], b2 3195 FMADD (aa4, bb3, cc02, cc02) 3196 LDF [BO + 10 * SIZE], b3 3197 3198 FMADD (aa3, bb4, cc03, cc03) 3199 LDF [AO + 6 * SIZE], a3 3200 FMADD (aa4, bb4, cc04, cc04) 3201 LDF [AO + 7 * SIZE], a4 3202 3203 FMADD (aa1, bb5, cc01, cc01) 3204 LDF [BO + 11 * SIZE], b4 3205 FMADD (aa2, bb5, cc02, cc02) 3206 LDF [BO + 12 * SIZE], b5 3207 3208 FMADD (aa1, bb6, cc03, cc03) 3209 LDF [AO + 8 * SIZE], a1 3210 FMADD (aa2, bb6, cc04, cc04) 3211 LDF [AO + 9 * SIZE], a2 3212 3213 FMADD (aa3, bb7, cc01, cc01) 3214 LDF [BO + 13 * SIZE], b6 3215 3216 FMADD (aa4, bb7, cc02, cc02) 3217 LDF [BO + 14 * SIZE], b7 3218 3219 FMADD (aa3, bb8, cc03, cc03) 3220 LDF [AO + 10 * SIZE], a3 3221 FMADD (aa4, bb8, cc04, cc04) 3222 LDF [AO + 11 * SIZE], a4 3223 3224 add AO, 8 * SIZE, AO 3225 add L, -1, L 3226 add BO, 8 * SIZE, BO 3227 cmp L, 0 3228 3229 bg,pt %icc, .LL53 3230 LDF [BO + 7 * SIZE], b8 3231 .align 4 3232 3233.LL55: 3234#if defined(LT) || defined(RN) 3235 and KK, 3, L 3236#else 3237 sub K, KK, L 3238 and L, 3, L 3239#endif 3240 cmp L, 0 3241 ble,a,pn %icc, .LL58 3242 nop 3243 .align 4 3244 3245.LL57: 3246 FMADD (aa1, bb1, cc01, cc01) 3247 add L, -1, L 3248 FMADD (aa2, bb1, cc02, cc02) 3249 LDF [BO + 2 * SIZE], b1 3250 3251 FMADD (aa1, bb2, cc03, cc03) 3252 
LDF [AO + 2 * SIZE], a1 3253 FMADD (aa2, bb2, cc04, cc04) 3254 LDF [AO + 3 * SIZE], a2 3255 3256 add AO, 2 * SIZE, AO 3257 cmp L, 0 3258 add BO, 2 * SIZE, BO 3259 bg,pt %icc, .LL57 3260 LDF [BO + 1 * SIZE], b2 3261 .align 4 3262 3263.LL58: 3264#if defined(LN) || defined(RT) 3265#ifdef LN 3266 sub KK, 2, TEMP1 3267#else 3268 sub KK, 2, TEMP1 3269#endif 3270 sll TEMP1, BASE_SHIFT + 1, TEMP2 3271 sll TEMP1, BASE_SHIFT + 1, TEMP1 3272 3273 add AORIG, TEMP2, AO 3274 add B, TEMP1, BO 3275#endif 3276 3277#if defined(LN) || defined(LT) 3278 LDF [BO + 0 * SIZE], a1 3279 LDF [BO + 1 * SIZE], a2 3280 LDF [BO + 2 * SIZE], a3 3281 LDF [BO + 3 * SIZE], a4 3282 3283 FSUB a1, c01, c01 3284 FSUB a2, c03, c03 3285 FSUB a3, c02, c02 3286 FSUB a4, c04, c04 3287#else 3288 LDF [AO + 0 * SIZE], a1 3289 LDF [AO + 1 * SIZE], a2 3290 LDF [AO + 2 * SIZE], a3 3291 LDF [AO + 3 * SIZE], a4 3292 3293 FSUB a1, c01, c01 3294 FSUB a2, c02, c02 3295 FSUB a3, c03, c03 3296 FSUB a4, c04, c04 3297#endif 3298 3299#ifdef LN 3300 LDF [AO + 3 * SIZE], a1 3301 LDF [AO + 2 * SIZE], a2 3302 LDF [AO + 0 * SIZE], a3 3303 3304 FMUL a1, c02, c02 3305 FMUL a1, c04, c04 3306 3307 FNMSUB (aa2, cc02, cc01, cc01) 3308 FNMSUB (aa2, cc04, cc03, cc03) 3309 3310 FMUL a3, c01, c01 3311 FMUL a3, c03, c03 3312#endif 3313 3314#ifdef LT 3315 LDF [AO + 0 * SIZE], a1 3316 LDF [AO + 1 * SIZE], a2 3317 LDF [AO + 3 * SIZE], a3 3318 3319 FMUL a1, c01, c01 3320 FMUL a1, c03, c03 3321 3322 FNMSUB (aa2, cc01, cc02, cc02) 3323 FNMSUB (aa2, cc03, cc04, cc04) 3324 3325 FMUL a3, c02, c02 3326 FMUL a3, c04, c04 3327#endif 3328 3329#ifdef RN 3330 LDF [BO + 0 * SIZE], a1 3331 LDF [BO + 1 * SIZE], a2 3332 3333 FMUL a1, c01, c01 3334 FMUL a1, c02, c02 3335 3336 FNMSUB (aa2, cc01, cc03, cc03) 3337 FNMSUB (aa2, cc02, cc04, cc04) 3338 3339 LDF [BO + 3 * SIZE], a1 3340 3341 FMUL a1, c03, c03 3342 FMUL a1, c04, c04 3343#endif 3344 3345#ifdef RT 3346 LDF [BO + 3 * SIZE], a1 3347 LDF [BO + 2 * SIZE], a2 3348 3349 FMUL a1, c04, c04 3350 FMUL a1, c03, 
c03 3351 3352 FNMSUB (aa2, cc04, cc02, cc02) 3353 FNMSUB (aa2, cc03, cc01, cc01) 3354 3355 LDF [BO + 0 * SIZE], a1 3356 3357 FMUL a1, c02, c02 3358 FMUL a1, c01, c01 3359#endif 3360 3361#ifdef LN 3362 add C1, -2 * SIZE, C1 3363 add C2, -2 * SIZE, C2 3364#endif 3365 3366#if defined(LN) || defined(LT) 3367 STF c01, [BO + 0 * SIZE] 3368 STF c03, [BO + 1 * SIZE] 3369 STF c02, [BO + 2 * SIZE] 3370 STF c04, [BO + 3 * SIZE] 3371#else 3372 STF c01, [AO + 0 * SIZE] 3373 STF c02, [AO + 1 * SIZE] 3374 STF c03, [AO + 2 * SIZE] 3375 STF c04, [AO + 3 * SIZE] 3376#endif 3377 3378 STF c01, [C1 + 0 * SIZE] 3379 STF c02, [C1 + 1 * SIZE] 3380 STF c03, [C2 + 0 * SIZE] 3381 STF c04, [C2 + 1 * SIZE] 3382 3383#ifndef LN 3384 add C1, 2 * SIZE, C1 3385 add C2, 2 * SIZE, C2 3386#endif 3387 3388#ifdef RT 3389 sll K, BASE_SHIFT + 1, TEMP1 3390 add AORIG, TEMP1, AORIG 3391#endif 3392 3393#if defined(LT) || defined(RN) 3394 sub K, KK, TEMP1 3395 sll TEMP1, BASE_SHIFT + 1, TEMP2 3396 sll TEMP1, BASE_SHIFT + 1, TEMP1 3397 add AO, TEMP2, AO 3398 add BO, TEMP1, BO 3399#endif 3400 3401#ifdef LT 3402 add KK, 2, KK 3403#endif 3404 3405#ifdef LN 3406 sub KK, 2, KK 3407#endif 3408 3409 add I, -1, I 3410 cmp I, 0 3411 bg,pt %icc, .LL52 3412 nop 3413 .align 4 3414 3415.LL69: 3416#ifdef LN 3417 sll K, BASE_SHIFT + 1, TEMP1 3418 add B, TEMP1, B 3419#endif 3420 3421#if defined(LT) || defined(RN) 3422 mov BO, B 3423#endif 3424 3425#ifdef RN 3426 add KK, 2, KK 3427#endif 3428 3429#ifdef RT 3430 sub KK, 2, KK 3431#endif 3432 .align 4 3433 3434.LL70: 3435 and N, 1, J 3436 cmp J, 0 3437 ble,pn %icc, .LL999 3438 nop 3439 3440#ifdef RT 3441 sll K, BASE_SHIFT, TEMP1 3442 sub B, TEMP1, B 3443#endif 3444 3445#ifndef RT 3446 mov C, C1 3447 add C1, LDC, C 3448#else 3449 sub C, LDC, C1 3450 sub C, LDC, C 3451#endif 3452 3453#ifdef LN 3454 add M, OFFSET, KK 3455#endif 3456 3457#ifdef LT 3458 mov OFFSET, KK 3459#endif 3460 3461#if defined(LN) || defined(RT) 3462 mov A, AORIG 3463#else 3464 mov A, AO 3465#endif 3466 3467 
and M, 1, I 3468 cmp I, 0 3469 ble,pn %icc, .LL80 3470 nop 3471 3472#if defined(LT) || defined(RN) 3473 mov B, BO 3474#else 3475#ifdef LN 3476 sll K, BASE_SHIFT + 0, TEMP1 3477 sub AORIG, TEMP1, AORIG 3478#endif 3479 3480 sll KK, BASE_SHIFT + 0, TEMP1 3481 sll KK, BASE_SHIFT + 0, TEMP2 3482 3483 add AORIG, TEMP1, AO 3484 add B, TEMP2, BO 3485#endif 3486 3487 LDF [AO + 0 * SIZE], a1 3488 LDF [BO + 0 * SIZE], b1 3489 LDF [AO + 1 * SIZE], a2 3490 LDF [BO + 1 * SIZE], b2 3491 LDF [AO + 2 * SIZE], a3 3492 LDF [BO + 2 * SIZE], b3 3493 LDF [AO + 3 * SIZE], a4 3494 LDF [BO + 3 * SIZE], b4 3495 3496#if defined(LT) || defined(RN) 3497 sra KK, 2, L 3498#else 3499 sub K, KK, L 3500 sra L, 2, L 3501#endif 3502 cmp L, 0 3503 ble,pn %icc, .LL85 3504 FCLR (cc01) 3505 .align 4 3506 3507.LL83: 3508 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3509 add L, -1, L 3510 3511 FMADD (aa1, bb1, cc01, cc01) 3512 LDF [AO + 4 * SIZE], a1 3513 LDF [BO + 4 * SIZE], b1 3514 3515 FMADD (aa2, bb2, cc01, cc01) 3516 LDF [AO + 5 * SIZE], a2 3517 LDF [BO + 5 * SIZE], b2 3518 3519 FMADD (aa3, bb3, cc01, cc01) 3520 LDF [AO + 6 * SIZE], a3 3521 LDF [BO + 6 * SIZE], b3 3522 3523 FMADD (aa4, bb4, cc01, cc01) 3524 LDF [AO + 7 * SIZE], a4 3525 LDF [BO + 7 * SIZE], b4 3526 3527 add AO, 4 * SIZE, AO 3528 cmp L, 0 3529 3530 bg,pt %icc, .LL83 3531 add BO, 4 * SIZE, BO 3532 .align 4 3533 3534.LL85: 3535#if defined(LT) || defined(RN) 3536 and KK, 3, L 3537#else 3538 sub K, KK, L 3539 and L, 3, L 3540#endif 3541 cmp L, 0 3542 ble,a,pn %icc, .LL88 3543 nop 3544 .align 4 3545 3546.LL87: 3547 FMADD (aa1, bb1, cc01, cc01) 3548 LDF [AO + 1 * SIZE], a1 3549 LDF [BO + 1 * SIZE], b1 3550 3551 add AO, 1 * SIZE, AO 3552 add L, -1, L 3553 cmp L, 0 3554 bg,pt %icc, .LL87 3555 add BO, 1 * SIZE, BO 3556 .align 4 3557 3558.LL88: 3559#if defined(LN) || defined(RT) 3560#ifdef LN 3561 sub KK, 1, TEMP1 3562#else 3563 sub KK, 1, TEMP1 3564#endif 3565 sll TEMP1, BASE_SHIFT + 0, TEMP2 3566 sll TEMP1, BASE_SHIFT + 0, 
TEMP1 3567 3568 add AORIG, TEMP2, AO 3569 add B, TEMP1, BO 3570#endif 3571 3572#if defined(LN) || defined(LT) 3573 LDF [BO + 0 * SIZE], a1 3574 3575 FSUB a1, c01, c01 3576#else 3577 LDF [AO + 0 * SIZE], a1 3578 3579 FSUB a1, c01, c01 3580#endif 3581 3582#if defined(LN) || defined(LT) 3583 LDF [AO + 0 * SIZE], a1 3584 3585 FMUL a1, c01, c01 3586#endif 3587 3588#if defined(RN) || defined(RT) 3589 LDF [BO + 0 * SIZE], a1 3590 3591 FMUL a1, c01, c01 3592#endif 3593 3594#ifdef LN 3595 add C1, -1 * SIZE, C1 3596#endif 3597 3598#if defined(LN) || defined(LT) 3599 STF c01, [BO + 0 * SIZE] 3600#else 3601 STF c01, [AO + 0 * SIZE] 3602#endif 3603 3604 STF c01, [C1 + 0 * SIZE] 3605 3606#ifdef RT 3607 sll K, BASE_SHIFT + 0, TEMP1 3608 add AORIG, TEMP1, AORIG 3609#endif 3610 3611#if defined(LT) || defined(RN) 3612 sub K, KK, TEMP1 3613 sll TEMP1, BASE_SHIFT + 0, TEMP2 3614 sll TEMP1, BASE_SHIFT + 0, TEMP1 3615 add AO, TEMP2, AO 3616 add BO, TEMP1, BO 3617#endif 3618 3619#ifdef LT 3620 add KK, 1, KK 3621#endif 3622 3623#ifdef LN 3624 sub KK, 1, KK 3625#endif 3626 .align 4 3627 3628.LL80: 3629 sra M, 1, I 3630 cmp I, 0 3631 ble,pn %icc, .LL89 3632 nop 3633 .align 4 3634 3635.LL72: 3636#if defined(LT) || defined(RN) 3637 mov B, BO 3638#else 3639#ifdef LN 3640 sll K, BASE_SHIFT + 1, TEMP1 3641 sub AORIG, TEMP1, AORIG 3642#endif 3643 3644 sll KK, BASE_SHIFT + 1, TEMP1 3645 sll KK, BASE_SHIFT + 0, TEMP2 3646 3647 add AORIG, TEMP1, AO 3648 add B, TEMP2, BO 3649#endif 3650 3651 LDF [AO + 0 * SIZE], a1 3652 LDF [AO + 1 * SIZE], a2 3653 LDF [AO + 2 * SIZE], a3 3654 LDF [AO + 3 * SIZE], a4 3655 3656 LDF [BO + 0 * SIZE], b1 3657 LDF [BO + 1 * SIZE], b2 3658 LDF [BO + 2 * SIZE], b3 3659 FCLR (cc01) 3660 LDF [BO + 3 * SIZE], b4 3661 FCLR (cc02) 3662 3663 prefetch [C1 + 2 * SIZE], 3 3664 3665#if defined(LT) || defined(RN) 3666 sra KK, 2, L 3667#else 3668 sub K, KK, L 3669 sra L, 2, L 3670#endif 3671 cmp L, 0 3672 ble,pn %icc, .LL75 3673 nop 3674 3675.LL73: 3676 prefetch [AO + (APREFETCHSIZE + 
0) * SIZE], APREFETCH_CATEGORY 3677 add L, -1, L 3678 3679 FMADD (aa1, bb1, cc01, cc01) 3680 LDF [AO + 4 * SIZE], a1 3681 FMADD (aa2, bb1, cc02, cc02) 3682 LDF [AO + 5 * SIZE], a2 3683 3684 LDF [BO + 4 * SIZE], b1 3685 cmp L, 0 3686 3687 FMADD (aa3, bb2, cc01, cc01) 3688 LDF [AO + 6 * SIZE], a3 3689 FMADD (aa4, bb2, cc02, cc02) 3690 LDF [AO + 7 * SIZE], a4 3691 3692 LDF [BO + 5 * SIZE], b2 3693 add BO, 4 * SIZE, BO 3694 3695 FMADD (aa1, bb3, cc01, cc01) 3696 LDF [AO + 8 * SIZE], a1 3697 FMADD (aa2, bb3, cc02, cc02) 3698 LDF [AO + 9 * SIZE], a2 3699 3700 LDF [BO + 2 * SIZE], b3 3701 add AO, 8 * SIZE, AO 3702 3703 FMADD (aa3, bb4, cc01, cc01) 3704 LDF [AO + 2 * SIZE], a3 3705 FMADD (aa4, bb4, cc02, cc02) 3706 LDF [AO + 3 * SIZE], a4 3707 3708 bg,pt %icc, .LL73 3709 LDF [BO + 3 * SIZE], b4 3710 .align 4 3711 3712.LL75: 3713#if defined(LT) || defined(RN) 3714 and KK, 3, L 3715#else 3716 sub K, KK, L 3717 and L, 3, L 3718#endif 3719 cmp L, 0 3720 ble,a,pn %icc, .LL78 3721 nop 3722 .align 4 3723 3724.LL77: 3725 FMADD (aa1, bb1, cc01, cc01) 3726 LDF [AO + 2 * SIZE], a1 3727 FMADD (aa2, bb1, cc02, cc02) 3728 LDF [AO + 3 * SIZE], a2 3729 3730 LDF [BO + 1 * SIZE], b1 3731 add L, -1, L 3732 add AO, 2 * SIZE, AO 3733 cmp L, 0 3734 bg,pt %icc, .LL77 3735 add BO, 1 * SIZE, BO 3736 .align 4 3737 3738.LL78: 3739#if defined(LN) || defined(RT) 3740#ifdef LN 3741 sub KK, 2, TEMP1 3742#else 3743 sub KK, 1, TEMP1 3744#endif 3745 sll TEMP1, BASE_SHIFT + 1, TEMP2 3746 sll TEMP1, BASE_SHIFT + 0, TEMP1 3747 3748 add AORIG, TEMP2, AO 3749 add B, TEMP1, BO 3750#endif 3751 3752#if defined(LN) || defined(LT) 3753 LDF [BO + 0 * SIZE], a1 3754 LDF [BO + 1 * SIZE], a2 3755 3756 FSUB a1, c01, c01 3757 FSUB a2, c02, c02 3758#else 3759 LDF [AO + 0 * SIZE], a1 3760 LDF [AO + 1 * SIZE], a2 3761 3762 FSUB a1, c01, c01 3763 FSUB a2, c02, c02 3764#endif 3765 3766#ifdef LN 3767 LDF [AO + 3 * SIZE], a1 3768 LDF [AO + 2 * SIZE], a2 3769 LDF [AO + 0 * SIZE], a3 3770 3771 FMUL a1, c02, c02 3772 3773 FNMSUB 
(aa2, cc02, cc01, cc01) 3774 3775 FMUL a3, c01, c01 3776#endif 3777 3778#ifdef LT 3779 LDF [AO + 0 * SIZE], a1 3780 LDF [AO + 1 * SIZE], a2 3781 LDF [AO + 3 * SIZE], a3 3782 3783 FMUL a1, c01, c01 3784 3785 FNMSUB (aa2, cc01, cc02, cc02) 3786 3787 FMUL a3, c02, c02 3788#endif 3789 3790#if defined(RN) || defined(RT) 3791 LDF [BO + 0 * SIZE], a1 3792 3793 FMUL a1, c01, c01 3794 FMUL a1, c02, c02 3795#endif 3796 3797#ifdef LN 3798 add C1, -2 * SIZE, C1 3799#endif 3800 3801#if defined(LN) || defined(LT) 3802 STF c01, [BO + 0 * SIZE] 3803 STF c02, [BO + 1 * SIZE] 3804#else 3805 STF c01, [AO + 0 * SIZE] 3806 STF c02, [AO + 1 * SIZE] 3807#endif 3808 3809 STF c01, [C1 + 0 * SIZE] 3810 STF c02, [C1 + 1 * SIZE] 3811 3812#ifndef LN 3813 add C1, 2 * SIZE, C1 3814#endif 3815 3816#ifdef RT 3817 sll K, BASE_SHIFT + 1, TEMP1 3818 add AORIG, TEMP1, AORIG 3819#endif 3820 3821#if defined(LT) || defined(RN) 3822 sub K, KK, TEMP1 3823 sll TEMP1, BASE_SHIFT + 1, TEMP2 3824 sll TEMP1, BASE_SHIFT + 0, TEMP1 3825 add AO, TEMP2, AO 3826 add BO, TEMP1, BO 3827#endif 3828 3829#ifdef LT 3830 add KK, 2, KK 3831#endif 3832 3833#ifdef LN 3834 sub KK, 2, KK 3835#endif 3836 3837 add I, -1, I 3838 cmp I, 0 3839 bg,pt %icc, .LL72 3840 nop 3841 .align 4 3842 3843.LL89: 3844#ifdef LN 3845 sll K, BASE_SHIFT, TEMP1 3846 add B, TEMP1, B 3847#endif 3848 3849#if defined(LT) || defined(RN) 3850 mov BO, B 3851#endif 3852 3853#ifdef RN 3854 add KK, 1, KK 3855#endif 3856 3857#ifdef RT 3858 sub KK, 1, KK 3859#endif 3860 .align 4 3861 3862.LL999: 3863#ifdef TRMMKERNEL 3864#ifndef __64BIT__ 3865 ld [%sp + STACK_START + 8], %g1 3866 ld [%sp + STACK_START + 12], %g2 3867 ld [%sp + STACK_START + 16], %g3 3868 ld [%sp + STACK_START + 20], %g4 3869#else 3870 ldx [%sp + STACK_START + 32], %g1 3871 ldx [%sp + STACK_START + 40], %g2 3872 ldx [%sp + STACK_START + 48], %g3 3873 ldx [%sp + STACK_START + 56], %g4 3874#endif 3875#endif 3876 3877 return %i7 + 8 3878 clr %o0 3879 3880 EPILOGUE 3881