1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2005. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/
/*********************************************************************/

/*
 * Register map and entry sequence of the kernel.
 *
 * Everything up to PROLOGUE is preprocessor-only: it gives symbolic
 * names to the integer and floating-point registers used by the code
 * that follows.  The entry sequence then fetches the arguments that
 * were passed on the stack, spills %g1-%g4 and scales LDC.
 */

#define ASSEMBLER
#include "common.h"

/* Look-ahead (in elements; multiplied by SIZE at the use site) and
   prefetch function code used for the AO prefetches in the unrolled
   inner loop. */
#define APREFETCHSIZE		24
#define APREFETCH_CATEGORY	0

/* Dimension arguments.  The visible loops walk M two rows at a time
   (sra M, 1) and N eight columns at a time (sra N, 3). */
#define M	%i0
#define N	%i1
#define K	%i2

/* Matrix pointers A and B.  With DOUBLE on a non-64-bit build the two
   aliases trade registers and B is loaded from the stack in the entry
   code below.
   NOTE(review): presumably a double-precision scalar argument occupies
   two 32-bit argument slots and pushes B out of the register window -
   confirm against the C prototype of this kernel. */
#if defined(DOUBLE) && !defined(__64BIT__)
#define A	%i5
#define B	%i4
#else
#define A	%i4
#define B	%i5
#endif

/* C pointer and its leading dimension; both are always loaded from the
   stack in the entry code below. */
#define C	%o4
#define LDC	%o5

/* Running pointers into A and B, and the loop counters
   (J: column panels, I: row pairs, L: inner-loop trip count). */
#define AO	%l0
#define BO	%l1
#define I	%l2
#define J	%l3
#define L	%l4

/* One pointer per column of the current eight-column panel of C. */
#define C1	%o0
#define C2	%o1
#define C3	%o2
#define C4	%o3

#define C5	%l5
#define C6	%l6
#define C7	%l7
#define C8	%i3

/* OFFSET is a stack argument; KK is derived from it.  TEMP1/TEMP2 are
   address-arithmetic scratch, AORIG keeps the A base for LN/RT.
   Note that these live in %g1-%g4 and %o7. */
#define OFFSET	%g1
#define KK	%g2
#define TEMP1	%g3
#define TEMP2	%g4
#define AORIG	%o7

/*
 * Floating-point register assignment.
 *
 *   c01..c16  accumulators (cleared with FCLR before the inner loop)
 *   a1..a5    operands loaded through AO
 *   b1..b9    operands loaded through BO
 *
 * The cc / aa / bb constants are the numeric register codes handed to
 * the FMADD, FNMSUB and FCLR macros, which take numbers rather than
 * %f names.  Each code must stay paired with the %f alias of the same
 * index (c01 <-> cc01, a1 <-> aa1, b1 <-> bb1, ...).
 */
#ifdef DOUBLE

/* Double precision: even-numbered registers %f0..%f58. */
#define c01	%f0
#define c02	%f2
#define c03	%f4
#define c04	%f6
#define c05	%f8
#define c06	%f10
#define c07	%f12
#define c08	%f14
#define c09	%f16
#define c10	%f18
#define c11	%f20
#define c12	%f22
#define c13	%f24
#define c14	%f26
#define c15	%f28
#define c16	%f30

#define a1	%f32
#define a2	%f34
#define a3	%f36
#define a4	%f38
#define a5	%f40

#define b1	%f42
#define b2	%f44
#define b3	%f46
#define b4	%f48
#define b5	%f50
#define b6	%f52
#define b7	%f54
#define b8	%f56
#define b9	%f58

/* Codes for %f0..%f30 equal the register number. */
#define cc01	0
#define cc02	2
#define cc03	4
#define cc04	6
#define cc05	8
#define cc06	10
#define cc07	12
#define cc08	14
#define cc09	16
#define cc10	18
#define cc11	20
#define cc12	22
#define cc13	24
#define cc14	26
#define cc15	28
#define cc16	30

/* Codes for %f32 and above are odd numbers (%f32 -> 1, %f34 -> 3, ...).
   NOTE(review): this looks like the 5-bit SPARC V9 encoding of the
   upper double registers - confirm against the FMADD macro. */
#define aa1	1
#define aa2	3
#define aa3	5
#define aa4	7
#define aa5	9

#define bb1	11
#define bb2	13
#define bb3	15
#define bb4	17
#define bb5	19
#define bb6	21
#define bb7	23
#define bb8	25
#define bb9	27

#else

/* Single precision: consecutive registers %f0..%f29. */
#define c01	%f0
#define c02	%f1
#define c03	%f2
#define c04	%f3
#define c05	%f4
#define c06	%f5
#define c07	%f6
#define c08	%f7
#define c09	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define a1	%f16
#define a2	%f17
#define a3	%f18
#define a4	%f19
#define a5	%f20

#define b1	%f21
#define b2	%f22
#define b3	%f23
#define b4	%f24
#define b5	%f25
#define b6	%f26
#define b7	%f27
#define b8	%f28
#define b9	%f29

/* Codes equal the register number throughout. */
#define cc01	0
#define cc02	1
#define cc03	2
#define cc04	3
#define cc05	4
#define cc06	5
#define cc07	6
#define cc08	7
#define cc09	8
#define cc10	9
#define cc11	10
#define cc12	11
#define cc13	12
#define cc14	13
#define cc15	14
#define cc16	15

#define aa1	16
#define aa2	17
#define aa3	18
#define aa4	19
#define aa5	20

#define bb1	21
#define bb2	22
#define bb3	23
#define bb4	24
#define bb5	25
#define bb6	26
#define bb7	27
#define bb8	28
#define bb9	29

#endif

	/* %g2 (KK) and %g3 (TEMP1) are used below; declare them to the
	   assembler as scratch registers. */
	.register	%g2, #scratch
	.register	%g3, #scratch

	/* PROLOGUE, SAVESP and STACK_START are not defined in this file
	   (presumably they come from common.h). */
	PROLOGUE
	SAVESP
	nop

#ifndef __64BIT__

	/* 32-bit build: fetch the arguments that were passed on the
	   stack.  DOUBLE has one more stack argument (B). */
#ifdef DOUBLE
	ld	[%sp + STACK_START + 28], B
	ld	[%sp + STACK_START + 32], C
	ld	[%sp + STACK_START + 36], LDC
	ld	[%sp + STACK_START + 40], OFFSET
#else
	ld	[%sp + STACK_START + 28], C
	ld	[%sp + STACK_START + 32], LDC
	ld	[%sp + STACK_START + 36], OFFSET
#endif
	/* Spill %g1-%g4 to the frame.
	   NOTE(review): OFFSET is %g1 and has just been loaded, so the
	   first slot receives OFFSET rather than the value %g1 had on
	   entry.  The code that reloads these slots is outside this
	   view - confirm that nothing relies on the entry value. */
	st	%g1, [%sp + STACK_START +  8]
	st	%g2, [%sp + STACK_START + 12]
	st	%g3, [%sp + STACK_START + 16]
	st	%g4, [%sp + STACK_START + 20]
#else

	/* 64-bit build: same idea with 8-byte slots. */
	ldx	[%sp + STACK_START + 56], C
	ldx	[%sp + STACK_START + 64], LDC
	ldx	[%sp + STACK_START + 72], OFFSET

	/* Keep these stores after the loads: the %g4 slot
	   (STACK_START + 56) is the slot C was just read from. */
	stx	%g1, [%sp + STACK_START + 32]
	stx	%g2, [%sp + STACK_START + 40]
	stx	%g3, [%sp + STACK_START + 48]
	stx	%g4, [%sp + STACK_START + 56]
#endif

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	OFFSET, KK
#endif

	/* Scale LDC by 2^BASE_SHIFT; from here on LDC is added directly
	   to the C column pointers. */
	sll	LDC, BASE_SHIFT, LDC
241 242#ifdef LN 243 smul M, K, TEMP1 244 sll TEMP1, BASE_SHIFT, TEMP1 245 add A, TEMP1, A 246 247 sll M, BASE_SHIFT, TEMP1 248 add C, TEMP1, C 249#endif 250 251#ifdef RN 252 neg OFFSET, KK 253#endif 254 255#ifdef RT 256 smul N, K, TEMP1 257 sll TEMP1, BASE_SHIFT, TEMP1 258 add B, TEMP1, B 259 260 smul N, LDC, TEMP1 261 add C, TEMP1, C 262 263 sub N, OFFSET, KK 264#endif 265 266 sra N, 3, J 267 cmp J, 0 268 ble,pn %icc, .LL30 269 nop 270 .align 4 271 272.LL11: 273#ifdef RT 274 sll K, BASE_SHIFT + 3, TEMP1 275 sub B, TEMP1, B 276#endif 277 278#ifndef RT 279 mov C, C1 280 add C, LDC, C2 281 add C2, LDC, C3 282 add C3, LDC, C4 283 add C4, LDC, C5 284 add C5, LDC, C6 285 add C6, LDC, C7 286 add C7, LDC, C8 287 add C8, LDC, C 288#else 289 sub C, LDC, C8 290 sub C8, LDC, C7 291 sub C7, LDC, C6 292 sub C6, LDC, C5 293 sub C5, LDC, C4 294 sub C4, LDC, C3 295 sub C3, LDC, C2 296 sub C2, LDC, C1 297 sub C2, LDC, C 298#endif 299 300#ifdef LN 301 add M, OFFSET, KK 302#endif 303 304#ifdef LT 305 mov OFFSET, KK 306#endif 307 308#if defined(LN) || defined(RT) 309 mov A, AORIG 310#else 311 mov A, AO 312#endif 313 314 sra M, 1, I 315 cmp I, 0 316 ble,pn %icc, .LL20 317 nop 318 .align 4 319 320.LL12: 321#if defined(LT) || defined(RN) 322 mov B, BO 323#else 324#ifdef LN 325 sll K, BASE_SHIFT + 1, TEMP1 326 sub AORIG, TEMP1, AORIG 327#endif 328 329 sll KK, BASE_SHIFT + 1, TEMP1 330 sll KK, BASE_SHIFT + 3, TEMP2 331 332 add AORIG, TEMP1, AO 333 add B, TEMP2, BO 334#endif 335 336 LDF [AO + 0 * SIZE], a1 337 LDF [AO + 1 * SIZE], a2 338 LDF [AO + 8 * SIZE], a5 339 340 LDF [BO + 0 * SIZE], b1 341 342 LDF [BO + 1 * SIZE], b2 343 FCLR (cc01) 344 LDF [BO + 2 * SIZE], b3 345 FCLR (cc05) 346 LDF [BO + 3 * SIZE], b4 347 FCLR (cc09) 348 LDF [BO + 4 * SIZE], b5 349 FCLR (cc13) 350 351 LDF [BO + 5 * SIZE], b6 352 FCLR (cc02) 353 LDF [BO + 6 * SIZE], b7 354 FCLR (cc06) 355 LDF [BO + 7 * SIZE], b8 356 FCLR (cc10) 357 LDF [BO + 8 * SIZE], b9 358 FCLR (cc14) 359 360 prefetch [C1 + 1 * SIZE], 3 361 FCLR 
(cc03) 362 prefetch [C2 + 2 * SIZE], 3 363 FCLR (cc07) 364 prefetch [C3 + 1 * SIZE], 3 365 FCLR (cc11) 366 prefetch [C4 + 2 * SIZE], 3 367 FCLR (cc15) 368 369 prefetch [C5 + 1 * SIZE], 3 370 FCLR (cc04) 371 prefetch [C6 + 2 * SIZE], 3 372 FCLR (cc08) 373 prefetch [C7 + 1 * SIZE], 3 374 FCLR (cc12) 375 prefetch [C8 + 2 * SIZE], 3 376 FCLR (cc16) 377 378#if defined(LT) || defined(RN) 379 sra KK, 3, L 380#else 381 sub K, KK, L 382 sra L, 3, L 383#endif 384 cmp L, 0 385 ble,pn %icc, .LL15 386 nop 387 .align 4 388 389.LL13: 390 FMADD (aa1, bb1, cc01, cc01) 391 FMADD (aa2, bb1, cc02, cc02) 392 FMADD (aa1, bb2, cc03, cc03) 393 FMADD (aa2, bb2, cc04, cc04) 394 395 FMADD (aa1, bb3, cc05, cc05) 396 LDF [BO + 16 * SIZE], b1 397 FMADD (aa2, bb3, cc06, cc06) 398 LDF [BO + 9 * SIZE], b2 399 400 FMADD (aa1, bb4, cc07, cc07) 401 LDF [BO + 10 * SIZE], b3 402 FMADD (aa2, bb4, cc08, cc08) 403 LDF [BO + 11 * SIZE], b4 404 405 FMADD (aa1, bb5, cc09, cc09) 406 LDF [AO + 2 * SIZE], a3 407 FMADD (aa2, bb5, cc10, cc10) 408 LDF [AO + 3 * SIZE], a4 409 410 FMADD (aa1, bb6, cc11, cc11) 411 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 412 FMADD (aa2, bb6, cc12, cc12) 413 nop 414 415 FMADD (aa1, bb7, cc13, cc13) 416 LDF [BO + 12 * SIZE], b5 417 FMADD (aa2, bb7, cc14, cc14) 418 LDF [BO + 13 * SIZE], b6 419 420 FMADD (aa1, bb8, cc15, cc15) 421 LDF [BO + 14 * SIZE], b7 422 FMADD (aa2, bb8, cc16, cc16) 423 LDF [BO + 15 * SIZE], b8 424 425 FMADD (aa3, bb9, cc01, cc01) 426 FMADD (aa4, bb9, cc02, cc02) 427 FMADD (aa3, bb2, cc03, cc03) 428 FMADD (aa4, bb2, cc04, cc04) 429 430 FMADD (aa3, bb3, cc05, cc05) 431 LDF [BO + 24 * SIZE], b9 432 FMADD (aa4, bb3, cc06, cc06) 433 LDF [BO + 17 * SIZE], b2 434 435 FMADD (aa3, bb4, cc07, cc07) 436 LDF [BO + 18 * SIZE], b3 437 FMADD (aa4, bb4, cc08, cc08) 438 LDF [BO + 19 * SIZE], b4 439 440 FMADD (aa3, bb5, cc09, cc09) 441 LDF [AO + 4 * SIZE], a1 442 FMADD (aa4, bb5, cc10, cc10) 443 LDF [AO + 5 * SIZE], a2 444 445 FMADD (aa3, bb6, cc11, cc11) 446 
add L, -1, L 447 FMADD (aa4, bb6, cc12, cc12) 448 nop 449 450 FMADD (aa3, bb7, cc13, cc13) 451 LDF [BO + 20 * SIZE], b5 452 FMADD (aa4, bb7, cc14, cc14) 453 LDF [BO + 21 * SIZE], b6 454 455 FMADD (aa3, bb8, cc15, cc15) 456 LDF [BO + 22 * SIZE], b7 457 FMADD (aa4, bb8, cc16, cc16) 458 LDF [BO + 23 * SIZE], b8 459 460 FMADD (aa1, bb1, cc01, cc01) 461 FMADD (aa2, bb1, cc02, cc02) 462 FMADD (aa1, bb2, cc03, cc03) 463 FMADD (aa2, bb2, cc04, cc04) 464 465 FMADD (aa1, bb3, cc05, cc05) 466 LDF [BO + 32 * SIZE], b1 467 FMADD (aa2, bb3, cc06, cc06) 468 LDF [BO + 25 * SIZE], b2 469 470 FMADD (aa1, bb4, cc07, cc07) 471 LDF [BO + 26 * SIZE], b3 472 FMADD (aa2, bb4, cc08, cc08) 473 LDF [BO + 27 * SIZE], b4 474 475 FMADD (aa1, bb5, cc09, cc09) 476 LDF [AO + 6 * SIZE], a3 477 FMADD (aa2, bb5, cc10, cc10) 478 LDF [AO + 7 * SIZE], a4 479 480 FMADD (aa1, bb6, cc11, cc11) 481 nop 482 FMADD (aa2, bb6, cc12, cc12) 483 nop 484 485 FMADD (aa1, bb7, cc13, cc13) 486 LDF [BO + 28 * SIZE], b5 487 FMADD (aa2, bb7, cc14, cc14) 488 LDF [BO + 29 * SIZE], b6 489 490 FMADD (aa1, bb8, cc15, cc15) 491 LDF [BO + 30 * SIZE], b7 492 FMADD (aa2, bb8, cc16, cc16) 493 LDF [BO + 31 * SIZE], b8 494 495 FMADD (aa3, bb9, cc01, cc01) 496 FMADD (aa4, bb9, cc02, cc02) 497 FMADD (aa3, bb2, cc03, cc03) 498 FMADD (aa4, bb2, cc04, cc04) 499 500 FMADD (aa3, bb3, cc05, cc05) 501 LDF [BO + 40 * SIZE], b9 502 FMADD (aa4, bb3, cc06, cc06) 503 LDF [BO + 33 * SIZE], b2 504 505 FMADD (aa3, bb4, cc07, cc07) 506 LDF [BO + 34 * SIZE], b3 507 FMADD (aa4, bb4, cc08, cc08) 508 LDF [BO + 35 * SIZE], b4 509 510 FMADD (aa3, bb5, cc09, cc09) 511 LDF [AO + 16 * SIZE], a1 /****/ 512 FMADD (aa4, bb5, cc10, cc10) 513 LDF [AO + 9 * SIZE], a2 514 515 FMADD (aa3, bb6, cc11, cc11) 516 nop 517 FMADD (aa4, bb6, cc12, cc12) 518 nop 519 520 FMADD (aa3, bb7, cc13, cc13) 521 LDF [BO + 36 * SIZE], b5 522 FMADD (aa4, bb7, cc14, cc14) 523 LDF [BO + 37 * SIZE], b6 524 525 FMADD (aa3, bb8, cc15, cc15) 526 LDF [BO + 38 * SIZE], b7 527 FMADD (aa4, bb8, 
cc16, cc16) 528 LDF [BO + 39 * SIZE], b8 529 530 FMADD (aa5, bb1, cc01, cc01) 531 FMADD (aa2, bb1, cc02, cc02) 532 FMADD (aa5, bb2, cc03, cc03) 533 FMADD (aa2, bb2, cc04, cc04) 534 535 FMADD (aa5, bb3, cc05, cc05) 536 LDF [BO + 48 * SIZE], b1 537 FMADD (aa2, bb3, cc06, cc06) 538 LDF [BO + 41 * SIZE], b2 539 540 FMADD (aa5, bb4, cc07, cc07) 541 LDF [BO + 42 * SIZE], b3 542 FMADD (aa2, bb4, cc08, cc08) 543 LDF [BO + 43 * SIZE], b4 544 545 FMADD (aa5, bb5, cc09, cc09) 546 LDF [AO + 10 * SIZE], a3 547 FMADD (aa2, bb5, cc10, cc10) 548 LDF [AO + 11 * SIZE], a4 549 550 FMADD (aa5, bb6, cc11, cc11) 551 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 552 FMADD (aa2, bb6, cc12, cc12) 553 nop 554 555 FMADD (aa5, bb7, cc13, cc13) 556 LDF [BO + 44 * SIZE], b5 557 FMADD (aa2, bb7, cc14, cc14) 558 LDF [BO + 45 * SIZE], b6 559 560 FMADD (aa5, bb8, cc15, cc15) 561 LDF [BO + 46 * SIZE], b7 562 FMADD (aa2, bb8, cc16, cc16) 563 LDF [BO + 47 * SIZE], b8 564 565 FMADD (aa3, bb9, cc01, cc01) 566 FMADD (aa4, bb9, cc02, cc02) 567 FMADD (aa3, bb2, cc03, cc03) 568 FMADD (aa4, bb2, cc04, cc04) 569 570 FMADD (aa3, bb3, cc05, cc05) 571 LDF [BO + 56 * SIZE], b9 572 FMADD (aa4, bb3, cc06, cc06) 573 LDF [BO + 49 * SIZE], b2 574 575 FMADD (aa3, bb4, cc07, cc07) 576 LDF [BO + 50 * SIZE], b3 577 FMADD (aa4, bb4, cc08, cc08) 578 LDF [BO + 51 * SIZE], b4 579 580 FMADD (aa3, bb5, cc09, cc09) 581 LDF [AO + 12 * SIZE], a5 582 FMADD (aa4, bb5, cc10, cc10) 583 LDF [AO + 13 * SIZE], a2 584 585 FMADD (aa3, bb6, cc11, cc11) 586 cmp L, 0 587 FMADD (aa4, bb6, cc12, cc12) 588 nop 589 590 FMADD (aa3, bb7, cc13, cc13) 591 LDF [BO + 52 * SIZE], b5 592 FMADD (aa4, bb7, cc14, cc14) 593 LDF [BO + 53 * SIZE], b6 594 595 FMADD (aa3, bb8, cc15, cc15) 596 LDF [BO + 54 * SIZE], b7 597 FMADD (aa4, bb8, cc16, cc16) 598 LDF [BO + 55 * SIZE], b8 599 600 FMADD (aa5, bb1, cc01, cc01) 601 FMADD (aa2, bb1, cc02, cc02) 602 FMADD (aa5, bb2, cc03, cc03) 603 FMADD (aa2, bb2, cc04, cc04) 604 605 FMADD (aa5, bb3, cc05, 
cc05) 606 LDF [BO + 64 * SIZE], b1 607 FMADD (aa2, bb3, cc06, cc06) 608 LDF [BO + 57 * SIZE], b2 609 610 FMADD (aa5, bb4, cc07, cc07) 611 LDF [BO + 58 * SIZE], b3 612 FMADD (aa2, bb4, cc08, cc08) 613 LDF [BO + 59 * SIZE], b4 614 615 FMADD (aa5, bb5, cc09, cc09) 616 LDF [AO + 14 * SIZE], a3 617 FMADD (aa2, bb5, cc10, cc10) 618 LDF [AO + 15 * SIZE], a4 619 620 FMADD (aa5, bb6, cc11, cc11) 621 add BO, 64 * SIZE, BO 622 FMADD (aa2, bb6, cc12, cc12) 623 add AO, 16 * SIZE, AO 624 625 FMADD (aa5, bb7, cc13, cc13) 626 LDF [BO - 4 * SIZE], b5 627 FMADD (aa2, bb7, cc14, cc14) 628 LDF [BO - 3 * SIZE], b6 629 630 FMADD (aa5, bb8, cc15, cc15) 631 LDF [BO - 2 * SIZE], b7 632 FMADD (aa2, bb8, cc16, cc16) 633 LDF [BO - 1 * SIZE], b8 634 635 FMADD (aa3, bb9, cc01, cc01) 636 FMADD (aa4, bb9, cc02, cc02) 637 FMADD (aa3, bb2, cc03, cc03) 638 FMADD (aa4, bb2, cc04, cc04) 639 640 FMADD (aa3, bb3, cc05, cc05) 641 LDF [BO + 8 * SIZE], b9 642 FMADD (aa4, bb3, cc06, cc06) 643 LDF [BO + 1 * SIZE], b2 644 645 FMADD (aa3, bb4, cc07, cc07) 646 LDF [BO + 2 * SIZE], b3 647 FMADD (aa4, bb4, cc08, cc08) 648 LDF [BO + 3 * SIZE], b4 649 650 FMADD (aa3, bb5, cc09, cc09) 651 LDF [AO + 8 * SIZE], a5 /****/ 652 FMADD (aa4, bb5, cc10, cc10) 653 LDF [AO + 1 * SIZE], a2 654 655 FMADD (aa3, bb6, cc11, cc11) 656 FMADD (aa4, bb6, cc12, cc12) 657 658 FMADD (aa3, bb7, cc13, cc13) 659 LDF [BO + 4 * SIZE], b5 660 FMADD (aa4, bb7, cc14, cc14) 661 LDF [BO + 5 * SIZE], b6 662 663 FMADD (aa3, bb8, cc15, cc15) 664 LDF [BO + 6 * SIZE], b7 665 FMADD (aa4, bb8, cc16, cc16) 666 ble,pn %icc, .LL15 667 LDF [BO + 7 * SIZE], b8 668 669 FMADD (aa1, bb1, cc01, cc01) 670 FMADD (aa2, bb1, cc02, cc02) 671 FMADD (aa1, bb2, cc03, cc03) 672 FMADD (aa2, bb2, cc04, cc04) 673 674 FMADD (aa1, bb3, cc05, cc05) 675 LDF [BO + 16 * SIZE], b1 676 FMADD (aa2, bb3, cc06, cc06) 677 LDF [BO + 9 * SIZE], b2 678 679 FMADD (aa1, bb4, cc07, cc07) 680 LDF [BO + 10 * SIZE], b3 681 FMADD (aa2, bb4, cc08, cc08) 682 LDF [BO + 11 * SIZE], b4 683 684 FMADD 
(aa1, bb5, cc09, cc09) 685 LDF [AO + 2 * SIZE], a3 686 FMADD (aa2, bb5, cc10, cc10) 687 LDF [AO + 3 * SIZE], a4 688 689 FMADD (aa1, bb6, cc11, cc11) 690 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 691 FMADD (aa2, bb6, cc12, cc12) 692 nop 693 694 FMADD (aa1, bb7, cc13, cc13) 695 LDF [BO + 12 * SIZE], b5 696 FMADD (aa2, bb7, cc14, cc14) 697 LDF [BO + 13 * SIZE], b6 698 699 FMADD (aa1, bb8, cc15, cc15) 700 LDF [BO + 14 * SIZE], b7 701 FMADD (aa2, bb8, cc16, cc16) 702 LDF [BO + 15 * SIZE], b8 703 704 FMADD (aa3, bb9, cc01, cc01) 705 FMADD (aa4, bb9, cc02, cc02) 706 FMADD (aa3, bb2, cc03, cc03) 707 FMADD (aa4, bb2, cc04, cc04) 708 709 FMADD (aa3, bb3, cc05, cc05) 710 LDF [BO + 24 * SIZE], b9 711 FMADD (aa4, bb3, cc06, cc06) 712 LDF [BO + 17 * SIZE], b2 713 714 FMADD (aa3, bb4, cc07, cc07) 715 LDF [BO + 18 * SIZE], b3 716 FMADD (aa4, bb4, cc08, cc08) 717 LDF [BO + 19 * SIZE], b4 718 719 FMADD (aa3, bb5, cc09, cc09) 720 LDF [AO + 4 * SIZE], a1 721 FMADD (aa4, bb5, cc10, cc10) 722 LDF [AO + 5 * SIZE], a2 723 724 FMADD (aa3, bb6, cc11, cc11) 725 add L, -1, L 726 FMADD (aa4, bb6, cc12, cc12) 727 nop 728 729 FMADD (aa3, bb7, cc13, cc13) 730 LDF [BO + 20 * SIZE], b5 731 FMADD (aa4, bb7, cc14, cc14) 732 LDF [BO + 21 * SIZE], b6 733 734 FMADD (aa3, bb8, cc15, cc15) 735 LDF [BO + 22 * SIZE], b7 736 FMADD (aa4, bb8, cc16, cc16) 737 LDF [BO + 23 * SIZE], b8 738 739 FMADD (aa1, bb1, cc01, cc01) 740 FMADD (aa2, bb1, cc02, cc02) 741 FMADD (aa1, bb2, cc03, cc03) 742 FMADD (aa2, bb2, cc04, cc04) 743 744 FMADD (aa1, bb3, cc05, cc05) 745 LDF [BO + 32 * SIZE], b1 746 FMADD (aa2, bb3, cc06, cc06) 747 LDF [BO + 25 * SIZE], b2 748 749 FMADD (aa1, bb4, cc07, cc07) 750 LDF [BO + 26 * SIZE], b3 751 FMADD (aa2, bb4, cc08, cc08) 752 LDF [BO + 27 * SIZE], b4 753 754 FMADD (aa1, bb5, cc09, cc09) 755 LDF [AO + 6 * SIZE], a3 756 FMADD (aa2, bb5, cc10, cc10) 757 LDF [AO + 7 * SIZE], a4 758 759 FMADD (aa1, bb6, cc11, cc11) 760 nop 761 FMADD (aa2, bb6, cc12, cc12) 762 nop 763 764 FMADD 
(aa1, bb7, cc13, cc13) 765 LDF [BO + 28 * SIZE], b5 766 FMADD (aa2, bb7, cc14, cc14) 767 LDF [BO + 29 * SIZE], b6 768 769 FMADD (aa1, bb8, cc15, cc15) 770 LDF [BO + 30 * SIZE], b7 771 FMADD (aa2, bb8, cc16, cc16) 772 LDF [BO + 31 * SIZE], b8 773 774 FMADD (aa3, bb9, cc01, cc01) 775 FMADD (aa4, bb9, cc02, cc02) 776 FMADD (aa3, bb2, cc03, cc03) 777 FMADD (aa4, bb2, cc04, cc04) 778 779 FMADD (aa3, bb3, cc05, cc05) 780 LDF [BO + 40 * SIZE], b9 781 FMADD (aa4, bb3, cc06, cc06) 782 LDF [BO + 33 * SIZE], b2 783 784 FMADD (aa3, bb4, cc07, cc07) 785 LDF [BO + 34 * SIZE], b3 786 FMADD (aa4, bb4, cc08, cc08) 787 LDF [BO + 35 * SIZE], b4 788 789 FMADD (aa3, bb5, cc09, cc09) 790 LDF [AO + 16 * SIZE], a1 /****/ 791 FMADD (aa4, bb5, cc10, cc10) 792 LDF [AO + 9 * SIZE], a2 793 794 FMADD (aa3, bb6, cc11, cc11) 795 nop 796 FMADD (aa4, bb6, cc12, cc12) 797 nop 798 799 FMADD (aa3, bb7, cc13, cc13) 800 LDF [BO + 36 * SIZE], b5 801 FMADD (aa4, bb7, cc14, cc14) 802 LDF [BO + 37 * SIZE], b6 803 804 FMADD (aa3, bb8, cc15, cc15) 805 LDF [BO + 38 * SIZE], b7 806 FMADD (aa4, bb8, cc16, cc16) 807 LDF [BO + 39 * SIZE], b8 808 809 FMADD (aa5, bb1, cc01, cc01) 810 FMADD (aa2, bb1, cc02, cc02) 811 FMADD (aa5, bb2, cc03, cc03) 812 FMADD (aa2, bb2, cc04, cc04) 813 814 FMADD (aa5, bb3, cc05, cc05) 815 LDF [BO + 48 * SIZE], b1 816 FMADD (aa2, bb3, cc06, cc06) 817 LDF [BO + 41 * SIZE], b2 818 819 FMADD (aa5, bb4, cc07, cc07) 820 LDF [BO + 42 * SIZE], b3 821 FMADD (aa2, bb4, cc08, cc08) 822 LDF [BO + 43 * SIZE], b4 823 824 FMADD (aa5, bb5, cc09, cc09) 825 LDF [AO + 10 * SIZE], a3 826 FMADD (aa2, bb5, cc10, cc10) 827 LDF [AO + 11 * SIZE], a4 828 829 FMADD (aa5, bb6, cc11, cc11) 830 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 831 FMADD (aa2, bb6, cc12, cc12) 832 nop 833 834 FMADD (aa5, bb7, cc13, cc13) 835 LDF [BO + 44 * SIZE], b5 836 FMADD (aa2, bb7, cc14, cc14) 837 LDF [BO + 45 * SIZE], b6 838 839 FMADD (aa5, bb8, cc15, cc15) 840 LDF [BO + 46 * SIZE], b7 841 FMADD (aa2, bb8, cc16, 
cc16) 842 LDF [BO + 47 * SIZE], b8 843 844 FMADD (aa3, bb9, cc01, cc01) 845 FMADD (aa4, bb9, cc02, cc02) 846 FMADD (aa3, bb2, cc03, cc03) 847 FMADD (aa4, bb2, cc04, cc04) 848 849 FMADD (aa3, bb3, cc05, cc05) 850 LDF [BO + 56 * SIZE], b9 851 FMADD (aa4, bb3, cc06, cc06) 852 LDF [BO + 49 * SIZE], b2 853 854 FMADD (aa3, bb4, cc07, cc07) 855 LDF [BO + 50 * SIZE], b3 856 FMADD (aa4, bb4, cc08, cc08) 857 LDF [BO + 51 * SIZE], b4 858 859 FMADD (aa3, bb5, cc09, cc09) 860 LDF [AO + 12 * SIZE], a5 861 FMADD (aa4, bb5, cc10, cc10) 862 LDF [AO + 13 * SIZE], a2 863 864 FMADD (aa3, bb6, cc11, cc11) 865 cmp L, 0 866 FMADD (aa4, bb6, cc12, cc12) 867 nop 868 869 FMADD (aa3, bb7, cc13, cc13) 870 LDF [BO + 52 * SIZE], b5 871 FMADD (aa4, bb7, cc14, cc14) 872 LDF [BO + 53 * SIZE], b6 873 874 FMADD (aa3, bb8, cc15, cc15) 875 LDF [BO + 54 * SIZE], b7 876 FMADD (aa4, bb8, cc16, cc16) 877 LDF [BO + 55 * SIZE], b8 878 879 FMADD (aa5, bb1, cc01, cc01) 880 FMADD (aa2, bb1, cc02, cc02) 881 FMADD (aa5, bb2, cc03, cc03) 882 FMADD (aa2, bb2, cc04, cc04) 883 884 FMADD (aa5, bb3, cc05, cc05) 885 LDF [BO + 64 * SIZE], b1 886 FMADD (aa2, bb3, cc06, cc06) 887 LDF [BO + 57 * SIZE], b2 888 889 FMADD (aa5, bb4, cc07, cc07) 890 LDF [BO + 58 * SIZE], b3 891 FMADD (aa2, bb4, cc08, cc08) 892 LDF [BO + 59 * SIZE], b4 893 894 FMADD (aa5, bb5, cc09, cc09) 895 LDF [AO + 14 * SIZE], a3 896 FMADD (aa2, bb5, cc10, cc10) 897 LDF [AO + 15 * SIZE], a4 898 899 FMADD (aa5, bb6, cc11, cc11) 900 add BO, 64 * SIZE, BO 901 FMADD (aa2, bb6, cc12, cc12) 902 add AO, 16 * SIZE, AO 903 904 FMADD (aa5, bb7, cc13, cc13) 905 LDF [BO - 4 * SIZE], b5 906 FMADD (aa2, bb7, cc14, cc14) 907 LDF [BO - 3 * SIZE], b6 908 909 FMADD (aa5, bb8, cc15, cc15) 910 LDF [BO - 2 * SIZE], b7 911 FMADD (aa2, bb8, cc16, cc16) 912 LDF [BO - 1 * SIZE], b8 913 914 FMADD (aa3, bb9, cc01, cc01) 915 FMADD (aa4, bb9, cc02, cc02) 916 FMADD (aa3, bb2, cc03, cc03) 917 FMADD (aa4, bb2, cc04, cc04) 918 919 FMADD (aa3, bb3, cc05, cc05) 920 LDF [BO + 8 * SIZE], b9 
921 FMADD (aa4, bb3, cc06, cc06) 922 LDF [BO + 1 * SIZE], b2 923 924 FMADD (aa3, bb4, cc07, cc07) 925 LDF [BO + 2 * SIZE], b3 926 FMADD (aa4, bb4, cc08, cc08) 927 LDF [BO + 3 * SIZE], b4 928 929 FMADD (aa3, bb5, cc09, cc09) 930 LDF [AO + 8 * SIZE], a5 /****/ 931 FMADD (aa4, bb5, cc10, cc10) 932 LDF [AO + 1 * SIZE], a2 933 934 FMADD (aa3, bb6, cc11, cc11) 935 FMADD (aa4, bb6, cc12, cc12) 936 937 FMADD (aa3, bb7, cc13, cc13) 938 LDF [BO + 4 * SIZE], b5 939 FMADD (aa4, bb7, cc14, cc14) 940 LDF [BO + 5 * SIZE], b6 941 942 FMADD (aa3, bb8, cc15, cc15) 943 LDF [BO + 6 * SIZE], b7 944 FMADD (aa4, bb8, cc16, cc16) 945 bg,pt %icc, .LL13 946 LDF [BO + 7 * SIZE], b8 947 .align 4 948 949.LL15: 950#if defined(LT) || defined(RN) 951 and KK, 7, L 952#else 953 sub K, KK, L 954 and L, 7, L 955#endif 956 cmp L, 0 957 ble,a,pn %icc, .LL18 958 nop 959 .align 4 960 961.LL17: 962 FMADD (aa1, bb1, cc01, cc01) 963 add L, -1, L 964 FMADD (aa2, bb1, cc02, cc02) 965 nop 966 967 FMADD (aa1, bb2, cc03, cc03) 968 LDF [BO + 8 * SIZE], b1 969 FMADD (aa2, bb2, cc04, cc04) 970 LDF [BO + 9 * SIZE], b2 971 972 FMADD (aa1, bb3, cc05, cc05) 973 cmp L, 0 974 FMADD (aa2, bb3, cc06, cc06) 975 nop 976 977 FMADD (aa1, bb4, cc07, cc07) 978 LDF [BO + 10 * SIZE], b3 979 FMADD (aa2, bb4, cc08, cc08) 980 LDF [BO + 11 * SIZE], b4 981 982 FMADD (aa1, bb5, cc09, cc09) 983 nop 984 FMADD (aa2, bb5, cc10, cc10) 985 nop 986 987 FMADD (aa1, bb6, cc11, cc11) 988 LDF [BO + 12 * SIZE], b5 989 FMADD (aa2, bb6, cc12, cc12) 990 LDF [BO + 13 * SIZE], b6 991 992 FMADD (aa1, bb7, cc13, cc13) 993 add AO, 2 * SIZE, AO 994 FMADD (aa2, bb7, cc14, cc14) 995 add BO, 8 * SIZE, BO 996 997 FMADD (aa1, bb8, cc15, cc15) 998 LDF [AO + 0 * SIZE], a1 999 FMADD (aa2, bb8, cc16, cc16) 1000 LDF [AO + 1 * SIZE], a2 1001 1002 LDF [BO + 6 * SIZE], b7 1003 bg,pt %icc, .LL17 1004 LDF [BO + 7 * SIZE], b8 1005 nop 1006 .align 4 1007 1008.LL18: 1009#if defined(LN) || defined(RT) 1010#ifdef LN 1011 sub KK, 2, TEMP1 1012#else 1013 sub KK, 8, TEMP1 
1014#endif 1015 sll TEMP1, BASE_SHIFT + 1, TEMP2 1016 sll TEMP1, BASE_SHIFT + 3, TEMP1 1017 1018 add AORIG, TEMP2, AO 1019 add B, TEMP1, BO 1020#endif 1021 1022#if defined(LN) || defined(LT) 1023 LDF [BO + 0 * SIZE], a1 1024 LDF [BO + 1 * SIZE], a2 1025 LDF [BO + 2 * SIZE], a3 1026 LDF [BO + 3 * SIZE], a4 1027 1028 LDF [BO + 4 * SIZE], b1 1029 LDF [BO + 5 * SIZE], b2 1030 LDF [BO + 6 * SIZE], b3 1031 LDF [BO + 7 * SIZE], b4 1032 1033 FSUB a1, c01, c01 1034 FSUB a2, c03, c03 1035 FSUB a3, c05, c05 1036 FSUB a4, c07, c07 1037 1038 FSUB b1, c09, c09 1039 FSUB b2, c11, c11 1040 FSUB b3, c13, c13 1041 FSUB b4, c15, c15 1042 1043 LDF [BO + 8 * SIZE], a1 1044 LDF [BO + 9 * SIZE], a2 1045 LDF [BO + 10 * SIZE], a3 1046 LDF [BO + 11 * SIZE], a4 1047 1048 LDF [BO + 12 * SIZE], b1 1049 LDF [BO + 13 * SIZE], b2 1050 LDF [BO + 14 * SIZE], b3 1051 LDF [BO + 15 * SIZE], b4 1052 1053 FSUB a1, c02, c02 1054 FSUB a2, c04, c04 1055 FSUB a3, c06, c06 1056 FSUB a4, c08, c08 1057 1058 FSUB b1, c10, c10 1059 FSUB b2, c12, c12 1060 FSUB b3, c14, c14 1061 FSUB b4, c16, c16 1062#else 1063 LDF [AO + 0 * SIZE], a1 1064 LDF [AO + 1 * SIZE], a2 1065 LDF [AO + 2 * SIZE], a3 1066 LDF [AO + 3 * SIZE], a4 1067 1068 LDF [AO + 4 * SIZE], b1 1069 LDF [AO + 5 * SIZE], b2 1070 LDF [AO + 6 * SIZE], b3 1071 LDF [AO + 7 * SIZE], b4 1072 1073 FSUB a1, c01, c01 1074 FSUB a2, c02, c02 1075 FSUB a3, c03, c03 1076 FSUB a4, c04, c04 1077 1078 FSUB b1, c05, c05 1079 FSUB b2, c06, c06 1080 FSUB b3, c07, c07 1081 FSUB b4, c08, c08 1082 1083 LDF [AO + 8 * SIZE], a1 1084 LDF [AO + 9 * SIZE], a2 1085 LDF [AO + 10 * SIZE], a3 1086 LDF [AO + 11 * SIZE], a4 1087 1088 LDF [AO + 12 * SIZE], b1 1089 LDF [AO + 13 * SIZE], b2 1090 LDF [AO + 14 * SIZE], b3 1091 LDF [AO + 15 * SIZE], b4 1092 1093 FSUB a1, c09, c09 1094 FSUB a2, c10, c10 1095 FSUB a3, c11, c11 1096 FSUB a4, c12, c12 1097 1098 FSUB b1, c13, c13 1099 FSUB b2, c14, c14 1100 FSUB b3, c15, c15 1101 FSUB b4, c16, c16 1102#endif 1103 1104#ifdef LN 1105 LDF [AO + 3 * 
SIZE], a1 1106 LDF [AO + 2 * SIZE], a2 1107 LDF [AO + 0 * SIZE], a3 1108 1109 FMUL a1, c02, c02 1110 FMUL a1, c04, c04 1111 FMUL a1, c06, c06 1112 FMUL a1, c08, c08 1113 FMUL a1, c10, c10 1114 FMUL a1, c12, c12 1115 FMUL a1, c14, c14 1116 FMUL a1, c16, c16 1117 1118 FNMSUB (aa2, cc02, cc01, cc01) 1119 FNMSUB (aa2, cc04, cc03, cc03) 1120 FNMSUB (aa2, cc06, cc05, cc05) 1121 FNMSUB (aa2, cc08, cc07, cc07) 1122 FNMSUB (aa2, cc10, cc09, cc09) 1123 FNMSUB (aa2, cc12, cc11, cc11) 1124 FNMSUB (aa2, cc14, cc13, cc13) 1125 FNMSUB (aa2, cc16, cc15, cc15) 1126 1127 FMUL a3, c01, c01 1128 FMUL a3, c03, c03 1129 FMUL a3, c05, c05 1130 FMUL a3, c07, c07 1131 FMUL a3, c09, c09 1132 FMUL a3, c11, c11 1133 FMUL a3, c13, c13 1134 FMUL a3, c15, c15 1135#endif 1136 1137#ifdef LT 1138 LDF [AO + 0 * SIZE], a1 1139 LDF [AO + 1 * SIZE], a2 1140 LDF [AO + 3 * SIZE], a3 1141 1142 FMUL a1, c01, c01 1143 FMUL a1, c03, c03 1144 FMUL a1, c05, c05 1145 FMUL a1, c07, c07 1146 FMUL a1, c09, c09 1147 FMUL a1, c11, c11 1148 FMUL a1, c13, c13 1149 FMUL a1, c15, c15 1150 1151 FNMSUB (aa2, cc01, cc02, cc02) 1152 FNMSUB (aa2, cc03, cc04, cc04) 1153 FNMSUB (aa2, cc05, cc06, cc06) 1154 FNMSUB (aa2, cc07, cc08, cc08) 1155 FNMSUB (aa2, cc09, cc10, cc10) 1156 FNMSUB (aa2, cc11, cc12, cc12) 1157 FNMSUB (aa2, cc13, cc14, cc14) 1158 FNMSUB (aa2, cc15, cc16, cc16) 1159 1160 FMUL a3, c02, c02 1161 FMUL a3, c04, c04 1162 FMUL a3, c06, c06 1163 FMUL a3, c08, c08 1164 FMUL a3, c10, c10 1165 FMUL a3, c12, c12 1166 FMUL a3, c14, c14 1167 FMUL a3, c16, c16 1168#endif 1169 1170#ifdef RN 1171 LDF [BO + 0 * SIZE], a1 1172 LDF [BO + 1 * SIZE], a2 1173 LDF [BO + 2 * SIZE], a3 1174 LDF [BO + 3 * SIZE], a4 1175 LDF [BO + 4 * SIZE], b1 1176 LDF [BO + 5 * SIZE], b2 1177 LDF [BO + 6 * SIZE], b3 1178 LDF [BO + 7 * SIZE], b4 1179 1180 FMUL a1, c01, c01 1181 FMUL a1, c02, c02 1182 1183 FNMSUB (aa2, cc01, cc03, cc03) 1184 FNMSUB (aa2, cc02, cc04, cc04) 1185 FNMSUB (aa3, cc01, cc05, cc05) 1186 FNMSUB (aa3, cc02, cc06, cc06) 1187 
FNMSUB (aa4, cc01, cc07, cc07) 1188 FNMSUB (aa4, cc02, cc08, cc08) 1189 FNMSUB (bb1, cc01, cc09, cc09) 1190 FNMSUB (bb1, cc02, cc10, cc10) 1191 FNMSUB (bb2, cc01, cc11, cc11) 1192 FNMSUB (bb2, cc02, cc12, cc12) 1193 FNMSUB (bb3, cc01, cc13, cc13) 1194 FNMSUB (bb3, cc02, cc14, cc14) 1195 FNMSUB (bb4, cc01, cc15, cc15) 1196 FNMSUB (bb4, cc02, cc16, cc16) 1197 1198 LDF [BO + 9 * SIZE], a1 1199 LDF [BO + 10 * SIZE], a2 1200 LDF [BO + 11 * SIZE], a3 1201 LDF [BO + 12 * SIZE], a4 1202 LDF [BO + 13 * SIZE], b1 1203 LDF [BO + 14 * SIZE], b2 1204 LDF [BO + 15 * SIZE], b3 1205 1206 FMUL a1, c03, c03 1207 FMUL a1, c04, c04 1208 1209 FNMSUB (aa2, cc03, cc05, cc05) 1210 FNMSUB (aa2, cc04, cc06, cc06) 1211 FNMSUB (aa3, cc03, cc07, cc07) 1212 FNMSUB (aa3, cc04, cc08, cc08) 1213 FNMSUB (aa4, cc03, cc09, cc09) 1214 FNMSUB (aa4, cc04, cc10, cc10) 1215 FNMSUB (bb1, cc03, cc11, cc11) 1216 FNMSUB (bb1, cc04, cc12, cc12) 1217 FNMSUB (bb2, cc03, cc13, cc13) 1218 FNMSUB (bb2, cc04, cc14, cc14) 1219 FNMSUB (bb3, cc03, cc15, cc15) 1220 FNMSUB (bb3, cc04, cc16, cc16) 1221 1222 LDF [BO + 18 * SIZE], a1 1223 LDF [BO + 19 * SIZE], a2 1224 LDF [BO + 20 * SIZE], a3 1225 LDF [BO + 21 * SIZE], a4 1226 LDF [BO + 22 * SIZE], b1 1227 LDF [BO + 23 * SIZE], b2 1228 1229 FMUL a1, c05, c05 1230 FMUL a1, c06, c06 1231 1232 FNMSUB (aa2, cc05, cc07, cc07) 1233 FNMSUB (aa2, cc06, cc08, cc08) 1234 FNMSUB (aa3, cc05, cc09, cc09) 1235 FNMSUB (aa3, cc06, cc10, cc10) 1236 FNMSUB (aa4, cc05, cc11, cc11) 1237 FNMSUB (aa4, cc06, cc12, cc12) 1238 FNMSUB (bb1, cc05, cc13, cc13) 1239 FNMSUB (bb1, cc06, cc14, cc14) 1240 FNMSUB (bb2, cc05, cc15, cc15) 1241 FNMSUB (bb2, cc06, cc16, cc16) 1242 1243 LDF [BO + 27 * SIZE], a1 1244 LDF [BO + 28 * SIZE], a2 1245 LDF [BO + 29 * SIZE], a3 1246 LDF [BO + 30 * SIZE], a4 1247 LDF [BO + 31 * SIZE], b1 1248 1249 FMUL a1, c07, c07 1250 FMUL a1, c08, c08 1251 1252 FNMSUB (aa2, cc07, cc09, cc09) 1253 FNMSUB (aa2, cc08, cc10, cc10) 1254 FNMSUB (aa3, cc07, cc11, cc11) 1255 FNMSUB (aa3, 
cc08, cc12, cc12) 1256 FNMSUB (aa4, cc07, cc13, cc13) 1257 FNMSUB (aa4, cc08, cc14, cc14) 1258 FNMSUB (bb1, cc07, cc15, cc15) 1259 FNMSUB (bb1, cc08, cc16, cc16) 1260 1261 LDF [BO + 36 * SIZE], a1 1262 LDF [BO + 37 * SIZE], a2 1263 LDF [BO + 38 * SIZE], a3 1264 LDF [BO + 39 * SIZE], a4 1265 1266 FMUL a1, c09, c09 1267 FMUL a1, c10, c10 1268 1269 FNMSUB (aa2, cc09, cc11, cc11) 1270 FNMSUB (aa2, cc10, cc12, cc12) 1271 FNMSUB (aa3, cc09, cc13, cc13) 1272 FNMSUB (aa3, cc10, cc14, cc14) 1273 FNMSUB (aa4, cc09, cc15, cc15) 1274 FNMSUB (aa4, cc10, cc16, cc16) 1275 1276 LDF [BO + 45 * SIZE], a1 1277 LDF [BO + 46 * SIZE], a2 1278 LDF [BO + 47 * SIZE], a3 1279 1280 FMUL a1, c11, c11 1281 FMUL a1, c12, c12 1282 1283 FNMSUB (aa2, cc11, cc13, cc13) 1284 FNMSUB (aa2, cc12, cc14, cc14) 1285 FNMSUB (aa3, cc11, cc15, cc15) 1286 FNMSUB (aa3, cc12, cc16, cc16) 1287 1288 LDF [BO + 54 * SIZE], a1 1289 LDF [BO + 55 * SIZE], a2 1290 1291 FMUL a1, c13, c13 1292 FMUL a1, c14, c14 1293 1294 FNMSUB (aa2, cc13, cc15, cc15) 1295 FNMSUB (aa2, cc14, cc16, cc16) 1296 1297 LDF [BO + 63 * SIZE], a1 1298 1299 FMUL a1, c15, c15 1300 FMUL a1, c16, c16 1301#endif 1302 1303#ifdef RT 1304 LDF [BO + 63 * SIZE], a1 1305 LDF [BO + 62 * SIZE], a2 1306 LDF [BO + 61 * SIZE], a3 1307 LDF [BO + 60 * SIZE], a4 1308 LDF [BO + 59 * SIZE], b1 1309 LDF [BO + 58 * SIZE], b2 1310 LDF [BO + 57 * SIZE], b3 1311 LDF [BO + 56 * SIZE], b4 1312 1313 FMUL a1, c16, c16 1314 FMUL a1, c15, c15 1315 1316 FNMSUB (aa2, cc16, cc14, cc14) 1317 FNMSUB (aa2, cc15, cc13, cc13) 1318 FNMSUB (aa3, cc16, cc12, cc12) 1319 FNMSUB (aa3, cc15, cc11, cc11) 1320 FNMSUB (aa4, cc16, cc10, cc10) 1321 FNMSUB (aa4, cc15, cc09, cc09) 1322 FNMSUB (bb1, cc16, cc08, cc08) 1323 FNMSUB (bb1, cc15, cc07, cc07) 1324 FNMSUB (bb2, cc16, cc06, cc06) 1325 FNMSUB (bb2, cc15, cc05, cc05) 1326 FNMSUB (bb3, cc16, cc04, cc04) 1327 FNMSUB (bb3, cc15, cc03, cc03) 1328 FNMSUB (bb4, cc16, cc02, cc02) 1329 FNMSUB (bb4, cc15, cc01, cc01) 1330 1331 LDF [BO + 54 * SIZE], a1 
1332 LDF [BO + 53 * SIZE], a2 1333 LDF [BO + 52 * SIZE], a3 1334 LDF [BO + 51 * SIZE], a4 1335 LDF [BO + 50 * SIZE], b1 1336 LDF [BO + 49 * SIZE], b2 1337 LDF [BO + 48 * SIZE], b3 1338 1339 FMUL a1, c14, c14 1340 FMUL a1, c13, c13 1341 1342 FNMSUB (aa2, cc14, cc12, cc12) 1343 FNMSUB (aa2, cc13, cc11, cc11) 1344 FNMSUB (aa3, cc14, cc10, cc10) 1345 FNMSUB (aa3, cc13, cc09, cc09) 1346 FNMSUB (aa4, cc14, cc08, cc08) 1347 FNMSUB (aa4, cc13, cc07, cc07) 1348 FNMSUB (bb1, cc14, cc06, cc06) 1349 FNMSUB (bb1, cc13, cc05, cc05) 1350 FNMSUB (bb2, cc14, cc04, cc04) 1351 FNMSUB (bb2, cc13, cc03, cc03) 1352 FNMSUB (bb3, cc14, cc02, cc02) 1353 FNMSUB (bb3, cc13, cc01, cc01) 1354 1355 LDF [BO + 45 * SIZE], a1 1356 LDF [BO + 44 * SIZE], a2 1357 LDF [BO + 43 * SIZE], a3 1358 LDF [BO + 42 * SIZE], a4 1359 LDF [BO + 41 * SIZE], b1 1360 LDF [BO + 40 * SIZE], b2 1361 1362 FMUL a1, c12, c12 1363 FMUL a1, c11, c11 1364 1365 FNMSUB (aa2, cc12, cc10, cc10) 1366 FNMSUB (aa2, cc11, cc09, cc09) 1367 FNMSUB (aa3, cc12, cc08, cc08) 1368 FNMSUB (aa3, cc11, cc07, cc07) 1369 FNMSUB (aa4, cc12, cc06, cc06) 1370 FNMSUB (aa4, cc11, cc05, cc05) 1371 FNMSUB (bb1, cc12, cc04, cc04) 1372 FNMSUB (bb1, cc11, cc03, cc03) 1373 FNMSUB (bb2, cc12, cc02, cc02) 1374 FNMSUB (bb2, cc11, cc01, cc01) 1375 1376 LDF [BO + 36 * SIZE], a1 1377 LDF [BO + 35 * SIZE], a2 1378 LDF [BO + 34 * SIZE], a3 1379 LDF [BO + 33 * SIZE], a4 1380 LDF [BO + 32 * SIZE], b1 1381 1382 FMUL a1, c10, c10 1383 FMUL a1, c09, c09 1384 1385 FNMSUB (aa2, cc10, cc08, cc08) 1386 FNMSUB (aa2, cc09, cc07, cc07) 1387 FNMSUB (aa3, cc10, cc06, cc06) 1388 FNMSUB (aa3, cc09, cc05, cc05) 1389 FNMSUB (aa4, cc10, cc04, cc04) 1390 FNMSUB (aa4, cc09, cc03, cc03) 1391 FNMSUB (bb1, cc10, cc02, cc02) 1392 FNMSUB (bb1, cc09, cc01, cc01) 1393 1394 LDF [BO + 27 * SIZE], a1 1395 LDF [BO + 26 * SIZE], a2 1396 LDF [BO + 25 * SIZE], a3 1397 LDF [BO + 24 * SIZE], a4 1398 1399 FMUL a1, c08, c08 1400 FMUL a1, c07, c07 1401 1402 FNMSUB (aa2, cc08, cc06, cc06) 1403 FNMSUB 
(aa2, cc07, cc05, cc05) 1404 FNMSUB (aa3, cc08, cc04, cc04) 1405 FNMSUB (aa3, cc07, cc03, cc03) 1406 FNMSUB (aa4, cc08, cc02, cc02) 1407 FNMSUB (aa4, cc07, cc01, cc01) 1408 1409 LDF [BO + 18 * SIZE], a1 1410 LDF [BO + 17 * SIZE], a2 1411 LDF [BO + 16 * SIZE], a3 1412 1413 FMUL a1, c06, c06 1414 FMUL a1, c05, c05 1415 1416 FNMSUB (aa2, cc06, cc04, cc04) 1417 FNMSUB (aa2, cc05, cc03, cc03) 1418 FNMSUB (aa3, cc06, cc02, cc02) 1419 FNMSUB (aa3, cc05, cc01, cc01) 1420 1421 LDF [BO + 9 * SIZE], a1 1422 LDF [BO + 8 * SIZE], a2 1423 1424 FMUL a1, c04, c04 1425 FMUL a1, c03, c03 1426 1427 FNMSUB (aa2, cc04, cc02, cc02) 1428 FNMSUB (aa2, cc03, cc01, cc01) 1429 1430 LDF [BO + 0 * SIZE], a1 1431 1432 FMUL a1, c02, c02 1433 FMUL a1, c01, c01 1434#endif 1435 1436#ifdef LN 1437 add C1, -2 * SIZE, C1 1438 add C2, -2 * SIZE, C2 1439 add C3, -2 * SIZE, C3 1440 add C4, -2 * SIZE, C4 1441 add C5, -2 * SIZE, C5 1442 add C6, -2 * SIZE, C6 1443 add C7, -2 * SIZE, C7 1444 add C8, -2 * SIZE, C8 1445#endif 1446 1447#if defined(LN) || defined(LT) 1448 STF c01, [BO + 0 * SIZE] 1449 STF c03, [BO + 1 * SIZE] 1450 STF c05, [BO + 2 * SIZE] 1451 STF c07, [BO + 3 * SIZE] 1452 1453 STF c09, [BO + 4 * SIZE] 1454 STF c11, [BO + 5 * SIZE] 1455 STF c13, [BO + 6 * SIZE] 1456 STF c15, [BO + 7 * SIZE] 1457 1458 STF c02, [BO + 8 * SIZE] 1459 STF c04, [BO + 9 * SIZE] 1460 STF c06, [BO + 10 * SIZE] 1461 STF c08, [BO + 11 * SIZE] 1462 1463 STF c10, [BO + 12 * SIZE] 1464 STF c12, [BO + 13 * SIZE] 1465 STF c14, [BO + 14 * SIZE] 1466 STF c16, [BO + 15 * SIZE] 1467#else 1468 STF c01, [AO + 0 * SIZE] 1469 STF c02, [AO + 1 * SIZE] 1470 STF c03, [AO + 2 * SIZE] 1471 STF c04, [AO + 3 * SIZE] 1472 1473 STF c05, [AO + 4 * SIZE] 1474 STF c06, [AO + 5 * SIZE] 1475 STF c07, [AO + 6 * SIZE] 1476 STF c08, [AO + 7 * SIZE] 1477 1478 STF c09, [AO + 8 * SIZE] 1479 STF c10, [AO + 9 * SIZE] 1480 STF c11, [AO + 10 * SIZE] 1481 STF c12, [AO + 11 * SIZE] 1482 1483 STF c13, [AO + 12 * SIZE] 1484 STF c14, [AO + 13 * SIZE] 1485 STF c15, 
[AO + 14 * SIZE] 1486 STF c16, [AO + 15 * SIZE] 1487#endif 1488 1489 STF c01, [C1 + 0 * SIZE] 1490 STF c02, [C1 + 1 * SIZE] 1491 STF c03, [C2 + 0 * SIZE] 1492 STF c04, [C2 + 1 * SIZE] 1493 1494 STF c05, [C3 + 0 * SIZE] 1495 STF c06, [C3 + 1 * SIZE] 1496 STF c07, [C4 + 0 * SIZE] 1497 STF c08, [C4 + 1 * SIZE] 1498 1499 STF c09, [C5 + 0 * SIZE] 1500 STF c10, [C5 + 1 * SIZE] 1501 STF c11, [C6 + 0 * SIZE] 1502 STF c12, [C6 + 1 * SIZE] 1503 1504 STF c13, [C7 + 0 * SIZE] 1505 STF c14, [C7 + 1 * SIZE] 1506 STF c15, [C8 + 0 * SIZE] 1507 STF c16, [C8 + 1 * SIZE] 1508 1509#ifndef LN 1510 add C1, 2 * SIZE, C1 1511 add C2, 2 * SIZE, C2 1512 add C3, 2 * SIZE, C3 1513 add C4, 2 * SIZE, C4 1514 add C5, 2 * SIZE, C5 1515 add C6, 2 * SIZE, C6 1516 add C7, 2 * SIZE, C7 1517 add C8, 2 * SIZE, C8 1518#endif 1519 1520#ifdef RT 1521 sll K, BASE_SHIFT + 1, TEMP1 1522 add AORIG, TEMP1, AORIG 1523#endif 1524 1525#if defined(LT) || defined(RN) 1526 sub K, KK, TEMP1 1527 sll TEMP1, BASE_SHIFT + 1, TEMP2 1528 sll TEMP1, BASE_SHIFT + 3, TEMP1 1529 add AO, TEMP2, AO 1530 add BO, TEMP1, BO 1531#endif 1532 1533#ifdef LT 1534 add KK, 2, KK 1535#endif 1536 1537#ifdef LN 1538 sub KK, 2, KK 1539#endif 1540 1541 add I, -1, I 1542 cmp I, 0 1543 bg,pt %icc, .LL12 1544 nop 1545 .align 4 1546 1547.LL20: 1548 and M, 1, I 1549 cmp I, 0 1550 ble,pn %icc, .LL29 1551 nop 1552 1553#if defined(LT) || defined(RN) 1554 mov B, BO 1555#else 1556#ifdef LN 1557 sll K, BASE_SHIFT + 0, TEMP1 1558 sub AORIG, TEMP1, AORIG 1559#endif 1560 1561 sll KK, BASE_SHIFT + 0, TEMP1 1562 sll KK, BASE_SHIFT + 3, TEMP2 1563 1564 add AORIG, TEMP1, AO 1565 add B, TEMP2, BO 1566#endif 1567 1568 LDF [AO + 0 * SIZE], a1 1569 LDF [AO + 1 * SIZE], a2 1570 LDF [AO + 2 * SIZE], a3 1571 LDF [AO + 3 * SIZE], a4 1572 1573 LDF [BO + 0 * SIZE], b1 1574 FCLR (cc01) 1575 LDF [BO + 1 * SIZE], b2 1576 FCLR (cc03) 1577 LDF [BO + 2 * SIZE], b3 1578 FCLR (cc05) 1579 LDF [BO + 3 * SIZE], b4 1580 FCLR (cc07) 1581 LDF [BO + 4 * SIZE], b5 1582 FCLR (cc09) 1583 
LDF [BO + 5 * SIZE], b6 1584 FCLR (cc11) 1585 LDF [BO + 6 * SIZE], b7 1586 FCLR (cc13) 1587 LDF [BO + 7 * SIZE], b8 1588 FCLR (cc15) 1589 1590#if defined(LT) || defined(RN) 1591 sra KK, 2, L 1592#else 1593 sub K, KK, L 1594 sra L, 2, L 1595#endif 1596 cmp L, 0 1597 ble,pn %icc, .LL25 1598 LDF [BO + 8 * SIZE], b9 1599 .align 4 1600 1601.LL23: 1602 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1603 add L, -1, L 1604 1605 FMADD (aa1, bb1, cc01, cc01) 1606 LDF [BO + 16 * SIZE], b1 1607 FMADD (aa1, bb2, cc03, cc03) 1608 LDF [BO + 9 * SIZE], b2 1609 1610 FMADD (aa1, bb3, cc05, cc05) 1611 LDF [BO + 10 * SIZE], b3 1612 FMADD (aa1, bb4, cc07, cc07) 1613 LDF [BO + 11 * SIZE], b4 1614 1615 FMADD (aa1, bb5, cc09, cc09) 1616 LDF [BO + 12 * SIZE], b5 1617 FMADD (aa1, bb6, cc11, cc11) 1618 LDF [BO + 13 * SIZE], b6 1619 1620 FMADD (aa1, bb7, cc13, cc13) 1621 LDF [BO + 14 * SIZE], b7 1622 FMADD (aa1, bb8, cc15, cc15) 1623 LDF [BO + 15 * SIZE], b8 1624 1625 FMADD (aa2, bb9, cc01, cc01) 1626 LDF [BO + 24 * SIZE], b9 1627 FMADD (aa2, bb2, cc03, cc03) 1628 LDF [BO + 17 * SIZE], b2 1629 1630 FMADD (aa2, bb3, cc05, cc05) 1631 LDF [BO + 18 * SIZE], b3 1632 FMADD (aa2, bb4, cc07, cc07) 1633 LDF [BO + 19 * SIZE], b4 1634 1635 FMADD (aa2, bb5, cc09, cc09) 1636 LDF [BO + 20 * SIZE], b5 1637 FMADD (aa2, bb6, cc11, cc11) 1638 LDF [BO + 21 * SIZE], b6 1639 1640 FMADD (aa2, bb7, cc13, cc13) 1641 LDF [BO + 22 * SIZE], b7 1642 FMADD (aa2, bb8, cc15, cc15) 1643 LDF [BO + 23 * SIZE], b8 1644 1645 LDF [AO + 4 * SIZE], a1 1646 LDF [AO + 5 * SIZE], a2 1647 1648 FMADD (aa3, bb1, cc01, cc01) 1649 LDF [BO + 32 * SIZE], b1 1650 FMADD (aa3, bb2, cc03, cc03) 1651 LDF [BO + 25 * SIZE], b2 1652 1653 FMADD (aa3, bb3, cc05, cc05) 1654 LDF [BO + 26 * SIZE], b3 1655 FMADD (aa3, bb4, cc07, cc07) 1656 LDF [BO + 27 * SIZE], b4 1657 1658 FMADD (aa3, bb5, cc09, cc09) 1659 LDF [BO + 28 * SIZE], b5 1660 FMADD (aa3, bb6, cc11, cc11) 1661 LDF [BO + 29 * SIZE], b6 1662 1663 FMADD (aa3, bb7, cc13, cc13) 1664 
LDF [BO + 30 * SIZE], b7 1665 FMADD (aa3, bb8, cc15, cc15) 1666 LDF [BO + 31 * SIZE], b8 1667 1668 FMADD (aa4, bb9, cc01, cc01) 1669 LDF [BO + 40 * SIZE], b9 1670 FMADD (aa4, bb2, cc03, cc03) 1671 LDF [BO + 33 * SIZE], b2 1672 1673 FMADD (aa4, bb3, cc05, cc05) 1674 LDF [BO + 34 * SIZE], b3 1675 FMADD (aa4, bb4, cc07, cc07) 1676 LDF [BO + 35 * SIZE], b4 1677 1678 FMADD (aa4, bb5, cc09, cc09) 1679 LDF [BO + 36 * SIZE], b5 1680 FMADD (aa4, bb6, cc11, cc11) 1681 LDF [BO + 37 * SIZE], b6 1682 1683 FMADD (aa4, bb7, cc13, cc13) 1684 LDF [BO + 38 * SIZE], b7 1685 FMADD (aa4, bb8, cc15, cc15) 1686 LDF [BO + 39 * SIZE], b8 1687 1688 LDF [AO + 6 * SIZE], a3 1689 LDF [AO + 7 * SIZE], a4 1690 1691 add AO, 4 * SIZE, AO 1692 cmp L, 0 1693 bg,pt %icc, .LL23 1694 add BO, 32 * SIZE, BO 1695 .align 4 1696 1697.LL25: 1698#if defined(LT) || defined(RN) 1699 and KK, 3, L 1700#else 1701 sub K, KK, L 1702 and L, 3, L 1703#endif 1704 cmp L, 0 1705 ble,a,pn %icc, .LL28 1706 nop 1707 .align 4 1708 1709.LL27: 1710 FMADD (aa1, bb1, cc01, cc01) 1711 LDF [BO + 8 * SIZE], b1 1712 FMADD (aa1, bb2, cc03, cc03) 1713 LDF [BO + 9 * SIZE], b2 1714 1715 FMADD (aa1, bb3, cc05, cc05) 1716 LDF [BO + 10 * SIZE], b3 1717 FMADD (aa1, bb4, cc07, cc07) 1718 LDF [BO + 11 * SIZE], b4 1719 1720 FMADD (aa1, bb5, cc09, cc09) 1721 LDF [BO + 12 * SIZE], b5 1722 FMADD (aa1, bb6, cc11, cc11) 1723 LDF [BO + 13 * SIZE], b6 1724 1725 FMADD (aa1, bb7, cc13, cc13) 1726 LDF [BO + 14 * SIZE], b7 1727 FMADD (aa1, bb8, cc15, cc15) 1728 LDF [BO + 15 * SIZE], b8 1729 1730 LDF [AO + 1 * SIZE], a1 1731 add AO, 1 * SIZE, AO 1732 1733 add L, -1, L 1734 cmp L, 0 1735 bg,pt %icc, .LL27 1736 add BO, 8 * SIZE, BO 1737 .align 4 1738 1739.LL28: 1740#if defined(LN) || defined(RT) 1741#ifdef LN 1742 sub KK, 1, TEMP1 1743#else 1744 sub KK, 8, TEMP1 1745#endif 1746 sll TEMP1, BASE_SHIFT + 0, TEMP2 1747 sll TEMP1, BASE_SHIFT + 3, TEMP1 1748 1749 add AORIG, TEMP2, AO 1750 add B, TEMP1, BO 1751#endif 1752 1753#if defined(LN) || defined(LT) 1754 
LDF [BO + 0 * SIZE], a1 1755 LDF [BO + 1 * SIZE], a2 1756 LDF [BO + 2 * SIZE], a3 1757 LDF [BO + 3 * SIZE], a4 1758 1759 LDF [BO + 4 * SIZE], b1 1760 LDF [BO + 5 * SIZE], b2 1761 LDF [BO + 6 * SIZE], b3 1762 LDF [BO + 7 * SIZE], b4 1763 1764 FSUB a1, c01, c01 1765 FSUB a2, c03, c03 1766 FSUB a3, c05, c05 1767 FSUB a4, c07, c07 1768 1769 FSUB b1, c09, c09 1770 FSUB b2, c11, c11 1771 FSUB b3, c13, c13 1772 FSUB b4, c15, c15 1773#else 1774 LDF [AO + 0 * SIZE], a1 1775 LDF [AO + 1 * SIZE], a2 1776 LDF [AO + 2 * SIZE], a3 1777 LDF [AO + 3 * SIZE], a4 1778 1779 LDF [AO + 4 * SIZE], b1 1780 LDF [AO + 5 * SIZE], b2 1781 LDF [AO + 6 * SIZE], b3 1782 LDF [AO + 7 * SIZE], b4 1783 1784 FSUB a1, c01, c01 1785 FSUB a2, c03, c03 1786 FSUB a3, c05, c05 1787 FSUB a4, c07, c07 1788 1789 FSUB b1, c09, c09 1790 FSUB b2, c11, c11 1791 FSUB b3, c13, c13 1792 FSUB b4, c15, c15 1793#endif 1794 1795#if defined(LN) || defined(LT) 1796 LDF [AO + 0 * SIZE], a1 1797 1798 FMUL a1, c01, c01 1799 FMUL a1, c03, c03 1800 FMUL a1, c05, c05 1801 FMUL a1, c07, c07 1802 FMUL a1, c09, c09 1803 FMUL a1, c11, c11 1804 FMUL a1, c13, c13 1805 FMUL a1, c15, c15 1806#endif 1807 1808#ifdef RN 1809 LDF [BO + 0 * SIZE], a1 1810 LDF [BO + 1 * SIZE], a2 1811 LDF [BO + 2 * SIZE], a3 1812 LDF [BO + 3 * SIZE], a4 1813 LDF [BO + 4 * SIZE], b1 1814 LDF [BO + 5 * SIZE], b2 1815 LDF [BO + 6 * SIZE], b3 1816 LDF [BO + 7 * SIZE], b4 1817 1818 FMUL a1, c01, c01 1819 1820 FNMSUB (aa2, cc01, cc03, cc03) 1821 FNMSUB (aa3, cc01, cc05, cc05) 1822 FNMSUB (aa4, cc01, cc07, cc07) 1823 FNMSUB (bb1, cc01, cc09, cc09) 1824 FNMSUB (bb2, cc01, cc11, cc11) 1825 FNMSUB (bb3, cc01, cc13, cc13) 1826 FNMSUB (bb4, cc01, cc15, cc15) 1827 1828 LDF [BO + 9 * SIZE], a1 1829 LDF [BO + 10 * SIZE], a2 1830 LDF [BO + 11 * SIZE], a3 1831 LDF [BO + 12 * SIZE], a4 1832 LDF [BO + 13 * SIZE], b1 1833 LDF [BO + 14 * SIZE], b2 1834 LDF [BO + 15 * SIZE], b3 1835 1836 FMUL a1, c03, c03 1837 1838 FNMSUB (aa2, cc03, cc05, cc05) 1839 FNMSUB (aa3, cc03, cc07, 
cc07) 1840 FNMSUB (aa4, cc03, cc09, cc09) 1841 FNMSUB (bb1, cc03, cc11, cc11) 1842 FNMSUB (bb2, cc03, cc13, cc13) 1843 FNMSUB (bb3, cc03, cc15, cc15) 1844 1845 LDF [BO + 18 * SIZE], a1 1846 LDF [BO + 19 * SIZE], a2 1847 LDF [BO + 20 * SIZE], a3 1848 LDF [BO + 21 * SIZE], a4 1849 LDF [BO + 22 * SIZE], b1 1850 LDF [BO + 23 * SIZE], b2 1851 1852 FMUL a1, c05, c05 1853 1854 FNMSUB (aa2, cc05, cc07, cc07) 1855 FNMSUB (aa3, cc05, cc09, cc09) 1856 FNMSUB (aa4, cc05, cc11, cc11) 1857 FNMSUB (bb1, cc05, cc13, cc13) 1858 FNMSUB (bb2, cc05, cc15, cc15) 1859 1860 LDF [BO + 27 * SIZE], a1 1861 LDF [BO + 28 * SIZE], a2 1862 LDF [BO + 29 * SIZE], a3 1863 LDF [BO + 30 * SIZE], a4 1864 LDF [BO + 31 * SIZE], b1 1865 1866 FMUL a1, c07, c07 1867 1868 FNMSUB (aa2, cc07, cc09, cc09) 1869 FNMSUB (aa3, cc07, cc11, cc11) 1870 FNMSUB (aa4, cc07, cc13, cc13) 1871 FNMSUB (bb1, cc07, cc15, cc15) 1872 1873 LDF [BO + 36 * SIZE], a1 1874 LDF [BO + 37 * SIZE], a2 1875 LDF [BO + 38 * SIZE], a3 1876 LDF [BO + 39 * SIZE], a4 1877 1878 FMUL a1, c09, c09 1879 1880 FNMSUB (aa2, cc09, cc11, cc11) 1881 FNMSUB (aa3, cc09, cc13, cc13) 1882 FNMSUB (aa4, cc09, cc15, cc15) 1883 1884 LDF [BO + 45 * SIZE], a1 1885 LDF [BO + 46 * SIZE], a2 1886 LDF [BO + 47 * SIZE], a3 1887 1888 FMUL a1, c11, c11 1889 1890 FNMSUB (aa2, cc11, cc13, cc13) 1891 FNMSUB (aa3, cc11, cc15, cc15) 1892 1893 LDF [BO + 54 * SIZE], a1 1894 LDF [BO + 55 * SIZE], a2 1895 1896 FMUL a1, c13, c13 1897 1898 FNMSUB (aa2, cc13, cc15, cc15) 1899 1900 LDF [BO + 63 * SIZE], a1 1901 1902 FMUL a1, c15, c15 1903#endif 1904 1905#ifdef RT 1906 LDF [BO + 63 * SIZE], a1 1907 LDF [BO + 62 * SIZE], a2 1908 LDF [BO + 61 * SIZE], a3 1909 LDF [BO + 60 * SIZE], a4 1910 LDF [BO + 59 * SIZE], b1 1911 LDF [BO + 58 * SIZE], b2 1912 LDF [BO + 57 * SIZE], b3 1913 LDF [BO + 56 * SIZE], b4 1914 1915 FMUL a1, c15, c15 1916 1917 FNMSUB (aa2, cc15, cc13, cc13) 1918 FNMSUB (aa3, cc15, cc11, cc11) 1919 FNMSUB (aa4, cc15, cc09, cc09) 1920 FNMSUB (bb1, cc15, cc07, cc07) 1921 
FNMSUB (bb2, cc15, cc05, cc05) 1922 FNMSUB (bb3, cc15, cc03, cc03) 1923 FNMSUB (bb4, cc15, cc01, cc01) 1924 1925 LDF [BO + 54 * SIZE], a1 1926 LDF [BO + 53 * SIZE], a2 1927 LDF [BO + 52 * SIZE], a3 1928 LDF [BO + 51 * SIZE], a4 1929 LDF [BO + 50 * SIZE], b1 1930 LDF [BO + 49 * SIZE], b2 1931 LDF [BO + 48 * SIZE], b3 1932 1933 FMUL a1, c13, c13 1934 1935 FNMSUB (aa2, cc13, cc11, cc11) 1936 FNMSUB (aa3, cc13, cc09, cc09) 1937 FNMSUB (aa4, cc13, cc07, cc07) 1938 FNMSUB (bb1, cc13, cc05, cc05) 1939 FNMSUB (bb2, cc13, cc03, cc03) 1940 FNMSUB (bb3, cc13, cc01, cc01) 1941 1942 LDF [BO + 45 * SIZE], a1 1943 LDF [BO + 44 * SIZE], a2 1944 LDF [BO + 43 * SIZE], a3 1945 LDF [BO + 42 * SIZE], a4 1946 LDF [BO + 41 * SIZE], b1 1947 LDF [BO + 40 * SIZE], b2 1948 1949 FMUL a1, c11, c11 1950 1951 FNMSUB (aa2, cc11, cc09, cc09) 1952 FNMSUB (aa3, cc11, cc07, cc07) 1953 FNMSUB (aa4, cc11, cc05, cc05) 1954 FNMSUB (bb1, cc11, cc03, cc03) 1955 FNMSUB (bb2, cc11, cc01, cc01) 1956 1957 LDF [BO + 36 * SIZE], a1 1958 LDF [BO + 35 * SIZE], a2 1959 LDF [BO + 34 * SIZE], a3 1960 LDF [BO + 33 * SIZE], a4 1961 LDF [BO + 32 * SIZE], b1 1962 1963 FMUL a1, c09, c09 1964 1965 FNMSUB (aa2, cc09, cc07, cc07) 1966 FNMSUB (aa3, cc09, cc05, cc05) 1967 FNMSUB (aa4, cc09, cc03, cc03) 1968 FNMSUB (bb1, cc09, cc01, cc01) 1969 1970 LDF [BO + 27 * SIZE], a1 1971 LDF [BO + 26 * SIZE], a2 1972 LDF [BO + 25 * SIZE], a3 1973 LDF [BO + 24 * SIZE], a4 1974 1975 FMUL a1, c07, c07 1976 1977 FNMSUB (aa2, cc07, cc05, cc05) 1978 FNMSUB (aa3, cc07, cc03, cc03) 1979 FNMSUB (aa4, cc07, cc01, cc01) 1980 1981 LDF [BO + 18 * SIZE], a1 1982 LDF [BO + 17 * SIZE], a2 1983 LDF [BO + 16 * SIZE], a3 1984 1985 FMUL a1, c05, c05 1986 1987 FNMSUB (aa2, cc05, cc03, cc03) 1988 FNMSUB (aa3, cc05, cc01, cc01) 1989 1990 LDF [BO + 9 * SIZE], a1 1991 LDF [BO + 8 * SIZE], a2 1992 1993 FMUL a1, c03, c03 1994 1995 FNMSUB (aa2, cc03, cc01, cc01) 1996 1997 LDF [BO + 0 * SIZE], a1 1998 1999 FMUL a1, c01, c01 2000#endif 2001 2002#ifdef LN 2003 add C1, 
-1 * SIZE, C1 2004 add C2, -1 * SIZE, C2 2005 add C3, -1 * SIZE, C3 2006 add C4, -1 * SIZE, C4 2007 add C5, -1 * SIZE, C5 2008 add C6, -1 * SIZE, C6 2009 add C7, -1 * SIZE, C7 2010 add C8, -1 * SIZE, C8 2011#endif 2012 2013#if defined(LN) || defined(LT) 2014 STF c01, [BO + 0 * SIZE] 2015 STF c03, [BO + 1 * SIZE] 2016 STF c05, [BO + 2 * SIZE] 2017 STF c07, [BO + 3 * SIZE] 2018 2019 STF c09, [BO + 4 * SIZE] 2020 STF c11, [BO + 5 * SIZE] 2021 STF c13, [BO + 6 * SIZE] 2022 STF c15, [BO + 7 * SIZE] 2023#else 2024 STF c01, [AO + 0 * SIZE] 2025 STF c03, [AO + 1 * SIZE] 2026 STF c05, [AO + 2 * SIZE] 2027 STF c07, [AO + 3 * SIZE] 2028 2029 STF c09, [AO + 4 * SIZE] 2030 STF c11, [AO + 5 * SIZE] 2031 STF c13, [AO + 6 * SIZE] 2032 STF c15, [AO + 7 * SIZE] 2033#endif 2034 2035 STF c01, [C1 + 0 * SIZE] 2036 STF c03, [C2 + 0 * SIZE] 2037 STF c05, [C3 + 0 * SIZE] 2038 STF c07, [C4 + 0 * SIZE] 2039 2040 STF c09, [C5 + 0 * SIZE] 2041 STF c11, [C6 + 0 * SIZE] 2042 STF c13, [C7 + 0 * SIZE] 2043 STF c15, [C8 + 0 * SIZE] 2044 2045#ifdef RT 2046 sll K, BASE_SHIFT + 0, TEMP1 2047 add AORIG, TEMP1, AORIG 2048#endif 2049 2050#if defined(LT) || defined(RN) 2051 sub K, KK, TEMP1 2052 sll TEMP1, BASE_SHIFT + 0, TEMP2 2053 sll TEMP1, BASE_SHIFT + 3, TEMP1 2054 add AO, TEMP2, AO 2055 add BO, TEMP1, BO 2056#endif 2057 2058#ifdef LT 2059 add KK, 1, KK 2060#endif 2061 2062#ifdef LN 2063 sub KK, 1, KK 2064#endif 2065 .align 4 2066 2067.LL29: 2068#ifdef LN 2069 sll K, BASE_SHIFT + 3, TEMP1 2070 add B, TEMP1, B 2071#endif 2072 2073#if defined(LT) || defined(RN) 2074 mov BO, B 2075#endif 2076 2077#ifdef RN 2078 add KK, 8, KK 2079#endif 2080 2081#ifdef RT 2082 sub KK, 8, KK 2083#endif 2084 2085 add J, -1, J 2086 cmp J, 0 2087 bg,pt %icc, .LL11 2088 nop 2089 .align 4 2090 2091.LL30: 2092 and N, 4, J 2093 cmp J, 0 2094 ble,pn %icc, .LL50 2095 nop 2096 2097#ifdef RT 2098 sll K, BASE_SHIFT + 2, TEMP1 2099 sub B, TEMP1, B 2100#endif 2101 2102#ifndef RT 2103 mov C, C1 2104 add C, LDC, C2 2105 add C2, LDC, C3 
2106 add C3, LDC, C4 2107 add C4, LDC, C 2108#else 2109 sub C, LDC, C4 2110 sub C4, LDC, C3 2111 sub C3, LDC, C2 2112 sub C2, LDC, C1 2113 sub C2, LDC, C 2114#endif 2115 2116#ifdef LN 2117 add M, OFFSET, KK 2118#endif 2119 2120#ifdef LT 2121 mov OFFSET, KK 2122#endif 2123 2124#if defined(LN) || defined(RT) 2125 mov A, AORIG 2126#else 2127 mov A, AO 2128#endif 2129 2130 sra M, 1, I 2131 cmp I, 0 2132 ble,pn %icc, .LL40 2133 nop 2134 .align 4 2135 2136.LL32: 2137#if defined(LT) || defined(RN) 2138 mov B, BO 2139#else 2140#ifdef LN 2141 sll K, BASE_SHIFT + 1, TEMP1 2142 sub AORIG, TEMP1, AORIG 2143#endif 2144 2145 sll KK, BASE_SHIFT + 1, TEMP1 2146 sll KK, BASE_SHIFT + 2, TEMP2 2147 2148 add AORIG, TEMP1, AO 2149 add B, TEMP2, BO 2150#endif 2151 2152 LDF [AO + 0 * SIZE], a1 2153 LDF [AO + 1 * SIZE], a2 2154 2155 LDF [BO + 0 * SIZE], b1 2156 LDF [BO + 1 * SIZE], b2 2157 LDF [BO + 2 * SIZE], b3 2158 LDF [BO + 3 * SIZE], b4 2159 LDF [BO + 4 * SIZE], b5 2160 2161 LDF [BO + 5 * SIZE], b6 2162 FCLR (cc01) 2163 LDF [BO + 6 * SIZE], b7 2164 FCLR (cc02) 2165 LDF [BO + 7 * SIZE], b8 2166 FCLR (cc03) 2167 LDF [BO + 8 * SIZE], b9 2168 FCLR (cc04) 2169 2170 prefetch [C1 + 2 * SIZE], 3 2171 FCLR (cc05) 2172 prefetch [C2 + 2 * SIZE], 3 2173 FCLR (cc06) 2174 prefetch [C3 + 2 * SIZE], 3 2175 FCLR (cc07) 2176 prefetch [C4 + 2 * SIZE], 3 2177 FCLR (cc08) 2178 2179#if defined(LT) || defined(RN) 2180 sra KK, 2, L 2181#else 2182 sub K, KK, L 2183 sra L, 2, L 2184#endif 2185 cmp L, 0 2186 ble,pn %icc, .LL35 2187 nop 2188 .align 4 2189 2190.LL33: 2191 FMADD (aa1, bb1, cc01, cc01) 2192 LDF [AO + 2 * SIZE], a3 2193 FMADD (aa2, bb1, cc02, cc02) 2194 LDF [AO + 3 * SIZE], a4 2195 2196 FMADD (aa1, bb2, cc03, cc03) 2197 LDF [BO + 16 * SIZE], b1 2198 FMADD (aa2, bb2, cc04, cc04) 2199 LDF [BO + 9 * SIZE], b2 2200 2201 FMADD (aa1, bb3, cc05, cc05) 2202 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2203 FMADD (aa2, bb3, cc06, cc06) 2204 add L, -1, L 2205 2206 FMADD (aa1, bb4, cc07, 
cc07) 2207 LDF [BO + 10 * SIZE], b3 2208 FMADD (aa2, bb4, cc08, cc08) 2209 LDF [BO + 11 * SIZE], b4 2210 2211 FMADD (aa3, bb5, cc01, cc01) 2212 LDF [AO + 4 * SIZE], a1 2213 FMADD (aa4, bb5, cc02, cc02) 2214 LDF [AO + 5 * SIZE], a2 2215 2216 FMADD (aa3, bb6, cc03, cc03) 2217 LDF [BO + 12 * SIZE], b5 2218 FMADD (aa4, bb6, cc04, cc04) 2219 LDF [BO + 13 * SIZE], b6 2220 2221 FMADD (aa3, bb7, cc05, cc05) 2222 cmp L, 0 2223 FMADD (aa4, bb7, cc06, cc06) 2224 add AO, 8 * SIZE, AO 2225 2226 FMADD (aa3, bb8, cc07, cc07) 2227 LDF [BO + 14 * SIZE], b7 2228 FMADD (aa4, bb8, cc08, cc08) 2229 LDF [BO + 15 * SIZE], b8 2230 2231 FMADD (aa1, bb9, cc01, cc01) 2232 LDF [AO - 2 * SIZE], a3 2233 FMADD (aa2, bb9, cc02, cc02) 2234 LDF [AO - 1 * SIZE], a4 2235 2236 FMADD (aa1, bb2, cc03, cc03) 2237 LDF [BO + 24 * SIZE], b9 2238 FMADD (aa2, bb2, cc04, cc04) 2239 LDF [BO + 17 * SIZE], b2 2240 2241 FMADD (aa1, bb3, cc05, cc05) 2242 add BO, 16 * SIZE, BO 2243 FMADD (aa2, bb3, cc06, cc06) 2244 nop 2245 2246 FMADD (aa1, bb4, cc07, cc07) 2247 LDF [BO + 2 * SIZE], b3 2248 FMADD (aa2, bb4, cc08, cc08) 2249 LDF [BO + 3 * SIZE], b4 2250 2251 FMADD (aa3, bb5, cc01, cc01) 2252 LDF [AO + 0 * SIZE], a1 2253 FMADD (aa4, bb5, cc02, cc02) 2254 LDF [AO + 1 * SIZE], a2 2255 FMADD (aa3, bb6, cc03, cc03) 2256 LDF [BO + 4 * SIZE], b5 2257 FMADD (aa4, bb6, cc04, cc04) 2258 LDF [BO + 5 * SIZE], b6 2259 2260 FMADD (aa3, bb7, cc05, cc05) 2261 nop 2262 FMADD (aa4, bb7, cc06, cc06) 2263 LDF [BO + 6 * SIZE], b7 2264 2265 FMADD (aa3, bb8, cc07, cc07) 2266 FMADD (aa4, bb8, cc08, cc08) 2267 bg,pt %icc, .LL33 2268 LDF [BO + 7 * SIZE], b8 2269 .align 4 2270 2271.LL35: 2272#if defined(LT) || defined(RN) 2273 and KK, 3, L 2274#else 2275 sub K, KK, L 2276 and L, 3, L 2277#endif 2278 cmp L, 0 2279 ble,a,pn %icc, .LL38 2280 nop 2281 .align 4 2282 2283.LL37: 2284 FMADD (aa1, bb1, cc01, cc01) 2285 add L, -1, L 2286 FMADD (aa2, bb1, cc02, cc02) 2287 LDF [BO + 4 * SIZE], b1 2288 2289 FMADD (aa1, bb2, cc03, cc03) 2290 add AO, 2 * 
SIZE, AO 2291 FMADD (aa2, bb2, cc04, cc04) 2292 LDF [BO + 5 * SIZE], b2 2293 2294 FMADD (aa1, bb3, cc05, cc05) 2295 cmp L, 0 2296 FMADD (aa2, bb3, cc06, cc06) 2297 LDF [BO + 6 * SIZE], b3 2298 2299 FMADD (aa1, bb4, cc07, cc07) 2300 LDF [AO + 0 * SIZE], a1 2301 FMADD (aa2, bb4, cc08, cc08) 2302 LDF [AO + 1 * SIZE], a2 2303 2304 LDF [BO + 7 * SIZE], b4 2305 bg,pt %icc, .LL37 2306 add BO, 4 * SIZE, BO 2307 .align 4 2308 2309.LL38: 2310#if defined(LN) || defined(RT) 2311#ifdef LN 2312 sub KK, 2, TEMP1 2313#else 2314 sub KK, 4, TEMP1 2315#endif 2316 sll TEMP1, BASE_SHIFT + 1, TEMP2 2317 sll TEMP1, BASE_SHIFT + 2, TEMP1 2318 2319 add AORIG, TEMP2, AO 2320 add B, TEMP1, BO 2321#endif 2322 2323#if defined(LN) || defined(LT) 2324 LDF [BO + 0 * SIZE], a1 2325 LDF [BO + 1 * SIZE], a2 2326 LDF [BO + 2 * SIZE], a3 2327 LDF [BO + 3 * SIZE], a4 2328 2329 LDF [BO + 4 * SIZE], b1 2330 LDF [BO + 5 * SIZE], b2 2331 LDF [BO + 6 * SIZE], b3 2332 LDF [BO + 7 * SIZE], b4 2333 2334 FSUB a1, c01, c01 2335 FSUB a2, c03, c03 2336 FSUB a3, c05, c05 2337 FSUB a4, c07, c07 2338 2339 FSUB b1, c02, c02 2340 FSUB b2, c04, c04 2341 FSUB b3, c06, c06 2342 FSUB b4, c08, c08 2343#else 2344 LDF [AO + 0 * SIZE], a1 2345 LDF [AO + 1 * SIZE], a2 2346 LDF [AO + 2 * SIZE], a3 2347 LDF [AO + 3 * SIZE], a4 2348 2349 LDF [AO + 4 * SIZE], b1 2350 LDF [AO + 5 * SIZE], b2 2351 LDF [AO + 6 * SIZE], b3 2352 LDF [AO + 7 * SIZE], b4 2353 2354 FSUB a1, c01, c01 2355 FSUB a2, c02, c02 2356 FSUB a3, c03, c03 2357 FSUB a4, c04, c04 2358 2359 FSUB b1, c05, c05 2360 FSUB b2, c06, c06 2361 FSUB b3, c07, c07 2362 FSUB b4, c08, c08 2363 2364#endif 2365 2366#ifdef LN 2367 LDF [AO + 3 * SIZE], a1 2368 LDF [AO + 2 * SIZE], a2 2369 LDF [AO + 0 * SIZE], a3 2370 2371 FMUL a1, c02, c02 2372 FMUL a1, c04, c04 2373 FMUL a1, c06, c06 2374 FMUL a1, c08, c08 2375 2376 FNMSUB (aa2, cc02, cc01, cc01) 2377 FNMSUB (aa2, cc04, cc03, cc03) 2378 FNMSUB (aa2, cc06, cc05, cc05) 2379 FNMSUB (aa2, cc08, cc07, cc07) 2380 2381 FMUL a3, c01, c01 2382 
FMUL a3, c03, c03 2383 FMUL a3, c05, c05 2384 FMUL a3, c07, c07 2385#endif 2386 2387#ifdef LT 2388 LDF [AO + 0 * SIZE], a1 2389 LDF [AO + 1 * SIZE], a2 2390 LDF [AO + 3 * SIZE], a3 2391 2392 FMUL a1, c01, c01 2393 FMUL a1, c03, c03 2394 FMUL a1, c05, c05 2395 FMUL a1, c07, c07 2396 2397 FNMSUB (aa2, cc01, cc02, cc02) 2398 FNMSUB (aa2, cc03, cc04, cc04) 2399 FNMSUB (aa2, cc05, cc06, cc06) 2400 FNMSUB (aa2, cc07, cc08, cc08) 2401 2402 FMUL a3, c02, c02 2403 FMUL a3, c04, c04 2404 FMUL a3, c06, c06 2405 FMUL a3, c08, c08 2406#endif 2407 2408#ifdef RN 2409 LDF [BO + 0 * SIZE], a1 2410 LDF [BO + 1 * SIZE], a2 2411 LDF [BO + 2 * SIZE], a3 2412 LDF [BO + 3 * SIZE], a4 2413 2414 FMUL a1, c01, c01 2415 FMUL a1, c02, c02 2416 2417 FNMSUB (aa2, cc01, cc03, cc03) 2418 FNMSUB (aa2, cc02, cc04, cc04) 2419 FNMSUB (aa3, cc01, cc05, cc05) 2420 FNMSUB (aa3, cc02, cc06, cc06) 2421 FNMSUB (aa4, cc01, cc07, cc07) 2422 FNMSUB (aa4, cc02, cc08, cc08) 2423 2424 LDF [BO + 5 * SIZE], a1 2425 LDF [BO + 6 * SIZE], a2 2426 LDF [BO + 7 * SIZE], a3 2427 2428 FMUL a1, c03, c03 2429 FMUL a1, c04, c04 2430 2431 FNMSUB (aa2, cc03, cc05, cc05) 2432 FNMSUB (aa2, cc04, cc06, cc06) 2433 FNMSUB (aa3, cc03, cc07, cc07) 2434 FNMSUB (aa3, cc04, cc08, cc08) 2435 2436 LDF [BO + 10 * SIZE], a1 2437 LDF [BO + 11 * SIZE], a2 2438 2439 FMUL a1, c05, c05 2440 FMUL a1, c06, c06 2441 2442 FNMSUB (aa2, cc05, cc07, cc07) 2443 FNMSUB (aa2, cc06, cc08, cc08) 2444 2445 LDF [BO + 15 * SIZE], a1 2446 2447 FMUL a1, c07, c07 2448 FMUL a1, c08, c08 2449#endif 2450 2451#ifdef RT 2452 LDF [BO + 15 * SIZE], a1 2453 LDF [BO + 14 * SIZE], a2 2454 LDF [BO + 13 * SIZE], a3 2455 LDF [BO + 12 * SIZE], a4 2456 2457 FMUL a1, c08, c08 2458 FMUL a1, c07, c07 2459 2460 FNMSUB (aa2, cc08, cc06, cc06) 2461 FNMSUB (aa2, cc07, cc05, cc05) 2462 FNMSUB (aa3, cc08, cc04, cc04) 2463 FNMSUB (aa3, cc07, cc03, cc03) 2464 FNMSUB (aa4, cc08, cc02, cc02) 2465 FNMSUB (aa4, cc07, cc01, cc01) 2466 2467 LDF [BO + 10 * SIZE], a1 2468 LDF [BO + 9 * SIZE], a2 
2469 LDF [BO + 8 * SIZE], a3 2470 2471 FMUL a1, c06, c06 2472 FMUL a1, c05, c05 2473 2474 FNMSUB (aa2, cc06, cc04, cc04) 2475 FNMSUB (aa2, cc05, cc03, cc03) 2476 FNMSUB (aa3, cc06, cc02, cc02) 2477 FNMSUB (aa3, cc05, cc01, cc01) 2478 2479 LDF [BO + 5 * SIZE], a1 2480 LDF [BO + 4 * SIZE], a2 2481 2482 FMUL a1, c04, c04 2483 FMUL a1, c03, c03 2484 2485 FNMSUB (aa2, cc04, cc02, cc02) 2486 FNMSUB (aa2, cc03, cc01, cc01) 2487 2488 LDF [BO + 0 * SIZE], a1 2489 2490 FMUL a1, c02, c02 2491 FMUL a1, c01, c01 2492#endif 2493 2494#ifdef LN 2495 add C1, -2 * SIZE, C1 2496 add C2, -2 * SIZE, C2 2497 add C3, -2 * SIZE, C3 2498 add C4, -2 * SIZE, C4 2499#endif 2500 2501#if defined(LN) || defined(LT) 2502 STF c01, [BO + 0 * SIZE] 2503 STF c03, [BO + 1 * SIZE] 2504 STF c05, [BO + 2 * SIZE] 2505 STF c07, [BO + 3 * SIZE] 2506 2507 STF c02, [BO + 4 * SIZE] 2508 STF c04, [BO + 5 * SIZE] 2509 STF c06, [BO + 6 * SIZE] 2510 STF c08, [BO + 7 * SIZE] 2511#else 2512 STF c01, [AO + 0 * SIZE] 2513 STF c02, [AO + 1 * SIZE] 2514 STF c03, [AO + 2 * SIZE] 2515 STF c04, [AO + 3 * SIZE] 2516 2517 STF c05, [AO + 4 * SIZE] 2518 STF c06, [AO + 5 * SIZE] 2519 STF c07, [AO + 6 * SIZE] 2520 STF c08, [AO + 7 * SIZE] 2521#endif 2522 2523 STF c01, [C1 + 0 * SIZE] 2524 STF c02, [C1 + 1 * SIZE] 2525 STF c03, [C2 + 0 * SIZE] 2526 STF c04, [C2 + 1 * SIZE] 2527 2528 STF c05, [C3 + 0 * SIZE] 2529 STF c06, [C3 + 1 * SIZE] 2530 STF c07, [C4 + 0 * SIZE] 2531 STF c08, [C4 + 1 * SIZE] 2532 2533#ifndef LN 2534 add C1, 2 * SIZE, C1 2535 add C2, 2 * SIZE, C2 2536 add C3, 2 * SIZE, C3 2537 add C4, 2 * SIZE, C4 2538#endif 2539 2540#ifdef RT 2541 sll K, BASE_SHIFT + 1, TEMP1 2542 add AORIG, TEMP1, AORIG 2543#endif 2544 2545#if defined(LT) || defined(RN) 2546 sub K, KK, TEMP1 2547 sll TEMP1, BASE_SHIFT + 1, TEMP2 2548 sll TEMP1, BASE_SHIFT + 2, TEMP1 2549 add AO, TEMP2, AO 2550 add BO, TEMP1, BO 2551#endif 2552 2553#ifdef LT 2554 add KK, 2, KK 2555#endif 2556 2557#ifdef LN 2558 sub KK, 2, KK 2559#endif 2560 2561 add I, -1, I 
2562 cmp I, 0 2563 bg,pt %icc, .LL32 2564 nop 2565 2566.LL40: 2567 and M, 1, I 2568 cmp I, 0 2569 ble,pn %icc, .LL49 2570 nop 2571 2572#if defined(LT) || defined(RN) 2573 mov B, BO 2574#else 2575#ifdef LN 2576 sll K, BASE_SHIFT + 0, TEMP1 2577 sub AORIG, TEMP1, AORIG 2578#endif 2579 2580 sll KK, BASE_SHIFT + 0, TEMP1 2581 sll KK, BASE_SHIFT + 2, TEMP2 2582 2583 add AORIG, TEMP1, AO 2584 add B, TEMP2, BO 2585#endif 2586 2587 LDF [AO + 0 * SIZE], a1 2588 LDF [AO + 1 * SIZE], a2 2589 LDF [AO + 2 * SIZE], a3 2590 LDF [AO + 3 * SIZE], a4 2591 2592 LDF [BO + 0 * SIZE], b1 2593 LDF [BO + 1 * SIZE], b2 2594 LDF [BO + 2 * SIZE], b3 2595 LDF [BO + 3 * SIZE], b4 2596 LDF [BO + 4 * SIZE], b5 2597 LDF [BO + 5 * SIZE], b6 2598 FCLR (cc01) 2599 LDF [BO + 6 * SIZE], b7 2600 FCLR (cc03) 2601 LDF [BO + 7 * SIZE], b8 2602 FCLR (cc05) 2603 LDF [BO + 8 * SIZE], b9 2604 FCLR (cc07) 2605 2606#if defined(LT) || defined(RN) 2607 sra KK, 2, L 2608#else 2609 sub K, KK, L 2610 sra L, 2, L 2611#endif 2612 cmp L, 0 2613 ble,pn %icc, .LL45 2614 nop 2615 2616.LL43: 2617 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 2618 add L, -1, L 2619 2620 FMADD (aa1, bb1, cc01, cc01) 2621 LDF [BO + 16 * SIZE], b1 2622 FMADD (aa1, bb2, cc03, cc03) 2623 LDF [BO + 9 * SIZE], b2 2624 FMADD (aa1, bb3, cc05, cc05) 2625 LDF [BO + 10 * SIZE], b3 2626 FMADD (aa1, bb4, cc07, cc07) 2627 LDF [BO + 11 * SIZE], b4 2628 2629 LDF [AO + 4 * SIZE], a1 2630 cmp L, 0 2631 2632 FMADD (aa2, bb5, cc01, cc01) 2633 LDF [BO + 12 * SIZE], b5 2634 FMADD (aa2, bb6, cc03, cc03) 2635 LDF [BO + 13 * SIZE], b6 2636 FMADD (aa2, bb7, cc05, cc05) 2637 LDF [BO + 14 * SIZE], b7 2638 FMADD (aa2, bb8, cc07, cc07) 2639 LDF [BO + 15 * SIZE], b8 2640 2641 LDF [AO + 5 * SIZE], a2 2642 add AO, 4 * SIZE, AO 2643 2644 FMADD (aa3, bb9, cc01, cc01) 2645 LDF [BO + 24 * SIZE], b9 2646 FMADD (aa3, bb2, cc03, cc03) 2647 LDF [BO + 17 * SIZE], b2 2648 FMADD (aa3, bb3, cc05, cc05) 2649 LDF [BO + 18 * SIZE], b3 2650 FMADD (aa3, bb4, cc07, cc07) 
2651 LDF [BO + 19 * SIZE], b4 2652 2653 LDF [AO + 2 * SIZE], a3 2654 add BO, 16 * SIZE, BO 2655 2656 FMADD (aa4, bb5, cc01, cc01) 2657 LDF [BO + 4 * SIZE], b5 2658 FMADD (aa4, bb6, cc03, cc03) 2659 LDF [BO + 5 * SIZE], b6 2660 FMADD (aa4, bb7, cc05, cc05) 2661 LDF [BO + 6 * SIZE], b7 2662 FMADD (aa4, bb8, cc07, cc07) 2663 LDF [BO + 7 * SIZE], b8 2664 2665 bg,pt %icc, .LL43 2666 LDF [AO + 3 * SIZE], a4 2667 .align 4 2668 2669.LL45: 2670#if defined(LT) || defined(RN) 2671 and KK, 3, L 2672#else 2673 sub K, KK, L 2674 and L, 3, L 2675#endif 2676 cmp L, 0 2677 ble,a,pn %icc, .LL48 2678 nop 2679 .align 4 2680 2681.LL47: 2682 FMADD (aa1, bb1, cc01, cc01) 2683 LDF [BO + 4 * SIZE], b1 2684 add L, -1, L 2685 FMADD (aa1, bb2, cc03, cc03) 2686 LDF [BO + 5 * SIZE], b2 2687 add AO, 1 * SIZE, AO 2688 2689 FMADD (aa1, bb3, cc05, cc05) 2690 LDF [BO + 6 * SIZE], b3 2691 cmp L, 0 2692 FMADD (aa1, bb4, cc07, cc07) 2693 LDF [BO + 7 * SIZE], b4 2694 add BO, 4 * SIZE, BO 2695 2696 bg,pt %icc, .LL47 2697 LDF [AO + 0 * SIZE], a1 2698 .align 4 2699 2700.LL48: 2701#if defined(LN) || defined(RT) 2702#ifdef LN 2703 sub KK, 1, TEMP1 2704#else 2705 sub KK, 4, TEMP1 2706#endif 2707 sll TEMP1, BASE_SHIFT + 0, TEMP2 2708 sll TEMP1, BASE_SHIFT + 2, TEMP1 2709 2710 add AORIG, TEMP2, AO 2711 add B, TEMP1, BO 2712#endif 2713 2714#if defined(LN) || defined(LT) 2715 LDF [BO + 0 * SIZE], a1 2716 LDF [BO + 1 * SIZE], a2 2717 LDF [BO + 2 * SIZE], a3 2718 LDF [BO + 3 * SIZE], a4 2719 2720 FSUB a1, c01, c01 2721 FSUB a2, c03, c03 2722 FSUB a3, c05, c05 2723 FSUB a4, c07, c07 2724#else 2725 LDF [AO + 0 * SIZE], a1 2726 LDF [AO + 1 * SIZE], a2 2727 LDF [AO + 2 * SIZE], a3 2728 LDF [AO + 3 * SIZE], a4 2729 2730 FSUB a1, c01, c01 2731 FSUB a2, c03, c03 2732 FSUB a3, c05, c05 2733 FSUB a4, c07, c07 2734#endif 2735 2736#if defined(LN) || defined(LT) 2737 LDF [AO + 0 * SIZE], a1 2738 2739 FMUL a1, c01, c01 2740 FMUL a1, c03, c03 2741 FMUL a1, c05, c05 2742 FMUL a1, c07, c07 2743#endif 2744 2745#ifdef RN 2746 LDF 
[BO + 0 * SIZE], a1 2747 LDF [BO + 1 * SIZE], a2 2748 LDF [BO + 2 * SIZE], a3 2749 LDF [BO + 3 * SIZE], a4 2750 2751 FMUL a1, c01, c01 2752 2753 FNMSUB (aa2, cc01, cc03, cc03) 2754 FNMSUB (aa3, cc01, cc05, cc05) 2755 FNMSUB (aa4, cc01, cc07, cc07) 2756 2757 LDF [BO + 5 * SIZE], a1 2758 LDF [BO + 6 * SIZE], a2 2759 LDF [BO + 7 * SIZE], a3 2760 2761 FMUL a1, c03, c03 2762 2763 FNMSUB (aa2, cc03, cc05, cc05) 2764 FNMSUB (aa3, cc03, cc07, cc07) 2765 2766 LDF [BO + 10 * SIZE], a1 2767 LDF [BO + 11 * SIZE], a2 2768 2769 FMUL a1, c05, c05 2770 2771 FNMSUB (aa2, cc05, cc07, cc07) 2772 2773 LDF [BO + 15 * SIZE], a1 2774 2775 FMUL a1, c07, c07 2776#endif 2777 2778#ifdef RT 2779 LDF [BO + 15 * SIZE], a1 2780 LDF [BO + 14 * SIZE], a2 2781 LDF [BO + 13 * SIZE], a3 2782 LDF [BO + 12 * SIZE], a4 2783 2784 FMUL a1, c07, c07 2785 2786 FNMSUB (aa2, cc07, cc05, cc05) 2787 FNMSUB (aa3, cc07, cc03, cc03) 2788 FNMSUB (aa4, cc07, cc01, cc01) 2789 2790 LDF [BO + 10 * SIZE], a1 2791 LDF [BO + 9 * SIZE], a2 2792 LDF [BO + 8 * SIZE], a3 2793 2794 FMUL a1, c05, c05 2795 2796 FNMSUB (aa2, cc05, cc03, cc03) 2797 FNMSUB (aa3, cc05, cc01, cc01) 2798 2799 LDF [BO + 5 * SIZE], a1 2800 LDF [BO + 4 * SIZE], a2 2801 2802 FMUL a1, c03, c03 2803 2804 FNMSUB (aa2, cc03, cc01, cc01) 2805 2806 LDF [BO + 0 * SIZE], a1 2807 2808 FMUL a1, c01, c01 2809#endif 2810 2811#ifdef LN 2812 add C1, -1 * SIZE, C1 2813 add C2, -1 * SIZE, C2 2814 add C3, -1 * SIZE, C3 2815 add C4, -1 * SIZE, C4 2816#endif 2817 2818#if defined(LN) || defined(LT) 2819 STF c01, [BO + 0 * SIZE] 2820 STF c03, [BO + 1 * SIZE] 2821 STF c05, [BO + 2 * SIZE] 2822 STF c07, [BO + 3 * SIZE] 2823#else 2824 STF c01, [AO + 0 * SIZE] 2825 STF c03, [AO + 1 * SIZE] 2826 STF c05, [AO + 2 * SIZE] 2827 STF c07, [AO + 3 * SIZE] 2828#endif 2829 2830 STF c01, [C1 + 0 * SIZE] 2831 STF c03, [C2 + 0 * SIZE] 2832 STF c05, [C3 + 0 * SIZE] 2833 STF c07, [C4 + 0 * SIZE] 2834 2835#ifdef RT 2836 sll K, BASE_SHIFT + 0, TEMP1 2837 add AORIG, TEMP1, AORIG 2838#endif 2839 
/*
 * Two-column panel (.LL50 - .LL69) and one-column panel (.LL70 - .LL88).
 *
 * Reviewer notes:
 *   - Which of LN / LT / RN / RT is defined is decided by the build; every
 *     conditional below is kept independent of the others.
 *   - LDF / STF / FMUL / FSUB / FMADD / FNMSUB / FCLR are macros defined
 *     outside this section.  Comments assume FMADD accumulates a product,
 *     FNMSUB subtracts a product and FCLR zeroes a register - confirm
 *     against their definitions.
 *   - Shifts by BASE_SHIFT are assumed to turn element counts into byte
 *     offsets (BASE_SHIFT == log2(SIZE)) - confirm in common.h.
 *   - The instruction that follows each branch occupies its delay slot.
 */

	/* Close the preceding tile: advance AO / BO by (K - KK) k-steps,  */
	/* 1 element of A and 4 elements of B per step.                    */
#if defined(LT) || defined(RN)
	sub	K, KK, TEMP1
	sll	TEMP1, BASE_SHIFT, TEMP2
	sll	TEMP1, BASE_SHIFT + 2, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 1, KK
#endif

#ifdef LN
	sub	KK, 1, KK
#endif
	.align 4

.LL49:
	/* End of the four-column panel: step B and KK. */
#ifdef LN
	sll	K, BASE_SHIFT + 2, TEMP1
	add	B, TEMP1, B
#endif

#if defined(LT) || defined(RN)
	mov	BO, B
#endif

#ifdef RN
	add	KK, 4, KK
#endif

#ifdef RT
	sub	KK, 4, KK
#endif
	.align 4

/* ------------------------------------------------------------------ */
/* Two-column panel: entered only when bit 1 of N is set.             */
/* ------------------------------------------------------------------ */
.LL50:
	and	N, 2, J
	cmp	J, 0
	ble,pn	%icc, .LL70
	nop

#ifdef RT
	sll	K, BASE_SHIFT + 1, TEMP1	/* RT walks B backwards: 2 * K elements */
	sub	B, TEMP1, B
#endif

	/* C1 / C2 = the two output columns; C is moved past them. */
#ifndef RT
	mov	C,  C1
	add	C,  LDC, C2
	add	C2, LDC, C
#else
	sub	C,  LDC, C2
	sub	C2, LDC, C1
	sub	C2, LDC, C
#endif

#ifdef LN
	add	M, OFFSET, KK
#endif

#ifdef LT
	mov	OFFSET, KK
#endif

#if defined(LN) || defined(RT)
	mov	A, AORIG
#else
	mov	A, AO
#endif

	sra	M, 1, I				/* I = number of two-row tiles */
	cmp	I, 0
	ble,pn	%icc, .LL60
	nop
	.align 4

/* ---- 2 x 2 tile -------------------------------------------------- */
.LL52:
#if defined(LT) || defined(RN)
	mov	B, BO
#else
#ifdef LN
	sll	K, BASE_SHIFT + 1, TEMP1
	sub	AORIG, TEMP1, AORIG
#endif

	/* AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements */
	sll	KK, BASE_SHIFT + 1, TEMP1
	sll	KK, BASE_SHIFT + 1, TEMP2

	add	AORIG, TEMP1, AO
	add	B,     TEMP2, BO
#endif

	/* Preload operands for the first unrolled pass; clear accumulators. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	FCLR	(cc01)
	LDF	[BO +  3 * SIZE], b4
	FCLR	(cc02)

	LDF	[BO +  4 * SIZE], b5
	FCLR	(cc03)
	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc04)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc05)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc06)

	prefetch [C1 + 2 * SIZE], 3
	FCLR	(cc07)
	prefetch [C2 + 2 * SIZE], 3
	FCLR	(cc08)

	/* L = (number of k-steps) / 4 */
#if defined(LT) || defined(RN)
	sra	KK, 2, L
#else
	sub	K, KK, L
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL55
	nop
	.align 4

	/* Main loop, 4 k-steps per pass: 8 elements of A and 8 of B. */
.LL53:
	FMADD	(aa1, bb1, cc01, cc01)
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[BO +  8 * SIZE], b1

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[AO +  4 * SIZE], a1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[AO +  5 * SIZE], a2

	FMADD	(aa3, bb3, cc01, cc01)
	LDF	[BO +  9 * SIZE], b2
	FMADD	(aa4, bb3, cc02, cc02)
	LDF	[BO + 10 * SIZE], b3

	FMADD	(aa3, bb4, cc03, cc03)
	LDF	[AO +  6 * SIZE], a3
	FMADD	(aa4, bb4, cc04, cc04)
	LDF	[AO +  7 * SIZE], a4

	FMADD	(aa1, bb5, cc01, cc01)
	LDF	[BO + 11 * SIZE], b4
	FMADD	(aa2, bb5, cc02, cc02)
	LDF	[BO + 12 * SIZE], b5

	FMADD	(aa1, bb6, cc03, cc03)
	LDF	[AO +  8 * SIZE], a1
	FMADD	(aa2, bb6, cc04, cc04)
	LDF	[AO +  9 * SIZE], a2

	FMADD	(aa3, bb7, cc01, cc01)
	LDF	[BO + 13 * SIZE], b6

	FMADD	(aa4, bb7, cc02, cc02)
	LDF	[BO + 14 * SIZE], b7

	FMADD	(aa3, bb8, cc03, cc03)
	LDF	[AO + 10 * SIZE], a3
	FMADD	(aa4, bb8, cc04, cc04)
	LDF	[AO + 11 * SIZE], a4

	add	AO,  8 * SIZE, AO
	add	L,  -1, L
	add	BO,  8 * SIZE, BO
	cmp	L, 0

	bg,pt	%icc, .LL53
	LDF	[BO +  7 * SIZE], b8		/* delay slot: BO already advanced */
	.align 4

	/* L = (number of k-steps) mod 4 */
.LL55:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	sub	K, KK, L
	and	L,  3, L
#endif
	cmp	L,  0
	ble,a,pn %icc, .LL58
	nop
	.align 4

	/* Remainder loop, one k-step per pass. */
.LL57:
	FMADD	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[BO +  2 * SIZE], b1

	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[AO +  2 * SIZE], a1
	FMADD	(aa2, bb2, cc04, cc04)
	LDF	[AO +  3 * SIZE], a2

	add	AO, 2 * SIZE, AO
	cmp	L, 0
	add	BO, 2 * SIZE, BO
	bg,pt	%icc, .LL57
	LDF	[BO +  1 * SIZE], b2
	.align 4

	/* Solve step for the 2 x 2 tile. */
.LL58:
#if defined(LN) || defined(RT)
	/* Both LN and RT step back two k-positions for this tile. */
	sub	KK, 2, TEMP1
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 1, TEMP1

	add	AORIG, TEMP2, AO
	add	B,     TEMP1, BO
#endif

	/* c = (stored right-hand side) - (accumulated product).           */
	/* The LN/LT layout in BO pairs c01,c03 then c02,c04.              */
#if defined(LN) || defined(LT)
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
	LDF	[BO +  2 * SIZE], a3
	LDF	[BO +  3 * SIZE], a4

	FSUB	a1, c01, c01
	FSUB	a2, c03, c03
	FSUB	a3, c02, c02
	FSUB	a4, c04, c04
#else
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02
	FSUB	a3, c03, c03
	FSUB	a4, c04, c04
#endif

#ifdef LN
	/* Uses AO[3], AO[2], AO[0]; second row first. */
	LDF	[AO +  3 * SIZE], a1
	LDF	[AO +  2 * SIZE], a2
	LDF	[AO +  0 * SIZE], a3

	FMUL	a1, c02, c02
	FMUL	a1, c04, c04

	FNMSUB	(aa2, cc02, cc01, cc01)
	FNMSUB	(aa2, cc04, cc03, cc03)

	FMUL	a3, c01, c01
	FMUL	a3, c03, c03
#endif

#ifdef LT
	/* Uses AO[0], AO[1], AO[3]; first row first. */
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  3 * SIZE], a3

	FMUL	a1, c01, c01
	FMUL	a1, c03, c03

	FNMSUB	(aa2, cc01, cc02, cc02)
	FNMSUB	(aa2, cc03, cc04, cc04)

	FMUL	a3, c02, c02
	FMUL	a3, c04, c04
#endif

#ifdef RN
	/* Uses BO[0], BO[1], BO[3]; first column first. */
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2

	FMUL	a1, c01, c01
	FMUL	a1, c02, c02

	FNMSUB	(aa2, cc01, cc03, cc03)
	FNMSUB	(aa2, cc02, cc04, cc04)

	LDF	[BO +  3 * SIZE], a1

	FMUL	a1, c03, c03
	FMUL	a1, c04, c04
#endif

#ifdef RT
	/* Uses BO[3], BO[2], BO[0]; second column first. */
	LDF	[BO +  3 * SIZE], a1
	LDF	[BO +  2 * SIZE], a2

	FMUL	a1, c04, c04
	FMUL	a1, c03, c03

	FNMSUB	(aa2, cc04, cc02, cc02)
	FNMSUB	(aa2, cc03, cc01, cc01)

	LDF	[BO +  0 * SIZE], a1

	FMUL	a1, c02, c02
	FMUL	a1, c01, c01
#endif

#ifdef LN
	add	C1, -2 * SIZE, C1		/* LN writes C from the far end */
	add	C2, -2 * SIZE, C2
#endif

	/* Write the solved tile back into the packed buffer ... */
#if defined(LN) || defined(LT)
	STF	c01, [BO +  0 * SIZE]
	STF	c03, [BO +  1 * SIZE]
	STF	c02, [BO +  2 * SIZE]
	STF	c04, [BO +  3 * SIZE]
#else
	STF	c01, [AO +  0 * SIZE]
	STF	c02, [AO +  1 * SIZE]
	STF	c03, [AO +  2 * SIZE]
	STF	c04, [AO +  3 * SIZE]
#endif

	/* ... and into C. */
	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]
	STF	c03, [C2 + 0 * SIZE]
	STF	c04, [C2 + 1 * SIZE]

#ifndef LN
	add	C1, 2 * SIZE, C1
	add	C2, 2 * SIZE, C2
#endif

#ifdef RT
	sll	K, BASE_SHIFT + 1, TEMP1
	add	AORIG, TEMP1, AORIG
#endif

	/* Advance AO / BO by (K - KK) k-steps, 2 elements each per step. */
#if defined(LT) || defined(RN)
	sub	K, KK, TEMP1
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT + 1, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 2, KK
#endif

#ifdef LN
	sub	KK, 2, KK
#endif

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL52
	nop
	.align 4

/* ---- 1 x 2 tile: entered only when M is odd ---------------------- */
.LL60:
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL69
	nop

#if defined(LT) || defined(RN)
	mov	B, BO
#else
#ifdef LN
	sll	K, BASE_SHIFT, TEMP1
	sub	AORIG, TEMP1, AORIG
#endif

	/* AO = AORIG + KK elements, BO = B + 2 * KK elements */
	sll	KK, BASE_SHIFT, TEMP1
	sll	KK, BASE_SHIFT + 1, TEMP2

	add	AORIG, TEMP1, AO
	add	B,     TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	LDF	[BO +  3 * SIZE], b4
	LDF	[BO +  4 * SIZE], b5
	LDF	[BO +  5 * SIZE], b6
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc01)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc03)

#if defined(LT) || defined(RN)
	sra	KK, 2, L
#else
	sub	K, KK, L
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL65
	nop
	.align 4

	/* Main loop, 4 k-steps per pass: 4 elements of A and 8 of B.      */
	/* AO and BO are bumped mid-pass, so later offsets are relative to */
	/* the already advanced pointers.                                  */
.LL63:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO +  8 * SIZE], b1
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO +  9 * SIZE], b2

	LDF	[AO +  4 * SIZE], a1
	cmp	L, 0

	FMADD	(aa2, bb3, cc01, cc01)
	LDF	[BO + 10 * SIZE], b3
	FMADD	(aa2, bb4, cc03, cc03)
	LDF	[BO + 11 * SIZE], b4

	LDF	[AO +  5 * SIZE], a2
	add	AO, 4 * SIZE, AO

	FMADD	(aa3, bb5, cc01, cc01)
	LDF	[BO + 12 * SIZE], b5
	FMADD	(aa3, bb6, cc03, cc03)
	LDF	[BO + 13 * SIZE], b6

	LDF	[AO +  2 * SIZE], a3
	add	BO, 8 * SIZE, BO

	FMADD	(aa4, bb7, cc01, cc01)
	LDF	[BO +  6 * SIZE], b7
	FMADD	(aa4, bb8, cc03, cc03)
	LDF	[BO +  7 * SIZE], b8

	bg,pt	%icc, .LL63
	LDF	[AO +  3 * SIZE], a4
	.align 4

.LL65:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	sub	K, KK, L
	and	L,  3, L
#endif
	cmp	L,  0
	ble,a,pn %icc, .LL68
	nop
	.align 4

	/* Remainder loop, one k-step per pass. */
.LL67:
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[BO + 2 * SIZE], b1
	FMADD	(aa1, bb2, cc03, cc03)
	LDF	[BO + 3 * SIZE], b2

	LDF	[AO + 1 * SIZE], a1
	add	L, -1, L
	add	AO, 1 * SIZE, AO
	cmp	L, 0

	bg,pt	%icc, .LL67
	add	BO, 2 * SIZE, BO
	.align 4

	/* Solve step for the 1 x 2 tile. */
.LL68:
#if defined(LN) || defined(RT)
#ifdef LN
	sub	KK, 1, TEMP1			/* LN: back one row      */
#else
	sub	KK, 2, TEMP1			/* RT: back two columns  */
#endif
	sll	TEMP1, BASE_SHIFT, TEMP2
	sll	TEMP1, BASE_SHIFT + 1, TEMP1

	add	AORIG, TEMP2, AO
	add	B,     TEMP1, BO
#endif

	/* Right-hand side comes from BO (LN/LT) or AO (otherwise). */
#if defined(LN) || defined(LT)
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
#else
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
#endif

	FSUB	a1, c01, c01
	FSUB	a2, c03, c03

#if defined(LN) || defined(LT)
	LDF	[AO +  0 * SIZE], a1

	FMUL	a1, c01, c01
	FMUL	a1, c03, c03
#endif

#ifdef RN
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2

	FMUL	a1, c01, c01

	FNMSUB	(aa2, cc01, cc03, cc03)

	LDF	[BO +  3 * SIZE], a1

	FMUL	a1, c03, c03
#endif

#ifdef RT
	LDF	[BO +  3 * SIZE], a1
	LDF	[BO +  2 * SIZE], a2

	FMUL	a1, c03, c03

	FNMSUB	(aa2, cc03, cc01, cc01)

	LDF	[BO +  0 * SIZE], a1

	FMUL	a1, c01, c01
#endif

#ifdef LN
	add	C1, -1 * SIZE, C1
	add	C2, -1 * SIZE, C2
#endif

#if defined(LN) || defined(LT)
	STF	c01, [BO +  0 * SIZE]
	STF	c03, [BO +  1 * SIZE]
#else
	STF	c01, [AO +  0 * SIZE]
	STF	c03, [AO +  1 * SIZE]
#endif

	STF	c01, [C1 + 0 * SIZE]
	STF	c03, [C2 + 0 * SIZE]

#ifdef RT
	sll	K, BASE_SHIFT, TEMP1
	add	AORIG, TEMP1, AORIG
#endif

#if defined(LT) || defined(RN)
	sub	K, KK, TEMP1
	sll	TEMP1, BASE_SHIFT, TEMP2
	sll	TEMP1, BASE_SHIFT + 1, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 1, KK
#endif

#ifdef LN
	sub	KK, 1, KK
#endif
	.align 4

	/* End of the two-column panel: step B and KK. */
.LL69:
#ifdef LN
	sll	K, BASE_SHIFT + 1, TEMP1
	add	B, TEMP1, B
#endif

#if defined(LT) || defined(RN)
	mov	BO, B
#endif

#ifdef RN
	add	KK, 2, KK
#endif

#ifdef RT
	sub	KK, 2, KK
#endif
	.align 4

/* ------------------------------------------------------------------ */
/* One-column panel: entered only when N is odd.                      */
/* ------------------------------------------------------------------ */
.LL70:
	and	N, 1, J
	cmp	J, 0
	ble,pn	%icc, .LL999
	nop

#ifdef RT
	sll	K, BASE_SHIFT, TEMP1
	sub	B, TEMP1, B
#endif

#ifndef RT
	mov	C,  C1
	add	C1, LDC, C
#else
	sub	C,  LDC, C1
	sub	C,  LDC, C
#endif

#ifdef LN
	add	M, OFFSET, KK
#endif

#ifdef LT
	mov	OFFSET, KK
#endif

#if defined(LN) || defined(RT)
	mov	A, AORIG
#else
	mov	A, AO
#endif

	sra	M, 1, I				/* I = number of two-row tiles */
	cmp	I, 0
	ble,pn	%icc, .LL80
	nop
	.align 4

/* ---- 2 x 1 tile -------------------------------------------------- */
.LL72:
#if defined(LT) || defined(RN)
	mov	B, BO
#else
#ifdef LN
	sll	K, BASE_SHIFT + 1, TEMP1
	sub	AORIG, TEMP1, AORIG
#endif

	/* AO = AORIG + 2 * KK elements, BO = B + KK elements */
	sll	KK, BASE_SHIFT + 1, TEMP1
	sll	KK, BASE_SHIFT, TEMP2

	add	AORIG, TEMP1, AO
	add	B,     TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	FCLR	(cc01)
	LDF	[BO +  3 * SIZE], b4
	FCLR	(cc02)

	prefetch [C1 + 2 * SIZE], 3

#if defined(LT) || defined(RN)
	sra	KK, 2, L
#else
	sub	K, KK, L
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL75
	nop

	/* Main loop, 4 k-steps per pass: 8 elements of A and 4 of B. */
.LL73:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO +  4 * SIZE], a1
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[AO +  5 * SIZE], a2

	LDF	[BO +  4 * SIZE], b1
	cmp	L, 0

	FMADD	(aa3, bb2, cc01, cc01)
	LDF	[AO +  6 * SIZE], a3
	FMADD	(aa4, bb2, cc02, cc02)
	LDF	[AO +  7 * SIZE], a4

	LDF	[BO +  5 * SIZE], b2
	add	BO, 4 * SIZE, BO

	FMADD	(aa1, bb3, cc01, cc01)
	LDF	[AO +  8 * SIZE], a1
	FMADD	(aa2, bb3, cc02, cc02)
	LDF	[AO +  9 * SIZE], a2

	LDF	[BO +  2 * SIZE], b3
	add	AO, 8 * SIZE, AO

	FMADD	(aa3, bb4, cc01, cc01)
	LDF	[AO +  2 * SIZE], a3
	FMADD	(aa4, bb4, cc02, cc02)
	LDF	[AO +  3 * SIZE], a4

	bg,pt	%icc, .LL73
	LDF	[BO +  3 * SIZE], b4
	.align 4

.LL75:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	sub	K, KK, L
	and	L,  3, L
#endif
	cmp	L,  0
	ble,a,pn %icc, .LL78
	nop
	.align 4

	/* Remainder loop, one k-step per pass. */
.LL77:
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO + 2 * SIZE], a1
	FMADD	(aa2, bb1, cc02, cc02)
	LDF	[AO + 3 * SIZE], a2

	LDF	[BO + 1 * SIZE], b1
	add	L, -1, L
	add	AO, 2 * SIZE, AO
	cmp	L, 0
	bg,pt	%icc, .LL77
	add	BO, 1 * SIZE, BO
	.align 4

	/* Solve step for the 2 x 1 tile. */
.LL78:
#if defined(LN) || defined(RT)
#ifdef LN
	sub	KK, 2, TEMP1			/* LN: back two rows   */
#else
	sub	KK, 1, TEMP1			/* RT: back one column */
#endif
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT, TEMP1

	add	AORIG, TEMP2, AO
	add	B,     TEMP1, BO
#endif

	/* Right-hand side comes from BO (LN/LT) or AO (otherwise). */
#if defined(LN) || defined(LT)
	LDF	[BO +  0 * SIZE], a1
	LDF	[BO +  1 * SIZE], a2
#else
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
#endif

	FSUB	a1, c01, c01
	FSUB	a2, c02, c02

#ifdef LN
	LDF	[AO +  3 * SIZE], a1
	LDF	[AO +  2 * SIZE], a2
	LDF	[AO +  0 * SIZE], a3

	FMUL	a1, c02, c02

	FNMSUB	(aa2, cc02, cc01, cc01)

	FMUL	a3, c01, c01
#endif

#ifdef LT
	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  3 * SIZE], a3

	FMUL	a1, c01, c01

	FNMSUB	(aa2, cc01, cc02, cc02)

	FMUL	a3, c02, c02
#endif

#if defined(RN) || defined(RT)
	LDF	[BO +  0 * SIZE], a1

	FMUL	a1, c01, c01
	FMUL	a1, c02, c02
#endif

#ifdef LN
	add	C1, -2 * SIZE, C1
#endif

#if defined(LN) || defined(LT)
	STF	c01, [BO +  0 * SIZE]
	STF	c02, [BO +  1 * SIZE]
#else
	STF	c01, [AO +  0 * SIZE]
	STF	c02, [AO +  1 * SIZE]
#endif

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]

#ifndef LN
	add	C1, 2 * SIZE, C1
#endif

#ifdef RT
	sll	K, BASE_SHIFT + 1, TEMP1
	add	AORIG, TEMP1, AORIG
#endif

#if defined(LT) || defined(RN)
	sub	K, KK, TEMP1
	sll	TEMP1, BASE_SHIFT + 1, TEMP2
	sll	TEMP1, BASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LT
	add	KK, 2, KK
#endif

#ifdef LN
	sub	KK, 2, KK
#endif

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL72
	nop
	.align 4

/* ---- 1 x 1 tile: entered only when M is odd ---------------------- */
.LL80:
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL89
	nop

#if defined(LT) || defined(RN)
	mov	B, BO
#else
#ifdef LN
	sll	K, BASE_SHIFT, TEMP1
	sub	AORIG, TEMP1, AORIG
#endif

	/* AO = AORIG + KK elements, BO = B + KK elements */
	sll	KK, BASE_SHIFT, TEMP1
	sll	KK, BASE_SHIFT, TEMP2

	add	AORIG, TEMP1, AO
	add	B,     TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	LDF	[BO +  0 * SIZE], b1
	LDF	[AO +  1 * SIZE], a2
	LDF	[BO +  1 * SIZE], b2
	LDF	[AO +  2 * SIZE], a3
	LDF	[BO +  2 * SIZE], b3
	LDF	[AO +  3 * SIZE], a4
	LDF	[BO +  3 * SIZE], b4

#if defined(LT) || defined(RN)
	sra	KK, 2, L
#else
	sub	K, KK, L
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL85
	FCLR	(cc01)				/* delay slot: runs on both paths */
	.align 4

	/* Main loop, 4 k-steps per pass. */
.LL83:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add	L, -1, L

	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO + 4 * SIZE], a1
	LDF	[BO + 4 * SIZE], b1

	FMADD	(aa2, bb2, cc01, cc01)
	LDF	[AO + 5 * SIZE], a2
	LDF	[BO + 5 * SIZE], b2

	FMADD	(aa3, bb3, cc01, cc01)
	LDF	[AO + 6 * SIZE], a3
	LDF	[BO + 6 * SIZE], b3

	FMADD	(aa4, bb4, cc01, cc01)
	LDF	[AO + 7 * SIZE], a4
	LDF	[BO + 7 * SIZE], b4

	add	AO, 4 * SIZE, AO
	cmp	L, 0

	bg,pt	%icc, .LL83
	add	BO, 4 * SIZE, BO
	.align 4

.LL85:
#if defined(LT) || defined(RN)
	and	KK, 3, L
#else
	sub	K, KK, L
	and	L,  3, L
#endif
	cmp	L,  0
	ble,a,pn %icc, .LL88
	nop
	.align 4

	/* Remainder loop, one k-step per pass. */
.LL87:
	FMADD	(aa1, bb1, cc01, cc01)
	LDF	[AO + 1 * SIZE], a1
	LDF	[BO + 1 * SIZE], b1

	add	AO, 1 * SIZE, AO
	add	L, -1, L
	cmp	L, 0
	bg,pt	%icc, .LL87
	add	BO, 1 * SIZE, BO
	.align 4

.LL88:
3773#if defined(LN) || defined(RT) 3774#ifdef LN 3775 sub KK, 1, TEMP1 3776#else 3777 sub KK, 1, TEMP1 3778#endif 3779 sll TEMP1, BASE_SHIFT + 0, TEMP2 3780 sll TEMP1, BASE_SHIFT + 0, TEMP1 3781 3782 add AORIG, TEMP2, AO 3783 add B, TEMP1, BO 3784#endif 3785 3786#if defined(LN) || defined(LT) 3787 LDF [BO + 0 * SIZE], a1 3788 3789 FSUB a1, c01, c01 3790#else 3791 LDF [AO + 0 * SIZE], a1 3792 3793 FSUB a1, c01, c01 3794#endif 3795 3796#if defined(LN) || defined(LT) 3797 LDF [AO + 0 * SIZE], a1 3798 3799 FMUL a1, c01, c01 3800#endif 3801 3802#if defined(RN) || defined(RT) 3803 LDF [BO + 0 * SIZE], a1 3804 3805 FMUL a1, c01, c01 3806#endif 3807 3808#ifdef LN 3809 add C1, -1 * SIZE, C1 3810#endif 3811 3812#if defined(LN) || defined(LT) 3813 STF c01, [BO + 0 * SIZE] 3814#else 3815 STF c01, [AO + 0 * SIZE] 3816#endif 3817 3818 STF c01, [C1 + 0 * SIZE] 3819 3820#ifdef RT 3821 sll K, BASE_SHIFT + 0, TEMP1 3822 add AORIG, TEMP1, AORIG 3823#endif 3824 3825#if defined(LT) || defined(RN) 3826 sub K, KK, TEMP1 3827 sll TEMP1, BASE_SHIFT + 0, TEMP2 3828 sll TEMP1, BASE_SHIFT + 0, TEMP1 3829 add AO, TEMP2, AO 3830 add BO, TEMP1, BO 3831#endif 3832 3833#ifdef LT 3834 add KK, 1, KK 3835#endif 3836 3837#ifdef LN 3838 sub KK, 1, KK 3839#endif 3840 .align 4 3841 3842.LL89: 3843#ifdef LN 3844 sll K, BASE_SHIFT, TEMP1 3845 add B, TEMP1, B 3846#endif 3847 3848#if defined(LT) || defined(RN) 3849 mov BO, B 3850#endif 3851 3852#ifdef RN 3853 add KK, 1, KK 3854#endif 3855 3856#ifdef RT 3857 sub KK, 1, KK 3858#endif 3859 .align 4 3860 3861.LL999: 3862#ifdef TRMMKERNEL 3863#ifndef __64BIT__ 3864 ld [%sp + STACK_START + 8], %g1 3865 ld [%sp + STACK_START + 12], %g2 3866 ld [%sp + STACK_START + 16], %g3 3867 ld [%sp + STACK_START + 20], %g4 3868#else 3869 ldx [%sp + STACK_START + 32], %g1 3870 ldx [%sp + STACK_START + 40], %g2 3871 ldx [%sp + STACK_START + 48], %g3 3872 ldx [%sp + STACK_START + 56], %g4 3873#endif 3874#endif 3875 3876 return %i7 + 8 3877 clr %o0 3878 3879 EPILOGUE 3880