1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2005. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 /* Prefetch tuning: APREFETCHSIZE is the look-ahead on AO in units of SIZE and APREFETCH_CATEGORY is the second operand of the prefetch instructions issued on AO in the inner loops. */25#define APREFETCHSIZE 24 26#define APREFETCH_CATEGORY 0 27 /* Integer arguments M, N and K are read from input registers i0..i2. */28#define M %i0 29#define N %i1 30#define K %i2 31 /* A and B trade input registers in the 32-bit DOUBLE build. In that build the prologue also reloads B from a stack slot. */32#if defined(DOUBLE) && !defined(__64BIT__) 33#define A %i5 34#define B %i4 35#else 36#define A %i4 37#define B %i5 38#endif 39 /* C and LDC are filled from stack slots in the prologue. */40#define C %o4 41#define LDC %o5 42 /* AO and BO are the working pointers used for loads from A and B. I and L are loop counters. J receives the result of the bit tests on N. */43#define AO %l0 44#define BO %l1 45#define I %l2 46#define J %l3 47#define L %l4 48 /* C1..C8 are output pointers. In the code visible here C1..C4 are derived from C by repeated add or sub of LDC. */49#define C1 %o0 50#define C2 %o1 51#define C3 %o2 52#define C4 %o3 53 54#define C5 %l5 55#define C6 %l6 56#define C7 %l7 57#define C8 %i3 58 /* OFFSET is loaded from the stack in the prologue. KK is a running offset derived from OFFSET. TEMP1 and TEMP2 are address scratch. AORIG keeps the A base in the LN and RT variants. */59#define OFFSET %g1 60#define KK %g2 61#define TEMP1 %g3 62#define TEMP2 %g4 63#define AORIG %o7 64 /* DOUBLE build: every name maps to an even-numbered FP register. c01..c16 are result registers (cleared with FCLR and updated with FMADD), a1..a5 and b1..b9 hold values loaded with LDF. */65#ifdef DOUBLE 66#define c01 %f0 67#define c02 %f2 68#define c03 %f4 69#define c04 %f6 70#define c05 %f8 71#define c06 %f10 72#define c07 %f12 73#define c08 %f14 74#define c09 %f16 75#define c10 %f18 76#define c11 %f20 77#define c12 %f22 78#define c13 %f24 79#define c14 %f26 80#define c15 %f28 81#define c16 %f30 82 83#define a1 %f32 84#define a2 %f34 85#define a3 %f36 86#define a4 %f38 87#define a5 %f40 88 89#define b1 %f42 90#define b2 %f44 91#define b3 %f46 92#define b4 %f48 93#define b5 %f50 94#define b6 %f52 95#define b7 %f54 96#define b8 %f56 97#define b9 %f58 98 /* Numeric register ids passed to the FCLR, FMADD and FNMSUB macros, one per register name above. NOTE(review): the ids for f32 and up (aa1 = 1, bb1 = 11, ...) look like the 5-bit SPARC V9 encoding of the upper double registers - confirm against the macro definitions in common.h. These tables must stay in step with the register assignments above. */99#define cc01 0 100#define cc02 2 101#define cc03 4 102#define cc04 6 103#define cc05 8 104#define cc06 10 105#define cc07 12 106#define cc08 14 107#define cc09 16 108#define cc10 18 109#define cc11 20 110#define cc12 22 111#define cc13 24 112#define cc14 26 113#define cc15 28 114#define cc16 30 115 116#define aa1 1 117#define aa2 3 118#define aa3 5 119#define aa4 7 120#define aa5 9 121 122#define bb1 11 123#define bb2 13 124#define bb3 15 125#define bb4 17 126#define bb5 19 127#define bb6 21 128#define bb7 23 129#define bb8 25 130#define bb9 27 131 /* Non-DOUBLE build: the same names map to consecutive FP registers f0..f29. */132#else 133#define c01 %f0 134#define c02 %f1 135#define c03 %f2 136#define c04 %f3 137#define c05 %f4 138#define c06 %f5 139#define 
c07 %f6 140#define c08 %f7 141#define c09 %f8 142#define c10 %f9 143#define c11 %f10 144#define c12 %f11 145#define c13 %f12 146#define c14 %f13 147#define c15 %f14 148#define c16 %f15 149 150#define a1 %f16 151#define a2 %f17 152#define a3 %f18 153#define a4 %f19 154#define a5 %f20 155 156#define b1 %f21 157#define b2 %f22 158#define b3 %f23 159#define b4 %f24 160#define b5 %f25 161#define b6 %f26 162#define b7 %f27 163#define b8 %f28 164#define b9 %f29 165 /* Numeric register ids for the non-DOUBLE assignment. Here each id equals the number of the FP register it names. */166#define cc01 0 167#define cc02 1 168#define cc03 2 169#define cc04 3 170#define cc05 4 171#define cc06 5 172#define cc07 6 173#define cc08 7 174#define cc09 8 175#define cc10 9 176#define cc11 10 177#define cc12 11 178#define cc13 12 179#define cc14 13 180#define cc15 14 181#define cc16 15 182 183#define aa1 16 184#define aa2 17 185#define aa3 18 186#define aa4 19 187#define aa5 20 188 189#define bb1 21 190#define bb2 22 191#define bb3 23 192#define bb4 24 193#define bb5 25 194#define bb6 26 195#define bb7 27 196#define bb8 28 197#define bb9 29 198 199#endif 200 /* Tell the assembler that g2 and g3 (KK and TEMP1) are used as scratch registers. */201 .register %g2, #scratch 202 .register %g3, #scratch 203 /* Kernel entry point. PROLOGUE and SAVESP are macros that are not defined in the visible part of this file (presumably they come from common.h). */204 PROLOGUE 205 SAVESP 206 nop 207 /* 32-bit build: load B (DOUBLE only), C, LDC and OFFSET from 4-byte stack slots, then store g1..g4 into stack slots. NOTE(review): OFFSET is g1 and is loaded before the st of g1, so that slot receives OFFSET rather than the value g1 had on entry - confirm this is intended. */208#ifndef __64BIT__ 209 210#ifdef DOUBLE 211 ld [%sp + STACK_START + 28], B 212 ld [%sp + STACK_START + 32], C 213 ld [%sp + STACK_START + 36], LDC 214 ld [%sp + STACK_START + 40], OFFSET 215#else 216 ld [%sp + STACK_START + 28], C 217 ld [%sp + STACK_START + 32], LDC 218 ld [%sp + STACK_START + 36], OFFSET 219#endif 220 st %g1, [%sp + STACK_START + 8] 221 st %g2, [%sp + STACK_START + 12] 222 st %g3, [%sp + STACK_START + 16] 223 st %g4, [%sp + STACK_START + 20] 224#else 225 /* 64-bit build: the same sequence with 8-byte slots. NOTE(review): the stx of g4 writes STACK_START + 56, the slot C was just loaded from, so the order of the loads and stores matters here. */226 ldx [%sp+ STACK_START + 56], C 227 ldx [%sp+ STACK_START + 64], LDC 228 ldx [%sp+ STACK_START + 72], OFFSET 229 230 stx %g1, [%sp + STACK_START + 32] 231 stx %g2, [%sp + STACK_START + 40] 232 stx %g3, [%sp + STACK_START + 48] 233 stx %g4, [%sp + STACK_START + 56] 234#endif 235 /* Only when built as a non-LEFT TRMM kernel: start KK at minus OFFSET. */236#if defined(TRMMKERNEL) && !defined(LEFT) 237 neg OFFSET, KK 238#endif 239 /* Shift LDC left by BASE_SHIFT so it can be added directly to the C pointers (presumably this converts an element count to bytes - confirm BASE_SHIFT in common.h). */240 sll LDC, BASE_SHIFT, LDC 
241 242#ifdef LN 243 smul M, K, TEMP1 244 sll TEMP1, BASE_SHIFT, TEMP1 245 add A, TEMP1, A 246 247 sll M, BASE_SHIFT, TEMP1 248 add C, TEMP1, C 249#endif 250 251#ifdef RN 252 neg OFFSET, KK 253#endif 254 255#ifdef RT 256 smul N, K, TEMP1 257 sll TEMP1, BASE_SHIFT, TEMP1 258 add B, TEMP1, B 259 260 smul N, LDC, TEMP1 261 add C, TEMP1, C 262 263 sub N, OFFSET, KK 264#endif 265 266 and N, 1, J 267 cmp J, 0 268 ble,pn %icc, .LL50 269 nop 270 271#ifdef RT 272 sll K, BASE_SHIFT, TEMP1 273 sub B, TEMP1, B 274#endif 275 276#ifndef RT 277 mov C, C1 278 add C1, LDC, C 279#else 280 sub C, LDC, C1 281 sub C, LDC, C 282#endif 283 284#ifdef LN 285 add M, OFFSET, KK 286#endif 287 288#ifdef LT 289 mov OFFSET, KK 290#endif 291 292#if defined(LN) || defined(RT) 293 mov A, AORIG 294#else 295 mov A, AO 296#endif 297 298 sra M, 1, I 299 cmp I, 0 300 ble,pn %icc, .LL80 301 nop 302 .align 4 303 304.LL72: 305#if defined(LT) || defined(RN) 306 mov B, BO 307#else 308#ifdef LN 309 sll K, BASE_SHIFT + 1, TEMP1 310 sub AORIG, TEMP1, AORIG 311#endif 312 313 sll KK, BASE_SHIFT + 1, TEMP1 314 sll KK, BASE_SHIFT + 0, TEMP2 315 316 add AORIG, TEMP1, AO 317 add B, TEMP2, BO 318#endif 319 320 LDF [AO + 0 * SIZE], a1 321 LDF [AO + 1 * SIZE], a2 322 LDF [AO + 2 * SIZE], a3 323 LDF [AO + 3 * SIZE], a4 324 325 LDF [BO + 0 * SIZE], b1 326 LDF [BO + 1 * SIZE], b2 327 LDF [BO + 2 * SIZE], b3 328 FCLR (cc01) 329 LDF [BO + 3 * SIZE], b4 330 FCLR (cc02) 331 332 prefetch [C1 + 2 * SIZE], 3 333 334#if defined(LT) || defined(RN) 335 sra KK, 2, L 336#else 337 sub K, KK, L 338 sra L, 2, L 339#endif 340 cmp L, 0 341 ble,pn %icc, .LL75 342 nop 343 344.LL73: 345 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 346 add L, -1, L 347 348 FMADD (aa1, bb1, cc01, cc01) 349 LDF [AO + 4 * SIZE], a1 350 FMADD (aa2, bb1, cc02, cc02) 351 LDF [AO + 5 * SIZE], a2 352 353 LDF [BO + 4 * SIZE], b1 354 cmp L, 0 355 356 FMADD (aa3, bb2, cc01, cc01) 357 LDF [AO + 6 * SIZE], a3 358 FMADD (aa4, bb2, cc02, cc02) 359 LDF [AO + 
7 * SIZE], a4 360 361 LDF [BO + 5 * SIZE], b2 362 add BO, 4 * SIZE, BO 363 364 FMADD (aa1, bb3, cc01, cc01) 365 LDF [AO + 8 * SIZE], a1 366 FMADD (aa2, bb3, cc02, cc02) 367 LDF [AO + 9 * SIZE], a2 368 369 LDF [BO + 2 * SIZE], b3 370 add AO, 8 * SIZE, AO 371 372 FMADD (aa3, bb4, cc01, cc01) 373 LDF [AO + 2 * SIZE], a3 374 FMADD (aa4, bb4, cc02, cc02) 375 LDF [AO + 3 * SIZE], a4 376 377 bg,pt %icc, .LL73 378 LDF [BO + 3 * SIZE], b4 379 .align 4 380 381.LL75: 382#if defined(LT) || defined(RN) 383 and KK, 3, L 384#else 385 sub K, KK, L 386 and L, 3, L 387#endif 388 cmp L, 0 389 ble,a,pn %icc, .LL78 390 nop 391 .align 4 392 393.LL77: 394 FMADD (aa1, bb1, cc01, cc01) 395 LDF [AO + 2 * SIZE], a1 396 FMADD (aa2, bb1, cc02, cc02) 397 LDF [AO + 3 * SIZE], a2 398 399 LDF [BO + 1 * SIZE], b1 400 add L, -1, L 401 add AO, 2 * SIZE, AO 402 cmp L, 0 403 bg,pt %icc, .LL77 404 add BO, 1 * SIZE, BO 405 .align 4 406 407.LL78: 408#if defined(LN) || defined(RT) 409#ifdef LN 410 sub KK, 2, TEMP1 411#else 412 sub KK, 1, TEMP1 413#endif 414 sll TEMP1, BASE_SHIFT + 1, TEMP2 415 sll TEMP1, BASE_SHIFT + 0, TEMP1 416 417 add AORIG, TEMP2, AO 418 add B, TEMP1, BO 419#endif 420 421#if defined(LN) || defined(LT) 422 LDF [BO + 0 * SIZE], a1 423 LDF [BO + 1 * SIZE], a2 424 425 FSUB a1, c01, c01 426 FSUB a2, c02, c02 427#else 428 LDF [AO + 0 * SIZE], a1 429 LDF [AO + 1 * SIZE], a2 430 431 FSUB a1, c01, c01 432 FSUB a2, c02, c02 433#endif 434 435#ifdef LN 436 LDF [AO + 3 * SIZE], a1 437 LDF [AO + 2 * SIZE], a2 438 LDF [AO + 0 * SIZE], a3 439 440 FMUL a1, c02, c02 441 442 FNMSUB (aa2, cc02, cc01, cc01) 443 444 FMUL a3, c01, c01 445#endif 446 447#ifdef LT 448 LDF [AO + 0 * SIZE], a1 449 LDF [AO + 1 * SIZE], a2 450 LDF [AO + 3 * SIZE], a3 451 452 FMUL a1, c01, c01 453 454 FNMSUB (aa2, cc01, cc02, cc02) 455 456 FMUL a3, c02, c02 457#endif 458 459#if defined(RN) || defined(RT) 460 LDF [BO + 0 * SIZE], a1 461 462 FMUL a1, c01, c01 463 FMUL a1, c02, c02 464#endif 465 466#ifdef LN 467 add C1, -2 * SIZE, C1 
468#endif 469 470#if defined(LN) || defined(LT) 471 STF c01, [BO + 0 * SIZE] 472 STF c02, [BO + 1 * SIZE] 473#else 474 STF c01, [AO + 0 * SIZE] 475 STF c02, [AO + 1 * SIZE] 476#endif 477 478 STF c01, [C1 + 0 * SIZE] 479 STF c02, [C1 + 1 * SIZE] 480 481#ifndef LN 482 add C1, 2 * SIZE, C1 483#endif 484 485#ifdef RT 486 sll K, BASE_SHIFT + 1, TEMP1 487 add AORIG, TEMP1, AORIG 488#endif 489 490#if defined(LT) || defined(RN) 491 sub K, KK, TEMP1 492 sll TEMP1, BASE_SHIFT + 1, TEMP2 493 sll TEMP1, BASE_SHIFT + 0, TEMP1 494 add AO, TEMP2, AO 495 add BO, TEMP1, BO 496#endif 497 498#ifdef LT 499 add KK, 2, KK 500#endif 501 502#ifdef LN 503 sub KK, 2, KK 504#endif 505 506 add I, -1, I 507 cmp I, 0 508 bg,pt %icc, .LL72 509 nop 510 .align 4 511 512.LL80: 513 and M, 1, I 514 cmp I, 0 515 ble,pn %icc, .LL89 516 nop 517 518#if defined(LT) || defined(RN) 519 mov B, BO 520#else 521#ifdef LN 522 sll K, BASE_SHIFT + 0, TEMP1 523 sub AORIG, TEMP1, AORIG 524#endif 525 526 sll KK, BASE_SHIFT + 0, TEMP1 527 sll KK, BASE_SHIFT + 0, TEMP2 528 529 add AORIG, TEMP1, AO 530 add B, TEMP2, BO 531#endif 532 533 LDF [AO + 0 * SIZE], a1 534 LDF [BO + 0 * SIZE], b1 535 LDF [AO + 1 * SIZE], a2 536 LDF [BO + 1 * SIZE], b2 537 LDF [AO + 2 * SIZE], a3 538 LDF [BO + 2 * SIZE], b3 539 LDF [AO + 3 * SIZE], a4 540 LDF [BO + 3 * SIZE], b4 541 542#if defined(LT) || defined(RN) 543 sra KK, 2, L 544#else 545 sub K, KK, L 546 sra L, 2, L 547#endif 548 cmp L, 0 549 ble,pn %icc, .LL85 550 FCLR (cc01) 551 .align 4 552 553.LL83: 554 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 555 add L, -1, L 556 557 FMADD (aa1, bb1, cc01, cc01) 558 LDF [AO + 4 * SIZE], a1 559 LDF [BO + 4 * SIZE], b1 560 561 FMADD (aa2, bb2, cc01, cc01) 562 LDF [AO + 5 * SIZE], a2 563 LDF [BO + 5 * SIZE], b2 564 565 FMADD (aa3, bb3, cc01, cc01) 566 LDF [AO + 6 * SIZE], a3 567 LDF [BO + 6 * SIZE], b3 568 569 FMADD (aa4, bb4, cc01, cc01) 570 LDF [AO + 7 * SIZE], a4 571 LDF [BO + 7 * SIZE], b4 572 573 add AO, 4 * SIZE, AO 574 cmp 
L, 0 575 576 bg,pt %icc, .LL83 577 add BO, 4 * SIZE, BO 578 .align 4 579 580.LL85: 581#if defined(LT) || defined(RN) 582 and KK, 3, L 583#else 584 sub K, KK, L 585 and L, 3, L 586#endif 587 cmp L, 0 588 ble,a,pn %icc, .LL88 589 nop 590 .align 4 591 592.LL87: 593 FMADD (aa1, bb1, cc01, cc01) 594 LDF [AO + 1 * SIZE], a1 595 LDF [BO + 1 * SIZE], b1 596 597 add AO, 1 * SIZE, AO 598 add L, -1, L 599 cmp L, 0 600 bg,pt %icc, .LL87 601 add BO, 1 * SIZE, BO 602 .align 4 603 604.LL88: 605#if defined(LN) || defined(RT) 606#ifdef LN 607 sub KK, 1, TEMP1 608#else 609 sub KK, 1, TEMP1 610#endif 611 sll TEMP1, BASE_SHIFT + 0, TEMP2 612 sll TEMP1, BASE_SHIFT + 0, TEMP1 613 614 add AORIG, TEMP2, AO 615 add B, TEMP1, BO 616#endif 617 618#if defined(LN) || defined(LT) 619 LDF [BO + 0 * SIZE], a1 620 621 FSUB a1, c01, c01 622#else 623 LDF [AO + 0 * SIZE], a1 624 625 FSUB a1, c01, c01 626#endif 627 628#if defined(LN) || defined(LT) 629 LDF [AO + 0 * SIZE], a1 630 631 FMUL a1, c01, c01 632#endif 633 634#if defined(RN) || defined(RT) 635 LDF [BO + 0 * SIZE], a1 636 637 FMUL a1, c01, c01 638#endif 639 640#ifdef LN 641 add C1, -1 * SIZE, C1 642#endif 643 644#if defined(LN) || defined(LT) 645 STF c01, [BO + 0 * SIZE] 646#else 647 STF c01, [AO + 0 * SIZE] 648#endif 649 650 STF c01, [C1 + 0 * SIZE] 651 652#ifdef RT 653 sll K, BASE_SHIFT + 0, TEMP1 654 add AORIG, TEMP1, AORIG 655#endif 656 657#if defined(LT) || defined(RN) 658 sub K, KK, TEMP1 659 sll TEMP1, BASE_SHIFT + 0, TEMP2 660 sll TEMP1, BASE_SHIFT + 0, TEMP1 661 add AO, TEMP2, AO 662 add BO, TEMP1, BO 663#endif 664 665#ifdef LT 666 add KK, 1, KK 667#endif 668 669#ifdef LN 670 sub KK, 1, KK 671#endif 672 .align 4 673 674.LL89: 675#ifdef LN 676 sll K, BASE_SHIFT, TEMP1 677 add B, TEMP1, B 678#endif 679 680#if defined(LT) || defined(RN) 681 mov BO, B 682#endif 683 684#ifdef RN 685 add KK, 1, KK 686#endif 687 688#ifdef RT 689 sub KK, 1, KK 690#endif 691 .align 4 692 693.LL50: 694 and N, 2, J 695 cmp J, 0 696 ble,pn %icc, .LL30 697 nop 698 
699#ifdef RT 700 sll K, BASE_SHIFT + 1, TEMP1 701 sub B, TEMP1, B 702#endif 703 704#ifndef RT 705 mov C, C1 706 add C, LDC, C2 707 add C2, LDC, C 708#else 709 sub C, LDC, C2 710 sub C2, LDC, C1 711 sub C2, LDC, C 712#endif 713 714#ifdef LN 715 add M, OFFSET, KK 716#endif 717 718#ifdef LT 719 mov OFFSET, KK 720#endif 721 722#if defined(LN) || defined(RT) 723 mov A, AORIG 724#else 725 mov A, AO 726#endif 727 728 sra M, 1, I 729 cmp I, 0 730 ble,pn %icc, .LL60 731 nop 732 .align 4 733 734.LL52: 735#if defined(LT) || defined(RN) 736 mov B, BO 737#else 738#ifdef LN 739 sll K, BASE_SHIFT + 1, TEMP1 740 sub AORIG, TEMP1, AORIG 741#endif 742 743 sll KK, BASE_SHIFT + 1, TEMP1 744 sll KK, BASE_SHIFT + 1, TEMP2 745 746 add AORIG, TEMP1, AO 747 add B, TEMP2, BO 748#endif 749 750 LDF [AO + 0 * SIZE], a1 751 LDF [AO + 1 * SIZE], a2 752 LDF [AO + 2 * SIZE], a3 753 LDF [AO + 3 * SIZE], a4 754 755 LDF [BO + 0 * SIZE], b1 756 LDF [BO + 1 * SIZE], b2 757 LDF [BO + 2 * SIZE], b3 758 FCLR (cc01) 759 LDF [BO + 3 * SIZE], b4 760 FCLR (cc02) 761 762 LDF [BO + 4 * SIZE], b5 763 FCLR (cc03) 764 LDF [BO + 5 * SIZE], b6 765 FCLR (cc04) 766 LDF [BO + 6 * SIZE], b7 767 FCLR (cc05) 768 LDF [BO + 7 * SIZE], b8 769 FCLR (cc06) 770 771 prefetch [C1 + 2 * SIZE], 3 772 FCLR (cc07) 773 prefetch [C2 + 2 * SIZE], 3 774 FCLR (cc08) 775 776#if defined(LT) || defined(RN) 777 sra KK, 2, L 778#else 779 sub K, KK, L 780 sra L, 2, L 781#endif 782 cmp L, 0 783 ble,pn %icc, .LL55 784 nop 785 .align 4 786 787.LL53: 788 FMADD (aa1, bb1, cc01, cc01) 789 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 790 FMADD (aa2, bb1, cc02, cc02) 791 LDF [BO + 8 * SIZE], b1 792 793 FMADD (aa1, bb2, cc03, cc03) 794 LDF [AO + 4 * SIZE], a1 795 FMADD (aa2, bb2, cc04, cc04) 796 LDF [AO + 5 * SIZE], a2 797 798 FMADD (aa3, bb3, cc01, cc01) 799 LDF [BO + 9 * SIZE], b2 800 FMADD (aa4, bb3, cc02, cc02) 801 LDF [BO + 10 * SIZE], b3 802 803 FMADD (aa3, bb4, cc03, cc03) 804 LDF [AO + 6 * SIZE], a3 805 FMADD (aa4, bb4, cc04, 
cc04) 806 LDF [AO + 7 * SIZE], a4 807 808 FMADD (aa1, bb5, cc01, cc01) 809 LDF [BO + 11 * SIZE], b4 810 FMADD (aa2, bb5, cc02, cc02) 811 LDF [BO + 12 * SIZE], b5 812 813 FMADD (aa1, bb6, cc03, cc03) 814 LDF [AO + 8 * SIZE], a1 815 FMADD (aa2, bb6, cc04, cc04) 816 LDF [AO + 9 * SIZE], a2 817 818 FMADD (aa3, bb7, cc01, cc01) 819 LDF [BO + 13 * SIZE], b6 820 821 FMADD (aa4, bb7, cc02, cc02) 822 LDF [BO + 14 * SIZE], b7 823 824 FMADD (aa3, bb8, cc03, cc03) 825 LDF [AO + 10 * SIZE], a3 826 FMADD (aa4, bb8, cc04, cc04) 827 LDF [AO + 11 * SIZE], a4 828 829 add AO, 8 * SIZE, AO 830 add L, -1, L 831 add BO, 8 * SIZE, BO 832 cmp L, 0 833 834 bg,pt %icc, .LL53 835 LDF [BO + 7 * SIZE], b8 836 .align 4 837 838.LL55: 839#if defined(LT) || defined(RN) 840 and KK, 3, L 841#else 842 sub K, KK, L 843 and L, 3, L 844#endif 845 cmp L, 0 846 ble,a,pn %icc, .LL58 847 nop 848 .align 4 849 850.LL57: 851 FMADD (aa1, bb1, cc01, cc01) 852 add L, -1, L 853 FMADD (aa2, bb1, cc02, cc02) 854 LDF [BO + 2 * SIZE], b1 855 856 FMADD (aa1, bb2, cc03, cc03) 857 LDF [AO + 2 * SIZE], a1 858 FMADD (aa2, bb2, cc04, cc04) 859 LDF [AO + 3 * SIZE], a2 860 861 add AO, 2 * SIZE, AO 862 cmp L, 0 863 add BO, 2 * SIZE, BO 864 bg,pt %icc, .LL57 865 LDF [BO + 1 * SIZE], b2 866 .align 4 867 868.LL58: 869#if defined(LN) || defined(RT) 870#ifdef LN 871 sub KK, 2, TEMP1 872#else 873 sub KK, 2, TEMP1 874#endif 875 sll TEMP1, BASE_SHIFT + 1, TEMP2 876 sll TEMP1, BASE_SHIFT + 1, TEMP1 877 878 add AORIG, TEMP2, AO 879 add B, TEMP1, BO 880#endif 881 882#if defined(LN) || defined(LT) 883 LDF [BO + 0 * SIZE], a1 884 LDF [BO + 1 * SIZE], a2 885 LDF [BO + 2 * SIZE], a3 886 LDF [BO + 3 * SIZE], a4 887 888 FSUB a1, c01, c01 889 FSUB a2, c03, c03 890 FSUB a3, c02, c02 891 FSUB a4, c04, c04 892#else 893 LDF [AO + 0 * SIZE], a1 894 LDF [AO + 1 * SIZE], a2 895 LDF [AO + 2 * SIZE], a3 896 LDF [AO + 3 * SIZE], a4 897 898 FSUB a1, c01, c01 899 FSUB a2, c02, c02 900 FSUB a3, c03, c03 901 FSUB a4, c04, c04 902#endif 903 904#ifdef LN 905 
LDF [AO + 3 * SIZE], a1 906 LDF [AO + 2 * SIZE], a2 907 LDF [AO + 0 * SIZE], a3 908 909 FMUL a1, c02, c02 910 FMUL a1, c04, c04 911 912 FNMSUB (aa2, cc02, cc01, cc01) 913 FNMSUB (aa2, cc04, cc03, cc03) 914 915 FMUL a3, c01, c01 916 FMUL a3, c03, c03 917#endif 918 919#ifdef LT 920 LDF [AO + 0 * SIZE], a1 921 LDF [AO + 1 * SIZE], a2 922 LDF [AO + 3 * SIZE], a3 923 924 FMUL a1, c01, c01 925 FMUL a1, c03, c03 926 927 FNMSUB (aa2, cc01, cc02, cc02) 928 FNMSUB (aa2, cc03, cc04, cc04) 929 930 FMUL a3, c02, c02 931 FMUL a3, c04, c04 932#endif 933 934#ifdef RN 935 LDF [BO + 0 * SIZE], a1 936 LDF [BO + 1 * SIZE], a2 937 938 FMUL a1, c01, c01 939 FMUL a1, c02, c02 940 941 FNMSUB (aa2, cc01, cc03, cc03) 942 FNMSUB (aa2, cc02, cc04, cc04) 943 944 LDF [BO + 3 * SIZE], a1 945 946 FMUL a1, c03, c03 947 FMUL a1, c04, c04 948#endif 949 950#ifdef RT 951 LDF [BO + 3 * SIZE], a1 952 LDF [BO + 2 * SIZE], a2 953 954 FMUL a1, c04, c04 955 FMUL a1, c03, c03 956 957 FNMSUB (aa2, cc04, cc02, cc02) 958 FNMSUB (aa2, cc03, cc01, cc01) 959 960 LDF [BO + 0 * SIZE], a1 961 962 FMUL a1, c02, c02 963 FMUL a1, c01, c01 964#endif 965 966#ifdef LN 967 add C1, -2 * SIZE, C1 968 add C2, -2 * SIZE, C2 969#endif 970 971#if defined(LN) || defined(LT) 972 STF c01, [BO + 0 * SIZE] 973 STF c03, [BO + 1 * SIZE] 974 STF c02, [BO + 2 * SIZE] 975 STF c04, [BO + 3 * SIZE] 976#else 977 STF c01, [AO + 0 * SIZE] 978 STF c02, [AO + 1 * SIZE] 979 STF c03, [AO + 2 * SIZE] 980 STF c04, [AO + 3 * SIZE] 981#endif 982 983 STF c01, [C1 + 0 * SIZE] 984 STF c02, [C1 + 1 * SIZE] 985 STF c03, [C2 + 0 * SIZE] 986 STF c04, [C2 + 1 * SIZE] 987 988#ifndef LN 989 add C1, 2 * SIZE, C1 990 add C2, 2 * SIZE, C2 991#endif 992 993#ifdef RT 994 sll K, BASE_SHIFT + 1, TEMP1 995 add AORIG, TEMP1, AORIG 996#endif 997 998#if defined(LT) || defined(RN) 999 sub K, KK, TEMP1 1000 sll TEMP1, BASE_SHIFT + 1, TEMP2 1001 sll TEMP1, BASE_SHIFT + 1, TEMP1 1002 add AO, TEMP2, AO 1003 add BO, TEMP1, BO 1004#endif 1005 1006#ifdef LT 1007 add KK, 2, KK 
1008#endif 1009 1010#ifdef LN 1011 sub KK, 2, KK 1012#endif 1013 1014 add I, -1, I 1015 cmp I, 0 1016 bg,pt %icc, .LL52 1017 nop 1018 .align 4 1019 1020.LL60: 1021 and M, 1, I 1022 cmp I, 0 1023 ble,pn %icc, .LL69 1024 nop 1025 1026#if defined(LT) || defined(RN) 1027 mov B, BO 1028#else 1029#ifdef LN 1030 sll K, BASE_SHIFT + 0, TEMP1 1031 sub AORIG, TEMP1, AORIG 1032#endif 1033 1034 sll KK, BASE_SHIFT + 0, TEMP1 1035 sll KK, BASE_SHIFT + 1, TEMP2 1036 1037 add AORIG, TEMP1, AO 1038 add B, TEMP2, BO 1039#endif 1040 1041 LDF [AO + 0 * SIZE], a1 1042 LDF [AO + 1 * SIZE], a2 1043 LDF [AO + 2 * SIZE], a3 1044 LDF [AO + 3 * SIZE], a4 1045 1046 LDF [BO + 0 * SIZE], b1 1047 LDF [BO + 1 * SIZE], b2 1048 LDF [BO + 2 * SIZE], b3 1049 LDF [BO + 3 * SIZE], b4 1050 LDF [BO + 4 * SIZE], b5 1051 LDF [BO + 5 * SIZE], b6 1052 LDF [BO + 6 * SIZE], b7 1053 FCLR (cc01) 1054 LDF [BO + 7 * SIZE], b8 1055 FCLR (cc03) 1056 1057#if defined(LT) || defined(RN) 1058 sra KK, 2, L 1059#else 1060 sub K, KK, L 1061 sra L, 2, L 1062#endif 1063 cmp L, 0 1064 ble,pn %icc, .LL65 1065 nop 1066 .align 4 1067 1068.LL63: 1069 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1070 add L, -1, L 1071 1072 FMADD (aa1, bb1, cc01, cc01) 1073 LDF [BO + 8 * SIZE], b1 1074 FMADD (aa1, bb2, cc03, cc03) 1075 LDF [BO + 9 * SIZE], b2 1076 1077 LDF [AO + 4 * SIZE], a1 1078 cmp L, 0 1079 1080 FMADD (aa2, bb3, cc01, cc01) 1081 LDF [BO + 10 * SIZE], b3 1082 FMADD (aa2, bb4, cc03, cc03) 1083 LDF [BO + 11 * SIZE], b4 1084 1085 LDF [AO + 5 * SIZE], a2 1086 add AO, 4 * SIZE, AO 1087 1088 FMADD (aa3, bb5, cc01, cc01) 1089 LDF [BO + 12 * SIZE], b5 1090 FMADD (aa3, bb6, cc03, cc03) 1091 LDF [BO + 13 * SIZE], b6 1092 1093 LDF [AO + 2 * SIZE], a3 1094 add BO, 8 * SIZE, BO 1095 1096 FMADD (aa4, bb7, cc01, cc01) 1097 LDF [BO + 6 * SIZE], b7 1098 FMADD (aa4, bb8, cc03, cc03) 1099 LDF [BO + 7 * SIZE], b8 1100 1101 bg,pt %icc, .LL63 1102 LDF [AO + 3 * SIZE], a4 1103 .align 4 1104 1105.LL65: 1106#if defined(LT) || 
defined(RN) 1107 and KK, 3, L 1108#else 1109 sub K, KK, L 1110 and L, 3, L 1111#endif 1112 cmp L, 0 1113 ble,a,pn %icc, .LL68 1114 nop 1115 .align 4 1116 1117.LL67: 1118 FMADD (aa1, bb1, cc01, cc01) 1119 LDF [BO + 2 * SIZE], b1 1120 FMADD (aa1, bb2, cc03, cc03) 1121 LDF [BO + 3 * SIZE], b2 1122 1123 LDF [AO + 1 * SIZE], a1 1124 add L, -1, L 1125 add AO, 1 * SIZE, AO 1126 cmp L, 0 1127 1128 bg,pt %icc, .LL67 1129 add BO, 2 * SIZE, BO 1130 .align 4 1131 1132.LL68: 1133#if defined(LN) || defined(RT) 1134#ifdef LN 1135 sub KK, 1, TEMP1 1136#else 1137 sub KK, 2, TEMP1 1138#endif 1139 sll TEMP1, BASE_SHIFT + 0, TEMP2 1140 sll TEMP1, BASE_SHIFT + 1, TEMP1 1141 1142 add AORIG, TEMP2, AO 1143 add B, TEMP1, BO 1144#endif 1145 1146#if defined(LN) || defined(LT) 1147 LDF [BO + 0 * SIZE], a1 1148 LDF [BO + 1 * SIZE], a2 1149 1150 FSUB a1, c01, c01 1151 FSUB a2, c03, c03 1152#else 1153 LDF [AO + 0 * SIZE], a1 1154 LDF [AO + 1 * SIZE], a2 1155 1156 FSUB a1, c01, c01 1157 FSUB a2, c03, c03 1158#endif 1159 1160#if defined(LN) || defined(LT) 1161 LDF [AO + 0 * SIZE], a1 1162 1163 FMUL a1, c01, c01 1164 FMUL a1, c03, c03 1165#endif 1166 1167#ifdef RN 1168 LDF [BO + 0 * SIZE], a1 1169 LDF [BO + 1 * SIZE], a2 1170 1171 FMUL a1, c01, c01 1172 1173 FNMSUB (aa2, cc01, cc03, cc03) 1174 1175 LDF [BO + 3 * SIZE], a1 1176 1177 FMUL a1, c03, c03 1178#endif 1179 1180#ifdef RT 1181 LDF [BO + 3 * SIZE], a1 1182 LDF [BO + 2 * SIZE], a2 1183 1184 FMUL a1, c03, c03 1185 1186 FNMSUB (aa2, cc03, cc01, cc01) 1187 1188 LDF [BO + 0 * SIZE], a1 1189 1190 FMUL a1, c01, c01 1191#endif 1192 1193#ifdef LN 1194 add C1, -1 * SIZE, C1 1195 add C2, -1 * SIZE, C2 1196#endif 1197 1198#if defined(LN) || defined(LT) 1199 STF c01, [BO + 0 * SIZE] 1200 STF c03, [BO + 1 * SIZE] 1201#else 1202 STF c01, [AO + 0 * SIZE] 1203 STF c03, [AO + 1 * SIZE] 1204#endif 1205 1206 STF c01, [C1 + 0 * SIZE] 1207 STF c03, [C2 + 0 * SIZE] 1208 1209#ifdef RT 1210 sll K, BASE_SHIFT + 0, TEMP1 1211 add AORIG, TEMP1, AORIG 1212#endif 1213 
1214#if defined(LT) || defined(RN) 1215 sub K, KK, TEMP1 1216 sll TEMP1, BASE_SHIFT + 0, TEMP2 1217 sll TEMP1, BASE_SHIFT + 1, TEMP1 1218 add AO, TEMP2, AO 1219 add BO, TEMP1, BO 1220#endif 1221 1222#ifdef LT 1223 add KK, 1, KK 1224#endif 1225 1226#ifdef LN 1227 sub KK, 1, KK 1228#endif 1229 .align 4 1230 1231.LL69: 1232#ifdef LN 1233 sll K, BASE_SHIFT + 1, TEMP1 1234 add B, TEMP1, B 1235#endif 1236 1237#if defined(LT) || defined(RN) 1238 mov BO, B 1239#endif 1240 1241#ifdef RN 1242 add KK, 2, KK 1243#endif 1244 1245#ifdef RT 1246 sub KK, 2, KK 1247#endif 1248 .align 4 1249 1250.LL30: 1251 and N, 4, J 1252 cmp J, 0 1253 ble,pn %icc, .LL10 1254 nop 1255 1256#ifdef RT 1257 sll K, BASE_SHIFT + 2, TEMP1 1258 sub B, TEMP1, B 1259#endif 1260 1261#ifndef RT 1262 mov C, C1 1263 add C, LDC, C2 1264 add C2, LDC, C3 1265 add C3, LDC, C4 1266 add C4, LDC, C 1267#else 1268 sub C, LDC, C4 1269 sub C4, LDC, C3 1270 sub C3, LDC, C2 1271 sub C2, LDC, C1 1272 sub C2, LDC, C 1273#endif 1274 1275#ifdef LN 1276 add M, OFFSET, KK 1277#endif 1278 1279#ifdef LT 1280 mov OFFSET, KK 1281#endif 1282 1283#if defined(LN) || defined(RT) 1284 mov A, AORIG 1285#else 1286 mov A, AO 1287#endif 1288 1289 sra M, 1, I 1290 cmp I, 0 1291 ble,pn %icc, .LL40 1292 nop 1293 .align 4 1294 1295.LL32: 1296#if defined(LT) || defined(RN) 1297 mov B, BO 1298#else 1299#ifdef LN 1300 sll K, BASE_SHIFT + 1, TEMP1 1301 sub AORIG, TEMP1, AORIG 1302#endif 1303 1304 sll KK, BASE_SHIFT + 1, TEMP1 1305 sll KK, BASE_SHIFT + 2, TEMP2 1306 1307 add AORIG, TEMP1, AO 1308 add B, TEMP2, BO 1309#endif 1310 1311 LDF [AO + 0 * SIZE], a1 1312 LDF [AO + 1 * SIZE], a2 1313 1314 LDF [BO + 0 * SIZE], b1 1315 LDF [BO + 1 * SIZE], b2 1316 LDF [BO + 2 * SIZE], b3 1317 LDF [BO + 3 * SIZE], b4 1318 LDF [BO + 4 * SIZE], b5 1319 1320 LDF [BO + 5 * SIZE], b6 1321 FCLR (cc01) 1322 LDF [BO + 6 * SIZE], b7 1323 FCLR (cc02) 1324 LDF [BO + 7 * SIZE], b8 1325 FCLR (cc03) 1326 LDF [BO + 8 * SIZE], b9 1327 FCLR (cc04) 1328 1329 prefetch [C1 + 2 * 
SIZE], 3 1330 FCLR (cc05) 1331 prefetch [C2 + 2 * SIZE], 3 1332 FCLR (cc06) 1333 prefetch [C3 + 2 * SIZE], 3 1334 FCLR (cc07) 1335 prefetch [C4 + 2 * SIZE], 3 1336 FCLR (cc08) 1337 1338#if defined(LT) || defined(RN) 1339 sra KK, 2, L 1340#else 1341 sub K, KK, L 1342 sra L, 2, L 1343#endif 1344 cmp L, 0 1345 ble,pn %icc, .LL35 1346 nop 1347 .align 4 1348 1349.LL33: 1350 FMADD (aa1, bb1, cc01, cc01) 1351 LDF [AO + 2 * SIZE], a3 1352 FMADD (aa2, bb1, cc02, cc02) 1353 LDF [AO + 3 * SIZE], a4 1354 1355 FMADD (aa1, bb2, cc03, cc03) 1356 LDF [BO + 16 * SIZE], b1 1357 FMADD (aa2, bb2, cc04, cc04) 1358 LDF [BO + 9 * SIZE], b2 1359 1360 FMADD (aa1, bb3, cc05, cc05) 1361 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1362 FMADD (aa2, bb3, cc06, cc06) 1363 add L, -1, L 1364 1365 FMADD (aa1, bb4, cc07, cc07) 1366 LDF [BO + 10 * SIZE], b3 1367 FMADD (aa2, bb4, cc08, cc08) 1368 LDF [BO + 11 * SIZE], b4 1369 1370 FMADD (aa3, bb5, cc01, cc01) 1371 LDF [AO + 4 * SIZE], a1 1372 FMADD (aa4, bb5, cc02, cc02) 1373 LDF [AO + 5 * SIZE], a2 1374 1375 FMADD (aa3, bb6, cc03, cc03) 1376 LDF [BO + 12 * SIZE], b5 1377 FMADD (aa4, bb6, cc04, cc04) 1378 LDF [BO + 13 * SIZE], b6 1379 1380 FMADD (aa3, bb7, cc05, cc05) 1381 cmp L, 0 1382 FMADD (aa4, bb7, cc06, cc06) 1383 add AO, 8 * SIZE, AO 1384 1385 FMADD (aa3, bb8, cc07, cc07) 1386 LDF [BO + 14 * SIZE], b7 1387 FMADD (aa4, bb8, cc08, cc08) 1388 LDF [BO + 15 * SIZE], b8 1389 1390 FMADD (aa1, bb9, cc01, cc01) 1391 LDF [AO - 2 * SIZE], a3 1392 FMADD (aa2, bb9, cc02, cc02) 1393 LDF [AO - 1 * SIZE], a4 1394 1395 FMADD (aa1, bb2, cc03, cc03) 1396 LDF [BO + 24 * SIZE], b9 1397 FMADD (aa2, bb2, cc04, cc04) 1398 LDF [BO + 17 * SIZE], b2 1399 1400 FMADD (aa1, bb3, cc05, cc05) 1401 add BO, 16 * SIZE, BO 1402 FMADD (aa2, bb3, cc06, cc06) 1403 nop 1404 1405 FMADD (aa1, bb4, cc07, cc07) 1406 LDF [BO + 2 * SIZE], b3 1407 FMADD (aa2, bb4, cc08, cc08) 1408 LDF [BO + 3 * SIZE], b4 1409 1410 FMADD (aa3, bb5, cc01, cc01) 1411 LDF [AO + 0 * SIZE], a1 
1412 FMADD (aa4, bb5, cc02, cc02) 1413 LDF [AO + 1 * SIZE], a2 1414 FMADD (aa3, bb6, cc03, cc03) 1415 LDF [BO + 4 * SIZE], b5 1416 FMADD (aa4, bb6, cc04, cc04) 1417 LDF [BO + 5 * SIZE], b6 1418 1419 FMADD (aa3, bb7, cc05, cc05) 1420 nop 1421 FMADD (aa4, bb7, cc06, cc06) 1422 LDF [BO + 6 * SIZE], b7 1423 1424 FMADD (aa3, bb8, cc07, cc07) 1425 FMADD (aa4, bb8, cc08, cc08) 1426 bg,pt %icc, .LL33 1427 LDF [BO + 7 * SIZE], b8 1428 .align 4 1429 1430.LL35: 1431#if defined(LT) || defined(RN) 1432 and KK, 3, L 1433#else 1434 sub K, KK, L 1435 and L, 3, L 1436#endif 1437 cmp L, 0 1438 ble,a,pn %icc, .LL38 1439 nop 1440 .align 4 1441 1442.LL37: 1443 FMADD (aa1, bb1, cc01, cc01) 1444 add L, -1, L 1445 FMADD (aa2, bb1, cc02, cc02) 1446 LDF [BO + 4 * SIZE], b1 1447 1448 FMADD (aa1, bb2, cc03, cc03) 1449 add AO, 2 * SIZE, AO 1450 FMADD (aa2, bb2, cc04, cc04) 1451 LDF [BO + 5 * SIZE], b2 1452 1453 FMADD (aa1, bb3, cc05, cc05) 1454 cmp L, 0 1455 FMADD (aa2, bb3, cc06, cc06) 1456 LDF [BO + 6 * SIZE], b3 1457 1458 FMADD (aa1, bb4, cc07, cc07) 1459 LDF [AO + 0 * SIZE], a1 1460 FMADD (aa2, bb4, cc08, cc08) 1461 LDF [AO + 1 * SIZE], a2 1462 1463 LDF [BO + 7 * SIZE], b4 1464 bg,pt %icc, .LL37 1465 add BO, 4 * SIZE, BO 1466 .align 4 1467 1468.LL38: 1469#if defined(LN) || defined(RT) 1470#ifdef LN 1471 sub KK, 2, TEMP1 1472#else 1473 sub KK, 4, TEMP1 1474#endif 1475 sll TEMP1, BASE_SHIFT + 1, TEMP2 1476 sll TEMP1, BASE_SHIFT + 2, TEMP1 1477 1478 add AORIG, TEMP2, AO 1479 add B, TEMP1, BO 1480#endif 1481 1482#if defined(LN) || defined(LT) 1483 LDF [BO + 0 * SIZE], a1 1484 LDF [BO + 1 * SIZE], a2 1485 LDF [BO + 2 * SIZE], a3 1486 LDF [BO + 3 * SIZE], a4 1487 1488 LDF [BO + 4 * SIZE], b1 1489 LDF [BO + 5 * SIZE], b2 1490 LDF [BO + 6 * SIZE], b3 1491 LDF [BO + 7 * SIZE], b4 1492 1493 FSUB a1, c01, c01 1494 FSUB a2, c03, c03 1495 FSUB a3, c05, c05 1496 FSUB a4, c07, c07 1497 1498 FSUB b1, c02, c02 1499 FSUB b2, c04, c04 1500 FSUB b3, c06, c06 1501 FSUB b4, c08, c08 1502#else 1503 LDF [AO + 0 * 
SIZE], a1 1504 LDF [AO + 1 * SIZE], a2 1505 LDF [AO + 2 * SIZE], a3 1506 LDF [AO + 3 * SIZE], a4 1507 1508 LDF [AO + 4 * SIZE], b1 1509 LDF [AO + 5 * SIZE], b2 1510 LDF [AO + 6 * SIZE], b3 1511 LDF [AO + 7 * SIZE], b4 1512 1513 FSUB a1, c01, c01 1514 FSUB a2, c02, c02 1515 FSUB a3, c03, c03 1516 FSUB a4, c04, c04 1517 1518 FSUB b1, c05, c05 1519 FSUB b2, c06, c06 1520 FSUB b3, c07, c07 1521 FSUB b4, c08, c08 1522 1523#endif 1524 1525#ifdef LN 1526 LDF [AO + 3 * SIZE], a1 1527 LDF [AO + 2 * SIZE], a2 1528 LDF [AO + 0 * SIZE], a3 1529 1530 FMUL a1, c02, c02 1531 FMUL a1, c04, c04 1532 FMUL a1, c06, c06 1533 FMUL a1, c08, c08 1534 1535 FNMSUB (aa2, cc02, cc01, cc01) 1536 FNMSUB (aa2, cc04, cc03, cc03) 1537 FNMSUB (aa2, cc06, cc05, cc05) 1538 FNMSUB (aa2, cc08, cc07, cc07) 1539 1540 FMUL a3, c01, c01 1541 FMUL a3, c03, c03 1542 FMUL a3, c05, c05 1543 FMUL a3, c07, c07 1544#endif 1545 1546#ifdef LT 1547 LDF [AO + 0 * SIZE], a1 1548 LDF [AO + 1 * SIZE], a2 1549 LDF [AO + 3 * SIZE], a3 1550 1551 FMUL a1, c01, c01 1552 FMUL a1, c03, c03 1553 FMUL a1, c05, c05 1554 FMUL a1, c07, c07 1555 1556 FNMSUB (aa2, cc01, cc02, cc02) 1557 FNMSUB (aa2, cc03, cc04, cc04) 1558 FNMSUB (aa2, cc05, cc06, cc06) 1559 FNMSUB (aa2, cc07, cc08, cc08) 1560 1561 FMUL a3, c02, c02 1562 FMUL a3, c04, c04 1563 FMUL a3, c06, c06 1564 FMUL a3, c08, c08 1565#endif 1566 1567#ifdef RN 1568 LDF [BO + 0 * SIZE], a1 1569 LDF [BO + 1 * SIZE], a2 1570 LDF [BO + 2 * SIZE], a3 1571 LDF [BO + 3 * SIZE], a4 1572 1573 FMUL a1, c01, c01 1574 FMUL a1, c02, c02 1575 1576 FNMSUB (aa2, cc01, cc03, cc03) 1577 FNMSUB (aa2, cc02, cc04, cc04) 1578 FNMSUB (aa3, cc01, cc05, cc05) 1579 FNMSUB (aa3, cc02, cc06, cc06) 1580 FNMSUB (aa4, cc01, cc07, cc07) 1581 FNMSUB (aa4, cc02, cc08, cc08) 1582 1583 LDF [BO + 5 * SIZE], a1 1584 LDF [BO + 6 * SIZE], a2 1585 LDF [BO + 7 * SIZE], a3 1586 1587 FMUL a1, c03, c03 1588 FMUL a1, c04, c04 1589 1590 FNMSUB (aa2, cc03, cc05, cc05) 1591 FNMSUB (aa2, cc04, cc06, cc06) 1592 FNMSUB (aa3, cc03, 
cc07, cc07) 1593 FNMSUB (aa3, cc04, cc08, cc08) 1594 1595 LDF [BO + 10 * SIZE], a1 1596 LDF [BO + 11 * SIZE], a2 1597 1598 FMUL a1, c05, c05 1599 FMUL a1, c06, c06 1600 1601 FNMSUB (aa2, cc05, cc07, cc07) 1602 FNMSUB (aa2, cc06, cc08, cc08) 1603 1604 LDF [BO + 15 * SIZE], a1 1605 1606 FMUL a1, c07, c07 1607 FMUL a1, c08, c08 1608#endif 1609 1610#ifdef RT 1611 LDF [BO + 15 * SIZE], a1 1612 LDF [BO + 14 * SIZE], a2 1613 LDF [BO + 13 * SIZE], a3 1614 LDF [BO + 12 * SIZE], a4 1615 1616 FMUL a1, c08, c08 1617 FMUL a1, c07, c07 1618 1619 FNMSUB (aa2, cc08, cc06, cc06) 1620 FNMSUB (aa2, cc07, cc05, cc05) 1621 FNMSUB (aa3, cc08, cc04, cc04) 1622 FNMSUB (aa3, cc07, cc03, cc03) 1623 FNMSUB (aa4, cc08, cc02, cc02) 1624 FNMSUB (aa4, cc07, cc01, cc01) 1625 1626 LDF [BO + 10 * SIZE], a1 1627 LDF [BO + 9 * SIZE], a2 1628 LDF [BO + 8 * SIZE], a3 1629 1630 FMUL a1, c06, c06 1631 FMUL a1, c05, c05 1632 1633 FNMSUB (aa2, cc06, cc04, cc04) 1634 FNMSUB (aa2, cc05, cc03, cc03) 1635 FNMSUB (aa3, cc06, cc02, cc02) 1636 FNMSUB (aa3, cc05, cc01, cc01) 1637 1638 LDF [BO + 5 * SIZE], a1 1639 LDF [BO + 4 * SIZE], a2 1640 1641 FMUL a1, c04, c04 1642 FMUL a1, c03, c03 1643 1644 FNMSUB (aa2, cc04, cc02, cc02) 1645 FNMSUB (aa2, cc03, cc01, cc01) 1646 1647 LDF [BO + 0 * SIZE], a1 1648 1649 FMUL a1, c02, c02 1650 FMUL a1, c01, c01 1651#endif 1652 1653#ifdef LN 1654 add C1, -2 * SIZE, C1 1655 add C2, -2 * SIZE, C2 1656 add C3, -2 * SIZE, C3 1657 add C4, -2 * SIZE, C4 1658#endif 1659 1660#if defined(LN) || defined(LT) 1661 STF c01, [BO + 0 * SIZE] 1662 STF c03, [BO + 1 * SIZE] 1663 STF c05, [BO + 2 * SIZE] 1664 STF c07, [BO + 3 * SIZE] 1665 1666 STF c02, [BO + 4 * SIZE] 1667 STF c04, [BO + 5 * SIZE] 1668 STF c06, [BO + 6 * SIZE] 1669 STF c08, [BO + 7 * SIZE] 1670#else 1671 STF c01, [AO + 0 * SIZE] 1672 STF c02, [AO + 1 * SIZE] 1673 STF c03, [AO + 2 * SIZE] 1674 STF c04, [AO + 3 * SIZE] 1675 1676 STF c05, [AO + 4 * SIZE] 1677 STF c06, [AO + 5 * SIZE] 1678 STF c07, [AO + 6 * SIZE] 1679 STF c08, [AO + 7 
* SIZE]
1680#endif
1681
/* Store the eight results to C: two consecutive elements through each of C1..C4. */
1682 STF c01, [C1 + 0 * SIZE]
1683 STF c02, [C1 + 1 * SIZE]
1684 STF c03, [C2 + 0 * SIZE]
1685 STF c04, [C2 + 1 * SIZE]
1686
1687 STF c05, [C3 + 0 * SIZE]
1688 STF c06, [C3 + 1 * SIZE]
1689 STF c07, [C4 + 0 * SIZE]
1690 STF c08, [C4 + 1 * SIZE]
1691
/* Every variant except LN advances C1..C4 by two elements. */
1692#ifndef LN
1693 add C1, 2 * SIZE, C1
1694 add C2, 2 * SIZE, C2
1695 add C3, 2 * SIZE, C3
1696 add C4, 2 * SIZE, C4
1697#endif
1698
/* RT: AORIG += K << (BASE_SHIFT + 1). */
1699#ifdef RT
1700 sll K, BASE_SHIFT + 1, TEMP1
1701 add AORIG, TEMP1, AORIG
1702#endif
1703
/*
 * LT/RN: skip the remaining (K - KK) steps of both panels:
 * AO += (K - KK) << (BASE_SHIFT + 1), BO += (K - KK) << (BASE_SHIFT + 2).
 */
1704#if defined(LT) || defined(RN)
1705 sub K, KK, TEMP1
1706 sll TEMP1, BASE_SHIFT + 1, TEMP2
1707 sll TEMP1, BASE_SHIFT + 2, TEMP1
1708 add AO, TEMP2, AO
1709 add BO, TEMP1, BO
1710#endif
1711
1712#ifdef LT
1713 add KK, 2, KK
1714#endif
1715
1716#ifdef LN
1717 sub KK, 2, KK
1718#endif
1719
/* Loop control: decrement I and branch back to .LL32 while I > 0 (the nop fills the branch delay slot). */
1720 add I, -1, I
1721 cmp I, 0
1722 bg,pt %icc, .LL32
1723 nop
1724
/*
 * .LL40: I = M & 1.  When it is zero branch to .LL49; otherwise run the
 * single-element pass below, which uses accumulators c01, c03, c05, c07.
 */
1725.LL40:
1726 and M, 1, I
1727 cmp I, 0
1728 ble,pn %icc, .LL49
1729 nop
1730
/*
 * Starting pointers.  LT/RN restart BO at B (AO is left as it is).  The other
 * variants set AO = AORIG + (KK << BASE_SHIFT) and
 * BO = B + (KK << (BASE_SHIFT + 2)); LN first moves AORIG back by
 * K << BASE_SHIFT.
 */
1731#if defined(LT) || defined(RN)
1732 mov B, BO
1733#else
1734#ifdef LN
1735 sll K, BASE_SHIFT + 0, TEMP1
1736 sub AORIG, TEMP1, AORIG
1737#endif
1738
1739 sll KK, BASE_SHIFT + 0, TEMP1
1740 sll KK, BASE_SHIFT + 2, TEMP2
1741
1742 add AORIG, TEMP1, AO
1743 add B, TEMP2, BO
1744#endif
1745
/* Preload a1..a4 and b1..b9, interleaved with clearing the four accumulators. */
1746 LDF [AO + 0 * SIZE], a1
1747 LDF [AO + 1 * SIZE], a2
1748 LDF [AO + 2 * SIZE], a3
1749 LDF [AO + 3 * SIZE], a4
1750
1751 LDF [BO + 0 * SIZE], b1
1752 LDF [BO + 1 * SIZE], b2
1753 LDF [BO + 2 * SIZE], b3
1754 LDF [BO + 3 * SIZE], b4
1755 LDF [BO + 4 * SIZE], b5
1756 LDF [BO + 5 * SIZE], b6
1757 FCLR (cc01)
1758 LDF [BO + 6 * SIZE], b7
1759 FCLR (cc03)
1760 LDF [BO + 7 * SIZE], b8
1761 FCLR (cc05)
1762 LDF [BO + 8 * SIZE], b9
1763 FCLR (cc07)
1764
/* L = number of four-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise; go to .LL45 when there are none. */
1765#if defined(LT) || defined(RN)
1766 sra KK, 2, L
1767#else
1768 sub K, KK, L
1769 sra L, 2, L
1770#endif
1771 cmp L, 0
1772 ble,pn %icc, .LL45
1773 nop
1774
/* .LL43: main loop, four steps per pass; L is decremented up front. */
1775.LL43:
1776 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
1777 add L, -1, L
1778
1779 FMADD (aa1, bb1, cc01, cc01)
1780 LDF
[BO + 16 * SIZE], b1
/*
 * Body of the .LL43 loop: four multiply-accumulate steps per pass (a1, a2,
 * a3, a4, each against four b values) into c01/c03/c05/c07.  Each FMADD is
 * paired with a load that refills a b register for a later step.  Per pass AO
 * advances by 4 elements and BO by 16.
 * NOTE(review): FMADD (x, y, z, z) presumably computes z = z + x * y --
 * confirm in common.h.
 */
1781 FMADD (aa1, bb2, cc03, cc03)
1782 LDF [BO + 9 * SIZE], b2
1783 FMADD (aa1, bb3, cc05, cc05)
1784 LDF [BO + 10 * SIZE], b3
1785 FMADD (aa1, bb4, cc07, cc07)
1786 LDF [BO + 11 * SIZE], b4
1787
/* The cmp sets the condition codes consumed by the bg at the bottom of the loop. */
1788 LDF [AO + 4 * SIZE], a1
1789 cmp L, 0
1790
1791 FMADD (aa2, bb5, cc01, cc01)
1792 LDF [BO + 12 * SIZE], b5
1793 FMADD (aa2, bb6, cc03, cc03)
1794 LDF [BO + 13 * SIZE], b6
1795 FMADD (aa2, bb7, cc05, cc05)
1796 LDF [BO + 14 * SIZE], b7
1797 FMADD (aa2, bb8, cc07, cc07)
1798 LDF [BO + 15 * SIZE], b8
1799
1800 LDF [AO + 5 * SIZE], a2
1801 add AO, 4 * SIZE, AO
1802
1803 FMADD (aa3, bb9, cc01, cc01)
1804 LDF [BO + 24 * SIZE], b9
1805 FMADD (aa3, bb2, cc03, cc03)
1806 LDF [BO + 17 * SIZE], b2
1807 FMADD (aa3, bb3, cc05, cc05)
1808 LDF [BO + 18 * SIZE], b3
1809 FMADD (aa3, bb4, cc07, cc07)
1810 LDF [BO + 19 * SIZE], b4
1811
1812 LDF [AO + 2 * SIZE], a3
1813 add BO, 16 * SIZE, BO
1814
1815 FMADD (aa4, bb5, cc01, cc01)
1816 LDF [BO + 4 * SIZE], b5
1817 FMADD (aa4, bb6, cc03, cc03)
1818 LDF [BO + 5 * SIZE], b6
1819 FMADD (aa4, bb7, cc05, cc05)
1820 LDF [BO + 6 * SIZE], b7
1821 FMADD (aa4, bb8, cc07, cc07)
1822 LDF [BO + 7 * SIZE], b8
1823
/* Back-edge; the a4 reload sits in the branch delay slot. */
1824 bg,pt %icc, .LL43
1825 LDF [AO + 3 * SIZE], a4
1826 .align 4
1827
/* .LL45: remainder count L = KK & 3 (LT/RN) or (K - KK) & 3; go to .LL48 when it is zero. */
1828.LL45:
1829#if defined(LT) || defined(RN)
1830 and KK, 3, L
1831#else
1832 sub K, KK, L
1833 and L, 3, L
1834#endif
1835 cmp L, 0
1836 ble,a,pn %icc, .LL48
1837 nop
1838 .align 4
1839
/* .LL47: one step per pass; AO advances by 1 element and BO by 4. */
1840.LL47:
1841 FMADD (aa1, bb1, cc01, cc01)
1842 LDF [BO + 4 * SIZE], b1
1843 add L, -1, L
1844 FMADD (aa1, bb2, cc03, cc03)
1845 LDF [BO + 5 * SIZE], b2
1846 add AO, 1 * SIZE, AO
1847
1848 FMADD (aa1, bb3, cc05, cc05)
1849 LDF [BO + 6 * SIZE], b3
1850 cmp L, 0
1851 FMADD (aa1, bb4, cc07, cc07)
1852 LDF [BO + 7 * SIZE], b4
1853 add BO, 4 * SIZE, BO
1854
/* The a1 reload for the next pass sits in the branch delay slot. */
1855 bg,pt %icc, .LL47
1856 LDF [AO + 0 * SIZE], a1
1857 .align 4
1858
/*
 * .LL48: LN/RT re-point AO and BO before the solve.  TEMP1 = KK - 1 for LN or
 * KK - 4 for RT; it is then scaled by BASE_SHIFT + 0 for AO and
 * BASE_SHIFT + 2 for BO.
 */
1859.LL48:
1860#if defined(LN) || defined(RT)
1861#ifdef LN
1862 sub KK, 1, TEMP1
1863#else
1864 sub KK, 4, TEMP1
1865#endif
1866 sll TEMP1,
BASE_SHIFT + 0, TEMP2
1867 sll TEMP1, BASE_SHIFT + 2, TEMP1
1868
1869 add AORIG, TEMP2, AO
1870 add B, TEMP1, BO
1871#endif
1872
/*
 * Load the four stored values (from BO for LN/LT, from AO otherwise) and
 * combine them with the accumulated products in c01/c03/c05/c07.
 * NOTE(review): with the usual SPARC operand order FSUB x, c, c gives
 * c = x - c -- confirm the macro expansion in common.h.
 */
1873#if defined(LN) || defined(LT)
1874 LDF [BO + 0 * SIZE], a1
1875 LDF [BO + 1 * SIZE], a2
1876 LDF [BO + 2 * SIZE], a3
1877 LDF [BO + 3 * SIZE], a4
1878
1879 FSUB a1, c01, c01
1880 FSUB a2, c03, c03
1881 FSUB a3, c05, c05
1882 FSUB a4, c07, c07
1883#else
1884 LDF [AO + 0 * SIZE], a1
1885 LDF [AO + 1 * SIZE], a2
1886 LDF [AO + 2 * SIZE], a3
1887 LDF [AO + 3 * SIZE], a4
1888
1889 FSUB a1, c01, c01
1890 FSUB a2, c03, c03
1891 FSUB a3, c05, c05
1892 FSUB a4, c07, c07
1893#endif
1894
/*
 * LN/LT: a single coefficient from AO scales all four results.
 * NOTE(review): the code multiplies rather than divides, so the stored
 * coefficient is presumably already inverted by the packing routine.
 */
1895#if defined(LN) || defined(LT)
1896 LDF [AO + 0 * SIZE], a1
1897
1898 FMUL a1, c01, c01
1899 FMUL a1, c03, c03
1900 FMUL a1, c05, c05
1901 FMUL a1, c07, c07
1902#endif
1903
/*
 * RN: forward pass over the 4x4 coefficient block at BO (offsets 0, 5, 10,
 * 15 scale; the offsets that follow each of them eliminate).
 * NOTE(review): FNMSUB (x, y, z, z) presumably computes z = z - x * y.
 */
1904#ifdef RN
1905 LDF [BO + 0 * SIZE], a1
1906 LDF [BO + 1 * SIZE], a2
1907 LDF [BO + 2 * SIZE], a3
1908 LDF [BO + 3 * SIZE], a4
1909
1910 FMUL a1, c01, c01
1911
1912 FNMSUB (aa2, cc01, cc03, cc03)
1913 FNMSUB (aa3, cc01, cc05, cc05)
1914 FNMSUB (aa4, cc01, cc07, cc07)
1915
1916 LDF [BO + 5 * SIZE], a1
1917 LDF [BO + 6 * SIZE], a2
1918 LDF [BO + 7 * SIZE], a3
1919
1920 FMUL a1, c03, c03
1921
1922 FNMSUB (aa2, cc03, cc05, cc05)
1923 FNMSUB (aa3, cc03, cc07, cc07)
1924
1925 LDF [BO + 10 * SIZE], a1
1926 LDF [BO + 11 * SIZE], a2
1927
1928 FMUL a1, c05, c05
1929
1930 FNMSUB (aa2, cc05, cc07, cc07)
1931
1932 LDF [BO + 15 * SIZE], a1
1933
1934 FMUL a1, c07, c07
1935#endif
1936
/* RT: the same block walked backwards, from offset 15 down to offset 0 (c07 first, c01 last). */
1937#ifdef RT
1938 LDF [BO + 15 * SIZE], a1
1939 LDF [BO + 14 * SIZE], a2
1940 LDF [BO + 13 * SIZE], a3
1941 LDF [BO + 12 * SIZE], a4
1942
1943 FMUL a1, c07, c07
1944
1945 FNMSUB (aa2, cc07, cc05, cc05)
1946 FNMSUB (aa3, cc07, cc03, cc03)
1947 FNMSUB (aa4, cc07, cc01, cc01)
1948
1949 LDF [BO + 10 * SIZE], a1
1950 LDF [BO + 9 * SIZE], a2
1951 LDF [BO + 8 * SIZE], a3
1952
1953 FMUL a1, c05, c05
1954
1955 FNMSUB (aa2, cc05, cc03, cc03)
1956 FNMSUB (aa3, cc05, cc01, cc01)
1957
1958 LDF [BO + 5 * SIZE], a1
1959 LDF [BO + 4 *
SIZE], a2
1960
/* Last two steps of the RT solve: scale c03, eliminate it from c01, then scale c01. */
1961 FMUL a1, c03, c03
1962
1963 FNMSUB (aa2, cc03, cc01, cc01)
1964
1965 LDF [BO + 0 * SIZE], a1
1966
1967 FMUL a1, c01, c01
1968#endif
1969
/* LN: move C1..C4 back by one element before the stores to C. */
1970#ifdef LN
1971 add C1, -1 * SIZE, C1
1972 add C2, -1 * SIZE, C2
1973 add C3, -1 * SIZE, C3
1974 add C4, -1 * SIZE, C4
1975#endif
1976
/* Write the four solved values back to the packed buffer (BO for LN/LT, AO otherwise). */
1977#if defined(LN) || defined(LT)
1978 STF c01, [BO + 0 * SIZE]
1979 STF c03, [BO + 1 * SIZE]
1980 STF c05, [BO + 2 * SIZE]
1981 STF c07, [BO + 3 * SIZE]
1982#else
1983 STF c01, [AO + 0 * SIZE]
1984 STF c03, [AO + 1 * SIZE]
1985 STF c05, [AO + 2 * SIZE]
1986 STF c07, [AO + 3 * SIZE]
1987#endif
1988
/* Store one element through each of C1..C4. */
1989 STF c01, [C1 + 0 * SIZE]
1990 STF c03, [C2 + 0 * SIZE]
1991 STF c05, [C3 + 0 * SIZE]
1992 STF c07, [C4 + 0 * SIZE]
1993
/* RT: AORIG += K << BASE_SHIFT. */
1994#ifdef RT
1995 sll K, BASE_SHIFT + 0, TEMP1
1996 add AORIG, TEMP1, AORIG
1997#endif
1998
/* LT/RN: skip the remaining (K - KK) steps: AO by (K - KK) << BASE_SHIFT, BO by (K - KK) << (BASE_SHIFT + 2). */
1999#if defined(LT) || defined(RN)
2000 sub K, KK, TEMP1
2001 sll TEMP1, BASE_SHIFT + 0, TEMP2
2002 sll TEMP1, BASE_SHIFT + 2, TEMP1
2003 add AO, TEMP2, AO
2004 add BO, TEMP1, BO
2005#endif
2006
2007#ifdef LT
2008 add KK, 1, KK
2009#endif
2010
2011#ifdef LN
2012 sub KK, 1, KK
2013#endif
2014 .align 4
2015
/*
 * .LL49: bookkeeping after the pass that uses C1..C4.
 *   LN    : B += K << (BASE_SHIFT + 2)
 *   LT/RN : B = BO
 *   RN    : KK += 4        RT : KK -= 4
 */
2016.LL49:
2017#ifdef LN
2018 sll K, BASE_SHIFT + 2, TEMP1
2019 add B, TEMP1, B
2020#endif
2021
2022#if defined(LT) || defined(RN)
2023 mov BO, B
2024#endif
2025
2026#ifdef RN
2027 add KK, 4, KK
2028#endif
2029
2030#ifdef RT
2031 sub KK, 4, KK
2032#endif
2033 .align 4
2034
/* .LL10: J = N >> 3; branch to .LL999 when it is not positive. */
2035.LL10:
2036 sra N, 3, J
2037 cmp J, 0
2038 ble,pn %icc, .LL999
2039 nop
2040 .align 4
2041
/* .LL11: RT first moves B back by K << (BASE_SHIFT + 3). */
2042.LL11:
2043#ifdef RT
2044 sll K, BASE_SHIFT + 3, TEMP1
2045 sub B, TEMP1, B
2046#endif
2047
/*
 * Derive the eight pointers C1..C8, LDC apart.  Non-RT variants count upward
 * from C and leave C at C8 + LDC.  RT counts downward from C (C8 = C - LDC,
 * ..., C1 = C2 - LDC) and leaves C at C2 - LDC, the same value as C1.
 */
2048#ifndef RT
2049 mov C, C1
2050 add C, LDC, C2
2051 add C2, LDC, C3
2052 add C3, LDC, C4
2053 add C4, LDC, C5
2054 add C5, LDC, C6
2055 add C6, LDC, C7
2056 add C7, LDC, C8
2057 add C8, LDC, C
2058#else
2059 sub C, LDC, C8
2060 sub C8, LDC, C7
2061 sub C7, LDC, C6
2062 sub C6, LDC, C5
2063 sub C5, LDC, C4
2064 sub C4, LDC, C3
2065 sub C3, LDC, C2
2066 sub C2, LDC, C1
2067 sub C2, LDC, C
2068#endif
2069
2070#ifdef LN
/* Initialise KK for this pass: M + OFFSET for LN, OFFSET for LT (RN/RT do not touch KK here). */
2071 add M, OFFSET, KK
2072#endif
2073
2074#ifdef LT
2075 mov OFFSET, KK
2076#endif
2077
/* LN/RT keep the panel base in AORIG and derive AO per tile; LT/RN walk AO directly from A. */
2078#if defined(LN) || defined(RT)
2079 mov A, AORIG
2080#else
2081 mov A, AO
2082#endif
2083
/* I = M >> 1 tiles of two elements; go to .LL20 when there are none. */
2084 sra M, 1, I
2085 cmp I, 0
2086 ble,pn %icc, .LL20
2087 nop
2088 .align 4
2089
/*
 * .LL12: per-tile setup.  LT/RN restart BO at B.  The other variants set
 * AO = AORIG + (KK << (BASE_SHIFT + 1)) and BO = B + (KK << (BASE_SHIFT + 3));
 * LN first moves AORIG back by K << (BASE_SHIFT + 1).
 */
2090.LL12:
2091#if defined(LT) || defined(RN)
2092 mov B, BO
2093#else
2094#ifdef LN
2095 sll K, BASE_SHIFT + 1, TEMP1
2096 sub AORIG, TEMP1, AORIG
2097#endif
2098
2099 sll KK, BASE_SHIFT + 1, TEMP1
2100 sll KK, BASE_SHIFT + 3, TEMP2
2101
2102 add AORIG, TEMP1, AO
2103 add B, TEMP2, BO
2104#endif
2105
/* Preload a1, a2 and a5 (AO offset 8), and b1..b9, interleaved with clearing c01..c16. */
2106 LDF [AO + 0 * SIZE], a1
2107 LDF [AO + 1 * SIZE], a2
2108 LDF [AO + 8 * SIZE], a5
2109
2110 LDF [BO + 0 * SIZE], b1
2111
2112 LDF [BO + 1 * SIZE], b2
2113 FCLR (cc01)
2114 LDF [BO + 2 * SIZE], b3
2115 FCLR (cc05)
2116 LDF [BO + 3 * SIZE], b4
2117 FCLR (cc09)
2118 LDF [BO + 4 * SIZE], b5
2119 FCLR (cc13)
2120
2121 LDF [BO + 5 * SIZE], b6
2122 FCLR (cc02)
2123 LDF [BO + 6 * SIZE], b7
2124 FCLR (cc06)
2125 LDF [BO + 7 * SIZE], b8
2126 FCLR (cc10)
2127 LDF [BO + 8 * SIZE], b9
2128 FCLR (cc14)
2129
/*
 * Prefetch the eight destination locations in C.
 * NOTE(review): function code 3 is "prefetch for one write" in the SPARC V9
 * manual -- confirm for the target CPU.
 */
2130 prefetch [C1 + 1 * SIZE], 3
2131 FCLR (cc03)
2132 prefetch [C2 + 2 * SIZE], 3
2133 FCLR (cc07)
2134 prefetch [C3 + 1 * SIZE], 3
2135 FCLR (cc11)
2136 prefetch [C4 + 2 * SIZE], 3
2137 FCLR (cc15)
2138
2139 prefetch [C5 + 1 * SIZE], 3
2140 FCLR (cc04)
2141 prefetch [C6 + 2 * SIZE], 3
2142 FCLR (cc08)
2143 prefetch [C7 + 1 * SIZE], 3
2144 FCLR (cc12)
2145 prefetch [C8 + 2 * SIZE], 3
2146 FCLR (cc16)
2147
/* L = number of eight-step groups: KK >> 3 for LT/RN, (K - KK) >> 3 otherwise; go to .LL15 when there are none. */
2148#if defined(LT) || defined(RN)
2149 sra KK, 3, L
2150#else
2151 sub K, KK, L
2152 sra L, 3, L
2153#endif
2154 cmp L, 0
2155 ble,pn %icc, .LL15
2156 nop
2157 .align 4
2158
/*
 * .LL13: main loop.  Each step multiplies two A values by eight B values and
 * accumulates into c01..c16 (odd-numbered accumulators take the first A
 * value, even-numbered the second).
 * NOTE(review): FMADD (x, y, z, z) presumably computes z = z + x * y.
 */
2159.LL13:
2160 FMADD (aa1, bb1, cc01, cc01)
2161 FMADD (aa2, bb1, cc02, cc02)
2162 FMADD (aa1, bb2, cc03, cc03)
2163 FMADD (aa2, bb2, cc04, cc04)
2164
2165 FMADD (aa1, bb3, cc05, cc05)
2166 LDF [BO + 16 * SIZE], b1
2167 FMADD (aa2, bb3, cc06, cc06)
2168 LDF [BO + 9 * SIZE], b2
2169
2170 FMADD (aa1, bb4, cc07, cc07)
2171 LDF
[BO + 10 * SIZE], b3
/*
 * Interior of the .LL13 loop, first copy of the eight unrolled steps: rest of
 * step 0 (a1/a2), steps 1 and 2, and the start of step 3.  Steps alternate
 * between the a1/a2 and a3/a4 pairs; each pair is reloaded while the other
 * pair is in use, and every b register is refilled as soon as its last use in
 * the step has been issued.  L is decremented once per eight steps (line
 * 2216).
 */
2172 FMADD (aa2, bb4, cc08, cc08)
2173 LDF [BO + 11 * SIZE], b4
2174
2175 FMADD (aa1, bb5, cc09, cc09)
2176 LDF [AO + 2 * SIZE], a3
2177 FMADD (aa2, bb5, cc10, cc10)
2178 LDF [AO + 3 * SIZE], a4
2179
2180 FMADD (aa1, bb6, cc11, cc11)
2181 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
2182 FMADD (aa2, bb6, cc12, cc12)
2183 nop
2184
2185 FMADD (aa1, bb7, cc13, cc13)
2186 LDF [BO + 12 * SIZE], b5
2187 FMADD (aa2, bb7, cc14, cc14)
2188 LDF [BO + 13 * SIZE], b6
2189
2190 FMADD (aa1, bb8, cc15, cc15)
2191 LDF [BO + 14 * SIZE], b7
2192 FMADD (aa2, bb8, cc16, cc16)
2193 LDF [BO + 15 * SIZE], b8
2194
/* Step 1: a3/a4 against BO[8..15] (b9 holds offset 8). */
2195 FMADD (aa3, bb9, cc01, cc01)
2196 FMADD (aa4, bb9, cc02, cc02)
2197 FMADD (aa3, bb2, cc03, cc03)
2198 FMADD (aa4, bb2, cc04, cc04)
2199
2200 FMADD (aa3, bb3, cc05, cc05)
2201 LDF [BO + 24 * SIZE], b9
2202 FMADD (aa4, bb3, cc06, cc06)
2203 LDF [BO + 17 * SIZE], b2
2204
2205 FMADD (aa3, bb4, cc07, cc07)
2206 LDF [BO + 18 * SIZE], b3
2207 FMADD (aa4, bb4, cc08, cc08)
2208 LDF [BO + 19 * SIZE], b4
2209
2210 FMADD (aa3, bb5, cc09, cc09)
2211 LDF [AO + 4 * SIZE], a1
2212 FMADD (aa4, bb5, cc10, cc10)
2213 LDF [AO + 5 * SIZE], a2
2214
2215 FMADD (aa3, bb6, cc11, cc11)
2216 add L, -1, L
2217 FMADD (aa4, bb6, cc12, cc12)
2218 nop
2219
2220 FMADD (aa3, bb7, cc13, cc13)
2221 LDF [BO + 20 * SIZE], b5
2222 FMADD (aa4, bb7, cc14, cc14)
2223 LDF [BO + 21 * SIZE], b6
2224
2225 FMADD (aa3, bb8, cc15, cc15)
2226 LDF [BO + 22 * SIZE], b7
2227 FMADD (aa4, bb8, cc16, cc16)
2228 LDF [BO + 23 * SIZE], b8
2229
/* Step 2: a1/a2 (AO offsets 4, 5) against BO[16..23]. */
2230 FMADD (aa1, bb1, cc01, cc01)
2231 FMADD (aa2, bb1, cc02, cc02)
2232 FMADD (aa1, bb2, cc03, cc03)
2233 FMADD (aa2, bb2, cc04, cc04)
2234
2235 FMADD (aa1, bb3, cc05, cc05)
2236 LDF [BO + 32 * SIZE], b1
2237 FMADD (aa2, bb3, cc06, cc06)
2238 LDF [BO + 25 * SIZE], b2
2239
2240 FMADD (aa1, bb4, cc07, cc07)
2241 LDF [BO + 26 * SIZE], b3
2242 FMADD (aa2, bb4, cc08, cc08)
2243 LDF [BO + 27 * SIZE], b4
2244
2245 FMADD (aa1, bb5, cc09, cc09)
2246 LDF [AO + 6
* SIZE], a3
/*
 * Interior of the .LL13 loop, first copy: rest of step 2, step 3 (a3/a4), and
 * the start of step 4.  Steps 4 and 6 take their first A operand from a5
 * instead of a1, because a1 is reloaded early (AO offset 16) for the next
 * eight-step group.
 */
2247 FMADD (aa2, bb5, cc10, cc10)
2248 LDF [AO + 7 * SIZE], a4
2249
2250 FMADD (aa1, bb6, cc11, cc11)
2251 nop
2252 FMADD (aa2, bb6, cc12, cc12)
2253 nop
2254
2255 FMADD (aa1, bb7, cc13, cc13)
2256 LDF [BO + 28 * SIZE], b5
2257 FMADD (aa2, bb7, cc14, cc14)
2258 LDF [BO + 29 * SIZE], b6
2259
2260 FMADD (aa1, bb8, cc15, cc15)
2261 LDF [BO + 30 * SIZE], b7
2262 FMADD (aa2, bb8, cc16, cc16)
2263 LDF [BO + 31 * SIZE], b8
2264
/* Step 3: a3/a4 (AO offsets 6, 7) against BO[24..31]. */
2265 FMADD (aa3, bb9, cc01, cc01)
2266 FMADD (aa4, bb9, cc02, cc02)
2267 FMADD (aa3, bb2, cc03, cc03)
2268 FMADD (aa4, bb2, cc04, cc04)
2269
2270 FMADD (aa3, bb3, cc05, cc05)
2271 LDF [BO + 40 * SIZE], b9
2272 FMADD (aa4, bb3, cc06, cc06)
2273 LDF [BO + 33 * SIZE], b2
2274
2275 FMADD (aa3, bb4, cc07, cc07)
2276 LDF [BO + 34 * SIZE], b3
2277 FMADD (aa4, bb4, cc08, cc08)
2278 LDF [BO + 35 * SIZE], b4
2279
/* a1 is loaded from AO offset 16, i.e. offset 0 after the later "add AO, 16 * SIZE". */
2280 FMADD (aa3, bb5, cc09, cc09)
2281 LDF [AO + 16 * SIZE], a1 /****/
2282 FMADD (aa4, bb5, cc10, cc10)
2283 LDF [AO + 9 * SIZE], a2
2284
2285 FMADD (aa3, bb6, cc11, cc11)
2286 nop
2287 FMADD (aa4, bb6, cc12, cc12)
2288 nop
2289
2290 FMADD (aa3, bb7, cc13, cc13)
2291 LDF [BO + 36 * SIZE], b5
2292 FMADD (aa4, bb7, cc14, cc14)
2293 LDF [BO + 37 * SIZE], b6
2294
2295 FMADD (aa3, bb8, cc15, cc15)
2296 LDF [BO + 38 * SIZE], b7
2297 FMADD (aa4, bb8, cc16, cc16)
2298 LDF [BO + 39 * SIZE], b8
2299
/* Step 4: a5/a2 (AO offsets 8, 9) against BO[32..39]. */
2300 FMADD (aa5, bb1, cc01, cc01)
2301 FMADD (aa2, bb1, cc02, cc02)
2302 FMADD (aa5, bb2, cc03, cc03)
2303 FMADD (aa2, bb2, cc04, cc04)
2304
2305 FMADD (aa5, bb3, cc05, cc05)
2306 LDF [BO + 48 * SIZE], b1
2307 FMADD (aa2, bb3, cc06, cc06)
2308 LDF [BO + 41 * SIZE], b2
2309
2310 FMADD (aa5, bb4, cc07, cc07)
2311 LDF [BO + 42 * SIZE], b3
2312 FMADD (aa2, bb4, cc08, cc08)
2313 LDF [BO + 43 * SIZE], b4
2314
2315 FMADD (aa5, bb5, cc09, cc09)
2316 LDF [AO + 10 * SIZE], a3
2317 FMADD (aa2, bb5, cc10, cc10)
2318 LDF [AO + 11 * SIZE], a4
2319
2320 FMADD (aa5, bb6, cc11, cc11)
2321 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
2322 FMADD (aa2,
bb6, cc12, cc12)
/*
 * Interior of the .LL13 loop, first copy: rest of step 4, step 5 (a3/a4), and
 * most of step 6 (a5/a2).  The "cmp L, 0" in step 5 sets the condition codes
 * for the early-exit branch issued at the end of step 7; the integer adds in
 * between do not modify them.
 */
2323 nop
2324
2325 FMADD (aa5, bb7, cc13, cc13)
2326 LDF [BO + 44 * SIZE], b5
2327 FMADD (aa2, bb7, cc14, cc14)
2328 LDF [BO + 45 * SIZE], b6
2329
2330 FMADD (aa5, bb8, cc15, cc15)
2331 LDF [BO + 46 * SIZE], b7
2332 FMADD (aa2, bb8, cc16, cc16)
2333 LDF [BO + 47 * SIZE], b8
2334
/* Step 5: a3/a4 (AO offsets 10, 11) against BO[40..47]. */
2335 FMADD (aa3, bb9, cc01, cc01)
2336 FMADD (aa4, bb9, cc02, cc02)
2337 FMADD (aa3, bb2, cc03, cc03)
2338 FMADD (aa4, bb2, cc04, cc04)
2339
2340 FMADD (aa3, bb3, cc05, cc05)
2341 LDF [BO + 56 * SIZE], b9
2342 FMADD (aa4, bb3, cc06, cc06)
2343 LDF [BO + 49 * SIZE], b2
2344
2345 FMADD (aa3, bb4, cc07, cc07)
2346 LDF [BO + 50 * SIZE], b3
2347 FMADD (aa4, bb4, cc08, cc08)
2348 LDF [BO + 51 * SIZE], b4
2349
2350 FMADD (aa3, bb5, cc09, cc09)
2351 LDF [AO + 12 * SIZE], a5
2352 FMADD (aa4, bb5, cc10, cc10)
2353 LDF [AO + 13 * SIZE], a2
2354
2355 FMADD (aa3, bb6, cc11, cc11)
2356 cmp L, 0
2357 FMADD (aa4, bb6, cc12, cc12)
2358 nop
2359
2360 FMADD (aa3, bb7, cc13, cc13)
2361 LDF [BO + 52 * SIZE], b5
2362 FMADD (aa4, bb7, cc14, cc14)
2363 LDF [BO + 53 * SIZE], b6
2364
2365 FMADD (aa3, bb8, cc15, cc15)
2366 LDF [BO + 54 * SIZE], b7
2367 FMADD (aa4, bb8, cc16, cc16)
2368 LDF [BO + 55 * SIZE], b8
2369
/* Step 6: a5/a2 (AO offsets 12, 13) against BO[48..55]. */
2370 FMADD (aa5, bb1, cc01, cc01)
2371 FMADD (aa2, bb1, cc02, cc02)
2372 FMADD (aa5, bb2, cc03, cc03)
2373 FMADD (aa2, bb2, cc04, cc04)
2374
2375 FMADD (aa5, bb3, cc05, cc05)
2376 LDF [BO + 64 * SIZE], b1
2377 FMADD (aa2, bb3, cc06, cc06)
2378 LDF [BO + 57 * SIZE], b2
2379
2380 FMADD (aa5, bb4, cc07, cc07)
2381 LDF [BO + 58 * SIZE], b3
2382 FMADD (aa2, bb4, cc08, cc08)
2383 LDF [BO + 59 * SIZE], b4
2384
2385 FMADD (aa5, bb5, cc09, cc09)
2386 LDF [AO + 14 * SIZE], a3
2387 FMADD (aa2, bb5, cc10, cc10)
2388 LDF [AO + 15 * SIZE], a4
2389
/* Advance both panels past this eight-step group; loads after this point use the new base. */
2390 FMADD (aa5, bb6, cc11, cc11)
2391 add BO, 64 * SIZE, BO
2392 FMADD (aa2, bb6, cc12, cc12)
2393 add AO, 16 * SIZE, AO
2394
2395 FMADD (aa5, bb7, cc13, cc13)
2396 LDF [BO - 4 * SIZE], b5
2397 FMADD (aa2, bb7, cc14, cc14)
2398 LDF [BO - 3 * SIZE], b6
2399
2400 FMADD (aa5, bb8, cc15, cc15)
/*
 * Interior of the .LL13 loop: end of step 6 and step 7 of the first unrolled
 * copy, the early exit to .LL15, and the start of the second copy (step 0 and
 * the first instruction of step 1).  BO and AO were already advanced, so the
 * negative BO offsets address the tail of the group just processed and the
 * small positive offsets preload the next group.
 */
2401 LDF [BO - 2 * SIZE], b7
2402 FMADD (aa2, bb8, cc16, cc16)
2403 LDF [BO - 1 * SIZE], b8
2404
/* Step 7: a3/a4 against the last eight B values of the group. */
2405 FMADD (aa3, bb9, cc01, cc01)
2406 FMADD (aa4, bb9, cc02, cc02)
2407 FMADD (aa3, bb2, cc03, cc03)
2408 FMADD (aa4, bb2, cc04, cc04)
2409
2410 FMADD (aa3, bb3, cc05, cc05)
2411 LDF [BO + 8 * SIZE], b9
2412 FMADD (aa4, bb3, cc06, cc06)
2413 LDF [BO + 1 * SIZE], b2
2414
2415 FMADD (aa3, bb4, cc07, cc07)
2416 LDF [BO + 2 * SIZE], b3
2417 FMADD (aa4, bb4, cc08, cc08)
2418 LDF [BO + 3 * SIZE], b4
2419
2420 FMADD (aa3, bb5, cc09, cc09)
2421 LDF [AO + 8 * SIZE], a5 /****/
2422 FMADD (aa4, bb5, cc10, cc10)
2423 LDF [AO + 1 * SIZE], a2
2424
2425 FMADD (aa3, bb6, cc11, cc11)
2426 FMADD (aa4, bb6, cc12, cc12)
2427
2428 FMADD (aa3, bb7, cc13, cc13)
2429 LDF [BO + 4 * SIZE], b5
2430 FMADD (aa4, bb7, cc14, cc14)
2431 LDF [BO + 5 * SIZE], b6
2432
/*
 * Early exit: leave for .LL15 when L (tested by the cmp in step 5) has
 * reached zero.  The b8 reload sits in the delay slot and runs either way.
 */
2433 FMADD (aa3, bb8, cc15, cc15)
2434 LDF [BO + 6 * SIZE], b7
2435 FMADD (aa4, bb8, cc16, cc16)
2436 ble,pn %icc, .LL15
2437 LDF [BO + 7 * SIZE], b8
2438
/* Second unrolled copy, step 0: a1/a2 against BO[0..7]. */
2439 FMADD (aa1, bb1, cc01, cc01)
2440 FMADD (aa2, bb1, cc02, cc02)
2441 FMADD (aa1, bb2, cc03, cc03)
2442 FMADD (aa2, bb2, cc04, cc04)
2443
2444 FMADD (aa1, bb3, cc05, cc05)
2445 LDF [BO + 16 * SIZE], b1
2446 FMADD (aa2, bb3, cc06, cc06)
2447 LDF [BO + 9 * SIZE], b2
2448
2449 FMADD (aa1, bb4, cc07, cc07)
2450 LDF [BO + 10 * SIZE], b3
2451 FMADD (aa2, bb4, cc08, cc08)
2452 LDF [BO + 11 * SIZE], b4
2453
2454 FMADD (aa1, bb5, cc09, cc09)
2455 LDF [AO + 2 * SIZE], a3
2456 FMADD (aa2, bb5, cc10, cc10)
2457 LDF [AO + 3 * SIZE], a4
2458
2459 FMADD (aa1, bb6, cc11, cc11)
2460 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
2461 FMADD (aa2, bb6, cc12, cc12)
2462 nop
2463
2464 FMADD (aa1, bb7, cc13, cc13)
2465 LDF [BO + 12 * SIZE], b5
2466 FMADD (aa2, bb7, cc14, cc14)
2467 LDF [BO + 13 * SIZE], b6
2468
2469 FMADD (aa1, bb8, cc15, cc15)
2470 LDF [BO + 14 * SIZE], b7
2471 FMADD (aa2, bb8, cc16, cc16)
2472 LDF [BO + 15 * SIZE], b8
2473
2474 FMADD
(aa3, bb9, cc01, cc01)
/*
 * Interior of the .LL13 loop, second unrolled copy: step 1 (a3/a4), step 2
 * (a1/a2) and the start of step 3.  The instruction sequence mirrors the
 * first copy; L is decremented again here (line 2495), so each copy consumes
 * one count of L.
 */
2475 FMADD (aa4, bb9, cc02, cc02)
2476 FMADD (aa3, bb2, cc03, cc03)
2477 FMADD (aa4, bb2, cc04, cc04)
2478
2479 FMADD (aa3, bb3, cc05, cc05)
2480 LDF [BO + 24 * SIZE], b9
2481 FMADD (aa4, bb3, cc06, cc06)
2482 LDF [BO + 17 * SIZE], b2
2483
2484 FMADD (aa3, bb4, cc07, cc07)
2485 LDF [BO + 18 * SIZE], b3
2486 FMADD (aa4, bb4, cc08, cc08)
2487 LDF [BO + 19 * SIZE], b4
2488
2489 FMADD (aa3, bb5, cc09, cc09)
2490 LDF [AO + 4 * SIZE], a1
2491 FMADD (aa4, bb5, cc10, cc10)
2492 LDF [AO + 5 * SIZE], a2
2493
2494 FMADD (aa3, bb6, cc11, cc11)
2495 add L, -1, L
2496 FMADD (aa4, bb6, cc12, cc12)
2497 nop
2498
2499 FMADD (aa3, bb7, cc13, cc13)
2500 LDF [BO + 20 * SIZE], b5
2501 FMADD (aa4, bb7, cc14, cc14)
2502 LDF [BO + 21 * SIZE], b6
2503
2504 FMADD (aa3, bb8, cc15, cc15)
2505 LDF [BO + 22 * SIZE], b7
2506 FMADD (aa4, bb8, cc16, cc16)
2507 LDF [BO + 23 * SIZE], b8
2508
/* Step 2: a1/a2 (AO offsets 4, 5) against BO[16..23]. */
2509 FMADD (aa1, bb1, cc01, cc01)
2510 FMADD (aa2, bb1, cc02, cc02)
2511 FMADD (aa1, bb2, cc03, cc03)
2512 FMADD (aa2, bb2, cc04, cc04)
2513
2514 FMADD (aa1, bb3, cc05, cc05)
2515 LDF [BO + 32 * SIZE], b1
2516 FMADD (aa2, bb3, cc06, cc06)
2517 LDF [BO + 25 * SIZE], b2
2518
2519 FMADD (aa1, bb4, cc07, cc07)
2520 LDF [BO + 26 * SIZE], b3
2521 FMADD (aa2, bb4, cc08, cc08)
2522 LDF [BO + 27 * SIZE], b4
2523
2524 FMADD (aa1, bb5, cc09, cc09)
2525 LDF [AO + 6 * SIZE], a3
2526 FMADD (aa2, bb5, cc10, cc10)
2527 LDF [AO + 7 * SIZE], a4
2528
2529 FMADD (aa1, bb6, cc11, cc11)
2530 nop
2531 FMADD (aa2, bb6, cc12, cc12)
2532 nop
2533
2534 FMADD (aa1, bb7, cc13, cc13)
2535 LDF [BO + 28 * SIZE], b5
2536 FMADD (aa2, bb7, cc14, cc14)
2537 LDF [BO + 29 * SIZE], b6
2538
2539 FMADD (aa1, bb8, cc15, cc15)
2540 LDF [BO + 30 * SIZE], b7
2541 FMADD (aa2, bb8, cc16, cc16)
2542 LDF [BO + 31 * SIZE], b8
2543
/* Step 3: a3/a4 (AO offsets 6, 7) against BO[24..31]. */
2544 FMADD (aa3, bb9, cc01, cc01)
2545 FMADD (aa4, bb9, cc02, cc02)
2546 FMADD (aa3, bb2, cc03, cc03)
2547 FMADD (aa4, bb2, cc04, cc04)
2548
2549 FMADD (aa3, bb3, cc05, cc05)
2550 LDF [BO + 40 * SIZE], b9
2551
FMADD (aa4, bb3, cc06, cc06)
/*
 * Interior of the .LL13 loop, second unrolled copy: rest of step 3, step 4
 * (a5/a2) and the start of step 5.  As in the first copy, a1 is reloaded from
 * AO offset 16 for the next group while a5 supplies the first A operand of
 * step 4.
 */
2552 LDF [BO + 33 * SIZE], b2
2553
2554 FMADD (aa3, bb4, cc07, cc07)
2555 LDF [BO + 34 * SIZE], b3
2556 FMADD (aa4, bb4, cc08, cc08)
2557 LDF [BO + 35 * SIZE], b4
2558
2559 FMADD (aa3, bb5, cc09, cc09)
2560 LDF [AO + 16 * SIZE], a1 /****/
2561 FMADD (aa4, bb5, cc10, cc10)
2562 LDF [AO + 9 * SIZE], a2
2563
2564 FMADD (aa3, bb6, cc11, cc11)
2565 nop
2566 FMADD (aa4, bb6, cc12, cc12)
2567 nop
2568
2569 FMADD (aa3, bb7, cc13, cc13)
2570 LDF [BO + 36 * SIZE], b5
2571 FMADD (aa4, bb7, cc14, cc14)
2572 LDF [BO + 37 * SIZE], b6
2573
2574 FMADD (aa3, bb8, cc15, cc15)
2575 LDF [BO + 38 * SIZE], b7
2576 FMADD (aa4, bb8, cc16, cc16)
2577 LDF [BO + 39 * SIZE], b8
2578
/* Step 4: a5/a2 (AO offsets 8, 9) against BO[32..39]. */
2579 FMADD (aa5, bb1, cc01, cc01)
2580 FMADD (aa2, bb1, cc02, cc02)
2581 FMADD (aa5, bb2, cc03, cc03)
2582 FMADD (aa2, bb2, cc04, cc04)
2583
2584 FMADD (aa5, bb3, cc05, cc05)
2585 LDF [BO + 48 * SIZE], b1
2586 FMADD (aa2, bb3, cc06, cc06)
2587 LDF [BO + 41 * SIZE], b2
2588
2589 FMADD (aa5, bb4, cc07, cc07)
2590 LDF [BO + 42 * SIZE], b3
2591 FMADD (aa2, bb4, cc08, cc08)
2592 LDF [BO + 43 * SIZE], b4
2593
2594 FMADD (aa5, bb5, cc09, cc09)
2595 LDF [AO + 10 * SIZE], a3
2596 FMADD (aa2, bb5, cc10, cc10)
2597 LDF [AO + 11 * SIZE], a4
2598
2599 FMADD (aa5, bb6, cc11, cc11)
2600 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
2601 FMADD (aa2, bb6, cc12, cc12)
2602 nop
2603
2604 FMADD (aa5, bb7, cc13, cc13)
2605 LDF [BO + 44 * SIZE], b5
2606 FMADD (aa2, bb7, cc14, cc14)
2607 LDF [BO + 45 * SIZE], b6
2608
2609 FMADD (aa5, bb8, cc15, cc15)
2610 LDF [BO + 46 * SIZE], b7
2611 FMADD (aa2, bb8, cc16, cc16)
2612 LDF [BO + 47 * SIZE], b8
2613
/* Step 5: a3/a4 (AO offsets 10, 11) against BO[40..47]. */
2614 FMADD (aa3, bb9, cc01, cc01)
2615 FMADD (aa4, bb9, cc02, cc02)
2616 FMADD (aa3, bb2, cc03, cc03)
2617 FMADD (aa4, bb2, cc04, cc04)
2618
2619 FMADD (aa3, bb3, cc05, cc05)
2620 LDF [BO + 56 * SIZE], b9
2621 FMADD (aa4, bb3, cc06, cc06)
2622 LDF [BO + 49 * SIZE], b2
2623
2624 FMADD (aa3, bb4, cc07, cc07)
2625 LDF [BO + 50 * SIZE], b3
2626 FMADD
(aa4, bb4, cc08, cc08)
/*
 * Interior of the .LL13 loop, second unrolled copy: rest of step 5, step 6
 * (a5/a2) and most of step 7.  "cmp L, 0" sets the condition codes for the bg
 * back-edge that closes the loop; BO and AO are advanced by 64 and 16
 * elements inside step 6.
 */
2627 LDF [BO + 51 * SIZE], b4
2628
2629 FMADD (aa3, bb5, cc09, cc09)
2630 LDF [AO + 12 * SIZE], a5
2631 FMADD (aa4, bb5, cc10, cc10)
2632 LDF [AO + 13 * SIZE], a2
2633
2634 FMADD (aa3, bb6, cc11, cc11)
2635 cmp L, 0
2636 FMADD (aa4, bb6, cc12, cc12)
2637 nop
2638
2639 FMADD (aa3, bb7, cc13, cc13)
2640 LDF [BO + 52 * SIZE], b5
2641 FMADD (aa4, bb7, cc14, cc14)
2642 LDF [BO + 53 * SIZE], b6
2643
2644 FMADD (aa3, bb8, cc15, cc15)
2645 LDF [BO + 54 * SIZE], b7
2646 FMADD (aa4, bb8, cc16, cc16)
2647 LDF [BO + 55 * SIZE], b8
2648
/* Step 6: a5/a2 (AO offsets 12, 13) against BO[48..55]. */
2649 FMADD (aa5, bb1, cc01, cc01)
2650 FMADD (aa2, bb1, cc02, cc02)
2651 FMADD (aa5, bb2, cc03, cc03)
2652 FMADD (aa2, bb2, cc04, cc04)
2653
2654 FMADD (aa5, bb3, cc05, cc05)
2655 LDF [BO + 64 * SIZE], b1
2656 FMADD (aa2, bb3, cc06, cc06)
2657 LDF [BO + 57 * SIZE], b2
2658
2659 FMADD (aa5, bb4, cc07, cc07)
2660 LDF [BO + 58 * SIZE], b3
2661 FMADD (aa2, bb4, cc08, cc08)
2662 LDF [BO + 59 * SIZE], b4
2663
2664 FMADD (aa5, bb5, cc09, cc09)
2665 LDF [AO + 14 * SIZE], a3
2666 FMADD (aa2, bb5, cc10, cc10)
2667 LDF [AO + 15 * SIZE], a4
2668
/* Advance both panels; the loads that follow address relative to the new base. */
2669 FMADD (aa5, bb6, cc11, cc11)
2670 add BO, 64 * SIZE, BO
2671 FMADD (aa2, bb6, cc12, cc12)
2672 add AO, 16 * SIZE, AO
2673
2674 FMADD (aa5, bb7, cc13, cc13)
2675 LDF [BO - 4 * SIZE], b5
2676 FMADD (aa2, bb7, cc14, cc14)
2677 LDF [BO - 3 * SIZE], b6
2678
2679 FMADD (aa5, bb8, cc15, cc15)
2680 LDF [BO - 2 * SIZE], b7
2681 FMADD (aa2, bb8, cc16, cc16)
2682 LDF [BO - 1 * SIZE], b8
2683
/* Step 7: a3/a4 against the last eight B values of the group, while preloading the next group. */
2684 FMADD (aa3, bb9, cc01, cc01)
2685 FMADD (aa4, bb9, cc02, cc02)
2686 FMADD (aa3, bb2, cc03, cc03)
2687 FMADD (aa4, bb2, cc04, cc04)
2688
2689 FMADD (aa3, bb3, cc05, cc05)
2690 LDF [BO + 8 * SIZE], b9
2691 FMADD (aa4, bb3, cc06, cc06)
2692 LDF [BO + 1 * SIZE], b2
2693
2694 FMADD (aa3, bb4, cc07, cc07)
2695 LDF [BO + 2 * SIZE], b3
2696 FMADD (aa4, bb4, cc08, cc08)
2697 LDF [BO + 3 * SIZE], b4
2698
2699 FMADD (aa3, bb5, cc09, cc09)
2700 LDF [AO + 8 * SIZE], a5 /****/
2701 FMADD (aa4, bb5, cc10, cc10)
2702 LDF
[AO + 1 * SIZE], a2
2703
/* End of the last unrolled step of the .LL13 loop. */
2704 FMADD (aa3, bb6, cc11, cc11)
2705 FMADD (aa4, bb6, cc12, cc12)
2706
2707 FMADD (aa3, bb7, cc13, cc13)
2708 LDF [BO + 4 * SIZE], b5
2709 FMADD (aa4, bb7, cc14, cc14)
2710 LDF [BO + 5 * SIZE], b6
2711
/* Back-edge of .LL13; the b8 reload sits in the branch delay slot. */
2712 FMADD (aa3, bb8, cc15, cc15)
2713 LDF [BO + 6 * SIZE], b7
2714 FMADD (aa4, bb8, cc16, cc16)
2715 bg,pt %icc, .LL13
2716 LDF [BO + 7 * SIZE], b8
2717 .align 4
2718
/* .LL15: remainder count L = KK & 7 (LT/RN) or (K - KK) & 7; go to .LL18 when it is zero. */
2719.LL15:
2720#if defined(LT) || defined(RN)
2721 and KK, 7, L
2722#else
2723 sub K, KK, L
2724 and L, 7, L
2725#endif
2726 cmp L, 0
2727 ble,a,pn %icc, .LL18
2728 nop
2729 .align 4
2730
/*
 * .LL17: one step per pass.  a1/a2 are multiplied by b1..b8 into c01..c16;
 * AO advances by 2 elements and BO by 8.  b1..b6 are reloaded before the
 * pointer update (offsets 8..13), b7/b8 after it (offsets 6, 7).
 */
2731.LL17:
2732 FMADD (aa1, bb1, cc01, cc01)
2733 add L, -1, L
2734 FMADD (aa2, bb1, cc02, cc02)
2735 nop
2736
2737 FMADD (aa1, bb2, cc03, cc03)
2738 LDF [BO + 8 * SIZE], b1
2739 FMADD (aa2, bb2, cc04, cc04)
2740 LDF [BO + 9 * SIZE], b2
2741
2742 FMADD (aa1, bb3, cc05, cc05)
2743 cmp L, 0
2744 FMADD (aa2, bb3, cc06, cc06)
2745 nop
2746
2747 FMADD (aa1, bb4, cc07, cc07)
2748 LDF [BO + 10 * SIZE], b3
2749 FMADD (aa2, bb4, cc08, cc08)
2750 LDF [BO + 11 * SIZE], b4
2751
2752 FMADD (aa1, bb5, cc09, cc09)
2753 nop
2754 FMADD (aa2, bb5, cc10, cc10)
2755 nop
2756
2757 FMADD (aa1, bb6, cc11, cc11)
2758 LDF [BO + 12 * SIZE], b5
2759 FMADD (aa2, bb6, cc12, cc12)
2760 LDF [BO + 13 * SIZE], b6
2761
2762 FMADD (aa1, bb7, cc13, cc13)
2763 add AO, 2 * SIZE, AO
2764 FMADD (aa2, bb7, cc14, cc14)
2765 add BO, 8 * SIZE, BO
2766
2767 FMADD (aa1, bb8, cc15, cc15)
2768 LDF [AO + 0 * SIZE], a1
2769 FMADD (aa2, bb8, cc16, cc16)
2770 LDF [AO + 1 * SIZE], a2
2771
2772 LDF [BO + 6 * SIZE], b7
2773 bg,pt %icc, .LL17
2774 LDF [BO + 7 * SIZE], b8
2775 nop
2776 .align 4
2777
/*
 * .LL18: LN/RT re-point AO and BO before the solve.  TEMP1 = KK - 2 for LN or
 * KK - 8 for RT; AO = AORIG + (TEMP1 << (BASE_SHIFT + 1)) and
 * BO = B + (TEMP1 << (BASE_SHIFT + 3)).
 */
2778.LL18:
2779#if defined(LN) || defined(RT)
2780#ifdef LN
2781 sub KK, 2, TEMP1
2782#else
2783 sub KK, 8, TEMP1
2784#endif
2785 sll TEMP1, BASE_SHIFT + 1, TEMP2
2786 sll TEMP1, BASE_SHIFT + 3, TEMP1
2787
2788 add AORIG, TEMP2, AO
2789 add B, TEMP1, BO
2790#endif
2791
/* LN/LT: the sixteen stored values are read from BO. */
2792#if defined(LN) || defined(LT)
2793 LDF [BO + 0 * SIZE], a1
2794 LDF [BO + 1 *
SIZE], a2
/*
 * Combine the sixteen stored values with the accumulated products.  a1..a4
 * and b1..b4 are reused as scratch registers here.  In the LN/LT layout BO
 * holds the odd-numbered accumulators first (offsets 0..7 pair with
 * c01,c03,...,c15; offsets 8..15 with c02,...,c16); in the other layout AO
 * offsets 0..15 pair with c01..c16 in order.
 * NOTE(review): with the usual SPARC operand order FSUB x, c, c gives
 * c = x - c -- confirm the macro expansion in common.h.
 */
2795 LDF [BO + 2 * SIZE], a3
2796 LDF [BO + 3 * SIZE], a4
2797
2798 LDF [BO + 4 * SIZE], b1
2799 LDF [BO + 5 * SIZE], b2
2800 LDF [BO + 6 * SIZE], b3
2801 LDF [BO + 7 * SIZE], b4
2802
2803 FSUB a1, c01, c01
2804 FSUB a2, c03, c03
2805 FSUB a3, c05, c05
2806 FSUB a4, c07, c07
2807
2808 FSUB b1, c09, c09
2809 FSUB b2, c11, c11
2810 FSUB b3, c13, c13
2811 FSUB b4, c15, c15
2812
2813 LDF [BO + 8 * SIZE], a1
2814 LDF [BO + 9 * SIZE], a2
2815 LDF [BO + 10 * SIZE], a3
2816 LDF [BO + 11 * SIZE], a4
2817
2818 LDF [BO + 12 * SIZE], b1
2819 LDF [BO + 13 * SIZE], b2
2820 LDF [BO + 14 * SIZE], b3
2821 LDF [BO + 15 * SIZE], b4
2822
2823 FSUB a1, c02, c02
2824 FSUB a2, c04, c04
2825 FSUB a3, c06, c06
2826 FSUB a4, c08, c08
2827
2828 FSUB b1, c10, c10
2829 FSUB b2, c12, c12
2830 FSUB b3, c14, c14
2831 FSUB b4, c16, c16
2832#else
2833 LDF [AO + 0 * SIZE], a1
2834 LDF [AO + 1 * SIZE], a2
2835 LDF [AO + 2 * SIZE], a3
2836 LDF [AO + 3 * SIZE], a4
2837
2838 LDF [AO + 4 * SIZE], b1
2839 LDF [AO + 5 * SIZE], b2
2840 LDF [AO + 6 * SIZE], b3
2841 LDF [AO + 7 * SIZE], b4
2842
2843 FSUB a1, c01, c01
2844 FSUB a2, c02, c02
2845 FSUB a3, c03, c03
2846 FSUB a4, c04, c04
2847
2848 FSUB b1, c05, c05
2849 FSUB b2, c06, c06
2850 FSUB b3, c07, c07
2851 FSUB b4, c08, c08
2852
2853 LDF [AO + 8 * SIZE], a1
2854 LDF [AO + 9 * SIZE], a2
2855 LDF [AO + 10 * SIZE], a3
2856 LDF [AO + 11 * SIZE], a4
2857
2858 LDF [AO + 12 * SIZE], b1
2859 LDF [AO + 13 * SIZE], b2
2860 LDF [AO + 14 * SIZE], b3
2861 LDF [AO + 15 * SIZE], b4
2862
2863 FSUB a1, c09, c09
2864 FSUB a2, c10, c10
2865 FSUB a3, c11, c11
2866 FSUB a4, c12, c12
2867
2868 FSUB b1, c13, c13
2869 FSUB b2, c14, c14
2870 FSUB b3, c15, c15
2871 FSUB b4, c16, c16
2872#endif
2873
/*
 * LN: solve with the 2x2 coefficient block at AO, starting from its last
 * entry (offset 3): scale the even-numbered accumulators first.
 */
2874#ifdef LN
2875 LDF [AO + 3 * SIZE], a1
2876 LDF [AO + 2 * SIZE], a2
2877 LDF [AO + 0 * SIZE], a3
2878
2879 FMUL a1, c02, c02
2880 FMUL a1, c04, c04
2881 FMUL a1, c06, c06
2882 FMUL a1, c08, c08
2883 FMUL a1, c10, c10
2884 FMUL a1, c12, c12
2885 FMUL a1, c14, c14
2886
FMUL a1, c16, c16
2887
/*
 * Rest of the LN solve: eliminate the scaled even-numbered accumulators from
 * the odd-numbered ones using a2, then scale the odd-numbered ones by a3.
 * NOTE(review): FNMSUB (x, y, z, z) presumably computes z = z - x * y.
 */
2888 FNMSUB (aa2, cc02, cc01, cc01)
2889 FNMSUB (aa2, cc04, cc03, cc03)
2890 FNMSUB (aa2, cc06, cc05, cc05)
2891 FNMSUB (aa2, cc08, cc07, cc07)
2892 FNMSUB (aa2, cc10, cc09, cc09)
2893 FNMSUB (aa2, cc12, cc11, cc11)
2894 FNMSUB (aa2, cc14, cc13, cc13)
2895 FNMSUB (aa2, cc16, cc15, cc15)
2896
2897 FMUL a3, c01, c01
2898 FMUL a3, c03, c03
2899 FMUL a3, c05, c05
2900 FMUL a3, c07, c07
2901 FMUL a3, c09, c09
2902 FMUL a3, c11, c11
2903 FMUL a3, c13, c13
2904 FMUL a3, c15, c15
2905#endif
2906
/*
 * LT: the same 2x2 block walked forwards (offsets 0, 1, 3): scale the
 * odd-numbered accumulators, eliminate them from the even-numbered ones, then
 * scale the even-numbered ones.
 */
2907#ifdef LT
2908 LDF [AO + 0 * SIZE], a1
2909 LDF [AO + 1 * SIZE], a2
2910 LDF [AO + 3 * SIZE], a3
2911
2912 FMUL a1, c01, c01
2913 FMUL a1, c03, c03
2914 FMUL a1, c05, c05
2915 FMUL a1, c07, c07
2916 FMUL a1, c09, c09
2917 FMUL a1, c11, c11
2918 FMUL a1, c13, c13
2919 FMUL a1, c15, c15
2920
2921 FNMSUB (aa2, cc01, cc02, cc02)
2922 FNMSUB (aa2, cc03, cc04, cc04)
2923 FNMSUB (aa2, cc05, cc06, cc06)
2924 FNMSUB (aa2, cc07, cc08, cc08)
2925 FNMSUB (aa2, cc09, cc10, cc10)
2926 FNMSUB (aa2, cc11, cc12, cc12)
2927 FNMSUB (aa2, cc13, cc14, cc14)
2928 FNMSUB (aa2, cc15, cc16, cc16)
2929
2930 FMUL a3, c02, c02
2931 FMUL a3, c04, c04
2932 FMUL a3, c06, c06
2933 FMUL a3, c08, c08
2934 FMUL a3, c10, c10
2935 FMUL a3, c12, c12
2936 FMUL a3, c14, c14
2937 FMUL a3, c16, c16
2938#endif
2939
/*
 * RN: forward pass over the 8x8 coefficient block at BO.  First row (offsets
 * 0..7): scale c01/c02 by offset 0, then eliminate them from the seven later
 * accumulator pairs.
 */
2940#ifdef RN
2941 LDF [BO + 0 * SIZE], a1
2942 LDF [BO + 1 * SIZE], a2
2943 LDF [BO + 2 * SIZE], a3
2944 LDF [BO + 3 * SIZE], a4
2945 LDF [BO + 4 * SIZE], b1
2946 LDF [BO + 5 * SIZE], b2
2947 LDF [BO + 6 * SIZE], b3
2948 LDF [BO + 7 * SIZE], b4
2949
2950 FMUL a1, c01, c01
2951 FMUL a1, c02, c02
2952
2953 FNMSUB (aa2, cc01, cc03, cc03)
2954 FNMSUB (aa2, cc02, cc04, cc04)
2955 FNMSUB (aa3, cc01, cc05, cc05)
2956 FNMSUB (aa3, cc02, cc06, cc06)
2957 FNMSUB (aa4, cc01, cc07, cc07)
2958 FNMSUB (aa4, cc02, cc08, cc08)
2959 FNMSUB (bb1, cc01, cc09, cc09)
2960 FNMSUB (bb1, cc02, cc10, cc10)
2961 FNMSUB (bb2, cc01, cc11, cc11)
2962 FNMSUB (bb2, cc02, cc12, cc12)
2963 FNMSUB (bb3, cc01, cc13,
cc13)
/*
 * Middle of the RN solve over the 8x8 coefficient block at BO.  Each stage
 * loads one row starting at its diagonal entry (offsets 9, 18, 27, 36),
 * scales one accumulator pair by that entry, and eliminates the pair from the
 * later pairs.
 * NOTE(review): FNMSUB (x, y, z, z) presumably computes z = z - x * y.
 */
2964 FNMSUB (bb3, cc02, cc14, cc14)
2965 FNMSUB (bb4, cc01, cc15, cc15)
2966 FNMSUB (bb4, cc02, cc16, cc16)
2967
/* Row starting at offset 9: c03/c04. */
2968 LDF [BO + 9 * SIZE], a1
2969 LDF [BO + 10 * SIZE], a2
2970 LDF [BO + 11 * SIZE], a3
2971 LDF [BO + 12 * SIZE], a4
2972 LDF [BO + 13 * SIZE], b1
2973 LDF [BO + 14 * SIZE], b2
2974 LDF [BO + 15 * SIZE], b3
2975
2976 FMUL a1, c03, c03
2977 FMUL a1, c04, c04
2978
2979 FNMSUB (aa2, cc03, cc05, cc05)
2980 FNMSUB (aa2, cc04, cc06, cc06)
2981 FNMSUB (aa3, cc03, cc07, cc07)
2982 FNMSUB (aa3, cc04, cc08, cc08)
2983 FNMSUB (aa4, cc03, cc09, cc09)
2984 FNMSUB (aa4, cc04, cc10, cc10)
2985 FNMSUB (bb1, cc03, cc11, cc11)
2986 FNMSUB (bb1, cc04, cc12, cc12)
2987 FNMSUB (bb2, cc03, cc13, cc13)
2988 FNMSUB (bb2, cc04, cc14, cc14)
2989 FNMSUB (bb3, cc03, cc15, cc15)
2990 FNMSUB (bb3, cc04, cc16, cc16)
2991
/* Row starting at offset 18: c05/c06. */
2992 LDF [BO + 18 * SIZE], a1
2993 LDF [BO + 19 * SIZE], a2
2994 LDF [BO + 20 * SIZE], a3
2995 LDF [BO + 21 * SIZE], a4
2996 LDF [BO + 22 * SIZE], b1
2997 LDF [BO + 23 * SIZE], b2
2998
2999 FMUL a1, c05, c05
3000 FMUL a1, c06, c06
3001
3002 FNMSUB (aa2, cc05, cc07, cc07)
3003 FNMSUB (aa2, cc06, cc08, cc08)
3004 FNMSUB (aa3, cc05, cc09, cc09)
3005 FNMSUB (aa3, cc06, cc10, cc10)
3006 FNMSUB (aa4, cc05, cc11, cc11)
3007 FNMSUB (aa4, cc06, cc12, cc12)
3008 FNMSUB (bb1, cc05, cc13, cc13)
3009 FNMSUB (bb1, cc06, cc14, cc14)
3010 FNMSUB (bb2, cc05, cc15, cc15)
3011 FNMSUB (bb2, cc06, cc16, cc16)
3012
/* Row starting at offset 27: c07/c08. */
3013 LDF [BO + 27 * SIZE], a1
3014 LDF [BO + 28 * SIZE], a2
3015 LDF [BO + 29 * SIZE], a3
3016 LDF [BO + 30 * SIZE], a4
3017 LDF [BO + 31 * SIZE], b1
3018
3019 FMUL a1, c07, c07
3020 FMUL a1, c08, c08
3021
3022 FNMSUB (aa2, cc07, cc09, cc09)
3023 FNMSUB (aa2, cc08, cc10, cc10)
3024 FNMSUB (aa3, cc07, cc11, cc11)
3025 FNMSUB (aa3, cc08, cc12, cc12)
3026 FNMSUB (aa4, cc07, cc13, cc13)
3027 FNMSUB (aa4, cc08, cc14, cc14)
3028 FNMSUB (bb1, cc07, cc15, cc15)
3029 FNMSUB (bb1, cc08, cc16, cc16)
3030
/* Row starting at offset 36: c09/c10. */
3031 LDF [BO + 36 * SIZE], a1
3032 LDF [BO + 37 * SIZE], a2
3033 LDF [BO + 38
* SIZE], a3
/*
 * End of the RN solve (rows starting at offsets 36, 45, 54, 63) and start of
 * the RT solve, which walks the same 8x8 block backwards from offset 63.
 * NOTE(review): FNMSUB (x, y, z, z) presumably computes z = z - x * y.
 */
3034 LDF [BO + 39 * SIZE], a4
3035
3036 FMUL a1, c09, c09
3037 FMUL a1, c10, c10
3038
3039 FNMSUB (aa2, cc09, cc11, cc11)
3040 FNMSUB (aa2, cc10, cc12, cc12)
3041 FNMSUB (aa3, cc09, cc13, cc13)
3042 FNMSUB (aa3, cc10, cc14, cc14)
3043 FNMSUB (aa4, cc09, cc15, cc15)
3044 FNMSUB (aa4, cc10, cc16, cc16)
3045
3046 LDF [BO + 45 * SIZE], a1
3047 LDF [BO + 46 * SIZE], a2
3048 LDF [BO + 47 * SIZE], a3
3049
3050 FMUL a1, c11, c11
3051 FMUL a1, c12, c12
3052
3053 FNMSUB (aa2, cc11, cc13, cc13)
3054 FNMSUB (aa2, cc12, cc14, cc14)
3055 FNMSUB (aa3, cc11, cc15, cc15)
3056 FNMSUB (aa3, cc12, cc16, cc16)
3057
3058 LDF [BO + 54 * SIZE], a1
3059 LDF [BO + 55 * SIZE], a2
3060
3061 FMUL a1, c13, c13
3062 FMUL a1, c14, c14
3063
3064 FNMSUB (aa2, cc13, cc15, cc15)
3065 FNMSUB (aa2, cc14, cc16, cc16)
3066
3067 LDF [BO + 63 * SIZE], a1
3068
3069 FMUL a1, c15, c15
3070 FMUL a1, c16, c16
3071#endif
3072
/*
 * RT: last row first (offsets 63 down to 56): scale c15/c16 by offset 63 and
 * eliminate them from all earlier accumulator pairs.
 */
3073#ifdef RT
3074 LDF [BO + 63 * SIZE], a1
3075 LDF [BO + 62 * SIZE], a2
3076 LDF [BO + 61 * SIZE], a3
3077 LDF [BO + 60 * SIZE], a4
3078 LDF [BO + 59 * SIZE], b1
3079 LDF [BO + 58 * SIZE], b2
3080 LDF [BO + 57 * SIZE], b3
3081 LDF [BO + 56 * SIZE], b4
3082
3083 FMUL a1, c16, c16
3084 FMUL a1, c15, c15
3085
3086 FNMSUB (aa2, cc16, cc14, cc14)
3087 FNMSUB (aa2, cc15, cc13, cc13)
3088 FNMSUB (aa3, cc16, cc12, cc12)
3089 FNMSUB (aa3, cc15, cc11, cc11)
3090 FNMSUB (aa4, cc16, cc10, cc10)
3091 FNMSUB (aa4, cc15, cc09, cc09)
3092 FNMSUB (bb1, cc16, cc08, cc08)
3093 FNMSUB (bb1, cc15, cc07, cc07)
3094 FNMSUB (bb2, cc16, cc06, cc06)
3095 FNMSUB (bb2, cc15, cc05, cc05)
3096 FNMSUB (bb3, cc16, cc04, cc04)
3097 FNMSUB (bb3, cc15, cc03, cc03)
3098 FNMSUB (bb4, cc16, cc02, cc02)
3099 FNMSUB (bb4, cc15, cc01, cc01)
3100
/* Row ending at offset 54: c13/c14. */
3101 LDF [BO + 54 * SIZE], a1
3102 LDF [BO + 53 * SIZE], a2
3103 LDF [BO + 52 * SIZE], a3
3104 LDF [BO + 51 * SIZE], a4
3105 LDF [BO + 50 * SIZE], b1
3106 LDF [BO + 49 * SIZE], b2
3107 LDF [BO + 48 * SIZE], b3
3108
3109 FMUL a1, c14, c14
3110 FMUL a1, c13, c13
3111
3112
FNMSUB (aa2, cc14, cc12, cc12) 3113 FNMSUB (aa2, cc13, cc11, cc11) 3114 FNMSUB (aa3, cc14, cc10, cc10) 3115 FNMSUB (aa3, cc13, cc09, cc09) 3116 FNMSUB (aa4, cc14, cc08, cc08) 3117 FNMSUB (aa4, cc13, cc07, cc07) 3118 FNMSUB (bb1, cc14, cc06, cc06) 3119 FNMSUB (bb1, cc13, cc05, cc05) 3120 FNMSUB (bb2, cc14, cc04, cc04) 3121 FNMSUB (bb2, cc13, cc03, cc03) 3122 FNMSUB (bb3, cc14, cc02, cc02) 3123 FNMSUB (bb3, cc13, cc01, cc01) 3124 /* c12/c11 step of the RT-direction solve: scale by the value at BO + 45*SIZE, then FNMSUB into c10/c09 .. c02/c01. FNMSUB is a macro defined outside this chunk, presumably d = c - a*b -- confirm. */ 3125 LDF [BO + 45 * SIZE], a1 3126 LDF [BO + 44 * SIZE], a2 3127 LDF [BO + 43 * SIZE], a3 3128 LDF [BO + 42 * SIZE], a4 3129 LDF [BO + 41 * SIZE], b1 3130 LDF [BO + 40 * SIZE], b2 3131 3132 FMUL a1, c12, c12 3133 FMUL a1, c11, c11 3134 3135 FNMSUB (aa2, cc12, cc10, cc10) 3136 FNMSUB (aa2, cc11, cc09, cc09) 3137 FNMSUB (aa3, cc12, cc08, cc08) 3138 FNMSUB (aa3, cc11, cc07, cc07) 3139 FNMSUB (aa4, cc12, cc06, cc06) 3140 FNMSUB (aa4, cc11, cc05, cc05) 3141 FNMSUB (bb1, cc12, cc04, cc04) 3142 FNMSUB (bb1, cc11, cc03, cc03) 3143 FNMSUB (bb2, cc12, cc02, cc02) 3144 FNMSUB (bb2, cc11, cc01, cc01) 3145 /* c10/c09 step. */ 3146 LDF [BO + 36 * SIZE], a1 3147 LDF [BO + 35 * SIZE], a2 3148 LDF [BO + 34 * SIZE], a3 3149 LDF [BO + 33 * SIZE], a4 3150 LDF [BO + 32 * SIZE], b1 3151 3152 FMUL a1, c10, c10 3153 FMUL a1, c09, c09 3154 3155 FNMSUB (aa2, cc10, cc08, cc08) 3156 FNMSUB (aa2, cc09, cc07, cc07) 3157 FNMSUB (aa3, cc10, cc06, cc06) 3158 FNMSUB (aa3, cc09, cc05, cc05) 3159 FNMSUB (aa4, cc10, cc04, cc04) 3160 FNMSUB (aa4, cc09, cc03, cc03) 3161 FNMSUB (bb1, cc10, cc02, cc02) 3162 FNMSUB (bb1, cc09, cc01, cc01) 3163 /* c08/c07 step. */ 3164 LDF [BO + 27 * SIZE], a1 3165 LDF [BO + 26 * SIZE], a2 3166 LDF [BO + 25 * SIZE], a3 3167 LDF [BO + 24 * SIZE], a4 3168 3169 FMUL a1, c08, c08 3170 FMUL a1, c07, c07 3171 3172 FNMSUB (aa2, cc08, cc06, cc06) 3173 FNMSUB (aa2, cc07, cc05, cc05) 3174 FNMSUB (aa3, cc08, cc04, cc04) 3175 FNMSUB (aa3, cc07, cc03, cc03) 3176 FNMSUB (aa4, cc08, cc02, cc02) 3177 FNMSUB (aa4, cc07, cc01, cc01) 3178 /* c06/c05 step: three coefficients from BO + 18, 17 and 16. */ 3179 LDF [BO + 18 * SIZE], a1 3180 LDF [BO + 17 * SIZE], a2 3181 LDF
[BO + 16 * SIZE], a3 3182 3183 FMUL a1, c06, c06 3184 FMUL a1, c05, c05 3185 3186 FNMSUB (aa2, cc06, cc04, cc04) 3187 FNMSUB (aa2, cc05, cc03, cc03) 3188 FNMSUB (aa3, cc06, cc02, cc02) 3189 FNMSUB (aa3, cc05, cc01, cc01) 3190 /* c04/c03 step. */ 3191 LDF [BO + 9 * SIZE], a1 3192 LDF [BO + 8 * SIZE], a2 3193 3194 FMUL a1, c04, c04 3195 FMUL a1, c03, c03 3196 3197 FNMSUB (aa2, cc04, cc02, cc02) 3198 FNMSUB (aa2, cc03, cc01, cc01) 3199 /* Final step: c02 and c01 are scaled by the value at BO + 0*SIZE. */ 3200 LDF [BO + 0 * SIZE], a1 3201 3202 FMUL a1, c02, c02 3203 FMUL a1, c01, c01 3204#endif 3205 /* LN: move the eight column pointers C1..C8 back by two elements before the stores below. */ 3206#ifdef LN 3207 add C1, -2 * SIZE, C1 3208 add C2, -2 * SIZE, C2 3209 add C3, -2 * SIZE, C3 3210 add C4, -2 * SIZE, C4 3211 add C5, -2 * SIZE, C5 3212 add C6, -2 * SIZE, C6 3213 add C7, -2 * SIZE, C7 3214 add C8, -2 * SIZE, C8 3215#endif 3216 /* Write the 16 results back to the work buffer. LN/LT store through BO with the odd-numbered accumulators first (c01, c03, .. c15, then c02 .. c16), every other variant stores through AO in plain c01..c16 order. */ 3217#if defined(LN) || defined(LT) 3218 STF c01, [BO + 0 * SIZE] 3219 STF c03, [BO + 1 * SIZE] 3220 STF c05, [BO + 2 * SIZE] 3221 STF c07, [BO + 3 * SIZE] 3222 3223 STF c09, [BO + 4 * SIZE] 3224 STF c11, [BO + 5 * SIZE] 3225 STF c13, [BO + 6 * SIZE] 3226 STF c15, [BO + 7 * SIZE] 3227 3228 STF c02, [BO + 8 * SIZE] 3229 STF c04, [BO + 9 * SIZE] 3230 STF c06, [BO + 10 * SIZE] 3231 STF c08, [BO + 11 * SIZE] 3232 3233 STF c10, [BO + 12 * SIZE] 3234 STF c12, [BO + 13 * SIZE] 3235 STF c14, [BO + 14 * SIZE] 3236 STF c16, [BO + 15 * SIZE] 3237#else 3238 STF c01, [AO + 0 * SIZE] 3239 STF c02, [AO + 1 * SIZE] 3240 STF c03, [AO + 2 * SIZE] 3241 STF c04, [AO + 3 * SIZE] 3242 3243 STF c05, [AO + 4 * SIZE] 3244 STF c06, [AO + 5 * SIZE] 3245 STF c07, [AO + 6 * SIZE] 3246 STF c08, [AO + 7 * SIZE] 3247 3248 STF c09, [AO + 8 * SIZE] 3249 STF c10, [AO + 9 * SIZE] 3250 STF c11, [AO + 10 * SIZE] 3251 STF c12, [AO + 11 * SIZE] 3252 3253 STF c13, [AO + 12 * SIZE] 3254 STF c14, [AO + 13 * SIZE] 3255 STF c15, [AO + 14 * SIZE] 3256 STF c16, [AO + 15 * SIZE] 3257#endif 3258 /* Store the tile to C: column pointer Cn receives accumulators c(2n-1) and c(2n) at element offsets 0 and 1. */ 3259 STF c01, [C1 + 0 * SIZE] 3260 STF c02, [C1 + 1 * SIZE] 3261 STF c03, [C2 + 0 * SIZE] 3262 STF c04, [C2 + 1 * SIZE] 3263 3264 STF c05, [C3 + 0 * SIZE] 3265 STF c06, [C3 + 1
* SIZE] 3266 STF c07, [C4 + 0 * SIZE] 3267 STF c08, [C4 + 1 * SIZE] 3268 3269 STF c09, [C5 + 0 * SIZE] 3270 STF c10, [C5 + 1 * SIZE] 3271 STF c11, [C6 + 0 * SIZE] 3272 STF c12, [C6 + 1 * SIZE] 3273 3274 STF c13, [C7 + 0 * SIZE] 3275 STF c14, [C7 + 1 * SIZE] 3276 STF c15, [C8 + 0 * SIZE] 3277 STF c16, [C8 + 1 * SIZE] 3278 /* Every variant except LN advances C1..C8 by two elements after the stores. */ 3279#ifndef LN 3280 add C1, 2 * SIZE, C1 3281 add C2, 2 * SIZE, C2 3282 add C3, 2 * SIZE, C3 3283 add C4, 2 * SIZE, C4 3284 add C5, 2 * SIZE, C5 3285 add C6, 2 * SIZE, C6 3286 add C7, 2 * SIZE, C7 3287 add C8, 2 * SIZE, C8 3288#endif 3289 /* RT: AORIG is advanced by K shifted left by BASE_SHIFT + 1. LT/RN: AO and BO skip the remaining K - KK steps, shifted left by BASE_SHIFT + 1 for AO and BASE_SHIFT + 3 for BO. LT then adds 2 to KK and LN subtracts 2. */ 3290#ifdef RT 3291 sll K, BASE_SHIFT + 1, TEMP1 3292 add AORIG, TEMP1, AORIG 3293#endif 3294 3295#if defined(LT) || defined(RN) 3296 sub K, KK, TEMP1 3297 sll TEMP1, BASE_SHIFT + 1, TEMP2 3298 sll TEMP1, BASE_SHIFT + 3, TEMP1 3299 add AO, TEMP2, AO 3300 add BO, TEMP1, BO 3301#endif 3302 3303#ifdef LT 3304 add KK, 2, KK 3305#endif 3306 3307#ifdef LN 3308 sub KK, 2, KK 3309#endif 3310 /* Count down I and loop back to .LL12 (above this chunk) while I is still positive. The nop fills the branch delay slot. */ 3311 add I, -1, I 3312 cmp I, 0 3313 bg,pt %icc, .LL12 3314 nop 3315 .align 4 3316 /* .LL20: path for the lowest bit of M (I = M masked with 1). When that bit is clear, branch to .LL29. This path handles a single element per C column, so only the odd-numbered accumulators c01, c03, .. c15 are used. */ 3317.LL20: 3318 and M, 1, I 3319 cmp I, 0 3320 ble,pn %icc, .LL29 3321 nop 3322 /* Pointer setup. LT/RN: BO restarts at B. Otherwise LN first moves AORIG back by K shifted left by BASE_SHIFT, then AO = AORIG + (KK shifted left by BASE_SHIFT) and BO = B + (KK shifted left by BASE_SHIFT + 3). */ 3323#if defined(LT) || defined(RN) 3324 mov B, BO 3325#else 3326#ifdef LN 3327 sll K, BASE_SHIFT + 0, TEMP1 3328 sub AORIG, TEMP1, AORIG 3329#endif 3330 3331 sll KK, BASE_SHIFT + 0, TEMP1 3332 sll KK, BASE_SHIFT + 3, TEMP2 3333 3334 add AORIG, TEMP1, AO 3335 add B, TEMP2, BO 3336#endif 3337 /* Preload a1..a4 from AO and b1..b8 from BO while clearing the eight accumulators with FCLR. */ 3338 LDF [AO + 0 * SIZE], a1 3339 LDF [AO + 1 * SIZE], a2 3340 LDF [AO + 2 * SIZE], a3 3341 LDF [AO + 3 * SIZE], a4 3342 3343 LDF [BO + 0 * SIZE], b1 3344 FCLR (cc01) 3345 LDF [BO + 1 * SIZE], b2 3346 FCLR (cc03) 3347 LDF [BO + 2 * SIZE], b3 3348 FCLR (cc05) 3349 LDF [BO + 3 * SIZE], b4 3350 FCLR (cc07) 3351 LDF [BO + 4 * SIZE], b5 3352 FCLR (cc09) 3353 LDF [BO + 5 * SIZE], b6 3354 FCLR (cc11) 3355 LDF [BO + 6 * SIZE], b7 3356 FCLR (cc13) 3357 LDF [BO + 7 * SIZE], b8 3358 FCLR (cc15) 3359 /* L = pass count for the unrolled loop: KK shifted right by 2 for LT/RN, (K - KK) shifted right by 2 otherwise. */ 3360#if defined(LT) || defined(RN) 3361 sra KK, 2, L 3362#else 3363 sub K, KK, L 3364 sra L, 2, L
3365#endif /* Skip the unrolled loop when L is not positive. The load of b9 from BO + 8*SIZE follows the branch and occupies its delay slot. */ 3366 cmp L, 0 3367 ble,pn %icc, .LL25 3368 LDF [BO + 8 * SIZE], b9 3369 .align 4 3370 /* .LL23: accumulation loop, four inner steps per pass (one each for a1, a2, a3, a4). Every FMADD pairs the current A value with b1..b8 (b9 stands in for b1 on the a2 and a4 steps) and accumulates into c01, c03, .. c15, and the LDF that follows reloads that B register for a later step. FMADD is a macro defined outside this chunk, presumably d = a*b + c -- confirm. */ 3371.LL23: 3372 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 3373 add L, -1, L 3374 3375 FMADD (aa1, bb1, cc01, cc01) 3376 LDF [BO + 16 * SIZE], b1 3377 FMADD (aa1, bb2, cc03, cc03) 3378 LDF [BO + 9 * SIZE], b2 3379 3380 FMADD (aa1, bb3, cc05, cc05) 3381 LDF [BO + 10 * SIZE], b3 3382 FMADD (aa1, bb4, cc07, cc07) 3383 LDF [BO + 11 * SIZE], b4 3384 3385 FMADD (aa1, bb5, cc09, cc09) 3386 LDF [BO + 12 * SIZE], b5 3387 FMADD (aa1, bb6, cc11, cc11) 3388 LDF [BO + 13 * SIZE], b6 3389 3390 FMADD (aa1, bb7, cc13, cc13) 3391 LDF [BO + 14 * SIZE], b7 3392 FMADD (aa1, bb8, cc15, cc15) 3393 LDF [BO + 15 * SIZE], b8 3394 /* a2 step. */ 3395 FMADD (aa2, bb9, cc01, cc01) 3396 LDF [BO + 24 * SIZE], b9 3397 FMADD (aa2, bb2, cc03, cc03) 3398 LDF [BO + 17 * SIZE], b2 3399 3400 FMADD (aa2, bb3, cc05, cc05) 3401 LDF [BO + 18 * SIZE], b3 3402 FMADD (aa2, bb4, cc07, cc07) 3403 LDF [BO + 19 * SIZE], b4 3404 3405 FMADD (aa2, bb5, cc09, cc09) 3406 LDF [BO + 20 * SIZE], b5 3407 FMADD (aa2, bb6, cc11, cc11) 3408 LDF [BO + 21 * SIZE], b6 3409 3410 FMADD (aa2, bb7, cc13, cc13) 3411 LDF [BO + 22 * SIZE], b7 3412 FMADD (aa2, bb8, cc15, cc15) 3413 LDF [BO + 23 * SIZE], b8 3414 /* Refill a1 and a2 from AO + 4 and AO + 5 for the next pass. */ 3415 LDF [AO + 4 * SIZE], a1 3416 LDF [AO + 5 * SIZE], a2 3417 /* a3 step. */ 3418 FMADD (aa3, bb1, cc01, cc01) 3419 LDF [BO + 32 * SIZE], b1 3420 FMADD (aa3, bb2, cc03, cc03) 3421 LDF [BO + 25 * SIZE], b2 3422 3423 FMADD (aa3, bb3, cc05, cc05) 3424 LDF [BO + 26 * SIZE], b3 3425 FMADD (aa3, bb4, cc07, cc07) 3426 LDF [BO + 27 * SIZE], b4 3427 3428 FMADD (aa3, bb5, cc09, cc09) 3429 LDF [BO + 28 * SIZE], b5 3430 FMADD (aa3, bb6, cc11, cc11) 3431 LDF [BO + 29 * SIZE], b6 3432 3433 FMADD (aa3, bb7, cc13, cc13) 3434 LDF [BO + 30 * SIZE], b7 3435 FMADD (aa3, bb8, cc15, cc15) 3436 LDF [BO + 31 * SIZE], b8 3437 /* a4 step. */ 3438 FMADD (aa4, bb9, cc01, cc01) 3439 LDF [BO + 40 * SIZE], b9 3440 FMADD (aa4, bb2, cc03, cc03) 3441 LDF [BO + 33 * SIZE], b2 3442 3443
FMADD (aa4, bb3, cc05, cc05) 3444 LDF [BO + 34 * SIZE], b3 3445 FMADD (aa4, bb4, cc07, cc07) 3446 LDF [BO + 35 * SIZE], b4 3447 3448 FMADD (aa4, bb5, cc09, cc09) 3449 LDF [BO + 36 * SIZE], b5 3450 FMADD (aa4, bb6, cc11, cc11) 3451 LDF [BO + 37 * SIZE], b6 3452 3453 FMADD (aa4, bb7, cc13, cc13) 3454 LDF [BO + 38 * SIZE], b7 3455 FMADD (aa4, bb8, cc15, cc15) 3456 LDF [BO + 39 * SIZE], b8 3457 /* Refill a3 and a4, then advance AO by 4 elements. The add that moves BO forward by 32 elements sits in the delay slot of the loop branch back to .LL23. */ 3458 LDF [AO + 6 * SIZE], a3 3459 LDF [AO + 7 * SIZE], a4 3460 3461 add AO, 4 * SIZE, AO 3462 cmp L, 0 3463 bg,pt %icc, .LL23 3464 add BO, 32 * SIZE, BO 3465 .align 4 3466 /* .LL25: leftover steps. L = KK masked with 3 for LT/RN, or (K - KK) masked with 3 otherwise. When L is not positive, go straight to .LL28. */ 3467.LL25: 3468#if defined(LT) || defined(RN) 3469 and KK, 3, L 3470#else 3471 sub K, KK, L 3472 and L, 3, L 3473#endif 3474 cmp L, 0 3475 ble,a,pn %icc, .LL28 3476 nop 3477 .align 4 3478 /* .LL27: one inner step per pass. a1 is combined with b1..b8 into c01, c03, .. c15, then AO advances by 1 element and BO by 8 (the BO add is in the branch delay slot). FMADD is a macro defined outside this chunk, presumably d = a*b + c -- confirm. */ 3479.LL27: 3480 FMADD (aa1, bb1, cc01, cc01) 3481 LDF [BO + 8 * SIZE], b1 3482 FMADD (aa1, bb2, cc03, cc03) 3483 LDF [BO + 9 * SIZE], b2 3484 3485 FMADD (aa1, bb3, cc05, cc05) 3486 LDF [BO + 10 * SIZE], b3 3487 FMADD (aa1, bb4, cc07, cc07) 3488 LDF [BO + 11 * SIZE], b4 3489 3490 FMADD (aa1, bb5, cc09, cc09) 3491 LDF [BO + 12 * SIZE], b5 3492 FMADD (aa1, bb6, cc11, cc11) 3493 LDF [BO + 13 * SIZE], b6 3494 3495 FMADD (aa1, bb7, cc13, cc13) 3496 LDF [BO + 14 * SIZE], b7 3497 FMADD (aa1, bb8, cc15, cc15) 3498 LDF [BO + 15 * SIZE], b8 3499 3500 LDF [AO + 1 * SIZE], a1 3501 add AO, 1 * SIZE, AO 3502 3503 add L, -1, L 3504 cmp L, 0 3505 bg,pt %icc, .LL27 3506 add BO, 8 * SIZE, BO 3507 .align 4 3508 /* .LL28: LN/RT re-derive the pointers from KK. TEMP1 = KK - 1 for LN or KK - 8 for RT, then AO = AORIG + (TEMP1 shifted left by BASE_SHIFT) and BO = B + (TEMP1 shifted left by BASE_SHIFT + 3). */ 3509.LL28: 3510#if defined(LN) || defined(RT) 3511#ifdef LN 3512 sub KK, 1, TEMP1 3513#else 3514 sub KK, 8, TEMP1 3515#endif 3516 sll TEMP1, BASE_SHIFT + 0, TEMP2 3517 sll TEMP1, BASE_SHIFT + 3, TEMP1 3518 3519 add AORIG, TEMP2, AO 3520 add B, TEMP1, BO 3521#endif 3522 /* LN/LT: load eight values from BO into a1..a4 and b1..b4. */ 3523#if defined(LN) || defined(LT) 3524 LDF [BO + 0 * SIZE], a1 3525 LDF [BO + 1 * SIZE], a2 3526 LDF [BO + 2 * SIZE], a3 3527 LDF [BO + 3 * SIZE], a4 3528 3529 LDF [BO + 4 * SIZE], b1 3530 LDF [BO + 5 * SIZE], b2 3531 LDF [BO + 6 * SIZE], b3 3532 LDF [BO + 7 * SIZE], b4
3533 /* Combine the loaded values with the accumulators. With FSUB x, c, c each accumulator becomes x - c, assuming the FSUB macro expands to the SPARC fsub form (the macro is defined outside this chunk -- confirm). */ 3534 FSUB a1, c01, c01 3535 FSUB a2, c03, c03 3536 FSUB a3, c05, c05 3537 FSUB a4, c07, c07 3538 3539 FSUB b1, c09, c09 3540 FSUB b2, c11, c11 3541 FSUB b3, c13, c13 3542 FSUB b4, c15, c15 3543#else /* Other variants take the same eight values from AO instead. */ 3544 LDF [AO + 0 * SIZE], a1 3545 LDF [AO + 1 * SIZE], a2 3546 LDF [AO + 2 * SIZE], a3 3547 LDF [AO + 3 * SIZE], a4 3548 3549 LDF [AO + 4 * SIZE], b1 3550 LDF [AO + 5 * SIZE], b2 3551 LDF [AO + 6 * SIZE], b3 3552 LDF [AO + 7 * SIZE], b4 3553 3554 FSUB a1, c01, c01 3555 FSUB a2, c03, c03 3556 FSUB a3, c05, c05 3557 FSUB a4, c07, c07 3558 3559 FSUB b1, c09, c09 3560 FSUB b2, c11, c11 3561 FSUB b3, c13, c13 3562 FSUB b4, c15, c15 3563#endif 3564 /* LN/LT: scale all eight accumulators by the single value at AO + 0*SIZE. */ 3565#if defined(LN) || defined(LT) 3566 LDF [AO + 0 * SIZE], a1 3567 3568 FMUL a1, c01, c01 3569 FMUL a1, c03, c03 3570 FMUL a1, c05, c05 3571 FMUL a1, c07, c07 3572 FMUL a1, c09, c09 3573 FMUL a1, c11, c11 3574 FMUL a1, c13, c13 3575 FMUL a1, c15, c15 3576#endif 3577 /* RN: walk the coefficients read through BO from the low end. Each step scales one accumulator by the value at the base offset of that step (0, 9, 18, ...) and then uses FNMSUB to update the accumulators that come after it. FNMSUB is a macro defined outside this chunk, presumably d = c - a*b -- confirm. */ 3578#ifdef RN 3579 LDF [BO + 0 * SIZE], a1 3580 LDF [BO + 1 * SIZE], a2 3581 LDF [BO + 2 * SIZE], a3 3582 LDF [BO + 3 * SIZE], a4 3583 LDF [BO + 4 * SIZE], b1 3584 LDF [BO + 5 * SIZE], b2 3585 LDF [BO + 6 * SIZE], b3 3586 LDF [BO + 7 * SIZE], b4 3587 3588 FMUL a1, c01, c01 3589 3590 FNMSUB (aa2, cc01, cc03, cc03) 3591 FNMSUB (aa3, cc01, cc05, cc05) 3592 FNMSUB (aa4, cc01, cc07, cc07) 3593 FNMSUB (bb1, cc01, cc09, cc09) 3594 FNMSUB (bb2, cc01, cc11, cc11) 3595 FNMSUB (bb3, cc01, cc13, cc13) 3596 FNMSUB (bb4, cc01, cc15, cc15) 3597 /* c03 step. */ 3598 LDF [BO + 9 * SIZE], a1 3599 LDF [BO + 10 * SIZE], a2 3600 LDF [BO + 11 * SIZE], a3 3601 LDF [BO + 12 * SIZE], a4 3602 LDF [BO + 13 * SIZE], b1 3603 LDF [BO + 14 * SIZE], b2 3604 LDF [BO + 15 * SIZE], b3 3605 3606 FMUL a1, c03, c03 3607 3608 FNMSUB (aa2, cc03, cc05, cc05) 3609 FNMSUB (aa3, cc03, cc07, cc07) 3610 FNMSUB (aa4, cc03, cc09, cc09) 3611 FNMSUB (bb1, cc03, cc11, cc11) 3612 FNMSUB (bb2, cc03, cc13, cc13) 3613 FNMSUB (bb3, cc03, cc15, cc15) 3614 /* c05 step: six coefficients from BO + 18..23. */ 3615 LDF [BO + 18 * SIZE], a1 3616 LDF [BO + 19 * SIZE], a2 3617 LDF [BO + 20 *
SIZE], a3 3618 LDF [BO + 21 * SIZE], a4 3619 LDF [BO + 22 * SIZE], b1 3620 LDF [BO + 23 * SIZE], b2 3621 3622 FMUL a1, c05, c05 3623 3624 FNMSUB (aa2, cc05, cc07, cc07) 3625 FNMSUB (aa3, cc05, cc09, cc09) 3626 FNMSUB (aa4, cc05, cc11, cc11) 3627 FNMSUB (bb1, cc05, cc13, cc13) 3628 FNMSUB (bb2, cc05, cc15, cc15) 3629 /* c07 step: scale by the value at BO + 27*SIZE, then update c09 .. c15. */ 3630 LDF [BO + 27 * SIZE], a1 3631 LDF [BO + 28 * SIZE], a2 3632 LDF [BO + 29 * SIZE], a3 3633 LDF [BO + 30 * SIZE], a4 3634 LDF [BO + 31 * SIZE], b1 3635 3636 FMUL a1, c07, c07 3637 3638 FNMSUB (aa2, cc07, cc09, cc09) 3639 FNMSUB (aa3, cc07, cc11, cc11) 3640 FNMSUB (aa4, cc07, cc13, cc13) 3641 FNMSUB (bb1, cc07, cc15, cc15) 3642 /* c09 step. */ 3643 LDF [BO + 36 * SIZE], a1 3644 LDF [BO + 37 * SIZE], a2 3645 LDF [BO + 38 * SIZE], a3 3646 LDF [BO + 39 * SIZE], a4 3647 3648 FMUL a1, c09, c09 3649 3650 FNMSUB (aa2, cc09, cc11, cc11) 3651 FNMSUB (aa3, cc09, cc13, cc13) 3652 FNMSUB (aa4, cc09, cc15, cc15) 3653 /* c11 step. */ 3654 LDF [BO + 45 * SIZE], a1 3655 LDF [BO + 46 * SIZE], a2 3656 LDF [BO + 47 * SIZE], a3 3657 3658 FMUL a1, c11, c11 3659 3660 FNMSUB (aa2, cc11, cc13, cc13) 3661 FNMSUB (aa3, cc11, cc15, cc15) 3662 /* c13 step. */ 3663 LDF [BO + 54 * SIZE], a1 3664 LDF [BO + 55 * SIZE], a2 3665 3666 FMUL a1, c13, c13 3667 3668 FNMSUB (aa2, cc13, cc15, cc15) 3669 /* Last step: c15 is only scaled, by the value at BO + 63*SIZE. */ 3670 LDF [BO + 63 * SIZE], a1 3671 3672 FMUL a1, c15, c15 3673#endif 3674 /* RT: the reverse walk. It begins with c15 scaled by the value at BO + 63*SIZE and eliminates into c13, c11, .. c01, with the base offset dropping by 9*SIZE per step. FNMSUB is a macro defined outside this chunk, presumably d = c - a*b -- confirm. */ 3675#ifdef RT 3676 LDF [BO + 63 * SIZE], a1 3677 LDF [BO + 62 * SIZE], a2 3678 LDF [BO + 61 * SIZE], a3 3679 LDF [BO + 60 * SIZE], a4 3680 LDF [BO + 59 * SIZE], b1 3681 LDF [BO + 58 * SIZE], b2 3682 LDF [BO + 57 * SIZE], b3 3683 LDF [BO + 56 * SIZE], b4 3684 3685 FMUL a1, c15, c15 3686 3687 FNMSUB (aa2, cc15, cc13, cc13) 3688 FNMSUB (aa3, cc15, cc11, cc11) 3689 FNMSUB (aa4, cc15, cc09, cc09) 3690 FNMSUB (bb1, cc15, cc07, cc07) 3691 FNMSUB (bb2, cc15, cc05, cc05) 3692 FNMSUB (bb3, cc15, cc03, cc03) 3693 FNMSUB (bb4, cc15, cc01, cc01) 3694 /* c13 step: coefficients at BO + 54 down to BO + 48. */ 3695 LDF [BO + 54 * SIZE], a1 3696 LDF [BO + 53 * SIZE], a2 3697 LDF [BO + 52 * SIZE], a3 3698 LDF [BO + 51 * SIZE], a4 3699 LDF
[BO + 50 * SIZE], b1 3700 LDF [BO + 49 * SIZE], b2 3701 LDF [BO + 48 * SIZE], b3 3702 3703 FMUL a1, c13, c13 3704 3705 FNMSUB (aa2, cc13, cc11, cc11) 3706 FNMSUB (aa3, cc13, cc09, cc09) 3707 FNMSUB (aa4, cc13, cc07, cc07) 3708 FNMSUB (bb1, cc13, cc05, cc05) 3709 FNMSUB (bb2, cc13, cc03, cc03) 3710 FNMSUB (bb3, cc13, cc01, cc01) 3711 /* c11 step of the reverse (RT-direction) walk: scale by the value at BO + 45*SIZE, then FNMSUB into c09 .. c01. FNMSUB is a macro defined outside this chunk, presumably d = c - a*b -- confirm. */ 3712 LDF [BO + 45 * SIZE], a1 3713 LDF [BO + 44 * SIZE], a2 3714 LDF [BO + 43 * SIZE], a3 3715 LDF [BO + 42 * SIZE], a4 3716 LDF [BO + 41 * SIZE], b1 3717 LDF [BO + 40 * SIZE], b2 3718 3719 FMUL a1, c11, c11 3720 3721 FNMSUB (aa2, cc11, cc09, cc09) 3722 FNMSUB (aa3, cc11, cc07, cc07) 3723 FNMSUB (aa4, cc11, cc05, cc05) 3724 FNMSUB (bb1, cc11, cc03, cc03) 3725 FNMSUB (bb2, cc11, cc01, cc01) 3726 /* c09 step. */ 3727 LDF [BO + 36 * SIZE], a1 3728 LDF [BO + 35 * SIZE], a2 3729 LDF [BO + 34 * SIZE], a3 3730 LDF [BO + 33 * SIZE], a4 3731 LDF [BO + 32 * SIZE], b1 3732 3733 FMUL a1, c09, c09 3734 3735 FNMSUB (aa2, cc09, cc07, cc07) 3736 FNMSUB (aa3, cc09, cc05, cc05) 3737 FNMSUB (aa4, cc09, cc03, cc03) 3738 FNMSUB (bb1, cc09, cc01, cc01) 3739 /* c07 step. */ 3740 LDF [BO + 27 * SIZE], a1 3741 LDF [BO + 26 * SIZE], a2 3742 LDF [BO + 25 * SIZE], a3 3743 LDF [BO + 24 * SIZE], a4 3744 3745 FMUL a1, c07, c07 3746 3747 FNMSUB (aa2, cc07, cc05, cc05) 3748 FNMSUB (aa3, cc07, cc03, cc03) 3749 FNMSUB (aa4, cc07, cc01, cc01) 3750 /* c05 step. */ 3751 LDF [BO + 18 * SIZE], a1 3752 LDF [BO + 17 * SIZE], a2 3753 LDF [BO + 16 * SIZE], a3 3754 3755 FMUL a1, c05, c05 3756 3757 FNMSUB (aa2, cc05, cc03, cc03) 3758 FNMSUB (aa3, cc05, cc01, cc01) 3759 /* c03 step. */ 3760 LDF [BO + 9 * SIZE], a1 3761 LDF [BO + 8 * SIZE], a2 3762 3763 FMUL a1, c03, c03 3764 3765 FNMSUB (aa2, cc03, cc01, cc01) 3766 /* Final step: c01 is scaled by the value at BO + 0*SIZE. */ 3767 LDF [BO + 0 * SIZE], a1 3768 3769 FMUL a1, c01, c01 3770#endif 3771 /* LN: move C1..C8 back by one element before the stores that follow. */ 3772#ifdef LN 3773 add C1, -1 * SIZE, C1 3774 add C2, -1 * SIZE, C2 3775 add C3, -1 * SIZE, C3 3776 add C4, -1 * SIZE, C4 3777 add C5, -1 * SIZE, C5 3778 add C6, -1 * SIZE, C6 3779 add C7, -1 * SIZE, C7 3780 add C8, -1 * SIZE, C8 3781#endif 3782 3783#if
defined(LN) || defined(LT) /* LN/LT: write the eight solved values c01, c03, .. c15 back through BO. Other variants write them through AO. */ 3784 STF c01, [BO + 0 * SIZE] 3785 STF c03, [BO + 1 * SIZE] 3786 STF c05, [BO + 2 * SIZE] 3787 STF c07, [BO + 3 * SIZE] 3788 3789 STF c09, [BO + 4 * SIZE] 3790 STF c11, [BO + 5 * SIZE] 3791 STF c13, [BO + 6 * SIZE] 3792 STF c15, [BO + 7 * SIZE] 3793#else 3794 STF c01, [AO + 0 * SIZE] 3795 STF c03, [AO + 1 * SIZE] 3796 STF c05, [AO + 2 * SIZE] 3797 STF c07, [AO + 3 * SIZE] 3798 3799 STF c09, [AO + 4 * SIZE] 3800 STF c11, [AO + 5 * SIZE] 3801 STF c13, [AO + 6 * SIZE] 3802 STF c15, [AO + 7 * SIZE] 3803#endif 3804 /* Store one element to each of the eight column pointers C1..C8. */ 3805 STF c01, [C1 + 0 * SIZE] 3806 STF c03, [C2 + 0 * SIZE] 3807 STF c05, [C3 + 0 * SIZE] 3808 STF c07, [C4 + 0 * SIZE] 3809 3810 STF c09, [C5 + 0 * SIZE] 3811 STF c11, [C6 + 0 * SIZE] 3812 STF c13, [C7 + 0 * SIZE] 3813 STF c15, [C8 + 0 * SIZE] 3814 /* RT: AORIG is advanced by K shifted left by BASE_SHIFT. LT/RN: AO and BO skip the remaining K - KK steps, shifted left by BASE_SHIFT + 0 for AO and BASE_SHIFT + 3 for BO. LT then adds 1 to KK and LN subtracts 1. */ 3815#ifdef RT 3816 sll K, BASE_SHIFT + 0, TEMP1 3817 add AORIG, TEMP1, AORIG 3818#endif 3819 3820#if defined(LT) || defined(RN) 3821 sub K, KK, TEMP1 3822 sll TEMP1, BASE_SHIFT + 0, TEMP2 3823 sll TEMP1, BASE_SHIFT + 3, TEMP1 3824 add AO, TEMP2, AO 3825 add BO, TEMP1, BO 3826#endif 3827 3828#ifdef LT 3829 add KK, 1, KK 3830#endif 3831 3832#ifdef LN 3833 sub KK, 1, KK 3834#endif 3835 .align 4 3836 /* .LL29: end of one pass of the J loop. LN moves B forward by K shifted left by BASE_SHIFT + 3, LT/RN take B from BO. RN adds 8 to KK and RT subtracts 8. Then J is decremented and control returns to .LL11 (above this chunk) while J is still positive. The nop fills the branch delay slot. */ 3837.LL29: 3838#ifdef LN 3839 sll K, BASE_SHIFT + 3, TEMP1 3840 add B, TEMP1, B 3841#endif 3842 3843#if defined(LT) || defined(RN) 3844 mov BO, B 3845#endif 3846 3847#ifdef RN 3848 add KK, 8, KK 3849#endif 3850 3851#ifdef RT 3852 sub KK, 8, KK 3853#endif 3854 3855 add J, -1, J 3856 cmp J, 0 3857 bg,pt %icc, .LL11 3858 nop 3859 .align 4 3860 /* .LL999: exit. Under TRMMKERNEL the global registers %g1..%g4 are reloaded from the stack frame, using 32-bit ld when __64BIT__ is not defined and 64-bit ldx when it is. clr %o0 sits in the delay slot of return. */ 3861.LL999: 3862#ifdef TRMMKERNEL 3863#ifndef __64BIT__ 3864 ld [%sp + STACK_START + 8], %g1 3865 ld [%sp + STACK_START + 12], %g2 3866 ld [%sp + STACK_START + 16], %g3 3867 ld [%sp + STACK_START + 20], %g4 3868#else 3869 ldx [%sp + STACK_START + 32], %g1 3870 ldx [%sp + STACK_START + 40], %g2 3871 ldx [%sp + STACK_START + 48], %g3 3872 ldx [%sp + STACK_START + 56], %g4 3873#endif 3874#endif 3875 3876 return %i7 + 8 3877 clr %o0 3878 3879 EPILOGUE 3880