1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 /* Integer register aliases. M, N and K name the first three incoming registers (%i0 to %i2). They look like the problem dimensions, TODO confirm against the caller. */ 42#define M %i0 43#define N %i1 44#define K %i2 45 /* A and B swap registers in the 32-bit DOUBLE build. In that build B is loaded from the stack right after PROLOGUE instead of being used as it arrives. */ 46#if defined(DOUBLE) && !defined(__64BIT__) 47#define A %i5 48#define B %i4 49#else 50#define A %i4 51#define B %i5 52#endif 53 /* C and LDC are loaded from the stack after PROLOGUE, and LDC is then shifted left by BASE_SHIFT. */ 54#define C %o4 55#define LDC %o5 56 /* AO and BO are the running pointers used for the loads inside the loops. I, J and L hold counts derived from M, N and K or KK. */ 57#define AO %l0 58#define BO %l1 59#define I %l2 60#define J %l3 61#define L %l4 62 /* C1 to C4 are four output pointers spaced LDC apart, set up from C at the top of the outer loop. */ 63#define C1 %o0 64#define C2 %o1 65#define C3 %o2 66#define C4 %o3 67 /* OFFSET is loaded from the stack and KK is derived from it differently for each of LN, LT, RN and RT. TEMP1 and TEMP2 are scratch registers. AORIG keeps a copy of A in the LN and RT variants. */ 68#define OFFSET %l5 69#define KK %l6 70#define TEMP1 %l7 71#define TEMP2 %i3 72#define AORIG %g1 73 /* Floating-point register aliases. c01 to c16 are the accumulators, t1 to t4 hold the pending products, a1 to a5 and b1 to b5 receive values loaded through AO and BO. FZERO is the source used to zero registers with FMOV. ALPHA is not referenced in the visible part of this file. The DOUBLE build uses even-numbered registers and the other build uses consecutive ones. */ 74#ifdef DOUBLE 75#define c01 %f0 76#define c02 %f2 77#define c03 %f4 78#define c04 %f6 79#define c05 %f8 80#define c06 %f10 81#define c07 %f12 82#define c08 %f14 83#define c09 %f16 84#define c10 %f18 85#define c11 %f20 86#define c12 %f22 87#define c13 %f24 88#define c14 %f26 89#define c15 %f28 90#define c16 %f30 91 92#define t1 %f32 93#define t2 %f34 94#define t3 %f36 95#define t4 %f38 96 97#define a1 %f40 98#define a2 %f42 99#define a3 %f44 100#define a4 %f46 101#define a5 %f58 102 103#define b1 %f48 104#define b2 %f50 105#define b3 %f52 106#define b4 %f54 107#define b5 %f56 108 109#define FZERO %f60 110#define ALPHA %f62 111#else 112#define c01 %f0 113#define c02 %f1 114#define c03 %f2 115#define c04 %f3 116#define c05 %f4 117#define c06 %f5 118#define c07 %f6 119#define c08 %f7 120#define c09 %f8 121#define c10 %f9 122#define c11 %f10 123#define c12 %f11 124#define c13 %f12 125#define c14 %f13 126#define c15 %f14 127#define c16 %f15 128 129#define t1 %f16 130#define t2 %f17 131#define t3 %f18 132#define t4 %f19 133 134#define a1 %f20 135#define a2 %f21 136#define a3 %f22 137#define a4 %f23 138#define a5 %f31 139 140#define b1 %f24 141#define b2 %f25 142#define b3 %f26 143#define b4 %f27 144#define b5 %f28 145 146#define FZERO %f29 147#define ALPHA %f30 148#endif 149 /* Prefetch tuning. The two SIZE constants are distances in elements (multiplied by SIZE at the use sites) ahead of AO and BO. The two CATEGORY constants are passed as the second operand of the prefetch instruction. */ 150#define APREFETCHSIZE 40 151#define BPREFETCHSIZE 40 152 153#define APREFETCH_CATEGORY 0 154#define 
BPREFETCH_CATEGORY 0 155 156 PROLOGUE 157 SAVESP 158 nop 159 160#ifndef __64BIT__ 161#ifdef DOUBLE 162 ld [%sp + STACK_START + 28], B 163 ld [%sp + STACK_START + 32], C 164 ld [%sp + STACK_START + 36], LDC 165 ld [%sp + STACK_START + 40], OFFSET 166#else 167 ld [%sp + STACK_START + 28], C 168 ld [%sp + STACK_START + 32], LDC 169 ld [%sp + STACK_START + 36], OFFSET 170#endif 171#else 172 ldx [%sp+ STACK_START + 56], C 173 ldx [%sp+ STACK_START + 64], LDC 174 ldx [%sp+ STACK_START + 72], OFFSET 175#endif 176 177 FCLR(29) 178 179 sll LDC, BASE_SHIFT, LDC 180 181#ifdef LN 182 smul M, K, TEMP1 183 sll TEMP1, BASE_SHIFT, TEMP1 184 add A, TEMP1, A 185 186 sll M, BASE_SHIFT, TEMP1 187 add C, TEMP1, C 188#endif 189 190#ifdef RN 191 neg OFFSET, KK 192#endif 193 194#ifdef RT 195 smul N, K, TEMP1 196 sll TEMP1, BASE_SHIFT, TEMP1 197 add B, TEMP1, B 198 199 smul N, LDC, TEMP1 200 add C, TEMP1, C 201 202 sub N, OFFSET, KK 203#endif 204 205 sra N, 2, J 206 cmp J, 0 207 ble,pn %icc, .LL100 208 nop 209 210.LL11: 211#ifdef RT 212 sll K, 2 + BASE_SHIFT, TEMP1 213 sub B, TEMP1, B 214 215 sll LDC, 2, TEMP1 216 sub C, TEMP1, C 217#endif 218 219 mov C, C1 220 add C, LDC, C2 221 add C2, LDC, C3 222 add C3, LDC, C4 223 224#ifdef LN 225 add M, OFFSET, KK 226#endif 227 228#ifdef LT 229 mov OFFSET, KK 230#endif 231 232#if defined(LN) || defined(RT) 233 mov A, AORIG 234#else 235 mov A, AO 236#endif 237 238#ifndef RT 239 add C4, LDC, C 240#endif 241 242 and M, 1, I 243 cmp I, 0 244 ble,pn %icc, .LL50 245 nop 246 247#if defined(LT) || defined(RN) 248 sra KK, 2, L 249 250 mov B, BO 251 cmp L, 0 252#else 253 254#ifdef LN 255 sll K, 0 + BASE_SHIFT, TEMP1 256 sub AORIG, TEMP1, AORIG 257#endif 258 259 sll KK, 0 + BASE_SHIFT, TEMP1 260 sll KK, 2 + BASE_SHIFT, TEMP2 261 262 add AORIG, TEMP1, AO 263 add B, TEMP2, BO 264 265 sub K, KK, TEMP1 266 sra TEMP1, 2, L 267 cmp L, 0 268#endif 269 270 LDF [AO + 0 * SIZE], a1 271 FMOV FZERO, c01 272 LDF [BO + 0 * SIZE], b1 273 FMOV FZERO, t1 274 LDF [AO + 1 * 
SIZE], a2 275 FMOV FZERO, c02 276 LDF [BO + 1 * SIZE], b2 277 FMOV FZERO, t2 278 LDF [AO + 2 * SIZE], a3 279 FMOV FZERO, c03 280 LDF [BO + 2 * SIZE], b3 281 FMOV FZERO, t3 282 LDF [AO + 3 * SIZE], a4 283 FMOV FZERO, c04 284 LDF [BO + 3 * SIZE], b4 285 FMOV FZERO, t4 286 287 ble,pn %icc, .LL75 288 nop 289 290.LL72: 291 FADD c01, t1, c01 292 add L, -1, L 293 FMUL a1, b1, t1 294 LDF [BO + 4 * SIZE], b1 295 296 FADD c02, t2, c02 297 cmp L, 0 298 FMUL a1, b2, t2 299 LDF [BO + 5 * SIZE], b2 300 301 FADD c03, t3, c03 302 FMUL a1, b3, t3 303 LDF [BO + 6 * SIZE], b3 304 305 FADD c04, t4, c04 306 FMUL a1, b4, t4 307 LDF [BO + 7 * SIZE], b4 308 LDF [AO + 4 * SIZE], a1 309 310 FADD c01, t1, c01 311 add AO, 4 * SIZE, AO 312 FMUL a2, b1, t1 313 LDF [BO + 8 * SIZE], b1 314 315 FADD c02, t2, c02 316 FMUL a2, b2, t2 317 LDF [BO + 9 * SIZE], b2 318 319 FADD c03, t3, c03 320 FMUL a2, b3, t3 321 LDF [BO + 10 * SIZE], b3 322 323 FADD c04, t4, c04 324 FMUL a2, b4, t4 325 LDF [BO + 11 * SIZE], b4 326 LDF [AO + 1 * SIZE], a2 327 328 FADD c01, t1, c01 329 FMUL a3, b1, t1 330 LDF [BO + 12 * SIZE], b1 331 332 FADD c02, t2, c02 333 FMUL a3, b2, t2 334 LDF [BO + 13 * SIZE], b2 335 336 FADD c03, t3, c03 337 FMUL a3, b3, t3 338 LDF [BO + 14 * SIZE], b3 339 340 FADD c04, t4, c04 341 FMUL a3, b4, t4 342 LDF [BO + 15 * SIZE], b4 343 LDF [AO + 2 * SIZE], a3 344 345 FADD c01, t1, c01 346 FMUL a4, b1, t1 347 LDF [BO + 16 * SIZE], b1 348 349 FADD c02, t2, c02 350 FMUL a4, b2, t2 351 LDF [BO + 17 * SIZE], b2 352 353 FADD c03, t3, c03 354 FMUL a4, b3, t3 355 LDF [BO + 18 * SIZE], b3 356 357 FADD c04, t4, c04 358 FMUL a4, b4, t4 359 LDF [BO + 19 * SIZE], b4 360 361 add BO, 16 * SIZE, BO 362 bg,pt %icc, .LL72 363 LDF [AO + 3 * SIZE], a4 364 365.LL75: 366#if defined(LT) || defined(RN) 367 and KK, 3, L 368#else 369 and TEMP1, 3, L 370#endif 371 cmp L, 0 372 ble,a,pn %icc, .LL79 373 nop 374 375.LL76: 376 FADD c01, t1, c01 377 add AO, 1 * SIZE, AO 378 FMUL a1, b1, t1 379 LDF [BO + 4 * SIZE], b1 380 381 FADD 
c02, t2, c02 382 add L, -1, L 383 FMUL a1, b2, t2 384 LDF [BO + 5 * SIZE], b2 385 386 FADD c03, t3, c03 387 cmp L, 0 388 FMUL a1, b3, t3 389 LDF [BO + 6 * SIZE], b3 390 391 FADD c04, t4, c04 392 add BO, 4 * SIZE, BO 393 FMUL a1, b4, t4 394 LDF [AO + 0 * SIZE], a1 395 396 bg,pt %icc, .LL76 397 LDF [BO + 3 * SIZE], b4 398 399 400.LL79: 401 FADD c01, t1, c01 402 FADD c02, t2, c02 403 FADD c03, t3, c03 404 FADD c04, t4, c04 405 406#if defined(LN) || defined(RT) 407#ifdef LN 408 sub KK, 1, TEMP1 409#else 410 sub KK, 4, TEMP1 411#endif 412 sll TEMP1, 0 + BASE_SHIFT, TEMP2 413 sll TEMP1, 2 + BASE_SHIFT, TEMP1 414 add AORIG, TEMP2, AO 415 add B, TEMP1, BO 416#endif 417 418#if defined(LN) || defined(LT) 419 LDF [BO + 0 * SIZE], a1 420 LDF [BO + 1 * SIZE], a2 421 LDF [BO + 2 * SIZE], a3 422 LDF [BO + 3 * SIZE], a4 423 424 FSUB a1, c01, c01 425 FSUB a2, c02, c02 426 FSUB a3, c03, c03 427 FSUB a4, c04, c04 428#else 429 LDF [AO + 0 * SIZE], a1 430 LDF [AO + 1 * SIZE], a2 431 LDF [AO + 2 * SIZE], a3 432 LDF [AO + 3 * SIZE], a4 433 434 FSUB a1, c01, c01 435 FSUB a2, c02, c02 436 FSUB a3, c03, c03 437 FSUB a4, c04, c04 438#endif 439 440#ifdef LN 441 LDF [AO + 0 * SIZE], a1 442 443 FMUL a1, c01, c01 444 FMUL a1, c02, c02 445 FMUL a1, c03, c03 446 FMUL a1, c04, c04 447#endif 448 449#ifdef LT 450 LDF [AO + 0 * SIZE], a1 451 452 FMUL a1, c01, c01 453 FMUL a1, c02, c02 454 FMUL a1, c03, c03 455 FMUL a1, c04, c04 456#endif 457 458#ifdef RN 459 LDF [BO + 0 * SIZE], a1 460 LDF [BO + 1 * SIZE], a2 461 LDF [BO + 2 * SIZE], a3 462 LDF [BO + 3 * SIZE], a4 463 464 FMUL a1, c01, c01 465 FMUL a2, c01, t1 466 FSUB c02, t1, c02 467 FMUL a3, c01, t1 468 FSUB c03, t1, c03 469 FMUL a4, c01, t1 470 FSUB c04, t1, c04 471 472 LDF [BO + 5 * SIZE], a1 473 LDF [BO + 6 * SIZE], a2 474 LDF [BO + 7 * SIZE], a3 475 476 FMUL a1, c02, c02 477 FMUL a2, c02, t1 478 FSUB c03, t1, c03 479 FMUL a3, c02, t1 480 FSUB c04, t1, c04 481 482 LDF [BO + 10 * SIZE], a1 483 LDF [BO + 11 * SIZE], a2 484 485 FMUL a1, c03, c03 
486 FMUL a2, c03, t1 487 FSUB c04, t1, c04 488 489 LDF [BO + 15 * SIZE], a1 490 491 FMUL a1, c04, c04 492#endif 493 494#ifdef RT 495 LDF [BO + 15 * SIZE], a1 496 LDF [BO + 14 * SIZE], a2 497 LDF [BO + 13 * SIZE], a3 498 LDF [BO + 12 * SIZE], a4 499 500 FMUL a1, c04, c04 501 FMUL a2, c04, t1 502 FSUB c03, t1, c03 503 FMUL a3, c04, t1 504 FSUB c02, t1, c02 505 FMUL a4, c04, t1 506 FSUB c01, t1, c01 507 508 LDF [BO + 10 * SIZE], a1 509 LDF [BO + 9 * SIZE], a2 510 LDF [BO + 8 * SIZE], a3 511 512 FMUL a1, c03, c03 513 FMUL a2, c03, t1 514 FSUB c02, t1, c02 515 FMUL a3, c03, t1 516 FSUB c01, t1, c01 517 518 LDF [BO + 5 * SIZE], a1 519 LDF [BO + 4 * SIZE], a2 520 521 FMUL a1, c02, c02 522 FMUL a2, c02, t1 523 FSUB c01, t1, c01 524 525 LDF [BO + 0 * SIZE], a1 526 527 FMUL a1, c01, c01 528#endif 529 530#ifdef LN 531 add C1, -1 * SIZE, C1 532 add C2, -1 * SIZE, C2 533 add C3, -1 * SIZE, C3 534 add C4, -1 * SIZE, C4 535#endif 536 537#if defined(LN) || defined(LT) 538 STF c01, [BO + 0 * SIZE] 539 STF c02, [BO + 1 * SIZE] 540 STF c03, [BO + 2 * SIZE] 541 STF c04, [BO + 3 * SIZE] 542#else 543 STF c01, [AO + 0 * SIZE] 544 STF c02, [AO + 1 * SIZE] 545 STF c03, [AO + 2 * SIZE] 546 STF c04, [AO + 3 * SIZE] 547#endif 548 549 STF c01, [C1 + 0 * SIZE] 550 STF c02, [C2 + 0 * SIZE] 551 STF c03, [C3 + 0 * SIZE] 552 STF c04, [C4 + 0 * SIZE] 553 554 FMOV FZERO, t1 555 FMOV FZERO, t2 556 FMOV FZERO, t3 557 FMOV FZERO, t4 558 559#ifndef LN 560 add C1, 1 * SIZE, C1 561 add C2, 1 * SIZE, C2 562 add C3, 1 * SIZE, C3 563 add C4, 1 * SIZE, C4 564#endif 565 566#ifdef RT 567 sll K, 0 + BASE_SHIFT, TEMP1 568 add AORIG, TEMP1, AORIG 569#endif 570 571#if defined(LT) || defined(RN) 572 sub K, KK, TEMP1 573 sll TEMP1, 0 + BASE_SHIFT, TEMP2 574 sll TEMP1, 2 + BASE_SHIFT, TEMP1 575 add AO, TEMP2, AO 576 add BO, TEMP1, BO 577#endif 578 579#ifdef LT 580 add KK, 1, KK 581#endif 582 583#ifdef LN 584 sub KK, 1, KK 585#endif 586 587.LL50: 588 and M, 2, I 589 cmp I, 0 590 ble,pn %icc, .LL70 591 nop 592 593#if 
defined(LT) || defined(RN) 594 sra KK, 2, L 595 596 mov B, BO 597 cmp L, 0 598#else 599 600#ifdef LN 601 sll K, 1 + BASE_SHIFT, TEMP1 602 sub AORIG, TEMP1, AORIG 603#endif 604 605 sll KK, 1 + BASE_SHIFT, TEMP1 606 sll KK, 2 + BASE_SHIFT, TEMP2 607 608 add AORIG, TEMP1, AO 609 add B, TEMP2, BO 610 611 sub K, KK, TEMP1 612 sra TEMP1, 2, L 613 cmp L, 0 614#endif 615 616 FMOV FZERO, c02 617 FMOV FZERO, t1 618 FMOV FZERO, c04 619 620 LDF [AO + 0 * SIZE], a1 621 FMOV FZERO, t2 622 LDF [BO + 0 * SIZE], b1 623 FMOV FZERO, c06 624 LDF [AO + 1 * SIZE], a2 625 FMOV FZERO, t3 626 LDF [BO + 1 * SIZE], b2 627 FMOV FZERO, c08 628 LDF [AO + 2 * SIZE], a3 629 FMOV FZERO, t4 630 LDF [BO + 2 * SIZE], b3 631 FMOV FZERO, c01 632 LDF [AO + 3 * SIZE], a4 633 FMOV FZERO, c03 634 LDF [BO + 3 * SIZE], b4 635 FMOV FZERO, c05 636 637 ble,pn %icc, .LL55 638 FMOV FZERO, c07 639 640.LL52: 641 FADD c02, t1, c02 642 add AO, 8 * SIZE, AO 643 prefetch [AO + APREFETCHSIZE * SIZE], 0 644 645 FMUL a1, b1, t1 646 add BO, 16 * SIZE, BO 647 648 FADD c04, t2, c04 649 add L, -1, L 650 FMUL a1, b2, t2 651 652 FADD c06, t3, c06 653 cmp L, 0 654 FMUL a1, b3, t3 655 656 FADD c08, t4, c08 657 FMUL a1, b4, t4 658 LDF [AO - 4 * SIZE], a1 659 660 FADD c01, t1, c01 661 FMUL a2, b1, t1 662 LDF [BO - 12 * SIZE], b1 663 FADD c03, t2, c03 664 FMUL a2, b2, t2 665 LDF [BO - 11 * SIZE], b2 666 667 FADD c05, t3, c05 668 FMUL a2, b3, t3 669 LDF [BO - 10 * SIZE], b3 670 FADD c07, t4, c07 671 FMUL a2, b4, t4 672 LDF [BO - 9 * SIZE], b4 673 674 FADD c02, t1, c02 675 FMUL a3, b1, t1 676 LDF [AO - 3 * SIZE], a2 677 FADD c04, t2, c04 678 FMUL a3, b2, t2 679 680 FADD c06, t3, c06 681 FMUL a3, b3, t3 682 FADD c08, t4, c08 683 FMUL a3, b4, t4 684 LDF [AO - 2 * SIZE], a3 685 686 FADD c01, t1, c01 687 FMUL a4, b1, t1 688 LDF [BO - 8 * SIZE], b1 689 FADD c03, t2, c03 690 FMUL a4, b2, t2 691 LDF [BO - 7 * SIZE], b2 692 693 FADD c05, t3, c05 694 FMUL a4, b3, t3 695 LDF [BO - 6 * SIZE], b3 696 FADD c07, t4, c07 697 FMUL a4, b4, t4 698 LDF 
[BO - 5 * SIZE], b4 699 700 FADD c02, t1, c02 701 FMUL a1, b1, t1 702 LDF [AO - 1 * SIZE], a4 703 FADD c04, t2, c04 704 FMUL a1, b2, t2 705 706 FADD c06, t3, c06 707 FMUL a1, b3, t3 708 FADD c08, t4, c08 709 FMUL a1, b4, t4 710 LDF [AO + 0 * SIZE], a1 711 712 FADD c01, t1, c01 713 FMUL a2, b1, t1 714 LDF [BO - 4 * SIZE], b1 715 716 FADD c03, t2, c03 717 FMUL a2, b2, t2 718 LDF [BO - 3 * SIZE], b2 719 720 FADD c05, t3, c05 721 FMUL a2, b3, t3 722 LDF [BO - 2 * SIZE], b3 723 FADD c07, t4, c07 724 FMUL a2, b4, t4 725 LDF [BO - 1 * SIZE], b4 726 727 FADD c02, t1, c02 728 FMUL a3, b1, t1 729 LDF [AO + 1 * SIZE], a2 730 FADD c04, t2, c04 731 FMUL a3, b2, t2 732 733 FADD c06, t3, c06 734 FMUL a3, b3, t3 735 FADD c08, t4, c08 736 FMUL a3, b4, t4 737 LDF [AO + 2 * SIZE], a3 738 739 FADD c01, t1, c01 740 FMUL a4, b1, t1 741 LDF [BO + 0 * SIZE], b1 742 FADD c03, t2, c03 743 FMUL a4, b2, t2 744 LDF [BO + 1 * SIZE], b2 745 746 FADD c05, t3, c05 747 FMUL a4, b3, t3 748 LDF [BO + 2 * SIZE], b3 749 FADD c07, t4, c07 750 FMUL a4, b4, t4 751 LDF [BO + 3 * SIZE], b4 752 753 bg,pt %icc, .LL52 754 LDF [AO + 3 * SIZE], a4 755 756.LL55: 757#if defined(LT) || defined(RN) 758 and KK, 3, L 759#else 760 and TEMP1, 3, L 761#endif 762 cmp L, 0 763 ble,a,pn %icc, .LL59 764 nop 765 766.LL56: 767 FADD c02, t1, c02 768 add AO, 2 * SIZE, AO 769 FMUL a1, b1, t1 770 add L, -1, L 771 772 add BO, 4 * SIZE, BO 773 FADD c04, t2, c04 774 cmp L, 0 775 FMUL a1, b2, t2 776 777 FADD c06, t3, c06 778 FMUL a1, b3, t3 779 FADD c08, t4, c08 780 FMUL a1, b4, t4 781 LDF [AO + 0 * SIZE], a1 782 783 FADD c01, t1, c01 784 FMUL a2, b1, t1 785 LDF [BO + 0 * SIZE], b1 786 FADD c03, t2, c03 787 FMUL a2, b2, t2 788 LDF [BO + 1 * SIZE], b2 789 790 FADD c05, t3, c05 791 FMUL a2, b3, t3 792 LDF [BO + 2 * SIZE], b3 793 FADD c07, t4, c07 794 FMUL a2, b4, t4 795 LDF [BO + 3 * SIZE], b4 796 797 bg,pt %icc, .LL56 798 LDF [AO + 1 * SIZE], a2 799 800.LL59: 801#if defined(LN) || defined(RT) 802#ifdef LN 803 sub KK, 2, TEMP1 804#else 
805 sub KK, 4, TEMP1 806#endif 807 sll TEMP1, 1 + BASE_SHIFT, TEMP2 808 sll TEMP1, 2 + BASE_SHIFT, TEMP1 809 add AORIG, TEMP2, AO 810 add B, TEMP1, BO 811#endif 812 813 FADD c02, t1, c02 814 FADD c04, t2, c04 815 FADD c06, t3, c06 816 FADD c08, t4, c08 817 818#if defined(LN) || defined(LT) 819 LDF [BO + 0 * SIZE], a1 820 LDF [BO + 1 * SIZE], a2 821 LDF [BO + 2 * SIZE], a3 822 LDF [BO + 3 * SIZE], a4 823 824 LDF [BO + 4 * SIZE], b1 825 LDF [BO + 5 * SIZE], b2 826 LDF [BO + 6 * SIZE], b3 827 LDF [BO + 7 * SIZE], b4 828 829 FSUB a1, c01, c01 830 FSUB a2, c03, c03 831 FSUB a3, c05, c05 832 FSUB a4, c07, c07 833 834 FSUB b1, c02, c02 835 FSUB b2, c04, c04 836 FSUB b3, c06, c06 837 FSUB b4, c08, c08 838#else 839 LDF [AO + 0 * SIZE], a1 840 LDF [AO + 1 * SIZE], a2 841 LDF [AO + 2 * SIZE], a3 842 LDF [AO + 3 * SIZE], a4 843 844 LDF [AO + 4 * SIZE], b1 845 LDF [AO + 5 * SIZE], b2 846 LDF [AO + 6 * SIZE], b3 847 LDF [AO + 7 * SIZE], b4 848 849 FSUB a1, c01, c01 850 FSUB a2, c02, c02 851 FSUB a3, c03, c03 852 FSUB a4, c04, c04 853 854 FSUB b1, c05, c05 855 FSUB b2, c06, c06 856 FSUB b3, c07, c07 857 FSUB b4, c08, c08 858#endif 859 860#ifdef LN 861 LDF [AO + 3 * SIZE], a1 862 LDF [AO + 2 * SIZE], a2 863 LDF [AO + 0 * SIZE], a3 864 865 FMUL a1, c02, c02 866 FMUL a1, c04, c04 867 FMUL a1, c06, c06 868 FMUL a1, c08, c08 869 870 FMUL a2, c02, t1 871 FMUL a2, c04, t2 872 FMUL a2, c06, t3 873 FMUL a2, c08, t4 874 875 FSUB c01, t1, c01 876 FSUB c03, t2, c03 877 FSUB c05, t3, c05 878 FSUB c07, t4, c07 879 880 FMUL a3, c01, c01 881 FMUL a3, c03, c03 882 FMUL a3, c05, c05 883 FMUL a3, c07, c07 884#endif 885 886#ifdef LT 887 LDF [AO + 0 * SIZE], a1 888 LDF [AO + 1 * SIZE], a2 889 LDF [AO + 3 * SIZE], a3 890 891 FMUL a1, c01, c01 892 FMUL a1, c03, c03 893 FMUL a1, c05, c05 894 FMUL a1, c07, c07 895 896 FMUL a2, c01, t1 897 FMUL a2, c03, t2 898 FMUL a2, c05, t3 899 FMUL a2, c07, t4 900 901 FSUB c02, t1, c02 902 FSUB c04, t2, c04 903 FSUB c06, t3, c06 904 FSUB c08, t4, c08 905 906 FMUL a3, 
c02, c02 907 FMUL a3, c04, c04 908 FMUL a3, c06, c06 909 FMUL a3, c08, c08 910#endif 911 912#ifdef RN 913 LDF [BO + 0 * SIZE], a1 914 LDF [BO + 1 * SIZE], a2 915 LDF [BO + 2 * SIZE], a3 916 LDF [BO + 3 * SIZE], a4 917 918 FMUL a1, c01, c01 919 FMUL a1, c02, c02 920 921 FMUL a2, c01, t1 922 FMUL a2, c02, t2 923 924 FSUB c03, t1, c03 925 FSUB c04, t2, c04 926 927 FMUL a3, c01, t1 928 FMUL a3, c02, t2 929 930 FSUB c05, t1, c05 931 FSUB c06, t2, c06 932 933 FMUL a4, c01, t1 934 FMUL a4, c02, t2 935 936 FSUB c07, t1, c07 937 FSUB c08, t2, c08 938 939 LDF [BO + 5 * SIZE], a1 940 LDF [BO + 6 * SIZE], a2 941 LDF [BO + 7 * SIZE], a3 942 943 FMUL a1, c03, c03 944 FMUL a1, c04, c04 945 946 FMUL a2, c03, t1 947 FMUL a2, c04, t2 948 949 FSUB c05, t1, c05 950 FSUB c06, t2, c06 951 952 FMUL a3, c03, t1 953 FMUL a3, c04, t2 954 955 FSUB c07, t1, c07 956 FSUB c08, t2, c08 957 958 LDF [BO + 10 * SIZE], a1 959 LDF [BO + 11 * SIZE], a2 960 961 FMUL a1, c05, c05 962 FMUL a1, c06, c06 963 964 FMUL a2, c05, t1 965 FMUL a2, c06, t2 966 967 FSUB c07, t1, c07 968 FSUB c08, t2, c08 969 970 LDF [BO + 15 * SIZE], a1 971 972 FMUL a1, c07, c07 973 FMUL a1, c08, c08 974#endif 975 976#ifdef RT 977 LDF [BO + 15 * SIZE], a1 978 LDF [BO + 14 * SIZE], a2 979 LDF [BO + 13 * SIZE], a3 980 LDF [BO + 12 * SIZE], a4 981 982 FMUL a1, c07, c07 983 FMUL a1, c08, c08 984 985 FMUL a2, c07, t1 986 FMUL a2, c08, t2 987 988 FSUB c05, t1, c05 989 FSUB c06, t2, c06 990 991 FMUL a3, c07, t1 992 FMUL a3, c08, t2 993 994 FSUB c03, t1, c03 995 FSUB c04, t2, c04 996 997 FMUL a4, c07, t1 998 FMUL a4, c08, t2 999 1000 FSUB c01, t1, c01 1001 FSUB c02, t2, c02 1002 1003 LDF [BO + 10 * SIZE], a1 1004 LDF [BO + 9 * SIZE], a2 1005 LDF [BO + 8 * SIZE], a3 1006 1007 FMUL a1, c05, c05 1008 FMUL a1, c06, c06 1009 1010 FMUL a2, c05, t1 1011 FMUL a2, c06, t2 1012 1013 FSUB c03, t1, c03 1014 FSUB c04, t2, c04 1015 1016 FMUL a3, c05, t1 1017 FMUL a3, c06, t2 1018 1019 FSUB c01, t1, c01 1020 FSUB c02, t2, c02 1021 1022 LDF [BO + 5 * 
SIZE], a1 1023 LDF [BO + 4 * SIZE], a2 1024 1025 FMUL a1, c03, c03 1026 FMUL a1, c04, c04 1027 1028 FMUL a2, c03, t1 1029 FMUL a2, c04, t2 1030 1031 FSUB c01, t1, c01 1032 FSUB c02, t2, c02 1033 1034 LDF [BO + 0 * SIZE], a1 1035 1036 FMUL a1, c01, c01 1037 FMUL a1, c02, c02 1038#endif 1039 1040#ifdef LN 1041 add C1, -2 * SIZE, C1 1042 add C2, -2 * SIZE, C2 1043 add C3, -2 * SIZE, C3 1044 add C4, -2 * SIZE, C4 1045#endif 1046 1047#if defined(LN) || defined(LT) 1048 STF c01, [BO + 0 * SIZE] 1049 STF c03, [BO + 1 * SIZE] 1050 STF c05, [BO + 2 * SIZE] 1051 STF c07, [BO + 3 * SIZE] 1052 1053 STF c02, [BO + 4 * SIZE] 1054 STF c04, [BO + 5 * SIZE] 1055 STF c06, [BO + 6 * SIZE] 1056 STF c08, [BO + 7 * SIZE] 1057#else 1058 STF c01, [AO + 0 * SIZE] 1059 STF c02, [AO + 1 * SIZE] 1060 STF c03, [AO + 2 * SIZE] 1061 STF c04, [AO + 3 * SIZE] 1062 1063 STF c05, [AO + 4 * SIZE] 1064 STF c06, [AO + 5 * SIZE] 1065 STF c07, [AO + 6 * SIZE] 1066 STF c08, [AO + 7 * SIZE] 1067#endif 1068 1069 STF c01, [C1 + 0 * SIZE] 1070 STF c02, [C1 + 1 * SIZE] 1071 STF c03, [C2 + 0 * SIZE] 1072 STF c04, [C2 + 1 * SIZE] 1073 1074 STF c05, [C3 + 0 * SIZE] 1075 STF c06, [C3 + 1 * SIZE] 1076 STF c07, [C4 + 0 * SIZE] 1077 STF c08, [C4 + 1 * SIZE] 1078 1079 FMOV FZERO, t1 1080 FMOV FZERO, t2 1081 FMOV FZERO, t3 1082 FMOV FZERO, t4 1083 1084#ifndef LN 1085 add C1, 2 * SIZE, C1 1086 add C2, 2 * SIZE, C2 1087 add C3, 2 * SIZE, C3 1088 add C4, 2 * SIZE, C4 1089#endif 1090 1091#ifdef RT 1092 sll K, 1 + BASE_SHIFT, TEMP1 1093 add AORIG, TEMP1, AORIG 1094#endif 1095 1096#if defined(LT) || defined(RN) 1097 sub K, KK, TEMP1 1098 sll TEMP1, 1 + BASE_SHIFT, TEMP2 1099 sll TEMP1, 2 + BASE_SHIFT, TEMP1 1100 add AO, TEMP2, AO 1101 add BO, TEMP1, BO 1102#endif 1103 1104#ifdef LT 1105 add KK, 2, KK 1106#endif 1107 1108#ifdef LN 1109 sub KK, 2, KK 1110#endif 1111 1112.LL70: 1113 sra M, 2, I 1114 cmp I, 0 1115 ble,pn %icc, .LL99 1116 nop 1117 1118.LL21: 1119 FMOV FZERO, t1 1120 FMOV FZERO, t2 1121 FMOV FZERO, t3 1122 FMOV 
FZERO, t4 1123 1124 FMOV FZERO, c01 1125 FMOV FZERO, c02 1126 FMOV FZERO, c03 1127 1128#if defined(LT) || defined(RN) 1129 sra KK, 2, L 1130 1131 mov B, BO 1132 cmp L, 0 1133#else 1134 1135#ifdef LN 1136 sll K, 2 + BASE_SHIFT, TEMP1 1137 sub AORIG, TEMP1, AORIG 1138#endif 1139 1140 sll KK, 2 + BASE_SHIFT, TEMP1 1141 1142 add AORIG, TEMP1, AO 1143 add B, TEMP1, BO 1144 1145 sub K, KK, TEMP1 1146 1147 sra TEMP1, 2, L 1148 cmp L, 0 1149#endif 1150 1151 LDF [AO + 0 * SIZE], a1 1152 FMOV FZERO, c04 1153 LDF [BO + 0 * SIZE], b1 1154 FMOV FZERO, c05 1155 LDF [AO + 1 * SIZE], a2 1156 FMOV FZERO, c06 1157 LDF [BO + 1 * SIZE], b2 1158 FMOV FZERO, c07 1159 1160 LDF [AO + 2 * SIZE], a3 1161 FMOV FZERO, c08 1162 LDF [BO + 2 * SIZE], b3 1163 FMOV FZERO, c09 1164 LDF [AO + 3 * SIZE], a4 1165 FMOV FZERO, c10 1166 LDF [BO + 3 * SIZE], b4 1167 FMOV FZERO, c11 1168 LDF [BO + 4 * SIZE], b5 /* ***** */ 1169 1170 LDF [AO + 4 * SIZE], a5 /* ***** */ 1171 1172#ifdef LN 1173 prefetch [C1 + 3 * SIZE], 3 1174 FMOV FZERO, c12 1175 prefetch [C2 + 3 * SIZE], 3 1176 FMOV FZERO, c13 1177 prefetch [C3 + 3 * SIZE], 3 1178 FMOV FZERO, c14 1179 prefetch [C4 + 3 * SIZE], 3 1180 FMOV FZERO, c15 1181#else 1182 prefetch [C1 - 3 * SIZE], 3 1183 FMOV FZERO, c12 1184 prefetch [C2 - 3 * SIZE], 3 1185 FMOV FZERO, c13 1186 prefetch [C3 - 3 * SIZE], 3 1187 FMOV FZERO, c14 1188 prefetch [C4 - 3 * SIZE], 3 1189 FMOV FZERO, c15 1190#endif 1191 1192 ble,pn %icc, .LL25 1193 FMOV FZERO, c16 1194 1195.LL22: 1196 FADD c04, t1, c04 1197 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY 1198 FMUL a1, b1, t1 1199 nop 1200 1201 FADD c08, t2, c08 1202 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY 1203 FMUL a1, b2, t2 1204 add AO, 16 * SIZE, AO 1205 1206 FADD c12, t3, c12 1207 LDF [AO - 13 * SIZE], a4 1208 FMUL a1, b3, t3 1209 add BO, 16 * SIZE, BO 1210 1211 FADD c16, t4, c16 1212 nop 1213 FMUL a1, b4, t4 1214 LDF [AO - 8 * SIZE], a1 1215 1216 FADD c01, t1, c01 1217 nop 1218 FMUL a2, b1, t1 1219 nop 1220 
1221 FADD c05, t2, c05 1222 nop 1223 FMUL a2, b2, t2 1224 nop 1225 1226 FADD c09, t3, c09 1227 nop 1228 FMUL a2, b3, t3 1229 nop 1230 1231 FADD c13, t4, c13 1232 add L, -1, L 1233 FMUL a2, b4, t4 1234 LDF [AO - 11 * SIZE], a2 1235 1236 FADD c02, t1, c02 1237 nop 1238 FMUL a3, b1, t1 1239 nop 1240 1241 FADD c06, t2, c06 1242 nop 1243 FMUL a3, b2, t2 1244 nop 1245 1246 FADD c10, t3, c10 1247 nop 1248 FMUL a3, b3, t3 1249 nop 1250 1251 FADD c14, t4, c14 1252 nop 1253 FMUL a3, b4, t4 1254 LDF [AO - 10 * SIZE], a3 1255 1256 FADD c03, t1, c03 1257 nop 1258 FMUL a4, b1, t1 1259 LDF [BO - 8 * SIZE], b1 1260 1261 FADD c07, t2, c07 1262 nop 1263 FMUL a4, b2, t2 1264 LDF [BO - 11 * SIZE], b2 1265 1266 FADD c11, t3, c11 1267 nop 1268 FMUL a4, b3, t3 1269 LDF [BO - 10 * SIZE], b3 1270 1271 FADD c15, t4, c15 1272 nop 1273 FMUL a4, b4, t4 1274 LDF [BO - 9 * SIZE], b4 1275 1276 FADD c04, t1, c04 1277 nop 1278 FMUL a5, b5, t1 1279 LDF [AO - 9 * SIZE], a4 1280 1281 FADD c08, t2, c08 1282 nop 1283 FMUL a5, b2, t2 1284 nop 1285 1286 FADD c12, t3, c12 1287 nop 1288 FMUL a5, b3, t3 1289 nop 1290 1291 FADD c16, t4, c16 1292 nop 1293 FMUL a5, b4, t4 1294 LDF [AO - 4 * SIZE], a5 1295 1296 FADD c01, t1, c01 1297 nop 1298 FMUL a2, b5, t1 1299 nop 1300 1301 FADD c05, t2, c05 1302 nop 1303 FMUL a2, b2, t2 1304 nop 1305 1306 FADD c09, t3, c09 1307 nop 1308 FMUL a2, b3, t3 1309 nop 1310 1311 FADD c13, t4, c13 1312 nop 1313 FMUL a2, b4, t4 1314 LDF [AO - 7 * SIZE], a2 1315 1316 FADD c02, t1, c02 1317 nop 1318 FMUL a3, b5, t1 1319 nop 1320 1321 FADD c06, t2, c06 1322 nop 1323 FMUL a3, b2, t2 1324 nop 1325 1326 FADD c10, t3, c10 1327 nop 1328 FMUL a3, b3, t3 1329 nop 1330 1331 FADD c14, t4, c14 1332 nop 1333 FMUL a3, b4, t4 1334 LDF [AO - 6 * SIZE], a3 1335 1336 FADD c03, t1, c03 1337 nop 1338 FMUL a4, b5, t1 1339 LDF [BO - 4 * SIZE], b5 1340 1341 FADD c07, t2, c07 1342 nop 1343 FMUL a4, b2, t2 1344 LDF [BO - 7 * SIZE], b2 1345 1346 FADD c11, t3, c11 1347 nop 1348 FMUL a4, b3, t3 1349 LDF [BO - 6 * 
SIZE], b3 1350 1351 FADD c15, t4, c15 1352 nop 1353 FMUL a4, b4, t4 1354 LDF [BO - 5 * SIZE], b4 1355 1356 FADD c04, t1, c04 1357 nop 1358 FMUL a1, b1, t1 1359 LDF [AO - 5 * SIZE], a4 1360 1361 FADD c08, t2, c08 1362 nop 1363 FMUL a1, b2, t2 1364 nop 1365 1366 FADD c12, t3, c12 1367 nop 1368 FMUL a1, b3, t3 1369 nop 1370 1371 FADD c16, t4, c16 1372 nop 1373 FMUL a1, b4, t4 1374 LDF [AO - 0 * SIZE], a1 1375 1376 FADD c01, t1, c01 1377 nop 1378 FMUL a2, b1, t1 1379 nop 1380 1381#ifdef DOUBLE 1382 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 1383#else 1384 nop 1385#endif 1386 FADD c05, t2, c05 1387 nop 1388 FMUL a2, b2, t2 1389 1390 FADD c09, t3, c09 1391 nop 1392 FMUL a2, b3, t3 1393 nop 1394 1395 FADD c13, t4, c13 1396 nop 1397 FMUL a2, b4, t4 1398 nop 1399 1400 FADD c02, t1, c02 1401 nop 1402 FMUL a3, b1, t1 1403 LDF [AO - 3 * SIZE], a2 1404 1405 FADD c06, t2, c06 1406#ifdef DOUBLE 1407 prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY 1408#else 1409 nop 1410#endif 1411 FMUL a3, b2, t2 1412 nop 1413 1414 FADD c10, t3, c10 1415 nop 1416 FMUL a3, b3, t3 1417 nop 1418 1419 FADD c14, t4, c14 1420 nop 1421 FMUL a3, b4, t4 1422 LDF [AO - 2 * SIZE], a3 1423 1424 FADD c03, t1, c03 1425 nop 1426 FMUL a4, b1, t1 1427 LDF [BO - 0 * SIZE], b1 1428 1429 FADD c07, t2, c07 1430 nop 1431 FMUL a4, b2, t2 1432 LDF [BO - 3 * SIZE], b2 1433 1434 FADD c11, t3, c11 1435 nop 1436 FMUL a4, b3, t3 1437 LDF [BO - 2 * SIZE], b3 1438 1439 FADD c15, t4, c15 1440 nop 1441 FMUL a4, b4, t4 1442 LDF [BO - 1 * SIZE], b4 1443 1444 FADD c04, t1, c04 1445 nop 1446 FMUL a5, b5, t1 1447 LDF [AO - 1 * SIZE], a4 1448 1449 FADD c08, t2, c08 1450 FMUL a5, b2, t2 1451 FADD c12, t3, c12 1452 FMUL a5, b3, t3 1453 1454 FADD c16, t4, c16 1455 nop 1456 FMUL a5, b4, t4 1457 LDF [AO + 4 * SIZE], a5 1458 1459 FADD c01, t1, c01 1460 nop 1461 FMUL a2, b5, t1 1462 nop 1463 1464 FADD c05, t2, c05 1465 nop 1466 FMUL a2, b2, t2 1467 nop 1468 1469 FADD c09, t3, c09 1470 nop 1471 FMUL a2, b3, 
t3 1472 nop 1473 1474 FADD c13, t4, c13 1475 nop 1476 FMUL a2, b4, t4 1477 LDF [AO + 1 * SIZE], a2 1478 1479 FADD c02, t1, c02 1480 nop 1481 FMUL a3, b5, t1 1482 nop 1483 1484 FADD c06, t2, c06 1485 nop 1486 FMUL a3, b2, t2 1487 nop 1488 1489 FADD c10, t3, c10 1490 nop 1491 FMUL a3, b3, t3 1492 nop 1493 1494 FADD c14, t4, c14 1495 nop 1496 FMUL a3, b4, t4 1497 LDF [AO + 2 * SIZE], a3 1498 1499 FADD c03, t1, c03 1500 cmp L, 0 1501 FMUL a4, b5, t1 1502 LDF [BO + 4 * SIZE], b5 1503 1504 FADD c07, t2, c07 1505 nop 1506 FMUL a4, b2, t2 1507 LDF [BO + 1 * SIZE], b2 1508 1509 FADD c11, t3, c11 1510 nop 1511 FMUL a4, b3, t3 1512 LDF [BO + 2 * SIZE], b3 1513 1514 FADD c15, t4, c15 1515 FMUL a4, b4, t4 1516 bg,pt %icc, .LL22 1517 LDF [BO + 3 * SIZE], b4 1518 1519.LL25: 1520#if defined(LT) || defined(RN) 1521 and KK, 3, L 1522#else 1523 and TEMP1, 3, L 1524#endif 1525 cmp L, 0 1526 ble,a,pn %icc, .LL29 1527 nop 1528 1529.LL26: 1530 FADD c04, t1, c04 1531 LDF [AO + 3 * SIZE], a4 1532 FMUL a1, b1, t1 1533 add AO, 4 * SIZE, AO 1534 1535 FADD c08, t2, c08 1536 add BO, 4 * SIZE, BO 1537 FMUL a1, b2, t2 1538 add L, -1, L 1539 1540 FADD c12, t3, c12 1541 nop 1542 FMUL a1, b3, t3 1543 cmp L, 0 1544 1545 FADD c16, t4, c16 1546 nop 1547 FMUL a1, b4, t4 1548 LDF [AO + 0 * SIZE], a1 1549 1550 FADD c01, t1, c01 1551 nop 1552 FMUL a2, b1, t1 1553 nop 1554 1555 FADD c05, t2, c05 1556 nop 1557 FMUL a2, b2, t2 1558 nop 1559 1560 FADD c09, t3, c09 1561 nop 1562 FMUL a2, b3, t3 1563 nop 1564 1565 FADD c13, t4, c13 1566 nop 1567 FMUL a2, b4, t4 1568 LDF [AO + 1 * SIZE], a2 1569 1570 FADD c02, t1, c02 1571 nop 1572 FMUL a3, b1, t1 1573 nop 1574 1575 FADD c06, t2, c06 1576 nop 1577 FMUL a3, b2, t2 1578 nop 1579 1580 FADD c10, t3, c10 1581 nop 1582 FMUL a3, b3, t3 1583 nop 1584 1585 FADD c14, t4, c14 1586 nop 1587 FMUL a3, b4, t4 1588 LDF [AO + 2 * SIZE], a3 1589 1590 FADD c03, t1, c03 1591 nop 1592 FMUL a4, b1, t1 1593 LDF [BO + 0 * SIZE], b1 1594 1595 FADD c07, t2, c07 1596 nop 1597 FMUL a4, b2, 
t2 1598 LDF [BO + 1 * SIZE], b2 1599 1600 FADD c11, t3, c11 1601 nop 1602 FMUL a4, b3, t3 1603 LDF [BO + 2 * SIZE], b3 1604 1605 FADD c15, t4, c15 1606 FMUL a4, b4, t4 1607 bg,pt %icc, .LL26 1608 LDF [BO + 3 * SIZE], b4 1609 1610.LL29: 1611#if defined(LN) || defined(RT) 1612 sub KK, 4, TEMP1 1613 sll TEMP1, 2 + BASE_SHIFT, TEMP1 1614 add AORIG, TEMP1, AO 1615 add B, TEMP1, BO 1616#endif 1617 1618 FADD c04, t1, c04 1619 FADD c08, t2, c08 1620 FADD c12, t3, c12 1621 FADD c16, t4, c16 1622 1623#if defined(LN) || defined(LT) 1624 LDF [BO + 0 * SIZE], a1 1625 LDF [BO + 1 * SIZE], a2 1626 LDF [BO + 2 * SIZE], a3 1627 LDF [BO + 3 * SIZE], a4 1628 1629 LDF [BO + 4 * SIZE], b1 1630 LDF [BO + 5 * SIZE], b2 1631 LDF [BO + 6 * SIZE], b3 1632 LDF [BO + 7 * SIZE], b4 1633 1634 FSUB a1, c01, c01 1635 FSUB a2, c05, c05 1636 FSUB a3, c09, c09 1637 FSUB a4, c13, c13 1638 1639 FSUB b1, c02, c02 1640 FSUB b2, c06, c06 1641 FSUB b3, c10, c10 1642 FSUB b4, c14, c14 1643 1644 LDF [BO + 8 * SIZE], a1 1645 LDF [BO + 9 * SIZE], a2 1646 LDF [BO + 10 * SIZE], a3 1647 LDF [BO + 11 * SIZE], a4 1648 1649 LDF [BO + 12 * SIZE], b1 1650 LDF [BO + 13 * SIZE], b2 1651 LDF [BO + 14 * SIZE], b3 1652 LDF [BO + 15 * SIZE], b4 1653 1654 FSUB a1, c03, c03 1655 FSUB a2, c07, c07 1656 FSUB a3, c11, c11 1657 FSUB a4, c15, c15 1658 1659 FSUB b1, c04, c04 1660 FSUB b2, c08, c08 1661 FSUB b3, c12, c12 1662 FSUB b4, c16, c16 1663#else 1664 LDF [AO + 0 * SIZE], a1 1665 LDF [AO + 1 * SIZE], a2 1666 LDF [AO + 2 * SIZE], a3 1667 LDF [AO + 3 * SIZE], a4 1668 1669 LDF [AO + 4 * SIZE], b1 1670 LDF [AO + 5 * SIZE], b2 1671 LDF [AO + 6 * SIZE], b3 1672 LDF [AO + 7 * SIZE], b4 1673 1674 FSUB a1, c01, c01 1675 FSUB a2, c02, c02 1676 FSUB a3, c03, c03 1677 FSUB a4, c04, c04 1678 1679 FSUB b1, c05, c05 1680 FSUB b2, c06, c06 1681 FSUB b3, c07, c07 1682 FSUB b4, c08, c08 1683 1684 LDF [AO + 8 * SIZE], a1 1685 LDF [AO + 9 * SIZE], a2 1686 LDF [AO + 10 * SIZE], a3 1687 LDF [AO + 11 * SIZE], a4 1688 1689 LDF [AO + 12 * SIZE], b1 
1690 LDF [AO + 13 * SIZE], b2 1691 LDF [AO + 14 * SIZE], b3 1692 LDF [AO + 15 * SIZE], b4 1693 1694 FSUB a1, c09, c09 1695 FSUB a2, c10, c10 1696 FSUB a3, c11, c11 1697 FSUB a4, c12, c12 1698 1699 FSUB b1, c13, c13 1700 FSUB b2, c14, c14 1701 FSUB b3, c15, c15 1702 FSUB b4, c16, c16 1703#endif 1704 1705#ifdef LN 1706 LDF [AO + 15 * SIZE], a1 1707 LDF [AO + 14 * SIZE], a2 1708 LDF [AO + 13 * SIZE], a3 1709 LDF [AO + 12 * SIZE], a4 1710 1711 FMUL a1, c04, c04 1712 FMUL a1, c08, c08 1713 FMUL a1, c12, c12 1714 FMUL a1, c16, c16 1715 1716 FMUL a2, c04, t1 1717 FMUL a2, c08, t2 1718 FMUL a2, c12, t3 1719 FMUL a2, c16, t4 1720 1721 FSUB c03, t1, c03 1722 FSUB c07, t2, c07 1723 FSUB c11, t3, c11 1724 FSUB c15, t4, c15 1725 1726 FMUL a3, c04, t1 1727 FMUL a3, c08, t2 1728 FMUL a3, c12, t3 1729 FMUL a3, c16, t4 1730 1731 FSUB c02, t1, c02 1732 FSUB c06, t2, c06 1733 FSUB c10, t3, c10 1734 FSUB c14, t4, c14 1735 1736 FMUL a4, c04, t1 1737 FMUL a4, c08, t2 1738 FMUL a4, c12, t3 1739 FMUL a4, c16, t4 1740 1741 FSUB c01, t1, c01 1742 FSUB c05, t2, c05 1743 FSUB c09, t3, c09 1744 FSUB c13, t4, c13 1745 1746 LDF [AO + 10 * SIZE], a1 1747 LDF [AO + 9 * SIZE], a2 1748 LDF [AO + 8 * SIZE], a3 1749 1750 FMUL a1, c03, c03 1751 FMUL a1, c07, c07 1752 FMUL a1, c11, c11 1753 FMUL a1, c15, c15 1754 1755 FMUL a2, c03, t1 1756 FMUL a2, c07, t2 1757 FMUL a2, c11, t3 1758 FMUL a2, c15, t4 1759 1760 FSUB c02, t1, c02 1761 FSUB c06, t2, c06 1762 FSUB c10, t3, c10 1763 FSUB c14, t4, c14 1764 1765 FMUL a3, c03, t1 1766 FMUL a3, c07, t2 1767 FMUL a3, c11, t3 1768 FMUL a3, c15, t4 1769 1770 FSUB c01, t1, c01 1771 FSUB c05, t2, c05 1772 FSUB c09, t3, c09 1773 FSUB c13, t4, c13 1774 1775 LDF [AO + 5 * SIZE], a1 1776 LDF [AO + 4 * SIZE], a2 1777 1778 FMUL a1, c02, c02 1779 FMUL a1, c06, c06 1780 FMUL a1, c10, c10 1781 FMUL a1, c14, c14 1782 1783 FMUL a2, c02, t1 1784 FMUL a2, c06, t2 1785 FMUL a2, c10, t3 1786 FMUL a2, c14, t4 1787 1788 FSUB c01, t1, c01 1789 FSUB c05, t2, c05 1790 FSUB c09, t3, c09 
1791 FSUB c13, t4, c13 1792 1793 LDF [AO + 0 * SIZE], a1 1794 1795 FMUL a1, c01, c01 1796 FMUL a1, c05, c05 1797 FMUL a1, c09, c09 1798 FMUL a1, c13, c13 1799#endif 1800 1801#ifdef LT 1802 LDF [AO + 0 * SIZE], a1 1803 LDF [AO + 1 * SIZE], a2 1804 LDF [AO + 2 * SIZE], a3 1805 LDF [AO + 3 * SIZE], a4 1806 1807 FMUL a1, c01, c01 1808 FMUL a1, c05, c05 1809 FMUL a1, c09, c09 1810 FMUL a1, c13, c13 1811 1812 FMUL a2, c01, t1 1813 FMUL a2, c05, t2 1814 FMUL a2, c09, t3 1815 FMUL a2, c13, t4 1816 1817 FSUB c02, t1, c02 1818 FSUB c06, t2, c06 1819 FSUB c10, t3, c10 1820 FSUB c14, t4, c14 1821 1822 FMUL a3, c01, t1 1823 FMUL a3, c05, t2 1824 FMUL a3, c09, t3 1825 FMUL a3, c13, t4 1826 1827 FSUB c03, t1, c03 1828 FSUB c07, t2, c07 1829 FSUB c11, t3, c11 1830 FSUB c15, t4, c15 1831 1832 FMUL a4, c01, t1 1833 FMUL a4, c05, t2 1834 FMUL a4, c09, t3 1835 FMUL a4, c13, t4 1836 1837 FSUB c04, t1, c04 1838 FSUB c08, t2, c08 1839 FSUB c12, t3, c12 1840 FSUB c16, t4, c16 1841 1842 LDF [AO + 5 * SIZE], a1 1843 LDF [AO + 6 * SIZE], a2 1844 LDF [AO + 7 * SIZE], a3 1845 1846 FMUL a1, c02, c02 1847 FMUL a1, c06, c06 1848 FMUL a1, c10, c10 1849 FMUL a1, c14, c14 1850 1851 FMUL a2, c02, t1 1852 FMUL a2, c06, t2 1853 FMUL a2, c10, t3 1854 FMUL a2, c14, t4 1855 1856 FSUB c03, t1, c03 1857 FSUB c07, t2, c07 1858 FSUB c11, t3, c11 1859 FSUB c15, t4, c15 1860 1861 FMUL a3, c02, t1 1862 FMUL a3, c06, t2 1863 FMUL a3, c10, t3 1864 FMUL a3, c14, t4 1865 1866 FSUB c04, t1, c04 1867 FSUB c08, t2, c08 1868 FSUB c12, t3, c12 1869 FSUB c16, t4, c16 1870 1871 LDF [AO + 10 * SIZE], a1 1872 LDF [AO + 11 * SIZE], a2 1873 1874 FMUL a1, c03, c03 1875 FMUL a1, c07, c07 1876 FMUL a1, c11, c11 1877 FMUL a1, c15, c15 1878 1879 FMUL a2, c03, t1 1880 FMUL a2, c07, t2 1881 FMUL a2, c11, t3 1882 FMUL a2, c15, t4 1883 1884 FSUB c04, t1, c04 1885 FSUB c08, t2, c08 1886 FSUB c12, t3, c12 1887 FSUB c16, t4, c16 1888 1889 LDF [AO + 15 * SIZE], a1 1890 1891 FMUL a1, c04, c04 1892 FMUL a1, c08, c08 1893 FMUL a1, c12, c12 
1894 FMUL a1, c16, c16 1895#endif 1896 1897#ifdef RN 1898 LDF [BO + 0 * SIZE], a1 1899 LDF [BO + 1 * SIZE], a2 1900 LDF [BO + 2 * SIZE], a3 1901 LDF [BO + 3 * SIZE], a4 1902 1903 FMUL a1, c01, c01 1904 FMUL a1, c02, c02 1905 FMUL a1, c03, c03 1906 FMUL a1, c04, c04 1907 1908 FMUL a2, c01, t1 1909 FMUL a2, c02, t2 1910 FMUL a2, c03, t3 1911 FMUL a2, c04, t4 1912 1913 FSUB c05, t1, c05 1914 FSUB c06, t2, c06 1915 FSUB c07, t3, c07 1916 FSUB c08, t4, c08 1917 1918 FMUL a3, c01, t1 1919 FMUL a3, c02, t2 1920 FMUL a3, c03, t3 1921 FMUL a3, c04, t4 1922 1923 FSUB c09, t1, c09 1924 FSUB c10, t2, c10 1925 FSUB c11, t3, c11 1926 FSUB c12, t4, c12 1927 1928 FMUL a4, c01, t1 1929 FMUL a4, c02, t2 1930 FMUL a4, c03, t3 1931 FMUL a4, c04, t4 1932 1933 FSUB c13, t1, c13 1934 FSUB c14, t2, c14 1935 FSUB c15, t3, c15 1936 FSUB c16, t4, c16 1937 1938 LDF [BO + 5 * SIZE], a1 1939 LDF [BO + 6 * SIZE], a2 1940 LDF [BO + 7 * SIZE], a3 1941 1942 FMUL a1, c05, c05 1943 FMUL a1, c06, c06 1944 FMUL a1, c07, c07 1945 FMUL a1, c08, c08 1946 1947 FMUL a2, c05, t1 1948 FMUL a2, c06, t2 1949 FMUL a2, c07, t3 1950 FMUL a2, c08, t4 1951 1952 FSUB c09, t1, c09 1953 FSUB c10, t2, c10 1954 FSUB c11, t3, c11 1955 FSUB c12, t4, c12 1956 1957 FMUL a3, c05, t1 1958 FMUL a3, c06, t2 1959 FMUL a3, c07, t3 1960 FMUL a3, c08, t4 1961 1962 FSUB c13, t1, c13 1963 FSUB c14, t2, c14 1964 FSUB c15, t3, c15 1965 FSUB c16, t4, c16 1966 1967 LDF [BO + 10 * SIZE], a1 1968 LDF [BO + 11 * SIZE], a2 1969 1970 FMUL a1, c09, c09 1971 FMUL a1, c10, c10 1972 FMUL a1, c11, c11 1973 FMUL a1, c12, c12 1974 1975 FMUL a2, c09, t1 1976 FMUL a2, c10, t2 1977 FMUL a2, c11, t3 1978 FMUL a2, c12, t4 1979 1980 FSUB c13, t1, c13 1981 FSUB c14, t2, c14 1982 FSUB c15, t3, c15 1983 FSUB c16, t4, c16 1984 1985 LDF [BO + 15 * SIZE], a1 1986 1987 FMUL a1, c13, c13 1988 FMUL a1, c14, c14 1989 FMUL a1, c15, c15 1990 FMUL a1, c16, c16 1991#endif 1992 1993#ifdef RT 1994 LDF [BO + 15 * SIZE], a1 1995 LDF [BO + 14 * SIZE], a2 1996 LDF [BO + 13 * 
SIZE], a3 1997 LDF [BO + 12 * SIZE], a4 1998 1999 FMUL a1, c13, c13 2000 FMUL a1, c14, c14 2001 FMUL a1, c15, c15 2002 FMUL a1, c16, c16 2003 2004 FMUL a2, c13, t1 2005 FMUL a2, c14, t2 2006 FMUL a2, c15, t3 2007 FMUL a2, c16, t4 2008 2009 FSUB c09, t1, c09 2010 FSUB c10, t2, c10 2011 FSUB c11, t3, c11 2012 FSUB c12, t4, c12 2013 2014 FMUL a3, c13, t1 2015 FMUL a3, c14, t2 2016 FMUL a3, c15, t3 2017 FMUL a3, c16, t4 2018 2019 FSUB c05, t1, c05 2020 FSUB c06, t2, c06 2021 FSUB c07, t3, c07 2022 FSUB c08, t4, c08 2023 2024 FMUL a4, c13, t1 2025 FMUL a4, c14, t2 2026 FMUL a4, c15, t3 2027 FMUL a4, c16, t4 2028 2029 FSUB c01, t1, c01 2030 FSUB c02, t2, c02 2031 FSUB c03, t3, c03 2032 FSUB c04, t4, c04 2033 2034 LDF [BO + 10 * SIZE], a1 2035 LDF [BO + 9 * SIZE], a2 2036 LDF [BO + 8 * SIZE], a3 2037 2038 FMUL a1, c09, c09 2039 FMUL a1, c10, c10 2040 FMUL a1, c11, c11 2041 FMUL a1, c12, c12 2042 2043 FMUL a2, c09, t1 2044 FMUL a2, c10, t2 2045 FMUL a2, c11, t3 2046 FMUL a2, c12, t4 2047 2048 FSUB c05, t1, c05 2049 FSUB c06, t2, c06 2050 FSUB c07, t3, c07 2051 FSUB c08, t4, c08 2052 2053 FMUL a3, c09, t1 2054 FMUL a3, c10, t2 2055 FMUL a3, c11, t3 2056 FMUL a3, c12, t4 2057 2058 FSUB c01, t1, c01 2059 FSUB c02, t2, c02 2060 FSUB c03, t3, c03 2061 FSUB c04, t4, c04 2062 2063 LDF [BO + 5 * SIZE], a1 2064 LDF [BO + 4 * SIZE], a2 2065 2066 FMUL a1, c05, c05 2067 FMUL a1, c06, c06 2068 FMUL a1, c07, c07 2069 FMUL a1, c08, c08 2070 2071 FMUL a2, c05, t1 2072 FMUL a2, c06, t2 2073 FMUL a2, c07, t3 2074 FMUL a2, c08, t4 2075 2076 FSUB c01, t1, c01 2077 FSUB c02, t2, c02 2078 FSUB c03, t3, c03 2079 FSUB c04, t4, c04 2080 2081 LDF [BO + 0 * SIZE], a1 2082 2083 FMUL a1, c01, c01 2084 FMUL a1, c02, c02 2085 FMUL a1, c03, c03 2086 FMUL a1, c04, c04 2087#endif 2088 2089#ifdef LN 2090 add C1, -4 * SIZE, C1 2091 add C2, -4 * SIZE, C2 2092 add C3, -4 * SIZE, C3 2093 add C4, -4 * SIZE, C4 2094#endif 2095 2096#if defined(LN) || defined(LT) 2097 STF c01, [BO + 0 * SIZE] 2098 STF c05, [BO + 1 
* SIZE] 2099 STF c09, [BO + 2 * SIZE] 2100 STF c13, [BO + 3 * SIZE] 2101 2102 STF c02, [BO + 4 * SIZE] 2103 STF c06, [BO + 5 * SIZE] 2104 STF c10, [BO + 6 * SIZE] 2105 STF c14, [BO + 7 * SIZE] 2106 2107 STF c03, [BO + 8 * SIZE] 2108 STF c07, [BO + 9 * SIZE] 2109 STF c11, [BO + 10 * SIZE] 2110 STF c15, [BO + 11 * SIZE] 2111 2112 STF c04, [BO + 12 * SIZE] 2113 STF c08, [BO + 13 * SIZE] 2114 STF c12, [BO + 14 * SIZE] 2115 STF c16, [BO + 15 * SIZE] 2116#else 2117 STF c01, [AO + 0 * SIZE] 2118 STF c02, [AO + 1 * SIZE] 2119 STF c03, [AO + 2 * SIZE] 2120 STF c04, [AO + 3 * SIZE] 2121 2122 STF c05, [AO + 4 * SIZE] 2123 STF c06, [AO + 5 * SIZE] 2124 STF c07, [AO + 6 * SIZE] 2125 STF c08, [AO + 7 * SIZE] 2126 2127 STF c09, [AO + 8 * SIZE] 2128 STF c10, [AO + 9 * SIZE] 2129 STF c11, [AO + 10 * SIZE] 2130 STF c12, [AO + 11 * SIZE] 2131 2132 STF c13, [AO + 12 * SIZE] 2133 STF c14, [AO + 13 * SIZE] 2134 STF c15, [AO + 14 * SIZE] 2135 STF c16, [AO + 15 * SIZE] 2136#endif 2137 2138 STF c01, [C1 + 0 * SIZE] 2139 STF c02, [C1 + 1 * SIZE] 2140 STF c03, [C1 + 2 * SIZE] 2141 STF c04, [C1 + 3 * SIZE] 2142 2143 STF c05, [C2 + 0 * SIZE] 2144 STF c06, [C2 + 1 * SIZE] 2145 STF c07, [C2 + 2 * SIZE] 2146 STF c08, [C2 + 3 * SIZE] 2147 2148 STF c09, [C3 + 0 * SIZE] 2149 STF c10, [C3 + 1 * SIZE] 2150 STF c11, [C3 + 2 * SIZE] 2151 STF c12, [C3 + 3 * SIZE] 2152 2153 STF c13, [C4 + 0 * SIZE] 2154 STF c14, [C4 + 1 * SIZE] 2155 STF c15, [C4 + 2 * SIZE] 2156 STF c16, [C4 + 3 * SIZE] 2157 2158 FMOV FZERO, t1 2159 FMOV FZERO, t2 2160 FMOV FZERO, t3 2161 FMOV FZERO, t4 2162 2163#ifndef LN 2164 add C1, 4 * SIZE, C1 2165 add C2, 4 * SIZE, C2 2166 add C3, 4 * SIZE, C3 2167 add C4, 4 * SIZE, C4 2168#endif 2169 2170#ifdef RT 2171 sll K, 2 + BASE_SHIFT, TEMP1 2172 add AORIG, TEMP1, AORIG 2173#endif 2174 2175#if defined(LT) || defined(RN) 2176 sub K, KK, TEMP1 2177 sll TEMP1, 2 + BASE_SHIFT, TEMP1 2178 add AO, TEMP1, AO 2179 add BO, TEMP1, BO 2180#endif 2181 2182#ifdef LT 2183 add KK, 4, KK 2184#endif 2185 
2186#ifdef LN 2187 sub KK, 4, KK 2188#endif 2189 2190 add I, -1, I 2191 cmp I, 0 2192 2193 sra K, 2, L 2194 bg,pt %icc, .LL21 2195 FMOV FZERO, c01 2196 2197 2198 2199 2200 2201 2202 2203.LL99: 2204#ifdef LN 2205 sll K, 2 + BASE_SHIFT, TEMP1 2206 add B, TEMP1, B 2207#endif 2208 2209#if defined(LT) || defined(RN) 2210 mov BO, B 2211#endif 2212 2213#ifdef RN 2214 add KK, 4, KK 2215#endif 2216 2217#ifdef RT 2218 sub KK, 4, KK 2219#endif 2220 2221 add J, -1, J 2222 cmp J, 0 2223 bg,pt %icc, .LL11 2224 nop 2225 2226.LL100: /* n & 2 */ 2227 and N, 2, J 2228 cmp J, 0 2229 ble,pn %icc, .LL200 2230 nop 2231 2232#ifdef RT 2233 sll K, 1 + BASE_SHIFT, TEMP1 2234 sub B, TEMP1, B 2235 2236 sll LDC, 1, TEMP1 2237 sub C, TEMP1, C 2238#endif 2239 2240 mov C, C1 2241 add C, LDC, C2 2242 2243#ifdef LN 2244 add M, OFFSET, KK 2245#endif 2246 2247#ifdef LT 2248 mov OFFSET, KK 2249#endif 2250 2251#if defined(LN) || defined(RT) 2252 mov A, AORIG 2253#else 2254 mov A, AO 2255#endif 2256 2257#ifndef RT 2258 add C2, LDC, C 2259#endif 2260 2261 and M, 1, I 2262 cmp I, 0 2263 ble,pn %icc, .LL150 2264 nop 2265 2266#if defined(LT) || defined(RN) 2267 sra KK, 2, L 2268 2269 mov B, BO 2270 cmp L, 0 2271#else 2272 2273#ifdef LN 2274 sll K, 0 + BASE_SHIFT, TEMP1 2275 sub AORIG, TEMP1, AORIG 2276#endif 2277 2278 sll KK, 0 + BASE_SHIFT, TEMP1 2279 sll KK, 1 + BASE_SHIFT, TEMP2 2280 2281 add AORIG, TEMP1, AO 2282 add B, TEMP2, BO 2283 2284 sub K, KK, TEMP1 2285 sra TEMP1, 2, L 2286 cmp L, 0 2287#endif 2288 2289 LDF [AO + 0 * SIZE], a1 2290 FMOV FZERO, c01 2291 LDF [BO + 0 * SIZE], b1 2292 FMOV FZERO, t1 2293 2294 LDF [AO + 1 * SIZE], a2 2295 FMOV FZERO, c02 2296 LDF [BO + 1 * SIZE], b2 2297 FMOV FZERO, t2 2298 2299 LDF [AO + 2 * SIZE], a3 2300 FMOV FZERO, c03 2301 2302 LDF [BO + 2 * SIZE], b3 2303 FMOV FZERO, t3 2304 2305 LDF [AO + 3 * SIZE], a4 2306 FMOV FZERO, c04 2307 LDF [BO + 3 * SIZE], b4 2308 FMOV FZERO, t4 2309 2310 ble,pn %icc, .LL175 2311 nop 2312 2313.LL172: 2314 FADD c01, t1, c01 2315 add 
AO, 4 * SIZE, AO 2316 FMUL a1, b1, t1 2317 LDF [BO + 4 * SIZE], b1 2318 2319 FADD c02, t2, c02 2320 FMUL a1, b2, t2 2321 LDF [BO + 5 * SIZE], b2 2322 2323 add L, -1, L 2324 LDF [AO + 0 * SIZE], a1 2325 2326 FADD c03, t3, c03 2327 cmp L, 0 2328 FMUL a2, b3, t3 2329 LDF [BO + 6 * SIZE], b3 2330 2331 FADD c04, t4, c04 2332 FMUL a2, b4, t4 2333 LDF [BO + 7 * SIZE], b4 2334 LDF [AO + 1 * SIZE], a2 2335 2336 FADD c01, t1, c01 2337 FMUL a3, b1, t1 2338 LDF [BO + 8 * SIZE], b1 2339 2340 FADD c02, t2, c02 2341 FMUL a3, b2, t2 2342 LDF [BO + 9 * SIZE], b2 2343 LDF [AO + 2 * SIZE], a3 2344 2345 FADD c03, t3, c03 2346 FMUL a4, b3, t3 2347 LDF [BO + 10 * SIZE], b3 2348 FADD c04, t4, c04 2349 FMUL a4, b4, t4 2350 LDF [BO + 11 * SIZE], b4 2351 add BO, 8 * SIZE, BO 2352 2353 bg,pt %icc, .LL172 2354 LDF [AO + 3 * SIZE], a4 2355 2356.LL175: 2357#if defined(LT) || defined(RN) 2358 and KK, 3, L 2359#else 2360 and TEMP1, 3, L 2361#endif 2362 cmp L, 0 2363 ble,a,pn %icc, .LL179 2364 nop 2365 2366.LL176: 2367 FADD c01, t1, c01 2368 add L, -1, L 2369 FMUL a1, b1, t1 2370 add AO, 1 * SIZE, AO 2371 LDF [BO + 2 * SIZE], b1 2372 FADD c02, t2, c02 2373 cmp L, 0 2374 FMUL a1, b2, t2 2375 LDF [BO + 3 * SIZE], b2 2376 2377 add BO, 2 * SIZE, BO 2378 bg,pt %icc, .LL176 2379 LDF [AO + 0 * SIZE], a1 2380 2381.LL179: 2382 FADD c01, t1, c01 2383 FADD c02, t2, c02 2384 FADD c03, t3, c03 2385 FADD c04, t4, c04 2386 2387 FADD c01, c03, c01 2388 FADD c02, c04, c02 2389 2390 2391#if defined(LN) || defined(RT) 2392#ifdef LN 2393 sub KK, 1, TEMP1 2394#else 2395 sub KK, 2, TEMP1 2396#endif 2397 sll TEMP1, 0 + BASE_SHIFT, TEMP2 2398 sll TEMP1, 1 + BASE_SHIFT, TEMP1 2399 add AORIG, TEMP2, AO 2400 add B, TEMP1, BO 2401#endif 2402 2403#if defined(LN) || defined(LT) 2404 LDF [BO + 0 * SIZE], a1 2405 LDF [BO + 1 * SIZE], a2 2406 2407 FSUB a1, c01, c01 2408 FSUB a2, c02, c02 2409#else 2410 LDF [AO + 0 * SIZE], a1 2411 LDF [AO + 1 * SIZE], a2 2412 2413 FSUB a1, c01, c01 2414 FSUB a2, c02, c02 2415#endif 2416 
2417#ifdef LN 2418 LDF [AO + 0 * SIZE], a1 2419 2420 FMUL a1, c01, c01 2421 FMUL a1, c02, c02 2422#endif 2423 2424#ifdef LT 2425 LDF [AO + 0 * SIZE], a1 2426 2427 FMUL a1, c01, c01 2428 FMUL a1, c02, c02 2429#endif 2430 2431#ifdef RN 2432 LDF [BO + 0 * SIZE], a1 2433 LDF [BO + 1 * SIZE], a2 2434 LDF [BO + 3 * SIZE], a3 2435 2436 FMUL a1, c01, c01 2437 FMUL a2, c01, t1 2438 FSUB c02, t1, c02 2439 FMUL a3, c02, c02 2440#endif 2441 2442#ifdef RT 2443 LDF [BO + 3 * SIZE], a1 2444 LDF [BO + 2 * SIZE], a2 2445 LDF [BO + 0 * SIZE], a3 2446 2447 FMUL a1, c02, c02 2448 FMUL a2, c02, t1 2449 FSUB c01, t1, c01 2450 FMUL a3, c01, c01 2451#endif 2452 2453#ifdef LN 2454 add C1, -1 * SIZE, C1 2455 add C2, -1 * SIZE, C2 2456#endif 2457 2458#if defined(LN) || defined(LT) 2459 STF c01, [BO + 0 * SIZE] 2460 STF c02, [BO + 1 * SIZE] 2461#else 2462 STF c01, [AO + 0 * SIZE] 2463 STF c02, [AO + 1 * SIZE] 2464#endif 2465 2466 STF c01, [C1 + 0 * SIZE] 2467 STF c02, [C2 + 0 * SIZE] 2468 2469 FMOV FZERO, t1 2470 FMOV FZERO, t2 2471 FMOV FZERO, t3 2472 FMOV FZERO, t4 2473 2474#ifndef LN 2475 add C1, 1 * SIZE, C1 2476 add C2, 1 * SIZE, C2 2477#endif 2478 2479#ifdef RT 2480 sll K, 0 + BASE_SHIFT, TEMP1 2481 add AORIG, TEMP1, AORIG 2482#endif 2483 2484#if defined(LT) || defined(RN) 2485 sub K, KK, TEMP1 2486 sll TEMP1, 0 + BASE_SHIFT, TEMP2 2487 sll TEMP1, 1 + BASE_SHIFT, TEMP1 2488 add AO, TEMP2, AO 2489 add BO, TEMP1, BO 2490#endif 2491 2492#ifdef LT 2493 add KK, 1, KK 2494#endif 2495 2496#ifdef LN 2497 sub KK, 1, KK 2498#endif 2499 2500.LL150: 2501 and M, 2, I 2502 cmp I, 0 2503 ble,pn %icc, .LL170 2504 nop 2505 2506#if defined(LT) || defined(RN) 2507 sra KK, 2, L 2508 2509 mov B, BO 2510 cmp L, 0 2511#else 2512 2513#ifdef LN 2514 sll K, 1 + BASE_SHIFT, TEMP1 2515 sub AORIG, TEMP1, AORIG 2516#endif 2517 2518 sll KK, 1 + BASE_SHIFT, TEMP1 2519 sll KK, 1 + BASE_SHIFT, TEMP2 2520 2521 add AORIG, TEMP1, AO 2522 add B, TEMP2, BO 2523 2524 sub K, KK, TEMP1 2525 sra TEMP1, 2, L 2526 cmp L, 0 
2527#endif 2528 2529 LDF [AO + 0 * SIZE], a1 2530 FMOV FZERO, c01 2531 LDF [BO + 0 * SIZE], b1 2532 FMOV FZERO, t1 2533 2534 LDF [AO + 1 * SIZE], a2 2535 cmp L, 0 2536 FMOV FZERO, c02 2537 LDF [BO + 1 * SIZE], b2 2538 FMOV FZERO, t2 2539 2540 LDF [AO + 2 * SIZE], a3 2541 FMOV FZERO, c03 2542 LDF [BO + 2 * SIZE], b3 2543 FMOV FZERO, t3 2544 2545 LDF [AO + 3 * SIZE], a4 2546 FMOV FZERO, c04 2547 LDF [BO + 3 * SIZE], b4 2548 FMOV FZERO, t4 2549 ble,pn %icc, .LL155 2550 nop 2551 2552.LL152: 2553 FADD c01, t1, c01 2554 add L, -1, L 2555 FMUL a1, b1, t1 2556 prefetch [AO + APREFETCHSIZE * SIZE], 0 2557 2558 FADD c02, t2, c02 2559 add BO, 8 * SIZE, BO 2560 FMUL a1, b2, t2 2561 LDF [AO + 4 * SIZE], a1 2562 2563 FADD c03, t3, c03 2564 cmp L, 0 2565 FMUL a2, b1, t3 2566 LDF [BO - 4 * SIZE], b1 2567 2568 FADD c04, t4, c04 2569 nop 2570 FMUL a2, b2, t4 2571 LDF [AO + 5 * SIZE], a2 2572 2573 FADD c01, t1, c01 2574 nop 2575 FMUL a3, b3, t1 2576 LDF [BO - 3 * SIZE], b2 2577 2578 FADD c02, t2, c02 2579 nop 2580 FMUL a3, b4, t2 2581 LDF [AO + 6 * SIZE], a3 2582 2583 FADD c03, t3, c03 2584 nop 2585 FMUL a4, b3, t3 2586 LDF [BO - 2 * SIZE], b3 2587 2588 FADD c04, t4, c04 2589 nop 2590 FMUL a4, b4, t4 2591 LDF [AO + 7 * SIZE], a4 2592 2593 FADD c01, t1, c01 2594 nop 2595 FMUL a1, b1, t1 2596 LDF [BO - 1 * SIZE], b4 2597 2598 FADD c02, t2, c02 2599 FMUL a1, b2, t2 2600 LDF [AO + 8 * SIZE], a1 2601 2602 FADD c03, t3, c03 2603 FMUL a2, b1, t3 2604 LDF [BO + 0 * SIZE], b1 2605 2606 FADD c04, t4, c04 2607 FMUL a2, b2, t4 2608 LDF [AO + 9 * SIZE], a2 2609 2610 FADD c01, t1, c01 2611 FMUL a3, b3, t1 2612 LDF [BO + 1 * SIZE], b2 2613 2614 FADD c02, t2, c02 2615 FMUL a3, b4, t2 2616 LDF [AO + 10 * SIZE], a3 2617 2618 FADD c03, t3, c03 2619 FMUL a4, b3, t3 2620 LDF [BO + 2 * SIZE], b3 2621 2622 FADD c04, t4, c04 2623 FMUL a4, b4, t4 2624 LDF [AO + 11 * SIZE], a4 2625 2626 add AO, 8 * SIZE, AO 2627 bg,pt %icc, .LL152 2628 LDF [BO + 3 * SIZE], b4 2629 2630.LL155: 2631#if defined(LT) || 
defined(RN) 2632 and KK, 3, L 2633#else 2634 and TEMP1, 3, L 2635#endif 2636 cmp L, 0 2637 ble,a,pn %icc, .LL159 2638 nop 2639 2640.LL156: 2641 LDF [AO + 0 * SIZE], a1 2642 LDF [AO + 1 * SIZE], a2 2643 2644 LDF [BO + 0 * SIZE], b1 2645 LDF [BO + 1 * SIZE], b2 2646 2647 FADD c01, t1, c01 2648 FADD c02, t2, c02 2649 FADD c03, t3, c03 2650 FADD c04, t4, c04 2651 2652 FMUL a1, b1, t1 2653 FMUL a1, b2, t2 2654 FMUL a2, b1, t3 2655 FMUL a2, b2, t4 2656 2657 add AO, 2 * SIZE, AO 2658 add BO, 2 * SIZE, BO 2659 2660 add L, -1, L 2661 cmp L, 0 2662 bg,pt %icc, .LL156 2663 nop 2664 2665.LL159: 2666 FADD c01, t1, c01 2667 FADD c02, t2, c02 2668 FADD c03, t3, c03 2669 FADD c04, t4, c04 2670 2671#if defined(LN) || defined(RT) 2672#ifdef LN 2673 sub KK, 2, TEMP1 2674#else 2675 sub KK, 2, TEMP1 2676#endif 2677 sll TEMP1, 1 + BASE_SHIFT, TEMP2 2678 sll TEMP1, 1 + BASE_SHIFT, TEMP1 2679 add AORIG, TEMP2, AO 2680 add B, TEMP1, BO 2681#endif 2682 2683#if defined(LN) || defined(LT) 2684 LDF [BO + 0 * SIZE], a1 2685 LDF [BO + 1 * SIZE], a2 2686 LDF [BO + 2 * SIZE], a3 2687 LDF [BO + 3 * SIZE], a4 2688 2689 FSUB a1, c01, c01 2690 FSUB a2, c02, c02 2691 FSUB a3, c03, c03 2692 FSUB a4, c04, c04 2693#else 2694 LDF [AO + 0 * SIZE], a1 2695 LDF [AO + 1 * SIZE], a2 2696 LDF [AO + 2 * SIZE], a3 2697 LDF [AO + 3 * SIZE], a4 2698 2699 FSUB a1, c01, c01 2700 FSUB a2, c03, c03 2701 FSUB a3, c02, c02 2702 FSUB a4, c04, c04 2703#endif 2704 2705#ifdef LN 2706 LDF [AO + 3 * SIZE], a1 2707 LDF [AO + 2 * SIZE], a2 2708 LDF [AO + 0 * SIZE], a3 2709 2710 FMUL a1, c03, c03 2711 FMUL a1, c04, c04 2712 FMUL a2, c03, t1 2713 FMUL a2, c04, t2 2714 2715 FSUB c01, t1, c01 2716 FSUB c02, t2, c02 2717 FMUL a3, c01, c01 2718 FMUL a3, c02, c02 2719#endif 2720 2721#ifdef LT 2722 LDF [AO + 0 * SIZE], a1 2723 LDF [AO + 1 * SIZE], a2 2724 LDF [AO + 3 * SIZE], a3 2725 2726 FMUL a1, c01, c01 2727 FMUL a1, c02, c02 2728 2729 FMUL a2, c01, t1 2730 FMUL a2, c02, t2 2731 2732 FSUB c03, t1, c03 2733 FSUB c04, t2, c04 2734 2735 
FMUL a3, c03, c03 2736 FMUL a3, c04, c04 2737#endif 2738 2739#ifdef RN 2740 LDF [BO + 0 * SIZE], a1 2741 LDF [BO + 1 * SIZE], a2 2742 LDF [BO + 3 * SIZE], a3 2743 2744 FMUL a1, c01, c01 2745 FMUL a1, c03, c03 2746 FMUL a2, c01, t1 2747 FMUL a2, c03, t2 2748 2749 FSUB c02, t1, c02 2750 FSUB c04, t2, c04 2751 FMUL a3, c02, c02 2752 FMUL a3, c04, c04 2753#endif 2754 2755#ifdef RT 2756 LDF [BO + 3 * SIZE], a1 2757 LDF [BO + 2 * SIZE], a2 2758 LDF [BO + 0 * SIZE], a3 2759 2760 FMUL a1, c02, c02 2761 FMUL a1, c04, c04 2762 2763 FMUL a2, c02, t1 2764 FMUL a2, c04, t2 2765 FSUB c01, t1, c01 2766 FSUB c03, t2, c03 2767 2768 FMUL a3, c01, c01 2769 FMUL a3, c03, c03 2770#endif 2771 2772#ifdef LN 2773 add C1, -2 * SIZE, C1 2774 add C2, -2 * SIZE, C2 2775#endif 2776 2777#if defined(LN) || defined(LT) 2778 STF c01, [BO + 0 * SIZE] 2779 STF c02, [BO + 1 * SIZE] 2780 STF c03, [BO + 2 * SIZE] 2781 STF c04, [BO + 3 * SIZE] 2782#else 2783 STF c01, [AO + 0 * SIZE] 2784 STF c03, [AO + 1 * SIZE] 2785 STF c02, [AO + 2 * SIZE] 2786 STF c04, [AO + 3 * SIZE] 2787#endif 2788 2789 STF c01, [C1 + 0 * SIZE] 2790 STF c03, [C1 + 1 * SIZE] 2791 STF c02, [C2 + 0 * SIZE] 2792 STF c04, [C2 + 1 * SIZE] 2793 2794 FMOV FZERO, t1 2795 FMOV FZERO, t2 2796 FMOV FZERO, t3 2797 FMOV FZERO, t4 2798 2799#ifndef LN 2800 add C1, 2 * SIZE, C1 2801 add C2, 2 * SIZE, C2 2802#endif 2803 2804#ifdef RT 2805 sll K, 1 + BASE_SHIFT, TEMP1 2806 add AORIG, TEMP1, AORIG 2807#endif 2808 2809#if defined(LT) || defined(RN) 2810 sub K, KK, TEMP1 2811 sll TEMP1, 1 + BASE_SHIFT, TEMP2 2812 sll TEMP1, 1 + BASE_SHIFT, TEMP1 2813 add AO, TEMP2, AO 2814 add BO, TEMP1, BO 2815#endif 2816 2817#ifdef LT 2818 add KK, 2, KK 2819#endif 2820 2821#ifdef LN 2822 sub KK, 2, KK 2823#endif 2824 2825.LL170: 2826 sra M, 2, I 2827 cmp I, 0 2828 ble,pn %icc, .LL199 2829 FMOV FZERO, c03 2830 2831.LL121: 2832#if defined(LT) || defined(RN) 2833 sra KK, 2, L 2834 2835 mov B, BO 2836 cmp L, 0 2837#else 2838 2839#ifdef LN 2840 sll K, 2 + BASE_SHIFT, TEMP1 
2841 sub AORIG, TEMP1, AORIG 2842#endif 2843 2844 sll KK, 2 + BASE_SHIFT, TEMP1 2845 sll KK, 1 + BASE_SHIFT, TEMP2 2846 2847 add AORIG, TEMP1, AO 2848 add B, TEMP2, BO 2849 2850 sub K, KK, TEMP1 2851 sra TEMP1, 2, L 2852 cmp L, 0 2853#endif 2854 2855 LDF [AO + 0 * SIZE], a1 2856 FMOV FZERO, t1 2857 LDF [BO + 0 * SIZE], b1 2858 FMOV FZERO, c07 2859 2860 LDF [AO + 1 * SIZE], a2 2861 FMOV FZERO, t2 2862 LDF [BO + 1 * SIZE], b2 2863 FMOV FZERO, c04 2864 2865 LDF [AO + 2 * SIZE], a3 2866 FMOV FZERO, t3 2867 LDF [BO + 2 * SIZE], b3 2868 FMOV FZERO, c08 2869 2870 LDF [AO + 3 * SIZE], a4 2871 FMOV FZERO, t4 2872 LDF [BO + 3 * SIZE], b4 2873 FMOV FZERO, c01 2874 2875#ifdef LN 2876 prefetch [C1 - 3 * SIZE], 2 2877 FMOV FZERO, c05 2878 prefetch [C2 - 3 * SIZE], 2 2879 FMOV FZERO, c02 2880#else 2881 prefetch [C1 + 3 * SIZE], 2 2882 FMOV FZERO, c05 2883 prefetch [C2 + 3 * SIZE], 2 2884 FMOV FZERO, c02 2885#endif 2886 2887 ble,pn %icc, .LL125 2888 FMOV FZERO, c06 2889 2890.LL122: 2891 FADD c03, t1, c03 2892 add L, -1, L 2893 FMUL a1, b1, t1 2894 prefetch [AO + APREFETCHSIZE * SIZE], 0 2895 2896 FADD c07, t2, c07 2897 add BO, 8 * SIZE, BO 2898 FMUL a1, b2, t2 2899 LDF [AO + 4 * SIZE], a1 2900 2901 FADD c04, t3, c04 2902 add AO, 16 * SIZE, AO 2903 FMUL a2, b1, t3 2904 cmp L, 0 2905 2906 FADD c08, t4, c08 2907 nop 2908 FMUL a2, b2, t4 2909 LDF [AO - 11 * SIZE], a2 2910 2911 FADD c01, t1, c01 2912 nop 2913 FMUL a3, b1, t1 2914 nop 2915 2916 FADD c05, t2, c05 2917 nop 2918 FMUL a3, b2, t2 2919 LDF [AO - 10 * SIZE], a3 2920 2921 FADD c02, t3, c02 2922 nop 2923 FMUL a4, b1, t3 2924 LDF [BO - 4 * SIZE], b1 2925 2926 FADD c06, t4, c06 2927 nop 2928 FMUL a4, b2, t4 2929 LDF [BO - 3 * SIZE], b2 2930 2931 FADD c03, t1, c03 2932 nop 2933 FMUL a1, b3, t1 2934 LDF [AO - 9 * SIZE], a4 2935 2936 FADD c07, t2, c07 2937 nop 2938 FMUL a1, b4, t2 2939 LDF [AO - 8 * SIZE], a1 2940 2941 FADD c04, t3, c04 2942 nop 2943 FMUL a2, b3, t3 2944 nop 2945 2946 FADD c08, t4, c08 2947 nop 2948 FMUL a2, b4, t4 
2949 LDF [AO - 7 * SIZE], a2 2950 2951 FADD c01, t1, c01 2952 nop 2953 FMUL a3, b3, t1 2954 nop 2955 2956 FADD c05, t2, c05 2957 nop 2958 FMUL a3, b4, t2 2959 LDF [AO - 6 * SIZE], a3 2960 2961 FADD c02, t3, c02 2962 nop 2963 FMUL a4, b3, t3 2964 LDF [BO - 2 * SIZE], b3 2965 2966 FADD c06, t4, c06 2967 nop 2968 FMUL a4, b4, t4 2969 LDF [BO - 1 * SIZE], b4 2970 2971 FADD c03, t1, c03 2972 nop 2973 FMUL a1, b1, t1 2974 LDF [AO - 5 * SIZE], a4 2975 2976 FADD c07, t2, c07 2977 nop 2978 FMUL a1, b2, t2 2979 LDF [AO - 4 * SIZE], a1 2980 2981 FADD c04, t3, c04 2982 nop 2983 FMUL a2, b1, t3 2984 nop 2985 2986 FADD c08, t4, c08 2987 nop 2988 FMUL a2, b2, t4 2989 LDF [AO - 3 * SIZE], a2 2990 2991 FADD c01, t1, c01 2992 nop 2993 FMUL a3, b1, t1 2994 nop 2995 2996 FADD c05, t2, c05 2997 nop 2998 FMUL a3, b2, t2 2999 LDF [AO - 2 * SIZE], a3 3000 3001 FADD c02, t3, c02 3002 nop 3003 FMUL a4, b1, t3 3004 LDF [BO + 0 * SIZE], b1 3005 3006 FADD c06, t4, c06 3007 nop 3008 FMUL a4, b2, t4 3009 LDF [BO + 1 * SIZE], b2 3010 3011 FADD c03, t1, c03 3012 nop 3013 FMUL a1, b3, t1 3014 LDF [AO - 1 * SIZE], a4 3015 3016 FADD c07, t2, c07 3017 nop 3018 FMUL a1, b4, t2 3019 LDF [AO + 0 * SIZE], a1 3020 3021 FADD c04, t3, c04 3022 nop 3023 FMUL a2, b3, t3 3024 nop 3025 3026 FADD c08, t4, c08 3027 nop 3028 FMUL a2, b4, t4 3029 LDF [AO + 1 * SIZE], a2 3030 3031 FADD c01, t1, c01 3032 nop 3033 FMUL a3, b3, t1 3034 nop 3035 3036 FADD c05, t2, c05 3037 nop 3038 FMUL a3, b4, t2 3039 LDF [AO + 2 * SIZE], a3 3040 3041 FADD c02, t3, c02 3042 nop 3043 FMUL a4, b3, t3 3044 LDF [BO + 2 * SIZE], b3 3045 3046 FADD c06, t4, c06 3047 FMUL a4, b4, t4 3048 LDF [AO + 3 * SIZE], a4 3049 3050 bg,pt %icc, .LL122 3051 LDF [BO + 3 * SIZE], b4 3052 3053.LL125: 3054#if defined(LT) || defined(RN) 3055 and KK, 3, L 3056#else 3057 and TEMP1, 3, L 3058#endif 3059 cmp L, 0 3060 ble,a,pn %icc, .LL129 3061 nop 3062 3063.LL126: 3064 FADD c03, t1, c03 3065 add AO, 4 * SIZE, AO 3066 FMUL a1, b1, t1 3067 add BO, 2 * SIZE, BO 3068 
3069 FADD c07, t2, c07 3070 add L, -1, L 3071 FMUL a1, b2, t2 3072 LDF [AO + 0 * SIZE], a1 3073 3074 FADD c04, t3, c04 3075 cmp L, 0 3076 FMUL a2, b1, t3 3077 3078 FADD c08, t4, c08 3079 FMUL a2, b2, t4 3080 LDF [AO + 1 * SIZE], a2 3081 3082 FADD c01, t1, c01 3083 FMUL a3, b1, t1 3084 FADD c05, t2, c05 3085 FMUL a3, b2, t2 3086 LDF [AO + 2 * SIZE], a3 3087 3088 FADD c02, t3, c02 3089 FMUL a4, b1, t3 3090 LDF [BO + 0 * SIZE], b1 3091 FADD c06, t4, c06 3092 FMUL a4, b2, t4 3093 LDF [BO + 1 * SIZE], b2 3094 bg,pt %icc, .LL126 3095 LDF [AO + 3 * SIZE], a4 3096 3097.LL129: 3098 FADD c03, t1, c03 3099 FADD c07, t2, c07 3100 FADD c04, t3, c04 3101 FADD c08, t4, c08 3102 3103#if defined(LN) || defined(RT) 3104#ifdef LN 3105 sub KK, 4, TEMP1 3106#else 3107 sub KK, 2, TEMP1 3108#endif 3109 sll TEMP1, 2 + BASE_SHIFT, TEMP2 3110 sll TEMP1, 1 + BASE_SHIFT, TEMP1 3111 add AORIG, TEMP2, AO 3112 add B, TEMP1, BO 3113#endif 3114 3115#if defined(LN) || defined(LT) 3116 LDF [BO + 0 * SIZE], a1 3117 LDF [BO + 1 * SIZE], a2 3118 LDF [BO + 2 * SIZE], a3 3119 LDF [BO + 3 * SIZE], a4 3120 3121 LDF [BO + 4 * SIZE], b1 3122 LDF [BO + 5 * SIZE], b2 3123 LDF [BO + 6 * SIZE], b3 3124 LDF [BO + 7 * SIZE], b4 3125 3126 FSUB a1, c01, c01 3127 FSUB a2, c05, c05 3128 FSUB a3, c02, c02 3129 FSUB a4, c06, c06 3130 3131 FSUB b1, c03, c03 3132 FSUB b2, c07, c07 3133 FSUB b3, c04, c04 3134 FSUB b4, c08, c08 3135#else 3136 LDF [AO + 0 * SIZE], a1 3137 LDF [AO + 1 * SIZE], a2 3138 LDF [AO + 2 * SIZE], a3 3139 LDF [AO + 3 * SIZE], a4 3140 3141 LDF [AO + 4 * SIZE], b1 3142 LDF [AO + 5 * SIZE], b2 3143 LDF [AO + 6 * SIZE], b3 3144 LDF [AO + 7 * SIZE], b4 3145 3146 FSUB a1, c01, c01 3147 FSUB a2, c02, c02 3148 FSUB a3, c03, c03 3149 FSUB a4, c04, c04 3150 3151 FSUB b1, c05, c05 3152 FSUB b2, c06, c06 3153 FSUB b3, c07, c07 3154 FSUB b4, c08, c08 3155#endif 3156 3157#ifdef LN 3158 LDF [AO + 15 * SIZE], a1 3159 LDF [AO + 14 * SIZE], a2 3160 LDF [AO + 13 * SIZE], a3 3161 LDF [AO + 12 * SIZE], a4 3162 3163 FMUL 
a1, c04, c04 3164 FMUL a1, c08, c08 3165 FMUL a2, c04, t1 3166 FMUL a2, c08, t2 3167 3168 FSUB c03, t1, c03 3169 FSUB c07, t2, c07 3170 FMUL a3, c04, t1 3171 FMUL a3, c08, t2 3172 3173 FSUB c02, t1, c02 3174 FSUB c06, t2, c06 3175 FMUL a4, c04, t1 3176 FMUL a4, c08, t2 3177 3178 FSUB c01, t1, c01 3179 FSUB c05, t2, c05 3180 3181 LDF [AO + 10 * SIZE], a1 3182 LDF [AO + 9 * SIZE], a2 3183 LDF [AO + 8 * SIZE], a3 3184 3185 FMUL a1, c03, c03 3186 FMUL a1, c07, c07 3187 FMUL a2, c03, t1 3188 FMUL a2, c07, t2 3189 3190 FSUB c02, t1, c02 3191 FSUB c06, t2, c06 3192 FMUL a3, c03, t1 3193 FMUL a3, c07, t2 3194 3195 FSUB c01, t1, c01 3196 FSUB c05, t2, c05 3197 3198 LDF [AO + 5 * SIZE], a1 3199 LDF [AO + 4 * SIZE], a2 3200 3201 FMUL a1, c02, c02 3202 FMUL a1, c06, c06 3203 FMUL a2, c02, t1 3204 FMUL a2, c06, t2 3205 3206 FSUB c01, t1, c01 3207 FSUB c05, t2, c05 3208 3209 LDF [AO + 0 * SIZE], a1 3210 3211 FMUL a1, c01, c01 3212 FMUL a1, c05, c05 3213#endif 3214 3215#ifdef LT 3216 LDF [AO + 0 * SIZE], a1 3217 LDF [AO + 1 * SIZE], a2 3218 LDF [AO + 2 * SIZE], a3 3219 LDF [AO + 3 * SIZE], a4 3220 3221 FMUL a1, c01, c01 3222 FMUL a1, c05, c05 3223 FMUL a2, c01, t1 3224 FMUL a2, c05, t2 3225 3226 FSUB c02, t1, c02 3227 FSUB c06, t2, c06 3228 FMUL a3, c01, t1 3229 FMUL a3, c05, t2 3230 3231 FSUB c03, t1, c03 3232 FSUB c07, t2, c07 3233 FMUL a4, c01, t1 3234 FMUL a4, c05, t2 3235 3236 FSUB c04, t1, c04 3237 FSUB c08, t2, c08 3238 3239 LDF [AO + 5 * SIZE], a1 3240 LDF [AO + 6 * SIZE], a2 3241 LDF [AO + 7 * SIZE], a3 3242 3243 FMUL a1, c02, c02 3244 FMUL a1, c06, c06 3245 FMUL a2, c02, t1 3246 FMUL a2, c06, t2 3247 3248 FSUB c03, t1, c03 3249 FSUB c07, t2, c07 3250 FMUL a3, c02, t1 3251 FMUL a3, c06, t2 3252 FSUB c04, t1, c04 3253 FSUB c08, t2, c08 3254 3255 LDF [AO + 10 * SIZE], a1 3256 LDF [AO + 11 * SIZE], a2 3257 3258 FMUL a1, c03, c03 3259 FMUL a1, c07, c07 3260 FMUL a2, c03, t1 3261 FMUL a2, c07, t2 3262 3263 FSUB c04, t1, c04 3264 FSUB c08, t2, c08 3265 3266 LDF [AO + 15 * 
SIZE], a1 3267 3268 FMUL a1, c04, c04 3269 FMUL a1, c08, c08 3270#endif 3271 3272#ifdef RN 3273 LDF [BO + 0 * SIZE], a1 3274 LDF [BO + 1 * SIZE], a2 3275 LDF [BO + 3 * SIZE], a3 3276 3277 FMUL a1, c01, c01 3278 FMUL a1, c02, c02 3279 FMUL a1, c03, c03 3280 FMUL a1, c04, c04 3281 3282 FMUL a2, c01, t1 3283 FMUL a2, c02, t2 3284 FMUL a2, c03, t3 3285 FMUL a2, c04, t4 3286 3287 FSUB c05, t1, c05 3288 FSUB c06, t2, c06 3289 FSUB c07, t3, c07 3290 FSUB c08, t4, c08 3291 3292 FMUL a3, c05, c05 3293 FMUL a3, c06, c06 3294 FMUL a3, c07, c07 3295 FMUL a3, c08, c08 3296#endif 3297 3298#ifdef RT 3299 LDF [BO + 3 * SIZE], a1 3300 LDF [BO + 2 * SIZE], a2 3301 LDF [BO + 0 * SIZE], a3 3302 3303 FMUL a1, c05, c05 3304 FMUL a1, c06, c06 3305 FMUL a1, c07, c07 3306 FMUL a1, c08, c08 3307 3308 FMUL a2, c05, t1 3309 FMUL a2, c06, t2 3310 FMUL a2, c07, t3 3311 FMUL a2, c08, t4 3312 3313 FSUB c01, t1, c01 3314 FSUB c02, t2, c02 3315 FSUB c03, t3, c03 3316 FSUB c04, t4, c04 3317 3318 FMUL a3, c01, c01 3319 FMUL a3, c02, c02 3320 FMUL a3, c03, c03 3321 FMUL a3, c04, c04 3322#endif 3323 3324#ifdef LN 3325 add C1, -4 * SIZE, C1 3326 add C2, -4 * SIZE, C2 3327#endif 3328 3329#if defined(LN) || defined(LT) 3330 STF c01, [BO + 0 * SIZE] 3331 STF c05, [BO + 1 * SIZE] 3332 STF c02, [BO + 2 * SIZE] 3333 STF c06, [BO + 3 * SIZE] 3334 3335 STF c03, [BO + 4 * SIZE] 3336 STF c07, [BO + 5 * SIZE] 3337 STF c04, [BO + 6 * SIZE] 3338 STF c08, [BO + 7 * SIZE] 3339#else 3340 STF c01, [AO + 0 * SIZE] 3341 STF c02, [AO + 1 * SIZE] 3342 STF c03, [AO + 2 * SIZE] 3343 STF c04, [AO + 3 * SIZE] 3344 3345 STF c05, [AO + 4 * SIZE] 3346 STF c06, [AO + 5 * SIZE] 3347 STF c07, [AO + 6 * SIZE] 3348 STF c08, [AO + 7 * SIZE] 3349#endif 3350 3351 STF c01, [C1 + 0 * SIZE] 3352 STF c02, [C1 + 1 * SIZE] 3353 STF c03, [C1 + 2 * SIZE] 3354 STF c04, [C1 + 3 * SIZE] 3355 3356 STF c05, [C2 + 0 * SIZE] 3357 STF c06, [C2 + 1 * SIZE] 3358 STF c07, [C2 + 2 * SIZE] 3359 STF c08, [C2 + 3 * SIZE] 3360 3361 FMOV FZERO, t1 3362 FMOV 
FZERO, t2 3363 FMOV FZERO, t3 3364 FMOV FZERO, t4 3365 3366#ifndef LN 3367 add C1, 4 * SIZE, C1 3368 add C2, 4 * SIZE, C2 3369#endif 3370 3371#ifdef RT 3372 sll K, 2 + BASE_SHIFT, TEMP1 3373 add AORIG, TEMP1, AORIG 3374#endif 3375 3376#if defined(LT) || defined(RN) 3377 sub K, KK, TEMP1 3378 sll TEMP1, 2 + BASE_SHIFT, TEMP2 3379 sll TEMP1, 1 + BASE_SHIFT, TEMP1 3380 add AO, TEMP2, AO 3381 add BO, TEMP1, BO 3382#endif 3383 3384#ifdef LT 3385 add KK, 4, KK 3386#endif 3387 3388#ifdef LN 3389 sub KK, 4, KK 3390#endif 3391 3392 add I, -1, I 3393 cmp I, 0 3394 3395 bg,pt %icc, .LL121 3396 FMOV FZERO, c03 3397 3398.LL199: 3399#ifdef LN 3400 sll K, 1 + BASE_SHIFT, TEMP1 3401 add B, TEMP1, B 3402#endif 3403 3404#if defined(LT) || defined(RN) 3405 mov BO, B 3406#endif 3407 3408#ifdef RN 3409 add KK, 2, KK 3410#endif 3411 3412#ifdef RT 3413 sub KK, 2, KK 3414#endif 3415 3416.LL200: 3417 and N, 1, J 3418 3419 cmp J, 0 3420 ble,pn %icc, .LL999 3421 nop 3422 3423#ifdef RT 3424 sll K, 0 + BASE_SHIFT, TEMP1 3425 sub B, TEMP1, B 3426 3427 sub C, LDC, C 3428#endif 3429 3430 mov C, C1 3431 3432#ifdef LN 3433 add M, OFFSET, KK 3434#endif 3435 3436#ifdef LT 3437 mov OFFSET, KK 3438#endif 3439 3440#if defined(LN) || defined(RT) 3441 mov A, AORIG 3442#else 3443 mov A, AO 3444#endif 3445 3446#ifndef RT 3447 add C, LDC, C 3448#endif 3449 3450 and M, 1, I 3451 cmp I, 0 3452 ble,pn %icc, .LL250 3453 nop 3454 3455#if defined(LT) || defined(RN) 3456 sra KK, 2, L 3457 3458 mov B, BO 3459 cmp L, 0 3460#else 3461 3462#ifdef LN 3463 sll K, 0 + BASE_SHIFT, TEMP1 3464 sub AORIG, TEMP1, AORIG 3465#endif 3466 3467 sll KK, 0 + BASE_SHIFT, TEMP1 3468 3469 add AORIG, TEMP1, AO 3470 add B, TEMP1, BO 3471 3472 sub K, KK, TEMP1 3473 sra TEMP1, 2, L 3474 cmp L, 0 3475#endif 3476 3477 LDF [AO + 0 * SIZE], a1 3478 FMOV FZERO, t1 3479 LDF [AO + 1 * SIZE], a2 3480 FMOV FZERO, c01 3481 3482 LDF [AO + 2 * SIZE], a3 3483 FMOV FZERO, t2 3484 LDF [AO + 3 * SIZE], a4 3485 FMOV FZERO, c02 3486 3487 LDF [BO + 0 * SIZE], 
b1 3488 FMOV FZERO, t3 3489 LDF [BO + 1 * SIZE], b2 3490 FMOV FZERO, t4 3491 LDF [BO + 2 * SIZE], b3 3492 3493 ble,pn %icc, .LL275 3494 LDF [BO + 3 * SIZE], b4 3495 3496.LL272: 3497 FADD c01, t1, c01 3498 add L, -1, L 3499 add AO, 4 * SIZE, AO 3500 3501 FMUL a1, b1, t1 3502 add BO, 4 * SIZE, BO 3503 LDF [AO + 0 * SIZE], a1 3504 3505 FADD c02, t2, c02 3506 cmp L, 0 3507 LDF [BO + 0 * SIZE], b1 3508 FMUL a2, b2, t2 3509 3510 LDF [AO + 1 * SIZE], a2 3511 FADD c01, t3, c01 3512 LDF [BO + 1 * SIZE], b2 3513 FMUL a3, b3, t3 3514 3515 LDF [AO + 2 * SIZE], a3 3516 FADD c02, t4, c02 3517 LDF [BO + 2 * SIZE], b3 3518 FMUL a4, b4, t4 3519 LDF [AO + 3 * SIZE], a4 3520 3521 bg,pt %icc, .LL272 3522 LDF [BO + 3 * SIZE], b4 3523 3524.LL275: 3525#if defined(LT) || defined(RN) 3526 and KK, 3, L 3527#else 3528 and TEMP1, 3, L 3529#endif 3530 cmp L, 0 3531 ble,a,pn %icc, .LL279 3532 nop 3533 3534.LL276: 3535 FADD c01, t1, c01 3536 add L, -1, L 3537 FMUL a1, b1, t1 3538 LDF [AO + 1 * SIZE], a1 3539 3540 LDF [BO + 1 * SIZE], b1 3541 add BO, 1 * SIZE, BO 3542 cmp L, 0 3543 bg,pt %icc, .LL276 3544 add AO, 1 * SIZE, AO 3545 3546.LL279: 3547 FADD c01, t1, c01 3548 FADD c02, t2, c02 3549 FADD c01, t3, c01 3550 FADD c02, t4, c02 3551 3552 FADD c01, c02, c01 3553 3554#if defined(LN) || defined(RT) 3555 sub KK, 1, TEMP1 3556 sll TEMP1, 0 + BASE_SHIFT, TEMP1 3557 add AORIG, TEMP1, AO 3558 add B, TEMP1, BO 3559#endif 3560 3561#if defined(LN) || defined(LT) 3562 LDF [BO + 0 * SIZE], a1 3563 FSUB a1, c01, c01 3564#else 3565 LDF [AO + 0 * SIZE], a1 3566 FSUB a1, c01, c01 3567#endif 3568 3569#ifdef LN 3570 LDF [AO + 0 * SIZE], a1 3571 FMUL a1, c01, c01 3572#endif 3573 3574#ifdef LT 3575 LDF [AO + 0 * SIZE], a1 3576 FMUL a1, c01, c01 3577#endif 3578 3579#ifdef RN 3580 LDF [BO + 0 * SIZE], a1 3581 FMUL a1, c01, c01 3582#endif 3583 3584#ifdef RT 3585 LDF [BO + 0 * SIZE], a1 3586 FMUL a1, c01, c01 3587#endif 3588 3589#ifdef LN 3590 add C1, -1 * SIZE, C1 3591#endif 3592 3593#if defined(LN) || defined(LT) 
/*
 * End of the 1 x 1 tile, followed by the 2 x 1 tile (taken when M & 2 is
 * non-zero).
 *
 * 1 x 1 write-back: store c01 into the packed buffer (BO for LN or LT, AO
 * otherwise) and into C1[0]; reset t1..t4 to FZERO; C1 += 1 SIZE unless LN.
 *   RT       : AORIG += K << BASE_SHIFT
 *   LT or RN : AO and BO each += (K - KK) << BASE_SHIFT
 *   LT       : KK += 1          LN : KK -= 1
 *
 * .LL250 : I = M & 2; branch to .LL270 when it is zero.
 *   LT or RN  : L = KK >> 2 and BO = B (AO is not reloaded here; it keeps
 *               the value left by the code above).
 *   otherwise : LN first does AORIG -= K << (1 + BASE_SHIFT); then
 *               AO = AORIG + (KK << (1 + BASE_SHIFT)),
 *               BO = B + (KK << BASE_SHIFT), TEMP1 = K - KK, L = TEMP1 >> 2.
 *   Preload a1..a4 and b1..b4, clear c01..c04 and t1..t4, and go straight
 *   to .LL255 when L <= 0.
 * .LL252 : main loop, four k-steps per pass (AO += 8 SIZE, BO += 4 SIZE).
 *   a1 and a2 are multiplied by b1 and later by b3 and accumulate into
 *   c01 and c02; a3 and a4 are multiplied by b2 and later by b4 and
 *   accumulate into c03 and c04.  As in the other loops, each FADD adds the
 *   product formed one step earlier.
 * .LL255 : remainder count L = KK & 3 (LT or RN) or TEMP1 & 3 (otherwise);
 *   skip to .LL259 when it is zero.
 * .LL256 : remainder loop, one k-step per pass: (a1,b1) into t1 and
 *   (a2,b1) into t2; AO += 2 SIZE, BO += 1 SIZE.
 * .LL259 : add the pending t1..t4, then c01 += c03 and c02 += c04.
 *   LN or RT : TEMP1 = KK - 2 (LN) or KK - 1 (RT);
 *              AO = AORIG + (TEMP1 << (1 + BASE_SHIFT)),
 *              BO = B + (TEMP1 << BASE_SHIFT).
 *   c01 and c02 become (loaded value - accumulator); the two values come
 *   from BO[0..1] for LN or LT and from AO[0..1] otherwise.
 *   LN : c02 = c02 times AO[3]; c01 -= AO[2] times c02; c01 = c01 times AO[0]
 *   LT : c01 = c01 times AO[0]; c02 -= AO[1] times c01; c02 = c02 times AO[3]
 *   NOTE(review): AO[0] and AO[3] are presumably pre-inverted diagonal
 *   entries of a 2 x 2 triangular block; confirm against the packing code.
 * The RN/RT scaling and the stores of this tile follow this span.
 */
3594 STF c01, [BO + 0 * SIZE] 3595#else 3596 STF c01, [AO + 0 * SIZE] 3597#endif 3598 3599 STF c01, [C1 + 0 * SIZE] 3600 3601 FMOV FZERO, t1 3602 FMOV FZERO, t2 3603 FMOV FZERO, t3 3604 FMOV FZERO, t4 3605 3606#ifndef LN 3607 add C1, 1 * SIZE, C1 3608#endif 3609 3610#ifdef RT 3611 sll K, 0 + BASE_SHIFT, TEMP1 3612 add AORIG, TEMP1, AORIG 3613#endif 3614 3615#if defined(LT) || defined(RN) 3616 sub K, KK, TEMP1 3617 sll TEMP1, 0 + BASE_SHIFT, TEMP1 3618 add AO, TEMP1, AO 3619 add BO, TEMP1, BO 3620#endif 3621 3622#ifdef LT 3623 add KK, 1, KK 3624#endif 3625 3626#ifdef LN 3627 sub KK, 1, KK 3628#endif 3629 3630.LL250: 3631 and M, 2, I 3632 cmp I, 0 3633 ble,pn %icc, .LL270 3634 nop 3635 3636#if defined(LT) || defined(RN) 3637 sra KK, 2, L 3638 3639 mov B, BO 3640 cmp L, 0 3641#else 3642 3643#ifdef LN 3644 sll K, 1 + BASE_SHIFT, TEMP1 3645 sub AORIG, TEMP1, AORIG 3646#endif 3647 3648 sll KK, 1 + BASE_SHIFT, TEMP1 3649 sll KK, 0 + BASE_SHIFT, TEMP2 3650 3651 add AORIG, TEMP1, AO 3652 add B, TEMP2, BO 3653 3654 sub K, KK, TEMP1 3655 sra TEMP1, 2, L 3656 cmp L, 0 3657#endif 3658 3659 LDF [AO + 0 * SIZE], a1 3660 FMOV FZERO, c01 3661 LDF [BO + 0 * SIZE], b1 3662 FMOV FZERO, t1 3663 3664 LDF [AO + 1 * SIZE], a2 3665 FMOV FZERO, c02 3666 LDF [BO + 1 * SIZE], b2 3667 FMOV FZERO, t2 3668 3669 LDF [AO + 2 * SIZE], a3 3670 FMOV FZERO, c03 3671 LDF [BO + 2 * SIZE], b3 3672 FMOV FZERO, t3 3673 3674 LDF [AO + 3 * SIZE], a4 3675 FMOV FZERO, c04 3676 LDF [BO + 3 * SIZE], b4 3677 FMOV FZERO, t4 3678 3679 ble,pn %icc, .LL255 3680 nop 3681 3682.LL252: 3683 FADD c01, t1, c01 3684 add L, -1, L 3685 FMUL a1, b1, t1 3686 LDF [AO + 4 * SIZE], a1 3687 3688 FADD c02, t2, c02 3689 FMUL a2, b1, t2 3690 LDF [AO + 5 * SIZE], a2 3691 LDF [BO + 4 * SIZE], b1 3692 3693 FADD c03, t3, c03 3694 cmp L, 0 3695 FMUL a3, b2, t3 3696 LDF [AO + 6 * SIZE], a3 3697 3698 FADD c04, t4, c04 3699 FMUL a4, b2, t4 3700 LDF [AO + 7 * SIZE], a4 3701 LDF [BO + 5 * SIZE], b2 3702 3703 FADD c01, t1, c01 3704 FMUL a1, b3, 
t1 3705 LDF [AO + 8 * SIZE], a1 3706 3707 FADD c02, t2, c02 3708 FMUL a2, b3, t2 3709 LDF [AO + 9 * SIZE], a2 3710 LDF [BO + 6 * SIZE], b3 3711 3712 FADD c03, t3, c03 3713 FMUL a3, b4, t3 3714 LDF [AO + 10 * SIZE], a3 3715 3716 FADD c04, t4, c04 3717 FMUL a4, b4, t4 3718 LDF [AO + 11 * SIZE], a4 3719 add AO, 8 * SIZE, AO 3720 3721 LDF [BO + 7 * SIZE], b4 3722 bg,pt %icc, .LL252 3723 add BO, 4 * SIZE, BO 3724 3725.LL255: 3726#if defined(LT) || defined(RN) 3727 and KK, 3, L 3728#else 3729 and TEMP1, 3, L 3730#endif 3731 3732 cmp L, 0 3733 ble,a,pn %icc, .LL259 3734 nop 3735 3736.LL256: 3737 FADD c01, t1, c01 3738 add L, -1, L 3739 FMUL a1, b1, t1 3740 LDF [AO + 2 * SIZE], a1 3741 3742 FADD c02, t2, c02 3743 cmp L, 0 3744 FMUL a2, b1, t2 3745 LDF [AO + 3 * SIZE], a2 3746 3747 LDF [BO + 1 * SIZE], b1 3748 add AO, 2 * SIZE, AO 3749 3750 bg,pt %icc, .LL256 3751 add BO, 1 * SIZE, BO 3752 3753.LL259: 3754 FADD c01, t1, c01 3755 FADD c02, t2, c02 3756 FADD c03, t3, c03 3757 FADD c04, t4, c04 3758 3759 FADD c01, c03, c01 3760 FADD c02, c04, c02 3761 3762#if defined(LN) || defined(RT) 3763#ifdef LN 3764 sub KK, 2, TEMP1 3765#else 3766 sub KK, 1, TEMP1 3767#endif 3768 sll TEMP1, 1 + BASE_SHIFT, TEMP2 3769 sll TEMP1, 0 + BASE_SHIFT, TEMP1 3770 add AORIG, TEMP2, AO 3771 add B, TEMP1, BO 3772#endif 3773 3774#if defined(LN) || defined(LT) 3775 LDF [BO + 0 * SIZE], a1 3776 LDF [BO + 1 * SIZE], a2 3777 3778 FSUB a1, c01, c01 3779 FSUB a2, c02, c02 3780#else 3781 LDF [AO + 0 * SIZE], a1 3782 LDF [AO + 1 * SIZE], a2 3783 3784 FSUB a1, c01, c01 3785 FSUB a2, c02, c02 3786#endif 3787 3788#ifdef LN 3789 LDF [AO + 3 * SIZE], a1 3790 LDF [AO + 2 * SIZE], a2 3791 LDF [AO + 0 * SIZE], a3 3792 3793 FMUL a1, c02, c02 3794 FMUL a2, c02, t1 3795 FSUB c01, t1, c01 3796 FMUL a3, c01, c01 3797#endif 3798 3799#ifdef LT 3800 LDF [AO + 0 * SIZE], a1 3801 LDF [AO + 1 * SIZE], a2 3802 LDF [AO + 3 * SIZE], a3 3803 3804 FMUL a1, c01, c01 3805 FMUL a2, c01, t1 3806 FSUB c02, t1, c02 3807 FMUL a3, c02, c02 
/*
 * End of the 2 x 1 tile, then set-up of the 4 x 1 tile loop.
 *
 * 2 x 1 tail: RN or RT scale c01 and c02 by BO[0]; LN steps C1 back by
 * 2 SIZE; c01 and c02 are stored to the packed buffer (BO for LN or LT, AO
 * otherwise) and to C1[0..1]; t1..t4 are reset to FZERO; C1 += 2 SIZE
 * unless LN.
 *   RT       : AORIG += K << (1 + BASE_SHIFT)
 *   LT or RN : AO += (K - KK) << (1 + BASE_SHIFT),
 *              BO += (K - KK) << BASE_SHIFT
 *   LT       : KK += 2          LN : KK -= 2
 *
 * .LL270 : I = M >> 2 is the number of 4 x 1 tiles; branch to .LL299 when
 *          I <= 0.
 * .LL221 : top of the per-tile loop; the code at the end of this span
 *          decrements I and branches back here while I > 0.
 *   LT or RN  : L = KK >> 2 and BO = B.
 *   otherwise : LN first does AORIG -= K << (2 + BASE_SHIFT); then
 *               AO = AORIG + (KK << (2 + BASE_SHIFT)),
 *               BO = B + (KK << BASE_SHIFT), TEMP1 = K - KK, L = TEMP1 >> 2.
 *   Preload a1..a4 and b1..b4 and clear c01..c04 and t1..t4.
 */
3808#endif 3809 3810#ifdef RN 3811 LDF [BO + 0 * SIZE], a1 3812 3813 FMUL a1, c01, c01 3814 FMUL a1, c02, c02 3815#endif 3816 3817#ifdef RT 3818 LDF [BO + 0 * SIZE], a1 3819 3820 FMUL a1, c01, c01 3821 FMUL a1, c02, c02 3822#endif 3823 3824#ifdef LN 3825 add C1, -2 * SIZE, C1 3826#endif 3827 3828#if defined(LN) || defined(LT) 3829 STF c01, [BO + 0 * SIZE] 3830 STF c02, [BO + 1 * SIZE] 3831#else 3832 STF c01, [AO + 0 * SIZE] 3833 STF c02, [AO + 1 * SIZE] 3834#endif 3835 3836 STF c01, [C1 + 0 * SIZE] 3837 STF c02, [C1 + 1 * SIZE] 3838 3839 FMOV FZERO, t1 3840 FMOV FZERO, t2 3841 FMOV FZERO, t3 3842 FMOV FZERO, t4 3843 3844#ifndef LN 3845 add C1, 2 * SIZE, C1 3846#endif 3847 3848#ifdef RT 3849 sll K, 1 + BASE_SHIFT, TEMP1 3850 add AORIG, TEMP1, AORIG 3851#endif 3852 3853#if defined(LT) || defined(RN) 3854 sub K, KK, TEMP1 3855 sll TEMP1, 1 + BASE_SHIFT, TEMP2 3856 sll TEMP1, 0 + BASE_SHIFT, TEMP1 3857 add AO, TEMP2, AO 3858 add BO, TEMP1, BO 3859#endif 3860 3861#ifdef LT 3862 add KK, 2, KK 3863#endif 3864 3865#ifdef LN 3866 sub KK, 2, KK 3867#endif 3868 3869.LL270: 3870 sra M, 2, I 3871 cmp I, 0 3872 ble,pn %icc, .LL299 3873 nop 3874 3875.LL221: 3876#if defined(LT) || defined(RN) 3877 sra KK, 2, L 3878 3879 mov B, BO 3880 cmp L, 0 3881#else 3882 3883#ifdef LN 3884 sll K, 2 + BASE_SHIFT, TEMP1 3885 sub AORIG, TEMP1, AORIG 3886#endif 3887 3888 sll KK, 2 + BASE_SHIFT, TEMP1 3889 sll KK, 0 + BASE_SHIFT, TEMP2 3890 3891 add AORIG, TEMP1, AO 3892 add B, TEMP2, BO 3893 3894 sub K, KK, TEMP1 3895 sra TEMP1, 2, L 3896 cmp L, 0 3897#endif 3898 3899 LDF [AO + 0 * SIZE], a1 3900 FMOV FZERO, c01 3901 LDF [BO + 0 * SIZE], b1 3902 FMOV FZERO, t1 3903 3904 LDF [AO + 1 * SIZE], a2 3905 FMOV FZERO, c02 3906 LDF [BO + 1 * SIZE], b2 3907 FMOV FZERO, t2 3908 3909 LDF [AO + 2 * SIZE], a3 3910 FMOV FZERO, c03 3911 LDF [BO + 2 * SIZE], b3 3912 FMOV FZERO, t3 3913 3914 LDF [AO + 3 * SIZE], a4 3915 FMOV FZERO, c04 3916 LDF [BO + 3 * SIZE], b4 3917 FMOV FZERO, t4 3918 3919#ifdef LN 3920 
/*
 * 4 x 1 tile body.
 *
 * Prefetch near C1: C1 - 3 SIZE for LN, C1 + 3 SIZE otherwise, plus
 * C1 + 4 SIZE from the delay slot of the ble to .LL225 (taken when L <= 0).
 * .LL222 : main loop, four k-steps per pass (AO += 16 SIZE, BO += 4 SIZE).
 *   In each step a1..a4 are multiplied by one b value (b1, b2, b3, b4 in
 *   turn) into t1..t4, and t1..t4 are added into c01..c04 one step late.
 * .LL225 : remainder count L = KK & 3 (LT or RN) or TEMP1 & 3 (otherwise);
 *   skip to .LL229 when it is zero.
 * .LL226 : remainder loop, one k-step per pass (AO += 4 SIZE, BO += 1 SIZE).
 * .LL229 : add the pending t1..t4 into c01..c04.
 *   LN or RT : TEMP1 = KK - 4 (LN) or KK - 1 (RT);
 *              AO = AORIG + (TEMP1 << (2 + BASE_SHIFT)),
 *              BO = B + (TEMP1 << BASE_SHIFT).
 *   c01..c04 become (loaded value - accumulator); the four values come
 *   from BO[0..3] for LN or LT and from AO[0..3] otherwise.
 *   LN : substitution from c04 down to c01.  Multipliers AO[15], AO[10],
 *        AO[5], AO[0] scale c04, c03, c02, c01; AO[14..12], AO[9..8] and
 *        AO[4] feed the elimination updates.
 *   LT : substitution from c01 up to c04.  Multipliers AO[0], AO[5],
 *        AO[10], AO[15] scale c01, c02, c03, c04; AO[1..3], AO[6..7] and
 *        AO[11] feed the elimination updates.
 *   RN or RT : scale c01..c04 by BO[0].
 *   NOTE(review): the scaling multipliers are presumably pre-inverted
 *   diagonal entries; confirm against the packing code.
 * Write-back: LN steps C1 back by 4 SIZE; c01..c04 are stored to the packed
 * buffer (BO for LN or LT, AO otherwise) and to C1[0..3]; t1..t4 are reset;
 * C1 += 4 SIZE unless LN.
 *   RT       : AORIG += K << (2 + BASE_SHIFT)
 *   LT or RN : AO += (K - KK) << (2 + BASE_SHIFT),
 *              BO += (K - KK) << BASE_SHIFT
 *   LT       : KK += 4          LN : KK -= 4
 * Finally I -= 1 and the code loops back to .LL221 while I > 0.
 */
prefetch [C1 - 3 * SIZE], 2 3921#else 3922 prefetch [C1 + 3 * SIZE], 2 3923#endif 3924 3925 ble,pn %icc, .LL225 3926 prefetch [C1 + 4 * SIZE], 2 3927 3928.LL222: 3929 FADD c01, t1, c01 3930 add BO, 4 * SIZE, BO 3931 FMUL a1, b1, t1 3932 LDF [AO + 4 * SIZE], a1 3933 3934 FADD c02, t2, c02 3935 FMUL a2, b1, t2 3936 LDF [AO + 5 * SIZE], a2 3937 3938 FADD c03, t3, c03 3939 add L, -1, L 3940 FMUL a3, b1, t3 3941 LDF [AO + 6 * SIZE], a3 3942 3943 FADD c04, t4, c04 3944 FMUL a4, b1, t4 3945 LDF [AO + 7 * SIZE], a4 3946 LDF [BO + 0 * SIZE], b1 3947 3948 FADD c01, t1, c01 3949 cmp L, 0 3950 FMUL a1, b2, t1 3951 LDF [AO + 8 * SIZE], a1 3952 3953 FADD c02, t2, c02 3954 FMUL a2, b2, t2 3955 LDF [AO + 9 * SIZE], a2 3956 3957 FADD c03, t3, c03 3958 FMUL a3, b2, t3 3959 LDF [AO + 10 * SIZE], a3 3960 3961 FADD c04, t4, c04 3962 FMUL a4, b2, t4 3963 LDF [AO + 11 * SIZE], a4 3964 LDF [BO + 1 * SIZE], b2 3965 3966 FADD c01, t1, c01 3967 FMUL a1, b3, t1 3968 LDF [AO + 12 * SIZE], a1 3969 3970 FADD c02, t2, c02 3971 FMUL a2, b3, t2 3972 LDF [AO + 13 * SIZE], a2 3973 3974 FADD c03, t3, c03 3975 FMUL a3, b3, t3 3976 LDF [AO + 14 * SIZE], a3 3977 3978 FADD c04, t4, c04 3979 FMUL a4, b3, t4 3980 LDF [AO + 15 * SIZE], a4 3981 LDF [BO + 2 * SIZE], b3 3982 3983 FADD c01, t1, c01 3984 FMUL a1, b4, t1 3985 LDF [AO + 16 * SIZE], a1 3986 3987 FADD c02, t2, c02 3988 FMUL a2, b4, t2 3989 LDF [AO + 17 * SIZE], a2 3990 3991 FADD c03, t3, c03 3992 FMUL a3, b4, t3 3993 LDF [AO + 18 * SIZE], a3 3994 3995 FADD c04, t4, c04 3996 FMUL a4, b4, t4 3997 LDF [AO + 19 * SIZE], a4 3998 add AO, 16 * SIZE, AO 3999 4000 bg,pt %icc, .LL222 4001 LDF [BO + 3 * SIZE], b4 4002 4003.LL225: 4004#if defined(LT) || defined(RN) 4005 and KK, 3, L 4006#else 4007 and TEMP1, 3, L 4008#endif 4009 cmp L, 0 4010 ble,a,pn %icc, .LL229 4011 nop 4012 4013.LL226: 4014 FADD c01, t1, c01 4015 add BO, 1 * SIZE, BO 4016 FMUL a1, b1, t1 4017 LDF [AO + 4 * SIZE], a1 4018 4019 FADD c02, t2, c02 4020 add L, -1, L 4021 FMUL a2, b1, t2 4022 LDF 
[AO + 5 * SIZE], a2 4023 4024 FADD c03, t3, c03 4025 cmp L, 0 4026 FMUL a3, b1, t3 4027 LDF [AO + 6 * SIZE], a3 4028 4029 FADD c04, t4, c04 4030 FMUL a4, b1, t4 4031 LDF [AO + 7 * SIZE], a4 4032 add AO, 4 * SIZE, AO 4033 4034 bg,pt %icc, .LL226 4035 LDF [BO + 0 * SIZE], b1 4036 4037.LL229: 4038 FADD c01, t1, c01 4039 FADD c02, t2, c02 4040 FADD c03, t3, c03 4041 FADD c04, t4, c04 4042 4043#if defined(LN) || defined(RT) 4044#ifdef LN 4045 sub KK, 4, TEMP1 4046#else 4047 sub KK, 1, TEMP1 4048#endif 4049 sll TEMP1, 2 + BASE_SHIFT, TEMP2 4050 sll TEMP1, 0 + BASE_SHIFT, TEMP1 4051 add AORIG, TEMP2, AO 4052 add B, TEMP1, BO 4053#endif 4054 4055#if defined(LN) || defined(LT) 4056 LDF [BO + 0 * SIZE], a1 4057 LDF [BO + 1 * SIZE], a2 4058 LDF [BO + 2 * SIZE], a3 4059 LDF [BO + 3 * SIZE], a4 4060 4061 FSUB a1, c01, c01 4062 FSUB a2, c02, c02 4063 FSUB a3, c03, c03 4064 FSUB a4, c04, c04 4065#else 4066 LDF [AO + 0 * SIZE], a1 4067 LDF [AO + 1 * SIZE], a2 4068 LDF [AO + 2 * SIZE], a3 4069 LDF [AO + 3 * SIZE], a4 4070 4071 FSUB a1, c01, c01 4072 FSUB a2, c02, c02 4073 FSUB a3, c03, c03 4074 FSUB a4, c04, c04 4075#endif 4076 4077#ifdef LN 4078 LDF [AO + 15 * SIZE], a1 4079 LDF [AO + 14 * SIZE], a2 4080 LDF [AO + 13 * SIZE], a3 4081 LDF [AO + 12 * SIZE], a4 4082 4083 FMUL a1, c04, c04 4084 FMUL a2, c04, t1 4085 4086 FSUB c03, t1, c03 4087 FMUL a3, c04, t1 4088 4089 FSUB c02, t1, c02 4090 FMUL a4, c04, t1 4091 4092 FSUB c01, t1, c01 4093 4094 LDF [AO + 10 * SIZE], a1 4095 LDF [AO + 9 * SIZE], a2 4096 LDF [AO + 8 * SIZE], a3 4097 4098 FMUL a1, c03, c03 4099 FMUL a2, c03, t1 4100 4101 FSUB c02, t1, c02 4102 FMUL a3, c03, t1 4103 FSUB c01, t1, c01 4104 4105 LDF [AO + 5 * SIZE], a1 4106 LDF [AO + 4 * SIZE], a2 4107 4108 FMUL a1, c02, c02 4109 FMUL a2, c02, t1 4110 FSUB c01, t1, c01 4111 4112 LDF [AO + 0 * SIZE], a1 4113 4114 FMUL a1, c01, c01 4115#endif 4116 4117#ifdef LT 4118 LDF [AO + 0 * SIZE], a1 4119 LDF [AO + 1 * SIZE], a2 4120 LDF [AO + 2 * SIZE], a3 4121 LDF [AO + 3 * SIZE], 
a4 4122 4123 FMUL a1, c01, c01 4124 FMUL a2, c01, t1 4125 FSUB c02, t1, c02 4126 FMUL a3, c01, t1 4127 FSUB c03, t1, c03 4128 FMUL a4, c01, t1 4129 FSUB c04, t1, c04 4130 4131 LDF [AO + 5 * SIZE], a1 4132 LDF [AO + 6 * SIZE], a2 4133 LDF [AO + 7 * SIZE], a3 4134 4135 FMUL a1, c02, c02 4136 FMUL a2, c02, t1 4137 FSUB c03, t1, c03 4138 FMUL a3, c02, t1 4139 FSUB c04, t1, c04 4140 4141 LDF [AO + 10 * SIZE], a1 4142 LDF [AO + 11 * SIZE], a2 4143 4144 FMUL a1, c03, c03 4145 FMUL a2, c03, t1 4146 4147 FSUB c04, t1, c04 4148 4149 LDF [AO + 15 * SIZE], a1 4150 4151 FMUL a1, c04, c04 4152#endif 4153 4154#ifdef RN 4155 LDF [BO + 0 * SIZE], a1 4156 4157 FMUL a1, c01, c01 4158 FMUL a1, c02, c02 4159 FMUL a1, c03, c03 4160 FMUL a1, c04, c04 4161#endif 4162 4163#ifdef RT 4164 LDF [BO + 0 * SIZE], a1 4165 4166 FMUL a1, c01, c01 4167 FMUL a1, c02, c02 4168 FMUL a1, c03, c03 4169 FMUL a1, c04, c04 4170#endif 4171 4172#ifdef LN 4173 add C1, -4 * SIZE, C1 4174#endif 4175 4176#if defined(LN) || defined(LT) 4177 STF c01, [BO + 0 * SIZE] 4178 STF c02, [BO + 1 * SIZE] 4179 STF c03, [BO + 2 * SIZE] 4180 STF c04, [BO + 3 * SIZE] 4181#else 4182 STF c01, [AO + 0 * SIZE] 4183 STF c02, [AO + 1 * SIZE] 4184 STF c03, [AO + 2 * SIZE] 4185 STF c04, [AO + 3 * SIZE] 4186#endif 4187 4188 STF c01, [C1 + 0 * SIZE] 4189 STF c02, [C1 + 1 * SIZE] 4190 STF c03, [C1 + 2 * SIZE] 4191 STF c04, [C1 + 3 * SIZE] 4192 4193 FMOV FZERO, t1 4194 FMOV FZERO, t2 4195 FMOV FZERO, t3 4196 FMOV FZERO, t4 4197 4198#ifndef LN 4199 add C1, 4 * SIZE, C1 4200#endif 4201 4202#ifdef RT 4203 sll K, 2 + BASE_SHIFT, TEMP1 4204 add AORIG, TEMP1, AORIG 4205#endif 4206 4207#if defined(LT) || defined(RN) 4208 sub K, KK, TEMP1 4209 sll TEMP1, 2 + BASE_SHIFT, TEMP2 4210 sll TEMP1, 0 + BASE_SHIFT, TEMP1 4211 add AO, TEMP2, AO 4212 add BO, TEMP1, BO 4213#endif 4214 4215#ifdef LT 4216 add KK, 4, KK 4217#endif 4218 4219#ifdef LN 4220 sub KK, 4, KK 4221#endif 4222 4223 add I, -1, I 4224 cmp I, 0 4225 4226 bg,pt %icc, .LL221 4227 nop 4228 
/*
 * .LL299 : bookkeeping reached when the 4 x 1 tile loop has finished or
 *          was skipped.
 *   LN       : B += K << BASE_SHIFT
 *   LT or RN : B = BO
 *   RN       : KK += 1          RT : KK -= 1
 * .LL999 : return to the caller through %i7 + 8; the clr in the delay slot
 *          of `return` zeroes %o0.
 *          NOTE(review): on SPARC V9 `return` restores the caller's
 *          register window before its delay slot runs, so this %o0 should
 *          be the caller-visible result register; confirm against the
 *          architecture manual.
 * EPILOGUE is a macro defined outside this span.
 */
4229 4230 4231.LL299: 4232#ifdef LN 4233 sll K, 0 + BASE_SHIFT, TEMP1 4234 add B, TEMP1, B 4235#endif 4236 4237#if defined(LT) || defined(RN) 4238 mov BO, B 4239#endif 4240 4241#ifdef RN 4242 add KK, 1, KK 4243#endif 4244 4245#ifdef RT 4246 sub KK, 1, KK 4247#endif 4248 4249 4250.LL999: 4251 return %i7 + 8 4252 clr %o0 4253 4254 EPILOGUE 4255