1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 /* Preprocessor setup: symbolic names for the integer and floating-point registers used by the code below, plus the FADD1 to FADD4 add or subtract selectors. */39#define ASSEMBLER 40#include "common.h" 41 /* M, N, K, A, B and C are aliases for input registers in the range %i0 to %i5. Note the order: A is %i5, B is %i3, C is %i4. */42#define M %i0 43#define N %i1 44#define K %i2 45#define A %i5 46#define B %i3 47#define C %i4 48 /* LDC, AO and BO, and the names I, J and L that the code below uses as loop counters, are aliases for output registers %o0 to %o5. */49#define LDC %o0 50#define AO %o1 51#define BO %o2 52#define I %o3 53#define J %o4 54#define L %o5 55 /* C1 and C2 are aliases for local registers %l0 and %l1. */56#define C1 %l0 57#define C2 %l1 58 /* OFFSET, KK, TEMP1, TEMP2 and AORIG are aliases for local registers %l2 to %l6. */59#define OFFSET %l2 60#define KK %l3 61#define TEMP1 %l4 62#define TEMP2 %l5 63#define AORIG %l6 64 /* DOUBLE build: c01 to c16, t1 to t4, a1 to a5, b1 to b5 and FZERO all map to even-numbered floating-point registers between %f0 and %f62. */65#ifdef DOUBLE 66#define c01 %f0 67#define c02 %f2 68#define c03 %f4 69#define c04 %f6 70#define c05 %f8 71#define c06 %f10 72#define c07 %f12 73#define c08 %f14 74#define c09 %f16 75#define c10 %f18 76#define c11 %f20 77#define c12 %f22 78#define c13 %f24 79#define c14 %f26 80#define c15 %f28 81#define c16 %f30 82 83#define t1 %f32 84#define t2 %f34 85#define t3 %f36 86#define t4 %f38 87 88#define a1 %f40 89#define a2 %f42 90#define a3 %f44 91#define a4 %f46 92#define a5 %f62 93 94#define b1 %f48 95#define b2 %f50 96#define b3 %f52 97#define b4 %f54 98#define b5 %f56 99 100#define FZERO %f58 101 /* Build without DOUBLE: the same names map to consecutive floating-point registers between %f0 and %f31. */102#else 103#define c01 %f0 104#define c02 %f1 105#define c03 %f2 106#define c04 %f3 107#define c05 %f4 108#define c06 %f5 109#define c07 %f6 110#define c08 %f7 111#define c09 %f8 112#define c10 %f9 113#define c11 %f10 114#define c12 %f11 115#define c13 %f12 116#define c14 %f13 117#define c15 %f14 118#define c16 %f15 119 120#define t1 %f16 121#define t2 %f17 122#define t3 %f18 123#define t4 %f19 124 125#define a1 %f20 126#define a2 %f21 127#define a3 %f22 128#define a4 %f23 129#define a5 %f31 130 131#define b1 %f24 132#define b2 %f25 133#define b3 %f26 134#define b4 %f27 135#define b5 %f28 136 137#define FZERO %f29 138#endif 139 /* t5 to t8 are aliases of c13 to c16, so each pair shares one register. */140#define t5 c13 141#define t6 c14 142#define t7 c15 143#define t8 c16 144 /* FADD1 to FADD4 each expand to FADD or FSUB. The choice depends on whether CONJ is defined and, when it is, on which of LN, LT, RN or RT is defined. */145#ifndef CONJ 146#define FADD1 FADD 147#define FADD2 FADD 148#define FADD3 FADD 149#define FADD4 FSUB 150#else 151 152#if defined(LN) || defined(LT) 153#define FADD1 FADD 154#define FADD2 FSUB 155#define FADD3 FADD 
156#define FADD4 FADD 157#endif 158 159#if defined(RN) || defined(RT) 160#define FADD1 FADD 161#define FADD2 FADD 162#define FADD3 FSUB 163#define FADD4 FADD 164#endif 165#endif 166 167#define APREFETCHSIZE 40 168#define BPREFETCHSIZE 40 169 170#define APREFETCH_CATEGORY 0 171#define BPREFETCH_CATEGORY 0 172 173 PROLOGUE 174 SAVESP 175 176#ifndef __64BIT__ 177#ifdef DOUBLE 178 ld [%sp + STACK_START + 32], A 179 ld [%sp + STACK_START + 36], B 180 ld [%sp + STACK_START + 40], C 181 ld [%sp + STACK_START + 44], LDC 182 ld [%sp + STACK_START + 48], OFFSET 183#else 184 ld [%sp + STACK_START + 28], B 185 ld [%sp + STACK_START + 32], C 186 ld [%sp + STACK_START + 36], LDC 187 ld [%sp + STACK_START + 40], OFFSET 188#endif 189#else 190 ldx [%sp+ STACK_START + 56], B 191 ldx [%sp+ STACK_START + 64], C 192 ldx [%sp+ STACK_START + 72], LDC 193 ldx [%sp+ STACK_START + 80], OFFSET 194#endif 195 196#ifdef DOUBLE 197 FCLR(27) 198#else 199 FCLR(29) 200#endif 201 202 sll LDC, ZBASE_SHIFT, LDC 203 204#ifdef LN 205 smul M, K, TEMP1 206 sll TEMP1, ZBASE_SHIFT, TEMP1 207 add A, TEMP1, A 208 209 sll M, ZBASE_SHIFT, TEMP1 210 add C, TEMP1, C 211#endif 212 213#ifdef RN 214 neg OFFSET, KK 215#endif 216 217#ifdef RT 218 smul N, K, TEMP1 219 sll TEMP1, ZBASE_SHIFT, TEMP1 220 add B, TEMP1, B 221 222 smul N, LDC, TEMP1 223 add C, TEMP1, C 224 225 sub N, OFFSET, KK 226#endif 227 228 sra N, 1, J 229 cmp J, 0 230 ble,pn %icc, .LL100 231 nop 232 233.LL11: 234#ifdef RT 235 sll K, 1 + ZBASE_SHIFT, TEMP1 236 sub B, TEMP1, B 237 238 add LDC, LDC, TEMP1 239 sub C, TEMP1, C 240#endif 241 242 mov C, C1 243 add C, LDC, C2 244 245#ifdef LN 246 add M, OFFSET, KK 247#endif 248 249#ifdef LT 250 mov OFFSET, KK 251#endif 252 253#if defined(LN) || defined(RT) 254 mov A, AORIG 255#else 256 mov A, AO 257#endif 258 259#ifndef RT 260 add C2, LDC, C 261#endif 262 263 and M, 1, I 264 cmp I, 0 265 ble,pn %icc, .LL50 266 nop 267 268#if defined(LT) || defined(RN) 269 sra KK, 2, L 270 271 mov B, BO 272 cmp L, 0 273#else 
274 275#ifdef LN 276 sll K, 0 + ZBASE_SHIFT, TEMP1 277 sub AORIG, TEMP1, AORIG 278#endif 279 280 sll KK, 0 + ZBASE_SHIFT, TEMP1 281 sll KK, 1 + ZBASE_SHIFT, TEMP2 282 283 add AORIG, TEMP1, AO 284 add B, TEMP2, BO 285 286 sub K, KK, TEMP1 287 288 sra TEMP1, 2, L 289 cmp L, 0 290#endif 291 292 FMOV FZERO, c02 293 FMOV FZERO, t1 294 FMOV FZERO, c04 295 296 LDF [AO + 0 * SIZE], a1 297 FMOV FZERO, t2 298 LDF [BO + 0 * SIZE], b1 299 FMOV FZERO, c06 300 LDF [AO + 1 * SIZE], a2 301 FMOV FZERO, t3 302 LDF [BO + 1 * SIZE], b2 303 FMOV FZERO, c08 304 LDF [AO + 2 * SIZE], a3 305 FMOV FZERO, t4 306 LDF [BO + 2 * SIZE], b3 307 FMOV FZERO, c01 308 LDF [AO + 3 * SIZE], a4 309 FMOV FZERO, c03 310 LDF [BO + 3 * SIZE], b4 311 FMOV FZERO, c05 312 313 ble,pn %icc, .LL55 314 FMOV FZERO, c07 315 316.LL52: 317 FADD2 c02, t1, c02 318 add AO, 8 * SIZE, AO 319 prefetch [AO + APREFETCHSIZE * SIZE], 0 320 321 FMUL a1, b1, t1 322 add BO, 16 * SIZE, BO 323 324 FADD4 c04, t2, c04 325 add L, -1, L 326 FMUL a1, b2, t2 327 328 FADD2 c06, t3, c06 329 cmp L, 0 330 FMUL a1, b3, t3 331 332 FADD4 c08, t4, c08 333 FMUL a1, b4, t4 334 LDF [AO - 4 * SIZE], a1 335 336 FADD1 c01, t1, c01 337 FMUL a2, b1, t1 338 LDF [BO - 12 * SIZE], b1 339 FADD3 c03, t2, c03 340 FMUL a2, b2, t2 341 LDF [BO - 11 * SIZE], b2 342 343 FADD1 c05, t3, c05 344 FMUL a2, b3, t3 345 LDF [BO - 10 * SIZE], b3 346 FADD3 c07, t4, c07 347 FMUL a2, b4, t4 348 LDF [BO - 9 * SIZE], b4 349 350 FADD2 c02, t1, c02 351 FMUL a3, b1, t1 352 LDF [AO - 3 * SIZE], a2 353 FADD4 c04, t2, c04 354 FMUL a3, b2, t2 355 356 FADD2 c06, t3, c06 357 FMUL a3, b3, t3 358 FADD4 c08, t4, c08 359 FMUL a3, b4, t4 360 LDF [AO - 2 * SIZE], a3 361 362 FADD1 c01, t1, c01 363 FMUL a4, b1, t1 364 LDF [BO - 8 * SIZE], b1 365 FADD3 c03, t2, c03 366 FMUL a4, b2, t2 367 LDF [BO - 7 * SIZE], b2 368 369 FADD1 c05, t3, c05 370 FMUL a4, b3, t3 371 LDF [BO - 6 * SIZE], b3 372 FADD3 c07, t4, c07 373 FMUL a4, b4, t4 374 LDF [BO - 5 * SIZE], b4 375 376 FADD2 c02, t1, c02 377 FMUL a1, 
b1, t1 378 LDF [AO - 1 * SIZE], a4 379 FADD4 c04, t2, c04 380 FMUL a1, b2, t2 381 382 FADD2 c06, t3, c06 383 FMUL a1, b3, t3 384 FADD4 c08, t4, c08 385 FMUL a1, b4, t4 386 LDF [AO + 0 * SIZE], a1 387 388 FADD1 c01, t1, c01 389 FMUL a2, b1, t1 390 LDF [BO - 4 * SIZE], b1 391 392 FADD3 c03, t2, c03 393 FMUL a2, b2, t2 394 LDF [BO - 3 * SIZE], b2 395 396 FADD1 c05, t3, c05 397 FMUL a2, b3, t3 398 LDF [BO - 2 * SIZE], b3 399 FADD3 c07, t4, c07 400 FMUL a2, b4, t4 401 LDF [BO - 1 * SIZE], b4 402 403 FADD2 c02, t1, c02 404 FMUL a3, b1, t1 405 LDF [AO + 1 * SIZE], a2 406 FADD4 c04, t2, c04 407 FMUL a3, b2, t2 408 409 FADD2 c06, t3, c06 410 FMUL a3, b3, t3 411 FADD4 c08, t4, c08 412 FMUL a3, b4, t4 413 LDF [AO + 2 * SIZE], a3 414 415 FADD1 c01, t1, c01 416 FMUL a4, b1, t1 417 LDF [BO + 0 * SIZE], b1 418 FADD3 c03, t2, c03 419 FMUL a4, b2, t2 420 LDF [BO + 1 * SIZE], b2 421 422 FADD1 c05, t3, c05 423 FMUL a4, b3, t3 424 LDF [BO + 2 * SIZE], b3 425 FADD3 c07, t4, c07 426 FMUL a4, b4, t4 427 LDF [BO + 3 * SIZE], b4 428 429 bg,pt %icc, .LL52 430 LDF [AO + 3 * SIZE], a4 431 432.LL55: 433#if defined(LT) || defined(RN) 434 and KK, 3, L 435#else 436 and TEMP1, 3, L 437#endif 438 cmp L, 0 439 ble,a,pn %icc, .LL59 440 nop 441 442.LL56: 443 FADD2 c02, t1, c02 444 add AO, 2 * SIZE, AO 445 FMUL a1, b1, t1 446 add L, -1, L 447 448 add BO, 4 * SIZE, BO 449 FADD4 c04, t2, c04 450 cmp L, 0 451 FMUL a1, b2, t2 452 453 FADD2 c06, t3, c06 454 FMUL a1, b3, t3 455 FADD4 c08, t4, c08 456 FMUL a1, b4, t4 457 LDF [AO + 0 * SIZE], a1 458 459 FADD1 c01, t1, c01 460 FMUL a2, b1, t1 461 LDF [BO + 0 * SIZE], b1 462 FADD3 c03, t2, c03 463 FMUL a2, b2, t2 464 LDF [BO + 1 * SIZE], b2 465 466 FADD1 c05, t3, c05 467 FMUL a2, b3, t3 468 LDF [BO + 2 * SIZE], b3 469 FADD3 c07, t4, c07 470 FMUL a2, b4, t4 471 LDF [BO + 3 * SIZE], b4 472 473 bg,pt %icc, .LL56 474 LDF [AO + 1 * SIZE], a2 475 476.LL59: 477#if defined(LN) || defined(RT) 478#ifdef LN 479 sub KK, 1, TEMP1 480#else 481 sub KK, 2, TEMP1 482#endif 483 
sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 484 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 485 add AORIG, TEMP2, AO 486 add B, TEMP1, BO 487#endif 488 489 FADD2 c02, t1, c02 490 FADD4 c04, t2, c04 491 FADD2 c06, t3, c06 492 FADD4 c08, t4, c08 493 494 FADD c01, c04, c01 495 FADD c02, c03, c02 496 FADD c05, c08, c05 497 FADD c06, c07, c06 498 499#if defined(LN) || defined(LT) 500 LDF [BO + 0 * SIZE], a1 501 LDF [BO + 1 * SIZE], a2 502 LDF [BO + 2 * SIZE], a3 503 LDF [BO + 3 * SIZE], a4 504 505 FSUB a1, c01, c01 506 FSUB a2, c02, c02 507 FSUB a3, c05, c05 508 FSUB a4, c06, c06 509 510#else 511 LDF [AO + 0 * SIZE], a1 512 LDF [AO + 1 * SIZE], a2 513 LDF [AO + 2 * SIZE], a3 514 LDF [AO + 3 * SIZE], a4 515 516 FSUB a1, c01, c01 517 FSUB a2, c02, c02 518 FSUB a3, c05, c05 519 FSUB a4, c06, c06 520#endif 521 522#ifdef LN 523 LDF [AO + 0 * SIZE], a1 524 LDF [AO + 1 * SIZE], a2 525 526 FMUL a1, c01, t1 527 FMUL a2, c02, t2 528 FMUL a1, c02, t3 529 FMUL a2, c01, t4 530 531 FMUL a1, c05, t5 532 FMUL a2, c06, t6 533 FMUL a1, c06, t7 534 FMUL a2, c05, t8 535 536 FADD4 t1, t2, c01 537 FADD2 t3, t4, c02 538 FADD4 t5, t6, c05 539 FADD2 t7, t8, c06 540#endif 541 542#ifdef LT 543 LDF [AO + 0 * SIZE], a1 544 LDF [AO + 1 * SIZE], a2 545 546 FMUL a1, c01, t1 547 FMUL a2, c02, t2 548 FMUL a1, c02, t3 549 FMUL a2, c01, t4 550 551 FMUL a1, c05, t5 552 FMUL a2, c06, t6 553 FMUL a1, c06, t7 554 FMUL a2, c05, t8 555 556 FADD4 t1, t2, c01 557 FADD2 t3, t4, c02 558 FADD4 t5, t6, c05 559 FADD2 t7, t8, c06 560#endif 561 562#ifdef RN 563 LDF [BO + 0 * SIZE], a1 564 LDF [BO + 1 * SIZE], a2 565 LDF [BO + 2 * SIZE], a3 566 LDF [BO + 3 * SIZE], a4 567 LDF [BO + 6 * SIZE], b1 568 LDF [BO + 7 * SIZE], b2 569 570 FMUL a1, c01, t1 571 FMUL a2, c02, t2 572 FMUL a1, c02, t3 573 FMUL a2, c01, t4 574 575 FADD4 t1, t2, c01 576 FADD3 t3, t4, c02 577 578 FMUL a3, c01, t1 579 FMUL a3, c02, t2 580 FMUL a4, c02, t3 581 FMUL a4, c01, t4 582 583 FSUB c05, t1, c05 584 FSUB c06, t2, c06 585 FADD3 c05, t3, c05 586 FADD4 c06, t4, c06 587 
588 FMUL b1, c05, t1 589 FMUL b2, c06, t2 590 FMUL b1, c06, t3 591 FMUL b2, c05, t4 592 593 FADD4 t1, t2, c05 594 FADD3 t3, t4, c06 595#endif 596 597#ifdef RT 598 LDF [BO + 6 * SIZE], a1 599 LDF [BO + 7 * SIZE], a2 600 LDF [BO + 4 * SIZE], a3 601 LDF [BO + 5 * SIZE], a4 602 LDF [BO + 0 * SIZE], b1 603 LDF [BO + 1 * SIZE], b2 604 605 FMUL a1, c05, t1 606 FMUL a2, c06, t2 607 FMUL a1, c06, t3 608 FMUL a2, c05, t4 609 610 FADD4 t1, t2, c05 611 FADD3 t3, t4, c06 612 613 FMUL a3, c05, t1 614 FMUL a3, c06, t2 615 FMUL a4, c06, t3 616 FMUL a4, c05, t4 617 618 FSUB c01, t1, c01 619 FSUB c02, t2, c02 620 FADD3 c01, t3, c01 621 FADD4 c02, t4, c02 622 623 FMUL b1, c01, t1 624 FMUL b2, c02, t2 625 FMUL b1, c02, t3 626 FMUL b2, c01, t4 627 628 FADD4 t1, t2, c01 629 FADD3 t3, t4, c02 630#endif 631 632#ifdef LN 633 add C1, -2 * SIZE, C1 634 add C2, -2 * SIZE, C2 635#endif 636 637#if defined(LN) || defined(LT) 638 STF c01, [BO + 0 * SIZE] 639 STF c02, [BO + 1 * SIZE] 640 STF c05, [BO + 2 * SIZE] 641 STF c06, [BO + 3 * SIZE] 642#else 643 STF c01, [AO + 0 * SIZE] 644 STF c02, [AO + 1 * SIZE] 645 STF c05, [AO + 2 * SIZE] 646 STF c06, [AO + 3 * SIZE] 647#endif 648 649 STF c01, [C1 + 0 * SIZE] 650 STF c02, [C1 + 1 * SIZE] 651 STF c05, [C2 + 0 * SIZE] 652 STF c06, [C2 + 1 * SIZE] 653 654 FMOV FZERO, t1 655 FMOV FZERO, t2 656 FMOV FZERO, t3 657 FMOV FZERO, t4 658 659#ifndef LN 660 add C1, 2 * SIZE, C1 661 add C2, 2 * SIZE, C2 662#endif 663 664#ifdef RT 665 sll K, 0 + ZBASE_SHIFT, TEMP1 666 add AORIG, TEMP1, AORIG 667#endif 668 669#if defined(LT) || defined(RN) 670 sub K, KK, TEMP1 671 sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 672 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 673 add AO, TEMP2, AO 674 add BO, TEMP1, BO 675#endif 676 677#ifdef LT 678 add KK, 1, KK 679#endif 680 681#ifdef LN 682 sub KK, 1, KK 683#endif 684 685.LL50: 686 sra M, 1, I 687 cmp I, 0 688 ble,pn %icc, .LL99 689 nop 690 691.LL21: 692#if defined(LT) || defined(RN) 693 sra KK, 2, L 694 695 mov B, BO 696 cmp L, 0 697#else 698 699#ifdef 
LN 700 sll K, 1 + ZBASE_SHIFT, TEMP1 701 sub AORIG, TEMP1, AORIG 702#endif 703 704 sll KK, 1 + ZBASE_SHIFT, TEMP1 705 706 add AORIG, TEMP1, AO 707 add B, TEMP1, BO 708 709 sub K, KK, TEMP1 710 711 sra TEMP1, 2, L 712 cmp L, 0 713#endif 714 715 FMOV FZERO, t1 716 FMOV FZERO, t2 717 FMOV FZERO, t3 718 FMOV FZERO, t4 719 720 FMOV FZERO, c01 721 FMOV FZERO, c02 722 723 LDF [AO + 0 * SIZE], a1 724 FMOV FZERO, c03 725 LDF [BO + 0 * SIZE], b1 726 FMOV FZERO, c04 727 728 LDF [AO + 1 * SIZE], a2 729 FMOV FZERO, c05 730 LDF [BO + 1 * SIZE], b2 731 FMOV FZERO, c06 732 733 LDF [AO + 2 * SIZE], a3 734 FMOV FZERO, c07 735 LDF [BO + 2 * SIZE], b3 736 FMOV FZERO, c08 737 738 LDF [AO + 3 * SIZE], a4 739 FMOV FZERO, c09 740 LDF [BO + 3 * SIZE], b4 741 FMOV FZERO, c10 742 743 LDF [BO + 4 * SIZE], b5 744 FMOV FZERO, c11 745 LDF [AO + 4 * SIZE], a5 746 FMOV FZERO, c12 747 748#ifdef LN 749 prefetch [C1 - 3 * SIZE], 3 750 FMOV FZERO, c13 751 prefetch [C2 - 3 * SIZE], 3 752 FMOV FZERO, c14 753#else 754 prefetch [C1 + 3 * SIZE], 3 755 FMOV FZERO, c13 756 prefetch [C2 + 3 * SIZE], 3 757 FMOV FZERO, c14 758#endif 759 760 FMOV FZERO, c15 761 ble,pn %icc, .LL25 762 FMOV FZERO, c16 763 764.LL22: 765 FADD2 c04, t1, c04 766 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY 767 FMUL a1, b1, t1 768 nop 769 770 FADD4 c08, t2, c08 771 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY 772 FMUL a1, b2, t2 773 add AO, 16 * SIZE, AO 774 775 FADD2 c12, t3, c12 776 LDF [AO - 13 * SIZE], a4 777 FMUL a1, b3, t3 778 add BO, 16 * SIZE, BO 779 780 FADD4 c16, t4, c16 781 nop 782 FMUL a1, b4, t4 783 LDF [AO - 8 * SIZE], a1 784 785 FADD1 c01, t1, c01 786 nop 787 FMUL a2, b1, t1 788 nop 789 790 FADD3 c05, t2, c05 791 nop 792 FMUL a2, b2, t2 793 nop 794 795 FADD1 c09, t3, c09 796 nop 797 FMUL a2, b3, t3 798 nop 799 800 FADD3 c13, t4, c13 801 add L, -1, L 802 FMUL a2, b4, t4 803 LDF [AO - 11 * SIZE], a2 804 805 FADD2 c02, t1, c02 806 nop 807 FMUL a3, b1, t1 808 nop 809 810 FADD4 c06, t2, c06 811 nop 812 
FMUL a3, b2, t2 813 nop 814 815 FADD2 c10, t3, c10 816 nop 817 FMUL a3, b3, t3 818 nop 819 820 FADD4 c14, t4, c14 821 nop 822 FMUL a3, b4, t4 823 LDF [AO - 10 * SIZE], a3 824 825 FADD1 c03, t1, c03 826 nop 827 FMUL a4, b1, t1 828 LDF [BO - 8 * SIZE], b1 829 830 FADD3 c07, t2, c07 831 nop 832 FMUL a4, b2, t2 833 LDF [BO - 11 * SIZE], b2 834 835 FADD1 c11, t3, c11 836 nop 837 FMUL a4, b3, t3 838 LDF [BO - 10 * SIZE], b3 839 840 FADD3 c15, t4, c15 841 nop 842 FMUL a4, b4, t4 843 LDF [BO - 9 * SIZE], b4 844 845 FADD2 c04, t1, c04 846 nop 847 FMUL a5, b5, t1 848 LDF [AO - 9 * SIZE], a4 849 850 FADD4 c08, t2, c08 851 nop 852 FMUL a5, b2, t2 853 nop 854 855 FADD2 c12, t3, c12 856 nop 857 FMUL a5, b3, t3 858 nop 859 860 FADD4 c16, t4, c16 861 nop 862 FMUL a5, b4, t4 863 LDF [AO - 4 * SIZE], a5 864 865 FADD1 c01, t1, c01 866 nop 867 FMUL a2, b5, t1 868 nop 869 870 FADD3 c05, t2, c05 871 nop 872 FMUL a2, b2, t2 873 nop 874 875 FADD1 c09, t3, c09 876 nop 877 FMUL a2, b3, t3 878 nop 879 880 FADD3 c13, t4, c13 881 nop 882 FMUL a2, b4, t4 883 LDF [AO - 7 * SIZE], a2 884 885 FADD2 c02, t1, c02 886 nop 887 FMUL a3, b5, t1 888 nop 889 890 FADD4 c06, t2, c06 891 nop 892 FMUL a3, b2, t2 893 nop 894 895 FADD2 c10, t3, c10 896 nop 897 FMUL a3, b3, t3 898 nop 899 900 FADD4 c14, t4, c14 901 nop 902 FMUL a3, b4, t4 903 LDF [AO - 6 * SIZE], a3 904 905 FADD1 c03, t1, c03 906 nop 907 FMUL a4, b5, t1 908 LDF [BO - 4 * SIZE], b5 909 910 FADD3 c07, t2, c07 911 nop 912 FMUL a4, b2, t2 913 LDF [BO - 7 * SIZE], b2 914 915 FADD1 c11, t3, c11 916 nop 917 FMUL a4, b3, t3 918 LDF [BO - 6 * SIZE], b3 919 920 FADD3 c15, t4, c15 921 nop 922 FMUL a4, b4, t4 923 LDF [BO - 5 * SIZE], b4 924 925 FADD2 c04, t1, c04 926 nop 927 FMUL a1, b1, t1 928 LDF [AO - 5 * SIZE], a4 929 930 FADD4 c08, t2, c08 931 nop 932 FMUL a1, b2, t2 933 nop 934 935 FADD2 c12, t3, c12 936 nop 937 FMUL a1, b3, t3 938 nop 939 940 FADD4 c16, t4, c16 941 nop 942 FMUL a1, b4, t4 943 LDF [AO - 0 * SIZE], a1 944 945 FADD1 c01, t1, c01 946 nop 
947 FMUL a2, b1, t1 948 nop 949 950#ifdef DOUBLE 951 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 952#else 953 nop 954#endif 955 FADD3 c05, t2, c05 956 nop 957 FMUL a2, b2, t2 958 959 FADD1 c09, t3, c09 960 nop 961 FMUL a2, b3, t3 962 nop 963 964 FADD3 c13, t4, c13 965 nop 966 FMUL a2, b4, t4 967 nop 968 969 FADD2 c02, t1, c02 970 nop 971 FMUL a3, b1, t1 972 LDF [AO - 3 * SIZE], a2 973 974 FADD4 c06, t2, c06 975#ifdef DOUBLE 976 prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY 977#else 978 nop 979#endif 980 FMUL a3, b2, t2 981 nop 982 983 FADD2 c10, t3, c10 984 nop 985 FMUL a3, b3, t3 986 nop 987 988 FADD4 c14, t4, c14 989 nop 990 FMUL a3, b4, t4 991 LDF [AO - 2 * SIZE], a3 992 993 FADD1 c03, t1, c03 994 nop 995 FMUL a4, b1, t1 996 LDF [BO - 0 * SIZE], b1 997 998 FADD3 c07, t2, c07 999 nop 1000 FMUL a4, b2, t2 1001 LDF [BO - 3 * SIZE], b2 1002 1003 FADD1 c11, t3, c11 1004 nop 1005 FMUL a4, b3, t3 1006 LDF [BO - 2 * SIZE], b3 1007 1008 FADD3 c15, t4, c15 1009 nop 1010 FMUL a4, b4, t4 1011 LDF [BO - 1 * SIZE], b4 1012 1013 FADD2 c04, t1, c04 1014 nop 1015 FMUL a5, b5, t1 1016 LDF [AO - 1 * SIZE], a4 1017 1018 FADD4 c08, t2, c08 1019 FMUL a5, b2, t2 1020 FADD2 c12, t3, c12 1021 FMUL a5, b3, t3 1022 1023 FADD4 c16, t4, c16 1024 nop 1025 FMUL a5, b4, t4 1026 LDF [AO + 4 * SIZE], a5 1027 1028 FADD1 c01, t1, c01 1029 nop 1030 FMUL a2, b5, t1 1031 nop 1032 1033 FADD3 c05, t2, c05 1034 nop 1035 FMUL a2, b2, t2 1036 nop 1037 1038 FADD1 c09, t3, c09 1039 nop 1040 FMUL a2, b3, t3 1041 nop 1042 1043 FADD3 c13, t4, c13 1044 nop 1045 FMUL a2, b4, t4 1046 LDF [AO + 1 * SIZE], a2 1047 1048 FADD2 c02, t1, c02 1049 nop 1050 FMUL a3, b5, t1 1051 nop 1052 1053 FADD4 c06, t2, c06 1054 nop 1055 FMUL a3, b2, t2 1056 nop 1057 1058 FADD2 c10, t3, c10 1059 nop 1060 FMUL a3, b3, t3 1061 nop 1062 1063 FADD4 c14, t4, c14 1064 nop 1065 FMUL a3, b4, t4 1066 LDF [AO + 2 * SIZE], a3 1067 1068 FADD1 c03, t1, c03 1069 cmp L, 0 1070 FMUL a4, b5, t1 1071 LDF [BO + 4 * 
SIZE], b5 1072 1073 FADD3 c07, t2, c07 1074 nop 1075 FMUL a4, b2, t2 1076 LDF [BO + 1 * SIZE], b2 1077 1078 FADD1 c11, t3, c11 1079 nop 1080 FMUL a4, b3, t3 1081 LDF [BO + 2 * SIZE], b3 1082 1083 FADD3 c15, t4, c15 1084 FMUL a4, b4, t4 1085 bg,pt %icc, .LL22 1086 LDF [BO + 3 * SIZE], b4 1087 1088.LL25: 1089#if defined(LT) || defined(RN) 1090 and KK, 3, L 1091#else 1092 and TEMP1, 3, L 1093#endif 1094 cmp L, 0 1095 ble,pn %icc, .LL29 1096 nop 1097 1098.LL26: 1099 FADD2 c04, t1, c04 1100 LDF [AO + 3 * SIZE], a4 1101 FMUL a1, b1, t1 1102 add AO, 4 * SIZE, AO 1103 1104 FADD4 c08, t2, c08 1105 add BO, 4 * SIZE, BO 1106 FMUL a1, b2, t2 1107 add L, -1, L 1108 1109 FADD2 c12, t3, c12 1110 nop 1111 FMUL a1, b3, t3 1112 cmp L, 0 1113 1114 FADD4 c16, t4, c16 1115 nop 1116 FMUL a1, b4, t4 1117 LDF [AO + 0 * SIZE], a1 1118 1119 FADD1 c01, t1, c01 1120 nop 1121 FMUL a2, b1, t1 1122 nop 1123 1124 FADD3 c05, t2, c05 1125 nop 1126 FMUL a2, b2, t2 1127 nop 1128 1129 FADD1 c09, t3, c09 1130 nop 1131 FMUL a2, b3, t3 1132 nop 1133 1134 FADD3 c13, t4, c13 1135 nop 1136 FMUL a2, b4, t4 1137 LDF [AO + 1 * SIZE], a2 1138 1139 FADD2 c02, t1, c02 1140 nop 1141 FMUL a3, b1, t1 1142 nop 1143 1144 FADD4 c06, t2, c06 1145 nop 1146 FMUL a3, b2, t2 1147 nop 1148 1149 FADD2 c10, t3, c10 1150 nop 1151 FMUL a3, b3, t3 1152 nop 1153 1154 FADD4 c14, t4, c14 1155 nop 1156 FMUL a3, b4, t4 1157 LDF [AO + 2 * SIZE], a3 1158 1159 FADD1 c03, t1, c03 1160 nop 1161 FMUL a4, b1, t1 1162 LDF [BO + 0 * SIZE], b1 1163 1164 FADD3 c07, t2, c07 1165 nop 1166 FMUL a4, b2, t2 1167 LDF [BO + 1 * SIZE], b2 1168 1169 FADD1 c11, t3, c11 1170 nop 1171 FMUL a4, b3, t3 1172 LDF [BO + 2 * SIZE], b3 1173 1174 FADD3 c15, t4, c15 1175 FMUL a4, b4, t4 1176 bg,pt %icc, .LL26 1177 LDF [BO + 3 * SIZE], b4 1178 1179.LL29: 1180#if defined(LN) || defined(RT) 1181 sub KK, 2, TEMP1 1182 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 1183 add AORIG, TEMP1, AO 1184 add B, TEMP1, BO 1185#endif 1186 1187 FADD2 c04, t1, c04 1188 FADD4 c08, t2, c08 1189 
FADD2 c12, t3, c12 1190 FADD4 c16, t4, c16 1191 1192 FADD c01, c06, c01 1193 FADD c02, c05, c02 1194 FADD c03, c08, c03 1195 FADD c04, c07, c04 1196 1197 FADD c09, c14, c09 1198 FADD c10, c13, c10 1199 FADD c11, c16, c11 1200 FADD c12, c15, c12 1201 1202#if defined(LN) || defined(LT) 1203 LDF [BO + 0 * SIZE], a1 1204 LDF [BO + 1 * SIZE], a2 1205 LDF [BO + 2 * SIZE], a3 1206 LDF [BO + 3 * SIZE], a4 1207 1208 LDF [BO + 4 * SIZE], b1 1209 LDF [BO + 5 * SIZE], b2 1210 LDF [BO + 6 * SIZE], b3 1211 LDF [BO + 7 * SIZE], b4 1212 1213 FSUB a1, c01, c01 1214 FSUB a2, c02, c02 1215 FSUB a3, c09, c09 1216 FSUB a4, c10, c10 1217 1218 FSUB b1, c03, c03 1219 FSUB b2, c04, c04 1220 FSUB b3, c11, c11 1221 FSUB b4, c12, c12 1222#else 1223 LDF [AO + 0 * SIZE], a1 1224 LDF [AO + 1 * SIZE], a2 1225 LDF [AO + 2 * SIZE], a3 1226 LDF [AO + 3 * SIZE], a4 1227 1228 LDF [AO + 4 * SIZE], b1 1229 LDF [AO + 5 * SIZE], b2 1230 LDF [AO + 6 * SIZE], b3 1231 LDF [AO + 7 * SIZE], b4 1232 1233 FSUB a1, c01, c01 1234 FSUB a2, c02, c02 1235 FSUB a3, c03, c03 1236 FSUB a4, c04, c04 1237 1238 FSUB b1, c09, c09 1239 FSUB b2, c10, c10 1240 FSUB b3, c11, c11 1241 FSUB b4, c12, c12 1242#endif 1243 1244#ifdef LN 1245 LDF [AO + 6 * SIZE], a1 1246 LDF [AO + 7 * SIZE], a2 1247 LDF [AO + 4 * SIZE], a3 1248 LDF [AO + 5 * SIZE], a4 1249 LDF [AO + 0 * SIZE], b1 1250 LDF [AO + 1 * SIZE], b2 1251 1252 FMUL a1, c03, t1 1253 FMUL a2, c04, t2 1254 FMUL a1, c04, t3 1255 FMUL a2, c03, t4 1256 1257 FMUL a1, c11, t5 1258 FMUL a2, c12, t6 1259 FMUL a1, c12, t7 1260 FMUL a2, c11, t8 1261 1262 FADD4 t1, t2, c03 1263 FADD2 t3, t4, c04 1264 FADD4 t5, t6, c11 1265 FADD2 t7, t8, c12 1266 1267 FMUL a3, c03, t1 1268 FMUL a3, c04, t2 1269 FMUL a3, c11, t3 1270 FMUL a3, c12, t4 1271 1272 FMUL a4, c04, t5 1273 FMUL a4, c03, t6 1274 FMUL a4, c12, t7 1275 FMUL a4, c11, t8 1276 1277 FSUB c01, t1, c01 1278 FSUB c02, t2, c02 1279 FSUB c09, t3, c09 1280 FSUB c10, t4, c10 1281 1282 FADD2 c01, t5, c01 1283 FADD4 c02, t6, c02 1284 FADD2 c09, t7, 
c09 1285 FADD4 c10, t8, c10 1286 1287 FMUL b1, c01, t1 1288 FMUL b2, c02, t2 1289 FMUL b1, c02, t3 1290 FMUL b2, c01, t4 1291 1292 FMUL b1, c09, t5 1293 FMUL b2, c10, t6 1294 FMUL b1, c10, t7 1295 FMUL b2, c09, t8 1296 1297 FADD4 t1, t2, c01 1298 FADD2 t3, t4, c02 1299 FADD4 t5, t6, c09 1300 FADD2 t7, t8, c10 1301#endif 1302 1303#ifdef LT 1304 LDF [AO + 0 * SIZE], a1 1305 LDF [AO + 1 * SIZE], a2 1306 LDF [AO + 2 * SIZE], a3 1307 LDF [AO + 3 * SIZE], a4 1308 LDF [AO + 6 * SIZE], b1 1309 LDF [AO + 7 * SIZE], b2 1310 1311 FMUL a1, c01, t1 1312 FMUL a2, c02, t2 1313 FMUL a1, c02, t3 1314 FMUL a2, c01, t4 1315 1316 FMUL a1, c09, t5 1317 FMUL a2, c10, t6 1318 FMUL a1, c10, t7 1319 FMUL a2, c09, t8 1320 1321 FADD4 t1, t2, c01 1322 FADD2 t3, t4, c02 1323 FADD4 t5, t6, c09 1324 FADD2 t7, t8, c10 1325 1326 FMUL a3, c01, t1 1327 FMUL a3, c02, t2 1328 FMUL a3, c09, t3 1329 FMUL a3, c10, t4 1330 1331 FMUL a4, c02, t5 1332 FMUL a4, c01, t6 1333 FMUL a4, c10, t7 1334 FMUL a4, c09, t8 1335 1336 FSUB c03, t1, c03 1337 FSUB c04, t2, c04 1338 FSUB c11, t3, c11 1339 FSUB c12, t4, c12 1340 1341 FADD2 c03, t5, c03 1342 FADD4 c04, t6, c04 1343 FADD2 c11, t7, c11 1344 FADD4 c12, t8, c12 1345 1346 FMUL b1, c03, t1 1347 FMUL b2, c04, t2 1348 FMUL b1, c04, t3 1349 FMUL b2, c03, t4 1350 1351 FMUL b1, c11, t5 1352 FMUL b2, c12, t6 1353 FMUL b1, c12, t7 1354 FMUL b2, c11, t8 1355 1356 FADD4 t1, t2, c03 1357 FADD2 t3, t4, c04 1358 FADD4 t5, t6, c11 1359 FADD2 t7, t8, c12 1360#endif 1361 1362#ifdef RN 1363 LDF [BO + 0 * SIZE], a1 1364 LDF [BO + 1 * SIZE], a2 1365 LDF [BO + 2 * SIZE], a3 1366 LDF [BO + 3 * SIZE], a4 1367 LDF [BO + 6 * SIZE], b1 1368 LDF [BO + 7 * SIZE], b2 1369 1370 FMUL a1, c01, t1 1371 FMUL a2, c02, t2 1372 FMUL a1, c02, t3 1373 FMUL a2, c01, t4 1374 1375 FMUL a1, c03, t5 1376 FMUL a2, c04, t6 1377 FMUL a1, c04, t7 1378 FMUL a2, c03, t8 1379 1380 FADD4 t1, t2, c01 1381 FADD3 t3, t4, c02 1382 FADD4 t5, t6, c03 1383 FADD3 t7, t8, c04 1384 1385 FMUL a3, c01, t1 1386 FMUL a3, c02, 
t2 1387 FMUL a3, c03, t3 1388 FMUL a3, c04, t4 1389 1390 FMUL a4, c02, t5 1391 FMUL a4, c01, t6 1392 FMUL a4, c04, t7 1393 FMUL a4, c03, t8 1394 1395 FSUB c09, t1, c09 1396 FSUB c10, t2, c10 1397 FSUB c11, t3, c11 1398 FSUB c12, t4, c12 1399 1400 FADD3 c09, t5, c09 1401 FADD4 c10, t6, c10 1402 FADD3 c11, t7, c11 1403 FADD4 c12, t8, c12 1404 1405 FMUL b1, c09, t1 1406 FMUL b2, c10, t2 1407 FMUL b1, c10, t3 1408 FMUL b2, c09, t4 1409 1410 FMUL b1, c11, t5 1411 FMUL b2, c12, t6 1412 FMUL b1, c12, t7 1413 FMUL b2, c11, t8 1414 1415 FADD4 t1, t2, c09 1416 FADD3 t3, t4, c10 1417 FADD4 t5, t6, c11 1418 FADD3 t7, t8, c12 1419#endif 1420 1421#ifdef RT 1422 LDF [BO + 6 * SIZE], a1 1423 LDF [BO + 7 * SIZE], a2 1424 LDF [BO + 4 * SIZE], a3 1425 LDF [BO + 5 * SIZE], a4 1426 LDF [BO + 0 * SIZE], b1 1427 LDF [BO + 1 * SIZE], b2 1428 1429 FMUL a1, c09, t1 1430 FMUL a2, c10, t2 1431 FMUL a1, c10, t3 1432 FMUL a2, c09, t4 1433 1434 FMUL a1, c11, t5 1435 FMUL a2, c12, t6 1436 FMUL a1, c12, t7 1437 FMUL a2, c11, t8 1438 1439 FADD4 t1, t2, c09 1440 FADD3 t3, t4, c10 1441 FADD4 t5, t6, c11 1442 FADD3 t7, t8, c12 1443 1444 FMUL a3, c09, t1 1445 FMUL a3, c10, t2 1446 FMUL a3, c11, t3 1447 FMUL a3, c12, t4 1448 1449 FMUL a4, c10, t5 1450 FMUL a4, c09, t6 1451 FMUL a4, c12, t7 1452 FMUL a4, c11, t8 1453 1454 FSUB c01, t1, c01 1455 FSUB c02, t2, c02 1456 FSUB c03, t3, c03 1457 FSUB c04, t4, c04 1458 1459 FADD3 c01, t5, c01 1460 FADD4 c02, t6, c02 1461 FADD3 c03, t7, c03 1462 FADD4 c04, t8, c04 1463 1464 FMUL b1, c01, t1 1465 FMUL b2, c02, t2 1466 FMUL b1, c02, t3 1467 FMUL b2, c01, t4 1468 1469 FMUL b1, c03, t5 1470 FMUL b2, c04, t6 1471 FMUL b1, c04, t7 1472 FMUL b2, c03, t8 1473 1474 FADD4 t1, t2, c01 1475 FADD3 t3, t4, c02 1476 FADD4 t5, t6, c03 1477 FADD3 t7, t8, c04 1478#endif 1479 1480#ifdef LN 1481 add C1, -4 * SIZE, C1 1482 add C2, -4 * SIZE, C2 1483#endif 1484 1485#if defined(LN) || defined(LT) 1486 STF c01, [BO + 0 * SIZE] 1487 STF c02, [BO + 1 * SIZE] 1488 STF c09, [BO + 2 * SIZE] 
1489 STF c10, [BO + 3 * SIZE] 1490 1491 STF c03, [BO + 4 * SIZE] 1492 STF c04, [BO + 5 * SIZE] 1493 STF c11, [BO + 6 * SIZE] 1494 STF c12, [BO + 7 * SIZE] 1495#else 1496 STF c01, [AO + 0 * SIZE] 1497 STF c02, [AO + 1 * SIZE] 1498 STF c03, [AO + 2 * SIZE] 1499 STF c04, [AO + 3 * SIZE] 1500 1501 STF c09, [AO + 4 * SIZE] 1502 STF c10, [AO + 5 * SIZE] 1503 STF c11, [AO + 6 * SIZE] 1504 STF c12, [AO + 7 * SIZE] 1505#endif 1506 1507 STF c01, [C1 + 0 * SIZE] 1508 STF c02, [C1 + 1 * SIZE] 1509 STF c03, [C1 + 2 * SIZE] 1510 STF c04, [C1 + 3 * SIZE] 1511 1512 STF c09, [C2 + 0 * SIZE] 1513 STF c10, [C2 + 1 * SIZE] 1514 STF c11, [C2 + 2 * SIZE] 1515 STF c12, [C2 + 3 * SIZE] 1516 1517#ifndef LN 1518 add C1, 4 * SIZE, C1 1519 add C2, 4 * SIZE, C2 1520#endif 1521 1522#ifdef RT 1523 sll K, 1 + ZBASE_SHIFT, TEMP1 1524 add AORIG, TEMP1, AORIG 1525#endif 1526 1527#if defined(LT) || defined(RN) 1528 sub K, KK, TEMP1 1529 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 1530 add AO, TEMP1, AO 1531 add BO, TEMP1, BO 1532#endif 1533 1534#ifdef LT 1535 add KK, 2, KK 1536#endif 1537 1538#ifdef LN 1539 sub KK, 2, KK 1540#endif 1541 1542 add I, -1, I 1543 cmp I, 0 1544 bg,pt %icc, .LL21 1545 nop 1546 1547.LL99: 1548#ifdef LN 1549 sll K, 1 + ZBASE_SHIFT, TEMP1 1550 add B, TEMP1, B 1551#endif 1552 1553#if defined(LT) || defined(RN) 1554 mov BO, B 1555#endif 1556 1557#ifdef RN 1558 add KK, 2, KK 1559#endif 1560 1561#ifdef RT 1562 sub KK, 2, KK 1563#endif 1564 1565 add J, -1, J 1566 cmp J, 0 1567 bg,pt %icc, .LL11 1568 nop 1569 1570.LL100: 1571 and N, 1, J 1572 1573 cmp J, 0 1574 ble,pn %icc, .LL999 1575 nop 1576 1577#ifdef RT 1578 sll K, 0 + ZBASE_SHIFT, TEMP1 1579 sub B, TEMP1, B 1580 1581 sub C, LDC, C 1582#endif 1583 1584 mov C, C1 1585 1586#ifdef LN 1587 add M, OFFSET, KK 1588#endif 1589 1590#ifdef LT 1591 mov OFFSET, KK 1592#endif 1593 1594#if defined(LN) || defined(RT) 1595 mov A, AORIG 1596#else 1597 mov A, AO 1598#endif 1599 1600#ifndef RT 1601 add C, LDC, C 1602#endif 1603 1604 and M, 1, I 1605 cmp 
I, 0 1606 ble,pn %icc, .LL150 1607 nop 1608 1609#if defined(LT) || defined(RN) 1610 sra KK, 2, L 1611 1612 mov B, BO 1613 cmp L, 0 1614#else 1615 1616#ifdef LN 1617 sll K, 0 + ZBASE_SHIFT, TEMP1 1618 sub AORIG, TEMP1, AORIG 1619#endif 1620 1621 sll KK, 0 + ZBASE_SHIFT, TEMP1 1622 add AORIG, TEMP1, AO 1623 add B, TEMP1, BO 1624 1625 sub K, KK, TEMP1 1626 1627 sra TEMP1, 2, L 1628 cmp L, 0 1629#endif 1630 1631 LDF [AO + 0 * SIZE], a1 1632 FMOV FZERO, c01 1633 LDF [BO + 0 * SIZE], b1 1634 FMOV FZERO, t1 1635 1636 LDF [AO + 1 * SIZE], a2 1637 FMOV FZERO, c02 1638 LDF [BO + 1 * SIZE], b2 1639 FMOV FZERO, t2 1640 1641 LDF [AO + 2 * SIZE], a3 1642 FMOV FZERO, c03 1643 LDF [BO + 2 * SIZE], b3 1644 FMOV FZERO, t3 1645 1646 LDF [AO + 3 * SIZE], a4 1647 FMOV FZERO, c04 1648 LDF [BO + 3 * SIZE], b4 1649 FMOV FZERO, t4 1650 1651 ble,pn %icc, .LL155 1652 nop 1653 1654.LL152: 1655 FADD1 c01, t1, c01 1656 add L, -1, L 1657 FMUL a1, b1, t1 1658 prefetch [AO + APREFETCHSIZE * SIZE], 0 1659 1660 FADD3 c02, t2, c02 1661 add BO, 8 * SIZE, BO 1662 FMUL a1, b2, t2 1663 LDF [AO + 4 * SIZE], a1 1664 1665 FADD2 c03, t3, c03 1666 cmp L, 0 1667 FMUL a2, b1, t3 1668 LDF [BO - 4 * SIZE], b1 1669 1670 FADD4 c04, t4, c04 1671 nop 1672 FMUL a2, b2, t4 1673 LDF [AO + 5 * SIZE], a2 1674 1675 FADD1 c01, t1, c01 1676 nop 1677 FMUL a3, b3, t1 1678 LDF [BO - 3 * SIZE], b2 1679 1680 FADD3 c02, t2, c02 1681 nop 1682 FMUL a3, b4, t2 1683 LDF [AO + 6 * SIZE], a3 1684 1685 FADD2 c03, t3, c03 1686 nop 1687 FMUL a4, b3, t3 1688 LDF [BO - 2 * SIZE], b3 1689 1690 FADD4 c04, t4, c04 1691 nop 1692 FMUL a4, b4, t4 1693 LDF [AO + 7 * SIZE], a4 1694 1695 FADD1 c01, t1, c01 1696 nop 1697 FMUL a1, b1, t1 1698 LDF [BO - 1 * SIZE], b4 1699 1700 FADD3 c02, t2, c02 1701 FMUL a1, b2, t2 1702 LDF [AO + 8 * SIZE], a1 1703 1704 FADD2 c03, t3, c03 1705 FMUL a2, b1, t3 1706 LDF [BO + 0 * SIZE], b1 1707 1708 FADD4 c04, t4, c04 1709 FMUL a2, b2, t4 1710 LDF [AO + 9 * SIZE], a2 1711 1712 FADD1 c01, t1, c01 1713 FMUL a3, b3, t1 1714 
LDF [BO + 1 * SIZE], b2 1715 1716 FADD3 c02, t2, c02 1717 FMUL a3, b4, t2 1718 LDF [AO + 10 * SIZE], a3 1719 1720 FADD2 c03, t3, c03 1721 FMUL a4, b3, t3 1722 LDF [BO + 2 * SIZE], b3 1723 1724 FADD4 c04, t4, c04 1725 FMUL a4, b4, t4 1726 LDF [AO + 11 * SIZE], a4 1727 1728 add AO, 8 * SIZE, AO 1729 bg,pt %icc, .LL152 1730 LDF [BO + 3 * SIZE], b4 1731 1732.LL155: 1733#if defined(LT) || defined(RN) 1734 and KK, 3, L 1735#else 1736 and TEMP1, 3, L 1737#endif 1738 cmp L, 0 1739 ble,a,pn %icc, .LL159 1740 nop 1741 1742.LL156: 1743 FADD1 c01, t1, c01 1744 add AO, 2 * SIZE, AO 1745 FMUL a1, b1, t1 1746 add BO, 2 * SIZE, BO 1747 FADD3 c02, t2, c02 1748 add L, -1, L 1749 FMUL a1, b2, t2 1750 LDF [AO + 0 * SIZE], a1 1751 FADD2 c03, t3, c03 1752 FMUL a2, b1, t3 1753 LDF [BO + 0 * SIZE], b1 1754 cmp L, 0 1755 FADD4 c04, t4, c04 1756 FMUL a2, b2, t4 1757 LDF [BO + 1 * SIZE], b2 1758 1759 bg,pt %icc, .LL156 1760 LDF [AO + 1 * SIZE], a2 1761 1762.LL159: 1763 FADD1 c01, t1, c01 1764 FADD3 c02, t2, c02 1765 FADD2 c03, t3, c03 1766 FADD4 c04, t4, c04 1767 1768 FADD c01, c04, c01 1769 FADD c02, c03, c02 1770 1771#if defined(LN) || defined(RT) 1772 sub KK, 1, TEMP1 1773 1774 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 1775 add AORIG, TEMP1, AO 1776 add B, TEMP1, BO 1777#endif 1778 1779#if defined(LN) || defined(LT) 1780 LDF [BO + 0 * SIZE], a1 1781 LDF [BO + 1 * SIZE], a2 1782 1783 FSUB a1, c01, c01 1784 FSUB a2, c02, c02 1785#else 1786 LDF [AO + 0 * SIZE], a1 1787 LDF [AO + 1 * SIZE], a2 1788 1789 FSUB a1, c01, c01 1790 FSUB a2, c02, c02 1791#endif 1792 1793#ifdef LN 1794 LDF [AO + 0 * SIZE], a1 1795 LDF [AO + 1 * SIZE], a2 1796 1797 FMUL a1, c01, t1 1798 FMUL a2, c02, t2 1799 FMUL a1, c02, t3 1800 FMUL a2, c01, t4 1801 1802 FADD4 t1, t2, c01 1803 FADD2 t3, t4, c02 1804#endif 1805 1806#ifdef LT 1807 LDF [AO + 0 * SIZE], a1 1808 LDF [AO + 1 * SIZE], a2 1809 1810 FMUL a1, c01, t1 1811 FMUL a2, c02, t2 1812 FMUL a1, c02, t3 1813 FMUL a2, c01, t4 1814 1815 FADD4 t1, t2, c01 1816 FADD2 t3, t4, c02 
1817#endif 1818 1819#ifdef RN 1820 LDF [BO + 0 * SIZE], a1 1821 LDF [BO + 1 * SIZE], a2 1822 1823 FMUL a1, c01, t1 1824 FMUL a2, c02, t2 1825 FMUL a1, c02, t3 1826 FMUL a2, c01, t4 1827 1828 FADD4 t1, t2, c01 1829 FADD3 t3, t4, c02 1830#endif 1831 1832#ifdef RT 1833 LDF [BO + 0 * SIZE], a1 1834 LDF [BO + 1 * SIZE], a2 1835 1836 FMUL a1, c01, t1 1837 FMUL a2, c02, t2 1838 FMUL a1, c02, t3 1839 FMUL a2, c01, t4 1840 1841 FADD4 t1, t2, c01 1842 FADD3 t3, t4, c02 1843#endif 1844 1845#ifdef LN 1846 add C1, -2 * SIZE, C1 1847#endif 1848 1849#if defined(LN) || defined(LT) 1850 STF c01, [BO + 0 * SIZE] 1851 STF c02, [BO + 1 * SIZE] 1852#else 1853 STF c01, [AO + 0 * SIZE] 1854 STF c02, [AO + 1 * SIZE] 1855#endif 1856 1857 STF c01, [C1 + 0 * SIZE] 1858 STF c02, [C1 + 1 * SIZE] 1859 1860 FMOV FZERO, t1 1861 FMOV FZERO, t2 1862 FMOV FZERO, t3 1863 FMOV FZERO, t4 1864 1865#ifndef LN 1866 add C1, 2 * SIZE, C1 1867#endif 1868 1869#ifdef RT 1870 sll K, 0 + ZBASE_SHIFT, TEMP1 1871 add AORIG, TEMP1, AORIG 1872#endif 1873 1874#if defined(LT) || defined(RN) 1875 sub K, KK, TEMP1 1876 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 1877 add AO, TEMP1, AO 1878 add BO, TEMP1, BO 1879#endif 1880 1881#ifdef LT 1882 add KK, 1, KK 1883#endif 1884 1885#ifdef LN 1886 sub KK, 1, KK 1887#endif 1888 1889.LL150: 1890 sra M, 1, I 1891 cmp I, 0 1892 ble,pn %icc, .LL199 1893 nop 1894 1895 1896.LL121: 1897#if defined(LT) || defined(RN) 1898 sra KK, 2, L 1899 1900 mov B, BO 1901 cmp L, 0 1902#else 1903 1904#ifdef LN 1905 sll K, 1 + ZBASE_SHIFT, TEMP1 1906 sub AORIG, TEMP1, AORIG 1907#endif 1908 1909 sll KK, 1 + ZBASE_SHIFT, TEMP1 1910 sll KK, 0 + ZBASE_SHIFT, TEMP2 1911 1912 add AORIG, TEMP1, AO 1913 add B, TEMP2, BO 1914 1915 sub K, KK, TEMP1 1916 sra TEMP1, 2, L 1917 cmp L, 0 1918#endif 1919 1920 FMOV FZERO, c03 1921 1922 LDF [AO + 0 * SIZE], a1 1923 FMOV FZERO, t1 1924 LDF [BO + 0 * SIZE], b1 1925 FMOV FZERO, c07 1926 1927 LDF [AO + 1 * SIZE], a2 1928 FMOV FZERO, t2 1929 LDF [BO + 1 * SIZE], b2 1930 FMOV FZERO, 
c04 1931 1932 LDF [AO + 2 * SIZE], a3 1933 FMOV FZERO, t3 1934 LDF [BO + 2 * SIZE], b3 1935 FMOV FZERO, c08 1936 1937 LDF [AO + 3 * SIZE], a4 1938 FMOV FZERO, t4 1939 LDF [BO + 3 * SIZE], b4 1940 FMOV FZERO, c01 1941 1942#ifdef LN 1943 prefetch [C1 - 3 * SIZE], 3 1944#else 1945 prefetch [C1 + 3 * SIZE], 3 1946#endif 1947 FMOV FZERO, c05 1948 FMOV FZERO, c02 1949 1950 ble,pn %icc, .LL125 1951 FMOV FZERO, c06 1952 1953.LL122: 1954 FADD1 c03, t1, c03 1955 add L, -1, L 1956 FMUL a1, b1, t1 1957 prefetch [AO + APREFETCHSIZE * SIZE], 0 1958 1959 FADD3 c07, t2, c07 1960 add BO, 8 * SIZE, BO 1961 FMUL a1, b2, t2 1962 LDF [AO + 4 * SIZE], a1 1963 1964 FADD2 c04, t3, c04 1965 add AO, 16 * SIZE, AO 1966 FMUL a2, b1, t3 1967 cmp L, 0 1968 1969 FADD4 c08, t4, c08 1970 nop 1971 FMUL a2, b2, t4 1972 LDF [AO - 11 * SIZE], a2 1973 1974 FADD1 c01, t1, c01 1975 nop 1976 FMUL a3, b1, t1 1977 nop 1978 1979 FADD3 c05, t2, c05 1980 nop 1981 FMUL a3, b2, t2 1982 LDF [AO - 10 * SIZE], a3 1983 1984 FADD2 c02, t3, c02 1985 nop 1986 FMUL a4, b1, t3 1987 LDF [BO - 4 * SIZE], b1 1988 1989 FADD4 c06, t4, c06 1990 nop 1991 FMUL a4, b2, t4 1992 LDF [BO - 3 * SIZE], b2 1993 1994 FADD1 c03, t1, c03 1995 nop 1996 FMUL a1, b3, t1 1997 LDF [AO - 9 * SIZE], a4 1998 1999 FADD3 c07, t2, c07 2000 nop 2001 FMUL a1, b4, t2 2002 LDF [AO - 8 * SIZE], a1 2003 2004 FADD2 c04, t3, c04 2005 nop 2006 FMUL a2, b3, t3 2007 nop 2008 2009 FADD4 c08, t4, c08 2010 nop 2011 FMUL a2, b4, t4 2012 LDF [AO - 7 * SIZE], a2 2013 2014 FADD1 c01, t1, c01 2015 nop 2016 FMUL a3, b3, t1 2017 nop 2018 2019 FADD3 c05, t2, c05 2020 nop 2021 FMUL a3, b4, t2 2022 LDF [AO - 6 * SIZE], a3 2023 2024 FADD2 c02, t3, c02 2025 nop 2026 FMUL a4, b3, t3 2027 LDF [BO - 2 * SIZE], b3 2028 2029 FADD4 c06, t4, c06 2030 nop 2031 FMUL a4, b4, t4 2032 LDF [BO - 1 * SIZE], b4 2033 2034 FADD1 c03, t1, c03 2035 nop 2036 FMUL a1, b1, t1 2037 LDF [AO - 5 * SIZE], a4 2038 2039 FADD3 c07, t2, c07 2040 nop 2041 FMUL a1, b2, t2 2042 LDF [AO - 4 * SIZE], a1 2043 
2044 FADD2 c04, t3, c04 2045 nop 2046 FMUL a2, b1, t3 2047 nop 2048 2049 FADD4 c08, t4, c08 2050 nop 2051 FMUL a2, b2, t4 2052 LDF [AO - 3 * SIZE], a2 2053 2054 FADD1 c01, t1, c01 2055 nop 2056 FMUL a3, b1, t1 2057 nop 2058 2059 FADD3 c05, t2, c05 2060 nop 2061 FMUL a3, b2, t2 2062 LDF [AO - 2 * SIZE], a3 2063 2064 FADD2 c02, t3, c02 2065 nop 2066 FMUL a4, b1, t3 2067 LDF [BO + 0 * SIZE], b1 2068 2069 FADD4 c06, t4, c06 2070 nop 2071 FMUL a4, b2, t4 2072 LDF [BO + 1 * SIZE], b2 2073 2074 FADD1 c03, t1, c03 2075 nop 2076 FMUL a1, b3, t1 2077 LDF [AO - 1 * SIZE], a4 2078 2079 FADD3 c07, t2, c07 2080 nop 2081 FMUL a1, b4, t2 2082 LDF [AO + 0 * SIZE], a1 2083 2084 FADD2 c04, t3, c04 2085 nop 2086 FMUL a2, b3, t3 2087 nop 2088 2089 FADD4 c08, t4, c08 2090 nop 2091 FMUL a2, b4, t4 2092 LDF [AO + 1 * SIZE], a2 2093 2094 FADD1 c01, t1, c01 2095 nop 2096 FMUL a3, b3, t1 2097 nop 2098 2099 FADD3 c05, t2, c05 2100 nop 2101 FMUL a3, b4, t2 2102 LDF [AO + 2 * SIZE], a3 2103 2104 FADD2 c02, t3, c02 2105 nop 2106 FMUL a4, b3, t3 2107 LDF [BO + 2 * SIZE], b3 2108 2109 FADD4 c06, t4, c06 2110 FMUL a4, b4, t4 2111 LDF [AO + 3 * SIZE], a4 2112 2113 bg,pt %icc, .LL122 2114 LDF [BO + 3 * SIZE], b4 2115 2116.LL125: 2117#if defined(LT) || defined(RN) 2118 and KK, 3, L 2119#else 2120 and TEMP1, 3, L 2121#endif 2122 cmp L, 0 2123 ble,a,pn %icc, .LL129 2124 nop 2125 2126.LL126: 2127 FADD1 c03, t1, c03 2128 add AO, 4 * SIZE, AO 2129 FMUL a1, b1, t1 2130 add BO, 2 * SIZE, BO 2131 2132 FADD3 c07, t2, c07 2133 add L, -1, L 2134 FMUL a1, b2, t2 2135 LDF [AO + 0 * SIZE], a1 2136 2137 FADD2 c04, t3, c04 2138 cmp L, 0 2139 FMUL a2, b1, t3 2140 2141 FADD4 c08, t4, c08 2142 FMUL a2, b2, t4 2143 LDF [AO + 1 * SIZE], a2 2144 2145 FADD1 c01, t1, c01 2146 FMUL a3, b1, t1 2147 FADD3 c05, t2, c05 2148 FMUL a3, b2, t2 2149 LDF [AO + 2 * SIZE], a3 2150 2151 FADD2 c02, t3, c02 2152 FMUL a4, b1, t3 2153 LDF [BO + 0 * SIZE], b1 2154 FADD4 c06, t4, c06 2155 FMUL a4, b2, t4 2156 LDF [BO + 1 * SIZE], b2 2157 bg,pt 
%icc, .LL126 2158 LDF [AO + 3 * SIZE], a4 2159 2160.LL129: 2161 FADD1 c03, t1, c03 2162 FADD3 c07, t2, c07 2163 FADD2 c04, t3, c04 2164 FADD4 c08, t4, c08 2165 2166 FADD c01, c06, c01 2167 FADD c02, c05, c02 2168 FADD c03, c08, c03 2169 FADD c04, c07, c04 2170 2171#if defined(LN) || defined(RT) 2172#ifdef LN 2173 sub KK, 2, TEMP1 2174#else 2175 sub KK, 1, TEMP1 2176#endif 2177 sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 2178 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 2179 add AORIG, TEMP2, AO 2180 add B, TEMP1, BO 2181#endif 2182 2183#if defined(LN) || defined(LT) 2184 LDF [BO + 0 * SIZE], a1 2185 LDF [BO + 1 * SIZE], a2 2186 LDF [BO + 2 * SIZE], a3 2187 LDF [BO + 3 * SIZE], a4 2188 2189 FSUB a1, c01, c01 2190 FSUB a2, c02, c02 2191 FSUB a3, c03, c03 2192 FSUB a4, c04, c04 2193#else 2194 LDF [AO + 0 * SIZE], a1 2195 LDF [AO + 1 * SIZE], a2 2196 LDF [AO + 2 * SIZE], a3 2197 LDF [AO + 3 * SIZE], a4 2198 2199 FSUB a1, c01, c01 2200 FSUB a2, c02, c02 2201 FSUB a3, c03, c03 2202 FSUB a4, c04, c04 2203#endif 2204 2205#ifdef LN 2206 LDF [AO + 6 * SIZE], a1 2207 LDF [AO + 7 * SIZE], a2 2208 LDF [AO + 4 * SIZE], a3 2209 LDF [AO + 5 * SIZE], a4 2210 LDF [AO + 0 * SIZE], b1 2211 LDF [AO + 1 * SIZE], b2 2212 2213 FMUL a1, c03, t1 2214 FMUL a2, c04, t2 2215 FMUL a1, c04, t3 2216 FMUL a2, c03, t4 2217 2218 FADD4 t1, t2, c03 2219 FADD2 t3, t4, c04 2220 2221 FMUL a3, c03, t1 2222 FMUL a3, c04, t2 2223 2224 FMUL a4, c04, t5 2225 FMUL a4, c03, t6 2226 2227 FSUB c01, t1, c01 2228 FSUB c02, t2, c02 2229 2230 FADD2 c01, t5, c01 2231 FADD4 c02, t6, c02 2232 2233 FMUL b1, c01, t1 2234 FMUL b2, c02, t2 2235 FMUL b1, c02, t3 2236 FMUL b2, c01, t4 2237 2238 FADD4 t1, t2, c01 2239 FADD2 t3, t4, c02 2240#endif 2241 2242#ifdef LT 2243 LDF [AO + 0 * SIZE], a1 2244 LDF [AO + 1 * SIZE], a2 2245 LDF [AO + 2 * SIZE], a3 2246 LDF [AO + 3 * SIZE], a4 2247 LDF [AO + 6 * SIZE], b1 2248 LDF [AO + 7 * SIZE], b2 2249 2250 FMUL a1, c01, t1 2251 FMUL a2, c02, t2 2252 FMUL a1, c02, t3 2253 FMUL a2, c01, t4 2254 2255 FADD4 t1, 
t2, c01 2256 FADD2 t3, t4, c02 2257 2258 FMUL a3, c01, t1 2259 FMUL a3, c02, t2 2260 FMUL a4, c02, t5 2261 FMUL a4, c01, t6 2262 2263 FSUB c03, t1, c03 2264 FSUB c04, t2, c04 2265 FADD2 c03, t5, c03 2266 FADD4 c04, t6, c04 2267 2268 FMUL b1, c03, t1 2269 FMUL b2, c04, t2 2270 FMUL b1, c04, t3 2271 FMUL b2, c03, t4 2272 2273 FADD4 t1, t2, c03 2274 FADD2 t3, t4, c04 2275#endif 2276 2277#ifdef RN 2278 LDF [BO + 0 * SIZE], a1 2279 LDF [BO + 1 * SIZE], a2 2280 2281 FMUL a1, c01, t1 2282 FMUL a2, c02, t2 2283 FMUL a1, c02, t3 2284 FMUL a2, c01, t4 2285 2286 FMUL a1, c03, t5 2287 FMUL a2, c04, t6 2288 FMUL a1, c04, t7 2289 FMUL a2, c03, t8 2290 2291 FADD4 t1, t2, c01 2292 FADD3 t3, t4, c02 2293 FADD4 t5, t6, c03 2294 FADD3 t7, t8, c04 2295#endif 2296 2297#ifdef RT 2298 LDF [BO + 0 * SIZE], a1 2299 LDF [BO + 1 * SIZE], a2 2300 2301 FMUL a1, c01, t1 2302 FMUL a2, c02, t2 2303 FMUL a1, c02, t3 2304 FMUL a2, c01, t4 2305 2306 FMUL a1, c03, t5 2307 FMUL a2, c04, t6 2308 FMUL a1, c04, t7 2309 FMUL a2, c03, t8 2310 2311 FADD4 t1, t2, c01 2312 FADD3 t3, t4, c02 2313 FADD4 t5, t6, c03 2314 FADD3 t7, t8, c04 2315#endif 2316 2317#ifdef LN 2318 add C1, -4 * SIZE, C1 2319#endif 2320 2321#if defined(LN) || defined(LT) 2322 STF c01, [BO + 0 * SIZE] 2323 STF c02, [BO + 1 * SIZE] 2324 STF c03, [BO + 2 * SIZE] 2325 STF c04, [BO + 3 * SIZE] 2326#else 2327 STF c01, [AO + 0 * SIZE] 2328 STF c02, [AO + 1 * SIZE] 2329 STF c03, [AO + 2 * SIZE] 2330 STF c04, [AO + 3 * SIZE] 2331#endif 2332 2333 STF c01, [C1 + 0 * SIZE] 2334 STF c02, [C1 + 1 * SIZE] 2335 STF c03, [C1 + 2 * SIZE] 2336 STF c04, [C1 + 3 * SIZE] 2337 2338 FMOV FZERO, t1 2339 FMOV FZERO, t2 2340 FMOV FZERO, t3 2341 FMOV FZERO, t4 2342 2343#ifndef LN 2344 add C1, 4 * SIZE, C1 2345#endif 2346 2347#ifdef RT 2348 sll K, 1 + ZBASE_SHIFT, TEMP1 2349 add AORIG, TEMP1, AORIG 2350#endif 2351 2352#if defined(LT) || defined(RN) 2353 sub K, KK, TEMP1 2354 sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 2355 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 2356 add AO, TEMP2, 
AO 2357 add BO, TEMP1, BO 2358#endif 2359 2360#ifdef LT 2361 add KK, 2, KK 2362#endif 2363 2364#ifdef LN 2365 sub KK, 2, KK 2366#endif 2367 2368 add I, -1, I 2369 cmp I, 0 2370 bg,pt %icc, .LL121 2371 FMOV FZERO, c03 2372 2373.LL199: 2374#ifdef LN 2375 sll K, 0 + ZBASE_SHIFT, TEMP1 2376 add B, TEMP1, B 2377#endif 2378 2379#if defined(LT) || defined(RN) 2380 mov BO, B 2381#endif 2382 2383#ifdef RN 2384 add KK, 1, KK 2385#endif 2386 2387#ifdef RT 2388 sub KK, 1, KK 2389#endif 2390 2391.LL999: 2392 return %i7 + 8 2393 clr %o0 2394 2395 EPILOGUE 2396