/*********************************************************************/
/* Copyright 2005-2010 The University of Texas at Austin.            */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/* 1. Redistributions of source code must retain the above           */
/* copyright notice, this list of conditions and the following       */
/* disclaimer.                                                       */
/*                                                                   */
/* 2. Redistributions in binary form must reproduce the above        */
/* copyright notice, this list of conditions and the following       */
/* disclaimer in the documentation and/or other materials            */
/* provided with the distribution.                                   */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/
/*********************************************************************/

/*
 * Register map, build-variant macros and entry sequence of a SPARC
 * kernel.  The file is run through the C preprocessor, so every
 * directive and every instruction has to sit on its own line.
 *
 * Build switches tested below:
 *   DOUBLE             selects the floating-point register layout
 *   CONJ, LN/LT/RN/RT  select which FMADDn is the negating form
 *   __64BIT__          selects how stack arguments are fetched
 *
 * PROLOGUE, SAVESP, STACK_START, FMADD and FNMSUB are not defined in
 * this file; presumably they come from common.h -- confirm there.
 */

/* NOTE(review): presumably tells common.h that it is being included
   from assembly, so it must stay ahead of the include -- confirm. */
#define ASSEMBLER
#include "common.h"

/* Prefetch tuning for the A stream.  Used further down as
   "prefetch [AO + (APREFETCHSIZE + n) * SIZE], APREFETCH_CATEGORY":
   look-ahead distance in elements and the prefetch function code. */
#define APREFETCHSIZE 24
#define APREFETCH_CATEGORY 0

/* Incoming arguments, held in the "in" registers.  B and C are always
   reloaded from the stack by the entry sequence below; A is reloaded
   as well in the 32-bit DOUBLE build and otherwise stays in %i5. */
#define M	%i0
#define N	%i1
#define K	%i2
#define A	%i5
#define B	%i3
#define C	%i4

/* LDC is loaded from the stack and then scaled by ZBASE_SHIFT.
   AO / BO are the working pointers used for loads from the A and B
   data; I, J and L are loop counters (set from M, N and the K/KK
   trip counts respectively). */
#define LDC	%o0
#define AO	%o1
#define BO	%o2
#define I	%o3
#define J	%o4
#define L	%o5

/* Pointers to the columns of C being updated; consecutive pointers
   are LDC apart. */
#define C1	%l0
#define C2	%l1
#define C3	%l2
#define C4	%l3

/* OFFSET is loaded from the stack; KK is the running offset derived
   from it.  TEMP1 / TEMP2 are integer scratch registers.  AORIG keeps
   the base of A in the LN and RT variants, where AO is recomputed
   from it. */
#define OFFSET	%l4
#define KK	%l5
#define TEMP1	%l6
#define TEMP2	%l7
#define AORIG	%o7

/*
 * Floating-point registers.
 *   c01..c16  accumulators (cleared with FCLR, updated by FMADDn)
 *   a1..a5    operands, mostly loaded through AO
 *   b1..b9    operands, mostly loaded through BO
 *
 * ccNN / aaN / bbN are plain numbers that name the same registers for
 * the FCLR / FMADD-style macros, which take numeric operands.  In the
 * DOUBLE layout the numbers for a1..a5 and b1..b9 are NOT the numbers
 * in the register names (%f32 -> 1, %f34 -> 3, ... %f58 -> 27); this
 * matches the SPARC V9 5-bit encoding of the upper double-precision
 * registers.  NOTE(review): presumably the macros emit the instruction
 * words directly -- confirm in common.h before renumbering anything.
 * Each name/number pair must be kept in sync by hand.
 */
#ifdef DOUBLE
#define c01	%f0
#define c02	%f2
#define c03	%f4
#define c04	%f6
#define c05	%f8
#define c06	%f10
#define c07	%f12
#define c08	%f14
#define c09	%f16
#define c10	%f18
#define c11	%f20
#define c12	%f22
#define c13	%f24
#define c14	%f26
#define c15	%f28
#define c16	%f30

#define a1	%f32
#define a2	%f34
#define a3	%f36
#define a4	%f38
#define a5	%f40

#define b1	%f42
#define b2	%f44
#define b3	%f46
#define b4	%f48
#define b5	%f50
#define b6	%f52
#define b7	%f54
#define b8	%f56
#define b9	%f58

#define cc01	0
#define cc02	2
#define cc03	4
#define cc04	6
#define cc05	8
#define cc06	10
#define cc07	12
#define cc08	14
#define cc09	16
#define cc10	18
#define cc11	20
#define cc12	22
#define cc13	24
#define cc14	26
#define cc15	28
#define cc16	30

#define aa1	1
#define aa2	3
#define aa3	5
#define aa4	7
#define aa5	9

#define bb1	11
#define bb2	13
#define bb3	15
#define bb4	17
#define bb5	19
#define bb6	21
#define bb7	23
#define bb8	25
#define bb9	27
#else
#define c01	%f0
#define c02	%f1
#define c03	%f2
#define c04	%f3
#define c05	%f4
#define c06	%f5
#define c07	%f6
#define c08	%f7
#define c09	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define a1	%f16
#define a2	%f17
#define a3	%f18
#define a4	%f19
#define a5	%f20

#define b1	%f21
#define b2	%f22
#define b3	%f23
#define b4	%f24
#define b5	%f25
#define b6	%f26
#define b7	%f27
#define b8	%f28
#define b9	%f29

#define cc01	0
#define cc02	1
#define cc03	2
#define cc04	3
#define cc05	4
#define cc06	5
#define cc07	6
#define cc08	7
#define cc09	8
#define cc10	9
#define cc11	10
#define cc12	11
#define cc13	12
#define cc14	13
#define cc15	14
#define cc16	15

#define aa1	16
#define aa2	17
#define aa3	18
#define aa4	19
#define aa5	20

#define bb1	21
#define bb2	22
#define bb3	23
#define bb4	24
#define bb5	25
#define bb6	26
#define bb7	27
#define bb8	28
#define bb9	29
#endif

/*
 * FMADD1..FMADD4 are the four multiply-accumulate flavours used by the
 * inner loops.  In every configuration exactly one of them is FNMSUB
 * and the other three are FMADD:
 *   no CONJ            -> FMADD4 negates
 *   CONJ with LN / LT  -> FMADD2 negates
 *   CONJ with RN / RT  -> FMADD3 negates
 * With CONJ defined and none of LN/LT/RN/RT defined, the four names
 * stay undefined.
 */
#ifndef CONJ
#define FMADD1	FMADD
#define FMADD2	FMADD
#define FMADD3	FMADD
#define FMADD4	FNMSUB
#else
#if defined(LN) || defined(LT)
#define FMADD1	FMADD
#define FMADD2	FNMSUB
#define FMADD3	FMADD
#define FMADD4	FMADD
#endif
#if defined(RN) || defined(RT)
#define FMADD1	FMADD
#define FMADD2	FMADD
#define FMADD3	FNMSUB
#define FMADD4	FMADD
#endif
#endif

	/* Declare %g2 and %g3 as scratch registers to the assembler. */
	.register	%g2, #scratch
	.register	%g3, #scratch

	/* Routine entry.  NOTE(review): PROLOGUE and SAVESP presumably
	   emit the entry symbol and set up the frame -- see common.h. */
	PROLOGUE
	SAVESP

	/*
	 * Fetch the arguments that are read from the stack.  Offsets are
	 * relative to %sp + STACK_START and depend on the build:
	 *   32-bit, DOUBLE : A, B, C, LDC, OFFSET at +32 .. +48 (ld)
	 *   32-bit, single : B, C, LDC, OFFSET    at +28 .. +40 (ld)
	 *   64-bit         : B, C, LDC, OFFSET    at +56 .. +80 (ldx)
	 */
#ifndef __64BIT__
#ifdef DOUBLE
	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], B
	ld	[%sp + STACK_START + 40], C
	ld	[%sp + STACK_START + 44], LDC
	ld	[%sp + STACK_START + 48], OFFSET
#else
	ld	[%sp + STACK_START + 28], B
	ld	[%sp + STACK_START + 32], C
	ld	[%sp + STACK_START + 36], LDC
	ld	[%sp + STACK_START + 40], OFFSET
#endif
#else
	ldx	[%sp + STACK_START + 56], B
	ldx	[%sp + STACK_START + 64], C
	ldx	[%sp + STACK_START + 72], LDC
	ldx	[%sp + STACK_START + 80], OFFSET
#endif

	/* M <= 0: branch to .LL999 (hinted not-taken); the nop fills the
	   branch delay slot. */
	cmp	M, 0
	ble,pn	%icc, .LL999
	nop

	/* LDC <<= ZBASE_SHIFT */
	sll	LDC,
ZBASE_SHIFT, LDC 255 256#ifdef LN 257 smul M, K, TEMP1 258 sll TEMP1, ZBASE_SHIFT, TEMP1 259 add A, TEMP1, A 260 261 sll M, ZBASE_SHIFT, TEMP1 262 add C, TEMP1, C 263#endif 264 265#ifdef RN 266 neg OFFSET, KK 267#endif 268 269#ifdef RT 270 smul N, K, TEMP1 271 sll TEMP1, ZBASE_SHIFT, TEMP1 272 add B, TEMP1, B 273 274 smul N, LDC, TEMP1 275 add C, TEMP1, C 276 277 sub N, OFFSET, KK 278#endif 279 280 sra N, 2, J 281 cmp J, 0 282 ble,pn %icc, .LL20 283 nop 284 .align 4 285 286.LL11: 287#ifdef RT 288 sll K, ZBASE_SHIFT + 2, TEMP1 289 sub B, TEMP1, B 290#endif 291 292#ifndef RT 293 mov C, C1 294 add C, LDC, C2 295 add C2, LDC, C3 296 add C3, LDC, C4 297 add C4, LDC, C 298#else 299 sub C, LDC, C4 300 sub C4, LDC, C3 301 sub C3, LDC, C2 302 sub C2, LDC, C1 303 sub C2, LDC, C 304#endif 305 306#ifdef LN 307 add M, OFFSET, KK 308#endif 309 310#ifdef LT 311 mov OFFSET, KK 312#endif 313 314#if defined(LN) || defined(RT) 315 mov A, AORIG 316#else 317 mov A, AO 318#endif 319 320 mov M, I 321 .align 4 322 323.LL12: 324#if defined(LT) || defined(RN) 325 mov B, BO 326#else 327#ifdef LN 328 sll K, ZBASE_SHIFT, TEMP1 329 sub AORIG, TEMP1, AORIG 330#endif 331 332 sll KK, ZBASE_SHIFT + 0, TEMP1 333 sll KK, ZBASE_SHIFT + 2, TEMP2 334 335 add AORIG, TEMP1, AO 336 add B, TEMP2, BO 337#endif 338 339 LDF [AO + 0 * SIZE], a1 340 FCLR (cc01) 341 LDF [AO + 1 * SIZE], a2 342 FCLR (cc05) 343 LDF [AO + 8 * SIZE], a5 344 FCLR (cc09) 345 LDF [BO + 0 * SIZE], b1 346 FCLR (cc13) 347 348 LDF [BO + 1 * SIZE], b2 349 FCLR (cc02) 350 LDF [BO + 2 * SIZE], b3 351 FCLR (cc06) 352 LDF [BO + 3 * SIZE], b4 353 FCLR (cc10) 354 LDF [BO + 4 * SIZE], b5 355 FCLR (cc14) 356 357 LDF [BO + 5 * SIZE], b6 358 FCLR (cc03) 359 LDF [BO + 6 * SIZE], b7 360 FCLR (cc07) 361 LDF [BO + 7 * SIZE], b8 362 FCLR (cc11) 363 LDF [BO + 8 * SIZE], b9 364 FCLR (cc15) 365 366 prefetch [C1 + 1 * SIZE], 3 367 FCLR (cc04) 368 prefetch [C2 + 2 * SIZE], 3 369 FCLR (cc08) 370 prefetch [C3 + 1 * SIZE], 3 371 FCLR (cc12) 372 prefetch [C4 + 2 * 
SIZE], 3 373 FCLR (cc16) 374 375#if defined(LT) || defined(RN) 376 sra KK, 3, L 377#else 378 sub K, KK, L 379 sra L, 3, L 380#endif 381 cmp L, 0 382 ble,pn %icc, .LL15 383 nop 384 .align 4 385 386.LL13: 387 FMADD1 (aa1, bb1, cc01, cc01) 388 FMADD2 (aa2, bb1, cc02, cc02) 389 FMADD3 (aa1, bb2, cc03, cc03) 390 FMADD4 (aa2, bb2, cc04, cc04) 391 392 FMADD1 (aa1, bb3, cc05, cc05) 393 LDF [BO + 16 * SIZE], b1 394 FMADD2 (aa2, bb3, cc06, cc06) 395 LDF [BO + 9 * SIZE], b2 396 397 FMADD3 (aa1, bb4, cc07, cc07) 398 LDF [BO + 10 * SIZE], b3 399 FMADD4 (aa2, bb4, cc08, cc08) 400 LDF [BO + 11 * SIZE], b4 401 402 FMADD1 (aa1, bb5, cc09, cc09) 403 LDF [AO + 2 * SIZE], a3 404 FMADD2 (aa2, bb5, cc10, cc10) 405 LDF [AO + 3 * SIZE], a4 406 407 FMADD3 (aa1, bb6, cc11, cc11) 408 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 409 FMADD4 (aa2, bb6, cc12, cc12) 410 nop 411 412 FMADD1 (aa1, bb7, cc13, cc13) 413 LDF [BO + 12 * SIZE], b5 414 FMADD2 (aa2, bb7, cc14, cc14) 415 LDF [BO + 13 * SIZE], b6 416 417 FMADD3 (aa1, bb8, cc15, cc15) 418 LDF [BO + 14 * SIZE], b7 419 FMADD4 (aa2, bb8, cc16, cc16) 420 LDF [BO + 15 * SIZE], b8 421 422 FMADD1 (aa3, bb9, cc01, cc01) 423 FMADD2 (aa4, bb9, cc02, cc02) 424 FMADD3 (aa3, bb2, cc03, cc03) 425 FMADD4 (aa4, bb2, cc04, cc04) 426 427 FMADD1 (aa3, bb3, cc05, cc05) 428 LDF [BO + 24 * SIZE], b9 429 FMADD2 (aa4, bb3, cc06, cc06) 430 LDF [BO + 17 * SIZE], b2 431 432 FMADD3 (aa3, bb4, cc07, cc07) 433 LDF [BO + 18 * SIZE], b3 434 FMADD4 (aa4, bb4, cc08, cc08) 435 LDF [BO + 19 * SIZE], b4 436 437 FMADD1 (aa3, bb5, cc09, cc09) 438 LDF [AO + 4 * SIZE], a1 439 FMADD2 (aa4, bb5, cc10, cc10) 440 LDF [AO + 5 * SIZE], a2 441 442 FMADD3 (aa3, bb6, cc11, cc11) 443 add L, -1, L 444 FMADD4 (aa4, bb6, cc12, cc12) 445 nop 446 447 FMADD1 (aa3, bb7, cc13, cc13) 448 LDF [BO + 20 * SIZE], b5 449 FMADD2 (aa4, bb7, cc14, cc14) 450 LDF [BO + 21 * SIZE], b6 451 452 FMADD3 (aa3, bb8, cc15, cc15) 453 LDF [BO + 22 * SIZE], b7 454 FMADD4 (aa4, bb8, cc16, cc16) 455 LDF 
[BO + 23 * SIZE], b8 456 457 FMADD1 (aa1, bb1, cc01, cc01) 458 FMADD2 (aa2, bb1, cc02, cc02) 459 FMADD3 (aa1, bb2, cc03, cc03) 460 FMADD4 (aa2, bb2, cc04, cc04) 461 462 FMADD1 (aa1, bb3, cc05, cc05) 463 LDF [BO + 32 * SIZE], b1 464 FMADD2 (aa2, bb3, cc06, cc06) 465 LDF [BO + 25 * SIZE], b2 466 467 FMADD3 (aa1, bb4, cc07, cc07) 468 LDF [BO + 26 * SIZE], b3 469 FMADD4 (aa2, bb4, cc08, cc08) 470 LDF [BO + 27 * SIZE], b4 471 472 FMADD1 (aa1, bb5, cc09, cc09) 473 LDF [AO + 6 * SIZE], a3 474 FMADD2 (aa2, bb5, cc10, cc10) 475 LDF [AO + 7 * SIZE], a4 476 477 FMADD3 (aa1, bb6, cc11, cc11) 478 nop 479 FMADD4 (aa2, bb6, cc12, cc12) 480 nop 481 482 FMADD1 (aa1, bb7, cc13, cc13) 483 LDF [BO + 28 * SIZE], b5 484 FMADD2 (aa2, bb7, cc14, cc14) 485 LDF [BO + 29 * SIZE], b6 486 487 FMADD3 (aa1, bb8, cc15, cc15) 488 LDF [BO + 30 * SIZE], b7 489 FMADD4 (aa2, bb8, cc16, cc16) 490 LDF [BO + 31 * SIZE], b8 491 492 FMADD1 (aa3, bb9, cc01, cc01) 493 FMADD2 (aa4, bb9, cc02, cc02) 494 FMADD3 (aa3, bb2, cc03, cc03) 495 FMADD4 (aa4, bb2, cc04, cc04) 496 497 FMADD1 (aa3, bb3, cc05, cc05) 498 LDF [BO + 40 * SIZE], b9 499 FMADD2 (aa4, bb3, cc06, cc06) 500 LDF [BO + 33 * SIZE], b2 501 502 FMADD3 (aa3, bb4, cc07, cc07) 503 LDF [BO + 34 * SIZE], b3 504 FMADD4 (aa4, bb4, cc08, cc08) 505 LDF [BO + 35 * SIZE], b4 506 507 FMADD1 (aa3, bb5, cc09, cc09) 508 LDF [AO + 16 * SIZE], a1 /****/ 509 FMADD2 (aa4, bb5, cc10, cc10) 510 LDF [AO + 9 * SIZE], a2 511 512 FMADD3 (aa3, bb6, cc11, cc11) 513 nop 514 FMADD4 (aa4, bb6, cc12, cc12) 515 nop 516 517 FMADD1 (aa3, bb7, cc13, cc13) 518 LDF [BO + 36 * SIZE], b5 519 FMADD2 (aa4, bb7, cc14, cc14) 520 LDF [BO + 37 * SIZE], b6 521 522 FMADD3 (aa3, bb8, cc15, cc15) 523 LDF [BO + 38 * SIZE], b7 524 FMADD4 (aa4, bb8, cc16, cc16) 525 LDF [BO + 39 * SIZE], b8 526 527 FMADD1 (aa5, bb1, cc01, cc01) 528 FMADD2 (aa2, bb1, cc02, cc02) 529 FMADD3 (aa5, bb2, cc03, cc03) 530 FMADD4 (aa2, bb2, cc04, cc04) 531 532 FMADD1 (aa5, bb3, cc05, cc05) 533 LDF [BO + 48 * SIZE], b1 534 FMADD2 
(aa2, bb3, cc06, cc06) 535 LDF [BO + 41 * SIZE], b2 536 537 FMADD3 (aa5, bb4, cc07, cc07) 538 LDF [BO + 42 * SIZE], b3 539 FMADD4 (aa2, bb4, cc08, cc08) 540 LDF [BO + 43 * SIZE], b4 541 542 FMADD1 (aa5, bb5, cc09, cc09) 543 LDF [AO + 10 * SIZE], a3 544 FMADD2 (aa2, bb5, cc10, cc10) 545 LDF [AO + 11 * SIZE], a4 546 547 FMADD3 (aa5, bb6, cc11, cc11) 548 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 549 FMADD4 (aa2, bb6, cc12, cc12) 550 nop 551 552 FMADD1 (aa5, bb7, cc13, cc13) 553 LDF [BO + 44 * SIZE], b5 554 FMADD2 (aa2, bb7, cc14, cc14) 555 LDF [BO + 45 * SIZE], b6 556 557 FMADD3 (aa5, bb8, cc15, cc15) 558 LDF [BO + 46 * SIZE], b7 559 FMADD4 (aa2, bb8, cc16, cc16) 560 LDF [BO + 47 * SIZE], b8 561 562 FMADD1 (aa3, bb9, cc01, cc01) 563 FMADD2 (aa4, bb9, cc02, cc02) 564 FMADD3 (aa3, bb2, cc03, cc03) 565 FMADD4 (aa4, bb2, cc04, cc04) 566 567 FMADD1 (aa3, bb3, cc05, cc05) 568 LDF [BO + 56 * SIZE], b9 569 FMADD2 (aa4, bb3, cc06, cc06) 570 LDF [BO + 49 * SIZE], b2 571 572 FMADD3 (aa3, bb4, cc07, cc07) 573 LDF [BO + 50 * SIZE], b3 574 FMADD4 (aa4, bb4, cc08, cc08) 575 LDF [BO + 51 * SIZE], b4 576 577 FMADD1 (aa3, bb5, cc09, cc09) 578 LDF [AO + 12 * SIZE], a5 579 FMADD2 (aa4, bb5, cc10, cc10) 580 LDF [AO + 13 * SIZE], a2 581 582 FMADD3 (aa3, bb6, cc11, cc11) 583 cmp L, 0 584 FMADD4 (aa4, bb6, cc12, cc12) 585 nop 586 587 FMADD1 (aa3, bb7, cc13, cc13) 588 LDF [BO + 52 * SIZE], b5 589 FMADD2 (aa4, bb7, cc14, cc14) 590 LDF [BO + 53 * SIZE], b6 591 592 FMADD3 (aa3, bb8, cc15, cc15) 593 LDF [BO + 54 * SIZE], b7 594 FMADD4 (aa4, bb8, cc16, cc16) 595 LDF [BO + 55 * SIZE], b8 596 597 FMADD1 (aa5, bb1, cc01, cc01) 598 FMADD2 (aa2, bb1, cc02, cc02) 599 FMADD3 (aa5, bb2, cc03, cc03) 600 FMADD4 (aa2, bb2, cc04, cc04) 601 602 FMADD1 (aa5, bb3, cc05, cc05) 603 LDF [BO + 64 * SIZE], b1 604 FMADD2 (aa2, bb3, cc06, cc06) 605 LDF [BO + 57 * SIZE], b2 606 607 FMADD3 (aa5, bb4, cc07, cc07) 608 LDF [BO + 58 * SIZE], b3 609 FMADD4 (aa2, bb4, cc08, cc08) 610 LDF [BO + 59 * SIZE], 
b4 611 612 FMADD1 (aa5, bb5, cc09, cc09) 613 LDF [AO + 14 * SIZE], a3 614 FMADD2 (aa2, bb5, cc10, cc10) 615 LDF [AO + 15 * SIZE], a4 616 617 FMADD3 (aa5, bb6, cc11, cc11) 618 add BO, 64 * SIZE, BO 619 FMADD4 (aa2, bb6, cc12, cc12) 620 add AO, 16 * SIZE, AO 621 622 FMADD1 (aa5, bb7, cc13, cc13) 623 LDF [BO - 4 * SIZE], b5 624 FMADD2 (aa2, bb7, cc14, cc14) 625 LDF [BO - 3 * SIZE], b6 626 627 FMADD3 (aa5, bb8, cc15, cc15) 628 LDF [BO - 2 * SIZE], b7 629 FMADD4 (aa2, bb8, cc16, cc16) 630 LDF [BO - 1 * SIZE], b8 631 632 FMADD1 (aa3, bb9, cc01, cc01) 633 FMADD2 (aa4, bb9, cc02, cc02) 634 FMADD3 (aa3, bb2, cc03, cc03) 635 FMADD4 (aa4, bb2, cc04, cc04) 636 637 FMADD1 (aa3, bb3, cc05, cc05) 638 LDF [BO + 8 * SIZE], b9 639 FMADD2 (aa4, bb3, cc06, cc06) 640 LDF [BO + 1 * SIZE], b2 641 642 FMADD3 (aa3, bb4, cc07, cc07) 643 LDF [BO + 2 * SIZE], b3 644 FMADD4 (aa4, bb4, cc08, cc08) 645 LDF [BO + 3 * SIZE], b4 646 647 FMADD1 (aa3, bb5, cc09, cc09) 648 LDF [AO + 8 * SIZE], a5 /****/ 649 FMADD2 (aa4, bb5, cc10, cc10) 650 LDF [AO + 1 * SIZE], a2 651 652 FMADD3 (aa3, bb6, cc11, cc11) 653 FMADD4 (aa4, bb6, cc12, cc12) 654 655 FMADD1 (aa3, bb7, cc13, cc13) 656 LDF [BO + 4 * SIZE], b5 657 FMADD2 (aa4, bb7, cc14, cc14) 658 LDF [BO + 5 * SIZE], b6 659 660 FMADD3 (aa3, bb8, cc15, cc15) 661 LDF [BO + 6 * SIZE], b7 662 FMADD4 (aa4, bb8, cc16, cc16) 663 ble,pn %icc, .LL15 664 LDF [BO + 7 * SIZE], b8 665 666 FMADD1 (aa1, bb1, cc01, cc01) 667 FMADD2 (aa2, bb1, cc02, cc02) 668 FMADD3 (aa1, bb2, cc03, cc03) 669 FMADD4 (aa2, bb2, cc04, cc04) 670 671 FMADD1 (aa1, bb3, cc05, cc05) 672 LDF [BO + 16 * SIZE], b1 673 FMADD2 (aa2, bb3, cc06, cc06) 674 LDF [BO + 9 * SIZE], b2 675 676 FMADD3 (aa1, bb4, cc07, cc07) 677 LDF [BO + 10 * SIZE], b3 678 FMADD4 (aa2, bb4, cc08, cc08) 679 LDF [BO + 11 * SIZE], b4 680 681 FMADD1 (aa1, bb5, cc09, cc09) 682 LDF [AO + 2 * SIZE], a3 683 FMADD2 (aa2, bb5, cc10, cc10) 684 LDF [AO + 3 * SIZE], a4 685 686 FMADD3 (aa1, bb6, cc11, cc11) 687 prefetch [AO + (APREFETCHSIZE + 0) 
* SIZE], APREFETCH_CATEGORY 688 FMADD4 (aa2, bb6, cc12, cc12) 689 nop 690 691 FMADD1 (aa1, bb7, cc13, cc13) 692 LDF [BO + 12 * SIZE], b5 693 FMADD2 (aa2, bb7, cc14, cc14) 694 LDF [BO + 13 * SIZE], b6 695 696 FMADD3 (aa1, bb8, cc15, cc15) 697 LDF [BO + 14 * SIZE], b7 698 FMADD4 (aa2, bb8, cc16, cc16) 699 LDF [BO + 15 * SIZE], b8 700 701 FMADD1 (aa3, bb9, cc01, cc01) 702 FMADD2 (aa4, bb9, cc02, cc02) 703 FMADD3 (aa3, bb2, cc03, cc03) 704 FMADD4 (aa4, bb2, cc04, cc04) 705 706 FMADD1 (aa3, bb3, cc05, cc05) 707 LDF [BO + 24 * SIZE], b9 708 FMADD2 (aa4, bb3, cc06, cc06) 709 LDF [BO + 17 * SIZE], b2 710 711 FMADD3 (aa3, bb4, cc07, cc07) 712 LDF [BO + 18 * SIZE], b3 713 FMADD4 (aa4, bb4, cc08, cc08) 714 LDF [BO + 19 * SIZE], b4 715 716 FMADD1 (aa3, bb5, cc09, cc09) 717 LDF [AO + 4 * SIZE], a1 718 FMADD2 (aa4, bb5, cc10, cc10) 719 LDF [AO + 5 * SIZE], a2 720 721 FMADD3 (aa3, bb6, cc11, cc11) 722 add L, -1, L 723 FMADD4 (aa4, bb6, cc12, cc12) 724 nop 725 726 FMADD1 (aa3, bb7, cc13, cc13) 727 LDF [BO + 20 * SIZE], b5 728 FMADD2 (aa4, bb7, cc14, cc14) 729 LDF [BO + 21 * SIZE], b6 730 731 FMADD3 (aa3, bb8, cc15, cc15) 732 LDF [BO + 22 * SIZE], b7 733 FMADD4 (aa4, bb8, cc16, cc16) 734 LDF [BO + 23 * SIZE], b8 735 736 FMADD1 (aa1, bb1, cc01, cc01) 737 FMADD2 (aa2, bb1, cc02, cc02) 738 FMADD3 (aa1, bb2, cc03, cc03) 739 FMADD4 (aa2, bb2, cc04, cc04) 740 741 FMADD1 (aa1, bb3, cc05, cc05) 742 LDF [BO + 32 * SIZE], b1 743 FMADD2 (aa2, bb3, cc06, cc06) 744 LDF [BO + 25 * SIZE], b2 745 746 FMADD3 (aa1, bb4, cc07, cc07) 747 LDF [BO + 26 * SIZE], b3 748 FMADD4 (aa2, bb4, cc08, cc08) 749 LDF [BO + 27 * SIZE], b4 750 751 FMADD1 (aa1, bb5, cc09, cc09) 752 LDF [AO + 6 * SIZE], a3 753 FMADD2 (aa2, bb5, cc10, cc10) 754 LDF [AO + 7 * SIZE], a4 755 756 FMADD3 (aa1, bb6, cc11, cc11) 757 nop 758 FMADD4 (aa2, bb6, cc12, cc12) 759 nop 760 761 FMADD1 (aa1, bb7, cc13, cc13) 762 LDF [BO + 28 * SIZE], b5 763 FMADD2 (aa2, bb7, cc14, cc14) 764 LDF [BO + 29 * SIZE], b6 765 766 FMADD3 (aa1, bb8, cc15, cc15) 
767 LDF [BO + 30 * SIZE], b7 768 FMADD4 (aa2, bb8, cc16, cc16) 769 LDF [BO + 31 * SIZE], b8 770 771 FMADD1 (aa3, bb9, cc01, cc01) 772 FMADD2 (aa4, bb9, cc02, cc02) 773 FMADD3 (aa3, bb2, cc03, cc03) 774 FMADD4 (aa4, bb2, cc04, cc04) 775 776 FMADD1 (aa3, bb3, cc05, cc05) 777 LDF [BO + 40 * SIZE], b9 778 FMADD2 (aa4, bb3, cc06, cc06) 779 LDF [BO + 33 * SIZE], b2 780 781 FMADD3 (aa3, bb4, cc07, cc07) 782 LDF [BO + 34 * SIZE], b3 783 FMADD4 (aa4, bb4, cc08, cc08) 784 LDF [BO + 35 * SIZE], b4 785 786 FMADD1 (aa3, bb5, cc09, cc09) 787 LDF [AO + 16 * SIZE], a1 /****/ 788 FMADD2 (aa4, bb5, cc10, cc10) 789 LDF [AO + 9 * SIZE], a2 790 791 FMADD3 (aa3, bb6, cc11, cc11) 792 nop 793 FMADD4 (aa4, bb6, cc12, cc12) 794 nop 795 796 FMADD1 (aa3, bb7, cc13, cc13) 797 LDF [BO + 36 * SIZE], b5 798 FMADD2 (aa4, bb7, cc14, cc14) 799 LDF [BO + 37 * SIZE], b6 800 801 FMADD3 (aa3, bb8, cc15, cc15) 802 LDF [BO + 38 * SIZE], b7 803 FMADD4 (aa4, bb8, cc16, cc16) 804 LDF [BO + 39 * SIZE], b8 805 806 FMADD1 (aa5, bb1, cc01, cc01) 807 FMADD2 (aa2, bb1, cc02, cc02) 808 FMADD3 (aa5, bb2, cc03, cc03) 809 FMADD4 (aa2, bb2, cc04, cc04) 810 811 FMADD1 (aa5, bb3, cc05, cc05) 812 LDF [BO + 48 * SIZE], b1 813 FMADD2 (aa2, bb3, cc06, cc06) 814 LDF [BO + 41 * SIZE], b2 815 816 FMADD3 (aa5, bb4, cc07, cc07) 817 LDF [BO + 42 * SIZE], b3 818 FMADD4 (aa2, bb4, cc08, cc08) 819 LDF [BO + 43 * SIZE], b4 820 821 FMADD1 (aa5, bb5, cc09, cc09) 822 LDF [AO + 10 * SIZE], a3 823 FMADD2 (aa2, bb5, cc10, cc10) 824 LDF [AO + 11 * SIZE], a4 825 826 FMADD3 (aa5, bb6, cc11, cc11) 827 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 828 FMADD4 (aa2, bb6, cc12, cc12) 829 nop 830 831 FMADD1 (aa5, bb7, cc13, cc13) 832 LDF [BO + 44 * SIZE], b5 833 FMADD2 (aa2, bb7, cc14, cc14) 834 LDF [BO + 45 * SIZE], b6 835 836 FMADD3 (aa5, bb8, cc15, cc15) 837 LDF [BO + 46 * SIZE], b7 838 FMADD4 (aa2, bb8, cc16, cc16) 839 LDF [BO + 47 * SIZE], b8 840 841 FMADD1 (aa3, bb9, cc01, cc01) 842 FMADD2 (aa4, bb9, cc02, cc02) 843 FMADD3 
(aa3, bb2, cc03, cc03) 844 FMADD4 (aa4, bb2, cc04, cc04) 845 846 FMADD1 (aa3, bb3, cc05, cc05) 847 LDF [BO + 56 * SIZE], b9 848 FMADD2 (aa4, bb3, cc06, cc06) 849 LDF [BO + 49 * SIZE], b2 850 851 FMADD3 (aa3, bb4, cc07, cc07) 852 LDF [BO + 50 * SIZE], b3 853 FMADD4 (aa4, bb4, cc08, cc08) 854 LDF [BO + 51 * SIZE], b4 855 856 FMADD1 (aa3, bb5, cc09, cc09) 857 LDF [AO + 12 * SIZE], a5 858 FMADD2 (aa4, bb5, cc10, cc10) 859 LDF [AO + 13 * SIZE], a2 860 861 FMADD3 (aa3, bb6, cc11, cc11) 862 cmp L, 0 863 FMADD4 (aa4, bb6, cc12, cc12) 864 nop 865 866 FMADD1 (aa3, bb7, cc13, cc13) 867 LDF [BO + 52 * SIZE], b5 868 FMADD2 (aa4, bb7, cc14, cc14) 869 LDF [BO + 53 * SIZE], b6 870 871 FMADD3 (aa3, bb8, cc15, cc15) 872 LDF [BO + 54 * SIZE], b7 873 FMADD4 (aa4, bb8, cc16, cc16) 874 LDF [BO + 55 * SIZE], b8 875 876 FMADD1 (aa5, bb1, cc01, cc01) 877 FMADD2 (aa2, bb1, cc02, cc02) 878 FMADD3 (aa5, bb2, cc03, cc03) 879 FMADD4 (aa2, bb2, cc04, cc04) 880 881 FMADD1 (aa5, bb3, cc05, cc05) 882 LDF [BO + 64 * SIZE], b1 883 FMADD2 (aa2, bb3, cc06, cc06) 884 LDF [BO + 57 * SIZE], b2 885 886 FMADD3 (aa5, bb4, cc07, cc07) 887 LDF [BO + 58 * SIZE], b3 888 FMADD4 (aa2, bb4, cc08, cc08) 889 LDF [BO + 59 * SIZE], b4 890 891 FMADD1 (aa5, bb5, cc09, cc09) 892 LDF [AO + 14 * SIZE], a3 893 FMADD2 (aa2, bb5, cc10, cc10) 894 LDF [AO + 15 * SIZE], a4 895 896 FMADD3 (aa5, bb6, cc11, cc11) 897 add BO, 64 * SIZE, BO 898 FMADD4 (aa2, bb6, cc12, cc12) 899 add AO, 16 * SIZE, AO 900 901 FMADD1 (aa5, bb7, cc13, cc13) 902 LDF [BO - 4 * SIZE], b5 903 FMADD2 (aa2, bb7, cc14, cc14) 904 LDF [BO - 3 * SIZE], b6 905 906 FMADD3 (aa5, bb8, cc15, cc15) 907 LDF [BO - 2 * SIZE], b7 908 FMADD4 (aa2, bb8, cc16, cc16) 909 LDF [BO - 1 * SIZE], b8 910 911 FMADD1 (aa3, bb9, cc01, cc01) 912 FMADD2 (aa4, bb9, cc02, cc02) 913 FMADD3 (aa3, bb2, cc03, cc03) 914 FMADD4 (aa4, bb2, cc04, cc04) 915 916 FMADD1 (aa3, bb3, cc05, cc05) 917 LDF [BO + 8 * SIZE], b9 918 FMADD2 (aa4, bb3, cc06, cc06) 919 LDF [BO + 1 * SIZE], b2 920 921 FMADD3 (aa3, 
bb4, cc07, cc07) 922 LDF [BO + 2 * SIZE], b3 923 FMADD4 (aa4, bb4, cc08, cc08) 924 LDF [BO + 3 * SIZE], b4 925 926 FMADD1 (aa3, bb5, cc09, cc09) 927 LDF [AO + 8 * SIZE], a5 /****/ 928 FMADD2 (aa4, bb5, cc10, cc10) 929 LDF [AO + 1 * SIZE], a2 930 931 FMADD3 (aa3, bb6, cc11, cc11) 932 FMADD4 (aa4, bb6, cc12, cc12) 933 934 FMADD1 (aa3, bb7, cc13, cc13) 935 LDF [BO + 4 * SIZE], b5 936 FMADD2 (aa4, bb7, cc14, cc14) 937 LDF [BO + 5 * SIZE], b6 938 939 FMADD3 (aa3, bb8, cc15, cc15) 940 LDF [BO + 6 * SIZE], b7 941 FMADD4 (aa4, bb8, cc16, cc16) 942 bg,pt %icc, .LL13 943 LDF [BO + 7 * SIZE], b8 944 .align 4 945 946.LL15: 947#if defined(LT) || defined(RN) 948 and KK, 7, L 949#else 950 sub K, KK, L 951 and L, 7, L 952#endif 953 cmp L, 0 954 ble,a,pn %icc, .LL18 955 nop 956 .align 4 957 958.LL17: 959 FMADD1 (aa1, bb1, cc01, cc01) 960 add L, -1, L 961 FMADD2 (aa2, bb1, cc02, cc02) 962 nop 963 964 FMADD3 (aa1, bb2, cc03, cc03) 965 LDF [BO + 8 * SIZE], b1 966 FMADD4 (aa2, bb2, cc04, cc04) 967 LDF [BO + 9 * SIZE], b2 968 969 FMADD1 (aa1, bb3, cc05, cc05) 970 cmp L, 0 971 FMADD2 (aa2, bb3, cc06, cc06) 972 nop 973 974 FMADD3 (aa1, bb4, cc07, cc07) 975 LDF [BO + 10 * SIZE], b3 976 FMADD4 (aa2, bb4, cc08, cc08) 977 LDF [BO + 11 * SIZE], b4 978 979 FMADD1 (aa1, bb5, cc09, cc09) 980 nop 981 FMADD2 (aa2, bb5, cc10, cc10) 982 nop 983 984 FMADD3 (aa1, bb6, cc11, cc11) 985 LDF [BO + 12 * SIZE], b5 986 FMADD4 (aa2, bb6, cc12, cc12) 987 LDF [BO + 13 * SIZE], b6 988 989 FMADD1 (aa1, bb7, cc13, cc13) 990 add AO, 2 * SIZE, AO 991 FMADD2 (aa2, bb7, cc14, cc14) 992 add BO, 8 * SIZE, BO 993 994 FMADD3 (aa1, bb8, cc15, cc15) 995 LDF [AO + 0 * SIZE], a1 996 FMADD4 (aa2, bb8, cc16, cc16) 997 LDF [AO + 1 * SIZE], a2 998 999 LDF [BO + 6 * SIZE], b7 1000 bg,pt %icc, .LL17 1001 LDF [BO + 7 * SIZE], b8 1002 nop 1003 .align 4 1004 1005.LL18: 1006 FADD c01, c04, c01 1007 FADD c02, c03, c02 1008 FADD c05, c08, c05 1009 FADD c06, c07, c06 1010 1011 FADD c09, c12, c09 1012 FADD c10, c11, c10 1013 FADD c13, c16, 
c13 1014 FADD c14, c15, c14 1015 1016#if defined(LN) || defined(RT) 1017#ifdef LN 1018 sub KK, 1, TEMP1 1019#else 1020 sub KK, 4, TEMP1 1021#endif 1022 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1023 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 1024 1025 add AORIG, TEMP2, AO 1026 add B, TEMP1, BO 1027#endif 1028 1029#if defined(LN) || defined(LT) 1030 LDF [BO + 0 * SIZE], a1 1031 LDF [BO + 1 * SIZE], a2 1032 LDF [BO + 2 * SIZE], a3 1033 LDF [BO + 3 * SIZE], a4 1034 1035 LDF [BO + 4 * SIZE], b1 1036 LDF [BO + 5 * SIZE], b2 1037 LDF [BO + 6 * SIZE], b3 1038 LDF [BO + 7 * SIZE], b4 1039#else 1040 LDF [AO + 0 * SIZE], a1 1041 LDF [AO + 1 * SIZE], a2 1042 LDF [AO + 2 * SIZE], a3 1043 LDF [AO + 3 * SIZE], a4 1044 1045 LDF [AO + 4 * SIZE], b1 1046 LDF [AO + 5 * SIZE], b2 1047 LDF [AO + 6 * SIZE], b3 1048 LDF [AO + 7 * SIZE], b4 1049#endif 1050 1051 FSUB a1, c01, c01 1052 FSUB a2, c02, c02 1053 FSUB a3, c05, c05 1054 FSUB a4, c06, c06 1055 1056 FSUB b1, c09, c09 1057 FSUB b2, c10, c10 1058 FSUB b3, c13, c13 1059 FSUB b4, c14, c14 1060 1061#if defined(LN) || defined(LT) 1062 LDF [AO + 0 * SIZE], a1 1063 LDF [AO + 1 * SIZE], a2 1064 1065 FMUL a1, c01, b1 1066 FMUL a2, c01, b2 1067 FMUL a1, c05, b3 1068 FMUL a2, c05, b4 1069 FMUL a1, c09, b5 1070 FMUL a2, c09, b6 1071 FMUL a1, c13, b7 1072 FMUL a2, c13, b8 1073 1074#ifndef CONJ 1075 FNMSUB (aa2, cc02, bb1, cc01) 1076 FMADD (aa1, cc02, bb2, cc02) 1077 FNMSUB (aa2, cc06, bb3, cc05) 1078 FMADD (aa1, cc06, bb4, cc06) 1079 FNMSUB (aa2, cc10, bb5, cc09) 1080 FMADD (aa1, cc10, bb6, cc10) 1081 FNMSUB (aa2, cc14, bb7, cc13) 1082 FMADD (aa1, cc14, bb8, cc14) 1083#else 1084 FMADD (aa2, cc02, bb1, cc01) 1085 FMSUB (aa1, cc02, bb2, cc02) 1086 FMADD (aa2, cc06, bb3, cc05) 1087 FMSUB (aa1, cc06, bb4, cc06) 1088 FMADD (aa2, cc10, bb5, cc09) 1089 FMSUB (aa1, cc10, bb6, cc10) 1090 FMADD (aa2, cc14, bb7, cc13) 1091 FMSUB (aa1, cc14, bb8, cc14) 1092#endif 1093#endif 1094 1095#ifdef RN 1096 LDF [BO + 0 * SIZE], b1 1097 LDF [BO + 1 * SIZE], b2 1098 LDF [BO + 2 * 
SIZE], b3 1099 LDF [BO + 3 * SIZE], b4 1100 LDF [BO + 4 * SIZE], b5 1101 LDF [BO + 5 * SIZE], b6 1102 LDF [BO + 6 * SIZE], b7 1103 LDF [BO + 7 * SIZE], b8 1104 1105 FMUL b1, c01, a1 1106 FMUL b2, c01, a2 1107 1108#ifndef CONJ 1109 FNMSUB (bb2, cc02, aa1, cc01) 1110 FMADD (bb1, cc02, aa2, cc02) 1111#else 1112 FMADD (bb2, cc02, aa1, cc01) 1113 FMSUB (bb1, cc02, aa2, cc02) 1114#endif 1115 1116 FNMSUB (bb3, cc01, cc05, cc05) 1117 FNMSUB (bb3, cc02, cc06, cc06) 1118 FNMSUB (bb5, cc01, cc09, cc09) 1119 FNMSUB (bb5, cc02, cc10, cc10) 1120 FNMSUB (bb7, cc01, cc13, cc13) 1121 FNMSUB (bb7, cc02, cc14, cc14) 1122 1123#ifndef CONJ 1124 FMADD (bb4, cc02, cc05, cc05) 1125 FNMSUB (bb4, cc01, cc06, cc06) 1126 FMADD (bb6, cc02, cc09, cc09) 1127 FNMSUB (bb6, cc01, cc10, cc10) 1128 FMADD (bb8, cc02, cc13, cc13) 1129 FNMSUB (bb8, cc01, cc14, cc14) 1130#else 1131 FNMSUB (bb4, cc02, cc05, cc05) 1132 FMADD (bb4, cc01, cc06, cc06) 1133 FNMSUB (bb6, cc02, cc09, cc09) 1134 FMADD (bb6, cc01, cc10, cc10) 1135 FNMSUB (bb8, cc02, cc13, cc13) 1136 FMADD (bb8, cc01, cc14, cc14) 1137#endif 1138 1139 LDF [BO + 10 * SIZE], b1 1140 LDF [BO + 11 * SIZE], b2 1141 LDF [BO + 12 * SIZE], b3 1142 LDF [BO + 13 * SIZE], b4 1143 LDF [BO + 14 * SIZE], b5 1144 LDF [BO + 15 * SIZE], b6 1145 1146 FMUL b1, c05, a1 1147 FMUL b2, c05, a2 1148 1149#ifndef CONJ 1150 FNMSUB (bb2, cc06, aa1, cc05) 1151 FMADD (bb1, cc06, aa2, cc06) 1152#else 1153 FMADD (bb2, cc06, aa1, cc05) 1154 FMSUB (bb1, cc06, aa2, cc06) 1155#endif 1156 1157 FNMSUB (bb3, cc05, cc09, cc09) 1158 FNMSUB (bb3, cc06, cc10, cc10) 1159 FNMSUB (bb5, cc05, cc13, cc13) 1160 FNMSUB (bb5, cc06, cc14, cc14) 1161 1162#ifndef CONJ 1163 FMADD (bb4, cc06, cc09, cc09) 1164 FNMSUB (bb4, cc05, cc10, cc10) 1165 FMADD (bb6, cc06, cc13, cc13) 1166 FNMSUB (bb6, cc05, cc14, cc14) 1167#else 1168 FNMSUB (bb4, cc06, cc09, cc09) 1169 FMADD (bb4, cc05, cc10, cc10) 1170 FNMSUB (bb6, cc06, cc13, cc13) 1171 FMADD (bb6, cc05, cc14, cc14) 1172#endif 1173 1174 LDF [BO + 20 * SIZE], b1 
1175 LDF [BO + 21 * SIZE], b2 1176 LDF [BO + 22 * SIZE], b3 1177 LDF [BO + 23 * SIZE], b4 1178 1179 FMUL b1, c09, a1 1180 FMUL b2, c09, a2 1181 1182#ifndef CONJ 1183 FNMSUB (bb2, cc10, aa1, cc09) 1184 FMADD (bb1, cc10, aa2, cc10) 1185#else 1186 FMADD (bb2, cc10, aa1, cc09) 1187 FMSUB (bb1, cc10, aa2, cc10) 1188#endif 1189 1190 FNMSUB (bb3, cc09, cc13, cc13) 1191 FNMSUB (bb3, cc10, cc14, cc14) 1192 1193#ifndef CONJ 1194 FMADD (bb4, cc10, cc13, cc13) 1195 FNMSUB (bb4, cc09, cc14, cc14) 1196#else 1197 FNMSUB (bb4, cc10, cc13, cc13) 1198 FMADD (bb4, cc09, cc14, cc14) 1199#endif 1200 1201 LDF [BO + 30 * SIZE], b1 1202 LDF [BO + 31 * SIZE], b2 1203 1204 FMUL b1, c13, a1 1205 FMUL b2, c13, a2 1206 1207#ifndef CONJ 1208 FNMSUB (bb2, cc14, aa1, cc13) 1209 FMADD (bb1, cc14, aa2, cc14) 1210#else 1211 FMADD (bb2, cc14, aa1, cc13) 1212 FMSUB (bb1, cc14, aa2, cc14) 1213#endif 1214#endif 1215 1216#ifdef RT 1217 LDF [BO + 30 * SIZE], b1 1218 LDF [BO + 31 * SIZE], b2 1219 LDF [BO + 28 * SIZE], b3 1220 LDF [BO + 29 * SIZE], b4 1221 LDF [BO + 26 * SIZE], b5 1222 LDF [BO + 27 * SIZE], b6 1223 LDF [BO + 24 * SIZE], b7 1224 LDF [BO + 25 * SIZE], b8 1225 1226 FMUL b1, c13, a1 1227 FMUL b2, c13, a2 1228 1229#ifndef CONJ 1230 FNMSUB (bb2, cc14, aa1, cc13) 1231 FMADD (bb1, cc14, aa2, cc14) 1232#else 1233 FMADD (bb2, cc14, aa1, cc13) 1234 FMSUB (bb1, cc14, aa2, cc14) 1235#endif 1236 1237 FNMSUB (bb3, cc13, cc09, cc09) 1238 FNMSUB (bb3, cc14, cc10, cc10) 1239 FNMSUB (bb5, cc13, cc05, cc05) 1240 FNMSUB (bb5, cc14, cc06, cc06) 1241 FNMSUB (bb7, cc13, cc01, cc01) 1242 FNMSUB (bb7, cc14, cc02, cc02) 1243 1244#ifndef CONJ 1245 FMADD (bb4, cc14, cc09, cc09) 1246 FNMSUB (bb4, cc13, cc10, cc10) 1247 FMADD (bb6, cc14, cc05, cc05) 1248 FNMSUB (bb6, cc13, cc06, cc06) 1249 FMADD (bb8, cc14, cc01, cc01) 1250 FNMSUB (bb8, cc13, cc02, cc02) 1251#else 1252 FNMSUB (bb4, cc14, cc09, cc09) 1253 FMADD (bb4, cc13, cc10, cc10) 1254 FNMSUB (bb6, cc14, cc05, cc05) 1255 FMADD (bb6, cc13, cc06, cc06) 1256 FNMSUB (bb8, 
cc14, cc01, cc01) 1257 FMADD (bb8, cc13, cc02, cc02) 1258#endif 1259 1260 LDF [BO + 20 * SIZE], b1 1261 LDF [BO + 21 * SIZE], b2 1262 LDF [BO + 18 * SIZE], b3 1263 LDF [BO + 19 * SIZE], b4 1264 LDF [BO + 16 * SIZE], b5 1265 LDF [BO + 17 * SIZE], b6 1266 1267 FMUL b1, c09, a1 1268 FMUL b2, c09, a2 1269 1270#ifndef CONJ 1271 FNMSUB (bb2, cc10, aa1, cc09) 1272 FMADD (bb1, cc10, aa2, cc10) 1273#else 1274 FMADD (bb2, cc10, aa1, cc09) 1275 FMSUB (bb1, cc10, aa2, cc10) 1276#endif 1277 1278 FNMSUB (bb3, cc09, cc05, cc05) 1279 FNMSUB (bb3, cc10, cc06, cc06) 1280 FNMSUB (bb5, cc09, cc01, cc01) 1281 FNMSUB (bb5, cc10, cc02, cc02) 1282 1283#ifndef CONJ 1284 FMADD (bb4, cc10, cc05, cc05) 1285 FNMSUB (bb4, cc09, cc06, cc06) 1286 FMADD (bb6, cc10, cc01, cc01) 1287 FNMSUB (bb6, cc09, cc02, cc02) 1288#else 1289 FNMSUB (bb4, cc10, cc05, cc05) 1290 FMADD (bb4, cc09, cc06, cc06) 1291 FNMSUB (bb6, cc10, cc01, cc01) 1292 FMADD (bb6, cc09, cc02, cc02) 1293#endif 1294 1295 LDF [BO + 10 * SIZE], b1 1296 LDF [BO + 11 * SIZE], b2 1297 LDF [BO + 8 * SIZE], b3 1298 LDF [BO + 9 * SIZE], b4 1299 1300 FMUL b1, c05, a1 1301 FMUL b2, c05, a2 1302 1303#ifndef CONJ 1304 FNMSUB (bb2, cc06, aa1, cc05) 1305 FMADD (bb1, cc06, aa2, cc06) 1306#else 1307 FMADD (bb2, cc06, aa1, cc05) 1308 FMSUB (bb1, cc06, aa2, cc06) 1309#endif 1310 1311 FNMSUB (bb3, cc05, cc01, cc01) 1312 FNMSUB (bb3, cc06, cc02, cc02) 1313 1314#ifndef CONJ 1315 FMADD (bb4, cc06, cc01, cc01) 1316 FNMSUB (bb4, cc05, cc02, cc02) 1317#else 1318 FNMSUB (bb4, cc06, cc01, cc01) 1319 FMADD (bb4, cc05, cc02, cc02) 1320#endif 1321 1322 LDF [BO + 0 * SIZE], b1 1323 LDF [BO + 1 * SIZE], b2 1324 1325 FMUL b1, c01, a1 1326 FMUL b2, c01, a2 1327 1328#ifndef CONJ 1329 FNMSUB (bb2, cc02, aa1, cc01) 1330 FMADD (bb1, cc02, aa2, cc02) 1331#else 1332 FMADD (bb2, cc02, aa1, cc01) 1333 FMSUB (bb1, cc02, aa2, cc02) 1334#endif 1335#endif 1336 1337#ifdef LN 1338 add C1, -2 * SIZE, C1 1339 add C2, -2 * SIZE, C2 1340 add C3, -2 * SIZE, C3 1341 add C4, -2 * SIZE, C4 
1342#endif 1343 1344#if defined(LN) || defined(LT) 1345 STF c01, [BO + 0 * SIZE] 1346 STF c02, [BO + 1 * SIZE] 1347 STF c05, [BO + 2 * SIZE] 1348 STF c06, [BO + 3 * SIZE] 1349 1350 STF c09, [BO + 4 * SIZE] 1351 STF c10, [BO + 5 * SIZE] 1352 STF c13, [BO + 6 * SIZE] 1353 STF c14, [BO + 7 * SIZE] 1354#else 1355 STF c01, [AO + 0 * SIZE] 1356 STF c02, [AO + 1 * SIZE] 1357 STF c05, [AO + 2 * SIZE] 1358 STF c06, [AO + 3 * SIZE] 1359 1360 STF c09, [AO + 4 * SIZE] 1361 STF c10, [AO + 5 * SIZE] 1362 STF c13, [AO + 6 * SIZE] 1363 STF c14, [AO + 7 * SIZE] 1364#endif 1365 1366 STF c01, [C1 + 0 * SIZE] 1367 STF c02, [C1 + 1 * SIZE] 1368 STF c05, [C2 + 0 * SIZE] 1369 STF c06, [C2 + 1 * SIZE] 1370 1371 STF c09, [C3 + 0 * SIZE] 1372 STF c10, [C3 + 1 * SIZE] 1373 STF c13, [C4 + 0 * SIZE] 1374 STF c14, [C4 + 1 * SIZE] 1375 1376#ifndef LN 1377 add C1, 2 * SIZE, C1 1378 add C2, 2 * SIZE, C2 1379 add C3, 2 * SIZE, C3 1380 add C4, 2 * SIZE, C4 1381#endif 1382 1383#ifdef RT 1384 sll K, ZBASE_SHIFT, TEMP1 1385 add AORIG, TEMP1, AORIG 1386#endif 1387 1388#if defined(LT) || defined(RN) 1389 sub K, KK, TEMP1 1390 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1391 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 1392 add AO, TEMP2, AO 1393 add BO, TEMP1, BO 1394#endif 1395 1396#ifdef LT 1397 add KK, 1, KK 1398#endif 1399 1400#ifdef LN 1401 sub KK, 1, KK 1402#endif 1403 1404 add I, -1, I 1405 cmp I, 0 1406 bg,pt %icc, .LL12 1407 nop 1408 1409#ifdef LN 1410 sll K, ZBASE_SHIFT + 2, TEMP1 1411 add B, TEMP1, B 1412#endif 1413 1414#if defined(LT) || defined(RN) 1415 mov BO, B 1416#endif 1417 1418#ifdef RN 1419 add KK, 4, KK 1420#endif 1421 1422#ifdef RT 1423 sub KK, 4, KK 1424#endif 1425 1426 add J, -1, J 1427 cmp J, 0 1428 bg,pt %icc, .LL11 1429 nop 1430 .align 4 1431 1432.LL20: 1433 and N, 2, J 1434 cmp J, 0 1435 ble,pn %icc, .LL30 1436 nop 1437 1438#ifdef RT 1439 sll K, ZBASE_SHIFT + 1, TEMP1 1440 sub B, TEMP1, B 1441#endif 1442 1443#ifndef RT 1444 mov C, C1 1445 add C, LDC, C2 1446 add C2, LDC, C 1447#else 1448 sub C, 
/* Two-column panel: pointer setup and start of the inner-product loop.                              */
/*   - RT column pointers (completing the instruction begun just before this line):                  */
/*     C2 = C - LDC, C1 = C2 - LDC, C = C2 - LDC.                                                    */
/*   - KK = M + OFFSET for LN, KK = OFFSET for LT.                                                   */
/*   - AORIG = A for LN/RT, otherwise AO = A. I = M is the loop counter for .LL22.                   */
/*   - .LL22: LT/RN take BO = B. Otherwise (LN/RT) LN first moves AORIG back by K << ZBASE_SHIFT,    */
/*     then AO = AORIG + (KK << ZBASE_SHIFT) and BO = B + (KK << (ZBASE_SHIFT + 1)).                 */
/*   - a1/a2 and b1..b9 are preloaded, accumulators cc01..cc08 are cleared (cc08 in the delay slot   */
/*     of the branch to .LL25), and C1/C2 are prefetched.                                            */
/*   - L = KK >> 2 for LT/RN, (K - KK) >> 2 otherwise; when L <= 0 control goes to .LL25.            */
/*   - .LL23: unrolled loop; loads for later steps are interleaved with the FMADD1..FMADD4           */
/*     multiply-accumulates (those macros are defined outside this excerpt). L is decremented early  */
/*     in the body and tested further down.                                                          */
LDC, C2 1449 sub C2, LDC, C1 1450 sub C2, LDC, C 1451#endif 1452 1453#ifdef LN 1454 add M, OFFSET, KK 1455#endif 1456 1457#ifdef LT 1458 mov OFFSET, KK 1459#endif 1460 1461#if defined(LN) || defined(RT) 1462 mov A, AORIG 1463#else 1464 mov A, AO 1465#endif 1466 1467 mov M, I 1468 .align 4 1469 1470.LL22: 1471#if defined(LT) || defined(RN) 1472 mov B, BO 1473#else 1474#ifdef LN 1475 sll K, ZBASE_SHIFT, TEMP1 1476 sub AORIG, TEMP1, AORIG 1477#endif 1478 1479 sll KK, ZBASE_SHIFT + 0, TEMP1 1480 sll KK, ZBASE_SHIFT + 1, TEMP2 1481 1482 add AORIG, TEMP1, AO 1483 add B, TEMP2, BO 1484#endif 1485 1486 LDF [AO + 0 * SIZE], a1 1487 LDF [AO + 1 * SIZE], a2 1488 1489 LDF [BO + 0 * SIZE], b1 1490 LDF [BO + 1 * SIZE], b2 1491 LDF [BO + 2 * SIZE], b3 1492 LDF [BO + 3 * SIZE], b4 1493 LDF [BO + 4 * SIZE], b5 1494 FCLR (cc01) 1495 1496 LDF [BO + 5 * SIZE], b6 1497 FCLR (cc02) 1498 LDF [BO + 6 * SIZE], b7 1499 FCLR (cc03) 1500 LDF [BO + 7 * SIZE], b8 1501 FCLR (cc04) 1502 LDF [BO + 8 * SIZE], b9 1503 FCLR (cc05) 1504 1505 prefetch [C1 + 2 * SIZE], 3 1506 FCLR (cc06) 1507 prefetch [C2 + 2 * SIZE], 3 1508 FCLR (cc07) 1509 1510#if defined(LT) || defined(RN) 1511 sra KK, 2, L 1512#else 1513 sub K, KK, L 1514 sra L, 2, L 1515#endif 1516 cmp L, 0 1517 ble,pn %icc, .LL25 1518 FCLR (cc08) 1519 .align 4 1520 1521.LL23: 1522 FMADD1 (aa1, bb1, cc01, cc01) 1523 LDF [AO + 2 * SIZE], a3 1524 FMADD2 (aa2, bb1, cc02, cc02) 1525 LDF [AO + 3 * SIZE], a4 1526 1527 FMADD3 (aa1, bb2, cc03, cc03) 1528 LDF [BO + 16 * SIZE], b1 1529 FMADD4 (aa2, bb2, cc04, cc04) 1530 LDF [BO + 9 * SIZE], b2 1531 1532 FMADD1 (aa1, bb3, cc05, cc05) 1533 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1534 FMADD2 (aa2, bb3, cc06, cc06) 1535 add L, -1, L 1536 1537 FMADD3 (aa1, bb4, cc07, cc07) 1538 LDF [BO + 10 * SIZE], b3 1539 FMADD4 (aa2, bb4, cc08, cc08) 1540 LDF [BO + 11 * SIZE], b4 1541 1542 FMADD1 (aa3, bb5, cc01, cc01) 1543 LDF [AO + 4 * SIZE], a1 1544 FMADD2 (aa4, bb5, cc02, cc02) 1545 LDF [AO + 5 *
/* Remainder of .LL23, then the leftover loop .LL27.                                                 */
/*   - One pass of .LL23 performs four groups of eight multiply-accumulates and advances AO by       */
/*     8 * SIZE and BO by 16 * SIZE. Loads issued after a pointer bump use rebased offsets           */
/*     (for example AO - 2 * SIZE, BO + 2 * SIZE).                                                   */
/*   - cmp L, 0 sets the condition codes in mid-body; the integer instructions that follow it are    */
/*     plain add/nop, so the bg,pt at the bottom still tests L > 0. The b8 reload sits in the delay  */
/*     slot of that branch.                                                                          */
/*   - .LL25: L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when L <= 0 control goes to .LL28.       */
/*   - .LL27: one group of eight multiply-accumulates per pass, using a1/a2 and b1..b4.              */
SIZE], a2 1546 1547 FMADD3 (aa3, bb6, cc03, cc03) 1548 LDF [BO + 12 * SIZE], b5 1549 FMADD4 (aa4, bb6, cc04, cc04) 1550 LDF [BO + 13 * SIZE], b6 1551 1552 FMADD1 (aa3, bb7, cc05, cc05) 1553 cmp L, 0 1554 FMADD2 (aa4, bb7, cc06, cc06) 1555 add AO, 8 * SIZE, AO 1556 1557 FMADD3 (aa3, bb8, cc07, cc07) 1558 LDF [BO + 14 * SIZE], b7 1559 FMADD4 (aa4, bb8, cc08, cc08) 1560 LDF [BO + 15 * SIZE], b8 1561 1562 FMADD1 (aa1, bb9, cc01, cc01) 1563 LDF [AO - 2 * SIZE], a3 1564 FMADD2 (aa2, bb9, cc02, cc02) 1565 LDF [AO - 1 * SIZE], a4 1566 1567 FMADD3 (aa1, bb2, cc03, cc03) 1568 LDF [BO + 24 * SIZE], b9 1569 FMADD4 (aa2, bb2, cc04, cc04) 1570 LDF [BO + 17 * SIZE], b2 1571 1572 FMADD1 (aa1, bb3, cc05, cc05) 1573 add BO, 16 * SIZE, BO 1574 FMADD2 (aa2, bb3, cc06, cc06) 1575 nop 1576 1577 FMADD3 (aa1, bb4, cc07, cc07) 1578 LDF [BO + 2 * SIZE], b3 1579 FMADD4 (aa2, bb4, cc08, cc08) 1580 LDF [BO + 3 * SIZE], b4 1581 1582 FMADD1 (aa3, bb5, cc01, cc01) 1583 LDF [AO + 0 * SIZE], a1 1584 FMADD2 (aa4, bb5, cc02, cc02) 1585 LDF [AO + 1 * SIZE], a2 1586 FMADD3 (aa3, bb6, cc03, cc03) 1587 LDF [BO + 4 * SIZE], b5 1588 FMADD4 (aa4, bb6, cc04, cc04) 1589 LDF [BO + 5 * SIZE], b6 1590 1591 FMADD1 (aa3, bb7, cc05, cc05) 1592 nop 1593 FMADD2 (aa4, bb7, cc06, cc06) 1594 LDF [BO + 6 * SIZE], b7 1595 1596 FMADD3 (aa3, bb8, cc07, cc07) 1597 FMADD4 (aa4, bb8, cc08, cc08) 1598 bg,pt %icc, .LL23 1599 LDF [BO + 7 * SIZE], b8 1600 .align 4 1601 1602.LL25: 1603#if defined(LT) || defined(RN) 1604 and KK, 3, L 1605#else 1606 sub K, KK, L 1607 and L, 3, L 1608#endif 1609 cmp L, 0 1610 ble,a,pn %icc, .LL28 1611 nop 1612 .align 4 1613 1614.LL27: 1615 FMADD1 (aa1, bb1, cc01, cc01) 1616 add L, -1, L 1617 FMADD2 (aa2, bb1, cc02, cc02) 1618 LDF [BO + 4 * SIZE], b1 1619 1620 FMADD3 (aa1, bb2, cc03, cc03) 1621 add AO, 2 * SIZE, AO 1622 FMADD4 (aa2, bb2, cc04, cc04) 1623 LDF [BO + 5 * SIZE], b2 1624 1625 FMADD1 (aa1, bb3, cc05, cc05) 1626 cmp L, 0 1627 FMADD2 (aa2, bb3, cc06, cc06) 1628 LDF [BO + 6 * SIZE], b3 1629
/* End of .LL27, then .LL28: reduction and triangular solve for the two-column tile.                 */
/*   - .LL27 reloads a1/a2 from the already advanced AO and loops while L > 0; BO += 4 * SIZE is in  */
/*     the branch delay slot.                                                                        */
/*   - .LL28 folds the partial sums: c01 += c04, c02 += c03, c05 += c08, c06 += c07.                 */
/*   - LN/RT recompute the operand pointers with d = 1 (LN) or d = 2 (RT):                           */
/*     AO = AORIG + ((KK - d) << ZBASE_SHIFT), BO = B + ((KK - d) << (ZBASE_SHIFT + 1)).             */
/*   - Four values are loaded from BO (LN/LT) or AO (RN/RT) and the accumulated products are         */
/*     subtracted from them: c = loaded - c.                                                         */
/*   - LN/LT: both pairs (c01/c02 and c05/c06) are multiplied by the coefficient AO[0..1];           */
/*     CONJ flips the signs of the cross terms.                                                      */
/*   - RN: c01/c02 is multiplied by BO[0..1], then its product with BO[2..3] is removed from         */
/*     c05/c06 (this update finishes on the following line).                                         */
/* NOTE(review): as elsewhere, the solve multiplies by the diagonal entry, so the packed data        */
/* presumably stores its reciprocal -- confirm against the packing routine.                          */
1630 FMADD3 (aa1, bb4, cc07, cc07) 1631 LDF [AO + 0 * SIZE], a1 1632 FMADD4 (aa2, bb4, cc08, cc08) 1633 LDF [AO + 1 * SIZE], a2 1634 1635 LDF [BO + 7 * SIZE], b4 1636 bg,pt %icc, .LL27 1637 add BO, 4 * SIZE, BO 1638 .align 4 1639 1640.LL28: 1641 FADD c01, c04, c01 1642 FADD c02, c03, c02 1643 FADD c05, c08, c05 1644 FADD c06, c07, c06 1645 1646#if defined(LN) || defined(RT) 1647#ifdef LN 1648 sub KK, 1, TEMP1 1649#else 1650 sub KK, 2, TEMP1 1651#endif 1652 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1653 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 1654 1655 add AORIG, TEMP2, AO 1656 add B, TEMP1, BO 1657#endif 1658 1659#if defined(LN) || defined(LT) 1660 LDF [BO + 0 * SIZE], a1 1661 LDF [BO + 1 * SIZE], a2 1662 LDF [BO + 2 * SIZE], a3 1663 LDF [BO + 3 * SIZE], a4 1664#else 1665 LDF [AO + 0 * SIZE], a1 1666 LDF [AO + 1 * SIZE], a2 1667 LDF [AO + 2 * SIZE], a3 1668 LDF [AO + 3 * SIZE], a4 1669#endif 1670 1671 FSUB a1, c01, c01 1672 FSUB a2, c02, c02 1673 FSUB a3, c05, c05 1674 FSUB a4, c06, c06 1675 1676#if defined(LN) || defined(LT) 1677 LDF [AO + 0 * SIZE], a1 1678 LDF [AO + 1 * SIZE], a2 1679 1680 FMUL a1, c01, b1 1681 FMUL a2, c01, b2 1682 FMUL a1, c05, b3 1683 FMUL a2, c05, b4 1684 1685#ifndef CONJ 1686 FNMSUB (aa2, cc02, bb1, cc01) 1687 FMADD (aa1, cc02, bb2, cc02) 1688 FNMSUB (aa2, cc06, bb3, cc05) 1689 FMADD (aa1, cc06, bb4, cc06) 1690#else 1691 FMADD (aa2, cc02, bb1, cc01) 1692 FMSUB (aa1, cc02, bb2, cc02) 1693 FMADD (aa2, cc06, bb3, cc05) 1694 FMSUB (aa1, cc06, bb4, cc06) 1695#endif 1696#endif 1697 1698#ifdef RN 1699 LDF [BO + 0 * SIZE], b1 1700 LDF [BO + 1 * SIZE], b2 1701 LDF [BO + 2 * SIZE], b3 1702 LDF [BO + 3 * SIZE], b4 1703 1704 FMUL b1, c01, a1 1705 FMUL b2, c01, a2 1706 1707#ifndef CONJ 1708 FNMSUB (bb2, cc02, aa1, cc01) 1709 FMADD (bb1, cc02, aa2, cc02) 1710#else 1711 FMADD (bb2, cc02, aa1, cc01) 1712 FMSUB (bb1, cc02, aa2, cc02) 1713#endif 1714 1715 FNMSUB (bb3, cc01, cc05, cc05) 1716 FNMSUB (bb3, cc02, cc06, cc06) 1717 1718#ifndef CONJ 1719 FMADD (bb4, cc02, cc05,
/* Rest of the two-column solve, then write-back.                                                    */
/*   - RN: finishes the c05/c06 update, then multiplies c05/c06 by BO[6..7].                         */
/*   - RT: multiplies c05/c06 by BO[6..7], removes its product with BO[4..5] from c01/c02, then      */
/*     multiplies c01/c02 by BO[0..1].                                                               */
/*   - LN moves C1 and C2 back by 2 * SIZE before the stores.                                        */
/*   - The four solved values are stored into the packed operand (BO for LN/LT, AO for RN/RT) and    */
/*     into C1 (c01/c02) and C2 (c05/c06); non-LN builds then advance C1/C2 by 2 * SIZE.             */
/*   - The RT-only AORIG update begins with the last instruction on this line.                       */
cc05) 1720 FNMSUB (bb4, cc01, cc06, cc06) 1721#else 1722 FNMSUB (bb4, cc02, cc05, cc05) 1723 FMADD (bb4, cc01, cc06, cc06) 1724#endif 1725 1726 LDF [BO + 6 * SIZE], b1 1727 LDF [BO + 7 * SIZE], b2 1728 1729 FMUL b1, c05, a1 1730 FMUL b2, c05, a2 1731 1732#ifndef CONJ 1733 FNMSUB (bb2, cc06, aa1, cc05) 1734 FMADD (bb1, cc06, aa2, cc06) 1735#else 1736 FMADD (bb2, cc06, aa1, cc05) 1737 FMSUB (bb1, cc06, aa2, cc06) 1738#endif 1739#endif 1740 1741#ifdef RT 1742 LDF [BO + 6 * SIZE], b1 1743 LDF [BO + 7 * SIZE], b2 1744 LDF [BO + 4 * SIZE], b3 1745 LDF [BO + 5 * SIZE], b4 1746 1747 FMUL b1, c05, a1 1748 FMUL b2, c05, a2 1749 1750#ifndef CONJ 1751 FNMSUB (bb2, cc06, aa1, cc05) 1752 FMADD (bb1, cc06, aa2, cc06) 1753#else 1754 FMADD (bb2, cc06, aa1, cc05) 1755 FMSUB (bb1, cc06, aa2, cc06) 1756#endif 1757 1758 FNMSUB (bb3, cc05, cc01, cc01) 1759 FNMSUB (bb3, cc06, cc02, cc02) 1760 1761#ifndef CONJ 1762 FMADD (bb4, cc06, cc01, cc01) 1763 FNMSUB (bb4, cc05, cc02, cc02) 1764#else 1765 FNMSUB (bb4, cc06, cc01, cc01) 1766 FMADD (bb4, cc05, cc02, cc02) 1767#endif 1768 1769 LDF [BO + 0 * SIZE], b1 1770 LDF [BO + 1 * SIZE], b2 1771 1772 FMUL b1, c01, a1 1773 FMUL b2, c01, a2 1774 1775#ifndef CONJ 1776 FNMSUB (bb2, cc02, aa1, cc01) 1777 FMADD (bb1, cc02, aa2, cc02) 1778#else 1779 FMADD (bb2, cc02, aa1, cc01) 1780 FMSUB (bb1, cc02, aa2, cc02) 1781#endif 1782#endif 1783 1784#ifdef LN 1785 add C1, -2 * SIZE, C1 1786 add C2, -2 * SIZE, C2 1787#endif 1788 1789#if defined(LN) || defined(LT) 1790 STF c01, [BO + 0 * SIZE] 1791 STF c02, [BO + 1 * SIZE] 1792 STF c05, [BO + 2 * SIZE] 1793 STF c06, [BO + 3 * SIZE] 1794#else 1795 STF c01, [AO + 0 * SIZE] 1796 STF c02, [AO + 1 * SIZE] 1797 STF c05, [AO + 2 * SIZE] 1798 STF c06, [AO + 3 * SIZE] 1799#endif 1800 1801 STF c01, [C1 + 0 * SIZE] 1802 STF c02, [C1 + 1 * SIZE] 1803 STF c05, [C2 + 0 * SIZE] 1804 STF c06, [C2 + 1 * SIZE] 1805 1806#ifndef LN 1807 add C1, 2 * SIZE, C1 1808 add C2, 2 * SIZE, C2 1809#endif 1810 1811#ifdef RT 1812 sll K,
/* Loop control for the two-column panel, then setup of the single-column panel.                     */
/*   - RT: AORIG += K << ZBASE_SHIFT.                                                                */
/*   - LT/RN: AO += (K - KK) << ZBASE_SHIFT, BO += (K - KK) << (ZBASE_SHIFT + 1).                    */
/*   - KK is incremented for LT, decremented for LN; I counts down and loops to .LL22 while I > 0.   */
/*   - After the loop: LN adds K << (ZBASE_SHIFT + 1) to B; LT/RN copy BO into B; RN adds 2 to KK;   */
/*     RT subtracts 2 from KK.                                                                       */
/*   - .LL30: J = N & 1; when it is zero control goes to .LL999.                                     */
/*     RT moves B back by K << ZBASE_SHIFT. Column pointer: C1 = C then C += LDC (non-RT), or        */
/*     C1 = C - LDC then C -= LDC (RT).                                                              */
/*     KK = M + OFFSET (LN) or OFFSET (LT); AORIG = A (LN/RT) or AO = A (LT/RN); I = M.              */
/*   - .LL32: LT/RN take BO = B. Otherwise LN first moves AORIG back by K << ZBASE_SHIFT, then AO    */
/*     and BO are both offset by KK << ZBASE_SHIFT from AORIG and B.                                 */
/*     a1..a4 and b1..b8 are preloaded, cc01..cc07 are cleared and C1 is prefetched.                 */
/*     The conditional that computes L starts at the very end of this line.                          */
ZBASE_SHIFT, TEMP1 1813 add AORIG, TEMP1, AORIG 1814#endif 1815 1816#if defined(LT) || defined(RN) 1817 sub K, KK, TEMP1 1818 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1819 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 1820 add AO, TEMP2, AO 1821 add BO, TEMP1, BO 1822#endif 1823 1824#ifdef LT 1825 add KK, 1, KK 1826#endif 1827 1828#ifdef LN 1829 sub KK, 1, KK 1830#endif 1831 1832 add I, -1, I 1833 cmp I, 0 1834 bg,pt %icc, .LL22 1835 nop 1836 1837#ifdef LN 1838 sll K, ZBASE_SHIFT + 1, TEMP1 1839 add B, TEMP1, B 1840#endif 1841 1842#if defined(LT) || defined(RN) 1843 mov BO, B 1844#endif 1845 1846#ifdef RN 1847 add KK, 2, KK 1848#endif 1849 1850#ifdef RT 1851 sub KK, 2, KK 1852#endif 1853 .align 4 1854 1855.LL30: 1856 and N, 1, J 1857 cmp J, 0 1858 ble,pn %icc, .LL999 1859 nop 1860 1861#ifdef RT 1862 sll K, ZBASE_SHIFT, TEMP1 1863 sub B, TEMP1, B 1864#endif 1865 1866#ifndef RT 1867 mov C, C1 1868 add C, LDC, C 1869#else 1870 sub C, LDC, C1 1871 sub C, LDC, C 1872#endif 1873 1874#ifdef LN 1875 add M, OFFSET, KK 1876#endif 1877 1878#ifdef LT 1879 mov OFFSET, KK 1880#endif 1881 1882#if defined(LN) || defined(RT) 1883 mov A, AORIG 1884#else 1885 mov A, AO 1886#endif 1887 1888 mov M, I 1889 .align 4 1890 1891.LL32: 1892#if defined(LT) || defined(RN) 1893 mov B, BO 1894#else 1895#ifdef LN 1896 sll K, ZBASE_SHIFT, TEMP1 1897 sub AORIG, TEMP1, AORIG 1898#endif 1899 1900 sll KK, ZBASE_SHIFT + 0, TEMP1 1901 1902 add AORIG, TEMP1, AO 1903 add B, TEMP1, BO 1904#endif 1905 1906 LDF [AO + 0 * SIZE], a1 1907 LDF [AO + 1 * SIZE], a2 1908 LDF [AO + 2 * SIZE], a3 1909 LDF [AO + 3 * SIZE], a4 1910 1911 LDF [BO + 0 * SIZE], b1 1912 LDF [BO + 1 * SIZE], b2 1913 LDF [BO + 2 * SIZE], b3 1914 FCLR (cc01) 1915 LDF [BO + 3 * SIZE], b4 1916 FCLR (cc02) 1917 1918 LDF [BO + 4 * SIZE], b5 1919 FCLR (cc03) 1920 LDF [BO + 5 * SIZE], b6 1921 FCLR (cc04) 1922 LDF [BO + 6 * SIZE], b7 1923 FCLR (cc05) 1924 LDF [BO + 7 * SIZE], b8 1925 FCLR (cc06) 1926 1927 prefetch [C1 + 2 * SIZE], 3 1928 FCLR (cc07) 1929 1930#if
/* Single-column panel: inner-product loops.                                                         */
/*   - L = KK >> 2 for LT/RN, (K - KK) >> 2 otherwise; cc08 is cleared in the delay slot of the      */
/*     branch that skips to .LL35 when L <= 0.                                                       */
/*   - .LL33: unrolled loop; each pass performs four groups of four multiply-accumulates into        */
/*     cc01..cc04 and advances both AO and BO by 8 * SIZE. Operands for the following groups are     */
/*     loaded between the FMADD1..FMADD4 macros (defined outside this excerpt); b8 is reloaded in    */
/*     the delay slot of the loop branch, after BO has been advanced.                                */
/*   - .LL35: L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when L <= 0 control goes to .LL38.       */
/*   - .LL37: one group of four multiply-accumulates per pass; AO and BO each advance by 2 * SIZE.   */
/*     The loop branch is the last instruction on this line; its target and delay slot follow.       */
defined(LT) || defined(RN) 1931 sra KK, 2, L 1932#else 1933 sub K, KK, L 1934 sra L, 2, L 1935#endif 1936 cmp L, 0 1937 ble,pn %icc, .LL35 1938 FCLR (cc08) 1939 .align 4 1940 1941.LL33: 1942 FMADD1 (aa1, bb1, cc01, cc01) 1943 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1944 FMADD2 (aa2, bb1, cc02, cc02) 1945 LDF [BO + 8 * SIZE], b1 1946 1947 FMADD3 (aa1, bb2, cc03, cc03) 1948 LDF [AO + 4 * SIZE], a1 1949 FMADD4 (aa2, bb2, cc04, cc04) 1950 LDF [AO + 5 * SIZE], a2 1951 1952 FMADD1 (aa3, bb3, cc01, cc01) 1953 LDF [BO + 9 * SIZE], b2 1954 FMADD2 (aa4, bb3, cc02, cc02) 1955 LDF [BO + 10 * SIZE], b3 1956 1957 FMADD3 (aa3, bb4, cc03, cc03) 1958 LDF [AO + 6 * SIZE], a3 1959 FMADD4 (aa4, bb4, cc04, cc04) 1960 LDF [AO + 7 * SIZE], a4 1961 1962 FMADD1 (aa1, bb5, cc01, cc01) 1963 LDF [BO + 11 * SIZE], b4 1964 FMADD2 (aa2, bb5, cc02, cc02) 1965 LDF [BO + 12 * SIZE], b5 1966 1967 FMADD3 (aa1, bb6, cc03, cc03) 1968 LDF [AO + 8 * SIZE], a1 1969 FMADD4 (aa2, bb6, cc04, cc04) 1970 LDF [AO + 9 * SIZE], a2 1971 1972 FMADD1 (aa3, bb7, cc01, cc01) 1973 LDF [BO + 13 * SIZE], b6 1974 1975 FMADD2 (aa4, bb7, cc02, cc02) 1976 LDF [BO + 14 * SIZE], b7 1977 1978 FMADD3 (aa3, bb8, cc03, cc03) 1979 LDF [AO + 10 * SIZE], a3 1980 FMADD4 (aa4, bb8, cc04, cc04) 1981 LDF [AO + 11 * SIZE], a4 1982 1983 add AO, 8 * SIZE, AO 1984 add L, -1, L 1985 add BO, 8 * SIZE, BO 1986 cmp L, 0 1987 1988 bg,pt %icc, .LL33 1989 LDF [BO + 7 * SIZE], b8 1990 .align 4 1991 1992.LL35: 1993#if defined(LT) || defined(RN) 1994 and KK, 3, L 1995#else 1996 sub K, KK, L 1997 and L, 3, L 1998#endif 1999 cmp L, 0 2000 ble,a,pn %icc, .LL38 2001 nop 2002 .align 4 2003 2004.LL37: 2005 FMADD1 (aa1, bb1, cc01, cc01) 2006 add L, -1, L 2007 FMADD2 (aa2, bb1, cc02, cc02) 2008 LDF [BO + 2 * SIZE], b1 2009 2010 FMADD3 (aa1, bb2, cc03, cc03) 2011 LDF [AO + 2 * SIZE], a1 2012 FMADD4 (aa2, bb2, cc04, cc04) 2013 LDF [AO + 3 * SIZE], a2 2014 2015 add AO, 2 * SIZE, AO 2016 cmp L, 0 2017 add BO, 2 * SIZE, BO 2018 bg,pt %icc,
/* End of .LL37, then .LL38: solve, write-back and loop control for the single-column panel; exit.   */
/*   - The .LL37 branch reloads b2 in its delay slot (offset 1 from the already advanced BO).        */
/*   - .LL38 folds the partial sums: c01 += c04, c02 += c03.                                         */
/*   - LN/RT: AO = AORIG + ((KK - 1) << ZBASE_SHIFT), BO = B + ((KK - 1) << ZBASE_SHIFT).            */
/*   - Two values are loaded from BO (LN/LT) or AO (RN/RT) and the accumulated product is            */
/*     subtracted: c = loaded - c.                                                                   */
/*   - The coefficient is then loaded from the other operand (AO for LN/LT, BO for RN/RT) and        */
/*     multiplied into c01/c02; CONJ flips the signs of the cross terms.                             */
/*     NOTE(review): multiplication rather than division suggests a pre-inverted diagonal in the     */
/*     packed data -- confirm against the packing routine.                                           */
/*   - LN moves C1 back by 2 * SIZE before the stores. c01/c02 go to the packed operand (BO for      */
/*     LN/LT, AO for RN/RT) and to C1; non-LN builds then advance C1 by 2 * SIZE.                    */
/*   - RT: AORIG += K << ZBASE_SHIFT. LT/RN: AO and BO both advance by (K - KK) << ZBASE_SHIFT.      */
/*     KK is incremented for LT, decremented for LN; I counts down and loops to .LL32 while I > 0.   */
/*   - After the loop: LN adds K << ZBASE_SHIFT to B; LT/RN copy BO into B; RN adds 1 to KK; RT      */
/*     subtracts 1 from KK.                                                                          */
/*   - .LL999: return %i7 + 8 with clr %o0 in its delay slot. On SPARC V9 the delay slot of return   */
/*     runs after the register window is restored, so this clears the caller-visible %o0             */
/*     (a return value of 0).                                                                        */
.LL37 2019 LDF [BO + 1 * SIZE], b2 2020 .align 4 2021 2022.LL38: 2023 FADD c01, c04, c01 2024 FADD c02, c03, c02 2025 2026#if defined(LN) || defined(RT) 2027 sub KK, 1, TEMP1 2028 2029 sll TEMP1, ZBASE_SHIFT, TEMP1 2030 2031 add AORIG, TEMP1, AO 2032 add B, TEMP1, BO 2033#endif 2034 2035#if defined(LN) || defined(LT) 2036 LDF [BO + 0 * SIZE], a1 2037 LDF [BO + 1 * SIZE], a2 2038#else 2039 LDF [AO + 0 * SIZE], a1 2040 LDF [AO + 1 * SIZE], a2 2041#endif 2042 2043 FSUB a1, c01, c01 2044 FSUB a2, c02, c02 2045 2046#if defined(LN) || defined(LT) 2047 LDF [AO + 0 * SIZE], a1 2048 LDF [AO + 1 * SIZE], a2 2049#else 2050 LDF [BO + 0 * SIZE], a1 2051 LDF [BO + 1 * SIZE], a2 2052#endif 2053 2054 FMUL a1, c01, b1 2055 FMUL a2, c01, b2 2056 2057#ifndef CONJ 2058 FNMSUB (aa2, cc02, bb1, cc01) 2059 FMADD (aa1, cc02, bb2, cc02) 2060#else 2061 FMADD (aa2, cc02, bb1, cc01) 2062 FMSUB (aa1, cc02, bb2, cc02) 2063#endif 2064 2065#ifdef LN 2066 add C1, -2 * SIZE, C1 2067#endif 2068 2069#if defined(LN) || defined(LT) 2070 STF c01, [BO + 0 * SIZE] 2071 STF c02, [BO + 1 * SIZE] 2072#else 2073 STF c01, [AO + 0 * SIZE] 2074 STF c02, [AO + 1 * SIZE] 2075#endif 2076 2077 STF c01, [C1 + 0 * SIZE] 2078 STF c02, [C1 + 1 * SIZE] 2079 2080#ifndef LN 2081 add C1, 2 * SIZE, C1 2082#endif 2083 2084#ifdef RT 2085 sll K, ZBASE_SHIFT, TEMP1 2086 add AORIG, TEMP1, AORIG 2087#endif 2088 2089#if defined(LT) || defined(RN) 2090 sub K, KK, TEMP1 2091 sll TEMP1, ZBASE_SHIFT, TEMP1 2092 add AO, TEMP1, AO 2093 add BO, TEMP1, BO 2094#endif 2095 2096#ifdef LT 2097 add KK, 1, KK 2098#endif 2099 2100#ifdef LN 2101 sub KK, 1, KK 2102#endif 2103 2104 add I, -1, I 2105 cmp I, 0 2106 bg,pt %icc, .LL32 2107 nop 2108 2109#ifdef LN 2110 sll K, ZBASE_SHIFT, TEMP1 2111 add B, TEMP1, B 2112#endif 2113 2114#if defined(LT) || defined(RN) 2115 mov BO, B 2116#endif 2117 2118#ifdef RN 2119 add KK, 1, KK 2120#endif 2121 2122#ifdef RT 2123 sub KK, 1, KK 2124#endif 2125 .align 4 2126 2127.LL999: 2128 return %i7 + 8 2129 clr %o0 2130
/* EPILOGUE is a macro defined outside this excerpt (presumably in common.h -- TODO confirm);        */
/* it ends the routine.                                                                              */
2131 EPILOGUE 2132