/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/* ASSEMBLER is defined before common.h is pulled in (presumably so  */
/* that the header only emits assembler-safe content - confirm).     */
#define ASSEMBLER
#include "common.h"

/* Operands of the prefetch instructions issued on the A stream:     */
/* look-ahead distance from AO in units of SIZE, and the prefetch    */
/* function code.                                                    */
#define APREFETCHSIZE		24
#define APREFETCH_CATEGORY	0

/* In registers.  M, N, K are used as they arrive.  B and C are      */
/* always (re)loaded from the stack below; A is loaded from the      */
/* stack only in the 32-bit DOUBLE build.                            */
#define M	%i0
#define N	%i1
#define K	%i2
#define B	%i3
#define C	%i4
#define A	%i5

/* Out registers: stride of C (loaded from the stack), running       */
/* pointers into A and B, and the loop counters of the kernel body.  */
#define LDC	%o0
#define AO	%o1
#define BO	%o2
#define I	%o3
#define J	%o4
#define L	%o5

/* Local registers: one pointer per column of C being processed.     */
#define C1	%l0
#define C2	%l1
#define C3	%l2
#define C4	%l3

/* OFFSET is loaded from the stack; KK is derived from it.  TEMP1    */
/* and TEMP2 are scratch.  AORIG holds a base address inside A for   */
/* the LN / RT variants.                                             */
#define OFFSET	%l4
#define KK	%l5
#define TEMP1	%l6
#define TEMP2	%l7
#define AORIG	%o7

/* Floating-point register map.                                      */
/*   c01-c16 : accumulators (cleared with FCLR, updated by FMADDn)   */
/*   a1-a5   : operands loaded from AO                               */
/*   b1-b9   : operands loaded from BO                               */
/* The ccNN / aaN / bbN twins are the plain numbers handed to the    */
/* FCLR / FMADD / FNMSUB macros in place of register names.          */
/* NOTE(review): in the DOUBLE build the aaN / bbN values (1, 3, ... */
/* 27) look like the SPARC V9 5-bit encodings of %f32-%f58; confirm  */
/* against the macro definitions, which are not part of this file.   */
#ifdef DOUBLE

#define c01	%f0
#define c02	%f2
#define c03	%f4
#define c04	%f6
#define c05	%f8
#define c06	%f10
#define c07	%f12
#define c08	%f14
#define c09	%f16
#define c10	%f18
#define c11	%f20
#define c12	%f22
#define c13	%f24
#define c14	%f26
#define c15	%f28
#define c16	%f30

#define a1	%f32
#define a2	%f34
#define a3	%f36
#define a4	%f38
#define a5	%f40

#define b1	%f42
#define b2	%f44
#define b3	%f46
#define b4	%f48
#define b5	%f50
#define b6	%f52
#define b7	%f54
#define b8	%f56
#define b9	%f58

#define cc01	0
#define cc02	2
#define cc03	4
#define cc04	6
#define cc05	8
#define cc06	10
#define cc07	12
#define cc08	14
#define cc09	16
#define cc10	18
#define cc11	20
#define cc12	22
#define cc13	24
#define cc14	26
#define cc15	28
#define cc16	30

#define aa1	1
#define aa2	3
#define aa3	5
#define aa4	7
#define aa5	9

#define bb1	11
#define bb2	13
#define bb3	15
#define bb4	17
#define bb5	19
#define bb6	21
#define bb7	23
#define bb8	25
#define bb9	27

#else	/* single precision: consecutive registers, number == index */

#define c01	%f0
#define c02	%f1
#define c03	%f2
#define c04	%f3
#define c05	%f4
#define c06	%f5
#define c07	%f6
#define c08	%f7
#define c09	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define a1	%f16
#define a2	%f17
#define a3	%f18
#define a4	%f19
#define a5	%f20

#define b1	%f21
#define b2	%f22
#define b3	%f23
#define b4	%f24
#define b5	%f25
#define b6	%f26
#define b7	%f27
#define b8	%f28
#define b9	%f29

#define cc01	0
#define cc02	1
#define cc03	2
#define cc04	3
#define cc05	4
#define cc06	5
#define cc07	6
#define cc08	7
#define cc09	8
#define cc10	9
#define cc11	10
#define cc12	11
#define cc13	12
#define cc14	13
#define cc15	14
#define cc16	15

#define aa1	16
#define aa2	17
#define aa3	18
#define aa4	19
#define aa5	20

#define bb1	21
#define bb2	22
#define bb3	23
#define bb4	24
#define bb5	25
#define bb6	26
#define bb7	27
#define bb8	28
#define bb9	29

#endif	/* DOUBLE */

/* Pick which of the FMADD / FNMSUB macros each of the four          */
/* partial-product slots of the inner loops expands to.  With CONJ   */
/* the choice depends on the side (LN/LT versus RN/RT); if CONJ is   */
/* set and none of the four side macros is defined, FMADD1-FMADD4    */
/* are left undefined.                                               */
#ifdef CONJ

#if defined(LN) || defined(LT)
#define FMADD1	FMADD
#define FMADD2	FNMSUB
#define FMADD3	FMADD
#define FMADD4	FMADD
#endif

#if defined(RN) || defined(RT)
#define FMADD1	FMADD
#define FMADD2	FMADD
#define FMADD3	FNMSUB
#define FMADD4	FMADD
#endif

#else	/* !CONJ */

#define FMADD1	FMADD
#define FMADD2	FMADD
#define FMADD3	FMADD
#define FMADD4	FNMSUB

#endif	/* CONJ */

	/* Declare %g2 and %g3 as scratch registers to the assembler. */
	.register	%g2, #scratch
	.register	%g3, #scratch

	/* PROLOGUE and SAVESP are macros defined outside this file   */
	/* (presumably the entry label and the register-window save   */
	/* - confirm in common.h).                                    */
	PROLOGUE
	SAVESP

	/* Fetch the arguments that are read from the stack.  Offsets */
	/* and load width differ between the three build flavours.    */
#ifdef __64BIT__
	ldx	[%sp + STACK_START + 56], B
	ldx	[%sp + STACK_START + 64], C
	ldx	[%sp + STACK_START + 72], LDC
	ldx	[%sp + STACK_START + 80], OFFSET
#elif defined(DOUBLE)
	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], B
	ld	[%sp + STACK_START + 40], C
	ld	[%sp + STACK_START + 44], LDC
	ld	[%sp + STACK_START + 48], OFFSET
#else
	ld	[%sp + STACK_START + 28], B
	ld	[%sp + STACK_START + 32], C
	ld	[%sp + STACK_START + 36], LDC
	ld	[%sp + STACK_START + 40], OFFSET
#endif

	/* Nothing to do when M <= 0: leave through .LL999.  The nop  */
	/* fills the branch delay slot.                               */
	cmp	M, 0
	ble,pn	%icc, .LL999
	nop

	/* LDC <<= ZBASE_SHIFT (presumably turning a stride counted   */
	/* in complex elements into a byte stride - confirm).         */
	sll	LDC, ZBASE_SHIFT, LDC

239#ifdef LN 240 smul M, K, TEMP1 241 sll TEMP1, ZBASE_SHIFT, TEMP1 242 add A, TEMP1, A 243 244 sll M, ZBASE_SHIFT, TEMP1 245 add C, TEMP1, C 246#endif 247 248#ifdef RN 249 neg OFFSET, KK 250#endif 251 252#ifdef RT 253 smul N, K, TEMP1 254 sll TEMP1, ZBASE_SHIFT, TEMP1 255 add B, TEMP1, B 256 257 smul N, LDC, TEMP1 258 add C, TEMP1, C 259 260 sub N, OFFSET, KK 261#endif 262 263 sra N, 2, J 264 cmp J, 0 265 ble,pn %icc, .LL20 266 nop 267 .align 4 268 269.LL11: 270#ifdef RT 271 sll K, ZBASE_SHIFT + 2, TEMP1 272 sub B, TEMP1, B 273#endif 274 275#ifndef RT 276 mov C, C1 277 add C, LDC, C2 278 add C2, LDC, C3 279 add C3, LDC, C4 280 add C4, LDC, C 281#else 282 sub C, LDC, C4 283 sub C4, LDC, C3 284 sub C3, LDC, C2 285 sub C2, LDC, C1 286 sub C2, LDC, C 287#endif 288 289#ifdef LN 290 add M, OFFSET, KK 291#endif 292 293#ifdef LT 294 mov OFFSET, KK 295#endif 296 297#if defined(LN) || defined(RT) 298 mov A, AORIG 299#else 300 mov A, AO 301#endif 302 303 mov M, I 304 .align 4 305 306.LL12: 307#if defined(LT) || defined(RN) 308 mov B, BO 309#else 310#ifdef LN 311 sll K, ZBASE_SHIFT, TEMP1 312 sub AORIG, TEMP1, AORIG 313#endif 314 315 sll KK, ZBASE_SHIFT + 0, TEMP1 316 sll KK, ZBASE_SHIFT + 2, TEMP2 317 318 add AORIG, TEMP1, AO 319 add B, TEMP2, BO 320#endif 321 322 LDF [AO + 0 * SIZE], a1 323 FCLR (cc01) 324 LDF [AO + 1 * SIZE], a2 325 FCLR (cc05) 326 LDF [AO + 8 * SIZE], a5 327 FCLR (cc09) 328 LDF [BO + 0 * SIZE], b1 329 FCLR (cc13) 330 331 LDF [BO + 1 * SIZE], b2 332 FCLR (cc02) 333 LDF [BO + 2 * SIZE], b3 334 FCLR (cc06) 335 LDF [BO + 3 * SIZE], b4 336 FCLR (cc10) 337 LDF [BO + 4 * SIZE], b5 338 FCLR (cc14) 339 340 LDF [BO + 5 * SIZE], b6 341 FCLR (cc03) 342 LDF [BO + 6 * SIZE], b7 343 FCLR (cc07) 344 LDF [BO + 7 * SIZE], b8 345 FCLR (cc11) 346 LDF [BO + 8 * SIZE], b9 347 FCLR (cc15) 348 349 prefetch [C1 + 1 * SIZE], 3 350 FCLR (cc04) 351 prefetch [C2 + 2 * SIZE], 3 352 FCLR (cc08) 353 prefetch [C3 + 1 * SIZE], 3 354 FCLR (cc12) 355 prefetch [C4 + 2 * SIZE], 3 356 FCLR 
(cc16) 357 358#if defined(LT) || defined(RN) 359 sra KK, 3, L 360#else 361 sub K, KK, L 362 sra L, 3, L 363#endif 364 cmp L, 0 365 ble,pn %icc, .LL15 366 nop 367 .align 4 368 369.LL13: 370 FMADD1 (aa1, bb1, cc01, cc01) 371 FMADD2 (aa2, bb1, cc02, cc02) 372 FMADD3 (aa1, bb2, cc03, cc03) 373 FMADD4 (aa2, bb2, cc04, cc04) 374 375 FMADD1 (aa1, bb3, cc05, cc05) 376 LDF [BO + 16 * SIZE], b1 377 FMADD2 (aa2, bb3, cc06, cc06) 378 LDF [BO + 9 * SIZE], b2 379 380 FMADD3 (aa1, bb4, cc07, cc07) 381 LDF [BO + 10 * SIZE], b3 382 FMADD4 (aa2, bb4, cc08, cc08) 383 LDF [BO + 11 * SIZE], b4 384 385 FMADD1 (aa1, bb5, cc09, cc09) 386 LDF [AO + 2 * SIZE], a3 387 FMADD2 (aa2, bb5, cc10, cc10) 388 LDF [AO + 3 * SIZE], a4 389 390 FMADD3 (aa1, bb6, cc11, cc11) 391 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 392 FMADD4 (aa2, bb6, cc12, cc12) 393 nop 394 395 FMADD1 (aa1, bb7, cc13, cc13) 396 LDF [BO + 12 * SIZE], b5 397 FMADD2 (aa2, bb7, cc14, cc14) 398 LDF [BO + 13 * SIZE], b6 399 400 FMADD3 (aa1, bb8, cc15, cc15) 401 LDF [BO + 14 * SIZE], b7 402 FMADD4 (aa2, bb8, cc16, cc16) 403 LDF [BO + 15 * SIZE], b8 404 405 FMADD1 (aa3, bb9, cc01, cc01) 406 FMADD2 (aa4, bb9, cc02, cc02) 407 FMADD3 (aa3, bb2, cc03, cc03) 408 FMADD4 (aa4, bb2, cc04, cc04) 409 410 FMADD1 (aa3, bb3, cc05, cc05) 411 LDF [BO + 24 * SIZE], b9 412 FMADD2 (aa4, bb3, cc06, cc06) 413 LDF [BO + 17 * SIZE], b2 414 415 FMADD3 (aa3, bb4, cc07, cc07) 416 LDF [BO + 18 * SIZE], b3 417 FMADD4 (aa4, bb4, cc08, cc08) 418 LDF [BO + 19 * SIZE], b4 419 420 FMADD1 (aa3, bb5, cc09, cc09) 421 LDF [AO + 4 * SIZE], a1 422 FMADD2 (aa4, bb5, cc10, cc10) 423 LDF [AO + 5 * SIZE], a2 424 425 FMADD3 (aa3, bb6, cc11, cc11) 426 add L, -1, L 427 FMADD4 (aa4, bb6, cc12, cc12) 428 nop 429 430 FMADD1 (aa3, bb7, cc13, cc13) 431 LDF [BO + 20 * SIZE], b5 432 FMADD2 (aa4, bb7, cc14, cc14) 433 LDF [BO + 21 * SIZE], b6 434 435 FMADD3 (aa3, bb8, cc15, cc15) 436 LDF [BO + 22 * SIZE], b7 437 FMADD4 (aa4, bb8, cc16, cc16) 438 LDF [BO + 23 * SIZE], b8 
439 440 FMADD1 (aa1, bb1, cc01, cc01) 441 FMADD2 (aa2, bb1, cc02, cc02) 442 FMADD3 (aa1, bb2, cc03, cc03) 443 FMADD4 (aa2, bb2, cc04, cc04) 444 445 FMADD1 (aa1, bb3, cc05, cc05) 446 LDF [BO + 32 * SIZE], b1 447 FMADD2 (aa2, bb3, cc06, cc06) 448 LDF [BO + 25 * SIZE], b2 449 450 FMADD3 (aa1, bb4, cc07, cc07) 451 LDF [BO + 26 * SIZE], b3 452 FMADD4 (aa2, bb4, cc08, cc08) 453 LDF [BO + 27 * SIZE], b4 454 455 FMADD1 (aa1, bb5, cc09, cc09) 456 LDF [AO + 6 * SIZE], a3 457 FMADD2 (aa2, bb5, cc10, cc10) 458 LDF [AO + 7 * SIZE], a4 459 460 FMADD3 (aa1, bb6, cc11, cc11) 461 nop 462 FMADD4 (aa2, bb6, cc12, cc12) 463 nop 464 465 FMADD1 (aa1, bb7, cc13, cc13) 466 LDF [BO + 28 * SIZE], b5 467 FMADD2 (aa2, bb7, cc14, cc14) 468 LDF [BO + 29 * SIZE], b6 469 470 FMADD3 (aa1, bb8, cc15, cc15) 471 LDF [BO + 30 * SIZE], b7 472 FMADD4 (aa2, bb8, cc16, cc16) 473 LDF [BO + 31 * SIZE], b8 474 475 FMADD1 (aa3, bb9, cc01, cc01) 476 FMADD2 (aa4, bb9, cc02, cc02) 477 FMADD3 (aa3, bb2, cc03, cc03) 478 FMADD4 (aa4, bb2, cc04, cc04) 479 480 FMADD1 (aa3, bb3, cc05, cc05) 481 LDF [BO + 40 * SIZE], b9 482 FMADD2 (aa4, bb3, cc06, cc06) 483 LDF [BO + 33 * SIZE], b2 484 485 FMADD3 (aa3, bb4, cc07, cc07) 486 LDF [BO + 34 * SIZE], b3 487 FMADD4 (aa4, bb4, cc08, cc08) 488 LDF [BO + 35 * SIZE], b4 489 490 FMADD1 (aa3, bb5, cc09, cc09) 491 LDF [AO + 16 * SIZE], a1 /****/ 492 FMADD2 (aa4, bb5, cc10, cc10) 493 LDF [AO + 9 * SIZE], a2 494 495 FMADD3 (aa3, bb6, cc11, cc11) 496 nop 497 FMADD4 (aa4, bb6, cc12, cc12) 498 nop 499 500 FMADD1 (aa3, bb7, cc13, cc13) 501 LDF [BO + 36 * SIZE], b5 502 FMADD2 (aa4, bb7, cc14, cc14) 503 LDF [BO + 37 * SIZE], b6 504 505 FMADD3 (aa3, bb8, cc15, cc15) 506 LDF [BO + 38 * SIZE], b7 507 FMADD4 (aa4, bb8, cc16, cc16) 508 LDF [BO + 39 * SIZE], b8 509 510 FMADD1 (aa5, bb1, cc01, cc01) 511 FMADD2 (aa2, bb1, cc02, cc02) 512 FMADD3 (aa5, bb2, cc03, cc03) 513 FMADD4 (aa2, bb2, cc04, cc04) 514 515 FMADD1 (aa5, bb3, cc05, cc05) 516 LDF [BO + 48 * SIZE], b1 517 FMADD2 (aa2, bb3, cc06, 
cc06) 518 LDF [BO + 41 * SIZE], b2 519 520 FMADD3 (aa5, bb4, cc07, cc07) 521 LDF [BO + 42 * SIZE], b3 522 FMADD4 (aa2, bb4, cc08, cc08) 523 LDF [BO + 43 * SIZE], b4 524 525 FMADD1 (aa5, bb5, cc09, cc09) 526 LDF [AO + 10 * SIZE], a3 527 FMADD2 (aa2, bb5, cc10, cc10) 528 LDF [AO + 11 * SIZE], a4 529 530 FMADD3 (aa5, bb6, cc11, cc11) 531 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 532 FMADD4 (aa2, bb6, cc12, cc12) 533 nop 534 535 FMADD1 (aa5, bb7, cc13, cc13) 536 LDF [BO + 44 * SIZE], b5 537 FMADD2 (aa2, bb7, cc14, cc14) 538 LDF [BO + 45 * SIZE], b6 539 540 FMADD3 (aa5, bb8, cc15, cc15) 541 LDF [BO + 46 * SIZE], b7 542 FMADD4 (aa2, bb8, cc16, cc16) 543 LDF [BO + 47 * SIZE], b8 544 545 FMADD1 (aa3, bb9, cc01, cc01) 546 FMADD2 (aa4, bb9, cc02, cc02) 547 FMADD3 (aa3, bb2, cc03, cc03) 548 FMADD4 (aa4, bb2, cc04, cc04) 549 550 FMADD1 (aa3, bb3, cc05, cc05) 551 LDF [BO + 56 * SIZE], b9 552 FMADD2 (aa4, bb3, cc06, cc06) 553 LDF [BO + 49 * SIZE], b2 554 555 FMADD3 (aa3, bb4, cc07, cc07) 556 LDF [BO + 50 * SIZE], b3 557 FMADD4 (aa4, bb4, cc08, cc08) 558 LDF [BO + 51 * SIZE], b4 559 560 FMADD1 (aa3, bb5, cc09, cc09) 561 LDF [AO + 12 * SIZE], a5 562 FMADD2 (aa4, bb5, cc10, cc10) 563 LDF [AO + 13 * SIZE], a2 564 565 FMADD3 (aa3, bb6, cc11, cc11) 566 cmp L, 0 567 FMADD4 (aa4, bb6, cc12, cc12) 568 nop 569 570 FMADD1 (aa3, bb7, cc13, cc13) 571 LDF [BO + 52 * SIZE], b5 572 FMADD2 (aa4, bb7, cc14, cc14) 573 LDF [BO + 53 * SIZE], b6 574 575 FMADD3 (aa3, bb8, cc15, cc15) 576 LDF [BO + 54 * SIZE], b7 577 FMADD4 (aa4, bb8, cc16, cc16) 578 LDF [BO + 55 * SIZE], b8 579 580 FMADD1 (aa5, bb1, cc01, cc01) 581 FMADD2 (aa2, bb1, cc02, cc02) 582 FMADD3 (aa5, bb2, cc03, cc03) 583 FMADD4 (aa2, bb2, cc04, cc04) 584 585 FMADD1 (aa5, bb3, cc05, cc05) 586 LDF [BO + 64 * SIZE], b1 587 FMADD2 (aa2, bb3, cc06, cc06) 588 LDF [BO + 57 * SIZE], b2 589 590 FMADD3 (aa5, bb4, cc07, cc07) 591 LDF [BO + 58 * SIZE], b3 592 FMADD4 (aa2, bb4, cc08, cc08) 593 LDF [BO + 59 * SIZE], b4 594 595 FMADD1 
(aa5, bb5, cc09, cc09) 596 LDF [AO + 14 * SIZE], a3 597 FMADD2 (aa2, bb5, cc10, cc10) 598 LDF [AO + 15 * SIZE], a4 599 600 FMADD3 (aa5, bb6, cc11, cc11) 601 add BO, 64 * SIZE, BO 602 FMADD4 (aa2, bb6, cc12, cc12) 603 add AO, 16 * SIZE, AO 604 605 FMADD1 (aa5, bb7, cc13, cc13) 606 LDF [BO - 4 * SIZE], b5 607 FMADD2 (aa2, bb7, cc14, cc14) 608 LDF [BO - 3 * SIZE], b6 609 610 FMADD3 (aa5, bb8, cc15, cc15) 611 LDF [BO - 2 * SIZE], b7 612 FMADD4 (aa2, bb8, cc16, cc16) 613 LDF [BO - 1 * SIZE], b8 614 615 FMADD1 (aa3, bb9, cc01, cc01) 616 FMADD2 (aa4, bb9, cc02, cc02) 617 FMADD3 (aa3, bb2, cc03, cc03) 618 FMADD4 (aa4, bb2, cc04, cc04) 619 620 FMADD1 (aa3, bb3, cc05, cc05) 621 LDF [BO + 8 * SIZE], b9 622 FMADD2 (aa4, bb3, cc06, cc06) 623 LDF [BO + 1 * SIZE], b2 624 625 FMADD3 (aa3, bb4, cc07, cc07) 626 LDF [BO + 2 * SIZE], b3 627 FMADD4 (aa4, bb4, cc08, cc08) 628 LDF [BO + 3 * SIZE], b4 629 630 FMADD1 (aa3, bb5, cc09, cc09) 631 LDF [AO + 8 * SIZE], a5 /****/ 632 FMADD2 (aa4, bb5, cc10, cc10) 633 LDF [AO + 1 * SIZE], a2 634 635 FMADD3 (aa3, bb6, cc11, cc11) 636 FMADD4 (aa4, bb6, cc12, cc12) 637 638 FMADD1 (aa3, bb7, cc13, cc13) 639 LDF [BO + 4 * SIZE], b5 640 FMADD2 (aa4, bb7, cc14, cc14) 641 LDF [BO + 5 * SIZE], b6 642 643 FMADD3 (aa3, bb8, cc15, cc15) 644 LDF [BO + 6 * SIZE], b7 645 FMADD4 (aa4, bb8, cc16, cc16) 646 ble,pn %icc, .LL15 647 LDF [BO + 7 * SIZE], b8 648 649 FMADD1 (aa1, bb1, cc01, cc01) 650 FMADD2 (aa2, bb1, cc02, cc02) 651 FMADD3 (aa1, bb2, cc03, cc03) 652 FMADD4 (aa2, bb2, cc04, cc04) 653 654 FMADD1 (aa1, bb3, cc05, cc05) 655 LDF [BO + 16 * SIZE], b1 656 FMADD2 (aa2, bb3, cc06, cc06) 657 LDF [BO + 9 * SIZE], b2 658 659 FMADD3 (aa1, bb4, cc07, cc07) 660 LDF [BO + 10 * SIZE], b3 661 FMADD4 (aa2, bb4, cc08, cc08) 662 LDF [BO + 11 * SIZE], b4 663 664 FMADD1 (aa1, bb5, cc09, cc09) 665 LDF [AO + 2 * SIZE], a3 666 FMADD2 (aa2, bb5, cc10, cc10) 667 LDF [AO + 3 * SIZE], a4 668 669 FMADD3 (aa1, bb6, cc11, cc11) 670 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], 
APREFETCH_CATEGORY 671 FMADD4 (aa2, bb6, cc12, cc12) 672 nop 673 674 FMADD1 (aa1, bb7, cc13, cc13) 675 LDF [BO + 12 * SIZE], b5 676 FMADD2 (aa2, bb7, cc14, cc14) 677 LDF [BO + 13 * SIZE], b6 678 679 FMADD3 (aa1, bb8, cc15, cc15) 680 LDF [BO + 14 * SIZE], b7 681 FMADD4 (aa2, bb8, cc16, cc16) 682 LDF [BO + 15 * SIZE], b8 683 684 FMADD1 (aa3, bb9, cc01, cc01) 685 FMADD2 (aa4, bb9, cc02, cc02) 686 FMADD3 (aa3, bb2, cc03, cc03) 687 FMADD4 (aa4, bb2, cc04, cc04) 688 689 FMADD1 (aa3, bb3, cc05, cc05) 690 LDF [BO + 24 * SIZE], b9 691 FMADD2 (aa4, bb3, cc06, cc06) 692 LDF [BO + 17 * SIZE], b2 693 694 FMADD3 (aa3, bb4, cc07, cc07) 695 LDF [BO + 18 * SIZE], b3 696 FMADD4 (aa4, bb4, cc08, cc08) 697 LDF [BO + 19 * SIZE], b4 698 699 FMADD1 (aa3, bb5, cc09, cc09) 700 LDF [AO + 4 * SIZE], a1 701 FMADD2 (aa4, bb5, cc10, cc10) 702 LDF [AO + 5 * SIZE], a2 703 704 FMADD3 (aa3, bb6, cc11, cc11) 705 add L, -1, L 706 FMADD4 (aa4, bb6, cc12, cc12) 707 nop 708 709 FMADD1 (aa3, bb7, cc13, cc13) 710 LDF [BO + 20 * SIZE], b5 711 FMADD2 (aa4, bb7, cc14, cc14) 712 LDF [BO + 21 * SIZE], b6 713 714 FMADD3 (aa3, bb8, cc15, cc15) 715 LDF [BO + 22 * SIZE], b7 716 FMADD4 (aa4, bb8, cc16, cc16) 717 LDF [BO + 23 * SIZE], b8 718 719 FMADD1 (aa1, bb1, cc01, cc01) 720 FMADD2 (aa2, bb1, cc02, cc02) 721 FMADD3 (aa1, bb2, cc03, cc03) 722 FMADD4 (aa2, bb2, cc04, cc04) 723 724 FMADD1 (aa1, bb3, cc05, cc05) 725 LDF [BO + 32 * SIZE], b1 726 FMADD2 (aa2, bb3, cc06, cc06) 727 LDF [BO + 25 * SIZE], b2 728 729 FMADD3 (aa1, bb4, cc07, cc07) 730 LDF [BO + 26 * SIZE], b3 731 FMADD4 (aa2, bb4, cc08, cc08) 732 LDF [BO + 27 * SIZE], b4 733 734 FMADD1 (aa1, bb5, cc09, cc09) 735 LDF [AO + 6 * SIZE], a3 736 FMADD2 (aa2, bb5, cc10, cc10) 737 LDF [AO + 7 * SIZE], a4 738 739 FMADD3 (aa1, bb6, cc11, cc11) 740 nop 741 FMADD4 (aa2, bb6, cc12, cc12) 742 nop 743 744 FMADD1 (aa1, bb7, cc13, cc13) 745 LDF [BO + 28 * SIZE], b5 746 FMADD2 (aa2, bb7, cc14, cc14) 747 LDF [BO + 29 * SIZE], b6 748 749 FMADD3 (aa1, bb8, cc15, cc15) 750 LDF 
[BO + 30 * SIZE], b7 751 FMADD4 (aa2, bb8, cc16, cc16) 752 LDF [BO + 31 * SIZE], b8 753 754 FMADD1 (aa3, bb9, cc01, cc01) 755 FMADD2 (aa4, bb9, cc02, cc02) 756 FMADD3 (aa3, bb2, cc03, cc03) 757 FMADD4 (aa4, bb2, cc04, cc04) 758 759 FMADD1 (aa3, bb3, cc05, cc05) 760 LDF [BO + 40 * SIZE], b9 761 FMADD2 (aa4, bb3, cc06, cc06) 762 LDF [BO + 33 * SIZE], b2 763 764 FMADD3 (aa3, bb4, cc07, cc07) 765 LDF [BO + 34 * SIZE], b3 766 FMADD4 (aa4, bb4, cc08, cc08) 767 LDF [BO + 35 * SIZE], b4 768 769 FMADD1 (aa3, bb5, cc09, cc09) 770 LDF [AO + 16 * SIZE], a1 /****/ 771 FMADD2 (aa4, bb5, cc10, cc10) 772 LDF [AO + 9 * SIZE], a2 773 774 FMADD3 (aa3, bb6, cc11, cc11) 775 nop 776 FMADD4 (aa4, bb6, cc12, cc12) 777 nop 778 779 FMADD1 (aa3, bb7, cc13, cc13) 780 LDF [BO + 36 * SIZE], b5 781 FMADD2 (aa4, bb7, cc14, cc14) 782 LDF [BO + 37 * SIZE], b6 783 784 FMADD3 (aa3, bb8, cc15, cc15) 785 LDF [BO + 38 * SIZE], b7 786 FMADD4 (aa4, bb8, cc16, cc16) 787 LDF [BO + 39 * SIZE], b8 788 789 FMADD1 (aa5, bb1, cc01, cc01) 790 FMADD2 (aa2, bb1, cc02, cc02) 791 FMADD3 (aa5, bb2, cc03, cc03) 792 FMADD4 (aa2, bb2, cc04, cc04) 793 794 FMADD1 (aa5, bb3, cc05, cc05) 795 LDF [BO + 48 * SIZE], b1 796 FMADD2 (aa2, bb3, cc06, cc06) 797 LDF [BO + 41 * SIZE], b2 798 799 FMADD3 (aa5, bb4, cc07, cc07) 800 LDF [BO + 42 * SIZE], b3 801 FMADD4 (aa2, bb4, cc08, cc08) 802 LDF [BO + 43 * SIZE], b4 803 804 FMADD1 (aa5, bb5, cc09, cc09) 805 LDF [AO + 10 * SIZE], a3 806 FMADD2 (aa2, bb5, cc10, cc10) 807 LDF [AO + 11 * SIZE], a4 808 809 FMADD3 (aa5, bb6, cc11, cc11) 810 prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY 811 FMADD4 (aa2, bb6, cc12, cc12) 812 nop 813 814 FMADD1 (aa5, bb7, cc13, cc13) 815 LDF [BO + 44 * SIZE], b5 816 FMADD2 (aa2, bb7, cc14, cc14) 817 LDF [BO + 45 * SIZE], b6 818 819 FMADD3 (aa5, bb8, cc15, cc15) 820 LDF [BO + 46 * SIZE], b7 821 FMADD4 (aa2, bb8, cc16, cc16) 822 LDF [BO + 47 * SIZE], b8 823 824 FMADD1 (aa3, bb9, cc01, cc01) 825 FMADD2 (aa4, bb9, cc02, cc02) 826 FMADD3 (aa3, bb2, 
cc03, cc03) 827 FMADD4 (aa4, bb2, cc04, cc04) 828 829 FMADD1 (aa3, bb3, cc05, cc05) 830 LDF [BO + 56 * SIZE], b9 831 FMADD2 (aa4, bb3, cc06, cc06) 832 LDF [BO + 49 * SIZE], b2 833 834 FMADD3 (aa3, bb4, cc07, cc07) 835 LDF [BO + 50 * SIZE], b3 836 FMADD4 (aa4, bb4, cc08, cc08) 837 LDF [BO + 51 * SIZE], b4 838 839 FMADD1 (aa3, bb5, cc09, cc09) 840 LDF [AO + 12 * SIZE], a5 841 FMADD2 (aa4, bb5, cc10, cc10) 842 LDF [AO + 13 * SIZE], a2 843 844 FMADD3 (aa3, bb6, cc11, cc11) 845 cmp L, 0 846 FMADD4 (aa4, bb6, cc12, cc12) 847 nop 848 849 FMADD1 (aa3, bb7, cc13, cc13) 850 LDF [BO + 52 * SIZE], b5 851 FMADD2 (aa4, bb7, cc14, cc14) 852 LDF [BO + 53 * SIZE], b6 853 854 FMADD3 (aa3, bb8, cc15, cc15) 855 LDF [BO + 54 * SIZE], b7 856 FMADD4 (aa4, bb8, cc16, cc16) 857 LDF [BO + 55 * SIZE], b8 858 859 FMADD1 (aa5, bb1, cc01, cc01) 860 FMADD2 (aa2, bb1, cc02, cc02) 861 FMADD3 (aa5, bb2, cc03, cc03) 862 FMADD4 (aa2, bb2, cc04, cc04) 863 864 FMADD1 (aa5, bb3, cc05, cc05) 865 LDF [BO + 64 * SIZE], b1 866 FMADD2 (aa2, bb3, cc06, cc06) 867 LDF [BO + 57 * SIZE], b2 868 869 FMADD3 (aa5, bb4, cc07, cc07) 870 LDF [BO + 58 * SIZE], b3 871 FMADD4 (aa2, bb4, cc08, cc08) 872 LDF [BO + 59 * SIZE], b4 873 874 FMADD1 (aa5, bb5, cc09, cc09) 875 LDF [AO + 14 * SIZE], a3 876 FMADD2 (aa2, bb5, cc10, cc10) 877 LDF [AO + 15 * SIZE], a4 878 879 FMADD3 (aa5, bb6, cc11, cc11) 880 add BO, 64 * SIZE, BO 881 FMADD4 (aa2, bb6, cc12, cc12) 882 add AO, 16 * SIZE, AO 883 884 FMADD1 (aa5, bb7, cc13, cc13) 885 LDF [BO - 4 * SIZE], b5 886 FMADD2 (aa2, bb7, cc14, cc14) 887 LDF [BO - 3 * SIZE], b6 888 889 FMADD3 (aa5, bb8, cc15, cc15) 890 LDF [BO - 2 * SIZE], b7 891 FMADD4 (aa2, bb8, cc16, cc16) 892 LDF [BO - 1 * SIZE], b8 893 894 FMADD1 (aa3, bb9, cc01, cc01) 895 FMADD2 (aa4, bb9, cc02, cc02) 896 FMADD3 (aa3, bb2, cc03, cc03) 897 FMADD4 (aa4, bb2, cc04, cc04) 898 899 FMADD1 (aa3, bb3, cc05, cc05) 900 LDF [BO + 8 * SIZE], b9 901 FMADD2 (aa4, bb3, cc06, cc06) 902 LDF [BO + 1 * SIZE], b2 903 904 FMADD3 (aa3, bb4, cc07, 
cc07) 905 LDF [BO + 2 * SIZE], b3 906 FMADD4 (aa4, bb4, cc08, cc08) 907 LDF [BO + 3 * SIZE], b4 908 909 FMADD1 (aa3, bb5, cc09, cc09) 910 LDF [AO + 8 * SIZE], a5 /****/ 911 FMADD2 (aa4, bb5, cc10, cc10) 912 LDF [AO + 1 * SIZE], a2 913 914 FMADD3 (aa3, bb6, cc11, cc11) 915 FMADD4 (aa4, bb6, cc12, cc12) 916 917 FMADD1 (aa3, bb7, cc13, cc13) 918 LDF [BO + 4 * SIZE], b5 919 FMADD2 (aa4, bb7, cc14, cc14) 920 LDF [BO + 5 * SIZE], b6 921 922 FMADD3 (aa3, bb8, cc15, cc15) 923 LDF [BO + 6 * SIZE], b7 924 FMADD4 (aa4, bb8, cc16, cc16) 925 bg,pt %icc, .LL13 926 LDF [BO + 7 * SIZE], b8 927 .align 4 928 929.LL15: 930#if defined(LT) || defined(RN) 931 and KK, 7, L 932#else 933 sub K, KK, L 934 and L, 7, L 935#endif 936 cmp L, 0 937 ble,a,pn %icc, .LL18 938 nop 939 .align 4 940 941.LL17: 942 FMADD1 (aa1, bb1, cc01, cc01) 943 add L, -1, L 944 FMADD2 (aa2, bb1, cc02, cc02) 945 nop 946 947 FMADD3 (aa1, bb2, cc03, cc03) 948 LDF [BO + 8 * SIZE], b1 949 FMADD4 (aa2, bb2, cc04, cc04) 950 LDF [BO + 9 * SIZE], b2 951 952 FMADD1 (aa1, bb3, cc05, cc05) 953 cmp L, 0 954 FMADD2 (aa2, bb3, cc06, cc06) 955 nop 956 957 FMADD3 (aa1, bb4, cc07, cc07) 958 LDF [BO + 10 * SIZE], b3 959 FMADD4 (aa2, bb4, cc08, cc08) 960 LDF [BO + 11 * SIZE], b4 961 962 FMADD1 (aa1, bb5, cc09, cc09) 963 nop 964 FMADD2 (aa2, bb5, cc10, cc10) 965 nop 966 967 FMADD3 (aa1, bb6, cc11, cc11) 968 LDF [BO + 12 * SIZE], b5 969 FMADD4 (aa2, bb6, cc12, cc12) 970 LDF [BO + 13 * SIZE], b6 971 972 FMADD1 (aa1, bb7, cc13, cc13) 973 add AO, 2 * SIZE, AO 974 FMADD2 (aa2, bb7, cc14, cc14) 975 add BO, 8 * SIZE, BO 976 977 FMADD3 (aa1, bb8, cc15, cc15) 978 LDF [AO + 0 * SIZE], a1 979 FMADD4 (aa2, bb8, cc16, cc16) 980 LDF [AO + 1 * SIZE], a2 981 982 LDF [BO + 6 * SIZE], b7 983 bg,pt %icc, .LL17 984 LDF [BO + 7 * SIZE], b8 985 nop 986 .align 4 987 988.LL18: 989 FADD c01, c04, c01 990 FADD c02, c03, c02 991 FADD c05, c08, c05 992 FADD c06, c07, c06 993 994 FADD c09, c12, c09 995 FADD c10, c11, c10 996 FADD c13, c16, c13 997 FADD c14, c15, 
c14 998 999#if defined(LN) || defined(RT) 1000#ifdef LN 1001 sub KK, 1, TEMP1 1002#else 1003 sub KK, 4, TEMP1 1004#endif 1005 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1006 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 1007 1008 add AORIG, TEMP2, AO 1009 add B, TEMP1, BO 1010#endif 1011 1012#if defined(LN) || defined(LT) 1013 LDF [BO + 0 * SIZE], a1 1014 LDF [BO + 1 * SIZE], a2 1015 LDF [BO + 2 * SIZE], a3 1016 LDF [BO + 3 * SIZE], a4 1017 1018 LDF [BO + 4 * SIZE], b1 1019 LDF [BO + 5 * SIZE], b2 1020 LDF [BO + 6 * SIZE], b3 1021 LDF [BO + 7 * SIZE], b4 1022#else 1023 LDF [AO + 0 * SIZE], a1 1024 LDF [AO + 1 * SIZE], a2 1025 LDF [AO + 2 * SIZE], a3 1026 LDF [AO + 3 * SIZE], a4 1027 1028 LDF [AO + 4 * SIZE], b1 1029 LDF [AO + 5 * SIZE], b2 1030 LDF [AO + 6 * SIZE], b3 1031 LDF [AO + 7 * SIZE], b4 1032#endif 1033 1034 FSUB a1, c01, c01 1035 FSUB a2, c02, c02 1036 FSUB a3, c05, c05 1037 FSUB a4, c06, c06 1038 1039 FSUB b1, c09, c09 1040 FSUB b2, c10, c10 1041 FSUB b3, c13, c13 1042 FSUB b4, c14, c14 1043 1044#if defined(LN) || defined(LT) 1045 LDF [AO + 0 * SIZE], a1 1046 LDF [AO + 1 * SIZE], a2 1047 1048 FMUL a1, c01, b1 1049 FMUL a2, c01, b2 1050 FMUL a1, c05, b3 1051 FMUL a2, c05, b4 1052 FMUL a1, c09, b5 1053 FMUL a2, c09, b6 1054 FMUL a1, c13, b7 1055 FMUL a2, c13, b8 1056 1057#ifndef CONJ 1058 FNMSUB (aa2, cc02, bb1, cc01) 1059 FMADD (aa1, cc02, bb2, cc02) 1060 FNMSUB (aa2, cc06, bb3, cc05) 1061 FMADD (aa1, cc06, bb4, cc06) 1062 FNMSUB (aa2, cc10, bb5, cc09) 1063 FMADD (aa1, cc10, bb6, cc10) 1064 FNMSUB (aa2, cc14, bb7, cc13) 1065 FMADD (aa1, cc14, bb8, cc14) 1066#else 1067 FMADD (aa2, cc02, bb1, cc01) 1068 FMSUB (aa1, cc02, bb2, cc02) 1069 FMADD (aa2, cc06, bb3, cc05) 1070 FMSUB (aa1, cc06, bb4, cc06) 1071 FMADD (aa2, cc10, bb5, cc09) 1072 FMSUB (aa1, cc10, bb6, cc10) 1073 FMADD (aa2, cc14, bb7, cc13) 1074 FMSUB (aa1, cc14, bb8, cc14) 1075#endif 1076#endif 1077 1078#ifdef RN 1079 LDF [BO + 0 * SIZE], b1 1080 LDF [BO + 1 * SIZE], b2 1081 LDF [BO + 2 * SIZE], b3 1082 LDF [BO + 3 * 
SIZE], b4 1083 LDF [BO + 4 * SIZE], b5 1084 LDF [BO + 5 * SIZE], b6 1085 LDF [BO + 6 * SIZE], b7 1086 LDF [BO + 7 * SIZE], b8 1087 1088 FMUL b1, c01, a1 1089 FMUL b2, c01, a2 1090 1091#ifndef CONJ 1092 FNMSUB (bb2, cc02, aa1, cc01) 1093 FMADD (bb1, cc02, aa2, cc02) 1094#else 1095 FMADD (bb2, cc02, aa1, cc01) 1096 FMSUB (bb1, cc02, aa2, cc02) 1097#endif 1098 1099 FNMSUB (bb3, cc01, cc05, cc05) 1100 FNMSUB (bb3, cc02, cc06, cc06) 1101 FNMSUB (bb5, cc01, cc09, cc09) 1102 FNMSUB (bb5, cc02, cc10, cc10) 1103 FNMSUB (bb7, cc01, cc13, cc13) 1104 FNMSUB (bb7, cc02, cc14, cc14) 1105 1106#ifndef CONJ 1107 FMADD (bb4, cc02, cc05, cc05) 1108 FNMSUB (bb4, cc01, cc06, cc06) 1109 FMADD (bb6, cc02, cc09, cc09) 1110 FNMSUB (bb6, cc01, cc10, cc10) 1111 FMADD (bb8, cc02, cc13, cc13) 1112 FNMSUB (bb8, cc01, cc14, cc14) 1113#else 1114 FNMSUB (bb4, cc02, cc05, cc05) 1115 FMADD (bb4, cc01, cc06, cc06) 1116 FNMSUB (bb6, cc02, cc09, cc09) 1117 FMADD (bb6, cc01, cc10, cc10) 1118 FNMSUB (bb8, cc02, cc13, cc13) 1119 FMADD (bb8, cc01, cc14, cc14) 1120#endif 1121 1122 LDF [BO + 10 * SIZE], b1 1123 LDF [BO + 11 * SIZE], b2 1124 LDF [BO + 12 * SIZE], b3 1125 LDF [BO + 13 * SIZE], b4 1126 LDF [BO + 14 * SIZE], b5 1127 LDF [BO + 15 * SIZE], b6 1128 1129 FMUL b1, c05, a1 1130 FMUL b2, c05, a2 1131 1132#ifndef CONJ 1133 FNMSUB (bb2, cc06, aa1, cc05) 1134 FMADD (bb1, cc06, aa2, cc06) 1135#else 1136 FMADD (bb2, cc06, aa1, cc05) 1137 FMSUB (bb1, cc06, aa2, cc06) 1138#endif 1139 1140 FNMSUB (bb3, cc05, cc09, cc09) 1141 FNMSUB (bb3, cc06, cc10, cc10) 1142 FNMSUB (bb5, cc05, cc13, cc13) 1143 FNMSUB (bb5, cc06, cc14, cc14) 1144 1145#ifndef CONJ 1146 FMADD (bb4, cc06, cc09, cc09) 1147 FNMSUB (bb4, cc05, cc10, cc10) 1148 FMADD (bb6, cc06, cc13, cc13) 1149 FNMSUB (bb6, cc05, cc14, cc14) 1150#else 1151 FNMSUB (bb4, cc06, cc09, cc09) 1152 FMADD (bb4, cc05, cc10, cc10) 1153 FNMSUB (bb6, cc06, cc13, cc13) 1154 FMADD (bb6, cc05, cc14, cc14) 1155#endif 1156 1157 LDF [BO + 20 * SIZE], b1 1158 LDF [BO + 21 * SIZE], b2 
1159 LDF [BO + 22 * SIZE], b3 1160 LDF [BO + 23 * SIZE], b4 1161 1162 FMUL b1, c09, a1 1163 FMUL b2, c09, a2 1164 1165#ifndef CONJ 1166 FNMSUB (bb2, cc10, aa1, cc09) 1167 FMADD (bb1, cc10, aa2, cc10) 1168#else 1169 FMADD (bb2, cc10, aa1, cc09) 1170 FMSUB (bb1, cc10, aa2, cc10) 1171#endif 1172 1173 FNMSUB (bb3, cc09, cc13, cc13) 1174 FNMSUB (bb3, cc10, cc14, cc14) 1175 1176#ifndef CONJ 1177 FMADD (bb4, cc10, cc13, cc13) 1178 FNMSUB (bb4, cc09, cc14, cc14) 1179#else 1180 FNMSUB (bb4, cc10, cc13, cc13) 1181 FMADD (bb4, cc09, cc14, cc14) 1182#endif 1183 1184 LDF [BO + 30 * SIZE], b1 1185 LDF [BO + 31 * SIZE], b2 1186 1187 FMUL b1, c13, a1 1188 FMUL b2, c13, a2 1189 1190#ifndef CONJ 1191 FNMSUB (bb2, cc14, aa1, cc13) 1192 FMADD (bb1, cc14, aa2, cc14) 1193#else 1194 FMADD (bb2, cc14, aa1, cc13) 1195 FMSUB (bb1, cc14, aa2, cc14) 1196#endif 1197#endif 1198 1199#ifdef RT 1200 LDF [BO + 30 * SIZE], b1 1201 LDF [BO + 31 * SIZE], b2 1202 LDF [BO + 28 * SIZE], b3 1203 LDF [BO + 29 * SIZE], b4 1204 LDF [BO + 26 * SIZE], b5 1205 LDF [BO + 27 * SIZE], b6 1206 LDF [BO + 24 * SIZE], b7 1207 LDF [BO + 25 * SIZE], b8 1208 1209 FMUL b1, c13, a1 1210 FMUL b2, c13, a2 1211 1212#ifndef CONJ 1213 FNMSUB (bb2, cc14, aa1, cc13) 1214 FMADD (bb1, cc14, aa2, cc14) 1215#else 1216 FMADD (bb2, cc14, aa1, cc13) 1217 FMSUB (bb1, cc14, aa2, cc14) 1218#endif 1219 1220 FNMSUB (bb3, cc13, cc09, cc09) 1221 FNMSUB (bb3, cc14, cc10, cc10) 1222 FNMSUB (bb5, cc13, cc05, cc05) 1223 FNMSUB (bb5, cc14, cc06, cc06) 1224 FNMSUB (bb7, cc13, cc01, cc01) 1225 FNMSUB (bb7, cc14, cc02, cc02) 1226 1227#ifndef CONJ 1228 FMADD (bb4, cc14, cc09, cc09) 1229 FNMSUB (bb4, cc13, cc10, cc10) 1230 FMADD (bb6, cc14, cc05, cc05) 1231 FNMSUB (bb6, cc13, cc06, cc06) 1232 FMADD (bb8, cc14, cc01, cc01) 1233 FNMSUB (bb8, cc13, cc02, cc02) 1234#else 1235 FNMSUB (bb4, cc14, cc09, cc09) 1236 FMADD (bb4, cc13, cc10, cc10) 1237 FNMSUB (bb6, cc14, cc05, cc05) 1238 FMADD (bb6, cc13, cc06, cc06) 1239 FNMSUB (bb8, cc14, cc01, cc01) 1240 FMADD 
(bb8, cc13, cc02, cc02) 1241#endif 1242 1243 LDF [BO + 20 * SIZE], b1 1244 LDF [BO + 21 * SIZE], b2 1245 LDF [BO + 18 * SIZE], b3 1246 LDF [BO + 19 * SIZE], b4 1247 LDF [BO + 16 * SIZE], b5 1248 LDF [BO + 17 * SIZE], b6 1249 1250 FMUL b1, c09, a1 1251 FMUL b2, c09, a2 1252 1253#ifndef CONJ 1254 FNMSUB (bb2, cc10, aa1, cc09) 1255 FMADD (bb1, cc10, aa2, cc10) 1256#else 1257 FMADD (bb2, cc10, aa1, cc09) 1258 FMSUB (bb1, cc10, aa2, cc10) 1259#endif 1260 1261 FNMSUB (bb3, cc09, cc05, cc05) 1262 FNMSUB (bb3, cc10, cc06, cc06) 1263 FNMSUB (bb5, cc09, cc01, cc01) 1264 FNMSUB (bb5, cc10, cc02, cc02) 1265 1266#ifndef CONJ 1267 FMADD (bb4, cc10, cc05, cc05) 1268 FNMSUB (bb4, cc09, cc06, cc06) 1269 FMADD (bb6, cc10, cc01, cc01) 1270 FNMSUB (bb6, cc09, cc02, cc02) 1271#else 1272 FNMSUB (bb4, cc10, cc05, cc05) 1273 FMADD (bb4, cc09, cc06, cc06) 1274 FNMSUB (bb6, cc10, cc01, cc01) 1275 FMADD (bb6, cc09, cc02, cc02) 1276#endif 1277 1278 LDF [BO + 10 * SIZE], b1 1279 LDF [BO + 11 * SIZE], b2 1280 LDF [BO + 8 * SIZE], b3 1281 LDF [BO + 9 * SIZE], b4 1282 1283 FMUL b1, c05, a1 1284 FMUL b2, c05, a2 1285 1286#ifndef CONJ 1287 FNMSUB (bb2, cc06, aa1, cc05) 1288 FMADD (bb1, cc06, aa2, cc06) 1289#else 1290 FMADD (bb2, cc06, aa1, cc05) 1291 FMSUB (bb1, cc06, aa2, cc06) 1292#endif 1293 1294 FNMSUB (bb3, cc05, cc01, cc01) 1295 FNMSUB (bb3, cc06, cc02, cc02) 1296 1297#ifndef CONJ 1298 FMADD (bb4, cc06, cc01, cc01) 1299 FNMSUB (bb4, cc05, cc02, cc02) 1300#else 1301 FNMSUB (bb4, cc06, cc01, cc01) 1302 FMADD (bb4, cc05, cc02, cc02) 1303#endif 1304 1305 LDF [BO + 0 * SIZE], b1 1306 LDF [BO + 1 * SIZE], b2 1307 1308 FMUL b1, c01, a1 1309 FMUL b2, c01, a2 1310 1311#ifndef CONJ 1312 FNMSUB (bb2, cc02, aa1, cc01) 1313 FMADD (bb1, cc02, aa2, cc02) 1314#else 1315 FMADD (bb2, cc02, aa1, cc01) 1316 FMSUB (bb1, cc02, aa2, cc02) 1317#endif 1318#endif 1319 1320#ifdef LN 1321 add C1, -2 * SIZE, C1 1322 add C2, -2 * SIZE, C2 1323 add C3, -2 * SIZE, C3 1324 add C4, -2 * SIZE, C4 1325#endif 1326 1327#if 
defined(LN) || defined(LT) 1328 STF c01, [BO + 0 * SIZE] 1329 STF c02, [BO + 1 * SIZE] 1330 STF c05, [BO + 2 * SIZE] 1331 STF c06, [BO + 3 * SIZE] 1332 1333 STF c09, [BO + 4 * SIZE] 1334 STF c10, [BO + 5 * SIZE] 1335 STF c13, [BO + 6 * SIZE] 1336 STF c14, [BO + 7 * SIZE] 1337#else 1338 STF c01, [AO + 0 * SIZE] 1339 STF c02, [AO + 1 * SIZE] 1340 STF c05, [AO + 2 * SIZE] 1341 STF c06, [AO + 3 * SIZE] 1342 1343 STF c09, [AO + 4 * SIZE] 1344 STF c10, [AO + 5 * SIZE] 1345 STF c13, [AO + 6 * SIZE] 1346 STF c14, [AO + 7 * SIZE] 1347#endif 1348 1349 STF c01, [C1 + 0 * SIZE] 1350 STF c02, [C1 + 1 * SIZE] 1351 STF c05, [C2 + 0 * SIZE] 1352 STF c06, [C2 + 1 * SIZE] 1353 1354 STF c09, [C3 + 0 * SIZE] 1355 STF c10, [C3 + 1 * SIZE] 1356 STF c13, [C4 + 0 * SIZE] 1357 STF c14, [C4 + 1 * SIZE] 1358 1359#ifndef LN 1360 add C1, 2 * SIZE, C1 1361 add C2, 2 * SIZE, C2 1362 add C3, 2 * SIZE, C3 1363 add C4, 2 * SIZE, C4 1364#endif 1365 1366#ifdef RT 1367 sll K, ZBASE_SHIFT, TEMP1 1368 add AORIG, TEMP1, AORIG 1369#endif 1370 1371#if defined(LT) || defined(RN) 1372 sub K, KK, TEMP1 1373 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1374 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 1375 add AO, TEMP2, AO 1376 add BO, TEMP1, BO 1377#endif 1378 1379#ifdef LT 1380 add KK, 1, KK 1381#endif 1382 1383#ifdef LN 1384 sub KK, 1, KK 1385#endif 1386 1387 add I, -1, I 1388 cmp I, 0 1389 bg,pt %icc, .LL12 1390 nop 1391 1392#ifdef LN 1393 sll K, ZBASE_SHIFT + 2, TEMP1 1394 add B, TEMP1, B 1395#endif 1396 1397#if defined(LT) || defined(RN) 1398 mov BO, B 1399#endif 1400 1401#ifdef RN 1402 add KK, 4, KK 1403#endif 1404 1405#ifdef RT 1406 sub KK, 4, KK 1407#endif 1408 1409 add J, -1, J 1410 cmp J, 0 1411 bg,pt %icc, .LL11 1412 nop 1413 .align 4 1414 1415.LL20: 1416 and N, 2, J 1417 cmp J, 0 1418 ble,pn %icc, .LL30 1419 nop 1420 1421#ifdef RT 1422 sll K, ZBASE_SHIFT + 1, TEMP1 1423 sub B, TEMP1, B 1424#endif 1425 1426#ifndef RT 1427 mov C, C1 1428 add C, LDC, C2 1429 add C2, LDC, C 1430#else 1431 sub C, LDC, C2 1432 sub C2, LDC, 
C1 1433 sub C2, LDC, C 1434#endif 1435 1436#ifdef LN 1437 add M, OFFSET, KK 1438#endif 1439 1440#ifdef LT 1441 mov OFFSET, KK 1442#endif 1443 1444#if defined(LN) || defined(RT) 1445 mov A, AORIG 1446#else 1447 mov A, AO 1448#endif 1449 1450 mov M, I 1451 .align 4 1452 1453.LL22: 1454#if defined(LT) || defined(RN) 1455 mov B, BO 1456#else 1457#ifdef LN 1458 sll K, ZBASE_SHIFT, TEMP1 1459 sub AORIG, TEMP1, AORIG 1460#endif 1461 1462 sll KK, ZBASE_SHIFT + 0, TEMP1 1463 sll KK, ZBASE_SHIFT + 1, TEMP2 1464 1465 add AORIG, TEMP1, AO 1466 add B, TEMP2, BO 1467#endif 1468 1469 LDF [AO + 0 * SIZE], a1 1470 LDF [AO + 1 * SIZE], a2 1471 1472 LDF [BO + 0 * SIZE], b1 1473 LDF [BO + 1 * SIZE], b2 1474 LDF [BO + 2 * SIZE], b3 1475 LDF [BO + 3 * SIZE], b4 1476 LDF [BO + 4 * SIZE], b5 1477 FCLR (cc01) 1478 1479 LDF [BO + 5 * SIZE], b6 1480 FCLR (cc02) 1481 LDF [BO + 6 * SIZE], b7 1482 FCLR (cc03) 1483 LDF [BO + 7 * SIZE], b8 1484 FCLR (cc04) 1485 LDF [BO + 8 * SIZE], b9 1486 FCLR (cc05) 1487 1488 prefetch [C1 + 2 * SIZE], 3 1489 FCLR (cc06) 1490 prefetch [C2 + 2 * SIZE], 3 1491 FCLR (cc07) 1492 1493#if defined(LT) || defined(RN) 1494 sra KK, 2, L 1495#else 1496 sub K, KK, L 1497 sra L, 2, L 1498#endif 1499 cmp L, 0 1500 ble,pn %icc, .LL25 1501 FCLR (cc08) 1502 .align 4 1503 1504.LL23: 1505 FMADD1 (aa1, bb1, cc01, cc01) 1506 LDF [AO + 2 * SIZE], a3 1507 FMADD2 (aa2, bb1, cc02, cc02) 1508 LDF [AO + 3 * SIZE], a4 1509 1510 FMADD3 (aa1, bb2, cc03, cc03) 1511 LDF [BO + 16 * SIZE], b1 1512 FMADD4 (aa2, bb2, cc04, cc04) 1513 LDF [BO + 9 * SIZE], b2 1514 1515 FMADD1 (aa1, bb3, cc05, cc05) 1516 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1517 FMADD2 (aa2, bb3, cc06, cc06) 1518 add L, -1, L 1519 1520 FMADD3 (aa1, bb4, cc07, cc07) 1521 LDF [BO + 10 * SIZE], b3 1522 FMADD4 (aa2, bb4, cc08, cc08) 1523 LDF [BO + 11 * SIZE], b4 1524 1525 FMADD1 (aa3, bb5, cc01, cc01) 1526 LDF [AO + 4 * SIZE], a1 1527 FMADD2 (aa4, bb5, cc02, cc02) 1528 LDF [AO + 5 * SIZE], a2 1529 1530 FMADD3 
(aa3, bb6, cc03, cc03) 1531 LDF [BO + 12 * SIZE], b5 1532 FMADD4 (aa4, bb6, cc04, cc04) 1533 LDF [BO + 13 * SIZE], b6 1534 1535 FMADD1 (aa3, bb7, cc05, cc05) 1536 cmp L, 0 1537 FMADD2 (aa4, bb7, cc06, cc06) 1538 add AO, 8 * SIZE, AO 1539 1540 FMADD3 (aa3, bb8, cc07, cc07) 1541 LDF [BO + 14 * SIZE], b7 1542 FMADD4 (aa4, bb8, cc08, cc08) 1543 LDF [BO + 15 * SIZE], b8 1544 1545 FMADD1 (aa1, bb9, cc01, cc01) 1546 LDF [AO - 2 * SIZE], a3 1547 FMADD2 (aa2, bb9, cc02, cc02) 1548 LDF [AO - 1 * SIZE], a4 1549 1550 FMADD3 (aa1, bb2, cc03, cc03) 1551 LDF [BO + 24 * SIZE], b9 1552 FMADD4 (aa2, bb2, cc04, cc04) 1553 LDF [BO + 17 * SIZE], b2 1554 1555 FMADD1 (aa1, bb3, cc05, cc05) 1556 add BO, 16 * SIZE, BO 1557 FMADD2 (aa2, bb3, cc06, cc06) 1558 nop 1559 1560 FMADD3 (aa1, bb4, cc07, cc07) 1561 LDF [BO + 2 * SIZE], b3 1562 FMADD4 (aa2, bb4, cc08, cc08) 1563 LDF [BO + 3 * SIZE], b4 1564 1565 FMADD1 (aa3, bb5, cc01, cc01) 1566 LDF [AO + 0 * SIZE], a1 1567 FMADD2 (aa4, bb5, cc02, cc02) 1568 LDF [AO + 1 * SIZE], a2 1569 FMADD3 (aa3, bb6, cc03, cc03) 1570 LDF [BO + 4 * SIZE], b5 1571 FMADD4 (aa4, bb6, cc04, cc04) 1572 LDF [BO + 5 * SIZE], b6 1573 1574 FMADD1 (aa3, bb7, cc05, cc05) 1575 nop 1576 FMADD2 (aa4, bb7, cc06, cc06) 1577 LDF [BO + 6 * SIZE], b7 1578 1579 FMADD3 (aa3, bb8, cc07, cc07) 1580 FMADD4 (aa4, bb8, cc08, cc08) 1581 bg,pt %icc, .LL23 1582 LDF [BO + 7 * SIZE], b8 1583 .align 4 1584 1585.LL25: 1586#if defined(LT) || defined(RN) 1587 and KK, 3, L 1588#else 1589 sub K, KK, L 1590 and L, 3, L 1591#endif 1592 cmp L, 0 1593 ble,a,pn %icc, .LL28 1594 nop 1595 .align 4 1596 1597.LL27: 1598 FMADD1 (aa1, bb1, cc01, cc01) 1599 add L, -1, L 1600 FMADD2 (aa2, bb1, cc02, cc02) 1601 LDF [BO + 4 * SIZE], b1 1602 1603 FMADD3 (aa1, bb2, cc03, cc03) 1604 add AO, 2 * SIZE, AO 1605 FMADD4 (aa2, bb2, cc04, cc04) 1606 LDF [BO + 5 * SIZE], b2 1607 1608 FMADD1 (aa1, bb3, cc05, cc05) 1609 cmp L, 0 1610 FMADD2 (aa2, bb3, cc06, cc06) 1611 LDF [BO + 6 * SIZE], b3 1612 1613 FMADD3 (aa1, bb4, cc07, 
cc07) 1614 LDF [AO + 0 * SIZE], a1 1615 FMADD4 (aa2, bb4, cc08, cc08) 1616 LDF [AO + 1 * SIZE], a2 1617 1618 LDF [BO + 7 * SIZE], b4 1619 bg,pt %icc, .LL27 1620 add BO, 4 * SIZE, BO 1621 .align 4 1622 1623.LL28: 1624 FADD c01, c04, c01 1625 FADD c02, c03, c02 1626 FADD c05, c08, c05 1627 FADD c06, c07, c06 1628 1629#if defined(LN) || defined(RT) 1630#ifdef LN 1631 sub KK, 1, TEMP1 1632#else 1633 sub KK, 2, TEMP1 1634#endif 1635 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1636 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 1637 1638 add AORIG, TEMP2, AO 1639 add B, TEMP1, BO 1640#endif 1641 1642#if defined(LN) || defined(LT) 1643 LDF [BO + 0 * SIZE], a1 1644 LDF [BO + 1 * SIZE], a2 1645 LDF [BO + 2 * SIZE], a3 1646 LDF [BO + 3 * SIZE], a4 1647#else 1648 LDF [AO + 0 * SIZE], a1 1649 LDF [AO + 1 * SIZE], a2 1650 LDF [AO + 2 * SIZE], a3 1651 LDF [AO + 3 * SIZE], a4 1652#endif 1653 1654 FSUB a1, c01, c01 1655 FSUB a2, c02, c02 1656 FSUB a3, c05, c05 1657 FSUB a4, c06, c06 1658 1659#if defined(LN) || defined(LT) 1660 LDF [AO + 0 * SIZE], a1 1661 LDF [AO + 1 * SIZE], a2 1662 1663 FMUL a1, c01, b1 1664 FMUL a2, c01, b2 1665 FMUL a1, c05, b3 1666 FMUL a2, c05, b4 1667 1668#ifndef CONJ 1669 FNMSUB (aa2, cc02, bb1, cc01) 1670 FMADD (aa1, cc02, bb2, cc02) 1671 FNMSUB (aa2, cc06, bb3, cc05) 1672 FMADD (aa1, cc06, bb4, cc06) 1673#else 1674 FMADD (aa2, cc02, bb1, cc01) 1675 FMSUB (aa1, cc02, bb2, cc02) 1676 FMADD (aa2, cc06, bb3, cc05) 1677 FMSUB (aa1, cc06, bb4, cc06) 1678#endif 1679#endif 1680 1681#ifdef RN 1682 LDF [BO + 0 * SIZE], b1 1683 LDF [BO + 1 * SIZE], b2 1684 LDF [BO + 2 * SIZE], b3 1685 LDF [BO + 3 * SIZE], b4 1686 1687 FMUL b1, c01, a1 1688 FMUL b2, c01, a2 1689 1690#ifndef CONJ 1691 FNMSUB (bb2, cc02, aa1, cc01) 1692 FMADD (bb1, cc02, aa2, cc02) 1693#else 1694 FMADD (bb2, cc02, aa1, cc01) 1695 FMSUB (bb1, cc02, aa2, cc02) 1696#endif 1697 1698 FNMSUB (bb3, cc01, cc05, cc05) 1699 FNMSUB (bb3, cc02, cc06, cc06) 1700 1701#ifndef CONJ 1702 FMADD (bb4, cc02, cc05, cc05) 1703 FNMSUB (bb4, cc01, 
cc06, cc06) 1704#else 1705 FNMSUB (bb4, cc02, cc05, cc05) 1706 FMADD (bb4, cc01, cc06, cc06) 1707#endif 1708 1709 LDF [BO + 6 * SIZE], b1 1710 LDF [BO + 7 * SIZE], b2 1711 1712 FMUL b1, c05, a1 1713 FMUL b2, c05, a2 1714 1715#ifndef CONJ 1716 FNMSUB (bb2, cc06, aa1, cc05) 1717 FMADD (bb1, cc06, aa2, cc06) 1718#else 1719 FMADD (bb2, cc06, aa1, cc05) 1720 FMSUB (bb1, cc06, aa2, cc06) 1721#endif 1722#endif 1723 1724#ifdef RT 1725 LDF [BO + 6 * SIZE], b1 1726 LDF [BO + 7 * SIZE], b2 1727 LDF [BO + 4 * SIZE], b3 1728 LDF [BO + 5 * SIZE], b4 1729 1730 FMUL b1, c05, a1 1731 FMUL b2, c05, a2 1732 1733#ifndef CONJ 1734 FNMSUB (bb2, cc06, aa1, cc05) 1735 FMADD (bb1, cc06, aa2, cc06) 1736#else 1737 FMADD (bb2, cc06, aa1, cc05) 1738 FMSUB (bb1, cc06, aa2, cc06) 1739#endif 1740 1741 FNMSUB (bb3, cc05, cc01, cc01) 1742 FNMSUB (bb3, cc06, cc02, cc02) 1743 1744#ifndef CONJ 1745 FMADD (bb4, cc06, cc01, cc01) 1746 FNMSUB (bb4, cc05, cc02, cc02) 1747#else 1748 FNMSUB (bb4, cc06, cc01, cc01) 1749 FMADD (bb4, cc05, cc02, cc02) 1750#endif 1751 1752 LDF [BO + 0 * SIZE], b1 1753 LDF [BO + 1 * SIZE], b2 1754 1755 FMUL b1, c01, a1 1756 FMUL b2, c01, a2 1757 1758#ifndef CONJ 1759 FNMSUB (bb2, cc02, aa1, cc01) 1760 FMADD (bb1, cc02, aa2, cc02) 1761#else 1762 FMADD (bb2, cc02, aa1, cc01) 1763 FMSUB (bb1, cc02, aa2, cc02) 1764#endif 1765#endif 1766 1767#ifdef LN 1768 add C1, -2 * SIZE, C1 1769 add C2, -2 * SIZE, C2 1770#endif 1771 1772#if defined(LN) || defined(LT) 1773 STF c01, [BO + 0 * SIZE] 1774 STF c02, [BO + 1 * SIZE] 1775 STF c05, [BO + 2 * SIZE] 1776 STF c06, [BO + 3 * SIZE] 1777#else 1778 STF c01, [AO + 0 * SIZE] 1779 STF c02, [AO + 1 * SIZE] 1780 STF c05, [AO + 2 * SIZE] 1781 STF c06, [AO + 3 * SIZE] 1782#endif 1783 1784 STF c01, [C1 + 0 * SIZE] 1785 STF c02, [C1 + 1 * SIZE] 1786 STF c05, [C2 + 0 * SIZE] 1787 STF c06, [C2 + 1 * SIZE] 1788 1789#ifndef LN 1790 add C1, 2 * SIZE, C1 1791 add C2, 2 * SIZE, C2 1792#endif 1793 1794#ifdef RT 1795 sll K, ZBASE_SHIFT, TEMP1 1796 add AORIG, 
TEMP1, AORIG 1797#endif 1798 1799#if defined(LT) || defined(RN) 1800 sub K, KK, TEMP1 1801 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 1802 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 1803 add AO, TEMP2, AO 1804 add BO, TEMP1, BO 1805#endif 1806 1807#ifdef LT 1808 add KK, 1, KK 1809#endif 1810 1811#ifdef LN 1812 sub KK, 1, KK 1813#endif 1814 1815 add I, -1, I 1816 cmp I, 0 1817 bg,pt %icc, .LL22 1818 nop 1819 1820#ifdef LN 1821 sll K, ZBASE_SHIFT + 1, TEMP1 1822 add B, TEMP1, B 1823#endif 1824 1825#if defined(LT) || defined(RN) 1826 mov BO, B 1827#endif 1828 1829#ifdef RN 1830 add KK, 2, KK 1831#endif 1832 1833#ifdef RT 1834 sub KK, 2, KK 1835#endif 1836 .align 4 1837 1838.LL30: 1839 and N, 1, J 1840 cmp J, 0 1841 ble,pn %icc, .LL999 1842 nop 1843 1844#ifdef RT 1845 sll K, ZBASE_SHIFT, TEMP1 1846 sub B, TEMP1, B 1847#endif 1848 1849#ifndef RT 1850 mov C, C1 1851 add C, LDC, C 1852#else 1853 sub C, LDC, C1 1854 sub C, LDC, C 1855#endif 1856 1857#ifdef LN 1858 add M, OFFSET, KK 1859#endif 1860 1861#ifdef LT 1862 mov OFFSET, KK 1863#endif 1864 1865#if defined(LN) || defined(RT) 1866 mov A, AORIG 1867#else 1868 mov A, AO 1869#endif 1870 1871 mov M, I 1872 .align 4 1873 1874.LL32: 1875#if defined(LT) || defined(RN) 1876 mov B, BO 1877#else 1878#ifdef LN 1879 sll K, ZBASE_SHIFT, TEMP1 1880 sub AORIG, TEMP1, AORIG 1881#endif 1882 1883 sll KK, ZBASE_SHIFT + 0, TEMP1 1884 1885 add AORIG, TEMP1, AO 1886 add B, TEMP1, BO 1887#endif 1888 1889 LDF [AO + 0 * SIZE], a1 1890 LDF [AO + 1 * SIZE], a2 1891 LDF [AO + 2 * SIZE], a3 1892 LDF [AO + 3 * SIZE], a4 1893 1894 LDF [BO + 0 * SIZE], b1 1895 LDF [BO + 1 * SIZE], b2 1896 LDF [BO + 2 * SIZE], b3 1897 FCLR (cc01) 1898 LDF [BO + 3 * SIZE], b4 1899 FCLR (cc02) 1900 1901 LDF [BO + 4 * SIZE], b5 1902 FCLR (cc03) 1903 LDF [BO + 5 * SIZE], b6 1904 FCLR (cc04) 1905 LDF [BO + 6 * SIZE], b7 1906 FCLR (cc05) 1907 LDF [BO + 7 * SIZE], b8 1908 FCLR (cc06) 1909 1910 prefetch [C1 + 2 * SIZE], 3 1911 FCLR (cc07) 1912 1913#if defined(LT) || defined(RN) 1914 sra KK, 
2, L 1915#else 1916 sub K, KK, L 1917 sra L, 2, L 1918#endif 1919 cmp L, 0 1920 ble,pn %icc, .LL35 1921 FCLR (cc08) 1922 .align 4 1923 1924.LL33: 1925 FMADD1 (aa1, bb1, cc01, cc01) 1926 prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY 1927 FMADD2 (aa2, bb1, cc02, cc02) 1928 LDF [BO + 8 * SIZE], b1 1929 1930 FMADD3 (aa1, bb2, cc03, cc03) 1931 LDF [AO + 4 * SIZE], a1 1932 FMADD4 (aa2, bb2, cc04, cc04) 1933 LDF [AO + 5 * SIZE], a2 1934 1935 FMADD1 (aa3, bb3, cc01, cc01) 1936 LDF [BO + 9 * SIZE], b2 1937 FMADD2 (aa4, bb3, cc02, cc02) 1938 LDF [BO + 10 * SIZE], b3 1939 1940 FMADD3 (aa3, bb4, cc03, cc03) 1941 LDF [AO + 6 * SIZE], a3 1942 FMADD4 (aa4, bb4, cc04, cc04) 1943 LDF [AO + 7 * SIZE], a4 1944 1945 FMADD1 (aa1, bb5, cc01, cc01) 1946 LDF [BO + 11 * SIZE], b4 1947 FMADD2 (aa2, bb5, cc02, cc02) 1948 LDF [BO + 12 * SIZE], b5 1949 1950 FMADD3 (aa1, bb6, cc03, cc03) 1951 LDF [AO + 8 * SIZE], a1 1952 FMADD4 (aa2, bb6, cc04, cc04) 1953 LDF [AO + 9 * SIZE], a2 1954 1955 FMADD1 (aa3, bb7, cc01, cc01) 1956 LDF [BO + 13 * SIZE], b6 1957 1958 FMADD2 (aa4, bb7, cc02, cc02) 1959 LDF [BO + 14 * SIZE], b7 1960 1961 FMADD3 (aa3, bb8, cc03, cc03) 1962 LDF [AO + 10 * SIZE], a3 1963 FMADD4 (aa4, bb8, cc04, cc04) 1964 LDF [AO + 11 * SIZE], a4 1965 1966 add AO, 8 * SIZE, AO 1967 add L, -1, L 1968 add BO, 8 * SIZE, BO 1969 cmp L, 0 1970 1971 bg,pt %icc, .LL33 1972 LDF [BO + 7 * SIZE], b8 1973 .align 4 1974 1975.LL35: 1976#if defined(LT) || defined(RN) 1977 and KK, 3, L 1978#else 1979 sub K, KK, L 1980 and L, 3, L 1981#endif 1982 cmp L, 0 1983 ble,a,pn %icc, .LL38 1984 nop 1985 .align 4 1986 1987.LL37: 1988 FMADD1 (aa1, bb1, cc01, cc01) 1989 add L, -1, L 1990 FMADD2 (aa2, bb1, cc02, cc02) 1991 LDF [BO + 2 * SIZE], b1 1992 1993 FMADD3 (aa1, bb2, cc03, cc03) 1994 LDF [AO + 2 * SIZE], a1 1995 FMADD4 (aa2, bb2, cc04, cc04) 1996 LDF [AO + 3 * SIZE], a2 1997 1998 add AO, 2 * SIZE, AO 1999 cmp L, 0 2000 add BO, 2 * SIZE, BO 2001 bg,pt %icc, .LL37 2002 LDF [BO + 1 * SIZE], b2 2003 
.align 4
2004
/*
 * .LL38 -- tail of the single-column loop body that starts at .LL32.
 *
 * On entry c01..c04 hold the sums built up by the FMADD1..FMADD4 steps
 * of .LL33 / .LL37.  This section folds them, combines the result with
 * values read from AO and BO, writes it back to the packed buffer and
 * to C1, then advances the pointers and KK for the next pass.
 *
 * NOTE(review): LN / LT / RN / RT and CONJ are build-time selectors
 * defined outside this chunk; presumably exactly one of LN/LT/RN/RT is
 * set per build, and the whole section is presumably the solve step of
 * a complex TRSM kernel -- confirm against the build rules.
 */
2005.LL38:
/* Fold four accumulators into two (c04 into c01, c03 into c02). */
2006 FADD c01, c04, c01
2007 FADD c02, c03, c02
2008
/*
 * LN/RT only: re-derive AO and BO from AORIG and B, both at byte
 * offset (KK - 1) << ZBASE_SHIFT.
 */
2009#if defined(LN) || defined(RT)
2010 sub KK, 1, TEMP1
2011
2012 sll TEMP1, ZBASE_SHIFT, TEMP1
2013
2014 add AORIG, TEMP1, AO
2015 add B, TEMP1, BO
2016#endif
2017
/* Read two values: from BO for LN/LT, from AO for the other variants. */
2018#if defined(LN) || defined(LT)
2019 LDF [BO + 0 * SIZE], a1
2020 LDF [BO + 1 * SIZE], a2
2021#else
2022 LDF [AO + 0 * SIZE], a1
2023 LDF [AO + 1 * SIZE], a2
2024#endif
2025
/*
 * Replace each sum by (loaded value - sum).
 * Assumes FSUB maps to the SPARC fsub form "rs1 - rs2 -> rd" -- the
 * macro itself is defined outside this chunk.
 */
2026 FSUB a1, c01, c01
2027 FSUB a2, c02, c02
2028
/*
 * Read two values from the opposite buffer (AO for LN/LT, BO otherwise).
 * NOTE(review): presumably the (real, imag) pair of the diagonal factor
 * as laid out by the packing routine -- confirm.
 */
2029#if defined(LN) || defined(LT)
2030 LDF [AO + 0 * SIZE], a1
2031 LDF [AO + 1 * SIZE], a2
2032#else
2033 LDF [BO + 0 * SIZE], a1
2034 LDF [BO + 1 * SIZE], a2
2035#endif
2036
/* b1 = a1 * c01, b2 = a2 * c01: the c01 half of the products. */
2037 FMUL a1, c01, b1
2038 FMUL a2, c01, b2
2039
/*
 * Add the c02 half with fused operations, writing c01 and c02.
 * CONJ swaps which fused form is used for each result.
 * NOTE(review): FNMSUB / FMADD / FMSUB are macros defined outside this
 * chunk and take register numbers (aaN/bbN/ccN), not names; the pair
 * presumably completes a complex multiply of (c01, c02) by (a1, a2),
 * conjugated under CONJ -- confirm operand order in the macro source.
 */
2040#ifndef CONJ
2041 FNMSUB (aa2, cc02, bb1, cc01)
2042 FMADD (aa1, cc02, bb2, cc02)
2043#else
2044 FMADD (aa2, cc02, bb1, cc01)
2045 FMSUB (aa1, cc02, bb2, cc02)
2046#endif
2047
/* LN: step C1 back by 2 * SIZE before storing. */
2048#ifdef LN
2049 add C1, -2 * SIZE, C1
2050#endif
2051
/* Write the result back to the buffer it was read from (BO or AO). */
2052#if defined(LN) || defined(LT)
2053 STF c01, [BO + 0 * SIZE]
2054 STF c02, [BO + 1 * SIZE]
2055#else
2056 STF c01, [AO + 0 * SIZE]
2057 STF c02, [AO + 1 * SIZE]
2058#endif
2059
/* Store the same two values to the output column at C1. */
2060 STF c01, [C1 + 0 * SIZE]
2061 STF c02, [C1 + 1 * SIZE]
2062
/* Every variant except LN advances C1 by 2 * SIZE after the store. */
2063#ifndef LN
2064 add C1, 2 * SIZE, C1
2065#endif
2066
/* RT: move AORIG forward by K << ZBASE_SHIFT bytes. */
2067#ifdef RT
2068 sll K, ZBASE_SHIFT, TEMP1
2069 add AORIG, TEMP1, AORIG
2070#endif
2071
/* LT/RN: advance AO and BO by (K - KK) << ZBASE_SHIFT bytes. */
2072#if defined(LT) || defined(RN)
2073 sub K, KK, TEMP1
2074 sll TEMP1, ZBASE_SHIFT, TEMP1
2075 add AO, TEMP1, AO
2076 add BO, TEMP1, BO
2077#endif
2078
/* KK moves by one per pass: up for LT, down for LN. */
2079#ifdef LT
2080 add KK, 1, KK
2081#endif
2082
2083#ifdef LN
2084 sub KK, 1, KK
2085#endif
2086
/* Count I down and loop to .LL32 while I > 0; nop fills the delay slot. */
2087 add I, -1, I
2088 cmp I, 0
2089 bg,pt %icc, .LL32
2090 nop
2091
/*
 * Loop finished: update B and KK.
 *   LN    : B += K << ZBASE_SHIFT
 *   LT/RN : B  = BO
 *   RN    : KK += 1
 *   RT    : KK -= 1
 */
2092#ifdef LN
2093 sll K, ZBASE_SHIFT, TEMP1
2094 add B, TEMP1, B
2095#endif
2096
2097#if defined(LT) || defined(RN)
2098 mov BO, B
2099#endif
2100
2101#ifdef RN
2102 add KK, 1, KK
2103#endif
2104
2105#ifdef RT
2106 sub KK, 1, KK
2107#endif
2108 .align 4
2109
/*
 * .LL999 -- exit path; .LL30 branches here when (N & 1) is zero.
 * clr %o0 sits in the delay slot of return.
 * NOTE(review): on SPARC V9 the delay-slot instruction of RETURN runs
 * after the register window is restored, so this presumably hands the
 * caller a result of 0 -- confirm against the architecture manual.
 * EPILOGUE is a macro defined outside this chunk.
 */
2110.LL999:
2111 return %i7 + 8
2112 clr %o0
2113
2114 EPILOGUE
2115