/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Complex matrix-vector update kernel (SPARC, preprocessed assembly).
 *
 * What the code below does, column by column:
 *     y[0..M-1] += (alpha * x[j]) * A[0..M-1, j]        for j = 0..N-1
 * where every element is a (real, imag) pair of SIZE-byte values, columns
 * of A are LDA apart and consecutive x elements are INCX apart.
 *
 *   - XCONJ defined : alpha * conj(x[j]) is used instead of alpha * x[j].
 *   - CONJ  defined : the sign of every term that involves the imaginary
 *                     part of A is flipped (FSUBX/FADDX swap), i.e. conj(A).
 *
 * Layout of the routine:
 *   prologue     fetch arguments, spill alpha to the stack, scale strides
 *   .LL01        if y is not unit-stride: zero BUFFER and accumulate there
 *   .LL21-.LL29  columns taken two at a time (N >> 1 passes)
 *   .LL31-.LL38  one left-over column when N is odd
 *   .LL990-996   if y is not unit-stride: add BUFFER into the strided y
 *   .LL999       return 0
 *
 * Rows are handled in blocks of 4 (software-pipelined), then 2, then 1.
 *
 * NOTE(review): the instruction order inside the loops is hand scheduled
 * and several values are consumed one stage after they are produced; the
 * instruction following every branch is its delay slot.  Do not reorder.
 *
 * PROLOGUE, SAVESP, EPILOGUE, STACK_START, SIZE, ZBASE_SHIFT, LDF, STF,
 * FADD, FSUB, FMUL and FCLR all come from common.h and are not visible here.
 */

#define ASSEMBLER
#include "common.h"

/* Look-ahead distance, in units of SIZE, used by the prefetch instructions. */
#ifdef DOUBLE
#define PREFETCHSIZE 44
#else
#define PREFETCHSIZE 88
#endif

/*
 * Integer register roles.
 * M and N are used straight from %i0/%i1.  A is %i5 (loaded from the stack
 * only in the 32-bit DOUBLE build).  LDA, X and INCX are loaded from the
 * stack in the prologue, overwriting %i2-%i4.
 */
#define M	%i0
#define N	%i1
#define A	%i5
#define LDA	%i2
#define X	%i3
#define INCX	%i4

/* Loaded from the stack in the prologue. */
#define Y	%l0
#define INCY	%l1
#define BUFFER	%l2

/* Loop counters: I counts row blocks, J counts column passes / fill blocks. */
#define I	%l3
#define J	%l5

/* Column cursors.  A3 and A4 are defined but not referenced below. */
#define A1	%o0
#define A2	%o1
#define A3	%o2
#define A4	%o3

/* Y1: running output cursor.  YY: base of the accumulation target (Y or BUFFER). */
#define Y1	%l4
#define YY	%l6

/*
 * Floating-point register roles.
 *   t1..t4    products in flight
 *   y1..y8    four complex accumulators (re, im interleaved)
 *   a1..a16   elements of A (a1-a4, a9-a12 from A1; a5-a8, a13-a16 from A2)
 *   x1..x4    alpha * x for the current column pair (re, im, re, im)
 *
 * FZERO, ALPHA_R and ALPHA_I use the same registers as a14, a15 and a16,
 * so alpha is re-read from its stack slot at .LL21 / .LL31 before each
 * column pass.
 */
#ifdef DOUBLE
#define t1	%f0
#define	t2 	%f2
#define t3	%f4
#define	t4 	%f6

#define y1	%f8
#define y2	%f10
#define y3	%f12
#define y4	%f14
#define y5	%f16
#define y6	%f18
#define y7	%f20
#define y8	%f22

#define a1	%f24
#define a2	%f26
#define a3	%f28
#define a4	%f30
#define a5	%f32
#define a6	%f34
#define a7	%f36
#define a8	%f38

#define a9	%f40
#define a10	%f42
#define a11	%f44
#define a12	%f46
#define a13	%f48
#define a14	%f50
#define a15	%f52
#define a16	%f54

#define x1	%f56
#define x2	%f58
#define x3	%f60
#define x4	%f62

#define FZERO	%f50
#define ALPHA_R	%f52
#define ALPHA_I	%f54
#else
#define t1	%f0
#define	t2 	%f1
#define t3	%f2
#define	t4 	%f3

#define y1	%f4
#define y2	%f5
#define y3	%f6
#define y4	%f7
#define y5	%f8
#define y6	%f9
#define y7	%f10
#define y8	%f11

#define a1	%f12
#define a2	%f13
#define a3	%f14
#define a4	%f15
#define a5	%f16
#define a6	%f17
#define a7	%f18
#define a8	%f19

#define a9	%f20
#define a10	%f21
#define a11	%f22
#define a12	%f23
#define a13	%f24
#define a14	%f25
#define a15	%f26
#define a16	%f27

#define x1	%f28
#define x2	%f29
#define x3	%f30
#define x4	%f31

#define FZERO	%f25
#define ALPHA_R	%f26
#define ALPHA_I	%f27
#endif

/* Stack slots that hold alpha so it can be reloaded with LDF. */
#ifndef __64BIT__
#define STACK_ALPHA_R	[%sp + STACK_START + 16]
#ifndef DOUBLE
#define STACK_ALPHA_I	[%sp + STACK_START + 20]
#else
#define STACK_ALPHA_I	[%sp + STACK_START + 24]
#endif
#else
#define STACK_ALPHA_R	[%sp + STACK_START + 32]
#define STACK_ALPHA_I	[%sp + STACK_START + 40]
#endif

/*
 * FSUBX / FADDX are applied to the terms that involve the imaginary part
 * of A.  With CONJ defined the two are swapped.
 */
#ifndef CONJ
#define FSUBX	FSUB
#define FADDX	FADD
#else
#define FSUBX	FADD
#define FADDX	FSUB
#endif

	PROLOGUE
	SAVESP

/*
 * Argument fetch.  Three layouts:
 *   32-bit DOUBLE : alpha arrives in %i3, %i4, %i5 (its last word is left
 *                   in place on the stack); A and everything after it is
 *                   read from the stack.
 *   32-bit single : alpha arrives in %i3 / %i4; A stays in %i5.
 *   64-bit        : alpha arrives in FP registers; A stays in %i5.
 * In the 32-bit builds %i3/%i4 are stored before X/INCX are loaded into
 * the same registers.
 */
#ifndef __64BIT__
#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]   /* ALPHA_I */

	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], LDA
	ld	[%sp + STACK_START + 40], X
	ld	[%sp + STACK_START + 44], INCX
	ld	[%sp + STACK_START + 48], Y
	ld	[%sp + STACK_START + 52], INCY
	ld	[%sp + STACK_START + 56], BUFFER
#else
	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]   /* ALPHA_I */

	ld	[%sp + STACK_START + 28], LDA
	ld	[%sp + STACK_START + 32], X
	ld	[%sp + STACK_START + 36], INCX
	ld	[%sp + STACK_START + 40], Y
	ld	[%sp + STACK_START + 44], INCY
	ld	[%sp + STACK_START + 48], BUFFER
#endif
#else
	ldx	[%sp + STACK_START +  56], LDA
	ldx	[%sp + STACK_START +  64], X
	ldx	[%sp + STACK_START +  72], INCX
	ldx	[%sp + STACK_START +  80], Y
	ldx	[%sp + STACK_START +  88], INCY
	ldx	[%sp + STACK_START +  96], BUFFER

#ifdef DOUBLE
	std	%f6, STACK_ALPHA_R
	std	%f8, STACK_ALPHA_I
#else
	st	%f7, STACK_ALPHA_R
	st	%f9, STACK_ALPHA_I
#endif
#endif

	/*
	 * Scale the strides by ZBASE_SHIFT (presumably log2 of the byte size
	 * of one complex element -- confirm in common.h).  The INCX / INCY
	 * shifts sit in the delay slots of the M <= 0 / N <= 0 early exits.
	 */
	sll	LDA, ZBASE_SHIFT, LDA

	cmp	M, 0
	ble	%icc, .LL999
	sll	INCX, ZBASE_SHIFT, INCX

	cmp	N, 0
	ble	%icc, .LL999
	sll	INCY, ZBASE_SHIFT, INCY

	/*
	 * If the scaled INCY equals 2 * SIZE (one re/im pair per step),
	 * accumulate directly into Y: YY = Y is set in the delay slot and the
	 * buffer set-up is skipped.
	 */
	cmp	INCY, 2 * SIZE
	be	%icc, .LL20
	mov	Y, YY

	/*
	 * NOTE(review): FCLR comes from common.h; it is presumably what zeroes
	 * FZERO.  25 matches %f25 in the single build; 19 looks like the
	 * instruction-field encoding of %f50 for the double build -- confirm.
	 */
#ifdef DOUBLE
	FCLR(19)
#else
	FCLR(25)
#endif

	/* J = (M + 3) >> 2 fill blocks; accumulate into BUFFER instead of Y. */
	add	M, 3, J
	sra	J, 2, J
	mov	BUFFER, YY
	mov	BUFFER, Y1

/* Zero-fill BUFFER, 8 * SIZE (four complex elements) per iteration. */
.LL01:
	STF	FZERO, [Y1 + 0 * SIZE]
	nop
	STF	FZERO, [Y1 + 1 * SIZE]
	STF	FZERO, [Y1 + 2 * SIZE]
	STF	FZERO, [Y1 + 3 * SIZE]
	STF	FZERO, [Y1 + 4 * SIZE]
	nop
	STF	FZERO, [Y1 + 5 * SIZE]
	deccc	J
	STF	FZERO, [Y1 + 6 * SIZE]
	nop
	STF	FZERO, [Y1 + 7 * SIZE]
	bg,pn	%icc, .LL01
	add	Y1, 8 * SIZE, Y1		/* delay slot: next fill block */

/* ------------------------------------------------------------------ */
/* Column pairs: J = N >> 1 passes.                                     */
/* ------------------------------------------------------------------ */
.LL20:
	sra	N, 1, J
	cmp	J, 0
	ble,pn	%icc, .LL30
	nop

/*
 * Start of one column-pair pass.
 *   Y1 = YY, A1 = A, A2 = A + LDA, A += 2 * LDA
 *   (x1, x2) = alpha * x[j], (x3, x4) = alpha * x[j + 1]; X advances twice.
 */
.LL21:
	mov	YY, Y1
	mov	A,  A1
	LDF	STACK_ALPHA_R, ALPHA_R
	LDF	STACK_ALPHA_I, ALPHA_I

	add	A,  LDA, A2
	add	A2, LDA, A

	LDF	[X + 0 * SIZE], x1
	LDF	[X + 1 * SIZE], x2
	add	X, INCX, X
	LDF	[X + 0 * SIZE], x3
	LDF	[X + 1 * SIZE], x4
	add	X, INCX, X

	/* The four partial products of each complex multiply, kept in a1..a8. */
	FMUL	ALPHA_R, x1, a1
	FMUL	ALPHA_I, x2, a4
	FMUL	ALPHA_I, x1, a2
	FMUL	ALPHA_R, x2, a3

	FMUL	ALPHA_R, x3, a5
	FMUL	ALPHA_I, x4, a8
	FMUL	ALPHA_I, x3, a6
	FMUL	ALPHA_R, x4, a7

	/* Combine: real = rr -/+ ii, imag = ir +/- ri (signs depend on XCONJ). */
#ifndef XCONJ
	FSUB	a1, a4, x1
	FADD	a2, a3, x2
	FSUB	a5, a8, x3
	FADD	a6, a7, x4
#else
	FADD	a1, a4, x1
	FSUB	a2, a3, x2
	FADD	a5, a8, x3
	FSUB	a6, a7, x4
#endif

	/* I = M >> 2 blocks of four rows; skip to the tails if there are none. */
	sra	M, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL27
	nop

	/* Preload the whole first block of both columns and the first y values. */
	LDF	[A1 +  0 * SIZE], a1
	LDF	[A1 +  1 * SIZE], a2
	LDF	[A1 +  2 * SIZE], a3
	LDF	[A1 +  3 * SIZE], a4

	LDF	[A1 +  4 * SIZE], a9
	LDF	[A1 +  5 * SIZE], a10
	LDF	[A1 +  6 * SIZE], a11
	LDF	[A1 +  7 * SIZE], a12

	LDF	[A2 +  0 * SIZE], a5
	LDF	[A2 +  1 * SIZE], a6
	LDF	[A2 +  2 * SIZE], a7
	LDF	[A2 +  3 * SIZE], a8

	LDF	[A2 +  4 * SIZE], a13
	LDF	[A2 +  5 * SIZE], a14
	LDF	[A2 +  6 * SIZE], a15
	LDF	[A2 +  7 * SIZE], a16

	LDF	[Y1 +  0 * SIZE], y1
	LDF	[Y1 +  1 * SIZE], y2
	LDF	[Y1 +  2 * SIZE], y3


	/*
	 * Pipeline entry.  First products for rows 0-1; if this is the only
	 * block (I becomes 0) go to .LL26, which completes one block without
	 * look-ahead.  The a1 / a3 reloads already target the next block.
	 */
	FMUL	a1,  x1, t1
	deccc	I
	FMUL	a1,  x2, t2
	LDF	[A1 +  8 * SIZE], a1

	FMUL	a3,  x1, t3
	FMUL	a3,  x2, t4
	ble,pn	%icc, .LL26
	LDF	[A1 + 10 * SIZE], a3		/* delay slot */

	/*
	 * Pipeline fill: rows 0-1 (y1..y4) of the first block for both
	 * columns, while loading y4..y8 and the next block's a2, a4..a9, a11.
	 */
	FADD	y1, t1, y1
	LDF	[Y1 +  3 * SIZE], y4
	FMUL	a2,  x2, t1

	FADD	y2, t2, y2
	FMUL	a2,  x1, t2
	LDF	[A1 +  9 * SIZE], a2

	FADD	y3, t3, y3
	LDF	[Y1 +  4 * SIZE], y5
	FMUL	a4,  x2, t3

	FADD	y4, t4, y4
	FMUL	a4,  x1, t4
	LDF	[A1 + 11 * SIZE], a4

	FSUBX	y1, t1, y1
	LDF	[Y1 +  5 * SIZE], y6
	FMUL	a5,  x3, t1

	FADDX	y2, t2, y2
	FMUL	a5,  x4, t2
	LDF	[A2 +  8 * SIZE], a5

	FSUBX	y3, t3, y3
	LDF	[Y1 +  6 * SIZE], y7
	FMUL	a7,  x3, t3

	FADDX	y4, t4, y4
	FMUL	a7,  x4, t4
	LDF	[A2 + 10 * SIZE], a7

	FADD	y1, t1, y1
	LDF	[Y1 +  7 * SIZE], y8
	FMUL	a6,  x4, t1

	FADD	y2, t2, y2
	FMUL	a6,  x3, t2
	LDF	[A2 +  9 * SIZE], a6

	FADD	y3, t3, y3
	FMUL	a8,  x4, t3

	FADD	y4, t4, y4
	FMUL	a8,  x3, t4
	LDF	[A2 + 11 * SIZE], a8

	FSUBX	y1, t1, y1
	FMUL	a9,  x1, t1

	FADDX	y2, t2, y2
	FMUL	a9,  x2, t2
	LDF	[A1 + 12 * SIZE], a9

	FSUBX	y3, t3, y3
	deccc	I			/* second decrement: one block is now in flight */
	FMUL	a11, x1, t3

	FADDX	y4, t4, y4
	FMUL	a11, x2, t4
	ble,pn	%icc, .LL23
	LDF	[A1 + 14 * SIZE], a11		/* delay slot */

/*
 * Steady state: one four-row block (8 * SIZE of A1 / A2 / Y1) per trip.
 * Each trip finishes rows 2-3 (y5..y8) of the current block, stores its
 * y1..y4, then computes rows 0-1 of the next block and stores y5..y8.
 * Every a-register is reloaded for the following block right after its
 * last use; the pointers are bumped late, so the final loads in the body
 * use post-increment offsets.
 */
.LL22:
	FADD	y5, t1, y5
	prefetch [A1 + PREFETCHSIZE * SIZE], 1
	FMUL	a10, x2, t1
	LDF	[Y1 +  7 * SIZE], y8

	FADD	y6, t2, y6
	FMUL	a10, x1, t2
	LDF	[A1 + 13 * SIZE], a10

	FADD	y7, t3, y7
	FMUL	a12, x2, t3
	STF	y1, [Y1 +  0 * SIZE]

	FADD	y8, t4, y8
	FMUL	a12, x1, t4
	LDF	[A1 + 15 * SIZE], a12

	FSUBX	y5, t1, y5
	FMUL	a13, x3, t1
	STF	y2, [Y1 +  1 * SIZE]

	FADDX	y6, t2, y6
	FMUL	a13, x4, t2
	LDF	[A2 + 12 * SIZE], a13

	FSUBX	y7, t3, y7
	FMUL	a15, x3, t3
	STF	y3, [Y1 +  2 * SIZE]

	FADDX	y8, t4, y8
	FMUL	a15, x4, t4
	LDF	[A2 + 14 * SIZE], a15

	FADD	y5, t1, y5
	FMUL	a14, x4, t1
	STF	y4, [Y1 +  3 * SIZE]

	FADD	y6, t2, y6
	FMUL	a14, x3, t2
	LDF	[A2 + 13 * SIZE], a14

	FADD	y7, t3, y7
	FMUL	a16, x4, t3
	LDF	[Y1 +  8 * SIZE], y1

	FADD	y8, t4, y8
	FMUL	a16, x3, t4
	LDF	[A2 + 15 * SIZE], a16

	FSUBX	y5, t1, y5
	FMUL	a1,  x1, t1
	LDF	[Y1 +  9 * SIZE], y2

	FADDX	y6, t2, y6
	FMUL	a1,  x2, t2
	LDF	[A1 + 16 * SIZE], a1

	FSUBX	y7, t3, y7
	FMUL	a3,  x1, t3
	LDF	[Y1 + 10 * SIZE], y3

	FADDX	y8, t4, y8
	FMUL	a3,  x2, t4
	LDF	[A1 + 18 * SIZE], a3

	FADD	y1, t1, y1
	prefetch [A2 + PREFETCHSIZE * SIZE], 1
	FMUL	a2,  x2, t1
	LDF	[Y1 + 11 * SIZE], y4

	FADD	y2, t2, y2
	FMUL	a2,  x1, t2
	LDF	[A1 + 17 * SIZE], a2

	FADD	y3, t3, y3
	FMUL	a4,  x2, t3
	STF	y5, [Y1 +  4 * SIZE]

	FADD	y4, t4, y4
	FMUL	a4,  x1, t4
	LDF	[A1 + 19 * SIZE], a4

	FSUBX	y1, t1, y1
	FMUL	a5,  x3, t1
	STF	y6, [Y1 +  5 * SIZE]

	FADDX	y2, t2, y2
	FMUL	a5,  x4, t2
	LDF	[A2 + 16 * SIZE], a5

	FSUBX	y3, t3, y3
	FMUL	a7,  x3, t3
	STF	y7, [Y1 +  6 * SIZE]

	FADDX	y4, t4, y4
	deccc	I			/* sets the condition tested by the bg below */
	FMUL	a7,  x4, t4
	LDF	[A2 + 18 * SIZE], a7

	FADD	y1, t1, y1
	FMUL	a6,  x4, t1
	STF	y8, [Y1 +  7 * SIZE]

	FADD	y2, t2, y2
	FMUL	a6,  x3, t2
	LDF	[A2 + 17 * SIZE], a6

	FADD	y3, t3, y3
	add	A1, 8 * SIZE, A1
	FMUL	a8,  x4, t3
	LDF	[Y1 + 12 * SIZE], y5

	FADD	y4, t4, y4
	FMUL	a8,  x3, t4
	LDF	[A2 + 19 * SIZE], a8

	FSUBX	y1, t1, y1
	add	A2, 8 * SIZE, A2
	FMUL	a9,  x1, t1
	LDF	[Y1 + 13 * SIZE], y6

	FADDX	y2, t2, y2
	add	Y1, 8 * SIZE, Y1
	FMUL	a9,  x2, t2
	LDF	[A1 + 12 * SIZE], a9		/* A1 already advanced */

	FSUBX	y3, t3, y3
	FMUL	a11, x1, t3
	LDF	[Y1 +  6 * SIZE], y7		/* Y1 already advanced */

	FADDX	y4, t4, y4
	FMUL	a11, x2, t4
	bg,pn	%icc, .LL22
	LDF	[A1 + 14 * SIZE], a11		/* delay slot */

/*
 * Pipeline drain: finish rows 2-3 of the block in flight, store it,
 * advance the pointers and start the products for the last block, which
 * .LL26 completes by fall-through.
 * NOTE(review): the a1 / a3 reloads after the pointer bump read past the
 * last full block; their values are not used on this path.
 */
.LL23:
	FADD	y5, t1, y5
	FMUL	a10, x2, t1
	LDF	[Y1 +  7 * SIZE], y8

	FADD	y6, t2, y6
	FMUL	a10, x1, t2
	LDF	[A1 + 13 * SIZE], a10

	FADD	y7, t3, y7
	FMUL	a12, x2, t3
	STF	y1, [Y1 +  0 * SIZE]

	FADD	y8, t4, y8
	FMUL	a12, x1, t4
	LDF	[A1 + 15 * SIZE], a12

	FSUBX	y5, t1, y5
	FMUL	a13, x3, t1
	STF	y2, [Y1 +  1 * SIZE]

	FADDX	y6, t2, y6
	FMUL	a13, x4, t2
	LDF	[A2 + 12 * SIZE], a13

	FSUBX	y7, t3, y7
	FMUL	a15, x3, t3
	STF	y3, [Y1 +  2 * SIZE]
	FADDX	y8, t4, y8
	FMUL	a15, x4, t4
	LDF	[A2 + 14 * SIZE], a15

	FADD	y5, t1, y5
	FMUL	a14, x4, t1
	STF	y4, [Y1 +  3 * SIZE]
	FADD	y6, t2, y6
	FMUL	a14, x3, t2
	LDF	[A2 + 13 * SIZE], a14

	FADD	y7, t3, y7
	FMUL	a16, x4, t3
	LDF	[Y1 +  8 * SIZE], y1
	FADD	y8, t4, y8
	FMUL	a16, x3, t4
	LDF	[A2 + 15 * SIZE], a16

	FSUBX	y5, t1, y5
	add	A1, 8 * SIZE, A1
	FMUL	a1,  x1, t1
	LDF	[Y1 +  9 * SIZE], y2

	FADDX	y6, t2, y6
	add	A2, 8 * SIZE, A2
	FMUL	a1,  x2, t2
	LDF	[A1 +  8 * SIZE], a1

	FSUBX	y7, t3, y7
	FMUL	a3,  x1, t3
	LDF	[Y1 + 10 * SIZE], y3

	FADDX	y8, t4, y8
	add	Y1, 8 * SIZE, Y1
	FMUL	a3,  x2, t4
	LDF	[A1 + 10 * SIZE], a3

	/* Y1 has moved on, so rows 2-3 of the finished block are at -4..-1. */
	STF	y5, [Y1 -  4 * SIZE]
	STF	y6, [Y1 -  3 * SIZE]
	STF	y7, [Y1 -  2 * SIZE]
	STF	y8, [Y1 -  1 * SIZE]

/*
 * Last four-row block of the pass.  Entered with t1..t4 holding the
 * a1 / a3 products and all remaining a-registers of the block loaded;
 * no look-ahead loads of A are issued here.
 */
.LL26:
	FADD	y1, t1, y1
	LDF	[Y1 +  3 * SIZE], y4
	FMUL	a2,  x2, t1
	FADD	y2, t2, y2
	FMUL	a2,  x1, t2

	FADD	y3, t3, y3
	LDF	[Y1 +  4 * SIZE], y5
	FMUL	a4,  x2, t3
	FADD	y4, t4, y4
	FMUL	a4,  x1, t4

	FSUBX	y1, t1, y1
	LDF	[Y1 +  5 * SIZE], y6
	FMUL	a5,  x3, t1
	FADDX	y2, t2, y2
	FMUL	a5,  x4, t2

	FSUBX	y3, t3, y3
	LDF	[Y1 +  6 * SIZE], y7
	FADDX	y4, t4, y4
	FMUL	a7,  x4, t4

	FADD	y1, t1, y1
	LDF	[Y1 +  7 * SIZE], y8
	FMUL	a7,  x3, t3
	FMUL	a6,  x4, t1
	FADD	y2, t2, y2
	FMUL	a6,  x3, t2

	FADD	y3, t3, y3
	FMUL	a8,  x4, t3
	FADD	y4, t4, y4
	FMUL	a8,  x3, t4

	FSUBX	y1, t1, y1
	FMUL	a9,  x1, t1
	FADDX	y2, t2, y2
	FMUL	a9,  x2, t2

	FSUBX	y3, t3, y3
	FMUL	a11, x1, t3
	FADDX	y4, t4, y4
	FMUL	a11, x2, t4

	FADD	y5, t1, y5
	FMUL	a10, x2, t1
	FADD	y6, t2, y6
	FMUL	a10, x1, t2

	FADD	y7, t3, y7
	FMUL	a12, x2, t3
	FADD	y8, t4, y8
	FMUL	a12, x1, t4

	FSUBX	y5, t1, y5
	FMUL	a13, x3, t1
	FADDX	y6, t2, y6
	FMUL	a13, x4, t2

	FSUBX	y7, t3, y7
	FMUL	a15, x3, t3
	FADDX	y8, t4, y8
	FMUL	a15, x4, t4

	FADD	y5, t1, y5
	FMUL	a14, x4, t1
	FADD	y6, t2, y6
	FMUL	a14, x3, t2

	FADD	y7, t3, y7
	FMUL	a16, x4, t3
	FADD	y8, t4, y8
	FMUL	a16, x3, t4

	STF	y1, [Y1 +  0 * SIZE]
	FSUBX	y5, t1, y5
	STF	y2, [Y1 +  1 * SIZE]
	FADDX	y6, t2, y6
	STF	y3, [Y1 +  2 * SIZE]
	FSUBX	y7, t3, y7
	STF	y4, [Y1 +  3 * SIZE]
	FADDX	y8, t4, y8

	STF	y5, [Y1 +  4 * SIZE]
	add	A1, 8 * SIZE, A1
	STF	y6, [Y1 +  5 * SIZE]
	add	A2, 8 * SIZE, A2
	STF	y7, [Y1 +  6 * SIZE]
	STF	y8, [Y1 +  7 * SIZE]
	add	Y1, 8 * SIZE, Y1

/* Two-row tail (bit 1 of M), both columns. */
.LL27:
	andcc	M, 2, I
	ble,pn	%icc, .LL28
	nop

	LDF	[A1 +  0 * SIZE], a1
	LDF	[A1 +  1 * SIZE], a2
	LDF	[A1 +  2 * SIZE], a3
	LDF	[A1 +  3 * SIZE], a4

	LDF	[Y1 +  0 * SIZE], y1
	LDF	[Y1 +  1 * SIZE], y2
	LDF	[Y1 +  2 * SIZE], y3
	LDF	[Y1 +  3 * SIZE], y4

	FMUL	a1,  x1, t1
	LDF	[A2 +  0 * SIZE], a5
	FMUL	a1,  x2, t2
	LDF	[A2 +  1 * SIZE], a6
	FMUL	a3,  x1, t3
	LDF	[A2 +  2 * SIZE], a7
	FMUL	a3,  x2, t4
	LDF	[A2 +  3 * SIZE], a8

	FADD	y1, t1, y1
	FMUL	a2,  x2, t1
	FADD	y2, t2, y2
	FMUL	a2,  x1, t2

	FADD	y3, t3, y3
	FMUL	a4,  x2, t3
	FADD	y4, t4, y4
	FMUL	a4,  x1, t4

	FSUBX	y1, t1, y1
	FMUL	a5,  x3, t1
	FADDX	y2, t2, y2
	FMUL	a5,  x4, t2

	FSUBX	y3, t3, y3
	FMUL	a7,  x3, t3
	FADDX	y4, t4, y4
	FMUL	a7,  x4, t4

	FADD	y1, t1, y1
	FMUL	a6,  x4, t1
	FADD	y2, t2, y2
	FMUL	a6,  x3, t2

	FADD	y3, t3, y3
	FMUL	a8,  x4, t3
	FADD	y4, t4, y4
	FMUL	a8,  x3, t4

	FSUBX	y1, t1, y1
	FADDX	y2, t2, y2
	FSUBX	y3, t3, y3
	FADDX	y4, t4, y4

	STF	y1, [Y1 +  0 * SIZE]
	add	A1, 4 * SIZE, A1
	STF	y2, [Y1 +  1 * SIZE]
	add	A2, 4 * SIZE, A2
	STF	y3, [Y1 +  2 * SIZE]
	nop
	STF	y4, [Y1 +  3 * SIZE]
	add	Y1, 4 * SIZE, Y1

/* One-row tail (bit 0 of M), both columns.  a3 / a4 hold the A2 element here. */
.LL28:
	andcc	M, 1, I
	ble,pn	%icc, .LL29
	nop

	LDF	[A1 +  0 * SIZE], a1
	LDF	[A1 +  1 * SIZE], a2
	LDF	[A2 +  0 * SIZE], a3
	LDF	[A2 +  1 * SIZE], a4

	LDF	[Y1 +  0 * SIZE], y1
	LDF	[Y1 +  1 * SIZE], y2

	FMUL	a1,  x1, t1
	FMUL	a1,  x2, t2
	FMUL	a2,  x2, t3
	FMUL	a2,  x1, t4

	FADD	y1, t1, y1
	FMUL	a3,  x3, t1
	FADD	y2, t2, y2
	FMUL	a3,  x4, t2

	FSUBX	y1, t3, y1
	FMUL	a4,  x4, t3
	FADDX	y2, t4, y2
	FMUL	a4,  x3, t4

	FADD	y1, t1, y1
	FADD	y2, t2, y2
	FSUBX	y1, t3, y1
	FADDX	y2, t4, y2

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]

/* Next column pair. */
.LL29:
	deccc	J
	bg	%icc, .LL21
	nop


/* ------------------------------------------------------------------ */
/* Left-over single column when N is odd.                               */
/* ------------------------------------------------------------------ */
.LL30:
	andcc	N, 1, J
	ble,pn	%icc, .LL990
	nop

/* Same scheme as the column-pair pass, using only A1 and (x1, x2). */
.LL31:
	mov	YY, Y1
	mov	A,  A1

	LDF	STACK_ALPHA_R, ALPHA_R
	LDF	STACK_ALPHA_I, ALPHA_I

	LDF	[X + 0 * SIZE], x1
	LDF	[X + 1 * SIZE], x2

	FMUL	ALPHA_R, x1, a1		/* AC */
	FMUL	ALPHA_I, x1, a2		/* AD */
	FMUL	ALPHA_R, x2, a3		/* BC */
	FMUL	ALPHA_I, x2, a4		/* BD */

#ifndef XCONJ
	FSUB	a1, a4, x1
	FADD	a2, a3, x2
#else
	FADD	a1, a4, x1
	FSUB	a2, a3, x2
#endif

	sra	M, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL37
	nop

	/* Preload one four-row block of A1 and the matching y values. */
	LDF	[A1 +  0 * SIZE], a1
	LDF	[A1 +  1 * SIZE], a2
	LDF	[A1 +  2 * SIZE], a3
	LDF	[A1 +  3 * SIZE], a4

	LDF	[A1 +  4 * SIZE], a9
	LDF	[A1 +  5 * SIZE], a10
	LDF	[A1 +  6 * SIZE], a11
	LDF	[A1 +  7 * SIZE], a12

	LDF	[Y1 +  0 * SIZE], y1
	LDF	[Y1 +  1 * SIZE], y2
	LDF	[Y1 +  2 * SIZE], y3
	LDF	[Y1 +  3 * SIZE], y4

	LDF	[Y1 +  4 * SIZE], y5
	LDF	[Y1 +  5 * SIZE], y6
	LDF	[Y1 +  6 * SIZE], y7
	LDF	[Y1 +  7 * SIZE], y8

	FMUL	a1,  x1, t1
	deccc	I
	FMUL	a1,  x2, t2
	LDF	[A1 +  8 * SIZE], a1
	FMUL	a3,  x1, t3
	FMUL	a3,  x2, t4
	ble,pn	%icc, .LL33
	LDF	[A1 + 10 * SIZE], a3		/* delay slot */

/*
 * Single-column steady state: one complete four-row block per trip, with
 * the a1 / a3 products for the next block started before the branch.
 */
.LL32:
	FADD	y1, t1, y1
	prefetch [A1 + PREFETCHSIZE * SIZE], 1
	FMUL	a2,  x2, t1
	FADD	y2, t2, y2
	FMUL	a2,  x1, t2
	LDF	[A1 +  9 * SIZE], a2

	FADD	y3, t3, y3
	FMUL	a4,  x2, t3
	FADD	y4, t4, y4
	FMUL	a4,  x1, t4
	LDF	[A1 + 11 * SIZE], a4

	FSUBX	y1, t1, y1
	FMUL	a9,  x1, t1
	FADDX	y2, t2, y2
	FMUL	a9,  x2, t2
	LDF	[A1 + 12 * SIZE], a9

	FSUBX	y3, t3, y3
	FMUL	a11, x1, t3
	FADDX	y4, t4, y4
	FMUL	a11, x2, t4
	LDF	[A1 + 14 * SIZE], a11

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]
	STF	y3, [Y1 +  2 * SIZE]
	STF	y4, [Y1 +  3 * SIZE]

	FADD	y5, t1, y5
	FMUL	a10, x2, t1
	LDF	[Y1 +  8 * SIZE], y1
	FADD	y6, t2, y6
	FMUL	a10, x1, t2
	LDF	[A1 + 13 * SIZE], a10

	FADD	y7, t3, y7
	deccc	I			/* sets the condition tested by the bg below */
	FMUL	a12, x2, t3
	LDF	[Y1 +  9 * SIZE], y2
	FADD	y8, t4, y8
	FMUL	a12, x1, t4
	LDF	[A1 + 15 * SIZE], a12

	FSUBX	y5, t1, y5
	add	A1, 8 * SIZE, A1
	FMUL	a1,  x1, t1
	LDF	[Y1 + 10 * SIZE], y3
	FADDX	y6, t2, y6
	FMUL	a1,  x2, t2
	LDF	[A1 +  8 * SIZE], a1		/* A1 already advanced */

	FSUBX	y7, t3, y7
	FMUL	a3,  x1, t3
	LDF	[Y1 + 11 * SIZE], y4
	FADDX	y8, t4, y8
	FMUL	a3,  x2, t4
	LDF	[A1 + 10 * SIZE], a3

	STF	y5, [Y1 +  4 * SIZE]
	STF	y6, [Y1 +  5 * SIZE]
	STF	y7, [Y1 +  6 * SIZE]
	STF	y8, [Y1 +  7 * SIZE]

	LDF	[Y1 + 12 * SIZE], y5
	LDF	[Y1 + 13 * SIZE], y6
	LDF	[Y1 + 14 * SIZE], y7
	add	Y1, 8 * SIZE, Y1
	bg,pn	%icc, .LL32
	LDF	[Y1 +  7 * SIZE], y8		/* delay slot; Y1 already advanced */

/* Last four-row block of the single column; no look-ahead loads. */
.LL33:
	FADD	y1, t1, y1
	FMUL	a2,  x2, t1
	FADD	y2, t2, y2
	FMUL	a2,  x1, t2

	FADD	y3, t3, y3
	FMUL	a4,  x2, t3
	FADD	y4, t4, y4
	FMUL	a4,  x1, t4

	FSUBX	y1, t1, y1
	FMUL	a9,  x1, t1
	FADDX	y2, t2, y2
	FMUL	a9,  x2, t2

	FSUBX	y3, t3, y3
	FMUL	a11, x1, t3
	FADDX	y4, t4, y4
	FMUL	a11, x2, t4

	FADD	y5, t1, y5
	FMUL	a10, x2, t1
	FADD	y6, t2, y6
	FMUL	a10, x1, t2

	FADD	y7, t3, y7
	FMUL	a12, x2, t3
	FADD	y8, t4, y8
	FMUL	a12, x1, t4

	FSUBX	y5, t1, y5
	FADDX	y6, t2, y6
	FSUBX	y7, t3, y7
	FADDX	y8, t4, y8

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]
	STF	y3, [Y1 +  2 * SIZE]
	STF	y4, [Y1 +  3 * SIZE]

	STF	y5, [Y1 +  4 * SIZE]
	STF	y6, [Y1 +  5 * SIZE]
	STF	y7, [Y1 +  6 * SIZE]
	STF	y8, [Y1 +  7 * SIZE]

	add	A1, 8 * SIZE, A1
	add	Y1, 8 * SIZE, Y1


/* Two-row tail (bit 1 of M), single column. */
.LL37:
	andcc	M, 2, I
	ble,pn	%icc, .LL38
	nop

	LDF	[A1 +  0 * SIZE], a1
	LDF	[A1 +  1 * SIZE], a2
	LDF	[A1 +  2 * SIZE], a3
	LDF	[A1 +  3 * SIZE], a4

	LDF	[Y1 +  0 * SIZE], y1
	FMUL	a1,  x1, t1
	LDF	[Y1 +  1 * SIZE], y2
	FMUL	a1,  x2, t2
	LDF	[Y1 +  2 * SIZE], y3
	FMUL	a3,  x1, t3
	LDF	[Y1 +  3 * SIZE], y4
	FMUL	a3,  x2, t4

	FADD	y1, t1, y1
	FMUL	a2,  x2, t1
	FADD	y2, t2, y2
	FMUL	a2,  x1, t2
	FADD	y3, t3, y3
	FMUL	a4,  x2, t3
	FADD	y4, t4, y4
	FMUL	a4,  x1, t4

	FSUBX	y1, t1, y1
	FADDX	y2, t2, y2
	FSUBX	y3, t3, y3
	FADDX	y4, t4, y4

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]
	STF	y3, [Y1 +  2 * SIZE]
	STF	y4, [Y1 +  3 * SIZE]

	add	A1, 4 * SIZE, A1
	add	Y1, 4 * SIZE, Y1

/* One-row tail (bit 0 of M), single column. */
.LL38:
	andcc	M, 1, I
	ble,pn	%icc, .LL990
	nop

	LDF	[A1 +  0 * SIZE], a1
	LDF	[A1 +  1 * SIZE], a2
	LDF	[Y1 +  0 * SIZE], y1
	LDF	[Y1 +  1 * SIZE], y2

	FMUL	a1,  x1, t1
	FMUL	a1,  x2, t2
	FMUL	a2,  x2, t3
	FMUL	a2,  x1, t4

	FADD	y1, t1, y1
	FADD	y2, t2, y2
	FSUBX	y1, t3, y1
	FADDX	y2, t4, y2

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]

/* ------------------------------------------------------------------ */
/* Write-back: only when the result was accumulated in BUFFER.          */
/* Y is the read cursor, Y1 the write cursor; both step by INCY.        */
/* ------------------------------------------------------------------ */
.LL990:
	cmp	INCY, 2 * SIZE
	be	%icc, .LL999
	mov	Y, Y1				/* delay slot */

	sra	M, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL995
	nop

/* Four complex elements per trip: y[i] = y[i] + BUFFER[i]. */
.LL991:
	LDF	[BUFFER +  0 * SIZE], a1
	LDF	[BUFFER +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], y1
	LDF	[Y +  1 * SIZE], y2
	add	Y, INCY, Y

	LDF	[BUFFER +  2 * SIZE], a3
	LDF	[BUFFER +  3 * SIZE], a4
	LDF	[Y +  0 * SIZE], y3
	LDF	[Y +  1 * SIZE], y4
	add	Y, INCY, Y

	LDF	[BUFFER +  4 * SIZE], a5
	LDF	[BUFFER +  5 * SIZE], a6
	LDF	[Y +  0 * SIZE], y5
	LDF	[Y +  1 * SIZE], y6
	add	Y, INCY, Y

	LDF	[BUFFER +  6 * SIZE], a7
	LDF	[BUFFER +  7 * SIZE], a8
	LDF	[Y +  0 * SIZE], y7
	LDF	[Y +  1 * SIZE], y8
	add	Y, INCY, Y

	FADD	y1, a1, y1
	FADD	y2, a2, y2
	FADD	y3, a3, y3
	FADD	y4, a4, y4
	FADD	y5, a5, y5
	FADD	y6, a6, y6
	FADD	y7, a7, y7
	FADD	y8, a8, y8

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]
	add	Y1, INCY, Y1
	STF	y3, [Y1 +  0 * SIZE]
	STF	y4, [Y1 +  1 * SIZE]
	add	Y1, INCY, Y1
	STF	y5, [Y1 +  0 * SIZE]
	STF	y6, [Y1 +  1 * SIZE]
	add	Y1, INCY, Y1
	STF	y7, [Y1 +  0 * SIZE]
	STF	y8, [Y1 +  1 * SIZE]
	add	Y1, INCY, Y1

	deccc	I
	bg,pn	%icc, .LL991
	add	BUFFER, 8 * SIZE, BUFFER	/* delay slot */

/* Two-element tail of the write-back. */
.LL995:
	andcc	M, 2, I
	ble,pn	%icc, .LL996
	nop

	LDF	[BUFFER +  0 * SIZE], a1
	LDF	[BUFFER +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], y1
	LDF	[Y +  1 * SIZE], y2
	add	Y, INCY, Y

	LDF	[BUFFER +  2 * SIZE], a3
	LDF	[BUFFER +  3 * SIZE], a4
	LDF	[Y +  0 * SIZE], y3
	LDF	[Y +  1 * SIZE], y4
	add	Y, INCY, Y

	FADD	y1, a1, y1
	FADD	y2, a2, y2
	FADD	y3, a3, y3
	FADD	y4, a4, y4

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]
	add	Y1, INCY, Y1
	STF	y3, [Y1 +  0 * SIZE]
	STF	y4, [Y1 +  1 * SIZE]
	add	Y1, INCY, Y1

	add	BUFFER, 4 * SIZE, BUFFER

/* One-element tail of the write-back. */
.LL996:
	andcc	M, 1, I
	ble,pn	%icc, .LL999
	nop

	LDF	[BUFFER +  0 * SIZE], a1
	LDF	[BUFFER +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], y1
	LDF	[Y +  1 * SIZE], y2

	FADD	y1, a1, y1
	FADD	y2, a2, y2

	STF	y1, [Y1 +  0 * SIZE]
	STF	y2, [Y1 +  1 * SIZE]

/*
 * Common exit.  The clr sits in the delay slot of the return.
 * NOTE(review): per SPARC V9 RETURN semantics the window is presumably
 * already restored when the delay slot runs, making %o0 = 0 the value
 * seen by the caller -- confirm against the architecture manual.
 */
.LL999:
	return	%i7 + 8
	clr	%o0

	EPILOGUE