/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Complex "transposed" matrix-vector kernel for SPARC.
 *
 * For every column j of A (columns are LDA complex elements apart) the
 * code forms the dot product of that column with X, multiplies the sum
 * by the complex scalar (ALPHA_R, ALPHA_I) and adds it to element j of Y:
 *
 *     Y[j] += alpha * sum_i op(A[i][j]) * op(X[i])
 *
 * where the signs applied to the imaginary parts are selected at build
 * time by CONJ and XCONJ.
 *
 * Work is organised as follows:
 *   - rows are processed in panels of at most P rows (.LL10 ... .LL400);
 *   - if X is not contiguous, the current panel of X is first packed
 *     into BUFFER (.LL11 / .LL16);
 *   - columns are handled four at a time (.LL110), then an optional pair
 *     (.LL200) and an optional single column (.LL300);
 *   - inside each column group, rows are handled four at a time in a
 *     software-pipelined loop, followed by a remainder loop.
 *
 * Software pipelining: every FMUL writes one of t1..t4 and the matching
 * FADD/FADDX that consumes it appears one group later.  t1..t4 are zero
 * when a column group starts, so the first additions are no-ops, and the
 * last products are folded in at the start of the .LLx19 epilogues.
 * Branch delay slots are used throughout; the instruction after each
 * branch is part of the logic and must not be moved.
 *
 * All helper macros (PROLOGUE, SAVESP, LDF, STF, FADD, FSUB, FMUL, FMOV,
 * FCLR, SIZE, ZBASE_SHIFT, STACK_START, EPILOGUE) come from common.h.
 */

#define ASSEMBLER
#include "common.h"

/* Maximum number of rows handled per outer pass. */
#define P 4000

/* Incoming arguments (register window after SAVESP). */
#define M	%i0
#define N	%i1
#define A	%i5
#define LDA	%i2
#define X	%i3
#define INCX	%i4

#define Y	%l0
#define INCY	%l1
#define BUFFER	%l2

/* Loop state: I/J are counters, IS is the first row of the current    */
/* panel, MIN_M its height, XP the contiguous X data for the panel.    */
#define I	%l3
#define IS	%l4
#define J	%l5
#define MIN_M	%l6
#define XP	%l7

/* Column pointers for up to four columns processed together. */
#define A1	%o0
#define A2	%o1
#define A3	%o2
#define A4	%o3

/* X1 walks the X panel, Y1 reads Y, Y2 writes Y.                       */
/* PNLDA is the adjustment that moves A from "past the last column"     */
/* back to the first column, P rows further down.                       */
#define X1	%o4
#define Y1	%o5
#define PNLDA	%g1
#define Y2	%o7	/* Danger? */

/* t1..t4: in-flight products, c1..c16: accumulators,                   */
/* a1..a8: elements of A (or of Y in the epilogues), b1..b4: X / alpha. */
#ifdef DOUBLE
#define t1	%f0
#define t2	%f2
#define t3	%f4
#define t4	%f6

#define c1	%f8
#define c2	%f10
#define c3	%f12
#define c4	%f14
#define c5	%f16
#define c6	%f18
#define c7	%f20
#define c8	%f22
#define c9	%f24
#define c10	%f26
#define c11	%f28
#define c12	%f30
#define c13	%f32
#define c14	%f34
#define c15	%f36
#define c16	%f38

#define a1	%f40
#define a2	%f42
#define a3	%f44
#define a4	%f46
#define a5	%f48
#define a6	%f50
#define a7	%f52
#define a8	%f54

#define b1	%f56
#define b2	%f58
#define b3	%f60
#define b4	%f62
#else
#define t1	%f0
#define t2	%f1
#define t3	%f2
#define t4	%f3

#define c1	%f4
#define c2	%f5
#define c3	%f6
#define c4	%f7
#define c5	%f8
#define c6	%f9
#define c7	%f10
#define c8	%f11
#define c9	%f12
#define c10	%f13
#define c11	%f14
#define c12	%f15
#define c13	%f16
#define c14	%f17
#define c15	%f18
#define c16	%f19

#define a1	%f20
#define a2	%f21
#define a3	%f22
#define a4	%f23
#define a5	%f24
#define a6	%f25
#define a7	%f26
#define a8	%f27

#define b1	%f28
#define b2	%f29
#define b3	%f30
#define b4	%f31
#endif

/* Stack slots from which alpha is reloaded with LDF in the epilogues. */
#ifndef __64BIT__
#define ALPHA_R	[%sp + STACK_START + 16]
#ifndef DOUBLE
#define ALPHA_I	[%sp + STACK_START + 20]
#else
#define ALPHA_I	[%sp + STACK_START + 24]
#endif
#else
#define ALPHA_R	[%sp + STACK_START + 32]
#define ALPHA_I	[%sp + STACK_START + 40]
#endif

/* Prefetch distance, in units of SIZE, ahead of each column pointer. */
#ifdef DOUBLE
#define PREFETCHSIZE 18
#else
#define PREFETCHSIZE 36
#endif

	PROLOGUE
	SAVESP
	nop

	/* Spill alpha to its stack slots and fetch the arguments that     */
	/* did not arrive in the register aliases defined above.           */
#ifndef __64BIT__

#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]	/* ALPHA_I */

	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], LDA
	ld	[%sp + STACK_START + 40], X
	ld	[%sp + STACK_START + 44], INCX
	ld	[%sp + STACK_START + 48], Y
	ld	[%sp + STACK_START + 52], INCY
	ld	[%sp + STACK_START + 56], BUFFER
#else
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]	/* ALPHA_I */

	ld	[%sp + STACK_START + 28], LDA
	ld	[%sp + STACK_START + 32], X
	ld	[%sp + STACK_START + 36], INCX
	ld	[%sp + STACK_START + 40], Y
	ld	[%sp + STACK_START + 44], INCY
	ld	[%sp + STACK_START + 48], BUFFER
#endif
#else
	ldx	[%sp + STACK_START + 56], LDA
	ldx	[%sp + STACK_START + 64], X
	ldx	[%sp + STACK_START + 72], INCX
	ldx	[%sp + STACK_START + 80], Y
	ldx	[%sp + STACK_START + 88], INCY
	ldx	[%sp + STACK_START + 96], BUFFER
#ifdef DOUBLE
	std	%f6, ALPHA_R
	std	%f8, ALPHA_I
#else
	st	%f7, ALPHA_R
	st	%f9, ALPHA_I
#endif
#endif

	/* Scale LDA/INCX/INCY by ZBASE_SHIFT (presumably turning element  */
	/* counts into byte strides -- confirm against common.h) and       */
	/* compute PNLDA = (P shifted by ZBASE_SHIFT) - LDA * N.           */
	clr	IS
	mov	P, I
	sll	LDA, ZBASE_SHIFT, LDA
	sll	I, ZBASE_SHIFT, I
	smul	LDA, N, PNLDA
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY
	sub	I, PNLDA, PNLDA

/* ---- Outer loop: one panel of MIN_M = min(M - IS, P) rows ---------- */
.LL10:
	sll	IS, ZBASE_SHIFT, I
	sub	M, IS, MIN_M
	mov	P, J

	cmp	MIN_M, J
	nop
	movg	%icc, J, MIN_M		/* clamp MIN_M to P */
	nop
	cmp	INCX, 2 * SIZE
	beq	.LL100			/* X contiguous: use it in place */
	add	X, I, XP		/* (delay slot) XP = X + panel offset */

	/* X is strided: pack MIN_M elements of X into BUFFER.             */
	sra	MIN_M, 2, I
	mov	BUFFER, XP
	cmp	I, 0
	ble,pn	%icc, .LL15
	mov	BUFFER, Y1		/* (delay slot) Y1 = packing cursor */

/* Pack four complex elements of X per iteration. */
.LL11:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a3
	LDF	[X + 1 * SIZE], a4
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a5
	LDF	[X + 1 * SIZE], a6
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a7
	LDF	[X + 1 * SIZE], a8
	add	X, INCX, X

	STF	a1, [Y1 + 0 * SIZE]
	add	I, -1, I
	STF	a2, [Y1 + 1 * SIZE]
	cmp	I, 0
	STF	a3, [Y1 + 2 * SIZE]
	STF	a4, [Y1 + 3 * SIZE]
	STF	a5, [Y1 + 4 * SIZE]
	STF	a6, [Y1 + 5 * SIZE]
	STF	a7, [Y1 + 6 * SIZE]
	STF	a8, [Y1 + 7 * SIZE]
	bg,pn	%icc, .LL11
	add	Y1, 8 * SIZE, Y1	/* (delay slot) */

/* Pack the remaining MIN_M & 3 elements of X. */
.LL15:
	and	MIN_M, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL100
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	add	I, -1, I
	cmp	I, 0
	nop
	STF	a1, [Y1 + 0 * SIZE]
	STF	a2, [Y1 + 1 * SIZE]
	bg,pn	%icc, .LL16
	add	Y1, 2 * SIZE, Y1	/* (delay slot) */

/* ---- Column groups of four: J = N / 4 ------------------------------ */
.LL100:
	sra	N, 2, J
	cmp	J, 0
	ble	%icc, .LL200
	mov	Y, Y1			/* (delay slot) restart at Y[0] */

/* Accumulators per column: A1 in c1..c4, A2 in c5..c8,                 */
/* A3 in c9..c12, A4 in c13..c16.                                       */
.LL110:
	FCLR(0)

	FMOV	t1, c1
	sra	MIN_M, 2, I
	FMOV	t1, c2
	add	A, LDA, A2
	FMOV	t1, c3
	mov	A, A1
	FMOV	t1, c4
	add	A2, LDA, A3

	FMOV	t1, c5
	FMOV	t1, c6
	FMOV	t1, c7
	FMOV	t1, c8
	FMOV	t1, c9
	FMOV	t1, c10
	FMOV	t1, c11
	FMOV	t1, c12
	FMOV	t1, c13
	FMOV	t1, c14
	FMOV	t1, c15
	FMOV	t1, c16

	add	A3, LDA, A4
	FMOV	t1, t2
	mov	XP, X1
	FMOV	t1, t3
	add	A4, LDA, A		/* A now points at the next column group */
	cmp	I, 0
	ble	%icc, .LL115
	FMOV	t1, t4			/* (delay slot) */

	/* Preload the first row of all four columns and three X values.   */
	LDF	[A1 + 0 * SIZE], a1
	nop
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	LDF	[A3 + 0 * SIZE], a5
	LDF	[A3 + 1 * SIZE], a6
	add	A3, 2 * SIZE, A3
	LDF	[A4 + 0 * SIZE], a7
	LDF	[A4 + 1 * SIZE], a8
	add	A4, 2 * SIZE, A4

	LDF	[X1 + 0 * SIZE], b1
	nop
	LDF	[X1 + 1 * SIZE], b2
	nop
	LDF	[X1 + 2 * SIZE], b3
	add	X1, 4 * SIZE, X1

	deccc	I
	ble	.LL112
	prefetch [Y1 + 7 * SIZE], 2	/* (delay slot) */

/* FADDX accumulates the (A real) * (X imaginary) products; XCONJ       */
/* flips its sign.                                                      */
#ifndef XCONJ
#define FADDX	FADD
#else
#define FADDX	FSUB
#endif

/* Main loop, four columns by four rows per iteration. */
.LL111:
	FADD	c13, t1, c13
	prefetch [A1 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b1, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 - 1 * SIZE], b4

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 0 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 1 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 0 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 0 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 1 * SIZE], a8

	FADD	c13, t1, c13
	nop
	FMUL	a1, b3, t1
	prefetch [A2 + PREFETCHSIZE * SIZE], 1

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 2 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	LDF	[X1 + 1 * SIZE], b2

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 3 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 2 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 3 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 2 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 3 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b4, t2
	LDF	[A4 + 2 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3

	FADD	c12, t4, c12
	nop
	FMUL	a8, b4, t4
	LDF	[A4 + 3 * SIZE], a8

	FADD	c13, t1, c13
	prefetch [A3 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b1, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 + 3 * SIZE], b4

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 4 * SIZE], a5

	FADD	c7, t3, c7
	deccc	I			/* sets %icc for the branch at loop end */
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 5 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 4 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 4 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 5 * SIZE], a8

	FADD	c13, t1, c13
	prefetch [A4 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b3, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 6 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	LDF	[X1 + 5 * SIZE], b2

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 7 * SIZE], a2

	FADD	c1, t1, c1
	add	A1, 8 * SIZE, A1
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 6 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 7 * SIZE], a4

	FADD	c5, t1, c5
	add	A2, 8 * SIZE, A2
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 6 * SIZE], a5

	FADD	c7, t3, c7
	add	A4, 8 * SIZE, A4
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 7 * SIZE], a6

	FADD	c9, t1, c9
	add	A3, 8 * SIZE, A3
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	add	X1, 8 * SIZE, X1
	FMUL	a7, b4, t2
	LDF	[A4 - 2 * SIZE], a7	/* A4 was already advanced above */

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 - 2 * SIZE], b3

	FADD	c12, t4, c12
	FMUL	a8, b4, t4
	bg,pn	%icc, .LL111
	LDF	[A4 - 1 * SIZE], a8	/* (delay slot) */

/* Final four-row step: same arithmetic as the loop body, but loads     */
/* only what is still needed and stops reading A and X at the end.      */
.LL112:
	FADD	c13, t1, c13
	nop
	FMUL	a1, b1, t1
	LDF	[X1 - 1 * SIZE], b4

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	nop				/* b4 was already loaded two groups above */

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 0 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 1 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 0 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 0 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 1 * SIZE], a8

	FADD	c13, t1, c13
	nop
	FMUL	a1, b3, t1
	LDF	[X1 + 1 * SIZE], b2

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 2 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	nop

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 3 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 2 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 3 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 2 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 3 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b4, t2
	LDF	[A4 + 2 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3

	FADD	c12, t4, c12
	nop
	FMUL	a8, b4, t4
	LDF	[A4 + 3 * SIZE], a8

	FADD	c13, t1, c13
	nop
	FMUL	a1, b1, t1
	LDF	[X1 + 3 * SIZE], b4

	FADDX	c14, t2, c14
	add	X1, 4 * SIZE, X1
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	nop

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	add	A1, 6 * SIZE, A1
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	add	A2, 6 * SIZE, A2
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 4 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 5 * SIZE], a6

	FADD	c9, t1, c9
	add	A3, 6 * SIZE, A3
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 4 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	nop

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 5 * SIZE], a8

	FADD	c13, t1, c13
	add	A4, 6 * SIZE, A4
	FMUL	a1, b3, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	nop

	FADD	c15, t3, c15
	FMUL	a2, b3, t3
	FADD	c16, t4, c16
	FMUL	a2, b4, t4

	FADD	c1, t1, c1
	FMUL	a3, b3, t1
	FADDX	c2, t2, c2
	FMUL	a3, b4, t2
	FADD	c3, t3, c3
	FMUL	a4, b3, t3
	FADD	c4, t4, c4
	FMUL	a4, b4, t4

	FADD	c5, t1, c5
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4

	FADD	c9, t1, c9
	FMUL	a7, b3, t1
	FADDX	c10, t2, c10
	FMUL	a7, b4, t2
	FADD	c11, t3, c11
	FMUL	a8, b3, t3
	FADD	c12, t4, c12
	FMUL	a8, b4, t4

/* Row remainder (MIN_M & 3) for the four-column group.  alpha is       */
/* loaded into b3/b4 here; the remainder loop only uses b1/b2.          */
.LL115:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2			/* Y2 = store cursor, Y1 = load cursor */
	ble,pn	%icc, .LL119
	LDF	ALPHA_I, b4		/* (delay slot) */

.LL116:
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	LDF	[X1 + 1 * SIZE], b2
	add	X1, 2 * SIZE, X1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	LDF	[A3 + 0 * SIZE], a5
	LDF	[A3 + 1 * SIZE], a6
	add	A3, 2 * SIZE, A3
	LDF	[A4 + 0 * SIZE], a7
	LDF	[A4 + 1 * SIZE], a8
	add	A4, 2 * SIZE, A4

	FADD	c13, t1, c13
	FMUL	a1, b1, t1
	FADDX	c14, t2, c14
	FMUL	a1, b2, t2
	FADD	c15, t3, c15
	FMUL	a2, b1, t3
	FADD	c16, t4, c16
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

	FADD	c5, t1, c5
	FMUL	a5, b1, t1
	FADDX	c6, t2, c6
	FMUL	a5, b2, t2
	FADD	c7, t3, c7
	FMUL	a6, b1, t3
	FADD	c8, t4, c8
	FMUL	a6, b2, t4

	FADD	c9, t1, c9
	FMUL	a7, b1, t1
	FADDX	c10, t2, c10
	FMUL	a7, b2, t2
	FADD	c11, t3, c11
	FMUL	a8, b1, t3
	FADD	c12, t4, c12
	FMUL	a8, b2, t4

	deccc	I
	bg	%icc, .LL116
	nop

/* Epilogue for four columns: drain the pipeline, combine partial sums  */
/* into (real, imaginary), multiply by alpha and update four Y entries. */
.LL119:
	FADD	c13, t1, c13
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c14, t2, c14
	LDF	[Y1 + 1 * SIZE], a2
	add	Y1, INCY, Y1
	FADD	c15, t3, c15
	LDF	[Y1 + 0 * SIZE], a3
	FADD	c16, t4, c16
	LDF	[Y1 + 1 * SIZE], a4
	add	Y1, INCY, Y1

	/* Real part: first accumulator combined with the fourth.          */
#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1
	LDF	[Y1 + 0 * SIZE], a5
	FSUB	c5, c8, c5
	LDF	[Y1 + 1 * SIZE], a6
	add	Y1, INCY, Y1
	FSUB	c9, c12, c9
	LDF	[Y1 + 0 * SIZE], a7
	FSUB	c13, c16, c13
	LDF	[Y1 + 1 * SIZE], a8
	add	Y1, INCY, Y1
#else
	FADD	c1, c4, c1
	LDF	[Y1 + 0 * SIZE], a5
	FADD	c5, c8, c5
	LDF	[Y1 + 1 * SIZE], a6
	add	Y1, INCY, Y1
	FADD	c9, c12, c9
	LDF	[Y1 + 0 * SIZE], a7
	FADD	c13, c16, c13
	LDF	[Y1 + 1 * SIZE], a8
	add	Y1, INCY, Y1
#endif

	/* Imaginary part: second accumulator combined with the third.     */
	/* FCLR(0) presumably re-zeroes t1 for the next column group.      */
#ifndef CONJ
	FADD	c2, c3, c2
	FCLR(0)
	FADD	c6, c7, c6
	FADD	c10, c11, c10
	FADD	c14, c15, c14
#else
	FSUB	c2, c3, c2
	FCLR(0)
	FSUB	c6, c7, c6
	FSUB	c10, c11, c10
	FSUB	c14, c15, c14
#endif

	/* Complex multiply by alpha = (b3, b4):                            */
	/*   real = b3 * re - b4 * im,  imag = b3 * im + b4 * re.           */
	FMUL	b3, c1, c3
	FMOV	t1, t2
	FMUL	b4, c1, c4
	FMOV	t1, t3
	FMUL	b4, c2, c1
	FMOV	t1, t4
	FMUL	b3, c2, c2

	FMUL	b3, c5, c7
	FMUL	b4, c5, c8
	FMUL	b4, c6, c5
	FMUL	b3, c6, c6

	FMUL	b3, c9, c11
	FMUL	b4, c9, c12
	FMUL	b4, c10, c9
	FMUL	b3, c10, c10

	FMUL	b3, c13, c15
	FSUB	c3, c1, c1
	FMUL	b4, c13, c16
	FADD	c2, c4, c2
	FMUL	b4, c14, c13
	FSUB	c7, c5, c5
	FMUL	b3, c14, c14
	FADD	c6, c8, c6

	FSUB	c11, c9, c9
	FADD	c10, c12, c10
	FSUB	c15, c13, c13
	FADD	c14, c16, c14

	FADD	a1, c1, a1
	FADD	a2, c2, a2
	FADD	a3, c5, a3
	FADD	a4, c6, a4

	STF	a1, [Y2 + 0 * SIZE]
	FADD	a5, c9, a5
	STF	a2, [Y2 + 1 * SIZE]
	FADD	a6, c10, a6
	add	Y2, INCY, Y2
	STF	a3, [Y2 + 0 * SIZE]
	FADD	a7, c13, a7
	STF	a4, [Y2 + 1 * SIZE]
	FADD	a8, c14, a8
	add	Y2, INCY, Y2

	STF	a5, [Y2 + 0 * SIZE]
	FMOV	t1, c1
	add	J, -1, J
	STF	a6, [Y2 + 1 * SIZE]
	FMOV	t1, c2
	cmp	J, 0
	add	Y2, INCY, Y2
	STF	a7, [Y2 + 0 * SIZE]
	FMOV	t1, c3
	STF	a8, [Y2 + 1 * SIZE]
	FMOV	t1, c4
	add	Y2, INCY, Y2

	FMOV	t1, c5
	bg	%icc, .LL110
	FMOV	t1, c6			/* (delay slot) */

/* ---- Optional pair of columns (N & 2) ------------------------------ */
/* Accumulators: A1 in c1..c4, A2 in c5..c8.                            */
.LL200:
	FCLR(0)

	and	N, 2, J
	cmp	J, 0
	FMOV	t1, c1
	ble	%icc, .LL300

	FMOV	t1, c2			/* (delay slot) */
	sra	MIN_M, 2, I
	FMOV	t1, t2
	add	A, LDA, A2
	FMOV	t1, c3
	mov	A, A1
	FMOV	t1, t3
	cmp	I, 0
	FMOV	t1, c4

	FMOV	t1, c5
	FMOV	t1, c6
	FMOV	t1, c7
	FMOV	t1, c8

	add	A2, LDA, A
	FMOV	t1, t4
	ble	%icc, .LL215
	mov	XP, X1			/* (delay slot) */

	/* Preload two rows of both columns and two X elements.            */
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a5
	LDF	[A1 + 3 * SIZE], a6
	add	A1, 4 * SIZE, A1

	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	LDF	[A2 + 2 * SIZE], a7
	LDF	[A2 + 3 * SIZE], a8
	add	A2, 4 * SIZE, A2

	LDF	[X1 + 0 * SIZE], b1
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], b2
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], b3
	LDF	[X1 + 3 * SIZE], b4
	ble	%icc, .LL212
	add	X1, 4 * SIZE, X1	/* (delay slot) */

/* Main loop, two columns by four rows per iteration. */
.LL211:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 1 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 2 * SIZE], a5
	FADD	c7, t3, c7
	add	I, -1, I
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	cmp	I, 0			/* sets %icc for the branch at loop end */
	FMUL	a6, b4, t4
	LDF	[A1 + 3 * SIZE], a6

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 2 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4
	LDF	[A2 + 3 * SIZE], a8

	prefetch [A2 + PREFETCHSIZE * SIZE], 1
	FADD	c5, t1, c5
	LDF	[X1 + 3 * SIZE], b4
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 4 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 5 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 6 * SIZE], a5
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4
	LDF	[A1 + 7 * SIZE], a6
	add	A1, 8 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 6 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 6 * SIZE], b3
	FADD	c4, t4, c4
	add	X1, 8 * SIZE, X1
	FMUL	a8, b4, t4
	LDF	[A2 + 7 * SIZE], a8
	add	A2, 8 * SIZE, A2
	bg,pn	%icc, .LL211
	LDF	[X1 - 1 * SIZE], b4	/* (delay slot) */

/* Final four-row step for the column pair. */
.LL212:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 1 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 2 * SIZE], a5
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4
	LDF	[A1 + 3 * SIZE], a6
	add	A1, 4 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 2 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4
	LDF	[A2 + 3 * SIZE], a8
	add	A2, 4 * SIZE, A2

	FADD	c5, t1, c5
	LDF	[X1 + 3 * SIZE], b4
	add	X1, 4 * SIZE, X1
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

	FADD	c5, t1, c5
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4

/* Row remainder (MIN_M & 3) for the column pair. */
.LL215:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2
	ble	%icc, .LL219
	LDF	ALPHA_I, b4		/* (delay slot) */

	LDF	[A1 + 0 * SIZE], a1
	add	I, -1, I
	LDF	[A1 + 1 * SIZE], a2
	cmp	I, 0
	add	A1, 2 * SIZE, A1

	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2

	LDF	[X1 + 0 * SIZE], b1
	LDF	[X1 + 1 * SIZE], b2
	ble	%icc, .LL217
	add	X1, 2 * SIZE, X1	/* (delay slot) */

.LL216:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	add	I, -1, I
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	cmp	I, 0
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	add	X1, 2 * SIZE, X1
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	bg,pn	%icc, .LL216
	LDF	[X1 - 1 * SIZE], b2	/* (delay slot) */

.LL217:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

/* Epilogue for the column pair: same combine / scale / update steps    */
/* as .LL119, for two Y entries.                                        */
.LL219:
	FADD	c5, t1, c5
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c6, t2, c6
	LDF	[Y1 + 1 * SIZE], a2
	add	Y1, INCY, Y1
	FADD	c7, t3, c7
	LDF	[Y1 + 0 * SIZE], a3
	FADD	c8, t4, c8
	LDF	[Y1 + 1 * SIZE], a4
	add	Y1, INCY, Y1

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1
	FSUB	c5, c8, c5
#else
	FADD	c1, c4, c1
	FADD	c5, c8, c5
#endif

#ifndef CONJ
	FADD	c2, c3, c2
	FADD	c6, c7, c6
#else
	FSUB	c2, c3, c2
	FSUB	c6, c7, c6
#endif

	FMUL	b3, c1, c3
	FMUL	b4, c1, c4
	FMUL	b4, c2, c1
	FMUL	b3, c2, c2

	FMUL	b3, c5, c7
	FMUL	b4, c5, c8
	FMUL	b4, c6, c5
	FMUL	b3, c6, c6

	FSUB	c3, c1, c1
	FADD	c2, c4, c2
	FSUB	c7, c5, c5
	FADD	c6, c8, c6

	FADD	a1, c1, a1
	FADD	a2, c2, a2
	FADD	a3, c5, a3
	FADD	a4, c6, a4

	STF	a1, [Y2 + 0 * SIZE]
	STF	a2, [Y2 + 1 * SIZE]
	add	Y2, INCY, Y2
	STF	a3, [Y2 + 0 * SIZE]
	STF	a4, [Y2 + 1 * SIZE]

/* ---- Optional single column (N & 1) -------------------------------- */
/* Here c9..c16 hold eight X values (four complex elements) instead of  */
/* accumulators; only c1..c4 accumulate.                                */
.LL300:
	andcc	N, 1, J
	FCLR(0)
	ble	%icc, .LL400
	FMOV	t1, c1			/* (delay slot) */

.LL310:
	sra	MIN_M, 2, I
	FMOV	t1, c2
	FMOV	t1, c3
	FMOV	t1, c4
	mov	A, A1
	FMOV	t1, t2
	add	A, LDA, A
	FMOV	t1, t3
	cmp	I, 0
	FMOV	t1, t4
	ble	%icc, .LL315
	mov	XP, X1			/* (delay slot) */

	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a3
	LDF	[A1 + 3 * SIZE], a4
	LDF	[A1 + 4 * SIZE], a5
	LDF	[A1 + 5 * SIZE], a6
	LDF	[A1 + 6 * SIZE], a7
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1

	LDF	[X1 + 0 * SIZE], c9
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], c10
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], c11
	LDF	[X1 + 3 * SIZE], c12
	LDF	[X1 + 4 * SIZE], c13
	LDF	[X1 + 5 * SIZE], c14
	LDF	[X1 + 6 * SIZE], c15
	LDF	[X1 + 7 * SIZE], c16
	ble	%icc, .LL312
	add	X1, 8 * SIZE, X1	/* (delay slot) */

/* Main loop, one column by four rows per iteration. */
.LL311:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c1, t1, c1
	FMUL	a1, c9, t1
	FADDX	c2, t2, c2
	FMUL	a1, c10, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c3, t3, c3
	FMUL	a2, c9, t3
	LDF	[X1 + 0 * SIZE], c9
	FADD	c4, t4, c4
	FMUL	a2, c10, t4
	LDF	[A1 + 1 * SIZE], a2
	LDF	[X1 + 1 * SIZE], c10

	FADD	c1, t1, c1
	FMUL	a3, c11, t1
	FADDX	c2, t2, c2
	FMUL	a3, c12, t2
	LDF	[A1 + 2 * SIZE], a3
	FADD	c3, t3, c3
	add	I, -1, I
	FMUL	a4, c11, t3
	LDF	[X1 + 2 * SIZE], c11
	FADD	c4, t4, c4
	cmp	I, 0			/* sets %icc for the branch at loop end */
	FMUL	a4, c12, t4
	LDF	[A1 + 3 * SIZE], a4
	LDF	[X1 + 3 * SIZE], c12

	FADD	c1, t1, c1
	FMUL	a5, c13, t1
	FADDX	c2, t2, c2
	FMUL	a5, c14, t2
	LDF	[A1 + 4 * SIZE], a5
	FADD	c3, t3, c3
	FMUL	a6, c13, t3
	LDF	[X1 + 4 * SIZE], c13
	FADD	c4, t4, c4
	FMUL	a6, c14, t4
	LDF	[A1 + 5 * SIZE], a6
	LDF	[X1 + 5 * SIZE], c14

	FADD	c1, t1, c1
	FMUL	a7, c15, t1
	FADDX	c2, t2, c2
	FMUL	a7, c16, t2
	LDF	[A1 + 6 * SIZE], a7

	FADD	c3, t3, c3
	FMUL	a8, c15, t3
	LDF	[X1 + 6 * SIZE], c15
	FADD	c4, t4, c4
	add	X1, 8 * SIZE, X1
	FMUL	a8, c16, t4
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1
	bg,pn	%icc, .LL311
	LDF	[X1 - 1 * SIZE], c16	/* (delay slot) */

/* Final four-row step for the single column (no further loads). */
.LL312:
	FADD	c1, t1, c1
	FMUL	a1, c9, t1
	FADDX	c2, t2, c2
	FMUL	a1, c10, t2
	FADD	c3, t3, c3
	FMUL	a2, c9, t3
	FADD	c4, t4, c4
	FMUL	a2, c10, t4

	FADD	c1, t1, c1
	FMUL	a3, c11, t1
	FADDX	c2, t2, c2
	FMUL	a3, c12, t2
	FADD	c3, t3, c3
	FMUL	a4, c11, t3
	FADD	c4, t4, c4
	FMUL	a4, c12, t4

	FADD	c1, t1, c1
	FMUL	a5, c13, t1
	FADDX	c2, t2, c2
	FMUL	a5, c14, t2
	FADD	c3, t3, c3
	FMUL	a6, c13, t3
	FADD	c4, t4, c4
	FMUL	a6, c14, t4

	FADD	c1, t1, c1
	FMUL	a7, c15, t1
	FADDX	c2, t2, c2
	FMUL	a7, c16, t2
	FADD	c3, t3, c3
	FMUL	a8, c15, t3
	FADD	c4, t4, c4
	FMUL	a8, c16, t4

/* Row remainder (MIN_M & 3) for the single column. */
.LL315:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2
	ble	%icc, .LL319
	LDF	ALPHA_I, b4		/* (delay slot) */

	LDF	[A1 + 0 * SIZE], a1
	add	I, -1, I
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	cmp	I, 0
	LDF	[X1 + 1 * SIZE], b2
	ble	%icc, .LL317
	add	X1, 2 * SIZE, X1	/* (delay slot) */

.LL316:
	FADD	c1, t1, c1
	add	I, -1, I
	FMUL	a1, b1, t1
	FADDX	c2, t2, c2
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c3, t3, c3
	cmp	I, 0
	FMUL	a2, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	add	X1, 2 * SIZE, X1
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1

	bg,pn	%icc, .LL316
	LDF	[X1 - 1 * SIZE], b2	/* (delay slot) */

.LL317:
	FADD	c1, t1, c1
	FMUL	a1, b1, t1
	FADDX	c2, t2, c2
	FMUL	a1, b2, t2
	FADD	c3, t3, c3
	FMUL	a2, b1, t3
	FADD	c4, t4, c4
	FMUL	a2, b2, t4

/* Epilogue for the single column: combine, scale by alpha, update Y. */
.LL319:
	FADD	c1, t1, c1
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c2, t2, c2
	LDF	[Y1 + 1 * SIZE], a2
	add	Y1, INCY, Y1
	FADD	c3, t3, c3
	FADD	c4, t4, c4

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1
#else
	FADD	c1, c4, c1
#endif

#ifndef CONJ
	FADD	c2, c3, c2
#else
	FSUB	c2, c3, c2
#endif

	FMUL	b3, c1, c3
	FMUL	b4, c1, c4
	FMUL	b4, c2, c1
	FMUL	b3, c2, c2

	FSUB	c3, c1, c1
	FADD	c2, c4, c2
	FADD	a1, c1, a1
	FADD	a2, c2, a2

	STF	a1, [Y2 + 0 * SIZE]
	STF	a2, [Y2 + 1 * SIZE]

/* ---- Advance to the next panel of P rows --------------------------- */
.LL400:
	mov	P, I
	add	IS, I, IS
	cmp	IS, M
	bl	%icc, .LL10
	add	A, PNLDA, A		/* (delay slot) back to column 0, P rows down */

.LL999:
	return	%i7 + 8
	clr	%o0			/* (delay slot) return value 0 */

	EPILOGUE