1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
37/*********************************************************************/
38
/* ------------------------------------------------------------------ */
/* IA-64 (Itanium) kernel source, run through the C preprocessor.     */
/* ASSEMBLER is defined before including common.h; SIZE, ZBASE_SHIFT, */
/* LDFPD/LDFD/STFD/FMA, PROLOGUE and PROFCODE are not defined in this */
/* file -- presumably they come from common.h (TODO confirm).         */
/* ------------------------------------------------------------------ */
39#define ASSEMBLER
40#include "common.h"
41
/* Prefetch distance for the A and B streams, counted in elements:    */
/* the inner-loop setup adds (PREFETCHSIZE +/- 8) * SIZE to AOFFSET   */
/* and BOFFSET to form PREA and PREB. Smaller when DOUBLE is defined. */
42#ifdef DOUBLE
43#define PREFETCHSIZE (16 * 8)
44#else
45#define PREFETCHSIZE (32 * 8)
46#endif
47
/* C prefetch: PREC starts at C1 + CPREFETCHSIZE * SIZE and is walked */
/* by LDC with the CPREFETCH instruction (lfetch.excl.nt1).           */
48#define CPREFETCHSIZE 15
49#define CPREFETCH lfetch.excl.nt1
50
/* Incoming arguments. The alloc below declares 8 input registers     */
/* (r32-r39). Note that r35 (LDC) is overwritten in the prologue by a */
/* load from the caller's stack, so its incoming value is not used;   */
/* r36 has no alias here.                                             */
51#define M r32
52#define N r33
53#define K r34
54#define A r37
55#define B r38
56#define C r39
57#define LDC r35
58
/* Loop state: I and J are set from M >> 3 and N >> 3; AOFFSET and    */
/* BOFFSET are the running read pointers into A and B; L is derived   */
/* from K and becomes the ar.lc trip count of the inner loop.         */
59#define I r15
60#define J r16
61#define AOFFSET r17
62#define BOFFSET r18
63#define L r20
64
/* C1..C8: pointers to eight consecutive columns of C (c + k * ldc).  */
65#define C1 r21
66#define C2 r22
67#define C3 r23
68#define C4 r24
69#define C5 r25
70#define C6 r26
71#define C7 r27
72#define C8 r28
73
/* C9..C16: second pointer per column, set to Ck + 4 * SIZE inside    */
/* the inner loop. They live in the stacked local registers loc0-7.   */
74#define C9 loc0
75#define C10 loc1
76#define C11 loc2
77#define C12 loc3
78#define C13 loc4
79#define C14 loc5
80#define C15 loc6
81#define C16 loc7
82
/* Prefetch pointers and saved machine state. r8 and r9 are also used */
/* under their raw names as spill pointers in the prologue, before    */
/* PREA and PREB are initialised.                                     */
83#define PREA r8
84#define PREB r9
85#define PREC r10
86#define SP r12
87#define ARLC r29
88#define PR r30
89#define ARPFS r31
90
/* Scalar multipliers applied when accumulating into C; going by the  */
/* names these are presumably the real and imaginary parts of alpha   */
/* (TODO confirm against the caller).                                 */
91#define ALPHA_R f8
92#define ALPHA_I f9
93
/* ------------------------------------------------------------------ */
/* Entry prologue. .prologue/.save/.body are assembler directives     */
/* that describe this region for unwind information.                  */
/* ------------------------------------------------------------------ */
94 PROLOGUE
95 .prologue
96 PROFCODE
97
/* New register frame: 8 inputs, 16 locals, 0 outputs, 0 rotating.    */
/* The previous ar.pfs is kept in ARPFS and ar.lc in ARLC.            */
/* r14 = entry SP + 16, the address LDC is loaded from below.         */
98 { .mmi
99 .save ar.pfs, ARPFS
100 alloc ARPFS = ar.pfs, 8, 16, 0, 0
101 adds r14 = 16, SP
102 mov ARLC = ar.lc
103 }
/* Reserve 16 * 16 bytes below SP for f16-f31. r8 addresses the even  */
/* slots and r9 the odd slots, each advancing by 32 bytes per store.  */
104 { .mmi
105 adds r8 = -16 * 16, SP
106 adds r9 = -15 * 16, SP
107 adds SP = -16 * 16, SP
108 }
109 ;;
/* Start spilling f16-f31 and keep a copy of the predicates in PR.    */
110 { .mmi
111 stf.spill [r8] = f16, 32
112 stf.spill [r9] = f17, 32
113 mov PR = pr
114 }
/* LDC is read from memory at entry SP + 16 (presumably a             */
/* stack-passed argument -- TODO confirm against the calling code).   */
115 { .mmi
116 ld8 LDC = [r14], 8
117 nop __LINE__
118 nop __LINE__
119 }
120 ;;
/* The remaining spills are interleaved with loop setup.              */
/* J = N >> 3: outer-loop count over groups of eight columns.         */
121 stf.spill [r8] = f18, 32
122 stf.spill [r9] = f19, 32
123 shr J = N, 3
124 ;;
/* LDC = LDC << ZBASE_SHIFT (presumably element count to byte stride; */
/* ZBASE_SHIFT is defined outside this file).                         */
125 stf.spill [r8] = f20, 32
126 stf.spill [r9] = f21, 32
127 shladd LDC = LDC, ZBASE_SHIFT, r0
128 ;;
/* AOFFSET starts at the base of A.                                   */
129 stf.spill [r8] = f22, 32
130 stf.spill [r9] = f23, 32
131 mov AOFFSET = A
132 ;;
/* p6 = (0 >= J): set when there is no full group of eight columns.   */
133 stf.spill [r8] = f24, 32
134 stf.spill [r9] = f25, 32
135 cmp.ge p6, p0 = 0, J
136 ;;
137 stf.spill [r8] = f26, 32
138 stf.spill [r9] = f27, 32
139 ;;
140 stf.spill [r8] = f28, 32
141 stf.spill [r9] = f29, 32
142 ;;
/* Last two spills need no post-increment. When p6 is set, skip the   */
/* 8-column loop and continue at .L050 (label is outside this view).  */
143 stf.spill [r8] = f30
144 stf.spill [r9] = f31
145 (p6) br.cond.dpnt .L050
146 .body
147 ;;
148 
.align 32 149 150.L010: 151 { .mfi 152 adds J = -1, J 153 mov f64 = f0 154 shr I = M, 3 155 } 156 { .mfi 157 mov C1 = C // coffset1 = c + 0 * ldc 158 mov f72 = f0 159 } 160 ;; 161 { .mmf 162 cmp.eq p6, p7 = 0, I 163 nop __LINE__ 164 mov f80 = f0 165 } 166 { .mmf 167 add C2 = LDC, C // coffset2 = c + 1 * ldc 168 shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc 169 mov f88 = f0 170 } 171 ;; 172 { .mmf 173 shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc 174 shladd C = LDC, 3, C // coffset += 8 * ldc 175 mov f96 = f0 176 } 177 { .mmf 178 shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc 179 shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc 180 mov f104 = f0 181 } 182 ;; 183 { .mfi 184 shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc 185 mov f112 = f0 186 nop __LINE__ 187 } 188 { .mfb 189 sub C8 = C, LDC // coffset8 = c + 7 * ldc 190 mov f120 = f0 191 (p6) br.cond.dpnt .L020 192 } 193 ;; 194 .align 16 195 196.L011: 197 { .mfb 198 LDFPD f48, f49 = [B] 199 mov f65 = f0 200 nop __LINE__ 201 } 202 { .mfb 203 adds BOFFSET = 2 * SIZE, B 204 mov f73 = f0 205 nop __LINE__ 206 } 207 ;; 208 { .mfb 209 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 210 mov f81 = f0 211 nop __LINE__ 212 } 213 { .mfb 214 LDFPD f50, f51 = [BOFFSET], 2 * SIZE 215 mov f89 = f0 216 nop __LINE__ 217 } 218 ;; 219 { .mmf 220 LDFPD f52, f53 = [BOFFSET], 2 * SIZE 221 setf.d f97 = r0 222 mov f105 = f0 223 } 224 { .mfb 225 setf.d f113 = r0 226 mov f121 = f0 227 nop __LINE__ 228 } 229 ;; 230 { .mmf 231 LDFPD f54, f55 = [BOFFSET], 2 * SIZE 232 setf.d f66 = r0 233 mov f74 = f0 234 } 235 { .mfb 236 setf.d f82 = r0 237 mov f90 = f0 238 nop __LINE__ 239 } 240 ;; 241 { .mmf 242 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 243 setf.d f98 = r0 244 mov f106 = f0 245 } 246 { .mfb 247 setf.d f114 = r0 248 mov f122 = f0 249 nop __LINE__ 250 } 251 ;; 252 { .mmf 253 LDFPD f36, f37 = [AOFFSET], 2 * SIZE 254 setf.d f67 = r0 255 mov f75 = f0 256 } 257 { .mfi 258 setf.d f83 = r0 259 mov f91 = f0 260 nop __LINE__ 261 } 262 ;; 263 { .mmf 264 LDFPD 
f38, f39 = [AOFFSET], 2 * SIZE 265 setf.d f99 = r0 266 mov f107 = f0 267 } 268 { .mfi 269 setf.d f115 = r0 270 mov f123 = f0 271 adds PREC = CPREFETCHSIZE * SIZE, C1 272 } 273 ;; 274 { .mmf 275 CPREFETCH [PREC], LDC 276 setf.d f68 = r0 277 mov f76 = f0 278 } 279 { .mfi 280 setf.d f84 = r0 281 mov f92 = f0 282 adds L = 1, K 283 } 284 ;; 285 { .mmf 286 CPREFETCH [PREC], LDC 287 setf.d f100 = r0 288 mov f108 = f0 289 } 290 { .mfi 291 setf.d f116 = r0 292 mov f124 = f0 293 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET 294 } 295 ;; 296 { .mmf 297 CPREFETCH [PREC], LDC 298 setf.d f69 = r0 299 mov f77 = f0 300 } 301 { .mfi 302 setf.d f85 = r0 303 mov f93 = f0 304 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET 305 } 306 ;; 307 { .mmf 308 CPREFETCH [PREC], LDC 309 setf.d f101 = r0 310 mov f109 = f0 311 } 312 { .mfi 313 setf.d f117 = r0 314 mov f125 = f0 315 tbit.z p12, p0 = L, 0 316 } 317 ;; 318 { .mmf 319 CPREFETCH [PREC], LDC 320 setf.d f70 = r0 321 mov f78 = f0 322 } 323 { .mfi 324 setf.d f86 = r0 325 mov f94 = f0 326 shr L = L, 1 327 } 328 ;; 329 { .mmf 330 CPREFETCH [PREC], LDC 331 setf.d f102 = r0 332 mov f110 = f0 333 } 334 { .mfi 335 setf.d f118 = r0 336 mov f126 = f0 337 adds L = -1, L 338 } 339 ;; 340 { .mmf 341 CPREFETCH [PREC], LDC 342 setf.d f71 = r0 343 mov f79 = f0 344 } 345 { .mfi 346 setf.d f87 = r0 347 mov f95 = f0 348 mov ar.lc = L 349 } 350 ;; 351 { .mmf 352 CPREFETCH [PREC] 353 setf.d f103 = r0 354 mov f111 = f0 355 } 356 { .mfi 357 setf.d f119 = r0 358 mov f127 = f0 359 cmp.eq p3, p0 = r0, r0 360 } 361 ;; 362 .align 16 363 364.L012: 365/* 1 */ 366 { .mfi 367 lfetch.nt1 [PREA], 16 * SIZE 368 FMA f64 = f32, f48, f64 // A1 * B1 369 nop __LINE__ 370 } 371 { .mfi 372 (p12) cmp.ne p3, p0 = 0, L 373 FMA f72 = f32, f49, f72 // A1 * B2 374 nop __LINE__ 375 } 376 ;; 377/* 2 */ 378 { .mfi 379 lfetch.nt1 [PREB], 16 * SIZE 380 FMA f80 = f32, f50, f80 // A1 * B3 381 nop __LINE__ 382 } 383 { .mfi 384 cmp.ne p4, p5 = 0, L 385 FMA f88 = f32, f51, f88 // A1 * B4 386 
nop __LINE__ 387 } 388 ;; 389/* 3 */ 390 { .mfi 391 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 392 FMA f96 = f32, f52, f96 // A1 * B5 393 nop __LINE__ 394 } 395 { .mfi 396 adds C9 = 4 * SIZE, C1 397 FMA f104 = f32, f53, f104 // A1 * B6 398 nop __LINE__ 399 } 400 ;; 401/* 4 */ 402 { .mfi 403 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 404 FMA f112 = f32, f54, f112 // A1 * B7 405 nop __LINE__ 406 } 407 { .mfi 408 adds C10 = 4 * SIZE, C2 409 FMA f120 = f32, f55, f120 // A1 * B8 410 nop __LINE__ 411 } 412 ;; 413/* 5 */ 414 { .mfi 415 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 416 FMA f65 = f33, f48, f65 // A2 * B1 417 nop __LINE__ 418 } 419 { .mfi 420 adds C11 = 4 * SIZE, C3 421 FMA f73 = f33, f49, f73 // A2 * B2 422 nop __LINE__ 423 } 424 ;; 425/* 6 */ 426 { .mfi 427 (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE 428 FMA f81 = f33, f50, f81 // A2 * B3 429 nop __LINE__ 430 } 431 { .mfi 432 adds C12 = 4 * SIZE, C4 433 FMA f89 = f33, f51, f89 // A2 * B4 434 nop __LINE__ 435 } 436 ;; 437/* 7 */ 438 { .mfi 439 (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE 440 FMA f97 = f33, f52, f97 // A2 * B5 441 nop __LINE__ 442 } 443 { .mfi 444 adds C13 = 4 * SIZE, C5 445 FMA f105 = f33, f53, f105 // A2 * B6 446 nop __LINE__ 447 } 448 ;; 449/* 8 */ 450 { .mfi 451 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 452 FMA f113 = f33, f54, f113 // A2 * B7 453 nop __LINE__ 454 } 455 { .mfi 456 adds C14 = 4 * SIZE, C6 457 FMA f121 = f33, f55, f121 // A2 * B8 458 nop __LINE__ 459 } 460 ;; 461/* 9 */ 462 { .mfi 463 (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE 464 FMA f66 = f34, f48, f66 // A3 * B1 465 nop __LINE__ 466 } 467 { .mfi 468 adds C15 = 4 * SIZE, C7 469 FMA f74 = f34, f49, f74 // A3 * B2 470 nop __LINE__ 471 } 472 ;; 473/* 10 */ 474 { .mfi 475 (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE 476 FMA f82 = f34, f50, f82 // A3 * B3 477 nop __LINE__ 478 } 479 { .mfi 480 adds C16 = 4 * SIZE, C8 481 FMA f90 = f34, f51, f90 // A3 * B4 482 nop __LINE__ 483 } 484 ;; 485/* 11 */ 486 { .mfi 487 FMA f98 = f34, f52, f98 // A3 
* B5 488 nop __LINE__ 489 } 490 { .mfi 491 nop __LINE__ 492 FMA f106 = f34, f53, f106 // A3 * B6 493 nop __LINE__ 494 } 495 ;; 496/* 12 */ 497 { .mfi 498 FMA f114 = f34, f54, f114 // A3 * B7 499 nop __LINE__ 500 } 501 { .mfi 502 nop __LINE__ 503 FMA f122 = f34, f55, f122 // A3 * B8 504 nop __LINE__ 505 } 506 ;; 507/* 13 */ 508 { .mfi 509 nop __LINE__ 510 FMA f67 = f35, f48, f67 // A4 * B1 511 } 512 { .mfi 513 nop __LINE__ 514 FMA f75 = f35, f49, f75 // A4 * B2 515 nop __LINE__ 516 } 517 ;; 518/* 14 */ 519 { .mfi 520 FMA f83 = f35, f50, f83 // A4 * B3 521 nop __LINE__ 522 } 523 { .mfi 524 nop __LINE__ 525 FMA f91 = f35, f51, f91 // A4 * B4 526 nop __LINE__ 527 } 528 ;; 529/* 15 */ 530 { .mfi 531 FMA f99 = f35, f52, f99 // A4 * B5 532 nop __LINE__ 533 } 534 { .mfi 535 nop __LINE__ 536 FMA f107 = f35, f53, f107 // A4 * B6 537 nop __LINE__ 538 } 539 ;; 540/* 16 */ 541 { .mfi 542 FMA f115 = f35, f54, f115 // A4 * B7 543 nop __LINE__ 544 } 545 { .mfi 546 nop __LINE__ 547 FMA f123 = f35, f55, f123 // A4 * B8 548 nop __LINE__ 549 } 550 ;; 551/* 17 */ 552 { .mfi 553 nop __LINE__ 554 FMA f68 = f36, f48, f68 // A5 * B1 555 nop __LINE__ 556 } 557 { .mfi 558 nop __LINE__ 559 FMA f76 = f36, f49, f76 // A5 * B2 560 nop __LINE__ 561 } 562 ;; 563/* 18 */ 564 { .mfi 565 nop __LINE__ 566 FMA f84 = f36, f50, f84 // A5 * B3 567 nop __LINE__ 568 } 569 { .mfi 570 nop __LINE__ 571 FMA f92 = f36, f51, f92 // A5 * B4 572 nop __LINE__ 573 } 574 ;; 575/* 19 */ 576 { .mfi 577 nop __LINE__ 578 FMA f100 = f36, f52, f100 // A5 * B5 579 nop __LINE__ 580 } 581 { .mfi 582 nop __LINE__ 583 FMA f108 = f36, f53, f108 // A5 * B6 584 nop __LINE__ 585 } 586 ;; 587/* 20 */ 588 { .mfi 589 nop __LINE__ 590 FMA f116 = f36, f54, f116 // A5 * B7 591 nop __LINE__ 592 } 593 { .mfi 594 nop __LINE__ 595 FMA f124 = f36, f55, f124 // A5 * B8 596 nop __LINE__ 597 } 598 ;; 599/* 21 */ 600 { .mfi 601 nop __LINE__ 602 FMA f69 = f37, f48, f69 // A6 * B1 603 nop __LINE__ 604 } 605 { .mfi 606 nop __LINE__ 607 FMA f77 = f37, 
f49, f77 // A6 * B2 608 nop __LINE__ 609 } 610 ;; 611/* 22 */ 612 { .mfi 613 nop __LINE__ 614 FMA f85 = f37, f50, f85 // A6 * B3 615 nop __LINE__ 616 } 617 { .mfi 618 nop __LINE__ 619 FMA f93 = f37, f51, f93 // A6 * B4 620 nop __LINE__ 621 } 622 ;; 623/* 23 */ 624 { .mfi 625 nop __LINE__ 626 FMA f101 = f37, f52, f101 // A6 * B5 627 nop __LINE__ 628 } 629 { .mfi 630 nop __LINE__ 631 FMA f109 = f37, f53, f109 // A6 * B6 632 nop __LINE__ 633 } 634 ;; 635/* 24 */ 636 { .mfi 637 nop __LINE__ 638 FMA f117 = f37, f54, f117 // A6 * B7 639 nop __LINE__ 640 } 641 { .mfi 642 nop __LINE__ 643 FMA f125 = f37, f55, f125 // A6 * B8 644 nop __LINE__ 645 } 646 ;; 647/* 25 */ 648 { .mfi 649 nop __LINE__ 650 FMA f70 = f38, f48, f70 // A7 * B1 651 nop __LINE__ 652 } 653 { .mfi 654 nop __LINE__ 655 FMA f78 = f38, f49, f78 // A7 * B2 656 nop __LINE__ 657 } 658 ;; 659/* 26 */ 660 { .mfi 661 nop __LINE__ 662 FMA f86 = f38, f50, f86 // A7 * B3 663 nop __LINE__ 664 } 665 { .mfi 666 nop __LINE__ 667 FMA f94 = f38, f51, f94 // A7 * B4 668 nop __LINE__ 669 } 670 ;; 671/* 27 */ 672 { .mfi 673 nop __LINE__ 674 FMA f102 = f38, f52, f102 // A7 * B5 675 nop __LINE__ 676 } 677 { .mfi 678 nop __LINE__ 679 FMA f110 = f38, f53, f110 // A7 * B6 680 nop __LINE__ 681 } 682 ;; 683/* 28 */ 684 { .mfi 685 nop __LINE__ 686 FMA f118 = f38, f54, f118 // A7 * B7 687 nop __LINE__ 688 } 689 { .mfi 690 nop __LINE__ 691 FMA f126 = f38, f55, f126 // A7 * B8 692 nop __LINE__ 693 } 694 ;; 695/* 29 */ 696 { .mfi 697 nop __LINE__ 698 FMA f71 = f39, f48, f71 // A8 * B1 699 nop __LINE__ 700 } 701 { .mfi 702 nop __LINE__ 703 FMA f79 = f39, f49, f79 // A8 * B2 704 nop __LINE__ 705 } 706 ;; 707/* 30 */ 708 { .mfi 709 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 710 FMA f87 = f39, f50, f87 // A8 * B3 711 nop __LINE__ 712 } 713 { .mfi 714 nop __LINE__ 715 FMA f95 = f39, f51, f95 // A8 * B4 716 nop __LINE__ 717 } 718 ;; 719/* 31 */ 720 { .mfi 721 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 722 FMA f103 = f39, f52, f103 // A8 * B5 723 
nop __LINE__ 724 } 725 { .mfi 726 nop __LINE__ 727 FMA f111 = f39, f53, f111 // A8 * B6 728 nop __LINE__ 729 } 730 ;; 731/* 32 */ 732 { .mfi 733 nop __LINE__ 734 FMA f119 = f39, f54, f119 // A8 * B7 735 nop __LINE__ 736 } 737 { .mfi 738 nop __LINE__ 739 FMA f127 = f39, f55, f127 // A8 * B8 740 nop __LINE__ 741 } 742 ;; 743/* 33 */ 744 { .mfi 745 nop __LINE__ 746 (p3) FMA f64 = f40, f56, f64 // A1 * B1 747 nop __LINE__ 748 } 749 { .mfi 750 nop __LINE__ 751 (p3) FMA f72 = f40, f57, f72 // A1 * B2 752 nop __LINE__ 753 } 754 ;; 755/* 34 */ 756 { .mfi 757 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 758 (p3) FMA f80 = f40, f58, f80 // A1 * B3 759 nop __LINE__ 760 } 761 { .mfi 762 nop __LINE__ 763 (p3) FMA f88 = f40, f59, f88 // A1 * B4 764 nop __LINE__ 765 } 766 ;; 767/* 35 */ 768 { .mfi 769 (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE 770 (p3) FMA f96 = f40, f60, f96 // A1 * B5 771 nop __LINE__ 772 } 773 { .mfi 774 nop __LINE__ 775 (p3) FMA f104 = f40, f61, f104 // A1 * B6 776 nop __LINE__ 777 } 778 ;; 779/* 36 */ 780 { .mfi 781 (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE 782 (p3) FMA f112 = f40, f62, f112 // A1 * B7 783 nop __LINE__ 784 } 785 { .mfi 786 nop __LINE__ 787 (p3) FMA f120 = f40, f63, f120 // A1 * B8 788 nop __LINE__ 789 } 790 ;; 791/* 37 */ 792 { .mfi 793 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 794 (p3) FMA f65 = f41, f56, f65 // A2 * B1 795 nop __LINE__ 796 } 797 { .mfi 798 nop __LINE__ 799 (p3) FMA f73 = f41, f57, f73 // A2 * B2 800 nop __LINE__ 801 } 802 ;; 803/* 38 */ 804 { .mfi 805 (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE 806 (p3) FMA f81 = f41, f58, f81 // A2 * B3 807 nop __LINE__ 808 } 809 { .mfi 810 nop __LINE__ 811 (p3) FMA f89 = f41, f59, f89 // A2 * B4 812 nop __LINE__ 813 } 814 ;; 815/* 39 */ 816 { .mfi 817 (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE 818 (p3) FMA f97 = f41, f60, f97 // A2 * B5 819 nop __LINE__ 820 } 821 { .mfi 822 nop __LINE__ 823 (p3) FMA f105 = f41, f61, f105 // A2 * B6 824 nop __LINE__ 825 } 826 ;; 827/* 40 */ 828 { .mfi 829 (p5) 
LDFD f6 = [C1 ], SIZE 830 (p3) FMA f113 = f41, f62, f113 // A2 * B7 831 nop __LINE__ 832 } 833 { .mfi 834 (p5) LDFD f7 = [C9 ], SIZE 835 (p3) FMA f121 = f41, f63, f121 // A2 * B8 836 nop __LINE__ 837 } 838 ;; 839 /* 41 */ 840 { .mfi 841 (p5) LDFD f10 = [C1 ], SIZE 842 (p3) FMA f66 = f42, f56, f66 // A3 * B1 843 nop __LINE__ 844 } 845 { .mfi 846 (p5) LDFD f11 = [C9 ], SIZE 847 (p3) FMA f74 = f42, f57, f74 // A3 * B2 848 nop __LINE__ 849 } 850 ;; 851/* 42 */ 852 { .mfi 853 (p5) LDFD f12 = [C1 ], SIZE 854 (p3) FMA f82 = f42, f58, f82 // A3 * B3 855 nop __LINE__ 856 } 857 { .mfi 858 (p5) LDFD f13 = [C9 ], SIZE 859 (p3) FMA f90 = f42, f59, f90 // A3 * B4 860 nop __LINE__ 861 } 862 ;; 863/* 43 */ 864 { .mfi 865 (p5) LDFD f14 = [C1 ], 5 * SIZE 866 (p3) FMA f98 = f42, f60, f98 // A3 * B5 867 nop __LINE__ 868 } 869 { .mfi 870 (p5) LDFD f15 = [C9 ], 5 * SIZE 871 (p3) FMA f106 = f42, f61, f106 // A3 * B6 872 nop __LINE__ 873 } 874 ;; 875/* 44 */ 876 { .mfi 877 (p5) LDFD f16 = [C1 ], SIZE 878 (p3) FMA f114 = f42, f62, f114 // A3 * B7 879 nop __LINE__ 880 } 881 { .mfi 882 (p5) LDFD f17 = [C9 ], SIZE 883 (p3) FMA f122 = f42, f63, f122 // A3 * B8 884 nop __LINE__ 885 } 886 ;; 887/* 45 */ 888 { .mfi 889 (p5) LDFD f18 = [C1 ], SIZE 890 (p3) FMA f67 = f43, f56, f67 // A4 * B1 891 nop __LINE__ 892 } 893 { .mfi 894 (p5) LDFD f19 = [C9 ], SIZE 895 (p3) FMA f75 = f43, f57, f75 // A4 * B2 896 nop __LINE__ 897 } 898 ;; 899/* 46 */ 900 { .mfi 901 (p5) LDFD f20 = [C1 ], SIZE 902 (p3) FMA f83 = f43, f58, f83 // A4 * B3 903 nop __LINE__ 904 } 905 { .mfi 906 (p5) LDFD f21 = [C9 ], SIZE 907 (p3) FMA f91 = f43, f59, f91 // A4 * B4 908 nop __LINE__ 909 } 910 ;; 911/* 47 */ 912 { .mfi 913 (p5) LDFD f22 = [C1 ], - 11 * SIZE 914 (p3) FMA f99 = f43, f60, f99 // A4 * B5 915 nop __LINE__ 916 } 917 { .mfi 918 (p5) LDFD f23 = [C9 ], - 11 * SIZE 919 (p3) FMA f107 = f43, f61, f107 // A4 * B6 920 nop __LINE__ 921 } 922 ;; 923/* 48 */ 924 { .mfi 925 (p5) LDFD f24 = [C2 ], SIZE 926 (p3) FMA f115 = f43, f62, 
f115 // A4 * B7 927 nop __LINE__ 928 } 929 { .mfi 930 (p5) LDFD f25 = [C10], SIZE 931 (p3) FMA f123 = f43, f63, f123 // A4 * B8 932 nop __LINE__ 933 } 934 ;; 935/* 49 */ 936 { .mfi 937 (p5) LDFD f26 = [C2 ], SIZE 938 (p3) FMA f68 = f44, f56, f68 // A5 * B1 939 nop __LINE__ 940 } 941 { .mfi 942 (p5) LDFD f27 = [C10], SIZE 943 (p3) FMA f76 = f44, f57, f76 // A5 * B2 944 nop __LINE__ 945 } 946 ;; 947/* 50 */ 948 { .mfi 949 (p5) LDFD f28 = [C2 ], SIZE 950 (p3) FMA f84 = f44, f58, f84 // A5 * B3 951 nop __LINE__ 952 } 953 { .mfi 954 (p5) LDFD f29 = [C10], SIZE 955 (p3) FMA f92 = f44, f59, f92 // A5 * B4 956 nop __LINE__ 957 } 958 ;; 959/* 51 */ 960 { .mfi 961 (p5) LDFD f30 = [C2 ], 5 * SIZE 962 (p3) FMA f100 = f44, f60, f100 // A5 * B5 963 nop __LINE__ 964 } 965 { .mfi 966 (p5) LDFD f31 = [C10], 5 * SIZE 967 (p3) FMA f108 = f44, f61, f108 // A5 * B6 968 nop __LINE__ 969 } 970 ;; 971/* 52 */ 972 { .mfi 973 (p5) LDFD f32 = [C2 ], SIZE 974 (p3) FMA f116 = f44, f62, f116 // A5 * B7 975 nop __LINE__ 976 } 977 { .mfi 978 (p5) LDFD f33 = [C10], SIZE 979 (p3) FMA f124 = f44, f63, f124 // A5 * B8 980 nop __LINE__ 981 } 982 ;; 983/* 53 */ 984 { .mfi 985 (p5) LDFD f34 = [C2 ], SIZE 986 (p3) FMA f69 = f45, f56, f69 // A6 * B1 987 nop __LINE__ 988 } 989 { .mfi 990 (p5) LDFD f35 = [C10], SIZE 991 (p3) FMA f77 = f45, f57, f77 // A6 * B2 992 nop __LINE__ 993 } 994 ;; 995/* 54 */ 996 { .mfi 997 (p5) LDFD f36 = [C2 ], SIZE 998 (p3) FMA f85 = f45, f58, f85 // A6 * B3 999 nop __LINE__ 1000 } 1001 { .mfi 1002 (p5) LDFD f37 = [C10], SIZE 1003 (p3) FMA f93 = f45, f59, f93 // A6 * B4 1004 nop __LINE__ 1005 } 1006 ;; 1007/* 55 */ 1008 { .mfi 1009 (p5) LDFD f38 = [C2 ], - 11 * SIZE 1010 (p3) FMA f101 = f45, f60, f101 // A6 * B5 1011 nop __LINE__ 1012 } 1013 { .mfi 1014 (p5) LDFD f39 = [C10], - 11 * SIZE 1015 (p3) FMA f109 = f45, f61, f109 // A6 * B6 1016 nop __LINE__ 1017 } 1018 ;; 1019/* 56 */ 1020 { .mfi 1021 (p5) LDFD f48 = [C3 ], SIZE 1022 (p3) FMA f117 = f45, f62, f117 // A6 * B7 1023 nop 
__LINE__ 1024 } 1025 { .mfi 1026 (p5) LDFD f49 = [C11], SIZE 1027 (p3) FMA f125 = f45, f63, f125 // A6 * B8 1028 nop __LINE__ 1029 } 1030 ;; 1031/* 57 */ 1032 { .mfi 1033 (p5) LDFD f50 = [C3 ], SIZE 1034 (p3) FMA f70 = f46, f56, f70 // A7 * B1 1035 nop __LINE__ 1036 } 1037 { .mfi 1038 (p5) LDFD f51 = [C11], SIZE 1039 (p3) FMA f78 = f46, f57, f78 // A7 * B2 1040 nop __LINE__ 1041 } 1042 ;; 1043/* 58 */ 1044 { .mfi 1045 (p5) LDFD f52 = [C3 ], SIZE 1046 (p3) FMA f86 = f46, f58, f86 // A7 * B3 1047 nop __LINE__ 1048 } 1049 { .mfi 1050 (p5) LDFD f53 = [C11], SIZE 1051 (p3) FMA f94 = f46, f59, f94 // A7 * B4 1052 nop __LINE__ 1053 } 1054 ;; 1055/* 59 */ 1056 { .mfi 1057 (p5) LDFD f54 = [C3 ], 5 * SIZE 1058 (p3) FMA f102 = f46, f60, f102 // A7 * B5 1059 nop __LINE__ 1060 } 1061 { .mfi 1062 (p5) LDFD f55 = [C11], 5 * SIZE 1063 (p3) FMA f110 = f46, f61, f110 // A7 * B6 1064 nop __LINE__ 1065 } 1066 ;; 1067/* 60 */ 1068 { .mfi 1069 (p5) LDFD f40 = [C3 ], SIZE 1070 (p3) FMA f118 = f46, f62, f118 // A7 * B7 1071 nop __LINE__ 1072 } 1073 { .mfi 1074 (p5) LDFD f41 = [C11], SIZE 1075 (p3) FMA f126 = f46, f63, f126 // A7 * B8 1076 nop __LINE__ 1077 } 1078 ;; 1079/* 61 */ 1080 { .mfi 1081 (p5) LDFD f42 = [C3 ], SIZE 1082 (p3) FMA f71 = f47, f56, f71 // A8 * B1 1083 nop __LINE__ 1084 } 1085 { .mfi 1086 (p5) LDFD f43 = [C11], SIZE 1087 (p3) FMA f79 = f47, f57, f79 // A8 * B2 1088 nop __LINE__ 1089 } 1090 ;; 1091/* 62 */ 1092 { .mfi 1093 (p5) LDFD f44 = [C3 ], SIZE 1094 (p3) FMA f87 = f47, f58, f87 // A8 * B3 1095 nop __LINE__ 1096 } 1097 { .mfi 1098 (p5) LDFD f45 = [C11], SIZE 1099 (p3) FMA f95 = f47, f59, f95 // A8 * B4 1100 nop __LINE__ 1101 } 1102 ;; 1103/* 63 */ 1104 { .mfi 1105 (p5) LDFD f46 = [C3 ], - 11 * SIZE 1106 (p3) FMA f103 = f47, f60, f103 // A8 * B5 1107 nop __LINE__ 1108 } 1109 { .mfi 1110 (p5) LDFD f56 = [C11], - 11 * SIZE 1111 (p3) FMA f111 = f47, f61, f111 // A8 * B6 1112 nop __LINE__ 1113 } 1114 ;; 1115/* 64 */ 1116 { .mfi 1117 (p5) LDFD f57 = [C4 ], SIZE 1118 (p3) 
FMA f119 = f47, f62, f119 // A8 * B7 1119 adds L = -1, L 1120 } 1121 { .mfb 1122 (p5) LDFD f58 = [C12], SIZE 1123 (p3) FMA f127 = f47, f63, f127 // A8 * B8 1124 br.cloop.sptk.few .L012 1125 } 1126 ;; 1127.L013: 1128 { .mmf 1129 (p5) LDFD f59 = [C4 ], SIZE 1130 (p5) LDFD f60 = [C12], SIZE 1131 FMA f6 = ALPHA_R, f64, f6 1132 } 1133 { .mmf 1134 cmp.ne p6, p0 = 1, I 1135 nop __LINE__ 1136 FMA f7 = ALPHA_R, f66, f7 1137 } 1138 ;; 1139 { .mmf 1140 (p5) LDFD f61 = [C4 ], SIZE 1141 (p5) LDFD f62 = [C12], SIZE 1142 FMA f10 = ALPHA_I, f64, f10 1143 } 1144 { .mmf 1145 nop __LINE__ 1146 nop __LINE__ 1147 FMA f11 = ALPHA_I, f66, f11 1148 } 1149 ;; 1150 { .mmf 1151 (p5) LDFD f63 = [C4 ], 5 * SIZE 1152 (p5) LDFD f47 = [C12], 5 * SIZE 1153 FMA f12 = ALPHA_R, f65, f12 1154 } 1155 { .mmf 1156 nop __LINE__ 1157 nop __LINE__ 1158 FMA f13 = ALPHA_R, f67, f13 1159 } 1160 ;; 1161 { .mfi 1162 (p5) LDFD f64 = [C4 ], SIZE 1163 FMA f14 = ALPHA_I, f65, f14 1164 nop __LINE__ 1165 } 1166 { .mfi 1167 (p5) LDFD f65 = [C12], SIZE 1168 FMA f15 = ALPHA_I, f67, f15 1169 nop __LINE__ 1170 } 1171 ;; 1172 { .mmf 1173 STFD [C1 ] = f6, SIZE 1174 STFD [C9 ] = f7, SIZE 1175 FMA f16 = ALPHA_R, f68, f16 1176 } 1177 { .mmf 1178 (p5) LDFD f6 = [C4 ], SIZE 1179 (p5) LDFD f7 = [C12], SIZE 1180 FMA f17 = ALPHA_R, f70, f17 1181 } 1182 ;; 1183 { .mmf 1184 STFD [C1 ] = f10, SIZE 1185 STFD [C9 ] = f11, SIZE 1186 FMA f18 = ALPHA_I, f68, f18 1187 } 1188 { .mmf 1189 (p5) LDFD f10 = [C4 ], SIZE 1190 (p5) LDFD f11 = [C12], SIZE 1191 FMA f19 = ALPHA_I, f70, f19 1192 } 1193 ;; 1194 { .mmf 1195 STFD [C1 ] = f12, SIZE 1196 STFD [C9 ] = f13, SIZE 1197 FMA f20 = ALPHA_R, f69, f20 1198 } 1199 { .mmf 1200 (p5) LDFD f12 = [C4 ], - 11 * SIZE 1201 (p5) LDFD f13 = [C12], - 11 * SIZE 1202 FMA f21 = ALPHA_R, f71, f21 1203 } 1204 ;; 1205 { .mmf 1206 STFD [C1 ] = f14, 5 * SIZE 1207 STFD [C9 ] = f15, 5 * SIZE 1208 FMA f22 = ALPHA_I, f69, f22 1209 } 1210 { .mmf 1211 (p5) LDFD f14 = [C5 ], SIZE 1212 (p5) LDFD f15 = [C13], SIZE 1213 FMA f23 = 
ALPHA_I, f71, f23 1214 } 1215 ;; 1216 { .mmf 1217 STFD [C1 ] = f16, SIZE 1218 STFD [C9 ] = f17, SIZE 1219 FMA f24 = ALPHA_R, f72, f24 1220 } 1221 { .mmf 1222 (p5) LDFD f16 = [C5 ], SIZE 1223 (p5) LDFD f17 = [C13], SIZE 1224 FMA f25 = ALPHA_R, f74, f25 1225 } 1226 ;; 1227 { .mmf 1228 STFD [C1 ] = f18, SIZE 1229 STFD [C9 ] = f19, SIZE 1230 FMA f26 = ALPHA_I, f72, f26 1231 } 1232 { .mmf 1233 (p5) LDFD f18 = [C5 ], SIZE 1234 (p5) LDFD f19 = [C13], SIZE 1235 FMA f27 = ALPHA_I, f74, f27 1236 } 1237 ;; 1238 { .mmf 1239 STFD [C1 ] = f20, SIZE 1240 STFD [C9 ] = f21, SIZE 1241 FMA f28 = ALPHA_R, f73, f28 1242 } 1243 { .mmf 1244 (p5) LDFD f20 = [C5 ], 5 * SIZE 1245 (p5) LDFD f21 = [C13], 5 * SIZE 1246 FMA f29 = ALPHA_R, f75, f29 1247 } 1248 ;; 1249 { .mmf 1250 STFD [C1 ] = f22, 5 * SIZE 1251 STFD [C9 ] = f23, 5 * SIZE 1252 FMA f30 = ALPHA_I, f73, f30 1253 } 1254 { .mmf 1255 (p5) LDFD f22 = [C5 ], SIZE 1256 (p5) LDFD f23 = [C13], SIZE 1257 FMA f31 = ALPHA_I, f75, f31 1258 } 1259 ;; 1260 { .mmf 1261 STFD [C2 ] = f24, SIZE 1262 STFD [C10] = f25, SIZE 1263 FMA f32 = ALPHA_R, f76, f32 1264 } 1265 { .mmf 1266 (p5) LDFD f24 = [C5 ], SIZE 1267 (p5) LDFD f25 = [C13], SIZE 1268 FMA f33 = ALPHA_R, f78, f33 1269 } 1270 ;; 1271 { .mmf 1272 STFD [C2 ] = f26, SIZE 1273 STFD [C10] = f27, SIZE 1274 FMA f34 = ALPHA_I, f76, f34 1275 } 1276 { .mmf 1277 (p5) LDFD f26 = [C5 ], SIZE 1278 (p5) LDFD f27 = [C13], SIZE 1279 FMA f35 = ALPHA_I, f78, f35 1280 } 1281 ;; 1282 { .mmf 1283 STFD [C2 ] = f28, SIZE 1284 STFD [C10] = f29, SIZE 1285 FMA f36 = ALPHA_R, f77, f36 1286 } 1287 { .mmf 1288 (p5) LDFD f28 = [C5 ], - 11 * SIZE 1289 (p5) LDFD f29 = [C13], - 11 * SIZE 1290 FMA f37 = ALPHA_R, f79, f37 1291 } 1292 ;; 1293 { .mmf 1294 STFD [C2 ] = f30, 5 * SIZE 1295 STFD [C10] = f31, 5 * SIZE 1296 FMA f38 = ALPHA_I, f77, f38 1297 } 1298 { .mmf 1299 (p5) LDFD f30 = [C6 ], SIZE 1300 (p5) LDFD f31 = [C14], SIZE 1301 FMA f39 = ALPHA_I, f79, f39 1302 } 1303 ;; 1304 { .mmf 1305 STFD [C2 ] = f32, SIZE 1306 STFD [C10] 
= f33, SIZE 1307 FMA f48 = ALPHA_R, f80, f48 1308 } 1309 { .mmf 1310 (p5) LDFD f32 = [C6 ], SIZE 1311 (p5) LDFD f33 = [C14], SIZE 1312 FMA f49 = ALPHA_R, f82, f49 1313 } 1314 ;; 1315 { .mmf 1316 STFD [C2 ] = f34, SIZE 1317 STFD [C10] = f35, SIZE 1318 FMA f50 = ALPHA_I, f80, f50 1319 } 1320 { .mmf 1321 (p5) LDFD f34 = [C6 ], SIZE 1322 (p5) LDFD f35 = [C14], SIZE 1323 FMA f51 = ALPHA_I, f82, f51 1324 } 1325 ;; 1326 { .mmf 1327 STFD [C2 ] = f36, SIZE 1328 STFD [C10] = f37, SIZE 1329 FMA f52 = ALPHA_R, f81, f52 1330 } 1331 { .mmf 1332 (p5) LDFD f36 = [C6 ], 5 * SIZE 1333 (p5) LDFD f37 = [C14], 5 * SIZE 1334 FMA f53 = ALPHA_R, f83, f53 1335 } 1336 ;; 1337 { .mmf 1338 STFD [C2 ] = f38, 5 * SIZE 1339 STFD [C10] = f39, 5 * SIZE 1340 FMA f54 = ALPHA_I, f81, f54 1341 } 1342 { .mmf 1343 (p5) LDFD f38 = [C6 ], SIZE 1344 (p5) LDFD f39 = [C14], SIZE 1345 FMA f55 = ALPHA_I, f83, f55 1346 } 1347 ;; 1348 { .mmf 1349 STFD [C3 ] = f48, SIZE 1350 STFD [C11] = f49, SIZE 1351 FMA f40 = ALPHA_R, f84, f40 1352 } 1353 { .mmf 1354 (p5) LDFD f48 = [C6 ], SIZE 1355 (p5) LDFD f49 = [C14], SIZE 1356 FMA f41 = ALPHA_R, f86, f41 1357 } 1358 ;; 1359 { .mmf 1360 STFD [C3 ] = f50, SIZE 1361 STFD [C11] = f51, SIZE 1362 FMA f42 = ALPHA_I, f84, f42 1363 } 1364 { .mmf 1365 (p5) LDFD f50 = [C6 ], SIZE 1366 (p5) LDFD f51 = [C14], SIZE 1367 FMA f43 = ALPHA_I, f86, f43 1368 } 1369 ;; 1370 { .mmf 1371 STFD [C3 ] = f52, SIZE 1372 STFD [C11] = f53, SIZE 1373 FMA f44 = ALPHA_R, f85, f44 1374 } 1375 { .mmf 1376 (p5) LDFD f52 = [C6 ], - 11 * SIZE 1377 (p5) LDFD f53 = [C14], - 11 * SIZE 1378 FMA f45 = ALPHA_R, f87, f45 1379 } 1380 ;; 1381 { .mmf 1382 STFD [C3 ] = f54, 5 * SIZE 1383 STFD [C11] = f55, 5 * SIZE 1384 FMA f46 = ALPHA_I, f85, f46 1385 } 1386 { .mmf 1387 (p5) LDFD f54 = [C7 ], SIZE 1388 (p5) LDFD f55 = [C15], SIZE 1389 FMA f56 = ALPHA_I, f87, f56 1390 } 1391 ;; 1392 { .mmf 1393 STFD [C3 ] = f40, SIZE 1394 STFD [C11] = f41, SIZE 1395 FMA f57 = ALPHA_R, f88, f57 1396 } 1397 { .mmf 1398 (p5) LDFD f40 = [C7 
], SIZE 1399 (p5) LDFD f41 = [C15], SIZE 1400 FMA f58 = ALPHA_R, f90, f58 1401 } 1402 ;; 1403 { .mmf 1404 STFD [C3 ] = f42, SIZE 1405 STFD [C11] = f43, SIZE 1406 FMA f59 = ALPHA_I, f88, f59 1407 } 1408 { .mmf 1409 (p5) LDFD f42 = [C7 ], SIZE 1410 (p5) LDFD f43 = [C15], SIZE 1411 FMA f60 = ALPHA_I, f90, f60 1412 } 1413 ;; 1414 { .mmf 1415 STFD [C3 ] = f44, SIZE 1416 STFD [C11] = f45, SIZE 1417 FMA f61 = ALPHA_R, f89, f61 1418 } 1419 { .mmf 1420 (p5) LDFD f44 = [C7 ], 5 * SIZE 1421 (p5) LDFD f45 = [C15], 5 * SIZE 1422 FMA f62 = ALPHA_R, f91, f62 1423 } 1424 ;; 1425 { .mmf 1426 STFD [C3 ] = f46, 5 * SIZE 1427 STFD [C11] = f56, 5 * SIZE 1428 FMA f63 = ALPHA_I, f89, f63 1429 } 1430 { .mmf 1431 (p5) LDFD f46 = [C7 ], SIZE 1432 (p5) LDFD f56 = [C15], SIZE 1433 FMA f47 = ALPHA_I, f91, f47 1434 } 1435 ;; 1436 { .mmf 1437 STFD [C4 ] = f57, SIZE 1438 STFD [C12] = f58, SIZE 1439 FMA f64 = ALPHA_R, f92, f64 1440 } 1441 { .mmf 1442 (p5) LDFD f57 = [C7 ], SIZE 1443 (p5) LDFD f58 = [C15], SIZE 1444 FMA f65 = ALPHA_R, f94, f65 1445 } 1446 ;; 1447 { .mmf 1448 STFD [C4 ] = f59, SIZE 1449 STFD [C12] = f60, SIZE 1450 FMA f6 = ALPHA_I, f92, f6 1451 } 1452 { .mmf 1453 (p5) LDFD f59 = [C7 ], SIZE 1454 (p5) LDFD f60 = [C15], SIZE 1455 FMA f7 = ALPHA_I, f94, f7 1456 } 1457 ;; 1458 { .mmf 1459 STFD [C4 ] = f61, SIZE 1460 STFD [C12] = f62, SIZE 1461 FMA f10 = ALPHA_R, f93, f10 1462 } 1463 { .mmf 1464 (p5) LDFD f61 = [C7 ], - 11 * SIZE 1465 (p5) LDFD f62 = [C15], - 11 * SIZE 1466 FMA f11 = ALPHA_R, f95, f11 1467 } 1468 ;; 1469 { .mmf 1470 STFD [C4 ] = f63, 5 * SIZE 1471 STFD [C12] = f47, 5 * SIZE 1472 FMA f12 = ALPHA_I, f93, f12 1473 } 1474 { .mmf 1475 (p5) LDFD f63 = [C8 ], SIZE 1476 (p5) LDFD f47 = [C16], SIZE 1477 FMA f13 = ALPHA_I, f95, f13 1478 } 1479 ;; 1480 { .mmf 1481 STFD [C4 ] = f64, SIZE 1482 STFD [C12] = f65, SIZE 1483 FMA f14 = ALPHA_R, f96, f14 1484 } 1485 { .mmf 1486 (p5) LDFD f64 = [C8 ], SIZE 1487 (p5) LDFD f65 = [C16], SIZE 1488 FMA f15 = ALPHA_R, f98, f15 1489 } 1490 ;; 1491 
{ .mmf 1492 STFD [C4 ] = f6, SIZE 1493 STFD [C12] = f7, SIZE 1494 FMA f16 = ALPHA_I, f96, f16 1495 } 1496 { .mmf 1497 (p5) LDFD f6 = [C8 ], SIZE 1498 (p5) LDFD f7 = [C16], SIZE 1499 FMA f17 = ALPHA_I, f98, f17 1500 } 1501 ;; 1502 { .mmf 1503 STFD [C4 ] = f10, SIZE 1504 STFD [C12] = f11, SIZE 1505 FMA f18 = ALPHA_R, f97, f18 1506 } 1507 { .mmf 1508 (p5) LDFD f10 = [C8 ], 5 * SIZE 1509 (p5) LDFD f11 = [C16], 5 * SIZE 1510 FMA f19 = ALPHA_R, f99, f19 1511 } 1512 ;; 1513 { .mmf 1514 STFD [C4 ] = f12, 5 * SIZE 1515 STFD [C12] = f13, 5 * SIZE 1516 FMA f20 = ALPHA_I, f97, f20 1517 } 1518 { .mmf 1519 (p5) LDFD f12 = [C8 ], SIZE 1520 (p5) LDFD f13 = [C16], SIZE 1521 FMA f21 = ALPHA_I, f99, f21 1522 } 1523 ;; 1524 { .mmf 1525 STFD [C5 ] = f14, SIZE 1526 STFD [C13] = f15, SIZE 1527 FMA f22 = ALPHA_R, f100, f22 1528 } 1529 { .mmf 1530 (p5) LDFD f14 = [C8 ], SIZE 1531 (p5) LDFD f15 = [C16], SIZE 1532 FMA f23 = ALPHA_R, f102, f23 1533 } 1534 ;; 1535 { .mmf 1536 STFD [C5 ] = f16, SIZE 1537 STFD [C13] = f17, SIZE 1538 FMA f24 = ALPHA_I, f100, f24 1539 } 1540 { .mmf 1541 (p5) LDFD f16 = [C8 ], SIZE 1542 (p5) LDFD f17 = [C16], SIZE 1543 FMA f25 = ALPHA_I, f102, f25 1544 } 1545 ;; 1546 { .mmf 1547 STFD [C5 ] = f18, SIZE 1548 STFD [C13] = f19, SIZE 1549 FMA f26 = ALPHA_R, f101, f26 1550 } 1551 { .mmf 1552 (p5) LDFD f18 = [C8 ], - 11 * SIZE 1553 (p5) LDFD f19 = [C16], - 11 * SIZE 1554 FMA f27 = ALPHA_R, f103, f27 1555 } 1556 ;; 1557 { .mmf 1558 STFD [C5 ] = f20, 5 * SIZE 1559 STFD [C13] = f21, 5 * SIZE 1560 FMA f28 = ALPHA_I, f101, f28 1561 } 1562 { .mmf 1563 nop __LINE__ 1564 nop __LINE__ 1565 FMA f29 = ALPHA_I, f103, f29 1566 } 1567 ;; 1568 { .mmf 1569 STFD [C5 ] = f22, SIZE 1570 STFD [C13] = f23, SIZE 1571 FMA f30 = ALPHA_R, f104, f30 1572 } 1573 { .mmf 1574 nop __LINE__ 1575 nop __LINE__ 1576 FMA f31 = ALPHA_R, f106, f31 1577 } 1578 ;; 1579 { .mmf 1580 STFD [C5 ] = f24, SIZE 1581 STFD [C13] = f25, SIZE 1582 FMA f32 = ALPHA_I, f104, f32 1583 } 1584 { .mmf 1585 nop __LINE__ 1586 nop 
__LINE__ 1587 FMA f33 = ALPHA_I, f106, f33 1588 } 1589 ;; 1590 { .mmf 1591 STFD [C5 ] = f26, SIZE 1592 STFD [C13] = f27, SIZE 1593 FMA f34 = ALPHA_R, f105, f34 1594 } 1595 { .mmf 1596 nop __LINE__ 1597 nop __LINE__ 1598 FMA f35 = ALPHA_R, f107, f35 1599 } 1600 ;; 1601 { .mmf 1602 STFD [C5 ] = f28, 5 * SIZE 1603 STFD [C13] = f29, 5 * SIZE 1604 FMA f36 = ALPHA_I, f105, f36 1605 } 1606 { .mmf 1607 nop __LINE__ 1608 nop __LINE__ 1609 FMA f37 = ALPHA_I, f107, f37 1610 } 1611 ;; 1612 { .mmf 1613 STFD [C6 ] = f30, SIZE 1614 STFD [C14] = f31, SIZE 1615 FMA f38 = ALPHA_R, f108, f38 1616 } 1617 { .mmf 1618 nop __LINE__ 1619 nop __LINE__ 1620 FMA f39 = ALPHA_R, f110, f39 1621 } 1622 ;; 1623 { .mmf 1624 STFD [C6 ] = f32, SIZE 1625 STFD [C14] = f33, SIZE 1626 FMA f48 = ALPHA_I, f108, f48 1627 } 1628 { .mmf 1629 nop __LINE__ 1630 nop __LINE__ 1631 FMA f49 = ALPHA_I, f110, f49 1632 } 1633 ;; 1634 { .mmf 1635 STFD [C6 ] = f34, SIZE 1636 STFD [C14] = f35, SIZE 1637 FMA f50 = ALPHA_R, f109, f50 1638 } 1639 { .mmf 1640 nop __LINE__ 1641 nop __LINE__ 1642 FMA f51 = ALPHA_R, f111, f51 1643 } 1644 ;; 1645 { .mmf 1646 STFD [C6 ] = f36, 5 * SIZE 1647 STFD [C14] = f37, 5 * SIZE 1648 FMA f52 = ALPHA_I, f109, f52 1649 } 1650 { .mmf 1651 nop __LINE__ 1652 nop __LINE__ 1653 FMA f53 = ALPHA_I, f111, f53 1654 } 1655 ;; 1656 { .mmf 1657 STFD [C6 ] = f38, SIZE 1658 STFD [C14] = f39, SIZE 1659 FMA f54 = ALPHA_R, f112, f54 1660 } 1661 { .mmf 1662 nop __LINE__ 1663 nop __LINE__ 1664 FMA f55 = ALPHA_R, f114, f55 1665 } 1666 ;; 1667 { .mmf 1668 STFD [C6 ] = f48, SIZE 1669 STFD [C14] = f49, SIZE 1670 FMA f40 = ALPHA_I, f112, f40 1671 } 1672 { .mmf 1673 nop __LINE__ 1674 nop __LINE__ 1675 FMA f41 = ALPHA_I, f114, f41 1676 } 1677 ;; 1678 { .mmf 1679 STFD [C6 ] = f50, SIZE 1680 STFD [C14] = f51, SIZE 1681 FMA f42 = ALPHA_R, f113, f42 1682 } 1683 { .mmf 1684 nop __LINE__ 1685 nop __LINE__ 1686 FMA f43 = ALPHA_R, f115, f43 1687 } 1688 ;; 1689 { .mmf 1690 STFD [C6 ] = f52, 5 * SIZE 1691 STFD [C14] = f53, 5 * 
SIZE 1692 FMA f44 = ALPHA_I, f113, f44 1693 } 1694 { .mmf 1695 nop __LINE__ 1696 nop __LINE__ 1697 FMA f45 = ALPHA_I, f115, f45 1698 } 1699 ;; 1700 { .mmf 1701 STFD [C7 ] = f54, SIZE 1702 STFD [C15] = f55, SIZE 1703 FMA f46 = ALPHA_R, f116, f46 1704 } 1705 { .mmf 1706 nop __LINE__ 1707 nop __LINE__ 1708 FMA f56 = ALPHA_R, f118, f56 1709 } 1710 ;; 1711 { .mmf 1712 STFD [C7 ] = f40, SIZE 1713 STFD [C15] = f41, SIZE 1714 FMA f57 = ALPHA_I, f116, f57 1715 } 1716 { .mmf 1717 nop __LINE__ 1718 nop __LINE__ 1719 FMA f58 = ALPHA_I, f118, f58 1720 } 1721 ;; 1722 { .mmf 1723 STFD [C7 ] = f42, SIZE 1724 STFD [C15] = f43, SIZE 1725 FMA f59 = ALPHA_R, f117, f59 1726 } 1727 { .mmf 1728 nop __LINE__ 1729 nop __LINE__ 1730 FMA f60 = ALPHA_R, f119, f60 1731 } 1732 ;; 1733 { .mmf 1734 STFD [C7 ] = f44, 5 * SIZE 1735 STFD [C15] = f45, 5 * SIZE 1736 FMA f61 = ALPHA_I, f117, f61 1737 } 1738 { .mmf 1739 nop __LINE__ 1740 nop __LINE__ 1741 FMA f62 = ALPHA_I, f119, f62 1742 } 1743 ;; 1744 { .mmf 1745 STFD [C7 ] = f46, SIZE 1746 STFD [C15] = f56, SIZE 1747 FMA f63 = ALPHA_R, f120, f63 1748 } 1749 { .mmf 1750 nop __LINE__ 1751 nop __LINE__ 1752 FMA f47 = ALPHA_R, f122, f47 1753 } 1754 ;; 1755 { .mmf 1756 STFD [C7 ] = f57, SIZE 1757 STFD [C15] = f58, SIZE 1758 FMA f64 = ALPHA_I, f120, f64 1759 } 1760 { .mmf 1761 nop __LINE__ 1762 nop __LINE__ 1763 FMA f65 = ALPHA_I, f122, f65 1764 } 1765 ;; 1766 { .mmf 1767 STFD [C7 ] = f59, SIZE 1768 STFD [C15] = f60, SIZE 1769 FMA f6 = ALPHA_R, f121, f6 1770 } 1771 { .mmf 1772 nop __LINE__ 1773 nop __LINE__ 1774 FMA f7 = ALPHA_R, f123, f7 1775 } 1776 ;; 1777 { .mmf 1778 STFD [C7 ] = f61, 5 * SIZE 1779 STFD [C15] = f62, 5 * SIZE 1780 FMA f10 = ALPHA_I, f121, f10 1781 } 1782 { .mmf 1783 nop __LINE__ 1784 nop __LINE__ 1785 FMA f11 = ALPHA_I, f123, f11 1786 } 1787 ;; 1788 { .mmf 1789 STFD [C8 ] = f63, SIZE 1790 STFD [C16] = f47, SIZE 1791 FMA f12 = ALPHA_R, f124, f12 1792 } 1793 { .mmf 1794 nop __LINE__ 1795 nop __LINE__ 1796 FMA f13 = ALPHA_R, f126, f13 1797 
} 1798 ;; 1799 { .mmf 1800 STFD [C8 ] = f64, SIZE 1801 STFD [C16] = f65, SIZE 1802 FMA f14 = ALPHA_I, f124, f14 1803 } 1804 { .mmf 1805 nop __LINE__ 1806 nop __LINE__ 1807 FMA f15 = ALPHA_I, f126, f15 1808 } 1809 ;; 1810 { .mmf 1811 STFD [C8 ] = f6, SIZE 1812 STFD [C16] = f7, SIZE 1813 FMA f16 = ALPHA_R, f125, f16 1814 } 1815 { .mmf 1816 nop __LINE__ 1817 nop __LINE__ 1818 FMA f17 = ALPHA_R, f127, f17 1819 } 1820 ;; 1821 { .mmf 1822 STFD [C8 ] = f10, 5 * SIZE 1823 STFD [C16] = f11, 5 * SIZE 1824 FMA f18 = ALPHA_I, f125, f18 1825 } 1826 { .mmf 1827 nop __LINE__ 1828 nop __LINE__ 1829 FMA f19 = ALPHA_I, f127, f19 1830 } 1831 ;; 1832 { .mmf 1833 STFD [C8 ] = f12, SIZE 1834 STFD [C16] = f13, SIZE 1835 mov f64 = f0 1836 } 1837 { .mmf 1838 nop __LINE__ 1839 nop __LINE__ 1840 mov f72 = f0 1841 } 1842 ;; 1843 { .mmf 1844 STFD [C8 ] = f14, SIZE 1845 STFD [C16] = f15, SIZE 1846 mov f80 = f0 1847 } 1848 { .mmf 1849 nop __LINE__ 1850 nop __LINE__ 1851 mov f88 = f0 1852 } 1853 ;; 1854 { .mmf 1855 STFD [C8 ] = f16, SIZE 1856 STFD [C16] = f17, SIZE 1857 mov f96 = f0 1858 } 1859 { .mmf 1860 nop __LINE__ 1861 nop __LINE__ 1862 mov f104 = f0 1863 } 1864 ;; 1865 { .mmf 1866 STFD [C8 ] = f18, 5 * SIZE 1867 STFD [C16] = f19, 5 * SIZE 1868 mov f112 = f0 1869 } 1870 { .mfb 1871 adds I = -1, I 1872 mov f120 = f0 1873 (p6) br.cond.dptk .L011 1874 } 1875 ;; 1876 /* .L020: setup for the 4-row remainder of the current 8-column panel. tbit.z tests bit 2 of M and control branches to .L030 when that bit is clear. Otherwise the operands of the first k-step are loaded (f48..f55 from B/BOFFSET, f32..f35 from AOFFSET), accumulators f65-f67, f73-f75, ..., f121-f123 are zeroed (on the fall-through path f64, f72, ..., f120 were zeroed by the moves just above), PREA is pointed PREFETCHSIZE elements past AOFFSET, and the trip count ((K + 1) >> 1) - 1 is written to L and ar.lc. p12 is set when bit 0 of K + 1 is clear (K odd); p3 is initialised to true. */ 1877.L020: 1878 { .mfi 1879 cmp.eq p3, p0 = r0, r0 1880 mov f89 = f0 1881 tbit.z p6, p7 = M, 2 1882 } 1883 { .mfb 1884 nop __LINE__ 1885 mov f81 = f0 1886 (p6) br.cond.dptk .L030 1887 } 1888 ;; 1889 { .mfi 1890 LDFPD f48, f49 = [B] 1891 mov f65 = f0 1892 nop __LINE__ 1893 } 1894 { .mfi 1895 adds BOFFSET = 2 * SIZE, B 1896 mov f73 = f0 1897 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 1898 } 1899 ;; 1900 { .mmf 1901 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 1902 setf.d f97 = r0 1903 mov f105 = f0 1904 } 1905 { .mfi 1906 setf.d f113 = r0 1907 mov f121 = f0 1908 adds L = 1, K 1909 } 1910 ;; 1911 { .mmf 1912 LDFPD f50, f51 = [BOFFSET], 2 * 
SIZE 1913 setf.d f66 = r0 1914 mov f74 = f0 1915 } 1916 { .mfi 1917 setf.d f82 = r0 1918 mov f90 = f0 1919 tbit.z p12, p0 = L, 0 1920 } 1921 ;; 1922 { .mmf 1923 LDFPD f52, f53 = [BOFFSET], 2 * SIZE 1924 setf.d f98 = r0 1925 mov f106 = f0 1926 } 1927 { .mfi 1928 setf.d f114 = r0 1929 mov f122 = f0 1930 shr L = L, 1 1931 } 1932 ;; 1933 { .mfi 1934 LDFPD f54, f55 = [BOFFSET], 2 * SIZE 1935 mov f75 = f0 1936 adds L = -1, L 1937 } 1938 { .mmf 1939 setf.d f67 = r0 1940 setf.d f83 = r0 1941 mov f91 = f0 1942 } 1943 ;; 1944 { .mfi 1945 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 1946 mov f107 = f0 1947 mov ar.lc = L 1948 } 1949 { .mmf 1950 setf.d f99 = r0 1951 setf.d f115 = r0 1952 mov f123 = f0 1953 } 1954 ;; 1955 .align 32 1956 /* .L022: K loop for the 4-row by 8-column block (FMA is a macro from common.h, presumably d = a * b + c -- confirm). Each pass does one unconditional k-step (A values f32..f35 times B values f48..f55 into f64..f123) and a second k-step predicated on p3 (A values f40..f43 times B values f56..f63). When p12 is set, p3 is recomputed as (L != 0), so the second k-step is skipped on the final pass. p4 = (L != 0) guards the operand loads for the next pass. p5 = (L == 0) marks the final pass, which sets C9..C16 = C1..C8 + 4 * SIZE and preloads C values through C1/C9, C2/C10 and C3/C11 for the write-back. L is decremented by hand each pass; br.cloop drives the loop from ar.lc. */ 1957.L022: 1958 { .mfi 1959 lfetch.nt1 [PREA], 16 * SIZE 1960 FMA f64 = f32, f48, f64 // A1 * B1 1961 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 1962 } 1963 { .mfi 1964 nop __LINE__ 1965 FMA f72 = f32, f49, f72 // A1 * B2 1966 (p12) cmp.ne p3, p0 = 0, L 1967 } 1968 ;; 1969 { .mfi 1970 lfetch.nt1 [PREB], 16 * SIZE 1971 FMA f80 = f32, f50, f80 // A1 * B3 1972 cmp.ne p4, p5 = 0, L 1973 } 1974 { .mfb 1975 nop __LINE__ 1976 FMA f88 = f32, f51, f88 // A1 * B4 1977 nop __LINE__ 1978 } 1979 ;; 1980 { .mfi 1981 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 1982 FMA f96 = f32, f52, f96 // A1 * B5 1983 (p5) adds C9 = 4 * SIZE, C1 1984 } 1985 { .mfi 1986 nop __LINE__ 1987 FMA f104 = f32, f53, f104 // A1 * B6 1988 (p5) adds C10 = 4 * SIZE, C2 1989 } 1990 ;; 1991 { .mfi 1992 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 1993 FMA f112 = f32, f54, f112 // A1 * B7 1994 (p5) adds C11 = 4 * SIZE, C3 1995 } 1996 { .mfi 1997 nop __LINE__ 1998 FMA f120 = f32, f55, f120 // A1 * B8 1999 (p5) adds C12 = 4 * SIZE, C4 2000 } 2001 ;; 2002 { .mfi 2003 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 2004 FMA f65 = f33, f48, f65 // A2 * B1 2005 (p5) adds C13 = 4 * SIZE, C5 2006 } 2007 { .mfi 2008 nop __LINE__ 2009 FMA f73 = f33, f49, f73 // A2 * B2 2010 (p5) adds C14 = 4 * SIZE, C6 2011 } 2012 
;; 2013 { .mfi 2014 (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE 2015 FMA f81 = f33, f50, f81 // A2 * B3 2016 (p5) adds C15 = 4 * SIZE, C7 2017 } 2018 { .mfi 2019 nop __LINE__ 2020 FMA f89 = f33, f51, f89 // A2 * B4 2021 (p5) adds C16 = 4 * SIZE, C8 2022 } 2023 ;; 2024 { .mfb 2025 (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE 2026 FMA f97 = f33, f52, f97 // A2 * B5 2027 nop __LINE__ 2028 } 2029 { .mfb 2030 nop __LINE__ 2031 FMA f105 = f33, f53, f105 // A2 * B6 2032 nop __LINE__ 2033 } 2034 ;; 2035 { .mfb 2036 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 2037 FMA f113 = f33, f54, f113 // A2 * B7 2038 nop __LINE__ 2039 } 2040 { .mfb 2041 nop __LINE__ 2042 FMA f121 = f33, f55, f121 // A2 * B8 2043 nop __LINE__ 2044 } 2045 ;; 2046 { .mfb 2047 nop __LINE__ 2048 FMA f66 = f34, f48, f66 // A3 * B1 2049 nop __LINE__ 2050 } 2051 { .mfb 2052 nop __LINE__ 2053 FMA f74 = f34, f49, f74 // A3 * B2 2054 nop __LINE__ 2055 } 2056 ;; 2057 { .mfb 2058 nop __LINE__ 2059 FMA f82 = f34, f50, f82 // A3 * B3 2060 nop __LINE__ 2061 } 2062 { .mfb 2063 nop __LINE__ 2064 FMA f90 = f34, f51, f90 // A3 * B4 2065 nop __LINE__ 2066 } 2067 ;; 2068 { .mfb 2069 nop __LINE__ 2070 FMA f98 = f34, f52, f98 // A3 * B5 2071 nop __LINE__ 2072 } 2073 { .mfb 2074 nop __LINE__ 2075 FMA f106 = f34, f53, f106 // A3 * B6 2076 nop __LINE__ 2077 } 2078 ;; 2079 { .mfb 2080 nop __LINE__ 2081 FMA f114 = f34, f54, f114 // A3 * B7 2082 nop __LINE__ 2083 } 2084 { .mfb 2085 nop __LINE__ 2086 FMA f122 = f34, f55, f122 // A3 * B8 2087 nop __LINE__ 2088 } 2089 ;; 2090 { .mfb 2091 nop __LINE__ 2092 FMA f67 = f35, f48, f67 // A4 * B1 2093 nop __LINE__ 2094 } 2095 { .mfb 2096 nop __LINE__ 2097 FMA f75 = f35, f49, f75 // A4 * B2 2098 nop __LINE__ 2099 } 2100 ;; 2101 { .mfb 2102 nop __LINE__ 2103 FMA f83 = f35, f50, f83 // A4 * B3 2104 nop __LINE__ 2105 } 2106 { .mfb 2107 nop __LINE__ 2108 FMA f91 = f35, f51, f91 // A4 * B4 2109 nop __LINE__ 2110 } 2111 ;; 2112 { .mfb 2113 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 2114 FMA f99 = f35, f52, 
f99 // A4 * B5 2115 nop __LINE__ 2116 } 2117 { .mfb 2118 nop __LINE__ 2119 FMA f107 = f35, f53, f107 // A4 * B6 2120 nop __LINE__ 2121 } 2122 ;; 2123 { .mfb 2124 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 2125 FMA f115 = f35, f54, f115 // A4 * B7 2126 nop __LINE__ 2127 } 2128 { .mfb 2129 nop __LINE__ 2130 FMA f123 = f35, f55, f123 // A4 * B8 2131 nop __LINE__ 2132 } 2133 ;; 2134 { .mfb 2135 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 2136 (p3) FMA f64 = f40, f56, f64 // A1 * B1 2137 nop __LINE__ 2138 } 2139 { .mfb 2140 nop __LINE__ 2141 (p3) FMA f72 = f40, f57, f72 // A1 * B2 2142 nop __LINE__ 2143 } 2144 ;; 2145 { .mfb 2146 (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE 2147 (p3) FMA f80 = f40, f58, f80 // A1 * B3 2148 nop __LINE__ 2149 } 2150 { .mfb 2151 nop __LINE__ 2152 (p3) FMA f88 = f40, f59, f88 // A1 * B4 2153 nop __LINE__ 2154 } 2155 ;; 2156 { .mfb 2157 (p5) LDFD f6 = [C1 ], SIZE 2158 (p3) FMA f96 = f40, f60, f96 // A1 * B5 2159 nop __LINE__ 2160 } 2161 { .mfb 2162 (p5) LDFD f7 = [C9 ], SIZE 2163 (p3) FMA f104 = f40, f61, f104 // A1 * B6 2164 nop __LINE__ 2165 } 2166 ;; 2167 { .mfb 2168 (p5) LDFD f10 = [C1 ], SIZE 2169 (p3) FMA f112 = f40, f62, f112 // A1 * B7 2170 nop __LINE__ 2171 } 2172 { .mfb 2173 (p5) LDFD f11 = [C9 ], SIZE 2174 (p3) FMA f120 = f40, f63, f120 // A1 * B8 2175 nop __LINE__ 2176 } 2177 ;; 2178 { .mfb 2179 (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE 2180 (p3) FMA f65 = f41, f56, f65 // A2 * B1 2181 nop __LINE__ 2182 } 2183 /* NOTE(review): the next .mfb bundle lists only two instructions (no leading nop) and no stop (;;) separates it from the bundle that follows, unlike the neighbouring bundle pairs -- confirm this is intentional. */ { .mfb 2184 (p3) FMA f73 = f41, f57, f73 // A2 * B2 2185 nop __LINE__ 2186 } 2187 { .mfb 2188 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 2189 (p3) FMA f81 = f41, f58, f81 // A2 * B3 2190 nop __LINE__ 2191 } 2192 { .mfb 2193 (p3) FMA f89 = f41, f59, f89 // A2 * B4 2194 nop __LINE__ 2195 } 2196 ;; 2197 { .mfb 2198 (p5) LDFD f12 = [C1 ], SIZE 2199 (p3) FMA f97 = f41, f60, f97 // A2 * B5 2200 nop __LINE__ 2201 } 2202 { .mfb 2203 (p5) LDFD f13 = [C9 ], SIZE 2204 (p3) FMA f105 = f41, f61, f105 // A2 * B6 2205 nop __LINE__ 2206 } 2207 ;; 
2208 { .mfb 2209 (p5) LDFD f14 = [C1 ], - 3 * SIZE 2210 (p3) FMA f113 = f41, f62, f113 // A2 * B7 2211 nop __LINE__ 2212 } 2213 { .mfb 2214 (p5) LDFD f15 = [C9 ], - 3 * SIZE 2215 (p3) FMA f121 = f41, f63, f121 // A2 * B8 2216 nop __LINE__ 2217 } 2218 ;; 2219 { .mfb 2220 (p5) LDFD f16 = [C2 ], SIZE 2221 (p3) FMA f66 = f42, f56, f66 // A3 * B1 2222 nop __LINE__ 2223 } 2224 { .mfb 2225 (p5) LDFD f17 = [C10], SIZE 2226 (p3) FMA f74 = f42, f57, f74 // A3 * B2 2227 nop __LINE__ 2228 } 2229 ;; 2230 { .mfb 2231 (p5) LDFD f18 = [C2 ], SIZE 2232 (p3) FMA f82 = f42, f58, f82 // A3 * B3 2233 nop __LINE__ 2234 } 2235 { .mfb 2236 (p5) LDFD f19 = [C10], SIZE 2237 (p3) FMA f90 = f42, f59, f90 // A3 * B4 2238 nop __LINE__ 2239 } 2240 ;; 2241 { .mfb 2242 (p5) LDFD f20 = [C2 ], SIZE 2243 (p3) FMA f98 = f42, f60, f98 // A3 * B5 2244 nop __LINE__ 2245 } 2246 { .mfb 2247 (p5) LDFD f21 = [C10], SIZE 2248 (p3) FMA f106 = f42, f61, f106 // A3 * B6 2249 nop __LINE__ 2250 } 2251 ;; 2252 { .mfb 2253 (p5) LDFD f22 = [C2 ], - 3 * SIZE 2254 (p3) FMA f114 = f42, f62, f114 // A3 * B7 2255 nop __LINE__ 2256 } 2257 { .mfb 2258 (p5) LDFD f23 = [C10], - 3 * SIZE 2259 (p3) FMA f122 = f42, f63, f122 // A3 * B8 2260 nop __LINE__ 2261 } 2262 ;; 2263 { .mfb 2264 (p5) LDFD f24 = [C3 ], SIZE 2265 (p3) FMA f67 = f43, f56, f67 // A4 * B1 2266 nop __LINE__ 2267 } 2268 { .mfb 2269 (p5) LDFD f25 = [C11], SIZE 2270 (p3) FMA f75 = f43, f57, f75 // A4 * B2 2271 nop __LINE__ 2272 } 2273 ;; 2274 { .mfb 2275 (p5) LDFD f26 = [C3 ], SIZE 2276 (p3) FMA f83 = f43, f58, f83 // A4 * B3 2277 nop __LINE__ 2278 } 2279 { .mfb 2280 (p5) LDFD f27 = [C11], SIZE 2281 (p3) FMA f91 = f43, f59, f91 // A4 * B4 2282 nop __LINE__ 2283 } 2284 ;; 2285 { .mfb 2286 (p5) LDFD f28 = [C3 ], SIZE 2287 (p3) FMA f99 = f43, f60, f99 // A4 * B5 2288 nop __LINE__ 2289 } 2290 { .mfb 2291 (p5) LDFD f29 = [C11], SIZE 2292 (p3) FMA f107 = f43, f61, f107 // A4 * B6 2293 nop __LINE__ 2294 } 2295 ;; 2296 { .mfi 2297 (p5) LDFD f30 = [C3 ], - 3 * SIZE 2298 
(p3) FMA f115 = f43, f62, f115 // A4 * B7 2299 adds L = -1, L 2300 } 2301 { .mfb 2302 (p5) LDFD f31 = [C11], - 3 * SIZE 2303 (p3) FMA f123 = f43, f63, f123 // A4 * B8 2304 br.cloop.sptk.few .L022 2305 } 2306 ;; 2307 /* .L028: write-back for the 4-row by 8-column block. The C values not yet loaded (via C4/C12 .. C8/C16) are fetched here, interleaved with the stores. For each accumulator the code computes FMA(ALPHA_R, acc, c) and FMA(ALPHA_I, acc, c') for two consecutive C elements and stores them through the same pointer. C1..C8 receive the results built from the first two accumulator rows, C9..C16 those from the next two; the fourth store of each group post-increments by 5 * SIZE so every pointer advances 8 * SIZE in total. At the end f64, f72, ..., f120 are reset to f0. */ 2308.L028: 2309 { .mmf 2310 LDFD f68 = [C4 ], SIZE 2311 LDFD f69 = [C12], SIZE 2312 FMA f6 = ALPHA_R, f64, f6 2313 } 2314 { .mmf 2315 nop __LINE__ 2316 nop __LINE__ 2317 FMA f7 = ALPHA_R, f66, f7 2318 } 2319 ;; 2320 { .mmf 2321 LDFD f70 = [C4 ], SIZE 2322 LDFD f71 = [C12], SIZE 2323 FMA f10 = ALPHA_I, f64, f10 2324 } 2325 { .mmf 2326 nop __LINE__ 2327 nop __LINE__ 2328 FMA f11 = ALPHA_I, f66, f11 2329 } 2330 ;; 2331 { .mmf 2332 LDFD f76 = [C4 ], SIZE 2333 LDFD f77 = [C12], SIZE 2334 FMA f12 = ALPHA_R, f65, f12 2335 } 2336 { .mmf 2337 nop __LINE__ 2338 nop __LINE__ 2339 FMA f13 = ALPHA_R, f67, f13 2340 } 2341 ;; 2342 { .mmf 2343 LDFD f78 = [C4 ], -3 * SIZE 2344 LDFD f79 = [C12], -3 * SIZE 2345 FMA f14 = ALPHA_I, f65, f14 2346 } 2347 { .mmf 2348 nop __LINE__ 2349 nop __LINE__ 2350 FMA f15 = ALPHA_I, f67, f15 2351 } 2352 ;; 2353 { .mmf 2354 STFD [C1 ] = f6, SIZE 2355 STFD [C9 ] = f7, SIZE 2356 FMA f16 = ALPHA_R, f72, f16 2357 } 2358 { .mmf 2359 LDFD f84 = [C5 ], SIZE 2360 LDFD f85 = [C13], SIZE 2361 FMA f17 = ALPHA_R, f74, f17 2362 } 2363 ;; 2364 { .mmf 2365 STFD [C1 ] = f10, SIZE 2366 STFD [C9 ] = f11, SIZE 2367 FMA f18 = ALPHA_I, f72, f18 2368 } 2369 { .mmf 2370 LDFD f86 = [C5 ], SIZE 2371 LDFD f87 = [C13], SIZE 2372 FMA f19 = ALPHA_I, f74, f19 2373 } 2374 ;; 2375 { .mmf 2376 STFD [C1 ] = f12, SIZE 2377 STFD [C9 ] = f13, SIZE 2378 FMA f20 = ALPHA_R, f73, f20 2379 } 2380 { .mmf 2381 LDFD f92 = [C5 ], SIZE 2382 LDFD f93 = [C13], SIZE 2383 FMA f21 = ALPHA_R, f75, f21 2384 } 2385 ;; 2386 { .mmf 2387 STFD [C1 ] = f14, 5 * SIZE 2388 STFD [C9 ] = f15, 5 * SIZE 2389 FMA f22 = ALPHA_I, f73, f22 2390 } 2391 { .mmf 2392 LDFD f94 = [C5 ], -3 * SIZE 2393 LDFD f95 = [C13], -3 * SIZE 2394 FMA f23 = ALPHA_I, f75, f23 2395 } 2396 ;; 2397 { .mmf 2398 STFD [C2 ] = f16, 
SIZE 2399 STFD [C10] = f17, SIZE 2400 FMA f24 = ALPHA_R, f80, f24 2401 } 2402 { .mmf 2403 LDFD f100 = [C6 ], SIZE 2404 LDFD f101 = [C14], SIZE 2405 FMA f25 = ALPHA_R, f82, f25 2406 } 2407 ;; 2408 { .mmf 2409 STFD [C2 ] = f18, SIZE 2410 STFD [C10] = f19, SIZE 2411 FMA f26 = ALPHA_I, f80, f26 2412 } 2413 { .mmf 2414 LDFD f102 = [C6 ], SIZE 2415 LDFD f103 = [C14], SIZE 2416 FMA f27 = ALPHA_I, f82, f27 2417 } 2418 ;; 2419 { .mmf 2420 STFD [C2 ] = f20, SIZE 2421 STFD [C10] = f21, SIZE 2422 FMA f28 = ALPHA_R, f81, f28 2423 } 2424 { .mmf 2425 LDFD f108 = [C6 ], SIZE 2426 LDFD f109 = [C14], SIZE 2427 FMA f29 = ALPHA_R, f83, f29 2428 } 2429 ;; 2430 { .mmf 2431 STFD [C2 ] = f22, 5 * SIZE 2432 STFD [C10] = f23, 5 * SIZE 2433 FMA f30 = ALPHA_I, f81, f30 2434 } 2435 { .mmf 2436 LDFD f110 = [C6 ], -3 * SIZE 2437 LDFD f111 = [C14], -3 * SIZE 2438 FMA f31 = ALPHA_I, f83, f31 2439 } 2440 ;; 2441 { .mmf 2442 STFD [C3 ] = f24, SIZE 2443 STFD [C11] = f25, SIZE 2444 FMA f68 = ALPHA_R, f88, f68 2445 } 2446 { .mmf 2447 LDFD f116 = [C7 ], SIZE 2448 LDFD f117 = [C15], SIZE 2449 FMA f69 = ALPHA_R, f90, f69 2450 } 2451 ;; 2452 { .mmf 2453 STFD [C3 ] = f26, SIZE 2454 STFD [C11] = f27, SIZE 2455 FMA f70 = ALPHA_I, f88, f70 2456 } 2457 { .mmf 2458 LDFD f118 = [C7 ], SIZE 2459 LDFD f119 = [C15], SIZE 2460 FMA f71 = ALPHA_I, f90, f71 2461 } 2462 ;; 2463 { .mmf 2464 STFD [C3 ] = f28, SIZE 2465 STFD [C11] = f29, SIZE 2466 FMA f76 = ALPHA_R, f89, f76 2467 } 2468 { .mmf 2469 LDFD f124 = [C7 ], SIZE 2470 LDFD f125 = [C15], SIZE 2471 FMA f77 = ALPHA_R, f91, f77 2472 } 2473 ;; 2474 { .mmf 2475 STFD [C3 ] = f30, 5 * SIZE 2476 STFD [C11] = f31, 5 * SIZE 2477 FMA f78 = ALPHA_I, f89, f78 2478 } 2479 { .mmf 2480 LDFD f126 = [C7 ], -3 * SIZE 2481 LDFD f127 = [C15], -3 * SIZE 2482 FMA f79 = ALPHA_I, f91, f79 2483 } 2484 ;; 2485 { .mmf 2486 STFD [C4 ] = f68, SIZE 2487 STFD [C12] = f69, SIZE 2488 FMA f84 = ALPHA_R, f96, f84 2489 } 2490 { .mmf 2491 LDFD f32 = [C8 ], SIZE 2492 LDFD f33 = [C16], SIZE 2493 FMA f85 = 
ALPHA_R, f98, f85 2494 } 2495 ;; 2496 { .mmf 2497 STFD [C4 ] = f70, SIZE 2498 STFD [C12] = f71, SIZE 2499 FMA f86 = ALPHA_I, f96, f86 2500 } 2501 { .mmf 2502 LDFD f34 = [C8 ], SIZE 2503 LDFD f35 = [C16], SIZE 2504 FMA f87 = ALPHA_I, f98, f87 2505 } 2506 ;; 2507 { .mmf 2508 STFD [C4 ] = f76, SIZE 2509 STFD [C12] = f77, SIZE 2510 FMA f92 = ALPHA_R, f97, f92 2511 } 2512 { .mmf 2513 LDFD f36 = [C8 ], SIZE 2514 LDFD f37 = [C16], SIZE 2515 FMA f93 = ALPHA_R, f99, f93 2516 } 2517 ;; 2518 { .mmf 2519 STFD [C4 ] = f78, 5 * SIZE 2520 STFD [C12] = f79, 5 * SIZE 2521 FMA f94 = ALPHA_I, f97, f94 2522 } 2523 { .mmf 2524 LDFD f38 = [C8 ], -3 * SIZE 2525 LDFD f39 = [C16], -3 * SIZE 2526 FMA f95 = ALPHA_I, f99, f95 2527 } 2528 ;; 2529 { .mmf 2530 STFD [C5 ] = f84, SIZE 2531 STFD [C13] = f85, SIZE 2532 FMA f100 = ALPHA_R, f104, f100 2533 } 2534 { .mmf 2535 nop __LINE__ 2536 nop __LINE__ 2537 FMA f101 = ALPHA_R, f106, f101 2538 } 2539 ;; 2540 { .mmf 2541 STFD [C5 ] = f86, SIZE 2542 STFD [C13] = f87, SIZE 2543 FMA f102 = ALPHA_I, f104, f102 2544 } 2545 { .mmf 2546 nop __LINE__ 2547 nop __LINE__ 2548 FMA f103 = ALPHA_I, f106, f103 2549 } 2550 ;; 2551 { .mmf 2552 STFD [C5 ] = f92, SIZE 2553 STFD [C13] = f93, SIZE 2554 FMA f108 = ALPHA_R, f105, f108 2555 } 2556 { .mmf 2557 nop __LINE__ 2558 nop __LINE__ 2559 FMA f109 = ALPHA_R, f107, f109 2560 } 2561 ;; 2562 { .mmf 2563 STFD [C5 ] = f94, 5 * SIZE 2564 STFD [C13] = f95, 5 * SIZE 2565 FMA f110 = ALPHA_I, f105, f110 2566 } 2567 { .mmf 2568 nop __LINE__ 2569 nop __LINE__ 2570 FMA f111 = ALPHA_I, f107, f111 2571 } 2572 ;; 2573 { .mmf 2574 STFD [C6 ] = f100, SIZE 2575 STFD [C14] = f101, SIZE 2576 FMA f116 = ALPHA_R, f112, f116 2577 } 2578 { .mmf 2579 nop __LINE__ 2580 nop __LINE__ 2581 FMA f117 = ALPHA_R, f114, f117 2582 } 2583 ;; 2584 { .mmf 2585 STFD [C6 ] = f102, SIZE 2586 STFD [C14] = f103, SIZE 2587 FMA f118 = ALPHA_I, f112, f118 2588 } 2589 { .mmf 2590 nop __LINE__ 2591 nop __LINE__ 2592 FMA f119 = ALPHA_I, f114, f119 2593 } 2594 ;; 2595 
{ .mmf 2596 STFD [C6 ] = f108, SIZE 2597 STFD [C14] = f109, SIZE 2598 FMA f124 = ALPHA_R, f113, f124 2599 } 2600 { .mmf 2601 nop __LINE__ 2602 nop __LINE__ 2603 FMA f125 = ALPHA_R, f115, f125 2604 } 2605 ;; 2606 { .mmf 2607 STFD [C6 ] = f110, 5 * SIZE 2608 STFD [C14] = f111, 5 * SIZE 2609 FMA f126 = ALPHA_I, f113, f126 2610 } 2611 { .mmf 2612 nop __LINE__ 2613 nop __LINE__ 2614 FMA f127 = ALPHA_I, f115, f127 2615 } 2616 ;; 2617 { .mmf 2618 STFD [C7 ] = f116, SIZE 2619 STFD [C15] = f117, SIZE 2620 FMA f32 = ALPHA_R, f120, f32 2621 } 2622 { .mmf 2623 nop __LINE__ 2624 nop __LINE__ 2625 FMA f33 = ALPHA_R, f122, f33 2626 } 2627 ;; 2628 { .mmf 2629 STFD [C7 ] = f118, SIZE 2630 STFD [C15] = f119, SIZE 2631 FMA f34 = ALPHA_I, f120, f34 2632 } 2633 { .mmf 2634 nop __LINE__ 2635 nop __LINE__ 2636 FMA f35 = ALPHA_I, f122, f35 2637 } 2638 ;; 2639 { .mmf 2640 STFD [C7 ] = f124, SIZE 2641 STFD [C15] = f125, SIZE 2642 FMA f36 = ALPHA_R, f121, f36 2643 } 2644 { .mmf 2645 nop __LINE__ 2646 nop __LINE__ 2647 FMA f37 = ALPHA_R, f123, f37 2648 } 2649 ;; 2650 { .mmf 2651 STFD [C7 ] = f126, 5 * SIZE 2652 STFD [C15] = f127, 5 * SIZE 2653 FMA f38 = ALPHA_I, f121, f38 2654 } 2655 { .mmf 2656 nop __LINE__ 2657 nop __LINE__ 2658 FMA f39 = ALPHA_I, f123, f39 2659 } 2660 ;; 2661 { .mmf 2662 STFD [C8 ] = f32, SIZE 2663 STFD [C16] = f33, SIZE 2664 mov f64 = f0 2665 } 2666 { .mmf 2667 nop __LINE__ 2668 nop __LINE__ 2669 mov f72 = f0 2670 } 2671 ;; 2672 { .mmf 2673 STFD [C8 ] = f34, SIZE 2674 STFD [C16] = f35, SIZE 2675 mov f80 = f0 2676 } 2677 { .mmf 2678 nop __LINE__ 2679 nop __LINE__ 2680 mov f88 = f0 2681 } 2682 ;; 2683 { .mmf 2684 STFD [C8 ] = f36, SIZE 2685 STFD [C16] = f37, SIZE 2686 mov f96 = f0 2687 } 2688 { .mmf 2689 nop __LINE__ 2690 nop __LINE__ 2691 mov f104 = f0 2692 } 2693 ;; 2694 { .mmf 2695 STFD [C8 ] = f38, 5 * SIZE 2696 STFD [C16] = f39, 5 * SIZE 2697 mov f112 = f0 2698 } 2699 { .mmf 2700 nop __LINE__ 2701 nop __LINE__ 2702 mov f120 = f0 2703 } 2704 ;; 2705 .align 32 2706 
/* .L030: setup for the 2-row remainder of the current 8-column panel. tbit.z tests bit 1 of M and control branches to .L040 when that bit is clear. Otherwise the operands of the first k-step are loaded (f48..f55 from B/BOFFSET, f32/f33 from AOFFSET under p7), accumulators f65, f73, ..., f121 are zeroed, PREA/PREB are set PREFETCHSIZE elements ahead, and the trip count ((K + 1) >> 1) - 1 is written to L and ar.lc. p12 is set when bit 0 of K + 1 is clear (K odd); p3 is initialised to true. */ 2707.L030: 2708 { .mib 2709 nop __LINE__ 2710 tbit.z p6, p7 = M, 1 2711 (p6) br.cond.dptk .L040 2712 } 2713 ;; 2714 { .mfi 2715 LDFPD f48, f49 = [B] 2716 mov f65 = f0 2717 nop __LINE__ 2718 } 2719 { .mfi 2720 adds BOFFSET = 2 * SIZE, B 2721 mov f73 = f0 2722 adds L = 1, K 2723 } 2724 ;; 2725 { .mfi 2726 LDFPD f50, f51 = [BOFFSET], 2 * SIZE 2727 mov f81 = f0 2728 tbit.z p12, p0 = L, 0 2729 } 2730 { .mfi 2731 (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 2732 mov f89 = f0 2733 shr L = L, 1 2734 } 2735 ;; 2736 { .mfi 2737 LDFPD f52, f53 = [BOFFSET], 2 * SIZE 2738 mov f97 = f0 2739 adds L = -1, L 2740 } 2741 { .mfi 2742 nop __LINE__ 2743 mov f105 = f0 2744 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 2745 } 2746 ;; 2747 { .mfi 2748 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 2749 mov f113 = f0 2750 mov ar.lc = L 2751 } 2752 { .mfi 2753 LDFPD f54, f55 = [BOFFSET], 2 * SIZE 2754 mov f121 = f0 2755 cmp.eq p3, p0 = r0, r0 2756 } 2757 ;; 2758 .align 32 2759 /* .L032: K loop for the 2-row by 8-column block. Each pass does one unconditional k-step (A values f32/f33 times B values f48..f55) and a second k-step predicated on p3 (A values f40/f41 times B values f56..f63). When p12 is set, p3 is recomputed as (L != 0), so the second k-step is skipped on the final pass. p4 = (L != 0) guards the operand loads for the next pass; p5 = (L == 0) marks the final pass, which preloads four C values each through C1 (f6, f7, f10, f11) and C2 (f12..f15) and rewinds both pointers by 3 * SIZE. */ 2760.L032: 2761 { .mfb 2762 lfetch.nt1 [PREA], 4 * SIZE 2763 FMA f64 = f32, f48, f64 // A1 * B1 2764 nop __LINE__ 2765 } 2766 { .mfi 2767 nop __LINE__ 2768 FMA f72 = f32, f49, f72 // A1 * B2 2769 (p12) cmp.ne p3, p0 = 0, L 2770 } 2771 ;; 2772 { .mfi 2773 lfetch.nt1 [PREB], 16 * SIZE 2774 FMA f80 = f32, f50, f80 // A1 * B3 2775 cmp.ne p4, p5 = 0, L 2776 } 2777 { .mfb 2778 nop __LINE__ 2779 FMA f88 = f32, f51, f88 // A1 * B4 2780 nop __LINE__ 2781 } 2782 ;; 2783 { .mfb 2784 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 2785 FMA f96 = f32, f52, f96 // A1 * B5 2786 nop __LINE__ 2787 } 2788 { .mfb 2789 nop __LINE__ 2790 FMA f104 = f32, f53, f104 // A1 * B6 2791 nop __LINE__ 2792 } 2793 ;; 2794 { .mfb 2795 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 2796 FMA f112 = f32, f54, f112 // A1 * B7 2797 nop __LINE__ 2798 } 2799 { .mfb 2800 nop __LINE__ 2801 FMA f120 = f32, f55, f120 // A1 * B8 2802 nop __LINE__ 2803 } 2804 ;; 2805 { .mfb 2806 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 2807 FMA f65 = f33, f48, f65 // A2 * B1 2808 nop 
__LINE__ 2809 } 2810 { .mfb 2811 nop __LINE__ 2812 FMA f73 = f33, f49, f73 // A2 * B2 2813 nop __LINE__ 2814 } 2815 ;; 2816 { .mfb 2817 (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE 2818 FMA f81 = f33, f50, f81 // A2 * B3 2819 nop __LINE__ 2820 } 2821 { .mfb 2822 nop __LINE__ 2823 FMA f89 = f33, f51, f89 // A2 * B4 2824 nop __LINE__ 2825 } 2826 ;; 2827 { .mfb 2828 (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE 2829 FMA f97 = f33, f52, f97 // A2 * B5 2830 nop __LINE__ 2831 } 2832 { .mfb 2833 nop __LINE__ 2834 FMA f105 = f33, f53, f105 // A2 * B6 2835 nop __LINE__ 2836 } 2837 ;; 2838 { .mfb 2839 nop __LINE__ 2840 FMA f113 = f33, f54, f113 // A2 * B7 2841 nop __LINE__ 2842 } 2843 { .mfb 2844 nop __LINE__ 2845 FMA f121 = f33, f55, f121 // A2 * B8 2846 nop __LINE__ 2847 } 2848 ;; 2849 { .mfb 2850 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 2851 (p3) FMA f64 = f40, f56, f64 // A1 * B1 2852 nop __LINE__ 2853 } 2854 { .mfb 2855 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 2856 (p3) FMA f72 = f40, f57, f72 // A1 * B2 2857 nop __LINE__ 2858 } 2859 ;; 2860 { .mfb 2861 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 2862 (p3) FMA f80 = f40, f58, f80 // A1 * B3 2863 nop __LINE__ 2864 } 2865 { .mfb 2866 nop __LINE__ 2867 (p3) FMA f88 = f40, f59, f88 // A1 * B4 2868 nop __LINE__ 2869 } 2870 ;; 2871 { .mfb 2872 (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE 2873 (p3) FMA f96 = f40, f60, f96 // A1 * B5 2874 nop __LINE__ 2875 } 2876 { .mfb 2877 nop __LINE__ 2878 (p3) FMA f104 = f40, f61, f104 // A1 * B6 2879 nop __LINE__ 2880 } 2881 ;; 2882 { .mfb 2883 (p5) LDFD f6 = [C1], SIZE 2884 (p3) FMA f112 = f40, f62, f112 // A1 * B7 2885 nop __LINE__ 2886 } 2887 { .mfb 2888 (p5) LDFD f12 = [C2], SIZE 2889 (p3) FMA f120 = f40, f63, f120 // A1 * B8 2890 nop __LINE__ 2891 } 2892 ;; 2893 { .mfb 2894 (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE 2895 (p3) FMA f65 = f41, f56, f65 // A2 * B1 2896 nop __LINE__ 2897 } 2898 /* NOTE(review): the next .mfb bundle lists only two instructions (no leading nop) and no stop (;;) separates it from the bundle that follows, unlike the neighbouring bundle pairs -- confirm this is intentional. */ { .mfb 2899 (p3) FMA f73 = f41, f57, f73 // A2 * B2 2900 nop __LINE__ 2901 } 2902 { .mfb 2903 (p5) LDFD f7 
= [C1], SIZE 2904 (p3) FMA f81 = f41, f58, f81 // A2 * B3 2905 nop __LINE__ 2906 } 2907 { .mfb 2908 (p5) LDFD f13 = [C2], SIZE 2909 (p3) FMA f89 = f41, f59, f89 // A2 * B4 2910 nop __LINE__ 2911 } 2912 ;; 2913 { .mfb 2914 (p5) LDFD f10 = [C1], SIZE 2915 (p3) FMA f97 = f41, f60, f97 // A2 * B5 2916 nop __LINE__ 2917 } 2918 { .mfb 2919 (p5) LDFD f14 = [C2], SIZE 2920 (p3) FMA f105 = f41, f61, f105 // A2 * B6 2921 nop __LINE__ 2922 } 2923 ;; 2924 { .mfi 2925 (p5) LDFD f11 = [C1], -3 * SIZE 2926 (p3) FMA f113 = f41, f62, f113 // A2 * B7 2927 adds L = -1, L 2928 } 2929 { .mfb 2930 (p5) LDFD f15 = [C2], -3 * SIZE 2931 (p3) FMA f121 = f41, f63, f121 // A2 * B8 2932 br.cloop.sptk.few .L032 2933 } 2934 ;; 2935 /* .L038: write-back for the 2-row by 8-column block. C values for C3..C8 are loaded here, interleaved with the stores (each pointer is rewound by 3 * SIZE after its four loads). For each accumulator the code computes FMA(ALPHA_R, acc, c) and FMA(ALPHA_I, acc, c') for two consecutive C elements; each of C1..C8 then receives four stores with a post-increment of SIZE. At the end f64, f72, ..., f120 are reset to f0. */ 2936.L038: 2937 { .mmf 2938 LDFD f16 = [C3], SIZE 2939 LDFD f20 = [C4], SIZE 2940 FMA f6 = ALPHA_R, f64, f6 2941 } 2942 { .mmf 2943 nop __LINE__ 2944 nop __LINE__ 2945 FMA f12 = ALPHA_R, f72, f12 2946 } 2947 ;; 2948 { .mmf 2949 LDFD f17 = [C3], SIZE 2950 LDFD f21 = [C4], SIZE 2951 FMA f7 = ALPHA_I, f64, f7 2952 } 2953 { .mmf 2954 nop __LINE__ 2955 nop __LINE__ 2956 FMA f13 = ALPHA_I, f72, f13 2957 } 2958 ;; 2959 { .mmf 2960 LDFD f18 = [C3], SIZE 2961 LDFD f22 = [C4], SIZE 2962 FMA f10 = ALPHA_R, f65, f10 2963 } 2964 { .mmf 2965 nop __LINE__ 2966 nop __LINE__ 2967 FMA f14 = ALPHA_R, f73, f14 2968 } 2969 ;; 2970 { .mmf 2971 LDFD f19 = [C3], - 3 * SIZE 2972 LDFD f23 = [C4], - 3 * SIZE 2973 FMA f11 = ALPHA_I, f65, f11 2974 } 2975 { .mmf 2976 nop __LINE__ 2977 nop __LINE__ 2978 FMA f15 = ALPHA_I, f73, f15 2979 } 2980 ;; 2981 { .mmf 2982 STFD [C1] = f6, SIZE 2983 STFD [C2] = f12, SIZE 2984 FMA f16 = ALPHA_R, f80, f16 2985 } 2986 { .mmf 2987 LDFD f24 = [C5], SIZE 2988 LDFD f28 = [C6], SIZE 2989 FMA f20 = ALPHA_R, f88, f20 2990 } 2991 ;; 2992 { .mmf 2993 STFD [C1] = f7, SIZE 2994 STFD [C2] = f13, SIZE 2995 FMA f17 = ALPHA_I, f80, f17 2996 } 2997 { .mmf 2998 LDFD f25 = [C5], SIZE 2999 LDFD f29 = [C6], SIZE 3000 FMA f21 = ALPHA_I, f88, f21 3001 } 3002 ;; 3003 { .mmf 3004 STFD 
[C1] = f10, SIZE 3005 STFD [C2] = f14, SIZE 3006 FMA f18 = ALPHA_R, f81, f18 3007 } 3008 { .mmf 3009 LDFD f26 = [C5], SIZE 3010 LDFD f30 = [C6], SIZE 3011 FMA f22 = ALPHA_R, f89, f22 3012 } 3013 ;; 3014 { .mmf 3015 STFD [C1] = f11, SIZE 3016 STFD [C2] = f15, SIZE 3017 FMA f19 = ALPHA_I, f81, f19 3018 } 3019 { .mmf 3020 LDFD f27 = [C5], - 3 * SIZE 3021 LDFD f31 = [C6], - 3 * SIZE 3022 FMA f23 = ALPHA_I, f89, f23 3023 } 3024 ;; 3025 { .mmf 3026 STFD [C3] = f16, SIZE 3027 STFD [C4] = f20, SIZE 3028 FMA f24 = ALPHA_R, f96, f24 3029 } 3030 { .mmf 3031 LDFD f32 = [C7], SIZE 3032 LDFD f36 = [C8], SIZE 3033 FMA f28 = ALPHA_R, f104, f28 3034 } 3035 ;; 3036 { .mmf 3037 STFD [C3] = f17, SIZE 3038 STFD [C4] = f21, SIZE 3039 FMA f25 = ALPHA_I, f96, f25 3040 } 3041 { .mmf 3042 LDFD f33 = [C7], SIZE 3043 LDFD f37 = [C8], SIZE 3044 FMA f29 = ALPHA_I, f104, f29 3045 } 3046 ;; 3047 { .mmf 3048 STFD [C3] = f18, SIZE 3049 STFD [C4] = f22, SIZE 3050 FMA f26 = ALPHA_R, f97, f26 3051 } 3052 { .mmf 3053 LDFD f34 = [C7], SIZE 3054 LDFD f38 = [C8], SIZE 3055 FMA f30 = ALPHA_R, f105, f30 3056 } 3057 ;; 3058 { .mmf 3059 STFD [C3] = f19, SIZE 3060 STFD [C4] = f23, SIZE 3061 FMA f27 = ALPHA_I, f97, f27 3062 } 3063 { .mmf 3064 LDFD f35 = [C7], - 3 * SIZE 3065 LDFD f39 = [C8], - 3 * SIZE 3066 FMA f31 = ALPHA_I, f105, f31 3067 } 3068 ;; 3069 { .mmf 3070 STFD [C5] = f24, SIZE 3071 STFD [C6] = f28, SIZE 3072 FMA f32 = ALPHA_R, f112, f32 3073 } 3074 { .mmf 3075 nop __LINE__ 3076 nop __LINE__ 3077 FMA f36 = ALPHA_R, f120, f36 3078 } 3079 ;; 3080 { .mmf 3081 STFD [C5] = f25, SIZE 3082 STFD [C6] = f29, SIZE 3083 FMA f33 = ALPHA_I, f112, f33 3084 } 3085 { .mmf 3086 nop __LINE__ 3087 nop __LINE__ 3088 FMA f37 = ALPHA_I, f120, f37 3089 } 3090 ;; 3091 { .mmf 3092 STFD [C5] = f26, SIZE 3093 STFD [C6] = f30, SIZE 3094 FMA f34 = ALPHA_R, f113, f34 3095 } 3096 { .mmf 3097 nop __LINE__ 3098 nop __LINE__ 3099 FMA f38 = ALPHA_R, f121, f38 3100 } 3101 ;; 3102 { .mmf 3103 STFD [C5] = f27, SIZE 3104 STFD [C6] = f31, 
SIZE 3105 FMA f35 = ALPHA_I, f113, f35 3106 } 3107 { .mmf 3108 nop __LINE__ 3109 nop __LINE__ 3110 FMA f39 = ALPHA_I, f121, f39 3111 } 3112 ;; 3113 { .mmf 3114 STFD [C7] = f32, SIZE 3115 STFD [C8] = f36, SIZE 3116 mov f64 = f0 3117 } 3118 { .mmf 3119 nop __LINE__ 3120 nop __LINE__ 3121 mov f72 = f0 3122 } 3123 ;; 3124 { .mmf 3125 STFD [C7] = f33, SIZE 3126 STFD [C8] = f37, SIZE 3127 mov f80 = f0 3128 } 3129 { .mmf 3130 nop __LINE__ 3131 nop __LINE__ 3132 mov f88 = f0 3133 } 3134 ;; 3135 { .mmf 3136 STFD [C7] = f34, SIZE 3137 STFD [C8] = f38, SIZE 3138 mov f96 = f0 3139 } 3140 { .mmf 3141 nop __LINE__ 3142 nop __LINE__ 3143 mov f104 = f0 3144 } 3145 ;; 3146 { .mmf 3147 STFD [C7] = f35, SIZE 3148 STFD [C8] = f39, SIZE 3149 mov f112 = f0 3150 } 3151 { .mmf 3152 nop __LINE__ 3153 nop __LINE__ 3154 mov f120 = f0 3155 } 3156 ;; 3157 .align 32 3158 /* .L040: setup for the single-row remainder of the current 8-column panel. tbit.z tests bit 0 of M and control branches to .L049 when that bit is clear. Otherwise the operands of the first k-step are loaded (f48..f55 from B/BOFFSET, f32 from AOFFSET), PREA/PREB are set PREFETCHSIZE elements ahead, and the trip count ((K + 1) >> 1) - 1 is written to L and ar.lc. p12 is set when bit 0 of K + 1 is clear (K odd); p3 is initialised to true. No accumulator is cleared here; on the fall-through path f64, f72, ..., f120 were reset by the moves just above. */ 3159.L040: 3160 { .mib 3161 nop __LINE__ 3162 tbit.z p6, p7 = M, 0 3163 (p6) br.cond.dptk .L049 3164 } 3165 ;; 3166 { .mmi 3167 LDFPD f48, f49 = [B] 3168 adds BOFFSET = 2 * SIZE, B 3169 adds L = 1, K 3170 } 3171 ;; 3172 { .mii 3173 LDFPD f50, f51 = [BOFFSET], 2 * SIZE 3174 tbit.z p12, p0 = L, 0 3175 shr L = L, 1 3176 } 3177 ;; 3178 { .mmi 3179 LDFPD f52, f53 = [BOFFSET], 2 * SIZE 3180 LDFD f32 = [AOFFSET], 1 * SIZE 3181 adds L = -1, L 3182 } 3183 ;; 3184 { .mmi 3185 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 3186 cmp.eq p3, p0 = r0, r0 3187 mov ar.lc = L 3188 } 3189 { .mmi 3190 LDFPD f54, f55 = [BOFFSET], 2 * SIZE 3191 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 3192 nop __LINE__ 3193 } 3194 ;; 3195 .align 32 3196 /* .L042: K loop for the 1-row by 8-column block. Each pass does one unconditional k-step (A value f32 times B values f48..f55 into f64, f72, ..., f120) and a second k-step predicated on p3 (A value f40 times B values f56..f63). When p12 is set, p3 is recomputed as (L != 0), so the second k-step is skipped on the final pass. p4 = (L != 0) guards the operand loads for the next pass; p5 = (L == 0) marks the final pass, which preloads two C values through each of C1..C6 (post-increment SIZE, then -SIZE) and the first value through C7 and C8. */ 3197.L042: 3198 { .mfb 3199 lfetch.nt1 [PREB], 16 * SIZE 3200 FMA f64 = f32, f48, f64 // A1 * B1 3201 nop __LINE__ 3202 } 3203 { .mfb 3204 (p12) cmp.ne p3, p0 = 0, L 3205 FMA f72 = f32, f49, f72 // A1 * B2 3206 nop __LINE__ 3207 } 3208 ;; 3209 { .mfi 3210 (p3) LDFD f40 = [AOFFSET], 1 * SIZE 3211 FMA f80 = f32, f50, f80 // A1 * B3 3212 cmp.ne p4, p5 = 0, L 3213 } 3214 { .mfb 3215 (p3) LDFPD f56, f57 = [BOFFSET], 
2 * SIZE 3216 FMA f88 = f32, f51, f88 // A1 * B4 3217 nop __LINE__ 3218 } 3219 ;; 3220 { .mfi 3221 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 3222 FMA f96 = f32, f52, f96 // A1 * B5 3223 nop __LINE__ 3224 } 3225 { .mmf 3226 (p5) LDFD f6 = [C1], SIZE 3227 (p5) LDFD f10 = [C2], SIZE 3228 FMA f104 = f32, f53, f104 // A1 * B6 3229 } 3230 ;; 3231 { .mfi 3232 (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE 3233 FMA f112 = f32, f54, f112 // A1 * B7 3234 nop __LINE__ 3235 } 3236 { .mmf 3237 (p5) LDFD f7 = [C1], -SIZE 3238 (p5) LDFD f11 = [C2], -SIZE 3239 FMA f120 = f32, f55, f120 // A1 * B8 3240 } 3241 ;; 3242 { .mmf 3243 (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE 3244 (p4) LDFD f32 = [AOFFSET], 1 * SIZE 3245 (p3) FMA f64 = f40, f56, f64 // A1 * B1 3246 } 3247 { .mmf 3248 (p5) LDFD f12 = [C3], SIZE 3249 (p5) LDFD f14 = [C4], SIZE 3250 (p3) FMA f72 = f40, f57, f72 // A1 * B2 3251 } 3252 ;; 3253 { .mfi 3254 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 3255 (p3) FMA f80 = f40, f58, f80 // A1 * B3 3256 nop __LINE__ 3257 } 3258 { .mmf 3259 (p5) LDFD f13 = [C3], -SIZE 3260 (p5) LDFD f15 = [C4], -SIZE 3261 (p3) FMA f88 = f40, f59, f88 // A1 * B4 3262 } 3263 ;; 3264 { .mfi 3265 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 3266 (p3) FMA f96 = f40, f60, f96 // A1 * B5 3267 nop __LINE__ 3268 } 3269 { .mmf 3270 (p5) LDFD f16 = [C5], SIZE 3271 (p5) LDFD f18 = [C6], SIZE 3272 (p3) FMA f104 = f40, f61, f104 // A1 * B6 3273 } 3274 ;; 3275 { .mfi 3276 (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE 3277 (p3) FMA f112 = f40, f62, f112 // A1 * B7 3278 adds L = -1, L 3279 } 3280 { .mmb 3281 (p5) LDFD f17 = [C5], -SIZE 3282 (p5) LDFD f19 = [C6], -SIZE 3283 nop __LINE__ 3284 } 3285 ;; 3286 { .mfb 3287 (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE 3288 (p3) FMA f120 = f40, f63, f120 // A1 * B8 3289 nop __LINE__ 3290 } 3291 { .mmb 3292 (p5) LDFD f20 = [C7], SIZE 3293 (p5) LDFD f22 = [C8], SIZE 3294 br.cloop.sptk.few .L042 3295 } 3296 ;; /* Write-back for the 1-row by 8-column block starts here: the second C value for C7 and C8 is loaded, then FMA(ALPHA_R, acc, c) and FMA(ALPHA_I, acc, c') are formed for the two elements of each column. */ 3297 { .mmf 3298 LDFD f21 = [C7], -SIZE 3299 LDFD f23 = [C8], -SIZE 3300 FMA f6 = 
ALPHA_R, f64, f6 3301 } 3302 { .mmf 3303 nop __LINE__ 3304 nop __LINE__ 3305 FMA f10 = ALPHA_R, f72, f10 3306 } 3307 ;; 3308 { .mmf 3309 nop __LINE__ 3310 nop __LINE__ 3311 FMA f7 = ALPHA_I, f64, f7 3312 } 3313 { .mmf 3314 nop __LINE__ 3315 nop __LINE__ 3316 FMA f11 = ALPHA_I, f72, f11 3317 } 3318 ;; 3319 { .mmf 3320 nop __LINE__ 3321 nop __LINE__ 3322 FMA f12 = ALPHA_R, f80, f12 3323 } 3324 { .mmf 3325 nop __LINE__ 3326 nop __LINE__ 3327 FMA f14 = ALPHA_R, f88, f14 3328 } 3329 ;; 3330 { .mmf 3331 nop __LINE__ 3332 nop __LINE__ 3333 FMA f13 = ALPHA_I, f80, f13 3334 } 3335 { .mmf 3336 nop __LINE__ 3337 nop __LINE__ 3338 FMA f15 = ALPHA_I, f88, f15 3339 } 3340 ;; 3341 { .mmf 3342 STFD [C1 ] = f6, SIZE 3343 STFD [C2 ] = f10, SIZE 3344 FMA f16 = ALPHA_R, f96, f16 3345 } 3346 { .mmf 3347 nop __LINE__ 3348 nop __LINE__ 3349 FMA f18 = ALPHA_R, f104, f18 3350 } 3351 ;; 3352 { .mmf 3353 STFD [C1 ] = f7, SIZE 3354 STFD [C2 ] = f11, SIZE 3355 FMA f17 = ALPHA_I, f96, f17 3356 } 3357 { .mmf 3358 nop __LINE__ 3359 nop __LINE__ 3360 FMA f19 = ALPHA_I, f104, f19 3361 } 3362 ;; 3363 { .mmf 3364 STFD [C3 ] = f12, SIZE 3365 STFD [C4 ] = f14, SIZE 3366 FMA f20 = ALPHA_R, f112, f20 3367 } 3368 { .mmf 3369 nop __LINE__ 3370 nop __LINE__ 3371 FMA f22 = ALPHA_R, f120, f22 3372 } 3373 ;; 3374 { .mmf 3375 STFD [C3 ] = f13, SIZE 3376 STFD [C4 ] = f15, SIZE 3377 FMA f21 = ALPHA_I, f112, f21 3378 } 3379 { .mmf 3380 nop __LINE__ 3381 nop __LINE__ 3382 FMA f23 = ALPHA_I, f120, f23 3383 } 3384 ;; 3385 { .mmi 3386 STFD [C5 ] = f16, SIZE 3387 STFD [C6 ] = f18, SIZE 3388 nop __LINE__ 3389 } 3390 ;; 3391 { .mmi 3392 STFD [C5 ] = f17, SIZE 3393 STFD [C6 ] = f19, SIZE 3394 nop __LINE__ 3395 } 3396 ;; 3397 { .mmi 3398 STFD [C7 ] = f20, SIZE 3399 STFD [C8 ] = f22, SIZE 3400 nop __LINE__ 3401 } 3402 ;; 3403 { .mmi 3404 STFD [C7 ] = f21, SIZE 3405 STFD [C8 ] = f23, SIZE 3406 nop __LINE__ 3407 } 3408 ;; 3409 .align 32 3410 3411.L049: 3412 { .mmi 3413 mov B = BOFFSET 3414 mov AOFFSET = A 3415 nop __LINE__ 
3416 } 3417 ;; /* Branch back to .L010 while 0 < J. */ 3418 { .mmb 3419 nop __LINE__ 3420 cmp.lt p6, p0 = 0, J 3421 (p6) br.cond.dptk .L010 3422 } 3423 ;; 3424 .align 32 3425 /* .L050: processed when bit 2 of N is set, otherwise control branches to .L090. Sets C1 = C, C2 = C + LDC, C3 = C + 2*LDC, C4 = C2 + 2*LDC, advances C by 4*LDC, resets AOFFSET to A, computes I = M >> 3 and skips to .L060 when I is 0. f64, f72, f80, f88, f65, f73, f81 and f89 are copied from f0 (the IA-64 constant-zero register) on the way. */ 3426.L050: 3427 { .mfi 3428 mov C1 = C 3429 mov f64 = f0 3430 tbit.z p6, p0 = N, 2 3431 } 3432 { .mfi 3433 add C2 = LDC, C 3434 mov f72 = f0 3435 shr I = M, 3 3436 } 3437 ;; 3438 { .mfi 3439 shladd C3 = LDC, 1, C 3440 mov f80 = f0 3441 nop __LINE__ 3442 } 3443 { .mfb 3444 mov AOFFSET = A 3445 mov f88 = f0 3446 (p6) br.cond.dpnt .L090 3447 } 3448 ;; 3449 { .mfi 3450 cmp.eq p6, p7 = 0, I 3451 mov f65 = f0 3452 nop __LINE__ 3453 } 3454 { .mfi 3455 shladd C4 = LDC, 1, C2 3456 mov f73 = f0 3457 nop __LINE__ 3458 } 3459 ;; 3460 { .mfi 3461 nop __LINE__ 3462 mov f81 = f0 3463 nop __LINE__ 3464 } 3465 { .mfb 3466 shladd C = LDC, 2, C 3467 mov f89 = f0 3468 (p6) br.cond.dpnt .L060 3469 } 3470 ;; 3471 .align 32 3472 /* .L052: per-tile setup for the loop at .L053. Preloads f48..f51 from B and f32..f39 from AOFFSET, sets the remaining accumulators up to f95 from f0 or r0, issues four CPREFETCH touches starting at C1 + CPREFETCHSIZE * SIZE (the first three stepping by LDC), and programs ar.lc with ((K + 1) >> 1) - 1. p12 is set when bit 0 of K + 1 is clear and p3 starts out true. */ 3473.L052: 3474 { .mfb 3475 LDFPD f48, f49 = [B] 3476 mov f66 = f0 3477 nop __LINE__ 3478 } 3479 { .mfb 3480 adds BOFFSET = 2 * SIZE, B 3481 mov f74 = f0 3482 nop __LINE__ 3483 } 3484 ;; 3485 { .mfi 3486 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 3487 mov f82 = f0 3488 nop __LINE__ 3489 } 3490 { .mfi 3491 setf.d f84 = r0 3492 mov f90 = f0 3493 nop __LINE__ 3494 } 3495 ;; 3496 { .mfi 3497 LDFPD f50, f51 = [BOFFSET], 2 * SIZE 3498 mov f67 = f0 3499 adds PREC = CPREFETCHSIZE * SIZE, C1 3500 } 3501 { .mfi 3502 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 3503 mov f75 = f0 3504 adds L = 1, K 3505 } 3506 ;; 3507 { .mfi 3508 LDFPD f36, f37 = [AOFFSET], 2 * SIZE 3509 mov f83 = f0 3510 tbit.z p12, p0 = L, 0 3511 } 3512 { .mfi 3513 setf.d f91 = r0 3514 mov f68 = f0 3515 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 3516 } 3517 ;; 3518 { .mfi 3519 CPREFETCH [PREC], LDC 3520 mov f76 = f0 3521 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET 3522 } 3523 { .mfi 3524 LDFPD f38, f39 = [AOFFSET], 2 * SIZE 3525 mov f92 = f0 3526 cmp.eq p3, p0 = r0, r0 3527 } 3528 ;; 3529 { .mfi 3530 CPREFETCH [PREC], LDC 3531 mov f69 = f0 3532 shr L = L, 1 3533 } 3534 { .mmf 3535 
setf.d f77 = r0 3536 setf.d f85 = r0 3537 mov f93 = f0 3538 } 3539 ;; 3540 { .mfi 3541 CPREFETCH [PREC], LDC 3542 mov f70 = f0 3543 adds L = -1, L 3544 } 3545 { .mmf 3546 setf.d f78 = r0 3547 setf.d f86 = r0 3548 mov f94 = f0 3549 } 3550 ;; 3551 { .mfi 3552 CPREFETCH [PREC] 3553 mov f71 = f0 3554 mov ar.lc = L 3555 } 3556 { .mmf 3557 setf.d f79 = r0 3558 setf.d f87 = r0 3559 mov f95 = f0 3560 } 3561 ;; 3562 .align 32 3563 /* .L053: counted loop (closed by br.cloop on ar.lc) that accumulates the products of f32..f39, loaded from AOFFSET, with f48..f51, loaded from BOFFSET, into f64..f95. When p3 is set a second step uses f40..f47 and f56..f59. p4 (L != 0) guards the reloads for the next pass and p5 (L == 0) guards the loads of C values on the final pass. C9..C12 are set to C1..C4 plus 4 * SIZE and L is decremented once per pass. */ 3564.L053: 3565 { .mfb 3566 lfetch.nt1 [PREA], 16 * SIZE 3567 FMA f64 = f32, f48, f64 // A1 * B1 3568 nop __LINE__ 3569 } 3570 { .mfi 3571 nop __LINE__ 3572 FMA f72 = f32, f49, f72 // A1 * B2 3573 (p12) cmp.ne p3, p0 = 0, L 3574 } 3575 ;; 3576 { .mfi 3577 lfetch.nt1 [PREB], 8 * SIZE 3578 FMA f80 = f32, f50, f80 // A1 * B3 3579 cmp.ne p4, p5 = 0, L 3580 } 3581 { .mfi 3582 nop __LINE__ 3583 FMA f88 = f32, f51, f88 // A1 * B4 3584 adds C9 = 4 * SIZE, C1 3585 } 3586 ;; 3587 { .mfi 3588 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 3589 FMA f65 = f33, f48, f65 // A2 * B1 3590 adds C10 = 4 * SIZE, C2 3591 } 3592 { .mfi 3593 nop __LINE__ 3594 FMA f73 = f33, f49, f73 // A2 * B2 3595 adds C11 = 4 * SIZE, C3 3596 } 3597 ;; 3598 { .mfi 3599 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 3600 FMA f81 = f33, f50, f81 // A2 * B3 3601 adds C12 = 4 * SIZE, C4 3602 } 3603 { .mfb 3604 nop __LINE__ 3605 FMA f89 = f33, f51, f89 // A2 * B4 3606 nop __LINE__ 3607 } 3608 ;; 3609 { .mfb 3610 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 3611 FMA f66 = f34, f48, f66 // A3 * B1 3612 nop __LINE__ 3613 } 3614 { .mfb 3615 nop __LINE__ 3616 FMA f74 = f34, f49, f74 // A3 * B2 3617 nop __LINE__ 3618 } 3619 ;; 3620 { .mfb 3621 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 3622 FMA f82 = f34, f50, f82 // A3 * B3 3623 nop __LINE__ 3624 } 3625 { .mfb 3626 nop __LINE__ 3627 FMA f90 = f34, f51, f90 // A3 * B4 3628 nop __LINE__ 3629 } 3630 ;; 3631 { .mfb 3632 (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE 3633 FMA f67 = f35, f48, f67 // A4 * B1 3634 nop __LINE__ 3635 } 3636 { .mfb 3637 nop __LINE__ 3638 FMA 
f75 = f35, f49, f75 // A4 * B2 3639 nop __LINE__ 3640 } 3641 ;; 3642 { .mfb 3643 (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE 3644 FMA f83 = f35, f50, f83 // A4 * B3 3645 nop __LINE__ 3646 } 3647 { .mfb 3648 nop __LINE__ 3649 FMA f91 = f35, f51, f91 // A4 * B4 3650 nop __LINE__ 3651 } 3652 ;; 3653 { .mfb 3654 nop __LINE__ 3655 FMA f68 = f36, f48, f68 // A5 * B1 3656 nop __LINE__ 3657 } 3658 { .mfb 3659 nop __LINE__ 3660 FMA f76 = f36, f49, f76 // A5 * B2 3661 nop __LINE__ 3662 } 3663 ;; 3664 { .mfb 3665 nop __LINE__ 3666 FMA f84 = f36, f50, f84 // A5 * B3 3667 nop __LINE__ 3668 } 3669 { .mfb 3670 nop __LINE__ 3671 FMA f92 = f36, f51, f92 // A5 * B4 3672 nop __LINE__ 3673 } 3674 ;; 3675 { .mfb 3676 nop __LINE__ 3677 FMA f69 = f37, f48, f69 // A6 * B1 3678 nop __LINE__ 3679 } 3680 { .mfb 3681 nop __LINE__ 3682 FMA f77 = f37, f49, f77 // A6 * B2 3683 nop __LINE__ 3684 } 3685 ;; 3686 { .mfb 3687 nop __LINE__ 3688 FMA f85 = f37, f50, f85 // A6 * B3 3689 nop __LINE__ 3690 } 3691 { .mfb 3692 nop __LINE__ 3693 FMA f93 = f37, f51, f93 // A6 * B4 3694 nop __LINE__ 3695 } 3696 ;; 3697 { .mfb 3698 nop __LINE__ 3699 FMA f70 = f38, f48, f70 // A7 * B1 3700 nop __LINE__ 3701 } 3702 { .mfb 3703 nop __LINE__ 3704 FMA f78 = f38, f49, f78 // A7 * B2 3705 nop __LINE__ 3706 } 3707 ;; 3708 { .mfb 3709 nop __LINE__ 3710 FMA f86 = f38, f50, f86 // A7 * B3 3711 nop __LINE__ 3712 } 3713 { .mfb 3714 nop __LINE__ 3715 FMA f94 = f38, f51, f94 // A7 * B4 3716 nop __LINE__ 3717 } 3718 ;; 3719 { .mfb 3720 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 3721 FMA f71 = f39, f48, f71 // A8 * B1 3722 nop __LINE__ 3723 } 3724 { .mfb 3725 nop __LINE__ 3726 FMA f79 = f39, f49, f79 // A8 * B2 3727 nop __LINE__ 3728 } 3729 ;; 3730 { .mfb 3731 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 3732 FMA f87 = f39, f50, f87 // A8 * B3 3733 nop __LINE__ 3734 } 3735 { .mfb 3736 nop __LINE__ 3737 FMA f95 = f39, f51, f95 // A8 * B4 3738 nop __LINE__ 3739 } 3740 ;; 3741 { .mfb 3742 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 3743 
(p3) FMA f64 = f40, f56, f64 // A1 * B1 3744 nop __LINE__ 3745 } 3746 { .mfb 3747 nop __LINE__ 3748 (p3) FMA f72 = f40, f57, f72 // A1 * B2 3749 nop __LINE__ 3750 } 3751 ;; 3752 { .mfb 3753 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 3754 (p3) FMA f80 = f40, f58, f80 // A1 * B3 3755 nop __LINE__ 3756 } 3757 { .mfb 3758 nop __LINE__ 3759 (p3) FMA f88 = f40, f59, f88 // A1 * B4 3760 nop __LINE__ 3761 } 3762 ;; 3763 { .mfb 3764 (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE 3765 (p3) FMA f65 = f41, f56, f65 // A2 * B1 3766 nop __LINE__ 3767 } 3768 { .mfb 3769 nop __LINE__ 3770 (p3) FMA f73 = f41, f57, f73 // A2 * B2 3771 nop __LINE__ 3772 } 3773 ;; 3774 { .mfb 3775 (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE 3776 (p3) FMA f81 = f41, f58, f81 // A2 * B3 3777 nop __LINE__ 3778 } 3779 { .mfb 3780 nop __LINE__ 3781 (p3) FMA f89 = f41, f59, f89 // A2 * B4 3782 nop __LINE__ 3783 } 3784 ;; 3785 { .mfb 3786 (p5) LDFD f6 = [C1 ], SIZE 3787 (p3) FMA f66 = f42, f56, f66 // A3 * B1 3788 nop __LINE__ 3789 } 3790 { .mfb 3791 (p5) LDFD f7 = [C9 ], SIZE 3792 (p3) FMA f74 = f42, f57, f74 // A3 * B2 3793 nop __LINE__ 3794 } 3795 ;; 3796 { .mfb 3797 (p5) LDFD f10 = [C1 ], SIZE 3798 (p3) FMA f82 = f42, f58, f82 // A3 * B3 3799 nop __LINE__ 3800 } 3801 { .mfb 3802 (p5) LDFD f11 = [C9 ], SIZE 3803 (p3) FMA f90 = f42, f59, f90 // A3 * B4 3804 nop __LINE__ 3805 } 3806 ;; 3807 { .mfb 3808 (p5) LDFD f12 = [C1 ], SIZE 3809 (p3) FMA f67 = f43, f56, f67 // A4 * B1 3810 nop __LINE__ 3811 } 3812 { .mfb 3813 (p5) LDFD f13 = [C9 ], SIZE 3814 (p3) FMA f75 = f43, f57, f75 // A4 * B2 3815 nop __LINE__ 3816 } 3817 ;; 3818 { .mfb 3819 (p5) LDFD f14 = [C1 ], 5 * SIZE 3820 (p3) FMA f83 = f43, f58, f83 // A4 * B3 3821 nop __LINE__ 3822 } 3823 { .mfb 3824 (p5) LDFD f15 = [C9 ], 5 * SIZE 3825 (p3) FMA f91 = f43, f59, f91 // A4 * B4 3826 nop __LINE__ 3827 } 3828 ;; 3829 { .mfb 3830 (p5) LDFD f16 = [C1 ], SIZE 3831 (p3) FMA f68 = f44, f56, f68 // A5 * B1 3832 nop __LINE__ 3833 } 3834 { .mfb 3835 (p5) LDFD f17 = [C9], 
SIZE 3836 (p3) FMA f76 = f44, f57, f76 // A5 * B2 3837 nop __LINE__ 3838 } 3839 ;; 3840 { .mfb 3841 (p5) LDFD f18 = [C1 ], SIZE 3842 (p3) FMA f84 = f44, f58, f84 // A5 * B3 3843 nop __LINE__ 3844 } 3845 { .mfb 3846 (p5) LDFD f19 = [C9], SIZE 3847 (p3) FMA f92 = f44, f59, f92 // A5 * B4 3848 nop __LINE__ 3849 } 3850 ;; 3851 { .mfb 3852 (p5) LDFD f20 = [C1 ], SIZE 3853 (p3) FMA f69 = f45, f56, f69 // A6 * B1 3854 nop __LINE__ 3855 } 3856 { .mfb 3857 (p5) LDFD f21 = [C9], SIZE 3858 (p3) FMA f77 = f45, f57, f77 // A6 * B2 3859 nop __LINE__ 3860 } 3861 ;; 3862 { .mfb 3863 (p5) LDFD f22 = [C1 ], -11 * SIZE 3864 (p3) FMA f85 = f45, f58, f85 // A6 * B3 3865 nop __LINE__ 3866 } 3867 { .mfb 3868 (p5) LDFD f23 = [C9 ], -11 * SIZE 3869 (p3) FMA f93 = f45, f59, f93 // A6 * B4 3870 nop __LINE__ 3871 } 3872 ;; 3873 { .mfb 3874 (p5) LDFD f24 = [C2 ], SIZE 3875 (p3) FMA f70 = f46, f56, f70 // A7 * B1 3876 nop __LINE__ 3877 } 3878 { .mfb 3879 (p5) LDFD f25 = [C10], SIZE 3880 (p3) FMA f78 = f46, f57, f78 // A7 * B2 3881 nop __LINE__ 3882 } 3883 ;; 3884 { .mfb 3885 (p5) LDFD f26 = [C2 ], SIZE 3886 (p3) FMA f86 = f46, f58, f86 // A7 * B3 3887 nop __LINE__ 3888 } 3889 { .mfb 3890 (p5) LDFD f27 = [C10], SIZE 3891 (p3) FMA f94 = f46, f59, f94 // A7 * B4 3892 nop __LINE__ 3893 } 3894 ;; 3895 { .mfb 3896 (p5) LDFD f28 = [C2 ], SIZE 3897 (p3) FMA f71 = f47, f56, f71 // A8 * B1 3898 nop __LINE__ 3899 } 3900 { .mfb 3901 (p5) LDFD f29 = [C10], SIZE 3902 (p3) FMA f79 = f47, f57, f79 // A8 * B2 3903 nop __LINE__ 3904 } 3905 ;; 3906 { .mfi 3907 (p5) LDFD f30 = [C2 ], 5 * SIZE 3908 (p3) FMA f87 = f47, f58, f87 // A8 * B3 3909 adds L = -1, L 3910 } 3911 { .mfb 3912 (p5) LDFD f31 = [C10], 5 * SIZE 3913 (p3) FMA f95 = f47, f59, f95 // A8 * B4 3914 br.cloop.sptk.few .L053 3915 } 3916 ;; 3917 .align 32 3918 /* .L058: write-back after the .L053 loop. The remaining C values are loaded through C2/C10, C3/C11 and C4/C12 while each accumulator is multiplied by ALPHA_R into one loaded C value and by ALPHA_I into the next, and the results are stored back with STFD through the same pointers. At the end f64, f72, f80, f88, f65, f73, f81 and f89 are reset from f0, I is decremented, and control returns to .L052 unless I was 1. */ 3919.L058: 3920 { .mmf 3921 LDFD f32 = [C2 ], SIZE 3922 LDFD f33 = [C10], SIZE 3923 FMA f6 = ALPHA_R, f64, f6 3924 } 3925 { .mmf 3926 nop __LINE__ 3927 nop __LINE__ 3928 FMA f7 = ALPHA_R, f66, f7 
3929 } 3930 ;; 3931 { .mmf 3932 LDFD f34 = [C2 ], SIZE 3933 LDFD f35 = [C10], SIZE 3934 FMA f10 = ALPHA_I, f64, f10 3935 } 3936 { .mmf 3937 nop __LINE__ 3938 nop __LINE__ 3939 FMA f11 = ALPHA_I, f66, f11 3940 } 3941 ;; 3942 { .mmf 3943 LDFD f36 = [C2 ], SIZE 3944 LDFD f37 = [C10], SIZE 3945 FMA f12 = ALPHA_R, f65, f12 3946 } 3947 { .mmf 3948 nop __LINE__ 3949 nop __LINE__ 3950 FMA f13 = ALPHA_R, f67, f13 3951 } 3952 ;; 3953 { .mmf 3954 LDFD f38 = [C2 ], - 11 * SIZE 3955 LDFD f39 = [C10], - 11 * SIZE 3956 FMA f14 = ALPHA_I, f65, f14 3957 } 3958 { .mmf 3959 nop __LINE__ 3960 nop __LINE__ 3961 FMA f15 = ALPHA_I, f67, f15 3962 } 3963 ;; 3964 { .mmf 3965 STFD [C1 ] = f6, SIZE 3966 STFD [C9 ] = f7, SIZE 3967 FMA f16 = ALPHA_R, f68, f16 3968 } 3969 { .mmf 3970 LDFD f48 = [C3 ], SIZE 3971 LDFD f49 = [C11], SIZE 3972 FMA f17 = ALPHA_R, f70, f17 3973 } 3974 ;; 3975 { .mmf 3976 STFD [C1 ] = f10, SIZE 3977 STFD [C9 ] = f11, SIZE 3978 FMA f18 = ALPHA_I, f68, f18 3979 } 3980 { .mmf 3981 LDFD f50 = [C3 ], SIZE 3982 LDFD f51 = [C11], SIZE 3983 FMA f19 = ALPHA_I, f70, f19 3984 } 3985 ;; 3986 { .mmf 3987 STFD [C1 ] = f12, SIZE 3988 STFD [C9 ] = f13, SIZE 3989 FMA f20 = ALPHA_R, f69, f20 3990 } 3991 { .mmf 3992 LDFD f52 = [C3 ], SIZE 3993 LDFD f53 = [C11], SIZE 3994 FMA f21 = ALPHA_R, f71, f21 3995 } 3996 ;; 3997 { .mmf 3998 STFD [C1 ] = f14, 5 * SIZE 3999 STFD [C9 ] = f15, 5 * SIZE 4000 FMA f22 = ALPHA_I, f69, f22 4001 } 4002 { .mmf 4003 LDFD f54 = [C3 ], 5 * SIZE 4004 LDFD f55 = [C11], 5 * SIZE 4005 FMA f23 = ALPHA_I, f71, f23 4006 } 4007 ;; 4008 { .mmf 4009 STFD [C1 ] = f16, SIZE 4010 STFD [C9 ] = f17, SIZE 4011 FMA f24 = ALPHA_R, f72, f24 4012 } 4013 { .mmf 4014 LDFD f40 = [C3 ], SIZE 4015 LDFD f41 = [C11], SIZE 4016 FMA f25 = ALPHA_R, f74, f25 4017 } 4018 ;; 4019 { .mmf 4020 STFD [C1 ] = f18, SIZE 4021 STFD [C9 ] = f19, SIZE 4022 FMA f26 = ALPHA_I, f72, f26 4023 } 4024 { .mmf 4025 LDFD f42 = [C3 ], SIZE 4026 LDFD f43 = [C11], SIZE 4027 FMA f27 = ALPHA_I, f74, f27 4028 } 4029 ;; 
4030 { .mmf 4031 STFD [C1 ] = f20, SIZE 4032 STFD [C9 ] = f21, SIZE 4033 FMA f28 = ALPHA_R, f73, f28 4034 } 4035 { .mmf 4036 LDFD f44 = [C3 ], SIZE 4037 LDFD f45 = [C11], SIZE 4038 FMA f29 = ALPHA_R, f75, f29 4039 } 4040 ;; 4041 { .mmf 4042 STFD [C1 ] = f22, 5 * SIZE 4043 STFD [C9 ] = f23, 5 * SIZE 4044 FMA f30 = ALPHA_I, f73, f30 4045 } 4046 { .mmf 4047 LDFD f46 = [C3 ], - 11 * SIZE 4048 LDFD f56 = [C11], - 11 * SIZE 4049 FMA f31 = ALPHA_I, f75, f31 4050 } 4051 ;; 4052 { .mmf 4053 STFD [C2 ] = f24, SIZE 4054 STFD [C10] = f25, SIZE 4055 FMA f32 = ALPHA_R, f76, f32 4056 } 4057 { .mmf 4058 LDFD f57 = [C4 ], SIZE 4059 LDFD f58 = [C12], SIZE 4060 FMA f33 = ALPHA_R, f78, f33 4061 } 4062 ;; 4063 { .mmf 4064 STFD [C2 ] = f26, SIZE 4065 STFD [C10] = f27, SIZE 4066 FMA f34 = ALPHA_I, f76, f34 4067 } 4068 { .mmf 4069 LDFD f59 = [C4 ], SIZE 4070 LDFD f60 = [C12], SIZE 4071 FMA f35 = ALPHA_I, f78, f35 4072 } 4073 ;; 4074 { .mmf 4075 STFD [C2 ] = f28, SIZE 4076 STFD [C10] = f29, SIZE 4077 FMA f36 = ALPHA_R, f77, f36 4078 } 4079 { .mmf 4080 LDFD f61 = [C4 ], SIZE 4081 LDFD f62 = [C12], SIZE 4082 FMA f37 = ALPHA_R, f79, f37 4083 } 4084 ;; 4085 { .mmf 4086 STFD [C2 ] = f30, 5 * SIZE 4087 STFD [C10] = f31, 5 * SIZE 4088 FMA f38 = ALPHA_I, f77, f38 4089 } 4090 { .mmf 4091 LDFD f63 = [C4 ], 5 * SIZE 4092 LDFD f47 = [C12], 5 * SIZE 4093 FMA f39 = ALPHA_I, f79, f39 4094 } 4095 ;; 4096 { .mmf 4097 STFD [C2 ] = f32, SIZE 4098 STFD [C10] = f33, SIZE 4099 FMA f48 = ALPHA_R, f80, f48 4100 } 4101 { .mmf 4102 LDFD f64 = [C4 ], SIZE 4103 LDFD f65 = [C12], SIZE 4104 FMA f49 = ALPHA_R, f82, f49 4105 } 4106 ;; 4107 { .mmf 4108 STFD [C2 ] = f34, SIZE 4109 STFD [C10] = f35, SIZE 4110 FMA f50 = ALPHA_I, f80, f50 4111 } 4112 { .mmf 4113 LDFD f6 = [C4 ], SIZE 4114 LDFD f7 = [C12], SIZE 4115 FMA f51 = ALPHA_I, f82, f51 4116 } 4117 ;; 4118 { .mmf 4119 STFD [C2 ] = f36, SIZE 4120 STFD [C10] = f37, SIZE 4121 FMA f52 = ALPHA_R, f81, f52 4122 } 4123 { .mmf 4124 LDFD f10 = [C4 ], SIZE 4125 LDFD f11 = [C12], 
SIZE 4126 FMA f53 = ALPHA_R, f83, f53 4127 } 4128 ;; 4129 { .mmf 4130 STFD [C2 ] = f38, 5 * SIZE 4131 STFD [C10] = f39, 5 * SIZE 4132 FMA f54 = ALPHA_I, f81, f54 4133 } 4134 { .mmf 4135 LDFD f12 = [C4 ], - 11 * SIZE 4136 LDFD f13 = [C12], - 11 * SIZE 4137 FMA f55 = ALPHA_I, f83, f55 4138 } 4139 ;; 4140 { .mmf 4141 STFD [C3 ] = f48, SIZE 4142 STFD [C11] = f49, SIZE 4143 FMA f40 = ALPHA_R, f84, f40 4144 } 4145 { .mmf 4146 nop __LINE__ 4147 nop __LINE__ 4148 FMA f41 = ALPHA_R, f86, f41 4149 } 4150 ;; 4151 { .mmf 4152 STFD [C3 ] = f50, SIZE 4153 STFD [C11] = f51, SIZE 4154 FMA f42 = ALPHA_I, f84, f42 4155 } 4156 { .mmf 4157 nop __LINE__ 4158 nop __LINE__ 4159 FMA f43 = ALPHA_I, f86, f43 4160 } 4161 ;; 4162 { .mmf 4163 STFD [C3 ] = f52, SIZE 4164 STFD [C11] = f53, SIZE 4165 FMA f44 = ALPHA_R, f85, f44 4166 } 4167 { .mmf 4168 nop __LINE__ 4169 nop __LINE__ 4170 FMA f45 = ALPHA_R, f87, f45 4171 } 4172 ;; 4173 { .mmf 4174 STFD [C3 ] = f54, 5 * SIZE 4175 STFD [C11] = f55, 5 * SIZE 4176 FMA f46 = ALPHA_I, f85, f46 4177 } 4178 { .mmf 4179 nop __LINE__ 4180 nop __LINE__ 4181 FMA f56 = ALPHA_I, f87, f56 4182 } 4183 ;; 4184 { .mmf 4185 STFD [C3 ] = f40, SIZE 4186 STFD [C11] = f41, SIZE 4187 FMA f57 = ALPHA_R, f88, f57 4188 } 4189 { .mmf 4190 nop __LINE__ 4191 nop __LINE__ 4192 FMA f58 = ALPHA_R, f90, f58 4193 } 4194 ;; 4195 { .mmf 4196 STFD [C3 ] = f42, SIZE 4197 STFD [C11] = f43, SIZE 4198 FMA f59 = ALPHA_I, f88, f59 4199 } 4200 { .mmf 4201 nop __LINE__ 4202 nop __LINE__ 4203 FMA f60 = ALPHA_I, f90, f60 4204 } 4205 ;; 4206 { .mmf 4207 STFD [C3 ] = f44, SIZE 4208 STFD [C11] = f45, SIZE 4209 FMA f61 = ALPHA_R, f89, f61 4210 } 4211 { .mmf 4212 nop __LINE__ 4213 nop __LINE__ 4214 FMA f62 = ALPHA_R, f91, f62 4215 } 4216 ;; 4217 { .mmf 4218 STFD [C3 ] = f46, 5 * SIZE 4219 STFD [C11] = f56, 5 * SIZE 4220 FMA f63 = ALPHA_I, f89, f63 4221 } 4222 { .mmf 4223 nop __LINE__ 4224 nop __LINE__ 4225 FMA f47 = ALPHA_I, f91, f47 4226 } 4227 ;; 4228 { .mmf 4229 STFD [C4 ] = f57, SIZE 4230 STFD 
[C12] = f58, SIZE 4231 FMA f64 = ALPHA_R, f92, f64 4232 } 4233 { .mmf 4234 nop __LINE__ 4235 nop __LINE__ 4236 FMA f65 = ALPHA_R, f94, f65 4237 } 4238 ;; 4239 { .mmf 4240 STFD [C4 ] = f59, SIZE 4241 STFD [C12] = f60, SIZE 4242 FMA f6 = ALPHA_I, f92, f6 4243 } 4244 { .mmf 4245 nop __LINE__ 4246 nop __LINE__ 4247 FMA f7 = ALPHA_I, f94, f7 4248 } 4249 ;; 4250 { .mmf 4251 STFD [C4 ] = f61, SIZE 4252 STFD [C12] = f62, SIZE 4253 FMA f10 = ALPHA_R, f93, f10 4254 } 4255 { .mmf 4256 nop __LINE__ 4257 nop __LINE__ 4258 FMA f11 = ALPHA_R, f95, f11 4259 } 4260 ;; 4261 { .mmf 4262 STFD [C4 ] = f63, 5 * SIZE 4263 STFD [C12] = f47, 5 * SIZE 4264 FMA f12 = ALPHA_I, f93, f12 4265 } 4266 { .mmf 4267 nop __LINE__ 4268 nop __LINE__ 4269 FMA f13 = ALPHA_I, f95, f13 4270 } 4271 ;; 4272 { .mmf 4273 STFD [C4 ] = f64, SIZE 4274 STFD [C12] = f65, SIZE 4275 mov f64 = f0 4276 } 4277 { .mmf 4278 cmp.ne p6, p0 = 1, I 4279 nop __LINE__ 4280 mov f72 = f0 4281 } 4282 ;; 4283 { .mmf 4284 STFD [C4 ] = f6, SIZE 4285 STFD [C12] = f7, SIZE 4286 mov f80 = f0 4287 } 4288 { .mmf 4289 nop __LINE__ 4290 nop __LINE__ 4291 mov f88 = f0 4292 } 4293 ;; 4294 { .mmf 4295 STFD [C4 ] = f10, SIZE 4296 STFD [C12] = f11, SIZE 4297 mov f65 = f0 4298 } 4299 { .mmf 4300 nop __LINE__ 4301 nop __LINE__ 4302 mov f73 = f0 4303 } 4304 ;; 4305 { .mmf 4306 STFD [C4 ] = f12, 5 * SIZE 4307 STFD [C12] = f13, 5 * SIZE 4308 mov f81 = f0 4309 } 4310 { .mfb 4311 adds I = -1, I 4312 mov f89 = f0 4313 (p6) br.cond.dptk .L052 4314 } 4315 ;; 4316 .align 32 4317 /* .L060: runs the .L062 loop only when bit 2 of M is set, otherwise branches to .L070. Clears f66 and f74 before the branch and f82, f90 (then f67, f75, f83, f91) after it, all from f0. Preloads f48, f49 from B and f32, f33 from AOFFSET, sets L = (K + 1) >> 1, and sets p12 when bit 0 of K + 1 is clear. */ 4318.L060: 4319 { .mfi 4320 nop __LINE__ 4321 mov f66 = f0 4322 tbit.z p6, p7 = M, 2 4323 } 4324 { .mfb 4325 nop __LINE__ 4326 mov f74 = f0 4327 (p6) br.cond.dptk .L070 4328 } 4329 ;; 4330 { .mfb 4331 LDFPD f48, f49 = [B] 4332 mov f82 = f0 4333 nop __LINE__ 4334 } 4335 { .mfi 4336 adds BOFFSET = 2 * SIZE, B 4337 mov f90 = f0 4338 adds L = 1, K 4339 } 4340 ;; 4341 { .mii 4342 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 4343 tbit.z p12, p0 = L, 0 4344 shr L = L, 1 4345 } 4346 ;; 4347 { .mfi 
4348 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 4349 mov f67 = f0 4350 adds L = -1, L 4351 } 4352 { .mfi 4353 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 4354 mov f75 = f0 4355 nop __LINE__ 4356 } 4357 ;; 4358 { .mfi 4359 LDFPD f50, f51 = [BOFFSET], 2 * SIZE 4360 mov f83 = f0 4361 mov ar.lc = L 4362 } 4363 { .mfi 4364 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 4365 mov f91 = f0 4366 cmp.eq p3, p0 = r0, r0 4367 } 4368 ;; 4369 .align 32 4370 /* .L062: counted loop (closed by br.cloop on ar.lc) that accumulates f32..f35 times f48..f51 into f64..f67, f72..f75, f80..f83 and f88..f91, with an optional second step on f40..f43 and f56..f59 when p3 is set. p4 (L != 0) guards the reloads for the next pass. On the final pass (p5, L == 0) C9..C12 are set to C1..C4 plus 4 * SIZE and the C values behind C1/C9 and C2/C10 are loaded. */ 4371.L062: 4372 { .mfi 4373 lfetch.nt1 [PREA], 8 * SIZE 4374 FMA f64 = f32, f48, f64 // A1 * B1 4375 cmp.ne p4, p5 = 0, L 4376 } 4377 { .mfi 4378 nop __LINE__ 4379 FMA f72 = f32, f49, f72 // A1 * B2 4380 (p12) cmp.ne p3, p0 = 0, L 4381 } 4382 ;; 4383 { .mfi 4384 lfetch.nt1 [PREB], 8 * SIZE 4385 FMA f80 = f32, f50, f80 // A1 * B3 4386 (p5) adds C9 = 4 * SIZE, C1 4387 } 4388 { .mfi 4389 nop __LINE__ 4390 FMA f88 = f32, f51, f88 // A1 * B4 4391 (p5) adds C10 = 4 * SIZE, C2 4392 } 4393 ;; 4394 { .mfi 4395 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 4396 FMA f65 = f33, f48, f65 // A2 * B1 4397 (p5) adds C11 = 4 * SIZE, C3 4398 } 4399 { .mfi 4400 nop __LINE__ 4401 FMA f73 = f33, f49, f73 // A2 * B2 4402 (p5) adds C12 = 4 * SIZE, C4 4403 } 4404 ;; 4405 { .mfb 4406 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 4407 FMA f81 = f33, f50, f81 // A2 * B3 4408 nop __LINE__ 4409 } 4410 { .mfb 4411 nop __LINE__ 4412 FMA f89 = f33, f51, f89 // A2 * B4 4413 nop __LINE__ 4414 } 4415 ;; 4416 { .mfb 4417 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 4418 FMA f66 = f34, f48, f66 // A3 * B1 4419 nop __LINE__ 4420 } 4421 { .mfb 4422 nop __LINE__ 4423 FMA f74 = f34, f49, f74 // A3 * B2 4424 nop __LINE__ 4425 } 4426 ;; 4427 { .mfb 4428 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 4429 FMA f82 = f34, f50, f82 // A3 * B3 4430 nop __LINE__ 4431 } 4432 { .mfb 4433 nop __LINE__ 4434 FMA f90 = f34, f51, f90 // A3 * B4 4435 nop __LINE__ 4436 } 4437 ;; 4438 { .mfb 4439 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 4440 FMA f67 = f35, f48, f67 // A4 * B1 4441 } 4442 { .mfb 4443 
(p5) LDFD f6 = [C1 ], SIZE 4444 FMA f75 = f35, f49, f75 // A4 * B2 4445 nop __LINE__ 4446 } 4447 4448 { .mfb 4449 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 4450 FMA f83 = f35, f50, f83 // A4 * B3 4451 nop __LINE__ 4452 } 4453 { .mfb 4454 (p5) LDFD f7 = [C9 ], SIZE 4455 FMA f91 = f35, f51, f91 // A4 * B4 4456 nop __LINE__ 4457 } 4458 ;; 4459 { .mfb 4460 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 4461 (p3) FMA f64 = f40, f56, f64 // A1 * B1 4462 nop __LINE__ 4463 } 4464 { .mfb 4465 (p5) LDFD f10 = [C1 ], SIZE 4466 (p3) FMA f72 = f40, f57, f72 // A1 * B2 4467 nop __LINE__ 4468 } 4469 ;; 4470 { .mfb 4471 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 4472 (p3) FMA f80 = f40, f58, f80 // A1 * B3 4473 nop __LINE__ 4474 } 4475 { .mfb 4476 (p5) LDFD f11 = [C9 ], SIZE 4477 (p3) FMA f88 = f40, f59, f88 // A1 * B4 4478 nop __LINE__ 4479 } 4480 ;; 4481 { .mfb 4482 (p5) LDFD f12 = [C1 ], SIZE 4483 (p3) FMA f65 = f41, f56, f65 // A2 * B1 4484 nop __LINE__ 4485 } 4486 { .mfb 4487 (p5) LDFD f13 = [C9], SIZE 4488 (p3) FMA f73 = f41, f57, f73 // A2 * B2 4489 nop __LINE__ 4490 } 4491 ;; 4492 { .mfb 4493 (p5) LDFD f14 = [C1 ], - 3 * SIZE 4494 (p3) FMA f81 = f41, f58, f81 // A2 * B3 4495 nop __LINE__ 4496 } 4497 { .mfb 4498 (p5) LDFD f15 = [C9], - 3 * SIZE 4499 (p3) FMA f89 = f41, f59, f89 // A2 * B4 4500 nop __LINE__ 4501 } 4502 ;; 4503 { .mfb 4504 (p5) LDFD f16 = [C2 ], SIZE 4505 (p3) FMA f66 = f42, f56, f66 // A3 * B1 4506 nop __LINE__ 4507 } 4508 { .mfb 4509 (p5) LDFD f17 = [C10], SIZE 4510 (p3) FMA f74 = f42, f57, f74 // A3 * B2 4511 nop __LINE__ 4512 } 4513 ;; 4514 { .mfb 4515 (p5) LDFD f18 = [C2 ], SIZE 4516 (p3) FMA f82 = f42, f58, f82 // A3 * B3 4517 nop __LINE__ 4518 } 4519 { .mfb 4520 (p5) LDFD f19 = [C10], SIZE 4521 (p3) FMA f90 = f42, f59, f90 // A3 * B4 4522 nop __LINE__ 4523 } 4524 ;; 4525 { .mfb 4526 (p5) LDFD f20 = [C2 ], SIZE 4527 (p3) FMA f67 = f43, f56, f67 // A4 * B1 4528 nop __LINE__ 4529 } 4530 { .mfb 4531 (p5) LDFD f21 = [C10], SIZE 4532 (p3) FMA f75 = f43, f57, f75 
// A4 * B2 4533 nop __LINE__ 4534 } 4535 ;; 4536 { .mfi 4537 (p5) LDFD f22 = [C2 ], -3 * SIZE 4538 (p3) FMA f83 = f43, f58, f83 // A4 * B3 4539 adds L = -1, L 4540 } 4541 { .mfb 4542 (p5) LDFD f23 = [C10], -3 * SIZE 4543 (p3) FMA f91 = f43, f59, f91 // A4 * B4 4544 br.cloop.sptk.few .L062 4545 } 4546 ;; 4547 { .mmf 4548 LDFD f24 = [C3 ], SIZE 4549 LDFD f25 = [C11], SIZE 4550 FMA f6 = ALPHA_R, f64, f6 4551 } 4552 { .mmf 4553 nop __LINE__ 4554 nop __LINE__ 4555 FMA f7 = ALPHA_R, f66, f7 4556 } 4557 ;; 4558 { .mmf 4559 LDFD f26 = [C3 ], SIZE 4560 LDFD f27 = [C11], SIZE 4561 FMA f10 = ALPHA_I, f64, f10 4562 } 4563 { .mmf 4564 nop __LINE__ 4565 nop __LINE__ 4566 FMA f11 = ALPHA_I, f66, f11 4567 } 4568 ;; 4569 { .mmf 4570 LDFD f28 = [C3 ], SIZE 4571 LDFD f29 = [C11], SIZE 4572 FMA f12 = ALPHA_R, f65, f12 4573 } 4574 { .mmf 4575 nop __LINE__ 4576 nop __LINE__ 4577 FMA f13 = ALPHA_R, f67, f13 4578 } 4579 ;; 4580 { .mmf 4581 LDFD f30 = [C3 ], - 3 * SIZE 4582 LDFD f31 = [C11], - 3 * SIZE 4583 FMA f14 = ALPHA_I, f65, f14 4584 } 4585 { .mmf 4586 nop __LINE__ 4587 nop __LINE__ 4588 FMA f15 = ALPHA_I, f67, f15 4589 } 4590 ;; 4591 { .mmf 4592 STFD [C1 ] = f6, SIZE 4593 STFD [C9 ] = f7, SIZE 4594 FMA f16 = ALPHA_R, f72, f16 4595 } 4596 { .mmf 4597 LDFD f32 = [C4 ], SIZE 4598 LDFD f33 = [C12], SIZE 4599 FMA f17 = ALPHA_R, f74, f17 4600 } 4601 ;; 4602 { .mmf 4603 STFD [C1 ] = f10, SIZE 4604 STFD [C9 ] = f11, SIZE 4605 FMA f18 = ALPHA_I, f72, f18 4606 } 4607 { .mmf 4608 LDFD f34 = [C4 ], SIZE 4609 LDFD f35 = [C12], SIZE 4610 FMA f19 = ALPHA_I, f74, f19 4611 } 4612 ;; 4613 { .mmf 4614 STFD [C1 ] = f12, SIZE 4615 STFD [C9 ] = f13, SIZE 4616 FMA f20 = ALPHA_R, f73, f20 4617 } 4618 { .mmf 4619 LDFD f36 = [C4 ], SIZE 4620 LDFD f37 = [C12], SIZE 4621 FMA f21 = ALPHA_R, f75, f21 4622 } 4623 ;; 4624 { .mmf 4625 STFD [C1 ] = f14, 5 * SIZE 4626 STFD [C9 ] = f15, 5 * SIZE 4627 FMA f22 = ALPHA_I, f73, f22 4628 } 4629 { .mmf 4630 LDFD f38 = [C4 ], - 3 * SIZE 4631 LDFD f39 = [C12], - 3 * SIZE 4632 
FMA f23 = ALPHA_I, f75, f23 4633 } 4634 ;; 4635 { .mmf 4636 STFD [C2 ] = f16, SIZE 4637 STFD [C10] = f17, SIZE 4638 FMA f24 = ALPHA_R, f80, f24 4639 } 4640 { .mmf 4641 nop __LINE__ 4642 nop __LINE__ 4643 FMA f25 = ALPHA_R, f82, f25 4644 } 4645 ;; 4646 { .mmf 4647 STFD [C2 ] = f18, SIZE 4648 STFD [C10] = f19, SIZE 4649 FMA f26 = ALPHA_I, f80, f26 4650 } 4651 { .mmf 4652 nop __LINE__ 4653 nop __LINE__ 4654 FMA f27 = ALPHA_I, f82, f27 4655 } 4656 ;; 4657 { .mmf 4658 STFD [C2 ] = f20, SIZE 4659 STFD [C10] = f21, SIZE 4660 FMA f28 = ALPHA_R, f81, f28 4661 } 4662 { .mmf 4663 nop __LINE__ 4664 nop __LINE__ 4665 FMA f29 = ALPHA_R, f83, f29 4666 } 4667 ;; 4668 { .mmf 4669 STFD [C2 ] = f22, 5 * SIZE 4670 STFD [C10] = f23, 5 * SIZE 4671 FMA f30 = ALPHA_I, f81, f30 4672 } 4673 { .mmf 4674 nop __LINE__ 4675 nop __LINE__ 4676 FMA f31 = ALPHA_I, f83, f31 4677 } 4678 ;; 4679 { .mmf 4680 STFD [C3 ] = f24, SIZE 4681 STFD [C11] = f25, SIZE 4682 FMA f32 = ALPHA_R, f88, f32 4683 } 4684 { .mmf 4685 nop __LINE__ 4686 nop __LINE__ 4687 FMA f33 = ALPHA_R, f90, f33 4688 } 4689 ;; 4690 { .mmf 4691 STFD [C3 ] = f26, SIZE 4692 STFD [C11] = f27, SIZE 4693 FMA f34 = ALPHA_I, f88, f34 4694 } 4695 { .mmf 4696 nop __LINE__ 4697 nop __LINE__ 4698 FMA f35 = ALPHA_I, f90, f35 4699 } 4700 ;; 4701 { .mmf 4702 STFD [C3 ] = f28, SIZE 4703 STFD [C11] = f29, SIZE 4704 FMA f36 = ALPHA_R, f89, f36 4705 } 4706 { .mmf 4707 nop __LINE__ 4708 nop __LINE__ 4709 FMA f37 = ALPHA_R, f91, f37 4710 } 4711 ;; 4712 { .mmf 4713 STFD [C3 ] = f30, 5 * SIZE 4714 STFD [C11] = f31, 5 * SIZE 4715 FMA f38 = ALPHA_I, f89, f38 4716 } 4717 { .mmf 4718 nop __LINE__ 4719 nop __LINE__ 4720 FMA f39 = ALPHA_I, f91, f39 4721 } 4722 ;; 4723 { .mmf 4724 STFD [C4 ] = f32, SIZE 4725 STFD [C12] = f33, SIZE 4726 mov f64 = f0 4727 } 4728 { .mmf 4729 nop __LINE__ 4730 nop __LINE__ 4731 mov f72 = f0 4732 } 4733 ;; 4734 { .mmf 4735 STFD [C4 ] = f34, SIZE 4736 STFD [C12] = f35, SIZE 4737 mov f80 = f0 4738 } 4739 { .mmf 4740 nop __LINE__ 4741 nop 
__LINE__ 4742 mov f88 = f0 4743 } 4744 ;; 4745 { .mmf 4746 STFD [C4 ] = f36, SIZE 4747 STFD [C12] = f37, SIZE 4748 mov f81 = f0 4749 } 4750 { .mmf 4751 nop __LINE__ 4752 nop __LINE__ 4753 mov f65 = f0 4754 } 4755 ;; 4756 { .mmf 4757 STFD [C4 ] = f38, 5 * SIZE 4758 STFD [C12] = f39, 5 * SIZE 4759 mov f89 = f0 4760 } 4761 { .mmf 4762 nop __LINE__ 4763 nop __LINE__ 4764 mov f73 = f0 4765 } 4766 ;; 4767 .align 32 4768 /* .L070: runs the .L072 loop only when bit 1 of M is set, otherwise branches to .L080. Preloads f48..f51 from B and f32, f33 from AOFFSET, sets p3, sets p12 when bit 0 of K + 1 is clear, and programs ar.lc with ((K + 1) >> 1) - 1. */ 4769.L070: 4770 { .mib 4771 nop __LINE__ 4772 tbit.z p6,p7 = M, 1 4773 (p6) br.cond.dptk .L080 4774 } 4775 ;; 4776 { .mmi 4777 LDFPD f48, f49 = [B] 4778 adds BOFFSET = 2 * SIZE, B 4779 adds L = 1, K 4780 } 4781 ;; 4782 { .mii 4783 cmp.eq p3, p0 = r0, r0 4784 tbit.z p12, p0 = L, 0 4785 shr L = L, 1 4786 } 4787 ;; 4788 { .mmi 4789 (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 4790 adds L = -1, L 4791 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 4792 } 4793 ;; 4794 { .mmi 4795 LDFPD f50, f51 = [BOFFSET], 2 * SIZE 4796 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 4797 mov ar.lc = L 4798 } 4799 ;; 4800 .align 32 4801 /* .L072: counted loop (closed by br.cloop on ar.lc) that accumulates f32, f33 times f48..f51 into f64/f65, f72/f73, f80/f81 and f88/f89, with an optional second step on f40, f41 and f56..f59 under p3. p4 (L != 0) guards the reloads for the next pass and on the final pass (p5) the C values behind C1 and C2 are loaded. */ 4802.L072: 4803 { .mfb 4804 lfetch.nt1 [PREA], 4 * SIZE 4805 FMA f64 = f32, f48, f64 // A1 * B1 4806 nop __LINE__ 4807 } 4808 { .mfi 4809 nop __LINE__ 4810 FMA f72 = f32, f49, f72 // A1 * B2 4811 (p12) cmp.ne p3, p0 = 0, L 4812 } 4813 ;; 4814 { .mfi 4815 lfetch.nt1 [PREB], 8 * SIZE 4816 FMA f80 = f32, f50, f80 // A1 * B3 4817 cmp.ne p4, p5 = 0, L 4818 } 4819 { .mfb 4820 nop __LINE__ 4821 FMA f88 = f32, f51, f88 // A1 * B4 4822 nop __LINE__ 4823 } 4824 ;; 4825 { .mfi 4826 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 4827 FMA f65 = f33, f48, f65 // A2 * B1 4828 } 4829 { .mfi 4830 nop __LINE__ 4831 FMA f73 = f33, f49, f73 // A2 * B2 4832 } 4833 ;; 4834 { .mfi 4835 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 4836 FMA f81 = f33, f50, f81 // A2 * B3 4837 } 4838 { .mmf 4839 (p5) LDFD f6 = [C1 ], SIZE 4840 (p5) LDFD f12 = [C2 ], SIZE 4841 FMA f89 = f33, f51, f89 // A2 * B4 4842 } 4843 ;; 4844 { .mfb 4845 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 4846 (p3) FMA 
f64 = f40, f56, f64 // A1 * B1 4847 nop __LINE__ 4848 } 4849 { .mmf 4850 (p5) LDFD f7 = [C1 ], SIZE 4851 (p5) LDFD f13 = [C2 ], SIZE 4852 (p3) FMA f72 = f40, f57, f72 // A1 * B2 4853 } 4854 ;; 4855 { .mfb 4856 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 4857 (p3) FMA f80 = f40, f58, f80 // A1 * B3 4858 nop __LINE__ 4859 } 4860 { .mmf 4861 (p5) LDFD f10 = [C1 ], SIZE 4862 (p5) LDFD f14 = [C2 ], SIZE 4863 (p3) FMA f88 = f40, f59, f88 // A1 * B4 4864 } 4865 ;; 4866 { .mfb 4867 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 4868 (p3) FMA f65 = f41, f56, f65 // A2 * B1 4869 nop __LINE__ 4870 } 4871 { .mfb 4872 (p5) LDFD f11 = [C1 ], - 3 * SIZE 4873 (p3) FMA f73 = f41, f57, f73 // A2 * B2 4874 nop __LINE__ 4875 } 4876 ;; 4877 { .mfi 4878 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 4879 (p3) FMA f81 = f41, f58, f81 // A2 * B3 4880 adds L = -1, L 4881 } 4882 { .mfb 4883 (p5) LDFD f15 = [C2 ], - 3 * SIZE 4884 (p3) FMA f89 = f41, f59, f89 // A2 * B4 4885 br.cloop.sptk.few .L072 4886 } 4887 ;; 4888 { .mmf 4889 LDFD f16 = [C3], SIZE 4890 LDFD f20 = [C4], SIZE 4891 FMA f6 = ALPHA_R, f64, f6 4892 } 4893 { .mmf 4894 nop __LINE__ 4895 nop __LINE__ 4896 FMA f12 = ALPHA_R, f72, f12 4897 } 4898 ;; 4899 { .mmf 4900 LDFD f17 = [C3], SIZE 4901 LDFD f21 = [C4], SIZE 4902 FMA f7 = ALPHA_I, f64, f7 4903 } 4904 { .mmf 4905 nop __LINE__ 4906 nop __LINE__ 4907 FMA f13 = ALPHA_I, f72, f13 4908 } 4909 ;; 4910 { .mmf 4911 LDFD f18 = [C3], SIZE 4912 LDFD f22 = [C4], SIZE 4913 FMA f10 = ALPHA_R, f65, f10 4914 } 4915 { .mmf 4916 nop __LINE__ 4917 nop __LINE__ 4918 FMA f14 = ALPHA_R, f73, f14 4919 } 4920 ;; 4921 { .mmf 4922 LDFD f19 = [C3], - 3 * SIZE 4923 LDFD f23 = [C4], - 3 * SIZE 4924 FMA f11 = ALPHA_I, f65, f11 4925 } 4926 { .mmf 4927 nop __LINE__ 4928 nop __LINE__ 4929 FMA f15 = ALPHA_I, f73, f15 4930 } 4931 ;; 4932 { .mmf 4933 STFD [C1] = f6, SIZE 4934 STFD [C2] = f12, SIZE 4935 FMA f16 = ALPHA_R, f80, f16 4936 } 4937 { .mmf 4938 nop __LINE__ 4939 nop __LINE__ 4940 FMA f20 = ALPHA_R, f88, f20 4941 } 
4942 ;; 4943 { .mmf 4944 STFD [C1] = f7, SIZE 4945 STFD [C2] = f13, SIZE 4946 FMA f17 = ALPHA_I, f80, f17 4947 } 4948 { .mmf 4949 nop __LINE__ 4950 nop __LINE__ 4951 FMA f21 = ALPHA_I, f88, f21 4952 } 4953 ;; 4954 { .mmf 4955 STFD [C1] = f10, SIZE 4956 STFD [C2] = f14, SIZE 4957 FMA f18 = ALPHA_R, f81, f18 4958 } 4959 { .mmf 4960 nop __LINE__ 4961 nop __LINE__ 4962 FMA f22 = ALPHA_R, f89, f22 4963 } 4964 ;; 4965 { .mmf 4966 STFD [C1] = f11, SIZE 4967 STFD [C2] = f15, SIZE 4968 FMA f19 = ALPHA_I, f81, f19 4969 } 4970 { .mmf 4971 nop __LINE__ 4972 nop __LINE__ 4973 FMA f23 = ALPHA_I, f89, f23 4974 } 4975 ;; 4976 { .mmf 4977 STFD [C3] = f16, SIZE 4978 STFD [C4] = f20, SIZE 4979 mov f64 = f0 4980 } 4981 ;; 4982 { .mmf 4983 STFD [C3] = f17, SIZE 4984 STFD [C4] = f21, SIZE 4985 mov f72 = f0 4986 } 4987 ;; 4988 { .mmf 4989 STFD [C3] = f18, SIZE 4990 STFD [C4] = f22, SIZE 4991 mov f80 = f0 4992 } 4993 ;; 4994 { .mmf 4995 STFD [C3] = f19, SIZE 4996 STFD [C4] = f23, SIZE 4997 mov f88 = f0 4998 } 4999 ;; 5000 .align 32 5001 /* .L080: runs the .L082 loop only when bit 0 of M is set, otherwise branches to .L089. Preloads f48..f51 from B and f32 from AOFFSET, sets p3, sets p12 when bit 0 of K + 1 is clear, and programs ar.lc with ((K + 1) >> 1) - 1. */ 5002.L080: 5003 { .mib 5004 nop __LINE__ 5005 tbit.z p6,p7 = M, 0 5006 (p6) br.cond.dptk .L089 5007 } 5008 ;; 5009 { .mmi 5010 LDFPD f48, f49 = [B] 5011 adds BOFFSET = 2 * SIZE, B 5012 adds L = 1, K 5013 } 5014 ;; 5015 { .mii 5016 LDFD f32 = [AOFFSET], 1 * SIZE 5017 tbit.z p12, p0 = L, 0 5018 shr L = L, 1 5019 } 5020 ;; 5021 { .mmi 5022 nop __LINE__ 5023 nop __LINE__ 5024 adds L = -1, L 5025 } 5026 ;; 5027 { .mmi 5028 LDFPD f50, f51 = [BOFFSET], 2 * SIZE 5029 cmp.eq p3, p0 = r0, r0 5030 mov ar.lc = L 5031 } 5032 ;; 5033 .align 32 5034 /* .L082: counted loop (closed by br.cloop on ar.lc) that accumulates f32 times f48..f51 into f64, f72, f80 and f88, with an optional second step on f40 and f56..f59 under p3. p4 (L != 0) guards the reloads for the next pass and on the final pass (p5) the two C values behind each of C1 and C2 are loaded. */ 5035.L082: 5036 { .mfb 5037 cmp.ne p4, p5 = 0, L 5038 FMA f64 = f32, f48, f64 // A1 * B1 5039 nop __LINE__ 5040 } 5041 { .mfi 5042 (p12) cmp.ne p3, p0 = 0, L 5043 FMA f72 = f32, f49, f72 // A1 * B2 5044 nop __LINE__ 5045 } 5046 ;; 5047 { .mfb 5048 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 5049 FMA f80 = f32, f50, f80 // A1 * B3 5050 nop __LINE__ 5051 } 5052 { .mfb 5053 (p3) LDFD f40 = [AOFFSET], 1 * SIZE 5054 FMA f88 = f32, 
f51, f88 // A1 * B4 5055 nop __LINE__ 5056 } 5057 ;; 5058 { .mfb 5059 (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE 5060 (p3) FMA f64 = f40, f56, f64 // A1 * B1 5061 nop __LINE__ 5062 } 5063 { .mmf 5064 (p5) LDFD f6 = [C1], SIZE 5065 (p5) LDFD f10 = [C2], SIZE 5066 (p3) FMA f72 = f40, f57, f72 // A1 * B2 5067 } 5068 ;; 5069 { .mmf 5070 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 5071 (p4) LDFD f32 = [AOFFSET], 1 * SIZE 5072 (p3) FMA f80 = f40, f58, f80 // A1 * B3 5073 } 5074 { .mmf 5075 (p5) LDFD f7 = [C1], -SIZE 5076 (p5) LDFD f11 = [C2], -SIZE 5077 (p3) FMA f88 = f40, f59, f88 // A1 * B4 5078 } 5079 ;; 5080 { .mib 5081 (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE 5082 adds L = -1, L 5083 br.cloop.sptk.few .L082 5084 } 5085 ;; /* Write-back after the .L082 loop: the two C values behind each of C3 and C4 are loaded, f64, f72, f80 and f88 are multiplied by ALPHA_R into the first value and by ALPHA_I into the second value of their column, and the pairs are stored back through C1..C4. */ 5086 { .mmf 5087 LDFD f12 = [C3], SIZE 5088 LDFD f14 = [C4], SIZE 5089 FMA f6 = ALPHA_R, f64, f6 5090 } 5091 { .mmf 5092 nop __LINE__ 5093 nop __LINE__ 5094 FMA f10 = ALPHA_R, f72, f10 5095 } 5096 ;; 5097 { .mmf 5098 LDFD f13 = [C3], -SIZE 5099 LDFD f15 = [C4], -SIZE 5100 FMA f7 = ALPHA_I, f64, f7 5101 } 5102 { .mmf 5103 nop __LINE__ 5104 nop __LINE__ 5105 FMA f11 = ALPHA_I, f72, f11 5106 } 5107 ;; 5108 { .mmf 5109 nop __LINE__ 5110 nop __LINE__ 5111 FMA f12 = ALPHA_R, f80, f12 5112 } 5113 { .mmf 5114 nop __LINE__ 5115 nop __LINE__ 5116 FMA f14 = ALPHA_R, f88, f14 5117 } 5118 ;; 5119 { .mmf 5120 nop __LINE__ 5121 nop __LINE__ 5122 FMA f13 = ALPHA_I, f80, f13 5123 } 5124 { .mmf 5125 nop __LINE__ 5126 nop __LINE__ 5127 FMA f15 = ALPHA_I, f88, f15 5128 } 5129 ;; 5130 { .mmi 5131 STFD [C1] = f6, SIZE 5132 STFD [C2] = f10, SIZE 5133 nop __LINE__ 5134 } 5135 ;; 5136 { .mmi 5137 STFD [C1] = f7, SIZE 5138 STFD [C2] = f11, SIZE 5139 nop __LINE__ 5140 } 5141 ;; 5142 { .mmi 5143 STFD [C3] = f12, SIZE 5144 STFD [C4] = f14, SIZE 5145 nop __LINE__ 5146 } 5147 ;; 5148 { .mmi 5149 STFD [C3] = f13, SIZE 5150 STFD [C4] = f15, SIZE 5151 nop __LINE__ 5152 } 5153 ;; 5154 .align 32 5155 /* .L089: B is advanced to BOFFSET and AOFFSET is reset to A, then execution falls through to .L090. */ 5156.L089: 5157 { .mmi 5158 mov B = BOFFSET 5159 mov AOFFSET = A 5160 nop __LINE__ 
5161 } 5162 ;; 5163 .align 16 5164 /* .L090: two-column section, skipped (branch to .L130) when bit 1 of N is clear. Sets C1 = C, C2 = C + LDC, I = M >> 3, AOFFSET = A and clears accumulators; if not skipped, C advances by 2 * LDC and control goes to .L100 when I == 0. */ 5165.L090: 5166 { .mfi 5167 mov C1 = C 5168 mov f64 = f0 5169 tbit.z p6, p0 = N, 1 5170 } 5171 { .mfi 5172 add C2 = LDC, C 5173 mov f72 = f0 5174 shr I = M, 3 5175 } 5176 ;; 5177 { .mfi 5178 setf.d f66 = r0 5179 mov f65 = f0 5180 nop __LINE__ 5181 } 5182 { .mfb 5183 mov AOFFSET = A 5184 mov f73 = f0 5185 (p6) br.cond.dpnt .L130 5186 } 5187 ;; 5188 { .mfi 5189 nop __LINE__ 5190 mov f67 = f0 5191 shladd C = LDC, 1, C 5192 } 5193 { .mfb 5194 cmp.eq p6, p7 = 0, I 5195 mov f74 = f0 5196 (p6) br.cond.dpnt .L100 5197 } 5198 ;; 5199 .align 32 5200 /* .L092: setup for one pass of the I loop. Preloads two B values (f48, f49) and eight A values (f32..f39), clears the remaining accumulators up to f79, issues CPREFETCH through PREC, sets PREA/PREB, and computes ar.lc = ((K + 1) >> 1) - 1 with p12/p3 as in the other tiles. */ 5201.L092: 5202 { .mfb 5203 LDFPD f48, f49 = [B] 5204 mov f68 = f0 5205 nop __LINE__ 5206 } 5207 { .mfb 5208 adds BOFFSET = 2 * SIZE, B 5209 mov f79 = f0 5210 nop __LINE__ 5211 } 5212 ;; 5213 { .mfi 5214 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 5215 mov f75 = f0 5216 nop __LINE__ 5217 } 5218 ;; 5219 { .mfi 5220 adds PREC = CPREFETCHSIZE * SIZE, C1 5221 mov f76 = f0 5222 adds L = 1, K 5223 } 5224 ;; 5225 { .mfi 5226 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 5227 mov f69 = f0 5228 tbit.z p12, p0 = L, 0 5229 } 5230 { .mfi 5231 cmp.eq p3, p0 = r0, r0 5232 mov f77 = f0 5233 shr L = L, 1 5234 } 5235 ;; 5236 { .mfi 5237 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 5238 adds L = -1, L 5239 } 5240 { .mmf 5241 LDFPD f36, f37 = [AOFFSET], 2 * SIZE 5242 CPREFETCH [PREC], LDC 5243 mov f70 = f0 5244 } 5245 ;; 5246 { .mfi 5247 LDFPD f38, f39 = [AOFFSET], 2 * SIZE 5248 mov f78 = f0 5249 mov ar.lc = L 5250 } 5251 { .mfi 5252 CPREFETCH [PREC] 5253 mov f71 = f0 5254 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 5255 } 5256 ;; 5257 .align 32 5258 /* .L093: K loop, two steps per pass. f64..f71 accumulate A1..A8 with B1 and f72..f79 accumulate A1..A8 with B2. C9 and C10 are set to C1 and C2 plus 4 * SIZE. */ 5259.L093: 5260/* 1 */ 5261 { .mfi 5262 lfetch.nt1 [PREA], 16 * SIZE 5263 FMA f64 = f32, f48, f64 // A1 * B1 5264 cmp.ne p4, p5 = 0, L 5265 } 5266 { .mfi 5267 nop __LINE__ 5268 FMA f72 = f32, f49, f72 // A1 * B2 5269 (p12) cmp.ne p3, p0 = 0, L 5270 } 5271 ;; 5272 { .mfi 5273 lfetch.nt1 [PREB], 4 * SIZE 5274 FMA f65 = f33, f48, f65 // A2 * B1 5275 adds C9 = 4 * SIZE, C1 5276 } 
/* Continuation of the .L093 loop body: first-step FMAs use f32..f39 with f48/f49; p3 gates the loads of f40..f47 and f56/f57 for the second step; p4 gates reloads for the next pass; p5 gates the early C loads through C1 and C9. NOTE(review): C11 and C12 are written here but are not read anywhere in this two-column section as far as this chunk shows - confirm whether they are needed. */ 5277 { .mfi 5278 nop __LINE__ 5279 FMA f73 = f33, f49, f73 // A2 * B2 5280 adds C10 = 4 * SIZE, C2 5281 } 5282 ;; 5283 { .mfi 5284 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 5285 FMA f66 = f34, f48, f66 // A3 * B1 5286 adds C11 = 4 * SIZE, C3 5287 } 5288 { .mfi 5289 nop __LINE__ 5290 FMA f74 = f34, f49, f74 // A3 * B2 5291 adds C12 = 4 * SIZE, C4 5292 } 5293 ;; 5294 { .mfb 5295 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 5296 FMA f67 = f35, f48, f67 // A4 * B1 5297 nop __LINE__ 5298 } 5299 { .mfb 5300 (p5) LDFD f6 = [C1 ], SIZE 5301 FMA f75 = f35, f49, f75 // A4 * B2 5302 nop __LINE__ 5303 } 5304 ;; 5305 { .mfb 5306 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 5307 FMA f68 = f36, f48, f68 // A5 * B1 5308 nop __LINE__ 5309 } 5310 { .mfb 5311 (p5) LDFD f7 = [C9 ], SIZE 5312 FMA f76 = f36, f49, f76 // A5 * B2 5313 nop __LINE__ 5314 } 5315 ;; 5316 { .mfb 5317 (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE 5318 FMA f69 = f37, f48, f69 // A6 * B1 5319 nop __LINE__ 5320 } 5321 { .mfb 5322 (p5) LDFD f10 = [C1 ], SIZE 5323 FMA f77 = f37, f49, f77 // A6 * B2 5324 nop __LINE__ 5325 } 5326 ;; 5327 { .mfb 5328 (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE 5329 FMA f70 = f38, f48, f70 // A7 * B1 5330 nop __LINE__ 5331 } 5332 { .mfb 5333 (p5) LDFD f11 = [C9 ], SIZE 5334 FMA f78 = f38, f49, f78 // A7 * B2 5335 nop __LINE__ 5336 } 5337 ;; 5338 { .mfb 5339 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 5340 FMA f71 = f39, f48, f71 // A8 * B1 5341 nop __LINE__ 5342 } 5343 { .mfb 5344 (p5) LDFD f12 = [C1 ], SIZE 5345 FMA f79 = f39, f49, f79 // A8 * B2 5346 nop __LINE__ 5347 } 5348 ;; 5349 { .mfb 5350 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 5351 (p3) FMA f64 = f40, f56, f64 // A1 * B1 5352 nop __LINE__ 5353 } 5354 { .mfb 5355 (p5) LDFD f13 = [C9 ], SIZE 5356 (p3) FMA f72 = f40, f57, f72 // A1 * B2 5357 nop __LINE__ 5358 } 5359 ;; 5360 { .mfb 5361 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 5362 (p3) FMA f65 = f41, f56, f65 // A2 * B1 5363 nop __LINE__ 5364 } 5365 { .mfb 5366 (p5) LDFD f14 = [C1 ], 5 * 
SIZE 5367 (p3) FMA f73 = f41, f57, f73 // A2 * B2 5368 nop __LINE__ 5369 } 5370 ;; /* Rest of the .L093 pass: the remaining p3-gated second-step FMAs, and p5-gated loads of f16..f23 through C1/C9; the last of those loads rewinds each pointer by 11 * SIZE. */ 5371 { .mfb 5372 (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE 5373 (p3) FMA f66 = f42, f56, f66 // A3 * B1 5374 nop __LINE__ 5375 } 5376 { .mfb 5377 (p5) LDFD f15 = [C9 ], 5 * SIZE 5378 (p3) FMA f74 = f42, f57, f74 // A3 * B2 5379 nop __LINE__ 5380 } 5381 ;; 5382 { .mfb 5383 (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE 5384 (p3) FMA f67 = f43, f56, f67 // A4 * B1 5385 nop __LINE__ 5386 } 5387 { .mfb 5388 nop __LINE__ 5389 (p3) FMA f75 = f43, f57, f75 // A4 * B2 5390 nop __LINE__ 5391 } 5392 ;; 5393 { .mfb 5394 (p5) LDFD f16 = [C1 ], SIZE 5395 (p3) FMA f68 = f44, f56, f68 // A5 * B1 5396 nop __LINE__ 5397 } 5398 { .mfb 5399 (p5) LDFD f17 = [C9 ], SIZE 5400 (p3) FMA f76 = f44, f57, f76 // A5 * B2 5401 nop __LINE__ 5402 } 5403 ;; 5404 { .mfb 5405 (p5) LDFD f18 = [C1 ], SIZE 5406 (p3) FMA f69 = f45, f56, f69 // A6 * B1 5407 nop __LINE__ 5408 } 5409 { .mfb 5410 (p5) LDFD f19 = [C9 ], SIZE 5411 (p3) FMA f77 = f45, f57, f77 // A6 * B2 5412 nop __LINE__ 5413 } 5414 ;; 5415 { .mfb 5416 (p5) LDFD f20 = [C1 ], SIZE 5417 (p3) FMA f70 = f46, f56, f70 // A7 * B1 5418 nop __LINE__ 5419 } 5420 { .mfb 5421 (p5) LDFD f21 = [C9 ], SIZE 5422 (p3) FMA f78 = f46, f57, f78 // A7 * B2 5423 nop __LINE__ 5424 } 5425 ;; 5426 { .mfi 5427 (p5) LDFD f22 = [C1 ], -11 * SIZE 5428 (p3) FMA f71 = f47, f56, f71 // A8 * B1 5429 adds L = -1, L 5430 } 5431 { .mfb 5432 (p5) LDFD f23 = [C9 ], -11 * SIZE 5433 (p3) FMA f79 = f47, f57, f79 // A8 * B2 5434 br.cloop.sptk.few .L093 5435 } 5436 ;; /* Write-back for the eight-A by two-B tile: loads the second column's C words through C2/C10 into f24..f39 while applying ALPHA_R / ALPHA_I FMAs to the first column's words (f6..f23), interleaved with the stores. */ 5437 { .mmf 5438 LDFD f24 = [C2 ], SIZE 5439 LDFD f25 = [C10], SIZE 5440 FMA f6 = ALPHA_R, f64, f6 5441 } 5442 { .mmf 5443 nop __LINE__ 5444 nop __LINE__ 5445 FMA f7 = ALPHA_R, f66, f7 5446 } 5447 ;; 5448 { .mmf 5449 LDFD f26 = [C2 ], SIZE 5450 LDFD f27 = [C10], SIZE 5451 FMA f10 = ALPHA_I, f64, f10 5452 } 5453 { .mmf 5454 nop __LINE__ 5455 nop __LINE__ 5456 FMA f11 = ALPHA_I, f66, f11 5457 } 5458 ;; 5459 { .mmf 5460 LDFD f28 = [C2 ], SIZE 
/* Continuation of the eight-A by two-B write-back: finishes the C2/C10 loads (the last pair rewinds by 11 * SIZE), stores the first column through C1/C9 (each pointer advances SIZE, SIZE, SIZE, 5 * SIZE, twice), and applies ALPHA_R / ALPHA_I FMAs to the second column's words f24..f31 using accumulators f72..f75. */ 5461 LDFD f29 = [C10], SIZE 5462 FMA f12 = ALPHA_R, f65, f12 5463 } 5464 { .mmf 5465 nop __LINE__ 5466 nop __LINE__ 5467 FMA f13 = ALPHA_R, f67, f13 5468 } 5469 ;; 5470 { .mmf 5471 LDFD f30 = [C2 ], 5 * SIZE 5472 LDFD f31 = [C10], 5 * SIZE 5473 FMA f14 = ALPHA_I, f65, f14 5474 } 5475 { .mmf 5476 nop __LINE__ 5477 nop __LINE__ 5478 FMA f15 = ALPHA_I, f67, f15 5479 } 5480 ;; 5481 { .mmf 5482 STFD [C1 ] = f6, SIZE 5483 STFD [C9 ] = f7, SIZE 5484 FMA f16 = ALPHA_R, f68, f16 5485 } 5486 { .mmf 5487 LDFD f32 = [C2 ], SIZE 5488 LDFD f33 = [C10], SIZE 5489 FMA f17 = ALPHA_R, f70, f17 5490 } 5491 ;; 5492 { .mmf 5493 STFD [C1 ] = f10, SIZE 5494 STFD [C9 ] = f11, SIZE 5495 FMA f18 = ALPHA_I, f68, f18 5496 } 5497 { .mmf 5498 LDFD f34 = [C2 ], SIZE 5499 LDFD f35 = [C10], SIZE 5500 FMA f19 = ALPHA_I, f70, f19 5501 } 5502 ;; 5503 { .mmf 5504 STFD [C1 ] = f12, SIZE 5505 STFD [C9 ] = f13, SIZE 5506 FMA f20 = ALPHA_R, f69, f20 5507 } 5508 { .mmf 5509 LDFD f36 = [C2 ], SIZE 5510 LDFD f37 = [C10], SIZE 5511 FMA f21 = ALPHA_R, f71, f21 5512 } 5513 ;; 5514 { .mmf 5515 STFD [C1 ] = f14, 5 * SIZE 5516 STFD [C9 ] = f15, 5 * SIZE 5517 FMA f22 = ALPHA_I, f69, f22 5518 } 5519 { .mmf 5520 LDFD f38 = [C2 ], - 11 * SIZE 5521 LDFD f39 = [C10], - 11 * SIZE 5522 FMA f23 = ALPHA_I, f71, f23 5523 } 5524 ;; 5525 { .mmf 5526 STFD [C1 ] = f16, SIZE 5527 STFD [C9 ] = f17, SIZE 5528 FMA f24 = ALPHA_R, f72, f24 5529 } 5530 { .mmf 5531 nop __LINE__ 5532 nop __LINE__ 5533 FMA f25 = ALPHA_R, f74, f25 5534 } 5535 ;; 5536 { .mmf 5537 STFD [C1 ] = f18, SIZE 5538 STFD [C9 ] = f19, SIZE 5539 FMA f26 = ALPHA_I, f72, f26 5540 } 5541 { .mmf 5542 nop __LINE__ 5543 nop __LINE__ 5544 FMA f27 = ALPHA_I, f74, f27 5545 } 5546 ;; 5547 { .mmf 5548 STFD [C1 ] = f20, SIZE 5549 STFD [C9 ] = f21, SIZE 5550 FMA f28 = ALPHA_R, f73, f28 5551 } 5552 { .mmf 5553 nop __LINE__ 5554 nop __LINE__ 5555 FMA f29 = ALPHA_R, f75, f29 5556 } 5557 ;; 5558 { .mmf 5559 STFD [C1 ] = f22, 5 * SIZE 5560 STFD [C9 ] = f23, 5 * SIZE 5561 FMA f30 = 
ALPHA_I, f73, f30 5562 } 5563 { .mmf 5564 nop __LINE__ 5565 nop __LINE__ 5566 FMA f31 = ALPHA_I, f75, f31 5567 } 5568 ;; /* End of the eight-A by two-B write-back: stores the second column through C2/C10 while finishing the FMAs on f32..f39 (accumulators f76..f79), clears f64..f67 and f72..f75, then decrements I and loops to .L092 while the pre-decrement I != 1. */ 5569 { .mmf 5570 STFD [C2 ] = f24, SIZE 5571 STFD [C10] = f25, SIZE 5572 FMA f32 = ALPHA_R, f76, f32 5573 } 5574 { .mmf 5575 nop __LINE__ 5576 nop __LINE__ 5577 FMA f33 = ALPHA_R, f78, f33 5578 } 5579 ;; 5580 { .mmf 5581 STFD [C2 ] = f26, SIZE 5582 STFD [C10] = f27, SIZE 5583 FMA f34 = ALPHA_I, f76, f34 5584 } 5585 { .mmf 5586 nop __LINE__ 5587 nop __LINE__ 5588 FMA f35 = ALPHA_I, f78, f35 5589 } 5590 ;; 5591 { .mmf 5592 STFD [C2 ] = f28, SIZE 5593 STFD [C10] = f29, SIZE 5594 FMA f36 = ALPHA_R, f77, f36 5595 } 5596 { .mmf 5597 nop __LINE__ 5598 nop __LINE__ 5599 FMA f37 = ALPHA_R, f79, f37 5600 } 5601 ;; 5602 { .mmf 5603 STFD [C2 ] = f30, 5 * SIZE 5604 STFD [C10] = f31, 5 * SIZE 5605 FMA f38 = ALPHA_I, f77, f38 5606 } 5607 { .mmf 5608 nop __LINE__ 5609 nop __LINE__ 5610 FMA f39 = ALPHA_I, f79, f39 5611 } 5612 ;; 5613 { .mmf 5614 STFD [C2 ] = f32, SIZE 5615 STFD [C10] = f33, SIZE 5616 mov f64 = f0 5617 } 5618 { .mmf 5619 cmp.ne p6, p0 = 1, I 5620 nop __LINE__ 5621 mov f72 = f0 5622 } 5623 ;; 5624 { .mmf 5625 STFD [C2 ] = f34, SIZE 5626 STFD [C10] = f35, SIZE 5627 mov f65 = f0 5628 } 5629 { .mmf 5630 nop __LINE__ 5631 nop __LINE__ 5632 mov f73 = f0 5633 } 5634 ;; 5635 { .mmf 5636 STFD [C2 ] = f36, SIZE 5637 STFD [C10] = f37, SIZE 5638 mov f66 = f0 5639 } 5640 { .mmf 5641 nop __LINE__ 5642 nop __LINE__ 5643 mov f74 = f0 5644 } 5645 ;; 5646 { .mmf 5647 STFD [C2 ] = f38, 5 * SIZE 5648 STFD [C10] = f39, 5 * SIZE 5649 mov f67 = f0 5650 } 5651 { .mfb 5652 adds I = -1, I 5653 mov f75 = f0 5654 (p6) br.cond.dptk .L092 5655 } 5656 ;; 5657 .align 32 5658 /* .L100: runs only when bit 2 of M is set (otherwise branches to .L110). Preloads two B values (f48, f49), clears f75 and starts computing the K loop count from K + 1. */ 5659.L100: 5660 { .mib 5661 nop __LINE__ 5662 tbit.z p6, p7 = M, 2 5663 (p6) br.cond.dptk .L110 5664 } 5665 ;; 5666 { .mmf 5667 LDFPD f48, f49 = [B] 5668 adds BOFFSET = 2 * SIZE, B 5669 mov f75 = f0 5670 } 5671 { .mii 5672 nop __LINE__ 5673 adds L = 1, K 5674 } 5675 ;; 5676 { .mii 5677 adds PREA = 
(PREFETCHSIZE + 0) * SIZE, AOFFSET 5678 tbit.z p12, p0 = L, 0 5679 shr L = L, 1 5680 } 5681 ;; /* Rest of the .L100 setup: preloads four A values (f32..f35), sets ar.lc = ((K + 1) >> 1) - 1 and starts p3 as true. */ 5682 { .mmi 5683 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 5684 nop __LINE__ 5685 adds L = -1, L 5686 } 5687 ;; 5688 { .mmi 5689 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 5690 cmp.eq p3, p0 = r0, r0 5691 mov ar.lc = L 5692 } 5693 ;; 5694 .align 32 5695 /* .L102: K loop, two steps per pass, for four A values against two B values; accumulators are f64..f67 (B1) and f72..f75 (B2). PREB is recomputed from BOFFSET on every pass before the B prefetch. p5 gates the early C loads through C1 and C9 = C1 + 4 * SIZE. */ 5696.L102: 5697 { .mfi 5698 lfetch.nt1 [PREA], 8 * SIZE 5699 FMA f64 = f32, f48, f64 // A1 * B1 5700 cmp.ne p4, p5 = 0, L 5701 } 5702 { .mfi 5703 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 5704 FMA f72 = f32, f49, f72 // A1 * B2 5705 (p12) cmp.ne p3, p0 = 0, L 5706 } 5707 ;; 5708 { .mfi 5709 lfetch.nt1 [PREB], 4 * SIZE 5710 FMA f65 = f33, f48, f65 // A2 * B1 5711 adds C9 = 4 * SIZE, C1 5712 } 5713 { .mfi 5714 nop __LINE__ 5715 FMA f73 = f33, f49, f73 // A2 * B2 5716 adds C10 = 4 * SIZE, C2 5717 } 5718 ;; 5719 { .mfb 5720 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 5721 FMA f66 = f34, f48, f66 // A3 * B1 5722 nop __LINE__ 5723 } 5724 { .mfb 5725 (p5) LDFD f6 = [C1 ], SIZE 5726 FMA f74 = f34, f49, f74 // A3 * B2 5727 nop __LINE__ 5728 } 5729 ;; 5730 { .mfb 5731 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 5732 FMA f67 = f35, f48, f67 // A4 * B1 5733 nop __LINE__ 5734 } 5735 { .mfb 5736 (p5) LDFD f7 = [C9 ], SIZE 5737 FMA f75 = f35, f49, f75 // A4 * B2 5738 nop __LINE__ 5739 } 5740 ;; 5741 { .mfb 5742 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 5743 (p3) FMA f64 = f40, f56, f64 // A1 * B1 5744 nop __LINE__ 5745 } 5746 { .mfb 5747 (p5) LDFD f10 = [C1 ], SIZE 5748 (p3) FMA f72 = f40, f57, f72 // A1 * B2 5749 nop __LINE__ 5750 } 5751 ;; 5752 { .mfb 5753 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 5754 (p3) FMA f65 = f41, f56, f65 // A2 * B1 5755 nop __LINE__ 5756 } 5757 { .mfb 5758 (p5) LDFD f11 = [C9 ], SIZE 5759 (p3) FMA f73 = f41, f57, f73 // A2 * B2 5760 nop __LINE__ 5761 } 5762 ;; 5763 { .mfb 5764 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 5765 (p3) FMA f66 = f42, f56, f66 // A3 * B1 5766 nop __LINE__ 5767 } 5768 { .mfb 5769 (p5) LDFD f12 = 
[C1], SIZE 5770 (p3) FMA f74 = f42, f57, f74 // A3 * B2 5771 nop __LINE__ 5772 } 5773 ;; 5774 { .mfi 5775 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 5776 (p3) FMA f67 = f43, f56, f67 // A4 * B1 5777 adds L = -1, L 5778 } 5779 { .mfb 5780 (p5) LDFD f13 = [C9], SIZE 5781 (p3) FMA f75 = f43, f57, f75 // A4 * B2 5782 br.cloop.sptk.few .L102 5783 } 5784 ;; /* Write-back for the four-A by two-B tile: loads the last first-column words (rewinding C1/C9 by 3 * SIZE), loads the second column through C2/C10 into f16..f23 (also rewound by 3 * SIZE), applies ALPHA_R / ALPHA_I FMAs, and stores the first column with pointer steps SIZE, SIZE, SIZE, 5 * SIZE. */ 5785 { .mmf 5786 LDFD f14 = [C1], - 3 * SIZE 5787 LDFD f15 = [C9], - 3 * SIZE 5788 FMA f6 = ALPHA_R, f64, f6 5789 } 5790 { .mmf 5791 nop __LINE__ 5792 nop __LINE__ 5793 FMA f7 = ALPHA_R, f66, f7 5794 } 5795 ;; 5796 { .mmf 5797 LDFD f16 = [C2 ], SIZE 5798 LDFD f17 = [C10], SIZE 5799 FMA f10 = ALPHA_I, f64, f10 5800 } 5801 { .mmf 5802 nop __LINE__ 5803 nop __LINE__ 5804 FMA f11 = ALPHA_I, f66, f11 5805 } 5806 ;; 5807 { .mmf 5808 LDFD f18 = [C2 ], SIZE 5809 LDFD f19 = [C10], SIZE 5810 FMA f12 = ALPHA_R, f65, f12 5811 } 5812 { .mmf 5813 nop __LINE__ 5814 nop __LINE__ 5815 FMA f13 = ALPHA_R, f67, f13 5816 } 5817 ;; 5818 { .mmf 5819 LDFD f20 = [C2 ], SIZE 5820 LDFD f21 = [C10], SIZE 5821 FMA f14 = ALPHA_I, f65, f14 5822 } 5823 { .mmf 5824 nop __LINE__ 5825 nop __LINE__ 5826 FMA f15 = ALPHA_I, f67, f15 5827 } 5828 ;; 5829 { .mmf 5830 STFD [C1 ] = f6, SIZE 5831 STFD [C9 ] = f7, SIZE 5832 FMA f16 = ALPHA_R, f72, f16 5833 } 5834 { .mmf 5835 LDFD f22 = [C2 ], - 3 * SIZE 5836 LDFD f23 = [C10], - 3 * SIZE 5837 FMA f17 = ALPHA_R, f74, f17 5838 } 5839 ;; 5840 { .mmf 5841 STFD [C1 ] = f10, SIZE 5842 STFD [C9 ] = f11, SIZE 5843 FMA f18 = ALPHA_I, f72, f18 5844 } 5845 { .mmf 5846 nop __LINE__ 5847 nop __LINE__ 5848 FMA f19 = ALPHA_I, f74, f19 5849 } 5850 ;; 5851 { .mmf 5852 STFD [C1 ] = f12, SIZE 5853 STFD [C9 ] = f13, SIZE 5854 FMA f20 = ALPHA_R, f73, f20 5855 } 5856 { .mmf 5857 nop __LINE__ 5858 nop __LINE__ 5859 FMA f21 = ALPHA_R, f75, f21 5860 } 5861 ;; 5862 { .mmf 5863 STFD [C1 ] = f14, 5 * SIZE 5864 STFD [C9 ] = f15, 5 * SIZE 5865 FMA f22 = ALPHA_I, f73, f22 5866 } 5867 { .mmf 5868 nop __LINE__ 5869 nop __LINE__ 5870 FMA f23 = 
ALPHA_I, f75, f23 5871 } 5872 ;; /* End of the four-A by two-B write-back: stores the second column through C2/C10 and clears f64, f65, f72, f73 for the following tiles. */ 5873 { .mmf 5874 STFD [C2 ] = f16, SIZE 5875 STFD [C10] = f17, SIZE 5876 mov f64 = f0 5877 } 5878 ;; 5879 { .mmf 5880 STFD [C2 ] = f18, SIZE 5881 STFD [C10] = f19, SIZE 5882 mov f65 = f0 5883 } 5884 ;; 5885 { .mmf 5886 STFD [C2 ] = f20, SIZE 5887 STFD [C10] = f21, SIZE 5888 mov f72 = f0 5889 } 5890 ;; 5891 { .mmf 5892 STFD [C2 ] = f22, 5 * SIZE 5893 STFD [C10] = f23, 5 * SIZE 5894 mov f73 = f0 5895 } 5896 ;; 5897 .align 32 5898 /* .L110: runs only when bit 1 of M is set (otherwise branches to .L120). Preloads two B values (f48, f49) and two A values (f32, f33), sets PREA/PREB, ar.lc = ((K + 1) >> 1) - 1, p12 from bit 0 of K + 1, and p3 true. */ 5899.L110: 5900 { .mib 5901 nop __LINE__ 5902 tbit.z p6, p7 = M, 1 5903 (p6) br.cond.dptk .L120 5904 } 5905 ;; 5906 { .mmi 5907 LDFPD f48, f49 = [B] 5908 adds BOFFSET = 2 * SIZE, B 5909 adds L = 1, K 5910 } 5911 ;; 5912 { .mii 5913 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 5914 tbit.z p12, p0 = L, 0 5915 shr L = L, 1 5916 } 5917 ;; 5918 { .mmi 5919 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 5920 nop __LINE__ 5921 adds L = -1, L 5922 } 5923 ;; 5924 { .mmi 5925 cmp.eq p3, p0 = r0, r0 5926 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 5927 mov ar.lc = L 5928 } 5929 ;; 5930 .align 32 5931 /* .L112: K loop, two steps per pass, for two A values against two B values; accumulators are f64, f65 (B1) and f72, f73 (B2). p5 gates the early C loads from C1 and C2. */ 5932.L112: 5933 { .mfi 5934 lfetch.nt1 [PREA], 4 * SIZE 5935 FMA f64 = f32, f48, f64 // A1 * B1 5936 cmp.ne p4, p5 = 0, L 5937 } 5938 { .mfi 5939 lfetch.nt1 [PREB], 4 * SIZE 5940 FMA f72 = f32, f49, f72 // A1 * B2 5941 (p12) cmp.ne p3, p0 = 0, L 5942 } 5943 ;; 5944 { .mmf 5945 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 5946 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 5947 FMA f65 = f33, f48, f65 // A2 * B1 5948 } 5949 { .mmf 5950 (p5) LDFD f6 = [C1 ], SIZE 5951 (p5) LDFD f7 = [C2 ], SIZE 5952 FMA f73 = f33, f49, f73 // A2 * B2 5953 } 5954 ;; 5955 { .mfb 5956 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 5957 (p3) FMA f64 = f40, f56, f64 // A1 * B1 5958 nop __LINE__ 5959 } 5960 { .mfb 5961 (p5) LDFD f10 = [C1 ], SIZE 5962 (p3) FMA f72 = f40, f57, f72 // A1 * B2 5963 nop __LINE__ 5964 } 5965 ;; 5966 { .mfi 5967 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 5968 (p3) FMA f65 = f41, f56, f65 // A2 * B1 5969 adds L = -1, L 5970 } 5971 { .mfb 
5972 (p5) LDFD f11 = [C2 ], SIZE 5973 (p3) FMA f73 = f41, f57, f73 // A2 * B2 5974 br.cloop.sptk.few .L112 5975 } 5976 ;; /* Write-back for the two-A by two-B tile: loads the remaining two words per column (rewinding C1/C2 by 3 * SIZE), applies ALPHA_R / ALPHA_I FMAs with accumulators f64, f65, f72, f73, stores four words per column, and clears those accumulators. */ 5977 { .mmf 5978 LDFD f12 = [C1], SIZE 5979 LDFD f13 = [C2], SIZE 5980 FMA f6 = ALPHA_R, f64, f6 5981 } 5982 { .mmf 5983 nop __LINE__ 5984 nop __LINE__ 5985 FMA f7 = ALPHA_R, f72, f7 5986 } 5987 ;; 5988 { .mmf 5989 LDFD f14 = [C1], - 3 * SIZE 5990 LDFD f15 = [C2], - 3 * SIZE 5991 FMA f10 = ALPHA_I, f64, f10 5992 } 5993 { .mmf 5994 nop __LINE__ 5995 nop __LINE__ 5996 FMA f11 = ALPHA_I, f72, f11 5997 } 5998 ;; 5999 { .mmf 6000 nop __LINE__ 6001 nop __LINE__ 6002 FMA f12 = ALPHA_R, f65, f12 6003 } 6004 { .mmf 6005 nop __LINE__ 6006 nop __LINE__ 6007 FMA f13 = ALPHA_R, f73, f13 6008 } 6009 ;; 6010 { .mmf 6011 nop __LINE__ 6012 nop __LINE__ 6013 FMA f14 = ALPHA_I, f65, f14 6014 } 6015 { .mmf 6016 nop __LINE__ 6017 nop __LINE__ 6018 FMA f15 = ALPHA_I, f73, f15 6019 } 6020 ;; 6021 { .mmf 6022 STFD [C1] = f6, SIZE 6023 STFD [C2] = f7, SIZE 6024 mov f64 = f0 6025 } 6026 ;; 6027 { .mmf 6028 STFD [C1] = f10, SIZE 6029 STFD [C2] = f11, SIZE 6030 mov f72 = f0 6031 } 6032 ;; 6033 { .mmf 6034 STFD [C1] = f12, SIZE 6035 STFD [C2] = f13, SIZE 6036 mov f65 = f0 6037 } 6038 ;; 6039 { .mmf 6040 STFD [C1] = f14, SIZE 6041 STFD [C2] = f15, SIZE 6042 mov f73 = f0 6043 } 6044 ;; 6045 .align 32 6046 /* .L120: runs only when bit 0 of M is set (otherwise branches to .L129). Preloads two B values (f48, f49) and one A value (f32); ar.lc = ((K + 1) >> 1) - 1, p12 from bit 0 of K + 1, p3 true. */ 6047.L120: 6048 { .mib 6049 nop __LINE__ 6050 tbit.z p6, p7 = M, 0 6051 (p6) br.cond.dptk .L129 6052 } 6053 ;; 6054 { .mmi 6055 LDFPD f48, f49 = [B] 6056 adds BOFFSET = 2 * SIZE, B 6057 adds L = 1, K 6058 } 6059 ;; 6060 { .mii 6061 nop __LINE__ 6062 tbit.z p12, p0 = L, 0 6063 shr L = L, 1 6064 } 6065 ;; 6066 { .mmi 6067 LDFD f32 = [AOFFSET], 1 * SIZE 6068 nop __LINE__ 6069 adds L = -1, L 6070 } 6071 ;; 6072 { .mmi 6073 cmp.eq p3, p0 = r0, r0 6074 nop __LINE__ 6075 mov ar.lc = L 6076 } 6077 ;; 6078 .align 32 6079 /* .L122: K loop, two steps per pass, for one A value against two B values; accumulators are f64 (B1) and f72 (B2). */ 6080.L122: 6081 { .mfi 6082 FMA f64 = f32, f48, f64 // A1 * B1 6083 cmp.ne p4, p5 = 0, L 6084 } 6085 { .mfi 6086 nop __LINE__ 6087 FMA f72 = f32, f49, f72 // A1 * B2 
6088 (p12) cmp.ne p3, p0 = 0, L 6089 } 6090 ;; 6091 { .mmi 6092 (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE 6093 (p3) LDFD f40 = [AOFFSET], 1 * SIZE 6094 nop __LINE__ 6095 } 6096 { .mmi 6097 (p5) LDFD f6 = [C1], SIZE 6098 (p5) LDFD f7 = [C2], SIZE 6099 } 6100 ;; 6101 { .mfi 6102 (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 6103 (p3) FMA f64 = f40, f56, f64 // A1 * B1 6104 adds L = -1, L 6105 } 6106 { .mfb 6107 (p4) LDFD f32 = [AOFFSET], 1 * SIZE 6108 (p3) FMA f72 = f40, f57, f72 // A1 * B2 6109 br.cloop.sptk.few .L122 6110 } 6111 ;; 6112 /* .L128: write-back for the one-A by two-B tile. The second C word of each column is loaded under p5, which the final loop pass left set (L == 0), and the pointers are rewound by SIZE; then ALPHA_R / ALPHA_I FMAs with f64 and f72, and two stores per column. */ 6113.L128: 6114 { .mmf 6115 (p5) LDFD f10 = [C1], -SIZE 6116 (p5) LDFD f11 = [C2], -SIZE 6117 FMA f6 = ALPHA_R, f64, f6 6118 } 6119 { .mmf 6120 nop __LINE__ 6121 nop __LINE__ 6122 FMA f7 = ALPHA_R, f72, f7 6123 } 6124 ;; 6125 { .mmf 6126 nop __LINE__ 6127 nop __LINE__ 6128 FMA f10 = ALPHA_I, f64, f10 6129 } 6130 { .mmf 6131 nop __LINE__ 6132 nop __LINE__ 6133 FMA f11 = ALPHA_I, f72, f11 6134 } 6135 ;; 6136 { .mmi 6137 STFD [C1 ] = f6, SIZE 6138 STFD [C2 ] = f7, SIZE 6139 nop __LINE__ 6140 } 6141 ;; 6142 { .mmi 6143 STFD [C1 ] = f10, SIZE 6144 STFD [C2 ] = f11, SIZE 6145 nop __LINE__ 6146 } 6147 ;; 6148 .align 32 6149 /* .L129: B takes the value of BOFFSET and AOFFSET is reset to A. */ 6150.L129: 6151 { .mmi 6152 mov B = BOFFSET 6153 mov AOFFSET = A 6154 nop __LINE__ 6155 } 6156 ;; 6157 .align 16 6158 /* .L130: single-column section, skipped (branch to .L999) when bit 0 of N is clear. Clears f64..f67, sets AOFFSET = A, C1 = C and I = M >> 3; branches to .L140 when I == 0. */ 6159.L130: 6160 { .mfi 6161 nop __LINE__ 6162 mov f64 = f0 6163 tbit.z p6, p0 = N, 0 6164 } 6165 { .mib 6166 mov AOFFSET = A 6167 shr I = M, 3 6168 (p6) br.cond.dpnt .L999 6169 } 6170 ;; 6171 { .mfi 6172 mov C1 = C 6173 mov f65 = f0 6174 nop __LINE__ 6175 } 6176 ;; 6177 { .mfi 6178 nop __LINE__ 6179 mov f66 = f0 6180 nop __LINE__ 6181 } 6182 { .mfb 6183 cmp.eq p7, p0 = 0, I 6184 mov f67 = f0 6185 (p7) br.cond.dpnt .L140 6186 } 6187 ;; 6188 .align 32 6189 /* .L132: setup for one pass of the I loop in the single-column section: preloads one B value (f48) and eight A values (f32..f39), clears f68..f71, prefetches through PREC, and sets the K loop count. */ 6190.L132: 6191 { .mfb 6192 LDFD f48 = [B] 6193 mov f68 = f0 6194 nop __LINE__ 6195 } 6196 { .mfi 6197 adds BOFFSET = 1 * SIZE, B 6198 mov f69 = f0 6199 nop __LINE__ 6200 } 6201 ;; 6202 { .mfi 6203 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 6204 mov f70 = f0 6205 
adds L = 1, K 6206 } 6207 ;; /* Rest of the .L132 setup: p12 from bit 0 of K + 1, ar.lc = ((K + 1) >> 1) - 1, p3 true, remaining A preloads, PREC/PREA set and one CPREFETCH issued. */ 6208 { .mii 6209 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 6210 tbit.z p12, p0 = L, 0 6211 shr L = L, 1 6212 } 6213 ;; 6214 { .mfi 6215 LDFPD f36, f37 = [AOFFSET], 2 * SIZE 6216 mov f71 = f0 6217 adds L = -1, L 6218 } 6219 ;; 6220 { .mmi 6221 LDFPD f38, f39 = [AOFFSET], 2 * SIZE 6222 adds PREC = CPREFETCHSIZE * SIZE, C1 6223 cmp.eq p3, p0 = r0, r0 6224 } 6225 ;; 6226 { .mmi 6227 CPREFETCH [PREC] 6228 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 6229 mov ar.lc = L 6230 } 6231 ;; 6232 .align 32 6233 /* .L133: K loop, two steps per pass, for eight A values against one B value; accumulators are f64..f71. p5 gates the early C loads through C1 and C9 = C1 + 4 * SIZE. NOTE(review): PREB is computed each pass, but no lfetch through PREB appears in this loop - confirm it is intentionally unused. */ 6234.L133: 6235 { .mfi 6236 lfetch.nt1 [PREA], 16 * SIZE 6237 FMA f64 = f32, f48, f64 // A1 * B1 6238 cmp.ne p4, p5 = 0, L 6239 } 6240 { .mfi 6241 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET 6242 FMA f65 = f33, f48, f65 // A2 * B1 6243 (p12) cmp.ne p3, p0 = 0, L 6244 } 6245 ;; 6246 { .mfi 6247 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 6248 FMA f66 = f34, f48, f66 // A3 * B1 6249 adds C9 = 4 * SIZE, C1 6250 } 6251 { .mmf 6252 (p3) LDFD f56 = [BOFFSET], 1 * SIZE 6253 (p5) LDFD f6 = [C1 ], SIZE 6254 FMA f67 = f35, f48, f67 // A4 * B1 6255 } 6256 ;; 6257 { .mfb 6258 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 6259 FMA f68 = f36, f48, f68 // A5 * B1 6260 nop __LINE__ 6261 } 6262 { .mfb 6263 (p5) LDFD f7 = [C9 ], SIZE 6264 FMA f69 = f37, f48, f69 // A6 * B1 6265 nop __LINE__ 6266 } 6267 ;; 6268 { .mfb 6269 (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE 6270 FMA f70 = f38, f48, f70 // A7 * B1 6271 nop __LINE__ 6272 } 6273 { .mfb 6274 (p5) LDFD f10 = [C1 ], SIZE 6275 FMA f71 = f39, f48, f71 // A8 * B1 6276 nop __LINE__ 6277 } 6278 ;; 6279 { .mfb 6280 (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE 6281 (p3) FMA f64 = f40, f56, f64 // A1 * B1 6282 nop __LINE__ 6283 } 6284 { .mfb 6285 (p5) LDFD f11 = [C9 ], SIZE 6286 (p3) FMA f65 = f41, f56, f65 // A2 * B1 6287 nop __LINE__ 6288 } 6289 ;; 6290 { .mfb 6291 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 6292 (p3) FMA f66 = f42, f56, f66 // A3 * B1 6293 nop __LINE__ 6294 } 6295 { .mmf 6296 (p4) LDFD f48 = [BOFFSET], 1 * 
SIZE 6297 (p5) LDFD f12 = [C1 ], SIZE 6298 (p3) FMA f67 = f43, f56, f67 // A4 * B1 6299 } 6300 ;; 6301 { .mfb 6302 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 6303 (p3) FMA f68 = f44, f56, f68 // A5 * B1 6304 nop __LINE__ 6305 } 6306 { .mfb 6307 (p5) LDFD f13 = [C9 ], SIZE 6308 (p3) FMA f69 = f45, f56, f69 // A6 * B1 6309 nop __LINE__ 6310 } 6311 ;; 6312 { .mfi 6313 (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE 6314 (p3) FMA f70 = f46, f56, f70 // A7 * B1 6315 adds L = -1, L 6316 } 6317 { .mfb 6318 (p5) LDFD f14 = [C1 ], 5 * SIZE 6319 (p3) FMA f71 = f47, f56, f71 // A8 * B1 6320 nop __LINE__ 6321 } 6322 ;; 6323 { .mfb 6324 (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE 6325 nop __LINE__ 6326 nop __LINE__ 6327 } 6328 { .mfb 6329 (p5) LDFD f15 = [C9 ], 5 * SIZE 6330 nop __LINE__ 6331 br.cloop.sptk.few .L133 6332 } 6333 ;; 6334 /* .L138: write-back for the eight-A by one-B tile. Loads the remaining C words f16..f23 through C1/C9 (the last pair rewinds by 11 * SIZE), applies ALPHA_R / ALPHA_I FMAs with accumulators f64..f71, and stores through C1/C9 with pointer steps SIZE, SIZE, SIZE, 5 * SIZE. */ 6335.L138: 6336 { .mmf 6337 LDFD f16 = [C1 ], SIZE 6338 LDFD f17 = [C9 ], SIZE 6339 FMA f6 = ALPHA_R, f64, f6 6340 } 6341 { .mmf 6342 nop __LINE__ 6343 nop __LINE__ 6344 FMA f7 = ALPHA_R, f66, f7 6345 } 6346 ;; 6347 { .mmf 6348 LDFD f18 = [C1 ], SIZE 6349 LDFD f19 = [C9 ], SIZE 6350 FMA f10 = ALPHA_I, f64, f10 6351 } 6352 { .mmf 6353 nop __LINE__ 6354 nop __LINE__ 6355 FMA f11 = ALPHA_I, f66, f11 6356 } 6357 ;; 6358 { .mmf 6359 LDFD f20 = [C1 ], SIZE 6360 LDFD f21 = [C9 ], SIZE 6361 FMA f12 = ALPHA_R, f65, f12 6362 } 6363 { .mmf 6364 nop __LINE__ 6365 nop __LINE__ 6366 FMA f13 = ALPHA_R, f67, f13 6367 } 6368 ;; 6369 { .mmf 6370 LDFD f22 = [C1 ], - 11 * SIZE 6371 LDFD f23 = [C9 ], - 11 * SIZE 6372 FMA f14 = ALPHA_I, f65, f14 6373 } 6374 { .mmf 6375 nop __LINE__ 6376 nop __LINE__ 6377 FMA f15 = ALPHA_I, f67, f15 6378 } 6379 ;; 6380 { .mmf 6381 STFD [C1 ] = f6, SIZE 6382 STFD [C9 ] = f7, SIZE 6383 FMA f16 = ALPHA_R, f68, f16 6384 } 6385 { .mmf 6386 nop __LINE__ 6387 nop __LINE__ 6388 FMA f17 = ALPHA_R, f70, f17 6389 } 6390 ;; 6391 { .mmf 6392 STFD [C1 ] = f10, SIZE 6393 STFD [C9 ] = f11, SIZE 6394 FMA f18 = ALPHA_I, f68, f18 6395 } 6396 { .mmf 6397 nop 
__LINE__ 6398 nop __LINE__ 6399 FMA f19 = ALPHA_I, f70, f19 6400 } 6401 ;; /* End of the .L138 write-back: the compare of I against 1 and the decrement of I are issued in the same bundle, so the loop back to .L132 is taken while the pre-decrement I != 1; f64..f67 are cleared for the next pass. */ 6402 { .mmf 6403 STFD [C1 ] = f12, SIZE 6404 STFD [C9 ] = f13, SIZE 6405 FMA f20 = ALPHA_R, f69, f20 6406 } 6407 { .mmf 6408 cmp.ne p6, p0 = 1, I 6409 adds I = -1, I 6410 FMA f21 = ALPHA_R, f71, f21 6411 } 6412 ;; 6413 { .mmf 6414 STFD [C1 ] = f14, 5 * SIZE 6415 STFD [C9 ] = f15, 5 * SIZE 6416 FMA f22 = ALPHA_I, f69, f22 6417 } 6418 { .mmf 6419 nop __LINE__ 6420 nop __LINE__ 6421 FMA f23 = ALPHA_I, f71, f23 6422 } 6423 ;; 6424 { .mmf 6425 STFD [C1 ] = f16, SIZE 6426 STFD [C9 ] = f17, SIZE 6427 mov f64 = f0 6428 } 6429 ;; 6430 { .mmf 6431 STFD [C1 ] = f18, SIZE 6432 STFD [C9 ] = f19, SIZE 6433 mov f65 = f0 6434 } 6435 ;; 6436 { .mmf 6437 STFD [C1 ] = f20, SIZE 6438 STFD [C9 ] = f21, SIZE 6439 mov f66 = f0 6440 } 6441 ;; 6442 { .mmf 6443 STFD [C1 ] = f22, 5 * SIZE 6444 STFD [C9 ] = f23, 5 * SIZE 6445 mov f67 = f0 6446 } 6447 { .mmb 6448 nop __LINE__ 6449 nop __LINE__ 6450 (p6) br.cond.dptk .L132 6451 } 6452 ;; 6453 .align 32 6454 /* .L140: runs only when bit 2 of M is set (otherwise branches to .L150). Preloads one B value (f48) and four A values (f32..f35); the first A load is predicated on p7, the complement of the branch predicate. Sets ar.lc = ((K + 1) >> 1) - 1, p12 and p3 as in the other tiles. */ 6455.L140: 6456 { .mib 6457 nop __LINE__ 6458 tbit.z p6, p7 = M, 2 6459 (p6) br.cond.dptk .L150 6460 } 6461 ;; 6462 { .mmi 6463 LDFD f48 = [B] 6464 adds BOFFSET = 1 * SIZE, B 6465 adds L = 1, K 6466 } 6467 ;; 6468 { .mii 6469 (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 6470 tbit.z p12, p0 = L, 0 6471 shr L = L, 1 6472 } 6473 ;; 6474 { .mmi 6475 LDFPD f34, f35 = [AOFFSET], 2 * SIZE 6476 adds L = -1, L 6477 nop __LINE__ 6478 } 6479 ;; 6480 { .mmi 6481 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET 6482 cmp.eq p3, p0 = r0, r0 6483 mov ar.lc = L 6484 } 6485 ;; 6486 .align 32 6487 /* .L142: K loop, two steps per pass, for four A values against one B value; accumulators are f64..f67. C9 = C1 + 4 * SIZE is computed only on the final pass (p5). */ 6488.L142: 6489 { .mfi 6490 lfetch.nt1 [PREA], 8 * SIZE 6491 FMA f64 = f32, f48, f64 // A1 * B1 6492 cmp.ne p4, p5 = 0, L 6493 } 6494 { .mfi 6495 nop __LINE__ 6496 FMA f65 = f33, f48, f65 // A2 * B1 6497 (p12) cmp.ne p3, p0 = 0, L 6498 } 6499 ;; 6500 { .mfi 6501 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 6502 FMA f66 = f34, f48, f66 // A3 * B1 6503 (p5) adds C9 = 4 * SIZE, C1 6504 } 6505 { .mmf 6506 
(p3) LDFD f56 = [BOFFSET], 1 * SIZE 6507 FMA f67 = f35, f48, f67 // A4 * B1 6508 } 6509 ;; /* NOTE(review): C10 is written below from C2, but neither C10 nor C2 is read again in the single-column section as far as this chunk shows - looks unused; confirm. */ 6510 { .mfi 6511 (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE 6512 (p3) FMA f64 = f40, f56, f64 // A1 * B1 6513 (p5) adds C10 = 2 * SIZE, C2 6514 } 6515 { .mmf 6516 (p5) LDFD f6 = [C1 ], SIZE 6517 (p5) LDFD f7 = [C9 ], SIZE 6518 (p3) FMA f65 = f41, f56, f65 // A2 * B1 6519 } 6520 ;; 6521 { .mmf 6522 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 6523 (p4) LDFD f48 = [BOFFSET], 1 * SIZE 6524 (p3) FMA f66 = f42, f56, f66 // A3 * B1 6525 } 6526 { .mmf 6527 (p5) LDFD f10 = [C1 ], SIZE 6528 (p5) LDFD f11 = [C9 ], SIZE 6529 (p3) FMA f67 = f43, f56, f67 // A4 * B1 6530 } 6531 ;; 6532 { .mfi 6533 (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE 6534 nop __LINE__ 6535 adds L = -1, L 6536 } 6537 { .mmb 6538 (p5) LDFD f12 = [C1 ], SIZE 6539 (p5) LDFD f13 = [C9 ], SIZE 6540 br.cloop.sptk.few .L142 6541 } 6542 ;; 6543 /* .L148: write-back for the four-A by one-B tile. Loads the last C word of each half (rewinding C1/C9 by 3 * SIZE), applies ALPHA_R / ALPHA_I FMAs with accumulators f64..f67, stores through C1/C9 with pointer steps SIZE, SIZE, SIZE, 5 * SIZE, and clears f64..f67. */ 6544.L148: 6545 { .mmf 6546 LDFD f14 = [C1 ], - 3 * SIZE 6547 LDFD f15 = [C9 ], - 3 * SIZE 6548 FMA f6 = ALPHA_R, f64, f6 6549 } 6550 { .mmf 6551 nop __LINE__ 6552 nop __LINE__ 6553 FMA f7 = ALPHA_R, f66, f7 6554 } 6555 ;; 6556 { .mmf 6557 nop __LINE__ 6558 nop __LINE__ 6559 FMA f10 = ALPHA_I, f64, f10 6560 } 6561 { .mmf 6562 nop __LINE__ 6563 nop __LINE__ 6564 FMA f11 = ALPHA_I, f66, f11 6565 } 6566 ;; 6567 { .mmf 6568 nop __LINE__ 6569 nop __LINE__ 6570 FMA f12 = ALPHA_R, f65, f12 6571 } 6572 { .mmf 6573 nop __LINE__ 6574 nop __LINE__ 6575 FMA f13 = ALPHA_R, f67, f13 6576 } 6577 ;; 6578 { .mmf 6579 nop __LINE__ 6580 nop __LINE__ 6581 FMA f14 = ALPHA_I, f65, f14 6582 } 6583 { .mmf 6584 nop __LINE__ 6585 nop __LINE__ 6586 FMA f15 = ALPHA_I, f67, f15 6587 } 6588 ;; 6589 { .mmf 6590 STFD [C1 ] = f6, SIZE 6591 STFD [C9 ] = f7, SIZE 6592 mov f64 = f0 6593 } 6594 ;; 6595 { .mmf 6596 STFD [C1 ] = f10, SIZE 6597 STFD [C9 ] = f11, SIZE 6598 mov f65 = f0 6599 } 6600 ;; 6601 { .mmf 6602 STFD [C1 ] = f12, SIZE 6603 STFD [C9 ] = f13, SIZE 6604 mov f66 = f0 6605 } 6606 ;; 6607 { .mmf 6608 STFD [C1 ] = 
f14, 5 * SIZE 6609 STFD [C9 ] = f15, 5 * SIZE 6610 mov f67 = f0 6611 } 6612 ;; 6613 .align 32 6614 /* .L150: runs only when bit 1 of M is set (otherwise branches to .L160). Preloads one B value (f48) and two A values (f32, f33); ar.lc = ((K + 1) >> 1) - 1, p12 from bit 0 of K + 1, p3 true. */ 6615.L150: 6616 { .mib 6617 nop __LINE__ 6618 tbit.z p6, p7 = M, 1 6619 (p6) br.cond.dptk .L160 6620 } 6621 ;; 6622 { .mmi 6623 LDFD f48 = [B] 6624 adds BOFFSET = 1 * SIZE, B 6625 adds L = 1, K 6626 } 6627 ;; 6628 { .mii 6629 cmp.eq p3, p0 = r0, r0 6630 tbit.z p12, p0 = L, 0 6631 shr L = L, 1 6632 } 6633 ;; 6634 { .mii 6635 LDFPD f32, f33 = [AOFFSET], 2 * SIZE 6636 adds L = -1, L 6637 ;; 6638 mov ar.lc = L 6639 } 6640 ;; 6641 .align 32 6642 /* .L152: K loop, two steps per pass, for two A values against one B value; accumulators are f64 and f65. No C loads are issued inside this loop. */ 6643.L152: 6644 { .mfi 6645 cmp.ne p4, p5 = 0, L 6646 FMA f64 = f32, f48, f64 // A1 * B1 6647 (p12) cmp.ne p3, p0 = 0, L 6648 } 6649 ;; 6650 { .mmf 6651 (p3) LDFD f56 = [BOFFSET], 1 * SIZE 6652 (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE 6653 FMA f65 = f33, f48, f65 // A2 * B1 6654 } 6655 ;; 6656 { .mfi 6657 (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE 6658 (p3) FMA f64 = f40, f56, f64 // A1 * B1 6659 adds L = -1, L 6660 } 6661 ;; 6662 { .mfb 6663 (p4) LDFD f48 = [BOFFSET], 1 * SIZE 6664 (p3) FMA f65 = f41, f56, f65 // A2 * B1 6665 br.cloop.sptk.few .L152 6666 } 6667 ;; 6668 /* .L158: write-back for the two-A by one-B tile, written without explicit bundles. Loads four C words into f68..f71 (rewinding C1 by 3 * SIZE), applies ALPHA_R / ALPHA_I FMAs with f64 and f65, stores four words through C1, and clears f64 and f65. */ 6669.L158: 6670 LDFD f68 = [C1 ], 1 * SIZE 6671 ;; 6672 LDFD f69 = [C1 ], 1 * SIZE 6673 ;; 6674 LDFD f70 = [C1 ], 1 * SIZE 6675 ;; 6676 LDFD f71 = [C1 ], - 3 * SIZE 6677 ;; 6678 FMA f68 = ALPHA_R, f64, f68 6679 FMA f69 = ALPHA_I, f64, f69 6680 FMA f70 = ALPHA_R, f65, f70 6681 FMA f71 = ALPHA_I, f65, f71 6682 ;; 6683 STFD [C1 ] = f68, SIZE 6684 ;; 6685 STFD [C1 ] = f69, SIZE 6686 ;; 6687 STFD [C1 ] = f70, SIZE 6688 mov f64 = f0 6689 ;; 6690 STFD [C1 ] = f71, SIZE 6691 mov f65 = f0 6692 ;; 6693 .align 32 6694 /* .L160: runs only when bit 0 of M is set (otherwise branches to .L169). Preloads one B value (f48) and one A value (f32) and sets up the K loop count as in the other tiles. */ 6695.L160: 6696 { .mib 6697 nop __LINE__ 6698 tbit.z p6, p7 = M, 0 6699 (p6) br.cond.dptk .L169 6700 } 6701 ;; 6702 { .mmi 6703 LDFD f48 = [B] 6704 adds BOFFSET = 1 * SIZE, B 6705 adds L = 1, K 6706 } 6707 ;; 6708 { .mii 6709 LDFD f32 = [AOFFSET], 1 * SIZE 6710 tbit.z p12, p0 = L, 0 6711 shr L = L, 1 6712 } 6713 ;; 6714 { .mii 6715 adds L = -1, L 6716 cmp.eq p3, 
p0 = r0, r0 6717 ;; 6718 mov ar.lc = L 6719 } 6720 ;; 6721 .align 32 6722 /* .L162: K loop, two steps per pass, for one A value against one B value; the accumulator is f64. On the final pass (p5) the two C words are loaded into f68 and f69 and C1 is left where it started. */ 6723.L162: 6724 { .mmf 6725 cmp.ne p4, p5 = 0, L 6726 (p12) cmp.ne p3, p0 = 0, L 6727 FMA f64 = f32, f48, f64 // A1 * B1 6728 } 6729 ;; 6730 { .mmi 6731 (p3) LDFD f56 = [BOFFSET], 1 * SIZE 6732 (p3) LDFD f40 = [AOFFSET], 1 * SIZE 6733 nop __LINE__ 6734 } 6735 ;; 6736 { .mmi 6737 (p4) LDFD f32 = [AOFFSET], 1 * SIZE 6738 (p5) LDFD f68 = [C1], 1 * SIZE 6739 adds L = -1, L 6740 } 6741 ;; 6742 { .mmf 6743 (p4) LDFD f48 = [BOFFSET], 1 * SIZE 6744 (p5) LDFD f69 = [C1], - 1 * SIZE 6745 (p3) FMA f64 = f40, f56, f64 // A1 * B1 6746 } 6747 { .mib 6748 nop __LINE__ 6749 nop __LINE__ 6750 br.cloop.sptk.few .L162 6751 } 6752 ;; /* Write-back for the one-A by one-B tile: ALPHA_R and ALPHA_I FMAs with f64, then two stores through C1. */ 6753 FMA f68 = ALPHA_R, f64, f68 6754 FMA f69 = ALPHA_I, f64, f69 6755 ;; 6756 STFD [C1 ] = f68, SIZE 6757 ;; 6758 STFD [C1 ] = f69, SIZE 6759 ;; 6760 .align 32 6761 /* .L169: B takes the value of BOFFSET and AOFFSET is reset to A. */ 6762.L169: 6763 { .mmi 6764 mov B = BOFFSET 6765 mov AOFFSET = A 6766 nop __LINE__ 6767 } 6768 ;; 6769 .align 16 6770 /* .L999: return path. Zeroes r8 (presumably the return value - confirm against the caller's convention) and refills f16..f31 with ldf.fill from two interleaved pointers, SP and r9 = SP + 16, each stepping by 32; the eight SP-based fills advance SP by 256 in total, matching the -16 * 16 adjustment in the prologue. Restores ar.lc, pr and ar.pfs from ARLC, PR and ARPFS, then returns through b0. */ 6771.L999: 6772 mov r8 = r0 6773 adds r9 = 1 * 16, SP 6774 ;; 6775 ldf.fill f16 = [SP], 32 6776 ldf.fill f17 = [r9], 32 6777 ;; 6778 ldf.fill f18 = [SP], 32 6779 ldf.fill f19 = [r9], 32 6780 ;; 6781 ldf.fill f20 = [SP], 32 6782 ldf.fill f21 = [r9], 32 6783 ;; 6784 ldf.fill f22 = [SP], 32 6785 ldf.fill f23 = [r9], 32 6786 mov ar.lc = ARLC 6787 ;; 6788 ldf.fill f24 = [SP], 32 6789 ldf.fill f25 = [r9], 32 6790 mov pr = PR, -1 6791 ;; 6792 ldf.fill f26 = [SP], 32 6793 ldf.fill f27 = [r9], 32 6794 mov ar.pfs = ARPFS 6795 ;; 6796 ldf.fill f28 = [SP], 32 6797 ldf.fill f29 = [r9], 32 6798 ;; 6799 ldf.fill f30 = [SP], 32 6800 ldf.fill f31 = [r9] 6801 br.ret.sptk.many b0 6802 EPILOGUE 6803 6804