/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/* 1. Redistributions of source code must retain the above           */
/* copyright notice, this list of conditions and the following       */
/* disclaimer.                                                       */
/*                                                                   */
/* 2. Redistributions in binary form must reproduce the above        */
/* copyright notice, this list of conditions and the following       */
/* disclaimer in the documentation and/or other materials            */
/* provided with the distribution.                                   */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define PREFETCHSIZE 64 * 8

/* Incoming arguments. */
#define N	r32
#define X	r36
#define INCX	r37
#define Y	r38
#define INCY	r39

/* Prefetch pointers running ahead of X and Y. */
#define PRE1	r2
#define PRE2	r3

#define I	r14	/* number of full loop iterations (then iterations - 1 for ar.lc) */
#define J	r15	/* leftover element count after the full iterations */
#define Y1	r16	/* store pointer into Y */
#define Y2	r17	/* second store pointer, 4 elements ahead of Y1 (scalar unit-stride paths) */
#define X1	r18
#define X2	r19
#define INCX16	r20	/* 16 * INCX in bytes: prefetch stride for the strided path */
#define INCY16	r21	/* 16 * INCY in bytes */
#define YYY	r25	/* original Y: destination of the peeled first element */
#define YY	r27	/* store pointer for the last 1-3 elements in .L35 / .L45 */
#define XA	r28	/* offset of Y relative to X in elements, modulo 64 */
#define XB	r29
#define PR	r30	/* saved predicate registers */
#define ARLC	r31	/* saved ar.lc */

#define ALPHA	f8
#define ALPHA_P	f9	/* ALPHA duplicated in both halves, for fpma */

/*
 * AXPY kernel:  y[i] = ALPHA * x[i] + y[i]   for i in [0, N)
 *
 * Inputs (see the register map above): N, X, INCX, Y, INCY, ALPHA.
 * INCX / INCY arrive in elements and are scaled to bytes with BASE_SHIFT.
 * The routine returns immediately when N <= 0 or when ALPHA == 0.
 * pr and ar.lc are saved on entry and restored before the remainder code
 * of every path.
 *
 * NOTE(review): the packed paths move 2 * SIZE bytes per ldf8/stf8 and the
 * odd last element uses ldfs/stfs, so this presumably is the
 * single-precision kernel (SIZE == 4, BASE_SHIFT == 2).  SIZE, BASE_SHIFT,
 * LDFD, LDFPD, STFD and FMA all come from common.h -- confirm there.
 *
 * Dispatch:
 *   - If bit BASE_SHIFT of X is set, one element is peeled first (p10) so
 *     that X becomes 2 * SIZE aligned; its result is stored through YYY at
 *     the entry of whichever path is taken.
 *   - INCX != SIZE or INCY != SIZE             -> .L100 (strided, scalar)
 *   - bit BASE_SHIFT of Y set (Y not pairwise
 *     aligned once X is)                        -> .L30 / .L40 (scalar FMA,
 *                                                  16 elements per iteration)
 *   - otherwise                                 -> .L12 / .L20 (packed fpma,
 *                                                  32 elements per iteration)
 *   Within the last two cases XA picks between two loop variants that
 *   differ in prefetch offsets and in how loads are skewed across the
 *   software-pipeline stages.
 */

	PROLOGUE
	.prologue
	PROFCODE

	{ .mii
	shladd INCX = INCX, BASE_SHIFT, r0	/* INCX in bytes */
	.save ar.lc, ARLC
	mov ARLC = ar.lc
	tbit.nz p10, p0 = X, BASE_SHIFT		/* p10: X needs one element peeled */
	}
	{ .mfb
	cmp.lt p0, p6 = r0, N			/* p6: N <= 0 */
	fcmp.eq p7, p0 = ALPHA, f0		/* p7: ALPHA == 0 */
	(p6) br.ret.sptk.many b0
	}
	;;
	.body
	{ .mmi
	(p10) LDFD f32 = [X], INCX
	shladd INCY = INCY, BASE_SHIFT, r0	/* INCY in bytes */
	mov PR = pr
	}
	{ .mib
	(p10) adds N = -1, N
	mov YYY = Y
	(p7) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p10) LDFD f33 = [Y], INCY
	cmp.ne p13, p0 = SIZE, INCX		/* p13: X is not unit stride */
	shr XA = X, 2
	}
	{ .mmi
	shladd INCX16 = INCX, 4, r0
	shladd INCY16 = INCY, 4, r0
	nop.i 0
	}
	;;
	{ .mii
	mov Y1 = Y
	tbit.nz p11, p0 = Y, BASE_SHIFT		/* p11: Y not pairwise aligned */
	shr XB = Y, 2
	}
	;;
	{ .mmf
	and XA = 0x3f, XA
	and XB = 0x3f, XB
	(p10) FMA f32 = ALPHA, f32, f33		/* peeled element, stored later via YYY */
	}
	;;
	{ .mmi
	sub XA = XB, XA
	shladd Y2 = INCY, 2, Y			/* Y2 = Y + 4 * INCY */
	mov pr.rot = 0x10000			/* p16 = 1, all other rotating predicates = 0 */
	}
	{ .mbb
	cmp.ne p14, p0 = SIZE, INCY		/* p14: Y is not unit stride */
	(p13) br.cond.dpnt .L100
	(p14) br.cond.dpnt .L100
	}
	;;
	{ .mmi
	cmp.gt p14, p0 = r0, XA			/* p14: XA < 0 */
	;;
	and J = 15, N
	shr I = N, 4
	}
	{ .mfb
	(p14) adds XA = 64, XA			/* wrap XA into [0, 64) */
	fpack ALPHA_P = f8, f8
	(p11) br.cond.dpnt .L30
	}
	;;
	/* Packed path: 32 elements per iteration, 3 pipeline stages. */
	{ .mmi
	cmp.gt p14, p0 = 32, XA			/* p14: XA < 32 */
	cmp.lt p15, p0 = 58, XA			/* p15: XA > 58 */
	mov ar.ec = 3
	}
	{ .mmi
	and J = 31, N
	cmp.eq p16, p0 = r0, r0
	shr I = N, 5
	}
	;;
	{ .mmi
	cmp.eq p9, p0 = r0, J			/* p9: no remainder */
	cmp.eq p7 ,p0 = 0, I			/* p7: no full iteration (reads I before the decrement) */
	adds I = -1, I
	}
	{ .mbb
	nop.m 0
	(p14) br.cond.dpnt .L20
	(p15) br.cond.dpnt .L20
	}
	;;
	{ .mmi
	(p10) STFD [YYY] = f32
	adds PRE1 = PREFETCHSIZE * SIZE, X
	mov ar.lc = I
	}
	{ .mib
	adds PRE2 = (PREFETCHSIZE - 24) * SIZE, Y
	tbit.z p0, p11 = N, 4			/* p11 is reused: bit 4 of N (16-element remainder) */
	(p7) br.cond.dpnt .L15
	}
	;;
	.align 32

/*
 * Packed main loop, variant 1.  Stage p16 issues 16 ldf8 from X (f32..f77)
 * and 16 from Y (f80..f125); fpma runs at p17/p18; stf8 at p18 writes the
 * results through Y1.
 */
.L12:
/* 0 */
	{ .mmf
	(p18) stf8 [Y1] = f6, 2 * SIZE
	(p16) lfetch.nt1 [PRE1], 32 * SIZE
	(p18) fpma f12 = ALPHA_P, f46, f94
	}
	{ .mmi
	(p16) ldf8 f32 = [X], 2 * SIZE
	(p16) ldf8 f80 = [Y], 2 * SIZE
	}
	;;
/* 1 */
	{ .mmf
	(p18) stf8 [Y1] = f7, 2 * SIZE
	(p16) lfetch.excl.nt1 [PRE2], 32 * SIZE
	(p18) fpma f13 = ALPHA_P, f49, f97
	}
	{ .mmi
	(p16) ldf8 f35 = [X], 2 * SIZE
	(p16) ldf8 f83 = [Y], 2 * SIZE
	}
	;;
/* 2 */
	{ .mmf
	(p18) stf8 [Y1] = f10, 2 * SIZE
	(p18) fpma f14 = ALPHA_P, f52, f100
	}
	{ .mmi
	(p16) ldf8 f38 = [X], 2 * SIZE
	(p16) ldf8 f86 = [Y], 2 * SIZE
	}
	;;
/* 3 */
	{ .mmf
	(p18) stf8 [Y1] = f11, 2 * SIZE
	(p18) fpma f15 = ALPHA_P, f55, f103
	}
	{ .mmi
	(p16) ldf8 f41 = [X], 2 * SIZE
	(p16) ldf8 f89 = [Y], 2 * SIZE
	}
	;;
/* 4 */
	{ .mmf
	(p18) stf8 [Y1] = f12, 2 * SIZE
	(p18) fpma f6 = ALPHA_P, f58, f106
	}
	{ .mmi
	(p16) ldf8 f44 = [X], 2 * SIZE
	(p16) ldf8 f92 = [Y], 2 * SIZE
	}
	;;
/* 5 */
	{ .mmf
	(p18) stf8 [Y1] = f13, 2 * SIZE
	(p18) fpma f7 = ALPHA_P, f61, f109
	}
	{ .mmi
	(p16) ldf8 f47 = [X], 2 * SIZE
	(p16) ldf8 f95 = [Y], 2 * SIZE
	}
	;;
/* 6 */
	{ .mmf
	(p18) stf8 [Y1] = f14, 2 * SIZE
	(p18) fpma f10 = ALPHA_P, f64, f112
	}
	{ .mmi
	(p16) ldf8 f50 = [X], 2 * SIZE
	(p16) ldf8 f98 = [Y], 2 * SIZE
	}
	;;
/* 7 */
	{ .mmf
	(p18) stf8 [Y1] = f15, 2 * SIZE
	(p18) fpma f11 = ALPHA_P, f67, f115
	}
	{ .mmi
	(p16) ldf8 f53 = [X], 2 * SIZE
	(p16) ldf8 f101 = [Y], 2 * SIZE
	}
	;;
/* 8 */
	{ .mmf
	(p18) stf8 [Y1] = f6, 2 * SIZE
	(p18) fpma f12 = ALPHA_P, f70, f118
	}
	{ .mmi
	(p16) ldf8 f56 = [X], 2 * SIZE
	(p16) ldf8 f104 = [Y], 2 * SIZE
	}
	;;
/* 9 */
	{ .mmf
	(p18) stf8 [Y1] = f7, 2 * SIZE
	(p18) fpma f13 = ALPHA_P, f73, f121
	}
	{ .mmi
	(p16) ldf8 f59 = [X], 2 * SIZE
	(p16) ldf8 f107 = [Y], 2 * SIZE
	}
	;;
/* 10 */
	{ .mmf
	(p18) stf8 [Y1] = f10, 2 * SIZE
	(p18) fpma f14 = ALPHA_P, f76, f124
	}
	{ .mmi
	(p16) ldf8 f62 = [X], 2 * SIZE
	(p16) ldf8 f110 = [Y], 2 * SIZE
	}
	;;
/* 11 */
	{ .mmf
	(p18) stf8 [Y1] = f11, 2 * SIZE
	(p18) fpma f15 = ALPHA_P, f79, f127
	}
	{ .mmi
	(p16) ldf8 f65 = [X], 2 * SIZE
	(p16) ldf8 f113 = [Y], 2 * SIZE
	}
	;;
/* 12 */
	{ .mmf
	(p18) stf8 [Y1] = f12, 2 * SIZE
	(p17) fpma f6 = ALPHA_P, f33, f81
	}
	{ .mmi
	(p16) ldf8 f68 = [X], 2 * SIZE
	(p16) ldf8 f116 = [Y], 2 * SIZE
	}
	;;
/* 13 */
	{ .mmf
	(p18) stf8 [Y1] = f13, 2 * SIZE
	(p17) fpma f7 = ALPHA_P, f36, f84
	}
	{ .mmi
	(p16) ldf8 f71 = [X], 2 * SIZE
	(p16) ldf8 f119 = [Y], 2 * SIZE
	}
	;;
/* 14 */
	{ .mmf
	(p18) stf8 [Y1] = f14, 2 * SIZE
	(p17) fpma f10 = ALPHA_P, f39, f87
	}
	{ .mmi
	(p16) ldf8 f74 = [X], 2 * SIZE
	(p16) ldf8 f122 = [Y], 2 * SIZE
	}
	;;
/*15 */
	{ .mmf
	(p18) stf8 [Y1] = f15, 2 * SIZE
	(p17) fpma f11 = ALPHA_P, f42, f90
	}
	{ .mmb
	(p16) ldf8 f77 = [X], 2 * SIZE
	(p16) ldf8 f125 = [Y], 2 * SIZE
	br.ctop.sptk.few .L12
	}
	;;
	.align 32

/*
 * Packed remainder (N & 31).  p11..p15 select 16, 8, 4, 2 and 1 elements
 * from bits 4..0 of N.  The pr mask -65474 restores p1-p5 and p16-p63 only,
 * so p9 and p11-p15 stay valid.
 */
.L15:
	{ .mmi
	(p11) ldf8 f32 = [X], 2 * SIZE
	(p11) ldf8 f33 = [Y], 2 * SIZE
	mov pr = PR, -65474
	}
	;;
	{ .mmi
	(p11) ldf8 f34 = [X], 2 * SIZE
	(p11) ldf8 f35 = [Y], 2 * SIZE
	mov ar.lc = ARLC
	}
	;;
	{ .mmb
	(p11) ldf8 f36 = [X], 2 * SIZE
	(p11) ldf8 f37 = [Y], 2 * SIZE
	(p9) br.ret.sptk.many b0		/* nothing left */
	}
	;;
	{ .mmi
	(p11) ldf8 f38 = [X], 2 * SIZE
	(p11) ldf8 f39 = [Y], 2 * SIZE
	tbit.z p0, p12 = N, 3
	}
	;;
	{ .mmi
	(p11) ldf8 f40 = [X], 2 * SIZE
	(p11) ldf8 f41 = [Y], 2 * SIZE
	tbit.z p0, p13 = N, 2
	}
	;;
	{ .mmi
	(p11) ldf8 f42 = [X], 2 * SIZE
	(p11) ldf8 f43 = [Y], 2 * SIZE
	tbit.z p0, p14 = N, 1
	}
	;;
	{ .mmf
	(p11) ldf8 f44 = [X], 2 * SIZE
	(p11) ldf8 f45 = [Y], 2 * SIZE
	(p11) fpma f6 = ALPHA_P, f32, f33
	}
	;;
	{ .mmf
	(p11) ldf8 f46 = [X], 2 * SIZE
	(p11) ldf8 f47 = [Y], 2 * SIZE
	(p11) fpma f7 = ALPHA_P, f34, f35
	}
	;;
	{ .mmf
	(p12) ldf8 f48 = [X], 2 * SIZE
	(p12) ldf8 f49 = [Y], 2 * SIZE
	(p11) fpma f10 = ALPHA_P, f36, f37
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f6, 2 * SIZE
	nop.m 0
	tbit.z p0, p15 = N, 0
	}
	{ .mmf
	(p12) ldf8 f50 = [X], 2 * SIZE
	(p12) ldf8 f51 = [Y], 2 * SIZE
	(p11) fpma f11 = ALPHA_P, f38, f39
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f7, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p12) ldf8 f52 = [X], 2 * SIZE
	(p12) ldf8 f53 = [Y], 2 * SIZE
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f10, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p12) ldf8 f54 = [X], 2 * SIZE
	(p12) ldf8 f55 = [Y], 2 * SIZE
	(p11) fpma f12 = ALPHA_P, f40, f41
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f11, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p13) ldf8 f56 = [X], 2 * SIZE
	(p13) ldf8 f57 = [Y], 2 * SIZE
	(p11) fpma f13 = ALPHA_P, f42, f43
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f12, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p13) ldf8 f58 = [X], 2 * SIZE
	(p13) ldf8 f59 = [Y], 2 * SIZE
	(p11) fpma f14 = ALPHA_P, f44, f45
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f13, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p14) ldf8 f60 = [X], 2 * SIZE
	(p14) ldf8 f61 = [Y], 2 * SIZE
	(p11) fpma f15 = ALPHA_P, f46, f47
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f14, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p15) ldfs f62 = [X]			/* odd last element is handled as a scalar */
	(p15) ldfs f63 = [Y]
	(p12) fpma f6 = ALPHA_P, f48, f49
	}
	;;
	(p12) fpma f7 = ALPHA_P, f50, f51
	(p12) fpma f10 = ALPHA_P, f52, f53
	;;
	(p11) stf8 [Y1] = f15, 2 * SIZE
	(p12) fpma f11 = ALPHA_P, f54, f55
	;;
	(p12) stf8 [Y1] = f6, 2 * SIZE
	(p13) fpma f12 = ALPHA_P, f56, f57
	;;
	(p12) stf8 [Y1] = f7, 2 * SIZE
	(p13) fpma f13 = ALPHA_P, f58, f59
	;;
	(p12) stf8 [Y1] = f10, 2 * SIZE
	(p14) fpma f14 = ALPHA_P, f60, f61
	;;
	(p12) stf8 [Y1] = f11, 2 * SIZE
	(p15) FMA f15 = ALPHA, f62, f63
	;;
	(p13) stf8 [Y1] = f12, 2 * SIZE
	;;
	(p13) stf8 [Y1] = f13, 2 * SIZE
	;;
	(p14) stf8 [Y1] = f14, 2 * SIZE
	;;
	(p15) stfs [Y1] = f15
	br.ret.sptk.many b0
	;;
	.align 32

/* X is aligned; case 2 */
/* Taken from the packed path when XA < 32 or XA > 58: different prefetch
   offsets, and the first seven X loads of each iteration run one stage
   later (p17) than in .L12. */

.L20:
	{ .mmi
	(p10) STFD [YYY] = f32
	adds PRE1 = (PREFETCHSIZE - 28) * SIZE, X
	mov ar.lc = I
	}
	{ .mib
	adds PRE2 = (PREFETCHSIZE + 4) * SIZE, Y
	tbit.z p0, p11 = N, 4
	(p7) br.cond.dpnt .L25
	}
	;;
	.align 32

.L22:
/* 0 */
	{ .mmf
	(p18) stf8 [Y1] = f6, 2 * SIZE
	(p16) lfetch.nt1 [PRE1], 32 * SIZE
	(p18) fpma f12 = ALPHA_P, f46, f94
	}
	{ .mmi
	(p17) ldf8 f60 = [X], 2 * SIZE
	(p16) ldf8 f80 = [Y], 2 * SIZE
	}
	;;
/* 1 */
	{ .mmf
	(p18) stf8 [Y1] = f7, 2 * SIZE
	(p16) lfetch.excl.nt1 [PRE2], 32 * SIZE
	(p18) fpma f13 = ALPHA_P, f49, f97
	}
	{ .mmi
	(p17) ldf8 f63 = [X], 2 * SIZE
	(p16) ldf8 f83 = [Y], 2 * SIZE
	}
	;;
/* 2 */
	{ .mmf
	(p18) stf8 [Y1] = f10, 2 * SIZE
	(p18) fpma f14 = ALPHA_P, f52, f100
	}
	{ .mmi
	(p17) ldf8 f66 = [X], 2 * SIZE
	(p16) ldf8 f86 = [Y], 2 * SIZE
	}
	;;
/* 3 */
	{ .mmf
	(p18) stf8 [Y1] = f11, 2 * SIZE
	(p18) fpma f15 = ALPHA_P, f55, f103
	}
	{ .mmi
	(p17) ldf8 f69 = [X], 2 * SIZE
	(p16) ldf8 f89 = [Y], 2 * SIZE
	}
	;;
/* 4 */
	{ .mmf
	(p18) stf8 [Y1] = f12, 2 * SIZE
	(p18) fpma f6 = ALPHA_P, f58, f106
	}
	{ .mmi
	(p17) ldf8 f72 = [X], 2 * SIZE
	(p16) ldf8 f92 = [Y], 2 * SIZE
	}
	;;
/* 5 */
	{ .mmf
	(p18) stf8 [Y1] = f13, 2 * SIZE
	(p18) fpma f7 = ALPHA_P, f61, f109
	}
	{ .mmi
	(p17) ldf8 f75 = [X], 2 * SIZE
	(p16) ldf8 f95 = [Y], 2 * SIZE
	}
	;;
/* 6 */
	{ .mmf
	(p18) stf8 [Y1] = f14, 2 * SIZE
	(p18) fpma f10 = ALPHA_P, f64, f112
	}
	{ .mmi
	(p17) ldf8 f78 = [X], 2 * SIZE
	(p16) ldf8 f98 = [Y], 2 * SIZE
	}
	;;
/* 7 */
	{ .mmf
	(p18) stf8 [Y1] = f15, 2 * SIZE
	(p18) fpma f11 = ALPHA_P, f67, f115
	}
	{ .mmi
	(p16) ldf8 f32 = [X], 2 * SIZE
	(p16) ldf8 f101 = [Y], 2 * SIZE
	}
	;;
/* 8 */
	{ .mmf
	(p18) stf8 [Y1] = f6, 2 * SIZE
	(p18) fpma f12 = ALPHA_P, f70, f118
	}
	{ .mmi
	(p16) ldf8 f35 = [X], 2 * SIZE
	(p16) ldf8 f104 = [Y], 2 * SIZE
	}
	;;
/* 9 */
	{ .mmf
	(p18) stf8 [Y1] = f7, 2 * SIZE
	(p18) fpma f13 = ALPHA_P, f73, f121
	}
	{ .mmi
	(p16) ldf8 f38 = [X], 2 * SIZE
	(p16) ldf8 f107 = [Y], 2 * SIZE
	}
	;;
/* 10 */
	{ .mmf
	(p18) stf8 [Y1] = f10, 2 * SIZE
	(p18) fpma f14 = ALPHA_P, f76, f124
	}
	{ .mmi
	(p16) ldf8 f41 = [X], 2 * SIZE
	(p16) ldf8 f110 = [Y], 2 * SIZE
	}
	;;
/* 11 */
	{ .mmf
	(p18) stf8 [Y1] = f11, 2 * SIZE
	(p18) fpma f15 = ALPHA_P, f79, f127
	}
	{ .mmi
	(p16) ldf8 f44 = [X], 2 * SIZE
	(p16) ldf8 f113 = [Y], 2 * SIZE
	}
	;;
/* 12 */
	{ .mmf
	(p18) stf8 [Y1] = f12, 2 * SIZE
	(p17) fpma f6 = ALPHA_P, f33, f81
	}
	{ .mmi
	(p16) ldf8 f47 = [X], 2 * SIZE
	(p16) ldf8 f116 = [Y], 2 * SIZE
	}
	;;
/* 13 */
	{ .mmf
	(p18) stf8 [Y1] = f13, 2 * SIZE
	(p17) fpma f7 = ALPHA_P, f36, f84
	}
	{ .mmi
	(p16) ldf8 f50 = [X], 2 * SIZE
	(p16) ldf8 f119 = [Y], 2 * SIZE
	}
	;;
/* 14 */
	{ .mmf
	(p18) stf8 [Y1] = f14, 2 * SIZE
	(p17) fpma f10 = ALPHA_P, f39, f87
	}
	{ .mmi
	(p16) ldf8 f53 = [X], 2 * SIZE
	(p16) ldf8 f122 = [Y], 2 * SIZE
	}
	;;
/*15 */
	{ .mmf
	(p18) stf8 [Y1] = f15, 2 * SIZE
	(p17) fpma f11 = ALPHA_P, f42, f90
	}
	{ .mmb
	(p16) ldf8 f56 = [X], 2 * SIZE
	(p16) ldf8 f125 = [Y], 2 * SIZE
	br.ctop.sptk.few .L22
	}
	;;
	.align 32

/* Packed remainder for the .L20 variant; same instruction sequence as .L15. */
.L25:
	{ .mmi
	(p11) ldf8 f32 = [X], 2 * SIZE
	(p11) ldf8 f33 = [Y], 2 * SIZE
	mov pr = PR, -65474
	}
	;;
	{ .mmi
	(p11) ldf8 f34 = [X], 2 * SIZE
	(p11) ldf8 f35 = [Y], 2 * SIZE
	mov ar.lc = ARLC
	}
	;;
	{ .mmb
	(p11) ldf8 f36 = [X], 2 * SIZE
	(p11) ldf8 f37 = [Y], 2 * SIZE
	(p9) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p11) ldf8 f38 = [X], 2 * SIZE
	(p11) ldf8 f39 = [Y], 2 * SIZE
	tbit.z p0, p12 = N, 3
	}
	;;
	{ .mmi
	(p11) ldf8 f40 = [X], 2 * SIZE
	(p11) ldf8 f41 = [Y], 2 * SIZE
	tbit.z p0, p13 = N, 2
	}
	;;
	{ .mmi
	(p11) ldf8 f42 = [X], 2 * SIZE
	(p11) ldf8 f43 = [Y], 2 * SIZE
	tbit.z p0, p14 = N, 1
	}
	;;
	{ .mmf
	(p11) ldf8 f44 = [X], 2 * SIZE
	(p11) ldf8 f45 = [Y], 2 * SIZE
	(p11) fpma f6 = ALPHA_P, f32, f33
	}
	;;
	{ .mmf
	(p11) ldf8 f46 = [X], 2 * SIZE
	(p11) ldf8 f47 = [Y], 2 * SIZE
	(p11) fpma f7 = ALPHA_P, f34, f35
	}
	;;
	{ .mmf
	(p12) ldf8 f48 = [X], 2 * SIZE
	(p12) ldf8 f49 = [Y], 2 * SIZE
	(p11) fpma f10 = ALPHA_P, f36, f37
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f6, 2 * SIZE
	nop.m 0
	tbit.z p0, p15 = N, 0
	}
	{ .mmf
	(p12) ldf8 f50 = [X], 2 * SIZE
	(p12) ldf8 f51 = [Y], 2 * SIZE
	(p11) fpma f11 = ALPHA_P, f38, f39
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f7, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p12) ldf8 f52 = [X], 2 * SIZE
	(p12) ldf8 f53 = [Y], 2 * SIZE
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f10, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p12) ldf8 f54 = [X], 2 * SIZE
	(p12) ldf8 f55 = [Y], 2 * SIZE
	(p11) fpma f12 = ALPHA_P, f40, f41
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f11, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p13) ldf8 f56 = [X], 2 * SIZE
	(p13) ldf8 f57 = [Y], 2 * SIZE
	(p11) fpma f13 = ALPHA_P, f42, f43
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f12, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p13) ldf8 f58 = [X], 2 * SIZE
	(p13) ldf8 f59 = [Y], 2 * SIZE
	(p11) fpma f14 = ALPHA_P, f44, f45
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f13, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p14) ldf8 f60 = [X], 2 * SIZE
	(p14) ldf8 f61 = [Y], 2 * SIZE
	(p11) fpma f15 = ALPHA_P, f46, f47
	}
	;;
	{ .mmi
	(p11) stf8 [Y1] = f14, 2 * SIZE
	nop.m 0
	nop.i 0
	}
	{ .mmf
	(p15) ldfs f62 = [X]
	(p15) ldfs f63 = [Y]
	(p12) fpma f6 = ALPHA_P, f48, f49
	}
	;;
	(p12) fpma f7 = ALPHA_P, f50, f51
	(p12) fpma f10 = ALPHA_P, f52, f53
	;;
	(p11) stf8 [Y1] = f15, 2 * SIZE
	(p12) fpma f11 = ALPHA_P, f54, f55
	;;
	(p12) stf8 [Y1] = f6, 2 * SIZE
	(p13) fpma f12 = ALPHA_P, f56, f57
	;;
	(p12) stf8 [Y1] = f7, 2 * SIZE
	(p13) fpma f13 = ALPHA_P, f58, f59
	;;
	(p12) stf8 [Y1] = f10, 2 * SIZE
	(p14) fpma f14 = ALPHA_P, f60, f61
	;;
	(p12) stf8 [Y1] = f11, 2 * SIZE
	(p15) FMA f15 = ALPHA, f62, f63
	;;
	(p13) stf8 [Y1] = f12, 2 * SIZE
	;;
	(p13) stf8 [Y1] = f13, 2 * SIZE
	;;
	(p14) stf8 [Y1] = f14, 2 * SIZE
	;;
	(p15) stfs [Y1] = f15
	br.ret.sptk.many b0
	;;
	.align 32

/*
 * Unit stride, Y not pairwise aligned (p11 from the entry code).
 * Scalar FMA, 16 elements per iteration (I = N >> 4, J = N & 15),
 * 4 pipeline stages.  Branches to the .L40 variant when
 * 33 < XA < 53, XA < 15 or XA > 60.
 */
.L30:
	{ .mmi
	cmp.eq p9, p0 = r0, J
	cmp.eq p7 ,p0 = 0, I
	mov ar.ec = 4
	}
	{ .mmi
	cmp.lt p12, p0 = 33, XA
	adds I = -1, I
	}
	;;
	{ .mmi
	cmp.gt p14, p0 = 15, XA
	cmp.lt p15, p0 = 60, XA
	(p12) cmp.gt.unc p13, p0 = 53, XA
	}
	{ .bbb
	(p13) br.cond.dpnt .L40
	(p14) br.cond.dpnt .L40
	(p15) br.cond.dpnt .L40
	}
	;;
	{ .mmi
	(p10) STFD [YYY] = f32
	adds PRE1 = (PREFETCHSIZE + 6) * SIZE, X
	mov ar.lc = I
	}
	{ .mib
	adds PRE2 = (PREFETCHSIZE + 0) * SIZE, Y
	tbit.z p0, p12 = N, 3			/* p12 is reused: bit 3 of N (8-element remainder) */
	(p7) br.cond.dpnt .L35
	}
	;;
	.align 32

/*
 * X is read in pairs (LDFPD); Y is read as one single, seven pairs and a
 * final single.  Results 0-3 / 8-11 are stored through Y1 and 4-7 / 12-15
 * through Y2; the "5 * SIZE" post-increments skip the four elements written
 * by the other pointer.
 */
.L32:
	{ .mmf
	(p19) STFD [Y1] = f6, 1 * SIZE
	(p19) STFD [Y2] = f7, 1 * SIZE
	(p18) FMA f6 = ALPHA, f34, f82
	}
	{ .mmf
	(p16) LDFPD f32, f35 = [X], 2 * SIZE
	(p16) LDFD f80 = [Y], 1 * SIZE
	(p18) FMA f7 = ALPHA, f46, f94
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f10, 1 * SIZE
	(p19) STFD [Y2] = f11, 1 * SIZE
	(p18) FMA f10 = ALPHA, f37, f85
	}
	{ .mmf
	(p16) LDFPD f38, f41 = [X], 2 * SIZE
	(p16) LDFPD f83, f86 = [Y], 2 * SIZE
	(p18) FMA f11 = ALPHA, f49, f97
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f12, 1 * SIZE
	(p19) STFD [Y2] = f13, 1 * SIZE
	(p18) FMA f12 = ALPHA, f40, f88
	}
	{ .mmf
	(p16) LDFPD f44, f47 = [X], 2 * SIZE
	(p16) LDFPD f89, f92 = [Y], 2 * SIZE
	(p18) FMA f13 = ALPHA, f52, f100
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f14, 5 * SIZE
	(p19) STFD [Y2] = f15, 5 * SIZE
	(p18) FMA f14 = ALPHA, f43, f91
	}
	{ .mmf
	(p16) LDFPD f50, f53 = [X], 2 * SIZE
	(p16) LDFPD f95, f98 = [Y], 2 * SIZE
	(p18) FMA f15 = ALPHA, f55, f103
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f6, 1 * SIZE
	(p18) STFD [Y2] = f7, 1 * SIZE
	(p18) FMA f6 = ALPHA, f58, f106
	}
	{ .mmf
	(p16) LDFPD f56, f59 = [X], 2 * SIZE
	(p16) LDFPD f101, f104 = [Y], 2 * SIZE
	(p18) FMA f7 = ALPHA, f70, f118
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f10, 1 * SIZE
	(p18) STFD [Y2] = f11, 1 * SIZE
	(p18) FMA f10 = ALPHA, f61, f109
	}
	{ .mmf
	(p16) LDFPD f62, f65 = [X], 2 * SIZE
	(p16) LDFPD f107, f110 = [Y], 2 * SIZE
	(p18) FMA f11 = ALPHA, f73, f121
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f12, 1 * SIZE
	(p18) STFD [Y2] = f13, 1 * SIZE
	(p18) FMA f12 = ALPHA, f64, f112
	}
	{ .mmf
	(p16) LDFPD f68, f71 = [X], 2 * SIZE
	(p16) LDFPD f113, f116 = [Y], 2 * SIZE
	(p18) FMA f13 = ALPHA, f76, f124
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f14, 5 * SIZE
	(p18) STFD [Y2] = f15, 5 * SIZE
	(p18) FMA f14 = ALPHA, f67, f115
	}
	{ .mmf
	(p16) LDFPD f74, f77 = [X], 2 * SIZE
	(p16) LDFPD f119, f122 = [Y], 2 * SIZE
	(p18) FMA f15 = ALPHA, f79, f127
	}
	;;
	{ .mmi
	(p16) lfetch.nt1 [PRE1], 16 * SIZE
	(p16) lfetch.excl.nt1 [PRE2], 16 * SIZE
	nop.i 0
	}
	{ .mmb
	(p16) LDFD f125 = [Y], 1 * SIZE
	nop.m 0
	br.ctop.sptk.few .L32
	}
	;;
	.align 32

/*
 * Scalar remainder (N & 15): p12..p15 select 8, 4, 2 and 1 elements.
 * The 8- and 4-element groups store through Y1 / Y2; the last 1-3 elements
 * store through YY = Y1 (+ 8 * INCY if p12) (+ 4 * INCY if p13).
 */
.L35:
	{ .mmi
	(p12) LDFPD f32, f33 = [X], 2 * SIZE
	(p12) LDFD f34 = [Y], 1 * SIZE;
	mov pr = PR, -65474
	}
	;;
	{ .mmi
	(p12) LDFPD f36, f37 = [X], 2 * SIZE
	(p12) LDFPD f35, f38 = [Y], 2 * SIZE
	mov ar.lc = ARLC
	}
	;;
	{ .mmb
	(p12) LDFPD f40, f41 = [X], 2 * SIZE
	(p12) LDFPD f39, f42 = [Y], 2 * SIZE
	(p9) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p12) LDFPD f44, f45 = [X], 2 * SIZE
	(p12) LDFPD f43, f46 = [Y], 2 * SIZE
	tbit.z p0, p13 = N, 2
	}
	;;
	{ .mmi
	(p13) LDFPD f48, f49 = [X], 2 * SIZE
	(p12) LDFD f47 = [Y], 1 * SIZE
	tbit.z p0, p14 = N, 1
	}
	;;
	{ .mmi
	(p13) LDFPD f52, f53 = [X], 2 * SIZE
	(p13) LDFD f50 = [Y], 1 * SIZE
	tbit.z p0, p15 = N, 0
	}
	;;
	{ .mmi
	(p14) LDFPD f56, f57 = [X], 2 * SIZE
	(p13) LDFPD f51, f54 = [Y], 2 * SIZE
	mov YY = Y1;
	}
	;;
	(p15) LDFD f60 = [X]
	(p13) LDFD f55 = [Y], 1 * SIZE
	;;
	(p14) LDFD f58 = [Y], 1 * SIZE
	(p12) FMA f6 = ALPHA, f32, f34
	(p12) FMA f7 = ALPHA, f40, f42
	;;
	(p14) LDFD f59 = [Y], 1 * SIZE
	(p12) shladd YY = INCY, 3, YY
	(p12) FMA f10 = ALPHA, f33, f35
	(p12) FMA f11 = ALPHA, f41, f43
	;;
	(p15) LDFD f61 = [Y]
	(p13) shladd YY = INCY, 2, YY
	(p12) FMA f12 = ALPHA, f36, f38
	(p12) FMA f13 = ALPHA, f44, f46
	;;
	(p12) STFD [Y1] = f6, 1 * SIZE
	(p12) FMA f14 = ALPHA, f37, f39
	(p12) STFD [Y2] = f7, 1 * SIZE
	(p12) FMA f15 = ALPHA, f45, f47
	;;
	(p12) STFD [Y1] = f10, 1 * SIZE
	(p13) FMA f6 = ALPHA, f48, f50
	(p12) STFD [Y2] = f11, 1 * SIZE
	(p14) FMA f7 = ALPHA, f56, f58
	;;
	(p12) STFD [Y1] = f12, 1 * SIZE
	(p13) FMA f10 = ALPHA, f49, f51
	(p12) STFD [Y2] = f13, 1 * SIZE
	(p14) FMA f11 = ALPHA, f57, f59
	;;
	(p12) STFD [Y1] = f14, 5 * SIZE
	(p13) FMA f12 = ALPHA, f52, f54
	(p12) STFD [Y2] = f15, 5 * SIZE
	(p15) FMA f13 = ALPHA, f60, f61
	;;
	(p13) STFD [Y1] = f6, 1 * SIZE
	(p14) STFD [YY] = f7, 1 * SIZE
	(p13) FMA f14 = ALPHA, f53, f55
	;;
	(p13) STFD [Y1] = f10, 1 * SIZE
	(p14) STFD [YY] = f11, 1 * SIZE
	;;
	(p13) STFD [Y1] = f12, 1 * SIZE
	(p15) STFD [YY] = f13
	;;
	(p13) STFD [Y1] = f14
	br.ret.sptk.many b0
	;;
	.align 32

/*
 * Second variant of the scalar unit-stride path: different prefetch
 * offsets, all X loads and the second half of the Y loads issued at stage
 * p17 instead of p16.  The PRE2 prefetch is compiled out (#if 0) here.
 */
.L40:
	{ .mmi
	(p10) STFD [YYY] = f32
	adds PRE1 = (PREFETCHSIZE + 38) * SIZE, X
	mov ar.lc = I
	}
	{ .mib
	adds PRE2 = (PREFETCHSIZE + 14) * SIZE, Y
	tbit.z p0, p12 = N, 3
	(p7) br.cond.dpnt .L45
	}
	;;
	.align 32

.L42:
	{ .mmf
	(p19) STFD [Y1] = f6, 1 * SIZE
	(p19) STFD [Y2] = f7, 1 * SIZE
	(p18) FMA f6 = ALPHA, f34, f82
	}
	{ .mmf
	(p16) lfetch.nt1 [PRE1], 16 * SIZE
	(p17) LDFPD f102, f105 = [Y], 2 * SIZE
	(p18) FMA f7 = ALPHA, f46, f94
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f10, 1 * SIZE
	(p19) STFD [Y2] = f11, 1 * SIZE
	(p18) FMA f10 = ALPHA, f37, f85
	}
	{ .mmf
	(p17) LDFPD f33, f36 = [X], 2 * SIZE
	(p17) LDFPD f108, f111 = [Y], 2 * SIZE
	(p18) FMA f11 = ALPHA, f49, f97
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f12, 1 * SIZE
	(p19) STFD [Y2] = f13, 1 * SIZE
	(p18) FMA f12 = ALPHA, f40, f88
	}
	{ .mmf
	(p17) LDFPD f39, f42 = [X], 2 * SIZE
	(p17) LDFPD f114, f117 = [Y], 2 * SIZE
	(p18) FMA f13 = ALPHA, f52, f100
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f14, 5 * SIZE
	(p19) STFD [Y2] = f15, 5 * SIZE
	(p18) FMA f14 = ALPHA, f43, f91
	}
	{ .mmf
	(p17) LDFPD f45, f48 = [X], 2 * SIZE
	(p17) LDFPD f120, f123 = [Y], 2 * SIZE
	(p18) FMA f15 = ALPHA, f55, f103
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f6, 1 * SIZE
	(p18) STFD [Y2] = f7, 1 * SIZE
	(p18) FMA f6 = ALPHA, f58, f106
	}
	{ .mmf
	(p17) LDFPD f51, f54 = [X], 2 * SIZE
	(p17) LDFD f126 = [Y], 1 * SIZE
	(p18) FMA f7 = ALPHA, f70, f118
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f10, 1 * SIZE
	(p18) STFD [Y2] = f11, 1 * SIZE
	(p18) FMA f10 = ALPHA, f61, f109
	}
	{ .mmf
	(p17) LDFPD f57, f60 = [X], 2 * SIZE
	(p16) LDFD f80 = [Y], 1 * SIZE
	(p18) FMA f11 = ALPHA, f73, f121
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f12, 1 * SIZE
	(p18) STFD [Y2] = f13, 1 * SIZE
	(p18) FMA f12 = ALPHA, f64, f112
	}
	{ .mmf
	(p17) LDFPD f63, f66 = [X], 2 * SIZE
	(p16) LDFPD f83, f86 = [Y], 2 * SIZE
	(p18) FMA f13 = ALPHA, f76, f124
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f14, 5 * SIZE
	(p18) STFD [Y2] = f15, 5 * SIZE
	(p18) FMA f14 = ALPHA, f67, f115
	}
	{ .mmf
	(p17) LDFPD f69, f72 = [X], 2 * SIZE
	(p16) LDFPD f89, f92 = [Y], 2 * SIZE
	(p18) FMA f15 = ALPHA, f79, f127
	}
	;;
#if 0
	(p16) lfetch.excl.nt1 [PRE2], 16 * SIZE
#endif
	{ .mmb
	(p17) LDFPD f75, f78 = [X], 2 * SIZE
	(p16) LDFPD f95, f98 = [Y], 2 * SIZE
	br.ctop.sptk.few .L42
	}
	;;
	/* NOTE(review): these p19 stores run after the final br.ctop, which
	   has already drained the pipeline (ar.ec = 4) and rotated the
	   predicates once more -- they look like no-ops; .L32 has no
	   equivalent.  Confirm before touching. */
	{ .mmf
	(p19) STFD [Y1] = f6, 1 * SIZE
	(p19) STFD [Y2] = f7, 1 * SIZE
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f10, 1 * SIZE
	(p19) STFD [Y2] = f11, 1 * SIZE
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f12, 1 * SIZE
	(p19) STFD [Y2] = f13, 1 * SIZE
	}
	;;
	{ .mmf
	(p19) STFD [Y1] = f14, 5 * SIZE
	(p19) STFD [Y2] = f15, 5 * SIZE
	}
	;;
	.align 32

/* Scalar remainder for the .L40 variant; same instruction sequence as .L35. */
.L45:
	{ .mmi
	(p12) LDFPD f32, f33 = [X], 2 * SIZE
	(p12) LDFD f34 = [Y], 1 * SIZE;
	mov pr = PR, -65474
	}
	;;
	{ .mmi
	(p12) LDFPD f36, f37 = [X], 2 * SIZE
	(p12) LDFPD f35, f38 = [Y], 2 * SIZE
	mov ar.lc = ARLC
	}
	;;
	{ .mmb
	(p12) LDFPD f40, f41 = [X], 2 * SIZE
	(p12) LDFPD f39, f42 = [Y], 2 * SIZE
	(p9) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p12) LDFPD f44, f45 = [X], 2 * SIZE
	(p12) LDFPD f43, f46 = [Y], 2 * SIZE
	tbit.z p0, p13 = N, 2
	}
	;;
	{ .mmi
	(p13) LDFPD f48, f49 = [X], 2 * SIZE
	(p12) LDFD f47 = [Y], 1 * SIZE
	tbit.z p0, p14 = N, 1
	}
	;;
	{ .mmi
	(p13) LDFPD f52, f53 = [X], 2 * SIZE
	(p13) LDFD f50 = [Y], 1 * SIZE
	tbit.z p0, p15 = N, 0
	}
	;;
	{ .mmi
	(p14) LDFPD f56, f57 = [X], 2 * SIZE
	(p13) LDFPD f51, f54 = [Y], 2 * SIZE
	mov YY = Y1;
	}
	;;
	(p15) LDFD f60 = [X]
	(p13) LDFD f55 = [Y], 1 * SIZE
	;;
	(p14) LDFD f58 = [Y], 1 * SIZE
	(p12) FMA f6 = ALPHA, f32, f34
	(p12) FMA f7 = ALPHA, f40, f42
	;;
	(p14) LDFD f59 = [Y], 1 * SIZE
	(p12) shladd YY = INCY, 3, YY
	(p12) FMA f10 = ALPHA, f33, f35
	(p12) FMA f11 = ALPHA, f41, f43
	;;
	(p15) LDFD f61 = [Y]
	(p13) shladd YY = INCY, 2, YY
	(p12) FMA f12 = ALPHA, f36, f38
	(p12) FMA f13 = ALPHA, f44, f46
	;;
	(p12) STFD [Y1] = f6, 1 * SIZE
	(p12) FMA f14 = ALPHA, f37, f39
	(p12) STFD [Y2] = f7, 1 * SIZE
	(p12) FMA f15 = ALPHA, f45, f47
	;;
	(p12) STFD [Y1] = f10, 1 * SIZE
	(p13) FMA f6 = ALPHA, f48, f50
	(p12) STFD [Y2] = f11, 1 * SIZE
	(p14) FMA f7 = ALPHA, f56, f58
	;;
	(p12) STFD [Y1] = f12, 1 * SIZE
	(p13) FMA f10 = ALPHA, f49, f51
	(p12) STFD [Y2] = f13, 1 * SIZE
	(p14) FMA f11 = ALPHA, f57, f59
	;;
	(p12) STFD [Y1] = f14, 5 * SIZE
	(p13) FMA f12 = ALPHA, f52, f54
	(p12) STFD [Y2] = f15, 5 * SIZE
	(p15) FMA f13 = ALPHA, f60, f61
	;;
	(p13) STFD [Y1] = f6, 1 * SIZE
	(p14) STFD [YY] = f7, 1 * SIZE
	(p13) FMA f14 = ALPHA, f53, f55
	;;
	(p13) STFD [Y1] = f10, 1 * SIZE
	(p14) STFD [YY] = f11, 1 * SIZE
	;;
	(p13) STFD [Y1] = f12, 1 * SIZE
	(p15) STFD [YY] = f13
	;;
	(p13) STFD [Y1] = f14
	br.ret.sptk.many b0
	;;
	.align 32

/*
 * General-stride path (INCX != SIZE or INCY != SIZE): scalar loads and
 * stores, pointers advanced by INCX / INCY, 16 elements per iteration,
 * 3 pipeline stages.
 */
.L100:
	{ .mii
	and J = 15, N
	shr I = N, 4
	mov ar.ec = 3
	}
	;;
	{ .mmi
	cmp.eq p9, p0 = r0, J
	cmp.eq p7 ,p0 = 0, I
	adds I = -1, I
	}
	;;
	{ .mmi
	(p10) STFD [YYY] = f32
	adds PRE1 = PREFETCHSIZE * SIZE, X
	mov ar.lc = I
	}
	{ .mib
	adds PRE2 = PREFETCHSIZE * SIZE, Y
	tbit.z p0, p12 = N, 3
	(p7) br.cond.dpnt .L115
	}
	;;
	.align 32

.L112:
	{ .mmi
	(p18) STFD [Y1] = f6
	(p16) lfetch.nt1 [PRE1], INCX16
	(p18) add Y1 = INCY, Y1
	}
	{.mmf
	(p16) LDFD f32 = [X], INCX
	(p16) LDFD f80 = [Y], INCY
	(p18) FMA f6 = ALPHA, f58, f106
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f7
	(p16) lfetch.excl.nt1 [PRE2], INCY16
	(p18) add Y1 = INCY, Y1
	}
	{ .mmf
	(p16) LDFD f35 = [X], INCX
	(p16) LDFD f83 = [Y], INCY
	(p18) FMA f7 = ALPHA, f61, f109
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f10
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f38 = [X], INCX
	(p16) LDFD f86 = [Y], INCY
	(p18) FMA f10 = ALPHA, f64, f112
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f11
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f41 = [X], INCX
	(p16) LDFD f89 = [Y], INCY
	(p18) FMA f11 = ALPHA, f67, f115
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f12
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f44 = [X], INCX
	(p16) LDFD f92 = [Y], INCY
	(p18) FMA f12 = ALPHA, f70, f118
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f13
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f47 = [X], INCX
	(p16) LDFD f95 = [Y], INCY
	(p18) FMA f13 = ALPHA, f73, f121
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f14
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f50 = [X], INCX
	(p16) LDFD f98 = [Y], INCY
	(p18) FMA f14 = ALPHA, f76, f124
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f15
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f53 = [X], INCX
	(p16) LDFD f101 = [Y], INCY
	(p18) FMA f15 = ALPHA, f79, f127
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f6
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f56 = [X], INCX
	(p16) LDFD f104 = [Y], INCY
	(p17) FMA f6 = ALPHA, f33, f81
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f7
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f59 = [X], INCX
	(p16) LDFD f107 = [Y], INCY
	(p17) FMA f7 = ALPHA, f36, f84
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f10
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f62 = [X], INCX
	(p16) LDFD f110 = [Y], INCY
	(p17) FMA f10 = ALPHA, f39, f87
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f11
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f65 = [X], INCX
	(p16) LDFD f113 = [Y], INCY
	(p17) FMA f11 = ALPHA, f42, f90
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f12
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f68 = [X], INCX
	(p16) LDFD f116 = [Y], INCY
	(p17) FMA f12 = ALPHA, f45, f93
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f13
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f71 = [X], INCX
	(p16) LDFD f119 = [Y], INCY
	(p17) FMA f13 = ALPHA, f48, f96
	}
	;;
	{ .mmi
	(p18) STFD [Y1] = f14
	(p18) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p16) LDFD f74 = [X], INCX
	(p16) LDFD f122 = [Y], INCY
	(p17) FMA f14 = ALPHA, f51, f99
	}
	;;
	{ .mmf
	(p18) STFD [Y1] = f15
	(p18) add Y1 = INCY, Y1
	(p17) FMA f15 = ALPHA, f54, f102
	}
	{ .mmb
	(p16) LDFD f77 = [X], INCX
	(p16) LDFD f125 = [Y], INCY
	br.ctop.sptk.few .L112
	}
	;;
	.align 32

/* Strided remainder (N & 15): p12..p15 select 8, 4, 2 and 1 elements. */
.L115:
	(p12) LDFD f32 = [X], INCX
	(p12) LDFD f34 = [Y], INCY
	mov pr = PR, -65474
	;;
	(p12) LDFD f33 = [X], INCX
	(p12) LDFD f35 = [Y], INCY
	mov ar.lc = ARLC
	;;
	(p12) LDFD f36 = [X], INCX
	(p12) LDFD f38 = [Y], INCY
	(p9) br.ret.sptk.many b0
	;;
	(p12) LDFD f37 = [X], INCX
	(p12) LDFD f39 = [Y], INCY
	tbit.z p0, p13 = N, 2
	;;
	(p12) LDFD f40 = [X], INCX
	(p12) LDFD f42 = [Y], INCY
	tbit.z p0, p14 = N, 1
	;;
	(p12) LDFD f41 = [X], INCX
	(p12) LDFD f43 = [Y], INCY
	tbit.z p0, p15 = N, 0
	;;
	{ .mmf
	(p12) LDFD f44 = [X], INCX
	(p12) LDFD f46 = [Y], INCY
	(p12) FMA f6 = ALPHA, f32, f34
	}
	;;
	{ .mmf
	(p12) LDFD f45 = [X], INCX
	(p12) LDFD f47 = [Y], INCY
	(p12) FMA f7 = ALPHA, f33, f35
	}
	;;
	{ .mmf
	(p13) LDFD f48 = [X], INCX
	(p13) LDFD f50 = [Y], INCY
	(p12) FMA f10 = ALPHA, f36, f38
	}
	;;
	{ .mmf
	(p13) LDFD f49 = [X], INCX
	(p13) LDFD f51 = [Y], INCY
	(p12) FMA f11 = ALPHA, f37, f39
	}
	;;
	{ .mmi
	(p12) STFD [Y1] = f6
	(p12) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p13) LDFD f52 = [X], INCX
	(p13) LDFD f54 = [Y], INCY
	(p12) FMA f12 = ALPHA, f40, f42
	}
	;;
	{ .mmi
	(p12) STFD [Y1] = f7
	(p12) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p13) LDFD f53 = [X], INCX
	(p13) LDFD f55 = [Y], INCY
	(p12) FMA f13 = ALPHA, f41, f43
	}
	;;
	{ .mmi
	(p12) STFD [Y1] = f10
	(p12) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p14) LDFD f56 = [X], INCX
	(p14) LDFD f58 = [Y], INCY
	(p12) FMA f14 = ALPHA, f44, f46
	}
	;;
	{ .mmi
	(p12) STFD [Y1] = f11
	(p12) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p14) LDFD f57 = [X], INCX
	(p14) LDFD f59 = [Y], INCY
	(p12) FMA f15 = ALPHA, f45, f47
	}
	;;
	{ .mmi
	(p12) STFD [Y1] = f12
	(p12) add Y1 = INCY, Y1
	nop.i 0
	}
	{ .mmf
	(p15) LDFD f60 = [X]
	(p15) LDFD f61 = [Y]
	(p13) FMA f6 = ALPHA, f48, f50
	}
	;;
	{ .mmf
	(p12) STFD [Y1] = f13
	(p12) add Y1 = INCY, Y1
	(p13) FMA f7 = ALPHA, f49, f51
	}
	;;
	{ .mmf
	(p12) STFD [Y1] = f14
	(p12) add Y1 = INCY, Y1
	(p13) FMA f10 = ALPHA, f52, f54
	}
	;;
	{ .mmf
	(p12) STFD [Y1] = f15
	(p12) add Y1 = INCY, Y1
	(p13) FMA f11 = ALPHA, f53, f55
	}
	;;
	{ .mmf
	(p13) STFD [Y1] = f6
	(p13) add Y1 = INCY, Y1
	(p14) FMA f12 = ALPHA, f56, f58
	}
	;;
	{ .mmf
	(p13) STFD [Y1] = f7
	(p13) add Y1 = INCY, Y1
	(p14) FMA f13 = ALPHA, f57, f59
	}
	;;
	{ .mmf
	(p13) STFD [Y1] = f10
	(p13) add Y1 = INCY, Y1
	(p15) FMA f14 = ALPHA, f60, f61
	}
	;;
	(p13) STFD [Y1] = f11
	(p13) add Y1 = INCY, Y1
	;;
	(p14) STFD [Y1] = f12
	(p14) add Y1 = INCY, Y1
	;;
	(p14) STFD [Y1] = f13
	(p14) add Y1 = INCY, Y1
	;;
	(p15) STFD [Y1] = f14
	br.ret.sptk.many b0
	;;
	EPILOGUE
