/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * In-place plane rotation of two vectors (Alpha assembly, run through
 * the C preprocessor).
 *
 * For every element pair (x, y) that is visited:
 *
 *	x' = C * x + S * y
 *	y' = C * y - S * x
 *
 * Every counted step of N covers TWO adjacent SIZE-wide elements of X
 * and of Y (offsets 0 and 1), and the caller strides are doubled on
 * entry to match.
 * NOTE(review): this looks like interleaved (re, im) complex data
 * rotated by real c and s -- confirm against the build rules, the
 * file name is not visible here.
 *
 * Inputs as read by the code:
 *	$16      N     number of steps; N <= 0 returns immediately
 *	$17      X     first vector, updated in place
 *	$18      INCX  stride of X (doubled on entry)
 *	$19      Y     second vector, updated in place
 *	$20      INCY  stride of Y (doubled on entry)
 *	$f21     c     copied to $f10 because $f21 is reused as scratch
 *	0($sp)   s     loaded into $f11
 *
 * Output:
 *	$0 = 0
 *
 * Registers written: $0, $17-$21, $23, $24, $f10-$f19, $f21-$f28.
 * $f31 is named as the destination of the prefetch loads.  No stack
 * frame is allocated (.frame size 0).
 *
 * LD, ST, MUL, ADD, SUB, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE
 * are not defined in this file; they are expected from common.h.
 * Presumably they select the single- or double-precision instruction
 * forms and the matching element size -- confirm in common.h.
 */

#define ASSEMBLER
#include "common.h"
#include "version.h"

/* Integer registers: incoming arguments and loop bookkeeping. */
#define N	$16
#define X	$17
#define INCX	$18
#define Y	$19
#define INCY	$20
#define I	$21		/* loop counter */
#define XX	$23		/* store pointer for X on the strided path */
#define YY	$24		/* store pointer for Y on the strided path */

/* Rotation coefficients, kept live for the whole routine. */
#define C	$f10
#define S	$f11

/* Look-ahead distance, in elements, of the prefetch loads in $L12. */
#define PREFETCH_SIZE 80

	PROLOGUE
	PROFCODE
	.frame	$sp, 0, $26, 0

#ifndef PROFILE
	.prologue	0
#else
	.prologue	1
#endif

	/* Fetch c and s; $f21 becomes a scratch register afterwards. */
	fmov	$f21, C
	LD	S, 0($sp)

	/* Two elements are consumed per step, so double both strides. */
	addq	INCX, INCX, INCX
	addq	INCY, INCY, INCY

	/* $23 / $24 = 1 when the doubled stride is 2 (contiguous data). */
	cmpeq	INCX, 2, $23
	cmpeq	INCY, 2, $24
	ble	N, $L998

	/* Take the strided path unless BOTH vectors are contiguous. */
	and	$23, $24, $23
	beq	$23, $L50

	/* Contiguous path: I = N / 4 groups of 8 elements each. */
	sra	N, 2, I
	ble	I, $L15

	/*
	 * Pipeline fill.  Load elements 0-3, start the products for
	 * elements 0 and 1, and begin loading elements 4 and 5.
	 * On exit from this block:
	 *	$f22 = x'[0], $f24 = y'[0]
	 *	$f25-$f28 = the four products of element 1
	 *	$f16-$f19 = x[2], y[2], x[3], y[3]
	 *	$f12, $f13 = x[4], y[4];  $f15 = y[5]
	 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	LD	$f15, 1*SIZE(Y)

	LD	$f16, 2*SIZE(X)
	LD	$f17, 2*SIZE(Y)
	LD	$f18, 3*SIZE(X)
	LD	$f19, 3*SIZE(Y)

	MUL	C, $f12, $f21
	unop
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23

	LD	$f13, 4*SIZE(Y)
	MUL	S, $f12, $f24
	LD	$f12, 4*SIZE(X)
	MUL	C, $f14, $f25

	lda	I, -1(I)
	MUL	S, $f15, $f26
	ADD	$f21, $f22, $f22
	MUL	C, $f15, $f27

	LD	$f15, 5*SIZE(Y)
	MUL	S, $f14, $f28
	SUB	$f23, $f24, $f24
	ble	I, $L13			/* only one group: drain directly */
	.align 4

	/*
	 * Steady state.  Each pass stores the 8 results of the current
	 * group while loading elements 5-13 (relative to the group
	 * base), i.e. it refills the pipeline for the next group.  It is
	 * only entered when at least one more group follows, so those
	 * look-ahead loads stay inside the data that will be processed.
	 * The instruction order is hand-scheduled; the unop fillers are
	 * presumably there to shape the issue groups -- do not reorder.
	 */
$L12:
	MUL	C, $f16, $f21
	lds	$f31, (PREFETCH_SIZE) * SIZE(X)	/* result discarded: prefetch of X */
	unop
	LD	$f14, 5*SIZE(X)

	ST	$f22, 0*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	lds	$f31, (PREFETCH_SIZE) * SIZE(Y)	/* result discarded: prefetch of Y */
	unop
	LD	$f17, 6*SIZE(Y)

	ST	$f24, 0*SIZE(Y)
	MUL	S, $f16, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	LD	$f16, 6*SIZE(X)
	unop
	unop

	ST	$f26, 1*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	unop
	unop
	LD	$f19, 7*SIZE(Y)

	ST	$f28, 1*SIZE(Y)
	MUL	S, $f18, $f28
	unop
	SUB	$f23, $f24, $f24

	MUL	C, $f12, $f21
	LD	$f18, 7*SIZE(X)
	unop
	unop

	ST	$f22, 2*SIZE(X)
	unop
	MUL	S, $f13, $f22
	ADD	$f25, $f26, $f26

	MUL	C, $f13, $f23
	LD	$f13, 8*SIZE(Y)
	unop
	unop

	ST	$f24, 2*SIZE(Y)
	MUL	S, $f12, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f14, $f25
	LD	$f12, 8*SIZE(X)
	unop
	unop

	ST	$f26, 3*SIZE(X)
	MUL	S, $f15, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f15, $f27
	LD	$f15, 9*SIZE(Y)
	unop
	unop

	ST	$f28, 3*SIZE(Y)
	MUL	S, $f14, $f28
	unop
	SUB	$f23, $f24, $f24

	MUL	C, $f16, $f21
	LD	$f14, 9*SIZE(X)
	unop
	unop

	ST	$f22, 4*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	LD	$f17, 10*SIZE(Y)
	unop
	unop

	ST	$f24, 4*SIZE(Y)
	MUL	S, $f16, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	LD	$f16, 10*SIZE(X)
	unop
	unop

	ST	$f26, 5*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	LD	$f19, 11*SIZE(Y)
	unop
	unop

	ST	$f28, 5*SIZE(Y)
	MUL	S, $f18, $f28
	lda	I, -1(I)
	SUB	$f23, $f24, $f24

	MUL	C, $f12, $f21
	LD	$f18, 11*SIZE(X)
	unop
	unop

	ST	$f22, 6*SIZE(X)
	MUL	S, $f13, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f13, $f23
	LD	$f13, 12*SIZE(Y)
	lda	X, 8*SIZE(X)		/* X now points at the next group */
	unop

	ST	$f24, 6*SIZE(Y)
	MUL	S, $f12, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f14, $f25
	LD	$f12, 4*SIZE(X)		/* element 12 of the old base */
	lda	Y, 8*SIZE(Y)		/* Y now points at the next group */
	unop

	ST	$f26, -1*SIZE(X)	/* element 7 of the old base */
	MUL	S, $f15, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f15, $f27
	LD	$f15, 5*SIZE(Y)		/* element 13 of the old base */
	unop
	unop

	ST	$f28, -1*SIZE(Y)	/* element 7 of the old base */
	MUL	S, $f14, $f28
	SUB	$f23, $f24, $f24
	bgt	I, $L12
	.align 4

	/*
	 * Pipeline drain for the last group of 8: same arithmetic as
	 * $L12, but it loads only elements 5-7 of this group and does
	 * not read past it.
	 */
$L13:
	MUL	C, $f16, $f21
	LD	$f14, 5*SIZE(X)
	unop
	unop

	ST	$f22, 0*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	unop
	unop
	LD	$f17, 6*SIZE(Y)

	ST	$f24, 0*SIZE(Y)
	MUL	S, $f16, $f24
	LD	$f16, 6*SIZE(X)
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	unop
	unop
	unop

	ST	$f26, 1*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	unop
	unop
	LD	$f19, 7*SIZE(Y)

	ST	$f28, 1*SIZE(Y)
	MUL	S, $f18, $f28
	LD	$f18, 7*SIZE(X)
	SUB	$f23, $f24, $f24

	MUL	C, $f12, $f21
	unop
	unop
	unop

	ST	$f22, 2*SIZE(X)
	unop
	MUL	S, $f13, $f22
	ADD	$f25, $f26, $f26

	MUL	C, $f13, $f23
	unop
	unop
	unop

	ST	$f24, 2*SIZE(Y)
	MUL	S, $f12, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f14, $f25
	unop
	unop
	unop

	ST	$f26, 3*SIZE(X)
	MUL	S, $f15, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f15, $f27
	unop
	unop
	unop

	ST	$f28, 3*SIZE(Y)
	MUL	S, $f14, $f28
	unop
	SUB	$f23, $f24, $f24

	MUL	C, $f16, $f21
	unop
	unop
	unop

	ST	$f22, 4*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	unop
	unop
	unop

	ST	$f24, 4*SIZE(Y)
	MUL	S, $f16, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	unop
	unop
	unop

	ST	$f26, 5*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	unop
	unop
	unop

	ST	$f28, 5*SIZE(Y)
	MUL	S, $f18, $f28
	unop
	SUB	$f23, $f24, $f24

	ST	$f22, 6*SIZE(X)
	ADD	$f25, $f26, $f26
	ST	$f24, 6*SIZE(Y)
	SUB	$f27, $f28, $f28

	ST	$f26, 7*SIZE(X)
	lda	X, 8*SIZE(X)
	ST	$f28, 7*SIZE(Y)
	lda	Y, 8*SIZE(Y)
	.align 4


	/* Contiguous remainder: N mod 4 steps, two elements per step. */
$L15:
	and	N, 3, I
	ble	I, $L998
	.align 4

$L16:
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	LD	$f15, 1*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22	/* x'[0] = C*x[0] + S*y[0] */
	SUB	$f23, $f24, $f24	/* y'[0] = C*y[0] - S*x[0] */

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26	/* x'[1] = C*x[1] + S*y[1] */
	SUB	$f27, $f28, $f28	/* y'[1] = C*y[1] - S*x[1] */

	ST	$f22, 0*SIZE(X)
	ST	$f24, 0*SIZE(Y)
	lda	I, -1(I)

	ST	$f26, 1*SIZE(X)
	lda	X, 2 * SIZE(X)
	ST	$f28, 1*SIZE(Y)
	lda	Y, 2 * SIZE(Y)

	bgt	I, $L16
	.align 4

	/* Exit for N <= 0 and for the contiguous path: return 0. */
$L998:
	clr	$0
	ret
	.align 4

	/*
	 * Strided path (at least one doubled stride differs from 2).
	 * X / Y are the load pointers and are advanced right after each
	 * load; XX / YY trail behind as the store pointers.
	 * SXADDQ comes from common.h; presumably a scaled add,
	 * dst = INC * SIZE + base -- confirm in common.h.
	 */
$L50:
	mov	X, XX
	mov	Y, YY

	sra	N, 2, I
	ble	I, $L55
	.align 4

	/* Four steps per pass, written out as four identical copies. */
$L51:
	/* step 1 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY


	/* step 2 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY


	/* step 3 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY


	/* step 4 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	lda	I, -1(I)
	bgt	I, $L51
	.align 4

	/*
	 * Strided remainder: N mod 4 steps.  X / Y have been advanced by
	 * exactly as much as XX / YY at this point, so they serve as
	 * both the load and the store pointers here.
	 */
$L55:
	and	N, 3, I
	ble	I, $L999
	.align 4

$L56:
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	LD	$f15, 1*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(X)
	ST	$f24, 0*SIZE(Y)
	lda	I, -1(I)

	ST	$f26, 1*SIZE(X)
	ST	$f28, 1*SIZE(Y)
	SXADDQ	INCX, X, X
	SXADDQ	INCY, Y, Y

	bgt	I, $L56
	.align 4

	/* Exit for the strided path: return 0. */
$L999:
	clr	$0
	ret
	EPILOGUE