/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * AXPY-style kernel:  Y[i] = ALPHA * X[i] + Y[i]   for i = 0 .. N-1
 *
 * Inputs (fixed by the register #defines below):
 *   N     r3   element count; the routine returns at once when N <= 0
 *   X     r6   source vector pointer
 *   INCX  r7   X stride in elements (scaled by BASE_SHIFT on entry)
 *   Y     r8   destination vector pointer, updated in place
 *   INCY  r9   Y stride in elements (scaled by BASE_SHIFT on entry)
 *   ALPHA f1   scalar multiplier
 * Scratch: r0, r4 (YY), r5 (INCX2), r10 (INCY2), CTR, cr0, f0-f13.
 * Saved on the stack and restored before return: f14-f25.
 *
 * NOTE(review): the C-level prototype is not visible in this file;
 * confirm the argument-to-register mapping against the caller.
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), SIZE, BASE_SHIFT and the
 * upper-case load/store mnemonics (LFD, LFDUX, LFPDUX, LFXDUX, STFD,
 * STFDUX, STFPDUX) are macros supplied by common.h.
 * NOTE(review): the comments on the paired instructions (fpmadd, fsmfp,
 * fsmr, fpmr, lfpdux, lfxdux, stfpdux) describe the reviewer's reading
 * of the data flow; confirm details against the FPU ISA manual.
 *
 * Code paths:
 *   LL(05)..LL(18)    both byte strides equal SIZE, X is 2*SIZE aligned
 *                     once Y has been aligned: paired loads, 16
 *                     elements per main-loop iteration.
 *   LL(20)..LL(28)    both byte strides equal SIZE, X is not 2*SIZE
 *                     aligned: X pairs are rebuilt in registers.
 *   LL(100)..LL(118)  any other stride: scalar code, 8 elements per
 *                     main-loop iteration.
 */

#define ASSEMBLER
#include "common.h"

/* Incoming arguments. */
#define N	r3
#define X	r6
#define INCX	r7
#define Y	r8
#define INCY	r9

/* YY is the store pointer; it trails Y, which runs ahead as the load
   pointer.  INCX2/INCY2 hold twice the byte stride (one pair). */
#define YY	r4
#define INCX2	r5
#define INCY2	r10

#define ALPHA	f1

/* A*: values loaded from X.  A9 is only used by the unaligned-X path. */
#define A1	f0
#define A2	f8
#define A3	f2
#define A4	f3
#define A5	f4
#define A6	f5
#define A7	f6
#define A8	f7
#define A9	f25

/* B*: values loaded from Y. */
#define B1	f9
#define B2	f10
#define B3	f11
#define B4	f12
#define B5	f13
#define B6	f14
#define B7	f15
#define B8	f16

/* C*: results (ALPHA * A + B) waiting to be stored through YY. */
#define C1	f17
#define C2	f18
#define C3	f19
#define C4	f20
#define C5	f21
#define C6	f22
#define C7	f23
#define C8	f24


	PROLOGUE
	PROFCODE

	/* Save f14-f25, moving SP down by 16 bytes per register.  r10 is
	   free to serve as the step here because INCY2 (also r10) is not
	   computed until after the saves. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10
	stfpdux	f16, SP, r10
	stfpdux	f17, SP, r10

	stfpdux	f18, SP, r10
	stfpdux	f19, SP, r10
	stfpdux	f20, SP, r10
	stfpdux	f21, SP, r10

	stfpdux	f22, SP, r10
	stfpdux	f23, SP, r10
	stfpdux	f24, SP, r10
	stfpdux	f25, SP, r10

	/* Replicate ALPHA into both halves of its register pair so that
	   fpmadd scales two elements at a time. */
	fsmfp	ALPHA, ALPHA

	/* Strides: elements -> bytes (presumably BASE_SHIFT = log2(SIZE);
	   TODO confirm in common.h). */
	slwi	INCX,  INCX, BASE_SHIFT
	slwi	INCY,  INCY, BASE_SHIFT

	add	INCX2, INCX, INCX
	add	INCY2, INCY, INCY

	/* Nothing to do for N <= 0. */
	cmpwi	cr0, N, 0
	ble	LL(999)

	/* The paired paths need both byte strides to equal SIZE. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)
	cmpwi	cr0, INCY, SIZE
	bne	LL(100)

	/* If Y is not 2*SIZE aligned, peel one scalar element so that it
	   is. */
	andi.	r0, Y, 2 * SIZE - 1
	beq	LL(05)

	LFD	A1, 0 * SIZE(X)
	LFD	B1, 0 * SIZE(Y)

	addi	X, X, SIZE
	addi	Y, Y, SIZE

	fmadd	C1, ALPHA, A1, B1
	addi	N, N, -1
	STFD	C1, -1 * SIZE(Y)

LL(05):
	/* Y is 2*SIZE aligned here; choose the path by the alignment of X. */
	andi.	r0, X, 2 * SIZE - 1
	bne	LL(20)

	/* Pre-bias the pointers: every access below is an update-form
	   load/store that first adds the pair stride. */
	sub	X, X, INCX2
	sub	Y, Y, INCY2
	mr	YY, Y

	/* CTR = N / 16 main-loop iterations. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(15)

	/* Preload the first 8 pairs of X and Y. */
	LFPDUX	A1, X, INCX2
	LFPDUX	B1, Y, INCY2
	LFPDUX	A2, X, INCX2
	LFPDUX	B2, Y, INCY2
	LFPDUX	A3, X, INCX2
	LFPDUX	B3, Y, INCY2
	LFPDUX	A4, X, INCX2
	LFPDUX	B4, Y, INCY2

	LFPDUX	A5, X, INCX2
	LFPDUX	B5, Y, INCY2
	LFPDUX	A6, X, INCX2
	LFPDUX	B6, Y, INCY2
	LFPDUX	A7, X, INCX2
	LFPDUX	B7, Y, INCY2
	LFPDUX	A8, X, INCX2
	LFPDUX	B8, Y, INCY2
	bdz	LL(13)
	.align 4

LL(12):
	/* Steady state: compute on the batch already in registers while
	   loading the next batch, then store the results through YY. */
	fpmadd	C1, ALPHA, A1, B1
	LFPDUX	A1, X, INCX2
	LFPDUX	B1, Y, INCY2
	fpmadd	C2, ALPHA, A2, B2
	LFPDUX	A2, X, INCX2
	LFPDUX	B2, Y, INCY2

	fpmadd	C3, ALPHA, A3, B3
	LFPDUX	A3, X, INCX2
	LFPDUX	B3, Y, INCY2
	fpmadd	C4, ALPHA, A4, B4
	LFPDUX	A4, X, INCX2
	LFPDUX	B4, Y, INCY2

	fpmadd	C5, ALPHA, A5, B5
	LFPDUX	A5, X, INCX2
	LFPDUX	B5, Y, INCY2
	fpmadd	C6, ALPHA, A6, B6
	LFPDUX	A6, X, INCX2
	LFPDUX	B6, Y, INCY2

	fpmadd	C7, ALPHA, A7, B7
	LFPDUX	A7, X, INCX2
	LFPDUX	B7, Y, INCY2
	fpmadd	C8, ALPHA, A8, B8
	LFPDUX	A8, X, INCX2
	LFPDUX	B8, Y, INCY2

	STFPDUX	C1, YY, INCY2
	STFPDUX	C2, YY, INCY2
	STFPDUX	C3, YY, INCY2
	STFPDUX	C4, YY, INCY2

	STFPDUX	C5, YY, INCY2
	STFPDUX	C6, YY, INCY2
	STFPDUX	C7, YY, INCY2
	STFPDUX	C8, YY, INCY2
	bdnz	LL(12)
	.align 4

LL(13):
	/* Drain: finish the last preloaded batch without loading more. */
	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmadd	C3, ALPHA, A3, B3
	fpmadd	C4, ALPHA, A4, B4

	fpmadd	C5, ALPHA, A5, B5
	fpmadd	C6, ALPHA, A6, B6
	STFPDUX	C1, YY, INCY2
	fpmadd	C7, ALPHA, A7, B7
	STFPDUX	C2, YY, INCY2
	fpmadd	C8, ALPHA, A8, B8
	STFPDUX	C3, YY, INCY2
	STFPDUX	C4, YY, INCY2

	STFPDUX	C5, YY, INCY2
	STFPDUX	C6, YY, INCY2
	STFPDUX	C7, YY, INCY2
	STFPDUX	C8, YY, INCY2
	.align 4

LL(15):
	/* Remainder N & 15, handled as 8 + 4 + 2 + 1 elements. */
	andi.	r0,  N, 15
	beq	LL(999)

	andi.	r0,  N, 8
	beq	LL(16)

	LFPDUX	A1, X, INCX2
	LFPDUX	B1, Y, INCY2
	LFPDUX	A2, X, INCX2
	LFPDUX	B2, Y, INCY2
	LFPDUX	A3, X, INCX2
	LFPDUX	B3, Y, INCY2
	LFPDUX	A4, X, INCX2
	LFPDUX	B4, Y, INCY2

	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmadd	C3, ALPHA, A3, B3
	fpmadd	C4, ALPHA, A4, B4

	STFPDUX	C1, YY, INCY2
	STFPDUX	C2, YY, INCY2
	STFPDUX	C3, YY, INCY2
	STFPDUX	C4, YY, INCY2
	.align 4

LL(16):
	andi.	r0,  N, 4
	beq	LL(17)

	LFPDUX	A1, X, INCX2
	LFPDUX	B1, Y, INCY2
	LFPDUX	A2, X, INCX2
	LFPDUX	B2, Y, INCY2

	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2

	STFPDUX	C1, YY, INCY2
	STFPDUX	C2, YY, INCY2
	.align 4

LL(17):
	andi.	r0,  N, 2
	beq	LL(18)

	LFPDUX	A1, X, INCX2
	LFPDUX	B1, Y, INCY2

	fpmadd	C1, ALPHA, A1, B1

	STFPDUX	C1, YY, INCY2
	.align 4

LL(18):
	/* Final odd element: scalar load, fmadd and store.  The pointers
	   still sit one pair stride behind the next element. */
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	A1, X, INCX2
	LFDUX	B1, Y, INCY2

	fmadd	C1, ALPHA, A1, B1
	STFDUX	C1, YY, INCY2
	b	LL(999)
	.align 4

/* X is unaligned */

LL(20):
	/* Y is 2*SIZE aligned but X is not.  Load the first X element by
	   itself, which leaves X aligned, then rebuild each X pair from
	   two neighbouring cross loads (LFXDUX) using fsmr / fpmr.
	   NOTE(review): relies on LFXDUX delivering the pair with its
	   halves swapped - confirm against the FPU ISA manual. */
	LFD	A1, 0 * SIZE(X)
	addi	X, X, SIZE
	sub	X, X, INCX2
	sub	Y, Y, INCY2
	mr	YY, Y

	/* CTR = N / 16 main-loop iterations. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(25)

	/* Preload 8 X pairs (A2..A9) and 8 Y pairs; start merging halves
	   so that A1..A4 line up with B1..B4. */
	LFXDUX	A2, X, INCX2
	LFPDUX	B1, Y, INCY2
	LFXDUX	A3, X, INCX2
	LFPDUX	B2, Y, INCY2
	LFXDUX	A4, X, INCX2
	LFPDUX	B3, Y, INCY2
	LFXDUX	A5, X, INCX2
	LFPDUX	B4, Y, INCY2

	LFXDUX	A6, X, INCX2
	LFPDUX	B5, Y, INCY2
	LFXDUX	A7, X, INCX2
	LFPDUX	B6, Y, INCY2
	fsmr	A1, A2
	LFXDUX	A8, X, INCX2
	fsmr	A2, A3
	LFPDUX	B7, Y, INCY2
	fsmr	A3, A4
	LFXDUX	A9, X, INCX2
	fsmr	A4, A5
	LFPDUX	B8, Y, INCY2
	bdz	LL(23)
	.align 4

LL(22):
	/* Steady state: compute, merge the remaining halves (A5..A8),
	   reload, and carry the leftover X element from A9 into A1 for
	   the next iteration. */
	fpmadd	C1, ALPHA, A1, B1
	fsmr	A5, A6
	LFPDUX	B1, Y, INCY2
	fpmadd	C2, ALPHA, A2, B2
	LFXDUX	A2, X, INCX2
	fsmr	A6, A7
	LFPDUX	B2, Y, INCY2
	fpmadd	C3, ALPHA, A3, B3
	LFXDUX	A3, X, INCX2
	fsmr	A7, A8
	LFPDUX	B3, Y, INCY2
	fpmadd	C4, ALPHA, A4, B4
	LFXDUX	A4, X, INCX2
	fsmr	A8, A9
	LFPDUX	B4, Y, INCY2

	fpmadd	C5, ALPHA, A5, B5
	LFXDUX	A5, X, INCX2
	LFPDUX	B5, Y, INCY2
	fpmadd	C6, ALPHA, A6, B6
	LFXDUX	A6, X, INCX2
	LFPDUX	B6, Y, INCY2

	fpmadd	C7, ALPHA, A7, B7
	LFXDUX	A7, X, INCX2
	LFPDUX	B7, Y, INCY2
	fpmadd	C8, ALPHA, A8, B8
	LFXDUX	A8, X, INCX2
	LFPDUX	B8, Y, INCY2

	fpmr	A1, A9
	LFXDUX	A9, X, INCX2

	STFPDUX	C1, YY, INCY2
	STFPDUX	C2, YY, INCY2
	STFPDUX	C3, YY, INCY2
	STFPDUX	C4, YY, INCY2
	fsmr	A1, A2

	STFPDUX	C5, YY, INCY2
	fsmr	A2, A3
	STFPDUX	C6, YY, INCY2
	fsmr	A3, A4
	STFPDUX	C7, YY, INCY2
	fsmr	A4, A5
	STFPDUX	C8, YY, INCY2
	bdnz	LL(22)
	.align 4

LL(23):
	/* Drain the last preloaded batch.  A1 is refreshed from A9 so the
	   remainder code below starts with the pending X element. */
	fpmadd	C1, ALPHA, A1, B1
	fsmr	A5, A6
	fpmadd	C2, ALPHA, A2, B2
	fsmr	A6, A7
	fpmadd	C3, ALPHA, A3, B3
	fsmr	A7, A8
	fpmadd	C4, ALPHA, A4, B4
	fsmr	A8, A9

	fpmadd	C5, ALPHA, A5, B5
	fpmadd	C6, ALPHA, A6, B6
	fpmadd	C7, ALPHA, A7, B7
	fpmadd	C8, ALPHA, A8, B8
	fpmr	A1, A9

	STFPDUX	C1, YY, INCY2
	STFPDUX	C2, YY, INCY2
	STFPDUX	C3, YY, INCY2
	STFPDUX	C4, YY, INCY2

	STFPDUX	C5, YY, INCY2
	STFPDUX	C6, YY, INCY2
	STFPDUX	C7, YY, INCY2
	STFPDUX	C8, YY, INCY2
	.align 4

LL(25):
	/* Remainder N & 15 as 8 + 4 + 2 + 1; A1 always carries the X
	   element that is still pending. */
	andi.	r0,  N, 15
	beq	LL(999)

	andi.	r0,  N, 8
	beq	LL(26)

	LFXDUX	A2, X, INCX2
	LFPDUX	B1, Y, INCY2
	LFXDUX	A3, X, INCX2
	LFPDUX	B2, Y, INCY2
	LFXDUX	A4, X, INCX2
	LFPDUX	B3, Y, INCY2
	LFXDUX	A5, X, INCX2
	LFPDUX	B4, Y, INCY2

	fsmr	A1, A2
	fsmr	A2, A3
	fsmr	A3, A4
	fsmr	A4, A5

	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmadd	C3, ALPHA, A3, B3
	fpmadd	C4, ALPHA, A4, B4
	fpmr	A1, A5

	STFPDUX	C1, YY, INCY2
	STFPDUX	C2, YY, INCY2
	STFPDUX	C3, YY, INCY2
	STFPDUX	C4, YY, INCY2
	.align 4

LL(26):
	andi.	r0,  N, 4
	beq	LL(27)

	LFXDUX	A2, X, INCX2
	LFPDUX	B1, Y, INCY2
	LFXDUX	A3, X, INCX2
	LFPDUX	B2, Y, INCY2

	fsmr	A1, A2
	fsmr	A2, A3
	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmr	A1, A3

	STFPDUX	C1, YY, INCY2
	STFPDUX	C2, YY, INCY2
	.align 4

LL(27):
	andi.	r0,  N, 2
	beq	LL(28)

	LFXDUX	A2, X, INCX2
	LFPDUX	B1, Y, INCY2

	fsmr	A1, A2
	fpmadd	C1, ALPHA, A1, B1
	fpmr	A1, A2

	STFPDUX	C1, YY, INCY2
	.align 4

LL(28):
	/* Final odd element: its X value is already in A1, so only Y is
	   loaded. */
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	B1, Y, INCY2

	fmadd	C1, ALPHA, A1, B1
	STFDUX	C1, YY, INCY2
	b	LL(999)
	.align 4

/* General-stride path: reached when either byte stride differs from
   SIZE.  Scalar loads and stores, 8 elements per main-loop iteration. */

LL(100):
	/* Pre-bias the pointers for the update-form accesses below. */
	sub	X, X, INCX
	sub	Y, Y, INCY
	mr	YY, Y

	/* CTR = N / 8 main-loop iterations. */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(115)

	/* Preload the first 8 elements of X and Y. */
	LFDUX	A1, X, INCX
	LFDUX	B1, Y, INCY
	LFDUX	A2, X, INCX
	LFDUX	B2, Y, INCY

	LFDUX	A3, X, INCX
	LFDUX	B3, Y, INCY
	LFDUX	A4, X, INCX
	LFDUX	B4, Y, INCY

	LFDUX	A5, X, INCX
	LFDUX	B5, Y, INCY
	LFDUX	A6, X, INCX
	LFDUX	B6, Y, INCY

	LFDUX	A7, X, INCX
	LFDUX	B7, Y, INCY
	LFDUX	A8, X, INCX
	LFDUX	B8, Y, INCY
	bdz	LL(113)
	.align 4

LL(112):
	/* Steady state: compute on the loaded batch while loading the
	   next one, then store the results through YY. */
	fmadd	C1, ALPHA, A1, B1
	LFDUX	A1, X, INCX
	LFDUX	B1, Y, INCY

	fmadd	C2, ALPHA, A2, B2
	LFDUX	A2, X, INCX
	LFDUX	B2, Y, INCY

	fmadd	C3, ALPHA, A3, B3
	LFDUX	A3, X, INCX
	LFDUX	B3, Y, INCY

	fmadd	C4, ALPHA, A4, B4
	LFDUX	A4, X, INCX
	LFDUX	B4, Y, INCY

	fmadd	C5, ALPHA, A5, B5
	LFDUX	A5, X, INCX
	LFDUX	B5, Y, INCY
	fmadd	C6, ALPHA, A6, B6
	LFDUX	A6, X, INCX
	LFDUX	B6, Y, INCY
	fmadd	C7, ALPHA, A7, B7
	LFDUX	A7, X, INCX
	LFDUX	B7, Y, INCY
	fmadd	C8, ALPHA, A8, B8
	LFDUX	A8, X, INCX
	LFDUX	B8, Y, INCY

	STFDUX	C1, YY, INCY
	STFDUX	C2, YY, INCY
	STFDUX	C3, YY, INCY
	STFDUX	C4, YY, INCY

	STFDUX	C5, YY, INCY
	STFDUX	C6, YY, INCY
	STFDUX	C7, YY, INCY
	STFDUX	C8, YY, INCY
	bdnz	LL(112)
	.align 4

LL(113):
	/* Drain: finish the last preloaded batch without loading more. */
	fmadd	C1, ALPHA, A1, B1
	fmadd	C2, ALPHA, A2, B2
	fmadd	C3, ALPHA, A3, B3
	fmadd	C4, ALPHA, A4, B4

	fmadd	C5, ALPHA, A5, B5
	fmadd	C6, ALPHA, A6, B6
	STFDUX	C1, YY, INCY
	fmadd	C7, ALPHA, A7, B7
	STFDUX	C2, YY, INCY
	fmadd	C8, ALPHA, A8, B8
	STFDUX	C3, YY, INCY

	STFDUX	C4, YY, INCY
	STFDUX	C5, YY, INCY
	STFDUX	C6, YY, INCY
	STFDUX	C7, YY, INCY
	STFDUX	C8, YY, INCY
	.align 4

LL(115):
	/* Remainder N & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  N, 7
	beq	LL(999)
	andi.	r0,  N, 4
	beq	LL(117)

	LFDUX	A1, X, INCX
	LFDUX	B1, Y, INCY
	LFDUX	A2, X, INCX
	LFDUX	B2, Y, INCY

	LFDUX	A3, X, INCX
	LFDUX	B3, Y, INCY
	LFDUX	A4, X, INCX
	LFDUX	B4, Y, INCY

	fmadd	C1, ALPHA, A1, B1
	fmadd	C2, ALPHA, A2, B2
	fmadd	C3, ALPHA, A3, B3
	fmadd	C4, ALPHA, A4, B4

	STFDUX	C1, YY, INCY
	STFDUX	C2, YY, INCY
	STFDUX	C3, YY, INCY
	STFDUX	C4, YY, INCY
	.align 4

LL(117):
	andi.	r0,  N, 2
	beq	LL(118)

	LFDUX	A1, X, INCX
	LFDUX	B1, Y, INCY
	LFDUX	A2, X, INCX
	LFDUX	B2, Y, INCY

	fmadd	C1, ALPHA, A1, B1
	fmadd	C2, ALPHA, A2, B2

	STFDUX	C1, YY, INCY
	STFDUX	C2, YY, INCY
	.align 4

LL(118):
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	A1, X, INCX
	LFDUX	B1, Y, INCY

	fmadd	C1, ALPHA, A1, B1
	STFDUX	C1, YY, INCY
	.align 4

LL(999):
	/* Common exit: restore f25..f14 in reverse save order.  SP is
	   biased down by 16 first because each update-form load adds r10
	   (= 16) before it reads; the final addi returns SP to its entry
	   value. */
	li	r10, 16
	subi	SP, SP, 16

	lfpdux	f25, SP, r10
	lfpdux	f24, SP, r10
	lfpdux	f23, SP, r10
	lfpdux	f22, SP, r10

	lfpdux	f21, SP, r10
	lfpdux	f20, SP, r10
	lfpdux	f19, SP, r10
	lfpdux	f18, SP, r10

	lfpdux	f17, SP, r10
	lfpdux	f16, SP, r10
	lfpdux	f15, SP, r10
	lfpdux	f14, SP, r10

	addi	SP, SP,  16
	blr

	EPILOGUE