/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Vector copy kernel: reads N elements from X and writes them to Y.
 * Every element is two SIZE-wide words (the element step is
 * INCX2 = 2 * INCX), so this is presumably the complex-vector copy
 * (zcopy/ccopy) -- confirm against the build rules.
 *
 * Register roles (see the #defines below):
 *   N     (r3)   element count; N <= 0 returns without touching memory
 *   X     (r4)   source pointer
 *   INCX  (r5)   source stride; shifted left by BASE_SHIFT on entry, so
 *                presumably passed in units of words -- confirm in caller
 *   Y     (r6)   destination pointer
 *   INCY  (r7)   destination stride, treated like INCX
 *   INCX2 (r8)   2 * INCX, byte step between consecutive source elements
 *   INCY2 (r9)   2 * INCY, byte step between consecutive dest elements
 *   X2/Y2 (r10/r11) second-word pointers, used only by the strided path
 *   A1-A9 (f0-f8), T1-T7 (f9-f15)  data registers
 *
 * f14/f15 (T6/T7) are saved on the stack at entry and restored at
 * LL(999); every exit goes through LL(999).  r10 doubles as the stack
 * step in the save/restore sequences and as X2 in between.
 *
 * Paths:
 *   LL(10)   INCX == INCY == SIZE, X and Y both multiples of 2 * SIZE
 *   LL(20)   unit stride, X aligned,    Y misaligned
 *   LL(30)   unit stride, X misaligned, Y aligned
 *   LL(40)   unit stride, X and Y both misaligned
 *   LL(100)  any other stride combination, one word at a time
 *
 * Lane conventions assumed by the comments below (FP2 "double hummer"
 * ISA; the upper-case mnemonics are presumably precision-selecting
 * macros from common.h -- confirm both against the manual/header):
 *   - each FP register holds a primary and a secondary value;
 *   - LFPDUX/STFPDUX move both lanes from/to two adjacent words and
 *     update the base register;
 *   - LFXDUX loads two adjacent words with the lanes swapped
 *     (secondary <- first word, primary <- second word);
 *   - LFDX/LFDUX/STFDX/STFDUX touch only the primary lane, STFSDX
 *     stores only the secondary lane;
 *   - fpmr copies both lanes, fsmr copies only the secondary lane,
 *     fxmr copies both lanes swapped.
 * In the comments, w0[k] / w1[k] are the first / second word of
 * element k and (p | s) lists a register's primary and secondary lane.
 */

#define ASSEMBLER
#include "common.h"

#define N	r3
#define X	r4
#define INCX	r5
#define Y	r6
#define INCY	r7

#define INCX2	r8
#define INCY2	r9
#define X2	r10
#define Y2	r11

#define A1	f0
#define A2	f1
#define A3	f2
#define A4	f3
#define A5	f4
#define A6	f5
#define A7	f6
#define A8	f7
#define A9	f8

#define T1	f9
#define T2	f10
#define T3	f11
#define T4	f12
#define T5	f13
#define T6	f14
#define T7	f15

	PROLOGUE
	PROFCODE

	/* Push f14 and f15 (T6/T7): each store first moves SP down by 16. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* Scale the strides; one element is two words, hence the doubling. */
	slwi	INCX,  INCX, BASE_SHIFT
	slwi	INCY,  INCY, BASE_SHIFT
	add	INCX2, INCX, INCX
	add	INCY2, INCY, INCY

	cmpwi	cr0, N, 0
	ble	LL(999)			/* nothing to copy */

	/* Pre-bias the pointers: all accesses below address X + INCX2 and
	   Y + INCY2 (most of them in update form). */
	sub	X, X, INCX2
	sub	Y, Y, INCY2

	/* Any stride other than INCX == INCY == SIZE takes the generic path. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)
	cmpwi	cr0, INCY, SIZE
	bne	LL(100)

	/* Dispatch on alignment to 2 * SIZE.  INCX2 == INCY2 == 2 * SIZE
	   here, so the bias above did not change either residue. */
	andi.	r0, X, 2 * SIZE - 1
	bne	LL(30)
	andi.	r0, Y, 2 * SIZE - 1
	bne	LL(20)
	.align 4

LL(10): /* X : aligned  Y : aligned */
	srawi.	r0, N, 3		/* CTR = N / 8 */
	mtspr	CTR, r0
	beq-	LL(15)

	/* Prime the pipeline with the first eight elements. */
	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	LFPDUX	A3, X, INCX2
	LFPDUX	A4, X, INCX2
	LFPDUX	A5, X, INCX2
	LFPDUX	A6, X, INCX2
	LFPDUX	A7, X, INCX2
	LFPDUX	A8, X, INCX2
	bdz	LL(13)
	.align 4

LL(12):	/* Steady state: store the previous eight, load the next eight. */
	STFPDUX	A1, Y, INCY2
	LFPDUX	A1, X, INCX2
	STFPDUX	A2, Y, INCY2
	LFPDUX	A2, X, INCX2
	STFPDUX	A3, Y, INCY2
	LFPDUX	A3, X, INCX2
	STFPDUX	A4, Y, INCY2
	LFPDUX	A4, X, INCX2

	STFPDUX	A5, Y, INCY2
	LFPDUX	A5, X, INCX2
	STFPDUX	A6, Y, INCY2
	LFPDUX	A6, X, INCX2
	STFPDUX	A7, Y, INCY2
	LFPDUX	A7, X, INCX2
	STFPDUX	A8, Y, INCY2
	LFPDUX	A8, X, INCX2
	bdnz	LL(12)
	.align 4

LL(13):	/* Drain: store the last eight loaded elements. */
	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	STFPDUX	A3, Y, INCY2
	STFPDUX	A4, Y, INCY2
	STFPDUX	A5, Y, INCY2
	STFPDUX	A6, Y, INCY2
	STFPDUX	A7, Y, INCY2
	STFPDUX	A8, Y, INCY2
	.align 4

LL(15):	/* Remainder N & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0, N, 7
	beq	LL(999)

	andi.	r0, N, 4
	beq	LL(16)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	LFPDUX	A3, X, INCX2
	LFPDUX	A4, X, INCX2

	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	STFPDUX	A3, Y, INCY2
	STFPDUX	A4, Y, INCY2
	.align 4

LL(16):
	andi.	r0, N, 2
	beq	LL(17)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2

	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	.align 4

LL(17):
	andi.	r0, N, 1
	beq	LL(999)

	LFPDUX	A1, X, INCX2
	STFPDUX	A1, Y, INCY2
	b	LL(999)
	.align 4

LL(20): /* X : aligned  Y : unaligned */
	/*
	 * Y is not a multiple of 2 * SIZE (presumably off by one word).
	 * w0[0] is stored on its own, Y is advanced by one word, and from
	 * then on every paired store writes (w1[k] | w0[k+1]).  A1 always
	 * carries the still-unstored w1 of the newest element in its primary
	 * lane; LL(29) writes it out.  N is reduced by one because the
	 * first/last words bracket N - 1 paired stores.
	 */

	LFXDUX	A1, X, INCX2		/* A1 = (w1[0] | w0[0]) */
	addi	N, N, -1
	cmpwi	cr0, N, 0
	STFSDX	A1, Y, INCY2		/* store w0[0] (secondary lane) */
	add	Y, Y, INCY		/* one word forward; cr0 still holds N - 1 ? 0 */
	ble	LL(29)
	.align 4

	srawi.	r0, N, 3		/* CTR = (N - 1) / 8 */
	mtspr	CTR, r0
	beq-	LL(25)

	/* Prime: four crossed loads, then four straight loads, merging the
	   following element's w0 into each register's secondary lane. */
	LFXDUX	T1, X, INCX2
	LFXDUX	T2, X, INCX2
	LFXDUX	T3, X, INCX2
	LFXDUX	T4, X, INCX2

	LFPDUX	A6, X, INCX2
	fsmr	A1, T1			/* A1 = (w1[k] | w0[k+1]) */
	LFPDUX	A7, X, INCX2
	fsmr	T1, T2
	LFPDUX	A8, X, INCX2
	fsmr	T2, T3
	LFPDUX	A9, X, INCX2
	fsmr	T3, T4
	bdz	LL(23)
	.align 4

LL(22):	/* Steady state: eight paired stores and eight paired loads per pass. */
	STFPDUX	A1, Y, INCY2
	fxmr	T5, A6			/* swap lanes of the straight-loaded data */
	STFPDUX	T1, Y, INCY2
	fxmr	T6, A7
	STFPDUX	T2, Y, INCY2
	fxmr	T7, A8
	STFPDUX	T3, Y, INCY2
	fxmr	A1, A9			/* A1 primary = pending w1 of newest element */

	fsmr	T4, T5
	LFPDUX	A2, X, INCX2
	fsmr	T5, T6
	LFPDUX	A3, X, INCX2
	fsmr	T6, T7
	LFPDUX	A4, X, INCX2
	fsmr	T7, A1
	LFPDUX	A5, X, INCX2

	STFPDUX	T4, Y, INCY2
	fxmr	T1, A2
	STFPDUX	T5, Y, INCY2
	fxmr	T2, A3
	STFPDUX	T6, Y, INCY2
	fxmr	T3, A4
	STFPDUX	T7, Y, INCY2
	fxmr	T4, A5

	LFPDUX	A6, X, INCX2
	fsmr	A1, T1
	LFPDUX	A7, X, INCX2
	fsmr	T1, T2
	LFPDUX	A8, X, INCX2
	fsmr	T2, T3
	LFPDUX	A9, X, INCX2
	fsmr	T3, T4
	bdnz	LL(22)
	.align 4

LL(23):	/* Drain the eight elements still in registers; A1 keeps the pending w1. */
	STFPDUX	A1, Y, INCY2
	fxmr	T5, A6
	STFPDUX	T1, Y, INCY2
	fxmr	T6, A7
	STFPDUX	T2, Y, INCY2
	fxmr	T7, A8
	STFPDUX	T3, Y, INCY2
	fxmr	A1, A9

	fsmr	T4, T5
	fsmr	T5, T6
	fsmr	T6, T7
	fsmr	T7, A1

	STFPDUX	T4, Y, INCY2
	STFPDUX	T5, Y, INCY2
	STFPDUX	T6, Y, INCY2
	STFPDUX	T7, Y, INCY2
	.align 4

LL(25):	/* Remainder (N - 1) & 7 as 4 + 2 + 1; A1 is refreshed after each group. */
	andi.	r0, N, 7
	beq	LL(29)

	andi.	r0, N, 4
	beq	LL(26)

	LFXDUX	A2, X, INCX2
	LFXDUX	A3, X, INCX2
	LFXDUX	A4, X, INCX2
	LFXDUX	A5, X, INCX2

	fsmr	A1, A2
	fsmr	A2, A3
	fsmr	A3, A4
	fsmr	A4, A5

	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	STFPDUX	A3, Y, INCY2
	STFPDUX	A4, Y, INCY2
	fpmr	A1, A5			/* carry the pending w1 forward */
	.align 4

LL(26):
	andi.	r0, N, 2
	beq	LL(27)

	LFXDUX	A2, X, INCX2
	LFXDUX	A3, X, INCX2
	fsmr	A1, A2
	fsmr	A2, A3
	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	fpmr	A1, A3
	.align 4

LL(27):
	andi.	r0, N, 1
	beq	LL(29)

	LFXDUX	A2, X, INCX2
	fsmr	A1, A2
	STFPDUX	A1, Y, INCY2
	fpmr	A1, A2
	.align 4

LL(29):	/* Store the pending w1 of the last element (primary lane of A1). */
	STFDUX	A1, Y, INCY2
	b	LL(999)
	.align 4

LL(30): /* X : unaligned  Y : aligned */
	andi.	r0, Y, 2 * SIZE - 1
	bne	LL(40)

	/*
	 * X is not a multiple of 2 * SIZE (presumably off by one word).
	 * w0[0] is loaded on its own and X is advanced by one word, so every
	 * paired load then brings (w1[k], w0[k+1]).  A1 always carries the
	 * w0 of the next element to store in its primary lane; fsmr merges
	 * the matching w1 into its secondary lane before the paired store.
	 * NOTE(review): the paired loads on this path also fetch the word
	 * that follows the last element (second half of the same aligned
	 * pair); that word is never stored.
	 */
	LFDX	A1, X, INCX2		/* A1 primary = w0[0] */
	add	X, X, INCX		/* one word forward */

	srawi.	r0, N, 3		/* CTR = N / 8 */
	mtspr	CTR, r0
	beq-	LL(35)

	LFXDUX	T1, X, INCX2		/* T1 = (w0[k+1] | w1[k]) */
	LFXDUX	T2, X, INCX2
	LFXDUX	T3, X, INCX2
	LFXDUX	T4, X, INCX2

	LFPDUX	A6, X, INCX2
	fsmr	A1, T1			/* A1 = (w0[k] | w1[k]) */
	LFPDUX	A7, X, INCX2
	fsmr	T1, T2
	LFPDUX	A8, X, INCX2
	fsmr	T2, T3
	LFPDUX	A9, X, INCX2
	fsmr	T3, T4
	bdz	LL(33)
	.align 4

LL(32):	/* Steady state: eight paired stores and eight paired loads per pass. */
	fxmr	T5, A6
	STFPDUX	A1, Y, INCY2
	fxmr	T6, A7
	STFPDUX	T1, Y, INCY2
	fxmr	T7, A8
	STFPDUX	T2, Y, INCY2
	fxmr	A1, A9			/* A1 primary = w0 of the next element */
	STFPDUX	T3, Y, INCY2

	LFPDUX	A2, X, INCX2
	fsmr	T4, T5
	LFPDUX	A3, X, INCX2
	fsmr	T5, T6
	LFPDUX	A4, X, INCX2
	fsmr	T6, T7
	LFPDUX	A5, X, INCX2
	fsmr	T7, A1

	fxmr	T1, A2
	STFPDUX	T4, Y, INCY2
	fxmr	T2, A3
	STFPDUX	T5, Y, INCY2
	fxmr	T3, A4
	STFPDUX	T6, Y, INCY2
	fxmr	T4, A5
	STFPDUX	T7, Y, INCY2

	fsmr	A1, T1
	LFPDUX	A6, X, INCX2
	fsmr	T1, T2
	LFPDUX	A7, X, INCX2
	fsmr	T2, T3
	LFPDUX	A8, X, INCX2
	fsmr	T3, T4
	LFPDUX	A9, X, INCX2
	bdnz	LL(32)
	.align 4

LL(33):	/* Drain the eight elements still in registers. */
	STFPDUX	A1, Y, INCY2
	fxmr	T5, A6
	STFPDUX	T1, Y, INCY2
	fxmr	T6, A7
	STFPDUX	T2, Y, INCY2
	fxmr	T7, A8
	STFPDUX	T3, Y, INCY2
	fxmr	A1, A9

	fsmr	T4, T5
	fsmr	T5, T6
	fsmr	T6, T7
	fsmr	T7, A1

	STFPDUX	T4, Y, INCY2
	STFPDUX	T5, Y, INCY2
	STFPDUX	T6, Y, INCY2
	STFPDUX	T7, Y, INCY2
	.align 4

LL(35):	/* Remainder N & 7 as 4 + 2 + 1; A1 is refreshed after each group. */
	andi.	r0, N, 7
	beq	LL(999)

	andi.	r0, N, 4
	beq	LL(36)

	LFXDUX	A2, X, INCX2
	LFXDUX	A3, X, INCX2
	LFXDUX	A4, X, INCX2
	LFXDUX	A5, X, INCX2

	fsmr	A1, A2
	fsmr	A2, A3
	fsmr	A3, A4
	fsmr	A4, A5

	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	STFPDUX	A3, Y, INCY2
	STFPDUX	A4, Y, INCY2
	fpmr	A1, A5			/* carry the next w0 forward */
	.align 4

LL(36):
	andi.	r0, N, 2
	beq	LL(37)

	LFXDUX	A2, X, INCX2
	LFXDUX	A3, X, INCX2
	fsmr	A1, A2
	fsmr	A2, A3
	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	fpmr	A1, A3
	.align 4

LL(37):
	andi.	r0, N, 1
	beq	LL(999)

	LFXDUX	A2, X, INCX2
	fsmr	A1, A2
	STFPDUX	A1, Y, INCY2
	b	LL(999)
	.align 4

LL(40): /* X : unaligned  Y : unaligned */
	/*
	 * Both pointers miss 2 * SIZE alignment.  Copy w0[0] alone, step both
	 * pointers forward by one word, move N - 1 pairs (w1[k], w0[k+1])
	 * with straight paired loads/stores, then copy the last w1 at LL(49).
	 */

	LFDX	A1, X, INCX2
	add	X, X, INCX

	addi	N, N, -1
	cmpwi	cr0, N, 0
	STFDX	A1, Y, INCY2
	add	Y, Y, INCY		/* cr0 still holds N - 1 ? 0 */
	ble	LL(49)

	srawi.	r0, N, 3		/* CTR = (N - 1) / 8 */
	mtspr	CTR, r0
	beq-	LL(45)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	LFPDUX	A3, X, INCX2
	LFPDUX	A4, X, INCX2
	LFPDUX	A5, X, INCX2
	LFPDUX	A6, X, INCX2
	LFPDUX	A7, X, INCX2
	LFPDUX	A8, X, INCX2
	bdz	LL(43)
	.align 4

LL(42):	/* Steady state: store the previous eight pairs, load the next eight. */
	STFPDUX	A1, Y, INCY2
	LFPDUX	A1, X, INCX2
	STFPDUX	A2, Y, INCY2
	LFPDUX	A2, X, INCX2
	STFPDUX	A3, Y, INCY2
	LFPDUX	A3, X, INCX2
	STFPDUX	A4, Y, INCY2
	LFPDUX	A4, X, INCX2

	STFPDUX	A5, Y, INCY2
	LFPDUX	A5, X, INCX2
	STFPDUX	A6, Y, INCY2
	LFPDUX	A6, X, INCX2
	STFPDUX	A7, Y, INCY2
	LFPDUX	A7, X, INCX2
	STFPDUX	A8, Y, INCY2
	LFPDUX	A8, X, INCX2
	bdnz	LL(42)
	.align 4

LL(43):	/* Drain. */
	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	STFPDUX	A3, Y, INCY2
	STFPDUX	A4, Y, INCY2
	STFPDUX	A5, Y, INCY2
	STFPDUX	A6, Y, INCY2
	STFPDUX	A7, Y, INCY2
	STFPDUX	A8, Y, INCY2
	.align 4

LL(45):	/* Remainder (N - 1) & 7 as 4 + 2 + 1 pairs. */
	andi.	r0, N, 7
	beq	LL(49)

	andi.	r0, N, 4
	beq	LL(46)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	LFPDUX	A3, X, INCX2
	LFPDUX	A4, X, INCX2

	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	STFPDUX	A3, Y, INCY2
	STFPDUX	A4, Y, INCY2
	.align 4

LL(46):
	andi.	r0, N, 2
	beq	LL(47)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2

	STFPDUX	A1, Y, INCY2
	STFPDUX	A2, Y, INCY2
	.align 4

LL(47):
	andi.	r0, N, 1
	beq	LL(49)

	LFPDUX	A1, X, INCX2
	STFPDUX	A1, Y, INCY2

LL(49):	/* Copy the trailing single word (w1 of the last element). */
	LFDUX	A1, X, INCX2
	STFDUX	A1, Y, INCY2
	b	LL(999)
	.align 4

LL(100):
	/*
	 * Generic strides: X/Y step through the first words and X2/Y2
	 * (= X + SIZE, Y + SIZE) through the second words, one word per
	 * load or store, four elements per loop pass.
	 */
	addi	X2, X, SIZE
	addi	Y2, Y, SIZE

	srawi.	r0, N, 2		/* CTR = N / 4 */
	mtspr	CTR, r0
	beq-	LL(115)

	LFDUX	A1, X,  INCX2
	LFDUX	A2, X2, INCX2
	LFDUX	A3, X,  INCX2
	LFDUX	A4, X2, INCX2
	LFDUX	A5, X,  INCX2
	LFDUX	A6, X2, INCX2
	LFDUX	A7, X,  INCX2
	LFDUX	A8, X2, INCX2
	bdz	LL(113)
	.align 4

LL(112):	/* Steady state: store the previous four elements, load the next four. */
	STFDUX	A1, Y,  INCY2
	LFDUX	A1, X,  INCX2
	STFDUX	A2, Y2, INCY2
	LFDUX	A2, X2, INCX2
	STFDUX	A3, Y,  INCY2
	LFDUX	A3, X,  INCX2
	STFDUX	A4, Y2, INCY2
	LFDUX	A4, X2, INCX2

	STFDUX	A5, Y,  INCY2
	LFDUX	A5, X,  INCX2
	STFDUX	A6, Y2, INCY2
	LFDUX	A6, X2, INCX2
	STFDUX	A7, Y,  INCY2
	LFDUX	A7, X,  INCX2
	STFDUX	A8, Y2, INCY2
	LFDUX	A8, X2, INCX2
	bdnz	LL(112)
	.align 4

LL(113):	/* Drain. */
	STFDUX	A1, Y,  INCY2
	STFDUX	A2, Y2, INCY2
	STFDUX	A3, Y,  INCY2
	STFDUX	A4, Y2, INCY2
	STFDUX	A5, Y,  INCY2
	STFDUX	A6, Y2, INCY2
	STFDUX	A7, Y,  INCY2
	STFDUX	A8, Y2, INCY2
	.align 4

LL(115):	/* Remainder N & 3 as 2 + 1 elements. */
	andi.	r0, N, 3
	beq	LL(999)
	andi.	r0, N, 2
	beq	LL(117)

	LFDUX	A1, X,  INCX2
	LFDUX	A2, X2, INCX2
	LFDUX	A3, X,  INCX2
	LFDUX	A4, X2, INCX2

	STFDUX	A1, Y,  INCY2
	STFDUX	A2, Y2, INCY2
	STFDUX	A3, Y,  INCY2
	STFDUX	A4, Y2, INCY2
	.align 4

LL(117):
	andi.	r0, N, 1
	beq	LL(999)

	LFDUX	A1, X,  INCX2
	LFDUX	A2, X2, INCX2

	STFDUX	A1, Y,  INCY2
	STFDUX	A2, Y2, INCY2
	.align 4

LL(999):
	/* Common exit: pop f15 then f14.  SP is first moved 16 below the f15
	   slot because each update-form load adds 16 before it reads. */
	li	r10, 16
	addi	SP, SP, -16

	lfpdux	f15, SP, r10
	lfpdux	f14, SP, r10

	addi	SP, SP, 16		/* SP back to its value on entry */
	blr

	EPILOGUE