/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/
/*********************************************************************/

/*
 * Scaled Euclidean norm of a strided vector of complex elements.
 * NOTE(review): this looks like a BLAS xZNRM2-style kernel; the exported
 * symbol name comes from PROLOGUE in common.h and is not visible here.
 *
 * Inputs, as used below:
 *   N    (r3)  element count (loaded through the pointer when F_INTERFACE
 *              is defined)
 *   X    (r4)  address of the first element; the second component of an
 *              element is read INC1 = SIZE bytes after the first
 *   INCX (r5)  stride in elements, converted to bytes with ZBASE_SHIFT
 *
 * Result: left in f1.  It is 0.0 when N <= 0, when the scaled INCX <= 0,
 * or when the largest absolute component is 0.0.
 *
 * Method:
 *   pass 1   amax = largest |component| over all elements
 *   pass 2   ssq  = sum of (component * (1 / amax))^2
 *   finish   f1   = amax * sqrt(ssq), with sqrt built from frsqrte and
 *                   Newton-Raphson refinement
 */

#define ASSEMBLER
#include "common.h"

/* Incoming arguments. */
#define N	r3
#define X	r4
#define INCX	r5

/* Copies of N and X kept for the second pass, plus helper offsets. */
#define NN	r6
#define XX	r7
#define INC1	r9
#define PRE	r10

/* Scratch words on the stack, placed above the saved f14-f31. */
#define FZERO	144(SP)
#define FONE	148(SP)
#define FMAX	152(SP)
#define C1	156(SP)
#define C2	160(SP)

#define STACKSIZE 168

	PROLOGUE
	PROFCODE

	/*
	 * Open the frame and build the single-precision constants in
	 * integer registers: 0x3f800000 = 1.0, 0x3f000000 = 0.5,
	 * 0x40400000 = 3.0.  0x5fe00000 is stored to FMAX but never read
	 * back in this routine.  r6, r7 and r10 are reused later as NN,
	 * XX and PRE.
	 */
	addi	SP, SP, -STACKSIZE
	li	r10, 0
	lis	r11, 0x3f80
	lis	r12, 0x5fe0
	lis	r6, 0x3f00
	lis	r7, 0x4040

	/* Save f14-f31; they are reloaded at LL(999). */
	stfd	f14, 0(SP)
	stfd	f15, 8(SP)
	stfd	f16, 16(SP)
	stfd	f17, 24(SP)

	stfd	f18, 32(SP)
	stfd	f19, 40(SP)
	stfd	f20, 48(SP)
	stfd	f21, 56(SP)

	stfd	f22, 64(SP)
	stfd	f23, 72(SP)
	stfd	f24, 80(SP)
	stfd	f25, 88(SP)

	stfd	f26, 96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)

	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/*
	 * Spill the constants so they can be loaded with lfs.  The word
	 * written at 4 + FMAX is the same slot as C1 and is overwritten
	 * by the C1 store that follows.
	 */
	stw	r10, FZERO
	stw	r11, FONE
	stw	r12, FMAX
	stw	r10, 4 + FMAX
	stw	r6, C1
	stw	r7, C2

	/* Default result (0.0) for the early exits below. */
	lfs	f1, FZERO

#ifdef F_INTERFACE
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	/*
	 * Stride in bytes.  X is moved back by one stride because every
	 * element is fetched with LFDUX (presumably an update-form indexed
	 * load from common.h that advances X before loading).
	 */
	slwi	INCX, INCX, ZBASE_SHIFT
	sub	X, X, INCX
	li	INC1, SIZE

	/* Distance used by the dcbt hints in the PPCG4 build. */
	li	PRE, 3 * 16 * SIZE

	/* Nothing to do for N <= 0 or a non-positive stride. */
	cmpwi	cr0, N, 0
	ble-	LL(999)
	cmpwi	cr0, INCX, 0
	ble-	LL(999)

	/* Remember the count and the biased pointer for pass 2. */
	mr	NN, N
	mr	XX, X

	/* ---- Pass 1: largest absolute component ------------------------ */

	/* Seed all eight running maxima (f0-f7) from the first element. */
	LFDUX	f0, X, INCX
	LFDX	f1, X, INC1

	fabs	f2, f0
	fabs	f3, f1
	fabs	f4, f0
	fabs	f5, f1
	fabs	f6, f0
	fabs	f7, f1
	fabs	f0, f0
	fabs	f1, f1

	/* One element is consumed; CTR = (N - 1) / 8 blocks of 8 elements. */
	subi	N, N, 1

	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	LL(50)

	/* Pipeline fill: load four elements ... */
	LFDUX	f24, X, INCX
	LFDX	f25, X, INC1
	LFDUX	f26, X, INCX
	LFDX	f27, X, INC1
	LFDUX	f28, X, INCX
	LFDX	f29, X, INC1
	LFDUX	f30, X, INCX
	LFDX	f31, X, INC1

	/* ... take their magnitudes into f8-f15 while loading four more. */
	fabs	f8, f24
	LFDUX	f24, X, INCX
	fabs	f9, f25
	LFDX	f25, X, INC1
	fabs	f10, f26
	LFDUX	f26, X, INCX
	fabs	f11, f27
	LFDX	f27, X, INC1

	fabs	f12, f28
	LFDUX	f28, X, INCX
	fabs	f13, f29
	LFDX	f29, X, INC1
	fabs	f14, f30
	LFDUX	f30, X, INCX
	fabs	f15, f31
	LFDX	f31, X, INC1
	/* Only one block: the data is already loaded, go drain it. */
	bdz	LL(20)
	.align 4

/*
 * Steady state, 8 elements per trip.  For each lane, fsub forms
 * max - candidate and fsel keeps max when that difference is >= 0,
 * otherwise it takes the candidate.  Loads for the next block are
 * interleaved with the selects.
 */
LL(10):
	fsub	f16, f0, f8
	fsub	f17, f1, f9
	fsub	f18, f2, f10
	fsub	f19, f3, f11
	fsub	f20, f4, f12
	fsub	f21, f5, f13
	fsub	f22, f6, f14
	fsub	f23, f7, f15

	fsel	f0, f16, f0, f8
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f8, f24
	LFDUX	f24, X, INCX
	fsel	f1, f17, f1, f9
	fabs	f9, f25
	LFDX	f25, X, INC1
	fsel	f2, f18, f2, f10
	fabs	f10, f26
	LFDUX	f26, X, INCX
	fsel	f3, f19, f3, f11
	fabs	f11, f27
	LFDX	f27, X, INC1

	fsel	f4, f20, f4, f12
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f12, f28
	LFDUX	f28, X, INCX
	fsel	f5, f21, f5, f13
	fabs	f13, f29
	LFDX	f29, X, INC1
	fsel	f6, f22, f6, f14
	fabs	f14, f30
	LFDUX	f30, X, INCX
	fsel	f7, f23, f7, f15
	fabs	f15, f31
	LFDX	f31, X, INC1

	/* Second half of the block: same pattern on the next four elements. */
	fsub	f16, f0, f8
	fsub	f17, f1, f9
	fsub	f18, f2, f10
	fsub	f19, f3, f11
	fsub	f20, f4, f12
	fsub	f21, f5, f13
	fsub	f22, f6, f14
	fsub	f23, f7, f15

	fsel	f0, f16, f0, f8
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f8, f24
	LFDUX	f24, X, INCX
	fsel	f1, f17, f1, f9
	fabs	f9, f25
	LFDX	f25, X, INC1
	fsel	f2, f18, f2, f10
	fabs	f10, f26
	LFDUX	f26, X, INCX
	fsel	f3, f19, f3, f11
	fabs	f11, f27
	LFDX	f27, X, INC1

	fsel	f4, f20, f4, f12
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f12, f28
	LFDUX	f28, X, INCX
	fsel	f5, f21, f5, f13
	fabs	f13, f29
	LFDX	f29, X, INC1
	fsel	f6, f22, f6, f14
	fabs	f14, f30
	LFDUX	f30, X, INCX
	fsel	f7, f23, f7, f15
	fabs	f15, f31
	LFDX	f31, X, INC1
	bdnz	LL(10)
	.align 4

/* Drain: fold in the values still held in f8-f15 and f24-f31, no loads. */
LL(20):
	fsub	f16, f0, f8
	fsub	f17, f1, f9
	fsub	f18, f2, f10
	fsub	f19, f3, f11
	fsub	f20, f4, f12
	fsub	f21, f5, f13
	fsub	f22, f6, f14
	fsub	f23, f7, f15

	fsel	f0, f16, f0, f8
	fabs	f8, f24
	fsel	f1, f17, f1, f9
	fabs	f9, f25
	fsel	f2, f18, f2, f10
	fabs	f10, f26
	fsel	f3, f19, f3, f11
	fabs	f11, f27

	fsel	f4, f20, f4, f12
	fabs	f12, f28
	fsel	f5, f21, f5, f13
	fabs	f13, f29
	fsel	f6, f22, f6, f14
	fabs	f14, f30
	fsel	f7, f23, f7, f15
	fabs	f15, f31

	fsub	f16, f0, f8
	fsub	f17, f1, f9
	fsub	f18, f2, f10
	fsub	f19, f3, f11
	fsub	f20, f4, f12
	fsub	f21, f5, f13
	fsub	f22, f6, f14
	fsub	f23, f7, f15

	fsel	f0, f16, f0, f8
	fsel	f1, f17, f1, f9
	fsel	f2, f18, f2, f10
	fsel	f3, f19, f3, f11
	fsel	f4, f20, f4, f12
	fsel	f5, f21, f5, f13
	fsel	f6, f22, f6, f14
	fsel	f7, f23, f7, f15
	.align 4

/* Leftover (N - 1) mod 8 elements, one per trip, into lanes f0/f1. */
LL(50):
	andi.	r0, N, 7
	mtspr	CTR, r0
	beq	LL(99)
	.align 4

LL(60):
	LFDUX	f8, X, INCX
	LFDX	f9, X, INC1

	fabs	f8, f8
	fabs	f9, f9
	fsub	f16, f0, f8
	fsub	f17, f1, f9
	fsel	f0, f16, f0, f8
	fsel	f1, f17, f1, f9
	bdnz	LL(60)
	.align 4

/* Reduce the eight lane maxima pairwise; the overall maximum ends in f31. */
LL(99):
	fsub	f8, f0, f1
	fsub	f9, f2, f3
	fsub	f10, f4, f5
	fsub	f11, f6, f7

	fsel	f0, f8, f0, f1
	fsel	f2, f9, f2, f3
	fsel	f4, f10, f4, f5
	fsel	f6, f11, f6, f7

	fsub	f8, f0, f2
	fsub	f9, f4, f6
	fsel	f0, f8, f0, f2
	fsel	f4, f9, f4, f6

	fsub	f8, f0, f4
	fsel	f31, f8, f0, f4

	lfs	f1, FZERO
	lfs	f0, FONE

	/* A zero maximum means the result is zero; f1 already holds 0.0. */
	fcmpu	cr0, f1, f31
	beq-	cr0, LL(999)

	/* f30 = 1 / amax, the scale factor applied in pass 2. */
	fdiv	f30, f0, f31

	/* Clear the accumulators f0 and f2-f7; f1 is already 0.0. */
	fmr	f0, f1
	fmr	f2, f1
	fmr	f3, f1
	fmr	f4, f1
	fmr	f5, f1
	fmr	f6, f1
	fmr	f7, f1

	/* ---- Pass 2: sum of squares of the scaled components ----------- */

	/* CTR = NN / 8 blocks of 8 elements, walking XX from the start. */
	srawi.	r0, NN, 3
	mtspr	CTR, r0
	beq-	cr0, LL(150)

	/* Pipeline fill: load four elements ... */
	LFDUX	f8, XX, INCX
	LFDX	f9, XX, INC1
	LFDUX	f10, XX, INCX
	LFDX	f11, XX, INC1
	LFDUX	f12, XX, INCX
	LFDX	f13, XX, INC1
	LFDUX	f14, XX, INCX
	LFDX	f15, XX, INC1

	/* ... scale them into f16-f23 while loading four more. */
	fmul	f16, f30, f8
	LFDUX	f8, XX, INCX
	fmul	f17, f30, f9
	LFDX	f9, XX, INC1
	fmul	f18, f30, f10
	LFDUX	f10, XX, INCX
	fmul	f19, f30, f11
	LFDX	f11, XX, INC1

	fmul	f20, f30, f12
	LFDUX	f12, XX, INCX
	fmul	f21, f30, f13
	LFDX	f13, XX, INC1
	fmul	f22, f30, f14
	LFDUX	f14, XX, INCX
	fmul	f23, f30, f15
	LFDX	f15, XX, INC1
	/* Only one block: the data is already loaded, go drain it. */
	bdz	LL(120)
	.align 4

/*
 * Steady state, 8 elements per trip: accumulate the square of each
 * scaled value, rescale the freshly loaded value, and issue the next
 * load.
 */
LL(110):
	fmadd	f0, f16, f16, f0
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f16, f30, f8
	LFDUX	f8, XX, INCX
	fmadd	f1, f17, f17, f1
	fmul	f17, f30, f9
	LFDX	f9, XX, INC1
	fmadd	f2, f18, f18, f2
	fmul	f18, f30, f10
	LFDUX	f10, XX, INCX
	fmadd	f3, f19, f19, f3
	fmul	f19, f30, f11
	LFDX	f11, XX, INC1

	fmadd	f4, f20, f20, f4
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f20, f30, f12
	LFDUX	f12, XX, INCX
	fmadd	f5, f21, f21, f5
	fmul	f21, f30, f13
	LFDX	f13, XX, INC1
	fmadd	f6, f22, f22, f6
	fmul	f22, f30, f14
	LFDUX	f14, XX, INCX
	fmadd	f7, f23, f23, f7
	fmul	f23, f30, f15
	LFDX	f15, XX, INC1

	/* Second half of the block: same pattern on the next four elements. */
	fmadd	f0, f16, f16, f0
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f16, f30, f8
	LFDUX	f8, XX, INCX
	fmadd	f1, f17, f17, f1
	fmul	f17, f30, f9
	LFDX	f9, XX, INC1
	fmadd	f2, f18, f18, f2
	fmul	f18, f30, f10
	LFDUX	f10, XX, INCX
	fmadd	f3, f19, f19, f3
	fmul	f19, f30, f11
	LFDX	f11, XX, INC1

	fmadd	f4, f20, f20, f4
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f20, f30, f12
	LFDUX	f12, XX, INCX
	fmadd	f5, f21, f21, f5
	fmul	f21, f30, f13
	LFDX	f13, XX, INC1
	fmadd	f6, f22, f22, f6
	fmul	f22, f30, f14
	LFDUX	f14, XX, INCX
	fmadd	f7, f23, f23, f7
	fmul	f23, f30, f15
	LFDX	f15, XX, INC1
	bdnz	LL(110)
	.align 4

/* Drain: accumulate the values still held in f16-f23 and f8-f15, no loads. */
LL(120):
	fmadd	f0, f16, f16, f0
	fmul	f16, f30, f8
	fmadd	f1, f17, f17, f1
	fmul	f17, f30, f9
	fmadd	f2, f18, f18, f2
	fmul	f18, f30, f10
	fmadd	f3, f19, f19, f3
	fmul	f19, f30, f11

	fmadd	f4, f20, f20, f4
	fmul	f20, f30, f12
	fmadd	f5, f21, f21, f5
	fmul	f21, f30, f13
	fmadd	f6, f22, f22, f6
	fmul	f22, f30, f14
	fmadd	f7, f23, f23, f7
	fmul	f23, f30, f15

	fmadd	f0, f16, f16, f0
	fmadd	f1, f17, f17, f1
	fmadd	f2, f18, f18, f2
	fmadd	f3, f19, f19, f3
	fmadd	f4, f20, f20, f4
	fmadd	f5, f21, f21, f5
	fmadd	f6, f22, f22, f6
	fmadd	f7, f23, f23, f7
	.align 4

/* Leftover NN mod 8 elements, one per trip, into accumulators f0/f1. */
LL(150):
	andi.	r0, NN, 7
	mtspr	CTR, r0
	beq-	cr0, LL(170)
	.align 4

LL(160):
	LFDUX	f8, XX, INCX
	LFDX	f9, XX, INC1

	fmul	f16, f30, f8
	fmul	f17, f30, f9
	fmadd	f0, f16, f16, f0
	fmadd	f1, f17, f17, f1
	bdnz	LL(160)
	.align 4

/* Collapse the eight partial sums into f1, then take the square root. */
LL(170):
	fadd	f0, f0, f1
	fadd	f2, f2, f3
	fadd	f4, f4, f5
	fadd	f6, f6, f7

	fadd	f0, f0, f2
	fadd	f4, f4, f6

	fadd	f1, f0, f4

	/* y = estimate of 1 / sqrt(ssq); f8 = 0.5, f9 = 3.0. */
	frsqrte	f0, f1
	lfs	f8, C1
	lfs	f9, C2

	/*
	 * Three Newton-Raphson steps, each y = 0.5 * y * (3 - ssq * y * y).
	 * The first step also forms f7 = 0.5 + 0.5 = 1.0 for the final
	 * correction.
	 */
	fmul	f2, f1, f0
	fadd	f7, f8, f8
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	fmul	f2, f1, f0
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	fmul	f2, f1, f0
	fmul	f3, f0, f8
	fnmsub	f4, f2, f0, f9
	fmul	f0, f3, f4

	/*
	 * s = ssq * y approximates sqrt(ssq); refine once more with
	 * s + 0.5 * s * (1 - s * y), then undo the scaling by amax (f31).
	 */
	fmul	f5, f1, f0
	fmul	f2, f5, f8
	fnmsub	f3, f5, f0, f7
	fmadd	f1, f2, f3, f5
	fmul	f1, f31, f1
	.align 4

/* Common exit: reload f14-f31, release the frame, return with f1 set. */
LL(999):
	lfd	f14, 0(SP)
	lfd	f15, 8(SP)
	lfd	f16, 16(SP)
	lfd	f17, 24(SP)

	lfd	f18, 32(SP)
	lfd	f19, 40(SP)
	lfd	f20, 48(SP)
	lfd	f21, 56(SP)

	lfd	f22, 64(SP)
	lfd	f23, 72(SP)
	lfd	f24, 80(SP)
	lfd	f25, 88(SP)

	lfd	f26, 96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)

	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE