/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Scaled Euclidean-norm kernel (PowerPC assembly, run through cpp).
 *
 * Inputs:  N    (r3)  element count
 *          X    (r4)  address of the first element
 *          INCX (r5)  stride in elements (scaled to bytes below)
 *          With F_INTERFACE defined, N and INCX are passed by reference
 *          and are loaded through LDINT first.
 * Output:  the result is left in f1.
 *
 * Algorithm (two passes over the vector):
 *   pass 1  amax = max |x[i]|                      -> f31
 *   pass 2  ssq  = sum ((1 / amax) * x[i])^2       -> f1
 *   result  amax * sqrt(ssq); sqrt is built from frsqrte plus Newton
 *           refinement rather than a hardware square root.
 *
 * Early exits (all through LL(999), which restores f14-f31):
 *   N <= 0 or INCX <= 0  -> f1 = 0.0
 *   N == 1               -> f1 = |x[0]|
 *   amax == 0.0          -> f1 = 0.0
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), LFDUX, LDINT, SP, SIZE and
 * BASE_SHIFT are supplied by common.h and are not visible in this file.
 * NOTE(review): LFDUX is presumably an update-form indexed load whose
 * width matches the build's element type - confirm in common.h.
 * NOTE(review): if amax is +Inf, 1/amax is 0 and 0 * Inf yields NaN in
 * pass 2 - confirm whether callers expect Inf to propagate.
 */

#define ASSEMBLER
#include "common.h"

#define N	r3
#define X	r4
#define INCX	r5

/* Copies of the original N and (pre-biased) X, replayed by pass 2. */
#define NN	r6
#define XX	r7

/* Byte offset used by the optional dcbt prefetches. */
#define PRE	r8

/*
 * Scratch slots above the 18 saved FPRs (f14-f31 occupy 0..143).
 * FMAX is an 8-byte double at 152..159, so the single-precision
 * constants C1/C2 must start at 160; placing C1 at 156 would overwrite
 * the low word of FMAX.  The frame size is unchanged.
 */
#define FZERO	144(SP)
#define FONE	148(SP)
#define FMAX	152(SP)
#define C1	160(SP)
#define C2	164(SP)

#define STACKSIZE 168

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r10,   0		/* 0x00000000 = 0.0f                 */
	lis	r11, 0x3f80		/* 0x3f800000 = 1.0f                 */
	lis	r12, 0x5fe0		/* high word of the FMAX double      */
	lis	r6,  0x3f00		/* 0x3f000000 = 0.5f (C1)            */
	lis	r7,  0x4040		/* 0x40400000 = 3.0f (C2)            */

	/* Save the FPRs this routine overwrites. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Materialise the floating-point constants in memory.  r6/r7 are
	   free to carry C1/C2 here because NN/XX are only set further down. */
	stw	r10,   FZERO
	stw	r11,   FONE
	stw	r12,   FMAX
	stw	r10,   4 + FMAX		/* low word of the FMAX double */
	stw	r6,    C1
	stw	r7,    C2

	lfs	f1, FZERO		/* default result for the early exits */

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	slwi	INCX, INCX, BASE_SHIFT	/* stride: elements -> bytes */
	sub	X, X, INCX		/* pre-bias: every LFDUX below is issued
					   with (X, INCX) starting one stride early */

	li	PRE, 3 * 16 * SIZE

	cmpwi	cr0, N, 0
	ble-	LL(999)
	cmpwi	cr0, INCX, 0
	ble-	LL(999)

	mr	NN, N
	mr	XX, X

	/* ---- Pass 1: amax = max |x[i]| ---------------------------------- */

	/* Seed all eight running maxima (f0-f7) with |x[0]|. */
	LFDUX	f1, X, INCX

	fabs	f0, f1
	fabs	f2, f1
	fabs	f3, f1
	fabs	f4, f1
	fabs	f5, f1
	fabs	f6, f1
	fabs	f7, f1
	fabs	f1, f1
	subi	N, N, 1

	cmpwi	cr0, N, 0
	ble-	LL(999)			/* single element: f1 = |x[0]| */

	srawi.	r0, N, 4		/* CTR = remaining elements / 16 */
	mtspr	CTR, r0
	beq-	LL(50)

	/* Software-pipeline prologue: 16 loads are in flight before the loop. */
	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX
	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX

	fabs	f8,  f24
	LFDUX	f24, X, INCX
	fabs	f9,  f25
	LFDUX	f25, X, INCX
	fabs	f10, f26
	LFDUX	f26, X, INCX
	fabs	f11, f27
	LFDUX	f27, X, INCX

	fabs	f12, f28
	LFDUX	f28, X, INCX
	fabs	f13, f29
	LFDUX	f29, X, INCX
	fabs	f14, f30
	LFDUX	f30, X, INCX
	fabs	f15, f31
	LFDUX	f31, X, INCX
	bdz	LL(20)
	.align 4

	/* Main loop: 16 elements per iteration.  fsel fD, fA, fC, fB picks fC
	   when fA >= 0, else fB, so "fsub d = m - v; fsel m, d, m, v" is
	   m = max(m, v) without a branch. */
LL(10):
	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f8,  f24
	LFDUX	f24, X, INCX
	fsel	f1,  f17, f1,  f9
	fabs	f9,  f25
	LFDUX	f25, X, INCX
	fsel	f2,  f18, f2,  f10
	fabs	f10, f26
	LFDUX	f26, X, INCX
	fsel	f3,  f19, f3,  f11
	fabs	f11, f27
	LFDUX	f27, X, INCX

	fsel	f4,  f20, f4,  f12
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f12, f28
	LFDUX	f28, X, INCX
	fsel	f5,  f21, f5,  f13
	fabs	f13, f29
	LFDUX	f29, X, INCX
	fsel	f6,  f22, f6,  f14
	fabs	f14, f30
	LFDUX	f30, X, INCX
	fsel	f7,  f23, f7,  f15
	fabs	f15, f31
	LFDUX	f31, X, INCX

	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f8,  f24
	LFDUX	f24, X, INCX
	fsel	f1,  f17, f1,  f9
	fabs	f9,  f25
	LFDUX	f25, X, INCX
	fsel	f2,  f18, f2,  f10
	fabs	f10, f26
	LFDUX	f26, X, INCX
	fsel	f3,  f19, f3,  f11
	fabs	f11, f27
	LFDUX	f27, X, INCX

	fsel	f4,  f20, f4,  f12
#ifdef PPCG4
	dcbt	X, PRE
#endif
	fabs	f12, f28
	LFDUX	f28, X, INCX
	fsel	f5,  f21, f5,  f13
	fabs	f13, f29
	LFDUX	f29, X, INCX
	fsel	f6,  f22, f6,  f14
	fabs	f14, f30
	LFDUX	f30, X, INCX
	fsel	f7,  f23, f7,  f15
	fabs	f15, f31
	LFDUX	f31, X, INCX
	bdnz	LL(10)
	.align 4

	/* Pipeline drain: fold in the 16 values already loaded, no new loads. */
LL(20):
	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
	fabs	f8,  f24
	fsel	f1,  f17, f1,  f9
	fabs	f9,  f25
	fsel	f2,  f18, f2,  f10
	fabs	f10, f26
	fsel	f3,  f19, f3,  f11
	fabs	f11, f27

	fsel	f4,  f20, f4,  f12
	fabs	f12, f28
	fsel	f5,  f21, f5,  f13
	fabs	f13, f29
	fsel	f6,  f22, f6,  f14
	fabs	f14, f30
	fsel	f7,  f23, f7,  f15
	fabs	f15, f31

	fsub	f16, f0,  f8
	fsub	f17, f1,  f9
	fsub	f18, f2,  f10
	fsub	f19, f3,  f11
	fsub	f20, f4,  f12
	fsub	f21, f5,  f13
	fsub	f22, f6,  f14
	fsub	f23, f7,  f15

	fsel	f0,  f16, f0,  f8
	fsel	f1,  f17, f1,  f9
	fsel	f2,  f18, f2,  f10
	fsel	f3,  f19, f3,  f11
	fsel	f4,  f20, f4,  f12
	fsel	f5,  f21, f5,  f13
	fsel	f6,  f22, f6,  f14
	fsel	f7,  f23, f7,  f15
	.align 4

	/* Remainder: the last (N & 15) elements, one at a time, into f1. */
LL(50):
	andi.	r0, N, 15
	mtspr	CTR, r0
	beq	LL(99)
	.align 4

LL(60):
	LFDUX	f8, X, INCX
	fabs	f8, f8
	fsub	f16, f1, f8
	fsel	f1, f16, f1, f8
	bdnz	LL(60)
	.align 4

	/* Reduce the eight running maxima pairwise into f31 (amax). */
LL(99):
	fsub	f8,  f0,  f1
	fsub	f9,  f2,  f3
	fsub	f10, f4,  f5
	fsub	f11, f6,  f7

	fsel	f0,  f8,  f0,  f1
	fsel	f2,  f9,  f2,  f3
	fsel	f4,  f10, f4,  f5
	fsel	f6,  f11, f6,  f7

	fsub	f8,  f0,  f2
	fsub	f9,  f4,  f6
	fsel	f0,  f8,  f0,  f2
	fsel	f4,  f9,  f4,  f6

	fsub	f8,  f0,  f4
	fsel	f31, f8,  f0,  f4

	lfs	f1, FZERO
	lfs	f0, FONE
	lfd	f2, FMAX		/* f2 is reset by the fmr below before any use */

	fcmpu	cr0, f1, f31
	beq-	cr0, LL(999)		/* amax == 0: result is 0.0 (already in f1) */

	/* ---- Pass 2: ssq = sum ((1/amax) * x[i])^2 ---------------------- */

	fdiv	f30, f0, f31		/* f30 = 1 / amax */

	/* Clear the eight partial sums f0-f7 (f1 already holds 0.0). */
	fmr	f0, f1
	fmr	f2, f1
	fmr	f3, f1
	fmr	f4, f1
	fmr	f5, f1
	fmr	f6, f1
	fmr	f7, f1

	srawi.	r0, NN, 4		/* CTR = N / 16, using the original count */
	mtspr	CTR, r0
	beq-	cr0, LL(150)

	/* Same pipeline shape as pass 1: 16 loads in flight before the loop. */
	LFDUX	f8,  XX, INCX
	LFDUX	f9,  XX, INCX
	LFDUX	f10, XX, INCX
	LFDUX	f11, XX, INCX
	LFDUX	f12, XX, INCX
	LFDUX	f13, XX, INCX
	LFDUX	f14, XX, INCX
	LFDUX	f15, XX, INCX

	fmul	f16, f30, f8
	LFDUX	f8,  XX, INCX
	fmul	f17, f30, f9
	LFDUX	f9,  XX, INCX
	fmul	f18, f30, f10
	LFDUX	f10, XX, INCX
	fmul	f19, f30, f11
	LFDUX	f11, XX, INCX

	fmul	f20, f30, f12
	LFDUX	f12, XX, INCX
	fmul	f21, f30, f13
	LFDUX	f13, XX, INCX
	fmul	f22, f30, f14
	LFDUX	f14, XX, INCX
	fmul	f23, f30, f15
	LFDUX	f15, XX, INCX
	bdz	LL(120)
	.align 4

	/* Main loop: accumulate 16 scaled squares per iteration. */
LL(110):
	fmadd	f0,  f16, f16, f0
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f16, f30, f8
	LFDUX	f8,  XX, INCX
	fmadd	f1,  f17, f17, f1
	fmul	f17, f30, f9
	LFDUX	f9,  XX, INCX
	fmadd	f2,  f18, f18, f2
	fmul	f18, f30, f10
	LFDUX	f10, XX, INCX
	fmadd	f3,  f19, f19, f3
	fmul	f19, f30, f11
	LFDUX	f11, XX, INCX

	fmadd	f4,  f20, f20, f4
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f20, f30, f12
	LFDUX	f12, XX, INCX
	fmadd	f5,  f21, f21, f5
	fmul	f21, f30, f13
	LFDUX	f13, XX, INCX
	fmadd	f6,  f22, f22, f6
	fmul	f22, f30, f14
	LFDUX	f14, XX, INCX
	fmadd	f7,  f23, f23, f7
	fmul	f23, f30, f15
	LFDUX	f15, XX, INCX

	fmadd	f0,  f16, f16, f0
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f16, f30, f8
	LFDUX	f8,  XX, INCX
	fmadd	f1,  f17, f17, f1
	fmul	f17, f30, f9
	LFDUX	f9,  XX, INCX
	fmadd	f2,  f18, f18, f2
	fmul	f18, f30, f10
	LFDUX	f10, XX, INCX
	fmadd	f3,  f19, f19, f3
	fmul	f19, f30, f11
	LFDUX	f11, XX, INCX

	fmadd	f4,  f20, f20, f4
#ifdef PPCG4
	dcbt	XX, PRE
#endif
	fmul	f20, f30, f12
	LFDUX	f12, XX, INCX
	fmadd	f5,  f21, f21, f5
	fmul	f21, f30, f13
	LFDUX	f13, XX, INCX
	fmadd	f6,  f22, f22, f6
	fmul	f22, f30, f14
	LFDUX	f14, XX, INCX
	fmadd	f7,  f23, f23, f7
	fmul	f23, f30, f15
	LFDUX	f15, XX, INCX
	bdnz	LL(110)
	.align 4

	/* Pipeline drain for pass 2: consume the 16 values already loaded. */
LL(120):
	fmadd	f0,  f16, f16, f0
	fmul	f16, f30, f8
	fmadd	f1,  f17, f17, f1
	fmul	f17, f30, f9
	fmadd	f2,  f18, f18, f2
	fmul	f18, f30, f10
	fmadd	f3,  f19, f19, f3
	fmul	f19, f30, f11

	fmadd	f4,  f20, f20, f4
	fmul	f20, f30, f12
	fmadd	f5,  f21, f21, f5
	fmul	f21, f30, f13
	fmadd	f6,  f22, f22, f6
	fmul	f22, f30, f14
	fmadd	f7,  f23, f23, f7
	fmul	f23, f30, f15

	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3
	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7
	.align 4

	/* Remainder: the last (NN & 15) elements, accumulated into f0. */
LL(150):
	andi.	r0, NN, 15
	mtspr	CTR, r0
	beq-	cr0, LL(170)
	.align 4

LL(160):
	LFDUX	f8,  XX, INCX

	fmul	f16, f30, f8
	fmadd	f0,  f16, f16, f0
	bdnz	LL(160)
	.align 4

	/* Sum the eight partial sums into f1 (ssq). */
LL(170):
	fadd	f0,  f0,  f1
	fadd	f2,  f2,  f3
	fadd	f4,  f4,  f5
	fadd	f6,  f6,  f7

	fadd	f0,  f0,  f2
	fadd	f4,  f4,  f6

	fadd	f1,  f0,  f4

	/* sqrt(ssq) without a hardware sqrt.  Start from the reciprocal
	   square-root estimate y ~ 1/sqrt(ssq) and refine three times with
	       y = (0.5 * y) * (3 - ssq * y * y)
	   where fnmsub fD, fA, fC, fB computes fB - fA * fC. */
	frsqrte	f0,  f1
	lfs	f8,  C1			/* 0.5 */
	lfs	f9,  C2			/* 3.0 */

	fmul	f2,  f1,  f0
	fadd	f7,  f8,  f8		/* 1.0, used by the final correction */
	fmul	f3,  f0,  f8
	fnmsub	f4,  f2,  f0, f9
	fmul	f0,  f3,  f4

	fmul	f2,  f1,  f0
	fmul	f3,  f0,  f8
	fnmsub	f4,  f2,  f0, f9
	fmul	f0,  f3,  f4

	fmul	f2,  f1,  f0
	fmul	f3,  f0,  f8
	fnmsub	f4,  f2,  f0, f9
	fmul	f0,  f3,  f4

	/* s = ssq * y approximates sqrt(ssq); apply one correction step
	   s + (0.5 * s) * (1 - s * y), then undo the scaling by amax. */
	fmul	f5,  f1,  f0
	fmul	f2,  f5,  f8
	fnmsub	f3,  f5,  f0, f7
	fmadd	f1,  f2,  f3, f5
	fmul	f1,  f31, f1
	.align 4

	/* Common exit: restore the saved FPRs and release the frame. */
LL(999):
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE