/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 
*/
/*********************************************************************/

/*
 * Minimum of |re| + |im| over a strided vector of two-component
 * (complex) floating-point elements.
 *
 * Inputs, by the register aliases defined below:
 *   N    (r3)  number of elements.
 *   X    (r4)  address of the first element; the two components of an
 *              element are read at displacements 0 * SIZE and 1 * SIZE.
 *   INCX (r5)  element stride.  It is shifted left by ZBASE_SHIFT before
 *              use and then compared against 2 * SIZE to detect unit
 *              stride, so the shift presumably converts it to bytes
 *              (ZBASE_SHIFT and SIZE come from common.h - confirm there).
 * When F_INTERFACE is defined, N and INCX are first replaced by the values
 * that LDINT fetches from 0(N) and 0(INCX).
 *
 * Output: the result is left in f1.  If N <= 0 or the scaled INCX <= 0,
 * f1 holds 0.0 and X is never read.
 *
 * Method: f0..f3 are four independent running minima, all seeded from the
 * first element.  The unrolled loops handle 8 elements per trip; the
 * leftover N & 7 elements are folded into f1 only; LL(999) merges f0..f3.
 *
 * Minimum idiom used throughout:
 *     fsub  d,   cur, cand
 *     fsel  cur, d,   cand, cur     -> cur = (cur - cand >= 0) ? cand : cur
 *
 * LFD / LFDX / LFDUX / LL / PROLOGUE / PROFCODE / EPILOGUE are macros from
 * common.h; the comments below assume LFD is a displacement load, LFDX an
 * indexed load and LFDUX an indexed load with base update - confirm there.
 */

#define ASSEMBLER
#include "common.h"

#define N       r3
#define X       r4
#define INCX    r5

#define PREA    r8
#define INCXM1  r9

#define FZERO   f1

#define STACKSIZE 160

        PROLOGUE
        PROFCODE

        /* Open a STACKSIZE-byte frame and spill f14-f31 into it. */
        addi    SP, SP, -STACKSIZE
        li      r0,   0

        stfd    f14,    0(SP)
        stfd    f15,    8(SP)
        stfd    f16,   16(SP)
        stfd    f17,   24(SP)

        stfd    f18,   32(SP)
        stfd    f19,   40(SP)
        stfd    f20,   48(SP)
        stfd    f21,   56(SP)

        stfd    f22,   64(SP)
        stfd    f23,   72(SP)
        stfd    f24,   80(SP)
        stfd    f25,   88(SP)

        stfd    f26,   96(SP)
        stfd    f27,  104(SP)
        stfd    f28,  112(SP)
        stfd    f29,  120(SP)

        stfd    f30,  128(SP)
        stfd    f31,  136(SP)

        /* Build 0.0 in FZERO (f1) by bouncing a zero word through the frame;
           this is the value returned on the early-exit paths. */
        stw     r0,   144(SP)
        lfs     FZERO,144(SP)

#ifdef F_INTERFACE
        LDINT   N,    0(N)
        LDINT   INCX, 0(INCX)
#endif

        slwi    INCX, INCX, ZBASE_SHIFT
        subi    INCXM1, INCX, SIZE      /* INCXM1 = INCX - SIZE */

        li      PREA, 10 * 16 * SIZE    /* offset used by dcbt in LL(10) */

        /* Nothing to do for N <= 0 or a non-positive stride. */
        cmpwi   cr0, N, 0
        ble-    LL(9999)
        cmpwi   cr0, INCX, 0
        ble-    LL(9999)

        /* Seed every running minimum with |re0| + |im0|. */
        LFD     f1, 0 * SIZE(X)
        LFD     f2, 1 * SIZE(X)
        add     X, X, INCX

        fabs    f1, f1
        fabs    f2, f2
        fadd    f1, f1, f2

        fmr     f0, f1
        fmr     f2, f1
        fmr     f3, f1

        subi    N, N, 1                 /* first element already consumed */

        /* Anything other than unit stride takes the indexed-load path. */
        cmpwi   cr0, INCX, 2 * SIZE
        bne-    cr0, LL(100)

        /* ---- unit stride: CTR = N / 8 unrolled trips ---- */
        srawi.  r0, N, 3
        mtspr   CTR,  r0
        beq-    cr0, LL(50)
        .align 4

        /* Preload the first four elements (eight values) and take fabs. */
        LFD     f24,   0 * SIZE(X)
        LFD     f25,   1 * SIZE(X)

        fabs    f8,  f24
        LFD     f26,   2 * SIZE(X)
        fabs    f9,  f25
        LFD     f27,   3 * SIZE(X)
        fabs    f10, f26
        LFD     f28,   4 * SIZE(X)
        fabs    f11, f27
        LFD     f29,   5 * SIZE(X)
        fabs    f12, f28
        LFD     f30,   6 * SIZE(X)
        fabs    f13, f29
        LFD     f31,   7 * SIZE(X)
        fabs    f14, f30
        nop
        fabs    f15, f31
        bdz     LL(20)                  /* single trip: go straight to the tail */
        .align 4

/* Steady state: each trip folds 8 elements into f0..f3 and preloads the
   next four elements; loads are interleaved with the arithmetic. */
LL(10):
        fadd    f4,  f8,  f9
        dcbt    X, PREA
        fadd    f5,  f10, f11
        nop
        fadd    f6,  f12, f13
        LFD     f24,   8 * SIZE(X)
        fadd    f7,  f14, f15
        LFD     f25,   9 * SIZE(X)

        fabs    f8,  f24
        LFD     f26,  10 * SIZE(X)
        fabs    f9,  f25
        LFD     f27,  11 * SIZE(X)
        fabs    f10, f26
        fabs    f11, f27

        fsub    f16, f0, f4
        fsub    f17, f1, f5
        fsub    f18, f2, f6
        LFD     f28,  12 * SIZE(X)
        fsub    f19, f3, f7
        LFD     f29,  13 * SIZE(X)

        fabs    f12, f28
        LFD     f30,  14 * SIZE(X)
        fabs    f13, f29
        LFD     f31,  15 * SIZE(X)
        fabs    f14, f30
        fabs    f15, f31

        fsel    f0,  f16, f4,  f0
        fsel    f1,  f17, f5,  f1
        fsel    f2,  f18, f6,  f2
        fsel    f3,  f19, f7,  f3

        fadd    f20, f8,  f9
        fadd    f21, f10, f11
        fadd    f22, f12, f13
        LFD     f24,  16 * SIZE(X)
        fadd    f23, f14, f15
        LFD     f25,  17 * SIZE(X)

        fabs    f8,  f24
        LFD     f26,  18 * SIZE(X)
        fabs    f9,  f25
        LFD     f27,  19 * SIZE(X)
        fabs    f10, f26
        fabs    f11, f27

        fsub    f16, f0, f20
        fsub    f17, f1, f21
        fsub    f18, f2, f22
        LFD     f28,  20 * SIZE(X)
        fsub    f19, f3, f23
        LFD     f29,  21 * SIZE(X)

        fabs    f12, f28
        LFD     f30,  22 * SIZE(X)
        fabs    f13, f29
        LFD     f31,  23 * SIZE(X)
        fabs    f14, f30
        addi    X, X, 16 * SIZE
        fabs    f15, f31

        fsel    f0,  f16, f20, f0
        fsel    f1,  f17, f21, f1
        fsel    f2,  f18, f22, f2
        fsel    f3,  f19, f23, f3

        bdnz    LL(10)
        .align 4

/* Final unrolled trip: same work as LL(10) but without preloading a
   further group of elements. */
LL(20):
        fadd    f4,  f8,  f9
        fadd    f5,  f10, f11
        fadd    f6,  f12, f13
        LFD     f24,   8 * SIZE(X)
        fadd    f7,  f14, f15
        LFD     f25,   9 * SIZE(X)

        fabs    f8,  f24
        LFD     f26,  10 * SIZE(X)
        fabs    f9,  f25
        LFD     f27,  11 * SIZE(X)
        fabs    f10, f26
        fabs    f11, f27

        fsub    f16, f0, f4
        fsub    f17, f1, f5
        fsub    f18, f2, f6
        LFD     f28,  12 * SIZE(X)
        fsub    f19, f3, f7
        LFD     f29,  13 * SIZE(X)

        fabs    f12, f28
        LFD     f30,  14 * SIZE(X)
        fabs    f13, f29
        LFD     f31,  15 * SIZE(X)
        fabs    f14, f30
        fabs    f15, f31

        fsel    f0,  f16, f4,  f0
        fsel    f1,  f17, f5,  f1
        fsel    f2,  f18, f6,  f2
        fsel    f3,  f19, f7,  f3

        fadd    f20, f8,  f9
        fadd    f21, f10, f11
        fadd    f22, f12, f13
        fadd    f23, f14, f15

        fsub    f16, f0, f20
        fsub    f17, f1, f21
        fsub    f18, f2, f22
        fsub    f19, f3, f23

        fsel    f0,  f16, f20, f0
        fsel    f1,  f17, f21, f1
        fsel    f2,  f18, f22, f2
        fsel    f3,  f19, f23, f3
        addi    X, X, 16 * SIZE

        .align 4

/* Unit-stride remainder: N & 7 elements, one per trip, folded into f1. */
LL(50):
        andi.   r0,  N, 7
        mtspr   CTR, r0
        beq     LL(999)
        .align 4

LL(60):
        LFD     f8,  0 * SIZE(X)
        LFD     f9,  1 * SIZE(X)
        addi    X, X,  2 * SIZE

        fabs    f8, f8
        fabs    f9, f9
        fadd    f8, f8, f9
        fsub    f16, f1, f8
        fsel    f1, f16, f8, f1
        bdnz    LL(60)
        b       LL(999)
        .align 4

/* ---- general stride ----
   X already points at the second element.  Backing it up by INCXM1 lets
   each LFDX (X + INCXM1) fetch an element's first component and each
   LFDUX (X += INCX, then load) its second component. */
LL(100):
        sub     X, X, INCXM1

        srawi.  r0, N, 3
        mtspr   CTR,  r0
        beq-    LL(150)

        /* Preload four elements ... */
        LFDX    f24,   X, INCXM1
        LFDUX   f25,   X, INCX
        LFDX    f26,   X, INCXM1
        LFDUX   f27,   X, INCX
        LFDX    f28,   X, INCXM1
        LFDUX   f29,   X, INCX
        LFDX    f30,   X, INCXM1
        LFDUX   f31,   X, INCX

        fabs    f8,  f24
        fabs    f9,  f25
        fabs    f10, f26
        fabs    f11, f27
        fabs    f12, f28
        fabs    f13, f29
        fabs    f14, f30
        fabs    f15, f31

        /* ... and the raw values of the following four. */
        LFDX    f24,   X, INCXM1
        LFDUX   f25,   X, INCX
        LFDX    f26,   X, INCXM1
        LFDUX   f27,   X, INCX
        LFDX    f28,   X, INCXM1
        LFDUX   f29,   X, INCX
        LFDX    f30,   X, INCXM1
        LFDUX   f31,   X, INCX

        bdz     LL(120)
        .align 4

/* Steady state: folds the 8 elements already in registers into f0..f3
   while loading the next 8. */
LL(110):
        fadd    f4,  f8,  f9
        fadd    f5,  f10, f11
        fadd    f6,  f12, f13
        fadd    f7,  f14, f15

        fabs    f8,  f24
        fabs    f9,  f25
        fabs    f10, f26
        fabs    f11, f27

        LFDX    f24,   X, INCXM1
        LFDUX   f25,   X, INCX
        LFDX    f26,   X, INCXM1
        LFDUX   f27,   X, INCX

        fabs    f12, f28
        fabs    f13, f29
        fabs    f14, f30
        fabs    f15, f31

        LFDX    f28,   X, INCXM1
        LFDUX   f29,   X, INCX
        LFDX    f30,   X, INCXM1
        LFDUX   f31,   X, INCX

        fsub    f16, f0, f4
        fsub    f17, f1, f5
        fsub    f18, f2, f6
        fsub    f19, f3, f7

        fadd    f20, f8,  f9
        fadd    f21, f10, f11
        fadd    f22, f12, f13
        fadd    f23, f14, f15

        fabs    f8,  f24
        fabs    f9,  f25
        fabs    f10, f26
        fabs    f11, f27

        LFDX    f24,   X, INCXM1
        LFDUX   f25,   X, INCX
        LFDX    f26,   X, INCXM1
        LFDUX   f27,   X, INCX

        fsel    f0,  f16, f4,  f0
        fsel    f1,  f17, f5,  f1
        fsel    f2,  f18, f6,  f2
        fsel    f3,  f19, f7,  f3

        fabs    f12, f28
        fabs    f13, f29
        fabs    f14, f30
        fabs    f15, f31

        LFDX    f28,   X, INCXM1
        LFDUX   f29,   X, INCX
        LFDX    f30,   X, INCXM1
        LFDUX   f31,   X, INCX

        fsub    f16, f0, f20
        fsub    f17, f1, f21
        fsub    f18, f2, f22
        fsub    f19, f3, f23

        fsel    f0,  f16, f20, f0
        fsel    f1,  f17, f21, f1
        fsel    f2,  f18, f22, f2
        fsel    f3,  f19, f23, f3
        bdnz    LL(110)
        .align 4

/* Final strided trip: drains the 8 elements held in f8-f15 / f24-f31
   without issuing further loads. */
LL(120):
        fadd    f4,  f8,  f9
        fadd    f5,  f10, f11
        fadd    f6,  f12, f13
        fadd    f7,  f14, f15

        fabs    f8,  f24
        fabs    f9,  f25
        fabs    f10, f26
        fabs    f11, f27

        fabs    f12, f28
        fabs    f13, f29
        fabs    f14, f30
        fabs    f15, f31

        fsub    f16, f0, f4
        fsub    f17, f1, f5
        fsub    f18, f2, f6
        fsub    f19, f3, f7

        fadd    f20, f8,  f9
        fadd    f21, f10, f11
        fadd    f22, f12, f13
        fadd    f23, f14, f15

        fsel    f0,  f16, f4,  f0
        fsel    f1,  f17, f5,  f1
        fsel    f2,  f18, f6,  f2
        fsel    f3,  f19, f7,  f3

        fsub    f16, f0, f20
        fsub    f17, f1, f21
        fsub    f18, f2, f22
        fsub    f19, f3, f23

        fsel    f0,  f16, f20, f0
        fsel    f1,  f17, f21, f1
        fsel    f2,  f18, f22, f2
        fsel    f3,  f19, f23, f3
        .align 4

/* Strided remainder: N & 7 elements, one per trip, folded into f1. */
LL(150):
        andi.   r0,  N, 7
        mtspr   CTR, r0
        beq     LL(999)
        .align 4

LL(160):
        LFDX    f8,    X, INCXM1
        LFDUX   f9,    X, INCX

        fabs    f8, f8
        fabs    f9, f9
        fadd    f8, f8, f9
        fsub    f16, f1, f8
        fsel    f1, f16, f8, f1
        bdnz    LL(160)
        .align 4

/* Merge the four running minima: f0 = min(f0, f1), f2 = min(f2, f3),
   then f1 = min(f0, f2). */
LL(999):
        fsub    f8,  f0,  f1
        fsub    f9,  f2,  f3

        fsel    f0,  f8,  f1,  f0
        fsel    f2,  f9,  f3,  f2
        fsub    f8,  f0,  f2
        fsel    f1,  f8,  f2,  f0
        .align 4

/* Common exit: reload f14-f31, release the frame and return. */
LL(9999):
        lfd     f14,    0(SP)
        lfd     f15,    8(SP)
        lfd     f16,   16(SP)
        lfd     f17,   24(SP)

        lfd     f18,   32(SP)
        lfd     f19,   40(SP)
        lfd     f20,   48(SP)
        lfd     f21,   56(SP)

        lfd     f22,   64(SP)
        lfd     f23,   72(SP)
        lfd     f24,   80(SP)
        lfd     f25,   88(SP)

        lfd     f26,   96(SP)
        lfd     f27,  104(SP)
        lfd     f28,  112(SP)
        lfd     f29,  120(SP)

        lfd     f30,  128(SP)
        lfd     f31,  136(SP)

        addi    SP, SP, STACKSIZE
        blr

        EPILOGUE