/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/
/*********************************************************************/

/*
 * Sum of absolute values of a strided vector, written for the paired
 * ("hummer") PowerPC floating-point unit.
 *
 * Adds up |x[i]| for N elements that start at X and are INCX elements
 * apart, and leaves the total in f1 (C1) when it returns.
 * NOTE(review): this looks like the BLAS ?ASUM kernel; the exported
 * symbol name comes from PROLOGUE in common.h and is not visible here.
 *
 * In:   r3 = N, r4 = X, r5 = INCX.  With F_INTERFACE defined, r3 and r5
 *       hold addresses and the values are fetched through LDINT.
 * Out:  f1 = accumulated sum.  The accumulators start at zero, so the
 *       early exits for N <= 0 and INCX <= 0 return zero.
 * Uses: r0, r6, r7, r10, CTR, cr0, f0-f15.  f14 and f15 are spilled to
 *       the stack on entry and reloaded before the return; SP is put
 *       back to its entry value.
 *
 * Layout of the routine:
 *   LL(05)..LL(18)    INCX == 1 element: paired loads, 16 elements per
 *                     trip of the main loop, then 8/4/2/1 tails.
 *   LL(100)..LL(118)  any other positive INCX: two pointers walk the
 *                     even and the odd elements separately.
 *   LL(999)           reduction of the four accumulators and return.
 *
 * The instruction order inside the loops is hand scheduled; the nops
 * are presumably issue-slot padding and do not affect the result.
 */

#define ASSEMBLER
#include "common.h"

/* Incoming arguments */
#define N	r3
#define X	r4
#define INCX	r5

/* INCX2 = 2 * INCX in bytes; X2 = second pointer of the strided path */
#define INCX2	r6
#define X2	r7

/* Four independent accumulators (C1 is also the return register) */
#define C1	f1
#define C2	f0
#define C3	f2
#define C4	f3

/* Load targets */
#define A1	f4
#define A2	f5
#define A3	f6
#define A4	f7
#define A5	f8
#define A6	f9
#define A7	f10
#define A8	f11

/* Absolute values waiting to be accumulated */
#define T1	f12
#define T2	f13
#define T3	f14
#define T4	f15

	PROLOGUE
	PROFCODE

/* ------------------------------------------------------------------ */
/* Entry: spill f14/f15, build a 16-byte zero block, clear the sums    */
/* ------------------------------------------------------------------ */

	li	r10, -16

	/* Each store first moves SP down by 16, then writes the pair. */
	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* Push four zero words: 16 bytes of zero now sit at 0(SP). */
	li	r10, 0
	stwu	r10, -4(SP)
	stwu	r10, -4(SP)
	stwu	r10, -4(SP)
	stwu	r10, -4(SP)

#ifdef F_INTERFACE
	/* Arguments are passed by reference: fetch the actual values. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* r10 is still 0, so this reads the zero block: C1 = 0. */
	lfpdx	C1, SP, r10

	/* Scale the stride by BASE_SHIFT (presumably log2(SIZE), giving  */
	/* bytes) and keep twice that value for the update-form loads.    */
	slwi	INCX, INCX, BASE_SHIFT
	add	INCX2, INCX, INCX

	fpmr	C2, C1
	fpmr	C3, C1
	fpmr	C4, C1

	/* Nothing to add for N <= 0 or a non-positive stride. */
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, INCX, 0
	ble	LL(999)

	/* A stride other than one element takes the two-pointer path. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)

/* ------------------------------------------------------------------ */
/* Unit stride                                                         */
/* ------------------------------------------------------------------ */

	/* If X is not a multiple of 2 * SIZE, peel one element with      */
	/* scalar instructions so the paired loads below start on a       */
	/* 2 * SIZE boundary.                                             */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(05)

	LFD	C1, 0(X)
	addi	X, X, 1 * SIZE
	addi	N, N, -1
	cmpwi	cr0, N, 0
	fabs	C1, C1
	ble	LL(999)			/* that was the only element */
	.align 4

LL(05):
	/* CTR = N / 16.  X is moved back by INCX2 because every          */
	/* update-form load advances X by INCX2 before it reads.          */
	srawi.	r0, N, 4
	sub	X, X, INCX2
	mtspr	CTR, r0
	beq-	LL(15)

	/* Pipeline fill: load the first 8 pairs, start T1..T4 at zero    */
	/* (C2 is still zero here).                                       */
	LFPDUX	A1, X, INCX2
	fpmr	T1, C2
	LFPDUX	A2, X, INCX2
	fpmr	T2, C2
	LFPDUX	A3, X, INCX2
	fpmr	T3, C2
	LFPDUX	A4, X, INCX2
	fpmr	T4, C2
	LFPDUX	A5, X, INCX2
	LFPDUX	A6, X, INCX2
	LFPDUX	A7, X, INCX2
	LFPDUX	A8, X, INCX2
	bdz	LL(13)
	.align 4

LL(12):
	/* Steady state: add the previous |.|, take |.| of the current    */
	/* pair, and reload that register for the next trip.              */
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1, X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2, X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3, X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4, X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5, X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6, X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7, X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8, X, INCX2
	bdnz	LL(12)
	.align 4

LL(13):
	/* Pipeline drain: consume the 8 pairs already loaded. */
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(15):
	/* Tail of N mod 16, handled by testing bits 8, 4, 2 and 1. */
	andi.	r0, N, 15
	beq	LL(999)
	andi.	r0, N, 8
	beq	LL(16)

	/* 8 elements = 4 pairs */
	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	LFPDUX	A3, X, INCX2
	LFPDUX	A4, X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(16):
	andi.	r0, N, 4
	beq	LL(17)

	/* 4 elements = 2 pairs */
	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(17):
	andi.	r0, N, 2
	beq	LL(18)

	/* 2 elements = 1 pair */
	LFPDUX	A1, X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	.align 4

LL(18):
	andi.	r0, N, 1
	beq	LL(999)

	/* Last single element, read at X + INCX2 without updating X. */
	LFDX	A1, X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	b	LL(999)
	.align 4

/* ------------------------------------------------------------------ */
/* General positive stride                                             */
/* ------------------------------------------------------------------ */

LL(100):
	/* Both pointers step by INCX2.  After the pre-decrement the      */
	/* first update-form load through X reads element 0 and the first */
	/* one through X2 reads element 1, so X walks the even elements   */
	/* and X2 the odd ones.                                           */
	sub	X2, X, INCX
	sub	X, X, INCX2

	srawi.	r0, N, 4		/* CTR = N / 16 */
	mtspr	CTR, r0
	beq-	LL(115)

	/* Pipeline fill.  LFDUX loads through X and LFSDUX through X2;   */
	/* NOTE(review): presumably these fill the primary and the        */
	/* secondary half of the same register, so that each A register   */
	/* ends up holding two neighbouring elements - confirm against    */
	/* the macro definitions in common.h.                             */
	LFDUX	A1, X, INCX2
	fpmr	T1, C2
	LFDUX	A2, X, INCX2
	fpmr	T2, C2
	LFDUX	A3, X, INCX2
	fpmr	T3, C2
	LFDUX	A4, X, INCX2
	fpmr	T4, C2

	LFDUX	A5, X, INCX2
	LFSDUX	A1, X2, INCX2

	LFDUX	A6, X, INCX2
	LFSDUX	A2, X2, INCX2

	LFDUX	A7, X, INCX2
	LFSDUX	A3, X2, INCX2

	LFDUX	A8, X, INCX2
	LFSDUX	A4, X2, INCX2
	bdz	LL(113)
	.align 4

LL(112):
	/* Steady state: A5..A8 get their X2 half while A1..A4 are        */
	/* consumed and reloaded through X, then the roles swap.          */
	fpadd	C1, C1, T1
	LFSDUX	A5, X2, INCX2
	fpabs	T1, A1
	LFDUX	A1, X, INCX2

	fpadd	C2, C2, T2
	LFSDUX	A6, X2, INCX2
	fpabs	T2, A2
	LFDUX	A2, X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A7, X2, INCX2
	fpabs	T3, A3
	LFDUX	A3, X, INCX2

	fpadd	C4, C4, T4
	LFSDUX	A8, X2, INCX2
	fpabs	T4, A4
	LFDUX	A4, X, INCX2

	fpadd	C1, C1, T1
	LFSDUX	A1, X2, INCX2
	fpabs	T1, A5
	LFDUX	A5, X, INCX2
	fpadd	C2, C2, T2
	LFSDUX	A2, X2, INCX2
	fpabs	T2, A6
	LFDUX	A6, X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A3, X2, INCX2
	fpabs	T3, A7
	LFDUX	A7, X, INCX2
	fpadd	C4, C4, T4
	LFSDUX	A4, X2, INCX2
	fpabs	T4, A8
	LFDUX	A8, X, INCX2

	bdnz	LL(112)
	.align 4

LL(113):
	/* Pipeline drain: finish the X2 loads for A5..A8, then consume   */
	/* everything that is still in flight.                            */
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFSDUX	A5, X2, INCX2
	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFSDUX	A6, X2, INCX2
	fpadd	C3, C3, T3

	nop
	fpabs	T3, A3
	LFSDUX	A7, X2, INCX2
	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFSDUX	A8, X2, INCX2

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(115):
	/* Tail of N mod 16.  From here on only scalar fabs/fadd are      */
	/* used, one element per register, alternating between X and X2.  */
	andi.	r0, N, 15
	beq	LL(999)
	andi.	r0, N, 8
	beq	LL(116)

	/* 8 elements */
	LFDUX	A1, X, INCX2
	LFDUX	A2, X2, INCX2
	LFDUX	A3, X, INCX2
	LFDUX	A4, X2, INCX2

	fabs	T1, A1
	LFDUX	A5, X, INCX2
	fabs	T2, A2
	LFDUX	A6, X2, INCX2
	fabs	T3, A3
	LFDUX	A7, X, INCX2
	fabs	T4, A4
	LFDUX	A8, X2, INCX2

	fadd	C1, C1, T1
	fabs	T1, A5
	fadd	C2, C2, T2
	fabs	T2, A6

	fadd	C3, C3, T3
	fabs	T3, A7
	fadd	C4, C4, T4
	fabs	T4, A8

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(116):
	andi.	r0, N, 4
	beq	LL(117)

	/* 4 elements */
	LFDUX	A1, X, INCX2
	LFDUX	A2, X2, INCX2
	LFDUX	A3, X, INCX2
	LFDUX	A4, X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fabs	T3, A3
	fabs	T4, A4

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(117):
	andi.	r0, N, 2
	beq	LL(118)

	/* 2 elements */
	LFDUX	A1, X, INCX2
	LFDUX	A2, X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fadd	C1, C1, T1
	fadd	C2, C2, T2
	.align 4

LL(118):
	andi.	r0, N, 1
	beq	LL(999)

	/* Last single element, read at X + INCX2 without updating X. */
	LFDX	A1, X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	.align 4

/* ------------------------------------------------------------------ */
/* Reduction and return                                                */
/* ------------------------------------------------------------------ */

LL(999):
	/* Collapse C1..C4 into C1.  The two reloads step SP up by 16     */
	/* each: the first skips the zero block and restores f15, the     */
	/* second restores f14; the final addi returns SP to its entry    */
	/* value.                                                         */
	fpadd	C1, C1, C2
	li	r10, 16
	fpadd	C3, C3, C4
	fpadd	C1, C1, C3
	lfpdux	f15, SP, r10
	/* Copy the secondary half of C1 into C2 and add it to the        */
	/* primary half, so the scalar result in f1 covers both halves.   */
	fsmtp	C2, C1
	lfpdux	f14, SP, r10
	addi	SP, SP, 16
	fadd	C1, C2, C1
	blr

	EPILOGUE