/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/
/*********************************************************************/

/*
 * Sum-of-absolute-values kernel built on paired floating-point
 * instructions (fpabs / fpadd / lfpdux / fsmtp).
 *
 * Inputs (register assignments come from the #defines below):
 *   N    (r3)  number of elements.  With F_INTERFACE defined the
 *              register holds an address and the value is fetched
 *              through LDINT.
 *   X    (r4)  address of the first element.
 *   INCX (r5)  element stride; same F_INTERFACE treatment as N.
 *
 * Each element is read as two consecutive SIZE-sized values: the byte
 * distance between elements is INCX2 = 2 * (INCX << BASE_SHIFT), and
 * every step of the code consumes one such pair.
 * NOTE(review): this layout looks like complex (re, im) data, i.e. a
 * zasum-style routine; BASE_SHIFT is presumably log2(SIZE).  Confirm
 * both against common.h and the kernel build rules.
 *
 * Result: the sum of the absolute values of all 2 * N values is in C1
 * (f1) when the routine executes blr.  When N <= 0 or the scaled INCX
 * is <= 0 the accumulators are never touched and the result is zero.
 *
 * Code paths:
 *   LL(05)  - LL(99)   scaled INCX == SIZE (contiguous data)
 *   LL(100) - LL(117)  other strides, X a multiple of 2 * SIZE
 *   LL(200) - LL(217)  other strides, X not a multiple of 2 * SIZE
 *   LL(999)            final reduction, register restore and return
 *
 * Registers written: r0, r3 - r8, r10, CTR, cr0, f0 - f15.  f14 and
 * f15 are stored on the stack on entry and reloaded before returning.
 */

#define ASSEMBLER
#include "common.h"

/* Arguments */
#define N	r3
#define X	r4
#define INCX	r5

/* Integer scratch */
#define INCX2	r6	/* 2 * scaled INCX: byte step between elements */
#define X2	r7	/* second pointer, X + SIZE (LL(100) onwards)  */
#define FLAG	r8	/* 1 when one trailing value is still pending  */

/* Four independent accumulators */
#define C1	f1
#define C2	f0
#define C3	f2
#define C4	f3

/* Load targets */
#define A1	f4
#define A2	f5
#define A3	f6
#define A4	f7
#define A5	f8
#define A6	f9
#define A7	f10
#define A8	f11

/* Absolute values waiting to be accumulated */
#define T1	f12
#define T2	f13
#define T3	f14
#define T4	f15

	PROLOGUE
	PROFCODE

	/* Push f14 and f15; each store moves SP down by 16 bytes. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* Push four zero words (16 bytes) to serve as a zero constant. */
	li	r10, 0
	stwu	r10, -4(SP)
	stwu	r10, -4(SP)
	stwu	r10, -4(SP)
	stwu	r10, -4(SP)

#ifdef F_INTERFACE
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	lfpdx	C1, SP, r10		/* C1 = zeros just pushed (r10 == 0) */

	slwi	INCX, INCX, BASE_SHIFT	/* scale the stride */
	add	INCX2, INCX, INCX	/* step between elements */

	fpmr	C2, C1			/* clear remaining accumulators */
	fpmr	C3, C1
	li	FLAG, 0
	fpmr	C4, C1

	/* Nothing to add up for N <= 0 or a stride <= 0. */
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, INCX, 0
	ble	LL(999)

	/* Every load below is an update form that adds INCX2 first. */
	sub	X, X, INCX2

	cmpwi	cr0, INCX, SIZE
	bne	LL(100)

	/* ---- contiguous data ------------------------------------- */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(05)

	/*
	 * X is not a multiple of 2 * SIZE.  Take the first value on
	 * its own, move X forward by SIZE and drop one pair from N;
	 * FLAG makes LL(99) pick up the single value left at the end.
	 */
	LFD	C1, 2 * SIZE(X)
	li	FLAG, 1
	addi	X, X, 1 * SIZE
	addi	N, N, -1
	cmpwi	cr0, N, 0
	fabs	C1, C1
	ble	LL(99)
	.align 4

LL(05):
	srawi.	r0, N, 3		/* blocks of eight pairs */
	mtspr	CTR, r0
	beq-	LL(15)

	/* Load the first block; T1-T4 start as zero (C2 is still 0). */
	LFPDUX	A1, X, INCX2
	fpmr	T1, C2
	LFPDUX	A2, X, INCX2
	fpmr	T2, C2
	LFPDUX	A3, X, INCX2
	fpmr	T3, C2
	LFPDUX	A4, X, INCX2
	fpmr	T4, C2
	LFPDUX	A5, X, INCX2
	LFPDUX	A6, X, INCX2
	LFPDUX	A7, X, INCX2
	LFPDUX	A8, X, INCX2
	bdz	LL(13)
	.align 4

	/*
	 * Pipelined loop: add the pending T value, take the absolute
	 * value of the pair loaded earlier, then reload that register
	 * for the next block.
	 */
LL(12):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1, X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2, X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3, X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4, X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5, X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6, X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7, X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8, X, INCX2
	bdnz	LL(12)
	.align 4

	/* Drain: the last block is loaded but not yet accumulated. */
LL(13):
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

	/* Remainder: N & 7 pairs, handled as 4 + 2 + 1. */
LL(15):
	andi.	r0, N, 7
	beq	LL(99)
	andi.	r0, N, 4
	beq	LL(16)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	LFPDUX	A3, X, INCX2
	LFPDUX	A4, X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(16):
	andi.	r0, N, 2
	beq	LL(17)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(17):
	andi.	r0, N, 1
	beq	LL(99)

	LFPDUX	A1, X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	.align 4

	/* With FLAG set one single value remains after the last pair. */
LL(99):
	cmpwi	cr0, FLAG, 0
	beq	LL(999)

	LFD	A1, 2 * SIZE(X)
	fabs	T1, A1
	fadd	C2, C2, T1
	b	LL(999)
	.align 4

	/* ---- non-contiguous stride -------------------------------- */
LL(100):
	addi	X2, X, SIZE		/* second value of each element */
	andi.	r0, X, 2 * SIZE - 1
	bne	LL(200)

	/* X is a multiple of 2 * SIZE: one LFPDUX per element. */
	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	LL(115)

	LFPDUX	A1, X, INCX2
	fpmr	T1, C2
	LFPDUX	A2, X, INCX2
	fpmr	T2, C2
	LFPDUX	A3, X, INCX2
	fpmr	T3, C2
	LFPDUX	A4, X, INCX2
	fpmr	T4, C2
	LFPDUX	A5, X, INCX2
	LFPDUX	A6, X, INCX2
	LFPDUX	A7, X, INCX2
	LFPDUX	A8, X, INCX2
	bdz	LL(113)
	.align 4

	/* Same pipelined loop shape as LL(12). */
LL(112):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1, X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2, X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3, X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4, X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5, X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6, X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7, X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8, X, INCX2
	bdnz	LL(112)
	.align 4

	/* Drain the last loaded block. */
LL(113):
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

	/* Remainder: N & 7 elements, handled as 4 + 2 + 1. */
LL(115):
	andi.	r0, N, 7
	beq	LL(999)
	andi.	r0, N, 4
	beq	LL(116)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	LFPDUX	A3, X, INCX2
	LFPDUX	A4, X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(116):
	andi.	r0, N, 2
	beq	LL(117)

	LFPDUX	A1, X, INCX2
	LFPDUX	A2, X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(117):
	andi.	r0, N, 1
	beq	LL(999)

	LFPDUX	A1, X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	b	LL(999)
	.align 4

	/*
	 * X is not a multiple of 2 * SIZE.  Each element is gathered
	 * with two separate loads: LFDUX through X for its first value
	 * and LFSDUX through X2 for its second value, both landing in
	 * the same A register before fpabs is applied to it.
	 * NOTE(review): this relies on LFDUX / LFSDUX filling the two
	 * different halves of the register pair - confirm in common.h.
	 */
LL(200):
	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	LL(215)

	/* Preload: first values of 8 elements, second values of 4. */
	LFDUX	A1, X, INCX2
	fpmr	T1, C2
	LFDUX	A2, X, INCX2
	fpmr	T2, C2
	LFDUX	A3, X, INCX2
	fpmr	T3, C2
	LFDUX	A4, X, INCX2
	fpmr	T4, C2

	LFDUX	A5, X, INCX2
	LFSDUX	A1, X2, INCX2

	LFDUX	A6, X, INCX2
	LFSDUX	A2, X2, INCX2

	LFDUX	A7, X, INCX2
	LFSDUX	A3, X2, INCX2

	LFDUX	A8, X, INCX2
	LFSDUX	A4, X2, INCX2
	bdz	LL(213)
	.align 4

	/*
	 * Pipelined loop.  The X2 stream runs four elements behind the
	 * X stream, so a register is complete only after its LFSDUX.
	 */
LL(212):
	fpadd	C1, C1, T1
	LFSDUX	A5, X2, INCX2
	fpabs	T1, A1
	LFDUX	A1, X, INCX2

	fpadd	C2, C2, T2
	LFSDUX	A6, X2, INCX2
	fpabs	T2, A2
	LFDUX	A2, X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A7, X2, INCX2
	fpabs	T3, A3
	LFDUX	A3, X, INCX2

	fpadd	C4, C4, T4
	LFSDUX	A8, X2, INCX2
	fpabs	T4, A4
	LFDUX	A4, X, INCX2

	fpadd	C1, C1, T1
	LFSDUX	A1, X2, INCX2
	fpabs	T1, A5
	LFDUX	A5, X, INCX2

	fpadd	C2, C2, T2
	LFSDUX	A2, X2, INCX2
	fpabs	T2, A6
	LFDUX	A6, X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A3, X2, INCX2
	fpabs	T3, A7
	LFDUX	A7, X, INCX2

	fpadd	C4, C4, T4
	LFSDUX	A4, X2, INCX2
	fpabs	T4, A8
	LFDUX	A8, X, INCX2

	bdnz	LL(212)
	.align 4

	/* Drain: fetch the four outstanding X2 values, then add up. */
LL(213):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFSDUX	A5, X2, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFSDUX	A6, X2, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFSDUX	A7, X2, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFSDUX	A8, X2, INCX2

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

	/*
	 * Remainder (4 + 2 + 1 elements).  Here every value gets its
	 * own register and the non-paired fabs / fadd are used.
	 */
LL(215):
	andi.	r0, N, 7
	beq	LL(999)
	andi.	r0, N, 4
	beq	LL(216)

	LFDUX	A1, X, INCX2
	LFDUX	A2, X2, INCX2
	LFDUX	A3, X, INCX2
	LFDUX	A4, X2, INCX2

	fabs	T1, A1
	LFDUX	A5, X, INCX2
	fabs	T2, A2
	LFDUX	A6, X2, INCX2
	fabs	T3, A3
	LFDUX	A7, X, INCX2
	fabs	T4, A4
	LFDUX	A8, X2, INCX2

	fadd	C1, C1, T1
	fabs	T1, A5
	fadd	C2, C2, T2
	fabs	T2, A6

	fadd	C3, C3, T3
	fabs	T3, A7
	fadd	C4, C4, T4
	fabs	T4, A8

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(216):
	andi.	r0, N, 2
	beq	LL(217)

	LFDUX	A1, X, INCX2
	LFDUX	A2, X2, INCX2
	LFDUX	A3, X, INCX2
	LFDUX	A4, X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fabs	T3, A3
	fabs	T4, A4

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(217):
	andi.	r0, N, 1
	beq	LL(999)

	LFDUX	A1, X, INCX2
	LFDUX	A2, X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fadd	C1, C1, T1
	fadd	C2, C2, T2
	.align 4

	/*
	 * Common exit: fold the four accumulators into C1, reload f15
	 * and f14 (the first reload also steps SP over the 16 zero
	 * bytes), release the last 16 bytes, then add the two halves
	 * of C1 into the scalar result.
	 */
LL(999):
	fpadd	C1, C1, C2
	li	r10, 16
	fpadd	C3, C3, C4
	fpadd	C1, C1, C3
	lfpdux	f15, SP, r10
	fsmtp	C2, C1			/* bring the other half of C1 over */
	lfpdux	f14, SP, r10
	addi	SP, SP, 16
	fadd	C1, C2, C1
	blr

	EPILOGUE