/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Index search over a strided vector of two-component elements.
 *
 * Each element is read as two consecutive SIZE-sized values (0*SIZE and
 * 1*SIZE from the element base); its key is |first| + |second|.
 *
 *   Pass 1 reduces all keys to a single extreme value in C1.  As the
 *          select sequence is written (fpsub F, T, C ; fpsel C, F, C, T)
 *          and with the usual PowerPC fsel rule "FRA >= 0 ? FRC : FRB",
 *          C keeps the SMALLER key, i.e. this looks like an
 *          index-of-minimum routine -- confirm against the build rule
 *          that names this kernel.
 *   Pass 2 rescans from the start and returns the 1-based position of
 *          the first element whose key compares equal to C1.
 *
 * In:   r3 = N     element count (pointer to it when F_INTERFACE is set)
 *       r4 = X     address of the first element
 *       r5 = INCX  element stride (pointer to it when F_INTERFACE is set)
 * Out:  r3 = 1-based index, or 0 when N <= 0 or INCX <= 0
 * Uses: r0, r6, r8, r9, r10, CTR, cr0, f0-f13 (unsaved), f14-f27
 *       (saved below SP on entry and restored on exit)
 *
 * The fp*-prefixed arithmetic, LFSDUX, fsmfp/fsmtp and fscmp are taken
 * to be the PPC440 FP2 ("double hummer") paired/secondary-half forms;
 * LFD/LFDUX/LFSDUX, SIZE, BASE_SHIFT, LDINT, LL, PROLOGUE, PROFCODE and
 * EPILOGUE come from common.h, which is not visible here.
 */

#define ASSEMBLER
#include "common.h"

/* Integer arguments (see header comment). */
#define N	r3
#define X	r4
#define INCX	r5

/* INCX2: byte step from one element to the next (later biased by -SIZE). */
#define INCX2	r6
/* X2 is defined but not referenced anywhere in this file. */
#define X2	r7

/* XX: read pointer for pass 2; RET: result index; NN: copy of original N. */
#define XX	r8
#define RET	r9
#define NN	r10

/* C1-C4: four running extreme-value accumulators (reduced into C1). */
#define C1	f1
#define C2	f0
#define C3	f2
#define C4	f3

/* A1-A8: raw loaded components. */
#define A1	f4
#define A2	f5
#define A3	f6
#define A4	f7
#define A5	f8
#define A6	f9
#define A7	f10
#define A8	f11

/* F1-F4: differences fed to the select instructions. */
#define F1	f12
#define F2	f13
#define F3	f14
#define F4	f15

/* T1-T4: per-element keys (pass 1) / absolute values (pass 2). */
#define T1	f16
#define T2	f17
#define T3	f18
#define T4	f19

/* B1-B8: absolute values (pass 1) / per-element keys (pass 2, B1-B4). */
#define B1	f20
#define B2	f21
#define B3	f22
#define B4	f23
#define B5	f24
#define B6	f25
#define B7	f26
#define B8	f27


	PROLOGUE
	PROFCODE

	/* Save f14-f27 below SP: each store steps SP by r10 = -16 first. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	stfpdux	f16, SP, r10
	stfpdux	f17, SP, r10
	stfpdux	f18, SP, r10
	stfpdux	f19, SP, r10

	stfpdux	f20, SP, r10
	stfpdux	f21, SP, r10
	stfpdux	f22, SP, r10
	stfpdux	f23, SP, r10

	stfpdux	f24, SP, r10
	stfpdux	f25, SP, r10
	stfpdux	f26, SP, r10
	stfpdux	f27, SP, r10

#ifdef F_INTERFACE
	/* With F_INTERFACE, N and INCX arrive as addresses; load the values. */
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* INCX <<= BASE_SHIFT (presumably elements -> bytes of one component); */
	/* INCX2 = 2 * INCX = byte distance between consecutive elements.       */
	slwi	INCX,  INCX, BASE_SHIFT
	add	INCX2, INCX, INCX

	/* Return 0 for N <= 0 or INCX <= 0.  NN keeps the full count for pass 2 */
	/* (mr does not touch cr0, so the second ble still tests INCX).          */
	li	RET, 0
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, INCX, 0
	mr	NN, N
	ble	LL(999)

	mr	XX, X			/* remember the start for pass 2 */

	/* Element 1 seeds the accumulator: C1 = |A1| + |A2|. */
	LFD	A1, 0 * SIZE(X)
	LFD	A2, 1 * SIZE(X)
	add	X, X, INCX2
	li	RET, 1

	fabs	A1, A1
	fabs	A2, A2

	/* From here on INCX2 = stride - SIZE: the first component is fetched */
	/* with step INCX2 and the second with step INCX (= SIZE, set below). */
	subi	INCX2, INCX2, SIZE

	/* N now counts the elements after the first; N == 1 returns index 1. */
	addi	N, N, -1
	cmpwi	cr0, N, 0
	fadd	C1, A1, A2
	ble	LL(999)

	/* Replicate the seed into both halves of C1 and into C2-C4.           */
	/* X -= INCX2 pre-biases the pointer so the first update-form load     */
	/* (step INCX2) lands on the first component of element 2.             */
	fsmfp	C1, C1
	li	INCX, SIZE
	fpmr	C2, C1
	sub	X, X, INCX2
	fpmr	C3, C1
	srawi.	r0, N, 3		/* CTR = N / 8 groups of eight elements */
	fpmr	C4, C1
	mtspr	CTR,  r0
	beq-	LL(105)			/* fewer than 8 left: tails only */

	/* Preload one group of eight elements.  Primary halves of A1..A4 get  */
	/* elements k, k+1; secondary halves get k+2, k+3; A5..A8 likewise     */
	/* hold k+4..k+7.  Odd-numbered registers hold first components, even  */
	/* ones hold second components.                                        */
	LFDUX	A1,    X, INCX2
	LFDUX	A2,    X, INCX
	LFDUX	A3,    X, INCX2
	LFDUX	A4,    X, INCX

	LFSDUX	A1,    X, INCX2
	LFSDUX	A2,    X, INCX
	LFSDUX	A3,    X, INCX2
	LFSDUX	A4,    X, INCX

	LFDUX	A5,    X, INCX2
	LFDUX	A6,    X, INCX
	LFDUX	A7,    X, INCX2
	LFDUX	A8,    X, INCX

	LFSDUX	A5,    X, INCX2
	LFSDUX	A6,    X, INCX
	LFSDUX	A7,    X, INCX2
	LFSDUX	A8,    X, INCX
	bdz	LL(103)			/* only one group: go straight to the drain */
	.align 4

/* Pass 1 main loop: process the loaded group while loading the next one. */
/* Each A register is consumed (fpabs into B) before its reload is issued. */
LL(102):
	fpabs	B1, A1
	LFDUX	A1,    X, INCX2
	fpabs	B2, A2
	LFDUX	A2,    X, INCX
	fpabs	B3, A3
	LFDUX	A3,    X, INCX2
	fpabs	B4, A4
	LFDUX	A4,    X, INCX

	fpabs	B5, A5
	LFSDUX	A1,    X, INCX2
	fpabs	B6, A6
	LFSDUX	A2,    X, INCX
	fpabs	B7, A7
	LFSDUX	A3,    X, INCX2
	fpabs	B8, A8
	LFSDUX	A4,    X, INCX

	/* T = |first| + |second| for two elements per register. */
	fpadd	T1, B1, B2
	LFDUX	A5,    X, INCX2
	fpadd	T2, B3, B4
	LFDUX	A6,    X, INCX
	fpadd	T3, B5, B6
	LFDUX	A7,    X, INCX2
	fpadd	T4, B7, B8
	LFDUX	A8,    X, INCX

	/* F = T - C; the select below keeps C where F >= 0, else takes T. */
	fpsub	F1, T1, C1
	LFSDUX	A5,    X, INCX2
	fpsub	F2, T2, C2
	LFSDUX	A6,    X, INCX
	fpsub	F3, T3, C3
	LFSDUX	A7,    X, INCX2
	fpsub	F4, T4, C4
	LFSDUX	A8,    X, INCX

	fpsel	C1, F1, C1, T1
	fpsel	C2, F2, C2, T2
	fpsel	C3, F3, C3, T3
	fpsel	C4, F4, C4, T4
	bdnz	LL(102)
	.align 4

/* Pass 1 drain: same arithmetic for the last loaded group, no more loads. */
LL(103):
	fpabs	B1, A1
	fpabs	B2, A2
	fpabs	B3, A3
	fpabs	B4, A4

	fpabs	B5, A5
	fpabs	B6, A6
	fpabs	B7, A7
	fpabs	B8, A8

	fpadd	T1, B1, B2
	fpadd	T2, B3, B4
	fpadd	T3, B5, B6
	fpadd	T4, B7, B8

	fpsub	F1, T1, C1
	fpsub	F2, T2, C2
	fpsub	F3, T3, C3
	fpsub	F4, T4, C4

	fpsel	C1, F1, C1, T1
	fpsel	C2, F2, C2, T2
	fpsel	C3, F3, C3, T3
	fpsel	C4, F4, C4, T4
	.align 4

/* Pass 1 tails: handle the remaining N & 7 elements as 4, then 2, then 1. */
LL(105):
	andi.	r0,  N, 7
	beq	LL(120)

	andi.	r0,  N, 4
	beq	LL(106)

	/* Four elements, folded into C1 and C2. */
	LFDUX	A1,    X, INCX2
	LFDUX	A2,    X, INCX
	LFDUX	A3,    X, INCX2
	LFDUX	A4,    X, INCX

	LFSDUX	A1,    X, INCX2
	LFSDUX	A2,    X, INCX
	LFSDUX	A3,    X, INCX2
	LFSDUX	A4,    X, INCX

	fpabs	A1, A1
	fpabs	A2, A2
	fpabs	A3, A3
	fpabs	A4, A4

	fpadd	A1, A1, A2
	fpadd	A3, A3, A4

	fpsub	F1, A1, C1
	fpsub	F2, A3, C2

	fpsel	C1, F1, C1, A1
	fpsel	C2, F2, C2, A3
	.align 4

LL(106):
	andi.	r0,  N, 2
	beq	LL(107)

	/* Two elements: one in each half of A1/A2, folded into C1. */
	LFDUX	A1,    X, INCX2
	LFDUX	A2,    X, INCX
	LFSDUX	A1,    X, INCX2
	LFSDUX	A2,    X, INCX

	fpabs	A1, A1
	fpabs	A2, A2

	fpadd	A1, A1, A2

	fpsub	F1, A1, C1
	fpsel	C1, F1, C1, A1
	.align 4

LL(107):
	andi.	r0,  N, 1
	beq	LL(120)

	/* Last single element: scalar ops, so only the primary half of C1 */
	/* is updated here.                                                */
	LFDUX	A1,    X, INCX2
	LFDUX	A2,    X, INCX

	fabs	A1, A1
	fabs	A2, A2

	fadd	A1, A1, A2

	fsub	F1, A1, C1
	fsel	C1, F1, C1, A1
	.align 4

/* Reduce the four paired accumulators to one scalar, then broadcast it. */
LL(120):
	fpsub	F1,  C2,  C1
	fpsub	F2,  C4,  C3

	fpsel	C1,  F1,  C1,  C2
	fpsel	C3,  F2,  C3,  C4

	fpsub	F1,  C3,  C1
	fpsel	C1,  F1,  C1,  C3

	/* Bring C1's secondary half into C2's primary half and fold it in. */
	fsmtp	C2, C1

	li	RET, 0			/* restart index counting for pass 2 */
	fsub	F1,  C2,  C1
	fsel	C1,  F1,  C1,  C2

	fsmfp	C1, C1			/* both halves of C1 = final target value */

	/* Pre-bias XX (INCX2 is stride - SIZE here) so that the first        */
	/* update-form load lands on the first component of element 1.        */
	sub	XX, XX, INCX2

	/*
	 * Pass 2: walk all NN elements again and stop at the first key that
	 * compares equal to C1.  RET is bumped immediately before each compare,
	 * so on a taken branch to LL(999) it is the 1-based index of the element
	 * just tested.
	 */
	srawi.	r0, NN, 3		/* CTR = NN / 8 groups of eight elements */
	mtspr	CTR,  r0
	beq-	LL(125)

	/* Preload eight elements; same register layout as in pass 1. */
	LFDUX	A1,    XX, INCX2
	LFDUX	A2,    XX, INCX
	LFDUX	A3,    XX, INCX2
	LFDUX	A4,    XX, INCX

	LFSDUX	A1,    XX, INCX2
	LFSDUX	A2,    XX, INCX
	LFSDUX	A3,    XX, INCX2
	LFSDUX	A4,    XX, INCX

	LFDUX	A5,    XX, INCX2
	LFDUX	A6,    XX, INCX
	LFDUX	A7,    XX, INCX2
	LFDUX	A8,    XX, INCX

	LFSDUX	A5,    XX, INCX2
	LFSDUX	A6,    XX, INCX
	LFSDUX	A7,    XX, INCX2
	LFSDUX	A8,    XX, INCX

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	/* B1 = keys of elements (k, k+2), B2 = keys of (k+1, k+3). */
	fpadd	B1, T1, T2
	fpadd	B2, T3, T4

	bdz	LL(123)
	.align 4

/* Pass 2 main loop.  Test order within a group of eight:                 */
/*   B1 primary, B2 primary, B1 secondary, B2 secondary  (elements k..k+3) */
/*   B3 primary, B4 primary, B3 secondary, B4 secondary  (elements k+4..7) */
/* fcmpu tests the primary half; fscmp is presumably the secondary-half    */
/* compare.  Loads for the next group are interleaved with the tests.      */
LL(122):
	LFDUX	A1,    XX, INCX2
	fpabs	T1, A5
	addi	RET, RET, 1
	fcmpu	cr0, C1, B1
	LFDUX	A2,    XX, INCX
	beq	cr0, LL(999)

	LFDUX	A3,    XX, INCX2
	fpabs	T2, A6
	addi	RET, RET, 1
	fcmpu	cr0, C1, B2
	LFDUX	A4,    XX, INCX
	beq	cr0, LL(999)

	LFSDUX	A1,    XX, INCX2
	fpabs	T3, A7
	addi	RET, RET, 1
	fscmp	cr0, C1, B1
	LFSDUX	A2,    XX, INCX
	beq	cr0, LL(999)

	LFSDUX	A3,    XX, INCX2
	fpabs	T4, A8
	addi	RET, RET, 1
	fscmp	cr0, C1, B2
	LFSDUX	A4,    XX, INCX
	beq	cr0, LL(999)

	/* Keys for the second half of the current group (from A5..A8). */
	fpadd	B3, T1, T2
	fpadd	B4, T3, T4

	LFDUX	A5,    XX, INCX2
	fpabs	T1, A1
	addi	RET, RET, 1
	fcmpu	cr0, C1, B3
	LFDUX	A6,    XX, INCX
	beq	cr0, LL(999)

	LFDUX	A7,    XX, INCX2
	fpabs	T2, A2
	addi	RET, RET, 1
	fcmpu	cr0, C1, B4
	LFDUX	A8,    XX, INCX
	beq	cr0, LL(999)

	LFSDUX	A5,    XX, INCX2
	fpabs	T3, A3
	addi	RET, RET, 1
	fscmp	cr0, C1, B3
	LFSDUX	A6,    XX, INCX
	beq	cr0, LL(999)

	LFSDUX	A7,    XX, INCX2
	fpabs	T4, A4
	addi	RET, RET, 1
	fscmp	cr0, C1, B4
	LFSDUX	A8,    XX, INCX
	beq	cr0, LL(999)

	/* Keys for the first half of the next group (from the new A1..A4). */
	fpadd	B1, T1, T2
	fpadd	B2, T3, T4
	bdnz	LL(122)
	.align 4

/* Pass 2 drain: test the last loaded group of eight without further loads. */
LL(123):
	fpabs	T1, A5
	addi	RET, RET, 1
	fcmpu	cr0, C1, B1
	beq	cr0, LL(999)

	fpabs	T2, A6
	addi	RET, RET, 1
	fcmpu	cr0, C1, B2
	beq	cr0, LL(999)

	fpabs	T3, A7
	addi	RET, RET, 1
	fscmp	cr0, C1, B1
	beq	cr0, LL(999)

	fpabs	T4, A8
	addi	RET, RET, 1
	fscmp	cr0, C1, B2
	beq	cr0, LL(999)

	fpadd	B3, T1, T2
	fpadd	B4, T3, T4

	addi	RET, RET, 1
	fcmpu	cr0, C1, B3
	beq	cr0, LL(999)

	addi	RET, RET, 1
	fcmpu	cr0, C1, B4
	beq	cr0, LL(999)

	addi	RET, RET, 1
	fscmp	cr0, C1, B3
	beq	cr0, LL(999)

	addi	RET, RET, 1
	fscmp	cr0, C1, B4
	beq	cr0, LL(999)
	.align 4

/* Pass 2 tails: remaining NN & 7 elements as 4, then 2, then (implicitly) 1. */
LL(125):
	andi.	r0,  NN, 4
	beq	LL(126)

	LFDUX	A1,    XX, INCX2
	LFDUX	A2,    XX, INCX
	LFDUX	A3,    XX, INCX2
	LFDUX	A4,    XX, INCX

	LFSDUX	A1,    XX, INCX2
	LFSDUX	A2,    XX, INCX
	LFSDUX	A3,    XX, INCX2
	LFSDUX	A4,    XX, INCX

	fpabs	A1, A1
	fpabs	A2, A2
	fpabs	A3, A3
	fpabs	A4, A4

	/* A1 = keys of (k, k+2), A3 = keys of (k+1, k+3). */
	fpadd	A1, A1, A2
	fpadd	A3, A3, A4

	addi	RET, RET, 1
	fcmpu	cr0, C1, A1
	beq	cr0, LL(999)

	addi	RET, RET, 1
	fcmpu	cr0, C1, A3
	beq	cr0, LL(999)

	addi	RET, RET, 1
	fscmp	cr0, C1, A1
	beq	cr0, LL(999)

	addi	RET, RET, 1
	fscmp	cr0, C1, A3
	beq	cr0, LL(999)
	.align 4

LL(126):
	andi.	r0,  NN, 2
	beq	LL(127)

	/* Two elements handled with scalar (primary-half) operations. */
	LFDUX	A1,    XX, INCX2
	LFDUX	A2,    XX, INCX
	LFDUX	A3,    XX, INCX2
	LFDUX	A4,    XX, INCX

	fabs	A1, A1
	fabs	A2, A2
	fabs	A3, A3
	fabs	A4, A4

	fadd	A1, A1, A2
	fadd	A3, A3, A4

	addi	RET, RET, 1
	fcmpu	cr0, C1, A1
	beq	cr0, LL(999)

	addi	RET, RET, 1
	fcmpu	cr0, C1, A3
	beq	cr0, LL(999)
	.align 4

/* Nothing matched so far: the answer is taken to be the one remaining     */
/* element, so RET is bumped without loading or comparing it.              */
/* NOTE(review): this increment is unconditional.  If NN is even and no    */
/* compare above hit, the result is NN + 1.  That presumably needs a NaN   */
/* in the data (no key then compares equal to C1) -- confirm whether       */
/* callers can see this.                                                   */
LL(127):
	addi	RET, RET, 1
	.align 4

/* Common exit: return RET in r3 and restore f27..f14 in reverse order.   */
/* SP is stepped back by 16 first because each load pre-increments by 16. */
LL(999):
	li	r10, 16
	addi	SP, SP, -16
	mr	r3, RET

	lfpdux	f27, SP, r10
	lfpdux	f26, SP, r10
	lfpdux	f25, SP, r10
	lfpdux	f24, SP, r10

	lfpdux	f23, SP, r10
	lfpdux	f22, SP, r10
	lfpdux	f21, SP, r10
	lfpdux	f20, SP, r10

	lfpdux	f19, SP, r10
	lfpdux	f18, SP, r10
	lfpdux	f17, SP, r10
	lfpdux	f16, SP, r10

	lfpdux	f15, SP, r10
	lfpdux	f14, SP, r10
	addi	SP, SP,  16
	blr

	EPILOGUE