/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Sum of absolute values of a strided vector (the reduction performed
 * here matches the BLAS "asum" operation).
 *
 * Inputs (register aliases are defined below):
 *   N    (r3)  element count.  With F_INTERFACE defined, r3 holds the
 *              address of the count, which is fetched through LDINT.
 *   X    (r4)  address of the first element.
 *   INCX (r5)  stride in elements (passed by address with F_INTERFACE).
 *              It is converted to a byte stride with BASE_SHIFT.
 *
 * Result:
 *   C1 (f1) = |x0| + |x1| + ... over the N strided elements.
 *   When N <= 0 or INCX <= 0 no element is read and the zero-initialised
 *   accumulators are returned.
 *
 * Register usage:
 *   r0, r6, r7, r10, CTR and cr0 are scratch.  f0-f13 are scratch;
 *   f14 and f15 are saved on the stack at entry and reloaded at LL(999).
 *   SP is moved down by 48 bytes at entry and restored before blr.
 *
 * Notes for the reader:
 *   - The fp* arithmetic, lfpd* / stfpd* and fsmtp mnemonics are paired
 *     FPU operations: every FP register carries a primary and a secondary
 *     value, so one fpabs/fpadd handles two elements.  The final fsmtp +
 *     fadd at LL(999) folds the secondary half into the primary half.
 *   - LFD, LFDX, LFDUX, LFSDUX, LFPDUX, LDINT, LL(), SIZE, BASE_SHIFT,
 *     PROLOGUE, PROFCODE and EPILOGUE come from common.h, which is not
 *     visible here.  NOTE(review): SIZE is assumed to be the element size
 *     in bytes and BASE_SHIFT its log2 - confirm against common.h.
 *   - The instruction order, the nops and the .align directives are part
 *     of a hand-tuned schedule.  Do not reorder without measuring.
 *   - Own-line comments use C comment syntax on purpose: this file goes
 *     through the C preprocessor, where a leading '#' starts a directive.
 */

#define ASSEMBLER
#include "common.h"

/* Arguments */
#define N	r3
#define X	r4
#define INCX	r5

/* INCX2 = 2 * byte stride; X2 = second load stream (strided path only) */
#define INCX2	r6
#define X2	r7

/* Four independent partial-sum accumulators */
#define C1	f1
#define C2	f0
#define C3	f2
#define C4	f3

/* Operands loaded from X */
#define A1	f4
#define A2	f5
#define A3	f6
#define A4	f7
#define A5	f8
#define A6	f9
#define A7	f10
#define A8	f11

/* Absolute values waiting to be accumulated (T3/T4 alias saved f14/f15) */
#define T1	f12
#define T2	f13
#define T3	f14
#define T4	f15


	PROLOGUE
	PROFCODE

	/* Save f14/f15 (both halves); each store moves SP down by 16. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* Push 16 bytes of zero; they are read back below as a paired 0.0. */
	li	r10,   0
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	lfpdx	C1, SP, r10		# Zero clear

	/* INCX becomes a byte stride; INCX2 is the stride of two elements. */
	slwi	INCX,  INCX, BASE_SHIFT
	add	INCX2, INCX, INCX

	fpmr	C2, C1
	fpmr	C3, C1
	fpmr	C4, C1

	/* Nothing to do for N <= 0 or INCX <= 0: return the zero sum. */
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, INCX, 0
	ble	LL(999)

	/* Non-unit stride is handled by the LL(100) path. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)

	/*
	 * Unit stride.  If X is not aligned to a two-element boundary,
	 * consume one element with scalar code so that the paired loads
	 * below start on an aligned address.
	 */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(05)

	LFD	C1, 0(X)
	addi	X, X, 1 * SIZE
	addi	N, N, -1
	cmpwi	cr0, N, 0
	fabs	C1, C1
	ble	LL(999)
	.align 4

/*
 * Unit stride, main loop: 16 elements per iteration as 8 paired loads.
 * X is biased by -INCX2 because every load is an update-form load that
 * first advances X by INCX2.  CTR = N / 16; cr0 from srawi. is still
 * valid at beq- because sub and mtspr do not touch it.
 */
LL(05):
	srawi.	r0, N, 4
	sub	X, X, INCX2
	mtspr	CTR,  r0
	beq-	LL(15)

	/* Preload the first 16 elements; T1-T4 start as zero (copy of C2). */
	LFPDUX	A1,    X, INCX2
	fpmr	T1, C2
	LFPDUX	A2,    X, INCX2
	fpmr	T2, C2
	LFPDUX	A3,    X, INCX2
	fpmr	T3, C2
	LFPDUX	A4,    X, INCX2
	fpmr	T4, C2
	LFPDUX	A5,    X, INCX2
	LFPDUX	A6,    X, INCX2
	LFPDUX	A7,    X, INCX2
	LFPDUX	A8,    X, INCX2
	bdz	LL(13)
	.align 4

/*
 * Steady state.  Each group: accumulate the previous absolute value,
 * take the absolute value of an already loaded pair, then reload that
 * register for the next iteration.
 * NOTE(review): the nops look like issue-slot padding - confirm before
 * removing any of them.
 */
LL(12):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1,    X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2,    X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3,    X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4,    X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5,    X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6,    X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7,    X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8,    X, INCX2
	bdnz	LL(12)
	.align 4

/* Drain: finish the 16 elements already in A1-A8, no further loads. */
LL(13):
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

/* Unit stride, tail: handle N & 15 as blocks of 8, 4, 2 and 1. */
LL(15):
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(16)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	LFPDUX	A3,    X, INCX2
	LFPDUX	A4,    X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(16):
	andi.	r0,  N, 4
	beq	LL(17)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(17):
	andi.	r0,  N, 2
	beq	LL(18)

	LFPDUX	A1,    X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	.align 4

/* Last odd element: non-update scalar load from X + INCX2. */
LL(18):
	andi.	r0,  N, 1
	beq	LL(999)

	LFDX	A1,    X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	b	LL(999)
	.align 4

/*
 * Strided path.  Two load streams, each stepping by INCX2:
 *   X  walks elements 0, 2, 4, ... (LFDUX)
 *   X2 walks elements 1, 3, 5, ... (LFSDUX in the main loop)
 * Both pointers are biased so that the first update-form load lands on
 * element 0 and element 1 respectively.  In the main loop an LFDUX and
 * an LFSDUX target the same A register, so one paired fpabs/fpadd
 * covers two elements.
 */
LL(100):
	sub	X2, X, INCX
	sub	X,  X, INCX2

	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(115)


	/* Preload: even elements into A1-A8, odd elements into A1-A4. */
	LFDUX	A1,    X,  INCX2
	fpmr	T1, C2
	LFDUX	A2,    X,  INCX2
	fpmr	T2, C2
	LFDUX	A3,    X,  INCX2
	fpmr	T3, C2
	LFDUX	A4,    X,  INCX2
	fpmr	T4, C2

	LFDUX	A5,    X,  INCX2
	LFSDUX	A1,    X2, INCX2

	LFDUX	A6,    X,  INCX2
	LFSDUX	A2,    X2, INCX2

	LFDUX	A7,    X,  INCX2
	LFSDUX	A3,    X2, INCX2

	LFDUX	A8,    X,  INCX2
	LFSDUX	A4,    X2, INCX2
	bdz	LL(113)
	.align 4

/*
 * Steady state, 16 elements per iteration.  The odd-element load for a
 * register is issued half an iteration before that register is used.
 */
LL(112):
	fpadd	C1, C1, T1
	LFSDUX	A5,    X2, INCX2
	fpabs	T1, A1
	LFDUX	A1,    X,  INCX2

	fpadd	C2, C2, T2
	LFSDUX	A6,    X2, INCX2
	fpabs	T2, A2
	LFDUX	A2,    X,  INCX2

	fpadd	C3, C3, T3
	LFSDUX	A7,    X2, INCX2
	fpabs	T3, A3
	LFDUX	A3,    X,  INCX2

	fpadd	C4, C4, T4
	LFSDUX	A8,    X2, INCX2
	fpabs	T4, A4
	LFDUX	A4,    X,  INCX2

	fpadd	C1, C1, T1
	LFSDUX	A1,    X2, INCX2
	fpabs	T1, A5
	LFDUX	A5,    X,  INCX2
	fpadd	C2, C2, T2
	LFSDUX	A2,    X2, INCX2
	fpabs	T2, A6
	LFDUX	A6,    X,  INCX2

	fpadd	C3, C3, T3
	LFSDUX	A3,    X2, INCX2
	fpabs	T3, A7
	LFDUX	A7,    X,  INCX2
	fpadd	C4, C4, T4
	LFSDUX	A4,    X2, INCX2
	fpabs	T4, A8
	LFDUX	A8,    X,  INCX2

	bdnz	LL(112)
	.align 4

/* Drain: only the odd halves of A5-A8 are still missing. */
LL(113):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFSDUX	A5,    X2, INCX2
	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFSDUX	A6,    X2, INCX2
	fpadd	C3, C3, T3

	nop
	fpabs	T3, A3
	LFSDUX	A7,    X2, INCX2
	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFSDUX	A8,    X2, INCX2

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

/*
 * Strided tail: N & 15 in blocks of 8, 4, 2 and 1.  Loads alternate
 * between X and X2 using LFDUX, and the arithmetic is scalar fabs/fadd,
 * one element per register.
 */
LL(115):
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(116)

	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	LFDUX	A5,    X,  INCX2
	fabs	T2, A2
	LFDUX	A6,    X2, INCX2
	fabs	T3, A3
	LFDUX	A7,    X,  INCX2
	fabs	T4, A4
	LFDUX	A8,    X2, INCX2

	fadd	C1, C1, T1
	fabs	T1, A5
	fadd	C2, C2, T2
	fabs	T2, A6

	fadd	C3, C3, T3
	fabs	T3, A7
	fadd	C4, C4, T4
	fabs	T4, A8

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(116):
	andi.	r0,  N, 4
	beq	LL(117)

	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fabs	T3, A3
	fabs	T4, A4

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(117):
	andi.	r0,  N, 2
	beq	LL(118)

	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fadd	C1, C1, T1
	fadd	C2, C2, T2
	.align 4

/* Last odd element: the next even-indexed slot, at X + INCX2. */
LL(118):
	andi.	r0,  N, 1
	beq	LL(999)

	LFDX	A1,    X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	.align 4

/*
 * Common exit.  Reduce the four paired accumulators into C1, reload
 * f15 then f14 (each update-form load moves SP up by 16, stepping over
 * the zero words first), release the last 16 bytes, and add the
 * secondary half of C1 (moved into C2 by fsmtp) to its primary half.
 */
LL(999):
	fpadd	C1,  C1,  C2
	li	r10, 16
	fpadd	C3,  C3,  C4
	fpadd	C1,  C1,  C3
	lfpdux	f15, SP, r10
	fsmtp	C2, C1
	lfpdux	f14, SP, r10
	addi	SP, SP,  16
	fadd	C1, C2, C1
	blr

	EPILOGUE