/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Euclidean-norm kernel.
 *
 * Computes sqrt(x[0]^2 + x[1]^2 + ... + x[N-1]^2) and leaves the value,
 * rounded to single precision, in f1.
 *
 * Inputs:
 *   N    (r3)  number of elements.
 *   X    (r4)  address of the first element.
 *   INCX (r5)  distance between consecutive elements, counted in
 *              elements.
 *   When F_INTERFACE is defined, r3 and r5 hold the addresses of N and
 *   INCX instead and the values are loaded with LDINT.
 *
 * Behaviour:
 *   - N <= 0 or INCX <= 0 yields 0.0 (f1 is preloaded with zero before
 *     the checks).
 *   - Squares are accumulated with fmadd into 16 independent partial
 *     sums (f0-f15); elements are fetched 16 at a time into f16-f31.
 *   - f14-f31 are saved in the local frame on entry and restored on
 *     exit, so the caller sees them unchanged.
 *
 * NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, LL, LFD, LFDUX, LDINT,
 * L1_PREFETCH, SIZE and BASE_SHIFT come from common.h, which is not
 * visible here.  The code compares the shifted INCX against SIZE, so
 * BASE_SHIFT is presumably log2(SIZE) and SIZE the element size in
 * bytes -- confirm against common.h.
 */

#define N	r3
#define X	r4
#define INCX	r5

#define PREA	r8

/* Scratch word in the local frame used to materialise 0.0 in an FPR. */
#define FZERO	144(SP)

#define STACKSIZE 160

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r10,   0

	/* Save f14-f31; they are used as accumulators / load targets. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* f1 = 0.0: both the early-exit result and the accumulator seed. */
	stw	r10,   FZERO
	lfs	f1,    FZERO

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Scale the stride by the element size (see NOTE above). */
	slwi	INCX, INCX, BASE_SHIFT

	/* Prefetch distance: 4 * 16 elements ahead of X. */
	li	PREA, 4 * 16 * SIZE

	/* Nothing to do for an empty vector or a non-positive stride. */
	cmpwi	cr0, N, 0
	ble-	LL(9999)
	cmpwi	cr0, INCX, 0
	ble-	LL(9999)

	/* Clear the remaining partial sums (f1 is already zero). */
	fmr	f0,  f1
	fmr	f2,  f1
	fmr	f3,  f1
	fmr	f4,  f1
	fmr	f5,  f1
	fmr	f6,  f1
	fmr	f7,  f1
	fmr	f8,  f1
	fmr	f9,  f1
	fmr	f10, f1
	fmr	f11, f1
	fmr	f12, f1
	fmr	f13, f1
	fmr	f14, f1
	fmr	f15, f1

	/* Contiguous vectors take the indexed-offset path below. */
	cmpwi	cr0, INCX, SIZE
	bne-	cr0, LL(1000)

/* ------------------------------------------------------------------ */
/* Unit stride                                                        */
/* ------------------------------------------------------------------ */

	/* CTR = N / 16 full groups; skip straight to the tail if none. */
	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(150)

	/* Preload the first group of 16 elements. */
	LFD	f16,   0 * SIZE(X)
	LFD	f17,   1 * SIZE(X)
	LFD	f18,   2 * SIZE(X)
	LFD	f19,   3 * SIZE(X)
	LFD	f20,   4 * SIZE(X)
	LFD	f21,   5 * SIZE(X)
	LFD	f22,   6 * SIZE(X)
	LFD	f23,   7 * SIZE(X)

	LFD	f24,   8 * SIZE(X)
	LFD	f25,   9 * SIZE(X)
	LFD	f26,  10 * SIZE(X)
	LFD	f27,  11 * SIZE(X)
	LFD	f28,  12 * SIZE(X)
	LFD	f29,  13 * SIZE(X)
	LFD	f30,  14 * SIZE(X)
	LFD	f31,  15 * SIZE(X)

	/* Only one group: go directly to the drain step. */
	bdz	LL(120)
	.align 4

	/* Steady state: square the group in flight while loading the next. */
LL(110):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	LFD	f16,  16 * SIZE(X)
	LFD	f17,  17 * SIZE(X)
	LFD	f18,  18 * SIZE(X)
	LFD	f19,  19 * SIZE(X)

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	LFD	f20,  20 * SIZE(X)
	LFD	f21,  21 * SIZE(X)
	LFD	f22,  22 * SIZE(X)
	LFD	f23,  23 * SIZE(X)

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	LFD	f24,  24 * SIZE(X)
	LFD	f25,  25 * SIZE(X)
	LFD	f26,  26 * SIZE(X)
	LFD	f27,  27 * SIZE(X)

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15

	LFD	f28,  28 * SIZE(X)
	LFD	f29,  29 * SIZE(X)
	LFD	f30,  30 * SIZE(X)
	LFD	f31,  31 * SIZE(X)

	/* POWER6 issues the prefetch after X is advanced, others before. */
#ifndef POWER6
	L1_PREFETCH	X, PREA
#endif
	addi	X, X, 16 * SIZE
#ifdef POWER6
	L1_PREFETCH	X, PREA
#endif

	bdnz	LL(110)
	.align 4

	/* Drain: accumulate the last loaded group. */
LL(120):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3
	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7
	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11
	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15
	addi	X, X, 16 * SIZE
	.align 4

	/* Tail: the remaining N % 16 elements, one at a time into f0. */
LL(150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq-	cr0, LL(999)
	.align 4

LL(160):
	LFD	f16,  0 * SIZE(X)
	addi	X, X,  1 * SIZE
	fmadd	f0,  f16, f16, f0
	bdnz	LL(160)
	b	LL(999)
	.align 4

/* ------------------------------------------------------------------ */
/* General (non-unit) stride                                          */
/* ------------------------------------------------------------------ */

LL(1000):
	/* LFDUX advances X before each load, so start one stride early. */
	sub	X, X, INCX

	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(1150)

	/* Preload the first group of 16 elements. */
	LFDUX	f16, X, INCX
	LFDUX	f17, X, INCX
	LFDUX	f18, X, INCX
	LFDUX	f19, X, INCX
	LFDUX	f20, X, INCX
	LFDUX	f21, X, INCX
	LFDUX	f22, X, INCX
	LFDUX	f23, X, INCX

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX

	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX
	bdz	LL(1120)
	.align 4

	/* Steady state: square the group in flight while loading the next. */
LL(1110):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	LFDUX	f16, X, INCX
	LFDUX	f17, X, INCX
	LFDUX	f18, X, INCX
	LFDUX	f19, X, INCX

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	LFDUX	f20, X, INCX
	LFDUX	f21, X, INCX
	LFDUX	f22, X, INCX
	LFDUX	f23, X, INCX

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15

	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX
	bdnz	LL(1110)
	.align 4

	/* Drain: accumulate the last loaded group. */
LL(1120):
	fmadd	f0,  f16, f16, f0
	fmadd	f1,  f17, f17, f1
	fmadd	f2,  f18, f18, f2
	fmadd	f3,  f19, f19, f3

	fmadd	f4,  f20, f20, f4
	fmadd	f5,  f21, f21, f5
	fmadd	f6,  f22, f22, f6
	fmadd	f7,  f23, f23, f7

	fmadd	f8,  f24, f24, f8
	fmadd	f9,  f25, f25, f9
	fmadd	f10, f26, f26, f10
	fmadd	f11, f27, f27, f11

	fmadd	f12, f28, f28, f12
	fmadd	f13, f29, f29, f13
	fmadd	f14, f30, f30, f14
	fmadd	f15, f31, f31, f15
	.align 4

	/* Tail: the remaining N % 16 elements, one at a time into f0. */
LL(1150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq-	cr0, LL(999)
	.align 4

LL(1160):
	LFDUX	f16, X, INCX
	fmadd	f0,  f16, f16, f0
	bdnz	LL(1160)
	.align 4

/* ------------------------------------------------------------------ */
/* Shared finish: fold the 16 partial sums and take the square root   */
/* ------------------------------------------------------------------ */

LL(999):
	/* Pairwise tree reduction of f0-f15 into f0. */
	fadd	f0,  f0,  f1
	fadd	f2,  f2,  f3
	fadd	f4,  f4,  f5
	fadd	f6,  f6,  f7

	fadd	f8,  f8,  f9
	fadd	f10, f10, f11
	fadd	f12, f12, f13
	fadd	f14, f14, f15

	fadd	f0,  f0,  f2
	fadd	f4,  f4,  f6
	fadd	f8,  f8,  f10
	fadd	f12, f12, f14

	fadd	f0,  f0,  f4
	fadd	f8,  f8,  f12

	fadd	f0,  f0,  f8

	/*
	 * The sum was accumulated in double precision and is in general
	 * not representable in single format.  Single-precision
	 * arithmetic instructions such as fsqrts require operands that
	 * are representable in single format, so take the root in double
	 * precision and round to single afterwards with frsp.
	 */
	fsqrt	f1, f0
	frsp	f1, f1
	.align 4

	/* Common exit: restore f14-f31 and release the frame. */
LL(9999):
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE