/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Complex "a*x plus y" kernel for PowerPC.
 *
 * For each of N complex elements the code loads (x_r, x_i) from X and
 * (y_r, y_i) from Y, forms
 *     re = ALPHA_R * x_r + y_r, then ADD1 with ALPHA_I * x_i
 *     im = ALPHA_I * x_r + y_i, then ADD2 with ALPHA_R * x_i
 * and stores (re, im) back through YY, which walks the same addresses
 * as Y.  ADD1/ADD2 are chosen by CONJ (see below).
 *
 * NOTE(review): FMADD, FNMSUB, LFDUX, LFDU, STFDUX, STFDU, SIZE,
 * ZBASE_SHIFT, LL(), PROLOGUE, PROFCODE and EPILOGUE all come from
 * common.h and are not visible here.  The comments below assume they are
 * the precision-selected fused multiply-add / update-form load-store
 * forms their names suggest -- confirm against common.h.
 */

#define ASSEMBLER
#include "common.h"

/*
 * Symbolic names for the general-purpose registers used by the routine.
 * The mapping depends on the OS and on 32- vs 64-bit mode.  YY is the
 * store pointer for Y; PRE holds the prefetch offset.
 */
#ifdef linux
#ifndef __64BIT__
#define N       r3
#define X       r6
#define INCX    r7
#define Y       r8
#define INCY    r9
#define YY      r4
#define PRE     r5
#else
#define N       r3
#define X       r8
#define INCX    r9
#define Y       r5
#define INCY    r4
#define YY      r6
#define PRE     r7
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define N       r3
#define X       r10
#define INCX    r4
#define Y       r5
#define INCY    r6
#define YY      r7
#define PRE     r8
#else
#define N       r3
#define X       r8
#define INCX    r9
#define Y       r10
#define INCY    r4
#define YY      r5
#define PRE     r6
#endif
#endif

/* Scalar alpha is kept in f24/f25 for the whole routine (copied from f1/f2). */
#define ALPHA_R f24
#define ALPHA_I f25

/*
 * ADD1 finishes the real part, ADD2 the imaginary part.  Defining CONJ
 * swaps which of the two is the subtracting form.
 * NOTE(review): presumably this selects alpha*x versus alpha*conj(x) --
 * confirm against the FMADD/FNMSUB definitions in common.h.
 */
#ifndef CONJ
#define ADD1    FNMSUB
#define ADD2    FMADD
#else
#define ADD1    FMADD
#define ADD2    FNMSUB
#endif

/* Room for the twelve 8-byte FPR save slots used below (f14..f25). */
#define STACKSIZE 96

        PROLOGUE
        PROFCODE

        /* Allocate the save area and spill f14..f25, which the body overwrites. */
        subi    SP, SP, STACKSIZE

        stfd    f14,    0(SP)
        stfd    f15,    8(SP)
        stfd    f16,   16(SP)
        stfd    f17,   24(SP)

        stfd    f18,   32(SP)
        stfd    f19,   40(SP)
        stfd    f20,   48(SP)
        stfd    f21,   56(SP)

        stfd    f22,   64(SP)
        stfd    f23,   72(SP)
        stfd    f24,   80(SP)
        stfd    f25,   88(SP)

        /*
         * Some arguments are loaded from memory rather than taken from
         * registers.  The "+ STACKSIZE" compensates for the subi above, so
         * the offsets are relative to SP as it was on entry.
         */
#if defined(linux) && defined(__64BIT__)
        ld      INCY, 112 + STACKSIZE(SP)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
        ld      INCY, 112 + STACKSIZE(SP)
#else
#ifdef DOUBLE
        lwz     INCX,  56 + STACKSIZE(SP)
        lwz     Y,     60 + STACKSIZE(SP)
        lwz     INCY,  64 + STACKSIZE(SP)
#else
        lwz     INCY,  56 + STACKSIZE(SP)
#endif
#endif
#endif

        /*
         * Keep alpha in f24/f25 (f1.. are reused as data registers) and scale
         * both increments by ZBASE_SHIFT (presumably element count -> bytes).
         */
        fmr     ALPHA_R, f1
        slwi    INCX, INCX, ZBASE_SHIFT
        fmr     ALPHA_I, f2
        slwi    INCY, INCY, ZBASE_SHIFT

        /*
         * Every element is accessed as "LFDUX reg, P, INC" followed by
         * "LFDU reg, 1 * SIZE(P)".  The second access already advances the
         * pointer by SIZE, so SIZE is removed from the increment here to make
         * the pair advance by exactly one scaled stride.
         */
        subi    INCX, INCX, SIZE
        subi    INCY, INCY, SIZE

        /* Byte offset used by the dcbt/dcbtst prefetches (PPCG4 builds only). */
        li      PRE, 2 * 16 * SIZE

        /* Nothing to do when N <= 0. */
        cmpwi   cr0, N, 0
        ble-    LL(999)

        /*
         * Pre-bias the pointers by one (adjusted) increment so that the first
         * update-form access lands on the original X / Y address.  YY starts
         * equal to Y and trails it as the store pointer.
         */
        sub     X, X, INCX
        sub     Y, Y, INCY
        mr      YY, Y

        /* CTR = N >> 3: number of 8-element groups; skip to the tail if none. */
        srawi.  r0, N, 3
        mtspr   CTR,  r0
        ble-    LL(150)
        .align 4

        /*
         * Pipeline prologue: preload four complex elements of X (f0..f7) and
         * four of Y (f8..f15) before entering the unrolled loop.
         */
        LFDUX   f0,    X, INCX
        LFDU    f1,    1 * SIZE(X)
        LFDUX   f2,    X, INCX
        LFDU    f3,    1 * SIZE(X)

        LFDUX   f8,    Y, INCY
        LFDU    f9,    1 * SIZE(Y)
        LFDUX   f10,   Y, INCY
        LFDU    f11,   1 * SIZE(Y)

        LFDUX   f4,    X, INCX
        LFDU    f5,    1 * SIZE(X)
        LFDUX   f6,    X, INCX
        LFDU    f7,    1 * SIZE(X)

        LFDUX   f12,   Y, INCY
        LFDU    f13,   1 * SIZE(Y)
        LFDUX   f14,   Y, INCY
        LFDU    f15,   1 * SIZE(Y)
        /* Only one group of 8: go straight to the drain code. */
        bdz     LL(120)
        .align 4

/*
 * Main loop: each iteration computes and stores eight complex results and
 * loads eight new elements of X and Y.  Each arithmetic instruction is
 * paired with a load that refills a register whose old value has just been
 * consumed, so the instruction order must not be changed.
 */
LL(110):
        /* Elements 0-1 of the current group: start from y + alpha * x_r. */
        FMADD   f16,  ALPHA_R, f0, f8
        LFDUX   f8,    Y, INCY
        FMADD   f17,  ALPHA_I, f0, f9
        LFDU    f9,    1 * SIZE(Y)
        FMADD   f18,  ALPHA_R, f2, f10
        LFDUX   f10,   Y, INCY
        FMADD   f19,  ALPHA_I, f2, f11
        LFDU    f11,   1 * SIZE(Y)
#ifdef PPCG4
        dcbt    X, PRE
#endif

        /* Fold in the x_i terms (sign chosen by ADD1/ADD2). */
        ADD1    f16,  ALPHA_I, f1, f16
        LFDUX   f0,    X, INCX
        ADD2    f17,  ALPHA_R, f1, f17
        LFDU    f1,    1 * SIZE(X)
        ADD1    f18,  ALPHA_I, f3, f18
        LFDUX   f2,    X, INCX
        ADD2    f19,  ALPHA_R, f3, f19
        LFDU    f3,    1 * SIZE(X)
#ifdef PPCG4
        dcbtst  Y, PRE
#endif

        /* Elements 2-3. */
        FMADD   f20,  ALPHA_R, f4, f12
        LFDUX   f12,   Y, INCY
        FMADD   f21,  ALPHA_I, f4, f13
        LFDU    f13,   1 * SIZE(Y)
        FMADD   f22,  ALPHA_R, f6, f14
        LFDUX   f14,   Y, INCY
        FMADD   f23,  ALPHA_I, f6, f15
        LFDU    f15,   1 * SIZE(Y)
#if defined(PPCG4) && defined(DOUBLE)
        dcbt    X, PRE
#endif

        ADD1    f20,  ALPHA_I, f5, f20
        LFDUX   f4,    X, INCX
        ADD2    f21,  ALPHA_R, f5, f21
        LFDU    f5,    1 * SIZE(X)
        ADD1    f22,  ALPHA_I, f7, f22
        LFDUX   f6,    X, INCX
        ADD2    f23,  ALPHA_R, f7, f23
        LFDU    f7,    1 * SIZE(X)
#if defined(PPCG4) && defined(DOUBLE)
        dcbtst  Y, PRE
#endif

        /* Store results 0-1 so f16..f19 can be reused for elements 4-5. */
        STFDUX  f16,   YY, INCY
        STFDU   f17,   1 * SIZE(YY)
        STFDUX  f18,   YY, INCY
        STFDU   f19,   1 * SIZE(YY)

        /* Elements 4-5. */
        FMADD   f16,  ALPHA_R, f0, f8
        LFDUX   f8,    Y, INCY
        FMADD   f17,  ALPHA_I, f0, f9
        LFDU    f9,    1 * SIZE(Y)
        FMADD   f18,  ALPHA_R, f2, f10
        LFDUX   f10,   Y, INCY
        FMADD   f19,  ALPHA_I, f2, f11
        LFDU    f11,   1 * SIZE(Y)
#ifdef PPCG4
        dcbt    X, PRE
#endif

        ADD1    f16,  ALPHA_I, f1, f16
        LFDUX   f0,    X, INCX
        ADD2    f17,  ALPHA_R, f1, f17
        LFDU    f1,    1 * SIZE(X)
        ADD1    f18,  ALPHA_I, f3, f18
        LFDUX   f2,    X, INCX
        ADD2    f19,  ALPHA_R, f3, f19
        LFDU    f3,    1 * SIZE(X)
#ifdef PPCG4
        dcbtst  Y, PRE
#endif

        /* Store results 2-3 so f20..f23 can be reused for elements 6-7. */
        STFDUX  f20,   YY, INCY
        STFDU   f21,   1 * SIZE(YY)
        STFDUX  f22,   YY, INCY
        STFDU   f23,   1 * SIZE(YY)

        /* Elements 6-7. */
        FMADD   f20,  ALPHA_R, f4, f12
        LFDUX   f12,   Y, INCY
        FMADD   f21,  ALPHA_I, f4, f13
        LFDU    f13,   1 * SIZE(Y)
        FMADD   f22,  ALPHA_R, f6, f14
        LFDUX   f14,   Y, INCY
        FMADD   f23,  ALPHA_I, f6, f15
        LFDU    f15,   1 * SIZE(Y)
#if defined(PPCG4) && defined(DOUBLE)
        dcbt    X, PRE
#endif

        ADD1    f20,  ALPHA_I, f5, f20
        LFDUX   f4,    X, INCX
        ADD2    f21,  ALPHA_R, f5, f21
        LFDU    f5,    1 * SIZE(X)
        ADD1    f22,  ALPHA_I, f7, f22
        LFDUX   f6,    X, INCX
        ADD2    f23,  ALPHA_R, f7, f23
        LFDU    f7,    1 * SIZE(X)
#if defined(PPCG4) && defined(DOUBLE)
        dcbtst  Y, PRE
#endif

        /* Store results 4-7. */
        STFDUX  f16,   YY, INCY
        STFDU   f17,   1 * SIZE(YY)
        STFDUX  f18,   YY, INCY
        STFDU   f19,   1 * SIZE(YY)

        STFDUX  f20,   YY, INCY
        STFDU   f21,   1 * SIZE(YY)
        STFDUX  f22,   YY, INCY
        STFDU   f23,   1 * SIZE(YY)
        bdnz    LL(110)
        .align 4

/*
 * Pipeline drain for the last group of eight: the first four elements are
 * already in registers, the remaining four are loaded here, and all eight
 * results are stored.  No further preloading is done.
 */
LL(120):
        FMADD   f16,  ALPHA_R, f0, f8
        LFDUX   f8,    Y, INCY
        FMADD   f17,  ALPHA_I, f0, f9
        LFDU    f9,    1 * SIZE(Y)
        FMADD   f18,  ALPHA_R, f2, f10
        LFDUX   f10,   Y, INCY
        FMADD   f19,  ALPHA_I, f2, f11
        LFDU    f11,   1 * SIZE(Y)

        ADD1    f16,  ALPHA_I, f1, f16
        LFDUX   f0,    X, INCX
        ADD2    f17,  ALPHA_R, f1, f17
        LFDU    f1,    1 * SIZE(X)
        ADD1    f18,  ALPHA_I, f3, f18
        LFDUX   f2,    X, INCX
        ADD2    f19,  ALPHA_R, f3, f19
        LFDU    f3,    1 * SIZE(X)

        FMADD   f20,  ALPHA_R, f4, f12
        LFDUX   f12,   Y, INCY
        FMADD   f21,  ALPHA_I, f4, f13
        LFDU    f13,   1 * SIZE(Y)
        FMADD   f22,  ALPHA_R, f6, f14
        LFDUX   f14,   Y, INCY
        FMADD   f23,  ALPHA_I, f6, f15
        LFDU    f15,   1 * SIZE(Y)

        ADD1    f20,  ALPHA_I, f5, f20
        LFDUX   f4,    X, INCX
        ADD2    f21,  ALPHA_R, f5, f21
        LFDU    f5,    1 * SIZE(X)
        ADD1    f22,  ALPHA_I, f7, f22
        LFDUX   f6,    X, INCX
        ADD2    f23,  ALPHA_R, f7, f23
        LFDU    f7,    1 * SIZE(X)

        /* Each store frees a result register that the next FMADD refills. */
        STFDUX  f16,   YY, INCY
        FMADD   f16,  ALPHA_R, f0, f8
        STFDU   f17,   1 * SIZE(YY)
        FMADD   f17,  ALPHA_I, f0, f9
        STFDUX  f18,   YY, INCY
        FMADD   f18,  ALPHA_R, f2, f10
        STFDU   f19,   1 * SIZE(YY)
        FMADD   f19,  ALPHA_I, f2, f11

        ADD1    f16,  ALPHA_I, f1, f16
        ADD2    f17,  ALPHA_R, f1, f17
        ADD1    f18,  ALPHA_I, f3, f18
        ADD2    f19,  ALPHA_R, f3, f19

        STFDUX  f20,   YY, INCY
        FMADD   f20,  ALPHA_R, f4, f12
        STFDU   f21,   1 * SIZE(YY)
        FMADD   f21,  ALPHA_I, f4, f13
        STFDUX  f22,   YY, INCY
        FMADD   f22,  ALPHA_R, f6, f14
        STFDU   f23,   1 * SIZE(YY)
        FMADD   f23,  ALPHA_I, f6, f15

        ADD1    f20,  ALPHA_I, f5, f20
        STFDUX  f16,   YY, INCY
        ADD2    f21,  ALPHA_R, f5, f21
        STFDU   f17,   1 * SIZE(YY)
        ADD1    f22,  ALPHA_I, f7, f22
        STFDUX  f18,   YY, INCY
        ADD2    f23,  ALPHA_R, f7, f23
        STFDU   f19,   1 * SIZE(YY)

        STFDUX  f20,   YY, INCY
        STFDU   f21,   1 * SIZE(YY)
        STFDUX  f22,   YY, INCY
        STFDU   f23,   1 * SIZE(YY)
        .align 4

/* Tail: the remaining N & 7 elements are handled one per iteration. */
LL(150):
        andi.   r0, N, 7
        mtspr   CTR,  r0
        /* andi. cannot produce a negative value, so this is taken only for 0. */
        ble     LL(999)
        .align 4

LL(160):
        LFDUX   f0,    X, INCX
        LFDU    f1,    1 * SIZE(X)
        LFDUX   f8,    Y, INCY
        LFDU    f9,    1 * SIZE(Y)

        FMADD   f16,  ALPHA_R, f0, f8
        FMADD   f17,  ALPHA_I, f0, f9

        ADD1    f16,  ALPHA_I, f1, f16
        ADD2    f17,  ALPHA_R, f1, f17

        STFDUX  f16,   YY, INCY
        STFDU   f17,   1 * SIZE(YY)
        bdnz    LL(160)
        .align 4

/* Common exit: restore f14..f25, release the save area, clear r0, return. */
LL(999):
        lfd     f14,    0(SP)
        lfd     f15,    8(SP)
        lfd     f16,   16(SP)
        lfd     f17,   24(SP)

        lfd     f18,   32(SP)
        lfd     f19,   40(SP)
        lfd     f20,   48(SP)
        lfd     f21,   56(SP)

        lfd     f22,   64(SP)
        lfd     f23,   72(SP)
        lfd     f24,   80(SP)
        lfd     f25,   88(SP)

        addi    SP, SP, STACKSIZE
        li      r0, 0
        blr
        EPILOGUE