/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Kernel that updates two strided vectors in place, one element pair
 * at a time.
 *
 * Inputs (registers that are read before anything in this file writes
 * them):
 *   N         element count; the routine returns at once when N <= 0
 *   X, INCX   address and stride of the first vector
 *   Y, INCY   address and stride of the second vector
 *   C, S      floating-point coefficients
 *
 * For every pair (x loaded through X, y loaded through Y) the code issues
 *     MUL t1, S, y    MADD  t1, t1, C, x     -> stored back through X
 *     MUL t2, C, y    NMSUB t2, t2, S, x     -> stored back through Y
 * Assuming the common.h macros mean MUL d,a,b = a*b, MADD d,a,b,c = a + b*c
 * and NMSUB d,a,b,c = a - b*c (TODO confirm against common.h), this is
 *     x <- C*x + S*y
 *     y <- C*y - S*x
 * which looks like the BLAS plane rotation (xROT); verify against the
 * build rules that select this file.
 *
 * Structure:
 *   .L12/.L13  four elements per pass, both strides equal to SIZE
 *   .L16       leftover N & 3 elements for that case
 *   .L22/.L23  four elements per pass, any other stride combination
 *   .L26       leftover N & 3 elements for that case
 *
 * Branches are written with explicit delay slots: the instruction placed
 * directly after a branch or jump runs whether or not the branch is taken.
 * Example: I = N >> 2 sits in the delay slot of the first branch to .L20,
 * and .L20 goes on to read I.  Do not reorder instructions around
 * branches without accounting for this.
 */

/* Integer inputs. */
#define N	$4
#define X	$5
#define INCX	$6
#define Y	$7
#define INCY	$8

/* Store cursors for the strided path; X and Y themselves run ahead as
   load cursors there. */
#define XX	$9
#define YY	$10

/* Floating-point coefficient inputs. */
#define C	$f17
#define S	$f18

/* Scratch: loop counter and the constant SIZE used for the stride test. */
#define I	$2
#define TEMP	$3

/* Four elements loaded through X. */
#define a1	$f4
#define a2	$f5
#define a3	$f6
#define a4	$f7

/* Four elements loaded through Y. */
#define b1	$f8
#define b2	$f9
#define b3	$f10
#define b4	$f11

/* Products / results in flight. */
#define t1	$f0
#define t2	$f1
#define t3	$f2
#define t4	$f3

	PROLOGUE

	/* Scale the strides by BASE_SHIFT bits (presumably element count ->
	   byte offset, i.e. BASE_SHIFT == log2(SIZE); confirm in common.h). */
	dsll	INCX, INCX, BASE_SHIFT
	li	TEMP, SIZE

	blez	N, .L999			/* nothing to do for N <= 0 */
	dsll	INCY, INCY, BASE_SHIFT		/* delay slot */

	bne	INCX, TEMP, .L20		/* X stride is not SIZE -> strided path */
	dsra	I, N, 2				/* delay slot: I = N / 4, needed on both paths */

	bne	INCY, TEMP, .L20		/* Y stride is not SIZE -> strided path */
	NOP

/* ---- Both strides equal SIZE ------------------------------------------ */

	blez	I, .L15				/* fewer than four elements */
	daddiu	I, I, -1			/* delay slot: I = number of .L12 passes */

	/* Prime the pipeline: load the first four pairs and start the
	   products for the first two. */
	LD	a1,  0 * SIZE(X)
	LD	b1,  0 * SIZE(Y)
	LD	a2,  1 * SIZE(X)
	LD	b2,  1 * SIZE(Y)

	LD	a3,  2 * SIZE(X)
	LD	b3,  2 * SIZE(Y)
	MUL	t1, S, b1

	LD	a4,  3 * SIZE(X)
	MUL	t2, C, b1
	LD	b4,  3 * SIZE(Y)
	MUL	t3, S, b2

	blez	I, .L13				/* exactly one group: drain only */
	MUL	t4, C, b2			/* delay slot */
	.align 3

/* Each pass finishes the four pairs already held in registers, stores them
   at offsets 0..3, and loads the next four pairs from offsets 4..7 before
   X and Y are advanced by four elements. */
.L12:
	MADD	t1, t1, C, a1
	LD	b1,  4 * SIZE(Y)
	NMSUB	t2, t2, S, a1
	LD	a1,  4 * SIZE(X)
	MADD	t3, t3, C, a2
	LD	b2,  5 * SIZE(Y)
	NMSUB	t4, t4, S, a2
	LD	a2,  5 * SIZE(X)

	ST	t1,  0 * SIZE(X)
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(Y)
	MUL	t2, C, b3
	ST	t3,  1 * SIZE(X)
	MUL	t3, S, b4
	ST	t4,  1 * SIZE(Y)
	MUL	t4, C, b4

	MADD	t1, t1, C, a3
	LD	b3,  6 * SIZE(Y)
	NMSUB	t2, t2, S, a3
	LD	a3,  6 * SIZE(X)
	MADD	t3, t3, C, a4
	LD	b4,  7 * SIZE(Y)
	NMSUB	t4, t4, S, a4
	LD	a4,  7 * SIZE(X)

	ST	t1,  2 * SIZE(X)
	MUL	t1, S, b1
	ST	t2,  2 * SIZE(Y)
	MUL	t2, C, b1
	ST	t3,  3 * SIZE(X)
	MUL	t3, S, b2
	ST	t4,  3 * SIZE(Y)
	MUL	t4, C, b2

	daddiu	I, I, -1
	daddiu	X, X, 4 * SIZE

	bgtz	I, .L12
	daddiu	Y, Y, 4 * SIZE			/* delay slot */
	.align 3

/* Drain: finish and store the last loaded group without loading more. */
.L13:
	MADD	t1, t1, C, a1
	NMSUB	t2, t2, S, a1
	MADD	t3, t3, C, a2
	NMSUB	t4, t4, S, a2

	ST	t1,  0 * SIZE(X)
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(Y)
	MUL	t2, C, b3
	ST	t3,  1 * SIZE(X)
	MUL	t3, S, b4
	ST	t4,  1 * SIZE(Y)
	MUL	t4, C, b4

	MADD	t1, t1, C, a3
	NMSUB	t2, t2, S, a3
	MADD	t3, t3, C, a4
	daddiu	X, X, 4 * SIZE
	NMSUB	t4, t4, S, a4
	daddiu	Y, Y, 4 * SIZE

	/* X and Y already point past the group, hence the negative offsets. */
	ST	t1, -2 * SIZE(X)
	ST	t2, -2 * SIZE(Y)
	ST	t3, -1 * SIZE(X)
	ST	t4, -1 * SIZE(Y)
	.align 3

/* Leftover elements (N mod 4) when both strides equal SIZE. */
.L15:
	andi	I, N, 3

	blez	I, .L999
	NOP
	.align 3

.L16:
	LD	a1,  0 * SIZE(X)
	LD	b1,  0 * SIZE(Y)

	MUL	t1, S, b1
	MUL	t2, C, b1

	MADD	t1, t1, C, a1
	NMSUB	t2, t2, S, a1

	ST	t1,  0 * SIZE(X)
	ST	t2,  0 * SIZE(Y)

	daddiu	I, I, -1

	daddiu	X, X, SIZE
	daddiu	Y, Y, SIZE

	bgtz	I, .L16
	NOP
	j	.L999
	NOP
	.align 3

/* ---- At least one stride differs from SIZE ---------------------------- */

/* X/Y are advanced after every load; XX/YY trail behind and are advanced
   after every store.
   NOTE(review): the pointer updates below use dadd, which per the MIPS64
   ISA raises an Integer Overflow exception on signed overflow, whereas the
   daddiu updates in the other path do not trap.  Presumably harmless for
   real addresses; consider daddu if a trap-free update is preferred. */
.L20:
	move	XX, X
	move	YY, Y

	blez	I, .L25				/* fewer than four elements */
	daddiu	I, I, -1			/* delay slot: I = number of .L22 passes */

	/* Prime the pipeline with the first four pairs. */
	LD	a1,  0 * SIZE(X)
	dadd	X, X, INCX
	LD	b1,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	LD	a2,  0 * SIZE(X)
	dadd	X, X, INCX
	LD	b2,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	LD	a3,  0 * SIZE(X)
	dadd	X, X, INCX
	LD	b3,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	MUL	t1, S, b1

	LD	a4,  0 * SIZE(X)
	dadd	X, X, INCX
	MUL	t2, C, b1
	LD	b4,  0 * SIZE(Y)
	dadd	Y, Y, INCY

	MUL	t3, S, b2
	blez	I, .L23				/* exactly one group: drain only */
	MUL	t4, C, b2			/* delay slot */
	.align 3

/* Same pipeline as .L12, with per-element pointer stepping in place of
   constant offsets. */
.L22:
	MADD	t1, t1, C, a1
	LD	b1,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t2, t2, S, a1
	LD	a1,  0 * SIZE(X)
	dadd	X, X, INCX
	MADD	t3, t3, C, a2
	LD	b2,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t4, t4, S, a2
	LD	a2,  0 * SIZE(X)
	dadd	X, X, INCX

	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t2, C, b3
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t3, S, b4
	ST	t4,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t4, C, b4

	MADD	t1, t1, C, a3
	LD	b3,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t2, t2, S, a3
	LD	a3,  0 * SIZE(X)
	dadd	X, X, INCX
	MADD	t3, t3, C, a4
	LD	b4,  0 * SIZE(Y)
	dadd	Y, Y, INCY
	NMSUB	t4, t4, S, a4
	LD	a4,  0 * SIZE(X)
	dadd	X, X, INCX

	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t1, S, b1
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t2, C, b1
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t3, S, b2
	ST	t4,  0 * SIZE(YY)
	MUL	t4, C, b2
	daddiu	I, I, -1

	bgtz	I, .L22
	dadd	YY, YY, INCY			/* delay slot: step past the last store */
	.align 3

/* Drain: finish and store the last loaded group without loading more. */
.L23:
	MADD	t1, t1, C, a1
	NMSUB	t2, t2, S, a1
	MADD	t3, t3, C, a2
	NMSUB	t4, t4, S, a2

	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t1, S, b3
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t2, C, b3
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	MUL	t3, S, b4
	ST	t4,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	MUL	t4, C, b4

	MADD	t1, t1, C, a3
	NMSUB	t2, t2, S, a3
	MADD	t3, t3, C, a4
	NMSUB	t4, t4, S, a4

	ST	t1,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	ST	t2,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	ST	t3,  0 * SIZE(XX)
	dadd	XX, XX, INCX
	ST	t4,  0 * SIZE(YY)
	dadd	YY, YY, INCY
	.align 3

/* Leftover elements (N mod 4) for the strided path.  X and Y are used for
   both the load and the store here. */
.L25:
	andi	I, N, 3

	blez	I, .L999
	NOP
	.align 3

.L26:
	LD	a1,  0 * SIZE(X)
	LD	b1,  0 * SIZE(Y)

	MUL	t1, S, b1
	MUL	t2, C, b1

	MADD	t1, t1, C, a1
	daddiu	I, I, -1
	NMSUB	t2, t2, S, a1

	ST	t1,  0 * SIZE(X)
	ST	t2,  0 * SIZE(Y)

	dadd	X, X, INCX
	bgtz	I, .L26
	dadd	Y, Y, INCY			/* delay slot */
	.align 3

/* Common exit: jump to the address held in $31. */
.L999:
	j	$31
	NOP

	EPILOGUE