/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Index of the element with the extreme absolute value (SPARC V9).
 *
 * Inputs (register-window "in" registers):
 *   N    (%i0)  number of elements
 *   X    (%i1)  address of the first element
 *   INCX (%i2)  stride in elements; scaled to bytes by BASE_SHIFT below
 *
 * Result (%i0 at the `return`): 1-based position of the first element
 * whose absolute value is the largest, or the smallest when USE_MIN is
 * defined.  0 is returned when N <= 0 or INCX <= 0.
 *
 * Method: element 1 seeds four independent accumulators c1..c4 (value)
 * and v1..v4 (index).  The unrolled loops feed elements round-robin into
 * the four lanes; `count` is the 1-based position of the element handled
 * by lane 1 in the current group, so lanes 2..4 store an index that is
 * too small by 1, 2 and 3 and are corrected once before the final
 * reduction.  The remainder loop feeds lane 1 only.
 *
 * PROLOGUE, SAVESP, EPILOGUE, FCLR, LDF, FABS, FCMP, FMOVG, FMOVL, SIZE
 * and BASE_SHIFT are not defined in this file; presumably common.h
 * supplies them as precision-specific aliases (TODO confirm).
 */

#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

#define v1	%o0
#define v2	%o1
#define v3	%o2
#define v4	%o3
#define count	%o4
/* Scratch for the index tie-break in the final reduction. */
#define vt	%o5

#ifdef DOUBLE
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14

#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30
#else
#define c1	%f0
#define c2	%f1
#define c3	%f2
#define c4	%f3
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7

#define a1	%f8
#define a2	%f9
#define a3	%f10
#define a4	%f11
#define a5	%f12
#define a6	%f13
#define a7	%f14
#define a8	%f15
#endif

/* Conditional-move flavour: "greater" selects the maximum, "less" the minimum. */
#ifndef USE_MIN
#define FCMOV	FMOVG
#define CMOV	movg
#else
#define FCMOV	FMOVL
#define CMOV	movl
#endif

	PROLOGUE
	SAVESP

	FCLR(0)

	/* N <= 0: return 0 (v1 is cleared in the branch delay slot). */
	cmp	N, 0
	ble	.LL20
	clr	v1

	/* INCX <= 0: return 0; the delay slot scales INCX for the paths below. */
	cmp	INCX, 0
	ble	.LL20
	sll	INCX, BASE_SHIFT, INCX

	mov	1, v1

	/* Consume element 1; N becomes the number of remaining elements. */
	add	N, -1, N
	LDF	[X], c4
	add	X, INCX, X
	cmp	N, 0
	ble	.LL20
	FABS	c4, c1

	/* Seed the other three lanes with |x(1)|, index 1. */
	FABS	c4, c2
	mov	1, v2
	FABS	c4, c3
	mov	1, v3
	FABS	c4, c4
	mov	1, v4
	mov	2, count

	/* Scaled stride different from SIZE: take the strided path. */
	cmp	INCX, SIZE
	bne	.LL50
	nop

	/* ---- contiguous path: I = number of groups of 8 ---- */
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first group of 8. */
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	LDF	[X + 2 * SIZE], a3
	LDF	[X + 3 * SIZE], a4

	LDF	[X + 4 * SIZE], a5
	add	I, -1, I
	LDF	[X + 5 * SIZE], a6
	cmp	I, 0
	LDF	[X + 6 * SIZE], a7
	LDF	[X + 7 * SIZE], a8

	ble,pt	%icc, .LL12
	add	X, 8 * SIZE, X

#define PREFETCHSIZE 40

	/* Steady state: process the loaded group while loading the next one. */
.LL11:
	FABS	a1, t1
	prefetch [X + PREFETCHSIZE * SIZE], 0
	FABS	a2, t2
	LDF	[X + 0 * SIZE], a1
	FABS	a3, t3
	LDF	[X + 1 * SIZE], a2
	FABS	a4, t4
	LDF	[X + 2 * SIZE], a3

	FCMP	%fcc0, t1, c1
	LDF	[X + 3 * SIZE], a4
	FCMP	%fcc1, t2, c2
	nop

	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	LDF	[X + 4 * SIZE], a5
	FABS	a6, t2
	LDF	[X + 5 * SIZE], a6
	FABS	a7, t3
	LDF	[X + 6 * SIZE], a7
	FABS	a8, t4
	LDF	[X + 7 * SIZE], a8

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	nop
	CMOV	%fcc0, count, v1
	add	I, -1, I

	FCMOV	%fcc1, t2, c2
	cmp	I, 0
	CMOV	%fcc1, count, v2
	add	X, 8 * SIZE, X

	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	bg,pt	%icc, .LL11
	add	count, 4, count

	/* Drain: process the last loaded group without loading further. */
.LL12:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	FABS	a6, t2
	FABS	a7, t3
	FABS	a8, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* Remainder (N mod 8), one element per iteration, lane 1 only. */
.LL15:
	and	N, 7, I
	cmp	I, 0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	FABS	a1, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	add	count, 1, count
	cmp	I, 0
	bg,pt	%icc, .LL16
	add	X, 1 * SIZE, X

	/*
	 * Final reduction of the four lanes.  A strictly better value wins;
	 * on equal values the smaller index wins, so the position of the
	 * first occurrence is returned.  A lane that was never updated still
	 * holds |x(1)| and therefore can only tie with another lane that was
	 * never updated; lane 1 then carries index 1, which is the smallest.
	 */
.LL19:
	/* Undo the per-lane index offset. */
	add	v2, 1, v2
	add	v3, 2, v3
	add	v4, 3, v4

	/* Lane 2 into lane 1. */
	FCMP	%fcc0, c2, c1
	mov	v1, vt
	cmp	v2, v1
	movl	%icc, v2, vt		/* vt = min(v1, v2) */
	FCMOV	%fcc0, c2, c1
	CMOV	%fcc0, v2, v1
	move	%fcc0, vt, v1		/* tie: keep the earlier position */

	/* Lane 4 into lane 3. */
	FCMP	%fcc1, c4, c3
	mov	v3, vt
	cmp	v4, v3
	movl	%icc, v4, vt		/* vt = min(v3, v4) */
	FCMOV	%fcc1, c4, c3
	CMOV	%fcc1, v4, v3
	move	%fcc1, vt, v3

	/* Lane 3 into lane 1. */
	FCMP	%fcc0, c3, c1
	mov	v1, vt
	cmp	v3, v1
	movl	%icc, v3, vt		/* vt = min(v1, v3) */
	CMOV	%fcc0, v3, v1
	move	%fcc0, vt, v1

.LL20:
	mov	v1, %i0
	return	%i7 + 8
	nop

	/* ---- strided path: same scheme, X advanced by INCX after each load ---- */
.LL50:
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	LDF	[X + 0 * SIZE], a1
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a2
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a3
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a4
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a5
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X + 0 * SIZE], a7
	cmp	I, 0
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a8
	ble,pt	%icc, .LL52
	add	X, INCX, X

.LL51:
	FABS	a1, t1
	LDF	[X + 0 * SIZE], a1
	add	X, INCX, X
	FABS	a2, t2
	LDF	[X + 0 * SIZE], a2
	add	X, INCX, X
	FABS	a3, t3
	LDF	[X + 0 * SIZE], a3
	add	X, INCX, X
	FABS	a4, t4
	LDF	[X + 0 * SIZE], a4
	add	X, INCX, X

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	LDF	[X + 0 * SIZE], a5
	add	X, INCX, X
	FABS	a6, t2
	LDF	[X + 0 * SIZE], a6
	add	X, INCX, X
	FABS	a7, t3
	LDF	[X + 0 * SIZE], a7
	add	X, INCX, X
	FABS	a8, t4
	LDF	[X + 0 * SIZE], a8

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	cmp	I, 0
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* The delay slot performs the eighth pointer advance of the group. */
	bg,pt	%icc, .LL51
	add	X, INCX, X

.LL52:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	FABS	a5, t1
	FABS	a6, t2
	FABS	a7, t3
	FABS	a8, t4

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t2, c2
	FCMP	%fcc2, t3, c3
	FCMP	%fcc3, t4, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t2, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t3, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t4, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

.LL55:
	and	N, 7, I
	cmp	I, 0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X + 0 * SIZE], a1
	FABS	a1, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	add	count, 1, count
	cmp	I, 0
	bg,pt	%icc, .LL56
	add	X, INCX, X

	/*
	 * Final reduction of the four lanes, identical to the one used by
	 * the contiguous path: strictly better value wins, equal values
	 * resolve to the smaller index (first occurrence).
	 */
.LL59:
	/* Undo the per-lane index offset. */
	add	v2, 1, v2
	add	v3, 2, v3
	add	v4, 3, v4

	/* Lane 2 into lane 1. */
	FCMP	%fcc0, c2, c1
	mov	v1, vt
	cmp	v2, v1
	movl	%icc, v2, vt		/* vt = min(v1, v2) */
	FCMOV	%fcc0, c2, c1
	CMOV	%fcc0, v2, v1
	move	%fcc0, vt, v1		/* tie: keep the earlier position */

	/* Lane 4 into lane 3. */
	FCMP	%fcc1, c4, c3
	mov	v3, vt
	cmp	v4, v3
	movl	%icc, v4, vt		/* vt = min(v3, v4) */
	FCMOV	%fcc1, c4, c3
	CMOV	%fcc1, v4, v3
	move	%fcc1, vt, v3

	/* Lane 3 into lane 1. */
	FCMP	%fcc0, c3, c1
	mov	v1, vt
	cmp	v3, v1
	movl	%icc, v3, vt		/* vt = min(v1, v3) */
	CMOV	%fcc0, v3, v1
	move	%fcc0, vt, v1

	mov	v1, %i0
	return	%i7 + 8
	nop

	EPILOGUE