/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * BETA scaling kernel, IA-64 assembly (run through the C preprocessor).
 *
 * Operation
 *   Makes N passes.  Each pass processes M consecutive elements starting
 *   at C, after which C is advanced by LDC elements.  (Presumably one
 *   pass per column of a column-major matrix; confirm against the caller.)
 *     - BETA == 0 : the elements are overwritten with zero (f0) and are
 *                   never read (.L60 / .L70 / .L80 / .L99).
 *     - otherwise : every element is loaded, multiplied by BETA and
 *                   stored back in place (.L100 / .L170 / .L180 / .L199).
 *   Returns immediately when N <= 0.
 *
 * Inputs (as used by this code)
 *   r32 = M, r33 = N, f8 = BETA.
 *   C and LDC are read from memory at r12 + 16 and r12 + 24
 *   (r12 + 32 and r12 + 40 when XDOUBLE is defined).
 *   NOTE(review): the offsets presumably match the C prototype of the
 *   caller, which is not visible in this file.
 *
 * Registers written
 *   r14-r19, r22-r24, r30, r31, r34-r36, f6, f7, f10-f15, rotating FP
 *   registers from f32 up, p6, p8, p9, p12-p15, and (BETA != 0 path only)
 *   the rotating predicates plus ar.ec.
 *   ar.lc is saved in ARLC and restored before every return.
 *   pr is saved in PR and restored only on the BETA != 0 exit; the zero
 *   path writes p6, p8, p9 and p12-p15 only.
 *
 * SIZE, BASE_SHIFT, STFD, LDFD, FMPY, PROLOGUE, PROFCODE and EPILOGUE are
 * not defined here; they are expected to come from common.h.
 *
 * The bundle templates, stop bits (;;) and the order of instructions are
 * part of the schedule.  Do not reorder anything without re-checking the
 * dependencies inside each instruction group.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distance, in elements, ahead of the current position. */
#define PREFETCHSIZE 140

/* Read/zero pointers.  CO1 and CO2 work 4 elements apart; CO3 serves the
   2- and 1-element tails. */
#define CO1	r14
#define CO2	r15
#define CO3	r16
/* Store pointers of the BETA != 0 path, mirroring CO1/CO2/CO3. */
#define DO1	r17
#define DO2	r18
#define DO3	r19

#define I	r22		/* (M >> 4) - 1, loaded into ar.lc        */
#define I_AND_15 r23		/* M & 15, elements left for the tail      */
#define PRE1	r24		/* prefetch pointer                        */

#define PR	r30		/* saved predicate registers               */
#define ARLC	r31		/* saved ar.lc                             */

#define M	r32
#define N	r33
#define C	r34		/* start of the current pass               */
#define LDC	r35		/* distance between passes, in bytes after
				   the shladd below                        */
#define J	r36		/* passes remaining                        */

#define BETA	f8

	PROLOGUE
	.prologue
	PROFCODE

	/* Form the addresses of the two memory arguments and save ar.lc. */
	{ .mmi
#ifndef XDOUBLE
	adds	CO1 = 16, r12
	adds	CO2 = 24, r12
#else
	adds	CO1 = 32, r12
	adds	CO2 = 40, r12
#endif
	.save ar.lc, ARLC
	mov	ARLC = ar.lc
	}
	/* p6  = (N <= 0)    -> nothing to do, return.
	   p15 = !(BETA == 0) -> selects the multiply path further down. */
	{ .mfb
	cmp.ge	p6, p0 = 0, N
	fcmp.eq	p0, p15 = BETA, f0
	(p6) br.ret.sptk.many b0
	}
	;;
	.body
	{ .mmi
	ld8	C   = [CO1], 8
	ld8	LDC = [CO2]
	mov	PR = pr
	}
	{ .mmi
	mov	J = N
	shr	I = M, 4		// number of full 16-element chunks
	}
	;;
	{ .mmb
	shladd	LDC = LDC, BASE_SHIFT, r0	// LDC <<= BASE_SHIFT (elements -> bytes, presumably)
	adds	I = -1, I			// ar.lc counts trips - 1; negative means no full chunk
	(p15) br.cond.dpnt .L100		// if (beta != 0) goto L100
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* BETA == 0: set up one pass, then zero it.                           */
/* ------------------------------------------------------------------ */
.L60:
	{ .mmi
	mov	CO1 = C
	mov	CO3 = C
	add	CO2 = 4 * SIZE, C
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	C = C, LDC		// C now points at the next pass
	tbit.nz	p12, p0 = M, 3		// p12: tail contains a group of 8
	}
	;;
	{ .mmi
	and	I_AND_15 = 15, M
	mov	ar.lc = I
	}
	{ .mib
	cmp.gt	p8, p0 = 0, I		// no full 16-element chunk: tail only
	(p8) br.cond.dpnt .L80
	}
	;;
	.align 32

/* Zero 16 elements per trip.  CO1 writes elements 0-3 and 8-11, CO2
   writes 4-7 and 12-15; the 5 * SIZE post-increments step over the four
   elements that belong to the other pointer.  CO3 advances by 16 so it
   points at the first untouched element when the loop ends. */
.L70:
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	{ .mmi
	lfetch.excl.nt1	[PRE1]
	nop.m 0
	adds	PRE1 = 16 * SIZE, PRE1
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	adds	CO3 = 16 * SIZE, CO3
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmb
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	br.cloop.sptk.few .L70
	}
	;;
	.align 32

/* Tail of the zero path, driven by the low four bits of M:
     p12 (bit 3): 8 elements through CO1 (4) and CO2 (4)
     p13 (bit 2): 4 elements through CO1
     p14 (bit 1): 2 elements through CO3
     p15 (bit 0): 1 element  through CO3
   CO3 is first moved past whatever CO1/CO2 take care of.  p15 is reused
   here; the BETA != 0 test has already been consumed. */
.L80:
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p13, p0 = M, 2
	}
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_15
	adds	J = -1, J		// one pass accounted for
	(p9) br.cond.dptk .L99		// M is a multiple of 16: no tail
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p14, p0 = M, 1
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 5 * SIZE
	(p12) STFD [CO2] = f0
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	tbit.nz	p15, p0 = M, 0
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p15) STFD [CO3] = f0
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0
	}
	;;
	.align 32

/* End of a zero pass: restore ar.lc, then loop while J > 0 or return. */
.L99:
	{ .mib
	cmp.lt	p6, p0 = 0, J
	mov	ar.lc = ARLC
	}
	{ .mbb
	(p6) br.cond.dptk .L60
	br.ret.sptk.many b0
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* BETA != 0: set up one pass, then scale it in place.                 */
/* ------------------------------------------------------------------ */
.L100:
	{ .mmi
	mov	CO1 = C
	mov	CO3 = C
	mov	pr.rot = 0		// clear the rotating predicates
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	CO2 = 4 * SIZE, C
	mov	DO1 = C
	}
	;;
	{ .mmi
	mov	ar.ec = 6		// six pipeline stages, p16 .. p21
	}
	{ .mmi
	adds	DO2 = 4 * SIZE, C
	mov	DO3 = C
	add	C = C, LDC		// C now points at the next pass
	}
	;;
	{ .mmi
	and	I_AND_15 = 15, M
	cmp.eq	p16, p0 = r0, r0	// p16 = 1: enable the first stage
	mov	ar.lc = I
	}
	{ .mib
	cmp.gt	p8, p0 = 0, I		// no full 16-element chunk: tail only
	tbit.nz	p12, p0 = M, 3		// p12: tail contains a group of 8
	(p8) br.cond.dpnt .L180
	}
	;;
	.align 32

/* Software-pipelined loop, 16 elements per trip, driven by br.ctop.
     stage p16: load 16 elements through CO1/CO2 into rotating registers
                (f32, f38, ... f122), same element ownership as .L70.
     stage p20: multiply the first 8 loaded values; after four rotations
                a value loaded as f32 is named f36, f38 -> f42, and so on.
     stage p21: store those 8 products (first half of the body), multiply
                the remaining 8 values (f80 -> f85, ... f122 -> f127) and
                store them (second half of the body).
   Products live in the non-rotating f6, f7, f10-f15.  A store and the
   FMPY that overwrites the same register share an instruction group, so
   the store still reads the previous product. */
.L170:
	{ .mmf
	(p21) STFD [DO1] = f6, 1 * SIZE
	(p21) STFD [DO2] = f7, 1 * SIZE
	(p21) FMPY f6 = BETA, f85
	}
	{ .mmf
	(p16) lfetch.excl.nt1	[PRE1]
	(p16) adds CO3 = 16 * SIZE, CO3
	(p21) FMPY f7 = BETA, f91
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f10, 1 * SIZE
	(p21) STFD [DO2] = f11, 1 * SIZE
	(p21) FMPY f10 = BETA, f97
	}
	{ .mmf
	(p16) LDFD f32 = [CO1], 1 * SIZE
	(p16) LDFD f38 = [CO2], 1 * SIZE
	(p21) FMPY f11 = BETA, f103
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f12, 1 * SIZE
	(p21) STFD [DO2] = f13, 1 * SIZE
	(p21) FMPY f12 = BETA, f109
	}
	{ .mmf
	(p16) LDFD f44 = [CO1], 1 * SIZE
	(p16) LDFD f50 = [CO2], 1 * SIZE
	(p21) FMPY f13 = BETA, f115
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f14, 5 * SIZE
	(p21) STFD [DO2] = f15, 5 * SIZE
	(p21) FMPY f14 = BETA, f121
	}
	{ .mmf
	(p16) LDFD f56 = [CO1], 1 * SIZE
	(p16) LDFD f62 = [CO2], 1 * SIZE
	(p21) FMPY f15 = BETA, f127
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f6, 1 * SIZE
	(p21) STFD [DO2] = f7, 1 * SIZE
	(p20) FMPY f6 = BETA, f36
	}
	{ .mmf
	(p16) LDFD f68 = [CO1], 5 * SIZE
	(p16) LDFD f74 = [CO2], 5 * SIZE
	(p20) FMPY f7 = BETA, f42
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f10, 1 * SIZE
	(p21) STFD [DO2] = f11, 1 * SIZE
	(p20) FMPY f10 = BETA, f48
	}
	{ .mmf
	(p16) LDFD f80 = [CO1], 1 * SIZE
	(p16) LDFD f86 = [CO2], 1 * SIZE
	(p20) FMPY f11 = BETA, f54
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f12, 1 * SIZE
	(p21) STFD [DO2] = f13, 1 * SIZE
	(p20) FMPY f12 = BETA, f60
	}
	{ .mmf
	(p16) LDFD f92 = [CO1], 1 * SIZE
	(p16) LDFD f98 = [CO2], 1 * SIZE
	(p20) FMPY f13 = BETA, f66
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f14, 5 * SIZE
	(p21) STFD [DO2] = f15, 5 * SIZE
	(p20) FMPY f14 = BETA, f72
	}
	{ .mmf
	(p16) LDFD f104 = [CO1], 1 * SIZE
	(p16) LDFD f110 = [CO2], 1 * SIZE
	(p20) FMPY f15 = BETA, f78
	}
	;;
	{ .mmi
	(p16) LDFD f116 = [CO1], 5 * SIZE
	(p16) LDFD f122 = [CO2], 5 * SIZE
	adds	PRE1 = 16 * SIZE, PRE1
	}
	{ .mmb
	(p16) adds DO3 = 16 * SIZE, DO3
	nop.m 0
	br.ctop.sptk.few .L170
	}
	;;
	.align 32

/* Tail of the multiply path.  Same bit-driven split as .L80:
     p12: 8 elements, f32-f35 via CO1/DO1 and f36-f39 via CO2/DO2
     p13: 4 elements, f40-f43 via CO1/DO1
     p14: 2 elements, f44-f45 via CO3/DO3
     p15: 1 element,  f46     via CO3/DO3
   All loads are issued first, then the multiplies, then the stores. */
.L180:
	{ .mmi
	(p12) LDFD f32 = [CO1], 1 * SIZE
	(p12) LDFD f36 = [CO2], 1 * SIZE
	tbit.nz	p13, p0 = M, 2
	}
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_15
	adds	J = -1, J		// one pass accounted for
	(p9) br.cond.dptk .L199		// M is a multiple of 16: no tail
	}
	;;
	{ .mmi
	(p12) LDFD f33 = [CO1], 1 * SIZE
	(p12) LDFD f37 = [CO2], 1 * SIZE
	tbit.nz	p14, p0 = M, 1
	}
	;;
	{ .mmi
	(p12) LDFD f34 = [CO1], 1 * SIZE
	(p12) LDFD f38 = [CO2], 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) LDFD f35 = [CO1], 5 * SIZE
	(p12) LDFD f39 = [CO2]
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	{ .mmi
	(p13) LDFD f40 = [CO1], 1 * SIZE
	(p14) LDFD f44 = [CO3], 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFD f41 = [CO1], 1 * SIZE
	(p14) LDFD f45 = [CO3], 1 * SIZE
	tbit.nz	p15, p0 = M, 0
	}
	;;
	{ .mmf
	(p13) LDFD f42 = [CO1], 1 * SIZE
	(p15) LDFD f46 = [CO3]
	(p12) FMPY f32 = BETA, f32
	}
	{ .mmf
	(p12) FMPY f36 = BETA, f36
	}
	;;
	{ .mmf
	(p13) LDFD f43 = [CO1]
	(p12) FMPY f33 = BETA, f33
	}
	{ .mmf
	(p12) FMPY f37 = BETA, f37
	}
	;;
	(p12) FMPY f34 = BETA, f34
	(p12) FMPY f38 = BETA, f38
	(p12) FMPY f35 = BETA, f35
	(p12) FMPY f39 = BETA, f39

	;;
	{ .mmf
	(p12) STFD [DO1] = f32, 1 * SIZE
	(p12) STFD [DO2] = f36, 1 * SIZE
	(p13) FMPY f40 = BETA, f40
	}
	{ .mmf
	(p12) adds DO3 = 8 * SIZE, DO3
	(p14) FMPY f44 = BETA, f44
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f33, 1 * SIZE
	(p12) STFD [DO2] = f37, 1 * SIZE
	(p13) FMPY f41 = BETA, f41
	}
	{ .mmf
	(p13) adds DO3 = 4 * SIZE, DO3
	(p14) FMPY f45 = BETA, f45
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f34, 1 * SIZE
	(p12) STFD [DO2] = f38, 1 * SIZE
	(p13) FMPY f42 = BETA, f42
	}
	{ .mmf
	(p15) FMPY f46 = BETA, f46
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f35, 5 * SIZE
	(p12) STFD [DO2] = f39
	(p13) FMPY f43 = BETA, f43
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f40, 1 * SIZE
	(p14) STFD [DO3] = f44, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f41, 1 * SIZE
	(p14) STFD [DO3] = f45, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f42, 1 * SIZE
	(p15) STFD [DO3] = f46
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f43
	}
	;;
	.align 32

/* End of a multiply pass: restore ar.lc and loop while J > 0.  On the
   final exit the saved predicates are restored as well, because this
   path has modified the rotating predicates. */
.L199:
	{ .mib
	cmp.lt	p6, p0 = 0, J
	mov	ar.lc = ARLC
	(p6) br.cond.dptk .L100
	}
	;;
	{ .mib
	mov	pr = PR, -1
	br.ret.sptk.many b0
	}
	;;
	EPILOGUE