/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * IA-64 packing ("copy") kernel, GNU as syntax, run through the C
 * preprocessor.  The exported symbol name and the entry/exit boilerplate
 * come from the PROLOGUE / PROFCODE / EPILOGUE macros in common.h and are
 * not visible in this file.
 *
 * Register arguments (see the #defines below):
 *   M   (r32)  number of consecutive elements taken from each column
 *   N   (r33)  number of columns
 *   A   (r34)  source base address; element (i, j) is read from
 *              A + i * SIZE + j * LDA (after LDA has been scaled)
 *   LDA (r35)  column stride; shifted left by BASE_SHIFT on entry
 *              (presumably elements -> bytes; BASE_SHIFT is defined in
 *              common.h -- confirm)
 *   B   (r36)  destination, written strictly sequentially
 *
 * Output layout:
 *   Columns are consumed in panels of width W = 8 (N >> 3 times), then
 *   one panel each of W = 4, 2, 1 when the matching bit of N is set.
 *   Inside a panel, for every row i the W values
 *       A(i, j), A(i, j + 1), ..., A(i, j + W - 1)
 *   are stored contiguously, rows in increasing order.
 *
 * Loop structure:
 *   Every panel loop is modulo-scheduled with br.ctop, ar.ec = 6 and the
 *   rotating predicates p16..p21.  Each pass handles two rows:
 *     stage 0 (p16)  load row i into f32, f38, f56, ... ; p13 is set when
 *                    a second row exists at the load side (I != 1) and
 *                    gates the loads of row i + 1 into f44, f50, f68, ...
 *     stage 4 (p20)  compute p14 = (II != 1): a second row exists at the
 *                    store side for the pair stored in the next pass
 *     stage 5 (p21)  store row i; by then register rotation has renamed
 *                    f32 -> f37, f38 -> f43, ... and f44 -> f49, ...
 *                    (p14) additionally stores row i + 1 through B1
 *   The loop count is ((M + 1) >> 1) - 1, i.e. ceil(M / 2) passes plus the
 *   epilogue passes supplied by ar.ec.
 *
 * NOTE(review): for M == 0 the value written to ar.lc is -1; callers are
 * assumed to pass M >= 1 -- confirm against the call sites.
 *
 * "nop __LINE__" uses the preprocessor line number as the nop immediate;
 * the value has no effect on execution.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distances, in elements, for the source (read) stream and the
   destination (write) stream. */
#define PREFETCHSIZE   64
#define WPREFETCHSIZE  32

/* Element load/store mnemonics; the underlying macros come from common.h. */
#ifndef XDOUBLE
#define LD	LDF8
#define ST	STF8_NTA
#else
#define LD	LDFD
#define ST	STFD_NTA
#endif

#define J	r15	/* remaining 8-column panels */
#define PREB	r17	/* prefetch pointer, destination side */
#define PREA	r18	/* prefetch pointer, source side */

/* Read pointers for up to eight adjacent columns of the current panel. */
#define A1	r19
#define A2	r20
#define A3	r21
#define A4	r22
#define A5	r23
#define A6	r24
#define A7	r25
#define A8	r26
#define B1	r27	/* write pointer for the second row of a pair */
#define B2	r28	/* NOTE(review): not referenced in this file */

#define COUNT	r9	/* pass counter used to re-base PREA */
#define I	r10	/* loop count, then rows left at the load stage */
#define II	r11	/* rows left at the store stage */

#define ARLC	r29	/* saved ar.lc */
#define PR	r30	/* saved predicate registers */

#define M	r32
#define N	r33
#define A	r34
#define LDA	r35
#define B	r36

	PROLOGUE
	.prologue
	PROFCODE

	.body
	/* Scale LDA, save the predicates, J = number of 8-column panels. */
	{ .mii
	shladd	LDA = LDA, BASE_SHIFT, r0
	mov	PR = pr
	shr	J = N, 3
	}
	;;
	/* Save ar.lc; skip the 8-wide section when N < 8. */
	{ .mib
	cmp.eq	p8, p0 = 0, J
	mov	ARLC = ar.lc
	(p8) br.cond.dpnt .L20
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* Panels of 8 columns.                                                */
/* ------------------------------------------------------------------ */
.L11:
	/* A1..A8 = start of columns 0..7 of this panel; clear the rotating
	   predicates; start computing the loop count from M. */
	{ .mmi
	mov	A1 = A
	add	A2 = A, LDA
	mov	pr.rot = 0
	}
	{ .mmi
	shladd	A3 = LDA, 1, A
	shladd	A5 = LDA, 2, A
	adds	I = 1, M
	}
	;;
	{ .mmi
	shladd	A4 = LDA, 1, A2
	shladd	A6 = LDA, 2, A2
	mov	ar.ec = 6
	}
	/* p16 = 1 enables stage 0 of the pipeline. */
	{ .mmi
	cmp.eq	p16, p0 = r0, r0
	shladd	A7 = LDA, 2, A3
	shr	I = I, 1
	}
	;;
	/* B1 = slot of the second row of a pair; A advances to the next
	   panel (A += 8 * LDA). */
	{ .mmi
	adds	B1 = 8 * SIZE, B
	shladd	A8 = LDA, 2, A4
	shladd	A = LDA, 3, A
	}
	{ .mmi
	adds	I = -1, I
	mov	COUNT = 0
	adds	J = -1, J
	}
	;;
	/* NOTE(review): PREA is seeded from A after A was advanced to the
	   next panel; it is re-based on A1 inside the loop. */
	{ .mmi
	adds	PREA = PREFETCHSIZE * SIZE, A
	adds	PREB = WPREFETCHSIZE * SIZE, B
	mov	ar.lc = I
	}
	/* I and II now count remaining rows; p14 = 0 (nothing to store yet). */
	{ .mmi
	mov	I = M
	mov	II = M
	cmp.ne	p14, p0 = r0, r0
	}
	;;
	.align 32

.L12:
	/* Stores (p21 / p14) drain the pair loaded five passes earlier while
	   loads (p16 / p13) fetch the next pair. */
	{ .mmi
	(p21) ST	[B ] = f37, 1 * SIZE
	(p14) ST	[B1] = f49, 1 * SIZE
	(p16) cmp.ne.unc p13, p0 = 1, I
	}
	/* One source line per pass (stepping by LDA) and one destination
	   line, 16 elements ahead per pass. */
	{ .mmi
	lfetch.nt1	[PREA], LDA
	lfetch.excl.nt1	[PREB]
	adds	PREB = 16 * SIZE, PREB
	}
	;;
	{ .mmi
	(p21) ST	[B ] = f43, 1 * SIZE
	(p14) ST	[B1] = f55, 1 * SIZE
	cmp.eq	p9, p0 = 8, COUNT
	}
	{ .mmi
	(p16) LD	f32 = [A1], SIZE
	(p16) LD	f38 = [A2], SIZE
	(p16) adds	I = -2, I
	}
	;;
	{ .mmi
	(p21) ST	[B ] = f61, 1 * SIZE
	(p14) ST	[B1] = f73, 1 * SIZE
	(p9) mov	COUNT = 0
	}
	{ .mmi
	(p13) LD	f44 = [A1], SIZE
	(p13) LD	f50 = [A2], SIZE
	(p21) adds	II = -2, II
	}
	;;
	{ .mmb
	(p21) ST	[B ] = f67, 1 * SIZE
	(p14) ST	[B1] = f79, 1 * SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16) LD	f56 = [A3], SIZE
	(p16) LD	f62 = [A4], SIZE
	nop	__LINE__
	}
	;;
	/* Every time COUNT reaches 8 the source prefetch pointer is re-based
	   on the current position in column 0. */
	{ .mmi
	(p21) ST	[B ] = f85, 1 * SIZE
	(p14) ST	[B1] = f97, 1 * SIZE
	(p9) adds	PREA = (PREFETCHSIZE - 2)* SIZE, A1
	}
	{ .mmb
	(p13) LD	f68 = [A3], SIZE
	(p13) LD	f74 = [A4], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p21) ST	[B ] = f91, 1 * SIZE
	(p14) ST	[B1] = f103, 1 * SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16) LD	f80 = [A5], SIZE
	(p16) LD	f86 = [A6], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p21) ST	[B ] = f109, 1 * SIZE
	(p14) ST	[B1] = f121, 1 * SIZE
	nop	__LINE__
	}
	{ .mmb
	(p13) LD	f92 = [A5], SIZE
	(p13) LD	f98 = [A6], SIZE
	nop	__LINE__
	}
	;;
	/* The last B1 store steps by 9 so that B1 moves 16 elements per pass,
	   i.e. to the second-row slot of the next pair. */
	{ .mmi
	(p21) ST	[B ] = f115, 1 * SIZE
	(p14) ST	[B1] = f127, 9 * SIZE
	(p16) adds	COUNT = 1, COUNT
	}
	{ .mmb
	(p16) LD	f104 = [A7], SIZE
	(p16) LD	f110 = [A8], SIZE
	nop	__LINE__
	}
	;;
	/* When a second row was written, B skips over it (8 more elements). */
	{ .mmi
	(p13) LD	f116 = [A7], SIZE
	(p13) LD	f122 = [A8], SIZE
	(p14) adds	B = 8 * SIZE, B
	}
	/* p14 for the pair stored in the next pass; .unc clears p14 once p20
	   is off during the epilogue passes. */
	{ .mmb
	(p20) cmp.ne.unc p14, p0 = 1, II
	nop	__LINE__
	br.ctop.sptk.few .L12
	}
	;;
	/* Next 8-column panel, if any. */
	{ .mmb
	cmp.ne	p6, p0 = 0, J
	nop	__LINE__
	(p6) br.cond.dptk .L11
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* One panel of 4 columns, taken when bit 2 of N is set.               */
/* ------------------------------------------------------------------ */
.L20:
	{ .mmi
	adds	I = 1, M
	mov	A1 = A
	mov	pr.rot = 0
	}
	/* p6 = 1 when bit 2 of N is clear -> skip to the 2-column section. */
	{ .mmi
	add	A2 = A, LDA
	shladd	A3 = LDA, 1, A
	tbit.z	p6, p0 = N, 2
	}
	;;
	{ .mmi
	shladd	A4 = LDA, 1, A2
	adds	B1 = 4 * SIZE, B
	mov	ar.ec = 6
	}
	{ .mib
	cmp.eq	p16, p0 = r0, r0
	shr	I = I, 1
	(p6) br.cond.dpnt .L30
	}
	;;
	/* A += 4 * LDA for the sections that follow. */
	{ .mmi
	shladd	A = LDA, 2, A
	nop	__LINE__
	nop	__LINE__
	}
	/* J is not read again after this point in this file. */
	{ .mmi
	adds	I = -1, I
	mov	COUNT = 0
	adds	J = -1, J
	}
	;;
	{ .mmi
	adds	PREA = PREFETCHSIZE * SIZE, A
	adds	PREB = WPREFETCHSIZE * SIZE, B
	mov	ar.lc = I
	}
	{ .mmi
	mov	I = M
	mov	II = M
	cmp.ne	p14, p0 = r0, r0
	}
	;;
	.align 32

.L22:
	/* Same pipeline as .L12 with four columns: the row pair occupies
	   8 elements, the first row at B and the second at B1. */
	{ .mmi
	(p21) ST	[B ] = f37, 1 * SIZE
	(p14) ST	[B1] = f49, 1 * SIZE
	(p16) cmp.ne.unc p13, p0 = 1, I
	}
	{ .mmi
	lfetch.nt1	[PREA], LDA
	lfetch.excl.nt1	[PREB], 8 * SIZE
	cmp.eq	p9, p0 = 4, COUNT
	}
	;;
	{ .mmi
	(p21) ST	[B ] = f43, 1 * SIZE
	(p14) ST	[B1] = f55, 1 * SIZE
	(p16) adds	I = -2, I
	}
	{ .mmi
	(p16) LD	f32 = [A1], SIZE
	(p16) LD	f38 = [A2], SIZE
	(p21) adds	II = -2, II
	}
	;;
	{ .mmi
	(p21) ST	[B ] = f61, 1 * SIZE
	(p14) ST	[B1] = f73, 1 * SIZE
	(p9) mov	COUNT = 0
	}
	{ .mmi
	(p13) LD	f44 = [A1], SIZE
	(p13) LD	f50 = [A2], SIZE
	nop	__LINE__
	}
	;;
	/* The last B1 store steps by 5 so that B1 moves 8 elements per pass;
	   PREA is re-based on A1 every 4 passes. */
	{ .mmi
	(p21) ST	[B ] = f67, 1 * SIZE
	(p14) ST	[B1] = f79, 5 * SIZE
	(p9) adds	PREA = PREFETCHSIZE * SIZE, A1
	}
	{ .mmb
	(p16) LD	f56 = [A3], SIZE
	(p16) LD	f62 = [A4], SIZE
	nop	__LINE__
	}
	;;
	{ .mmi
	(p13) LD	f68 = [A3], SIZE
	(p13) LD	f74 = [A4], SIZE
	(p16) adds	COUNT = 1, COUNT
	}
	{ .mmb
	(p14) adds	B = 4 * SIZE, B
	(p20) cmp.ne.unc p14, p0 = 1, II
	br.ctop.sptk.few .L22
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* One panel of 2 columns, taken when bit 1 of N is set.               */
/* ------------------------------------------------------------------ */
.L30:
	{ .mmi
	adds	I = 1, M
	mov	A1 = A
	mov	pr.rot = 0
	}
	/* p6 = 1 when bit 1 of N is clear -> skip to the 1-column section. */
	{ .mmi
	add	A2 = A, LDA
	adds	B1 = 2 * SIZE, B
	tbit.z	p6, p0 = N, 1
	}
	;;
	{ .mmi
	nop	__LINE__
	nop	__LINE__
	mov	ar.ec = 6
	}
	{ .mib
	cmp.eq	p16, p0 = r0, r0
	shr	I = I, 1
	(p6) br.cond.dpnt .L40
	}
	;;
	/* The stop inside the bundle orders the decrement of I before its
	   use as the loop count; A += 2 * LDA for the last section. */
	{ .mmi
	adds	I = -1, I
	;;
	shladd	A = LDA, 1, A
	mov	ar.lc = I
	}
	{ .mmi
	mov	I = M
	mov	II = M
	cmp.ne	p14, p0 = r0, r0
	}
	;;
	.align 32

.L32:
	/* Two columns, no prefetching: the row pair occupies 4 elements. */
	{ .mmi
	(p21) ST	[B ] = f37, 1 * SIZE
	(p14) ST	[B1] = f49, 1 * SIZE
	(p16) cmp.ne.unc p13, p0 = 1, I
	}
	{ .mmi
	nop	__LINE__
	nop	__LINE__
	(p21) adds	II = -2, II
	}
	;;
	/* The last B1 store steps by 3 so that B1 moves 4 elements per pass. */
	{ .mmi
	(p21) ST	[B ] = f43, 1 * SIZE
	(p14) ST	[B1] = f55, 3 * SIZE
	nop	__LINE__
	}
	{ .mmi
	(p16) LD	f32 = [A1], SIZE
	(p16) LD	f38 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmi
	(p13) LD	f44 = [A1], SIZE
	(p13) LD	f50 = [A2], SIZE
	(p16) adds	I = -2, I
	}
	{ .mmb
	(p14) adds	B = 2 * SIZE, B
	(p20) cmp.ne.unc p14, p0 = 1, II
	br.ctop.sptk.few .L32
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* Final single column, taken when bit 0 of N is set.                  */
/* ------------------------------------------------------------------ */
.L40:
	{ .mmi
	adds	I = 1, M
	mov	A1 = A
	mov	pr.rot = 0
	}
	/* p6 = 1 when bit 0 of N is clear -> nothing left to copy. */
	{ .mmi
	tbit.z	p6, p0 = N, 0
	}
	;;
	{ .mmi
	nop	__LINE__
	nop	__LINE__
	mov	ar.ec = 6
	}
	{ .mib
	cmp.eq	p16, p0 = r0, r0
	shr	I = I, 1
	(p6) br.cond.dpnt .L999
	}
	;;
	{ .mmi
	adds	I = -1, I
	;;
	mov	ar.lc = I
	}
	{ .mmi
	mov	I = M
	mov	II = M
	cmp.ne	p14, p0 = r0, r0
	}
	;;
	.align 32

.L42:
	/* One column: both elements of a pair go through B, the first under
	   p21 and the second (when present) under p14 one group later. */
	{ .mmi
	(p21) ST	[B ] = f37, 1 * SIZE
	(p16) cmp.ne.unc p13, p0 = 1, I
	(p21) adds	II = -2, II
	}
	;;
	{ .mmi
	(p14) ST	[B ] = f49, 1 * SIZE
	(p16) LD	f32 = [A1], SIZE
	(p16) adds	I = -2, I
	}
	;;
	{ .mmb
	(p13) LD	f44 = [A1], SIZE
	(p20) cmp.ne.unc p14, p0 = 1, II
	br.ctop.sptk.few .L42
	}
	;;
	.align 32

/* Restore the predicates and ar.lc saved at entry, then return. */
.L999:
	mov	pr = PR, -1
	mov	ar.lc = ARLC
	br.ret.sptk.many b0
	EPILOGUE
