/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Column-by-column rank-1 update kernel for SPARC.
 *
 * For every column j = 0 .. N-1 the code below performs
 *
 *     s          = ALPHA * y[j]                 (one Y element per column)
 *     A1[i]     += x[i] * s,   i = 0 .. M-1     (A1 = start of column j)
 *
 * and then advances the column pointer by LDA elements and the Y
 * pointer by INCY elements.  This is the update A := A + alpha*x*y^T.
 *
 * X handling: when the byte stride of X differs from one element
 * (INCX << BASE_SHIFT != SIZE) the M elements of X are first packed
 * contiguously into BUFFER and the update reads from BUFFER instead.
 * With unit stride X is read in place and BUFFER is not touched.
 *
 * Early exit: the routine goes straight to the exit label when
 * M <= 0 or N <= 0.  The exit path clears %o0 in the delay slot of
 * `return`.
 *
 * Build variants (selected by preprocessor symbols):
 *   DOUBLE     - even-numbered FP registers are used for every value.
 *   __64BIT__  - trailing arguments are fetched with ldx and ALPHA is
 *                taken from an FP register (%f6 / %f7); otherwise
 *                ALPHA is spilled from integer registers to the stack
 *                and reloaded into an FP register.
 *
 * PROLOGUE, SAVESP, EPILOGUE, LDF, STF, FMUL, FADD, FMOV, SIZE,
 * BASE_SHIFT and STACK_START are not defined in this file; presumably
 * they come from common.h -- confirm there.
 *
 * NOTE(review): all trip counts are derived with `sra` and tested on
 * %icc, i.e. as 32-bit quantities, also in the __64BIT__ build.
 * Confirm that callers never pass M or N outside the 32-bit range.
 */

#define ASSEMBLER
#include "common.h"

/* Incoming integer arguments. */
#define M	%i0
#define N	%i1

/*
 * In the 32-bit DOUBLE build ALPHA arrives in %i3/%i4 (see the spill
 * in the prologue), so X is one register further along and INCX, Y
 * and INCY are loaded from the stack into %i2..%i4.
 */
#if defined(DOUBLE) && !defined(__64BIT__)
#define X	%i5
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
#else
#define X	%i4
#define INCX	%i5
#define Y	%i2
#define INCY	%i3
#endif

/* Arguments loaded from the stack in the prologue. */
#define A	%l0
#define LDA	%l1
#define BUFFER	%l2

/* Loop counters: I = inner (row blocks), J = outer (columns / copy). */
#define I	%l3
#define J	%l4

/* Working pointers: A1 = current column, X1 = X cursor, XX = X base. */
#define A1	%o0
#define X1	%o2
#define XX	%o3

/* Floating-point register roles: t* = products, x* = X, a* = A. */
#ifdef DOUBLE
#define t1	%f0
#define t2	%f2
#define t3	%f4
#define t4	%f6

#define x1	%f8
#define x2	%f10
#define x3	%f12
#define x4	%f14
#define x5	%f16
#define x6	%f18
#define x7	%f20
#define x8	%f22

#define a1	%f24
#define a2	%f26
#define a3	%f28
#define a4	%f30
#define a5	%f32
#define a6	%f34
#define a7	%f36
#define a8	%f38

#define a9	%f40
#define a10	%f42
#define a11	%f44
#define a12	%f46
#define a13	%f48
#define a14	%f50
#define a15	%f52
#define a16	%f54

#define y1	%f56
#define y2	%f58

#define ALPHA	%f60

#else
#define t1	%f0
#define t2	%f1
#define t3	%f2
#define t4	%f3

#define x1	%f4
#define x2	%f5
#define x3	%f6
#define x4	%f7
#define x5	%f8
#define x6	%f9
#define x7	%f10
#define x8	%f11

#define a1	%f12
#define a2	%f13
#define a3	%f14
#define a4	%f15
#define a5	%f16
#define a6	%f17
#define a7	%f18
#define a8	%f19

#define a9	%f20
#define a10	%f21
#define a11	%f22
#define a12	%f23
#define a13	%f24
#define a14	%f25
#define a15	%f26
#define a16	%f27

#define y1	%f28
#define y2	%f29
#define ALPHA	%f30
#endif

/* Prefetch distance, in elements ahead of A1, used in the main loop. */
#define PREFETCHSIZE 60

	PROLOGUE
	SAVESP
	nop

/* ---- Argument unpacking ------------------------------------------ */
#ifndef __64BIT__

#ifdef DOUBLE
	/* Spill both halves of ALPHA before %i3/%i4 are reused for Y/INCY. */
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], INCX
	ld	[%sp + STACK_START + 32], Y
	ld	[%sp + STACK_START + 36], INCY
	ld	[%sp + STACK_START + 40], A
	ld	[%sp + STACK_START + 44], LDA
	ld	[%sp + STACK_START + 48], BUFFER
#else
	/* Spill ALPHA before %i3 is reused for INCY. */
	st	%i3, [%sp + STACK_START + 16]

	ld	[%sp + STACK_START + 28], Y
	ld	[%sp + STACK_START + 32], INCY
	ld	[%sp + STACK_START + 36], A
	ld	[%sp + STACK_START + 40], LDA
	ld	[%sp + STACK_START + 44], BUFFER
#endif
	/* Reload the spilled value into the FP register file. */
	LDF	[%sp + STACK_START + 16], ALPHA
#else
	ldx	[%sp + STACK_START + 56], Y
	ldx	[%sp + STACK_START + 64], INCY
	ldx	[%sp + STACK_START + 72], A
	ldx	[%sp + STACK_START + 80], LDA
	ldx	[%sp + STACK_START + 88], BUFFER
#ifdef DOUBLE
	FMOV	%f6, ALPHA
#else
	FMOV	%f7, ALPHA
#endif
#endif

/* ---- Scale strides to bytes, reject empty problems ---------------- */
	sll	LDA, BASE_SHIFT, LDA

	cmp	M, 0
	ble	%icc, .LL999
	sll	INCX, BASE_SHIFT, INCX		/* delay slot: always executed */
	cmp	N, 0
	ble	%icc, .LL999
	sll	INCY, BASE_SHIFT, INCY		/* delay slot: always executed */

	/* Unit stride: use X in place (XX = X) and skip the packing. */
	cmp	INCX, SIZE
	be	%icc, .LL10
	mov	X, XX				/* delay slot: always executed */

/* ---- Pack strided X into BUFFER ----------------------------------- */
	mov	BUFFER, XX
	mov	BUFFER, X1

	sra	M, 3, J				/* J = number of 8-element groups */
	cmp	J, 0
	ble,pn	%icc, .LL05
	nop

.LL01:
	/* Gather eight strided elements, then store them contiguously. */
	LDF	[X], a1
	add	X, INCX, X
	LDF	[X], a2
	add	X, INCX, X
	LDF	[X], a3
	add	X, INCX, X
	LDF	[X], a4
	add	X, INCX, X
	LDF	[X], a5
	add	X, INCX, X
	LDF	[X], a6
	add	X, INCX, X
	LDF	[X], a7
	add	X, INCX, X
	LDF	[X], a8
	add	X, INCX, X

	STF	a1, [X1 + 0 * SIZE]
	STF	a2, [X1 + 1 * SIZE]
	STF	a3, [X1 + 2 * SIZE]
	STF	a4, [X1 + 3 * SIZE]
	STF	a5, [X1 + 4 * SIZE]
	STF	a6, [X1 + 5 * SIZE]
	STF	a7, [X1 + 6 * SIZE]
	STF	a8, [X1 + 7 * SIZE]

	add	X1, 8 * SIZE, X1

	deccc	J
	bg,pn	%icc, .LL01
	nop

.LL05:
	andcc	M, 7, J				/* J = leftover elements (0..7) */
	ble,pn	%icc, .LL10
	nop

.LL06:
	LDF	[X], a1
	add	X, INCX, X

	STF	a1, [X1 + 0 * SIZE]
	add	X1, 1 * SIZE, X1

	deccc	J
	bg,pn	%icc, .LL06
	nop

/* ---- Outer loop over the N columns -------------------------------- */
.LL10:
	mov	N, J
	/* N was already tested above; this re-check is kept as written. */
	cmp	N, 0
	ble,pn	%icc, .LL999
	nop

.LL11:
	mov	XX, X1				/* rewind X cursor for this column */

	mov	A, A1				/* A1 = current column */
	add	A, LDA, A			/* A  = next column */

	LDF	[Y], y1
	add	Y, INCY, Y

	FMUL	ALPHA, y1, y1			/* y1 = ALPHA * y[j] */

	sra	M, 3, I				/* I = number of 8-row blocks */
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Pipeline fill: load the first block and start its arithmetic. */
	LDF	[X1 + 0 * SIZE], x1
	LDF	[A1 + 0 * SIZE], a1
	LDF	[X1 + 1 * SIZE], x2
	LDF	[A1 + 1 * SIZE], a2
	LDF	[X1 + 2 * SIZE], x3
	LDF	[A1 + 2 * SIZE], a3
	LDF	[X1 + 3 * SIZE], x4
	LDF	[A1 + 3 * SIZE], a4

	LDF	[X1 + 4 * SIZE], x5
	LDF	[A1 + 4 * SIZE], a5
	LDF	[X1 + 5 * SIZE], x6
	LDF	[A1 + 5 * SIZE], a6
	LDF	[X1 + 6 * SIZE], x7
	LDF	[A1 + 6 * SIZE], a7
	LDF	[X1 + 7 * SIZE], x8
	LDF	[A1 + 7 * SIZE], a8

	FMUL	x1, y1, t1
	FMUL	x2, y1, t2
	FMUL	x3, y1, t3
	FMUL	x4, y1, t4

	FADD	a1, t1, a1
	FMUL	x5, y1, t1
	FADD	a2, t2, a2
	FMUL	x6, y1, t2

	deccc	I
	ble,pn	%icc, .LL13			/* single block: drain directly */
	nop

.LL12:
	/*
	 * Steady state: finish and store the current block (offsets 0..7)
	 * while loading and starting the next one (offsets 8..15).
	 * Statement order is part of the pipelining; do not reorder.
	 */
	prefetch [A1 + PREFETCHSIZE * SIZE], 0

	FADD	a3, t3, a3
	LDF	[X1 +  8 * SIZE], x1
	FMUL	x7, y1, t3
	LDF	[X1 +  9 * SIZE], x2
	FADD	a4, t4, a4
	LDF	[X1 + 10 * SIZE], x3
	FMUL	x8, y1, t4
	LDF	[X1 + 11 * SIZE], x4

	FADD	a5, t1, a5
	STF	a1, [A1 +  0 * SIZE]
	LDF	[A1 +  8 * SIZE], a1
	FMUL	x1, y1, t1
	STF	a2, [A1 +  1 * SIZE]
	LDF	[A1 +  9 * SIZE], a2

	FADD	a6, t2, a6
	STF	a3, [A1 +  2 * SIZE]
	LDF	[A1 + 10 * SIZE], a3
	FMUL	x2, y1, t2
	STF	a4, [A1 +  3 * SIZE]
	LDF	[A1 + 11 * SIZE], a4

	FADD	a7, t3, a7
	LDF	[X1 + 12 * SIZE], x5
	FMUL	x3, y1, t3
	LDF	[X1 + 13 * SIZE], x6
	FADD	a8, t4, a8
	LDF	[X1 + 14 * SIZE], x7
	FMUL	x4, y1, t4
	LDF	[X1 + 15 * SIZE], x8

	FADD	a1, t1, a1
	STF	a5, [A1 +  4 * SIZE]
	deccc	I				/* sets the flags tested by bg below */
	LDF	[A1 + 12 * SIZE], a5
	FMUL	x5, y1, t1
	STF	a6, [A1 +  5 * SIZE]
	LDF	[A1 + 13 * SIZE], a6
	FADD	a2, t2, a2
	STF	a7, [A1 +  6 * SIZE]
	LDF	[A1 + 14 * SIZE], a7
	FMUL	x6, y1, t2
	STF	a8, [A1 +  7 * SIZE]
	LDF	[A1 + 15 * SIZE], a8
	add	A1, 8 * SIZE, A1		/* plain add: leaves flags intact */

	bg,pn	%icc, .LL12
	add	X1, 8 * SIZE, X1		/* delay slot: always executed */

.LL13:
	/* Pipeline drain: finish and store the last loaded block. */
	FADD	a3, t3, a3
	FMUL	x7, y1, t3
	FADD	a4, t4, a4
	FMUL	x8, y1, t4

	FADD	a5, t1, a5
	FADD	a6, t2, a6
	FADD	a7, t3, a7
	FADD	a8, t4, a8

	STF	a1, [A1 + 0 * SIZE]
	STF	a2, [A1 + 1 * SIZE]
	STF	a3, [A1 + 2 * SIZE]
	STF	a4, [A1 + 3 * SIZE]

	STF	a5, [A1 + 4 * SIZE]
	STF	a6, [A1 + 5 * SIZE]
	STF	a7, [A1 + 6 * SIZE]
	STF	a8, [A1 + 7 * SIZE]

	add	A1, 8 * SIZE, A1
	add	X1, 8 * SIZE, X1

/* ---- Row remainder: 4, then 2, then 1 ----------------------------- */
.LL15:
	andcc	M, 4, I				/* zero result skips this tail */
	ble,pn	%icc, .LL16
	nop

	LDF	[X1 + 0 * SIZE], x1
	LDF	[A1 + 0 * SIZE], a1
	LDF	[X1 + 1 * SIZE], x2
	LDF	[A1 + 1 * SIZE], a2

	LDF	[X1 + 2 * SIZE], x3
	LDF	[A1 + 2 * SIZE], a3
	LDF	[X1 + 3 * SIZE], x4
	LDF	[A1 + 3 * SIZE], a4

	FMUL	x1, y1, t1
	FMUL	x2, y1, t2
	FMUL	x3, y1, t3
	FMUL	x4, y1, t4

	FADD	a1, t1, a1
	FADD	a2, t2, a2
	FADD	a3, t3, a3
	FADD	a4, t4, a4

	STF	a1, [A1 + 0 * SIZE]
	STF	a2, [A1 + 1 * SIZE]
	STF	a3, [A1 + 2 * SIZE]
	add	X1, 4 * SIZE, X1
	STF	a4, [A1 + 3 * SIZE]
	add	A1, 4 * SIZE, A1

.LL16:
	andcc	M, 2, I
	ble,pn	%icc, .LL17
	nop

	LDF	[X1 + 0 * SIZE], x1
	LDF	[X1 + 1 * SIZE], x2
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2

	FMUL	x1, y1, t1
	FMUL	x2, y1, t2

	FADD	a1, t1, a1
	FADD	a2, t2, a2

	STF	a1, [A1 + 0 * SIZE]
	add	X1, 2 * SIZE, X1
	STF	a2, [A1 + 1 * SIZE]
	add	A1, 2 * SIZE, A1

.LL17:
	andcc	M, 1, I
	ble,pn	%icc, .LL19
	nop

	LDF	[X1 + 0 * SIZE], x1
	add	X1, 1 * SIZE, X1

	LDF	[A1 + 0 * SIZE], a1

	FMUL	x1, y1, t1
	FADD	a1, t1, a1

	STF	a1, [A1 + 0 * SIZE]
	add	A1, 1 * SIZE, A1

.LL19:
	deccc	J				/* next column */
	bg	%icc, .LL11
	nop

.LL999:
	return	%i7 + 8
	clr	%o0				/* delay slot: %o0 = 0 */

	EPILOGUE