1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

/*
 * AXPY-style kernel for SPARC:  Y[i] = Y[i] + ALPHA * X[i],  i = 0 .. N-1.
 *
 * Structure:
 *   - Entry: fetch the arguments that are not already in the aliased
 *     registers and move ALPHA into a floating-point register.
 *   - INCX and INCY are converted from element counts to byte strides.
 *   - If both byte strides equal SIZE, the contiguous path (.LL11/.LL12
 *     plus scalar tail .LL16) is used; otherwise the strided path
 *     (.LL51/.LL52 plus scalar tail .LL56) is used.
 *   - Both paths process groups of eight elements (N >> 3 groups) with
 *     the loads of the next group overlapped with the arithmetic and
 *     stores of the current one, then finish the N & 7 leftovers one
 *     element at a time.
 *
 * The integer result register is cleared on every exit.
 *
 * PROLOGUE, SAVESP, EPILOGUE, LDF, STF, FMUL, FADD, FMOV, SIZE,
 * BASE_SHIFT and STACK_START are not defined in this file; presumably
 * they come from "common.h" and select single or double precision.
 *
 * NOTE(review): N is never checked for being negative.  With N < 0 the
 * grouped loops are skipped, but (N & 7) may be non-zero, so the tail
 * loops would still run.  Presumably callers guarantee N >= 0 - confirm.
 */

#define ASSEMBLER
#include "common.h"

/* Integer argument and scratch registers.  The layout differs for the */
/* 32-bit double precision build; see the entry code for how ALPHA,    */
/* INCX, Y and INCY are obtained in each configuration.                 */
#if defined(DOUBLE) && !defined(__64BIT__)
#define N	%i0
#define X	%i5
#define INCX	%i1
#define Y	%i2
#define INCY	%i3
#define I	%i4
#else
#define N	%i0
#define X	%i4
#define INCX	%i5
#define Y	%i1
#define INCY	%i2
#define I	%i3
#endif

/* Store pointer into Y for the strided path; it trails the load pointer Y. */
#define YY	%l1

/* Floating-point working set:                                */
/*   a1..a8  elements of X                                    */
/*   b1..b8  elements of Y                                    */
/*   t1..t4  products ALPHA * a                               */
/*   c1..c8  results b + t, ready to be stored                */
#ifdef DOUBLE
#define a1	%f0
#define a2	%f2
#define a3	%f4
#define a4	%f6
#define a5	%f8
#define a6	%f10
#define a7	%f12
#define a8	%f14
#define b1	%f16
#define b2	%f18
#define b3	%f20
#define b4	%f22
#define b5	%f24
#define b6	%f26
#define b7	%f28
#define b8	%f30

#define t1	%f32
#define t2	%f34
#define t3	%f36
#define t4	%f38
#define c1	%f40
#define c2	%f42
#define c3	%f44
#define c4	%f46

#define c5	%f48
#define c6	%f50
#define c7	%f52
#define c8	%f54

#define ALPHA	%f62
#else
#define a1	%f0
#define a2	%f1
#define a3	%f2
#define a4	%f3
#define a5	%f4
#define a6	%f5
#define a7	%f6
#define a8	%f7
#define b1	%f8
#define b2	%f9
#define b3	%f10
#define b4	%f11
#define b5	%f12
#define b6	%f13
#define b7	%f14
#define b8	%f15

#define t1	%f16
#define t2	%f17
#define t3	%f18
#define t4	%f19
#define c1	%f20
#define c2	%f21
#define c3	%f22
#define c4	%f23

#define c5	%f24
#define c6	%f25
#define c7	%f26
#define c8	%f27

#define ALPHA	%f31
#endif

	PROLOGUE
	SAVESP

#ifndef __64BIT__

#ifdef DOUBLE
	/* ALPHA arrives in the integer pair %i3:%i4.  Spill it first, */
	/* because %i3 is about to be reused as INCY.                  */
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], INCX
	ld	[%sp + STACK_START + 32], Y
	ld	[%sp + STACK_START + 36], INCY
#else
	/* ALPHA arrives in %i3; spill it so it can be reloaded as a float. */
	st	%i3, [%sp + STACK_START + 16]
	ld	[%sp + STACK_START + 28], Y
	ld	[%sp + STACK_START + 32], INCY
#endif
	/* Reload the spilled bits into the floating-point ALPHA register. */
	LDF	[%sp + STACK_START + 16], ALPHA
#else
	/* 64-bit build: Y and INCY are fetched from the stack frame and */
	/* ALPHA is already in a floating-point register.                */
	ldx	[%sp + STACK_START + 56], Y
	ldx	[%sp + STACK_START + 64], INCY
#ifdef DOUBLE
	FMOV	%f6, ALPHA
#else
	FMOV	%f7, ALPHA
#endif
#endif

	/* Convert the increments from elements to bytes. */
	sll	INCX, BASE_SHIFT, INCX
	sll	INCY, BASE_SHIFT, INCY

	/* Anything other than unit stride on both vectors goes to .LL50. */
	cmp	INCX, SIZE
	bne	.LL50
	nop
	cmp	INCY, SIZE
	bne	.LL50
	nop

	/* ---------------- contiguous path ---------------- */

	sra	N, 3, I			/* I = number of groups of eight */
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first group of eight X and Y elements. */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2
	LDF	[X + 2 * SIZE], a3
	LDF	[Y + 2 * SIZE], b3
	LDF	[X + 3 * SIZE], a4
	LDF	[Y + 3 * SIZE], b4
	LDF	[X + 4 * SIZE], a5
	LDF	[Y + 4 * SIZE], b5
	LDF	[X + 5 * SIZE], a6
	LDF	[Y + 5 * SIZE], b6
	LDF	[X + 6 * SIZE], a7
	LDF	[Y + 6 * SIZE], b7
	LDF	[X + 7 * SIZE], a8
	LDF	[Y + 7 * SIZE], b8

	/* Start the arithmetic for this group; .LL11 / .LL12 finish it. */
	FMUL	ALPHA, a1, t1
	FMUL	ALPHA, a2, t2
	FMUL	ALPHA, a3, t3
	FMUL	ALPHA, a4, t4

	FADD	b1, t1, c1
	FMUL	ALPHA, a5, t1
	FADD	b2, t2, c2
	FMUL	ALPHA, a6, t2

	/* Only enter the loop if another full group follows this one. */
	add	I, -1, I
	cmp	I, 0
	ble,pt	%icc, .LL12
	nop

#ifdef DOUBLE
#define PREFETCHSIZE	54
#else
#define PREFETCHSIZE	108
#endif

	/* Main loop: finish and store the current group at Y + 0..7 while */
	/* loading and starting the next group from offsets 8..15.         */
.LL11:
	prefetch [Y + PREFETCHSIZE * SIZE], 0

	LDF	[X +  8 * SIZE], a1
	LDF	[X +  9 * SIZE], a2
	LDF	[X + 10 * SIZE], a3
	LDF	[X + 11 * SIZE], a4

	FADD	b3, t3, c3
	STF	c1, [Y + 0 * SIZE]
	FMUL	ALPHA, a7, t3

	FADD	b4, t4, c4
	STF	c2, [Y + 1 * SIZE]
	FMUL	ALPHA, a8, t4

	LDF	[Y +  8 * SIZE], b1
	LDF	[Y +  9 * SIZE], b2
	LDF	[Y + 10 * SIZE], b3
	LDF	[Y + 11 * SIZE], b4

	FADD	b5, t1, c5
	STF	c3, [Y + 2 * SIZE]
	FMUL	ALPHA, a1, t1

	FADD	b6, t2, c6
	STF	c4, [Y + 3 * SIZE]
	FMUL	ALPHA, a2, t2

	prefetch [X + PREFETCHSIZE * SIZE], 0

	LDF	[X + 12 * SIZE], a5
	LDF	[X + 13 * SIZE], a6
	LDF	[X + 14 * SIZE], a7
	LDF	[X + 15 * SIZE], a8

	FADD	b7, t3, c7
	STF	c5, [Y + 4 * SIZE]
	FMUL	ALPHA, a3, t3

	FADD	b8, t4, c8
	STF	c6, [Y + 5 * SIZE]
	FMUL	ALPHA, a4, t4

	LDF	[Y + 12 * SIZE], b5
	LDF	[Y + 13 * SIZE], b6
	LDF	[Y + 14 * SIZE], b7
	LDF	[Y + 15 * SIZE], b8

	FADD	b1, t1, c1
	STF	c7, [Y + 6 * SIZE]
	FMUL	ALPHA, a5, t1
	deccc	I			/* sets the flags tested by bg below */

	FADD	b2, t2, c2
	STF	c8, [Y + 7 * SIZE]
	FMUL	ALPHA, a6, t2
	add	Y, 8 * SIZE, Y

	bg,pt	%icc, .LL11
	add	X, 8 * SIZE, X		/* delay slot */

	/* Drain: finish and store the last group that is still in flight. */
.LL12:
	FADD	b3, t3, c3
	FMUL	ALPHA, a7, t3
	FADD	b4, t4, c4
	FMUL	ALPHA, a8, t4

	FADD	b5, t1, c5
	FADD	b6, t2, c6
	FADD	b7, t3, c7
	FADD	b8, t4, c8

	STF	c1, [Y + 0 * SIZE]
	STF	c2, [Y + 1 * SIZE]
	STF	c3, [Y + 2 * SIZE]
	STF	c4, [Y + 3 * SIZE]

	STF	c5, [Y + 4 * SIZE]
	STF	c6, [Y + 5 * SIZE]
	STF	c7, [Y + 6 * SIZE]
	STF	c8, [Y + 7 * SIZE]

	add	Y, 8 * SIZE, Y
	add	X, 8 * SIZE, X

	/* Scalar tail for the remaining N & 7 contiguous elements. */
.LL15:
	and	N, 7, I
	cmp	I, 0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1

	FMUL	ALPHA, a1, t1
	FADD	b1, t1, c1

	add	I, -1, I
	cmp	I, 0
	STF	c1, [Y + 0 * SIZE]
	add	Y, 1 * SIZE, Y
	bg,pt	%icc, .LL16
	add	X, 1 * SIZE, X		/* delay slot */

.LL19:
	return	%i7 + 8
	/* Delay slot: clear the integer result, matching the exit at   */
	/* .LL59.  (Writing %g0 here would be a no-op, since %g0 always */
	/* reads as zero, and would leave the result register unset.)   */
	clr	%o0

	/* ---------------- strided path ---------------- */

.LL50:
	sra	N, 3, I			/* I = number of groups of eight */
	cmp	I, 0
	ble,pn	%icc, .LL55
	mov	Y, YY			/* delay slot: YY = store pointer into Y */

	/* Preload the first group, advancing X and Y by their byte strides. */
	/* The flags set by "cmp I, 0" below are consumed by the ble that    */
	/* ends this preload; nothing in between modifies them.              */
	LDF	[X + 0 * SIZE], a1
	add	I, -1, I
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b1
	cmp	I, 0
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a2
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b2
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a3
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b3
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a4
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b4
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a5
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b5
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a6
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b6
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a7
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b7
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a8
	add	X, INCX, X
	LDF	[Y + 0 * SIZE], b8
	ble,pt	%icc, .LL52		/* no further full group: just drain */
	add	Y, INCY, Y		/* delay slot */

	/* Main strided loop: compute the current group, reload the next */
	/* group through X / Y, and store results through YY.            */
.LL51:
	FMUL	ALPHA, a1, t1
	LDF	[X + 0 * SIZE], a1
	add	X, INCX, X

	FMUL	ALPHA, a2, t2
	LDF	[X + 0 * SIZE], a2
	add	X, INCX, X

	FMUL	ALPHA, a3, t3
	LDF	[X + 0 * SIZE], a3
	add	X, INCX, X
	FMUL	ALPHA, a4, t4
	LDF	[X + 0 * SIZE], a4
	add	X, INCX, X

	FADD	b1, t1, c1
	LDF	[Y + 0 * SIZE], b1
	add	Y, INCY, Y

	FMUL	ALPHA, a5, t1
	LDF	[X + 0 * SIZE], a5
	add	X, INCX, X
	FADD	b2, t2, c2
	LDF	[Y + 0 * SIZE], b2
	add	Y, INCY, Y

	FMUL	ALPHA, a6, t2
	LDF	[X + 0 * SIZE], a6
	add	X, INCX, X
	FADD	b3, t3, c3
	LDF	[Y + 0 * SIZE], b3
	add	Y, INCY, Y

	FMUL	ALPHA, a7, t3
	LDF	[X + 0 * SIZE], a7
	add	X, INCX, X
	FADD	b4, t4, c4
	LDF	[Y + 0 * SIZE], b4
	add	Y, INCY, Y
	FMUL	ALPHA, a8, t4
	LDF	[X + 0 * SIZE], a8
	add	X, INCX, X

	/* Store results 1..4; c1..c4 are then reused for results 5..8. */
	STF	c1, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b5, t1, c1
	STF	c2, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b6, t2, c2
	STF	c3, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b7, t3, c3
	STF	c4, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b8, t4, c4

	LDF	[Y + 0 * SIZE], b5
	add	I, -1, I
	add	Y, INCY, Y
	LDF	[Y + 0 * SIZE], b6
	cmp	I, 0			/* flags consumed by bg at loop end */
	add	Y, INCY, Y
	LDF	[Y + 0 * SIZE], b7
	add	Y, INCY, Y
	LDF	[Y + 0 * SIZE], b8
	add	Y, INCY, Y

	STF	c1, [YY + 0 * SIZE]
	add	YY, INCY, YY
	STF	c2, [YY + 0 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY + 0 * SIZE]
	add	YY, INCY, YY
	STF	c4, [YY + 0 * SIZE]

	bg,pt	%icc, .LL51
	add	YY, INCY, YY		/* delay slot */

	/* Drain: compute and store the last preloaded group. */
.LL52:
	FMUL	ALPHA, a1, t1
	FMUL	ALPHA, a2, t2
	FMUL	ALPHA, a3, t3
	FMUL	ALPHA, a4, t4

	FADD	b1, t1, c1
	FMUL	ALPHA, a5, t1
	FADD	b2, t2, c2
	FMUL	ALPHA, a6, t2
	FADD	b3, t3, c3
	FMUL	ALPHA, a7, t3
	FADD	b4, t4, c4
	FMUL	ALPHA, a8, t4

	STF	c1, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b5, t1, c1
	STF	c2, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b6, t2, c2
	STF	c3, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b7, t3, c3
	STF	c4, [YY + 0 * SIZE]
	add	YY, INCY, YY
	FADD	b8, t4, c4

	STF	c1, [YY + 0 * SIZE]
	add	YY, INCY, YY
	STF	c2, [YY + 0 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY + 0 * SIZE]
	add	YY, INCY, YY
	STF	c4, [YY + 0 * SIZE]
	add	YY, INCY, YY

	/* Scalar tail for the remaining N & 7 strided elements.  Loads and */
	/* stores both go through Y here, since each element is finished    */
	/* before the pointer advances.                                     */
.LL55:
	and	N, 7, I
	cmp	I, 0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1

	FMUL	ALPHA, a1, t1
	FADD	b1, t1, c1

	add	I, -1, I
	cmp	I, 0
	STF	c1, [Y + 0 * SIZE]
	add	Y, INCY, Y
	bg,pt	%icc, .LL56
	add	X, INCX, X		/* delay slot */

.LL59:
	return	%i7 + 8
	clr	%o0			/* delay slot: integer result = 0 */

	EPILOGUE