/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

/*
 * Complex AXPY kernel for SPARC (zaxpy when DOUBLE is defined,
 * caxpy otherwise):
 *
 *     y[i] += alpha * x[i],  i = 0 .. N-1   (complex elements)
 *
 * With CONJ defined, the ADD1/ADD2 macros below flip the sign of the
 * cross terms to produce the conjugated variant of the update.
 *
 * Structure:
 *   - Fast path (unit stride, INCX == INCY == 2*SIZE): a software-
 *     pipelined main loop (.LL11) that processes 4 complex elements
 *     per iteration, plus a drain block (.LL12) and a scalar
 *     remainder loop (.LL16) for N mod 4 elements.
 *   - General strided path (.LL50/.LL51/.LL52/.LL56): same 4-wide
 *     pipeline, but X/Y advance by INCX/INCY and stores go through a
 *     lagging copy of the Y pointer (YY), since Y itself has already
 *     been advanced for the pipelined loads.
 */

#define ASSEMBLER
#include "common.h"

/* Integer register roles.  The 32-bit DOUBLE build uses a different
   layout — presumably because alpha occupies earlier argument slots
   in that ABI (see the stack reloads in the prologue); the remaining
   arguments are fetched from the stack. */
#if defined(DOUBLE) && !defined(__64BIT__)
#define N	%i0
#define X	%i1
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
#define I	%i5
#else
#define N	%i0
#define X	%i5
#define INCX	%i1
#define Y	%i2
#define INCY	%i3
#define I	%i4
#endif

#define YY	%l1	/* lagging store pointer for the strided path */

/* Floating-point register map.  a1..a8 hold x values, b1..b8 hold y
   values, t1..t4 are products, c1..c8 are accumulated results.
   DOUBLE uses even-numbered registers (64-bit pairs); single uses
   consecutive 32-bit registers. */
#ifdef DOUBLE
#define a1	%f0
#define a2	%f2
#define a3	%f4
#define a4	%f6
#define a5	%f8
#define a6	%f10
#define a7	%f12
#define a8	%f14
#define b1	%f16
#define b2	%f18
#define b3	%f20
#define b4	%f22
#define b5	%f24
#define b6	%f26
#define b7	%f28
#define b8	%f30

#define t1	%f32
#define t2	%f34
#define t3	%f36
#define t4	%f38
#define c1	%f40
#define c2	%f42
#define c3	%f44
#define c4	%f46

#define c5	%f48
#define c6	%f50
#define c7	%f52
#define c8	%f54

#define ALPHA_R	%f60
#define ALPHA_I	%f62
#else
#define a1	%f0
#define a2	%f1
#define a3	%f2
#define a4	%f3
#define a5	%f4
#define a6	%f5
#define a7	%f6
#define a8	%f7
#define b1	%f8
#define b2	%f9
#define b3	%f10
#define b4	%f11
#define b5	%f12
#define b6	%f13
#define b7	%f14
#define b8	%f15

#define t1	%f16
#define t2	%f17
#define t3	%f18
#define t4	%f19
#define c1	%f20
#define c2	%f21
#define c3	%f22
#define c4	%f23

#define c5	%f24
#define c6	%f25
#define c7	%f26
#define c8	%f27

#define ALPHA_R	%f30
#define ALPHA_I	%f31
#endif

/* Sign selection for the complex multiply cross terms.
   Without CONJ:  y_r += alpha_r*x_r - alpha_i*x_i
                  y_i += alpha_r*x_i + alpha_i*x_r   (y += alpha*x)
   With CONJ the two signs are exchanged. */
#ifndef CONJ
#define ADD1	FSUB
#define ADD2	FADD
#else
#define ADD1	FADD
#define ADD2	FSUB
#endif

	PROLOGUE
	SAVESP

/* Argument fetch.  32-bit: spill the integer argument registers that
   carry alpha so it can be reloaded into FP registers, then read the
   remaining arguments from the caller's stack area.  64-bit: alpha
   arrives in FP argument registers and the tail arguments via ldx. */
#ifndef __64BIT__
#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]	! alpha_r (hi/lo words)
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]	! alpha_i low word

	ld	[%sp+ STACK_START + 32], X
	ld	[%sp+ STACK_START + 36], INCX
	ld	[%sp+ STACK_START + 40], Y
	ld	[%sp+ STACK_START + 44], INCY

	ldd	[%sp + STACK_START + 16], ALPHA_R
	ldd	[%sp + STACK_START + 24], ALPHA_I
#else
	st	%i3, [%sp + STACK_START + 16]	! alpha_r
	st	%i4, [%sp + STACK_START + 20]	! alpha_i

	ld	[%sp+ STACK_START + 28], INCX
	ld	[%sp+ STACK_START + 32], Y
	ld	[%sp+ STACK_START + 36], INCY

	ld	[%sp + STACK_START + 16], ALPHA_R
	ld	[%sp + STACK_START + 20], ALPHA_I
#endif
#else
	ldx	[%sp + STACK_START + 56], INCX
	ldx	[%sp + STACK_START + 64], Y
	ldx	[%sp + STACK_START + 72], INCY
#ifdef DOUBLE
	FMOV	%f6, ALPHA_R
	FMOV	%f8, ALPHA_I
#else
	FMOV	%f7, ALPHA_R
	FMOV	%f9, ALPHA_I
#endif
#endif

	/* Convert element strides to byte strides (complex element =
	   2 * SIZE bytes, hence ZBASE_SHIFT). */
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	/* Take the strided path unless both vectors are contiguous. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop
	cmp	INCY, 2 * SIZE
	bne	.LL50
	nop

/* ---- Unit-stride path ------------------------------------------- */

	sra	N, 2, I			! I = N / 4 (4 complex elems/iter)
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Pipeline prologue: preload the first 4 complex elements of
	   x and y and start the first round of products. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], b1
	LDF	[Y +  1 * SIZE], b2

	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4
	LDF	[Y +  2 * SIZE], b3
	LDF	[Y +  3 * SIZE], b4

	LDF	[X +  4 * SIZE], a5
	LDF	[X +  5 * SIZE], a6
	LDF	[Y +  4 * SIZE], b5
	LDF	[Y +  5 * SIZE], b6

	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8
	LDF	[Y +  6 * SIZE], b7
	LDF	[Y +  7 * SIZE], b8

	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_R, a3, t3
	FMUL	ALPHA_R, a4, t4

	FADD	b1, t1, c1		! c1 = y_r + alpha_r*x_r
	FMUL	ALPHA_I, a2, t1
	ADD2	b2, t2, c2		! c2 = y_i + alpha_r*x_i
	FMUL	ALPHA_I, a1, t2

	deccc	I
	ble,pt	%icc, .LL12		! only one group: go drain
	nop

#ifdef DOUBLE
#define PREFETCHSIZE  54
#else
#define PREFETCHSIZE 108
#endif

/* Main software-pipelined loop: each iteration finishes the update of
   4 complex elements, stores them, and loads/starts the next 4.  FP
   ops, loads/stores, and prefetches are interleaved by hand — the
   statement order here is load-bearing; do not reorder. */
.LL11:
	FADD	b3, t3, c3
	prefetch [Y + PREFETCHSIZE * SIZE], 0
	FMUL	ALPHA_I, a4, t3
	prefetch [X + PREFETCHSIZE * SIZE], 0

	ADD2	b4, t4, c4
	LDF	[Y +  8 * SIZE], b1
	FMUL	ALPHA_I, a3, t4
	LDF	[X +  9 * SIZE], a2

	ADD1	c1, t1, c1		! c1 -= alpha_i*x_i (non-CONJ)
	LDF	[Y +  9 * SIZE], b2
	FMUL	ALPHA_R, a5, t1
	LDF	[X +  8 * SIZE], a1

	FADD	c2, t2, c2		! c2 += alpha_i*x_r
	LDF	[Y + 10 * SIZE], b3
	FMUL	ALPHA_R, a6, t2
	LDF	[X + 11 * SIZE], a4

	ADD1	c3, t3, c3
	STF	c1, [Y +  0 * SIZE]
	FMUL	ALPHA_R, a7, t3
	LDF	[Y + 11 * SIZE], b4

	FADD	c4, t4, c4
	STF	c2, [Y +  1 * SIZE]
	FMUL	ALPHA_R, a8, t4
	LDF	[X + 10 * SIZE], a3

	FADD	b5, t1, c5
	STF	c3, [Y +  2 * SIZE]
	FMUL	ALPHA_I, a6, t1

	ADD2	b6, t2, c6
	STF	c4, [Y +  3 * SIZE]
	FMUL	ALPHA_I, a5, t2

	FADD	b7, t3, c7
	LDF	[Y + 12 * SIZE], b5
	FMUL	ALPHA_I, a8, t3
	LDF	[X + 13 * SIZE], a6

	ADD2	b8, t4, c8
	LDF	[Y + 13 * SIZE], b6
	FMUL	ALPHA_I, a7, t4
	LDF	[X + 12 * SIZE], a5

	ADD1	c5, t1, c5
	LDF	[Y + 14 * SIZE], b7
	FMUL	ALPHA_R, a1, t1
	LDF	[X + 15 * SIZE], a8

	FADD	c6, t2, c6
	LDF	[Y + 15 * SIZE], b8
	FMUL	ALPHA_R, a2, t2
	LDF	[X + 14 * SIZE], a7

	ADD1	c7, t3, c7
	STF	c5, [Y +  4 * SIZE]
	FMUL	ALPHA_R, a3, t3
	add	X, 8 * SIZE, X

	FADD	c8, t4, c8
	STF	c6, [Y +  5 * SIZE]
	FMUL	ALPHA_R, a4, t4
	deccc	I

	FADD	b1, t1, c1
	STF	c7, [Y +  6 * SIZE]
	FMUL	ALPHA_I, a2, t1

	ADD2	b2, t2, c2
	STF	c8, [Y +  7 * SIZE]
	FMUL	ALPHA_I, a1, t2

	bg,pt	%icc, .LL11
	add	Y, 8 * SIZE, Y		! delay slot: advance Y

/* Pipeline drain: finish and store the last in-flight 4 elements. */
.LL12:
	FADD	b3, t3, c3
	FMUL	ALPHA_I, a4, t3
	ADD2	b4, t4, c4
	FMUL	ALPHA_I, a3, t4

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2

	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	FADD	b5, t1, c5
	FMUL	ALPHA_I, a6, t1
	ADD2	b6, t2, c6
	FMUL	ALPHA_I, a5, t2

	FADD	b7, t3, c7
	FMUL	ALPHA_I, a8, t3
	ADD2	b8, t4, c8
	FMUL	ALPHA_I, a7, t4

	ADD1	c5, t1, c5
	FADD	c6, t2, c6
	ADD1	c7, t3, c7
	FADD	c8, t4, c8

	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]
	STF	c3, [Y +  2 * SIZE]
	STF	c4, [Y +  3 * SIZE]

	STF	c5, [Y +  4 * SIZE]
	STF	c6, [Y +  5 * SIZE]
	STF	c7, [Y +  6 * SIZE]
	STF	c8, [Y +  7 * SIZE]

	add	X, 8 * SIZE, X
	add	Y, 8 * SIZE, Y

/* Remainder: handle N mod 4 elements one at a time. */
.LL15:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 0 * SIZE], b1
	LDF	[Y + 1 * SIZE], b2

	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_I, a2, t3
	FMUL	ALPHA_I, a1, t4

	FADD	b1, t1, b1		! y_r += alpha_r*x_r
	add	I, -1, I
	ADD2	b2, t2, b2		! y_i += alpha_r*x_i
	cmp	I, 0
	ADD1	b1, t3, c1		! y_r -= alpha_i*x_i (non-CONJ)
	FADD	b2, t4, c2		! y_i += alpha_i*x_r

	STF	c1, [Y + 0 * SIZE]
	STF	c2, [Y + 1 * SIZE]

	add	Y, 2 * SIZE, Y
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X		! delay slot

.LL19:
	return	%i7 + 8
	clr	%g0			! delay slot

/* ---- General strided path (INCX/INCY arbitrary) ------------------ */
/* Same 4-wide pipeline as above, but pointers advance by the byte
   strides, and stores use YY, a copy of Y that lags behind the load
   pointer by the pipeline depth. */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	mov	Y, YY			! delay slot: init store pointer

	/* Prologue: load 4 complex elements of x and y, start the
	   real-part products for the first group. */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	LDF	[Y + 1 * SIZE], b2
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a3
	LDF	[Y + 0 * SIZE], b3
	LDF	[X + 1 * SIZE], a4
	add	X, INCX, X
	LDF	[Y + 1 * SIZE], b4
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a5
	add	I, -1, I
	LDF	[Y + 0 * SIZE], b5
	LDF	[X + 1 * SIZE], a6
	cmp	I, 0
	add	X, INCX, X
	LDF	[Y + 1 * SIZE], b6
	add	Y, INCY, Y
	LDF	[X + 0 * SIZE], a7
	FMUL	ALPHA_R, a1, t1
	LDF	[Y + 0 * SIZE], b7
	FMUL	ALPHA_R, a2, t2
	LDF	[X + 1 * SIZE], a8
	FMUL	ALPHA_R, a3, t3
	add	X, INCX, X
	LDF	[Y + 1 * SIZE], b8
	FMUL	ALPHA_R, a4, t4

	ble,pt	%icc, .LL52		! only one group: go drain
	add	Y, INCY, Y		! delay slot

/* Strided main loop: 4 complex elements per iteration. */
.LL51:
	FADD	b1, t1, c1
	LDF	[Y + 0 * SIZE], b1
	FMUL	ALPHA_I, a2, t1
	LDF	[X + 1 * SIZE], a2
	ADD2	b2, t2, c2
	LDF	[Y + 1 * SIZE], b2
	add	Y, INCY, Y
	FMUL	ALPHA_I, a1, t2
	LDF	[X + 0 * SIZE], a1
	add	X, INCX, X

	FADD	b3, t3, c3
	LDF	[Y + 0 * SIZE], b3
	FMUL	ALPHA_I, a4, t3
	LDF	[X + 1 * SIZE], a4
	ADD2	b4, t4, c4
	LDF	[Y + 1 * SIZE], b4
	add	Y, INCY, Y
	FMUL	ALPHA_I, a3, t4
	LDF	[X + 0 * SIZE], a3
	add	X, INCX, X

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2
	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	STF	c1, [YY + 0 * SIZE]
	FADD	b5, t1, c1
	FMUL	ALPHA_I, a6, t1
	STF	c2, [YY + 1 * SIZE]
	ADD2	b6, t2, c2
	FMUL	ALPHA_I, a5, t2
	add	YY, INCY, YY
	STF	c3, [YY + 0 * SIZE]
	FADD	b7, t3, c3
	FMUL	ALPHA_I, a8, t3
	STF	c4, [YY + 1 * SIZE]
	ADD2	b8, t4, c4
	FMUL	ALPHA_I, a7, t4
	add	YY, INCY, YY

	LDF	[X + 0 * SIZE], a5
	ADD1	c1, t1, c1
	LDF	[Y + 0 * SIZE], b5
	FMUL	ALPHA_R, a1, t1
	LDF	[X + 1 * SIZE], a6
	add	X, INCX, X
	FADD	c2, t2, c2
	LDF	[Y + 1 * SIZE], b6
	add	Y, INCY, Y
	FMUL	ALPHA_R, a2, t2
	LDF	[X + 0 * SIZE], a7
	ADD1	c3, t3, c3
	LDF	[Y + 0 * SIZE], b7
	FMUL	ALPHA_R, a3, t3
	LDF	[X + 1 * SIZE], a8
	add	X, INCX, X
	FADD	c4, t4, c4
	LDF	[Y + 1 * SIZE], b8
	add	Y, INCY, Y
	FMUL	ALPHA_R, a4, t4

	STF	c1, [YY + 0 * SIZE]
	add	I, -1, I
	STF	c2, [YY + 1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY + 0 * SIZE]
	cmp	I, 0
	STF	c4, [YY + 1 * SIZE]

	bg,pt	%icc, .LL51
	add	YY, INCY, YY		! delay slot

/* Strided pipeline drain: finish and store the last 4 elements. */
.LL52:
	FADD	b1, t1, c1
	FMUL	ALPHA_I, a2, t1
	ADD2	b2, t2, c2
	FMUL	ALPHA_I, a1, t2

	FADD	b3, t3, c3
	FMUL	ALPHA_I, a4, t3
	ADD2	b4, t4, c4
	FMUL	ALPHA_I, a3, t4

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2
	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	STF	c1, [YY + 0 * SIZE]
	STF	c2, [YY + 1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY + 0 * SIZE]
	STF	c4, [YY + 1 * SIZE]
	add	YY, INCY, YY

	FADD	b5, t1, c1
	FMUL	ALPHA_I, a6, t1
	ADD2	b6, t2, c2
	FMUL	ALPHA_I, a5, t2
	FADD	b7, t3, c3
	FMUL	ALPHA_I, a8, t3
	ADD2	b8, t4, c4
	FMUL	ALPHA_I, a7, t4

	ADD1	c1, t1, c1
	FADD	c2, t2, c2
	ADD1	c3, t3, c3
	FADD	c4, t4, c4

	STF	c1, [YY + 0 * SIZE]
	STF	c2, [YY + 1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY + 0 * SIZE]
	STF	c4, [YY + 1 * SIZE]
	add	YY, INCY, YY

/* Strided remainder: N mod 4 elements, one at a time. */
.LL55:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 0 * SIZE], b1
	LDF	[Y + 1 * SIZE], b2

	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_I, a2, t3
	FMUL	ALPHA_I, a1, t4
	FADD	b1, t1, b1
	ADD2	b2, t2, b2
	ADD1	b1, t3, c1
	FADD	b2, t4, c2

	add	I, -1, I
	cmp	I, 0
	STF	c1, [Y + 0 * SIZE]
	STF	c2, [Y + 1 * SIZE]

	add	Y, INCY, Y
	bg,pt	%icc, .LL56
	add	X, INCX, X		! delay slot

.LL59:
	return	%i7 + 8
	clr	%o0			! delay slot

	EPILOGUE