/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * In-place scaling of a complex vector by a complex scalar (SPARC).
 *
 * For each of N elements (re, im) stored as two consecutive values at X:
 *
 *     re' = ALPHA_R * re - ALPHA_I * im
 *     im' = ALPHA_R * im + ALPHA_I * re
 *
 * If both ALPHA_R and ALPHA_I compare equal to zero, the elements are
 * overwritten with FZERO instead of being multiplied.
 *
 * Register roles (see the #defines below):
 *     N     element count
 *     X     address of the current element
 *     INCX  stride; shifted left by ZBASE_SHIFT on entry and from then on
 *           added directly to X (presumably an element stride converted
 *           to a byte stride -- ZBASE_SHIFT comes from common.h)
 *     I     loop counter
 *     XX    store pointer used by the strided multiply loop
 *
 * Every exit path clears %o0 in the delay slot of "return".
 *
 * Syntax: GNU as for SPARC, run through the C preprocessor.  PROLOGUE,
 * SAVESP, EPILOGUE, LDF, STF, FMUL, FADD, FSUB, FMOV, FCMP, FCLR, SIZE,
 * ZBASE_SHIFT and STACK_START are all supplied by common.h.
 *
 * SPARC branches have a delay slot: the instruction that follows a
 * branch is executed before the branch target.  Delay-slot instructions
 * are marked "(delay slot)" below; do not reorder them.
 *
 * NOTE(review): no guard for N <= 0 is visible in this file, and
 * "and N, 3" can be non-zero for a negative N -- presumably callers
 * never pass N < 1; confirm against the caller.
 */

#define ASSEMBLER
#include "common.h"

#define N	%i0

/* X and INCX live in different registers depending on precision / ABI. */
#if defined(DOUBLE) && !defined(__64BIT__)
#define X	%i3
#define INCX	%i4
#else
#define X	%i5
#define INCX	%i3
#endif

#define I	%i1
#define XX	%i2

/*
 * Floating-point register map.
 *   c1..c8   loaded vector values
 *   t1..t8   partial products
 *   s1..s8   results waiting to be stored
 *   c9..c16  extra temporaries; they share registers with s1..s8
 */
#ifdef DOUBLE
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define c5	%f8
#define c6	%f10
#define c7	%f12
#define c8	%f14

#define t1	%f16
#define t2	%f18
#define t3	%f20
#define t4	%f22
#define t5	%f24
#define t6	%f26
#define t7	%f28
#define t8	%f30

#define c9	%f32
#define c10	%f34
#define c11	%f36
#define c12	%f38
#define c13	%f40
#define c14	%f42
#define c15	%f44
#define c16	%f46

#define s1	%f32
#define s2	%f34
#define s3	%f36
#define s4	%f38
#define s5	%f40
#define s6	%f42
#define s7	%f44
#define s8	%f46

#define FZERO	%f48
#define ALPHA_R	%f50
#define ALPHA_I	%f52
#else
#define c1	%f0
#define c2	%f1
#define c3	%f2
#define c4	%f3
#define c5	%f4
#define c6	%f5
#define c7	%f6
#define c8	%f7

#define c9	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define s1	%f8
#define s2	%f9
#define s3	%f10
#define s4	%f11
#define s5	%f12
#define s6	%f13
#define s7	%f14
#define s8	%f15

#define t1	%f16
#define t2	%f17
#define t3	%f18
#define t4	%f19
#define t5	%f20
#define t6	%f21
#define t7	%f22
#define t8	%f23

#define FZERO	%f24
#define ALPHA_R	%f25
#define ALPHA_I	%f26
#endif

/* Prefetch distance, in units of SIZE, used by the unit-stride loops. */
#define PREFETCHSIZE 128

	PROLOGUE
	SAVESP

/*
 * Fetch the scalar and the remaining arguments.
 *
 * 32-bit: integer registers are spilled to the stack and the same
 * slots are then reloaded with LDF into ALPHA_R / ALPHA_I; X and/or
 * INCX are read from further stack slots.
 * 64-bit: the scalar is copied from FP registers and only INCX is
 * read from the stack.
 */
#ifndef __64BIT__
#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]

	ld	[%sp+ STACK_START + 32], X
	ld	[%sp+ STACK_START + 36], INCX
#else
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 24]
	ld	[%sp+ STACK_START + 28], INCX
#endif
	LDF	[%sp + STACK_START + 16], ALPHA_R
	LDF	[%sp + STACK_START + 24], ALPHA_I
#else
	ldx	[%sp + STACK_START + 56], INCX
#ifdef DOUBLE
	FMOV	%f6, ALPHA_R
	FMOV	%f8, ALPHA_I
#else
	FMOV	%f7, ALPHA_R
	FMOV	%f9, ALPHA_I
#endif
#endif

/*
 * FCLR is defined outside this file.  Its argument is presumably the
 * encoded number of the FZERO register (%f48 for DOUBLE, %f24
 * otherwise) -- TODO confirm against common.h.
 */
#ifdef DOUBLE
	FCLR(17)
#else
	FCLR(24)
#endif

/* Dispatch: any non-zero part of alpha selects the multiply paths. */
	FCMP	ALPHA_R, FZERO
	fbne	.LL100
	sll	INCX, ZBASE_SHIFT, INCX		/* (delay slot) scale the stride; runs on both paths */

	FCMP	ALPHA_I, FZERO
	fbne	.LL100
	nop					/* (delay slot) */

/* ------------------------------------------------------------------ */
/* alpha == 0: fill with FZERO.                                        */
/* ------------------------------------------------------------------ */
	cmp	INCX, 2 * SIZE			/* stride of exactly one element? */
	bne	.LL50
	nop					/* (delay slot) */
	sra	N, 2, I				/* I = number of 4-element blocks */
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop					/* (delay slot) */

/* Unit stride, 4 elements (8 values) per iteration. */
.LL11:
	prefetch [X + PREFETCHSIZE * SIZE], 0

	STF	FZERO, [X +  0 * SIZE]
	add	I, -1, I
	STF	FZERO, [X +  1 * SIZE]
	cmp	I, 0
	STF	FZERO, [X +  2 * SIZE]
	STF	FZERO, [X +  3 * SIZE]
	STF	FZERO, [X +  4 * SIZE]
	STF	FZERO, [X +  5 * SIZE]
	add	X, 8 * SIZE, X			/* advance early; last two stores use negative offsets */
	STF	FZERO, [X -  2 * SIZE]
	bg,pt	%icc, .LL11
	STF	FZERO, [X -  1 * SIZE]		/* (delay slot) */

/* Unit stride, remaining N & 3 elements. */
.LL15:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL19
	nop					/* (delay slot, annulled when not taken) */

.LL16:
	STF	FZERO, [X +  0 * SIZE]
	STF	FZERO, [X +  1 * SIZE]
	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X			/* (delay slot) */

.LL19:
	return	%i7 + 8
	clr	%o0				/* (delay slot) result = 0 */

/* General stride, 4 elements per iteration. */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop					/* (delay slot) */

.LL51:
	STF	FZERO, [X +  0 * SIZE]
	add	I, -1, I
	STF	FZERO, [X +  1 * SIZE]
	add	X, INCX, X
	STF	FZERO, [X +  0 * SIZE]
	cmp	I, 0
	STF	FZERO, [X +  1 * SIZE]
	add	X, INCX, X
	STF	FZERO, [X +  0 * SIZE]
	STF	FZERO, [X +  1 * SIZE]
	add	X, INCX, X
	STF	FZERO, [X +  0 * SIZE]
	STF	FZERO, [X +  1 * SIZE]
	bg,pt	%icc, .LL51
	add	X, INCX, X			/* (delay slot) */

/* General stride, remaining N & 3 elements. */
.LL55:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL59
	nop					/* (delay slot, annulled when not taken) */

.LL56:
	STF	FZERO, [X +  0 * SIZE]
	add	I, -1, I
	STF	FZERO, [X +  1 * SIZE]
	cmp	I, 0
	bg,pt	%icc, .LL56
	add	X, INCX, X			/* (delay slot) */

.LL59:
	return	%i7 + 8
	clr	%o0				/* (delay slot) result = 0 */

/* ------------------------------------------------------------------ */
/* alpha != 0: complex multiply.                                       */
/* ------------------------------------------------------------------ */
.LL100:
	cmp	INCX, 2 * SIZE			/* stride of exactly one element? */
	bne	.LL150
	sra	N, 2, I				/* (delay slot) I = number of 4-element blocks */

	cmp	I, 0
	ble,pn	%icc, .LL115
	nop					/* (delay slot) */

/*
 * Unit stride, software pipelined: loads and multiplies for the next
 * block are interleaved with the stores of the current block.
 *
 * NOTE(review): X+8..X+11 are loaded here before the trip count is
 * tested, and the loop body loads as far as X+19 while storing
 * X+0..X+7.  The last full block is therefore always followed by a
 * read of 4 further values, which lie beyond the vector when N is a
 * multiple of 4.  Those values are never stored back.
 */
	LDF	[X +  0 * SIZE], c1
	LDF	[X +  1 * SIZE], c2
	LDF	[X +  2 * SIZE], c3
	LDF	[X +  3 * SIZE], c4
	LDF	[X +  4 * SIZE], c5
	LDF	[X +  5 * SIZE], c6
	LDF	[X +  6 * SIZE], c7
	LDF	[X +  7 * SIZE], c8

	FMUL	ALPHA_R, c1, t1
	FMUL	ALPHA_I, c2, t3

	FMUL	ALPHA_I, c1, t2
	LDF	[X +  8 * SIZE], c1
	FMUL	ALPHA_R, c2, t4
	LDF	[X +  9 * SIZE], c2

	FMUL	ALPHA_R, c3, t5
	deccc	I				/* sets %icc for the ble below */
	FMUL	ALPHA_I, c4, t7
	FSUB	t1, t3, s1

	FMUL	ALPHA_I, c3, t6
	LDF	[X + 10 * SIZE], c3
	FMUL	ALPHA_R, c4, t8
	LDF	[X + 11 * SIZE], c4
	FADD	t4, t2, s2

	ble,pn	%icc, .LL112			/* only one block: go straight to the drain code */
	nop					/* (delay slot) */

.LL111:
	prefetch [X + PREFETCHSIZE * SIZE], 0

	FMUL	ALPHA_R, c5, t1
	FMUL	ALPHA_I, c6, t3
	FSUB	t5, t7, s3
	STF	s1, [X +  0 * SIZE]

	FMUL	ALPHA_I, c5, t2
	LDF	[X + 12 * SIZE], c5
	FMUL	ALPHA_R, c6, t4
	LDF	[X + 13 * SIZE], c6

	FADD	t8, t6, s4
	STF	s2, [X +  1 * SIZE]

	FMUL	ALPHA_R, c7, t5
	FMUL	ALPHA_I, c8, t7
	FSUB	t1, t3, s5
	STF	s3, [X +  2 * SIZE]

	FMUL	ALPHA_I, c7, t6
	LDF	[X + 14 * SIZE], c7
	FMUL	ALPHA_R, c8, t8
	LDF	[X + 15 * SIZE], c8

	FADD	t4, t2, s6
	STF	s4, [X +  3 * SIZE]

	FMUL	ALPHA_R, c1, t1
	FMUL	ALPHA_I, c2, t3
	FSUB	t5, t7, s7
	STF	s5, [X +  4 * SIZE]

	FMUL	ALPHA_I, c1, t2
	LDF	[X + 16 * SIZE], c1
	FMUL	ALPHA_R, c2, t4
	LDF	[X + 17 * SIZE], c2

	FADD	t8, t6, s8
	STF	s6, [X +  5 * SIZE]

	FMUL	ALPHA_R, c3, t5
	deccc	I				/* sets %icc for the bg below */
	FMUL	ALPHA_I, c4, t7
	FSUB	t1, t3, s1
	STF	s7, [X +  6 * SIZE]

	FMUL	ALPHA_I, c3, t6
	LDF	[X + 18 * SIZE], c3
	FMUL	ALPHA_R, c4, t8
	LDF	[X + 19 * SIZE], c4

	FADD	t4, t2, s2
	STF	s8, [X +  7 * SIZE]

	bg,pt	%icc, .LL111
	add	X, 8 * SIZE, X			/* (delay slot) */


/* Drain the pipeline: finish and store the last full block, no loads. */
.LL112:
	FMUL	ALPHA_R, c5, t1
	FMUL	ALPHA_I, c6, t3
	FSUB	t5, t7, s3
	STF	s1, [X +  0 * SIZE]

	FMUL	ALPHA_I, c5, t2
	FMUL	ALPHA_R, c6, t4
	FADD	t8, t6, s4
	STF	s2, [X +  1 * SIZE]

	FMUL	ALPHA_R, c7, t5
	FMUL	ALPHA_I, c8, t7
	FSUB	t1, t3, s5
	STF	s3, [X +  2 * SIZE]

	FMUL	ALPHA_I, c7, t6
	FMUL	ALPHA_R, c8, t8
	FADD	t4, t2, s6
	STF	s4, [X +  3 * SIZE]

	FSUB	t5, t7, s7
	FADD	t8, t6, s8

	STF	s5, [X +  4 * SIZE]
	STF	s6, [X +  5 * SIZE]
	STF	s7, [X +  6 * SIZE]
	STF	s8, [X +  7 * SIZE]
	add	X, 8 * SIZE, X

/* Unit stride, remaining N & 3 elements, one per iteration. */
.LL115:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL119
	nop					/* (delay slot, annulled when not taken) */

.LL116:
	LDF	[X +  0 * SIZE], c1		/* re */
	LDF	[X +  1 * SIZE], c2		/* im */

	FMUL	ALPHA_R, c1, c3			/* c3 = ALPHA_R * re */
	FMUL	ALPHA_I, c1, c4			/* c4 = ALPHA_I * re */
	FMUL	ALPHA_I, c2, c1			/* c1 = ALPHA_I * im */
	FMUL	ALPHA_R, c2, c2			/* c2 = ALPHA_R * im */

	FSUB	c3, c1, c1			/* re' = ALPHA_R*re - ALPHA_I*im */
	FADD	c2, c4, c2			/* im' = ALPHA_R*im + ALPHA_I*re */

	STF	c1, [X +  0 * SIZE]
	STF	c2, [X +  1 * SIZE]

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL116
	add	X, 2 * SIZE, X			/* (delay slot) */

.LL119:
	return	%i7 + 8
	clr	%o0				/* (delay slot) result = 0 */

/*
 * General stride, 4 elements per iteration.  X runs ahead as the load
 * pointer; XX trails it as the store pointer.
 */
.LL150:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL155
	mov	X, XX				/* (delay slot) XX = X; runs on both paths */

.LL151:
	LDF	[X +  0 * SIZE], c1
	LDF	[X +  1 * SIZE], c2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], c3
	FMUL	ALPHA_R, c1, c9
	LDF	[X +  1 * SIZE], c4
	FMUL	ALPHA_I, c1, c10
	add	X, INCX, X
	LDF	[X +  0 * SIZE], c5
	FMUL	ALPHA_I, c2, c1
	LDF	[X +  1 * SIZE], c6
	FMUL	ALPHA_R, c2, c2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], c7
	FMUL	ALPHA_R, c3, c11
	LDF	[X +  1 * SIZE], c8
	FMUL	ALPHA_I, c3, c12
	add	X, INCX, X

	FMUL	ALPHA_I, c4, c3
	FMUL	ALPHA_R, c4, c4

	FMUL	ALPHA_R, c5, c13
	FMUL	ALPHA_I, c5, c14
	FMUL	ALPHA_I, c6, c5
	FMUL	ALPHA_R, c6, c6

	FMUL	ALPHA_R, c7, c15
	FSUB	c9,  c1, c1
	FMUL	ALPHA_I, c7, c16
	FADD	c2, c10, c2
	FMUL	ALPHA_I, c8, c7
	FSUB	c11, c3, c3
	FMUL	ALPHA_R, c8, c8
	FADD	c4, c12, c4

	STF	c1, [XX + 0 * SIZE]
	FSUB	c13, c5, c5
	add	I, -1, I
	STF	c2, [XX + 1 * SIZE]
	FADD	c6, c14, c6
	add	XX, INCX, XX
	STF	c3, [XX + 0 * SIZE]
	FSUB	c15, c7, c7
	cmp	I, 0
	STF	c4, [XX + 1 * SIZE]
	FADD	c8, c16, c8
	add	XX, INCX, XX
	STF	c5, [XX + 0 * SIZE]
	STF	c6, [XX + 1 * SIZE]
	add	XX, INCX, XX
	STF	c7, [XX + 0 * SIZE]
	STF	c8, [XX + 1 * SIZE]
	bg,pt	%icc, .LL151
	add	XX, INCX, XX			/* (delay slot) */

/* General stride, remaining N & 3 elements, one per iteration. */
.LL155:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL159
	nop					/* (delay slot, annulled when not taken) */

.LL156:
	LDF	[X +  0 * SIZE], c1		/* re */
	LDF	[X +  1 * SIZE], c2		/* im */

	FMUL	ALPHA_R, c1, c3			/* c3 = ALPHA_R * re */
	FMUL	ALPHA_I, c1, c4			/* c4 = ALPHA_I * re */
	FMUL	ALPHA_I, c2, c1			/* c1 = ALPHA_I * im */
	FMUL	ALPHA_R, c2, c2			/* c2 = ALPHA_R * im */

	FSUB	c3, c1, c1			/* re' = ALPHA_R*re - ALPHA_I*im */
	FADD	c2, c4, c2			/* im' = ALPHA_R*im + ALPHA_I*re */

	STF	c1, [X +  0 * SIZE]
	STF	c2, [X +  1 * SIZE]

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL156
	add	X, INCX, X			/* (delay slot) */

.LL159:
	return	%i7 + 8
	clr	%o0				/* (delay slot) result = 0 */


	EPILOGUE