/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * x86-64 SSE kernel, AT&T/GAS syntax (SysV and Windows ABIs).
 *
 * Looks like a single-precision complex SYMV/HEMV-style kernel,
 *   y := alpha * A * x + y,
 * operating on column pairs of A with x and y staged into an aligned
 * BUFFER (conjugate-swapped copies of alpha*x are precomputed there).
 * NOTE(review): exact BLAS entry point (CHEMV vs CSYMV, upper/lower)
 * is not visible from this file alone -- confirm against the kernel
 * Makefile that selects it.
 */

#define ASSEMBLER
#include "common.h"

/* Per-CPU prefetch instruction and look-ahead distance (in floats). */
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
/* On Opteron, movlpd is preferred over movsd for 64-bit loads/stores. */
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 14)
#endif

/*
 * Argument mapping.  SysV passes m, n, a, lda, x, incx in registers
 * and y, incy, buffer on the stack; Win64 passes everything beyond
 * the first four on the stack (alpha comes in xmm2/xmm3 there).
 */
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG3
#define LDA	  ARG4
#define X	  ARG5
#define INCX	  ARG6

#else

#define STACKSIZE	256

#define OLD_A		 40 + STACKSIZE(%rsp)
#define OLD_LDA		 48 + STACKSIZE(%rsp)
#define OLD_X		 56 + STACKSIZE(%rsp)
#define OLD_INCX	 64 + STACKSIZE(%rsp)
#define OLD_Y		 72 + STACKSIZE(%rsp)
#define OLD_INCY	 80 + STACKSIZE(%rsp)
#define OLD_BUFFER	 88 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG4
#define LDA	  ARG3
#define X	  %rdi
#define INCX	  %rsi
#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15
/* After staging, BUFFER holds the transformed x; the X register is
 * reused as the pointer to the (possibly buffered) contiguous y.     */
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA_R	%xmm0
#define ALPHA_I	%xmm1

/* Column-pair accumulators (alias xmm0-xmm3: alpha is dead by then). */
#define xsum1	%xmm0
#define xsum2	%xmm1
#define xsum3	%xmm2
#define xsum4	%xmm3

/* Broadcast scalars of alpha*x for the current column pair:
 * atemp1/atemp2 belong to column IS, atemp3/atemp4 to column IS+1.   */
#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define a1	%xmm10
#define a2	%xmm11

#define a3	%xmm12
#define yy1	%xmm13
#define xt1	%xmm14
#define xt2	%xmm15

#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c;movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c;movhpd	a##b, c
#endif

	PROLOGUE
	PROFCODE

	/* Save callee-saved GPRs (SysV: rbx, rbp, r12-r15). */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Win64 additionally treats rdi, rsi and xmm6-xmm15 as
	 * callee-saved; restored at .L999. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,    A
	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	/* Win64 passes alpha_r/alpha_i in xmm2/xmm3. */
	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert element strides to byte strides (complex elements). */
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	testq	M, M
	jle	.L999

	/*
	 * NOTE(review): IS (%r15) was read below without ever being
	 * initialized (it still held the caller's saved %r15), and N
	 * (ARG2) was otherwise unused.  IS = M - N matches the
	 * "A += IS*LDA, process columns IS .. M-1" structure of the
	 * main loop, so IS is initialized from N here -- confirm
	 * against the reference implementation.
	 */
	movq	N, IS
	negq	IS
	addq	M, IS

	/* A += IS * LDA: skip the columns this call does not own. */
	movq	IS, TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

	/* Build sign mask xmm2 = (0, 0x80000000, 0, 0x80000000):
	 * flips the imaginary lane of each complex float. */
	pcmpeqb	%xmm3, %xmm3
	xorpd	%xmm2, %xmm2
	pslld	$31, %xmm3
	unpckhps %xmm3, %xmm2

	/* Broadcast alpha and interleave so that
	 *   ALPHA_I = (ai, ar, ai, ar)
	 *   ALPHA_R = (ar, -ai, ar, -ai). */
	shufps	$0, ALPHA_R, ALPHA_R
	shufps	$0, ALPHA_I, ALPHA_I
	movaps	ALPHA_I, %xmm3

	unpcklps ALPHA_R, ALPHA_I
	unpcklps %xmm3, ALPHA_R
	pxor	%xmm2, ALPHA_R

	movq	BUFFER, XX

	/*
	 * Stage x into BUFFER: for every pair of complex elements,
	 * store alpha*x (re/im interleaved form) at +4*SIZE and a
	 * re/im-swapped, sign-flipped (conjugate-style) copy at
	 * +0*SIZE -- 4 floats of buffer per complex element.
	 */
	movq	M, %rax
	sarq	$2, %rax		/* 4 elements per iteration */
	jle	.L02
	ALIGN_3

.L01:
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm6
	addq	INCX, X

	movsldup %xmm4, %xmm3		/* duplicate real parts  */
	movshdup %xmm4, %xmm4		/* duplicate imag parts  */
	movsldup %xmm6, %xmm5
	movshdup %xmm6, %xmm6

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4
	mulps	ALPHA_I, %xmm5
	mulps	ALPHA_R, %xmm6

	addps	%xmm4, %xmm3		/* xmm3 = alpha * x[0..1] */
	addps	%xmm6, %xmm5		/* xmm5 = alpha * x[2..3] */

	movaps	%xmm3,  4 * SIZE(XX)
	movaps	%xmm5, 12 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3	/* swap re/im in each lane pair */
	shufps	$0xb1, %xmm5, %xmm5

	pxor	%xmm2, %xmm3		/* flip sign of odd lanes */
	pxor	%xmm2, %xmm5

	movaps	%xmm3,  0 * SIZE(XX)
	movaps	%xmm5,  8 * SIZE(XX)

	subq	$-16 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:	/* remainder: 2 elements */
	testq	$2, M
	jle	.L03

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movaps	%xmm3, 4 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movaps	%xmm3, 0 * SIZE(XX)

	subq	$-8 * SIZE, XX
	ALIGN_3

.L03:	/* remainder: 1 element */
	testq	$1, M
	jle	.L05

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movlps	%xmm3, 2 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movlps	%xmm3, 0 * SIZE(XX)

	subq	$-4 * SIZE, XX
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	/* Round XX up to the next 512-byte boundary: start of the
	 * (optional) contiguous y staging area. */
	addq	$512, XX
	andq	$-512, XX

	/* If y is already contiguous (incy == one complex element),
	 * work on it in place; otherwise gather it into the buffer. */
	cmpq	$2 * SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm1
	addq	INCY, YY

	movaps	%xmm0, 0 * SIZE(XX)
	movaps	%xmm1, 8 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movlps	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	/* Main loop over column pairs (IS, IS+1), while IS+2 <= M. */
	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jg	.L20
	ALIGN_3

.L11:
	movq	A,  A1			/* A1 -> column IS     */
	leaq	(A, LDA, 1), A2		/* A2 -> column IS+1   */
	leaq	(A, LDA, 2), A		/* A  -> next pair     */

	leaq	(, IS, 4), I		/* 4 buffer floats per element */

	/* Gather the 4-float buffer records of x[IS] and x[IS+1]. */
	movsd	0 * SIZE(NEW_X, I, SIZE), atemp2
	movhps	4 * SIZE(NEW_X, I, SIZE), atemp2
	movsd	2 * SIZE(NEW_X, I, SIZE), atemp4
	movhps	6 * SIZE(NEW_X, I, SIZE), atemp4

	/* Broadcast scalar components:
	 *   atemp1/atemp2 from x[IS], atemp3/atemp4 from x[IS+1]. */
	pshufd	$0xcc, atemp2, atemp1
	pshufd	$0x99, atemp2, atemp2
	pshufd	$0xcc, atemp4, atemp3
	pshufd	$0x99, atemp4, atemp4

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2
	pxor	xsum3, xsum3
	pxor	xsum4, xsum4

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS,  I
	sarq	$2, I			/* 4 rows per iteration */
	jle	.L15
	ALIGN_3

.L12:
	/*
	 * NOTE(review): this loop body was a bare HALT placeholder in
	 * the incoming file.  It has been reconstructed as two copies
	 * of the two-row chunk visible at .L15 (the pointer advances
	 * below -- XX += 16*SIZE, YY/A1/A2 += 8*SIZE -- fix the
	 * per-iteration work at exactly 2x that chunk).  Confirm
	 * against the reference kernel, which may interleave the
	 * schedule differently for speed.
	 */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE(A1)
	PREFETCH	PREFETCHSIZE(A2)
#endif
	/* rows IS_row+0 .. IS_row+1 */
	movsd	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movaps	0 * SIZE(XX), xtemp1
	movaps	4 * SIZE(XX), xtemp2

	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum1		/* dot(x, col IS) part   */
	addps	xt2, xsum2

	pshufd	$0xb1, a1, xt2
	mulps	atemp1, a1
	mulps	atemp2, xt2
	addps	a1,  yy1		/* y += A[:,IS] * a*x[IS] */
	addps	xt2, yy1

	movsd	0 * SIZE(A2), a1
	movhps	2 * SIZE(A2), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum3		/* dot(x, col IS+1) part */
	addps	xt2, xsum4

	pshufd	$0xb1, a1, xt2
	mulps	atemp3, a1
	mulps	atemp4, xt2
	addps	a1,  yy1		/* y += A[:,IS+1] * a*x[IS+1] */
	addps	xt2, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	/* rows IS_row+2 .. IS_row+3 */
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	 8 * SIZE(XX), xtemp1
	movaps	12 * SIZE(XX), xtemp2

	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum1
	addps	xt2, xsum2

	pshufd	$0xb1, a1, xt2
	mulps	atemp1, a1
	mulps	atemp2, xt2
	addps	a1,  yy1
	addps	xt2, yy1

	movsd	4 * SIZE(A2), a1
	movhps	6 * SIZE(A2), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum3
	addps	xt2, xsum4

	pshufd	$0xb1, a1, xt2
	mulps	atemp3, a1
	mulps	atemp4, xt2
	addps	a1,  yy1
	addps	xt2, yy1

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)

	subq	 $-16 * SIZE, XX
	addq	 $  8 * SIZE, YY
	addq	 $  8 * SIZE, A1
	addq	 $  8 * SIZE, A2

	decq	 I
	jg	.L12
	ALIGN_3

.L15:	/* remaining two rows above the diagonal block */
	testq	$2, IS
	jle	.L18

	movsd	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movaps	0 * SIZE(XX), xtemp1
	movaps	4 * SIZE(XX), xtemp2

	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum1
	addps	xt2, xsum2

	pshufd	$0xb1, a1, xt2
	mulps	atemp1, a1
	mulps	atemp2, xt2
	addps	a1,  yy1
	addps	xt2, yy1

	movsd	0 * SIZE(A2), a1
	movhps	2 * SIZE(A2), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum3
	addps	xt2, xsum4

	/*
	 * FIX(review): this column (IS+1) previously multiplied by
	 * atemp1/atemp2 -- the x[IS] scalars -- leaving atemp3/atemp4
	 * computed at .L11 dead.  Column IS+1 must use x[IS+1].
	 */
	pshufd	$0xb1, a1, xt2
	mulps	atemp3, a1
	mulps	atemp4, xt2
	addps	a1,  yy1
	addps	xt2, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L18:
	/* 2x2 diagonal block + reduction of the column dot products. */
	leaq	(, IS, 4), I

	movaps	0 * SIZE(NEW_X, I, SIZE), atemp1
	movaps	4 * SIZE(NEW_X, I, SIZE), atemp2

	movlps	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	/* Row IS of the diagonal block: A[IS][IS], A[IS][IS+1]. */
	movsd	0 * SIZE(A1), a1
	movhps	0 * SIZE(A2), a1

	movaps	a1, a2
	mulps	atemp1, a1
	mulps	atemp2, a2
	addps	a1, xsum1
	addps	a2, xsum2

	/* Column IS+1: A[IS][IS+1], A[IS+1][IS+1]. */
	movsd	0 * SIZE(A2), a1
	movhps	2 * SIZE(A2), a1

	movaps	a1, a2
	mulps	atemp1, a1
	mulps	atemp2, a2
	addps	a1, xsum3
	addps	a2, xsum4

	/* Horizontal reduction of the four accumulators into one
	 * 2-complex update for y[IS], y[IS+1]. */
	haddps	xsum2, xsum1
	haddps	xsum4, xsum3

	haddps	xsum3, xsum1
	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jle	.L11
	ALIGN_3

.L20:
	testq	$1, M
	jle	.L990

	/*
	 * NOTE(review): the body that should handle a final single
	 * column (M odd) is missing from the incoming file -- both
	 * branches of the test above reach .L990, so the last row is
	 * never processed.  TODO: restore the scalar tail from the
	 * reference implementation.
	 */

.L990:
	/* Scatter the buffered y back if the caller's incy != 2*SIZE. */
	cmpq	$2 * SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	movaps	0 * SIZE(NEW_Y), %xmm0
	movaps	4 * SIZE(NEW_Y), %xmm1

	movlps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movlps	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movlps	0 * SIZE(NEW_Y), %xmm0
	addq	$2 * SIZE, NEW_Y

	movlps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	/*
	 * FIX(review): the prologue saves rdi, rsi and xmm6-xmm15
	 * under WINDOWS_ABI, but the incoming epilogue never restored
	 * them -- a Win64 ABI violation.  Restore them symmetrically.
	 */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE