/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

/*
 * Single-precision complex GEMV kernel (non-transposed A), IA-32 / SSE.
 *
 * Computes, column by column,  buffer += (alpha * x[j]) * A(:,j)  for the
 * N columns of A, accumulating into a contiguous scratch BUFFER, and then
 * adds the buffer back into the strided output vector Y
 * (i.e. overall: y += alpha * A * x; CONJ/XCONJ select conjugation
 * variants via the SUBPS macro and sign-mask xors below).
 *
 * cdecl stack arguments (offsets under STACKSIZE after the 4 pushes):
 *   M, N          problem dimensions (rows, columns)
 *   ALPHA_R/I     complex scalar alpha
 *   A, LDA        column-major matrix and leading dimension
 *   X, INCX       input vector and stride (complex elements)
 *   Y, INCY       output vector and stride (complex elements)
 *   BUFFER        caller-provided contiguous scratch, 16-byte aligned
 *                 (movaps is used on it) -- assumed from the aligned
 *                 stores at .L01; confirm against caller.
 */

#define ASSEMBLER
#include "common.h"

/* Some targets redefine movsd below; clear any prior definition first. */
#ifdef movsd
#undef movsd
#endif

/* Per-microarchitecture prefetch instruction and distance tuning.
 * On PENTIUM3/OPTERON, 8-byte loads are done with movlps instead of
 * movsd (the SSE2 instruction is unavailable or slower there). */
#ifdef PENTIUM3
#ifdef HAVE_SSE
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif
#define movsd	movlps
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 3)
#define movsd	movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif

/* Bytes pushed by the prologue (4 x pushl): argument offsets are
 * relative to %esp after those pushes. */
#define STACKSIZE	16

#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		20 + STACKSIZE(%esp)
#define A		24 + STACKSIZE(%esp)
#define STACK_LDA	28 + STACKSIZE(%esp)
#define STACK_X		32 + STACKSIZE(%esp)
#define STACK_INCX	36 + STACKSIZE(%esp)
#define Y		40 + STACKSIZE(%esp)
#define STACK_INCY	44 + STACKSIZE(%esp)
#define BUFFER		48 + STACKSIZE(%esp)

/* Register roles.  INCY aliases J (%ebx): J is the column counter in the
 * accumulation phase and is free again by the copy-back phase (.L990). */
#define I	%eax
#define J	%ebx

#define INCX	%ecx
#define INCY	J

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

#undef SUBPS

/* Complex multiply sign: with neither or both of CONJ/XCONJ the cross
 * term is subtracted; with exactly one of them it is added. */
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPS	subps
#else
#define SUBPS	addps
#endif

	PROLOGUE

	/* Save cdecl callee-saved registers (%ebp %edi %esi %ebx). */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Convert element strides to byte strides (complex elements). */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, LDA

	/* Bias A by +32*SIZE so the inner loops can use the shorter
	 * negative displacements (-32*SIZE ... ). */
	subl	$-32 * SIZE, A

	/* Quick exit for empty problems. */
	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	movl	N,  J

	xorps	%xmm7, %xmm7

	/* Zero ceil((M+8)/8)*8 complex elements of BUFFER, 16 floats per
	 * iteration (the extra slack covers the unrolled tails). */
	movl	M,   %eax
	addl	$8,  %eax
	sarl	$3,  %eax
	ALIGN_3

.L01:
	movaps	%xmm7,  0 * SIZE(Y1)
	movaps	%xmm7,  4 * SIZE(Y1)
	movaps	%xmm7,  8 * SIZE(Y1)
	movaps	%xmm7, 12 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

/* Column loop: for each column j, buffer += (alpha * x[j]) * A(:,j). */
.L10:
	movl	BUFFER, Y1
	addl	$32 * SIZE, Y1		/* same +32*SIZE bias as A */

	movl	A,  A1			/* A1 = current column */
	addl	LDA, A			/* advance A to next column */

	movsd	(X), %xmm7		/* xmm7 = x[j] (re, im) */
	addl	INCX, X

	/* Build a sign mask {0.0, -0.0, 0.0, -0.0} in xmm5: a 0x80000000
	 * pattern in every odd dword, used to negate imaginary parts. */
#ifdef HAVE_SSE2
	pcmpeqb	%xmm5, %xmm5
	psllq	$63, %xmm5
#else
	subl	$8, %esp
	movl	$0x00000000, 0(%esp)
	movl	$0x80000000, 4(%esp)
	movlps	(%esp), %xmm5
	addl	$8, %esp
	movlhps	%xmm5, %xmm5
#endif

	/* Broadcast: xmm6 = Re(x[j]) in all lanes, xmm7 = Im(x[j]). */
#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm7, %xmm6
	pshufd	$0x55, %xmm7, %xmm7
#else
	movaps	%xmm7, %xmm6
	shufps	$0x00, %xmm6, %xmm6
	shufps	$0x55, %xmm7, %xmm7
#endif

	/* xmm3 = {Re(alpha), Im(alpha)} duplicated to both halves. */
#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm3
#else
	movsd	ALPHA_R, %xmm3

	movlhps	%xmm3, %xmm3
#endif

	/* xmm4 = alpha with re/im lanes swapped (for the cross term). */
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm3, %xmm4
#else
	movaps	%xmm3, %xmm4
	shufps	$0xb1, %xmm4, %xmm4
#endif


	/* Compute t = alpha * x[j] (conjugating x when XCONJ is set). */
#ifndef XCONJ
	xorps	%xmm5, %xmm7
#else
	xorps	%xmm5, %xmm6
#endif

	mulps	%xmm3, %xmm6
	mulps	%xmm4, %xmm7

#ifndef XCONJ
	subps	%xmm7, %xmm6
#else
	addps	%xmm7, %xmm6
#endif

	/* Re-broadcast the product: xmm6 = Re(t) lanes, xmm7 = Im(t). */
#ifdef HAVE_SSE2
	pshufd	$0x55, %xmm6, %xmm7
	pshufd	$0x00, %xmm6, %xmm6
#else
	movaps	%xmm6, %xmm7
	shufps	$0x55, %xmm7, %xmm7
	shufps	$0x00, %xmm6, %xmm6
#endif

	/* Apply the conjugation sign pattern for the accumulation. */
#ifndef CONJ
	xorps	%xmm5, %xmm7
#else
	xorps	%xmm5, %xmm6
#endif

	/* Preload the first two buffer quads (software pipelining). */
	movaps	-32 * SIZE(Y1), %xmm0
	movaps	-28 * SIZE(Y1), %xmm1
	ALIGN_3

	movl	M, I
	sarl	$3, I			/* I = M / 8 (8 complex / iter) */
	jle	.L15

	/* Preload the first 4 complex elements of the column. */
	movsd	-32 * SIZE(A1), %xmm2
	movhps	-30 * SIZE(A1), %xmm2
	movsd	-28 * SIZE(A1), %xmm4
	movhps	-26 * SIZE(A1), %xmm4

	decl	I
	jle	.L14			/* only one iteration: tail only */
	ALIGN_3

/* Main loop: 8 complex elements per iteration.  Each group computes
 * buffer += Re(t)*a  -/+  Im(t)*swap(a), with loads for the next group
 * interleaved to hide latency. */
.L13:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3	/* xmm3 = a with re/im swapped */
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	movsd	-24 * SIZE(A1), %xmm2	/* load next group */
	movhps	-22 * SIZE(A1), %xmm2
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1
	movsd	-20 * SIZE(A1), %xmm4
	movhps	-18 * SIZE(A1), %xmm4

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -32 * SIZE(Y1)	/* store, then reload ahead */
	movaps	-24 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -28 * SIZE(Y1)
	movaps	-20 * SIZE(Y1), %xmm1

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	movsd	-16 * SIZE(A1), %xmm2
	movhps	-14 * SIZE(A1), %xmm2
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1
	movsd	-12 * SIZE(A1), %xmm4
	movhps	-10 * SIZE(A1), %xmm4

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -24 * SIZE(Y1)
	movaps	-16 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -20 * SIZE(Y1)
	movaps	-12 * SIZE(Y1), %xmm1

	subl	$-16 * SIZE, A1		/* advance 8 complex elements */
	subl	$-16 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

/* Drain the pipeline: last 8 elements of the unrolled loop (loads for
 * a following group are omitted at the end). */
.L14:
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	movsd	-24 * SIZE(A1), %xmm2
	movhps	-22 * SIZE(A1), %xmm2
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1
	movsd	-20 * SIZE(A1), %xmm4
	movhps	-18 * SIZE(A1), %xmm4

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -32 * SIZE(Y1)
	movaps	-24 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -28 * SIZE(Y1)
	movaps	-20 * SIZE(Y1), %xmm1

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -24 * SIZE(Y1)
	movaps	-16 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -20 * SIZE(Y1)
	movaps	-12 * SIZE(Y1), %xmm1

	subl	$-16 * SIZE, A1
	subl	$-16 * SIZE, Y1
	ALIGN_3

/* Tail: 4 remaining complex elements (M & 4). */
.L15:
	testl	$4, M
	je	.L17

	movsd	-32 * SIZE(A1), %xmm2
	movhps	-30 * SIZE(A1), %xmm2
	movsd	-28 * SIZE(A1), %xmm4
	movhps	-26 * SIZE(A1), %xmm4

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -32 * SIZE(Y1)
	movaps	-24 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -28 * SIZE(Y1)
	movaps	-20 * SIZE(Y1), %xmm1

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, Y1
	ALIGN_3

/* Tail: 2 remaining complex elements (M & 2). */
.L17:
	testl	$2, M
	je	.L18

	movsd	-32 * SIZE(A1), %xmm2
	movhps	-30 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0

	movaps	%xmm0, -32 * SIZE(Y1)
	movaps	%xmm1, %xmm0		/* shift pipeline register down */

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

/* Tail: last single complex element (M & 1). */
.L18:
	testl	$1, M
	je	.L19

#ifdef movsd
	xorps	%xmm2, %xmm2		/* movlps leaves high bits: clear */
#endif
	movsd	-32 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0

	movlps	%xmm0, -32 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L10			/* next column */
	ALIGN_4

/* Copy-back phase: y += buffer, with stride INCY, 4 complex / iter. */
.L990:
	movl	Y,      Y1
	movl	BUFFER, X

	movl	STACK_INCY, INCY
	sall	$ZBASE_SHIFT, INCY

	movl	M,   %eax
	sarl	$3,  %eax		/* 8 complex elements / iter */
	jle	.L994
	ALIGN_3

.L992:
	movsd	(Y1), %xmm0		/* pack two strided y elements */
	movhps	(Y1, INCY), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	4 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	8 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	12 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$16 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

/* Copy-back tail: 4 remaining complex elements. */
.L994:
	testl	$4, M
	jle	.L995

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	4 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$8 * SIZE, X
	ALIGN_3

/* Copy-back tail: 2 remaining complex elements. */
.L995:
	testl	$2, M
	jle	.L996

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$4 * SIZE, X
	ALIGN_3

/* Copy-back tail: last single complex element. */
.L996:
	testl	$1, M
	jle	.L999

#ifdef movsd
	xorps	%xmm0, %xmm0		/* movlps leaves high bits: clear */
#endif
	movsd	(Y1), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	ALIGN_3

.L999:
	/* Restore callee-saved registers in reverse push order. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE