/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * 32-bit x86 single-precision GEMV kernel (AT&T syntax), transposed
 * case:  y := alpha * A^T * x + y.
 *
 * Arguments arrive on the stack (cdecl); see the offset macros below.
 * The slot at offset 12 is unused here (NOTE(review): presumably a
 * dummy argument in the common kernel signature — confirm against
 * the caller).
 *
 * Strategy, as visible in the code below:
 *  - x is first gathered (stride INCX) into the contiguous BUFFER so
 *    the dot-product loops can use aligned movaps loads; the buffer
 *    tail is zero-padded to a multiple of 8 floats.
 *  - Columns of A are processed two at a time (.L11 loop): packed
 *    multiplies accumulate partial dot products of each column with
 *    the buffered x into %xmm0/%xmm1, which are then horizontally
 *    summed, scaled by alpha, and added into y.  A single leftover
 *    column is handled by the .L20 path.
 *  - M is processed in chunks of at most 2^22-8 floats so the packed
 *    x always fits the 16 MB buffer; the .L0t/.L999 outer loop walks
 *    A and x forward between chunks.
 */

#define ASSEMBLER
#include "common.h"

/* Some targets redefine movsd below; start from a clean slate. */
#ifdef movsd
#undef movsd
#endif

/*
 * Per-CPU prefetch selection.  PREFETCHSIZE is the lookahead distance
 * in floats.  On PENTIUM3 and OPTERON the two-operand movsd load is
 * remapped to movlps; unlike movsd, movlps leaves the destination's
 * upper 64 bits unchanged, which is why the residual paths below
 * explicitly zero %xmm4/%xmm5 when this remap is active.
 */
#ifdef PENTIUM3
#ifdef HAVE_SSE
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif
#define movsd	movlps
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 3)
#define movsd	movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif

/*
 * Stack layout: ARGS bytes of local scratch are reserved first, then
 * four callee-saved registers are pushed (STACKSIZE = 16), so every
 * incoming argument sits at original_offset + STACKSIZE + ARGS from
 * the post-prologue %esp.
 */
#define STACKSIZE	16
#define ARGS	20

#define M	 4 + STACKSIZE+ARGS(%esp)	/* rows of A / length of x   */
#define N	 8 + STACKSIZE+ARGS(%esp)	/* columns of A / length of y */
#define ALPHA	16 + STACKSIZE+ARGS(%esp)	/* scalar alpha (float)      */
#define A	20 + STACKSIZE+ARGS(%esp)	/* A, column major           */
#define STACK_LDA	24 + STACKSIZE+ARGS(%esp)	/* leading dimension (elems) */
#define STACK_X	28 + STACKSIZE+ARGS(%esp)	/* input vector x            */
#define STACK_INCX	32 + STACKSIZE+ARGS(%esp)	/* stride of x (elements)    */
#define Y	36 + STACKSIZE+ARGS(%esp)	/* in/out vector y           */
#define STACK_INCY	40 + STACKSIZE+ARGS(%esp)	/* stride of y (elements)    */
#define BUFFER	44 + STACKSIZE+ARGS(%esp)	/* scratch for packed x      */

/* Locals (inside the ARGS area) used by the outer chunking loop. */
#define MMM	0+ARGS(%esp)	/* rows not yet processed */
#define AA	4+ARGS(%esp)	/* running pointer into A */
#define XX	8+ARGS(%esp)	/* running pointer into x */

/*
 * Register roles.  NOTE: INCX aliases J (%ebx) — INCX is only live
 * during the packing phase, J only as the column counter afterwards,
 * so the two uses never overlap.
 */
#define I	%eax
#define J	%ebx

#define INCX	J
#define INCY	%ecx

#define A1	%esi	/* current column pointer              */
#define X	%edx	/* x pointer, later packed-buffer ptr  */
#define Y1	%edi	/* buffer write ptr, later y pointer   */
#define LDA	%ebp	/* leading dimension in bytes          */

	PROLOGUE

	subl	$ARGS,%esp	# reserve MMM/AA/XX scratch
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_X, X
	movl	X,XX
	movl	A,J
	movl	J,AA	# backup A
	movl	M,J
	movl	J,MMM	# mov M to MMM

/* Outer loop: carve M into chunks that fit the packing buffer. */
.L0t:
	xorl	J,J
	addl	$1,J
	sall	$22,J	# J = 2^22 floats; 2^22*sizeof(float) = 16MB buffer
	subl	$8, J	# don't use the last 8 floats in the buffer
	subl	J,MMM	# MMM = MMM - J (sets flags for jge below)
	movl	J,M	# movl does not touch flags
	jge	.L00t	# a full-size chunk remains
	ALIGN_4

	/* Partial final chunk: M = leftover rows; done when none left. */
	movl	MMM,%eax
	addl	J,%eax
	jle	.L999x
	movl	%eax,M

.L00t:
	movl	AA,%eax
	movl	%eax,A	# A = current chunk of the matrix

	movl	XX,%eax
	movl	%eax,X	# X = current chunk of x

	movl	STACK_LDA,  LDA
	movl	STACK_INCX, INCX
	movl	STACK_INCY, INCY

	/* Convert element strides to byte strides (SIZE = sizeof(float)). */
	leal	(,INCX, SIZE), INCX
	leal	(,INCY, SIZE), INCY
	leal	(,LDA,  SIZE), LDA

	subl	$-32 * SIZE, A	# bias A by +32 floats so inner loops can
				# address with small -32..-1 displacements

	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	/* ---- Pack x (stride INCX) into the contiguous BUFFER. ---- */
	movl	BUFFER, Y1

	movl	M, I
	sarl	$3, I	# 8 elements per iteration
	jle	.L05
	ALIGN_4

.L02:
	movss	(X), %xmm0
	addl	INCX, X
	movss	(X), %xmm1
	addl	INCX, X

	unpcklps %xmm1, %xmm0	# low qword = { x[i], x[i+1] }

	movss	(X), %xmm2
	addl	INCX, X
	movss	(X), %xmm3
	addl	INCX, X

	unpcklps %xmm3, %xmm2

	movss	(X), %xmm4
	addl	INCX, X
	movss	(X), %xmm5
	addl	INCX, X

	unpcklps %xmm5, %xmm4

	movss	(X), %xmm6
	addl	INCX, X
	movss	(X), %xmm7
	addl	INCX, X

	unpcklps %xmm7, %xmm6

	movlps	%xmm0, 0 * SIZE(Y1)
	movlps	%xmm2, 2 * SIZE(Y1)
	movlps	%xmm4, 4 * SIZE(Y1)
	movlps	%xmm6, 6 * SIZE(Y1)

	addl	$8 * SIZE, Y1
	decl	I
	jg	.L02
	ALIGN_4

/* Copy the remaining (M mod 8) elements one at a time. */
.L05:
	movl	M, I
	andl	$7, I
	jle	.L10
	ALIGN_2

.L06:
	movss	(X), %xmm0
	addl	INCX, X
	movss	%xmm0, 0 * SIZE(Y1)
	addl	$SIZE, Y1
	decl	I
	jg	.L06
	ALIGN_4

/* Pad with zeros to the next multiple of 8 floats so the vector
   loops below never pick up stale data from the buffer. */
	movl	M, I
	movl	$8, J
	andl	$7, I
	xorps	%xmm0, %xmm0
	subl	I, J
	ALIGN_2
.L07:
	movss	%xmm0, 0 * SIZE(Y1)
	addl	$SIZE, Y1
	decl	J
	jg	.L07
	ALIGN_4

/* ---- Main computation: two columns of A per iteration. ---- */
.L10:
	movl	Y, Y1	# Y1 now walks the output vector y

	movl	N, J
	sarl	$1, J	# J = number of column pairs
	jle	.L20
	ALIGN_3

.L11:
	movl	BUFFER, X
	addl	$32 * SIZE, X	# same +32-float bias as A

	movl	A, A1	# A1 = first column of the pair
	leal	(A1, LDA, 2), %eax
	movl	%eax, A	# advance A by two columns for next pass

	xorps	%xmm0, %xmm0	# accumulator, column A1
	xorps	%xmm1, %xmm1	# accumulator, column A1+LDA

	movaps	-32 * SIZE(X), %xmm2	# x[0..3]  (aligned buffer load)
	movaps	-28 * SIZE(X), %xmm3	# x[4..7]

	movl	M, I
	sarl	$4, I	# 16 rows per unrolled iteration
	jle	.L15

	/* Software pipeline: preload the first 8 rows of both columns
	   (movsd/movhps pairs tolerate unaligned column starts). */
	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6
	movsd	-28 * SIZE(A1, LDA), %xmm7
	movhps	-26 * SIZE(A1, LDA), %xmm7

	decl	I
	jle	.L13	# only one 16-row group: go straight to drain
	ALIGN_4

/* Steady state: multiply-accumulate the current 16 rows while
   loading the next group's A and x data. */
.L12:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1)
#endif

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-24 * SIZE(A1, LDA), %xmm5
	movhps	-22 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-20 * SIZE(A1, LDA), %xmm7
	movhps	-18 * SIZE(A1, LDA), %xmm7

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1, LDA)
#endif

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-16 * SIZE(A1), %xmm4
	movhps	-14 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-16 * SIZE(A1, LDA), %xmm5
	movhps	-14 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-12 * SIZE(A1), %xmm6
	movhps	-10 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-12 * SIZE(A1, LDA), %xmm7
	movhps	-10 * SIZE(A1, LDA), %xmm7

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X

	decl	I
	jg	.L12
	ALIGN_4

/* Pipeline drain: consume the preloaded data; no A loads past the
   last group. */
.L13:
	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-24 * SIZE(A1, LDA), %xmm5
	movhps	-22 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-20 * SIZE(A1, LDA), %xmm7
	movhps	-18 * SIZE(A1, LDA), %xmm7

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm3, %xmm7
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X
	ALIGN_4

/* Residual rows: 8, then 4, then 2, then 1. */
.L15:
	testl	$8, M
	jle	.L16

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6
	movsd	-28 * SIZE(A1, LDA), %xmm7
	movhps	-26 * SIZE(A1, LDA), %xmm7

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, X
	ALIGN_4

.L16:
	testl	$4, M
	jle	.L17

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4

	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm1
	movaps	%xmm3, %xmm2	# next 4 x values were already in xmm3

	addl	$4 * SIZE, A1
	ALIGN_4

.L17:
	testl	$2, M
	jle	.L18

	/* When movsd is remapped to movlps (see top of file), the upper
	   half of the destination is not cleared, so zero it first —
	   the full-width mulps/addps below touch all four lanes. */
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm4

#ifdef movsd
	xorps	%xmm5, %xmm5
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm5

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm1
	movhlps	%xmm2, %xmm2	# shift remaining 2 x values into the low half

	addl	$2 * SIZE, A1
	ALIGN_4

.L18:
	testl	$1, M
	jle	.L19

	movss	-32 * SIZE(A1), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm0
	movss	-32 * SIZE(A1, LDA), %xmm5
	mulss	%xmm2, %xmm5
	addss	%xmm5, %xmm1
	ALIGN_4

/* Horizontal sum of the 4-lane accumulators, then scale and update
   two consecutive (stride INCY) elements of y. */
.L19:
#ifdef HAVE_SSE3
	haddps	%xmm0, %xmm0
	haddps	%xmm1, %xmm1

	haddps	%xmm0, %xmm0
	haddps	%xmm1, %xmm1
#else
	movhlps	%xmm0, %xmm2
	movhlps	%xmm1, %xmm3

	addps	%xmm2, %xmm0
	addps	%xmm3, %xmm1

	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm0
	movaps	%xmm1, %xmm3
	shufps	$1, %xmm1, %xmm1

	addss	%xmm2, %xmm0
	addss	%xmm3, %xmm1
#endif

	movss	ALPHA, %xmm7

	mulss	%xmm7, %xmm0
	mulss	%xmm7, %xmm1

	addss	(Y1), %xmm0
	addss	(Y1, INCY), %xmm1

	movss	%xmm0, (Y1)
	movss	%xmm1, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	decl	J
	jg	.L11
	ALIGN_4

/* ---- Single leftover column when N is odd (same structure as the
   two-column path, one accumulator). ---- */
.L20:
	testl	$1, N
	jle	.L999

	movl	BUFFER, X
	addl	$32 * SIZE, X

	movl	A, A1

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	movaps	-32 * SIZE(X), %xmm2
	movaps	-28 * SIZE(X), %xmm3

	movl	M, I
	sarl	$4, I
	jle	.L25

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6

	decl	I
	jle	.L23
	ALIGN_4

.L22:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1)
#endif

	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-16 * SIZE(A1), %xmm4
	movhps	-14 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-12 * SIZE(A1), %xmm6
	movhps	-10 * SIZE(A1), %xmm6

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X

	decl	I
	jg	.L22
	ALIGN_4

.L23:	/* pipeline drain */
	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0

	mulps	%xmm3, %xmm6
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X
	ALIGN_4

.L25:	/* residual rows: 8 / 4 / 2 / 1, as above */
	testl	$8, M
	jle	.L26

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, X
	ALIGN_4

.L26:
	testl	$4, M
	jle	.L27

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movaps	%xmm3, %xmm2

	addl	$4 * SIZE, A1
	ALIGN_4

.L27:
	testl	$2, M
	jle	.L28

	/* Zero first when movsd == movlps (upper half not cleared). */
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm4

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movhlps	%xmm2, %xmm2

	addl	$2 * SIZE, A1
	ALIGN_4

.L28:
	testl	$1, M
	jle	.L29

	movss	-32 * SIZE(A1), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm0
	ALIGN_4

/* Horizontal sum, scale by alpha, update one element of y. */
.L29:
#ifdef HAVE_SSE3
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#else
	movhlps	%xmm0, %xmm2

	addps	%xmm2, %xmm0

	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm0

	addss	%xmm2, %xmm0
#endif

	movss	ALPHA, %xmm7

	mulss	%xmm7, %xmm0

	addss	(Y1), %xmm0

	movss	%xmm0, (Y1)
	ALIGN_4

/* Chunk done: advance AA by M floats and XX by M*INCX elements,
   then loop back for the next chunk of rows. */
.L999:
	movl	M,J
	leal	(,J,SIZE),%eax	# %eax = M * sizeof(float)
	addl	%eax,AA
	movl	STACK_INCX,INCX	# (clobbers J — its value is already in %eax)
	imull	INCX,%eax	# %eax = M * sizeof(float) * incx
	addl	%eax,XX
	jmp	.L0t
	ALIGN_4

/* All rows processed: restore callee-saved registers and return. */
.L999x:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS,%esp
	ret

	EPILOGUE