/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*---------------------------------------------------------------------
 * GEMV "N" (no-transpose) kernel, IA-32, SSE2 (SSE3 path under
 * HAVE_SSE3).  From the code below: a work BUFFER is zero-filled,
 * then for each column j the kernel accumulates
 *     buffer[0..M-1] += (alpha * x[j]) * A[0..M-1, j]
 * two columns per pass, and finally does y[i*incy] += buffer[i].
 * So overall this computes y += alpha * A * x; any beta scaling of y
 * is presumably done by the caller -- not visible here (TODO confirm).
 *
 * Element type: the buffer-clear loop stores movapd (16 bytes) at a
 * spacing of 2 * SIZE, so SIZE is sizeof(double) == 8 here.  SIZE,
 * BASE_SHIFT, PROLOGUE/EPILOGUE, ALIGN_*, BRANCH all come from
 * "common.h" (not visible in this file).
 *
 * cdecl stack args (offsets below are after the 4 pushes = STACKSIZE):
 *   M, N, ALPHA (8-byte double at +16), A, LDA, X, INCX, Y, INCY,
 *   BUFFER (scratch area, at least M doubles rounded up -- see .L01).
 *-------------------------------------------------------------------*/

#define ASSEMBLER
#include "common.h"

/* Per-microarchitecture prefetch instruction and lookahead distance
   (in units of 8 elements; used as (PREFETCHSIZE + 0) * SIZE below). */

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 3)
/* On Opteron every movsd below is assembled as movlps (same load of
   the low 8 bytes; movlps avoids merging with the old register value
   the way movsd reg,reg semantics would). */
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetch
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(8 * 4)
#endif

#define STACKSIZE	16	/* bytes pushed in the prologue (4 regs) */

/* Caller stack arguments, addressed relative to %esp after prologue. */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA		16 + STACKSIZE(%esp)
#define A		24 + STACKSIZE(%esp)
#define STACK_LDA	28 + STACKSIZE(%esp)
#define STACK_X		32 + STACKSIZE(%esp)
#define STACK_INCX	36 + STACKSIZE(%esp)
#define Y		40 + STACKSIZE(%esp)
#define STACK_INCY	44 + STACKSIZE(%esp)
#define BUFFER		48 + STACKSIZE(%esp)

/* Register roles.  Note INCY aliases J: J (column counter) is dead by
   the time INCY is needed in the final copy-back loop (.L990). */
#define I	%eax		/* inner (row) loop counter   */
#define J	%ebx		/* outer (column) loop counter */

#define INCX	%ecx		/* x stride, scaled to bytes   */
#define INCY	J		/* y stride, scaled to bytes   */

#define A1	%esi		/* current column pointer      */
#define X	%edx		/* walking x pointer           */
#define Y1	%edi		/* walking buffer/y pointer    */
#define LDA	%ebp		/* column stride, in bytes     */

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Scale strides from elements to bytes. */
	leal	(,INCX, SIZE), INCX
	leal	(,LDA,  SIZE), LDA

	/* Bias A forward by 16 elements: all loads below use negative
	   displacements starting at -16 * SIZE.  (subl $-16*SIZE is
	   addl $16*SIZE with a sign-extended 8-bit immediate.) */
	subl	$-16 * SIZE, A

	cmpl	$0, N
	jle	.L999		/* nothing to do */
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	pxor	%xmm7, %xmm7	/* xmm7 = 0.0 for the clear loop */

	/* Zero ceil((M + 16) / 16) * 16 buffer elements, i.e. M rounded
	   up to a multiple of 16 (plus slack), 16 doubles per pass. */
	movl	M,   %eax
	addl	$16, %eax
	sarl	$4,  %eax
	ALIGN_3

.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

.L10:
	/* Outer loop: process columns two at a time. */
	movl	N, J
	sarl	$1, J
	jle	.L20		/* fewer than 2 columns left */
	ALIGN_3

.L11:

	/* Rewind buffer pointer; +16*SIZE matches the -16*SIZE bias
	   used in every displacement below. */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A, A1
	leal	(A1, LDA, 2), %eax	/* advance A by two columns */
	movl	%eax, A

	/* xmm6 = alpha * x[j] broadcast; xmm7 = alpha * x[j+1]. */
#ifdef HAVE_SSE3
	movddup	(X), %xmm6
	addl	INCX, X
	movddup	(X), %xmm7
	addl	INCX, X

	movddup	ALPHA, %xmm0

	mulpd	%xmm0, %xmm6
	mulpd	%xmm0, %xmm7
#else
	movsd	(X), %xmm6
	addl	INCX, X
	movsd	(X), %xmm7
	addl	INCX, X

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6
	mulsd	%xmm0, %xmm7

	/* Duplicate the scalar into both packed lanes. */
	unpcklpd %xmm6, %xmm6
	unpcklpd %xmm7, %xmm7
#endif

	ALIGN_3

	movl	M, I
	sarl	$3, I		/* 8 rows per iteration */
	jle	.L15

	/* Software pipeline: pre-load the first 4 rows of both columns
	   (xmm2/xmm3 from column j, xmm4/xmm5 from column j+1) and the
	   matching buffer values before entering the loop. */
	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1

	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	movsd	-14 * SIZE(A1, LDA), %xmm5
	movhpd	-13 * SIZE(A1, LDA), %xmm5

	decl	I
	jle	.L14		/* only the drained last iteration left */
	ALIGN_3

.L13:
	/* Steady state: buffer[i..i+7] += xmm6*A(i,j) + xmm7*A(i,j+1),
	   reloading the next iteration's operands as each register is
	   consumed (keeps loads ahead of the multiplies). */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0

	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm5
	movhpd	 -9 * SIZE(A1, LDA), %xmm5

	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA)
#endif

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-8 * SIZE(A1), %xmm2
	movhpd	-7 * SIZE(A1), %xmm2
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-6 * SIZE(A1), %xmm3
	movhpd	-5 * SIZE(A1), %xmm3

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-8 * SIZE(A1, LDA), %xmm4
	movhpd	-7 * SIZE(A1, LDA), %xmm4

	movapd	%xmm0, -12 * SIZE(Y1)
	movapd	-8 * SIZE(Y1), %xmm0

	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movsd	-6 * SIZE(A1, LDA), %xmm5
	movhpd	-5 * SIZE(A1, LDA), %xmm5

	movapd	%xmm1, -10 * SIZE(Y1)
	movapd	-6 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1	/* advance 8 rows */
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:
	/* Pipeline drain: same work as .L13 but without loading past
	   the final 8-row group (no loads for a next iteration). */
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0

	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm5
	movhpd	 -9 * SIZE(A1, LDA), %xmm5

	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movapd	%xmm0, -12 * SIZE(Y1)
	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movapd	%xmm1, -10 * SIZE(Y1)

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3

.L15:
	/* Row tail: 4 remaining rows. */
	testl	$4, M
	je	.L16

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1

	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	movsd	-14 * SIZE(A1, LDA), %xmm5
	movhpd	-13 * SIZE(A1, LDA), %xmm5

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

.L16:
	/* Row tail: 2 remaining rows. */
	testl	$2, M
	je	.L17

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm3
	movhpd	-15 * SIZE(A1, LDA), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm7, %xmm3
	addpd	%xmm3, %xmm0

	movapd	%xmm0, -16 * SIZE(Y1)

	addl	$2 * SIZE, A1
	addl	$2 * SIZE, Y1
	ALIGN_3

.L17:
	/* Row tail: last single row (scalar). */
	testl	$1, M
	je	.L19

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm3

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm6, %xmm2
	addsd	%xmm2, %xmm0
	mulsd	%xmm7, %xmm3
	addsd	%xmm3, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L11
	ALIGN_4

.L20:
	/* Odd-N tail: one leftover column, same structure as the
	   two-column loop but with a single scalar xmm6. */
	testl	$1, N
	jle	.L990

	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A, A1

#ifdef HAVE_SSE3
	movddup	(X), %xmm6
	addl	INCX, X

	movddup	ALPHA, %xmm0

	mulpd	%xmm0, %xmm6
#else
	movsd	(X), %xmm6
	addl	INCX, X

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6
	unpcklpd %xmm6, %xmm6
#endif

	ALIGN_3

	movl	M, I
	sarl	$3, I
	jle	.L25

	/* Pre-load first 4 rows and buffer values (pipeline fill). */
	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	decl	I
	jle	.L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-8 * SIZE(A1), %xmm2
	movhpd	-7 * SIZE(A1), %xmm2

	movapd	%xmm0, -12 * SIZE(Y1)
	movapd	-8 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-6 * SIZE(A1), %xmm3
	movhpd	-5 * SIZE(A1), %xmm3

	movapd	%xmm1, -10 * SIZE(Y1)
	movapd	-6 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L23
	ALIGN_3

.L24:
	/* Pipeline drain for the single-column loop. */
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0

	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movapd	%xmm0, -12 * SIZE(Y1)
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movapd	%xmm1, -10 * SIZE(Y1)

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3

.L25:
	/* Row tail: 4 rows. */
	testl	$4, M
	je	.L26

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

.L26:
	/* Row tail: 2 rows. */
	testl	$2, M
	je	.L27

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2

	movapd	-16 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0

	movapd	%xmm0, -16 * SIZE(Y1)

	addl	$2 * SIZE, A1
	addl	$2 * SIZE, Y1
	ALIGN_3

.L27:
	/* Row tail: last single row. */
	testl	$1, M
	je	.L990

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm6, %xmm2
	addsd	%xmm2, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L990:
	/* Copy-back phase: y[i * incy] += buffer[i].  X is reused here
	   as the (unit-stride) buffer pointer; INCY aliases J, which is
	   no longer needed as a column counter. */
	movl	Y, Y1
	movl	BUFFER, X

	movl	STACK_INCY, INCY
	sall	$BASE_SHIFT, INCY	/* scale incy to bytes */

	movl	M, %eax
	sarl	$3, %eax		/* 8 y elements per pass */
	jle	.L994
	ALIGN_3

.L992:
	/* Each pair: gather y[i], y[i+incy] into one xmm, add two
	   buffer elements, scatter back. */
	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	4 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	6 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

.L994:
	/* Copy-back tails: 4, then 2, then 1 element. */
	testl	$7, M
	jle	.L999

	testl	$4, M
	jle	.L995

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$4 * SIZE, X
	ALIGN_3

.L995:
	testl	$2, M
	jle	.L996

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$2 * SIZE, X
	ALIGN_3

.L996:
	testl	$1, M
	jle	.L999

	movsd	(Y1), %xmm0

	movsd	0 * SIZE(X), %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE