/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Syntax : AT&T (GAS), passed through the C preprocessor.
 * Target : x86-64 with SSE/SSE2 (haddps is used when HAVE_SSE3 is defined).
 * ABI    : System V AMD64 by default, Microsoft x64 when WINDOWS_ABI is
 *          defined.
 *
 * Overview (derived from the instructions below; the exported symbol is
 * produced by PROLOGUE in common.h and is not visible in this file):
 *
 *   1. Pack alpha * x (stride INCX) contiguously into BUFFER.
 *   2. When INCY is not one element, gather y into a contiguous scratch
 *      area placed after the packed x; otherwise y is updated in place.
 *   3. Replace IS by M - IS and advance A by that many columns.  Columns
 *      are then consumed four at a time (followed by a two-column and a
 *      one-column tail selected by bits 1 and 0 of M).  For a column j
 *      and the rows i above the current diagonal block:
 *          y[i] += A[i][j] * xs[j]      (xs = packed alpha * x)
 *          y[j] += A[i][j] * xs[i]
 *      The diagonal block itself is assembled only from entries whose
 *      row index is <= the column index.
 *   4. When a scratch y was used, scatter it back with stride INCY.
 *
 * NOTE(review): this looks like the single-precision upper-triangle
 * SYMV kernel (y += alpha * A * x); confirm against the build rules.
 * NOTE(review): movss/mulps imply 4-byte elements, so SIZE (defined in
 * common.h) is presumably 4; confirm.
 * NOTE(review): the movaps loads from NEW_X + IS * SIZE require 16-byte
 * alignment, so BUFFER is presumably 16-byte aligned and M - IS a
 * multiple of 4 on entry; confirm against the caller.
 * NOTE(review): the tail paths load a few elements past index M - 1 from
 * x (buffer), y and A (for example the 4-wide loads in the two-column
 * tail); confirm the caller tolerates these over-reads.
 */

#define ASSEMBLER
#include "common.h"

/* Per-CPU prefetch instruction and prefetch distance (in bytes). */

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

/* On OPTERON every movsd in this file is assembled as movlps, which */
/* leaves the upper half of the destination register untouched.     */
#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

/*
 * Argument locations.  ARG1..ARG6 come from common.h (presumably the
 * integer argument registers of the active ABI; confirm there).  The
 * OLD_* macros address stack arguments relative to %rsp after the
 * local frame of STACKSIZE bytes has been allocated.
 */

#ifndef WINDOWS_ABI

/* 80-byte frame: six 8-byte GPR save slots are used at offsets 0..40. */
#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define IS	  ARG2
#define	A	  ARG3
#define LDA	  ARG4
#define	X	  ARG5
#define INCX	  ARG6

#else

/* 256-byte frame: GPR saves at 0..56, xmm6-xmm15 saves at 64..223. */
#define STACKSIZE	256

#define OLD_LDA		 40 + STACKSIZE(%rsp)
#define OLD_X		 48 + STACKSIZE(%rsp)
#define OLD_INCX	 56 + STACKSIZE(%rsp)
#define OLD_Y		 64 + STACKSIZE(%rsp)
#define OLD_INCY	 72 + STACKSIZE(%rsp)
#define OLD_BUFFER	 80 + STACKSIZE(%rsp)

#define M	  ARG1
#define IS	  ARG2
#define	A	  ARG4
#define LDA	  ARG3
#define	X	  %rdi
#define INCX	  %rsi

#endif

/* Remaining stack arguments, loaded into scratch/callee-saved GPRs. */
#define	Y	%r10
#define INCY	%r11
#define BUFFER	%r12

/* TEMP and I share %rax; they are never live at the same time.      */
#define TEMP	%rax
#define I	%rax
#define A1	%rbx		/* column pointer: columns 0 and 1 (via LDA)  */
#define A2	%rbp		/* column pointer: columns 2 and 3 (via LDA)  */
#define XX	%r13		/* running pointer into the packed x          */
#define YY	%r14		/* running pointer into y / scratch y         */
#define NEW_X	BUFFER		/* packed alpha * x                           */
#define NEW_Y	X		/* X register is reused once x is packed      */

/* ALPHA aliases atemp1 (%xmm0); alpha is dead once x has been packed. */
#define ALPHA  %xmm0

/* Broadcast copies of the packed-x entries of the current columns.   */
#define atemp1 %xmm0
#define atemp2 %xmm1
#define atemp3 %xmm2
#define atemp4 %xmm3

/* Per-column partial dot products (4 lanes each).                     */
#define xsum1  %xmm4
#define xsum2  %xmm5
#define xsum3  %xmm6
#define xsum4  %xmm7

/* Packed-x row vectors, y accumulator and a multiply scratch.         */
#define xtemp1 %xmm8
#define xtemp2 %xmm9
#define yy1    %xmm10
#define	xt1    %xmm11

/* Matrix entries of the (up to) four current columns.                 */
#define a1     %xmm12
#define a2     %xmm13
#define a3     %xmm14
#define	a4     %xmm15


	PROLOGUE
	PROFCODE

	/* Allocate the frame and save the callee-saved GPRs.          */
	/* NOTE(review): %r15 is saved/restored but never used below.  */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Microsoft x64: rdi, rsi and xmm6-xmm15 are callee-saved.    */
	/* movups is used, so the save area needs no 16-byte alignment. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,   LDA
	movq	OLD_X,     X
	movq	OLD_INCX,  INCX

	/* The scalar arrives in xmm2 here; move it to ALPHA (xmm0).   */
	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert the element strides to byte strides.                */
	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	/* Nothing to do for M <= 0.                                    */
	testq	M, M
	jle	.L999

	/* IS = M - IS: index of the first column that is processed.   */
	negq	IS
	addq	M, IS

	/* A += IS * LDA (bytes): skip the leading IS columns.          */
	movq	IS,  TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

	/* Broadcast lane 0 of ALPHA to all four lanes.                 */
	shufps	$0, ALPHA, ALPHA

	movq	BUFFER, XX

	/* ---- Pack alpha * x into BUFFER, eight elements per pass ---- */
	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

	/* ---- Pack the remaining M % 8 elements of alpha * x -------- */
.L02:
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulss	ALPHA, %xmm1

	movss	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	/* XX = next 512-byte boundary strictly above the packed x.    */
	addq	$512, XX
	andq	$-512, XX

	/* Unit-stride y is updated in place.                           */
	cmpq	$SIZE, INCY
	je	.L10

	/* ---- Gather strided y into the scratch area at XX ---------- */
	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

	/* ============================================================ */
	/* Four-column blocks: taken while IS + 4 <= M.                  */
	/* ============================================================ */
.L10:
	movq	 IS, I
	addq	 $4, I
	cmpq	 M,  I
	jg	 .L20
	ALIGN_3

.L11:
	/* A1 -> columns 0/1, A2 -> columns 2/3; A advances 4 columns. */
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

	/* Broadcast packed x[IS..IS+3] into atemp1..atemp4.            */
	/* (movaps: this address must be 16-byte aligned.)              */
	movaps		0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2
	pxor		xsum3, xsum3
	pxor		xsum4, xsum4

	/* Preload the first row vectors of x, A and y.                 */
	movaps		 0 * SIZE(NEW_X), xtemp1
	movaps		 4 * SIZE(NEW_X), xtemp2

	movsd	 0 * SIZE(A1), a1
	movhps	 2 * SIZE(A1), a1
	movsd	 0 * SIZE(A1, LDA, 1), a2
	movhps	 2 * SIZE(A1, LDA, 1), a2
	movsd	 0 * SIZE(A2), a3
	movhps	 2 * SIZE(A2), a3
	movsd	 0 * SIZE(A2, LDA, 1), a4
	movhps	 2 * SIZE(A2, LDA, 1), a4

	movsd	 0 * SIZE(NEW_Y), yy1
	movhps	 2 * SIZE(NEW_Y), yy1

	movq		NEW_X, XX
	movq		NEW_Y, YY

	/* I = IS / 16: number of 16-row passes above the diagonal.    */
	movq	IS,  I
	sarq	$4,  I
	jle	.L14
	ALIGN_3

	/*
	 * Off-diagonal rows, 16 per pass (four groups of 4 rows).  Per
	 * column c: xsum_c += x_rows * a_c and yy1 += atemp_c * a_c, then
	 * the next four entries of that column are preloaded.
	 */
.L12:
	/* rows 0..3 */
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	/* rows 4..7 */
	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1


	/* rows 8..11 */
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	 xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	 yy1,  8 * SIZE(YY)
	movhps	 yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	/* rows 12..15 */
	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 12 * SIZE(YY)
	movhps	 yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	 $16 * SIZE, XX
	addq	 $16 * SIZE, YY
	addq	 $16 * SIZE, A1
	addq	 $16 * SIZE, A2

	decq	 I
	jg	 .L12
	ALIGN_3

	/* Eight more off-diagonal rows when bit 3 of IS is set.  After */
	/* testq, OF = 0 and SF = 0 (bit 3 is not the sign bit), so jle  */
	/* branches exactly when the tested bit is clear.                */
.L14:
	testq	$8, IS
	jle	.L15

	/* rows 0..3 */
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

	movaps	 xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	/* rows 4..7 */
	movaps	 xtemp2, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	 xtemp2, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	 xtemp2, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	 xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	 yy1, 4 * SIZE(YY)
	movhps	 yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	 $8 * SIZE, XX
	addq	 $8 * SIZE, YY
	addq	 $8 * SIZE, A1
	addq	 $8 * SIZE, A2
	ALIGN_3

	/* Four more off-diagonal rows when bit 2 of IS is set.  No A   */
	/* preload follows: the diagonal block reloads what it needs.   */
.L15:
	testq	$4, IS
	jle	.L18

	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1

	movaps	 xtemp1, xt1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1

	movaps	 xtemp1, xt1
	mulps	 a3,     xt1
	mulps	 atemp3, a3
	addps	 xt1,    xsum3
	addps	 a3,     yy1

	movaps	 xtemp1, xt1
	mulps	 a4,     xt1
	mulps	 atemp4, a4
	addps	 xt1,    xsum4
	addps	 a4,     yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1
	addq	 $4 * SIZE, A2
	ALIGN_3

	/*
	 * 4x4 diagonal block.  atemp1 = packed x[IS..IS+3].  Each row r of
	 * the block is rebuilt from entries with row index <= column
	 * index (column r down to the diagonal, then element r of the
	 * columns to its right) and multiplied into xsum(r+1).
	 */
.L18:
	movaps	 0 * SIZE(NEW_X, IS, SIZE), atemp1

	/* row 0: element 0 of each of the four columns */
	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2
	movss	 0 * SIZE(A2), a3
	movss	 0 * SIZE(A2, LDA, 1), a4

	unpcklps a3, a1
	unpcklps a4, a2
	unpcklps a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum1

	/* row 1: col1[0..1], col2[1], col3[1] */
	movsd	 0 * SIZE(A1, LDA, 1), a1
	movss	 1 * SIZE(A2), a2
	movhps	 1 * SIZE(A2, LDA, 1), a2

	/* 0x84 keeps lanes 0,1 of a1 and appends lanes 0,2 of a2. */
	shufps	 $0x84, a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum2

	/* row 2: col2[0..1], col2[2], col3[2] */
	movsd	 0 * SIZE(A2), a1
	movss	 2 * SIZE(A2), a2
	movhps	 2 * SIZE(A2, LDA, 1), a2

	shufps	 $0x84, a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum3

	/* row 3: col3[0..3] */
	movsd	 0 * SIZE(A2, LDA, 1), a1
	movhps	 2 * SIZE(A2, LDA, 1), a1

	mulps	 atemp1, a1
	addps	 a1, xsum4


	/* Reduce: lane c of xsum1 = horizontal sum of xsum(c+1).       */
#ifndef HAVE_SSE3
	movaps	 xsum1,  xtemp1
	unpcklps xsum3,  xsum1
	unpckhps xsum3,  xtemp1

	movaps	 xsum2,  xtemp2
	unpcklps xsum4,  xsum2
	unpckhps xsum4,  xtemp2

	movaps	 xsum1,  xsum3
	unpcklps xsum2,  xsum1
	unpckhps xsum2,  xsum3

	movaps	 xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	 xsum3,  xsum1
	addps	 xtemp1, xsum4
	addps	 xsum4,  xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum4, xsum3

	haddps	 xsum3, xsum1
#endif

	/* y[IS..IS+3] += column sums; yy1 already holds those y values. */
	addps	 xsum1, yy1

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)

	addq	 $4, IS

	/* Another four-column block while IS + 4 <= M.                  */
	movq	 IS, I
	addq	 $4, I
	cmpq	 M, I
	jle	 .L11
	ALIGN_3

	/* ============================================================ */
	/* Two-column tail: taken when bit 1 of M is set.                */
	/* ============================================================ */
.L20:
	testq	$2, M
	jle	.L30

	movq	A,  A1
	leaq	(A, LDA, 2), A

	/* Only lanes 0 and 1 of atemp4 are consumed by the shuffles.   */
	movsd	 0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2

	movaps	 0 * SIZE(NEW_X), xtemp1

	movsd	 0 * SIZE(A1), a1
	movhps	 2 * SIZE(A1), a1
	movsd	 0 * SIZE(A1, LDA, 1), a2
	movhps	 2 * SIZE(A1, LDA, 1), a2

	movsd	 0 * SIZE(NEW_Y), yy1
	movhps	 2 * SIZE(NEW_Y), yy1

	movq		NEW_X, XX
	movq		NEW_Y, YY

	/* I = IS / 4: four off-diagonal rows per pass.                  */
	movq	IS,  I
	sarq	$2,  I
	jle	.L28
	ALIGN_3

.L22:
	movaps	 xtemp1, xt1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	 xtemp1, xt1
	movaps	 4 * SIZE(XX), xtemp1
	mulps	 a2,     xt1
	mulps	 atemp2, a2
	addps	 xt1,    xsum2
	addps	 a2,     yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movlps	 yy1, 0 * SIZE(YY)
	movhps	 yy1, 2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	addq	 $4 * SIZE, XX
	addq	 $4 * SIZE, YY
	addq	 $4 * SIZE, A1

	decq	 I
	jg	 .L22
	ALIGN_3

	/* 2x2 diagonal block, built from col0[0], col1[0] and col1[0..1]. */
.L28:
	movsd	 0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2

	unpcklps a2, a1

	mulps	 atemp1, a1
	addps	 a1, xsum1

	movsd	 0 * SIZE(A1, LDA, 1), a1
	mulps	 atemp1, a1
	addps	 a1, xsum2

	/* Reduce: lanes 0/1 of xsum1 = horizontal sums of xsum1/xsum2.  */
#ifndef HAVE_SSE3
	movhlps	 xsum1, xsum3
	movhlps	 xsum2, xsum4
	addps	 xsum3, xsum1
	addps	 xsum4, xsum2

	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2

	addps	 xsum2, xsum1
#else
	haddps	 xsum2, xsum1
	haddps	 xsum1, xsum1
#endif

	addps	 xsum1, yy1

	/* Only the two y entries of this block are written back.       */
	movlps	 yy1, 0 * SIZE(YY)

	addq	 $2, IS
	ALIGN_3

	/* ============================================================ */
	/* One-column tail: taken when bit 0 of M is set.                */
	/* ============================================================ */
.L30:
	testq	$1, M
	jle	.L990

	movq	A,  A1

	movss	 0 * SIZE(NEW_X, IS, SIZE), atemp1

	pshufd	$0x00, atemp1, atemp1

	pxor		xsum1, xsum1
	pxor		xsum2, xsum2

	movss	 0 * SIZE(NEW_Y), yy1

	movss	 0 * SIZE(NEW_X), xtemp1
	movss	 1 * SIZE(NEW_X), xtemp2

	movss	 0 * SIZE(A1), a1
	movss	 1 * SIZE(A1), a2

	movq		NEW_X, XX
	movq		NEW_Y, YY

	/* I = IS / 2: two off-diagonal rows per pass.                   */
	movq	IS,  I
	sarq	$1,  I
	jle	.L38
	ALIGN_3

	/* Operands were loaded with movss, so only lane 0 carries data; */
	/* both halves accumulate into xsum1 and xsum2 stays zero.       */
.L32:
	movaps	 xtemp1, xt1
	movss	 2 * SIZE(XX), xtemp1
	mulps	 a1,     xt1
	mulps	 atemp1, a1
	addps	 xt1,    xsum1
	addps	 a1,     yy1
	movss	 2 * SIZE(A1), a1

	movss	 yy1, 0 * SIZE(YY)
	movss	 1 * SIZE(YY), yy1

	movaps	 xtemp2, xt1
	movss	 3 * SIZE(XX), xtemp2
	mulps	 a2,     xt1
	mulps	 atemp1, a2
	addps	 xt1,    xsum1
	addps	 a2,     yy1
	movss	 3 * SIZE(A1), a2

	movss	 yy1, 1 * SIZE(YY)
	movss	 2 * SIZE(YY), yy1

	addq	 $2 * SIZE, XX
	addq	 $2 * SIZE, YY
	addq	 $2 * SIZE, A1

	decq	 I
	jg	 .L32
	ALIGN_3

	/* 1x1 diagonal element.                                         */
.L38:
	movsd	 0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	 0 * SIZE(A1), a1
	mulss	 atemp1, a1
	addss	 a1, xsum1

	/* Only lane 0 of xsum1 is consumed by the addss below.          */
#ifndef HAVE_SSE3
	movhlps	 xsum1, xsum3
	movhlps	 xsum2, xsum4
	addps	 xsum3, xsum1
	addps	 xsum4, xsum2

	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2

	addps	 xsum2, xsum1
#else
	addss	 xsum2, xsum1
#endif

	addss	 xsum1, yy1

	movss	 yy1, 0 * SIZE(YY)

	/* NOTE(review): one column was consumed but IS advances by 2   */
	/* (looks copied from the two-column tail).  IS is not read     */
	/* after this point, so the result is unaffected.                */
	addq	 $2, IS
	ALIGN_3

	/* ---- Scatter the scratch y back when INCY is not one element - */
.L990:
	cmpq	$SIZE, INCY
	je	.L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	 0 * SIZE(NEW_Y), %xmm0
	movss	 1 * SIZE(NEW_Y), %xmm1
	movss	 2 * SIZE(NEW_Y), %xmm2
	movss	 3 * SIZE(NEW_Y), %xmm3
	movss	 4 * SIZE(NEW_Y), %xmm4
	movss	 5 * SIZE(NEW_Y), %xmm5
	movss	 6 * SIZE(NEW_Y), %xmm6
	movss	 7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6,  0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0,  0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3


	/* ---- Restore callee-saved registers and return -------------- */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE