/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#ifdef ATOM 43#define PREFETCH prefetcht0 44#define PREFETCHW prefetcht0 45#define PREFETCHSIZE (16 * 12) 46#endif 47 48#ifdef CORE2 49#define PREFETCH prefetcht0 50#define PREFETCHW prefetcht0 51#define PREFETCHSIZE (16 * 12) 52#endif 53 54#if defined(PENRYN) || defined(DUNNINGTON) 55#define PREFETCH prefetcht0 56#define PREFETCHW prefetcht0 57#define PREFETCHSIZE (16 * 12) 58#endif 59 60#ifdef NEHALEM 61#define PREFETCH prefetcht0 62#define PREFETCHW prefetcht0 63#define PREFETCHSIZE (16 * 24) 64#endif 65 66#ifdef PENTIUM4 67#define PREFETCH prefetcht0 68#define PREFETCHW prefetcht0 69#define PREFETCHSIZE (16 * 20) 70#endif 71 72#ifdef OPTERON 73#define PREFETCH prefetch 74#define PREFETCHW prefetchw 75#define PREFETCHSIZE (16 * 8) 76#define movsd movlpd 77#endif 78 79#if defined(BARCELONA) || defined(SHANGHAI) 80#define PREFETCH prefetch 81#define PREFETCHW prefetchw 82#define PREFETCHSIZE (16 * 16) 83#endif 84 85#ifdef NANO 86#define PREFETCH prefetcht0 87#define PREFETCHW prefetcht0 88#define PREFETCHSIZE (8 * 24) 89#endif 90 91#ifdef GENERIC 92#define PREFETCH prefetcht0 93#define PREFETCHW prefetcht0 94#define PREFETCHSIZE (16 * 20) 95#endif 96 97#ifndef WINDOWS_ABI 98 99#define STACKSIZE 80 100 101#define OLD_Y 8 + STACKSIZE(%rsp) 102#define OLD_INCY 16 + STACKSIZE(%rsp) 103#define OLD_BUFFER 24 + STACKSIZE(%rsp) 104 105#define M ARG1 106#define IS ARG2 107#define A ARG3 108#define LDA ARG4 109#define X ARG5 110#define INCX ARG6 111 112#else 113 114#define STACKSIZE 256 115 116#define OLD_LDA 40 + STACKSIZE(%rsp) 117#define OLD_X 48 + STACKSIZE(%rsp) 118#define OLD_INCX 56 + STACKSIZE(%rsp) 119#define OLD_Y 64 + STACKSIZE(%rsp) 120#define OLD_INCY 72 + STACKSIZE(%rsp) 121#define OLD_BUFFER 80 + STACKSIZE(%rsp) 122 123#define M ARG1 124#define IS ARG2 125#define A ARG4 126#define LDA ARG3 127#define X %rdi 128#define INCX %rsi 129 
130#endif 131 132#define Y %r10 133#define INCY %r11 134#define BUFFER %r12 135 136#define TEMP %rax 137#define I %rax 138#define A1 %rbx 139#define A2 %rbp 140#define XX %r13 141#define YY %r14 142#define NEW_X BUFFER 143#define NEW_Y X 144 145#define ALPHA %xmm0 146 147#define xtemp1 %xmm0 148#define xtemp2 %xmm1 149#define yy1 %xmm2 150#define yy2 %xmm3 151 152#define atemp1 %xmm4 153#define atemp2 %xmm5 154#define atemp3 %xmm6 155#define atemp4 %xmm7 156 157#define xsum1 %xmm8 158#define xsum2 %xmm9 159#define xsum3 %xmm10 160#define xsum4 %xmm11 161 162#define a1 %xmm12 163#define a2 %xmm13 164#define a3 %xmm14 165#define xt1 %xmm15 166 167 PROLOGUE 168 PROFCODE 169 170 subq $STACKSIZE, %rsp 171 movq %rbx, 0(%rsp) 172 movq %rbp, 8(%rsp) 173 movq %r12, 16(%rsp) 174 movq %r13, 24(%rsp) 175 movq %r14, 32(%rsp) 176 movq %r15, 40(%rsp) 177 178#ifdef WINDOWS_ABI 179 movq %rdi, 48(%rsp) 180 movq %rsi, 56(%rsp) 181 movups %xmm6, 64(%rsp) 182 movups %xmm7, 80(%rsp) 183 movups %xmm8, 96(%rsp) 184 movups %xmm9, 112(%rsp) 185 movups %xmm10, 128(%rsp) 186 movups %xmm11, 144(%rsp) 187 movups %xmm12, 160(%rsp) 188 movups %xmm13, 176(%rsp) 189 movups %xmm14, 192(%rsp) 190 movups %xmm15, 208(%rsp) 191 192 movq OLD_LDA, LDA 193 movq OLD_X, X 194 movq OLD_INCX, INCX 195 196 movaps %xmm2, %xmm0 197#endif 198 199 movq OLD_Y, Y 200 movq OLD_INCY, INCY 201 movq OLD_BUFFER, BUFFER 202 203 leaq (,INCX, SIZE), INCX 204 leaq (,INCY, SIZE), INCY 205 leaq (,LDA, SIZE), LDA 206 207 testq M, M 208 jle .L999 209 210 negq IS 211 addq M, IS 212 213 movq IS, TEMP 214 imulq LDA, TEMP 215 addq TEMP, A 216 217 unpcklpd ALPHA, ALPHA 218 219 movq BUFFER, XX 220 221 movq M, %rax 222 sarq $3, %rax 223 jle .L02 224 ALIGN_3 225 226.L01: 227 movsd 0 * SIZE(X), %xmm1 228 addq INCX, X 229 movhpd 0 * SIZE(X), %xmm1 230 addq INCX, X 231 movsd 0 * SIZE(X), %xmm2 232 addq INCX, X 233 movhpd 0 * SIZE(X), %xmm2 234 addq INCX, X 235 movsd 0 * SIZE(X), %xmm3 236 addq INCX, X 237 movhpd 0 * SIZE(X), %xmm3 238 addq 
INCX, X 239 movsd 0 * SIZE(X), %xmm4 240 addq INCX, X 241 movhpd 0 * SIZE(X), %xmm4 242 addq INCX, X 243 244 mulpd ALPHA, %xmm1 245 mulpd ALPHA, %xmm2 246 mulpd ALPHA, %xmm3 247 mulpd ALPHA, %xmm4 248 249 movapd %xmm1, 0 * SIZE(XX) 250 movapd %xmm2, 2 * SIZE(XX) 251 movapd %xmm3, 4 * SIZE(XX) 252 movapd %xmm4, 6 * SIZE(XX) 253 254 addq $8 * SIZE, XX 255 decq %rax 256 jg .L01 257 ALIGN_3 258 259.L02: 260 movq M, %rax 261 andq $7, %rax 262 jle .L05 263 ALIGN_3 264 265.L03: 266 movsd 0 * SIZE(X), %xmm1 267 addq INCX, X 268 269 mulsd ALPHA, %xmm1 270 271 movlpd %xmm1, 0 * SIZE(XX) 272 273 addq $1 * SIZE, XX 274 decq %rax 275 jg .L03 276 ALIGN_3 277 278.L05: 279 /* now we don't need original X */ 280 movq Y, NEW_Y 281 282 addq $512, XX 283 andq $-512, XX 284 285 cmpq $SIZE, INCY 286 je .L10 287 288 movq Y, YY 289 movq XX, NEW_Y 290 291 movq M, %rax 292 sarq $3, %rax 293 jle .L07 294 ALIGN_3 295 296.L06: 297 movsd 0 * SIZE(YY), %xmm0 298 addq INCY, YY 299 movhpd 0 * SIZE(YY), %xmm0 300 addq INCY, YY 301 movsd 0 * SIZE(YY), %xmm1 302 addq INCY, YY 303 movhpd 0 * SIZE(YY), %xmm1 304 addq INCY, YY 305 movsd 0 * SIZE(YY), %xmm2 306 addq INCY, YY 307 movhpd 0 * SIZE(YY), %xmm2 308 addq INCY, YY 309 movsd 0 * SIZE(YY), %xmm3 310 addq INCY, YY 311 movhpd 0 * SIZE(YY), %xmm3 312 addq INCY, YY 313 314 movapd %xmm0, 0 * SIZE(XX) 315 movapd %xmm1, 2 * SIZE(XX) 316 movapd %xmm2, 4 * SIZE(XX) 317 movapd %xmm3, 6 * SIZE(XX) 318 319 addq $8 * SIZE, XX 320 decq %rax 321 jg .L06 322 ALIGN_3 323 324.L07: 325 movq M, %rax 326 andq $7, %rax 327 jle .L10 328 ALIGN_3 329 330.L08: 331 movsd 0 * SIZE(YY), %xmm0 332 addq INCY, YY 333 334 movsd %xmm0, 0 * SIZE(XX) 335 336 addq $1 * SIZE, XX 337 decq %rax 338 jg .L08 339 ALIGN_3 340 341.L10: 342 movq IS, I 343 addq $4, I 344 cmpq M, I 345 jg .L20 346 ALIGN_3 347 348.L11: 349 movq A, A1 350 leaq (A, LDA, 2), A2 351 leaq (A, LDA, 4), A 352 353#ifdef HAVE_SSE3 354 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 355 movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 
356 movddup 2 * SIZE(NEW_X, IS, SIZE), atemp3 357 movddup 3 * SIZE(NEW_X, IS, SIZE), atemp4 358#else 359 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 360 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 361 movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 362 movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 363 movsd 2 * SIZE(NEW_X, IS, SIZE), atemp3 364 movhpd 2 * SIZE(NEW_X, IS, SIZE), atemp3 365 movsd 3 * SIZE(NEW_X, IS, SIZE), atemp4 366 movhpd 3 * SIZE(NEW_X, IS, SIZE), atemp4 367#endif 368 369 pxor xsum1, xsum1 370 pxor xsum2, xsum2 371 pxor xsum3, xsum3 372 pxor xsum4, xsum4 373 374 movapd 0 * SIZE(NEW_X), xtemp1 375 movapd 2 * SIZE(NEW_X), xtemp2 376 377 movsd 0 * SIZE(A1), a1 378 movhpd 1 * SIZE(A1), a1 379 movsd 2 * SIZE(A1), a2 380 movhpd 3 * SIZE(A1), a2 381 movsd 0 * SIZE(A1, LDA, 1), a3 382 movhpd 1 * SIZE(A1, LDA, 1), a3 383 384 movsd 0 * SIZE(NEW_Y), yy1 385 movhpd 1 * SIZE(NEW_Y), yy1 386 movsd 2 * SIZE(NEW_Y), yy2 387 movhpd 3 * SIZE(NEW_Y), yy2 388 389 movq NEW_X, XX 390 movq NEW_Y, YY 391 392 movq IS, I 393 sarq $3, I 394 jle .L15 395 ALIGN_3 396 397.L12: 398 movapd xtemp1, xt1 399 mulpd a1, xt1 400 mulpd atemp1, a1 401 addpd xt1, xsum1 402 addpd a1, yy1 403 movsd 2 * SIZE(A1, LDA, 1), a1 404 movhpd 3 * SIZE(A1, LDA, 1), a1 405 406 PREFETCH PREFETCHSIZE(A1) 407 408 movapd xtemp2, xt1 409 mulpd a2, xt1 410 mulpd atemp1, a2 411 addpd xt1, xsum1 412 addpd a2, yy2 413 movsd 0 * SIZE(A2), a2 414 movhpd 1 * SIZE(A2), a2 415 416 movapd xtemp1, xt1 417 mulpd a3, xt1 418 mulpd atemp2, a3 419 addpd xt1, xsum2 420 addpd a3, yy1 421 movsd 2 * SIZE(A2), a3 422 movhpd 3 * SIZE(A2), a3 423 424#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) 425 PREFETCH PREFETCHSIZE(XX) 426#endif 427 428 movapd xtemp2, xt1 429 mulpd a1, xt1 430 mulpd atemp2, a1 431 addpd xt1, xsum2 432 addpd a1, yy2 433 movsd 0 * SIZE(A2, LDA, 1), a1 434 movhpd 1 * SIZE(A2, LDA, 1), a1 435 436 movapd xtemp1, xt1 437 mulpd a2, xt1 438 mulpd atemp3, a2 439 addpd xt1, xsum3 440 addpd a2, yy1 441 movsd 2 * 
SIZE(A2, LDA, 1), a2 442 movhpd 3 * SIZE(A2, LDA, 1), a2 443 444 PREFETCH PREFETCHSIZE(A1, LDA, 1) 445 446 movapd xtemp2, xt1 447 mulpd a3, xt1 448 mulpd atemp3, a3 449 addpd xt1, xsum3 450 addpd a3, yy2 451 movsd 4 * SIZE(A1), a3 452 movhpd 5 * SIZE(A1), a3 453 454 movapd xtemp1, xt1 455 movapd 4 * SIZE(XX), xtemp1 456 mulpd a1, xt1 457 mulpd atemp4, a1 458 addpd xt1, xsum4 459 addpd a1, yy1 460 movsd 6 * SIZE(A1), a1 461 movhpd 7 * SIZE(A1), a1 462 463 movapd xtemp2, xt1 464 movapd 6 * SIZE(XX), xtemp2 465 mulpd a2, xt1 466 mulpd atemp4, a2 467 addpd xt1, xsum4 468 addpd a2, yy2 469 movsd 4 * SIZE(A1, LDA, 1), a2 470 movhpd 5 * SIZE(A1, LDA, 1), a2 471 472 movsd yy1, 0 * SIZE(YY) 473 movhpd yy1, 1 * SIZE(YY) 474 movsd 4 * SIZE(YY), yy1 475 movhpd 5 * SIZE(YY), yy1 476 477 movsd yy2, 2 * SIZE(YY) 478 movhpd yy2, 3 * SIZE(YY) 479 movsd 6 * SIZE(YY), yy2 480 movhpd 7 * SIZE(YY), yy2 481 482 movapd xtemp1, xt1 483 mulpd a3, xt1 484 mulpd atemp1, a3 485 addpd xt1, xsum1 486 addpd a3, yy1 487 movsd 6 * SIZE(A1, LDA, 1), a3 488 movhpd 7 * SIZE(A1, LDA, 1), a3 489 490 PREFETCH PREFETCHSIZE(A2) 491 492 movapd xtemp2, xt1 493 mulpd a1, xt1 494 mulpd atemp1, a1 495 addpd xt1, xsum1 496 addpd a1, yy2 497 movsd 4 * SIZE(A2), a1 498 movhpd 5 * SIZE(A2), a1 499 500 movapd xtemp1, xt1 501 mulpd a2, xt1 502 mulpd atemp2, a2 503 addpd xt1, xsum2 504 addpd a2, yy1 505 movsd 6 * SIZE(A2), a2 506 movhpd 7 * SIZE(A2), a2 507 508#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) 509 PREFETCHW PREFETCHSIZE(YY) 510#endif 511 512 movapd xtemp2, xt1 513 mulpd a3, xt1 514 mulpd atemp2, a3 515 addpd xt1, xsum2 516 addpd a3, yy2 517 movsd 4 * SIZE(A2, LDA, 1), a3 518 movhpd 5 * SIZE(A2, LDA, 1), a3 519 520 movapd xtemp1, xt1 521 mulpd a1, xt1 522 mulpd atemp3, a1 523 addpd xt1, xsum3 524 addpd a1, yy1 525 movsd 6 * SIZE(A2, LDA, 1), a1 526 movhpd 7 * SIZE(A2, LDA, 1), a1 527 528 PREFETCH PREFETCHSIZE(A2, LDA, 1) 529 530 movapd xtemp2, xt1 531 mulpd a2, xt1 532 mulpd atemp3, a2 
533 addpd xt1, xsum3 534 addpd a2, yy2 535 movsd 10 * SIZE(A1), a2 536 movhpd 11 * SIZE(A1), a2 537 538 movapd xtemp1, xt1 539 movapd 8 * SIZE(XX), xtemp1 540 mulpd a3, xt1 541 mulpd atemp4, a3 542 addpd xt1, xsum4 543 addpd a3, yy1 544 movsd 8 * SIZE(A1, LDA, 1), a3 545 movhpd 9 * SIZE(A1, LDA, 1), a3 546 547 movapd xtemp2, xt1 548 movapd 10 * SIZE(XX), xtemp2 549 mulpd a1, xt1 550 mulpd atemp4, a1 551 addpd xt1, xsum4 552 addpd a1, yy2 553 movsd 8 * SIZE(A1), a1 554 movhpd 9 * SIZE(A1), a1 555 556 movsd yy1, 4 * SIZE(YY) 557 movhpd yy1, 5 * SIZE(YY) 558 movsd 8 * SIZE(YY), yy1 559 movhpd 9 * SIZE(YY), yy1 560 561 movsd yy2, 6 * SIZE(YY) 562 movhpd yy2, 7 * SIZE(YY) 563 movsd 10 * SIZE(YY), yy2 564 movhpd 11 * SIZE(YY), yy2 565 566 addq $8 * SIZE, XX 567 addq $8 * SIZE, YY 568 addq $8 * SIZE, A1 569 addq $8 * SIZE, A2 570 571 decq I 572 jg .L12 573 ALIGN_3 574 575.L15: 576 testq $4, IS 577 jle .L18 578 579 movapd xtemp1, xt1 580 mulpd a1, xt1 581 mulpd atemp1, a1 582 addpd xt1, xsum1 583 addpd a1, yy1 584 movsd 2 * SIZE(A1, LDA, 1), a1 585 movhpd 3 * SIZE(A1, LDA, 1), a1 586 587 movapd xtemp2, xt1 588 mulpd a2, xt1 589 mulpd atemp1, a2 590 addpd xt1, xsum1 591 addpd a2, yy2 592 movsd 0 * SIZE(A2), a2 593 movhpd 1 * SIZE(A2), a2 594 595 movapd xtemp1, xt1 596 mulpd a3, xt1 597 mulpd atemp2, a3 598 addpd xt1, xsum2 599 addpd a3, yy1 600 movsd 2 * SIZE(A2), a3 601 movhpd 3 * SIZE(A2), a3 602 603 movapd xtemp2, xt1 604 mulpd a1, xt1 605 mulpd atemp2, a1 606 addpd xt1, xsum2 607 addpd a1, yy2 608 movsd 0 * SIZE(A2, LDA, 1), a1 609 movhpd 1 * SIZE(A2, LDA, 1), a1 610 611 movapd xtemp1, xt1 612 mulpd a2, xt1 613 mulpd atemp3, a2 614 addpd xt1, xsum3 615 addpd a2, yy1 616 movsd 2 * SIZE(A2, LDA, 1), a2 617 movhpd 3 * SIZE(A2, LDA, 1), a2 618 619 movapd xtemp2, xt1 620 mulpd a3, xt1 621 mulpd atemp3, a3 622 addpd xt1, xsum3 623 addpd a3, yy2 624 625 movapd xtemp1, xt1 626 movapd 4 * SIZE(XX), xtemp1 627 mulpd a1, xt1 628 mulpd atemp4, a1 629 addpd xt1, xsum4 630 addpd a1, 
yy1 631 632 movapd xtemp2, xt1 633 movapd 6 * SIZE(XX), xtemp2 634 mulpd a2, xt1 635 mulpd atemp4, a2 636 addpd xt1, xsum4 637 addpd a2, yy2 638 639 movsd yy1, 0 * SIZE(YY) 640 movhpd yy1, 1 * SIZE(YY) 641 movsd 4 * SIZE(YY), yy1 642 movhpd 5 * SIZE(YY), yy1 643 644 movsd yy2, 2 * SIZE(YY) 645 movhpd yy2, 3 * SIZE(YY) 646 movsd 6 * SIZE(YY), yy2 647 movhpd 7 * SIZE(YY), yy2 648 649 addq $4 * SIZE, XX 650 addq $4 * SIZE, YY 651 addq $4 * SIZE, A1 652 addq $4 * SIZE, A2 653 ALIGN_3 654 655.L18: 656 unpckhpd atemp2, atemp1 657 unpckhpd atemp4, atemp3 658 659 movsd 0 * SIZE(A1), a1 660 movhpd 0 * SIZE(A1, LDA, 1), a1 661 mulpd atemp1, a1 662 addpd a1, xsum1 663 664 movsd 0 * SIZE(A1, LDA, 1), a1 665 movhpd 1 * SIZE(A1, LDA, 1), a1 666 mulpd atemp1, a1 667 addpd a1, xsum2 668 669 movsd 0 * SIZE(A2), a1 670 movhpd 1 * SIZE(A2), a1 671 mulpd atemp1, a1 672 addpd a1, xsum3 673 674 movsd 0 * SIZE(A2, LDA, 1), a1 675 movhpd 1 * SIZE(A2, LDA, 1), a1 676 mulpd atemp1, a1 677 addpd a1, xsum4 678 679 movsd 0 * SIZE(A2), a1 680 movhpd 0 * SIZE(A2, LDA, 1), a1 681 mulpd atemp3, a1 682 addpd a1, xsum1 683 684 movsd 1 * SIZE(A2), a1 685 movhpd 1 * SIZE(A2, LDA, 1), a1 686 mulpd atemp3, a1 687 addpd a1, xsum2 688 689 movsd 2 * SIZE(A2), a1 690 movhpd 2 * SIZE(A2, LDA, 1), a1 691 mulpd atemp3, a1 692 addpd a1, xsum3 693 694 movsd 2 * SIZE(A2, LDA, 1), a1 695 movhpd 3 * SIZE(A2, LDA, 1), a1 696 mulpd atemp3, a1 697 addpd a1, xsum4 698 699#ifndef HAVE_SSE3 700 movapd xsum1, atemp1 701 movapd xsum3, atemp3 702 703 unpcklpd xsum2, xsum1 704 unpcklpd xsum4, xsum3 705 706 unpckhpd xsum2, atemp1 707 unpckhpd xsum4, atemp3 708 709 addpd atemp1, xsum1 710 addpd atemp3, xsum3 711#else 712 haddpd xsum2, xsum1 713 haddpd xsum4, xsum3 714#endif 715 716 addpd xsum1, yy1 717 addpd xsum3, yy2 718 719 movsd yy1, 0 * SIZE(YY) 720 movhpd yy1, 1 * SIZE(YY) 721 movsd yy2, 2 * SIZE(YY) 722 movhpd yy2, 3 * SIZE(YY) 723 724 addq $4, IS 725 726 movq IS, I 727 addq $4, I 728 cmpq M, I 729 jle .L11 730 ALIGN_3 
731 732.L20: 733 testq $2, M 734 je .L30 735 ALIGN_3 736 737.L21: 738 movq A, A1 739 leaq (A, LDA, 2), A 740 741#ifdef HAVE_SSE3 742 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 743 movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 744#else 745 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 746 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 747 movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 748 movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 749#endif 750 751 pxor xsum1, xsum1 752 pxor xsum2, xsum2 753 754 movapd 0 * SIZE(NEW_X), xtemp1 755 756 movsd 0 * SIZE(NEW_Y), yy1 757 movhpd 1 * SIZE(NEW_Y), yy1 758 759 movsd 0 * SIZE(A1), a1 760 movhpd 1 * SIZE(A1), a1 761 movsd 0 * SIZE(A1, LDA, 1), a2 762 movhpd 1 * SIZE(A1, LDA, 1), a2 763 764 movq NEW_X, XX 765 movq NEW_Y, YY 766 767 movq IS, I 768 sarq $1, I 769 jle .L28 770 ALIGN_3 771 772.L22: 773 movapd xtemp1, xt1 774 mulpd a1, xt1 775 mulpd atemp1, a1 776 addpd xt1, xsum1 777 addpd a1, yy1 778 movsd 2 * SIZE(A1), a1 779 movhpd 3 * SIZE(A1), a1 780 781 movapd xtemp1, xt1 782 movapd 2 * SIZE(XX), xtemp1 783 mulpd a2, xt1 784 mulpd atemp2, a2 785 addpd xt1, xsum2 786 addpd a2, yy1 787 movsd 2 * SIZE(A1, LDA, 1), a2 788 movhpd 3 * SIZE(A1, LDA, 1), a2 789 790 movsd yy1, 0 * SIZE(YY) 791 movhpd yy1, 1 * SIZE(YY) 792 movsd 2 * SIZE(YY), yy1 793 movhpd 3 * SIZE(YY), yy1 794 795 addq $2 * SIZE, XX 796 addq $2 * SIZE, YY 797 addq $2 * SIZE, A1 798 799 decq I 800 jg .L22 801 ALIGN_3 802 803.L28: 804 unpckhpd atemp2, atemp1 805 806 movsd 0 * SIZE(A1), a1 807 movhpd 0 * SIZE(A1, LDA, 1), a1 808 mulpd atemp1, a1 809 addpd a1, xsum1 810 811 movsd 0 * SIZE(A1, LDA, 1), a1 812 movhpd 1 * SIZE(A1, LDA, 1), a1 813 mulpd atemp1, a1 814 addpd a1, xsum2 815 816#ifndef HAVE_SSE3 817 movapd xsum1, atemp1 818 819 unpcklpd xsum2, xsum1 820 unpckhpd xsum2, atemp1 821 822 addpd atemp1, xsum1 823#else 824 haddpd xsum2, xsum1 825#endif 826 827 addpd xsum1, yy1 828 829 movsd yy1, 0 * SIZE(YY) 830 movhpd yy1, 1 * SIZE(YY) 831 832 addq $2, IS 833 ALIGN_3 834 835.L30: 836 testq $1, M 
837 je .L990 838 ALIGN_3 839 840.L31: 841 movq A, A1 842 843#ifdef HAVE_SSE3 844 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 845#else 846 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 847 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 848#endif 849 850 pxor xsum1, xsum1 851 852 movsd 0 * SIZE(NEW_X), xtemp1 853 movsd 0 * SIZE(NEW_Y), yy1 854 movsd 0 * SIZE(A1), a1 855 856 movq NEW_X, XX 857 movq NEW_Y, YY 858 859 movq IS, I 860 testq I, I 861 jle .L38 862 ALIGN_3 863 864.L32: 865 movapd xtemp1, xt1 866 mulpd a1, xt1 867 mulpd atemp1, a1 868 addpd xt1, xsum1 869 addpd a1, yy1 870 movsd 1 * SIZE(A1), a1 871 872 movsd 1 * SIZE(XX), xtemp1 873 874 movsd yy1, 0 * SIZE(YY) 875 movsd 1 * SIZE(YY), yy1 876 877 addq $1 * SIZE, XX 878 addq $1 * SIZE, YY 879 addq $1 * SIZE, A1 880 881 decq I 882 jg .L32 883 ALIGN_3 884 885.L38: 886 movsd 0 * SIZE(A1), a1 887 mulsd atemp1, a1 888 addsd a1, xsum1 889 890 addsd xsum1, yy1 891 892 movsd yy1, 0 * SIZE(YY) 893 ALIGN_3 894 895.L990: 896 cmpq $SIZE, INCY 897 je .L999 898 899 movq M, %rax 900 sarq $3, %rax 901 jle .L997 902 ALIGN_3 903 904.L996: 905 movapd 0 * SIZE(NEW_Y), %xmm0 906 movapd 2 * SIZE(NEW_Y), %xmm1 907 movapd 4 * SIZE(NEW_Y), %xmm2 908 movapd 6 * SIZE(NEW_Y), %xmm3 909 910 movsd %xmm0, 0 * SIZE(Y) 911 addq INCY, Y 912 movhpd %xmm0, 0 * SIZE(Y) 913 addq INCY, Y 914 movsd %xmm1, 0 * SIZE(Y) 915 addq INCY, Y 916 movhpd %xmm1, 0 * SIZE(Y) 917 addq INCY, Y 918 movsd %xmm2, 0 * SIZE(Y) 919 addq INCY, Y 920 movhpd %xmm2, 0 * SIZE(Y) 921 addq INCY, Y 922 movsd %xmm3, 0 * SIZE(Y) 923 addq INCY, Y 924 movhpd %xmm3, 0 * SIZE(Y) 925 addq INCY, Y 926 927 addq $8 * SIZE, NEW_Y 928 decq %rax 929 jg .L996 930 ALIGN_3 931 932.L997: 933 movq M, %rax 934 andq $7, %rax 935 jle .L999 936 ALIGN_3 937 938.L998: 939 movsd 0 * SIZE(NEW_Y), %xmm0 940 941 movsd %xmm0, 0 * SIZE(Y) 942 addq INCY, Y 943 944 addq $1 * SIZE, NEW_Y 945 946 decq %rax 947 jg .L998 948 ALIGN_3 949 950 951.L999: 952 movq 0(%rsp), %rbx 953 movq 8(%rsp), %rbp 954 movq 16(%rsp), %r12 955 
movq 24(%rsp), %r13 956 movq 32(%rsp), %r14 957 movq 40(%rsp), %r15 958 959#ifdef WINDOWS_ABI 960 movq 48(%rsp), %rdi 961 movq 56(%rsp), %rsi 962 movups 64(%rsp), %xmm6 963 movups 80(%rsp), %xmm7 964 movups 96(%rsp), %xmm8 965 movups 112(%rsp), %xmm9 966 movups 128(%rsp), %xmm10 967 movups 144(%rsp), %xmm11 968 movups 160(%rsp), %xmm12 969 movups 176(%rsp), %xmm13 970 movups 192(%rsp), %xmm14 971 movups 208(%rsp), %xmm15 972#endif 973 974 addq $STACKSIZE, %rsp 975 ret 976 EPILOGUE 977