1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
                                                                     */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Per-microarchitecture prefetch tuning.  PREFETCH/PREFETCHW pick the
   instruction flavor (AMD cores use 3DNow!-style prefetch/prefetchw),
   PREFETCHSIZE is the lookahead distance in floats.                   */

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
/* NOTE: on Opteron every movsd below is really movlps; unlike movsd,
   movlps does NOT zero the upper 64 bits of the destination (see the
   OPTERON pxor guard in the .L20 path).                               */
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifndef WINDOWS_ABI
/* System V AMD64: M, N, A, LDA, X, INCX arrive in registers (ARG1..ARG6);
   Y, INCY and the work BUFFER are on the stack above our save area.   */

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else
/* Microsoft x64: only four register args; LDA, X, INCX, Y, INCY and
   BUFFER come from the stack (offsets include the shadow space).      */

#define STACKSIZE	256

#define OLD_LDA		40 + STACKSIZE(%rsp)
#define OLD_X		48 + STACKSIZE(%rsp)
#define OLD_INCX	56 + STACKSIZE(%rsp)
#define OLD_Y		64 + STACKSIZE(%rsp)
#define OLD_INCY	72 + STACKSIZE(%rsp)
#define OLD_BUFFER	80 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi
#endif

/* Register roles.  NOTE the aliases: ALPHA shares %xmm0 with atemp1,
   and NEW_Y reuses the X register once the original X has been copied
   into BUFFER (see .L05).                                             */

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define atemp1	%xmm0
#define atemp2	%xmm1
#define atemp3	%xmm2
#define atemp4	%xmm3

#define xsum1	%xmm4
#define xsum2	%xmm5
#define xsum3	%xmm6
#define xsum4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define yy1	%xmm10
#define xt1	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define a4	%xmm15

/*---------------------------------------------------------------------
 * Single-precision symmetric matrix-vector multiply (SSE):
 *   y := alpha * A * x + y
 * x is pre-scaled by alpha into BUFFER; each 4-column sweep accumulates
 * both the "row" contributions (xsum1..4, via symmetry) and the
 * "column" contributions (yy updates).  The sweep shape (A advanced by
 * 4*LDA + 4, inner loop over rows IS+4 .. M) reads the stored triangle
 * below the diagonal block — presumably the lower-triangle ('L')
 * variant; confirm against the build wiring.
 *-------------------------------------------------------------------*/

	PROLOGUE
	PROFCODE

	/* Save callee-saved GPRs in our stack frame. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* rdi/rsi and xmm6-15 are callee-saved on Windows. */
	movq	%rdi,  48(%rsp)
	movq	%rsi,  56(%rsp)
	movups	%xmm6,  64(%rsp)
	movups	%xmm7,  80(%rsp)
	movups	%xmm8,  96(%rsp)
	movups	%xmm9, 112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0		/* alpha arrives in xmm2 on Win64 */
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert element strides to byte strides. */
	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999			/* nothing to do for M <= 0 */

	shufps	$0, ALPHA, ALPHA	/* broadcast alpha to all 4 lanes */

	movq	BUFFER, XX

	/* Copy x into BUFFER, scaling by alpha: 8 elements per pass. */
	movq	M, %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Remaining M % 8 elements, one at a time. */
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulss	ALPHA, %xmm1

	movss	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y		/* NEW_Y aliases the X register */

	/* Round XX up to the next 512-byte boundary: this aligned area
	   holds a dense copy of y when INCY != 1.                     */
	addq	$512, XX
	andq	$-512, XX

	cmpq	$SIZE, INCY
	je	.L10			/* y already contiguous: use it in place */

	/* Gather strided y into the aligned buffer, 8 per pass. */
	movq	Y, YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

/*---------------------------------------------------------------------
 * Main sweep: process 4 columns (IS .. IS+3) per iteration of .L11.
 *-------------------------------------------------------------------*/
.L10:
	xorq	IS, IS			# is = 0

	cmpq	$4, N
	jl	.L20			/* fewer than 4 columns: cleanups */
	ALIGN_3

.L11:
	movq	A, A1			/* A1 -> column IS,   A1+LDA -> IS+1 */
	leaq	(A, LDA, 2), A2		/* A2 -> column IS+2, A2+LDA -> IS+3 */
	leaq	4 * SIZE(A, LDA, 4), A	/* advance A to the next 4x4 block  */

	leaq	(NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY

	movaps	0 * SIZE(XX), atemp4	/* x[IS..IS+3] (alpha-scaled) */

	/* Diagonal 4x4 block: build row K of the block in xsumK from the
	   stored triangle (off-row elements taken by symmetry) and
	   multiply by x4.  xsumK later reduces to the update of y[IS+K]. */
	movsd	0 * SIZE(A1), xsum1
	movhps	2 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A1, LDA, 1), a3
	movss	3 * SIZE(A1, LDA, 1), a4
	unpcklps a3, xsum2
	unpcklps a4, a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	movss	2 * SIZE(A1), xsum3
	movss	2 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A2), a3
	movss	3 * SIZE(A2), a4
	unpcklps a3, xsum3
	unpcklps a4, a2
	unpcklps a2, xsum3
	mulps	atemp4, xsum3

	movss	3 * SIZE(A1), xsum4
	movss	3 * SIZE(A1, LDA, 1), a2
	movss	3 * SIZE(A2), a3
	movss	3 * SIZE(A2, LDA, 1), a4
	unpcklps a3, xsum4
	unpcklps a4, a2
	unpcklps a2, xsum4
	mulps	atemp4, xsum4

	/* Broadcast each of the 4 x values (atemp1 overwrites ALPHA,
	   which is no longer needed).                                 */
	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	/* Software-pipelined preload of x, y, and the 4 column vectors. */
	movaps	 4 * SIZE(XX), xtemp1
	movaps	 8 * SIZE(XX), xtemp2

	movsd	 0 * SIZE(YY), yy1
	movhps	 2 * SIZE(YY), yy1

	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	/* Inner loop over the remaining rows: I = (M - IS - 4) / 16. */
	movq	M, I
	subq	IS, I
	subq	$4, I
	sarq	$4, I
	jle	.L14
	ALIGN_3

/* 16-rows-per-iteration unrolled loop.  Each 4-row group performs, per
 * column k:  xsumk += a_k * x  (dot part)  and  yy += x[IS+k] * a_k
 * (axpy part), with next-iteration loads interleaved for latency
 * hiding.  xt1 is the single scratch register.                        */
.L12:
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A1)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH PREFETCHSIZE(XX)
#endif

	movaps	xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  0 * SIZE(YY)
	movhps	yy1,  2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	/* Rows +4..+7 (xtemp2). */
	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A1, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  4 * SIZE(YY)
	movhps	yy1,  6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	/* Rows +8..+11 (xtemp1). */
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A2)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW PREFETCHSIZE(YY)
#endif

	movaps	xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  8 * SIZE(YY)
	movhps	yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	/* Rows +12..+15 (xtemp2). */
	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH PREFETCHSIZE(A2, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 12 * SIZE(YY)
	movhps	yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	$16 * SIZE, XX
	addq	$16 * SIZE, YY
	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

/* Row-count tails: 8, then 4, then 2, then 1 remaining rows. */
.L14:
	movq	M, I
	subq	IS, I
	subq	$4, I
	test	$8, I
	jle	.L15

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 4 * SIZE(A1), a1
	movhps	 6 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2
	movhps	 6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 4 * SIZE(A2), a3
	movhps	 6 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movaps	 8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4
	movhps	 6 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  0 * SIZE(YY)
	movhps	yy1,  2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1
	movhps	 6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  4 * SIZE(YY)
	movhps	yy1,  6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	ALIGN_3

.L15:
	test	$4, I
	jle	.L17

	/* 4 rows: only the low half needs reloading afterwards. */
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 4 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 4 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 4 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movsd	 4 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 4 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  0 * SIZE(YY)
	movhps	yy1,  2 * SIZE(YY)
	movsd	 4 * SIZE(YY), yy1

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L17:
	testq	$2, M
	jle	.L18

	/* 2 rows: zero the high halves so the upper lanes contribute
	   nothing to the packed adds.                                 */
	pxor	xtemp2, xtemp2

	movlhps	xtemp2, a1
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movss	 2 * SIZE(A1), a1

	movlhps	xtemp2, a2
	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movss	 2 * SIZE(A1, LDA, 1), a2

	movlhps	xtemp2, a3
	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movss	 2 * SIZE(A2), a3

	movlhps	xtemp2, a4
	movaps	xtemp1, xt1
	movss	 2 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movss	 2 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  0 * SIZE(YY)
	movss	 2 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	ALIGN_3

.L18:
	testq	$1, M
	jle	.L19

	/* Final scalar row. */
	movss	 0 * SIZE(XX), xtemp1

	movss	 0 * SIZE(YY), yy1

	movss	 0 * SIZE(A1), a1
	movss	 0 * SIZE(A1, LDA, 1), a2
	movss	 0 * SIZE(A2), a3
	movss	 0 * SIZE(A2, LDA, 1), a4

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	addss	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movaps	xtemp1, xt1
	mulss	a3, xt1
	mulss	atemp3, a3
	addss	xt1, xsum3
	addss	a3, yy1

	movaps	xtemp1, xt1
	mulss	a4, xt1
	mulss	atemp4, a4
	addss	xt1, xsum4
	addss	a4, yy1

	movss	yy1, 0 * SIZE(YY)
	ALIGN_3

.L19:
	/* Horizontal reduction: collapse xsum1..4 so lane k of xsum1
	   holds the total for y[IS+k], then add into y.               */
#ifndef HAVE_SSE3
	movaps	xsum1, xtemp1
	unpcklps xsum3, xsum1
	unpckhps xsum3, xtemp1

	movaps	xsum2, xtemp2
	unpcklps xsum4, xsum2
	unpckhps xsum4, xtemp2

	movaps	xsum1, xsum3
	unpcklps xsum2, xsum1
	unpckhps xsum2, xsum3

	movaps	xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	xsum3, xsum1
	addps	xtemp1, xsum4
	addps	xsum4, xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum4, xsum3

	haddps	xsum3, xsum1
#endif

	movsd	 0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhps	 2 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movsd	yy1,  0 * SIZE(NEW_Y, IS, SIZE)
	movhps	yy1,  2 * SIZE(NEW_Y, IS, SIZE)

	addq	$4, IS

	/* Continue while another full 4-column block fits. */
	movq	IS, I
	addq	$4, I
	cmpq	N, I
	jle	.L11
	ALIGN_3

/* Column cleanup: 2 remaining columns. */
.L20:
	testq	$2, N
	jle	.L30

	movq	A, A1
	leaq	2 * SIZE(A, LDA, 2), A

	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp4

#if defined(OPTERON)
	/* movsd is movlps on Opteron and does not clear the upper
	   lanes, so zero xsum1 explicitly first.                      */
	pxor	xsum1, xsum1
#endif
	movsd	0 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	testq	$1, M
	jle	.L29

	/* One row below the 2x2 diagonal block. */
	movss	2 * SIZE(A1), a1
	movss	2 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movss	2 * SIZE(NEW_Y, IS, SIZE), yy1

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	/* NOTE(review): packed add where the others are addss; a1's
	   upper lanes are zero here (movss load), so the low-lane
	   result is the same — confirm intent.                        */
	addps	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movss	yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
	/* Reduce the two column sums and add into y[IS], y[IS+1]. */
#ifndef HAVE_SSE3
	unpcklps xsum2, xsum1
	movhlps	xsum1, xsum2
	addps	xsum2, xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum1, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(NEW_Y, IS, SIZE)

	addq	$2, IS
	ALIGN_3

/* Column cleanup: final single column (1x1 diagonal element). */
.L30:
	testq	$1, N
	jle	.L990

	movss	0 * SIZE(NEW_X, IS, SIZE), xsum1
	mulss	0 * SIZE(A), xsum1
	addss	0 * SIZE(NEW_Y, IS, SIZE), xsum1
	movss	xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

/* Scatter the dense result back to strided y when INCY != 1. */
.L990:
	cmpq	$SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	0 * SIZE(NEW_Y), %xmm0
	movss	1 * SIZE(NEW_Y), %xmm1
	movss	2 * SIZE(NEW_Y), %xmm2
	movss	3 * SIZE(NEW_Y), %xmm3
	movss	4 * SIZE(NEW_Y), %xmm4
	movss	5 * SIZE(NEW_Y), %xmm5
	movss	6 * SIZE(NEW_Y), %xmm6
	movss	7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

/* Epilogue: restore callee-saved registers and return. */
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE