/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
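/*
 * ZSYMV/ZHEMV kernel for x86-64 (SSE2, with SSE3 movddup where the
 * target supports it).  It computes y += alpha * A * x for a
 * double-complex symmetric matrix; when HEMV is defined, the ADD
 * macro below becomes subpd, which supplies the conjugation for the
 * Hermitian case.  A minimal C reference sketch of the result (not
 * part of the build; zsymv_ref and its full-matrix layout are
 * illustrative assumptions -- the kernel itself reads only one
 * triangle of A plus packed copies of x and y):
 *
 *   #include <complex.h>
 *
 *   // illustrative reference: column-major walk over a full
 *   // symmetric (or Hermitian) matrix A
 *   static void zsymv_ref(int m, double complex alpha,
 *                         const double complex *a, int lda,
 *                         const double complex *x, double complex *y)
 *   {
 *       for (int j = 0; j < m; j++)
 *           for (int i = 0; i < m; i++)
 *               y[i] += alpha * a[i + j * lda] * x[j];
 *   }
 */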

#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define IS	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_LDA		48 + STACKSIZE(%rsp)
#define OLD_X		56 + STACKSIZE(%rsp)
#define OLD_INCX	64 + STACKSIZE(%rsp)
#define OLD_Y		72 + STACKSIZE(%rsp)
#define OLD_INCY	80 + STACKSIZE(%rsp)
#define OLD_BUFFER	88 + STACKSIZE(%rsp)

#define M	ARG1
#define IS	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA_R	%xmm0
#define ALPHA_I	%xmm1

#define xtemp1	%xmm0
#define xtemp2	%xmm1
#define xtemp3	%xmm2
#define xtemp4	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xsum1	%xmm8
#define xsum2	%xmm9
#define yy1	%xmm10
#define yy2	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define xt1	%xmm15

#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c; movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c; movhpd	a##b, c
#endif

#ifndef HEMV
#define ADD	addpd
#else
#define ADD	subpd
#endif
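/*
 * Prologue.  Callee-saved GPRs are spilled into the local stack
 * frame; under WINDOWS_ABI the xmm6-xmm15 registers must be
 * preserved as well, and A/LDA/X/INCX plus alpha (which arrives in
 * %xmm2/%xmm3 there) are moved into the registers the SysV path
 * already uses.
 */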

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,    A
	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	testq	M, M
	jle	.L999

	negq	IS
	addq	M, IS

	movq	IS, TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

	/* build %xmm2 = (sign bit, 0): a mask that negates one lane,
	   used for the conjugated/swapped copies below */
	pcmpeqb	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3
	psllq	$63, %xmm2
	unpcklpd %xmm3, %xmm2

	unpcklpd ALPHA_I, ALPHA_R
	unpcklpd ALPHA_R, ALPHA_I
	xorpd	%xmm2, ALPHA_I

	movq	BUFFER, XX

	/* pack alpha*x into BUFFER: each element is stored as alpha*x
	   followed by a swapped copy with one lane negated, so the
	   complex multiplies in the kernel reduce to mulpd/addpd pairs */
	movq	M, %rax
	sarq	$2, %rax
	jle	.L02
	ALIGN_3

.L01:
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3,  0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3,  2 * SIZE(XX)

	movapd	%xmm5,  4 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5,  6 * SIZE(XX)

	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3,  8 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 10 * SIZE(XX)

	movapd	%xmm5, 12 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5, 14 * SIZE(XX)

	subq	$-16 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M, %rax
	andq	$3, %rax
	jle	.L05
	ALIGN_3

.L03:
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4

	addpd	%xmm4, %xmm3

	movapd	%xmm3, 0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 2 * SIZE(XX)

	addq	$4 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	addq	$512, XX
	andq	$-512, XX

	cmpq	$2 * SIZE, INCY
	je	.L10

	/* y is strided: gather it into the buffer behind the packed x
	   so the kernel works on contiguous, aligned data */
	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	movhpd	1 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	movhpd	1 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	movhpd	1 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jg	.L20
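/*
 * Main panel loop (.L11/.L12): two columns of A per outer iteration
 * (A1 = current column, A2 = the next one).  yy1/yy2 carry the
 * running update of y from the stored triangle while xsum1/xsum2
 * accumulate the mirrored (transposed) contribution of the two
 * columns; ADD is subpd under HEMV, which conjugates that mirrored
 * part.  The inner loop handles four rows per iteration, with
 * PREFETCH/PREFETCHW hints on A1, A2, the packed x (XX) and y (YY).
 */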
	ALIGN_3

.L11:
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	leaq	(, IS, 4), I

	movapd	0 * SIZE(NEW_X, I, SIZE), atemp1
	movapd	2 * SIZE(NEW_X, I, SIZE), atemp2
	movapd	4 * SIZE(NEW_X, I, SIZE), atemp3
	movapd	6 * SIZE(NEW_X, I, SIZE), atemp4

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2

	movsd	0 * SIZE(NEW_Y), yy1
	movhpd	1 * SIZE(NEW_Y), yy1
	movsd	2 * SIZE(NEW_Y), yy2
	movhpd	3 * SIZE(NEW_Y), yy2

	movapd	0 * SIZE(NEW_X), xtemp1
	movapd	2 * SIZE(NEW_X), xtemp2
	movapd	4 * SIZE(NEW_X), xtemp3
	movapd	6 * SIZE(NEW_X), xtemp4

	MOVDDUP(0 * SIZE, A1, a1)
	MOVDDUP(2 * SIZE, A2, a2)
	MOVDDUP(1 * SIZE, A1, a3)

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$2, I
	jle	.L15
	ALIGN_3

.L12:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(3 * SIZE, A2, a1)

	PREFETCH PREFETCHSIZE(A1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(2 * SIZE, A1, a2)

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	ADD	xt1, xsum1
	addpd	a3, yy1
	MOVDDUP(0 * SIZE, A2, a3)

	movapd	xtemp4, xt1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	ADD	xt1, xsum2
	addpd	a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	PREFETCH PREFETCHSIZE(XX)

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a2, xt1
	mulpd	atemp1, a2
	addpd	xt1, xsum1
	addpd	a2, yy2
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a3, xt1
	mulpd	atemp3, a3
	addpd	xt1, xsum2
	addpd	a3, yy1
	MOVDDUP(4 * SIZE, A1, a3)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	ADD	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(6 * SIZE, A2, a1)

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	ADD	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(5 * SIZE, A1, a2)

	PREFETCH PREFETCHSIZE(A2)

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp1, a3
	addpd	xt1, xsum1
	addpd	a3, yy1
	MOVDDUP(7 * SIZE, A2, a3)

	movapd	xtemp3, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum2
	addpd	a1, yy2
	MOVDDUP(6 * SIZE, A1, a1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	ADD	xt1, xsum1
	addpd	a2, yy1
	MOVDDUP(4 * SIZE, A2, a2)

	PREFETCHW PREFETCHSIZE(YY)

	movapd	xtemp4, xt1
	mulpd	a3, xt1
	mulpd	atemp4, a3
	ADD	xt1, xsum2
	addpd	a3, yy2
	MOVDDUP(7 * SIZE, A1, a3)

	movapd	xtemp3, xt1
	movapd	20 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(5 * SIZE, A2, a1)

	movapd	xtemp1, xt1
	movapd	16 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(10 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	22 * SIZE(XX), xtemp4
	mulpd	a3, xt1
	mulpd	atemp2, a3
	ADD	xt1, xsum1
	addpd	a3, yy2
	MOVDDUP( 9 * SIZE, A1, a3)

	movlpd	yy2,  6 * SIZE(YY)
	movhpd	yy2,  7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	18 * SIZE(XX), xtemp2
	mulpd	a1, xt1
	mulpd	atemp4, a1
	ADD	xt1, xsum2
	addpd	a1, yy1
	MOVDDUP( 8 * SIZE, A1, a1)

	movlpd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	subq	$-16 * SIZE, XX
	addq	$ 8 * SIZE, YY
	addq	$ 8 * SIZE, A1
	addq	$ 8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3
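/*
 * Panel tail: .L15 processes the two rows left over when IS is not a
 * multiple of 4, and .L18 folds in the 2x2 diagonal block.  In the
 * HEMV build the imaginary parts of the diagonal entries are omitted
 * (a Hermitian diagonal is real) and the mirrored off-diagonal term
 * enters via subpd.
 */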

.L15:
	testq	$2, IS
	jle	.L18

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	ADD	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	ADD	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	ADD	xt1, xsum1
	addpd	a1, yy2

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	ADD	xt1, xsum2
	addpd	a2, yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L18:
	MOVDDUP(0 * SIZE, A1, a1)
	MOVDDUP(0 * SIZE, A2, a2)

	mulpd	atemp1, a1
	mulpd	atemp1, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

#ifndef HEMV
	MOVDDUP(1 * SIZE, A1, a1)
	MOVDDUP(1 * SIZE, A2, a2)

	mulpd	atemp2, a1
	mulpd	atemp2, a2
	addpd	a1, xsum1
	addpd	a2, xsum2
#else
	MOVDDUP(1 * SIZE, A2, a2)

	mulpd	atemp2, a2
	subpd	a2, xsum2
#endif

	MOVDDUP(0 * SIZE, A2, a1)
	MOVDDUP(2 * SIZE, A2, a2)

	mulpd	atemp3, a1
	mulpd	atemp3, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

#ifndef HEMV
	MOVDDUP(1 * SIZE, A2, a1)
	MOVDDUP(3 * SIZE, A2, a2)

	mulpd	atemp4, a1
	mulpd	atemp4, a2
	addpd	a1, xsum1
	addpd	a2, xsum2
#else
	MOVDDUP(1 * SIZE, A2, a1)

	mulpd	atemp4, a1
	addpd	a1, xsum1
#endif

	addpd	xsum1, yy1
	addpd	xsum2, yy2

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)

	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jle	.L11
	ALIGN_3

.L20:
	/* final row/column, taken only when M is odd */
	testq	$1, M
	jle	.L990

	movq	A, A1
	leaq	(, IS, 4), I

	movapd	0 * SIZE(NEW_X, I, SIZE), atemp1
	movapd	2 * SIZE(NEW_X, I, SIZE), atemp2

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2

	MOVDDUP(0 * SIZE, A1, a1)
	MOVDDUP(1 * SIZE, A1, a2)

	movapd	0 * SIZE(NEW_X), xtemp1
	movapd	2 * SIZE(NEW_X), xtemp2
	movapd	4 * SIZE(NEW_X), xtemp3
	movapd	6 * SIZE(NEW_X), xtemp4

	movsd	0 * SIZE(NEW_Y), yy1
	movhpd	1 * SIZE(NEW_Y), yy1
	movsd	2 * SIZE(NEW_Y), yy2
	movhpd	3 * SIZE(NEW_Y), yy2

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$1, I
	jle	.L28
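/*
 * Final single column: the same multiply/accumulate pattern as .L12,
 * but one column and two rows per iteration; .L28 then adds the last
 * diagonal element (real part only under HEMV).
 */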
	ALIGN_3

.L22:
	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp2, a2
	ADD	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(3 * SIZE, A1, a2)

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a2, xt1
	mulpd	atemp2, a2
	ADD	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(5 * SIZE, A1, a2)

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1

	decq	I
	jg	.L22
	ALIGN_3

.L28:
	MOVDDUP(0 * SIZE, A1, a1)

#ifndef HEMV
	MOVDDUP(1 * SIZE, A1, a2)

	mulpd	atemp1, a1
	mulpd	atemp2, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

#else
	mulpd	atemp1, a1
	addpd	a1, xsum1
#endif

	addpd	xsum2, xsum1
	addpd	xsum1, yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	ALIGN_3

.L990:
	/* if y was gathered into the buffer, scatter the result back
	   to the strided y */
	cmpq	$2 * SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	movhpd	%xmm1, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	movhpd	%xmm2, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	movhpd	%xmm3, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movapd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$2 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE