/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of  source code must retain the above        */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
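/*                                                                   */
/* Double-precision AXPY kernel (SSE2): y(i) := alpha * x(i) + y(i). */
/*                                                                   */
/* Illustrative scalar equivalent of what the unrolled loops below   */
/* compute (for reading convenience only, not part of the build):    */
/*                                                                   */
/*     for (i = 0; i < n; i++)                                       */
/*         y[i * incy] += alpha * x[i * incx];                       */
/*                                                                   */
/* Unit-stride operands take the vectorized paths starting at .L10   */
/* (X and Y mutually aligned) or .L20 (relatively misaligned); any   */
/* other stride falls through to the strided/scalar code at .L40.    */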
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define YY	%r11
#define ALPHA	%xmm15

#include "l1param.h"

	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	8(%rsp), INCY
#else
	movq	24(%rsp), INCY
#endif
	movaps	%xmm0, ALPHA
#else
	movaps	%xmm3, ALPHA

	movq	40(%rsp), X
	movq	48(%rsp), INCX
	movq	56(%rsp), Y
	movq	64(%rsp), INCY
#endif

	SAVEREGISTERS

	unpcklpd ALPHA, ALPHA

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	testq	M, M
	jle	.L47

	cmpq	$SIZE, INCX
	jne	.L40
	cmpq	$SIZE, INCY
	jne	.L40

	testq	$SIZE, Y
	je	.L10

	movsd	(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_4

.L10:
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	testq	$SIZE, X
	jne	.L20

	movq	M, %rax
	sarq	$4, %rax
	jle	.L13

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
	movaps	-8 * SIZE(X), %xmm4
	movaps	-6 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-4 * SIZE(X), %xmm6
	movaps	-2 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	 0 * SIZE(X), %xmm0
	movaps	 2 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movaps	 4 * SIZE(X), %xmm2
	movaps	 6 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	movaps	-8 * SIZE(X), %xmm4
	movaps	-6 * SIZE(X), %xmm5

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-4 * SIZE(X), %xmm6
	movaps	-2 * SIZE(X), %xmm7

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3
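
/* Tail of the mutually aligned path: peel off the remaining 8, 4, 2,
   and 1 elements that the 16-element main loop above did not cover. */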
.L13:
	movq	M, %rax
	andq	$8, %rax
	jle	.L14
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L14:
	movq	M, %rax
	andq	$4, %rax
	jle	.L15
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L15:
	movq	M, %rax
	andq	$2, %rax
	jle	.L16
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L16:
	movq	M, %rax
	andq	$1, %rax
	jle	.L19
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0

	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3
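
/* X is odd-aligned relative to Y.  With ALIGNED_ACCESS, keep aligned
   16-byte loads from X and recombine neighboring vectors via SHUFPD;
   otherwise fall back to unaligned movsd/movhps pair loads. */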
.L20:
#ifdef ALIGNED_ACCESS

	movhps	-16 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$4, %rax
	jle	.L23

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
	movaps	-9 * SIZE(X), %xmm4
	movaps	-7 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-5 * SIZE(X), %xmm6
	movaps	-3 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	-1 * SIZE(X), %xmm0
	movaps	 1 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm5, %xmm4
	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movaps	 3 * SIZE(X), %xmm2
	movaps	 5 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm7, %xmm6
	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movaps	-9 * SIZE(X), %xmm4
	movaps	-7 * SIZE(X), %xmm5

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-5 * SIZE(X), %xmm6
	movaps	-3 * SIZE(X), %xmm7

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	SHUFPD_1 %xmm5, %xmm4
	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	SHUFPD_1 %xmm7, %xmm6
	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L23:
	movq	M, %rax
	andq	$8, %rax
	jle	.L24
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3
	movaps	-9 * SIZE(X), %xmm8

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm8, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	%xmm8, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L24:
	movq	M, %rax
	andq	$4, %rax
	jle	.L25
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2

	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L25:
	movq	M, %rax
	andq	$2, %rax
	jle	.L26
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0

	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L26:
	movq	M, %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0

	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

#else
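	/* No ALIGNED_ACCESS: load X unaligned with movsd/movhps pairs. */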
	movq	M, %rax
	sarq	$4, %rax
	jle	.L23

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1
	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	 -9 * SIZE(X), %xmm3

	decq	%rax
	jle	.L22
	ALIGN_3

.L21:
	movsd	-8 * SIZE(X), %xmm4
	movhps	-7 * SIZE(X), %xmm4
	movsd	-6 * SIZE(X), %xmm5
	movhps	-5 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	-4 * SIZE(X), %xmm6
	movhps	-3 * SIZE(X), %xmm6
	movsd	-2 * SIZE(X), %xmm7
	movhps	-1 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movsd	 0 * SIZE(X), %xmm0
	movhps	 1 * SIZE(X), %xmm0
	movsd	 2 * SIZE(X), %xmm1
	movhps	 3 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movsd	 4 * SIZE(X), %xmm2
	movhps	 5 * SIZE(X), %xmm2
	movsd	 6 * SIZE(X), %xmm3
	movhps	 7 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movsd	-8 * SIZE(X), %xmm4
	movhps	-7 * SIZE(X), %xmm4
	movsd	-6 * SIZE(X), %xmm5
	movhps	-5 * SIZE(X), %xmm5

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	-4 * SIZE(X), %xmm6
	movhps	-3 * SIZE(X), %xmm6
	movsd	-2 * SIZE(X), %xmm7
	movhps	-1 * SIZE(X), %xmm7

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L23:
	movq	M, %rax
	andq	$8, %rax
	jle	.L24
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1
	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	 -9 * SIZE(X), %xmm3

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L24:
	movq	M, %rax
	andq	$4, %rax
	jle	.L25
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L25:
	movq	M, %rax
	andq	$2, %rax
	jle	.L26
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L26:
	movq	M, %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0

	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3
#endif
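
/* General strided path; INCX and INCY were scaled to bytes above. */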
.L40:
	movq	Y, YY
	movq	M, %rax
/* If incx == 0 or incy == 0, skip the unrolled loop and process one
   element at a time at .L46. */
	cmpq	$0, INCX
	je	.L46
	cmpq	$0, INCY
	je	.L46

	sarq	$3, %rax
	jle	.L45
	ALIGN_3

.L41:
	movsd	0 * SIZE(X), %xmm0
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm0
	addq	INCX, X
	mulpd	ALPHA, %xmm0

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm0

	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulpd	ALPHA, %xmm1

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm1

	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	mulpd	ALPHA, %xmm2

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm2

	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	mulpd	ALPHA, %xmm3

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
	movq	M, %rax
	andq	$7, %rax
	jle	.L47
	ALIGN_3

.L46:
	movsd	(X), %xmm0
	addq	INCX, X
	mulsd	ALPHA, %xmm0
	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)
	addq	INCY, Y
	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret

	EPILOGUE