/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Single-precision AXPY kernel (x86-64, SSE, AT&T syntax):
 *
 *     y[i] = alpha * x[i] + y[i],   i = 0 .. M-1
 *
 * as shown by the mulps ALPHA / addps (Y) / movaps ->(Y) sequences below.
 *
 * Register roles (set up via the #defines below):
 *   M      - element count           ALPHA (%xmm15) - alpha in all 4 lanes
 *   X,INCX - source vector + stride  Y,INCY         - dest vector + stride
 *   YY     (%r11) - read-ahead copy of Y in the strided path (.L50)
 *   %rax   - loop counter; zeroed before every ret (return value 0)
 *
 * NOTE(review): SIZE, ARG1..ARG6, PROLOGUE/EPILOGUE, SAVEREGISTERS,
 * ALIGN_*, SHUFPD_1, SHUFPS_39, PREFETCH*, PREOFFSET all come from
 * common.h / l1param.h, which are not visible here.  SIZE is presumably
 * the element size in bytes (4 for single precision) -- confirm against
 * common.h.  SHUFPD_1 / SHUFPS_39 presumably expand to
 * shufpd $1 / shufps $0x39 (lane-rotation shuffles) -- confirm.
 *
 * Structure:
 *   - INCX==1 && INCY==1: vectorized paths.  Y is first aligned to a
 *     16-byte boundary (1- and 2-element peeling), then the relative
 *     misalignment of X (0/1/2/3 floats, tested via X & (3*SIZE))
 *     selects .L10 / .L30 / .L20+.L40 variants.  Main loops process
 *     32 elements per iteration (sarq $5), tails handle 16/8/4/2/1.
 *   - otherwise: scalar strided path at .L50.
 */

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define YY	%r11
#define ALPHA	%xmm15

#include "l1param.h"

	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	8(%rsp), INCY		/* 7th argument is on the stack */
#else
	movq	24(%rsp), INCY
#endif
	movaps	%xmm0, ALPHA		/* SysV: alpha arrives in %xmm0 */
#else
	movaps	%xmm3, ALPHA		/* Win64: alpha is the 4th arg slot */

	movq	40(%rsp), X		/* remaining args from the stack */
	movq	48(%rsp), INCX
	movq	56(%rsp), Y
	movq	64(%rsp), INCY
#endif

	SAVEREGISTERS

	shufps	$0, ALPHA, ALPHA	/* broadcast alpha into all 4 lanes */

	leaq	(, INCX, SIZE), INCX	/* strides: elements -> bytes */
	leaq	(, INCY, SIZE), INCY

	testq	M, M
	jle	.L19			/* M <= 0: nothing to do */

	cmpq	$SIZE, INCX		/* either stride non-unit -> scalar path */
	jne	.L50
	cmpq	$SIZE, INCY
	jne	.L50

	subq	$-32 * SIZE, X		/* bias pointers so the hot loops use */
	subq	$-32 * SIZE, Y		/* small displacements -32..+15 * SIZE */

	cmpq	$3, M
	jle	.L16			/* tiny M: straight to the short tails */

	testq	$SIZE, Y		/* peel 1 element if Y is odd-float aligned */
	je	.L00

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_3

.L00:
	testq	$SIZE * 2, Y		/* peel 2 elements if Y is 8-mod-16 aligned */
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4
	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_3

/* Y is now 16-byte aligned.  Dispatch on X's residual misalignment. */
.L10:
	testq	$SIZE * 3, X
	jne	.L20			/* X misaligned relative to Y */

	movq	M, %rax
	sarq	$5, %rax		/* 32 elements per main-loop iteration */
	jle	.L13

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	decq	%rax
	jle	.L12			/* exactly one iteration: skip to drain */
	ALIGN_4

/* Main loop, both operands 16-byte aligned: 8 x (load/mul/add/store) of
   4 floats each, with X-loads issued ahead of the stores they feed. */
.L11:
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-8 * SIZE(X), %xmm6
	movaps	-4 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	0 * SIZE(X), %xmm0	/* preload next iteration's first half */
	movaps	4 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	8 * SIZE(X), %xmm2
	movaps	12 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X		/* advance 32 elements (add 32*SIZE) */
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L11
	ALIGN_3

/* Drain: last unrolled iteration, no preloading / prefetching. */
.L12:
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-8 * SIZE(X), %xmm6
	movaps	-4 * SIZE(X), %xmm7

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

/* Tails for the aligned path: 16, 8, 4, 2, then 1 remaining element. */
.L13:
	movq	M, %rax
	andq	$16, %rax
	jle	.L14
	ALIGN_3

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	movq	M, %rax
	andq	$8, %rax
	jle	.L15
	ALIGN_3

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	movq	M, %rax
	andq	$4, %rax
	jle	.L16
	ALIGN_3

	movaps	-32 * SIZE(X), %xmm0

	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:	/* also the direct entry for M <= 3 */
	movq	M, %rax
	andq	$2, %rax
	jle	.L17
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	movq	M, %rax
	andq	$1, %rax
	jle	.L19
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax,%rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_3

/* X misaligned relative to (aligned) Y. */
.L20:

#ifdef ALIGNED_ACCESS
	/* Use only aligned X loads and realign in registers with shuffles. */

	testq	$SIZE, X
	jne	.L30			/* odd-float offset: .L30 / .L40 */

	/* X offset by 2 floats: pair-shift with SHUFPD_1.  %xmm0's high
	   half carries the 2 leading X elements across iterations. */
	movhps	-32 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm1, %xmm0		/* combine halves of adjacent loads */
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-6 * SIZE(X), %xmm7
	movaps	-2 * SIZE(X), %xmm0

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	2 * SIZE(X), %xmm1
	movaps	6 * SIZE(X), %xmm2

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	10 * SIZE(X), %xmm3
	movaps	14 * SIZE(X), %xmm4

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:	/* drain for the SHUFPD path */
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6

	SHUFPD_1 %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-6 * SIZE(X), %xmm7
	movaps	-2 * SIZE(X), %xmm0

	SHUFPD_1 %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	SHUFPD_1 %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	SHUFPD_1 %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:	/* tails: 16 / 8 / 4 elements keep rotating %xmm0's carry */
	movq	M, %rax
	andq	$16, %rax
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1
	SHUFPD_1 %xmm3, %xmm2
	SHUFPD_1 %xmm4, %xmm3

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0		/* carry leading elements forward */

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	movq	M, %rax
	andq	$8, %rax
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	SHUFPD_1 %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	SHUFPD_1 %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	movq	M, %rax
	andq	$4, %rax
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1

	SHUFPD_1 %xmm1, %xmm0
	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:	/* last 2 / 1 elements: plain unaligned scalar/pair ops */
	movq	M, %rax
	andq	$2, %rax
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	movq	M, %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

/* X offset by an odd number of floats; distinguish 1 vs 3. */
.L30:
	testq	$2 * SIZE, X
	jne	.L40

	/* Offset 1 float: merge with movss, rotate with SHUFPS_39. */
	movaps	-33 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	decq	%rax
	jle	.L32
	ALIGN_4

.L31:
	movaps	-13 * SIZE(X), %xmm5
	movaps	-9 * SIZE(X), %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0		/* splice next block's low float ... */
	SHUFPS_39 %xmm0, %xmm0		/* ... then rotate lanes into place */
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-5 * SIZE(X), %xmm7
	movaps	-1 * SIZE(X), %xmm0

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	3 * SIZE(X), %xmm1
	movaps	7 * SIZE(X), %xmm2

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	SHUFPS_39 %xmm4, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	SHUFPS_39 %xmm5, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	11 * SIZE(X), %xmm3
	movaps	15 * SIZE(X), %xmm4

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	SHUFPS_39 %xmm6, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	SHUFPS_39 %xmm7, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:	/* drain for the offset-1 path */
	movaps	-13 * SIZE(X), %xmm5
	movaps	-9 * SIZE(X), %xmm6

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-5 * SIZE(X), %xmm7
	movaps	-1 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	SHUFPS_39 %xmm4, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	SHUFPS_39 %xmm5, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	SHUFPS_39 %xmm6, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	SHUFPS_39 %xmm7, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	movq	M, %rax
	andq	$16, %rax
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movss	%xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2

	movss	%xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	movq	M, %rax
	andq	$8, %rax
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0

	movss	%xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	movq	M, %rax
	andq	$4, %rax
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	movq	M, %rax
	andq	$2, %rax
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	movq	M, %rax
	andq	$1, %rax
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

/* X offset by 3 floats: merge with movss, rotate with shufps $0x93. */
.L40:
	movaps	-35 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	decq	%rax
	jle	.L42
	ALIGN_4

.L41:
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-7 * SIZE(X), %xmm7
	movaps	-3 * SIZE(X), %xmm0

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	1 * SIZE(X), %xmm1
	movaps	5 * SIZE(X), %xmm2

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movaps	9 * SIZE(X), %xmm3
	movaps	13 * SIZE(X), %xmm4

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:	/* drain for the offset-3 path */
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-7 * SIZE(X), %xmm7
	movaps	-3 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	movq	M, %rax
	andq	$16, %rax
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	movq	M, %rax
	andq	$8, %rax
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	movq	M, %rax
	andq	$4, %rax
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	movq	M, %rax
	andq	$2, %rax
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	movq	M, %rax
	andq	$1, %rax
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret

#else
	/* !ALIGNED_ACCESS: same schedule as .L11 but X is loaded with
	   unaligned movsd/movhps pairs.  Labels .L21-.L29 are reused here;
	   the two definitions are mutually exclusive under the #ifdef. */

	movq	M, %rax
	sarq	$5, %rax
	jle	.L23

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0
	movsd	-28 * SIZE(X), %xmm1
	movhps	-26 * SIZE(X), %xmm1
	movsd	-24 * SIZE(X), %xmm2
	movhps	-22 * SIZE(X), %xmm2
	movsd	-20 * SIZE(X), %xmm3
	movhps	-18 * SIZE(X), %xmm3

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
	movsd	-16 * SIZE(X), %xmm4
	movhps	-14 * SIZE(X), %xmm4
	movsd	-12 * SIZE(X), %xmm5
	movhps	-10 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movsd	-8 * SIZE(X), %xmm6
	movhps	-6 * SIZE(X), %xmm6
	movsd	-4 * SIZE(X), %xmm7
	movhps	-2 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movsd	0 * SIZE(X), %xmm0
	movhps	2 * SIZE(X), %xmm0
	movsd	4 * SIZE(X), %xmm1
	movhps	6 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movsd	8 * SIZE(X), %xmm2
	movhps	10 * SIZE(X), %xmm2
	movsd	12 * SIZE(X), %xmm3
	movhps	14 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:	/* drain for the unaligned-load path */
	movsd	-16 * SIZE(X), %xmm4
	movhps	-14 * SIZE(X), %xmm4
	movsd	-12 * SIZE(X), %xmm5
	movhps	-10 * SIZE(X), %xmm5

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movsd	-8 * SIZE(X), %xmm6
	movhps	-6 * SIZE(X), %xmm6
	movsd	-4 * SIZE(X), %xmm7
	movhps	-2 * SIZE(X), %xmm7

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	mulps	ALPHA, %xmm4
	addps	-16 * SIZE(Y), %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	mulps	ALPHA, %xmm5
	addps	-12 * SIZE(Y), %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	mulps	ALPHA, %xmm6
	addps	-8 * SIZE(Y), %xmm6
	movaps	%xmm6, -8 * SIZE(Y)

	mulps	ALPHA, %xmm7
	addps	-4 * SIZE(Y), %xmm7
	movaps	%xmm7, -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	movq	M, %rax
	andq	$16, %rax
	jle	.L24
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0
	movsd	-28 * SIZE(X), %xmm1
	movhps	-26 * SIZE(X), %xmm1

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movsd	-24 * SIZE(X), %xmm2
	movhps	-22 * SIZE(X), %xmm2
	movsd	-20 * SIZE(X), %xmm3
	movhps	-18 * SIZE(X), %xmm3

	mulps	ALPHA, %xmm2
	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	mulps	ALPHA, %xmm3
	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	movq	M, %rax
	andq	$8, %rax
	jle	.L25
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0
	movsd	-28 * SIZE(X), %xmm1
	movhps	-26 * SIZE(X), %xmm1

	mulps	ALPHA, %xmm0
	addps	-32 * SIZE(Y), %xmm0
	mulps	ALPHA, %xmm1
	addps	-28 * SIZE(Y), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	movq	M, %rax
	andq	$4, %rax
	jle	.L26
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movhps	-30 * SIZE(X), %xmm0

	mulps	ALPHA, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	movq	M, %rax
	andq	$2, %rax
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	-32 * SIZE(Y), %xmm4

	mulps	ALPHA, %xmm0
	addps	%xmm4, %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	movq	M, %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	mulss	ALPHA, %xmm0
	addss	-32 * SIZE(Y), %xmm0

	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
#endif
	ALIGN_3

/* Strided path (INCX or INCY != 1): scalar, 8 elements per iteration.
   YY walks Y ahead for the loads so stores through Y never pass it. */
.L50:
	movq	M, %rax
	movq	Y, YY
	sarq	$3, %rax
	jle	.L55
	ALIGN_3

.L51:
	movss	(X), %xmm0
	addq	INCX, X
	mulss	ALPHA, %xmm0
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm0

	movss	(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm1

	movss	(X), %xmm2
	addq	INCX, X
	mulss	ALPHA, %xmm2
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm2

	movss	(X), %xmm3
	addq	INCX, X
	mulss	ALPHA, %xmm3
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm3

	movss	%xmm0, (Y)
	addq	INCY, Y
	movss	%xmm1, (Y)
	addq	INCY, Y
	movss	%xmm2, (Y)
	addq	INCY, Y
	movss	%xmm3, (Y)
	addq	INCY, Y

	movss	(X), %xmm0
	addq	INCX, X
	mulss	ALPHA, %xmm0
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm0

	movss	(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm1

	movss	(X), %xmm2
	addq	INCX, X
	mulss	ALPHA, %xmm2
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm2

	movss	(X), %xmm3
	addq	INCX, X
	mulss	ALPHA, %xmm3
	movss	(YY), %xmm6
	addq	INCY, YY
	addss	%xmm6, %xmm3

	movss	%xmm0, (Y)
	addq	INCY, Y
	movss	%xmm1, (Y)
	addq	INCY, Y
	movss	%xmm2, (Y)
	addq	INCY, Y
	movss	%xmm3, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L51
	ALIGN_3

.L55:	/* remaining 0..7 elements, one at a time */
	movq	M, %rax
	andq	$7, %rax
	jle	.L59
	ALIGN_3

.L56:
	movss	(X), %xmm0
	addq	INCX, X
	mulss	ALPHA, %xmm0
	movss	(Y), %xmm6
	addss	%xmm6, %xmm0
	movss	%xmm0, (Y)
	addq	INCY, Y
	decq	%rax
	jg	.L56
	ALIGN_3

.L59:
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3


	EPILOGUE