/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Complex vector copy kernel (y := x), x86-64 SSE, AT&T syntax.
 *
 * C-equivalent signature (per BLAS *copy convention):
 *     int copy(BLASLONG m, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy);
 *
 * In:    M    = element count (complex elements)
 *        X    = source pointer,      INCX = source stride (elements)
 *        Y    = destination pointer, INCY = destination stride (elements)
 * Out:   %rax = 0
 *
 * Layout of the routine:
 *   - Fast path (both strides == 1): peel scalar/64-bit copies until Y is
 *     16-byte aligned, then dispatch on the residual misalignment of X
 *     relative to Y (in units of SIZE, the size of one real FLOAT):
 *       .L10  X 16-byte aligned        -> straight movaps copy
 *       .L20  X off by 2 * SIZE        -> shufps $0x4e merge (swap halves)
 *       .L30  X off by 1 * SIZE        -> movss + shufps $0x39 (rotate)
 *       .L40  X off by 3 * SIZE        -> movss + shufps $0x93 (rotate)
 *     Each path keeps a rotating file of 8 xmm registers so every store
 *     of a 16-byte group overlaps the load of the next (software
 *     pipelining), with optional prefetch hints from l1param.h.
 *   - Generic path (.L100): strided element-by-element copy via
 *     movsd/movhps pairs, used whenever either stride != 1 or M <= 3.
 *
 * NOTE(review): SIZE, ZBASE_SHIFT, PROLOGUE/EPILOGUE, SAVEREGISTERS,
 * PREFETCH*, PREOFFSET and ALIGN_* come from common.h / l1param.h and are
 * not visible here; comments about element width assume SIZE is the
 * real-part width and a complex element is 2 * SIZE bytes -- consistent
 * with the salq $ZBASE_SHIFT stride scaling and addq M, M below, but
 * confirm against common.h.
 */

#define ASSEMBLER
#include "common.h"

/* Argument registers (System V AMD64 unless WINDOWS_ABI). */
#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10	/* 5th arg comes from the stack on Win64 */
#endif

#include "l1param.h"

#ifdef OPTERON
/* On Opteron, xorps + addps from memory avoids a slow movaps load path;
   result is the same 16-byte aligned load into REG. */
#define LOAD(OFFSET, ADDR, REG)		xorps	REG, REG; addps	OFFSET(ADDR), REG
#else
#define LOAD(OFFSET, ADDR, REG)		movaps	OFFSET(ADDR), REG
#endif

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	/* Win64: 5th argument (incy) lives above the 32-byte shadow space. */
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	/* Scale strides from complex elements to bytes. */
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	/* Unit stride on both vectors? Otherwise take the generic path. */
	cmpq	$2 * SIZE, INCX
	jne	.L100
	cmpq	$2 * SIZE, INCY
	jne	.L100

	/* Tiny vectors (M <= 3) are not worth aligning; use scalar tail. */
	cmpq	$3, M
	jle	.L106

	/* Bias the pointers by +32*SIZE so the hot loops can use small
	   negative displacements (shorter encodings). */
	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	/* From here on M counts REAL values (floats), not complex pairs. */
	addq	M, M

	/* Peel one real value if Y is misaligned by SIZE bytes. */
	testq	$SIZE, Y
	je	.L05

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	ALIGN_4

.L05:
	/* Peel one 8-byte pair if Y is still not 16-byte aligned. */
	testq	$2 * SIZE, Y
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_4

.L10:
	/* Y is now 16-byte aligned.  Dispatch on X's residual offset. */
	testq	$3 * SIZE, X
	jne	.L20

	/* ---- Case 0: X also 16-byte aligned; plain movaps copy. ---- */
	movq	M, %rax
	sarq	$5, %rax		/* 32 floats (8 xmm regs) per iteration */
	jle	.L13

	/* Prime the 8-register pipeline. */
	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5
	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
	/* Main loop: store current group while loading the next one. */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, -32 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movaps	%xmm1, -28 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movaps	%xmm2, -24 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm2)
	movaps	%xmm3, -20 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movaps	%xmm4, -16 * SIZE(Y)
	LOAD(16 * SIZE, X, %xmm4)
	movaps	%xmm5, -12 * SIZE(Y)
	LOAD(20 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movaps	%xmm6,  -8 * SIZE(Y)
	LOAD(24 * SIZE, X, %xmm6)
	movaps	%xmm7,  -4 * SIZE(Y)
	LOAD(28 * SIZE, X, %xmm7)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	/* Drain: flush the last preloaded group. */
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	ALIGN_3

.L13:
	/* Remainder: binary-decomposed tails of 16/8/4/2/1 floats. */
	testq	$16, M
	jle	.L14

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	testq	$8, M
	jle	.L15

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	testq	$4, M
	jle	.L16

	movaps	-32 * SIZE(X), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, M
	jle	.L17

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	/* A single leftover float can only occur after the movss peel. */
	testq	$1, M
	jle	.L19

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_3


.L20:
	testq	$SIZE, X
	jne	.L30

	/* ---- Case 2: X is off by 2*SIZE from 16-byte alignment. ----
	   Keep the previous aligned quad's high half in xmm0 and stitch
	   each output quad with shufps $0x4e (result = dst-high:src-low). */
	movhps	-32 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6
	movaps	 -6 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  2 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  6 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 10 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 14 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 18 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 22 * SIZE(X), %xmm6

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 26 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	/* Drain the pipeline; xmm0 reload feeds the final stitch. */
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	testq	$16, M
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0		/* carry pending high half forward */

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	testq	$8, M
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	testq	$4, M
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	/* Sub-quad tails fall back to unaligned 8-byte / 4-byte moves. */
	testq	$2, M
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	testq	$1, M
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_3

.L30:
	testq	$2 * SIZE, X
	jne	.L40

	/* ---- Case 1: X is off by 1*SIZE.  Read the aligned quad that
	   CONTAINS the first wanted float (one element early), then for
	   each output: movss merges the next quad's low float, and
	   shufps $0x39 rotates the register right by one element. ---- */
	movaps	-33 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6
	movaps	 -5 * SIZE(X), %xmm7

	decq	%rax
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  3 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  7 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 11 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 15 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 19 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 23 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 27 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	/* Drain the pipeline. */
	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	testq	$16, M
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0		/* carry pending elements forward */

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	testq	$8, M
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	testq	$4, M
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	testq	$2, M
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	testq	$1, M
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_3

.L40:
	/* ---- Case 3: X is off by 3*SIZE.  Aligned quad holding the
	   first wanted float starts three elements early; movss merges
	   the successor's low float and shufps $0x93 rotates left. ---- */
	movaps	-35 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6
	movaps	 -7 * SIZE(X), %xmm7

	decq	%rax
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	  1 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	  5 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	  9 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	 13 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	 17 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	 21 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	 25 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	/* Drain the pipeline. */
	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	 -3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	testq	$16, M
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0		/* carry pending elements forward */

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	testq	$8, M
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	testq	$4, M
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	testq	$2, M
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	testq	$1, M
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_4

.L100:
	/* ---- Generic strided path (also used for M <= 3).  M counts
	   complex elements here; each movsd/movhps moves one element. ---- */
	movq	M, %rax
	sarq	$3, %rax		/* 8 complex elements per iteration */
	jle	.L105
	ALIGN_3

.L102:
	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm1
	addq	INCX, X
	movhps	(X), %xmm1
	addq	INCX, X
	movsd	(X), %xmm2
	addq	INCX, X
	movhps	(X), %xmm2
	addq	INCX, X
	movsd	(X), %xmm3
	addq	INCX, X
	movhps	(X), %xmm3
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	movsd	%xmm1, (Y)
	addq	INCY, Y
	movhps	%xmm1, (Y)
	addq	INCY, Y
	movsd	%xmm2, (Y)
	addq	INCY, Y
	movhps	%xmm2, (Y)
	addq	INCY, Y
	movsd	%xmm3, (Y)
	addq	INCY, Y
	movhps	%xmm3, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L102
	ALIGN_3

.L105:
	/* Remainder: 4 / 2 / 1 complex elements. */
	testq	$4, M
	jle	.L106

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm1
	addq	INCX, X
	movhps	(X), %xmm1
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	movsd	%xmm1, (Y)
	addq	INCY, Y
	movhps	%xmm1, (Y)
	addq	INCY, Y
	ALIGN_3

.L106:
	testq	$2, M
	jle	.L107

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	ALIGN_3

.L107:
	testq	$1, M
	jle	.L999

	movsd	(X), %xmm0
	movsd	%xmm0, (Y)
	ALIGN_3

.L999:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret

	EPILOGUE