/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED  TO, PROCUREMENT OF  SUBSTITUTE   */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

/*
 * Complex vector copy kernel (SSE, x86-64, AT&T syntax):
 *
 *     COPY(n, x, incx, y, incy):  y[i] = x[i],  i = 0..n-1
 *
 * Element width: ZBASE_SHIFT scaling plus movss/movaps word granularity
 * suggests single-precision complex (2 floats per element) — presumably
 * the CCOPY kernel; confirm against the build rules that select SIZE.
 *
 * Register roles (argument macros from common.h):
 *   M    = element count (n)          INCX = x stride (scaled to bytes)
 *   X    = source pointer             INCY = y stride (scaled to bytes)
 *   Y    = destination pointer
 *   %rax = unrolled-loop trip counter; also the (zero) return value
 *   xmm0-xmm7 = software-pipelined data lanes in the unit-stride loops
 *
 * Structure:
 *   - Unit-stride fast path (incx == incy == 1), dispatched on the
 *     misalignment of X relative to a 16-byte boundary once Y has been
 *     aligned: .L10 (X 16B-aligned), .L20 (off by 8B), .L30 (off by 4B),
 *     .L40 (off by 12B).  Each path runs a 32-float unrolled main loop
 *     followed by 16/8/4/2/1-float tail blocks.
 *   - .L100: generic strided path, and the tail for very small n.
 */

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10	/* Win64 passes the 5th arg on the stack; see below */
#endif

#include "l1param.h"	/* PREFETCH/PREFETCHW, PREFETCHSIZE, PREOFFSET, ... */

/* Aligned 16-byte load of OFFSET(ADDR) into REG.
 * On Opteron the xorps+addps pair replaces movaps as a load-port tuning
 * idiom: REG is zeroed, then the memory operand is added, leaving the
 * memory value in REG.
 * NOTE(review): addps of +0.0 and -0.0 yields +0.0, so on the OPTERON
 * path a stored -0.0 could be copied as +0.0 — TODO confirm intended. */
#ifdef OPTERON
#define LOAD(OFFSET, ADDR, REG)		xorps	REG, REG; addps	OFFSET(ADDR), REG
#else
#define LOAD(OFFSET, ADDR, REG)		movaps	OFFSET(ADDR), REG
#endif

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY		/* 5th argument: stack slot above the 32B shadow space */
#endif

	SAVEREGISTERS

	/* Convert strides from elements to bytes (one complex = 2*SIZE). */
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	/* Fast path only for contiguous x AND y. */
	cmpq	$2 * SIZE, INCX
	jne	.L100
	cmpq	$2 * SIZE, INCY
	jne	.L100

	/* Tiny vectors (n <= 3) go straight to the scalar-ish tail. */
	cmpq	$3, M
	jle	.L106

	/* Bias pointers by +32*SIZE so the loop bodies can use the full
	 * signed 8-bit displacement range [-32*SIZE, +28*SIZE). */
	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	/* From here on M counts FLOATS, not complex elements (M *= 2). */
	addq	M, M

	/* Align Y to 8 bytes: if Y is 4-byte- but not 8-byte-aligned,
	 * copy one float.  (Legal because this is a flat copy of 2n
	 * floats; the "phase" of real/imag pairs does not matter.) */
	testq	$SIZE, Y
	je	.L05

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	ALIGN_4

.L05:
	/* Align Y to 16 bytes: copy one 8-byte pair if needed. */
	testq	$2 * SIZE, Y
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_4

.L10:
	/* Y is now 16B-aligned.  Dispatch on X's residual misalignment:
	 * 0 bytes -> fall through; otherwise .L20/.L30/.L40. */
	testq	$3 * SIZE, X
	jne	.L20

	/* ---- Path 1: both X and Y 16B-aligned. ---- */
	movq	M, %rax
	sarq	$5, %rax		/* %rax = number of 32-float iterations */
	jle	.L13

	/* Prime the 8-register pipeline with the first 32 floats. */
	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5
	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12			/* only one iteration: drain without reloading */
	ALIGN_3

.L11:
	/* Main loop: store the 32 floats loaded last iteration while
	 * loading the next 32, interleaved with prefetches. */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, -32 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movaps	%xmm1, -28 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movaps	%xmm2, -24 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm2)
	movaps	%xmm3, -20 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	%xmm4, -16 * SIZE(Y)
	LOAD(16 * SIZE, X, %xmm4)
	movaps	%xmm5, -12 * SIZE(Y)
	LOAD(20 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	%xmm6,  -8 * SIZE(Y)
	LOAD(24 * SIZE, X, %xmm6)
	movaps	%xmm7,  -4 * SIZE(Y)
	LOAD(28 * SIZE, X, %xmm7)

	subq	$-32 * SIZE, Y		/* advance by +32*SIZE (sub of negative) */
	subq	$-32 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	/* Drain: store the final pipelined 32 floats. */
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	ALIGN_3

.L13:
	/* Tail: 16 remaining floats? */
	testq	$16, M
	jle	.L14

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	/* Tail: 8 remaining floats? */
	testq	$8, M
	jle	.L15

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	/* Tail: 4 remaining floats? */
	testq	$4, M
	jle	.L16

	movaps	-32 * SIZE(X), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	/* Tail: 2 remaining floats? */
	testq	$2, M
	jle	.L17

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	/* Tail: 1 remaining float? */
	testq	$1, M
	jle	.L19

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_3


.L20:
	/* X is not 16B-aligned.  Off by 4 bytes -> .L30/.L40; here X is
	 * off by exactly 8 bytes. */
	testq	$SIZE, X
	jne	.L30

	/* ---- Path 2: X misaligned by 8 bytes. ----
	 * Keep the previous aligned quad's high half in xmm0 and stitch
	 * output vectors with shufps $0x4e (swap 64-bit halves). */
	movhps	-32 * SIZE(X), %xmm0	/* preload first 2 floats into high half */

	movq	M, %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6
	movaps	 -6 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	/* Each step: combine high(xmmN)|low(xmmN+1), store, reload. */
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 2 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 6 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	10 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	14 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	18 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	22 * SIZE(X), %xmm6

	shufps	$0x4e, %xmm0, %xmm7	/* xmm0 already holds the next carry quad */
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	26 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	/* Drain the pipeline (one reload of xmm0 for the last stitch). */
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	/* Tail: 16 floats (xmm0 carries the running 2-float remainder). */
	testq	$16, M
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0		/* keep carry for the next tail block */

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	/* Tail: 8 floats. */
	testq	$8, M
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	/* Tail: 4 floats. */
	testq	$4, M
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	/* Tail: 2 floats (X is 8B-aligned here, unaligned loads not needed). */
	testq	$2, M
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0

	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	/* Tail: 1 float. */
	testq	$1, M
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_3

.L30:
	/* X off by 4 or 12 bytes; 12 -> .L40, here 4. */
	testq	$2 * SIZE, X
	jne	.L40

	/* ---- Path 3: X misaligned by 4 bytes. ----
	 * Carry one float between aligned quads: movss merges the next
	 * quad's low float, then shufps $0x39 rotates right by one dword
	 * to restore source order. */
	movaps	-33 * SIZE(X), %xmm0	/* aligned quad ending with the 3 floats we need */

	movq	M, %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6
	movaps	 -5 * SIZE(X), %xmm7

	decq	%rax
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0		/* splice low float of next quad */
	shufps	$0x39, %xmm0, %xmm0	/* rotate: restore in-memory order */
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 3 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 7 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	11 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	15 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	19 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	23 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	27 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	/* Drain the pipeline. */
	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	/* Tail: 16 floats (xmm0 carries the 1-float remainder). */
	testq	$16, M
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	/* Tail: 8 floats. */
	testq	$8, M
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	/* Tail: 4 floats. */
	testq	$4, M
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	/* Tail: 2 floats (movsd tolerates the 4B-only alignment). */
	testq	$2, M
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	/* Tail: 1 float. */
	testq	$1, M
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_3

.L40:
	/* ---- Path 4: X misaligned by 12 bytes. ----
	 * Carry three floats between aligned quads: movss merges the next
	 * quad's low float, then shufps $0x93 rotates left by one dword. */
	movaps	-35 * SIZE(X), %xmm0	/* aligned quad ending with the 1 float we need */

	movq	M, %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6
	movaps	 -7 * SIZE(X), %xmm7

	decq	%rax
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0		/* borrow low float of the next quad */
	shufps	$0x93, %xmm1, %xmm0	/* rotate left: [x0.w, x1.x, x1.y, x1.z] order fix */
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 1 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 5 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 9 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	13 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	17 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	21 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	25 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	/* Drain the pipeline. */
	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	/* Tail: 16 floats (xmm0 carries the 3-float remainder). */
	testq	$16, M
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	/* Tail: 8 floats. */
	testq	$8, M
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	/* Tail: 4 floats. */
	testq	$4, M
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	/* Tail: 2 floats. */
	testq	$2, M
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	/* Tail: 1 float. */
	testq	$1, M
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret
	ALIGN_4

.L100:
	/* ---- Generic strided path (and small-n tail). ----
	 * Here M still counts complex elements; each movsd/movhps pair
	 * moves one 2*SIZE complex value. */
	movq	M, %rax
	sarq	$3, %rax		/* 8 complex elements per iteration */
	jle	.L105
	ALIGN_3

.L102:
	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm1
	addq	INCX, X
	movhps	(X), %xmm1
	addq	INCX, X
	movsd	(X), %xmm2
	addq	INCX, X
	movhps	(X), %xmm2
	addq	INCX, X
	movsd	(X), %xmm3
	addq	INCX, X
	movhps	(X), %xmm3
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	movsd	%xmm1, (Y)
	addq	INCY, Y
	movhps	%xmm1, (Y)
	addq	INCY, Y
	movsd	%xmm2, (Y)
	addq	INCY, Y
	movhps	%xmm2, (Y)
	addq	INCY, Y
	movsd	%xmm3, (Y)
	addq	INCY, Y
	movhps	%xmm3, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L102
	ALIGN_3

.L105:
	/* Tail: 4 complex elements? */
	testq	$4, M
	jle	.L106

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm1
	addq	INCX, X
	movhps	(X), %xmm1
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	movsd	%xmm1, (Y)
	addq	INCY, Y
	movhps	%xmm1, (Y)
	addq	INCY, Y
	ALIGN_3

.L106:
	/* Tail: 2 complex elements?  (Also the entry for n <= 3 from the
	 * unit-stride dispatch above.) */
	testq	$2, M
	jle	.L107

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	ALIGN_3

.L107:
	/* Tail: 1 complex element? */
	testq	$1, M
	jle	.L999

	movsd	(X), %xmm0
	movsd	%xmm0, (Y)
	ALIGN_3

.L999:
	xorq	%rax, %rax		/* return 0 */

	RESTOREREGISTERS

	ret

	EPILOGUE