1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Packing kernel (x86-64 SSE2): copies an M x N column-major panel of A
 * (leading dimension LDA, elements of SIZE bytes) into the contiguous
 * buffer B, four columns at a time, interleaving rows with
 * unpcklpd/unpckhpd.  NOTE(review): exact output layout inferred from
 * the stores below -- confirm against the consuming GEMM kernel.
 */

/* Per-CPU prefetch selection: PREFETCH for the A (read) side,
   PREFETCHW for the B (write) side.  Only Opteron has a distinct
   write-intent prefetch; everything else uses prefetcht0 for both. */
#if defined(PENTIUM4) || defined(GENERIC)
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef ATOM
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef NANO
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef OPTERON
#define PREFETCHSIZE 16
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#endif

/* NOTE(review): GENERIC is already covered by the PENTIUM4 block above,
   so this block is redundant (harmless, kept for symmetry). */
#ifdef GENERIC
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

/* Register map.  SysV: integer args arrive in rdi/rsi/rdx/rcx/r8.
   Win64: args arrive in rcx/rdx/r8/r9 and the 5th (B) on the stack. */
#ifndef WINDOWS_ABI

#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */

#define I %r9

#else

#define STACKSIZE 256

#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
/* 5th argument: 32B shadow space + return addr + 2 pushes = 40+32 above rsp
   after the local STACKSIZE adjustment below. */
#define OLD_B 40 + 32 + STACKSIZE(%rsp)

#define B %r14
#define I %r15

#endif

#define J %r10   /* column-block counter              */
#define AO1 %r11 /* source column pointer (col 0)     */
#define AO2 %r12 /* source column pointer (col 2)     */
#define MM %r13  /* rows left after alignment peeling */

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	pushq	%r15
	pushq	%r14
#endif
	pushq	%r13
	pushq	%r12

#ifdef WINDOWS_ABI
	subq	$STACKSIZE, %rsp

	/* xmm6/xmm7 are callee-saved under the Microsoft x64 ABI */
	movups	%xmm6, 0(%rsp)
	movups	%xmm7, 16(%rsp)

	movq	OLD_B, B
#endif

	leaq	(,LDA, SIZE), LDA	/* LDA: elements -> bytes */
	/* Bias B by 16 elements so loop offsets span -16..+15; using
	   subq of a negative keeps the immediate in 8 bits. */
	subq	$-16 * SIZE, B

	movq	M, MM
	leaq	-1(M), %rax
	testq	$SIZE, A
	cmovne	%rax, MM		/* A misaligned: peel 1 row, MM = M-1 */

	testq	$SIZE, LDA
	jne	.L50			/* odd LDA: columns alternate alignment */

	movq	N, J
	sarq	$2, J			/* J = N/4 four-column blocks */
	jle	.L20
	ALIGN_4

.L11:
	movq	A, AO1			/* AO1 -> column j   */
	leaq	(A, LDA, 2), AO2	/* AO2 -> column j+2 */
	leaq	(A, LDA, 4), A		/* A advances 4 columns */

	/* LDA is 16-byte even here, so A's parity equals AO1's. */
	testq	$SIZE, A
	je	.L12

	/* Misaligned start: copy one leading row of all 4 columns. */
	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

.L12:
	movq	MM, I
	sarq	$3, I			/* main loop handles 8 rows/iter */
	jle	.L14
	ALIGN_4

.L13:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1)
#endif

	/* Rows 0-1 of 4 columns: interleave pairs via unpck{l,h}pd. */
	movapd	0 * SIZE(AO1), %xmm0
	movapd	0 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2), %xmm2
	movapd	0 * SIZE(AO2, LDA), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)
	movapd	%xmm4, -12 * SIZE(B)
	movapd	%xmm6, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA)
#endif

	/* Rows 2-3 */
	movapd	2 * SIZE(AO1), %xmm0
	movapd	2 * SIZE(AO1, LDA), %xmm1
	movapd	2 * SIZE(AO2), %xmm2
	movapd	2 * SIZE(AO2, LDA), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 8) * SIZE(B)
#endif

	movapd	%xmm0, -8 * SIZE(B)
	movapd	%xmm2, -6 * SIZE(B)
	movapd	%xmm4, -4 * SIZE(B)
	movapd	%xmm6, -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2)
#endif

	/* Rows 4-5 (stores continue past this block) */
	movapd	4 * SIZE(AO1), %xmm0
	movapd	4 * SIZE(AO1, LDA), %xmm1
	movapd	4 * SIZE(AO2), %xmm2
	movapd	4 * SIZE(AO2, LDA), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 16) * SIZE(B)
#endif

	/* Rows 4-5 stores (second half of the .L13 8-row body). */
	movapd	%xmm0, 0 * SIZE(B)
	movapd	%xmm2, 2 * SIZE(B)
	movapd	%xmm4, 4 * SIZE(B)
	movapd	%xmm6, 6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA)
#endif

	/* Rows 6-7 */
	movapd	6 * SIZE(AO1), %xmm0
	movapd	6 * SIZE(AO1, LDA), %xmm1
	movapd	6 * SIZE(AO2), %xmm2
	movapd	6 * SIZE(AO2, LDA), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 24) * SIZE(B)
#endif

	movapd	%xmm0, 8 * SIZE(B)
	movapd	%xmm2, 10 * SIZE(B)
	movapd	%xmm4, 12 * SIZE(B)
	movapd	%xmm6, 14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B		/* B += 32 elements (8 rows x 4 cols) */

	decq	I
	jg	.L13
	ALIGN_4

.L14:	/* 4 leftover rows of the 4-column block? */
	testq	$4, MM
	jle	.L16

	movapd	0 * SIZE(AO1), %xmm0
	movapd	0 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2), %xmm2
	movapd	0 * SIZE(AO2, LDA), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)
	movapd	%xmm4, -12 * SIZE(B)
	movapd	%xmm6, -10 * SIZE(B)

	movapd	2 * SIZE(AO1), %xmm0
	movapd	2 * SIZE(AO1, LDA), %xmm1
	movapd	2 * SIZE(AO2), %xmm2
	movapd	2 * SIZE(AO2, LDA), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0, -8 * SIZE(B)
	movapd	%xmm2, -6 * SIZE(B)
	movapd	%xmm4, -4 * SIZE(B)
	movapd	%xmm6, -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L16:	/* 2 leftover rows? */
	testq	$2, MM
	jle	.L18

	movapd	0 * SIZE(AO1), %xmm0
	movapd	0 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2), %xmm2
	movapd	0 * SIZE(AO2, LDA), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)
	movapd	%xmm4, -12 * SIZE(B)
	movapd	%xmm6, -10 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L18:	/* final single row? */
	testq	$1, MM
	jle	.L19

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)
	subq	$-4 * SIZE, B
	ALIGN_4

.L19:
	decq	J
	jg	.L11
	ALIGN_4

.L20:	/* two remaining columns (even-LDA path) */
	testq	$2, N
	jle	.L30

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	testq	$SIZE, A
	je	.L22

	/* Peel one leading row of the two columns. */
	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movapd	%xmm0, -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L22:
	movq	MM, I
	sarq	$3, I
	jle	.L24
	ALIGN_4

.L23:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1), %xmm0
	movapd	0 * SIZE(AO2), %xmm1
	movapd	2 * SIZE(AO1), %xmm2
	movapd	2 * SIZE(AO2), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm4, -14 * SIZE(B)
	movapd	%xmm2, -12 * SIZE(B)
	movapd	%xmm6, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2)
#endif

	movapd	4 * SIZE(AO1), %xmm0
	movapd	4 * SIZE(AO2), %xmm1
	movapd	6 * SIZE(AO1), %xmm2
	movapd	6 * SIZE(AO2), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 8) * SIZE(B)
#endif

	movapd	%xmm0, -8 * SIZE(B)
	movapd	%xmm4, -6 * SIZE(B)
	movapd	%xmm2, -4 * SIZE(B)
	movapd	%xmm6, -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L23
	ALIGN_4

.L24:
	testq	$4, MM
	jle	.L26

	movapd	0 * SIZE(AO1), %xmm0
	movapd	0 * SIZE(AO2), %xmm1
	movapd	2 * SIZE(AO1), %xmm2
	movapd	2 * SIZE(AO2), %xmm3

	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm4

	movapd	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm4, -14 * SIZE(B)
	movapd	%xmm2, -12 * SIZE(B)
	movapd	%xmm6, -10 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L26:
	testq	$2, MM
	jle	.L28

	movapd	0 * SIZE(AO1), %xmm0
	movapd	0 * SIZE(AO2), %xmm1

	movapd	%xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_4

.L28:
	testq	$1, MM
	jle	.L30

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movapd	%xmm0, -16 * SIZE(B)
	subq	$-2 * SIZE, B
	ALIGN_4

.L30:	/* last single column (even-LDA path) */
	testq	$1, N
	jle	.L999

	movq	A, AO1

	testq	$SIZE, A
	jne	.L35

	/* Aligned single column: straight block copy. */
	movq	MM, I
	sarq	$3, I
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1), %xmm0
	movapd	2 * SIZE(AO1), %xmm1
	movapd	4 * SIZE(AO1), %xmm2
	movapd	6 * SIZE(AO1), %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm1, -14 * SIZE(B)
	movapd	%xmm2, -12 * SIZE(B)
	movapd	%xmm3, -10 * SIZE(B)

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L31
	ALIGN_4

.L32:
	testq	$4, MM
	jle	.L33

	movapd	0 * SIZE(AO1), %xmm0
	movapd	2 * SIZE(AO1), %xmm1

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm1, -14 * SIZE(B)

	addq	$4 * SIZE, AO1
	subq	$-4 * SIZE, B
	ALIGN_4

.L33:
	testq	$2, MM
	jle	.L34

	movapd	0 * SIZE(AO1), %xmm0

	movapd	%xmm0, -16 * SIZE(B)

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L34:
	testq	$1, MM
	jle	.L999

	movsd	0 * SIZE(AO1), %xmm0

	movlpd	%xmm0, -16 * SIZE(B)
	jmp	.L999
	ALIGN_4

.L35:	/* Misaligned single column: shift element pairs with shufpd.
	   xmm0 always carries the element just below the next aligned
	   pair; seed it from one element before AO1. */
	movapd	-1 * SIZE(AO1), %xmm0

	movq	MM, I
	sarq	$3, I
	/* BUGFIX: was "jle .L36", which branched to the very next label,
	   so with MM < 8 the 8-row loop body still ran once, reading past
	   the column and writing 8 bogus elements into B.  Skip to the
	   remainder handling instead, as every other guard in this file
	   does. */
	jle	.L37
	ALIGN_4

.L36:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	movapd	1 * SIZE(AO1), %xmm1
	movapd	3 * SIZE(AO1), %xmm2
	movapd	5 * SIZE(AO1), %xmm3
	movapd	7 * SIZE(AO1), %xmm4

	/* Each shufpd combines the high half of the carry with the low
	   half of the next pair: a one-element shift. */
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm1, -14 * SIZE(B)
	movapd	%xmm2, -12 * SIZE(B)
	movapd	%xmm3, -10 * SIZE(B)

	movapd	%xmm4, %xmm0		/* carry for the next iteration */

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L36
	ALIGN_4

.L37:
	testq	$4, MM
	jle	.L38

	movapd	1 * SIZE(AO1), %xmm1
	movapd	3 * SIZE(AO1), %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm1, -14 * SIZE(B)

	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, B
	ALIGN_4

.L38:
	testq	$2, MM
	jle	.L39

	movapd	1 * SIZE(AO1), %xmm1

	shufpd	$1, %xmm1, %xmm0

	movapd	%xmm0, -16 * SIZE(B)

	movapd	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L39:
	testq	$1, MM
	jle	.L999

	/* Last element sits in the high half of the carry register. */
	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0, -16 * SIZE(B)
	jmp	.L999
	ALIGN_4

.L50:	/* Odd LDA: consecutive columns have opposite 16-byte alignment,
	   so odd columns are read one element ahead and realigned with
	   movsd/shufpd against a carried register. */
	movq	N, J
	sarq	$2, J
	jle	.L60
	ALIGN_4

.L51:
	movq	A, AO1
	leaq	(A, LDA, 2), AO2
	leaq	(A, LDA, 4), A

	testq	$SIZE, A
	je	.L52

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

.L52:
	/* xmm5/xmm7 carry the element preceding the aligned pair of the
	   odd columns across loop iterations. */
	movapd	-1 * SIZE(AO1, LDA), %xmm5
	movapd	-1 * SIZE(AO2, LDA), %xmm7

	movq	MM, I
	sarq	$3, I
	jle	.L54
	ALIGN_4

.L53:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1), %xmm0
	movapd	1 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2), %xmm2
	movapd	1 * SIZE(AO2, LDA), %xmm3

	/* movsd merges the new low element into the carry; shufpd shifts
	   the odd-column stream down by one element. */
	movsd	%xmm0, %xmm5
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif

	movapd	%xmm5, -16 * SIZE(B)
	movapd	%xmm7, -14 * SIZE(B)
	movapd	%xmm0, -12 * SIZE(B)
	movapd	%xmm2, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA)
#endif

	movapd	2 * SIZE(AO1), %xmm0
	movapd	3 * SIZE(AO1, LDA), %xmm5
	movapd	2 * SIZE(AO2), %xmm2
	movapd	3 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm5, %xmm0
	shufpd	$1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 8) * SIZE(B)
#endif

	movapd	%xmm1, -8 * SIZE(B)
	movapd	%xmm3, -6 * SIZE(B)
	movapd	%xmm0, -4 * SIZE(B)
	movapd	%xmm2, -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2)
#endif

	movapd	4 * SIZE(AO1), %xmm0
	movapd	5 * SIZE(AO1, LDA), %xmm1
	movapd	4 * SIZE(AO2), %xmm2
	movapd	5 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 16) * SIZE(B)
#endif

	movapd	%xmm5, 0 * SIZE(B)
	movapd	%xmm7, 2 * SIZE(B)
	movapd	%xmm0, 4 * SIZE(B)
	movapd	%xmm2, 6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA)
#endif

	movapd	6 * SIZE(AO1), %xmm0
	movapd	7 * SIZE(AO1, LDA), %xmm5
	movapd	6 * SIZE(AO2), %xmm2
	movapd	7 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm5, %xmm0
	shufpd	$1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 24) * SIZE(B)
#endif

	movapd	%xmm1, 8 * SIZE(B)
	movapd	%xmm3, 10 * SIZE(B)
	movapd	%xmm0, 12 * SIZE(B)
	movapd	%xmm2, 14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L53
	ALIGN_4

.L54:
	testq	$4, MM
	jle	.L56

	movapd	0 * SIZE(AO1), %xmm0
	movapd	1 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2), %xmm2
	movapd	1 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm3, %xmm2

	movapd	%xmm5, -16 * SIZE(B)
	movapd	%xmm7, -14 * SIZE(B)
	movapd	%xmm0, -12 * SIZE(B)
	movapd	%xmm2, -10 * SIZE(B)

	movapd	2 * SIZE(AO1), %xmm0
	movapd	3 * SIZE(AO1, LDA), %xmm5
	movapd	2 * SIZE(AO2), %xmm2
	movapd	3 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	shufpd	$1, %xmm5, %xmm0
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm7, %xmm2
movapd %xmm1, -8 * SIZE(B) 861 movapd %xmm3, -6 * SIZE(B) 862 movapd %xmm0, -4 * SIZE(B) 863 movapd %xmm2, -2 * SIZE(B) 864 865 addq $4 * SIZE, AO1 866 addq $4 * SIZE, AO2 867 subq $-16 * SIZE, B 868 ALIGN_4 869 870.L56: 871 testq $2, MM 872 jle .L58 873 874 movapd 0 * SIZE(AO1), %xmm0 875 movapd 1 * SIZE(AO1, LDA), %xmm1 876 movapd 0 * SIZE(AO2), %xmm2 877 movapd 1 * SIZE(AO2, LDA), %xmm3 878 879 movsd %xmm0, %xmm5 880 movsd %xmm2, %xmm7 881 shufpd $1, %xmm1, %xmm0 882 shufpd $1, %xmm3, %xmm2 883 884 movapd %xmm5, -16 * SIZE(B) 885 movapd %xmm7, -14 * SIZE(B) 886 movapd %xmm0, -12 * SIZE(B) 887 movapd %xmm2, -10 * SIZE(B) 888 889 addq $2 * SIZE, AO1 890 addq $2 * SIZE, AO2 891 subq $-8 * SIZE, B 892 ALIGN_4 893 894.L58: 895 testq $1, MM 896 jle .L59 897 898 movsd 0 * SIZE(AO1), %xmm0 899 movsd 0 * SIZE(AO1, LDA), %xmm1 900 movsd 0 * SIZE(AO2), %xmm2 901 movsd 0 * SIZE(AO2, LDA), %xmm3 902 903 unpcklpd %xmm1, %xmm0 904 unpcklpd %xmm3, %xmm2 905 906 movapd %xmm0, -16 * SIZE(B) 907 movapd %xmm2, -14 * SIZE(B) 908 subq $-4 * SIZE, B 909 ALIGN_4 910 911.L59: 912 decq J 913 jg .L51 914 ALIGN_4 915 916.L60: 917 testq $2, N 918 jle .L70 919 920 movq A, AO1 921 leaq (A, LDA), AO2 922 leaq (A, LDA, 2), A 923 924 testq $SIZE, A 925 je .L62 926 927 movsd 0 * SIZE(AO1), %xmm0 928 movsd 0 * SIZE(AO2), %xmm1 929 930 unpcklpd %xmm1, %xmm0 931 932 movapd %xmm0, -16 * SIZE(B) 933 934 addq $1 * SIZE, AO1 935 addq $1 * SIZE, AO2 936 subq $-2 * SIZE, B 937 ALIGN_3 938 939.L62: 940 movapd -1 * SIZE(AO2), %xmm5 941 942 movq MM, I 943 sarq $3, I 944 jle .L64 945 ALIGN_4 946 947.L63: 948#ifdef PREFETCH 949 PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) 950#endif 951 952 movapd 0 * SIZE(AO1), %xmm0 953 movapd 1 * SIZE(AO2), %xmm1 954 movapd 2 * SIZE(AO1), %xmm2 955 movapd 3 * SIZE(AO2), %xmm3 956 957 movsd %xmm0, %xmm5 958 shufpd $1, %xmm1, %xmm0 959 movsd %xmm2, %xmm1 960 shufpd $1, %xmm3, %xmm2 961 962#ifdef PREFETCHW 963 PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) 964#endif 965 966 movapd %xmm5, 
-16 * SIZE(B) 967 movapd %xmm0, -14 * SIZE(B) 968 movapd %xmm1, -12 * SIZE(B) 969 movapd %xmm2, -10 * SIZE(B) 970 971#ifdef PREFETCH 972 PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) 973#endif 974 975 movapd 4 * SIZE(AO1), %xmm0 976 movapd 5 * SIZE(AO2), %xmm1 977 movapd 6 * SIZE(AO1), %xmm2 978 movapd 7 * SIZE(AO2), %xmm5 979 980 movsd %xmm0, %xmm3 981 shufpd $1, %xmm1, %xmm0 982 movsd %xmm2, %xmm1 983 shufpd $1, %xmm5, %xmm2 984 985#ifdef PREFETCHW 986 PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) 987#endif 988 989 movapd %xmm3, -8 * SIZE(B) 990 movapd %xmm0, -6 * SIZE(B) 991 movapd %xmm1, -4 * SIZE(B) 992 movapd %xmm2, -2 * SIZE(B) 993 994 addq $8 * SIZE, AO1 995 addq $8 * SIZE, AO2 996 subq $-16 * SIZE, B 997 998 decq I 999 jg .L63 1000 ALIGN_4 1001 1002.L64: 1003 testq $4, MM 1004 jle .L66 1005 1006 movapd 0 * SIZE(AO1), %xmm0 1007 movapd 1 * SIZE(AO2), %xmm1 1008 movapd 2 * SIZE(AO1), %xmm2 1009 movapd 3 * SIZE(AO2), %xmm3 1010 1011 movsd %xmm0, %xmm5 1012 shufpd $1, %xmm1, %xmm0 1013 movsd %xmm2, %xmm1 1014 shufpd $1, %xmm3, %xmm2 1015 1016 movapd %xmm5, -16 * SIZE(B) 1017 movapd %xmm0, -14 * SIZE(B) 1018 movapd %xmm1, -12 * SIZE(B) 1019 movapd %xmm2, -10 * SIZE(B) 1020 1021 movaps %xmm3, %xmm5 1022 1023 addq $4 * SIZE, AO1 1024 addq $4 * SIZE, AO2 1025 subq $-8 * SIZE, B 1026 ALIGN_4 1027 1028.L66: 1029 testq $2, MM 1030 jle .L68 1031 1032 movapd 0 * SIZE(AO1), %xmm0 1033 movapd 1 * SIZE(AO2), %xmm1 1034 1035 movsd %xmm0, %xmm5 1036 shufpd $1, %xmm1, %xmm0 1037 1038 movapd %xmm5, -16 * SIZE(B) 1039 movapd %xmm0, -14 * SIZE(B) 1040 1041 addq $2 * SIZE, AO1 1042 addq $2 * SIZE, AO2 1043 subq $-4 * SIZE, B 1044 ALIGN_4 1045 1046.L68: 1047 testq $1, MM 1048 jle .L70 1049 1050 movsd 0 * SIZE(AO1), %xmm0 1051 movsd 0 * SIZE(AO2), %xmm1 1052 1053 unpcklpd %xmm1, %xmm0 1054 1055 movapd %xmm0, -16 * SIZE(B) 1056 subq $-2 * SIZE, B 1057 ALIGN_4 1058 1059.L70: 1060 testq $1, N 1061 jle .L999 1062 1063 movq A, AO1 1064 1065 testq $SIZE, A 1066 jne .L75 1067 1068 movq MM, I 1069 
sarq $3, I 1070 jle .L72 1071 ALIGN_4 1072 1073.L71: 1074#ifdef PREFETCH 1075 PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) 1076#endif 1077 1078 movapd 0 * SIZE(AO1), %xmm0 1079 movapd 2 * SIZE(AO1), %xmm2 1080 movapd 4 * SIZE(AO1), %xmm4 1081 movapd 6 * SIZE(AO1), %xmm6 1082 1083#ifdef PREFETCHW 1084 PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) 1085#endif 1086 1087 movapd %xmm0, -16 * SIZE(B) 1088 movapd %xmm2, -14 * SIZE(B) 1089 movapd %xmm4, -12 * SIZE(B) 1090 movapd %xmm6, -10 * SIZE(B) 1091 1092 addq $8 * SIZE, AO1 1093 subq $-8 * SIZE, B 1094 1095 decq I 1096 jg .L71 1097 ALIGN_4 1098 1099.L72: 1100 testq $4, MM 1101 jle .L73 1102 1103 movapd 0 * SIZE(AO1), %xmm0 1104 movapd 2 * SIZE(AO1), %xmm2 1105 1106 movapd %xmm0, -16 * SIZE(B) 1107 movapd %xmm2, -14 * SIZE(B) 1108 1109 addq $4 * SIZE, AO1 1110 subq $-4 * SIZE, B 1111 ALIGN_4 1112 1113.L73: 1114 testq $2, MM 1115 jle .L74 1116 1117 movapd 0 * SIZE(AO1), %xmm0 1118 1119 movapd %xmm0, -16 * SIZE(B) 1120 1121 addq $2 * SIZE, AO1 1122 subq $-2 * SIZE, B 1123 ALIGN_4 1124 1125.L74: 1126 testq $1, MM 1127 jle .L999 1128 1129 movsd 0 * SIZE(AO1), %xmm0 1130 1131 movlpd %xmm0, -16 * SIZE(B) 1132 jmp .L999 1133 ALIGN_4 1134 1135.L75: 1136 movapd -1 * SIZE(AO1), %xmm0 1137 1138 movq MM, I 1139 sarq $3, I 1140 jle .L76 1141 ALIGN_4 1142 1143.L76: 1144#ifdef PREFETCH 1145 PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) 1146#endif 1147 1148 movapd 1 * SIZE(AO1), %xmm1 1149 movapd 3 * SIZE(AO1), %xmm2 1150 movapd 5 * SIZE(AO1), %xmm3 1151 movapd 7 * SIZE(AO1), %xmm4 1152 1153 shufpd $1, %xmm1, %xmm0 1154 shufpd $1, %xmm2, %xmm1 1155 shufpd $1, %xmm3, %xmm2 1156 shufpd $1, %xmm4, %xmm3 1157 1158#ifdef PREFETCHW 1159 PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) 1160#endif 1161 1162 movapd %xmm0, -16 * SIZE(B) 1163 movapd %xmm1, -14 * SIZE(B) 1164 movapd %xmm2, -12 * SIZE(B) 1165 movapd %xmm3, -10 * SIZE(B) 1166 1167 movapd %xmm4, %xmm0 1168 1169 addq $8 * SIZE, AO1 1170 subq $-8 * SIZE, B 1171 1172 decq I 1173 jg .L76 1174 ALIGN_4 1175 
1176.L77: 1177 testq $4, MM 1178 jle .L78 1179 1180 movapd 1 * SIZE(AO1), %xmm1 1181 movapd 3 * SIZE(AO1), %xmm2 1182 1183 shufpd $1, %xmm1, %xmm0 1184 shufpd $1, %xmm2, %xmm1 1185 1186 movapd %xmm0, -16 * SIZE(B) 1187 movapd %xmm1, -14 * SIZE(B) 1188 1189 movapd %xmm2, %xmm0 1190 1191 addq $4 * SIZE, AO1 1192 addq $4 * SIZE, B 1193 ALIGN_4 1194 1195.L78: 1196 testq $2, MM 1197 jle .L79 1198 1199 movapd 1 * SIZE(AO1), %xmm1 1200 1201 shufpd $1, %xmm1, %xmm0 1202 1203 movapd %xmm0, -16 * SIZE(B) 1204 1205 movapd %xmm1, %xmm0 1206 1207 addq $2 * SIZE, AO1 1208 subq $-2 * SIZE, B 1209 ALIGN_4 1210 1211.L79: 1212 testq $1, MM 1213 jle .L999 1214 1215 shufpd $1, %xmm0, %xmm0 1216 1217 movlpd %xmm0, -16 * SIZE(B) 1218 ALIGN_4 1219 1220.L999: 1221#ifdef WINDOWS_ABI 1222 movups 0(%rsp), %xmm6 1223 movups 16(%rsp), %xmm7 1224 1225 addq $STACKSIZE, %rsp 1226#endif 1227 1228 popq %r12 1229 popq %r13 1230 1231#ifdef WINDOWS_ABI 1232 popq %r14 1233 popq %r15 1234#endif 1235 ret 1236 1237 EPILOGUE 1238