1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

/* GEMM matrix packing kernel, x86-64 SSE2 (AT&T syntax).             */
/* Copies an M x N matrix A (leading dimension LDA, given in          */
/* elements) into the contiguous buffer B, interleaving groups of     */
/* 8 / 4 / 2 / 1 columns so a GEMM inner kernel can stream them       */
/* sequentially.  SIZE (element size in bytes), ARGn, PROLOGUE and    */
/* ALIGN_n come from "common.h".                                      */

#define ASSEMBLER
#include "common.h"

/* Per-microarchitecture prefetch tuning. */
#ifdef NEHALEM
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifndef MOVAPS
#define MOVAPS movaps
#endif

/* Register roles: AO1/AO2 walk the current column group (AO2 is 4    */
/* columns ahead), LDA3 = 3*LDA in bytes, J counts column groups,     */
/* MM is the row count seen by the vectorized loops (see cmovne       */
/* below), I is the inner row-loop counter.                           */
#ifndef WINDOWS_ABI

#define M	ARG1	/* rdi */
#define N	ARG2	/* rsi */
#define A	ARG3	/* rdx */
#define LDA	ARG4	/* rcx */
#define B	ARG5	/* r8 */

#define AO1	%r9
#define AO2	%r10
#define LDA3	%r11
#define J	%r12
#define MM	%r13

#else

#define STACKSIZE 128

#define M	ARG1	/* rcx */
#define N	ARG2	/* rdx */
#define A	ARG3	/* r8 */
#define LDA	ARG4	/* r9 */
/* 5th argument (B) arrives on the caller's stack under the MS x64    */
/* ABI: 40 = return addr + 4 pushes, 32 = shadow space.               */
#define OLD_B	40 + 32 + STACKSIZE(%rsp)

#define B	%r15

#define AO1	%r10
#define AO2	%r11
#define LDA3	%r12
#define J	%r13
#define MM	%r14

#endif

#define I	%rax

	PROLOGUE
	PROFCODE

	/* Save the callee-saved GPRs this kernel uses. */
#ifdef WINDOWS_ABI
	pushq	%r15
	pushq	%r14
#endif
	pushq	%r13
	pushq	%r12

#ifdef WINDOWS_ABI
	subq	$STACKSIZE, %rsp

	/* xmm6-xmm15 are callee-saved under the MS x64 ABI; spill the   */
	/* ones clobbered below.                                         */
	movups	%xmm6,    0(%rsp)
	movups	%xmm7,   16(%rsp)
	movups	%xmm8,   32(%rsp)
	movups	%xmm9,   48(%rsp)
	movups	%xmm10,  64(%rsp)
	movups	%xmm11,  80(%rsp)
	movups	%xmm12,  96(%rsp)

	movq	OLD_B, B
#endif

	leaq	(,LDA, SIZE), LDA	/* LDA: elements -> bytes          */
	leaq	(LDA, LDA, 2), LDA3	/* LDA3 = 3 * LDA                  */
	subq	$-16 * SIZE, B		/* bias B; stores use -16*SIZE(B)  */

	/* If A is not 16-byte aligned, one leading row per column group */
	/* is peeled off with scalar loads, so the aligned vector loops  */
	/* run over MM = M - 1 rows instead of M.                        */
	movq	M, MM
	leaq	-1(M), %rax
	testq	$SIZE, A
	cmovne	%rax, MM

	/* LDA an odd multiple of SIZE: consecutive columns alternate    */
	/* 16-byte alignment, handled by the shifted-load path at .L50.  */
	testq	$SIZE, LDA
	jne	.L50

	movq	N, J		/* J = N / 8 groups of eight columns      */
	sarq	$3, J
	jle	.L20
	ALIGN_4

.L11:
	movq	A, AO1
	leaq	(A, LDA, 4), AO2	/* AO2 = 4 columns ahead of AO1    */
	leaq	(A, LDA, 8), A		/* advance A to the next 8 columns */

	/* A moved by 8*LDA (a multiple of 16 in this path), so testing  */
	/* A is equivalent to testing the alignment of AO1: peel one row */
	/* when the column group starts misaligned.                      */
	testq	$SIZE, A
	je	.L12

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3), %xmm3

	movsd	0 * SIZE(AO2), %xmm4
	movsd	0 * SIZE(AO2, LDA), %xmm5
150 movsd 0 * SIZE(AO2, LDA, 2), %xmm6 151 movsd 0 * SIZE(AO2, LDA3), %xmm7 152 153 unpcklpd %xmm1, %xmm0 154 unpcklpd %xmm3, %xmm2 155 unpcklpd %xmm5, %xmm4 156 unpcklpd %xmm7, %xmm6 157 158 movaps %xmm0, -16 * SIZE(B) 159 movaps %xmm2, -14 * SIZE(B) 160 movaps %xmm4, -12 * SIZE(B) 161 movaps %xmm6, -10 * SIZE(B) 162 163 addq $1 * SIZE, AO1 164 addq $1 * SIZE, AO2 165 subq $-8 * SIZE, B 166 ALIGN_3 167 168.L12: 169 movq MM, I 170 sarq $3, I 171 jle .L14 172 ALIGN_4 173 174.L13: 175#ifdef PREFETCH 176 PREFETCH PREFETCHSIZE * SIZE(AO1) 177#endif 178 179 MOVAPS 0 * SIZE(AO1), %xmm0 180 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 181 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 182 MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 183 184 movaps %xmm0, %xmm8 185 unpcklpd %xmm1, %xmm0 186 movaps %xmm2, %xmm9 187 unpcklpd %xmm3, %xmm2 188 189#ifdef PREFETCH 190 PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) 191#endif 192 193 MOVAPS 0 * SIZE(AO2), %xmm4 194 MOVAPS 0 * SIZE(AO2, LDA), %xmm5 195 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 196 MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 197 198 movaps %xmm4, %xmm10 199 unpcklpd %xmm5, %xmm4 200 movaps %xmm6, %xmm11 201 unpcklpd %xmm7, %xmm6 202 203#ifdef PREFETCHW 204 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 205#endif 206 207 movaps %xmm0, -16 * SIZE(B) 208 movaps %xmm2, -14 * SIZE(B) 209 movaps %xmm4, -12 * SIZE(B) 210 movaps %xmm6, -10 * SIZE(B) 211 212 unpckhpd %xmm1, %xmm8 213 unpckhpd %xmm3, %xmm9 214 unpckhpd %xmm5, %xmm10 215 unpckhpd %xmm7, %xmm11 216 217#ifdef PREFETCHW 218 PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) 219#endif 220 221 movaps %xmm8, -8 * SIZE(B) 222 movaps %xmm9, -6 * SIZE(B) 223 movaps %xmm10, -4 * SIZE(B) 224 movaps %xmm11, -2 * SIZE(B) 225 226#ifdef PREFETCH 227 PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) 228#endif 229 230 MOVAPS 2 * SIZE(AO1), %xmm0 231 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 232 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 233 MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 234 235 movaps %xmm0, %xmm8 236 unpcklpd %xmm1, %xmm0 237 movaps %xmm2, %xmm9 238 unpcklpd 
%xmm3, %xmm2 239 240#ifdef PREFETCH 241 PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) 242#endif 243 244 MOVAPS 2 * SIZE(AO2), %xmm4 245 MOVAPS 2 * SIZE(AO2, LDA), %xmm5 246 MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 247 MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 248 249 movaps %xmm4, %xmm10 250 unpcklpd %xmm5, %xmm4 251 movaps %xmm6, %xmm11 252 unpcklpd %xmm7, %xmm6 253 254#ifdef PREFETCHW 255 PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) 256#endif 257 258 movaps %xmm0, 0 * SIZE(B) 259 movaps %xmm2, 2 * SIZE(B) 260 movaps %xmm4, 4 * SIZE(B) 261 movaps %xmm6, 6 * SIZE(B) 262 263 unpckhpd %xmm1, %xmm8 264 unpckhpd %xmm3, %xmm9 265 unpckhpd %xmm5, %xmm10 266 unpckhpd %xmm7, %xmm11 267 268#ifdef PREFETCHW 269 PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) 270#endif 271 272 movaps %xmm8, 8 * SIZE(B) 273 movaps %xmm9, 10 * SIZE(B) 274 movaps %xmm10, 12 * SIZE(B) 275 movaps %xmm11, 14 * SIZE(B) 276 277#ifdef PREFETCH 278 PREFETCH PREFETCHSIZE * SIZE(AO2) 279#endif 280 281 MOVAPS 4 * SIZE(AO1), %xmm0 282 MOVAPS 4 * SIZE(AO1, LDA), %xmm1 283 MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 284 MOVAPS 4 * SIZE(AO1, LDA3), %xmm3 285 286 movaps %xmm0, %xmm8 287 unpcklpd %xmm1, %xmm0 288 movaps %xmm2, %xmm9 289 unpcklpd %xmm3, %xmm2 290 291#ifdef PREFETCH 292 PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) 293#endif 294 295 MOVAPS 4 * SIZE(AO2), %xmm4 296 MOVAPS 4 * SIZE(AO2, LDA), %xmm5 297 MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 298 MOVAPS 4 * SIZE(AO2, LDA3), %xmm7 299 300 movaps %xmm4, %xmm10 301 unpcklpd %xmm5, %xmm4 302 movaps %xmm6, %xmm11 303 unpcklpd %xmm7, %xmm6 304 305#ifdef PREFETCHW 306 PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) 307#endif 308 309 movaps %xmm0, 16 * SIZE(B) 310 movaps %xmm2, 18 * SIZE(B) 311 movaps %xmm4, 20 * SIZE(B) 312 movaps %xmm6, 22 * SIZE(B) 313 314 unpckhpd %xmm1, %xmm8 315 unpckhpd %xmm3, %xmm9 316 unpckhpd %xmm5, %xmm10 317 unpckhpd %xmm7, %xmm11 318 319#ifdef PREFETCHW 320 PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) 321#endif 322 323 movaps %xmm8, 24 * SIZE(B) 324 movaps %xmm9, 26 * 
SIZE(B) 325 movaps %xmm10, 28 * SIZE(B) 326 movaps %xmm11, 30 * SIZE(B) 327 328#ifdef PREFETCH 329 PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) 330#endif 331 332 MOVAPS 6 * SIZE(AO1), %xmm0 333 MOVAPS 6 * SIZE(AO1, LDA), %xmm1 334 MOVAPS 6 * SIZE(AO1, LDA, 2), %xmm2 335 MOVAPS 6 * SIZE(AO1, LDA3), %xmm3 336 337 movaps %xmm0, %xmm8 338 unpcklpd %xmm1, %xmm0 339 movaps %xmm2, %xmm9 340 unpcklpd %xmm3, %xmm2 341 342#ifdef PREFETCH 343 PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) 344#endif 345 346 MOVAPS 6 * SIZE(AO2), %xmm4 347 MOVAPS 6 * SIZE(AO2, LDA), %xmm5 348 MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 349 MOVAPS 6 * SIZE(AO2, LDA3), %xmm7 350 351 movaps %xmm4, %xmm10 352 unpcklpd %xmm5, %xmm4 353 movaps %xmm6, %xmm11 354 unpcklpd %xmm7, %xmm6 355 356#ifdef PREFETCHW 357 PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) 358#endif 359 360 movaps %xmm0, 32 * SIZE(B) 361 movaps %xmm2, 34 * SIZE(B) 362 movaps %xmm4, 36 * SIZE(B) 363 movaps %xmm6, 38 * SIZE(B) 364 365 unpckhpd %xmm1, %xmm8 366 unpckhpd %xmm3, %xmm9 367 unpckhpd %xmm5, %xmm10 368 unpckhpd %xmm7, %xmm11 369 370#ifdef PREFETCHW 371 PREFETCHW (PREFETCHSIZE * 8 + 56) * SIZE(B) 372#endif 373 374 movaps %xmm8, 40 * SIZE(B) 375 movaps %xmm9, 42 * SIZE(B) 376 movaps %xmm10, 44 * SIZE(B) 377 movaps %xmm11, 46 * SIZE(B) 378 379 addq $8 * SIZE, AO1 380 addq $8 * SIZE, AO2 381 subq $-64 * SIZE, B 382 383 decq I 384 jg .L13 385 ALIGN_4 386 387.L14: 388 testq $4, MM 389 jle .L16 390 391 MOVAPS 0 * SIZE(AO1), %xmm0 392 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 393 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 394 MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 395 396 MOVAPS 0 * SIZE(AO2), %xmm4 397 MOVAPS 0 * SIZE(AO2, LDA), %xmm5 398 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 399 MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 400 401 movaps %xmm0, %xmm8 402 unpcklpd %xmm1, %xmm0 403 movaps %xmm2, %xmm9 404 unpcklpd %xmm3, %xmm2 405 406 movaps %xmm4, %xmm10 407 unpcklpd %xmm5, %xmm4 408 movaps %xmm6, %xmm11 409 unpcklpd %xmm7, %xmm6 410 411 movaps %xmm0, -16 * SIZE(B) 412 movaps %xmm2, -14 
* SIZE(B) 413 movaps %xmm4, -12 * SIZE(B) 414 movaps %xmm6, -10 * SIZE(B) 415 416 unpckhpd %xmm1, %xmm8 417 unpckhpd %xmm3, %xmm9 418 unpckhpd %xmm5, %xmm10 419 unpckhpd %xmm7, %xmm11 420 421 movaps %xmm8, -8 * SIZE(B) 422 movaps %xmm9, -6 * SIZE(B) 423 movaps %xmm10, -4 * SIZE(B) 424 movaps %xmm11, -2 * SIZE(B) 425 426 MOVAPS 2 * SIZE(AO1), %xmm0 427 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 428 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 429 MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 430 431 MOVAPS 2 * SIZE(AO2), %xmm4 432 MOVAPS 2 * SIZE(AO2, LDA), %xmm5 433 MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 434 MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 435 436 movaps %xmm0, %xmm8 437 unpcklpd %xmm1, %xmm0 438 movaps %xmm2, %xmm9 439 unpcklpd %xmm3, %xmm2 440 441 movaps %xmm4, %xmm10 442 unpcklpd %xmm5, %xmm4 443 movaps %xmm6, %xmm11 444 unpcklpd %xmm7, %xmm6 445 446 movaps %xmm0, 0 * SIZE(B) 447 movaps %xmm2, 2 * SIZE(B) 448 movaps %xmm4, 4 * SIZE(B) 449 movaps %xmm6, 6 * SIZE(B) 450 451 unpckhpd %xmm1, %xmm8 452 unpckhpd %xmm3, %xmm9 453 unpckhpd %xmm5, %xmm10 454 unpckhpd %xmm7, %xmm11 455 456 movaps %xmm8, 8 * SIZE(B) 457 movaps %xmm9, 10 * SIZE(B) 458 movaps %xmm10, 12 * SIZE(B) 459 movaps %xmm11, 14 * SIZE(B) 460 461 addq $4 * SIZE, AO1 462 addq $4 * SIZE, AO2 463 subq $-32 * SIZE, B 464 ALIGN_4 465 466.L16: 467 testq $2, MM 468 jle .L18 469 470 MOVAPS 0 * SIZE(AO1), %xmm0 471 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 472 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 473 MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 474 475 MOVAPS 0 * SIZE(AO2), %xmm4 476 MOVAPS 0 * SIZE(AO2, LDA), %xmm5 477 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 478 MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 479 480 movaps %xmm0, %xmm8 481 unpcklpd %xmm1, %xmm0 482 movaps %xmm2, %xmm9 483 unpcklpd %xmm3, %xmm2 484 485 movaps %xmm4, %xmm10 486 unpcklpd %xmm5, %xmm4 487 movaps %xmm6, %xmm11 488 unpcklpd %xmm7, %xmm6 489 490 movaps %xmm0, -16 * SIZE(B) 491 movaps %xmm2, -14 * SIZE(B) 492 movaps %xmm4, -12 * SIZE(B) 493 movaps %xmm6, -10 * SIZE(B) 494 495 unpckhpd %xmm1, %xmm8 496 
unpckhpd %xmm3, %xmm9 497 unpckhpd %xmm5, %xmm10 498 unpckhpd %xmm7, %xmm11 499 500 movaps %xmm8, -8 * SIZE(B) 501 movaps %xmm9, -6 * SIZE(B) 502 movaps %xmm10, -4 * SIZE(B) 503 movaps %xmm11, -2 * SIZE(B) 504 505 addq $2 * SIZE, AO1 506 addq $2 * SIZE, AO2 507 subq $-16 * SIZE, B 508 ALIGN_4 509 510.L18: 511 testq $1, MM 512 jle .L19 513 514 movsd 0 * SIZE(AO1), %xmm0 515 movsd 0 * SIZE(AO1, LDA), %xmm1 516 movsd 0 * SIZE(AO1, LDA, 2), %xmm2 517 movsd 0 * SIZE(AO1, LDA3), %xmm3 518 519 movsd 0 * SIZE(AO2), %xmm4 520 movsd 0 * SIZE(AO2, LDA), %xmm5 521 movsd 0 * SIZE(AO2, LDA, 2), %xmm6 522 movsd 0 * SIZE(AO2, LDA3), %xmm7 523 524 unpcklpd %xmm1, %xmm0 525 unpcklpd %xmm3, %xmm2 526 unpcklpd %xmm5, %xmm4 527 unpcklpd %xmm7, %xmm6 528 529 movaps %xmm0, -16 * SIZE(B) 530 movaps %xmm2, -14 * SIZE(B) 531 movaps %xmm4, -12 * SIZE(B) 532 movaps %xmm6, -10 * SIZE(B) 533 534 subq $-8 * SIZE, B 535 ALIGN_4 536 537.L19: 538 decq J 539 jg .L11 540 ALIGN_4 541 542.L20: 543 testq $4, N 544 jle .L30 545 546 movq A, AO1 547 leaq (A, LDA, 2), AO2 548 leaq (A, LDA, 4), A 549 550 testq $SIZE, A 551 je .L22 552 553 movsd 0 * SIZE(AO1), %xmm0 554 movsd 0 * SIZE(AO1, LDA), %xmm1 555 movsd 0 * SIZE(AO2), %xmm2 556 movsd 0 * SIZE(AO2, LDA), %xmm3 557 558 unpcklpd %xmm1, %xmm0 559 unpcklpd %xmm3, %xmm2 560 561 movaps %xmm0, -16 * SIZE(B) 562 movaps %xmm2, -14 * SIZE(B) 563 564 addq $1 * SIZE, AO1 565 addq $1 * SIZE, AO2 566 subq $-4 * SIZE, B 567 ALIGN_3 568 569.L22: 570 movq MM, I 571 sarq $3, I 572 jle .L24 573 ALIGN_4 574 575.L23: 576#ifdef PREFETCH 577 PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) 578#endif 579 580 MOVAPS 0 * SIZE(AO1), %xmm0 581 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 582 MOVAPS 0 * SIZE(AO2), %xmm2 583 MOVAPS 0 * SIZE(AO2, LDA), %xmm3 584 585 movaps %xmm0, %xmm4 586 unpcklpd %xmm1, %xmm0 587 movaps %xmm2, %xmm6 588 unpcklpd %xmm3, %xmm2 589 590 unpckhpd %xmm1, %xmm4 591 unpckhpd %xmm3, %xmm6 592 593#ifdef PREFETCHW 594 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 595#endif 596 597 
movaps %xmm0, -16 * SIZE(B) 598 movaps %xmm2, -14 * SIZE(B) 599 movaps %xmm4, -12 * SIZE(B) 600 movaps %xmm6, -10 * SIZE(B) 601 602#ifdef PREFETCH 603 PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) 604#endif 605 606 MOVAPS 2 * SIZE(AO1), %xmm0 607 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 608 MOVAPS 2 * SIZE(AO2), %xmm2 609 MOVAPS 2 * SIZE(AO2, LDA), %xmm3 610 611 movaps %xmm0, %xmm4 612 unpcklpd %xmm1, %xmm0 613 movaps %xmm2, %xmm6 614 unpcklpd %xmm3, %xmm2 615 616 unpckhpd %xmm1, %xmm4 617 unpckhpd %xmm3, %xmm6 618 619#ifdef PREFETCHW 620 PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) 621#endif 622 623 movaps %xmm0, -8 * SIZE(B) 624 movaps %xmm2, -6 * SIZE(B) 625 movaps %xmm4, -4 * SIZE(B) 626 movaps %xmm6, -2 * SIZE(B) 627 628#ifdef PREFETCH 629 PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) 630#endif 631 632 MOVAPS 4 * SIZE(AO1), %xmm0 633 MOVAPS 4 * SIZE(AO1, LDA), %xmm1 634 MOVAPS 4 * SIZE(AO2), %xmm2 635 MOVAPS 4 * SIZE(AO2, LDA), %xmm3 636 637 movaps %xmm0, %xmm4 638 unpcklpd %xmm1, %xmm0 639 movaps %xmm2, %xmm6 640 unpcklpd %xmm3, %xmm2 641 642 unpckhpd %xmm1, %xmm4 643 unpckhpd %xmm3, %xmm6 644 645#ifdef PREFETCHW 646 PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) 647#endif 648 649 movaps %xmm0, 0 * SIZE(B) 650 movaps %xmm2, 2 * SIZE(B) 651 movaps %xmm4, 4 * SIZE(B) 652 movaps %xmm6, 6 * SIZE(B) 653 654#ifdef PREFETCH 655 PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA) 656#endif 657 658 MOVAPS 6 * SIZE(AO1), %xmm0 659 MOVAPS 6 * SIZE(AO1, LDA), %xmm1 660 MOVAPS 6 * SIZE(AO2), %xmm2 661 MOVAPS 6 * SIZE(AO2, LDA), %xmm3 662 663 movaps %xmm0, %xmm4 664 unpcklpd %xmm1, %xmm0 665 movaps %xmm2, %xmm6 666 unpcklpd %xmm3, %xmm2 667 668 unpckhpd %xmm1, %xmm4 669 unpckhpd %xmm3, %xmm6 670 671#ifdef PREFETCHW 672 PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) 673#endif 674 675 movaps %xmm0, 8 * SIZE(B) 676 movaps %xmm2, 10 * SIZE(B) 677 movaps %xmm4, 12 * SIZE(B) 678 movaps %xmm6, 14 * SIZE(B) 679 680 addq $8 * SIZE, AO1 681 addq $8 * SIZE, AO2 682 subq $-32 * SIZE, B 683 684 decq I 685 jg .L23 686 
ALIGN_4 687 688.L24: 689 testq $4, MM 690 jle .L26 691 692 MOVAPS 0 * SIZE(AO1), %xmm0 693 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 694 MOVAPS 0 * SIZE(AO2), %xmm2 695 MOVAPS 0 * SIZE(AO2, LDA), %xmm3 696 697 movaps %xmm0, %xmm4 698 unpcklpd %xmm1, %xmm0 699 movaps %xmm2, %xmm6 700 unpcklpd %xmm3, %xmm2 701 702 unpckhpd %xmm1, %xmm4 703 unpckhpd %xmm3, %xmm6 704 705 movaps %xmm0, -16 * SIZE(B) 706 movaps %xmm2, -14 * SIZE(B) 707 movaps %xmm4, -12 * SIZE(B) 708 movaps %xmm6, -10 * SIZE(B) 709 710 MOVAPS 2 * SIZE(AO1), %xmm0 711 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 712 MOVAPS 2 * SIZE(AO2), %xmm2 713 MOVAPS 2 * SIZE(AO2, LDA), %xmm3 714 715 movaps %xmm0, %xmm4 716 unpcklpd %xmm1, %xmm0 717 movaps %xmm2, %xmm6 718 unpcklpd %xmm3, %xmm2 719 720 unpckhpd %xmm1, %xmm4 721 unpckhpd %xmm3, %xmm6 722 723 movaps %xmm0, -8 * SIZE(B) 724 movaps %xmm2, -6 * SIZE(B) 725 movaps %xmm4, -4 * SIZE(B) 726 movaps %xmm6, -2 * SIZE(B) 727 728 addq $4 * SIZE, AO1 729 addq $4 * SIZE, AO2 730 subq $-16 * SIZE, B 731 ALIGN_4 732 733.L26: 734 testq $2, MM 735 jle .L28 736 737 MOVAPS 0 * SIZE(AO1), %xmm0 738 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 739 MOVAPS 0 * SIZE(AO2), %xmm2 740 MOVAPS 0 * SIZE(AO2, LDA), %xmm3 741 742 movaps %xmm0, %xmm4 743 unpcklpd %xmm1, %xmm0 744 movaps %xmm2, %xmm6 745 unpcklpd %xmm3, %xmm2 746 747 unpckhpd %xmm1, %xmm4 748 unpckhpd %xmm3, %xmm6 749 750 movaps %xmm0, -16 * SIZE(B) 751 movaps %xmm2, -14 * SIZE(B) 752 movaps %xmm4, -12 * SIZE(B) 753 movaps %xmm6, -10 * SIZE(B) 754 755 addq $2 * SIZE, AO1 756 addq $2 * SIZE, AO2 757 subq $-8 * SIZE, B 758 ALIGN_4 759 760.L28: 761 testq $1, MM 762 jle .L30 763 764 movsd 0 * SIZE(AO1), %xmm0 765 movsd 0 * SIZE(AO1, LDA), %xmm1 766 movsd 0 * SIZE(AO2), %xmm2 767 movsd 0 * SIZE(AO2, LDA), %xmm3 768 769 unpcklpd %xmm1, %xmm0 770 unpcklpd %xmm3, %xmm2 771 772 movaps %xmm0, -16 * SIZE(B) 773 movaps %xmm2, -14 * SIZE(B) 774 subq $-4 * SIZE, B 775 ALIGN_4 776 777.L30: 778 testq $2, N 779 jle .L40 780 781 movq A, AO1 782 leaq (A, LDA), AO2 783 
leaq (A, LDA, 2), A 784 785 testq $SIZE, A 786 je .L32 787 788 movsd 0 * SIZE(AO1), %xmm0 789 movsd 0 * SIZE(AO2), %xmm1 790 791 unpcklpd %xmm1, %xmm0 792 793 movaps %xmm0, -16 * SIZE(B) 794 795 addq $1 * SIZE, AO1 796 addq $1 * SIZE, AO2 797 subq $-2 * SIZE, B 798 ALIGN_3 799 800.L32: 801 movq MM, I 802 sarq $3, I 803 jle .L34 804 ALIGN_4 805 806.L33: 807#ifdef PREFETCH 808 PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) 809#endif 810 811 MOVAPS 0 * SIZE(AO1), %xmm0 812 MOVAPS 0 * SIZE(AO2), %xmm1 813 MOVAPS 2 * SIZE(AO1), %xmm2 814 MOVAPS 2 * SIZE(AO2), %xmm3 815 816 movaps %xmm0, %xmm4 817 unpcklpd %xmm1, %xmm0 818 movaps %xmm2, %xmm6 819 unpcklpd %xmm3, %xmm2 820 821 unpckhpd %xmm1, %xmm4 822 unpckhpd %xmm3, %xmm6 823 824#ifdef PREFETCHW 825 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 826#endif 827 828 movaps %xmm0, -16 * SIZE(B) 829 movaps %xmm4, -14 * SIZE(B) 830 movaps %xmm2, -12 * SIZE(B) 831 movaps %xmm6, -10 * SIZE(B) 832 833#ifdef PREFETCH 834 PREFETCH PREFETCHSIZE * 4 * SIZE(AO2) 835#endif 836 837 MOVAPS 4 * SIZE(AO1), %xmm0 838 MOVAPS 4 * SIZE(AO2), %xmm1 839 MOVAPS 6 * SIZE(AO1), %xmm2 840 MOVAPS 6 * SIZE(AO2), %xmm3 841 842 movaps %xmm0, %xmm4 843 unpcklpd %xmm1, %xmm0 844 movaps %xmm2, %xmm6 845 unpcklpd %xmm3, %xmm2 846 847 unpckhpd %xmm1, %xmm4 848 unpckhpd %xmm3, %xmm6 849 850#ifdef PREFETCHW 851 PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) 852#endif 853 854 movaps %xmm0, -8 * SIZE(B) 855 movaps %xmm4, -6 * SIZE(B) 856 movaps %xmm2, -4 * SIZE(B) 857 movaps %xmm6, -2 * SIZE(B) 858 859 addq $8 * SIZE, AO1 860 addq $8 * SIZE, AO2 861 subq $-16 * SIZE, B 862 863 decq I 864 jg .L33 865 ALIGN_4 866 867.L34: 868 testq $4, MM 869 jle .L36 870 871 MOVAPS 0 * SIZE(AO1), %xmm0 872 MOVAPS 0 * SIZE(AO2), %xmm1 873 MOVAPS 2 * SIZE(AO1), %xmm2 874 MOVAPS 2 * SIZE(AO2), %xmm3 875 876 movaps %xmm0, %xmm4 877 unpcklpd %xmm1, %xmm0 878 unpckhpd %xmm1, %xmm4 879 880 movaps %xmm2, %xmm6 881 unpcklpd %xmm3, %xmm2 882 unpckhpd %xmm3, %xmm6 883 884 movaps %xmm0, -16 * SIZE(B) 885 
movaps %xmm4, -14 * SIZE(B) 886 movaps %xmm2, -12 * SIZE(B) 887 movaps %xmm6, -10 * SIZE(B) 888 889 addq $4 * SIZE, AO1 890 addq $4 * SIZE, AO2 891 subq $-8 * SIZE, B 892 ALIGN_4 893 894.L36: 895 testq $2, MM 896 jle .L38 897 898 MOVAPS 0 * SIZE(AO1), %xmm0 899 MOVAPS 0 * SIZE(AO2), %xmm1 900 901 movaps %xmm0, %xmm2 902 unpcklpd %xmm1, %xmm0 903 unpckhpd %xmm1, %xmm2 904 905 movaps %xmm0, -16 * SIZE(B) 906 movaps %xmm2, -14 * SIZE(B) 907 908 addq $2 * SIZE, AO1 909 addq $2 * SIZE, AO2 910 subq $-4 * SIZE, B 911 ALIGN_4 912 913.L38: 914 testq $1, MM 915 jle .L40 916 917 movsd 0 * SIZE(AO1), %xmm0 918 movsd 0 * SIZE(AO2), %xmm1 919 920 unpcklpd %xmm1, %xmm0 921 922 movaps %xmm0, -16 * SIZE(B) 923 subq $-2 * SIZE, B 924 ALIGN_4 925 926.L40: 927 testq $1, N 928 jle .L999 929 930 movq A, AO1 931 932 testq $SIZE, A 933 jne .L45 934 935 movq MM, I 936 sarq $3, I 937 jle .L42 938 ALIGN_4 939 940.L41: 941#ifdef PREFETCH 942 PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) 943#endif 944 945 MOVAPS 0 * SIZE(AO1), %xmm0 946 MOVAPS 2 * SIZE(AO1), %xmm1 947 MOVAPS 4 * SIZE(AO1), %xmm2 948 MOVAPS 6 * SIZE(AO1), %xmm3 949 950#ifdef PREFETCHW 951 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 952#endif 953 954 movaps %xmm0, -16 * SIZE(B) 955 movaps %xmm1, -14 * SIZE(B) 956 movaps %xmm2, -12 * SIZE(B) 957 movaps %xmm3, -10 * SIZE(B) 958 959 addq $8 * SIZE, AO1 960 subq $-8 * SIZE, B 961 962 decq I 963 jg .L41 964 ALIGN_4 965 966.L42: 967 testq $4, MM 968 jle .L43 969 970 MOVAPS 0 * SIZE(AO1), %xmm0 971 MOVAPS 2 * SIZE(AO1), %xmm1 972 973 movaps %xmm0, -16 * SIZE(B) 974 movaps %xmm1, -14 * SIZE(B) 975 976 addq $4 * SIZE, AO1 977 subq $-4 * SIZE, B 978 ALIGN_4 979 980.L43: 981 testq $2, MM 982 jle .L44 983 984 MOVAPS 0 * SIZE(AO1), %xmm0 985 986 movaps %xmm0, -16 * SIZE(B) 987 988 addq $2 * SIZE, AO1 989 subq $-2 * SIZE, B 990 ALIGN_4 991 992.L44: 993 testq $1, MM 994 jle .L999 995 996 movsd 0 * SIZE(AO1), %xmm0 997 998 movlpd %xmm0, -16 * SIZE(B) 999 jmp .L999 1000 ALIGN_4 1001 1002.L45: 1003 
/* Tail of the last-column path with misaligned A (.L45): AO1 is an   */
/* odd multiple of SIZE, so (AO1 - SIZE) is 16-byte aligned.  xmm0    */
/* always carries the aligned pair ending at the current element;     */
/* each step loads the next aligned pair and shufpd-shifts by one     */
/* element so the stores to B stay 16-byte aligned.                   */
	MOVAPS	-1 * SIZE(AO1), %xmm0

	movq	M, I
	sarq	$3, I
	/* BUGFIX: this was "jle .L46", i.e. the guard fell straight     */
	/* into the 8-rows-per-iteration loop body even when M < 8,      */
	/* copying eight rows that do not exist (and overrunning the     */
	/* column).  Skip to the remainder code at .L47 instead, as the  */
	/* analogous guards (.L13/.L14, .L41/.L42, ...) do.              */
	jle	.L47
	ALIGN_4

.L46:	/* main loop: 8 rows per iteration */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 8 * SIZE(AO1)
#endif

	MOVAPS	1 * SIZE(AO1), %xmm1
	MOVAPS	3 * SIZE(AO1), %xmm2
	MOVAPS	5 * SIZE(AO1), %xmm3
	MOVAPS	7 * SIZE(AO1), %xmm4

	/* shift every pair down by one element */
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)
	movaps	%xmm2, -12 * SIZE(B)
	movaps	%xmm3, -10 * SIZE(B)

	movaps	%xmm4, %xmm0	/* carry last pair into the next step    */

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L46
	ALIGN_4

.L47:	/* remainder: 4 rows */
	testq	$4, M
	jle	.L48

	MOVAPS	1 * SIZE(AO1), %xmm1
	MOVAPS	3 * SIZE(AO1), %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)

	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, B
	ALIGN_4

.L48:	/* remainder: 2 rows */
	testq	$2, M
	jle	.L49

	MOVAPS	1 * SIZE(AO1), %xmm1

	shufpd	$1, %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	movaps	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L49:	/* remainder: final row, sitting in the high half of xmm0 */
	testq	$1, M
	jle	.L999

	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0, -16 * SIZE(B)
	jmp	.L999
	ALIGN_4

/* ---- LDA an odd multiple of SIZE: column starts alternate 16-byte  */
/* alignment, so odd columns are read with shifted loads (movsd +     */
/* shufpd merging).  Same N-grouping structure as the even-LDA path.  */
.L50:
	movq	N, J
	sarq	$3, J
	jle	.L60
	ALIGN_4

.L51:
	movq	A, AO1
	leaq	(A, LDA, 4), AO2
	leaq	(A, LDA, 8), A

	testq	$SIZE, A
	je	.L52

	/* peel the first row of this 8-column group */
	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3), %xmm3
	movsd	0 * SIZE(AO2), %xmm4
	movsd	0 * SIZE(AO2, LDA), %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
movsd 0 * SIZE(AO2, LDA3), %xmm7 1110 1111 unpcklpd %xmm1, %xmm0 1112 unpcklpd %xmm3, %xmm2 1113 unpcklpd %xmm5, %xmm4 1114 unpcklpd %xmm7, %xmm6 1115 1116 movaps %xmm0, -16 * SIZE(B) 1117 movaps %xmm2, -14 * SIZE(B) 1118 movaps %xmm4, -12 * SIZE(B) 1119 movaps %xmm6, -10 * SIZE(B) 1120 1121 addq $1 * SIZE, AO1 1122 addq $1 * SIZE, AO2 1123 subq $-8 * SIZE, B 1124 ALIGN_3 1125 1126.L52: 1127 MOVAPS -1 * SIZE(AO1, LDA), %xmm9 1128 MOVAPS -1 * SIZE(AO1, LDA3), %xmm10 1129 MOVAPS -1 * SIZE(AO2, LDA), %xmm11 1130 MOVAPS -1 * SIZE(AO2, LDA3), %xmm12 1131 1132 movq MM, I 1133 sarq $3, I 1134 jle .L54 1135 ALIGN_4 1136 1137.L53: 1138#ifdef PREFETCH 1139 PREFETCH PREFETCHSIZE * SIZE(AO1) 1140#endif 1141 1142 MOVAPS 0 * SIZE(AO1), %xmm0 1143 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 1144 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 1145 MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 1146 1147#ifdef PREFETCH 1148 PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) 1149#endif 1150 1151 MOVAPS 0 * SIZE(AO2), %xmm4 1152 MOVAPS 1 * SIZE(AO2, LDA), %xmm5 1153 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 1154 MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 1155 1156 movsd %xmm0, %xmm9 1157 movsd %xmm2, %xmm10 1158 movsd %xmm4, %xmm11 1159 movsd %xmm6, %xmm12 1160 1161#ifdef PREFETCHW 1162 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 1163#endif 1164 1165 movaps %xmm9, -16 * SIZE(B) 1166 movaps %xmm10, -14 * SIZE(B) 1167 movaps %xmm11, -12 * SIZE(B) 1168 movaps %xmm12, -10 * SIZE(B) 1169 1170 shufpd $1, %xmm1, %xmm0 1171 shufpd $1, %xmm3, %xmm2 1172 shufpd $1, %xmm5, %xmm4 1173 shufpd $1, %xmm7, %xmm6 1174 1175#ifdef PREFETCHW 1176 PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) 1177#endif 1178 1179 movaps %xmm0, -8 * SIZE(B) 1180 movaps %xmm2, -6 * SIZE(B) 1181 movaps %xmm4, -4 * SIZE(B) 1182 movaps %xmm6, -2 * SIZE(B) 1183 1184#ifdef PREFETCH 1185 PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) 1186#endif 1187 1188 MOVAPS 2 * SIZE(AO1), %xmm0 1189 MOVAPS 3 * SIZE(AO1, LDA), %xmm9 1190 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 1191 MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 
1192 1193#ifdef PREFETCH 1194 PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) 1195#endif 1196 1197 MOVAPS 2 * SIZE(AO2), %xmm4 1198 MOVAPS 3 * SIZE(AO2, LDA), %xmm11 1199 MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 1200 MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 1201 1202 movsd %xmm0, %xmm1 1203 movsd %xmm2, %xmm3 1204 movsd %xmm4, %xmm5 1205 movsd %xmm6, %xmm7 1206 1207#ifdef PREFETCHW 1208 PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) 1209#endif 1210 1211 movaps %xmm1, 0 * SIZE(B) 1212 movaps %xmm3, 2 * SIZE(B) 1213 movaps %xmm5, 4 * SIZE(B) 1214 movaps %xmm7, 6 * SIZE(B) 1215 1216 shufpd $1, %xmm9, %xmm0 1217 shufpd $1, %xmm10, %xmm2 1218 shufpd $1, %xmm11, %xmm4 1219 shufpd $1, %xmm12, %xmm6 1220 1221#ifdef PREFETCHW 1222 PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) 1223#endif 1224 1225 movaps %xmm0, 8 * SIZE(B) 1226 movaps %xmm2, 10 * SIZE(B) 1227 movaps %xmm4, 12 * SIZE(B) 1228 movaps %xmm6, 14 * SIZE(B) 1229 1230#ifdef PREFETCH 1231 PREFETCH PREFETCHSIZE * SIZE(AO2) 1232#endif 1233 1234 MOVAPS 4 * SIZE(AO1), %xmm0 1235 MOVAPS 5 * SIZE(AO1, LDA), %xmm1 1236 MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 1237 MOVAPS 5 * SIZE(AO1, LDA3), %xmm3 1238 1239#ifdef PREFETCH 1240 PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) 1241#endif 1242 1243 MOVAPS 4 * SIZE(AO2), %xmm4 1244 MOVAPS 5 * SIZE(AO2, LDA), %xmm5 1245 MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 1246 MOVAPS 5 * SIZE(AO2, LDA3), %xmm7 1247 1248 movsd %xmm0, %xmm9 1249 movsd %xmm2, %xmm10 1250 movsd %xmm4, %xmm11 1251 movsd %xmm6, %xmm12 1252 1253#ifdef PREFETCHW 1254 PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) 1255#endif 1256 1257 movaps %xmm9, 16 * SIZE(B) 1258 movaps %xmm10, 18 * SIZE(B) 1259 movaps %xmm11, 20 * SIZE(B) 1260 movaps %xmm12, 22 * SIZE(B) 1261 1262 shufpd $1, %xmm1, %xmm0 1263 shufpd $1, %xmm3, %xmm2 1264 shufpd $1, %xmm5, %xmm4 1265 shufpd $1, %xmm7, %xmm6 1266 1267#ifdef PREFETCHW 1268 PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) 1269#endif 1270 1271 movaps %xmm0, 24 * SIZE(B) 1272 movaps %xmm2, 26 * SIZE(B) 1273 movaps %xmm4, 28 * SIZE(B) 
1274 movaps %xmm6, 30 * SIZE(B) 1275 1276#ifdef PREFETCH 1277 PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) 1278#endif 1279 1280 MOVAPS 6 * SIZE(AO1), %xmm0 1281 MOVAPS 7 * SIZE(AO1, LDA), %xmm9 1282 MOVAPS 6 * SIZE(AO1, LDA, 2), %xmm2 1283 MOVAPS 7 * SIZE(AO1, LDA3), %xmm10 1284 1285#ifdef PREFETCH 1286 PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) 1287#endif 1288 1289 MOVAPS 6 * SIZE(AO2), %xmm4 1290 MOVAPS 7 * SIZE(AO2, LDA), %xmm11 1291 MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 1292 MOVAPS 7 * SIZE(AO2, LDA3), %xmm12 1293 1294 movsd %xmm0, %xmm1 1295 movsd %xmm2, %xmm3 1296 movsd %xmm4, %xmm5 1297 movsd %xmm6, %xmm7 1298 1299#ifdef PREFETCHW 1300 PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) 1301#endif 1302 1303 movaps %xmm1, 32 * SIZE(B) 1304 movaps %xmm3, 34 * SIZE(B) 1305 movaps %xmm5, 36 * SIZE(B) 1306 movaps %xmm7, 38 * SIZE(B) 1307 1308 shufpd $1, %xmm9, %xmm0 1309 shufpd $1, %xmm10, %xmm2 1310 shufpd $1, %xmm11, %xmm4 1311 shufpd $1, %xmm12, %xmm6 1312 1313#ifdef PREFETCHW 1314 PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) 1315#endif 1316 movaps %xmm0, 40 * SIZE(B) 1317 movaps %xmm2, 42 * SIZE(B) 1318 movaps %xmm4, 44 * SIZE(B) 1319 movaps %xmm6, 46 * SIZE(B) 1320 1321 addq $8 * SIZE, AO1 1322 addq $8 * SIZE, AO2 1323 subq $-64 * SIZE, B 1324 1325 decq I 1326 jg .L53 1327 ALIGN_4 1328 1329.L54: 1330 testq $4, MM 1331 jle .L56 1332 1333 MOVAPS 0 * SIZE(AO1), %xmm0 1334 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 1335 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 1336 MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 1337 MOVAPS 0 * SIZE(AO2), %xmm4 1338 MOVAPS 1 * SIZE(AO2, LDA), %xmm5 1339 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 1340 MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 1341 1342 movsd %xmm0, %xmm9 1343 movsd %xmm2, %xmm10 1344 movsd %xmm4, %xmm11 1345 movsd %xmm6, %xmm12 1346 1347 movaps %xmm9, -16 * SIZE(B) 1348 movaps %xmm10, -14 * SIZE(B) 1349 movaps %xmm11, -12 * SIZE(B) 1350 movaps %xmm12, -10 * SIZE(B) 1351 1352 shufpd $1, %xmm1, %xmm0 1353 shufpd $1, %xmm3, %xmm2 1354 shufpd $1, %xmm5, %xmm4 1355 shufpd $1, 
%xmm7, %xmm6 1356 1357 movaps %xmm0, -8 * SIZE(B) 1358 movaps %xmm2, -6 * SIZE(B) 1359 movaps %xmm4, -4 * SIZE(B) 1360 movaps %xmm6, -2 * SIZE(B) 1361 1362 MOVAPS 2 * SIZE(AO1), %xmm0 1363 MOVAPS 3 * SIZE(AO1, LDA), %xmm9 1364 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 1365 MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 1366 MOVAPS 2 * SIZE(AO2), %xmm4 1367 MOVAPS 3 * SIZE(AO2, LDA), %xmm11 1368 MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 1369 MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 1370 1371 movsd %xmm0, %xmm1 1372 movsd %xmm2, %xmm3 1373 movsd %xmm4, %xmm5 1374 movsd %xmm6, %xmm7 1375 1376 movaps %xmm1, 0 * SIZE(B) 1377 movaps %xmm3, 2 * SIZE(B) 1378 movaps %xmm5, 4 * SIZE(B) 1379 movaps %xmm7, 6 * SIZE(B) 1380 1381 shufpd $1, %xmm9, %xmm0 1382 shufpd $1, %xmm10, %xmm2 1383 shufpd $1, %xmm11, %xmm4 1384 shufpd $1, %xmm12, %xmm6 1385 1386 movaps %xmm0, 8 * SIZE(B) 1387 movaps %xmm2, 10 * SIZE(B) 1388 movaps %xmm4, 12 * SIZE(B) 1389 movaps %xmm6, 14 * SIZE(B) 1390 1391 addq $4 * SIZE, AO1 1392 addq $4 * SIZE, AO2 1393 subq $-32 * SIZE, B 1394 ALIGN_4 1395 1396.L56: 1397 testq $2, MM 1398 jle .L58 1399 1400 MOVAPS 0 * SIZE(AO1), %xmm0 1401 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 1402 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 1403 MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 1404 MOVAPS 0 * SIZE(AO2), %xmm4 1405 MOVAPS 1 * SIZE(AO2, LDA), %xmm5 1406 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 1407 MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 1408 1409 movsd %xmm0, %xmm9 1410 movsd %xmm2, %xmm10 1411 movsd %xmm4, %xmm11 1412 movsd %xmm6, %xmm12 1413 1414 movaps %xmm9, -16 * SIZE(B) 1415 movaps %xmm10, -14 * SIZE(B) 1416 movaps %xmm11, -12 * SIZE(B) 1417 movaps %xmm12, -10 * SIZE(B) 1418 1419 shufpd $1, %xmm1, %xmm0 1420 shufpd $1, %xmm3, %xmm2 1421 shufpd $1, %xmm5, %xmm4 1422 shufpd $1, %xmm7, %xmm6 1423 1424 movaps %xmm0, -8 * SIZE(B) 1425 movaps %xmm2, -6 * SIZE(B) 1426 movaps %xmm4, -4 * SIZE(B) 1427 movaps %xmm6, -2 * SIZE(B) 1428 1429 addq $2 * SIZE, AO1 1430 addq $2 * SIZE, AO2 1431 subq $-16 * SIZE, B 1432 ALIGN_4 1433 1434.L58: 1435 
testq $1, MM 1436 jle .L59 1437 1438 movsd 0 * SIZE(AO1), %xmm0 1439 movsd 0 * SIZE(AO1, LDA), %xmm1 1440 movsd 0 * SIZE(AO1, LDA, 2), %xmm2 1441 movsd 0 * SIZE(AO1, LDA3), %xmm3 1442 movsd 0 * SIZE(AO2), %xmm4 1443 movsd 0 * SIZE(AO2, LDA), %xmm5 1444 movsd 0 * SIZE(AO2, LDA, 2), %xmm6 1445 movsd 0 * SIZE(AO2, LDA3), %xmm7 1446 1447 unpcklpd %xmm1, %xmm0 1448 unpcklpd %xmm3, %xmm2 1449 unpcklpd %xmm5, %xmm4 1450 unpcklpd %xmm7, %xmm6 1451 1452 movaps %xmm0, -16 * SIZE(B) 1453 movaps %xmm2, -14 * SIZE(B) 1454 movaps %xmm4, -12 * SIZE(B) 1455 movaps %xmm6, -10 * SIZE(B) 1456 1457 subq $-8 * SIZE, B 1458 ALIGN_4 1459 1460.L59: 1461 decq J 1462 jg .L51 1463 ALIGN_4 1464 1465.L60: 1466 testq $4, N 1467 jle .L70 1468 1469 movq A, AO1 1470 leaq (A, LDA, 2), AO2 1471 leaq (A, LDA, 4), A 1472 1473 testq $SIZE, A 1474 je .L62 1475 1476 movsd 0 * SIZE(AO1), %xmm0 1477 movsd 0 * SIZE(AO1, LDA), %xmm1 1478 movsd 0 * SIZE(AO2), %xmm2 1479 movsd 0 * SIZE(AO2, LDA), %xmm3 1480 1481 unpcklpd %xmm1, %xmm0 1482 unpcklpd %xmm3, %xmm2 1483 1484 movaps %xmm0, -16 * SIZE(B) 1485 movaps %xmm2, -14 * SIZE(B) 1486 1487 addq $1 * SIZE, AO1 1488 addq $1 * SIZE, AO2 1489 subq $-4 * SIZE, B 1490 ALIGN_3 1491 1492.L62: 1493 movaps -1 * SIZE(AO1, LDA), %xmm5 1494 movaps -1 * SIZE(AO2, LDA), %xmm7 1495 1496 movq MM, I 1497 sarq $3, I 1498 jle .L64 1499 ALIGN_4 1500 1501.L63: 1502#ifdef PREFETCH 1503 PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) 1504#endif 1505 1506 MOVAPS 0 * SIZE(AO1), %xmm0 1507 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 1508 MOVAPS 0 * SIZE(AO2), %xmm2 1509 MOVAPS 1 * SIZE(AO2, LDA), %xmm3 1510 1511 movsd %xmm0, %xmm5 1512 movsd %xmm2, %xmm7 1513 shufpd $1, %xmm1, %xmm0 1514 shufpd $1, %xmm3, %xmm2 1515 1516#ifdef PREFETCHW 1517 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 1518#endif 1519 1520 movaps %xmm5, -16 * SIZE(B) 1521 movaps %xmm7, -14 * SIZE(B) 1522 movaps %xmm0, -12 * SIZE(B) 1523 movaps %xmm2, -10 * SIZE(B) 1524 1525#ifdef PREFETCH 1526 PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) 
#endif

	/* Rows 2..3: the carried registers swap roles (xmm1/xmm3 now hold the
	   odd-column carry, xmm5/xmm7 are reloaded) so no extra moves are needed. */
	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2), %xmm2
	MOVAPS	3 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm5, %xmm0
	shufpd	$1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif

	movaps	%xmm1, -8 * SIZE(B)
	movaps	%xmm3, -6 * SIZE(B)
	movaps	%xmm0, -4 * SIZE(B)
	movaps	%xmm2, -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2)
#endif

	/* Rows 4..5. */
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	5 * SIZE(AO1, LDA), %xmm1
	MOVAPS	4 * SIZE(AO2), %xmm2
	MOVAPS	5 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif

	movaps	%xmm5, 0 * SIZE(B)
	movaps	%xmm7, 2 * SIZE(B)
	movaps	%xmm0, 4 * SIZE(B)
	movaps	%xmm2, 6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2, LDA)
#endif

	/* Rows 6..7; xmm5/xmm7 leave this step holding the carry for the next
	   loop iteration (elements 7..8 of the odd columns). */
	MOVAPS	6 * SIZE(AO1), %xmm0
	MOVAPS	7 * SIZE(AO1, LDA), %xmm5
	MOVAPS	6 * SIZE(AO2), %xmm2
	MOVAPS	7 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm5, %xmm0
	shufpd	$1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif

	movaps	%xmm1, 8 * SIZE(B)
	movaps	%xmm3, 10 * SIZE(B)
	movaps	%xmm0, 12 * SIZE(B)
	movaps	%xmm2, 14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L63
	ALIGN_4

.L64:
	/* 4-row remainder of the 4-column block. */
	testq	$4, MM
	jle	.L66

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm3, %xmm2

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm7, -14 * SIZE(B)
	movaps	%xmm0, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

	/* Rows 2..3 of the 4-row remainder. */
	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2), %xmm2
	MOVAPS	3 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	shufpd	$1, %xmm5, %xmm0
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm7, %xmm2

	movaps	%xmm1, -8 * SIZE(B)
	movaps	%xmm3, -6 * SIZE(B)
	movaps	%xmm0, -4 * SIZE(B)
	movaps	%xmm2, -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L66:
	/* 2-row remainder of the 4-column block. */
	testq	$2, MM
	jle	.L68

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm7, -14 * SIZE(B)
	movaps	%xmm0, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L68:
	/* 1-row remainder of the 4-column block: scalar gather + pair. */
	testq	$1, MM
	jle	.L70

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	subq	$-4 * SIZE, B
	ALIGN_4

.L70:
	/* Handle a remaining pair of columns, if any. */
	testq	$2, N
	jle	.L80

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	/* Peel one row if A is misaligned so the vector loop can load aligned. */
	testq	$SIZE, A
	je	.L72

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L72:
	/* Seed the carry register with the element preceding the second column. */
	MOVAPS	-1 * SIZE(AO2), %xmm5

	movq	MM, I
	sarq	$3, I			/* I = rows / 8 */
	jle	.L74			/* fewer than 8 rows: go to the tail */
	ALIGN_4

.L73:
	/* Unrolled-by-8 copy of 2 columns.  The second column is one element
	   misaligned, so it is loaded late (offsets 1,3,5,7) and recombined with
	   movsd (low-qword merge) and shufpd (lane shift) against the carry. */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	3 * SIZE(AO2), %xmm3

	movsd	%xmm0, %xmm5		/* (c0[0], c1[0]) */
	shufpd	$1, %xmm1, %xmm0	/* (c0[1], c1[1]) */
	movsd	%xmm2, %xmm1		/* (c0[2], c1[2]) */
	shufpd	$1, %xmm3, %xmm2	/* (c0[3], c1[3]) */

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm0, -14 * SIZE(B)
	movaps	%xmm1, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO2)
#endif

	/* Rows 4..7; xmm5 leaves holding c1[7..8] as the next iteration's carry. */
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	5 * SIZE(AO2), %xmm1
	MOVAPS	6 * SIZE(AO1), %xmm2
	MOVAPS	7 * SIZE(AO2), %xmm5

	movsd	%xmm0, %xmm3
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm1
	shufpd	$1, %xmm5, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif

	movaps	%xmm3, -8 * SIZE(B)
	movaps	%xmm0, -6 * SIZE(B)
	movaps	%xmm1, -4 * SIZE(B)
	movaps	%xmm2, -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L73
	ALIGN_4

.L74:
	/* 4-row remainder of the 2-column block. */
	testq	$4, MM
	jle	.L76

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	3 * SIZE(AO2), %xmm3

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm0, -14 * SIZE(B)
	movaps	%xmm1, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

	movaps	%xmm3, %xmm5		/* keep c1[3..4] as carry for the next tail */

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L76:
	/* 2-row remainder of the 2-column block. */
	testq	$2, MM
	jle	.L78

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0
1803 1804 movaps %xmm5, -16 * SIZE(B) 1805 movaps %xmm0, -14 * SIZE(B) 1806 1807 addq $2 * SIZE, AO1 1808 addq $2 * SIZE, AO2 1809 subq $-4 * SIZE, B 1810 ALIGN_4 1811 1812.L78: 1813 testq $1, MM 1814 jle .L80 1815 1816 movsd 0 * SIZE(AO1), %xmm0 1817 movsd 0 * SIZE(AO2), %xmm1 1818 1819 unpcklpd %xmm1, %xmm0 1820 1821 movaps %xmm0, -16 * SIZE(B) 1822 subq $-2 * SIZE, B 1823 ALIGN_4 1824 1825.L80: 1826 testq $1, N 1827 jle .L999 1828 1829 movq A, AO1 1830 1831 testq $SIZE, A 1832 jne .L85 1833 1834 movq MM, I 1835 sarq $3, I 1836 jle .L82 1837 ALIGN_4 1838 1839.L81: 1840#ifdef PREFETCH 1841 PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) 1842#endif 1843 1844 MOVAPS 0 * SIZE(AO1), %xmm0 1845 MOVAPS 2 * SIZE(AO1), %xmm2 1846 MOVAPS 4 * SIZE(AO1), %xmm4 1847 MOVAPS 6 * SIZE(AO1), %xmm6 1848 1849#ifdef PREFETCHW 1850 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 1851#endif 1852 1853 movaps %xmm0, -16 * SIZE(B) 1854 movaps %xmm2, -14 * SIZE(B) 1855 movaps %xmm4, -12 * SIZE(B) 1856 movaps %xmm6, -10 * SIZE(B) 1857 1858 addq $8 * SIZE, AO1 1859 subq $-8 * SIZE, B 1860 1861 decq I 1862 jg .L81 1863 ALIGN_4 1864 1865.L82: 1866 testq $4, MM 1867 jle .L83 1868 1869 MOVAPS 0 * SIZE(AO1), %xmm0 1870 MOVAPS 2 * SIZE(AO1), %xmm2 1871 1872 movaps %xmm0, -16 * SIZE(B) 1873 movaps %xmm2, -14 * SIZE(B) 1874 1875 addq $4 * SIZE, AO1 1876 subq $-4 * SIZE, B 1877 ALIGN_4 1878 1879.L83: 1880 testq $2, MM 1881 jle .L84 1882 1883 MOVAPS 0 * SIZE(AO1), %xmm0 1884 1885 movaps %xmm0, -16 * SIZE(B) 1886 1887 addq $2 * SIZE, AO1 1888 subq $-2 * SIZE, B 1889 ALIGN_4 1890 1891.L84: 1892 testq $1, MM 1893 jle .L999 1894 1895 movsd 0 * SIZE(AO1), %xmm0 1896 1897 movlpd %xmm0, -16 * SIZE(B) 1898 jmp .L999 1899 ALIGN_4 1900 1901.L85: 1902 MOVAPS -1 * SIZE(AO1), %xmm0 1903 1904 movq M, I 1905 sarq $3, I 1906 jle .L86 1907 ALIGN_4 1908 1909.L86: 1910#ifdef PREFETCH 1911 PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) 1912#endif 1913 1914 MOVAPS 1 * SIZE(AO1), %xmm1 1915 MOVAPS 3 * SIZE(AO1), %xmm2 1916 MOVAPS 5 * SIZE(AO1), 
%xmm3 1917 MOVAPS 7 * SIZE(AO1), %xmm4 1918 1919 shufpd $1, %xmm1, %xmm0 1920 shufpd $1, %xmm2, %xmm1 1921 shufpd $1, %xmm3, %xmm2 1922 shufpd $1, %xmm4, %xmm3 1923 1924#ifdef PREFETCHW 1925 PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) 1926#endif 1927 1928 movaps %xmm0, -16 * SIZE(B) 1929 movaps %xmm1, -14 * SIZE(B) 1930 movaps %xmm2, -12 * SIZE(B) 1931 movaps %xmm3, -10 * SIZE(B) 1932 1933 movaps %xmm4, %xmm0 1934 1935 addq $8 * SIZE, AO1 1936 subq $-8 * SIZE, B 1937 1938 decq I 1939 jg .L86 1940 ALIGN_4 1941 1942.L87: 1943 testq $4, M 1944 jle .L88 1945 1946 MOVAPS 1 * SIZE(AO1), %xmm1 1947 MOVAPS 3 * SIZE(AO1), %xmm2 1948 1949 shufpd $1, %xmm1, %xmm0 1950 shufpd $1, %xmm2, %xmm1 1951 1952 movaps %xmm0, -16 * SIZE(B) 1953 movaps %xmm1, -14 * SIZE(B) 1954 1955 movaps %xmm2, %xmm0 1956 1957 addq $4 * SIZE, AO1 1958 addq $4 * SIZE, B 1959 ALIGN_4 1960 1961.L88: 1962 testq $2, M 1963 jle .L89 1964 1965 MOVAPS 1 * SIZE(AO1), %xmm1 1966 1967 shufpd $1, %xmm1, %xmm0 1968 1969 movaps %xmm0, -16 * SIZE(B) 1970 1971 movaps %xmm1, %xmm0 1972 1973 addq $2 * SIZE, AO1 1974 subq $-2 * SIZE, B 1975 ALIGN_4 1976 1977.L89: 1978 testq $1, M 1979 jle .L999 1980 1981 shufpd $1, %xmm0, %xmm0 1982 1983 movlpd %xmm0, -16 * SIZE(B) 1984 ALIGN_4 1985 1986.L999: 1987#ifdef WINDOWS_ABI 1988 movups 0(%rsp), %xmm6 1989 movups 16(%rsp), %xmm7 1990 movups 32(%rsp), %xmm8 1991 movups 48(%rsp), %xmm9 1992 movups 64(%rsp), %xmm10 1993 movups 80(%rsp), %xmm11 1994 movups 96(%rsp), %xmm12 1995 1996 addq $STACKSIZE, %rsp 1997#endif 1998 1999 popq %r12 2000 popq %r13 2001 2002#ifdef WINDOWS_ABI 2003 popq %r14 2004 popq %r15 2005#endif 2006 ret 2007 2008 EPILOGUE 2009