/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPT/ION) HOWEVER CAUSED  AND ON ANY THEORY OF   */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define OLD_M %rdi 43#define OLD_N %rsi 44#define OLD_K %rdx 45 46#define M %r13 47#define N %r14 48#define K %r15 49 50#define A %rcx 51#define B %r8 52#define C %r9 53#define LDC %rbp 54 55#define I %r11 56#define AO %rdi 57#define BO %rsi 58#define CO1 %rbx 59#define CO2 %rdx 60#define BB %r12 61 62#define PREA %r10 63 64#ifndef WINDOWS_ABI 65 66#define STACKSIZE 128 67 68#define OLD_LDC 8 + STACKSIZE(%rsp) 69#define OLD_OFFSET 16 + STACKSIZE(%rsp) 70 71#define ALPHA_R 48(%rsp) 72#define ALPHA_I 56(%rsp) 73#define J 64(%rsp) 74#define OFFSET 72(%rsp) 75#define KK 80(%rsp) 76#define KKK 88(%rsp) 77 78#else 79 80#define STACKSIZE 512 81 82#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) 83#define OLD_A 48 + STACKSIZE(%rsp) 84#define OLD_B 56 + STACKSIZE(%rsp) 85#define OLD_C 64 + STACKSIZE(%rsp) 86#define OLD_LDC 72 + STACKSIZE(%rsp) 87#define OLD_OFFSET 80 + STACKSIZE(%rsp) 88 89#define ALPHA_R 224(%rsp) 90#define ALPHA_I 232(%rsp) 91#define J 240(%rsp) 92#define OFFSET 248(%rsp) 93#define KK 256(%rsp) 94#define KKK 264(%rsp) 95 96#endif 97 98#define PREFETCHSIZE 4 99#define PREFETCH prefetcht0 100 101#if defined(NN) || defined(NT) || defined(TN) || defined(TT) 102#define ADD1 addpd 103#define ADD2 addpd 104#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) 105#define ADD1 addpd 106#define ADD2 addpd 107#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) 108#define ADD1 addpd 109#define ADD2 addpd 110#else 111#define ADD1 addpd 112#define ADD2 subpd 113#endif 114 115 PROLOGUE 116 PROFCODE 117 118 subq $STACKSIZE, %rsp 119 120 movq %rbx, 0(%rsp) 121 movq %rbp, 8(%rsp) 122 movq %r12, 16(%rsp) 123 movq %r13, 24(%rsp) 124 movq %r14, 32(%rsp) 125 movq %r15, 40(%rsp) 126 127#ifdef WINDOWS_ABI 128 movq %rdi, 48(%rsp) 129 movq %rsi, 56(%rsp) 130 movups %xmm6, 64(%rsp) 131 movups %xmm7, 80(%rsp) 132 movups %xmm8, 96(%rsp) 133 movups 
%xmm9, 112(%rsp) 134 movups %xmm10, 128(%rsp) 135 movups %xmm11, 144(%rsp) 136 movups %xmm12, 160(%rsp) 137 movups %xmm13, 176(%rsp) 138 movups %xmm14, 192(%rsp) 139 movups %xmm15, 208(%rsp) 140 141 movq ARG1, OLD_M 142 movq ARG2, OLD_N 143 movq ARG3, OLD_K 144 movq OLD_A, A 145 movq OLD_B, B 146 movq OLD_C, C 147 movq OLD_LDC, LDC 148#ifdef TRMMKERNEL 149 movq OLD_OFFSET, %r11 150#endif 151 movaps %xmm3, %xmm0 152 movsd OLD_ALPHA_I, %xmm1 153#else 154 movq OLD_LDC, LDC 155#ifdef TRMMKERNEL 156 movq OLD_OFFSET, %r11 157#endif 158 159#endif 160 161 movlps %xmm0, ALPHA_R 162 movlps %xmm1, ALPHA_I 163 164 subq $-16 * SIZE, A 165 subq $-16 * SIZE, B 166 167 movq OLD_M, M 168 movq OLD_N, N 169 movq OLD_K, K 170 171 salq $ZBASE_SHIFT, LDC 172 173#ifdef TRMMKERNEL 174 movq %r11, OFFSET 175#ifndef LEFT 176 negq %r11 177#endif 178 movq %r11, KK 179#endif 180 testq M, M 181 jle .L999 182 183 movq N, J 184 sarq $2, J 185 NOBRANCH 186 jle .L20 187 ALIGN_4 188 189.L01: 190#if defined(TRMMKERNEL) && defined(LEFT) 191 movq OFFSET, %rax 192 movq %rax, KK 193#endif 194 195 movq C, CO1 196 leaq (C, LDC, 2), CO2 197 movq A, AO 198 199 movq K, %rax 200 salq $ZBASE_SHIFT + 2, %rax 201 leaq (B, %rax), BB 202 203 movq M, I 204 ALIGN_4 205 206.L11: 207 prefetcht2 -16 * SIZE(BB) 208 subq $-8 * SIZE, BB 209 210#if !defined(TRMMKERNEL) || \ 211 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 212 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 213 214 movq B, BO 215#else 216 movq B, BO 217 218 movq KK, %rax 219 salq $ZBASE_SHIFT, %rax 220 leaq (AO, %rax, 1), AO 221 leaq (BO, %rax, 4), BO 222#endif 223 224 PADDING 225 xorps %xmm1, %xmm1 226 xorps %xmm2, %xmm2 227 xorps %xmm3, %xmm3 228 xorps %xmm4, %xmm4 229 230 xorps %xmm8, %xmm8 231 prefetcht0 1 * SIZE(CO1) 232 xorps %xmm9, %xmm9 233 xorps %xmm10, %xmm10 234 prefetcht0 3 * SIZE(CO1, LDC) 235 xorps %xmm11, %xmm11 236 237 movaps -16 * SIZE(AO), %xmm0 238 239 xorps %xmm12, %xmm12 240 xorps %xmm13, %xmm13 241 
prefetcht0 1 * SIZE(CO2) 242 xorps %xmm14, %xmm14 243 xorps %xmm15, %xmm15 244 prefetcht0 3 * SIZE(CO2, LDC) 245 246#ifndef TRMMKERNEL 247 movq K, %rax 248#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 249 movq K, %rax 250 subq KK, %rax 251 movq %rax, KKK 252#else 253 movq KK, %rax 254#ifdef LEFT 255 addq $1, %rax 256#else 257 addq $4, %rax 258#endif 259 movq %rax, KKK 260#endif 261 sarq $2, %rax 262 NOBRANCH 263 jle .L15 264 ALIGN_3 265 266.L12: 267 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 268 269 ADD1 %xmm1, %xmm12 270 movaps -16 * SIZE(BO), %xmm1 271 ADD2 %xmm2, %xmm13 272 pshufd $0x4e, %xmm1, %xmm2 273 mulpd %xmm0, %xmm1 274 mulpd %xmm0, %xmm2 275 276 ADD1 %xmm3, %xmm14 277 movaps -14 * SIZE(BO), %xmm3 278 ADD2 %xmm4, %xmm15 279 pshufd $0x4e, %xmm3, %xmm4 280 mulpd %xmm0, %xmm3 281 mulpd %xmm0, %xmm4 282 283 ADD1 %xmm1, %xmm8 284 movaps -12 * SIZE(BO), %xmm1 285 ADD2 %xmm2, %xmm9 286 pshufd $0x4e, %xmm1, %xmm2 287 mulpd %xmm0, %xmm1 288 mulpd %xmm0, %xmm2 289 290 ADD1 %xmm3, %xmm10 291 movaps -10 * SIZE(BO), %xmm3 292 ADD2 %xmm4, %xmm11 293 pshufd $0x4e, %xmm3, %xmm4 294 movaps -14 * SIZE(AO), %xmm5 295 mulpd %xmm0, %xmm3 296 mulpd %xmm0, %xmm4 297 298 ADD1 %xmm1, %xmm12 299 movaps -8 * SIZE(BO), %xmm1 300 ADD2 %xmm2, %xmm13 301 pshufd $0x4e, %xmm1, %xmm2 302 mulpd %xmm5, %xmm1 303 mulpd %xmm5, %xmm2 304 305 ADD1 %xmm3, %xmm14 306 movaps -6 * SIZE(BO), %xmm3 307 ADD2 %xmm4, %xmm15 308 pshufd $0x4e, %xmm3, %xmm4 309 mulpd %xmm5, %xmm3 310 mulpd %xmm5, %xmm4 311 312 ADD1 %xmm1, %xmm8 313 movaps -4 * SIZE(BO), %xmm1 314 ADD2 %xmm2, %xmm9 315 pshufd $0x4e, %xmm1, %xmm2 316 mulpd %xmm5, %xmm1 317 mulpd %xmm5, %xmm2 318 319 ADD1 %xmm3, %xmm10 320 movaps -2 * SIZE(BO), %xmm3 321 ADD2 %xmm4, %xmm11 322 pshufd $0x4e, %xmm3, %xmm4 323 movaps -12 * SIZE(AO), %xmm0 324 mulpd %xmm5, %xmm3 325 mulpd %xmm5, %xmm4 326 327 ADD1 %xmm1, %xmm12 328 movaps 0 * SIZE(BO), %xmm1 329 ADD2 %xmm2, %xmm13 330 pshufd $0x4e, %xmm1, %xmm2 331 mulpd %xmm0, 
%xmm1 332 mulpd %xmm0, %xmm2 333 334 ADD1 %xmm3, %xmm14 335 movaps 2 * SIZE(BO), %xmm3 336 ADD2 %xmm4, %xmm15 337 pshufd $0x4e, %xmm3, %xmm4 338 mulpd %xmm0, %xmm3 339 mulpd %xmm0, %xmm4 340 341 ADD1 %xmm1, %xmm8 342 movaps 4 * SIZE(BO), %xmm1 343 ADD2 %xmm2, %xmm9 344 pshufd $0x4e, %xmm1, %xmm2 345 mulpd %xmm0, %xmm1 346 mulpd %xmm0, %xmm2 347 348 ADD1 %xmm3, %xmm10 349 movaps 6 * SIZE(BO), %xmm3 350 ADD2 %xmm4, %xmm11 351 pshufd $0x4e, %xmm3, %xmm4 352 mulpd %xmm0, %xmm3 353 movaps -10 * SIZE(AO), %xmm5 354 mulpd %xmm0, %xmm4 355 356 ADD1 %xmm1, %xmm12 357 movaps 8 * SIZE(BO), %xmm1 358 ADD2 %xmm2, %xmm13 359 pshufd $0x4e, %xmm1, %xmm2 360 mulpd %xmm5, %xmm1 361 mulpd %xmm5, %xmm2 362 363 ADD1 %xmm3, %xmm14 364 movaps 10 * SIZE(BO), %xmm3 365 ADD2 %xmm4, %xmm15 366 pshufd $0x4e, %xmm3, %xmm4 367 mulpd %xmm5, %xmm3 368 PADDING; 369 mulpd %xmm5, %xmm4 370 371 ADD1 %xmm1, %xmm8 372 movaps 12 * SIZE(BO), %xmm1 373 ADD2 %xmm2, %xmm9 374 pshufd $0x4e, %xmm1, %xmm2 375 mulpd %xmm5, %xmm1 376 PADDING; 377 mulpd %xmm5, %xmm2 378 379 ADD1 %xmm3, %xmm10 380 movaps 14 * SIZE(BO), %xmm3 381 ADD2 %xmm4, %xmm11 382 pshufd $0x4e, %xmm3, %xmm4 383 mulpd %xmm5, %xmm3 384 movaps -8 * SIZE(AO), %xmm0 385 mulpd %xmm5, %xmm4 386 387 subq $-32 * SIZE, BO 388 subq $-8 * SIZE, AO 389 390 subq $1, %rax 391 BRANCH 392 jg .L12 393 ALIGN_3 394 395.L15: 396 movddup ALPHA_R, %xmm6 397 movddup ALPHA_I, %xmm7 398 399#ifndef TRMMKERNEL 400 movq K, %rax 401#else 402 movq KKK, %rax 403#endif 404 andq $3, %rax # if (k & 1) 405 BRANCH 406 je .L18 407 ALIGN_3 408 409.L16: 410 ADD1 %xmm1, %xmm12 411 movaps -16 * SIZE(BO), %xmm1 412 ADD2 %xmm2, %xmm13 413 pshufd $0x4e, %xmm1, %xmm2 414 mulpd %xmm0, %xmm1 415 mulpd %xmm0, %xmm2 416 417 ADD1 %xmm3, %xmm14 418 movaps -14 * SIZE(BO), %xmm3 419 ADD2 %xmm4, %xmm15 420 pshufd $0x4e, %xmm3, %xmm4 421 mulpd %xmm0, %xmm3 422 mulpd %xmm0, %xmm4 423 424 ADD1 %xmm1, %xmm8 425 movaps -12 * SIZE(BO), %xmm1 426 ADD2 %xmm2, %xmm9 427 pshufd $0x4e, %xmm1, %xmm2 428 mulpd 
%xmm0, %xmm1 429 mulpd %xmm0, %xmm2 430 431 ADD1 %xmm3, %xmm10 432 movaps -10 * SIZE(BO), %xmm3 433 ADD2 %xmm4, %xmm11 434 pshufd $0x4e, %xmm3, %xmm4 435 mulpd %xmm0, %xmm3 436 mulpd %xmm0, %xmm4 437 438 movaps -14 * SIZE(AO), %xmm0 439 440 addq $2 * SIZE, AO 441 addq $8 * SIZE, BO 442 443 subq $1, %rax 444 BRANCH 445 jg .L16 446 ALIGN_3 447 448.L18: 449 ADD1 %xmm1, %xmm12 450 ADD2 %xmm2, %xmm13 451 ADD1 %xmm3, %xmm14 452 ADD2 %xmm4, %xmm15 453 454 pcmpeqb %xmm0, %xmm0 455 psllq $63, %xmm0 456 457#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 458 defined(RR) || defined(RC) || defined(CR) || defined(CC) 459 shufps $0x40, %xmm0, %xmm0 460 461 xorps %xmm0, %xmm8 462 xorps %xmm0, %xmm10 463 xorps %xmm0, %xmm12 464 xorps %xmm0, %xmm14 465#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) 466 shufps $0x04, %xmm0, %xmm0 467 468 xorps %xmm0, %xmm9 469 xorps %xmm0, %xmm11 470 xorps %xmm0, %xmm13 471 xorps %xmm0, %xmm15 472#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) 473 shufps $0x40, %xmm0, %xmm0 474 475 xorps %xmm0, %xmm9 476 xorps %xmm0, %xmm11 477 xorps %xmm0, %xmm13 478 xorps %xmm0, %xmm15 479#endif 480 481 haddpd %xmm9, %xmm8 482 haddpd %xmm11, %xmm10 483 haddpd %xmm13, %xmm12 484 haddpd %xmm15, %xmm14 485 486 pshufd $0x4e, %xmm8, %xmm9 487 pshufd $0x4e, %xmm10, %xmm11 488 pshufd $0x4e, %xmm12, %xmm13 489 pshufd $0x4e, %xmm14, %xmm15 490 491 mulpd %xmm6, %xmm8 492 mulpd %xmm7, %xmm9 493 mulpd %xmm6, %xmm10 494 mulpd %xmm7, %xmm11 495 496 mulpd %xmm6, %xmm12 497 mulpd %xmm7, %xmm13 498 mulpd %xmm6, %xmm14 499 mulpd %xmm7, %xmm15 500 501 addsubpd %xmm9, %xmm8 502 addsubpd %xmm11, %xmm10 503 addsubpd %xmm13, %xmm12 504 addsubpd %xmm15, %xmm14 505 506 testq $15, CO1 507 NOBRANCH 508 jne .L18x 509 510#ifndef TRMMKERNEL 511 movaps (CO1), %xmm0 512 movaps (CO1, LDC), %xmm1 513 movaps (CO2), %xmm2 514 movaps (CO2, LDC), %xmm3 515 516 addpd %xmm0, %xmm8 517 addpd %xmm1, %xmm10 518 addpd %xmm2, %xmm12 519 addpd %xmm3, %xmm14 
520#endif 521 522 movaps %xmm8, (CO1) 523 movaps %xmm10, (CO1, LDC) 524 movaps %xmm12, (CO2) 525 movaps %xmm14, (CO2, LDC) 526 527#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 528 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 529 movq K, %rax 530 subq KKK, %rax 531 salq $ZBASE_SHIFT, %rax 532 leaq (AO, %rax, 1), AO 533 leaq (BO, %rax, 4), BO 534#endif 535 536#if defined(TRMMKERNEL) && defined(LEFT) 537 addq $1, KK 538#endif 539 540 addq $2 * SIZE, CO1 541 addq $2 * SIZE, CO2 542 decq I 543 BRANCH 544 jg .L11 545 546#if defined(TRMMKERNEL) && !defined(LEFT) 547 addq $4, KK 548#endif 549 550 leaq (C, LDC, 4), C 551 movq BO, B 552 553 subq $1, J 554 BRANCH 555 jg .L01 556 jmp .L20 557 ALIGN_4 558 559.L18x: 560#ifndef TRMMKERNEL 561 movups (CO1), %xmm0 562 movups (CO1, LDC), %xmm1 563 movups (CO2), %xmm2 564 movups (CO2, LDC), %xmm3 565 566 addpd %xmm0, %xmm8 567 addpd %xmm1, %xmm10 568 addpd %xmm2, %xmm12 569 addpd %xmm3, %xmm14 570#endif 571 572 movups %xmm8, (CO1) 573 movups %xmm10, (CO1, LDC) 574 movups %xmm12, (CO2) 575 movups %xmm14, (CO2, LDC) 576 577#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 578 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 579 movq K, %rax 580 subq KKK, %rax 581 salq $ZBASE_SHIFT, %rax 582 leaq (AO, %rax, 1), AO 583 leaq (BO, %rax, 4), BO 584#endif 585 586#if defined(TRMMKERNEL) && defined(LEFT) 587 addq $1, KK 588#endif 589 590 addq $2 * SIZE, CO1 591 addq $2 * SIZE, CO2 592 decq I 593 BRANCH 594 jg .L11 595 596#if defined(TRMMKERNEL) && !defined(LEFT) 597 addq $4, KK 598#endif 599 600 leaq (C, LDC, 4), C 601 movq BO, B 602 603 subq $1, J 604 BRANCH 605 jg .L01 606 ALIGN_4 607 608.L20: 609 testq $2, N 610 BRANCH 611 jle .L30 612 613#if defined(TRMMKERNEL) && defined(LEFT) 614 movq OFFSET, %rax 615 movq %rax, KK 616#endif 617 618 movq C, CO1 619 leaq (C, LDC, 1), CO2 620 movq A, AO 621 622 movq M, I 623 ALIGN_4 624 625.L21: 626#if !defined(TRMMKERNEL) || \ 627 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 628 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 629 630 movq B, BO 631#else 632 movq B, BO 633 634 movq KK, %rax 635 salq $ZBASE_SHIFT, %rax 636 leaq (AO, %rax, 1), AO 637 leaq (BO, %rax, 2), BO 638#endif 639 640 xorps %xmm1, %xmm1 641 movaps -16 * SIZE(AO), %xmm0 642 xorps %xmm2, %xmm2 643 xorps %xmm3, %xmm3 644 xorps %xmm4, %xmm4 645 646 xorps %xmm8, %xmm8 647 prefetcht0 1 * SIZE(CO1) 648 xorps %xmm9, %xmm9 649 prefetcht0 2 * SIZE(CO2) 650 xorps %xmm10, %xmm10 651 xorps %xmm11, %xmm11 652 653#ifndef TRMMKERNEL 654 movq K, %rax 655#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 656 movq K, %rax 657 subq KK, %rax 658 movq %rax, KKK 659#else 660 movq KK, %rax 661#ifdef LEFT 662 addq $1, %rax 663#else 664 addq $2, %rax 665#endif 666 movq %rax, KKK 667#endif 668 sarq $2, %rax 669 NOBRANCH 670 jle .L25 671 ALIGN_3 672 673.L22: 674 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 675 676 ADD1 %xmm1, %xmm8 677 movaps -16 * SIZE(BO), %xmm1 678 ADD2 %xmm2, %xmm9 679 pshufd $0x4e, %xmm1, %xmm2 680 mulpd %xmm0, %xmm1 681 mulpd %xmm0, %xmm2 682 683 ADD1 %xmm3, %xmm10 684 movaps -14 * SIZE(BO), %xmm3 685 ADD2 %xmm4, %xmm11 686 pshufd $0x4e, %xmm3, %xmm4 687 mulpd %xmm0, %xmm3 688 mulpd %xmm0, %xmm4 689 690 movaps -14 * SIZE(AO), %xmm0 691 692 ADD1 %xmm1, %xmm8 693 movaps -12 * SIZE(BO), %xmm1 694 ADD2 %xmm2, %xmm9 695 pshufd $0x4e, %xmm1, %xmm2 696 mulpd %xmm0, %xmm1 697 mulpd %xmm0, %xmm2 698 699 ADD1 %xmm3, %xmm10 700 movaps -10 * SIZE(BO), %xmm3 701 ADD2 %xmm4, %xmm11 702 pshufd $0x4e, %xmm3, %xmm4 703 mulpd %xmm0, %xmm3 704 mulpd %xmm0, %xmm4 705 706 movaps -12 * SIZE(AO), %xmm0 707 708 ADD1 %xmm1, %xmm8 709 movaps -8 * SIZE(BO), %xmm1 710 ADD2 %xmm2, %xmm9 711 pshufd $0x4e, %xmm1, %xmm2 712 mulpd %xmm0, %xmm1 713 mulpd %xmm0, %xmm2 714 715 ADD1 %xmm3, %xmm10 716 movaps -6 * SIZE(BO), %xmm3 717 ADD2 %xmm4, %xmm11 718 pshufd $0x4e, %xmm3, %xmm4 719 mulpd %xmm0, %xmm3 720 
mulpd %xmm0, %xmm4 721 722 movaps -10 * SIZE(AO), %xmm0 723 724 ADD1 %xmm1, %xmm8 725 movaps -4 * SIZE(BO), %xmm1 726 ADD2 %xmm2, %xmm9 727 pshufd $0x4e, %xmm1, %xmm2 728 mulpd %xmm0, %xmm1 729 mulpd %xmm0, %xmm2 730 731 ADD1 %xmm3, %xmm10 732 movaps -2 * SIZE(BO), %xmm3 733 ADD2 %xmm4, %xmm11 734 pshufd $0x4e, %xmm3, %xmm4 735 mulpd %xmm0, %xmm3 736 mulpd %xmm0, %xmm4 737 738 movaps -8 * SIZE(AO), %xmm0 739 740 subq $-8 * SIZE, AO 741 subq $-16 * SIZE, BO 742 743 subq $1, %rax 744 BRANCH 745 jg .L22 746 ALIGN_3 747 748.L25: 749#ifndef TRMMKERNEL 750 movq K, %rax 751#else 752 movq KKK, %rax 753#endif 754 andq $3, %rax # if (k & 1) 755 BRANCH 756 je .L28 757 ALIGN_3 758 759.L26: 760 ADD1 %xmm1, %xmm8 761 movaps -16 * SIZE(BO), %xmm1 762 ADD2 %xmm2, %xmm9 763 pshufd $0x4e, %xmm1, %xmm2 764 mulpd %xmm0, %xmm1 765 mulpd %xmm0, %xmm2 766 767 ADD1 %xmm3, %xmm10 768 movaps -14 * SIZE(BO), %xmm3 769 ADD2 %xmm4, %xmm11 770 pshufd $0x4e, %xmm3, %xmm4 771 mulpd %xmm0, %xmm3 772 mulpd %xmm0, %xmm4 773 774 movaps -14 * SIZE(AO), %xmm0 775 776 addq $2 * SIZE, AO 777 addq $4 * SIZE, BO 778 779 subq $1, %rax 780 BRANCH 781 jg .L26 782 ALIGN_3 783 784.L28: 785 ADD1 %xmm1, %xmm8 786 ADD2 %xmm2, %xmm9 787 ADD1 %xmm3, %xmm10 788 ADD2 %xmm4, %xmm11 789 790 pcmpeqb %xmm0, %xmm0 791 psllq $63, %xmm0 792 793 movddup ALPHA_R, %xmm2 794 movddup ALPHA_I, %xmm3 795 796#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 797 defined(RR) || defined(RC) || defined(CR) || defined(CC) 798 shufps $0x40, %xmm0, %xmm0 799 800 xorps %xmm0, %xmm8 801 xorps %xmm0, %xmm10 802#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) 803 shufps $0x04, %xmm0, %xmm0 804 805 xorps %xmm0, %xmm9 806 xorps %xmm0, %xmm11 807#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) 808 shufps $0x40, %xmm0, %xmm0 809 810 xorps %xmm0, %xmm9 811 xorps %xmm0, %xmm11 812#endif 813 814 haddpd %xmm9, %xmm8 815 haddpd %xmm11, %xmm10 816 817 pshufd $0x4e, %xmm8, %xmm9 818 pshufd $0x4e, %xmm10, %xmm11 
819 820 mulpd %xmm2, %xmm8 821 mulpd %xmm3, %xmm9 822 mulpd %xmm2, %xmm10 823 mulpd %xmm3, %xmm11 824 825 addsubpd %xmm9, %xmm8 826 addsubpd %xmm11, %xmm10 827 828#ifndef TRMMKERNEL 829 movsd 0 * SIZE(CO1), %xmm0 830 movhpd 1 * SIZE(CO1), %xmm0 831 movsd 0 * SIZE(CO2), %xmm1 832 movhpd 1 * SIZE(CO2), %xmm1 833 834 addpd %xmm0, %xmm8 835 addpd %xmm1, %xmm10 836#endif 837 838 movsd %xmm8, 0 * SIZE(CO1) 839 movhpd %xmm8, 1 * SIZE(CO1) 840 movsd %xmm10, 0 * SIZE(CO2) 841 movhpd %xmm10, 1 * SIZE(CO2) 842 843#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 844 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 845 movq K, %rax 846 subq KKK, %rax 847 salq $ZBASE_SHIFT, %rax 848 leaq (AO, %rax, 1), AO 849 leaq (BO, %rax, 2), BO 850#endif 851 852#if defined(TRMMKERNEL) && defined(LEFT) 853 addq $1, KK 854#endif 855 856 addq $2 * SIZE, CO1 857 addq $2 * SIZE, CO2 858 decq I 859 BRANCH 860 jg .L21 861 862#if defined(TRMMKERNEL) && !defined(LEFT) 863 addq $2, KK 864#endif 865 866 leaq (C, LDC, 2), C 867 movq BO, B 868 ALIGN_4 869 870.L30: 871 testq $1, N 872 BRANCH 873 jle .L999 874 875#if defined(TRMMKERNEL) && defined(LEFT) 876 movq OFFSET, %rax 877 movq %rax, KK 878#endif 879 880 movq C, CO1 881 movq A, AO 882 883 movq M, I 884 ALIGN_4 885 886.L31: 887#if !defined(TRMMKERNEL) || \ 888 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 889 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 890 891 movq B, BO 892#else 893 movq B, BO 894 895 movq KK, %rax 896 salq $ZBASE_SHIFT, %rax 897 leaq (AO, %rax, 1), AO 898 leaq (BO, %rax, 1), BO 899#endif 900 901 xorps %xmm1, %xmm1 902 movaps -16 * SIZE(AO), %xmm0 903 xorps %xmm2, %xmm2 904 905 xorps %xmm8, %xmm8 906 prefetcht0 2 * SIZE(CO1) 907 xorps %xmm9, %xmm9 908 xorps %xmm10, %xmm10 909 xorps %xmm11, %xmm11 910 911#ifndef TRMMKERNEL 912 movq K, %rax 913#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 914 movq K, %rax 915 subq KK, %rax 916 movq %rax, 
KKK 917#else 918 movq KK, %rax 919#ifdef LEFT 920 addq $1, %rax 921#else 922 addq $1, %rax 923#endif 924 movq %rax, KKK 925#endif 926 sarq $2, %rax 927 NOBRANCH 928 jle .L35 929 ALIGN_3 930 931.L32: 932 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 933 934 ADD1 %xmm1, %xmm8 935 movaps -16 * SIZE(BO), %xmm1 936 ADD2 %xmm2, %xmm9 937 pshufd $0x4e, %xmm1, %xmm2 938 mulpd %xmm0, %xmm1 939 mulpd %xmm0, %xmm2 940 movaps -14 * SIZE(AO), %xmm0 941 942 ADD1 %xmm1, %xmm10 943 movaps -14 * SIZE(BO), %xmm1 944 ADD2 %xmm2, %xmm11 945 pshufd $0x4e, %xmm1, %xmm2 946 mulpd %xmm0, %xmm1 947 mulpd %xmm0, %xmm2 948 movaps -12 * SIZE(AO), %xmm0 949 950 ADD1 %xmm1, %xmm8 951 movaps -12 * SIZE(BO), %xmm1 952 ADD2 %xmm2, %xmm9 953 pshufd $0x4e, %xmm1, %xmm2 954 mulpd %xmm0, %xmm1 955 mulpd %xmm0, %xmm2 956 movaps -10 * SIZE(AO), %xmm0 957 958 ADD1 %xmm1, %xmm10 959 movaps -10 * SIZE(BO), %xmm1 960 ADD2 %xmm2, %xmm11 961 pshufd $0x4e, %xmm1, %xmm2 962 mulpd %xmm0, %xmm1 963 mulpd %xmm0, %xmm2 964 movaps -8 * SIZE(AO), %xmm0 965 966 subq $-8 * SIZE, AO 967 subq $-8 * SIZE, BO 968 969 subq $1, %rax 970 BRANCH 971 jg .L32 972 973 addpd %xmm10, %xmm8 974 addpd %xmm11, %xmm9 975 ALIGN_3 976 977.L35: 978#ifndef TRMMKERNEL 979 movq K, %rax 980#else 981 movq KKK, %rax 982#endif 983 andq $3, %rax 984 BRANCH 985 je .L38 986 ALIGN_3 987 988.L36: 989 ADD1 %xmm1, %xmm8 990 movaps -16 * SIZE(BO), %xmm1 991 ADD2 %xmm2, %xmm9 992 pshufd $0x4e, %xmm1, %xmm2 993 mulpd %xmm0, %xmm1 994 mulpd %xmm0, %xmm2 995 movaps -14 * SIZE(AO), %xmm0 996 997 addq $2 * SIZE, AO 998 addq $2 * SIZE, BO 999 1000 subq $1, %rax 1001 BRANCH 1002 jg .L36 1003 ALIGN_3 1004 1005.L38: 1006 ADD1 %xmm1, %xmm8 1007 ADD2 %xmm2, %xmm9 1008 1009 pcmpeqb %xmm0, %xmm0 1010 psllq $63, %xmm0 1011 1012 movddup ALPHA_R, %xmm2 1013 movddup ALPHA_I, %xmm3 1014 1015#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1016 defined(RR) || defined(RC) || defined(CR) || defined(CC) 1017 shufps $0x40, %xmm0, %xmm0 1018 1019 xorps %xmm0, %xmm8 
1020#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) 1021 shufps $0x04, %xmm0, %xmm0 1022 1023 xorps %xmm0, %xmm9 1024#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) 1025 shufps $0x40, %xmm0, %xmm0 1026 1027 xorps %xmm0, %xmm9 1028#endif 1029 1030 haddpd %xmm9, %xmm8 1031 pshufd $0x4e, %xmm8, %xmm9 1032 1033 mulpd %xmm2, %xmm8 1034 mulpd %xmm3, %xmm9 1035 1036 addsubpd %xmm9, %xmm8 1037 1038#ifndef TRMMKERNEL 1039 movsd 0 * SIZE(CO1), %xmm0 1040 movhpd 1 * SIZE(CO1), %xmm0 1041 1042 addpd %xmm0, %xmm8 1043#endif 1044 1045 movsd %xmm8, 0 * SIZE(CO1) 1046 movhpd %xmm8, 1 * SIZE(CO1) 1047 1048#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1049 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1050 movq K, %rax 1051 subq KKK, %rax 1052 salq $ZBASE_SHIFT, %rax 1053 leaq (AO, %rax, 1), AO 1054 leaq (BO, %rax, 1), BO 1055#endif 1056 1057#if defined(TRMMKERNEL) && defined(LEFT) 1058 addq $1, KK 1059#endif 1060 1061 addq $2 * SIZE, CO1 1062 decq I 1063 BRANCH 1064 jg .L31 1065 ALIGN_4 1066 1067.L999: 1068 movq 0(%rsp), %rbx 1069 movq 8(%rsp), %rbp 1070 movq 16(%rsp), %r12 1071 movq 24(%rsp), %r13 1072 movq 32(%rsp), %r14 1073 movq 40(%rsp), %r15 1074 1075#ifdef WINDOWS_ABI 1076 movq 48(%rsp), %rdi 1077 movq 56(%rsp), %rsi 1078 movups 64(%rsp), %xmm6 1079 movups 80(%rsp), %xmm7 1080 movups 96(%rsp), %xmm8 1081 movups 112(%rsp), %xmm9 1082 movups 128(%rsp), %xmm10 1083 movups 144(%rsp), %xmm11 1084 movups 160(%rsp), %xmm12 1085 movups 176(%rsp), %xmm13 1086 movups 192(%rsp), %xmm14 1087 movups 208(%rsp), %xmm15 1088#endif 1089 1090 addq $STACKSIZE, %rsp 1091 ret 1092 1093 EPILOGUE 1094