/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error You have to check your configuration.
#endif

#define STACK	16
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esi)
#define STACK_N		 8 + STACK + ARGS(%esi)
#define STACK_K		12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R	16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I	20 + STACK + ARGS(%esi)
#define STACK_A		24 + STACK + ARGS(%esi)
#define STACK_B		28 + STACK + ARGS(%esi)
#define STACK_C		32 + STACK + ARGS(%esi)
#define STACK_LDC	36 + STACK + ARGS(%esi)
#define STACK_OFFT	40 + STACK + ARGS(%esi)

#define POSINV	  0(%esp)
#define ALPHA_R	 16(%esp)
#define ALPHA_I	 32(%esp)
#define K	 48(%esp)
#define N	 52(%esp)
#define M	 56(%esp)
#define A	 60(%esp)
#define C	 64(%esp)
#define J	 68(%esp)
#define OLD_STACK 72(%esp)
#define TEMP	 76(%esp)
#define OFFSET	 80(%esp)
#define KK	 84(%esp)
#define KKK	 88(%esp)
#define BUFFER	128(%esp)

#define B	%edi
#define LDC	%ebp

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#define AA	%edx
#define BB	%ecx

#if !defined(HAVE_SSE2) || defined(OPTERON)
#define movsd	movlps
#endif

#ifdef HAVE_SSE2
#define xorps	pxor
#endif

#define KERNEL1(address) \
	mulps	%xmm0, %xmm2; \
	mulps	 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	 8 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL2(address) \
	mulps	%xmm0, %xmm2; \
	mulps	12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	32 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL3(address) \
	mulps	%xmm1, %xmm3; \
	mulps	20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	24 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL4(address) \
	mulps	%xmm1, %xmm3; \
	mulps	28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	48 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL5(address) \
	mulps	%xmm0, %xmm2; \
	mulps	36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	40 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL6(address) \
	mulps	%xmm0, %xmm2; \
	mulps	44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	64 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL7(address) \
	mulps	%xmm1, %xmm3; \
	mulps	52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	56 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL8(address) \
	mulps	%xmm1, %xmm3; \
	mulps	60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	80 * SIZE + (address) * SIZE * 2(AA), %xmm1

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	EMMS

	movl	%esp, %esi	# save old stack

	subl	$128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl	$-STACK_ALIGN, %esp	# align stack
	addl	$STACK_OFFSET, %esp

	STACK_TOUCHING

	movd	STACK_M, %mm0
	movl	STACK_N, %eax
	movd	STACK_K, %mm1
	movd	STACK_A, %mm2
	movl	STACK_B, B
	movd	STACK_C, %mm3
	movl	STACK_LDC, LDC
#ifdef TRMMKERNEL
	movd	STACK_OFFT, %mm4
#endif

	movd	%mm1, K
	movd	%mm0, M
	movl	%eax, N
	movd	%mm2, A
	movd	%mm3, C
	movl	%esi, OLD_STACK
#ifdef TRMMKERNEL
	movd	%mm4, OFFSET
	movd	%mm4, KK
#ifndef LEFT
	negl	KK
#endif
#endif

	leal	(, LDC, SIZE * 2), LDC

	movss	STACK_ALPHA_R, %xmm0
	movss	STACK_ALPHA_I, %xmm1

#ifdef HAVE_SSE2
	pxor	%xmm7, %xmm7
	cmpeqps	%xmm7, %xmm7
	pslld	$31, %xmm7	# Generate mask
#else
	movl	$0x80000000, TEMP
	movss	TEMP, %xmm7
	shufps	$0, %xmm7, %xmm7
#endif
	xorps	%xmm2, %xmm2

	shufps	$0, %xmm0, %xmm0

	movaps	%xmm0,  0 + ALPHA_R
	movss	%xmm1,  4 + ALPHA_I
	movss	%xmm1, 12 + ALPHA_I
	xorps	%xmm7, %xmm1
	movss	%xmm1,  0 + ALPHA_I
	movss	%xmm1,  8 + ALPHA_I

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
	movss	%xmm7,  0 + POSINV
	movss	%xmm2,  4 + POSINV
	movss	%xmm7,  8 + POSINV
	movss	%xmm2, 12 + POSINV
#else
	movss	%xmm2,  0 + POSINV
	movss	%xmm7,  4 + POSINV
	movss	%xmm2,  8 + POSINV
	movss	%xmm7, 12 + POSINV
#endif

	movl	%eax, J			# j = n
	testl	%eax, %eax
	jle	.L999

.L01:	# j loop: one column of B and C per iteration
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, BB
	movaps	POSINV, %xmm7

	movl	K, %eax
	sarl	$2, %eax
	jle	.L03

.L02:	# pack B into BUFFER: broadcast each scalar 4-wide; POSINV flips the signs used for conjugation
	movss	0 * SIZE(B), %xmm0
	movss	1 * SIZE(B), %xmm1
	movss	2 * SIZE(B), %xmm2
	movss	3 * SIZE(B), %xmm3

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
	xorps	%xmm7, %xmm1
	xorps	%xmm7, %xmm3
#else
	xorps	%xmm7, %xmm0
	xorps	%xmm7, %xmm2
#endif

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)

	movss	4 * SIZE(B), %xmm0
	movss	5 * SIZE(B), %xmm1
	movss	6 * SIZE(B), %xmm2
	movss	7 * SIZE(B), %xmm3

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
	xorps	%xmm7, %xmm1
	xorps	%xmm7, %xmm3
#else
	xorps	%xmm7, %xmm0
	xorps	%xmm7, %xmm2
#endif

	movaps	%xmm0, 16 * SIZE(BB)
	movaps	%xmm1, 20 * SIZE(BB)
	movaps	%xmm2, 24 * SIZE(BB)
	movaps	%xmm3, 28 * SIZE(BB)

	prefetcht0	104 * SIZE(B)

	addl	$ 8 * SIZE, B
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L02

.L03:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L05

.L04:
	movss	0 * SIZE(B), %xmm0
	movss	1 * SIZE(B), %xmm1

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
	xorps	%xmm7, %xmm1
#else
	xorps	%xmm7, %xmm0
#endif

	movaps	%xmm0, 0 * SIZE(BB)
	movaps	%xmm1, 4 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$8 * SIZE, BB
	decl	%eax
	jne	.L04
	ALIGN_4

.L05:
	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a
	movl	M, %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L50
	ALIGN_4

.L10:	# 4x1 micro kernel: four complex entries of C per iteration

#ifdef PENTIUM4

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

	prefetchnta	8 * SIZE(%esi)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	andl	$-8, %eax
	je	.L12
	sall	$3, %eax

.L1X:
	KERNEL1(32 * 0)
	KERNEL2(32 * 0)
	KERNEL3(32 * 0)
	KERNEL4(32 * 0)
	KERNEL5(32 * 0)
	KERNEL6(32 * 0)
	KERNEL7(32 * 0)
	KERNEL8(32 * 0)
	cmpl	$64 * 1, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32 * 1)
	KERNEL2(32 * 1)
	KERNEL3(32 * 1)
	KERNEL4(32 * 1)
	KERNEL5(32 * 1)
	KERNEL6(32 * 1)
	KERNEL7(32 * 1)
	KERNEL8(32 * 1)
	cmpl	$64 * 2, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32 * 2)
	KERNEL2(32 * 2)
	KERNEL3(32 * 2)
	KERNEL4(32 * 2)
	KERNEL5(32 * 2)
	KERNEL6(32 * 2)
	KERNEL7(32 * 2)
	KERNEL8(32 * 2)
	cmpl	$64 * 3, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32 * 3)
	KERNEL2(32 * 3)
	KERNEL3(32 * 3)
	KERNEL4(32 * 3)
	KERNEL5(32 * 3)
	KERNEL6(32 * 3)
	KERNEL7(32 * 3)
	KERNEL8(32 * 3)
	cmpl	$64 * 4, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32 * 4)
	KERNEL2(32 * 4)
	KERNEL3(32 * 4)
	KERNEL4(32 * 4)
	KERNEL5(32 * 4)
	KERNEL6(32 * 4)
	KERNEL7(32 * 4)
	KERNEL8(32 * 4)
	cmpl	$64 * 5, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32 * 5)
	KERNEL2(32 * 5)
	KERNEL3(32 * 5)
	KERNEL4(32 * 5)
	KERNEL5(32 * 5)
	KERNEL6(32 * 5)
	KERNEL7(32 * 5)
	KERNEL8(32 * 5)
	cmpl	$64 * 6, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32 * 6)
	KERNEL2(32 * 6)
	KERNEL3(32 * 6)
	KERNEL4(32 * 6)
	KERNEL5(32 * 6)
	KERNEL6(32 * 6)
	KERNEL7(32 * 6)
	KERNEL8(32 * 6)
	cmpl	$64 * 7, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32 * 7)
	KERNEL2(32 * 7)
	KERNEL3(32 * 7)
	KERNEL4(32 * 7)
	KERNEL5(32 * 7)
	KERNEL6(32 * 7)
	KERNEL7(32 * 7)
	KERNEL8(32 * 7)

	addl	$128 * 4 * SIZE, AA
	addl	$128 * 4 * SIZE, BB
	subl	$ 64 * 8, %eax
	BRANCH
	jg	.L1X

.L11:
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB

#else

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	prefetcht0	8 * SIZE(%esi)
	je	.L12
	ALIGN_4

#define PREFETCHSIZE 48

.L11:
#ifdef CORE_KATMAI
	prefetcht0	PREFETCHSIZE * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 0 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	16 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	16 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 8) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	12 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	 8 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	12 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	12 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 16) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	16 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	20 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	20 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 24) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	40 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	40 * SIZE(AA), %xmm1

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 32) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	36 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	48 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	48 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 40) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	40 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	44 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	44 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	56 * SIZE(AA), %xmm1

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 48) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	52 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	48 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	52 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	52 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	64 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 56) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	60 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	72 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	72 * SIZE(AA), %xmm1

	addl	$64 * SIZE, BB
	addl	$64 * SIZE, AA
	decl	%eax
	jne	.L11
#endif

.L12:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA_R, %xmm1
	movaps	ALPHA_I, %xmm3
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je	.L14

.L13:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 0 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm6
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	 8 * SIZE(AA), %xmm0

	addl	$8 * SIZE, AA		# aoffset  += 8
	addl	$8 * SIZE, BB		# boffset1 += 8

	decl	%eax
	jg	.L13

.L14:	# fold the swapped imaginary partial products into xmm4/xmm6, then scale by alpha
	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm5, %xmm4
	subps	%xmm7, %xmm6
#else
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
#endif

	movaps	%xmm4, %xmm5
	movaps	%xmm6, %xmm7

	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6

	mulps	%xmm1, %xmm5
	mulps	%xmm3, %xmm4
	mulps	%xmm1, %xmm7
	mulps	%xmm3, %xmm6

	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6

	shufps	$0xe4, %xmm4, %xmm4
	shufps	$0xe4, %xmm6, %xmm6

#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhps	2 * SIZE(%esi), %xmm0
	movsd	4 * SIZE(%esi), %xmm2
	movhps	6 * SIZE(%esi), %xmm2

	addps	%xmm0, %xmm4
	addps	%xmm2, %xmm6
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhps	%xmm4, 2 * SIZE(%esi)
	movsd	%xmm6, 4 * SIZE(%esi)
	movhps	%xmm6, 6 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK
#endif

	addl	$8 * SIZE, %esi		# coffset += 4
	decl	%ebx			# i --
	jg	.L10
	ALIGN_2

.L50:	# m & 2 remainder: two complex entries of C
	movl	M, %ebx
	testl	$2, %ebx
	jle	.L70

#if (L1_DATA_LINESIZE == 64)

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L52
	ALIGN_4

.L51:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm6
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	 8 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	12 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	28 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	48 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	36 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	40 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	20 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm2
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm6
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	80 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L51
	ALIGN_2

#else

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L52
	ALIGN_4

.L51:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	16 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	16 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	20 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	12 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	40 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1
	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	48 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	20 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	44 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	72 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	40 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L51
#endif

.L52:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA_R, %xmm1
	movaps	ALPHA_I, %xmm3
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je	.L54

.L53:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA		# aoffset  += 8
	addl	$8 * SIZE, BB		# boffset1 += 8
	decl	%eax
	jg	.L53

.L54:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	shufps	$0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm5, %xmm4
#else
	addps	%xmm5, %xmm4
#endif

	movaps	%xmm4, %xmm5

	shufps	$0xb1, %xmm4, %xmm4

	mulps	%xmm1, %xmm5
	mulps	%xmm3, %xmm4

	addps	%xmm5, %xmm4

#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhps	2 * SIZE(%esi), %xmm0

	addps	%xmm0, %xmm4
#endif

	movlps	%xmm4, 0 * SIZE(%esi)
	movhps	%xmm4, 2 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif
	addl	$4 * SIZE, %esi		# coffset += 4
	ALIGN_2

.L70:	# m & 1 remainder: one complex entry of C
	testl	$1, %ebx
	jle	.L99

#if (L1_DATA_LINESIZE == 64)

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L72
	ALIGN_4

.L71:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	40 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm6
	movaps	44 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	64 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	52 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm6
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	80 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L71
	ALIGN_2

#else
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB	/* because it's doubled */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L72
	ALIGN_4

.L71:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	16 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	12 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	20 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	40 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	48 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	44 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	52 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	64 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	72 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L71
	ALIGN_2
#endif

.L72:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA_R, %xmm1
	movaps	ALPHA_I, %xmm3
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je	.L74

.L73:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA		# aoffset  += 8
	addl	$8 * SIZE, BB		# boffset1 += 8
	decl	%eax
	jg	.L73

.L74:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	shufps	$0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm5, %xmm4
#else
	addps	%xmm5, %xmm4
#endif

	movaps	%xmm4, %xmm5

	shufps	$0xb1, %xmm4, %xmm4

	mulps	%xmm1, %xmm5
	mulps	%xmm3, %xmm4

	addps	%xmm5, %xmm4

#ifndef TRMMKERNEL
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(%esi), %xmm0

	addps	%xmm0, %xmm4
#endif

	movlps	%xmm4, 0 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	ALIGN_2

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$1, KK
#endif

	addl	LDC, C		# c += ldc
	decl	J		# j --
	jg	.L01
	ALIGN_2

.L999:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE