1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define STACK 16 43#define ARGS 0 44 45#define STACK_M 4 + STACK + ARGS(%esi) 46#define STACK_N 8 + STACK + ARGS(%esi) 47#define STACK_K 12 + STACK + ARGS(%esi) 48#define STACK_ALPHA 16 + STACK + ARGS(%esi) 49#define STACK_A 20 + STACK + ARGS(%esi) 50#define STACK_B 24 + STACK + ARGS(%esi) 51#define STACK_C 28 + STACK + ARGS(%esi) 52#define STACK_LDC 32 + STACK + ARGS(%esi) 53#define STACK_OFFT 36 + STACK + ARGS(%esi) 54 55#define ALPHA 0(%esp) 56#define K 16(%esp) 57#define N 20(%esp) 58#define M 24(%esp) 59#define A 28(%esp) 60#define C 32(%esp) 61#define J 36(%esp) 62#define OLD_STACK 40(%esp) 63#define OFFSET 44(%esp) 64#define KK 48(%esp) 65#define KKK 52(%esp) 66#define BUFFER 512(%esp) 67 68#define PREFETCH_R (8 * 16 + 0) 69#define PREFETCH_W (PREFETCH_R * 2) 70 71#define PREFETCHSIZE (8 * 16 + 4) 72#define PREFETCH prefetcht0 73 74#define AA %edx 75#define BB %ecx 76#define LDC %ebp 77#define B %edi 78#define C1 %esi 79#define I %ebx 80 81 82 PROLOGUE 83 84 pushl %ebp 85 pushl %edi 86 pushl %esi 87 pushl %ebx 88 89 PROFCODE 90 91 movl %esp, %esi # save old stack 92 93 subl $512 + LOCAL_BUFFER_SIZE, %esp 94 andl $-4096, %esp # align stack 95 96 STACK_TOUCHING 97 98 movl STACK_M, %ebx 99 movl STACK_N, %eax 100 movl STACK_K, %ecx 101 movl STACK_A, %edx 102 movss STACK_ALPHA, %xmm3 103#ifdef TRMMKERNEL 104 movd STACK_OFFT, %mm4 105#endif 106 107 movl %ebx, M 108 movl %eax, N 109 movl %ecx, K 110 movl %edx, A 111 movl %esi, OLD_STACK 112#ifdef TRMMKERNEL 113 movd %mm4, OFFSET 114 movd %mm4, KK 115#ifndef LEFT 116 negl KK 117#endif 118#endif 119 120 shufps $0, %xmm3, %xmm3 121 122 movl STACK_B, B 123 movl STACK_C, %ebx 124 125 movaps %xmm3, ALPHA 126 movl %ebx, C 127 movl STACK_LDC, LDC 128 129 subl $-32 * SIZE, A 130 subl $-32 * SIZE, B 131 132 leal (, LDC, SIZE), LDC 133 134 sarl $1, %eax 135 movl %eax, J 136 jle .L50 137 ALIGN_4 138 
139.L01: 140 leal 32 * SIZE + BUFFER, BB 141 142#if defined(TRMMKERNEL) && defined(LEFT) 143 movl OFFSET, %eax 144 movl %eax, KK 145#endif 146 147 movl K, %eax 148 sarl $2, %eax 149 jle .L05 150 ALIGN_4 151 152.L02: 153 prefetcht0 (PREFETCH_R + 0) * SIZE(B) 154 movss -32 * SIZE(B), %xmm0 155 movss -31 * SIZE(B), %xmm1 156 movss -30 * SIZE(B), %xmm2 157 movss -29 * SIZE(B), %xmm3 158 movss -28 * SIZE(B), %xmm4 159 movss -27 * SIZE(B), %xmm5 160 movss -26 * SIZE(B), %xmm6 161 movss -25 * SIZE(B), %xmm7 162 163 prefetcht0 (PREFETCH_W + 0) * SIZE(BB) 164 shufps $0, %xmm0, %xmm0 165 shufps $0, %xmm1, %xmm1 166 shufps $0, %xmm2, %xmm2 167 shufps $0, %xmm3, %xmm3 168 shufps $0, %xmm4, %xmm4 169 shufps $0, %xmm5, %xmm5 170 shufps $0, %xmm6, %xmm6 171 shufps $0, %xmm7, %xmm7 172 173 prefetcht0 (PREFETCH_W + 16) * SIZE(BB) 174 movaps %xmm0, -32 * SIZE(BB) 175 movaps %xmm1, -28 * SIZE(BB) 176 movaps %xmm2, -24 * SIZE(BB) 177 movaps %xmm3, -20 * SIZE(BB) 178 movaps %xmm4, -16 * SIZE(BB) 179 movaps %xmm5, -12 * SIZE(BB) 180 movaps %xmm6, -8 * SIZE(BB) 181 movaps %xmm7, -4 * SIZE(BB) 182 183 addl $ 8 * SIZE, B 184 subl $-32 * SIZE, BB 185 decl %eax 186 jne .L02 187 ALIGN_4 188 189.L05: 190 movl K, %eax 191 andl $3, %eax 192 BRANCH 193 jle .L10 194 ALIGN_4 195 196.L06: 197 movss -32 * SIZE(B), %xmm0 198 movss -31 * SIZE(B), %xmm1 199 200 shufps $0, %xmm0, %xmm0 201 shufps $0, %xmm1, %xmm1 202 203 movaps %xmm0, -32 * SIZE(BB) 204 movaps %xmm1, -28 * SIZE(BB) 205 addl $2 * SIZE, B 206 addl $8 * SIZE, BB 207 decl %eax 208 jne .L06 209 ALIGN_4 210 211.L10: 212 movl C, C1 213 movl A, AA 214 movl M, I 215 sarl $3, I 216 jle .L20 217 ALIGN_4 218 219.L11: 220#if !defined(TRMMKERNEL) || \ 221 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 222 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 223 leal 32 * SIZE + BUFFER, BB 224#else 225 leal 32 * SIZE + BUFFER, BB 226 movl KK, %eax 227 leal (, %eax, 8), %eax 228 leal (AA, %eax, 4), AA 229 leal (BB, %eax, 4), BB /* 
because it's doubled */ 230#endif 231 232 movaps -32 * SIZE(AA), %xmm0 233 pxor %xmm4, %xmm4 234 movaps -32 * SIZE(BB), %xmm1 235 pxor %xmm5, %xmm5 236 movapd -16 * SIZE(AA), %xmm3 237 pxor %xmm6, %xmm6 238 prefetcht0 7 * SIZE(C1) 239 pxor %xmm7, %xmm7 240 prefetcht0 7 * SIZE(C1, LDC) 241 242#ifndef TRMMKERNEL 243 movl K, %eax 244#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 245 movl K, %eax 246 subl KK, %eax 247 movl %eax, KKK 248#else 249 movl KK, %eax 250#ifdef LEFT 251 addl $8, %eax 252#else 253 addl $2, %eax 254#endif 255 movl %eax, KKK 256#endif 257 sarl $3, %eax 258 je .L15 259 ALIGN_4 260 261.L12: 262 movaps %xmm1, %xmm2 263 mulps %xmm0, %xmm1 264 addps %xmm1, %xmm4 265 movaps -28 * SIZE(BB), %xmm1 266 mulps %xmm1, %xmm0 267 addps %xmm0, %xmm5 268 movaps -28 * SIZE(AA), %xmm0 269 mulps %xmm0, %xmm2 270 mulps %xmm0, %xmm1 271 movaps -24 * SIZE(AA), %xmm0 272 addps %xmm2, %xmm6 273 addps %xmm1, %xmm7 274 275 movaps -24 * SIZE(BB), %xmm1 276 movaps %xmm1, %xmm2 277 mulps %xmm0, %xmm1 278 addps %xmm1, %xmm4 279 movaps -20 * SIZE(BB), %xmm1 280 mulps %xmm1, %xmm0 281 addps %xmm0, %xmm5 282 movaps -20 * SIZE(AA), %xmm0 283 mulps %xmm0, %xmm2 284 mulps %xmm0, %xmm1 285 movaps 0 * SIZE(AA), %xmm0 286 addps %xmm2, %xmm6 287 addps %xmm1, %xmm7 288 289 movaps -16 * SIZE(BB), %xmm1 290 movaps %xmm1, %xmm2 291 mulps %xmm3, %xmm1 292 addps %xmm1, %xmm4 293 movaps -12 * SIZE(BB), %xmm1 294 mulps %xmm1, %xmm3 295 addps %xmm3, %xmm5 296 movaps -12 * SIZE(AA), %xmm3 297 mulps %xmm3, %xmm2 298 mulps %xmm3, %xmm1 299 movaps -8 * SIZE(AA), %xmm3 300 addps %xmm2, %xmm6 301 addps %xmm1, %xmm7 302 303 movaps -8 * SIZE(BB), %xmm1 304 movaps %xmm1, %xmm2 305 mulps %xmm3, %xmm1 306 addps %xmm1, %xmm4 307 movaps -4 * SIZE(BB), %xmm1 308 mulps %xmm1, %xmm3 309 addps %xmm3, %xmm5 310 movaps -4 * SIZE(AA), %xmm3 311 mulps %xmm3, %xmm2 312 mulps %xmm3, %xmm1 313 movaps 16 * SIZE(AA), %xmm3 314 addps %xmm2, %xmm6 315 addps %xmm1, %xmm7 316 movaps 0 * 
SIZE(BB), %xmm1 317 318 movaps %xmm1, %xmm2 319 mulps %xmm0, %xmm1 320 addps %xmm1, %xmm4 321 movaps 4 * SIZE(BB), %xmm1 322 mulps %xmm1, %xmm0 323 addps %xmm0, %xmm5 324 movaps 4 * SIZE(AA), %xmm0 325 mulps %xmm0, %xmm2 326 mulps %xmm0, %xmm1 327 movaps 8 * SIZE(AA), %xmm0 328 addps %xmm2, %xmm6 329 addps %xmm1, %xmm7 330 331 movaps 8 * SIZE(BB), %xmm1 332 movaps %xmm1, %xmm2 333 mulps %xmm0, %xmm1 334 addps %xmm1, %xmm4 335 movaps 12 * SIZE(BB), %xmm1 336 mulps %xmm1, %xmm0 337 addps %xmm0, %xmm5 338 movaps 12 * SIZE(AA), %xmm0 339 mulps %xmm0, %xmm2 340 mulps %xmm0, %xmm1 341 movaps 32 * SIZE(AA), %xmm0 342 addps %xmm2, %xmm6 343 addps %xmm1, %xmm7 344 345 movaps 16 * SIZE(BB), %xmm1 346 movaps %xmm1, %xmm2 347 mulps %xmm3, %xmm1 348 addps %xmm1, %xmm4 349 movaps 20 * SIZE(BB), %xmm1 350 mulps %xmm1, %xmm3 351 addps %xmm3, %xmm5 352 movaps 20 * SIZE(AA), %xmm3 353 mulps %xmm3, %xmm2 354 mulps %xmm3, %xmm1 355 addps %xmm2, %xmm6 356 movaps 24 * SIZE(AA), %xmm3 357 addps %xmm1, %xmm7 358 359 movaps 24 * SIZE(BB), %xmm1 360 movaps %xmm1, %xmm2 361 mulps %xmm3, %xmm1 362 addps %xmm1, %xmm4 363 movaps 28 * SIZE(BB), %xmm1 364 mulps %xmm1, %xmm3 365 addps %xmm3, %xmm5 366 movaps 28 * SIZE(AA), %xmm3 367 mulps %xmm3, %xmm2 368 mulps %xmm3, %xmm1 369 subl $-64 * SIZE, BB 370 movaps 48 * SIZE(AA), %xmm3 371 subl $-64 * SIZE, AA 372 addps %xmm2, %xmm6 373 addps %xmm1, %xmm7 374 movaps -32 * SIZE(BB), %xmm1 375 376 decl %eax 377 jne .L12 378 ALIGN_4 379 380.L15: 381#ifndef TRMMKERNEL 382 movl K, %eax 383#else 384 movl KKK, %eax 385#endif 386 andl $7, %eax 387 BRANCH 388 je .L18 389 ALIGN_4 390 391.L16: 392 movaps %xmm1, %xmm2 393 mulps %xmm0, %xmm1 394 addps %xmm1, %xmm4 395 movaps -28 * SIZE(BB), %xmm1 396 mulps %xmm1, %xmm0 397 addps %xmm0, %xmm5 398 movaps -28 * SIZE(AA), %xmm0 399 mulps %xmm0, %xmm2 400 mulps %xmm0, %xmm1 401 movaps -24 * SIZE(AA), %xmm0 402 addps %xmm2, %xmm6 403 addps %xmm1, %xmm7 404 movaps -24 * SIZE(BB), %xmm1 405 406 addl $8 * SIZE, AA 407 addl 
$8 * SIZE, BB 408 decl %eax 409 jg .L16 410 ALIGN_4 411 412.L18: 413 movaps ALPHA, %xmm3 414 415 mulps %xmm3, %xmm4 416 mulps %xmm3, %xmm5 417 mulps %xmm3, %xmm6 418 mulps %xmm3, %xmm7 419 420#ifndef TRMMKERNEL 421 movsd 0 * SIZE(C1), %xmm0 422 movhps 2 * SIZE(C1), %xmm0 423 movsd 4 * SIZE(C1), %xmm2 424 movhps 6 * SIZE(C1), %xmm2 425 426 movsd 0 * SIZE(C1, LDC), %xmm1 427 movhps 2 * SIZE(C1, LDC), %xmm1 428 movsd 4 * SIZE(C1, LDC), %xmm3 429 movhps 6 * SIZE(C1, LDC), %xmm3 430 431 addps %xmm0, %xmm4 432 addps %xmm1, %xmm5 433 addps %xmm2, %xmm6 434 addps %xmm3, %xmm7 435#endif 436 437 movsd %xmm4, 0 * SIZE(C1) 438 movhps %xmm4, 2 * SIZE(C1) 439 movsd %xmm6, 4 * SIZE(C1) 440 movhps %xmm6, 6 * SIZE(C1) 441 442 movsd %xmm5, 0 * SIZE(C1, LDC) 443 movhps %xmm5, 2 * SIZE(C1, LDC) 444 movsd %xmm7, 4 * SIZE(C1, LDC) 445 movhps %xmm7, 6 * SIZE(C1, LDC) 446 447#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 448 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 449 movl K, %eax 450 subl KKK, %eax 451 leal (,%eax, 8), %eax 452 leal (AA, %eax, 4), AA 453 leal (BB, %eax, 4), BB 454#endif 455 456#if defined(TRMMKERNEL) && defined(LEFT) 457 addl $8, KK 458#endif 459 460 addl $8 * SIZE, C1 461 decl I 462 jg .L11 463 ALIGN_4 464 465.L20: 466 movl M, I 467 testl $4, I 468 jle .L30 469 470.L21: 471#if !defined(TRMMKERNEL) || \ 472 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 473 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 474 leal 32 * SIZE + BUFFER, BB 475#else 476 leal 32 * SIZE + BUFFER, BB 477 movl KK, %eax 478 leal (, %eax, 8), %eax 479 leal (AA, %eax, 2), AA 480 leal (BB, %eax, 4), BB /* because it's doubled */ 481#endif 482 483 movaps -32 * SIZE(AA), %xmm0 484 pxor %xmm4, %xmm4 485 movaps -32 * SIZE(BB), %xmm1 486 pxor %xmm5, %xmm5 487 movaps -16 * SIZE(AA), %xmm2 488 pxor %xmm6, %xmm6 489 movaps -16 * SIZE(BB), %xmm3 490 pxor %xmm7, %xmm7 491 492#ifndef TRMMKERNEL 493 movl K, %eax 494#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 495 movl K, %eax 496 subl KK, %eax 497 movl %eax, KKK 498#else 499 movl KK, %eax 500#ifdef LEFT 501 addl $4, %eax 502#else 503 addl $2, %eax 504#endif 505 movl %eax, KKK 506#endif 507 sarl $3, %eax 508 je .L25 509 ALIGN_4 510 511.L22: 512 mulps %xmm0, %xmm1 513 mulps -28 * SIZE(BB), %xmm0 514 addps %xmm1, %xmm4 515 movaps -24 * SIZE(BB), %xmm1 516 addps %xmm0, %xmm5 517 movaps -28 * SIZE(AA), %xmm0 518 mulps %xmm0, %xmm1 519 mulps -20 * SIZE(BB), %xmm0 520 addps %xmm1, %xmm6 521 movaps 0 * SIZE(BB), %xmm1 522 addps %xmm0, %xmm7 523 movaps -24 * SIZE(AA), %xmm0 524 mulps %xmm0, %xmm3 525 mulps -12 * SIZE(BB), %xmm0 526 addps %xmm3, %xmm4 527 movaps -8 * SIZE(BB), %xmm3 528 addps %xmm0, %xmm5 529 movaps -20 * SIZE(AA), %xmm0 530 mulps %xmm0, %xmm3 531 mulps -4 * SIZE(BB), %xmm0 532 addps %xmm3, %xmm6 533 movaps 16 * SIZE(BB), %xmm3 534 addps %xmm0, %xmm7 535 movaps 0 * SIZE(AA), %xmm0 536 mulps %xmm2, %xmm1 537 mulps 4 * SIZE(BB), %xmm2 538 addps %xmm1, %xmm4 539 movaps 8 * SIZE(BB), %xmm1 540 addps %xmm2, %xmm5 541 movaps -12 * SIZE(AA), %xmm2 542 mulps %xmm2, %xmm1 543 mulps 12 * SIZE(BB), %xmm2 544 addps %xmm1, %xmm6 545 movaps 32 * SIZE(BB), %xmm1 546 addps %xmm2, %xmm7 547 movaps -8 * SIZE(AA), %xmm2 548 mulps %xmm2, %xmm3 549 mulps 20 * SIZE(BB), %xmm2 550 addps %xmm3, %xmm4 551 movaps 24 * SIZE(BB), %xmm3 552 addps %xmm2, %xmm5 553 movaps -4 * SIZE(AA), %xmm2 554 mulps %xmm2, %xmm3 555 mulps 28 * SIZE(BB), %xmm2 556 addps %xmm3, %xmm6 557 movaps 48 * SIZE(BB), %xmm3 558 addps %xmm2, %xmm7 559 movaps 16 * SIZE(AA), %xmm2 560 561 subl $-32 * SIZE, AA 562 addl $ 64 * SIZE, BB 563 decl %eax 564 jne .L22 565 ALIGN_4 566 567.L25: 568 movaps ALPHA, %xmm3 569 570#ifndef TRMMKERNEL 571 movl K, %eax 572#else 573 movl KKK, %eax 574#endif 575 andl $7, %eax 576 BRANCH 577 je .L28 578 ALIGN_4 579 580.L26: 581 mulps %xmm0, %xmm1 582 mulps -28 * SIZE(BB), %xmm0 583 addps %xmm1, %xmm4 584 movaps -24 * SIZE(BB), %xmm1 585 
addps %xmm0, %xmm5 586 movaps -28 * SIZE(AA), %xmm0 587 588 addl $4 * SIZE, AA 589 addl $8 * SIZE, BB 590 decl %eax 591 jg .L26 592 ALIGN_4 593 594.L28: 595 addps %xmm6, %xmm4 596 addps %xmm7, %xmm5 597 598 mulps %xmm3, %xmm4 599 mulps %xmm3, %xmm5 600 601#ifndef TRMMKERNEL 602 movsd 0 * SIZE(C1), %xmm0 603 movhps 2 * SIZE(C1), %xmm0 604 605 movsd 0 * SIZE(C1, LDC), %xmm1 606 movhps 2 * SIZE(C1, LDC), %xmm1 607 608 addps %xmm0, %xmm4 609 addps %xmm1, %xmm5 610#endif 611 612 movsd %xmm4, 0 * SIZE(C1) 613 movhps %xmm4, 2 * SIZE(C1) 614 movsd %xmm5, 0 * SIZE(C1, LDC) 615 movhps %xmm5, 2 * SIZE(C1, LDC) 616 617#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 618 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 619 movl K, %eax 620 subl KKK, %eax 621 leal (,%eax, 8), %eax 622 leal (AA, %eax, 2), AA 623 leal (BB, %eax, 4), BB 624#endif 625 626#if defined(TRMMKERNEL) && defined(LEFT) 627 addl $4, KK 628#endif 629 630 addl $4 * SIZE, C1 631 ALIGN_4 632 633.L30: 634 movl M, I 635 testl $2, I 636 jle .L40 637 638.L31: 639#if !defined(TRMMKERNEL) || \ 640 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 641 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 642 leal 32 * SIZE + BUFFER, BB 643#else 644 leal 32 * SIZE + BUFFER, BB 645 movl KK, %eax 646 leal (, %eax, 8), %eax 647 leal (AA, %eax, 1), AA 648 leal (BB, %eax, 4), BB /* because it's doubled */ 649#endif 650 651 movsd -32 * SIZE(AA), %xmm0 652 pxor %xmm4, %xmm4 653 movsd -32 * SIZE(BB), %xmm1 654 pxor %xmm5, %xmm5 655 movsd -24 * SIZE(AA), %xmm2 656 pxor %xmm6, %xmm6 657 movsd -16 * SIZE(BB), %xmm3 658 pxor %xmm7, %xmm7 659 660#ifndef TRMMKERNEL 661 movl K, %eax 662#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 663 movl K, %eax 664 subl KK, %eax 665 movl %eax, KKK 666#else 667 movl KK, %eax 668#ifdef LEFT 669 addl $2, %eax 670#else 671 addl $2, %eax 672#endif 673 movl %eax, KKK 674#endif 675 sarl $3, %eax 676 je .L35 677 ALIGN_4 678 
679.L32: 680 mulps %xmm0, %xmm1 681 mulps -28 * SIZE(BB), %xmm0 682 addps %xmm1, %xmm4 683 movsd -24 * SIZE(BB), %xmm1 684 addps %xmm0, %xmm5 685 movsd -30 * SIZE(AA), %xmm0 686 mulps %xmm0, %xmm1 687 mulps -20 * SIZE(BB), %xmm0 688 addps %xmm1, %xmm6 689 movsd 0 * SIZE(BB), %xmm1 690 addps %xmm0, %xmm7 691 movsd -28 * SIZE(AA), %xmm0 692 mulps %xmm0, %xmm3 693 mulps -12 * SIZE(BB), %xmm0 694 addps %xmm3, %xmm4 695 movsd -8 * SIZE(BB), %xmm3 696 addps %xmm0, %xmm5 697 movsd -26 * SIZE(AA), %xmm0 698 mulps %xmm0, %xmm3 699 mulps -4 * SIZE(BB), %xmm0 700 addps %xmm3, %xmm6 701 movsd 16 * SIZE(BB), %xmm3 702 addps %xmm0, %xmm7 703 movsd -16 * SIZE(AA), %xmm0 704 mulps %xmm2, %xmm1 705 mulps 4 * SIZE(BB), %xmm2 706 addps %xmm1, %xmm4 707 movsd 8 * SIZE(BB), %xmm1 708 addps %xmm2, %xmm5 709 movsd -22 * SIZE(AA), %xmm2 710 mulps %xmm2, %xmm1 711 mulps 12 * SIZE(BB), %xmm2 712 addps %xmm1, %xmm6 713 movsd 32 * SIZE(BB), %xmm1 714 addps %xmm2, %xmm7 715 movsd -20 * SIZE(AA), %xmm2 716 mulps %xmm2, %xmm3 717 mulps 20 * SIZE(BB), %xmm2 718 addps %xmm3, %xmm4 719 movsd 24 * SIZE(BB), %xmm3 720 addps %xmm2, %xmm5 721 movsd -18 * SIZE(AA), %xmm2 722 mulps %xmm2, %xmm3 723 mulps 28 * SIZE(BB), %xmm2 724 addps %xmm3, %xmm6 725 movsd 48 * SIZE(BB), %xmm3 726 addps %xmm2, %xmm7 727 movsd -8 * SIZE(AA), %xmm2 728 729 subl $-16 * SIZE, AA 730 addl $ 64 * SIZE, BB 731 decl %eax 732 jne .L32 733 ALIGN_4 734 735.L35: 736 movsd ALPHA, %xmm3 737 738#ifndef TRMMKERNEL 739 movl K, %eax 740#else 741 movl KKK, %eax 742#endif 743 andl $7, %eax 744 BRANCH 745 je .L38 746 ALIGN_4 747 748.L36: 749 mulps %xmm0, %xmm1 750 mulps -28 * SIZE(BB), %xmm0 751 addps %xmm1, %xmm4 752 movsd -24 * SIZE(BB), %xmm1 753 addps %xmm0, %xmm5 754 movsd -30 * SIZE(AA), %xmm0 755 756 addl $2 * SIZE, AA 757 addl $8 * SIZE, BB 758 decl %eax 759 jg .L36 760 ALIGN_4 761 762.L38: 763 addps %xmm6, %xmm4 764 addps %xmm7, %xmm5 765 766 mulps %xmm3, %xmm4 767 mulps %xmm3, %xmm5 768 769#ifndef TRMMKERNEL 770 movsd 0 * 
SIZE(C1), %xmm0 771 movsd 0 * SIZE(C1, LDC), %xmm1 772 773 addps %xmm0, %xmm4 774 addps %xmm1, %xmm5 775#endif 776 777 movsd %xmm4, 0 * SIZE(C1) 778 movsd %xmm5, 0 * SIZE(C1, LDC) 779 780#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 781 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 782 movl K, %eax 783 subl KKK, %eax 784 leal (,%eax, 8), %eax 785 leal (AA, %eax, 1), AA 786 leal (BB, %eax, 4), BB 787#endif 788 789#if defined(TRMMKERNEL) && defined(LEFT) 790 addl $2, KK 791#endif 792 793 addl $2 * SIZE, C1 794 ALIGN_4 795 796.L40: 797 movl M, I 798 testl $1, I 799 jle .L49 800 801.L41: 802#if !defined(TRMMKERNEL) || \ 803 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 804 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 805 leal 32 * SIZE + BUFFER, BB 806#else 807 leal 32 * SIZE + BUFFER, BB 808 movl KK, %eax 809 leal (, %eax, 4), %eax 810 leal (AA, %eax, 1), AA 811 leal (BB, %eax, 8), BB /* because it's doubled */ 812#endif 813 814 movss -32 * SIZE(AA), %xmm0 815 pxor %xmm4, %xmm4 816 movss -32 * SIZE(BB), %xmm1 817 pxor %xmm5, %xmm5 818 movss -28 * SIZE(AA), %xmm2 819 pxor %xmm6, %xmm6 820 movss -16 * SIZE(BB), %xmm3 821 pxor %xmm7, %xmm7 822 823#ifndef TRMMKERNEL 824 movl K, %eax 825#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 826 movl K, %eax 827 subl KK, %eax 828 movl %eax, KKK 829#else 830 movl KK, %eax 831#ifdef LEFT 832 addl $1, %eax 833#else 834 addl $2, %eax 835#endif 836 movl %eax, KKK 837#endif 838 sarl $3, %eax 839 je .L45 840 ALIGN_4 841 842.L42: 843 mulss %xmm0, %xmm1 844 mulss -28 * SIZE(BB), %xmm0 845 addss %xmm1, %xmm4 846 movss -24 * SIZE(BB), %xmm1 847 addss %xmm0, %xmm5 848 movss -31 * SIZE(AA), %xmm0 849 mulss %xmm0, %xmm1 850 mulss -20 * SIZE(BB), %xmm0 851 addss %xmm1, %xmm6 852 movss 0 * SIZE(BB), %xmm1 853 addss %xmm0, %xmm7 854 movss -30 * SIZE(AA), %xmm0 855 mulss %xmm0, %xmm3 856 mulss -12 * SIZE(BB), %xmm0 857 addss %xmm3, %xmm4 858 movss -8 
* SIZE(BB), %xmm3 859 addss %xmm0, %xmm5 860 movss -29 * SIZE(AA), %xmm0 861 mulss %xmm0, %xmm3 862 mulss -4 * SIZE(BB), %xmm0 863 addss %xmm3, %xmm6 864 movss 16 * SIZE(BB), %xmm3 865 addss %xmm0, %xmm7 866 movss -24 * SIZE(AA), %xmm0 867 mulss %xmm2, %xmm1 868 mulss 4 * SIZE(BB), %xmm2 869 addss %xmm1, %xmm4 870 movss 8 * SIZE(BB), %xmm1 871 addss %xmm2, %xmm5 872 movss -27 * SIZE(AA), %xmm2 873 mulss %xmm2, %xmm1 874 mulss 12 * SIZE(BB), %xmm2 875 addss %xmm1, %xmm6 876 movss 32 * SIZE(BB), %xmm1 877 addss %xmm2, %xmm7 878 movss -26 * SIZE(AA), %xmm2 879 mulss %xmm2, %xmm3 880 mulss 20 * SIZE(BB), %xmm2 881 addss %xmm3, %xmm4 882 movss 24 * SIZE(BB), %xmm3 883 addss %xmm2, %xmm5 884 movss -25 * SIZE(AA), %xmm2 885 mulss %xmm2, %xmm3 886 mulss 28 * SIZE(BB), %xmm2 887 addss %xmm3, %xmm6 888 movss 48 * SIZE(BB), %xmm3 889 addss %xmm2, %xmm7 890 movss -20 * SIZE(AA), %xmm2 891 892 subl $-8 * SIZE, AA 893 addl $64 * SIZE, BB 894 decl %eax 895 jne .L42 896 ALIGN_4 897 898.L45: 899 movss ALPHA, %xmm3 900 901#ifndef TRMMKERNEL 902 movl K, %eax 903#else 904 movl KKK, %eax 905#endif 906 andl $7, %eax 907 BRANCH 908 je .L48 909 ALIGN_4 910 911.L46: 912 mulss %xmm0, %xmm1 913 mulss -28 * SIZE(BB), %xmm0 914 addss %xmm1, %xmm4 915 movss -24 * SIZE(BB), %xmm1 916 addss %xmm0, %xmm5 917 movss -31 * SIZE(AA), %xmm0 918 919 addl $1 * SIZE, AA 920 addl $8 * SIZE, BB 921 decl %eax 922 jg .L46 923 ALIGN_4 924 925.L48: 926 addss %xmm6, %xmm4 927 addss %xmm7, %xmm5 928 929 mulss %xmm3, %xmm4 930 mulss %xmm3, %xmm5 931 932#ifndef TRMMKERNEL 933 movss 0 * SIZE(C1), %xmm0 934 movss 0 * SIZE(C1, LDC), %xmm1 935 936 addss %xmm0, %xmm4 937 addss %xmm1, %xmm5 938#endif 939 940 movss %xmm4, 0 * SIZE(C1) 941 movss %xmm5, 0 * SIZE(C1, LDC) 942 943#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 944 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 945 movl K, %eax 946 subl KKK, %eax 947 leal (,%eax, 4), %eax 948 leal (AA, %eax, 1), AA 949 leal (BB, %eax, 8), BB 
950#endif 951 952#if defined(TRMMKERNEL) && defined(LEFT) 953 addl $1, KK 954#endif 955 ALIGN_4 956 957.L49: 958#if defined(TRMMKERNEL) && !defined(LEFT) 959 addl $2, KK 960#endif 961 962 leal (, LDC, 2), %eax 963 addl %eax, C 964 decl J 965 jg .L01 966 ALIGN_4 967 968.L50: 969 movl N, %eax 970 testl $1, %eax 971 jle .L999 972 ALIGN_4 973 974.L51: 975 leal 32 * SIZE + BUFFER, BB 976 977#if defined(TRMMKERNEL) && defined(LEFT) 978 movl OFFSET, %eax 979 movl %eax, KK 980#endif 981 982 movl K, %eax 983 sarl $3, %eax 984 jle .L55 985 ALIGN_4 986 987.L52: 988 movss -32 * SIZE(B), %xmm0 989 movss -31 * SIZE(B), %xmm1 990 movss -30 * SIZE(B), %xmm2 991 movss -29 * SIZE(B), %xmm3 992 movss -28 * SIZE(B), %xmm4 993 movss -27 * SIZE(B), %xmm5 994 movss -26 * SIZE(B), %xmm6 995 movss -25 * SIZE(B), %xmm7 996 997 shufps $0, %xmm0, %xmm0 998 shufps $0, %xmm1, %xmm1 999 shufps $0, %xmm2, %xmm2 1000 shufps $0, %xmm3, %xmm3 1001 shufps $0, %xmm4, %xmm4 1002 shufps $0, %xmm5, %xmm5 1003 shufps $0, %xmm6, %xmm6 1004 shufps $0, %xmm7, %xmm7 1005 1006 movaps %xmm0, -32 * SIZE(BB) 1007 movaps %xmm1, -28 * SIZE(BB) 1008 movaps %xmm2, -24 * SIZE(BB) 1009 movaps %xmm3, -20 * SIZE(BB) 1010 movaps %xmm4, -16 * SIZE(BB) 1011 movaps %xmm5, -12 * SIZE(BB) 1012 movaps %xmm6, -8 * SIZE(BB) 1013 movaps %xmm7, -4 * SIZE(BB) 1014 1015 addl $ 8 * SIZE, B 1016 subl $-32 * SIZE, BB 1017 decl %eax 1018 jne .L52 1019 ALIGN_4 1020 1021.L55: 1022 movl K, %eax 1023 andl $7, %eax 1024 BRANCH 1025 jle .L60 1026 ALIGN_4 1027 1028.L56: 1029 movss -32 * SIZE(B), %xmm0 1030 shufps $0, %xmm0, %xmm0 1031 movaps %xmm0, -32 * SIZE(BB) 1032 1033 addl $1 * SIZE, B 1034 addl $4 * SIZE, BB 1035 decl %eax 1036 jne .L56 1037 ALIGN_4 1038 1039.L60: 1040 movl C, C1 1041 movl A, AA 1042 movl M, I 1043 sarl $3, I 1044 jle .L70 1045 ALIGN_4 1046 1047.L61: 1048#if !defined(TRMMKERNEL) || \ 1049 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1050 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1051 
leal 32 * SIZE + BUFFER, BB 1052#else 1053 leal 32 * SIZE + BUFFER, BB 1054 movl KK, %eax 1055 leal (, %eax, 8), %eax 1056 leal (AA, %eax, 4), AA 1057 leal (BB, %eax, 2), BB /* because it's doubled */ 1058#endif 1059 1060 movaps -32 * SIZE(AA), %xmm0 1061 pxor %xmm4, %xmm4 1062 movaps -32 * SIZE(BB), %xmm1 1063 pxor %xmm5, %xmm5 1064 movaps -16 * SIZE(AA), %xmm2 1065 pxor %xmm6, %xmm6 1066 movaps -16 * SIZE(BB), %xmm3 1067 pxor %xmm7, %xmm7 1068 1069 prefetcht0 3 * SIZE(C1) 1070 1071#ifndef TRMMKERNEL 1072 movl K, %eax 1073#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1074 movl K, %eax 1075 subl KK, %eax 1076 movl %eax, KKK 1077#else 1078 movl KK, %eax 1079#ifdef LEFT 1080 addl $8, %eax 1081#else 1082 addl $1, %eax 1083#endif 1084 movl %eax, KKK 1085#endif 1086 sarl $3, %eax 1087 je .L65 1088 ALIGN_4 1089 1090.L62: 1091 mulps %xmm1, %xmm0 1092 mulps -28 * SIZE(AA), %xmm1 1093 addps %xmm0, %xmm4 1094 movaps -24 * SIZE(AA), %xmm0 1095 addps %xmm1, %xmm6 1096 movaps -28 * SIZE(BB), %xmm1 1097 mulps %xmm1, %xmm0 1098 mulps -20 * SIZE(AA), %xmm1 1099 addps %xmm0, %xmm5 1100 movaps 0 * SIZE(AA), %xmm0 1101 addps %xmm1, %xmm7 1102 movaps -24 * SIZE(BB), %xmm1 1103 mulps %xmm1, %xmm2 1104 mulps -12 * SIZE(AA), %xmm1 1105 addps %xmm2, %xmm4 1106 movaps -8 * SIZE(AA), %xmm2 1107 addps %xmm1, %xmm6 1108 movaps -20 * SIZE(BB), %xmm1 1109 mulps %xmm1, %xmm2 1110 mulps -4 * SIZE(AA), %xmm1 1111 addps %xmm2, %xmm5 1112 movaps 16 * SIZE(AA), %xmm2 1113 addps %xmm1, %xmm7 1114 movaps 0 * SIZE(BB), %xmm1 1115 mulps %xmm3, %xmm0 1116 mulps 4 * SIZE(AA), %xmm3 1117 addps %xmm0, %xmm4 1118 movaps 8 * SIZE(AA), %xmm0 1119 addps %xmm3, %xmm6 1120 movaps -12 * SIZE(BB), %xmm3 1121 mulps %xmm3, %xmm0 1122 mulps 12 * SIZE(AA), %xmm3 1123 addps %xmm0, %xmm5 1124 movaps 32 * SIZE(AA), %xmm0 1125 addps %xmm3, %xmm7 1126 movaps -8 * SIZE(BB), %xmm3 1127 mulps %xmm3, %xmm2 1128 mulps 20 * SIZE(AA), %xmm3 1129 addps %xmm2, %xmm4 1130 movaps 24 * SIZE(AA), %xmm2 
1131 addps %xmm3, %xmm6 1132 movaps -4 * SIZE(BB), %xmm3 1133 mulps %xmm3, %xmm2 1134 mulps 28 * SIZE(AA), %xmm3 1135 addps %xmm2, %xmm5 1136 movaps 48 * SIZE(AA), %xmm2 1137 addps %xmm3, %xmm7 1138 movaps 16 * SIZE(BB), %xmm3 1139 1140 addl $ 64 * SIZE, AA 1141 subl $-32 * SIZE, BB 1142 decl %eax 1143 jne .L62 1144 ALIGN_4 1145 1146.L65: 1147 movaps ALPHA, %xmm3 1148 1149#ifndef TRMMKERNEL 1150 movl K, %eax 1151#else 1152 movl KKK, %eax 1153#endif 1154 andl $7, %eax 1155 BRANCH 1156 je .L68 1157 ALIGN_4 1158 1159.L66: 1160 mulps %xmm1, %xmm0 1161 mulps -28 * SIZE(AA), %xmm1 1162 addps %xmm0, %xmm4 1163 movaps -24 * SIZE(AA), %xmm0 1164 addps %xmm1, %xmm6 1165 movaps -28 * SIZE(BB), %xmm1 1166 1167 addl $8 * SIZE, AA 1168 addl $4 * SIZE, BB 1169 decl %eax 1170 jg .L66 1171 ALIGN_4 1172 1173.L68: 1174 addps %xmm5, %xmm4 1175 addps %xmm7, %xmm6 1176 1177 mulps %xmm3, %xmm4 1178 mulps %xmm3, %xmm6 1179 1180#ifndef TRMMKERNEL 1181 movsd 0 * SIZE(C1), %xmm0 1182 movhps 2 * SIZE(C1), %xmm0 1183 movsd 4 * SIZE(C1), %xmm2 1184 movhps 6 * SIZE(C1), %xmm2 1185 1186 addps %xmm0, %xmm4 1187 addps %xmm2, %xmm6 1188#endif 1189 1190 movsd %xmm4, 0 * SIZE(C1) 1191 movhps %xmm4, 2 * SIZE(C1) 1192 movsd %xmm6, 4 * SIZE(C1) 1193 movhps %xmm6, 6 * SIZE(C1) 1194 1195#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1196 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1197 movl K, %eax 1198 subl KKK, %eax 1199 leal (,%eax, 8), %eax 1200 leal (AA, %eax, 4), AA 1201 leal (BB, %eax, 2), BB 1202#endif 1203 1204#if defined(TRMMKERNEL) && defined(LEFT) 1205 addl $8, KK 1206#endif 1207 1208 addl $8 * SIZE, C1 1209 decl I 1210 jg .L61 1211 ALIGN_4 1212 1213.L70: 1214 movl M, I 1215 testl $4, I 1216 jle .L80 1217 1218.L71: 1219#if !defined(TRMMKERNEL) || \ 1220 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1221 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1222 leal 32 * SIZE + BUFFER, BB 1223#else 1224 leal 32 * SIZE + BUFFER, BB 1225 
movl KK, %eax 1226 leal (, %eax, 8), %eax 1227 leal (AA, %eax, 2), AA 1228 leal (BB, %eax, 2), BB /* because it's doubled */ 1229#endif 1230 1231 movaps -32 * SIZE(AA), %xmm0 1232 pxor %xmm4, %xmm4 1233 movaps -32 * SIZE(BB), %xmm1 1234 pxor %xmm5, %xmm5 1235 movaps -16 * SIZE(AA), %xmm2 1236 movaps -16 * SIZE(BB), %xmm3 1237 1238#ifndef TRMMKERNEL 1239 movl K, %eax 1240#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1241 movl K, %eax 1242 subl KK, %eax 1243 movl %eax, KKK 1244#else 1245 movl KK, %eax 1246#ifdef LEFT 1247 addl $4, %eax 1248#else 1249 addl $1, %eax 1250#endif 1251 movl %eax, KKK 1252#endif 1253 sarl $3, %eax 1254 je .L75 1255 ALIGN_4 1256 1257.L72: 1258 mulps %xmm0, %xmm1 1259 movaps -28 * SIZE(AA), %xmm0 1260 addps %xmm1, %xmm4 1261 movaps -28 * SIZE(BB), %xmm1 1262 mulps %xmm0, %xmm1 1263 movaps -24 * SIZE(AA), %xmm0 1264 addps %xmm1, %xmm5 1265 movaps -24 * SIZE(BB), %xmm1 1266 mulps %xmm0, %xmm1 1267 movaps -20 * SIZE(AA), %xmm0 1268 addps %xmm1, %xmm4 1269 movaps -20 * SIZE(BB), %xmm1 1270 mulps %xmm0, %xmm1 1271 movaps 0 * SIZE(AA), %xmm0 1272 addps %xmm1, %xmm5 1273 movaps 0 * SIZE(BB), %xmm1 1274 mulps %xmm2, %xmm3 1275 movaps -12 * SIZE(AA), %xmm2 1276 addps %xmm3, %xmm4 1277 movaps -12 * SIZE(BB), %xmm3 1278 mulps %xmm2, %xmm3 1279 movaps -8 * SIZE(AA), %xmm2 1280 addps %xmm3, %xmm5 1281 movaps -8 * SIZE(BB), %xmm3 1282 mulps %xmm2, %xmm3 1283 movaps -4 * SIZE(AA), %xmm2 1284 addps %xmm3, %xmm4 1285 movaps -4 * SIZE(BB), %xmm3 1286 mulps %xmm2, %xmm3 1287 movaps 16 * SIZE(AA), %xmm2 1288 addps %xmm3, %xmm5 1289 movaps 16 * SIZE(BB), %xmm3 1290 1291 subl $-32 * SIZE, AA 1292 subl $-32 * SIZE, BB 1293 decl %eax 1294 jne .L72 1295 ALIGN_4 1296 1297.L75: 1298 movaps ALPHA, %xmm3 1299 1300#ifndef TRMMKERNEL 1301 movl K, %eax 1302#else 1303 movl KKK, %eax 1304#endif 1305 andl $7, %eax 1306 BRANCH 1307 je .L78 1308 ALIGN_4 1309 1310.L76: 1311 mulps %xmm0, %xmm1 1312 movaps -28 * SIZE(AA), %xmm0 1313 addps %xmm1, 
%xmm4	/* continuation of an instruction split at the chunk boundary -- kept verbatim */
	movaps	-28 * SIZE(BB), %xmm1

	addl	$4 * SIZE, AA		/* remainder-k loop: 4 A floats per iteration */
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L76			/* loop head lies above this view */
	ALIGN_4

.L78:
	/* Write-back for the 4-row cell of the current column.
	   xmm4/xmm5 are the two partial accumulators; xmm3 presumably
	   holds ALPHA, loaded before .L76 outside this view -- TODO confirm
	   (cf. the explicit "movsd ALPHA, %xmm3" at .L85 below). */
	addps	%xmm5, %xmm4		/* fold the two accumulators */
	mulps	%xmm3, %xmm4		/* scale by alpha */

#ifndef TRMMKERNEL
	/* GEMM path: C(0..3) += alpha*AB.  movsd+movhps = unaligned
	   4-float load, so C needs no 16-byte alignment. */
	movsd	0 * SIZE(C1), %xmm0
	movhps	2 * SIZE(C1), %xmm0

	addps	%xmm0, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(C1)	/* store 4 results (unaligned) */
	movhps	%xmm4, 2 * SIZE(C1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: advance AA/BB past the (K - KKK) untouched k iterations.
	   eax = (K-KKK)*8; AA += eax*2 and BB += eax*2, i.e. 16 bytes
	   = 4 floats per k for each stream. */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK			/* 4 rows of the LEFT dimension consumed */
#endif

	addl	$4 * SIZE, C1		/* next cell down this column */
	ALIGN_4

.L80:
	/* Handle an M%4 >= 2 remainder: a 2-row cell. */
	movl	M, I
	testl	$2, I
	jle	.L90

.L81:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Full-K (or leading-triangle) case: B starts at the head of the
	   expanded BUFFER copy. */
	leal 32 * SIZE + BUFFER, BB
#else
	/* TRMM trailing case: skip the first KK iterations.
	   eax = KK*8; AA += eax (8 bytes = 2 floats per k),
	   BB += eax*2 (16 bytes = 4 floats per k). */
	leal 32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB /* because it's doubled */
#endif

	/* Preload first operands; xmm4/xmm5 are the two accumulators. */
	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movsd	-16 * SIZE(BB), %xmm3
	movsd	-24 * SIZE(AA), %xmm2

#ifndef TRMMKERNEL
	movl	K, %eax			/* GEMM: run the full K range */
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax			/* TRMM: K - KK iterations remain */
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax		/* TRMM: KK + blocking (2 rows / 1 col) */
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		/* main loop is unrolled 8x in k */
	je	.L85
	ALIGN_4

.L82:
	/* 8x-unrolled 2x1 inner product.  Per k: 2 A floats (movsd),
	   4 BUFFER floats (B entries stored as duplicated pairs --
	   see the "doubled" note above).  Loads are interleaved with
	   the mul/add chain to hide latency; do not reorder. */
	mulps	%xmm0, %xmm1
	movsd	-30 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-28 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm5
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-26 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-16 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm5
	movsd	-0 * SIZE(BB), %xmm1
	mulps	%xmm2, %xmm3
	movsd	-22 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm4
	movsd	-12 * SIZE(BB), %xmm3
	mulps	%xmm2, %xmm3
	movsd	-20 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm5
	movsd	-8 * SIZE(BB), %xmm3
	mulps	%xmm2, %xmm3
	movsd	-18 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm4
	movsd	-4 * SIZE(BB), %xmm3
	mulps	%xmm2, %xmm3
	movsd	-8 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm5
	movsd	16 * SIZE(BB), %xmm3

	subl	$-16 * SIZE, AA		/* += 16 floats (2 per k * 8) */
	subl	$-32 * SIZE, BB		/* += 32 floats (4 per k * 8) */
	decl	%eax
	jne	.L82
	ALIGN_4

.L85:
	movsd	ALPHA, %xmm3		/* reload alpha for the write-back */

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		/* k remainder after 8x unroll */
	BRANCH
	je	.L88
	ALIGN_4

.L86:
	/* One k iteration of the 2x1 cell. */
	mulps	%xmm0, %xmm1
	movsd	-30 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-28 * SIZE(BB), %xmm1

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L86
	ALIGN_4

.L88:
	/* Write-back for the 2-row cell. */
	addps	%xmm5, %xmm4
	mulps	%xmm3, %xmm4		/* *= alpha */

#ifndef TRMMKERNEL
	movsd	0 * SIZE(C1), %xmm0	/* GEMM: C(0..1) += alpha*AB */
	addps	%xmm0, %xmm4
#endif
	movsd	%xmm4, 0 * SIZE(C1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM pointer fixup: eax = (K-KKK)*8; AA += eax (2 floats/k),
	   BB += eax*2 (4 floats/k). */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif
	addl	$2 * SIZE, C1
	ALIGN_4

.L90:
	/* Handle an M%2 == 1 remainder: a single-row cell (pure scalar SSE). */
	movl	M, I
	testl	$1, I
	jle	.L99

.L91:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal 32 * SIZE + BUFFER, BB
#else
	/* TRMM trailing case: eax = KK*4; AA += eax (1 float per k),
	   BB += eax*4 (4 floats per k). */
	leal 32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, 4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB /* because it's doubled */
#endif

	movss	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movss	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movss	-16 * SIZE(BB), %xmm3
	movss	-28 * SIZE(AA), %xmm2

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax		/* 1 row */
#else
	addl	$1, %eax		/* 1 column */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		/* 8x unrolled */
	je	.L95
	ALIGN_4

.L92:
	/* 8x-unrolled 1x1 dot product.  Per k: 1 A float, 4 BUFFER
	   floats (only the first of each duplicated group is read). */
	mulss	%xmm0, %xmm1
	movss	-31 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm4
	movss	-28 * SIZE(BB), %xmm1
	mulss	%xmm0, %xmm1
	movss	-30 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm5
	movss	-24 * SIZE(BB), %xmm1
	mulss	%xmm0, %xmm1
	movss	-29 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm4
	movss	-20 * SIZE(BB), %xmm1
	mulss	%xmm0, %xmm1
	movss	-24 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm5
	movss	-0 * SIZE(BB), %xmm1
	mulss	%xmm2, %xmm3
	movss	-27 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm4
	movss	-12 * SIZE(BB), %xmm3
	mulss	%xmm2, %xmm3
	movss	-26 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm5
	movss	-8 * SIZE(BB), %xmm3
	mulss	%xmm2, %xmm3
	movss	-25 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm4
	movss	-4 * SIZE(BB), %xmm3
	mulss	%xmm2, %xmm3
	movss	-20 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm5
	movss	16 * SIZE(BB), %xmm3

	subl	$ -8 * SIZE, AA		/* += 8 floats (1 per k * 8) */
	subl	$-32 * SIZE, BB		/* += 32 floats (4 per k * 8) */
	decl	%eax
	jne	.L92
	ALIGN_4

.L95:
	movss	ALPHA, %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		/* k remainder */
	BRANCH
	je	.L98
	ALIGN_4

.L96:
	/* One k iteration of the 1x1 cell. */
	mulss	%xmm0, %xmm1
	movss	-31 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm4
	movss	-28 * SIZE(BB), %xmm1

	addl	$1 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L96
	ALIGN_4

.L98:
	/* Write-back for the single-row cell. */
	addss	%xmm5, %xmm4
	mulss	%xmm3, %xmm4		/* *= alpha */

#ifndef TRMMKERNEL
	movss	0 * SIZE(C1), %xmm0	/* GEMM: C(0) += alpha*AB */
	addss	%xmm0, %xmm4
#endif
	movss	%xmm4, 0 * SIZE(C1)
	ALIGN_4

.L99:
	addl	LDC, C			/* advance C by one column */
	ALIGN_4


.L999:
	/* Epilogue: undo the stack realignment done in the prologue,
	   clear MMX state (mm4 carried OFFSET), restore callee-saved
	   registers in reverse push order, and return. */
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE