/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/* IA-32 (32-bit x86), AT&T syntax, assembled through the C
 * preprocessor.  Hand-scheduled SSE3 GEMM micro-kernel: the paired
 * real/imaginary alpha arguments below suggest this is a
 * complex-single (cgemm-style) kernel -- TODO confirm against the
 * build system that selects this file.  PROLOGUE/PROFCODE/SIZE/
 * LOCAL_BUFFER_SIZE/STACK_TOUCHING come from common.h.            */

#define ASSEMBLER
#include "common.h"

/* Size of the caller's fixed stack frame below the arguments:
 * 4 saved registers (ebp/edi/esi/ebx) pushed in the prologue.    */
#define STACK	16

/* Incoming cdecl arguments.  %esi holds the caller's %esp saved at
 * entry, so each argument lives at (4 + 4*n) + STACK(%esi).       */
#define OLD_M	 4 + STACK(%esi)
#define OLD_N	 8 + STACK(%esi)
#define OLD_K	12 + STACK(%esi)
#define OLD_ALPHA_R	16 + STACK(%esi)
#define OLD_ALPHA_I	20 + STACK(%esi)
#define OLD_A	24 + STACK(%esi)
#define OLD_B	28 + STACK(%esi)
#define OLD_C	32 + STACK(%esi)
#define OLD_LDC	36 + STACK(%esi)

/* Locals on the 1024-byte-aligned private stack frame.  ALPHA is a
 * full 16-byte slot (an aligned-load target for movaps); BUFFER is
 * the start of the B-panel copy area (LOCAL_BUFFER_SIZE bytes).   */
#define ALPHA	 0(%esp)
#define K	16(%esp)
#define N	20(%esp)
#define M	24(%esp)
#define A	28(%esp)
#define C	32(%esp)
#define J	36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET	44(%esp)
#define KK	48(%esp)
#define KKK	52(%esp)
#define BUFFER	128(%esp)

/* Per-microarchitecture prefetch selection.  All three variants are
 * identical today; kept separate so they can be tuned per CPU.    */
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	96
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	96
#endif

#ifdef PENTIUMM
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	96
#endif

/* Register aliases: AA walks the A panel, BB walks the copied B
 * buffer, LDC is the C leading dimension (scaled to bytes later). */
#define AA	%edx
#define BB	%ecx
#define LDC	%ebp

/* KERNEL1..KERNEL8: one unrolled k-iteration each (8 per loop trip).
 * Pattern per iteration: xmm0/xmm1 hold 4 floats of A; movsldup
 * duplicates the even-indexed B elements and movshdup the
 * odd-indexed ones, so each mulps/addps pair accumulates into one
 * of the four accumulators xmm4..xmm7.  Loads of the NEXT
 * iteration's A/B data are interleaved with the arithmetic to hide
 * latency; (address) is a byte-offset knob used by the computed
 * jump in the main loop.  Do not reorder these statements.        */
#define KERNEL1(address) \
	mulps	%xmm0, %xmm2; \
	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA); \
	addps	%xmm2, %xmm4; \
	movshdup	 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup	 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup	 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	movaps	 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm2, %xmm7; \
	movsldup	 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2

#define KERNEL2(address) \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm4; \
	movshdup	 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup	12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup	12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	movaps	 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm2, %xmm7; \
	movsldup	32 * SIZE + 2 * (address) * SIZE(BB), %xmm2

/* KERNEL3/4 use xmm3 as the B scratch register (preloaded with the
 * 16*SIZE B block) so the xmm2 chain above can retire in parallel. */
#define KERNEL3(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup	16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup	20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup	20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	movaps	12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm3, %xmm7; \
	movsldup	24 * SIZE + 2 * (address) * SIZE(BB), %xmm3

#define KERNEL4(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup	24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup	28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup	28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	movaps	32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm3, %xmm7; \
	movsldup	48 * SIZE + 2 * (address) * SIZE(BB), %xmm3

/* KERNEL5..KERNEL8 mirror 1..4 with xmm1 as the A register
 * (preloaded with the 16*SIZE A block).                           */
#define KERNEL5(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movshdup	32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup	36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup	36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	movaps	20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm2, %xmm7

/* Note: KERNEL5 deliberately ends without refilling xmm2; KERNEL6
 * starts with that load instead.                                  */
#define KERNEL6(address) \
	movsldup	40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movshdup	40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup	44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup	44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	movaps	24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm2, %xmm7; \
	movsldup	64 * SIZE + 2 * (address) * SIZE(BB), %xmm2

#define KERNEL7(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup	48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup	52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup	52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	movaps	28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm3, %xmm7; \
	movsldup	56 * SIZE + 2 * (address) * SIZE(BB), %xmm3

#define KERNEL8(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup	56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup	60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup	60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	movaps	48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm3, %xmm7; \
	movsldup	80 * SIZE + 2 * (address) * SIZE(BB), %xmm3


	PROLOGUE

	/* Save the IA-32 callee-saved registers (cdecl ABI). */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	%esp, %esi	# save old stack pointer (arguments are read via %esi)
	subl	$128 + LOCAL_BUFFER_SIZE, %esp	# reserve locals + B copy buffer
	movl	OLD_M, %ebx
	andl	$-1024, %esp	# align stack to 1 KB for aligned SSE access

	STACK_TOUCHING

	/* Spill the scalar arguments into the aligned local frame. */
	movl	OLD_N, %eax
	movl	OLD_K, %ecx
	movl	OLD_A, %edx
	movss	OLD_ALPHA_R, %xmm0
	movss	OLD_ALPHA_I, %xmm1

	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	movl	%edx, A
	movl	%esi, OLD_STACK	# keep caller's %esp so the epilogue can restore it

	movl	OLD_B, %edi
	movl	OLD_C, %ebx

	/* Build packed alpha = [r, i, r, i] for one-mulps scaling:
	 * unpcklps interleaves r and i in the low half, movlhps
	 * duplicates that pair into the high half.                */
	unpcklps %xmm1, %xmm0
	movlhps	%xmm0, %xmm0
225 movaps %xmm0, ALPHA 226 227 movl %ebx, C 228 movl OLD_LDC, LDC 229#ifdef TRMMKERNEL 230 movss %xmm4, OFFSET 231 movss %xmm4, KK 232#ifndef LEFT 233 negl KK 234#endif 235#endif 236 237 sall $ZBASE_SHIFT, LDC 238 239 sarl $2, %eax 240 movl %eax, J 241 jle .L40 242 243.L01: 244#if defined(TRMMKERNEL) && defined(LEFT) 245 movl OFFSET, %eax 246 movl %eax, KK 247#endif 248 249/* Copying to Sub Buffer */ 250 leal BUFFER, %ecx 251 252 movl K, %eax 253 sarl $2, %eax 254 jle .L05 255 ALIGN_4 256 257.L02: 258 movddup 0 * SIZE(%edi), %xmm0 259 movddup 2 * SIZE(%edi), %xmm1 260 movddup 4 * SIZE(%edi), %xmm2 261 movddup 6 * SIZE(%edi), %xmm3 262 movddup 8 * SIZE(%edi), %xmm4 263 movddup 10 * SIZE(%edi), %xmm5 264 movddup 12 * SIZE(%edi), %xmm6 265 movddup 14 * SIZE(%edi), %xmm7 266 267 movaps %xmm0, 0 * SIZE(%ecx) 268 movaps %xmm1, 4 * SIZE(%ecx) 269 movaps %xmm2, 8 * SIZE(%ecx) 270 movaps %xmm3, 12 * SIZE(%ecx) 271 movaps %xmm4, 16 * SIZE(%ecx) 272 movaps %xmm5, 20 * SIZE(%ecx) 273 movaps %xmm6, 24 * SIZE(%ecx) 274 movaps %xmm7, 28 * SIZE(%ecx) 275 276# prefetcht1 128 * SIZE(%ecx) 277 prefetcht0 112 * SIZE(%edi) 278 279 addl $16 * SIZE, %edi 280 addl $32 * SIZE, %ecx 281 decl %eax 282 jne .L02 283 ALIGN_2 284 285.L05: 286 movl K, %eax 287 andl $3, %eax 288 BRANCH 289 jle .L10 290 ALIGN_2 291 292.L06: 293 movddup 0 * SIZE(%edi), %xmm0 294 movddup 2 * SIZE(%edi), %xmm1 295 296 movaps %xmm0, 0 * SIZE(%ecx) 297 movaps %xmm1, 4 * SIZE(%ecx) 298 299 addl $4 * SIZE, %edi 300 addl $8 * SIZE, %ecx 301 decl %eax 302 jne .L06 303 ALIGN_4 304 305.L10: 306 movl C, %esi # coffset = c 307 movl A, %edx # aoffset = a 308 movl M, %ebx 309 sarl $2, %ebx # i = (m >> 2) 310 jle .L20 311 ALIGN_4 312 313.L11: 314#if !defined(TRMMKERNEL) || \ 315 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 316 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 317 318 leal BUFFER, BB # boffset1 = boffset 319#else 320 leal BUFFER, BB # boffset1 = boffset 321 movl KK, %eax 322 leal (, 
%eax, 8), %eax 323 leal (AA, %eax, 2), AA 324 leal (BB, %eax, 4), BB 325#endif 326 327 movaps 0 * SIZE(AA), %xmm0 328 pxor %xmm4, %xmm4 329 movaps 16 * SIZE(AA), %xmm1 330 pxor %xmm5, %xmm5 331 movsldup 0 * SIZE(BB), %xmm2 332 pxor %xmm6, %xmm6 333 movsldup 16 * SIZE(BB), %xmm3 334 pxor %xmm7, %xmm7 335 336 leal (LDC, LDC, 2), %eax 337 338 prefetchnta 4 * SIZE(%esi) 339 prefetchnta 4 * SIZE(%esi, LDC) 340 prefetchnta 4 * SIZE(%esi, LDC, 2) 341 prefetchnta 4 * SIZE(%esi, %eax) 342 343#ifndef TRMMKERNEL 344 movl K, %eax 345#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 346 movl K, %eax 347 subl KK, %eax 348 movl %eax, KKK 349#else 350 movl KK, %eax 351#ifdef LEFT 352 addl $4, %eax 353#else 354 addl $4, %eax 355#endif 356 movl %eax, KKK 357#endif 358 359#if 1 360 andl $-8, %eax 361 sall $4, %eax 362 je .L15 363.L1X: 364 KERNEL1(32 * 0) 365 KERNEL2(32 * 0) 366 KERNEL3(32 * 0) 367 KERNEL4(32 * 0) 368 KERNEL5(32 * 0) 369 KERNEL6(32 * 0) 370 KERNEL7(32 * 0) 371 KERNEL8(32 * 0) 372 cmpl $128 * 1, %eax 373 jle .L12 374 KERNEL1(32 * 1) 375 KERNEL2(32 * 1) 376 KERNEL3(32 * 1) 377 KERNEL4(32 * 1) 378 KERNEL5(32 * 1) 379 KERNEL6(32 * 1) 380 KERNEL7(32 * 1) 381 KERNEL8(32 * 1) 382 cmpl $128 * 2, %eax 383 jle .L12 384 KERNEL1(32 * 2) 385 KERNEL2(32 * 2) 386 KERNEL3(32 * 2) 387 KERNEL4(32 * 2) 388 KERNEL5(32 * 2) 389 KERNEL6(32 * 2) 390 KERNEL7(32 * 2) 391 KERNEL8(32 * 2) 392 cmpl $128 * 3, %eax 393 jle .L12 394 KERNEL1(32 * 3) 395 KERNEL2(32 * 3) 396 KERNEL3(32 * 3) 397 KERNEL4(32 * 3) 398 KERNEL5(32 * 3) 399 KERNEL6(32 * 3) 400 KERNEL7(32 * 3) 401 KERNEL8(32 * 3) 402 cmpl $128 * 4, %eax 403 jle .L12 404 KERNEL1(32 * 4) 405 KERNEL2(32 * 4) 406 KERNEL3(32 * 4) 407 KERNEL4(32 * 4) 408 KERNEL5(32 * 4) 409 KERNEL6(32 * 4) 410 KERNEL7(32 * 4) 411 KERNEL8(32 * 4) 412 cmpl $128 * 5, %eax 413 jle .L12 414 KERNEL1(32 * 5) 415 KERNEL2(32 * 5) 416 KERNEL3(32 * 5) 417 KERNEL4(32 * 5) 418 KERNEL5(32 * 5) 419 KERNEL6(32 * 5) 420 KERNEL7(32 * 5) 421 KERNEL8(32 
* 5) 422 cmpl $128 * 6, %eax 423 jle .L12 424 KERNEL1(32 * 6) 425 KERNEL2(32 * 6) 426 KERNEL3(32 * 6) 427 KERNEL4(32 * 6) 428 KERNEL5(32 * 6) 429 KERNEL6(32 * 6) 430 KERNEL7(32 * 6) 431 KERNEL8(32 * 6) 432 cmpl $128 * 7, %eax 433 jle .L12 434 KERNEL1(32 * 7) 435 KERNEL2(32 * 7) 436 KERNEL3(32 * 7) 437 KERNEL4(32 * 7) 438 KERNEL5(32 * 7) 439 KERNEL6(32 * 7) 440 KERNEL7(32 * 7) 441 KERNEL8(32 * 7) 442#if 1 443 cmpl $128 * 8, %eax 444 jle .L12 445 KERNEL1(32 * 8) 446 KERNEL2(32 * 8) 447 KERNEL3(32 * 8) 448 KERNEL4(32 * 8) 449 KERNEL5(32 * 8) 450 KERNEL6(32 * 8) 451 KERNEL7(32 * 8) 452 KERNEL8(32 * 8) 453 cmpl $128 * 9, %eax 454 jle .L12 455 KERNEL1(32 * 9) 456 KERNEL2(32 * 9) 457 KERNEL3(32 * 9) 458 KERNEL4(32 * 9) 459 KERNEL5(32 * 9) 460 KERNEL6(32 * 9) 461 KERNEL7(32 * 9) 462 KERNEL8(32 * 9) 463 cmpl $128 * 10, %eax 464 jle .L12 465 KERNEL1(32 * 10) 466 KERNEL2(32 * 10) 467 KERNEL3(32 * 10) 468 KERNEL4(32 * 10) 469 KERNEL5(32 * 10) 470 KERNEL6(32 * 10) 471 KERNEL7(32 * 10) 472 KERNEL8(32 * 10) 473 cmpl $128 * 11, %eax 474 jle .L12 475 KERNEL1(32 * 11) 476 KERNEL2(32 * 11) 477 KERNEL3(32 * 11) 478 KERNEL4(32 * 11) 479 KERNEL5(32 * 11) 480 KERNEL6(32 * 11) 481 KERNEL7(32 * 11) 482 KERNEL8(32 * 11) 483 cmpl $128 * 12, %eax 484 jle .L12 485 KERNEL1(32 * 12) 486 KERNEL2(32 * 12) 487 KERNEL3(32 * 12) 488 KERNEL4(32 * 12) 489 KERNEL5(32 * 12) 490 KERNEL6(32 * 12) 491 KERNEL7(32 * 12) 492 KERNEL8(32 * 12) 493 cmpl $128 * 13, %eax 494 jle .L12 495 KERNEL1(32 * 13) 496 KERNEL2(32 * 13) 497 KERNEL3(32 * 13) 498 KERNEL4(32 * 13) 499 KERNEL5(32 * 13) 500 KERNEL6(32 * 13) 501 KERNEL7(32 * 13) 502 KERNEL8(32 * 13) 503 cmpl $128 * 14, %eax 504 jle .L12 505 KERNEL1(32 * 14) 506 KERNEL2(32 * 14) 507 KERNEL3(32 * 14) 508 KERNEL4(32 * 14) 509 KERNEL5(32 * 14) 510 KERNEL6(32 * 14) 511 KERNEL7(32 * 14) 512 KERNEL8(32 * 14) 513 cmpl $128 * 15, %eax 514 jle .L12 515 KERNEL1(32 * 15) 516 KERNEL2(32 * 15) 517 KERNEL3(32 * 15) 518 KERNEL4(32 * 15) 519 KERNEL5(32 * 15) 520 KERNEL6(32 * 15) 521 
KERNEL7(32 * 15) 522 KERNEL8(32 * 15) 523#else 524 addl $128 * 4 * SIZE, BB 525 addl $128 * 2 * SIZE, AA 526 subl $128 * 8, %eax 527 jg .L1X 528 jmp .L15 529#endif 530 531.L12: 532 leal (AA, %eax, 1), AA 533 leal (BB, %eax, 2), BB 534 ALIGN_4 535#else 536 sarl $3, %eax 537 je .L15 538 ALIGN_4 539 540.L12: 541 KERNEL1(32 * 7) 542 KERNEL2(32 * 7) 543 KERNEL3(32 * 7) 544 KERNEL4(32 * 7) 545 KERNEL5(32 * 7) 546 KERNEL6(32 * 7) 547 KERNEL7(32 * 7) 548 KERNEL8(32 * 7) 549 550 addl $32 * SIZE, AA 551 addl $64 * SIZE, BB 552 decl %eax 553 jne .L12 554 ALIGN_4 555#endif 556 557.L15: 558#ifndef TRMMKERNEL 559 movl K, %eax 560#else 561 movl KKK, %eax 562#endif 563 movaps ALPHA, %xmm3 564 andl $7, %eax # if (k & 1) 565 BRANCH 566 je .L18 567 ALIGN_4 568 569.L16: 570 mulps %xmm0, %xmm2 571 addps %xmm2, %xmm4 572 movshdup 0 * SIZE(BB), %xmm2 573 mulps %xmm0, %xmm2 574 addps %xmm2, %xmm5 575 movsldup 4 * SIZE(BB), %xmm2 576 mulps %xmm0, %xmm2 577 addps %xmm2, %xmm6 578 movshdup 4 * SIZE(BB), %xmm2 579 mulps %xmm0, %xmm2 580 movaps 4 * SIZE(AA), %xmm0 581 addps %xmm2, %xmm7 582 movsldup 8 * SIZE(BB), %xmm2 583 584 addl $4 * SIZE, AA 585 addl $8 * SIZE, BB 586 decl %eax 587 jg .L16 588 ALIGN_4 589 590.L18: 591 leal (LDC, LDC, 2), %eax 592 593 movsd 0 * SIZE(%esi), %xmm0 594 movhps 2 * SIZE(%esi), %xmm0 595 movsd 4 * SIZE(%esi), %xmm1 596 movhps 6 * SIZE(%esi), %xmm1 597 598 pshufd $0x50, %xmm4, %xmm2 599 pshufd $0xfa, %xmm4, %xmm4 600 601 mulps %xmm3, %xmm2 602 mulps %xmm3, %xmm4 603 604 addps %xmm2, %xmm0 605 addps %xmm4, %xmm1 606 607 movlps %xmm0, 0 * SIZE(%esi) 608 movhps %xmm0, 2 * SIZE(%esi) 609 movlps %xmm1, 4 * SIZE(%esi) 610 movhps %xmm1, 6 * SIZE(%esi) 611 612 movsd 0 * SIZE(%esi, LDC), %xmm0 613 movhps 2 * SIZE(%esi, LDC), %xmm0 614 movsd 4 * SIZE(%esi, LDC), %xmm1 615 movhps 6 * SIZE(%esi, LDC), %xmm1 616 617 pshufd $0x50, %xmm5, %xmm2 618 pshufd $0xfa, %xmm5, %xmm5 619 620 mulps %xmm3, %xmm2 621 mulps %xmm3, %xmm5 622 623 addps %xmm2, %xmm0 624 addps %xmm5, %xmm1 625 
626 movlps %xmm0, 0 * SIZE(%esi, LDC) 627 movhps %xmm0, 2 * SIZE(%esi, LDC) 628 movlps %xmm1, 4 * SIZE(%esi, LDC) 629 movhps %xmm1, 6 * SIZE(%esi, LDC) 630 631 movsd 0 * SIZE(%esi, LDC, 2), %xmm0 632 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 633 movsd 4 * SIZE(%esi, LDC, 2), %xmm1 634 movhps 6 * SIZE(%esi, LDC, 2), %xmm1 635 636 pshufd $0x50, %xmm6, %xmm2 637 pshufd $0xfa, %xmm6, %xmm6 638 639 mulps %xmm3, %xmm2 640 mulps %xmm3, %xmm6 641 642 addps %xmm2, %xmm0 643 addps %xmm6, %xmm1 644 645 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) 646 movhps %xmm0, 2 * SIZE(%esi, LDC, 2) 647 movlps %xmm1, 4 * SIZE(%esi, LDC, 2) 648 movhps %xmm1, 6 * SIZE(%esi, LDC, 2) 649 650 movsd 0 * SIZE(%esi, %eax), %xmm0 651 movhps 2 * SIZE(%esi, %eax), %xmm0 652 movsd 4 * SIZE(%esi, %eax), %xmm1 653 movhps 6 * SIZE(%esi, %eax), %xmm1 654 655 pshufd $0x50, %xmm7, %xmm2 656 pshufd $0xfa, %xmm7, %xmm7 657 658 mulps %xmm3, %xmm2 659 mulps %xmm3, %xmm7 660 661 addps %xmm2, %xmm0 662 addps %xmm7, %xmm1 663 664 movlps %xmm0, 0 * SIZE(%esi, %eax) 665 movhps %xmm0, 2 * SIZE(%esi, %eax) 666 movlps %xmm1, 4 * SIZE(%esi, %eax) 667 movhps %xmm1, 6 * SIZE(%esi, %eax) 668 669 addl $8 * SIZE, %esi # coffset += 2 670 decl %ebx # i -- 671 jg .L11 672 ALIGN_4 673 674.L20: 675 testl $2, M 676 je .L30 677 678#if !defined(TRMMKERNEL) || \ 679 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 680 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 681 682 leal BUFFER, BB # boffset1 = boffset 683#else 684 leal BUFFER, BB # boffset1 = boffset 685 movl KK, %eax 686 leal (, %eax, 8), %eax 687 leal (AA, %eax, 1), AA 688 leal (BB, %eax, 4), BB 689#endif 690 691 movddup 0 * SIZE(AA), %xmm0 692 pxor %xmm4, %xmm4 693 movddup 8 * SIZE(AA), %xmm1 694 pxor %xmm5, %xmm5 695 movsd 0 * SIZE(BB), %xmm2 696 movsd 16 * SIZE(BB), %xmm3 697 698#ifndef TRMMKERNEL 699 movl K, %eax 700#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 701 movl K, %eax 702 subl KK, %eax 703 movl %eax, KKK 
704#else 705 movl KK, %eax 706#ifdef LEFT 707 addl $2, %eax 708#else 709 addl $4, %eax 710#endif 711 movl %eax, KKK 712#endif 713 sarl $3, %eax 714 je .L25 715 ALIGN_4 716 717.L22: 718 shufps $0x50, %xmm2, %xmm2 719 mulps %xmm0, %xmm2 720 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 721 addps %xmm2, %xmm4 722 movsd 4 * SIZE(BB), %xmm2 723 shufps $0x50, %xmm2, %xmm2 724 mulps %xmm0, %xmm2 725 movddup 2 * SIZE(AA), %xmm0 726 addps %xmm2, %xmm5 727 movsd 8 * SIZE(BB), %xmm2 728 shufps $0x50, %xmm2, %xmm2 729 mulps %xmm0, %xmm2 730 addps %xmm2, %xmm4 731 movsd 12 * SIZE(BB), %xmm2 732 shufps $0x50, %xmm2, %xmm2 733 mulps %xmm0, %xmm2 734 movddup 4 * SIZE(AA), %xmm0 735 addps %xmm2, %xmm5 736 movsd 32 * SIZE(BB), %xmm2 737 shufps $0x50, %xmm3, %xmm3 738 mulps %xmm0, %xmm3 739 addps %xmm3, %xmm4 740 movsd 20 * SIZE(BB), %xmm3 741 shufps $0x50, %xmm3, %xmm3 742 mulps %xmm0, %xmm3 743 movddup 6 * SIZE(AA), %xmm0 744 addps %xmm3, %xmm5 745 movsd 24 * SIZE(BB), %xmm3 746 shufps $0x50, %xmm3, %xmm3 747 mulps %xmm0, %xmm3 748 addps %xmm3, %xmm4 749 movsd 28 * SIZE(BB), %xmm3 750 shufps $0x50, %xmm3, %xmm3 751 mulps %xmm0, %xmm3 752 movddup 16 * SIZE(AA), %xmm0 753 addps %xmm3, %xmm5 754 movsd 48 * SIZE(BB), %xmm3 755 shufps $0x50, %xmm2, %xmm2 756 mulps %xmm1, %xmm2 757 addps %xmm2, %xmm4 758 movsd 36 * SIZE(BB), %xmm2 759 shufps $0x50, %xmm2, %xmm2 760 mulps %xmm1, %xmm2 761 movddup 10 * SIZE(AA), %xmm1 762 addps %xmm2, %xmm5 763 movsd 40 * SIZE(BB), %xmm2 764 shufps $0x50, %xmm2, %xmm2 765 mulps %xmm1, %xmm2 766 addps %xmm2, %xmm4 767 movsd 44 * SIZE(BB), %xmm2 768 shufps $0x50, %xmm2, %xmm2 769 mulps %xmm1, %xmm2 770 movddup 12 * SIZE(AA), %xmm1 771 addps %xmm2, %xmm5 772 movsd 64 * SIZE(BB), %xmm2 773 shufps $0x50, %xmm3, %xmm3 774 mulps %xmm1, %xmm3 775 addps %xmm3, %xmm4 776 movsd 52 * SIZE(BB), %xmm3 777 shufps $0x50, %xmm3, %xmm3 778 mulps %xmm1, %xmm3 779 movddup 14 * SIZE(AA), %xmm1 780 addps %xmm3, %xmm5 781 movsd 56 * SIZE(BB), %xmm3 782 shufps $0x50, %xmm3, %xmm3 783 
mulps %xmm1, %xmm3 784 addps %xmm3, %xmm4 785 movsd 60 * SIZE(BB), %xmm3 786 shufps $0x50, %xmm3, %xmm3 787 mulps %xmm1, %xmm3 788 movddup 24 * SIZE(AA), %xmm1 789 addps %xmm3, %xmm5 790 movsd 80 * SIZE(BB), %xmm3 791 792 addl $16 * SIZE, AA 793 addl $64 * SIZE, BB 794 decl %eax 795 jne .L22 796 ALIGN_4 797 798.L25: 799#ifndef TRMMKERNEL 800 movl K, %eax 801#else 802 movl KKK, %eax 803#endif 804 movaps ALPHA, %xmm3 805 andl $7, %eax # if (k & 1) 806 BRANCH 807 je .L28 808 ALIGN_4 809 810.L26: 811 shufps $0x50, %xmm2, %xmm2 812 mulps %xmm0, %xmm2 813 addps %xmm2, %xmm4 814 movsd 4 * SIZE(BB), %xmm2 815 shufps $0x50, %xmm2, %xmm2 816 mulps %xmm0, %xmm2 817 movddup 2 * SIZE(AA), %xmm0 818 addps %xmm2, %xmm5 819 movsd 8 * SIZE(BB), %xmm2 820 821 addl $2 * SIZE, AA 822 addl $8 * SIZE, BB 823 decl %eax 824 jg .L26 825 ALIGN_4 826 827.L28: 828 leal (LDC, LDC, 2), %eax 829 830 movsd 0 * SIZE(%esi), %xmm0 831 movhps 2 * SIZE(%esi), %xmm0 832 movsd 0 * SIZE(%esi, LDC), %xmm1 833 movhps 2 * SIZE(%esi, LDC), %xmm1 834 835 pshufd $0x50, %xmm4, %xmm2 836 pshufd $0xfa, %xmm4, %xmm4 837 838 mulps %xmm3, %xmm2 839 mulps %xmm3, %xmm4 840 841 addps %xmm2, %xmm0 842 addps %xmm4, %xmm1 843 844 movlps %xmm0, 0 * SIZE(%esi) 845 movhps %xmm0, 2 * SIZE(%esi) 846 movlps %xmm1, 0 * SIZE(%esi, LDC) 847 movhps %xmm1, 2 * SIZE(%esi, LDC) 848 849 movsd 0 * SIZE(%esi, LDC, 2), %xmm0 850 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 851 movsd 0 * SIZE(%esi, %eax), %xmm1 852 movhps 2 * SIZE(%esi, %eax), %xmm1 853 854 pshufd $0x50, %xmm5, %xmm2 855 pshufd $0xfa, %xmm5, %xmm5 856 857 mulps %xmm3, %xmm2 858 mulps %xmm3, %xmm5 859 860 addps %xmm2, %xmm0 861 addps %xmm5, %xmm1 862 863 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) 864 movhps %xmm0, 2 * SIZE(%esi, LDC, 2) 865 movlps %xmm1, 0 * SIZE(%esi, %eax) 866 movhps %xmm1, 2 * SIZE(%esi, %eax) 867 868 addl $4 * SIZE, %esi # coffset += 2 869 ALIGN_4 870 871.L30: 872 testl $1, M 873 je .L39 874 875#if !defined(TRMMKERNEL) || \ 876 (defined(TRMMKERNEL) && defined(LEFT) 
&& defined(TRANSA)) || \ 877 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 878 879 leal BUFFER, BB # boffset1 = boffset 880#else 881 leal BUFFER, BB # boffset1 = boffset 882 movl KK, %eax 883 leal (, %eax, 4), %eax 884 leal (AA, %eax, 1), AA 885 leal (BB, %eax, 8), BB 886#endif 887 888 movss 0 * SIZE(AA), %xmm0 889 pxor %xmm4, %xmm4 890 movss 4 * SIZE(AA), %xmm1 891 pxor %xmm5, %xmm5 892 movsd 0 * SIZE(BB), %xmm2 893 movsd 16 * SIZE(BB), %xmm3 894 895#ifndef TRMMKERNEL 896 movl K, %eax 897#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 898 movl K, %eax 899 subl KK, %eax 900 movl %eax, KKK 901#else 902 movl KK, %eax 903#ifdef LEFT 904 addl $1, %eax 905#else 906 addl $4, %eax 907#endif 908 movl %eax, KKK 909#endif 910 sarl $3, %eax 911 je .L35 912 ALIGN_4 913 914.L32: 915 shufps $0, %xmm0, %xmm0 916 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 917 movhps 4 * SIZE(BB), %xmm2 918 mulps %xmm0, %xmm2 919 movss 1 * SIZE(AA), %xmm0 920 addps %xmm2, %xmm4 921 movsd 8 * SIZE(BB), %xmm2 922 shufps $0, %xmm0, %xmm0 923 movhps 12 * SIZE(BB), %xmm2 924 mulps %xmm0, %xmm2 925 movss 2 * SIZE(AA), %xmm0 926 addps %xmm2, %xmm5 927 movhps 20 * SIZE(BB), %xmm3 928 shufps $0, %xmm0, %xmm0 929 movsd 32 * SIZE(BB), %xmm2 930 mulps %xmm0, %xmm3 931 movss 3 * SIZE(AA), %xmm0 932 addps %xmm3, %xmm4 933 movsd 24 * SIZE(BB), %xmm3 934 shufps $0, %xmm0, %xmm0 935 movhps 28 * SIZE(BB), %xmm3 936 mulps %xmm0, %xmm3 937 movss 8 * SIZE(AA), %xmm0 938 addps %xmm3, %xmm5 939 movsd 48 * SIZE(BB), %xmm3 940 shufps $0, %xmm1, %xmm1 941 movhps 36 * SIZE(BB), %xmm2 942 mulps %xmm1, %xmm2 943 movss 5 * SIZE(AA), %xmm1 944 addps %xmm2, %xmm4 945 movsd 40 * SIZE(BB), %xmm2 946 shufps $0, %xmm1, %xmm1 947 movhps 44 * SIZE(BB), %xmm2 948 mulps %xmm1, %xmm2 949 movss 6 * SIZE(AA), %xmm1 950 addps %xmm2, %xmm5 951 movsd 64 * SIZE(BB), %xmm2 952 shufps $0, %xmm1, %xmm1 953 movhps 52 * SIZE(BB), %xmm3 954 mulps %xmm1, %xmm3 955 movss 7 * SIZE(AA), %xmm1 956 addps %xmm3, 
%xmm4 957 movsd 56 * SIZE(BB), %xmm3 958 shufps $0, %xmm1, %xmm1 959 movhps 60 * SIZE(BB), %xmm3 960 mulps %xmm1, %xmm3 961 movss 12 * SIZE(AA), %xmm1 962 addps %xmm3, %xmm5 963 movsd 80 * SIZE(BB), %xmm3 964 965 addl $ 8 * SIZE, AA 966 addl $64 * SIZE, BB 967 decl %eax 968 jne .L32 969 ALIGN_4 970 971.L35: 972#ifndef TRMMKERNEL 973 movl K, %eax 974#else 975 movl KKK, %eax 976#endif 977 movaps ALPHA, %xmm3 978 andl $7, %eax # if (k & 1) 979 BRANCH 980 je .L38 981 ALIGN_4 982 983.L36: 984 shufps $0, %xmm0, %xmm0 985 movhps 4 * SIZE(BB), %xmm2 986 mulps %xmm0, %xmm2 987 movss 1 * SIZE(AA), %xmm0 988 addps %xmm2, %xmm4 989 movsd 8 * SIZE(BB), %xmm2 990 991 addl $1 * SIZE, AA 992 addl $8 * SIZE, BB 993 decl %eax 994 jg .L36 995 ALIGN_4 996 997.L38: 998 leal (LDC, LDC, 2), %eax 999 1000 addps %xmm5, %xmm4 1001 1002 movsd (%esi), %xmm0 1003 movhps (%esi, LDC), %xmm0 1004 movsd (%esi, LDC, 2), %xmm1 1005 movhps (%esi, %eax), %xmm1 1006 1007 pshufd $0x50, %xmm4, %xmm2 1008 pshufd $0xfa, %xmm4, %xmm4 1009 1010 mulps %xmm3, %xmm2 1011 mulps %xmm3, %xmm4 1012 1013 addps %xmm2, %xmm0 1014 addps %xmm4, %xmm1 1015 1016 movlps %xmm0, (%esi) 1017 movhps %xmm0, (%esi, LDC) 1018 movlps %xmm1, (%esi, LDC, 2) 1019 movhps %xmm1, (%esi, %eax) 1020 ALIGN_4 1021 1022.L39: 1023#if defined(TRMMKERNEL) && !defined(LEFT) 1024 addl $4, KK 1025#endif 1026 1027 leal (, LDC, 4), %eax 1028 addl %eax, C # c += 4 * ldc 1029 decl J # j -- 1030 jg .L01 1031 ALIGN_4 1032 1033.L40: 1034 testl $2, N 1035 je .L80 1036 1037#if defined(TRMMKERNEL) && defined(LEFT) 1038 movl OFFSET, %eax 1039 movl %eax, KK 1040#endif 1041 1042 movl K, %eax 1043 leal BUFFER, %ecx 1044 sarl $3, %eax 1045 jle .L45 1046 ALIGN_4 1047 1048.L42: 1049 movddup 0 * SIZE(%edi), %xmm0 1050 movddup 2 * SIZE(%edi), %xmm1 1051 movddup 4 * SIZE(%edi), %xmm2 1052 movddup 6 * SIZE(%edi), %xmm3 1053 movddup 8 * SIZE(%edi), %xmm4 1054 movddup 10 * SIZE(%edi), %xmm5 1055 movddup 12 * SIZE(%edi), %xmm6 1056 movddup 14 * SIZE(%edi), %xmm7 1057 
1058 movaps %xmm0, 0 * SIZE(%ecx) 1059 movaps %xmm1, 4 * SIZE(%ecx) 1060 movaps %xmm2, 8 * SIZE(%ecx) 1061 movaps %xmm3, 12 * SIZE(%ecx) 1062 movaps %xmm4, 16 * SIZE(%ecx) 1063 movaps %xmm5, 20 * SIZE(%ecx) 1064 movaps %xmm6, 24 * SIZE(%ecx) 1065 movaps %xmm7, 28 * SIZE(%ecx) 1066 1067# prefetcht1 128 * SIZE(%ecx) 1068 prefetcht0 112 * SIZE(%edi) 1069 1070 addl $16 * SIZE, %edi 1071 addl $32 * SIZE, %ecx 1072 decl %eax 1073 jne .L42 1074 ALIGN_4 1075 1076.L45: 1077 movl K, %eax 1078 andl $7, %eax 1079 BRANCH 1080 jle .L50 1081 ALIGN_4 1082 1083.L46: 1084 movddup 0 * SIZE(%edi), %xmm0 1085 movaps %xmm0, 0 * SIZE(%ecx) 1086 1087 addl $2 * SIZE, %edi 1088 addl $4 * SIZE, %ecx 1089 decl %eax 1090 jne .L46 1091 ALIGN_4 1092 1093.L50: 1094 movl C, %esi # coffset = c 1095 movl A, %edx # aoffset = a 1096 movl M, %ebx 1097 sarl $2, %ebx # i = (m >> 2) 1098 jle .L60 1099 ALIGN_4 1100 1101.L51: 1102#if !defined(TRMMKERNEL) || \ 1103 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1104 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1105 1106 leal BUFFER, BB # boffset1 = boffset 1107#else 1108 leal BUFFER, BB # boffset1 = boffset 1109 movl KK, %eax 1110 leal (, %eax, 8), %eax 1111 leal (AA, %eax, 2), AA 1112 leal (BB, %eax, 2), BB 1113#endif 1114 1115 movaps 0 * SIZE(AA), %xmm0 1116 pxor %xmm4, %xmm4 1117 movaps 16 * SIZE(AA), %xmm1 1118 pxor %xmm5, %xmm5 1119 movsldup 0 * SIZE(BB), %xmm2 1120 pxor %xmm6, %xmm6 1121 movsldup 16 * SIZE(BB), %xmm3 1122 pxor %xmm7, %xmm7 1123 1124 prefetcht2 4 * SIZE(%esi) 1125 prefetcht2 4 * SIZE(%esi, LDC) 1126 1127#ifndef TRMMKERNEL 1128 movl K, %eax 1129#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1130 movl K, %eax 1131 subl KK, %eax 1132 movl %eax, KKK 1133#else 1134 movl KK, %eax 1135#ifdef LEFT 1136 addl $4, %eax 1137#else 1138 addl $2, %eax 1139#endif 1140 movl %eax, KKK 1141#endif 1142 sarl $3, %eax 1143 je .L55 1144 ALIGN_4 1145 1146.L52: 1147 mulps %xmm0, %xmm2 1148 addps 
%xmm2, %xmm4 1149 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1150 movshdup 0 * SIZE(BB), %xmm2 1151 mulps %xmm0, %xmm2 1152 movaps 4 * SIZE(AA), %xmm0 1153 addps %xmm2, %xmm5 1154 movsldup 4 * SIZE(BB), %xmm2 1155 mulps %xmm0, %xmm2 1156 addps %xmm2, %xmm4 1157 movshdup 4 * SIZE(BB), %xmm2 1158 mulps %xmm0, %xmm2 1159 movaps 8 * SIZE(AA), %xmm0 1160 addps %xmm2, %xmm5 1161 movsldup 8 * SIZE(BB), %xmm2 1162 mulps %xmm0, %xmm2 1163 addps %xmm2, %xmm4 1164 movshdup 8 * SIZE(BB), %xmm2 1165 mulps %xmm0, %xmm2 1166 movaps 12 * SIZE(AA), %xmm0 1167 addps %xmm2, %xmm5 1168 movsldup 12 * SIZE(BB), %xmm2 1169 mulps %xmm0, %xmm2 1170 addps %xmm2, %xmm4 1171 movshdup 12 * SIZE(BB), %xmm2 1172 mulps %xmm0, %xmm2 1173 movaps 32 * SIZE(AA), %xmm0 1174 addps %xmm2, %xmm5 1175 movsldup 32 * SIZE(BB), %xmm2 1176 mulps %xmm1, %xmm3 1177 addps %xmm3, %xmm4 1178 movshdup 16 * SIZE(BB), %xmm3 1179 mulps %xmm1, %xmm3 1180 movaps 20 * SIZE(AA), %xmm1 1181 addps %xmm3, %xmm5 1182 movsldup 20 * SIZE(BB), %xmm3 1183 mulps %xmm1, %xmm3 1184 addps %xmm3, %xmm4 1185 movshdup 20 * SIZE(BB), %xmm3 1186 mulps %xmm1, %xmm3 1187 movaps 24 * SIZE(AA), %xmm1 1188 addps %xmm3, %xmm5 1189 movsldup 24 * SIZE(BB), %xmm3 1190 mulps %xmm1, %xmm3 1191 addps %xmm3, %xmm4 1192 movshdup 24 * SIZE(BB), %xmm3 1193 mulps %xmm1, %xmm3 1194 movaps 28 * SIZE(AA), %xmm1 1195 addps %xmm3, %xmm5 1196 movsldup 28 * SIZE(BB), %xmm3 1197 mulps %xmm1, %xmm3 1198 addps %xmm3, %xmm4 1199 movshdup 28 * SIZE(BB), %xmm3 1200 mulps %xmm1, %xmm3 1201 movaps 48 * SIZE(AA), %xmm1 1202 addps %xmm3, %xmm5 1203 movsldup 48 * SIZE(BB), %xmm3 1204 1205 addl $32 * SIZE, AA 1206 addl $32 * SIZE, BB 1207 decl %eax 1208 jne .L52 1209 ALIGN_4 1210 1211.L55: 1212#ifndef TRMMKERNEL 1213 movl K, %eax 1214#else 1215 movl KKK, %eax 1216#endif 1217 movaps ALPHA, %xmm3 1218 andl $7, %eax # if (k & 1) 1219 BRANCH 1220 je .L58 1221 ALIGN_4 1222 1223.L56: 1224 mulps %xmm0, %xmm2 1225 addps %xmm2, %xmm4 1226 movshdup 0 * SIZE(BB), %xmm2 1227 mulps %xmm0, 
%xmm2 1228 movaps 4 * SIZE(AA), %xmm0 1229 addps %xmm2, %xmm5 1230 movsldup 4 * SIZE(BB), %xmm2 1231 1232 addl $4 * SIZE, AA 1233 addl $4 * SIZE, BB 1234 decl %eax 1235 jg .L56 1236 ALIGN_4 1237 1238.L58: 1239 movsd 0 * SIZE(%esi), %xmm0 1240 movhps 2 * SIZE(%esi), %xmm0 1241 movsd 4 * SIZE(%esi), %xmm1 1242 movhps 6 * SIZE(%esi), %xmm1 1243 1244 pshufd $0x50, %xmm4, %xmm2 1245 pshufd $0xfa, %xmm4, %xmm4 1246 1247 mulps %xmm3, %xmm2 1248 mulps %xmm3, %xmm4 1249 1250 addps %xmm2, %xmm0 1251 addps %xmm4, %xmm1 1252 1253 movlps %xmm0, 0 * SIZE(%esi) 1254 movhps %xmm0, 2 * SIZE(%esi) 1255 movlps %xmm1, 4 * SIZE(%esi) 1256 movhps %xmm1, 6 * SIZE(%esi) 1257 1258 movsd 0 * SIZE(%esi, LDC), %xmm0 1259 movhps 2 * SIZE(%esi, LDC), %xmm0 1260 movsd 4 * SIZE(%esi, LDC), %xmm1 1261 movhps 6 * SIZE(%esi, LDC), %xmm1 1262 1263 pshufd $0x50, %xmm5, %xmm2 1264 pshufd $0xfa, %xmm5, %xmm5 1265 1266 mulps %xmm3, %xmm2 1267 mulps %xmm3, %xmm5 1268 1269 addps %xmm2, %xmm0 1270 addps %xmm5, %xmm1 1271 1272 movlps %xmm0, 0 * SIZE(%esi, LDC) 1273 movhps %xmm0, 2 * SIZE(%esi, LDC) 1274 movlps %xmm1, 4 * SIZE(%esi, LDC) 1275 movhps %xmm1, 6 * SIZE(%esi, LDC) 1276 1277 addl $8 * SIZE, %esi # coffset += 2 1278 decl %ebx # i -- 1279 jg .L51 1280 ALIGN_4 1281 1282.L60: 1283 testl $2, M 1284 je .L70 1285 1286#if !defined(TRMMKERNEL) || \ 1287 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1288 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1289 1290 leal BUFFER, BB # boffset1 = boffset 1291#else 1292 leal BUFFER, BB # boffset1 = boffset 1293 movl KK, %eax 1294 leal (, %eax, 8), %eax 1295 leal (AA, %eax, 1), AA 1296 leal (BB, %eax, 2), BB 1297#endif 1298 1299 movddup 0 * SIZE(AA), %xmm0 1300 pxor %xmm4, %xmm4 1301 movddup 8 * SIZE(AA), %xmm1 1302 pxor %xmm5, %xmm5 1303 movsd 0 * SIZE(BB), %xmm2 1304 movsd 16 * SIZE(BB), %xmm3 1305 1306 leal (LDC, LDC, 2), %eax 1307 1308#ifndef TRMMKERNEL 1309 movl K, %eax 1310#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) 
&& defined(TRANSA)) 1311 movl K, %eax 1312 subl KK, %eax 1313 movl %eax, KKK 1314#else 1315 movl KK, %eax 1316#ifdef LEFT 1317 addl $2, %eax 1318#else 1319 addl $2, %eax 1320#endif 1321 movl %eax, KKK 1322#endif 1323 sarl $3, %eax 1324 je .L65 1325 ALIGN_4 1326 1327.L62: 1328 shufps $0x50, %xmm2, %xmm2 1329 mulps %xmm0, %xmm2 1330 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1331 movddup 2 * SIZE(AA), %xmm0 1332 addps %xmm2, %xmm4 1333 movsd 4 * SIZE(BB), %xmm2 1334 shufps $0x50, %xmm2, %xmm2 1335 mulps %xmm0, %xmm2 1336 movddup 4 * SIZE(AA), %xmm0 1337 addps %xmm2, %xmm5 1338 movsd 8 * SIZE(BB), %xmm2 1339 shufps $0x50, %xmm2, %xmm2 1340 mulps %xmm0, %xmm2 1341 movddup 6 * SIZE(AA), %xmm0 1342 addps %xmm2, %xmm4 1343 movsd 12 * SIZE(BB), %xmm2 1344 shufps $0x50, %xmm2, %xmm2 1345 mulps %xmm0, %xmm2 1346 movddup 16 * SIZE(AA), %xmm0 1347 addps %xmm2, %xmm5 1348 movsd 32 * SIZE(BB), %xmm2 1349 shufps $0x50, %xmm3, %xmm3 1350 mulps %xmm1, %xmm3 1351 movddup 10 * SIZE(AA), %xmm1 1352 addps %xmm3, %xmm4 1353 movsd 20 * SIZE(BB), %xmm3 1354 shufps $0x50, %xmm3, %xmm3 1355 mulps %xmm1, %xmm3 1356 movddup 12 * SIZE(AA), %xmm1 1357 addps %xmm3, %xmm5 1358 movsd 24 * SIZE(BB), %xmm3 1359 shufps $0x50, %xmm3, %xmm3 1360 mulps %xmm1, %xmm3 1361 movddup 14 * SIZE(AA), %xmm1 1362 addps %xmm3, %xmm4 1363 movsd 28 * SIZE(BB), %xmm3 1364 shufps $0x50, %xmm3, %xmm3 1365 mulps %xmm1, %xmm3 1366 movddup 24 * SIZE(AA), %xmm1 1367 addps %xmm3, %xmm5 1368 movsd 48 * SIZE(BB), %xmm3 1369 1370 addl $16 * SIZE, AA 1371 addl $32 * SIZE, BB 1372 decl %eax 1373 jne .L62 1374 ALIGN_4 1375 1376.L65: 1377#ifndef TRMMKERNEL 1378 movl K, %eax 1379#else 1380 movl KKK, %eax 1381#endif 1382 movaps ALPHA, %xmm3 1383 andl $7, %eax # if (k & 1) 1384 BRANCH 1385 je .L68 1386 ALIGN_4 1387 1388.L66: 1389 shufps $0x50, %xmm2, %xmm2 1390 mulps %xmm0, %xmm2 1391 movddup 2 * SIZE(AA), %xmm0 1392 addps %xmm2, %xmm4 1393 movsd 4 * SIZE(BB), %xmm2 1394 1395 addl $2 * SIZE, AA 1396 addl $4 * SIZE, BB 1397 decl %eax 1398 
jg .L66 1399 ALIGN_4 1400 1401.L68: 1402 addps %xmm5, %xmm4 1403 1404 movsd 0 * SIZE(%esi), %xmm0 1405 movhps 2 * SIZE(%esi), %xmm0 1406 movsd 0 * SIZE(%esi, LDC), %xmm1 1407 movhps 2 * SIZE(%esi, LDC), %xmm1 1408 1409 pshufd $0x50, %xmm4, %xmm2 1410 pshufd $0xfa, %xmm4, %xmm4 1411 1412 mulps %xmm3, %xmm2 1413 mulps %xmm3, %xmm4 1414 1415 addps %xmm2, %xmm0 1416 addps %xmm4, %xmm1 1417 1418 movlps %xmm0, 0 * SIZE(%esi) 1419 movhps %xmm0, 2 * SIZE(%esi) 1420 movlps %xmm1, 0 * SIZE(%esi, LDC) 1421 movhps %xmm1, 2 * SIZE(%esi, LDC) 1422 1423 addl $4 * SIZE, %esi 1424 ALIGN_4 1425 1426.L70: 1427 testl $1, M 1428 je .L79 1429 1430#if !defined(TRMMKERNEL) || \ 1431 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1432 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1433 1434 leal BUFFER, BB # boffset1 = boffset 1435#else 1436 leal BUFFER, BB # boffset1 = boffset 1437 movl KK, %eax 1438 leal (, %eax, 4), %eax 1439 leal (AA, %eax, 1), AA 1440 leal (BB, %eax, 4), BB 1441#endif 1442 1443 movss 0 * SIZE(AA), %xmm0 1444 pxor %xmm4, %xmm4 1445 movss 4 * SIZE(AA), %xmm1 1446 pxor %xmm5, %xmm5 1447 movsd 0 * SIZE(BB), %xmm2 1448 movsd 16 * SIZE(BB), %xmm3 1449 1450 leal (LDC, LDC, 2), %eax 1451 1452#ifndef TRMMKERNEL 1453 movl K, %eax 1454#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1455 movl K, %eax 1456 subl KK, %eax 1457 movl %eax, KKK 1458#else 1459 movl KK, %eax 1460#ifdef LEFT 1461 addl $1, %eax 1462#else 1463 addl $2, %eax 1464#endif 1465 movl %eax, KKK 1466#endif 1467 sarl $3, %eax 1468 je .L75 1469 ALIGN_4 1470 1471.L72: 1472 shufps $0, %xmm0, %xmm0 1473 mulps %xmm0, %xmm2 1474 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1475 movss 1 * SIZE(AA), %xmm0 1476 addps %xmm2, %xmm4 1477 shufps $0, %xmm0, %xmm0 1478 movsd 4 * SIZE(BB), %xmm2 1479 mulps %xmm0, %xmm2 1480 movss 2 * SIZE(AA), %xmm0 1481 addps %xmm2, %xmm5 1482 shufps $0, %xmm0, %xmm0 1483 movsd 8 * SIZE(BB), %xmm2 1484 mulps %xmm0, %xmm2 1485 movss 3 * 
SIZE(AA), %xmm0 1486 addps %xmm2, %xmm4 1487 shufps $0, %xmm0, %xmm0 1488 movsd 12 * SIZE(BB), %xmm2 1489 mulps %xmm0, %xmm2 1490 movss 8 * SIZE(AA), %xmm0 1491 addps %xmm2, %xmm5 1492 movsd 32 * SIZE(BB), %xmm2 1493 shufps $0, %xmm1, %xmm1 1494 mulps %xmm1, %xmm3 1495 movss 5 * SIZE(AA), %xmm1 1496 addps %xmm3, %xmm4 1497 shufps $0, %xmm1, %xmm1 1498 movsd 20 * SIZE(BB), %xmm3 1499 mulps %xmm1, %xmm3 1500 movss 6 * SIZE(AA), %xmm1 1501 addps %xmm3, %xmm5 1502 shufps $0, %xmm1, %xmm1 1503 movsd 24 * SIZE(BB), %xmm3 1504 mulps %xmm1, %xmm3 1505 movss 7 * SIZE(AA), %xmm1 1506 addps %xmm3, %xmm4 1507 shufps $0, %xmm1, %xmm1 1508 movsd 28 * SIZE(BB), %xmm3 1509 mulps %xmm1, %xmm3 1510 movss 12 * SIZE(AA), %xmm1 1511 addps %xmm3, %xmm5 1512 movsd 48 * SIZE(BB), %xmm3 1513 1514 addl $ 8 * SIZE, AA 1515 addl $32 * SIZE, BB 1516 decl %eax 1517 jne .L72 1518 ALIGN_4 1519 1520.L75: 1521#ifndef TRMMKERNEL 1522 movl K, %eax 1523#else 1524 movl KKK, %eax 1525#endif 1526 movaps ALPHA, %xmm3 1527 andl $7, %eax # if (k & 1) 1528 BRANCH 1529 je .L78 1530 ALIGN_4 1531 1532.L76: 1533 shufps $0, %xmm0, %xmm0 1534 mulps %xmm0, %xmm2 1535 movss 1 * SIZE(AA), %xmm0 1536 addps %xmm2, %xmm4 1537 movsd 4 * SIZE(BB), %xmm2 1538 1539 addl $ 1 * SIZE, AA 1540 addl $ 4 * SIZE, BB 1541 decl %eax 1542 jg .L76 1543 ALIGN_4 1544 1545.L78: 1546 addps %xmm5, %xmm4 1547 1548 movsd (%esi), %xmm0 1549 movhps (%esi, LDC), %xmm0 1550 1551 pshufd $0x50, %xmm4, %xmm2 1552 mulps %xmm3, %xmm2 1553 addps %xmm2, %xmm0 1554 1555 movlps %xmm0, (%esi) 1556 movhps %xmm0, (%esi, LDC) 1557 ALIGN_4 1558 1559.L79: 1560#if defined(TRMMKERNEL) && !defined(LEFT) 1561 addl $2, KK 1562#endif 1563 leal (, LDC, 2), %eax 1564 addl %eax, C 1565 ALIGN_4 1566 1567.L80: 1568 testl $1, N 1569 je .L999 1570 1571#if defined(TRMMKERNEL) && defined(LEFT) 1572 movl OFFSET, %eax 1573 movl %eax, KK 1574#endif 1575 1576 movl K, %eax 1577 leal BUFFER, %ecx 1578 sarl $3, %eax 1579 jle .L85 1580 ALIGN_4 1581 1582.L82: 1583 movss 0 * 
SIZE(%edi), %xmm0 1584 movss 1 * SIZE(%edi), %xmm1 1585 movss 2 * SIZE(%edi), %xmm2 1586 movss 3 * SIZE(%edi), %xmm3 1587 movss 4 * SIZE(%edi), %xmm4 1588 movss 5 * SIZE(%edi), %xmm5 1589 movss 6 * SIZE(%edi), %xmm6 1590 movss 7 * SIZE(%edi), %xmm7 1591 1592 movss %xmm0, 0 * SIZE(%ecx) 1593 movss %xmm0, 1 * SIZE(%ecx) 1594 movss %xmm1, 2 * SIZE(%ecx) 1595 movss %xmm1, 3 * SIZE(%ecx) 1596 movss %xmm2, 4 * SIZE(%ecx) 1597 movss %xmm2, 5 * SIZE(%ecx) 1598 movss %xmm3, 6 * SIZE(%ecx) 1599 movss %xmm3, 7 * SIZE(%ecx) 1600 movss %xmm4, 8 * SIZE(%ecx) 1601 movss %xmm4, 9 * SIZE(%ecx) 1602 movss %xmm5, 10 * SIZE(%ecx) 1603 movss %xmm5, 11 * SIZE(%ecx) 1604 movss %xmm6, 12 * SIZE(%ecx) 1605 movss %xmm6, 13 * SIZE(%ecx) 1606 movss %xmm7, 14 * SIZE(%ecx) 1607 movss %xmm7, 15 * SIZE(%ecx) 1608 1609# prefetcht1 128 * SIZE(%ecx) 1610 prefetcht0 112 * SIZE(%edi) 1611 1612 addl $ 8 * SIZE, %edi 1613 addl $16 * SIZE, %ecx 1614 decl %eax 1615 jne .L82 1616 ALIGN_4 1617 1618.L85: 1619 movl K, %eax 1620 andl $7, %eax 1621 BRANCH 1622 jle .L90 1623 ALIGN_4 1624 1625.L86: 1626 movss 0 * SIZE(%edi), %xmm0 1627 movss %xmm0, 0 * SIZE(%ecx) 1628 movss %xmm0, 1 * SIZE(%ecx) 1629 1630 addl $1 * SIZE, %edi 1631 addl $2 * SIZE, %ecx 1632 decl %eax 1633 jne .L86 1634 ALIGN_4 1635 1636.L90: 1637 movl C, %esi # coffset = c 1638 movl A, %edx # aoffset = a 1639 movl M, %ebx 1640 sarl $2, %ebx # i = (m >> 2) 1641 jle .L100 1642 ALIGN_4 1643 1644.L91: 1645#if !defined(TRMMKERNEL) || \ 1646 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1647 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1648 1649 leal BUFFER, BB # boffset1 = boffset 1650#else 1651 leal BUFFER, BB # boffset1 = boffset 1652 movl KK, %eax 1653 leal (, %eax, 8), %eax 1654 leal (AA, %eax, 2), AA 1655 leal (BB, %eax, 1), BB 1656#endif 1657 1658 movaps 0 * SIZE(AA), %xmm0 1659 pxor %xmm4, %xmm4 1660 movddup 0 * SIZE(BB), %xmm2 1661 pxor %xmm5, %xmm5 1662 movaps 16 * SIZE(AA), %xmm1 1663 movddup 8 * SIZE(BB), 
%xmm3 1664 1665#ifdef HAVE_3DNOW 1666 prefetchw 4 * SIZE(%esi) 1667#elif defined(HAVE_SSE) || defined(HAVE_SSE2) 1668 prefetcht2 4 * SIZE(%esi) 1669#endif 1670 1671#ifndef TRMMKERNEL 1672 movl K, %eax 1673#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1674 movl K, %eax 1675 subl KK, %eax 1676 movl %eax, KKK 1677#else 1678 movl KK, %eax 1679#ifdef LEFT 1680 addl $4, %eax 1681#else 1682 addl $1, %eax 1683#endif 1684 movl %eax, KKK 1685#endif 1686 sarl $3, %eax 1687 je .L95 1688 ALIGN_4 1689 1690.L92: 1691 mulps %xmm0, %xmm2 1692 movaps 4 * SIZE(AA), %xmm0 1693 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1694 addps %xmm2, %xmm4 1695 movddup 2 * SIZE(BB), %xmm2 1696 mulps %xmm0, %xmm2 1697 movaps 8 * SIZE(AA), %xmm0 1698 addps %xmm2, %xmm5 1699 movddup 4 * SIZE(BB), %xmm2 1700 mulps %xmm0, %xmm2 1701 movaps 12 * SIZE(AA), %xmm0 1702 addps %xmm2, %xmm4 1703 movddup 6 * SIZE(BB), %xmm2 1704 mulps %xmm0, %xmm2 1705 movaps 32 * SIZE(AA), %xmm0 1706 addps %xmm2, %xmm5 1707 movddup 16 * SIZE(BB), %xmm2 1708 mulps %xmm1, %xmm3 1709 movaps 20 * SIZE(AA), %xmm1 1710 addps %xmm3, %xmm4 1711 movddup 10 * SIZE(BB), %xmm3 1712 mulps %xmm1, %xmm3 1713 movaps 24 * SIZE(AA), %xmm1 1714 addps %xmm3, %xmm5 1715 movddup 12 * SIZE(BB), %xmm3 1716 mulps %xmm1, %xmm3 1717 movaps 28 * SIZE(AA), %xmm1 1718 addps %xmm3, %xmm4 1719 movddup 14 * SIZE(BB), %xmm3 1720 mulps %xmm1, %xmm3 1721 movaps 48 * SIZE(AA), %xmm1 1722 addps %xmm3, %xmm5 1723 movddup 24 * SIZE(BB), %xmm3 1724 1725 addl $32 * SIZE, AA 1726 addl $16 * SIZE, BB 1727 decl %eax 1728 jne .L92 1729 ALIGN_4 1730 1731.L95: 1732#ifndef TRMMKERNEL 1733 movl K, %eax 1734#else 1735 movl KKK, %eax 1736#endif 1737 movaps ALPHA, %xmm3 1738 andl $7, %eax # if (k & 1) 1739 BRANCH 1740 je .L98 1741 ALIGN_4 1742 1743.L96: 1744 mulps %xmm0, %xmm2 1745 movaps 4 * SIZE(AA), %xmm0 1746 addps %xmm2, %xmm4 1747 movddup 2 * SIZE(BB), %xmm2 1748 1749 addl $4 * SIZE, AA 1750 addl $2 * SIZE, BB 1751 decl %eax 1752 jg .L96 1753 
ALIGN_4 1754 1755.L98: 1756 addps %xmm5, %xmm4 1757 1758 movsd 0 * SIZE(%esi), %xmm0 1759 movhps 2 * SIZE(%esi), %xmm0 1760 movsd 4 * SIZE(%esi), %xmm1 1761 movhps 6 * SIZE(%esi), %xmm1 1762 1763 pshufd $0x50, %xmm4, %xmm2 1764 pshufd $0xfa, %xmm4, %xmm4 1765 1766 mulps %xmm3, %xmm2 1767 mulps %xmm3, %xmm4 1768 1769 addps %xmm2, %xmm0 1770 addps %xmm4, %xmm1 1771 1772 movlps %xmm0, 0 * SIZE(%esi) 1773 movhps %xmm0, 2 * SIZE(%esi) 1774 movlps %xmm1, 4 * SIZE(%esi) 1775 movhps %xmm1, 6 * SIZE(%esi) 1776 1777 addl $8 * SIZE, %esi 1778 decl %ebx # i -- 1779 jg .L91 1780 ALIGN_4 1781 1782.L100: 1783 testl $2, M 1784 je .L110 1785 1786#if !defined(TRMMKERNEL) || \ 1787 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1788 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1789 1790 leal BUFFER, BB # boffset1 = boffset 1791#else 1792 leal BUFFER, BB # boffset1 = boffset 1793 movl KK, %eax 1794 leal (, %eax, 8), %eax 1795 leal (AA, %eax, 1), AA 1796 leal (BB, %eax, 1), BB 1797#endif 1798 1799 pxor %xmm4, %xmm4 1800 pxor %xmm5, %xmm5 1801 pxor %xmm6, %xmm6 1802 pxor %xmm7, %xmm7 1803 1804 movsd 0 * SIZE(AA), %xmm0 1805 movsd 0 * SIZE(BB), %xmm2 1806 movsd 8 * SIZE(AA), %xmm1 1807 movsd 8 * SIZE(BB), %xmm3 1808 1809 leal (LDC, LDC, 2), %eax 1810 1811#ifndef TRMMKERNEL 1812 movl K, %eax 1813#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1814 movl K, %eax 1815 subl KK, %eax 1816 movl %eax, KKK 1817#else 1818 movl KK, %eax 1819#ifdef LEFT 1820 addl $2, %eax 1821#else 1822 addl $1, %eax 1823#endif 1824 movl %eax, KKK 1825#endif 1826 sarl $3, %eax 1827 je .L105 1828 ALIGN_4 1829 1830.L102: 1831 mulps %xmm0, %xmm2 1832 movsd 2 * SIZE(AA), %xmm0 1833 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1834 addps %xmm2, %xmm4 1835 movsd 2 * SIZE(BB), %xmm2 1836 mulps %xmm0, %xmm2 1837 movsd 4 * SIZE(AA), %xmm0 1838 addps %xmm2, %xmm5 1839 movsd 4 * SIZE(BB), %xmm2 1840 mulps %xmm0, %xmm2 1841 movsd 6 * SIZE(AA), %xmm0 1842 addps %xmm2, 
%xmm4 1843 movsd 6 * SIZE(BB), %xmm2 1844 mulps %xmm0, %xmm2 1845 movsd 16 * SIZE(AA), %xmm0 1846 addps %xmm2, %xmm5 1847 movsd 16 * SIZE(BB), %xmm2 1848 mulps %xmm1, %xmm3 1849 movsd 10 * SIZE(AA), %xmm1 1850 addps %xmm3, %xmm4 1851 movsd 10 * SIZE(BB), %xmm3 1852 mulps %xmm1, %xmm3 1853 movsd 12 * SIZE(AA), %xmm1 1854 addps %xmm3, %xmm5 1855 movsd 12 * SIZE(BB), %xmm3 1856 mulps %xmm1, %xmm3 1857 movsd 14 * SIZE(AA), %xmm1 1858 addps %xmm3, %xmm4 1859 movsd 14 * SIZE(BB), %xmm3 1860 mulps %xmm1, %xmm3 1861 movsd 24 * SIZE(AA), %xmm1 1862 addps %xmm3, %xmm5 1863 movsd 24 * SIZE(BB), %xmm3 1864 1865 addl $16 * SIZE, AA 1866 addl $16 * SIZE, BB 1867 decl %eax 1868 jne .L102 1869 ALIGN_4 1870 1871.L105: 1872#ifndef TRMMKERNEL 1873 movl K, %eax 1874#else 1875 movl KKK, %eax 1876#endif 1877 movaps ALPHA, %xmm3 1878 andl $7, %eax # if (k & 1) 1879 BRANCH 1880 je .L108 1881 ALIGN_4 1882 1883.L106: 1884 mulps %xmm0, %xmm2 1885 movsd 2 * SIZE(AA), %xmm0 1886 addps %xmm2, %xmm4 1887 movsd 2 * SIZE(BB), %xmm2 1888 1889 addl $2 * SIZE, AA 1890 addl $2 * SIZE, BB 1891 decl %eax 1892 jg .L106 1893 ALIGN_4 1894 1895.L108: 1896 addps %xmm5, %xmm4 1897 movhlps %xmm4, %xmm5 1898 addps %xmm5, %xmm4 1899 1900 movsd 0 * SIZE(%esi), %xmm0 1901 movhps 2 * SIZE(%esi), %xmm0 1902 1903 pshufd $0x50, %xmm4, %xmm2 1904 mulps %xmm3, %xmm2 1905 addps %xmm2, %xmm0 1906 1907 movlps %xmm0, 0 * SIZE(%esi) 1908 movhps %xmm0, 2 * SIZE(%esi) 1909 1910 addl $4 * SIZE, %esi # coffset += 2 1911 ALIGN_4 1912 1913.L110: 1914 testl $1, M 1915 je .L999 1916 1917#if !defined(TRMMKERNEL) || \ 1918 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1919 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1920 1921 leal BUFFER, BB # boffset1 = boffset 1922#else 1923 leal BUFFER, BB # boffset1 = boffset 1924 movl KK, %eax 1925 leal (, %eax, 4), %eax 1926 leal (AA, %eax, 1), AA 1927 leal (BB, %eax, 2), BB 1928#endif 1929 1930 movss 0 * SIZE(AA), %xmm0 1931 pxor %xmm4, %xmm4 1932 movss 0 * 
SIZE(BB), %xmm2 1933 pxor %xmm5, %xmm5 1934 movss 4 * SIZE(AA), %xmm1 1935 movss 8 * SIZE(BB), %xmm3 1936 1937 leal (LDC, LDC, 2), %eax 1938 1939#ifndef TRMMKERNEL 1940 movl K, %eax 1941#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1942 movl K, %eax 1943 subl KK, %eax 1944 movl %eax, KKK 1945#else 1946 movl KK, %eax 1947#ifdef LEFT 1948 addl $1, %eax 1949#else 1950 addl $1, %eax 1951#endif 1952 movl %eax, KKK 1953#endif 1954 sarl $3, %eax 1955 je .L115 1956 ALIGN_4 1957 1958.L112: 1959 mulss %xmm0, %xmm2 1960 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1961 movss 1 * SIZE(AA), %xmm0 1962 addss %xmm2, %xmm4 1963 movss 2 * SIZE(BB), %xmm2 1964 mulss %xmm0, %xmm2 1965 movss 2 * SIZE(AA), %xmm0 1966 addss %xmm2, %xmm5 1967 movss 4 * SIZE(BB), %xmm2 1968 mulss %xmm0, %xmm2 1969 movss 3 * SIZE(AA), %xmm0 1970 addss %xmm2, %xmm4 1971 movss 6 * SIZE(BB), %xmm2 1972 mulss %xmm0, %xmm2 1973 movss 8 * SIZE(AA), %xmm0 1974 addss %xmm2, %xmm5 1975 movss 16 * SIZE(BB), %xmm2 1976 mulss %xmm1, %xmm3 1977 movss 5 * SIZE(AA), %xmm1 1978 addss %xmm3, %xmm4 1979 movss 10 * SIZE(BB), %xmm3 1980 mulss %xmm1, %xmm3 1981 movss 6 * SIZE(AA), %xmm1 1982 addss %xmm3, %xmm5 1983 movss 12 * SIZE(BB), %xmm3 1984 mulss %xmm1, %xmm3 1985 movss 7 * SIZE(AA), %xmm1 1986 addss %xmm3, %xmm4 1987 movss 14 * SIZE(BB), %xmm3 1988 mulss %xmm1, %xmm3 1989 movss 12 * SIZE(AA), %xmm1 1990 addss %xmm3, %xmm5 1991 movss 24 * SIZE(BB), %xmm3 1992 1993 addl $ 8 * SIZE, AA 1994 addl $16 * SIZE, BB 1995 decl %eax 1996 jne .L112 1997 ALIGN_4 1998 1999.L115: 2000#ifndef TRMMKERNEL 2001 movl K, %eax 2002#else 2003 movl KKK, %eax 2004#endif 2005 movaps ALPHA, %xmm3 2006 andl $7, %eax # if (k & 1) 2007 BRANCH 2008 je .L118 2009 ALIGN_4 2010 2011.L116: 2012 mulss %xmm0, %xmm2 2013 movss 1 * SIZE(AA), %xmm0 2014 addss %xmm2, %xmm4 2015 movss 2 * SIZE(BB), %xmm2 2016 2017 addl $1 * SIZE, AA 2018 addl $2 * SIZE, BB 2019 decl %eax 2020 jg .L116 2021 ALIGN_4 2022 2023.L118: 2024 addss %xmm5, 
%xmm4 2025 2026 movsd (%esi), %xmm0 2027 2028 pshufd $0x50, %xmm4, %xmm2 2029 mulps %xmm3, %xmm2 2030 addps %xmm2, %xmm0 2031 2032 movlps %xmm0, (%esi) 2033 ALIGN_4 2034 2035.L999: 2036 movl OLD_STACK, %esp 2037 popl %ebx 2038 popl %esi 2039 popl %edi 2040 popl %ebp 2041 ret 2042 2043 EPILOGUE 2044