/*********************************************************************/
/*                                                                   */
/*               Optimized BLAS libraries                            */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#define STACK 16 26#define ARGS 16 27 28#define M 4 + STACK + ARGS(%esp) 29#define N 8 + STACK + ARGS(%esp) 30#define K 12 + STACK + ARGS(%esp) 31#define ALPHA_R 16 + STACK + ARGS(%esp) 32#define ALPHA_I 24 + STACK + ARGS(%esp) 33#define A 32 + STACK + ARGS(%esp) 34#define OLD_B 36 + STACK + ARGS(%esp) 35#define C 40 + STACK + ARGS(%esp) 36#define OLD_LDC 44 + STACK + ARGS(%esp) 37#define OFFSET 48 + STACK + ARGS(%esp) 38 39#define J 0 + STACK(%esp) 40#define BX 4 + STACK(%esp) 41#define KK 8 + STACK(%esp) 42#define KKK 12 + STACK(%esp) 43 44#define B %edi 45#define LDC %ebp 46#define AO %edx 47#define BO %ecx 48#define CO %esi 49#define I %ebx 50 51#define movsd movlps 52#define movapd movups 53#define movlpd movlps 54#define movhpd movhps 55 56#define PREFETCH prefetch 57#define PREFETCHSIZE (8 * 7 + 0) 58 59#if defined(NN) || defined(NT) || defined(TN) || defined(TT) 60#define ADD1 addpd 61#define ADD2 addpd 62#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) 63#define ADD1 addpd 64#define ADD2 subpd 65#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) 66#define ADD1 subpd 67#define ADD2 addpd 68#else 69#define ADD1 subpd 70#define ADD2 subpd 71#endif 72 73#define KERNEL1(address) \ 74 mulpd %xmm0, %xmm1; \ 75 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %eax, 2); \ 76 mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ 77 ADD1 %xmm1, %xmm4; \ 78 movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ 79 ADD1 %xmm0, %xmm6; \ 80 movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ 81 mulpd %xmm0, %xmm2; \ 82 mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ 83 ADD2 %xmm0, %xmm7; \ 84 movddup -14 * SIZE(AO, %eax, 2), %xmm0 85 86#define KERNEL2(address) \ 87 ADD2 %xmm2, %xmm5; \ 88 movapd %xmm1, %xmm2; \ 89 mulpd %xmm0, %xmm1; \ 90 mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ 91 ADD1 %xmm1, %xmm4; \ 92 movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ 93 ADD1 %xmm0, %xmm6; \ 94 
movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ 95 mulpd %xmm0, %xmm2; \ 96 mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ 97 ADD2 %xmm0, %xmm7; \ 98 movddup -12 * SIZE(AO, %eax, 2), %xmm0 99 100#define KERNEL3(address) \ 101 ADD2 %xmm2, %xmm5; \ 102 movapd %xmm1, %xmm2; \ 103 mulpd %xmm0, %xmm1; \ 104 mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ 105 ADD1 %xmm1, %xmm4; \ 106 movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ 107 ADD1 %xmm0, %xmm6; \ 108 movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ 109 mulpd %xmm0, %xmm2; \ 110 mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ 111 ADD2 %xmm0, %xmm7; \ 112 movddup -10 * SIZE(AO, %eax, 2), %xmm0 113 114#define KERNEL4(address) \ 115 ADD2 %xmm2, %xmm5; \ 116 movapd %xmm1, %xmm2; \ 117 mulpd %xmm0, %xmm1; \ 118 mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ 119 ADD1 %xmm1, %xmm4; \ 120 movapd (BO, %eax, 4), %xmm1; \ 121 ADD1 %xmm0, %xmm6; \ 122 movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ 123 mulpd %xmm0, %xmm2; \ 124 mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ 125 ADD2 %xmm0, %xmm7; \ 126 movddup (AO, %eax, 2), %xmm0 127 128#define KERNEL5(address) \ 129 ADD2 %xmm2, %xmm5; \ 130 movapd %xmm1, %xmm2; \ 131 mulpd %xmm3, %xmm1; \ 132 mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ 133 ADD1 %xmm1, %xmm4; \ 134 movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ 135 ADD1 %xmm3, %xmm6; \ 136 movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ 137 mulpd %xmm3, %xmm2; \ 138 mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ 139 ADD2 %xmm3, %xmm7; \ 140 movddup -6 * SIZE(AO, %eax, 2), %xmm3 141 142#define KERNEL6(address) \ 143 ADD2 %xmm2, %xmm5; \ 144 movapd %xmm1, %xmm2; \ 145 mulpd %xmm3, %xmm1; \ 146 mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ 147 ADD1 %xmm1, %xmm4; \ 148 movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ 149 ADD1 %xmm3, %xmm6; \ 150 movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ 151 mulpd %xmm3, %xmm2; \ 152 mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ 153 ADD2 %xmm3, %xmm7; \ 154 movddup -4 * SIZE(AO, %eax, 2), %xmm3 155 156#define KERNEL7(address) \ 157 ADD2 %xmm2, %xmm5; \ 158 movapd %xmm1, %xmm2; \ 159 mulpd %xmm3, %xmm1; \ 160 mulpd 10 * 
SIZE(BO, %eax, 4), %xmm3; \ 161 ADD1 %xmm1, %xmm4; \ 162 movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ 163 ADD1 %xmm3, %xmm6; \ 164 movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ 165 mulpd %xmm3, %xmm2; \ 166 mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ 167 ADD2 %xmm3, %xmm7; \ 168 movddup -2 * SIZE(AO, %eax, 2), %xmm3 169 170#define KERNEL8(address) \ 171 ADD2 %xmm2, %xmm5; \ 172 movapd %xmm1, %xmm2; \ 173 mulpd %xmm3, %xmm1; \ 174 mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ 175 ADD1 %xmm1, %xmm4; \ 176 movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ 177 ADD1 %xmm3, %xmm6; \ 178 movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ 179 mulpd %xmm3, %xmm2; \ 180 mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ 181 ADD2 %xmm3, %xmm7; \ 182 movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ 183 ADD2 %xmm2, %xmm5; \ 184 movapd %xmm1, %xmm2 185 186 PROLOGUE 187 188 subl $ARGS, %esp 189 190 pushl %ebp 191 pushl %edi 192 pushl %esi 193 pushl %ebx 194 195 PROFCODE 196 197 movl OLD_B, B 198 movl OLD_LDC, LDC 199 200#ifdef TRMMKERNEL 201 movl OFFSET, %eax 202 203#ifndef LEFT 204 negl %eax 205#endif 206 207 movl %eax, KK 208#endif 209 210 subl $-16 * SIZE, A 211 subl $-16 * SIZE, B 212 213 sall $ZBASE_SHIFT, LDC 214 215 movl N, %eax 216 sarl $1, %eax 217 movl %eax, J # j = n 218 jle .L100 219 ALIGN_4 220 221.L01: 222#if defined(TRMMKERNEL) && defined(LEFT) 223 movl OFFSET, %eax 224 movl %eax, KK 225#endif 226 227 leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax 228 movl %eax, BX 229 230 movl C, CO 231 movl A, AO 232 movl M, I 233 testl I, I 234 jle .L100 235 ALIGN_4 236 237.L10: 238#if !defined(TRMMKERNEL) || \ 239 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 240 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 241 242 movl B, BO 243#else 244 movl KK, %eax 245 leal (, %eax, SIZE), %eax 246 leal (AO, %eax, 2), AO 247 leal (B, %eax, 4), BO 248#endif 249 250 movl BX, %eax 251 252 prefetcht2 0 * SIZE(%eax) 253 254 subl $-8 * SIZE, BX 255 256 movddup -16 * SIZE(AO), %xmm0 257 movapd -16 * SIZE(BO), %xmm1 
258 pxor %xmm4, %xmm4 259 movddup -8 * SIZE(AO), %xmm3 260 pxor %xmm5, %xmm5 261 262 prefetchw 1 * SIZE(CO) 263 pxor %xmm6, %xmm6 264 prefetchw 1 * SIZE(CO, LDC) 265 pxor %xmm7, %xmm7 266 movapd %xmm1, %xmm2 267 268#ifndef TRMMKERNEL 269 movl K, %eax 270#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 271 movl K, %eax 272 subl KK, %eax 273 movl %eax, KKK 274#else 275 movl KK, %eax 276#ifdef LEFT 277 addl $1, %eax 278#else 279 addl $2, %eax 280#endif 281 movl %eax, KKK 282#endif 283 284 andl $-8, %eax 285 286 leal (, %eax, SIZE), %eax 287 leal (AO, %eax, 2), AO 288 leal (BO, %eax, 4), BO 289 negl %eax 290 NOBRANCH 291 je .L15 292 ALIGN_3 293 294.L12: 295 KERNEL1(16 * 0) 296 KERNEL2(16 * 0) 297 KERNEL3(16 * 0) 298 KERNEL4(16 * 0) 299 KERNEL5(16 * 0) 300 KERNEL6(16 * 0) 301 KERNEL7(16 * 0) 302 KERNEL8(16 * 0) 303 addl $8 * SIZE, %eax 304 NOBRANCH 305 je .L15 306 KERNEL1(16 * 0) 307 KERNEL2(16 * 0) 308 KERNEL3(16 * 0) 309 KERNEL4(16 * 0) 310 KERNEL5(16 * 0) 311 KERNEL6(16 * 0) 312 KERNEL7(16 * 0) 313 KERNEL8(16 * 0) 314 addl $8 * SIZE, %eax 315 NOBRANCH 316 je .L15 317 KERNEL1(16 * 0) 318 KERNEL2(16 * 0) 319 KERNEL3(16 * 0) 320 KERNEL4(16 * 0) 321 KERNEL5(16 * 0) 322 KERNEL6(16 * 0) 323 KERNEL7(16 * 0) 324 KERNEL8(16 * 0) 325 addl $8 * SIZE, %eax 326 NOBRANCH 327 je .L15 328 KERNEL1(16 * 0) 329 KERNEL2(16 * 0) 330 KERNEL3(16 * 0) 331 KERNEL4(16 * 0) 332 KERNEL5(16 * 0) 333 KERNEL6(16 * 0) 334 KERNEL7(16 * 0) 335 KERNEL8(16 * 0) 336 addl $8 * SIZE, %eax 337 NOBRANCH 338 je .L15 339 KERNEL1(16 * 0) 340 KERNEL2(16 * 0) 341 KERNEL3(16 * 0) 342 KERNEL4(16 * 0) 343 KERNEL5(16 * 0) 344 KERNEL6(16 * 0) 345 KERNEL7(16 * 0) 346 KERNEL8(16 * 0) 347 addl $8 * SIZE, %eax 348 NOBRANCH 349 je .L15 350 KERNEL1(16 * 0) 351 KERNEL2(16 * 0) 352 KERNEL3(16 * 0) 353 KERNEL4(16 * 0) 354 KERNEL5(16 * 0) 355 KERNEL6(16 * 0) 356 KERNEL7(16 * 0) 357 KERNEL8(16 * 0) 358 addl $8 * SIZE, %eax 359 NOBRANCH 360 je .L15 361 KERNEL1(16 * 0) 362 KERNEL2(16 * 0) 363 
KERNEL3(16 * 0) 364 KERNEL4(16 * 0) 365 KERNEL5(16 * 0) 366 KERNEL6(16 * 0) 367 KERNEL7(16 * 0) 368 KERNEL8(16 * 0) 369 addl $8 * SIZE, %eax 370 NOBRANCH 371 je .L15 372 KERNEL1(16 * 0) 373 KERNEL2(16 * 0) 374 KERNEL3(16 * 0) 375 KERNEL4(16 * 0) 376 KERNEL5(16 * 0) 377 KERNEL6(16 * 0) 378 KERNEL7(16 * 0) 379 KERNEL8(16 * 0) 380 addl $8 * SIZE, %eax 381 BRANCH 382 jl .L12 383 ALIGN_3 384 385.L15: 386#ifndef TRMMKERNEL 387 movl K, %eax 388#else 389 movl KKK, %eax 390#endif 391 andl $7, %eax # if (k & 1) 392 BRANCH 393 je .L14 394 395 leal (, %eax, SIZE), %eax 396 leal (AO, %eax, 2), AO 397 leal (BO, %eax, 4), BO 398 negl %eax 399 ALIGN_4 400 401.L16: 402 mulpd %xmm0, %xmm1 403 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 404 ADD1 %xmm1, %xmm4 405 movapd -12 * SIZE(BO, %eax, 4), %xmm1 406 ADD1 %xmm0, %xmm6 407 movddup -15 * SIZE(AO, %eax, 2), %xmm0 408 mulpd %xmm0, %xmm2 409 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 410 ADD2 %xmm0, %xmm7 411 movddup -14 * SIZE(AO, %eax, 2), %xmm0 412 ADD2 %xmm2, %xmm5 413 movapd %xmm1, %xmm2 414 415 addl $SIZE, %eax 416 jl .L16 417 ALIGN_4 418 419.L14: 420#ifndef TRMMKERNEL 421 movupd 0 * SIZE(CO), %xmm0 422 movupd 0 * SIZE(CO, LDC), %xmm1 423#endif 424 425 movddup ALPHA_R, %xmm2 426 movddup ALPHA_I, %xmm3 427 428 SHUFPD_1 %xmm5, %xmm5 429 SHUFPD_1 %xmm7, %xmm7 430 431#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 432 defined(RN) || defined(RT) || defined(CN) || defined(CT) 433 addsubpd %xmm5, %xmm4 434 addsubpd %xmm7, %xmm6 435 436 pshufd $0x4e, %xmm4, %xmm5 437 pshufd $0x4e, %xmm6, %xmm7 438#else 439 addsubpd %xmm4, %xmm5 440 addsubpd %xmm6, %xmm7 441 442 movapd %xmm5, %xmm4 443 pshufd $0x4e, %xmm5, %xmm5 444 movapd %xmm7, %xmm6 445 pshufd $0x4e, %xmm7, %xmm7 446#endif 447 448 mulpd %xmm2, %xmm4 449 mulpd %xmm3, %xmm5 450 mulpd %xmm2, %xmm6 451 mulpd %xmm3, %xmm7 452 453 addsubpd %xmm5, %xmm4 454 addsubpd %xmm7, %xmm6 455 456#ifndef TRMMKERNEL 457 addpd %xmm0, %xmm4 458 addpd %xmm1, %xmm6 459#endif 460 461 movlpd %xmm4, 0 * 
SIZE(CO) 462 movhpd %xmm4, 1 * SIZE(CO) 463 movlpd %xmm6, 0 * SIZE(CO, LDC) 464 movhpd %xmm6, 1 * SIZE(CO, LDC) 465 466#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 467 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 468 movl K, %eax 469 subl KKK, %eax 470 leal (,%eax, SIZE), %eax 471 leal (AO, %eax, 2), AO 472 leal (BO, %eax, 4), BO 473#endif 474 475#if defined(TRMMKERNEL) && defined(LEFT) 476 addl $1, KK 477#endif 478 479 addl $2 * SIZE, CO # coffset += 4 480 decl I # i -- 481 jg .L10 482 ALIGN_4 483 484.L99: 485#if defined(TRMMKERNEL) && !defined(LEFT) 486 addl $2, KK 487#endif 488 489 movl BO, B 490 491 leal (, LDC, 2), %eax 492 addl %eax, C # c += ldc 493 decl J # j -- 494 jg .L01 495 ALIGN_4 496 497.L100: 498 movl N, %eax 499 andl $1, %eax 500 jle .L500 501 ALIGN_4 502 503.L101: 504#if defined(TRMMKERNEL) && defined(LEFT) 505 movl OFFSET, %eax 506 movl %eax, KK 507#endif 508 509 movl C, CO 510 movl A, AO 511 512 movl M, I 513 testl %ebx, I 514 jle .L500 515 ALIGN_4 516 517.L110: 518#if !defined(TRMMKERNEL) || \ 519 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 520 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 521 522 movl B, BO 523#else 524 movl KK, %eax 525 leal (, %eax, SIZE), %eax 526 leal (AO, %eax, 2), AO 527 leal (B, %eax, 2), BO 528#endif 529 530 movddup -16 * SIZE(AO), %xmm0 531 pxor %xmm4, %xmm4 532 movddup -15 * SIZE(AO), %xmm1 533 pxor %xmm5, %xmm5 534 pxor %xmm6, %xmm6 535 pxor %xmm7, %xmm7 536 537 prefetchw 1 * SIZE(CO) 538 539#ifndef TRMMKERNEL 540 movl K, %eax 541#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 542 movl K, %eax 543 subl KK, %eax 544 movl %eax, KKK 545#else 546 movl KK, %eax 547#ifdef LEFT 548 addl $1, %eax 549#else 550 addl $1, %eax 551#endif 552 movl %eax, KKK 553#endif 554 sarl $3, %eax 555 je .L112 556 ALIGN_4 557 558.L111: 559 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 560 561 mulpd -16 * SIZE(BO), %xmm0 562 ADD1 %xmm0, %xmm4 563 
movddup -14 * SIZE(AO), %xmm0 564 mulpd -16 * SIZE(BO), %xmm1 565 ADD2 %xmm1, %xmm5 566 movddup -13 * SIZE(AO), %xmm1 567 568 mulpd -14 * SIZE(BO), %xmm0 569 ADD1 %xmm0, %xmm6 570 movddup -12 * SIZE(AO), %xmm0 571 mulpd -14 * SIZE(BO), %xmm1 572 ADD2 %xmm1, %xmm7 573 movddup -11 * SIZE(AO), %xmm1 574 575 mulpd -12 * SIZE(BO), %xmm0 576 ADD1 %xmm0, %xmm4 577 movddup -10 * SIZE(AO), %xmm0 578 mulpd -12 * SIZE(BO), %xmm1 579 ADD2 %xmm1, %xmm5 580 movddup -9 * SIZE(AO), %xmm1 581 582 mulpd -10 * SIZE(BO), %xmm0 583 ADD1 %xmm0, %xmm6 584 movddup -8 * SIZE(AO), %xmm0 585 mulpd -10 * SIZE(BO), %xmm1 586 ADD2 %xmm1, %xmm7 587 movddup -7 * SIZE(AO), %xmm1 588 589 mulpd -8 * SIZE(BO), %xmm0 590 ADD1 %xmm0, %xmm4 591 movddup -6 * SIZE(AO), %xmm0 592 mulpd -8 * SIZE(BO), %xmm1 593 ADD2 %xmm1, %xmm5 594 movddup -5 * SIZE(AO), %xmm1 595 596 mulpd -6 * SIZE(BO), %xmm0 597 ADD1 %xmm0, %xmm6 598 movddup -4 * SIZE(AO), %xmm0 599 mulpd -6 * SIZE(BO), %xmm1 600 ADD2 %xmm1, %xmm7 601 movddup -3 * SIZE(AO), %xmm1 602 603 mulpd -4 * SIZE(BO), %xmm0 604 ADD1 %xmm0, %xmm4 605 movddup -2 * SIZE(AO), %xmm0 606 mulpd -4 * SIZE(BO), %xmm1 607 ADD2 %xmm1, %xmm5 608 movddup -1 * SIZE(AO), %xmm1 609 610 mulpd -2 * SIZE(BO), %xmm0 611 ADD1 %xmm0, %xmm6 612 movddup 0 * SIZE(AO), %xmm0 613 mulpd -2 * SIZE(BO), %xmm1 614 ADD2 %xmm1, %xmm7 615 movddup 1 * SIZE(AO), %xmm1 616 617 subl $-16 * SIZE, AO 618 subl $-16 * SIZE, BO 619 decl %eax 620 jne .L111 621 ALIGN_4 622 623.L112: 624#ifndef TRMMKERNEL 625 movl K, %eax 626#else 627 movl KKK, %eax 628#endif 629 andl $7, %eax # if (k & 1) 630 BRANCH 631 je .L114 632 ALIGN_4 633 634.L113: 635 mulpd -16 * SIZE(BO), %xmm0 636 ADD1 %xmm0, %xmm4 637 movddup -14 * SIZE(AO), %xmm0 638 mulpd -16 * SIZE(BO), %xmm1 639 ADD2 %xmm1, %xmm5 640 movddup -13 * SIZE(AO), %xmm1 641 642 addl $2 * SIZE, AO 643 addl $2 * SIZE, BO 644 decl %eax 645 jg .L113 646 ALIGN_4 647 648.L114: 649#ifndef TRMMKERNEL 650 movupd 0 * SIZE(CO), %xmm0 651#endif 652 653 movddup ALPHA_R, %xmm2 654 
movddup ALPHA_I, %xmm3 655 656 addpd %xmm6, %xmm4 657 addpd %xmm7, %xmm5 658 659 SHUFPD_1 %xmm5, %xmm5 660 661#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 662 defined(RN) || defined(RT) || defined(CN) || defined(CT) 663 addsubpd %xmm5, %xmm4 664 pshufd $0x4e, %xmm4, %xmm5 665#else 666 addsubpd %xmm4, %xmm5 667 movapd %xmm5, %xmm4 668 pshufd $0x4e, %xmm5, %xmm5 669#endif 670 671 mulpd %xmm2, %xmm4 672 mulpd %xmm3, %xmm5 673 674 addsubpd %xmm5, %xmm4 675 676#ifndef TRMMKERNEL 677 addpd %xmm0, %xmm4 678#endif 679 680 movlpd %xmm4, 0 * SIZE(CO) 681 movhpd %xmm4, 1 * SIZE(CO) 682 683#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 684 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 685 movl K, %eax 686 subl KKK, %eax 687 leal (,%eax, SIZE), %eax 688 leal (AO, %eax, 2), AO 689 leal (BO, %eax, 2), BO 690#endif 691 692#if defined(TRMMKERNEL) && defined(LEFT) 693 addl $1, KK 694#endif 695 696 addl $2 * SIZE, CO # coffset += 4 697 decl I # i -- 698 jg .L110 699 ALIGN_4 700 701.L500: 702 popl %ebx 703 popl %esi 704 popl %edi 705 popl %ebp 706 707 addl $ARGS, %esp 708 709 ret 710 711 EPILOGUE 712