1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define STACK 16 43#define ARGS 0 44 45#define STACK_M 4 + STACK + ARGS(%esi) 46#define STACK_N 8 + STACK + ARGS(%esi) 47#define STACK_K 12 + STACK + ARGS(%esi) 48#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) 49#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) 50#define STACK_A 32 + STACK + ARGS(%esi) 51#define STACK_B 36 + STACK + ARGS(%esi) 52#define STACK_C 40 + STACK + ARGS(%esi) 53#define STACK_LDC 44 + STACK + ARGS(%esi) 54#define STACK_OFFT 48 + STACK + ARGS(%esi) 55 56#define POSINV 0(%esp) 57#define ALPHA_R 16(%esp) 58#define ALPHA_I 32(%esp) 59#define K 48(%esp) 60#define N 52(%esp) 61#define M 56(%esp) 62#define A 60(%esp) 63#define C 64(%esp) 64#define J 68(%esp) 65#define OLD_STACK 72(%esp) 66#define OFFSET 76(%esp) 67#define KK 80(%esp) 68#define KKK 84(%esp) 69#define BUFFER 128(%esp) 70 71#define STACK_ALIGN 4096 72#define STACK_OFFSET 1024 73 74#if defined(OPTERON) || defined(BARCELONA) 75#define PREFETCH prefetch 76#endif 77 78#define PREFETCHSIZE (8 * 10 + 4) 79 80#define AA %edx 81#define BB %ecx 82#define LDC %ebp 83#define B %edi 84 85 86#define KERNEL1(address) \ 87 mulpd %xmm0, %xmm2; \ 88 addpd %xmm2, %xmm4; \ 89 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ 90 movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 91 mulpd %xmm0, %xmm2; \ 92 addpd %xmm2, %xmm5; \ 93 movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 94 mulpd %xmm0, %xmm2; \ 95 mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 96 addpd %xmm2, %xmm6; \ 97 movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 98 addpd %xmm0, %xmm7; \ 99 movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 100 101#define KERNEL2(address) \ 102 mulpd %xmm0, %xmm3; \ 103 addpd %xmm3, %xmm4; \ 104 movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 105 mulpd %xmm0, %xmm3; \ 106 addpd %xmm3, %xmm5; \ 107 movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 
108 mulpd %xmm0, %xmm3; \ 109 mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 110 addpd %xmm3, %xmm6; \ 111 movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 112 addpd %xmm0, %xmm7; \ 113 movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 114 115#define KERNEL3(address) \ 116 mulpd %xmm0, %xmm2; \ 117 addpd %xmm2, %xmm4; \ 118 movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 119 mulpd %xmm0, %xmm2; \ 120 addpd %xmm2, %xmm5; \ 121 movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 122 mulpd %xmm0, %xmm2; \ 123 mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 124 addpd %xmm2, %xmm6; \ 125 movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 126 addpd %xmm0, %xmm7; \ 127 movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 128 129#define KERNEL4(address) \ 130 mulpd %xmm0, %xmm3; \ 131 addpd %xmm3, %xmm4; \ 132 movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 133 mulpd %xmm0, %xmm3; \ 134 addpd %xmm3, %xmm5; \ 135 movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 136 mulpd %xmm0, %xmm3; \ 137 mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 138 addpd %xmm3, %xmm6; \ 139 movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 140 addpd %xmm0, %xmm7; \ 141 movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 142 143#define KERNEL5(address) \ 144 PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ 145 mulpd %xmm1, %xmm2; \ 146 addpd %xmm2, %xmm4; \ 147 movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 148 mulpd %xmm1, %xmm2; \ 149 addpd %xmm2, %xmm5; \ 150 movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 151 mulpd %xmm1, %xmm2; \ 152 mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 153 addpd %xmm2, %xmm6; \ 154 movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 155 addpd %xmm1, %xmm7; \ 156 movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 157 158#define KERNEL6(address) \ 159 mulpd %xmm1, %xmm3; \ 160 addpd %xmm3, %xmm4; \ 161 movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 162 mulpd %xmm1, %xmm3; \ 163 
addpd %xmm3, %xmm5; \ 164 movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 165 mulpd %xmm1, %xmm3; \ 166 mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 167 addpd %xmm3, %xmm6; \ 168 movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 169 addpd %xmm1, %xmm7; \ 170 movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 171 172#define KERNEL7(address) \ 173 mulpd %xmm1, %xmm2; \ 174 addpd %xmm2, %xmm4; \ 175 movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 176 mulpd %xmm1, %xmm2; \ 177 addpd %xmm2, %xmm5; \ 178 movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 179 mulpd %xmm1, %xmm2; \ 180 mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 181 addpd %xmm2, %xmm6; \ 182 movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 183 addpd %xmm1, %xmm7; \ 184 movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 185 186#define KERNEL8(address) \ 187 mulpd %xmm1, %xmm3; \ 188 addpd %xmm3, %xmm4; \ 189 movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 190 mulpd %xmm1, %xmm3; \ 191 addpd %xmm3, %xmm5; \ 192 movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 193 mulpd %xmm1, %xmm3; \ 194 mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 195 addpd %xmm3, %xmm6; \ 196 movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 197 addpd %xmm1, %xmm7; \ 198 movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 199 200 PROLOGUE 201 202 pushl %ebp 203 pushl %edi 204 pushl %esi 205 pushl %ebx 206 207 PROFCODE 208 209 EMMS 210 211 movl %esp, %esi # save old stack 212 213 subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp 214 andl $-STACK_ALIGN, %esp # align stack 215 addl $STACK_OFFSET, %esp 216 217 STACK_TOUCHING 218 219 movl STACK_M, %ebx 220 movl STACK_N, %eax 221 movl STACK_K, %ecx 222 movl STACK_A, %edx 223 224 movl %ebx, M 225 movl %eax, N 226 movl %ecx, K 227 movl %edx, A 228 movl %esi, OLD_STACK 229 230 movl STACK_B, B 231 movl STACK_C, %ebx 232#ifdef TRMMKERNEL 233 movss STACK_OFFT, %xmm4 234#endif 235 236 movlpd STACK_ALPHA_R, %xmm0 237 movlpd STACK_ALPHA_I, 
%xmm1 238 239 pcmpeqb %xmm7, %xmm7 240 psllq $63, %xmm7 # Generate mask 241 pxor %xmm2, %xmm2 242 243 movlpd %xmm0, 0 + ALPHA_R 244 movlpd %xmm0, 8 + ALPHA_R 245 246 movlpd %xmm1, 8 + ALPHA_I 247 xorpd %xmm7, %xmm1 248 movlpd %xmm1, 0 + ALPHA_I 249 250 movlpd %xmm2, 0 + POSINV 251 movlpd %xmm7, 8 + POSINV 252 253 movl %ebx, C 254 movl STACK_LDC, LDC 255 256#ifdef TRMMKERNEL 257 movss %xmm4, OFFSET 258 movss %xmm4, KK 259#ifndef LEFT 260 negl KK 261#endif 262#endif 263 264 sall $ZBASE_SHIFT, LDC 265 266 sarl $1, %eax 267 movl %eax, J # j = n 268 jle .L100 269 ALIGN_4 270 271.L01: 272#if defined(TRMMKERNEL) && defined(LEFT) 273 movl OFFSET, %eax 274 movl %eax, KK 275#endif 276 277 leal BUFFER, %ecx 278 279 movapd POSINV, %xmm7 280 281 movl K, %eax 282 sarl $1, %eax 283 jle .L03 284 ALIGN_4 285 286.L02: 287 prefetchnta 56 * SIZE(B) 288 289 movlpd 0 * SIZE(B), %xmm0 290 movlpd 1 * SIZE(B), %xmm1 291 movlpd 2 * SIZE(B), %xmm2 292 movlpd 3 * SIZE(B), %xmm3 293 movlpd 4 * SIZE(B), %xmm4 294 movlpd 5 * SIZE(B), %xmm5 295 movlpd 6 * SIZE(B), %xmm6 296 movlpd 7 * SIZE(B), %xmm7 297 298 movlpd %xmm0, 0 * SIZE(BB) 299 movlpd %xmm0, 1 * SIZE(BB) 300 movlpd %xmm1, 2 * SIZE(BB) 301 movlpd %xmm1, 3 * SIZE(BB) 302 movlpd %xmm2, 4 * SIZE(BB) 303 movlpd %xmm2, 5 * SIZE(BB) 304 movlpd %xmm3, 6 * SIZE(BB) 305 movlpd %xmm3, 7 * SIZE(BB) 306 movlpd %xmm4, 8 * SIZE(BB) 307 movlpd %xmm4, 9 * SIZE(BB) 308 movlpd %xmm5, 10 * SIZE(BB) 309 movlpd %xmm5, 11 * SIZE(BB) 310 movlpd %xmm6, 12 * SIZE(BB) 311 movlpd %xmm6, 13 * SIZE(BB) 312 movlpd %xmm7, 14 * SIZE(BB) 313 movlpd %xmm7, 15 * SIZE(BB) 314 315 addl $ 8 * SIZE, B 316 subl $-16 * SIZE, BB 317 318 decl %eax 319 jne .L02 320 ALIGN_4 321 322.L03: 323 movl K, %eax 324 andl $1, %eax 325 BRANCH 326 jle .L05 327 328 movlpd 0 * SIZE(B), %xmm0 329 movlpd 1 * SIZE(B), %xmm1 330 movlpd 2 * SIZE(B), %xmm2 331 movlpd 3 * SIZE(B), %xmm3 332 333 movlpd %xmm0, 0 * SIZE(BB) 334 movlpd %xmm0, 1 * SIZE(BB) 335 movlpd %xmm1, 2 * SIZE(BB) 336 movlpd %xmm1, 3 
* SIZE(BB) 337 movlpd %xmm2, 4 * SIZE(BB) 338 movlpd %xmm2, 5 * SIZE(BB) 339 movlpd %xmm3, 6 * SIZE(BB) 340 movlpd %xmm3, 7 * SIZE(BB) 341 342 addl $4 * SIZE, B 343 ALIGN_4 344 345.L05: 346 movl C, %esi # coffset = c 347 movl A, AA # aoffset = a 348 movl M, %ebx 349 testl %ebx, %ebx 350 jle .L100 351 ALIGN_4 352 353.L10: 354#if !defined(TRMMKERNEL) || \ 355 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 356 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 357 358 leal BUFFER, BB # boffset1 = boffset 359#else 360 leal BUFFER, BB # boffset1 = boffset 361 movl KK, %eax 362 leal (, %eax, SIZE), %eax 363 leal (AA, %eax, 2), AA 364 leal (BB, %eax, 8), BB 365#endif 366 367 movapd 0 * SIZE(AA), %xmm0 368 pxor %xmm4, %xmm4 369 movapd 8 * SIZE(AA), %xmm1 370 pxor %xmm5, %xmm5 371 movapd 0 * SIZE(BB), %xmm2 372 pxor %xmm6, %xmm6 373 movapd 8 * SIZE(BB), %xmm3 374 pxor %xmm7, %xmm7 375 376 prefetchw 2 * SIZE(%esi) 377 prefetchw 2 * SIZE(%esi, LDC) 378 379#ifndef TRMMKERNEL 380 movl K, %eax 381#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 382 movl K, %eax 383 subl KK, %eax 384 movl %eax, KKK 385#else 386 movl KK, %eax 387#ifdef LEFT 388 addl $1, %eax 389#else 390 addl $2, %eax 391#endif 392 movl %eax, KKK 393#endif 394 395#if 1 396 andl $-8, %eax 397 sall $4, %eax 398 je .L15 399.L1X: 400 KERNEL1(16 * 0) 401 KERNEL2(16 * 0) 402 KERNEL3(16 * 0) 403 KERNEL4(16 * 0) 404 KERNEL5(16 * 0) 405 KERNEL6(16 * 0) 406 KERNEL7(16 * 0) 407 KERNEL8(16 * 0) 408 cmpl $128 * 1, %eax 409 jle .L12 410 KERNEL1(16 * 1) 411 KERNEL2(16 * 1) 412 KERNEL3(16 * 1) 413 KERNEL4(16 * 1) 414 KERNEL5(16 * 1) 415 KERNEL6(16 * 1) 416 KERNEL7(16 * 1) 417 KERNEL8(16 * 1) 418 cmpl $128 * 2, %eax 419 jle .L12 420 KERNEL1(16 * 2) 421 KERNEL2(16 * 2) 422 KERNEL3(16 * 2) 423 KERNEL4(16 * 2) 424 KERNEL5(16 * 2) 425 KERNEL6(16 * 2) 426 KERNEL7(16 * 2) 427 KERNEL8(16 * 2) 428 cmpl $128 * 3, %eax 429 jle .L12 430 KERNEL1(16 * 3) 431 KERNEL2(16 * 3) 432 
KERNEL3(16 * 3) 433 KERNEL4(16 * 3) 434 KERNEL5(16 * 3) 435 KERNEL6(16 * 3) 436 KERNEL7(16 * 3) 437 KERNEL8(16 * 3) 438 cmpl $128 * 4, %eax 439 jle .L12 440 KERNEL1(16 * 4) 441 KERNEL2(16 * 4) 442 KERNEL3(16 * 4) 443 KERNEL4(16 * 4) 444 KERNEL5(16 * 4) 445 KERNEL6(16 * 4) 446 KERNEL7(16 * 4) 447 KERNEL8(16 * 4) 448 cmpl $128 * 5, %eax 449 jle .L12 450 KERNEL1(16 * 5) 451 KERNEL2(16 * 5) 452 KERNEL3(16 * 5) 453 KERNEL4(16 * 5) 454 KERNEL5(16 * 5) 455 KERNEL6(16 * 5) 456 KERNEL7(16 * 5) 457 KERNEL8(16 * 5) 458 cmpl $128 * 6, %eax 459 jle .L12 460 KERNEL1(16 * 6) 461 KERNEL2(16 * 6) 462 KERNEL3(16 * 6) 463 KERNEL4(16 * 6) 464 KERNEL5(16 * 6) 465 KERNEL6(16 * 6) 466 KERNEL7(16 * 6) 467 KERNEL8(16 * 6) 468 cmpl $128 * 7, %eax 469 jle .L12 470 KERNEL1(16 * 7) 471 KERNEL2(16 * 7) 472 KERNEL3(16 * 7) 473 KERNEL4(16 * 7) 474 KERNEL5(16 * 7) 475 KERNEL6(16 * 7) 476 KERNEL7(16 * 7) 477 KERNEL8(16 * 7) 478 479 addl $128 * 4 * SIZE, BB 480 addl $128 * 1 * SIZE, AA 481 subl $128 * 8, %eax 482 jg .L1X 483 jmp .L15 484 485.L12: 486 leal (AA, %eax, 1), AA 487 leal (BB, %eax, 4), BB 488 ALIGN_4 489#else 490 491 sarl $3, %eax 492 je .L15 493 ALIGN_4 494 495.L12: 496 KERNEL1(16 * 0) 497 KERNEL2(16 * 0) 498 KERNEL3(16 * 0) 499 KERNEL4(16 * 0) 500 KERNEL5(16 * 0) 501 KERNEL6(16 * 0) 502 KERNEL7(16 * 0) 503 KERNEL8(16 * 0) 504 505 addl $64 * SIZE, BB 506 addl $16 * SIZE, AA 507 decl %eax 508 jne .L11 509 ALIGN_4 510#endif 511 512.L15: 513#ifndef TRMMKERNEL 514 movl K, %eax 515#else 516 movl KKK, %eax 517#endif 518 andl $7, %eax # if (k & 1) 519 BRANCH 520 je .L14 521 ALIGN_4 522 523.L13: 524 mulpd %xmm0, %xmm2 525 addpd %xmm2, %xmm4 526 movapd 2 * SIZE(BB), %xmm2 527 mulpd %xmm0, %xmm2 528 addpd %xmm2, %xmm5 529 movapd 4 * SIZE(BB), %xmm2 530 mulpd %xmm0, %xmm2 531 mulpd 6 * SIZE(BB), %xmm0 532 addpd %xmm2, %xmm6 533 movapd 8 * SIZE(BB), %xmm2 534 addpd %xmm0, %xmm7 535 movapd 2 * SIZE(AA), %xmm0 536 537 addl $2 * SIZE, AA 538 addl $8 * SIZE, BB 539 decl %eax 540 jg .L13 541 ALIGN_4 542 
/* ---- write-back for the 2-column micro-tile ----------------------- */
.L14:
	movapd	POSINV, %xmm1
	movapd	ALPHA_R, %xmm2
	movapd	ALPHA_I, %xmm3

	/* xmm5/xmm7 hold a * b_im; swap their halves so they line up as
	   (a_im*b_im, a_re*b_im) against (a_re*b_re, a_im*b_re) in
	   xmm4/xmm6.  SHUFPD_1 presumably = shufpd $1 (from common.h). */
	SHUFPD_1 %xmm5, %xmm5
	SHUFPD_1 %xmm7, %xmm7

	/* Flip the sign of one half (POSINV negates only the high double)
	   to implement the conjugation variant selected at build time. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm1, %xmm5
	xorpd	%xmm1, %xmm7
#else
	xorpd	%xmm1, %xmm4
	xorpd	%xmm1, %xmm6
#endif

#ifndef TRMMKERNEL
	/* Load the existing C values (both columns) for the += update. */
	movlpd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
	movlpd	0 * SIZE(%esi, LDC), %xmm1
	movhpd	1 * SIZE(%esi, LDC), %xmm1
#endif

	/* Combine real and imaginary partial products. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6
#else
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
#endif

	/* Complex multiply by alpha: pshufd $0x4e swaps the 64-bit
	   halves; ALPHA_I already carries the sign flip (see prologue). */
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm5
	mulpd	%xmm2, %xmm6
	mulpd	%xmm3, %xmm7

	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm6
#endif

	/* Store one complex element into each of the two C columns. */
	movlpd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)
	movlpd	%xmm6, 0 * SIZE(%esi, LDC)
	movhpd	%xmm6, 1 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: advance AA/BB past the part of the panel not computed. */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, %esi	# coffset += one complex (2 doubles)
	decl	%ebx	# i --
	jg	.L10
	ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	leal	(, LDC, 2), %eax
	addl	%eax, C	# c += 2 * ldc (two columns done)
	decl	J	# j --
	jg	.L01
	ALIGN_4

/* ---- remainder path: last single column when N is odd ------------- */
.L100:
	movl	N, %eax
	andl	$1, %eax
	jle	.L500
	ALIGN_4

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, %ecx
	movapd	POSINV, %xmm7

	/* Pack the final B column, duplicating every scalar; main loop
	   handles 4 k-steps (8 doubles) per iteration. */
	movl	K, %eax
	sarl	$2, %eax
	jle	.L103
	ALIGN_4

.L102:
	prefetchnta	56 * SIZE(B)

	movlpd	0 * SIZE(B), %xmm0
	movlpd	1 * SIZE(B), %xmm1
	movlpd	2 * SIZE(B), %xmm2
	movlpd	3 * SIZE(B), %xmm3
	movlpd	4 * SIZE(B), %xmm4
	movlpd	5 * SIZE(B), %xmm5
	movlpd	6 * SIZE(B), %xmm6
	movlpd	7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)
	movlpd	%xmm2,  4 * SIZE(BB)
	movlpd	%xmm2,  5 * SIZE(BB)
	movlpd	%xmm3,  6 * SIZE(BB)
	movlpd	%xmm3,  7 * SIZE(BB)
	movlpd	%xmm4,  8 * SIZE(BB)
	movlpd	%xmm4,  9 * SIZE(BB)
	movlpd	%xmm5, 10 * SIZE(BB)
	movlpd	%xmm5, 11 * SIZE(BB)
	movlpd	%xmm6, 12 * SIZE(BB)
	movlpd	%xmm6, 13 * SIZE(BB)
	movlpd	%xmm7, 14 * SIZE(BB)
	movlpd	%xmm7, 15 * SIZE(BB)

	addl	$ 8 * SIZE, B
	subl	$-16 * SIZE, %ecx	# %ecx is BB
	decl	%eax
	jne	.L102
	ALIGN_4

.L103:
	/* Remainder: (k & 3) single k-steps of packing. */
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	0 * SIZE(B), %xmm0
	movlpd	1 * SIZE(B), %xmm1

	movlpd	%xmm0, 0 * SIZE(BB)
	movlpd	%xmm0, 1 * SIZE(BB)
	movlpd	%xmm1, 2 * SIZE(BB)
	movlpd	%xmm1, 3 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$4 * SIZE, %ecx
	decl	%eax
	jne	.L104
	ALIGN_4

.L105:
	movl	C, %esi	# coffset = last column of C
	movl	A, AA	# aoffset = start of A panel
	movl	M, %ebx	# i = m
	testl	%ebx, %ebx
	jle	.L500
	ALIGN_4

/* ---- per-row micro-kernel for the single remaining column --------- */
.L110:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
#else
	/* TRMM: skip the known-zero triangle. */
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

	/* Two accumulator pairs (folded together at .L114):
	   xmm4/xmm6 += a * b_re, xmm5/xmm7 += a * b_im. */
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movapd	0 * SIZE(AA), %xmm0
	movapd	8 * SIZE(AA), %xmm1
	movapd	0 * SIZE(BB), %xmm2
	movapd	8 * SIZE(BB), %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax	# M-unroll = 1
#else
	addl	$1, %eax	# N-unroll = 1 here (both branches add 1)
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_4

	/* Main loop: 8 k-steps per iteration, 16 A / 32 BB doubles. */
.L111:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	%xmm0, %xmm2
	mulpd	2 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm4
	movapd	4 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm5
	movapd	2 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm2
	mulpd	6 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm6
	movapd	16 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm7
	movapd	4 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	10 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm4
	movapd	12 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm5
	movapd	6 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm6
	movapd	24 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm7
	movapd	16 * SIZE(AA), %xmm0
	mulpd	%xmm1, %xmm2
	mulpd	18 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm4
	movapd	20 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm5
	movapd	10 * SIZE(AA), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	22 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm6
	movapd	32 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm7
	movapd	12 * SIZE(AA), %xmm1
	mulpd	%xmm1, %xmm3
	mulpd	26 * SIZE(BB), %xmm1
	addpd	%xmm3, %xmm4
	movapd	28 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm5
	movapd	14 * SIZE(AA), %xmm1
	mulpd	%xmm1, %xmm3
	mulpd	30 * SIZE(BB), %xmm1
	addpd	%xmm3, %xmm6
	movapd	40 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm7
	movapd	24 * SIZE(AA), %xmm1

	addl	$16 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L111
	ALIGN_4

.L112:
	/* Scalar tail: remaining (k & 7) iterations. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax	# k & 7
	BRANCH
	je	.L114
	ALIGN_4

.L113:
	mulpd	%xmm0, %xmm2
	mulpd	2 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm4
	movapd	4 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm5
	movapd	2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L113
	ALIGN_4

/* ---- write-back for the single-column tile ------------------------ */
.L114:
	movapd	POSINV, %xmm1
	movapd	ALPHA_R, %xmm2
	movapd	ALPHA_I, %xmm3

	/* Fold the two accumulator pairs, then proceed exactly as .L14
	   but for one column. */
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm1, %xmm5
#else
	xorpd	%xmm1, %xmm4
#endif

#ifndef TRMMKERNEL
	movlpd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm5, %xmm4
#else
	addpd	%xmm5, %xmm4
#endif

	/* Complex multiply by alpha (swap halves, two mulpd, add). */
	pshufd	$0x4e, %xmm4, %xmm5

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm5

	addpd	%xmm5, %xmm4

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm4
#endif

	movlpd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, %esi	# coffset += one complex (2 doubles)
	decl	%ebx	# i --
	jg	.L110
	ALIGN_4

/* ---- epilogue: restore caller stack and callee-saved registers ---- */
.L500:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE