/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define STACK 16 43 44#define OLD_M 4 + STACK(%esi) 45#define OLD_N 8 + STACK(%esi) 46#define OLD_K 12 + STACK(%esi) 47#define OLD_A 20 + STACK(%esi) 48#define OLD_B 24 + STACK(%esi) 49#define OLD_C 28 + STACK(%esi) 50#define OLD_LDC 32 + STACK(%esi) 51#define STACK_OFFT 36 + STACK(%esi) 52 53#define K 16(%esp) 54#define N 20(%esp) 55#define M 24(%esp) 56#define A 28(%esp) 57#define C 32(%esp) 58#define J 36(%esp) 59#define OLD_STACK 40(%esp) 60#define OFFSET 44(%esp) 61#define KK 48(%esp) 62#define KKK 52(%esp) 63#define AORIG 56(%esp) 64#define BORIG 60(%esp) 65#define BUFFER 128(%esp) 66 67#if defined(OPTERON) || defined(BARCELONA) 68#define PREFETCH prefetch 69#define PREFETCHW prefetchw 70#define PREFETCHSIZE (16 * 10 + 8) 71#endif 72 73#if defined(PENTIUM4) || defined(PENTIUMM) 74#define PREFETCH prefetcht0 75#define PREFETCHW prefetcht0 76#define PREFETCHSIZE 96 77#endif 78 79#if defined(PENRYN) || defined(DUNNINGTON) 80#define PREFETCH prefetcht0 81#define PREFETCHW prefetcht0 82#define PREFETCHSIZE 96 83#endif 84 85#define B %edi 86#define AA %edx 87#define BB %ecx 88#define LDC %ebp 89#define CO1 %esi 90 91#if defined(OPTERON) || !defined(HAVE_SSE2) 92#define movsd movlps 93#endif 94 95#ifdef HAVE_SSE2 96#define xorps pxor 97#endif 98 99#define KERNEL1(address) \ 100 mulps %xmm0, %xmm2; \ 101 addps %xmm2, %xmm4; \ 102 movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 103 mulps %xmm0, %xmm2; \ 104 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ 105 addps %xmm2, %xmm5; \ 106 movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 107 mulps %xmm0, %xmm2; \ 108 mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 109 addps %xmm2, %xmm6; \ 110 movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 111 addps %xmm0, %xmm7; \ 112 movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 113 114#define KERNEL2(address) \ 115 mulps 
%xmm0, %xmm3; \ 116 addps %xmm3, %xmm4; \ 117 movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 118 mulps %xmm0, %xmm3; \ 119 addps %xmm3, %xmm5; \ 120 movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 121 mulps %xmm0, %xmm3; \ 122 mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 123 addps %xmm3, %xmm6; \ 124 movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 125 addps %xmm0, %xmm7; \ 126 movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 127 128#define KERNEL3(address) \ 129 mulps %xmm0, %xmm2; \ 130 addps %xmm2, %xmm4; \ 131 movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 132 mulps %xmm0, %xmm2; \ 133 addps %xmm2, %xmm5; \ 134 movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 135 mulps %xmm0, %xmm2; \ 136 mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 137 addps %xmm2, %xmm6; \ 138 movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 139 addps %xmm0, %xmm7; \ 140 movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 141 142#define KERNEL4(address) \ 143 mulps %xmm0, %xmm3; \ 144 addps %xmm3, %xmm4; \ 145 movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 146 mulps %xmm0, %xmm3; \ 147 addps %xmm3, %xmm5; \ 148 movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 149 mulps %xmm0, %xmm3; \ 150 mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 151 addps %xmm3, %xmm6; \ 152 movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 153 addps %xmm0, %xmm7; \ 154 movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 155 156#define KERNEL5(address) \ 157 mulps %xmm1, %xmm2; \ 158 addps %xmm2, %xmm4; \ 159 movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 160 mulps %xmm1, %xmm2; \ 161 addps %xmm2, %xmm5; \ 162 movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 163 mulps %xmm1, %xmm2; \ 164 mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 165 addps %xmm2, %xmm6; \ 166 movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 167 addps %xmm1, %xmm7; \ 168 movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 169 170#define KERNEL6(address) \ 
171 mulps %xmm1, %xmm3; \ 172 addps %xmm3, %xmm4; \ 173 movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 174 mulps %xmm1, %xmm3; \ 175 addps %xmm3, %xmm5; \ 176 movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 177 mulps %xmm1, %xmm3; \ 178 mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 179 addps %xmm3, %xmm6; \ 180 movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 181 addps %xmm1, %xmm7; \ 182 movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 183 184#define KERNEL7(address) \ 185 mulps %xmm1, %xmm2; \ 186 addps %xmm2, %xmm4; \ 187 movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 188 mulps %xmm1, %xmm2; \ 189 addps %xmm2, %xmm5; \ 190 movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 191 mulps %xmm1, %xmm2; \ 192 mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 193 addps %xmm2, %xmm6; \ 194 movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 195 addps %xmm1, %xmm7; \ 196 movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 197 198#define KERNEL8(address) \ 199 mulps %xmm1, %xmm3; \ 200 addps %xmm3, %xmm4; \ 201 movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 202 mulps %xmm1, %xmm3; \ 203 addps %xmm3, %xmm5; \ 204 movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 205 mulps %xmm1, %xmm3; \ 206 mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 207 addps %xmm3, %xmm6; \ 208 movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 209 addps %xmm1, %xmm7; \ 210 movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; 211 212 PROLOGUE 213 214 pushl %ebp 215 pushl %edi 216 pushl %esi 217 pushl %ebx 218 219 PROFCODE 220 221 movl %esp, %esi 222 223 subl $128 + LOCAL_BUFFER_SIZE, %esp 224 andl $-1024, %esp 225 226 STACK_TOUCHING 227 228 movl OLD_M, %ebx 229 movl OLD_N, %eax 230 movl OLD_K, %ecx 231 movl OLD_A, %edx 232 233 movl %ebx, M 234 movl %eax, N 235 movl %ecx, K 236 movl %edx, A 237 movl %esi, OLD_STACK 238 movss STACK_OFFT, %xmm4 239 240 movl OLD_B, B 241 movl OLD_C, %ebx 242 243 movl %ebx, C 244 movl OLD_LDC, LDC 245 246 
movss %xmm4, OFFSET 247 movss %xmm4, KK 248 249 leal (, LDC, SIZE), LDC 250 251#ifdef LN 252 movl M, %eax 253 leal (, %eax, SIZE), %eax 254 addl %eax, C 255 imull K, %eax 256 addl %eax, A 257#endif 258 259#ifdef RT 260 movl N, %eax 261 leal (, %eax, SIZE), %eax 262 imull K, %eax 263 addl %eax, B 264 movl N, %eax 265 imull LDC, %eax 266 addl %eax, C 267#endif 268 269#ifdef RN 270 negl KK 271#endif 272 273#ifdef RT 274 movl N, %eax 275 subl OFFSET, %eax 276 movl %eax, KK 277#endif 278 279 movl N, %eax 280 sarl $2, %eax 281 movl %eax, J 282 jle .L40 283 284.L01: 285#ifdef LN 286 movl OFFSET, %eax 287 addl M, %eax 288 movl %eax, KK 289#endif 290 291 leal BUFFER, %ecx 292 293#ifdef RT 294 movl K, %eax 295 sall $2 + BASE_SHIFT, %eax 296 subl %eax, B 297#endif 298 299#if defined(LN) || defined(RT) 300 movl KK, %eax 301 movl B, BORIG 302 sall $2 + BASE_SHIFT, %eax 303 leal (B, %eax, 1), B 304 leal (BB, %eax, 4), BB 305#endif 306 307#ifdef LT 308 movl OFFSET, %eax 309 movl %eax, KK 310#endif 311 312#if defined(LT) || defined(RN) 313 movl KK, %eax 314#else 315 movl K, %eax 316 subl KK, %eax 317#endif 318 sarl $1, %eax 319 jle .L05 320 ALIGN_4 321 322.L02: 323 movaps 0 * SIZE(B), %xmm3 324 movaps 4 * SIZE(B), %xmm7 325 326 pshufd $0x00, %xmm3, %xmm0 327 pshufd $0x55, %xmm3, %xmm1 328 pshufd $0xaa, %xmm3, %xmm2 329 pshufd $0xff, %xmm3, %xmm3 330 331 pshufd $0x00, %xmm7, %xmm4 332 pshufd $0x55, %xmm7, %xmm5 333 pshufd $0xaa, %xmm7, %xmm6 334 pshufd $0xff, %xmm7, %xmm7 335 336 movaps %xmm0, 0 * SIZE(BB) 337 movaps %xmm1, 4 * SIZE(BB) 338 movaps %xmm2, 8 * SIZE(BB) 339 movaps %xmm3, 12 * SIZE(BB) 340 movaps %xmm4, 16 * SIZE(BB) 341 movaps %xmm5, 20 * SIZE(BB) 342 movaps %xmm6, 24 * SIZE(BB) 343 movaps %xmm7, 28 * SIZE(BB) 344 345 addl $ 8 * SIZE, B 346 addl $32 * SIZE, %ecx 347 decl %eax 348 jne .L02 349 ALIGN_2 350 351.L05: 352#if defined(LT) || defined(RN) 353 movl KK, %eax 354#else 355 movl K, %eax 356 subl KK, %eax 357#endif 358 andl $1, %eax 359 BRANCH 360 jle .L10 361 362 
movaps 0 * SIZE(B), %xmm3 363 364 pshufd $0x00, %xmm3, %xmm0 365 pshufd $0x55, %xmm3, %xmm1 366 pshufd $0xaa, %xmm3, %xmm2 367 pshufd $0xff, %xmm3, %xmm3 368 369 movaps %xmm0, 0 * SIZE(BB) 370 movaps %xmm1, 4 * SIZE(BB) 371 movaps %xmm2, 8 * SIZE(BB) 372 movaps %xmm3, 12 * SIZE(BB) 373 374 addl $4 * SIZE, B 375 ALIGN_4 376 377.L10: 378#if defined(LT) || defined(RN) 379 movl A, AA 380#else 381 movl A, %eax 382 movl %eax, AORIG 383#endif 384 385 leal (, LDC, 4), %eax 386 387#ifdef RT 388 subl %eax, C 389#endif 390 movl C, CO1 391#ifndef RT 392 addl %eax, C 393#endif 394 395 testl $1, M 396 je .L20 397 398#ifdef LN 399 movl K, %eax 400 sall $BASE_SHIFT, %eax 401 subl %eax, AORIG 402#endif 403 404#if defined(LN) || defined(RT) 405 movl KK, %eax 406 movl AORIG, AA 407 leal (AA, %eax, SIZE), AA 408#endif 409 410 leal BUFFER, BB 411 412#if defined(LN) || defined(RT) 413 movl KK, %eax 414 sall $2 + BASE_SHIFT, %eax 415 leal (BB, %eax, 4), BB 416#endif 417 418 movss 0 * SIZE(AA), %xmm0 419 xorps %xmm4, %xmm4 420 movss 4 * SIZE(AA), %xmm1 421 xorps %xmm5, %xmm5 422 movss 0 * SIZE(BB), %xmm2 423 xorps %xmm6, %xmm6 424 movss 16 * SIZE(BB), %xmm3 425 xorps %xmm7, %xmm7 426 427#if defined(LT) || defined(RN) 428 movl KK, %eax 429#else 430 movl K, %eax 431 subl KK, %eax 432#endif 433 sarl $3, %eax 434 je .L35 435 ALIGN_4 436 437.L32: 438 mulss %xmm0, %xmm2 439 addss %xmm2, %xmm4 440#if defined(OPTERON) || defined(BARCELONA) 441 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 442#endif 443 movss 4 * SIZE(BB), %xmm2 444 mulss %xmm0, %xmm2 445 addss %xmm2, %xmm5 446 movss 8 * SIZE(BB), %xmm2 447 mulss %xmm0, %xmm2 448 mulss 12 * SIZE(BB), %xmm0 449 addss %xmm2, %xmm6 450 movss 32 * SIZE(BB), %xmm2 451 addss %xmm0, %xmm7 452 movss 1 * SIZE(AA), %xmm0 453 454 mulss %xmm0, %xmm3 455 addss %xmm3, %xmm4 456 movss 20 * SIZE(BB), %xmm3 457 mulss %xmm0, %xmm3 458 addss %xmm3, %xmm5 459 movss 24 * SIZE(BB), %xmm3 460 mulss %xmm0, %xmm3 461 mulss 28 * SIZE(BB), %xmm0 462 addss %xmm3, %xmm6 463 movss 
48 * SIZE(BB), %xmm3 464 addss %xmm0, %xmm7 465 movss 2 * SIZE(AA), %xmm0 466 467 mulss %xmm0, %xmm2 468 addss %xmm2, %xmm4 469 movss 36 * SIZE(BB), %xmm2 470 mulss %xmm0, %xmm2 471 addss %xmm2, %xmm5 472 movss 40 * SIZE(BB), %xmm2 473 mulss %xmm0, %xmm2 474 mulss 44 * SIZE(BB), %xmm0 475 addss %xmm2, %xmm6 476 movss 64 * SIZE(BB), %xmm2 477 addss %xmm0, %xmm7 478 movss 3 * SIZE(AA), %xmm0 479 480 mulss %xmm0, %xmm3 481 addss %xmm3, %xmm4 482 movss 52 * SIZE(BB), %xmm3 483 mulss %xmm0, %xmm3 484 addss %xmm3, %xmm5 485 movss 56 * SIZE(BB), %xmm3 486 mulss %xmm0, %xmm3 487 mulss 60 * SIZE(BB), %xmm0 488 addss %xmm3, %xmm6 489 movss 80 * SIZE(BB), %xmm3 490 addss %xmm0, %xmm7 491 movss 8 * SIZE(AA), %xmm0 492 493 mulss %xmm1, %xmm2 494 addss %xmm2, %xmm4 495 movss 68 * SIZE(BB), %xmm2 496 mulss %xmm1, %xmm2 497 addss %xmm2, %xmm5 498 movss 72 * SIZE(BB), %xmm2 499 mulss %xmm1, %xmm2 500 mulss 76 * SIZE(BB), %xmm1 501 addss %xmm2, %xmm6 502 movss 96 * SIZE(BB), %xmm2 503 addss %xmm1, %xmm7 504 movss 5 * SIZE(AA), %xmm1 505 506 mulss %xmm1, %xmm3 507 addss %xmm3, %xmm4 508 movss 84 * SIZE(BB), %xmm3 509 mulss %xmm1, %xmm3 510 addss %xmm3, %xmm5 511 movss 88 * SIZE(BB), %xmm3 512 mulss %xmm1, %xmm3 513 mulss 92 * SIZE(BB), %xmm1 514 addss %xmm3, %xmm6 515 movss 112 * SIZE(BB), %xmm3 516 addss %xmm1, %xmm7 517 movss 6 * SIZE(AA), %xmm1 518 519 mulss %xmm1, %xmm2 520 addss %xmm2, %xmm4 521 movss 100 * SIZE(BB), %xmm2 522 mulss %xmm1, %xmm2 523 addss %xmm2, %xmm5 524 movss 104 * SIZE(BB), %xmm2 525 mulss %xmm1, %xmm2 526 mulss 108 * SIZE(BB), %xmm1 527 addss %xmm2, %xmm6 528 movss 128 * SIZE(BB), %xmm2 529 addss %xmm1, %xmm7 530 movss 7 * SIZE(AA), %xmm1 531 532 mulss %xmm1, %xmm3 533 addss %xmm3, %xmm4 534 movss 116 * SIZE(BB), %xmm3 535 mulss %xmm1, %xmm3 536 addss %xmm3, %xmm5 537 movss 120 * SIZE(BB), %xmm3 538 mulss %xmm1, %xmm3 539 mulss 124 * SIZE(BB), %xmm1 540 addss %xmm3, %xmm6 541 movss 144 * SIZE(BB), %xmm3 542 addss %xmm1, %xmm7 543 movss 12 * SIZE(AA), %xmm1 
544 545 addl $ 8 * SIZE, AA 546 addl $128 * SIZE, BB 547 decl %eax 548 jne .L32 549 ALIGN_4 550 551.L35: 552#if defined(LT) || defined(RN) 553 movl KK, %eax 554#else 555 movl K, %eax 556 subl KK, %eax 557#endif 558 andl $7, %eax # if (k & 1) 559 BRANCH 560 je .L38 561 ALIGN_4 562 563.L36: 564 mulss %xmm0, %xmm2 565 addss %xmm2, %xmm4 566 movss 4 * SIZE(BB), %xmm2 567 mulss %xmm0, %xmm2 568 addss %xmm2, %xmm5 569 movss 8 * SIZE(BB), %xmm2 570 mulss %xmm0, %xmm2 571 mulss 12 * SIZE(BB), %xmm0 572 addss %xmm2, %xmm6 573 movss 16 * SIZE(BB), %xmm2 574 addss %xmm0, %xmm7 575 movss 1 * SIZE(AA), %xmm0 576 577 addl $ 1 * SIZE, AA 578 addl $16 * SIZE, BB 579 decl %eax 580 jg .L36 581 ALIGN_4 582 583.L38: 584#if defined(LN) || defined(RT) 585 movl KK, %eax 586#ifdef LN 587 subl $1, %eax 588#else 589 subl $4, %eax 590#endif 591 592 movl AORIG, AA 593 movl BORIG, B 594 leal BUFFER, BB 595 596 leal (AA, %eax, SIZE), AA 597 598 sall $2 + BASE_SHIFT, %eax 599 leal (B, %eax, 1), B 600 leal (BB, %eax, 4), BB 601#endif 602 603#if defined(LN) || defined(LT) 604 unpcklps %xmm6, %xmm4 605 unpcklps %xmm7, %xmm5 606 unpcklps %xmm5, %xmm4 607 608 movaps 0 * SIZE(B), %xmm1 609 610 subps %xmm4, %xmm1 611#else 612 movss 0 * SIZE(AA), %xmm0 613 movss 1 * SIZE(AA), %xmm1 614 movss 2 * SIZE(AA), %xmm2 615 movss 3 * SIZE(AA), %xmm3 616 617 subss %xmm4, %xmm0 618 subss %xmm5, %xmm1 619 subss %xmm6, %xmm2 620 subss %xmm7, %xmm3 621#endif 622 623#if defined(LN) || defined(LT) 624 movss 0 * SIZE(AA), %xmm4 625 pshufd $0x00, %xmm4, %xmm6 626 mulps %xmm6, %xmm1 627#endif 628 629#ifdef RN 630 movaps 0 * SIZE(B), %xmm6 631 pshufd $0x00, %xmm6, %xmm7 632 mulss %xmm7, %xmm0 633 pshufd $0x55, %xmm6, %xmm7 634 mulss %xmm0, %xmm7 635 subss %xmm7, %xmm1 636 pshufd $0xaa, %xmm6, %xmm7 637 mulss %xmm0, %xmm7 638 subss %xmm7, %xmm2 639 pshufd $0xff, %xmm6, %xmm7 640 mulss %xmm0, %xmm7 641 subss %xmm7, %xmm3 642 643 movaps 4 * SIZE(B), %xmm6 644 pshufd $0x55, %xmm6, %xmm7 645 mulss %xmm7, %xmm1 646 pshufd $0xaa, 
%xmm6, %xmm7 647 mulss %xmm1, %xmm7 648 subss %xmm7, %xmm2 649 pshufd $0xff, %xmm6, %xmm7 650 mulss %xmm1, %xmm7 651 subss %xmm7, %xmm3 652 653 movaps 8 * SIZE(B), %xmm6 654 pshufd $0xaa, %xmm6, %xmm7 655 mulss %xmm7, %xmm2 656 pshufd $0xff, %xmm6, %xmm7 657 mulss %xmm2, %xmm7 658 subss %xmm7, %xmm3 659 660 movaps 12 * SIZE(B), %xmm6 661 pshufd $0xff, %xmm6, %xmm7 662 mulss %xmm7, %xmm3 663#endif 664 665#ifdef RT 666 movaps 12 * SIZE(B), %xmm6 667 pshufd $0xff, %xmm6, %xmm7 668 mulss %xmm7, %xmm3 669 pshufd $0xaa, %xmm6, %xmm7 670 mulss %xmm3, %xmm7 671 subss %xmm7, %xmm2 672 pshufd $0x55, %xmm6, %xmm7 673 mulss %xmm3, %xmm7 674 subss %xmm7, %xmm1 675 pshufd $0x00, %xmm6, %xmm7 676 mulss %xmm3, %xmm7 677 subss %xmm7, %xmm0 678 679 movaps 8 * SIZE(B), %xmm6 680 pshufd $0xaa, %xmm6, %xmm7 681 mulss %xmm7, %xmm2 682 pshufd $0x55, %xmm6, %xmm7 683 mulss %xmm2, %xmm7 684 subss %xmm7, %xmm1 685 pshufd $0x00, %xmm6, %xmm7 686 mulss %xmm2, %xmm7 687 subss %xmm7, %xmm0 688 689 movaps 4 * SIZE(B), %xmm6 690 pshufd $0x55, %xmm6, %xmm7 691 mulss %xmm7, %xmm1 692 pshufd $0x00, %xmm6, %xmm7 693 mulss %xmm1, %xmm7 694 subss %xmm7, %xmm0 695 696 movaps 0 * SIZE(B), %xmm6 697 pshufd $0x00, %xmm6, %xmm7 698 mulss %xmm7, %xmm0 699#endif 700 701#if defined(LN) || defined(LT) 702 movaps %xmm1, 0 * SIZE(B) 703 704 pshufd $0x00, %xmm1, %xmm0 705 pshufd $0x55, %xmm1, %xmm2 706 pshufd $0xaa, %xmm1, %xmm4 707 pshufd $0xff, %xmm1, %xmm6 708 movaps %xmm0, 0 * SIZE(BB) 709 movaps %xmm2, 4 * SIZE(BB) 710 movaps %xmm4, 8 * SIZE(BB) 711 movaps %xmm6, 12 * SIZE(BB) 712#else 713 movss %xmm0, 0 * SIZE(AA) 714 movss %xmm1, 1 * SIZE(AA) 715 movss %xmm2, 2 * SIZE(AA) 716 movss %xmm3, 3 * SIZE(AA) 717#endif 718 719#ifdef LN 720 subl $1 * SIZE, CO1 721#endif 722 723 leal (LDC, LDC, 2), %eax 724 725#if defined(LN) || defined(LT) 726 movaps %xmm1, %xmm0 727 unpcklps %xmm5, %xmm1 728 unpckhps %xmm5, %xmm0 729 730 movaps %xmm3, %xmm4 731 unpcklps %xmm7, %xmm3 732 unpckhps %xmm7, %xmm4 733 734 movaps %xmm1, 
%xmm2 735 unpcklps %xmm3, %xmm1 736 unpckhps %xmm3, %xmm2 737 738 movaps %xmm0, %xmm6 739 unpcklps %xmm4, %xmm0 740 unpckhps %xmm4, %xmm6 741 742 movss %xmm1, 0 * SIZE(CO1) 743 movss %xmm2, 0 * SIZE(CO1, LDC, 1) 744 movss %xmm0, 0 * SIZE(CO1, LDC, 2) 745 movss %xmm6, 0 * SIZE(CO1, %eax, 1) 746#else 747 movss %xmm0, 0 * SIZE(CO1) 748 movss %xmm1, 0 * SIZE(CO1, LDC, 1) 749 movss %xmm2, 0 * SIZE(CO1, LDC, 2) 750 movss %xmm3, 0 * SIZE(CO1, %eax, 1) 751#endif 752 753#ifndef LN 754 addl $1 * SIZE, CO1 755#endif 756 757#if defined(LT) || defined(RN) 758 movl K, %eax 759 subl KK, %eax 760 leal (AA, %eax, SIZE), AA 761#ifdef LT 762 addl $4 * SIZE, B 763#endif 764#endif 765 766#ifdef LN 767 subl $1, KK 768 movl BORIG, B 769#endif 770 771#ifdef LT 772 addl $1, KK 773#endif 774 775#ifdef RT 776 movl K, %eax 777 movl BORIG, B 778 sall $BASE_SHIFT, %eax 779 addl %eax, AORIG 780#endif 781 ALIGN_4 782 783.L20: 784 testl $2, M 785 je .L30 786 787#ifdef LN 788 movl K, %eax 789 sall $1 + BASE_SHIFT, %eax 790 subl %eax, AORIG 791#endif 792 793#if defined(LN) || defined(RT) 794 movl KK, %eax 795 movl AORIG, AA 796 leal (, %eax, SIZE), %eax 797 leal (AA, %eax, 2), AA 798#endif 799 800 leal BUFFER, BB 801 802#if defined(LN) || defined(RT) 803 movl KK, %eax 804 sall $2 + BASE_SHIFT, %eax 805 leal (BB, %eax, 4), BB 806#endif 807 808#ifdef movsd 809 xorps %xmm0, %xmm0 810#endif 811 movsd 0 * SIZE(AA), %xmm0 812 xorps %xmm4, %xmm4 813#ifdef movsd 814 xorps %xmm1, %xmm1 815#endif 816 movsd 8 * SIZE(AA), %xmm1 817 xorps %xmm5, %xmm5 818 movaps 0 * SIZE(BB), %xmm2 819 xorps %xmm6, %xmm6 820 movaps 16 * SIZE(BB), %xmm3 821 xorps %xmm7, %xmm7 822 823#if defined(LT) || defined(RN) 824 movl KK, %eax 825#else 826 movl K, %eax 827 subl KK, %eax 828#endif 829 sarl $3, %eax 830 je .L25 831 ALIGN_4 832 833.L22: 834 mulps %xmm0, %xmm2 835 addps %xmm2, %xmm4 836#if defined(OPTERON) || defined(BARCELONA) 837 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 838#endif 839 movaps 4 * SIZE(BB), %xmm2 840 mulps %xmm0, 
%xmm2 841 addps %xmm2, %xmm5 842 movaps 8 * SIZE(BB), %xmm2 843 mulps %xmm0, %xmm2 844 addps %xmm2, %xmm6 845 movaps 12 * SIZE(BB), %xmm2 846 mulps %xmm0, %xmm2 847 movsd 2 * SIZE(AA), %xmm0 848 addps %xmm2, %xmm7 849 movaps 32 * SIZE(BB), %xmm2 850 851 mulps %xmm0, %xmm3 852 addps %xmm3, %xmm4 853 movaps 20 * SIZE(BB), %xmm3 854 mulps %xmm0, %xmm3 855 addps %xmm3, %xmm5 856 movaps 24 * SIZE(BB), %xmm3 857 mulps %xmm0, %xmm3 858 addps %xmm3, %xmm6 859 movaps 28 * SIZE(BB), %xmm3 860 mulps %xmm0, %xmm3 861 movsd 4 * SIZE(AA), %xmm0 862 addps %xmm3, %xmm7 863 movaps 48 * SIZE(BB), %xmm3 864 865 mulps %xmm0, %xmm2 866 addps %xmm2, %xmm4 867 movaps 36 * SIZE(BB), %xmm2 868 mulps %xmm0, %xmm2 869 addps %xmm2, %xmm5 870 movaps 40 * SIZE(BB), %xmm2 871 mulps %xmm0, %xmm2 872 addps %xmm2, %xmm6 873 movaps 44 * SIZE(BB), %xmm2 874 mulps %xmm0, %xmm2 875 movsd 6 * SIZE(AA), %xmm0 876 addps %xmm2, %xmm7 877 movaps 64 * SIZE(BB), %xmm2 878 879 mulps %xmm0, %xmm3 880 addps %xmm3, %xmm4 881 movaps 52 * SIZE(BB), %xmm3 882 mulps %xmm0, %xmm3 883 addps %xmm3, %xmm5 884 movaps 56 * SIZE(BB), %xmm3 885 mulps %xmm0, %xmm3 886 addps %xmm3, %xmm6 887 movaps 60 * SIZE(BB), %xmm3 888 mulps %xmm0, %xmm3 889 movsd 16 * SIZE(AA), %xmm0 890 addps %xmm3, %xmm7 891 movaps 80 * SIZE(BB), %xmm3 892 893 mulps %xmm1, %xmm2 894 addps %xmm2, %xmm4 895 movaps 68 * SIZE(BB), %xmm2 896 mulps %xmm1, %xmm2 897 addps %xmm2, %xmm5 898 movaps 72 * SIZE(BB), %xmm2 899 mulps %xmm1, %xmm2 900 addps %xmm2, %xmm6 901 movaps 76 * SIZE(BB), %xmm2 902 mulps %xmm1, %xmm2 903 movsd 10 * SIZE(AA), %xmm1 904 addps %xmm2, %xmm7 905 movaps 96 * SIZE(BB), %xmm2 906 907 mulps %xmm1, %xmm3 908 addps %xmm3, %xmm4 909 movaps 84 * SIZE(BB), %xmm3 910 mulps %xmm1, %xmm3 911 addps %xmm3, %xmm5 912 movaps 88 * SIZE(BB), %xmm3 913 mulps %xmm1, %xmm3 914 addps %xmm3, %xmm6 915 movaps 92 * SIZE(BB), %xmm3 916 mulps %xmm1, %xmm3 917 movsd 12 * SIZE(AA), %xmm1 918 addps %xmm3, %xmm7 919 movaps 112 * SIZE(BB), %xmm3 920 921 mulps 
%xmm1, %xmm2 922 addps %xmm2, %xmm4 923 movaps 100 * SIZE(BB), %xmm2 924 mulps %xmm1, %xmm2 925 addps %xmm2, %xmm5 926 movaps 104 * SIZE(BB), %xmm2 927 mulps %xmm1, %xmm2 928 addps %xmm2, %xmm6 929 movaps 108 * SIZE(BB), %xmm2 930 mulps %xmm1, %xmm2 931 movsd 14 * SIZE(AA), %xmm1 932 addps %xmm2, %xmm7 933 movaps 128 * SIZE(BB), %xmm2 934 935 mulps %xmm1, %xmm3 936 addps %xmm3, %xmm4 937 movaps 116 * SIZE(BB), %xmm3 938 mulps %xmm1, %xmm3 939 addps %xmm3, %xmm5 940 movaps 120 * SIZE(BB), %xmm3 941 mulps %xmm1, %xmm3 942 addps %xmm3, %xmm6 943 movaps 124 * SIZE(BB), %xmm3 944 mulps %xmm1, %xmm3 945 movsd 24 * SIZE(AA), %xmm1 946 addps %xmm3, %xmm7 947 movaps 144 * SIZE(BB), %xmm3 948 949 addl $ 16 * SIZE, AA 950 addl $128 * SIZE, BB 951 decl %eax 952 jne .L22 953 ALIGN_4 954 955.L25: 956#if defined(LT) || defined(RN) 957 movl KK, %eax 958#else 959 movl K, %eax 960 subl KK, %eax 961#endif 962 andl $7, %eax # if (k & 1) 963 BRANCH 964 je .L28 965 ALIGN_4 966 967.L26: 968 mulps %xmm0, %xmm2 969 addps %xmm2, %xmm4 970 movaps 4 * SIZE(BB), %xmm2 971 mulps %xmm0, %xmm2 972 addps %xmm2, %xmm5 973 movaps 8 * SIZE(BB), %xmm2 974 mulps %xmm0, %xmm2 975 addps %xmm2, %xmm6 976 movaps 12 * SIZE(BB), %xmm2 977 mulps %xmm0, %xmm2 978 movsd 2 * SIZE(AA), %xmm0 979 addps %xmm2, %xmm7 980 movaps 16 * SIZE(BB), %xmm2 981 982 addl $ 2 * SIZE, AA 983 addl $16 * SIZE, BB 984 decl %eax 985 jg .L26 986 ALIGN_4 987 988.L28: 989#if defined(LN) || defined(RT) 990 movl KK, %eax 991#ifdef LN 992 subl $2, %eax 993#else 994 subl $4, %eax 995#endif 996 997 movl AORIG, AA 998 movl BORIG, B 999 leal BUFFER, BB 1000 1001 sall $1 + BASE_SHIFT, %eax 1002 leal (AA, %eax, 1), AA 1003 leal (B, %eax, 2), B 1004 leal (BB, %eax, 8), BB 1005#endif 1006 1007#if defined(LN) || defined(LT) 1008 unpcklps %xmm6, %xmm4 1009 unpcklps %xmm7, %xmm5 1010 1011 movaps %xmm4, %xmm6 1012 unpcklps %xmm5, %xmm4 1013 unpckhps %xmm5, %xmm6 1014 1015 movaps 0 * SIZE(B), %xmm1 1016 movaps 4 * SIZE(B), %xmm3 1017 1018 subps 
%xmm4, %xmm1 1019 subps %xmm6, %xmm3 1020#else 1021#ifdef movsd 1022 xorps %xmm0, %xmm0 1023#endif 1024 movsd 0 * SIZE(AA), %xmm0 1025#ifdef movsd 1026 xorps %xmm1, %xmm1 1027#endif 1028 movsd 2 * SIZE(AA), %xmm1 1029#ifdef movsd 1030 xorps %xmm2, %xmm2 1031#endif 1032 movsd 4 * SIZE(AA), %xmm2 1033#ifdef movsd 1034 xorps %xmm3, %xmm3 1035#endif 1036 movsd 6 * SIZE(AA), %xmm3 1037 1038 subps %xmm4, %xmm0 1039 subps %xmm5, %xmm1 1040 subps %xmm6, %xmm2 1041 subps %xmm7, %xmm3 1042#endif 1043 1044#ifdef LN 1045 movaps 0 * SIZE(AA), %xmm4 1046 pshufd $0xff, %xmm4, %xmm6 1047 mulps %xmm6, %xmm3 1048 pshufd $0xaa, %xmm4, %xmm6 1049 mulps %xmm3, %xmm6 1050 subps %xmm6, %xmm1 1051 1052 pshufd $0x00, %xmm4, %xmm6 1053 mulps %xmm6, %xmm1 1054#endif 1055 1056#ifdef LT 1057 movaps 0 * SIZE(AA), %xmm4 1058 pshufd $0x00, %xmm4, %xmm6 1059 mulps %xmm6, %xmm1 1060 1061 pshufd $0x55, %xmm4, %xmm6 1062 mulps %xmm1, %xmm6 1063 subps %xmm6, %xmm3 1064 1065 pshufd $0xff, %xmm4, %xmm6 1066 mulps %xmm6, %xmm3 1067#endif 1068 1069#ifdef RN 1070 movaps 0 * SIZE(B), %xmm6 1071 pshufd $0x00, %xmm6, %xmm7 1072 mulps %xmm7, %xmm0 1073 pshufd $0x55, %xmm6, %xmm7 1074 mulps %xmm0, %xmm7 1075 subps %xmm7, %xmm1 1076 pshufd $0xaa, %xmm6, %xmm7 1077 mulps %xmm0, %xmm7 1078 subps %xmm7, %xmm2 1079 pshufd $0xff, %xmm6, %xmm7 1080 mulps %xmm0, %xmm7 1081 subps %xmm7, %xmm3 1082 1083 movaps 4 * SIZE(B), %xmm6 1084 pshufd $0x55, %xmm6, %xmm7 1085 mulps %xmm7, %xmm1 1086 pshufd $0xaa, %xmm6, %xmm7 1087 mulps %xmm1, %xmm7 1088 subps %xmm7, %xmm2 1089 pshufd $0xff, %xmm6, %xmm7 1090 mulps %xmm1, %xmm7 1091 subps %xmm7, %xmm3 1092 1093 movaps 8 * SIZE(B), %xmm6 1094 pshufd $0xaa, %xmm6, %xmm7 1095 mulps %xmm7, %xmm2 1096 pshufd $0xff, %xmm6, %xmm7 1097 mulps %xmm2, %xmm7 1098 subps %xmm7, %xmm3 1099 1100 movaps 12 * SIZE(B), %xmm6 1101 pshufd $0xff, %xmm6, %xmm7 1102 mulps %xmm7, %xmm3 1103#endif 1104 1105#ifdef RT 1106 movaps 12 * SIZE(B), %xmm6 1107 pshufd $0xff, %xmm6, %xmm7 1108 mulps %xmm7, %xmm3 1109 
pshufd $0xaa, %xmm6, %xmm7 1110 mulps %xmm3, %xmm7 1111 subps %xmm7, %xmm2 1112 pshufd $0x55, %xmm6, %xmm7 1113 mulps %xmm3, %xmm7 1114 subps %xmm7, %xmm1 1115 pshufd $0x00, %xmm6, %xmm7 1116 mulps %xmm3, %xmm7 1117 subps %xmm7, %xmm0 1118 1119 movaps 8 * SIZE(B), %xmm6 1120 pshufd $0xaa, %xmm6, %xmm7 1121 mulps %xmm7, %xmm2 1122 pshufd $0x55, %xmm6, %xmm7 1123 mulps %xmm2, %xmm7 1124 subps %xmm7, %xmm1 1125 pshufd $0x00, %xmm6, %xmm7 1126 mulps %xmm2, %xmm7 1127 subps %xmm7, %xmm0 1128 1129 movaps 4 * SIZE(B), %xmm6 1130 pshufd $0x55, %xmm6, %xmm7 1131 mulps %xmm7, %xmm1 1132 pshufd $0x00, %xmm6, %xmm7 1133 mulps %xmm1, %xmm7 1134 subps %xmm7, %xmm0 1135 1136 movaps 0 * SIZE(B), %xmm6 1137 pshufd $0x00, %xmm6, %xmm7 1138 mulps %xmm7, %xmm0 1139#endif 1140 1141#if defined(LN) || defined(LT) 1142 movaps %xmm1, 0 * SIZE(B) 1143 movaps %xmm3, 4 * SIZE(B) 1144 1145 pshufd $0x00, %xmm1, %xmm0 1146 pshufd $0x55, %xmm1, %xmm2 1147 pshufd $0xaa, %xmm1, %xmm4 1148 pshufd $0xff, %xmm1, %xmm6 1149 movaps %xmm0, 0 * SIZE(BB) 1150 movaps %xmm2, 4 * SIZE(BB) 1151 movaps %xmm4, 8 * SIZE(BB) 1152 movaps %xmm6, 12 * SIZE(BB) 1153 1154 pshufd $0x00, %xmm3, %xmm0 1155 pshufd $0x55, %xmm3, %xmm2 1156 pshufd $0xaa, %xmm3, %xmm4 1157 pshufd $0xff, %xmm3, %xmm6 1158 movaps %xmm0, 16 * SIZE(BB) 1159 movaps %xmm2, 20 * SIZE(BB) 1160 movaps %xmm4, 24 * SIZE(BB) 1161 movaps %xmm6, 28 * SIZE(BB) 1162#else 1163 movlps %xmm0, 0 * SIZE(AA) 1164 movlps %xmm1, 2 * SIZE(AA) 1165 movlps %xmm2, 4 * SIZE(AA) 1166 movlps %xmm3, 6 * SIZE(AA) 1167#endif 1168 1169#ifdef LN 1170 subl $2 * SIZE, CO1 1171#endif 1172 1173 leal (LDC, LDC, 2), %eax 1174 1175#if defined(LN) || defined(LT) 1176 movaps %xmm1, %xmm0 1177 unpcklps %xmm5, %xmm1 1178 unpckhps %xmm5, %xmm0 1179 1180 movaps %xmm3, %xmm4 1181 unpcklps %xmm7, %xmm3 1182 unpckhps %xmm7, %xmm4 1183 1184 movaps %xmm1, %xmm2 1185 unpcklps %xmm3, %xmm1 1186 unpckhps %xmm3, %xmm2 1187 1188 movaps %xmm0, %xmm6 1189 unpcklps %xmm4, %xmm0 1190 unpckhps %xmm4, 
%xmm6 1191 1192 movlps %xmm1, 0 * SIZE(CO1) 1193 movlps %xmm2, 0 * SIZE(CO1, LDC, 1) 1194 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) 1195 movlps %xmm6, 0 * SIZE(CO1, %eax, 1) 1196#else 1197 movlps %xmm0, 0 * SIZE(CO1) 1198 movlps %xmm1, 0 * SIZE(CO1, LDC, 1) 1199 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 1200 movlps %xmm3, 0 * SIZE(CO1, %eax, 1) 1201#endif 1202 1203#ifndef LN 1204 addl $2 * SIZE, CO1 1205#endif 1206 1207#if defined(LT) || defined(RN) 1208 movl K, %eax 1209 subl KK, %eax 1210 leal (,%eax, SIZE), %eax 1211 leal (AA, %eax, 2), AA 1212#ifdef LT 1213 addl $8 * SIZE, B 1214#endif 1215#endif 1216 1217#ifdef LN 1218 subl $2, KK 1219 movl BORIG, B 1220#endif 1221 1222#ifdef LT 1223 addl $2, KK 1224#endif 1225 1226#ifdef RT 1227 movl K, %eax 1228 movl BORIG, B 1229 sall $1 + BASE_SHIFT, %eax 1230 addl %eax, AORIG 1231#endif 1232 ALIGN_4 1233 1234.L30: 1235 movl M, %ebx 1236 sarl $2, %ebx # i = (m >> 2) 1237 jle .L39 1238 ALIGN_4 1239 1240.L11: 1241#ifdef LN 1242 movl K, %eax 1243 sall $2 + BASE_SHIFT, %eax 1244 subl %eax, AORIG 1245#endif 1246 1247#if defined(LN) || defined(RT) 1248 movl KK, %eax 1249 movl AORIG, AA 1250 leal (, %eax, SIZE), %eax 1251 leal (AA, %eax, 4), AA 1252#endif 1253 1254 leal BUFFER, BB 1255 1256#if defined(LN) || defined(RT) 1257 movl KK, %eax 1258 sall $2 + BASE_SHIFT, %eax 1259 leal (BB, %eax, 4), BB 1260#endif 1261 1262 movaps 0 * SIZE(AA), %xmm0 1263 xorps %xmm4, %xmm4 1264 movaps 16 * SIZE(AA), %xmm1 1265 xorps %xmm5, %xmm5 1266 movaps 0 * SIZE(BB), %xmm2 1267 xorps %xmm6, %xmm6 1268 movaps 16 * SIZE(BB), %xmm3 1269 xorps %xmm7, %xmm7 1270 1271 leal (LDC, LDC, 2), %eax 1272 1273 PREFETCHW -4 * SIZE(CO1) 1274 PREFETCHW -4 * SIZE(CO1, LDC) 1275 PREFETCHW -4 * SIZE(CO1, LDC, 2) 1276 PREFETCHW -4 * SIZE(CO1, %eax) 1277 1278#if defined(LT) || defined(RN) 1279 movl KK, %eax 1280#else 1281 movl K, %eax 1282 subl KK, %eax 1283#endif 1284 sarl $3, %eax 1285 je .L15 1286 ALIGN_4 1287 1288.L12: 1289 KERNEL1(0 * 16) 1290 KERNEL2(0 * 16) 1291 KERNEL3(0 * 
16) 1292 KERNEL4(0 * 16) 1293 KERNEL5(0 * 16) 1294 KERNEL6(0 * 16) 1295 KERNEL7(0 * 16) 1296 KERNEL8(0 * 16) 1297 1298 addl $128 * SIZE, BB 1299 addl $32 * SIZE, AA 1300 decl %eax 1301 jne .L12 1302 ALIGN_4 1303 1304.L15: 1305#if defined(LT) || defined(RN) 1306 movl KK, %eax 1307#else 1308 movl K, %eax 1309 subl KK, %eax 1310#endif 1311 andl $7, %eax # if (k & 1) 1312 BRANCH 1313 je .L18 1314 ALIGN_4 1315 1316.L16: 1317 mulps %xmm0, %xmm2 1318 addps %xmm2, %xmm4 1319 movaps 4 * SIZE(BB), %xmm2 1320 mulps %xmm0, %xmm2 1321 addps %xmm2, %xmm5 1322 movaps 8 * SIZE(BB), %xmm2 1323 mulps %xmm0, %xmm2 1324 mulps 12 * SIZE(BB), %xmm0 1325 addps %xmm2, %xmm6 1326 movaps 16 * SIZE(BB), %xmm2 1327 addps %xmm0, %xmm7 1328 movaps 4 * SIZE(AA), %xmm0 1329 1330 addl $ 4 * SIZE, AA 1331 addl $16 * SIZE, BB 1332 decl %eax 1333 jg .L16 1334 ALIGN_4 1335 1336.L18: 1337#if defined(LN) || defined(RT) 1338 movl KK, %eax 1339#ifdef LN 1340 subl $4, %eax 1341#else 1342 subl $4, %eax 1343#endif 1344 1345 movl AORIG, AA 1346 movl BORIG, B 1347 leal BUFFER, BB 1348 1349 sall $2 + BASE_SHIFT, %eax 1350 leal (AA, %eax, 1), AA 1351 leal (B, %eax, 1), B 1352 leal (BB, %eax, 4), BB 1353#endif 1354 1355#if defined(LN) || defined(LT) 1356 movaps %xmm4, %xmm0 1357 unpcklps %xmm6, %xmm4 1358 unpckhps %xmm6, %xmm0 1359 1360 movaps %xmm5, %xmm1 1361 unpcklps %xmm7, %xmm5 1362 unpckhps %xmm7, %xmm1 1363 1364 movaps %xmm4, %xmm6 1365 unpcklps %xmm5, %xmm4 1366 unpckhps %xmm5, %xmm6 1367 1368 movaps %xmm0, %xmm2 1369 unpcklps %xmm1, %xmm0 1370 unpckhps %xmm1, %xmm2 1371 1372 movaps 0 * SIZE(B), %xmm1 1373 movaps 4 * SIZE(B), %xmm3 1374 movaps 8 * SIZE(B), %xmm5 1375 movaps 12 * SIZE(B), %xmm7 1376 1377 subps %xmm4, %xmm1 1378 subps %xmm6, %xmm3 1379 subps %xmm0, %xmm5 1380 subps %xmm2, %xmm7 1381#else 1382 movaps 0 * SIZE(AA), %xmm0 1383 movaps 4 * SIZE(AA), %xmm1 1384 movaps 8 * SIZE(AA), %xmm2 1385 movaps 12 * SIZE(AA), %xmm3 1386 1387 subps %xmm4, %xmm0 1388 subps %xmm5, %xmm1 1389 subps %xmm6, %xmm2 
1390 subps %xmm7, %xmm3 1391#endif 1392 1393#ifdef LN 1394 movaps 12 * SIZE(AA), %xmm4 1395 pshufd $0xff, %xmm4, %xmm6 1396 mulps %xmm6, %xmm7 1397 pshufd $0xaa, %xmm4, %xmm6 1398 mulps %xmm7, %xmm6 1399 subps %xmm6, %xmm5 1400 pshufd $0x55, %xmm4, %xmm6 1401 mulps %xmm7, %xmm6 1402 subps %xmm6, %xmm3 1403 pshufd $0x00, %xmm4, %xmm6 1404 mulps %xmm7, %xmm6 1405 subps %xmm6, %xmm1 1406 1407 movaps 8 * SIZE(AA), %xmm4 1408 pshufd $0xaa, %xmm4, %xmm6 1409 mulps %xmm6, %xmm5 1410 pshufd $0x55, %xmm4, %xmm6 1411 mulps %xmm5, %xmm6 1412 subps %xmm6, %xmm3 1413 pshufd $0x00, %xmm4, %xmm6 1414 mulps %xmm5, %xmm6 1415 subps %xmm6, %xmm1 1416 1417 movaps 4 * SIZE(AA), %xmm4 1418 pshufd $0x55, %xmm4, %xmm6 1419 mulps %xmm6, %xmm3 1420 pshufd $0x00, %xmm4, %xmm6 1421 mulps %xmm3, %xmm6 1422 subps %xmm6, %xmm1 1423 1424 movaps 0 * SIZE(AA), %xmm4 1425 pshufd $0x00, %xmm4, %xmm6 1426 mulps %xmm6, %xmm1 1427#endif 1428 1429#ifdef LT 1430 movaps 0 * SIZE(AA), %xmm4 1431 pshufd $0x00, %xmm4, %xmm6 1432 mulps %xmm6, %xmm1 1433 1434 pshufd $0x55, %xmm4, %xmm6 1435 mulps %xmm1, %xmm6 1436 subps %xmm6, %xmm3 1437 pshufd $0xaa, %xmm4, %xmm6 1438 mulps %xmm1, %xmm6 1439 subps %xmm6, %xmm5 1440 pshufd $0xff, %xmm4, %xmm6 1441 mulps %xmm1, %xmm6 1442 subps %xmm6, %xmm7 1443 1444 movaps 4 * SIZE(AA), %xmm4 1445 pshufd $0x55, %xmm4, %xmm6 1446 mulps %xmm6, %xmm3 1447 pshufd $0xaa, %xmm4, %xmm6 1448 mulps %xmm3, %xmm6 1449 subps %xmm6, %xmm5 1450 pshufd $0xff, %xmm4, %xmm6 1451 mulps %xmm3, %xmm6 1452 subps %xmm6, %xmm7 1453 1454 movaps 8 * SIZE(AA), %xmm4 1455 pshufd $0xaa, %xmm4, %xmm6 1456 mulps %xmm6, %xmm5 1457 pshufd $0xff, %xmm4, %xmm6 1458 mulps %xmm5, %xmm6 1459 subps %xmm6, %xmm7 1460 1461 movaps 12 * SIZE(AA), %xmm4 1462 pshufd $0xff, %xmm4, %xmm6 1463 mulps %xmm6, %xmm7 1464#endif 1465 1466#ifdef RN 1467 movaps 0 * SIZE(B), %xmm6 1468 pshufd $0x00, %xmm6, %xmm7 1469 mulps %xmm7, %xmm0 1470 pshufd $0x55, %xmm6, %xmm7 1471 mulps %xmm0, %xmm7 1472 subps %xmm7, %xmm1 1473 pshufd 
$0xaa, %xmm6, %xmm7 1474 mulps %xmm0, %xmm7 1475 subps %xmm7, %xmm2 1476 pshufd $0xff, %xmm6, %xmm7 1477 mulps %xmm0, %xmm7 1478 subps %xmm7, %xmm3 1479 1480 movaps 4 * SIZE(B), %xmm6 1481 pshufd $0x55, %xmm6, %xmm7 1482 mulps %xmm7, %xmm1 1483 pshufd $0xaa, %xmm6, %xmm7 1484 mulps %xmm1, %xmm7 1485 subps %xmm7, %xmm2 1486 pshufd $0xff, %xmm6, %xmm7 1487 mulps %xmm1, %xmm7 1488 subps %xmm7, %xmm3 1489 1490 movaps 8 * SIZE(B), %xmm6 1491 pshufd $0xaa, %xmm6, %xmm7 1492 mulps %xmm7, %xmm2 1493 pshufd $0xff, %xmm6, %xmm7 1494 mulps %xmm2, %xmm7 1495 subps %xmm7, %xmm3 1496 1497 movaps 12 * SIZE(B), %xmm6 1498 pshufd $0xff, %xmm6, %xmm7 1499 mulps %xmm7, %xmm3 1500#endif 1501 1502#ifdef RT 1503 movaps 12 * SIZE(B), %xmm6 1504 pshufd $0xff, %xmm6, %xmm7 1505 mulps %xmm7, %xmm3 1506 pshufd $0xaa, %xmm6, %xmm7 1507 mulps %xmm3, %xmm7 1508 subps %xmm7, %xmm2 1509 pshufd $0x55, %xmm6, %xmm7 1510 mulps %xmm3, %xmm7 1511 subps %xmm7, %xmm1 1512 pshufd $0x00, %xmm6, %xmm7 1513 mulps %xmm3, %xmm7 1514 subps %xmm7, %xmm0 1515 1516 movaps 8 * SIZE(B), %xmm6 1517 pshufd $0xaa, %xmm6, %xmm7 1518 mulps %xmm7, %xmm2 1519 pshufd $0x55, %xmm6, %xmm7 1520 mulps %xmm2, %xmm7 1521 subps %xmm7, %xmm1 1522 pshufd $0x00, %xmm6, %xmm7 1523 mulps %xmm2, %xmm7 1524 subps %xmm7, %xmm0 1525 1526 movaps 4 * SIZE(B), %xmm6 1527 pshufd $0x55, %xmm6, %xmm7 1528 mulps %xmm7, %xmm1 1529 pshufd $0x00, %xmm6, %xmm7 1530 mulps %xmm1, %xmm7 1531 subps %xmm7, %xmm0 1532 1533 movaps 0 * SIZE(B), %xmm6 1534 pshufd $0x00, %xmm6, %xmm7 1535 mulps %xmm7, %xmm0 1536#endif 1537 1538#if defined(LN) || defined(LT) 1539 movaps %xmm1, 0 * SIZE(B) 1540 movaps %xmm3, 4 * SIZE(B) 1541 movaps %xmm5, 8 * SIZE(B) 1542 movaps %xmm7, 12 * SIZE(B) 1543 1544 pshufd $0x00, %xmm1, %xmm0 1545 pshufd $0x55, %xmm1, %xmm2 1546 pshufd $0xaa, %xmm1, %xmm4 1547 pshufd $0xff, %xmm1, %xmm6 1548 movaps %xmm0, 0 * SIZE(BB) 1549 movaps %xmm2, 4 * SIZE(BB) 1550 movaps %xmm4, 8 * SIZE(BB) 1551 movaps %xmm6, 12 * SIZE(BB) 1552 1553 pshufd 
$0x00, %xmm3, %xmm0 1554 pshufd $0x55, %xmm3, %xmm2 1555 pshufd $0xaa, %xmm3, %xmm4 1556 pshufd $0xff, %xmm3, %xmm6 1557 movaps %xmm0, 16 * SIZE(BB) 1558 movaps %xmm2, 20 * SIZE(BB) 1559 movaps %xmm4, 24 * SIZE(BB) 1560 movaps %xmm6, 28 * SIZE(BB) 1561 1562 pshufd $0x00, %xmm5, %xmm0 1563 pshufd $0x55, %xmm5, %xmm2 1564 pshufd $0xaa, %xmm5, %xmm4 1565 pshufd $0xff, %xmm5, %xmm6 1566 movaps %xmm0, 32 * SIZE(BB) 1567 movaps %xmm2, 36 * SIZE(BB) 1568 movaps %xmm4, 40 * SIZE(BB) 1569 movaps %xmm6, 44 * SIZE(BB) 1570 1571 pshufd $0x00, %xmm7, %xmm0 1572 pshufd $0x55, %xmm7, %xmm2 1573 pshufd $0xaa, %xmm7, %xmm4 1574 pshufd $0xff, %xmm7, %xmm6 1575 movaps %xmm0, 48 * SIZE(BB) 1576 movaps %xmm2, 52 * SIZE(BB) 1577 movaps %xmm4, 56 * SIZE(BB) 1578 movaps %xmm6, 60 * SIZE(BB) 1579#else 1580 movaps %xmm0, 0 * SIZE(AA) 1581 movaps %xmm1, 4 * SIZE(AA) 1582 movaps %xmm2, 8 * SIZE(AA) 1583 movaps %xmm3, 12 * SIZE(AA) 1584#endif 1585 1586#ifdef LN 1587 subl $4 * SIZE, CO1 1588#endif 1589 1590 leal (LDC, LDC, 2), %eax 1591 1592#if defined(LN) || defined(LT) 1593 movaps %xmm1, %xmm0 1594 unpcklps %xmm5, %xmm1 1595 unpckhps %xmm5, %xmm0 1596 1597 movaps %xmm3, %xmm4 1598 unpcklps %xmm7, %xmm3 1599 unpckhps %xmm7, %xmm4 1600 1601 movaps %xmm1, %xmm2 1602 unpcklps %xmm3, %xmm1 1603 unpckhps %xmm3, %xmm2 1604 1605 movaps %xmm0, %xmm6 1606 unpcklps %xmm4, %xmm0 1607 unpckhps %xmm4, %xmm6 1608 1609 movlps %xmm1, 0 * SIZE(CO1) 1610 movhps %xmm1, 2 * SIZE(CO1) 1611 movlps %xmm2, 0 * SIZE(CO1, LDC, 1) 1612 movhps %xmm2, 2 * SIZE(CO1, LDC, 1) 1613 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) 1614 movhps %xmm0, 2 * SIZE(CO1, LDC, 2) 1615 movlps %xmm6, 0 * SIZE(CO1, %eax, 1) 1616 movhps %xmm6, 2 * SIZE(CO1, %eax, 1) 1617#else 1618 movlps %xmm0, 0 * SIZE(CO1) 1619 movhps %xmm0, 2 * SIZE(CO1) 1620 movlps %xmm1, 0 * SIZE(CO1, LDC, 1) 1621 movhps %xmm1, 2 * SIZE(CO1, LDC, 1) 1622 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 1623 movhps %xmm2, 2 * SIZE(CO1, LDC, 2) 1624 movlps %xmm3, 0 * SIZE(CO1, %eax, 1) 1625 
movhps %xmm3, 2 * SIZE(CO1, %eax, 1) 1626#endif 1627 1628#ifndef LN 1629 addl $4 * SIZE, CO1 1630#endif 1631 1632#if defined(LT) || defined(RN) 1633 movl K, %eax 1634 subl KK, %eax 1635 leal (,%eax, SIZE), %eax 1636 leal (AA, %eax, 4), AA 1637#ifdef LT 1638 addl $16 * SIZE, B 1639#endif 1640#endif 1641 1642#ifdef LN 1643 subl $4, KK 1644 movl BORIG, B 1645#endif 1646 1647#ifdef LT 1648 addl $4, KK 1649#endif 1650 1651#ifdef RT 1652 movl K, %eax 1653 movl BORIG, B 1654 sall $2 + BASE_SHIFT, %eax 1655 addl %eax, AORIG 1656#endif 1657 1658 decl %ebx # i -- 1659 jg .L11 1660 ALIGN_4 1661 1662.L39: 1663#ifdef LN 1664 movl K, %eax 1665 leal (, %eax, SIZE), %eax 1666 leal (B, %eax, 4), B 1667#endif 1668 1669#if defined(LT) || defined(RN) 1670 movl K, %eax 1671 subl KK, %eax 1672 leal (,%eax, SIZE), %eax 1673 leal (B, %eax, 4), B 1674#endif 1675 1676#ifdef RN 1677 addl $4, KK 1678#endif 1679 1680#ifdef RT 1681 subl $4, KK 1682#endif 1683 1684 decl J # j -- 1685 jg .L01 1686 ALIGN_4 1687 1688.L40: 1689 testl $2, N 1690 je .L80 1691 1692#ifdef LN 1693 movl OFFSET, %eax 1694 addl M, %eax 1695 movl %eax, KK 1696#endif 1697 1698 leal BUFFER, %ecx 1699 1700#ifdef RT 1701 movl K, %eax 1702 sall $1 + BASE_SHIFT, %eax 1703 subl %eax, B 1704#endif 1705 1706#if defined(LN) || defined(RT) 1707 movl KK, %eax 1708 movl B, BORIG 1709 sall $1 + BASE_SHIFT, %eax 1710 leal (B, %eax, 1), B 1711 leal (BB, %eax, 4), BB 1712#endif 1713 1714#ifdef LT 1715 movl OFFSET, %eax 1716 movl %eax, KK 1717#endif 1718 1719#if defined(LT) || defined(RN) 1720 movl KK, %eax 1721#else 1722 movl K, %eax 1723 subl KK, %eax 1724#endif 1725 sarl $2, %eax 1726 jle .L45 1727 ALIGN_4 1728 1729.L42: 1730 movaps 0 * SIZE(B), %xmm3 1731 movaps 4 * SIZE(B), %xmm7 1732 1733 pshufd $0x00, %xmm3, %xmm0 1734 pshufd $0x55, %xmm3, %xmm1 1735 pshufd $0xaa, %xmm3, %xmm2 1736 pshufd $0xff, %xmm3, %xmm3 1737 1738 pshufd $0x00, %xmm7, %xmm4 1739 pshufd $0x55, %xmm7, %xmm5 1740 pshufd $0xaa, %xmm7, %xmm6 1741 pshufd $0xff, %xmm7, 
%xmm7 1742 1743 movaps %xmm0, 0 * SIZE(BB) 1744 movaps %xmm1, 4 * SIZE(BB) 1745 movaps %xmm2, 8 * SIZE(BB) 1746 movaps %xmm3, 12 * SIZE(BB) 1747 movaps %xmm4, 16 * SIZE(BB) 1748 movaps %xmm5, 20 * SIZE(BB) 1749 movaps %xmm6, 24 * SIZE(BB) 1750 movaps %xmm7, 28 * SIZE(BB) 1751 1752 addl $ 8 * SIZE, B 1753 addl $32 * SIZE, %ecx 1754 decl %eax 1755 jne .L42 1756 ALIGN_4 1757 1758.L45: 1759#if defined(LT) || defined(RN) 1760 movl KK, %eax 1761#else 1762 movl K, %eax 1763 subl KK, %eax 1764#endif 1765 andl $3, %eax 1766 BRANCH 1767 jle .L50 1768 ALIGN_4 1769 1770.L46: 1771#ifdef movsd 1772 xorps %xmm3, %xmm3 1773#endif 1774 movsd 0 * SIZE(B), %xmm3 1775 1776 pshufd $0x00, %xmm3, %xmm0 1777 pshufd $0x55, %xmm3, %xmm1 1778 1779 movaps %xmm0, 0 * SIZE(BB) 1780 movaps %xmm1, 4 * SIZE(BB) 1781 1782 addl $2 * SIZE, B 1783 addl $8 * SIZE, %ecx 1784 decl %eax 1785 jne .L46 1786 ALIGN_4 1787 1788.L50: 1789#if defined(LT) || defined(RN) 1790 movl A, AA 1791#else 1792 movl A, %eax 1793 movl %eax, AORIG 1794#endif 1795 1796 leal (, LDC, 2), %eax 1797 1798#ifdef RT 1799 subl %eax, C 1800#endif 1801 movl C, CO1 1802#ifndef RT 1803 addl %eax, C 1804#endif 1805 1806 testl $1, M 1807 je .L60 1808 1809#ifdef LN 1810 movl K, %eax 1811 sall $BASE_SHIFT, %eax 1812 subl %eax, AORIG 1813#endif 1814 1815#if defined(LN) || defined(RT) 1816 movl KK, %eax 1817 movl AORIG, AA 1818 leal (AA, %eax, SIZE), AA 1819#endif 1820 1821 leal BUFFER, BB 1822 1823#if defined(LN) || defined(RT) 1824 movl KK, %eax 1825 sall $1 + BASE_SHIFT, %eax 1826 leal (BB, %eax, 4), BB 1827#endif 1828 1829 xorps %xmm4, %xmm4 1830 xorps %xmm5, %xmm5 1831 xorps %xmm6, %xmm6 1832 xorps %xmm7, %xmm7 1833 1834 movss 0 * SIZE(AA), %xmm0 1835 movss 4 * SIZE(AA), %xmm1 1836 movss 0 * SIZE(BB), %xmm2 1837 movss 16 * SIZE(BB), %xmm3 1838 1839#if defined(LT) || defined(RN) 1840 movl KK, %eax 1841#else 1842 movl K, %eax 1843 subl KK, %eax 1844#endif 1845 sarl $3, %eax 1846 je .L75 1847 ALIGN_4 1848 1849.L72: 1850 mulss %xmm0, %xmm2 
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	# Remainder of the 8x-unrolled k-loop (.L72) for the M%4==1 column of A
	# against 2 columns of B.  xmm0/xmm1 stream scalars from A; BB holds B
	# values pre-splatted 4-wide (only the scalar lane is used here via
	# mulss/addss).  Accumulators: xmm4/xmm6 -> C column 0, xmm5/xmm7 -> C
	# column 1 (pairs are summed after the loop).
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	32 * SIZE(BB), %xmm2	# preload B for the next iteration
	addss	%xmm0, %xmm7
	movss	 2 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	20 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm4
	movss	24 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm5
	movss	 3 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	28 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	48 * SIZE(BB), %xmm3	# preload B for the next iteration
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0	# preload A for the next iteration
	mulss	%xmm1, %xmm2
	mulss	36 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm4
	movss	40 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm5
	movss	 5 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm2
	mulss	44 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6
	movss	64 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm7
	movss	 6 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	52 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm4
	movss	56 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	 7 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	60 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6
	movss	80 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	# Advance by 8 k-iterations: 8 A scalars, 8 * (2 values splatted
	# 4-wide) = 64 floats of expanded B.
	addl	$ 8 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L72
	ALIGN_4

.L75:
	# Remainder loop trip count: (k or kk, depending on variant) mod 8.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# k & 7 (remainder of the 8x unroll)
	BRANCH
	je	.L78
	ALIGN_4

.L76:
	# One k-iteration: 1 A scalar times the 2 splatted B values.
	mulss	%xmm0, %xmm2
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0

	addl	$ 1 * SIZE, AA
	addl	$ 8 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

.L78:
1935 addss %xmm6, %xmm4 1936 addss %xmm7, %xmm5 1937 1938#if defined(LN) || defined(RT) 1939 movl KK, %eax 1940#ifdef LN 1941 subl $1, %eax 1942#else 1943 subl $2, %eax 1944#endif 1945 1946 movl AORIG, AA 1947 movl BORIG, B 1948 leal BUFFER, BB 1949 1950 sall $BASE_SHIFT, %eax 1951 leal (AA, %eax, 1), AA 1952 leal (B, %eax, 2), B 1953 leal (BB, %eax, 8), BB 1954#endif 1955 1956#if defined(LN) || defined(LT) 1957 unpcklps %xmm5, %xmm4 1958 1959#ifdef movsd 1960 xorps %xmm1, %xmm1 1961#endif 1962 movsd 0 * SIZE(B), %xmm1 1963 1964 subps %xmm4, %xmm1 1965#else 1966 movss 0 * SIZE(AA), %xmm0 1967 movss 1 * SIZE(AA), %xmm1 1968 1969 subss %xmm4, %xmm0 1970 subss %xmm5, %xmm1 1971#endif 1972 1973#if defined(LN) || defined(LT) 1974 movss 0 * SIZE(AA), %xmm4 1975 pshufd $0x00, %xmm4, %xmm6 1976 mulps %xmm6, %xmm1 1977#endif 1978 1979#ifdef RN 1980 movaps 0 * SIZE(B), %xmm6 1981 pshufd $0x00, %xmm6, %xmm7 1982 mulss %xmm7, %xmm0 1983 pshufd $0x55, %xmm6, %xmm7 1984 mulss %xmm0, %xmm7 1985 subss %xmm7, %xmm1 1986 1987 pshufd $0xff, %xmm6, %xmm7 1988 mulss %xmm7, %xmm1 1989#endif 1990 1991#ifdef RT 1992 movaps 0 * SIZE(B), %xmm6 1993 pshufd $0xff, %xmm6, %xmm7 1994 mulss %xmm7, %xmm1 1995 pshufd $0xaa, %xmm6, %xmm7 1996 mulss %xmm1, %xmm7 1997 subss %xmm7, %xmm0 1998 1999 pshufd $0x00, %xmm6, %xmm7 2000 mulss %xmm7, %xmm0 2001#endif 2002 2003#if defined(LN) || defined(LT) 2004 movlps %xmm1, 0 * SIZE(B) 2005 2006 pshufd $0x00, %xmm1, %xmm0 2007 pshufd $0x55, %xmm1, %xmm2 2008 movaps %xmm0, 0 * SIZE(BB) 2009 movaps %xmm2, 4 * SIZE(BB) 2010#else 2011 movss %xmm0, 0 * SIZE(AA) 2012 movss %xmm1, 1 * SIZE(AA) 2013#endif 2014 2015#ifdef LN 2016 subl $1 * SIZE, CO1 2017#endif 2018 2019#if defined(LN) || defined(LT) 2020 pshufd $1, %xmm1, %xmm3 2021 2022 movss %xmm1, 0 * SIZE(CO1) 2023 movss %xmm3, 0 * SIZE(CO1, LDC) 2024#else 2025 movss %xmm0, 0 * SIZE(CO1) 2026 movss %xmm1, 0 * SIZE(CO1, LDC) 2027#endif 2028 2029#ifndef LN 2030 addl $1 * SIZE, CO1 2031#endif 2032 2033#if defined(LT) 
|| defined(RN) 2034 movl K, %eax 2035 subl KK, %eax 2036 leal (AA, %eax, SIZE), AA 2037#ifdef LT 2038 addl $2 * SIZE, B 2039#endif 2040#endif 2041 2042#ifdef LN 2043 subl $1, KK 2044 movl BORIG, B 2045#endif 2046 2047#ifdef LT 2048 addl $1, KK 2049#endif 2050 2051#ifdef RT 2052 movl K, %eax 2053 movl BORIG, B 2054 sall $BASE_SHIFT, %eax 2055 addl %eax, AORIG 2056#endif 2057 ALIGN_4 2058 2059 2060.L60: 2061 testl $2, M 2062 je .L70 2063 2064#ifdef LN 2065 movl K, %eax 2066 sall $1 + BASE_SHIFT, %eax 2067 subl %eax, AORIG 2068#endif 2069 2070#if defined(LN) || defined(RT) 2071 movl KK, %eax 2072 movl AORIG, AA 2073 leal (, %eax, SIZE), %eax 2074 leal (AA, %eax, 2), AA 2075#endif 2076 2077 leal BUFFER, BB 2078 2079#if defined(LN) || defined(RT) 2080 movl KK, %eax 2081 sall $1 + BASE_SHIFT, %eax 2082 leal (BB, %eax, 4), BB 2083#endif 2084 2085 xorps %xmm4, %xmm4 2086 xorps %xmm5, %xmm5 2087 xorps %xmm6, %xmm6 2088 xorps %xmm7, %xmm7 2089 2090#ifdef movsd 2091 xorps %xmm0, %xmm0 2092#endif 2093 movsd 0 * SIZE(AA), %xmm0 2094#ifdef movsd 2095 xorps %xmm1, %xmm1 2096#endif 2097 movsd 8 * SIZE(AA), %xmm1 2098 movaps 0 * SIZE(BB), %xmm2 2099 movaps 16 * SIZE(BB), %xmm3 2100 2101#if defined(LT) || defined(RN) 2102 movl KK, %eax 2103#else 2104 movl K, %eax 2105 subl KK, %eax 2106#endif 2107 sarl $3, %eax 2108 je .L65 2109 ALIGN_4 2110 2111.L62: 2112#if defined(OPTERON) || defined(BARCELONA) 2113 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 2114#endif 2115 2116 mulps %xmm0, %xmm2 2117 addps %xmm2, %xmm4 2118 movaps 4 * SIZE(BB), %xmm2 2119 mulps %xmm0, %xmm2 2120 movsd 2 * SIZE(AA), %xmm0 2121 addps %xmm2, %xmm5 2122 movaps 8 * SIZE(BB), %xmm2 2123 2124 mulps %xmm0, %xmm2 2125 addps %xmm2, %xmm6 2126 movaps 12 * SIZE(BB), %xmm2 2127 mulps %xmm0, %xmm2 2128 movsd 4 * SIZE(AA), %xmm0 2129 addps %xmm2, %xmm7 2130 movaps 32 * SIZE(BB), %xmm2 2131 2132 mulps %xmm0, %xmm3 2133 addps %xmm3, %xmm4 2134 movaps 20 * SIZE(BB), %xmm3 2135 mulps %xmm0, %xmm3 2136 movsd 6 * SIZE(AA), %xmm0 2137 
addps %xmm3, %xmm5 2138 movaps 24 * SIZE(BB), %xmm3 2139 2140 mulps %xmm0, %xmm3 2141 addps %xmm3, %xmm6 2142 movaps 28 * SIZE(BB), %xmm3 2143 mulps %xmm0, %xmm3 2144 movsd 16 * SIZE(AA), %xmm0 2145 addps %xmm3, %xmm7 2146 movaps 48 * SIZE(BB), %xmm3 2147 2148 mulps %xmm1, %xmm2 2149 addps %xmm2, %xmm4 2150 movaps 36 * SIZE(BB), %xmm2 2151 mulps %xmm1, %xmm2 2152 movsd 10 * SIZE(AA), %xmm1 2153 addps %xmm2, %xmm5 2154 movaps 40 * SIZE(BB), %xmm2 2155 2156 mulps %xmm1, %xmm2 2157 addps %xmm2, %xmm6 2158 movaps 44 * SIZE(BB), %xmm2 2159 mulps %xmm1, %xmm2 2160 movsd 12 * SIZE(AA), %xmm1 2161 addps %xmm2, %xmm7 2162 movaps 64 * SIZE(BB), %xmm2 2163 2164 mulps %xmm1, %xmm3 2165 addps %xmm3, %xmm4 2166 movaps 52 * SIZE(BB), %xmm3 2167 mulps %xmm1, %xmm3 2168 movsd 14 * SIZE(AA), %xmm1 2169 addps %xmm3, %xmm5 2170 movaps 56 * SIZE(BB), %xmm3 2171 2172 mulps %xmm1, %xmm3 2173 addps %xmm3, %xmm6 2174 movaps 60 * SIZE(BB), %xmm3 2175 mulps %xmm1, %xmm3 2176 movsd 24 * SIZE(AA), %xmm1 2177 addps %xmm3, %xmm7 2178 movaps 80 * SIZE(BB), %xmm3 2179 2180 addl $16 * SIZE, AA 2181 addl $64 * SIZE, BB 2182 decl %eax 2183 jne .L62 2184 ALIGN_4 2185 2186.L65: 2187#if defined(LT) || defined(RN) 2188 movl KK, %eax 2189#else 2190 movl K, %eax 2191 subl KK, %eax 2192#endif 2193 andl $7, %eax # if (k & 1) 2194 BRANCH 2195 je .L68 2196 ALIGN_4 2197 2198.L66: 2199 mulps %xmm0, %xmm2 2200 addps %xmm2, %xmm4 2201 movaps 4 * SIZE(BB), %xmm2 2202 mulps %xmm0, %xmm2 2203 movsd 2 * SIZE(AA), %xmm0 2204 addps %xmm2, %xmm5 2205 movaps 8 * SIZE(BB), %xmm2 2206 2207 addl $2 * SIZE, AA 2208 addl $8 * SIZE, BB 2209 decl %eax 2210 jg .L66 2211 ALIGN_4 2212 2213.L68: 2214 addps %xmm6, %xmm4 2215 addps %xmm7, %xmm5 2216 2217#if defined(LN) || defined(RT) 2218 movl KK, %eax 2219#ifdef LN 2220 subl $2, %eax 2221#else 2222 subl $2, %eax 2223#endif 2224 2225 movl AORIG, AA 2226 movl BORIG, B 2227 leal BUFFER, BB 2228 2229 sall $BASE_SHIFT, %eax 2230 leal (AA, %eax, 2), AA 2231 leal (B, %eax, 2), B 2232 leal 
(BB, %eax, 8), BB 2233#endif 2234 2235#if defined(LN) || defined(LT) 2236 unpcklps %xmm6, %xmm4 2237 unpcklps %xmm7, %xmm5 2238 2239 movaps %xmm4, %xmm6 2240 unpcklps %xmm5, %xmm4 2241 unpckhps %xmm5, %xmm6 2242 2243#ifdef movsd 2244 xorps %xmm1, %xmm1 2245#endif 2246 movsd 0 * SIZE(B), %xmm1 2247#ifdef movsd 2248 xorps %xmm3, %xmm3 2249#endif 2250 movsd 2 * SIZE(B), %xmm3 2251 2252 subps %xmm4, %xmm1 2253 subps %xmm6, %xmm3 2254#else 2255#ifdef movsd 2256 xorps %xmm0, %xmm0 2257#endif 2258 movsd 0 * SIZE(AA), %xmm0 2259#ifdef movsd 2260 xorps %xmm1, %xmm1 2261#endif 2262 movsd 2 * SIZE(AA), %xmm1 2263 2264 subps %xmm4, %xmm0 2265 subps %xmm5, %xmm1 2266#endif 2267 2268#ifdef LN 2269 movaps 0 * SIZE(AA), %xmm4 2270 pshufd $0xff, %xmm4, %xmm6 2271 mulps %xmm6, %xmm3 2272 pshufd $0xaa, %xmm4, %xmm6 2273 mulps %xmm3, %xmm6 2274 subps %xmm6, %xmm1 2275 2276 pshufd $0x00, %xmm4, %xmm6 2277 mulps %xmm6, %xmm1 2278#endif 2279 2280#ifdef LT 2281 movaps 0 * SIZE(AA), %xmm4 2282 pshufd $0x00, %xmm4, %xmm6 2283 mulps %xmm6, %xmm1 2284 pshufd $0x55, %xmm4, %xmm6 2285 mulps %xmm1, %xmm6 2286 subps %xmm6, %xmm3 2287 2288 pshufd $0xff, %xmm4, %xmm6 2289 mulps %xmm6, %xmm3 2290#endif 2291 2292#ifdef RN 2293 movaps 0 * SIZE(B), %xmm6 2294 pshufd $0x00, %xmm6, %xmm7 2295 mulps %xmm7, %xmm0 2296 pshufd $0x55, %xmm6, %xmm7 2297 mulps %xmm0, %xmm7 2298 subps %xmm7, %xmm1 2299 2300 pshufd $0xff, %xmm6, %xmm7 2301 mulps %xmm7, %xmm1 2302#endif 2303 2304#ifdef RT 2305 movaps 0 * SIZE(B), %xmm6 2306 pshufd $0xff, %xmm6, %xmm7 2307 mulps %xmm7, %xmm1 2308 pshufd $0xaa, %xmm6, %xmm7 2309 mulps %xmm1, %xmm7 2310 subps %xmm7, %xmm0 2311 2312 pshufd $0x00, %xmm6, %xmm7 2313 mulps %xmm7, %xmm0 2314#endif 2315 2316#if defined(LN) || defined(LT) 2317 movlps %xmm1, 0 * SIZE(B) 2318 movlps %xmm3, 2 * SIZE(B) 2319 2320 pshufd $0x00, %xmm1, %xmm0 2321 pshufd $0x55, %xmm1, %xmm2 2322 movaps %xmm0, 0 * SIZE(BB) 2323 movaps %xmm2, 4 * SIZE(BB) 2324 2325 pshufd $0x00, %xmm3, %xmm0 2326 pshufd $0x55, 
%xmm3, %xmm2 2327 movaps %xmm0, 8 * SIZE(BB) 2328 movaps %xmm2, 12 * SIZE(BB) 2329#else 2330 movlps %xmm0, 0 * SIZE(AA) 2331 movlps %xmm1, 2 * SIZE(AA) 2332#endif 2333 2334#ifdef LN 2335 subl $2 * SIZE, CO1 2336#endif 2337 2338#if defined(LN) || defined(LT) 2339 unpcklps %xmm3, %xmm1 2340 2341 movlps %xmm1, 0 * SIZE(CO1) 2342 movhps %xmm1, 0 * SIZE(CO1, LDC) 2343#else 2344 movlps %xmm0, 0 * SIZE(CO1) 2345 movlps %xmm1, 0 * SIZE(CO1, LDC) 2346#endif 2347 2348#ifndef LN 2349 addl $2 * SIZE, CO1 2350#endif 2351 2352#if defined(LT) || defined(RN) 2353 movl K, %eax 2354 subl KK, %eax 2355 leal (,%eax, SIZE), %eax 2356 leal (AA, %eax, 2), AA 2357#ifdef LT 2358 addl $4 * SIZE, B 2359#endif 2360#endif 2361 2362#ifdef LN 2363 subl $2, KK 2364 movl BORIG, B 2365#endif 2366 2367#ifdef LT 2368 addl $2, KK 2369#endif 2370 2371#ifdef RT 2372 movl K, %eax 2373 movl BORIG, B 2374 sall $1 + BASE_SHIFT, %eax 2375 addl %eax, AORIG 2376#endif 2377 ALIGN_4 2378 2379.L70: 2380 movl M, %ebx 2381 sarl $2, %ebx # i = (m >> 2) 2382 jle .L79 2383 ALIGN_4 2384 2385.L51: 2386#ifdef LN 2387 movl K, %eax 2388 sall $2 + BASE_SHIFT, %eax 2389 subl %eax, AORIG 2390#endif 2391 2392#if defined(LN) || defined(RT) 2393 movl KK, %eax 2394 movl AORIG, AA 2395 leal (, %eax, SIZE), %eax 2396 leal (AA, %eax, 4), AA 2397#endif 2398 2399 leal BUFFER, BB 2400 2401#if defined(LN) || defined(RT) 2402 movl KK, %eax 2403 sall $1 + BASE_SHIFT, %eax 2404 leal (BB, %eax, 4), BB 2405#endif 2406 2407 xorps %xmm4, %xmm4 2408 xorps %xmm5, %xmm5 2409 xorps %xmm6, %xmm6 2410 xorps %xmm7, %xmm7 2411 2412 movaps 0 * SIZE(AA), %xmm0 2413 movaps 16 * SIZE(AA), %xmm1 2414 movaps 0 * SIZE(BB), %xmm2 2415 movaps 16 * SIZE(BB), %xmm3 2416 2417 PREFETCHW -4 * SIZE(CO1) 2418 PREFETCHW -4 * SIZE(CO1, LDC) 2419 2420#if defined(LT) || defined(RN) 2421 movl KK, %eax 2422#else 2423 movl K, %eax 2424 subl KK, %eax 2425#endif 2426 sarl $3, %eax 2427 je .L55 2428 ALIGN_4 2429 2430.L52: 2431 mulps %xmm0, %xmm2 2432#if defined(OPTERON) || 
defined(BARCELONA) 2433 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 2434#endif 2435 mulps 4 * SIZE(BB), %xmm0 2436 addps %xmm2, %xmm4 2437 movaps 8 * SIZE(BB), %xmm2 2438 addps %xmm0, %xmm5 2439 movaps 4 * SIZE(AA), %xmm0 2440 2441 mulps %xmm0, %xmm2 2442 mulps 12 * SIZE(BB), %xmm0 2443 addps %xmm2, %xmm4 2444 movaps 32 * SIZE(BB), %xmm2 2445 addps %xmm0, %xmm5 2446 movaps 8 * SIZE(AA), %xmm0 2447 2448 mulps %xmm0, %xmm3 2449 mulps 20 * SIZE(BB), %xmm0 2450 addps %xmm3, %xmm4 2451 movaps 24 * SIZE(BB), %xmm3 2452 addps %xmm0, %xmm5 2453 movaps 12 * SIZE(AA), %xmm0 2454 2455 mulps %xmm0, %xmm3 2456 mulps 28 * SIZE(BB), %xmm0 2457 addps %xmm3, %xmm4 2458 movaps 48 * SIZE(BB), %xmm3 2459 addps %xmm0, %xmm5 2460 movaps 32 * SIZE(AA), %xmm0 2461 2462#if defined(OPTERON) || defined(BARCELONA) 2463 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) 2464#endif 2465 mulps %xmm1, %xmm2 2466 mulps 36 * SIZE(BB), %xmm1 2467 addps %xmm2, %xmm4 2468 movaps 40 * SIZE(BB), %xmm2 2469 addps %xmm1, %xmm5 2470 movaps 20 * SIZE(AA), %xmm1 2471 2472 mulps %xmm1, %xmm2 2473 mulps 44 * SIZE(BB), %xmm1 2474 addps %xmm2, %xmm4 2475 movaps 64 * SIZE(BB), %xmm2 2476 addps %xmm1, %xmm5 2477 movaps 24 * SIZE(AA), %xmm1 2478 2479 mulps %xmm1, %xmm3 2480 mulps 52 * SIZE(BB), %xmm1 2481 addps %xmm3, %xmm4 2482 movaps 56 * SIZE(BB), %xmm3 2483 addps %xmm1, %xmm5 2484 movaps 28 * SIZE(AA), %xmm1 2485 2486 mulps %xmm1, %xmm3 2487 mulps 60 * SIZE(BB), %xmm1 2488 addps %xmm3, %xmm4 2489 movaps 80 * SIZE(BB), %xmm3 2490 addps %xmm1, %xmm5 2491 movaps 48 * SIZE(AA), %xmm1 2492 2493 addl $32 * SIZE, AA 2494 addl $64 * SIZE, BB 2495 decl %eax 2496 jne .L52 2497 ALIGN_4 2498 2499.L55: 2500#if defined(LT) || defined(RN) 2501 movl KK, %eax 2502#else 2503 movl K, %eax 2504 subl KK, %eax 2505#endif 2506 andl $7, %eax # if (k & 1) 2507 BRANCH 2508 je .L58 2509 ALIGN_4 2510 2511.L56: 2512 mulps %xmm0, %xmm2 2513 mulps 4 * SIZE(BB), %xmm0 2514 addps %xmm2, %xmm4 2515 movaps 8 * SIZE(BB), %xmm2 2516 addps %xmm0, %xmm5 2517 
movaps 4 * SIZE(AA), %xmm0 2518 2519 addl $4 * SIZE, AA 2520 addl $8 * SIZE, BB 2521 decl %eax 2522 jg .L56 2523 ALIGN_4 2524 2525.L58: 2526#if defined(LN) || defined(RT) 2527 movl KK, %eax 2528#ifdef LN 2529 subl $4, %eax 2530#else 2531 subl $2, %eax 2532#endif 2533 2534 movl AORIG, AA 2535 movl BORIG, B 2536 leal BUFFER, BB 2537 2538 sall $1 + BASE_SHIFT, %eax 2539 leal (AA, %eax, 2), AA 2540 leal (B, %eax, 1), B 2541 leal (BB, %eax, 4), BB 2542#endif 2543 2544#if defined(LN) || defined(LT) 2545 movaps %xmm4, %xmm0 2546 unpcklps %xmm6, %xmm4 2547 unpckhps %xmm6, %xmm0 2548 2549 movaps %xmm5, %xmm1 2550 unpcklps %xmm7, %xmm5 2551 unpckhps %xmm7, %xmm1 2552 2553 movaps %xmm4, %xmm6 2554 unpcklps %xmm5, %xmm4 2555 unpckhps %xmm5, %xmm6 2556 2557 movaps %xmm0, %xmm2 2558 unpcklps %xmm1, %xmm0 2559 unpckhps %xmm1, %xmm2 2560 2561#ifdef movsd 2562 xorps %xmm1, %xmm1 2563#endif 2564 movsd 0 * SIZE(B), %xmm1 2565#ifdef movsd 2566 xorps %xmm3, %xmm3 2567#endif 2568 movsd 2 * SIZE(B), %xmm3 2569#ifdef movsd 2570 xorps %xmm5, %xmm5 2571#endif 2572 movsd 4 * SIZE(B), %xmm5 2573#ifdef movsd 2574 xorps %xmm7, %xmm7 2575#endif 2576 movsd 6 * SIZE(B), %xmm7 2577 2578 subps %xmm4, %xmm1 2579 subps %xmm6, %xmm3 2580 subps %xmm0, %xmm5 2581 subps %xmm2, %xmm7 2582#else 2583 movaps 0 * SIZE(AA), %xmm0 2584 movaps 4 * SIZE(AA), %xmm1 2585 2586 subps %xmm4, %xmm0 2587 subps %xmm5, %xmm1 2588#endif 2589 2590#ifdef LN 2591 movaps 12 * SIZE(AA), %xmm4 2592 pshufd $0xff, %xmm4, %xmm6 2593 mulps %xmm6, %xmm7 2594 pshufd $0xaa, %xmm4, %xmm6 2595 mulps %xmm7, %xmm6 2596 subps %xmm6, %xmm5 2597 pshufd $0x55, %xmm4, %xmm6 2598 mulps %xmm7, %xmm6 2599 subps %xmm6, %xmm3 2600 pshufd $0x00, %xmm4, %xmm6 2601 mulps %xmm7, %xmm6 2602 subps %xmm6, %xmm1 2603 2604 movaps 8 * SIZE(AA), %xmm4 2605 pshufd $0xaa, %xmm4, %xmm6 2606 mulps %xmm6, %xmm5 2607 pshufd $0x55, %xmm4, %xmm6 2608 mulps %xmm5, %xmm6 2609 subps %xmm6, %xmm3 2610 pshufd $0x00, %xmm4, %xmm6 2611 mulps %xmm5, %xmm6 2612 subps %xmm6, 
%xmm1 2613 2614 movaps 4 * SIZE(AA), %xmm4 2615 pshufd $0x55, %xmm4, %xmm6 2616 mulps %xmm6, %xmm3 2617 pshufd $0x00, %xmm4, %xmm6 2618 mulps %xmm3, %xmm6 2619 subps %xmm6, %xmm1 2620 2621 movaps 0 * SIZE(AA), %xmm4 2622 pshufd $0x00, %xmm4, %xmm6 2623 mulps %xmm6, %xmm1 2624#endif 2625 2626#ifdef LT 2627 movaps 0 * SIZE(AA), %xmm4 2628 pshufd $0x00, %xmm4, %xmm6 2629 mulps %xmm6, %xmm1 2630 2631 pshufd $0x55, %xmm4, %xmm6 2632 mulps %xmm1, %xmm6 2633 subps %xmm6, %xmm3 2634 pshufd $0xaa, %xmm4, %xmm6 2635 mulps %xmm1, %xmm6 2636 subps %xmm6, %xmm5 2637 pshufd $0xff, %xmm4, %xmm6 2638 mulps %xmm1, %xmm6 2639 subps %xmm6, %xmm7 2640 2641 movaps 4 * SIZE(AA), %xmm4 2642 pshufd $0x55, %xmm4, %xmm6 2643 mulps %xmm6, %xmm3 2644 pshufd $0xaa, %xmm4, %xmm6 2645 mulps %xmm3, %xmm6 2646 subps %xmm6, %xmm5 2647 pshufd $0xff, %xmm4, %xmm6 2648 mulps %xmm3, %xmm6 2649 subps %xmm6, %xmm7 2650 2651 movaps 8 * SIZE(AA), %xmm4 2652 pshufd $0xaa, %xmm4, %xmm6 2653 mulps %xmm6, %xmm5 2654 pshufd $0xff, %xmm4, %xmm6 2655 mulps %xmm5, %xmm6 2656 subps %xmm6, %xmm7 2657 2658 movaps 12 * SIZE(AA), %xmm4 2659 pshufd $0xff, %xmm4, %xmm6 2660 mulps %xmm6, %xmm7 2661#endif 2662 2663#ifdef RN 2664 movaps 0 * SIZE(B), %xmm6 2665 pshufd $0x00, %xmm6, %xmm7 2666 mulps %xmm7, %xmm0 2667 pshufd $0x55, %xmm6, %xmm7 2668 mulps %xmm0, %xmm7 2669 subps %xmm7, %xmm1 2670 2671 pshufd $0xff, %xmm6, %xmm7 2672 mulps %xmm7, %xmm1 2673#endif 2674 2675#ifdef RT 2676 movaps 0 * SIZE(B), %xmm6 2677 pshufd $0xff, %xmm6, %xmm7 2678 mulps %xmm7, %xmm1 2679 pshufd $0xaa, %xmm6, %xmm7 2680 mulps %xmm1, %xmm7 2681 subps %xmm7, %xmm0 2682 2683 pshufd $0x00, %xmm6, %xmm7 2684 mulps %xmm7, %xmm0 2685#endif 2686 2687#if defined(LN) || defined(LT) 2688 movlps %xmm1, 0 * SIZE(B) 2689 movlps %xmm3, 2 * SIZE(B) 2690 movlps %xmm5, 4 * SIZE(B) 2691 movlps %xmm7, 6 * SIZE(B) 2692 2693 pshufd $0x00, %xmm1, %xmm0 2694 pshufd $0x55, %xmm1, %xmm2 2695 movaps %xmm0, 0 * SIZE(BB) 2696 movaps %xmm2, 4 * SIZE(BB) 2697 2698 pshufd 
$0x00, %xmm3, %xmm0 2699 pshufd $0x55, %xmm3, %xmm2 2700 movaps %xmm0, 8 * SIZE(BB) 2701 movaps %xmm2, 12 * SIZE(BB) 2702 2703 pshufd $0x00, %xmm5, %xmm0 2704 pshufd $0x55, %xmm5, %xmm2 2705 movaps %xmm0, 16 * SIZE(BB) 2706 movaps %xmm2, 20 * SIZE(BB) 2707 2708 pshufd $0x00, %xmm7, %xmm0 2709 pshufd $0x55, %xmm7, %xmm2 2710 movaps %xmm0, 24 * SIZE(BB) 2711 movaps %xmm2, 28 * SIZE(BB) 2712#else 2713 movaps %xmm0, 0 * SIZE(AA) 2714 movaps %xmm1, 4 * SIZE(AA) 2715#endif 2716 2717#ifdef LN 2718 subl $4 * SIZE, CO1 2719#endif 2720 2721#if defined(LN) || defined(LT) 2722 unpcklps %xmm5, %xmm1 2723 unpcklps %xmm7, %xmm3 2724 2725 movaps %xmm1, %xmm2 2726 unpcklps %xmm3, %xmm1 2727 unpckhps %xmm3, %xmm2 2728 2729 movlps %xmm1, 0 * SIZE(CO1) 2730 movhps %xmm1, 2 * SIZE(CO1) 2731 movlps %xmm2, 0 * SIZE(CO1, LDC, 1) 2732 movhps %xmm2, 2 * SIZE(CO1, LDC, 1) 2733#else 2734 movlps %xmm0, 0 * SIZE(CO1) 2735 movhps %xmm0, 2 * SIZE(CO1) 2736 movlps %xmm1, 0 * SIZE(CO1, LDC, 1) 2737 movhps %xmm1, 2 * SIZE(CO1, LDC, 1) 2738#endif 2739 2740#ifndef LN 2741 addl $4 * SIZE, CO1 2742#endif 2743 2744#if defined(LT) || defined(RN) 2745 movl K, %eax 2746 subl KK, %eax 2747 leal (,%eax, SIZE), %eax 2748 leal (AA, %eax, 4), AA 2749#ifdef LT 2750 addl $8 * SIZE, B 2751#endif 2752#endif 2753 2754#ifdef LN 2755 subl $4, KK 2756 movl BORIG, B 2757#endif 2758 2759#ifdef LT 2760 addl $4, KK 2761#endif 2762 2763#ifdef RT 2764 movl K, %eax 2765 movl BORIG, B 2766 sall $2 + BASE_SHIFT, %eax 2767 addl %eax, AORIG 2768#endif 2769 2770 decl %ebx # i -- 2771 jg .L51 2772 ALIGN_4 2773 2774.L79: 2775#ifdef LN 2776 movl K, %eax 2777 leal (, %eax, SIZE), %eax 2778 leal (B, %eax, 2), B 2779#endif 2780 2781#if defined(LT) || defined(RN) 2782 movl K, %eax 2783 subl KK, %eax 2784 leal (,%eax, SIZE), %eax 2785 leal (B, %eax, 2), B 2786#endif 2787 2788#ifdef RN 2789 addl $2, KK 2790#endif 2791 2792#ifdef RT 2793 subl $2, KK 2794#endif 2795 ALIGN_4 2796 2797.L80: 2798 testl $1, N 2799 je .L999 2800 2801#ifdef LN 2802 
movl OFFSET, %eax 2803 addl M, %eax 2804 movl %eax, KK 2805#endif 2806 2807 leal BUFFER, %ecx 2808 2809#ifdef RT 2810 movl K, %eax 2811 sall $BASE_SHIFT, %eax 2812 subl %eax, B 2813#endif 2814 2815#if defined(LN) || defined(RT) 2816 movl KK, %eax 2817 movl B, BORIG 2818 sall $BASE_SHIFT, %eax 2819 leal (B, %eax, 1), B 2820 leal (BB, %eax, 4), BB 2821#endif 2822 2823#ifdef LT 2824 movl OFFSET, %eax 2825 movl %eax, KK 2826#endif 2827 2828#if defined(LT) || defined(RN) 2829 movl KK, %eax 2830#else 2831 movl K, %eax 2832 subl KK, %eax 2833#endif 2834 sarl $3, %eax 2835 jle .L85 2836 ALIGN_4 2837 2838.L82: 2839 movsd 0 * SIZE(B), %xmm3 2840 movhps 2 * SIZE(B), %xmm3 2841 movsd 4 * SIZE(B), %xmm7 2842 movhps 6 * SIZE(B), %xmm7 2843 2844 pshufd $0x00, %xmm3, %xmm0 2845 pshufd $0x55, %xmm3, %xmm1 2846 pshufd $0xaa, %xmm3, %xmm2 2847 pshufd $0xff, %xmm3, %xmm3 2848 2849 pshufd $0x00, %xmm7, %xmm4 2850 pshufd $0x55, %xmm7, %xmm5 2851 pshufd $0xaa, %xmm7, %xmm6 2852 pshufd $0xff, %xmm7, %xmm7 2853 2854 movaps %xmm0, 0 * SIZE(BB) 2855 movaps %xmm1, 4 * SIZE(BB) 2856 movaps %xmm2, 8 * SIZE(BB) 2857 movaps %xmm3, 12 * SIZE(BB) 2858 movaps %xmm4, 16 * SIZE(BB) 2859 movaps %xmm5, 20 * SIZE(BB) 2860 movaps %xmm6, 24 * SIZE(BB) 2861 movaps %xmm7, 28 * SIZE(BB) 2862 2863 addl $ 8 * SIZE, B 2864 addl $32 * SIZE, BB 2865 decl %eax 2866 jne .L82 2867 ALIGN_4 2868 2869.L85: 2870#if defined(LT) || defined(RN) 2871 movl KK, %eax 2872#else 2873 movl K, %eax 2874 subl KK, %eax 2875#endif 2876 andl $7, %eax 2877 BRANCH 2878 jle .L90 2879 ALIGN_4 2880 2881.L86: 2882 movss 0 * SIZE(B), %xmm3 2883 2884 pshufd $0x00, %xmm3, %xmm0 2885 2886 movaps %xmm0, 0 * SIZE(BB) 2887 2888 addl $1 * SIZE, B 2889 addl $4 * SIZE, BB 2890 decl %eax 2891 jne .L86 2892 ALIGN_4 2893 2894.L90: 2895#if defined(LT) || defined(RN) 2896 movl A, AA 2897#else 2898 movl A, %eax 2899 movl %eax, AORIG 2900#endif 2901 2902#ifdef RT 2903 subl LDC, C 2904#endif 2905 movl C, CO1 2906#ifndef RT 2907 addl LDC, C 2908#endif 2909 
	# ----- 1x1 micro-tile: last row of A (M odd) against the last column
	# of B (N odd).  Computes the dot product over k, then performs the
	# scalar triangular-solve update for the LN/LT/RN/RT variant and
	# writes the result back to A or B and to C.
	testl	$1, M
	je	.L100

#ifdef LN
	# LN walks A backwards: step AORIG back one column (K scalars).
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	# Start at the diagonal: skip the first KK scalars of this A column.
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	# Matching skip in the expanded B buffer (each B value stored 4-wide).
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	# Four independent accumulators to hide addss latency; summed at .L118.
	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movss	 0 * SIZE(AA), %xmm0
	movss	 4 * SIZE(AA), %xmm1
	movss	 0 * SIZE(BB), %xmm2
	movss	16 * SIZE(BB), %xmm3

	# Trip count for the 8x-unrolled loop: kk (LT/RN) or k-kk (LN/RT),
	# divided by 8.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L115
	ALIGN_4

.L112:
	# 8 k-iterations per pass; B values sit 4 floats apart in BUFFER
	# (splatted), hence the 4*SIZE stride between consecutive k's.
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	32 * SIZE(BB), %xmm2	# preload B for the next pass
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm5
	movss	 2 * SIZE(AA), %xmm0
	mulss	 8 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm6
	movss	 3 * SIZE(AA), %xmm0
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0	# preload A for the next pass
	mulss	%xmm1, %xmm3
	movss	 5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	movss	48 * SIZE(BB), %xmm3
	mulss	20 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm5
	movss	 6 * SIZE(AA), %xmm1
	mulss	24 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm6
	movss	 7 * SIZE(AA), %xmm1
	mulss	28 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L112
	ALIGN_4

.L115:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# k & 7 (remainder of the 8x unroll)
	BRANCH
	je	.L118
	ALIGN_4

.L116:
	# Remainder: one multiply-accumulate per iteration.
	mulss	%xmm0, %xmm2
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	 4 * SIZE(BB), %xmm2

	addl	$ 1 * SIZE, AA
	addl	$ 4 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4

.L118:
	# Collapse the four partial sums into xmm4.
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	# Rewind AA/B/BB to the diagonal element of this 1x1 tile
	# (kk - 1 scalars into the packed panels).
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

	# Solve step: b -= sum, then scale by the diagonal entry.  The stored
	# diagonal is presumably pre-inverted by the packing routine (a
	# multiply is used, not a divide) — standard for these kernels, but
	# confirm against the corresponding trsm_kernel packing code.
#if defined(LN) || defined(LT)
	movss	 0 * SIZE(B), %xmm1
	subss	%xmm4, %xmm1
#else
	movss	 0 * SIZE(AA), %xmm0
	subss	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	mulss	 0 * SIZE(AA), %xmm1	# left-side: diagonal lives in A
#endif

#if defined(RN) || defined(RT)
	mulss	 0 * SIZE(B), %xmm0	# right-side: diagonal lives in B
#endif

#if defined(LN) || defined(LT)
	# Write the solved value back to the packed B panel and re-splat it
	# into the expanded BUFFER copy so later tiles see the update.
	movss	%xmm1, 0 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0, 0 * SIZE(BB)
#else
	movss	%xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1		# LN writes C right-to-left
#endif

	# Store the solved element to C.
#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
#else
	movss	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	# Advance AA past the untouched k-kk tail of this A column.
	movl	K, %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA
#ifdef LT
	addl	$1 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	# RT: step AORIG forward one column for the next tile.
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

.L100:
	# ----- 2x1 micro-tile (M & 2, N odd): same pattern, two rows at once.
	testl	$2, M
	je	.L110

#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax	# 2 rows * K scalars
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
3113#endif 3114 3115 leal BUFFER, BB 3116 3117#if defined(LN) || defined(RT) 3118 movl KK, %eax 3119 sall $BASE_SHIFT, %eax 3120 leal (BB, %eax, 4), BB 3121#endif 3122 3123 xorps %xmm4, %xmm4 3124 xorps %xmm5, %xmm5 3125 xorps %xmm6, %xmm6 3126 xorps %xmm7, %xmm7 3127 3128#ifdef movsd 3129 xorps %xmm0, %xmm0 3130#endif 3131 movsd 0 * SIZE(AA), %xmm0 3132#ifdef movsd 3133 xorps %xmm1, %xmm1 3134#endif 3135 movsd 8 * SIZE(AA), %xmm1 3136 movaps 0 * SIZE(BB), %xmm2 3137 movaps 16 * SIZE(BB), %xmm3 3138 3139#if defined(LT) || defined(RN) 3140 movl KK, %eax 3141#else 3142 movl K, %eax 3143 subl KK, %eax 3144#endif 3145 sarl $3, %eax 3146 je .L105 3147 ALIGN_4 3148 3149.L102: 3150 mulps %xmm0, %xmm2 3151#if defined(OPTERON) || defined(BARCELONA) 3152 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 3153#endif 3154 movsd 2 * SIZE(AA), %xmm0 3155 addps %xmm2, %xmm4 3156 movaps 4 * SIZE(BB), %xmm2 3157 mulps %xmm0, %xmm2 3158 movsd 4 * SIZE(AA), %xmm0 3159 addps %xmm2, %xmm5 3160 movaps 8 * SIZE(BB), %xmm2 3161 mulps %xmm0, %xmm2 3162 movsd 6 * SIZE(AA), %xmm0 3163 addps %xmm2, %xmm6 3164 movaps 12 * SIZE(BB), %xmm2 3165 mulps %xmm0, %xmm2 3166 movsd 16 * SIZE(AA), %xmm0 3167 addps %xmm2, %xmm7 3168 movaps 32 * SIZE(BB), %xmm2 3169 mulps %xmm1, %xmm3 3170 movsd 10 * SIZE(AA), %xmm1 3171 addps %xmm3, %xmm4 3172 movaps 20 * SIZE(BB), %xmm3 3173 mulps %xmm1, %xmm3 3174 movsd 12 * SIZE(AA), %xmm1 3175 addps %xmm3, %xmm5 3176 movaps 24 * SIZE(BB), %xmm3 3177 mulps %xmm1, %xmm3 3178 movsd 14 * SIZE(AA), %xmm1 3179 addps %xmm3, %xmm6 3180 movaps 28 * SIZE(BB), %xmm3 3181 mulps %xmm1, %xmm3 3182 movsd 24 * SIZE(AA), %xmm1 3183 addps %xmm3, %xmm7 3184 movaps 48 * SIZE(BB), %xmm3 3185 3186 addl $16 * SIZE, AA 3187 addl $32 * SIZE, BB 3188 decl %eax 3189 jne .L102 3190 ALIGN_4 3191 3192.L105: 3193#if defined(LT) || defined(RN) 3194 movl KK, %eax 3195#else 3196 movl K, %eax 3197 subl KK, %eax 3198#endif 3199 andl $7, %eax # if (k & 1) 3200 BRANCH 3201 je .L108 3202 ALIGN_4 3203 3204.L106: 3205 
mulps %xmm0, %xmm2 3206 addps %xmm2, %xmm4 3207 movsd 2 * SIZE(AA), %xmm0 3208 movaps 4 * SIZE(BB), %xmm2 3209 3210 addl $2 * SIZE, AA 3211 addl $4 * SIZE, BB 3212 decl %eax 3213 jg .L106 3214 ALIGN_4 3215 3216.L108: 3217 addps %xmm5, %xmm4 3218 addps %xmm7, %xmm6 3219 addps %xmm6, %xmm4 3220 3221#if defined(LN) || defined(RT) 3222 movl KK, %eax 3223#ifdef LN 3224 subl $2, %eax 3225#else 3226 subl $1, %eax 3227#endif 3228 3229 movl AORIG, AA 3230 movl BORIG, B 3231 leal BUFFER, BB 3232 3233 sall $ BASE_SHIFT, %eax 3234 leal (AA, %eax, 2), AA 3235 leal (B, %eax, 1), B 3236 leal (BB, %eax, 4), BB 3237#endif 3238 3239#if defined(LN) || defined(LT) 3240 pshufd $1, %xmm4, %xmm6 3241 3242 movss 0 * SIZE(B), %xmm1 3243 movss 1 * SIZE(B), %xmm3 3244 3245 subss %xmm4, %xmm1 3246 subss %xmm6, %xmm3 3247#else 3248#ifdef movsd 3249 xorps %xmm0, %xmm0 3250#endif 3251 movsd 0 * SIZE(AA), %xmm0 3252 3253 subps %xmm4, %xmm0 3254#endif 3255 3256#ifdef LN 3257 movaps 0 * SIZE(AA), %xmm4 3258 pshufd $0xff, %xmm4, %xmm6 3259 mulss %xmm6, %xmm3 3260 pshufd $0xaa, %xmm4, %xmm6 3261 mulss %xmm3, %xmm6 3262 subss %xmm6, %xmm1 3263 3264 pshufd $0x00, %xmm4, %xmm6 3265 mulss %xmm6, %xmm1 3266#endif 3267 3268#ifdef LT 3269 movaps 0 * SIZE(AA), %xmm4 3270 pshufd $0x00, %xmm4, %xmm6 3271 mulss %xmm6, %xmm1 3272 pshufd $0x55, %xmm4, %xmm6 3273 mulss %xmm1, %xmm6 3274 subss %xmm6, %xmm3 3275 3276 pshufd $0xff, %xmm4, %xmm6 3277 mulss %xmm6, %xmm3 3278#endif 3279 3280#if defined(RN) || defined(RT) 3281 movss 0 * SIZE(B), %xmm6 3282 pshufd $0x00, %xmm6, %xmm7 3283 mulps %xmm7, %xmm0 3284#endif 3285 3286#if defined(LN) || defined(LT) 3287 movss %xmm1, 0 * SIZE(B) 3288 movss %xmm3, 1 * SIZE(B) 3289 3290 pshufd $0x00, %xmm1, %xmm0 3291 movaps %xmm0, 0 * SIZE(BB) 3292 pshufd $0x00, %xmm3, %xmm0 3293 movaps %xmm0, 4 * SIZE(BB) 3294#else 3295 movlps %xmm0, 0 * SIZE(AA) 3296#endif 3297 3298#ifdef LN 3299 subl $2 * SIZE, CO1 3300#endif 3301 3302#if defined(LN) || defined(LT) 3303 movss %xmm1, 0 * 
SIZE(CO1) 3304 movss %xmm3, 1 * SIZE(CO1) 3305#else 3306 movlps %xmm0, 0 * SIZE(CO1) 3307#endif 3308 3309#ifndef LN 3310 addl $2 * SIZE, CO1 3311#endif 3312 3313#if defined(LT) || defined(RN) 3314 movl K, %eax 3315 subl KK, %eax 3316 leal (,%eax, SIZE), %eax 3317 leal (AA, %eax, 2), AA 3318#ifdef LT 3319 addl $2 * SIZE, B 3320#endif 3321#endif 3322 3323#ifdef LN 3324 subl $2, KK 3325 movl BORIG, B 3326#endif 3327 3328#ifdef LT 3329 addl $2, KK 3330#endif 3331 3332#ifdef RT 3333 movl K, %eax 3334 movl BORIG, B 3335 sall $1 + BASE_SHIFT, %eax 3336 addl %eax, AORIG 3337#endif 3338 ALIGN_4 3339 3340.L110: 3341 movl M, %ebx 3342 sarl $2, %ebx # i = (m >> 2) 3343 jle .L119 3344 ALIGN_4 3345 3346.L91: 3347#ifdef LN 3348 movl K, %eax 3349 sall $2 + BASE_SHIFT, %eax 3350 subl %eax, AORIG 3351#endif 3352 3353#if defined(LN) || defined(RT) 3354 movl KK, %eax 3355 movl AORIG, AA 3356 leal (, %eax, SIZE), %eax 3357 leal (AA, %eax, 4), AA 3358#endif 3359 3360 leal BUFFER, BB 3361 3362#if defined(LN) || defined(RT) 3363 movl KK, %eax 3364 sall $BASE_SHIFT, %eax 3365 leal (BB, %eax, 4), BB 3366#endif 3367 3368 xorps %xmm4, %xmm4 3369 xorps %xmm5, %xmm5 3370 xorps %xmm6, %xmm6 3371 xorps %xmm7, %xmm7 3372 3373 movaps 0 * SIZE(AA), %xmm0 3374 movaps 16 * SIZE(AA), %xmm1 3375 movaps 0 * SIZE(BB), %xmm2 3376 movaps 16 * SIZE(BB), %xmm3 3377 3378 PREFETCHW -4 * SIZE(CO1) 3379 3380#if defined(LT) || defined(RN) 3381 movl KK, %eax 3382#else 3383 movl K, %eax 3384 subl KK, %eax 3385#endif 3386 sarl $3, %eax 3387 je .L95 3388 ALIGN_4 3389 3390.L92: 3391 mulps %xmm0, %xmm2 3392#if defined(OPTERON) || defined(BARCELONA) 3393 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 3394#endif 3395 movaps 4 * SIZE(AA), %xmm0 3396 addps %xmm2, %xmm4 3397 movaps 32 * SIZE(BB), %xmm2 3398 mulps 4 * SIZE(BB), %xmm0 3399 addps %xmm0, %xmm5 3400 movaps 8 * SIZE(AA), %xmm0 3401 mulps 8 * SIZE(BB), %xmm0 3402 addps %xmm0, %xmm6 3403 movaps 12 * SIZE(AA), %xmm0 3404 mulps 12 * SIZE(BB), %xmm0 3405 addps %xmm0, %xmm7 
3406 movaps 32 * SIZE(AA), %xmm0 3407#if defined(OPTERON) || defined(BARCELONA) 3408 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) 3409#endif 3410 mulps %xmm1, %xmm3 3411 movaps 20 * SIZE(AA), %xmm1 3412 addps %xmm3, %xmm4 3413 movaps 48 * SIZE(BB), %xmm3 3414 mulps 20 * SIZE(BB), %xmm1 3415 addps %xmm1, %xmm5 3416 movaps 24 * SIZE(AA), %xmm1 3417 mulps 24 * SIZE(BB), %xmm1 3418 addps %xmm1, %xmm6 3419 movaps 28 * SIZE(AA), %xmm1 3420 mulps 28 * SIZE(BB), %xmm1 3421 addps %xmm1, %xmm7 3422 movaps 48 * SIZE(AA), %xmm1 3423 3424 addl $32 * SIZE, AA 3425 addl $32 * SIZE, BB 3426 decl %eax 3427 jne .L92 3428 ALIGN_4 3429 3430.L95: 3431#if defined(LT) || defined(RN) 3432 movl KK, %eax 3433#else 3434 movl K, %eax 3435 subl KK, %eax 3436#endif 3437 andl $7, %eax # if (k & 1) 3438 BRANCH 3439 je .L98 3440 ALIGN_4 3441 3442.L96: 3443 mulps %xmm0, %xmm2 3444 addps %xmm2, %xmm4 3445 movaps 4 * SIZE(AA), %xmm0 3446 movaps 4 * SIZE(BB), %xmm2 3447 3448 addl $4 * SIZE, AA 3449 addl $4 * SIZE, BB 3450 decl %eax 3451 jg .L96 3452 ALIGN_4 3453 3454.L98: 3455 addps %xmm5, %xmm4 3456 addps %xmm7, %xmm6 3457 addps %xmm6, %xmm4 3458 3459#if defined(LN) || defined(RT) 3460 movl KK, %eax 3461#ifdef LN 3462 subl $4, %eax 3463#else 3464 subl $1, %eax 3465#endif 3466 3467 movl AORIG, AA 3468 movl BORIG, B 3469 leal BUFFER, BB 3470 3471 sall $ BASE_SHIFT, %eax 3472 leal (AA, %eax, 4), AA 3473 leal (B, %eax, 1), B 3474 leal (BB, %eax, 4), BB 3475#endif 3476 3477#if defined(LN) || defined(LT) 3478 movaps %xmm4, %xmm0 3479 unpcklps %xmm6, %xmm4 3480 unpckhps %xmm6, %xmm0 3481 3482 movaps %xmm5, %xmm1 3483 unpcklps %xmm7, %xmm5 3484 unpckhps %xmm7, %xmm1 3485 3486 movaps %xmm4, %xmm6 3487 unpcklps %xmm5, %xmm4 3488 unpckhps %xmm5, %xmm6 3489 3490 movaps %xmm0, %xmm2 3491 unpcklps %xmm1, %xmm0 3492 unpckhps %xmm1, %xmm2 3493 3494 movss 0 * SIZE(B), %xmm1 3495 movss 1 * SIZE(B), %xmm3 3496 movss 2 * SIZE(B), %xmm5 3497 movss 3 * SIZE(B), %xmm7 3498 3499 subss %xmm4, %xmm1 3500 subss %xmm6, %xmm3 3501 
subss %xmm0, %xmm5 3502 subss %xmm2, %xmm7 3503#else 3504 movaps 0 * SIZE(AA), %xmm0 3505 3506 subps %xmm4, %xmm0 3507#endif 3508 3509#ifdef LN 3510 movaps 12 * SIZE(AA), %xmm4 3511 pshufd $0xff, %xmm4, %xmm6 3512 mulss %xmm6, %xmm7 3513 pshufd $0xaa, %xmm4, %xmm6 3514 mulss %xmm7, %xmm6 3515 subss %xmm6, %xmm5 3516 pshufd $0x55, %xmm4, %xmm6 3517 mulss %xmm7, %xmm6 3518 subss %xmm6, %xmm3 3519 pshufd $0x00, %xmm4, %xmm6 3520 mulss %xmm7, %xmm6 3521 subss %xmm6, %xmm1 3522 3523 movaps 8 * SIZE(AA), %xmm4 3524 pshufd $0xaa, %xmm4, %xmm6 3525 mulss %xmm6, %xmm5 3526 pshufd $0x55, %xmm4, %xmm6 3527 mulss %xmm5, %xmm6 3528 subss %xmm6, %xmm3 3529 pshufd $0x00, %xmm4, %xmm6 3530 mulss %xmm5, %xmm6 3531 subss %xmm6, %xmm1 3532 3533 movaps 4 * SIZE(AA), %xmm4 3534 pshufd $0x55, %xmm4, %xmm6 3535 mulss %xmm6, %xmm3 3536 pshufd $0x00, %xmm4, %xmm6 3537 mulss %xmm3, %xmm6 3538 subss %xmm6, %xmm1 3539 3540 movaps 0 * SIZE(AA), %xmm4 3541 pshufd $0x00, %xmm4, %xmm6 3542 mulss %xmm6, %xmm1 3543#endif 3544 3545#ifdef LT 3546 movaps 0 * SIZE(AA), %xmm4 3547 pshufd $0x00, %xmm4, %xmm6 3548 mulss %xmm6, %xmm1 3549 3550 pshufd $0x55, %xmm4, %xmm6 3551 mulss %xmm1, %xmm6 3552 subss %xmm6, %xmm3 3553 pshufd $0xaa, %xmm4, %xmm6 3554 mulss %xmm1, %xmm6 3555 subss %xmm6, %xmm5 3556 pshufd $0xff, %xmm4, %xmm6 3557 mulss %xmm1, %xmm6 3558 subss %xmm6, %xmm7 3559 3560 movaps 4 * SIZE(AA), %xmm4 3561 pshufd $0x55, %xmm4, %xmm6 3562 mulss %xmm6, %xmm3 3563 pshufd $0xaa, %xmm4, %xmm6 3564 mulss %xmm3, %xmm6 3565 subss %xmm6, %xmm5 3566 pshufd $0xff, %xmm4, %xmm6 3567 mulss %xmm3, %xmm6 3568 subss %xmm6, %xmm7 3569 3570 movaps 8 * SIZE(AA), %xmm4 3571 pshufd $0xaa, %xmm4, %xmm6 3572 mulss %xmm6, %xmm5 3573 pshufd $0xff, %xmm4, %xmm6 3574 mulss %xmm5, %xmm6 3575 subss %xmm6, %xmm7 3576 3577 movaps 12 * SIZE(AA), %xmm4 3578 pshufd $0xff, %xmm4, %xmm6 3579 mulss %xmm6, %xmm7 3580#endif 3581 3582#if defined(RN) || defined(RT) 3583 movss 0 * SIZE(B), %xmm6 3584 pshufd $0x00, %xmm6, %xmm7 3585 mulps 
%xmm7, %xmm0 3586#endif 3587 3588#if defined(LN) || defined(LT) 3589 movss %xmm1, 0 * SIZE(B) 3590 movss %xmm3, 1 * SIZE(B) 3591 movss %xmm5, 2 * SIZE(B) 3592 movss %xmm7, 3 * SIZE(B) 3593 3594 pshufd $0x00, %xmm1, %xmm0 3595 movaps %xmm0, 0 * SIZE(BB) 3596 pshufd $0x00, %xmm3, %xmm0 3597 movaps %xmm0, 4 * SIZE(BB) 3598 3599 pshufd $0x00, %xmm5, %xmm0 3600 movaps %xmm0, 8 * SIZE(BB) 3601 pshufd $0x00, %xmm7, %xmm0 3602 movaps %xmm0, 12 * SIZE(BB) 3603#else 3604 movss %xmm0, 0 * SIZE(AA) 3605 movss %xmm1, 1 * SIZE(AA) 3606 movss %xmm2, 2 * SIZE(AA) 3607 movss %xmm3, 3 * SIZE(AA) 3608#endif 3609 3610#ifdef LN 3611 subl $4 * SIZE, CO1 3612#endif 3613 3614#if defined(LN) || defined(LT) 3615 unpcklps %xmm5, %xmm1 3616 unpcklps %xmm7, %xmm3 3617 3618 unpcklps %xmm3, %xmm1 3619 3620 movlps %xmm1, 0 * SIZE(CO1) 3621 movhps %xmm1, 2 * SIZE(CO1) 3622#else 3623 movlps %xmm0, 0 * SIZE(CO1) 3624 movhps %xmm0, 2 * SIZE(CO1) 3625#endif 3626 3627#ifndef LN 3628 addl $4 * SIZE, CO1 3629#endif 3630 3631#if defined(LT) || defined(RN) 3632 movl K, %eax 3633 subl KK, %eax 3634 leal (,%eax, SIZE), %eax 3635 leal (AA, %eax, 4), AA 3636#ifdef LT 3637 addl $4 * SIZE, B 3638#endif 3639#endif 3640 3641#ifdef LN 3642 subl $4, KK 3643 movl BORIG, B 3644#endif 3645 3646#ifdef LT 3647 addl $4, KK 3648#endif 3649 3650#ifdef RT 3651 movl K, %eax 3652 movl BORIG, B 3653 sall $2 + BASE_SHIFT, %eax 3654 addl %eax, AORIG 3655#endif 3656 3657 decl %ebx # i -- 3658 jg .L91 3659 ALIGN_4 3660 3661.L119: 3662#ifdef LN 3663 movl K, %eax 3664 leal (B, %eax, SIZE), B 3665#endif 3666 3667#if defined(LT) || defined(RN) 3668 movl K, %eax 3669 subl KK, %eax 3670 leal (B, %eax, SIZE), B 3671#endif 3672 3673#ifdef RN 3674 addl $1, KK 3675#endif 3676 3677#ifdef RT 3678 subl $1, KK 3679#endif 3680 ALIGN_4 3681 3682 3683.L999: 3684 movl OLD_STACK, %esp 3685 popl %ebx 3686 popl %esi 3687 popl %edi 3688 popl %ebp 3689 ret 3690 3691 EPILOGUE 3692