/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define STACK 16 43 44#define OLD_M 4 + STACK(%esi) 45#define OLD_N 8 + STACK(%esi) 46#define OLD_K 12 + STACK(%esi) 47#define OLD_A 20 + STACK(%esi) 48#define OLD_B 24 + STACK(%esi) 49#define OLD_C 28 + STACK(%esi) 50#define OLD_LDC 32 + STACK(%esi) 51#define STACK_OFFT 36 + STACK(%esi) 52 53#define K 16(%esp) 54#define N 20(%esp) 55#define M 24(%esp) 56#define A 28(%esp) 57#define C 32(%esp) 58#define J 36(%esp) 59#define OLD_STACK 40(%esp) 60#define OFFSET 44(%esp) 61#define KK 48(%esp) 62#define KKK 52(%esp) 63#define AORIG 56(%esp) 64#define BORIG 60(%esp) 65#define BUFFER 128(%esp) 66 67#if defined(OPTERON) || defined(BARCELONA) 68#define PREFETCH prefetch 69#define PREFETCHW prefetchw 70#define PREFETCHSIZE (16 * 10 + 8) 71#endif 72 73#if defined(PENTIUM4) || defined(PENTIUMM) 74#define PREFETCH prefetcht0 75#define PREFETCHW prefetcht0 76#define PREFETCHSIZE 96 77#endif 78 79#if defined(PENRYN) || defined(DUNNINGTON) 80#define PREFETCH prefetcht0 81#define PREFETCHW prefetcht0 82#define PREFETCHSIZE 96 83#endif 84 85#define B %edi 86#define AA %edx 87#define BB %ecx 88#define LDC %ebp 89#define CO1 %esi 90 91#if defined(OPTERON) || !defined(HAVE_SSE2) 92#define movsd movlps 93#endif 94 95#ifdef HAVE_SSE2 96#define xorps pxor 97#endif 98 99#define KERNEL1(address) \ 100 mulps %xmm0, %xmm2; \ 101 addps %xmm2, %xmm4; \ 102 movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 103 mulps %xmm0, %xmm2; \ 104 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ 105 addps %xmm2, %xmm5; \ 106 movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 107 mulps %xmm0, %xmm2; \ 108 mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 109 addps %xmm2, %xmm6; \ 110 movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 111 addps %xmm0, %xmm7; \ 112 movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 113 114#define KERNEL2(address) \ 115 mulps 
%xmm0, %xmm3; \ 116 addps %xmm3, %xmm4; \ 117 movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 118 mulps %xmm0, %xmm3; \ 119 addps %xmm3, %xmm5; \ 120 movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 121 mulps %xmm0, %xmm3; \ 122 mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 123 addps %xmm3, %xmm6; \ 124 movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 125 addps %xmm0, %xmm7; \ 126 movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 127 128#define KERNEL3(address) \ 129 mulps %xmm0, %xmm2; \ 130 addps %xmm2, %xmm4; \ 131 movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 132 mulps %xmm0, %xmm2; \ 133 addps %xmm2, %xmm5; \ 134 movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 135 mulps %xmm0, %xmm2; \ 136 mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 137 addps %xmm2, %xmm6; \ 138 movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 139 addps %xmm0, %xmm7; \ 140 movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 141 142#define KERNEL4(address) \ 143 mulps %xmm0, %xmm3; \ 144 addps %xmm3, %xmm4; \ 145 movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 146 mulps %xmm0, %xmm3; \ 147 addps %xmm3, %xmm5; \ 148 movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 149 mulps %xmm0, %xmm3; \ 150 mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ 151 addps %xmm3, %xmm6; \ 152 movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 153 addps %xmm0, %xmm7; \ 154 movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 155 156#define KERNEL5(address) \ 157 mulps %xmm1, %xmm2; \ 158 addps %xmm2, %xmm4; \ 159 movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 160 mulps %xmm1, %xmm2; \ 161 addps %xmm2, %xmm5; \ 162 movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 163 mulps %xmm1, %xmm2; \ 164 mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 165 addps %xmm2, %xmm6; \ 166 movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 167 addps %xmm1, %xmm7; \ 168 movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 169 170#define KERNEL6(address) \ 
171 mulps %xmm1, %xmm3; \ 172 addps %xmm3, %xmm4; \ 173 movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 174 mulps %xmm1, %xmm3; \ 175 addps %xmm3, %xmm5; \ 176 movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 177 mulps %xmm1, %xmm3; \ 178 mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 179 addps %xmm3, %xmm6; \ 180 movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 181 addps %xmm1, %xmm7; \ 182 movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 183 184#define KERNEL7(address) \ 185 mulps %xmm1, %xmm2; \ 186 addps %xmm2, %xmm4; \ 187 movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 188 mulps %xmm1, %xmm2; \ 189 addps %xmm2, %xmm5; \ 190 movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 191 mulps %xmm1, %xmm2; \ 192 mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 193 addps %xmm2, %xmm6; \ 194 movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ 195 addps %xmm1, %xmm7; \ 196 movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 197 198#define KERNEL8(address) \ 199 mulps %xmm1, %xmm3; \ 200 addps %xmm3, %xmm4; \ 201 movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 202 mulps %xmm1, %xmm3; \ 203 addps %xmm3, %xmm5; \ 204 movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 205 mulps %xmm1, %xmm3; \ 206 mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ 207 addps %xmm3, %xmm6; \ 208 movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ 209 addps %xmm1, %xmm7; \ 210 movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; 211 212 PROLOGUE 213 214 pushl %ebp 215 pushl %edi 216 pushl %esi 217 pushl %ebx 218 219 PROFCODE 220 221 movl %esp, %esi 222 223 subl $128 + LOCAL_BUFFER_SIZE, %esp 224 andl $-1024, %esp 225 226 STACK_TOUCHING 227 228 movl OLD_M, %ebx 229 movl OLD_N, %eax 230 movl OLD_K, %ecx 231 movl OLD_A, %edx 232 233 movl %ebx, M 234 movl %eax, N 235 movl %ecx, K 236 movl %edx, A 237 movl %esi, OLD_STACK 238 movss STACK_OFFT, %xmm4 239 240 movl OLD_B, B 241 movl OLD_C, %ebx 242 243 movl %ebx, C 244 movl OLD_LDC, LDC 245 246 
movss %xmm4, OFFSET 247 movss %xmm4, KK 248 249 leal (, LDC, SIZE), LDC 250 251#ifdef LN 252 movl M, %eax 253 leal (, %eax, SIZE), %eax 254 addl %eax, C 255 imull K, %eax 256 addl %eax, A 257#endif 258 259#ifdef RT 260 movl N, %eax 261 leal (, %eax, SIZE), %eax 262 imull K, %eax 263 addl %eax, B 264 movl N, %eax 265 imull LDC, %eax 266 addl %eax, C 267#endif 268 269#ifdef RN 270 negl KK 271#endif 272 273#ifdef RT 274 movl N, %eax 275 subl OFFSET, %eax 276 movl %eax, KK 277#endif 278 279 movl N, %eax 280 sarl $2, %eax 281 movl %eax, J 282 jle .L40 283 284.L01: 285#ifdef LN 286 movl OFFSET, %eax 287 addl M, %eax 288 movl %eax, KK 289#endif 290 291 leal BUFFER, %ecx 292 293#ifdef RT 294 movl K, %eax 295 sall $2 + BASE_SHIFT, %eax 296 subl %eax, B 297#endif 298 299#if defined(LN) || defined(RT) 300 movl KK, %eax 301 movl B, BORIG 302 sall $2 + BASE_SHIFT, %eax 303 leal (B, %eax, 1), B 304 leal (BB, %eax, 4), BB 305#endif 306 307#ifdef LT 308 movl OFFSET, %eax 309 movl %eax, KK 310#endif 311 312#if defined(LT) || defined(RN) 313 movl KK, %eax 314#else 315 movl K, %eax 316 subl KK, %eax 317#endif 318 sarl $1, %eax 319 jle .L05 320 ALIGN_4 321 322.L02: 323 movaps 0 * SIZE(B), %xmm3 324 movaps 4 * SIZE(B), %xmm7 325 326 pshufd $0x00, %xmm3, %xmm0 327 pshufd $0x55, %xmm3, %xmm1 328 pshufd $0xaa, %xmm3, %xmm2 329 pshufd $0xff, %xmm3, %xmm3 330 331 pshufd $0x00, %xmm7, %xmm4 332 pshufd $0x55, %xmm7, %xmm5 333 pshufd $0xaa, %xmm7, %xmm6 334 pshufd $0xff, %xmm7, %xmm7 335 336 movaps %xmm0, 0 * SIZE(BB) 337 movaps %xmm1, 4 * SIZE(BB) 338 movaps %xmm2, 8 * SIZE(BB) 339 movaps %xmm3, 12 * SIZE(BB) 340 movaps %xmm4, 16 * SIZE(BB) 341 movaps %xmm5, 20 * SIZE(BB) 342 movaps %xmm6, 24 * SIZE(BB) 343 movaps %xmm7, 28 * SIZE(BB) 344 345 addl $ 8 * SIZE, B 346 addl $32 * SIZE, %ecx 347 decl %eax 348 jne .L02 349 ALIGN_2 350 351.L05: 352#if defined(LT) || defined(RN) 353 movl KK, %eax 354#else 355 movl K, %eax 356 subl KK, %eax 357#endif 358 andl $1, %eax 359 BRANCH 360 jle .L10 361 362 
movaps 0 * SIZE(B), %xmm3 363 364 pshufd $0x00, %xmm3, %xmm0 365 pshufd $0x55, %xmm3, %xmm1 366 pshufd $0xaa, %xmm3, %xmm2 367 pshufd $0xff, %xmm3, %xmm3 368 369 movaps %xmm0, 0 * SIZE(BB) 370 movaps %xmm1, 4 * SIZE(BB) 371 movaps %xmm2, 8 * SIZE(BB) 372 movaps %xmm3, 12 * SIZE(BB) 373 374 addl $4 * SIZE, B 375 ALIGN_4 376 377.L10: 378#if defined(LT) || defined(RN) 379 movl A, AA 380#else 381 movl A, %eax 382 movl %eax, AORIG 383#endif 384 385 leal (, LDC, 4), %eax 386 387#ifdef RT 388 subl %eax, C 389#endif 390 movl C, CO1 391#ifndef RT 392 addl %eax, C 393#endif 394 395 movl M, %ebx 396 sarl $2, %ebx # i = (m >> 2) 397 jle .L20 398 ALIGN_4 399 400.L11: 401#ifdef LN 402 movl K, %eax 403 sall $2 + BASE_SHIFT, %eax 404 subl %eax, AORIG 405#endif 406 407#if defined(LN) || defined(RT) 408 movl KK, %eax 409 movl AORIG, AA 410 leal (, %eax, SIZE), %eax 411 leal (AA, %eax, 4), AA 412#endif 413 414 leal BUFFER, BB 415 416#if defined(LN) || defined(RT) 417 movl KK, %eax 418 sall $2 + BASE_SHIFT, %eax 419 leal (BB, %eax, 4), BB 420#endif 421 422 movaps 0 * SIZE(AA), %xmm0 423 xorps %xmm4, %xmm4 424 movaps 16 * SIZE(AA), %xmm1 425 xorps %xmm5, %xmm5 426 movaps 0 * SIZE(BB), %xmm2 427 xorps %xmm6, %xmm6 428 movaps 16 * SIZE(BB), %xmm3 429 xorps %xmm7, %xmm7 430 431 leal (LDC, LDC, 2), %eax 432 433 PREFETCHW 3 * SIZE(CO1) 434 PREFETCHW 3 * SIZE(CO1, LDC) 435 PREFETCHW 3 * SIZE(CO1, LDC, 2) 436 PREFETCHW 3 * SIZE(CO1, %eax) 437 438#if defined(LT) || defined(RN) 439 movl KK, %eax 440#else 441 movl K, %eax 442 subl KK, %eax 443#endif 444 sarl $3, %eax 445 je .L15 446 ALIGN_4 447 448.L12: 449 KERNEL1(0 * 16) 450 KERNEL2(0 * 16) 451 KERNEL3(0 * 16) 452 KERNEL4(0 * 16) 453 KERNEL5(0 * 16) 454 KERNEL6(0 * 16) 455 KERNEL7(0 * 16) 456 KERNEL8(0 * 16) 457 458 addl $128 * SIZE, BB 459 addl $32 * SIZE, AA 460 decl %eax 461 jne .L12 462 ALIGN_4 463 464.L15: 465#if defined(LT) || defined(RN) 466 movl KK, %eax 467#else 468 movl K, %eax 469 subl KK, %eax 470#endif 471 andl $7, %eax # if (k & 
1) 472 BRANCH 473 je .L18 474 ALIGN_4 475 476.L16: 477 mulps %xmm0, %xmm2 478 addps %xmm2, %xmm4 479 movaps 4 * SIZE(BB), %xmm2 480 mulps %xmm0, %xmm2 481 addps %xmm2, %xmm5 482 movaps 8 * SIZE(BB), %xmm2 483 mulps %xmm0, %xmm2 484 mulps 12 * SIZE(BB), %xmm0 485 addps %xmm2, %xmm6 486 movaps 16 * SIZE(BB), %xmm2 487 addps %xmm0, %xmm7 488 movaps 4 * SIZE(AA), %xmm0 489 490 addl $ 4 * SIZE, AA 491 addl $16 * SIZE, BB 492 decl %eax 493 jg .L16 494 ALIGN_4 495 496.L18: 497#if defined(LN) || defined(RT) 498 movl KK, %eax 499#ifdef LN 500 subl $4, %eax 501#else 502 subl $4, %eax 503#endif 504 505 movl AORIG, AA 506 movl BORIG, B 507 leal BUFFER, BB 508 509 sall $2 + BASE_SHIFT, %eax 510 leal (AA, %eax, 1), AA 511 leal (B, %eax, 1), B 512 leal (BB, %eax, 4), BB 513#endif 514 515#if defined(LN) || defined(LT) 516 movaps %xmm4, %xmm0 517 unpcklps %xmm6, %xmm4 518 unpckhps %xmm6, %xmm0 519 520 movaps %xmm5, %xmm1 521 unpcklps %xmm7, %xmm5 522 unpckhps %xmm7, %xmm1 523 524 movaps %xmm4, %xmm6 525 unpcklps %xmm5, %xmm4 526 unpckhps %xmm5, %xmm6 527 528 movaps %xmm0, %xmm2 529 unpcklps %xmm1, %xmm0 530 unpckhps %xmm1, %xmm2 531 532 movaps 0 * SIZE(B), %xmm1 533 movaps 4 * SIZE(B), %xmm3 534 movaps 8 * SIZE(B), %xmm5 535 movaps 12 * SIZE(B), %xmm7 536 537 subps %xmm4, %xmm1 538 subps %xmm6, %xmm3 539 subps %xmm0, %xmm5 540 subps %xmm2, %xmm7 541#else 542 movaps 0 * SIZE(AA), %xmm0 543 movaps 4 * SIZE(AA), %xmm1 544 movaps 8 * SIZE(AA), %xmm2 545 movaps 12 * SIZE(AA), %xmm3 546 547 subps %xmm4, %xmm0 548 subps %xmm5, %xmm1 549 subps %xmm6, %xmm2 550 subps %xmm7, %xmm3 551#endif 552 553#ifdef LN 554 movaps 12 * SIZE(AA), %xmm4 555 pshufd $0xff, %xmm4, %xmm6 556 mulps %xmm6, %xmm7 557 pshufd $0xaa, %xmm4, %xmm6 558 mulps %xmm7, %xmm6 559 subps %xmm6, %xmm5 560 pshufd $0x55, %xmm4, %xmm6 561 mulps %xmm7, %xmm6 562 subps %xmm6, %xmm3 563 pshufd $0x00, %xmm4, %xmm6 564 mulps %xmm7, %xmm6 565 subps %xmm6, %xmm1 566 567 movaps 8 * SIZE(AA), %xmm4 568 pshufd $0xaa, %xmm4, %xmm6 569 
mulps %xmm6, %xmm5 570 pshufd $0x55, %xmm4, %xmm6 571 mulps %xmm5, %xmm6 572 subps %xmm6, %xmm3 573 pshufd $0x00, %xmm4, %xmm6 574 mulps %xmm5, %xmm6 575 subps %xmm6, %xmm1 576 577 movaps 4 * SIZE(AA), %xmm4 578 pshufd $0x55, %xmm4, %xmm6 579 mulps %xmm6, %xmm3 580 pshufd $0x00, %xmm4, %xmm6 581 mulps %xmm3, %xmm6 582 subps %xmm6, %xmm1 583 584 movaps 0 * SIZE(AA), %xmm4 585 pshufd $0x00, %xmm4, %xmm6 586 mulps %xmm6, %xmm1 587#endif 588 589#ifdef LT 590 movaps 0 * SIZE(AA), %xmm4 591 pshufd $0x00, %xmm4, %xmm6 592 mulps %xmm6, %xmm1 593 594 pshufd $0x55, %xmm4, %xmm6 595 mulps %xmm1, %xmm6 596 subps %xmm6, %xmm3 597 pshufd $0xaa, %xmm4, %xmm6 598 mulps %xmm1, %xmm6 599 subps %xmm6, %xmm5 600 pshufd $0xff, %xmm4, %xmm6 601 mulps %xmm1, %xmm6 602 subps %xmm6, %xmm7 603 604 movaps 4 * SIZE(AA), %xmm4 605 pshufd $0x55, %xmm4, %xmm6 606 mulps %xmm6, %xmm3 607 pshufd $0xaa, %xmm4, %xmm6 608 mulps %xmm3, %xmm6 609 subps %xmm6, %xmm5 610 pshufd $0xff, %xmm4, %xmm6 611 mulps %xmm3, %xmm6 612 subps %xmm6, %xmm7 613 614 movaps 8 * SIZE(AA), %xmm4 615 pshufd $0xaa, %xmm4, %xmm6 616 mulps %xmm6, %xmm5 617 pshufd $0xff, %xmm4, %xmm6 618 mulps %xmm5, %xmm6 619 subps %xmm6, %xmm7 620 621 movaps 12 * SIZE(AA), %xmm4 622 pshufd $0xff, %xmm4, %xmm6 623 mulps %xmm6, %xmm7 624#endif 625 626#ifdef RN 627 movaps 0 * SIZE(B), %xmm6 628 pshufd $0x00, %xmm6, %xmm7 629 mulps %xmm7, %xmm0 630 pshufd $0x55, %xmm6, %xmm7 631 mulps %xmm0, %xmm7 632 subps %xmm7, %xmm1 633 pshufd $0xaa, %xmm6, %xmm7 634 mulps %xmm0, %xmm7 635 subps %xmm7, %xmm2 636 pshufd $0xff, %xmm6, %xmm7 637 mulps %xmm0, %xmm7 638 subps %xmm7, %xmm3 639 640 movaps 4 * SIZE(B), %xmm6 641 pshufd $0x55, %xmm6, %xmm7 642 mulps %xmm7, %xmm1 643 pshufd $0xaa, %xmm6, %xmm7 644 mulps %xmm1, %xmm7 645 subps %xmm7, %xmm2 646 pshufd $0xff, %xmm6, %xmm7 647 mulps %xmm1, %xmm7 648 subps %xmm7, %xmm3 649 650 movaps 8 * SIZE(B), %xmm6 651 pshufd $0xaa, %xmm6, %xmm7 652 mulps %xmm7, %xmm2 653 pshufd $0xff, %xmm6, %xmm7 654 mulps %xmm2, %xmm7 
655 subps %xmm7, %xmm3 656 657 movaps 12 * SIZE(B), %xmm6 658 pshufd $0xff, %xmm6, %xmm7 659 mulps %xmm7, %xmm3 660#endif 661 662#ifdef RT 663 movaps 12 * SIZE(B), %xmm6 664 pshufd $0xff, %xmm6, %xmm7 665 mulps %xmm7, %xmm3 666 pshufd $0xaa, %xmm6, %xmm7 667 mulps %xmm3, %xmm7 668 subps %xmm7, %xmm2 669 pshufd $0x55, %xmm6, %xmm7 670 mulps %xmm3, %xmm7 671 subps %xmm7, %xmm1 672 pshufd $0x00, %xmm6, %xmm7 673 mulps %xmm3, %xmm7 674 subps %xmm7, %xmm0 675 676 movaps 8 * SIZE(B), %xmm6 677 pshufd $0xaa, %xmm6, %xmm7 678 mulps %xmm7, %xmm2 679 pshufd $0x55, %xmm6, %xmm7 680 mulps %xmm2, %xmm7 681 subps %xmm7, %xmm1 682 pshufd $0x00, %xmm6, %xmm7 683 mulps %xmm2, %xmm7 684 subps %xmm7, %xmm0 685 686 movaps 4 * SIZE(B), %xmm6 687 pshufd $0x55, %xmm6, %xmm7 688 mulps %xmm7, %xmm1 689 pshufd $0x00, %xmm6, %xmm7 690 mulps %xmm1, %xmm7 691 subps %xmm7, %xmm0 692 693 movaps 0 * SIZE(B), %xmm6 694 pshufd $0x00, %xmm6, %xmm7 695 mulps %xmm7, %xmm0 696#endif 697 698#if defined(LN) || defined(LT) 699 movaps %xmm1, 0 * SIZE(B) 700 movaps %xmm3, 4 * SIZE(B) 701 movaps %xmm5, 8 * SIZE(B) 702 movaps %xmm7, 12 * SIZE(B) 703 704 pshufd $0x00, %xmm1, %xmm0 705 pshufd $0x55, %xmm1, %xmm2 706 pshufd $0xaa, %xmm1, %xmm4 707 pshufd $0xff, %xmm1, %xmm6 708 movaps %xmm0, 0 * SIZE(BB) 709 movaps %xmm2, 4 * SIZE(BB) 710 movaps %xmm4, 8 * SIZE(BB) 711 movaps %xmm6, 12 * SIZE(BB) 712 713 pshufd $0x00, %xmm3, %xmm0 714 pshufd $0x55, %xmm3, %xmm2 715 pshufd $0xaa, %xmm3, %xmm4 716 pshufd $0xff, %xmm3, %xmm6 717 movaps %xmm0, 16 * SIZE(BB) 718 movaps %xmm2, 20 * SIZE(BB) 719 movaps %xmm4, 24 * SIZE(BB) 720 movaps %xmm6, 28 * SIZE(BB) 721 722 pshufd $0x00, %xmm5, %xmm0 723 pshufd $0x55, %xmm5, %xmm2 724 pshufd $0xaa, %xmm5, %xmm4 725 pshufd $0xff, %xmm5, %xmm6 726 movaps %xmm0, 32 * SIZE(BB) 727 movaps %xmm2, 36 * SIZE(BB) 728 movaps %xmm4, 40 * SIZE(BB) 729 movaps %xmm6, 44 * SIZE(BB) 730 731 pshufd $0x00, %xmm7, %xmm0 732 pshufd $0x55, %xmm7, %xmm2 733 pshufd $0xaa, %xmm7, %xmm4 734 pshufd $0xff, 
%xmm7, %xmm6 735 movaps %xmm0, 48 * SIZE(BB) 736 movaps %xmm2, 52 * SIZE(BB) 737 movaps %xmm4, 56 * SIZE(BB) 738 movaps %xmm6, 60 * SIZE(BB) 739#else 740 movaps %xmm0, 0 * SIZE(AA) 741 movaps %xmm1, 4 * SIZE(AA) 742 movaps %xmm2, 8 * SIZE(AA) 743 movaps %xmm3, 12 * SIZE(AA) 744#endif 745 746#ifdef LN 747 subl $4 * SIZE, CO1 748#endif 749 750 leal (LDC, LDC, 2), %eax 751 752#if defined(LN) || defined(LT) 753 movaps %xmm1, %xmm0 754 unpcklps %xmm5, %xmm1 755 unpckhps %xmm5, %xmm0 756 757 movaps %xmm3, %xmm4 758 unpcklps %xmm7, %xmm3 759 unpckhps %xmm7, %xmm4 760 761 movaps %xmm1, %xmm2 762 unpcklps %xmm3, %xmm1 763 unpckhps %xmm3, %xmm2 764 765 movaps %xmm0, %xmm6 766 unpcklps %xmm4, %xmm0 767 unpckhps %xmm4, %xmm6 768 769 movlps %xmm1, 0 * SIZE(CO1) 770 movhps %xmm1, 2 * SIZE(CO1) 771 movlps %xmm2, 0 * SIZE(CO1, LDC, 1) 772 movhps %xmm2, 2 * SIZE(CO1, LDC, 1) 773 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) 774 movhps %xmm0, 2 * SIZE(CO1, LDC, 2) 775 movlps %xmm6, 0 * SIZE(CO1, %eax, 1) 776 movhps %xmm6, 2 * SIZE(CO1, %eax, 1) 777#else 778 movlps %xmm0, 0 * SIZE(CO1) 779 movhps %xmm0, 2 * SIZE(CO1) 780 movlps %xmm1, 0 * SIZE(CO1, LDC, 1) 781 movhps %xmm1, 2 * SIZE(CO1, LDC, 1) 782 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 783 movhps %xmm2, 2 * SIZE(CO1, LDC, 2) 784 movlps %xmm3, 0 * SIZE(CO1, %eax, 1) 785 movhps %xmm3, 2 * SIZE(CO1, %eax, 1) 786#endif 787 788#ifndef LN 789 addl $4 * SIZE, CO1 790#endif 791 792#if defined(LT) || defined(RN) 793 movl K, %eax 794 subl KK, %eax 795 leal (,%eax, SIZE), %eax 796 leal (AA, %eax, 4), AA 797#ifdef LT 798 addl $16 * SIZE, B 799#endif 800#endif 801 802#ifdef LN 803 subl $4, KK 804 movl BORIG, B 805#endif 806 807#ifdef LT 808 addl $4, KK 809#endif 810 811#ifdef RT 812 movl K, %eax 813 movl BORIG, B 814 sall $2 + BASE_SHIFT, %eax 815 addl %eax, AORIG 816#endif 817 818 decl %ebx # i -- 819 jg .L11 820 ALIGN_4 821 822.L20: 823 testl $2, M 824 je .L30 825 826#ifdef LN 827 movl K, %eax 828 sall $1 + BASE_SHIFT, %eax 829 subl %eax, AORIG 830#endif 
831 832#if defined(LN) || defined(RT) 833 movl KK, %eax 834 movl AORIG, AA 835 leal (, %eax, SIZE), %eax 836 leal (AA, %eax, 2), AA 837#endif 838 839 leal BUFFER, BB 840 841#if defined(LN) || defined(RT) 842 movl KK, %eax 843 sall $2 + BASE_SHIFT, %eax 844 leal (BB, %eax, 4), BB 845#endif 846 847#ifdef movsd 848 xorps %xmm0, %xmm0 849#endif 850 movsd 0 * SIZE(AA), %xmm0 851 xorps %xmm4, %xmm4 852#ifdef movsd 853 xorps %xmm1, %xmm1 854#endif 855 movsd 8 * SIZE(AA), %xmm1 856 xorps %xmm5, %xmm5 857 movaps 0 * SIZE(BB), %xmm2 858 xorps %xmm6, %xmm6 859 movaps 16 * SIZE(BB), %xmm3 860 xorps %xmm7, %xmm7 861 862#if defined(LT) || defined(RN) 863 movl KK, %eax 864#else 865 movl K, %eax 866 subl KK, %eax 867#endif 868 sarl $3, %eax 869 je .L25 870 ALIGN_4 871 872.L22: 873 mulps %xmm0, %xmm2 874 addps %xmm2, %xmm4 875#if defined(OPTERON) || defined(BARCELONA) 876 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 877#endif 878 movaps 4 * SIZE(BB), %xmm2 879 mulps %xmm0, %xmm2 880 addps %xmm2, %xmm5 881 movaps 8 * SIZE(BB), %xmm2 882 mulps %xmm0, %xmm2 883 addps %xmm2, %xmm6 884 movaps 12 * SIZE(BB), %xmm2 885 mulps %xmm0, %xmm2 886 movsd 2 * SIZE(AA), %xmm0 887 addps %xmm2, %xmm7 888 movaps 32 * SIZE(BB), %xmm2 889 890 mulps %xmm0, %xmm3 891 addps %xmm3, %xmm4 892 movaps 20 * SIZE(BB), %xmm3 893 mulps %xmm0, %xmm3 894 addps %xmm3, %xmm5 895 movaps 24 * SIZE(BB), %xmm3 896 mulps %xmm0, %xmm3 897 addps %xmm3, %xmm6 898 movaps 28 * SIZE(BB), %xmm3 899 mulps %xmm0, %xmm3 900 movsd 4 * SIZE(AA), %xmm0 901 addps %xmm3, %xmm7 902 movaps 48 * SIZE(BB), %xmm3 903 904 mulps %xmm0, %xmm2 905 addps %xmm2, %xmm4 906 movaps 36 * SIZE(BB), %xmm2 907 mulps %xmm0, %xmm2 908 addps %xmm2, %xmm5 909 movaps 40 * SIZE(BB), %xmm2 910 mulps %xmm0, %xmm2 911 addps %xmm2, %xmm6 912 movaps 44 * SIZE(BB), %xmm2 913 mulps %xmm0, %xmm2 914 movsd 6 * SIZE(AA), %xmm0 915 addps %xmm2, %xmm7 916 movaps 64 * SIZE(BB), %xmm2 917 918 mulps %xmm0, %xmm3 919 addps %xmm3, %xmm4 920 movaps 52 * SIZE(BB), %xmm3 921 mulps 
%xmm0, %xmm3 922 addps %xmm3, %xmm5 923 movaps 56 * SIZE(BB), %xmm3 924 mulps %xmm0, %xmm3 925 addps %xmm3, %xmm6 926 movaps 60 * SIZE(BB), %xmm3 927 mulps %xmm0, %xmm3 928 movsd 16 * SIZE(AA), %xmm0 929 addps %xmm3, %xmm7 930 movaps 80 * SIZE(BB), %xmm3 931 932 mulps %xmm1, %xmm2 933 addps %xmm2, %xmm4 934 movaps 68 * SIZE(BB), %xmm2 935 mulps %xmm1, %xmm2 936 addps %xmm2, %xmm5 937 movaps 72 * SIZE(BB), %xmm2 938 mulps %xmm1, %xmm2 939 addps %xmm2, %xmm6 940 movaps 76 * SIZE(BB), %xmm2 941 mulps %xmm1, %xmm2 942 movsd 10 * SIZE(AA), %xmm1 943 addps %xmm2, %xmm7 944 movaps 96 * SIZE(BB), %xmm2 945 946 mulps %xmm1, %xmm3 947 addps %xmm3, %xmm4 948 movaps 84 * SIZE(BB), %xmm3 949 mulps %xmm1, %xmm3 950 addps %xmm3, %xmm5 951 movaps 88 * SIZE(BB), %xmm3 952 mulps %xmm1, %xmm3 953 addps %xmm3, %xmm6 954 movaps 92 * SIZE(BB), %xmm3 955 mulps %xmm1, %xmm3 956 movsd 12 * SIZE(AA), %xmm1 957 addps %xmm3, %xmm7 958 movaps 112 * SIZE(BB), %xmm3 959 960 mulps %xmm1, %xmm2 961 addps %xmm2, %xmm4 962 movaps 100 * SIZE(BB), %xmm2 963 mulps %xmm1, %xmm2 964 addps %xmm2, %xmm5 965 movaps 104 * SIZE(BB), %xmm2 966 mulps %xmm1, %xmm2 967 addps %xmm2, %xmm6 968 movaps 108 * SIZE(BB), %xmm2 969 mulps %xmm1, %xmm2 970 movsd 14 * SIZE(AA), %xmm1 971 addps %xmm2, %xmm7 972 movaps 128 * SIZE(BB), %xmm2 973 974 mulps %xmm1, %xmm3 975 addps %xmm3, %xmm4 976 movaps 116 * SIZE(BB), %xmm3 977 mulps %xmm1, %xmm3 978 addps %xmm3, %xmm5 979 movaps 120 * SIZE(BB), %xmm3 980 mulps %xmm1, %xmm3 981 addps %xmm3, %xmm6 982 movaps 124 * SIZE(BB), %xmm3 983 mulps %xmm1, %xmm3 984 movsd 24 * SIZE(AA), %xmm1 985 addps %xmm3, %xmm7 986 movaps 144 * SIZE(BB), %xmm3 987 988 addl $ 16 * SIZE, AA 989 addl $128 * SIZE, BB 990 decl %eax 991 jne .L22 992 ALIGN_4 993 994.L25: 995#if defined(LT) || defined(RN) 996 movl KK, %eax 997#else 998 movl K, %eax 999 subl KK, %eax 1000#endif 1001 andl $7, %eax # if (k & 1) 1002 BRANCH 1003 je .L28 1004 ALIGN_4 1005 1006.L26: 1007 mulps %xmm0, %xmm2 1008 addps %xmm2, %xmm4 
1009 movaps 4 * SIZE(BB), %xmm2 1010 mulps %xmm0, %xmm2 1011 addps %xmm2, %xmm5 1012 movaps 8 * SIZE(BB), %xmm2 1013 mulps %xmm0, %xmm2 1014 addps %xmm2, %xmm6 1015 movaps 12 * SIZE(BB), %xmm2 1016 mulps %xmm0, %xmm2 1017 movsd 2 * SIZE(AA), %xmm0 1018 addps %xmm2, %xmm7 1019 movaps 16 * SIZE(BB), %xmm2 1020 1021 addl $ 2 * SIZE, AA 1022 addl $16 * SIZE, BB 1023 decl %eax 1024 jg .L26 1025 ALIGN_4 1026 1027.L28: 1028#if defined(LN) || defined(RT) 1029 movl KK, %eax 1030#ifdef LN 1031 subl $2, %eax 1032#else 1033 subl $4, %eax 1034#endif 1035 1036 movl AORIG, AA 1037 movl BORIG, B 1038 leal BUFFER, BB 1039 1040 sall $1 + BASE_SHIFT, %eax 1041 leal (AA, %eax, 1), AA 1042 leal (B, %eax, 2), B 1043 leal (BB, %eax, 8), BB 1044#endif 1045 1046#if defined(LN) || defined(LT) 1047 unpcklps %xmm6, %xmm4 1048 unpcklps %xmm7, %xmm5 1049 1050 movaps %xmm4, %xmm6 1051 unpcklps %xmm5, %xmm4 1052 unpckhps %xmm5, %xmm6 1053 1054 movaps 0 * SIZE(B), %xmm1 1055 movaps 4 * SIZE(B), %xmm3 1056 1057 subps %xmm4, %xmm1 1058 subps %xmm6, %xmm3 1059#else 1060#ifdef movsd 1061 xorps %xmm0, %xmm0 1062#endif 1063 movsd 0 * SIZE(AA), %xmm0 1064#ifdef movsd 1065 xorps %xmm1, %xmm1 1066#endif 1067 movsd 2 * SIZE(AA), %xmm1 1068#ifdef movsd 1069 xorps %xmm2, %xmm2 1070#endif 1071 movsd 4 * SIZE(AA), %xmm2 1072#ifdef movsd 1073 xorps %xmm3, %xmm3 1074#endif 1075 movsd 6 * SIZE(AA), %xmm3 1076 1077 subps %xmm4, %xmm0 1078 subps %xmm5, %xmm1 1079 subps %xmm6, %xmm2 1080 subps %xmm7, %xmm3 1081#endif 1082 1083#ifdef LN 1084 movaps 0 * SIZE(AA), %xmm4 1085 pshufd $0xff, %xmm4, %xmm6 1086 mulps %xmm6, %xmm3 1087 pshufd $0xaa, %xmm4, %xmm6 1088 mulps %xmm3, %xmm6 1089 subps %xmm6, %xmm1 1090 1091 pshufd $0x00, %xmm4, %xmm6 1092 mulps %xmm6, %xmm1 1093#endif 1094 1095#ifdef LT 1096 movaps 0 * SIZE(AA), %xmm4 1097 pshufd $0x00, %xmm4, %xmm6 1098 mulps %xmm6, %xmm1 1099 1100 pshufd $0x55, %xmm4, %xmm6 1101 mulps %xmm1, %xmm6 1102 subps %xmm6, %xmm3 1103 1104 pshufd $0xff, %xmm4, %xmm6 1105 mulps %xmm6, 
%xmm3 1106#endif 1107 1108#ifdef RN 1109 movaps 0 * SIZE(B), %xmm6 1110 pshufd $0x00, %xmm6, %xmm7 1111 mulps %xmm7, %xmm0 1112 pshufd $0x55, %xmm6, %xmm7 1113 mulps %xmm0, %xmm7 1114 subps %xmm7, %xmm1 1115 pshufd $0xaa, %xmm6, %xmm7 1116 mulps %xmm0, %xmm7 1117 subps %xmm7, %xmm2 1118 pshufd $0xff, %xmm6, %xmm7 1119 mulps %xmm0, %xmm7 1120 subps %xmm7, %xmm3 1121 1122 movaps 4 * SIZE(B), %xmm6 1123 pshufd $0x55, %xmm6, %xmm7 1124 mulps %xmm7, %xmm1 1125 pshufd $0xaa, %xmm6, %xmm7 1126 mulps %xmm1, %xmm7 1127 subps %xmm7, %xmm2 1128 pshufd $0xff, %xmm6, %xmm7 1129 mulps %xmm1, %xmm7 1130 subps %xmm7, %xmm3 1131 1132 movaps 8 * SIZE(B), %xmm6 1133 pshufd $0xaa, %xmm6, %xmm7 1134 mulps %xmm7, %xmm2 1135 pshufd $0xff, %xmm6, %xmm7 1136 mulps %xmm2, %xmm7 1137 subps %xmm7, %xmm3 1138 1139 movaps 12 * SIZE(B), %xmm6 1140 pshufd $0xff, %xmm6, %xmm7 1141 mulps %xmm7, %xmm3 1142#endif 1143 1144#ifdef RT 1145 movaps 12 * SIZE(B), %xmm6 1146 pshufd $0xff, %xmm6, %xmm7 1147 mulps %xmm7, %xmm3 1148 pshufd $0xaa, %xmm6, %xmm7 1149 mulps %xmm3, %xmm7 1150 subps %xmm7, %xmm2 1151 pshufd $0x55, %xmm6, %xmm7 1152 mulps %xmm3, %xmm7 1153 subps %xmm7, %xmm1 1154 pshufd $0x00, %xmm6, %xmm7 1155 mulps %xmm3, %xmm7 1156 subps %xmm7, %xmm0 1157 1158 movaps 8 * SIZE(B), %xmm6 1159 pshufd $0xaa, %xmm6, %xmm7 1160 mulps %xmm7, %xmm2 1161 pshufd $0x55, %xmm6, %xmm7 1162 mulps %xmm2, %xmm7 1163 subps %xmm7, %xmm1 1164 pshufd $0x00, %xmm6, %xmm7 1165 mulps %xmm2, %xmm7 1166 subps %xmm7, %xmm0 1167 1168 movaps 4 * SIZE(B), %xmm6 1169 pshufd $0x55, %xmm6, %xmm7 1170 mulps %xmm7, %xmm1 1171 pshufd $0x00, %xmm6, %xmm7 1172 mulps %xmm1, %xmm7 1173 subps %xmm7, %xmm0 1174 1175 movaps 0 * SIZE(B), %xmm6 1176 pshufd $0x00, %xmm6, %xmm7 1177 mulps %xmm7, %xmm0 1178#endif 1179 1180#if defined(LN) || defined(LT) 1181 movaps %xmm1, 0 * SIZE(B) 1182 movaps %xmm3, 4 * SIZE(B) 1183 1184 pshufd $0x00, %xmm1, %xmm0 1185 pshufd $0x55, %xmm1, %xmm2 1186 pshufd $0xaa, %xmm1, %xmm4 1187 pshufd $0xff, %xmm1, %xmm6 
1188 movaps %xmm0, 0 * SIZE(BB) 1189 movaps %xmm2, 4 * SIZE(BB) 1190 movaps %xmm4, 8 * SIZE(BB) 1191 movaps %xmm6, 12 * SIZE(BB) 1192 1193 pshufd $0x00, %xmm3, %xmm0 1194 pshufd $0x55, %xmm3, %xmm2 1195 pshufd $0xaa, %xmm3, %xmm4 1196 pshufd $0xff, %xmm3, %xmm6 1197 movaps %xmm0, 16 * SIZE(BB) 1198 movaps %xmm2, 20 * SIZE(BB) 1199 movaps %xmm4, 24 * SIZE(BB) 1200 movaps %xmm6, 28 * SIZE(BB) 1201#else 1202 movlps %xmm0, 0 * SIZE(AA) 1203 movlps %xmm1, 2 * SIZE(AA) 1204 movlps %xmm2, 4 * SIZE(AA) 1205 movlps %xmm3, 6 * SIZE(AA) 1206#endif 1207 1208#ifdef LN 1209 subl $2 * SIZE, CO1 1210#endif 1211 1212 leal (LDC, LDC, 2), %eax 1213 1214#if defined(LN) || defined(LT) 1215 movaps %xmm1, %xmm0 1216 unpcklps %xmm5, %xmm1 1217 unpckhps %xmm5, %xmm0 1218 1219 movaps %xmm3, %xmm4 1220 unpcklps %xmm7, %xmm3 1221 unpckhps %xmm7, %xmm4 1222 1223 movaps %xmm1, %xmm2 1224 unpcklps %xmm3, %xmm1 1225 unpckhps %xmm3, %xmm2 1226 1227 movaps %xmm0, %xmm6 1228 unpcklps %xmm4, %xmm0 1229 unpckhps %xmm4, %xmm6 1230 1231 movlps %xmm1, 0 * SIZE(CO1) 1232 movlps %xmm2, 0 * SIZE(CO1, LDC, 1) 1233 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) 1234 movlps %xmm6, 0 * SIZE(CO1, %eax, 1) 1235#else 1236 movlps %xmm0, 0 * SIZE(CO1) 1237 movlps %xmm1, 0 * SIZE(CO1, LDC, 1) 1238 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 1239 movlps %xmm3, 0 * SIZE(CO1, %eax, 1) 1240#endif 1241 1242#ifndef LN 1243 addl $2 * SIZE, CO1 1244#endif 1245 1246#if defined(LT) || defined(RN) 1247 movl K, %eax 1248 subl KK, %eax 1249 leal (,%eax, SIZE), %eax 1250 leal (AA, %eax, 2), AA 1251#ifdef LT 1252 addl $8 * SIZE, B 1253#endif 1254#endif 1255 1256#ifdef LN 1257 subl $2, KK 1258 movl BORIG, B 1259#endif 1260 1261#ifdef LT 1262 addl $2, KK 1263#endif 1264 1265#ifdef RT 1266 movl K, %eax 1267 movl BORIG, B 1268 sall $1 + BASE_SHIFT, %eax 1269 addl %eax, AORIG 1270#endif 1271 ALIGN_4 1272 1273.L30: 1274 testl $1, M 1275 je .L39 1276 1277#ifdef LN 1278 movl K, %eax 1279 sall $BASE_SHIFT, %eax 1280 subl %eax, AORIG 1281#endif 1282 1283#if 
defined(LN) || defined(RT) 1284 movl KK, %eax 1285 movl AORIG, AA 1286 leal (AA, %eax, SIZE), AA 1287#endif 1288 1289 leal BUFFER, BB 1290 1291#if defined(LN) || defined(RT) 1292 movl KK, %eax 1293 sall $2 + BASE_SHIFT, %eax 1294 leal (BB, %eax, 4), BB 1295#endif 1296 1297 movss 0 * SIZE(AA), %xmm0 1298 xorps %xmm4, %xmm4 1299 movss 4 * SIZE(AA), %xmm1 1300 xorps %xmm5, %xmm5 1301 movss 0 * SIZE(BB), %xmm2 1302 xorps %xmm6, %xmm6 1303 movss 16 * SIZE(BB), %xmm3 1304 xorps %xmm7, %xmm7 1305 1306#if defined(LT) || defined(RN) 1307 movl KK, %eax 1308#else 1309 movl K, %eax 1310 subl KK, %eax 1311#endif 1312 sarl $3, %eax 1313 je .L35 1314 ALIGN_4 1315 1316.L32: 1317 mulss %xmm0, %xmm2 1318 addss %xmm2, %xmm4 1319#if defined(OPTERON) || defined(BARCELONA) 1320 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 1321#endif 1322 movss 4 * SIZE(BB), %xmm2 1323 mulss %xmm0, %xmm2 1324 addss %xmm2, %xmm5 1325 movss 8 * SIZE(BB), %xmm2 1326 mulss %xmm0, %xmm2 1327 mulss 12 * SIZE(BB), %xmm0 1328 addss %xmm2, %xmm6 1329 movss 32 * SIZE(BB), %xmm2 1330 addss %xmm0, %xmm7 1331 movss 1 * SIZE(AA), %xmm0 1332 1333 mulss %xmm0, %xmm3 1334 addss %xmm3, %xmm4 1335 movss 20 * SIZE(BB), %xmm3 1336 mulss %xmm0, %xmm3 1337 addss %xmm3, %xmm5 1338 movss 24 * SIZE(BB), %xmm3 1339 mulss %xmm0, %xmm3 1340 mulss 28 * SIZE(BB), %xmm0 1341 addss %xmm3, %xmm6 1342 movss 48 * SIZE(BB), %xmm3 1343 addss %xmm0, %xmm7 1344 movss 2 * SIZE(AA), %xmm0 1345 1346 mulss %xmm0, %xmm2 1347 addss %xmm2, %xmm4 1348 movss 36 * SIZE(BB), %xmm2 1349 mulss %xmm0, %xmm2 1350 addss %xmm2, %xmm5 1351 movss 40 * SIZE(BB), %xmm2 1352 mulss %xmm0, %xmm2 1353 mulss 44 * SIZE(BB), %xmm0 1354 addss %xmm2, %xmm6 1355 movss 64 * SIZE(BB), %xmm2 1356 addss %xmm0, %xmm7 1357 movss 3 * SIZE(AA), %xmm0 1358 1359 mulss %xmm0, %xmm3 1360 addss %xmm3, %xmm4 1361 movss 52 * SIZE(BB), %xmm3 1362 mulss %xmm0, %xmm3 1363 addss %xmm3, %xmm5 1364 movss 56 * SIZE(BB), %xmm3 1365 mulss %xmm0, %xmm3 1366 mulss 60 * SIZE(BB), %xmm0 1367 addss %xmm3, 
%xmm6 1368 movss 80 * SIZE(BB), %xmm3 1369 addss %xmm0, %xmm7 1370 movss 8 * SIZE(AA), %xmm0 1371 1372 mulss %xmm1, %xmm2 1373 addss %xmm2, %xmm4 1374 movss 68 * SIZE(BB), %xmm2 1375 mulss %xmm1, %xmm2 1376 addss %xmm2, %xmm5 1377 movss 72 * SIZE(BB), %xmm2 1378 mulss %xmm1, %xmm2 1379 mulss 76 * SIZE(BB), %xmm1 1380 addss %xmm2, %xmm6 1381 movss 96 * SIZE(BB), %xmm2 1382 addss %xmm1, %xmm7 1383 movss 5 * SIZE(AA), %xmm1 1384 1385 mulss %xmm1, %xmm3 1386 addss %xmm3, %xmm4 1387 movss 84 * SIZE(BB), %xmm3 1388 mulss %xmm1, %xmm3 1389 addss %xmm3, %xmm5 1390 movss 88 * SIZE(BB), %xmm3 1391 mulss %xmm1, %xmm3 1392 mulss 92 * SIZE(BB), %xmm1 1393 addss %xmm3, %xmm6 1394 movss 112 * SIZE(BB), %xmm3 1395 addss %xmm1, %xmm7 1396 movss 6 * SIZE(AA), %xmm1 1397 1398 mulss %xmm1, %xmm2 1399 addss %xmm2, %xmm4 1400 movss 100 * SIZE(BB), %xmm2 1401 mulss %xmm1, %xmm2 1402 addss %xmm2, %xmm5 1403 movss 104 * SIZE(BB), %xmm2 1404 mulss %xmm1, %xmm2 1405 mulss 108 * SIZE(BB), %xmm1 1406 addss %xmm2, %xmm6 1407 movss 128 * SIZE(BB), %xmm2 1408 addss %xmm1, %xmm7 1409 movss 7 * SIZE(AA), %xmm1 1410 1411 mulss %xmm1, %xmm3 1412 addss %xmm3, %xmm4 1413 movss 116 * SIZE(BB), %xmm3 1414 mulss %xmm1, %xmm3 1415 addss %xmm3, %xmm5 1416 movss 120 * SIZE(BB), %xmm3 1417 mulss %xmm1, %xmm3 1418 mulss 124 * SIZE(BB), %xmm1 1419 addss %xmm3, %xmm6 1420 movss 144 * SIZE(BB), %xmm3 1421 addss %xmm1, %xmm7 1422 movss 12 * SIZE(AA), %xmm1 1423 1424 addl $ 8 * SIZE, AA 1425 addl $128 * SIZE, BB 1426 decl %eax 1427 jne .L32 1428 ALIGN_4 1429 1430.L35: 1431#if defined(LT) || defined(RN) 1432 movl KK, %eax 1433#else 1434 movl K, %eax 1435 subl KK, %eax 1436#endif 1437 andl $7, %eax # if (k & 1) 1438 BRANCH 1439 je .L38 1440 ALIGN_4 1441 1442.L36: 1443 mulss %xmm0, %xmm2 1444 addss %xmm2, %xmm4 1445 movss 4 * SIZE(BB), %xmm2 1446 mulss %xmm0, %xmm2 1447 addss %xmm2, %xmm5 1448 movss 8 * SIZE(BB), %xmm2 1449 mulss %xmm0, %xmm2 1450 mulss 12 * SIZE(BB), %xmm0 1451 addss %xmm2, %xmm6 1452 movss 16 * 
SIZE(BB), %xmm2 1453 addss %xmm0, %xmm7 1454 movss 1 * SIZE(AA), %xmm0 1455 1456 addl $ 1 * SIZE, AA 1457 addl $16 * SIZE, BB 1458 decl %eax 1459 jg .L36 1460 ALIGN_4 1461 1462.L38: 1463#if defined(LN) || defined(RT) 1464 movl KK, %eax 1465#ifdef LN 1466 subl $1, %eax 1467#else 1468 subl $4, %eax 1469#endif 1470 1471 movl AORIG, AA 1472 movl BORIG, B 1473 leal BUFFER, BB 1474 1475 leal (AA, %eax, SIZE), AA 1476 1477 sall $2 + BASE_SHIFT, %eax 1478 leal (B, %eax, 1), B 1479 leal (BB, %eax, 4), BB 1480#endif 1481 1482#if defined(LN) || defined(LT) 1483 unpcklps %xmm6, %xmm4 1484 unpcklps %xmm7, %xmm5 1485 unpcklps %xmm5, %xmm4 1486 1487 movaps 0 * SIZE(B), %xmm1 1488 1489 subps %xmm4, %xmm1 1490#else 1491 movss 0 * SIZE(AA), %xmm0 1492 movss 1 * SIZE(AA), %xmm1 1493 movss 2 * SIZE(AA), %xmm2 1494 movss 3 * SIZE(AA), %xmm3 1495 1496 subss %xmm4, %xmm0 1497 subss %xmm5, %xmm1 1498 subss %xmm6, %xmm2 1499 subss %xmm7, %xmm3 1500#endif 1501 1502#if defined(LN) || defined(LT) 1503 movss 0 * SIZE(AA), %xmm4 1504 pshufd $0x00, %xmm4, %xmm6 1505 mulps %xmm6, %xmm1 1506#endif 1507 1508#ifdef RN 1509 movaps 0 * SIZE(B), %xmm6 1510 pshufd $0x00, %xmm6, %xmm7 1511 mulss %xmm7, %xmm0 1512 pshufd $0x55, %xmm6, %xmm7 1513 mulss %xmm0, %xmm7 1514 subss %xmm7, %xmm1 1515 pshufd $0xaa, %xmm6, %xmm7 1516 mulss %xmm0, %xmm7 1517 subss %xmm7, %xmm2 1518 pshufd $0xff, %xmm6, %xmm7 1519 mulss %xmm0, %xmm7 1520 subss %xmm7, %xmm3 1521 1522 movaps 4 * SIZE(B), %xmm6 1523 pshufd $0x55, %xmm6, %xmm7 1524 mulss %xmm7, %xmm1 1525 pshufd $0xaa, %xmm6, %xmm7 1526 mulss %xmm1, %xmm7 1527 subss %xmm7, %xmm2 1528 pshufd $0xff, %xmm6, %xmm7 1529 mulss %xmm1, %xmm7 1530 subss %xmm7, %xmm3 1531 1532 movaps 8 * SIZE(B), %xmm6 1533 pshufd $0xaa, %xmm6, %xmm7 1534 mulss %xmm7, %xmm2 1535 pshufd $0xff, %xmm6, %xmm7 1536 mulss %xmm2, %xmm7 1537 subss %xmm7, %xmm3 1538 1539 movaps 12 * SIZE(B), %xmm6 1540 pshufd $0xff, %xmm6, %xmm7 1541 mulss %xmm7, %xmm3 1542#endif 1543 1544#ifdef RT 1545 movaps 12 * 
SIZE(B), %xmm6 1546 pshufd $0xff, %xmm6, %xmm7 1547 mulss %xmm7, %xmm3 1548 pshufd $0xaa, %xmm6, %xmm7 1549 mulss %xmm3, %xmm7 1550 subss %xmm7, %xmm2 1551 pshufd $0x55, %xmm6, %xmm7 1552 mulss %xmm3, %xmm7 1553 subss %xmm7, %xmm1 1554 pshufd $0x00, %xmm6, %xmm7 1555 mulss %xmm3, %xmm7 1556 subss %xmm7, %xmm0 1557 1558 movaps 8 * SIZE(B), %xmm6 1559 pshufd $0xaa, %xmm6, %xmm7 1560 mulss %xmm7, %xmm2 1561 pshufd $0x55, %xmm6, %xmm7 1562 mulss %xmm2, %xmm7 1563 subss %xmm7, %xmm1 1564 pshufd $0x00, %xmm6, %xmm7 1565 mulss %xmm2, %xmm7 1566 subss %xmm7, %xmm0 1567 1568 movaps 4 * SIZE(B), %xmm6 1569 pshufd $0x55, %xmm6, %xmm7 1570 mulss %xmm7, %xmm1 1571 pshufd $0x00, %xmm6, %xmm7 1572 mulss %xmm1, %xmm7 1573 subss %xmm7, %xmm0 1574 1575 movaps 0 * SIZE(B), %xmm6 1576 pshufd $0x00, %xmm6, %xmm7 1577 mulss %xmm7, %xmm0 1578#endif 1579 1580#if defined(LN) || defined(LT) 1581 movaps %xmm1, 0 * SIZE(B) 1582 1583 pshufd $0x00, %xmm1, %xmm0 1584 pshufd $0x55, %xmm1, %xmm2 1585 pshufd $0xaa, %xmm1, %xmm4 1586 pshufd $0xff, %xmm1, %xmm6 1587 movaps %xmm0, 0 * SIZE(BB) 1588 movaps %xmm2, 4 * SIZE(BB) 1589 movaps %xmm4, 8 * SIZE(BB) 1590 movaps %xmm6, 12 * SIZE(BB) 1591#else 1592 movss %xmm0, 0 * SIZE(AA) 1593 movss %xmm1, 1 * SIZE(AA) 1594 movss %xmm2, 2 * SIZE(AA) 1595 movss %xmm3, 3 * SIZE(AA) 1596#endif 1597 1598#ifdef LN 1599 subl $1 * SIZE, CO1 1600#endif 1601 1602 leal (LDC, LDC, 2), %eax 1603 1604#if defined(LN) || defined(LT) 1605 movaps %xmm1, %xmm0 1606 unpcklps %xmm5, %xmm1 1607 unpckhps %xmm5, %xmm0 1608 1609 movaps %xmm3, %xmm4 1610 unpcklps %xmm7, %xmm3 1611 unpckhps %xmm7, %xmm4 1612 1613 movaps %xmm1, %xmm2 1614 unpcklps %xmm3, %xmm1 1615 unpckhps %xmm3, %xmm2 1616 1617 movaps %xmm0, %xmm6 1618 unpcklps %xmm4, %xmm0 1619 unpckhps %xmm4, %xmm6 1620 1621 movss %xmm1, 0 * SIZE(CO1) 1622 movss %xmm2, 0 * SIZE(CO1, LDC, 1) 1623 movss %xmm0, 0 * SIZE(CO1, LDC, 2) 1624 movss %xmm6, 0 * SIZE(CO1, %eax, 1) 1625#else 1626 movss %xmm0, 0 * SIZE(CO1) 1627 movss %xmm1, 0 * 
SIZE(CO1, LDC, 1) 1628 movss %xmm2, 0 * SIZE(CO1, LDC, 2) 1629 movss %xmm3, 0 * SIZE(CO1, %eax, 1) 1630#endif 1631 1632#ifndef LN 1633 addl $1 * SIZE, CO1 1634#endif 1635 1636#if defined(LT) || defined(RN) 1637 movl K, %eax 1638 subl KK, %eax 1639 leal (AA, %eax, SIZE), AA 1640#ifdef LT 1641 addl $4 * SIZE, B 1642#endif 1643#endif 1644 1645#ifdef LN 1646 subl $1, KK 1647 movl BORIG, B 1648#endif 1649 1650#ifdef LT 1651 addl $1, KK 1652#endif 1653 1654#ifdef RT 1655 movl K, %eax 1656 movl BORIG, B 1657 sall $BASE_SHIFT, %eax 1658 addl %eax, AORIG 1659#endif 1660 ALIGN_4 1661 1662.L39: 1663#ifdef LN 1664 movl K, %eax 1665 leal (, %eax, SIZE), %eax 1666 leal (B, %eax, 4), B 1667#endif 1668 1669#if defined(LT) || defined(RN) 1670 movl K, %eax 1671 subl KK, %eax 1672 leal (,%eax, SIZE), %eax 1673 leal (B, %eax, 4), B 1674#endif 1675 1676#ifdef RN 1677 addl $4, KK 1678#endif 1679 1680#ifdef RT 1681 subl $4, KK 1682#endif 1683 1684 decl J # j -- 1685 jg .L01 1686 ALIGN_4 1687 1688.L40: 1689 testl $2, N 1690 je .L80 1691 1692#ifdef LN 1693 movl OFFSET, %eax 1694 addl M, %eax 1695 movl %eax, KK 1696#endif 1697 1698 leal BUFFER, %ecx 1699 1700#ifdef RT 1701 movl K, %eax 1702 sall $1 + BASE_SHIFT, %eax 1703 subl %eax, B 1704#endif 1705 1706#if defined(LN) || defined(RT) 1707 movl KK, %eax 1708 movl B, BORIG 1709 sall $1 + BASE_SHIFT, %eax 1710 leal (B, %eax, 1), B 1711 leal (BB, %eax, 4), BB 1712#endif 1713 1714#ifdef LT 1715 movl OFFSET, %eax 1716 movl %eax, KK 1717#endif 1718 1719#if defined(LT) || defined(RN) 1720 movl KK, %eax 1721#else 1722 movl K, %eax 1723 subl KK, %eax 1724#endif 1725 sarl $2, %eax 1726 jle .L45 1727 ALIGN_4 1728 1729.L42: 1730 movaps 0 * SIZE(B), %xmm3 1731 movaps 4 * SIZE(B), %xmm7 1732 1733 pshufd $0x00, %xmm3, %xmm0 1734 pshufd $0x55, %xmm3, %xmm1 1735 pshufd $0xaa, %xmm3, %xmm2 1736 pshufd $0xff, %xmm3, %xmm3 1737 1738 pshufd $0x00, %xmm7, %xmm4 1739 pshufd $0x55, %xmm7, %xmm5 1740 pshufd $0xaa, %xmm7, %xmm6 1741 pshufd $0xff, %xmm7, %xmm7 1742 
1743 movaps %xmm0, 0 * SIZE(BB) 1744 movaps %xmm1, 4 * SIZE(BB) 1745 movaps %xmm2, 8 * SIZE(BB) 1746 movaps %xmm3, 12 * SIZE(BB) 1747 movaps %xmm4, 16 * SIZE(BB) 1748 movaps %xmm5, 20 * SIZE(BB) 1749 movaps %xmm6, 24 * SIZE(BB) 1750 movaps %xmm7, 28 * SIZE(BB) 1751 1752 addl $ 8 * SIZE, B 1753 addl $32 * SIZE, %ecx 1754 decl %eax 1755 jne .L42 1756 ALIGN_4 1757 1758.L45: 1759#if defined(LT) || defined(RN) 1760 movl KK, %eax 1761#else 1762 movl K, %eax 1763 subl KK, %eax 1764#endif 1765 andl $3, %eax 1766 BRANCH 1767 jle .L50 1768 ALIGN_4 1769 1770.L46: 1771#ifdef movsd 1772 xorps %xmm3, %xmm3 1773#endif 1774 movsd 0 * SIZE(B), %xmm3 1775 1776 pshufd $0x00, %xmm3, %xmm0 1777 pshufd $0x55, %xmm3, %xmm1 1778 1779 movaps %xmm0, 0 * SIZE(BB) 1780 movaps %xmm1, 4 * SIZE(BB) 1781 1782 addl $2 * SIZE, B 1783 addl $8 * SIZE, %ecx 1784 decl %eax 1785 jne .L46 1786 ALIGN_4 1787 1788.L50: 1789#if defined(LT) || defined(RN) 1790 movl A, AA 1791#else 1792 movl A, %eax 1793 movl %eax, AORIG 1794#endif 1795 1796 leal (, LDC, 2), %eax 1797 1798#ifdef RT 1799 subl %eax, C 1800#endif 1801 movl C, CO1 1802#ifndef RT 1803 addl %eax, C 1804#endif 1805 1806 movl M, %ebx 1807 sarl $2, %ebx # i = (m >> 2) 1808 jle .L60 1809 ALIGN_4 1810 1811.L51: 1812#ifdef LN 1813 movl K, %eax 1814 sall $2 + BASE_SHIFT, %eax 1815 subl %eax, AORIG 1816#endif 1817 1818#if defined(LN) || defined(RT) 1819 movl KK, %eax 1820 movl AORIG, AA 1821 leal (, %eax, SIZE), %eax 1822 leal (AA, %eax, 4), AA 1823#endif 1824 1825 leal BUFFER, BB 1826 1827#if defined(LN) || defined(RT) 1828 movl KK, %eax 1829 sall $1 + BASE_SHIFT, %eax 1830 leal (BB, %eax, 4), BB 1831#endif 1832 1833 xorps %xmm4, %xmm4 1834 xorps %xmm5, %xmm5 1835 xorps %xmm6, %xmm6 1836 xorps %xmm7, %xmm7 1837 1838 movaps 0 * SIZE(AA), %xmm0 1839 movaps 16 * SIZE(AA), %xmm1 1840 movaps 0 * SIZE(BB), %xmm2 1841 movaps 16 * SIZE(BB), %xmm3 1842 1843 PREFETCHW 3 * SIZE(CO1) 1844 PREFETCHW 3 * SIZE(CO1, LDC) 1845 1846#if defined(LT) || defined(RN) 1847 movl 
KK, %eax 1848#else 1849 movl K, %eax 1850 subl KK, %eax 1851#endif 1852 sarl $3, %eax 1853 je .L55 1854 ALIGN_4 1855 1856.L52: 1857 mulps %xmm0, %xmm2 1858#if defined(OPTERON) || defined(BARCELONA) 1859 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 1860#endif 1861 mulps 4 * SIZE(BB), %xmm0 1862 addps %xmm2, %xmm4 1863 movaps 8 * SIZE(BB), %xmm2 1864 addps %xmm0, %xmm5 1865 movaps 4 * SIZE(AA), %xmm0 1866 1867 mulps %xmm0, %xmm2 1868 mulps 12 * SIZE(BB), %xmm0 1869 addps %xmm2, %xmm4 1870 movaps 32 * SIZE(BB), %xmm2 1871 addps %xmm0, %xmm5 1872 movaps 8 * SIZE(AA), %xmm0 1873 1874 mulps %xmm0, %xmm3 1875 mulps 20 * SIZE(BB), %xmm0 1876 addps %xmm3, %xmm4 1877 movaps 24 * SIZE(BB), %xmm3 1878 addps %xmm0, %xmm5 1879 movaps 12 * SIZE(AA), %xmm0 1880 1881 mulps %xmm0, %xmm3 1882 mulps 28 * SIZE(BB), %xmm0 1883 addps %xmm3, %xmm4 1884 movaps 48 * SIZE(BB), %xmm3 1885 addps %xmm0, %xmm5 1886 movaps 32 * SIZE(AA), %xmm0 1887 1888#if defined(OPTERON) || defined(BARCELONA) 1889 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) 1890#endif 1891 mulps %xmm1, %xmm2 1892 mulps 36 * SIZE(BB), %xmm1 1893 addps %xmm2, %xmm4 1894 movaps 40 * SIZE(BB), %xmm2 1895 addps %xmm1, %xmm5 1896 movaps 20 * SIZE(AA), %xmm1 1897 1898 mulps %xmm1, %xmm2 1899 mulps 44 * SIZE(BB), %xmm1 1900 addps %xmm2, %xmm4 1901 movaps 64 * SIZE(BB), %xmm2 1902 addps %xmm1, %xmm5 1903 movaps 24 * SIZE(AA), %xmm1 1904 1905 mulps %xmm1, %xmm3 1906 mulps 52 * SIZE(BB), %xmm1 1907 addps %xmm3, %xmm4 1908 movaps 56 * SIZE(BB), %xmm3 1909 addps %xmm1, %xmm5 1910 movaps 28 * SIZE(AA), %xmm1 1911 1912 mulps %xmm1, %xmm3 1913 mulps 60 * SIZE(BB), %xmm1 1914 addps %xmm3, %xmm4 1915 movaps 80 * SIZE(BB), %xmm3 1916 addps %xmm1, %xmm5 1917 movaps 48 * SIZE(AA), %xmm1 1918 1919 addl $32 * SIZE, AA 1920 addl $64 * SIZE, BB 1921 decl %eax 1922 jne .L52 1923 ALIGN_4 1924 1925.L55: 1926#if defined(LT) || defined(RN) 1927 movl KK, %eax 1928#else 1929 movl K, %eax 1930 subl KK, %eax 1931#endif 1932 andl $7, %eax # if (k & 1) 1933 BRANCH 1934 
je .L58 1935 ALIGN_4 1936 1937.L56: 1938 mulps %xmm0, %xmm2 1939 mulps 4 * SIZE(BB), %xmm0 1940 addps %xmm2, %xmm4 1941 movaps 8 * SIZE(BB), %xmm2 1942 addps %xmm0, %xmm5 1943 movaps 4 * SIZE(AA), %xmm0 1944 1945 addl $4 * SIZE, AA 1946 addl $8 * SIZE, BB 1947 decl %eax 1948 jg .L56 1949 ALIGN_4 1950 1951.L58: 1952#if defined(LN) || defined(RT) 1953 movl KK, %eax 1954#ifdef LN 1955 subl $4, %eax 1956#else 1957 subl $2, %eax 1958#endif 1959 1960 movl AORIG, AA 1961 movl BORIG, B 1962 leal BUFFER, BB 1963 1964 sall $1 + BASE_SHIFT, %eax 1965 leal (AA, %eax, 2), AA 1966 leal (B, %eax, 1), B 1967 leal (BB, %eax, 4), BB 1968#endif 1969 1970#if defined(LN) || defined(LT) 1971 movaps %xmm4, %xmm0 1972 unpcklps %xmm6, %xmm4 1973 unpckhps %xmm6, %xmm0 1974 1975 movaps %xmm5, %xmm1 1976 unpcklps %xmm7, %xmm5 1977 unpckhps %xmm7, %xmm1 1978 1979 movaps %xmm4, %xmm6 1980 unpcklps %xmm5, %xmm4 1981 unpckhps %xmm5, %xmm6 1982 1983 movaps %xmm0, %xmm2 1984 unpcklps %xmm1, %xmm0 1985 unpckhps %xmm1, %xmm2 1986 1987#ifdef movsd 1988 xorps %xmm1, %xmm1 1989#endif 1990 movsd 0 * SIZE(B), %xmm1 1991#ifdef movsd 1992 xorps %xmm3, %xmm3 1993#endif 1994 movsd 2 * SIZE(B), %xmm3 1995#ifdef movsd 1996 xorps %xmm5, %xmm5 1997#endif 1998 movsd 4 * SIZE(B), %xmm5 1999#ifdef movsd 2000 xorps %xmm7, %xmm7 2001#endif 2002 movsd 6 * SIZE(B), %xmm7 2003 2004 subps %xmm4, %xmm1 2005 subps %xmm6, %xmm3 2006 subps %xmm0, %xmm5 2007 subps %xmm2, %xmm7 2008#else 2009 movaps 0 * SIZE(AA), %xmm0 2010 movaps 4 * SIZE(AA), %xmm1 2011 2012 subps %xmm4, %xmm0 2013 subps %xmm5, %xmm1 2014#endif 2015 2016#ifdef LN 2017 movaps 12 * SIZE(AA), %xmm4 2018 pshufd $0xff, %xmm4, %xmm6 2019 mulps %xmm6, %xmm7 2020 pshufd $0xaa, %xmm4, %xmm6 2021 mulps %xmm7, %xmm6 2022 subps %xmm6, %xmm5 2023 pshufd $0x55, %xmm4, %xmm6 2024 mulps %xmm7, %xmm6 2025 subps %xmm6, %xmm3 2026 pshufd $0x00, %xmm4, %xmm6 2027 mulps %xmm7, %xmm6 2028 subps %xmm6, %xmm1 2029 2030 movaps 8 * SIZE(AA), %xmm4 2031 pshufd $0xaa, %xmm4, %xmm6 2032 
mulps %xmm6, %xmm5 2033 pshufd $0x55, %xmm4, %xmm6 2034 mulps %xmm5, %xmm6 2035 subps %xmm6, %xmm3 2036 pshufd $0x00, %xmm4, %xmm6 2037 mulps %xmm5, %xmm6 2038 subps %xmm6, %xmm1 2039 2040 movaps 4 * SIZE(AA), %xmm4 2041 pshufd $0x55, %xmm4, %xmm6 2042 mulps %xmm6, %xmm3 2043 pshufd $0x00, %xmm4, %xmm6 2044 mulps %xmm3, %xmm6 2045 subps %xmm6, %xmm1 2046 2047 movaps 0 * SIZE(AA), %xmm4 2048 pshufd $0x00, %xmm4, %xmm6 2049 mulps %xmm6, %xmm1 2050#endif 2051 2052#ifdef LT 2053 movaps 0 * SIZE(AA), %xmm4 2054 pshufd $0x00, %xmm4, %xmm6 2055 mulps %xmm6, %xmm1 2056 2057 pshufd $0x55, %xmm4, %xmm6 2058 mulps %xmm1, %xmm6 2059 subps %xmm6, %xmm3 2060 pshufd $0xaa, %xmm4, %xmm6 2061 mulps %xmm1, %xmm6 2062 subps %xmm6, %xmm5 2063 pshufd $0xff, %xmm4, %xmm6 2064 mulps %xmm1, %xmm6 2065 subps %xmm6, %xmm7 2066 2067 movaps 4 * SIZE(AA), %xmm4 2068 pshufd $0x55, %xmm4, %xmm6 2069 mulps %xmm6, %xmm3 2070 pshufd $0xaa, %xmm4, %xmm6 2071 mulps %xmm3, %xmm6 2072 subps %xmm6, %xmm5 2073 pshufd $0xff, %xmm4, %xmm6 2074 mulps %xmm3, %xmm6 2075 subps %xmm6, %xmm7 2076 2077 movaps 8 * SIZE(AA), %xmm4 2078 pshufd $0xaa, %xmm4, %xmm6 2079 mulps %xmm6, %xmm5 2080 pshufd $0xff, %xmm4, %xmm6 2081 mulps %xmm5, %xmm6 2082 subps %xmm6, %xmm7 2083 2084 movaps 12 * SIZE(AA), %xmm4 2085 pshufd $0xff, %xmm4, %xmm6 2086 mulps %xmm6, %xmm7 2087#endif 2088 2089#ifdef RN 2090 movaps 0 * SIZE(B), %xmm6 2091 pshufd $0x00, %xmm6, %xmm7 2092 mulps %xmm7, %xmm0 2093 pshufd $0x55, %xmm6, %xmm7 2094 mulps %xmm0, %xmm7 2095 subps %xmm7, %xmm1 2096 2097 pshufd $0xff, %xmm6, %xmm7 2098 mulps %xmm7, %xmm1 2099#endif 2100 2101#ifdef RT 2102 movaps 0 * SIZE(B), %xmm6 2103 pshufd $0xff, %xmm6, %xmm7 2104 mulps %xmm7, %xmm1 2105 pshufd $0xaa, %xmm6, %xmm7 2106 mulps %xmm1, %xmm7 2107 subps %xmm7, %xmm0 2108 2109 pshufd $0x00, %xmm6, %xmm7 2110 mulps %xmm7, %xmm0 2111#endif 2112 2113#if defined(LN) || defined(LT) 2114 movlps %xmm1, 0 * SIZE(B) 2115 movlps %xmm3, 2 * SIZE(B) 2116 movlps %xmm5, 4 * SIZE(B) 2117 movlps 
%xmm7, 6 * SIZE(B) 2118 2119 pshufd $0x00, %xmm1, %xmm0 2120 pshufd $0x55, %xmm1, %xmm2 2121 movaps %xmm0, 0 * SIZE(BB) 2122 movaps %xmm2, 4 * SIZE(BB) 2123 2124 pshufd $0x00, %xmm3, %xmm0 2125 pshufd $0x55, %xmm3, %xmm2 2126 movaps %xmm0, 8 * SIZE(BB) 2127 movaps %xmm2, 12 * SIZE(BB) 2128 2129 pshufd $0x00, %xmm5, %xmm0 2130 pshufd $0x55, %xmm5, %xmm2 2131 movaps %xmm0, 16 * SIZE(BB) 2132 movaps %xmm2, 20 * SIZE(BB) 2133 2134 pshufd $0x00, %xmm7, %xmm0 2135 pshufd $0x55, %xmm7, %xmm2 2136 movaps %xmm0, 24 * SIZE(BB) 2137 movaps %xmm2, 28 * SIZE(BB) 2138#else 2139 movaps %xmm0, 0 * SIZE(AA) 2140 movaps %xmm1, 4 * SIZE(AA) 2141#endif 2142 2143#ifdef LN 2144 subl $4 * SIZE, CO1 2145#endif 2146 2147#if defined(LN) || defined(LT) 2148 unpcklps %xmm5, %xmm1 2149 unpcklps %xmm7, %xmm3 2150 2151 movaps %xmm1, %xmm2 2152 unpcklps %xmm3, %xmm1 2153 unpckhps %xmm3, %xmm2 2154 2155 movlps %xmm1, 0 * SIZE(CO1) 2156 movhps %xmm1, 2 * SIZE(CO1) 2157 movlps %xmm2, 0 * SIZE(CO1, LDC, 1) 2158 movhps %xmm2, 2 * SIZE(CO1, LDC, 1) 2159#else 2160 movlps %xmm0, 0 * SIZE(CO1) 2161 movhps %xmm0, 2 * SIZE(CO1) 2162 movlps %xmm1, 0 * SIZE(CO1, LDC, 1) 2163 movhps %xmm1, 2 * SIZE(CO1, LDC, 1) 2164#endif 2165 2166#ifndef LN 2167 addl $4 * SIZE, CO1 2168#endif 2169 2170#if defined(LT) || defined(RN) 2171 movl K, %eax 2172 subl KK, %eax 2173 leal (,%eax, SIZE), %eax 2174 leal (AA, %eax, 4), AA 2175#ifdef LT 2176 addl $8 * SIZE, B 2177#endif 2178#endif 2179 2180#ifdef LN 2181 subl $4, KK 2182 movl BORIG, B 2183#endif 2184 2185#ifdef LT 2186 addl $4, KK 2187#endif 2188 2189#ifdef RT 2190 movl K, %eax 2191 movl BORIG, B 2192 sall $2 + BASE_SHIFT, %eax 2193 addl %eax, AORIG 2194#endif 2195 2196 decl %ebx # i -- 2197 jg .L51 2198 ALIGN_4 2199 2200.L60: 2201 testl $2, M 2202 je .L70 2203 2204#ifdef LN 2205 movl K, %eax 2206 sall $1 + BASE_SHIFT, %eax 2207 subl %eax, AORIG 2208#endif 2209 2210#if defined(LN) || defined(RT) 2211 movl KK, %eax 2212 movl AORIG, AA 2213 leal (, %eax, SIZE), %eax 2214 leal 
(AA, %eax, 2), AA 2215#endif 2216 2217 leal BUFFER, BB 2218 2219#if defined(LN) || defined(RT) 2220 movl KK, %eax 2221 sall $1 + BASE_SHIFT, %eax 2222 leal (BB, %eax, 4), BB 2223#endif 2224 2225 xorps %xmm4, %xmm4 2226 xorps %xmm5, %xmm5 2227 xorps %xmm6, %xmm6 2228 xorps %xmm7, %xmm7 2229 2230#ifdef movsd 2231 xorps %xmm0, %xmm0 2232#endif 2233 movsd 0 * SIZE(AA), %xmm0 2234#ifdef movsd 2235 xorps %xmm1, %xmm1 2236#endif 2237 movsd 8 * SIZE(AA), %xmm1 2238 movaps 0 * SIZE(BB), %xmm2 2239 movaps 16 * SIZE(BB), %xmm3 2240 2241#if defined(LT) || defined(RN) 2242 movl KK, %eax 2243#else 2244 movl K, %eax 2245 subl KK, %eax 2246#endif 2247 sarl $3, %eax 2248 je .L65 2249 ALIGN_4 2250 2251.L62: 2252#if defined(OPTERON) || defined(BARCELONA) 2253 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 2254#endif 2255 2256 mulps %xmm0, %xmm2 2257 addps %xmm2, %xmm4 2258 movaps 4 * SIZE(BB), %xmm2 2259 mulps %xmm0, %xmm2 2260 movsd 2 * SIZE(AA), %xmm0 2261 addps %xmm2, %xmm5 2262 movaps 8 * SIZE(BB), %xmm2 2263 2264 mulps %xmm0, %xmm2 2265 addps %xmm2, %xmm6 2266 movaps 12 * SIZE(BB), %xmm2 2267 mulps %xmm0, %xmm2 2268 movsd 4 * SIZE(AA), %xmm0 2269 addps %xmm2, %xmm7 2270 movaps 32 * SIZE(BB), %xmm2 2271 2272 mulps %xmm0, %xmm3 2273 addps %xmm3, %xmm4 2274 movaps 20 * SIZE(BB), %xmm3 2275 mulps %xmm0, %xmm3 2276 movsd 6 * SIZE(AA), %xmm0 2277 addps %xmm3, %xmm5 2278 movaps 24 * SIZE(BB), %xmm3 2279 2280 mulps %xmm0, %xmm3 2281 addps %xmm3, %xmm6 2282 movaps 28 * SIZE(BB), %xmm3 2283 mulps %xmm0, %xmm3 2284 movsd 16 * SIZE(AA), %xmm0 2285 addps %xmm3, %xmm7 2286 movaps 48 * SIZE(BB), %xmm3 2287 2288 mulps %xmm1, %xmm2 2289 addps %xmm2, %xmm4 2290 movaps 36 * SIZE(BB), %xmm2 2291 mulps %xmm1, %xmm2 2292 movsd 10 * SIZE(AA), %xmm1 2293 addps %xmm2, %xmm5 2294 movaps 40 * SIZE(BB), %xmm2 2295 2296 mulps %xmm1, %xmm2 2297 addps %xmm2, %xmm6 2298 movaps 44 * SIZE(BB), %xmm2 2299 mulps %xmm1, %xmm2 2300 movsd 12 * SIZE(AA), %xmm1 2301 addps %xmm2, %xmm7 2302 movaps 64 * SIZE(BB), %xmm2 2303 
2304 mulps %xmm1, %xmm3 2305 addps %xmm3, %xmm4 2306 movaps 52 * SIZE(BB), %xmm3 2307 mulps %xmm1, %xmm3 2308 movsd 14 * SIZE(AA), %xmm1 2309 addps %xmm3, %xmm5 2310 movaps 56 * SIZE(BB), %xmm3 2311 2312 mulps %xmm1, %xmm3 2313 addps %xmm3, %xmm6 2314 movaps 60 * SIZE(BB), %xmm3 2315 mulps %xmm1, %xmm3 2316 movsd 24 * SIZE(AA), %xmm1 2317 addps %xmm3, %xmm7 2318 movaps 80 * SIZE(BB), %xmm3 2319 2320 addl $16 * SIZE, AA 2321 addl $64 * SIZE, BB 2322 decl %eax 2323 jne .L62 2324 ALIGN_4 2325 2326.L65: 2327#if defined(LT) || defined(RN) 2328 movl KK, %eax 2329#else 2330 movl K, %eax 2331 subl KK, %eax 2332#endif 2333 andl $7, %eax # if (k & 1) 2334 BRANCH 2335 je .L68 2336 ALIGN_4 2337 2338.L66: 2339 mulps %xmm0, %xmm2 2340 addps %xmm2, %xmm4 2341 movaps 4 * SIZE(BB), %xmm2 2342 mulps %xmm0, %xmm2 2343 movsd 2 * SIZE(AA), %xmm0 2344 addps %xmm2, %xmm5 2345 movaps 8 * SIZE(BB), %xmm2 2346 2347 addl $2 * SIZE, AA 2348 addl $8 * SIZE, BB 2349 decl %eax 2350 jg .L66 2351 ALIGN_4 2352 2353.L68: 2354 addps %xmm6, %xmm4 2355 addps %xmm7, %xmm5 2356 2357#if defined(LN) || defined(RT) 2358 movl KK, %eax 2359#ifdef LN 2360 subl $2, %eax 2361#else 2362 subl $2, %eax 2363#endif 2364 2365 movl AORIG, AA 2366 movl BORIG, B 2367 leal BUFFER, BB 2368 2369 sall $BASE_SHIFT, %eax 2370 leal (AA, %eax, 2), AA 2371 leal (B, %eax, 2), B 2372 leal (BB, %eax, 8), BB 2373#endif 2374 2375#if defined(LN) || defined(LT) 2376 unpcklps %xmm6, %xmm4 2377 unpcklps %xmm7, %xmm5 2378 2379 movaps %xmm4, %xmm6 2380 unpcklps %xmm5, %xmm4 2381 unpckhps %xmm5, %xmm6 2382 2383#ifdef movsd 2384 xorps %xmm1, %xmm1 2385#endif 2386 movsd 0 * SIZE(B), %xmm1 2387#ifdef movsd 2388 xorps %xmm3, %xmm3 2389#endif 2390 movsd 2 * SIZE(B), %xmm3 2391 2392 subps %xmm4, %xmm1 2393 subps %xmm6, %xmm3 2394#else 2395#ifdef movsd 2396 xorps %xmm0, %xmm0 2397#endif 2398 movsd 0 * SIZE(AA), %xmm0 2399#ifdef movsd 2400 xorps %xmm1, %xmm1 2401#endif 2402 movsd 2 * SIZE(AA), %xmm1 2403 2404 subps %xmm4, %xmm0 2405 subps %xmm5, 
%xmm1 2406#endif 2407 2408#ifdef LN 2409 movaps 0 * SIZE(AA), %xmm4 2410 pshufd $0xff, %xmm4, %xmm6 2411 mulps %xmm6, %xmm3 2412 pshufd $0xaa, %xmm4, %xmm6 2413 mulps %xmm3, %xmm6 2414 subps %xmm6, %xmm1 2415 2416 pshufd $0x00, %xmm4, %xmm6 2417 mulps %xmm6, %xmm1 2418#endif 2419 2420#ifdef LT 2421 movaps 0 * SIZE(AA), %xmm4 2422 pshufd $0x00, %xmm4, %xmm6 2423 mulps %xmm6, %xmm1 2424 pshufd $0x55, %xmm4, %xmm6 2425 mulps %xmm1, %xmm6 2426 subps %xmm6, %xmm3 2427 2428 pshufd $0xff, %xmm4, %xmm6 2429 mulps %xmm6, %xmm3 2430#endif 2431 2432#ifdef RN 2433 movaps 0 * SIZE(B), %xmm6 2434 pshufd $0x00, %xmm6, %xmm7 2435 mulps %xmm7, %xmm0 2436 pshufd $0x55, %xmm6, %xmm7 2437 mulps %xmm0, %xmm7 2438 subps %xmm7, %xmm1 2439 2440 pshufd $0xff, %xmm6, %xmm7 2441 mulps %xmm7, %xmm1 2442#endif 2443 2444#ifdef RT 2445 movaps 0 * SIZE(B), %xmm6 2446 pshufd $0xff, %xmm6, %xmm7 2447 mulps %xmm7, %xmm1 2448 pshufd $0xaa, %xmm6, %xmm7 2449 mulps %xmm1, %xmm7 2450 subps %xmm7, %xmm0 2451 2452 pshufd $0x00, %xmm6, %xmm7 2453 mulps %xmm7, %xmm0 2454#endif 2455 2456#if defined(LN) || defined(LT) 2457 movlps %xmm1, 0 * SIZE(B) 2458 movlps %xmm3, 2 * SIZE(B) 2459 2460 pshufd $0x00, %xmm1, %xmm0 2461 pshufd $0x55, %xmm1, %xmm2 2462 movaps %xmm0, 0 * SIZE(BB) 2463 movaps %xmm2, 4 * SIZE(BB) 2464 2465 pshufd $0x00, %xmm3, %xmm0 2466 pshufd $0x55, %xmm3, %xmm2 2467 movaps %xmm0, 8 * SIZE(BB) 2468 movaps %xmm2, 12 * SIZE(BB) 2469#else 2470 movlps %xmm0, 0 * SIZE(AA) 2471 movlps %xmm1, 2 * SIZE(AA) 2472#endif 2473 2474#ifdef LN 2475 subl $2 * SIZE, CO1 2476#endif 2477 2478#if defined(LN) || defined(LT) 2479 unpcklps %xmm3, %xmm1 2480 2481 movlps %xmm1, 0 * SIZE(CO1) 2482 movhps %xmm1, 0 * SIZE(CO1, LDC) 2483#else 2484 movlps %xmm0, 0 * SIZE(CO1) 2485 movlps %xmm1, 0 * SIZE(CO1, LDC) 2486#endif 2487 2488#ifndef LN 2489 addl $2 * SIZE, CO1 2490#endif 2491 2492#if defined(LT) || defined(RN) 2493 movl K, %eax 2494 subl KK, %eax 2495 leal (,%eax, SIZE), %eax 2496 leal (AA, %eax, 2), AA 2497#ifdef LT 
2498 addl $4 * SIZE, B 2499#endif 2500#endif 2501 2502#ifdef LN 2503 subl $2, KK 2504 movl BORIG, B 2505#endif 2506 2507#ifdef LT 2508 addl $2, KK 2509#endif 2510 2511#ifdef RT 2512 movl K, %eax 2513 movl BORIG, B 2514 sall $1 + BASE_SHIFT, %eax 2515 addl %eax, AORIG 2516#endif 2517 ALIGN_4 2518 2519.L70: 2520 testl $1, M 2521 je .L79 2522 2523#ifdef LN 2524 movl K, %eax 2525 sall $BASE_SHIFT, %eax 2526 subl %eax, AORIG 2527#endif 2528 2529#if defined(LN) || defined(RT) 2530 movl KK, %eax 2531 movl AORIG, AA 2532 leal (AA, %eax, SIZE), AA 2533#endif 2534 2535 leal BUFFER, BB 2536 2537#if defined(LN) || defined(RT) 2538 movl KK, %eax 2539 sall $1 + BASE_SHIFT, %eax 2540 leal (BB, %eax, 4), BB 2541#endif 2542 2543 xorps %xmm4, %xmm4 2544 xorps %xmm5, %xmm5 2545 xorps %xmm6, %xmm6 2546 xorps %xmm7, %xmm7 2547 2548 movss 0 * SIZE(AA), %xmm0 2549 movss 4 * SIZE(AA), %xmm1 2550 movss 0 * SIZE(BB), %xmm2 2551 movss 16 * SIZE(BB), %xmm3 2552 2553#if defined(LT) || defined(RN) 2554 movl KK, %eax 2555#else 2556 movl K, %eax 2557 subl KK, %eax 2558#endif 2559 sarl $3, %eax 2560 je .L75 2561 ALIGN_4 2562 2563.L72: 2564 mulss %xmm0, %xmm2 2565#if defined(OPTERON) || defined(BARCELONA) 2566 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 2567#endif 2568 mulss 4 * SIZE(BB), %xmm0 2569 addss %xmm2, %xmm4 2570 movss 8 * SIZE(BB), %xmm2 2571 addss %xmm0, %xmm5 2572 movss 1 * SIZE(AA), %xmm0 2573 mulss %xmm0, %xmm2 2574 mulss 12 * SIZE(BB), %xmm0 2575 addss %xmm2, %xmm6 2576 movss 32 * SIZE(BB), %xmm2 2577 addss %xmm0, %xmm7 2578 movss 2 * SIZE(AA), %xmm0 2579 mulss %xmm0, %xmm3 2580 mulss 20 * SIZE(BB), %xmm0 2581 addss %xmm3, %xmm4 2582 movss 24 * SIZE(BB), %xmm3 2583 addss %xmm0, %xmm5 2584 movss 3 * SIZE(AA), %xmm0 2585 mulss %xmm0, %xmm3 2586 mulss 28 * SIZE(BB), %xmm0 2587 addss %xmm3, %xmm6 2588 movss 48 * SIZE(BB), %xmm3 2589 addss %xmm0, %xmm7 2590 movss 8 * SIZE(AA), %xmm0 2591 mulss %xmm1, %xmm2 2592 mulss 36 * SIZE(BB), %xmm1 2593 addss %xmm2, %xmm4 2594 movss 40 * SIZE(BB), 
%xmm2 2595 addss %xmm1, %xmm5 2596 movss 5 * SIZE(AA), %xmm1 2597 mulss %xmm1, %xmm2 2598 mulss 44 * SIZE(BB), %xmm1 2599 addss %xmm2, %xmm6 2600 movss 64 * SIZE(BB), %xmm2 2601 addss %xmm1, %xmm7 2602 movss 6 * SIZE(AA), %xmm1 2603 mulss %xmm1, %xmm3 2604 mulss 52 * SIZE(BB), %xmm1 2605 addss %xmm3, %xmm4 2606 movss 56 * SIZE(BB), %xmm3 2607 addss %xmm1, %xmm5 2608 movss 7 * SIZE(AA), %xmm1 2609 mulss %xmm1, %xmm3 2610 mulss 60 * SIZE(BB), %xmm1 2611 addss %xmm3, %xmm6 2612 movss 80 * SIZE(BB), %xmm3 2613 addss %xmm1, %xmm7 2614 movss 12 * SIZE(AA), %xmm1 2615 2616 addl $ 8 * SIZE, AA 2617 addl $64 * SIZE, BB 2618 decl %eax 2619 jne .L72 2620 ALIGN_4 2621 2622.L75: 2623#if defined(LT) || defined(RN) 2624 movl KK, %eax 2625#else 2626 movl K, %eax 2627 subl KK, %eax 2628#endif 2629 andl $7, %eax # if (k & 1) 2630 BRANCH 2631 je .L78 2632 ALIGN_4 2633 2634.L76: 2635 mulss %xmm0, %xmm2 2636 mulss 4 * SIZE(BB), %xmm0 2637 addss %xmm2, %xmm4 2638 movss 8 * SIZE(BB), %xmm2 2639 addss %xmm0, %xmm5 2640 movss 1 * SIZE(AA), %xmm0 2641 2642 addl $ 1 * SIZE, AA 2643 addl $ 8 * SIZE, BB 2644 decl %eax 2645 jg .L76 2646 ALIGN_4 2647 2648.L78: 2649 addss %xmm6, %xmm4 2650 addss %xmm7, %xmm5 2651 2652#if defined(LN) || defined(RT) 2653 movl KK, %eax 2654#ifdef LN 2655 subl $1, %eax 2656#else 2657 subl $2, %eax 2658#endif 2659 2660 movl AORIG, AA 2661 movl BORIG, B 2662 leal BUFFER, BB 2663 2664 sall $BASE_SHIFT, %eax 2665 leal (AA, %eax, 1), AA 2666 leal (B, %eax, 2), B 2667 leal (BB, %eax, 8), BB 2668#endif 2669 2670#if defined(LN) || defined(LT) 2671 unpcklps %xmm5, %xmm4 2672 2673#ifdef movsd 2674 xorps %xmm1, %xmm1 2675#endif 2676 movsd 0 * SIZE(B), %xmm1 2677 2678 subps %xmm4, %xmm1 2679#else 2680 movss 0 * SIZE(AA), %xmm0 2681 movss 1 * SIZE(AA), %xmm1 2682 2683 subss %xmm4, %xmm0 2684 subss %xmm5, %xmm1 2685#endif 2686 2687#if defined(LN) || defined(LT) 2688 movss 0 * SIZE(AA), %xmm4 2689 pshufd $0x00, %xmm4, %xmm6 2690 mulps %xmm6, %xmm1 2691#endif 2692 2693#ifdef RN 2694 
movaps 0 * SIZE(B), %xmm6 2695 pshufd $0x00, %xmm6, %xmm7 2696 mulss %xmm7, %xmm0 2697 pshufd $0x55, %xmm6, %xmm7 2698 mulss %xmm0, %xmm7 2699 subss %xmm7, %xmm1 2700 2701 pshufd $0xff, %xmm6, %xmm7 2702 mulss %xmm7, %xmm1 2703#endif 2704 2705#ifdef RT 2706 movaps 0 * SIZE(B), %xmm6 2707 pshufd $0xff, %xmm6, %xmm7 2708 mulss %xmm7, %xmm1 2709 pshufd $0xaa, %xmm6, %xmm7 2710 mulss %xmm1, %xmm7 2711 subss %xmm7, %xmm0 2712 2713 pshufd $0x00, %xmm6, %xmm7 2714 mulss %xmm7, %xmm0 2715#endif 2716 2717#if defined(LN) || defined(LT) 2718 movlps %xmm1, 0 * SIZE(B) 2719 2720 pshufd $0x00, %xmm1, %xmm0 2721 pshufd $0x55, %xmm1, %xmm2 2722 movaps %xmm0, 0 * SIZE(BB) 2723 movaps %xmm2, 4 * SIZE(BB) 2724#else 2725 movss %xmm0, 0 * SIZE(AA) 2726 movss %xmm1, 1 * SIZE(AA) 2727#endif 2728 2729#ifdef LN 2730 subl $1 * SIZE, CO1 2731#endif 2732 2733#if defined(LN) || defined(LT) 2734 pshufd $1, %xmm1, %xmm3 2735 2736 movss %xmm1, 0 * SIZE(CO1) 2737 movss %xmm3, 0 * SIZE(CO1, LDC) 2738#else 2739 movss %xmm0, 0 * SIZE(CO1) 2740 movss %xmm1, 0 * SIZE(CO1, LDC) 2741#endif 2742 2743#ifndef LN 2744 addl $1 * SIZE, CO1 2745#endif 2746 2747#if defined(LT) || defined(RN) 2748 movl K, %eax 2749 subl KK, %eax 2750 leal (AA, %eax, SIZE), AA 2751#ifdef LT 2752 addl $2 * SIZE, B 2753#endif 2754#endif 2755 2756#ifdef LN 2757 subl $1, KK 2758 movl BORIG, B 2759#endif 2760 2761#ifdef LT 2762 addl $1, KK 2763#endif 2764 2765#ifdef RT 2766 movl K, %eax 2767 movl BORIG, B 2768 sall $BASE_SHIFT, %eax 2769 addl %eax, AORIG 2770#endif 2771 ALIGN_4 2772 2773.L79: 2774#ifdef LN 2775 movl K, %eax 2776 leal (, %eax, SIZE), %eax 2777 leal (B, %eax, 2), B 2778#endif 2779 2780#if defined(LT) || defined(RN) 2781 movl K, %eax 2782 subl KK, %eax 2783 leal (,%eax, SIZE), %eax 2784 leal (B, %eax, 2), B 2785#endif 2786 2787#ifdef RN 2788 addl $2, KK 2789#endif 2790 2791#ifdef RT 2792 subl $2, KK 2793#endif 2794 ALIGN_4 2795 2796.L80: 2797 testl $1, N 2798 je .L999 2799 2800#ifdef LN 2801 movl OFFSET, %eax 2802 addl M, 
%eax 2803 movl %eax, KK 2804#endif 2805 2806 leal BUFFER, %ecx 2807 2808#ifdef RT 2809 movl K, %eax 2810 sall $BASE_SHIFT, %eax 2811 subl %eax, B 2812#endif 2813 2814#if defined(LN) || defined(RT) 2815 movl KK, %eax 2816 movl B, BORIG 2817 sall $BASE_SHIFT, %eax 2818 leal (B, %eax, 1), B 2819 leal (BB, %eax, 4), BB 2820#endif 2821 2822#ifdef LT 2823 movl OFFSET, %eax 2824 movl %eax, KK 2825#endif 2826 2827#if defined(LT) || defined(RN) 2828 movl KK, %eax 2829#else 2830 movl K, %eax 2831 subl KK, %eax 2832#endif 2833 sarl $3, %eax 2834 jle .L85 2835 ALIGN_4 2836 2837.L82: 2838 movsd 0 * SIZE(B), %xmm3 2839 movhps 2 * SIZE(B), %xmm3 2840 movsd 4 * SIZE(B), %xmm7 2841 movhps 6 * SIZE(B), %xmm7 2842 2843 pshufd $0x00, %xmm3, %xmm0 2844 pshufd $0x55, %xmm3, %xmm1 2845 pshufd $0xaa, %xmm3, %xmm2 2846 pshufd $0xff, %xmm3, %xmm3 2847 2848 pshufd $0x00, %xmm7, %xmm4 2849 pshufd $0x55, %xmm7, %xmm5 2850 pshufd $0xaa, %xmm7, %xmm6 2851 pshufd $0xff, %xmm7, %xmm7 2852 2853 movaps %xmm0, 0 * SIZE(BB) 2854 movaps %xmm1, 4 * SIZE(BB) 2855 movaps %xmm2, 8 * SIZE(BB) 2856 movaps %xmm3, 12 * SIZE(BB) 2857 movaps %xmm4, 16 * SIZE(BB) 2858 movaps %xmm5, 20 * SIZE(BB) 2859 movaps %xmm6, 24 * SIZE(BB) 2860 movaps %xmm7, 28 * SIZE(BB) 2861 2862 addl $ 8 * SIZE, B 2863 addl $32 * SIZE, BB 2864 decl %eax 2865 jne .L82 2866 ALIGN_4 2867 2868.L85: 2869#if defined(LT) || defined(RN) 2870 movl KK, %eax 2871#else 2872 movl K, %eax 2873 subl KK, %eax 2874#endif 2875 andl $7, %eax 2876 BRANCH 2877 jle .L90 2878 ALIGN_4 2879 2880.L86: 2881 movss 0 * SIZE(B), %xmm3 2882 2883 pshufd $0x00, %xmm3, %xmm0 2884 2885 movaps %xmm0, 0 * SIZE(BB) 2886 2887 addl $1 * SIZE, B 2888 addl $4 * SIZE, BB 2889 decl %eax 2890 jne .L86 2891 ALIGN_4 2892 2893.L90: 2894#if defined(LT) || defined(RN) 2895 movl A, AA 2896#else 2897 movl A, %eax 2898 movl %eax, AORIG 2899#endif 2900 2901#ifdef RT 2902 subl LDC, C 2903#endif 2904 movl C, CO1 2905#ifndef RT 2906 addl LDC, C 2907#endif 2908 2909 movl M, %ebx 2910 sarl $2, 
%ebx # i = (m >> 2) 2911 jle .L100 2912 ALIGN_4 2913 2914.L91: 2915#ifdef LN 2916 movl K, %eax 2917 sall $2 + BASE_SHIFT, %eax 2918 subl %eax, AORIG 2919#endif 2920 2921#if defined(LN) || defined(RT) 2922 movl KK, %eax 2923 movl AORIG, AA 2924 leal (, %eax, SIZE), %eax 2925 leal (AA, %eax, 4), AA 2926#endif 2927 2928 leal BUFFER, BB 2929 2930#if defined(LN) || defined(RT) 2931 movl KK, %eax 2932 sall $BASE_SHIFT, %eax 2933 leal (BB, %eax, 4), BB 2934#endif 2935 2936 xorps %xmm4, %xmm4 2937 xorps %xmm5, %xmm5 2938 xorps %xmm6, %xmm6 2939 xorps %xmm7, %xmm7 2940 2941 movaps 0 * SIZE(AA), %xmm0 2942 movaps 16 * SIZE(AA), %xmm1 2943 movaps 0 * SIZE(BB), %xmm2 2944 movaps 16 * SIZE(BB), %xmm3 2945 2946 PREFETCHW 3 * SIZE(CO1) 2947 2948#if defined(LT) || defined(RN) 2949 movl KK, %eax 2950#else 2951 movl K, %eax 2952 subl KK, %eax 2953#endif 2954 sarl $3, %eax 2955 je .L95 2956 ALIGN_4 2957 2958.L92: 2959 mulps %xmm0, %xmm2 2960#if defined(OPTERON) || defined(BARCELONA) 2961 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) 2962#endif 2963 movaps 4 * SIZE(AA), %xmm0 2964 addps %xmm2, %xmm4 2965 movaps 32 * SIZE(BB), %xmm2 2966 mulps 4 * SIZE(BB), %xmm0 2967 addps %xmm0, %xmm5 2968 movaps 8 * SIZE(AA), %xmm0 2969 mulps 8 * SIZE(BB), %xmm0 2970 addps %xmm0, %xmm6 2971 movaps 12 * SIZE(AA), %xmm0 2972 mulps 12 * SIZE(BB), %xmm0 2973 addps %xmm0, %xmm7 2974 movaps 32 * SIZE(AA), %xmm0 2975#if defined(OPTERON) || defined(BARCELONA) 2976 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) 2977#endif 2978 mulps %xmm1, %xmm3 2979 movaps 20 * SIZE(AA), %xmm1 2980 addps %xmm3, %xmm4 2981 movaps 48 * SIZE(BB), %xmm3 2982 mulps 20 * SIZE(BB), %xmm1 2983 addps %xmm1, %xmm5 2984 movaps 24 * SIZE(AA), %xmm1 2985 mulps 24 * SIZE(BB), %xmm1 2986 addps %xmm1, %xmm6 2987 movaps 28 * SIZE(AA), %xmm1 2988 mulps 28 * SIZE(BB), %xmm1 2989 addps %xmm1, %xmm7 2990 movaps 48 * SIZE(AA), %xmm1 2991 2992 addl $32 * SIZE, AA 2993 addl $32 * SIZE, BB 2994 decl %eax 2995 jne .L92 2996 ALIGN_4 2997 2998.L95: 2999#if 
/*
 * Tail of the 4-wide M tile.  This file appears to be a single-precision
 * triangular-solve (TRSM-style) micro-kernel: LN/LT/RN/RT select the
 * variant at build time (the dispatch and loop head are above this chunk,
 * so variant semantics below are stated from the visible code only).
 *
 * Register roles (from the #defines in the file header):
 *   AA = current A panel       BB  = expanded-B buffer (4-lane broadcast copies)
 *   B  = packed B panel        CO1 = current C pointer
 *   xmm4..xmm7 = the four partial accumulators of the K loop.
 *
 * This first line continues an "#if" opened just before this chunk:
 * eax = number of K iterations still to run (KK for LT/RN, K-KK otherwise).
 */
defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# eax = k & 7: iterations left over from the 8x-unrolled loop
	BRANCH
	je	.L98
	ALIGN_4

/* K-loop remainder: one rank-1 update per pass; A and the expanded-B
   buffer both advance by 4 floats per iteration. */
.L96:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	4 * SIZE(AA), %xmm0
	movaps	4 * SIZE(BB), %xmm2

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L96
	ALIGN_4

/* Fold the four partial accumulators into xmm4. */
.L98:
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4

/* Backward variants: reposition AA/B/BB from the tile origins to the
   current diagonal block, using eax = KK-4 (LN) or KK-1 (RT) elements. */
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax	# eax = element offset in bytes
	leal	(AA, %eax, 4), AA	# AA += eax * 4  (4 values per k step)
	leal	(B, %eax, 1), B		# B  += eax * 1
	leal	(BB, %eax, 4), BB	# BB += eax * 4  (broadcast copy is 4-wide)
#endif

/* Left-side variants: unpack network splits the accumulated vector into
   four scalars, then rhs[i] = B[i] - acc[i] in xmm1, xmm3, xmm5, xmm7. */
#if defined(LN) || defined(LT)
	movaps	%xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	%xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	%xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	%xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

	movss	0 * SIZE(B), %xmm1
	movss	1 * SIZE(B), %xmm3
	movss	2 * SIZE(B), %xmm5
	movss	3 * SIZE(B), %xmm7

	subss	%xmm4, %xmm1
	subss	%xmm6, %xmm3
	subss	%xmm0, %xmm5
	subss	%xmm2, %xmm7
#else
	/* Right-side variants: rhs = A[0..3] - acc, kept as one vector. */
	movaps	0 * SIZE(AA), %xmm0

	subps	%xmm4, %xmm0
#endif

/* LN: substitution through the 4x4 diagonal block of A, rows 3..0.
   pshufd broadcasts one A element per step; the diagonal entries are
   applied with mulss (no division), which suggests the packed diagonal
   holds reciprocals -- TODO confirm against the packing code. */
#ifdef LN
	movaps	12 * SIZE(AA), %xmm4	# row 3 of the diagonal block
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm7		# x3 *= a33
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm7, %xmm6
	subss	%xmm6, %xmm5		# x2 -= a32 * x3
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm7, %xmm6
	subss	%xmm6, %xmm3		# x1 -= a31 * x3
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm7, %xmm6
	subss	%xmm6, %xmm1		# x0 -= a30 * x3

	movaps	8 * SIZE(AA), %xmm4	# row 2
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm6, %xmm5		# x2 *= a22
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm5, %xmm6
	subss	%xmm6, %xmm3		# x1 -= a21 * x2
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm5, %xmm6
	subss	%xmm6, %xmm1		# x0 -= a20 * x2

	movaps	4 * SIZE(AA), %xmm4	# row 1
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm6, %xmm3		# x1 *= a11
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm1		# x0 -= a10 * x1

	movaps	0 * SIZE(AA), %xmm4	# row 0
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1		# x0 *= a00
#endif

/* LT: same substitution, forward order, rows 0..3. */
#ifdef LT
	movaps	0 * SIZE(AA), %xmm4	# row 0
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1		# x0 *= a00

	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm3		# x1 -= a01 * x0
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm5		# x2 -= a02 * x0
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm7		# x3 -= a03 * x0

	movaps	4 * SIZE(AA), %xmm4	# row 1
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm6, %xmm3		# x1 *= a11
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm5		# x2 -= a12 * x1
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm7		# x3 -= a13 * x1

	movaps	8 * SIZE(AA), %xmm4	# row 2
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm6, %xmm5		# x2 *= a22
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm5, %xmm6
	subss	%xmm6, %xmm7		# x3 -= a23 * x2

	movaps	12 * SIZE(AA), %xmm4	# row 3
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm7		# x3 *= a33
#endif

/* Right-side 1x1 step: scale the whole 4-vector by B[0] (multiply,
   no division -- same reciprocal convention as above). */
#if defined(RN) || defined(RT)
	movss	0 * SIZE(B), %xmm6
	pshufd	$0x00, %xmm6, %xmm7
	mulps	%xmm7, %xmm0
#endif

/* Write the solved values back: packed B gets the scalars, and the
   expanded buffer BB gets each scalar broadcast to 4 lanes. */
#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(B)
	movss	%xmm3, 1 * SIZE(B)
	movss	%xmm5, 2 * SIZE(B)
	movss	%xmm7, 3 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0, 0 * SIZE(BB)
	pshufd	$0x00, %xmm3, %xmm0
	movaps	%xmm0, 4 * SIZE(BB)

	pshufd	$0x00, %xmm5, %xmm0
	movaps	%xmm0, 8 * SIZE(BB)
	pshufd	$0x00, %xmm7, %xmm0
	movaps	%xmm0, 12 * SIZE(BB)
#else
	/* NOTE(review): in this (RN/RT) path only xmm0 holds the solved
	   vector; xmm1..xmm3 were last written by code in the other branch
	   and look stale here, while the C store below uses all four lanes
	   of xmm0.  Verify these three stores against the reference kernel. */
	movss	%xmm0, 0 * SIZE(AA)
	movss	%xmm1, 1 * SIZE(AA)
	movss	%xmm2, 2 * SIZE(AA)
	movss	%xmm3, 3 * SIZE(AA)
#endif

#ifdef LN
	subl	$4 * SIZE, CO1		# LN walks C backwards: step back 4 elements first
#endif

/* Store the solved 4-element result to C.  For LN/LT the scalars
   x0..x3 (xmm1,xmm5 / xmm3,xmm7) are re-interleaved into one vector
   first; for RN/RT xmm0 already holds the vector. */
#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3

	unpcklps %xmm3, %xmm1

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1		# forward variants advance C after the store
#endif

/* Per-tile pointer / KK bookkeeping for the 4-wide block. */
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA	# skip the untouched (K-KK)*4 part of the A panel
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$4, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$2 + BASE_SHIFT, %eax	# K * 4 elements in bytes
	addl	%eax, AORIG
#endif

	decl	%ebx			# i -- : next 4-wide M tile (.L91 is above this chunk)
	jg	.L91
	ALIGN_4

/* ---- M & 2: one 2-wide tile, same pipeline as above at half width. ---- */
.L100:
	testl	$2, M
	je	.L110

#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax	# K * 2 elements in bytes
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$1 + BASE_SHIFT, %eax
	/* NOTE(review): eax is already KK scaled for 2 elements here, yet it
	   is scaled by SIZE and by 2 again below.  The in-view LT/RN advance
	   for this same tile (after .L108) uses only the SIZE*2 scaling --
	   verify this double scaling against the reference kernel. */
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	# broadcast copy: 4 lanes per k step
#endif

	xorps	%xmm4, %xmm4		# clear the four accumulators
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

/* movsd is remapped to movlps on some targets (see file header); those
   need an explicit clear to avoid a false dependence on the old value. */
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(AA), %xmm0
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	8 * SIZE(AA), %xmm1
	movaps	0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# main loop runs k/8 passes
	je	.L105
	ALIGN_4

/* K loop, unrolled 8x: 2-float A loads against the 4-lane broadcast
   B buffer (only the low two lanes carry data for this tile). */
.L102:
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movsd	2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA		# 8 k-steps * 2 floats
	addl	$32 * SIZE, BB		# 8 k-steps * 4 lanes
	decl	%eax
	jne	.L102
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# eax = k & 7 remainder iterations
	BRANCH
	je	.L108
	ALIGN_4

/* K-loop remainder for the 2-wide tile. */
.L106:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movsd	2 * SIZE(AA), %xmm0
	movaps	4 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L106
	ALIGN_4

/* Fold accumulators, then reposition for the solve (cf. .L98). */
.L108:
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B, %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

/* rhs for the 2x2 solve: split xmm4 into its two scalars (pshufd $1
   moves lane 1 down), then rhs[i] = B[i] - acc[i]. */
#if defined(LN) || defined(LT)
	pshufd	$1, %xmm4, %xmm6

	movss	0 * SIZE(B), %xmm1
	movss	1 * SIZE(B), %xmm3

	subss	%xmm4, %xmm1
	subss	%xmm6, %xmm3
#else
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(AA), %xmm0

	subps	%xmm4, %xmm0
#endif

/* LN: 2x2 backward substitution (diagonal applied via mulss). */
#ifdef LN
	movaps	0 * SIZE(AA), %xmm4
	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm3		# x1 *= a11
	pshufd	$0xaa, %xmm4, %xmm6
	mulss	%xmm3, %xmm6
	subss	%xmm6, %xmm1		# x0 -= a10 * x1

	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1		# x0 *= a00
#endif

/* LT: 2x2 forward substitution. */
#ifdef LT
	movaps	0 * SIZE(AA), %xmm4
	pshufd	$0x00, %xmm4, %xmm6
	mulss	%xmm6, %xmm1		# x0 *= a00
	pshufd	$0x55, %xmm4, %xmm6
	mulss	%xmm1, %xmm6
	subss	%xmm6, %xmm3		# x1 -= a01 * x0

	pshufd	$0xff, %xmm4, %xmm6
	mulss	%xmm6, %xmm3		# x1 *= a11
#endif

#if defined(RN) || defined(RT)
	movss	0 * SIZE(B), %xmm6
	pshufd	$0x00, %xmm6, %xmm7
	mulps	%xmm7, %xmm0		# scale both lanes by B[0]
#endif

/* Write back to packed B + broadcast buffer (LN/LT) or to A (RN/RT). */
#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(B)
	movss	%xmm3, 1 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0, 0 * SIZE(BB)
	pshufd	$0x00, %xmm3, %xmm0
	movaps	%xmm0, 4 * SIZE(BB)
#else
	movlps	%xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

/* Store the two results to C. */
#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
	movss	%xmm3, 1 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

/* Per-tile pointer / KK bookkeeping for the 2-wide tile. */
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* ---- M & 1: one scalar column, same pipeline at width 1. ---- */
.L110:
	testl	$1, M
	je	.L119

#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax	# K elements in bytes
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA	# AA += KK elements
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	# broadcast copy: 4 lanes per k step
#endif

	xorps	%xmm4, %xmm4		# clear the four scalar accumulators
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movss	0 * SIZE(AA), %xmm0
	movss	4 * SIZE(AA), %xmm1
	movss	0 * SIZE(BB), %xmm2
	movss	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# main loop runs k/8 passes
	je	.L115
	ALIGN_4

/* K loop, unrolled 8x, fully scalar: one A element against one lane of
   the expanded B buffer (stride 4) per k step. */
.L112:
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movss	1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	32 * SIZE(BB), %xmm2
	mulss	4 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm5
	movss	2 * SIZE(AA), %xmm0
	mulss	8 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm6
	movss	3 * SIZE(AA), %xmm0
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm7
	movss	8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm3
	movss	5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	movss	48 * SIZE(BB), %xmm3
	mulss	20 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm5
	movss	6 * SIZE(AA), %xmm1
	mulss	24 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm6
	movss	7 * SIZE(AA), %xmm1
	mulss	28 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA		# 8 k-steps * 1 float
	addl	$32 * SIZE, BB		# 8 k-steps * 4 lanes
	decl	%eax
	jne	.L112
	ALIGN_4

.L115:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# eax = k & 7 remainder iterations
	BRANCH
	je	.L118
	ALIGN_4

/* K-loop remainder for the scalar column. */
.L116:
	mulss	%xmm0, %xmm2
	movss	1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	4 * SIZE(BB), %xmm2

	addl	$ 1 * SIZE, AA
	addl	$ 4 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4

/* Fold accumulators into xmm4, then reposition AA/B/BB to offset KK-1
   elements for the backward variants (cf. .L98 / .L108). */
.L118:
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B, %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

/* 1x1 solve: rhs = (B[0] or A[0]) - acc, then scale by the single
   factor element (mulss, no division -- reciprocal convention). */
#if defined(LN) || defined(LT)
	movss	0 * SIZE(B), %xmm1
	subss	%xmm4, %xmm1
#else
	movss	0 * SIZE(AA), %xmm0
	subss	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	mulss	0 * SIZE(AA), %xmm1
#endif

#if defined(RN) || defined(RT)
	mulss	0 * SIZE(B), %xmm0
#endif

/* Write back to packed B + broadcast buffer, or to A. */
#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0, 0 * SIZE(BB)
#else
	movss	%xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

/* Store the single result to C. */
#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
#else
	movss	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

/* Per-tile pointer / KK bookkeeping for the scalar column. */
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA
#ifdef LT
	addl	$1 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* End of one iteration of the enclosing N loop (loop head is above this
   chunk): advance B past this tile and adjust KK for the next column. */
.L119:
#ifdef LN
	movl	K, %eax
	leal	(B, %eax, SIZE), B
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(B, %eax, SIZE), B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* Function epilogue: restore the caller's stack pointer (saved at entry
   in OLD_STACK) and the callee-saved registers, then return. */
.L999:
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE