1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error You have to check your configuration.
#endif

/* Incoming cdecl argument area.  The prologue pushes 4 registers and then
   saves the entry %esp in %esi, so return address + 4 pushes = 16 bytes
   (STACK) and the first stack argument is at 4 + STACK + ARGS = 20(%esi). */
#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
#define STACK_A 20 + STACK + ARGS(%esi)
#define STACK_B 24 + STACK + ARGS(%esi)
#define STACK_C 28 + STACK + ARGS(%esi)
#define STACK_LDC 32 + STACK + ARGS(%esi)
#define STACK_OFFT 36 + STACK + ARGS(%esi)

/* Named slots in the page-aligned local frame (relative to the realigned
   %esp).  TRMASK occupies the first 16 bytes; scalar locals follow; the
   repacked copy of B (BUFFER) starts at offset 128. */
#define TRMASK 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 44(%esp)
#define KK 48(%esp)
#define KKK 52(%esp)
#define AORIG 56(%esp)
#define BORIG 60(%esp)
#define BUFFER 128(%esp)

/* Prefetch flavor/distance: 3DNow! parts use prefetch/prefetchw, everything
   else falls back to prefetcht0. */
#ifdef HAVE_3DNOW
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
#else
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE 96
#endif

/* Register roles for the kernel body.
   NOTE(review): CO1 aliases %esi, which also holds the caller's %esp until
   it is spilled to OLD_STACK below — the argument macros (%esi-relative)
   are only valid before CO1 is first written. */
#define B %edi
#define AA %edx
#define BB %ecx
#define LDC %ebp
#define CO1 %esi

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

/* Without SSE2 (or on Opteron) every movsd below is really movlps.
   movlps merges into the upper half instead of zeroing it, which is why
   later code guards some loads with "#ifdef movsd" and pre-zeroes the
   destination register. */
#if !defined(HAVE_SSE2) || defined(OPTERON)
#define movsd movlps
#endif

/* Prefer the integer-domain pxor for register zeroing when SSE2 exists. */
#ifdef HAVE_SSE2
#define xorps pxor
#endif

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	%esp, %esi	# save old stack

	/* Carve out 128 bytes of locals + the B-panel buffer, then round
	   %esp down to a 4096-byte boundary. */
	subl	$128 + LOCAL_BUFFER_SIZE, %esp
	andl	$-STACK_ALIGN, %esp

	STACK_TOUCHING

	/* Copy the stack arguments into the aligned local frame.  Integer
	   words are shuttled through xmm registers with movss. */
	movss	STACK_M, %xmm0
	movl	STACK_N, %eax
	movss	STACK_K, %xmm1
	movss	STACK_A, %xmm2
	movl	STACK_B, B
	movss	STACK_C, %xmm3
	movl	STACK_LDC, LDC
	movss	STACK_OFFT, %xmm4

	movss	%xmm1, K
	movl	%eax, N
	movss	%xmm0, M
	movss	%xmm2, A
	movss	%xmm3, C
	/* Spill the saved entry %esp so %esi can be reused as CO1. */
	movl	%esi,
OLD_STACK
	movss	%xmm4, OFFSET	# OFFSET and the running KK both start at OFFT
	movss	%xmm4, KK

	leal	(, LDC, SIZE), LDC	# scale ldc from elements to bytes

#ifdef LN
	/* LN: start from the bottom/right edge: C += m*SIZE, A += m*k*SIZE. */
	movl	M, %eax
	leal	(, %eax, SIZE), %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	/* RT: B += n*k*SIZE (past its last column), C += n*ldc. */
	movl	N, %eax
	leal	(, %eax, SIZE), %eax
	imull	K, %eax
	addl	%eax, B
	movl	N, %eax
	imull	LDC, %eax
	addl	%eax, C
#endif

#ifdef RN
	negl	KK	# RN: kk = -offset
#endif

#ifdef RT
	movl	N, %eax	# RT: kk = n - offset
	subl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LN) || defined(LT)
	/* Build TRMASK = {1.0f, 0.0f, 1.0f, 0.0f} (IEEE-754 bit patterns),
	   used by the LN/LT solve ladders below. */
	movl	$0x3f800000, 0 + TRMASK	# 1.0
	movl	$0x00000000, 4 + TRMASK	# 0.0
	movl	$0x3f800000, 8 + TRMASK	# 1.0
	movl	$0x00000000, 12 + TRMASK	# 0.0
#endif

	/* Odd n: handle the single trailing column first; if n is even fall
	   through to the 2-column main loop at .L100. */
	testl	$1, N
	jle	.L100

#ifdef LN
	movl	OFFSET, %eax	# LN: kk = offset + m
	addl	M, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, BB

#ifdef RT
	movl	K, %eax	# RT: step B back one column (k elements)
	sall	$BASE_SHIFT, %eax
	subl	%eax, B
#endif

#if defined(LN) || defined(RT)
	/* Remember the column start in BORIG, then skip the first kk
	   elements of B (and 4*kk in the broadcast buffer). */
	movl	KK, %eax
	movl	B, BORIG
	sall	$BASE_SHIFT, %eax
	leal	(B, %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#ifdef LT
	movl	OFFSET, %eax	# LT: kk = offset
	movl	%eax, KK
#endif

	/* Panel length to copy: kk for LT/RN, k - kk otherwise. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax	# 8 B-values per pass
	jle	.L103
	ALIGN_4

.L102:
	/* Copy 8 consecutive B scalars into BUFFER, broadcasting each one
	   across a full 4-wide vector (one movaps store per scalar). */
	movsd	0 * SIZE(B), %xmm3
	movhps	2 * SIZE(B), %xmm3
	movsd	4 * SIZE(B), %xmm7
	movhps	6 * SIZE(B), %xmm7

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm3, %xmm0	# splat element 0
	pshufd	$0x55, %xmm3, %xmm1	# splat element 1
	pshufd	$0xaa, %xmm3, %xmm2	# splat element 2
	pshufd	$0xff, %xmm3, %xmm3	# splat element 3

	pshufd	$0x00, %xmm7, %xmm4	# splat element 4
	pshufd	$0x55, %xmm7, %xmm5	# splat element 5
	pshufd	$0xaa, %xmm7, %xmm6	# splat element 6
	pshufd	$0xff, %xmm7, %xmm7	# splat element 7
#else
	/* SSE1 fallback: copy + shufps self-splat instead of pshufd. */
	movaps	%xmm3, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm3, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm3, %xmm2
	shufps	$0xaa, %xmm2, %xmm2
	shufps	$0xff, %xmm3, %xmm3

	movaps	%xmm7, %xmm4
	shufps	$0x00, %xmm4, %xmm4
	movaps
%xmm7, %xmm5 239 shufps $0x55, %xmm5, %xmm5 240 movaps %xmm7, %xmm6 241 shufps $0xaa, %xmm6, %xmm6 242 shufps $0xff, %xmm7, %xmm7 243#endif 244 245 movaps %xmm0, 0 * SIZE(BB) 246 movaps %xmm1, 4 * SIZE(BB) 247 movaps %xmm2, 8 * SIZE(BB) 248 movaps %xmm3, 12 * SIZE(BB) 249 movaps %xmm4, 16 * SIZE(BB) 250 movaps %xmm5, 20 * SIZE(BB) 251 movaps %xmm6, 24 * SIZE(BB) 252 movaps %xmm7, 28 * SIZE(BB) 253 254 addl $ 8 * SIZE, B 255 addl $32 * SIZE, BB 256 257 decl %eax 258 BRANCH 259 jne .L102 260 ALIGN_2 261 262.L103: 263#if defined(LT) || defined(RN) 264 movl KK, %eax 265#else 266 movl K, %eax 267 subl KK, %eax 268#endif 269 andl $7, %eax 270 BRANCH 271 jle .L105 272 ALIGN_2 273 274.L104: 275 movss 0 * SIZE(B), %xmm0 276 277 shufps $0x00, %xmm0, %xmm0 278 279 movaps %xmm0, 0 * SIZE(BB) 280 281 addl $1 * SIZE, B 282 addl $4 * SIZE, BB 283 284 decl %eax 285 jne .L104 286 ALIGN_4 287 288.L105: 289#if defined(LT) || defined(RN) 290 movl A, AA 291#else 292 movl A, %eax 293 movl %eax, AORIG 294#endif 295 296#ifdef RT 297 subl LDC, C 298#endif 299 movl C, CO1 300#ifndef RT 301 addl LDC, C 302#endif 303 304 movl M, %ebx 305 sarl $3, %ebx # i = (m >> 2) 306 jle .L130 307 ALIGN_4 308 309.L110: 310#ifdef LN 311 movl K, %eax 312 sall $3 + BASE_SHIFT, %eax 313 subl %eax, AORIG 314#endif 315 316#if defined(LN) || defined(RT) 317 movl KK, %eax 318 movl AORIG, AA 319 sall $3 + BASE_SHIFT, %eax 320 addl %eax, AA 321#endif 322 323 leal BUFFER, BB 324 325#if defined(LN) || defined(RT) 326 movl KK, %eax 327 sall $BASE_SHIFT, %eax 328 leal (BB, %eax, 4), BB 329#endif 330 331 movaps 0 * SIZE(BB), %xmm2 332 xorps %xmm4, %xmm4 333 movaps 0 * SIZE(AA), %xmm0 334 xorps %xmm5, %xmm5 335 movaps 16 * SIZE(BB), %xmm3 336 xorps %xmm6, %xmm6 337 movaps 16 * SIZE(AA), %xmm1 338 xorps %xmm7, %xmm7 339 340 PREFETCHW 7 * SIZE(CO1) 341 342#if defined(LT) || defined(RN) 343 movl KK, %eax 344#else 345 movl K, %eax 346 subl KK, %eax 347#endif 348 sarl $3, %eax 349 je .L112 350 ALIGN_2 351 352.L111: 353 mulps 
%xmm2, %xmm0 354 mulps 4 * SIZE(AA), %xmm2 355 addps %xmm0, %xmm4 356 movaps 8 * SIZE(AA), %xmm0 357 addps %xmm2, %xmm6 358 movaps 4 * SIZE(BB), %xmm2 359 mulps %xmm2, %xmm0 360 mulps 12 * SIZE(AA), %xmm2 361 addps %xmm0, %xmm5 362 movaps 32 * SIZE(AA), %xmm0 363 addps %xmm2, %xmm7 364 movaps 8 * SIZE(BB), %xmm2 365 mulps %xmm2, %xmm1 366 mulps 20 * SIZE(AA), %xmm2 367 addps %xmm1, %xmm4 368 movaps 24 * SIZE(AA), %xmm1 369 addps %xmm2, %xmm6 370 movaps 12 * SIZE(BB), %xmm2 371 mulps %xmm2, %xmm1 372 mulps 28 * SIZE(AA), %xmm2 373 addps %xmm1, %xmm5 374 movaps 48 * SIZE(AA), %xmm1 375 addps %xmm2, %xmm7 376 movaps 32 * SIZE(BB), %xmm2 377 mulps %xmm3, %xmm0 378 mulps 36 * SIZE(AA), %xmm3 379 addps %xmm0, %xmm4 380 movaps 40 * SIZE(AA), %xmm0 381 addps %xmm3, %xmm6 382 movaps 20 * SIZE(BB), %xmm3 383 mulps %xmm3, %xmm0 384 mulps 44 * SIZE(AA), %xmm3 385 addps %xmm0, %xmm5 386 movaps 64 * SIZE(AA), %xmm0 387 addps %xmm3, %xmm7 388 movaps 24 * SIZE(BB), %xmm3 389 mulps %xmm3, %xmm1 390 mulps 52 * SIZE(AA), %xmm3 391 addps %xmm1, %xmm4 392 movaps 56 * SIZE(AA), %xmm1 393 addps %xmm3, %xmm6 394 movaps 28 * SIZE(BB), %xmm3 395 mulps %xmm3, %xmm1 396 mulps 60 * SIZE(AA), %xmm3 397 addps %xmm1, %xmm5 398 movaps 80 * SIZE(AA), %xmm1 399 addps %xmm3, %xmm7 400 movaps 48 * SIZE(BB), %xmm3 401 402 addl $64 * SIZE, AA 403 addl $32 * SIZE, BB 404 decl %eax 405 jne .L111 406 ALIGN_2 407 408.L112: 409#if defined(LT) || defined(RN) 410 movl KK, %eax 411#else 412 movl K, %eax 413 subl KK, %eax 414#endif 415 andl $7, %eax # if (k & 1) 416 BRANCH 417 je .L114 418 419.L113: 420 movaps 0 * SIZE(BB), %xmm2 421 movaps 0 * SIZE(AA), %xmm0 422 mulps %xmm2, %xmm0 423 addps %xmm0, %xmm4 424 mulps 4 * SIZE(AA), %xmm2 425 addps %xmm2, %xmm6 426 427 addl $8 * SIZE, AA 428 addl $4 * SIZE, BB 429 subl $1, %eax 430 jg .L113 431 ALIGN_4 432 433.L114: 434 addps %xmm5, %xmm4 435 addps %xmm7, %xmm6 436 437#if defined(LN) || defined(RT) 438 movl KK, %eax 439#ifdef LN 440 subl $8, %eax 441#else 442 subl 
$1, %eax 443#endif 444 445 movl AORIG, AA 446 movl BORIG, B 447 leal BUFFER, BB 448 449 sall $BASE_SHIFT, %eax 450 leal (AA, %eax, 8), AA 451 leal (B, %eax, 1), B 452 leal (BB, %eax, 4), BB 453#endif 454 455#if defined(LN) || defined(LT) 456 movsd 0 * SIZE(B), %xmm2 457 movhps 2 * SIZE(B), %xmm2 458 movsd 4 * SIZE(B), %xmm5 459 movhps 6 * SIZE(B), %xmm5 460 461 subps %xmm4, %xmm2 462 subps %xmm6, %xmm5 463 464 xorps %xmm0, %xmm0 465 466 movaps %xmm2, %xmm3 467 unpcklps %xmm0, %xmm2 468 unpckhps %xmm0, %xmm3 469 470 movaps %xmm5, %xmm7 471 unpcklps %xmm0, %xmm5 472 unpckhps %xmm0, %xmm7 473#else 474 movaps 0 * SIZE(AA), %xmm0 475 movaps 4 * SIZE(AA), %xmm1 476 477 subps %xmm4, %xmm0 478 subps %xmm6, %xmm1 479#endif 480 481#if defined(LN) || defined(LT) 482 movaps TRMASK, %xmm6 483#endif 484 485#ifdef LN 486 movss 63 * SIZE(AA), %xmm0 487 movaps %xmm6, %xmm1 488 shufps $0x00, %xmm0, %xmm1 489 mulps %xmm1, %xmm7 490 491 movaps %xmm7, %xmm1 492 shufps $0xee, %xmm1, %xmm1 493 494 movss 62 * SIZE(AA), %xmm0 495 shufps $0x50, %xmm0, %xmm0 496 mulps %xmm1, %xmm0 497 subps %xmm0, %xmm7 498 499 movsd 60 * SIZE(AA), %xmm0 500 shufps $0x50, %xmm0, %xmm0 501 mulps %xmm1, %xmm0 502 subps %xmm0, %xmm5 503 504 movsd 58 * SIZE(AA), %xmm0 505 shufps $0x50, %xmm0, %xmm0 506 mulps %xmm1, %xmm0 507 subps %xmm0, %xmm3 508 509 movsd 56 * SIZE(AA), %xmm0 510 shufps $0x50, %xmm0, %xmm0 511 mulps %xmm1, %xmm0 512 subps %xmm0, %xmm2 513 514 movss 54 * SIZE(AA), %xmm0 515 shufps $0x00, %xmm6, %xmm0 516 mulps %xmm0, %xmm7 517 518 movaps %xmm7, %xmm1 519 shufps $0x44, %xmm1, %xmm1 520 521 movsd 52 * SIZE(AA), %xmm0 522 shufps $0x50, %xmm0, %xmm0 523 mulps %xmm1, %xmm0 524 subps %xmm0, %xmm5 525 526 movsd 50 * SIZE(AA), %xmm0 527 shufps $0x50, %xmm0, %xmm0 528 mulps %xmm1, %xmm0 529 subps %xmm0, %xmm3 530 531 movsd 48 * SIZE(AA), %xmm0 532 shufps $0x50, %xmm0, %xmm0 533 mulps %xmm1, %xmm0 534 subps %xmm0, %xmm2 535 536 537 movss 45 * SIZE(AA), %xmm0 538 movaps %xmm6, %xmm1 539 shufps $0x00, 
%xmm0, %xmm1 540 mulps %xmm1, %xmm5 541 542 movaps %xmm5, %xmm1 543 shufps $0xee, %xmm1, %xmm1 544 545 movss 44 * SIZE(AA), %xmm0 546 shufps $0x50, %xmm0, %xmm0 547 mulps %xmm1, %xmm0 548 subps %xmm0, %xmm5 549 550 movsd 42 * SIZE(AA), %xmm0 551 shufps $0x50, %xmm0, %xmm0 552 mulps %xmm1, %xmm0 553 subps %xmm0, %xmm3 554 555 movsd 40 * SIZE(AA), %xmm0 556 shufps $0x50, %xmm0, %xmm0 557 mulps %xmm1, %xmm0 558 subps %xmm0, %xmm2 559 560 movss 36 * SIZE(AA), %xmm0 561 shufps $0x00, %xmm6, %xmm0 562 mulps %xmm0, %xmm5 563 564 movaps %xmm5, %xmm1 565 shufps $0x44, %xmm1, %xmm1 566 567 movsd 34 * SIZE(AA), %xmm0 568 shufps $0x50, %xmm0, %xmm0 569 mulps %xmm1, %xmm0 570 subps %xmm0, %xmm3 571 572 movsd 32 * SIZE(AA), %xmm0 573 shufps $0x50, %xmm0, %xmm0 574 mulps %xmm1, %xmm0 575 subps %xmm0, %xmm2 576 577 movss 27 * SIZE(AA), %xmm0 578 movaps %xmm6, %xmm1 579 shufps $0x00, %xmm0, %xmm1 580 mulps %xmm1, %xmm3 581 582 movaps %xmm3, %xmm1 583 shufps $0xee, %xmm1, %xmm1 584 585 movss 26 * SIZE(AA), %xmm0 586 shufps $0x50, %xmm0, %xmm0 587 mulps %xmm1, %xmm0 588 subps %xmm0, %xmm3 589 590 movsd 24 * SIZE(AA), %xmm0 591 shufps $0x50, %xmm0, %xmm0 592 mulps %xmm1, %xmm0 593 subps %xmm0, %xmm2 594 595 movss 18 * SIZE(AA), %xmm0 596 shufps $0x00, %xmm6, %xmm0 597 mulps %xmm0, %xmm3 598 599 movaps %xmm3, %xmm1 600 shufps $0x44, %xmm1, %xmm1 601 602 movsd 16 * SIZE(AA), %xmm0 603 shufps $0x50, %xmm0, %xmm0 604 mulps %xmm1, %xmm0 605 subps %xmm0, %xmm2 606 607 movss 9 * SIZE(AA), %xmm0 608 movaps %xmm6, %xmm1 609 shufps $0x00, %xmm0, %xmm1 610 mulps %xmm1, %xmm2 611 612 movaps %xmm2, %xmm1 613 shufps $0xee, %xmm1, %xmm1 614 615 movss 8 * SIZE(AA), %xmm0 616 shufps $0x50, %xmm0, %xmm0 617 mulps %xmm1, %xmm0 618 subps %xmm0, %xmm2 619 620 movss 0 * SIZE(AA), %xmm0 621 shufps $0x00, %xmm6, %xmm0 622 mulps %xmm0, %xmm2 623#endif 624 625#ifdef LT 626 movss 0 * SIZE(AA), %xmm0 627 shufps $0x00, %xmm6, %xmm0 628 mulps %xmm0, %xmm2 629 630 movaps %xmm2, %xmm1 631 shufps $0x44, %xmm1, %xmm1 
632 633 movss 1 * SIZE(AA), %xmm0 634 shufps $0x05, %xmm0, %xmm0 635 mulps %xmm1, %xmm0 636 subps %xmm0, %xmm2 637 638 movsd 2 * SIZE(AA), %xmm0 639 shufps $0x50, %xmm0, %xmm0 640 mulps %xmm1, %xmm0 641 subps %xmm0, %xmm3 642 643 movsd 4 * SIZE(AA), %xmm0 644 shufps $0x50, %xmm0, %xmm0 645 mulps %xmm1, %xmm0 646 subps %xmm0, %xmm5 647 648 movsd 6 * SIZE(AA), %xmm0 649 shufps $0x50, %xmm0, %xmm0 650 mulps %xmm1, %xmm0 651 subps %xmm0, %xmm7 652 653 movss 9 * SIZE(AA), %xmm0 654 movaps %xmm6, %xmm1 655 shufps $0x00, %xmm0, %xmm1 656 mulps %xmm1, %xmm2 657 658 movaps %xmm2, %xmm1 659 shufps $0xee, %xmm1, %xmm1 660 661 movsd 10 * SIZE(AA), %xmm0 662 shufps $0x50, %xmm0, %xmm0 663 mulps %xmm1, %xmm0 664 subps %xmm0, %xmm3 665 666 movsd 12 * SIZE(AA), %xmm0 667 shufps $0x50, %xmm0, %xmm0 668 mulps %xmm1, %xmm0 669 subps %xmm0, %xmm5 670 671 movsd 14 * SIZE(AA), %xmm0 672 shufps $0x50, %xmm0, %xmm0 673 mulps %xmm1, %xmm0 674 subps %xmm0, %xmm7 675 676 movss 18 * SIZE(AA), %xmm0 677 shufps $0x00, %xmm6, %xmm0 678 mulps %xmm0, %xmm3 679 680 movaps %xmm3, %xmm1 681 shufps $0x44, %xmm1, %xmm1 682 683 movss 19 * SIZE(AA), %xmm0 684 shufps $0x05, %xmm0, %xmm0 685 mulps %xmm1, %xmm0 686 subps %xmm0, %xmm3 687 688 movsd 20 * SIZE(AA), %xmm0 689 shufps $0x50, %xmm0, %xmm0 690 mulps %xmm1, %xmm0 691 subps %xmm0, %xmm5 692 693 movsd 22 * SIZE(AA), %xmm0 694 shufps $0x50, %xmm0, %xmm0 695 mulps %xmm1, %xmm0 696 subps %xmm0, %xmm7 697 698 movss 27 * SIZE(AA), %xmm0 699 movaps %xmm6, %xmm1 700 shufps $0x00, %xmm0, %xmm1 701 mulps %xmm1, %xmm3 702 703 movaps %xmm3, %xmm1 704 shufps $0xee, %xmm1, %xmm1 705 706 movsd 28 * SIZE(AA), %xmm0 707 shufps $0x50, %xmm0, %xmm0 708 mulps %xmm1, %xmm0 709 subps %xmm0, %xmm5 710 711 movsd 30 * SIZE(AA), %xmm0 712 shufps $0x50, %xmm0, %xmm0 713 mulps %xmm1, %xmm0 714 subps %xmm0, %xmm7 715 716 movss 36 * SIZE(AA), %xmm0 717 shufps $0x00, %xmm6, %xmm0 718 mulps %xmm0, %xmm5 719 720 movaps %xmm5, %xmm1 721 shufps $0x44, %xmm1, %xmm1 722 723 movss 37 * 
SIZE(AA), %xmm0 724 shufps $0x05, %xmm0, %xmm0 725 mulps %xmm1, %xmm0 726 subps %xmm0, %xmm5 727 728 movsd 38 * SIZE(AA), %xmm0 729 shufps $0x50, %xmm0, %xmm0 730 mulps %xmm1, %xmm0 731 subps %xmm0, %xmm7 732 733 movss 45 * SIZE(AA), %xmm0 734 movaps %xmm6, %xmm1 735 shufps $0x00, %xmm0, %xmm1 736 mulps %xmm1, %xmm5 737 738 movaps %xmm5, %xmm1 739 shufps $0xee, %xmm1, %xmm1 740 741 movsd 46 * SIZE(AA), %xmm0 742 shufps $0x50, %xmm0, %xmm0 743 mulps %xmm1, %xmm0 744 subps %xmm0, %xmm7 745 746 movss 54 * SIZE(AA), %xmm0 747 shufps $0x00, %xmm6, %xmm0 748 mulps %xmm0, %xmm7 749 750 movaps %xmm7, %xmm1 751 shufps $0x44, %xmm1, %xmm1 752 753 movss 55 * SIZE(AA), %xmm0 754 shufps $0x05, %xmm0, %xmm0 755 mulps %xmm1, %xmm0 756 subps %xmm0, %xmm7 757 758 movss 63 * SIZE(AA), %xmm0 759 movaps %xmm6, %xmm1 760 shufps $0x00, %xmm0, %xmm1 761 mulps %xmm1, %xmm7 762#endif 763 764#if defined(RN) || defined(RT) 765 movss 0 * SIZE(B), %xmm6 766 shufps $0x00, %xmm6, %xmm6 767 768 mulps %xmm6, %xmm0 769 mulps %xmm6, %xmm1 770#endif 771 772#if defined(LN) || defined(LT) 773 shufps $0x88, %xmm3, %xmm2 774 shufps $0x88, %xmm7, %xmm5 775 776 movlps %xmm2, 0 * SIZE(B) 777 movhps %xmm2, 2 * SIZE(B) 778 movlps %xmm5, 4 * SIZE(B) 779 movhps %xmm5, 6 * SIZE(B) 780 781#ifdef HAVE_SSE2 782 pshufd $0x00, %xmm2, %xmm0 783 pshufd $0x55, %xmm2, %xmm1 784 pshufd $0xaa, %xmm2, %xmm4 785 pshufd $0xff, %xmm2, %xmm6 786#else 787 movaps %xmm2, %xmm0 788 shufps $0x00, %xmm0, %xmm0 789 movaps %xmm2, %xmm1 790 shufps $0x55, %xmm1, %xmm1 791 movaps %xmm2, %xmm4 792 shufps $0xaa, %xmm4, %xmm4 793 movaps %xmm2, %xmm6 794 shufps $0xff, %xmm6, %xmm6 795#endif 796 797 movaps %xmm0, 0 * SIZE(BB) 798 movaps %xmm1, 4 * SIZE(BB) 799 movaps %xmm4, 8 * SIZE(BB) 800 movaps %xmm6, 12 * SIZE(BB) 801 802#ifdef HAVE_SSE2 803 pshufd $0x00, %xmm5, %xmm0 804 pshufd $0x55, %xmm5, %xmm1 805 pshufd $0xaa, %xmm5, %xmm4 806 pshufd $0xff, %xmm5, %xmm6 807#else 808 movaps %xmm5, %xmm0 809 shufps $0x00, %xmm0, %xmm0 810 movaps %xmm5, 
%xmm1 811 shufps $0x55, %xmm1, %xmm1 812 movaps %xmm5, %xmm4 813 shufps $0xaa, %xmm4, %xmm4 814 movaps %xmm5, %xmm6 815 shufps $0xff, %xmm6, %xmm6 816#endif 817 818 movaps %xmm0, 16 * SIZE(BB) 819 movaps %xmm1, 20 * SIZE(BB) 820 movaps %xmm4, 24 * SIZE(BB) 821 movaps %xmm6, 28 * SIZE(BB) 822#else 823 movaps %xmm0, 0 * SIZE(AA) 824 movaps %xmm1, 4 * SIZE(AA) 825#endif 826 827#ifdef LN 828 subl $8 * SIZE, CO1 829#endif 830 831#if defined(LN) || defined(LT) 832 movlps %xmm2, 0 * SIZE(CO1) 833 movhps %xmm2, 2 * SIZE(CO1) 834 movlps %xmm5, 4 * SIZE(CO1) 835 movhps %xmm5, 6 * SIZE(CO1) 836#else 837 movlps %xmm0, 0 * SIZE(CO1) 838 movhps %xmm0, 2 * SIZE(CO1) 839 movlps %xmm1, 4 * SIZE(CO1) 840 movhps %xmm1, 6 * SIZE(CO1) 841#endif 842 843#ifndef LN 844 addl $8 * SIZE, CO1 845#endif 846 847#if defined(LT) || defined(RN) 848 movl K, %eax 849 subl KK, %eax 850 leal (,%eax, SIZE), %eax 851 leal (AA, %eax, 8), AA 852#ifdef LT 853 addl $8 * SIZE, B 854#endif 855#endif 856 857#ifdef LN 858 subl $8, KK 859 movl BORIG, B 860#endif 861 862#ifdef LT 863 addl $8, KK 864#endif 865 866#ifdef RT 867 movl K, %eax 868 movl BORIG, B 869 sall $3 + BASE_SHIFT, %eax 870 addl %eax, AORIG 871#endif 872 873 decl %ebx # i -- 874 jg .L110 875 ALIGN_2 876 877.L130: 878 testl $4, M 879 jle .L150 880 881#ifdef LN 882 movl K, %eax 883 sall $2 + BASE_SHIFT, %eax 884 subl %eax, AORIG 885#endif 886 887#if defined(LN) || defined(RT) 888 movl KK, %eax 889 movl AORIG, AA 890 sall $2 + BASE_SHIFT, %eax 891 addl %eax, AA 892#endif 893 894 leal BUFFER, BB 895 896#if defined(LN) || defined(RT) 897 movl KK, %eax 898 sall $BASE_SHIFT, %eax 899 leal (BB, %eax, 4), BB 900#endif 901 902 movaps 0 * SIZE(BB), %xmm2 903 xorps %xmm4, %xmm4 904 movsd 0 * SIZE(AA), %xmm0 905 movhps 2 * SIZE(AA), %xmm0 906 xorps %xmm5, %xmm5 907 movaps 16 * SIZE(BB), %xmm3 908 xorps %xmm6, %xmm6 909 movsd 16 * SIZE(AA), %xmm1 910 movhps 18 * SIZE(AA), %xmm1 911 xorps %xmm7, %xmm7 912 913#if defined(LT) || defined(RN) 914 movl KK, %eax 
915#else 916 movl K, %eax 917 subl KK, %eax 918#endif 919 sarl $3, %eax 920 je .L132 921 ALIGN_2 922 923.L131: 924 mulps %xmm0, %xmm2 925 movaps 4 * SIZE(AA), %xmm0 926 addps %xmm2, %xmm4 927 mulps 4 * SIZE(BB), %xmm0 928 movaps 32 * SIZE(BB), %xmm2 929 addps %xmm0, %xmm5 930 movaps 8 * SIZE(AA), %xmm0 931 mulps 8 * SIZE(BB), %xmm0 932 addps %xmm0, %xmm6 933 movaps 12 * SIZE(AA), %xmm0 934 mulps 12 * SIZE(BB), %xmm0 935 addps %xmm0, %xmm7 936 movaps 32 * SIZE(AA), %xmm0 937 mulps %xmm1, %xmm3 938 movaps 20 * SIZE(AA), %xmm1 939 addps %xmm3, %xmm4 940 mulps 20 * SIZE(BB), %xmm1 941 movaps 48 * SIZE(BB), %xmm3 942 addps %xmm1, %xmm5 943 movaps 24 * SIZE(AA), %xmm1 944 mulps 24 * SIZE(BB), %xmm1 945 addps %xmm1, %xmm6 946 movaps 28 * SIZE(AA), %xmm1 947 mulps 28 * SIZE(BB), %xmm1 948 addps %xmm1, %xmm7 949 movaps 48 * SIZE(AA), %xmm1 950 951 addl $32 * SIZE, AA 952 addl $32 * SIZE, BB 953 decl %eax 954 jne .L131 955 ALIGN_2 956 957.L132: 958#if defined(LT) || defined(RN) 959 movl KK, %eax 960#else 961 movl K, %eax 962 subl KK, %eax 963#endif 964 andl $7, %eax # if (k & 1) 965 BRANCH 966 je .L134 967 968.L133: 969 movaps 0 * SIZE(BB), %xmm2 970 movaps 0 * SIZE(AA), %xmm0 971 mulps %xmm0, %xmm2 972 addps %xmm2, %xmm4 973 974 addl $4 * SIZE, AA 975 addl $4 * SIZE, BB 976 decl %eax 977 jg .L133 978 ALIGN_4 979 980.L134: 981 addps %xmm5, %xmm4 982 addps %xmm7, %xmm6 983 addps %xmm6, %xmm4 984 985#if defined(LN) || defined(RT) 986 movl KK, %eax 987#ifdef LN 988 subl $4, %eax 989#else 990 subl $1, %eax 991#endif 992 993 movl AORIG, AA 994 movl BORIG, B 995 leal BUFFER, BB 996 997 sall $BASE_SHIFT, %eax 998 leal (AA, %eax, 4), AA 999 leal (B, %eax, 1), B 1000 leal (BB, %eax, 4), BB 1001#endif 1002 1003#if defined(LN) || defined(LT) 1004 movsd 0 * SIZE(B), %xmm2 1005 movhps 2 * SIZE(B), %xmm2 1006 1007 subps %xmm4, %xmm2 1008 1009 xorps %xmm5, %xmm5 1010 1011 movaps %xmm2, %xmm3 1012 unpcklps %xmm5, %xmm2 1013 unpckhps %xmm5, %xmm3 1014#else 1015 movaps 0 * SIZE(AA), %xmm0 
1016 subps %xmm4, %xmm0 1017#endif 1018 1019#if defined(LN) || defined(LT) 1020 movaps TRMASK, %xmm6 1021#endif 1022 1023#ifdef LN 1024 movss 15 * SIZE(AA), %xmm0 1025 movaps %xmm6, %xmm1 1026 shufps $0x00, %xmm0, %xmm1 1027 mulps %xmm1, %xmm3 1028 1029 movaps %xmm3, %xmm1 1030 shufps $0xee, %xmm1, %xmm1 1031 1032 movss 14 * SIZE(AA), %xmm0 1033 shufps $0x50, %xmm0, %xmm0 1034 mulps %xmm1, %xmm0 1035 subps %xmm0, %xmm3 1036 1037 movsd 12 * SIZE(AA), %xmm0 1038 shufps $0x50, %xmm0, %xmm0 1039 mulps %xmm1, %xmm0 1040 subps %xmm0, %xmm2 1041 1042 movss 10 * SIZE(AA), %xmm0 1043 shufps $0x00, %xmm6, %xmm0 1044 mulps %xmm0, %xmm3 1045 1046 movaps %xmm3, %xmm1 1047 shufps $0x44, %xmm1, %xmm1 1048 1049 movsd 8 * SIZE(AA), %xmm0 1050 shufps $0x50, %xmm0, %xmm0 1051 mulps %xmm1, %xmm0 1052 subps %xmm0, %xmm2 1053 1054 movss 5 * SIZE(AA), %xmm0 1055 movaps %xmm6, %xmm1 1056 shufps $0x00, %xmm0, %xmm1 1057 mulps %xmm1, %xmm2 1058 1059 movaps %xmm2, %xmm1 1060 shufps $0xee, %xmm1, %xmm1 1061 1062 movss 4 * SIZE(AA), %xmm0 1063 shufps $0x50, %xmm0, %xmm0 1064 mulps %xmm1, %xmm0 1065 subps %xmm0, %xmm2 1066 1067 movss 0 * SIZE(AA), %xmm0 1068 shufps $0x00, %xmm6, %xmm0 1069 mulps %xmm0, %xmm2 1070#endif 1071 1072#ifdef LT 1073 movss 0 * SIZE(AA), %xmm0 1074 shufps $0x00, %xmm6, %xmm0 1075 mulps %xmm0, %xmm2 1076 1077 movaps %xmm2, %xmm1 1078 shufps $0x44, %xmm1, %xmm1 1079 1080 movss 1 * SIZE(AA), %xmm0 1081 shufps $0x05, %xmm0, %xmm0 1082 mulps %xmm1, %xmm0 1083 subps %xmm0, %xmm2 1084 1085 movsd 2 * SIZE(AA), %xmm0 1086 shufps $0x50, %xmm0, %xmm0 1087 mulps %xmm1, %xmm0 1088 subps %xmm0, %xmm3 1089 1090 movss 5 * SIZE(AA), %xmm0 1091 movaps %xmm6, %xmm1 1092 shufps $0x00, %xmm0, %xmm1 1093 mulps %xmm1, %xmm2 1094 1095 movaps %xmm2, %xmm1 1096 shufps $0xee, %xmm1, %xmm1 1097 1098 movsd 6 * SIZE(AA), %xmm0 1099 shufps $0x50, %xmm0, %xmm0 1100 mulps %xmm1, %xmm0 1101 subps %xmm0, %xmm3 1102 1103 movss 10 * SIZE(AA), %xmm0 1104 shufps $0x00, %xmm6, %xmm0 1105 mulps %xmm0, %xmm3 
1106 1107 movaps %xmm3, %xmm1 1108 shufps $0x44, %xmm1, %xmm1 1109 1110 movss 11 * SIZE(AA), %xmm0 1111 shufps $0x05, %xmm0, %xmm0 1112 mulps %xmm1, %xmm0 1113 subps %xmm0, %xmm3 1114 1115 movss 15 * SIZE(AA), %xmm0 1116 movaps %xmm6, %xmm1 1117 shufps $0x00, %xmm0, %xmm1 1118 mulps %xmm1, %xmm3 1119#endif 1120 1121#ifdef RN 1122 movss 0 * SIZE(B), %xmm6 1123 shufps $0x00, %xmm6, %xmm6 1124 mulps %xmm6, %xmm0 1125#endif 1126 1127#ifdef RT 1128 movss 0 * SIZE(B), %xmm6 1129 shufps $0x00, %xmm6, %xmm6 1130 mulps %xmm6, %xmm0 1131#endif 1132 1133#if defined(LN) || defined(LT) 1134 shufps $0x88, %xmm3, %xmm2 1135 1136 movlps %xmm2, 0 * SIZE(B) 1137 movhps %xmm2, 2 * SIZE(B) 1138 1139#ifdef HAVE_SSE2 1140 pshufd $0x00, %xmm2, %xmm0 1141 pshufd $0x55, %xmm2, %xmm1 1142 pshufd $0xaa, %xmm2, %xmm4 1143 pshufd $0xff, %xmm2, %xmm6 1144#else 1145 movaps %xmm2, %xmm0 1146 shufps $0x00, %xmm0, %xmm0 1147 movaps %xmm2, %xmm1 1148 shufps $0x55, %xmm1, %xmm1 1149 movaps %xmm2, %xmm4 1150 shufps $0xaa, %xmm4, %xmm4 1151 movaps %xmm2, %xmm6 1152 shufps $0xff, %xmm6, %xmm6 1153#endif 1154 1155 movaps %xmm0, 0 * SIZE(BB) 1156 movaps %xmm1, 4 * SIZE(BB) 1157 movaps %xmm4, 8 * SIZE(BB) 1158 movaps %xmm6, 12 * SIZE(BB) 1159#else 1160 movaps %xmm0, 0 * SIZE(AA) 1161#endif 1162 1163#ifdef LN 1164 subl $4 * SIZE, CO1 1165#endif 1166 1167#if defined(LN) || defined(LT) 1168 movlps %xmm2, 0 * SIZE(CO1) 1169 movhps %xmm2, 2 * SIZE(CO1) 1170#else 1171 movlps %xmm0, 0 * SIZE(CO1) 1172 movhps %xmm0, 2 * SIZE(CO1) 1173#endif 1174 1175#ifndef LN 1176 addl $4 * SIZE, CO1 1177#endif 1178 1179#if defined(LT) || defined(RN) 1180 movl K, %eax 1181 subl KK, %eax 1182 leal (,%eax, SIZE), %eax 1183 leal (AA, %eax, 4), AA 1184#ifdef LT 1185 addl $4 * SIZE, B 1186#endif 1187#endif 1188 1189#ifdef LN 1190 subl $4, KK 1191 movl BORIG, B 1192#endif 1193 1194#ifdef LT 1195 addl $4, KK 1196#endif 1197 1198#ifdef RT 1199 movl K, %eax 1200 movl BORIG, B 1201 sall $2 + BASE_SHIFT, %eax 1202 addl %eax, AORIG 
1203#endif 1204 ALIGN_2 1205 1206.L150: 1207 testl $2, M 1208 jle .L170 1209 1210#ifdef LN 1211 movl K, %eax 1212 sall $1 + BASE_SHIFT, %eax 1213 subl %eax, AORIG 1214#endif 1215 1216#if defined(LN) || defined(RT) 1217 movl KK, %eax 1218 movl AORIG, AA 1219 sall $1 + BASE_SHIFT, %eax 1220 addl %eax, AA 1221#endif 1222 1223 leal BUFFER, BB 1224 1225#if defined(LN) || defined(RT) 1226 movl KK, %eax 1227 sall $BASE_SHIFT, %eax 1228 leal (BB, %eax, 4), BB 1229#endif 1230 1231 movaps 0 * SIZE(BB), %xmm2 1232 xorps %xmm4, %xmm4 1233#ifdef movsd 1234 xorps %xmm0, %xmm0 1235#endif 1236 movsd 0 * SIZE(AA), %xmm0 1237 xorps %xmm5, %xmm5 1238 movaps 16 * SIZE(BB), %xmm3 1239 xorps %xmm6, %xmm6 1240#ifdef movsd 1241 xorps %xmm1, %xmm1 1242#endif 1243 movsd 8 * SIZE(AA), %xmm1 1244 xorps %xmm7, %xmm7 1245 1246#if defined(LT) || defined(RN) 1247 movl KK, %eax 1248#else 1249 movl K, %eax 1250 subl KK, %eax 1251#endif 1252 sarl $3, %eax 1253 je .L152 1254 ALIGN_2 1255 1256.L151: 1257 mulps %xmm0, %xmm2 1258 movsd 2 * SIZE(AA), %xmm0 1259 addps %xmm2, %xmm4 1260 movaps 4 * SIZE(BB), %xmm2 1261 mulps %xmm0, %xmm2 1262 movsd 4 * SIZE(AA), %xmm0 1263 addps %xmm2, %xmm5 1264 movaps 8 * SIZE(BB), %xmm2 1265 mulps %xmm0, %xmm2 1266 movsd 6 * SIZE(AA), %xmm0 1267 addps %xmm2, %xmm6 1268 movaps 12 * SIZE(BB), %xmm2 1269 mulps %xmm0, %xmm2 1270 movsd 16 * SIZE(AA), %xmm0 1271 addps %xmm2, %xmm7 1272 movaps 32 * SIZE(BB), %xmm2 1273 mulps %xmm1, %xmm3 1274 movsd 10 * SIZE(AA), %xmm1 1275 addps %xmm3, %xmm4 1276 movaps 20 * SIZE(BB), %xmm3 1277 mulps %xmm1, %xmm3 1278 movsd 12 * SIZE(AA), %xmm1 1279 addps %xmm3, %xmm5 1280 movaps 24 * SIZE(BB), %xmm3 1281 mulps %xmm1, %xmm3 1282 movsd 14 * SIZE(AA), %xmm1 1283 addps %xmm3, %xmm6 1284 movaps 28 * SIZE(BB), %xmm3 1285 mulps %xmm1, %xmm3 1286 movsd 24 * SIZE(AA), %xmm1 1287 addps %xmm3, %xmm7 1288 movaps 48 * SIZE(BB), %xmm3 1289 1290 addl $16 * SIZE, AA 1291 addl $32 * SIZE, BB 1292 decl %eax 1293 jne .L151 1294 ALIGN_2 1295 1296.L152: 1297#if 
defined(LT) || defined(RN) 1298 movl KK, %eax 1299#else 1300 movl K, %eax 1301 subl KK, %eax 1302#endif 1303 andl $7, %eax # if (k & 1) 1304 BRANCH 1305 je .L154 1306 1307.L153: 1308 mulps %xmm0, %xmm2 1309 movsd 2 * SIZE(AA), %xmm0 1310 addps %xmm2, %xmm4 1311 movaps 4 * SIZE(BB), %xmm2 1312 1313 addl $2 * SIZE, AA 1314 addl $4 * SIZE, BB 1315 decl %eax 1316 jg .L153 1317 ALIGN_4 1318 1319.L154: 1320 addps %xmm5, %xmm4 1321 addps %xmm7, %xmm6 1322 addps %xmm6, %xmm4 1323 1324#if defined(LN) || defined(RT) 1325 movl KK, %eax 1326#ifdef LN 1327 subl $2, %eax 1328#else 1329 subl $1, %eax 1330#endif 1331 1332 movl AORIG, AA 1333 movl BORIG, B 1334 leal BUFFER, BB 1335 1336 sall $BASE_SHIFT, %eax 1337 leal (AA, %eax, 2), AA 1338 leal (B, %eax, 1), B 1339 leal (BB, %eax, 4), BB 1340#endif 1341 1342#if defined(LN) || defined(LT) 1343 movaps %xmm4, %xmm5 1344 shufps $1, %xmm5, %xmm5 1345 1346 movss 0 * SIZE(B), %xmm0 1347 movss 1 * SIZE(B), %xmm1 1348 1349 subss %xmm4, %xmm0 1350 subss %xmm5, %xmm1 1351#else 1352#ifdef movsd 1353 xorps %xmm0, %xmm0 1354#endif 1355 movsd 0 * SIZE(AA), %xmm0 1356 subps %xmm4, %xmm0 1357#endif 1358 1359#ifdef LN 1360 movaps 0 * SIZE(AA), %xmm4 1361 1362 movaps %xmm4, %xmm6 1363 shufps $0xff, %xmm6, %xmm6 1364 mulss %xmm6, %xmm1 1365 1366 movaps %xmm4, %xmm6 1367 shufps $0xaa, %xmm6, %xmm6 1368 mulss %xmm1, %xmm6 1369 subss %xmm6, %xmm0 1370 mulss %xmm4, %xmm0 1371#endif 1372 1373#ifdef LT 1374 movaps 0 * SIZE(AA), %xmm4 1375 mulss %xmm4, %xmm0 1376 movaps %xmm4, %xmm6 1377 shufps $0x55, %xmm6, %xmm6 1378 mulss %xmm0, %xmm6 1379 subss %xmm6, %xmm1 1380 movaps %xmm4, %xmm6 1381 shufps $0xff, %xmm6, %xmm6 1382 mulss %xmm6, %xmm1 1383#endif 1384 1385#ifdef RN 1386 movss 0 * SIZE(B), %xmm6 1387 shufps $0x00, %xmm6, %xmm6 1388 mulps %xmm6, %xmm0 1389#endif 1390 1391#ifdef RT 1392 movss 0 * SIZE(B), %xmm6 1393 shufps $0x00, %xmm6, %xmm6 1394 mulps %xmm6, %xmm0 1395#endif 1396 1397#if defined(LN) || defined(LT) 1398 movss %xmm0, 0 * SIZE(B) 1399 
movss %xmm1, 1 * SIZE(B) 1400 1401 shufps $0x00, %xmm0, %xmm0 1402 shufps $0x00, %xmm1, %xmm1 1403 movaps %xmm0, 0 * SIZE(BB) 1404 movaps %xmm1, 4 * SIZE(BB) 1405#else 1406 movlps %xmm0, 0 * SIZE(AA) 1407#endif 1408 1409#ifdef LN 1410 subl $2 * SIZE, CO1 1411#endif 1412 1413#if defined(LN) || defined(LT) 1414 movss %xmm0, 0 * SIZE(CO1) 1415 movss %xmm1, 1 * SIZE(CO1) 1416#else 1417 movlps %xmm0, 0 * SIZE(CO1) 1418#endif 1419 1420#ifndef LN 1421 addl $2 * SIZE, CO1 1422#endif 1423 1424#if defined(LT) || defined(RN) 1425 movl K, %eax 1426 subl KK, %eax 1427 leal (,%eax, SIZE), %eax 1428 leal (AA, %eax, 2), AA 1429#ifdef LT 1430 addl $2 * SIZE, B 1431#endif 1432#endif 1433 1434#ifdef LN 1435 subl $2, KK 1436 movl BORIG, B 1437#endif 1438 1439#ifdef LT 1440 addl $2, KK 1441#endif 1442 1443#ifdef RT 1444 movl K, %eax 1445 movl BORIG, B 1446 sall $1 + BASE_SHIFT, %eax 1447 addl %eax, AORIG 1448#endif 1449 ALIGN_2 1450 1451.L170: 1452 testl $1, M 1453 jle .L179 1454 1455#ifdef LN 1456 movl K, %eax 1457 sall $BASE_SHIFT, %eax 1458 subl %eax, AORIG 1459#endif 1460 1461#if defined(LN) || defined(RT) 1462 movl KK, %eax 1463 movl AORIG, AA 1464 leal (AA, %eax, SIZE), AA 1465#endif 1466 1467 leal BUFFER, BB 1468 1469#if defined(LN) || defined(RT) 1470 movl KK, %eax 1471 sall $BASE_SHIFT, %eax 1472 leal (BB, %eax, 4), BB 1473#endif 1474 1475 movss 0 * SIZE(BB), %xmm2 1476 xorps %xmm4, %xmm4 1477 movss 0 * SIZE(AA), %xmm0 1478 xorps %xmm5, %xmm5 1479 movss 16 * SIZE(BB), %xmm3 1480 xorps %xmm6, %xmm6 1481 movss 4 * SIZE(AA), %xmm1 1482 xorps %xmm7, %xmm7 1483 1484#if defined(LT) || defined(RN) 1485 movl KK, %eax 1486#else 1487 movl K, %eax 1488 subl KK, %eax 1489#endif 1490 sarl $3, %eax 1491 je .L172 1492 ALIGN_2 1493 1494.L171: 1495 mulss %xmm0, %xmm2 1496 movss 1 * SIZE(AA), %xmm0 1497 addss %xmm2, %xmm4 1498 mulss 4 * SIZE(BB), %xmm0 1499 movss 32 * SIZE(BB), %xmm2 1500 addss %xmm0, %xmm5 1501 movss 2 * SIZE(AA), %xmm0 1502 mulss 8 * SIZE(BB), %xmm0 1503 addss %xmm0, %xmm6 
1504 movss 3 * SIZE(AA), %xmm0 1505 mulss 12 * SIZE(BB), %xmm0 1506 addss %xmm0, %xmm7 1507 movss 8 * SIZE(AA), %xmm0 1508 mulss %xmm1, %xmm3 1509 movss 5 * SIZE(AA), %xmm1 1510 addss %xmm3, %xmm4 1511 mulss 20 * SIZE(BB), %xmm1 1512 movss 48 * SIZE(BB), %xmm3 1513 addss %xmm1, %xmm5 1514 movss 6 * SIZE(AA), %xmm1 1515 mulss 24 * SIZE(BB), %xmm1 1516 addss %xmm1, %xmm6 1517 movss 7 * SIZE(AA), %xmm1 1518 mulss 28 * SIZE(BB), %xmm1 1519 addss %xmm1, %xmm7 1520 movss 12 * SIZE(AA), %xmm1 1521 1522 addl $ 8 * SIZE, AA 1523 addl $32 * SIZE, BB 1524 decl %eax 1525 jne .L171 1526 ALIGN_2 1527 1528.L172: 1529#if defined(LT) || defined(RN) 1530 movl KK, %eax 1531#else 1532 movl K, %eax 1533 subl KK, %eax 1534#endif 1535 andl $7, %eax # if (k & 1) 1536 BRANCH 1537 je .L174 1538 1539.L173: 1540 movss 0 * SIZE(AA), %xmm0 1541 movss 0 * SIZE(BB), %xmm2 1542 mulss %xmm0, %xmm2 1543 addss %xmm2, %xmm4 1544 1545 addl $1 * SIZE, AA 1546 addl $4 * SIZE, BB 1547 decl %eax 1548 jg .L173 1549 ALIGN_4 1550 1551.L174: 1552 addss %xmm5, %xmm4 1553 addss %xmm7, %xmm6 1554 addss %xmm6, %xmm4 1555 1556#if defined(LN) || defined(RT) 1557 movl KK, %eax 1558 subl $1, %eax 1559 1560 movl AORIG, AA 1561 movl BORIG, B 1562 leal BUFFER, BB 1563 1564 sall $ BASE_SHIFT, %eax 1565 leal (AA, %eax, 1), AA 1566 leal (B, %eax, 1), B 1567 leal (BB, %eax, 4), BB 1568#endif 1569 1570#if defined(LN) || defined(LT) 1571 movss 0 * SIZE(B), %xmm1 1572 subss %xmm4, %xmm1 1573#else 1574 movss 0 * SIZE(AA), %xmm0 1575 subss %xmm4, %xmm0 1576#endif 1577 1578#if defined(LN) || defined(LT) 1579 mulss 0 * SIZE(AA), %xmm1 1580#endif 1581 1582#if defined(RN) || defined(RT) 1583 mulss 0 * SIZE(B), %xmm0 1584#endif 1585 1586#if defined(LN) || defined(LT) 1587 movss %xmm1, 0 * SIZE(B) 1588 1589 shufps $0x00, %xmm1, %xmm1 1590 movaps %xmm1, 0 * SIZE(BB) 1591#else 1592 movss %xmm0, 0 * SIZE(AA) 1593#endif 1594 1595#ifdef LN 1596 subl $1 * SIZE, CO1 1597#endif 1598 1599#if defined(LN) || defined(LT) 1600 movss %xmm1, 0 * 
SIZE(CO1) 1601#else 1602 movss %xmm0, 0 * SIZE(CO1) 1603#endif 1604 1605#ifndef LN 1606 addl $1 * SIZE, CO1 1607#endif 1608 1609#if defined(LT) || defined(RN) 1610 movl K, %eax 1611 subl KK, %eax 1612 leal (AA, %eax, SIZE), AA 1613#ifdef LT 1614 addl $1 * SIZE, B 1615#endif 1616#endif 1617 1618#ifdef LN 1619 subl $1, KK 1620 movl BORIG, B 1621#endif 1622 1623#ifdef LT 1624 addl $1, KK 1625#endif 1626 1627#ifdef RT 1628 movl K, %eax 1629 movl BORIG, B 1630 sall $BASE_SHIFT, %eax 1631 addl %eax, AORIG 1632#endif 1633 ALIGN_2 1634.L179: 1635#ifdef LN 1636 movl K, %eax 1637 leal (B, %eax, SIZE), B 1638#endif 1639 1640#if defined(LT) || defined(RN) 1641 movl K, %eax 1642 subl KK, %eax 1643 leal (B, %eax, SIZE), B 1644#endif 1645 1646#ifdef RN 1647 addl $1, KK 1648#endif 1649 1650#ifdef RT 1651 subl $1, KK 1652#endif 1653 ALIGN_4 1654 1655.L100: 1656 movl N, %eax 1657 sarl $1, %eax # j = (n >> 1) 1658 movl %eax, J 1659 jle .L999 1660 ALIGN_2 1661 1662.L01: 1663#ifdef LN 1664 movl OFFSET, %eax 1665 addl M, %eax 1666 movl %eax, KK 1667#endif 1668 1669 leal BUFFER, BB 1670 1671#ifdef RT 1672 movl K, %eax 1673 sall $1 + BASE_SHIFT, %eax 1674 subl %eax, B 1675#endif 1676 1677#if defined(LN) || defined(RT) 1678 movl KK, %eax 1679 movl B, BORIG 1680 sall $1 + BASE_SHIFT, %eax 1681 leal (B, %eax, 1), B 1682 leal (BB, %eax, 4), BB 1683#endif 1684 1685#ifdef LT 1686 movl OFFSET, %eax 1687 movl %eax, KK 1688#endif 1689 1690#if defined(LT) || defined(RN) 1691 movl KK, %eax 1692#else 1693 movl K, %eax 1694 subl KK, %eax 1695#endif 1696 sarl $2, %eax 1697 jle .L03 1698 ALIGN_4 1699 1700.L02: 1701 movsd 0 * SIZE(B), %xmm3 1702 movhps 2 * SIZE(B), %xmm3 1703 movsd 4 * SIZE(B), %xmm7 1704 movhps 6 * SIZE(B), %xmm7 1705 1706#ifdef HAVE_SSE2 1707 pshufd $0x00, %xmm3, %xmm0 1708 pshufd $0x55, %xmm3, %xmm1 1709 pshufd $0xaa, %xmm3, %xmm2 1710 pshufd $0xff, %xmm3, %xmm3 1711 1712 pshufd $0x00, %xmm7, %xmm4 1713 pshufd $0x55, %xmm7, %xmm5 1714 pshufd $0xaa, %xmm7, %xmm6 1715 pshufd $0xff, 
%xmm7, %xmm7 1716#else 1717 movaps %xmm3, %xmm0 1718 shufps $0x00, %xmm0, %xmm0 1719 movaps %xmm3, %xmm1 1720 shufps $0x55, %xmm1, %xmm1 1721 movaps %xmm3, %xmm2 1722 shufps $0xaa, %xmm2, %xmm2 1723 shufps $0xff, %xmm3, %xmm3 1724 1725 movaps %xmm7, %xmm4 1726 shufps $0x00, %xmm4, %xmm4 1727 movaps %xmm7, %xmm5 1728 shufps $0x55, %xmm5, %xmm5 1729 movaps %xmm7, %xmm6 1730 shufps $0xaa, %xmm6, %xmm6 1731 shufps $0xff, %xmm7, %xmm7 1732#endif 1733 1734 movaps %xmm0, 0 * SIZE(BB) 1735 movaps %xmm1, 4 * SIZE(BB) 1736 movaps %xmm2, 8 * SIZE(BB) 1737 movaps %xmm3, 12 * SIZE(BB) 1738 movaps %xmm4, 16 * SIZE(BB) 1739 movaps %xmm5, 20 * SIZE(BB) 1740 movaps %xmm6, 24 * SIZE(BB) 1741 movaps %xmm7, 28 * SIZE(BB) 1742 1743 addl $ 8 * SIZE, B 1744 addl $32 * SIZE, BB 1745 decl %eax 1746 BRANCH 1747 jne .L02 1748 ALIGN_2 1749 1750.L03: 1751#if defined(LT) || defined(RN) 1752 movl KK, %eax 1753#else 1754 movl K, %eax 1755 subl KK, %eax 1756#endif 1757 andl $3, %eax 1758 BRANCH 1759 jle .L05 1760 ALIGN_2 1761 1762.L04: 1763 movsd 0 * SIZE(B), %xmm3 1764 1765#ifdef HAVE_SSE2 1766 pshufd $0x00, %xmm3, %xmm0 1767 pshufd $0x55, %xmm3, %xmm1 1768#else 1769 movaps %xmm3, %xmm0 1770 shufps $0x00, %xmm0, %xmm0 1771 movaps %xmm3, %xmm1 1772 shufps $0x55, %xmm1, %xmm1 1773#endif 1774 1775 movaps %xmm0, 0 * SIZE(BB) 1776 movaps %xmm1, 4 * SIZE(BB) 1777 1778 addl $2 * SIZE, B 1779 addl $8 * SIZE, BB 1780 1781 decl %eax 1782 jne .L04 1783 ALIGN_4 1784 1785.L05: 1786#if defined(LT) || defined(RN) 1787 movl A, AA 1788#else 1789 movl A, %eax 1790 movl %eax, AORIG 1791#endif 1792 1793 leal (, LDC, 2), %eax 1794 1795#ifdef RT 1796 subl %eax, C 1797#endif 1798 movl C, CO1 1799#ifndef RT 1800 addl %eax, C 1801#endif 1802 1803 movl M, %ebx 1804 sarl $3, %ebx 1805 jle .L30 1806 ALIGN_4 1807 1808.L10: 1809#ifdef LN 1810 movl K, %eax 1811 sall $3 + BASE_SHIFT, %eax 1812 subl %eax, AORIG 1813#endif 1814 1815#if defined(LN) || defined(RT) 1816 movl KK, %eax 1817 movl AORIG, AA 1818 sall $3 + BASE_SHIFT, 
%eax 1819 addl %eax, AA 1820#endif 1821 1822 leal BUFFER, BB 1823 1824#if defined(LN) || defined(RT) 1825 movl KK, %eax 1826 sall $1 + BASE_SHIFT, %eax 1827 leal (BB, %eax, 4), BB 1828#endif 1829 1830 movaps 0 * SIZE(BB), %xmm2 1831 xorps %xmm4, %xmm4 1832 movaps 0 * SIZE(AA), %xmm0 1833 xorps %xmm5, %xmm5 1834 movaps 8 * SIZE(BB), %xmm3 1835 xorps %xmm6, %xmm6 1836 movaps 8 * SIZE(AA), %xmm1 1837 xorps %xmm7, %xmm7 1838 1839 PREFETCHW 7 * SIZE(CO1) 1840 PREFETCHW 7 * SIZE(CO1, LDC) 1841 1842#if defined(LT) || defined(RN) 1843 movl KK, %eax 1844#else 1845 movl K, %eax 1846 subl KK, %eax 1847#endif 1848 sarl $3, %eax 1849 je .L12 1850 ALIGN_2 1851 1852.L11: 1853 mulps %xmm0, %xmm2 1854 mulps 4 * SIZE(BB), %xmm0 1855 addps %xmm2, %xmm4 1856 movaps 0 * SIZE(BB), %xmm2 1857 1858 addps %xmm0, %xmm5 1859 movaps 4 * SIZE(AA), %xmm0 1860 mulps %xmm0, %xmm2 1861 mulps 4 * SIZE(BB), %xmm0 1862 1863 addps %xmm2, %xmm6 1864 movaps 16 * SIZE(BB), %xmm2 1865 addps %xmm0, %xmm7 1866 movaps 16 * SIZE(AA), %xmm0 1867 1868 mulps %xmm1, %xmm3 1869 mulps 12 * SIZE(BB), %xmm1 1870 addps %xmm3, %xmm4 1871 movaps 8 * SIZE(BB), %xmm3 1872 1873 addps %xmm1, %xmm5 1874 movaps 12 * SIZE(AA), %xmm1 1875 mulps %xmm1, %xmm3 1876 mulps 12 * SIZE(BB), %xmm1 1877 1878 addps %xmm3, %xmm6 1879 movaps 24 * SIZE(BB), %xmm3 1880 addps %xmm1, %xmm7 1881 movaps 24 * SIZE(AA), %xmm1 1882 1883 mulps %xmm0, %xmm2 1884 mulps 20 * SIZE(BB), %xmm0 1885 addps %xmm2, %xmm4 1886 movaps 16 * SIZE(BB), %xmm2 1887 1888 addps %xmm0, %xmm5 1889 movaps 20 * SIZE(AA), %xmm0 1890 mulps %xmm0, %xmm2 1891 mulps 20 * SIZE(BB), %xmm0 1892 1893 addps %xmm2, %xmm6 1894 movaps 32 * SIZE(BB), %xmm2 1895 addps %xmm0, %xmm7 1896 movaps 32 * SIZE(AA), %xmm0 1897 1898 mulps %xmm1, %xmm3 1899 mulps 28 * SIZE(BB), %xmm1 1900 addps %xmm3, %xmm4 1901 movaps 24 * SIZE(BB), %xmm3 1902 1903 addps %xmm1, %xmm5 1904 movaps 28 * SIZE(AA), %xmm1 1905 mulps %xmm1, %xmm3 1906 mulps 28 * SIZE(BB), %xmm1 1907 1908 addps %xmm3, %xmm6 1909 movaps 40 
* SIZE(BB), %xmm3 1910 addps %xmm1, %xmm7 1911 movaps 40 * SIZE(AA), %xmm1 1912 1913 mulps %xmm0, %xmm2 1914 mulps 36 * SIZE(BB), %xmm0 1915 addps %xmm2, %xmm4 1916 movaps 32 * SIZE(BB), %xmm2 1917 1918 addps %xmm0, %xmm5 1919 movaps 36 * SIZE(AA), %xmm0 1920 mulps %xmm0, %xmm2 1921 mulps 36 * SIZE(BB), %xmm0 1922 1923 addps %xmm2, %xmm6 1924 movaps 48 * SIZE(BB), %xmm2 1925 addps %xmm0, %xmm7 1926 movaps 48 * SIZE(AA), %xmm0 1927 1928 mulps %xmm1, %xmm3 1929 mulps 44 * SIZE(BB), %xmm1 1930 addps %xmm3, %xmm4 1931 movaps 40 * SIZE(BB), %xmm3 1932 1933 addps %xmm1, %xmm5 1934 movaps 44 * SIZE(AA), %xmm1 1935 mulps %xmm1, %xmm3 1936 mulps 44 * SIZE(BB), %xmm1 1937 1938 addps %xmm3, %xmm6 1939 movaps 56 * SIZE(BB), %xmm3 1940 addps %xmm1, %xmm7 1941 movaps 56 * SIZE(AA), %xmm1 1942 1943 mulps %xmm0, %xmm2 1944 mulps 52 * SIZE(BB), %xmm0 1945 addps %xmm2, %xmm4 1946 movaps 48 * SIZE(BB), %xmm2 1947 1948 addps %xmm0, %xmm5 1949 movaps 52 * SIZE(AA), %xmm0 1950 mulps %xmm0, %xmm2 1951 mulps 52 * SIZE(BB), %xmm0 1952 1953 addps %xmm2, %xmm6 1954 movaps 64 * SIZE(BB), %xmm2 1955 addps %xmm0, %xmm7 1956 movaps 64 * SIZE(AA), %xmm0 1957 1958 mulps %xmm1, %xmm3 1959 mulps 60 * SIZE(BB), %xmm1 1960 addps %xmm3, %xmm4 1961 movaps 56 * SIZE(BB), %xmm3 1962 1963 addps %xmm1, %xmm5 1964 movaps 60 * SIZE(AA), %xmm1 1965 mulps %xmm1, %xmm3 1966 mulps 60 * SIZE(BB), %xmm1 1967 1968 addps %xmm3, %xmm6 1969 movaps 72 * SIZE(BB), %xmm3 1970 addps %xmm1, %xmm7 1971 movaps 72 * SIZE(AA), %xmm1 1972 1973 addl $64 * SIZE, BB 1974 addl $64 * SIZE, AA 1975 decl %eax 1976 jne .L11 1977 ALIGN_2 1978 1979.L12: 1980#if defined(LT) || defined(RN) 1981 movl KK, %eax 1982#else 1983 movl K, %eax 1984 subl KK, %eax 1985#endif 1986 andl $7, %eax # if (k & 1) 1987 BRANCH 1988 je .L14 1989 1990.L13: 1991 movaps 4 * SIZE(BB), %xmm1 1992 mulps %xmm0, %xmm2 1993 addps %xmm2, %xmm4 1994 movaps 0 * SIZE(BB), %xmm2 1995 mulps %xmm0, %xmm1 1996 movaps 4 * SIZE(AA), %xmm0 1997 addps %xmm1, %xmm5 1998 movaps 4 * 
SIZE(BB), %xmm1 1999 mulps %xmm0, %xmm2 2000 addps %xmm2, %xmm6 2001 movaps 8 * SIZE(BB), %xmm2 2002 mulps %xmm0, %xmm1 2003 movaps 8 * SIZE(AA), %xmm0 2004 addps %xmm1, %xmm7 2005 2006 addl $8 * SIZE, AA 2007 addl $8 * SIZE, BB 2008 subl $1, %eax 2009 jg .L13 2010 ALIGN_4 2011 2012.L14: 2013#if defined(LN) || defined(RT) 2014 movl KK, %eax 2015#ifdef LN 2016 subl $8, %eax 2017#else 2018 subl $2, %eax 2019#endif 2020 2021 movl AORIG, AA 2022 movl BORIG, B 2023 leal BUFFER, BB 2024 2025 sall $BASE_SHIFT, %eax 2026 leal (AA, %eax, 8), AA 2027 leal (B, %eax, 2), B 2028 leal (BB, %eax, 8), BB 2029#endif 2030 2031#if defined(LN) || defined(LT) 2032 movaps %xmm4, %xmm0 2033 unpcklps %xmm5, %xmm4 2034 unpckhps %xmm5, %xmm0 2035 2036 movaps %xmm6, %xmm1 2037 unpcklps %xmm7, %xmm6 2038 unpckhps %xmm7, %xmm1 2039 2040 movsd 0 * SIZE(B), %xmm2 2041 movhps 2 * SIZE(B), %xmm2 2042 movsd 4 * SIZE(B), %xmm3 2043 movhps 6 * SIZE(B), %xmm3 2044 movsd 8 * SIZE(B), %xmm5 2045 movhps 10 * SIZE(B), %xmm5 2046 movsd 12 * SIZE(B), %xmm7 2047 movhps 14 * SIZE(B), %xmm7 2048 2049 subps %xmm4, %xmm2 2050 subps %xmm0, %xmm3 2051 subps %xmm6, %xmm5 2052 subps %xmm1, %xmm7 2053#else 2054 movaps 0 * SIZE(AA), %xmm0 2055 movaps 4 * SIZE(AA), %xmm1 2056 movaps 8 * SIZE(AA), %xmm2 2057 movaps 12 * SIZE(AA), %xmm3 2058 2059 subps %xmm4, %xmm0 2060 subps %xmm6, %xmm1 2061 subps %xmm5, %xmm2 2062 subps %xmm7, %xmm3 2063#endif 2064 2065#if defined(LN) || defined(LT) 2066 movaps TRMASK, %xmm6 2067#endif 2068 2069#ifdef LN 2070 movss 63 * SIZE(AA), %xmm0 2071 movaps %xmm6, %xmm1 2072 shufps $0x00, %xmm0, %xmm1 2073 mulps %xmm1, %xmm7 2074 2075 movaps %xmm7, %xmm1 2076 shufps $0xee, %xmm1, %xmm1 2077 2078 movss 62 * SIZE(AA), %xmm0 2079 shufps $0x50, %xmm0, %xmm0 2080 mulps %xmm1, %xmm0 2081 subps %xmm0, %xmm7 2082 2083 movsd 60 * SIZE(AA), %xmm0 2084 shufps $0x50, %xmm0, %xmm0 2085 mulps %xmm1, %xmm0 2086 subps %xmm0, %xmm5 2087 2088 movsd 58 * SIZE(AA), %xmm0 2089 shufps $0x50, %xmm0, %xmm0 2090 mulps 
%xmm1, %xmm0 2091 subps %xmm0, %xmm3 2092 2093 movsd 56 * SIZE(AA), %xmm0 2094 shufps $0x50, %xmm0, %xmm0 2095 mulps %xmm1, %xmm0 2096 subps %xmm0, %xmm2 2097 2098 movss 54 * SIZE(AA), %xmm0 2099 shufps $0x00, %xmm6, %xmm0 2100 mulps %xmm0, %xmm7 2101 2102 movaps %xmm7, %xmm1 2103 shufps $0x44, %xmm1, %xmm1 2104 2105 movsd 52 * SIZE(AA), %xmm0 2106 shufps $0x50, %xmm0, %xmm0 2107 mulps %xmm1, %xmm0 2108 subps %xmm0, %xmm5 2109 2110 movsd 50 * SIZE(AA), %xmm0 2111 shufps $0x50, %xmm0, %xmm0 2112 mulps %xmm1, %xmm0 2113 subps %xmm0, %xmm3 2114 2115 movsd 48 * SIZE(AA), %xmm0 2116 shufps $0x50, %xmm0, %xmm0 2117 mulps %xmm1, %xmm0 2118 subps %xmm0, %xmm2 2119 2120 2121 movss 45 * SIZE(AA), %xmm0 2122 movaps %xmm6, %xmm1 2123 shufps $0x00, %xmm0, %xmm1 2124 mulps %xmm1, %xmm5 2125 2126 movaps %xmm5, %xmm1 2127 shufps $0xee, %xmm1, %xmm1 2128 2129 movss 44 * SIZE(AA), %xmm0 2130 shufps $0x50, %xmm0, %xmm0 2131 mulps %xmm1, %xmm0 2132 subps %xmm0, %xmm5 2133 2134 movsd 42 * SIZE(AA), %xmm0 2135 shufps $0x50, %xmm0, %xmm0 2136 mulps %xmm1, %xmm0 2137 subps %xmm0, %xmm3 2138 2139 movsd 40 * SIZE(AA), %xmm0 2140 shufps $0x50, %xmm0, %xmm0 2141 mulps %xmm1, %xmm0 2142 subps %xmm0, %xmm2 2143 2144 movss 36 * SIZE(AA), %xmm0 2145 shufps $0x00, %xmm6, %xmm0 2146 mulps %xmm0, %xmm5 2147 2148 movaps %xmm5, %xmm1 2149 shufps $0x44, %xmm1, %xmm1 2150 2151 movsd 34 * SIZE(AA), %xmm0 2152 shufps $0x50, %xmm0, %xmm0 2153 mulps %xmm1, %xmm0 2154 subps %xmm0, %xmm3 2155 2156 movsd 32 * SIZE(AA), %xmm0 2157 shufps $0x50, %xmm0, %xmm0 2158 mulps %xmm1, %xmm0 2159 subps %xmm0, %xmm2 2160 2161 movss 27 * SIZE(AA), %xmm0 2162 movaps %xmm6, %xmm1 2163 shufps $0x00, %xmm0, %xmm1 2164 mulps %xmm1, %xmm3 2165 2166 movaps %xmm3, %xmm1 2167 shufps $0xee, %xmm1, %xmm1 2168 2169 movss 26 * SIZE(AA), %xmm0 2170 shufps $0x50, %xmm0, %xmm0 2171 mulps %xmm1, %xmm0 2172 subps %xmm0, %xmm3 2173 2174 movsd 24 * SIZE(AA), %xmm0 2175 shufps $0x50, %xmm0, %xmm0 2176 mulps %xmm1, %xmm0 2177 subps %xmm0, %xmm2 
/* ------------------------------------------------------------------ */
/* Triangular solve of the 8x2 tile (continuation).                   */
/* This span finishes the #ifdef LN forward pass, then provides the   */
/* LT, RN and RT variants, and begins spilling the solved tile back   */
/* to B and the expanded BUFFER (BB).                                 */
/*                                                                    */
/* Register roles established earlier in this tile's code path:       */
/*   LN/LT: xmm2,xmm3,xmm5,xmm7 = RHS / solution value pairs,         */
/*          xmm6 = TRMASK constant kept live for lane masking.        */
/*   RN/RT: xmm0,xmm1 = row 0, xmm2,xmm3 = row 1 of the solution.     */
/* Diagonal entries of the 8x8 A block are read at offsets i*9        */
/* (= i*8 + i) and are MULTIPLIED in, so the packed A presumably      */
/* stores reciprocals of the diagonal -- TODO confirm against the     */
/* corresponding pack routine.                                        */
/* ------------------------------------------------------------------ */

	/* LN path, tail: eliminate with row 2 then finish rows 1 and 0. */
	movss	18 * SIZE(AA), %xmm0	/* diagonal a(2,2) */
	shufps	$0x00, %xmm6, %xmm0	/* broadcast into low lanes, mask high */
	mulps	%xmm0, %xmm3		/* scale pair in xmm3 by diagonal */

	movaps	%xmm3, %xmm1
	shufps	$0x44, %xmm1, %xmm1	/* replicate the low (just-solved) pair */

	movsd	16 * SIZE(AA), %xmm0	/* column entries below/left of pivot */
	shufps	$0x50, %xmm0, %xmm0	/* duplicate each scalar for both N-lanes */
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2		/* update remaining RHS pair */

	movss	9 * SIZE(AA), %xmm0	/* diagonal a(1,1) */
	movaps	%xmm6, %xmm1
	shufps	$0x00, %xmm0, %xmm1	/* mask low lanes, diagonal in high lanes */
	mulps	%xmm1, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1	/* take the high (just-solved) pair */

	movss	8 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2		/* eliminate from the final pair */

	movss	0 * SIZE(AA), %xmm0	/* diagonal a(0,0) */
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm2		/* last unknown pair solved */
#endif

#ifdef LT
	/* LT path: forward substitution down the 8x8 lower-stored block, */
	/* solving two RHS columns at once.  Same masking idiom as LN:    */
	/* TRMASK (xmm6) confines the diagonal scale to the active pair.  */
	movss	0 * SIZE(AA), %xmm0	/* diagonal a(0,0) */
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0x44, %xmm1, %xmm1	/* broadcast solved pair 0 */

	movss	1 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0	/* place a(1,0) under the unsolved pair */
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movsd	2 * SIZE(AA), %xmm0	/* a(2,0), a(3,0) */
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	4 * SIZE(AA), %xmm0	/* a(4,0), a(5,0) */
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	6 * SIZE(AA), %xmm0	/* a(6,0), a(7,0) */
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	9 * SIZE(AA), %xmm0	/* diagonal a(1,1) */
	movaps	%xmm6, %xmm1
	shufps	$0x00, %xmm0, %xmm1
	mulps	%xmm1, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1	/* broadcast solved pair 1 */

	movsd	10 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	12 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	14 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	18 * SIZE(AA), %xmm0	/* diagonal a(2,2) */
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0x44, %xmm1, %xmm1	/* broadcast solved pair 2 */

	movss	19 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	20 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	22 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	27 * SIZE(AA), %xmm0	/* diagonal a(3,3) */
	movaps	%xmm6, %xmm1
	shufps	$0x00, %xmm0, %xmm1
	mulps	%xmm1, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0xee, %xmm1, %xmm1	/* broadcast solved pair 3 */

	movsd	28 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	30 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	36 * SIZE(AA), %xmm0	/* diagonal a(4,4) */
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm5

	movaps	%xmm5, %xmm1
	shufps	$0x44, %xmm1, %xmm1	/* broadcast solved pair 4 */

	movss	37 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	38 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	45 * SIZE(AA), %xmm0	/* diagonal a(5,5) */
	movaps	%xmm6, %xmm1
	shufps	$0x00, %xmm0, %xmm1
	mulps	%xmm1, %xmm5

	movaps	%xmm5, %xmm1
	shufps	$0xee, %xmm1, %xmm1	/* broadcast solved pair 5 */

	movsd	46 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	54 * SIZE(AA), %xmm0	/* diagonal a(6,6) */
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm7

	movaps	%xmm7, %xmm1
	shufps	$0x44, %xmm1, %xmm1	/* broadcast solved pair 6 */

	movss	55 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	63 * SIZE(AA), %xmm0	/* diagonal a(7,7): last unknown */
	movaps	%xmm6, %xmm1
	shufps	$0x00, %xmm0, %xmm1
	mulps	%xmm1, %xmm7
#endif

#ifdef RN
	/* RN path: solve against the 2x2 upper triangle of B.           */
	/* B(0,0), B(0,1), B(1,1) live at offsets 0, 1, 3; diagonals are */
	/* multiplied in (reciprocals presumed, as above).                */
	movss	0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6	/* broadcast b(0,0) */

	mulps	%xmm6, %xmm0		/* scale solution row 0 */
	mulps	%xmm6, %xmm1

	movss	1 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6	/* broadcast b(0,1) */
	movaps	%xmm6, %xmm5

	mulps	%xmm0, %xmm5
	mulps	%xmm1, %xmm6

	subps	%xmm5, %xmm2		/* row1 -= b(0,1) * row0 */
	subps	%xmm6, %xmm3

	movss	3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6	/* broadcast b(1,1) */

	mulps	%xmm6, %xmm2		/* scale solution row 1 */
	mulps	%xmm6, %xmm3
#endif

#ifdef RT
	/* RT path: same 2x2 solve as RN but in reverse order (row 1      */
	/* first, then eliminate into row 0 via b(1,0) at offset 2).      */
	movss	3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6	/* broadcast b(1,1) */

	mulps	%xmm6, %xmm2
	mulps	%xmm6, %xmm3

	movss	2 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6	/* broadcast b(1,0) */
	movaps	%xmm6, %xmm5

	mulps	%xmm2, %xmm5
	mulps	%xmm3, %xmm6

	subps	%xmm5, %xmm0		/* row0 -= b(1,0) * row1 */
	subps	%xmm6, %xmm1

	movss	0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6	/* broadcast b(0,0) */

	mulps	%xmm6, %xmm0
	mulps	%xmm6, %xmm1
#endif

#if defined(LN) || defined(LT)
	/* Write the solved 8x2 tile back to the packed B panel ...       */
	movlps	%xmm2,  0 * SIZE(B)
	movhps	%xmm2,  2 * SIZE(B)
	movlps	%xmm3,  4 * SIZE(B)
	movhps	%xmm3,  6 * SIZE(B)
	movlps	%xmm5,  8 * SIZE(B)
	movhps	%xmm5, 10 * SIZE(B)
	movlps	%xmm7, 12 * SIZE(B)
	movhps	%xmm7, 14 * SIZE(B)

	/* ... and re-expand it into BUFFER (BB) with each scalar          */
	/* broadcast across a 4-lane vector, matching the layout the       */
	/* GEMM inner loops consume.                                       */
#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1
	pshufd	$0xaa, %xmm2, %xmm4
	pshufd	$0xff, %xmm2, %xmm6
#else
	movaps	%xmm2, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm2, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm2, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm2, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm4,  8 * SIZE(BB)
	movaps	%xmm6, 12 * SIZE(BB)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm4
	pshufd	$0xff, %xmm3, %xmm6
#else
	movaps	%xmm3, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm3, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm3, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm3, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	/* Continue re-expanding the solved 8x2 tile into BUFFER (BB):     */
	/* broadcast each scalar of xmm5/xmm7 across a 4-lane vector.      */
	movaps	%xmm0, 16 * SIZE(BB)
	movaps	%xmm1, 20 * SIZE(BB)
	movaps	%xmm4, 24 * SIZE(BB)
	movaps	%xmm6, 28 * SIZE(BB)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm5, %xmm0
	pshufd	$0x55, %xmm5, %xmm1
	pshufd	$0xaa, %xmm5, %xmm4
	pshufd	$0xff, %xmm5, %xmm6
#else
	movaps	%xmm5, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm5, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm5, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm5, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	movaps	%xmm0, 32 * SIZE(BB)
	movaps	%xmm1, 36 * SIZE(BB)
	movaps	%xmm4, 40 * SIZE(BB)
	movaps	%xmm6, 44 * SIZE(BB)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm7, %xmm0
	pshufd	$0x55, %xmm7, %xmm1
	pshufd	$0xaa, %xmm7, %xmm4
	pshufd	$0xff, %xmm7, %xmm6
#else
	movaps	%xmm7, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm7, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm7, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm7, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	movaps	%xmm0, 48 * SIZE(BB)
	movaps	%xmm1, 52 * SIZE(BB)
	movaps	%xmm4, 56 * SIZE(BB)
	movaps	%xmm6, 60 * SIZE(BB)
#else
	/* RN/RT: solved rows go back to the packed A panel instead. */
	movaps	%xmm0,  0 * SIZE(AA)
	movaps	%xmm1,  4 * SIZE(AA)
	movaps	%xmm2,  8 * SIZE(AA)
	movaps	%xmm3, 12 * SIZE(AA)
#endif

#ifdef LN
	subl	$8 * SIZE, CO1		/* LN walks C backwards: step before store */
#endif

#if defined(LN) || defined(LT)
	/* LN/LT hold the tile pair-interleaved; de-interleave into       */
	/* column 0 (even lanes) and column 1 (odd lanes) before storing. */
	movaps	%xmm2, %xmm0
	shufps	$0x88, %xmm3, %xmm2	/* even lanes -> column 0 */
	shufps	$0xdd, %xmm3, %xmm0	/* odd lanes  -> column 1 */

	movaps	%xmm5, %xmm4
	shufps	$0x88, %xmm7, %xmm5
	shufps	$0xdd, %xmm7, %xmm4

	movlps	%xmm2, 0 * SIZE(CO1)
	movhps	%xmm2, 2 * SIZE(CO1)
	movlps	%xmm5, 4 * SIZE(CO1)
	movhps	%xmm5, 6 * SIZE(CO1)
	movlps	%xmm0, 0 * SIZE(CO1, LDC)
	movhps	%xmm0, 2 * SIZE(CO1, LDC)
	movlps	%xmm4, 4 * SIZE(CO1, LDC)
	movhps	%xmm4, 6 * SIZE(CO1, LDC)
#else
	/* RN/RT already hold whole rows: store them directly. */
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movlps	%xmm1, 4 * SIZE(CO1)
	movhps	%xmm1, 6 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC)
	movhps	%xmm2, 2 * SIZE(CO1, LDC)
	movlps	%xmm3, 4 * SIZE(CO1, LDC)
	movhps	%xmm3, 6 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$8 * SIZE, CO1		/* forward cases advance C after store */
#endif

#if defined(LT) || defined(RN)
	/* Advance AA past the (K - KK) untouched rows of this A stripe. */
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 8), AA
#ifdef LT
	addl	$16 * SIZE, B		/* consume the 8x2 tile of packed B */
#endif
#endif

#ifdef LN
	subl	$8, KK			/* processed 8 rows, moving backwards */
	movl	BORIG, B
#endif

#ifdef LT
	addl	$8, KK			/* processed 8 rows, moving forwards */
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$3 + BASE_SHIFT, %eax	/* K * 8 elements per A stripe */
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L10
	ALIGN_2

/* M & 4 remainder: same structure as the 8x2 tile above, but for a */
/* 4x2 tile (4 rows of A, 2 columns of B).                          */
.L30:
	testl	$4, M
	jle	.L50

#ifdef LN
	movl	K, %eax
	sall	$2 + BASE_SHIFT, %eax	/* K * 4 elements: step A stripe back */
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$2 + BASE_SHIFT, %eax	/* skip KK solved rows (4 wide) */
	addl	%eax, AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax	/* 2 B values per k, expanded x4 in BB */
	leal	(BB, %eax, 4), BB
#endif

	/* Preload first operands; zero the four accumulators. */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax		/* inner length = KK */
#else
	movl	K, %eax			/* inner length = K - KK */
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* unrolled by 8 iterations of k */
	je	.L32
	ALIGN_2

/* Main 4x2 GEMM loop, 8-way unrolled: xmm4/xmm5 and xmm6/xmm7 hold */
/* two partial accumulator sets that are summed at .L34.            */
.L31:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm6
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	 8 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	12 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	28 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	48 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	36 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	40 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	20 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm2
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm6
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	80 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA		/* 8 k-steps x 4 A values */
	addl	$64 * SIZE, BB		/* 8 k-steps x 8 expanded B values */
	decl	%eax
	jne	.L31
	ALIGN_2

.L32:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je	.L34

/* Remainder loop: one k-step per iteration. */
.L33:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L33
	ALIGN_4

.L34:
	/* Fold the second accumulator set into the first. */
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	/* Rewind AA/B/BB to the start of this tile's data for the solve: */
	/* LN steps back by the tile height (4), RT by the tile width (2). */
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 4), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm4, %xmm0		/* begin de-interleave of the result */
unpcklps %xmm5, %xmm4 2708 unpckhps %xmm5, %xmm0 2709 2710 movsd 0 * SIZE(B), %xmm2 2711 movhps 2 * SIZE(B), %xmm2 2712 movsd 4 * SIZE(B), %xmm3 2713 movhps 6 * SIZE(B), %xmm3 2714 2715 subps %xmm4, %xmm2 2716 subps %xmm0, %xmm3 2717#else 2718 movaps 0 * SIZE(AA), %xmm0 2719 movaps 4 * SIZE(AA), %xmm2 2720 2721 subps %xmm4, %xmm0 2722 subps %xmm5, %xmm2 2723#endif 2724 2725#if defined(LN) || defined(LT) 2726 movaps TRMASK, %xmm6 2727#endif 2728 2729#ifdef LN 2730 movss 15 * SIZE(AA), %xmm0 2731 movaps %xmm6, %xmm1 2732 shufps $0x00, %xmm0, %xmm1 2733 mulps %xmm1, %xmm3 2734 2735 movaps %xmm3, %xmm1 2736 shufps $0xee, %xmm1, %xmm1 2737 2738 movss 14 * SIZE(AA), %xmm0 2739 shufps $0x50, %xmm0, %xmm0 2740 mulps %xmm1, %xmm0 2741 subps %xmm0, %xmm3 2742 2743 movsd 12 * SIZE(AA), %xmm0 2744 shufps $0x50, %xmm0, %xmm0 2745 mulps %xmm1, %xmm0 2746 subps %xmm0, %xmm2 2747 2748 movss 10 * SIZE(AA), %xmm0 2749 shufps $0x00, %xmm6, %xmm0 2750 mulps %xmm0, %xmm3 2751 2752 movaps %xmm3, %xmm1 2753 shufps $0x44, %xmm1, %xmm1 2754 2755 movsd 8 * SIZE(AA), %xmm0 2756 shufps $0x50, %xmm0, %xmm0 2757 mulps %xmm1, %xmm0 2758 subps %xmm0, %xmm2 2759 2760 movss 5 * SIZE(AA), %xmm0 2761 movaps %xmm6, %xmm1 2762 shufps $0x00, %xmm0, %xmm1 2763 mulps %xmm1, %xmm2 2764 2765 movaps %xmm2, %xmm1 2766 shufps $0xee, %xmm1, %xmm1 2767 2768 movss 4 * SIZE(AA), %xmm0 2769 shufps $0x50, %xmm0, %xmm0 2770 mulps %xmm1, %xmm0 2771 subps %xmm0, %xmm2 2772 2773 movss 0 * SIZE(AA), %xmm0 2774 shufps $0x00, %xmm6, %xmm0 2775 mulps %xmm0, %xmm2 2776 2777#endif 2778 2779#ifdef LT 2780 movss 0 * SIZE(AA), %xmm0 2781 shufps $0x00, %xmm6, %xmm0 2782 mulps %xmm0, %xmm2 2783 2784 movaps %xmm2, %xmm1 2785 shufps $0x44, %xmm1, %xmm1 2786 2787 movss 1 * SIZE(AA), %xmm0 2788 shufps $0x05, %xmm0, %xmm0 2789 mulps %xmm1, %xmm0 2790 subps %xmm0, %xmm2 2791 2792 movsd 2 * SIZE(AA), %xmm0 2793 shufps $0x50, %xmm0, %xmm0 2794 mulps %xmm1, %xmm0 2795 subps %xmm0, %xmm3 2796 2797 movss 5 * SIZE(AA), %xmm0 2798 movaps 
%xmm6, %xmm1 2799 shufps $0x00, %xmm0, %xmm1 2800 mulps %xmm1, %xmm2 2801 2802 movaps %xmm2, %xmm1 2803 shufps $0xee, %xmm1, %xmm1 2804 2805 movsd 6 * SIZE(AA), %xmm0 2806 shufps $0x50, %xmm0, %xmm0 2807 mulps %xmm1, %xmm0 2808 subps %xmm0, %xmm3 2809 2810 movss 10 * SIZE(AA), %xmm0 2811 shufps $0x00, %xmm6, %xmm0 2812 mulps %xmm0, %xmm3 2813 2814 movaps %xmm3, %xmm1 2815 shufps $0x44, %xmm1, %xmm1 2816 2817 movss 11 * SIZE(AA), %xmm0 2818 shufps $0x05, %xmm0, %xmm0 2819 mulps %xmm1, %xmm0 2820 subps %xmm0, %xmm3 2821 2822 movss 15 * SIZE(AA), %xmm0 2823 movaps %xmm6, %xmm1 2824 shufps $0x00, %xmm0, %xmm1 2825 mulps %xmm1, %xmm3 2826#endif 2827 2828#ifdef RN 2829 movss 0 * SIZE(B), %xmm6 2830 shufps $0x00, %xmm6, %xmm6 2831 2832 mulps %xmm6, %xmm0 2833 2834 movss 1 * SIZE(B), %xmm6 2835 shufps $0x00, %xmm6, %xmm6 2836 movaps %xmm6, %xmm5 2837 2838 mulps %xmm0, %xmm5 2839 subps %xmm5, %xmm2 2840 2841 movss 3 * SIZE(B), %xmm6 2842 shufps $0x00, %xmm6, %xmm6 2843 2844 mulps %xmm6, %xmm2 2845#endif 2846 2847#ifdef RT 2848 movss 3 * SIZE(B), %xmm6 2849 shufps $0x00, %xmm6, %xmm6 2850 2851 mulps %xmm6, %xmm2 2852 2853 movss 2 * SIZE(B), %xmm6 2854 shufps $0x00, %xmm6, %xmm6 2855 movaps %xmm6, %xmm5 2856 2857 mulps %xmm2, %xmm5 2858 2859 subps %xmm5, %xmm0 2860 2861 movss 0 * SIZE(B), %xmm6 2862 shufps $0x00, %xmm6, %xmm6 2863 2864 mulps %xmm6, %xmm0 2865#endif 2866 2867#if defined(LN) || defined(LT) 2868 movlps %xmm2, 0 * SIZE(B) 2869 movhps %xmm2, 2 * SIZE(B) 2870 movlps %xmm3, 4 * SIZE(B) 2871 movhps %xmm3, 6 * SIZE(B) 2872 2873#ifdef HAVE_SSE2 2874 pshufd $0x00, %xmm2, %xmm0 2875 pshufd $0x55, %xmm2, %xmm1 2876 pshufd $0xaa, %xmm2, %xmm4 2877 pshufd $0xff, %xmm2, %xmm6 2878#else 2879 movaps %xmm2, %xmm0 2880 shufps $0x00, %xmm0, %xmm0 2881 movaps %xmm2, %xmm1 2882 shufps $0x55, %xmm1, %xmm1 2883 movaps %xmm2, %xmm4 2884 shufps $0xaa, %xmm4, %xmm4 2885 movaps %xmm2, %xmm6 2886 shufps $0xff, %xmm6, %xmm6 2887#endif 2888 2889 movaps %xmm0, 0 * SIZE(BB) 2890 movaps %xmm1, 
4 * SIZE(BB) 2891 movaps %xmm4, 8 * SIZE(BB) 2892 movaps %xmm6, 12 * SIZE(BB) 2893 2894#ifdef HAVE_SSE2 2895 pshufd $0x00, %xmm3, %xmm0 2896 pshufd $0x55, %xmm3, %xmm1 2897 pshufd $0xaa, %xmm3, %xmm4 2898 pshufd $0xff, %xmm3, %xmm6 2899#else 2900 movaps %xmm3, %xmm0 2901 shufps $0x00, %xmm0, %xmm0 2902 movaps %xmm3, %xmm1 2903 shufps $0x55, %xmm1, %xmm1 2904 movaps %xmm3, %xmm4 2905 shufps $0xaa, %xmm4, %xmm4 2906 movaps %xmm3, %xmm6 2907 shufps $0xff, %xmm6, %xmm6 2908#endif 2909 2910 movaps %xmm0, 16 * SIZE(BB) 2911 movaps %xmm1, 20 * SIZE(BB) 2912 movaps %xmm4, 24 * SIZE(BB) 2913 movaps %xmm6, 28 * SIZE(BB) 2914#else 2915 movaps %xmm0, 0 * SIZE(AA) 2916 movaps %xmm2, 4 * SIZE(AA) 2917#endif 2918 2919#ifdef LN 2920 subl $4 * SIZE, CO1 2921#endif 2922 2923#if defined(LN) || defined(LT) 2924 movaps %xmm2, %xmm0 2925 shufps $0x88, %xmm3, %xmm2 2926 shufps $0xdd, %xmm3, %xmm0 2927 2928 movlps %xmm2, 0 * SIZE(CO1) 2929 movhps %xmm2, 2 * SIZE(CO1) 2930 movlps %xmm0, 0 * SIZE(CO1, LDC) 2931 movhps %xmm0, 2 * SIZE(CO1, LDC) 2932#else 2933 movlps %xmm0, 0 * SIZE(CO1) 2934 movhps %xmm0, 2 * SIZE(CO1) 2935 movlps %xmm2, 0 * SIZE(CO1, LDC) 2936 movhps %xmm2, 2 * SIZE(CO1, LDC) 2937#endif 2938 2939#ifndef LN 2940 addl $4 * SIZE, CO1 2941#endif 2942 2943#if defined(LT) || defined(RN) 2944 movl K, %eax 2945 subl KK, %eax 2946 leal (,%eax, SIZE), %eax 2947 leal (AA, %eax, 4), AA 2948#ifdef LT 2949 addl $8 * SIZE, B 2950#endif 2951#endif 2952 2953#ifdef LN 2954 subl $4, KK 2955 movl BORIG, B 2956#endif 2957 2958#ifdef LT 2959 addl $4, KK 2960#endif 2961 2962#ifdef RT 2963 movl K, %eax 2964 movl BORIG, B 2965 sall $2 + BASE_SHIFT, %eax 2966 addl %eax, AORIG 2967#endif 2968 ALIGN_2 2969 2970.L50: 2971 testl $2, M 2972 jle .L70 2973 2974#ifdef LN 2975 movl K, %eax 2976 sall $1 + BASE_SHIFT, %eax 2977 subl %eax, AORIG 2978#endif 2979 2980#if defined(LN) || defined(RT) 2981 movl KK, %eax 2982 movl AORIG, AA 2983 sall $1 + BASE_SHIFT, %eax 2984 addl %eax, AA 2985#endif 2986 2987 leal 
BUFFER, BB			# BB = expanded-B buffer (tail of "leal BUFFER, BB" split at chunk edge)

#if defined(LN) || defined(RT)
	# Backward sweeps start at offset KK: advance BB by KK * 2 (nr) * 4 (expansion) floats.
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	# Prime the pipeline: first A/B vectors loaded ahead of the loop,
	# xmm4..xmm7 are the four dot-product accumulators, cleared here.
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

	# Trip count of the K loop: forward variants run KK iterations,
	# backward variants run the remaining K - KK.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# main loop is unrolled 8x in K
	je	.L52
	ALIGN_2

.L51:
	# 8x-unrolled GEMM core for the 2-row sub-panel.  Each K step:
	# xmm0/xmm1 hold 2 A values (movsd = low 64 bits); BB holds B values
	# pre-splatted to 4-wide vectors, so one mulps/addps pair per column.
	# Loads are hoisted ahead of their use (software pipelining) —
	# statement order is load-latency tuned, do not reorder.
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	40 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm6
	movaps	44 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	64 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	52 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm6
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	80 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA		# 8 K-steps * 2 A values
	addl	$64 * SIZE, BB		# 8 K-steps * 8 buffered B values
	decl	%eax
	jne	.L51
	ALIGN_2

.L52:
	# Handle K % 8 leftover iterations one at a time.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L54

.L53:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L53
	ALIGN_4

.L54:
	# Fold the four partial accumulators down to two (one per column).
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	# Rewind AA/B/BB to the start of the 2x2 diagonal block being solved.
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B, %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif

	# Load the current right-hand side and subtract the accumulated
	# product: left-side variants keep it interleaved in xmm2, right-side
	# variants keep one register per row.
#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm4

	movsd	 0 * SIZE(B), %xmm2
	movhps	 2 * SIZE(B), %xmm2

	subps	%xmm4, %xmm2
#else
#ifdef movsd
	xorps	%xmm0, %xmm0	# movsd is movlps here (see top of file): clear upper half first
#endif
	movsd	 0 * SIZE(AA), %xmm0
#ifdef movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	 2 * SIZE(AA), %xmm2

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm2
#endif

#if defined(LN) || defined(LT)
	movaps	TRMASK, %xmm6	# mask constant kept at the bottom of the local stack frame
#endif

	# 2x2 triangular solve with the (inverted) diagonal; each variant
	# multiplies by a diagonal element, eliminates the off-diagonal term,
	# then scales by the other diagonal element.
#ifdef LN
	movss	 3 * SIZE(AA), %xmm0
	movaps	%xmm6, %xmm1
	shufps	$0x00, %xmm0, %xmm1
	mulps	%xmm1, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movss	 2 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm2

#endif

#ifdef LT
	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movss	 1 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	 3 * SIZE(AA), %xmm0
	movaps	%xmm6, %xmm1
	shufps	$0x00, %xmm0, %xmm1
	mulps	%xmm1, %xmm2
#endif

#ifdef RN
	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0

	movss	 1 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm0, %xmm5
	subps	%xmm5, %xmm2

	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2
#endif

#ifdef RT
	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2

	movss	 2 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm2, %xmm5

	subps	%xmm5, %xmm0

	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0
#endif

	# Write the solved block back to the packed panel (B and the expanded
	# BB buffer for left variants, A for right variants).
#if defined(LN) || defined(LT)
	movlps	%xmm2, 0 * SIZE(B)
	movhps	%xmm2, 2 * SIZE(B)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm2, %xmm0	# broadcast each solved element to a full vector
	pshufd	$0x55, %xmm2, %xmm1
	pshufd	$0xaa, %xmm2, %xmm4
	pshufd	$0xff, %xmm2, %xmm6
#else
	movaps	%xmm2, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm2, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm2, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm2, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif

	movaps	%xmm0, 0 * SIZE(BB)
	movaps	%xmm1, 4 * SIZE(BB)
	movaps	%xmm4, 8 * SIZE(BB)
	movaps	%xmm6, 12 * SIZE(BB)
#else
	movlps	%xmm0, 0 * SIZE(AA)
	movlps	%xmm2, 2 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1		# LN walks C right-to-left
#endif

	# Store the 2x2 result into C (two rows, two columns via LDC).
#if defined(LN) || defined(LT)
	movaps	%xmm2, %xmm0
	shufps	$0x88, %xmm3, %xmm2	# de-interleave columns for the store
	shufps	$0xdd, %xmm3, %xmm0

	movlps	%xmm2, 0 * SIZE(CO1)
	movlps	%xmm0, 0 * SIZE(CO1, LDC)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

	# Advance AA past this sub-panel and bump KK/AORIG for the next one.
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_2

.L70:
	# Leftover single row (M odd): same structure as above with scalar ops.
	testl	$1, M
	jle	.L99

#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$BASE_SHIFT, %eax
	addl	%eax, AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	movss	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movss	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L72
	ALIGN_2

.L71:
	# 8x-unrolled scalar (1-row) multiply-accumulate loop.
	mulss	%xmm0, %xmm2
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	 2 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	20 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm4
	movss	24 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm5
	# Continuation of the 8x-unrolled scalar K loop started at .L71:
	# xmm0/xmm1 alternate as the A operand, xmm2/xmm3 as the B operand,
	# with the next loads interleaved ahead of use.
	movss	 3 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	28 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm2
	mulss	36 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm4
	movss	40 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm5
	movss	 5 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm2
	mulss	44 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6
	movss	64 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm7
	movss	 6 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	52 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm4
	movss	56 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	 7 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	60 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6
	movss	80 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA		# 8 K-steps * 1 A value
	addl	$64 * SIZE, BB		# 8 K-steps * 8 buffered B values
	decl	%eax
	jne	.L71
	ALIGN_2

.L72:
	# K % 8 remainder, one step per iteration.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L74

.L73:
	mulss	%xmm0, %xmm2
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0

	addl	$1 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L73
	ALIGN_4

.L74:
	# Fold partial accumulators: xmm4/xmm5 = dot products for the 2 columns.
	addss	%xmm6, %xmm4
	addss	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	# Rewind AA/B/BB to the diagonal block (1 row here, hence subl $1 for LN).
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B, %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif

	# Load the right-hand side and subtract the accumulated product.
#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm4

#ifdef movsd
	xorps	%xmm2, %xmm2	# movsd is movlps here (see top of file): clear upper half first
#endif
	movsd	 0 * SIZE(B), %xmm2

	subps	%xmm4, %xmm2
#else
	movss	 0 * SIZE(AA), %xmm0
	movss	 1 * SIZE(AA), %xmm2

	subss	%xmm4, %xmm0
	subss	%xmm5, %xmm2
#endif

#if defined(LN) || defined(LT)
	movaps	TRMASK, %xmm6	# mask constant kept at the bottom of the local stack frame
#endif

	# 1-row solve: left variants only scale by the single diagonal element;
	# right variants do the full 2x2 back/forward substitution in B.
#if defined(LN) || defined(LT)
	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6, %xmm0
	mulps	%xmm0, %xmm2
#endif

#ifdef RN
	movss	 0 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm0

	movss	 1 * SIZE(B), %xmm6
	movaps	%xmm6, %xmm5

	mulss	%xmm0, %xmm5
	subss	%xmm5, %xmm2

	movss	 3 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm2
#endif

#ifdef RT
	movss	 3 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm2

	movss	 2 * SIZE(B), %xmm6
	movaps	%xmm6, %xmm5

	mulss	%xmm2, %xmm5
	subss	%xmm5, %xmm0

	movss	 0 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm0
#endif

	# Write the solved values back to the packed panels.
#if defined(LN) || defined(LT)
#ifdef movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	%xmm2, 0 * SIZE(B)

	movaps	%xmm2, %xmm0
	shufps	$0x00, %xmm0, %xmm0	# broadcast each element for the BB buffer
	movaps	%xmm2, %xmm1
	shufps	$0x55, %xmm1, %xmm1

	movaps	%xmm0, 0 * SIZE(BB)
	movaps	%xmm1, 4 * SIZE(BB)
#else
	movss	%xmm0, 0 * SIZE(AA)
	movss	%xmm2, 1 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1		# LN walks C right-to-left
#endif

	# Store the 1x2 result into C (one element per column).
#if defined(LN) || defined(LT)
	movaps	%xmm2, %xmm0
	shufps	$0x88, %xmm3, %xmm2
	shufps	$0xdd, %xmm3, %xmm0

	movss	%xmm2, 0 * SIZE(CO1)
	movss	%xmm0, 0 * SIZE(CO1, LDC)
#else
	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm2, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

	# Advance AA past this row and bump KK for the next sub-panel.
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
movl BORIG, B 3567 sall $BASE_SHIFT, %eax 3568 addl %eax, AORIG 3569#endif 3570 ALIGN_2 3571 3572.L99: 3573#ifdef LN 3574 movl K, %eax 3575 leal (, %eax, SIZE), %eax 3576 leal (B, %eax, 2), B 3577#endif 3578 3579#if defined(LT) || defined(RN) 3580 movl K, %eax 3581 subl KK, %eax 3582 leal (,%eax, SIZE), %eax 3583 leal (B, %eax, 2), B 3584#endif 3585 3586#ifdef RN 3587 addl $2, KK 3588#endif 3589 3590#ifdef RT 3591 subl $2, KK 3592#endif 3593 3594 decl J # j -- 3595 jg .L01 3596 ALIGN_2 3597 3598.L999: 3599 movl OLD_STACK, %esp 3600 3601 popl %ebx 3602 popl %esi 3603 popl %edi 3604 popl %ebp 3605 ret 3606 3607 EPILOGUE 3608