1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2009. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#define STACK 16 26#define ARGS 16 27 28#define M 4 + STACK + ARGS(%esp) 29#define N 8 + STACK + ARGS(%esp) 30#define K 12 + STACK + ARGS(%esp) 31#define A 24 + STACK + ARGS(%esp) 32#define ARG_B 28 + STACK + ARGS(%esp) 33#define C 32 + STACK + ARGS(%esp) 34#define ARG_LDC 36 + STACK + ARGS(%esp) 35#define OFFSET 40 + STACK + ARGS(%esp) 36 37#define J 0 + STACK(%esp) 38#define KK 4 + STACK(%esp) 39#define KKK 8 + STACK(%esp) 40#define AORIG 12 + STACK(%esp) 41 42#if defined(PENRYN) || defined(DUNNINGTON) 43#define PREFETCH prefetcht1 44#define PREFETCHSIZE 84 45#endif 46 47#ifdef NEHALEM 48#define PREFETCH prefetcht1 49#define PREFETCHSIZE 84 50#endif 51 52#ifdef ATOM 53#define PREFETCH prefetcht0 54#define PREFETCHSIZE 84 55#endif 56 57#ifdef NANO 58#define PREFETCH prefetcht0 59#define PREFETCHSIZE (16 * 2) 60#endif 61 62#define B %edi 63#define LDC %ebp 64#define AA %edx 65#define BB %ecx 66#define CO1 %esi 67 68#define ADD1 addps 69#define ADD2 addps 70 71 PROLOGUE 72 73 subl $ARGS, %esp 74 75 pushl %ebp 76 pushl %edi 77 pushl %esi 78 pushl %ebx 79 80 PROFCODE 81 82 movl ARG_B, B 83 movl ARG_LDC, LDC 84 movl OFFSET, %eax 85#ifdef RN 86 negl %eax 87#endif 88 movl %eax, KK 89 90 movl M, %ebx 91 testl %ebx, %ebx 92 jle .L999 93 94 subl $-32 * SIZE, A 95 subl $-32 * SIZE, B 96 97 sall $ZBASE_SHIFT, LDC 98 99#ifdef LN 100 movl M, %eax 101 sall $ZBASE_SHIFT, %eax 102 addl %eax, C 103 imull K, %eax 104 addl %eax, A 105#endif 106 107#ifdef RT 108 movl N, %eax 109 sall $ZBASE_SHIFT, %eax 110 imull K, %eax 111 addl %eax, B 112 113 movl N, %eax 114 imull LDC, %eax 115 addl %eax, C 116#endif 117 118#ifdef RN 119 negl KK 120#endif 121 122#ifdef RT 123 movl N, %eax 124 subl OFFSET, %eax 125 movl %eax, KK 126#endif 127 128 movl N, %eax 129 movl %eax, J 130 sarl $1, J 131 jle .L100 132 ALIGN_4 133 134.L01: 135#if defined(LT) || defined(RN) 136 movl A, 
%eax 137 movl %eax, AA 138#else 139 movl A, %eax 140 movl %eax, AORIG 141#endif 142 143#ifdef RT 144 movl K, %eax 145 sall $1 + ZBASE_SHIFT, %eax 146 subl %eax, B 147#endif 148 149 leal (, LDC, 2), %eax 150 151#ifdef RT 152 subl %eax, C 153#endif 154 movl C, CO1 155#ifndef RT 156 addl %eax, C 157#endif 158 159#ifdef LN 160 movl OFFSET, %eax 161 addl M, %eax 162 movl %eax, KK 163#endif 164 165#ifdef LT 166 movl OFFSET, %eax 167 movl %eax, KK 168#endif 169 170 movl M, %ebx 171 sarl $1, %ebx 172 jle .L30 173 ALIGN_4 174 175.L10: 176#ifdef LN 177 movl K, %eax 178 sall $1 + ZBASE_SHIFT, %eax 179 subl %eax, AORIG 180#endif 181 182#if defined(LN) || defined(RT) 183 movl KK, %eax 184 movl AORIG, AA 185 sall $1 + ZBASE_SHIFT, %eax 186 addl %eax, AA 187#endif 188 189 movl B, BB 190 191#if defined(LN) || defined(RT) 192 movl KK, %eax 193 sall $1 + ZBASE_SHIFT, %eax 194 addl %eax, BB 195#endif 196 197 movaps -32 * SIZE(AA), %xmm0 198 pxor %xmm2, %xmm2 199 movaps -32 * SIZE(BB), %xmm1 200 pxor %xmm3, %xmm3 201 202#ifdef LN 203 pxor %xmm4, %xmm4 204 prefetcht0 -4 * SIZE(CO1) 205 pxor %xmm5, %xmm5 206 prefetcht0 -4 * SIZE(CO1, LDC) 207 pxor %xmm6, %xmm6 208 pxor %xmm7, %xmm7 209#else 210 pxor %xmm4, %xmm4 211 prefetcht0 3 * SIZE(CO1) 212 pxor %xmm5, %xmm5 213 prefetcht0 3 * SIZE(CO1, LDC) 214 pxor %xmm6, %xmm6 215 pxor %xmm7, %xmm7 216#endif 217 218#if defined(LT) || defined(RN) 219 movl KK, %eax 220#else 221 movl K, %eax 222 subl KK, %eax 223#endif 224 sarl $3, %eax 225 je .L15 226 ALIGN_4 227 228.L11: 229 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 230 231 ADD2 %xmm2, %xmm7 232 pshufd $0xb1, %xmm1, %xmm2 233 mulps %xmm0, %xmm1 234 ADD1 %xmm3, %xmm6 235 pshufd $0x1b, %xmm2, %xmm3 236 mulps %xmm0, %xmm2 237 238 ADD2 %xmm2, %xmm5 239 pshufd $0xb1, %xmm3, %xmm2 240 mulps %xmm0, %xmm3 241 ADD1 %xmm1, %xmm4 242 movaps -28 * SIZE(BB), %xmm1 243 mulps %xmm0, %xmm2 244 movaps -28 * SIZE(AA), %xmm0 245 246 ADD2 %xmm2, %xmm7 247 pshufd $0xb1, %xmm1, %xmm2 248 mulps %xmm0, %xmm1 249 ADD1 %xmm3, 
%xmm6 250 pshufd $0x1b, %xmm2, %xmm3 251 mulps %xmm0, %xmm2 252 253 ADD2 %xmm2, %xmm5 254 pshufd $0xb1, %xmm3, %xmm2 255 mulps %xmm0, %xmm3 256 ADD1 %xmm1, %xmm4 257 movaps -24 * SIZE(BB), %xmm1 258 mulps %xmm0, %xmm2 259 movaps -24 * SIZE(AA), %xmm0 260 261 ADD2 %xmm2, %xmm7 262 pshufd $0xb1, %xmm1, %xmm2 263 mulps %xmm0, %xmm1 264 ADD1 %xmm3, %xmm6 265 pshufd $0x1b, %xmm2, %xmm3 266 mulps %xmm0, %xmm2 267 268 ADD2 %xmm2, %xmm5 269 pshufd $0xb1, %xmm3, %xmm2 270 mulps %xmm0, %xmm3 271 ADD1 %xmm1, %xmm4 272 movaps -20 * SIZE(BB), %xmm1 273 mulps %xmm0, %xmm2 274 movaps -20 * SIZE(AA), %xmm0 275 276 ADD2 %xmm2, %xmm7 277 pshufd $0xb1, %xmm1, %xmm2 278 mulps %xmm0, %xmm1 279 ADD1 %xmm3, %xmm6 280 pshufd $0x1b, %xmm2, %xmm3 281 mulps %xmm0, %xmm2 282 283 ADD2 %xmm2, %xmm5 284 pshufd $0xb1, %xmm3, %xmm2 285 mulps %xmm0, %xmm3 286 ADD1 %xmm1, %xmm4 287 movaps -16 * SIZE(BB), %xmm1 288 mulps %xmm0, %xmm2 289 movaps -16 * SIZE(AA), %xmm0 290 291 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) 292 293 ADD2 %xmm2, %xmm7 294 pshufd $0xb1, %xmm1, %xmm2 295 mulps %xmm0, %xmm1 296 ADD1 %xmm3, %xmm6 297 pshufd $0x1b, %xmm2, %xmm3 298 mulps %xmm0, %xmm2 299 300 ADD2 %xmm2, %xmm5 301 pshufd $0xb1, %xmm3, %xmm2 302 mulps %xmm0, %xmm3 303 ADD1 %xmm1, %xmm4 304 movaps -12 * SIZE(BB), %xmm1 305 mulps %xmm0, %xmm2 306 movaps -12 * SIZE(AA), %xmm0 307 308 ADD2 %xmm2, %xmm7 309 pshufd $0xb1, %xmm1, %xmm2 310 mulps %xmm0, %xmm1 311 ADD1 %xmm3, %xmm6 312 pshufd $0x1b, %xmm2, %xmm3 313 mulps %xmm0, %xmm2 314 315 ADD2 %xmm2, %xmm5 316 pshufd $0xb1, %xmm3, %xmm2 317 mulps %xmm0, %xmm3 318 ADD1 %xmm1, %xmm4 319 movaps -8 * SIZE(BB), %xmm1 320 mulps %xmm0, %xmm2 321 movaps -8 * SIZE(AA), %xmm0 322 323 ADD2 %xmm2, %xmm7 324 pshufd $0xb1, %xmm1, %xmm2 325 mulps %xmm0, %xmm1 326 ADD1 %xmm3, %xmm6 327 pshufd $0x1b, %xmm2, %xmm3 328 mulps %xmm0, %xmm2 329 330 ADD2 %xmm2, %xmm5 331 pshufd $0xb1, %xmm3, %xmm2 332 mulps %xmm0, %xmm3 333 ADD1 %xmm1, %xmm4 334 movaps -4 * SIZE(BB), %xmm1 335 mulps %xmm0, %xmm2 
336 movaps -4 * SIZE(AA), %xmm0 337 338 ADD2 %xmm2, %xmm7 339 subl $-32 * SIZE, BB 340 pshufd $0xb1, %xmm1, %xmm2 341 mulps %xmm0, %xmm1 342 ADD1 %xmm3, %xmm6 343 pshufd $0x1b, %xmm2, %xmm3 344 mulps %xmm0, %xmm2 345 346 ADD2 %xmm2, %xmm5 347 subl $-32 * SIZE, AA 348 pshufd $0xb1, %xmm3, %xmm2 349 mulps %xmm0, %xmm3 350 ADD1 %xmm1, %xmm4 351 movaps -32 * SIZE(BB), %xmm1 352 mulps %xmm0, %xmm2 353 movaps -32 * SIZE(AA), %xmm0 354 355 decl %eax 356 jne .L11 357 ALIGN_4 358 359.L15: 360#if defined(LT) || defined(RN) 361 movl KK, %eax 362#else 363 movl K, %eax 364 subl KK, %eax 365#endif 366 andl $7, %eax # if (k & 1) 367 BRANCH 368 je .L14 369 ALIGN_4 370 371.L13: 372 ADD2 %xmm2, %xmm7 373 pshufd $0xb1, %xmm1, %xmm2 374 mulps %xmm0, %xmm1 375 ADD1 %xmm3, %xmm6 376 pshufd $0x1b, %xmm2, %xmm3 377 mulps %xmm0, %xmm2 378 379 ADD2 %xmm2, %xmm5 380 pshufd $0xb1, %xmm3, %xmm2 381 mulps %xmm0, %xmm3 382 ADD1 %xmm1, %xmm4 383 movaps -28 * SIZE(BB), %xmm1 384 mulps %xmm0, %xmm2 385 movaps -28 * SIZE(AA), %xmm0 386 387 addl $4 * SIZE, AA 388 addl $4 * SIZE, BB 389 decl %eax 390 jg .L13 391 ALIGN_4 392 393.L14: 394#if defined(LN) || defined(RT) 395 movl KK, %eax 396#ifdef LN 397 subl $2, %eax 398#else 399 subl $2, %eax 400#endif 401 402 movl AORIG, AA 403 sall $ZBASE_SHIFT, %eax 404 leal (AA, %eax, 2), AA 405 leal (B, %eax, 2), BB 406#endif 407 408 ADD2 %xmm2, %xmm7 409 pcmpeqb %xmm0, %xmm0 410 ADD1 %xmm3, %xmm6 411 psllq $63, %xmm0 412 413#ifndef CONJ 414 pxor %xmm0, %xmm4 415 pxor %xmm0, %xmm6 416 417 shufps $0xb1, %xmm0, %xmm0 418#else 419#if defined(LN) || defined(LT) 420 pxor %xmm0, %xmm5 421 pxor %xmm0, %xmm7 422#else 423 pshufd $0xb1, %xmm0, %xmm1 424 425 pxor %xmm1, %xmm5 426 pxor %xmm1, %xmm7 427#endif 428#endif 429 430 haddps %xmm5, %xmm4 431 haddps %xmm7, %xmm6 432 433 shufps $0xd8, %xmm4, %xmm4 434 shufps $0xd8, %xmm6, %xmm6 435 436 movaps %xmm4, %xmm5 437 shufps $0xe4, %xmm6, %xmm4 438 shufps $0xe4, %xmm5, %xmm6 439 440#if defined(LN) || defined(LT) 441 movaps %xmm4, 
%xmm5 442 unpcklpd %xmm6, %xmm4 443 unpckhpd %xmm6, %xmm5 444 445 movaps -32 * SIZE(BB), %xmm2 446 movaps -28 * SIZE(BB), %xmm3 447 448 subps %xmm4, %xmm2 449 subps %xmm5, %xmm3 450#else 451 movaps -32 * SIZE(AA), %xmm1 452 movaps -28 * SIZE(AA), %xmm5 453 454 subps %xmm4, %xmm1 455 subps %xmm6, %xmm5 456#endif 457 458#ifdef LN 459 movaps -28 * SIZE(AA), %xmm5 460 461 pshufd $0xee, %xmm5, %xmm6 462 pshufd $0xbb, %xmm5, %xmm7 463 464 pshufd $0xa0, %xmm3, %xmm4 465 pshufd $0xf5, %xmm3, %xmm3 466 467#ifndef CONJ 468 xorps %xmm0, %xmm3 469#else 470 xorps %xmm0, %xmm4 471#endif 472 473 mulps %xmm6, %xmm4 474 mulps %xmm7, %xmm3 475 addps %xmm4, %xmm3 476 477 pshufd $0x44, %xmm5, %xmm6 478 pshufd $0x11, %xmm5, %xmm7 479 480 pshufd $0xa0, %xmm3, %xmm4 481 pshufd $0xf5, %xmm3, %xmm1 482 483#ifndef CONJ 484 xorps %xmm0, %xmm1 485#else 486 xorps %xmm0, %xmm4 487#endif 488 489 mulps %xmm6, %xmm4 490 mulps %xmm7, %xmm1 491 subps %xmm4, %xmm2 492 subps %xmm1, %xmm2 493 494 movaps -32 * SIZE(AA), %xmm5 495 496 pshufd $0x44, %xmm5, %xmm6 497 pshufd $0x11, %xmm5, %xmm7 498 499 pshufd $0xa0, %xmm2, %xmm4 500 pshufd $0xf5, %xmm2, %xmm2 501 502#ifndef CONJ 503 xorps %xmm0, %xmm2 504#else 505 xorps %xmm0, %xmm4 506#endif 507 508 mulps %xmm6, %xmm4 509 mulps %xmm7, %xmm2 510 addps %xmm4, %xmm2 511#endif 512 513#ifdef LT 514 movaps -32 * SIZE(AA), %xmm5 515 516 pshufd $0x44, %xmm5, %xmm6 517 pshufd $0x11, %xmm5, %xmm7 518 519 pshufd $0xa0, %xmm2, %xmm4 520 pshufd $0xf5, %xmm2, %xmm2 521 522#ifndef CONJ 523 xorps %xmm0, %xmm2 524#else 525 xorps %xmm0, %xmm4 526#endif 527 528 mulps %xmm6, %xmm4 529 mulps %xmm7, %xmm2 530 addps %xmm4, %xmm2 531 532 pshufd $0xee, %xmm5, %xmm6 533 pshufd $0xbb, %xmm5, %xmm7 534 535 pshufd $0xa0, %xmm2, %xmm4 536 pshufd $0xf5, %xmm2, %xmm1 537 538#ifndef CONJ 539 xorps %xmm0, %xmm1 540#else 541 xorps %xmm0, %xmm4 542#endif 543 544 mulps %xmm6, %xmm4 545 mulps %xmm7, %xmm1 546 subps %xmm4, %xmm3 547 subps %xmm1, %xmm3 548 549 movaps -28 * SIZE(AA), %xmm5 550 
551 pshufd $0xee, %xmm5, %xmm6 552 pshufd $0xbb, %xmm5, %xmm7 553 554 pshufd $0xa0, %xmm3, %xmm4 555 pshufd $0xf5, %xmm3, %xmm3 556 557#ifndef CONJ 558 xorps %xmm0, %xmm3 559#else 560 xorps %xmm0, %xmm4 561#endif 562 563 mulps %xmm6, %xmm4 564 mulps %xmm7, %xmm3 565 addps %xmm4, %xmm3 566#endif 567 568#ifdef RN 569 movaps -32 * SIZE(BB), %xmm4 570 571 pshufd $0x44, %xmm4, %xmm6 572 pshufd $0x11, %xmm4, %xmm7 573 574 pshufd $0xa0, %xmm1, %xmm3 575 pshufd $0xf5, %xmm1, %xmm1 576 577#ifndef CONJ 578 xorps %xmm0, %xmm1 579#else 580 xorps %xmm0, %xmm3 581#endif 582 583 mulps %xmm6, %xmm3 584 mulps %xmm7, %xmm1 585 586 addps %xmm3, %xmm1 587 588 pshufd $0xee, %xmm4, %xmm6 589 pshufd $0xbb, %xmm4, %xmm7 590 591 pshufd $0xa0, %xmm1, %xmm3 592 pshufd $0xf5, %xmm1, %xmm2 593 594#ifndef CONJ 595 xorps %xmm0, %xmm2 596#else 597 xorps %xmm0, %xmm3 598#endif 599 600 mulps %xmm6, %xmm3 601 mulps %xmm7, %xmm2 602 603 subps %xmm3, %xmm5 604 subps %xmm2, %xmm5 605 606 movaps -28 * SIZE(BB), %xmm4 607 608 pshufd $0xee, %xmm4, %xmm6 609 pshufd $0xbb, %xmm4, %xmm7 610 611 pshufd $0xa0, %xmm5, %xmm3 612 pshufd $0xf5, %xmm5, %xmm5 613 614#ifndef CONJ 615 xorps %xmm0, %xmm5 616#else 617 xorps %xmm0, %xmm3 618#endif 619 620 mulps %xmm6, %xmm3 621 mulps %xmm7, %xmm5 622 623 addps %xmm3, %xmm5 624#endif 625 626#ifdef RT 627 movaps -28 * SIZE(BB), %xmm4 628 629 pshufd $0xee, %xmm4, %xmm6 630 pshufd $0xbb, %xmm4, %xmm7 631 632 pshufd $0xa0, %xmm5, %xmm3 633 pshufd $0xf5, %xmm5, %xmm5 634 635#ifndef CONJ 636 xorps %xmm0, %xmm5 637#else 638 xorps %xmm0, %xmm3 639#endif 640 641 mulps %xmm6, %xmm3 642 mulps %xmm7, %xmm5 643 644 addps %xmm3, %xmm5 645 646 pshufd $0x44, %xmm4, %xmm6 647 pshufd $0x11, %xmm4, %xmm7 648 649 pshufd $0xa0, %xmm5, %xmm3 650 pshufd $0xf5, %xmm5, %xmm2 651 652#ifndef CONJ 653 xorps %xmm0, %xmm2 654#else 655 xorps %xmm0, %xmm3 656#endif 657 658 mulps %xmm6, %xmm3 659 mulps %xmm7, %xmm2 660 661 subps %xmm3, %xmm1 662 subps %xmm2, %xmm1 663 664 movaps -32 * SIZE(BB), %xmm4 665 
666 pshufd $0x44, %xmm4, %xmm6 667 pshufd $0x11, %xmm4, %xmm7 668 669 pshufd $0xa0, %xmm1, %xmm3 670 pshufd $0xf5, %xmm1, %xmm1 671 672#ifndef CONJ 673 xorps %xmm0, %xmm1 674#else 675 xorps %xmm0, %xmm3 676#endif 677 678 mulps %xmm6, %xmm3 679 mulps %xmm7, %xmm1 680 681 addps %xmm3, %xmm1 682#endif 683 684#ifdef LN 685 subl $4 * SIZE, CO1 686#endif 687 688#if defined(LN) || defined(LT) 689 movaps %xmm2, -32 * SIZE(BB) 690 movaps %xmm3, -28 * SIZE(BB) 691 692 movlps %xmm2, 0 * SIZE(CO1) 693 movlps %xmm3, 2 * SIZE(CO1) 694 movhps %xmm2, 0 * SIZE(CO1, LDC) 695 movhps %xmm3, 2 * SIZE(CO1, LDC) 696#else 697 movaps %xmm1, -32 * SIZE(AA) 698 movaps %xmm5, -28 * SIZE(AA) 699 700 movlps %xmm1, 0 * SIZE(CO1) 701 movhps %xmm1, 2 * SIZE(CO1) 702 703 movlps %xmm5, 0 * SIZE(CO1, LDC) 704 movhps %xmm5, 2 * SIZE(CO1, LDC) 705#endif 706 707#ifndef LN 708 addl $4 * SIZE, CO1 709#endif 710 711#if defined(LT) || defined(RN) 712 movl K, %eax 713 subl KK, %eax 714 sall $ZBASE_SHIFT, %eax 715 leal (AA, %eax, 2), AA 716 leal (BB, %eax, 2), BB 717#endif 718 719#ifdef LN 720 subl $2, KK 721#endif 722 723#ifdef LT 724 addl $2, KK 725#endif 726 727#ifdef RT 728 movl K, %eax 729 sall $1 + ZBASE_SHIFT, %eax 730 addl %eax, AORIG 731#endif 732 733 decl %ebx 734 jg .L10 735 ALIGN_4 736 737.L30: 738 movl M, %ebx 739 andl $1, %ebx 740 jle .L99 741 742#ifdef LN 743 movl K, %eax 744 sall $ZBASE_SHIFT, %eax 745 subl %eax, AORIG 746#endif 747 748#if defined(LN) || defined(RT) 749 movl KK, %eax 750 movl AORIG, AA 751 sall $ZBASE_SHIFT, %eax 752 addl %eax, AA 753#endif 754 755 movl B, BB 756 757#if defined(LN) || defined(RT) 758 movl KK, %eax 759 sall $1 + ZBASE_SHIFT, %eax 760 addl %eax, BB 761#endif 762 763 movsd -32 * SIZE(AA), %xmm0 764 pxor %xmm2, %xmm2 765 movaps -32 * SIZE(BB), %xmm1 766 pxor %xmm3, %xmm3 767 768 pxor %xmm4, %xmm4 769 pxor %xmm5, %xmm5 770 pxor %xmm6, %xmm6 771 pxor %xmm7, %xmm7 772 773#if defined(LT) || defined(RN) 774 movl KK, %eax 775#else 776 movl K, %eax 777 subl KK, %eax 
778#endif 779 sarl $3, %eax 780 je .L42 781 ALIGN_4 782 783.L41: 784 addps %xmm2, %xmm6 785 pshufd $0x00, %xmm1, %xmm2 786 mulps %xmm0, %xmm2 787 addps %xmm3, %xmm7 788 pshufd $0x55, %xmm1, %xmm3 789 mulps %xmm0, %xmm3 790 791 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 792 793 addps %xmm2, %xmm4 794 pshufd $0xaa, %xmm1, %xmm2 795 mulps %xmm0, %xmm2 796 addps %xmm3, %xmm5 797 pshufd $0xff, %xmm1, %xmm3 798 movaps -28 * SIZE(BB), %xmm1 799 mulps %xmm0, %xmm3 800 movsd -30 * SIZE(AA), %xmm0 801 802 addps %xmm2, %xmm6 803 pshufd $0x00, %xmm1, %xmm2 804 mulps %xmm0, %xmm2 805 addps %xmm3, %xmm7 806 pshufd $0x55, %xmm1, %xmm3 807 mulps %xmm0, %xmm3 808 809 addps %xmm2, %xmm4 810 pshufd $0xaa, %xmm1, %xmm2 811 mulps %xmm0, %xmm2 812 addps %xmm3, %xmm5 813 pshufd $0xff, %xmm1, %xmm3 814 movaps -24 * SIZE(BB), %xmm1 815 mulps %xmm0, %xmm3 816 movsd -28 * SIZE(AA), %xmm0 817 818 addps %xmm2, %xmm6 819 pshufd $0x00, %xmm1, %xmm2 820 mulps %xmm0, %xmm2 821 addps %xmm3, %xmm7 822 pshufd $0x55, %xmm1, %xmm3 823 mulps %xmm0, %xmm3 824 825 addps %xmm2, %xmm4 826 pshufd $0xaa, %xmm1, %xmm2 827 mulps %xmm0, %xmm2 828 addps %xmm3, %xmm5 829 pshufd $0xff, %xmm1, %xmm3 830 movaps -20 * SIZE(BB), %xmm1 831 mulps %xmm0, %xmm3 832 movsd -26 * SIZE(AA), %xmm0 833 834 addps %xmm2, %xmm6 835 pshufd $0x00, %xmm1, %xmm2 836 mulps %xmm0, %xmm2 837 addps %xmm3, %xmm7 838 pshufd $0x55, %xmm1, %xmm3 839 mulps %xmm0, %xmm3 840 841 addps %xmm2, %xmm4 842 pshufd $0xaa, %xmm1, %xmm2 843 mulps %xmm0, %xmm2 844 addps %xmm3, %xmm5 845 pshufd $0xff, %xmm1, %xmm3 846 movaps -16 * SIZE(BB), %xmm1 847 mulps %xmm0, %xmm3 848 movsd -24 * SIZE(AA), %xmm0 849 850 addps %xmm2, %xmm6 851 pshufd $0x00, %xmm1, %xmm2 852 mulps %xmm0, %xmm2 853 addps %xmm3, %xmm7 854 pshufd $0x55, %xmm1, %xmm3 855 mulps %xmm0, %xmm3 856 857 addps %xmm2, %xmm4 858 pshufd $0xaa, %xmm1, %xmm2 859 mulps %xmm0, %xmm2 860 addps %xmm3, %xmm5 861 pshufd $0xff, %xmm1, %xmm3 862 movaps -12 * SIZE(BB), %xmm1 863 mulps %xmm0, %xmm3 864 movsd -22 * 
SIZE(AA), %xmm0 865 866 addps %xmm2, %xmm6 867 pshufd $0x00, %xmm1, %xmm2 868 mulps %xmm0, %xmm2 869 addps %xmm3, %xmm7 870 pshufd $0x55, %xmm1, %xmm3 871 mulps %xmm0, %xmm3 872 873 addps %xmm2, %xmm4 874 pshufd $0xaa, %xmm1, %xmm2 875 mulps %xmm0, %xmm2 876 addps %xmm3, %xmm5 877 pshufd $0xff, %xmm1, %xmm3 878 movaps -8 * SIZE(BB), %xmm1 879 mulps %xmm0, %xmm3 880 movsd -20 * SIZE(AA), %xmm0 881 882 addps %xmm2, %xmm6 883 pshufd $0x00, %xmm1, %xmm2 884 mulps %xmm0, %xmm2 885 addps %xmm3, %xmm7 886 pshufd $0x55, %xmm1, %xmm3 887 mulps %xmm0, %xmm3 888 889 addps %xmm2, %xmm4 890 pshufd $0xaa, %xmm1, %xmm2 891 mulps %xmm0, %xmm2 892 addps %xmm3, %xmm5 893 pshufd $0xff, %xmm1, %xmm3 894 movaps -4 * SIZE(BB), %xmm1 895 mulps %xmm0, %xmm3 896 movsd -18 * SIZE(AA), %xmm0 897 898 addps %xmm2, %xmm6 899 pshufd $0x00, %xmm1, %xmm2 900 mulps %xmm0, %xmm2 901 addps %xmm3, %xmm7 902 pshufd $0x55, %xmm1, %xmm3 903 mulps %xmm0, %xmm3 904 905 addps %xmm2, %xmm4 906 pshufd $0xaa, %xmm1, %xmm2 907 mulps %xmm0, %xmm2 908 addps %xmm3, %xmm5 909 pshufd $0xff, %xmm1, %xmm3 910 movaps 0 * SIZE(BB), %xmm1 911 mulps %xmm0, %xmm3 912 movsd -16 * SIZE(AA), %xmm0 913 914 subl $-16 * SIZE, AA 915 subl $-32 * SIZE, BB 916 decl %eax 917 jne .L41 918 ALIGN_4 919 920.L42: 921#if defined(LT) || defined(RN) 922 movl KK, %eax 923#else 924 movl K, %eax 925 subl KK, %eax 926#endif 927 andl $7, %eax # if (k & 1) 928 BRANCH 929 je .L44 930 ALIGN_4 931 932.L43: 933 addps %xmm2, %xmm6 934 pshufd $0x00, %xmm1, %xmm2 935 mulps %xmm0, %xmm2 936 addps %xmm3, %xmm7 937 pshufd $0x55, %xmm1, %xmm3 938 mulps %xmm0, %xmm3 939 940 addps %xmm2, %xmm4 941 pshufd $0xaa, %xmm1, %xmm2 942 mulps %xmm0, %xmm2 943 addps %xmm3, %xmm5 944 pshufd $0xff, %xmm1, %xmm3 945 movaps -28 * SIZE(BB), %xmm1 946 mulps %xmm0, %xmm3 947 movsd -30 * SIZE(AA), %xmm0 948 949 addl $2 * SIZE, AA 950 addl $4 * SIZE, BB 951 decl %eax 952 jg .L43 953 ALIGN_4 954 955.L44: 956#if defined(LN) || defined(RT) 957 movl KK, %eax 958#ifdef LN 959 subl 
$1, %eax 960#else 961 subl $2, %eax 962#endif 963 964 movl AORIG, AA 965 sall $ZBASE_SHIFT, %eax 966 leal (AA, %eax, 1), AA 967 leal (B, %eax, 2), BB 968#endif 969 970 addps %xmm2, %xmm6 971 addps %xmm3, %xmm7 972 973 pshufd $0xb1, %xmm5, %xmm5 974 pcmpeqb %xmm0, %xmm0 975 pshufd $0xb1, %xmm7, %xmm7 976 psllq $63, %xmm0 977 978#ifndef CONJ 979 shufps $0xb1, %xmm0, %xmm0 980 981 pxor %xmm0, %xmm5 982 pxor %xmm0, %xmm7 983#else 984#if defined(LN) || defined(LT) 985 pxor %xmm0, %xmm4 986 pxor %xmm0, %xmm6 987#else 988 pxor %xmm0, %xmm5 989 pxor %xmm0, %xmm7 990#endif 991#endif 992 993 addps %xmm5, %xmm4 994 addps %xmm7, %xmm6 995 996#if defined(LN) || defined(LT) 997 unpcklpd %xmm6, %xmm4 998 999 movaps -32 * SIZE(BB), %xmm2 1000 1001 subps %xmm4, %xmm2 1002#else 1003 movsd -32 * SIZE(AA), %xmm1 1004 movsd -30 * SIZE(AA), %xmm5 1005 1006 subps %xmm4, %xmm1 1007 subps %xmm6, %xmm5 1008#endif 1009 1010#if defined(LN) || defined(LT) 1011 movaps -32 * SIZE(AA), %xmm5 1012 1013 pshufd $0x44, %xmm5, %xmm6 1014 pshufd $0x11, %xmm5, %xmm7 1015 1016 pshufd $0xa0, %xmm2, %xmm4 1017 pshufd $0xf5, %xmm2, %xmm2 1018 1019#ifndef CONJ 1020 xorps %xmm0, %xmm2 1021#else 1022 xorps %xmm0, %xmm4 1023#endif 1024 1025 mulps %xmm6, %xmm4 1026 mulps %xmm7, %xmm2 1027 addps %xmm4, %xmm2 1028#endif 1029 1030#ifdef RN 1031 movaps -32 * SIZE(BB), %xmm4 1032 1033 pshufd $0x44, %xmm4, %xmm6 1034 pshufd $0x11, %xmm4, %xmm7 1035 1036 pshufd $0xa0, %xmm1, %xmm3 1037 pshufd $0xf5, %xmm1, %xmm1 1038 1039#ifndef CONJ 1040 xorps %xmm0, %xmm1 1041#else 1042 xorps %xmm0, %xmm3 1043#endif 1044 1045 mulps %xmm6, %xmm3 1046 mulps %xmm7, %xmm1 1047 1048 addps %xmm3, %xmm1 1049 1050 pshufd $0xee, %xmm4, %xmm6 1051 pshufd $0xbb, %xmm4, %xmm7 1052 1053 pshufd $0xa0, %xmm1, %xmm3 1054 pshufd $0xf5, %xmm1, %xmm2 1055 1056#ifndef CONJ 1057 xorps %xmm0, %xmm2 1058#else 1059 xorps %xmm0, %xmm3 1060#endif 1061 1062 mulps %xmm6, %xmm3 1063 mulps %xmm7, %xmm2 1064 1065 subps %xmm3, %xmm5 1066 subps %xmm2, %xmm5 1067 
1068 movaps -28 * SIZE(BB), %xmm4 1069 1070 pshufd $0xee, %xmm4, %xmm6 1071 pshufd $0xbb, %xmm4, %xmm7 1072 1073 pshufd $0xa0, %xmm5, %xmm3 1074 pshufd $0xf5, %xmm5, %xmm5 1075 1076#ifndef CONJ 1077 xorps %xmm0, %xmm5 1078#else 1079 xorps %xmm0, %xmm3 1080#endif 1081 1082 mulps %xmm6, %xmm3 1083 mulps %xmm7, %xmm5 1084 1085 addps %xmm3, %xmm5 1086#endif 1087 1088#ifdef RT 1089 movaps -28 * SIZE(BB), %xmm4 1090 1091 pshufd $0xee, %xmm4, %xmm6 1092 pshufd $0xbb, %xmm4, %xmm7 1093 1094 pshufd $0xa0, %xmm5, %xmm3 1095 pshufd $0xf5, %xmm5, %xmm5 1096 1097#ifndef CONJ 1098 xorps %xmm0, %xmm5 1099#else 1100 xorps %xmm0, %xmm3 1101#endif 1102 1103 mulps %xmm6, %xmm3 1104 mulps %xmm7, %xmm5 1105 1106 addps %xmm3, %xmm5 1107 1108 pshufd $0x44, %xmm4, %xmm6 1109 pshufd $0x11, %xmm4, %xmm7 1110 1111 pshufd $0xa0, %xmm5, %xmm3 1112 pshufd $0xf5, %xmm5, %xmm2 1113 1114#ifndef CONJ 1115 xorps %xmm0, %xmm2 1116#else 1117 xorps %xmm0, %xmm3 1118#endif 1119 1120 mulps %xmm6, %xmm3 1121 mulps %xmm7, %xmm2 1122 1123 subps %xmm3, %xmm1 1124 subps %xmm2, %xmm1 1125 1126 movaps -32 * SIZE(BB), %xmm4 1127 1128 pshufd $0x44, %xmm4, %xmm6 1129 pshufd $0x11, %xmm4, %xmm7 1130 1131 pshufd $0xa0, %xmm1, %xmm3 1132 pshufd $0xf5, %xmm1, %xmm1 1133 1134#ifndef CONJ 1135 xorps %xmm0, %xmm1 1136#else 1137 xorps %xmm0, %xmm3 1138#endif 1139 1140 mulps %xmm6, %xmm3 1141 mulps %xmm7, %xmm1 1142 1143 addps %xmm3, %xmm1 1144#endif 1145 1146#ifdef LN 1147 subl $2 * SIZE, CO1 1148#endif 1149 1150#if defined(LN) || defined(LT) 1151 movaps %xmm2, -32 * SIZE(BB) 1152 1153 movlps %xmm2, 0 * SIZE(CO1) 1154 movhps %xmm2, 0 * SIZE(CO1, LDC) 1155#else 1156 movlps %xmm1, -32 * SIZE(AA) 1157 movlps %xmm5, -30 * SIZE(AA) 1158 1159 movlps %xmm1, 0 * SIZE(CO1) 1160 movlps %xmm5, 0 * SIZE(CO1, LDC) 1161#endif 1162 1163#ifndef LN 1164 addl $2 * SIZE, CO1 1165#endif 1166 1167#if defined(LT) || defined(RN) 1168 movl K, %eax 1169 subl KK, %eax 1170 sall $ZBASE_SHIFT, %eax 1171 leal (AA, %eax, 1), AA 1172 leal (BB, %eax, 
2), BB 1173#endif 1174 1175#ifdef LN 1176 subl $1, KK 1177#endif 1178 1179#ifdef LT 1180 addl $1, KK 1181#endif 1182 1183#ifdef RT 1184 movl K, %eax 1185 sall $ZBASE_SHIFT, %eax 1186 addl %eax, AORIG 1187#endif 1188 ALIGN_4 1189 1190.L99: 1191#ifdef LN 1192 movl K, %eax 1193 sall $1 + ZBASE_SHIFT, %eax 1194 addl %eax, B 1195#endif 1196 1197#if defined(LT) || defined(RN) 1198 movl BB, B 1199#endif 1200 1201#ifdef RN 1202 addl $2, KK 1203#endif 1204 1205#ifdef RT 1206 subl $2, KK 1207#endif 1208 1209 decl J # j -- 1210 jg .L01 1211 ALIGN_4 1212 1213.L100: 1214 movl N, %eax 1215 andl $1, %eax 1216 jle .L999 1217 1218#if defined(LT) || defined(RN) 1219 movl A, %eax 1220 movl %eax, AA 1221#else 1222 movl A, %eax 1223 movl %eax, AORIG 1224#endif 1225 1226#ifdef RT 1227 movl K, %eax 1228 sall $ZBASE_SHIFT, %eax 1229 subl %eax, B 1230#endif 1231 1232#ifdef RT 1233 subl LDC, C 1234#endif 1235 movl C, CO1 1236#ifndef RT 1237 addl LDC, C 1238#endif 1239 1240#ifdef LN 1241 movl OFFSET, %eax 1242 addl M, %eax 1243 movl %eax, KK 1244#endif 1245 1246#ifdef LT 1247 movl OFFSET, %eax 1248 movl %eax, KK 1249#endif 1250 1251 movl M, %ebx 1252 sarl $1, %ebx 1253 jle .L130 1254 ALIGN_4 1255 1256.L110: 1257#ifdef LN 1258 movl K, %eax 1259 sall $1 + ZBASE_SHIFT, %eax 1260 subl %eax, AORIG 1261#endif 1262 1263#if defined(LN) || defined(RT) 1264 movl KK, %eax 1265 movl AORIG, AA 1266 sall $1 + ZBASE_SHIFT, %eax 1267 addl %eax, AA 1268#endif 1269 1270 movl B, BB 1271 1272#if defined(LN) || defined(RT) 1273 movl KK, %eax 1274 sall $ZBASE_SHIFT, %eax 1275 addl %eax, BB 1276#endif 1277 1278 movaps -32 * SIZE(AA), %xmm0 1279 pxor %xmm2, %xmm2 1280 movsd -32 * SIZE(BB), %xmm1 1281 pxor %xmm3, %xmm3 1282 movhps -30 * SIZE(BB), %xmm1 1283 pxor %xmm4, %xmm4 1284#ifdef LN 1285 prefetcht0 -4 * SIZE(CO1) 1286#else 1287 prefetcht0 3 * SIZE(CO1) 1288#endif 1289 pxor %xmm5, %xmm5 1290 pxor %xmm6, %xmm6 1291 pxor %xmm7, %xmm7 1292 1293#if defined(LT) || defined(RN) 1294 movl KK, %eax 1295#else 1296 movl 
K, %eax 1297 subl KK, %eax 1298#endif 1299 sarl $3, %eax 1300 je .L112 1301 ALIGN_4 1302 1303.L111: 1304 addps %xmm2, %xmm4 1305 pshufd $0x00, %xmm1, %xmm2 1306 mulps %xmm0, %xmm2 1307 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1308 addps %xmm3, %xmm5 1309 pshufd $0x55, %xmm1, %xmm3 1310 mulps %xmm0, %xmm3 1311 movaps -28 * SIZE(AA), %xmm0 1312 1313 addps %xmm2, %xmm4 1314 pshufd $0xaa, %xmm1, %xmm2 1315 mulps %xmm0, %xmm2 1316 addps %xmm3, %xmm5 1317 pshufd $0xff, %xmm1, %xmm3 1318 movaps -28 * SIZE(BB), %xmm1 1319 mulps %xmm0, %xmm3 1320 movaps -24 * SIZE(AA), %xmm0 1321 1322 addps %xmm2, %xmm4 1323 pshufd $0x00, %xmm1, %xmm2 1324 mulps %xmm0, %xmm2 1325 addps %xmm3, %xmm5 1326 pshufd $0x55, %xmm1, %xmm3 1327 mulps %xmm0, %xmm3 1328 movaps -20 * SIZE(AA), %xmm0 1329 1330 addps %xmm2, %xmm4 1331 pshufd $0xaa, %xmm1, %xmm2 1332 mulps %xmm0, %xmm2 1333 addps %xmm3, %xmm5 1334 pshufd $0xff, %xmm1, %xmm3 1335 movaps -24 * SIZE(BB), %xmm1 1336 mulps %xmm0, %xmm3 1337 movaps -16 * SIZE(AA), %xmm0 1338 1339 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) 1340 1341 addps %xmm2, %xmm4 1342 pshufd $0x00, %xmm1, %xmm2 1343 mulps %xmm0, %xmm2 1344 addps %xmm3, %xmm5 1345 pshufd $0x55, %xmm1, %xmm3 1346 mulps %xmm0, %xmm3 1347 movaps -12 * SIZE(AA), %xmm0 1348 1349 addps %xmm2, %xmm4 1350 pshufd $0xaa, %xmm1, %xmm2 1351 mulps %xmm0, %xmm2 1352 addps %xmm3, %xmm5 1353 pshufd $0xff, %xmm1, %xmm3 1354 movaps -20 * SIZE(BB), %xmm1 1355 mulps %xmm0, %xmm3 1356 movaps -8 * SIZE(AA), %xmm0 1357 1358 addps %xmm2, %xmm4 1359 pshufd $0x00, %xmm1, %xmm2 1360 mulps %xmm0, %xmm2 1361 addps %xmm3, %xmm5 1362 pshufd $0x55, %xmm1, %xmm3 1363 mulps %xmm0, %xmm3 1364 movaps -4 * SIZE(AA), %xmm0 1365 1366 addps %xmm2, %xmm4 1367 pshufd $0xaa, %xmm1, %xmm2 1368 mulps %xmm0, %xmm2 1369 addps %xmm3, %xmm5 1370 pshufd $0xff, %xmm1, %xmm3 1371 movaps -16 * SIZE(BB), %xmm1 1372 mulps %xmm0, %xmm3 1373 movaps 0 * SIZE(AA), %xmm0 1374 1375 subl $-32 * SIZE, AA 1376 subl $-16 * SIZE, BB 1377 1378 decl %eax 1379 jne 
.L111 1380 ALIGN_4 1381 1382.L112: 1383#if defined(LT) || defined(RN) 1384 movl KK, %eax 1385#else 1386 movl K, %eax 1387 subl KK, %eax 1388#endif 1389 andl $7, %eax # if (k & 1) 1390 BRANCH 1391 je .L114 1392 ALIGN_4 1393 1394.L113: 1395 addps %xmm2, %xmm4 1396 pshufd $0x00, %xmm1, %xmm2 1397 mulps %xmm0, %xmm2 1398 addps %xmm3, %xmm5 1399 pshufd $0x55, %xmm1, %xmm3 1400 movsd -30 * SIZE(BB), %xmm1 1401 mulps %xmm0, %xmm3 1402 movaps -28 * SIZE(AA), %xmm0 1403 1404 addl $4 * SIZE, AA 1405 addl $2 * SIZE, BB 1406 decl %eax 1407 jg .L113 1408 ALIGN_4 1409 1410.L114: 1411#if defined(LN) || defined(RT) 1412 movl KK, %eax 1413#ifdef LN 1414 subl $2, %eax 1415#else 1416 subl $1, %eax 1417#endif 1418 1419 movl AORIG, AA 1420 sall $ZBASE_SHIFT, %eax 1421 leal (AA, %eax, 2), AA 1422 leal (B, %eax, 1), BB 1423#endif 1424 1425 addps %xmm2, %xmm4 1426 addps %xmm3, %xmm5 1427 1428 pshufd $0xb1, %xmm5, %xmm5 1429 pcmpeqb %xmm0, %xmm0 1430 psllq $63, %xmm0 1431 1432#ifndef CONJ 1433 shufps $0xb1, %xmm0, %xmm0 1434 1435 pxor %xmm0, %xmm5 1436#else 1437#if defined(LN) || defined(LT) 1438 pxor %xmm0, %xmm4 1439#else 1440 pxor %xmm0, %xmm5 1441#endif 1442#endif 1443 1444 addps %xmm5, %xmm4 1445 1446#if defined(LN) || defined(LT) 1447 movaps %xmm4, %xmm5 1448 unpcklpd %xmm6, %xmm4 1449 unpckhpd %xmm6, %xmm5 1450 1451 movsd -32 * SIZE(BB), %xmm2 1452 movsd -30 * SIZE(BB), %xmm3 1453 1454 subps %xmm4, %xmm2 1455 subps %xmm5, %xmm3 1456#else 1457 movaps -32 * SIZE(AA), %xmm1 1458 1459 subps %xmm4, %xmm1 1460#endif 1461 1462#ifdef LN 1463 movaps -28 * SIZE(AA), %xmm5 1464 1465 pshufd $0xee, %xmm5, %xmm6 1466 pshufd $0xbb, %xmm5, %xmm7 1467 1468 pshufd $0xa0, %xmm3, %xmm4 1469 pshufd $0xf5, %xmm3, %xmm3 1470 1471#ifndef CONJ 1472 xorps %xmm0, %xmm3 1473#else 1474 xorps %xmm0, %xmm4 1475#endif 1476 1477 mulps %xmm6, %xmm4 1478 mulps %xmm7, %xmm3 1479 addps %xmm4, %xmm3 1480 1481 pshufd $0x44, %xmm5, %xmm6 1482 pshufd $0x11, %xmm5, %xmm7 1483 1484 pshufd $0xa0, %xmm3, %xmm4 1485 pshufd 
$0xf5, %xmm3, %xmm1

/*
 * NOTE(review): this is the tail of a hand-written x86-32 SSE kernel
 * (complex-single TRSM, GotoBLAS style, judging by the LN/LT/RN/RT and
 * CONJ macros and the KK/AORIG offset bookkeeping -- confirm against
 * common.h).  The recurring pshufd/xorps/mulps stanzas implement a
 * complex multiply:
 *   pshufd $0x44 / $0x11 broadcast the low pivot element as
 *     [re,im,re,im] / [im,re,im,re];
 *   pshufd $0xee / $0xbb do the same for the high pivot element;
 *   pshufd $0xa0 / $0xf5 duplicate the real / imaginary parts of the
 *     right-hand side;
 *   xorps with the sign-bit mask in %xmm0 makes the two partial products
 *     combine as (a*c - b*d, a*d + b*c); the CONJ variant flips which
 *     half receives the sign change.
 * %xmm0 presumably already holds that mask here -- it is rebuilt
 * explicitly at .L144 below (pcmpeqb/psllq).
 */

/* finish a complex multiply-subtract: %xmm2 -= pivot * solved element */
#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4, %xmm2
	subps	%xmm1, %xmm2

/* scale %xmm2 by the diagonal element stored at AA[-32]
   (diagonal is presumably pre-inverted by the packing code -- no divide
   appears anywhere in this chunk; TODO confirm) */
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6	# [re,im,re,im] of low pivot
	pshufd	$0x11, %xmm5, %xmm7	# [im,re,im,re] of low pivot

	pshufd	$0xa0, %xmm2, %xmm4	# duplicate real parts of rhs
	pshufd	$0xf5, %xmm2, %xmm2	# duplicate imaginary parts of rhs

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2		# %xmm2 = solved element
#endif

#ifdef LT
/* LT: forward substitution through the 2-element unit of the left factor:
   solve element 0, eliminate it from row 1, then solve element 1 */
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2		# %xmm2 = solved element 0

/* eliminate element 0 from the second row using the off-diagonal entry */
	pshufd	$0xee, %xmm5, %xmm6	# [re,im,re,im] of high element
	pshufd	$0xbb, %xmm5, %xmm7	# [im,re,im,re] of high element

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm1
	subps	%xmm4, %xmm3
	subps	%xmm1, %xmm3		# %xmm3 -= off-diag * element 0

/* scale the second element by its diagonal entry at AA[-28] */
	movaps	-28 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

#ifndef CONJ
	xorps	%xmm0, %xmm3
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm3
	addps	%xmm4, %xmm3		# %xmm3 = solved element 1
#endif

#if defined(RN) || defined(RT)
/* right-side cases (N = 1 here): one complex scale by B's diagonal */
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1

	addps	%xmm3, %xmm1		# %xmm1 = solved pair
#endif

#ifdef LN
	subl	$4 * SIZE, CO1		# LN walks C backwards: back up 2 complex elems
#endif

/* write the solved block to the packed buffer and to C */
#if defined(LN) || defined(LT)
	movlps	%xmm2, -32 * SIZE(BB)
	movlps	%xmm3, -30 * SIZE(BB)

	movlps	%xmm2, 0 * SIZE(CO1)
	movlps	%xmm3, 2 * SIZE(CO1)
#else
	movaps	%xmm1, -32 * SIZE(AA)

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1		# advance past the 2 complex elements stored
#endif

/* advance AA/BB past the remaining K-KK panel (LT/RN cases) */
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA	# AA += 2 * (K - KK) complex elements
	leal	(BB, %eax, 1), BB	# BB += (K - KK) complex elements
#endif

/* KK tracks the current position along the triangular factor */
#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, AORIG		# AORIG += 2 * K complex elements
#endif

	decl	%ebx			# i -- (row-pair loop counter)
	jg	.L110			# .L110 is defined earlier in the file
	ALIGN_4

/* .L130: leftover single row when M is odd, still in this N-remainder
   section */
.L130:
	movl	M, %ebx
	andl	$1, %ebx
	jle	.L149

#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG		# step AORIG back one row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA		# AA = AORIG + KK complex elements
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB		# BB = B + KK complex elements
#endif

/* load the first a/b pair and clear the accumulators */
	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

/* trip count: LT/RN run the leading KK iterations, LN/RT the trailing
   K - KK iterations */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# main loop is unrolled by 8
	je	.L142
	ALIGN_4

/* .L141: 8x-unrolled inner product.  Each stanza is one k step:
   %xmm4 += broadcast(re b) * a,  %xmm5 += broadcast(im b) * a,
   software-pipelined one stanza ahead -- the addps at the top of each
   stanza consumes the products formed by the previous one. */
.L141:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2	# broadcast re(b)
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3	# broadcast im(b)
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-26 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-22 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-22 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-18 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-18 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AA), %xmm0

	subl	$-16 * SIZE, AA		# AA += 16 * SIZE (8 complex elements)
	subl	$-16 * SIZE, BB		# BB += 16 * SIZE

	decl	%eax
	jne	.L141
	ALIGN_4

/* .L142: dispatch the k % 8 leftover iterations */
.L142:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remaining iterations = k & 7
	BRANCH
	je	.L144
	ALIGN_4

/* .L143: one k step per iteration (same stanza as .L141) */
.L143:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L143
	ALIGN_4

/* .L144: reduce the accumulators and solve the single 1x1 complex
   element */
.L144:
#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA	# rewind AA to the diagonal element
	leal	(B, %eax, 1), BB	# note: rebased from B, not from BB
#endif

	addps	%xmm2, %xmm4		# drain the software pipeline
	addps	%xmm3, %xmm5

	pshufd	$0xb1, %xmm5, %xmm5	# swap re/im within each complex lane
	pcmpeqb	%xmm0, %xmm0
	psllq	$63, %xmm0		# %xmm0 = sign-bit mask (one bit per qword)

#ifndef CONJ
	shufps	$0xb1, %xmm0, %xmm0	# move the sign bits to the other lanes

	pxor	%xmm0, %xmm5
#else
#if defined(LN) || defined(LT)
	pxor	%xmm0, %xmm4
#else
	pxor	%xmm0, %xmm5
#endif
#endif

	addps	%xmm5, %xmm4		# %xmm4 = accumulated complex product

/* rhs = packed value minus the accumulated update */
#if defined(LN) || defined(LT)
	movsd	-32 * SIZE(BB), %xmm2

	subps	%xmm4, %xmm2
#else
	movsd	-32 * SIZE(AA), %xmm1

	subps	%xmm4, %xmm1
#endif

/* multiply by the diagonal element (complex multiply via pshufd
   broadcasts + sign mask, exactly as in the two-row case above) */
#if defined(LN) || defined(LT)
	movaps	-32 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

#ifndef CONJ
	xorps	%xmm0, %xmm2
#else
	xorps	%xmm0, %xmm4
#endif

	mulps	%xmm6, %xmm4
	mulps	%xmm7, %xmm2
	addps	%xmm4, %xmm2		# solved value (left-side cases)
#endif

#if defined(RN) || defined(RT)
	movaps	-32 * SIZE(BB), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

#ifndef CONJ
	xorps	%xmm0, %xmm1
#else
	xorps	%xmm0, %xmm3
#endif

	mulps	%xmm6, %xmm3
	mulps	%xmm7, %xmm1

	addps	%xmm3, %xmm1		# solved value (right-side cases)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1		# step back one complex element
#endif

/* store the solved element to the packed buffer and to C */
#if defined(LN) || defined(LT)
	movlps	%xmm2, -32 * SIZE(BB)

	movlps	%xmm2, 0 * SIZE(CO1)
#else
	movlps	%xmm1, -32 * SIZE(AA)

	movlps	%xmm1, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

/* pointer / KK bookkeeping, single-row flavor of the block above */
#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* .L149: end of this N column -- advance B and the column offset KK */
.L149:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B			# BB already walked to the end of this panel
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* .L999: restore the callee-saved registers pushed in the prologue
   (ebx/esi/edi/ebp), release the ARGS scratch area, and return */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE