/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPT) HOWEVER CAUSED  AND ON ANY THEORY OF       */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

/*
 * i386 (32-bit) SSE3 kernel, AT&T/GAS syntax.
 *
 * Presumably a double-complex TRSM (triangular solve) micro-kernel in the
 * OpenBLAS/GotoBLAS style: the LN/LT/RN/RT macros select the four
 * side/transpose variants and CONJ selects conjugation -- TODO confirm
 * against the build system that instantiates this file.
 *
 * All structural macros (PROLOGUE/EPILOGUE, PROFCODE, SIZE, ZBASE_SHIFT,
 * ALIGN_4, BRANCH) come from "common.h".
 *
 * Register roles (see #defines below):
 *   AA  (%edx) -- current position in panel of A
 *   BB  (%ecx) -- current position in panel of B
 *   B   (%edi) -- base pointer of B panel
 *   LDC (%ebp) -- leading dimension of C, scaled to bytes
 *   CO1 (%esi) -- current output column in C
 *   %ebx       -- i (row counter), %eax -- scratch / k counter
 *   xmm4..xmm7 -- accumulators; xmm0..xmm3 -- operands/shuffles
 */

#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	16

/* Incoming arguments, addressed relative to %esp after the
   `subl $ARGS` + four pushes in the prologue. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	24 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

/* Locals kept in the reserved ARGS area on the stack. */
#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)

/* Per-microarchitecture prefetch instruction and distance. */
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht1
#define PREFETCHSIZE	84
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht1
#define PREFETCHSIZE	84
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(8 * 2)
#endif

#define AA	%edx
#define BB	%ecx
#define LDC	%ebp
#define B	%edi
#define CO1	%esi

#define ADD1	addpd
#define ADD2	addpd

	PROLOGUE

	subl	$ARGS, %esp

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B,   B
	movl	ARG_LDC, LDC
	movl	OFFSET, %eax
#ifdef RN
	negl	%eax
#endif
	movl	%eax, KK

	movl	M, %ebx
	testl	%ebx, %ebx
	jle	.L999			# nothing to do for m <= 0

	/* Bias A and B by +16*SIZE so the inner loops can use small
	   negative displacements (-16*SIZE .. ).  subl of a negative
	   immediate is the file's idiom for addl. */
	subl	$-16 * SIZE, A
	subl	$-16 * SIZE, B

	sall	$ZBASE_SHIFT, LDC	# LDC in bytes (complex elements)

#ifdef LN
	movl	M, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, C			# C += m (start past last row)
	imull	K, %eax
	addl	%eax, A			# A += m * k
#endif

#ifdef RT
	movl	N, %eax
	sall	$ZBASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B			# B += n * k

	movl	N, %eax
	imull	LDC, %eax
	addl	%eax, C			# C += n * ldc
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK		# KK = n - offset
#endif

	/* ------------------------------------------------------------ */
	/* Remainder column: handle n odd (a single column of B/C).     */
	/* ------------------------------------------------------------ */
	movl	N, %eax
	testl	$1, %eax
	jle	.L100

#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, B			# step B back one column panel
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M, %ebx			# i = m
	ALIGN_4

.L110:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG		# step back one row panel of A
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA		# AA = AORIG + kk
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB		# BB = B + kk
#endif

	movaps	-16 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-16 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
#ifdef LN
	prefetcht0	-2 * SIZE(CO1)
#else
	prefetcht0	 1 * SIZE(CO1)
#endif
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# main loop runs k / 8 iterations
	je	.L115
	ALIGN_4

/* 1x1 inner product, unrolled 8x.  Each step computes a complex
   multiply-accumulate: pshufd $0x4e swaps the (re,im) halves so the
   cross terms land in xmm5/xmm7; ADD1/ADD2 accumulate. */
.L112:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	-14 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	-12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-10 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	-10 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -8 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	 -8 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	PREFETCH  (PREFETCHSIZE + 8) * SIZE(AA)

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -6 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	 -6 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -4 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	 -4 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -2 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	 -2 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	  0 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	  0 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	subl	$-16 * SIZE, AA		# advance 8 complex elements
	subl	$-16 * SIZE, BB

	subl	$1, %eax
	jne	.L112
	ALIGN_4

.L115:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remaining k & 7 iterations
	BRANCH
	je	.L118
	ALIGN_4

.L116:
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	-14 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4

.L118:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax		# both variants back up one element
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), BB
#endif

	addpd	%xmm6, %xmm4		# fold the two accumulator pairs
	pcmpeqb	%xmm1, %xmm1
	addpd	%xmm7, %xmm5
	psllq	$63, %xmm1		# xmm1 = {sign bit, sign bit}

#ifndef CONJ
	pshufd	$0x40, %xmm1, %xmm0
	shufps	$0x04, %xmm1, %xmm1

	pxor	%xmm0, %xmm4		# negate selected lanes (complex sign fix)
#else
#if defined(LN) || defined(LT)
	pshufd	$0x40, %xmm1, %xmm0
#else
	pshufd	$0x04, %xmm1, %xmm0
#endif
	shufps	$0x40, %xmm1, %xmm1

	pxor	%xmm0, %xmm5
#endif

	haddpd	%xmm5, %xmm4		# combine re/im partial sums

	/* b = b - sum (solve right-hand side update) */
#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BB), %xmm5
	subpd	%xmm4, %xmm5
#else
	movapd	-16 * SIZE(AA), %xmm5
	subpd	%xmm4, %xmm5
#endif

	/* Multiply by the (inverse) diagonal element: complex multiply
	   built from movddup of re/im parts + swapped product. */
#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AA), %xmm2
	movddup	-15 * SIZE(AA), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm1, %xmm4

	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4

	addpd	%xmm4, %xmm5
#endif

#if defined(RN) || defined(RT)
	movddup	-16 * SIZE(BB), %xmm2
	movddup	-15 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm1, %xmm4

	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4

	addpd	%xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

	movlpd	%xmm5, 0 * SIZE(CO1)	# store result into C
	movhpd	%xmm5, 1 * SIZE(CO1)

	/* Write the solved value back into the packed panel. */
#if defined(LN) || defined(LT)
	movapd	%xmm5, -16 * SIZE(BB)
#else
	movapd	%xmm5, -16 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	addl	%eax, BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L110

#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

	/* ------------------------------------------------------------ */
	/* Main loop over column pairs: j = n / 2 down to 1.            */
	/* ------------------------------------------------------------ */
.L100:
	movl	N, %eax
	sarl	$1, %eax
	movl	%eax, J			# j = n / 2
	jle	.L999
	ALIGN_4

.L01:
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, B			# step B back two column panels
#endif

	leal	(, LDC, 2), %eax	# byte stride of two C columns

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M, %ebx			# i = m
	ALIGN_4

.L10:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax	# two columns of B per kk step
	addl	%eax, BB
#endif

	movaps	-16 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-16 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

#ifdef LN
	pxor	%xmm4, %xmm4
	prefetcht0	-2 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	-2 * SIZE(CO1, LDC)
#else
	pxor	%xmm4, %xmm4
	prefetcht0	 1 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	 1 * SIZE(CO1, LDC)
#endif
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# main loop runs k / 8 iterations
	je	.L15
	ALIGN_4

/* 1x2 inner product, unrolled 8x.  xmm4/xmm5 accumulate column 0,
   xmm6/xmm7 accumulate column 1; the software pipeline keeps one
   ADD pair in flight while the next B pair loads. */
.L12:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	ADD1	%xmm3, %xmm6
	movaps	-14 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	-10 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 -8 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 -6 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 -4 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	-10 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 -2 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	  0 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -8 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)

	ADD1	%xmm3, %xmm6
	movaps	  2 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	  4 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -6 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	  6 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	  8 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -4 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 10 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -2 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 14 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 16 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	subl	$-32 * SIZE, BB		# advance B: 8 k-steps * 2 columns
	mulpd	%xmm0, %xmm2
	movaps	  0 * SIZE(AA), %xmm0

	subl	$-16 * SIZE, AA		# advance A: 8 complex elements

	subl	$1, %eax
	jne	.L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remaining k & 7 iterations
	BRANCH
	je	.L18
	ALIGN_4

.L16:
	ADD1	%xmm3, %xmm6
	movaps	-14 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2

	movaps	-14 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4

.L18:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax		# back up 1 row (A side)
#else
	subl	$2, %eax		# back up 2 columns (B side)
#endif

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), BB
#endif

	ADD1	%xmm3, %xmm6		# drain the software pipeline
	pcmpeqb	%xmm1, %xmm1
	ADD2	%xmm2, %xmm7
	psllq	$63, %xmm1		# xmm1 = {sign bit, sign bit}

#ifndef CONJ
	pshufd	$0x40, %xmm1, %xmm0
	shufps	$0x04, %xmm1, %xmm1

	pxor	%xmm0, %xmm4		# complex sign fix-up, both columns
	pxor	%xmm0, %xmm6
#else
#if defined(LN) || defined(LT)
	pshufd	$0x40, %xmm1, %xmm0
#else
	pshufd	$0x04, %xmm1, %xmm0
#endif
	shufps	$0x40, %xmm1, %xmm1

	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#endif

	haddpd	%xmm5, %xmm4		# combine re/im partial sums
	haddpd	%xmm7, %xmm6

	/* b = b - sum, for both right-hand-side columns */
#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BB), %xmm5
	movapd	-14 * SIZE(BB), %xmm7

	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm7
#else
	movapd	-16 * SIZE(AA), %xmm5
	movapd	-14 * SIZE(AA), %xmm7

	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm7
#endif

	/* Solve with the 1x1 (A side) or 2x2 (B side) triangular block;
	   each complex multiply uses movddup'd re/im parts plus a
	   swapped, sign-flipped product. */
#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AA), %xmm2
	movddup	-15 * SIZE(AA), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	%xmm1, %xmm4
	xorpd	%xmm1, %xmm6

	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6

	addpd	%xmm4, %xmm5
	addpd	%xmm6, %xmm7
#endif

#ifdef RN
	/* Forward substitution through the 2x2 upper-triangular block. */
	movddup	-16 * SIZE(BB), %xmm2
	movddup	-15 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm1, %xmm4

	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4

	addpd	%xmm4, %xmm5

	movddup	-14 * SIZE(BB), %xmm2
	movddup	-13 * SIZE(BB), %xmm3

	movapd	%xmm5, %xmm4
	pshufd	$0x4e, %xmm5, %xmm6

	xorpd	%xmm1, %xmm6

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm6

	subpd	%xmm4, %xmm7
	subpd	%xmm6, %xmm7

	movddup	-10 * SIZE(BB), %xmm2
	movddup	 -9 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	%xmm1, %xmm6

	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6

	addpd	%xmm6, %xmm7
#endif

#ifdef RT
	/* Backward substitution through the 2x2 lower-triangular block. */
	movddup	-10 * SIZE(BB), %xmm2
	movddup	 -9 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	%xmm1, %xmm6

	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6

	addpd	%xmm6, %xmm7

	movddup	-12 * SIZE(BB), %xmm2
	movddup	-11 * SIZE(BB), %xmm3

	movapd	%xmm7, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	%xmm1, %xmm6

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm6

	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm5

	movddup	-16 * SIZE(BB), %xmm2
	movddup	-15 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm1, %xmm4

	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4

	addpd	%xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

	movlpd	%xmm5, 0 * SIZE(CO1)		# column 0 of C
	movhpd	%xmm5, 1 * SIZE(CO1)

	movlpd	%xmm7, 0 * SIZE(CO1, LDC)	# column 1 of C
	movhpd	%xmm7, 1 * SIZE(CO1, LDC)

	/* Write solved values back into the packed panel. */
#if defined(LN) || defined(LT)
	movapd	%xmm5, -16 * SIZE(BB)
	movapd	%xmm7, -14 * SIZE(BB)
#else
	movapd	%xmm5, -16 * SIZE(AA)
	movapd	%xmm7, -14 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L10
	ALIGN_4

.L99:
#ifdef LN
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE