1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
                                                                     */
/*********************************************************************/

/* NOTE(review): i386 x87 kernel, AT&T syntax, run through cpp (common.h).
   The LN/LT/RN/RT macros select one triangular-solve variant at build
   time; KK / AORIG carry the per-panel offset bookkeeping.  Looks like a
   GotoBLAS-style 2x2 trsm kernel -- confirm against the build rules.   */

#define ASSEMBLER
#include "common.h"

/* Scratch area reserved on our own frame (below the four pushed regs). */
#define STACK 16
#define ARGS 16

/* Locals kept in the scratch area. */
#define J	 0 + STACK(%esp)	/* remaining column-pair count      */
#define KK	 4 + STACK(%esp)	/* running trsm offset into K       */
#define KKK	 8 + STACK(%esp)	/* declared but unused in this file */
#define AORIG	12 + STACK(%esp)	/* saved base of current A panel    */

/* Incoming arguments, addressed past STACK + ARGS + saved registers.
   Under DOUBLE the slots after ALPHA shift by 4 because ALPHA then
   occupies 8 bytes instead of 4.                                       */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define A	24 + STACK + ARGS(%esp)
#define B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)
#else
#define A	20 + STACK + ARGS(%esp)
#define B	24 + STACK + ARGS(%esp)
#define C	28 + STACK + ARGS(%esp)
#define LDC	32 + STACK + ARGS(%esp)
#define OFFSET	36 + STACK + ARGS(%esp)
#endif

/* Software-prefetch distance (in elements) used inside .MainLoop. */
#define PREFETCH_OFFSET 48

/* NOTE(review): both arms define REP identically; kept as-is.  The bare
   "rep" prefixes emitted later are decode padding on some cores.       */
#if defined(PENTIUM3) || defined(PENTIUMM)
#define REP rep
#else
#define REP rep
#endif

/* Register aliases: AA walks the A panel, BB walks the B panel. */
#define AA %edx
#define BB %ecx

	PROLOGUE

	subl	$ARGS, %esp	# Generate Stack Frame

	pushl	%ebp		# save callee-saved registers
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	LDC, %ebp	# ldc   # MEMORY
	movl	B, %ebx		# %ebx tracks the current B panel
	leal	(, %ebp, SIZE), %ebp	# ldc in bytes from here on

#ifdef LN
	/* LN walks backwards: point C past row M and A past the whole
	   M x K panel so the loops can step down.                      */
	movl	M, %eax
	leal	(, %eax, SIZE), %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	/* RT walks columns backwards: push B past the K x N panel and
	   C past column N.                                             */
	movl	N, %eax
	leal	(, %eax, SIZE), %eax
	imull	K, %eax
	addl	%eax, %ebx

	movl	N, %eax
	imull	%ebp, %eax
	addl	%eax, C
#endif

#ifdef RN
	negl	KK		/* KK = -KK; presumably primed with OFFSET
				   by the caller -- TODO confirm        */
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK	/* KK starts at N - OFFSET for RT */
#endif

	/* Handle the odd column first (n & 1), then pairs at .L8. */
	movl	N, %eax		# n	# MEMORY
	andl	$1, %eax
	je	.L8

#if defined(LT) || defined(RN)
	movl	A, AA		/* forward variants scan A from the start */
#else
	movl	A, %eax
	movl	%eax, AORIG	/* backward variants recompute AA per tile */
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, %ebx	/* step B back over one column's K elems */
#endif

#ifdef RT
	subl	%ebp, C		/* step C back one column */
#endif
	movl	C, %edi		# c	# MEMORY
#ifndef RT
	addl	%ebp, C		/* forward variants advance C one column */
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK	/* LN: KK starts at M + OFFSET */
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK	/* LT: KK starts at OFFSET */
#endif

	/* --- 2-row x 1-column tiles of the odd column --- */
	movl	M, %esi		# m	# MEMORY
	sarl	$1, %esi	# m >> 1
	je	.L36
	ALIGN_4

.L46:
#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG	/* step A panel back by 2*K elements */
#endif

#if defined(LN) || defined(RT)
	/* Recompute AA/BB from the KK offset for backward variants. */
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA, %eax, 2), AA
	leal	(%ebx, %eax, 1), BB
#else
	movl	%ebx, BB
#endif

	/* x87 stack: two accumulators + current b value. */
	fldz
	fldz
	FLD	0 * SIZE(BB)	# temp1 = *(boffset + 0)

#if defined(LT) || defined(RN)
	movl	KK, %eax	/* loop length = KK ... */
#else
	movl	K, %eax
	subl	KK, %eax	/* ... or K - KK for the other variants */
#endif
	sarl	$1, %eax	/* unrolled by 2 */
	je	.L56
	ALIGN_4

/* 2x1 inner product, two k-iterations per pass. */
.L57:
	FLD	0 * SIZE(AA)	# temp2 = *(aoffset + 0)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(AA)	# temp2 = *(aoffset + 1)
	faddp	%st, %st(2)
	FLD	1 * SIZE(BB)	# temp1 = *(boffset + 1)

	FLD	2 * SIZE(AA)	# temp2 = *(aoffset + 2)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	3 * SIZE(AA)	# temp2 = *(aoffset + 3)
	faddp	%st, %st(2)
	FLD	2 * SIZE(BB)	# temp1 = *(boffset + 2)

	addl	$4 * SIZE,AA
	addl	$2 * SIZE,BB
	dec	%eax
	jne	.L57
	ALIGN_4

.L56:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$1, %eax	/* odd k remainder */
	je	.L45
	ALIGN_4

	FLD	0 * SIZE(AA)	# temp2 = *(aoffset + 0)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(AA)	# temp2 = *(aoffset + 1)
	faddp	%st, %st(2)
	FLD	3 * SIZE(BB)	# temp1 (dead load; dropped below)

	addl	$2 * SIZE,AA
	addl	$1 * SIZE,BB
	ALIGN_4

.L45:
	ffreep	%st(0)		/* drop the trailing b value */

#if defined(LN) || defined(RT)
	/* Rewind AA/BB to the diagonal block for the solve phase. */
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA, %eax, 2), AA
	leal	(%ebx, %eax, 1), BB
#endif

	/* rhs = stored value - accumulated product */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(BB)
	fsubp	%st, %st(2)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(AA)
	fsubp	%st, %st(2)
#endif

	/* 2x1 triangular solve against the diagonal block. */
#ifdef LN
	FLD	3 * SIZE(AA)
	fmulp	%st, %st(2)

	FLD	2 * SIZE(AA)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	0 * SIZE(AA)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	0 * SIZE(AA)
	fmulp	%st, %st(1)

	FLD	1 * SIZE(AA)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	3 * SIZE(AA)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	0 * SIZE(BB)	/* 1x1 diagonal: just scale both */
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	0 * SIZE(BB)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subl	$2 * SIZE, %edi	/* LN stores backwards */
#endif

	/* Write results back to the packed panel and to C. */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
	fxch	%st(1)
	FSTU	1 * SIZE(BB)
#else
	FSTU	0 * SIZE(AA)
	fxch	%st(1)
	FSTU	1 * SIZE(AA)
#endif

	FST	1 * SIZE(%edi)
	FST	0 * SIZE(%edi)

#ifndef LN
	addl	$2 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	/* Advance AA/BB over the untouched tail of the panel. */
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%esi		# i --
	jne	.L46
	ALIGN_4

/* --- last single row of the odd column (m & 1) --- */
.L36:
	movl	M, %eax		# m	# MEMORY
	andl	$1, %eax	# m & 1
	je	.L99

#ifdef LN
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA, %eax, 1), AA
	leal	(%ebx, %eax, 1), BB
#else
	movl	%ebx, BB
#endif

	fldz			/* single accumulator */

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	test	%eax, %eax
	jle	.L52
	ALIGN_3

/* 1x1 dot product. */
.L51:
	FLD	(AA)
	FMUL	(BB)
	addl	$1 * SIZE,AA
	addl	$1 * SIZE,BB
	faddp	%st,%st(1)
	decl	%eax
	jne	.L51
	ALIGN_4

.L52:

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA, %eax, 1), AA
	leal	(%ebx, %eax, 1), BB
#endif

#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
#endif

	/* 1x1 solve: single multiply by the diagonal reciprocal. */
#if defined(LN) || defined(LT)
	FMUL	0 * SIZE(AA)
#else
	FMUL	0 * SIZE(BB)
#endif

#ifdef LN
	subl	$1 * SIZE, %edi
#endif

#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
#else
	FSTU	0 * SIZE(AA)
#endif

	FST	0 * SIZE(%edi)

#ifndef LN
	addl	$1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 1), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* Odd column done: advance %ebx past the consumed B column. */
.L99:
#ifdef LN
	movl	K, %eax
	leal	(%ebx, %eax, SIZE), %ebx
#endif
#if defined(LT) || defined(RN)
	movl	BB, %ebx
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* --- main loop over column pairs: j = n >> 1 --- */
.L8:
	movl	N, %eax		# j = (n >> 1)	# MEMORY
	sarl	$1, %eax
	movl	%eax, J		# j = (n >> 1)	# MEMORY
	je	.End
	ALIGN_4

.L34:
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, %ebx	/* step B back two columns */
#endif
	lea	(, %ebp, 2), %eax	/* 2*ldc in bytes */

#ifdef RT
	subl	%eax, C
#endif
	movl	C, %edi
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M, %esi
	sarl	$1, %esi	/* 2x2 tiles first */
	je	.L12
	ALIGN_4

.MainHead:
#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA, %eax, 2), AA
	leal	(%ebx, %eax, 2), BB
#else
	movl	%ebx, BB
#endif

	/* x87 stack: 4 accumulators (st4-st7 after loads) plus
	   pre-loaded a/b values that rotate through st0-st3.      */
	fldz
	fldz
	fldz
	fldz

	FLD	4 * SIZE(BB)	# b5
	FLD	4 * SIZE(AA)	# a5
	FLD	0 * SIZE(BB)	# b1
	FLD	0 * SIZE(AA)	# a1

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(%edi)
	prefetchw	2 * SIZE(%edi, %ebp, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(%edi)
	prefetchnta	2 * SIZE(%edi, %ebp, 1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax	/* unrolled by 4 */
	je	.L16
	ALIGN_4

/* 2x2 kernel, 4 k-iterations per pass; each group below is one
   rank-1 update a(0:1,k) x b(k,0:1) folded into the 4 accumulators. */
.MainLoop:
#if defined(HAVE_3DNOW)
	prefetch	(PREFETCH_OFFSET) * SIZE(BB)
	nop
#elif defined(HAVE_SSE)
	prefetchnta	(PREFETCH_OFFSET) * SIZE(BB)
#if (L2_SIZE == 524288)
	prefetcht0	(PREFETCH_OFFSET) * SIZE(AA)
#endif
#endif

	fmul	%st, %st(1)
	FMUL	1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	0 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	1 * SIZE(AA)
	fmul	%st, %st(1)
	FMUL	1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	2 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	2 * SIZE(AA)

	fmul	%st, %st(1)
	FMUL	3 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	2 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	3 * SIZE(AA)
	fmul	%st, %st(1)
	FMUL	3 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	8 * SIZE(BB)	/* pre-load next iteration's b */
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	8 * SIZE(AA)	/* pre-load next iteration's a */
	fxch	%st(2)
#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE)
	prefetchnta	(PREFETCH_OFFSET + 4) * SIZE(BB)
#if (L2_SIZE == 524288)
	prefetcht0	(PREFETCH_OFFSET + 4) * SIZE(AA)
#endif
#endif

	/* k+2 / k+3 rank-1 updates; same pattern, second register pair. */
	fmul	%st, %st(3)
	FMUL	5 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	4 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	5 * SIZE(AA)
	fmul	%st, %st(3)
	FMUL	5 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	6 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	6 * SIZE(AA)

	fmul	%st, %st(3)
	FMUL	7 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	6 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	7 * SIZE(AA)
	fmul	%st, %st(3)
	FMUL	7 * SIZE(BB)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	12 * SIZE(BB)	/* pre-load for the pass after next */
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	12 * SIZE(AA)
	fxch	%st(2)

	subl	$-8 * SIZE, BB	/* advance by 8 elements (adds +8*SIZE) */
	subl	$-8 * SIZE, AA
	decl	%eax		# l --
	jne	.MainLoop
	ALIGN_4

.L16:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	and	$3, %eax	/* k remainder, 0..3 */
	je	.L21
	ALIGN_4

/* One rank-1 update per pass for the k remainder. */
.SubLoop:
	fmul	%st, %st(1)
	FMUL	1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	0 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	1 * SIZE(AA)
	fmul	%st, %st(1)
	FMUL	1 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	2 * SIZE(BB)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	2 * SIZE(AA)

	addl	$2 * SIZE,BB
	addl	$2 * SIZE,AA
	decl	%eax
	jne	.SubLoop
	ALIGN_4

.L21:
	/* Drop the four pre-loaded values; accumulators remain in st0-st3. */
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)

#if defined(LN) || defined(RT)
	/* Rewind AA/BB to the 2x2 diagonal block for the solve. */
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA, %eax, 2), AA
	leal	(%ebx, %eax, 2), BB
#endif

	/* rhs = stored panel values - accumulated products.  Note the
	   1/2 swap in the else-arm: the AA panel is stored transposed.  */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(BB)
	fsubp	%st, %st(2)
	FLD	2 * SIZE(BB)
	fsubp	%st, %st(3)
	FLD	3 * SIZE(BB)
	fsubp	%st, %st(4)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(AA)
	fsubp	%st, %st(3)
	FLD	2 * SIZE(AA)
	fsubp	%st, %st(2)
	FLD	3 * SIZE(AA)
	fsubp	%st, %st(4)
#endif

	/* 2x2 triangular solve; one branch per variant. */
#ifdef LN
	FLD	3 * SIZE(AA)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	2 * SIZE(AA)
	fmul	%st(3), %st
	FLD	2 * SIZE(AA)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	0 * SIZE(AA)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	FLD	0 * SIZE(AA)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	1 * SIZE(AA)
	fmul	%st(1), %st
	FLD	1 * SIZE(AA)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	3 * SIZE(AA)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	FLD	0 * SIZE(BB)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	1 * SIZE(BB)
	fmul	%st(1), %st
	FLD	1 * SIZE(BB)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	3 * SIZE(BB)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	FLD	3 * SIZE(BB)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	2 * SIZE(BB)
	fmul	%st(2), %st
	FLD	2 * SIZE(BB)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	0 * SIZE(BB)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subl	$2 * SIZE, %edi	/* LN stores backwards */
#endif

	/* Write the 2x2 result back to the packed panel and to C
	   (two columns: %edi and %edi + ldc).                      */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
	fxch	%st(1)
	FSTU	1 * SIZE(BB)
	fxch	%st(2)
	FSTU	2 * SIZE(BB)
	fxch	%st(3)
	FSTU	3 * SIZE(BB)

	FST	1 * SIZE(%edi,%ebp)
	FST	0 * SIZE(%edi)
	FST	0 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi)
#else
	FSTU	0 * SIZE(AA)
	fxch	%st(2)
	FSTU	1 * SIZE(AA)
	fxch	%st(1)
	FSTU	2 * SIZE(AA)
	fxch	%st(3)
	FSTU	3 * SIZE(AA)

	FST	1 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi)
	FST	0 * SIZE(%edi)
	FST	0 * SIZE(%edi,%ebp)
#endif

#ifndef LN
	addl	$2 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%esi		# i --
	jne	.MainHead
	ALIGN_4

/* --- last single row against the column pair (m & 1) --- */
.L12:
	movl	M, %eax		# m	# MEMORY
	andl	$1, %eax
	je	.L27

#ifdef LN
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, AA
	leal	(AA, %eax, 1), AA
	leal	(%ebx, %eax, 2), BB
#else
	movl	%ebx, BB
#endif

	fldz			/* two accumulators for the two columns */
	fldz

	FLD	0 * SIZE(AA)	# temp1 = *(aoffset + 0)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$1,%eax		# k >> 1	# MEMORY
	je	.L54
	ALIGN_4

/* 1x2 inner product, unrolled by 2.  The bare "rep" prefixes are
   decode padding -- kept as-is, do not remove.                    */
.L55:
	FLD	0 * SIZE(BB)	# temp2 = *(boffset + 0)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(BB)	# temp2 = *(boffset + 1)
	faddp	%st, %st(2)
	FLD	1 * SIZE(AA)	# temp1 = *(aoffset + 1)

	FLD	2 * SIZE(BB)	# temp2 = *(boffset + 2)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	3 * SIZE(BB)	# temp2 = *(boffset + 3)
	faddp	%st, %st(2)
	FLD	2 * SIZE(AA)	# temp1 = *(aoffset + 2)

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L55
	ALIGN_4

.L54:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$1,%eax		# k & 1
	je	.L33
	ALIGN_4

	FLD	0 * SIZE(BB)	# temp2 = *(boffset + 0)
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(BB)	# temp2 = *(boffset + 1)
	faddp	%st, %st(2)
	FLD	1 * SIZE(AA)	# temp1 (dead load; dropped below)

	addl	$1 * SIZE, AA
	addl	$2 * SIZE, BB
	ALIGN_4

.L33:
	ffreep	%st(0)		/* drop the trailing a value */

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	leal	(, %eax, SIZE), %eax

	movl	AORIG, AA
	leal	(AA, %eax, 1), AA
	leal	(%ebx, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	FLD	0 * SIZE(BB)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(BB)
	fsubp	%st, %st(2)
#else
	FLD	0 * SIZE(AA)
	fsubp	%st, %st(1)
	FLD	1 * SIZE(AA)
	fsubp	%st, %st(2)
#endif

	/* 1x2 solve: left variants scale by the single diagonal. */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE(AA)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	0 * SIZE(BB)
	fmulp	%st, %st(1)

	FLD	1 * SIZE(BB)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	3 * SIZE(BB)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	3 * SIZE(BB)
	fmulp	%st, %st(2)

	FLD	2 * SIZE(BB)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	0 * SIZE(BB)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, %edi
#endif

#if defined(LN) || defined(LT)
	FSTU	0 * SIZE(BB)
	fxch	%st(1)
	FSTU	1 * SIZE(BB)
#else
	FSTU	0 * SIZE(AA)
	fxch	%st(1)
	FSTU	1 * SIZE(AA)
#endif

	FST	0 * SIZE(%edi,%ebp)	/* second column */
	FST	0 * SIZE(%edi)		/* first column */

#ifndef LN
	addl	$1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* Column pair done: advance %ebx past the two consumed B columns. */
.L27:
#ifdef LN
	movl	K, %eax
	leal	( , %eax, SIZE), %eax
	leal	(%ebx, %eax, 2), %ebx	/* %ebx += 2 * K elements */
#endif
#if defined(LT) || defined(RN)
	movl	BB, %ebx	/* BB already ended past the panel */
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J		# j --	# MEMORY
	jne	.L34
	ALIGN_4

/* Restore callee-saved registers and release the scratch frame. */
.End:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE