/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * x87 kernel operating on 2x2 tiles of C, with 1-wide remainder paths
 * for odd M and odd N.
 *
 * Syntax : AT&T (GAS), run through the C preprocessor.
 * Target : 32-bit x86; every argument is read from the stack.
 *
 * Build-time variants: the code is specialised by the macros LN, LT,
 * RN and RT.  Exactly one of them is expected to be defined.
 * NOTE(review): these are presumably the four triangular-solve
 * variants (left/right side, forward/backward sweep) -- confirm
 * against the build rules that compile this file.
 *
 * Per tile the kernel
 *   1. accumulates a product of packed A and packed B values over the
 *      k range selected by KK (inner loops unrolled by 4, then a
 *      remainder loop of k & 3 steps),
 *   2. folds the accumulators into the values stored in the packed B
 *      panel (LN/LT) or the packed A panel (RN/RT) with fsubp,
 *   3. applies the small solve step using fmul/fsubrp against entries
 *      of the other panel,
 *   4. writes the result back to that packed panel and to C.
 * NOTE(review): the solve step multiplies by the diagonal entries
 * instead of dividing, so the packed triangle presumably stores
 * reciprocals of the diagonal -- confirm against the packing routine.
 *
 * FLD, FST, SIZE, BASE_SHIFT, ALIGN_4, PROLOGUE, PROFCODE and EPILOGUE
 * are not defined in this file; they are expected to come from
 * common.h.  FST is used as a store that pops the x87 stack (values
 * that must survive are duplicated with "fld %st" first) -- confirm
 * against its definition.
 *
 * Registers saved/restored here: %ebp, %edi, %esi, %ebx.
 * Scratch: %eax, %ecx, %edx and the whole x87 register stack (the 2x2
 * inner loop uses all eight x87 registers at its peak).
 */

#define ASSEMBLER
#include "common.h"

/* PREFETCHW is defined for both branches but not referenced below. */
#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)	/* prefetch distance, in elements */
#define STACK	16			/* bytes taken by the four pushl  */
#define ARGS	16			/* bytes reserved for the locals  */

/* Locals, living in the ARGS area just above the saved registers. */
#define J	 0 + STACK(%esp)	/* remaining column pairs          */
#define KK	 4 + STACK(%esp)	/* running k offset                */
#define AORIG	 8 + STACK(%esp)	/* base of current A panel (LN/RT) */

/* Incoming stack arguments (offset 0 is the return address). */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)	/* not referenced in this file */
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

/* Register roles. */
#define I	%esi			/* remaining row pairs             */
#define B	%ebx			/* current packed B panel          */
#define CO	%edi			/* write cursor into C             */
#define AO	%edx			/* read cursor into packed A       */
#define BO	%ecx			/* read cursor into packed B       */
#define LDC	%ebp			/* column stride of C, in bytes    */

#define PREFETCH_OFFSET 48		/* not referenced in this file     */

	PROLOGUE

	subl	$ARGS, %esp	/* Generate Stack Frame */

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_LDC, LDC
	movl	ARG_B,   B
	sall	$BASE_SHIFT, LDC	/* LDC: elements -> bytes */

	/* Bias both panel pointers by 8 elements; every panel access  */
	/* below uses displacements starting at -8 * SIZE.             */
	addl	$8 * SIZE, A
	addl	$8 * SIZE, B

#ifdef LN
	/* Start past the end: C += M elements, A += M * K elements. */
	movl	M, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	/* Start past the end: B += N * K elements, C += N * LDC bytes. */
	movl	N, %eax
	sall	$BASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B

	movl	N,    %eax
	imull	%ebp, %eax		/* %ebp is LDC */
	addl	%eax, C
#endif

#ifdef RN
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK		/* KK = -OFFSET */
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK		/* KK = N - OFFSET */
#endif

	movl	N,  %eax
	sarl	$1, %eax
	movl	%eax, J			/* movl keeps the flags from sarl */
	je	.L30			/* no column pair at all          */
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Outer loop: one pass per pair of columns (J = N >> 1 passes).       */
/* ------------------------------------------------------------------ */
.L01:
#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	/* Step B back by one two-column panel (2 * K elements). */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

	lea	(, LDC, 2), %eax	/* byte width of two columns of C */

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		/* KK = OFFSET + M */
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK		/* KK = OFFSET */
#endif

	movl	M,  I
	sarl	$1, I
	je	.L20			/* no row pair: go to the odd row */
	ALIGN_4

/* ---- 2x2 tile ----------------------------------------------------- */
.L11:
#ifdef LN
	/* Step AORIG back by one two-row panel (2 * K elements). */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* Skip the first KK k-steps of both panels. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* Four accumulators.  After each k-step below, with a0,a1 the */
	/* two A values and b0,b1 the two B values of that step:       */
	/*   st0 += a0*b0   st1 += a0*b1   st2 += a1*b0   st3 += a1*b1 */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* Trip count: KK (LT/RN) or K - KK (LN/RT), unrolled by 4. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L15
	ALIGN_4

.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k-step 0 */
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* k-step 1 */
	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* k-step 2 */
	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* k-step 3 */
	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addl	$8 * SIZE, AO
	addl	$8 * SIZE, BO

	decl	%eax
	jne	.L12
	ALIGN_4

.L15:
	/* Remainder: (trip count) & 3 single k-steps. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3, %eax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addl	$2 * SIZE, AO
	addl	$2 * SIZE, BO

	decl	%eax
	jne	.L16
	ALIGN_4

.L18:
#if defined(LN) || defined(RT)
	/* Re-point AO/BO at k-step KK - 2.  For this 2x2 tile the    */
	/* step back is 2 for LN and for RT alike.                    */
	movl	KK, %eax
	subl	$2, %eax

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Fold the accumulators into the four stored values of this  */
	/* tile (packed B for LN/LT, packed A for RN/RT).             */
	/* NOTE(review): presumably "stored - accumulated"; the exact */
	/* direction follows the AT&T fsub/fsubr convention of the    */
	/* assembler -- confirm.                                      */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

	/* 2x2 solve step.  Each variant reads three entries of the   */
	/* other panel: two used as scale factors, one as coupling.   */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subl	$2 * SIZE, CO		/* LN walks C backwards */
#endif

	/* Write the four results back to the packed panel (a copy is */
	/* made with "fld %st" before each FST), then drain the x87   */
	/* stack into the 2x2 tile of C.                              */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the K - KK k-steps that were not consumed. */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	/* Advance AORIG by one two-row panel (2 * K elements). */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L11
	ALIGN_4

/* ---- 1x2 tile: odd last row, two columns --------------------------- */
.L20:
	movl	M,  %eax
	andl	$1, %eax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	/* Step AORIG back by one single-row panel (K elements). */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* Two accumulators: st0 += a*b0, st1 += a*b1 per k-step. */
	fldz
	fldz

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L25
	ALIGN_4

.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k-step 0 */
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k-step 1 */
	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k-step 2 */
	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k-step 3 */
	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$4 * SIZE, AO
	addl	$8 * SIZE, BO

	decl	%eax
	jne	.L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3, %eax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$1 * SIZE, AO
	addl	$2 * SIZE, BO

	decl	%eax
	jne	.L26
	ALIGN_4

.L28:
#if defined(LN) || defined(RT)
	/* Step back by the tile extent of the solved side: one row   */
	/* for LN, two columns for RT.                                */
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Fold the accumulators into the two stored values. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/* Left-side variants: a single A entry scales both values. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, CO
#endif

	/* Write back to the packed panel, then to the 1x2 tile of C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* ---- end of one column pair ---------------------------------------- */
.L29:
#ifdef LN
	/* B += 2 * K elements: next two-column panel. */
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	leal	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B			/* BO already sits on the next panel */
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J
	jne	.L01
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Odd last column (N & 1).                                            */
/* ------------------------------------------------------------------ */
.L30:
	movl	N, %eax
	testl	$1, %eax
	je	.L999

#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	/* Step B back by one single-column panel (K elements). */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		/* KK = OFFSET + M */
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK		/* KK = OFFSET */
#endif

	movl	M,  I
	sarl	$1, I
	je	.L40
	ALIGN_4

/* ---- 2x1 tile ------------------------------------------------------ */
.L31:
#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	/* Two accumulators: st0 += a0*b, st1 += a1*b per k-step. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L35
	ALIGN_4

.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k-step 0 */
	FLD	 -8 * SIZE(BO)
	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k-step 1 */
	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k-step 2 */
	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* k-step 3 */
	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$8 * SIZE, AO
	addl	$4 * SIZE, BO

	decl	%eax
	jne	.L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3, %eax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$2 * SIZE, AO
	addl	$1 * SIZE, BO

	decl	%eax
	jne	.L36
	ALIGN_4

.L38:
#if defined(LN) || defined(RT)
	/* Step back by the tile extent of the solved side: two rows  */
	/* for LN, one column for RT.                                 */
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#endif

	/* Fold the accumulators into the two stored values. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

	/* Right-side variants: a single B entry scales both values. */
#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subl	$2 * SIZE, CO
#endif

	/* Write back to the packed panel, then to the 2x1 tile of C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L31
	ALIGN_4

/* ---- 1x1 tile: odd last row of the odd last column ----------------- */
.L40:
	movl	M,  %eax
	andl	$1, %eax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	fldz				/* single accumulator: st0 += a*b */

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L45
	ALIGN_4

.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)
	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addl	$4 * SIZE, AO
	addl	$4 * SIZE, BO

	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3, %eax
	je	.L48
	ALIGN_4

.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addl	$1 * SIZE, AO
	addl	$1 * SIZE, BO

	decl	%eax
	jne	.L46
	ALIGN_4

.L48:
#if defined(LN) || defined(RT)
	/* Re-point AO/BO at k-step KK - 1 (1x1 tile: the step back   */
	/* is 1 for LN and for RT alike).                             */
	movl	KK, %eax
	subl	$1, %eax

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#endif

	/* Fold the accumulator into the stored value. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

	/* 1x1 solve step: scale by a single entry of the other panel. */
#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* ---- end of the odd column ----------------------------------------- */
.L49:
#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	leal	(B, %eax, 1), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* Restore the saved registers, release the locals and return. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE