/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * x87 (FLD/FST) triangular-solve micro-kernel working on 2x2 tiles.
 * Target: 32-bit x86, GNU as AT&T syntax, run through the C preprocessor.
 *
 * Exactly one of LN, LT, RN, RT is expected to be defined by the build.
 * The macro selects the traversal direction of the packed panels and
 * which packed buffer receives the solved values (the B panel for
 * LN/LT, the A panel for RN/RT).
 * NOTE(review): presumably these are the four TRSM variants; confirm
 * against the build rules.
 *
 * Stack frame after the prologue (byte offsets from %esp):
 *   0 .. STACK-1             saved %ebx, %esi, %edi, %ebp
 *   STACK .. STACK+ARGS-1    locals J, KK, KKK (not used below), AORIG
 *   STACK+ARGS               return address
 *   STACK+ARGS+4 and up      arguments M, N, K, ALPHA (not used below),
 *                            A, ARG_B, C, ARG_LDC, OFFSET
 *
 * Register roles:
 *   I   (%esi)  counter for the 2-row tile loops (M >> 1)
 *   B   (%ebx)  base of the current packed B panel
 *   CO  (%edi)  output pointer into C for the current tile
 *   AO  (%edx)  running pointer into packed A
 *   BO  (%ecx)  running pointer into packed B
 *   LDC (%ebp)  leading dimension of C, converted to bytes
 *   %eax        scratch / inner loop counter
 *
 * A and B are advanced by 8 * SIZE in the prologue, so every operand
 * access below uses a displacement biased by -8 * SIZE.
 *
 * x87 stack discipline: every tile starts with an empty x87 stack,
 * pushes one fldz per accumulator and ends with one popping FST per
 * accumulator.
 * NOTE(review): this assumes FLD/FST from common.h are the load and the
 * store-and-pop forms; the stack bookkeeping only balances if they are.
 *
 * Diagonal entries are applied with fmul, not fdiv.
 * NOTE(review): presumably the packing routine stores reciprocals of the
 * diagonal; confirm against the packer.
 */

#define ASSEMBLER
#include "common.h"

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)
#define STACK	16
#define ARGS	16

#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

#define I	%esi
#define B	%ebx
#define CO	%edi
#define AO	%edx
#define BO	%ecx
#define LDC	%ebp

#define PREFETCH_OFFSET 48

	PROLOGUE

	subl	$ARGS, %esp	/* reserve the local slots J, KK, KKK, AORIG */

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_LDC, LDC
	movl	ARG_B,   B
	sall	$BASE_SHIFT, LDC	/* LDC: elements -> bytes */

	/* Bias both panel pointers; all loads below use -8 * SIZE offsets. */
	addl	$8 * SIZE, A
	addl	$8 * SIZE, B

#ifdef LN
	/* LN walks rows backwards: start past the end of C rows and of A. */
	movl	M, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	/* RT walks columns backwards: start past the end of B and of C. */
	movl	N, %eax
	sall	$BASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B

	movl	N,    %eax
	imull	LDC,  %eax
	addl	%eax, C
#endif

#ifdef RN
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* J = N / 2 column pairs; sarl sets ZF, the movl store keeps it. */
	movl	N,  %eax
	sarl	$1, %eax
	movl	%eax, J
	je	.L30
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Outer loop: one pass per pair of columns.                          */
/* ------------------------------------------------------------------ */
.L01:
#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

	lea	(, LDC, 2), %eax	/* byte width of two columns of C */

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* Handle the odd row (if any) first. */
	movl	M,  %eax
	andl	$1, %eax
	je	.L20
	ALIGN_4

/* ---- 1 x 2 tile: one row, two columns ---------------------------- */
.L21:
#ifdef LN
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* Two accumulators: st0 = column 0, st1 = column 1. */
	fldz
	fldz

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L25
	ALIGN_4

/* Dot-product loop, unrolled four times. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-6 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$4 * SIZE,AO
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	and	$3,  %eax
	je	.L28
	ALIGN_4

/* Remainder of the dot product, one step per iteration. */
.L26:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$1 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	.L26
	ALIGN_4

/* Solve step for the 1 x 2 tile. */
.L28:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Combine each accumulator with its packed right-hand side value. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	/* Fixed: was %st(3).  Only three x87 slots are live here (the  */
	/* loaded value and two accumulators), so %st(3) was an empty   */
	/* register and the second accumulator was never updated.       */
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, CO
#endif

	/* Write the solved values back to the packed panel; fld %st keeps */
	/* a copy on the stack for the stores to C that follow.            */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the unread tail of both panels. */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* ---- 2 x 2 tiles: I = M / 2 row pairs ---------------------------- */
.L20:
	movl	M,  I
	sarl	$1, I
	je	.L29
	ALIGN_4

.L11:
#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* Four accumulators, st0..st3 = (r0,c0), (r0,c1), (r1,c0), (r1,c1). */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L15
	ALIGN_4

/* Dot-product loop, unrolled four times.  At its deepest point each  */
/* step has all eight x87 registers in use.                           */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	FLD	-6 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	FLD	-4 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-3 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	FLD	-2 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-1 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addl	$8 * SIZE,AO
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	and	$3,  %eax
	je	.L18
	ALIGN_4

/* Remainder of the dot product, one step per iteration. */
.L16:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addl	$2 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	.L16
	ALIGN_4

/* Solve step for the 2 x 2 tile. */
.L18:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Combine each accumulator with its packed right-hand side value. */
	/* The RN/RT branch visits st(1), st(3), st(2), st(4) because the  */
	/* A panel is read in the other tile order than the B panel.       */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

#ifdef LN
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subl	$2 * SIZE, CO
#endif

	/* Write back to the packed panel, then drain the stack into C.   */
	/* The fxch sequence fixes which value each trailing FST stores.  */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the unread tail of both panels. */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L11
	ALIGN_4

/* End of one column pair: advance B and KK for the next pass. */
.L29:
#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	leal	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J
	jne	.L01
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Trailing single column (N odd).                                    */
/* ------------------------------------------------------------------ */
.L30:
	movl	N,  %eax
	testl	$1, %eax
	je	.L999

#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO
#ifndef RT
	/* Fixed: was "addl %eax, C".  %eax holds no stride here (it is  */
	/* N in LT/RN builds and A in LN builds); one column of C is LDC */
	/* bytes, mirroring the RT "subl LDC, C" above.  C is not read   */
	/* again below, so the old value had no visible effect.          */
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* Handle the odd row (if any) first. */
	movl	M,  %eax
	andl	$1, %eax
	je	.L40
	ALIGN_4

/* ---- 1 x 1 tile -------------------------------------------------- */
.L41:
#ifdef LN
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	fldz			/* single accumulator */

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L45
	ALIGN_4

/* Dot-product loop, unrolled four times. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addl	$4 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	and	$3,  %eax
	je	.L48
	ALIGN_4

/* Remainder of the dot product, one step per iteration. */
.L46:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addl	$1 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	.L46
	ALIGN_4

/* Solve step for the 1 x 1 tile. */
.L48:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the unread tail of both panels. */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

/* ---- 2 x 1 tiles: I = M / 2 row pairs ---------------------------- */
.L40:
	movl	M,  I
	sarl	$1, I
	je	.L49
	ALIGN_4

.L31:
#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	/* Two accumulators: st0 = row 0, st1 = row 1. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L35
	ALIGN_4

/* Dot-product loop, unrolled four times. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(BO)
	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	FLD	-6 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	FLD	-4 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	FLD	-2 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$8 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	and	$3,  %eax
	je	.L38
	ALIGN_4

/* Remainder of the dot product, one step per iteration. */
.L36:
	FLD	-8 * SIZE(BO)

	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$2 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	.L36
	ALIGN_4

/* Solve step for the 2 x 1 tile. */
.L38:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#endif

	/* Combine each accumulator with its packed right-hand side value. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	/* Fixed: was %st(3).  Only three x87 slots are live here (the  */
	/* loaded value and two accumulators), so %st(3) was an empty   */
	/* register and the second accumulator was never updated.       */
	fsubp	%st, %st(2)
#endif

#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subl	$2 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the unread tail of both panels. */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L31
	ALIGN_4

/* End of the trailing column: final B / KK bookkeeping. */
.L49:
#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	leal	(B, %eax, 1), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* Restore the saved registers in reverse push order and return. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE