/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * x87 micro-kernel working on packed A and B panels and writing C in
 * 2x2, 2x1, 1x2 and 1x1 tiles.  Exactly one of LN, LT, RN or RT is
 * expected to be defined by the build; it selects the traversal
 * direction and the solve step applied to every tile.
 * NOTE(review): presumably one member of the TRSM kernel family
 * (left/right, normal/transposed) -- confirm against the kernel build
 * rules.
 *
 * Syntax: GAS AT&T, 32-bit x86, run through the C preprocessor.
 *
 * Control flow
 *   1. If N is odd, one leftover line of C in the LDC-strided direction
 *      is processed first: M >> 1 two-element tiles (.L31), then the
 *      M & 1 tail (.L41).
 *   2. Then N >> 1 passes over pairs of such lines (.L01): M >> 1
 *      2x2 tiles (.L11), then the M & 1 tail (.L21).
 *
 * Stack frame after the prologue (offsets from %esp)
 *   0 .. STACK-1            four saved registers (%ebx, %esi, %edi, %ebp)
 *   STACK .. STACK+ARGS-1   locals J, KK, KKK (unused here), AORIG
 *   STACK+ARGS              return address
 *   STACK+ARGS+4 ..         caller arguments M, N, K, ALPHA, A, B, C,
 *                           LDC, OFFSET
 *
 * Register roles
 *   I   (%esi)  remaining two-element tiles along M
 *   B   (%ebx)  base of the current packed B panel
 *   CO  (%edi)  write pointer into C
 *   AO  (%edx)  running pointer into packed A
 *   BO  (%ecx)  running pointer into packed B
 *   LDC (%ebp)  ARG_LDC << BASE_SHIFT
 *   %eax        scratch / inner loop counter
 *
 * The A argument slot and B are advanced by 8 * SIZE once at entry, so
 * every data access below uses displacements starting at -8 * SIZE.
 *
 * Comments describing x87 stack contents assume that FLD pushes one
 * operand and FST stores st(0) and pops it; both macros come from
 * common.h -- confirm there.
 */

#define ASSEMBLER
#include "common.h"

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)
#define STACK	16
#define ARGS	16

/* Locals, placed just above the four saved registers. */
#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)

/* Caller arguments, above the locals and the return address. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

#define I	%esi
#define B	%ebx
#define CO	%edi
#define AO	%edx
#define BO	%ecx
#define LDC	%ebp

#define PREFETCH_OFFSET 48

	PROLOGUE

	subl	$ARGS, %esp	/* reserve the J / KK / KKK / AORIG slots */

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_LDC, LDC
	movl	ARG_B,   B
	sall	$BASE_SHIFT, LDC	/* LDC: elements -> bytes */

	/* Bias A (argument slot) and B so that data sits at -8 * SIZE. */
	addl	$8 * SIZE, A
	addl	$8 * SIZE, B


#ifdef LN
	/* C += M elements; A += M * K elements. */
	movl	M, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	/* B += N * K elements; C += N * LDC bytes. */
	movl	N, %eax
	sall	$BASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B

	movl	N,    %eax
	imull	%ebp, %eax
	addl	%eax, C
#endif

#ifdef RN
	/* KK = -OFFSET */
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK
#endif

#ifdef RT
	/* KK = N - OFFSET */
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK
#endif

/* ------------------------------------------------------------------ */
/* Leftover single line of C when N is odd.                            */
/* ------------------------------------------------------------------ */
	movl	N,   %eax
	testl	$1, %eax
	je	.L30

#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	/* Step B back by one K-long line. */
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  I
	sarl	$1, I
	je	.L40
	ALIGN_4

/* Two-element tile: two values from A against one value from B per step. */
.L31:
#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + KK elements. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	/* Two accumulators: st(0) and st(1). */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* Trip count: KK for LT/RN, K - KK otherwise. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L35
	ALIGN_4

/* Main loop, unrolled four steps: consumes 8 values of A and 4 of B. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)	/* first accumulator += a * b */

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)	/* second accumulator += a * b; b consumed */

	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$8 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax	/* steps left over from the unroll by 4 */
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$2 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	 .L36
	ALIGN_4

/* Solve step and write-back for the two-element tile. */
.L38:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 1), BO
#endif

	/* Combine each accumulator with the matching packed value     */
	/* (taken from B for LN/LT, from A for RN/RT) via fsubp.       */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
       /* Second value is scaled first, then eliminated from the first. */
       FLD	-5 * SIZE(AO)
       fmulp	%st, %st(2)

       FLD	-6 * SIZE(AO)
       fmul	%st(2), %st

       fsubrp	%st, %st(1)
       FLD	-8 * SIZE(AO)
       fmulp	%st, %st(1)
#endif

#ifdef LT
       /* First value is scaled first, then eliminated from the second. */
       FLD	-8 * SIZE(AO)
       fmulp	%st, %st(1)

       FLD	-7 * SIZE(AO)
       fmul	%st(1), %st

       fsubrp	%st, %st(2)

       FLD	-5 * SIZE(AO)
       fmulp	%st, %st(2)
#endif

#ifdef RN
       /* Both values scaled by the single packed B value. */
       FLD	-8 * SIZE(BO)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)
#endif

#ifdef RT
       FLD	-8 * SIZE(BO)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)
#endif

#ifdef LN
	subl	$2 * SIZE, CO
#endif

	/* Write the results back into the packed panel (duplicates   */
	/* are stored, so both values stay on the x87 stack) ...       */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* ... and into C; after the fxch the second value is on top. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK steps of both panels. */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       addl	%eax, AORIG
#endif

	decl	I
	jne	.L31
	ALIGN_4

/* Single-element tail when M is odd. */
.L40:
	movl	M,  %eax
	andl	$1, %eax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#else
	movl	B, BO
#endif

	fldz			/* single accumulator */

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L45
	ALIGN_4

/* Dot-product loop, unrolled four steps. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addl	$4 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax
	je	.L48
	ALIGN_4

.L46:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addl	$1 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	 .L46
	ALIGN_4

/* Solve step and write-back for the single-element tile. */
.L48:
#if defined(LN) || defined(RT)
	/* Both LN and RT step back by one element here. */
	movl	KK, %eax
	subl	$1, %eax

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

#ifdef LN
       FLD	-8 * SIZE(AO)
       fmulp	%st, %st(1)
#endif

#ifdef LT
       FLD	-8 * SIZE(AO)
       fmulp	%st, %st(1)
#endif

#ifdef RN
       FLD	-8 * SIZE(BO)
       fmulp	%st, %st(1)
#endif

#ifdef RT
       FLD	-8 * SIZE(BO)
       fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 1), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       addl	%eax, AORIG
#endif
	ALIGN_4

/* End of the single leftover line: advance B and KK. */
.L49:
#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       leal	(B, %eax, 1), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Main part: N >> 1 passes, each covering two lines of C.             */
/* ------------------------------------------------------------------ */
.L30:
	movl	N,   %eax
	sarl	$1,  %eax
	movl	%eax, J
	je	.L999
	ALIGN_4

.L01:
#if defined(LT) || defined(RN)
	movl	A, AO
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	/* Step B back by two K-long lines. */
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, B
#endif

	lea	(, LDC, 2), %eax	/* byte span of two lines of C */

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  I
	sarl	$1, I
	je	.L20
	ALIGN_4

/* 2x2 tile. */
.L11:
#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* Four accumulators.  With a0/a1 the two A values and b0/b1  */
	/* the two B values of one step, the loop below keeps          */
	/*   st(0) += a0*b0   st(1) += a0*b1                           */
	/*   st(2) += a1*b0   st(3) += a1*b1                           */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
 	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
 	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L15
	ALIGN_4

/* Main loop, unrolled four steps.  Each step peaks at all eight x87   */
/* stack slots (4 accumulators + 4 temporaries) right after the load   */
/* of the second A value.                                              */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)	/* a0*b0 */

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)	/* a0 slot becomes a0*b1 */

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)	/* b0 slot becomes a1*b0 */
	fmulp	 %st, %st(1)	/* b1 slot becomes a1*b1 */

	faddp	 %st, %st(6)	/* a1*b1 */
	faddp	 %st, %st(4)	/* a1*b0 */
	faddp	 %st, %st(2)	/* a0*b1 */

	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addl	$8 * SIZE,AO
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addl	$2 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	 .L16
	ALIGN_4

/* Solve step and write-back for the 2x2 tile. */
.L18:
#if defined(LN) || defined(RT)
	/* Both LN and RT step back by two elements here. */
	movl	KK, %eax
	subl	$2, %eax

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Pair each accumulator with its packed value.  The B panel   */
	/* is read in accumulator order; the A panel is read with the  */
	/* middle two accumulators swapped (st(3) before st(2)).       */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

#ifdef LN
       /* Scale st(2)/st(3), eliminate them from st(0)/st(1), then    */
       /* scale st(0)/st(1).                                          */
       FLD	-5 * SIZE(AO)
       fmul	%st, %st(3)
       fmulp	%st, %st(4)

       FLD	-6 * SIZE(AO)
       fmul	%st(3), %st
       FLD	-6 * SIZE(AO)
       fmul	%st(5), %st

       fsubrp	%st, %st(3)
       fsubrp	%st, %st(1)

       FLD	-8 * SIZE(AO)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)
#endif

#ifdef LT
       /* Scale st(0)/st(1), eliminate them from st(2)/st(3), then    */
       /* scale st(2)/st(3).                                          */
       FLD	-8 * SIZE(AO)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)

       FLD	-7 * SIZE(AO)
       fmul	%st(1), %st
       FLD	-7 * SIZE(AO)
       fmul	%st(3), %st

       fsubrp	%st, %st(5)
       fsubrp	%st, %st(3)

       FLD	-5 * SIZE(AO)
       fmul	%st, %st(3)
       fmulp	%st, %st(4)
#endif

#ifdef RN
       /* Scale st(0)/st(2), eliminate them from st(1)/st(3), then    */
       /* scale st(1)/st(3).                                          */
       FLD	-8 * SIZE(BO)
       fmul	%st, %st(1)
       fmulp	%st, %st(3)

       FLD	-7 * SIZE(BO)
       fmul	%st(1), %st
       FLD	-7 * SIZE(BO)
       fmul	%st(4), %st

       fsubrp	%st, %st(5)
       fsubrp	%st, %st(2)

       FLD	-5 * SIZE(BO)
       fmul	%st, %st(2)
       fmulp	%st, %st(4)
#endif

#ifdef RT
       /* Scale st(1)/st(3), eliminate them from st(0)/st(2), then    */
       /* scale st(0)/st(2).                                          */
       FLD	-5 * SIZE(BO)
       fmul	%st, %st(2)
       fmulp	%st, %st(4)

       FLD	-6 * SIZE(BO)
       fmul	%st(2), %st
       FLD	-6 * SIZE(BO)
       fmul	%st(5), %st

       fsubrp	%st, %st(4)
       fsubrp	%st, %st(1)

       FLD	-8 * SIZE(BO)
       fmul	%st, %st(1)
       fmulp	%st, %st(3)
#endif

#ifdef LN
	subl	$2 * SIZE, CO
#endif

	/* Write the four results back into the packed panel, then to */
	/* C.  The fxch sequence permutes the x87 stack, so the order  */
	/* of the C stores below differs between the two branches.     */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK steps of both panels. */
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       addl	%eax, AORIG
#endif

	decl	I
	jne	.L11
	ALIGN_4

/* 1x2 tail when M is odd. */
.L20:
	movl	M,  %eax
	andl	$1, %eax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + 2 * KK elements. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#else
	movl	B, BO
#endif

	/* Two accumulators: st(0) += a*b0, st(1) += a*b1. */
	fldz
	fldz

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	je	.L25
	ALIGN_4

/* Main loop, unrolled four steps: consumes 4 values of A and 8 of B. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$4 * SIZE,AO
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	andl	$3,  %eax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addl	$1 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	 .L26
	ALIGN_4

/* Solve step and write-back for the 1x2 tile. */
.L28:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	sall	$BASE_SHIFT, %eax

	movl	AORIG, AO
	leal	(AO, %eax, 1), AO
	leal	(B,  %eax, 2), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
       /* Both values scaled by the single packed A value. */
       FLD	-8 * SIZE(AO)
       fmul	%st, %st(1)
       fmulp	%st, %st(2)
#endif

#ifdef RN
       /* Scale the first value, eliminate it from the second, then   */
       /* scale the second.                                           */
       FLD	-8 * SIZE(BO)
       fmulp	%st, %st(1)

       FLD	-7 * SIZE(BO)
       fmul	%st(1), %st

       fsubrp	%st, %st(2)

       FLD	-5 * SIZE(BO)
       fmulp	%st, %st(2)
#endif

#ifdef RT
       /* Scale the second value, eliminate it from the first, then   */
       /* scale the first.                                            */
       FLD	-5 * SIZE(BO)
       fmulp	%st, %st(2)

       FLD	-6 * SIZE(BO)
       fmul	%st(2), %st

       fsubrp	%st, %st(1)

       FLD	-8 * SIZE(BO)
       fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch the second value is on top of the stack. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addl	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 2), BO
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
       movl	K, %eax
       sall	$0 + BASE_SHIFT, %eax
       addl	%eax, AORIG
#endif
	ALIGN_4

/* End of one two-line pass: advance B and KK, then loop on J. */
.L29:
#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       leal	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	BO, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J
	jne	.L01
	ALIGN_4

/* Restore the saved registers in reverse push order and drop the locals. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE