/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/* 1. Redistributions of source code must retain the above           */
/* copyright notice, this list of conditions and the following       */
/* disclaimer.                                                       */
/*                                                                   */
/* 2. Redistributions in binary form must reproduce the above        */
/* copyright notice, this list of conditions and the following       */
/* disclaimer in the documentation and/or other materials            */
/* provided with the distribution.                                   */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * x87 kernel working on 2x2 / 2x1 / 1x2 / 1x1 tiles.
 *
 * Syntax: AT&T (GAS), run through the C preprocessor.
 * ABI:    register arguments arrive through the ARG1..ARG6 aliases from
 *         common.h; three more values are read from the caller stack
 *         (see ALPHA / LDC / OFFSET below).
 *
 * The code variant is chosen at build time with the LN / LT / RN / RT
 * macros.  NOTE(review): the structure (accumulate a product, subtract it
 * from a packed value, multiply by packed diagonal entries, write the
 * result both back into the packed buffer and into C) looks like a TRSM
 * kernel whose diagonal entries are stored pre-inverted; presumably
 * exactly one of the four macros is defined per build -- confirm against
 * the build rules and the packing routines.
 *
 * Traversal order:
 *   1. if N is odd, one single column of C      (labels .L31 - .L49)
 *   2. N / 2 column pairs                       (labels .L01 - .L29)
 *   Inside each column panel: M / 2 row pairs first, then the odd row.
 *
 * Register roles:
 *   M, N, K   problem sizes                    A, B, C  matrix pointers
 *   LDC       column stride of C, scaled by BASE_SHIFT at entry
 *   I, J      row-pair / column-pair loop counters
 *   AO, BO    moving read pointers into A and B
 *   CO        moving write pointer into C
 *   KK        running offset; selects how many K steps each tile uses
 *   %rax      scratch and inner-loop trip counter
 *
 * A and B are advanced by 8 * SIZE at entry, therefore every operand is
 * addressed as (-8 + i) * SIZE relative to AO / BO.
 *
 * x87 usage: the accumulators live on the register stack.  The 2x2 step
 * reaches the full depth of eight entries.  FLD / FST come from common.h;
 * the bookkeeping in this file assumes FLD pushes and FST stores and pops,
 * which leaves the x87 stack empty when a tile is finished.
 *
 * NOTE(review): GAS keeps the historical AT&T operand convention for the
 * "%st, %st(i)" forms of fsub / fsubr (and their popping variants), so
 * "fsubp %st, %st(i)" leaves %st - %st(i) in the destination.  Keep that
 * in mind when reading the subtraction steps.
 */

#define ASSEMBLER
#include "common.h"

#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
#define LDC	%r10

#define I	%r12
#define J	%r13
#define AO	%r14
#define BO	%r15
#define CO	%rbp

#define KK	%r11
#define AORIG	48(%rsp)		/* spill slot inside the local frame */

#define STACKSIZE 64

/* Caller-stack operands, addressed after the frame has been allocated.  */
/* ALPHA is not referenced anywhere in this file.                        */
#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	emms
#endif

/* ---- frame setup: 64 bytes, six callee-saved registers + AORIG ---- */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	24 + STACKSIZE(%rsp), LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK		/* KK = -OFFSET */
#endif

	/* bias both input pointers; operands below use -8 * SIZE offsets */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	salq	$BASE_SHIFT, LDC

#ifdef LN
	/* start from the far end: C += M, A += M * K (both shifted) */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RN
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK		/* KK = -OFFSET */
#endif

#ifdef RT
	/* start from the far end: B += N * K (shifted), C += N * LDC */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N, %rax
	imulq	LDC, %rax
	addq	%rax, C

	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK		/* KK = N - OFFSET */
#endif

/* ==================================================================== */
/* Part 1: the single leftover column, taken only when N is odd         */
/* ==================================================================== */
	movq	N, %rax
	testq	$1, %rax
	je	.L30

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* step B back by one column of K entries, C back by one column */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, B

	subq	LDC, C
	movq	C, CO
#else
	movq	C, CO
	addq	LDC, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK		/* KK = OFFSET + M */
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK		/* KK = OFFSET */
#endif

	movq	M, I
	sarq	$1, I			/* I = number of row pairs */
	je	.L40
	ALIGN_4

/* ---- 2x1 tile: two rows of A against one column of B ---------------- */
.L31:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* position AO / BO at entry KK of the current panels */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* two accumulators: st(0) pairs with the first AO element of each */
	/* step, st(1) with the second                                     */
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* trip count: KK for LT / RN, K - KK for LN / RT */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* main loop is unrolled four times */
	je	.L35
	ALIGN_4

.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* step 0 */
	FLD	-8 * SIZE(BO)
	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 1 */
	FLD	-7 * SIZE(BO)
	FLD	-6 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 2 */
	FLD	-6 * SIZE(BO)
	FLD	-4 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 3 */
	FLD	-5 * SIZE(BO)
	FLD	-2 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L32
	ALIGN_4

.L35:
	/* remaining (count mod 4) steps, one at a time */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	-8 * SIZE(BO)

	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L36
	ALIGN_4

.L38:
#if defined(LN) || defined(RT)
	/* rewind to the tile's own entries: KK - 2 for LN, KK - 1 for RT */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* combine the two sums with the packed values (B for LN / LT, */
	/* A for RN / RT)                                              */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/* NOTE(review): 2x2 (LN / LT) or 1x1 (RN / RT) solve step; the */
	/* packed diagonal entries are only ever multiplied             */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* write both results back into the packed buffer; each value is */
	/* duplicated first so one copy remains for the store into C     */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* skip the K - KK entries this tile did not consume */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/* ---- 1x1 tile: odd row of the odd column ---------------------------- */
.L40:
	movq	M, %rax
	andq	$1, %rax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	fldz				/* single accumulator */

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$1 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L46
	ALIGN_4

.L48:
#if defined(LN) || defined(RT)
	/* both variants step back by exactly one entry here */
	movq	KK, %rax
	subq	$1, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* ---- end of the single column: advance B and KK ---------------------- */
.L49:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/* ==================================================================== */
/* Part 2: N / 2 column pairs                                           */
/* ==================================================================== */
.L30:
	movq	N, %rax
	sarq	$1, %rax
	movq	%rax, J			/* movq keeps the flags from sarq */
	je	.L999
	ALIGN_4

.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* step B back by two columns of K entries */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	leaq	(, LDC, 2), %rax	/* %rax = 2 * LDC */

#ifdef RT
	subq	%rax, C
	movq	C, CO
#else
	movq	C, CO
	addq	%rax, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK		/* KK = OFFSET + M */
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK		/* KK = OFFSET */
#endif

	movq	M, I
	sarq	$1, I			/* I = number of row pairs */
	je	.L20
	ALIGN_4

/* ---- 2x2 tile -------------------------------------------------------- */
.L11:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* four accumulators; with a0, a1 the two AO elements and b0, b1  */
	/* the two BO elements of a step, after every step:               */
	/*   st(0) += a0 * b0   st(1) += a0 * b1                          */
	/*   st(2) += a1 * b0   st(3) += a1 * b1                          */
	fldz
	fldz
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* main loop is unrolled four times */
	je	.L15
	ALIGN_4

.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* step 0 (the x87 stack is completely full after the fourth load) */
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	/* step 1 */
	FLD	-6 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* step 2 */
	FLD	-4 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-3 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	/* step 3 */
	FLD	-2 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-1 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L18
	ALIGN_4

.L16:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L16
	ALIGN_4

.L18:
#if defined(LN) || defined(RT)
	/* both variants step back by two entries here */
	movq	KK, %rax
	subq	$2, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* combine the four sums with the packed values; note that the */
	/* A-side order pairs -7 with st(3) and -6 with st(2)          */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

	/* NOTE(review): 2x2 solve step, one variant per triangular case */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* write the four results back into the packed buffer, then into */
	/* C; the fxch sequence decides the order of the final stores    */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* skip the K - KK entries this tile did not consume */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/* ---- 1x2 tile: odd row against the column pair ---------------------- */
.L20:
	movq	M, %rax
	andq	$1, %rax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* two accumulators: st(0) pairs with the first BO element of each */
	/* step, st(1) with the second                                     */
	fldz
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* step 0 */
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 1 */
	FLD	-7 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 2 */
	FLD	-6 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* step 3 */
	FLD	-5 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L26
	ALIGN_4

.L28:
#if defined(LN) || defined(RT)
	/* rewind to the tile's own entries: KK - 1 for LN, KK - 2 for RT */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

	/* NOTE(review): 1x1 (LN / LT) or 2x2 (RN / RT) solve step */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* ---- end of one column pair: advance B and KK, loop on J ------------- */
.L29:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/* ---- restore callee-saved registers and release the frame ------------ */
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE