/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Triangular-solve micro-kernel on the x87 register stack, AT&T syntax,
 * x86-64, run through the C preprocessor.
 *
 * Variant selection: the preprocessor symbols LN, LT, RN and RT choose the
 * pointer walk direction and the solve sequence.
 * NOTE(review): the code reads as if the build defines exactly one of
 * them per object file -- confirm against the build rules.
 *
 * Tiling (visible from the loop counters below):
 *   - if N is odd, one single column is processed first
 *       .L31  : M/2 tiles of 2 rows x 1 column
 *       .L41  : one 1 x 1 tile when M is odd
 *   - then N/2 column pairs (.L01)
 *       .L11  : M/2 tiles of 2 rows x 2 columns
 *       .L21  : one 1 x 2 tile when M is odd
 *
 * Every tile follows the same three steps:
 *   1. accumulate products of A and B elements into x87 accumulators
 *      (main loop unrolled by 4, remainder loop for count & 3);
 *   2. combine the accumulators with the packed values (fsubp), then
 *      apply the small triangular factor using multiplications only
 *      (NOTE(review): the diagonal entries are multiplied, never divided,
 *      so the packed data presumably holds reciprocals -- confirm with
 *      the packing routine);
 *   3. write the result back to the packed buffer and to C.
 *
 * FLD and FST come from common.h.  The store sequences below issue
 * several consecutive FST to different addresses and duplicate the top
 * of stack (fld %st) before an FST they want to survive, so the code
 * relies on FST popping the x87 stack.
 *
 * A and B are advanced by 8 * SIZE on entry, which is why every operand
 * access uses offsets in the range -8 * SIZE .. -1 * SIZE.
 */

#define ASSEMBLER
#include "common.h"

/* Arguments (ARGn are provided by common.h). */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
/* Loaded from the caller stack in the prologue, then scaled by BASE_SHIFT. */
#define LDC	%r10

/* I counts 2-row tiles, J counts column pairs. */
#define I	%r12
#define J	%r13
/* Running pointers into A, B and C for the current tile. */
#define AO	%r14
#define BO	%r15
#define	CO	%rbp

/* Running offset used to size the product loops and to locate the tile. */
#define KK	%r11
/* Stack slot holding the A base pointer for the LN / RT walks. */
#define AORIG	 48(%rsp)

#define STACKSIZE 64

/* Stack-passed arguments, addressed past the local frame.            */
/* ALPHA is defined for completeness; nothing below references it.    */
#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)

	PROLOGUE
	PROFCODE

	/* Local frame: six saved registers at 0..40, AORIG at 48. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	24 + STACKSIZE(%rsp), LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

	/* Bias A and B so the loops can use negative displacements. */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	/* LDC: element count -> byte stride. */
	salq	$BASE_SHIFT, LDC

#ifdef LN
	/* LN walks rows backwards: start C at +M elements, A at +M*K. */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
	/* RT walks columns backwards: start B at +N*K, C at +N*LDC. */
	movq	N,    %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N,   %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#ifdef RN
	/* KK = -OFFSET */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

#ifdef RT
	/* KK = N - OFFSET */
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* ---------------- single column, only when N is odd ---------------- */
	movq	N,   %rax
	testq	$1,  %rax
	je	.L30

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back by one packed column (K elements). */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

#ifdef RT
	subq	LDC, C
#endif
	movq	C, CO
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M / 2; sarq sets ZF, so je skips the 2x1 tiles when I == 0. */
	movq	M,  I
	sarq	$1, I
	je	.L40
	ALIGN_4

/* ---- 2 x 1 tile ---- */
.L31:
#ifdef LN
	/* Step AORIG back by two packed rows (2 * K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK * 2 elements, BO = B + KK * 1 element. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Two accumulators: st(0) collects a0*b, st(1) collects a1*b. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* Product length: KK for LT / RN, K - KK otherwise. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

/* Main product loop, four steps per iteration. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L32
	ALIGN_4

.L35:
	/* Remainder: product length & 3. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L36
	ALIGN_4

/* Solve and store the 2 x 1 tile. */
.L38:
#if defined(LN) || defined(RT)
	/* Rewind to the tile itself: KK - 2 for LN, KK - 1 for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* Combine the accumulators with the packed values. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	/* 2x2 factor from A, second unknown first. */
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	/* 2x2 factor from A, first unknown first. */
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* 1x1 factor from B scales both values. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* 1x1 factor from B scales both values. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Write back to the packed buffer; fld %st keeps a copy for C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch the second value is on top, so it is stored first. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK steps of this packed panel. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/* ---- 1 x 1 tile, only when M is odd ---- */
.L40:
	movq	M,  %rax
	andq	$1, %rax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	/* Step AORIG back by one packed row (K elements). */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Single accumulator in st(0). */
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$4 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$1 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	 .L46
	ALIGN_4

/* Solve and store the 1 x 1 tile. */
.L48:
#if defined(LN) || defined(RT)
	movq	KK, %rax
	/* Both arms are the same here because the tile is 1 x 1. */
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* End of the single column: advance B and KK by one column. */
.L49:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/* ---------------- column pairs: J = N / 2 ---------------- */
.L30:
	movq	N,   %rax
	sarq	$1,  %rax
	/* movq leaves the flags alone, so je still tests the sarq result. */
	movq	%rax, J
	je	.L999
	ALIGN_4

.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back by two packed columns (2 * K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	/* rax = 2 * LDC, the byte distance covered by a column pair. */
	lea	(, LDC, 2), %rax

#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M,  I
	sarq	$1, I
	je	.L20
	ALIGN_4

/* ---- 2 x 2 tile ---- */
.L11:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/*
	 * Four accumulators.  With a0,a1 the two A values and b0,b1 the two
	 * B values of one step, the loop below adds
	 *   st(0) += a0*b0   st(1) += a0*b1   st(2) += a1*b0   st(3) += a1*b1
	 * Together with the four temporaries inside a step this fills all
	 * eight x87 registers, so nothing else may be left on the stack here.
	 */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L15
	ALIGN_4

/* Main product loop, four steps per iteration. */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L16
	ALIGN_4

/* Solve and store the 2 x 2 tile. */
.L18:
#if defined(LN) || defined(RT)
	movq	KK, %rax
	/* Both arms are the same here because the tile is 2 x 2. */
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/*
	 * Combine the accumulators with the packed values.  The B-side
	 * buffer is visited in accumulator order; the A-side buffer swaps
	 * the two middle entries (note the st(3) / st(2) targets).
	 */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

#ifdef LN
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/*
	 * Write the four results to the packed buffer, then to C.  The fxch
	 * sequence permutes the stack, which is why the C stores below are
	 * not in address order; the order must match the permutation.
	 */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/* ---- 1 x 2 tile, only when M is odd ---- */
.L20:
	movq	M,  %rax
	andq	$1, %rax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Two accumulators: st(0) collects a*b0, st(1) collects a*b1. */
	fldz
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$4 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$1 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	 .L26
	ALIGN_4

/* Solve and store the 1 x 2 tile. */
.L28:
#if defined(LN) || defined(RT)
	/* Rewind to the tile itself: KK - 1 for LN, KK - 2 for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
	/* 1x1 factor from A scales both values. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* 2x2 factor from B, first unknown first. */
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* 2x2 factor from B, second unknown first. */
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch the second-column value is on top. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* End of a column pair: advance B and KK by two columns. */
.L29:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/* Restore the saved registers, release the frame and return. */
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE