/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Triangular-solve micro-kernel built on the x87 register stack.
 * Syntax: AT&T, run through the C preprocessor.
 *
 * Exactly one of LN / LT / RN / RT is expected to be defined by the
 * build; each selects a different traversal direction and a different
 * 2x2 (or smaller) back/forward substitution.
 *
 * Inputs
 *   M, N, K, A, B, C  - ARG1..ARG6 as defined by common.h
 *   LDC               - read from 24 bytes above the entry %rsp
 *   OFFSET            - read from 32 bytes above the entry %rsp
 *
 * Work is done in tiles.  In the comments below "row" means the
 * unit-stride direction of C (consecutive elements at CO) and
 * "column" means the LDC-strided direction.  Tiles are 2x2, then
 * 1x2 (odd M), then 2x1 / 1x1 for an odd trailing column.
 *
 * For every tile the code
 *   1. accumulates sum_k a[k] * b[k] on the x87 stack,
 *   2. subtracts that sum from the values held in the packed buffer
 *      (B for LN/LT, A for RN/RT),
 *   3. applies the small triangular solve using multiplications by
 *      entries of the other packed buffer (the diagonal entries are
 *      multiplied, not divided by - presumably the packing routine
 *      stores reciprocals; confirm against the packer),
 *   4. writes the result back to the packed buffer and to C.
 *
 * FLD / FST come from common.h.  The stack bookkeeping below only
 * balances if FST is a store-and-pop.
 *
 * NOTE(review): the direction of the non-commutative fsubp / fsubrp
 * forms depends on the assembler's AT&T operand convention; they are
 * kept exactly as originally written.
 */

#define ASSEMBLER
#include "common.h"

/* ---- argument registers ------------------------------------------ */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
#define LDC	%r10

/* ---- working registers ------------------------------------------- */
#define I	%r12		/* remaining 2-row tiles in the current panel */
#define J	%r13		/* remaining 2-column panels                  */
#define AO	%r14		/* running pointer into packed A              */
#define BO	%r15		/* running pointer into packed B              */
#define CO	%rbp		/* running pointer into C                     */

#define KK	%r11		/* current offset along K                     */
#define AORIG	48(%rsp)	/* spill slot: base of the current A panel    */

#define STACKSIZE 64

/* stack-passed arguments, addressed after the frame has been allocated */
#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)

/*
 * %rax = number of k iterations to accumulate for the current tile:
 * KK for LT/RN, K - KK otherwise.
 */
#if defined(LT) || defined(RN)
#define TRSM_LOAD_KCOUNT \
	movq	KK, %rax
#else
#define TRSM_LOAD_KCOUNT \
	movq	K, %rax; \
	subq	KK, %rax
#endif

/*
 * One k-step of the 2x2 tile.  With a_lo/a_hi = AO[ilo]/AO[ihi] and
 * b_lo/b_hi = BO[ilo]/BO[ihi], and the four accumulators resting in
 * st0..st3 between steps:
 *   st0 += a_lo*b_lo   st1 += a_lo*b_hi
 *   st2 += a_hi*b_lo   st3 += a_hi*b_hi
 * Uses all eight x87 slots at its deepest point.
 */
#define TRSM_UPD_2x2(ilo, ihi) \
	FLD	ilo * SIZE(AO); \
	FLD	ilo * SIZE(BO); \
	fld	%st(1); \
	fmul	%st(1), %st; \
	faddp	%st, %st(3); \
	FLD	ihi * SIZE(BO); \
	fmul	%st, %st(2); \
	FLD	ihi * SIZE(AO); \
	fmul	%st, %st(2); \
	fmulp	%st, %st(1); \
	faddp	%st, %st(6); \
	faddp	%st, %st(4); \
	faddp	%st, %st(2)

/*
 * One k-step of the 1x2 tile (one A value, two B values):
 *   st0 += AO[isc]*BO[ilo]   st1 += AO[isc]*BO[ihi]
 */
#define TRSM_UPD_1x2(isc, ilo, ihi) \
	FLD	isc * SIZE(AO); \
	FLD	ilo * SIZE(BO); \
	fmul	%st(1), %st; \
	faddp	%st, %st(2); \
	FLD	ihi * SIZE(BO); \
	fmulp	%st, %st(1); \
	faddp	%st, %st(2)

/*
 * One k-step of the 2x1 tile (one B value, two A values):
 *   st0 += BO[isc]*AO[ilo]   st1 += BO[isc]*AO[ihi]
 */
#define TRSM_UPD_2x1(isc, ilo, ihi) \
	FLD	isc * SIZE(BO); \
	FLD	ilo * SIZE(AO); \
	fmul	%st(1), %st; \
	faddp	%st, %st(2); \
	FLD	ihi * SIZE(AO); \
	fmulp	%st, %st(1); \
	faddp	%st, %st(2)

/* One k-step of the 1x1 tile: st0 += AO[isc]*BO[isc]. */
#define TRSM_UPD_1x1(isc) \
	FLD	isc * SIZE(AO); \
	FLD	isc * SIZE(BO); \
	fmulp	%st, %st(1); \
	faddp	%st, %st(1)

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	emms
#endif

	/* Allocate the frame and spill the callee-saved registers. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	24 + STACKSIZE(%rsp), LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

	/* Bias both panel pointers by 8 elements; every panel access
	   below compensates with an index in the range -8..-1. */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	/* LDC <<= BASE_SHIFT (element count -> byte stride, assuming
	   SIZE == 1 << BASE_SHIFT as defined in common.h). */
	salq	$BASE_SHIFT, LDC

#ifdef LN
	/* Start from the far end: C += M elements, A += M*K elements. */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
	/* Start from the far end: B += N*K elements, C += N*LDC. */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N, %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#ifdef RN
	/* KK = -OFFSET */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

#ifdef RT
	/* KK = N - OFFSET */
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* J = N / 2; the flags tested by je are those left by sarq
	   (movq does not modify them). */
	movq	N, %rax
	sarq	$1, %rax
	movq	%rax, J
	je	.Lnlast
	ALIGN_4

/* ================================================================== */
/* Panels of two columns                                               */
/* ================================================================== */
.Lnpair_loop:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back by one 2-column panel (2*K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	leaq	(, LDC, 2), %rax	/* byte size of two columns of C */

#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M / 2; skip the 2x2 tiles when there are none. */
	movq	M, I
	sarq	$1, I
	je	.Lt12_check
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 2x2 tile                                                            */
/* ------------------------------------------------------------------ */
.Lt22_tile:
#ifdef LN
	/* Step AORIG back by one 2-row panel (2*K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* Begin KK k-steps into both panels. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Four zeroed accumulators in st0..st3. */
	fldz
	fldz
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	TRSM_LOAD_KCOUNT
	sarq	$2, %rax		/* k-steps handled four at a time */
	je	.Lt22_ktail
	ALIGN_4

.Lt22_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	TRSM_UPD_2x2(-8, -7)
	TRSM_UPD_2x2(-6, -5)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	TRSM_UPD_2x2(-4, -3)
	TRSM_UPD_2x2(-2, -1)

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.Lt22_k4
	ALIGN_4

.Lt22_ktail:
	TRSM_LOAD_KCOUNT
	andq	$3, %rax		/* leftover k-steps */
	je	.Lt22_solve
	ALIGN_4

.Lt22_k1:
	TRSM_UPD_2x2(-8, -7)

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.Lt22_k1
	ALIGN_4

.Lt22_solve:
#if defined(LN) || defined(RT)
	/* Re-point AO/BO at the 2x2 diagonal block, (KK - 2) steps in. */
	movq	KK, %rax
	subq	$2, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Combine the accumulated products with the stored values. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	/* The packed-A order pairs index -7 with st2 and -6 with st1. */
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

#ifdef LN
	/* Scale st2/st3 by AO[-5], eliminate with AO[-6], then scale
	   st0/st1 by AO[-8]. */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	/* Scale st0/st1 by AO[-8], eliminate with AO[-7], then scale
	   st2/st3 by AO[-5]. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	/* Scale st0/st2 by BO[-8], eliminate with BO[-7], then scale
	   st1/st3 by BO[-5]. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	/* Scale st1/st3 by BO[-5], eliminate with BO[-6], then scale
	   st0/st2 by BO[-8]. */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Write each result back to the packed buffer (duplicate, store,
	   rotate the next one to the top), then drain the four values
	   into C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) k-steps of both panels. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* Advance AORIG by one 2-row panel (2*K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.Lt22_tile
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 1x2 tile (only when M is odd)                                       */
/* ------------------------------------------------------------------ */
.Lt12_check:
	movq	M, %rax
	andq	$1, %rax
	je	.Lnpair_next
	ALIGN_4

.Lt12_tile:
#ifdef LN
	/* Step AORIG back by one 1-row panel (K elements). */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Two zeroed accumulators in st0, st1. */
	fldz
	fldz

	TRSM_LOAD_KCOUNT
	sarq	$2, %rax
	je	.Lt12_ktail
	ALIGN_4

.Lt12_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	TRSM_UPD_1x2(-8, -8, -7)
	TRSM_UPD_1x2(-7, -6, -5)
	TRSM_UPD_1x2(-6, -4, -3)
	TRSM_UPD_1x2(-5, -2, -1)

	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.Lt12_k4
	ALIGN_4

.Lt12_ktail:
	TRSM_LOAD_KCOUNT
	andq	$3, %rax
	je	.Lt12_solve
	ALIGN_4

.Lt12_k1:
	TRSM_UPD_1x2(-8, -8, -7)

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.Lt12_k1
	ALIGN_4

.Lt12_solve:
#if defined(LN) || defined(RT)
	/* Diagonal block sits (KK - 1) steps in for LN, (KK - 2) for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
	/* Single-row solve: scale both values by AO[-8]. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the exchange the second-column value is on top. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* ------------------------------------------------------------------ */
/* End of one 2-column panel                                           */
/* ------------------------------------------------------------------ */
.Lnpair_next:
#ifdef LN
	/* B += 2*K elements */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.Lnpair_loop
	ALIGN_4

/* ================================================================== */
/* Trailing single column (only when N is odd)                         */
/* ================================================================== */
.Lnlast:
	movq	N, %rax
	testq	$1, %rax
	je	.Lreturn

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back by one 1-column panel (K elements) and C back by
	   one column. */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B

	subq	LDC, C
#endif
	movq	C, CO
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M, I
	sarq	$1, I
	je	.Lt11_check
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 2x1 tile                                                            */
/* ------------------------------------------------------------------ */
.Lt21_tile:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Two zeroed accumulators in st0, st1. */
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	TRSM_LOAD_KCOUNT
	sarq	$2, %rax
	je	.Lt21_ktail
	ALIGN_4

.Lt21_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	TRSM_UPD_2x1(-8, -8, -7)
	TRSM_UPD_2x1(-7, -6, -5)
	TRSM_UPD_2x1(-6, -4, -3)
	TRSM_UPD_2x1(-5, -2, -1)

	addq	$8 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.Lt21_k4
	ALIGN_4

.Lt21_ktail:
	TRSM_LOAD_KCOUNT
	andq	$3, %rax
	je	.Lt21_solve
	ALIGN_4

.Lt21_k1:
	TRSM_UPD_2x1(-8, -8, -7)

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.Lt21_k1
	ALIGN_4

.Lt21_solve:
#if defined(LN) || defined(RT)
	/* Diagonal block sits (KK - 2) steps in for LN, (KK - 1) for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* Single-column solve: scale both values by BO[-8]. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* Single-column solve: scale both values by BO[-8]. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the exchange the second-row value is on top. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.Lt21_tile
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 1x1 tile (only when M is odd)                                       */
/* ------------------------------------------------------------------ */
.Lt11_check:
	movq	M, %rax
	andq	$1, %rax
	je	.Lnlast_done
	ALIGN_4

.Lt11_tile:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* One zeroed accumulator in st0. */
	fldz

	TRSM_LOAD_KCOUNT
	sarq	$2, %rax
	je	.Lt11_ktail
	ALIGN_4

.Lt11_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	TRSM_UPD_1x1(-8)
	TRSM_UPD_1x1(-7)
	TRSM_UPD_1x1(-6)
	TRSM_UPD_1x1(-5)

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.Lt11_k4
	ALIGN_4

.Lt11_ktail:
	TRSM_LOAD_KCOUNT
	andq	$3, %rax
	je	.Lt11_solve
	ALIGN_4

.Lt11_k1:
	TRSM_UPD_1x1(-8)

	addq	$1 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.Lt11_k1
	ALIGN_4

.Lt11_solve:
#if defined(LN) || defined(RT)
	/* Diagonal entry sits (KK - 1) steps in. */
	movq	KK, %rax
	subq	$1, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* ------------------------------------------------------------------ */
/* End of the trailing column                                          */
/* ------------------------------------------------------------------ */
.Lnlast_done:
#ifdef LN
	/* B += K elements */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/* ================================================================== */
/* Restore callee-saved registers, release the frame, return           */
/* ================================================================== */
.Lreturn:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE