/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * 2x2 blocked solve kernel for the x87 FPU stack.
 *
 * Syntax : GNU as, AT&T operand order, preprocessed with cpp.
 * Target : x86-64; the entry sequence reads its 7th/8th arguments from
 *          the stack and preserves rbx, rbp, r12-r15.
 *
 * The variant is chosen at build time through LN, LT, RN or RT
 * (presumably exactly one of them is defined -- confirm in the build
 * rules).  For every tile the kernel
 *   1. accumulates a small product of packed A and packed B values over
 *      a k-range derived from KK,
 *   2. combines it by subtraction with the values already stored in the
 *      packed B buffer (LN/LT) or packed A buffer (RN/RT),
 *   3. scales / eliminates with entries of the other packed operand
 *      (multiplication only, so the diagonal entries are presumably
 *      stored pre-inverted by the packing code -- TODO confirm),
 *   4. writes the result back to the packed buffer and to C.
 *
 * FLD, FST, SIZE, BASE_SHIFT, ARG1..ARG6, PROLOGUE, PROFCODE, EPILOGUE
 * and ALIGN_4 come from common.h and are not visible here.  The stack
 * bookkeeping below assumes FLD pushes one element and FST stores one
 * element and pops it -- confirm in common.h.
 *
 * Register roles (from the defines below):
 *   M, N, K   ARG1..ARG3   problem sizes
 *   A, B, C   ARG4..ARG6   packed A, packed B, output matrix
 *   LDC       r10          column stride of C, converted to bytes
 *   I, J      r12, r13     tile counters over M and N
 *   AO, BO    r14, r15     running pointers into A and B
 *   CO        rbp          running pointer into C
 *   KK        r11          current offset inside the k-range
 *   rax                    scratch / inner loop counter
 *
 * Stack frame (STACKSIZE bytes):
 *    0 rbx    8 rbp   16 r12   24 r13   32 r14   40 r15   48 AORIG
 * Caller stack arguments, relative to the adjusted rsp:
 *    8 + STACKSIZE  ALPHA   (defined, never referenced in this file)
 *   24 + STACKSIZE  ldc
 *   32 + STACKSIZE  OFFSET
 */

#define ASSEMBLER
#include "common.h"

#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
#define LDC	%r10

#define I	%r12
#define J	%r13
#define AO	%r14
#define BO	%r15
#define CO	%rbp

#define KK	%r11
#define AORIG	48(%rsp)

#define STACKSIZE 64

#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)

	PROLOGUE
	PROFCODE

	/* Allocate the frame and save the callee-saved registers. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	24 + STACKSIZE(%rsp), LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

	/* Bias both packed pointers by 8 elements; every load and store */
	/* below uses displacements -8 .. -1 relative to AO / BO.        */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	salq	$BASE_SHIFT, LDC		/* LDC: elements -> bytes */

#ifdef LN
	/* Start from the far end: C += M elements, A += M * K elements. */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
	/* Start from the far end: B += N * K elements, C += N columns. */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N, %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#ifdef RN
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK			/* KK = -OFFSET */
#endif

#ifdef RT
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK			/* KK = N - OFFSET */
#endif

	/* J = N / 2 column pairs; movq leaves the flags of sarq intact. */
	movq	N, %rax
	sarq	$1, %rax
	movq	%rax, J
	je	.L30
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Outer loop: one pair of C columns per iteration.                    */
/* ------------------------------------------------------------------ */
.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B				/* B -= 2 * K elements */
#endif

	lea	(, LDC, 2), %rax		/* rax = two columns of C in bytes */

#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK			/* KK = OFFSET + M */
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK			/* KK = OFFSET */
#endif

	movq	M, I
	sarq	$1, I				/* I = M / 2 row pairs */
	je	.L20
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 2x2 tile.                                                           */
/* Accumulators after the product loops (a0,a1 = consecutive elements  */
/* at AO, b0,b1 = consecutive elements at BO, summed over k):          */
/*   st(0) = a0*b0   st(1) = a0*b1   st(2) = a1*b0   st(3) = a1*b1     */
/* ------------------------------------------------------------------ */
.L11:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG			/* step back 2 * K elements */
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Clear the four accumulators. */
	fldz
	fldz
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* Trip count: KK for LT/RN, K - KK for LN/RT. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax			/* unrolled by four */
	je	.L15
	ALIGN_4

/* Unrolled product loop: four k steps per pass.  Each step loads      */
/* a0, b0, b1, a1, forms the four products on top of the accumulators  */
/* and folds them in with faddp at depths 6, 4 and 2 (a0*b0 is folded  */
/* in first at depth 3).                                               */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	FLD	-4 * SIZE(AO)
	FLD	-4 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-3 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	FLD	-2 * SIZE(AO)
	FLD	-2 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-1 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	and	$3, %rax			/* leftover k steps */
	je	.L18
	ALIGN_4

/* Remainder product loop: one k step per pass. */
.L16:
	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L16
	ALIGN_4

/* Solve step for the 2x2 tile. */
.L18:
#if defined(LN) || defined(RT)
	/* Re-point AO / BO at the block (KK - 2); both branches are the */
	/* same for this tile shape and are kept parallel to the others.  */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 2), BO
#endif

	/* Combine the stored values with the accumulators by subtraction. */
	/* NOTE(review): with GNU as AT&T semantics fsubp %st, %st(i) is   */
	/* expected to give st(i) = st - st(i), i.e. stored - accumulated; */
	/* confirm against the assembler documentation.                    */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)			/* a0*b0 slot */
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)			/* a0*b1 slot */
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)			/* a1*b0 slot */
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)			/* a1*b1 slot */
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)			/* a0*b0 slot */
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)			/* a1*b0 slot */
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)			/* a0*b1 slot */
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)			/* a1*b1 slot */
#endif

#ifdef LN
	/* Scale st(2), st(3) by AO[-5]; eliminate with AO[-6] from      */
	/* st(0), st(1); scale st(0), st(1) by AO[-8].                   */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	/* Scale st(0), st(1) by AO[-8]; eliminate with AO[-7] from      */
	/* st(2), st(3); scale st(2), st(3) by AO[-5].                   */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	/* Scale st(0), st(2) by BO[-8]; eliminate with BO[-7] from      */
	/* st(1), st(3); scale st(1), st(3) by BO[-5].                   */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	/* Scale st(1), st(3) by BO[-5]; eliminate with BO[-6] from      */
	/* st(0), st(2); scale st(0), st(2) by BO[-8].                   */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/* Write the four results back to the packed buffer.  Each         */
	/* "fld %st / FST" pair stores a copy of the top; fxch rotates the */
	/* next result to the top.  The trailing FSTs drain the stack to C. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) k steps of this tile. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 1x2 tile (M odd): st(0) = a0*b0, st(1) = a0*b1.                     */
/* ------------------------------------------------------------------ */
.L20:
	movq	M, %rax
	andq	$1, %rax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG			/* step back K elements */
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 2), BO
#else
	movq	B, BO
#endif

	fldz
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

/* Four k steps per pass: one A element against two B elements. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-6 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	and	$3, %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L26
	ALIGN_4

/* Solve step for the 1x2 tile. */
.L28:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax			/* tile is 1 wide along M */
#else
	subq	$2, %rax			/* tile is 2 wide along N */
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
	/* Single A entry: scale both results by AO[-8]. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch the second-column value is on top. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* End of one column pair: advance B and KK for the next pair. */
.L29:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Trailing single column (N odd).                                     */
/* ------------------------------------------------------------------ */
.L30:
	movq	N, %rax
	testq	$1, %rax
	je	.L999

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B				/* B -= K elements */
#endif

#ifdef RT
	subq	LDC, C
#endif
	movq	C, CO
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M, I
	sarq	$1, I
	je	.L40
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 2x1 tile: st(0) = a0*b0, st(1) = a1*b0.                             */
/* ------------------------------------------------------------------ */
.L31:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 1), BO
#else
	movq	B, BO
#endif

	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

/* Four k steps per pass: one B element against two A elements. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(BO)
	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	FLD	-6 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	FLD	-4 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	FLD	-2 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	and	$3, %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	-8 * SIZE(BO)

	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L36
	ALIGN_4

/* Solve step for the 2x1 tile. */
.L38:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax			/* tile is 2 wide along M */
#else
	subq	$1, %rax			/* tile is 1 wide along N */
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* Single B entry: scale both results by BO[-8]. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* Single B entry: scale both results by BO[-8]. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch the second-row value is on top. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 1x1 tile (M odd and N odd): single accumulator in st(0).            */
/* ------------------------------------------------------------------ */
.L40:
	movq	M, %rax
	andq	$1, %rax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 1), BO
#else
	movq	B, BO
#endif

	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

/* Dot product, four k steps per pass. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	and	$3, %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$1 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L46
	ALIGN_4

/* Solve step for the 1x1 tile. */
.L48:
#if defined(LN) || defined(RT)
	/* Both branches are the same for this tile shape. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L49:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/* Restore the callee-saved registers, release the frame and return. */
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE