/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * x87 kernel, AT&T syntax, preprocessed with cpp.  Entry/exit, load/store
 * (FLD/FST), element size (SIZE, BASE_SHIFT), argument registers
 * (ARG1..ARG6) and ALIGN_4 all come from common.h.
 *
 * The routine walks C in tiles of 2 columns (J = N >> 1, then N & 1) and,
 * inside each column panel, first the single odd row (M & 1) and then
 * pairs of rows (I = M >> 1).  For every tile it accumulates products of
 * packed A and B elements on the x87 register stack, folds in the stored
 * values with fsubp, multiplies by diagonal entries, and writes the result
 * both back into the packed buffer and into C.
 *
 * The LN / LT / RN / RT macros select the variant (which buffer is
 * updated and in which direction KK moves).
 * NOTE(review): presumably this is a TRSM kernel with exactly one of
 * LN/LT/RN/RT defined by the build, and the packed diagonal already holds
 * reciprocals (only fmul is used, never fdiv) -- confirm against callers.
 *
 * A and B are biased by +8 * SIZE once at entry, which is why every
 * operand below is addressed as (-8 + n) * SIZE(AO/BO).
 */

#define ASSEMBLER
#include "common.h"

/* Register arguments. */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
/* Loaded from the caller stack in the prologue, then scaled to bytes. */
#define LDC	%r10

/* Loop counters and running pointers into A, B and C. */
#define I	%r12
#define J	%r13
#define AO	%r14
#define BO	%r15
#define	CO	%rbp

#define KK	%r11
/* Spill slot inside the local frame (above the six saved registers). */
#define AORIG	 48(%rsp)

#define STACKSIZE 64

/* Stack-passed arguments, addressed past the local frame. */
/* NOTE(review): ALPHA is defined but not referenced in this file. */
#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	emms
#endif

/* Local frame: save rbx, rbp, r12-r15 at 0..40(%rsp); 48(%rsp) is AORIG. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	24 + STACKSIZE(%rsp), LDC

/* NOTE(review): TRMMKERNEL / LEFT are not referenced anywhere else here. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

/* Bias both packed pointers; all later operands use -8 * SIZE based offsets. */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

/* LDC: elements -> bytes. */
	salq	$BASE_SHIFT, LDC

#ifdef LN
/* C += M << BASE_SHIFT;  A += (M << BASE_SHIFT) * K. */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
/* B += (N << BASE_SHIFT) * K;  C += N * LDC. */
	movq	N,    %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N,    %rax
	imulq	LDC,  %rax
	addq	%rax, C
#endif

#ifdef RN
/* KK = -OFFSET */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

#ifdef RT
/* KK = N - OFFSET */
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK
#endif

/* J = N >> 1; movq leaves the flags from sarq intact for the je. */
	movq	N,   %rax
	sarq	$1,  %rax
	movq	%rax, J
	je	.L30
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Two-column panel: executed J times.                                */
/* ------------------------------------------------------------------ */
.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
/* Step B back by two packed columns: (K << (1 + BASE_SHIFT)) bytes. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

/* rax = 2 * LDC (bytes spanned by two columns of C). */
	lea	(, LDC, 2), %rax

#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Odd row first: skip to the row-pair loop when M is even. */
	movq	M,  %rax
	andq	$1, %rax
	je	.L20
	ALIGN_4

/* ---- 1 row x 2 columns ---- */
.L21:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
/* AO = AORIG + KK elements; BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

/* Two zeroed accumulators on the x87 stack. */
	fldz
	fldz

/* Trip count: KK for LT/RN, K - KK otherwise; main loop is unrolled 4x. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

/* Each step: one A element times two B elements, added to the two
   accumulators.  Per iteration AO advances 4 and BO 8 elements. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$4 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L22
	ALIGN_4

/* Remainder: (trip count & 3) single steps. */
.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$1 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	.L26
	ALIGN_4

/* Solve/store step for the 1x2 tile. */
.L28:
#if defined(LN) || defined(RT)
/* Re-derive AO/BO from (KK - 1) for LN or (KK - 2) otherwise. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

/* Fold the stored values (from BO for LN/LT, from AO otherwise) into the
   two accumulators. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

/* Scale both values by the single A entry. */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

/* 2x2 block of B applied in forward order (entries -8, -7, -5). */
#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

/* 2x2 block of B applied in reverse order (entries -5, -6, -8). */
#ifdef RT
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

/* Each value is duplicated (fld %st) before FST so that a copy remains on
   the stack for the store into C below.
   NOTE(review): this implies the FST macro pops -- confirm in common.h. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
/* Skip the remaining (K - KK) packed elements of this tile. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* I = M >> 1 row pairs. */
.L20:
	movq	M,  I
	sarq	$1, I
	je	.L29
	ALIGN_4

/* ---- 2 rows x 2 columns ---- */
.L11:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
/* AO = AORIG + 2 * KK elements; BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

/* Four zeroed accumulators on the x87 stack. */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L15
	ALIGN_4

/* Main loop, unrolled 4x: each step loads two A and two B elements and
   adds the four products to the accumulators.  AO and BO both advance
   8 elements per iteration. */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$8 * SIZE,BO

	decq	%rax
	jne	.L12
	ALIGN_4

/* Remainder: (trip count & 3) single steps. */
.L15:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$2 * SIZE,BO

	decq	%rax
	jne	.L16
	ALIGN_4

/* Solve/store step for the 2x2 tile. */
.L18:
#if defined(LN) || defined(RT)
/* Both variants step back by 2 here (tile is 2 wide in both dimensions). */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

/* Fold the four stored values into the accumulators.  Note the different
   stack-slot order between the two branches (1,2,3,4 versus 1,3,2,4). */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

/* LN: A block entries used in the order -5, -6, -8. */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

/* LT: A block entries used in the order -8, -7, -5. */
#ifdef LT
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

/* RN: B block entries used in the order -8, -7, -5. */
#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

/* RT: B block entries used in the order -5, -6, -8. */
#ifdef RT
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

/* Write the four results back to the packed buffer, then to C.  The fxch
   pattern differs per branch, and so does the order of the C stores. */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
/* Skip the remaining (K - KK) packed elements of this tile. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/* End of one two-column panel: advance B and KK, loop on J. */
.L29:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Single remaining column (only when N is odd).                      */
/* ------------------------------------------------------------------ */
.L30:
	movq	N,  %rax
	testq	$1, %rax
	je	.L999

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

#ifdef RT
	subq	LDC, C
#endif
	movq	C, CO
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M,  %rax
	andq	$1, %rax
	je	.L40
	ALIGN_4

/* ---- 1 row x 1 column ---- */
.L41:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

/* Single accumulator. */
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

/* Main loop, unrolled 4x: one A*B product per step; AO and BO advance
   4 elements per iteration. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)
	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$4 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L42
	ALIGN_4

/* Remainder: (trip count & 3) single steps. */
.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$1 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	.L46
	ALIGN_4

/* Solve/store step for the 1x1 tile. */
.L48:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

/* Scale by the single diagonal entry: from AO for LN/LT, BO for RN/RT. */
#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* I = M >> 1 row pairs. */
.L40:
	movq	M,  I
	sarq	$1, I
	je	.L49
	ALIGN_4

/* ---- 2 rows x 1 column ---- */
.L31:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
/* AO = AORIG + 2 * KK elements; BO = B + KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

/* Two zeroed accumulators. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

/* Main loop, unrolled 4x: one B element times two A elements per step.
   AO advances 8 and BO 4 elements per iteration. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$8 * SIZE,AO
	addq	$4 * SIZE,BO

	decq	%rax
	jne	.L32
	ALIGN_4

/* Remainder: (trip count & 3) single steps. */
.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	and	$3,  %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$2 * SIZE,AO
	addq	$1 * SIZE,BO

	decq	%rax
	jne	.L36
	ALIGN_4

/* Solve/store step for the 2x1 tile. */
.L38:
#if defined(LN) || defined(RT)
/* Step back by 2 (rows) for LN, by 1 (column) otherwise. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

/* LN: A block entries used in the order -5, -6, -8. */
#ifdef LN
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

/* LT: A block entries used in the order -8, -7, -5. */
#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

/* RN / RT: both values scaled by the single B entry. */
#ifdef RN
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/* End of the single-column panel. */
.L49:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/* Epilogue: restore the saved registers and release the local frame. */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE