/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * x87 kernel working on tiles of at most 2 rows by 2 columns.
 *
 * Syntax : AT&T / GAS, fed through the C preprocessor.
 * Variant: exactly one of LN, LT, RN, RT is expected to be defined by the
 *          build (NOTE(review): presumably the four flavours of a
 *          triangular-solve micro-kernel; confirm against the build rules).
 *
 * Inputs as used below:
 *   M, N, K   ARG1..ARG3   row count, column count, inner dimension
 *   A, B      ARG4, ARG5   input buffers, both biased by +8 elements on
 *                          entry so every access uses offsets -8..-1
 *   C         ARG6         output matrix
 *   LDC       loaded from 24 + STACKSIZE(%rsp), then scaled by BASE_SHIFT
 *   OFFSET    read from 32 + STACKSIZE(%rsp)
 *
 * Traversal order:
 *   1. if N is odd, one single-column panel (row pairs, then odd row)
 *   2. N / 2 two-column panels            (row pairs, then odd row)
 *
 * Partial products are accumulated on the x87 register stack, so the
 * instruction order inside every tile is significant.
 *
 * NOTE(review): FLD / FST / SIZE / BASE_SHIFT / ARGn / PROLOGUE / EPILOGUE
 * come from common.h.  The code only balances the x87 stack if FST is a
 * store-and-pop; confirm in common.h.
 */

/* ---- argument registers ------------------------------------------ */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
#define LDC	%r10

/* ---- working registers ------------------------------------------- */
#define I	%r12		/* remaining 2-row tiles in the panel  */
#define J	%r13		/* remaining 2-column panels           */
#define AO	%r14		/* running pointer into A              */
#define BO	%r15		/* running pointer into B              */
#define CO	%rbp		/* running pointer into C              */

#define KK	%r11		/* running offset, in elements         */
#define AORIG	48(%rsp)	/* stack slot: A base for LN / RT      */

/* ---- stack frame -------------------------------------------------- */
#define STACKSIZE 64

#define ALPHA	 8 + STACKSIZE(%rsp)
#define OFFSET	32 + STACKSIZE(%rsp)

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)

	PROLOGUE
	PROFCODE

	/* Reserve the frame and save the callee-saved registers used below. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	24 + STACKSIZE(%rsp), LDC

#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

	/* Bias A and B so the loops can address with offsets -8 .. -1. */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	/* LDC: element count -> byte count. */
	salq	$BASE_SHIFT, LDC

#if defined(LN)
	/* LN walks rows backwards: start past the end of C rows and of A. */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#if defined(RT)
	/* RT walks columns backwards: start past the end of B and of C. */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N, %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#if defined(RN)
	/* KK = -OFFSET */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

#if defined(RT)
	/* KK = N - OFFSET */
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK
#endif

/* ==================================================================== */
/* Single-column panel, taken only when N is odd                        */
/* ==================================================================== */

	movq	N, %rax
	testq	$1, %rax
	je	.Ln2_check

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#if defined(RT)
	/* Step B back by one column of K elements and C back by one column. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, B

	subq	LDC, C
	movq	C, CO
#else
	movq	C, CO
	addq	LDC, C
#endif

#if defined(LN)
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#if defined(LT)
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M, I
	sarq	$1, I			/* I = number of 2-row tiles */
	je	.Ln1_m1_check
	ALIGN_4

/* ---- 2 rows x 1 column -------------------------------------------- */
.Ln1_m2_tile:
#if defined(LN)
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Two accumulators: st0 pairs with AO[0], st1 with AO[1]. */
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

	/* Trip count: KK for LT / RN, K - KK otherwise. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* groups of four k-steps */
	je	.Ln1_m2_ktail
	ALIGN_4

.Ln1_m2_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	-8 * SIZE(BO)
	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 1 */
	FLD	-7 * SIZE(BO)
	FLD	-6 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 2 */
	FLD	-6 * SIZE(BO)
	FLD	-4 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 3 */
	FLD	-5 * SIZE(BO)
	FLD	-2 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.Ln1_m2_k4
	ALIGN_4

.Ln1_m2_ktail:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		/* leftover k-steps */
	je	.Ln1_m2_solve
	ALIGN_4

.Ln1_m2_k1:
	FLD	-8 * SIZE(BO)

	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.Ln1_m2_k1
	ALIGN_4

.Ln1_m2_solve:
#if defined(LN) || defined(RT)
	/* Re-point AO / BO at the block used by the solve step. */
	movq	KK, %rax
#if defined(LN)
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	/*
	 * Combine each accumulator with the stored value it belongs to
	 * (read from B for LN / LT, from A for RN / RT).
	 * NOTE(review): the subtraction direction depends on how the
	 * assembler encodes AT&T fsubp / fsubrp; confirm before touching.
	 */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN)
	/* 2x2 block in A, second row first. */
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#if defined(LT)
	/* 2x2 block in A, first row first. */
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#if defined(RN)
	/* Scale both rows by the single B element. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#if defined(RT)
	/* Scale both rows by the single B element. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#if defined(LN)
	subq	$2 * SIZE, CO
#endif

	/* Write the results back into the packed buffer, keeping copies. */
	fld	%st
#if defined(LN) || defined(LT)
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* Write the results to C; this drains the x87 stack. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#if !defined(LN)
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK entries of this tile. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#if defined(LN)
	subq	$2, KK
#endif

#if defined(LT)
	addq	$2, KK
#endif

#if defined(RT)
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.Ln1_m2_tile
	ALIGN_4

/* ---- 1 row x 1 column, taken only when M is odd -------------------- */
.Ln1_m1_check:
	movq	M, %rax
	andq	$1, %rax
	je	.Ln1_done
	ALIGN_4

.Ln1_m1_tile:
#if defined(LN)
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Single accumulator. */
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* groups of four k-steps */
	je	.Ln1_m1_ktail
	ALIGN_4

.Ln1_m1_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.Ln1_m1_k4
	ALIGN_4

.Ln1_m1_ktail:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		/* leftover k-steps */
	je	.Ln1_m1_solve
	ALIGN_4

.Ln1_m1_k1:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addq	$1 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.Ln1_m1_k1
	ALIGN_4

.Ln1_m1_solve:
#if defined(LN) || defined(RT)
	/* Both variants step back by one element for a 1x1 tile. */
	movq	KK, %rax
	subq	$1, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

#if defined(LN)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#if defined(LT)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#if defined(RN)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#if defined(RT)
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#if defined(LN)
	subq	$1 * SIZE, CO
#endif

	/* One copy goes back to the packed buffer, the other to C. */
	fld	%st
#if defined(LN) || defined(LT)
	FST	-8 * SIZE(BO)
#else
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#if !defined(LN)
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#if defined(LN)
	subq	$1, KK
#endif

#if defined(LT)
	addq	$1, KK
#endif

#if defined(RT)
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* ---- end of the single-column panel -------------------------------- */
.Ln1_done:
#if defined(LN)
	/* Advance B past this column: K elements. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#if defined(RN)
	addq	$1, KK
#endif

#if defined(RT)
	subq	$1, KK
#endif
	ALIGN_4

/* ==================================================================== */
/* Two-column panels: J = N / 2 iterations                              */
/* ==================================================================== */

.Ln2_check:
	movq	N, %rax
	sarq	$1, %rax
	movq	%rax, J			/* movq keeps the flags set by sarq */
	je	.Lexit
	ALIGN_4

.Ln2_panel:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#if defined(RT)
	/* Step B back by two columns of K elements and C back by two columns. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B

	leaq	(, LDC, 2), %rax
	subq	%rax, C
	movq	C, CO
#else
	leaq	(, LDC, 2), %rax
	movq	C, CO
	addq	%rax, C
#endif

#if defined(LN)
	/* KK = OFFSET + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#if defined(LT)
	/* KK = OFFSET */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M, I
	sarq	$1, I			/* I = number of 2-row tiles */
	je	.Ln2_m1_check
	ALIGN_4

/* ---- 2 rows x 2 columns -------------------------------------------- */
.Ln2_m2_tile:
#if defined(LN)
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/*
	 * Four accumulators.  With a0, a1 the two AO values and b0, b1 the
	 * two BO values of one k-step, each step adds
	 *   st0 += a0 * b0    st1 += a0 * b1
	 *   st2 += a1 * b0    st3 += a1 * b1
	 */
	fldz
	fldz
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* groups of four k-steps */
	je	.Ln2_m2_ktail
	ALIGN_4

.Ln2_m2_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	/* k + 1 */
	FLD	-6 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* k + 2 */
	FLD	-4 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-3 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	/* k + 3 */
	FLD	-2 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-1 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.Ln2_m2_k4
	ALIGN_4

.Ln2_m2_ktail:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		/* leftover k-steps */
	je	.Ln2_m2_solve
	ALIGN_4

.Ln2_m2_k1:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(4)
	faddp	%st, %st(2)

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.Ln2_m2_k1
	ALIGN_4

.Ln2_m2_solve:
#if defined(LN) || defined(RT)
	/* Both variants step back by two elements for a 2x2 tile. */
	movq	KK, %rax
	subq	$2, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/*
	 * Combine the four accumulators with the stored values.  The B
	 * buffer is read in accumulator order st0..st3; the A buffer keeps
	 * the two middle entries swapped, hence the st(3) / st(2) order in
	 * the second arm.
	 * NOTE(review): subtraction direction depends on the assembler's
	 * AT&T fsubp / fsubrp encoding; confirm before touching.
	 */
#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

#if defined(LN)
	/* 2x2 block in A, second row first. */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#if defined(LT)
	/* 2x2 block in A, first row first. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#if defined(RN)
	/* 2x2 block in B, first column first. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#if defined(RT)
	/* 2x2 block in B, second column first. */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#if defined(LN)
	subq	$2 * SIZE, CO
#endif

	/*
	 * Store a copy of every result into the packed buffer; the fxch
	 * sequence leaves the stack in the order the C stores below expect.
	 */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#if !defined(LN)
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining K - KK entries of this tile. */
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(LN)
	subq	$2, KK
#endif

#if defined(LT)
	addq	$2, KK
#endif

#if defined(RT)
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.Ln2_m2_tile
	ALIGN_4

/* ---- 1 row x 2 columns, taken only when M is odd ------------------- */
.Ln2_m1_check:
	movq	M, %rax
	andq	$1, %rax
	je	.Ln2_panel_next
	ALIGN_4

.Ln2_m1_tile:
#if defined(LN)
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Two accumulators: st0 pairs with BO[0], st1 with BO[1]. */
	fldz
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax		/* groups of four k-steps */
	je	.Ln2_m1_ktail
	ALIGN_4

.Ln2_m1_k4:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 1 */
	FLD	-7 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 2 */
	FLD	-6 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 3 */
	FLD	-5 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.Ln2_m1_k4
	ALIGN_4

.Ln2_m1_ktail:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		/* leftover k-steps */
	je	.Ln2_m1_solve
	ALIGN_4

.Ln2_m1_k1:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.Ln2_m1_k1
	ALIGN_4

.Ln2_m1_solve:
#if defined(LN) || defined(RT)
	/* Step back one row (LN) or two columns (RT). */
	movq	KK, %rax
#if defined(LN)
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
	/* Scale both columns by the single A element. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#if defined(RN)
	/* 2x2 block in B, first column first. */
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#if defined(RT)
	/* 2x2 block in B, second column first. */
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#if defined(LN)
	subq	$1 * SIZE, CO
#endif

	/* Write the results back into the packed buffer, keeping copies. */
	fld	%st
#if defined(LN) || defined(LT)
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* Write the results to C; this drains the x87 stack. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#if !defined(LN)
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(LN)
	subq	$1, KK
#endif

#if defined(LT)
	addq	$1, KK
#endif

#if defined(RT)
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* ---- end of one two-column panel ----------------------------------- */
.Ln2_panel_next:
#if defined(LN)
	/* Advance B past this panel: 2 * K elements. */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#if defined(RN)
	addq	$2, KK
#endif

#if defined(RT)
	subq	$2, KK
#endif

	decq	J
	jne	.Ln2_panel
	ALIGN_4

/* ---- restore callee-saved registers and return --------------------- */
.Lexit:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE