/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * 2x2 blocked triangular-solve micro-kernel on the x87 FPU stack.
 *
 * Syntax : GNU as, AT&T operand order, preprocessed by cpp.
 * Variant: selected by the LN / LT / RN / RT preprocessor symbols.
 *          NOTE(review): the code paths only make sense when exactly one
 *          of them is defined by the build - confirm in the Makefile.
 *
 * Traversal order: column pairs of C first (N >> 1), then the odd
 * column (N & 1); inside each, row pairs (M >> 1) and then the odd row
 * (M & 1).
 *
 * Every tile goes through the same three phases:
 *   1. accumulate  acc += a * b  over the already-solved part of K,
 *      four K-steps per unrolled iteration plus a 0..3 step remainder;
 *   2. combine the accumulator with the packed right-hand side and apply
 *      the 2x2 (or 1x1) triangular block.  The diagonal entries are
 *      applied with fmul, so the packed triangle presumably stores the
 *      reciprocals of the diagonal - TODO confirm against the packing
 *      routine;
 *   3. write the result back into the packed buffer (BO for LN/LT, AO
 *      for RN/RT) and into C.
 *
 * Inputs that come from common.h and are NOT visible here: ARG1..ARG6,
 * SIZE, BASE_SHIFT, FLD, FST, ALIGN_4, PROLOGUE, PROFCODE, EPILOGUE.
 *   - FLD is used as "load and push".
 *   - FST is used as "store and pop": every tile drains its accumulators
 *     from the FPU stack with plain FST - TODO confirm in common.h.
 *   - BASE_SHIFT is used as the shift that turns an element count into
 *     a byte count (presumably log2(SIZE)).
 *
 * NOTE(review): the fsubp / fsubrp forms with a %st(i) destination rely
 * on the GNU assembler's AT&T handling of the non-commutative x87
 * instructions.  The intended arithmetic is "packed - acc" for fsubp
 * and "acc - product" for fsubrp; confirm with the assembler in use
 * before touching these lines.
 *
 * Stack frame (STACKSIZE bytes below the return address):
 *    0(%rsp) .. 40(%rsp)   saved %rbx %rbp %r12 %r13 %r14 %r15
 *   48(%rsp)               AORIG, base of the current A panel (LN/RT)
 * Caller stack arguments read by this routine:
 *   24 + STACKSIZE(%rsp)   ldc (in elements, scaled to bytes below)
 *   32 + STACKSIZE(%rsp)   offset
 */

#define ASSEMBLER
#include "common.h"

/* Register-passed arguments (ARGn is mapped to a register in common.h). */
#define M	ARG1
#define N	ARG2
#define K	ARG3
#define A	ARG4
#define B	ARG5
#define C	ARG6
#define LDC	%r10

/* Loop counters and working pointers. */
#define I	%r12		/* remaining row pairs in the current panel    */
#define J	%r13		/* remaining column pairs                      */
#define AO	%r14		/* running pointer into packed A               */
#define BO	%r15		/* running pointer into packed B               */
#define	CO	%rbp		/* running pointer into the current C tile     */

#define KK	%r11		/* number of already-solved K steps            */
#define AORIG	 48(%rsp)	/* spill slot: base of the current A panel     */

#define STACKSIZE 64

#define ALPHA	 8 + STACKSIZE(%rsp)	/* not referenced in this kernel */
#define OFFSET	32 + STACKSIZE(%rsp)

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)

	PROLOGUE
	PROFCODE

	/* Allocate the frame and spill the callee-saved registers. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	24 + STACKSIZE(%rsp), LDC

	/*
	 * KK = -offset.  For every LN/LT/RN/RT build this value is
	 * overwritten below before KK is read; kept for compatibility.
	 */
#if defined(TRMMKERNEL) && !defined(LEFT)
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

	/*
	 * Bias both packed pointers by 8 elements; every packed access
	 * below uses displacements starting at -8 * SIZE.
	 */
	addq	$8 * SIZE, A
	addq	$8 * SIZE, B

	/* ldc: elements -> bytes. */
	salq	$BASE_SHIFT, LDC

#ifdef LN
	/* Walk rows backwards: C += M elements, A += M * K elements. */
	movq	M, %rax
	salq	$BASE_SHIFT, %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
#endif

#ifdef RT
	/* Walk columns backwards: B += N * K elements, C += N * ldc. */
	movq	N, %rax
	salq	$BASE_SHIFT, %rax
	imulq	K, %rax
	addq	%rax, B

	movq	N, %rax
	imulq	LDC, %rax
	addq	%rax, C
#endif

#ifdef RN
	/* KK = -offset */
	movq	OFFSET, %rax
	negq	%rax
	movq	%rax, KK
#endif

#ifdef RT
	/* KK = N - offset */
	movq	N, %rax
	subq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* J = N / 2; sarq sets ZF, the movq in between preserves it. */
	movq	N,  %rax
	sarq	$1, %rax
	movq	%rax, J
	je	.L30
	ALIGN_4

/*====================================================================*/
/* Column-pair loop: two columns of C per iteration.                  */
/*====================================================================*/
.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back by one 2-column panel (2 * K elements). */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

	/* rax = 2 * ldc (bytes) */
	leaq	(, LDC, 2), %rax

#ifdef RT
	subq	%rax, C
#endif
	movq	C, CO
#ifndef RT
	addq	%rax, C
#endif

#ifdef LN
	/* KK = offset + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = offset */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M / 2 */
	movq	M,  I
	sarq	$1, I
	je	.L20
	ALIGN_4

/*--------------------------------------------------------------------*/
/* 2x2 tile.                                                          */
/*--------------------------------------------------------------------*/
.L11:
#ifdef LN
	/* Step the A panel base back by 2 * K elements. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/*
	 * Four accumulators.  With a0,a1 the pair read from AO and b0,b1
	 * the pair read from BO in each K step, the loops below keep
	 *   st0 = sum a0*b0   st1 = sum a0*b1
	 *   st2 = sum a1*b0   st3 = sum a1*b1
	 */
	fldz
	fldz
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled by four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L15
	ALIGN_4

.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* K step 0 */
	FLD	 -8 * SIZE(AO)			/* a0                      */

	FLD	 -8 * SIZE(BO)			/* b0                      */
	fld	 %st(1)
	fmul	 %st(1), %st			/* a0 * b0                 */
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)			/* b1                      */
	fmul	 %st, %st(2)			/* a0 slot becomes a0 * b1 */

	FLD	 -7 * SIZE(AO)			/* a1                      */
	fmul	 %st, %st(2)			/* b0 slot becomes a1 * b0 */
	fmulp	 %st, %st(1)			/* a1 * b1                 */

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* K step 1 */
	FLD	 -6 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -5 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* K step 2 */
	FLD	 -4 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -3 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	/* K step 3 */
	FLD	 -2 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -1 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L12
	ALIGN_4

.L15:
	/* Remainder: same trip count as above, modulo four. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L18
	ALIGN_4

.L16:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fld	 %st(1)
	fmul	 %st(1), %st
	faddp	 %st, %st(3)

	FLD	 -7 * SIZE(BO)
	fmul	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmul	 %st, %st(2)
	fmulp	 %st, %st(1)

	faddp	 %st, %st(6)
	faddp	 %st, %st(4)
	faddp	 %st, %st(2)

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L16
	ALIGN_4

.L18:
#if defined(LN) || defined(RT)
	/*
	 * Point AO/BO at the 2x2 block being solved, (KK - 2) steps into
	 * the panel.  The tile is 2 wide in both directions, so LN and RT
	 * use the same adjustment.
	 */
	movq	KK, %rax
	subq	$2, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	/* Combine each accumulator with its packed right-hand-side value. */
#if defined(LN) || defined(LT)
	/* Packed B order matches st0..st3. */
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
	FLD	-6 * SIZE(BO)
	fsubp	%st, %st(3)
	FLD	-5 * SIZE(BO)
	fsubp	%st, %st(4)
#else
	/* Packed A order is st0, st2, st1, st3. */
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(3)
	FLD	-6 * SIZE(AO)
	fsubp	%st, %st(2)
	FLD	-5 * SIZE(AO)
	fsubp	%st, %st(4)
#endif

#ifdef LN
	/* Scale st2,st3; eliminate them from st0,st1; scale st0,st1. */
	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(AO)
	fmul	%st(3), %st
	FLD	-6 * SIZE(AO)
	fmul	%st(5), %st

	fsubrp	%st, %st(3)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	/* Scale st0,st1; eliminate them from st2,st3; scale st2,st3. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(AO)
	fmul	%st(3), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(3)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	/* Scale st0,st2; eliminate them from st1,st3; scale st1,st3. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st
	FLD	-7 * SIZE(BO)
	fmul	%st(4), %st

	fsubrp	%st, %st(5)
	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	/* Scale st1,st3; eliminate them from st0,st2; scale st0,st2. */
	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)
	fmulp	%st, %st(4)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st
	FLD	-6 * SIZE(BO)
	fmul	%st(5), %st

	fsubrp	%st, %st(4)
	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(3)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

	/*
	 * Write the solved values back to the packed buffer (duplicate,
	 * store, rotate), then drain the four values into C.
	 */
#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
	fxch	%st(2)
	fld	%st
	FST	-6 * SIZE(BO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(BO)

	FST	1 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(2)
	fld	%st
	FST	-7 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-6 * SIZE(AO)
	fxch	%st(3)
	fld	%st
	FST	-5 * SIZE(AO)

	FST	1 * SIZE(CO, LDC)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	/* Skip the remaining (K - KK) steps of both packed panels. */
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	/* Advance the A panel base by 2 * K elements. */
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L11
	ALIGN_4

/*--------------------------------------------------------------------*/
/* 1x2 tile: the odd row of a column pair.                            */
/*--------------------------------------------------------------------*/
.L20:
	movq	M,  %rax
	andq	$1, %rax
	je	.L29
	ALIGN_4

.L21:
#ifdef LN
	/* Step the A panel base back by K elements. */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + 2 * KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#else
	movq	B, BO
#endif

	/* Two accumulators: st0 = sum a*b0, st1 = sum a*b1. */
	fldz
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L25
	ALIGN_4

.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* K step 0 */
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* K step 1 */
	FLD	 -7 * SIZE(AO)

	FLD	 -6 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* K step 2 */
	FLD	 -6 * SIZE(AO)

	FLD	 -4 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* K step 3 */
	FLD	 -5 * SIZE(AO)

	FLD	 -2 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L28
	ALIGN_4

.L26:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO

	decq	%rax
	jne	.L26
	ALIGN_4

.L28:
#if defined(LN) || defined(RT)
	/* Tile is 1 row by 2 columns: back up by 1 for LN, by 2 for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#if defined(LN) || defined(LT)
	/* 1x1 block on the left: scale both values by the same entry. */
	FLD	-8 * SIZE(AO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* Scale st0; eliminate it from st1; scale st1. */
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(BO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* Scale st1; eliminate it from st0; scale st0. */
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(BO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch the second-column value is on top. */
	FST	0 * SIZE(CO, LDC)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* End of one column pair. */
.L29:
#ifdef LN
	/* Advance B past the 2-column panel (2 * K elements). */
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	/* BO already points past the panel. */
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J
	jne	.L01
	ALIGN_4

/*====================================================================*/
/* Odd column of C (N & 1).                                           */
/*====================================================================*/
.L30:
	movq	N,  %rax
	testq	$1, %rax
	je	.L999

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, %rax
	movq	%rax, AORIG
#endif

#ifdef RT
	/* Step B back by one 1-column panel (K elements). */
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B
#endif

#ifdef RT
	subq	LDC, C
#endif
	movq	C, CO
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	/* KK = offset + M */
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	/* KK = offset */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* I = M / 2 */
	movq	M,  I
	sarq	$1, I
	je	.L40
	ALIGN_4

/*--------------------------------------------------------------------*/
/* 2x1 tile.                                                          */
/*--------------------------------------------------------------------*/
.L31:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + 2 * KK elements, BO = B + KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Two accumulators: st0 = sum a0*b, st1 = sum a1*b. */
	fldz
	fldz

#if   defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* K step 0 */
	FLD	 -8 * SIZE(BO)
	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* K step 1 */
	FLD	 -7 * SIZE(BO)
	FLD	 -6 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -5 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* K step 2 */
	FLD	 -6 * SIZE(BO)
	FLD	 -4 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -3 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	/* K step 3 */
	FLD	 -5 * SIZE(BO)
	FLD	 -2 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -1 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$8 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L38
	ALIGN_4

.L36:
	FLD	 -8 * SIZE(BO)

	FLD	 -8 * SIZE(AO)
	fmul	 %st(1), %st
	faddp	 %st, %st(2)

	FLD	 -7 * SIZE(AO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(2)

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L36
	ALIGN_4

.L38:
#if defined(LN) || defined(RT)
	/* Tile is 2 rows by 1 column: back up by 2 for LN, by 1 for RT. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(BO)
	fsubp	%st, %st(2)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
	FLD	-7 * SIZE(AO)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	/* Scale st1; eliminate it from st0; scale st0. */
	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)

	FLD	-6 * SIZE(AO)
	fmul	%st(2), %st

	fsubrp	%st, %st(1)
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	/* Scale st0; eliminate it from st1; scale st1. */
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)

	FLD	-7 * SIZE(AO)
	fmul	%st(1), %st

	fsubrp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* 1x1 block on the right: scale both values by the same entry. */
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LN
	subq	$2 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
	fxch	%st(1)
	fld	%st
	FST	-7 * SIZE(AO)
#endif

	/* After the fxch the second-row value is on top. */
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO)

#ifndef LN
	addq	$2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I
	jne	.L31
	ALIGN_4

/*--------------------------------------------------------------------*/
/* 1x1 tile: odd row of the odd column.                               */
/*--------------------------------------------------------------------*/
.L40:
	movq	M,  %rax
	andq	$1, %rax
	je	.L49
	ALIGN_4

.L41:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	/* AO = AORIG + KK elements, BO = B + KK elements. */
	movq	KK, %rax
	salq	$BASE_SHIFT, %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	/* Single accumulator: st0 = sum a*b. */
	fldz

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L45
	ALIGN_4

.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	 -8 * SIZE(AO)
	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -7 * SIZE(AO)
	FLD	 -7 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -6 * SIZE(AO)
	FLD	 -6 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	FLD	 -5 * SIZE(AO)
	FLD	 -5 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO

	decq	%rax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K,  %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	je	.L48
	ALIGN_4

.L46:
	FLD	 -8 * SIZE(AO)

	FLD	 -8 * SIZE(BO)
	fmulp	 %st, %st(1)
	faddp	 %st, %st(1)

	addq	$1 * SIZE, AO
	addq	$1 * SIZE, BO

	decq	%rax
	jne	.L46
	ALIGN_4

.L48:
#if defined(LN) || defined(RT)
	/* 1x1 tile: LN and RT both back up by a single step. */
	movq	KK, %rax
	subq	$1, %rax

	salq	$BASE_SHIFT, %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	FLD	-8 * SIZE(BO)
	fsubp	%st, %st(1)
#else
	FLD	-8 * SIZE(AO)
	fsubp	%st, %st(1)
#endif

	/* Scale by the single diagonal entry of the triangular operand. */
#ifdef LN
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef LT
	FLD	-8 * SIZE(AO)
	fmulp	%st, %st(1)
#endif

#ifdef RN
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef RT
	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subq	$1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
	fld	%st
	FST	-8 * SIZE(BO)
#else
	fld	%st
	FST	-8 * SIZE(AO)
#endif

	FST	0 * SIZE(CO)

#ifndef LN
	addq	$1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* End of the odd column. */
.L49:
#ifdef LN
	movq	K, %rax
	salq	$BASE_SHIFT, %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

/* Restore the callee-saved registers, release the frame and return. */
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE