/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * x87 triangular-solve micro-kernel with 2x2 register blocking.
 * Target: IA-32, GNU as AT&T syntax, run through the C preprocessor.
 *
 * Build-time variant
 *   The body is specialised by the macros LN, LT, RN and RT.  Presumably
 *   these are the Left/Right x NoTrans/Trans solve variants and the build
 *   defines exactly one of them per object file -- confirm against the
 *   Makefile.  PROLOGUE, EPILOGUE, PROFCODE, ALIGN_4, SIZE, BASE_SHIFT,
 *   FLD and FST all come from common.h.
 *
 * Stack frame (after "subl $ARGS" and the four pushes below)
 *   STACK + 0 / 4 / 8 (%esp)      locals J, KK, AORIG
 *   STACK + ARGS + 4 ... (%esp)   caller arguments M, N, K, ALPHA, A,
 *                                 B, C, LDC, OFFSET
 *   ALPHA is never read in this file.
 *
 * Register roles
 *   I   (%esi)  row-pair counter of the current column panel
 *   B   (%ebx)  base of the current B panel
 *   CO  (%edi)  output cursor into C
 *   AO  (%edx)  read cursor into A
 *   BO  (%ecx)  read cursor into B
 *   LDC (%ebp)  leading dimension of C, scaled to bytes
 *   %eax        scratch and inner loop counter
 *
 * Addressing bias
 *   A and B are advanced by 8 * SIZE once at entry, so every element
 *   access uses offsets starting at -8 * SIZE.
 *
 * x87 accumulators in the 2x2 tile (st0..st3 between k-steps)
 *   c0 = sum a0*b0   c1 = sum a0*b1   c2 = sum a1*b0   c3 = sum a1*b1
 *   where a0/a1 are the two A values and b0/b1 the two B values of one
 *   k-step.
 *
 * NOTE(review): FST is used as a popping store (each FST is preceded by
 * "fld %st" when the value must survive, and the final FSTs drain the
 * stack).  Confirm that common.h maps it to an fstp-style instruction.
 *
 * NOTE(review): the solve tails multiply by the diagonal entries instead
 * of dividing, so the packed triangular panel presumably holds
 * pre-inverted diagonals -- confirm against the packing routine.
 *
 * NOTE(review): the "fsubp"/"fsubrp" forms rely on the GNU as AT&T
 * operand-order convention for fsub/fsubr with an %st(i) destination.
 * The intended effects are "c = rhs - c" and "c -= t" respectively.
 */

#define ASSEMBLER
#include "common.h"

#ifdef OPTERON
#define PREFETCH        prefetch
#define PREFETCHW       prefetchw
#else
#define PREFETCH        prefetcht0
#define PREFETCHW       prefetcht0
#endif

#define PREFETCHSIZE (5 + 4 * 10)
#define STACK   16
#define ARGS    16

#define J        0 + STACK(%esp)
#define KK       4 + STACK(%esp)
#define AORIG    8 + STACK(%esp)

#define M        4 + STACK + ARGS(%esp)
#define N        8 + STACK + ARGS(%esp)
#define K       12 + STACK + ARGS(%esp)
#define ALPHA   16 + STACK + ARGS(%esp)
#define A       32 + STACK + ARGS(%esp)
#define ARG_B   36 + STACK + ARGS(%esp)
#define C       40 + STACK + ARGS(%esp)
#define ARG_LDC 44 + STACK + ARGS(%esp)
#define OFFSET  48 + STACK + ARGS(%esp)

#define I       %esi
#define B       %ebx
#define CO      %edi
#define AO      %edx
#define BO      %ecx
#define LDC     %ebp

#define PREFETCH_OFFSET 48

        PROLOGUE

        subl    $ARGS, %esp             /* room for J / KK / AORIG */

        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        PROFCODE

        movl    ARG_LDC, LDC
        movl    ARG_B, B
        sall    $BASE_SHIFT, LDC        /* LDC in bytes */

        /* Bias A and B so that element 0 sits at -8 * SIZE. */
        addl    $8 * SIZE, A
        addl    $8 * SIZE, B

#ifdef LN
        /* Start from the end: C += M elements, A += M * K elements. */
        movl    M, %eax
        sall    $BASE_SHIFT, %eax
        addl    %eax, C
        imull   K, %eax
        addl    %eax, A
#endif

#ifdef RT
        /* Start from the end: B += N * K elements, C += N * LDC. */
        movl    N, %eax
        sall    $BASE_SHIFT, %eax
        imull   K, %eax
        addl    %eax, B

        movl    N, %eax
        imull   LDC, %eax
        addl    %eax, C
#endif

#ifdef RN
        movl    OFFSET, %eax
        negl    %eax
        movl    %eax, KK                /* KK = -OFFSET */
#endif

#ifdef RT
        movl    N, %eax
        subl    OFFSET, %eax
        movl    %eax, KK                /* KK = N - OFFSET */
#endif

        movl    N, %eax
        sarl    $1, %eax
        movl    %eax, J                 /* J = N / 2 column pairs */
        je      .L30                    /* flags still from sarl */
        ALIGN_4

/* ------------------------------------------------------------------ */
/* Column-pair loop: two columns of C per iteration.                   */
/* ------------------------------------------------------------------ */
.L01:
#if defined(LT) || defined(RN)
        movl    A, AO
#else
        movl    A, %eax
        movl    %eax, AORIG
#endif

#ifdef RT
        /* Step B back by one 2-column panel (2 * K elements). */
        movl    K, %eax
        sall    $1 + BASE_SHIFT, %eax
        subl    %eax, B
#endif

        leal    (, LDC, 2), %eax        /* byte width of two C columns */

#ifdef RT
        subl    %eax, C
#endif
        movl    C, CO
#ifndef RT
        addl    %eax, C
#endif

#ifdef LN
        movl    OFFSET, %eax
        addl    M, %eax
        movl    %eax, KK                /* KK = OFFSET + M */
#endif

#ifdef LT
        movl    OFFSET, %eax
        movl    %eax, KK                /* KK = OFFSET */
#endif

        movl    M, I
        sarl    $1, I                   /* I = M / 2 row pairs */
        je      .L20
        ALIGN_4

/* ---- 2 rows x 2 columns ------------------------------------------- */
.L11:
#ifdef LN
        movl    K, %eax
        sall    $1 + BASE_SHIFT, %eax
        subl    %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        /* Skip the first KK k-steps of both panels. */
        movl    KK, %eax
        sall    $BASE_SHIFT, %eax
        movl    AORIG, AO
        leal    (AO, %eax, 2), AO
        leal    (B,  %eax, 2), BO
#else
        movl    B, BO
#endif

        /* Clear the four accumulators c0..c3. */
        fldz
        fldz
        fldz
        fldz

#if defined(HAVE_3DNOW)
        prefetchw       2 * SIZE(CO)
        prefetchw       2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
        prefetchnta     2 * SIZE(CO)
        prefetchnta     2 * SIZE(CO, LDC, 1)
#endif

        /* Trip count: KK for LT/RN, K - KK for LN/RT. */
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        sarl    $2, %eax                /* four k-steps per pass */
        je      .L15
        ALIGN_4

.L12:
        PREFETCH        (PREFETCHSIZE + 0) * SIZE(AO)

        /* k-step 0 */
        FLD     -8 * SIZE(AO)           /* a0 */

        FLD     -8 * SIZE(BO)           /* b0 */
        fld     %st(1)
        fmul    %st(1), %st
        faddp   %st, %st(3)             /* c0 += a0 * b0 */

        FLD     -7 * SIZE(BO)           /* b1 */
        fmul    %st, %st(2)             /* a0 slot becomes a0 * b1 */

        FLD     -7 * SIZE(AO)           /* a1 */
        fmul    %st, %st(2)             /* b0 slot becomes a1 * b0 */
        fmulp   %st, %st(1)             /* b1 slot becomes a1 * b1 */

        faddp   %st, %st(6)             /* c3 += a1 * b1 */
        faddp   %st, %st(4)             /* c2 += a1 * b0 */
        faddp   %st, %st(2)             /* c1 += a0 * b1 */

        /* k-step 1 */
        FLD     -6 * SIZE(AO)

        FLD     -6 * SIZE(BO)
        fld     %st(1)
        fmul    %st(1), %st
        faddp   %st, %st(3)

        FLD     -5 * SIZE(BO)
        fmul    %st, %st(2)

        FLD     -5 * SIZE(AO)
        fmul    %st, %st(2)
        fmulp   %st, %st(1)

        faddp   %st, %st(6)
        faddp   %st, %st(4)
        faddp   %st, %st(2)

        PREFETCH        (PREFETCHSIZE + 4) * SIZE(AO)

        /* k-step 2 */
        FLD     -4 * SIZE(AO)

        FLD     -4 * SIZE(BO)
        fld     %st(1)
        fmul    %st(1), %st
        faddp   %st, %st(3)

        FLD     -3 * SIZE(BO)
        fmul    %st, %st(2)

        FLD     -3 * SIZE(AO)
        fmul    %st, %st(2)
        fmulp   %st, %st(1)

        faddp   %st, %st(6)
        faddp   %st, %st(4)
        faddp   %st, %st(2)

        /* k-step 3 */
        FLD     -2 * SIZE(AO)

        FLD     -2 * SIZE(BO)
        fld     %st(1)
        fmul    %st(1), %st
        faddp   %st, %st(3)

        FLD     -1 * SIZE(BO)
        fmul    %st, %st(2)

        FLD     -1 * SIZE(AO)
        fmul    %st, %st(2)
        fmulp   %st, %st(1)

        faddp   %st, %st(6)
        faddp   %st, %st(4)
        faddp   %st, %st(2)

        addl    $8 * SIZE, AO
        addl    $8 * SIZE, BO

        decl    %eax
        jne     .L12
        ALIGN_4

.L15:
        /* Remaining (trip count mod 4) k-steps. */
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        andl    $3, %eax
        je      .L18
        ALIGN_4

.L16:
        FLD     -8 * SIZE(AO)

        FLD     -8 * SIZE(BO)
        fld     %st(1)
        fmul    %st(1), %st
        faddp   %st, %st(3)

        FLD     -7 * SIZE(BO)
        fmul    %st, %st(2)

        FLD     -7 * SIZE(AO)
        fmul    %st, %st(2)
        fmulp   %st, %st(1)

        faddp   %st, %st(6)
        faddp   %st, %st(4)
        faddp   %st, %st(2)

        addl    $2 * SIZE, AO
        addl    $2 * SIZE, BO

        decl    %eax
        jne     .L16
        ALIGN_4

.L18:
#if defined(LN) || defined(RT)
        /* Point AO/BO at the 2x2 block at k = KK - 2.  The tile is 2 wide
           in both directions, so LN and RT use the same step here. */
        movl    KK, %eax
        subl    $2, %eax

        sall    $BASE_SHIFT, %eax

        movl    AORIG, AO
        leal    (AO, %eax, 2), AO
        leal    (B,  %eax, 2), BO
#endif

        /* Form rhs - accumulated product; rhs comes from the packed B
           block (LN/LT) or the packed A block (RN/RT). */
#if defined(LN) || defined(LT)
        FLD     -8 * SIZE(BO)
        fsubp   %st, %st(1)             /* c0 */
        FLD     -7 * SIZE(BO)
        fsubp   %st, %st(2)             /* c1 */
        FLD     -6 * SIZE(BO)
        fsubp   %st, %st(3)             /* c2 */
        FLD     -5 * SIZE(BO)
        fsubp   %st, %st(4)             /* c3 */
#else
        FLD     -8 * SIZE(AO)
        fsubp   %st, %st(1)             /* c0 */
        FLD     -7 * SIZE(AO)
        fsubp   %st, %st(3)             /* c2 */
        FLD     -6 * SIZE(AO)
        fsubp   %st, %st(2)             /* c1 */
        FLD     -5 * SIZE(AO)
        fsubp   %st, %st(4)             /* c3 */
#endif

#ifdef LN
        /* Second row first, then eliminate it from the first row. */
        FLD     -5 * SIZE(AO)
        fmul    %st, %st(3)             /* c2 *= A[-5] */
        fmulp   %st, %st(4)             /* c3 *= A[-5] */

        FLD     -6 * SIZE(AO)
        fmul    %st(3), %st             /* t0 = A[-6] * c2 */
        FLD     -6 * SIZE(AO)
        fmul    %st(5), %st             /* t1 = A[-6] * c3 */

        fsubrp  %st, %st(3)             /* c1 -= t1 */
        fsubrp  %st, %st(1)             /* c0 -= t0 */

        FLD     -8 * SIZE(AO)
        fmul    %st, %st(1)             /* c0 *= A[-8] */
        fmulp   %st, %st(2)             /* c1 *= A[-8] */
#endif

#ifdef LT
        /* First row first, then eliminate it from the second row. */
        FLD     -8 * SIZE(AO)
        fmul    %st, %st(1)             /* c0 *= A[-8] */
        fmulp   %st, %st(2)             /* c1 *= A[-8] */

        FLD     -7 * SIZE(AO)
        fmul    %st(1), %st             /* t0 = A[-7] * c0 */
        FLD     -7 * SIZE(AO)
        fmul    %st(3), %st             /* t1 = A[-7] * c1 */

        fsubrp  %st, %st(5)             /* c3 -= t1 */
        fsubrp  %st, %st(3)             /* c2 -= t0 */

        FLD     -5 * SIZE(AO)
        fmul    %st, %st(3)             /* c2 *= A[-5] */
        fmulp   %st, %st(4)             /* c3 *= A[-5] */
#endif

#ifdef RN
        /* First column first, then eliminate it from the second. */
        FLD     -8 * SIZE(BO)
        fmul    %st, %st(1)             /* c0 *= B[-8] */
        fmulp   %st, %st(3)             /* c2 *= B[-8] */

        FLD     -7 * SIZE(BO)
        fmul    %st(1), %st             /* t0 = B[-7] * c0 */
        FLD     -7 * SIZE(BO)
        fmul    %st(4), %st             /* t1 = B[-7] * c2 */

        fsubrp  %st, %st(5)             /* c3 -= t1 */
        fsubrp  %st, %st(2)             /* c1 -= t0 */

        FLD     -5 * SIZE(BO)
        fmul    %st, %st(2)             /* c1 *= B[-5] */
        fmulp   %st, %st(4)             /* c3 *= B[-5] */
#endif

#ifdef RT
        /* Second column first, then eliminate it from the first. */
        FLD     -5 * SIZE(BO)
        fmul    %st, %st(2)             /* c1 *= B[-5] */
        fmulp   %st, %st(4)             /* c3 *= B[-5] */

        FLD     -6 * SIZE(BO)
        fmul    %st(2), %st             /* t0 = B[-6] * c1 */
        FLD     -6 * SIZE(BO)
        fmul    %st(5), %st             /* t1 = B[-6] * c3 */

        fsubrp  %st, %st(4)             /* c2 -= t1 */
        fsubrp  %st, %st(1)             /* c0 -= t0 */

        FLD     -8 * SIZE(BO)
        fmul    %st, %st(1)             /* c0 *= B[-8] */
        fmulp   %st, %st(3)             /* c2 *= B[-8] */
#endif

#ifdef LN
        subl    $2 * SIZE, CO
#endif

        /* Write the solved tile back into the packed panel (a duplicate
           is stored each time), then drain the x87 stack into C. */
#if defined(LN) || defined(LT)
        fld     %st
        FST     -8 * SIZE(BO)           /* c0 */
        fxch    %st(1)
        fld     %st
        FST     -7 * SIZE(BO)           /* c1 */
        fxch    %st(2)
        fld     %st
        FST     -6 * SIZE(BO)           /* c2 */
        fxch    %st(3)
        fld     %st
        FST     -5 * SIZE(BO)           /* c3 */

        FST     1 * SIZE(CO, LDC)       /* c3 */
        FST     0 * SIZE(CO)            /* c0 */
        FST     0 * SIZE(CO, LDC)       /* c1 */
        FST     1 * SIZE(CO)            /* c2 */
#else
        fld     %st
        FST     -8 * SIZE(AO)           /* c0 */
        fxch    %st(2)
        fld     %st
        FST     -7 * SIZE(AO)           /* c2 */
        fxch    %st(1)
        fld     %st
        FST     -6 * SIZE(AO)           /* c1 */
        fxch    %st(3)
        fld     %st
        FST     -5 * SIZE(AO)           /* c3 */

        FST     1 * SIZE(CO, LDC)       /* c3 */
        FST     1 * SIZE(CO)            /* c2 */
        FST     0 * SIZE(CO)            /* c0 */
        FST     0 * SIZE(CO, LDC)       /* c1 */
#endif

#ifndef LN
        addl    $2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
        /* Skip the K - KK k-steps that were not consumed. */
        movl    K, %eax
        subl    KK, %eax
        sall    $BASE_SHIFT, %eax
        leal    (AO, %eax, 2), AO
        leal    (BO, %eax, 2), BO
#endif

#ifdef LN
        subl    $2, KK
#endif

#ifdef LT
        addl    $2, KK
#endif

#ifdef RT
        movl    K, %eax
        sall    $1 + BASE_SHIFT, %eax
        addl    %eax, AORIG
#endif

        decl    I
        jne     .L11
        ALIGN_4

/* ---- 1 row x 2 columns (M odd) ------------------------------------ */
.L20:
        movl    M, %eax
        andl    $1, %eax
        je      .L29
        ALIGN_4

.L21:
#ifdef LN
        movl    K, %eax
        sall    $0 + BASE_SHIFT, %eax
        subl    %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl    KK, %eax
        sall    $BASE_SHIFT, %eax
        movl    AORIG, AO
        leal    (AO, %eax, 1), AO
        leal    (B,  %eax, 2), BO
#else
        movl    B, BO
#endif

        /* Two accumulators: c0 = sum a*b0, c1 = sum a*b1. */
        fldz
        fldz

#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        sarl    $2, %eax
        je      .L25
        ALIGN_4

.L22:
        PREFETCH        (PREFETCHSIZE + 0) * SIZE(AO)

        /* k-step 0 */
        FLD     -8 * SIZE(AO)

        FLD     -8 * SIZE(BO)
        fmul    %st(1), %st
        faddp   %st, %st(2)             /* c0 += a * b0 */

        FLD     -7 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)             /* c1 += a * b1 */

        /* k-step 1 */
        FLD     -7 * SIZE(AO)

        FLD     -6 * SIZE(BO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -5 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        /* k-step 2 */
        FLD     -6 * SIZE(AO)

        FLD     -4 * SIZE(BO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -3 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        /* k-step 3 */
        FLD     -5 * SIZE(AO)

        FLD     -2 * SIZE(BO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -1 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        addl    $4 * SIZE, AO
        addl    $8 * SIZE, BO

        decl    %eax
        jne     .L22
        ALIGN_4

.L25:
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        andl    $3, %eax
        je      .L28
        ALIGN_4

.L26:
        FLD     -8 * SIZE(AO)

        FLD     -8 * SIZE(BO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -7 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        addl    $1 * SIZE, AO
        addl    $2 * SIZE, BO

        decl    %eax
        jne     .L26
        ALIGN_4

.L28:
#if defined(LN) || defined(RT)
        /* Tile is 1 row by 2 columns: LN steps back 1, RT steps back 2. */
        movl    KK, %eax
#ifdef LN
        subl    $1, %eax
#else
        subl    $2, %eax
#endif

        sall    $BASE_SHIFT, %eax

        movl    AORIG, AO
        leal    (AO, %eax, 1), AO
        leal    (B,  %eax, 2), BO
#endif

#if defined(LN) || defined(LT)
        FLD     -8 * SIZE(BO)
        fsubp   %st, %st(1)             /* c0 */
        FLD     -7 * SIZE(BO)
        fsubp   %st, %st(2)             /* c1 */
#else
        FLD     -8 * SIZE(AO)
        fsubp   %st, %st(1)             /* c0 */
        FLD     -7 * SIZE(AO)
        fsubp   %st, %st(2)             /* c1 */
#endif

#if defined(LN) || defined(LT)
        FLD     -8 * SIZE(AO)
        fmul    %st, %st(1)             /* c0 *= A[-8] */
        fmulp   %st, %st(2)             /* c1 *= A[-8] */
#endif

#ifdef RN
        FLD     -8 * SIZE(BO)
        fmulp   %st, %st(1)             /* c0 *= B[-8] */

        FLD     -7 * SIZE(BO)
        fmul    %st(1), %st             /* t = B[-7] * c0 */

        fsubrp  %st, %st(2)             /* c1 -= t */

        FLD     -5 * SIZE(BO)
        fmulp   %st, %st(2)             /* c1 *= B[-5] */
#endif

#ifdef RT
        FLD     -5 * SIZE(BO)
        fmulp   %st, %st(2)             /* c1 *= B[-5] */

        FLD     -6 * SIZE(BO)
        fmul    %st(2), %st             /* t = B[-6] * c1 */

        fsubrp  %st, %st(1)             /* c0 -= t */

        FLD     -8 * SIZE(BO)
        fmulp   %st, %st(1)             /* c0 *= B[-8] */
#endif

#ifdef LN
        subl    $1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
        fld     %st
        FST     -8 * SIZE(BO)           /* c0 */
        fxch    %st(1)
        fld     %st
        FST     -7 * SIZE(BO)           /* c1 */
#else
        fld     %st
        FST     -8 * SIZE(AO)           /* c0 */
        fxch    %st(1)
        fld     %st
        FST     -7 * SIZE(AO)           /* c1 */
#endif

        FST     0 * SIZE(CO, LDC)       /* c1 */
        FST     0 * SIZE(CO)            /* c0 */

#ifndef LN
        addl    $1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        sall    $BASE_SHIFT, %eax
        leal    (AO, %eax, 1), AO
        leal    (BO, %eax, 2), BO
#endif

#ifdef LN
        subl    $1, KK
#endif

#ifdef LT
        addl    $1, KK
#endif

#ifdef RT
        movl    K, %eax
        sall    $0 + BASE_SHIFT, %eax
        addl    %eax, AORIG
#endif
        ALIGN_4

/* ---- end of one column pair --------------------------------------- */
.L29:
#ifdef LN
        /* Advance B by one 2-column panel (2 * K elements). */
        movl    K, %eax
        sall    $BASE_SHIFT, %eax
        leal    (B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
        movl    BO, B
#endif

#ifdef RN
        addl    $2, KK
#endif

#ifdef RT
        subl    $2, KK
#endif

        decl    J
        jne     .L01
        ALIGN_4

/* ------------------------------------------------------------------ */
/* Single trailing column (N odd).                                     */
/* ------------------------------------------------------------------ */
.L30:
        movl    N, %eax
        testl   $1, %eax
        je      .L999

#if defined(LT) || defined(RN)
        movl    A, AO
#else
        movl    A, %eax
        movl    %eax, AORIG
#endif

#ifdef RT
        /* Step B back by one 1-column panel (K elements). */
        movl    K, %eax
        sall    $0 + BASE_SHIFT, %eax
        subl    %eax, B
#endif

#ifdef RT
        subl    LDC, C
#endif
        movl    C, CO
#ifndef RT
        addl    LDC, C
#endif

#ifdef LN
        movl    OFFSET, %eax
        addl    M, %eax
        movl    %eax, KK                /* KK = OFFSET + M */
#endif

#ifdef LT
        movl    OFFSET, %eax
        movl    %eax, KK                /* KK = OFFSET */
#endif

        movl    M, I
        sarl    $1, I
        je      .L40
        ALIGN_4

/* ---- 2 rows x 1 column -------------------------------------------- */
.L31:
#ifdef LN
        movl    K, %eax
        sall    $1 + BASE_SHIFT, %eax
        subl    %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl    KK, %eax
        sall    $BASE_SHIFT, %eax
        movl    AORIG, AO
        leal    (AO, %eax, 2), AO
        leal    (B,  %eax, 1), BO
#else
        movl    B, BO
#endif

        /* Two accumulators: c0 = sum a0*b, c1 = sum a1*b. */
        fldz
        fldz

#if defined(HAVE_3DNOW)
        prefetchw       2 * SIZE(CO)
#elif defined(HAVE_SSE)
        prefetchnta     2 * SIZE(CO)
#endif

#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        sarl    $2, %eax
        je      .L35
        ALIGN_4

.L32:
        PREFETCH        (PREFETCHSIZE + 0) * SIZE(AO)

        /* k-step 0 */
        FLD     -8 * SIZE(BO)
        FLD     -8 * SIZE(AO)
        fmul    %st(1), %st
        faddp   %st, %st(2)             /* c0 += a0 * b */

        FLD     -7 * SIZE(AO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)             /* c1 += a1 * b */

        /* k-step 1 */
        FLD     -7 * SIZE(BO)
        FLD     -6 * SIZE(AO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -5 * SIZE(AO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        /* k-step 2 */
        FLD     -6 * SIZE(BO)
        FLD     -4 * SIZE(AO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -3 * SIZE(AO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        /* k-step 3 */
        FLD     -5 * SIZE(BO)
        FLD     -2 * SIZE(AO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -1 * SIZE(AO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        addl    $8 * SIZE, AO
        addl    $4 * SIZE, BO

        decl    %eax
        jne     .L32
        ALIGN_4

.L35:
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        andl    $3, %eax
        je      .L38
        ALIGN_4

.L36:
        FLD     -8 * SIZE(BO)

        FLD     -8 * SIZE(AO)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FLD     -7 * SIZE(AO)
        fmulp   %st, %st(1)
        faddp   %st, %st(2)

        addl    $2 * SIZE, AO
        addl    $1 * SIZE, BO

        decl    %eax
        jne     .L36
        ALIGN_4

.L38:
#if defined(LN) || defined(RT)
        /* Tile is 2 rows by 1 column: LN steps back 2, RT steps back 1. */
        movl    KK, %eax
#ifdef LN
        subl    $2, %eax
#else
        subl    $1, %eax
#endif

        sall    $BASE_SHIFT, %eax

        movl    AORIG, AO
        leal    (AO, %eax, 2), AO
        leal    (B,  %eax, 1), BO
#endif

#if defined(LN) || defined(LT)
        FLD     -8 * SIZE(BO)
        fsubp   %st, %st(1)             /* c0 */
        FLD     -7 * SIZE(BO)
        fsubp   %st, %st(2)             /* c1 */
#else
        FLD     -8 * SIZE(AO)
        fsubp   %st, %st(1)             /* c0 */
        FLD     -7 * SIZE(AO)
        fsubp   %st, %st(2)             /* c1 */
#endif

#ifdef LN
        FLD     -5 * SIZE(AO)
        fmulp   %st, %st(2)             /* c1 *= A[-5] */

        FLD     -6 * SIZE(AO)
        fmul    %st(2), %st             /* t = A[-6] * c1 */

        fsubrp  %st, %st(1)             /* c0 -= t */
        FLD     -8 * SIZE(AO)
        fmulp   %st, %st(1)             /* c0 *= A[-8] */
#endif

#ifdef LT
        FLD     -8 * SIZE(AO)
        fmulp   %st, %st(1)             /* c0 *= A[-8] */

        FLD     -7 * SIZE(AO)
        fmul    %st(1), %st             /* t = A[-7] * c0 */

        fsubrp  %st, %st(2)             /* c1 -= t */

        FLD     -5 * SIZE(AO)
        fmulp   %st, %st(2)             /* c1 *= A[-5] */
#endif

#ifdef RN
        FLD     -8 * SIZE(BO)
        fmul    %st, %st(1)             /* c0 *= B[-8] */
        fmulp   %st, %st(2)             /* c1 *= B[-8] */
#endif

#ifdef RT
        FLD     -8 * SIZE(BO)
        fmul    %st, %st(1)             /* c0 *= B[-8] */
        fmulp   %st, %st(2)             /* c1 *= B[-8] */
#endif

#ifdef LN
        subl    $2 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
        fld     %st
        FST     -8 * SIZE(BO)           /* c0 */
        fxch    %st(1)
        fld     %st
        FST     -7 * SIZE(BO)           /* c1 */
#else
        fld     %st
        FST     -8 * SIZE(AO)           /* c0 */
        fxch    %st(1)
        fld     %st
        FST     -7 * SIZE(AO)           /* c1 */
#endif

        FST     1 * SIZE(CO)            /* c1 */
        FST     0 * SIZE(CO)            /* c0 */

#ifndef LN
        addl    $2 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        sall    $BASE_SHIFT, %eax
        leal    (AO, %eax, 2), AO
        leal    (BO, %eax, 1), BO
#endif

#ifdef LN
        subl    $2, KK
#endif

#ifdef LT
        addl    $2, KK
#endif

#ifdef RT
        movl    K, %eax
        sall    $1 + BASE_SHIFT, %eax
        addl    %eax, AORIG
#endif

        decl    I
        jne     .L31
        ALIGN_4

/* ---- 1 row x 1 column (M odd, N odd) ------------------------------ */
.L40:
        movl    M, %eax
        andl    $1, %eax
        je      .L49
        ALIGN_4

.L41:
#ifdef LN
        movl    K, %eax
        sall    $0 + BASE_SHIFT, %eax
        subl    %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl    KK, %eax
        sall    $BASE_SHIFT, %eax
        movl    AORIG, AO
        leal    (AO, %eax, 1), AO
        leal    (B,  %eax, 1), BO
#else
        movl    B, BO
#endif

        fldz                            /* single accumulator */

#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        sarl    $2, %eax
        je      .L45
        ALIGN_4

.L42:
        PREFETCH        (PREFETCHSIZE + 0) * SIZE(AO)

        FLD     -8 * SIZE(AO)
        FLD     -8 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)

        FLD     -7 * SIZE(AO)
        FLD     -7 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)

        FLD     -6 * SIZE(AO)
        FLD     -6 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)

        FLD     -5 * SIZE(AO)
        FLD     -5 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)

        addl    $4 * SIZE, AO
        addl    $4 * SIZE, BO

        decl    %eax
        jne     .L42
        ALIGN_4

.L45:
#if defined(LT) || defined(RN)
        movl    KK, %eax
#else
        movl    K, %eax
        subl    KK, %eax
#endif
        andl    $3, %eax
        je      .L48
        ALIGN_4

.L46:
        FLD     -8 * SIZE(AO)

        FLD     -8 * SIZE(BO)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)

        addl    $1 * SIZE, AO
        addl    $1 * SIZE, BO

        decl    %eax
        jne     .L46
        ALIGN_4

.L48:
#if defined(LN) || defined(RT)
        /* 1x1 tile: LN and RT both step back by one k-step. */
        movl    KK, %eax
        subl    $1, %eax

        sall    $BASE_SHIFT, %eax

        movl    AORIG, AO
        leal    (AO, %eax, 1), AO
        leal    (B,  %eax, 1), BO
#endif

#if defined(LN) || defined(LT)
        FLD     -8 * SIZE(BO)
        fsubp   %st, %st(1)
#else
        FLD     -8 * SIZE(AO)
        fsubp   %st, %st(1)
#endif

#ifdef LN
        FLD     -8 * SIZE(AO)
        fmulp   %st, %st(1)
#endif

#ifdef LT
        FLD     -8 * SIZE(AO)
        fmulp   %st, %st(1)
#endif

#ifdef RN
        FLD     -8 * SIZE(BO)
        fmulp   %st, %st(1)
#endif

#ifdef RT
        FLD     -8 * SIZE(BO)
        fmulp   %st, %st(1)
#endif

#ifdef LN
        subl    $1 * SIZE, CO
#endif

#if defined(LN) || defined(LT)
        fld     %st
        FST     -8 * SIZE(BO)
#else
        fld     %st
        FST     -8 * SIZE(AO)
#endif

        FST     0 * SIZE(CO)

#ifndef LN
        addl    $1 * SIZE, CO
#endif

#if defined(LT) || defined(RN)
        movl    K, %eax
        subl    KK, %eax
        sall    $BASE_SHIFT, %eax
        leal    (AO, %eax, 1), AO
        leal    (BO, %eax, 1), BO
#endif

#ifdef LN
        subl    $1, KK
#endif

#ifdef LT
        addl    $1, KK
#endif

#ifdef RT
        movl    K, %eax
        sall    $0 + BASE_SHIFT, %eax
        addl    %eax, AORIG
#endif
        ALIGN_4

/* ---- end of the trailing column ----------------------------------- */
.L49:
#ifdef LN
        movl    K, %eax
        sall    $BASE_SHIFT, %eax
        leal    (B, %eax, 1), B
#endif

#if defined(LT) || defined(RN)
        movl    BO, B
#endif

#ifdef RN
        addl    $1, KK
#endif

#ifdef RT
        subl    $1, KK
#endif
        ALIGN_4

/* Restore the saved registers, drop the local frame and return. */
.L999:
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        addl    $ARGS, %esp
        ret

        EPILOGUE