/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * 2x2 register-blocked GEMM / TRMM micro-kernel for 32-bit x86 using the
 * x87 FPU register stack.  AT&T syntax, run through the C preprocessor.
 *
 * Work performed (as visible in this file):
 *   - N is processed two columns at a time (loop counter J = N >> 1),
 *     then one leftover column if N is odd.
 *   - Inside each column panel M is processed two rows at a time
 *     (loop counter I = M >> 1), then one leftover row if M is odd.
 *   - For every tile the K-long sums of products A*B are accumulated on
 *     the x87 stack (k loop unrolled by 4, plus a remainder loop).
 *   - Without TRMMKERNEL the sums are multiplied by ALPHA and added to
 *     the values already in C.
 *   - With TRMMKERNEL the raw sums are stored to C; ALPHA is not applied
 *     and C is not read.
 *     NOTE(review): presumably the TRMM caller handles the alpha scaling
 *     itself -- confirm against the driver.
 *
 * Data layout implied by the pointer arithmetic:
 *   - A is read 2 consecutive elements per k step in the 2-row tiles and
 *     1 element per k step in the 1-row tiles.
 *   - B is read 2 consecutive elements per k step in the 2-column tiles
 *     and 1 element per k step in the 1-column tiles.
 *   - Both A and B are biased by +8 elements on entry, so every load in
 *     the inner loops uses offsets -8 .. -1 (in elements).
 *
 * Stack frame after the prologue (offsets from %esp):
 *    0 .. 15   saved %ebx, %esi, %edi, %ebp      (STACK bytes)
 *   16 .. 31   local scratch: J, KK, KKK         (ARGS bytes)
 *   32         return address
 *   36 ..      caller-pushed arguments: M, N, K, ALPHA, A, B, C, LDC,
 *              OFFSET
 *
 * PROLOGUE, PROFCODE, EPILOGUE, ALIGN_4, FLD, FST, SIZE and BASE_SHIFT
 * come from common.h and are not visible here.  The x87 stack balance of
 * this file only works out if FLD pushes one value and FST stores st(0)
 * and pops it -- TODO confirm in common.h.
 *
 * The four callee-saved registers used (%ebx, %esi, %edi, %ebp) are
 * pushed on entry and popped in reverse order at .L999.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch instruction selection per CPU family. */
#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#else
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* Prefetch distance ahead of AO, in elements. */
#define PREFETCHSIZE	(5 + 4 * 10)

/* STACK: bytes taken by the four register pushes.                    */
/* ARGS:  bytes of local scratch reserved with "subl $ARGS, %esp".    */
#define STACK	16
#define ARGS	16

/* Local scratch slots (inside the ARGS area). */
/* J:   remaining 2-column panels.                                    */
/* KK:  running k offset used only by the TRMMKERNEL paths.           */
/* KKK: k trip count of the current tile in the TRMMKERNEL paths.     */
#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)

/* Incoming arguments (above the return address). */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

/* Register roles. */
/* I:   remaining 2-row tiles in the current column panel.            */
/* B:   start of the current B panel (biased by 8 elements).          */
/* CO:  current output position in C.                                 */
/* AO:  current read position in A (biased by 8 elements).            */
/* BO:  current read position in B (biased by 8 elements).            */
/* LDC: column stride of C, converted to bytes below.                 */
#define I	%esi
#define B	%ebx
#define CO	%edi
#define AO	%edx
#define BO	%ecx
#define LDC	%ebp

#define PREFETCH_OFFSET 48

	PROLOGUE

	subl	$ARGS, %esp		/* Generate Stack Frame */

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(TRMMKERNEL) && !defined(LEFT)
	/* Right-side TRMM: KK starts at -OFFSET and grows per column panel. */
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK
#endif

	movl	ARG_LDC, LDC
	movl	ARG_B, B

	/* Bias A (in its stack slot) and B by 8 elements; all inner-loop  */
	/* loads compensate with negative displacements.                   */
	addl	$8 * SIZE, A
	addl	$8 * SIZE, B

	sall	$BASE_SHIFT, LDC	/* LDC: elements -> bytes */

	movl	N, %eax
	sarl	$1, %eax
	movl	%eax, J			/* J = N / 2; movl leaves the flags of sarl intact */
	je	.L30			/* no full 2-column panel */
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Outer loop: one 2-column panel of B / C per iteration.             */
/* ------------------------------------------------------------------ */
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	/* Left-side TRMM: KK restarts from OFFSET for every column panel. */
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	A, AO			/* restart at the top of A */

	movl	C, CO			/* CO = this panel of C */
	lea	(, LDC, 2), %eax
	addl	%eax, C			/* C += 2 columns for the next panel */

	movl	M, I
	sarl	$1, I			/* I = M / 2 */
	je	.L20			/* no full 2-row tile */
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 2x2 tile.                                                          */
/* ------------------------------------------------------------------ */
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip the first KK k steps: 2 elements of A and 2 of B per step. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	/* Four zeroed accumulators.  At store time st(0)..st(3) go to     */
	/* C[0], C[1], C[LDC], C[LDC + 1]; call them c00, c10, c01, c11.   */
	fldz
	fldz
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
	prefetchw	2 * SIZE(CO, LDC, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
	prefetchnta	2 * SIZE(CO, LDC, 1)
#endif

	/* %eax = number of k steps for this tile. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$2, %eax		/* unrolled loop runs (k steps) / 4 times */
	je	.L15
	ALIGN_4

/* 2x2 inner loop, 4 k steps per iteration.  Each step peaks at all   */
/* eight x87 registers: 4 accumulators + 4 temporaries.               */
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	-8 * SIZE(AO)		/* a0 | c00 c10 c01 c11 */

	FLD	-8 * SIZE(BO)		/* b0 a0 | ... */
	fld	%st(1)			/* a0 b0 a0 | ... */
	fmul	%st(1), %st		/* a0*b0 b0 a0 | ... */
	faddp	%st, %st(3)		/* c00 += a0*b0;  b0 a0 | ... */

	FLD	-7 * SIZE(BO)		/* b1 b0 a0 | ... */
	fmul	%st, %st(2)		/* b1 b0 a0*b1 | ... */

	FLD	-7 * SIZE(AO)		/* a1 b1 b0 a0*b1 | ... */
	fmul	%st, %st(2)		/* a1 b1 a1*b0 a0*b1 | ... */
	fmulp	%st, %st(1)		/* a1*b1 a1*b0 a0*b1 | ... */

	faddp	%st, %st(6)		/* c11 += a1*b1 */
	faddp	%st, %st(3)		/* c10 += a1*b0 */
	faddp	%st, %st(3)		/* c01 += a0*b1;  back to 4 accumulators */

	/* k + 1: same sequence on the next A/B pair */
	FLD	-6 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-5 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(3)
	faddp	%st, %st(3)

	PREFETCH	(PREFETCHSIZE + 4) * SIZE(AO)

	/* k + 2 */
	FLD	-4 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-3 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(3)
	faddp	%st, %st(3)

	/* k + 3 */
	FLD	-2 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-1 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(3)
	faddp	%st, %st(3)

	addl	$8 * SIZE,AO		/* 4 k steps * 2 elements */
	addl	$8 * SIZE,BO

	decl	%eax
	jne	.L12
	ALIGN_4

/* 2x2 remainder: (k steps) mod 4, one step per iteration. */
.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	and	$3, %eax
	je	.L18
	ALIGN_4

.L16:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fld	%st(1)
	fmul	%st(1), %st
	faddp	%st, %st(3)

	FLD	-7 * SIZE(BO)
	fmul	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmul	%st, %st(2)
	fmulp	%st, %st(1)

	faddp	%st, %st(6)
	faddp	%st, %st(3)
	faddp	%st, %st(3)

	addl	$2 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	.L16
	ALIGN_4

/* 2x2 write-back. */
.L18:
#ifndef TRMMKERNEL
	FLD	ALPHA			/* alpha | c00 c10 c01 c11 */

	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)		/* scale all four sums and drop alpha */

	FLD	0 * SIZE(CO)
	faddp	%st, %st(1)		/* C[0] + alpha * c00 */
	FST	0 * SIZE(CO)

	FLD	1 * SIZE(CO)
	faddp	%st, %st(1)
	FST	1 * SIZE(CO)

	FLD	0 * SIZE(CO, LDC)
	faddp	%st, %st(1)
	FST	0 * SIZE(CO, LDC)

	FLD	1 * SIZE(CO, LDC)
	faddp	%st, %st(1)
	FST	1 * SIZE(CO, LDC)
#else
	/* TRMM: store the raw sums; ALPHA is not applied on this path. */
	FST	0 * SIZE(CO)
	FST	1 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
	FST	1 * SIZE(CO, LDC)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Only KKK of the K steps were consumed; skip the remaining ones. */
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK			/* two more rows done */
#endif

	addl	$2 * SIZE, CO
	decl	I
	jne	.L11
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 1x2 tile: leftover row when M is odd.                              */
/* ------------------------------------------------------------------ */
.L20:
	movl	M, %eax
	andl	$1, %eax
	je	.L29
	ALIGN_4

.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip KK k steps: 1 element of A and 2 of B per step. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	( B, %eax, 2), BO
#endif

	/* Two accumulators: st(0) -> C[0], st(1) -> C[LDC]. */
	fldz
	fldz

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax		/* tile is 1 row high */
#else
	addl	$2, %eax		/* tile is 2 columns wide */
#endif
	movl	%eax, KKK
#endif
	sarl	$2, %eax
	je	.L25
	ALIGN_4

/* 1x2 inner loop, 4 k steps per iteration. */
.L22:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	-8 * SIZE(AO)		/* a | c0 c1 */

	FLD	-8 * SIZE(BO)		/* b0 a | c0 c1 */
	fmul	%st(1), %st		/* a*b0 a | c0 c1 */
	faddp	%st, %st(2)		/* c0 += a*b0;  a | c0 c1 */

	FLD	-7 * SIZE(BO)		/* b1 a | c0 c1 */
	fmulp	%st, %st(1)		/* a*b1 | c0 c1 */
	faddp	%st, %st(2)		/* c1 += a*b1 */

	/* k + 1 */
	FLD	-7 * SIZE(AO)

	FLD	-6 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 2 */
	FLD	-6 * SIZE(AO)

	FLD	-4 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 3 */
	FLD	-5 * SIZE(AO)

	FLD	-2 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$4 * SIZE,AO		/* 4 k steps * 1 element */
	addl	$8 * SIZE,BO		/* 4 k steps * 2 elements */

	decl	%eax
	jne	.L22
	ALIGN_4

/* 1x2 remainder. */
.L25:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	and	$3, %eax
	je	.L28
	ALIGN_4

.L26:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$1 * SIZE,AO
	addl	$2 * SIZE,BO

	decl	%eax
	jne	.L26
	ALIGN_4

/* 1x2 write-back. */
.L28:
#ifndef TRMMKERNEL
	FLD	ALPHA

	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	0 * SIZE(CO)
	faddp	%st, %st(1)
	FST	0 * SIZE(CO)

	FLD	0 * SIZE(CO, LDC)
	faddp	%st, %st(1)
	FST	0 * SIZE(CO, LDC)
#else
	/* TRMM: store the raw sums; ALPHA is not applied on this path. */
	FST	0 * SIZE(CO)
	FST	0 * SIZE(CO, LDC)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$1 * SIZE, CO
	ALIGN_4

/* End of one 2-column panel. */
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK			/* two more columns done */
#endif

	movl	BO, B			/* next B panel starts where BO stopped */
	decl	J
	jne	.L01
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Leftover single column when N is odd.                              */
/* ------------------------------------------------------------------ */
.L30:
	movl	N, %eax
	testl	$1, %eax
	je	.L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	A, AO

	movl	C, CO
	addl	LDC, C

	movl	M, I
	sarl	$1, I
	je	.L40
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 2x1 tile.                                                          */
/* ------------------------------------------------------------------ */
.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip KK k steps: 2 elements of A and 1 of B per step. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	( B, %eax, 1), BO
#endif

	/* Two accumulators: st(0) -> C[0], st(1) -> C[1]. */
	fldz
	fldz

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(CO)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(CO)
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax		/* tile is 2 rows high */
#else
	addl	$1, %eax		/* tile is 1 column wide */
#endif
	movl	%eax, KKK
#endif
	sarl	$2, %eax
	je	.L35
	ALIGN_4

/* 2x1 inner loop, 4 k steps per iteration. */
.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	/* k + 0 */
	FLD	-8 * SIZE(BO)		/* b | c0 c1 */
	FLD	-8 * SIZE(AO)		/* a0 b | c0 c1 */
	fmul	%st(1), %st		/* a0*b b | c0 c1 */
	faddp	%st, %st(2)		/* c0 += a0*b;  b | c0 c1 */

	FLD	-7 * SIZE(AO)		/* a1 b | c0 c1 */
	fmulp	%st, %st(1)		/* a1*b | c0 c1 */
	faddp	%st, %st(2)		/* c1 += a1*b */

	/* k + 1 */
	FLD	-7 * SIZE(BO)
	FLD	-6 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-5 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 2 */
	FLD	-6 * SIZE(BO)
	FLD	-4 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-3 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	/* k + 3 */
	FLD	-5 * SIZE(BO)
	FLD	-2 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-1 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$8 * SIZE,AO		/* 4 k steps * 2 elements */
	addl	$4 * SIZE,BO		/* 4 k steps * 1 element */

	decl	%eax
	jne	.L32
	ALIGN_4

/* 2x1 remainder. */
.L35:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	and	$3, %eax
	je	.L38
	ALIGN_4

.L36:
	FLD	-8 * SIZE(BO)

	FLD	-8 * SIZE(AO)
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FLD	-7 * SIZE(AO)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	addl	$2 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	.L36
	ALIGN_4

/* 2x1 write-back. */
.L38:
#ifndef TRMMKERNEL
	FLD	ALPHA

	fmul	%st, %st(1)
	fmulp	%st, %st(2)

	FLD	0 * SIZE(CO)
	faddp	%st, %st(1)
	FST	0 * SIZE(CO)

	FLD	1 * SIZE(CO)
	faddp	%st, %st(1)
	FST	1 * SIZE(CO)
#else
	/* TRMM: store the raw sums; ALPHA is not applied on this path. */
	FST	0 * SIZE(CO)
	FST	1 * SIZE(CO)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, CO
	decl	I
	jne	.L31
	ALIGN_4

/* ------------------------------------------------------------------ */
/* 1x1 tile: leftover row of the leftover column.                     */
/* ------------------------------------------------------------------ */
.L40:
	movl	M, %eax
	andl	$1, %eax
	je	.L49
	ALIGN_4

.L41:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	B, BO
#else
	/* Skip KK k steps: 1 element of A and 1 of B per step. */
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	( B, %eax, 1), BO
#endif

	fldz				/* single accumulator */

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$2, %eax
	je	.L45
	ALIGN_4

/* 1x1 inner loop, 4 k steps per iteration. */
.L42:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	FLD	-8 * SIZE(AO)		/* a | c */
	FLD	-8 * SIZE(BO)		/* b a | c */
	fmulp	%st, %st(1)		/* a*b | c */
	faddp	%st, %st(1)		/* c += a*b */

	FLD	-7 * SIZE(AO)
	FLD	-7 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-6 * SIZE(AO)
	FLD	-6 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	-5 * SIZE(AO)
	FLD	-5 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addl	$4 * SIZE,AO
	addl	$4 * SIZE,BO

	decl	%eax
	jne	.L42
	ALIGN_4

/* 1x1 remainder. */
.L45:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	and	$3, %eax
	je	.L48
	ALIGN_4

.L46:
	FLD	-8 * SIZE(AO)

	FLD	-8 * SIZE(BO)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	addl	$1 * SIZE,AO
	addl	$1 * SIZE,BO

	decl	%eax
	jne	.L46
	ALIGN_4

/* 1x1 write-back. */
.L48:
#ifndef TRMMKERNEL
	FLD	ALPHA

	fmulp	%st, %st(1)

	FLD	0 * SIZE(CO)
	faddp	%st, %st(1)
	FST	0 * SIZE(CO)
#else
	/* TRMM: store the raw sum; ALPHA is not applied on this path. */
	FST	0 * SIZE(CO)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(AO, %eax, 1), AO
	leal	(BO, %eax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$1 * SIZE, CO
	ALIGN_4

/* End of the single-column panel. */
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$1, KK
#endif

	movl	BO, B
	ALIGN_4

/* Epilogue: restore callee-saved registers in reverse push order and */
/* release the local scratch area.                                    */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE