/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/*   1. Redistributions of source code must retain the above */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer. */
/* */
/*   2. Redistributions in binary form must reproduce the above */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer in the documentation and/or other materials */
/*      provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK   16
#define ARGS    16

#define J        0 + STACK(%esp)
#define BX       4 + STACK(%esp)
#define KK       8 + STACK(%esp)
#define KKK     12 + STACK(%esp)

#define M        4 + STACK + ARGS(%esp)
#define N        8 + STACK + ARGS(%esp)
#define K       12 + STACK + ARGS(%esp)
#define ALPHA   16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define A       24 + STACK + ARGS(%esp)
#define B       28 + STACK + ARGS(%esp)
#define C       32 + STACK + ARGS(%esp)
#define LDC     36 + STACK + ARGS(%esp)
#define OFFSET  40 + STACK + ARGS(%esp)
#else
#define A       20 + STACK + ARGS(%esp)
#define B       24 + STACK + ARGS(%esp)
#define C       28 + STACK + ARGS(%esp)
#define LDC     32 + STACK + ARGS(%esp)
#define OFFSET  36 + STACK + ARGS(%esp)
#endif

#define PREFETCH_OFFSET 48

#if defined(PENTIUM3) || defined(PENTIUMM)
#define REP rep
#else
#define REP rep
#endif

        PROLOGUE

        subl    $ARGS, %esp             # Generate Stack Frame

        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        PROFCODE

#if defined(TRMMKERNEL) && !defined(LEFT)
        movl    OFFSET, %eax
        negl    %eax
        movl    %eax, KK
#endif

        movl    N, %eax                 # j = (n >> 1)  # MEMORY
        movl    LDC, %ebp               # ldc           # MEMORY
        movl    B, %ebx

        sarl    $1, %eax
        leal    (, %ebp, SIZE), %ebp
        leal    0(%ecx), %ecx           # NOP
        movl    %eax, J                 # j = (n >> 1)  # MEMORY
        test    %eax, %eax
        je      .L8                     # if !(n >> 1) goto .L8
        ALIGN_4

.L34:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl    OFFSET, %eax
        movl    %eax, KK
#endif

        movl    %ebx, BX

        movl    M, %esi                 # m             # MEMORY
        movl    A, %edx                 # a             # MEMORY
        movl    C, %edi                 # C             # MEMORY
        sarl    $1, %esi                # i = (m >> 1)
        je      .L12
        ALIGN_4

.MainHead:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    %ebx, %ecx
#else
        movl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (%edx, %eax, 2), %edx
        leal    (%ebx, %eax, 2), %ecx
#endif

#ifdef HAVE_SSE
        movl    BX, %eax

        prefetcht2       0 * SIZE(%eax)
        prefetcht2       4 * SIZE(%eax)

#if L2_SIZE > 262144

        subl    $-8 * SIZE, BX

#elif L2_SIZE > 131072

        prefetcht2       8 * SIZE(%eax)
        prefetcht2      12 * SIZE(%eax)

        subl    $-16 * SIZE, BX
#else
        prefetcht2      16 * SIZE(%eax)
        prefetcht2      20 * SIZE(%eax)
        prefetcht2      24 * SIZE(%eax)
        prefetcht2      28 * SIZE(%eax)

        subl    $-32 * SIZE, BX
#endif
#endif

        fldz
        fldz

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $2, %eax
#else
        addl    $2, %eax
#endif
        movl    %eax, KKK
#endif
        fldz
        fldz

        FLD      4 * SIZE(%ecx)         # b5
        FLD      4 * SIZE(%edx)         # a5
        FLD      0 * SIZE(%ecx)         # b1
        FLD      0 * SIZE(%edx)         # a1

#if defined(HAVE_3DNOW)
        prefetchw       2 * SIZE(%edi)
        prefetchw       2 * SIZE(%edi, %ebp, 1)
#elif defined(HAVE_SSE)
        prefetchnta     2 * SIZE(%edi)
        prefetchnta     2 * SIZE(%edi, %ebp, 1)
#endif
        sarl    $2, %eax
        je      .L16
        ALIGN_4

.MainLoop:
#if defined(HAVE_3DNOW)
        prefetch        (PREFETCH_OFFSET) * SIZE(%ecx)
        nop
#elif defined(HAVE_SSE)
        prefetchnta     (PREFETCH_OFFSET) * SIZE(%ecx)
#ifdef CORE_KATMAI
        prefetcht0      (PREFETCH_OFFSET) * SIZE(%edx)
#endif
#endif

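# Four k iterations of the 2x2 update per pass: each k step multiplies two
# values of A (%edx) by two values of B (%ecx) and adds the four products
# into the four running sums held in the deeper x87 stack slots (the faddp
# targets), while the current and next operands rotate through the top of
# the stack via fxch.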
        fmul    %st, %st(1)
        FMUL     1 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(4)
        FLD      0 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(5)
        FLD      1 * SIZE(%edx)
        fmul    %st, %st(1)
        FMUL     1 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(6)
        FLD      2 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(7)
        FLD      2 * SIZE(%edx)

        fmul    %st, %st(1)
        FMUL     3 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(4)
        FLD      2 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(5)
        FLD      3 * SIZE(%edx)
        fmul    %st, %st(1)
        FMUL     3 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(6)
        FLD      8 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(7)
        FLD      8 * SIZE(%edx)
        fxch    %st(2)

#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE)
        prefetchnta     (PREFETCH_OFFSET + 4) * SIZE(%ecx)
#ifdef CORE_KATMAI
        prefetcht0      (PREFETCH_OFFSET + 4) * SIZE(%edx)
#endif
#endif

        fmul    %st, %st(3)
        FMUL     5 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(4)
        FLD      4 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(5)
        FLD      5 * SIZE(%edx)
        fmul    %st, %st(3)
        FMUL     5 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(6)
        FLD      6 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(7)
        FLD      6 * SIZE(%edx)

        fmul    %st, %st(3)
        FMUL     7 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(4)
        FLD      6 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(5)
        FLD      7 * SIZE(%edx)
        fmul    %st, %st(3)
        FMUL     7 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(6)
        FLD     12 * SIZE(%ecx)
        fxch    %st(3)
        faddp   %st, %st(7)
        FLD     12 * SIZE(%edx)
        fxch    %st(2)

        subl    $-8 * SIZE, %ecx
        subl    $-8 * SIZE, %edx
        decl    %eax                    # l --
        jne     .MainLoop
        ALIGN_4

.L16:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        and     $3, %eax
        je      .L21
        ALIGN_4

.SubLoop:
        fmul    %st, %st(1)
        FMUL     1 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(4)
        FLD      0 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(5)
        FLD      1 * SIZE(%edx)
        fmul    %st, %st(1)
        FMUL     1 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(6)
        FLD      2 * SIZE(%ecx)
        fxch    %st(1)
        faddp   %st, %st(7)
        FLD      2 * SIZE(%edx)

        addl    $2 * SIZE, %ecx
        addl    $2 * SIZE, %edx
        decl    %eax
        jne     .SubLoop
        ALIGN_4

.L21:
        ffreep  %st(0)
        ffreep  %st(0)
        ffreep  %st(0)
        ffreep  %st(0)

        FLD     ALPHA
        fmul    %st, %st(4)
        fmul    %st, %st(1)
        fmul    %st, %st(2)
        fmulp   %st, %st(3)

#ifndef TRMMKERNEL
        FADD    0 * SIZE(%edi)
        FST     0 * SIZE(%edi)
        FADD    0 * SIZE(%edi, %ebp)
        FST     0 * SIZE(%edi, %ebp)
        FADD    1 * SIZE(%edi)
        FST     1 * SIZE(%edi)
        FADD    1 * SIZE(%edi, %ebp)
        FST     1 * SIZE(%edi, %ebp)
#else
        FST     0 * SIZE(%edi)
        FST     0 * SIZE(%edi, %ebp)
        FST     1 * SIZE(%edi)
        FST     1 * SIZE(%edi, %ebp)
#endif

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (%edx, %eax, 2), %edx
        leal    (%ecx, %eax, 2), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $2, KK
#endif

        addl    $2 * SIZE, %edi
        rep
        decl    %esi                    # i --
        rep
        jne     .MainHead
        ALIGN_4

.L12:
        movl    M, %eax                 # m             # MEMORY
        andl    $1, %eax
        je      .L27

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    %ebx, %ecx
#else
        movl    KK, %eax
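# TRMM path for the single remaining row of A: advance A by KK elements
# (one value per k for this row) and B by 2 * KK elements (two values per
# k for the current pair of columns).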
        leal    (, %eax, SIZE), %eax
        leal    (%edx, %eax, 1), %edx
        leal    (%ebx, %eax, 2), %ecx
#endif
        fldz
        fldz

        FLD     0 * SIZE(%edx)          # temp1 = *(aoffset + 0)

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $1, %eax
#else
        addl    $2, %eax
#endif
        movl    %eax, KKK
#endif
        sarl    $1, %eax                # k >> 1        # MEMORY
        je      .L54
        ALIGN_4

.L55:
        FLD     0 * SIZE(%ecx)          # temp2 = *(boffset + 0)
        rep
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FMUL    1 * SIZE(%ecx)          # temp2 = *(boffset + 0)
        faddp   %st, %st(2)
        FLD     1 * SIZE(%edx)          # temp1 = *(aoffset + 0)

        FLD     2 * SIZE(%ecx)          # temp2 = *(boffset + 0)
        rep
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FMUL    3 * SIZE(%ecx)          # temp2 = *(boffset + 0)
        faddp   %st, %st(2)
        FLD     2 * SIZE(%edx)          # temp1 = *(aoffset + 0)

        addl    $2 * SIZE, %edx
        addl    $4 * SIZE, %ecx
        decl    %eax
        jne     .L55
        ALIGN_4

.L54:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        andl    $1, %eax                # k & 1
        je      .L33
        ALIGN_4

        FLD     0 * SIZE(%ecx)          # temp2 = *(boffset + 0)
        rep
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FMUL    1 * SIZE(%ecx)          # temp2 = *(boffset + 0)
        faddp   %st, %st(2)
        FLD     1 * SIZE(%edx)          # temp1 = *(aoffset + 0)

        addl    $1 * SIZE, %edx
        addl    $2 * SIZE, %ecx
        ALIGN_4

.L33:
        ffreep  %st(0)
        FLD     ALPHA

        fmul    %st, %st(2)
        fmulp   %st, %st(1)

#ifndef TRMMKERNEL
        FADD    (%edi)
        FST     (%edi)
        FADD    (%edi, %ebp)
        FST     (%edi, %ebp)
#else
        FST     (%edi)
        FST     (%edi, %ebp)
#endif

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (%edx, %eax, 1), %edx
        leal    (%ecx, %eax, 2), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $1, KK
#endif
        ALIGN_4

.L27:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl    $2, KK
#endif

        lea     (, %ebp, 2), %eax
        addl    %eax, C                 # C + 2 * ldc   # MEMORY
        movl    %ecx, %ebx              # b             # MEMORY
        decl    J                       # j --          # MEMORY
        jne     .L34
        ALIGN_4

.L8:
        movl    N, %eax                 # n             # MEMORY
        andl    $1, %eax
        je      .End

#if defined(TRMMKERNEL) && defined(LEFT)
        movl    OFFSET, %eax
        movl    %eax, KK
#endif

        movl    C, %edi                 # c             # MEMORY
        movl    A, %edx                 # a             # MEMORY

        movl    M, %esi                 # m             # MEMORY
        sarl    $1, %esi                # m >> 1
        je      .L36
        ALIGN_4

.L46:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    %ebx, %ecx
#else
        movl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (%edx, %eax, 2), %edx
        leal    (%ebx, %eax, 1), %ecx
#endif

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $2, %eax
#else
        addl    $1, %eax
#endif
        movl    %eax, KKK
#endif
        fldz
        sarl    $1, %eax
        fldz
        FLD     0 * SIZE(%ecx)          # temp1 = *(boffset + 0)

        je      .L56
        ALIGN_4
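# .L57 below handles the last column of B against two rows of A, with k
# unrolled by two: each pass multiplies one value of B by two values of A
# and adds the products into the two running sums on the x87 stack.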
.L57:
        FLD     0 * SIZE(%edx)          # temp2 = *(aoffset + 0)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FMUL    1 * SIZE(%edx)          # temp2 = *(aoffset + 0)
        faddp   %st, %st(2)
        FLD     1 * SIZE(%ecx)          # temp1 = *(boffset + 0)

        FLD     2 * SIZE(%edx)          # temp2 = *(aoffset + 0)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FMUL    3 * SIZE(%edx)          # temp2 = *(aoffset + 0)
        faddp   %st, %st(2)
        FLD     2 * SIZE(%ecx)          # temp1 = *(boffset + 0)

        addl    $4 * SIZE, %edx
        addl    $2 * SIZE, %ecx
        dec     %eax
        jne     .L57
        ALIGN_4

.L56:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        andl    $1, %eax
        je      .L45
        ALIGN_4

        FLD     0 * SIZE(%edx)          # temp2 = *(aoffset + 0)
        fmul    %st(1), %st
        faddp   %st, %st(2)

        FMUL    1 * SIZE(%edx)          # temp2 = *(aoffset + 0)
        faddp   %st, %st(2)
        FLD     3 * SIZE(%ecx)          # temp1 = *(boffset + 0)

        addl    $2 * SIZE, %edx
        addl    $1 * SIZE, %ecx
        ALIGN_4

.L45:
        ffreep  %st(0)
        FLD     ALPHA

        fmul    %st, %st(1)
        fmulp   %st, %st(2)

#ifndef TRMMKERNEL
        FADD    0 * SIZE(%edi)
        FST     0 * SIZE(%edi)
        FADD    1 * SIZE(%edi)
        FST     1 * SIZE(%edi)
#else
        FST     0 * SIZE(%edi)
        FST     1 * SIZE(%edi)
#endif

        addl    $2 * SIZE, %edi

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (%edx, %eax, 2), %edx
        leal    (%ecx, %eax, 1), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $2, KK
#endif

        decl    %esi                    # i --
        jne     .L46
        ALIGN_4

.L36:
        movl    M, %eax                 # m             # MEMORY
        andl    $1, %eax                # m & 1
        je      .End

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    %ebx, %ecx
#else
        movl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (%edx, %eax, 1), %edx
        leal    (%ebx, %eax, 1), %ecx
#endif

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $1, %eax
#else
        addl    $1, %eax
#endif
        movl    %eax, KKK
#endif
        fldz
        ALIGN_3

.L51:
        FLD     (%edx)
        FMUL    (%ecx)
        addl    $1 * SIZE, %edx
        addl    $1 * SIZE, %ecx
        faddp   %st, %st(1)
        decl    %eax
        jne     .L51

        FMUL    ALPHA
#ifndef TRMMKERNEL
        FADD    (%edi)
        FST     (%edi)
#else
        FST     (%edi)
#endif
        ALIGN_4

.End:
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        addl    $ARGS, %esp
        ret

        EPILOGUE