1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. */ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#ifdef movsd 43#undef movsd 44#endif 45 46#ifdef PENTIUM3 47#ifdef HAVE_SSE 48#define PREFETCH prefetcht0 49#define PREFETCHW prefetcht0 50#define PREFETCHSIZE (16 * 2) 51#endif 52#define movsd movlps 53#endif 54 55#ifdef PENTIUM4 56#define PREFETCH prefetcht0 57#define PREFETCHW prefetcht0 58#define PREFETCHSIZE (16 * 4) 59#endif 60 61#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) 62#define PREFETCH prefetcht0 63#define PREFETCHW prefetcht0 64#define PREFETCHSIZE (16 * 7) 65#endif 66 67#ifdef OPTERON 68#define PREFETCH prefetchnta 69#define PREFETCHW prefetchw 70#define PREFETCHSIZE (16 * 3) 71#define movsd movlps 72#endif 73 74#ifdef BARCELONA 75#define PREFETCH prefetchnta 76#define PREFETCHW prefetchw 77#define PREFETCHSIZE (16 * 5) 78#endif 79 80#ifdef ATOM 81#define PREFETCH prefetchnta 82#define PREFETCHW prefetcht0 83#define PREFETCHSIZE (16 * 6) 84#endif 85 86#ifdef NANO 87#define PREFETCH prefetcht0 88#define PREFETCHSIZE (16 * 4) 89#endif 90 91#define STACKSIZE 16 92 93#define M 4 + STACKSIZE(%esp) 94#define N 8 + STACKSIZE(%esp) 95#define ALPHA 16 + STACKSIZE(%esp) 96#define A 20 + STACKSIZE(%esp) 97#define STACK_LDA 24 + STACKSIZE(%esp) 98#define STACK_X 28 + STACKSIZE(%esp) 99#define STACK_INCX 32 + STACKSIZE(%esp) 100#define Y 36 + STACKSIZE(%esp) 101#define STACK_INCY 40 + STACKSIZE(%esp) 102#define BUFFER 44 + STACKSIZE(%esp) 103 104#define I %eax 105#define J %ebx 106 107#define INCX %ecx 108#define INCY J 109 110#define A1 %esi 111#define X %edx 112#define Y1 %edi 113#define LDA %ebp 114 115 PROLOGUE 116 117 pushl %ebp 118 pushl %edi 119 pushl %esi 120 pushl %ebx 121 122 PROFCODE 123 124 movl STACK_LDA, LDA 125 movl STACK_X, X 126 movl STACK_INCX, INCX 127 128 leal (,INCX, SIZE), INCX 129 leal (,LDA, SIZE), LDA 130 131 subl $-32 * SIZE, A 132 133 cmpl $0, N 134 jle .L999 135 cmpl $0, M 136 jle .L999 137 138 movl BUFFER, Y1 139 140 xorps %xmm7, %xmm7 141 142 movl M, %eax 143 addl $16, %eax 144 sarl $4, %eax 145 ALIGN_3 146 147.L01: 148 movaps %xmm7, 0 * SIZE(Y1) 149 movaps %xmm7, 4 * SIZE(Y1) 150 movaps %xmm7, 8 * SIZE(Y1) 151 movaps %xmm7, 12 * SIZE(Y1) 152 subl $-16 * SIZE, Y1 153 decl %eax 154 jg .L01 155 ALIGN_3 156 157.L10: 158 movl N, J 159 sarl $1, J 160 jle .L20 161 ALIGN_3 162 163.L11: 164 movl BUFFER, Y1 165 addl $32 * SIZE, Y1 166 167 movl A, A1 168 leal (A1, LDA, 2), %eax 169 movl %eax, A 170 171 movss (X), %xmm6 172 addl INCX, X 173 movss (X), %xmm7 174 addl INCX, X 175 176 movss ALPHA, %xmm0 177 178 mulss %xmm0, %xmm6 179 mulss %xmm0, %xmm7 180 181 shufps $0, %xmm6, %xmm6 182 shufps $0, %xmm7, %xmm7 183 ALIGN_3 184 185 movl M, I 186 sarl $4, I 187 jle .L15 188 189 movsd -32 * SIZE(A1), %xmm2 190 movhps -30 * SIZE(A1), %xmm2 191 movsd -28 * SIZE(A1), %xmm3 192 movhps -26 * SIZE(A1), %xmm3 193 194 movaps -32 * SIZE(Y1), %xmm0 195 movaps -28 * SIZE(Y1), %xmm1 196 197 movsd -32 * SIZE(A1, LDA), %xmm4 198 movhps -30 * SIZE(A1, LDA), %xmm4 199 movsd -28 * SIZE(A1, LDA), %xmm5 200 movhps -26 * SIZE(A1, LDA), %xmm5 201 202 decl I 203 jle .L14 204 ALIGN_3 205 206.L13: 207#ifdef PREFETCH 208 PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) 209#endif 210 211 mulps %xmm6, %xmm2 212 addps %xmm2, %xmm0 213 movsd -24 * SIZE(A1), %xmm2 214 movhps -22 * SIZE(A1), %xmm2 215 mulps %xmm6, %xmm3 216 addps %xmm3, %xmm1 217 movsd -20 * SIZE(A1), %xmm3 218 movhps -18 * SIZE(A1), %xmm3 219 220 mulps %xmm7, %xmm4 221 addps %xmm4, %xmm0 222 movsd -24 * SIZE(A1, LDA), %xmm4 223 movhps -22 * SIZE(A1, LDA), %xmm4 224 225 movaps %xmm0, -32 * SIZE(Y1) 226 movaps -24 * SIZE(Y1), %xmm0 227 228 mulps %xmm7, %xmm5 229 addps %xmm5, %xmm1 230 movsd -20 * SIZE(A1, LDA), %xmm5 231 movhps -18 * SIZE(A1, LDA), %xmm5 232 233 movaps %xmm1, -28 * SIZE(Y1) 234 movaps -20 * SIZE(Y1), %xmm1 235 236#ifdef PREFETCH 237 PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) 238#endif 239 240 mulps %xmm6, %xmm2 241 addps %xmm2, %xmm0 242 movsd -16 * SIZE(A1), %xmm2 243 movhps -14 * SIZE(A1), %xmm2 244 mulps %xmm6, %xmm3 245 addps %xmm3, %xmm1 246 movsd -12 * SIZE(A1), %xmm3 247 movhps -10 * SIZE(A1), %xmm3 248 249 mulps %xmm7, %xmm4 250 addps %xmm4, %xmm0 251 movsd -16 * SIZE(A1, LDA), %xmm4 252 movhps -14 * SIZE(A1, LDA), %xmm4 253 254 movaps %xmm0, -24 * SIZE(Y1) 255 movaps -16 * SIZE(Y1), %xmm0 256 257 mulps %xmm7, %xmm5 258 addps %xmm5, %xmm1 259 movsd -12 * SIZE(A1, LDA), %xmm5 260 movhps -10 * SIZE(A1, LDA), %xmm5 261 262 movaps %xmm1, -20 * SIZE(Y1) 263 movaps -12 * SIZE(Y1), %xmm1 264 265 subl $-16 * SIZE, A1 266 subl $-16 * SIZE, Y1 267 268 subl $1, I 269 BRANCH 270 jg .L13 271 ALIGN_3 272 273.L14: 274 mulps %xmm6, %xmm2 275 addps %xmm2, %xmm0 276 movsd -24 * SIZE(A1), %xmm2 277 movhps -22 * SIZE(A1), %xmm2 278 mulps %xmm6, %xmm3 279 addps %xmm3, %xmm1 280 movsd -20 * SIZE(A1), %xmm3 281 movhps -18 * SIZE(A1), %xmm3 282 283 mulps %xmm7, %xmm4 284 addps %xmm4, %xmm0 285 movsd -24 * SIZE(A1, LDA), %xmm4 286 movhps -22 * SIZE(A1, LDA), %xmm4 287 288 movaps %xmm0, -32 * SIZE(Y1) 289 movaps -24 * SIZE(Y1), %xmm0 290 291 mulps %xmm7, %xmm5 292 addps %xmm5, %xmm1 293 movsd -20 * SIZE(A1, LDA), %xmm5 294 movhps -18 * SIZE(A1, LDA), %xmm5 295 296 movaps %xmm1, -28 * SIZE(Y1) 297 movaps -20 * SIZE(Y1), %xmm1 298 299 mulps %xmm6, %xmm2 300 addps %xmm2, %xmm0 301 mulps %xmm6, %xmm3 302 addps %xmm3, %xmm1 303 304 mulps %xmm7, %xmm4 305 addps %xmm4, %xmm0 306 movaps %xmm0, -24 * SIZE(Y1) 307 mulps %xmm7, %xmm5 308 addps %xmm5, %xmm1 309 movaps %xmm1, -20 * SIZE(Y1) 310 311 subl $-16 * SIZE, A1 312 subl $-16 * SIZE, Y1 313 ALIGN_3 314 315.L15: 316 testl $8, M 317 je .L16 318 319 movsd -32 * SIZE(A1), %xmm2 320 movhps -30 * SIZE(A1), %xmm2 321 movsd -28 * SIZE(A1), %xmm3 322 movhps -26 * SIZE(A1), %xmm3 323 324 movaps -32 * SIZE(Y1), %xmm0 325 movaps -28 * SIZE(Y1), %xmm1 326 327 mulps %xmm6, %xmm2 328 addps %xmm2, %xmm0 329 mulps %xmm6, %xmm3 330 addps %xmm3, %xmm1 331 332 movsd -32 * SIZE(A1, LDA), %xmm4 333 movhps -30 * SIZE(A1, LDA), %xmm4 334 movsd -28 * SIZE(A1, LDA), %xmm5 335 movhps -26 * SIZE(A1, LDA), %xmm5 336 337 mulps %xmm7, %xmm4 338 addps %xmm4, %xmm0 339 mulps %xmm7, %xmm5 340 addps %xmm5, %xmm1 341 342 movaps %xmm0, -32 * SIZE(Y1) 343 movaps %xmm1, -28 * SIZE(Y1) 344 345 addl $8 * SIZE, A1 346 addl $8 * SIZE, Y1 347 ALIGN_3 348 349.L16: 350 testl $4, M 351 je .L17 352 353 movsd -32 * SIZE(A1), %xmm2 354 movhps -30 * SIZE(A1), %xmm2 355 movsd -32 * SIZE(A1, LDA), %xmm3 356 movhps -30 * SIZE(A1, LDA), %xmm3 357 358 movaps -32 * SIZE(Y1), %xmm0 359 360 mulps %xmm6, %xmm2 361 addps %xmm2, %xmm0 362 mulps %xmm7, %xmm3 363 addps %xmm3, %xmm0 364 365 movaps %xmm0, -32 * SIZE(Y1) 366 367 addl $4 * SIZE, A1 368 addl $4 * SIZE, Y1 369 ALIGN_3 370 371.L17: 372 testl $2, M 373 je .L18 374 375 movsd -32 * SIZE(A1), %xmm2 376 movsd -32 * SIZE(A1, LDA), %xmm3 377 378 movsd -32 * SIZE(Y1), %xmm0 379 380 mulps %xmm6, %xmm2 381 addps %xmm2, %xmm0 382 mulps %xmm7, %xmm3 383 addps %xmm3, %xmm0 384 385 movlps %xmm0, -32 * SIZE(Y1) 386 387 addl $2 * SIZE, A1 388 addl $2 * SIZE, Y1 389 ALIGN_3 390 391.L18: 392 testl $1, M 393 je .L19 394 395 movss -32 * SIZE(A1), %xmm2 396 movss -32 * SIZE(A1, LDA), %xmm3 397 398 movss -32 * SIZE(Y1), %xmm0 399 400 mulss %xmm6, %xmm2 401 addss %xmm2, %xmm0 402 mulss %xmm7, %xmm3 403 addss %xmm3, %xmm0 404 405 movss %xmm0, -32 * SIZE(Y1) 406 ALIGN_3 407 408.L19: 409 decl J 410 jg .L11 411 ALIGN_4 412 413.L20: 414 testl $1, N 415 jle .L990 416 417 movl BUFFER, Y1 418 addl $32 * SIZE, Y1 419 420 movl A, A1 421 422 movss (X), %xmm6 423 addl INCX, X 424 425 movss ALPHA, %xmm0 426 427 mulss %xmm0, %xmm6 428 429 shufps $0, %xmm6, %xmm6 430 ALIGN_3 431 432 movl M, I 433 sarl $4, I 434 jle .L25 435 436 movsd -32 * SIZE(A1), %xmm2 437 movhps -30 * SIZE(A1), %xmm2 438 movsd -28 * SIZE(A1), %xmm3 439 movhps -26 * SIZE(A1), %xmm3 440 441 movaps -32 * SIZE(Y1), %xmm0 442 movaps -28 * SIZE(Y1), %xmm1 443 444 decl I 445 jle .L24 446 ALIGN_3 447 448.L23: 449#ifdef PREFETCH 450 PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) 451#endif 452 453 mulps %xmm6, %xmm2 454 addps %xmm2, %xmm0 455 movsd -24 * SIZE(A1), %xmm2 456 movhps -22 * SIZE(A1), %xmm2 457 458 movaps %xmm0, -32 * SIZE(Y1) 459 movaps -24 * SIZE(Y1), %xmm0 460 461 mulps %xmm6, %xmm3 462 addps %xmm3, %xmm1 463 movsd -20 * SIZE(A1), %xmm3 464 movhps -18 * SIZE(A1), %xmm3 465 466 movaps %xmm1, -28 * SIZE(Y1) 467 movaps -20 * SIZE(Y1), %xmm1 468 469 mulps %xmm6, %xmm2 470 addps %xmm2, %xmm0 471 movsd -16 * SIZE(A1), %xmm2 472 movhps -14 * SIZE(A1), %xmm2 473 474 movaps %xmm0, -24 * SIZE(Y1) 475 movaps -16 * SIZE(Y1), %xmm0 476 477 mulps %xmm6, %xmm3 478 addps %xmm3, %xmm1 479 movsd -12 * SIZE(A1), %xmm3 480 movhps -10 * SIZE(A1), %xmm3 481 482 movaps %xmm1, -20 * SIZE(Y1) 483 movaps -12 * SIZE(Y1), %xmm1 484 485 subl $-16 * SIZE, A1 486 subl $-16 * SIZE, Y1 487 488 subl $1, I 489 BRANCH 490 jg .L23 491 ALIGN_3 492 493.L24: 494 mulps %xmm6, %xmm2 495 addps %xmm2, %xmm0 496 movsd -24 * SIZE(A1), %xmm2 497 movhps -22 * SIZE(A1), %xmm2 498 mulps %xmm6, %xmm3 499 addps %xmm3, %xmm1 500 movsd -20 * SIZE(A1), %xmm3 501 movhps -18 * SIZE(A1), %xmm3 502 503 movaps %xmm0, -32 * SIZE(Y1) 504 movaps -24 * SIZE(Y1), %xmm0 505 506 movaps %xmm1, -28 * SIZE(Y1) 507 movaps -20 * SIZE(Y1), %xmm1 508 509 mulps %xmm6, %xmm2 510 addps %xmm2, %xmm0 511 movaps %xmm0, -24 * SIZE(Y1) 512 mulps %xmm6, %xmm3 513 addps %xmm3, %xmm1 514 movaps %xmm1, -20 * SIZE(Y1) 515 516 subl $-16 * SIZE, A1 517 subl $-16 * SIZE, Y1 518 ALIGN_3 519 520.L25: 521 testl $8, M 522 je .L26 523 524 movsd -32 * SIZE(A1), %xmm2 525 movhps -30 * SIZE(A1), %xmm2 526 movsd -28 * SIZE(A1), %xmm3 527 movhps -26 * SIZE(A1), %xmm3 528 529 movaps -32 * SIZE(Y1), %xmm0 530 movaps -28 * SIZE(Y1), %xmm1 531 532 mulps %xmm6, %xmm2 533 addps %xmm2, %xmm0 534 mulps %xmm6, %xmm3 535 addps %xmm3, %xmm1 536 537 movaps %xmm0, -32 * SIZE(Y1) 538 movaps %xmm1, -28 * SIZE(Y1) 539 540 addl $8 * SIZE, A1 541 addl $8 * SIZE, Y1 542 ALIGN_3 543 544.L26: 545 testl $4, M 546 je .L27 547 548 movsd -32 * SIZE(A1), %xmm2 549 movhps -30 * SIZE(A1), %xmm2 550 551 movaps -32 * SIZE(Y1), %xmm0 552 553 mulps %xmm6, %xmm2 554 addps %xmm2, %xmm0 555 556 movaps %xmm0, -32 * SIZE(Y1) 557 558 addl $4 * SIZE, A1 559 addl $4 * SIZE, Y1 560 ALIGN_3 561 562.L27: 563 testl $2, M 564 je .L28 565 566 movsd -32 * SIZE(A1), %xmm2 567 movsd -32 * SIZE(Y1), %xmm0 568 569 mulps %xmm6, %xmm2 570 addps %xmm2, %xmm0 571 572 movlps %xmm0, -32 * SIZE(Y1) 573 574 addl $2 * SIZE, A1 575 addl $2 * SIZE, Y1 576 ALIGN_3 577 578.L28: 579 testl $1, M 580 je .L990 581 582 movss -32 * SIZE(A1), %xmm2 583 movss -32 * SIZE(Y1), %xmm0 584 585 mulss %xmm6, %xmm2 586 addss %xmm2, %xmm0 587 588 movss %xmm0, -32 * SIZE(Y1) 589 ALIGN_3 590 591.L990: 592 movl Y, Y1 593 movl BUFFER, X 594 595 movl STACK_INCY, INCY 596 sall $BASE_SHIFT, INCY 597 598 movl M, %eax 599 sarl $2, %eax 600 jle .L994 601 ALIGN_3 602 603.L992: 604 movss (Y1), %xmm0 605 addss 0 * SIZE(X), %xmm0 606 movss %xmm0, (Y1) 607 addl INCY, Y1 608 609 movss (Y1), %xmm0 610 addss 1 * SIZE(X), %xmm0 611 movss %xmm0, (Y1) 612 addl INCY, Y1 613 614 movss (Y1), %xmm0 615 addss 2 * SIZE(X), %xmm0 616 movss %xmm0, (Y1) 617 addl INCY, Y1 618 619 movss (Y1), %xmm0 620 addss 3 * SIZE(X), %xmm0 621 movss %xmm0, (Y1) 622 addl INCY, Y1 623 624 addl $4 * SIZE, X 625 decl %eax 626 jg .L992 627 ALIGN_3 628 629.L994: 630 testl $2, M 631 jle .L996 632 633 movss (Y1), %xmm0 634 addss 0 * SIZE(X), %xmm0 635 movss %xmm0, (Y1) 636 addl INCY, Y1 637 638 movss (Y1), %xmm0 639 addss 1 * SIZE(X), %xmm0 640 movss %xmm0, (Y1) 641 addl INCY, Y1 642 643 addl $2 * SIZE, X 644 ALIGN_3 645 646.L996: 647 testl $1, M 648 jle .L999 649 650 movss (Y1), %xmm0 651 addss 0 * SIZE(X), %xmm0 652 movss %xmm0, (Y1) 653 ALIGN_3 654 655.L999: 656 popl %ebx 657 popl %esi 658 popl %edi 659 popl %ebp 660 ret 661 662 EPILOGUE 663