/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#ifdef PENTIUM4 26#define PREFETCH prefetcht0 27#define PREFETCHW prefetcht0 28#define PREFETCHSIZE (8 * 2) 29#endif 30 31#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) 32#define PREFETCH prefetcht0 33#define PREFETCHW prefetcht0 34#define PREFETCHSIZE (8 * 7) 35#endif 36 37#ifdef OPTERON 38#define PREFETCH prefetchnta 39#define PREFETCHW prefetchw 40#define PREFETCHSIZE (8 * 3) 41#define movsd movlps 42#endif 43 44#ifdef BARCELONA 45#define PREFETCH prefetchnta 46#define PREFETCHW prefetchw 47#define PREFETCHSIZE (8 * 5) 48#endif 49 50#ifdef ATOM 51#define PREFETCH prefetch 52#define PREFETCHW prefetcht0 53#define PREFETCHSIZE (8 * 6) 54#endif 55 56#ifdef NANO 57#define PREFETCH prefetcht0 58#define PREFETCHSIZE (8 * 4) 59#endif 60 61#define STACKSIZE 16 62 63#define M 4 + STACKSIZE(%esp) 64#define N 8 + STACKSIZE(%esp) 65#define ALPHA 16 + STACKSIZE(%esp) 66#define A 24 + STACKSIZE(%esp) 67#define STACK_LDA 28 + STACKSIZE(%esp) 68#define STACK_X 32 + STACKSIZE(%esp) 69#define STACK_INCX 36 + STACKSIZE(%esp) 70#define Y 40 + STACKSIZE(%esp) 71#define STACK_INCY 44 + STACKSIZE(%esp) 72#define BUFFER 48 + STACKSIZE(%esp) 73 74#define I %eax 75#define J %ebx 76 77#define INCX %ecx 78#define INCY J 79 80#define A1 %esi 81#define X %edx 82#define Y1 %edi 83#define LDA %ebp 84 85 PROLOGUE 86 87 pushl %ebp 88 pushl %edi 89 pushl %esi 90 pushl %ebx 91 92 PROFCODE 93 94 movl STACK_LDA, LDA 95 movl STACK_X, X 96 movl STACK_INCX, INCX 97 98 leal (,INCX, SIZE), INCX 99 leal (,LDA, SIZE), LDA 100 101 subl $-16 * SIZE, A 102 103 cmpl $0, N 104 jle .L999 105 cmpl $0, M 106 jle .L999 107 108 movl BUFFER, Y1 109 110 pxor %xmm7, %xmm7 111 112 movl M, %eax 113 addl $16, %eax 114 sarl $4, %eax 115 ALIGN_3 116 117.L01: 118 movapd %xmm7, 0 * SIZE(Y1) 119 movapd %xmm7, 2 * SIZE(Y1) 120 movapd %xmm7, 4 * SIZE(Y1) 121 movapd 
%xmm7, 6 * SIZE(Y1) 122 movapd %xmm7, 8 * SIZE(Y1) 123 movapd %xmm7, 10 * SIZE(Y1) 124 movapd %xmm7, 12 * SIZE(Y1) 125 movapd %xmm7, 14 * SIZE(Y1) 126 subl $-16 * SIZE, Y1 127 decl %eax 128 jg .L01 129 ALIGN_3 130 131.L10: 132 movl N, J 133 sarl $1, J 134 jle .L20 135 ALIGN_3 136 137.L11: 138 139 movl BUFFER, Y1 140 addl $16 * SIZE, Y1 141 142 movl A, A1 143 leal (A1, LDA, 2), %eax 144 movl %eax, A 145 146#ifdef HAVE_SSE3 147 movddup (X), %xmm6 148 addl INCX, X 149 movddup (X), %xmm7 150 addl INCX, X 151 152 movddup ALPHA, %xmm0 153 154 mulpd %xmm0, %xmm6 155 mulpd %xmm0, %xmm7 156#else 157 movsd (X), %xmm6 158 addl INCX, X 159 movsd (X), %xmm7 160 addl INCX, X 161 162 movsd ALPHA, %xmm0 163 164 mulsd %xmm0, %xmm6 165 mulsd %xmm0, %xmm7 166 167 unpcklpd %xmm6, %xmm6 168 unpcklpd %xmm7, %xmm7 169#endif 170 171 ALIGN_3 172 173 movl M, I 174 sarl $3, I 175 jle .L15 176 177 movsd -16 * SIZE(A1), %xmm2 178 movhpd -15 * SIZE(A1), %xmm2 179 movsd -14 * SIZE(A1), %xmm3 180 movhpd -13 * SIZE(A1), %xmm3 181 182 movapd -16 * SIZE(Y1), %xmm0 183 movapd -14 * SIZE(Y1), %xmm1 184 185 movsd -16 * SIZE(A1, LDA), %xmm4 186 movhpd -15 * SIZE(A1, LDA), %xmm4 187 movsd -14 * SIZE(A1, LDA), %xmm5 188 movhpd -13 * SIZE(A1, LDA), %xmm5 189 190 decl I 191 jle .L14 192 ALIGN_3 193 194.L13: 195#ifdef PREFETCH 196 PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) 197#endif 198 199 mulpd %xmm6, %xmm2 200 addpd %xmm2, %xmm0 201 movsd -12 * SIZE(A1), %xmm2 202 movhpd -11 * SIZE(A1), %xmm2 203 mulpd %xmm6, %xmm3 204 addpd %xmm3, %xmm1 205 movsd -10 * SIZE(A1), %xmm3 206 movhpd -9 * SIZE(A1), %xmm3 207 208 mulpd %xmm7, %xmm4 209 addpd %xmm4, %xmm0 210 movsd -12 * SIZE(A1, LDA), %xmm4 211 movhpd -11 * SIZE(A1, LDA), %xmm4 212 213 movapd %xmm0, -16 * SIZE(Y1) 214 movapd -12 * SIZE(Y1), %xmm0 215 216 mulpd %xmm7, %xmm5 217 addpd %xmm5, %xmm1 218 movsd -10 * SIZE(A1, LDA), %xmm5 219 movhpd -9 * SIZE(A1, LDA), %xmm5 220 221 movapd %xmm1, -14 * SIZE(Y1) 222 movapd -10 * SIZE(Y1), %xmm1 223 224#ifdef PREFETCH 225 
PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) 226#endif 227 228 mulpd %xmm6, %xmm2 229 addpd %xmm2, %xmm0 230 movsd -8 * SIZE(A1), %xmm2 231 movhpd -7 * SIZE(A1), %xmm2 232 mulpd %xmm6, %xmm3 233 addpd %xmm3, %xmm1 234 movsd -6 * SIZE(A1), %xmm3 235 movhpd -5 * SIZE(A1), %xmm3 236 237 mulpd %xmm7, %xmm4 238 addpd %xmm4, %xmm0 239 movsd -8 * SIZE(A1, LDA), %xmm4 240 movhpd -7 * SIZE(A1, LDA), %xmm4 241 242 movapd %xmm0, -12 * SIZE(Y1) 243 movapd -8 * SIZE(Y1), %xmm0 244 245 mulpd %xmm7, %xmm5 246 addpd %xmm5, %xmm1 247 movsd -6 * SIZE(A1, LDA), %xmm5 248 movhpd -5 * SIZE(A1, LDA), %xmm5 249 250 movapd %xmm1, -10 * SIZE(Y1) 251 movapd -6 * SIZE(Y1), %xmm1 252 253 subl $-8 * SIZE, A1 254 subl $-8 * SIZE, Y1 255 256 subl $1, I 257 BRANCH 258 jg .L13 259 ALIGN_3 260 261.L14: 262 mulpd %xmm6, %xmm2 263 addpd %xmm2, %xmm0 264 movsd -12 * SIZE(A1), %xmm2 265 movhpd -11 * SIZE(A1), %xmm2 266 mulpd %xmm6, %xmm3 267 addpd %xmm3, %xmm1 268 movsd -10 * SIZE(A1), %xmm3 269 movhpd -9 * SIZE(A1), %xmm3 270 271 mulpd %xmm7, %xmm4 272 addpd %xmm4, %xmm0 273 movsd -12 * SIZE(A1, LDA), %xmm4 274 movhpd -11 * SIZE(A1, LDA), %xmm4 275 276 movapd %xmm0, -16 * SIZE(Y1) 277 movapd -12 * SIZE(Y1), %xmm0 278 279 mulpd %xmm7, %xmm5 280 addpd %xmm5, %xmm1 281 movsd -10 * SIZE(A1, LDA), %xmm5 282 movhpd -9 * SIZE(A1, LDA), %xmm5 283 284 movapd %xmm1, -14 * SIZE(Y1) 285 movapd -10 * SIZE(Y1), %xmm1 286 287 mulpd %xmm6, %xmm2 288 addpd %xmm2, %xmm0 289 mulpd %xmm6, %xmm3 290 addpd %xmm3, %xmm1 291 292 mulpd %xmm7, %xmm4 293 addpd %xmm4, %xmm0 294 movapd %xmm0, -12 * SIZE(Y1) 295 mulpd %xmm7, %xmm5 296 addpd %xmm5, %xmm1 297 movapd %xmm1, -10 * SIZE(Y1) 298 299 subl $-8 * SIZE, A1 300 subl $-8 * SIZE, Y1 301 ALIGN_3 302 303.L15: 304 testl $4, M 305 je .L16 306 307 movsd -16 * SIZE(A1), %xmm2 308 movhpd -15 * SIZE(A1), %xmm2 309 movsd -14 * SIZE(A1), %xmm3 310 movhpd -13 * SIZE(A1), %xmm3 311 312 movapd -16 * SIZE(Y1), %xmm0 313 movapd -14 * SIZE(Y1), %xmm1 314 315 mulpd %xmm6, %xmm2 316 addpd %xmm2, 
%xmm0 317 mulpd %xmm6, %xmm3 318 addpd %xmm3, %xmm1 319 320 movsd -16 * SIZE(A1, LDA), %xmm4 321 movhpd -15 * SIZE(A1, LDA), %xmm4 322 movsd -14 * SIZE(A1, LDA), %xmm5 323 movhpd -13 * SIZE(A1, LDA), %xmm5 324 325 mulpd %xmm7, %xmm4 326 addpd %xmm4, %xmm0 327 mulpd %xmm7, %xmm5 328 addpd %xmm5, %xmm1 329 330 movapd %xmm0, -16 * SIZE(Y1) 331 movapd %xmm1, -14 * SIZE(Y1) 332 333 addl $4 * SIZE, A1 334 addl $4 * SIZE, Y1 335 ALIGN_3 336 337.L16: 338 testl $2, M 339 je .L17 340 341 movsd -16 * SIZE(A1), %xmm2 342 movhpd -15 * SIZE(A1), %xmm2 343 movsd -16 * SIZE(A1, LDA), %xmm3 344 movhpd -15 * SIZE(A1, LDA), %xmm3 345 346 movapd -16 * SIZE(Y1), %xmm0 347 348 mulpd %xmm6, %xmm2 349 addpd %xmm2, %xmm0 350 mulpd %xmm7, %xmm3 351 addpd %xmm3, %xmm0 352 353 movapd %xmm0, -16 * SIZE(Y1) 354 355 addl $2 * SIZE, A1 356 addl $2 * SIZE, Y1 357 ALIGN_3 358 359.L17: 360 testl $1, M 361 je .L19 362 363 movsd -16 * SIZE(A1), %xmm2 364 movsd -16 * SIZE(A1, LDA), %xmm3 365 366 movsd -16 * SIZE(Y1), %xmm0 367 368 mulsd %xmm6, %xmm2 369 addsd %xmm2, %xmm0 370 mulsd %xmm7, %xmm3 371 addsd %xmm3, %xmm0 372 373 movsd %xmm0, -16 * SIZE(Y1) 374 ALIGN_3 375 376.L19: 377 decl J 378 jg .L11 379 ALIGN_4 380 381.L20: 382 testl $1, N 383 jle .L990 384 385 movl BUFFER, Y1 386 addl $16 * SIZE, Y1 387 388 movl A, A1 389 390#ifdef HAVE_SSE3 391 movddup (X), %xmm6 392 addl INCX, X 393 394 movddup ALPHA, %xmm0 395 396 mulpd %xmm0, %xmm6 397#else 398 movsd (X), %xmm6 399 addl INCX, X 400 401 movsd ALPHA, %xmm0 402 403 mulsd %xmm0, %xmm6 404 unpcklpd %xmm6, %xmm6 405#endif 406 407 ALIGN_3 408 409 movl M, I 410 sarl $3, I 411 jle .L25 412 413 movsd -16 * SIZE(A1), %xmm2 414 movhpd -15 * SIZE(A1), %xmm2 415 movsd -14 * SIZE(A1), %xmm3 416 movhpd -13 * SIZE(A1), %xmm3 417 418 movapd -16 * SIZE(Y1), %xmm0 419 movapd -14 * SIZE(Y1), %xmm1 420 decl I 421 jle .L24 422 ALIGN_3 423 424.L23: 425#ifdef PREFETCH 426 PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) 427#endif 428 429 mulpd %xmm6, %xmm2 430 addpd %xmm2, %xmm0 
431 movsd -12 * SIZE(A1), %xmm2 432 movhpd -11 * SIZE(A1), %xmm2 433 434 movapd %xmm0, -16 * SIZE(Y1) 435 movapd -12 * SIZE(Y1), %xmm0 436 437 mulpd %xmm6, %xmm3 438 addpd %xmm3, %xmm1 439 movsd -10 * SIZE(A1), %xmm3 440 movhpd -9 * SIZE(A1), %xmm3 441 442 movapd %xmm1, -14 * SIZE(Y1) 443 movapd -10 * SIZE(Y1), %xmm1 444 445 mulpd %xmm6, %xmm2 446 addpd %xmm2, %xmm0 447 movsd -8 * SIZE(A1), %xmm2 448 movhpd -7 * SIZE(A1), %xmm2 449 450 movapd %xmm0, -12 * SIZE(Y1) 451 movapd -8 * SIZE(Y1), %xmm0 452 453 mulpd %xmm6, %xmm3 454 addpd %xmm3, %xmm1 455 movsd -6 * SIZE(A1), %xmm3 456 movhpd -5 * SIZE(A1), %xmm3 457 458 movapd %xmm1, -10 * SIZE(Y1) 459 movapd -6 * SIZE(Y1), %xmm1 460 461 subl $-8 * SIZE, A1 462 subl $-8 * SIZE, Y1 463 464 subl $1, I 465 BRANCH 466 jg .L23 467 ALIGN_3 468 469.L24: 470 mulpd %xmm6, %xmm2 471 addpd %xmm2, %xmm0 472 movsd -12 * SIZE(A1), %xmm2 473 movhpd -11 * SIZE(A1), %xmm2 474 mulpd %xmm6, %xmm3 475 addpd %xmm3, %xmm1 476 movsd -10 * SIZE(A1), %xmm3 477 movhpd -9 * SIZE(A1), %xmm3 478 479 movapd %xmm0, -16 * SIZE(Y1) 480 movapd -12 * SIZE(Y1), %xmm0 481 482 movapd %xmm1, -14 * SIZE(Y1) 483 movapd -10 * SIZE(Y1), %xmm1 484 485 mulpd %xmm6, %xmm2 486 addpd %xmm2, %xmm0 487 movapd %xmm0, -12 * SIZE(Y1) 488 mulpd %xmm6, %xmm3 489 addpd %xmm3, %xmm1 490 movapd %xmm1, -10 * SIZE(Y1) 491 492 subl $-8 * SIZE, A1 493 subl $-8 * SIZE, Y1 494 ALIGN_3 495 496.L25: 497 testl $4, M 498 je .L26 499 500 movsd -16 * SIZE(A1), %xmm2 501 movhpd -15 * SIZE(A1), %xmm2 502 movsd -14 * SIZE(A1), %xmm3 503 movhpd -13 * SIZE(A1), %xmm3 504 505 movapd -16 * SIZE(Y1), %xmm0 506 movapd -14 * SIZE(Y1), %xmm1 507 508 mulpd %xmm6, %xmm2 509 addpd %xmm2, %xmm0 510 mulpd %xmm6, %xmm3 511 addpd %xmm3, %xmm1 512 513 movapd %xmm0, -16 * SIZE(Y1) 514 movapd %xmm1, -14 * SIZE(Y1) 515 516 addl $4 * SIZE, A1 517 addl $4 * SIZE, Y1 518 ALIGN_3 519 520.L26: 521 testl $2, M 522 je .L27 523 524 movsd -16 * SIZE(A1), %xmm2 525 movhpd -15 * SIZE(A1), %xmm2 526 527 movapd -16 * 
SIZE(Y1), %xmm0 528 529 mulpd %xmm6, %xmm2 530 addpd %xmm2, %xmm0 531 532 movapd %xmm0, -16 * SIZE(Y1) 533 534 addl $2 * SIZE, A1 535 addl $2 * SIZE, Y1 536 ALIGN_3 537 538.L27: 539 testl $1, M 540 je .L990 541 542 movsd -16 * SIZE(A1), %xmm2 543 movsd -16 * SIZE(Y1), %xmm0 544 545 mulsd %xmm6, %xmm2 546 addsd %xmm2, %xmm0 547 548 movsd %xmm0, -16 * SIZE(Y1) 549 ALIGN_3 550 551.L990: 552 movl Y, Y1 553 movl BUFFER, X 554 555 movl STACK_INCY, INCY 556 sall $BASE_SHIFT, INCY 557 558 movl M, %eax 559 sarl $3, %eax 560 jle .L994 561 ALIGN_3 562 563.L992: 564 movsd (Y1), %xmm0 565 movhpd (Y1, INCY), %xmm0 566 567 addpd 0 * SIZE(X), %xmm0 568 569 movlpd %xmm0, (Y1) 570 movhpd %xmm0, (Y1, INCY) 571 leal (Y1, INCY, 2), Y1 572 573 movsd (Y1), %xmm0 574 movhpd (Y1, INCY), %xmm0 575 576 addpd 2 * SIZE(X), %xmm0 577 578 movlpd %xmm0, (Y1) 579 movhpd %xmm0, (Y1, INCY) 580 leal (Y1, INCY, 2), Y1 581 582 movsd (Y1), %xmm0 583 movhpd (Y1, INCY), %xmm0 584 585 addpd 4 * SIZE(X), %xmm0 586 587 movlpd %xmm0, (Y1) 588 movhpd %xmm0, (Y1, INCY) 589 leal (Y1, INCY, 2), Y1 590 591 movsd (Y1), %xmm0 592 movhpd (Y1, INCY), %xmm0 593 594 addpd 6 * SIZE(X), %xmm0 595 596 movlpd %xmm0, (Y1) 597 movhpd %xmm0, (Y1, INCY) 598 leal (Y1, INCY, 2), Y1 599 600 addl $8 * SIZE, X 601 decl %eax 602 jg .L992 603 ALIGN_3 604 605.L994: 606 testl $7, M 607 jle .L999 608 609 testl $4, M 610 jle .L995 611 612 movsd (Y1), %xmm0 613 movhpd (Y1, INCY), %xmm0 614 615 addpd 0 * SIZE(X), %xmm0 616 617 movlpd %xmm0, (Y1) 618 movhpd %xmm0, (Y1, INCY) 619 leal (Y1, INCY, 2), Y1 620 621 movsd (Y1), %xmm0 622 movhpd (Y1, INCY), %xmm0 623 624 addpd 2 * SIZE(X), %xmm0 625 626 movlpd %xmm0, (Y1) 627 movhpd %xmm0, (Y1, INCY) 628 leal (Y1, INCY, 2), Y1 629 630 addl $4 * SIZE, X 631 ALIGN_3 632 633.L995: 634 testl $2, M 635 jle .L996 636 637 movsd (Y1), %xmm0 638 movhpd (Y1, INCY), %xmm0 639 640 addpd 0 * SIZE(X), %xmm0 641 642 movlpd %xmm0, (Y1) 643 movhpd %xmm0, (Y1, INCY) 644 leal (Y1, INCY, 2), Y1 645 646 addl $2 * SIZE, 
X 647 ALIGN_3 648 649.L996: 650 testl $1, M 651 jle .L999 652 653 movsd (Y1), %xmm0 654 655 movsd 0 * SIZE(X), %xmm4 656 657 addsd %xmm4, %xmm0 658 659 movlpd %xmm0, (Y1) 660 ALIGN_3 661 662.L999: 663 popl %ebx 664 popl %esi 665 popl %edi 666 popl %ebp 667 ret 668 669 EPILOGUE 670