/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Single-precision complex SYMV-style kernel (SSE/SSE3), AT&T syntax.
 * Computes y += alpha * A * x for a symmetric matrix held two columns
 * at a time (A1/A2), staging alpha*x and (if strided) y into an aligned
 * BUFFER.  SIZE / ZBASE_SHIFT / ARG* / PROLOGUE come from common.h.
 *
 * NOTE(review): this copy of the file appears incomplete — the main
 * unrolled loop body (.L12) is a bare HALT placeholder, and the odd-M
 * tail after .L20 is empty.  See the notes at those labels; confirm
 * against the reference kernel before shipping.
 */

/* Per-microarchitecture prefetch instruction / distance tuning. */
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
/* On Opteron, emit movlpd wherever the code writes movsd. */
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 14)
#endif

#ifndef WINDOWS_ABI
/* System V AMD64: first six args in registers, rest on the stack. */

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG3
#define LDA	  ARG4
#define X	  ARG5
#define INCX	  ARG6

#else
/* Microsoft x64: four register args, remainder spilled above shadow space. */

#define STACKSIZE	256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_LDA		48 + STACKSIZE(%rsp)
#define OLD_X		56 + STACKSIZE(%rsp)
#define OLD_INCX	64 + STACKSIZE(%rsp)
#define OLD_Y		72 + STACKSIZE(%rsp)
#define OLD_INCY	80 + STACKSIZE(%rsp)
#define OLD_BUFFER	88 + STACKSIZE(%rsp)

#define M	  ARG1
#define N	  ARG2
#define A	  ARG4
#define LDA	  ARG3
#define X	  %rdi
#define INCX	  %rsi

#endif

/* Register roles. */
#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax			/* TEMP and I share %rax (never live together) */
#define A1	%rbx			/* current column */
#define A2	%rbp			/* next column   */
#define XX	%r13
#define YY	%r14
#define IS	%r15
#define NEW_X	BUFFER			/* preprocessed alpha*x lives in BUFFER */
#define NEW_Y	X			/* X is dead after buffering; reused for y */

#define ALPHA_R	%xmm0
#define ALPHA_I	%xmm1

/* Partial dot-product accumulators (alias ALPHA_R/ALPHA_I once those die). */
#define xsum1	%xmm0
#define xsum2	%xmm1
#define xsum3	%xmm2
#define xsum4	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define a1	%xmm10
#define a2	%xmm11

#define a3	%xmm12
#define yy1	%xmm13
#define xt1	%xmm14
#define xt2	%xmm15

/* Broadcast a double-wide lane: true movddup where SSE3 is usable,
 * otherwise a movlpd/movhpd pair. */
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c; movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c; movhpd	a##b, c
#endif

	PROLOGUE
	PROFCODE

	/* Save callee-saved GPRs (both ABIs). */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* rdi/rsi and xmm6-15 are callee-saved on Windows. */
	movq	%rdi, 48(%rsp)
	movq	%rsi, 56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,    A
	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	/* Windows passes alpha in xmm2/xmm3; normalize to xmm0/xmm1. */
	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert element strides to byte strides (complex elements). */
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	testq	M, M
	jle	.L999			/* nothing to do for M <= 0 */

	/* NOTE(review): IS (%r15) is negated here without a visible
	 * initialization in this chunk — presumably it is expected to
	 * carry a column offset (e.g. derived from N) on entry; confirm
	 * against the complete source. */
	negq	IS
	addq	M, IS

	movq	IS, TEMP
	imulq	LDA, TEMP
	addq	TEMP, A			/* A += IS * LDA */

	/* Build xmm2 = sign-bit mask in the odd (imaginary) lanes:
	 * {0, 0x80000000, 0, 0x80000000}. */
	pcmpeqb	%xmm3, %xmm3
	xorpd	%xmm2, %xmm2
	pslld	$31, %xmm3
	unpckhps %xmm3, %xmm2

	/* Broadcast alpha and interleave real/imag parts so the x-scaling
	 * loop can form complex products with two mulps + one addps. */
	shufps	$0, ALPHA_R, ALPHA_R
	shufps	$0, ALPHA_I, ALPHA_I
	movaps	ALPHA_I, %xmm3

	unpcklps ALPHA_R, ALPHA_I
	unpcklps %xmm3, ALPHA_R
	pxor	%xmm2, ALPHA_R		/* flip sign of odd lanes */

	movq	BUFFER, XX

	/* --- Stage alpha*x into BUFFER, 4 complex elements per pass.
	 * Two variants of each product are stored (one lane-swapped and
	 * sign-flipped) for use by the complex multiply-accumulate core. */
	movq	M, %rax
	sarq	$2, %rax
	jle	.L02
	ALIGN_3

.L01:
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm6
	addq	INCX, X

	movsldup %xmm4, %xmm3		/* duplicate real parts */
	movshdup %xmm4, %xmm4		/* duplicate imaginary parts */
	movsldup %xmm6, %xmm5
	movshdup %xmm6, %xmm6

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4
	mulps	ALPHA_I, %xmm5
	mulps	ALPHA_R, %xmm6

	addps	%xmm4, %xmm3
	addps	%xmm6, %xmm5

	movaps	%xmm3,  4 * SIZE(XX)
	movaps	%xmm5, 12 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3	/* swap re/im lanes */
	shufps	$0xb1, %xmm5, %xmm5

	pxor	%xmm2, %xmm3		/* flip sign of odd lanes */
	pxor	%xmm2, %xmm5

	movaps	%xmm3, 0 * SIZE(XX)
	movaps	%xmm5, 8 * SIZE(XX)

	subq	$-16 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Tail: 2 remaining complex elements. */
	testq	$2, M
	jle	.L03

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movaps	%xmm3, 4 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movaps	%xmm3, 0 * SIZE(XX)

	subq	$-8 * SIZE, XX
	ALIGN_3

.L03:
	/* Tail: 1 remaining complex element. */
	testq	$1, M
	jle	.L05

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4

	addps	%xmm4, %xmm3

	movlps	%xmm3, 2 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movlps	%xmm3, 0 * SIZE(XX)

	subq	$-4 * SIZE, XX
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	/* Align the tail of BUFFER to 512 bytes for a staged copy of y. */
	addq	$512, XX
	andq	$-512, XX

	cmpq	$2 * SIZE, INCY
	je	.L10			/* contiguous y: use it in place */

	/* Strided y: gather it into the aligned buffer. */
	movq	Y, YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm1
	addq	INCY, YY

	movaps	%xmm0, 0 * SIZE(XX)
	movaps	%xmm1, 8 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movlps	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	/* Outer loop: two columns (A1, A2) per iteration while IS+2 <= M. */
	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jg	.L20
	ALIGN_3

.L11:
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A		/* advance A by two columns */

	leaq	(, IS, 4), I		/* I = IS * 4 (buffer entries per element) */

	/* Load the two x values belonging to this column pair and splat
	 * them into the four atemp registers. */
	movsd	0 * SIZE(NEW_X, I, SIZE), atemp2
	movhps	4 * SIZE(NEW_X, I, SIZE), atemp2
	movsd	2 * SIZE(NEW_X, I, SIZE), atemp4
	movhps	6 * SIZE(NEW_X, I, SIZE), atemp4

	pshufd	$0xcc, atemp2, atemp1
	pshufd	$0x99, atemp2, atemp2
	pshufd	$0xcc, atemp4, atemp3
	pshufd	$0x99, atemp4, atemp4

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2
	pxor	xsum3, xsum3
	pxor	xsum4, xsum4

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$2, I
	jle	.L15
	ALIGN_3

.L12:
	/* NOTE(review): the 4-element unrolled body of this loop is
	 * missing — HALT is an abort placeholder from common.h.  As
	 * written, any M large enough to enter this loop aborts.
	 * Restore the body from the reference kernel. */
	HALT

	subq	$-16 * SIZE, XX
	addq	$ 8 * SIZE, YY
	addq	$ 8 * SIZE, A1
	addq	$ 8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

.L15:
	/* Tail: two remaining rows of the column pair. */
	testq	$2, IS
	jle	.L18

	movsd	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movaps	0 * SIZE(XX), xtemp1
	movaps	4 * SIZE(XX), xtemp2

	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum1		/* accumulate A1-column dot product */
	addps	xt2, xsum2

	pshufd	$0xb1, a1, xt2
	mulps	atemp1, a1
	mulps	atemp2, xt2
	addps	a1, yy1			/* y += a * x (column update) */
	addps	xt2, yy1

	movsd	0 * SIZE(A2), a1
	movhps	2 * SIZE(A2), a1

	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum3		/* accumulate A2-column dot product */
	addps	xt2, xsum4

	pshufd	$0xb1, a1, xt2
	mulps	atemp1, a1
	mulps	atemp2, xt2
	addps	a1, yy1
	addps	xt2, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L18:
	/* 2x2 diagonal block: fold the accumulated sums into y[IS..IS+1]. */
	leaq	(, IS, 4), I

	movaps	0 * SIZE(NEW_X, I, SIZE), atemp1
	movaps	4 * SIZE(NEW_X, I, SIZE), atemp2

	movlps	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	/* NOTE(review): low half from A1, high half from A2 here, but the
	 * second load below takes both halves from A2 — asymmetric;
	 * verify against the reference kernel. */
	movsd	0 * SIZE(A1), a1
	movhps	0 * SIZE(A2), a1

	movaps	a1, a2
	mulps	atemp1, a1
	mulps	atemp2, a2
	addps	a1, xsum1
	addps	a2, xsum2

	movsd	0 * SIZE(A2), a1
	movhps	2 * SIZE(A2), a1

	movaps	a1, a2
	mulps	atemp1, a1
	mulps	atemp2, a2
	addps	a1, xsum3
	addps	a2, xsum4

	/* Horizontal reduction of the four accumulators into one vector. */
	haddps	xsum2, xsum1
	haddps	xsum4, xsum3

	haddps	xsum3, xsum1
	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jle	.L11
	ALIGN_3

.L20:
	testq	$1, M
	jle	.L990
	/* NOTE(review): the odd-M final column falls straight through to
	 * .L990 — the single remaining row/column is never processed in
	 * this copy of the file.  Restore from the reference kernel. */

.L990:
	/* If y was buffered (strided INCY), scatter results back to Y. */
	cmpq	$2 * SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	movaps	0 * SIZE(NEW_Y), %xmm0
	movaps	4 * SIZE(NEW_Y), %xmm1

	movlps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movlps	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movlps	0 * SIZE(NEW_Y), %xmm0
	addq	$2 * SIZE, NEW_Y

	movlps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* FIX: restore rdi/rsi and xmm6-15, which are callee-saved in the
	 * Microsoft x64 ABI and were saved in the prologue but never
	 * restored — returning without this corrupts the caller's state. */
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE