1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2009. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24#include "l2param.h" 25 26#if GEMV_UNROLL < 4 27#undef GEMV_UNROLL 28#define GEMV_UNROLL 4 29#endif 30 31#ifndef WINDOWS_ABI 32 33#define STACKSIZE 64 34 35#define OLD_M %rdi 36#define OLD_N %rsi 37#define OLD_A %rcx 38#define OLD_LDA %r8 39#define STACK_INCX 8 + STACKSIZE(%rsp) 40#define STACK_Y 16 + STACKSIZE(%rsp) 41#define STACK_INCY 24 + STACKSIZE(%rsp) 42#define STACK_BUFFER 32 + STACKSIZE(%rsp) 43 44#else 45 46#define STACKSIZE 256 47 48#define OLD_M %rcx 49#define OLD_N %rdx 50#define OLD_A 40 + STACKSIZE(%rsp) 51#define OLD_LDA 48 + STACKSIZE(%rsp) 52#define OLD_X 56 + STACKSIZE(%rsp) 53#define STACK_INCX 64 + STACKSIZE(%rsp) 54#define STACK_Y 72 + STACKSIZE(%rsp) 55#define STACK_INCY 80 + STACKSIZE(%rsp) 56#define STACK_BUFFER 88 + STACKSIZE(%rsp) 57 58#endif 59 60#define LDA %r8 61#define X %r9 62 63#define INCX %rsi 64#define INCY %rdi 65 66#define M %r10 67#define N %r11 68#define A %r12 69#define Y %r14 70#define BUFFER %r13 71 72#define I %rax 73#define A1 %rbx 74#define A2 %rcx 75#define LDA3 %rdx 76#define X1 %rbp 77 78#define Y1 INCX 79 80#ifdef ALIGNED_ACCESS 81#define MM %r15 82#else 83#define MM M 84#endif 85 86#define ALPHA %xmm7 87 88 PROLOGUE 89 PROFCODE 90 91 subq $STACKSIZE, %rsp 92 movq %rbx, 0(%rsp) 93 movq %rbp, 8(%rsp) 94 movq %r12, 16(%rsp) 95 movq %r13, 24(%rsp) 96 movq %r14, 32(%rsp) 97 movq %r15, 40(%rsp) 98 99#ifdef WINDOWS_ABI 100 movq %rdi, 48(%rsp) 101 movq %rsi, 56(%rsp) 102 movups %xmm6, 64(%rsp) 103 movups %xmm7, 80(%rsp) 104 movups %xmm8, 96(%rsp) 105 movups %xmm9, 112(%rsp) 106 movups %xmm10, 128(%rsp) 107 movups %xmm11, 144(%rsp) 108 movups %xmm12, 160(%rsp) 109 movups %xmm13, 176(%rsp) 110 movups %xmm14, 192(%rsp) 111 movups %xmm15, 208(%rsp) 112 113 movq OLD_M, M 114 movq OLD_N, N 115 movq OLD_A, A 116 movq OLD_LDA, LDA 117 movq OLD_X, X 118#else 119 movq OLD_M, M 120 movq OLD_N, N 121 movq OLD_A, 
A 122 movq OLD_LDA, LDA 123#endif 124 125 movq STACK_INCX, INCX 126 movq STACK_Y, Y 127 movq STACK_INCY, INCY 128 movq STACK_BUFFER, BUFFER 129 130#ifndef WINDOWS_ABI 131 pshufd $0, %xmm0, ALPHA 132#else 133 pshufd $0, %xmm3, ALPHA 134#endif 135 136 leaq (,INCX, SIZE), INCX 137 leaq (,INCY, SIZE), INCY 138 leaq (,LDA, SIZE), LDA 139 140 leaq (LDA, LDA, 2), LDA3 141 142#ifdef ALIGNED_ACCESS 143 movq M, MM 144 testq $4 * SIZE - 1, A 145 je .L0X 146 cmpq $3, M 147 jle .L0X 148 149 movq A, MM 150 sarq $BASE_SHIFT, MM 151 andq $3, MM 152 subq $4, MM 153 addq M, MM 154 155.L0X: 156#endif 157 158 testq M, M 159 jle .L999 160 testq N, N 161 jle .L999 162 ALIGN_4 163 164 subq $-32 * SIZE, A 165 166#ifdef ALIGNED_ACCESS 167 movq A, %rax 168 andq $4 * SIZE - 1, %rax 169 addq %rax, BUFFER 170#endif 171 172 movq BUFFER, X1 173 174 movq M, I 175 sarq $3, I 176 jle .L05 177 ALIGN_4 178 179.L02: 180 movss (X), %xmm0 181 addq INCX, X 182 movss (X), %xmm1 183 addq INCX, X 184 185 movss (X), %xmm2 186 addq INCX, X 187 movss (X), %xmm3 188 addq INCX, X 189 190 movss (X), %xmm4 191 addq INCX, X 192 movss (X), %xmm5 193 addq INCX, X 194 195 movss (X), %xmm6 196 addq INCX, X 197 movss (X), %xmm8 198 addq INCX, X 199 200 movss %xmm0, 0 * SIZE(X1) 201 movss %xmm1, 1 * SIZE(X1) 202 movss %xmm2, 2 * SIZE(X1) 203 movss %xmm3, 3 * SIZE(X1) 204 movss %xmm4, 4 * SIZE(X1) 205 movss %xmm5, 5 * SIZE(X1) 206 movss %xmm6, 6 * SIZE(X1) 207 movss %xmm8, 7 * SIZE(X1) 208 209 addq $8 * SIZE, X1 210 decq I 211 jg .L02 212 ALIGN_4 213 214.L05: 215 movq M, I 216 andq $7, I 217 jle .L10 218 ALIGN_2 219 220.L06: 221 movss (X), %xmm0 222 addq INCX, X 223 movss %xmm0, 0 * SIZE(X1) 224 addq $SIZE, X1 225 decq I 226 jg .L06 227 ALIGN_4 228 229.L10: 230 movq Y, Y1 231 232#ifdef ALIGNED_ACCESS 233 testq $4 * SIZE - 1, LDA 234 jne .L100 235#endif 236 237#if GEMV_UNROLL >= 8 238 239 cmpq $8, N 240 jl .L20 241 ALIGN_3 242 243.L11: 244 subq $8, N 245 246 leaq 32 * SIZE(BUFFER), X1 247 248 movq A, A1 249 leaq (A1, LDA, 
4), A2 250 leaq (A1, LDA, 8), A 251 252 xorps %xmm8, %xmm8 253 xorps %xmm9, %xmm9 254 xorps %xmm10, %xmm10 255 xorps %xmm11, %xmm11 256 xorps %xmm12, %xmm12 257 xorps %xmm13, %xmm13 258 xorps %xmm14, %xmm14 259 xorps %xmm15, %xmm15 260 261#ifdef ALIGNED_ACCESS 262 cmpq $3, M 263 jle .L17 264 265 testq $SIZE, A1 266 je .L1X 267 268 movss -32 * SIZE(A1), %xmm0 269 movss -32 * SIZE(X1), %xmm4 270 mulss %xmm4, %xmm0 271 addss %xmm0, %xmm8 272 movss -32 * SIZE(A1, LDA, 1), %xmm1 273 mulss %xmm4, %xmm1 274 addss %xmm1, %xmm9 275 movss -32 * SIZE(A1, LDA, 2), %xmm2 276 mulss %xmm4, %xmm2 277 addss %xmm2, %xmm10 278 movss -32 * SIZE(A1, LDA3, 1), %xmm3 279 mulss %xmm4, %xmm3 280 addss %xmm3, %xmm11 281 movss -32 * SIZE(A2), %xmm0 282 mulss %xmm4, %xmm0 283 addss %xmm0, %xmm12 284 movss -32 * SIZE(A2, LDA, 1), %xmm1 285 mulss %xmm4, %xmm1 286 addss %xmm1, %xmm13 287 movss -32 * SIZE(A2, LDA, 2), %xmm2 288 mulss %xmm4, %xmm2 289 addss %xmm2, %xmm14 290 movss -32 * SIZE(A2, LDA3, 1), %xmm3 291 mulss %xmm4, %xmm3 292 addss %xmm3, %xmm15 293 294 addq $1 * SIZE, A1 295 addq $1 * SIZE, A2 296 addq $1 * SIZE, X1 297 ALIGN_3 298 299.L1X: 300 testq $2 * SIZE, A1 301 je .L1XX 302 303#ifdef movsd 304 xorps %xmm0, %xmm0 305 xorps %xmm4, %xmm4 306#endif 307 movsd -32 * SIZE(A1), %xmm0 308 movsd -32 * SIZE(X1), %xmm4 309 mulps %xmm4, %xmm0 310 addps %xmm0, %xmm8 311#ifdef movsd 312 xorps %xmm1, %xmm1 313#endif 314 movsd -32 * SIZE(A1, LDA, 1), %xmm1 315 mulps %xmm4, %xmm1 316 addps %xmm1, %xmm9 317#ifdef movsd 318 xorps %xmm2, %xmm2 319#endif 320 movsd -32 * SIZE(A1, LDA, 2), %xmm2 321 mulps %xmm4, %xmm2 322 addps %xmm2, %xmm10 323#ifdef movsd 324 xorps %xmm3, %xmm3 325#endif 326 movsd -32 * SIZE(A1, LDA3, 1), %xmm3 327 mulps %xmm4, %xmm3 328 addps %xmm3, %xmm11 329 movsd -32 * SIZE(A2), %xmm0 330 mulps %xmm4, %xmm0 331 addps %xmm0, %xmm12 332 movsd -32 * SIZE(A2, LDA, 1), %xmm1 333 mulps %xmm4, %xmm1 334 addps %xmm1, %xmm13 335 movsd -32 * SIZE(A2, LDA, 2), %xmm2 336 mulps %xmm4, %xmm2 
337 addps %xmm2, %xmm14 338 movsd -32 * SIZE(A2, LDA3, 1), %xmm3 339 mulps %xmm4, %xmm3 340 addps %xmm3, %xmm15 341 342 addq $2 * SIZE, A1 343 addq $2 * SIZE, A2 344 addq $2 * SIZE, X1 345 ALIGN_3 346 347.L1XX: 348#endif 349 350 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 351 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 352 353#ifdef PREFETCHW 354 PREFETCHW 8 * SIZE(Y1) 355#endif 356 357 movq MM, I 358 sarq $4, I 359 jle .L15 360 361 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 362 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) 363 MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) 364 MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) 365 366 decq I 367 jle .L13 368 ALIGN_4 369 370.L12: 371#ifdef PREFETCH 372 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) 373#endif 374 375 mulps %xmm4, %xmm0 376 addps %xmm0, %xmm8 377 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) 378 mulps %xmm4, %xmm1 379 addps %xmm1, %xmm9 380 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) 381 mulps %xmm4, %xmm2 382 addps %xmm2, %xmm10 383 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) 384 mulps %xmm4, %xmm3 385 addps %xmm3, %xmm11 386 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) 387 388#ifdef PREFETCH 389 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1) 390#endif 391 392 mulps %xmm4, %xmm0 393 addps %xmm0, %xmm12 394 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 395 mulps %xmm4, %xmm1 396 addps %xmm1, %xmm13 397 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) 398 mulps %xmm4, %xmm2 399 addps %xmm2, %xmm14 400 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) 401 mulps %xmm4, %xmm3 402 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 403 addps %xmm3, %xmm15 404 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) 405 406#ifdef PREFETCH 407 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) 408#endif 409 410 mulps %xmm5, %xmm0 411 addps %xmm0, %xmm8 412 MOVUPS_A1 (-28 * SIZE, A2, %xmm0) 413 mulps %xmm5, %xmm1 414 addps %xmm1, %xmm9 415 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) 416 mulps %xmm5, %xmm2 417 addps %xmm2, %xmm10 418 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) 419 mulps %xmm5, %xmm3 420 addps %xmm3, 
%xmm11 421 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) 422 423#ifdef PREFETCH 424 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) 425#endif 426 427 mulps %xmm5, %xmm0 428 addps %xmm0, %xmm12 429 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 430 mulps %xmm5, %xmm1 431 addps %xmm1, %xmm13 432 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) 433 mulps %xmm5, %xmm2 434 addps %xmm2, %xmm14 435 MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) 436 mulps %xmm5, %xmm3 437 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 438 addps %xmm3, %xmm15 439 MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) 440 441#ifdef PREFETCH 442 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) 443#endif 444 445 mulps %xmm4, %xmm0 446 addps %xmm0, %xmm8 447 MOVUPS_A1 (-24 * SIZE, A2, %xmm0) 448 mulps %xmm4, %xmm1 449 addps %xmm1, %xmm9 450 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1) 451 mulps %xmm4, %xmm2 452 addps %xmm2, %xmm10 453 MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) 454 mulps %xmm4, %xmm3 455 addps %xmm3, %xmm11 456 MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) 457 mulps %xmm4, %xmm0 458 addps %xmm0, %xmm12 459 460#ifdef PREFETCH 461 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1) 462#endif 463 464 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 465 mulps %xmm4, %xmm1 466 addps %xmm1, %xmm13 467 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) 468 mulps %xmm4, %xmm2 469 addps %xmm2, %xmm14 470 MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) 471 mulps %xmm4, %xmm3 472 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 473 addps %xmm3, %xmm15 474 MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) 475 476#ifdef PREFETCH 477 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) 478#endif 479 480 mulps %xmm5, %xmm0 481 addps %xmm0, %xmm8 482 MOVUPS_A1 (-20 * SIZE, A2, %xmm0) 483 mulps %xmm5, %xmm1 484 addps %xmm1, %xmm9 485 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) 486 mulps %xmm5, %xmm2 487 addps %xmm2, %xmm10 488 MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) 489 mulps %xmm5, %xmm3 490 addps %xmm3, %xmm11 491 MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) 492 mulps %xmm5, %xmm0 493 addps %xmm0, 
%xmm12 494 495#ifdef PREFETCH 496 PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) 497#endif 498 499 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 500 mulps %xmm5, %xmm1 501 addps %xmm1, %xmm13 502 MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) 503 mulps %xmm5, %xmm2 504 addps %xmm2, %xmm14 505 506#ifdef PREFETCHW 507 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) 508#endif 509 510 MOVUPS_A2 (-16 * SIZE, A1, LDA, 2, %xmm2) 511 mulps %xmm5, %xmm3 512 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 513 addps %xmm3, %xmm15 514 MOVUPS_A2 (-16 * SIZE, A1, LDA3, 1, %xmm3) 515 516 addq $16 * SIZE, A1 517 addq $16 * SIZE, A2 518 addq $16 * SIZE, X1 519 520 decq I 521 jg .L12 522 ALIGN_4 523 524.L13: 525 mulps %xmm4, %xmm0 526 addps %xmm0, %xmm8 527 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) 528 mulps %xmm4, %xmm1 529 addps %xmm1, %xmm9 530 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) 531 mulps %xmm4, %xmm2 532 addps %xmm2, %xmm10 533 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) 534 mulps %xmm4, %xmm3 535 addps %xmm3, %xmm11 536 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) 537 mulps %xmm4, %xmm0 538 addps %xmm0, %xmm12 539 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 540 mulps %xmm4, %xmm1 541 addps %xmm1, %xmm13 542 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) 543 mulps %xmm4, %xmm2 544 addps %xmm2, %xmm14 545 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) 546 mulps %xmm4, %xmm3 547 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 548 addps %xmm3, %xmm15 549 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) 550 551 mulps %xmm5, %xmm0 552 addps %xmm0, %xmm8 553 MOVUPS_A1 (-28 * SIZE, A2, %xmm0) 554 mulps %xmm5, %xmm1 555 addps %xmm1, %xmm9 556 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) 557 mulps %xmm5, %xmm2 558 addps %xmm2, %xmm10 559 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) 560 mulps %xmm5, %xmm3 561 addps %xmm3, %xmm11 562 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) 563 mulps %xmm5, %xmm0 564 addps %xmm0, %xmm12 565 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 566 mulps %xmm5, %xmm1 567 addps %xmm1, %xmm13 568 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) 569 mulps 
%xmm5, %xmm2 570 addps %xmm2, %xmm14 571 MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) 572 mulps %xmm5, %xmm3 573 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 574 addps %xmm3, %xmm15 575 MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) 576 577 mulps %xmm4, %xmm0 578 addps %xmm0, %xmm8 579 MOVUPS_A1 (-24 * SIZE, A2, %xmm0) 580 mulps %xmm4, %xmm1 581 addps %xmm1, %xmm9 582 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1) 583 mulps %xmm4, %xmm2 584 addps %xmm2, %xmm10 585 MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) 586 mulps %xmm4, %xmm3 587 addps %xmm3, %xmm11 588 MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) 589 mulps %xmm4, %xmm0 590 addps %xmm0, %xmm12 591 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 592 mulps %xmm4, %xmm1 593 addps %xmm1, %xmm13 594 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) 595 mulps %xmm4, %xmm2 596 addps %xmm2, %xmm14 597 MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) 598 mulps %xmm4, %xmm3 599 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 600 addps %xmm3, %xmm15 601 MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) 602 603 mulps %xmm5, %xmm0 604 addps %xmm0, %xmm8 605 MOVUPS_A1 (-20 * SIZE, A2, %xmm0) 606 mulps %xmm5, %xmm1 607 addps %xmm1, %xmm9 608 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) 609 mulps %xmm5, %xmm2 610 addps %xmm2, %xmm10 611 MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) 612 mulps %xmm5, %xmm3 613 addps %xmm3, %xmm11 614 MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) 615 mulps %xmm5, %xmm0 616 addps %xmm0, %xmm12 617 mulps %xmm5, %xmm1 618 addps %xmm1, %xmm13 619 mulps %xmm5, %xmm2 620 addps %xmm2, %xmm14 621 mulps %xmm5, %xmm3 622 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 623 addps %xmm3, %xmm15 624 625 addq $16 * SIZE, A1 626 addq $16 * SIZE, A2 627 addq $16 * SIZE, X1 628 ALIGN_4 629 630.L15: 631 testq $8, MM 632 jle .L16 633 634 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 635 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) 636 MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) 637 MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) 638 639 mulps %xmm4, %xmm0 640 addps %xmm0, %xmm8 641 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) 642 mulps %xmm4, 
%xmm1 643 addps %xmm1, %xmm9 644 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) 645 mulps %xmm4, %xmm2 646 addps %xmm2, %xmm10 647 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) 648 mulps %xmm4, %xmm3 649 addps %xmm3, %xmm11 650 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) 651 mulps %xmm4, %xmm0 652 addps %xmm0, %xmm12 653 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 654 mulps %xmm4, %xmm1 655 addps %xmm1, %xmm13 656 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) 657 mulps %xmm4, %xmm2 658 addps %xmm2, %xmm14 659 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) 660 mulps %xmm4, %xmm3 661 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 662 addps %xmm3, %xmm15 663 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) 664 665 mulps %xmm5, %xmm0 666 addps %xmm0, %xmm8 667 MOVUPS_A1 (-28 * SIZE, A2, %xmm0) 668 mulps %xmm5, %xmm1 669 addps %xmm1, %xmm9 670 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) 671 mulps %xmm5, %xmm2 672 addps %xmm2, %xmm10 673 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) 674 mulps %xmm5, %xmm3 675 addps %xmm3, %xmm11 676 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) 677 mulps %xmm5, %xmm0 678 addps %xmm0, %xmm12 679 mulps %xmm5, %xmm1 680 addps %xmm1, %xmm13 681 mulps %xmm5, %xmm2 682 addps %xmm2, %xmm14 683 mulps %xmm5, %xmm3 684 addps %xmm3, %xmm15 685 686 addq $8 * SIZE, A1 687 addq $8 * SIZE, A2 688 addq $8 * SIZE, X1 689 ALIGN_4 690 691.L16: 692 testq $4, MM 693 jle .L17 694 695 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 696 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) 697 MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) 698 MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) 699 700 mulps %xmm4, %xmm0 701 addps %xmm0, %xmm8 702 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) 703 mulps %xmm4, %xmm1 704 addps %xmm1, %xmm9 705 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) 706 mulps %xmm4, %xmm2 707 addps %xmm2, %xmm10 708 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) 709 mulps %xmm4, %xmm3 710 addps %xmm3, %xmm11 711 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) 712 mulps %xmm4, %xmm0 713 addps %xmm0, %xmm12 714 mulps %xmm4, %xmm1 715 addps %xmm1, %xmm13 716 
mulps %xmm4, %xmm2 717 addps %xmm2, %xmm14 718 mulps %xmm4, %xmm3 719 addps %xmm3, %xmm15 720 721 addq $4 * SIZE, A1 722 addq $4 * SIZE, A2 723 addq $4 * SIZE, X1 724 ALIGN_4 725 726.L17: 727 testq $2, MM 728 jle .L18 729 730#ifdef movsd 731 xorps %xmm0, %xmm0 732#endif 733 movsd -32 * SIZE(A1), %xmm0 734#ifdef movsd 735 xorps %xmm4, %xmm4 736#endif 737 movsd -32 * SIZE(X1), %xmm4 738 mulps %xmm4, %xmm0 739 addps %xmm0, %xmm8 740#ifdef movsd 741 xorps %xmm1, %xmm1 742#endif 743 movsd -32 * SIZE(A1, LDA, 1), %xmm1 744 mulps %xmm4, %xmm1 745 addps %xmm1, %xmm9 746#ifdef movsd 747 xorps %xmm2, %xmm2 748#endif 749 movsd -32 * SIZE(A1, LDA, 2), %xmm2 750 mulps %xmm4, %xmm2 751 addps %xmm2, %xmm10 752#ifdef movsd 753 xorps %xmm3, %xmm3 754#endif 755 movsd -32 * SIZE(A1, LDA3, 1), %xmm3 756 mulps %xmm4, %xmm3 757 addps %xmm3, %xmm11 758 movsd -32 * SIZE(A2), %xmm0 759 mulps %xmm4, %xmm0 760 addps %xmm0, %xmm12 761 movsd -32 * SIZE(A2, LDA, 1), %xmm1 762 mulps %xmm4, %xmm1 763 addps %xmm1, %xmm13 764 movsd -32 * SIZE(A2, LDA, 2), %xmm2 765 mulps %xmm4, %xmm2 766 addps %xmm2, %xmm14 767 movsd -32 * SIZE(A2, LDA3, 1), %xmm3 768 mulps %xmm4, %xmm3 769 addps %xmm3, %xmm15 770 771 addq $2 * SIZE, A1 772 addq $2 * SIZE, A2 773 addq $2 * SIZE, X1 774 ALIGN_4 775 776.L18: 777 testq $1, MM 778 jle .L19 779 780 movss -32 * SIZE(A1), %xmm0 781 movss -32 * SIZE(X1), %xmm4 782 mulss %xmm4, %xmm0 783 addss %xmm0, %xmm8 784 movss -32 * SIZE(A1, LDA, 1), %xmm1 785 mulss %xmm4, %xmm1 786 addss %xmm1, %xmm9 787 movss -32 * SIZE(A1, LDA, 2), %xmm2 788 mulss %xmm4, %xmm2 789 addss %xmm2, %xmm10 790 movss -32 * SIZE(A1, LDA3, 1), %xmm3 791 mulss %xmm4, %xmm3 792 addss %xmm3, %xmm11 793 movss -32 * SIZE(A2), %xmm0 794 mulss %xmm4, %xmm0 795 addss %xmm0, %xmm12 796 movss -32 * SIZE(A2, LDA, 1), %xmm1 797 mulss %xmm4, %xmm1 798 addss %xmm1, %xmm13 799 movss -32 * SIZE(A2, LDA, 2), %xmm2 800 mulss %xmm4, %xmm2 801 addss %xmm2, %xmm14 802 movss -32 * SIZE(A2, LDA3, 1), %xmm3 803 mulss %xmm4, %xmm3 
804 addss %xmm3, %xmm15 805 ALIGN_4 806 807.L19: 808#ifdef HAVE_SSE3 809 haddps %xmm9, %xmm8 810 haddps %xmm11, %xmm10 811 haddps %xmm10, %xmm8 812 813 pshufd $0x1, %xmm8, %xmm9 814 pshufd $0x2, %xmm8, %xmm10 815 pshufd $0x3, %xmm8, %xmm11 816 817 haddps %xmm13, %xmm12 818 haddps %xmm15, %xmm14 819 haddps %xmm14, %xmm12 820 821 pshufd $0x1, %xmm12, %xmm13 822 pshufd $0x2, %xmm12, %xmm14 823 pshufd $0x3, %xmm12, %xmm15 824#else 825 movaps %xmm8, %xmm0 826 unpcklps %xmm9, %xmm8 827 unpckhps %xmm9, %xmm0 828 829 movaps %xmm10, %xmm1 830 unpcklps %xmm11, %xmm10 831 unpckhps %xmm11, %xmm1 832 833 movaps %xmm8, %xmm9 834 unpcklps %xmm10, %xmm8 835 unpckhps %xmm10, %xmm9 836 837 movaps %xmm0, %xmm10 838 unpcklps %xmm1, %xmm0 839 unpckhps %xmm1, %xmm10 840 841 addps %xmm9, %xmm8 842 addps %xmm0, %xmm10 843 addps %xmm10, %xmm8 844 845 pshufd $0x2, %xmm8, %xmm9 846 pshufd $0x1, %xmm8, %xmm10 847 pshufd $0x3, %xmm8, %xmm11 848 849 movaps %xmm12, %xmm0 850 unpcklps %xmm13, %xmm12 851 unpckhps %xmm13, %xmm0 852 853 movaps %xmm14, %xmm1 854 unpcklps %xmm15, %xmm14 855 unpckhps %xmm15, %xmm1 856 857 movaps %xmm12, %xmm13 858 unpcklps %xmm14, %xmm12 859 unpckhps %xmm14, %xmm13 860 861 movaps %xmm0, %xmm14 862 unpcklps %xmm1, %xmm0 863 unpckhps %xmm1, %xmm14 864 865 addps %xmm13, %xmm12 866 addps %xmm0, %xmm14 867 addps %xmm14, %xmm12 868 869 pshufd $0x2, %xmm12, %xmm13 870 pshufd $0x1, %xmm12, %xmm14 871 pshufd $0x3, %xmm12, %xmm15 872#endif 873 874 mulss ALPHA, %xmm8 875 mulss ALPHA, %xmm9 876 mulss ALPHA, %xmm10 877 mulss ALPHA, %xmm11 878 mulss ALPHA, %xmm12 879 mulss ALPHA, %xmm13 880 mulss ALPHA, %xmm14 881 mulss ALPHA, %xmm15 882 883 addss (Y), %xmm8 884 addq INCY, Y 885 addss (Y), %xmm9 886 addq INCY, Y 887 addss (Y), %xmm10 888 addq INCY, Y 889 addss (Y), %xmm11 890 addq INCY, Y 891 addss (Y), %xmm12 892 addq INCY, Y 893 addss (Y), %xmm13 894 addq INCY, Y 895 addss (Y), %xmm14 896 addq INCY, Y 897 addss (Y), %xmm15 898 addq INCY, Y 899 900 movss %xmm8, (Y1) 901 addq INCY, 
Y1 902 movss %xmm9, (Y1) 903 addq INCY, Y1 904 movss %xmm10, (Y1) 905 addq INCY, Y1 906 movss %xmm11, (Y1) 907 addq INCY, Y1 908 movss %xmm12, (Y1) 909 addq INCY, Y1 910 movss %xmm13, (Y1) 911 addq INCY, Y1 912 movss %xmm14, (Y1) 913 addq INCY, Y1 914 movss %xmm15, (Y1) 915 addq INCY, Y1 916 917 cmpq $8, N 918 jge .L11 919 ALIGN_4 920 921.L20: 922#endif 923 924 cmpq $4, N 925 jl .L30 926 927#if GEMV_UNROLL == 4 928 ALIGN_3 929 930.L21: 931#endif 932 subq $4, N 933 934 leaq 32 * SIZE(BUFFER), X1 935 936 movq A, A1 937 leaq (A1, LDA, 2), A2 938 leaq (A1, LDA, 4), A 939 940 xorps %xmm8, %xmm8 941 xorps %xmm9, %xmm9 942 xorps %xmm10, %xmm10 943 xorps %xmm11, %xmm11 944 945#ifdef ALIGNED_ACCESS 946 cmpq $3, M 947 jle .L27 948 949 testq $SIZE, A1 950 je .L2X 951 952 movss -32 * SIZE(A1), %xmm0 953 movss -32 * SIZE(X1), %xmm4 954 mulss %xmm4, %xmm0 955 addss %xmm0, %xmm8 956 movss -32 * SIZE(A1, LDA), %xmm1 957 mulss %xmm4, %xmm1 958 addss %xmm1, %xmm9 959 movss -32 * SIZE(A2), %xmm2 960 mulss %xmm4, %xmm2 961 addss %xmm2, %xmm10 962 movss -32 * SIZE(A2, LDA), %xmm3 963 mulss %xmm4, %xmm3 964 addss %xmm3, %xmm11 965 966 addq $1 * SIZE, A1 967 addq $1 * SIZE, A2 968 addq $1 * SIZE, X1 969 ALIGN_3 970 971.L2X: 972 testq $2 * SIZE, A1 973 je .L2XX 974 975#ifdef movsd 976 xorps %xmm0, %xmm0 977 xorps %xmm4, %xmm4 978#endif 979 movsd -32 * SIZE(A1), %xmm0 980 movsd -32 * SIZE(X1), %xmm4 981 mulps %xmm4, %xmm0 982 addps %xmm0, %xmm8 983#ifdef movsd 984 xorps %xmm1, %xmm1 985#endif 986 movsd -32 * SIZE(A1, LDA), %xmm1 987 mulps %xmm4, %xmm1 988 addps %xmm1, %xmm9 989#ifdef movsd 990 xorps %xmm2, %xmm2 991#endif 992 movsd -32 * SIZE(A2), %xmm2 993 mulps %xmm4, %xmm2 994 addps %xmm2, %xmm10 995#ifdef movsd 996 xorps %xmm3, %xmm3 997#endif 998 movsd -32 * SIZE(A2, LDA), %xmm3 999 mulps %xmm4, %xmm3 1000 addps %xmm3, %xmm11 1001 1002 addq $2 * SIZE, A1 1003 addq $2 * SIZE, A2 1004 addq $2 * SIZE, X1 1005 ALIGN_3 1006 1007.L2XX: 1008#endif 1009 1010 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 
1011 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 1012 1013#if (GEMV_UNROLL == 4) && defined(PREFETCHW) 1014 PREFETCHW 4 * SIZE(Y1) 1015#endif 1016 1017 movq MM, I 1018 sarq $4, I 1019 jle .L25 1020 1021 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 1022 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) 1023 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) 1024 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) 1025 1026 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) 1027 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) 1028 MOVUPS_A1 (-28 * SIZE, A2, %xmm14) 1029 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) 1030 1031 decq I 1032 jle .L23 1033 ALIGN_4 1034 1035.L22: 1036#ifdef PREFETCH 1037 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) 1038#endif 1039 1040 mulps %xmm4, %xmm0 1041 addps %xmm0, %xmm8 1042 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 1043 mulps %xmm4, %xmm1 1044 addps %xmm1, %xmm9 1045 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) 1046 mulps %xmm4, %xmm2 1047 addps %xmm2, %xmm10 1048 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) 1049 mulps %xmm4, %xmm3 1050 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 1051 addps %xmm3, %xmm11 1052 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) 1053 1054#ifdef PREFETCH 1055 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) 1056#endif 1057 1058 mulps %xmm5, %xmm12 1059 addps %xmm12, %xmm8 1060 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) 1061 mulps %xmm5, %xmm13 1062 addps %xmm13, %xmm9 1063 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) 1064 mulps %xmm5, %xmm14 1065 addps %xmm14, %xmm10 1066 MOVUPS_A1 (-20 * SIZE, A2, %xmm14) 1067 mulps %xmm5, %xmm15 1068 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 1069 addps %xmm15, %xmm11 1070 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) 1071 1072#ifdef PREFETCH 1073 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) 1074#endif 1075 1076 mulps %xmm4, %xmm0 1077 addps %xmm0, %xmm8 1078 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 1079 mulps %xmm4, %xmm1 1080 addps %xmm1, %xmm9 1081 MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) 1082 mulps %xmm4, %xmm2 1083 addps %xmm2, %xmm10 1084 MOVUPS_A1 (-16 * SIZE, A2, %xmm2) 1085 mulps 
%xmm4, %xmm3 1086 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 1087 addps %xmm3, %xmm11 1088 MOVUPS_A2 (-16 * SIZE, A2, LDA, 1, %xmm3) 1089 1090#ifdef PREFETCH 1091 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) 1092#endif 1093 1094 mulps %xmm5, %xmm12 1095 addps %xmm12, %xmm8 1096 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) 1097 mulps %xmm5, %xmm13 1098 addps %xmm13, %xmm9 1099 MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) 1100 1101#ifdef PREFETCHW 1102 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) 1103#endif 1104 1105 mulps %xmm5, %xmm14 1106 addps %xmm14, %xmm10 1107 MOVUPS_A1 (-12 * SIZE, A2, %xmm14) 1108 mulps %xmm5, %xmm15 1109 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 1110 addps %xmm15, %xmm11 1111 MOVUPS_A2 (-12 * SIZE, A2, LDA, 1, %xmm15) 1112 1113 addq $16 * SIZE, A1 1114 addq $16 * SIZE, A2 1115 addq $16 * SIZE, X1 1116 1117 decq I 1118 jg .L22 1119 ALIGN_4 1120 1121.L23: 1122 mulps %xmm4, %xmm0 1123 addps %xmm0, %xmm8 1124 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 1125 mulps %xmm4, %xmm1 1126 addps %xmm1, %xmm9 1127 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) 1128 mulps %xmm4, %xmm2 1129 addps %xmm2, %xmm10 1130 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) 1131 mulps %xmm4, %xmm3 1132 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 1133 addps %xmm3, %xmm11 1134 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) 1135 1136 mulps %xmm5, %xmm12 1137 addps %xmm12, %xmm8 1138 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) 1139 mulps %xmm5, %xmm13 1140 addps %xmm13, %xmm9 1141 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) 1142 mulps %xmm5, %xmm14 1143 addps %xmm14, %xmm10 1144 MOVUPS_A1 (-20 * SIZE, A2, %xmm14) 1145 mulps %xmm5, %xmm15 1146 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 1147 addps %xmm15, %xmm11 1148 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) 1149 1150 mulps %xmm4, %xmm0 1151 addps %xmm0, %xmm8 1152 mulps %xmm4, %xmm1 1153 addps %xmm1, %xmm9 1154 mulps %xmm4, %xmm2 1155 addps %xmm2, %xmm10 1156 mulps %xmm4, %xmm3 1157 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 1158 addps %xmm3, %xmm11 1159 1160 mulps %xmm5, %xmm12 1161 addps %xmm12, 
%xmm8 1162 mulps %xmm5, %xmm13 1163 addps %xmm13, %xmm9 1164 mulps %xmm5, %xmm14 1165 addps %xmm14, %xmm10 1166 mulps %xmm5, %xmm15 1167 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 1168 addps %xmm15, %xmm11 1169 1170 addq $16 * SIZE, A1 1171 addq $16 * SIZE, A2 1172 addq $16 * SIZE, X1 1173 ALIGN_4 1174 1175.L25: 1176 testq $8, MM 1177 jle .L26 1178 1179 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 1180 mulps %xmm4, %xmm0 1181 addps %xmm0, %xmm8 1182 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) 1183 mulps %xmm4, %xmm1 1184 addps %xmm1, %xmm9 1185 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) 1186 mulps %xmm4, %xmm2 1187 addps %xmm2, %xmm10 1188 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) 1189 mulps %xmm4, %xmm3 1190 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 1191 addps %xmm3, %xmm11 1192 1193 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) 1194 mulps %xmm5, %xmm12 1195 addps %xmm12, %xmm8 1196 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) 1197 mulps %xmm5, %xmm13 1198 addps %xmm13, %xmm9 1199 MOVUPS_A1 (-28 * SIZE, A2, %xmm14) 1200 mulps %xmm5, %xmm14 1201 addps %xmm14, %xmm10 1202 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) 1203 mulps %xmm5, %xmm15 1204 addps %xmm15, %xmm11 1205 1206 addq $8 * SIZE, A1 1207 addq $8 * SIZE, A2 1208 addq $8 * SIZE, X1 1209 ALIGN_4 1210 1211.L26: 1212 testq $4, MM 1213 jle .L27 1214 1215 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 1216 mulps %xmm4, %xmm0 1217 addps %xmm0, %xmm8 1218 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) 1219 mulps %xmm4, %xmm1 1220 addps %xmm1, %xmm9 1221 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) 1222 mulps %xmm4, %xmm2 1223 addps %xmm2, %xmm10 1224 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) 1225 mulps %xmm4, %xmm3 1226 addps %xmm3, %xmm11 1227 1228 addq $4 * SIZE, A1 1229 addq $4 * SIZE, A2 1230 addq $4 * SIZE, X1 1231 ALIGN_4 1232 1233.L27: 1234 testq $2, MM 1235 jle .L28 1236 1237#ifdef movsd 1238 xorps %xmm0, %xmm0 1239#endif 1240 movsd -32 * SIZE(A1), %xmm0 1241#ifdef movsd 1242 xorps %xmm4, %xmm4 1243#endif 1244 movsd -32 * SIZE(X1), %xmm4 1245 mulps %xmm4, %xmm0 1246 addps %xmm0, %xmm8 
1247#ifdef movsd 1248 xorps %xmm1, %xmm1 1249#endif 1250 movsd -32 * SIZE(A1, LDA), %xmm1 1251 mulps %xmm4, %xmm1 1252 addps %xmm1, %xmm9 1253#ifdef movsd 1254 xorps %xmm2, %xmm2 1255#endif 1256 movsd -32 * SIZE(A2), %xmm2 1257 mulps %xmm4, %xmm2 1258 addps %xmm2, %xmm10 1259#ifdef movsd 1260 xorps %xmm3, %xmm3 1261#endif 1262 movsd -32 * SIZE(A2, LDA), %xmm3 1263 mulps %xmm4, %xmm3 1264 addps %xmm3, %xmm11 1265 shufps $0xe, %xmm4, %xmm4 1266 1267 addq $2 * SIZE, A1 1268 addq $2 * SIZE, A2 1269 addq $2 * SIZE, X1 1270 ALIGN_4 1271 1272.L28: 1273 testq $1, MM 1274 jle .L29 1275 1276 movss -32 * SIZE(A1), %xmm0 1277 movss -32 * SIZE(X1), %xmm4 1278 mulss %xmm4, %xmm0 1279 addss %xmm0, %xmm8 1280 movss -32 * SIZE(A1, LDA), %xmm1 1281 mulss %xmm4, %xmm1 1282 addss %xmm1, %xmm9 1283 movss -32 * SIZE(A2), %xmm2 1284 mulss %xmm4, %xmm2 1285 addss %xmm2, %xmm10 1286 movss -32 * SIZE(A2, LDA), %xmm3 1287 mulss %xmm4, %xmm3 1288 addss %xmm3, %xmm11 1289 ALIGN_4 1290 1291.L29: 1292#ifdef HAVE_SSE3 1293 haddps %xmm9, %xmm8 1294 haddps %xmm11, %xmm10 1295 haddps %xmm10, %xmm8 1296 1297 pshufd $0x1, %xmm8, %xmm9 1298 pshufd $0x2, %xmm8, %xmm10 1299 pshufd $0x3, %xmm8, %xmm11 1300#else 1301 movaps %xmm8, %xmm0 1302 unpcklps %xmm9, %xmm8 1303 unpckhps %xmm9, %xmm0 1304 1305 movaps %xmm10, %xmm1 1306 unpcklps %xmm11, %xmm10 1307 unpckhps %xmm11, %xmm1 1308 1309 movaps %xmm8, %xmm9 1310 unpcklps %xmm10, %xmm8 1311 unpckhps %xmm10, %xmm9 1312 1313 movaps %xmm0, %xmm10 1314 unpcklps %xmm1, %xmm0 1315 unpckhps %xmm1, %xmm10 1316 1317 addps %xmm9, %xmm8 1318 addps %xmm0, %xmm10 1319 addps %xmm10, %xmm8 1320 1321 pshufd $0x2, %xmm8, %xmm9 1322 pshufd $0x1, %xmm8, %xmm10 1323 pshufd $0x3, %xmm8, %xmm11 1324#endif 1325 1326 mulss ALPHA, %xmm8 1327 mulss ALPHA, %xmm9 1328 mulss ALPHA, %xmm10 1329 mulss ALPHA, %xmm11 1330 1331 addss (Y), %xmm8 1332 addq INCY, Y 1333 addss (Y), %xmm9 1334 addq INCY, Y 1335 addss (Y), %xmm10 1336 addq INCY, Y 1337 addss (Y), %xmm11 1338 addq INCY, Y 1339 1340 
movss %xmm8, (Y1) 1341 addq INCY, Y1 1342 movss %xmm9, (Y1) 1343 addq INCY, Y1 1344 movss %xmm10, (Y1) 1345 addq INCY, Y1 1346 movss %xmm11, (Y1) 1347 addq INCY, Y1 1348 1349#if GEMV_UNROLL == 4 1350 cmpq $4, N 1351 jge .L21 1352#endif 1353 ALIGN_4 1354 1355.L30: 1356 cmpq $3, N 1357 jne .L40 1358 1359 leaq 32 * SIZE(BUFFER), X1 1360 1361 movq A, A1 1362 leaq (A1, LDA, 2), A2 1363 leaq (A1, LDA, 4), A 1364 1365 xorps %xmm8, %xmm8 1366 xorps %xmm9, %xmm9 1367 xorps %xmm10, %xmm10 1368 1369#ifdef ALIGNED_ACCESS 1370 cmpq $3, M 1371 jle .L37 1372 1373 testq $SIZE, A1 1374 je .L3X 1375 1376 movss -32 * SIZE(A1), %xmm0 1377 movss -32 * SIZE(X1), %xmm4 1378 mulss %xmm4, %xmm0 1379 addss %xmm0, %xmm8 1380 movss -32 * SIZE(A1, LDA), %xmm1 1381 mulss %xmm4, %xmm1 1382 addss %xmm1, %xmm9 1383 movss -32 * SIZE(A2), %xmm2 1384 mulss %xmm4, %xmm2 1385 addss %xmm2, %xmm10 1386 movss -32 * SIZE(A2, LDA), %xmm3 1387 mulss %xmm4, %xmm3 1388 addss %xmm3, %xmm11 1389 1390 addq $1 * SIZE, A1 1391 addq $1 * SIZE, A2 1392 addq $1 * SIZE, X1 1393 ALIGN_3 1394 1395.L3X: 1396 testq $2 * SIZE, A1 1397 je .L3XX 1398 1399#ifdef movsd 1400 xorps %xmm0, %xmm0 1401 xorps %xmm4, %xmm4 1402#endif 1403 movsd -32 * SIZE(A1), %xmm0 1404 movsd -32 * SIZE(X1), %xmm4 1405 mulps %xmm4, %xmm0 1406 addps %xmm0, %xmm8 1407#ifdef movsd 1408 xorps %xmm1, %xmm1 1409#endif 1410 movsd -32 * SIZE(A1, LDA), %xmm1 1411 mulps %xmm4, %xmm1 1412 addps %xmm1, %xmm9 1413#ifdef movsd 1414 xorps %xmm2, %xmm2 1415#endif 1416 movsd -32 * SIZE(A2), %xmm2 1417 mulps %xmm4, %xmm2 1418 addps %xmm2, %xmm10 1419#ifdef movsd 1420 xorps %xmm3, %xmm3 1421#endif 1422 movsd -32 * SIZE(A2, LDA), %xmm3 1423 mulps %xmm4, %xmm3 1424 addps %xmm3, %xmm11 1425 1426 addq $2 * SIZE, A1 1427 addq $2 * SIZE, A2 1428 addq $2 * SIZE, X1 1429 ALIGN_3 1430 1431.L3XX: 1432#endif 1433 1434 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 1435 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 1436 1437#if (GEMV_UNROLL == 4) && defined(PREFETCHW) 1438 PREFETCHW 4 * SIZE(Y1) 
1439#endif 1440 1441 movq MM, I 1442 sarq $4, I 1443 jle .L35 1444 1445 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 1446 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) 1447 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) 1448 1449 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) 1450 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) 1451 MOVUPS_A1 (-28 * SIZE, A2, %xmm14) 1452 1453 decq I 1454 jle .L33 1455 ALIGN_4 1456 1457.L32: 1458#ifdef PREFETCH 1459 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) 1460#endif 1461 1462 mulps %xmm4, %xmm0 1463 addps %xmm0, %xmm8 1464 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 1465 mulps %xmm4, %xmm1 1466 addps %xmm1, %xmm9 1467 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) 1468 mulps %xmm4, %xmm2 1469 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 1470 addps %xmm2, %xmm10 1471 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) 1472 1473#ifdef PREFETCH 1474 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) 1475#endif 1476 1477 mulps %xmm5, %xmm12 1478 addps %xmm12, %xmm8 1479 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) 1480 mulps %xmm5, %xmm13 1481 addps %xmm13, %xmm9 1482 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) 1483 mulps %xmm5, %xmm14 1484 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 1485 addps %xmm14, %xmm10 1486 MOVUPS_A1 (-20 * SIZE, A2, %xmm14) 1487 1488#ifdef PREFETCH 1489 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) 1490#endif 1491 1492 mulps %xmm4, %xmm0 1493 addps %xmm0, %xmm8 1494 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 1495 mulps %xmm4, %xmm1 1496 addps %xmm1, %xmm9 1497 MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) 1498 mulps %xmm4, %xmm2 1499 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 1500 addps %xmm2, %xmm10 1501 MOVUPS_A1 (-16 * SIZE, A2, %xmm2) 1502 1503#ifdef PREFETCHW 1504 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) 1505#endif 1506 1507 mulps %xmm5, %xmm12 1508 addps %xmm12, %xmm8 1509 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) 1510 mulps %xmm5, %xmm13 1511 addps %xmm13, %xmm9 1512 MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) 1513 mulps %xmm5, %xmm14 1514 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 1515 addps %xmm14, %xmm10 1516 
	MOVUPS_A1 (-12 * SIZE, A2, %xmm14)

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1

	decq	I
	jg	.L32
	ALIGN_4

.L33:
	/* drain the software pipeline: finish the last preloaded 16-chunk */
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm4, %xmm2
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)

	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm13
	addps	%xmm13, %xmm9
	MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13)
	mulps	%xmm5, %xmm14
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm14, %xmm10
	MOVUPS_A1 (-20 * SIZE, A2, %xmm14)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)	/* preload x for the $8 tail */

	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	mulps	%xmm5, %xmm13
	addps	%xmm13, %xmm9
	mulps	%xmm5, %xmm14
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm14, %xmm10

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1
	ALIGN_4

.L35:
	/* remainder: 8 elements */
	testq	$8, MM
	jle	.L36

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
	mulps	%xmm4, %xmm2
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm2, %xmm10

	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13)
	mulps	%xmm5, %xmm13
	addps	%xmm13, %xmm9
	MOVUPS_A1 (-28 * SIZE, A2, %xmm14)
	mulps	%xmm5, %xmm14
	addps	%xmm14, %xmm10

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1
	ALIGN_4

.L36:
	/* remainder: 4 elements */
	testq	$4, MM
	jle	.L37

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, X1
	ALIGN_4

.L37:
	/* remainder: 2 elements */
	testq	$2, MM
	jle	.L38

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(A1), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
#ifdef movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	-32 * SIZE(A2), %xmm2
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
#ifdef movsd
	/* NOTE(review): vestigial — there is no fourth-column movsd after
	   this zeroing in the N==3 path; xmm3 is not used below. */
	xorps	%xmm3, %xmm3
#endif

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_4

.L38:
	/* remainder: final single element */
	testq	$1, MM
	jle	.L39

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	movss	-32 * SIZE(A1, LDA), %xmm1
	mulss	%xmm4, %xmm1
	addss	%xmm1, %xmm9
	movss	-32 * SIZE(A2), %xmm2
	mulss	%xmm4, %xmm2
	addss	%xmm2, %xmm10
	ALIGN_4

.L39:
	/* N==3 horizontal reduction — same shuffle network as the N==4 case;
	   the xmm11 lane participates but only three scalars are extracted,
	   so its value never reaches y. */
#ifdef HAVE_SSE3
	haddps	%xmm9, %xmm8
	haddps	%xmm11, %xmm10
	haddps	%xmm10, %xmm8

	pshufd	$0x1, %xmm8, %xmm9
	pshufd	$0x2, %xmm8, %xmm10
#else
	movaps	%xmm8, %xmm0
	unpcklps %xmm9, %xmm8
	unpckhps %xmm9, %xmm0

	movaps	%xmm10, %xmm1
	unpcklps %xmm11, %xmm10
	unpckhps %xmm11, %xmm1

	movaps	%xmm8, %xmm9
	unpcklps %xmm10, %xmm8
	unpckhps %xmm10, %xmm9

	movaps	%xmm0, %xmm10
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm10

	addps	%xmm9, %xmm8
	addps	%xmm0, %xmm10
	addps	%xmm10, %xmm8		/* xmm8 = [sum8, sum10, sum9, --] */

	pshufd	$0x2, %xmm8, %xmm9
	pshufd	$0x1, %xmm8, %xmm10
#endif

	/* y[0..2] = alpha * dot + y[0..2], then done (N==3 is terminal) */
	mulss	ALPHA, %xmm8
	mulss	ALPHA, %xmm9
	mulss	ALPHA, %xmm10

	addss	(Y), %xmm8
	addq	INCY, Y
	addss	(Y), %xmm9
	addq	INCY, Y
	addss	(Y), %xmm10
	addq	INCY, Y

	movss	%xmm8, (Y1)
	addq	INCY, Y1
	movss	%xmm9, (Y1)
	addq	INCY, Y1
	movss	%xmm10, (Y1)
	addq	INCY, Y1
	jmp	.L999
	ALIGN_4

.L40:
	/* N == 2 (aligned path): columns A1 and A2 = A1 + LDA;
	   accumulators xmm8/xmm9. */
	cmpq	$2, N
	jne	.L50

	leaq	32 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA), A2
	leaq	(A1, LDA, 2), A

	xorps	%xmm8, %xmm8
	xorps	%xmm9, %xmm9

#ifdef ALIGNED_ACCESS
	cmpq	$3, M
	jle	.L47

	/* 1-element peel to 2*SIZE alignment */
	testq	$SIZE, A1
	je	.L4X

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	movss	-32 * SIZE(A2), %xmm1
	mulss	%xmm4, %xmm1
	addss	%xmm1, %xmm9

	addq	$1 * SIZE, A1
	addq	$1 * SIZE, A2
	addq	$1 * SIZE, X1
	ALIGN_3

.L4X:
	/* 2-element peel to 4*SIZE alignment */
	testq	$2 * SIZE, A1
	je	.L4XX

#ifdef movsd
	xorps	%xmm0, %xmm0
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm0
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A2), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_3

.L4XX:
#endif

	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)

	/* main loop over M in chunks of 16, software-pipelined */
	movq	MM, I
	sarq	$4, I
	jle	.L45

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A1 (-32 * SIZE, A2, %xmm1)
	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
	MOVUPS_A1 (-28 * SIZE, A2, %xmm13)

	decq	I
	jle	.L43
	ALIGN_4

.L42:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	/* N==2 loop body: 16 elements per iteration, two columns */
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm1
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm1, %xmm9
	MOVUPS_A1 (-24 * SIZE, A2, %xmm1)

	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm13
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm13, %xmm9
	MOVUPS_A1 (-20 * SIZE, A2, %xmm13)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
#endif

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm1
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm1, %xmm9
	MOVUPS_A1 (-16 * SIZE, A2, %xmm1)

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif

	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm13
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm13, %xmm9
	MOVUPS_A1 (-12 * SIZE, A2, %xmm13)

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1

	decq	I
	jg	.L42
	ALIGN_4

.L43:
	/* drain the pipeline for the final 16-chunk */
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm1
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm1, %xmm9
	MOVUPS_A1 (-24 * SIZE, A2, %xmm1)

	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm13
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm13, %xmm9
	MOVUPS_A1 (-20 * SIZE, A2, %xmm13)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	mulps	%xmm4, %xmm1
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm1, %xmm9

	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	mulps	%xmm5, %xmm13
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm13, %xmm9
	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1
	ALIGN_4

.L45:
	/* remainder: 8 elements */
	testq	$8, MM
	jle	.L46

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-32 * SIZE, A2, %xmm1)
	mulps	%xmm4, %xmm1
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm1, %xmm9

	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm8
	MOVUPS_A1 (-28 * SIZE, A2, %xmm13)
	mulps	%xmm5, %xmm13
	addps	%xmm13, %xmm9

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1
	ALIGN_4

.L46:
	/* remainder: 4 elements */
	testq	$4, MM
	jle	.L47

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-32 * SIZE, A2, %xmm1)
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, X1
	ALIGN_4

.L47:
	/* remainder: 2 elements */
	testq	$2, MM
	jle	.L48

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(A1), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A2), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
	shufps	$0xe, %xmm4, %xmm4	/* drop the two consumed x lanes */

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_4

.L48:
	/* remainder: final single element */
	testq	$1, MM
	jle	.L49

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	movss	-32 * SIZE(A2), %xmm1
	mulss	%xmm4, %xmm1
	addss	%xmm1, %xmm9
	ALIGN_4

.L49:
	/* N==2 horizontal reduction: xmm8[0]=dot(col0), xmm8[1]=dot(col1) */
#ifdef HAVE_SSE3
	haddps	%xmm9, %xmm8
	haddps	%xmm8, %xmm8
#else
	movaps	%xmm8, %xmm10
	unpcklps %xmm9, %xmm8
	unpckhps %xmm9, %xmm10

	addps	%xmm10, %xmm8
	movhlps	%xmm8, %xmm9
	addps	%xmm9, %xmm8
#endif

	pshufd	$0x1, %xmm8, %xmm9	/* second column's dot product */

	/* y[0..1] = alpha * dot + y[0..1]; N==2 is terminal */
	mulss	ALPHA, %xmm8
	mulss	ALPHA, %xmm9

	addss	(Y), %xmm8
	addq	INCY, Y
	addss	(Y), %xmm9
	addq	INCY, Y

	movss	%xmm8, (Y1)
	addq	INCY, Y1
	movss	%xmm9, (Y1)
	addq	INCY, Y1
	jmp	.L999
	ALIGN_4

.L50:
	/* N == 1: single column A1; two accumulators (xmm8/xmm9) are used to
	   split the dependency chain, merged at .L59. */
	cmpq	$1, N
	jne	.L999

	leaq	32 * SIZE(BUFFER), X1

	movq	A, A1

	xorps	%xmm8, %xmm8
	xorps	%xmm9, %xmm9

#ifdef ALIGNED_ACCESS
	cmpq	$3, M
	jle	.L57

	/* 1-element peel */
	testq	$SIZE, A1
	je	.L5X

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8

	addq	$1 * SIZE, A1
	addq	$1 * SIZE, X1
	ALIGN_3

.L5X:
	/* 2-element peel */
	testq	$2 * SIZE, A1
	je	.L5XX


#ifdef movsd
	xorps	%xmm0, %xmm0
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm0
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0xe, %xmm4, %xmm4

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, X1
	ALIGN_3

.L5XX:
#endif

	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)

	/* main loop: 16 elements per iteration */
	movq	MM, I
	sarq	$4, I
	jle	.L55

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)

	decq	I
	jle	.L53
	ALIGN_4

.L52:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
#endif

	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
#endif

	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, X1

	decq	I
	jg	.L52
	ALIGN_4

.L53:
	/* drain the pipeline for the last 16-chunk */
	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)

	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, X1
	ALIGN_4

.L55:
	/* remainder: 8 elements */
	testq	$8, MM
	jle	.L56

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8

	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm9

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, X1
	ALIGN_4

.L56:
	/* remainder: 4 elements */
	testq	$4, MM
	jle	.L57

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, X1
	ALIGN_4

.L57:
	/* remainder: 2 elements */
	testq	$2, MM
	jle	.L58

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(A1), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0xe, %xmm4, %xmm4

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, X1
	ALIGN_4

.L58:
	/* remainder: final single element */
	testq	$1, MM
	jle	.L59

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	ALIGN_4

.L59:
	/* merge the two partial accumulators, then reduce 4 lanes to 1 */
	addps	%xmm9, %xmm8

#ifdef HAVE_SSE3
	haddps	%xmm8, %xmm8
	haddps	%xmm8, %xmm8
#else
	pshufd	$1, %xmm8, %xmm9
	pshufd	$2, %xmm8, %xmm10
	pshufd	$3, %xmm8, %xmm11

	addss	%xmm9, %xmm8
	addss	%xmm11, %xmm10
	addss	%xmm10, %xmm8
#endif

	/* y[0] = alpha * dot + y[0] */
	mulss	ALPHA, %xmm8

	addss	(Y), %xmm8
	movss	%xmm8, (Y1)

#ifdef ALIGNED_ACCESS
	jmp	.L999
	ALIGN_4

.L100:
	/* Entry for matrices whose leading dimension is not 2-element
	   aligned; .L200 handles the fully odd case.  This section keeps
	   columns 0/2 aligned and fixes up columns 1/3 (offset by one
	   element) with pair-swapping shufps ($0x4e = swap 64-bit halves). */
	testq	$2 * SIZE - 1, LDA
	jne	.L200

	cmpq	$4, N
	jl	.L110
	ALIGN_3

.L101:
	/* N >= 4: process 4 columns per pass */
	subq	$4, N

	leaq	32 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA, 2), A2
	leaq	(A1, LDA, 4), A

	xorps	%xmm8, %xmm8
	xorps	%xmm9, %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

	cmpq	$3, M
	jle	.L107

	/* 1-element peel to 2*SIZE alignment of A1 */
	testq	$SIZE, A1
	je	.L10X

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	movss	-32 * SIZE(A1, LDA), %xmm1
	mulss	%xmm4, %xmm1
	addss	%xmm1, %xmm9
	movss	-32 * SIZE(A2), %xmm2
	mulss	%xmm4, %xmm2
	addss	%xmm2, %xmm10
	movss	-32 * SIZE(A2, LDA), %xmm3
	mulss	%xmm4, %xmm3
	addss	%xmm3, %xmm11

	addq	$1 * SIZE, A1
	addq	$1 * SIZE, A2
	addq	$1 * SIZE, X1
	ALIGN_3

.L10X:
	/* 2-element peel to 4*SIZE alignment of A1 */
	testq	$2 * SIZE, A1
	je	.L10XX

#ifdef movsd
	xorps	%xmm0, %xmm0
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm0
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
#ifdef movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	-32 * SIZE(A2), %xmm2
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
#ifdef movsd
	xorps	%xmm3, %xmm3
#endif
	movsd	-32 * SIZE(A2, LDA), %xmm3
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm11

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_3

.L10XX:
	/* seed the staggered (-34) loads for odd-offset columns 1 and 3;
	   combined with the -30 loads below via shufps $0x4e */
	MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12)
	MOVUPS_A2 (-34 * SIZE, A2, LDA, 1, %xmm13)

	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCHW	4 * SIZE(Y1)
#endif

	/* main loop over M in chunks of 16 */
	movq	MM, I
	sarq	$4, I
	jle	.L105

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
	MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3)

	decq	I
	jle	.L103
	ALIGN_4

.L102:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	/* columns 0/2 multiply directly; columns 1/3 are realigned first:
	   shufps $0x4e splices the high pair of the previous load with the
	   low pair of the next one */
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm3, %xmm13
	mulps	%xmm4, %xmm13
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm13, %xmm11
	MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA)
#endif

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm5, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm13, %xmm3
	mulps	%xmm5, %xmm3
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm3, %xmm11
	MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm3, %xmm13
	mulps	%xmm4, %xmm13
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm13, %xmm11
	MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA)
#endif

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1)

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
#endif

	mulps	%xmm5, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-16 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm13, %xmm3
	mulps	%xmm5, %xmm3
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm3, %xmm11
	MOVUPS_A2 (-14 * SIZE, A2, LDA, 1, %xmm3)

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1

	decq	I
	jg	.L102
	ALIGN_4

.L103:
	/* drain the software pipeline: same body as .L102, no prefetch,
	   no reload on the final quarter */
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm3, %xmm13
	mulps	%xmm4, %xmm13
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm13, %xmm11
	MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm5, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm13, %xmm3
	mulps	%xmm5, %xmm3
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm3, %xmm11
	MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm3, %xmm13
	mulps	%xmm4, %xmm13
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm13, %xmm11
	MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	mulps	%xmm5, %xmm2
	addps	%xmm2, %xmm10
	shufps	$0x4e, %xmm13, %xmm3
	mulps	%xmm5, %xmm3
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm3, %xmm11

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1
	ALIGN_4

.L105:
	/* remainder: 8 elements */
	testq	$8, MM
	jle	.L106

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
	MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)
	shufps	$0x4e, %xmm3, %xmm13
	mulps	%xmm4, %xmm13
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm13, %xmm11
	MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	mulps	%xmm5, %xmm2
	addps	%xmm2, %xmm10
	shufps	$0x4e, %xmm13, %xmm3
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm11

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1
	ALIGN_4

.L106:
	/* remainder: 4 elements */
	testq	$4, MM
	jle	.L107

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9

	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
	MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3)
	shufps	$0x4e, %xmm3, %xmm13
	mulps	%xmm4, %xmm13
	addps	%xmm13, %xmm11

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, X1
	ALIGN_4

.L107:
	/* remainder: 2 elements (plain movsd loads; no realignment needed) */
	testq	$2, MM
	jle	.L108

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(A1), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
#ifdef movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	-32 * SIZE(A2), %xmm2
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10
#ifdef movsd
	xorps	%xmm3, %xmm3
#endif
	movsd	-32 * SIZE(A2, LDA), %xmm3
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm11
	shufps	$0xe, %xmm4, %xmm4

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_4

.L108:
	/* remainder: final single element */
	testq	$1, MM
	jle	.L109

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	movss	-32 * SIZE(A1, LDA), %xmm1
	mulss	%xmm4, %xmm1
	addss	%xmm1, %xmm9
	movss	-32 * SIZE(A2), %xmm2
	mulss	%xmm4, %xmm2
	addss	%xmm2, %xmm10
	movss	-32 * SIZE(A2, LDA), %xmm3
	mulss	%xmm4, %xmm3
	addss	%xmm3, %xmm11
	ALIGN_4

.L109:
	/* horizontal reduction of the four per-column accumulators
	   (same network as .L29) */
#ifdef HAVE_SSE3
	haddps	%xmm9, %xmm8
	haddps	%xmm11, %xmm10
	haddps	%xmm10, %xmm8

	pshufd	$0x1, %xmm8, %xmm9
	pshufd	$0x2, %xmm8, %xmm10
	pshufd	$0x3, %xmm8, %xmm11
#else
	movaps	%xmm8, %xmm0
	unpcklps %xmm9, %xmm8
	unpckhps %xmm9, %xmm0

	movaps	%xmm10, %xmm1
	unpcklps %xmm11, %xmm10
	unpckhps %xmm11, %xmm1

	movaps	%xmm8, %xmm9
	unpcklps %xmm10, %xmm8
	unpckhps %xmm10, %xmm9

	movaps	%xmm0, %xmm10
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm10

	addps	%xmm9, %xmm8
	addps	%xmm0, %xmm10
	addps	%xmm10, %xmm8

	pshufd	$0x2, %xmm8, %xmm9
	pshufd	$0x1, %xmm8, %xmm10
	pshufd	$0x3, %xmm8, %xmm11
#endif

	/* y[j] = alpha * dot + y[j] for the four columns */
	mulss	ALPHA, %xmm8
	mulss	ALPHA, %xmm9
	mulss	ALPHA, %xmm10
	mulss	ALPHA, %xmm11

	addss	(Y), %xmm8
	addq	INCY, Y
	addss	(Y), %xmm9
	addq	INCY, Y
	addss	(Y), %xmm10
	addq	INCY, Y
	addss	(Y), %xmm11
	addq	INCY, Y

	movss	%xmm8, (Y1)
	addq	INCY, Y1
	movss	%xmm9, (Y1)
	addq	INCY, Y1
	movss	%xmm10, (Y1)
	addq	INCY, Y1
	movss	%xmm11, (Y1)
	addq	INCY, Y1

	cmpq	$4, N
	jge	.L101
	ALIGN_4

.L110:
	/* N == 3 on the LDA-2-aligned path; accumulators xmm8/xmm9/xmm10.
	   NOTE(review): xmm11 is not zeroed here, yet it feeds the .L119
	   reduction — its lane is discarded (only 3 results extracted). */
	cmpq	$3, N
	jne	.L120

	leaq	32 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA, 2), A2
	leaq	(A1, LDA, 4), A

	xorps	%xmm8, %xmm8
	xorps	%xmm9, %xmm9
	xorps	%xmm10, %xmm10

	cmpq	$3, M
	jle	.L117

	/* 1-element peel (only the 3 real columns are touched here) */
	testq	$SIZE, A1
	je	.L11X

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	movss	-32 * SIZE(A1, LDA), %xmm1
	mulss	%xmm4, %xmm1
	addss	%xmm1, %xmm9
	movss	-32 * SIZE(A2), %xmm2
	mulss	%xmm4, %xmm2
	addss	%xmm2, %xmm10

	addq	$1 * SIZE, A1
	addq	$1 * SIZE, A2
	addq	$1 * SIZE, X1
	ALIGN_3

.L11X:
	/* 2-element peel */
	testq	$2 * SIZE, A1
	je	.L11XX

#ifdef movsd
	xorps	%xmm0, %xmm0
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm0
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
#ifdef movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	-32 * SIZE(A2), %xmm2
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_3

.L11XX:
	/* seed the staggered load for the odd-offset middle column */
	MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12)

	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)

	/* main loop over M in chunks of 16 */
	movq	MM, I
	sarq	$4, I
	jle	.L115

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)

	decq	I
	jle	.L113
	ALIGN_4

.L112:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1)
#endif

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12	/* realign odd-offset column */
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA)
#endif

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm5, %xmm2
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2)
#endif

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1)
#endif

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm5, %xmm2
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-16 * SIZE, A2, %xmm2)

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1

	decq	I
	jg	.L112
	ALIGN_4

.L113:
	/* drain the pipeline for the last 16-chunk */
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1)
	mulps	%xmm5, %xmm2
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-24 * SIZE, A2, %xmm2)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-20 * SIZE, A2, %xmm2)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	mulps	%xmm5, %xmm2
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm2, %xmm10

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1
	ALIGN_4

.L115:
	/* remainder: 8 elements */
	testq	$8, MM
	jle	.L116

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	shufps	$0x4e, %xmm1, %xmm12	/* realign odd-offset column */
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9
	MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12)
	mulps	%xmm4, %xmm2
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm2, %xmm10
	MOVUPS_A1 (-28 * SIZE, A2, %xmm2)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0x4e, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9
	mulps	%xmm5, %xmm2
	addps	%xmm2, %xmm10

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1
	ALIGN_4

.L116:
	/* remainder: 4 elements */
	testq	$4, MM
	jle	.L117

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1)
	shufps	$0x4e, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9

	MOVUPS_A1 (-32 * SIZE, A2, %xmm2)
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, X1
	ALIGN_4

.L117:
	/* remainder: 2 elements */
	testq	$2, MM
	jle	.L118

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(A1), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
#ifdef movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	-32 * SIZE(A2), %xmm2
	mulps	%xmm4, %xmm2
	addps	%xmm2, %xmm10

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_4

.L118:
	/* remainder: final single element */
	testq	$1, MM
	jle	.L119

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
2957 addss %xmm0, %xmm8 2958 movss -32 * SIZE(A1, LDA), %xmm1 2959 mulss %xmm4, %xmm1 2960 addss %xmm1, %xmm9 2961 movss -32 * SIZE(A2), %xmm2 2962 mulss %xmm4, %xmm2 2963 addss %xmm2, %xmm10 2964 ALIGN_4 2965 2966.L119: 2967#ifdef HAVE_SSE3 2968 haddps %xmm9, %xmm8 2969 haddps %xmm11, %xmm10 2970 haddps %xmm10, %xmm8 2971 2972 pshufd $0x1, %xmm8, %xmm9 2973 pshufd $0x2, %xmm8, %xmm10 2974#else 2975 movaps %xmm8, %xmm0 2976 unpcklps %xmm9, %xmm8 2977 unpckhps %xmm9, %xmm0 2978 2979 movaps %xmm10, %xmm1 2980 unpcklps %xmm11, %xmm10 2981 unpckhps %xmm11, %xmm1 2982 2983 movaps %xmm8, %xmm9 2984 unpcklps %xmm10, %xmm8 2985 unpckhps %xmm10, %xmm9 2986 2987 movaps %xmm0, %xmm10 2988 unpcklps %xmm1, %xmm0 2989 unpckhps %xmm1, %xmm10 2990 2991 addps %xmm9, %xmm8 2992 addps %xmm0, %xmm10 2993 addps %xmm10, %xmm8 2994 2995 pshufd $0x2, %xmm8, %xmm9 2996 pshufd $0x1, %xmm8, %xmm10 2997#endif 2998 2999 mulss ALPHA, %xmm8 3000 mulss ALPHA, %xmm9 3001 mulss ALPHA, %xmm10 3002 3003 addss (Y), %xmm8 3004 addq INCY, Y 3005 addss (Y), %xmm9 3006 addq INCY, Y 3007 addss (Y), %xmm10 3008 3009 movss %xmm8, (Y1) 3010 addq INCY, Y1 3011 movss %xmm9, (Y1) 3012 addq INCY, Y1 3013 movss %xmm10, (Y1) 3014 jmp .L999 3015 ALIGN_4 3016 3017.L120: 3018 cmpq $2, N 3019 jne .L130 3020 3021 leaq 32 * SIZE(BUFFER), X1 3022 3023 movq A, A1 3024 leaq (A1, LDA), A2 3025 leaq (A1, LDA, 2), A 3026 3027 xorps %xmm8, %xmm8 3028 xorps %xmm9, %xmm9 3029 3030 cmpq $3, M 3031 jle .L127 3032 3033 testq $SIZE, A1 3034 je .L12X 3035 3036 movss -32 * SIZE(A1), %xmm0 3037 movss -32 * SIZE(X1), %xmm4 3038 mulss %xmm4, %xmm0 3039 addss %xmm0, %xmm8 3040 movss -32 * SIZE(A2), %xmm1 3041 mulss %xmm4, %xmm1 3042 addss %xmm1, %xmm9 3043 3044 addq $1 * SIZE, A1 3045 addq $1 * SIZE, A2 3046 addq $1 * SIZE, X1 3047 ALIGN_3 3048 3049.L12X: 3050 testq $2 * SIZE, A1 3051 je .L12XX 3052 3053#ifdef movsd 3054 xorps %xmm0, %xmm0 3055 xorps %xmm4, %xmm4 3056#endif 3057 movsd -32 * SIZE(A1), %xmm0 3058 movsd -32 * SIZE(X1), %xmm4 
3059 mulps %xmm4, %xmm0 3060 addps %xmm0, %xmm8 3061#ifdef movsd 3062 xorps %xmm1, %xmm1 3063#endif 3064 movsd -32 * SIZE(A2), %xmm1 3065 mulps %xmm4, %xmm1 3066 addps %xmm1, %xmm9 3067 3068 addq $2 * SIZE, A1 3069 addq $2 * SIZE, A2 3070 addq $2 * SIZE, X1 3071 ALIGN_3 3072 3073.L12XX: 3074 MOVUPS_A1 (-34 * SIZE, A2, %xmm12) 3075 3076 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 3077 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 3078 3079 movq MM, I 3080 sarq $4, I 3081 jle .L125 3082 3083 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3084 MOVUPS_A1 (-30 * SIZE, A2, %xmm1) 3085 3086 decq I 3087 jle .L123 3088 ALIGN_4 3089 3090.L122: 3091#ifdef PREFETCH 3092 PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) 3093#endif 3094 3095 mulps %xmm4, %xmm0 3096 addps %xmm0, %xmm8 3097 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 3098 shufps $0x4e, %xmm1, %xmm12 3099 mulps %xmm4, %xmm12 3100 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3101 addps %xmm12, %xmm9 3102 MOVUPS_A1 (-26 * SIZE, A2, %xmm12) 3103 3104 mulps %xmm5, %xmm0 3105 addps %xmm0, %xmm8 3106 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 3107 shufps $0x4e, %xmm12, %xmm1 3108 mulps %xmm5, %xmm1 3109 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 3110 addps %xmm1, %xmm9 3111 MOVUPS_A1 (-22 * SIZE, A2, %xmm1) 3112 3113#ifdef PREFETCH 3114 PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) 3115#endif 3116 3117 mulps %xmm4, %xmm0 3118 addps %xmm0, %xmm8 3119 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 3120 shufps $0x4e, %xmm1, %xmm12 3121 mulps %xmm4, %xmm12 3122 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 3123 addps %xmm12, %xmm9 3124 MOVUPS_A1 (-18 * SIZE, A2, %xmm12) 3125 3126#ifdef PREFETCHW 3127 PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) 3128#endif 3129 3130 mulps %xmm5, %xmm0 3131 addps %xmm0, %xmm8 3132 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 3133 shufps $0x4e, %xmm12, %xmm1 3134 mulps %xmm5, %xmm1 3135 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 3136 addps %xmm1, %xmm9 3137 MOVUPS_A1 (-14 * SIZE, A2, %xmm1) 3138 3139 addq $16 * SIZE, A1 3140 addq $16 * SIZE, A2 3141 addq $16 * SIZE, X1 3142 3143 decq I 3144 jg 
.L122 3145 ALIGN_4 3146 3147.L123: 3148 mulps %xmm4, %xmm0 3149 addps %xmm0, %xmm8 3150 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 3151 shufps $0x4e, %xmm1, %xmm12 3152 mulps %xmm4, %xmm12 3153 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3154 addps %xmm12, %xmm9 3155 MOVUPS_A1 (-26 * SIZE, A2, %xmm12) 3156 3157 mulps %xmm5, %xmm0 3158 addps %xmm0, %xmm8 3159 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 3160 shufps $0x4e, %xmm12, %xmm1 3161 mulps %xmm5, %xmm1 3162 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 3163 addps %xmm1, %xmm9 3164 MOVUPS_A1 (-22 * SIZE, A2, %xmm1) 3165 3166 mulps %xmm4, %xmm0 3167 addps %xmm0, %xmm8 3168 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 3169 shufps $0x4e, %xmm1, %xmm12 3170 mulps %xmm4, %xmm12 3171 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 3172 addps %xmm12, %xmm9 3173 MOVUPS_A1 (-18 * SIZE, A2, %xmm12) 3174 3175 mulps %xmm5, %xmm0 3176 addps %xmm0, %xmm8 3177 shufps $0x4e, %xmm12, %xmm1 3178 mulps %xmm5, %xmm1 3179 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 3180 addps %xmm1, %xmm9 3181 3182 addq $16 * SIZE, A1 3183 addq $16 * SIZE, A2 3184 addq $16 * SIZE, X1 3185 ALIGN_4 3186 3187.L125: 3188 testq $8, MM 3189 jle .L126 3190 3191 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3192 MOVUPS_A1 (-30 * SIZE, A2, %xmm1) 3193 3194 mulps %xmm4, %xmm0 3195 addps %xmm0, %xmm8 3196 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 3197 shufps $0x4e, %xmm1, %xmm12 3198 mulps %xmm4, %xmm12 3199 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3200 addps %xmm12, %xmm9 3201 MOVUPS_A1 (-26 * SIZE, A2, %xmm12) 3202 3203 mulps %xmm5, %xmm0 3204 addps %xmm0, %xmm8 3205 shufps $0x4e, %xmm12, %xmm1 3206 mulps %xmm5, %xmm1 3207 addps %xmm1, %xmm9 3208 3209 addq $8 * SIZE, A1 3210 addq $8 * SIZE, A2 3211 addq $8 * SIZE, X1 3212 ALIGN_4 3213 3214.L126: 3215 testq $4, MM 3216 jle .L127 3217 3218 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3219 mulps %xmm4, %xmm0 3220 addps %xmm0, %xmm8 3221 MOVUPS_A1 (-30 * SIZE, A2, %xmm1) 3222 shufps $0x4e, %xmm1, %xmm12 3223 mulps %xmm4, %xmm12 3224 addps %xmm12, %xmm9 3225 3226 addq $4 * SIZE, A1 3227 addq $4 * SIZE, A2 3228 addq $4 
* SIZE, X1 3229 ALIGN_4 3230 3231.L127: 3232 testq $2, MM 3233 jle .L128 3234 3235#ifdef movsd 3236 xorps %xmm0, %xmm0 3237#endif 3238 movsd -32 * SIZE(A1), %xmm0 3239#ifdef movsd 3240 xorps %xmm4, %xmm4 3241#endif 3242 movsd -32 * SIZE(X1), %xmm4 3243 mulps %xmm4, %xmm0 3244 addps %xmm0, %xmm8 3245#ifdef movsd 3246 xorps %xmm1, %xmm1 3247#endif 3248 movsd -32 * SIZE(A2), %xmm1 3249 mulps %xmm4, %xmm1 3250 addps %xmm1, %xmm9 3251 shufps $0xe, %xmm4, %xmm4 3252 3253 addq $2 * SIZE, A1 3254 addq $2 * SIZE, A2 3255 addq $2 * SIZE, X1 3256 ALIGN_4 3257 3258.L128: 3259 testq $1, MM 3260 jle .L129 3261 3262 movss -32 * SIZE(A1), %xmm0 3263 movss -32 * SIZE(X1), %xmm4 3264 mulss %xmm4, %xmm0 3265 addss %xmm0, %xmm8 3266 movss -32 * SIZE(A2), %xmm1 3267 mulss %xmm4, %xmm1 3268 addss %xmm1, %xmm9 3269 ALIGN_4 3270 3271.L129: 3272#ifdef HAVE_SSE3 3273 haddps %xmm9, %xmm8 3274 haddps %xmm8, %xmm8 3275#else 3276 movaps %xmm8, %xmm10 3277 unpcklps %xmm9, %xmm8 3278 unpckhps %xmm9, %xmm10 3279 3280 addps %xmm10, %xmm8 3281 movhlps %xmm8, %xmm9 3282 addps %xmm9, %xmm8 3283#endif 3284 3285 pshufd $0x1, %xmm8, %xmm9 3286 3287 mulss ALPHA, %xmm8 3288 mulss ALPHA, %xmm9 3289 3290 addss (Y), %xmm8 3291 addq INCY, Y 3292 addss (Y), %xmm9 3293 addq INCY, Y 3294 3295 movss %xmm8, (Y1) 3296 addq INCY, Y1 3297 movss %xmm9, (Y1) 3298 addq INCY, Y1 3299 jmp .L999 3300 ALIGN_4 3301 3302.L130: 3303 cmpq $1, N 3304 jne .L999 3305 3306 leaq 32 * SIZE(BUFFER), X1 3307 3308 movq A, A1 3309 3310 xorps %xmm8, %xmm8 3311 xorps %xmm9, %xmm9 3312 3313 cmpq $3, M 3314 jle .L137 3315 3316 testq $SIZE, A1 3317 je .L13X 3318 3319 movss -32 * SIZE(A1), %xmm0 3320 movss -32 * SIZE(X1), %xmm4 3321 mulss %xmm4, %xmm0 3322 addss %xmm0, %xmm8 3323 3324 addq $1 * SIZE, A1 3325 addq $1 * SIZE, X1 3326 ALIGN_3 3327 3328.L13X: 3329 testq $2 * SIZE, A1 3330 je .L13XX 3331 3332 3333#ifdef movsd 3334 xorps %xmm0, %xmm0 3335 xorps %xmm4, %xmm4 3336#endif 3337 movsd -32 * SIZE(A1), %xmm0 3338 movsd -32 * SIZE(X1), %xmm4 
3339 mulps %xmm4, %xmm0 3340 addps %xmm0, %xmm8 3341 shufps $0xe, %xmm4, %xmm4 3342 3343 addq $2 * SIZE, A1 3344 addq $2 * SIZE, X1 3345 ALIGN_3 3346 3347.L13XX: 3348 3349 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 3350 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 3351 3352 movq MM, I 3353 sarq $4, I 3354 jle .L135 3355 3356 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3357 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) 3358 3359 decq I 3360 jle .L133 3361 ALIGN_4 3362 3363.L132: 3364#ifdef PREFETCH 3365 PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) 3366#endif 3367 3368 mulps %xmm4, %xmm0 3369 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3370 addps %xmm0, %xmm8 3371 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 3372 3373 mulps %xmm5, %xmm12 3374 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 3375 addps %xmm12, %xmm9 3376 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) 3377 3378#ifdef PREFETCHW 3379 PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) 3380#endif 3381 3382 mulps %xmm4, %xmm0 3383 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 3384 addps %xmm0, %xmm8 3385 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 3386 3387 mulps %xmm5, %xmm12 3388 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 3389 addps %xmm12, %xmm9 3390 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) 3391 3392 addq $16 * SIZE, A1 3393 addq $16 * SIZE, X1 3394 3395 decq I 3396 jg .L132 3397 ALIGN_4 3398 3399.L133: 3400 mulps %xmm4, %xmm0 3401 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3402 addps %xmm0, %xmm8 3403 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 3404 3405 mulps %xmm5, %xmm12 3406 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 3407 addps %xmm12, %xmm9 3408 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) 3409 3410 mulps %xmm4, %xmm0 3411 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 3412 addps %xmm0, %xmm8 3413 3414 mulps %xmm5, %xmm12 3415 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 3416 addps %xmm12, %xmm9 3417 3418 addq $16 * SIZE, A1 3419 addq $16 * SIZE, X1 3420 ALIGN_4 3421 3422.L135: 3423 testq $8, MM 3424 jle .L136 3425 3426 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3427 mulps %xmm4, %xmm0 3428 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3429 addps %xmm0, %xmm8 3430 3431 MOVUPS_A1 (-28 * 
SIZE, A1, %xmm12) 3432 mulps %xmm5, %xmm12 3433 addps %xmm12, %xmm9 3434 3435 addq $8 * SIZE, A1 3436 addq $8 * SIZE, X1 3437 ALIGN_4 3438 3439.L136: 3440 testq $4, MM 3441 jle .L137 3442 3443 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3444 mulps %xmm4, %xmm0 3445 addps %xmm0, %xmm8 3446 3447 addq $4 * SIZE, A1 3448 addq $4 * SIZE, X1 3449 ALIGN_4 3450 3451.L137: 3452 testq $2, MM 3453 jle .L138 3454 3455#ifdef movsd 3456 xorps %xmm0, %xmm0 3457#endif 3458 movsd -32 * SIZE(A1), %xmm0 3459#ifdef movsd 3460 xorps %xmm4, %xmm4 3461#endif 3462 movsd -32 * SIZE(X1), %xmm4 3463 mulps %xmm4, %xmm0 3464 addps %xmm0, %xmm8 3465 shufps $0xe, %xmm4, %xmm4 3466 3467 addq $2 * SIZE, A1 3468 addq $2 * SIZE, X1 3469 ALIGN_4 3470 3471.L138: 3472 testq $1, MM 3473 jle .L139 3474 3475 movss -32 * SIZE(A1), %xmm0 3476 movss -32 * SIZE(X1), %xmm4 3477 mulss %xmm4, %xmm0 3478 addss %xmm0, %xmm8 3479 ALIGN_4 3480 3481.L139: 3482 addps %xmm9, %xmm8 3483 3484#ifdef HAVE_SSE3 3485 haddps %xmm8, %xmm8 3486 haddps %xmm8, %xmm8 3487#else 3488 pshufd $1, %xmm8, %xmm9 3489 pshufd $2, %xmm8, %xmm10 3490 pshufd $3, %xmm8, %xmm11 3491 3492 addss %xmm9, %xmm8 3493 addss %xmm11, %xmm10 3494 addss %xmm10, %xmm8 3495#endif 3496 3497 mulss ALPHA, %xmm8 3498 3499 addss (Y), %xmm8 3500 movss %xmm8, (Y1) 3501 jmp .L999 3502 ALIGN_4 3503 3504.L200: 3505 testq $2 * SIZE, LDA 3506 jne .L300 3507 3508 cmpq $4, N 3509 jl .L210 3510 ALIGN_3 3511 3512.L201: 3513 subq $4, N 3514 3515 leaq 32 * SIZE(BUFFER), X1 3516 3517 movq A, A1 3518 leaq (A1, LDA, 2), A2 3519 leaq (A1, LDA, 4), A 3520 3521 xorps %xmm8, %xmm8 3522 xorps %xmm9, %xmm9 3523 xorps %xmm10, %xmm10 3524 xorps %xmm11, %xmm11 3525 3526 cmpq $3, M 3527 jle .L207 3528 3529 testq $SIZE, A1 3530 je .L20X 3531 3532 movss -32 * SIZE(A1), %xmm0 3533 movss -32 * SIZE(X1), %xmm4 3534 mulss %xmm4, %xmm0 3535 addss %xmm0, %xmm8 3536 movss -32 * SIZE(A1, LDA), %xmm1 3537 mulss %xmm4, %xmm1 3538 addss %xmm1, %xmm9 3539 movss -32 * SIZE(A2), %xmm2 3540 mulss %xmm4, %xmm2 3541 
addss %xmm2, %xmm10 3542 movss -32 * SIZE(A2, LDA), %xmm3 3543 mulss %xmm4, %xmm3 3544 addss %xmm3, %xmm11 3545 3546 addq $1 * SIZE, A1 3547 addq $1 * SIZE, A2 3548 addq $1 * SIZE, X1 3549 ALIGN_3 3550 3551.L20X: 3552 testq $2 * SIZE, A1 3553 je .L20XX 3554 3555#ifdef movsd 3556 xorps %xmm0, %xmm0 3557 xorps %xmm4, %xmm4 3558#endif 3559 movsd -32 * SIZE(A1), %xmm0 3560 movsd -32 * SIZE(X1), %xmm4 3561 mulps %xmm4, %xmm0 3562 addps %xmm0, %xmm8 3563#ifdef movsd 3564 xorps %xmm1, %xmm1 3565#endif 3566 movsd -32 * SIZE(A1, LDA), %xmm1 3567 mulps %xmm4, %xmm1 3568 addps %xmm1, %xmm9 3569#ifdef movsd 3570 xorps %xmm2, %xmm2 3571#endif 3572 movsd -32 * SIZE(A2), %xmm2 3573 mulps %xmm4, %xmm2 3574 addps %xmm2, %xmm10 3575#ifdef movsd 3576 xorps %xmm3, %xmm3 3577#endif 3578 movsd -32 * SIZE(A2, LDA), %xmm3 3579 mulps %xmm4, %xmm3 3580 addps %xmm3, %xmm11 3581 3582 addq $2 * SIZE, A1 3583 addq $2 * SIZE, A2 3584 addq $2 * SIZE, X1 3585 ALIGN_3 3586 3587.L20XX: 3588 movaps -33 * SIZE(A1, LDA), %xmm12 3589 movaps -34 * SIZE(A2), %xmm13 3590 movaps -35 * SIZE(A2, LDA), %xmm14 3591 3592 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 3593 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 3594 3595#ifdef PREFETCHW 3596 PREFETCHW 4 * SIZE(Y1) 3597#endif 3598 3599 movq MM, I 3600 sarq $4, I 3601 jle .L205 3602 3603 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3604 MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) 3605 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 3606 MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) 3607 3608 decq I 3609 jle .L203 3610 ALIGN_4 3611 3612.L202: 3613#ifdef PREFETCH 3614 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) 3615#endif 3616 3617 mulps %xmm4, %xmm0 3618 addps %xmm0, %xmm8 3619 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 3620 movss %xmm1, %xmm12 3621 shufps $0x39, %xmm12, %xmm12 3622 mulps %xmm4, %xmm12 3623 addps %xmm12, %xmm9 3624 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) 3625 3626 shufps $0x4e, %xmm2, %xmm13 3627 mulps %xmm4, %xmm13 3628 addps %xmm13, %xmm10 3629 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 3630 movss 
%xmm3, %xmm14 3631 shufps $0x93, %xmm3, %xmm14 3632 mulps %xmm4, %xmm14 3633 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3634 addps %xmm14, %xmm11 3635 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) 3636 3637#ifdef PREFETCH 3638 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) 3639#endif 3640 3641 mulps %xmm5, %xmm0 3642 addps %xmm0, %xmm8 3643 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 3644 movss %xmm12, %xmm1 3645 shufps $0x39, %xmm1, %xmm1 3646 mulps %xmm5, %xmm1 3647 addps %xmm1, %xmm9 3648 MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) 3649 3650 shufps $0x4e, %xmm13, %xmm2 3651 mulps %xmm5, %xmm2 3652 addps %xmm2, %xmm10 3653 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 3654 movss %xmm14, %xmm3 3655 shufps $0x93, %xmm14, %xmm3 3656 mulps %xmm5, %xmm3 3657 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 3658 addps %xmm3, %xmm11 3659 MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) 3660 3661#ifdef PREFETCH 3662 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) 3663#endif 3664 3665 mulps %xmm4, %xmm0 3666 addps %xmm0, %xmm8 3667 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 3668 movss %xmm1, %xmm12 3669 shufps $0x39, %xmm12, %xmm12 3670 mulps %xmm4, %xmm12 3671 addps %xmm12, %xmm9 3672 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) 3673 3674 shufps $0x4e, %xmm2, %xmm13 3675 mulps %xmm4, %xmm13 3676 addps %xmm13, %xmm10 3677 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 3678 movss %xmm3, %xmm14 3679 shufps $0x93, %xmm3, %xmm14 3680 mulps %xmm4, %xmm14 3681 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 3682 addps %xmm14, %xmm11 3683 MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) 3684 3685#ifdef PREFETCH 3686 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) 3687#endif 3688 3689 mulps %xmm5, %xmm0 3690 addps %xmm0, %xmm8 3691 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 3692 movss %xmm12, %xmm1 3693 shufps $0x39, %xmm1, %xmm1 3694 mulps %xmm5, %xmm1 3695 addps %xmm1, %xmm9 3696 MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) 3697 3698#ifdef PREFETCHW 3699 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) 3700#endif 3701 3702 shufps $0x4e, %xmm13, %xmm2 3703 mulps %xmm5, 
%xmm2 3704 addps %xmm2, %xmm10 3705 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) 3706 movss %xmm14, %xmm3 3707 shufps $0x93, %xmm14, %xmm3 3708 mulps %xmm5, %xmm3 3709 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 3710 addps %xmm3, %xmm11 3711 MOVUPS_A2 (-15 * SIZE, A2, LDA, 1, %xmm3) 3712 3713 addq $16 * SIZE, A1 3714 addq $16 * SIZE, A2 3715 addq $16 * SIZE, X1 3716 3717 decq I 3718 jg .L202 3719 ALIGN_4 3720 3721.L203: 3722 mulps %xmm4, %xmm0 3723 addps %xmm0, %xmm8 3724 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 3725 movss %xmm1, %xmm12 3726 shufps $0x39, %xmm12, %xmm12 3727 mulps %xmm4, %xmm12 3728 addps %xmm12, %xmm9 3729 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) 3730 3731 shufps $0x4e, %xmm2, %xmm13 3732 mulps %xmm4, %xmm13 3733 addps %xmm13, %xmm10 3734 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 3735 movss %xmm3, %xmm14 3736 shufps $0x93, %xmm3, %xmm14 3737 mulps %xmm4, %xmm14 3738 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3739 addps %xmm14, %xmm11 3740 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) 3741 3742 mulps %xmm5, %xmm0 3743 addps %xmm0, %xmm8 3744 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 3745 movss %xmm12, %xmm1 3746 shufps $0x39, %xmm1, %xmm1 3747 mulps %xmm5, %xmm1 3748 addps %xmm1, %xmm9 3749 MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) 3750 3751 shufps $0x4e, %xmm13, %xmm2 3752 mulps %xmm5, %xmm2 3753 addps %xmm2, %xmm10 3754 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 3755 movss %xmm14, %xmm3 3756 shufps $0x93, %xmm14, %xmm3 3757 mulps %xmm5, %xmm3 3758 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 3759 addps %xmm3, %xmm11 3760 MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) 3761 3762 mulps %xmm4, %xmm0 3763 addps %xmm0, %xmm8 3764 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 3765 movss %xmm1, %xmm12 3766 shufps $0x39, %xmm12, %xmm12 3767 mulps %xmm4, %xmm12 3768 addps %xmm12, %xmm9 3769 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) 3770 3771 shufps $0x4e, %xmm2, %xmm13 3772 mulps %xmm4, %xmm13 3773 addps %xmm13, %xmm10 3774 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 3775 movss %xmm3, %xmm14 3776 shufps $0x93, %xmm3, %xmm14 3777 mulps %xmm4, %xmm14 
3778 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 3779 addps %xmm14, %xmm11 3780 MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) 3781 3782 mulps %xmm5, %xmm0 3783 addps %xmm0, %xmm8 3784 movss %xmm12, %xmm1 3785 shufps $0x39, %xmm1, %xmm1 3786 mulps %xmm5, %xmm1 3787 addps %xmm1, %xmm9 3788 3789 shufps $0x4e, %xmm13, %xmm2 3790 mulps %xmm5, %xmm2 3791 addps %xmm2, %xmm10 3792 movss %xmm14, %xmm3 3793 shufps $0x93, %xmm14, %xmm3 3794 mulps %xmm5, %xmm3 3795 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 3796 addps %xmm3, %xmm11 3797 3798 addq $16 * SIZE, A1 3799 addq $16 * SIZE, A2 3800 addq $16 * SIZE, X1 3801 ALIGN_4 3802 3803.L205: 3804 testq $8, MM 3805 jle .L206 3806 3807 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3808 MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) 3809 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 3810 MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) 3811 3812 mulps %xmm4, %xmm0 3813 addps %xmm0, %xmm8 3814 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 3815 movss %xmm1, %xmm12 3816 shufps $0x39, %xmm12, %xmm12 3817 mulps %xmm4, %xmm12 3818 addps %xmm12, %xmm9 3819 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) 3820 3821 shufps $0x4e, %xmm2, %xmm13 3822 mulps %xmm4, %xmm13 3823 addps %xmm13, %xmm10 3824 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 3825 movss %xmm3, %xmm14 3826 shufps $0x93, %xmm3, %xmm14 3827 mulps %xmm4, %xmm14 3828 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 3829 addps %xmm14, %xmm11 3830 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) 3831 3832 mulps %xmm5, %xmm0 3833 addps %xmm0, %xmm8 3834 movss %xmm12, %xmm1 3835 shufps $0x39, %xmm1, %xmm1 3836 mulps %xmm5, %xmm1 3837 addps %xmm1, %xmm9 3838 3839 shufps $0x4e, %xmm13, %xmm2 3840 mulps %xmm5, %xmm2 3841 addps %xmm2, %xmm10 3842 movss %xmm14, %xmm3 3843 shufps $0x93, %xmm14, %xmm3 3844 mulps %xmm5, %xmm3 3845 addps %xmm3, %xmm11 3846 3847 addq $8 * SIZE, A1 3848 addq $8 * SIZE, A2 3849 addq $8 * SIZE, X1 3850 ALIGN_4 3851 3852.L206: 3853 testq $4, MM 3854 jle .L207 3855 3856 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 3857 MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) 3858 MOVUPS_A1 
(-30 * SIZE, A2, %xmm2) 3859 MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) 3860 3861 mulps %xmm4, %xmm0 3862 addps %xmm0, %xmm8 3863 movss %xmm1, %xmm12 3864 shufps $0x39, %xmm12, %xmm12 3865 mulps %xmm4, %xmm12 3866 addps %xmm12, %xmm9 3867 3868 shufps $0x4e, %xmm2, %xmm13 3869 mulps %xmm4, %xmm13 3870 addps %xmm13, %xmm10 3871 movss %xmm3, %xmm14 3872 shufps $0x93, %xmm3, %xmm14 3873 mulps %xmm4, %xmm14 3874 addps %xmm14, %xmm11 3875 3876 addq $4 * SIZE, A1 3877 addq $4 * SIZE, A2 3878 addq $4 * SIZE, X1 3879 ALIGN_4 3880 3881.L207: 3882 testq $2, MM 3883 jle .L208 3884 3885#ifdef movsd 3886 xorps %xmm0, %xmm0 3887#endif 3888 movsd -32 * SIZE(A1), %xmm0 3889#ifdef movsd 3890 xorps %xmm4, %xmm4 3891#endif 3892 movsd -32 * SIZE(X1), %xmm4 3893 mulps %xmm4, %xmm0 3894 addps %xmm0, %xmm8 3895#ifdef movsd 3896 xorps %xmm1, %xmm1 3897#endif 3898 movsd -32 * SIZE(A1, LDA), %xmm1 3899 mulps %xmm4, %xmm1 3900 addps %xmm1, %xmm9 3901#ifdef movsd 3902 xorps %xmm2, %xmm2 3903#endif 3904 movsd -32 * SIZE(A2), %xmm2 3905 mulps %xmm4, %xmm2 3906 addps %xmm2, %xmm10 3907#ifdef movsd 3908 xorps %xmm3, %xmm3 3909#endif 3910 movsd -32 * SIZE(A2, LDA), %xmm3 3911 mulps %xmm4, %xmm3 3912 addps %xmm3, %xmm11 3913 shufps $0xe, %xmm4, %xmm4 3914 3915 addq $2 * SIZE, A1 3916 addq $2 * SIZE, A2 3917 addq $2 * SIZE, X1 3918 ALIGN_4 3919 3920.L208: 3921 testq $1, MM 3922 jle .L209 3923 3924 movss -32 * SIZE(A1), %xmm0 3925 movss -32 * SIZE(X1), %xmm4 3926 mulss %xmm4, %xmm0 3927 addss %xmm0, %xmm8 3928 movss -32 * SIZE(A1, LDA), %xmm1 3929 mulss %xmm4, %xmm1 3930 addss %xmm1, %xmm9 3931 movss -32 * SIZE(A2), %xmm2 3932 mulss %xmm4, %xmm2 3933 addss %xmm2, %xmm10 3934 movss -32 * SIZE(A2, LDA), %xmm3 3935 mulss %xmm4, %xmm3 3936 addss %xmm3, %xmm11 3937 ALIGN_4 3938 3939.L209: 3940#ifdef HAVE_SSE3 3941 haddps %xmm9, %xmm8 3942 haddps %xmm11, %xmm10 3943 haddps %xmm10, %xmm8 3944 3945 pshufd $0x1, %xmm8, %xmm9 3946 pshufd $0x2, %xmm8, %xmm10 3947 pshufd $0x3, %xmm8, %xmm11 3948#else 3949 movaps 
%xmm8, %xmm0 3950 unpcklps %xmm9, %xmm8 3951 unpckhps %xmm9, %xmm0 3952 3953 movaps %xmm10, %xmm1 3954 unpcklps %xmm11, %xmm10 3955 unpckhps %xmm11, %xmm1 3956 3957 movaps %xmm8, %xmm9 3958 unpcklps %xmm10, %xmm8 3959 unpckhps %xmm10, %xmm9 3960 3961 movaps %xmm0, %xmm10 3962 unpcklps %xmm1, %xmm0 3963 unpckhps %xmm1, %xmm10 3964 3965 addps %xmm9, %xmm8 3966 addps %xmm0, %xmm10 3967 addps %xmm10, %xmm8 3968 3969 pshufd $0x2, %xmm8, %xmm9 3970 pshufd $0x1, %xmm8, %xmm10 3971 pshufd $0x3, %xmm8, %xmm11 3972#endif 3973 3974 mulss ALPHA, %xmm8 3975 mulss ALPHA, %xmm9 3976 mulss ALPHA, %xmm10 3977 mulss ALPHA, %xmm11 3978 3979 addss (Y), %xmm8 3980 addq INCY, Y 3981 addss (Y), %xmm9 3982 addq INCY, Y 3983 addss (Y), %xmm10 3984 addq INCY, Y 3985 addss (Y), %xmm11 3986 addq INCY, Y 3987 3988 movss %xmm8, (Y1) 3989 addq INCY, Y1 3990 movss %xmm9, (Y1) 3991 addq INCY, Y1 3992 movss %xmm10, (Y1) 3993 addq INCY, Y1 3994 movss %xmm11, (Y1) 3995 addq INCY, Y1 3996 3997 cmpq $4, N 3998 jge .L201 3999 ALIGN_4 4000 4001.L210: 4002 cmpq $3, N 4003 jne .L220 4004 4005 leaq 32 * SIZE(BUFFER), X1 4006 4007 movq A, A1 4008 leaq (A1, LDA, 2), A2 4009 leaq (A1, LDA, 4), A 4010 4011 xorps %xmm8, %xmm8 4012 xorps %xmm9, %xmm9 4013 xorps %xmm10, %xmm10 4014 4015 cmpq $3, M 4016 jle .L217 4017 4018 testq $SIZE, A1 4019 je .L21X 4020 4021 movss -32 * SIZE(A1), %xmm0 4022 movss -32 * SIZE(X1), %xmm4 4023 mulss %xmm4, %xmm0 4024 addss %xmm0, %xmm8 4025 movss -32 * SIZE(A1, LDA), %xmm1 4026 mulss %xmm4, %xmm1 4027 addss %xmm1, %xmm9 4028 movss -32 * SIZE(A2), %xmm2 4029 mulss %xmm4, %xmm2 4030 addss %xmm2, %xmm10 4031 4032 addq $1 * SIZE, A1 4033 addq $1 * SIZE, A2 4034 addq $1 * SIZE, X1 4035 ALIGN_3 4036 4037.L21X: 4038 testq $2 * SIZE, A1 4039 je .L21XX 4040 4041#ifdef movsd 4042 xorps %xmm0, %xmm0 4043 xorps %xmm4, %xmm4 4044#endif 4045 movsd -32 * SIZE(A1), %xmm0 4046 movsd -32 * SIZE(X1), %xmm4 4047 mulps %xmm4, %xmm0 4048 addps %xmm0, %xmm8 4049#ifdef movsd 4050 xorps %xmm1, %xmm1 
4051#endif 4052 movsd -32 * SIZE(A1, LDA), %xmm1 4053 mulps %xmm4, %xmm1 4054 addps %xmm1, %xmm9 4055#ifdef movsd 4056 xorps %xmm2, %xmm2 4057#endif 4058 movsd -32 * SIZE(A2), %xmm2 4059 mulps %xmm4, %xmm2 4060 addps %xmm2, %xmm10 4061 4062 addq $2 * SIZE, A1 4063 addq $2 * SIZE, A2 4064 addq $2 * SIZE, X1 4065 ALIGN_3 4066 4067.L21XX: 4068 movaps -33 * SIZE(A1, LDA), %xmm12 4069 movaps -34 * SIZE(A2), %xmm13 4070 4071 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 4072 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 4073 4074#ifdef PREFETCHW 4075 PREFETCHW 4 * SIZE(Y1) 4076#endif 4077 4078 movq MM, I 4079 sarq $4, I 4080 jle .L215 4081 4082 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4083 MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) 4084 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 4085 4086 decq I 4087 jle .L213 4088 ALIGN_4 4089 4090.L212: 4091#ifdef PREFETCH 4092 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) 4093#endif 4094 4095 mulps %xmm4, %xmm0 4096 addps %xmm0, %xmm8 4097 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 4098 movss %xmm1, %xmm12 4099 shufps $0x39, %xmm12, %xmm12 4100 mulps %xmm4, %xmm12 4101 addps %xmm12, %xmm9 4102 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) 4103 4104 shufps $0x4e, %xmm2, %xmm13 4105 mulps %xmm4, %xmm13 4106 addps %xmm13, %xmm10 4107 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 4108 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4109 4110#ifdef PREFETCH 4111 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) 4112#endif 4113 4114 mulps %xmm5, %xmm0 4115 addps %xmm0, %xmm8 4116 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 4117 movss %xmm12, %xmm1 4118 shufps $0x39, %xmm1, %xmm1 4119 mulps %xmm5, %xmm1 4120 addps %xmm1, %xmm9 4121 MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) 4122 4123 shufps $0x4e, %xmm13, %xmm2 4124 mulps %xmm5, %xmm2 4125 addps %xmm2, %xmm10 4126 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 4127 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 4128 4129#ifdef PREFETCH 4130 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) 4131#endif 4132 4133 mulps %xmm4, %xmm0 4134 addps %xmm0, %xmm8 4135 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 
4136 movss %xmm1, %xmm12 4137 shufps $0x39, %xmm12, %xmm12 4138 mulps %xmm4, %xmm12 4139 addps %xmm12, %xmm9 4140 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) 4141 shufps $0x4e, %xmm2, %xmm13 4142 mulps %xmm4, %xmm13 4143 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 4144 addps %xmm13, %xmm10 4145 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 4146 4147#ifdef PREFETCHW 4148 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) 4149#endif 4150 4151 mulps %xmm5, %xmm0 4152 addps %xmm0, %xmm8 4153 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 4154 movss %xmm12, %xmm1 4155 shufps $0x39, %xmm1, %xmm1 4156 mulps %xmm5, %xmm1 4157 addps %xmm1, %xmm9 4158 MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) 4159 shufps $0x4e, %xmm13, %xmm2 4160 mulps %xmm5, %xmm2 4161 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 4162 addps %xmm2, %xmm10 4163 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) 4164 4165 addq $16 * SIZE, A1 4166 addq $16 * SIZE, A2 4167 addq $16 * SIZE, X1 4168 4169 decq I 4170 jg .L212 4171 ALIGN_4 4172 4173.L213: 4174 mulps %xmm4, %xmm0 4175 addps %xmm0, %xmm8 4176 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 4177 movss %xmm1, %xmm12 4178 shufps $0x39, %xmm12, %xmm12 4179 mulps %xmm4, %xmm12 4180 addps %xmm12, %xmm9 4181 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) 4182 4183 shufps $0x4e, %xmm2, %xmm13 4184 mulps %xmm4, %xmm13 4185 addps %xmm13, %xmm10 4186 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 4187 movss %xmm3, %xmm14 4188 shufps $0x93, %xmm3, %xmm14 4189 mulps %xmm4, %xmm14 4190 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4191 addps %xmm14, %xmm11 4192 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) 4193 4194 mulps %xmm5, %xmm0 4195 addps %xmm0, %xmm8 4196 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 4197 movss %xmm12, %xmm1 4198 shufps $0x39, %xmm1, %xmm1 4199 mulps %xmm5, %xmm1 4200 addps %xmm1, %xmm9 4201 MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) 4202 4203 shufps $0x4e, %xmm13, %xmm2 4204 mulps %xmm5, %xmm2 4205 addps %xmm2, %xmm10 4206 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 4207 movss %xmm14, %xmm3 4208 shufps $0x93, %xmm14, %xmm3 4209 mulps %xmm5, %xmm3 4210 MOVUPS_XL1 
(-20 * SIZE, X1, %xmm5) 4211 addps %xmm3, %xmm11 4212 MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) 4213 4214 mulps %xmm4, %xmm0 4215 addps %xmm0, %xmm8 4216 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 4217 movss %xmm1, %xmm12 4218 shufps $0x39, %xmm12, %xmm12 4219 mulps %xmm4, %xmm12 4220 addps %xmm12, %xmm9 4221 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) 4222 4223 shufps $0x4e, %xmm2, %xmm13 4224 mulps %xmm4, %xmm13 4225 addps %xmm13, %xmm10 4226 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 4227 movss %xmm3, %xmm14 4228 shufps $0x93, %xmm3, %xmm14 4229 mulps %xmm4, %xmm14 4230 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 4231 addps %xmm14, %xmm11 4232 MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) 4233 4234 mulps %xmm5, %xmm0 4235 addps %xmm0, %xmm8 4236 movss %xmm12, %xmm1 4237 shufps $0x39, %xmm1, %xmm1 4238 mulps %xmm5, %xmm1 4239 addps %xmm1, %xmm9 4240 4241 shufps $0x4e, %xmm13, %xmm2 4242 mulps %xmm5, %xmm2 4243 addps %xmm2, %xmm10 4244 movss %xmm14, %xmm3 4245 shufps $0x93, %xmm14, %xmm3 4246 mulps %xmm5, %xmm3 4247 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 4248 addps %xmm3, %xmm11 4249 4250 addq $16 * SIZE, A1 4251 addq $16 * SIZE, A2 4252 addq $16 * SIZE, X1 4253 ALIGN_4 4254 4255.L215: 4256 testq $8, MM 4257 jle .L216 4258 4259 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4260 MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) 4261 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 4262 4263 mulps %xmm4, %xmm0 4264 addps %xmm0, %xmm8 4265 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 4266 movss %xmm1, %xmm12 4267 shufps $0x39, %xmm12, %xmm12 4268 mulps %xmm4, %xmm12 4269 addps %xmm12, %xmm9 4270 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) 4271 4272 shufps $0x4e, %xmm2, %xmm13 4273 mulps %xmm4, %xmm13 4274 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4275 addps %xmm13, %xmm10 4276 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 4277 4278 mulps %xmm5, %xmm0 4279 addps %xmm0, %xmm8 4280 movss %xmm12, %xmm1 4281 shufps $0x39, %xmm1, %xmm1 4282 mulps %xmm5, %xmm1 4283 addps %xmm1, %xmm9 4284 4285 shufps $0x4e, %xmm13, %xmm2 4286 mulps %xmm5, %xmm2 4287 addps %xmm2, %xmm10 
4288 4289 addq $8 * SIZE, A1 4290 addq $8 * SIZE, A2 4291 addq $8 * SIZE, X1 4292 ALIGN_4 4293 4294.L216: 4295 testq $4, MM 4296 jle .L217 4297 4298 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4299 MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) 4300 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 4301 4302 mulps %xmm4, %xmm0 4303 addps %xmm0, %xmm8 4304 movss %xmm1, %xmm12 4305 shufps $0x39, %xmm12, %xmm12 4306 mulps %xmm4, %xmm12 4307 addps %xmm12, %xmm9 4308 4309 shufps $0x4e, %xmm2, %xmm13 4310 mulps %xmm4, %xmm13 4311 addps %xmm13, %xmm10 4312 4313 addq $4 * SIZE, A1 4314 addq $4 * SIZE, A2 4315 addq $4 * SIZE, X1 4316 ALIGN_4 4317 4318.L217: 4319 testq $2, MM 4320 jle .L218 4321 4322#ifdef movsd 4323 xorps %xmm0, %xmm0 4324#endif 4325 movsd -32 * SIZE(A1), %xmm0 4326#ifdef movsd 4327 xorps %xmm4, %xmm4 4328#endif 4329 movsd -32 * SIZE(X1), %xmm4 4330 mulps %xmm4, %xmm0 4331 addps %xmm0, %xmm8 4332#ifdef movsd 4333 xorps %xmm1, %xmm1 4334#endif 4335 movsd -32 * SIZE(A1, LDA), %xmm1 4336 mulps %xmm4, %xmm1 4337 addps %xmm1, %xmm9 4338#ifdef movsd 4339 xorps %xmm2, %xmm2 4340#endif 4341 movsd -32 * SIZE(A2), %xmm2 4342 mulps %xmm4, %xmm2 4343 addps %xmm2, %xmm10 4344 4345 addq $2 * SIZE, A1 4346 addq $2 * SIZE, A2 4347 addq $2 * SIZE, X1 4348 ALIGN_4 4349 4350.L218: 4351 testq $1, MM 4352 jle .L219 4353 4354 movss -32 * SIZE(A1), %xmm0 4355 movss -32 * SIZE(X1), %xmm4 4356 mulss %xmm4, %xmm0 4357 addss %xmm0, %xmm8 4358 movss -32 * SIZE(A1, LDA), %xmm1 4359 mulss %xmm4, %xmm1 4360 addss %xmm1, %xmm9 4361 movss -32 * SIZE(A2), %xmm2 4362 mulss %xmm4, %xmm2 4363 addss %xmm2, %xmm10 4364 ALIGN_4 4365 4366.L219: 4367#ifdef HAVE_SSE3 4368 haddps %xmm9, %xmm8 4369 haddps %xmm11, %xmm10 4370 haddps %xmm10, %xmm8 4371 4372 pshufd $0x1, %xmm8, %xmm9 4373 pshufd $0x2, %xmm8, %xmm10 4374#else 4375 movaps %xmm8, %xmm0 4376 unpcklps %xmm9, %xmm8 4377 unpckhps %xmm9, %xmm0 4378 4379 movaps %xmm10, %xmm1 4380 unpcklps %xmm11, %xmm10 4381 unpckhps %xmm11, %xmm1 4382 4383 movaps %xmm8, %xmm9 4384 unpcklps 
%xmm10, %xmm8 4385 unpckhps %xmm10, %xmm9 4386 4387 movaps %xmm0, %xmm10 4388 unpcklps %xmm1, %xmm0 4389 unpckhps %xmm1, %xmm10 4390 4391 addps %xmm9, %xmm8 4392 addps %xmm0, %xmm10 4393 addps %xmm10, %xmm8 4394 4395 pshufd $0x2, %xmm8, %xmm9 4396 pshufd $0x1, %xmm8, %xmm10 4397#endif 4398 4399 mulss ALPHA, %xmm8 4400 mulss ALPHA, %xmm9 4401 mulss ALPHA, %xmm10 4402 4403 addss (Y), %xmm8 4404 addq INCY, Y 4405 addss (Y), %xmm9 4406 addq INCY, Y 4407 addss (Y), %xmm10 4408 4409 movss %xmm8, (Y1) 4410 addq INCY, Y1 4411 movss %xmm9, (Y1) 4412 addq INCY, Y1 4413 movss %xmm10, (Y1) 4414 jmp .L999 4415 ALIGN_4 4416 4417.L220: 4418 testq N, N 4419 jle .L999 4420 4421 cmpq $2, N 4422 jne .L230 4423 4424 leaq 32 * SIZE(BUFFER), X1 4425 4426 movq A, A1 4427 leaq (A1, LDA), A2 4428 leaq (A1, LDA, 2), A 4429 4430 xorps %xmm8, %xmm8 4431 xorps %xmm9, %xmm9 4432 4433 cmpq $3, M 4434 jle .L227 4435 4436 testq $SIZE, A1 4437 je .L22X 4438 4439 movss -32 * SIZE(A1), %xmm0 4440 movss -32 * SIZE(X1), %xmm4 4441 mulss %xmm4, %xmm0 4442 addss %xmm0, %xmm8 4443 movss -32 * SIZE(A2), %xmm1 4444 mulss %xmm4, %xmm1 4445 addss %xmm1, %xmm9 4446 4447 addq $1 * SIZE, A1 4448 addq $1 * SIZE, A2 4449 addq $1 * SIZE, X1 4450 ALIGN_3 4451 4452.L22X: 4453 testq $2 * SIZE, A1 4454 je .L22XX 4455 4456#ifdef movsd 4457 xorps %xmm0, %xmm0 4458 xorps %xmm4, %xmm4 4459#endif 4460 movsd -32 * SIZE(A1), %xmm0 4461 movsd -32 * SIZE(X1), %xmm4 4462 mulps %xmm4, %xmm0 4463 addps %xmm0, %xmm8 4464#ifdef movsd 4465 xorps %xmm1, %xmm1 4466#endif 4467 movsd -32 * SIZE(A2), %xmm1 4468 mulps %xmm4, %xmm1 4469 addps %xmm1, %xmm9 4470 4471 addq $2 * SIZE, A1 4472 addq $2 * SIZE, A2 4473 addq $2 * SIZE, X1 4474 ALIGN_3 4475 4476.L22XX: 4477 movaps -33 * SIZE(A2), %xmm12 4478 4479 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 4480 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 4481 4482 movq MM, I 4483 sarq $4, I 4484 jle .L225 4485 4486 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4487 MOVUPS_A1 (-29 * SIZE, A2, %xmm1) 4488 4489 decq I 4490 jle 
.L223 4491 ALIGN_4 4492 4493.L222: 4494#ifdef PREFETCH 4495 PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) 4496#endif 4497 4498 mulps %xmm4, %xmm0 4499 addps %xmm0, %xmm8 4500 MOVUPS_A1 (-28 * SIZE, A1, %xmm2) 4501 movss %xmm1, %xmm12 4502 shufps $0x39, %xmm12, %xmm12 4503 mulps %xmm4, %xmm12 4504 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4505 addps %xmm12, %xmm9 4506 MOVUPS_A1 (-25 * SIZE, A2, %xmm12) 4507 4508 mulps %xmm5, %xmm2 4509 addps %xmm2, %xmm8 4510 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 4511 movss %xmm12, %xmm1 4512 shufps $0x39, %xmm1, %xmm1 4513 mulps %xmm5, %xmm1 4514 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 4515 addps %xmm1, %xmm9 4516 MOVUPS_A1 (-21 * SIZE, A2, %xmm1) 4517 4518#ifdef PREFETCH 4519 PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) 4520#endif 4521 4522 mulps %xmm4, %xmm0 4523 addps %xmm0, %xmm8 4524 MOVUPS_A1 (-20 * SIZE, A1, %xmm2) 4525 movss %xmm1, %xmm12 4526 shufps $0x39, %xmm12, %xmm12 4527 mulps %xmm4, %xmm12 4528 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 4529 addps %xmm12, %xmm9 4530 MOVUPS_A1 (-17 * SIZE, A2, %xmm12) 4531 4532#ifdef PREFETCHW 4533 PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) 4534#endif 4535 4536 mulps %xmm5, %xmm2 4537 addps %xmm2, %xmm8 4538 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 4539 movss %xmm12, %xmm1 4540 shufps $0x39, %xmm1, %xmm1 4541 mulps %xmm5, %xmm1 4542 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 4543 addps %xmm1, %xmm9 4544 MOVUPS_A1 (-13 * SIZE, A2, %xmm1) 4545 4546 addq $16 * SIZE, A1 4547 addq $16 * SIZE, A2 4548 addq $16 * SIZE, X1 4549 4550 decq I 4551 jg .L222 4552 ALIGN_4 4553 4554.L223: 4555 mulps %xmm4, %xmm0 4556 addps %xmm0, %xmm8 4557 MOVUPS_A1 (-28 * SIZE, A1, %xmm2) 4558 movss %xmm1, %xmm12 4559 shufps $0x39, %xmm12, %xmm12 4560 mulps %xmm4, %xmm12 4561 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4562 addps %xmm12, %xmm9 4563 MOVUPS_A1 (-25 * SIZE, A2, %xmm12) 4564 4565 mulps %xmm5, %xmm2 4566 addps %xmm2, %xmm8 4567 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 4568 movss %xmm12, %xmm1 4569 shufps $0x39, %xmm1, %xmm1 4570 mulps 
%xmm5, %xmm1 4571 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 4572 addps %xmm1, %xmm9 4573 MOVUPS_A1 (-21 * SIZE, A2, %xmm1) 4574 4575 mulps %xmm4, %xmm0 4576 addps %xmm0, %xmm8 4577 MOVUPS_A1 (-20 * SIZE, A1, %xmm2) 4578 movss %xmm1, %xmm12 4579 shufps $0x39, %xmm12, %xmm12 4580 mulps %xmm4, %xmm12 4581 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 4582 addps %xmm12, %xmm9 4583 MOVUPS_A1 (-17 * SIZE, A2, %xmm12) 4584 4585 mulps %xmm5, %xmm2 4586 addps %xmm2, %xmm8 4587 movss %xmm12, %xmm1 4588 shufps $0x39, %xmm1, %xmm1 4589 mulps %xmm5, %xmm1 4590 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 4591 addps %xmm1, %xmm9 4592 4593 addq $16 * SIZE, A1 4594 addq $16 * SIZE, A2 4595 addq $16 * SIZE, X1 4596 ALIGN_4 4597 4598.L225: 4599 testq $8, MM 4600 jle .L226 4601 4602 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4603 MOVUPS_A1 (-29 * SIZE, A2, %xmm1) 4604 4605 mulps %xmm4, %xmm0 4606 addps %xmm0, %xmm8 4607 MOVUPS_A1 (-28 * SIZE, A1, %xmm2) 4608 movss %xmm1, %xmm12 4609 shufps $0x39, %xmm12, %xmm12 4610 mulps %xmm4, %xmm12 4611 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4612 addps %xmm12, %xmm9 4613 MOVUPS_A1 (-25 * SIZE, A2, %xmm12) 4614 4615 mulps %xmm5, %xmm2 4616 addps %xmm2, %xmm8 4617 movss %xmm12, %xmm1 4618 shufps $0x39, %xmm1, %xmm1 4619 mulps %xmm5, %xmm1 4620 addps %xmm1, %xmm9 4621 4622 addq $8 * SIZE, A1 4623 addq $8 * SIZE, A2 4624 addq $8 * SIZE, X1 4625 ALIGN_4 4626 4627.L226: 4628 testq $4, MM 4629 jle .L227 4630 4631 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4632 MOVUPS_A1 (-29 * SIZE, A2, %xmm1) 4633 4634 mulps %xmm4, %xmm0 4635 addps %xmm0, %xmm8 4636 4637 movss %xmm1, %xmm12 4638 shufps $0x39, %xmm12, %xmm12 4639 mulps %xmm4, %xmm12 4640 addps %xmm12, %xmm9 4641 4642 addq $4 * SIZE, A1 4643 addq $4 * SIZE, A2 4644 addq $4 * SIZE, X1 4645 ALIGN_4 4646 4647.L227: 4648 testq $2, MM 4649 jle .L228 4650 4651#ifdef movsd 4652 xorps %xmm0, %xmm0 4653#endif 4654 movsd -32 * SIZE(A1), %xmm0 4655#ifdef movsd 4656 xorps %xmm4, %xmm4 4657#endif 4658 movsd -32 * SIZE(X1), %xmm4 4659 mulps %xmm4, %xmm0 4660 addps 
%xmm0, %xmm8 4661#ifdef movsd 4662 xorps %xmm1, %xmm1 4663#endif 4664 movsd -32 * SIZE(A2), %xmm1 4665 mulps %xmm4, %xmm1 4666 addps %xmm1, %xmm9 4667 shufps $0xe, %xmm4, %xmm4 4668 4669 addq $2 * SIZE, A1 4670 addq $2 * SIZE, A2 4671 addq $2 * SIZE, X1 4672 ALIGN_4 4673 4674.L228: 4675 testq $1, MM 4676 jle .L229 4677 4678 movss -32 * SIZE(A1), %xmm0 4679 movss -32 * SIZE(X1), %xmm4 4680 mulss %xmm4, %xmm0 4681 addss %xmm0, %xmm8 4682 movss -32 * SIZE(A2), %xmm1 4683 mulss %xmm4, %xmm1 4684 addss %xmm1, %xmm9 4685 ALIGN_4 4686 4687.L229: 4688#ifdef HAVE_SSE3 4689 haddps %xmm9, %xmm8 4690 haddps %xmm8, %xmm8 4691#else 4692 movaps %xmm8, %xmm10 4693 unpcklps %xmm9, %xmm8 4694 unpckhps %xmm9, %xmm10 4695 4696 addps %xmm10, %xmm8 4697 movhlps %xmm8, %xmm9 4698 addps %xmm9, %xmm8 4699#endif 4700 4701 pshufd $0x1, %xmm8, %xmm9 4702 4703 mulss ALPHA, %xmm8 4704 mulss ALPHA, %xmm9 4705 4706 addss (Y), %xmm8 4707 addq INCY, Y 4708 addss (Y), %xmm9 4709 addq INCY, Y 4710 4711 movss %xmm8, (Y1) 4712 addq INCY, Y1 4713 movss %xmm9, (Y1) 4714 addq INCY, Y1 4715 jmp .L999 4716 ALIGN_4 4717 4718.L230: 4719 cmpq $1, N 4720 jne .L999 4721 4722 leaq 32 * SIZE(BUFFER), X1 4723 4724 movq A, A1 4725 4726 xorps %xmm8, %xmm8 4727 xorps %xmm9, %xmm9 4728 4729 cmpq $3, M 4730 jle .L237 4731 4732 testq $SIZE, A1 4733 je .L23X 4734 4735 movss -32 * SIZE(A1), %xmm0 4736 movss -32 * SIZE(X1), %xmm4 4737 mulss %xmm4, %xmm0 4738 addss %xmm0, %xmm8 4739 4740 addq $1 * SIZE, A1 4741 addq $1 * SIZE, X1 4742 ALIGN_3 4743 4744.L23X: 4745 testq $2 * SIZE, A1 4746 je .L23XX 4747 4748#ifdef movsd 4749 xorps %xmm0, %xmm0 4750 xorps %xmm4, %xmm4 4751#endif 4752 movsd -32 * SIZE(A1), %xmm0 4753 movsd -32 * SIZE(X1), %xmm4 4754 mulps %xmm4, %xmm0 4755 addps %xmm0, %xmm8 4756 shufps $0xe, %xmm4, %xmm4 4757 4758 addq $2 * SIZE, A1 4759 addq $2 * SIZE, X1 4760 ALIGN_3 4761 4762.L23XX: 4763 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 4764 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 4765 4766 4767 movq MM, I 4768 sarq $4, I 4769 
jle .L235 4770 4771 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4772 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) 4773 4774 decq I 4775 jle .L233 4776 ALIGN_4 4777 4778.L232: 4779#ifdef PREFETCH 4780 PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) 4781#endif 4782 4783 mulps %xmm4, %xmm0 4784 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4785 addps %xmm0, %xmm8 4786 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 4787 4788 mulps %xmm5, %xmm12 4789 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 4790 addps %xmm12, %xmm9 4791 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) 4792 4793#ifdef PREFETCHW 4794 PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) 4795#endif 4796 4797 mulps %xmm4, %xmm0 4798 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 4799 addps %xmm0, %xmm8 4800 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 4801 4802 mulps %xmm5, %xmm12 4803 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 4804 addps %xmm12, %xmm9 4805 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) 4806 4807 addq $16 * SIZE, A1 4808 addq $16 * SIZE, X1 4809 4810 decq I 4811 jg .L232 4812 ALIGN_4 4813 4814.L233: 4815 mulps %xmm4, %xmm0 4816 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4817 addps %xmm0, %xmm8 4818 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 4819 4820 mulps %xmm5, %xmm12 4821 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 4822 addps %xmm12, %xmm9 4823 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) 4824 4825 mulps %xmm4, %xmm0 4826 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 4827 addps %xmm0, %xmm8 4828 4829 mulps %xmm5, %xmm12 4830 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 4831 addps %xmm12, %xmm9 4832 4833 addq $16 * SIZE, A1 4834 addq $16 * SIZE, X1 4835 ALIGN_4 4836 4837.L235: 4838 testq $8, MM 4839 jle .L236 4840 4841 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4842 mulps %xmm4, %xmm0 4843 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 4844 addps %xmm0, %xmm8 4845 4846 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) 4847 mulps %xmm5, %xmm12 4848 addps %xmm12, %xmm9 4849 4850 addq $8 * SIZE, A1 4851 addq $8 * SIZE, X1 4852 ALIGN_4 4853 4854.L236: 4855 testq $4, MM 4856 jle .L237 4857 4858 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 4859 mulps %xmm4, %xmm0 4860 addps %xmm0, %xmm8 4861 4862 addq 
$4 * SIZE, A1 4863 addq $4 * SIZE, X1 4864 ALIGN_4 4865 4866.L237: 4867 testq $2, MM 4868 jle .L238 4869 4870#ifdef movsd 4871 xorps %xmm0, %xmm0 4872#endif 4873 movsd -32 * SIZE(A1), %xmm0 4874#ifdef movsd 4875 xorps %xmm4, %xmm4 4876#endif 4877 movsd -32 * SIZE(X1), %xmm4 4878 mulps %xmm4, %xmm0 4879 addps %xmm0, %xmm8 4880 shufps $0xe, %xmm4, %xmm4 4881 4882 addq $2 * SIZE, A1 4883 addq $2 * SIZE, X1 4884 ALIGN_4 4885 4886.L238: 4887 testq $1, MM 4888 jle .L239 4889 4890 movss -32 * SIZE(A1), %xmm0 4891 movss -32 * SIZE(X1), %xmm4 4892 mulss %xmm4, %xmm0 4893 addss %xmm0, %xmm8 4894 ALIGN_4 4895 4896.L239: 4897 addps %xmm9, %xmm8 4898 4899#ifdef HAVE_SSE3 4900 haddps %xmm8, %xmm8 4901 haddps %xmm8, %xmm8 4902#else 4903 pshufd $1, %xmm8, %xmm9 4904 pshufd $2, %xmm8, %xmm10 4905 pshufd $3, %xmm8, %xmm11 4906 4907 addss %xmm9, %xmm8 4908 addss %xmm11, %xmm10 4909 addss %xmm10, %xmm8 4910#endif 4911 4912 mulss ALPHA, %xmm8 4913 4914 addss (Y), %xmm8 4915 movss %xmm8, (Y1) 4916 jmp .L999 4917 ALIGN_4 4918 4919.L300: 4920 cmpq $4, N 4921 jl .L310 4922 ALIGN_3 4923 4924.L301: 4925 subq $4, N 4926 4927 leaq 32 * SIZE(BUFFER), X1 4928 4929 movq A, A1 4930 leaq (A1, LDA, 2), A2 4931 leaq (A1, LDA, 4), A 4932 4933 xorps %xmm8, %xmm8 4934 xorps %xmm9, %xmm9 4935 xorps %xmm10, %xmm10 4936 xorps %xmm11, %xmm11 4937 4938 cmpq $3, M 4939 jle .L307 4940 4941 testq $SIZE, A1 4942 je .L30X 4943 4944 movss -32 * SIZE(A1), %xmm0 4945 movss -32 * SIZE(X1), %xmm4 4946 mulss %xmm4, %xmm0 4947 addss %xmm0, %xmm8 4948 movss -32 * SIZE(A1, LDA), %xmm1 4949 mulss %xmm4, %xmm1 4950 addss %xmm1, %xmm9 4951 movss -32 * SIZE(A2), %xmm2 4952 mulss %xmm4, %xmm2 4953 addss %xmm2, %xmm10 4954 movss -32 * SIZE(A2, LDA), %xmm3 4955 mulss %xmm4, %xmm3 4956 addss %xmm3, %xmm11 4957 4958 addq $1 * SIZE, A1 4959 addq $1 * SIZE, A2 4960 addq $1 * SIZE, X1 4961 ALIGN_3 4962 4963.L30X: 4964 testq $2 * SIZE, A1 4965 je .L30XX 4966 4967#ifdef movsd 4968 xorps %xmm0, %xmm0 4969 xorps %xmm4, %xmm4 4970#endif 
4971 movsd -32 * SIZE(A1), %xmm0 4972 movsd -32 * SIZE(X1), %xmm4 4973 mulps %xmm4, %xmm0 4974 addps %xmm0, %xmm8 4975#ifdef movsd 4976 xorps %xmm1, %xmm1 4977#endif 4978 movsd -32 * SIZE(A1, LDA), %xmm1 4979 mulps %xmm4, %xmm1 4980 addps %xmm1, %xmm9 4981#ifdef movsd 4982 xorps %xmm2, %xmm2 4983#endif 4984 movsd -32 * SIZE(A2), %xmm2 4985 mulps %xmm4, %xmm2 4986 addps %xmm2, %xmm10 4987#ifdef movsd 4988 xorps %xmm3, %xmm3 4989#endif 4990 movsd -32 * SIZE(A2, LDA), %xmm3 4991 mulps %xmm4, %xmm3 4992 addps %xmm3, %xmm11 4993 4994 addq $2 * SIZE, A1 4995 addq $2 * SIZE, A2 4996 addq $2 * SIZE, X1 4997 ALIGN_3 4998 4999.L30XX: 5000 movaps -35 * SIZE(A1, LDA), %xmm12 5001 movaps -34 * SIZE(A2), %xmm13 5002 movaps -33 * SIZE(A2, LDA), %xmm14 5003 5004 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 5005 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 5006 5007#ifdef PREFETCHW 5008 PREFETCHW 4 * SIZE(Y1) 5009#endif 5010 5011 movq MM, I 5012 sarq $4, I 5013 jle .L305 5014 5015 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 5016 MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) 5017 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 5018 MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) 5019 5020 decq I 5021 jle .L303 5022 ALIGN_4 5023 5024.L302: 5025#ifdef PREFETCH 5026 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) 5027#endif 5028 5029 mulps %xmm4, %xmm0 5030 addps %xmm0, %xmm8 5031 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 5032 movss %xmm1, %xmm12 5033 shufps $0x93, %xmm1, %xmm12 5034 mulps %xmm4, %xmm12 5035 addps %xmm12, %xmm9 5036 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) 5037 5038 shufps $0x4e, %xmm2, %xmm13 5039 mulps %xmm4, %xmm13 5040 addps %xmm13, %xmm10 5041 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 5042 movss %xmm3, %xmm14 5043 shufps $0x39, %xmm14, %xmm14 5044 mulps %xmm4, %xmm14 5045 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 5046 addps %xmm14, %xmm11 5047 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) 5048 5049#ifdef PREFETCH 5050 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) 5051#endif 5052 5053 mulps %xmm5, %xmm0 5054 addps %xmm0, %xmm8 
5055 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 5056 movss %xmm12, %xmm1 5057 shufps $0x93, %xmm12, %xmm1 5058 mulps %xmm5, %xmm1 5059 addps %xmm1, %xmm9 5060 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) 5061 5062 shufps $0x4e, %xmm13, %xmm2 5063 mulps %xmm5, %xmm2 5064 addps %xmm2, %xmm10 5065 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 5066 movss %xmm14, %xmm3 5067 shufps $0x39, %xmm3, %xmm3 5068 mulps %xmm5, %xmm3 5069 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 5070 addps %xmm3, %xmm11 5071 MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) 5072 5073#ifdef PREFETCH 5074 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) 5075#endif 5076 5077 mulps %xmm4, %xmm0 5078 addps %xmm0, %xmm8 5079 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 5080 movss %xmm1, %xmm12 5081 shufps $0x93, %xmm1, %xmm12 5082 mulps %xmm4, %xmm12 5083 addps %xmm12, %xmm9 5084 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) 5085 5086 shufps $0x4e, %xmm2, %xmm13 5087 mulps %xmm4, %xmm13 5088 addps %xmm13, %xmm10 5089 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 5090 movss %xmm3, %xmm14 5091 shufps $0x39, %xmm14, %xmm14 5092 mulps %xmm4, %xmm14 5093 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 5094 addps %xmm14, %xmm11 5095 MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) 5096 5097#ifdef PREFETCH 5098 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) 5099#endif 5100 5101 mulps %xmm5, %xmm0 5102 addps %xmm0, %xmm8 5103 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 5104 movss %xmm12, %xmm1 5105 shufps $0x93, %xmm12, %xmm1 5106 mulps %xmm5, %xmm1 5107 addps %xmm1, %xmm9 5108 MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) 5109 5110#ifdef PREFETCHW 5111 PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) 5112#endif 5113 5114 shufps $0x4e, %xmm13, %xmm2 5115 mulps %xmm5, %xmm2 5116 addps %xmm2, %xmm10 5117 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) 5118 movss %xmm14, %xmm3 5119 shufps $0x39, %xmm3, %xmm3 5120 mulps %xmm5, %xmm3 5121 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 5122 addps %xmm3, %xmm11 5123 MOVUPS_A2 (-13 * SIZE, A2, LDA, 1, %xmm3) 5124 5125 addq $16 * SIZE, A1 5126 addq $16 * SIZE, A2 5127 addq $16 * 
SIZE, X1 5128 5129 decq I 5130 jg .L302 5131 ALIGN_4 5132 5133.L303: 5134 mulps %xmm4, %xmm0 5135 addps %xmm0, %xmm8 5136 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 5137 movss %xmm1, %xmm12 5138 shufps $0x93, %xmm1, %xmm12 5139 mulps %xmm4, %xmm12 5140 addps %xmm12, %xmm9 5141 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) 5142 5143 shufps $0x4e, %xmm2, %xmm13 5144 mulps %xmm4, %xmm13 5145 addps %xmm13, %xmm10 5146 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 5147 movss %xmm3, %xmm14 5148 shufps $0x39, %xmm14, %xmm14 5149 mulps %xmm4, %xmm14 5150 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 5151 addps %xmm14, %xmm11 5152 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) 5153 5154 mulps %xmm5, %xmm0 5155 addps %xmm0, %xmm8 5156 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 5157 movss %xmm12, %xmm1 5158 shufps $0x93, %xmm12, %xmm1 5159 mulps %xmm5, %xmm1 5160 addps %xmm1, %xmm9 5161 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) 5162 5163 shufps $0x4e, %xmm13, %xmm2 5164 mulps %xmm5, %xmm2 5165 addps %xmm2, %xmm10 5166 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 5167 movss %xmm14, %xmm3 5168 shufps $0x39, %xmm3, %xmm3 5169 mulps %xmm5, %xmm3 5170 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 5171 addps %xmm3, %xmm11 5172 MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) 5173 5174 mulps %xmm4, %xmm0 5175 addps %xmm0, %xmm8 5176 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 5177 movss %xmm1, %xmm12 5178 shufps $0x93, %xmm1, %xmm12 5179 mulps %xmm4, %xmm12 5180 addps %xmm12, %xmm9 5181 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) 5182 5183 shufps $0x4e, %xmm2, %xmm13 5184 mulps %xmm4, %xmm13 5185 addps %xmm13, %xmm10 5186 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 5187 movss %xmm3, %xmm14 5188 shufps $0x39, %xmm14, %xmm14 5189 mulps %xmm4, %xmm14 5190 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 5191 addps %xmm14, %xmm11 5192 MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) 5193 5194 mulps %xmm5, %xmm0 5195 addps %xmm0, %xmm8 5196 movss %xmm12, %xmm1 5197 shufps $0x93, %xmm12, %xmm1 5198 mulps %xmm5, %xmm1 5199 addps %xmm1, %xmm9 5200 5201 shufps $0x4e, %xmm13, %xmm2 5202 mulps %xmm5, %xmm2 
5203 addps %xmm2, %xmm10 5204 movss %xmm14, %xmm3 5205 shufps $0x39, %xmm3, %xmm3 5206 mulps %xmm5, %xmm3 5207 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 5208 addps %xmm3, %xmm11 5209 5210 addq $16 * SIZE, A1 5211 addq $16 * SIZE, A2 5212 addq $16 * SIZE, X1 5213 ALIGN_4 5214 5215.L305: 5216 testq $8, MM 5217 jle .L306 5218 5219 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 5220 MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) 5221 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 5222 MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) 5223 5224 mulps %xmm4, %xmm0 5225 addps %xmm0, %xmm8 5226 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 5227 movss %xmm1, %xmm12 5228 shufps $0x93, %xmm1, %xmm12 5229 mulps %xmm4, %xmm12 5230 addps %xmm12, %xmm9 5231 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) 5232 5233 shufps $0x4e, %xmm2, %xmm13 5234 mulps %xmm4, %xmm13 5235 addps %xmm13, %xmm10 5236 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 5237 movss %xmm3, %xmm14 5238 shufps $0x39, %xmm14, %xmm14 5239 mulps %xmm4, %xmm14 5240 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 5241 addps %xmm14, %xmm11 5242 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) 5243 5244 mulps %xmm5, %xmm0 5245 addps %xmm0, %xmm8 5246 movss %xmm12, %xmm1 5247 shufps $0x93, %xmm12, %xmm1 5248 mulps %xmm5, %xmm1 5249 addps %xmm1, %xmm9 5250 5251 shufps $0x4e, %xmm13, %xmm2 5252 mulps %xmm5, %xmm2 5253 addps %xmm2, %xmm10 5254 movss %xmm14, %xmm3 5255 shufps $0x39, %xmm3, %xmm3 5256 mulps %xmm5, %xmm3 5257 addps %xmm3, %xmm11 5258 5259 addq $8 * SIZE, A1 5260 addq $8 * SIZE, A2 5261 addq $8 * SIZE, X1 5262 ALIGN_4 5263 5264.L306: 5265 testq $4, MM 5266 jle .L307 5267 5268 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 5269 MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) 5270 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 5271 MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) 5272 5273 mulps %xmm4, %xmm0 5274 addps %xmm0, %xmm8 5275 movss %xmm1, %xmm12 5276 shufps $0x93, %xmm1, %xmm12 5277 mulps %xmm4, %xmm12 5278 addps %xmm12, %xmm9 5279 5280 shufps $0x4e, %xmm2, %xmm13 5281 mulps %xmm4, %xmm13 5282 addps %xmm13, %xmm10 5283 movss 
%xmm3, %xmm14 5284 shufps $0x39, %xmm14, %xmm14 5285 mulps %xmm4, %xmm14 5286 addps %xmm14, %xmm11 5287 5288 addq $4 * SIZE, A1 5289 addq $4 * SIZE, A2 5290 addq $4 * SIZE, X1 5291 ALIGN_4 5292 5293.L307: 5294 testq $2, MM 5295 jle .L308 5296 5297#ifdef movsd 5298 xorps %xmm0, %xmm0 5299#endif 5300 movsd -32 * SIZE(A1), %xmm0 5301#ifdef movsd 5302 xorps %xmm4, %xmm4 5303#endif 5304 movsd -32 * SIZE(X1), %xmm4 5305 mulps %xmm4, %xmm0 5306 addps %xmm0, %xmm8 5307#ifdef movsd 5308 xorps %xmm1, %xmm1 5309#endif 5310 movsd -32 * SIZE(A1, LDA), %xmm1 5311 mulps %xmm4, %xmm1 5312 addps %xmm1, %xmm9 5313#ifdef movsd 5314 xorps %xmm2, %xmm2 5315#endif 5316 movsd -32 * SIZE(A2), %xmm2 5317 mulps %xmm4, %xmm2 5318 addps %xmm2, %xmm10 5319#ifdef movsd 5320 xorps %xmm3, %xmm3 5321#endif 5322 movsd -32 * SIZE(A2, LDA), %xmm3 5323 mulps %xmm4, %xmm3 5324 addps %xmm3, %xmm11 5325 shufps $0xe, %xmm4, %xmm4 5326 5327 addq $2 * SIZE, A1 5328 addq $2 * SIZE, A2 5329 addq $2 * SIZE, X1 5330 ALIGN_4 5331 5332.L308: 5333 testq $1, MM 5334 jle .L309 5335 5336 movss -32 * SIZE(A1), %xmm0 5337 movss -32 * SIZE(X1), %xmm4 5338 mulss %xmm4, %xmm0 5339 addss %xmm0, %xmm8 5340 movss -32 * SIZE(A1, LDA), %xmm1 5341 mulss %xmm4, %xmm1 5342 addss %xmm1, %xmm9 5343 movss -32 * SIZE(A2), %xmm2 5344 mulss %xmm4, %xmm2 5345 addss %xmm2, %xmm10 5346 movss -32 * SIZE(A2, LDA), %xmm3 5347 mulss %xmm4, %xmm3 5348 addss %xmm3, %xmm11 5349 ALIGN_4 5350 5351.L309: 5352#ifdef HAVE_SSE3 5353 haddps %xmm9, %xmm8 5354 haddps %xmm11, %xmm10 5355 haddps %xmm10, %xmm8 5356 5357 pshufd $0x1, %xmm8, %xmm9 5358 pshufd $0x2, %xmm8, %xmm10 5359 pshufd $0x3, %xmm8, %xmm11 5360#else 5361 movaps %xmm8, %xmm0 5362 unpcklps %xmm9, %xmm8 5363 unpckhps %xmm9, %xmm0 5364 5365 movaps %xmm10, %xmm1 5366 unpcklps %xmm11, %xmm10 5367 unpckhps %xmm11, %xmm1 5368 5369 movaps %xmm8, %xmm9 5370 unpcklps %xmm10, %xmm8 5371 unpckhps %xmm10, %xmm9 5372 5373 movaps %xmm0, %xmm10 5374 unpcklps %xmm1, %xmm0 5375 unpckhps %xmm1, %xmm10 5376 
5377 addps %xmm9, %xmm8 5378 addps %xmm0, %xmm10 5379 addps %xmm10, %xmm8 5380 5381 pshufd $0x2, %xmm8, %xmm9 5382 pshufd $0x1, %xmm8, %xmm10 5383 pshufd $0x3, %xmm8, %xmm11 5384#endif 5385 5386 mulss ALPHA, %xmm8 5387 mulss ALPHA, %xmm9 5388 mulss ALPHA, %xmm10 5389 mulss ALPHA, %xmm11 5390 5391 addss (Y), %xmm8 5392 addq INCY, Y 5393 addss (Y), %xmm9 5394 addq INCY, Y 5395 addss (Y), %xmm10 5396 addq INCY, Y 5397 addss (Y), %xmm11 5398 addq INCY, Y 5399 5400 movss %xmm8, (Y1) 5401 addq INCY, Y1 5402 movss %xmm9, (Y1) 5403 addq INCY, Y1 5404 movss %xmm10, (Y1) 5405 addq INCY, Y1 5406 movss %xmm11, (Y1) 5407 addq INCY, Y1 5408 5409 cmpq $4, N 5410 jge .L301 5411 ALIGN_4 5412 5413.L310: 5414 testq N, N 5415 jle .L999 5416 5417 cmpq $3, N 5418 jne .L320 5419 5420 leaq 32 * SIZE(BUFFER), X1 5421 5422 movq A, A1 5423 leaq (A1, LDA, 2), A2 5424 leaq (A1, LDA, 4), A 5425 5426 xorps %xmm8, %xmm8 5427 xorps %xmm9, %xmm9 5428 xorps %xmm10, %xmm10 5429 5430 cmpq $3, M 5431 jle .L317 5432 5433 testq $SIZE, A1 5434 je .L31X 5435 5436 movss -32 * SIZE(A1), %xmm0 5437 movss -32 * SIZE(X1), %xmm4 5438 mulss %xmm4, %xmm0 5439 addss %xmm0, %xmm8 5440 movss -32 * SIZE(A1, LDA), %xmm1 5441 mulss %xmm4, %xmm1 5442 addss %xmm1, %xmm9 5443 movss -32 * SIZE(A2), %xmm2 5444 mulss %xmm4, %xmm2 5445 addss %xmm2, %xmm10 5446 5447 addq $1 * SIZE, A1 5448 addq $1 * SIZE, A2 5449 addq $1 * SIZE, X1 5450 ALIGN_3 5451 5452.L31X: 5453 testq $2 * SIZE, A1 5454 je .L31XX 5455 5456#ifdef movsd 5457 xorps %xmm0, %xmm0 5458 xorps %xmm4, %xmm4 5459#endif 5460 movsd -32 * SIZE(A1), %xmm0 5461 movsd -32 * SIZE(X1), %xmm4 5462 mulps %xmm4, %xmm0 5463 addps %xmm0, %xmm8 5464#ifdef movsd 5465 xorps %xmm1, %xmm1 5466#endif 5467 movsd -32 * SIZE(A1, LDA), %xmm1 5468 mulps %xmm4, %xmm1 5469 addps %xmm1, %xmm9 5470#ifdef movsd 5471 xorps %xmm2, %xmm2 5472#endif 5473 movsd -32 * SIZE(A2), %xmm2 5474 mulps %xmm4, %xmm2 5475 addps %xmm2, %xmm10 5476 5477 addq $2 * SIZE, A1 5478 addq $2 * SIZE, A2 5479 addq $2 * 
SIZE, X1 5480 ALIGN_3 5481 5482.L31XX: 5483 movaps -35 * SIZE(A1, LDA), %xmm12 5484 movaps -34 * SIZE(A2), %xmm13 5485 5486 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 5487 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 5488 5489 movq MM, I 5490 sarq $4, I 5491 jle .L315 5492 5493 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 5494 MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) 5495 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 5496 5497 decq I 5498 jle .L313 5499 ALIGN_4 5500 5501.L312: 5502#ifdef PREFETCH 5503 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) 5504#endif 5505 5506 mulps %xmm4, %xmm0 5507 addps %xmm0, %xmm8 5508 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 5509 movss %xmm1, %xmm12 5510 shufps $0x93, %xmm1, %xmm12 5511 mulps %xmm4, %xmm12 5512 addps %xmm12, %xmm9 5513 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) 5514 5515 shufps $0x4e, %xmm2, %xmm13 5516 mulps %xmm4, %xmm13 5517 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 5518 addps %xmm13, %xmm10 5519 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 5520 5521#ifdef PREFETCH 5522 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) 5523#endif 5524 5525 mulps %xmm5, %xmm0 5526 addps %xmm0, %xmm8 5527 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 5528 movss %xmm12, %xmm1 5529 shufps $0x93, %xmm12, %xmm1 5530 mulps %xmm5, %xmm1 5531 addps %xmm1, %xmm9 5532 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) 5533 shufps $0x4e, %xmm13, %xmm2 5534 mulps %xmm5, %xmm2 5535 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 5536 addps %xmm2, %xmm10 5537 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 5538 5539#ifdef PREFETCH 5540 PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) 5541#endif 5542 5543 mulps %xmm4, %xmm0 5544 addps %xmm0, %xmm8 5545 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 5546 movss %xmm1, %xmm12 5547 shufps $0x93, %xmm1, %xmm12 5548 mulps %xmm4, %xmm12 5549 addps %xmm12, %xmm9 5550 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) 5551 shufps $0x4e, %xmm2, %xmm13 5552 mulps %xmm4, %xmm13 5553 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 5554 addps %xmm13, %xmm10 5555 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 5556 5557#ifdef PREFETCHW 5558 PREFETCH 
(PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) 5559#endif 5560 5561 mulps %xmm5, %xmm0 5562 addps %xmm0, %xmm8 5563 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) 5564 movss %xmm12, %xmm1 5565 shufps $0x93, %xmm12, %xmm1 5566 mulps %xmm5, %xmm1 5567 addps %xmm1, %xmm9 5568 MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) 5569 shufps $0x4e, %xmm13, %xmm2 5570 mulps %xmm5, %xmm2 5571 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 5572 addps %xmm2, %xmm10 5573 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) 5574 5575 addq $16 * SIZE, A1 5576 addq $16 * SIZE, A2 5577 addq $16 * SIZE, X1 5578 5579 decq I 5580 jg .L312 5581 ALIGN_4 5582 5583.L313: 5584 mulps %xmm4, %xmm0 5585 addps %xmm0, %xmm8 5586 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 5587 movss %xmm1, %xmm12 5588 shufps $0x93, %xmm1, %xmm12 5589 mulps %xmm4, %xmm12 5590 addps %xmm12, %xmm9 5591 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) 5592 5593 shufps $0x4e, %xmm2, %xmm13 5594 mulps %xmm4, %xmm13 5595 addps %xmm13, %xmm10 5596 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 5597 movss %xmm3, %xmm14 5598 shufps $0x39, %xmm14, %xmm14 5599 mulps %xmm4, %xmm14 5600 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 5601 addps %xmm14, %xmm11 5602 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) 5603 5604 mulps %xmm5, %xmm0 5605 addps %xmm0, %xmm8 5606 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) 5607 movss %xmm12, %xmm1 5608 shufps $0x93, %xmm12, %xmm1 5609 mulps %xmm5, %xmm1 5610 addps %xmm1, %xmm9 5611 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) 5612 5613 shufps $0x4e, %xmm13, %xmm2 5614 mulps %xmm5, %xmm2 5615 addps %xmm2, %xmm10 5616 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) 5617 movss %xmm14, %xmm3 5618 shufps $0x39, %xmm3, %xmm3 5619 mulps %xmm5, %xmm3 5620 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) 5621 addps %xmm3, %xmm11 5622 MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) 5623 5624 mulps %xmm4, %xmm0 5625 addps %xmm0, %xmm8 5626 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) 5627 movss %xmm1, %xmm12 5628 shufps $0x93, %xmm1, %xmm12 5629 mulps %xmm4, %xmm12 5630 addps %xmm12, %xmm9 5631 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) 5632 5633 shufps 
$0x4e, %xmm2, %xmm13 5634 mulps %xmm4, %xmm13 5635 addps %xmm13, %xmm10 5636 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) 5637 movss %xmm3, %xmm14 5638 shufps $0x39, %xmm14, %xmm14 5639 mulps %xmm4, %xmm14 5640 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) 5641 addps %xmm14, %xmm11 5642 MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) 5643 5644 mulps %xmm5, %xmm0 5645 addps %xmm0, %xmm8 5646 movss %xmm12, %xmm1 5647 shufps $0x93, %xmm12, %xmm1 5648 mulps %xmm5, %xmm1 5649 addps %xmm1, %xmm9 5650 5651 shufps $0x4e, %xmm13, %xmm2 5652 mulps %xmm5, %xmm2 5653 addps %xmm2, %xmm10 5654 movss %xmm14, %xmm3 5655 shufps $0x39, %xmm3, %xmm3 5656 mulps %xmm5, %xmm3 5657 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) 5658 addps %xmm3, %xmm11 5659 5660 addq $16 * SIZE, A1 5661 addq $16 * SIZE, A2 5662 addq $16 * SIZE, X1 5663 ALIGN_4 5664 5665.L315: 5666 testq $8, MM 5667 jle .L316 5668 5669 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 5670 MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) 5671 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 5672 5673 mulps %xmm4, %xmm0 5674 addps %xmm0, %xmm8 5675 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 5676 movss %xmm1, %xmm12 5677 shufps $0x93, %xmm1, %xmm12 5678 mulps %xmm4, %xmm12 5679 addps %xmm12, %xmm9 5680 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) 5681 5682 shufps $0x4e, %xmm2, %xmm13 5683 mulps %xmm4, %xmm13 5684 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 5685 addps %xmm13, %xmm10 5686 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) 5687 5688 mulps %xmm5, %xmm0 5689 addps %xmm0, %xmm8 5690 movss %xmm12, %xmm1 5691 shufps $0x93, %xmm12, %xmm1 5692 mulps %xmm5, %xmm1 5693 addps %xmm1, %xmm9 5694 5695 shufps $0x4e, %xmm13, %xmm2 5696 mulps %xmm5, %xmm2 5697 addps %xmm2, %xmm10 5698 5699 addq $8 * SIZE, A1 5700 addq $8 * SIZE, A2 5701 addq $8 * SIZE, X1 5702 ALIGN_4 5703 5704.L316: 5705 testq $4, MM 5706 jle .L317 5707 5708 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 5709 MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) 5710 MOVUPS_A1 (-30 * SIZE, A2, %xmm2) 5711 5712 mulps %xmm4, %xmm0 5713 addps %xmm0, %xmm8 5714 movss %xmm1, %xmm12 5715 shufps 
$0x93, %xmm1, %xmm12 5716 mulps %xmm4, %xmm12 5717 addps %xmm12, %xmm9 5718 5719 shufps $0x4e, %xmm2, %xmm13 5720 mulps %xmm4, %xmm13 5721 addps %xmm13, %xmm10 5722 5723 addq $4 * SIZE, A1 5724 addq $4 * SIZE, A2 5725 addq $4 * SIZE, X1 5726 ALIGN_4 5727 5728.L317: 5729 testq $2, MM 5730 jle .L318 5731 5732#ifdef movsd 5733 xorps %xmm0, %xmm0 5734#endif 5735 movsd -32 * SIZE(A1), %xmm0 5736#ifdef movsd 5737 xorps %xmm4, %xmm4 5738#endif 5739 movsd -32 * SIZE(X1), %xmm4 5740 mulps %xmm4, %xmm0 5741 addps %xmm0, %xmm8 5742#ifdef movsd 5743 xorps %xmm1, %xmm1 5744#endif 5745 movsd -32 * SIZE(A1, LDA), %xmm1 5746 mulps %xmm4, %xmm1 5747 addps %xmm1, %xmm9 5748#ifdef movsd 5749 xorps %xmm2, %xmm2 5750#endif 5751 movsd -32 * SIZE(A2), %xmm2 5752 mulps %xmm4, %xmm2 5753 addps %xmm2, %xmm10 5754 5755 addq $2 * SIZE, A1 5756 addq $2 * SIZE, A2 5757 addq $2 * SIZE, X1 5758 ALIGN_4 5759 5760.L318: 5761 testq $1, MM 5762 jle .L319 5763 5764 movss -32 * SIZE(A1), %xmm0 5765 movss -32 * SIZE(X1), %xmm4 5766 mulss %xmm4, %xmm0 5767 addss %xmm0, %xmm8 5768 movss -32 * SIZE(A1, LDA), %xmm1 5769 mulss %xmm4, %xmm1 5770 addss %xmm1, %xmm9 5771 movss -32 * SIZE(A2), %xmm2 5772 mulss %xmm4, %xmm2 5773 addss %xmm2, %xmm10 5774 ALIGN_4 5775 5776.L319: 5777#ifdef HAVE_SSE3 5778 haddps %xmm9, %xmm8 5779 haddps %xmm11, %xmm10 5780 haddps %xmm10, %xmm8 5781 5782 pshufd $0x1, %xmm8, %xmm9 5783 pshufd $0x2, %xmm8, %xmm10 5784#else 5785 movaps %xmm8, %xmm0 5786 unpcklps %xmm9, %xmm8 5787 unpckhps %xmm9, %xmm0 5788 5789 movaps %xmm10, %xmm1 5790 unpcklps %xmm11, %xmm10 5791 unpckhps %xmm11, %xmm1 5792 5793 movaps %xmm8, %xmm9 5794 unpcklps %xmm10, %xmm8 5795 unpckhps %xmm10, %xmm9 5796 5797 movaps %xmm0, %xmm10 5798 unpcklps %xmm1, %xmm0 5799 unpckhps %xmm1, %xmm10 5800 5801 addps %xmm9, %xmm8 5802 addps %xmm0, %xmm10 5803 addps %xmm10, %xmm8 5804 5805 pshufd $0x2, %xmm8, %xmm9 5806 pshufd $0x1, %xmm8, %xmm10 5807#endif 5808 5809 mulss ALPHA, %xmm8 5810 mulss ALPHA, %xmm9 5811 mulss ALPHA, 
%xmm10 5812 5813 addss (Y), %xmm8 5814 addq INCY, Y 5815 addss (Y), %xmm9 5816 addq INCY, Y 5817 addss (Y), %xmm10 5818 5819 movss %xmm8, (Y1) 5820 addq INCY, Y1 5821 movss %xmm9, (Y1) 5822 addq INCY, Y1 5823 movss %xmm10, (Y1) 5824 jmp .L999 5825 ALIGN_3 5826 5827.L320: 5828 cmpq $2, N 5829 jne .L330 5830 5831 leaq 32 * SIZE(BUFFER), X1 5832 5833 movq A, A1 5834 leaq (A1, LDA), A2 5835 5836 xorps %xmm8, %xmm8 5837 xorps %xmm9, %xmm9 5838 5839 cmpq $3, M 5840 jle .L327 5841 5842 testq $SIZE, A1 5843 je .L32X 5844 5845 movss -32 * SIZE(A1), %xmm0 5846 movss -32 * SIZE(X1), %xmm4 5847 mulss %xmm4, %xmm0 5848 addss %xmm0, %xmm8 5849 movss -32 * SIZE(A2), %xmm1 5850 mulss %xmm4, %xmm1 5851 addss %xmm1, %xmm9 5852 5853 addq $1 * SIZE, A1 5854 addq $1 * SIZE, A2 5855 addq $1 * SIZE, X1 5856 ALIGN_3 5857 5858.L32X: 5859 testq $2 * SIZE, A1 5860 je .L32XX 5861 5862#ifdef movsd 5863 xorps %xmm0, %xmm0 5864 xorps %xmm4, %xmm4 5865#endif 5866 movsd -32 * SIZE(A1), %xmm0 5867 movsd -32 * SIZE(X1), %xmm4 5868 mulps %xmm4, %xmm0 5869 addps %xmm0, %xmm8 5870#ifdef movsd 5871 xorps %xmm1, %xmm1 5872#endif 5873 movsd -32 * SIZE(A2), %xmm1 5874 mulps %xmm4, %xmm1 5875 addps %xmm1, %xmm9 5876 5877 addq $2 * SIZE, A1 5878 addq $2 * SIZE, A2 5879 addq $2 * SIZE, X1 5880 ALIGN_3 5881 5882.L32XX: 5883 movaps -35 * SIZE(A2), %xmm12 5884 5885 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) 5886 MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) 5887 5888 movq MM, I 5889 sarq $4, I 5890 jle .L325 5891 5892 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 5893 MOVUPS_A1 (-31 * SIZE, A2, %xmm1) 5894 5895 decq I 5896 jle .L323 5897 ALIGN_4 5898 5899.L322: 5900#ifdef PREFETCH 5901 PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) 5902#endif 5903 5904 mulps %xmm4, %xmm0 5905 addps %xmm0, %xmm8 5906 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) 5907 movss %xmm1, %xmm12 5908 shufps $0x93, %xmm1, %xmm12 5909 mulps %xmm4, %xmm12 5910 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) 5911 addps %xmm12, %xmm9 5912 MOVUPS_A1 (-27 * SIZE, A2, %xmm12) 5913 5914 mulps %xmm5, 
%xmm0	/* completes the "mulps %xmm5," split at the chunk boundary above */
	addps	%xmm0, %xmm8	/* xmm8 += a1[i..i+3] * x[i..i+3] (A1 column dot) */
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	/* A2 is loaded one float off 16-byte alignment (-31, -27, ...);  */
	/* movss+shufps splice the previous and current 4-float loads and */
	/* rotate ($0x93) to rebuild the group that lines up with x.      */
	movss	%xmm12, %xmm1
	shufps	$0x93, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)	/* prefetch next x group into xmm5 */
	addps	%xmm1, %xmm9	/* xmm9 accumulates the A2 column dot product */
	MOVUPS_A1 (-23 * SIZE, A2, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
#endif

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
	movss	%xmm1, %xmm12	/* same splice, roles of xmm1/xmm12 swapped */
	shufps	$0x93, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-19 * SIZE, A2, %xmm12)

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)
	movss	%xmm12, %xmm1
	shufps	$0x93, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm1, %xmm9
	MOVUPS_A1 (-15 * SIZE, A2, %xmm1)

	addq	$16 * SIZE, A1	/* main loop consumes 16 elements per pass */
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1

	decq	I
	jg	.L322
	ALIGN_4

/* Drained copy of the .L322 body: finishes the last software-pipelined  */
/* iteration without issuing loads past the end of A1/A2.                */
.L323:
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	movss	%xmm1, %xmm12
	shufps	$0x93, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-27 * SIZE, A2, %xmm12)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)
	movss	%xmm12, %xmm1
	shufps	$0x93, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm1, %xmm9
	MOVUPS_A1 (-23 * SIZE, A2, %xmm1)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-20 * SIZE, A1, %xmm0)
	movss	%xmm1, %xmm12
	shufps	$0x93, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-19 * SIZE, A2, %xmm12)

	mulps	%xmm5, %xmm0	/* "%xmm0" operand restored; lost at an extraction boundary (cf. same pattern in .L322) */
	addps	%xmm0, %xmm8
	movss	%xmm12, %xmm1
	shufps	$0x93, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm1, %xmm9

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2
	addq	$16 * SIZE, X1
	ALIGN_4

/* Remainder: 8 leftover elements (MM & 8).                              */
.L325:
	testq	$8, MM
	jle	.L326

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A1 (-31 * SIZE, A2, %xmm1)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-28 * SIZE, A1, %xmm0)
	movss	%xmm1, %xmm12
	shufps	$0x93, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-27 * SIZE, A2, %xmm12)

	mulps	%xmm5, %xmm0
	addps	%xmm0, %xmm8
	movss	%xmm12, %xmm1
	shufps	$0x93, %xmm12, %xmm1
	mulps	%xmm5, %xmm1
	addps	%xmm1, %xmm9

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1
	ALIGN_4

/* Remainder: 4 leftover elements (MM & 4).                              */
.L326:
	testq	$4, MM
	jle	.L327

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	MOVUPS_A1 (-31 * SIZE, A2, %xmm1)

	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	movss	%xmm1, %xmm12
	shufps	$0x93, %xmm1, %xmm12
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm9

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, X1
	ALIGN_4

/* Remainder: 2 leftover elements (MM & 2), done as a 2-float movsd pair. */
/* The "#ifdef movsd" zeroing works around movsd being macro-redefined    */
/* (xorps clears stale upper lanes before the partial load).              */
.L327:
	testq	$2, MM
	jle	.L328

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(A1), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(A2), %xmm1
	mulps	%xmm4, %xmm1
	addps	%xmm1, %xmm9
	shufps	$0xe, %xmm4, %xmm4

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_4

/* Remainder: final single element (MM & 1), scalar.                     */
.L328:
	testq	$1, MM
	jle	.L329

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	movss	-32 * SIZE(A2), %xmm1
	mulss	%xmm4, %xmm1
	addss	%xmm1, %xmm9
	ALIGN_4

/* Horizontal reduction of the two interleaved dot products, then        */
/* y[0] += alpha*dot(A1,x), y[1] += alpha*dot(A2,x).  Y is the read      */
/* pointer and Y1 (aliased to INCX, see header) the write pointer; both  */
/* step by INCY.                                                         */
.L329:
#ifdef HAVE_SSE3
	haddps	%xmm9, %xmm8	/* SSE3 path: two hadds leave dot1 in lane 0, dot2 in lane 1 */
	haddps	%xmm8, %xmm8
#else
	movaps	%xmm8, %xmm10	/* SSE2 path: interleave + add pairs, then fold high half */
	unpcklps %xmm9, %xmm8
	unpckhps %xmm9, %xmm10

	addps	%xmm10, %xmm8
	movhlps	%xmm8, %xmm9
	addps	%xmm9, %xmm8
#endif

	pshufd	$0x1, %xmm8, %xmm9	/* xmm9 = second column's dot product */

	mulss	ALPHA, %xmm8	/* ALPHA is %xmm7 (see header defines) */
	mulss	ALPHA, %xmm9

	addss	(Y), %xmm8
	addq	INCY, Y
	addss	(Y), %xmm9
	addq	INCY, Y

	movss	%xmm8, (Y1)
	addq	INCY, Y1
	movss	%xmm9, (Y1)
	addq	INCY, Y1
	jmp	.L999
	ALIGN_4

/* Tail path for a single remaining column (N == 1): one dot product     */
/* dot(A1, x) accumulated in xmm8/xmm9.                                  */
.L330:
	cmpq	$1, N
	jne	.L999

	leaq	32 * SIZE(BUFFER), X1	/* x was copied to BUFFER; -32*SIZE offsets rebase it */

	movq	A, A1

	xorps	%xmm8, %xmm8
	xorps	%xmm9, %xmm9

	cmpq	$3, M
	jle	.L337	/* tiny M: go straight to the scalar cleanup */

	/* Peel 1 element if A1 is not 2*SIZE-aligned ... */
	testq	$SIZE, A1
	je	.L33X

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8

	addq	$1 * SIZE, A1
	addq	$1 * SIZE, X1
	ALIGN_3

/* ... then peel 2 more if still not 16-byte aligned. */
.L33X:
	testq	$2 * SIZE, A1
	je	.L33XX

#ifdef movsd
	xorps	%xmm0, %xmm0
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm0
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0xe, %xmm4, %xmm4

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, X1
	ALIGN_3

.L33XX:

	MOVUPS_XL1 (-32 * SIZE, X1, %xmm4)
	MOVUPS_XL1 (-28 * SIZE, X1, %xmm5)

	movq	MM, I
	sarq	$4, I	/* I = MM / 16: main loop handles 16 floats per pass */
	jle	.L335

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)	/* preload first A groups (software pipeline) */
	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)

	decq	I
	jle	.L333
	ALIGN_4

/* Single-column main loop: two accumulators (xmm8, xmm9) hide the       */
/* addps latency; loads for the next group are issued between FLOPs.     */
.L332:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
#endif

	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
#endif

	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-16 * SIZE, A1, %xmm0)

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-12 * SIZE, A1, %xmm12)

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, X1

	decq	I
	jg	.L332
	ALIGN_4

/* Drained last iteration of .L332 (no A loads past the end).            */
.L333:
	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8
	MOVUPS_A1 (-24 * SIZE, A1, %xmm0)

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-20 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9
	MOVUPS_A1 (-20 * SIZE, A1, %xmm12)

	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-16 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8

	mulps	%xmm5, %xmm12
	MOVUPS_XL1 (-12 * SIZE, X1, %xmm5)
	addps	%xmm12, %xmm9

	addq	$16 * SIZE, A1
	addq	$16 * SIZE, X1
	ALIGN_4

/* Remainder: 8 elements (MM & 8).                                       */
.L335:
	testq	$8, MM
	jle	.L336

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	MOVUPS_XL1 (-24 * SIZE, X1, %xmm4)
	addps	%xmm0, %xmm8

	MOVUPS_A1 (-28 * SIZE, A1, %xmm12)
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm9

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, X1
	ALIGN_4

/* Remainder: 4 elements (MM & 4).                                       */
.L336:
	testq	$4, MM
	jle	.L337

	MOVUPS_A1 (-32 * SIZE, A1, %xmm0)
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, X1
	ALIGN_4

/* Remainder: 2 elements (MM & 2).                                       */
.L337:
	testq	$2, MM
	jle	.L338

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(A1), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X1), %xmm4
	mulps	%xmm4, %xmm0
	addps	%xmm0, %xmm8
	shufps	$0xe, %xmm4, %xmm4

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, X1
	ALIGN_4

/* Remainder: final element (MM & 1), scalar.                            */
.L338:
	testq	$1, MM
	jle	.L339

	movss	-32 * SIZE(A1), %xmm0
	movss	-32 * SIZE(X1), %xmm4
	mulss	%xmm4, %xmm0
	addss	%xmm0, %xmm8
	ALIGN_4

/* Reduce the 4 partial sums to one scalar, then y[0] += alpha * dot.    */
.L339:
	addps	%xmm9, %xmm8	/* merge the two accumulators first */

#ifdef HAVE_SSE3
	haddps	%xmm8, %xmm8
	haddps	%xmm8, %xmm8
#else
	pshufd	$1, %xmm8, %xmm9	/* SSE2: extract lanes 1..3 and sum pairwise */
	pshufd	$2, %xmm8, %xmm10
	pshufd	$3, %xmm8, %xmm11

	addss	%xmm9, %xmm8
	addss	%xmm11, %xmm10
	addss	%xmm10, %xmm8
#endif

	mulss	ALPHA, %xmm8

	addss	(Y), %xmm8
	movss	%xmm8, (Y1)

	jmp	.L999
#endif	/* closes a conditional opened before this chunk (not visible here) */
	ALIGN_4

/* Common exit: restore callee-saved GPRs (and, on Windows, rdi/rsi and  */
/* xmm6-xmm15 per the Microsoft x64 ABI), pop the frame, return.         */
.L999:
	movq	0(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	ALIGN_4

	EPILOGUE