/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT         */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
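/*
 * Editorial note (not part of the original source): judging from the
 * code below, this is a double-precision GEMV "N" kernel computing
 * y += alpha * A * x.  Columns of A are processed GEMV_UNROLL at a
 * time, partial results are accumulated into BUFFER, and the buffer
 * is folded back into y at .L900.
 */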

#define ASSEMBLER
#include "common.h"
#include "l2param.h"

#if GEMV_UNROLL < 2
#undef  GEMV_UNROLL
#define GEMV_UNROLL 2
#endif

#ifndef WINDOWS_ABI

#define STACKSIZE	128

#define OLD_M	%rdi
#define OLD_N	%rsi
#define OLD_A	%rcx
#define OLD_LDA	%r8
#define STACK_INCX	 8 + STACKSIZE(%rsp)
#define STACK_Y		16 + STACKSIZE(%rsp)
#define STACK_INCY	24 + STACKSIZE(%rsp)
#define STACK_BUFFER	32 + STACKSIZE(%rsp)
#define ALPHA	48 (%rsp)

#define MMM	56(%rsp)
#define NN	64(%rsp)
#define AA	72(%rsp)
#define LDAX	80(%rsp)
#define XX	88(%rsp)
#else

#define STACKSIZE	288

#define OLD_M	%rcx
#define OLD_N	%rdx
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_LDA		48 + STACKSIZE(%rsp)
#define OLD_X		56 + STACKSIZE(%rsp)
#define STACK_INCX	64 + STACKSIZE(%rsp)
#define STACK_Y		72 + STACKSIZE(%rsp)
#define STACK_INCY	80 + STACKSIZE(%rsp)
#define STACK_BUFFER	88 + STACKSIZE(%rsp)
#define ALPHA	224 (%rsp)

#define MMM	232(%rsp)
#define NN	240(%rsp)
#define AA	248(%rsp)
#define LDAX	256(%rsp)
#define XX	264(%rsp)

#endif

#define LDA	%r8
#define X	%r9

#define INCX	%rsi
#define INCY	%rdi

#define M	%r10
#define N	%r11
#define A	%r12
#define Y	%r14
#define BUFFER	%r13

#define I	%rax
#define A1	%rbx
#define A2	%rcx
#define LDA3	%rdx
#define Y1	%rbp

#ifdef ALIGNED_ACCESS
#define MM	%r15
#else
#define MM	M
#endif

#define TMP_M	%r15
#define Y2	%rbx

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_M,   M
	movq	OLD_N,   N
	movq	OLD_A,   A
	movq	OLD_LDA, LDA
	movq	OLD_X,   X
#else
	movq	OLD_M,   M
	movq	OLD_N,   N
	movq	OLD_A,   A
	movq	OLD_LDA, LDA
#endif

#ifndef WINDOWS_ABI
	movsd	%xmm0, ALPHA
#else
	movsd	%xmm3, ALPHA
#endif

	movq	STACK_Y, Y
	movq	A, AA
	movq	N, NN
	movq	M, MMM
	movq	LDA, LDAX
	movq	X, XX

.L0t:
	xorq	I, I
	addq	$1, I
	salq	$21, I
	subq	I, MMM
	movq	I, M
	jge	.L00t

	movq	MMM, M
	addq	M, I
	jle	.L999x
	movq	I, M

.L00t:
	movq	XX, X
	movq	AA, A
	movq	NN, N
	movq	LDAX, LDA

	movq	STACK_INCX,   INCX
	movq	STACK_INCY,   INCY
	movq	STACK_BUFFER, BUFFER


	leaq	-1(INCY), %rax

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY
	leaq	(, LDA,  SIZE), LDA

	leaq	(LDA, LDA, 2), LDA3

	subq	$-16 * SIZE, A

#ifdef ALIGNED_ACCESS
	leaq	-1(M), MM
	testq	$SIZE, A
	cmoveq	M, MM
#endif

	testq	N, N		# if n <= 0 goto END
	jle	.L999
	testq	M, M		# if m <= 0 goto END
	jle	.L999

#if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS)
#ifndef NOCOPY_UNALIGNED
	movq	Y, Y1
	andq	$0xf, Y1
	orq	Y1, %rax
#endif
	testq	%rax, %rax
	cmoveq	Y, BUFFER
	je	.L10
#endif

	movq	BUFFER, Y1

	pxor	%xmm4, %xmm4

	movq	M,   %rax
	addq	$16, %rax
	sarq	$4,  %rax
	ALIGN_3

.L01:
	movapd	%xmm4,  0 * SIZE(Y1)
	movapd	%xmm4,  2 * SIZE(Y1)
	movapd	%xmm4,  4 * SIZE(Y1)
	movapd	%xmm4,  6 * SIZE(Y1)
	movapd	%xmm4,  8 * SIZE(Y1)
	movapd	%xmm4, 10 * SIZE(Y1)
	movapd	%xmm4, 12 * SIZE(Y1)
	movapd	%xmm4, 14 * SIZE(Y1)
	subq	$-16 * SIZE, Y1
	decq	%rax
	jg	.L01
	ALIGN_3

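/*
 * Editorial note (inferred from the code): the .L01 loop above clears
 * the bounce buffer.  .L10 below dispatches on the alignment of A and,
 * under ALIGNED_ACCESS, branches to .L50 when LDA is an odd number of
 * elements (so every other column is misaligned).
 */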
.L10:

#ifdef ALIGNED_ACCESS
	leaq	SIZE(BUFFER), %rax
	testq	$SIZE, A
	cmovne	%rax, BUFFER

	testq	$SIZE, LDA
	jne	.L50
#endif

#if GEMV_UNROLL >= 8

	cmpq	$8, N
	jl	.L20
	ALIGN_3

.L11:
	subq	$8, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 4), A2
	leaq	(A, LDA, 8), A

#ifdef HAVE_SSE3
	movddup	(X), %xmm8
	addq	INCX, X
	movddup	(X), %xmm9
	addq	INCX, X
	movddup	(X), %xmm10
	addq	INCX, X
	movddup	(X), %xmm11
	addq	INCX, X
	movddup	(X), %xmm12
	addq	INCX, X
	movddup	(X), %xmm13
	addq	INCX, X
	movddup	(X), %xmm14
	addq	INCX, X
	movddup	(X), %xmm15
	addq	INCX, X

	movddup	ALPHA, %xmm0
#else
	movsd	(X), %xmm8
	unpcklpd %xmm8, %xmm8
	addq	INCX, X
	movsd	(X), %xmm9
	unpcklpd %xmm9, %xmm9
	addq	INCX, X
	movsd	(X), %xmm10
	unpcklpd %xmm10, %xmm10
	addq	INCX, X
	movsd	(X), %xmm11
	unpcklpd %xmm11, %xmm11
	addq	INCX, X
	movsd	(X), %xmm12
	unpcklpd %xmm12, %xmm12
	addq	INCX, X
	movsd	(X), %xmm13
	unpcklpd %xmm13, %xmm13
	addq	INCX, X
	movsd	(X), %xmm14
	unpcklpd %xmm14, %xmm14
	addq	INCX, X
	movsd	(X), %xmm15
	unpcklpd %xmm15, %xmm15
	addq	INCX, X

	movsd	ALPHA, %xmm0
	unpcklpd %xmm0, %xmm0
#endif

	mulpd	%xmm0, %xmm8
	mulpd	%xmm0, %xmm9
	mulpd	%xmm0, %xmm10
	mulpd	%xmm0, %xmm11
	mulpd	%xmm0, %xmm12
	mulpd	%xmm0, %xmm13
	mulpd	%xmm0, %xmm14
	mulpd	%xmm0, %xmm15

#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	je	.L1X

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A1, LDA), %xmm5
	movsd	-16 * SIZE(A1, LDA, 2), %xmm6
	movsd	-16 * SIZE(A1, LDA3), %xmm7

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm8, %xmm4
	addsd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	mulsd	%xmm9, %xmm5
	addsd	%xmm5, %xmm0
	movsd	-16 * SIZE(A2, LDA), %xmm5
	mulsd	%xmm10, %xmm6
	addsd	%xmm6, %xmm0
	movsd	-16 * SIZE(A2, LDA, 2), %xmm6
	mulsd	%xmm11, %xmm7
	addsd	%xmm7, %xmm0
	movsd	-16 * SIZE(A2, LDA3), %xmm7

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm5
	addsd	%xmm5, %xmm0
	mulsd	%xmm14, %xmm6
	addsd	%xmm6, %xmm0
	mulsd	%xmm15, %xmm7
	addsd	%xmm7, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)

	addq	$SIZE, A1
	addq	$SIZE, A2
	addq	$SIZE, Y1
	ALIGN_3

.L1X:
#endif

	movq	MM, I
	sarq	$3, I
	jle	.L15

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
	MOVUPS_A1(-12 * SIZE, A1, %xmm6)
	MOVUPS_A1(-10 * SIZE, A1, %xmm7)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	decq	I
	jle	.L14
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)

	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1)
#endif

	mulpd	%xmm9, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
	mulpd	%xmm9, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)

	mulpd	%xmm9, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6)
	mulpd	%xmm9, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2)
#endif

	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4)
	mulpd	%xmm10, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5)

	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6)
	mulpd	%xmm10, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3)
#endif

	mulpd	%xmm11, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	mulpd	%xmm11, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)

	mulpd	%xmm11, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
	mulpd	%xmm11, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A1(-10 * SIZE, A2, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
	mulpd	%xmm12, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1)
#endif

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6)
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2)
#endif

	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4)
	mulpd	%xmm14, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5)

	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6)
	mulpd	%xmm14, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3)
#endif

	mulpd	%xmm15, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
	mulpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A1( -6 * SIZE, A1, %xmm5)

	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1( -4 * SIZE, A1, %xmm6)
	mulpd	%xmm15, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A1( -2 * SIZE, A1, %xmm7)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)

	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)

	mulpd	%xmm9, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
	mulpd	%xmm9, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)

	mulpd	%xmm9, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6)
	mulpd	%xmm9, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7)

	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4)
	mulpd	%xmm10, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5)

	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6)
	mulpd	%xmm10, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7)

	mulpd	%xmm11, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	mulpd	%xmm11, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)

	mulpd	%xmm11, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
	mulpd	%xmm11, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A1(-10 * SIZE, A2, %xmm7)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
	mulpd	%xmm12, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6)
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7)

	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4)
	mulpd	%xmm14, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5)

	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6)
	mulpd	%xmm14, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7)

	mulpd	%xmm15, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	mulpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)

	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	mulpd	%xmm15, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L15:
	testq	$4, MM
	je	.L16

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6)
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)

	mulpd	%xmm9, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6)
	mulpd	%xmm9, %xmm7
	addpd	%xmm7, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7)

	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	mulpd	%xmm10, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)

	mulpd	%xmm11, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6)
	mulpd	%xmm11, %xmm7
	addpd	%xmm7, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6)
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7)

	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm14, %xmm5
	addpd	%xmm5, %xmm1

	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	mulpd	%xmm15, %xmm7
	addpd	%xmm7, %xmm1
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L16:
	testq	$2, MM
	je	.L17

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6)
	MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	mulpd	%xmm9, %xmm5
	addpd	%xmm5, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5)

	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6)
	mulpd	%xmm11, %xmm7
	addpd	%xmm7, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm0
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm15, %xmm7
	addpd	%xmm7, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, Y1
	ALIGN_3

.L17:
	testq	$1, MM
	je	.L18

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A1, LDA), %xmm5
	movsd	-16 * SIZE(A1, LDA, 2), %xmm6
	movsd	-16 * SIZE(A1, LDA3), %xmm7

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm8, %xmm4
	addsd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	mulsd	%xmm9, %xmm5
	addsd	%xmm5, %xmm0
	movsd	-16 * SIZE(A2, LDA), %xmm5
	mulsd	%xmm10, %xmm6
	addsd	%xmm6, %xmm0
	movsd	-16 * SIZE(A2, LDA, 2), %xmm6
	mulsd	%xmm11, %xmm7
	addsd	%xmm7, %xmm0
	movsd	-16 * SIZE(A2, LDA3), %xmm7

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm5
	addsd	%xmm5, %xmm0
	mulsd	%xmm14, %xmm6
	addsd	%xmm6, %xmm0
	mulsd	%xmm15, %xmm7
	addsd	%xmm7, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L18:
	cmpq	$8, N
	jge	.L11
	ALIGN_3

.L20:
#endif

#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L30

#if GEMV_UNROLL == 4
	ALIGN_3

.L21:
#endif

	subq	$4, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

#ifdef HAVE_SSE3
	movddup	(X), %xmm12
	addq	INCX, X
	movddup	(X), %xmm13
	addq	INCX, X
	movddup	(X), %xmm14
	addq	INCX, X
	movddup	(X), %xmm15
	addq	INCX, X

	movddup	ALPHA, %xmm0
#else
	movsd	(X), %xmm12
	unpcklpd %xmm12, %xmm12
	addq	INCX, X
	movsd	(X), %xmm13
	unpcklpd %xmm13, %xmm13
	addq	INCX, X
	movsd	(X), %xmm14
	unpcklpd %xmm14, %xmm14
	addq	INCX, X
	movsd	(X), %xmm15
	unpcklpd %xmm15, %xmm15
	addq	INCX, X

	movsd	ALPHA, %xmm0
	unpcklpd %xmm0, %xmm0
#endif

	mulpd	%xmm0, %xmm12
	mulpd	%xmm0, %xmm13
	mulpd	%xmm0, %xmm14
	mulpd	%xmm0, %xmm15

#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	je	.L2X

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A1, LDA), %xmm5
	movsd	-16 * SIZE(A2), %xmm6
	movsd	-16 * SIZE(A2, LDA), %xmm7

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm5
	addsd	%xmm5, %xmm0
	mulsd	%xmm14, %xmm6
	addsd	%xmm6, %xmm0
	mulsd	%xmm15, %xmm7
	addsd	%xmm7, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)

	addq	$SIZE, A1
	addq	$SIZE, A2
	addq	$SIZE, Y1
	ALIGN_3

.L2X:
#endif

	movq	MM, I
	sarq	$3, I
	jle	.L25

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
	MOVUPS_A1(-10 * SIZE, A1, %xmm3)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)

	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)

	decq	I
	jle	.L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_A1(-16 * SIZE, A2, %xmm0)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_A1(-14 * SIZE, A2, %xmm1)

	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_A1(-12 * SIZE, A2, %xmm2)
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_A1(-10 * SIZE, A2, %xmm3)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA)
#endif

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm8
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm9
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm10
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm11
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	mulpd	%xmm14, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
	mulpd	%xmm14, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_A1( -6 * SIZE, A1, %xmm1)

	mulpd	%xmm14, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
	mulpd	%xmm14, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_A1( -2 * SIZE, A1, %xmm3)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA)
#endif

	mulpd	%xmm15, %xmm4
	addpd	%xmm4, %xmm8
	MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4)
	mulpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm9
	MOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5)

	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm10
	MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6)
	mulpd	%xmm15, %xmm7
	addpd	%xmm7, %xmm11
	MOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L23
	ALIGN_3

.L24:
	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_A1(-16 * SIZE, A2, %xmm0)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_A1(-14 * SIZE, A2, %xmm1)

	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_A1(-12 * SIZE, A2, %xmm2)
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_A1(-10 * SIZE, A2, %xmm3)

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm8
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm9
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm10
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm11
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)

	mulpd	%xmm14, %xmm0
	addpd	%xmm0, %xmm8
	mulpd	%xmm14, %xmm1
	addpd	%xmm1, %xmm9

	mulpd	%xmm14, %xmm2
	addpd	%xmm2, %xmm10
	mulpd	%xmm14, %xmm3
	addpd	%xmm3, %xmm11

	mulpd	%xmm15, %xmm4
	addpd	%xmm4, %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)

	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm10
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	mulpd	%xmm15, %xmm7
	addpd	%xmm7, %xmm11
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L25:
	testq	$4, MM
	je	.L26

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9

	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm8
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm9

	MOVUPS_A1(-16 * SIZE, A2, %xmm0)
	MOVUPS_A1(-14 * SIZE, A2, %xmm1)

	mulpd	%xmm14, %xmm0
	addpd	%xmm0, %xmm8
	mulpd	%xmm14, %xmm1
	addpd	%xmm1, %xmm9

	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)

	mulpd	%xmm15, %xmm4
	addpd	%xmm4, %xmm8
	mulpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm9

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L26:
	testq	$2, MM
	je	.L27

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
	MOVUPS_A1(-16 * SIZE, A2, %xmm10)
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm13, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	%xmm14, %xmm10
	addpd	%xmm10, %xmm0
	mulpd	%xmm15, %xmm11
	addpd	%xmm11, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, Y1
	ALIGN_3

.L27:
	testq	$1, MM
#if GEMV_UNROLL == 4
	je	.L28
#else
	je	.L30
#endif

	movsd	-16 * SIZE(Y1), %xmm0

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A1, LDA), %xmm9
	movsd	-16 * SIZE(A2), %xmm10
	movsd	-16 * SIZE(A2, LDA), %xmm11

	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0
	mulsd	%xmm13, %xmm9
	addsd	%xmm9, %xmm0
	mulsd	%xmm14, %xmm10
	addsd	%xmm10, %xmm0
	mulsd	%xmm15, %xmm11
	addsd	%xmm11, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

#if GEMV_UNROLL == 4
.L28:
	cmpq	$4, N
	jge	.L21
	ALIGN_3

#endif

.L30:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L40

#if GEMV_UNROLL == 2
	ALIGN_3

.L31:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA), A2
	leaq	(A, LDA, 2), A

#ifdef HAVE_SSE3
	movddup	(X), %xmm12
	addq	INCX, X
	movddup	(X), %xmm13
	addq	INCX, X

	movddup	ALPHA, %xmm0
#else
	movsd	(X), %xmm12
	unpcklpd %xmm12, %xmm12
	addq	INCX, X
	movsd	(X), %xmm13
	unpcklpd %xmm13, %xmm13
	addq	INCX, X

	movsd	ALPHA, %xmm0
	unpcklpd %xmm0, %xmm0
#endif

	mulpd	%xmm0, %xmm12
	mulpd	%xmm0, %xmm13

#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	je	.L3X

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A2), %xmm5

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm5
	addsd	%xmm5, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)

	addq	$SIZE, A1
	addq	$SIZE, A2
	addq	$SIZE, Y1
	ALIGN_3

.L3X:
#endif

	movq	MM, I
	sarq	$3, I
	jle	.L35

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
	MOVUPS_A1(-10 * SIZE, A1, %xmm3)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)

	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
	MOVUPS_A1(-12 * SIZE, A2, %xmm6)
	MOVUPS_A1(-10 * SIZE, A2, %xmm7)

	decq	I
	jle	.L34
	ALIGN_3

.L33:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_A1( -6 * SIZE, A1, %xmm1)

	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_A1( -2 * SIZE, A1, %xmm3)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
#endif

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm8
	MOVUPS_A1( -8 * SIZE, A2, %xmm4)
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm9
	MOVUPS_A1( -6 * SIZE, A2, %xmm5)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm10
	MOVUPS_A1( -4 * SIZE, A2, %xmm6)
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm11
	MOVUPS_A1( -2 * SIZE, A2, %xmm7)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L33
	ALIGN_3

.L34:
	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm10
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm11
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L35:
	testq	$4, MM
	je	.L36


	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9

	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L36:
	testq	$2, MM
	je	.L37

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-16 * SIZE, A2, %xmm9)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm13, %xmm9
	addpd	%xmm9, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, Y1
	ALIGN_3

.L37:
	testq	$1, MM
#if GEMV_UNROLL == 2
	je	.L38
#else
	je	.L40
#endif

	movsd	-16 * SIZE(Y1), %xmm0

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm9

	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0
	mulsd	%xmm13, %xmm9
	addsd	%xmm9, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

#if GEMV_UNROLL == 2
.L38:
	cmpq	$2, N
	jge	.L31
	ALIGN_3

#endif

.L40:
	cmpq	$1, N
	jl	.L900
#endif

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1

#ifdef HAVE_SSE3
	movddup	(X), %xmm12
	addq	INCX, X

	movddup	ALPHA, %xmm0
#else
	movsd	(X), %xmm12
	unpcklpd %xmm12, %xmm12
	addq	INCX, X

	movsd	ALPHA, %xmm0
	unpcklpd %xmm0, %xmm0
#endif

	mulpd	%xmm0, %xmm12

#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	je	.L4X

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)

	addq	$SIZE, A1
	addq	$SIZE, Y1
	ALIGN_3

.L4X:
#endif

	movq	MM, I
	sarq	$3, I
	jle	.L45

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
	MOVUPS_A1(-10 * SIZE, A1, %xmm3)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)

	decq	I
	jle	.L44
	ALIGN_3

.L43:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_A1( -6 * SIZE, A1, %xmm1)

	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_A1( -2 * SIZE, A1, %xmm3)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L43
	ALIGN_3

.L44:
	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L45:
	testq	$4, MM
	je	.L46

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

.L46:
	testq	$2, MM
	je	.L47

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, Y1
	ALIGN_3

.L47:
	testq	$1, MM
	je	.L900

	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-16 * SIZE(A1), %xmm8

	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

#ifdef ALIGNED_ACCESS
	jmp	.L900
	ALIGN_3

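/*
 * Editorial note (inferred from the code): the .L50/.L60/.L70 blocks
 * below handle the ALIGNED_ACCESS case where LDA is an odd number of
 * doubles, so every other column of A is misaligned.  The odd columns
 * are loaded one element off (movhpd / -15 * SIZE offsets) and
 * re-paired with shufpd before the multiply-adds.
 */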
.L50:
#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L60
	ALIGN_3

.L51:

	subq	$4, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

#ifdef HAVE_SSE3
	movddup	(X), %xmm12
	addq	INCX, X
	movddup	(X), %xmm13
	addq	INCX, X
	movddup	(X), %xmm14
	addq	INCX, X
	movddup	(X), %xmm15
	addq	INCX, X

	movddup	ALPHA, %xmm0
#else
	movsd	(X), %xmm12
	unpcklpd %xmm12, %xmm12
	addq	INCX, X
	movsd	(X), %xmm13
	unpcklpd %xmm13, %xmm13
	addq	INCX, X
	movsd	(X), %xmm14
	unpcklpd %xmm14, %xmm14
	addq	INCX, X
	movsd	(X), %xmm15
	unpcklpd %xmm15, %xmm15
	addq	INCX, X

	movsd	ALPHA, %xmm0
	unpcklpd %xmm0, %xmm0
#endif

	mulpd	%xmm0, %xmm12
	mulpd	%xmm0, %xmm13
	mulpd	%xmm0, %xmm14
	mulpd	%xmm0, %xmm15

	testq	$SIZE, A
	je	.L5X

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A1, LDA), %xmm5
	movsd	-16 * SIZE(A2), %xmm6
	movsd	-16 * SIZE(A2, LDA), %xmm7

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm5
	addsd	%xmm5, %xmm0
	mulsd	%xmm14, %xmm6
	addsd	%xmm6, %xmm0
	mulsd	%xmm15, %xmm7
	addsd	%xmm7, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)

	addq	$SIZE, A1
	addq	$SIZE, A2
	addq	$SIZE, Y1
	ALIGN_3

.L5X:
	movhpd	-16 * SIZE(A1, LDA), %xmm8
	movhpd	-16 * SIZE(A2, LDA), %xmm9

	movq	MM, I
	sarq	$3, I
	jle	.L55

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
	MOVUPS_A1(-12 * SIZE, A1, %xmm6)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	decq	I
	jle	.L54
	ALIGN_3

.L53:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5)
	mulpd	%xmm12, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA)
#endif

	shufpd	$1, %xmm4, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
	shufpd	$1, %xmm5, %xmm4
	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm1
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)

	shufpd	$1, %xmm6, %xmm5
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm2
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
	shufpd	$1, %xmm8, %xmm6
	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A1(-12 * SIZE, A2, %xmm6)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-10 * SIZE, A2, %xmm7)
	mulpd	%xmm14, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4)

	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5)
	mulpd	%xmm14, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA)
#endif

	shufpd	$1, %xmm4, %xmm9
	mulpd	%xmm15, %xmm9
	addpd	%xmm9, %xmm0
	MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
	shufpd	$1, %xmm5, %xmm4
	mulpd	%xmm15, %xmm4
	addpd	%xmm4, %xmm1
	MOVUPS_A1( -8 * SIZE, A1, %xmm4)

	shufpd	$1, %xmm6, %xmm5
	mulpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm2
	MOVUPS_A1( -6 * SIZE, A1, %xmm5)
	shufpd	$1, %xmm9, %xmm6
	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A1( -4 * SIZE, A1, %xmm6)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L53
	ALIGN_3

.L54:
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5)
	mulpd	%xmm12, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6)

	shufpd	$1, %xmm4, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
	shufpd	$1, %xmm5, %xmm4
	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm1
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)

	shufpd	$1, %xmm6, %xmm5
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm2
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)
	shufpd	$1, %xmm8, %xmm6
	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A1(-12 * SIZE, A2, %xmm6)

	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-10 * SIZE, A2, %xmm7)
	mulpd	%xmm14, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4)

	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5)
	mulpd	%xmm14, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6)

	shufpd	$1, %xmm4, %xmm9
	mulpd	%xmm15, %xmm9
	addpd	%xmm9, %xmm0
	MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)

	shufpd	$1, %xmm5, %xmm4
	mulpd	%xmm15, %xmm4
	addpd	%xmm4, %xmm1
	shufpd	$1, %xmm6, %xmm5
	mulpd	%xmm15, %xmm5
	addpd	%xmm5, %xmm2
	shufpd	$1, %xmm9, %xmm6
	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L55:
	testq	$4, MM
	je	.L56

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1

	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6)
	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7)

	shufpd	$1, %xmm6, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movaps	%xmm7, %xmm8
	shufpd	$1, %xmm7, %xmm6
	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm1

	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	MOVUPS_A1(-14 * SIZE, A2, %xmm5)

	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm14, %xmm5
	addpd	%xmm5, %xmm1

	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6)
	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7)

	shufpd	$1, %xmm6, %xmm9
	mulpd	%xmm15, %xmm9
	addpd	%xmm9, %xmm0
	movaps	%xmm7, %xmm9
	shufpd	$1, %xmm7, %xmm6
	mulpd	%xmm15, %xmm6
	addpd	%xmm6, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L56:
	testq	$2, MM
	je	.L57

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
	MOVUPS_A1(-16 * SIZE, A2, %xmm6)
	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movaps	%xmm5, %xmm8
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm0
	shufpd	$1, %xmm7, %xmm9
	mulpd	%xmm15, %xmm9
	addpd	%xmm9, %xmm0
	movaps	%xmm7, %xmm9

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, Y1
	ALIGN_3

.L57:
	testq	$1, MM
	je	.L58

	movsd	-16 * SIZE(Y1), %xmm0

	movsd	-16 * SIZE(A1), %xmm4
	shufpd	$1, %xmm8, %xmm8
	movsd	-16 * SIZE(A2), %xmm6
	shufpd	$1, %xmm9, %xmm9

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm8
	addsd	%xmm8, %xmm0
	mulsd	%xmm14, %xmm6
	addsd	%xmm6, %xmm0
	mulsd	%xmm15, %xmm9
	addsd	%xmm9, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L58:
	cmpq	$4, N
	jge	.L51
	ALIGN_3

.L60:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L70

#if GEMV_UNROLL == 2
	ALIGN_3

.L61:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA), A2
	leaq	(A, LDA, 2), A

#ifdef HAVE_SSE3
	movddup	(X), %xmm12
	addq	INCX, X
	movddup	(X), %xmm13
	addq	INCX, X

	movddup	ALPHA, %xmm0
#else
	movsd	(X), %xmm12
	unpcklpd %xmm12, %xmm12
	addq	INCX, X
	movsd	(X), %xmm13
	unpcklpd %xmm13, %xmm13
	addq	INCX, X

	movsd	ALPHA, %xmm0
	unpcklpd %xmm0, %xmm0
#endif

	mulpd	%xmm0, %xmm12
	mulpd	%xmm0, %xmm13

	testq	$SIZE, A
	je	.L6X

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A2), %xmm5

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm5
	addsd	%xmm5, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)

	addq	$SIZE, A1
	addq	$SIZE, A2
	addq	$SIZE, Y1
	ALIGN_3

.L6X:
	movhpd	-16 * SIZE(A2), %xmm8

	movq	MM, I
	sarq	$3, I
	jle	.L65

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
	MOVUPS_A1(-12 * SIZE, A1, %xmm6)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	decq	I
	jle	.L64
	ALIGN_3

.L63:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A1(-15 * SIZE, A2, %xmm4)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-13 * SIZE, A2, %xmm5)
	mulpd	%xmm12, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A1(-11 * SIZE, A2, %xmm6)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2)
#endif

	shufpd	$1, %xmm4, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1( -9 * SIZE, A2, %xmm8)
	shufpd	$1, %xmm5, %xmm4
	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm1
	MOVUPS_A1( -8 * SIZE, A1, %xmm4)

	shufpd	$1, %xmm6, %xmm5
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm2
	MOVUPS_A1( -6 * SIZE, A1, %xmm5)
	shufpd	$1, %xmm8, %xmm6
	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A1( -4 * SIZE, A1, %xmm6)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L63
	ALIGN_3

.L64:
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm7)
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A1(-15 * SIZE, A2, %xmm4)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-13 * SIZE, A2, %xmm5)
	mulpd	%xmm12, %xmm7
	addpd	%xmm7, %xmm3
	MOVUPS_A1(-11 * SIZE, A2, %xmm6)

	shufpd	$1, %xmm4, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1( -9 * SIZE, A2, %xmm8)
	shufpd	$1, %xmm5, %xmm4
	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm1

	shufpd	$1, %xmm6, %xmm5
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm2
	shufpd	$1, %xmm8, %xmm6
	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L65:
	testq	$4, MM
	je	.L66


	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm12, %xmm5
	addpd	%xmm5, %xmm1

	MOVUPS_A1(-15 * SIZE, A2, %xmm6)
	MOVUPS_A1(-13 * SIZE, A2, %xmm7)

	shufpd	$1, %xmm6, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movaps	%xmm7, %xmm8
	shufpd	$1, %xmm7, %xmm6
	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L66:
	testq	$2, MM
	je	.L67

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-15 * SIZE, A2, %xmm5)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movaps	%xmm5, %xmm8

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, Y1
	ALIGN_3

.L67:
	testq	$1, MM
#if GEMV_UNROLL == 2
	je	.L68
#else
	je	.L70
#endif

	movsd	-16 * SIZE(Y1), %xmm0

	movsd	-16 * SIZE(A1), %xmm4
	shufpd	$1, %xmm8, %xmm8

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	mulsd	%xmm13, %xmm8
	addsd	%xmm8, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

#if GEMV_UNROLL == 2
.L68:
	cmpq	$2, N
	jge	.L61
	ALIGN_3

#endif

.L70:
	cmpq	$1, N
	jl	.L900

#endif

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1

#ifdef HAVE_SSE3
	movddup	(X), %xmm12
	addq	INCX, X

	movddup	ALPHA, %xmm0
#else
	movsd	(X), %xmm12
	unpcklpd %xmm12, %xmm12
	addq	INCX, X

	movsd	ALPHA, %xmm0
	unpcklpd %xmm0, %xmm0
#endif

	mulpd	%xmm0, %xmm12

	testq	$SIZE, A
	je	.L7X

	movsd	-16 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)

	addq	$SIZE, A1
	addq	$SIZE, Y1
	ALIGN_3

.L7X:

	movq	MM, I
	sarq	$3, I
	jle	.L75

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)
	MOVUPS_A1(-12 * SIZE, A1, %xmm2)
	MOVUPS_A1(-10 * SIZE, A1, %xmm3)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm11)

	decq	I
	jle	.L74
	ALIGN_3

.L73:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_A1( -8 * SIZE, A1, %xmm0)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_A1( -6 * SIZE, A1, %xmm1)

	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_A1( -4 * SIZE, A1, %xmm2)
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_A1( -2 * SIZE, A1, %xmm3)

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L73
	ALIGN_3

.L74:
	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
	mulpd	%xmm12, %xmm2
	addpd	%xmm2, %xmm10
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
	mulpd	%xmm12, %xmm3
	addpd	%xmm3, %xmm11
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm11)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L75:
	testq	$4, MM
	je	.L76

	MOVUPS_A1(-16 * SIZE, A1, %xmm0)
	MOVUPS_A1(-14 * SIZE, A1, %xmm1)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm9)

	mulpd	%xmm12, %xmm0
	addpd	%xmm0, %xmm8
	MOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
	mulpd	%xmm12, %xmm1
	addpd	%xmm1, %xmm9
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm9)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

.L76:
	testq	$2, MM
	je	.L77

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)

	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, Y1
	ALIGN_3

.L77:
	testq	$1, MM
	je	.L900

	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-16 * SIZE(A1), %xmm8

	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
#endif
	ALIGN_3

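/*
 * Editorial note (inferred from the code): .L900 below folds the
 * accumulated BUFFER back into the caller's y vector.  .L910/.L920
 * handle unit-stride y (aligned vs. misaligned buffer, the latter
 * re-paired with shufpd), while .L950/.L960 handle a general INCY
 * stride element by element.
 */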
.L900:
#ifndef COPY_FORCE
	cmpq	Y, BUFFER
	je	.L999
#endif
	movq	M, TMP_M
	movq	Y, Y1

	cmpq	$SIZE, INCY
	jne	.L950

	testq	$SIZE, Y1
	je	.L910

	movsd	(Y1), %xmm0
	addsd	(BUFFER), %xmm0
	movsd	%xmm0, (Y1)

	addq	$SIZE, Y1
	addq	$SIZE, BUFFER

	decq	TMP_M
	jle	.L999
	ALIGN_4

.L910:
	testq	$SIZE, BUFFER
	jne	.L920

	movq	TMP_M, %rax
	sarq	$3, %rax
	jle	.L914
	ALIGN_3

.L912:
#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif

	movapd	0 * SIZE(Y1), %xmm0
	movapd	2 * SIZE(Y1), %xmm1
	movapd	4 * SIZE(Y1), %xmm2
	movapd	6 * SIZE(Y1), %xmm3

	movapd	0 * SIZE(BUFFER), %xmm4
	movapd	2 * SIZE(BUFFER), %xmm5
	movapd	4 * SIZE(BUFFER), %xmm6
	movapd	6 * SIZE(BUFFER), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 + PREOFFSET(BUFFER)
#endif

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1
	addpd	%xmm6, %xmm2
	addpd	%xmm7, %xmm3

	movapd	%xmm0, 0 * SIZE(Y1)
	movapd	%xmm1, 2 * SIZE(Y1)
	movapd	%xmm2, 4 * SIZE(Y1)
	movapd	%xmm3, 6 * SIZE(Y1)

	addq	$8 * SIZE, Y1
	addq	$8 * SIZE, BUFFER

	decq	%rax
	jg	.L912
	ALIGN_3

.L914:
	testq	$7, TMP_M
	jle	.L999

	testq	$4, TMP_M
	jle	.L915

	movapd	0 * SIZE(Y1), %xmm0
	movapd	2 * SIZE(Y1), %xmm1

	movapd	0 * SIZE(BUFFER), %xmm4
	movapd	2 * SIZE(BUFFER), %xmm5

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1

	movapd	%xmm0, 0 * SIZE(Y1)
	movapd	%xmm1, 2 * SIZE(Y1)

	addq	$4 * SIZE, Y1
	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L915:
	testq	$2, TMP_M
	jle	.L916

	movapd	(Y1), %xmm0

	movapd	(BUFFER), %xmm4

	addpd	%xmm4, %xmm0

	movapd	%xmm0, (Y1)

	addq	$2 * SIZE, Y1
	addq	$2 * SIZE, BUFFER
	ALIGN_3

.L916:
	testq	$1, TMP_M
	jle	.L999

	movsd	(Y1), %xmm0

	movsd	0 * SIZE(BUFFER), %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

	jmp	.L999
	ALIGN_4

.L920:
	movapd	-1 * SIZE(BUFFER), %xmm4

	movq	TMP_M, %rax
	sarq	$3, %rax
	jle	.L924
	ALIGN_3

.L922:
#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif

	movapd	0 * SIZE(Y1), %xmm0
	movapd	2 * SIZE(Y1), %xmm1
	movapd	4 * SIZE(Y1), %xmm2
	movapd	6 * SIZE(Y1), %xmm3

	movapd	1 * SIZE(BUFFER), %xmm5
	movapd	3 * SIZE(BUFFER), %xmm6
	movapd	5 * SIZE(BUFFER), %xmm7
	movapd	7 * SIZE(BUFFER), %xmm8

	shufpd	$1, %xmm5, %xmm4
	shufpd	$1, %xmm6, %xmm5
	shufpd	$1, %xmm7, %xmm6
	shufpd	$1, %xmm8, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 + PREOFFSET(BUFFER)
#endif

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1
	addpd	%xmm6, %xmm2
	addpd	%xmm7, %xmm3

	movapd	%xmm0, 0 * SIZE(Y1)
	movapd	%xmm1, 2 * SIZE(Y1)
	movapd	%xmm2, 4 * SIZE(Y1)
	movapd	%xmm3, 6 * SIZE(Y1)

	movapd	%xmm8, %xmm4

	addq	$8 * SIZE, Y1
	addq	$8 * SIZE, BUFFER

	decq	%rax
	jg	.L922
	ALIGN_3

.L924:
	testq	$7, TMP_M
	jle	.L999

	testq	$4, TMP_M
	jle	.L925

	movapd	0 * SIZE(Y1), %xmm0
	movapd	2 * SIZE(Y1), %xmm1

	movapd	1 * SIZE(BUFFER), %xmm5
	movapd	3 * SIZE(BUFFER), %xmm6

	shufpd	$1, %xmm5, %xmm4
	shufpd	$1, %xmm6, %xmm5

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1

	movapd	%xmm0, 0 * SIZE(Y1)
	movapd	%xmm1, 2 * SIZE(Y1)

	movapd	%xmm6, %xmm4

	addq	$4 * SIZE, Y1
	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L925:
	testq	$2, TMP_M
	jle	.L926

	movapd	(Y1), %xmm0

	movapd	1 * SIZE(BUFFER), %xmm5

	shufpd	$1, %xmm5, %xmm4

	addpd	%xmm4, %xmm0

	movapd	%xmm0, (Y1)

	movaps	%xmm5, %xmm4

	addq	$2 * SIZE, Y1
	addq	$2 * SIZE, BUFFER
	ALIGN_3

.L926:
	testq	$1, TMP_M
	jle	.L999

	movsd	(Y1), %xmm0

	shufpd	$1, %xmm4, %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

	jmp	.L999
	ALIGN_4

.L950:
	testq	$SIZE, BUFFER
	je	.L960

	movsd	(Y1), %xmm0
	addsd	(BUFFER), %xmm0
	movsd	%xmm0, (Y1)

	addq	INCY, Y1
	addq	$SIZE, BUFFER

	decq	TMP_M
	jle	.L999
	ALIGN_4

.L960:
	movq	Y1, Y2

	movq	TMP_M, %rax
	sarq	$3, %rax
	jle	.L964
	ALIGN_3

.L962:
	movsd	(Y2), %xmm0
	addq	INCY, Y2
	movhpd	(Y2), %xmm0
	addq	INCY, Y2

	movapd	0 * SIZE(BUFFER), %xmm4

	movsd	(Y2), %xmm1
	addq	INCY, Y2
	movhpd	(Y2), %xmm1
	addq	INCY, Y2

	movapd	2 * SIZE(BUFFER), %xmm5

	movsd	(Y2), %xmm2
	addq	INCY, Y2
	movhpd	(Y2), %xmm2
	addq	INCY, Y2

	movapd	4 * SIZE(BUFFER), %xmm6

	addpd	%xmm4, %xmm0

	movsd	(Y2), %xmm3
	addq	INCY, Y2
	movhpd	(Y2), %xmm3
	addq	INCY, Y2

	movapd	6 * SIZE(BUFFER), %xmm7

	addpd	%xmm5, %xmm1

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1

	addpd	%xmm6, %xmm2

	movlpd	%xmm1, (Y1)
	addq	INCY, Y1
	movhpd	%xmm1, (Y1)
	addq	INCY, Y1

	addpd	%xmm7, %xmm3

	movlpd	%xmm2, (Y1)
	addq	INCY, Y1
	movhpd	%xmm2, (Y1)
	addq	INCY, Y1
	movlpd	%xmm3, (Y1)
	addq	INCY, Y1
	movhpd	%xmm3, (Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	decq	%rax
	jg	.L962
	ALIGN_3

.L964:
	testq	$7, TMP_M
	jle	.L999

	testq	$4, TMP_M
	jle	.L965

	movsd	(Y2), %xmm0
	addq	INCY, Y2
	movhpd	(Y2), %xmm0
	addq	INCY, Y2

	movapd	0 * SIZE(BUFFER), %xmm4

	movsd	(Y2), %xmm1
	addq	INCY, Y2
	movhpd	(Y2), %xmm1
	addq	INCY, Y2

	movapd	2 * SIZE(BUFFER), %xmm5

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm1

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1
	movlpd	%xmm1, (Y1)
	addq	INCY, Y1
	movhpd	%xmm1, (Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L965:
	testq	$2, TMP_M
	jle	.L966

	movsd	(Y2), %xmm0
	addq	INCY, Y2
	movhpd	(Y2), %xmm0
	addq	INCY, Y2

	movapd	0 * SIZE(BUFFER), %xmm4

	addpd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1

	addq	$2 * SIZE, BUFFER
	ALIGN_3

.L966:
	testq	$1, TMP_M
	jle	.L999

	movsd	(Y2), %xmm0

	movsd	0 * SIZE(BUFFER), %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3

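/*
 * Editorial note (inferred from the code): .L999 advances the saved A
 * and y pointers past the 2^21-row block just processed and jumps back
 * to .L0t for the next block; .L999x restores the callee-saved
 * registers and returns.
 */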
.L999:
	leaq	(, M, SIZE), %rax
	addq	%rax, AA
	movq	STACK_INCY, INCY
	imulq	INCY, %rax
	addq	%rax, Y
	jmp	.L0t
	ALIGN_4

.L999x:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp

	ret
	EPILOGUE