/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
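/* ZGEMV kernel (no-transpose), SSE2, x86-64:  y += alpha * A * x for
   double-precision complex data.  Overview (inferred from the code and
   the macros in common.h / l2param.h):
     - alpha * A * x is first accumulated into the 16-byte-aligned
       scratch area BUFFER (cleared in .L01), four / two / one column
       of A at a time depending on GEMV_UNROLL;
     - the buffered result is then added into the strided vector Y in
       a final pass (.L980 for aligned Y, .L990 for unaligned Y);
     - CONJ / XCONJ select the conjugation variant and are folded into
       the SUBPD macro (subpd or addpd) and the sign masks below. */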

#define ASSEMBLER
#include "common.h"
#include "l2param.h"

#ifndef WINDOWS_ABI

#define STACKSIZE	64

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define OLD_BUFFER	32 + STACKSIZE(%rsp)
#define ALPHA_R		48	      (%rsp)
#define ALPHA_I		56	      (%rsp)

#define M	%rdi
#define N	%rsi
#define A	%rcx
#define LDA	%r8
#define X	%r9
#define INCX	%rdx
#define Y	%rbp
#define INCY	%r10

#else

#define STACKSIZE	256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_LDA		56 + STACKSIZE(%rsp)
#define OLD_X		64 + STACKSIZE(%rsp)
#define OLD_INCX	72 + STACKSIZE(%rsp)
#define OLD_Y		80 + STACKSIZE(%rsp)
#define OLD_INCY	88 + STACKSIZE(%rsp)
#define OLD_BUFFER	96 + STACKSIZE(%rsp)
#define ALPHA_R		224	      (%rsp)
#define ALPHA_I		232	      (%rsp)

#define M	%rcx
#define N	%rdx
#define A	%r8
#define LDA	%r9
#define X	%rdi
#define INCX	%rsi
#define Y	%rbp
#define INCY	%r10

#endif

#define I	%rax
#define A1	%r12
#define A2	%r13

#define Y1	%r14
#define BUFFER	%r15

#define J	%r11

#undef SUBPD

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPD	subpd
#else
#define SUBPD	addpd
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,   48(%rsp)
	movq	%rsi,   56(%rsp)
	movups	%xmm6,  64(%rsp)
	movups	%xmm7,  80(%rsp)
	movups	%xmm8,  96(%rsp)
	movups	%xmm9, 112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,   A
	movq	OLD_LDA, LDA
	movq	OLD_X,   X

	movapd	%xmm3, %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#endif

	movq	OLD_INCX,   INCX
	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, LDA
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	movlpd	%xmm0, ALPHA_R
	movlpd	%xmm1, ALPHA_I

	subq	$-16 * SIZE, A

	testq	M, M
	jle	.L999
	testq	N, N
	jle	.L999
	ALIGN_3

	movq	BUFFER, Y1

	pxor	%xmm4, %xmm4

	movq	M,  %rax
	addq	$8, %rax
	sarq	$3, %rax
	ALIGN_3

.L01:
	movapd	%xmm4,  0 * SIZE(Y1)
	movapd	%xmm4,  2 * SIZE(Y1)
	movapd	%xmm4,  4 * SIZE(Y1)
	movapd	%xmm4,  6 * SIZE(Y1)
	movapd	%xmm4,  8 * SIZE(Y1)
	movapd	%xmm4, 10 * SIZE(Y1)
	movapd	%xmm4, 12 * SIZE(Y1)
	movapd	%xmm4, 14 * SIZE(Y1)

	subq	$-16 * SIZE, Y1
	decq	%rax
	jg	.L01
	ALIGN_3

.L10:
#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	jne	.L100
#endif

#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

	movsd	0 * SIZE(X), %xmm8
	movhpd	1 * SIZE(X), %xmm8
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm10
	movhpd	1 * SIZE(X), %xmm10
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm14
	movhpd	1 * SIZE(X), %xmm14
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0xc0, %xmm5, %xmm5
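/* The pcmpeqb/psllq/shufps sequence above leaves 0x8000000000000000 in
   the upper quadword of %xmm5 only, i.e. a mask that flips the sign of
   the imaginary half of a packed (real, imag) double pair.  The block
   below uses it to form alpha * x[j] for the four columns: each x value
   is swapped (pshufd $0x4e), sign-adjusted, multiplied by broadcast
   ALPHA_R / ALPHA_I and combined, then split into a broadcast real part
   (%xmm8/10/12/14) and a swapped, sign-adjusted imaginary part
   (%xmm9/11/13/15).  With these, the main loops perform one complex
   multiply-add per element with just pshufd + two mulpd + addpd/SUBPD. */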

	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm6
	movddup	ALPHA_I, %xmm7
#else
	movsd	ALPHA_R, %xmm6
	unpcklpd %xmm6, %xmm6
	movsd	ALPHA_I, %xmm7
	unpcklpd %xmm7, %xmm7
#endif

	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
	xorpd	%xmm5, %xmm13
	xorpd	%xmm5, %xmm15

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm6, %xmm14
	mulpd	%xmm7, %xmm15

#ifndef XCONJ
	subpd	%xmm9,  %xmm8
	subpd	%xmm11, %xmm10
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0xee, %xmm8, %xmm9
	pshufd	$0x44, %xmm8, %xmm8

	pshufd	$0xee, %xmm10, %xmm11
	pshufd	$0x44, %xmm10, %xmm10

	pshufd	$0xee, %xmm12, %xmm13
	pshufd	$0x44, %xmm12, %xmm12

	pshufd	$0xee, %xmm14, %xmm15
	pshufd	$0x44, %xmm14, %xmm14

#ifndef CONJ
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
	xorpd	%xmm5, %xmm13
	xorpd	%xmm5, %xmm15
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
	xorpd	%xmm5, %xmm12
	xorpd	%xmm5, %xmm14
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
	ALIGN_3

	movq	M, I
	sarq	$2, I
	jle	.L15

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)

	decq	I
	jle	.L14
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1(-10 * SIZE, A1, %xmm6)

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm2
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6)

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm2
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-12 * SIZE, A2, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1(-10 * SIZE, A2, %xmm6)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6)

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm2
	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A1( -6 * SIZE, A1, %xmm6)

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L13
	ALIGN_3
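/* .L14: tail of the software-pipelined four-column loop; the same
   arithmetic as .L13, but the final group omits loads past the end. */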
.L14:
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1(-10 * SIZE, A1, %xmm6)

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm2
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6)

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm2
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-12 * SIZE, A2, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1(-10 * SIZE, A2, %xmm6)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm3
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6)

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm2
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm3

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L15:
	testq	$2, M
	je	.L17

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L17:
	testq	$1, M
	je	.L19

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6)

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6)
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm0

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	ALIGN_3

.L19:
	cmpq	$4, N
	jge	.L11
	ALIGN_3

.L20:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L30

#if GEMV_UNROLL == 2
	ALIGN_3

.L21:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm14
	movhpd	1 * SIZE(X), %xmm14
	addq	INCX, X

	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0,  %xmm11, %xmm11

	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm8
	movddup	ALPHA_I, %xmm9
#else
	movsd	ALPHA_R, %xmm8
	unpcklpd %xmm8, %xmm8
	movsd	ALPHA_I, %xmm9
	unpcklpd %xmm9, %xmm9
#endif

	xorpd	%xmm11, %xmm13
	xorpd	%xmm11, %xmm15

	mulpd	%xmm8, %xmm12
	mulpd	%xmm9, %xmm13
	mulpd	%xmm8, %xmm14
	mulpd	%xmm9, %xmm15

#ifndef XCONJ
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0xee, %xmm12, %xmm13
	pshufd	$0x44, %xmm12, %xmm12

	pshufd	$0xee, %xmm14, %xmm15
	pshufd	$0x44, %xmm14, %xmm14

#ifndef CONJ
	xorpd	%xmm11, %xmm13
	xorpd	%xmm11, %xmm15
#else
	xorpd	%xmm11, %xmm12
	xorpd	%xmm11, %xmm14
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
	ALIGN_3

	movq	M, I
	sarq	$2, I
	jle	.L25

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	MOVUPS_A1(-10 * SIZE, A1, %xmm10)

	decq	I
	jle	.L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3
	MOVUPS_A1(-10 * SIZE, A2, %xmm10)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1( -6 * SIZE, A1, %xmm6)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm14, %xmm8
	addpd	%xmm8, %xmm2
	MOVUPS_A1( -4 * SIZE, A1, %xmm8)
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm14, %xmm10
	addpd	%xmm10, %xmm3
	MOVUPS_A1( -2 * SIZE, A1, %xmm10)

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm15, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm15, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L23
	ALIGN_3
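/* .L24: tail of the two-column main loop; identical arithmetic to .L23
   without loading operands for a further iteration. */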
.L24:
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3
	MOVUPS_A1(-10 * SIZE, A2, %xmm10)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm14, %xmm8
	addpd	%xmm8, %xmm2
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm14, %xmm10
	addpd	%xmm10, %xmm3

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm15, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm15, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L25:
	testq	$2, M
	je	.L27

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	MOVUPS_A1(-14 * SIZE, A2, %xmm10)

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm14, %xmm8
	addpd	%xmm8, %xmm0
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm14, %xmm10
	addpd	%xmm10, %xmm1

	mulpd	%xmm15, %xmm9
	SUBPD	%xmm9, %xmm0
	mulpd	%xmm15, %xmm11
	SUBPD	%xmm11, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L27:
	testq	$1, M
#if GEMV_UNROLL == 2
	je	.L29
#else
	je	.L30
#endif

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-16 * SIZE, A2, %xmm6)

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 2
	ALIGN_3

.L29:
	cmpq	$2, N
	jge	.L21
#endif
	ALIGN_3

.L30:
#endif

	cmpq	$1, N
	jl	.L980
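/* Last remaining column (N == 1): a single scaled x value in
   %xmm12/%xmm13 drives the same multiply-accumulate pattern. */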
#if GEMV_UNROLL == 1
.L31:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X

	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0,  %xmm11, %xmm11

	pshufd	$0x4e, %xmm12, %xmm13

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm8
	movddup	ALPHA_I, %xmm9
#else
	movsd	ALPHA_R, %xmm8
	unpcklpd %xmm8, %xmm8
	movsd	ALPHA_I, %xmm9
	unpcklpd %xmm9, %xmm9
#endif

	xorpd	%xmm11, %xmm13

	mulpd	%xmm8, %xmm12
	mulpd	%xmm9, %xmm13

#ifndef XCONJ
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm13, %xmm12
#endif

	pshufd	$0xee, %xmm12, %xmm13
	pshufd	$0x44, %xmm12, %xmm12

#ifndef CONJ
	xorpd	%xmm11, %xmm13
#else
	xorpd	%xmm11, %xmm12
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	movq	M, I
	sarq	$2, I
	jle	.L35

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	MOVUPS_A1(-10 * SIZE, A1, %xmm10)

	decq	I
	jle	.L34
	ALIGN_3

.L33:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm4)
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	MOVUPS_A1( -6 * SIZE, A1, %xmm6)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	MOVUPS_A1( -4 * SIZE, A1, %xmm8)
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3
	MOVUPS_A1( -2 * SIZE, A1, %xmm10)

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L33
	ALIGN_3

.L34:
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1
	ALIGN_3
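/* M % 4 remainder: two complex elements at .L35, then one at .L37. */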
.L35:
	testq	$2, M
	je	.L37

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

.L37:
	testq	$1, M
#if GEMV_UNROLL == 1
	je	.L39
#else
	je	.L980
#endif

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 1
	ALIGN_3
.L39:
	cmpq	$1, N
	jge	.L31
#endif

#ifdef ALIGNED_ACCESS

	jmp	.L980
	ALIGN_3

.L100:
#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L110
	ALIGN_3

.L101:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

	movsd	0 * SIZE(X), %xmm8
	movhpd	1 * SIZE(X), %xmm8
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm10
	movhpd	1 * SIZE(X), %xmm10
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm14
	movhpd	1 * SIZE(X), %xmm14
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0xc0, %xmm5, %xmm5

	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm6
	movddup	ALPHA_I, %xmm7
#else
	movsd	ALPHA_R, %xmm6
	unpcklpd %xmm6, %xmm6
	movsd	ALPHA_I, %xmm7
	unpcklpd %xmm7, %xmm7
#endif

	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
	xorpd	%xmm5, %xmm13
	xorpd	%xmm5, %xmm15

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm6, %xmm14
	mulpd	%xmm7, %xmm15

#ifndef XCONJ
	subpd	%xmm9,  %xmm8
	subpd	%xmm11, %xmm10
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0xee, %xmm8, %xmm9
	pshufd	$0x44, %xmm8, %xmm8

	pshufd	$0xee, %xmm10, %xmm11
	pshufd	$0x44, %xmm10, %xmm10

	pshufd	$0xee, %xmm12, %xmm13
	pshufd	$0x44, %xmm12, %xmm12

	pshufd	$0xee, %xmm14, %xmm15
	pshufd	$0x44, %xmm14, %xmm14

#ifndef CONJ
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
	xorpd	%xmm5, %xmm13
	xorpd	%xmm5, %xmm15
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
	xorpd	%xmm5, %xmm12
	xorpd	%xmm5, %xmm14
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
	ALIGN_3

	movq	M, I
	sarq	$2, I
	jle	.L105

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6

	decq	I
	jle	.L104
	ALIGN_3
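/* .L103: four-column kernel for the ALIGNED_ACCESS path taken when A
   is not 16-byte aligned; each 128-bit operand is assembled from two
   64-bit halves with movsd/movhpd instead of an aligned load. */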
.L103:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1), %xmm4
	movhpd	-11 * SIZE(A1), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A1), %xmm6
	movhpd	 -9 * SIZE(A1), %xmm6

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm3
	movsd	-14 * SIZE(A1, LDA), %xmm6
	movhpd	-13 * SIZE(A1, LDA), %xmm6

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm6
	movhpd	 -9 * SIZE(A1, LDA), %xmm6

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm2
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm3
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A2), %xmm4
	movhpd	-11 * SIZE(A2), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm4
	movhpd	-15 * SIZE(A2, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm3
	movsd	-14 * SIZE(A2, LDA), %xmm6
	movhpd	-13 * SIZE(A2, LDA), %xmm6

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A2, LDA), %xmm4
	movhpd	-11 * SIZE(A2, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A2, LDA), %xmm6
	movhpd	 -9 * SIZE(A2, LDA), %xmm6

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm2
	movsd	-8 * SIZE(A1), %xmm4
	movhpd	-7 * SIZE(A1), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm3
	movsd	-6 * SIZE(A1), %xmm6
	movhpd	-5 * SIZE(A1), %xmm6

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L103
	ALIGN_3

.L104:
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1), %xmm4
	movhpd	-11 * SIZE(A1), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A1), %xmm6
	movhpd	 -9 * SIZE(A1), %xmm6

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm3
	movsd	-14 * SIZE(A1, LDA), %xmm6
	movhpd	-13 * SIZE(A1, LDA), %xmm6

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm6
	movhpd	 -9 * SIZE(A1, LDA), %xmm6

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm2
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm3
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A2), %xmm4
	movhpd	-11 * SIZE(A2), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm4
	movhpd	-15 * SIZE(A2, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm3
	movsd	-14 * SIZE(A2, LDA), %xmm6
	movhpd	-13 * SIZE(A2, LDA), %xmm6

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A2, LDA), %xmm4
	movhpd	-11 * SIZE(A2, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-10 * SIZE(A2, LDA), %xmm6
	movhpd	 -9 * SIZE(A2, LDA), %xmm6

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm2
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm3

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm2
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L105:
	testq	$2, M
	je	.L107

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-14 * SIZE(A1, LDA), %xmm6
	movhpd	-13 * SIZE(A1, LDA), %xmm6

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm10, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2, LDA), %xmm4
	movhpd	-15 * SIZE(A2, LDA), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-14 * SIZE(A2, LDA), %xmm6
	movhpd	-13 * SIZE(A2, LDA), %xmm6

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L107:
	testq	$1, M
	je	.L109

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A1, LDA), %xmm6
	movhpd	-15 * SIZE(A1, LDA), %xmm6

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm10, %xmm6
	addpd	%xmm6, %xmm0
	movsd	-16 * SIZE(A2, LDA), %xmm6
	movhpd	-15 * SIZE(A2, LDA), %xmm6
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7, %xmm0

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	ALIGN_3

.L109:
	cmpq	$4, N
	jge	.L101
	ALIGN_3

.L110:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L120

#if GEMV_UNROLL == 2
	ALIGN_3

.L111:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm14
	movhpd	1 * SIZE(X), %xmm14
	addq	INCX, X

	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0,  %xmm11, %xmm11

	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm8
	movddup	ALPHA_I, %xmm9
#else
	movsd	ALPHA_R, %xmm8
	unpcklpd %xmm8, %xmm8
	movsd	ALPHA_I, %xmm9
	unpcklpd %xmm9, %xmm9
#endif

	xorpd	%xmm11, %xmm13
	xorpd	%xmm11, %xmm15

	mulpd	%xmm8, %xmm12
	mulpd	%xmm9, %xmm13
	mulpd	%xmm8, %xmm14
	mulpd	%xmm9, %xmm15

#ifndef XCONJ
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0xee, %xmm12, %xmm13
	pshufd	$0x44, %xmm12, %xmm12

	pshufd	$0xee, %xmm14, %xmm15
	pshufd	$0x44, %xmm14, %xmm14

#ifndef CONJ
	xorpd	%xmm11, %xmm13
	xorpd	%xmm11, %xmm15
#else
	xorpd	%xmm11, %xmm12
	xorpd	%xmm11, %xmm14
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
	ALIGN_3

	movq	M, I
	sarq	$2, I
	jle	.L115

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	movsd	-10 * SIZE(A1), %xmm10
	movhpd	 -9 * SIZE(A1), %xmm10

	decq	I
	jle	.L114
	ALIGN_3

.L113:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3
	movsd	-10 * SIZE(A2), %xmm10
	movhpd	 -9 * SIZE(A2), %xmm10

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-8 * SIZE(A1), %xmm4
	movhpd	-7 * SIZE(A1), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-6 * SIZE(A1), %xmm6
	movhpd	-5 * SIZE(A1), %xmm6

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm14, %xmm8
	addpd	%xmm8, %xmm2
	movsd	-4 * SIZE(A1), %xmm8
	movhpd	-3 * SIZE(A1), %xmm8
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm14, %xmm10
	addpd	%xmm10, %xmm3
	movsd	-2 * SIZE(A1), %xmm10
	movhpd	-1 * SIZE(A1), %xmm10

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm15, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm15, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L113
	ALIGN_3

.L114:
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2), %xmm4
	movhpd	-15 * SIZE(A2), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3
	movsd	-10 * SIZE(A2), %xmm10
	movhpd	 -9 * SIZE(A2), %xmm10

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm14, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm1

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm14, %xmm8
	addpd	%xmm8, %xmm2
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm14, %xmm10
	addpd	%xmm10, %xmm3

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm15, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm15, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L115:
	testq	$2, M
	je	.L117

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6

	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	movsd	-14 * SIZE(A2), %xmm10
	movhpd	-13 * SIZE(A2), %xmm10

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm14, %xmm8
	addpd	%xmm8, %xmm0
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm14, %xmm10
	addpd	%xmm10, %xmm1

	mulpd	%xmm15, %xmm9
	SUBPD	%xmm9, %xmm0
	mulpd	%xmm15, %xmm11
	SUBPD	%xmm11, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L117:
	testq	$1, M
#if GEMV_UNROLL == 2
	je	.L119
#else
	je	.L120
#endif

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-16 * SIZE(A2), %xmm6
	movhpd	-15 * SIZE(A2), %xmm6

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm14, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 2
	ALIGN_3

.L119:
	cmpq	$2, N
	jge	.L111
#endif
	ALIGN_3

.L120:
#endif

	cmpq	$1, N
	jl	.L980

#if GEMV_UNROLL == 1
.L121:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	movsd	0 * SIZE(X), %xmm12
	movhpd	1 * SIZE(X), %xmm12
	addq	INCX, X

	pcmpeqb	%xmm11, %xmm11
	psllq	$63,    %xmm11
	shufps	$0xc0,  %xmm11, %xmm11

	pshufd	$0x4e, %xmm12, %xmm13

#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm8
	movddup	ALPHA_I, %xmm9
#else
	movsd	ALPHA_R, %xmm8
	unpcklpd %xmm8, %xmm8
	movsd	ALPHA_I, %xmm9
	unpcklpd %xmm9, %xmm9
#endif

	xorpd	%xmm11, %xmm13

	mulpd	%xmm8, %xmm12
	mulpd	%xmm9, %xmm13

#ifndef XCONJ
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm13, %xmm12
#endif

	pshufd	$0xee, %xmm12, %xmm13
	pshufd	$0x44, %xmm12, %xmm12

#ifndef CONJ
	xorpd	%xmm11, %xmm13
#else
	xorpd	%xmm11, %xmm12
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	movq	M, I
	sarq	$2, I
	jle	.L125

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	movsd	-10 * SIZE(A1), %xmm10
	movhpd	 -9 * SIZE(A1), %xmm10

	decq	I
	jle	.L124
	ALIGN_3

.L123:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-8 * SIZE(A1), %xmm4
	movhpd	-7 * SIZE(A1), %xmm4
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1
	movsd	-6 * SIZE(A1), %xmm6
	movhpd	-5 * SIZE(A1), %xmm6

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	movsd	-4 * SIZE(A1), %xmm8
	movhpd	-3 * SIZE(A1), %xmm8
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3
	movsd	-2 * SIZE(A1), %xmm10
	movhpd	-1 * SIZE(A1), %xmm10

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L123
	ALIGN_3

.L124:
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm3

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm2
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L125:
	testq	$2, M
	je	.L127

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4
	movsd	-14 * SIZE(A1), %xmm6
	movhpd	-13 * SIZE(A1), %xmm6

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm1

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

.L127:
	testq	$1, M
#if GEMV_UNROLL == 1
	je	.L129
#else
	je	.L980
#endif

	movsd	-16 * SIZE(A1), %xmm4
	movhpd	-15 * SIZE(A1), %xmm4

	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 1
	ALIGN_3
.L129:
	cmpq	$1, N
	jge	.L121
#endif

#endif
	ALIGN_3
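/* Final pass: add the accumulated BUFFER back into the user's Y.
   .L182 uses movapd when Y is 16-byte aligned; .L990 handles the
   unaligned case. */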
.L980:
	testq	$SIZE, Y
	jne	.L990

	movq	Y, Y1

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L184
	ALIGN_3

.L182:
	movapd	(Y), %xmm0
	addq	INCY, Y
	movapd	(Y), %xmm1
	addq	INCY, Y
	movapd	(Y), %xmm2
	addq	INCY, Y
	movapd	(Y), %xmm3
	addq	INCY, Y
	movapd	(Y), %xmm4
	addq	INCY, Y
	movapd	(Y), %xmm5
	addq	INCY, Y
	movapd	(Y), %xmm6
	addq	INCY, Y
	movapd	(Y), %xmm7
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movapd	%xmm0, (Y1)
	addq	INCY, Y1
	movapd	%xmm1, (Y1)
	addq	INCY, Y1
	movapd	%xmm2, (Y1)
	addq	INCY, Y1
	movapd	%xmm3, (Y1)
	addq	INCY, Y1
	movapd	%xmm4, (Y1)
	addq	INCY, Y1
	movapd	%xmm5, (Y1)
	addq	INCY, Y1
	movapd	%xmm6, (Y1)
	addq	INCY, Y1
	movapd	%xmm7, (Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L182
	ALIGN_3

.L184:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L185

	movapd	(Y), %xmm0
	addq	INCY, Y
	movapd	(Y), %xmm1
	addq	INCY, Y
	movapd	(Y), %xmm2
	addq	INCY, Y
	movapd	(Y), %xmm3
	addq	INCY, Y

	addpd	0 * SIZE(BUFFER), %xmm0
	addpd	2 * SIZE(BUFFER), %xmm1
	addpd	4 * SIZE(BUFFER), %xmm2
	addpd	6 * SIZE(BUFFER), %xmm3

	movapd	%xmm0, (Y1)
	addq	INCY, Y1
	movapd	%xmm1, (Y1)
	addq	INCY, Y1
	movapd	%xmm2, (Y1)
	addq	INCY, Y1
	movapd	%xmm3, (Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L185:
	testq	$2, M
	jle	.L186

	movapd	(Y), %xmm0
	addq	INCY, Y
	movapd	(Y), %xmm1
	addq	INCY, Y
	addpd	0 * SIZE(BUFFER), %xmm0
	addpd	2 * SIZE(BUFFER), %xmm1

	movapd	%xmm0, (Y1)
	addq	INCY, Y1
	movapd	%xmm1, (Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L186:
	testq	$1, M
	jle	.L999

	movapd	(Y), %xmm0

	addpd	(BUFFER), %xmm0

	movapd	%xmm0, (Y1)
	jmp	.L999
	ALIGN_3
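/* .L990: unaligned-Y writeback; each complex element is assembled and
   split with movsd/movhpd and movlpd/movhpd pairs. */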
.L990:
	movq	Y, Y1

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L994
	ALIGN_3

.L992:
	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm2
	movhpd	1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm6
	movhpd	1 * SIZE(Y), %xmm6
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm7
	movhpd	1 * SIZE(Y), %xmm7
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3, 0 * SIZE(Y1)
	movhpd	%xmm3, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm4, 0 * SIZE(Y1)
	movhpd	%xmm4, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm5, 0 * SIZE(Y1)
	movhpd	%xmm5, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm6, 0 * SIZE(Y1)
	movhpd	%xmm6, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm7, 0 * SIZE(Y1)
	movhpd	%xmm7, 1 * SIZE(Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L992
	ALIGN_3

.L994:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L995

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm2
	movhpd	1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y

	addpd	0 * SIZE(BUFFER), %xmm0
	addpd	2 * SIZE(BUFFER), %xmm1
	addpd	4 * SIZE(BUFFER), %xmm2
	addpd	6 * SIZE(BUFFER), %xmm3

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3, 0 * SIZE(Y1)
	movhpd	%xmm3, 1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L995:
	testq	$2, M
	jle	.L996

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y

	addpd	0 * SIZE(BUFFER), %xmm0
	addpd	2 * SIZE(BUFFER), %xmm1

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L996:
	testq	$1, M
	jle	.L999

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0

	addpd	0 * SIZE(BUFFER), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE