/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
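/* Transposed double-complex GEMV kernel: y += alpha * A^T * x.     */
/* CONJ/XCONJ select the conjugated variants through SUBPD.         */
/* x is first packed into a contiguous BUFFER, then complex dot     */
/* products are accumulated over 4, 2, or 1 column(s) of A per pass */
/* before being scaled by alpha and added into y.                   */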

#define ASSEMBLER
#include "common.h"
#include "l2param.h"

#ifndef WINDOWS_ABI

#define STACKSIZE	64

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define OLD_BUFFER	32 + STACKSIZE(%rsp)

#define M	%rdi
#define N	%rsi
#define A	%rcx
#define LDA	%r8
#define X	%r9
#define INCX	%rdx
#define Y	%rbp
#define INCY	%r10

#else

#define STACKSIZE	256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_LDA		56 + STACKSIZE(%rsp)
#define OLD_X		64 + STACKSIZE(%rsp)
#define OLD_INCX	72 + STACKSIZE(%rsp)
#define OLD_Y		80 + STACKSIZE(%rsp)
#define OLD_INCY	88 + STACKSIZE(%rsp)
#define OLD_BUFFER	96 + STACKSIZE(%rsp)

#define M	%rcx
#define N	%rdx
#define A	%r8
#define LDA	%r9
#define X	%rdi
#define INCX	%rsi
#define Y	%rbp
#define INCY	%r10

#endif

#define I	%rax
#define J	%rbx
#define A1	%r11
#define A2	%r12

#define X1	%r13
#define Y1	%r14
#define BUFFER	%r15

#define ALPHA_R	%xmm14
#define ALPHA_I	%xmm15

#undef SUBPD

#ifndef CONJ
#define SUBPD	addpd
#else
#define SUBPD	subpd
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,   A
	movq	OLD_LDA, LDA
	movq	OLD_X,   X

	movaps	%xmm3, %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#endif

	movq	OLD_INCX,   INCX
	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, LDA
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

#ifdef HAVE_SSE3
	movddup	%xmm0, ALPHA_R
	movddup	%xmm1, ALPHA_I
#else
	pshufd	$0x44, %xmm0, ALPHA_R
	pshufd	$0x44, %xmm1, ALPHA_I
#endif

	subq	$-16 * SIZE, A

	testq	M, M
	jle	.L999
	testq	N, N
	jle	.L999
	ALIGN_3

	movq	BUFFER, X1

	movq	Y, Y1

	movq	M, I
	sarq	$2, I
	jle	.L05
	ALIGN_4

.L02:
	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	addq	INCX, X

	movsd	0 * SIZE(X), %xmm1
	movhpd	1 * SIZE(X), %xmm1
	addq	INCX, X

	movsd	0 * SIZE(X), %xmm2
	movhpd	1 * SIZE(X), %xmm2
	addq	INCX, X

	movsd	0 * SIZE(X), %xmm3
	movhpd	1 * SIZE(X), %xmm3
	addq	INCX, X

	movapd	%xmm0, 0 * SIZE(X1)
	movapd	%xmm1, 2 * SIZE(X1)
	movapd	%xmm2, 4 * SIZE(X1)
	movapd	%xmm3, 6 * SIZE(X1)

	addq	$8 * SIZE, X1
	decq	I
	jg	.L02
	ALIGN_4

.L05:
	movq	M, I
	andq	$3, I
	jle	.L10
	ALIGN_2

.L06:
	movsd	0 * SIZE(X), %xmm0
	movhpd	1 * SIZE(X), %xmm0
	addq	INCX, X
	movapd	%xmm0, 0 * SIZE(X1)
	addq	$2 * SIZE, X1
	decq	I
	jg	.L06
	ALIGN_4

.L10:
#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	jne	.L100
#endif

#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA, 2), A2
	leaq	(A1, LDA, 4), A
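/* Eight accumulators, two per column: xmm0/2/4/6 collect a*x       */
/* products, xmm1/3/5/7 the swapped (pshufd $0x4e) products; each   */
/* pair is merged into one (re, im) dot product at .L19.            */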

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
	xorpd	%xmm4, %xmm4
	xorpd	%xmm5, %xmm5
	xorpd	%xmm6, %xmm6
	xorpd	%xmm7, %xmm7

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M, I
	sarq	$2, I
	jle	.L15

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)

	decq	I
	jle	.L14
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1(-14 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-14 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1(-10 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-10 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1( -8 * SIZE, A1, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7
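/* subq of a negative immediate adds 8 * SIZE: advance A1, A2 and   */
/* X1 past the four complex elements just consumed                  */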

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1

	subq	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:
	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1(-14 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-14 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-12 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1(-10 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-10 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1
	ALIGN_3

.L15:
	testq	$2, M
	je	.L17
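/* tail: two remaining elements when (M & 2) is set */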

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	MOVUPS_A1(-14 * SIZE, A1, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-14 * SIZE, A2, %xmm8)
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm7

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L17:
	testq	$1, M
	je	.L19

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-16 * SIZE, A2, %xmm8)
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm7
	ALIGN_3

.L19:
	pcmpeqb	%xmm13, %xmm13
	psllq	$63, %xmm13
	shufps	$0xc0, %xmm13, %xmm13

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm13, %xmm0
	xorpd	%xmm13, %xmm2
	xorpd	%xmm13, %xmm4
	xorpd	%xmm13, %xmm6
#else
	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2

	haddpd	%xmm5, %xmm4
	haddpd	%xmm7, %xmm6
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	movapd	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm10

	movapd	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm11

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
	addpd	%xmm10, %xmm4
	addpd	%xmm11, %xmm6
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	mulpd	ALPHA_R, %xmm4
	mulpd	ALPHA_I, %xmm5
	mulpd	ALPHA_R, %xmm6
	mulpd	ALPHA_I, %xmm7

	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7
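/* subtracting the sign-flipped swapped product gives the complex   */
/* multiply (ar*re - ai*im, ar*im + ai*re) = alpha * sum            */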

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm7
	movhpd	1 * SIZE(Y), %xmm7
	addq	INCY, Y

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm4, 0 * SIZE(Y1)
	movhpd	%xmm4, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm6, 0 * SIZE(Y1)
	movhpd	%xmm6, 1 * SIZE(Y1)
	addq	INCY, Y1

	cmpq	$4, N
	jge	.L11
	ALIGN_3

.L20:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L30

#if GEMV_UNROLL == 2
	ALIGN_3

.L21:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA), A2
	leaq	(A1, LDA, 2), A

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M, I
	sarq	$2, I
	jle	.L25

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-16 * SIZE, A2, %xmm10)
	MOVUPS_A1(-14 * SIZE, A1, %xmm12)
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	decq	I
	jle	.L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm10)
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-10 * SIZE, A2, %xmm6)
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm8)
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A1( -8 * SIZE, A2, %xmm10)
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	MOVUPS_A1( -6 * SIZE, A1, %xmm12)
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1( -6 * SIZE, A2, %xmm6)
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1

	subq	$1, I
	BRANCH
	jg	.L23
	ALIGN_3
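/* .L24 drains the last pipelined iteration: same arithmetic, no    */
/* fresh column loads at the end                                    */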
.L24:
	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm10)
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-10 * SIZE, A2, %xmm6)
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1
	ALIGN_3

.L25:
	testq	$2, M
	je	.L27

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-16 * SIZE, A2, %xmm10)

	MOVUPS_A1(-14 * SIZE, A1, %xmm12)
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L27:
	testq	$1, M
	je	.L29

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-16 * SIZE, A2, %xmm10)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3
	ALIGN_3

.L29:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63, %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
	xorpd	%xmm11, %xmm2
#else
	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2
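/* read-modify-write of two strided y elements */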

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addq	INCY, Y

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm2

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1

#if GEMV_UNROLL == 2
	cmpq	$2, N
	jge	.L21
#endif
	ALIGN_3

.L30:
#endif

	cmpq	$1, N
	jl	.L999

#if GEMV_UNROLL == 1
.L31:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

	movq	M, I
	sarq	$2, I
	jle	.L35

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-14 * SIZE, A1, %xmm12)

	decq	I
	jle	.L34
	ALIGN_3

.L33:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm8)
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	MOVUPS_A1( -6 * SIZE, A1, %xmm12)
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, X1

	subq	$1, I
	BRANCH
	jg	.L33
	ALIGN_3

.L34:
	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm8)
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm12)
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, X1
	ALIGN_3

.L35:
	testq	$2, M
	je	.L37

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)
	MOVUPS_A1(-14 * SIZE, A1, %xmm12)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	addq	$4 * SIZE, A1
	ALIGN_3
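/* last element when M is odd */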
.L37:
	testq	$1, M
	je	.L39

	MOVUPS_A1(-16 * SIZE, A1, %xmm8)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1
	ALIGN_3

.L39:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63, %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
#else
	xorpd	%xmm11, %xmm1
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	addpd	%xmm8, %xmm0
#endif

	pshufd	$0x4e, %xmm0, %xmm1

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1

	xorpd	%xmm11, %xmm1

	subpd	%xmm1, %xmm0

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4

	addpd	%xmm4, %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)

#if GEMV_UNROLL == 1
	addq	INCY, Y
	addq	INCY, Y1

	cmpq	$1, N
	jge	.L31
#endif

#ifdef ALIGNED_ACCESS
	jmp	.L999
	ALIGN_3
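/* .L100: fallback when A is not 16-byte aligned; column data is    */
/* loaded pairwise with movsd/movhpd instead of aligned loads       */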
.L100:
#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L110
	ALIGN_3

.L101:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA, 2), A2
	leaq	(A1, LDA, 4), A

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
	xorpd	%xmm4, %xmm4
	xorpd	%xmm5, %xmm5
	xorpd	%xmm6, %xmm6
	xorpd	%xmm7, %xmm7

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M, I
	sarq	$2, I
	jle	.L105

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	movsd	-16 * SIZE(A1, LDA), %xmm10
	movhpd	-15 * SIZE(A1, LDA), %xmm10

	decq	I
	jle	.L104
	ALIGN_3

.L103:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-14 * SIZE(A1), %xmm8
	movhpd	-13 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-14 * SIZE(A1, LDA), %xmm10
	movhpd	-13 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-14 * SIZE(A2), %xmm8
	movhpd	-13 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-14 * SIZE(A2, LDA), %xmm10
	movhpd	-13 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-12 * SIZE(A1, LDA), %xmm10
	movhpd	-11 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2, LDA), %xmm10
	movhpd	-11 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-10 * SIZE(A1), %xmm8
	movhpd	 -9 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-10 * SIZE(A1, LDA), %xmm10
	movhpd	 -9 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-10 * SIZE(A2), %xmm8
	movhpd	 -9 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-10 * SIZE(A2, LDA), %xmm10
	movhpd	 -9 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-8 * SIZE(A1), %xmm8
	movhpd	-7 * SIZE(A1), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-8 * SIZE(A1, LDA), %xmm10
	movhpd	-7 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1

	subq	$1, I
	BRANCH
	jg	.L103
	ALIGN_3

.L104:
	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-14 * SIZE(A1), %xmm8
	movhpd	-13 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-14 * SIZE(A1, LDA), %xmm10
	movhpd	-13 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-14 * SIZE(A2), %xmm8
	movhpd	-13 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-14 * SIZE(A2, LDA), %xmm10
	movhpd	-13 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-12 * SIZE(A1, LDA), %xmm10
	movhpd	-11 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-12 * SIZE(A2), %xmm8
	movhpd	-11 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2, LDA), %xmm10
	movhpd	-11 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-10 * SIZE(A1), %xmm8
	movhpd	 -9 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-10 * SIZE(A1, LDA), %xmm10
	movhpd	 -9 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-10 * SIZE(A2), %xmm8
	movhpd	 -9 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-10 * SIZE(A2, LDA), %xmm10
	movhpd	 -9 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	SUBPD	%xmm11, %xmm7

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1
	ALIGN_3

.L105:
	testq	$2, M
	je	.L107

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	movsd	-16 * SIZE(A1, LDA), %xmm10
	movhpd	-15 * SIZE(A1, LDA), %xmm10

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	movsd	-14 * SIZE(A1), %xmm8
	movhpd	-13 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	movsd	-14 * SIZE(A1, LDA), %xmm10
	movhpd	-13 * SIZE(A1, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	SUBPD	%xmm11, %xmm7

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-14 * SIZE(A2), %xmm8
	movhpd	-13 * SIZE(A2), %xmm8
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-14 * SIZE(A2, LDA), %xmm10
	movhpd	-13 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm13, %xmm8
	addpd	%xmm8, %xmm4
	mulpd	%xmm13, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm13, %xmm11
	SUBPD	%xmm11, %xmm7

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L107:
	testq	$1, M
	je	.L109

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	movsd	-16 * SIZE(A1, LDA), %xmm10
	movhpd	-15 * SIZE(A1, LDA), %xmm10

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	movhpd	-15 * SIZE(A2), %xmm8
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-16 * SIZE(A2, LDA), %xmm10
	movhpd	-15 * SIZE(A2, LDA), %xmm10
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm3

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm4
	mulpd	%xmm12, %xmm9
	SUBPD	%xmm9, %xmm5

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	%xmm10, %xmm6
	mulpd	%xmm12, %xmm11
	SUBPD	%xmm11, %xmm7
	ALIGN_3

.L109:
	pcmpeqb	%xmm13, %xmm13
	psllq	$63, %xmm13
	shufps	$0xc0, %xmm13, %xmm13

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm13, %xmm0
	xorpd	%xmm13, %xmm2
	xorpd	%xmm13, %xmm4
	xorpd	%xmm13, %xmm6
#else
	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2

	haddpd	%xmm5, %xmm4
	haddpd	%xmm7, %xmm6
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	movapd	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm10

	movapd	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm11

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
	addpd	%xmm10, %xmm4
	addpd	%xmm11, %xmm6
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3
	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	mulpd	ALPHA_R, %xmm4
	mulpd	ALPHA_I, %xmm5
	mulpd	ALPHA_R, %xmm6
	mulpd	ALPHA_I, %xmm7

	xorpd	%xmm13, %xmm1
	xorpd	%xmm13, %xmm3
	xorpd	%xmm13, %xmm5
	xorpd	%xmm13, %xmm7

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm7
	movhpd	1 * SIZE(Y), %xmm7
	addq	INCY, Y
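/* add the four scaled dot products into y and store through Y1 */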

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm4, 0 * SIZE(Y1)
	movhpd	%xmm4, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm6, 0 * SIZE(Y1)
	movhpd	%xmm6, 1 * SIZE(Y1)
	addq	INCY, Y1

	cmpq	$4, N
	jge	.L101
	ALIGN_3

.L110:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L120

#if GEMV_UNROLL == 2
	ALIGN_3

.L111:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA), A2
	leaq	(A1, LDA, 2), A

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1
	xorpd	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCHW	3 * SIZE(Y1)
#endif

	movq	M, I
	sarq	$2, I
	jle	.L115

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm10
	movhpd	-15 * SIZE(A2), %xmm10

	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	decq	I
	jle	.L114
	ALIGN_3

.L113:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2), %xmm10
	movhpd	-11 * SIZE(A2), %xmm10
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-8 * SIZE(A1), %xmm8
	movhpd	-7 * SIZE(A1), %xmm8
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-8 * SIZE(A2), %xmm10
	movhpd	-7 * SIZE(A2), %xmm10
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	movsd	-6 * SIZE(A1), %xmm12
	movhpd	-5 * SIZE(A1), %xmm12
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	movsd	-6 * SIZE(A2), %xmm6
	movhpd	-5 * SIZE(A2), %xmm6
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1

	subq	$1, I
	BRANCH
	jg	.L113
	ALIGN_3

.L114:
	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	movsd	-12 * SIZE(A2), %xmm10
	movhpd	-11 * SIZE(A2), %xmm10
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	movsd	-10 * SIZE(A2), %xmm6
	movhpd	 -9 * SIZE(A2), %xmm6
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, X1
	ALIGN_3

.L115:
	testq	$2, M
	je	.L117

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm10
	movhpd	-15 * SIZE(A2), %xmm10

	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12
	movsd	-14 * SIZE(A2), %xmm6
	movhpd	-13 * SIZE(A2), %xmm6

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	pshufd	$0x4e, %xmm6, %xmm7
	mulpd	%xmm5, %xmm6
	addpd	%xmm6, %xmm2
	mulpd	%xmm5, %xmm7
	SUBPD	%xmm7, %xmm3

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L117:
	testq	$1, M
	je	.L119

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm10
	movhpd	-15 * SIZE(A2), %xmm10

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	pshufd	$0x4e, %xmm10, %xmm11
	mulpd	%xmm4, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm4, %xmm11
	SUBPD	%xmm11, %xmm3
	ALIGN_3

.L119:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63, %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
	xorpd	%xmm11, %xmm2
#else
	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3
#endif
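/* horizontal add merges each (sum, swapped-sum) accumulator pair   */
/* into one packed (re, im) dot product                             */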

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	movapd	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm9

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
#endif

	pshufd	$0x4e, %xmm0, %xmm1
	pshufd	$0x4e, %xmm2, %xmm3

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm3

	xorpd	%xmm11, %xmm1
	xorpd	%xmm11, %xmm3

	subpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addq	INCY, Y

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm2

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1

#if GEMV_UNROLL == 2
	cmpq	$2, N
	jge	.L111
#endif
	ALIGN_3

.L120:
#endif

	cmpq	$1, N
	jl	.L999

#if GEMV_UNROLL == 1
.L121:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	xorpd	%xmm0, %xmm0
	xorpd	%xmm1, %xmm1

	MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

	movq	M, I
	sarq	$2, I
	jle	.L125

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12

	decq	I
	jle	.L124
	ALIGN_3

.L123:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-8 * SIZE(A1), %xmm8
	movhpd	-7 * SIZE(A1), %xmm8
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	movsd	-6 * SIZE(A1), %xmm12
	movhpd	-5 * SIZE(A1), %xmm12
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, X1

	subq	$1, I
	BRANCH
	jg	.L123
	ALIGN_3

.L124:
	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	movhpd	-11 * SIZE(A1), %xmm8
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	movsd	-10 * SIZE(A1), %xmm12
	movhpd	 -9 * SIZE(A1), %xmm12
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, X1
	ALIGN_3

.L125:
	testq	$2, M
	je	.L127

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8
	movsd	-14 * SIZE(A1), %xmm12
	movhpd	-13 * SIZE(A1), %xmm12

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1

	MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

	pshufd	$0x4e, %xmm12, %xmm13
	mulpd	%xmm5, %xmm12
	addpd	%xmm12, %xmm0
	mulpd	%xmm5, %xmm13
	SUBPD	%xmm13, %xmm1

	addq	$4 * SIZE, A1
	ALIGN_3

.L127:
	testq	$1, M
	je	.L129

	movsd	-16 * SIZE(A1), %xmm8
	movhpd	-15 * SIZE(A1), %xmm8

	pshufd	$0x4e, %xmm8, %xmm9
	mulpd	%xmm4, %xmm8
	addpd	%xmm8, %xmm0
	mulpd	%xmm4, %xmm9
	SUBPD	%xmm9, %xmm1
	ALIGN_3

.L129:
	pcmpeqb	%xmm11, %xmm11
	psllq	$63, %xmm11
	shufps	$0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorpd	%xmm11, %xmm0
#else
	xorpd	%xmm11, %xmm1
#endif

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
#else
	movapd	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm8

	addpd	%xmm8, %xmm0
#endif

	pshufd	$0x4e, %xmm0, %xmm1

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1

	xorpd	%xmm11, %xmm1

	subpd	%xmm1, %xmm0

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4

	addpd	%xmm4, %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)

#if GEMV_UNROLL == 1
	addq	INCY, Y
	addq	INCY, Y1

	cmpq	$1, N
	jge	.L121
#endif

#endif
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE