/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "l2param.h"

#ifndef WINDOWS_ABI

#define STACKSIZE	64

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define OLD_BUFFER	32 + STACKSIZE(%rsp)
#define ALPHA_R		48(%rsp)
#define ALPHA_I		56(%rsp)

#define M	%rdi
#define N	%rsi
#define A	%rcx
#define LDA	%r8
#define X	%r9
#define INCX	%rdx
#define Y	%rbp
#define INCY	%r10

#else

#define STACKSIZE	256

#define OLD_ALPHA_I	 40 + STACKSIZE(%rsp)
#define OLD_A		 48 + STACKSIZE(%rsp)
#define OLD_LDA		 56 + STACKSIZE(%rsp)
#define OLD_X		 64 + STACKSIZE(%rsp)
#define OLD_INCX	 72 + STACKSIZE(%rsp)
#define OLD_Y		 80 + STACKSIZE(%rsp)
#define OLD_INCY	 88 + STACKSIZE(%rsp)
#define OLD_BUFFER	 96 + STACKSIZE(%rsp)
#define ALPHA_R		224(%rsp)
#define ALPHA_I		232(%rsp)

#define M	%rcx
#define N	%rdx
#define A	%r8
#define LDA	%r9
#define X	%rdi
#define INCX	%rsi
#define Y	%rbp
#define INCY	%r10

#endif

#define I	%rax
#define A1	%r12
#define A2	%r13

#define Y1	%r14
#define BUFFER	%r15

#define J	%r11

#undef SUBPD

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPD	subpd
#else
#define SUBPD	addpd
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,   A
	movq	OLD_LDA, LDA
	movq	OLD_X,   X

	movapd	%xmm3, %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#endif

	movq	OLD_INCX,   INCX
	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, LDA
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	movlps	%xmm0, ALPHA_R
	movlps	%xmm1, ALPHA_I

	subq	$-16 * SIZE, A

	testq	M, M
	jle	.L999
	testq	N, N
	jle	.L999
	ALIGN_3
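/* A sketch of the control flow, inferred from the code below:
   alpha * A * x is accumulated into the aligned scratch area BUFFER,
   so the inner loops can use aligned 16-byte loads and stores
   regardless of INCY or the alignment of y; the result is added back
   into y at .L980 (aligned) or .L990 (unaligned).  The loop that
   follows clears the buffer in blocks of 8 complex elements
   (16 doubles per pass), clearing slightly past M so that trailing
   partial stores land in zeroed memory. */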
	movq	BUFFER, Y1

	xorps	%xmm4, %xmm4

	movq	M, %rax
	addq	$8, %rax
	sarq	$3, %rax
	ALIGN_3

.L01:
	movaps	%xmm4,  0 * SIZE(Y1)
	movaps	%xmm4,  2 * SIZE(Y1)
	movaps	%xmm4,  4 * SIZE(Y1)
	movaps	%xmm4,  6 * SIZE(Y1)
	movaps	%xmm4,  8 * SIZE(Y1)
	movaps	%xmm4, 10 * SIZE(Y1)
	movaps	%xmm4, 12 * SIZE(Y1)
	movaps	%xmm4, 14 * SIZE(Y1)

	subq	$-16 * SIZE, Y1
	decq	%rax
	jg	.L01
	ALIGN_3

.L10:
#if GEMV_UNROLL >= 4

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	subq	$4, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

	movddup	0 * SIZE(X), %xmm8
	movddup	1 * SIZE(X), %xmm9
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm10
	movddup	1 * SIZE(X), %xmm11
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm12
	movddup	1 * SIZE(X), %xmm13
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm14
	movddup	1 * SIZE(X), %xmm15
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0x40, %xmm5, %xmm5

	movsd	ALPHA_R, %xmm6
	movhps	ALPHA_I, %xmm6

	pshufd	$0x4e, %xmm6, %xmm7

#ifndef XCONJ
	xorps	%xmm5, %xmm7
#else
	xorps	%xmm5, %xmm6
#endif

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm6, %xmm14
	mulpd	%xmm7, %xmm15

#ifndef XCONJ
	subpd	%xmm9,  %xmm8
	subpd	%xmm11, %xmm10
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

#ifndef XCONJ
	xorps	%xmm5, %xmm9
	xorps	%xmm5, %xmm11
	xorps	%xmm5, %xmm13
	xorps	%xmm5, %xmm15
#else
	xorps	%xmm5, %xmm8
	xorps	%xmm5, %xmm10
	xorps	%xmm5, %xmm12
	xorps	%xmm5, %xmm14
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
	ALIGN_3

	movq	M, I
	sarq	$2, I
	jle	.L15

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-14 * SIZE(A1), %xmm5
	movddup	-12 * SIZE(A1), %xmm6
	movddup	-10 * SIZE(A1), %xmm7

	decq	I
	jle	.L14
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	%xmm9, %xmm4
	SUBPD	%xmm4, %xmm0
	movddup	-16 * SIZE(A1, LDA), %xmm4
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm1
	movddup	-14 * SIZE(A1, LDA), %xmm5
	mulpd	%xmm9, %xmm6
	SUBPD	%xmm6, %xmm2
	movddup	-12 * SIZE(A1, LDA), %xmm6
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3
	movddup	-10 * SIZE(A1, LDA), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	mulpd	%xmm10, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A1, LDA), %xmm4
	mulpd	%xmm10, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A1, LDA), %xmm5
	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A1, LDA), %xmm6
	mulpd	%xmm10, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A1, LDA), %xmm7
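/* Cross (imaginary) terms of column 2; columns 3 and 4 (A2 and
   A2 + LDA) below repeat the same pattern with %xmm12..%xmm15. */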
	mulpd	%xmm11, %xmm4
	SUBPD	%xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5,  %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	%xmm11, %xmm6
	SUBPD	%xmm6,  %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm3
	movddup	-10 * SIZE(A2), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	%xmm12, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7

	mulpd	%xmm13, %xmm4
	SUBPD	%xmm4,  %xmm0
	movddup	-16 * SIZE(A2, LDA), %xmm4
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5,  %xmm1
	movddup	-14 * SIZE(A2, LDA), %xmm5
	mulpd	%xmm13, %xmm6
	SUBPD	%xmm6,  %xmm2
	movddup	-12 * SIZE(A2, LDA), %xmm6
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7,  %xmm3
	movddup	-10 * SIZE(A2, LDA), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	mulpd	%xmm14, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A2, LDA), %xmm4
	mulpd	%xmm14, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A2, LDA), %xmm5
	mulpd	%xmm14, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A2, LDA), %xmm6
	mulpd	%xmm14, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A2, LDA), %xmm7

	mulpd	%xmm15, %xmm4
	SUBPD	%xmm4,  %xmm0
	movddup	 -8 * SIZE(A1), %xmm4
	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5,  %xmm1
	movddup	 -6 * SIZE(A1), %xmm5
	mulpd	%xmm15, %xmm6
	SUBPD	%xmm6,  %xmm2
	movddup	 -4 * SIZE(A1), %xmm6
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7,  %xmm3
	movddup	 -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	%xmm9, %xmm4
	SUBPD	%xmm4, %xmm0
	movddup	-16 * SIZE(A1, LDA), %xmm4
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm1
	movddup	-14 * SIZE(A1, LDA), %xmm5
	mulpd	%xmm9, %xmm6
	SUBPD	%xmm6, %xmm2
	movddup	-12 * SIZE(A1, LDA), %xmm6
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3
	movddup	-10 * SIZE(A1, LDA), %xmm7

	mulpd	%xmm10, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A1, LDA), %xmm4
	mulpd	%xmm10, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A1, LDA), %xmm5
	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A1, LDA), %xmm6
	mulpd	%xmm10, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A1, LDA), %xmm7

	mulpd	%xmm11, %xmm4
	SUBPD	%xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5,  %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	%xmm11, %xmm6
	SUBPD	%xmm6,  %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm3
	movddup	-10 * SIZE(A2), %xmm7
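/* Tail iteration (.L14) continues with columns 3 and 4; unlike .L13
   it does not preload A1 operands for a following pass. */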
	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	%xmm12, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7

	mulpd	%xmm13, %xmm4
	SUBPD	%xmm4,  %xmm0
	movddup	-16 * SIZE(A2, LDA), %xmm4
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5,  %xmm1
	movddup	-14 * SIZE(A2, LDA), %xmm5
	mulpd	%xmm13, %xmm6
	SUBPD	%xmm6,  %xmm2
	movddup	-12 * SIZE(A2, LDA), %xmm6
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7,  %xmm3
	movddup	-10 * SIZE(A2, LDA), %xmm7

	mulpd	%xmm14, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A2, LDA), %xmm4
	mulpd	%xmm14, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A2, LDA), %xmm5
	mulpd	%xmm14, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A2, LDA), %xmm6
	mulpd	%xmm14, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A2, LDA), %xmm7

	mulpd	%xmm15, %xmm4
	SUBPD	%xmm4,  %xmm0
	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5,  %xmm1
	mulpd	%xmm15, %xmm6
	SUBPD	%xmm6,  %xmm2
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7,  %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L15:
	testq	$2, M
	je	.L17

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-14 * SIZE(A1), %xmm6
	movddup	-13 * SIZE(A1), %xmm7

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-16 * SIZE(A1, LDA, 1), %xmm4
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	movddup	-14 * SIZE(A1, LDA, 1), %xmm6

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	movddup	-15 * SIZE(A1, LDA, 1), %xmm5
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1
	movddup	-13 * SIZE(A1, LDA, 1), %xmm7

	mulpd	%xmm10, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm1
	movddup	-14 * SIZE(A2), %xmm6

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5,  %xmm0
	movddup	-15 * SIZE(A2), %xmm5
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm1
	movddup	-13 * SIZE(A2), %xmm7

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-16 * SIZE(A2, LDA, 1), %xmm4
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm1
	movddup	-14 * SIZE(A2, LDA, 1), %xmm6

	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5,  %xmm0
	movddup	-15 * SIZE(A2, LDA, 1), %xmm5
	mulpd	%xmm13, %xmm7
	SUBPD	%xmm7,  %xmm1
	movddup	-13 * SIZE(A2, LDA, 1), %xmm7

	mulpd	%xmm14, %xmm4
	addpd	%xmm4,  %xmm0
	mulpd	%xmm14, %xmm6
	addpd	%xmm6,  %xmm1

	mulpd	%xmm15, %xmm5
	SUBPD	%xmm5,  %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7,  %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L17:
	testq	$1, M
	je	.L19
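/* M odd: fold the last complex element of all four columns into %xmm0. */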
	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-16 * SIZE(A1, LDA, 1), %xmm6
	movddup	-15 * SIZE(A1, LDA, 1), %xmm7

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	movddup	-15 * SIZE(A2), %xmm5

	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm0
	movddup	-16 * SIZE(A2, LDA, 1), %xmm6
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm0
	movddup	-15 * SIZE(A2, LDA, 1), %xmm7

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	mulpd	%xmm13, %xmm5
	SUBPD	%xmm5,  %xmm0

	mulpd	%xmm14, %xmm6
	addpd	%xmm6,  %xmm0
	mulpd	%xmm15, %xmm7
	SUBPD	%xmm7,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	ALIGN_3

.L19:
	cmpq	$4, N
	jge	.L11
	ALIGN_3

.L20:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L30

#if GEMV_UNROLL == 2
	ALIGN_3

.L21:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), Y1
	movq	A,  A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	movddup	0 * SIZE(X), %xmm8
	movddup	1 * SIZE(X), %xmm9
	addq	INCX, X
	movddup	0 * SIZE(X), %xmm10
	movddup	1 * SIZE(X), %xmm11
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0x40, %xmm5, %xmm5

	movsd	ALPHA_R, %xmm6
	movhps	ALPHA_I, %xmm6

	pshufd	$0x4e, %xmm6, %xmm7

#ifndef XCONJ
	xorps	%xmm5, %xmm7
#else
	xorps	%xmm5, %xmm6
#endif

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

#ifndef XCONJ
	subpd	%xmm9,  %xmm8
	subpd	%xmm11, %xmm10
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11

#ifndef XCONJ
	xorps	%xmm5, %xmm9
	xorps	%xmm5, %xmm11
#else
	xorps	%xmm5, %xmm8
	xorps	%xmm5, %xmm10
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	movq	M, I
	sarq	$2, I
	jle	.L25

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-14 * SIZE(A1), %xmm5
	movddup	-12 * SIZE(A1), %xmm6
	movddup	-10 * SIZE(A1), %xmm7

	decq	I
	jle	.L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	%xmm9, %xmm4
	SUBPD	%xmm4, %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	%xmm9, %xmm6
	SUBPD	%xmm6, %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3
	movddup	-10 * SIZE(A2), %xmm7

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm10, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	%xmm10, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	%xmm10, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7
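/* Cross terms of column 2; the movddup loads in this block already
   fetch the next iteration's A1 operands (software pipelining). */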
	mulpd	%xmm11, %xmm4
	SUBPD	%xmm4,  %xmm0
	movddup	 -8 * SIZE(A1), %xmm4
	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5,  %xmm1
	movddup	 -6 * SIZE(A1), %xmm5
	mulpd	%xmm11, %xmm6
	SUBPD	%xmm6,  %xmm2
	movddup	 -4 * SIZE(A1), %xmm6
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm3
	movddup	 -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L23
	ALIGN_3

.L24:
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	%xmm9, %xmm4
	SUBPD	%xmm4, %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm1
	movddup	-14 * SIZE(A2), %xmm5
	mulpd	%xmm9, %xmm6
	SUBPD	%xmm6, %xmm2
	movddup	-12 * SIZE(A2), %xmm6
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3
	movddup	-10 * SIZE(A2), %xmm7

	mulpd	%xmm10, %xmm4
	addpd	%xmm4,  %xmm0
	movddup	-15 * SIZE(A2), %xmm4
	mulpd	%xmm10, %xmm5
	addpd	%xmm5,  %xmm1
	movddup	-13 * SIZE(A2), %xmm5
	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm2
	movddup	-11 * SIZE(A2), %xmm6
	mulpd	%xmm10, %xmm7
	addpd	%xmm7,  %xmm3
	movddup	 -9 * SIZE(A2), %xmm7

	mulpd	%xmm11, %xmm4
	SUBPD	%xmm4,  %xmm0
	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5,  %xmm1
	mulpd	%xmm11, %xmm6
	SUBPD	%xmm6,  %xmm2
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L25:
	testq	$2, M
	je	.L27

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-14 * SIZE(A1), %xmm6
	movddup	-13 * SIZE(A1), %xmm7

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-16 * SIZE(A2), %xmm4
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1
	movddup	-14 * SIZE(A2), %xmm6

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	movddup	-15 * SIZE(A2), %xmm5
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1
	movddup	-13 * SIZE(A2), %xmm7

	mulpd	%xmm10, %xmm4
	addpd	%xmm4,  %xmm0
	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm1

	mulpd	%xmm11, %xmm5
	SUBPD	%xmm5,  %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L27:
	testq	$1, M
#if GEMV_UNROLL == 2
	je	.L29
#else
	je	.L30
#endif

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-16 * SIZE(A2), %xmm6
	movddup	-15 * SIZE(A2), %xmm7
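/* Combine the final element's two column contributions into %xmm0. */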
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0

	mulpd	%xmm10, %xmm6
	addpd	%xmm6,  %xmm0
	mulpd	%xmm11, %xmm7
	SUBPD	%xmm7,  %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 2
	ALIGN_3

.L29:
	cmpq	$2, N
	jge	.L21
#endif
	ALIGN_3

.L30:
#endif

	cmpq	$1, N
	jl	.L980

#if GEMV_UNROLL == 1
.L31:
	decq	N
#endif

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
#if GEMV_UNROLL == 1
	addq	LDA, A
#endif

	movddup	0 * SIZE(X), %xmm8
	movddup	1 * SIZE(X), %xmm9
	addq	INCX, X

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0x40, %xmm5, %xmm5

	movsd	ALPHA_R, %xmm6
	movhps	ALPHA_I, %xmm6

	pshufd	$0x4e, %xmm6, %xmm7

#ifndef XCONJ
	xorps	%xmm5, %xmm7
#else
	xorps	%xmm5, %xmm6
#endif

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9

#ifndef XCONJ
	subpd	%xmm9, %xmm8
#else
	addpd	%xmm9, %xmm8
#endif

	pshufd	$0x4e, %xmm8, %xmm9

#ifndef XCONJ
	xorps	%xmm5, %xmm9
#else
	xorps	%xmm5, %xmm8
#endif

	MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

	movq	M, I
	sarq	$2, I
	jle	.L35

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-14 * SIZE(A1), %xmm5
	movddup	-12 * SIZE(A1), %xmm6
	movddup	-10 * SIZE(A1), %xmm7

	decq	I
	jle	.L34
	ALIGN_3

.L33:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	%xmm9, %xmm4
	SUBPD	%xmm4, %xmm0
	movddup	 -8 * SIZE(A1), %xmm4
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm1
	movddup	 -6 * SIZE(A1), %xmm5
	mulpd	%xmm9, %xmm6
	SUBPD	%xmm6, %xmm2
	movddup	 -4 * SIZE(A1), %xmm6
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3
	movddup	 -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L33
	ALIGN_3

.L34:
	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	movddup	-15 * SIZE(A1), %xmm4
	mulpd	%xmm8, %xmm5
	addpd	%xmm5, %xmm1
	movddup	-13 * SIZE(A1), %xmm5
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm2
	movddup	-11 * SIZE(A1), %xmm6
	mulpd	%xmm8, %xmm7
	addpd	%xmm7, %xmm3
	movddup	 -9 * SIZE(A1), %xmm7

	mulpd	%xmm9, %xmm4
	SUBPD	%xmm4, %xmm0
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm1
	mulpd	%xmm9, %xmm6
	SUBPD	%xmm6, %xmm2
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm3

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
	MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

	MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
	MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
	MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
	MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1
	ALIGN_3
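/* Remainders of the single-column sweep: two elements (.L35),
   then one (.L37). */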
.L35:
	testq	$2, M
	je	.L37

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5
	movddup	-14 * SIZE(A1), %xmm6
	movddup	-13 * SIZE(A1), %xmm7

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm8, %xmm6
	addpd	%xmm6, %xmm1

	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0
	mulpd	%xmm9, %xmm7
	SUBPD	%xmm7, %xmm1

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
	MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

.L37:
	testq	$1, M
#if GEMV_UNROLL == 1
	je	.L39
#else
	je	.L980
#endif

	movddup	-16 * SIZE(A1), %xmm4
	movddup	-15 * SIZE(A1), %xmm5

	mulpd	%xmm8, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm9, %xmm5
	SUBPD	%xmm5, %xmm0

	MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 1
	ALIGN_3
.L39:
	cmpq	$1, N
	jge	.L31
#endif

.L980:
	testq	$SIZE, Y
	jne	.L990

	movq	Y, Y1

	movq	M, %rax
	sarq	$3, %rax
	jle	.L184
	ALIGN_3

.L182:
	movaps	(Y), %xmm0
	addq	INCY, Y
	movaps	(Y), %xmm1
	addq	INCY, Y
	movaps	(Y), %xmm2
	addq	INCY, Y
	movaps	(Y), %xmm3
	addq	INCY, Y
	movaps	(Y), %xmm4
	addq	INCY, Y
	movaps	(Y), %xmm5
	addq	INCY, Y
	movaps	(Y), %xmm6
	addq	INCY, Y
	movaps	(Y), %xmm7
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movaps	%xmm0, (Y1)
	addq	INCY, Y1
	movaps	%xmm1, (Y1)
	addq	INCY, Y1
	movaps	%xmm2, (Y1)
	addq	INCY, Y1
	movaps	%xmm3, (Y1)
	addq	INCY, Y1
	movaps	%xmm4, (Y1)
	addq	INCY, Y1
	movaps	%xmm5, (Y1)
	addq	INCY, Y1
	movaps	%xmm6, (Y1)
	addq	INCY, Y1
	movaps	%xmm7, (Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L182
	ALIGN_3

.L184:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L185

	movaps	(Y), %xmm0
	addq	INCY, Y
	movaps	(Y), %xmm1
	addq	INCY, Y
	movaps	(Y), %xmm2
	addq	INCY, Y
	movaps	(Y), %xmm3
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3

	movaps	%xmm0, (Y1)
	addq	INCY, Y1
	movaps	%xmm1, (Y1)
	addq	INCY, Y1
	movaps	%xmm2, (Y1)
	addq	INCY, Y1
	movaps	%xmm3, (Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L185:
	testq	$2, M
	jle	.L186

	movaps	(Y), %xmm0
	addq	INCY, Y
	movaps	(Y), %xmm1
	addq	INCY, Y
	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1

	movaps	%xmm0, (Y1)
	addq	INCY, Y1
	movaps	%xmm1, (Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L186:
	testq	$1, M
	jle	.L999

	movaps	(Y), %xmm0

	addpd	(BUFFER), %xmm0

	movaps	%xmm0, (Y1)
	jmp	.L999
	ALIGN_3
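/* y is not 16-byte aligned: same copy-back as .L182..L186, but using
   movsd/movhpd pairs instead of movaps. */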
.L990:
	movq	Y, Y1

	movq	M, %rax
	sarq	$3, %rax
	jle	.L994
	ALIGN_3

.L992:
	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm2
	movhpd	1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm6
	movhpd	1 * SIZE(Y), %xmm6
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm7
	movhpd	1 * SIZE(Y), %xmm7
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3, 0 * SIZE(Y1)
	movhpd	%xmm3, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm4, 0 * SIZE(Y1)
	movhpd	%xmm4, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm5, 0 * SIZE(Y1)
	movhpd	%xmm5, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm6, 0 * SIZE(Y1)
	movhpd	%xmm6, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm7, 0 * SIZE(Y1)
	movhpd	%xmm7, 1 * SIZE(Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L992
	ALIGN_3

.L994:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L995

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm2
	movhpd	1 * SIZE(Y), %xmm2
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm3, 0 * SIZE(Y1)
	movhpd	%xmm3, 1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L995:
	testq	$2, M
	jle	.L996

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y

	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1

	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L996:
	testq	$1, M
	jle	.L999

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0

	addpd	0 * SIZE(BUFFER), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	ALIGN_3
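/* Common exit: restore callee-saved registers (plus %rdi/%rsi and
   %xmm6-%xmm15 under WINDOWS_ABI) and return. */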
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE