/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#define N ARG1 /* rdi */ 26#define X ARG2 /* rsi */ 27#define INCX ARG3 /* rdx */ 28#define Y ARG4 /* rcx */ 29#ifndef WINDOWS_ABI 30#define INCY ARG5 /* r8 */ 31#else 32#define INCY %r10 33#endif 34 35#include "l1param.h" 36 37#undef movsd 38 39#ifndef OPTERON 40#define MOVLPS movsd 41#else 42#define MOVLPS movlps 43#endif 44 45 46 PROLOGUE 47 PROFCODE 48 49#ifdef WINDOWS_ABI 50 movq 40(%rsp), INCY 51#endif 52 53 SAVEREGISTERS 54 55 salq $ZBASE_SHIFT, INCX 56 salq $ZBASE_SHIFT, INCY 57 58 xorps %xmm0, %xmm0 59 xorps %xmm1, %xmm1 60 xorps %xmm2, %xmm2 61 xorps %xmm3, %xmm3 62 63 cmpq $0, N 64 jle .L999 65 66 cmpq $2 * SIZE, INCX 67 jne .L50 68 cmpq $2 * SIZE, INCY 69 jne .L50 70 71 subq $-16 * SIZE, X 72 subq $-16 * SIZE, Y 73 74 testq $SIZE, Y 75 jne .L30 76 77 testq $SIZE, X 78 jne .L20 79 80 movq N, %rax 81 sarq $3, %rax 82 jle .L15 83 84 movaps -16 * SIZE(X), %xmm4 85 movaps -14 * SIZE(X), %xmm5 86 movaps -16 * SIZE(Y), %xmm8 87 movaps -14 * SIZE(Y), %xmm9 88 movaps -12 * SIZE(X), %xmm6 89 movaps -10 * SIZE(X), %xmm7 90 movaps -12 * SIZE(Y), %xmm10 91 movaps -10 * SIZE(Y), %xmm11 92 93 decq %rax 94 jle .L12 95 ALIGN_3 96 97.L11: 98#ifdef PREFETCH 99 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) 100#endif 101 102 pshufd $0x4e, %xmm8, %xmm12 103 mulpd %xmm4, %xmm8 104 addpd %xmm8, %xmm0 105 movaps -8 * SIZE(Y), %xmm8 106 mulpd %xmm4, %xmm12 107 movaps -8 * SIZE(X), %xmm4 108 addpd %xmm12, %xmm1 109 110 pshufd $0x4e, %xmm9, %xmm12 111 mulpd %xmm5, %xmm9 112 addpd %xmm9, %xmm2 113 movaps -6 * SIZE(Y), %xmm9 114 mulpd %xmm5, %xmm12 115 movaps -6 * SIZE(X), %xmm5 116 addpd %xmm12, %xmm3 117 118#ifdef PREFETCH 119 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) 120#endif 121 122 pshufd $0x4e, %xmm10, %xmm12 123 mulpd %xmm6, %xmm10 124 addpd %xmm10, %xmm0 125 movaps -4 * SIZE(Y), %xmm10 126 mulpd %xmm6, %xmm12 127 movaps -4 * SIZE(X), %xmm6 128 addpd 
%xmm12, %xmm1 129 130 pshufd $0x4e, %xmm11, %xmm12 131 mulpd %xmm7, %xmm11 132 addpd %xmm11, %xmm2 133 movaps -2 * SIZE(Y), %xmm11 134 mulpd %xmm7, %xmm12 135 movaps -2 * SIZE(X), %xmm7 136 addpd %xmm12, %xmm3 137 138#if defined(PREFETCH) && !defined(FETCH128) 139 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) 140#endif 141 142 pshufd $0x4e, %xmm8, %xmm12 143 mulpd %xmm4, %xmm8 144 addpd %xmm8, %xmm0 145 movaps 0 * SIZE(Y), %xmm8 146 mulpd %xmm4, %xmm12 147 movaps 0 * SIZE(X), %xmm4 148 addpd %xmm12, %xmm1 149 150 pshufd $0x4e, %xmm9, %xmm12 151 mulpd %xmm5, %xmm9 152 addpd %xmm9, %xmm2 153 movaps 2 * SIZE(Y), %xmm9 154 mulpd %xmm5, %xmm12 155 movaps 2 * SIZE(X), %xmm5 156 addpd %xmm12, %xmm3 157 158#if defined(PREFETCH) && !defined(FETCH128) 159 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) 160#endif 161 162 pshufd $0x4e, %xmm10, %xmm12 163 mulpd %xmm6, %xmm10 164 addpd %xmm10, %xmm0 165 movaps 4 * SIZE(Y), %xmm10 166 mulpd %xmm6, %xmm12 167 movaps 4 * SIZE(X), %xmm6 168 addpd %xmm12, %xmm1 169 170 pshufd $0x4e, %xmm11, %xmm12 171 mulpd %xmm7, %xmm11 172 addpd %xmm11, %xmm2 173 movaps 6 * SIZE(Y), %xmm11 174 mulpd %xmm7, %xmm12 175 movaps 6 * SIZE(X), %xmm7 176 addpd %xmm12, %xmm3 177 178 subq $-16 * SIZE, X 179 subq $-16 * SIZE, Y 180 181 decq %rax 182 jg .L11 183 ALIGN_3 184 185.L12: 186 pshufd $0x4e, %xmm8, %xmm12 187 mulpd %xmm4, %xmm8 188 addpd %xmm8, %xmm0 189 movaps -8 * SIZE(Y), %xmm8 190 mulpd %xmm4, %xmm12 191 movaps -8 * SIZE(X), %xmm4 192 addpd %xmm12, %xmm1 193 194 pshufd $0x4e, %xmm9, %xmm12 195 mulpd %xmm5, %xmm9 196 addpd %xmm9, %xmm2 197 movaps -6 * SIZE(Y), %xmm9 198 mulpd %xmm5, %xmm12 199 movaps -6 * SIZE(X), %xmm5 200 addpd %xmm12, %xmm3 201 202 pshufd $0x4e, %xmm10, %xmm12 203 mulpd %xmm6, %xmm10 204 addpd %xmm10, %xmm0 205 movaps -4 * SIZE(Y), %xmm10 206 mulpd %xmm6, %xmm12 207 movaps -4 * SIZE(X), %xmm6 208 addpd %xmm12, %xmm1 209 210 pshufd $0x4e, %xmm11, %xmm12 211 mulpd %xmm7, %xmm11 212 addpd %xmm11, %xmm2 213 movaps -2 * SIZE(Y), %xmm11 214 
mulpd %xmm7, %xmm12 215 movaps -2 * SIZE(X), %xmm7 216 addpd %xmm12, %xmm3 217 218 pshufd $0x4e, %xmm8, %xmm12 219 mulpd %xmm4, %xmm8 220 addpd %xmm8, %xmm0 221 mulpd %xmm4, %xmm12 222 addpd %xmm12, %xmm1 223 224 pshufd $0x4e, %xmm9, %xmm12 225 mulpd %xmm5, %xmm9 226 addpd %xmm9, %xmm2 227 mulpd %xmm5, %xmm12 228 addpd %xmm12, %xmm3 229 230 pshufd $0x4e, %xmm10, %xmm12 231 mulpd %xmm6, %xmm10 232 addpd %xmm10, %xmm0 233 mulpd %xmm6, %xmm12 234 addpd %xmm12, %xmm1 235 236 pshufd $0x4e, %xmm11, %xmm12 237 mulpd %xmm7, %xmm11 238 addpd %xmm11, %xmm2 239 mulpd %xmm7, %xmm12 240 addpd %xmm12, %xmm3 241 242 subq $-16 * SIZE, X 243 subq $-16 * SIZE, Y 244 ALIGN_3 245 246.L15: 247 testq $4, N 248 jle .L16 249 250 movaps -16 * SIZE(X), %xmm4 251 movaps -16 * SIZE(Y), %xmm8 252 movaps -14 * SIZE(X), %xmm5 253 movaps -14 * SIZE(Y), %xmm9 254 255 pshufd $0x4e, %xmm8, %xmm12 256 mulpd %xmm4, %xmm8 257 addpd %xmm8, %xmm0 258 mulpd %xmm4, %xmm12 259 addpd %xmm12, %xmm1 260 261 pshufd $0x4e, %xmm9, %xmm12 262 mulpd %xmm5, %xmm9 263 addpd %xmm9, %xmm2 264 mulpd %xmm5, %xmm12 265 addpd %xmm12, %xmm3 266 267 movaps -12 * SIZE(X), %xmm6 268 movaps -12 * SIZE(Y), %xmm10 269 movaps -10 * SIZE(X), %xmm7 270 movaps -10 * SIZE(Y), %xmm11 271 272 pshufd $0x4e, %xmm10, %xmm12 273 mulpd %xmm6, %xmm10 274 addpd %xmm10, %xmm0 275 mulpd %xmm6, %xmm12 276 addpd %xmm12, %xmm1 277 278 pshufd $0x4e, %xmm11, %xmm12 279 mulpd %xmm7, %xmm11 280 addpd %xmm11, %xmm2 281 mulpd %xmm7, %xmm12 282 addpd %xmm12, %xmm3 283 284 addq $8 * SIZE, X 285 addq $8 * SIZE, Y 286 ALIGN_3 287 288.L16: 289 testq $2, N 290 jle .L17 291 292 movaps -16 * SIZE(X), %xmm4 293 movaps -16 * SIZE(Y), %xmm8 294 movaps -14 * SIZE(X), %xmm5 295 movaps -14 * SIZE(Y), %xmm9 296 297 pshufd $0x4e, %xmm8, %xmm12 298 mulpd %xmm4, %xmm8 299 addpd %xmm8, %xmm0 300 mulpd %xmm4, %xmm12 301 addpd %xmm12, %xmm1 302 303 pshufd $0x4e, %xmm9, %xmm12 304 mulpd %xmm5, %xmm9 305 addpd %xmm9, %xmm2 306 mulpd %xmm5, %xmm12 307 addpd %xmm12, %xmm3 308 
309 addq $4 * SIZE, X 310 addq $4 * SIZE, Y 311 ALIGN_3 312 313.L17: 314 testq $1, N 315 jle .L98 316 317 movaps -16 * SIZE(X), %xmm4 318 movaps -16 * SIZE(Y), %xmm8 319 320 pshufd $0x4e, %xmm8, %xmm12 321 mulpd %xmm4, %xmm8 322 addpd %xmm8, %xmm0 323 mulpd %xmm4, %xmm12 324 addpd %xmm12, %xmm1 325 jmp .L98 326 ALIGN_3 327 328.L20: 329 movq N, %rax 330 sarq $3, %rax 331 jle .L25 332 333 MOVLPS -16 * SIZE(X), %xmm4 334 movhps -15 * SIZE(X), %xmm4 335 MOVLPS -14 * SIZE(X), %xmm5 336 movhps -13 * SIZE(X), %xmm5 337 movaps -16 * SIZE(Y), %xmm8 338 movaps -14 * SIZE(Y), %xmm9 339 MOVLPS -12 * SIZE(X), %xmm6 340 movhps -11 * SIZE(X), %xmm6 341 MOVLPS -10 * SIZE(X), %xmm7 342 movhps -9 * SIZE(X), %xmm7 343 movaps -12 * SIZE(Y), %xmm10 344 movaps -10 * SIZE(Y), %xmm11 345 346 decq %rax 347 jle .L22 348 ALIGN_3 349 350.L21: 351#ifdef PREFETCH 352 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) 353#endif 354 355 pshufd $0x4e, %xmm8, %xmm12 356 mulpd %xmm4, %xmm8 357 addpd %xmm8, %xmm0 358 movaps -8 * SIZE(Y), %xmm8 359 mulpd %xmm4, %xmm12 360 MOVLPS -8 * SIZE(X), %xmm4 361 movhps -7 * SIZE(X), %xmm4 362 addpd %xmm12, %xmm1 363 364 pshufd $0x4e, %xmm9, %xmm12 365 mulpd %xmm5, %xmm9 366 addpd %xmm9, %xmm2 367 movaps -6 * SIZE(Y), %xmm9 368 mulpd %xmm5, %xmm12 369 MOVLPS -6 * SIZE(X), %xmm5 370 movhps -5 * SIZE(X), %xmm5 371 addpd %xmm12, %xmm3 372 373#ifdef PREFETCH 374 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) 375#endif 376 377 pshufd $0x4e, %xmm10, %xmm12 378 mulpd %xmm6, %xmm10 379 addpd %xmm10, %xmm0 380 movaps -4 * SIZE(Y), %xmm10 381 mulpd %xmm6, %xmm12 382 MOVLPS -4 * SIZE(X), %xmm6 383 movhps -3 * SIZE(X), %xmm6 384 addpd %xmm12, %xmm1 385 386 pshufd $0x4e, %xmm11, %xmm12 387 mulpd %xmm7, %xmm11 388 addpd %xmm11, %xmm2 389 movaps -2 * SIZE(Y), %xmm11 390 mulpd %xmm7, %xmm12 391 MOVLPS -2 * SIZE(X), %xmm7 392 movhps -1 * SIZE(X), %xmm7 393 addpd %xmm12, %xmm3 394 395#if defined(PREFETCH) && !defined(FETCH128) 396 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) 397#endif 398 399 
pshufd $0x4e, %xmm8, %xmm12 400 mulpd %xmm4, %xmm8 401 addpd %xmm8, %xmm0 402 movaps 0 * SIZE(Y), %xmm8 403 mulpd %xmm4, %xmm12 404 MOVLPS 0 * SIZE(X), %xmm4 405 movhps 1 * SIZE(X), %xmm4 406 addpd %xmm12, %xmm1 407 408 pshufd $0x4e, %xmm9, %xmm12 409 mulpd %xmm5, %xmm9 410 addpd %xmm9, %xmm2 411 movaps 2 * SIZE(Y), %xmm9 412 mulpd %xmm5, %xmm12 413 MOVLPS 2 * SIZE(X), %xmm5 414 movhps 3 * SIZE(X), %xmm5 415 addpd %xmm12, %xmm3 416 417#if defined(PREFETCH) && !defined(FETCH128) 418 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) 419#endif 420 421 pshufd $0x4e, %xmm10, %xmm12 422 mulpd %xmm6, %xmm10 423 addpd %xmm10, %xmm0 424 movaps 4 * SIZE(Y), %xmm10 425 mulpd %xmm6, %xmm12 426 MOVLPS 4 * SIZE(X), %xmm6 427 movhps 5 * SIZE(X), %xmm6 428 addpd %xmm12, %xmm1 429 430 pshufd $0x4e, %xmm11, %xmm12 431 mulpd %xmm7, %xmm11 432 addpd %xmm11, %xmm2 433 movaps 6 * SIZE(Y), %xmm11 434 mulpd %xmm7, %xmm12 435 MOVLPS 6 * SIZE(X), %xmm7 436 movhps 7 * SIZE(X), %xmm7 437 addpd %xmm12, %xmm3 438 439 subq $-16 * SIZE, X 440 subq $-16 * SIZE, Y 441 442 decq %rax 443 jg .L21 444 ALIGN_3 445 446.L22: 447 448 pshufd $0x4e, %xmm8, %xmm12 449 mulpd %xmm4, %xmm8 450 addpd %xmm8, %xmm0 451 movaps -8 * SIZE(Y), %xmm8 452 mulpd %xmm4, %xmm12 453 MOVLPS -8 * SIZE(X), %xmm4 454 movhps -7 * SIZE(X), %xmm4 455 addpd %xmm12, %xmm1 456 457 pshufd $0x4e, %xmm9, %xmm12 458 mulpd %xmm5, %xmm9 459 addpd %xmm9, %xmm2 460 movaps -6 * SIZE(Y), %xmm9 461 mulpd %xmm5, %xmm12 462 MOVLPS -6 * SIZE(X), %xmm5 463 movhps -5 * SIZE(X), %xmm5 464 addpd %xmm12, %xmm3 465 466 pshufd $0x4e, %xmm10, %xmm12 467 mulpd %xmm6, %xmm10 468 addpd %xmm10, %xmm0 469 movaps -4 * SIZE(Y), %xmm10 470 mulpd %xmm6, %xmm12 471 MOVLPS -4 * SIZE(X), %xmm6 472 movhps -3 * SIZE(X), %xmm6 473 addpd %xmm12, %xmm1 474 475 pshufd $0x4e, %xmm11, %xmm12 476 mulpd %xmm7, %xmm11 477 addpd %xmm11, %xmm2 478 movaps -2 * SIZE(Y), %xmm11 479 mulpd %xmm7, %xmm12 480 MOVLPS -2 * SIZE(X), %xmm7 481 movhps -1 * SIZE(X), %xmm7 482 addpd %xmm12, %xmm3 483 
484 pshufd $0x4e, %xmm8, %xmm12 485 mulpd %xmm4, %xmm8 486 addpd %xmm8, %xmm0 487 mulpd %xmm4, %xmm12 488 addpd %xmm12, %xmm1 489 490 pshufd $0x4e, %xmm9, %xmm12 491 mulpd %xmm5, %xmm9 492 addpd %xmm9, %xmm2 493 mulpd %xmm5, %xmm12 494 addpd %xmm12, %xmm3 495 496 pshufd $0x4e, %xmm10, %xmm12 497 mulpd %xmm6, %xmm10 498 addpd %xmm10, %xmm0 499 mulpd %xmm6, %xmm12 500 addpd %xmm12, %xmm1 501 502 pshufd $0x4e, %xmm11, %xmm12 503 mulpd %xmm7, %xmm11 504 addpd %xmm11, %xmm2 505 mulpd %xmm7, %xmm12 506 addpd %xmm12, %xmm3 507 508 subq $-16 * SIZE, X 509 subq $-16 * SIZE, Y 510 ALIGN_3 511 512.L25: 513 testq $4, N 514 jle .L26 515 516 MOVLPS -16 * SIZE(X), %xmm4 517 movhps -15 * SIZE(X), %xmm4 518 movaps -16 * SIZE(Y), %xmm8 519 520 pshufd $0x4e, %xmm8, %xmm12 521 mulpd %xmm4, %xmm8 522 addpd %xmm8, %xmm0 523 mulpd %xmm4, %xmm12 524 addpd %xmm12, %xmm1 525 526 MOVLPS -14 * SIZE(X), %xmm5 527 movhps -13 * SIZE(X), %xmm5 528 movaps -14 * SIZE(Y), %xmm9 529 530 pshufd $0x4e, %xmm9, %xmm12 531 mulpd %xmm5, %xmm9 532 addpd %xmm9, %xmm2 533 mulpd %xmm5, %xmm12 534 addpd %xmm12, %xmm3 535 536 MOVLPS -12 * SIZE(X), %xmm6 537 movhps -11 * SIZE(X), %xmm6 538 movaps -12 * SIZE(Y), %xmm10 539 540 pshufd $0x4e, %xmm10, %xmm12 541 mulpd %xmm6, %xmm10 542 addpd %xmm10, %xmm0 543 mulpd %xmm6, %xmm12 544 addpd %xmm12, %xmm1 545 546 MOVLPS -10 * SIZE(X), %xmm7 547 movhps -9 * SIZE(X), %xmm7 548 movaps -10 * SIZE(Y), %xmm11 549 550 pshufd $0x4e, %xmm11, %xmm12 551 mulpd %xmm7, %xmm11 552 addpd %xmm11, %xmm2 553 mulpd %xmm7, %xmm12 554 addpd %xmm12, %xmm3 555 556 addq $8 * SIZE, X 557 addq $8 * SIZE, Y 558 ALIGN_3 559 560.L26: 561 testq $2, N 562 jle .L27 563 564 MOVLPS -16 * SIZE(X), %xmm4 565 movhps -15 * SIZE(X), %xmm4 566 movaps -16 * SIZE(Y), %xmm8 567 568 pshufd $0x4e, %xmm8, %xmm12 569 mulpd %xmm4, %xmm8 570 addpd %xmm8, %xmm0 571 mulpd %xmm4, %xmm12 572 addpd %xmm12, %xmm1 573 574 MOVLPS -14 * SIZE(X), %xmm5 575 movhps -13 * SIZE(X), %xmm5 576 movaps -14 * SIZE(Y), %xmm9 577 578 
pshufd $0x4e, %xmm9, %xmm12 579 mulpd %xmm5, %xmm9 580 addpd %xmm9, %xmm2 581 mulpd %xmm5, %xmm12 582 addpd %xmm12, %xmm3 583 584 addq $4 * SIZE, X 585 addq $4 * SIZE, Y 586 ALIGN_3 587 588.L27: 589 testq $1, N 590 jle .L98 591 592 MOVLPS -16 * SIZE(X), %xmm4 593 movhps -15 * SIZE(X), %xmm4 594 movaps -16 * SIZE(Y), %xmm8 595 596 pshufd $0x4e, %xmm8, %xmm12 597 mulpd %xmm4, %xmm8 598 addpd %xmm8, %xmm0 599 mulpd %xmm4, %xmm12 600 addpd %xmm12, %xmm1 601 jmp .L98 602 ALIGN_3 603 604.L30: 605 testq $SIZE, X 606 jne .L40 607 608 movq N, %rax 609 sarq $3, %rax 610 jle .L35 611 612 MOVLPS -16 * SIZE(Y), %xmm4 613 movhps -15 * SIZE(Y), %xmm4 614 MOVLPS -14 * SIZE(Y), %xmm5 615 movhps -13 * SIZE(Y), %xmm5 616 movaps -16 * SIZE(X), %xmm8 617 movaps -14 * SIZE(X), %xmm9 618 MOVLPS -12 * SIZE(Y), %xmm6 619 movhps -11 * SIZE(Y), %xmm6 620 MOVLPS -10 * SIZE(Y), %xmm7 621 movhps -9 * SIZE(Y), %xmm7 622 movaps -12 * SIZE(X), %xmm10 623 movaps -10 * SIZE(X), %xmm11 624 625 decq %rax 626 jle .L32 627 ALIGN_3 628 629.L31: 630#ifdef PREFETCH 631 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) 632#endif 633 634 pshufd $0x4e, %xmm8, %xmm12 635 mulpd %xmm4, %xmm8 636 addpd %xmm8, %xmm0 637 movaps -8 * SIZE(X), %xmm8 638 mulpd %xmm4, %xmm12 639 MOVLPS -8 * SIZE(Y), %xmm4 640 movhps -7 * SIZE(Y), %xmm4 641 addpd %xmm12, %xmm1 642 643 pshufd $0x4e, %xmm9, %xmm12 644 mulpd %xmm5, %xmm9 645 addpd %xmm9, %xmm2 646 movaps -6 * SIZE(X), %xmm9 647 mulpd %xmm5, %xmm12 648 MOVLPS -6 * SIZE(Y), %xmm5 649 movhps -5 * SIZE(Y), %xmm5 650 addpd %xmm12, %xmm3 651 652#ifdef PREFETCH 653 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) 654#endif 655 656 pshufd $0x4e, %xmm10, %xmm12 657 mulpd %xmm6, %xmm10 658 addpd %xmm10, %xmm0 659 movaps -4 * SIZE(X), %xmm10 660 mulpd %xmm6, %xmm12 661 MOVLPS -4 * SIZE(Y), %xmm6 662 movhps -3 * SIZE(Y), %xmm6 663 addpd %xmm12, %xmm1 664 665 pshufd $0x4e, %xmm11, %xmm12 666 mulpd %xmm7, %xmm11 667 addpd %xmm11, %xmm2 668 movaps -2 * SIZE(X), %xmm11 669 mulpd %xmm7, %xmm12 670 
MOVLPS -2 * SIZE(Y), %xmm7 671 movhps -1 * SIZE(Y), %xmm7 672 addpd %xmm12, %xmm3 673 674#if defined(PREFETCH) && !defined(FETCH128) 675 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) 676#endif 677 678 pshufd $0x4e, %xmm8, %xmm12 679 mulpd %xmm4, %xmm8 680 addpd %xmm8, %xmm0 681 movaps 0 * SIZE(X), %xmm8 682 mulpd %xmm4, %xmm12 683 MOVLPS 0 * SIZE(Y), %xmm4 684 movhps 1 * SIZE(Y), %xmm4 685 addpd %xmm12, %xmm1 686 687 pshufd $0x4e, %xmm9, %xmm12 688 mulpd %xmm5, %xmm9 689 addpd %xmm9, %xmm2 690 movaps 2 * SIZE(X), %xmm9 691 mulpd %xmm5, %xmm12 692 MOVLPS 2 * SIZE(Y), %xmm5 693 movhps 3 * SIZE(Y), %xmm5 694 addpd %xmm12, %xmm3 695 696#if defined(PREFETCH) && !defined(FETCH128) 697 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) 698#endif 699 700 pshufd $0x4e, %xmm10, %xmm12 701 mulpd %xmm6, %xmm10 702 addpd %xmm10, %xmm0 703 movaps 4 * SIZE(X), %xmm10 704 mulpd %xmm6, %xmm12 705 MOVLPS 4 * SIZE(Y), %xmm6 706 movhps 5 * SIZE(Y), %xmm6 707 addpd %xmm12, %xmm1 708 709 pshufd $0x4e, %xmm11, %xmm12 710 mulpd %xmm7, %xmm11 711 addpd %xmm11, %xmm2 712 movaps 6 * SIZE(X), %xmm11 713 mulpd %xmm7, %xmm12 714 MOVLPS 6 * SIZE(Y), %xmm7 715 movhps 7 * SIZE(Y), %xmm7 716 addpd %xmm12, %xmm3 717 718 subq $-16 * SIZE, X 719 subq $-16 * SIZE, Y 720 721 decq %rax 722 jg .L31 723 ALIGN_3 724 725.L32: 726 727 pshufd $0x4e, %xmm8, %xmm12 728 mulpd %xmm4, %xmm8 729 addpd %xmm8, %xmm0 730 movaps -8 * SIZE(X), %xmm8 731 mulpd %xmm4, %xmm12 732 MOVLPS -8 * SIZE(Y), %xmm4 733 movhps -7 * SIZE(Y), %xmm4 734 addpd %xmm12, %xmm1 735 736 pshufd $0x4e, %xmm9, %xmm12 737 mulpd %xmm5, %xmm9 738 addpd %xmm9, %xmm2 739 movaps -6 * SIZE(X), %xmm9 740 mulpd %xmm5, %xmm12 741 MOVLPS -6 * SIZE(Y), %xmm5 742 movhps -5 * SIZE(Y), %xmm5 743 addpd %xmm12, %xmm3 744 745 pshufd $0x4e, %xmm10, %xmm12 746 mulpd %xmm6, %xmm10 747 addpd %xmm10, %xmm0 748 movaps -4 * SIZE(X), %xmm10 749 mulpd %xmm6, %xmm12 750 MOVLPS -4 * SIZE(Y), %xmm6 751 movhps -3 * SIZE(Y), %xmm6 752 addpd %xmm12, %xmm1 753 754 pshufd $0x4e, %xmm11, 
%xmm12 755 mulpd %xmm7, %xmm11 756 addpd %xmm11, %xmm2 757 movaps -2 * SIZE(X), %xmm11 758 mulpd %xmm7, %xmm12 759 MOVLPS -2 * SIZE(Y), %xmm7 760 movhps -1 * SIZE(Y), %xmm7 761 addpd %xmm12, %xmm3 762 763 pshufd $0x4e, %xmm8, %xmm12 764 mulpd %xmm4, %xmm8 765 addpd %xmm8, %xmm0 766 mulpd %xmm4, %xmm12 767 addpd %xmm12, %xmm1 768 769 pshufd $0x4e, %xmm9, %xmm12 770 mulpd %xmm5, %xmm9 771 addpd %xmm9, %xmm2 772 mulpd %xmm5, %xmm12 773 addpd %xmm12, %xmm3 774 775 pshufd $0x4e, %xmm10, %xmm12 776 mulpd %xmm6, %xmm10 777 addpd %xmm10, %xmm0 778 mulpd %xmm6, %xmm12 779 addpd %xmm12, %xmm1 780 781 pshufd $0x4e, %xmm11, %xmm12 782 mulpd %xmm7, %xmm11 783 addpd %xmm11, %xmm2 784 mulpd %xmm7, %xmm12 785 addpd %xmm12, %xmm3 786 787 subq $-16 * SIZE, X 788 subq $-16 * SIZE, Y 789 ALIGN_3 790 791.L35: 792 testq $4, N 793 jle .L36 794 795 MOVLPS -16 * SIZE(Y), %xmm4 796 movhps -15 * SIZE(Y), %xmm4 797 movaps -16 * SIZE(X), %xmm8 798 799 pshufd $0x4e, %xmm8, %xmm12 800 mulpd %xmm4, %xmm8 801 addpd %xmm8, %xmm0 802 mulpd %xmm4, %xmm12 803 addpd %xmm12, %xmm1 804 805 MOVLPS -14 * SIZE(Y), %xmm5 806 movhps -13 * SIZE(Y), %xmm5 807 movaps -14 * SIZE(X), %xmm9 808 809 pshufd $0x4e, %xmm9, %xmm12 810 mulpd %xmm5, %xmm9 811 addpd %xmm9, %xmm2 812 mulpd %xmm5, %xmm12 813 addpd %xmm12, %xmm3 814 815 MOVLPS -12 * SIZE(Y), %xmm6 816 movhps -11 * SIZE(Y), %xmm6 817 movaps -12 * SIZE(X), %xmm10 818 819 pshufd $0x4e, %xmm10, %xmm12 820 mulpd %xmm6, %xmm10 821 addpd %xmm10, %xmm0 822 mulpd %xmm6, %xmm12 823 addpd %xmm12, %xmm1 824 825 MOVLPS -10 * SIZE(Y), %xmm7 826 movhps -9 * SIZE(Y), %xmm7 827 movaps -10 * SIZE(X), %xmm11 828 829 pshufd $0x4e, %xmm11, %xmm12 830 mulpd %xmm7, %xmm11 831 addpd %xmm11, %xmm2 832 mulpd %xmm7, %xmm12 833 addpd %xmm12, %xmm3 834 835 addq $8 * SIZE, X 836 addq $8 * SIZE, Y 837 ALIGN_3 838 839.L36: 840 testq $2, N 841 jle .L37 842 843 MOVLPS -16 * SIZE(Y), %xmm4 844 movhps -15 * SIZE(Y), %xmm4 845 movaps -16 * SIZE(X), %xmm8 846 847 pshufd $0x4e, %xmm8, %xmm12 848 
mulpd %xmm4, %xmm8 849 addpd %xmm8, %xmm0 850 mulpd %xmm4, %xmm12 851 addpd %xmm12, %xmm1 852 853 MOVLPS -14 * SIZE(Y), %xmm5 854 movhps -13 * SIZE(Y), %xmm5 855 movaps -14 * SIZE(X), %xmm9 856 857 pshufd $0x4e, %xmm9, %xmm12 858 mulpd %xmm5, %xmm9 859 addpd %xmm9, %xmm2 860 mulpd %xmm5, %xmm12 861 addpd %xmm12, %xmm3 862 863 addq $4 * SIZE, X 864 addq $4 * SIZE, Y 865 ALIGN_3 866 867.L37: 868 SHUFPD_1 %xmm1, %xmm1 869 SHUFPD_1 %xmm3, %xmm3 870 871 testq $1, N 872 jle .L98 873 874 MOVLPS -16 * SIZE(Y), %xmm4 875 movhps -15 * SIZE(Y), %xmm4 876 movaps -16 * SIZE(X), %xmm8 877 878 pshufd $0x4e, %xmm8, %xmm12 879 mulpd %xmm4, %xmm8 880 addpd %xmm8, %xmm0 881 mulpd %xmm4, %xmm12 882 SHUFPD_1 %xmm12, %xmm12 883 addpd %xmm12, %xmm1 884 jmp .L98 885 ALIGN_3 886 887.L40: 888 movhps -16 * SIZE(X), %xmm4 889 addq $SIZE, X 890 movhps -16 * SIZE(Y), %xmm8 891 addq $SIZE, Y 892 893 movq N, %rax 894 sarq $3, %rax 895 jle .L45 896 897 movaps -16 * SIZE(X), %xmm5 898 movaps -16 * SIZE(Y), %xmm9 899 movaps -14 * SIZE(X), %xmm6 900 movaps -14 * SIZE(Y), %xmm10 901 movaps -12 * SIZE(X), %xmm7 902 movaps -12 * SIZE(Y), %xmm11 903 decq %rax 904 jle .L42 905 ALIGN_3 906 907.L41: 908#ifdef PREFETCH 909 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) 910#endif 911 912 movsd %xmm9, %xmm8 913 pshufd $0x4e, %xmm8, %xmm12 914 movsd %xmm5, %xmm4 915 mulpd %xmm4, %xmm8 916 addpd %xmm8, %xmm0 917 movaps -10 * SIZE(Y), %xmm8 918 mulpd %xmm4, %xmm12 919 movaps -10 * SIZE(X), %xmm4 920 addpd %xmm12, %xmm1 921 922 movsd %xmm10, %xmm9 923 pshufd $0x4e, %xmm9, %xmm12 924 movsd %xmm6, %xmm5 925 mulpd %xmm5, %xmm9 926 addpd %xmm9, %xmm0 927 movaps -8 * SIZE(Y), %xmm9 928 mulpd %xmm5, %xmm12 929 movaps -8 * SIZE(X), %xmm5 930 addpd %xmm12, %xmm1 931 932#ifdef PREFETCH 933 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) 934#endif 935 936 movsd %xmm11, %xmm10 937 pshufd $0x4e, %xmm10, %xmm12 938 movsd %xmm7, %xmm6 939 mulpd %xmm6, %xmm10 940 addpd %xmm10, %xmm0 941 movaps -6 * SIZE(Y), %xmm10 942 mulpd %xmm6, %xmm12 
943 movaps -6 * SIZE(X), %xmm6 944 addpd %xmm12, %xmm1 945 946 movsd %xmm8, %xmm11 947 pshufd $0x4e, %xmm11, %xmm12 948 movsd %xmm4, %xmm7 949 mulpd %xmm7, %xmm11 950 addpd %xmm11, %xmm0 951 movaps -4 * SIZE(Y), %xmm11 952 mulpd %xmm7, %xmm12 953 movaps -4 * SIZE(X), %xmm7 954 addpd %xmm12, %xmm1 955 956#if defined(PREFETCH) && !defined(FETCH128) 957 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) 958#endif 959 960 movsd %xmm9, %xmm8 961 pshufd $0x4e, %xmm8, %xmm12 962 movsd %xmm5, %xmm4 963 mulpd %xmm4, %xmm8 964 addpd %xmm8, %xmm0 965 movaps -2 * SIZE(Y), %xmm8 966 mulpd %xmm4, %xmm12 967 movaps -2 * SIZE(X), %xmm4 968 addpd %xmm12, %xmm1 969 970 movsd %xmm10, %xmm9 971 pshufd $0x4e, %xmm9, %xmm12 972 movsd %xmm6, %xmm5 973 mulpd %xmm5, %xmm9 974 addpd %xmm9, %xmm0 975 movaps 0 * SIZE(Y), %xmm9 976 mulpd %xmm5, %xmm12 977 movaps 0 * SIZE(X), %xmm5 978 addpd %xmm12, %xmm1 979 980#if defined(PREFETCH) && !defined(FETCH128) 981 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) 982#endif 983 984 movsd %xmm11, %xmm10 985 pshufd $0x4e, %xmm10, %xmm12 986 movsd %xmm7, %xmm6 987 mulpd %xmm6, %xmm10 988 addpd %xmm10, %xmm0 989 movaps 2 * SIZE(Y), %xmm10 990 mulpd %xmm6, %xmm12 991 movaps 2 * SIZE(X), %xmm6 992 addpd %xmm12, %xmm1 993 994 movsd %xmm8, %xmm11 995 pshufd $0x4e, %xmm11, %xmm12 996 movsd %xmm4, %xmm7 997 mulpd %xmm7, %xmm11 998 addpd %xmm11, %xmm0 999 movaps 4 * SIZE(Y), %xmm11 1000 mulpd %xmm7, %xmm12 1001 movaps 4 * SIZE(X), %xmm7 1002 addpd %xmm12, %xmm1 1003 1004 subq $-16 * SIZE, X 1005 subq $-16 * SIZE, Y 1006 1007 decq %rax 1008 jg .L41 1009 ALIGN_3 1010 1011.L42: 1012 movsd %xmm9, %xmm8 1013 pshufd $0x4e, %xmm8, %xmm12 1014 movsd %xmm5, %xmm4 1015 mulpd %xmm4, %xmm8 1016 addpd %xmm8, %xmm0 1017 movaps -10 * SIZE(Y), %xmm8 1018 mulpd %xmm4, %xmm12 1019 movaps -10 * SIZE(X), %xmm4 1020 addpd %xmm12, %xmm1 1021 1022 movsd %xmm10, %xmm9 1023 pshufd $0x4e, %xmm9, %xmm12 1024 movsd %xmm6, %xmm5 1025 mulpd %xmm5, %xmm9 1026 addpd %xmm9, %xmm0 1027 movaps -8 * SIZE(Y), 
%xmm9 1028 mulpd %xmm5, %xmm12 1029 movaps -8 * SIZE(X), %xmm5 1030 addpd %xmm12, %xmm1 1031 1032 movsd %xmm11, %xmm10 1033 pshufd $0x4e, %xmm10, %xmm12 1034 movsd %xmm7, %xmm6 1035 mulpd %xmm6, %xmm10 1036 addpd %xmm10, %xmm0 1037 movaps -6 * SIZE(Y), %xmm10 1038 mulpd %xmm6, %xmm12 1039 movaps -6 * SIZE(X), %xmm6 1040 addpd %xmm12, %xmm1 1041 1042 movsd %xmm8, %xmm11 1043 pshufd $0x4e, %xmm11, %xmm12 1044 movsd %xmm4, %xmm7 1045 mulpd %xmm7, %xmm11 1046 addpd %xmm11, %xmm0 1047 movaps -4 * SIZE(Y), %xmm11 1048 mulpd %xmm7, %xmm12 1049 movaps -4 * SIZE(X), %xmm7 1050 addpd %xmm12, %xmm1 1051 1052 movsd %xmm9, %xmm8 1053 pshufd $0x4e, %xmm8, %xmm12 1054 movsd %xmm5, %xmm4 1055 mulpd %xmm4, %xmm8 1056 addpd %xmm8, %xmm0 1057 movaps -2 * SIZE(Y), %xmm8 1058 mulpd %xmm4, %xmm12 1059 movaps -2 * SIZE(X), %xmm4 1060 addpd %xmm12, %xmm1 1061 1062 movsd %xmm10, %xmm9 1063 pshufd $0x4e, %xmm9, %xmm12 1064 movsd %xmm6, %xmm5 1065 mulpd %xmm5, %xmm9 1066 addpd %xmm9, %xmm0 1067 mulpd %xmm5, %xmm12 1068 addpd %xmm12, %xmm1 1069 1070 movsd %xmm11, %xmm10 1071 pshufd $0x4e, %xmm10, %xmm12 1072 movsd %xmm7, %xmm6 1073 mulpd %xmm6, %xmm10 1074 addpd %xmm10, %xmm0 1075 mulpd %xmm6, %xmm12 1076 addpd %xmm12, %xmm1 1077 1078 movsd %xmm8, %xmm11 1079 pshufd $0x4e, %xmm11, %xmm12 1080 movsd %xmm4, %xmm7 1081 mulpd %xmm7, %xmm11 1082 addpd %xmm11, %xmm0 1083 mulpd %xmm7, %xmm12 1084 addpd %xmm12, %xmm1 1085 1086 subq $-16 * SIZE, X 1087 subq $-16 * SIZE, Y 1088 ALIGN_3 1089 1090.L45: 1091 testq $4, N 1092 jle .L46 1093 1094 movaps -16 * SIZE(X), %xmm5 1095 movaps -16 * SIZE(Y), %xmm9 1096 movaps -14 * SIZE(X), %xmm6 1097 movaps -14 * SIZE(Y), %xmm10 1098 1099 movsd %xmm9, %xmm8 1100 pshufd $0x4e, %xmm8, %xmm12 1101 movsd %xmm5, %xmm4 1102 mulpd %xmm4, %xmm8 1103 addpd %xmm8, %xmm0 1104 mulpd %xmm4, %xmm12 1105 addpd %xmm12, %xmm1 1106 1107 movaps -12 * SIZE(X), %xmm7 1108 movaps -12 * SIZE(Y), %xmm11 1109 1110 movsd %xmm10, %xmm9 1111 pshufd $0x4e, %xmm9, %xmm12 1112 movsd %xmm6, %xmm5 
1113 mulpd %xmm5, %xmm9 1114 addpd %xmm9, %xmm0 1115 mulpd %xmm5, %xmm12 1116 addpd %xmm12, %xmm1 1117 1118 movaps -10 * SIZE(X), %xmm4 1119 movaps -10 * SIZE(Y), %xmm8 1120 1121 movsd %xmm11, %xmm10 1122 pshufd $0x4e, %xmm10, %xmm12 1123 movsd %xmm7, %xmm6 1124 mulpd %xmm6, %xmm10 1125 addpd %xmm10, %xmm0 1126 mulpd %xmm6, %xmm12 1127 addpd %xmm12, %xmm1 1128 1129 movsd %xmm8, %xmm11 1130 pshufd $0x4e, %xmm11, %xmm12 1131 movsd %xmm4, %xmm7 1132 mulpd %xmm7, %xmm11 1133 addpd %xmm11, %xmm0 1134 mulpd %xmm7, %xmm12 1135 addpd %xmm12, %xmm1 1136 1137 addq $8 * SIZE, X 1138 addq $8 * SIZE, Y 1139 ALIGN_3 1140 1141.L46: 1142 testq $2, N 1143 jle .L47 1144 1145 movaps -16 * SIZE(X), %xmm5 1146 movaps -16 * SIZE(Y), %xmm9 1147 1148 movsd %xmm9, %xmm8 1149 pshufd $0x4e, %xmm8, %xmm12 1150 movsd %xmm5, %xmm4 1151 mulpd %xmm4, %xmm8 1152 addpd %xmm8, %xmm0 1153 mulpd %xmm4, %xmm12 1154 addpd %xmm12, %xmm1 1155 1156 movaps -14 * SIZE(X), %xmm6 1157 movaps -14 * SIZE(Y), %xmm10 1158 1159 movsd %xmm10, %xmm9 1160 pshufd $0x4e, %xmm9, %xmm12 1161 movsd %xmm6, %xmm5 1162 mulpd %xmm5, %xmm9 1163 addpd %xmm9, %xmm0 1164 mulpd %xmm5, %xmm12 1165 addpd %xmm12, %xmm1 1166 1167 movaps %xmm6, %xmm4 1168 movaps %xmm10, %xmm8 1169 1170 addq $4 * SIZE, X 1171 addq $4 * SIZE, Y 1172 ALIGN_3 1173 1174.L47: 1175 testq $1, N 1176 jle .L48 1177 1178 movlps -16 * SIZE(X), %xmm4 1179 movlps -16 * SIZE(Y), %xmm8 1180 1181 pshufd $0x4e, %xmm8, %xmm12 1182 mulpd %xmm4, %xmm8 1183 addpd %xmm8, %xmm0 1184 mulpd %xmm4, %xmm12 1185 addpd %xmm12, %xmm1 1186 ALIGN_3 1187 1188.L48: 1189 SHUFPD_1 %xmm0, %xmm0 1190 SHUFPD_1 %xmm1, %xmm1 1191 SHUFPD_1 %xmm2, %xmm2 1192 SHUFPD_1 %xmm3, %xmm3 1193 jmp .L98 1194 ALIGN_3 1195 1196.L50: 1197 movq N, %rax 1198 sarq $3, %rax 1199 jle .L55 1200 1201 MOVLPS 0 * SIZE(X), %xmm4 1202 movhps 1 * SIZE(X), %xmm4 1203 addq INCX, X 1204 MOVLPS 0 * SIZE(Y), %xmm8 1205 movhps 1 * SIZE(Y), %xmm8 1206 addq INCY, Y 1207 1208 MOVLPS 0 * SIZE(X), %xmm5 1209 movhps 1 * SIZE(X), 
%xmm5 1210 addq INCX, X 1211 MOVLPS 0 * SIZE(Y), %xmm9 1212 movhps 1 * SIZE(Y), %xmm9 1213 addq INCY, Y 1214 1215 MOVLPS 0 * SIZE(X), %xmm6 1216 movhps 1 * SIZE(X), %xmm6 1217 addq INCX, X 1218 MOVLPS 0 * SIZE(Y), %xmm10 1219 movhps 1 * SIZE(Y), %xmm10 1220 addq INCY, Y 1221 1222 MOVLPS 0 * SIZE(X), %xmm7 1223 movhps 1 * SIZE(X), %xmm7 1224 addq INCX, X 1225 MOVLPS 0 * SIZE(Y), %xmm11 1226 movhps 1 * SIZE(Y), %xmm11 1227 addq INCY, Y 1228 1229 decq %rax 1230 jle .L54 1231 ALIGN_3 1232 1233.L53: 1234 pshufd $0x4e, %xmm8, %xmm12 1235 mulpd %xmm4, %xmm8 1236 addpd %xmm8, %xmm0 1237 MOVLPS 0 * SIZE(Y), %xmm8 1238 movhps 1 * SIZE(Y), %xmm8 1239 addq INCY, Y 1240 mulpd %xmm4, %xmm12 1241 MOVLPS 0 * SIZE(X), %xmm4 1242 movhps 1 * SIZE(X), %xmm4 1243 addq INCX, X 1244 addpd %xmm12, %xmm1 1245 1246 pshufd $0x4e, %xmm9, %xmm12 1247 mulpd %xmm5, %xmm9 1248 addpd %xmm9, %xmm2 1249 MOVLPS 0 * SIZE(Y), %xmm9 1250 movhps 1 * SIZE(Y), %xmm9 1251 addq INCY, Y 1252 mulpd %xmm5, %xmm12 1253 MOVLPS 0 * SIZE(X), %xmm5 1254 movhps 1 * SIZE(X), %xmm5 1255 addq INCX, X 1256 addpd %xmm12, %xmm3 1257 1258 pshufd $0x4e, %xmm10, %xmm12 1259 mulpd %xmm6, %xmm10 1260 addpd %xmm10, %xmm0 1261 MOVLPS 0 * SIZE(Y), %xmm10 1262 movhps 1 * SIZE(Y), %xmm10 1263 addq INCY, Y 1264 mulpd %xmm6, %xmm12 1265 MOVLPS 0 * SIZE(X), %xmm6 1266 movhps 1 * SIZE(X), %xmm6 1267 addq INCX, X 1268 addpd %xmm12, %xmm1 1269 1270 pshufd $0x4e, %xmm11, %xmm12 1271 mulpd %xmm7, %xmm11 1272 addpd %xmm11, %xmm2 1273 MOVLPS 0 * SIZE(Y), %xmm11 1274 movhps 1 * SIZE(Y), %xmm11 1275 addq INCY, Y 1276 mulpd %xmm7, %xmm12 1277 MOVLPS 0 * SIZE(X), %xmm7 1278 movhps 1 * SIZE(X), %xmm7 1279 addq INCX, X 1280 addpd %xmm12, %xmm3 1281 1282 pshufd $0x4e, %xmm8, %xmm12 1283 mulpd %xmm4, %xmm8 1284 addpd %xmm8, %xmm0 1285 MOVLPS 0 * SIZE(Y), %xmm8 1286 movhps 1 * SIZE(Y), %xmm8 1287 addq INCY, Y 1288 1289 mulpd %xmm4, %xmm12 1290 MOVLPS 0 * SIZE(X), %xmm4 1291 movhps 1 * SIZE(X), %xmm4 1292 addq INCX, X 1293 addpd %xmm12, %xmm1 1294 1295 
pshufd $0x4e, %xmm9, %xmm12 1296 mulpd %xmm5, %xmm9 1297 addpd %xmm9, %xmm2 1298 MOVLPS 0 * SIZE(Y), %xmm9 1299 movhps 1 * SIZE(Y), %xmm9 1300 addq INCY, Y 1301 1302 mulpd %xmm5, %xmm12 1303 MOVLPS 0 * SIZE(X), %xmm5 1304 movhps 1 * SIZE(X), %xmm5 1305 addq INCX, X 1306 addpd %xmm12, %xmm3 1307 1308 pshufd $0x4e, %xmm10, %xmm12 1309 mulpd %xmm6, %xmm10 1310 addpd %xmm10, %xmm0 1311 MOVLPS 0 * SIZE(Y), %xmm10 1312 movhps 1 * SIZE(Y), %xmm10 1313 addq INCY, Y 1314 mulpd %xmm6, %xmm12 1315 MOVLPS 0 * SIZE(X), %xmm6 1316 movhps 1 * SIZE(X), %xmm6 1317 addq INCX, X 1318 addpd %xmm12, %xmm1 1319 1320 pshufd $0x4e, %xmm11, %xmm12 1321 mulpd %xmm7, %xmm11 1322 addpd %xmm11, %xmm2 1323 MOVLPS 0 * SIZE(Y), %xmm11 1324 movhps 1 * SIZE(Y), %xmm11 1325 addq INCY, Y 1326 mulpd %xmm7, %xmm12 1327 MOVLPS 0 * SIZE(X), %xmm7 1328 movhps 1 * SIZE(X), %xmm7 1329 addq INCX, X 1330 addpd %xmm12, %xmm3 1331 1332 decq %rax 1333 jg .L53 1334 ALIGN_3 1335 1336.L54: 1337 pshufd $0x4e, %xmm8, %xmm12 1338 mulpd %xmm4, %xmm8 1339 addpd %xmm8, %xmm0 1340 MOVLPS 0 * SIZE(Y), %xmm8 1341 movhps 1 * SIZE(Y), %xmm8 1342 addq INCY, Y 1343 mulpd %xmm4, %xmm12 1344 MOVLPS 0 * SIZE(X), %xmm4 1345 movhps 1 * SIZE(X), %xmm4 1346 addq INCX, X 1347 addpd %xmm12, %xmm1 1348 1349 pshufd $0x4e, %xmm9, %xmm12 1350 mulpd %xmm5, %xmm9 1351 addpd %xmm9, %xmm2 1352 MOVLPS 0 * SIZE(Y), %xmm9 1353 movhps 1 * SIZE(Y), %xmm9 1354 addq INCY, Y 1355 mulpd %xmm5, %xmm12 1356 MOVLPS 0 * SIZE(X), %xmm5 1357 movhps 1 * SIZE(X), %xmm5 1358 addq INCX, X 1359 addpd %xmm12, %xmm3 1360 1361 pshufd $0x4e, %xmm10, %xmm12 1362 mulpd %xmm6, %xmm10 1363 addpd %xmm10, %xmm0 1364 MOVLPS 0 * SIZE(Y), %xmm10 1365 movhps 1 * SIZE(Y), %xmm10 1366 addq INCY, Y 1367 mulpd %xmm6, %xmm12 1368 MOVLPS 0 * SIZE(X), %xmm6 1369 movhps 1 * SIZE(X), %xmm6 1370 addq INCX, X 1371 addpd %xmm12, %xmm1 1372 1373 pshufd $0x4e, %xmm11, %xmm12 1374 mulpd %xmm7, %xmm11 1375 addpd %xmm11, %xmm2 1376 MOVLPS 0 * SIZE(Y), %xmm11 1377 movhps 1 * SIZE(Y), %xmm11 1378 
addq INCY, Y 1379 mulpd %xmm7, %xmm12 1380 MOVLPS 0 * SIZE(X), %xmm7 1381 movhps 1 * SIZE(X), %xmm7 1382 addq INCX, X 1383 addpd %xmm12, %xmm3 1384 1385 pshufd $0x4e, %xmm8, %xmm12 1386 mulpd %xmm4, %xmm8 1387 addpd %xmm8, %xmm0 1388 mulpd %xmm4, %xmm12 1389 addpd %xmm12, %xmm1 1390 1391 pshufd $0x4e, %xmm9, %xmm12 1392 mulpd %xmm5, %xmm9 1393 addpd %xmm9, %xmm2 1394 mulpd %xmm5, %xmm12 1395 addpd %xmm12, %xmm3 1396 1397 pshufd $0x4e, %xmm10, %xmm12 1398 mulpd %xmm6, %xmm10 1399 addpd %xmm10, %xmm0 1400 mulpd %xmm6, %xmm12 1401 addpd %xmm12, %xmm1 1402 1403 pshufd $0x4e, %xmm11, %xmm12 1404 mulpd %xmm7, %xmm11 1405 addpd %xmm11, %xmm2 1406 mulpd %xmm7, %xmm12 1407 addpd %xmm12, %xmm3 1408 ALIGN_3 1409 1410.L55: 1411 testq $4, N 1412 jle .L56 1413 1414 MOVLPS 0 * SIZE(X), %xmm4 1415 movhps 1 * SIZE(X), %xmm4 1416 addq INCX, X 1417 MOVLPS 0 * SIZE(Y), %xmm8 1418 movhps 1 * SIZE(Y), %xmm8 1419 addq INCY, Y 1420 1421 pshufd $0x4e, %xmm8, %xmm12 1422 mulpd %xmm4, %xmm8 1423 addpd %xmm8, %xmm0 1424 mulpd %xmm4, %xmm12 1425 addpd %xmm12, %xmm1 1426 1427 MOVLPS 0 * SIZE(X), %xmm5 1428 movhps 1 * SIZE(X), %xmm5 1429 addq INCX, X 1430 MOVLPS 0 * SIZE(Y), %xmm9 1431 movhps 1 * SIZE(Y), %xmm9 1432 addq INCY, Y 1433 1434 pshufd $0x4e, %xmm9, %xmm12 1435 mulpd %xmm5, %xmm9 1436 addpd %xmm9, %xmm2 1437 mulpd %xmm5, %xmm12 1438 addpd %xmm12, %xmm3 1439 1440 MOVLPS 0 * SIZE(X), %xmm6 1441 movhps 1 * SIZE(X), %xmm6 1442 addq INCX, X 1443 MOVLPS 0 * SIZE(Y), %xmm10 1444 movhps 1 * SIZE(Y), %xmm10 1445 addq INCY, Y 1446 1447 pshufd $0x4e, %xmm10, %xmm12 1448 mulpd %xmm6, %xmm10 1449 addpd %xmm10, %xmm0 1450 mulpd %xmm6, %xmm12 1451 addpd %xmm12, %xmm1 1452 1453 MOVLPS 0 * SIZE(X), %xmm7 1454 movhps 1 * SIZE(X), %xmm7 1455 addq INCX, X 1456 MOVLPS 0 * SIZE(Y), %xmm11 1457 movhps 1 * SIZE(Y), %xmm11 1458 addq INCY, Y 1459 1460 pshufd $0x4e, %xmm11, %xmm12 1461 mulpd %xmm7, %xmm11 1462 addpd %xmm11, %xmm2 1463 mulpd %xmm7, %xmm12 1464 addpd %xmm12, %xmm3 1465 ALIGN_3 1466 1467.L56: 1468 
testq $2, N 1469 jle .L57 1470 1471 MOVLPS 0 * SIZE(X), %xmm4 1472 movhps 1 * SIZE(X), %xmm4 1473 addq INCX, X 1474 MOVLPS 0 * SIZE(Y), %xmm8 1475 movhps 1 * SIZE(Y), %xmm8 1476 addq INCY, Y 1477 1478 pshufd $0x4e, %xmm8, %xmm12 1479 mulpd %xmm4, %xmm8 1480 addpd %xmm8, %xmm0 1481 mulpd %xmm4, %xmm12 1482 addpd %xmm12, %xmm1 1483 1484 MOVLPS 0 * SIZE(X), %xmm5 1485 movhps 1 * SIZE(X), %xmm5 1486 addq INCX, X 1487 MOVLPS 0 * SIZE(Y), %xmm9 1488 movhps 1 * SIZE(Y), %xmm9 1489 addq INCY, Y 1490 1491 pshufd $0x4e, %xmm9, %xmm12 1492 mulpd %xmm5, %xmm9 1493 addpd %xmm9, %xmm2 1494 mulpd %xmm5, %xmm12 1495 addpd %xmm12, %xmm3 1496 ALIGN_3 1497 1498.L57: 1499 testq $1, N 1500 jle .L98 1501 1502 MOVLPS 0 * SIZE(X), %xmm4 1503 movhps 1 * SIZE(X), %xmm4 1504 MOVLPS 0 * SIZE(Y), %xmm8 1505 movhps 1 * SIZE(Y), %xmm8 1506 1507 pshufd $0x4e, %xmm8, %xmm12 1508 mulpd %xmm4, %xmm8 1509 addpd %xmm8, %xmm0 1510 mulpd %xmm4, %xmm12 1511 addpd %xmm12, %xmm1 1512 ALIGN_3 1513 1514.L98: 1515 addpd %xmm2, %xmm0 1516 addpd %xmm3, %xmm1 1517 1518 pshufd $0x4e, %xmm0, %xmm2 1519 pshufd $0x4e, %xmm1, %xmm3 1520 1521.L999: 1522#ifndef CONJ 1523 subsd %xmm2, %xmm0 1524 addsd %xmm3, %xmm1 1525#else 1526 addsd %xmm2, %xmm0 1527 subsd %xmm3, %xmm1 1528#endif 1529 1530 RESTOREREGISTERS 1531 ret 1532 1533 EPILOGUE 1534