/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
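
/* Annotation (added): this kernel appears to compute a single-      */
/* precision complex dot product of X and Y with SSE.  xmm0-xmm3     */
/* accumulate element-wise products of x and y, and pshufd $0xb1     */
/* swaps the real/imaginary halves of each complex pair so the       */
/* cross products needed for the imaginary part are collected in     */
/* parallel; the accumulators are combined after the jump to .L98.   */
/* Non-unit strides (INCX or INCY != 2 * SIZE) branch to .L200.      */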

#define ASSEMBLER
#include "common.h"

#define N	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8 */
#else
#define INCY	%r10
#endif

#include "l1param.h"

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

	testq	N, N
	jle	.L999

	cmpq	$2 * SIZE, INCX
	jne	.L200
	cmpq	$2 * SIZE, INCY
	jne	.L200

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	testq	$SIZE, X
	jne	.L50

.L0x:
	testq	$2 * SIZE, X
	je	.L10

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm0

	pshufd	$0xb1, %xmm0, %xmm1
	mulps	%xmm4, %xmm0
	mulps	%xmm4, %xmm1
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	decq	N
	ALIGN_3

.L10:
	testq	$3 * SIZE, Y
	jne	.L20

	movq	N, %rax
	sarq	$4, %rax
	jle	.L15

	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm8
	movaps	-28 * SIZE(Y), %xmm9
	movaps	-24 * SIZE(X), %xmm6
	movaps	-20 * SIZE(X), %xmm7
	movaps	-24 * SIZE(Y), %xmm10
	movaps	-20 * SIZE(Y), %xmm11

	decq	%rax
	jle	.L12
	ALIGN_3
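/* Annotation (added): .L11 is the main loop for the fully aligned   */
/* case (X and Y both 16-byte aligned).  Each iteration handles 16   */
/* complex elements (32 floats), loading one step ahead of the       */
/* multiplies; .L12 drains the last, already-loaded iteration.       */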
.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movaps	-16 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movaps	-12 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movaps	-8 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	-8 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movaps	-4 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	-4 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movaps	0 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	0 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movaps	4 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	4 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movaps	8 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	8 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movaps	12 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	12 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movaps	-16 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movaps	-12 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movaps	-8 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	-8 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movaps	-4 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	-4 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm3

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	mulps	%xmm6, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	mulps	%xmm7, %xmm12
	addps	%xmm12, %xmm3

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L15:
	testq	$8, N
	jle	.L16

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm8

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm9

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm3

	movaps	-24 * SIZE(X), %xmm6
	movaps	-24 * SIZE(Y), %xmm10

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	mulps	%xmm6, %xmm12
	addps	%xmm12, %xmm1

	movaps	-20 * SIZE(X), %xmm7
	movaps	-20 * SIZE(Y), %xmm11

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	mulps	%xmm7, %xmm12
	addps	%xmm12, %xmm3

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L16:
	testq	$4, N
	jle	.L17

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm8
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm9

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm3

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L17:
	testq	$2, N
	jle	.L18

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm8

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L18:
	testq	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
#ifdef movsd
	xorps	%xmm8, %xmm8
#endif
	movsd	-32 * SIZE(Y), %xmm8

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3
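/* Annotation (added): .L20 is reached when X is 16-byte aligned but */
/* Y is not.  The fall-through path handles Y offset by one float:   */
/* Y is read with aligned movaps and shifted into place with         */
/* movss + shufps $0x39, and xmm1 is pre-rotated (shufps $0xb1) so   */
/* the mismatch is undone at .L29.                                   */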
.L20:
#ifdef ALIGNED_ACCESS

	testq	$2 * SIZE, Y
	jne	.L30

	movaps	-33 * SIZE(Y), %xmm8
	addq	$3 * SIZE, Y

	shufps	$0xb1, %xmm1, %xmm1

	movq	N, %rax
	sarq	$4, %rax
	jle	.L25

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm10
	movaps	-24 * SIZE(X), %xmm6
	movaps	-24 * SIZE(Y), %xmm11
	movaps	-20 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
	ALIGN_3

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(Y), %xmm9
	addps	%xmm12, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(X), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(Y), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(X), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(Y), %xmm11
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	0 * SIZE(X), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	4 * SIZE(X), %xmm5
	mulps	%xmm9, %xmm12
	movaps	0 * SIZE(Y), %xmm9
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	8 * SIZE(X), %xmm6
	mulps	%xmm10, %xmm12
	movaps	4 * SIZE(Y), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	12 * SIZE(X), %xmm7
	mulps	%xmm11, %xmm12
	movaps	8 * SIZE(Y), %xmm11
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L21
	ALIGN_3
.L22:
	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(Y), %xmm9
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(X), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(Y), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(X), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(Y), %xmm11
	addps	%xmm12, %xmm1

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L25:
	testq	$8, N
	jle	.L26

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm10

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-24 * SIZE(X), %xmm6
	movaps	-24 * SIZE(Y), %xmm11

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	-20 * SIZE(X), %xmm7
	movaps	-20 * SIZE(Y), %xmm8

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L26:
	testq	$4, N
	jle	.L27

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm10

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm10, %xmm8

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L27:
	testq	$2, N
	jle	.L28

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm9, %xmm8
	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L28:
	testq	$1, N
	jle	.L29

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4

	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1
	ALIGN_3
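/* Annotation (added): .L29 rotates the cross-product accumulators   */
/* back (shufps $0xb1) to cancel the rotation applied on entry to    */
/* .L20, then joins the shared reduction at .L98.  .L30 below covers */
/* Y offset by two floats using plain movsd/movhps half-loads.       */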
.L29:
	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
	jmp	.L98
	ALIGN_3

.L30:

	testq	$SIZE, Y
	jne	.L40
#endif

	movq	N, %rax
	sarq	$4, %rax
	jle	.L35

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	movhps	-30 * SIZE(Y), %xmm8
	movaps	-28 * SIZE(X), %xmm5
	movsd	-28 * SIZE(Y), %xmm9
	movhps	-26 * SIZE(Y), %xmm9

	movaps	-24 * SIZE(X), %xmm6
	movsd	-24 * SIZE(Y), %xmm10
	movhps	-22 * SIZE(Y), %xmm10
	movaps	-20 * SIZE(X), %xmm7
	movsd	-20 * SIZE(Y), %xmm11
	movhps	-18 * SIZE(Y), %xmm11

	decq	%rax
	jle	.L32
	ALIGN_3

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movsd	-16 * SIZE(Y), %xmm8
	movhps	-14 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movsd	-12 * SIZE(Y), %xmm9
	movhps	-10 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movsd	-8 * SIZE(Y), %xmm10
	movhps	-6 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	-8 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movsd	-4 * SIZE(Y), %xmm11
	movhps	-2 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	-4 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movsd	0 * SIZE(Y), %xmm8
	movhps	2 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	0 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movsd	4 * SIZE(Y), %xmm9
	movhps	6 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	4 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movsd	8 * SIZE(Y), %xmm10
	movhps	10 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	8 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movsd	12 * SIZE(Y), %xmm11
	movhps	14 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	12 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L31
	ALIGN_3
.L32:
	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movsd	-16 * SIZE(Y), %xmm8
	movhps	-14 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movsd	-12 * SIZE(Y), %xmm9
	movhps	-10 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movsd	-8 * SIZE(Y), %xmm10
	movhps	-6 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	-8 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movsd	-4 * SIZE(Y), %xmm11
	movhps	-2 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	-4 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm3

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	mulps	%xmm6, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	mulps	%xmm7, %xmm12
	addps	%xmm12, %xmm3

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L35:
	testq	$8, N
	jle	.L36

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	movhps	-30 * SIZE(Y), %xmm8

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(X), %xmm5
	movsd	-28 * SIZE(Y), %xmm9
	movhps	-26 * SIZE(Y), %xmm9

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm3

	movaps	-24 * SIZE(X), %xmm6
	movsd	-24 * SIZE(Y), %xmm10
	movhps	-22 * SIZE(Y), %xmm10

	pshufd	$0xb1, %xmm10, %xmm12
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	mulps	%xmm6, %xmm12
	addps	%xmm12, %xmm1

	movaps	-20 * SIZE(X), %xmm7
	movsd	-20 * SIZE(Y), %xmm11
	movhps	-18 * SIZE(Y), %xmm11

	pshufd	$0xb1, %xmm11, %xmm12
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	mulps	%xmm7, %xmm12
	addps	%xmm12, %xmm3

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L36:
	testq	$4, N
	jle	.L37

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	movhps	-30 * SIZE(Y), %xmm8

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(X), %xmm5
	movsd	-28 * SIZE(Y), %xmm9
	movhps	-26 * SIZE(Y), %xmm9

	pshufd	$0xb1, %xmm9, %xmm12
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm3

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L37:
	testq	$2, N
	jle	.L38

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm8
	movhps	-30 * SIZE(Y), %xmm8

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L38:
	testq	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
#ifdef movsd
	xorps	%xmm8, %xmm8
#endif
	movsd	-32 * SIZE(Y), %xmm8

	pshufd	$0xb1, %xmm8, %xmm12
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	mulps	%xmm4, %xmm12
	addps	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3
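/* Annotation (added): .L40 covers Y offset by three floats.  Each   */
/* aligned Y vector is spliced with its predecessor via movss +      */
/* shufps $0x93, and .L49 undoes the accumulator rotation exactly    */
/* as .L29 does.                                                     */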
#ifdef ALIGNED_ACCESS
.L40:
	movaps	-35 * SIZE(Y), %xmm8
	addq	$1 * SIZE, Y

	shufps	$0xb1, %xmm1, %xmm1

	movq	N, %rax
	sarq	$4, %rax
	jle	.L45

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm10
	movaps	-24 * SIZE(X), %xmm6
	movaps	-24 * SIZE(Y), %xmm11
	movaps	-20 * SIZE(X), %xmm7

	decq	%rax
	jle	.L42
	ALIGN_3

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(Y), %xmm9
	addps	%xmm12, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(X), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(Y), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(X), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(Y), %xmm11
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	0 * SIZE(X), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	4 * SIZE(X), %xmm5
	mulps	%xmm9, %xmm12
	movaps	0 * SIZE(Y), %xmm9
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	8 * SIZE(X), %xmm6
	mulps	%xmm10, %xmm12
	movaps	4 * SIZE(Y), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	12 * SIZE(X), %xmm7
	mulps	%xmm11, %xmm12
	movaps	8 * SIZE(Y), %xmm11
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L41
	ALIGN_3
.L42:
	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(Y), %xmm9
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(X), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(Y), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(X), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(Y), %xmm11
	addps	%xmm12, %xmm1

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(Y), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L45:
	testq	$8, N
	jle	.L46

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm10

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-24 * SIZE(X), %xmm6
	movaps	-24 * SIZE(Y), %xmm11

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	-20 * SIZE(X), %xmm7
	movaps	-20 * SIZE(Y), %xmm8

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L46:
	testq	$4, N
	jle	.L47

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm10

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm10, %xmm8

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L47:
	testq	$2, N
	jle	.L48

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm9, %xmm8
	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L48:
	testq	$1, N
	jle	.L49

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
	movss	-32 * SIZE(Y), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1
	ALIGN_3

.L49:
	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
	jmp	.L98
	ALIGN_3
#endif
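/* Annotation (added): .L50 takes over when X starts on an odd       */
/* float.  A leading element may be consumed to realign Y; the       */
/* remaining cases seem to mirror .L20/.L40 with the roles of X and  */
/* Y exchanged, and .L70/.L80 handle the layouts where both streams  */
/* are odd.                                                          */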
.L50:
	testq	$SIZE, Y
	jne	.L70

#ifdef ALIGNED_ACCESS

	testq	$2 * SIZE, Y
	je	.L50x

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(X), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(Y), %xmm4

	pshufd	$0xb1, %xmm0, %xmm1
	mulps	%xmm4, %xmm0
	mulps	%xmm4, %xmm1
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y

	decq	N
	ALIGN_3

.L50x:
	testq	$2 * SIZE, X
	jne	.L60

	movaps	-33 * SIZE(X), %xmm8
	addq	$3 * SIZE, X

	shufps	$0xb1, %xmm1, %xmm1

	movq	N, %rax
	sarq	$4, %rax
	jle	.L55

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9
	movaps	-28 * SIZE(Y), %xmm5
	movaps	-28 * SIZE(X), %xmm10
	movaps	-24 * SIZE(Y), %xmm6
	movaps	-24 * SIZE(X), %xmm11
	movaps	-20 * SIZE(Y), %xmm7

	decq	%rax
	jle	.L52
	ALIGN_3

.L51:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	0 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	4 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movaps	0 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movaps	4 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	12 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movaps	8 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L51
	ALIGN_3

.L52:
	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L55:
	testq	$8, N
	jle	.L56

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9
	movaps	-28 * SIZE(Y), %xmm5
	movaps	-28 * SIZE(X), %xmm10

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-24 * SIZE(Y), %xmm6
	movaps	-24 * SIZE(X), %xmm11

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	-20 * SIZE(Y), %xmm7
	movaps	-20 * SIZE(X), %xmm8

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x39, %xmm10, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x39, %xmm11, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3
.L56:
	testq	$4, N
	jle	.L57

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(Y), %xmm5
	movaps	-28 * SIZE(X), %xmm10

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x39, %xmm9, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm10, %xmm8

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L57:
	testq	$2, N
	jle	.L58

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm9, %xmm8
	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L58:
	testq	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(Y), %xmm4

	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x39, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3
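/* Annotation (added): .L60 mirrors .L40 with X and Y swapped: X is  */
/* offset by three floats and respliced with movss + shufps $0x93    */
/* while Y is read with aligned movaps.                              */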
.L60:
	movaps	-35 * SIZE(X), %xmm8
	addq	$1 * SIZE, X

	shufps	$0xb1, %xmm1, %xmm1

	movq	N, %rax
	sarq	$4, %rax
	jle	.L65

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9
	movaps	-28 * SIZE(Y), %xmm5
	movaps	-28 * SIZE(X), %xmm10
	movaps	-24 * SIZE(Y), %xmm6
	movaps	-24 * SIZE(X), %xmm11
	movaps	-20 * SIZE(Y), %xmm7

	decq	%rax
	jle	.L62
	ALIGN_3

.L61:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	0 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	4 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movaps	0 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movaps	4 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	12 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movaps	8 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L61
	ALIGN_3

.L62:
	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movaps	-20 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movaps	-16 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movaps	-12 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movaps	-8 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	movaps	-4 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L65:
	testq	$8, N
	jle	.L66

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9
	movaps	-28 * SIZE(Y), %xmm5
	movaps	-28 * SIZE(X), %xmm10

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-24 * SIZE(Y), %xmm6
	movaps	-24 * SIZE(X), %xmm11

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	-20 * SIZE(Y), %xmm7
	movaps	-20 * SIZE(X), %xmm8

	movss	%xmm11, %xmm10
	pshufd	$0xb1, %xmm6, %xmm12
	shufps	$0x93, %xmm11, %xmm10
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0xb1, %xmm7, %xmm12
	shufps	$0x93, %xmm8, %xmm11
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L66:
	testq	$4, N
	jle	.L67

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(Y), %xmm5
	movaps	-28 * SIZE(X), %xmm10

	movss	%xmm10, %xmm9
	pshufd	$0xb1, %xmm5, %xmm12
	shufps	$0x93, %xmm10, %xmm9
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm10, %xmm8

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L67:
	testq	$2, N
	jle	.L68

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm9, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm9, %xmm8
	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L68:
	testq	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(Y), %xmm4
	movss	-32 * SIZE(X), %xmm9

	movss	%xmm9, %xmm8
	pshufd	$0xb1, %xmm4, %xmm12
	shufps	$0x93, %xmm8, %xmm8
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3

#else

	testq	$2 * SIZE, Y
	je	.L50x

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(Y), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4

	pshufd	$0xb1, %xmm0, %xmm1
	mulps	%xmm4, %xmm0
	mulps	%xmm4, %xmm1
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y

	decq	N
	ALIGN_3

.L50x:
	movq	N, %rax
	sarq	$4, %rax
	jle	.L55

	movaps	-32 * SIZE(Y), %xmm4
	movlps	-32 * SIZE(X), %xmm8
	movhps	-30 * SIZE(X), %xmm8
	movaps	-28 * SIZE(Y), %xmm5
	movlps	-28 * SIZE(X), %xmm9
	movhps	-26 * SIZE(X), %xmm9

	movaps	-24 * SIZE(Y), %xmm6
	movlps	-24 * SIZE(X), %xmm10
	movhps	-22 * SIZE(X), %xmm10
	movaps	-20 * SIZE(Y), %xmm7
	movlps	-20 * SIZE(X), %xmm11
	movhps	-18 * SIZE(X), %xmm11

	decq	%rax
	jle	.L52
	ALIGN_3
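/* Annotation (added): without ALIGNED_ACCESS the kernel falls back  */
/* to this simpler path: Y is kept 16-byte aligned and X is fetched  */
/* with movlps/movhps half-loads, so no register splicing is needed. */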
.L51:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movlps	-16 * SIZE(X), %xmm8
	movhps	-14 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm5, %xmm12
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movlps	-12 * SIZE(X), %xmm9
	movhps	-10 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm6, %xmm12
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movlps	-8 * SIZE(X), %xmm10
	movhps	-6 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm7, %xmm12
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movlps	-4 * SIZE(X), %xmm11
	movhps	-2 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	0 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movlps	0 * SIZE(X), %xmm8
	movhps	2 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm5, %xmm12
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	4 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movlps	4 * SIZE(X), %xmm9
	movhps	6 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm6, %xmm12
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movlps	8 * SIZE(X), %xmm10
	movhps	10 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm7, %xmm12
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	12 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movlps	12 * SIZE(X), %xmm11
	movhps	14 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L51
	ALIGN_3

.L52:
	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm8, %xmm12
	movlps	-16 * SIZE(X), %xmm8
	movhps	-14 * SIZE(X), %xmm8
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm5, %xmm12
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm9, %xmm12
	movlps	-12 * SIZE(X), %xmm9
	movhps	-10 * SIZE(X), %xmm9
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm6, %xmm12
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm10, %xmm12
	movlps	-8 * SIZE(X), %xmm10
	movhps	-6 * SIZE(X), %xmm10
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm7, %xmm12
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm11, %xmm12
	movlps	-4 * SIZE(X), %xmm11
	movhps	-2 * SIZE(X), %xmm11
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm5, %xmm12
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm6, %xmm12
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm7, %xmm12
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L55:
	testq	$8, N
	jle	.L56

	movaps	-32 * SIZE(Y), %xmm4
	movlps	-32 * SIZE(X), %xmm8
	movhps	-30 * SIZE(X), %xmm8

	movaps	-28 * SIZE(Y), %xmm5
	movlps	-28 * SIZE(X), %xmm9
	movhps	-26 * SIZE(X), %xmm9

	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-24 * SIZE(Y), %xmm6
	movlps	-24 * SIZE(X), %xmm10
	movhps	-22 * SIZE(X), %xmm10

	pshufd	$0xb1, %xmm5, %xmm12
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	movaps	-20 * SIZE(Y), %xmm7
	movlps	-20 * SIZE(X), %xmm11
	movhps	-18 * SIZE(X), %xmm11

	pshufd	$0xb1, %xmm6, %xmm12
	mulps	%xmm10, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm10, %xmm12
	addps	%xmm12, %xmm1

	pshufd	$0xb1, %xmm7, %xmm12
	mulps	%xmm11, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm11, %xmm12
	addps	%xmm12, %xmm1

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L56:
	testq	$4, N
	jle	.L57

	movaps	-32 * SIZE(Y), %xmm4
	movlps	-32 * SIZE(X), %xmm8
	movhps	-30 * SIZE(X), %xmm8

	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	-28 * SIZE(Y), %xmm5
	movlps	-28 * SIZE(X), %xmm9
	movhps	-26 * SIZE(X), %xmm9

	pshufd	$0xb1, %xmm5, %xmm12
	mulps	%xmm9, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm9, %xmm12
	addps	%xmm12, %xmm1

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L57:
	testq	$2, N
	jle	.L58

	movaps	-32 * SIZE(Y), %xmm4
	movlps	-32 * SIZE(X), %xmm8
	movhps	-30 * SIZE(X), %xmm8

	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1

	movaps	%xmm9, %xmm8
	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L58:
	testq	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(Y), %xmm4
#ifdef movsd
	xorps	%xmm8, %xmm8
#endif
	movsd	-32 * SIZE(X), %xmm8

	pshufd	$0xb1, %xmm4, %xmm12
	mulps	%xmm8, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm8, %xmm12
	addps	%xmm12, %xmm1
	jmp	.L98
	ALIGN_3
#endif
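/* Annotation (added): .L70 handles both X and Y starting on an odd  */
/* float.  Both streams are read with aligned movaps and merged via  */
/* movss; pshufd $0x1b forms the swapped products, and .L79 rotates  */
/* all four accumulators back with shufps $0x39.                     */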
.L70:
	testq	$2 * SIZE, Y
	je	.L70x

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
	addq	$2 * SIZE, X
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(Y), %xmm1
	addq	$2 * SIZE, Y

	pshufd	$0xb1, %xmm1, %xmm0
	shufps	$0xb1, %xmm4, %xmm4

	mulps	%xmm4, %xmm0
	mulps	%xmm4, %xmm1
	decq	N
	ALIGN_3

.L70x:
	testq	$2 * SIZE, X
	jne	.L80

	movaps	-33 * SIZE(X), %xmm4
	addq	$3 * SIZE, X
	movaps	-33 * SIZE(Y), %xmm8
	addq	$3 * SIZE, Y

	movq	N, %rax
	sarq	$4, %rax
	jle	.L75

	movaps	-32 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm9
	movaps	-28 * SIZE(X), %xmm6
	movaps	-28 * SIZE(Y), %xmm10
	movaps	-24 * SIZE(X), %xmm7
	movaps	-24 * SIZE(Y), %xmm11

	decq	%rax
	jle	.L72
	ALIGN_3

.L71:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0x1b, %xmm8, %xmm12
	movss	%xmm5, %xmm4
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movaps	-20 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-20 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0x1b, %xmm9, %xmm12
	movss	%xmm6, %xmm5
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movaps	-16 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	-16 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0x1b, %xmm10, %xmm12
	movss	%xmm7, %xmm6
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movaps	-12 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	-12 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0x1b, %xmm11, %xmm12
	movss	%xmm4, %xmm7
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movaps	-8 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	-8 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm9, %xmm8
	pshufd	$0x1b, %xmm8, %xmm12
	movss	%xmm5, %xmm4
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movaps	-4 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-4 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0x1b, %xmm9, %xmm12
	movss	%xmm6, %xmm5
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movaps	0 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	0 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm11, %xmm10
	pshufd	$0x1b, %xmm10, %xmm12
	movss	%xmm7, %xmm6
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movaps	4 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	4 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0x1b, %xmm11, %xmm12
	movss	%xmm4, %xmm7
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movaps	8 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	8 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	decq	%rax
	jg	.L71
	ALIGN_3

.L72:
	movss	%xmm9, %xmm8
	pshufd	$0x1b, %xmm8, %xmm12
	movss	%xmm5, %xmm4
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movaps	-20 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-20 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0x1b, %xmm9, %xmm12
	movss	%xmm6, %xmm5
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	movaps	-16 * SIZE(Y), %xmm9
	mulps	%xmm5, %xmm12
	movaps	-16 * SIZE(X), %xmm5
	addps	%xmm12, %xmm3

	movss	%xmm11, %xmm10
	pshufd	$0x1b, %xmm10, %xmm12
	movss	%xmm7, %xmm6
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	movaps	-12 * SIZE(Y), %xmm10
	mulps	%xmm6, %xmm12
	movaps	-12 * SIZE(X), %xmm6
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0x1b, %xmm11, %xmm12
	movss	%xmm4, %xmm7
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	movaps	-8 * SIZE(Y), %xmm11
	mulps	%xmm7, %xmm12
	movaps	-8 * SIZE(X), %xmm7
	addps	%xmm12, %xmm3

	movss	%xmm9, %xmm8
	pshufd	$0x1b, %xmm8, %xmm12
	movss	%xmm5, %xmm4
	mulps	%xmm4, %xmm8
	addps	%xmm8, %xmm0
	movaps	-4 * SIZE(Y), %xmm8
	mulps	%xmm4, %xmm12
	movaps	-4 * SIZE(X), %xmm4
	addps	%xmm12, %xmm1

	movss	%xmm10, %xmm9
	pshufd	$0x1b, %xmm9, %xmm12
	movss	%xmm6, %xmm5
	mulps	%xmm5, %xmm9
	addps	%xmm9, %xmm2
	mulps	%xmm5, %xmm12
	addps	%xmm12, %xmm3

	movss	%xmm11, %xmm10
	pshufd	$0x1b, %xmm10, %xmm12
	movss	%xmm7, %xmm6
	mulps	%xmm6, %xmm10
	addps	%xmm10, %xmm0
	mulps	%xmm6, %xmm12
	addps	%xmm12, %xmm1

	movss	%xmm8, %xmm11
	pshufd	$0x1b, %xmm11, %xmm12
	movss	%xmm4, %xmm7
	mulps	%xmm7, %xmm11
	addps	%xmm11, %xmm2
	mulps	%xmm7, %xmm12
	addps	%xmm12, %xmm3
%xmm3 2584 2585 subq $-32 * SIZE, X 2586 subq $-32 * SIZE, Y 2587 ALIGN_3 2588 2589.L75: 2590 testq $8, N 2591 jle .L76 2592 2593 movaps -32 * SIZE(X), %xmm5 2594 movaps -32 * SIZE(Y), %xmm9 2595 2596 movss %xmm9, %xmm8 2597 pshufd $0x1b, %xmm8, %xmm12 2598 movss %xmm5, %xmm4 2599 mulps %xmm4, %xmm8 2600 addps %xmm8, %xmm0 2601 mulps %xmm4, %xmm12 2602 addps %xmm12, %xmm1 2603 2604 movaps -28 * SIZE(X), %xmm6 2605 movaps -28 * SIZE(Y), %xmm10 2606 2607 movss %xmm10, %xmm9 2608 pshufd $0x1b, %xmm9, %xmm12 2609 movss %xmm6, %xmm5 2610 mulps %xmm5, %xmm9 2611 addps %xmm9, %xmm2 2612 mulps %xmm5, %xmm12 2613 addps %xmm12, %xmm3 2614 2615 movaps -24 * SIZE(X), %xmm7 2616 movaps -24 * SIZE(Y), %xmm11 2617 2618 movss %xmm11, %xmm10 2619 pshufd $0x1b, %xmm10, %xmm12 2620 movss %xmm7, %xmm6 2621 mulps %xmm6, %xmm10 2622 addps %xmm10, %xmm0 2623 mulps %xmm6, %xmm12 2624 addps %xmm12, %xmm1 2625 2626 movaps -20 * SIZE(X), %xmm4 2627 movaps -20 * SIZE(Y), %xmm8 2628 2629 movss %xmm8, %xmm11 2630 pshufd $0x1b, %xmm11, %xmm12 2631 movss %xmm4, %xmm7 2632 mulps %xmm7, %xmm11 2633 addps %xmm11, %xmm2 2634 mulps %xmm7, %xmm12 2635 addps %xmm12, %xmm3 2636 2637 addq $16 * SIZE, X 2638 addq $16 * SIZE, Y 2639 ALIGN_3 2640 2641.L76: 2642 testq $4, N 2643 jle .L77 2644 2645 movaps -32 * SIZE(X), %xmm5 2646 movaps -32 * SIZE(Y), %xmm9 2647 movaps -28 * SIZE(X), %xmm6 2648 movaps -28 * SIZE(Y), %xmm10 2649 2650 movss %xmm9, %xmm8 2651 pshufd $0x1b, %xmm8, %xmm12 2652 movss %xmm5, %xmm4 2653 mulps %xmm4, %xmm8 2654 addps %xmm8, %xmm0 2655 mulps %xmm4, %xmm12 2656 addps %xmm12, %xmm1 2657 2658 movss %xmm10, %xmm9 2659 pshufd $0x1b, %xmm9, %xmm12 2660 movss %xmm6, %xmm5 2661 mulps %xmm5, %xmm9 2662 addps %xmm9, %xmm2 2663 mulps %xmm5, %xmm12 2664 addps %xmm12, %xmm3 2665 2666 movaps %xmm6, %xmm4 2667 movaps %xmm10, %xmm8 2668 2669 addq $8 * SIZE, X 2670 addq $8 * SIZE, Y 2671 ALIGN_3 2672 2673.L77: 2674 testq $2, N 2675 jle .L78 2676 2677 movaps -32 * SIZE(X), %xmm5 2678 movaps -32 * SIZE(Y), %xmm9 2679 2680 movss %xmm9, %xmm8 2681 pshufd $0x1b, %xmm8, %xmm12 2682 movss %xmm5, %xmm4 2683 mulps %xmm4, %xmm8 2684 addps %xmm8, %xmm0 2685 mulps %xmm4, %xmm12 2686 addps %xmm12, %xmm1 2687 2688 movaps %xmm5, %xmm4 2689 movaps %xmm9, %xmm8 2690 ALIGN_3 2691 2692.L78: 2693 testq $1, N 2694 jle .L79 2695 2696 xorps %xmm5, %xmm5 2697 movss %xmm5, %xmm4 2698 movss %xmm5, %xmm8 2699 2700 shufps $0x24, %xmm4, %xmm4 2701 pshufd $0x18, %xmm8, %xmm12 2702 shufps $0x24, %xmm8, %xmm8 2703 2704 mulps %xmm4, %xmm8 2705 addps %xmm8, %xmm0 2706 mulps %xmm4, %xmm12 2707 addps %xmm12, %xmm1 2708 ALIGN_3 2709 2710.L79: 2711 shufps $0x39, %xmm0, %xmm0 2712 shufps $0x39, %xmm1, %xmm1 2713 shufps $0x39, %xmm2, %xmm2 2714 shufps $0x39, %xmm3, %xmm3 2715 jmp .L98 2716 ALIGN_3 2717 2718.L80: 2719 movsd -33 * SIZE(X), %xmm4 2720 movhps -31 * SIZE(X), %xmm4 2721 addq $3 * SIZE, X 2722 movaps -33 * SIZE(Y), %xmm8 2723 addq $3 * SIZE, Y 2724 2725 movq N, %rax 2726 sarq $4, %rax 2727 jle .L85 2728 2729 movsd -32 * SIZE(X), %xmm5 2730 movhps -30 * SIZE(X), %xmm5 2731 movaps -32 * SIZE(Y), %xmm9 2732 2733 movsd -28 * SIZE(X), %xmm6 2734 movhps -26 * SIZE(X), %xmm6 2735 movaps -28 * SIZE(Y), %xmm10 2736 2737 movsd -24 * SIZE(X), %xmm7 2738 movhps -22 * SIZE(X), %xmm7 2739 movaps -24 * SIZE(Y), %xmm11 2740 2741 decq %rax 2742 jle .L82 2743 ALIGN_3 2744 2745.L81: 2746#ifdef PREFETCH 2747 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) 2748#endif 2749 2750 movss %xmm9, %xmm8 2751 pshufd $0x1b, %xmm8, %xmm12 2752 movss %xmm5, %xmm4 2753 mulps %xmm4, %xmm8 2754 
addps %xmm8, %xmm0 2755 movaps -20 * SIZE(Y), %xmm8 2756 mulps %xmm4, %xmm12 2757 movsd -20 * SIZE(X), %xmm4 2758 movhps -18 * SIZE(X), %xmm4 2759 addps %xmm12, %xmm1 2760 2761 movss %xmm10, %xmm9 2762 pshufd $0x1b, %xmm9, %xmm12 2763 movss %xmm6, %xmm5 2764 mulps %xmm5, %xmm9 2765 addps %xmm9, %xmm2 2766 movaps -16 * SIZE(Y), %xmm9 2767 mulps %xmm5, %xmm12 2768 movsd -16 * SIZE(X), %xmm5 2769 movhps -14 * SIZE(X), %xmm5 2770 addps %xmm12, %xmm3 2771 2772#ifdef PREFETCH 2773 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) 2774#endif 2775 2776 movss %xmm11, %xmm10 2777 pshufd $0x1b, %xmm10, %xmm12 2778 movss %xmm7, %xmm6 2779 mulps %xmm6, %xmm10 2780 addps %xmm10, %xmm0 2781 movaps -12 * SIZE(Y), %xmm10 2782 mulps %xmm6, %xmm12 2783 movsd -12 * SIZE(X), %xmm6 2784 movhps -10 * SIZE(X), %xmm6 2785 addps %xmm12, %xmm1 2786 2787 movss %xmm8, %xmm11 2788 pshufd $0x1b, %xmm11, %xmm12 2789 movss %xmm4, %xmm7 2790 mulps %xmm7, %xmm11 2791 addps %xmm11, %xmm2 2792 movaps -8 * SIZE(Y), %xmm11 2793 mulps %xmm7, %xmm12 2794 movsd -8 * SIZE(X), %xmm7 2795 movhps -6 * SIZE(X), %xmm7 2796 addps %xmm12, %xmm3 2797 2798#if defined(PREFETCH) && !defined(FETCH128) 2799 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) 2800#endif 2801 2802 movss %xmm9, %xmm8 2803 pshufd $0x1b, %xmm8, %xmm12 2804 movss %xmm5, %xmm4 2805 mulps %xmm4, %xmm8 2806 addps %xmm8, %xmm0 2807 movaps -4 * SIZE(Y), %xmm8 2808 mulps %xmm4, %xmm12 2809 movsd -4 * SIZE(X), %xmm4 2810 movhps -2 * SIZE(X), %xmm4 2811 addps %xmm12, %xmm1 2812 2813 movss %xmm10, %xmm9 2814 pshufd $0x1b, %xmm9, %xmm12 2815 movss %xmm6, %xmm5 2816 mulps %xmm5, %xmm9 2817 addps %xmm9, %xmm2 2818 movaps 0 * SIZE(Y), %xmm9 2819 mulps %xmm5, %xmm12 2820 movsd 0 * SIZE(X), %xmm5 2821 movhps 2 * SIZE(X), %xmm5 2822 addps %xmm12, %xmm3 2823 2824#if defined(PREFETCH) && !defined(FETCH128) 2825 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) 2826#endif 2827 2828 movss %xmm11, %xmm10 2829 pshufd $0x1b, %xmm10, %xmm12 2830 movss %xmm7, %xmm6 2831 mulps %xmm6, %xmm10 2832 addps %xmm10, %xmm0 2833 movaps 4 * SIZE(Y), %xmm10 2834 mulps %xmm6, %xmm12 2835 movsd 4 * SIZE(X), %xmm6 2836 movhps 6 * SIZE(X), %xmm6 2837 addps %xmm12, %xmm1 2838 2839 movss %xmm8, %xmm11 2840 pshufd $0x1b, %xmm11, %xmm12 2841 movss %xmm4, %xmm7 2842 mulps %xmm7, %xmm11 2843 addps %xmm11, %xmm2 2844 movaps 8 * SIZE(Y), %xmm11 2845 mulps %xmm7, %xmm12 2846 movsd 8 * SIZE(X), %xmm7 2847 movhps 10 * SIZE(X), %xmm7 2848 addps %xmm12, %xmm3 2849 2850 subq $-32 * SIZE, X 2851 subq $-32 * SIZE, Y 2852 2853 decq %rax 2854 jg .L81 2855 ALIGN_3 2856 2857.L82: 2858 movss %xmm9, %xmm8 2859 pshufd $0x1b, %xmm8, %xmm12 2860 movss %xmm5, %xmm4 2861 mulps %xmm4, %xmm8 2862 addps %xmm8, %xmm0 2863 movaps -20 * SIZE(Y), %xmm8 2864 mulps %xmm4, %xmm12 2865 movsd -20 * SIZE(X), %xmm4 2866 movhps -18 * SIZE(X), %xmm4 2867 addps %xmm12, %xmm1 2868 2869 movss %xmm10, %xmm9 2870 pshufd $0x1b, %xmm9, %xmm12 2871 movss %xmm6, %xmm5 2872 mulps %xmm5, %xmm9 2873 addps %xmm9, %xmm2 2874 movaps -16 * SIZE(Y), %xmm9 2875 mulps %xmm5, %xmm12 2876 movsd -16 * SIZE(X), %xmm5 2877 movhps -14 * SIZE(X), %xmm5 2878 addps %xmm12, %xmm3 2879 2880 movss %xmm11, %xmm10 2881 pshufd $0x1b, %xmm10, %xmm12 2882 movss %xmm7, %xmm6 2883 mulps %xmm6, %xmm10 2884 addps %xmm10, %xmm0 2885 movaps -12 * SIZE(Y), %xmm10 2886 mulps %xmm6, %xmm12 2887 movsd -12 * SIZE(X), %xmm6 2888 movhps -10 * SIZE(X), %xmm6 2889 addps %xmm12, %xmm1 2890 2891 movss %xmm8, %xmm11 2892 pshufd $0x1b, %xmm11, %xmm12 2893 movss %xmm4, %xmm7 2894 mulps %xmm7, %xmm11 2895 addps %xmm11, %xmm2 
2896 movaps -8 * SIZE(Y), %xmm11 2897 mulps %xmm7, %xmm12 2898 movsd -8 * SIZE(X), %xmm7 2899 movhps -6 * SIZE(X), %xmm7 2900 addps %xmm12, %xmm3 2901 2902 movss %xmm9, %xmm8 2903 pshufd $0x1b, %xmm8, %xmm12 2904 movss %xmm5, %xmm4 2905 mulps %xmm4, %xmm8 2906 addps %xmm8, %xmm0 2907 movaps -4 * SIZE(Y), %xmm8 2908 mulps %xmm4, %xmm12 2909 movsd -4 * SIZE(X), %xmm4 2910 movhps -2 * SIZE(X), %xmm4 2911 addps %xmm12, %xmm1 2912 2913 movss %xmm10, %xmm9 2914 pshufd $0x1b, %xmm9, %xmm12 2915 movss %xmm6, %xmm5 2916 mulps %xmm5, %xmm9 2917 addps %xmm9, %xmm2 2918 mulps %xmm5, %xmm12 2919 addps %xmm12, %xmm3 2920 2921 movss %xmm11, %xmm10 2922 pshufd $0x1b, %xmm10, %xmm12 2923 movss %xmm7, %xmm6 2924 mulps %xmm6, %xmm10 2925 addps %xmm10, %xmm0 2926 mulps %xmm6, %xmm12 2927 addps %xmm12, %xmm1 2928 2929 movss %xmm8, %xmm11 2930 pshufd $0x1b, %xmm11, %xmm12 2931 movss %xmm4, %xmm7 2932 mulps %xmm7, %xmm11 2933 addps %xmm11, %xmm2 2934 mulps %xmm7, %xmm12 2935 addps %xmm12, %xmm3 2936 2937 subq $-32 * SIZE, X 2938 subq $-32 * SIZE, Y 2939 ALIGN_3 2940 2941.L85: 2942 testq $8, N 2943 jle .L86 2944 2945 movsd -32 * SIZE(X), %xmm5 2946 movhps -30 * SIZE(X), %xmm5 2947 movaps -32 * SIZE(Y), %xmm9 2948 2949 movss %xmm9, %xmm8 2950 pshufd $0x1b, %xmm8, %xmm12 2951 movss %xmm5, %xmm4 2952 mulps %xmm4, %xmm8 2953 addps %xmm8, %xmm0 2954 mulps %xmm4, %xmm12 2955 addps %xmm12, %xmm1 2956 2957 movsd -28 * SIZE(X), %xmm6 2958 movhps -26 * SIZE(X), %xmm6 2959 movaps -28 * SIZE(Y), %xmm10 2960 2961 movss %xmm10, %xmm9 2962 pshufd $0x1b, %xmm9, %xmm12 2963 movss %xmm6, %xmm5 2964 mulps %xmm5, %xmm9 2965 addps %xmm9, %xmm2 2966 mulps %xmm5, %xmm12 2967 addps %xmm12, %xmm3 2968 2969 movsd -24 * SIZE(X), %xmm7 2970 movhps -22 * SIZE(X), %xmm7 2971 movaps -24 * SIZE(Y), %xmm11 2972 2973 movss %xmm11, %xmm10 2974 pshufd $0x1b, %xmm10, %xmm12 2975 movss %xmm7, %xmm6 2976 mulps %xmm6, %xmm10 2977 addps %xmm10, %xmm0 2978 mulps %xmm6, %xmm12 2979 addps %xmm12, %xmm1 2980 2981 movsd -20 * SIZE(X), %xmm4 2982 movhps -18 * SIZE(X), %xmm4 2983 movaps -20 * SIZE(Y), %xmm8 2984 2985 movss %xmm8, %xmm11 2986 pshufd $0x1b, %xmm11, %xmm12 2987 movss %xmm4, %xmm7 2988 mulps %xmm7, %xmm11 2989 addps %xmm11, %xmm2 2990 mulps %xmm7, %xmm12 2991 addps %xmm12, %xmm3 2992 2993 addq $16 * SIZE, X 2994 addq $16 * SIZE, Y 2995 ALIGN_3 2996 2997.L86: 2998 testq $4, N 2999 jle .L87 3000 3001 movsd -32 * SIZE(X), %xmm5 3002 movhps -30 * SIZE(X), %xmm5 3003 movaps -32 * SIZE(Y), %xmm9 3004 3005 movss %xmm9, %xmm8 3006 pshufd $0x1b, %xmm8, %xmm12 3007 movss %xmm5, %xmm4 3008 mulps %xmm4, %xmm8 3009 addps %xmm8, %xmm0 3010 mulps %xmm4, %xmm12 3011 addps %xmm12, %xmm1 3012 3013 movsd -28 * SIZE(X), %xmm6 3014 movhps -26 * SIZE(X), %xmm6 3015 movaps -28 * SIZE(Y), %xmm10 3016 3017 movss %xmm10, %xmm9 3018 pshufd $0x1b, %xmm9, %xmm12 3019 movss %xmm6, %xmm5 3020 mulps %xmm5, %xmm9 3021 addps %xmm9, %xmm2 3022 mulps %xmm5, %xmm12 3023 addps %xmm12, %xmm3 3024 3025 movaps %xmm6, %xmm4 3026 movaps %xmm10, %xmm8 3027 3028 addq $8 * SIZE, X 3029 addq $8 * SIZE, Y 3030 ALIGN_3 3031 3032.L87: 3033 testq $2, N 3034 jle .L88 3035 3036 movsd -32 * SIZE(X), %xmm5 3037 movhps -30 * SIZE(X), %xmm5 3038 movaps -32 * SIZE(Y), %xmm9 3039 3040 movss %xmm9, %xmm8 3041 pshufd $0x1b, %xmm8, %xmm12 3042 movss %xmm5, %xmm4 3043 mulps %xmm4, %xmm8 3044 addps %xmm8, %xmm0 3045 mulps %xmm4, %xmm12 3046 addps %xmm12, %xmm1 3047 3048 movaps %xmm5, %xmm4 3049 movaps %xmm9, %xmm8 3050 ALIGN_3 3051 3052.L88: 3053 testq $1, N 3054 jle .L89 3055 3056 xorps %xmm5, %xmm5 3057 
movss %xmm5, %xmm4 3058 movss %xmm5, %xmm8 3059 3060 shufps $0x24, %xmm4, %xmm4 3061 pshufd $0x18, %xmm8, %xmm12 3062 shufps $0x24, %xmm8, %xmm8 3063 3064 mulps %xmm4, %xmm8 3065 addps %xmm8, %xmm0 3066 mulps %xmm4, %xmm12 3067 addps %xmm12, %xmm1 3068 ALIGN_3 3069 3070.L89: 3071 shufps $0x39, %xmm0, %xmm0 3072 shufps $0x39, %xmm1, %xmm1 3073 shufps $0x39, %xmm2, %xmm2 3074 shufps $0x39, %xmm3, %xmm3 3075 jmp .L98 3076 ALIGN_3 3077 3078.L200: 3079 movq N, %rax 3080 sarq $4, %rax 3081 jle .L205 3082 3083 movsd (X), %xmm4 3084 addq INCX, X 3085 movhps (X), %xmm4 3086 addq INCX, X 3087 movsd (Y), %xmm8 3088 addq INCY, Y 3089 movhps (Y), %xmm8 3090 addq INCY, Y 3091 3092 movsd (X), %xmm5 3093 addq INCX, X 3094 movhps (X), %xmm5 3095 addq INCX, X 3096 movsd (Y), %xmm9 3097 addq INCY, Y 3098 movhps (Y), %xmm9 3099 addq INCY, Y 3100 3101 movsd (X), %xmm6 3102 addq INCX, X 3103 movhps (X), %xmm6 3104 addq INCX, X 3105 movsd (Y), %xmm10 3106 addq INCY, Y 3107 movhps (Y), %xmm10 3108 addq INCY, Y 3109 3110 movsd (X), %xmm7 3111 addq INCX, X 3112 movhps (X), %xmm7 3113 addq INCX, X 3114 movsd (Y), %xmm11 3115 addq INCY, Y 3116 movhps (Y), %xmm11 3117 addq INCY, Y 3118 3119 decq %rax 3120 jle .L204 3121 ALIGN_3 3122 3123.L203: 3124 pshufd $0xb1, %xmm8, %xmm12 3125 mulps %xmm4, %xmm8 3126 addps %xmm8, %xmm0 3127 movsd (Y), %xmm8 3128 addq INCY, Y 3129 movhps (Y), %xmm8 3130 addq INCY, Y 3131 mulps %xmm4, %xmm12 3132 movsd (X), %xmm4 3133 addq INCX, X 3134 movhps (X), %xmm4 3135 addq INCX, X 3136 addps %xmm12, %xmm1 3137 3138 pshufd $0xb1, %xmm9, %xmm12 3139 mulps %xmm5, %xmm9 3140 addps %xmm9, %xmm2 3141 movsd (Y), %xmm9 3142 addq INCY, Y 3143 movhps (Y), %xmm9 3144 addq INCY, Y 3145 mulps %xmm5, %xmm12 3146 movsd (X), %xmm5 3147 addq INCX, X 3148 movhps (X), %xmm5 3149 addq INCX, X 3150 addps %xmm12, %xmm3 3151 3152 pshufd $0xb1, %xmm10, %xmm12 3153 mulps %xmm6, %xmm10 3154 addps %xmm10, %xmm0 3155 movsd (Y), %xmm10 3156 addq INCY, Y 3157 movhps (Y), %xmm10 3158 addq INCY, Y 3159 mulps %xmm6, %xmm12 3160 movsd (X), %xmm6 3161 addq INCX, X 3162 movhps (X), %xmm6 3163 addq INCX, X 3164 addps %xmm12, %xmm1 3165 3166 pshufd $0xb1, %xmm11, %xmm12 3167 mulps %xmm7, %xmm11 3168 addps %xmm11, %xmm2 3169 movsd (Y), %xmm11 3170 addq INCY, Y 3171 movhps (Y), %xmm11 3172 addq INCY, Y 3173 mulps %xmm7, %xmm12 3174 movsd (X), %xmm7 3175 addq INCX, X 3176 movhps (X), %xmm7 3177 addq INCX, X 3178 addps %xmm12, %xmm3 3179 3180 pshufd $0xb1, %xmm8, %xmm12 3181 mulps %xmm4, %xmm8 3182 addps %xmm8, %xmm0 3183 movsd (Y), %xmm8 3184 addq INCY, Y 3185 movhps (Y), %xmm8 3186 addq INCY, Y 3187 mulps %xmm4, %xmm12 3188 movsd (X), %xmm4 3189 addq INCX, X 3190 movhps (X), %xmm4 3191 addq INCX, X 3192 addps %xmm12, %xmm1 3193 3194 pshufd $0xb1, %xmm9, %xmm12 3195 mulps %xmm5, %xmm9 3196 addps %xmm9, %xmm2 3197 movsd (Y), %xmm9 3198 addq INCY, Y 3199 movhps (Y), %xmm9 3200 addq INCY, Y 3201 mulps %xmm5, %xmm12 3202 movsd (X), %xmm5 3203 addq INCX, X 3204 movhps (X), %xmm5 3205 addq INCX, X 3206 addps %xmm12, %xmm3 3207 3208 pshufd $0xb1, %xmm10, %xmm12 3209 mulps %xmm6, %xmm10 3210 addps %xmm10, %xmm0 3211 movsd (Y), %xmm10 3212 addq INCY, Y 3213 movhps (Y), %xmm10 3214 addq INCY, Y 3215 mulps %xmm6, %xmm12 3216 movsd (X), %xmm6 3217 addq INCX, X 3218 movhps (X), %xmm6 3219 addq INCX, X 3220 addps %xmm12, %xmm1 3221 3222 pshufd $0xb1, %xmm11, %xmm12 3223 mulps %xmm7, %xmm11 3224 addps %xmm11, %xmm2 3225 movsd (Y), %xmm11 3226 addq INCY, Y 3227 movhps (Y), %xmm11 3228 addq INCY, Y 3229 3230 mulps %xmm7, %xmm12 3231 movsd (X), %xmm7 
3232 addq INCX, X 3233 movhps (X), %xmm7 3234 addq INCX, X 3235 addps %xmm12, %xmm3 3236 3237 decq %rax 3238 jg .L203 3239 ALIGN_3 3240 3241.L204: 3242 pshufd $0xb1, %xmm8, %xmm12 3243 mulps %xmm4, %xmm8 3244 addps %xmm8, %xmm0 3245 movsd (Y), %xmm8 3246 addq INCY, Y 3247 movhps (Y), %xmm8 3248 addq INCY, Y 3249 mulps %xmm4, %xmm12 3250 movsd (X), %xmm4 3251 addq INCX, X 3252 movhps (X), %xmm4 3253 addq INCX, X 3254 addps %xmm12, %xmm1 3255 3256 pshufd $0xb1, %xmm9, %xmm12 3257 mulps %xmm5, %xmm9 3258 addps %xmm9, %xmm2 3259 movsd (Y), %xmm9 3260 addq INCY, Y 3261 movhps (Y), %xmm9 3262 addq INCY, Y 3263 mulps %xmm5, %xmm12 3264 movsd (X), %xmm5 3265 addq INCX, X 3266 movhps (X), %xmm5 3267 addq INCX, X 3268 addps %xmm12, %xmm3 3269 3270 pshufd $0xb1, %xmm10, %xmm12 3271 mulps %xmm6, %xmm10 3272 addps %xmm10, %xmm0 3273 movsd (Y), %xmm10 3274 addq INCY, Y 3275 movhps (Y), %xmm10 3276 addq INCY, Y 3277 mulps %xmm6, %xmm12 3278 movsd (X), %xmm6 3279 addq INCX, X 3280 movhps (X), %xmm6 3281 addq INCX, X 3282 addps %xmm12, %xmm1 3283 3284 pshufd $0xb1, %xmm11, %xmm12 3285 mulps %xmm7, %xmm11 3286 addps %xmm11, %xmm2 3287 movsd (Y), %xmm11 3288 addq INCY, Y 3289 movhps (Y), %xmm11 3290 addq INCY, Y 3291 mulps %xmm7, %xmm12 3292 movsd (X), %xmm7 3293 addq INCX, X 3294 movhps (X), %xmm7 3295 addq INCX, X 3296 addps %xmm12, %xmm3 3297 3298 pshufd $0xb1, %xmm8, %xmm12 3299 mulps %xmm4, %xmm8 3300 addps %xmm8, %xmm0 3301 mulps %xmm4, %xmm12 3302 addps %xmm12, %xmm1 3303 3304 pshufd $0xb1, %xmm9, %xmm12 3305 mulps %xmm5, %xmm9 3306 addps %xmm9, %xmm2 3307 mulps %xmm5, %xmm12 3308 addps %xmm12, %xmm3 3309 3310 pshufd $0xb1, %xmm10, %xmm12 3311 mulps %xmm6, %xmm10 3312 addps %xmm10, %xmm0 3313 mulps %xmm6, %xmm12 3314 addps %xmm12, %xmm1 3315 3316 pshufd $0xb1, %xmm11, %xmm12 3317 mulps %xmm7, %xmm11 3318 addps %xmm11, %xmm2 3319 mulps %xmm7, %xmm12 3320 addps %xmm12, %xmm3 3321 ALIGN_3 3322 3323.L205: 3324 testq $8, N 3325 jle .L206 3326 3327 movsd (X), %xmm4 3328 addq INCX, X 3329 movhps (X), %xmm4 3330 addq INCX, X 3331 movsd (Y), %xmm8 3332 addq INCY, Y 3333 movhps (Y), %xmm8 3334 addq INCY, Y 3335 3336 pshufd $0xb1, %xmm8, %xmm12 3337 mulps %xmm4, %xmm8 3338 addps %xmm8, %xmm0 3339 mulps %xmm4, %xmm12 3340 addps %xmm12, %xmm1 3341 3342 movsd (X), %xmm5 3343 addq INCX, X 3344 movhps (X), %xmm5 3345 addq INCX, X 3346 movsd (Y), %xmm9 3347 addq INCY, Y 3348 movhps (Y), %xmm9 3349 addq INCY, Y 3350 3351 pshufd $0xb1, %xmm9, %xmm12 3352 mulps %xmm5, %xmm9 3353 addps %xmm9, %xmm2 3354 mulps %xmm5, %xmm12 3355 addps %xmm12, %xmm3 3356 3357 movsd (X), %xmm6 3358 addq INCX, X 3359 movhps (X), %xmm6 3360 addq INCX, X 3361 movsd (Y), %xmm10 3362 addq INCY, Y 3363 movhps (Y), %xmm10 3364 addq INCY, Y 3365 3366 pshufd $0xb1, %xmm10, %xmm12 3367 mulps %xmm6, %xmm10 3368 addps %xmm10, %xmm0 3369 mulps %xmm6, %xmm12 3370 addps %xmm12, %xmm1 3371 3372 movsd (X), %xmm7 3373 addq INCX, X 3374 movhps (X), %xmm7 3375 addq INCX, X 3376 movsd (Y), %xmm11 3377 addq INCY, Y 3378 movhps (Y), %xmm11 3379 addq INCY, Y 3380 3381 pshufd $0xb1, %xmm11, %xmm12 3382 mulps %xmm7, %xmm11 3383 addps %xmm11, %xmm2 3384 mulps %xmm7, %xmm12 3385 addps %xmm12, %xmm3 3386 ALIGN_3 3387 3388.L206: 3389 testq $4, N 3390 jle .L207 3391 3392 movsd (X), %xmm4 3393 addq INCX, X 3394 movhps (X), %xmm4 3395 addq INCX, X 3396 movsd (Y), %xmm8 3397 addq INCY, Y 3398 movhps (Y), %xmm8 3399 addq INCY, Y 3400 3401 pshufd $0xb1, %xmm8, %xmm12 3402 mulps %xmm4, %xmm8 3403 addps %xmm8, %xmm0 3404 mulps %xmm4, %xmm12 3405 addps %xmm12, %xmm1 3406 3407 
movsd (X), %xmm5 3408 addq INCX, X 3409 movhps (X), %xmm5 3410 addq INCX, X 3411 movsd (Y), %xmm9 3412 addq INCY, Y 3413 movhps (Y), %xmm9 3414 addq INCY, Y 3415 3416 pshufd $0xb1, %xmm9, %xmm12 3417 mulps %xmm5, %xmm9 3418 addps %xmm9, %xmm2 3419 mulps %xmm5, %xmm12 3420 addps %xmm12, %xmm3 3421 ALIGN_3 3422 3423.L207: 3424 testq $2, N 3425 jle .L208 3426 3427 movsd (X), %xmm4 3428 addq INCX, X 3429 movhps (X), %xmm4 3430 addq INCX, X 3431 movsd (Y), %xmm8 3432 addq INCY, Y 3433 movhps (Y), %xmm8 3434 addq INCY, Y 3435 3436 pshufd $0xb1, %xmm8, %xmm12 3437 mulps %xmm4, %xmm8 3438 addps %xmm8, %xmm0 3439 mulps %xmm4, %xmm12 3440 addps %xmm12, %xmm1 3441 ALIGN_3 3442 3443.L208: 3444 testq $1, N 3445 jle .L98 3446 3447#ifdef movsd 3448 xorps %xmm4, %xmm4 3449#endif 3450 movsd (X), %xmm4 3451#ifdef movsd 3452 xorps %xmm8, %xmm8 3453#endif 3454 movsd (Y), %xmm8 3455 3456 pshufd $0xb1, %xmm8, %xmm12 3457 mulps %xmm4, %xmm8 3458 addps %xmm8, %xmm0 3459 mulps %xmm4, %xmm12 3460 addps %xmm12, %xmm1 3461 ALIGN_3 3462 3463.L98: 3464 addps %xmm2, %xmm0 3465 addps %xmm3, %xmm1 3466 3467 movhlps %xmm0, %xmm2 3468 movhlps %xmm1, %xmm3 3469 3470 addps %xmm2, %xmm0 3471 addps %xmm3, %xmm1 3472 3473 pshufd $1, %xmm0, %xmm2 3474 pshufd $1, %xmm1, %xmm3 3475 ALIGN_3 3476 3477.L999: 3478#ifndef CONJ 3479 subss %xmm2, %xmm0 3480 addss %xmm3, %xmm1 3481#else 3482 addss %xmm2, %xmm0 3483 subss %xmm3, %xmm1 3484#endif 3485 unpcklps %xmm1, %xmm0 3486 3487#ifdef WINDOWS_ABI 3488 movq %xmm0, %rax 3489#endif 3490 3491 RESTOREREGISTERS 3492 3493 ret 3494 ALIGN_3 3495 3496 EPILOGUE 3497
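
/*
 * Reference sketch (not assembled; documentation only): the scalar loop the
 * kernel above vectorizes.  In the aligned paths, %xmm0/%xmm2 accumulate the
 * lane-wise products of x and y while %xmm1/%xmm3 accumulate x times the
 * pair-swapped y; .L98 folds the four accumulators and .L999 combines the
 * cross terms, leaving the packed (real, imag) result in %xmm0 (also copied
 * to %rax under WINDOWS_ABI).  The function and variable names below are
 * illustrative only, not part of this kernel's interface.
 *
 *   #include <complex.h>
 *
 *   static float _Complex dot_ref(long n, const float *x, long incx,
 *                                 const float *y, long incy) {
 *       float rr = 0, ii = 0, ri = 0, ir = 0;   // partial sums
 *       for (long i = 0; i < n; i++) {
 *           const float *xp = x + 2 * i * incx; // increments in complex units
 *           const float *yp = y + 2 * i * incy;
 *           rr += xp[0] * yp[0];                // re * re
 *           ii += xp[1] * yp[1];                // im * im
 *           ri += xp[0] * yp[1];                // re * im
 *           ir += xp[1] * yp[0];                // im * re
 *       }
 *   #ifndef CONJ
 *       return (rr - ii) + (ri + ir) * I;       // dotu: sum x[i] * y[i]
 *   #else
 *       return (rr + ii) + (ri - ir) * I;       // dotc: sum conj(x[i]) * y[i]
 *   #endif
 *   }
 */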