/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Single-precision plane rotation, applied in place to two vectors.
 *
 * For each of the N element pairs (x, y):
 *
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 *
 * Syntax : AT&T (GAS), run through the C preprocessor.
 * Target : 32-bit x86 with SSE (SSE2 for the 64-bit movsd loads).
 *
 * Arguments are read from the stack relative to %esp, after the three
 * register pushes accounted for by STACK:
 *
 *     STACK_N     element count
 *     STACK_X     pointer to x
 *     STACK_INCX  stride of x, in elements (scaled by SIZE on entry)
 *     STACK_Y     pointer to y
 *     STACK_INCY  stride of y, in elements (scaled by SIZE on entry)
 *     STACK_C     scalar c (loaded with movss)
 *     STACK_S     scalar s (loaded with movss)
 *
 * Register roles:
 *     N (%ebx)     elements still to process
 *     X (%esi)     current x pointer
 *     Y (%edi)     current y pointer
 *     INCX (%ecx)  x stride in bytes
 *     INCY (%edx)  y stride in bytes
 *     I (%eax)     loop counter
 *     C (%xmm6)    c broadcast to all four lanes
 *     S (%xmm7)    s broadcast to all four lanes
 *     %xmm0-%xmm3  scratch inside each rotation kernel
 *
 * Preserved: %edi, %esi, %ebx (pushed/popped here).
 * Clobbered: %eax, %ecx, %edx, %xmm0-%xmm3, %xmm6, %xmm7, flags.
 * %eax is only used as a loop counter; no return value is set up.
 *
 * Alignment: the unit-stride path peels one and/or two elements until
 * bits 2 and 3 of the X address are clear and then uses movaps on X,
 * so X is assumed to be at least SIZE-byte aligned.  Y is only ever
 * accessed with 4- or 8-byte moves (movss/movsd/movhps/movlps), so the
 * same code serves any Y alignment.
 *
 * NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, SIZE, ALIGN_2 and ALIGN_3
 * presumably come from common.h, and PREFETCHW, PREFETCHSIZE and
 * PREOFFSET from l1param.h - confirm against those headers.
 */

#define ASSEMBLER
#include "common.h"

#define STACK	12
#define ARGS	 0

#define STACK_N		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)
#define STACK_C		24 + STACK + ARGS(%esp)
#define STACK_S		28 + STACK + ARGS(%esp)

#define N	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx

#define I	%eax

#define C	%xmm6
#define S	%xmm7

#include "l1param.h"

/*
 * Rotation kernels.  Each macro expands to one source line, with the
 * instructions separated by the GAS statement separator.  None of them
 * advances X or Y; all of them clobber %xmm0-%xmm3.
 */

/* Rotate the single pair at 0(X) / 0(Y). */
#define ROT_KERNEL_1 \
	movss	0 * SIZE(Y), %xmm1; \
	movss	0 * SIZE(X), %xmm0; \
	movaps	%xmm1, %xmm2; \
	movaps	%xmm0, %xmm3; \
	mulss	C, %xmm0; \
	mulss	S, %xmm1; \
	mulss	C, %xmm2; \
	mulss	S, %xmm3; \
	addss	%xmm1, %xmm0; \
	subss	%xmm3, %xmm2; \
	movss	%xmm0, 0 * SIZE(X); \
	movss	%xmm2, 0 * SIZE(Y)

/*
 * When a header has redefined the movsd mnemonic as a macro, the two
 * destination registers are cleared before the 64-bit loads below.
 * NOTE(review): presumably the replacement load leaves the upper half
 * of the register untouched - confirm in common.h.
 */
#ifdef movsd
#define ROT_CLEAR_PAIR	xorps %xmm0, %xmm0; xorps %xmm1, %xmm1;
#else
#define ROT_CLEAR_PAIR
#endif

/* Rotate the two pairs at 0(X) / 0(Y) using 64-bit loads and stores. */
#define ROT_KERNEL_2 \
	ROT_CLEAR_PAIR \
	movsd	0 * SIZE(Y), %xmm1; \
	movsd	0 * SIZE(X), %xmm0; \
	movaps	%xmm1, %xmm2; \
	movaps	%xmm0, %xmm3; \
	mulps	C, %xmm0; \
	mulps	S, %xmm1; \
	mulps	C, %xmm2; \
	mulps	S, %xmm3; \
	addps	%xmm1, %xmm0; \
	subps	%xmm3, %xmm2; \
	movlps	%xmm0, 0 * SIZE(X); \
	movlps	%xmm2, 0 * SIZE(Y)

/*
 * Rotate the four pairs starting at element offset `off`.
 * X is accessed with movaps and must be 16-byte aligned at that offset;
 * Y is accessed as two 8-byte halves.
 */
#define ROT_KERNEL_4(off) \
	movsd	(off + 0) * SIZE(Y), %xmm1; \
	movhps	(off + 2) * SIZE(Y), %xmm1; \
	movaps	(off + 0) * SIZE(X), %xmm0; \
	movaps	%xmm1, %xmm2; \
	movaps	%xmm0, %xmm3; \
	mulps	C, %xmm0; \
	mulps	S, %xmm1; \
	mulps	C, %xmm2; \
	mulps	S, %xmm3; \
	addps	%xmm1, %xmm0; \
	subps	%xmm3, %xmm2; \
	movaps	%xmm0, (off + 0) * SIZE(X); \
	movlps	%xmm2, (off + 0) * SIZE(Y); \
	movhps	%xmm2, (off + 2) * SIZE(Y)

/* Rotate one pair, then step X and Y by their byte strides. */
#define ROT_STEP_STRIDED \
	ROT_KERNEL_1; \
	addl	INCX, X; \
	addl	INCY, Y

	PROLOGUE
	PROFCODE

	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	STACK_N,    N
	movl	STACK_X,    X
	movl	STACK_INCX, INCX
	movl	STACK_Y,    Y
	movl	STACK_INCY, INCY

	/* Convert the element strides to byte strides. */
	leal	(, INCX, SIZE), INCX
	leal	(, INCY, SIZE), INCY

	movss	STACK_C, C
	movss	STACK_S, S

	/* Broadcast c and s to all four lanes. */
	shufps	$0x0, C, C
	shufps	$0x0, S, S

	/* Nothing to do for N <= 0. */
	cmpl	$0, N
	jle	.L999

	/* Anything other than unit stride on both vectors goes to the
	   scalar strided loop. */
	cmpl	$SIZE, INCX
	jne	.L50
	cmpl	$SIZE, INCY
	jne	.L50

	/* ---- unit stride: peel until X is 16-byte aligned ---- */

	/* Bit 2 of X set: peel one element. */
	testl	$SIZE, X
	je	.L05

	ROT_KERNEL_1

	addl	$1 * SIZE, X
	addl	$1 * SIZE, Y
	decl	N
	jle	.L999

.L05:
	/* Bit 3 of X set: peel two elements, unless only one is left,
	   in which case the scalar tail at .L18 handles it. */
	testl	$2 * SIZE, X
	je	.L10

	cmpl	$1, N
	je	.L17

	ROT_KERNEL_2

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	subl	$2, N
	jle	.L999
	ALIGN_2

.L10:
	/* Invariant: N > 0 and bits 2-3 of X are clear.
	   I = N / 32 is therefore >= 0, so only "I == 0" needs testing.
	   Branch on ZF alone: OF is undefined after a multi-bit SAR. */
	movl	N,  I
	sarl	$5, I
	je	.L14
	ALIGN_3

.L11:
	/* Main loop: 32 element pairs per iteration. */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	ROT_KERNEL_4( 0)
	ROT_KERNEL_4( 4)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	ROT_KERNEL_4( 8)
	ROT_KERNEL_4(12)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	ROT_KERNEL_4(16)
	ROT_KERNEL_4(20)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	ROT_KERNEL_4(24)
	ROT_KERNEL_4(28)

	addl	$32 * SIZE, X
	addl	$32 * SIZE, Y

	decl	I
	jg	.L11
	ALIGN_3

.L14:
	/* Remainder (N mod 32), consumed by the bits of N: 16, 8, 4, 2, 1.
	   These are bit tests, so branch on ZF. */
	testl	$31, N
	je	.L999

	testl	$16, N
	je	.L15

	ROT_KERNEL_4( 0)
	ROT_KERNEL_4( 4)
	ROT_KERNEL_4( 8)
	ROT_KERNEL_4(12)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L15:
	testl	$8, N
	je	.L16

	ROT_KERNEL_4( 0)
	ROT_KERNEL_4( 4)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L16:
	testl	$4, N
	je	.L17

	ROT_KERNEL_4( 0)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L17:
	testl	$2, N
	je	.L18

	ROT_KERNEL_2

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L18:
	testl	$1, N
	je	.L999

	ROT_KERNEL_1
	jmp	.L999
	ALIGN_3

	/* ---- general strides: scalar, four pairs per iteration ---- */

.L50:
	/* N > 0 here, so I = N / 4 is >= 0; test for zero only. */
	movl	N,  I
	sarl	$2, I
	je	.L55
	ALIGN_3

.L53:
	ROT_STEP_STRIDED
	ROT_STEP_STRIDED
	ROT_STEP_STRIDED
	ROT_STEP_STRIDED

	decl	I
	jg	.L53
	ALIGN_3

.L55:
	/* Remaining N mod 4 pairs, one at a time. */
	movl	N,  I
	andl	$3, I
	je	.L999
	ALIGN_3

.L56:
	ROT_STEP_STRIDED

	decl	I
	jg	.L56
	ALIGN_3

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi

	ret

	EPILOGUE