/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Index-of-extreme-value kernel: 32-bit x86, SSE2, AT&T syntax, fed
 * through the C preprocessor.
 *
 * Arguments (read from the stack after the four register pushes below;
 * STACK = 16 accounts for those pushes, the leading 4 for the return
 * address):
 *   STACK_M    -> M     number of elements
 *   STACK_X    -> X     address of the first element
 *   STACK_INCX -> INCX  stride in elements; scaled by SIZE to bytes below
 * When F_INTERFACE is defined, M and INCX arrive as pointers and are
 * dereferenced.
 *
 * Result (RET = %eax):
 *   1-based position of the first element that compares equal to the
 *   reduced value, or 0 when M <= 0 or INCX <= 0.
 *
 * Build-time variants:
 *   USE_ABS  the sign bit of every element is masked off before use
 *   USE_MIN  maxpd/maxsd are redefined to minpd/minsd, so the reduction
 *            produces the minimum instead of the maximum
 *
 * Structure: every path makes two passes over the vector.
 *   pass 1  reduce all elements into four packed accumulators
 *           (%xmm0..%xmm3), then fold them into one value broadcast in
 *           both lanes of %xmm0
 *   pass 2  rescan from X and count elements in RET until one equals
 *           that value
 * Paths:
 *   .L05 - .L28  unit stride, 16-byte aligned loads (movapd)
 *   .L50 - .L68  unit stride, unaligned loads (movsd + movhpd)
 *   .L80 - .L98  general stride
 *
 * Elements are handled with double-precision instructions; SIZE comes
 * from common.h (presumably 8 -- the alignment tests below assume 8-byte
 * elements; confirm against common.h).
 *
 * NOTE(review): the scalar checks use "comisd ; je". COMISD also sets ZF
 * for an unordered compare, so a NaN element would be reported as a match.
 */

#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/*
 * Register roles.  TEMP shares %ebx with M: TEMP is only written during
 * pass 2, after M has been copied back into MM.
 */
#define RET	%eax
#define M	%ebx
#define X	%ecx
#define INCX	%edx
#define I	%esi
#define MM	%ebp
#define XX	%edi
#define TEMP	%ebx

#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif

#include "l1param.h"

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

#ifdef F_INTERFACE
	movl	(M), M
	movl	(INCX), INCX
#endif

	pxor	%xmm0, %xmm0
#ifdef USE_ABS
	pxor	%xmm7, %xmm7
#endif
	xor	RET, RET			/* result defaults to 0 */
	testl	M, M
	jle	.L999
	leal	(, INCX, SIZE), INCX		/* stride: elements -> bytes */
	testl	INCX, INCX
	jle	.L999

	movl	M, MM
	movl	X, XX

#ifdef USE_ABS
	/* 0 == 0 sets every bit; the shift clears bit 63 of each lane,
	   leaving a mask that strips the sign bit under andpd. */
	cmpeqpd	%xmm7, %xmm7
	psrlq	$1, %xmm7
#endif

	/* Seed all four accumulators with the first element. */
	movsd	(XX), %xmm0
	addl	INCX, XX
	decl	MM
#ifdef USE_ABS
	andpd	%xmm7, %xmm0
#endif
	unpcklpd %xmm0, %xmm0
	movapd	%xmm0, %xmm1
	movapd	%xmm0, %xmm2
	movapd	%xmm0, %xmm3
	cmpl	$SIZE, INCX
	jne	.L80				/* non-unit stride */

/* Alignment check */
	cmpl	$7, MM
	jle	.L50				/* 7 or fewer left: use the unaligned path */

	testl	$7, XX
	jne	.L50				/* Purely Unaligned Mode */

	testl	$15, XX				/* Checking for 128bit align */
	je	.L05

	/* Consume one element so that XX becomes 16-byte aligned. */
	movsd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	unpcklpd %xmm4, %xmm4
	maxpd	%xmm4, %xmm3
	decl	MM
	addl	$SIZE, XX
	ALIGN_3

/* ---- Aligned path, pass 1: 16 elements per iteration ---- */
.L05:
	movl	MM, I
	sarl	$4, I
	jle	.L15
	ALIGN_4

.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movapd	 0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	 2 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movapd	 4 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movapd	 6 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	movapd	 8 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	10 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movapd	12 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movapd	14 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	addl	$16 * SIZE, XX
	decl	I
	jg	.L11
	ALIGN_4

/* Aligned path, pass 1 tail: remaining 8 / 4 / 2 / 1 elements. */
.L15:
	andl	$15, MM
	jle	.L20

	testl	$8, MM
	je	.L16

	movapd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	2 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movapd	4 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movapd	6 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3
	addl	$8 * SIZE, XX
	ALIGN_3

.L16:
	testl	$4, MM
	je	.L17

	movapd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	2 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1
	addl	$4 * SIZE, XX
	ALIGN_3

.L17:
	testl	$2, MM
	je	.L18

	movapd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2
	addl	$2 * SIZE, XX

.L18:
	testl	$1, MM
	je	.L20

	movsd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	unpcklpd %xmm4, %xmm4
	maxpd	%xmm4, %xmm3
	ALIGN_3

/* Finding Index */
/* ---- Aligned path, pass 2 ---- */
.L20:
	movl	X, XX				/* restart from the first element */
	movl	M, MM

	/* Fold the four accumulators, then the two lanes, and broadcast
	   the result into both lanes of %xmm0. */
	maxpd	%xmm1, %xmm0
	maxpd	%xmm3, %xmm2
	maxpd	%xmm2, %xmm0
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	testl	$15, XX				/* Checking for 128bit align */
	je	.L21

	/* Check one element on its own so the loop below can use movapd. */
	movsd	0 * SIZE(XX), %xmm1
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	addl	$SIZE, XX
	decl	MM
	ALIGN_3

.L21:
	movl	MM, I
	sarl	$3, I
	jle	.L25
	ALIGN_4

/* Compare 8 elements at once; branch to .L23 if any lane matched. */
.L22:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movapd	0 * SIZE(XX), %xmm1
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
#endif
	cmpeqpd	%xmm0, %xmm1

	movapd	2 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	%xmm7, %xmm2
#endif
	cmpeqpd	%xmm0, %xmm2

	movapd	4 * SIZE(XX), %xmm3
#ifdef USE_ABS
	andpd	%xmm7, %xmm3
#endif
	cmpeqpd	%xmm0, %xmm3

	movapd	6 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	cmpeqpd	%xmm0, %xmm4

	orpd	%xmm2, %xmm1
	orpd	%xmm4, %xmm3
	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP			/* sign bits of the merged compare masks */
	testl	$3, TEMP
	jne	.L23

	addl	$8 * SIZE, XX
	addl	$8, RET
	decl	I
	jg	.L22
	jmp	.L25
	ALIGN_4

/* A match lies within the 8 elements at XX: locate it one at a time.
   RET is bumped before each compare, so it holds the 1-based position. */
.L23:
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999

	movsd	4 * SIZE(XX), %xmm1
	movsd	5 * SIZE(XX), %xmm2
	movsd	6 * SIZE(XX), %xmm3

#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET				/* none of the first seven: it is the eighth */
	jmp	.L999
	ALIGN_3

/* Aligned path, pass 2 tail: remaining 4 / 2 / 1 elements. */
.L25:
	testl	$4, MM
	je	.L27

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
#endif
	addl	$4 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	ALIGN_3

.L27:
	testl	$2, MM
	je	.L28

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
#endif
	addl	$2 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	ALIGN_3

/* Nothing matched so far: count one more element and return it
   without comparing. */
.L28:
	incl	RET
	jmp	.L999
	ALIGN_3

/* ---- Unaligned path, pass 1: 16 elements per iteration ---- */
.L50:
/* Unaligned Mode */
	movl	MM, I
	sarl	$4, I
	jle	.L55
	ALIGN_4

.L51:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	 0 * SIZE(XX), %xmm4
	movhpd	 1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	 2 * SIZE(XX), %xmm4
	movhpd	 3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	 4 * SIZE(XX), %xmm4
	movhpd	 5 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	 6 * SIZE(XX), %xmm4
	movhpd	 7 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	movsd	 8 * SIZE(XX), %xmm4
	movhpd	 9 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	10 * SIZE(XX), %xmm4
	movhpd	11 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	12 * SIZE(XX), %xmm4
	movhpd	13 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	14 * SIZE(XX), %xmm4
	movhpd	15 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	addl	$16 * SIZE, XX
	decl	I
	jg	.L51
	ALIGN_4

/* Unaligned path, pass 1 tail: remaining 8 / 4 / 2 / 1 elements. */
.L55:
	andl	$15, MM
	jle	.L60

	testl	$8, MM
	je	.L56

	movsd	0 * SIZE(XX), %xmm4
	movhpd	1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	2 * SIZE(XX), %xmm4
	movhpd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	4 * SIZE(XX), %xmm4
	movhpd	5 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	6 * SIZE(XX), %xmm4
	movhpd	7 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	addl	$8 * SIZE, XX
	ALIGN_3

.L56:
	testl	$4, MM
	je	.L57

	movsd	0 * SIZE(XX), %xmm4
	movhpd	1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	2 * SIZE(XX), %xmm4
	movhpd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1
	addl	$4 * SIZE, XX
	ALIGN_3

.L57:
	testl	$2, MM
	je	.L58

	movsd	0 * SIZE(XX), %xmm4
	movhpd	1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2
	addl	$2 * SIZE, XX

.L58:
	testl	$1, MM
	je	.L60

	movsd	0 * SIZE(XX), %xmm4
	unpcklpd %xmm4, %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3
	ALIGN_3

/* ---- Unaligned path, pass 2 ---- */
.L60:
	movl	X, XX				/* restart from the first element */
	movl	M, MM

	/* Fold accumulators and lanes; broadcast the result in %xmm0. */
	maxpd	%xmm1, %xmm0
	maxpd	%xmm3, %xmm2
	maxpd	%xmm2, %xmm0
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM, I
	sarl	$3, I
	jle	.L65
	ALIGN_4

/* Compare 8 elements at once; branch to .L63 if any lane matched. */
.L62:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm1
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
#endif
	cmpeqpd	%xmm0, %xmm1

	movsd	2 * SIZE(XX), %xmm2
	movhpd	3 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	%xmm7, %xmm2
#endif
	cmpeqpd	%xmm0, %xmm2

	movsd	4 * SIZE(XX), %xmm3
	movhpd	5 * SIZE(XX), %xmm3
#ifdef USE_ABS
	andpd	%xmm7, %xmm3
#endif
	cmpeqpd	%xmm0, %xmm3

	movsd	6 * SIZE(XX), %xmm4
	movhpd	7 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	cmpeqpd	%xmm0, %xmm4

	orpd	%xmm2, %xmm1
	orpd	%xmm4, %xmm3
	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP			/* sign bits of the merged compare masks */
	testl	$3, TEMP
	jne	.L63

	addl	$8 * SIZE, XX
	addl	$8, RET
	decl	I
	jg	.L62
	jmp	.L65
	ALIGN_4

/* A match lies within the 8 elements at XX: locate it one at a time. */
.L63:
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4

#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	incl	RET				/* pre-counts the fifth element, compared below */

	movsd	4 * SIZE(XX), %xmm1
	movsd	5 * SIZE(XX), %xmm2
	movsd	6 * SIZE(XX), %xmm3

#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
#endif

	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET				/* none of the first seven: it is the eighth */
	jmp	.L999
	ALIGN_3

/* Unaligned path, pass 2 tail: remaining 4 / 2 / 1 elements. */
.L65:
	testl	$4, MM
	je	.L67

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
#endif
	addl	$4 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	ALIGN_3

.L67:
	testl	$2, MM
	je	.L68

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
#endif
	addl	$2 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	ALIGN_3

/* Nothing matched so far: count one more element and return it
   without comparing. */
.L68:
	incl	RET
	jmp	.L999
	ALIGN_4

/* ---- Strided path, pass 1: 16 elements per iteration ---- */
/* XX advances by INCX bytes after every scalar load. */
.L80:
	movl	MM, I
	sarl	$4, I
	jle	.L85
	ALIGN_4

.L81:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	decl	I
	jg	.L81
	ALIGN_4

/* Strided path, pass 1 tail: remaining 8 / 4 / 2 / 1 elements. */
.L85:
	andl	$15, MM
	jle	.L90

	testl	$8, MM
	je	.L86

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3
	ALIGN_3

.L86:
	testl	$4, MM
	je	.L87

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1
	ALIGN_3

.L87:
	testl	$2, MM
	je	.L88

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2
	ALIGN_3

.L88:
	testl	$1, MM
	je	.L90

	movsd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	unpcklpd %xmm4, %xmm4
	maxpd	%xmm4, %xmm3
	ALIGN_4

/* ---- Strided path, pass 2 ---- */
.L90:
	movl	X, XX				/* restart from the first element */
	movl	M, MM

	/* Fold accumulators and lanes; broadcast the result in %xmm0. */
	maxpd	%xmm1, %xmm0
	maxpd	%xmm3, %xmm2
	maxpd	%xmm2, %xmm0
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM, I
	sarl	$3, I
	jle	.L95
	ALIGN_4

/* Compare 8 elements at once; branch to .L93 if any lane matched. */
.L92:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
#endif
	cmpeqpd	%xmm0, %xmm1

	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm2
#endif
	cmpeqpd	%xmm0, %xmm2

	movsd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm3
#endif
	cmpeqpd	%xmm0, %xmm3

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	cmpeqpd	%xmm0, %xmm4

	orpd	%xmm2, %xmm1
	orpd	%xmm4, %xmm3
	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP			/* sign bits of the merged compare masks */
	testl	$3, TEMP
	jne	.L93

	addl	$8, RET
	decl	I
	jg	.L92
	jmp	.L95
	ALIGN_4

/* A match lies within the last 8 elements.  XX has already moved past
   them, so step it back by 8 strides before the one-by-one search. */
.L93:
	leal	(, INCX, 8), TEMP
	subl	TEMP, XX

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET				/* none of the first seven: it is the eighth */
	jmp	.L999
	ALIGN_3

/* Strided path, pass 2 tail: remaining 4 / 2 / 1 elements. */
.L95:
	testl	$4, MM
	je	.L97

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	ALIGN_3

.L97:
	testl	$2, MM
	je	.L98

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	ALIGN_3

/* Nothing matched so far: count one more element and fall through to
   the exit without comparing. */
.L98:
	incl	RET
	ALIGN_3

/* Common exit: restore the saved registers in reverse push order. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE