//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
    .byte   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    .byte  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
             _mm_loadu_si128((__m128i *)(__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
    pshufd  $1, %xmm0, %xmm1            ## xmm1 = xmm0[1,0,0,0]
    addss   %xmm0, %xmm1
    pshufd  $3, %xmm0, %xmm2            ## xmm2 = xmm0[3,0,0,0]
    movhlps %xmm0, %xmm0                ## xmm0 = xmm0[1,1]
    movaps  %xmm0, %xmm3
    addss   %xmm1, %xmm3
    movdqa  %xmm2, %xmm0
    addss   %xmm3, %xmm0
    ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
    movdqa   %xmm0, %xmm2
    addss    %xmm1, %xmm2
    pshufd   $1, %xmm1, %xmm1           ## xmm1 = xmm1[1,0,0,0]
    pshufd   $1, %xmm0, %xmm3           ## xmm3 = xmm0[1,0,0,0]
    addss    %xmm1, %xmm3
    movaps   %xmm2, %xmm0
    unpcklps %xmm3, %xmm0               ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
    ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
    %C = select bool %B, double 123.412, double 523.01123123
    ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.
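In C terms, the combined form selects the address with an integer select and
then does a single load; a rough illustration (names are hypothetical, not the
combiner's output):

double test3(int B) {
  static const double C1 = 123.412, C2 = 523.01123123;
  const double *P = B ? &C1 : &C2;  /* integer select of the two addresses */
  return *P;                        /* one FP load, no SSE needed */
}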
//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
    %C = setlt double %A, %B
    %z = fadd double %Z, 0.0        ;; select operand is not a load
    %D = select bool %C, double %Y, double %z
    ret double %D
}

We currently emit:

_X:
    subl $12, %esp
    xorpd %xmm0, %xmm0
    addsd 24(%esp), %xmm0
    movsd 32(%esp), %xmm1
    movsd 16(%esp), %xmm2
    ucomisd 40(%esp), %xmm1
    jb LBB_X_2
LBB_X_1:
    movsd %xmm0, %xmm2
LBB_X_2:
    movsd %xmm2, (%esp)
    fldl (%esp)
    addl $12, %esp
    ret

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value. Also movlps on the
lower half of a v4sf value.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

    movaps (%edx), %xmm2                #59.21
    movaps (%edx), %xmm5                #60.21
    movaps (%edx), %xmm4                #61.21
    movaps (%edx), %xmm3                #62.21
    movl 40(%ecx), %ebp                 #69.49
    shufps $0, %xmm2, %xmm5             #60.21
    movl 100(%esp), %ebx                #69.20
    movl (%ebx), %edi                   #69.20
    imull %ebp, %edi                    #69.49
    addl (%eax), %edi                   #70.33
    shufps $85, %xmm2, %xmm4            #61.21
    shufps $170, %xmm2, %xmm3           #62.21
    shufps $255, %xmm2, %xmm2           #63.21
    lea (%ebp,%ebp,2), %ebx             #69.49
    negl %ebx                           #69.49
    lea -3(%edi,%ebx), %ebx             #70.33
    shll $4, %ebx                       #68.37
    addl 32(%ecx), %ebx                 #68.37
    testb $15, %bl                      #91.13
    jne L_B1.24          # Prob 5%      #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    %reg1078 = MOV32ri -3
    %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
    %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
    %reg1080 = IMUL32rr %reg1079, %reg1037
    %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
    %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
    %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
    %reg1082 = SHL32ri %reg1038, 4
    %reg1039 = ADD32rr %reg1036, %reg1082
    %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
    %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
    %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
    %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
    %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
    %reg1040 = MOV32rr %reg1039
    %reg1084 = AND32ri8 %reg1039, 15
    CMP32ri8 %reg1084, 0
    JE mbb<cond_next204,0xa914d30>

Still ok.
After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    %EAX = MOV32ri -3
    %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
    ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
    %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
    %EDX = MOV32rm %EDX, 1, %NOREG, 40
    IMUL32rr %EAX<def&use>, %EDX
    %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
    %ESI = MOV32rm %ESI, 1, %NOREG, 0
    MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
    %EAX = LEA32r %ESI, 1, %EAX, -3
    %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
    %ESI = MOV32rm %ESI, 1, %NOREG, 32
    %EDI = MOV32rr %EAX
    SHL32ri %EDI<def&use>, 4
    ADD32rr %EDI<def&use>, %ESI
    %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
    %XMM1 = MOVAPSrr %XMM0
    SHUFPSrr %XMM1<def&use>, %XMM1, 170
    %XMM2 = MOVAPSrr %XMM0
    SHUFPSrr %XMM2<def&use>, %XMM2, 0
    %XMM3 = MOVAPSrr %XMM0
    SHUFPSrr %XMM3<def&use>, %XMM3, 255
    SHUFPSrr %XMM0<def&use>, %XMM0, 85
    %EBX = MOV32rr %EDI
    AND32ri8 %EBX<def&use>, 15
    CMP32ri8 %EBX, 0
    JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
because the same value appears as operand two of more than one shufps, a
number of copies are needed. Note that icc suffers from the same problem.
Either the instruction selector should select pshufd, or the register
allocator could make the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

    movss 4(%esp), %xmm1
    mulss %xmm1, %xmm1
    xorps %xmm0, %xmm0
    movss %xmm1, %xmm0
    ret

Because movss from memory zeros the top 3 elements and mulss doesn't modify
them, the top elements of xmm1 are already zero'd. We could compile this to:

    movss 4(%esp), %xmm0
    mulss %xmm0, %xmm0
    ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

    movaps c(%esp), %xmm1
    xorps %xmm0, %xmm0
    movss %xmm1, %xmm0
    ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

    movaps c(%esp), %xmm1
    movaps %xmm1, c2(%esp)
    ...

    xorps %xmm0, %xmm0
    movaps c2(%esp), %xmm1
    movss %xmm1, %xmm0
    ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

    movaps c(%esp), %xmm1
    movaps %xmm1, c2(%esp)
    ...

    movss c2(%esp), %xmm0
    ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one element instead of four
elements. This can be used to simplify a variety of shuffle operations, where
the elements are fixed zeros.
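In intrinsic terms, the folded reload works because a scalar load zeroes the
upper lanes; a minimal sketch (the fold_reload name and spill-slot pointer are
hypothetical):

#include <xmmintrin.h>

/* movss from memory produces {*p, 0, 0, 0}: one 4-byte reload brings in
   the element plus three zeros for free. */
__m128 fold_reload(const float *spill_slot) {
  return _mm_load_ss(spill_slot);
}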
//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
    %xFloat0.688 = load float* %P
    %tmp = load <4 x float>* %P2
    %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
    store <4 x float> %inFloat3.713, <4 x float>* %P2
    ret void
}

Generates:

_test:
    movl 8(%esp), %eax
    movaps (%eax), %xmm0
    pxor %xmm1, %xmm1
    movaps %xmm0, %xmm2
    shufps $50, %xmm1, %xmm2
    shufps $132, %xmm2, %xmm0
    movaps %xmm0, (%eax)
    ret

Would it be better to generate:

_test:
    movl 8(%esp), %ecx
    movaps (%ecx), %xmm0
    xor %eax, %eax
    pinsrw $6, %eax, %xmm0
    pinsrw $7, %eax, %xmm0
    movaps %xmm0, (%ecx)
    ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the transformation that merged four float loads into a single 128-bit
load to loads from the constant pool as well.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-math is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

    cmpeqps xmm1, xmm1          ; xmm1 = all-ones

and:

    cmpeqps xmm1, xmm1          ; xmm1 = all-ones
    pslld xmm1, 31              ; xmm1 = 0x80000000 in each lane (signbit)

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <emmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
    movzwl 4(%esp), %eax
    movd %eax, %xmm0
    movaps _a, %xmm1
    pslld %xmm0, %xmm1
    movaps %xmm1, _a
    ret
_y:
    movd 4(%esp), %xmm0
    movaps _a, %xmm1
    pslld %xmm0, %xmm1
    movaps %xmm1, _a
    ret

"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
like movd would be sufficient in both cases, as the value is already zero
extended in the 32-bit stack slot IIRC. For signed short it should also be
safe, as a negative value would produce an out-of-range shift count, which is
undefined for pslld anyway.
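For illustration, the form "x" could take, written with intrinsics (x_better
is a hypothetical name): _mm_cvtsi32_si128 is a plain movd, and _mm_sll_epi32
is the pslld-with-XMM-count form used above.

#include <emmintrin.h>

extern __m128i a;

/* Hypothetical rewrite of x(): the 16-bit argument is already
   zero-extended in its 32-bit stack slot, so a plain movd of the slot
   yields a valid count; no movzwl is needed. */
void x_better(unsigned short n) {
  a = _mm_sll_epi32(a, _mm_cvtsi32_si128(n));
}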
//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
    subl $12, %esp
    movsd 16(%esp), %xmm0
    movsd %xmm0, (%esp)
    movl 4(%esp), %eax
    shrl $31, %eax
    addl $12, %esp
    ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1        ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                # <4 x float>
    .long 2147483648                    # float -0
    .long 2147483648                    # float -0
    .long 2147483648                    # float -0
    .long 2147483648                    # float -0
_ccosf:
    subl $12, %esp
    movss 16(%esp), %xmm0
    movss %xmm0, 4(%esp)
    movss 20(%esp), %xmm0
    xorps LCPI1_0, %xmm0
    movss %xmm0, (%esp)
    call L_ccoshf$stub
    addl $12, %esp
    ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the
destination is memory.

//===---------------------------------------------------------------------===//

SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
sitting between the truncate and the extract.
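To make the store(fneg(load)) transform above concrete, here is the integer
form in C (an illustrative sketch, not the DAG-level code; store_fneg is a
hypothetical name):

#include <stdint.h>
#include <string.h>

/* store(fneg(load p), q) done entirely in the integer unit: load the
   bits, flip the sign bit, store. No constant pool, no XMM register. */
void store_fneg(const float *p, float *q) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);
  bits ^= UINT32_C(0x80000000);         /* IEEE-754 sign bit */
  memcpy(q, &bits, sizeof bits);
}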
//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 elements simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not necessary. The
problem is that we need to know about stack alignment too early, before RA
runs.

At that point we don't know whether there will be vector spills or not.
The stack realignment logic is overly conservative here, but otherwise we
could produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
- 22725, - 12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
    pushl %ebp
    movl %esp, %ebp
    andl $-16, %esp
    movaps .LCPI1_0, %xmm1
    pmaddwd %xmm1, %xmm0
    movl %ebp, %esp
    popl %ebp
    ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
    movss 4(%esp), %xmm0
    pshufd $81, %xmm0, %xmm0
    ret

in x86-64 mode, we generate this code, which could be better:

_foo2:
    xorps %xmm1, %xmm1
    movss %xmm0, %xmm1
    pshufd $81, %xmm1, %xmm0
    ret

In sse4 mode, we could use insertps to make both better.
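For instance, a single insertps can place x in lane 1 and zero the other lanes
at once; a sketch in intrinsics (foo2_sse41 is a hypothetical name; the
immediate encodes source lane, destination lane, and zero mask):

#include <smmintrin.h>

/* SSE4.1 form of foo2: copy lane 0 of x to lane 1 of the result and
   zero lanes 0, 2 and 3.  imm8 = (src<<6) | (dst<<4) | zmask = 0x1D. */
__m128 foo2_sse41(__m128 x) {
  return _mm_insert_ps(x, x, 0x1D);
}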
Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
    insertps $0x10, x2(%rip), %xmm0
    insertps $0x10, x3(%rip), %xmm1
    movaps %xmm1, %xmm2
    movlhps %xmm0, %xmm2
    movaps %xmm2, %xmm0
    ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
    %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
    ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                ## <4 x i32>
    .long 10
    .long 10
    .long 10
    .long 10
    .text
    .align 4,0x90
    .globl _f
_f:
    pshufd $3, %xmm0, %xmm1
    movd %xmm1, %eax
    imull LCPI1_0+12, %eax
    movd %eax, %xmm1
    pshufd $1, %xmm0, %xmm2
    movd %xmm2, %eax
    imull LCPI1_0+4, %eax
    movd %eax, %xmm2
    punpckldq %xmm1, %xmm2
    movd %xmm0, %eax
    imull LCPI1_0, %eax
    movd %eax, %xmm1
    movhlps %xmm0, %xmm0
    movd %xmm0, %eax
    imull LCPI1_0+8, %eax
    movd %eax, %xmm0
    punpckldq %xmm0, %xmm1
    movaps %xmm1, %xmm0
    punpckldq %xmm2, %xmm0
    ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:

    movl $1, %eax
    xorps %xmm0, %xmm0
    pinsrw $2, %eax, %xmm0
    movzbl 4(%esp), %eax
    pinsrw $3, %eax, %xmm0
    movl $256, %eax
    pinsrw $7, %eax, %xmm0
    ret

gcc-4.2:

    subl $12, %esp
    movzbl 16(%esp), %eax
    movdqa LC0, %xmm0
    pinsrw $3, %eax, %xmm0
    addl $12, %esp
    ret
    .const
    .align 4
LC0:
    .word 0
    .word 0
    .word 1
    .word 0
    .word 0
    .word 0
    .word 0
    .word 256

With SSE4, it should be:

    movdqa .LC0(%rip), %xmm0
    pinsrb $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Also, insertelement of a constant into a vector of constants
should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
We compiled it to something horrible:

    .align 4
LCPI1_1:                                ## float
    .long 1065353216                    ## float 1
    .const

    .align 4
LCPI1_0:                                ## <4 x float>
    .space 4
    .long 1065353216                    ## float 1
    .space 4
    .long 1065353216                    ## float 1
    .text
    .align 4,0x90
    .globl _t
_t:
    xorps %xmm0, %xmm0
    movhps LCPI1_0, %xmm0
    movss LCPI1_1, %xmm1
    movaps %xmm0, %xmm2
    shufps $2, %xmm1, %xmm2
    shufps $132, %xmm2, %xmm0
    movaps %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to this IR (x86-32):

define float @foo(i8 zeroext %x) nounwind {
  %tmp12 = uitofp i8 %x to float        ; <float> [#uses=1]
  ret float %tmp12
}

which compiles to:

_foo:
    subl $4, %esp
    movzbl 8(%esp), %eax
    cvtsi2ss %eax, %xmm0
    movss %xmm0, (%esp)
    flds (%esp)
    addl $4, %esp
    ret

We should be able to use:
    cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2, but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
    .long 1082130432                    ## float 4.000000e+00
_MonteCarlo_num_flops:
    subl $4, %esp
    movl 8(%esp), %eax
    movl %eax, (%esp)
    fildl (%esp)
    fmuls LCPI1_0
    addl $4, %esp
    ret

in SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
    subl $12, %esp
    cvtsi2sd 16(%esp), %xmm0
    mulsd LCPI1_0, %xmm0
    movsd %xmm0, (%esp)
    fldl (%esp)
    addl $12, %esp
    ret

There are also other cases in scimark where using fpstack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.

//===---------------------------------------------------------------------===//

The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
"cmpsd". For example, this code:

double d1(double x) { return x == x ? x : x + x; }

Compiles into:

_d1:
    ucomisd %xmm0, %xmm0
    jnp LBB1_2
    addsd %xmm0, %xmm0
    ret
LBB1_2:
    ret

Also, the 'ret's should be shared. This is PR6032.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?
define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64         ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128              ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                 ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32             ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float          ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
    movd %xmm0, %rax
    shrq $32, %rax
    movl %eax, 4(%rdi)
    ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.

//===---------------------------------------------------------------------===//

On SSE4 machines, we compile this code:

define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
                          <2 x float> *%P) nounwind {
  %Z = fadd <2 x float> %Q, %R

  store <2 x float> %Z, <2 x float> *%P
  ret <2 x float> %Z
}

into:

_test2:                                 ## @test2
## BB#0:
    insertps $0, %xmm2, %xmm2
    insertps $16, %xmm3, %xmm2
    insertps $0, %xmm0, %xmm3
    insertps $16, %xmm1, %xmm3
    addps %xmm2, %xmm3
    movq %xmm3, (%rdi)
    movaps %xmm3, %xmm0
    pshufd $1, %xmm3, %xmm1
    ## kill: XMM1<def> XMM1<kill>
    ret

The insertps's of $0 are pointless complex copies.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad: fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs. This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles we could even replace the
sqrtss and inversion with an rsqrtss instruction, which computes 1/sqrt faster
at the cost of reduced accuracy.

//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.
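For reference, the SSE3 sequence this should select, expressed as intrinsics
(f_sse3 is a hypothetical name):

#include <pmmintrin.h>

/* haddpd p, p computes {p[0]+p[1], p[0]+p[1]}; the scalar result is
   lane 0. */
double f_sse3(__m128d p) {
  return _mm_cvtsd_f64(_mm_hadd_pd(p, p));
}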
//===---------------------------------------------------------------------===//

define <2 x i32> @foo(<2 x double> %in) {
  %x = fptosi <2 x double> %in to <2 x i32>
  ret <2 x i32> %x
}

Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
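In intrinsic terms, the desired single-instruction form (foo_vec is a
hypothetical name):

#include <emmintrin.h>

/* cvttpd2dq truncates both doubles to i32 in one instruction; the upper
   two lanes of the result are zero. */
__m128i foo_vec(__m128d in) {
  return _mm_cvttpd_epi32(in);
}

//===---------------------------------------------------------------------===//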