1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend.
3//===---------------------------------------------------------------------===//
4
5Improvements to the multiply -> shift/add algorithm:
6http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
7
8//===---------------------------------------------------------------------===//
9
10Improve code like this (occurs fairly frequently, e.g. in LLVM):
11long long foo(int x) { return 1LL << x; }
12
13http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
14http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
15http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
16
17Another useful one would be  ~0ULL >> X and ~0ULL << X.
18
19One better solution for 1LL << x is:
20        xorl    %eax, %eax
21        xorl    %edx, %edx
22        testb   $32, %cl
23        sete    %al
24        setne   %dl
25        sall    %cl, %eax
26        sall    %cl, %edx
27
28But that requires good 8-bit subreg support.
29
30Also, this might be better.  It's an extra shift, but it's one instruction
31shorter, and doesn't stress 8-bit subreg support.
32(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
33but without the unnecessary and.)
34        movl %ecx, %eax
35        shrl $5, %eax
36        movl %eax, %edx
37        xorl $1, %edx
38        sall %cl, %eax
39        sall %cl. %edx
40
4164-bit shifts (in general) expand to really bad code.  Instead of using
42cmovs, we should expand to a conditional branch like GCC produces.
43
44//===---------------------------------------------------------------------===//
45
46Some isel ideas:
47
481. Dynamic programming based approach when compile time is not an
49   issue.
502. Code duplication (addressing mode) during isel.
513. Other ideas from "Register-Sensitive Selection, Duplication, and
52   Sequencing of Instructions".
534. Scheduling for reduced register pressure.  E.g. "Minimum Register
54   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
55   and other related papers.
56   http://citeseer.ist.psu.edu/govindarajan01minimum.html
57
58//===---------------------------------------------------------------------===//
59
60Should we promote i16 to i32 to avoid partial register update stalls?
61
62//===---------------------------------------------------------------------===//
63
64Leave any_extend as pseudo instruction and hint to register
65allocator. Delay codegen until post register allocation.
66Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
67the coalescer how to deal with it though.
68
69//===---------------------------------------------------------------------===//
70
71It appears icc use push for parameter passing. Need to investigate.
72
73//===---------------------------------------------------------------------===//
74
75The instruction selector sometimes misses folding a load into a compare.  The
76pattern is written as (cmp reg, (load p)).  Because the compare isn't
77commutative, it is not matched with the load on both sides.  The dag combiner
78should be made smart enough to canonicalize the load into the RHS of a compare
79when it can invert the result of the compare for free.
80
81//===---------------------------------------------------------------------===//
82
83In many cases, LLVM generates code like this:
84
85_test:
86        movl 8(%esp), %eax
87        cmpl %eax, 4(%esp)
88        setl %al
89        movzbl %al, %eax
90        ret
91
92on some processors (which ones?), it is more efficient to do this:
93
94_test:
95        movl 8(%esp), %ebx
96        xor  %eax, %eax
97        cmpl %ebx, 4(%esp)
98        setl %al
99        ret
100
101Doing this correctly is tricky though, as the xor clobbers the flags.
102
103//===---------------------------------------------------------------------===//
104
105We should generate bts/btr/etc instructions on targets where they are cheap or
106when codesize is important.  e.g., for:
107
108void setbit(int *target, int bit) {
109    *target |= (1 << bit);
110}
111void clearbit(int *target, int bit) {
112    *target &= ~(1 << bit);
113}
114
115//===---------------------------------------------------------------------===//
116
117Instead of the following for memset char*, 1, 10:
118
119	movl $16843009, 4(%edx)
120	movl $16843009, (%edx)
121	movw $257, 8(%edx)
122
123It might be better to generate
124
125	movl $16843009, %eax
126	movl %eax, 4(%edx)
127	movl %eax, (%edx)
128	movw al, 8(%edx)
129
130when we can spare a register. It reduces code size.
131
132//===---------------------------------------------------------------------===//
133
134Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
135get this:
136
137define i32 @test1(i32 %X) {
138    %Y = sdiv i32 %X, 8
139    ret i32 %Y
140}
141
142_test1:
143        movl 4(%esp), %eax
144        movl %eax, %ecx
145        sarl $31, %ecx
146        shrl $29, %ecx
147        addl %ecx, %eax
148        sarl $3, %eax
149        ret
150
151GCC knows several different ways to codegen it, one of which is this:
152
153_test1:
154        movl    4(%esp), %eax
155        cmpl    $-1, %eax
156        leal    7(%eax), %ecx
157        cmovle  %ecx, %eax
158        sarl    $3, %eax
159        ret
160
161which is probably slower, but it's interesting at least :)
162
163//===---------------------------------------------------------------------===//
164
165We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
166We should leave these as libcalls for everything over a much lower threshold,
167since libc is hand tuned for medium and large mem ops (avoiding RFO for large
168stores, TLB preheating, etc)
169
170//===---------------------------------------------------------------------===//
171
172Optimize this into something reasonable:
173 x * copysign(1.0, y) * copysign(1.0, z)
174
175//===---------------------------------------------------------------------===//
176
177Optimize copysign(x, *y) to use an integer load from y.
178
179//===---------------------------------------------------------------------===//
180
181The following tests perform worse with LSR:
182
183lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
184
185//===---------------------------------------------------------------------===//
186
187Adding to the list of cmp / test poor codegen issues:
188
189int test(__m128 *A, __m128 *B) {
190  if (_mm_comige_ss(*A, *B))
191    return 3;
192  else
193    return 4;
194}
195
196_test:
197	movl 8(%esp), %eax
198	movaps (%eax), %xmm0
199	movl 4(%esp), %eax
200	movaps (%eax), %xmm1
201	comiss %xmm0, %xmm1
202	setae %al
203	movzbl %al, %ecx
204	movl $3, %eax
205	movl $4, %edx
206	cmpl $0, %ecx
207	cmove %edx, %eax
208	ret
209
210Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
211are a number of issues. 1) We are introducing a setcc between the result of the
212intrisic call and select. 2) The intrinsic is expected to produce a i32 value
213so a any extend (which becomes a zero extend) is added.
214
215We probably need some kind of target DAG combine hook to fix this.
216
217//===---------------------------------------------------------------------===//
218
219We generate significantly worse code for this than GCC:
220http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
221http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
222
223There is also one case we do worse on PPC.
224
225//===---------------------------------------------------------------------===//
226
227For this:
228
229int test(int a)
230{
231  return a * 3;
232}
233
234We currently emits
235	imull $3, 4(%esp), %eax
236
237Perhaps this is what we really should generate is? Is imull three or four
238cycles? Note: ICC generates this:
239	movl	4(%esp), %eax
240	leal	(%eax,%eax,2), %eax
241
242The current instruction priority is based on pattern complexity. The former is
243more "complex" because it folds a load so the latter will not be emitted.
244
245Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
246should always try to match LEA first since the LEA matching code does some
247estimate to determine whether the match is profitable.
248
249However, if we care more about code size, then imull is better. It's two bytes
250shorter than movl + leal.
251
252On a Pentium M, both variants have the same characteristics with regard
253to throughput; however, the multiplication has a latency of four cycles, as
254opposed to two cycles for the movl+lea variant.
255
256//===---------------------------------------------------------------------===//
257
258It appears gcc place string data with linkonce linkage in
259.section __TEXT,__const_coal,coalesced instead of
260.section __DATA,__const_coal,coalesced.
261Take a look at darwin.h, there are other Darwin assembler directives that we
262do not make use of.
263
264//===---------------------------------------------------------------------===//
265
266define i32 @foo(i32* %a, i32 %t) {
267entry:
268	br label %cond_true
269
270cond_true:		; preds = %cond_true, %entry
271	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
272	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
273	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
274	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
275	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
276	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
277	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
278	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
279	br i1 %tmp, label %bb12, label %cond_true
280
281bb12:		; preds = %cond_true
282	ret i32 %tmp7
283}
284is pessimized by -loop-reduce and -indvars
285
286//===---------------------------------------------------------------------===//
287
288u32 to float conversion improvement:
289
290float uint32_2_float( unsigned u ) {
291  float fl = (int) (u & 0xffff);
292  float fh = (int) (u >> 16);
293  fh *= 0x1.0p16f;
294  return fh + fl;
295}
296
29700000000        subl    $0x04,%esp
29800000003        movl    0x08(%esp,1),%eax
29900000007        movl    %eax,%ecx
30000000009        shrl    $0x10,%ecx
3010000000c        cvtsi2ss        %ecx,%xmm0
30200000010        andl    $0x0000ffff,%eax
30300000015        cvtsi2ss        %eax,%xmm1
30400000019        mulss   0x00000078,%xmm0
30500000021        addss   %xmm1,%xmm0
30600000025        movss   %xmm0,(%esp,1)
3070000002a        flds    (%esp,1)
3080000002d        addl    $0x04,%esp
30900000030        ret
310
311//===---------------------------------------------------------------------===//
312
313When using fastcc abi, align stack slot of argument of type double on 8 byte
314boundary to improve performance.
315
316//===---------------------------------------------------------------------===//
317
318GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
319simplifications for integer "x cmp y ? a : b".
320
321//===---------------------------------------------------------------------===//
322
323Consider the expansion of:
324
325define i32 @test3(i32 %X) {
326        %tmp1 = urem i32 %X, 255
327        ret i32 %tmp1
328}
329
330Currently it compiles to:
331
332...
333        movl $2155905153, %ecx
334        movl 8(%esp), %esi
335        movl %esi, %eax
336        mull %ecx
337...
338
339This could be "reassociated" into:
340
341        movl $2155905153, %eax
342        movl 8(%esp), %ecx
343        mull %ecx
344
345to avoid the copy.  In fact, the existing two-address stuff would do this
346except that mul isn't a commutative 2-addr instruction.  I guess this has
347to be done at isel time based on the #uses to mul?
348
349//===---------------------------------------------------------------------===//
350
351Make sure the instruction which starts a loop does not cross a cacheline
352boundary. This requires knowning the exact length of each machine instruction.
353That is somewhat complicated, but doable. Example 256.bzip2:
354
355In the new trace, the hot loop has an instruction which crosses a cacheline
356boundary.  In addition to potential cache misses, this can't help decoding as I
357imagine there has to be some kind of complicated decoder reset and realignment
358to grab the bytes from the next cacheline.
359
360532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
361942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
362937  937 0x3d0a incl     %esi
3633    3   0x3d0b cmpb     %bl, %dl
36427   27  0x3d0d jnz      0x000062db <main+11707>
365
366//===---------------------------------------------------------------------===//
367
368In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
369
370//===---------------------------------------------------------------------===//
371
372This could be a single 16-bit load.
373
374int f(char *p) {
375    if ((p[0] == 1) & (p[1] == 2)) return 1;
376    return 0;
377}
378
379//===---------------------------------------------------------------------===//
380
381We should inline lrintf and probably other libc functions.
382
383//===---------------------------------------------------------------------===//
384
385This code:
386
387void test(int X) {
388  if (X) abort();
389}
390
391is currently compiled to:
392
393_test:
394        subl $12, %esp
395        cmpl $0, 16(%esp)
396        jne LBB1_1
397        addl $12, %esp
398        ret
399LBB1_1:
400        call L_abort$stub
401
402It would be better to produce:
403
404_test:
405        subl $12, %esp
406        cmpl $0, 16(%esp)
407        jne L_abort$stub
408        addl $12, %esp
409        ret
410
411This can be applied to any no-return function call that takes no arguments etc.
412Alternatively, the stack save/restore logic could be shrink-wrapped, producing
413something like this:
414
415_test:
416        cmpl $0, 4(%esp)
417        jne LBB1_1
418        ret
419LBB1_1:
420        subl $12, %esp
421        call L_abort$stub
422
423Both are useful in different situations.  Finally, it could be shrink-wrapped
424and tail called, like this:
425
426_test:
427        cmpl $0, 4(%esp)
428        jne LBB1_1
429        ret
430LBB1_1:
431        pop %eax   # realign stack.
432        call L_abort$stub
433
434Though this probably isn't worth it.
435
436//===---------------------------------------------------------------------===//
437
438Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
439a neg instead of a sub instruction.  Consider:
440
441int test(char X) { return 7-X; }
442
443we currently produce:
444_test:
445        movl $7, %eax
446        movsbl 4(%esp), %ecx
447        subl %ecx, %eax
448        ret
449
450We would use one fewer register if codegen'd as:
451
452        movsbl 4(%esp), %eax
453	neg %eax
454        add $7, %eax
455        ret
456
457Note that this isn't beneficial if the load can be folded into the sub.  In
458this case, we want a sub:
459
460int test(int X) { return 7-X; }
461_test:
462        movl $7, %eax
463        subl 4(%esp), %eax
464        ret
465
466//===---------------------------------------------------------------------===//
467
468Leaf functions that require one 4-byte spill slot have a prolog like this:
469
470_foo:
471        pushl   %esi
472        subl    $4, %esp
473...
474and an epilog like this:
475        addl    $4, %esp
476        popl    %esi
477        ret
478
479It would be smaller, and potentially faster, to push eax on entry and to
480pop into a dummy register instead of using addl/subl of esp.  Just don't pop
481into any return registers :)
482
483//===---------------------------------------------------------------------===//
484
485The X86 backend should fold (branch (or (setcc, setcc))) into multiple
486branches.  We generate really poor code for:
487
488double testf(double a) {
489       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
490}
491
492For example, the entry BB is:
493
494_testf:
495        subl    $20, %esp
496        pxor    %xmm0, %xmm0
497        movsd   24(%esp), %xmm1
498        ucomisd %xmm0, %xmm1
499        setnp   %al
500        sete    %cl
501        testb   %cl, %al
502        jne     LBB1_5  # UnifiedReturnBlock
503LBB1_1: # cond_true
504
505
506it would be better to replace the last four instructions with:
507
508	jp LBB1_1
509	je LBB1_5
510LBB1_1:
511
512We also codegen the inner ?: into a diamond:
513
514       cvtss2sd        LCPI1_0(%rip), %xmm2
515        cvtss2sd        LCPI1_1(%rip), %xmm3
516        ucomisd %xmm1, %xmm0
517        ja      LBB1_3  # cond_true
518LBB1_2: # cond_true
519        movapd  %xmm3, %xmm2
520LBB1_3: # cond_true
521        movapd  %xmm2, %xmm0
522        ret
523
524We should sink the load into xmm3 into the LBB1_2 block.  This should
525be pretty easy, and will nuke all the copies.
526
527//===---------------------------------------------------------------------===//
528
529This:
530        #include <algorithm>
531        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
532        { return std::make_pair(a + b, a + b < a); }
533        bool no_overflow(unsigned a, unsigned b)
534        { return !full_add(a, b).second; }
535
536Should compile to:
537	addl	%esi, %edi
538	setae	%al
539	movzbl	%al, %eax
540	ret
541
542on x86-64, instead of the rather stupid-looking:
543	addl	%esi, %edi
544	setb	%al
545	xorb	$1, %al
546	movzbl	%al, %eax
547	ret
548
549
550//===---------------------------------------------------------------------===//
551
552The following code:
553
554bb114.preheader:		; preds = %cond_next94
555	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
556	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
557	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
558	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
559	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
560	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
561	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
562	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
563	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
564	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
565	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
566	br label %bb114
567
568produces:
569
570LBB3_5:	# bb114.preheader
571	movswl	-68(%ebp), %eax
572	movl	$32, %ecx
573	movl	%ecx, -80(%ebp)
574	subl	%eax, -80(%ebp)
575	movswl	-52(%ebp), %eax
576	movl	%ecx, -84(%ebp)
577	subl	%eax, -84(%ebp)
578	movswl	-70(%ebp), %eax
579	movl	%ecx, -88(%ebp)
580	subl	%eax, -88(%ebp)
581	movswl	-50(%ebp), %eax
582	subl	%eax, %ecx
583	movl	%ecx, -76(%ebp)
584	movswl	-42(%ebp), %eax
585	movl	%eax, -92(%ebp)
586	movswl	-66(%ebp), %eax
587	movl	%eax, -96(%ebp)
588	movw	$0, -98(%ebp)
589
590This appears to be bad because the RA is not folding the store to the stack
591slot into the movl.  The above instructions could be:
592	movl    $32, -80(%ebp)
593...
594	movl    $32, -84(%ebp)
595...
596This seems like a cross between remat and spill folding.
597
598This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
599change, so we could simply subtract %eax from %ecx first and then use %ecx (or
600vice-versa).
601
602//===---------------------------------------------------------------------===//
603
604This code:
605
606	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
607	br i1 %tmp659, label %cond_true662, label %cond_next715
608
609produces this:
610
611	testw	%cx, %cx
612	movswl	%cx, %esi
613	jns	LBB4_109	# cond_next715
614
615Shark tells us that using %cx in the testw instruction is sub-optimal. It
616suggests using the 32-bit register (which is what ICC uses).
617
618//===---------------------------------------------------------------------===//
619
620We compile this:
621
622void compare (long long foo) {
623  if (foo < 4294967297LL)
624    abort();
625}
626
627to:
628
629compare:
630        subl    $4, %esp
631        cmpl    $0, 8(%esp)
632        setne   %al
633        movzbw  %al, %ax
634        cmpl    $1, 12(%esp)
635        setg    %cl
636        movzbw  %cl, %cx
637        cmove   %ax, %cx
638        testb   $1, %cl
639        jne     .LBB1_2 # UnifiedReturnBlock
640.LBB1_1:        # ifthen
641        call    abort
642.LBB1_2:        # UnifiedReturnBlock
643        addl    $4, %esp
644        ret
645
646(also really horrible code on ppc).  This is due to the expand code for 64-bit
647compares.  GCC produces multiple branches, which is much nicer:
648
649compare:
650        subl    $12, %esp
651        movl    20(%esp), %edx
652        movl    16(%esp), %eax
653        decl    %edx
654        jle     .L7
655.L5:
656        addl    $12, %esp
657        ret
658        .p2align 4,,7
659.L7:
660        jl      .L4
661        cmpl    $0, %eax
662        .p2align 4,,8
663        ja      .L5
664.L4:
665        .p2align 4,,9
666        call    abort
667
668//===---------------------------------------------------------------------===//
669
670Tail call optimization improvements: Tail call optimization currently
671pushes all arguments on the top of the stack (their normal place for
672non-tail call optimized calls) that source from the callers arguments
673or  that source from a virtual register (also possibly sourcing from
674callers arguments).
675This is done to prevent overwriting of parameters (see example
676below) that might be used later.
677
678example:
679
680int callee(int32, int64);
681int caller(int32 arg1, int32 arg2) {
682  int64 local = arg2 * 2;
683  return callee(arg2, (int64)local);
684}
685
686[arg1]          [!arg2 no longer valid since we moved local onto it]
687[arg2]      ->  [(int64)
688[RETADDR]        local  ]
689
690Moving arg1 onto the stack slot of callee function would overwrite
691arg2 of the caller.
692
693Possible optimizations:
694
695
696 - Analyse the actual parameters of the callee to see which would
697   overwrite a caller parameter which is used by the callee and only
698   push them onto the top of the stack.
699
700   int callee (int32 arg1, int32 arg2);
701   int caller (int32 arg1, int32 arg2) {
702       return callee(arg1,arg2);
703   }
704
705   Here we don't need to write any variables to the top of the stack
706   since they don't overwrite each other.
707
708   int callee (int32 arg1, int32 arg2);
709   int caller (int32 arg1, int32 arg2) {
710       return callee(arg2,arg1);
711   }
712
713   Here we need to push the arguments because they overwrite each
714   other.
715
716//===---------------------------------------------------------------------===//
717
718main ()
719{
720  int i = 0;
721  unsigned long int z = 0;
722
723  do {
724    z -= 0x00004000;
725    i++;
726    if (i > 0x00040000)
727      abort ();
728  } while (z > 0);
729  exit (0);
730}
731
732gcc compiles this to:
733
734_main:
735	subl	$28, %esp
736	xorl	%eax, %eax
737	jmp	L2
738L3:
739	cmpl	$262144, %eax
740	je	L10
741L2:
742	addl	$1, %eax
743	cmpl	$262145, %eax
744	jne	L3
745	call	L_abort$stub
746L10:
747	movl	$0, (%esp)
748	call	L_exit$stub
749
750llvm:
751
752_main:
753	subl	$12, %esp
754	movl	$1, %eax
755	movl	$16384, %ecx
756LBB1_1:	# bb
757	cmpl	$262145, %eax
758	jge	LBB1_4	# cond_true
759LBB1_2:	# cond_next
760	incl	%eax
761	addl	$4294950912, %ecx
762	cmpl	$16384, %ecx
763	jne	LBB1_1	# bb
764LBB1_3:	# bb11
765	xorl	%eax, %eax
766	addl	$12, %esp
767	ret
768LBB1_4:	# cond_true
769	call	L_abort$stub
770
7711. LSR should rewrite the first cmp with induction variable %ecx.
7722. DAG combiner should fold
773        leal    1(%eax), %edx
774        cmpl    $262145, %edx
775   =>
776        cmpl    $262144, %eax
777
778//===---------------------------------------------------------------------===//
779
780define i64 @test(double %X) {
781	%Y = fptosi double %X to i64
782	ret i64 %Y
783}
784
785compiles to:
786
787_test:
788	subl	$20, %esp
789	movsd	24(%esp), %xmm0
790	movsd	%xmm0, 8(%esp)
791	fldl	8(%esp)
792	fisttpll	(%esp)
793	movl	4(%esp), %edx
794	movl	(%esp), %eax
795	addl	$20, %esp
796	#FP_REG_KILL
797	ret
798
799This should just fldl directly from the input stack slot.
800
801//===---------------------------------------------------------------------===//
802
803This code:
804int foo (int x) { return (x & 65535) | 255; }
805
806Should compile into:
807
808_foo:
809        movzwl  4(%esp), %eax
810        orl     $255, %eax
811        ret
812
813instead of:
814_foo:
815	movl	$65280, %eax
816	andl	4(%esp), %eax
817	orl	$255, %eax
818	ret
819
820//===---------------------------------------------------------------------===//
821
822We're codegen'ing multiply of long longs inefficiently:
823
824unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
825  return arg1 *  arg2;
826}
827
828We compile to (fomit-frame-pointer):
829
830_LLM:
831	pushl	%esi
832	movl	8(%esp), %ecx
833	movl	16(%esp), %esi
834	movl	%esi, %eax
835	mull	%ecx
836	imull	12(%esp), %esi
837	addl	%edx, %esi
838	imull	20(%esp), %ecx
839	movl	%esi, %edx
840	addl	%ecx, %edx
841	popl	%esi
842	ret
843
844This looks like a scheduling deficiency and lack of remat of the load from
845the argument area.  ICC apparently produces:
846
847        movl      8(%esp), %ecx
848        imull     12(%esp), %ecx
849        movl      16(%esp), %eax
850        imull     4(%esp), %eax
851        addl      %eax, %ecx
852        movl      4(%esp), %eax
853        mull      12(%esp)
854        addl      %ecx, %edx
855        ret
856
857Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
858http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
859
860//===---------------------------------------------------------------------===//
861
862We can fold a store into "zeroing a reg".  Instead of:
863
864xorl    %eax, %eax
865movl    %eax, 124(%esp)
866
867we should get:
868
869movl    $0, 124(%esp)
870
871if the flags of the xor are dead.
872
873Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
874be folded into: shl [mem], 1
875
876//===---------------------------------------------------------------------===//
877
878In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
879or and instruction, for example:
880
881	xorpd	LCPI1_0, %xmm2
882
883However, if xmm2 gets spilled, we end up with really ugly code like this:
884
885	movsd	(%esp), %xmm0
886	xorpd	LCPI1_0, %xmm0
887	movsd	%xmm0, (%esp)
888
889Since we 'know' that this is a 'neg', we can actually "fold" the spill into
890the neg/abs instruction, turning it into an *integer* operation, like this:
891
892	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
893
894you could also use xorb, but xorl is less likely to lead to a partial register
895stall.  Here is a contrived testcase:
896
897double a, b, c;
898void test(double *P) {
899  double X = *P;
900  a = X;
901  bar();
902  X = -X;
903  b = X;
904  bar();
905  c = X;
906}
907
908//===---------------------------------------------------------------------===//
909
910The generated code on x86 for checking for signed overflow on a multiply the
911obvious way is much longer than it needs to be.
912
913int x(int a, int b) {
914  long long prod = (long long)a*b;
915  return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
916}
917
918See PR2053 for more details.
919
920//===---------------------------------------------------------------------===//
921
922We should investigate using cdq/ctld (effect: edx = sar eax, 31)
923more aggressively; it should cost the same as a move+shift on any modern
924processor, but it's a lot shorter. Downside is that it puts more
925pressure on register allocation because it has fixed operands.
926
927Example:
928int abs(int x) {return x < 0 ? -x : x;}
929
930gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
931abs:
932        movl    4(%esp), %eax
933        cltd
934        xorl    %edx, %eax
935        subl    %edx, %eax
936        ret
937
938//===---------------------------------------------------------------------===//
939
940Take the following code (from
941http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
942
943extern unsigned char first_one[65536];
944int FirstOnet(unsigned long long arg1)
945{
946  if (arg1 >> 48)
947    return (first_one[arg1 >> 48]);
948  return 0;
949}
950
951
952The following code is currently generated:
953FirstOnet:
954        movl    8(%esp), %eax
955        cmpl    $65536, %eax
956        movl    4(%esp), %ecx
957        jb      .LBB1_2 # UnifiedReturnBlock
958.LBB1_1:        # ifthen
959        shrl    $16, %eax
960        movzbl  first_one(%eax), %eax
961        ret
962.LBB1_2:        # UnifiedReturnBlock
963        xorl    %eax, %eax
964        ret
965
966We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
967lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
968
969//===---------------------------------------------------------------------===//
970
971We compile this function:
972
973define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
974entry:
975	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
976	br i1 %tmp2, label %bb7, label %bb
977
978bb:		; preds = %entry
979	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
980	ret i32 %tmp6
981
982bb7:		; preds = %entry
983	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
984	ret i32 %tmp10
985}
986
987to:
988
989foo:                                    # @foo
990# BB#0:                                 # %entry
991	movl	4(%esp), %ecx
992	cmpb	$0, 16(%esp)
993	je	.LBB0_2
994# BB#1:                                 # %bb
995	movl	8(%esp), %eax
996	addl	%ecx, %eax
997	ret
998.LBB0_2:                                # %bb7
999	movl	12(%esp), %edx
1000	movl	%ecx, %eax
1001	subl	%edx, %eax
1002	ret
1003
1004There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1005couple more movls by putting 4(%esp) into %eax instead of %ecx.
1006
1007//===---------------------------------------------------------------------===//
1008
1009See rdar://4653682.
1010
1011From flops:
1012
1013LBB1_15:        # bb310
1014        cvtss2sd        LCPI1_0, %xmm1
1015        addsd   %xmm1, %xmm0
1016        movsd   176(%esp), %xmm2
1017        mulsd   %xmm0, %xmm2
1018        movapd  %xmm2, %xmm3
1019        mulsd   %xmm3, %xmm3
1020        movapd  %xmm3, %xmm4
1021        mulsd   LCPI1_23, %xmm4
1022        addsd   LCPI1_24, %xmm4
1023        mulsd   %xmm3, %xmm4
1024        addsd   LCPI1_25, %xmm4
1025        mulsd   %xmm3, %xmm4
1026        addsd   LCPI1_26, %xmm4
1027        mulsd   %xmm3, %xmm4
1028        addsd   LCPI1_27, %xmm4
1029        mulsd   %xmm3, %xmm4
1030        addsd   LCPI1_28, %xmm4
1031        mulsd   %xmm3, %xmm4
1032        addsd   %xmm1, %xmm4
1033        mulsd   %xmm2, %xmm4
1034        movsd   152(%esp), %xmm1
1035        addsd   %xmm4, %xmm1
1036        movsd   %xmm1, 152(%esp)
1037        incl    %eax
1038        cmpl    %eax, %esi
1039        jge     LBB1_15 # bb310
1040LBB1_16:        # bb358.loopexit
1041        movsd   152(%esp), %xmm0
1042        addsd   %xmm0, %xmm0
1043        addsd   LCPI1_22, %xmm0
1044        movsd   %xmm0, 152(%esp)
1045
1046Rather than spilling the result of the last addsd in the loop, we should have
1047insert a copy to split the interval (one for the duration of the loop, one
1048extending to the fall through). The register pressure in the loop isn't high
1049enough to warrant the spill.
1050
1051Also check why xmm7 is not used at all in the function.
1052
1053//===---------------------------------------------------------------------===//
1054
1055Take the following:
1056
1057target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
1058target triple = "i386-apple-darwin8"
1059@in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
1060define fastcc void @abort_gzip() noreturn nounwind  {
1061entry:
1062	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
1063	br i1 %tmp.b.i, label %bb.i, label %bb4.i
1064bb.i:		; preds = %entry
1065	tail call void @exit( i32 1 ) noreturn nounwind
1066	unreachable
1067bb4.i:		; preds = %entry
1068	store i1 true, i1* @in_exit.4870.b
1069	tail call void @exit( i32 1 ) noreturn nounwind
1070	unreachable
1071}
1072declare void @exit(i32) noreturn nounwind
1073
1074This compiles into:
1075_abort_gzip:                            ## @abort_gzip
1076## BB#0:                                ## %entry
1077	subl	$12, %esp
1078	movb	_in_exit.4870.b, %al
1079	cmpb	$1, %al
1080	jne	LBB0_2
1081
1082We somehow miss folding the movb into the cmpb.
1083
1084//===---------------------------------------------------------------------===//
1085
1086We compile:
1087
1088int test(int x, int y) {
1089  return x-y-1;
1090}
1091
1092into (-m64):
1093
1094_test:
1095	decl	%edi
1096	movl	%edi, %eax
1097	subl	%esi, %eax
1098	ret
1099
1100it would be better to codegen as: x+~y  (notl+addl)
1101
1102//===---------------------------------------------------------------------===//
1103
1104This code:
1105
1106int foo(const char *str,...)
1107{
1108 __builtin_va_list a; int x;
1109 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1110 return x;
1111}
1112
1113gets compiled into this on x86-64:
1114	subq    $200, %rsp
1115        movaps  %xmm7, 160(%rsp)
1116        movaps  %xmm6, 144(%rsp)
1117        movaps  %xmm5, 128(%rsp)
1118        movaps  %xmm4, 112(%rsp)
1119        movaps  %xmm3, 96(%rsp)
1120        movaps  %xmm2, 80(%rsp)
1121        movaps  %xmm1, 64(%rsp)
1122        movaps  %xmm0, 48(%rsp)
1123        movq    %r9, 40(%rsp)
1124        movq    %r8, 32(%rsp)
1125        movq    %rcx, 24(%rsp)
1126        movq    %rdx, 16(%rsp)
1127        movq    %rsi, 8(%rsp)
1128        leaq    (%rsp), %rax
1129        movq    %rax, 192(%rsp)
1130        leaq    208(%rsp), %rax
1131        movq    %rax, 184(%rsp)
1132        movl    $48, 180(%rsp)
1133        movl    $8, 176(%rsp)
1134        movl    176(%rsp), %eax
1135        cmpl    $47, %eax
1136        jbe     .LBB1_3 # bb
1137.LBB1_1:        # bb3
1138        movq    184(%rsp), %rcx
1139        leaq    8(%rcx), %rax
1140        movq    %rax, 184(%rsp)
1141.LBB1_2:        # bb4
1142        movl    (%rcx), %eax
1143        addq    $200, %rsp
1144        ret
1145.LBB1_3:        # bb
1146        movl    %eax, %ecx
1147        addl    $8, %eax
1148        addq    192(%rsp), %rcx
1149        movl    %eax, 176(%rsp)
1150        jmp     .LBB1_2 # bb4
1151
1152gcc 4.3 generates:
1153	subq    $96, %rsp
1154.LCFI0:
1155        leaq    104(%rsp), %rax
1156        movq    %rsi, -80(%rsp)
1157        movl    $8, -120(%rsp)
1158        movq    %rax, -112(%rsp)
1159        leaq    -88(%rsp), %rax
1160        movq    %rax, -104(%rsp)
1161        movl    $8, %eax
1162        cmpl    $48, %eax
1163        jb      .L6
1164        movq    -112(%rsp), %rdx
1165        movl    (%rdx), %eax
1166        addq    $96, %rsp
1167        ret
1168        .p2align 4,,10
1169        .p2align 3
1170.L6:
1171        mov     %eax, %edx
1172        addq    -104(%rsp), %rdx
1173        addl    $8, %eax
1174        movl    %eax, -120(%rsp)
1175        movl    (%rdx), %eax
1176        addq    $96, %rsp
1177        ret
1178
1179and it gets compiled into this on x86:
1180	pushl   %ebp
1181        movl    %esp, %ebp
1182        subl    $4, %esp
1183        leal    12(%ebp), %eax
1184        movl    %eax, -4(%ebp)
1185        leal    16(%ebp), %eax
1186        movl    %eax, -4(%ebp)
1187        movl    12(%ebp), %eax
1188        addl    $4, %esp
1189        popl    %ebp
1190        ret
1191
1192gcc 4.3 generates:
1193	pushl   %ebp
1194        movl    %esp, %ebp
1195        movl    12(%ebp), %eax
1196        popl    %ebp
1197        ret
1198
1199//===---------------------------------------------------------------------===//
1200
1201Teach tblgen not to check bitconvert source type in some cases. This allows us
1202to consolidate the following patterns in X86InstrMMX.td:
1203
1204def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1205                                                  (iPTR 0))))),
1206          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1207def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1208                                                  (iPTR 0))))),
1209          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1210def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1211                                                  (iPTR 0))))),
1212          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1213
1214There are other cases in various td files.
1215
1216//===---------------------------------------------------------------------===//
1217
1218Take something like the following on x86-32:
1219unsigned a(unsigned long long x, unsigned y) {return x % y;}
1220
1221We currently generate a libcall, but we really shouldn't: the expansion is
1222shorter and likely faster than the libcall.  The expected code is something
1223like the following:
1224
1225	movl	12(%ebp), %eax
1226	movl	16(%ebp), %ecx
1227	xorl	%edx, %edx
1228	divl	%ecx
1229	movl	8(%ebp), %eax
1230	divl	%ecx
1231	movl	%edx, %eax
1232	ret
1233
1234A similar code sequence works for division.
1235
1236//===---------------------------------------------------------------------===//
1237
1238We currently compile this:
1239
1240define i32 @func1(i32 %v1, i32 %v2) nounwind {
1241entry:
1242  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1243  %sum = extractvalue {i32, i1} %t, 0
1244  %obit = extractvalue {i32, i1} %t, 1
1245  br i1 %obit, label %overflow, label %normal
1246normal:
1247  ret i32 %sum
1248overflow:
1249  call void @llvm.trap()
1250  unreachable
1251}
1252declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1253declare void @llvm.trap()
1254
1255to:
1256
1257_func1:
1258	movl	4(%esp), %eax
1259	addl	8(%esp), %eax
1260	jo	LBB1_2	## overflow
1261LBB1_1:	## normal
1262	ret
1263LBB1_2:	## overflow
1264	ud2
1265
1266it would be nice to produce "into" someday.
1267
1268//===---------------------------------------------------------------------===//
1269
1270Test instructions can be eliminated by using EFLAGS values from arithmetic
1271instructions. This is currently not done for mul, and, or, xor, neg, shl,
1272sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1273for read-modify-write instructions. It is also current not done if the
1274OF or CF flags are needed.
1275
1276The shift operators have the complication that when the shift count is
1277zero, EFLAGS is not set, so they can only subsume a test instruction if
1278the shift count is known to be non-zero. Also, using the EFLAGS value
1279from a shift is apparently very slow on some x86 implementations.
1280
1281In read-modify-write instructions, the root node in the isel match is
1282the store, and isel has no way for the use of the EFLAGS result of the
1283arithmetic to be remapped to the new node.
1284
1285Add and subtract instructions set OF on signed overflow and CF on unsiged
1286overflow, while test instructions always clear OF and CF. In order to
1287replace a test with an add or subtract in a situation where OF or CF is
1288needed, codegen must be able to prove that the operation cannot see
1289signed or unsigned overflow, respectively.
1290
1291//===---------------------------------------------------------------------===//
1292
1293memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1294define <16 x float> @foo(<16 x float> %A) nounwind {
1295	%tmp = alloca <16 x float>, align 16
1296	%tmp2 = alloca <16 x float>, align 16
1297	store <16 x float> %A, <16 x float>* %tmp
1298	%s = bitcast <16 x float>* %tmp to i8*
1299	%s2 = bitcast <16 x float>* %tmp2 to i8*
1300	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1301	%R = load <16 x float>* %tmp2
1302	ret <16 x float> %R
1303}
1304
1305declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1306
1307which compiles to:
1308
1309_foo:
1310	subl	$140, %esp
1311	movaps	%xmm3, 112(%esp)
1312	movaps	%xmm2, 96(%esp)
1313	movaps	%xmm1, 80(%esp)
1314	movaps	%xmm0, 64(%esp)
1315	movl	60(%esp), %eax
1316	movl	%eax, 124(%esp)
1317	movl	56(%esp), %eax
1318	movl	%eax, 120(%esp)
1319	movl	52(%esp), %eax
1320        <many many more 32-bit copies>
1321      	movaps	(%esp), %xmm0
1322	movaps	16(%esp), %xmm1
1323	movaps	32(%esp), %xmm2
1324	movaps	48(%esp), %xmm3
1325	addl	$140, %esp
1326	ret
1327
1328On Nehalem, it may even be cheaper to just use movups when unaligned than to
1329fall back to lower-granularity chunks.
1330
1331//===---------------------------------------------------------------------===//
1332
1333Implement processor-specific optimizations for parity with GCC on these
1334processors.  GCC does two optimizations:
1335
13361. ix86_pad_returns inserts a noop before ret instructions if immediately
1337   preceded by a conditional branch or is the target of a jump.
13382. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1339   code contains more than 3 branches.
1340
1341The first one is done for all AMDs, Core2, and "Generic"
1342The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1343  Core 2, and "Generic"
1344
1345//===---------------------------------------------------------------------===//
1346Testcase:
1347int x(int a) { return (a&0xf0)>>4; }
1348
1349Current output:
1350	movl	4(%esp), %eax
1351	shrl	$4, %eax
1352	andl	$15, %eax
1353	ret
1354
1355Ideal output:
1356	movzbl	4(%esp), %eax
1357	shrl	$4, %eax
1358	ret
1359
1360//===---------------------------------------------------------------------===//
1361
1362Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1363properly.
1364
1365When the return value is not used (i.e. only care about the value in the
1366memory), x86 does not have to use add to implement these. Instead, it can use
1367add, sub, inc, dec instructions with the "lock" prefix.
1368
1369This is currently implemented using a bit of instruction selection trick. The
1370issue is the target independent pattern produces one output and a chain and we
1371want to map it into one that just output a chain. The current trick is to select
1372it into a MERGE_VALUES with the first definition being an implicit_def. The
1373proper solution is to add new ISD opcodes for the no-output variant. DAG
1374combiner can then transform the node before it gets to target node selection.
1375
1376Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1377fact these instructions are identical to the non-lock versions. We need a way to
1378add target specific information to target nodes and have this information
1379carried over to machine instructions. Asm printer (or JIT) can use this
1380information to add the "lock" prefix.
1381
1382//===---------------------------------------------------------------------===//
1383
1384struct B {
1385  unsigned char y0 : 1;
1386};
1387
1388int bar(struct B* a) { return a->y0; }
1389
1390define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1391  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1392  %2 = load i8* %1, align 1
1393  %3 = and i8 %2, 1
1394  %4 = zext i8 %3 to i32
1395  ret i32 %4
1396}
1397
1398bar:                                    # @bar
1399# BB#0:
1400        movb    (%rdi), %al
1401        andb    $1, %al
1402        movzbl  %al, %eax
1403        ret
1404
1405Missed optimization: should be movl+andl.
1406
1407//===---------------------------------------------------------------------===//
1408
1409The x86_64 abi says:
1410
1411Booleans, when stored in a memory object, are stored as single byte objects the
1412value of which is always 0 (false) or 1 (true).
1413
1414We are not using this fact:
1415
1416int bar(_Bool *a) { return *a; }
1417
1418define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1419  %1 = load i8* %a, align 1, !tbaa !0
1420  %tmp = and i8 %1, 1
1421  %2 = zext i8 %tmp to i32
1422  ret i32 %2
1423}
1424
1425bar:
1426        movb    (%rdi), %al
1427        andb    $1, %al
1428        movzbl  %al, %eax
1429        ret
1430
1431GCC produces
1432
1433bar:
1434        movzbl  (%rdi), %eax
1435        ret
1436
1437//===---------------------------------------------------------------------===//
1438
1439Consider the following two functions compiled with clang:
1440_Bool foo(int *x) { return !(*x & 4); }
1441unsigned bar(int *x) { return !(*x & 4); }
1442
1443foo:
1444	movl	4(%esp), %eax
1445	testb	$4, (%eax)
1446	sete	%al
1447	movzbl	%al, %eax
1448	ret
1449
1450bar:
1451	movl	4(%esp), %eax
1452	movl	(%eax), %eax
1453	shrl	$2, %eax
1454	andl	$1, %eax
1455	xorl	$1, %eax
1456	ret
1457
1458The second function generates more code even though the two functions are
1459are functionally identical.
1460
1461//===---------------------------------------------------------------------===//
1462
1463Take the following C code:
1464int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1465
1466We generate the following IR with clang:
1467define i32 @f(i32 %a, i32 %b) nounwind readnone {
1468entry:
1469  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1470  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1471  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1472  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1473  ret i32 %conv5
1474}
1475
1476And the following x86 code:
1477	xorl	%esi, %edi
1478	testb	$-1, %dil
1479	sete	%al
1480	movzbl	%al, %eax
1481	ret
1482
1483A cmpb instead of the xorl+testb would be one instruction shorter.
1484
1485//===---------------------------------------------------------------------===//
1486
1487Given the following C code:
1488int f(int a, int b) { return (signed char)a == (signed char)b; }
1489
1490We generate the following IR with clang:
1491define i32 @f(i32 %a, i32 %b) nounwind readnone {
1492entry:
1493  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1494  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1495  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1496  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1497  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1498  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1499  ret i32 %conv5
1500}
1501
1502And the following x86 code:
1503	movsbl	%sil, %eax
1504	movsbl	%dil, %ecx
1505	cmpl	%eax, %ecx
1506	sete	%al
1507	movzbl	%al, %eax
1508	ret
1509
1510
1511It should be possible to eliminate the sign extensions.
1512
1513//===---------------------------------------------------------------------===//
1514
1515LLVM misses a load+store narrowing opportunity in this code:
1516
1517%struct.bf = type { i64, i16, i16, i32 }
1518
1519@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1520
1521define void @t1() nounwind ssp {
1522entry:
1523  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1524  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1525  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1526  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1527  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1528  store i32 %4, i32* %2, align 1
1529  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1530  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1531  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1532  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1533  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1534  store i32 %9, i32* %7, align 1
1535  ret void
1536}
1537
1538LLVM currently emits this:
1539
1540  movq  bfi(%rip), %rax
1541  andl  $-65537, 8(%rax)
1542  movq  bfi(%rip), %rax
1543  andl  $-131073, 8(%rax)
1544  ret
1545
1546It could narrow the loads and stores to emit this:
1547
1548  movq  bfi(%rip), %rax
1549  andb  $-2, 10(%rax)
1550  movq  bfi(%rip), %rax
1551  andb  $-3, 10(%rax)
1552  ret
1553
1554The trouble is that there is a TokenFactor between the store and the
1555load, making it non-trivial to determine if there's anything between
1556the load and the store which would prohibit narrowing.
1557
1558//===---------------------------------------------------------------------===//
1559
1560This code:
1561void foo(unsigned x) {
1562  if (x == 0) bar();
1563  else if (x == 1) qux();
1564}
1565
1566currently compiles into:
1567_foo:
1568	movl	4(%esp), %eax
1569	cmpl	$1, %eax
1570	je	LBB0_3
1571	testl	%eax, %eax
1572	jne	LBB0_4
1573
1574the testl could be removed:
1575_foo:
1576	movl	4(%esp), %eax
1577	cmpl	$1, %eax
1578	je	LBB0_3
1579	jb	LBB0_4
1580
15810 is the only unsigned number < 1.
1582
1583//===---------------------------------------------------------------------===//
1584
1585This code:
1586
1587%0 = type { i32, i1 }
1588
1589define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1590entry:
1591  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1592  %cmp = extractvalue %0 %uadd, 1
1593  %inc = zext i1 %cmp to i32
1594  %add = add i32 %x, %sum
1595  %z.0 = add i32 %add, %inc
1596  ret i32 %z.0
1597}
1598
1599declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1600
1601compiles to:
1602
1603_add32carry:                            ## @add32carry
1604	addl	%esi, %edi
1605	sbbl	%ecx, %ecx
1606	movl	%edi, %eax
1607	subl	%ecx, %eax
1608	ret
1609
1610But it could be:
1611
1612_add32carry:
1613	leal	(%rsi,%rdi), %eax
1614	cmpl	%esi, %eax
1615	adcl	$0, %eax
1616	ret
1617
1618//===---------------------------------------------------------------------===//
1619
1620The hot loop of 256.bzip2 contains code that looks a bit like this:
1621
1622int foo(char *P, char *Q, int x, int y) {
1623  if (P[0] != Q[0])
1624     return P[0] < Q[0];
1625  if (P[1] != Q[1])
1626     return P[1] < Q[1];
1627  if (P[2] != Q[2])
1628     return P[2] < Q[2];
1629   return P[3] < Q[3];
1630}
1631
1632In the real code, we get a lot more wrong than this.  However, even in this
1633code we generate:
1634
1635_foo:                                   ## @foo
1636## BB#0:                                ## %entry
1637	movb	(%rsi), %al
1638	movb	(%rdi), %cl
1639	cmpb	%al, %cl
1640	je	LBB0_2
1641LBB0_1:                                 ## %if.then
1642	cmpb	%al, %cl
1643	jmp	LBB0_5
1644LBB0_2:                                 ## %if.end
1645	movb	1(%rsi), %al
1646	movb	1(%rdi), %cl
1647	cmpb	%al, %cl
1648	jne	LBB0_1
1649## BB#3:                                ## %if.end38
1650	movb	2(%rsi), %al
1651	movb	2(%rdi), %cl
1652	cmpb	%al, %cl
1653	jne	LBB0_1
1654## BB#4:                                ## %if.end60
1655	movb	3(%rdi), %al
1656	cmpb	3(%rsi), %al
1657LBB0_5:                                 ## %if.end60
1658	setl	%al
1659	movzbl	%al, %eax
1660	ret
1661
1662Note that we generate jumps to LBB0_1 which does a redundant compare.  The
1663redundant compare also forces the register values to be live, which prevents
1664folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
1665
1666_foo:
1667	movzbl	(%rsi), %eax
1668	cmpb	%al, (%rdi)
1669	jne	L10
1670L12:
1671	movzbl	1(%rsi), %eax
1672	cmpb	%al, 1(%rdi)
1673	jne	L10
1674	movzbl	2(%rsi), %eax
1675	cmpb	%al, 2(%rdi)
1676	jne	L10
1677	movzbl	3(%rdi), %eax
1678	cmpb	3(%rsi), %al
1679L10:
1680	setl	%al
1681	movzbl	%al, %eax
1682	ret
1683
1684which is "perfect".
1685
1686//===---------------------------------------------------------------------===//
1687
1688For the branch in the following code:
1689int a();
1690int b(int x, int y) {
1691  if (x & (1<<(y&7)))
1692    return a();
1693  return y;
1694}
1695
1696We currently generate:
1697	movb	%sil, %al
1698	andb	$7, %al
1699	movzbl	%al, %eax
1700	btl	%eax, %edi
1701	jae	.LBB0_2
1702
1703movl+andl would be shorter than the movb+andb+movzbl sequence.
1704
1705//===---------------------------------------------------------------------===//
1706
1707For the following:
1708struct u1 {
1709    float x, y;
1710};
1711float foo(struct u1 u) {
1712    return u.x + u.y;
1713}
1714
1715We currently generate:
1716	movdqa	%xmm0, %xmm1
1717	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
1718	addss	%xmm1, %xmm0
1719	ret
1720
1721We could save an instruction here by commuting the addss.
1722
1723//===---------------------------------------------------------------------===//
1724
1725This (from PR9661):
1726
1727float clamp_float(float a) {
1728        if (a > 1.0f)
1729                return 1.0f;
1730        else if (a < 0.0f)
1731                return 0.0f;
1732        else
1733                return a;
1734}
1735
1736Could compile to:
1737
1738clamp_float:                            # @clamp_float
1739        movss   .LCPI0_0(%rip), %xmm1
1740        minss   %xmm1, %xmm0
1741        pxor    %xmm1, %xmm1
1742        maxss   %xmm1, %xmm0
1743        ret
1744
1745with -ffast-math.
1746
1747//===---------------------------------------------------------------------===//
1748
1749This function (from PR9803):
1750
1751int clamp2(int a) {
1752        if (a > 5)
1753                a = 5;
1754        if (a < 0)
1755                return 0;
1756        return a;
1757}
1758
1759Compiles to:
1760
1761_clamp2:                                ## @clamp2
1762        pushq   %rbp
1763        movq    %rsp, %rbp
1764        cmpl    $5, %edi
1765        movl    $5, %ecx
1766        cmovlel %edi, %ecx
1767        testl   %ecx, %ecx
1768        movl    $0, %eax
1769        cmovnsl %ecx, %eax
1770        popq    %rbp
1771        ret
1772
1773The move of 0 could be scheduled above the test to make it is xor reg,reg.
1774
1775//===---------------------------------------------------------------------===//
1776
1777GCC PR48986.  We currently compile this:
1778
1779void bar(void);
1780void yyy(int* p) {
1781    if (__sync_fetch_and_add(p, -1) == 1)
1782      bar();
1783}
1784
1785into:
1786	movl	$-1, %eax
1787	lock
1788	xaddl	%eax, (%rdi)
1789	cmpl	$1, %eax
1790	je	LBB0_2
1791
1792Instead we could generate:
1793
1794	lock
1795	dec %rdi
1796	je LBB0_2
1797
1798The trick is to match "fetch_and_add(X, -C) == C".
1799
1800//===---------------------------------------------------------------------===//
1801
1802unsigned t(unsigned a, unsigned b) {
1803  return a <= b ? 5 : -5;
1804}
1805
1806We generate:
1807	movl	$5, %ecx
1808	cmpl	%esi, %edi
1809	movl	$-5, %eax
1810	cmovbel	%ecx, %eax
1811
1812GCC:
1813	cmpl	%edi, %esi
1814	sbbl	%eax, %eax
1815	andl	$-10, %eax
1816	addl	$5, %eax
1817
1818//===---------------------------------------------------------------------===//
1819