//===---------------------------------------------------------------------===//

Common register allocation / spilling problem:

        mul lr, r4, lr
        str lr, [sp, #+52]
        ldr lr, [r1, #+32]
        sxth r3, r3
        ldr r4, [sp, #+52]
        mla r4, r3, lr, r4

can be:

        mul lr, r4, lr
        mov r4, lr
        str lr, [sp, #+52]
        ldr lr, [r1, #+32]
        sxth r3, r3
        mla r4, r3, lr, r4

and then "merge" mul and mov:

        mul r4, r4, lr
        str r4, [sp, #+52]
        ldr lr, [r1, #+32]
        sxth r3, r3
        mla r4, r3, lr, r4

It also increases the likelihood that the store will become dead.

//===---------------------------------------------------------------------===//

bb27 ...
        ...
        %reg1037 = ADDri %reg1039, 1
        %reg1038 = ADDrs %reg1032, %reg1039, %noreg, 10
    Successors according to CFG: 0x8b03bf0 (#5)

bb76 (0x8b03bf0, LLVM BB @0x8b032d0, ID#5):
    Predecessors according to CFG: 0x8b0c5f0 (#3) 0x8b0a7c0 (#4)
        %reg1039 = PHI %reg1070, mbb<bb76.outer,0x8b0c5f0>, %reg1037, mbb<bb27,0x8b0a7c0>

Note that ADDri is not a two-address instruction. However, its result %reg1037
is an operand of the PHI node in bb76, and its operand %reg1039 is the result
of that PHI node. We should treat it as if it were two-address and make sure
the ADDri is scheduled after any node that reads %reg1039.
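
Concretely, the ADDrs above also reads %reg1039, so it should come before the
ADDri in the final schedule; that gives the allocator a chance to put %reg1037
and %reg1039 in the same register without an extra copy:

        %reg1038 = ADDrs %reg1032, %reg1039, %noreg, 10
        %reg1037 = ADDri %reg1039, 1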

//===---------------------------------------------------------------------===//

Use local information (i.e. the register scavenger) to assign a reloaded value
a free register, allowing later reloads of the same stack slot to reuse it:
        ldr r3, [sp, #+4]
        add r3, r3, #3
        ldr r2, [sp, #+8]
        add r2, r2, #2
        ldr r1, [sp, #+4]  <==
        add r1, r1, #1
        ldr r0, [sp, #+4]
        add r0, r0, #2
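
A sketch of the improved sequence, assuming the scavenger finds r12 free
across this range:

        ldr r12, [sp, #+4]   @ reload [sp, #+4] once into a free register
        add r3, r12, #3
        ldr r2, [sp, #+8]
        add r2, r2, #2
        add r1, r12, #1      @ reuse r12, no reload
        add r0, r12, #2      @ reuse r12, no reload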

//===---------------------------------------------------------------===//

LLVM aggressively hoists common subexpressions out of loops. Sometimes this
has negative side effects:

R1 = X + 4
R2 = X + 7
R3 = X + 15

loop:
load [i + R1]
...
load [i + R2]
...
load [i + R3]

Suppose register pressure is high; R1, R2, and R3 may then be spilled. We need
to implement proper re-materialization to handle this:

R1 = X + 4
R2 = X + 7
R3 = X + 15

loop:
R1 = X + 4 @ re-materialized
load [i + R1]
...
R2 = X + 7 @ re-materialized
load [i + R2]
...
R3 = X + 15 @ re-materialized
load [i + R3]

Furthermore, with re-association, we can enable sharing:

R1 = X + 4
R2 = X + 7
R3 = X + 15

loop:
T = i + X
load [T + 4]
...
load [T + 7]
...
load [T + 15]

//===---------------------------------------------------------------------===//

It's not always a good idea to choose rematerialization over spilling. If all
of the spill loads and stores can be folded into the using instructions, then
spilling is cheaper because it won't require new live intervals / registers.
See 2003-05-31-LongShifts for an example.
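
For illustration (x86, with a hypothetical spill slot at 16(%esp)): a reload
that folds into its use needs no new register, whereas a rematerialized value
occupies a register and creates a new live interval:

  addl 16(%esp), %eax    # folded reload: no extra register or live interval
  # versus rematerialization:
  leal 15(%ebx), %ecx    # recompute the value: new register, new live interval
  addl %ecx, %eax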

//===---------------------------------------------------------------------===//

With a copying garbage collector, derived pointers must not be retained across
collector safe points; the collector could move the objects and invalidate the
derived pointer. This is bad enough in the first place, but safe points can
crop up unpredictably. Consider:

        %array = load { i32, [0 x %obj] }** %array_addr
        %nth_el = getelementptr { i32, [0 x %obj] }* %array, i32 0, i32 %n
        %old = load %obj** %nth_el
        %z = div i64 %x, %y
        store %obj* %new, %obj** %nth_el

If the i64 division is lowered to a libcall, then a safe point will (must)
appear for the call site. If a collection occurs, %array and %nth_el no longer
point into the correct object.

The fix for this is to copy address calculations so that dependent pointers
are never live across safe point boundaries. But the loads cannot be copied
like this if there is an intervening store, so this may be hard to get right.
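
A sketch of the rewritten sequence, recomputing the derived pointer after the
possible safe point (%array2 and %nth_el2 are renamed copies; note there is no
intervening store to %array_addr here):

        %array = load { i32, [0 x %obj] }** %array_addr
        %nth_el = getelementptr { i32, [0 x %obj] }* %array, i32 0, i32 %n
        %old = load %obj** %nth_el
        %z = div i64 %x, %y                       ; possible safe point
        %array2 = load { i32, [0 x %obj] }** %array_addr
        %nth_el2 = getelementptr { i32, [0 x %obj] }* %array2, i32 0, i32 %n
        store %obj* %new, %obj** %nth_el2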

Only a concurrent mutator can trigger a collection at the libcall safe point.
So single-threaded programs do not have this requirement, even with a copying
collector. Still, LLVM optimizations would probably undo a front-end's careful
work.

//===---------------------------------------------------------------------===//

The ocaml frametable structure supports liveness information. It would be good
to support it.

//===---------------------------------------------------------------------===//

The FIXME in ComputeCommonTailLength in BranchFolding.cpp needs to be
revisited. The check is there to work around a misuse of directives in inline
assembly.

//===---------------------------------------------------------------------===//

It would be good to detect collector/target compatibility instead of silently
doing the wrong thing.

//===---------------------------------------------------------------------===//

It would be really nice to be able to write patterns in .td files for copies,
which would eliminate a bunch of explicit predicates on them (e.g. no side
effects).  Once this is in place, it would be even better to have tblgen
synthesize the various copy insertion/inspection methods in TargetInstrInfo.

//===---------------------------------------------------------------------===//

Stack coloring improvements:

1. Do proper LiveStacks analysis on all stack objects including those which are
   not spill slots.
2. Reorder objects to fill in gaps between objects.
   e.g. 4, 1, <gap>, 4, 1, 1, 1, <gap>, 4 => 4, 1, 1, 1, 1, 4, 4

//===---------------------------------------------------------------------===//

The scheduler should be able to sort nearby instructions by their address. For
example, in an expanded memset sequence it's not uncommon to see code like
this:

  movl $0, 4(%rdi)
  movl $0, 8(%rdi)
  movl $0, 12(%rdi)
  movl $0, 0(%rdi)

Each of the stores is independent, and the scheduler is currently making an
arbitrary decision about the order.
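
Sorting them by increasing address would give:

  movl $0, 0(%rdi)
  movl $0, 4(%rdi)
  movl $0, 8(%rdi)
  movl $0, 12(%rdi)

Ascending, contiguous stores are generally friendlier to store buffers and
prefetchers, and they expose store-merging opportunities.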

//===---------------------------------------------------------------------===//

Another opportunity in this code is that the $0 could be moved to a register:

  movl $0, 4(%rdi)
  movl $0, 8(%rdi)
  movl $0, 12(%rdi)
  movl $0, 0(%rdi)

This would save substantial code size, especially for longer sequences like
this. It would be easy to have a rule telling isel to avoid matching MOV32mi
if the immediate has more than some fixed number of uses. It's more involved
to teach the register allocator how to do late folding to recover from
excessive register pressure.
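
A sketch of the transformed code, assuming a register (%eax here) can be
spared to hold the zero:

  xorl %eax, %eax        # materialize the zero once
  movl %eax, 4(%rdi)
  movl %eax, 8(%rdi)
  movl %eax, 12(%rdi)
  movl %eax, 0(%rdi)

Each register store encodes four bytes shorter than the corresponding imm32
store, at the one-time cost of the two-byte xorl.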