#include <algorithm>
#include <cstring>
#include <utility>

#include "Common/BitSet.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Log.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/IR/IRPassSimplify.h"
#include "Core/MIPS/IR/IRRegCache.h"

// #define CONDITIONAL_DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }
#define CONDITIONAL_DISABLE
#define DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }

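// Constant-folding helper: evaluates a binary op on two known values.
// Only the ops listed below are foldable; anything else returns -1, so
// callers only pass ops that appear here.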
u32 Evaluate(u32 a, u32 b, IROp op) {
	switch (op) {
	case IROp::Add: case IROp::AddConst: return a + b;
	case IROp::Sub: case IROp::SubConst: return a - b;
	case IROp::And: case IROp::AndConst: return a & b;
	case IROp::Or: case IROp::OrConst: return a | b;
	case IROp::Xor: case IROp::XorConst: return a ^ b;
	// MIPS shifts use only the low 5 bits of the shift amount; masking also
	// avoids undefined C++ shifts when folding register-shift operands.
	case IROp::Shr: case IROp::ShrImm: return a >> (b & 31);
	case IROp::Sar: case IROp::SarImm: return (s32)a >> (b & 31);
	case IROp::Ror: case IROp::RorImm: b &= 31; return b == 0 ? a : (a >> b) | (a << (32 - b));
	case IROp::Shl: case IROp::ShlImm: return a << (b & 31);
	case IROp::Slt: case IROp::SltConst: return ((s32)a < (s32)b);
	case IROp::SltU: case IROp::SltUConst: return (a < b);
	default:
		return -1;
	}
}

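// Unary equivalent of the above. Clz counts leading zero bits, scanning down
// from bit 31.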
u32 Evaluate(u32 a, IROp op) {
	switch (op) {
	case IROp::Not: return ~a;
	case IROp::Neg: return -(s32)a;
	case IROp::BSwap16: return ((a & 0xFF00FF00) >> 8) | ((a & 0x00FF00FF) << 8);
	case IROp::BSwap32: return swap32(a);
	case IROp::Ext8to32: return SignExtend8ToU32(a);
	case IROp::Ext16to32: return SignExtend16ToU32(a);
	case IROp::ReverseBits: return ReverseBits32(a);
	case IROp::Clz: {
		int x = 31;
		int count = 0;
		while (x >= 0 && !(a & (1u << x))) {
			count++;
			x--;
		}
		return count;
	}
	default:
		return -1;
	}
}

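// Maps a reg-reg arithmetic op to its reg-imm form, used by PropagateConstants
// when one operand turns out to be a known constant. (IROp)-1 means no mapping.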
IROp ArithToArithConst(IROp op) {
	switch (op) {
	case IROp::Add: return IROp::AddConst;
	case IROp::Sub: return IROp::SubConst;
	case IROp::And: return IROp::AndConst;
	case IROp::Or: return IROp::OrConst;
	case IROp::Xor: return IROp::XorConst;
	case IROp::Slt: return IROp::SltConst;
	case IROp::SltU: return IROp::SltUConst;
	default:
		return (IROp)-1;
	}
}

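// Same idea for variable shifts: map to the shift-by-immediate form.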
IROp ShiftToShiftImm(IROp op) {
	switch (op) {
	case IROp::Shl: return IROp::ShlImm;
	case IROp::Shr: return IROp::ShrImm;
	case IROp::Ror: return IROp::RorImm;
	case IROp::Sar: return IROp::SarImm;
	default:
		return (IROp)-1;
	}
}

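// Runs a chain of passes, ping-ponging between two temporary writers so that
// each pass consumes the previous one's output. Only the final pass writes to
// the caller's `out`.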
bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out, const IROptions &opts) {
	if (c == 1) {
		return passes[0](in, out, opts);
	}

	bool logBlocks = false;

	IRWriter temp[2];
	const IRWriter *nextIn = &in;
	IRWriter *nextOut = &temp[1];
	for (size_t i = 0; i < c - 1; ++i) {
		if (passes[i](*nextIn, *nextOut, opts)) {
			logBlocks = true;
		}

		temp[0] = std::move(temp[1]);
		nextIn = &temp[0];
	}

	if (passes[c - 1](*nextIn, out, opts)) {
		logBlocks = true;
	}

	return logBlocks;
}

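// Collapses a GPR round trip: an FMovFromGPR reading the GPR that the previous
// FMovToGPR just wrote becomes a direct float-to-float FMov.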
bool OptimizeFPMoves(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	IRInst prev{ IROp::Nop };

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::FMovFromGPR:
			//FMovToGPR a0, f12
			//FMovFromGPR f14, a0
			// to
			//FMovToGPR a0, f12
			//FMov f14, f12
			if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) {
				inst.op = IROp::FMov;
				inst.src1 = prev.src1;
				out.Write(inst);
			} else {
				out.Write(inst);
			}
			break;

		// This will need to scan forward or keep track of more information to be useful.
		// Just doing one isn't.
		/*
		case IROp::LoadVec4:
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, a0, 0x0
			// to
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, sp, 0x30
			if (prev.op == IROp::AddConst && prev.dest == inst.src1 && prev.dest != prev.src1 && prev.src1 == MIPS_REG_SP) {
				inst.constant += prev.constant;
				inst.src1 = prev.src1;
				logBlocks = 1;
			} else {
				goto doDefault;
			}
			out.Write(inst);
			break;
		*/
		default:
			out.Write(inst);
			break;
		}
		prev = inst;
	}
	return logBlocks;
}

// Might be useful later on x86.
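// Splits d = op(s1, s2), where d is distinct from both sources, into
//   Mov d, s1
//   op  d, d, s2
// which matches two-operand instruction encodings.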
bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;
		case IROp::FMul:
		case IROp::FAdd:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::FMov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Vec4Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

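// Expands the unaligned MIPS accesses (lwl/lwr/swl/swr, here Load32Left/Right
// and Store32Left/Right) into masks, shifts, and aligned 32-bit accesses.
// A matching left/right pair on the same register and address collapses into
// a single unaligned load or store when the backend supports it.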
bool RemoveLoadStoreLeftRight(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; ++i) {
		const IRInst &inst = in.GetInstructions()[i];

		// TODO: Reorder or look ahead to combine?

		auto nextOp = [&]() -> const IRInst & {
			return in.GetInstructions()[i + 1];
		};

		auto combineOpposite = [&](IROp matchOp, int matchOff, IROp replaceOp, int replaceOff) {
			if (!opts.unalignedLoadStore || i + 1 >= n)
				return false;
			const IRInst &next = nextOp();
			if (next.op != matchOp || next.dest != inst.dest || next.src1 != inst.src1)
				return false;
			if (inst.constant + matchOff != next.constant)
				return false;

			// Write out one unaligned op.
			out.Write(replaceOp, inst.dest, inst.src1, out.AddConstant(inst.constant + replaceOff));
			// Skip the next one, replaced.
			i++;
			return true;
		};

		auto addCommonProlog = [&]() {
			// IRTEMP_LR_ADDR = rs + imm
			out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant));
			// IRTEMP_LR_SHIFT = (addr & 3) * 8
			out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
			out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
			// IRTEMP_LR_ADDR = addr & 0xfffffffc (for stores, later)
			out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
			// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR)
			out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(0));
		};
		auto addCommonStore = [&](int off = 0) {
			// RAM(IRTEMP_LR_ADDR) = IRTEMP_LR_VALUE
			out.Write(IROp::Store32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(off));
		};

		switch (inst.op) {
		case IROp::Load32Left:
			if (!combineOpposite(IROp::Load32Right, -3, IROp::Load32, -3)) {
				addCommonProlog();
				// dest &= (0x00ffffff >> shift)
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// IRTEMP_LR_VALUE <<= (24 - shift)
					out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Load32Right:
			if (!combineOpposite(IROp::Load32Left, 3, IROp::Load32, 0)) {
				addCommonProlog();
				// IRTEMP_LR_VALUE >>= shift
				out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// dest &= (0xffffff00 << (24 - shift))
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				// Building display lists sometimes involves a bunch of lwr in a row.
				// We can generate more optimal code by combining.
				bool shiftNeedsReverse = true;
				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					if (shiftNeedsReverse) {
						// IRTEMP_LR_SHIFT = shift again
						out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
						out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
						shiftNeedsReverse = false;
					}
					// IRTEMP_LR_VALUE >>= IRTEMP_LR_SHIFT
					out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Store32Left:
			if (!combineOpposite(IROp::Store32Right, -3, IROp::Store32, -3)) {
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0xffffff00 << shift
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 >> (24 - shift)
				out.Write(IROp::Shr, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		case IROp::Store32Right:
			if (!combineOpposite(IROp::Store32Left, 3, IROp::Store32, 0)) {
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0x00ffffff << (24 - shift)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 << shift
				out.Write(IROp::Shl, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}

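// Forward constant propagation. Tracks which GPRs hold known immediates (via
// IRRegCache), folds ops whose inputs are all known, and rewrites ops with one
// known operand into their -Const/-Imm forms. Ops that aren't handled flush
// pending immediates through the doDefault path and pass through unchanged.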
bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	IRRegCache gpr(&out);

	bool logBlocks = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		bool symmetric = true;
		switch (inst.op) {
		case IROp::SetConst:
			gpr.SetImm(inst.dest, inst.constant);
			break;
		case IROp::SetConstF:
			goto doDefault;

		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
			symmetric = false;  // fallthrough
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			// Regularize, for the add/or check below.
			if (symmetric && inst.src2 == inst.dest && inst.src1 != inst.src2) {
				std::swap(inst.src1, inst.src2);
			}
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src2) && gpr.GetImm(inst.src2) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src2)) {
				const u32 imm2 = gpr.GetImm(inst.src2);
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (imm2 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) {
					// Add / Or with zero is just a Mov.
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(imm2));
				}
			} else if (symmetric && gpr.IsImm(inst.src1)) {
				const u32 imm1 = gpr.GetImm(inst.src1);
				gpr.MapDirtyIn(inst.dest, inst.src2);
				if (imm1 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) {
					// Add / Or with zero is just a Mov.
					if (inst.dest != inst.src2)
						out.Write(IROp::Mov, inst.dest, inst.src2);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(imm1));
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
			// AND with a zero constant is always zero. Happens when optimizing lwl.
			if (inst.op == IROp::AndConst && inst.constant == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.constant, inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (gpr.IsImm(inst.src2)) {
				const u8 sa = gpr.GetImm(inst.src2) & 31;
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (sa == 0) {
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa);
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.src2, inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mov:
			if (inst.dest == inst.src1) {
				// Nop
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, gpr.GetImm(inst.src1));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			gpr.MapInIn(inst.src1, inst.src2);
			goto doDefault;

		case IROp::MovZ:
		case IROp::MovNZ:
			gpr.MapInInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::Min:
		case IROp::Max:
			gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::FMovFromGPR:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::FMovToGPR:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MfHi:
		case IROp::MfLo:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MtHi:
		case IROp::MtLo:
			gpr.MapIn(inst.src1);
			goto doDefault;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::StoreFloat:
		case IROp::StoreVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapDirty(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::LoadFloat:
		case IROp::LoadVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;
		case IROp::Load32Left:
		case IROp::Load32Right:
			if (gpr.IsImm(inst.src1)) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Downcount:
		case IROp::SetPCConst:
			goto doDefault;

		case IROp::SetPC:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetPCConst, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		// FP-only instructions don't need to flush immediates.
		case IROp::FAdd:
		case IROp::FMul:
			// Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1)
			if (inst.src2 == inst.dest && inst.src1 != inst.src2)
				std::swap(inst.src1, inst.src2);
			out.Write(inst);
			break;

		case IROp::FSub:
		case IROp::FDiv:
		case IROp::FNeg:
		case IROp::FAbs:
		case IROp::FMov:
		case IROp::FRound:
		case IROp::FTrunc:
		case IROp::FCeil:
		case IROp::FFloor:
		case IROp::FCvtSW:
		case IROp::FSin:
		case IROp::FCos:
		case IROp::FSqrt:
		case IROp::FRSqrt:
		case IROp::FRecip:
		case IROp::FAsin:
			out.Write(inst);
			break;

		case IROp::SetCtrlVFPU:
			goto doDefault;

		case IROp::FCvtWS:
			// TODO: Actually, this should just use the currently set rounding mode.
			// Move up with FCvtSW when that's implemented.
			gpr.MapIn(IRREG_FCR31);
			out.Write(inst);
			break;

		case IROp::FpCondToReg:
			if (gpr.IsImm(IRREG_FPCOND)) {
				gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND));
			} else {
				gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
				out.Write(inst);
			}
			break;

		case IROp::Vec4Init:
		case IROp::Vec4Mov:
		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
		case IROp::Vec4Dot:
		case IROp::Vec4Scale:
		case IROp::Vec4Shuffle:
		case IROp::Vec4Neg:
		case IROp::Vec4Abs:
		case IROp::Vec4Pack31To8:
		case IROp::Vec4Pack32To8:
		case IROp::Vec2Pack32To16:
		case IROp::Vec4Unpack8To32:
		case IROp::Vec2Unpack16To32:
		case IROp::Vec4DuplicateUpperBitsAndShift1:
		case IROp::Vec2ClampToZero:
		case IROp::Vec4ClampToZero:
			out.Write(inst);
			break;

		case IROp::ZeroFpCond:
		case IROp::FCmp:
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;

		case IROp::RestoreRoundingMode:
		case IROp::ApplyRoundingMode:
		case IROp::UpdateRoundingMode:
			goto doDefault;

		case IROp::VfpuCtrlToReg:
			gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
			goto doDefault;

		case IROp::CallReplacement:
		case IROp::Break:
		case IROp::Syscall:
		case IROp::Interpret:
		case IROp::ExitToConst:
		case IROp::ExitToReg:
		case IROp::ExitToConstIfEq:
		case IROp::ExitToConstIfNeq:
		case IROp::ExitToConstIfFpFalse:
		case IROp::ExitToConstIfFpTrue:
		case IROp::ExitToConstIfGeZ:
		case IROp::ExitToConstIfGtZ:
		case IROp::ExitToConstIfLeZ:
		case IROp::ExitToConstIfLtZ:
		case IROp::Breakpoint:
		case IROp::MemoryCheck:
		default:
		{
			gpr.FlushAll();
		doDefault:
			out.Write(inst);
			break;
		}
		}
	}
	return logBlocks;
}

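// Operand helpers for the passes below. They inspect the IRMeta type string
// ('G' marks a GPR slot) to find GPR reads and writes; Interpret and
// CallReplacement are conservatively assumed to read everything.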
bool IRReadsFromGPR(const IRInst &inst, int reg) {
	const IRMeta *m = GetIRMeta(inst.op);

	if (m->types[1] == 'G' && inst.src1 == reg) {
		return true;
	}
	if (m->types[2] == 'G' && inst.src2 == reg) {
		return true;
	}
	if ((m->flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && m->types[0] == 'G' && inst.src3 == reg) {
		return true;
	}
	if (inst.op == IROp::Interpret || inst.op == IROp::CallReplacement) {
		return true;
	}
	return false;
}

IRInst IRReplaceSrcGPR(const IRInst &inst, int fromReg, int toReg) {
	IRInst newInst = inst;
	const IRMeta *m = GetIRMeta(inst.op);

	if (m->types[1] == 'G' && inst.src1 == fromReg) {
		newInst.src1 = toReg;
	}
	if (m->types[2] == 'G' && inst.src2 == fromReg) {
		newInst.src2 = toReg;
	}
	if ((m->flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && m->types[0] == 'G' && inst.src3 == fromReg) {
		newInst.src3 = toReg;
	}
	return newInst;
}

int IRDestGPR(const IRInst &inst) {
	const IRMeta *m = GetIRMeta(inst.op);

	if ((m->flags & IRFLAG_SRC3) == 0 && m->types[0] == 'G') {
		return inst.dest;
	}
	return -1;
}

IRInst IRReplaceDestGPR(const IRInst &inst, int fromReg, int toReg) {
	IRInst newInst = inst;
	const IRMeta *m = GetIRMeta(inst.op);

	if ((m->flags & IRFLAG_SRC3) == 0 && m->types[0] == 'G' && inst.dest == fromReg) {
		newInst.dest = toReg;
	}
	return newInst;
}

bool IRMutatesDestGPR(const IRInst &inst, int reg) {
	const IRMeta *m = GetIRMeta(inst.op);
	return (m->flags & IRFLAG_SRC3DST) != 0 && m->types[0] == 'G' && inst.src3 == reg;
}

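// Removes writes to registers that are provably never read again: temporaries
// are dead at block end, other GPRs only once clobbered before any exit. Also
// forwards copies made through temps (e.g. from lwl/lwr expansion). Dead
// instructions are neutralized into "Mov 0, 0" and filtered out at the end.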
bool PurgeTemps(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	std::vector<IRInst> insts;
	insts.reserve(in.GetInstructions().size());

	struct Check {
		Check(int r, int i, bool rbx) : reg(r), index(i), readByExit(rbx) {
		}

		int reg;
		int srcReg = -1;
		int index;
		bool readByExit;
	};
	std::vector<Check> checks;
	int lastWrittenTo[256];
	memset(lastWrittenTo, -1, sizeof(lastWrittenTo));

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];
		const IRMeta *m = GetIRMeta(inst.op);

		for (Check &check : checks) {
			if (check.reg == 0) {
				continue;
			}

			if (IRReadsFromGPR(inst, check.reg)) {
				// Read from, but was this just a copy?
				bool mutatesReg = IRMutatesDestGPR(inst, check.reg);
				bool cannotReplace = inst.op == IROp::Interpret || inst.op == IROp::CallReplacement;
				if (!mutatesReg && !cannotReplace && check.srcReg >= 0 && lastWrittenTo[check.srcReg] < check.index) {
					// Replace with the srcReg instead. This happens with non-nice delay slots.
					inst = IRReplaceSrcGPR(inst, check.reg, check.srcReg);
				} else if (!IRMutatesDestGPR(insts[check.index], check.reg) && inst.op == IROp::Mov && i == check.index + 1) {
					// This happens with lwl/lwr temps. Replace the original dest.
					insts[check.index] = IRReplaceDestGPR(insts[check.index], check.reg, inst.dest);
					lastWrittenTo[inst.dest] = check.index;
					check.reg = inst.dest;
					// Update the read by exit flag to match the new reg.
					check.readByExit = inst.dest < IRTEMP_0 || inst.dest > IRTEMP_LR_SHIFT;
					// And swap the args for this mov, since we changed the other dest. We'll optimize this out later.
					std::swap(inst.dest, inst.src1);
				} else {
					// Legitimately read from, so we can't optimize out.
					check.reg = 0;
				}
			} else if (check.readByExit && (m->flags & IRFLAG_EXIT) != 0) {
				check.reg = 0;
			} else if (IRDestGPR(inst) == check.reg) {
				// Clobbered, we can optimize out.
				// This happens sometimes with temporaries used for constant addresses.
				insts[check.index].op = IROp::Mov;
				insts[check.index].dest = 0;
				insts[check.index].src1 = 0;
				check.reg = 0;
			}
		}

		int dest = IRDestGPR(inst);
		switch (dest) {
		case IRTEMP_0:
		case IRTEMP_1:
		case IRTEMP_2:
		case IRTEMP_3:
		case IRTEMP_LHS:
		case IRTEMP_RHS:
		case IRTEMP_LR_ADDR:
		case IRTEMP_LR_VALUE:
		case IRTEMP_LR_MASK:
		case IRTEMP_LR_SHIFT:
			// Unlike other ops, these don't need to persist between blocks.
			// So we consider them not read unless proven read.
			lastWrittenTo[dest] = i;
			// If this is a copy, we might be able to optimize out the copy.
			if (inst.op == IROp::Mov) {
				Check check(dest, i, false);
				check.srcReg = inst.src1;
				checks.push_back(check);
			} else {
				checks.push_back(Check(dest, i, false));
			}
			break;

		default:
			lastWrittenTo[dest] = i;
			if (dest > IRTEMP_LR_SHIFT) {
				// These might sometimes be implicitly read/written by other instructions.
				break;
			}
			checks.push_back(Check(dest, i, true));
			break;

		// Not a GPR output.
		case 0:
		case -1:
			break;
		}

		// TODO: VFPU temps? Especially for masked dregs.

		insts.push_back(inst);
	}

	for (Check &check : checks) {
		if (!check.readByExit && check.reg > 0) {
			insts[check.index].op = IROp::Mov;
			insts[check.index].dest = 0;
			insts[check.index].src1 = 0;
		}
	}

	for (const IRInst &inst : insts) {
		if (inst.op != IROp::Mov || inst.dest != 0 || inst.src1 != 0) {
			out.Write(inst);
		}
	}

	return logBlocks;
}

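// Narrows a load whose result is immediately masked, e.g.:
//   Load32   t0, a0, 0x0          Load8 t0, a0, 0x0
//   AndConst t0, t0, 0xFF    ->   (the AndConst is dropped)
// A mask of 0xFFFF similarly turns a Load32 into a Load16.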
bool ReduceLoads(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// This tells us to skip an AND op that has been optimized out.
	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
	int nextSkip = -1;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		if (inst.op == IROp::Load32 || inst.op == IROp::Load16 || inst.op == IROp::Load16Ext) {
			int dest = IRDestGPR(inst);
			for (int j = i + 1; j < n; j++) {
				const IRInst &laterInst = in.GetInstructions()[j];
				const IRMeta *m = GetIRMeta(laterInst.op);

				if ((m->flags & IRFLAG_EXIT) != 0) {
					// Exit, so we can't do the optimization.
					break;
				}
				if (IRReadsFromGPR(laterInst, dest)) {
					if (IRDestGPR(laterInst) == dest && laterInst.op == IROp::AndConst) {
						const u32 mask = laterInst.constant;
						// Here we are, maybe we can reduce the load size based on the mask.
						if ((mask & 0xffffff00) == 0) {
							inst.op = IROp::Load8;
							if (mask == 0xff) {
								nextSkip = j;
							}
						} else if ((mask & 0xffff0000) == 0 && inst.op == IROp::Load32) {
							inst.op = IROp::Load16;
							if (mask == 0xffff) {
								nextSkip = j;
							}
						}
					}
					// If it was read, we can't do the optimization.
					break;
				}
				if (IRDestGPR(laterInst) == dest) {
					// Someone else wrote, so we can't do the optimization.
					break;
				}
			}
		}

		if (i != nextSkip) {
			out.Write(inst);
		}
	}

	return logBlocks;
}

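// Sorts a run of compatible loads/stores (same op, same base register) by
// ascending offset, so accesses hit memory sequentially. The run is cut short
// at the first reordering hazard.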
static std::vector<IRInst> ReorderLoadStoreOps(std::vector<IRInst> &ops) {
	if (ops.size() < 2) {
		return ops;
	}

	bool modifiedRegs[256] = {};

	for (size_t i = 0, n = ops.size(); i < n - 1; ++i) {
		bool modifiesReg = false;
		bool usesFloatReg = false;
		switch (ops[i].op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			modifiesReg = true;
			if (ops[i].src1 == ops[i].dest) {
				// Can't ever reorder these, since the loaded value replaces the address register.
				continue;
			}
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			break;

		case IROp::LoadFloat:
		case IROp::LoadVec4:
			usesFloatReg = true;
			modifiesReg = true;
			break;

		case IROp::StoreFloat:
		case IROp::StoreVec4:
			usesFloatReg = true;
			break;

		default:
			continue;
		}

		memset(modifiedRegs, 0, sizeof(modifiedRegs));
		size_t start = i;
		size_t j;
		for (j = i; j < n; ++j) {
			if (ops[start].op != ops[j].op || ops[start].src1 != ops[j].src1) {
				// Incompatible ops, so let's not reorder.
				break;
			}
			if (modifiedRegs[ops[j].dest] || (!usesFloatReg && modifiedRegs[ops[j].src1])) {
				// Can't reorder, this reg was modified.
				break;
			}
			if (modifiesReg) {
				// Modifies itself, can't reorder this.
				if (!usesFloatReg && ops[j].dest == ops[j].src1) {
					break;
				}
				modifiedRegs[ops[j].dest] = true;
			}

			// Keep going, these operations are compatible.
		}

		// Everything up to (but not including) j will be sorted, so skip them.
		i = j - 1;
		size_t end = j;
		if (start + 1 < end) {
			std::stable_sort(ops.begin() + start, ops.begin() + end, [&](const IRInst &a, const IRInst &b) {
				return a.constant < b.constant;
			});
		}
	}

	return ops;
}

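// Queues memory ops and the simple ALU ops interleaved with them, then flushes
// the memory ops first (sorted via ReorderLoadStoreOps) and the ALU ops after.
// Any hazard, or an op we don't track, flushes everything queued so far.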
bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	enum class RegState : u8 {
		UNUSED = 0,
		READ = 1,
		CHANGED = 2,
	};

	bool queuing = false;
	std::vector<IRInst> loadStoreQueue;
	std::vector<IRInst> otherQueue;
	RegState otherRegs[256] = {};

	auto flushQueue = [&]() {
		if (!queuing) {
			return;
		}

		std::vector<IRInst> loadStoreUnsorted = loadStoreQueue;
		std::vector<IRInst> loadStoreSorted = ReorderLoadStoreOps(loadStoreQueue);
		// The queue may be empty when only non-memory ops were queued, so guard the memcmp.
		if (!loadStoreSorted.empty() && memcmp(loadStoreSorted.data(), loadStoreUnsorted.data(), sizeof(IRInst) * loadStoreSorted.size()) != 0) {
			logBlocks = true;
		}

		queuing = false;
		for (IRInst queued : loadStoreSorted) {
			out.Write(queued);
		}
		for (IRInst queued : otherQueue) {
			out.Write(queued);
		}
		loadStoreQueue.clear();
		otherQueue.clear();
		memset(otherRegs, 0, sizeof(otherRegs));
	};

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			// To move a load up, its dest can't be changed by things we move down.
			if (otherRegs[inst.dest] != RegState::UNUSED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			// A store can move above even if it's read, as long as it's not changed by the other ops.
			if (otherRegs[inst.src3] == RegState::CHANGED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::LoadVec4:
		case IROp::LoadFloat:
		case IROp::StoreVec4:
		case IROp::StoreFloat:
			// Floats can always move as long as their address is safe.
			if (otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
		case IROp::MovZ:
		case IROp::MovNZ:
		case IROp::Max:
		case IROp::Min:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
		case IROp::Mov:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::SetConst:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MfHi:
		case IROp::MfLo:
		case IROp::FpCondToReg:
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MtHi:
		case IROp::MtLo:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Nop:
		case IROp::Downcount:
		case IROp::ZeroFpCond:
			if (queuing) {
				// These are freebies. Sometimes helps with delay slots.
				otherQueue.push_back(inst);
			} else {
				out.Write(inst);
			}
			break;

		default:
			flushQueue();
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

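// Merges adjacent stores of zero into wider stores (two/four Store8 into a
// Store16/Store32, two Store16 into a Store32) on unaligned-capable backends,
// and turns a load that immediately follows a store to the same address into
// a register move (or drops it when it targets the same register).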
bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	auto opsCompatible = [&](const IRInst &a, const IRInst &b, int dist) {
		if (a.op != b.op || a.src1 != b.src1) {
			// Not similar enough at all.
			return false;
		}
		u32 off1 = a.constant;
		u32 off2 = b.constant;
		if (off1 + dist != off2) {
			// Not immediately sequential.
			return false;
		}

		return true;
	};

	IRInst prev = { IROp::Nop };
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];
		int c = 0;
		switch (inst.op) {
		case IROp::Store8:
			for (c = 1; c < 4 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c)) {
					break;
				}
			}
			if ((c == 2 || c == 3) && opts.unalignedLoadStore) {
				inst.op = IROp::Store16;
				out.Write(inst);
				prev = inst;
				// Skip the next one (the 3rd will be separate.)
				++i;
				continue;
			}
			if (c == 4 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip all 4.
				i += 3;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Store16:
			for (c = 1; c < 2 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c * 2)) {
					break;
				}
			}
			if (c == 2 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip the next one.
				++i;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Load32:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load. This is sadly common in minis.
				if (prev.op == IROp::Store32 && prev.src3 == inst.dest) {
					// Even the same reg, a volatile variable? Skip it.
					continue;
				}

				// Store16 and Store8 in rare cases happen... could be made AndConst, but not worth the trouble.
				if (prev.op == IROp::Store32) {
					inst.op = IROp::Mov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMovToGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::LoadFloat:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load, of a float.
				if (prev.op == IROp::StoreFloat && prev.src3 == inst.dest) {
					// Volatile float, I suppose?
					continue;
				}

				if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::Store32) {
					inst.op = IROp::FMovFromGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		default:
			out.Write(inst);
			prev = inst;
			break;
		}
	}
	return logBlocks;
}