#include <algorithm>
#include <cstring>
#include <utility>

#include "Common/BitSet.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Log.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/IR/IRPassSimplify.h"
#include "Core/MIPS/IR/IRRegCache.h"

// #define CONDITIONAL_DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }
#define CONDITIONAL_DISABLE
#define DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; }

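// Constant-folds a binary ALU op for the propagation pass below.
// For example, Evaluate(0x12345678, 0x0000FFFF, IROp::And) == 0x00005678.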
u32 Evaluate(u32 a, u32 b, IROp op) {
	switch (op) {
	case IROp::Add: case IROp::AddConst: return a + b;
	case IROp::Sub: case IROp::SubConst: return a - b;
	case IROp::And: case IROp::AndConst: return a & b;
	case IROp::Or: case IROp::OrConst: return a | b;
	case IROp::Xor: case IROp::XorConst: return a ^ b;
	case IROp::Shr: case IROp::ShrImm: return a >> b;
	case IROp::Sar: case IROp::SarImm: return (s32)a >> b;
	case IROp::Ror: case IROp::RorImm: return (a >> b) | (a << (32 - b));
	case IROp::Shl: case IROp::ShlImm: return a << b;
	case IROp::Slt: case IROp::SltConst: return ((s32)a < (s32)b);
	case IROp::SltU: case IROp::SltUConst: return (a < b);
	default:
		return -1;
	}
}

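// Unary counterpart of the above, e.g. Evaluate(0x0000FFFF, IROp::Not) == 0xFFFF0000.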
u32 Evaluate(u32 a, IROp op) {
	switch (op) {
	case IROp::Not: return ~a;
	case IROp::Neg: return -(s32)a;
	case IROp::BSwap16: return ((a & 0xFF00FF00) >> 8) | ((a & 0x00FF00FF) << 8);
	case IROp::BSwap32: return swap32(a);
	case IROp::Ext8to32: return SignExtend8ToU32(a);
	case IROp::Ext16to32: return SignExtend16ToU32(a);
	case IROp::ReverseBits: return ReverseBits32(a);
	case IROp::Clz: {
		int x = 31;
		int count = 0;
		while (x >= 0 && !(a & (1 << x))) {
			count++;
			x--;
		}
		return count;
	}
	default:
		return -1;
	}
}

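// Maps a reg-reg ALU op to its reg-imm form so a known-constant src2 can be folded in.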
IROp ArithToArithConst(IROp op) {
	switch (op) {
	case IROp::Add: return IROp::AddConst;
	case IROp::Sub: return IROp::SubConst;
	case IROp::And: return IROp::AndConst;
	case IROp::Or: return IROp::OrConst;
	case IROp::Xor: return IROp::XorConst;
	case IROp::Slt: return IROp::SltConst;
	case IROp::SltU: return IROp::SltUConst;
	default:
		return (IROp)-1;
	}
}

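// Same idea for shifts: maps a variable-amount shift to its immediate-amount form.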
IROp ShiftToShiftImm(IROp op) {
	switch (op) {
	case IROp::Shl: return IROp::ShlImm;
	case IROp::Shr: return IROp::ShrImm;
	case IROp::Ror: return IROp::RorImm;
	case IROp::Sar: return IROp::SarImm;
	default:
		return (IROp)-1;
	}
}

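// Runs a chain of passes, ping-ponging between two temporary writers so each pass
// consumes the previous pass's output. Returns true if any pass wants the block logged.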
bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out, const IROptions &opts) {
	if (c == 1) {
		return passes[0](in, out, opts);
	}

	bool logBlocks = false;

	IRWriter temp[2];
	const IRWriter *nextIn = &in;
	IRWriter *nextOut = &temp[1];
	for (size_t i = 0; i < c - 1; ++i) {
		if (passes[i](*nextIn, *nextOut, opts)) {
			logBlocks = true;
		}

		temp[0] = std::move(temp[1]);
		nextIn = &temp[0];
	}

	if (passes[c - 1](*nextIn, out, opts)) {
		logBlocks = true;
	}

	return logBlocks;
}

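// Rewrites float<->GPR round trips so the second move stays in the FPU domain.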
bool OptimizeFPMoves(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	IRInst prev{ IROp::Nop };

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::FMovFromGPR:
			//FMovToGPR a0, f12
			//FMovFromGPR f14, a0
			// to
			//FMovToGPR a0, f12
			//FMov f14, f12
			if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) {
				inst.op = IROp::FMov;
				inst.src1 = prev.src1;
				out.Write(inst);
			} else {
				out.Write(inst);
			}
			break;

		// This would need to scan forward or track more information to be useful.
		// Handling just a single adjacent pair isn't enough.
		/*
		case IROp::LoadVec4:
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, a0, 0x0
			// to
			// AddConst a0, sp, 0x30
			// LoadVec4 v16, sp, 0x30
			if (prev.op == IROp::AddConst && prev.dest == inst.src1 && prev.dest != prev.src1 && prev.src1 == MIPS_REG_SP) {
				inst.constant += prev.constant;
				inst.src1 = prev.src1;
				logBlocks = 1;
			} else {
				goto doDefault;
			}
			out.Write(inst);
			break;
		*/
		default:
			out.Write(inst);
			break;
		}
		prev = inst;
	}
	return logBlocks;
}

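// Splits three-operand ops into a Mov plus a two-operand op when the dest differs
// from both sources, which maps better onto two-address instruction sets.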
// Might be useful later on x86.
bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;
		case IROp::FMul:
		case IROp::FAdd:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::FMov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
			if (inst.src1 != inst.dest && inst.src2 != inst.dest) {
				out.Write(IROp::Vec4Mov, inst.dest, inst.src1);
				out.Write(inst.op, inst.dest, inst.dest, inst.src2);
			} else {
				out.Write(inst);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

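// Expands MIPS lwl/lwr/swl/swr into plain loads, stores, shifts and masks.
// A matching left/right pair on the same register and base (e.g. lwl at offset+3
// followed by lwr at offset) collapses into one unaligned Load32/Store32 instead,
// when the backend permits unaligned access.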
bool RemoveLoadStoreLeftRight(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; ++i) {
		const IRInst &inst = in.GetInstructions()[i];

		// TODO: Reorder or look ahead to combine?

		auto nextOp = [&]() -> const IRInst &{
			return in.GetInstructions()[i + 1];
		};

		auto combineOpposite = [&](IROp matchOp, int matchOff, IROp replaceOp, int replaceOff) {
			if (!opts.unalignedLoadStore || i + 1 >= n)
				return false;
			const IRInst &next = nextOp();
			if (next.op != matchOp || next.dest != inst.dest || next.src1 != inst.src1)
				return false;
			if (inst.constant + matchOff != next.constant)
				return false;

			// Write out one unaligned op.
			out.Write(replaceOp, inst.dest, inst.src1, out.AddConstant(inst.constant + replaceOff));
			// Skip the next one, replaced.
			i++;
			return true;
		};

		auto addCommonProlog = [&]() {
			// IRTEMP_LR_ADDR = rs + imm
			out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant));
			// IRTEMP_LR_SHIFT = (addr & 3) * 8
			out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3));
			out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3);
			// IRTEMP_LR_ADDR = addr & 0xfffffffc (for stores, later)
			out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC));
			// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR)
			out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(0));
		};
		auto addCommonStore = [&](int off = 0) {
			// RAM(IRTEMP_LR_ADDR) = IRTEMP_LR_VALUE
			out.Write(IROp::Store32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(off));
		};

		switch (inst.op) {
		case IROp::Load32Left:
			if (!combineOpposite(IROp::Load32Right, -3, IROp::Load32, -3)) {
				addCommonProlog();
				// dest &= (0x00ffffff >> shift)
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE <<= (24 - shift)
				out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// IRTEMP_LR_VALUE <<= (24 - shift)
					out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Load32Right:
			if (!combineOpposite(IROp::Load32Left, 3, IROp::Load32, 0)) {
				addCommonProlog();
				// IRTEMP_LR_VALUE >>= shift
				out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// dest &= (0xffffff00 << (24 - shift))
				// Alternatively, could shift to a wall and back (but would require two shifts each way.)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK);
				// dest |= IRTEMP_LR_VALUE
				out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE);

				// Building display lists sometimes involves a bunch of lwr in a row.
				// We can generate more optimal code by combining.
				bool shiftNeedsReverse = true;
				bool src1Dirty = inst.dest == inst.src1;
				while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) {
					// IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta)
					out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant));

					if (shiftNeedsReverse) {
						// IRTEMP_LR_SHIFT = shift again
						out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
						out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
						shiftNeedsReverse = false;
					}
					// IRTEMP_LR_VALUE >>= IRTEMP_LR_SHIFT
					out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT);
					// dest &= IRTEMP_LR_MASK
					out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK);
					// dest |= IRTEMP_LR_VALUE
					out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE);

					src1Dirty = nextOp().dest == inst.src1;
					++i;
				}
			}
			break;

		case IROp::Store32Left:
			if (!combineOpposite(IROp::Store32Right, -3, IROp::Store32, -3)) {
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0xffffff00 << shift
				out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00);
				out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				// IRTEMP_LR_SHIFT = 24 - shift
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 >> (24 - shift)
				out.Write(IROp::Shr, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		case IROp::Store32Right:
			if (!combineOpposite(IROp::Store32Left, 3, IROp::Store32, 0)) {
				addCommonProlog();
				// IRTEMP_LR_VALUE &= 0x00ffffff << (24 - shift)
				out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT);
				out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT);
				out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24));
				// IRTEMP_LR_VALUE |= src3 << shift
				out.Write(IROp::Shl, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT);
				out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK);
				addCommonStore(0);
			}
			break;

		default:
			out.Write(inst);
			break;
		}
	}

	return logBlocks;
}

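// Forward constant propagation over a block: GPR values that become known are
// tracked as immediates in IRRegCache and only flushed when an op needs the register.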
bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	IRRegCache gpr(&out);

	bool logBlocks = false;
	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		bool symmetric = true;
		switch (inst.op) {
		case IROp::SetConst:
			gpr.SetImm(inst.dest, inst.constant);
			break;
		case IROp::SetConstF:
			goto doDefault;

		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
			symmetric = false;  // fallthrough
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
			// Regularize, for the add/or check below.
			if (symmetric && inst.src2 == inst.dest && inst.src1 != inst.src2) {
				std::swap(inst.src1, inst.src2);
			}
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (inst.op == IROp::And && gpr.IsImm(inst.src2) && gpr.GetImm(inst.src2) == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src2)) {
				const u32 imm2 = gpr.GetImm(inst.src2);
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (imm2 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) {
					// Add / Or with zero is just a Mov.
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(imm2));
				}
			} else if (symmetric && gpr.IsImm(inst.src1)) {
				const u32 imm1 = gpr.GetImm(inst.src1);
				gpr.MapDirtyIn(inst.dest, inst.src2);
				if (imm1 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) {
					// Add / Or with zero is just a Mov.
					if (inst.dest != inst.src2)
						out.Write(IROp::Mov, inst.dest, inst.src2);
				} else {
					out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(imm1));
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
			// AND with a zero constant is always 0.  Happens when optimizing lwl.
			if (inst.op == IROp::AndConst && inst.constant == 0) {
				gpr.SetImm(inst.dest, 0);
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.constant, inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
			if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
			} else if (gpr.IsImm(inst.src2)) {
				const u8 sa = gpr.GetImm(inst.src2) & 31;
				gpr.MapDirtyIn(inst.dest, inst.src1);
				if (sa == 0) {
					if (inst.dest != inst.src1)
						out.Write(IROp::Mov, inst.dest, inst.src1);
				} else {
					out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa);
				}
			} else {
				gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
				goto doDefault;
			}
			break;

		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
			if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.src2, inst.op));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mov:
			if (inst.dest == inst.src1) {
				// Nop
			} else if (gpr.IsImm(inst.src1)) {
				gpr.SetImm(inst.dest, gpr.GetImm(inst.src1));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			gpr.MapInIn(inst.src1, inst.src2);
			goto doDefault;

		case IROp::MovZ:
		case IROp::MovNZ:
			gpr.MapInInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::Min:
		case IROp::Max:
			gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
			goto doDefault;

		case IROp::FMovFromGPR:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::FMovToGPR:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MfHi:
		case IROp::MfLo:
			gpr.MapDirty(inst.dest);
			goto doDefault;

		case IROp::MtHi:
		case IROp::MtLo:
			gpr.MapIn(inst.src1);
			goto doDefault;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::StoreFloat:
		case IROp::StoreVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
			if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) {
				gpr.MapDirty(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapDirtyIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;
		case IROp::LoadFloat:
		case IROp::LoadVec4:
			if (gpr.IsImm(inst.src1)) {
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;
		case IROp::Load32Left:
		case IROp::Load32Right:
			if (gpr.IsImm(inst.src1)) {
				gpr.MapIn(inst.dest);
				out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant));
			} else {
				gpr.MapInIn(inst.dest, inst.src1);
				goto doDefault;
			}
			break;

		case IROp::Downcount:
		case IROp::SetPCConst:
			goto doDefault;

		case IROp::SetPC:
			if (gpr.IsImm(inst.src1)) {
				out.Write(IROp::SetPCConst, out.AddConstant(gpr.GetImm(inst.src1)));
			} else {
				gpr.MapIn(inst.src1);
				goto doDefault;
			}
			break;

		// FP-only instructions don't need to flush immediates.
		case IROp::FAdd:
		case IROp::FMul:
			// Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1)
			if (inst.src2 == inst.dest && inst.src1 != inst.src2)
				std::swap(inst.src1, inst.src2);
			out.Write(inst);
			break;

		case IROp::FSub:
		case IROp::FDiv:
		case IROp::FNeg:
		case IROp::FAbs:
		case IROp::FMov:
		case IROp::FRound:
		case IROp::FTrunc:
		case IROp::FCeil:
		case IROp::FFloor:
		case IROp::FCvtSW:
		case IROp::FSin:
		case IROp::FCos:
		case IROp::FSqrt:
		case IROp::FRSqrt:
		case IROp::FRecip:
		case IROp::FAsin:
			out.Write(inst);
			break;

		case IROp::SetCtrlVFPU:
			goto doDefault;

		case IROp::FCvtWS:
			// TODO: Actually, this should just use the currently set rounding mode.
			// Move up with FCvtSW when that's implemented.
			gpr.MapIn(IRREG_FCR31);
			out.Write(inst);
			break;

		case IROp::FpCondToReg:
			if (gpr.IsImm(IRREG_FPCOND)) {
				gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND));
			} else {
				gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
				out.Write(inst);
			}
			break;

		case IROp::Vec4Init:
		case IROp::Vec4Mov:
		case IROp::Vec4Add:
		case IROp::Vec4Sub:
		case IROp::Vec4Mul:
		case IROp::Vec4Div:
		case IROp::Vec4Dot:
		case IROp::Vec4Scale:
		case IROp::Vec4Shuffle:
		case IROp::Vec4Neg:
		case IROp::Vec4Abs:
		case IROp::Vec4Pack31To8:
		case IROp::Vec4Pack32To8:
		case IROp::Vec2Pack32To16:
		case IROp::Vec4Unpack8To32:
		case IROp::Vec2Unpack16To32:
		case IROp::Vec4DuplicateUpperBitsAndShift1:
		case IROp::Vec2ClampToZero:
		case IROp::Vec4ClampToZero:
			out.Write(inst);
			break;

		case IROp::ZeroFpCond:
		case IROp::FCmp:
			gpr.MapDirty(IRREG_FPCOND);
			goto doDefault;

		case IROp::RestoreRoundingMode:
		case IROp::ApplyRoundingMode:
		case IROp::UpdateRoundingMode:
			goto doDefault;

		case IROp::VfpuCtrlToReg:
			gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
			goto doDefault;

		case IROp::CallReplacement:
		case IROp::Break:
		case IROp::Syscall:
		case IROp::Interpret:
		case IROp::ExitToConst:
		case IROp::ExitToReg:
		case IROp::ExitToConstIfEq:
		case IROp::ExitToConstIfNeq:
		case IROp::ExitToConstIfFpFalse:
		case IROp::ExitToConstIfFpTrue:
		case IROp::ExitToConstIfGeZ:
		case IROp::ExitToConstIfGtZ:
		case IROp::ExitToConstIfLeZ:
		case IROp::ExitToConstIfLtZ:
		case IROp::Breakpoint:
		case IROp::MemoryCheck:
		default:
		{
			gpr.FlushAll();
		doDefault:
			out.Write(inst);
			break;
		}
		}
	}
	return logBlocks;
}

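// Helpers that consult IRMeta to see how an instruction reads, writes, or mutates a GPR.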
bool IRReadsFromGPR(const IRInst &inst, int reg) {
	const IRMeta *m = GetIRMeta(inst.op);

	if (m->types[1] == 'G' && inst.src1 == reg) {
		return true;
	}
	if (m->types[2] == 'G' && inst.src2 == reg) {
		return true;
	}
	if ((m->flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && m->types[0] == 'G' && inst.src3 == reg) {
		return true;
	}
	if (inst.op == IROp::Interpret || inst.op == IROp::CallReplacement) {
		return true;
	}
	return false;
}

IRInst IRReplaceSrcGPR(const IRInst &inst, int fromReg, int toReg) {
	IRInst newInst = inst;
	const IRMeta *m = GetIRMeta(inst.op);

	if (m->types[1] == 'G' && inst.src1 == fromReg) {
		newInst.src1 = toReg;
	}
	if (m->types[2] == 'G' && inst.src2 == fromReg) {
		newInst.src2 = toReg;
	}
	if ((m->flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && m->types[0] == 'G' && inst.src3 == fromReg) {
		newInst.src3 = toReg;
	}
	return newInst;
}

int IRDestGPR(const IRInst &inst) {
	const IRMeta *m = GetIRMeta(inst.op);

	if ((m->flags & IRFLAG_SRC3) == 0 && m->types[0] == 'G') {
		return inst.dest;
	}
	return -1;
}

IRInst IRReplaceDestGPR(const IRInst &inst, int fromReg, int toReg) {
	IRInst newInst = inst;
	const IRMeta *m = GetIRMeta(inst.op);

	if ((m->flags & IRFLAG_SRC3) == 0 && m->types[0] == 'G' && inst.dest == fromReg) {
		newInst.dest = toReg;
	}
	return newInst;
}

bool IRMutatesDestGPR(const IRInst &inst, int reg) {
	const IRMeta *m = GetIRMeta(inst.op);
	return (m->flags & IRFLAG_SRC3DST) != 0 && m->types[0] == 'G' && inst.src3 == reg;
}

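// Drops writes to temporaries that are clobbered or never read before the block exits,
// and forwards copies through them where that's provably safe.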
bool PurgeTemps(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	std::vector<IRInst> insts;
	insts.reserve(in.GetInstructions().size());

	struct Check {
		Check(int r, int i, bool rbx) : reg(r), index(i), readByExit(rbx) {
		}

		int reg;
		int srcReg = -1;
		int index;
		bool readByExit;
	};
	std::vector<Check> checks;
	int lastWrittenTo[256];
	memset(lastWrittenTo, -1, sizeof(lastWrittenTo));

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];
		const IRMeta *m = GetIRMeta(inst.op);

		for (Check &check : checks) {
			if (check.reg == 0) {
				continue;
			}

			if (IRReadsFromGPR(inst, check.reg)) {
				// Read from, but was this just a copy?
				bool mutatesReg = IRMutatesDestGPR(inst, check.reg);
				bool cannotReplace = inst.op == IROp::Interpret || inst.op == IROp::CallReplacement;
				if (!mutatesReg && !cannotReplace && check.srcReg >= 0 && lastWrittenTo[check.srcReg] < check.index) {
					// Replace with the srcReg instead.  This happens with non-nice delay slots.
					inst = IRReplaceSrcGPR(inst, check.reg, check.srcReg);
				} else if (!IRMutatesDestGPR(insts[check.index], check.reg) && inst.op == IROp::Mov && i == check.index + 1) {
					// This happens with lwl/lwr temps.  Replace the original dest.
					insts[check.index] = IRReplaceDestGPR(insts[check.index], check.reg, inst.dest);
					lastWrittenTo[inst.dest] = check.index;
					check.reg = inst.dest;
					// Update the read by exit flag to match the new reg.
					check.readByExit = inst.dest < IRTEMP_0 || inst.dest > IRTEMP_LR_SHIFT;
					// And swap the args for this mov, since we changed the other dest.  We'll optimize this out later.
					std::swap(inst.dest, inst.src1);
				} else {
					// Legitimately read from, so we can't optimize out.
					check.reg = 0;
				}
			} else if (check.readByExit && (m->flags & IRFLAG_EXIT) != 0) {
				check.reg = 0;
			} else if (IRDestGPR(inst) == check.reg) {
				// Clobbered, we can optimize out.
				// This happens sometimes with temporaries used for constant addresses.
				insts[check.index].op = IROp::Mov;
				insts[check.index].dest = 0;
				insts[check.index].src1 = 0;
				check.reg = 0;
			}
		}

		int dest = IRDestGPR(inst);
		switch (dest) {
		case IRTEMP_0:
		case IRTEMP_1:
		case IRTEMP_2:
		case IRTEMP_3:
		case IRTEMP_LHS:
		case IRTEMP_RHS:
		case IRTEMP_LR_ADDR:
		case IRTEMP_LR_VALUE:
		case IRTEMP_LR_MASK:
		case IRTEMP_LR_SHIFT:
			// Unlike other ops, these don't need to persist between blocks.
			// So we consider them not read unless proven read.
			lastWrittenTo[dest] = i;
			// If this is a copy, we might be able to optimize out the copy.
			if (inst.op == IROp::Mov) {
				Check check(dest, i, false);
				check.srcReg = inst.src1;
				checks.push_back(check);
			} else {
				checks.push_back(Check(dest, i, false));
			}
			break;

		default:
			lastWrittenTo[dest] = i;
			if (dest > IRTEMP_LR_SHIFT) {
				// These might sometimes be implicitly read/written by other instructions.
				break;
			}
			checks.push_back(Check(dest, i, true));
			break;

		// Not a GPR output.
		case 0:
		case -1:
			break;
		}

		// TODO: VFPU temps?  Especially for masked dregs.

		insts.push_back(inst);
	}

	for (Check &check : checks) {
		if (!check.readByExit && check.reg > 0) {
			insts[check.index].op = IROp::Mov;
			insts[check.index].dest = 0;
			insts[check.index].src1 = 0;
		}
	}

	for (const IRInst &inst : insts) {
		if (inst.op != IROp::Mov || inst.dest != 0 || inst.src1 != 0) {
			out.Write(inst);
		}
	}

	return logBlocks;
}

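// Narrows a load whose result is immediately masked, e.g.:
//   Load32 t0, a0, 0x10
//   AndConst t0, t0, 0xFF
// becomes a Load8, and the now-redundant AndConst is skipped when the mask matches exactly.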
bool ReduceLoads(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;
	// This tells us to skip an AND op that has been optimized out.
	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
	int nextSkip = -1;

	bool logBlocks = false;
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];

		if (inst.op == IROp::Load32 || inst.op == IROp::Load16 || inst.op == IROp::Load16Ext) {
			int dest = IRDestGPR(inst);
			for (int j = i + 1; j < n; j++) {
				const IRInst &laterInst = in.GetInstructions()[j];
				const IRMeta *m = GetIRMeta(laterInst.op);

				if ((m->flags & IRFLAG_EXIT) != 0) {
					// Exit, so we can't do the optimization.
					break;
				}
				if (IRReadsFromGPR(laterInst, dest)) {
					if (IRDestGPR(laterInst) == dest && laterInst.op == IROp::AndConst) {
						const u32 mask = laterInst.constant;
						// Here we are, maybe we can reduce the load size based on the mask.
						if ((mask & 0xffffff00) == 0) {
							inst.op = IROp::Load8;
							if (mask == 0xff) {
								nextSkip = j;
							}
						} else if ((mask & 0xffff0000) == 0 && inst.op == IROp::Load32) {
							inst.op = IROp::Load16;
							if (mask == 0xffff) {
								nextSkip = j;
							}
						}
					}
					// If it was read, we can't do the optimization.
					break;
				}
				if (IRDestGPR(laterInst) == dest) {
					// Someone else wrote, so we can't do the optimization.
					break;
				}
			}
		}

		if (i != nextSkip) {
			out.Write(inst);
		}
	}

	return logBlocks;
}

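// Sorts runs of compatible loads/stores by ascending offset so backends see
// sequential accesses. Returns the (possibly reordered) op list.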
static std::vector<IRInst> ReorderLoadStoreOps(std::vector<IRInst> &ops) {
	if (ops.size() < 2) {
		return ops;
	}

	bool modifiedRegs[256] = {};

	for (size_t i = 0, n = ops.size(); i < n - 1; ++i) {
		bool modifiesReg = false;
		bool usesFloatReg = false;
		switch (ops[i].op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			modifiesReg = true;
			if (ops[i].src1 == ops[i].dest) {
				// Can't ever reorder these, since the load overwrites its own address register.
				continue;
			}
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			break;

		case IROp::LoadFloat:
		case IROp::LoadVec4:
			usesFloatReg = true;
			modifiesReg = true;
			break;

		case IROp::StoreFloat:
		case IROp::StoreVec4:
			usesFloatReg = true;
			break;

		default:
			continue;
		}

		memset(modifiedRegs, 0, sizeof(modifiedRegs));
		size_t start = i;
		size_t j;
		for (j = i; j < n; ++j) {
			if (ops[start].op != ops[j].op || ops[start].src1 != ops[j].src1) {
				// Incompatible ops, so let's not reorder.
				break;
			}
			if (modifiedRegs[ops[j].dest] || (!usesFloatReg && modifiedRegs[ops[j].src1])) {
				// Can't reorder, this reg was modified.
				break;
			}
			if (modifiesReg) {
				// Modifies itself, can't reorder this.
				if (!usesFloatReg && ops[j].dest == ops[j].src1) {
					break;
				}
				modifiedRegs[ops[j].dest] = true;
			}

			// Keep going, these operations are compatible.
		}

		// Everything up to (but not including) j will be sorted, so skip them.
		i = j - 1;
		size_t end = j;
		if (start + 1 < end) {
			std::stable_sort(ops.begin() + start, ops.begin() + end, [&](const IRInst &a, const IRInst &b) {
				return a.constant < b.constant;
			});
		}
	}

	return ops;
}

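// Queues memory ops separately from simple ALU ops so consecutive loads/stores can
// be grouped for the sort above; anything unsafe to reorder flushes both queues.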
bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	enum class RegState : u8 {
		UNUSED = 0,
		READ = 1,
		CHANGED = 2,
	};

	bool queuing = false;
	std::vector<IRInst> loadStoreQueue;
	std::vector<IRInst> otherQueue;
	RegState otherRegs[256] = {};

	auto flushQueue = [&]() {
		if (!queuing) {
			return;
		}

		std::vector<IRInst> loadStoreUnsorted = loadStoreQueue;
		std::vector<IRInst> loadStoreSorted = ReorderLoadStoreOps(loadStoreQueue);
		// The queue can be empty here if only ALU ops were queued, so guard the memcmp.
		if (!loadStoreSorted.empty() && memcmp(loadStoreSorted.data(), loadStoreUnsorted.data(), sizeof(IRInst) * loadStoreSorted.size()) != 0) {
			logBlocks = true;
		}

		queuing = false;
		for (IRInst queued : loadStoreSorted) {
			out.Write(queued);
		}
		for (IRInst queued : otherQueue) {
			out.Write(queued);
		}
		loadStoreQueue.clear();
		otherQueue.clear();
		memset(otherRegs, 0, sizeof(otherRegs));
	};

	for (int i = 0; i < (int)in.GetInstructions().size(); i++) {
		IRInst inst = in.GetInstructions()[i];
		switch (inst.op) {
		case IROp::Load8:
		case IROp::Load8Ext:
		case IROp::Load16:
		case IROp::Load16Ext:
		case IROp::Load32:
		case IROp::Load32Left:
		case IROp::Load32Right:
			// To move a load up, its dest can't be changed by things we move down.
			if (otherRegs[inst.dest] != RegState::UNUSED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Store8:
		case IROp::Store16:
		case IROp::Store32:
		case IROp::Store32Left:
		case IROp::Store32Right:
			// A store can move above even if it's read, as long as it's not changed by the other ops.
			if (otherRegs[inst.src3] == RegState::CHANGED || otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::LoadVec4:
		case IROp::LoadFloat:
		case IROp::StoreVec4:
		case IROp::StoreFloat:
			// Floats can always move as long as their address is safe.
			if (otherRegs[inst.src1] == RegState::CHANGED) {
				flushQueue();
			}

			queuing = true;
			loadStoreQueue.push_back(inst);
			break;

		case IROp::Sub:
		case IROp::Slt:
		case IROp::SltU:
		case IROp::Add:
		case IROp::And:
		case IROp::Or:
		case IROp::Xor:
		case IROp::Shl:
		case IROp::Shr:
		case IROp::Ror:
		case IROp::Sar:
		case IROp::MovZ:
		case IROp::MovNZ:
		case IROp::Max:
		case IROp::Min:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Neg:
		case IROp::Not:
		case IROp::BSwap16:
		case IROp::BSwap32:
		case IROp::Ext8to32:
		case IROp::Ext16to32:
		case IROp::ReverseBits:
		case IROp::Clz:
		case IROp::AddConst:
		case IROp::SubConst:
		case IROp::AndConst:
		case IROp::OrConst:
		case IROp::XorConst:
		case IROp::SltConst:
		case IROp::SltUConst:
		case IROp::ShlImm:
		case IROp::ShrImm:
		case IROp::RorImm:
		case IROp::SarImm:
		case IROp::Mov:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::SetConst:
			// We'll try to move this downward.
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Mult:
		case IROp::MultU:
		case IROp::Madd:
		case IROp::MaddU:
		case IROp::Msub:
		case IROp::MsubU:
		case IROp::Div:
		case IROp::DivU:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED)
				otherRegs[inst.src2] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MfHi:
		case IROp::MfLo:
		case IROp::FpCondToReg:
			otherRegs[inst.dest] = RegState::CHANGED;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::MtHi:
		case IROp::MtLo:
			if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED)
				otherRegs[inst.src1] = RegState::READ;
			otherQueue.push_back(inst);
			queuing = true;
			break;

		case IROp::Nop:
		case IROp::Downcount:
		case IROp::ZeroFpCond:
			if (queuing) {
				// These are freebies.  Sometimes helps with delay slots.
				otherQueue.push_back(inst);
			} else {
				out.Write(inst);
			}
			break;

		default:
			flushQueue();
			out.Write(inst);
			break;
		}
	}
	return logBlocks;
}

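// Merges adjacent narrow stores of the zero register into wider ones (e.g. four
// sequential Store8 of reg 0 become one Store32) and turns a store that's
// immediately reloaded from the same address into a plain register move.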
bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) {
	CONDITIONAL_DISABLE;

	bool logBlocks = false;

	auto opsCompatible = [&](const IRInst &a, const IRInst &b, int dist) {
		if (a.op != b.op || a.src1 != b.src1) {
			// Not similar enough at all.
			return false;
		}
		u32 off1 = a.constant;
		u32 off2 = b.constant;
		if (off1 + dist != off2) {
			// Not immediately sequential.
			return false;
		}

		return true;
	};

	IRInst prev = { IROp::Nop };
	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
		IRInst inst = in.GetInstructions()[i];
		int c = 0;
		switch (inst.op) {
		case IROp::Store8:
			for (c = 1; c < 4 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c)) {
					break;
				}
			}
			if ((c == 2 || c == 3) && opts.unalignedLoadStore) {
				inst.op = IROp::Store16;
				out.Write(inst);
				prev = inst;
				// Skip the next one (the 3rd will be separate.)
				++i;
				continue;
			}
			if (c == 4 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip all 4.
				i += 3;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Store16:
			for (c = 1; c < 2 && i + c < n; ++c) {
				const IRInst &nextInst = in.GetInstructions()[i + c];
				// TODO: Might be nice to check if this is an obvious constant.
				if (inst.src3 != nextInst.src3 || inst.src3 != 0) {
					break;
				}
				if (!opsCompatible(inst, nextInst, c * 2)) {
					break;
				}
			}
			if (c == 2 && opts.unalignedLoadStore) {
				inst.op = IROp::Store32;
				out.Write(inst);
				prev = inst;
				// Skip the next one.
				++i;
				continue;
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::Load32:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load.  This is sadly common in minis.
				if (prev.op == IROp::Store32 && prev.src3 == inst.dest) {
					// Even the same reg, a volatile variable?  Skip it.
					continue;
				}

				// Store16 and Store8 in rare cases happen... could be made AndConst, but not worth the trouble.
				if (prev.op == IROp::Store32) {
					inst.op = IROp::Mov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMovToGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		case IROp::LoadFloat:
			if (prev.src1 == inst.src1 && prev.src2 == inst.src2) {
				// A store and then an immediate load, of a float.
				if (prev.op == IROp::StoreFloat && prev.src3 == inst.dest) {
					// Volatile float, I suppose?
					continue;
				}

				if (prev.op == IROp::StoreFloat) {
					inst.op = IROp::FMov;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				} else if (prev.op == IROp::Store32) {
					inst.op = IROp::FMovFromGPR;
					inst.src1 = prev.src3;
					inst.src2 = 0;
				}
				// The actual op is written below.
			}
			out.Write(inst);
			prev = inst;
			break;

		default:
			out.Write(inst);
			prev = inst;
			break;
		}
	}
	return logBlocks;
}