1 // Copyright (c) 2012- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include <cmath>
19 
20 #include "Common/CPUDetect.h"
21 #include "Common/Data/Convert/SmallDataConvert.h"
22 #include "Common/Math/math_util.h"
23 #include "Core/Compatibility.h"
24 #include "Core/Config.h"
25 #include "Core/MemMap.h"
26 #include "Core/MIPS/MIPS.h"
27 #include "Core/MIPS/MIPSTables.h"
28 #include "Core/MIPS/MIPSAnalyst.h"
29 #include "Core/MIPS/MIPSCodeUtils.h"
30 #include "Core/MIPS/IR/IRFrontend.h"
31 #include "Core/MIPS/IR/IRRegCache.h"
32 #include "Core/Reporting.h"
33 #include "Core/System.h"
34 
35 
36 // All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
37 // Currently known non working ones should have DISABLE.
38 
39 // #define CONDITIONAL_DISABLE(flag) { Comp_Generic(op); return; }
40 #define CONDITIONAL_DISABLE(flag) if (opts.disableFlags & (uint32_t)JitDisable::flag) { Comp_Generic(op); return; }
41 #define DISABLE { Comp_Generic(op); return; }
42 #define INVALIDOP { Comp_Generic(op); return; }
43 
44 #define _RS MIPS_GET_RS(op)
45 #define _RT MIPS_GET_RT(op)
46 #define _RD MIPS_GET_RD(op)
47 #define _FS MIPS_GET_FS(op)
48 #define _FT MIPS_GET_FT(op)
49 #define _FD MIPS_GET_FD(op)
50 #define _SA MIPS_GET_SA(op)
51 #define _POS  ((op>> 6) & 0x1F)
52 #define _SIZE ((op>>11) & 0x1F)
53 #define _IMM16 (signed short)(op & 0xFFFF)
54 #define _IMM26 (op & 0x03FFFFFF)
55 
56 const int vfpuBase = 32;  // skip the FP registers
57 
58 namespace MIPSComp {
ApplyVoffset(u8 regs[4],int count)59 	static void ApplyVoffset(u8 regs[4], int count) {
60 		for (int i = 0; i < count; i++) {
61 			regs[i] = vfpuBase + voffset[regs[i]];
62 		}
63 	}
64 
IsConsecutive2(const u8 regs[2])65 	static bool IsConsecutive2(const u8 regs[2]) {
66 		return regs[1] == regs[0] + 1;
67 	}
68 
IsConsecutive4(const u8 regs[4])69 	static bool IsConsecutive4(const u8 regs[4]) {
70 		return regs[1] == regs[0] + 1 &&
71 			     regs[2] == regs[1] + 1 &&
72 			     regs[3] == regs[2] + 1;
73 	}
74 
75 	// Vector regs can overlap in all sorts of swizzled ways.
76 	// This does allow a single overlap in sregs[i].
IsOverlapSafeAllowS(int dreg,int di,int sn,u8 sregs[],int tn=0,u8 tregs[]=NULL)77 	static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) {
78 		for (int i = 0; i < sn; ++i) {
79 			if (sregs[i] == dreg && i != di)
80 				return false;
81 		}
82 		for (int i = 0; i < tn; ++i) {
83 			if (tregs[i] == dreg)
84 				return false;
85 		}
86 
87 		// Hurray, no overlap, we can write directly.
88 		return true;
89 	}
90 
IsOverlapSafeAllowS(int dn,u8 dregs[],int sn,u8 sregs[],int tn=0,u8 tregs[]=nullptr)91 	static bool IsOverlapSafeAllowS(int dn, u8 dregs[], int sn, u8 sregs[], int tn = 0, u8 tregs[] = nullptr) {
92 		for (int i = 0; i < dn; ++i) {
93 			if (!IsOverlapSafeAllowS(dregs[i], i, sn, sregs, tn, tregs)) {
94 				return false;
95 			}
96 		}
97 		return true;
98 	}
99 
IsOverlapSafe(int dreg,int sn,u8 sregs[],int tn=0,u8 tregs[]=nullptr)100 	static bool IsOverlapSafe(int dreg, int sn, u8 sregs[], int tn = 0, u8 tregs[] = nullptr) {
101 		return IsOverlapSafeAllowS(dreg, -1, sn, sregs, tn, tregs);
102 	}
103 
IsOverlapSafe(int dn,u8 dregs[],int sn,u8 sregs[],int tn=0,u8 tregs[]=nullptr)104 	static bool IsOverlapSafe(int dn, u8 dregs[], int sn, u8 sregs[], int tn = 0, u8 tregs[] = nullptr) {
105 		for (int i = 0; i < dn; ++i) {
106 			if (!IsOverlapSafe(dregs[i], sn, sregs, tn, tregs)) {
107 				return false;
108 			}
109 		}
110 		return true;
111 	}
112 
IsPrefixWithinSize(u32 prefix,VectorSize sz)113 	static bool IsPrefixWithinSize(u32 prefix, VectorSize sz) {
114 		int n = GetNumVectorElements(sz);
115 		for (int i = n; i < 4; i++) {
116 			int regnum = (prefix >> (i * 2)) & 3;
117 			int abs = (prefix >> (8 + i)) & 1;
118 			int negate = (prefix >> (16 + i)) & 1;
119 			int constants = (prefix >> (12 + i)) & 1;
120 			if (regnum < n || abs || negate || constants) {
121 				return false;
122 			}
123 		}
124 
125 		return true;
126 	}
127 
IsPrefixWithinSize(u32 prefix,MIPSOpcode op)128 	static bool IsPrefixWithinSize(u32 prefix, MIPSOpcode op) {
129 		return IsPrefixWithinSize(prefix, GetVecSize(op));
130 	}
131 
Comp_VPFX(MIPSOpcode op)132 	void IRFrontend::Comp_VPFX(MIPSOpcode op) {
133 		CONDITIONAL_DISABLE(VFPU_XFER);
134 		// This is how prefixes are typically set.
135 		int data = op & 0xFFFFF;
136 		int regnum = (op >> 24) & 3;
137 		switch (regnum) {
138 		case 0:  // S
139 			js.prefixS = data;
140 			js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
141 			break;
142 		case 1:  // T
143 			js.prefixT = data;
144 			js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
145 			break;
146 		case 2:  // D
147 			js.prefixD = data & 0x00000FFF;
148 			js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
149 			break;
150 		default:
151 			ERROR_LOG(CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);
152 			break;
153 		}
154 	}
155 
InitRegs(u8 * vregs,int reg)156 	static void InitRegs(u8 *vregs, int reg) {
157 		vregs[0] = reg;
158 		vregs[1] = reg + 1;
159 		vregs[2] = reg + 2;
160 		vregs[3] = reg + 3;
161 	}
162 
	// Applies an S/T read prefix: rewrites vregs[] so subsequent reads observe the
	// swizzled/abs/negated/constant-substituted values, emitting IR into the
	// 4-wide temp bank starting at tempReg where a lane is modified.
	void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg) {
		// 0xE4 is the identity prefix (lane i reads lane i, no modifiers).
		if (prefix == 0xE4)
			return;

		int n = GetNumVectorElements(sz);
		u8 origV[4];
		// Indexed by regnum + (abs << 2) in the constant-substitution path below.
		static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };

		for (int i = 0; i < n; i++)
			origV[i] = vregs[i];

		// Some common vector prefixes
		if (sz == V_Quad && IsConsecutive4(vregs)) {
			// Identity swizzle + all four negate bits: a plain vector negate.
			if (prefix == 0xF00E4) {
				InitRegs(vregs, tempReg);
				ir.Write(IROp::Vec4Neg, vregs[0], origV[0]);
				return;
			}
			// Identity swizzle + all four abs bits: a plain vector abs.
			if (prefix == 0x00FE4) {
				InitRegs(vregs, tempReg);
				ir.Write(IROp::Vec4Abs, vregs[0], origV[0]);
				return;
			}
			// Pure shuffle (only the low swizzle byte is set.)
			if (prefix == (prefix & 0xFF)) {
				InitRegs(vregs, tempReg);
				ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix);
				return;
			}
		}

		// Alright, fall back to the generic per-lane approach.
		for (int i = 0; i < n; i++) {
			int regnum = (prefix >> (i * 2)) & 3;
			int abs = (prefix >> (8 + i)) & 1;
			int negate = (prefix >> (16 + i)) & 1;
			int constants = (prefix >> (12 + i)) & 1;

			// Unchanged, hurray.
			if (!constants && regnum == i && !abs && !negate)
				continue;

			// This puts the value into a temp reg, so we won't write the modified value back.
			vregs[i] = tempReg + i;
			if (!constants) {
				if (regnum >= n) {
					// Swizzle selects a lane beyond the vector's size.
					// Depends on the op, but often zero.
					ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(0.0f));
				} else if (abs) {
					ir.Write(IROp::FAbs, vregs[i], origV[regnum]);
					if (negate)
						ir.Write(IROp::FNeg, vregs[i], vregs[i]);
				} else {
					if (negate)
						ir.Write(IROp::FNeg, vregs[i], origV[regnum]);
					else
						ir.Write(IROp::FMov, vregs[i], origV[regnum]);
				}
			} else {
				// Constant lane: abs doubles as a bank-select bit into constantArray.
				if (negate) {
					ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(-constantArray[regnum + (abs << 2)]));
				} else {
					ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(constantArray[regnum + (abs << 2)]));
				}
			}
		}
	}
230 
	// Decodes vectorReg into per-lane register numbers, then remaps them into
	// IR register space.
	void IRFrontend::GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {
		::GetVectorRegs(regs, N, vectorReg);
		// N is passed directly as the element count — relies on VectorSize's
		// numeric values matching element counts.
		ApplyVoffset(regs, N);
	}
235 
	// Decodes matrixReg into per-element registers (4 slots per row) and remaps
	// each row's vector into IR register space.
	void IRFrontend::GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {
		::GetMatrixRegs(regs, N, matrixReg);
		for (int i = 0; i < GetMatrixSide(N); i++) {
			ApplyVoffset(regs + 4 * i, GetVectorSize(N));
		}
	}
242 
	// Like GetVectorRegs, but additionally applies the current S prefix.
	// Requires the prefix to be statically known at compile time.
	void IRFrontend::GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) {
		_assert_(js.prefixSFlag & JitState::PREFIX_KNOWN);
		GetVectorRegs(regs, sz, vectorReg);
		ApplyPrefixST(regs, js.prefixS, sz, IRVTEMP_PFX_S);
	}
	// Like GetVectorRegs, but additionally applies the current T prefix.
	// Requires the prefix to be statically known at compile time.
	void IRFrontend::GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) {
		_assert_(js.prefixTFlag & JitState::PREFIX_KNOWN);
		GetVectorRegs(regs, sz, vectorReg);
		ApplyPrefixST(regs, js.prefixT, sz, IRVTEMP_PFX_T);
	}
253 
	// Destination variant: lanes masked out by the D prefix writemask are
	// redirected to scratch registers so the architectural regs stay untouched.
	void IRFrontend::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
		_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);

		GetVectorRegs(regs, sz, vectorReg);
		int n = GetNumVectorElements(sz);
		if (js.prefixD == 0)
			return;

		for (int i = 0; i < n; i++) {
			// Hopefully this is rare, we'll just write it into a dumping ground reg.
			if (js.VfpuWriteMask(i))
				regs[i] = IRVTEMP_PFX_D + i;
		}
	}
268 
GetDSat(int prefix,int i)269 	inline int GetDSat(int prefix, int i) {
270 		return (prefix >> (i * 2)) & 3;
271 	}
272 
273 	// "D" prefix is really a post process. No need to allocate a temporary register (except
274 	// dummies to simulate writemask, which is done in GetVectorRegsPrefixD
	// "D" prefix is really a post process. No need to allocate a temporary register (except
	// dummies to simulate writemask, which is done in GetVectorRegsPrefixD).
	// Emits per-lane saturation on the already-written results.
	void IRFrontend::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
		_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
		if (!js.prefixD)
			return;

		int n = GetNumVectorElements(sz);
		for (int i = 0; i < n; i++) {
			// Writemasked lanes were redirected to scratch regs; nothing to saturate.
			if (js.VfpuWriteMask(i))
				continue;
			int sat = GetDSat(js.prefixD, i);
			if (sat == 1) {
				// clamped = x < 0 ? (x > 1 ? 1 : x) : x [0, 1]
				ir.Write(IROp::FSat0_1, vregs[i], vregs[i]);
			} else if (sat == 3) {
				// Clamp to [-1, 1].
				ir.Write(IROp::FSatMinus1_1, vregs[i], vregs[i]);
			}
		}
	}
293 
	// lv.s / sv.s: single-float VFPU load/store, rs-relative.
	void IRFrontend::Comp_SV(MIPSOpcode op) {
		CONDITIONAL_DISABLE(LSU_VFPU);
		// Offset is word-aligned; the low two opcode bits carry part of the
		// VFPU register number instead.
		s32 offset = (signed short)(op & 0xFFFC);
		int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
		MIPSGPReg rs = _RS;

		CheckMemoryBreakpoint(rs, offset);

		switch (op >> 26) {
		case 50: //lv.s
			ir.Write(IROp::LoadFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));
			break;

		case 58: //sv.s
			ir.Write(IROp::StoreFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));
			break;

		default:
			INVALIDOP;
		}
	}
315 
	// lv.q / sv.q: quad load/store. Uses a single Vec4 access when the four
	// lanes are contiguous, otherwise four scalar accesses. lvl/lvr/svl/svr
	// are punted to the fallback.
	void IRFrontend::Comp_SVQ(MIPSOpcode op) {
		CONDITIONAL_DISABLE(LSU_VFPU);
		int imm = (signed short)(op & 0xFFFC);
		// Bit 0 of the opcode carries the high bit of the register number.
		int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5);
		MIPSGPReg rs = _RS;

		u8 vregs[4];
		GetVectorRegs(vregs, V_Quad, vt);

		CheckMemoryBreakpoint(rs, imm);

		switch (op >> 26) {
		case 54: //lv.q
			if (IsConsecutive4(vregs)) {
				ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm));
			} else {
				// Let's not even bother with "vertical" loads for now.
				ir.Write(IROp::LoadFloat, vregs[0], rs, ir.AddConstant(imm));
				ir.Write(IROp::LoadFloat, vregs[1], rs, ir.AddConstant(imm + 4));
				ir.Write(IROp::LoadFloat, vregs[2], rs, ir.AddConstant(imm + 8));
				ir.Write(IROp::LoadFloat, vregs[3], rs, ir.AddConstant(imm + 12));
			}
			break;

		case 62: //sv.q
			if (IsConsecutive4(vregs)) {
				ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm));
			} else {
				// Let's not even bother with "vertical" stores for now.
				ir.Write(IROp::StoreFloat, vregs[0], rs, ir.AddConstant(imm));
				ir.Write(IROp::StoreFloat, vregs[1], rs, ir.AddConstant(imm + 4));
				ir.Write(IROp::StoreFloat, vregs[2], rs, ir.AddConstant(imm + 8));
				ir.Write(IROp::StoreFloat, vregs[3], rs, ir.AddConstant(imm + 12));
			}
			break;

		case 53: // lvl/lvr.q - highly unusual
		case 61: // svl/svr.q - highly unusual
			DISABLE;
			break;

		default:
			INVALIDOP;
		}
	}
361 
	void IRFrontend::Comp_VVectorInit(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_XFER);
		if (js.HasUnknownPrefix() || js.HasSPrefix()) {
			DISABLE;
		}

		// Vector init
		// d[N] = CONST[N]
		// Note: probably implemented as vmov with prefix hack.

		VectorSize sz = GetVecSize(op);
		// type 6 initializes to all zeros; anything else here becomes all ones.
		int type = (op >> 16) & 0xF;
		int vd = _VD;
		int n = GetNumVectorElements(sz);
		u8 dregs[4];
		GetVectorRegsPrefixD(dregs, sz, vd);

		if (sz == V_Quad && IsConsecutive4(dregs)) {
			// Contiguous quad destination: one SIMD init.
			ir.Write(IROp::Vec4Init, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE));
		} else {
			// Otherwise set each lane to the constant individually.
			for (int i = 0; i < n; i++) {
				ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f));
			}
		}
		ApplyPrefixD(dregs, sz);
	}
388 
Comp_VIdt(MIPSOpcode op)389 	void IRFrontend::Comp_VIdt(MIPSOpcode op) {
390 		CONDITIONAL_DISABLE(VFPU_XFER);
391 		if (js.HasUnknownPrefix() || js.HasSPrefix()) {
392 			DISABLE;
393 		}
394 
395 		// Vector identity row
396 		// d[N] = IDENTITY[N,m]
397 		// Note: probably implemented as vmov with prefix hack.
398 
399 		int vd = _VD;
400 		VectorSize sz = GetVecSize(op);
401 		u8 dregs[4];
402 		GetVectorRegsPrefixD(dregs, sz, vd);
403 
404 		if (sz == 4 && IsConsecutive4(dregs)) {
405 			int row = vd & 3;
406 			Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row);
407 			ir.Write(IROp::Vec4Init, dregs[0], (int)init);
408 		} else {
409 			switch (sz) {
410 			case V_Pair:
411 				ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 1) == 0 ? 1.0f : 0.0f));
412 				ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 1) == 1 ? 1.0f : 0.0f));
413 				break;
414 			case V_Quad:
415 				ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 3) == 0 ? 1.0f : 0.0f));
416 				ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 3) == 1 ? 1.0f : 0.0f));
417 				ir.Write(IROp::SetConstF, dregs[2], ir.AddConstantFloat((vd & 3) == 2 ? 1.0f : 0.0f));
418 				ir.Write(IROp::SetConstF, dregs[3], ir.AddConstantFloat((vd & 3) == 3 ? 1.0f : 0.0f));
419 				break;
420 			default:
421 				INVALIDOP;
422 			}
423 		}
424 
425 		ApplyPrefixD(dregs, sz);
426 	}
427 
	void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_XFER);
		MatrixSize sz = GetMtxSize(op);
		// Only the full 4x4 case with no active prefixes is compiled here.
		if (sz != M_4x4 || !js.HasNoPrefix()) {
			DISABLE;
		}

		// Matrix init (weird prefixes)
		// d[N,M] = CONST[N,M]

		// Not really about trying here, it will work if enabled.
		VectorSize vsz = GetVectorSize(sz);
		u8 vecs[4];
		int vd = _VD;
		if (IsMatrixTransposed(vd)) {
			// All outputs are transpositionally symmetric, so should be fine.
			vd = TransposeMatrixReg(vd);
		}
		GetMatrixColumns(vd, M_4x4, vecs);
		for (int i = 0; i < 4; i++) {
			u8 vec[4];
			GetVectorRegs(vec, vsz, vecs[i]);
			// As they are columns, they will be nicely consecutive.
			Vec4Init init;
			switch ((op >> 16) & 0xF) {
			case 3:
				// Identity: column i gets a 1 in row i.
				init = Vec4Init((int)Vec4Init::Set_1000 + i);
				break;
			case 6:
				init = Vec4Init::AllZERO;
				break;
			case 7:
				init = Vec4Init::AllONE;
				break;
			default:
				// Unknown init type: emit nothing further.
				return;
			}
			ir.Write(IROp::Vec4Init, vec[0], (int)init);
		}
	}
468 
	void IRFrontend::Comp_VHdp(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
			DISABLE;
		}

		// Vector homogenous dot product
		// d[0] = s[0 .. n-2] dot t[0 .. n-2] + t[n-1]
		// Note: s[n-1] is ignored / treated as 1 via prefix override.

		int vd = _VD;
		int vs = _VS;
		int vt = _VT;
		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		// Bail if the S prefix sets the abs (bit 8+n-1) or negate (bit 16+n-1)
		// modifier on the last lane — that lane's value is replaced, not read.
		if (js.prefixS & (0x0101 << (8 + n - 1)))
			DISABLE;

		// TODO: Force read one of them into regs? probably not.
		u8 sregs[4], tregs[4], dregs[1];
		GetVectorRegsPrefixS(sregs, sz, vs);
		GetVectorRegsPrefixT(tregs, sz, vt);
		GetVectorRegsPrefixD(dregs, V_Single, vd);

		// Multiply-accumulate into a temp; the final lane adds t[n-1] alone,
		// which is the homogenous term.
		ir.Write(IROp::FMul, IRVTEMP_0, sregs[0], tregs[0]);

		for (int i = 1; i < n; i++) {
			if (i == n - 1) {
				ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, tregs[i]);
			} else {
				ir.Write(IROp::FMul, IRVTEMP_0 + 1, sregs[i], tregs[i]);
				ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, IRVTEMP_0 + 1);
			}
		}

		ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
		ApplyPrefixD(dregs, V_Single);
	}
508 
	// Reciprocal table (1/n, indexed by n-1) used by vavg to turn the horizontal
	// sum into an average.
	alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };
510 
	void IRFrontend::Comp_Vhoriz(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
			DISABLE;
		}

		// Vector horizontal add
		// d[0] = s[0] + ... s[n-1]
		// Vector horizontal average
		// d[0] = s[0] / n + ... s[n-1] / n
		// Note: Both are implemented as dot products against generated constants.

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		u8 sregs[4], dregs[1];
		GetVectorRegsPrefixS(sregs, sz, _VS);
		GetVectorRegsPrefixD(dregs, V_Single, _VD);

		// We have to start at +0.000 in case any values are -0.000.
		ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(0.0f));
		for (int i = 0; i < n; ++i) {
			ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, sregs[i]);
		}

		switch ((op >> 16) & 31) {
		case 6:  // vfad
			ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
			break;
		case 7:  // vavg
			// Multiply the sum by 1/n (from the table) to get the average.
			ir.Write(IROp::SetConstF, IRVTEMP_0 + 1, ir.AddConstantFloat(vavg_table[n - 1]));
			ir.Write(IROp::FMul, dregs[0], IRVTEMP_0, IRVTEMP_0 + 1);
			break;
		}

		ApplyPrefixD(dregs, V_Single);
	}
548 
	void IRFrontend::Comp_VDot(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
			DISABLE;
		}

		// Vector dot product
		// d[0] = s[0 .. n-1] dot t[0 .. n-1]

		int vd = _VD;
		int vs = _VS;
		int vt = _VT;

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		// TODO: Force read one of them into regs? probably not.
		u8 sregs[4], tregs[4], dregs[1];
		GetVectorRegsPrefixS(sregs, sz, vs);
		GetVectorRegsPrefixT(tregs, sz, vt);
		GetVectorRegsPrefixD(dregs, V_Single, vd);

		// Fast path: contiguous quads with no destination overlap become one Vec4Dot.
		if (sz == V_Quad && IsConsecutive4(sregs) && IsConsecutive4(tregs) && IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {
			ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);
			ApplyPrefixD(dregs, V_Single);
			return;
		}

		// Scalar fallback: multiply-accumulate in temps, writing the destination
		// only on the final add so overlapping sources aren't clobbered early.
		int temp0 = IRVTEMP_0;
		int temp1 = IRVTEMP_0 + 1;
		ir.Write(IROp::FMul, temp0, sregs[0], tregs[0]);
		for (int i = 1; i < n; i++) {
			ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]);
			ir.Write(IROp::FAdd, i == (n - 1) ? dregs[0] : temp0, temp0, temp1);
		}
		ApplyPrefixD(dregs, V_Single);
	}
586 
	void IRFrontend::Comp_VecDo3(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
			DISABLE;
		}

		// Vector arithmetic
		// d[N] = OP(s[N], t[N]) (see below)

		// Check that we can support the ops, and prepare temporary values for ops that need it.
		bool allowSIMD = true;
		switch (op >> 26) {
		case 24: //VFPU0
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] + t[i]; break; //vadd
			case 1: // d[i] = s[i] - t[i]; break; //vsub
				break;
			case 7: // d[i] = s[i] / t[i]; break; //vdiv
				// Division is only compiled when no prefixes are active.
				if (!js.HasNoPrefix()) {
					DISABLE;
				}
				break;
			default:
				INVALIDOP;
			}
			break;
		case 25: //VFPU1
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] * t[i]; break; //vmul
				break;
			default:
				INVALIDOP;
			}
			break;
		case 27: //VFPU3
			switch ((op >> 23) & 7) {
			case 2:  // vmin
			case 3:  // vmax
				allowSIMD = false;
				break;
			case 6:  // vsge
			case 7:  // vslt
				allowSIMD = false;
				break;
			default:
				INVALIDOP;
			}
			break;
		default:
			INVALIDOP;
		}

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		u8 sregs[4], tregs[4], dregs[4];
		GetVectorRegsPrefixS(sregs, sz, _VS);
		GetVectorRegsPrefixT(tregs, sz, _VT);
		GetVectorRegsPrefixD(dregs, sz, _VD);

		// Route lanes with source overlap through temps; safe lanes write directly.
		u8 tempregs[4];
		for (int i = 0; i < n; i++) {
			if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
				tempregs[i] = IRVTEMP_0 + i;
			} else {
				tempregs[i] = dregs[i];
			}
		}

		// If all three are consecutive 4, we're safe regardless of if we use temps so we should not check that here.
		if (allowSIMD && sz == V_Quad && IsConsecutive4(dregs) && IsConsecutive4(sregs) && IsConsecutive4(tregs)) {
			IROp opFunc = IROp::Nop;
			switch (op >> 26) {
			case 24: //VFPU0
				switch ((op >> 23) & 7) {
				case 0: // d[i] = s[i] + t[i]; break; //vadd
					opFunc = IROp::Vec4Add;
					break;
				case 1: // d[i] = s[i] - t[i]; break; //vsub
					opFunc = IROp::Vec4Sub;
					break;
				case 7: // d[i] = s[i] / t[i]; break; //vdiv
					opFunc = IROp::Vec4Div;
					break;
				}
				break;
			case 25: //VFPU1
				switch ((op >> 23) & 7)
				{
				case 0: // d[i] = s[i] * t[i]; break; //vmul
					opFunc = IROp::Vec4Mul;
					break;
				}
				break;
			case 27: //VFPU3
				switch ((op >> 23) & 7)
				{
				case 2:  // vmin
				case 3:  // vmax
				case 6:  // vsge
				case 7:  // vslt
					// No Vec4 forms for these; fall back entirely.
					DISABLE;
					break;
				}
				break;
			}

			if (opFunc != IROp::Nop) {
				ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);
			} else {
				DISABLE;
			}
			ApplyPrefixD(dregs, sz);
			return;
		}

		// Scalar path: one IR op per lane, into tempregs chosen above.
		for (int i = 0; i < n; ++i) {
			switch (op >> 26) {
			case 24: //VFPU0
				switch ((op >> 23) & 7) {
				case 0: // d[i] = s[i] + t[i]; break; //vadd
					ir.Write(IROp::FAdd, tempregs[i], sregs[i], tregs[i]);
					break;
				case 1: // d[i] = s[i] - t[i]; break; //vsub
					ir.Write(IROp::FSub, tempregs[i], sregs[i], tregs[i]);
					break;
				case 7: // d[i] = s[i] / t[i]; break; //vdiv
					ir.Write(IROp::FDiv, tempregs[i], sregs[i], tregs[i]);
					break;
				}
				break;
			case 25: //VFPU1
				switch ((op >> 23) & 7) {
				case 0: // d[i] = s[i] * t[i]; break; //vmul
					ir.Write(IROp::FMul, tempregs[i], sregs[i], tregs[i]);
					break;
				}
				break;
			case 27: //VFPU3
				switch ((op >> 23) & 7) {
				case 2:  // vmin
					ir.Write(IROp::FMin, tempregs[i], sregs[i], tregs[i]);
					break;
				case 3:  // vmax
					ir.Write(IROp::FMax, tempregs[i], sregs[i], tregs[i]);
					break;
				case 6:  // vsge
				case 7:  // vslt
					// Comparison-producing variants aren't implemented scalar either.
					DISABLE;
					break;
				}
				break;
			}
		}

		// Copy temp results into any destinations that had overlap hazards.
		for (int i = 0; i < n; i++) {
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}

		ApplyPrefixD(dregs, sz);
	}
750 
	void IRFrontend::Comp_VV2Op(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op))
			DISABLE;

		// Vector unary operation
		// d[N] = OP(s[N]) (see below)

		int vs = _VS;
		int vd = _VD;

		int optype = (op >> 16) & 0x1f;
		// Some sub-ops are only compiled under restricted prefix states.
		if (optype >= 16 && !js.HasNoPrefix()) {
			DISABLE;
		} else if ((optype == 1 || optype == 2) && js.HasSPrefix()) {
			DISABLE;
		} else if (optype == 5 && js.HasDPrefix()) {
			DISABLE;
		}

		// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
		if (optype == 0 && vs == vd && js.HasNoPrefix()) {
			return;
		}

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		u8 sregs[4], dregs[4];
		GetVectorRegsPrefixS(sregs, sz, vs);
		GetVectorRegsPrefixD(dregs, sz, vd);

		// Route lanes with source overlap through temps; safe lanes write directly.
		bool usingTemps = false;
		u8 tempregs[4];
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs)) {
				usingTemps = true;
				tempregs[i] = IRVTEMP_0 + i;
			} else {
				tempregs[i] = dregs[i];
			}
		}

		bool canSIMD = false;
		// Some can be SIMD'd.
		switch (optype) {
		case 0:  // vmov
		case 1:  // vabs
		case 2:  // vneg
			canSIMD = true;
			break;
		}

		// SIMD fast path for contiguous, hazard-free quads.
		if (canSIMD && !usingTemps && IsConsecutive4(sregs) && IsConsecutive4(dregs)) {
			switch (optype) {
			case 0:  // vmov
				ir.Write(IROp::Vec4Mov, dregs[0], sregs[0]);
				break;
			case 1:  // vabs
				ir.Write(IROp::Vec4Abs, dregs[0], sregs[0]);
				break;
			case 2:  // vneg
				ir.Write(IROp::Vec4Neg, dregs[0], sregs[0]);
				break;
			}
			ApplyPrefixD(dregs, sz);
			return;
		}

		// Scalar path: one (or two) IR ops per lane depending on sub-op.
		for (int i = 0; i < n; ++i) {
			switch (optype) {
			case 0: // d[i] = s[i]; break; //vmov
				// Probably for swizzle.
				ir.Write(IROp::FMov, tempregs[i], sregs[i]);
				break;
			case 1: // d[i] = fabsf(s[i]); break; //vabs
				ir.Write(IROp::FAbs, tempregs[i], sregs[i]);
				break;
			case 2: // d[i] = -s[i]; break; //vneg
				ir.Write(IROp::FNeg, tempregs[i], sregs[i]);
				break;
			case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;    // vsat0
				ir.Write(IROp::FSat0_1, tempregs[i], sregs[i]);
				break;
			case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;  // vsat1
				ir.Write(IROp::FSatMinus1_1, tempregs[i], sregs[i]);
				break;
			case 16: // d[i] = 1.0f / s[i]; break; //vrcp
				ir.Write(IROp::FRecip, tempregs[i], sregs[i]);
				break;
			case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
				ir.Write(IROp::FRSqrt, tempregs[i], sregs[i]);
				break;
			case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
				ir.Write(IROp::FSin, tempregs[i], sregs[i]);
				break;
			case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
				ir.Write(IROp::FCos, tempregs[i], sregs[i]);
				break;
			case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
				DISABLE;
				break;
			case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
				DISABLE;
				break;
			case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
				ir.Write(IROp::FSqrt, tempregs[i], sregs[i]);
				break;
			case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
				ir.Write(IROp::FAsin, tempregs[i], sregs[i]);
				break;
			case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
				// Composed as reciprocal followed by negate.
				ir.Write(IROp::FRecip, tempregs[i], sregs[i]);
				ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);
				break;
			case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
				// Composed as sin followed by negate.
				ir.Write(IROp::FSin, tempregs[i], sregs[i]);
				ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);
				break;
			case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
				DISABLE;
				break;
			default:
				INVALIDOP;
			}
		}
		// Copy temp results into any destinations that had overlap hazards.
		for (int i = 0; i < n; i++) {
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}

		ApplyPrefixD(dregs, sz);
	}
885 
	void IRFrontend::Comp_Vi2f(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {
			DISABLE;
		}

		// Vector integer to float
		// d[N] = float(S[N]) * mult

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		// imm is a fixed-point shift: results are scaled by 1 / 2^imm.
		int imm = (op >> 16) & 0x1f;
		const float mult = 1.0f / (float)(1UL << imm);

		u8 sregs[4], dregs[4];
		GetVectorRegsPrefixS(sregs, sz, _VS);
		GetVectorRegsPrefixD(dregs, sz, _VD);

		u8 tempregs[4];
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs)) {
				tempregs[i] = IRVTEMP_PFX_T + i;  // Need IRVTEMP_0 for the scaling factor
			} else {
				tempregs[i] = dregs[i];
			}
		}
		if (mult != 1.0f)
			ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(mult));
		// TODO: Use the SCVTF with builtin scaling where possible.
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FCvtSW, tempregs[i], sregs[i]);
		}
		// Apply the scale in a second pass, only when it isn't a no-op.
		if (mult != 1.0f) {
			for (int i = 0; i < n; i++) {
				ir.Write(IROp::FMul, tempregs[i], tempregs[i], IRVTEMP_0);
			}
		}

		// Copy temp results into any destinations that had overlap hazards.
		for (int i = 0; i < n; ++i) {
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}
		ApplyPrefixD(dregs, sz);
	}
932 
	void IRFrontend::Comp_Vh2f(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {
			DISABLE;
		}

		// Vector expand half to float
		// d[N*2] = float(lowerhalf(s[N])), d[N*2+1] = float(upperhalf(s[N]))

		// Not implemented in the IR frontend; always falls back.
		DISABLE;
	}
944 
	void IRFrontend::Comp_Vf2i(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixD & 0xFF) != 0) {
			DISABLE;
		}

		// Vector float to integer
		// d[N] = int(S[N] * mult)
		// Note: saturates on overflow.

		// Not implemented in the IR frontend; always falls back.
		DISABLE;
	}
957 
	// mfv/mfvc and mtv/mtvc: moves between GPRs and VFPU registers or VFPU
	// control registers (imm >= 128 selects a control register).
	void IRFrontend::Comp_Mftv(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_XFER);

		int imm = op & 0xFF;
		MIPSGPReg rt = _RT;
		switch ((op >> 21) & 0x1f) {
		case 3: //mfv / mfvc
			// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
			if (rt != MIPS_REG_ZERO) {
				if (imm < 128) {  //R(rt) = VI(imm);
					ir.Write(IROp::FMovToGPR, rt, vfpuBase + voffset[imm]);
				} else {
					switch (imm - 128) {
					case VFPU_CTRL_DPREFIX:
					case VFPU_CTRL_SPREFIX:
					case VFPU_CTRL_TPREFIX:
						// Make sure the JIT's tracked prefix state is written back
						// before the control register is read.
						FlushPrefixV();
						break;
					}
					if (imm - 128 < VFPU_CTRL_MAX) {
						ir.Write(IROp::VfpuCtrlToReg, rt, imm - 128);
					} else {
						INVALIDOP;
					}
				}
			}
			break;

		case 7: // mtv
			if (imm < 128) {
				ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[imm], rt);
			} else if ((imm - 128) < VFPU_CTRL_MAX) {
				u32 mask;
				if (GetVFPUCtrlMask(imm - 128, &mask)) {
					if (mask != 0xFFFFFFFF) {
						// Only some bits of this control register are writable.
						ir.Write(IROp::AndConst, IRTEMP_0, rt, ir.AddConstant(mask));
						ir.Write(IROp::SetCtrlVFPUReg, imm - 128, IRTEMP_0);
					} else {
						ir.Write(IROp::SetCtrlVFPUReg, imm - 128, rt);
					}
				}

				// Writing a prefix control register invalidates the JIT's static
				// knowledge of that prefix.
				if (imm - 128 == VFPU_CTRL_SPREFIX) {
					js.prefixSFlag = JitState::PREFIX_UNKNOWN;
				} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
					js.prefixTFlag = JitState::PREFIX_UNKNOWN;
				} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
					js.prefixDFlag = JitState::PREFIX_UNKNOWN;
				}
			} else {
				INVALIDOP;
			}
			break;

		default:
			INVALIDOP;
		}
		// This op is marked not to auto-eat prefix so we must do it manually.
		EatPrefix();
	}
1018 
Comp_Vmfvc(MIPSOpcode op)1019 	void IRFrontend::Comp_Vmfvc(MIPSOpcode op) {
1020 		CONDITIONAL_DISABLE(VFPU_XFER);
1021 
1022 		// Vector Move from vector control reg (no prefixes)
1023 		// D[0] = VFPU_CTRL[i]
1024 
1025 		int vd = _VD;
1026 		int imm = (op >> 8) & 0x7F;
1027 		if (imm < VFPU_CTRL_MAX) {
1028 			ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, imm);
1029 			ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[vd], IRTEMP_0);
1030 		} else {
1031 			INVALIDOP;
1032 		}
1033 	}
1034 
Comp_Vmtvc(MIPSOpcode op)1035 	void IRFrontend::Comp_Vmtvc(MIPSOpcode op) {
1036 		CONDITIONAL_DISABLE(VFPU_XFER);
1037 
1038 		// Vector Move to vector control reg (no prefixes)
1039 		// VFPU_CTRL[i] = S[0]
1040 
1041 		int vs = _VS;
1042 		int imm = op & 0xFF;
1043 		if (imm < VFPU_CTRL_MAX) {
1044 			u32 mask;
1045 			if (GetVFPUCtrlMask(imm, &mask)) {
1046 				if (mask != 0xFFFFFFFF) {
1047 					ir.Write(IROp::FMovToGPR, IRTEMP_0, vfpuBase + voffset[imm]);
1048 					ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(mask));
1049 					ir.Write(IROp::SetCtrlVFPUReg, imm, IRTEMP_0);
1050 				} else {
1051 					ir.Write(IROp::SetCtrlVFPUFReg, imm, vfpuBase + voffset[vs]);
1052 				}
1053 			}
1054 			if (imm == VFPU_CTRL_SPREFIX) {
1055 				js.prefixSFlag = JitState::PREFIX_UNKNOWN;
1056 			} else if (imm == VFPU_CTRL_TPREFIX) {
1057 				js.prefixTFlag = JitState::PREFIX_UNKNOWN;
1058 			} else if (imm == VFPU_CTRL_DPREFIX) {
1059 				js.prefixDFlag = JitState::PREFIX_UNKNOWN;
1060 			}
1061 		} else {
1062 			INVALIDOP;
1063 		}
1064 	}
1065 
	void IRFrontend::Comp_Vmmov(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
		if (!js.HasNoPrefix()) {
			DISABLE;
		}

		// Matrix move (weird prefixes)
		// D[N,M] = S[N,M]

		int vs = _VS;
		int vd = _VD;
		// This probably ignores prefixes for all sane intents and purposes.
		if (vs == vd) {
			// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
			return;
		}

		MatrixSize sz = GetMtxSize(op);
		int n = GetMatrixSide(sz);

		u8 sregs[16], dregs[16];
		GetMatrixRegs(sregs, sz, vs);
		GetMatrixRegs(dregs, sz, vd);

		// Overlapping matrices would require careful copy ordering (or an
		// in-place transpose), so punt those to the interpreter.
		switch (GetMatrixOverlap(vs, vd, sz)) {
		case OVERLAP_EQUAL:
			// In-place transpose
			DISABLE;
		case OVERLAP_PARTIAL:
			DISABLE;
		case OVERLAP_NONE:
		default:
			break;
		}
		// Fast path: same orientation and 4x4 means four vec4 column copies.
		if (IsMatrixTransposed(vd) == IsMatrixTransposed(vs) && sz == M_4x4) {
			// Untranspose both matrices
			if (IsMatrixTransposed(vd)) {
				vd = TransposeMatrixReg(vd);
				vs = TransposeMatrixReg(vs);
			}
			// Get the columns
			u8 scols[4], dcols[4];
			GetMatrixColumns(vs, sz, scols);
			GetMatrixColumns(vd, sz, dcols);
			for (int i = 0; i < 4; i++) {
				u8 svec[4], dvec[4];
				GetVectorRegs(svec, GetVectorSize(sz), scols[i]);
				GetVectorRegs(dvec, GetVectorSize(sz), dcols[i]);
				ir.Write(IROp::Vec4Mov, dvec[0], svec[0]);
			}
			return;
		}
		// Generic path: element-by-element moves (rows are strided by 4 in the
		// flat sregs/dregs arrays).
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				ir.Write(IROp::FMov, dregs[a * 4 + b], sregs[a * 4 + b]);
			}
		}
	}
1124 
Comp_Vmscl(MIPSOpcode op)1125 	void IRFrontend::Comp_Vmscl(MIPSOpcode op) {
1126 		CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);
1127 		if (!js.HasNoPrefix()) {
1128 			DISABLE;
1129 		}
1130 
1131 		// Matrix scale, matrix by scalar (weird prefixes)
1132 		// d[N,M] = s[N,M] * t[0]
1133 		// Note: behaves just slightly differently than a series of vscls.
1134 
1135 		int vs = _VS;
1136 		int vd = _VD;
1137 		int vt = _VT;
1138 
1139 		MatrixSize sz = GetMtxSize(op);
1140 		if (sz != M_4x4) {
1141 			DISABLE;
1142 		}
1143 		if (GetMtx(vt) == GetMtx(vd)) {
1144 			DISABLE;
1145 		}
1146 		int n = GetMatrixSide(sz);
1147 
1148 		// The entire matrix is scaled equally, so transpose doesn't matter.  Let's normalize.
1149 		if (IsMatrixTransposed(vs) && IsMatrixTransposed(vd)) {
1150 			vs = TransposeMatrixReg(vs);
1151 			vd = TransposeMatrixReg(vd);
1152 		}
1153 		if (IsMatrixTransposed(vs) || IsMatrixTransposed(vd)) {
1154 			DISABLE;
1155 		}
1156 
1157 		u8 sregs[16], dregs[16], tregs[1];
1158 		GetMatrixRegs(sregs, sz, vs);
1159 		GetMatrixRegs(dregs, sz, vd);
1160 		GetVectorRegs(tregs, V_Single, vt);
1161 
1162 		for (int i = 0; i < n; ++i) {
1163 			ir.Write(IROp::Vec4Scale, dregs[i * 4], sregs[i * 4], tregs[0]);
1164 		}
1165 	}
1166 
	void IRFrontend::Comp_VScl(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
			DISABLE;
		}

		// Vector scale, vector by scalar
		// d[N] = s[N] * t[0]

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		int vs = _VS;
		int vd = _VD;
		int vt = _VT;
		u8 sregs[4], dregs[4], treg;
		GetVectorRegsPrefixS(sregs, sz, vs);
		// TODO: Prefixes seem strange...
		GetVectorRegsPrefixT(&treg, V_Single, vt);
		GetVectorRegsPrefixD(dregs, sz, vd);

		bool overlap = false;
		// For prefixes to work, we just have to ensure that none of the output registers spill
		// and that there's no overlap.
		u8 tempregs[4];
		memcpy(tempregs, dregs, sizeof(tempregs));
		for (int i = 0; i < n; ++i) {
			// Conservative, can be improved
			// If a destination aliases the scalar or any source, route that
			// element through a temp and copy it back at the end.
			if (treg == dregs[i] || !IsOverlapSafe(dregs[i], n, sregs)) {
				// Need to use temp regs
				tempregs[i] = IRVTEMP_0 + i;
				overlap = true;
			}
		}

		// Fast path: one Vec4Scale when everything is consecutive and either
		// overlap-free, or the benign in-place case (vs == vd, scalar safe).
		if (n == 4 && IsConsecutive4(sregs) && IsConsecutive4(dregs)) {
			if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
				ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg);
				ApplyPrefixD(dregs, sz);
				return;
			}
		}

		// Scalar path: per-element multiplies into the (possibly temp) outputs.
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FMul, tempregs[i], sregs[i], treg);
		}

		for (int i = 0; i < n; i++) {
			// All must be mapped for prefixes to work.
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}

		ApplyPrefixD(dregs, sz);
	}
1223 
1224 	/*
1225 	// Capital = straight, lower case = transposed
1226 	// 8 possibilities:
1227 	ABC   2
1228 	ABc   missing
1229 	AbC   1
1230 	Abc   1
1231 
1232 	aBC = ACB    2 + swap
1233 	aBc = AcB    1 + swap
1234 	abC = ACb    missing
1235 	abc = Acb    1 + swap
1236 
1237 	*/
1238 
1239 	// This may or may not be a win when using the IR interpreter...
1240 	// Many more instructions to interpret.
	void IRFrontend::Comp_Vmmul(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
		if (!js.HasNoPrefix()) {
			DISABLE;
		}

		if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
			// Fall back to interpreter, which has the accurate implementation.
			// Later we might do something more optimized here.
			DISABLE;
		}

		// Matrix multiply (weird prefixes)
		// D[0 .. N, 0 .. M] = S[0 .. N, 0 .. M]' * T[0 .. N, 0 .. M]
		// Note: Behaves as if it's implemented through a series of vdots.
		// Important: this is a matrix multiply with a pre-transposed S.

		MatrixSize sz = GetMtxSize(op);
		int n = GetMatrixSide(sz);

		int vs = _VS;
		int vd = _VD;
		int vt = _VT;
		MatrixOverlapType soverlap = GetMatrixOverlap(vs, vd, sz);
		MatrixOverlapType toverlap = GetMatrixOverlap(vt, vd, sz);

		// A very common arrangment. Rearrange to something we can handle.
		if (IsMatrixTransposed(vd)) {
			// Matrix identity says (At * Bt) = (B * A)t
			// D = S * T
			// Dt = (S * T)t = (Tt * St)
			vd = TransposeMatrixReg(vd);
			std::swap(vs, vt);
		}

		u8 sregs[16], tregs[16], dregs[16];
		GetMatrixRegs(sregs, sz, vs);
		GetMatrixRegs(tregs, sz, vt);
		GetMatrixRegs(dregs, sz, vd);

		// Source/destination overlap would need careful ordering; fall back.
		if (soverlap || toverlap) {
			DISABLE;
		}

		// dregs are always consecutive, thanks to our transpose trick.
		// However, not sure this is always worth it.
		if (sz == M_4x4 && IsConsecutive4(dregs)) {
			// TODO: The interpreter would like proper matrix ops better. Can generate those, and
			// expand them like this as needed on "real" architectures.
			int s0 = IRVTEMP_0;
			int s1 = IRVTEMP_PFX_T;
			if (!IsConsecutive4(sregs)) {
				// METHOD 1: Handles AbC and Abc
				// Per output column: accumulate scaled source vec4s into s0,
				// then store the finished column with one vec4 move.
				for (int j = 0; j < 4; j++) {
					ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[j * 4]);
					for (int i = 1; i < 4; i++) {
						ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[j * 4 + i]);
						ir.Write(IROp::Vec4Add, s0, s0, s1);
					}
					ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
				}
				return;
			} else if (IsConsecutive4(tregs)) {
				// METHOD 2: Handles ABC only. Not efficient on CPUs that don't do fast dots.
				// Dots only work if tregs are consecutive.
				// TODO: Skip this and resort to method one and transpose the output?
				for (int j = 0; j < 4; j++) {
					for (int i = 0; i < 4; i++) {
						ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[j * 4]);
					}
					ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
				}
				return;
			} else {
				// ABc - s consecutive, t not.
				// Tekken uses this.
				// logBlocks = 1;
			}
		}

		// Fallback. Expands a LOT
		// Scalar multiply-accumulate per output element; the last FAdd of each
		// element writes straight into dregs to save a move.
		int temp0 = IRVTEMP_0;
		int temp1 = IRVTEMP_0 + 1;
		for (int a = 0; a < n; a++) {
			for (int b = 0; b < n; b++) {
				ir.Write(IROp::FMul, temp0, sregs[b * 4], tregs[a * 4]);
				for (int c = 1; c < n; c++) {
					ir.Write(IROp::FMul, temp1, sregs[b * 4 + c], tregs[a * 4 + c]);
					ir.Write(IROp::FAdd, (c == n - 1) ? dregs[a * 4 + b] : temp0, temp0, temp1);
				}
			}
		}
	}
1334 
	void IRFrontend::Comp_Vtfm(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
		if (!js.HasNoPrefix()) {
			DISABLE;
		}

		// Vertex transform, vector by matrix (weird prefixes)
		// d[N] = s[N*m .. N*m + n-1] dot t[0 .. n-1]
		// Homogenous means t[n-1] is treated as 1.
		// Note: this might be implemented as a series of vdots with special prefixes.

		VectorSize sz = GetVecSize(op);
		MatrixSize msz = GetMtxSize(op);
		int n = GetNumVectorElements(sz);
		int ins = (op >> 23) & 7;

		// When the encoded vector size equals ins, this is the homogenous
		// variant: widen by one and treat the last t element as 1.0.
		bool homogenous = false;
		if (n == ins) {
			n++;
			sz = (VectorSize)((int)(sz)+1);
			msz = (MatrixSize)((int)(msz)+1);
			homogenous = true;
		}
		// Otherwise, n should already be ins + 1.
		else if (n != ins + 1) {
			DISABLE;
		}

		u8 sregs[16], dregs[4], tregs[4];
		GetMatrixRegs(sregs, msz, _VS);
		GetVectorRegs(tregs, sz, _VT);
		GetVectorRegs(dregs, sz, _VD);

		// SIMD-optimized implementations - if sregs[0..3] is non-consecutive, it's transposed.
		if (msz == M_4x4 && !IsConsecutive4(sregs)) {
			int s0 = IRVTEMP_0;
			int s1 = IRVTEMP_PFX_S;
			// For this algorithm, we don't care if tregs are consecutive or not,
			// they are accessed one at a time. This handles homogenous transforms correctly, as well.
			// We take advantage of sregs[0] + 1 being sregs[4] here.
			ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]);
			for (int i = 1; i < 4; i++) {
				if (!homogenous || (i != n - 1)) {
					ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]);
					ir.Write(IROp::Vec4Add, s0, s0, s1);
				} else {
					// Homogenous last element: t is implicitly 1.0, so just add
					// the matrix column without scaling.
					ir.Write(IROp::Vec4Add, s0, s0, sregs[i]);
				}
			}
			if (IsConsecutive4(dregs)) {
				ir.Write(IROp::Vec4Mov, dregs[0], s0);
			} else {
				for (int i = 0; i < 4; i++) {
					ir.Write(IROp::FMov, dregs[i], s0 + i);
				}
			}
			return;
		} else if (msz == M_4x4 && IsConsecutive4(sregs)) {
			// Consecutive, which is harder.
			DISABLE;
			// NOTE: everything below in this branch is unreachable - DISABLE
			// returns above. Kept for reference.
			int s0 = IRVTEMP_0;
			int s1 = IRVTEMP_PFX_S;
			// Doesn't make complete sense to me why this works.... (because it doesn't.)
			ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]);
			for (int i = 1; i < 4; i++) {
				if (!homogenous || (i != n - 1)) {
					ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]);
					ir.Write(IROp::Vec4Add, s0, s0, s1);
				} else {
					ir.Write(IROp::Vec4Add, s0, s0, sregs[i]);
				}
			}
			if (IsConsecutive4(dregs)) {
				ir.Write(IROp::Vec4Mov, dregs[0], s0);
			} else {
				for (int i = 0; i < 4; i++) {
					ir.Write(IROp::FMov, dregs[i], s0 + i);
				}
			}
			return;
		}

		// TODO: test overlap, optimize.
		// Scalar fallback: dot each matrix row with t into a temp, then copy
		// all temps to the destination afterwards (avoids overlap hazards).
		u8 tempregs[4];
		int s0 = IRVTEMP_0;
		int temp1 = IRVTEMP_0 + 1;
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FMul, s0, sregs[i * 4], tregs[0]);
			for (int k = 1; k < n; k++) {
				if (!homogenous || k != n - 1) {
					ir.Write(IROp::FMul, temp1, sregs[i * 4 + k], tregs[k]);
					ir.Write(IROp::FAdd, s0, s0, temp1);
				} else {
					// Homogenous: implicit t[n-1] == 1.0.
					ir.Write(IROp::FAdd, s0, s0, sregs[i * 4 + k]);
				}
			}
			int temp = IRVTEMP_PFX_T + i;
			ir.Write(IROp::FMov, temp, s0);
			tempregs[i] = temp;
		}
		for (int i = 0; i < n; i++) {
			if (tempregs[i] != dregs[i])
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}
1440 
Comp_VCrs(MIPSOpcode op)1441 	void IRFrontend::Comp_VCrs(MIPSOpcode op) {
1442 		CONDITIONAL_DISABLE(VFPU_VEC);
1443 		if (js.HasUnknownPrefix() || js.HasSPrefix() || js.HasTPrefix()) {
1444 			DISABLE;
1445 		}
1446 
1447 		// Vector cross (half a cross product, n = 3)
1448 		// d[0] = s[y]*t[z], d[1] = s[z]*t[x], d[2] = s[x]*t[y]
1449 		// To do a full cross product: vcrs tmp1, s, t; vcrs tmp2 t, s; vsub d, tmp1, tmp2;
1450 		// (or just use vcrsp.)
1451 
1452 		DISABLE;
1453 	}
1454 
Comp_VDet(MIPSOpcode op)1455 	void IRFrontend::Comp_VDet(MIPSOpcode op) {
1456 		CONDITIONAL_DISABLE(VFPU_VEC);
1457 		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixT & 0x000CFCF0) != 0x000E0) {
1458 			DISABLE;
1459 		}
1460 
1461 		// Vector determinant
1462 		// d[0] = s[0]*t[1] - s[1]*t[0]
1463 		// Note: this operates on two vectors, not a 2x2 matrix.
1464 
1465 		DISABLE;
1466 	}
1467 
	void IRFrontend::Comp_Vi2x(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || js.HasSPrefix())
			DISABLE;

		int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
		bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

		// These instructions pack pairs or quads of integers into 32 bits.
		// The unsigned (u) versions skip the sign bit when packing, first doing a signed clamp to 0 (so the sign bit won't ever be 1).

		VectorSize sz = GetVecSize(op);
		VectorSize outsize;
		if (bits == 8) {
			// Four 32-bit values pack to a single 32-bit output.
			outsize = V_Single;
			if (sz != V_Quad) {
				DISABLE;
			}
		} else {
			// 16-bit packing halves the element count.
			switch (sz) {
			case V_Pair:
				outsize = V_Single;
				break;
			case V_Quad:
				outsize = V_Pair;
				break;
			default:
				DISABLE;
			}
		}

		u8 sregs[4], dregs[2], srcregs[4], tempregs[2];
		GetVectorRegsPrefixS(sregs, sz, _VS);
		GetVectorRegsPrefixD(dregs, outsize, _VD);
		memcpy(srcregs, sregs, sizeof(sregs));
		memcpy(tempregs, dregs, sizeof(dregs));

		int nOut = GetNumVectorElements(outsize);

		// If src registers aren't contiguous, make them.
		// (The Vec4 pack ops below require a consecutive source run.)
		if (sz == V_Quad && !IsConsecutive4(sregs)) {
			// T prefix is unused.
			for (int i = 0; i < 4; i++) {
				srcregs[i] = IRVTEMP_PFX_T + i;
				ir.Write(IROp::FMov, srcregs[i], sregs[i]);
			}
		}

		if (bits == 8) {
			if (unsignedOp) {  //vi2uc
				// Output is only one register.
				// Clamp negatives to zero first, then pack the top bits.
				ir.Write(IROp::Vec4ClampToZero, IRVTEMP_0, srcregs[0]);
				ir.Write(IROp::Vec4Pack31To8, tempregs[0], IRVTEMP_0);
			} else {  //vi2c
				ir.Write(IROp::Vec4Pack32To8, tempregs[0], srcregs[0]);
			}
		} else {
			// bits == 16
			if (unsignedOp) {  //vi2us
				// Output is only one register.
				ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0, srcregs[0]);
				ir.Write(IROp::Vec2Pack31To16, tempregs[0], IRVTEMP_0);
				if (outsize == V_Pair) {
					// Second pair of inputs packs into the second output reg.
					ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0 + 2, srcregs[2]);
					ir.Write(IROp::Vec2Pack31To16, tempregs[1], IRVTEMP_0 + 2);
				}
			} else {  //vi2s
				ir.Write(IROp::Vec2Pack32To16, tempregs[0], srcregs[0]);
				if (outsize == V_Pair) {
					ir.Write(IROp::Vec2Pack32To16, tempregs[1], srcregs[2]);
				}
			}
		}

		// Copy any temp outputs into the real destination registers.
		for (int i = 0; i < nOut; i++) {
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}

		ApplyPrefixD(dregs, outsize);
	}
1550 
	void IRFrontend::Comp_Vx2i(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || js.HasSPrefix())
			DISABLE;

		int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
		bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)

		// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
		// at the top.  vus2i shifts it an extra bit right afterward.
		// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
		// at the top too.  vuc2i is a bit special (see below.)
		// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
		// then use it for both.

		VectorSize sz = GetVecSize(op);
		VectorSize outsize;
		if (bits == 8) {
			outsize = V_Quad;
			sz = V_Single;  // For some reason, sz is set to Quad in this case though the outsize is Single.
		} else {
			// 16-bit unpacking doubles the element count.
			switch (sz) {
			case V_Single:
				outsize = V_Pair;
				break;
			case V_Pair:
				outsize = V_Quad;
				break;
			default:
				DISABLE;
			}
		}

		u8 sregs[2], dregs[4], tempregs[4], srcregs[2];
		GetVectorRegsPrefixS(sregs, sz, _VS);
		GetVectorRegsPrefixD(dregs, outsize, _VD);
		memcpy(tempregs, dregs, sizeof(dregs));
		memcpy(srcregs, sregs, sizeof(sregs));

		// Remap source regs to be consecutive. This is not required
		// but helpful when implementations can join two Vec2Expand.
		if (sz == V_Pair && !IsConsecutive2(srcregs)) {
			for (int i = 0; i < 2; i++) {
				srcregs[i] = IRVTEMP_0 + i;
				ir.Write(IROp::FMov, srcregs[i], sregs[i]);
			}
		}

		int nIn = GetNumVectorElements(sz);

		int nOut = 2;
		if (outsize == V_Quad)
			nOut = 4;
		// Remap dest regs. PFX_T is unused.
		// The unpack ops write consecutive outputs, so route through temps when
		// the destination isn't consecutive or overlaps the source.
		if (outsize == V_Pair) {
			bool consecutive = IsConsecutive2(dregs);
			// We must have them consecutive, so all temps, or none.
			if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
				for (int i = 0; i < nOut; i++) {
					tempregs[i] = IRVTEMP_PFX_T + i;
				}
			}
		} else if (outsize == V_Quad) {
			bool consecutive = IsConsecutive4(dregs);
			if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
				for (int i = 0; i < nOut; i++) {
					tempregs[i] = IRVTEMP_PFX_T + i;
				}
			}
		}

		if (bits == 16) {
			if (unsignedOp) {
				ir.Write(IROp::Vec2Unpack16To31, tempregs[0], srcregs[0]);
				if (outsize == V_Quad)
					ir.Write(IROp::Vec2Unpack16To31, tempregs[2], srcregs[1]);
			} else {
				ir.Write(IROp::Vec2Unpack16To32, tempregs[0], srcregs[0]);
				if (outsize == V_Quad)
					ir.Write(IROp::Vec2Unpack16To32, tempregs[2], srcregs[1]);
			}
		} else if (bits == 8) {
			if (unsignedOp) {
				// See the interpreter, this one is odd. Hardware bug?
				ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
				ir.Write(IROp::Vec4DuplicateUpperBitsAndShift1, tempregs[0], tempregs[0]);
			} else {
				ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
			}
		}

		// Copy any temp outputs into the real destination registers.
		for (int i = 0; i < nOut; i++) {
			if (tempregs[i] != dregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}
		ApplyPrefixD(dregs, outsize);
	}
1649 
Comp_VCrossQuat(MIPSOpcode op)1650 	void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) {
1651 		CONDITIONAL_DISABLE(VFPU_VEC);
1652 		if (!js.HasNoPrefix())
1653 			DISABLE;
1654 
1655 		// Vector cross product (n = 3, weird prefixes)
1656 		// d[0 .. 2] = s[0 .. 2] X t[0 .. 2]
1657 		// Vector quaternion product (n = 4, weird prefixes)
1658 		// d[0 .. 2] = t[0 .. 2] X s[0 .. 2] + s[3] * t[0 .. 2] + t[3] * s[0 .. 2]
1659 		// d[3] = s[3]*t[3] - s[0 .. 2] dot t[0 .. 3]
1660 		// Note: Behaves as if it's implemented through a series of vdots.
1661 
1662 		VectorSize sz = GetVecSize(op);
1663 		int n = GetNumVectorElements(sz);
1664 
1665 		u8 sregs[4], tregs[4], dregs[4];
1666 		GetVectorRegs(sregs, sz, _VS);
1667 		GetVectorRegs(tregs, sz, _VT);
1668 		GetVectorRegs(dregs, sz, _VD);
1669 
1670 		u8 tempregs[4];
1671 		for (int i = 0; i < n; ++i) {
1672 			if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
1673 				tempregs[i] = IRVTEMP_PFX_T + i;   // using IRTEMP0 for other things
1674 			} else {
1675 				tempregs[i] = dregs[i];
1676 			}
1677 		}
1678 
1679 		if (sz == V_Triple) {
1680 			int temp0 = IRVTEMP_0;
1681 			int temp1 = IRVTEMP_0 + 1;
1682 			// Compute X
1683 			ir.Write(IROp::FMul, temp0, sregs[1], tregs[2]);
1684 			ir.Write(IROp::FMul, temp1, sregs[2], tregs[1]);
1685 			ir.Write(IROp::FSub, tempregs[0], temp0, temp1);
1686 
1687 			// Compute Y
1688 			ir.Write(IROp::FMul, temp0, sregs[2], tregs[0]);
1689 			ir.Write(IROp::FMul, temp1, sregs[0], tregs[2]);
1690 			ir.Write(IROp::FSub, tempregs[1], temp0, temp1);
1691 
1692 			// Compute Z
1693 			ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]);
1694 			ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]);
1695 			ir.Write(IROp::FSub, tempregs[2], temp0, temp1);
1696 		} else if (sz == V_Quad) {
1697 			DISABLE;
1698 		} else {
1699 			DISABLE;
1700 		}
1701 
1702 		for (int i = 0; i < n; i++) {
1703 			if (tempregs[i] != dregs[i])
1704 				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
1705 		}
1706 	}
1707 
Comp_Vcmp(MIPSOpcode op)1708 	void IRFrontend::Comp_Vcmp(MIPSOpcode op) {
1709 		CONDITIONAL_DISABLE(VFPU_COMP);
1710 		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
1711 			DISABLE;
1712 		}
1713 
1714 		// Vector compare
1715 		// VFPU_CC[N] = COMPARE(s[N], t[N])
1716 
1717 		VectorSize sz = GetVecSize(op);
1718 		int n = GetNumVectorElements(sz);
1719 
1720 		u8 sregs[4], tregs[4];
1721 		GetVectorRegsPrefixS(sregs, sz, _VS);
1722 		GetVectorRegsPrefixT(tregs, sz, _VT);
1723 
1724 		int cond = op & 0xF;
1725 		int mask = 0;
1726 		for (int i = 0; i < n; i++) {
1727 			ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]);
1728 			mask |= (1 << i);
1729 		}
1730 		ir.Write(IROp::FCmpVfpuAggregate, mask);
1731 	}
1732 
Comp_Vcmov(MIPSOpcode op)1733 	void IRFrontend::Comp_Vcmov(MIPSOpcode op) {
1734 		CONDITIONAL_DISABLE(VFPU_COMP);
1735 		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
1736 			DISABLE;
1737 		}
1738 
1739 		// Vector conditional move
1740 		// imm3 >= 6: d[N] = VFPU_CC[N] == tf ? s[N] : d[N]
1741 		// imm3 < 6:  d[N] = VFPU_CC[imm3] == tf ? s[N] : d[N]
1742 
1743 		VectorSize sz = GetVecSize(op);
1744 		int n = GetNumVectorElements(sz);
1745 
1746 		u8 sregs[4], dregs[4];
1747 		GetVectorRegsPrefixS(sregs, sz, _VS);
1748 		GetVectorRegsPrefixD(dregs, sz, _VD);
1749 		int tf = (op >> 19) & 1;
1750 		int imm3 = (op >> 16) & 7;
1751 
1752 		for (int i = 0; i < n; ++i) {
1753 			// Simplification: Disable if overlap unsafe
1754 			if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
1755 				DISABLE;
1756 			}
1757 		}
1758 		if (imm3 < 6) {
1759 			// Test one bit of CC. This bit decides whether none or all subregisters are copied.
1760 			for (int i = 0; i < n; i++) {
1761 				ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (imm3) | ((!tf) << 7));
1762 			}
1763 		} else {
1764 			// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
1765 			for (int i = 0; i < n; i++) {
1766 				ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7));
1767 			}
1768 		}
1769 		ApplyPrefixD(dregs, sz);
1770 	}
1771 
Comp_Viim(MIPSOpcode op)1772 	void IRFrontend::Comp_Viim(MIPSOpcode op) {
1773 		CONDITIONAL_DISABLE(VFPU_XFER);
1774 		if (js.HasUnknownPrefix())
1775 			DISABLE;
1776 
1777 		// Vector integer immediate
1778 		// d[0] = float(imm)
1779 
1780 		s32 imm = SignExtend16ToS32(op);
1781 		u8 dreg;
1782 		GetVectorRegsPrefixD(&dreg, V_Single, _VT);
1783 		ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm));
1784 		ApplyPrefixD(&dreg, V_Single);
1785 	}
1786 
Comp_Vfim(MIPSOpcode op)1787 	void IRFrontend::Comp_Vfim(MIPSOpcode op) {
1788 		CONDITIONAL_DISABLE(VFPU_XFER);
1789 		if (js.HasUnknownPrefix())
1790 			DISABLE;
1791 
1792 		// Vector half-float immediate
1793 		// d[0] = float(imm)
1794 
1795 		FP16 half;
1796 		half.u = op & 0xFFFF;
1797 		FP32 fval = half_to_float_fast5(half);
1798 
1799 		u8 dreg;
1800 		GetVectorRegsPrefixD(&dreg, V_Single, _VT);
1801 		ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f));
1802 		ApplyPrefixD(&dreg, V_Single);
1803 	}
1804 
Comp_Vcst(MIPSOpcode op)1805 	void IRFrontend::Comp_Vcst(MIPSOpcode op) {
1806 		CONDITIONAL_DISABLE(VFPU_XFER);
1807 		if (js.HasUnknownPrefix())
1808 			DISABLE;
1809 
1810 		// Vector constant
1811 		// d[N] = CONST
1812 
1813 		int conNum = (op >> 16) & 0x1f;
1814 		int vd = _VD;
1815 
1816 		VectorSize sz = GetVecSize(op);
1817 		int n = GetNumVectorElements(sz);
1818 
1819 		u8 dregs[4];
1820 		GetVectorRegsPrefixD(dregs, sz, vd);
1821 		for (int i = 0; i < n; i++) {
1822 			ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));
1823 		}
1824 		ApplyPrefixD(dregs, sz);
1825 	}
1826 
1827 	// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
1828 	// calling the math library.
	void IRFrontend::Comp_VRot(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (!js.HasNoPrefix()) {
			// Prefixes work strangely for this:
			//  * They never apply to cos (whether d or s prefixes.)
			//  * They mostly apply to sin/0, e.g. 0:1, M, or |x|.
			DISABLE;
		}

		// Vector rotation matrix (weird prefixes)
		// d[N] = SINCOSVAL(s[0], imm[N])
		// The imm selects: cos index, sin index, 0 or sin for others, sin sign flip.

		int vd = _VD;
		int vs = _VS;
		int imm = (op >> 16) & 0x1f;
		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);
		bool negSin = (imm & 0x10) ? true : false;

		// Build a per-lane plan: '0' = constant zero, 's' = sin, 'c' = cos.
		char d[4] = { '0', '0', '0', '0' };
		// When the sin and cos indices coincide, all other lanes get sin too.
		if (((imm >> 2) & 3) == (imm & 3)) {
			for (int i = 0; i < 4; i++)
				d[i] = 's';
		}
		// Then the explicitly selected lanes (cos wins if both land on one lane).
		d[(imm >> 2) & 3] = 's';
		d[imm & 3] = 'c';

		u8 dregs[4];
		GetVectorRegs(dregs, sz, vd);
		u8 sreg[1];
		GetVectorRegs(sreg, V_Single, vs);
		// Emit the planned value for each destination lane.
		for (int i = 0; i < n; i++) {
			switch (d[i]) {
			case '0':
				ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(0.0f));
				break;
			case 's':
				ir.Write(IROp::FSin, dregs[i], sreg[0]);
				// imm bit 4 flips the sign of all sin outputs.
				if (negSin) {
					ir.Write(IROp::FNeg, dregs[i], dregs[i]);
				}
				break;
			case 'c':
				ir.Write(IROp::FCos, dregs[i], sreg[0]);
				break;
			}
		}
	}
1878 
	void IRFrontend::Comp_Vsgn(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
			DISABLE;
		}

		// Vector extract sign
		// d[N] = signum(s[N])

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		u8 sregs[4], dregs[4];
		GetVectorRegsPrefixS(sregs, sz, _VS);
		GetVectorRegsPrefixD(dregs, sz, _VD);

		// Route overlapping outputs through temps and copy back at the end.
		u8 tempregs[4];
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs)) {
				// NOTE(review): sibling ops here use IRVTEMP_0 / IRVTEMP_PFX_T
				// for float temps; IRTEMP_0 is the GPR-space temp - confirm
				// this is intended in a float-register context.
				tempregs[i] = IRTEMP_0 + i;
			} else {
				tempregs[i] = dregs[i];
			}
		}

		for (int i = 0; i < n; ++i) {
			ir.Write(IROp::FSign, tempregs[i], sregs[i]);
		}

		for (int i = 0; i < n; ++i) {
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}

		ApplyPrefixD(dregs, sz);
	}
1916 
	void IRFrontend::Comp_Vocp(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {
			DISABLE;
		}

		// Vector one's complement
		// d[N] = 1.0 - s[N]

		VectorSize sz = GetVecSize(op);
		int n = GetNumVectorElements(sz);

		// This is a hack that modifies prefixes.  We eat them later, so just overwrite.
		// S prefix forces the negate flags.
		js.prefixS |= 0x000F0000;
		// T prefix forces constants on and regnum to 1.
		// That means negate still works, and abs activates a different constant.
		js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;

		u8 sregs[4], tregs[4], dregs[4];
		GetVectorRegsPrefixS(sregs, sz, _VS);
		// There's no bits for t, so just reuse s.  It'll be constants only.
		GetVectorRegsPrefixT(tregs, sz, _VS);
		GetVectorRegsPrefixD(dregs, sz, _VD);

		// Route overlapping outputs through temps and copy back at the end.
		u8 tempregs[4];
		for (int i = 0; i < n; ++i) {
			if (!IsOverlapSafe(dregs[i], n, sregs)) {
				tempregs[i] = IRVTEMP_0 + i;
			} else {
				tempregs[i] = dregs[i];
			}
		}

		// With the forced prefixes above, tregs yields the constant 1.0 and
		// sregs yields -s[N], so this add computes 1.0 - s[N].
		for (int i = 0; i < n; ++i) {
			ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]);
		}
		for (int i = 0; i < n; ++i) {
			if (dregs[i] != tempregs[i]) {
				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
			}
		}

		ApplyPrefixD(dregs, sz);
	}
1962 
	void IRFrontend::Comp_ColorConv(MIPSOpcode op) {
		CONDITIONAL_DISABLE(VFPU_VEC);
		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
			DISABLE;
		}

		// Vector color conversion
		// d[N] = ConvertTo16(s[N*2]) | (ConvertTo16(s[N*2+1]) << 16)

		// No IR implementation yet — unconditionally fall back to the interpreter.
		DISABLE;
	}
1974 
Comp_Vbfy(MIPSOpcode op)1975 	void IRFrontend::Comp_Vbfy(MIPSOpcode op) {
1976 		CONDITIONAL_DISABLE(VFPU_VEC);
1977 		if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
1978 			DISABLE;
1979 		}
1980 
1981 		// Vector butterfly operation
1982 		// vbfy2: d[0] = s[0] + s[2], d[1] = s[1] + s[3], d[2] = s[0] - s[2], d[3] = s[1] - s[3]
1983 		// vbfy1: d[N*2] = s[N*2] + s[N*2+1], d[N*2+1] = s[N*2] - s[N*2+1]
1984 
1985 		VectorSize sz = GetVecSize(op);
1986 		int n = GetNumVectorElements(sz);
1987 		if (n != 2 && n != 4) {
1988 			// Bad instructions
1989 			INVALIDOP;
1990 		}
1991 
1992 		u8 sregs[4], dregs[4];
1993 		GetVectorRegsPrefixS(sregs, sz, _VS);
1994 		GetVectorRegsPrefixD(dregs, sz, _VD);
1995 
1996 		u8 tempregs[4];
1997 		for (int i = 0; i < n; ++i) {
1998 			if (!IsOverlapSafe(dregs[i], n, sregs)) {
1999 				tempregs[i] = IRVTEMP_0 + i;
2000 			} else {
2001 				tempregs[i] = dregs[i];
2002 			}
2003 		}
2004 
2005 		int subop = (op >> 16) & 0x1F;
2006 		if (subop == 3 && n == 4) {
2007 			// vbfy2
2008 			ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[2]);
2009 			ir.Write(IROp::FAdd, tempregs[1], sregs[1], sregs[3]);
2010 			ir.Write(IROp::FSub, tempregs[2], sregs[0], sregs[2]);
2011 			ir.Write(IROp::FSub, tempregs[3], sregs[1], sregs[3]);
2012 		} else if (subop == 2) {
2013 			// vbfy1
2014 			ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[1]);
2015 			ir.Write(IROp::FSub, tempregs[1], sregs[0], sregs[1]);
2016 			if (n == 4) {
2017 				ir.Write(IROp::FAdd, tempregs[2], sregs[2], sregs[3]);
2018 				ir.Write(IROp::FSub, tempregs[3], sregs[2], sregs[3]);
2019 			}
2020 		} else {
2021 			INVALIDOP;
2022 		}
2023 
2024 		for (int i = 0; i < n; ++i) {
2025 			if (tempregs[i] != dregs[i])
2026 				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
2027 		}
2028 
2029 		ApplyPrefixD(dregs, sz);
2030 	}
2031 }
2032