// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <cmath>

#include "Common/CPUDetect.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Math/math_util.h"
#include "Core/Compatibility.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/MIPSCodeUtils.h"
#include "Core/MIPS/IR/IRFrontend.h"
#include "Core/MIPS/IR/IRRegCache.h"
#include "Core/Reporting.h"
#include "Core/System.h"

// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE.

// #define CONDITIONAL_DISABLE(flag) { Comp_Generic(op); return; }
#define CONDITIONAL_DISABLE(flag) if (opts.disableFlags & (uint32_t)JitDisable::flag) { Comp_Generic(op); return; }
#define DISABLE { Comp_Generic(op); return; }
#define INVALIDOP { Comp_Generic(op); return; }

#define _RS MIPS_GET_RS(op)
#define _RT MIPS_GET_RT(op)
#define _RD MIPS_GET_RD(op)
#define _FS MIPS_GET_FS(op)
#define _FT MIPS_GET_FT(op)
#define _FD MIPS_GET_FD(op)
#define _SA MIPS_GET_SA(op)
#define _POS ((op>> 6) & 0x1F)
#define _SIZE ((op>>11) & 0x1F)
#define _IMM16 (signed short)(op & 0xFFFF)
#define _IMM26 (op & 0x03FFFFFF)

const int vfpuBase = 32;  // skip the FP registers

namespace MIPSComp {

static void ApplyVoffset(u8 regs[4], int count) {
	for (int i = 0; i < count; i++) {
		regs[i] = vfpuBase + voffset[regs[i]];
	}
}

static bool IsConsecutive2(const u8 regs[2]) {
	return regs[1] == regs[0] + 1;
}

static bool IsConsecutive4(const u8 regs[4]) {
	return regs[1] == regs[0] + 1 &&
		regs[2] == regs[1] + 1 &&
		regs[3] == regs[2] + 1;
}

// Vector regs can overlap in all sorts of swizzled ways.
// This does allow a single overlap in sregs[i].
static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) {
	for (int i = 0; i < sn; ++i) {
		if (sregs[i] == dreg && i != di)
			return false;
	}
	for (int i = 0; i < tn; ++i) {
		if (tregs[i] == dreg)
			return false;
	}

	// Hurray, no overlap, we can write directly.
	return true;
}

static bool IsOverlapSafeAllowS(int dn, u8 dregs[], int sn, u8 sregs[], int tn = 0, u8 tregs[] = nullptr) {
	for (int i = 0; i < dn; ++i) {
		if (!IsOverlapSafeAllowS(dregs[i], i, sn, sregs, tn, tregs)) {
			return false;
		}
	}
	return true;
}

static bool IsOverlapSafe(int dreg, int sn, u8 sregs[], int tn = 0, u8 tregs[] = nullptr) {
	return IsOverlapSafeAllowS(dreg, -1, sn, sregs, tn, tregs);
}

static bool IsOverlapSafe(int dn, u8 dregs[], int sn, u8 sregs[], int tn = 0, u8 tregs[] = nullptr) {
	for (int i = 0; i < dn; ++i) {
		if (!IsOverlapSafe(dregs[i], sn, sregs, tn, tregs)) {
			return false;
		}
	}
	return true;
}

static bool IsPrefixWithinSize(u32 prefix, VectorSize sz) {
	int n = GetNumVectorElements(sz);
	for (int i = n; i < 4; i++) {
		int regnum = (prefix >> (i * 2)) & 3;
		int abs = (prefix >> (8 + i)) & 1;
		int negate = (prefix >> (16 + i)) & 1;
		int constants = (prefix >> (12 + i)) & 1;
		if (regnum < n || abs || negate || constants) {
			return false;
		}
	}

	return true;
}

static bool IsPrefixWithinSize(u32 prefix, MIPSOpcode op) {
	return IsPrefixWithinSize(prefix, GetVecSize(op));
}

void IRFrontend::Comp_VPFX(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	// This is how prefixes are typically set.
	int data = op & 0xFFFFF;
	int regnum = (op >> 24) & 3;
	switch (regnum) {
	case 0: // S
		js.prefixS = data;
		js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 1: // T
		js.prefixT = data;
		js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	case 2: // D
		js.prefixD = data & 0x00000FFF;
		js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
		break;
	default:
		ERROR_LOG(CPU, "VPFX - bad regnum %i : data=%08x", regnum, data);
		break;
	}
}

static void InitRegs(u8 *vregs, int reg) {
	vregs[0] = reg;
	vregs[1] = reg + 1;
	vregs[2] = reg + 2;
	vregs[3] = reg + 3;
}

void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg) {
	if (prefix == 0xE4)
		return;

	int n = GetNumVectorElements(sz);
	u8 origV[4];
	static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };

	for (int i = 0; i < n; i++)
		origV[i] = vregs[i];

	// Some common vector prefixes
	if (sz == V_Quad && IsConsecutive4(vregs)) {
		if (prefix == 0xF00E4) {
			InitRegs(vregs, tempReg);
			ir.Write(IROp::Vec4Neg, vregs[0], origV[0]);
			return;
		}
		if (prefix == 0x00FE4) {
			InitRegs(vregs, tempReg);
			ir.Write(IROp::Vec4Abs, vregs[0], origV[0]);
			return;
		}
		// Pure shuffle
		if (prefix == (prefix & 0xFF)) {
			InitRegs(vregs, tempReg);
			ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix);
			return;
		}
	}

	// Alright, fall back to the generic approach.
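	// Bit layout of an S/T prefix word, as decoded element by element below:
	//   bits  0-7  source swizzle, 2 bits per lane (0xE4 = 0b11'10'01'00 is the identity, hence the early-out above)
	//   bits  8-11 abs per lane (or the high constant-select bit when the constant flag is set)
	//   bits 12-15 constant enable per lane (pick from constantArray instead of a register)
	//   bits 16-19 negate per lane
	// For example, a prefix of 0x000000FF swizzles every lane from s[3] (a splat of the last element).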
	for (int i = 0; i < n; i++) {
		int regnum = (prefix >> (i * 2)) & 3;
		int abs = (prefix >> (8 + i)) & 1;
		int negate = (prefix >> (16 + i)) & 1;
		int constants = (prefix >> (12 + i)) & 1;

		// Unchanged, hurray.
		if (!constants && regnum == i && !abs && !negate)
			continue;

		// This puts the value into a temp reg, so we won't write the modified value back.
		vregs[i] = tempReg + i;
		if (!constants) {
			if (regnum >= n) {
				// Depends on the op, but often zero.
				ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(0.0f));
			} else if (abs) {
				ir.Write(IROp::FAbs, vregs[i], origV[regnum]);
				if (negate)
					ir.Write(IROp::FNeg, vregs[i], vregs[i]);
			} else {
				if (negate)
					ir.Write(IROp::FNeg, vregs[i], origV[regnum]);
				else
					ir.Write(IROp::FMov, vregs[i], origV[regnum]);
			}
		} else {
			if (negate) {
				ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(-constantArray[regnum + (abs << 2)]));
			} else {
				ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(constantArray[regnum + (abs << 2)]));
			}
		}
	}
}

void IRFrontend::GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {
	::GetVectorRegs(regs, N, vectorReg);
	ApplyVoffset(regs, N);
}

void IRFrontend::GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {
	::GetMatrixRegs(regs, N, matrixReg);
	for (int i = 0; i < GetMatrixSide(N); i++) {
		ApplyVoffset(regs + 4 * i, GetVectorSize(N));
	}
}

void IRFrontend::GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixSFlag & JitState::PREFIX_KNOWN);
	GetVectorRegs(regs, sz, vectorReg);
	ApplyPrefixST(regs, js.prefixS, sz, IRVTEMP_PFX_S);
}

void IRFrontend::GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixTFlag & JitState::PREFIX_KNOWN);
	GetVectorRegs(regs, sz, vectorReg);
	ApplyPrefixST(regs, js.prefixT, sz, IRVTEMP_PFX_T);
}

void IRFrontend::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);

	GetVectorRegs(regs, sz, vectorReg);
	int n = GetNumVectorElements(sz);
	if (js.prefixD == 0)
		return;

	for (int i = 0; i < n; i++) {
		// Hopefully this is rare, we'll just write it into a dumping ground reg.
		if (js.VfpuWriteMask(i))
			regs[i] = IRVTEMP_PFX_D + i;
	}
}

inline int GetDSat(int prefix, int i) {
	return (prefix >> (i * 2)) & 3;
}

// "D" prefix is really a post process. No need to allocate a temporary register (except
// dummies to simulate writemask, which is done in GetVectorRegsPrefixD
void IRFrontend::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
	_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
	if (!js.prefixD)
		return;

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		if (js.VfpuWriteMask(i))
			continue;
		int sat = GetDSat(js.prefixD, i);
		if (sat == 1) {
			// clamped = x < 0 ? (x > 1 ? 1 : x) : x [0, 1]
			ir.Write(IROp::FSat0_1, vregs[i], vregs[i]);
		} else if (sat == 3) {
			ir.Write(IROp::FSatMinus1_1, vregs[i], vregs[i]);
		}
	}
}

void IRFrontend::Comp_SV(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	s32 offset = (signed short)(op & 0xFFFC);
	int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
	MIPSGPReg rs = _RS;

	CheckMemoryBreakpoint(rs, offset);

	switch (op >> 26) {
	case 50: //lv.s
		ir.Write(IROp::LoadFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));
		break;

	case 58: //sv.s
		ir.Write(IROp::StoreFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset));
		break;

	default:
		INVALIDOP;
	}
}

void IRFrontend::Comp_SVQ(MIPSOpcode op) {
	CONDITIONAL_DISABLE(LSU_VFPU);
	int imm = (signed short)(op & 0xFFFC);
	int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5);
	MIPSGPReg rs = _RS;

	u8 vregs[4];
	GetVectorRegs(vregs, V_Quad, vt);

	CheckMemoryBreakpoint(rs, imm);

	switch (op >> 26) {
	case 54: //lv.q
		if (IsConsecutive4(vregs)) {
			ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm));
		} else {
			// Let's not even bother with "vertical" loads for now.
			ir.Write(IROp::LoadFloat, vregs[0], rs, ir.AddConstant(imm));
			ir.Write(IROp::LoadFloat, vregs[1], rs, ir.AddConstant(imm + 4));
			ir.Write(IROp::LoadFloat, vregs[2], rs, ir.AddConstant(imm + 8));
			ir.Write(IROp::LoadFloat, vregs[3], rs, ir.AddConstant(imm + 12));
		}
		break;

	case 62: //sv.q
		if (IsConsecutive4(vregs)) {
			ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm));
		} else {
			// Let's not even bother with "vertical" stores for now.
			ir.Write(IROp::StoreFloat, vregs[0], rs, ir.AddConstant(imm));
			ir.Write(IROp::StoreFloat, vregs[1], rs, ir.AddConstant(imm + 4));
			ir.Write(IROp::StoreFloat, vregs[2], rs, ir.AddConstant(imm + 8));
			ir.Write(IROp::StoreFloat, vregs[3], rs, ir.AddConstant(imm + 12));
		}
		break;

	case 53: // lvl/lvr.q - highly unusual
	case 61: // svl/svr.q - highly unusual
		DISABLE;
		break;

	default:
		INVALIDOP;
	}
}

void IRFrontend::Comp_VVectorInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix() || js.HasSPrefix()) {
		DISABLE;
	}

	// Vector init
	// d[N] = CONST[N]
	// Note: probably implemented as vmov with prefix hack.

	VectorSize sz = GetVecSize(op);
	int type = (op >> 16) & 0xF;
	int vd = _VD;
	int n = GetNumVectorElements(sz);
	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, vd);

	if (sz == V_Quad && IsConsecutive4(dregs)) {
		ir.Write(IROp::Vec4Init, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE));
	} else {
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f));
		}
	}
	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_VIdt(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix() || js.HasSPrefix()) {
		DISABLE;
	}

	// Vector identity row
	// d[N] = IDENTITY[N,m]
	// Note: probably implemented as vmov with prefix hack.
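	// For example, when (vd & 3) == 2, vidt.q produces (0, 0, 1, 0): the 1 lands in lane
	// (vd & 3) and every other lane becomes 0, which is what the row selection below does.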

	int vd = _VD;
	VectorSize sz = GetVecSize(op);
	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, vd);

	if (sz == 4 && IsConsecutive4(dregs)) {
		int row = vd & 3;
		Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row);
		ir.Write(IROp::Vec4Init, dregs[0], (int)init);
	} else {
		switch (sz) {
		case V_Pair:
			ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 1) == 0 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 1) == 1 ? 1.0f : 0.0f));
			break;
		case V_Quad:
			ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 3) == 0 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 3) == 1 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[2], ir.AddConstantFloat((vd & 3) == 2 ? 1.0f : 0.0f));
			ir.Write(IROp::SetConstF, dregs[3], ir.AddConstantFloat((vd & 3) == 3 ? 1.0f : 0.0f));
			break;
		default:
			INVALIDOP;
		}
	}

	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	MatrixSize sz = GetMtxSize(op);
	if (sz != M_4x4 || !js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix init (weird prefixes)
	// d[N,M] = CONST[N,M]

	// Not really about trying here, it will work if enabled.
	VectorSize vsz = GetVectorSize(sz);
	u8 vecs[4];
	int vd = _VD;
	if (IsMatrixTransposed(vd)) {
		// All outputs are transpositionally symmetric, so should be fine.
		vd = TransposeMatrixReg(vd);
	}
	GetMatrixColumns(vd, M_4x4, vecs);
	for (int i = 0; i < 4; i++) {
		u8 vec[4];
		GetVectorRegs(vec, vsz, vecs[i]);
		// As they are columns, they will be nicely consecutive.
		Vec4Init init;
		switch ((op >> 16) & 0xF) {
		case 3:
			init = Vec4Init((int)Vec4Init::Set_1000 + i);
			break;
		case 6:
			init = Vec4Init::AllZERO;
			break;
		case 7:
			init = Vec4Init::AllONE;
			break;
		default:
			return;
		}
		ir.Write(IROp::Vec4Init, vec[0], (int)init);
	}
}

void IRFrontend::Comp_VHdp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector homogenous dot product
	// d[0] = s[0 .. n-2] dot t[0 .. n-2] + t[n-1]
	// Note: s[n-1] is ignored.

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	if (js.prefixS & (0x0101 << (8 + n - 1)))
		DISABLE;

	// TODO: Force read one of them into regs? probably not.
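	// The accumulation below computes s[0]*t[0] + ... + s[n-2]*t[n-2] + t[n-1], i.e. the last
	// lane of s is effectively replaced by 1.0. The prefix check above falls back to the
	// interpreter if the S prefix applies abs or negate to that last lane.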
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	ir.Write(IROp::FMul, IRVTEMP_0, sregs[0], tregs[0]);

	for (int i = 1; i < n; i++) {
		if (i == n - 1) {
			ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, tregs[i]);
		} else {
			ir.Write(IROp::FMul, IRVTEMP_0 + 1, sregs[i], tregs[i]);
			ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, IRVTEMP_0 + 1);
		}
	}

	ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
	ApplyPrefixD(dregs, V_Single);
}

alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };

void IRFrontend::Comp_Vhoriz(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector horizontal add
	// d[0] = s[0] + ... s[n-1]
	// Vector horizontal average
	// d[0] = s[0] / n + ... s[n-1] / n
	// Note: Both are implemented as dot products against generated constants.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, V_Single, _VD);

	// We have to start at +0.000 in case any values are -0.000.
	ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(0.0f));
	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, sregs[i]);
	}

	switch ((op >> 16) & 31) {
	case 6: // vfad
		ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
		break;
	case 7: // vavg
		ir.Write(IROp::SetConstF, IRVTEMP_0 + 1, ir.AddConstantFloat(vavg_table[n - 1]));
		ir.Write(IROp::FMul, dregs[0], IRVTEMP_0, IRVTEMP_0 + 1);
		break;
	}

	ApplyPrefixD(dregs, V_Single);
}

void IRFrontend::Comp_VDot(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector dot product
	// d[0] = s[0 .. n-1] dot t[0 .. n-1]

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4], dregs[1];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixT(tregs, sz, vt);
	GetVectorRegsPrefixD(dregs, V_Single, vd);

	if (sz == V_Quad && IsConsecutive4(sregs) && IsConsecutive4(tregs) && IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {
		ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);
		ApplyPrefixD(dregs, V_Single);
		return;
	}

	int temp0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	ir.Write(IROp::FMul, temp0, sregs[0], tregs[0]);
	for (int i = 1; i < n; i++) {
		ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]);
		ir.Write(IROp::FAdd, i == (n - 1) ? dregs[0] : temp0, temp0, temp1);
	}
	ApplyPrefixD(dregs, V_Single);
}

void IRFrontend::Comp_VecDo3(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector arithmetic
	// d[N] = OP(s[N], t[N]) (see below)

	// Check that we can support the ops, and prepare temporary values for ops that need it.
	bool allowSIMD = true;
	switch (op >> 26) {
	case 24: //VFPU0
		switch ((op >> 23) & 7) {
		case 0: // d[i] = s[i] + t[i]; break; //vadd
		case 1: // d[i] = s[i] - t[i]; break; //vsub
			break;
		case 7: // d[i] = s[i] / t[i]; break; //vdiv
			if (!js.HasNoPrefix()) {
				DISABLE;
			}
			break;
		default:
			INVALIDOP;
		}
		break;
	case 25: //VFPU1
		switch ((op >> 23) & 7) {
		case 0: // d[i] = s[i] * t[i]; break; //vmul
			break;
		default:
			INVALIDOP;
		}
		break;
	case 27: //VFPU3
		switch ((op >> 23) & 7) {
		case 2: // vmin
		case 3: // vmax
			allowSIMD = false;
			break;
		case 6: // vsge
		case 7: // vslt
			allowSIMD = false;
			break;
		default:
			INVALIDOP;
		}
		break;
	default:
		INVALIDOP;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; i++) {
		if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	// If all three are consecutive 4, we're safe regardless of whether we use temps, so we should not check that here.
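	// Only vadd/vsub/vmul/vdiv can take the Vec4 fast path below. vmin/vmax/vsge/vslt keep
	// allowSIMD = false above, presumably because their exact comparison/NaN semantics don't
	// map cleanly onto a single Vec4 IR op, so they always go through the per-lane loop.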
	if (allowSIMD && sz == V_Quad && IsConsecutive4(dregs) && IsConsecutive4(sregs) && IsConsecutive4(tregs)) {
		IROp opFunc = IROp::Nop;
		switch (op >> 26) {
		case 24: //VFPU0
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] + t[i]; break; //vadd
				opFunc = IROp::Vec4Add;
				break;
			case 1: // d[i] = s[i] - t[i]; break; //vsub
				opFunc = IROp::Vec4Sub;
				break;
			case 7: // d[i] = s[i] / t[i]; break; //vdiv
				opFunc = IROp::Vec4Div;
				break;
			}
			break;
		case 25: //VFPU1
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] * t[i]; break; //vmul
				opFunc = IROp::Vec4Mul;
				break;
			}
			break;
		case 27: //VFPU3
			switch ((op >> 23) & 7) {
			case 2: // vmin
			case 3: // vmax
			case 6: // vsge
			case 7: // vslt
				DISABLE;
				break;
			}
			break;
		}

		if (opFunc != IROp::Nop) {
			ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);
		} else {
			DISABLE;
		}
		ApplyPrefixD(dregs, sz);
		return;
	}

	for (int i = 0; i < n; ++i) {
		switch (op >> 26) {
		case 24: //VFPU0
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] + t[i]; break; //vadd
				ir.Write(IROp::FAdd, tempregs[i], sregs[i], tregs[i]);
				break;
			case 1: // d[i] = s[i] - t[i]; break; //vsub
				ir.Write(IROp::FSub, tempregs[i], sregs[i], tregs[i]);
				break;
			case 7: // d[i] = s[i] / t[i]; break; //vdiv
				ir.Write(IROp::FDiv, tempregs[i], sregs[i], tregs[i]);
				break;
			}
			break;
		case 25: //VFPU1
			switch ((op >> 23) & 7) {
			case 0: // d[i] = s[i] * t[i]; break; //vmul
				ir.Write(IROp::FMul, tempregs[i], sregs[i], tregs[i]);
				break;
			}
			break;
		case 27: //VFPU3
			switch ((op >> 23) & 7) {
			case 2: // vmin
				ir.Write(IROp::FMin, tempregs[i], sregs[i], tregs[i]);
				break;
			case 3: // vmax
				ir.Write(IROp::FMax, tempregs[i], sregs[i], tregs[i]);
				break;
			case 6: // vsge
			case 7: // vslt
				DISABLE;
				break;
			}
			break;
		}
	}

	for (int i = 0; i < n; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_VV2Op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op))
		DISABLE;

	// Vector unary operation
	// d[N] = OP(s[N]) (see below)

	int vs = _VS;
	int vd = _VD;

	int optype = (op >> 16) & 0x1f;
	if (optype >= 16 && !js.HasNoPrefix()) {
		DISABLE;
	} else if ((optype == 1 || optype == 2) && js.HasSPrefix()) {
		DISABLE;
	} else if (optype == 5 && js.HasDPrefix()) {
		DISABLE;
	}

	// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
	if (optype == 0 && vs == vd && js.HasNoPrefix()) {
		return;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, vs);
	GetVectorRegsPrefixD(dregs, sz, vd);

	bool usingTemps = false;
	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs)) {
			usingTemps = true;
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	bool canSIMD = false;
	// Some can be SIMD'd.
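	// Of the unary ops, only vmov/vabs/vneg have direct Vec4 IR equivalents
	// (Vec4Mov/Vec4Abs/Vec4Neg); everything else is emitted lane by lane further down.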
	switch (optype) {
	case 0: // vmov
	case 1: // vabs
	case 2: // vneg
		canSIMD = true;
		break;
	}

	if (canSIMD && !usingTemps && IsConsecutive4(sregs) && IsConsecutive4(dregs)) {
		switch (optype) {
		case 0: // vmov
			ir.Write(IROp::Vec4Mov, dregs[0], sregs[0]);
			break;
		case 1: // vabs
			ir.Write(IROp::Vec4Abs, dregs[0], sregs[0]);
			break;
		case 2: // vneg
			ir.Write(IROp::Vec4Neg, dregs[0], sregs[0]);
			break;
		}
		ApplyPrefixD(dregs, sz);
		return;
	}

	for (int i = 0; i < n; ++i) {
		switch (optype) {
		case 0: // d[i] = s[i]; break; //vmov
			// Probably for swizzle.
			ir.Write(IROp::FMov, tempregs[i], sregs[i]);
			break;
		case 1: // d[i] = fabsf(s[i]); break; //vabs
			ir.Write(IROp::FAbs, tempregs[i], sregs[i]);
			break;
		case 2: // d[i] = -s[i]; break; //vneg
			ir.Write(IROp::FNeg, tempregs[i], sregs[i]);
			break;
		case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;    // vsat0
			ir.Write(IROp::FSat0_1, tempregs[i], sregs[i]);
			break;
		case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break;  // vsat1
			ir.Write(IROp::FSatMinus1_1, tempregs[i], sregs[i]);
			break;
		case 16: // d[i] = 1.0f / s[i]; break; //vrcp
			ir.Write(IROp::FRecip, tempregs[i], sregs[i]);
			break;
		case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
			ir.Write(IROp::FRSqrt, tempregs[i], sregs[i]);
			break;
		case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
			ir.Write(IROp::FSin, tempregs[i], sregs[i]);
			break;
		case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
			ir.Write(IROp::FCos, tempregs[i], sregs[i]);
			break;
		case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
			DISABLE;
			break;
		case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
			DISABLE;
			break;
		case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
			ir.Write(IROp::FSqrt, tempregs[i], sregs[i]);
			break;
		case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
			ir.Write(IROp::FAsin, tempregs[i], sregs[i]);
			break;
		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
			ir.Write(IROp::FRecip, tempregs[i], sregs[i]);
			ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);
			break;
		case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
			ir.Write(IROp::FSin, tempregs[i], sregs[i]);
			ir.Write(IROp::FNeg, tempregs[i], tempregs[i]);
			break;
		case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
			DISABLE;
			break;
		default:
			INVALIDOP;
		}
	}
	for (int i = 0; i < n; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_Vi2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {
		DISABLE;
	}

	// Vector integer to float
	// d[N] = float(S[N]) * mult

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int imm = (op >> 16) & 0x1f;
	const float mult = 1.0f / (float)(1UL << imm);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs)) {
			tempregs[i] = IRVTEMP_PFX_T + i;  // Need IRVTEMP_0 for the scaling factor
		} else {
			tempregs[i] = dregs[i];
		}
	}
	if (mult != 1.0f)
		ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(mult));
	// TODO: Use the SCVTF with builtin scaling where possible.
	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FCvtSW, tempregs[i], sregs[i]);
	}
	if (mult != 1.0f) {
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FMul, tempregs[i], tempregs[i], IRVTEMP_0);
		}
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}
	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_Vh2f(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) {
		DISABLE;
	}

	// Vector expand half to float
	// d[N*2] = float(lowerhalf(s[N])), d[N*2+1] = float(upperhalf(s[N]))

	DISABLE;
}

void IRFrontend::Comp_Vf2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixD & 0xFF) != 0) {
		DISABLE;
	}

	// Vector float to integer
	// d[N] = int(S[N] * mult)
	// Note: saturates on overflow.

	DISABLE;
}

void IRFrontend::Comp_Mftv(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	int imm = op & 0xFF;
	MIPSGPReg rt = _RT;
	switch ((op >> 21) & 0x1f) {
	case 3: //mfv / mfvc
		// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
		if (rt != MIPS_REG_ZERO) {
			if (imm < 128) { //R(rt) = VI(imm);
				ir.Write(IROp::FMovToGPR, rt, vfpuBase + voffset[imm]);
			} else {
				switch (imm - 128) {
				case VFPU_CTRL_DPREFIX:
				case VFPU_CTRL_SPREFIX:
				case VFPU_CTRL_TPREFIX:
					FlushPrefixV();
					break;
				}
				if (imm - 128 < VFPU_CTRL_MAX) {
					ir.Write(IROp::VfpuCtrlToReg, rt, imm - 128);
				} else {
					INVALIDOP;
				}
			}
		}
		break;

	case 7: // mtv
		if (imm < 128) {
			ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[imm], rt);
		} else if ((imm - 128) < VFPU_CTRL_MAX) {
			u32 mask;
			if (GetVFPUCtrlMask(imm - 128, &mask)) {
				if (mask != 0xFFFFFFFF) {
					ir.Write(IROp::AndConst, IRTEMP_0, rt, ir.AddConstant(mask));
					ir.Write(IROp::SetCtrlVFPUReg, imm - 128, IRTEMP_0);
				} else {
					ir.Write(IROp::SetCtrlVFPUReg, imm - 128, rt);
				}
			}

			if (imm - 128 == VFPU_CTRL_SPREFIX) {
				js.prefixSFlag = JitState::PREFIX_UNKNOWN;
			} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
				js.prefixTFlag = JitState::PREFIX_UNKNOWN;
			} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
				js.prefixDFlag = JitState::PREFIX_UNKNOWN;
			}
		} else {
			INVALIDOP;
		}
		break;

	default:
		INVALIDOP;
	}
	// This op is marked not to auto-eat prefix so we must do it manually.
	EatPrefix();
}

void IRFrontend::Comp_Vmfvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	// Vector Move from vector control reg (no prefixes)
	// D[0] = VFPU_CTRL[i]

	int vd = _VD;
	int imm = (op >> 8) & 0x7F;
	if (imm < VFPU_CTRL_MAX) {
		ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, imm);
		ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[vd], IRTEMP_0);
	} else {
		INVALIDOP;
	}
}

void IRFrontend::Comp_Vmtvc(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);

	// Vector Move to vector control reg (no prefixes)
	// VFPU_CTRL[i] = S[0]

	int vs = _VS;
	int imm = op & 0xFF;
	if (imm < VFPU_CTRL_MAX) {
		u32 mask;
		if (GetVFPUCtrlMask(imm, &mask)) {
			if (mask != 0xFFFFFFFF) {
				ir.Write(IROp::FMovToGPR, IRTEMP_0, vfpuBase + voffset[vs]);
				ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(mask));
				ir.Write(IROp::SetCtrlVFPUReg, imm, IRTEMP_0);
			} else {
				ir.Write(IROp::SetCtrlVFPUFReg, imm, vfpuBase + voffset[vs]);
			}
		}
		if (imm == VFPU_CTRL_SPREFIX) {
			js.prefixSFlag = JitState::PREFIX_UNKNOWN;
		} else if (imm == VFPU_CTRL_TPREFIX) {
			js.prefixTFlag = JitState::PREFIX_UNKNOWN;
		} else if (imm == VFPU_CTRL_DPREFIX) {
			js.prefixDFlag = JitState::PREFIX_UNKNOWN;
		}
	} else {
		INVALIDOP;
	}
}

void IRFrontend::Comp_Vmmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix move (weird prefixes)
	// D[N,M] = S[N,M]

	int vs = _VS;
	int vd = _VD;
	// This probably ignores prefixes for all sane intents and purposes.
	if (vs == vd) {
		// A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely.
		return;
	}

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	u8 sregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);

	switch (GetMatrixOverlap(vs, vd, sz)) {
	case OVERLAP_EQUAL:
		// In-place transpose
		DISABLE;
	case OVERLAP_PARTIAL:
		DISABLE;
	case OVERLAP_NONE:
	default:
		break;
	}
	if (IsMatrixTransposed(vd) == IsMatrixTransposed(vs) && sz == M_4x4) {
		// Untranspose both matrices
		if (IsMatrixTransposed(vd)) {
			vd = TransposeMatrixReg(vd);
			vs = TransposeMatrixReg(vs);
		}
		// Get the columns
		u8 scols[4], dcols[4];
		GetMatrixColumns(vs, sz, scols);
		GetMatrixColumns(vd, sz, dcols);
		for (int i = 0; i < 4; i++) {
			u8 svec[4], dvec[4];
			GetVectorRegs(svec, GetVectorSize(sz), scols[i]);
			GetVectorRegs(dvec, GetVectorSize(sz), dcols[i]);
			ir.Write(IROp::Vec4Mov, dvec[0], svec[0]);
		}
		return;
	}
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			ir.Write(IROp::FMov, dregs[a * 4 + b], sregs[a * 4 + b]);
		}
	}
}

void IRFrontend::Comp_Vmscl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Matrix scale, matrix by scalar (weird prefixes)
	// d[N,M] = s[N,M] * t[0]
	// Note: behaves just slightly differently than a series of vscls.
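	// With the restrictions checked below (4x4 only, untransposed, t not in the destination
	// matrix), every group of four consecutive registers (one column of the matrix) can be
	// scaled by t[0] with a single Vec4Scale.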

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;

	MatrixSize sz = GetMtxSize(op);
	if (sz != M_4x4) {
		DISABLE;
	}
	if (GetMtx(vt) == GetMtx(vd)) {
		DISABLE;
	}
	int n = GetMatrixSide(sz);

	// The entire matrix is scaled equally, so transpose doesn't matter. Let's normalize.
	if (IsMatrixTransposed(vs) && IsMatrixTransposed(vd)) {
		vs = TransposeMatrixReg(vs);
		vd = TransposeMatrixReg(vd);
	}
	if (IsMatrixTransposed(vs) || IsMatrixTransposed(vd)) {
		DISABLE;
	}

	u8 sregs[16], dregs[16], tregs[1];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(dregs, sz, vd);
	GetVectorRegs(tregs, V_Single, vt);

	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::Vec4Scale, dregs[i * 4], sregs[i * 4], tregs[0]);
	}
}

void IRFrontend::Comp_VScl(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector scale, vector by scalar
	// d[N] = s[N] * t[0]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	u8 sregs[4], dregs[4], treg;
	GetVectorRegsPrefixS(sregs, sz, vs);
	// TODO: Prefixes seem strange...
	GetVectorRegsPrefixT(&treg, V_Single, vt);
	GetVectorRegsPrefixD(dregs, sz, vd);

	bool overlap = false;
	// For prefixes to work, we just have to ensure that none of the output registers spill
	// and that there's no overlap.
	u8 tempregs[4];
	memcpy(tempregs, dregs, sizeof(tempregs));
	for (int i = 0; i < n; ++i) {
		// Conservative, can be improved
		if (treg == dregs[i] || !IsOverlapSafe(dregs[i], n, sregs)) {
			// Need to use temp regs
			tempregs[i] = IRVTEMP_0 + i;
			overlap = true;
		}
	}

	if (n == 4 && IsConsecutive4(sregs) && IsConsecutive4(dregs)) {
		if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
			ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg);
			ApplyPrefixD(dregs, sz);
			return;
		}
	}

	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FMul, tempregs[i], sregs[i], treg);
	}

	for (int i = 0; i < n; i++) {
		// All must be mapped for prefixes to work.
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz);
}

/*
	// Capital = straight, lower case = transposed
	// 8 possibilities:
	ABC    2
	ABc    missing
	AbC    1
	Abc    1

	aBC = ACB    2 + swap
	aBc = AcB    1 + swap
	abC = ACb    missing
	abc = Acb    1 + swap
*/

// This may or may not be a win when using the IR interpreter...
// Many more instructions to interpret.
void IRFrontend::Comp_Vmmul(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
		// Fall back to interpreter, which has the accurate implementation.
		// Later we might do something more optimized here.
		DISABLE;
	}

	// Matrix multiply (weird prefixes)
	// D[0 .. N, 0 .. M] = S[0 .. N, 0 .. M]' * T[0 .. N, 0 .. M]
	// Note: Behaves as if it's implemented through a series of vdots.
	// Important: this is a matrix multiply with a pre-transposed S.

	MatrixSize sz = GetMtxSize(op);
	int n = GetMatrixSide(sz);

	int vs = _VS;
	int vd = _VD;
	int vt = _VT;
	MatrixOverlapType soverlap = GetMatrixOverlap(vs, vd, sz);
	MatrixOverlapType toverlap = GetMatrixOverlap(vt, vd, sz);

	// A very common arrangement. Rearrange to something we can handle.
	if (IsMatrixTransposed(vd)) {
		// Matrix identity says (At * Bt) = (B * A)t
		// D = S * T
		// Dt = (S * T)t = (Tt * St)
		vd = TransposeMatrixReg(vd);
		std::swap(vs, vt);
	}

	u8 sregs[16], tregs[16], dregs[16];
	GetMatrixRegs(sregs, sz, vs);
	GetMatrixRegs(tregs, sz, vt);
	GetMatrixRegs(dregs, sz, vd);

	if (soverlap || toverlap) {
		DISABLE;
	}

	// dregs are always consecutive, thanks to our transpose trick.
	// However, not sure this is always worth it.
	if (sz == M_4x4 && IsConsecutive4(dregs)) {
		// TODO: The interpreter would like proper matrix ops better. Can generate those, and
		// expand them like this as needed on "real" architectures.
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_T;
		if (!IsConsecutive4(sregs)) {
			// METHOD 1: Handles AbC and Abc
			for (int j = 0; j < 4; j++) {
				ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[j * 4]);
				for (int i = 1; i < 4; i++) {
					ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[j * 4 + i]);
					ir.Write(IROp::Vec4Add, s0, s0, s1);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else if (IsConsecutive4(tregs)) {
			// METHOD 2: Handles ABC only. Not efficient on CPUs that don't do fast dots.
			// Dots only work if tregs are consecutive.
			// TODO: Skip this and resort to method one and transpose the output?
			for (int j = 0; j < 4; j++) {
				for (int i = 0; i < 4; i++) {
					ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[j * 4]);
				}
				ir.Write(IROp::Vec4Mov, dregs[j * 4], s0);
			}
			return;
		} else {
			// ABc - s consecutive, t not.
			// Tekken uses this.
			// logBlocks = 1;
		}
	}

	// Fallback. Expands a LOT
	int temp0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	for (int a = 0; a < n; a++) {
		for (int b = 0; b < n; b++) {
			ir.Write(IROp::FMul, temp0, sregs[b * 4], tregs[a * 4]);
			for (int c = 1; c < n; c++) {
				ir.Write(IROp::FMul, temp1, sregs[b * 4 + c], tregs[a * 4 + c]);
				ir.Write(IROp::FAdd, (c == n - 1) ? dregs[a * 4 + b] : temp0, temp0, temp1);
			}
		}
	}
}

void IRFrontend::Comp_Vtfm(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
	if (!js.HasNoPrefix()) {
		DISABLE;
	}

	// Vertex transform, vector by matrix (weird prefixes)
	// d[N] = s[N*m .. N*m + n-1] dot t[0 .. n-1]
	// Homogenous means t[n-1] is treated as 1.
	// Note: this might be implemented as a series of vdots with special prefixes.

	VectorSize sz = GetVecSize(op);
	MatrixSize msz = GetMtxSize(op);
	int n = GetNumVectorElements(sz);
	int ins = (op >> 23) & 7;

	bool homogenous = false;
	if (n == ins) {
		n++;
		sz = (VectorSize)((int)(sz) + 1);
		msz = (MatrixSize)((int)(msz) + 1);
		homogenous = true;
	}
	// Otherwise, n should already be ins + 1.
	else if (n != ins + 1) {
		DISABLE;
	}

	u8 sregs[16], dregs[4], tregs[4];
	GetMatrixRegs(sregs, msz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	// SIMD-optimized implementations - if sregs[0..3] is non-consecutive, it's transposed.
	if (msz == M_4x4 && !IsConsecutive4(sregs)) {
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_S;
		// For this algorithm, we don't care if tregs are consecutive or not,
		// they are accessed one at a time. This handles homogenous transforms correctly, as well.
		// We take advantage of sregs[0] + 1 being sregs[4] here.
		ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]);
		for (int i = 1; i < 4; i++) {
			if (!homogenous || (i != n - 1)) {
				ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]);
				ir.Write(IROp::Vec4Add, s0, s0, s1);
			} else {
				ir.Write(IROp::Vec4Add, s0, s0, sregs[i]);
			}
		}
		if (IsConsecutive4(dregs)) {
			ir.Write(IROp::Vec4Mov, dregs[0], s0);
		} else {
			for (int i = 0; i < 4; i++) {
				ir.Write(IROp::FMov, dregs[i], s0 + i);
			}
		}
		return;
	} else if (msz == M_4x4 && IsConsecutive4(sregs)) {
		// Consecutive, which is harder.
		DISABLE;
		int s0 = IRVTEMP_0;
		int s1 = IRVTEMP_PFX_S;
		// Doesn't make complete sense to me why this works.... (because it doesn't.)
		ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]);
		for (int i = 1; i < 4; i++) {
			if (!homogenous || (i != n - 1)) {
				ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]);
				ir.Write(IROp::Vec4Add, s0, s0, s1);
			} else {
				ir.Write(IROp::Vec4Add, s0, s0, sregs[i]);
			}
		}
		if (IsConsecutive4(dregs)) {
			ir.Write(IROp::Vec4Mov, dregs[0], s0);
		} else {
			for (int i = 0; i < 4; i++) {
				ir.Write(IROp::FMov, dregs[i], s0 + i);
			}
		}
		return;
	}

	// TODO: test overlap, optimize.
	u8 tempregs[4];
	int s0 = IRVTEMP_0;
	int temp1 = IRVTEMP_0 + 1;
	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FMul, s0, sregs[i * 4], tregs[0]);
		for (int k = 1; k < n; k++) {
			if (!homogenous || k != n - 1) {
				ir.Write(IROp::FMul, temp1, sregs[i * 4 + k], tregs[k]);
				ir.Write(IROp::FAdd, s0, s0, temp1);
			} else {
				ir.Write(IROp::FAdd, s0, s0, sregs[i * 4 + k]);
			}
		}
		int temp = IRVTEMP_PFX_T + i;
		ir.Write(IROp::FMov, temp, s0);
		tempregs[i] = temp;
	}
	for (int i = 0; i < n; i++) {
		if (tempregs[i] != dregs[i])
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
	}
}

void IRFrontend::Comp_VCrs(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix() || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector cross (half a cross product, n = 3)
	// d[0] = s[y]*t[z], d[1] = s[z]*t[x], d[2] = s[x]*t[y]
	// To do a full cross product: vcrs tmp1, s, t; vcrs tmp2 t, s; vsub d, tmp1, tmp2;
	// (or just use vcrsp.)

	DISABLE;
}

void IRFrontend::Comp_VDet(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || (js.prefixT & 0x000CFCF0) != 0x000E0) {
		DISABLE;
	}

	// Vector determinant
	// d[0] = s[0]*t[1] - s[1]*t[0]
	// Note: this operates on two vectors, not a 2x2 matrix.

	DISABLE;
}

void IRFrontend::Comp_Vi2x(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)

	// These instructions pack pairs or quads of integers into 32 bits.
	// The unsigned (u) versions skip the sign bit when packing, first doing a signed clamp to 0 (so the sign bit won't ever be 1).

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Single;
		if (sz != V_Quad) {
			DISABLE;
		}
	} else {
		switch (sz) {
		case V_Pair:
			outsize = V_Single;
			break;
		case V_Quad:
			outsize = V_Pair;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[4], dregs[2], srcregs[4], tempregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(srcregs, sregs, sizeof(sregs));
	memcpy(tempregs, dregs, sizeof(dregs));

	int nOut = GetNumVectorElements(outsize);

	// If src registers aren't contiguous, make them.
	if (sz == V_Quad && !IsConsecutive4(sregs)) {
		// T prefix is unused.
		for (int i = 0; i < 4; i++) {
			srcregs[i] = IRVTEMP_PFX_T + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	if (bits == 8) {
		if (unsignedOp) { //vi2uc
			// Output is only one register.
			ir.Write(IROp::Vec4ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec4Pack31To8, tempregs[0], IRVTEMP_0);
		} else { //vi2c
			ir.Write(IROp::Vec4Pack32To8, tempregs[0], srcregs[0]);
		}
	} else {
		// bits == 16
		if (unsignedOp) { //vi2us
			// Output is only one register.
			ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0, srcregs[0]);
			ir.Write(IROp::Vec2Pack31To16, tempregs[0], IRVTEMP_0);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2ClampToZero, IRVTEMP_0 + 2, srcregs[2]);
				ir.Write(IROp::Vec2Pack31To16, tempregs[1], IRVTEMP_0 + 2);
			}
		} else { //vi2s
			ir.Write(IROp::Vec2Pack32To16, tempregs[0], srcregs[0]);
			if (outsize == V_Pair) {
				ir.Write(IROp::Vec2Pack32To16, tempregs[1], srcregs[2]);
			}
		}
	}

	for (int i = 0; i < nOut; i++) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, outsize);
}

void IRFrontend::Comp_Vx2i(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || js.HasSPrefix())
		DISABLE;

	int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
	bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)

	// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
	// at the top. vus2i shifts it an extra bit right afterward.
	// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
	// at the top too. vuc2i is a bit special (see below.)
	// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
	// then use it for both.
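	// In other words, each 16-bit (or 8-bit) source element ends up in the top of its own 32-bit
	// output lane (the Vec2Unpack16To32 / Vec4Unpack8To32 ops below). The unsigned 16-bit variant
	// uses Vec2Unpack16To31, which also shifts right by one so the sign bit can never end up set;
	// the unsigned 8-bit case is odder, see the comment further down.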

	VectorSize sz = GetVecSize(op);
	VectorSize outsize;
	if (bits == 8) {
		outsize = V_Quad;
		sz = V_Single; // For some reason, sz is set to Quad in this case though the outsize is Single.
	} else {
		switch (sz) {
		case V_Single:
			outsize = V_Pair;
			break;
		case V_Pair:
			outsize = V_Quad;
			break;
		default:
			DISABLE;
		}
	}

	u8 sregs[2], dregs[4], tempregs[4], srcregs[2];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outsize, _VD);
	memcpy(tempregs, dregs, sizeof(dregs));
	memcpy(srcregs, sregs, sizeof(sregs));

	// Remap source regs to be consecutive. This is not required
	// but helpful when implementations can join two Vec2Expand.
	if (sz == V_Pair && !IsConsecutive2(srcregs)) {
		for (int i = 0; i < 2; i++) {
			srcregs[i] = IRVTEMP_0 + i;
			ir.Write(IROp::FMov, srcregs[i], sregs[i]);
		}
	}

	int nIn = GetNumVectorElements(sz);

	int nOut = 2;
	if (outsize == V_Quad)
		nOut = 4;
	// Remap dest regs. PFX_T is unused.
	if (outsize == V_Pair) {
		bool consecutive = IsConsecutive2(dregs);
		// We must have them consecutive, so all temps, or none.
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	} else if (outsize == V_Quad) {
		bool consecutive = IsConsecutive4(dregs);
		if (!consecutive || !IsOverlapSafe(nOut, dregs, nIn, srcregs)) {
			for (int i = 0; i < nOut; i++) {
				tempregs[i] = IRVTEMP_PFX_T + i;
			}
		}
	}

	if (bits == 16) {
		if (unsignedOp) {
			ir.Write(IROp::Vec2Unpack16To31, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To31, tempregs[2], srcregs[1]);
		} else {
			ir.Write(IROp::Vec2Unpack16To32, tempregs[0], srcregs[0]);
			if (outsize == V_Quad)
				ir.Write(IROp::Vec2Unpack16To32, tempregs[2], srcregs[1]);
		}
	} else if (bits == 8) {
		if (unsignedOp) {
			// See the interpreter, this one is odd. Hardware bug?
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
			ir.Write(IROp::Vec4DuplicateUpperBitsAndShift1, tempregs[0], tempregs[0]);
		} else {
			ir.Write(IROp::Vec4Unpack8To32, tempregs[0], srcregs[0]);
		}
	}

	for (int i = 0; i < nOut; i++) {
		if (tempregs[i] != dregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}
	ApplyPrefixD(dregs, outsize);
}

void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (!js.HasNoPrefix())
		DISABLE;

	// Vector cross product (n = 3, weird prefixes)
	// d[0 .. 2] = s[0 .. 2] X t[0 .. 2]
	// Vector quaternion product (n = 4, weird prefixes)
	// d[0 .. 2] = t[0 .. 2] X s[0 .. 2] + s[3] * t[0 .. 2] + t[3] * s[0 .. 2]
	// d[3] = s[3]*t[3] - s[0 .. 2] dot t[0 .. 3]
	// Note: Behaves as if it's implemented through a series of vdots.

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegs(sregs, sz, _VS);
	GetVectorRegs(tregs, sz, _VT);
	GetVectorRegs(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
			tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things
		} else {
			tempregs[i] = dregs[i];
		}
	}

	if (sz == V_Triple) {
		int temp0 = IRVTEMP_0;
		int temp1 = IRVTEMP_0 + 1;
		// Compute X
		ir.Write(IROp::FMul, temp0, sregs[1], tregs[2]);
		ir.Write(IROp::FMul, temp1, sregs[2], tregs[1]);
		ir.Write(IROp::FSub, tempregs[0], temp0, temp1);

		// Compute Y
		ir.Write(IROp::FMul, temp0, sregs[2], tregs[0]);
		ir.Write(IROp::FMul, temp1, sregs[0], tregs[2]);
		ir.Write(IROp::FSub, tempregs[1], temp0, temp1);

		// Compute Z
		ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]);
		ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]);
		ir.Write(IROp::FSub, tempregs[2], temp0, temp1);
	} else if (sz == V_Quad) {
		DISABLE;
	} else {
		DISABLE;
	}

	for (int i = 0; i < n; i++) {
		if (tempregs[i] != dregs[i])
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
	}
}

void IRFrontend::Comp_Vcmp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_COMP);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector compare
	// VFPU_CC[N] = COMPARE(s[N], t[N])

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);

	int cond = op & 0xF;
	int mask = 0;
	for (int i = 0; i < n; i++) {
		ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]);
		mask |= (1 << i);
	}
	ir.Write(IROp::FCmpVfpuAggregate, mask);
}

void IRFrontend::Comp_Vcmov(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_COMP);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector conditional move
	// imm3 >= 6: d[N] = VFPU_CC[N] == tf ? s[N] : d[N]
	// imm3 < 6: d[N] = VFPU_CC[imm3] == tf ? s[N] : d[N]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);
	int tf = (op >> 19) & 1;
	int imm3 = (op >> 16) & 7;

	for (int i = 0; i < n; ++i) {
		// Simplification: Disable if overlap unsafe
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			DISABLE;
		}
	}
	if (imm3 < 6) {
		// Test one bit of CC. This bit decides whether none or all subregisters are copied.
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (imm3) | ((!tf) << 7));
		}
	} else {
		// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
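		// The third operand of FCmovVfpuCC appears to pack the CC bit index to test in the low
		// bits and "copy when the bit is clear" in bit 7, which is the ((!tf) << 7) below; that
		// way a single IR op covers both vcmovt and vcmovf.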
		for (int i = 0; i < n; i++) {
			ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7));
		}
	}
	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_Viim(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix())
		DISABLE;

	// Vector integer immediate
	// d[0] = float(imm)

	s32 imm = SignExtend16ToS32(op);
	u8 dreg;
	GetVectorRegsPrefixD(&dreg, V_Single, _VT);
	ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm));
	ApplyPrefixD(&dreg, V_Single);
}

void IRFrontend::Comp_Vfim(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix())
		DISABLE;

	// Vector half-float immediate
	// d[0] = float(imm)

	FP16 half;
	half.u = op & 0xFFFF;
	FP32 fval = half_to_float_fast5(half);

	u8 dreg;
	GetVectorRegsPrefixD(&dreg, V_Single, _VT);
	ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f));
	ApplyPrefixD(&dreg, V_Single);
}

void IRFrontend::Comp_Vcst(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_XFER);
	if (js.HasUnknownPrefix())
		DISABLE;

	// Vector constant
	// d[N] = CONST

	int conNum = (op >> 16) & 0x1f;
	int vd = _VD;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, vd);
	for (int i = 0; i < n; i++) {
		ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));
	}
	ApplyPrefixD(dregs, sz);
}

// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
// calling the math library.
void IRFrontend::Comp_VRot(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (!js.HasNoPrefix()) {
		// Prefixes work strangely for this:
		// * They never apply to cos (whether d or s prefixes.)
		// * They mostly apply to sin/0, e.g. 0:1, M, or |x|.
		DISABLE;
	}

	// Vector rotation matrix (weird prefixes)
	// d[N] = SINCOSVAL(s[0], imm[N])
	// The imm selects: cos index, sin index, 0 or sin for others, sin sign flip.

	int vd = _VD;
	int vs = _VS;
	int imm = (op >> 16) & 0x1f;
	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	bool negSin = (imm & 0x10) ? true : false;
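	// imm picks the layout: (imm & 3) is the lane that gets cos, ((imm >> 2) & 3) the lane that
	// gets sin, bit 4 negates the sin, and when the two indices are equal every lane gets sin
	// (with cos then overwriting that one lane). The d[] table below encodes exactly that.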

	char d[4] = { '0', '0', '0', '0' };
	if (((imm >> 2) & 3) == (imm & 3)) {
		for (int i = 0; i < 4; i++)
			d[i] = 's';
	}
	d[(imm >> 2) & 3] = 's';
	d[imm & 3] = 'c';

	u8 dregs[4];
	GetVectorRegs(dregs, sz, vd);
	u8 sreg[1];
	GetVectorRegs(sreg, V_Single, vs);
	for (int i = 0; i < n; i++) {
		switch (d[i]) {
		case '0':
			ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(0.0f));
			break;
		case 's':
			ir.Write(IROp::FSin, dregs[i], sreg[0]);
			if (negSin) {
				ir.Write(IROp::FNeg, dregs[i], dregs[i]);
			}
			break;
		case 'c':
			ir.Write(IROp::FCos, dregs[i], sreg[0]);
			break;
		}
	}
}

void IRFrontend::Comp_Vsgn(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || !IsPrefixWithinSize(js.prefixT, op)) {
		DISABLE;
	}

	// Vector extract sign
	// d[N] = signum(s[N])

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs)) {
			tempregs[i] = IRTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::FSign, tempregs[i], sregs[i]);
	}

	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_Vocp(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix() || (js.prefixS & VFPU_NEGATE(1, 1, 1, 1)) != 0) {
		DISABLE;
	}

	// Vector one's complement
	// d[N] = 1.0 - s[N]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	// This is a hack that modifies prefixes. We eat them later, so just overwrite.
	// S prefix forces the negate flags.
	js.prefixS |= 0x000F0000;
	// T prefix forces constants on and regnum to 1.
	// That means negate still works, and abs activates a different constant.
	js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	// There's no bits for t, so just reuse s. It'll be constants only.
	GetVectorRegsPrefixT(tregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs)) {
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; ++i) {
		ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]);
	}
	for (int i = 0; i < n; ++i) {
		if (dregs[i] != tempregs[i]) {
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
		}
	}

	ApplyPrefixD(dregs, sz);
}

void IRFrontend::Comp_ColorConv(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector color conversion
	// d[N] = ConvertTo16(s[N*2]) | (ConvertTo16(s[N*2+1]) << 16)

	DISABLE;
}

void IRFrontend::Comp_Vbfy(MIPSOpcode op) {
	CONDITIONAL_DISABLE(VFPU_VEC);
	if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op) || js.HasTPrefix()) {
		DISABLE;
	}

	// Vector butterfly operation
	// vbfy2: d[0] = s[0] + s[2], d[1] = s[1] + s[3], d[2] = s[0] - s[2], d[3] = s[1] - s[3]
	// vbfy1: d[N*2] = s[N*2] + s[N*2+1], d[N*2+1] = s[N*2] - s[N*2+1]

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);
	if (n != 2 && n != 4) {
		// Bad instructions
		INVALIDOP;
	}

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	u8 tempregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafe(dregs[i], n, sregs)) {
			tempregs[i] = IRVTEMP_0 + i;
		} else {
			tempregs[i] = dregs[i];
		}
	}

	int subop = (op >> 16) & 0x1F;
	if (subop == 3 && n == 4) {
		// vbfy2
		ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[2]);
		ir.Write(IROp::FAdd, tempregs[1], sregs[1], sregs[3]);
		ir.Write(IROp::FSub, tempregs[2], sregs[0], sregs[2]);
		ir.Write(IROp::FSub, tempregs[3], sregs[1], sregs[3]);
	} else if (subop == 2) {
		// vbfy1
		ir.Write(IROp::FAdd, tempregs[0], sregs[0], sregs[1]);
		ir.Write(IROp::FSub, tempregs[1], sregs[0], sregs[1]);
		if (n == 4) {
			ir.Write(IROp::FAdd, tempregs[2], sregs[2], sregs[3]);
			ir.Write(IROp::FSub, tempregs[3], sregs[2], sregs[3]);
		}
	} else {
		INVALIDOP;
	}

	for (int i = 0; i < n; ++i) {
		if (tempregs[i] != dregs[i])
			ir.Write(IROp::FMov, dregs[i], tempregs[i]);
	}

	ApplyPrefixD(dregs, sz);
}

}  // namespace MIPSComp