// Copyright 2008 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"

#include <array>

#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/FloatUtils.h"
#include "Common/Intrinsics.h"
#include "Common/JitRegister.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64Common/Jit64Constants.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
#include "Core/PowerPC/PowerPC.h"

#define QUANTIZED_REGS_TO_SAVE                                                                     \
  (ABI_ALL_CALLER_SAVED & ~BitSet32{RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0 + 16, XMM1 + 16})

#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32{RSCRATCH2})

using namespace Gen;

alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
// Since the following float conversion functions are used in non-arithmetic PPC float
// instructions, they must convert floats bit-exact and never flush denormals to zero or turn
// SNaNs into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even
// support flush-to-zero, so we can use FLD+FSTP even on denormals.
// If the number is a NaN, make sure to set the QNaN bit back to its original value.

// Another problem is that officially, converting doubles to single format results in undefined
// behavior. Relying on undefined behavior is a bug, so no software should ever do this.
// Super Mario 64 (on Wii VC) accidentally relies on this behavior. See issue #11173

// This is the same algorithm used in the interpreter (and actual hardware).
// The documentation states that converting a double outside the valid range for a single
// (or a single denormal) is undefined, but testing on actual hardware shows it always picks
// bits 0..1 and 5..34 unless the exponent is in the range of 874 to 896.
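// For reference, a rough C-level sketch of the conversion below (hedged: mirrors the
// interpreter's behavior; the helper name and DOUBLE_FRAC are illustrative):
//
//   u32 ConvertToSingle(u64 x)
//   {
//     const u32 exp = static_cast<u32>((x >> 52) & 0x7ff);
//     if (exp >= 874 && exp <= 896)
//     {
//       // Denormal single: shift (fraction | implicit bit) into place, keep the sign.
//       u32 t = 0x80000000 | static_cast<u32>((x & DOUBLE_FRAC) >> 21);
//       t >>= 905 - exp;
//       return t | static_cast<u32>((x >> 32) & 0x80000000);
//     }
//     // Everything else: take bits 0..1 and 5..34 of the double.
//     return static_cast<u32>(((x >> 32) & 0xc0000000) | ((x >> 29) & 0x3fffffff));
//   }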

void CommonAsmRoutines::GenConvertDoubleToSingle()
{
  // Input in XMM0, output to RSCRATCH
  // Clobbers RSCRATCH/RSCRATCH2/XMM0/XMM1

  const void* start = GetCodePtr();

  // Grab Exponent
  MOVQ_xmm(R(RSCRATCH), XMM0);
  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHR(64, R(RSCRATCH), Imm8(52));
  AND(16, R(RSCRATCH), Imm16(0x7ff));

  // Check if the double is in the range of a valid single subnormal
  SUB(16, R(RSCRATCH), Imm16(874));
  CMP(16, R(RSCRATCH), Imm16(896 - 874));
  FixupBranch Denormalize = J_CC(CC_NA);

  // Don't Denormalize

  if (cpu_info.bFastBMI2)
  {
    // Extract bits 0-1 and 5-34
    MOV(64, R(RSCRATCH), Imm64(0xc7ffffffe0000000));
    PEXT(64, RSCRATCH, RSCRATCH2, R(RSCRATCH));
  }
  else
  {
    // We want bits 0, 1
    avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM1, R(XMM0), MConst(double_top_two_bits));
    PSRLQ(XMM1, 32);

    // And 5 through to 34
    PAND(XMM0, MConst(double_bottom_bits));
    PSRLQ(XMM0, 29);

    // OR them together
    POR(XMM0, R(XMM1));
    MOVD_xmm(R(RSCRATCH), XMM0);
  }
  RET();

  // Denormalize
  SetJumpTarget(Denormalize);

  // shift = (905 - exponent) plus the 21-bit double-to-single shift
  NEG(16, R(RSCRATCH));
  ADD(16, R(RSCRATCH), Imm16((905 + 21) - 874));
  MOVQ_xmm(XMM1, R(RSCRATCH));

  // XMM0 = fraction | 0x0010000000000000
  PAND(XMM0, MConst(double_fraction));
  POR(XMM0, MConst(double_explicit_top_bit));

  // fraction >> shift
  PSRLQ(XMM0, R(XMM1));
  MOVD_xmm(R(RSCRATCH), XMM0);

  // OR the sign bit in.
  SHR(64, R(RSCRATCH2), Imm8(32));
  AND(32, R(RSCRATCH2), Imm32(0x80000000));

  OR(32, R(RSCRATCH), R(RSCRATCH2));
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_cdts");
}

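// A rough C-level sketch of the fast path below (hedged: modeled on
// Common::ApproximateReciprocalSquareRoot, which the denormal fallback calls anyway;
// bit_cast is illustrative). Assumes a positive, normal input:
//
//   const u64 in = bit_cast<u64>(val);
//   const s64 exponent = static_cast<s64>(in & (0x7FFULL << 52));
//   const s64 out_exp =
//       ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
//   const int index = static_cast<int>((in >> 48) & 0x1F) ^ 0x10;  // mantissa top 4 bits,
//                                                                  // flipped odd-exponent bit
//   const u32 i = static_cast<u32>(in >> 37) & 0x7FF;              // next 11 mantissa bits
//   const auto& e = Common::frsqrte_expected[index];
//   return bit_cast<double>(out_exp | (static_cast<s64>(e.m_base - e.m_dec * i) << 26));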
void CommonAsmRoutines::GenFrsqrte()
{
  const void* start = GetCodePtr();

  // Assume input in XMM0.
  // This function clobbers all three RSCRATCH.
  MOVQ_xmm(R(RSCRATCH), XMM0);

  // Extract exponent
  MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHR(64, R(RSCRATCH_EXTRA), Imm8(52));

  // Negatives, zeros, denormals, infinities and NaNs take the complex path.
  LEA(32, RSCRATCH2, MDisp(RSCRATCH_EXTRA, -1));
  CMP(32, R(RSCRATCH2), Imm32(0x7FE));
  FixupBranch complex = J_CC(CC_AE, true);

  SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
  SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
  MOV(32, R(RSCRATCH2), Imm32(0x3FF));
  SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
  SHL(64, R(RSCRATCH2), Imm8(52));  // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) /
                                    // 2)) & (0x7FFLL << 52);

  MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
  AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
  XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10));  // int index = i / 2048 + (odd_exponent ? 16 : 0);

  PUSH(RSCRATCH2);
  MOV(64, R(RSCRATCH2), ImmPtr(GetConstantFromPool(Common::frsqrte_expected)));
  static_assert(sizeof(Common::BaseAndDec) == 8, "Unable to use SCALE_8; incorrect size");

  SHR(64, R(RSCRATCH), Imm8(37));
  AND(32, R(RSCRATCH), Imm32(0x7FF));
  IMUL(32, RSCRATCH,
       MComplex(RSCRATCH2, RSCRATCH_EXTRA, SCALE_8, offsetof(Common::BaseAndDec, m_dec)));
  MOV(32, R(RSCRATCH_EXTRA),
      MComplex(RSCRATCH2, RSCRATCH_EXTRA, SCALE_8, offsetof(Common::BaseAndDec, m_base)));
  SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHL(64, R(RSCRATCH_EXTRA), Imm8(26));

  POP(RSCRATCH2);
  OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));  // vali |= (s64)(frsqrte_expected_base[index] -
                                            // frsqrte_expected_dec[index] * (i % 2048)) << 26;
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  SetJumpTarget(complex);
  AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
  CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
  FixupBranch nan_or_inf = J_CC(CC_E);

  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHL(64, R(RSCRATCH2), Imm8(1));
  FixupBranch nonzero = J_CC(CC_NZ);

  // +0.0 or -0.0
  TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
  FixupBranch skip_set_fx1 = J_CC(CC_NZ);
  OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
  SetJumpTarget(skip_set_fx1);
  MOV(64, R(RSCRATCH2), Imm64(0x7FF0'0000'0000'0000));
  OR(64, R(RSCRATCH2), R(RSCRATCH));
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  // SNaN or QNaN or +Inf or -Inf
  SetJumpTarget(nan_or_inf);
  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHL(64, R(RSCRATCH2), Imm8(12));
  FixupBranch inf = J_CC(CC_Z);
  BTS(64, R(RSCRATCH), Imm8(51));
  MOVQ_xmm(XMM0, R(RSCRATCH));
  RET();
  SetJumpTarget(inf);
  TEST(64, R(RSCRATCH), R(RSCRATCH));
  FixupBranch negative = J_CC(CC_S);
  XORPD(XMM0, R(XMM0));
  RET();

  SetJumpTarget(nonzero);
  FixupBranch denormal = J_CC(CC_NC);

  // Negative sign
  SetJumpTarget(negative);
  TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
  FixupBranch skip_set_fx2 = J_CC(CC_NZ);
  OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
  SetJumpTarget(skip_set_fx2);
  MOV(64, R(RSCRATCH2), Imm64(0x7FF8'0000'0000'0000));
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  SetJumpTarget(denormal);
  ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  ABI_CallFunction(Common::ApproximateReciprocalSquareRoot);
  ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_Frsqrte");
}

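// A rough C-level sketch of the fast path below (hedged: modeled on
// Common::ApproximateReciprocal, which the complex fallback calls; bit_cast is
// illustrative). Assumes 895 <= exp < 1149:
//
//   const u64 in = bit_cast<u64>(val);
//   const s64 sign = static_cast<s64>(in & (1ULL << 63));
//   const s64 out_exp = static_cast<s64>(0x7FD - ((in >> 52) & 0x7FF)) << 52;
//   const int index = static_cast<int>(in >> 47) & 0x1F;  // i / 1024
//   const u32 i = static_cast<u32>(in >> 37) & 0x3FF;     // i % 1024
//   const auto& e = Common::fres_expected[index];
//   return bit_cast<double>(sign | out_exp |
//                           (static_cast<s64>(e.m_base - (e.m_dec * i + 1) / 2) << 29));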
void CommonAsmRoutines::GenFres()
{
  const void* start = GetCodePtr();

  // Assume input in XMM0.
  // This function clobbers all three RSCRATCH.
  MOVQ_xmm(R(RSCRATCH), XMM0);

  // Zero inputs set an exception and take the complex path.
  TEST(64, R(RSCRATCH), R(RSCRATCH));
  FixupBranch zero = J_CC(CC_Z);

  MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
  MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
  AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));  // exp
  AND(32, R(RSCRATCH2), Imm32(0x800));       // sign
  SUB(32, R(RSCRATCH_EXTRA), Imm32(895));
  CMP(32, R(RSCRATCH_EXTRA), Imm32(1149 - 895));
  // Take the complex path for very large/small exponents.
  FixupBranch complex = J_CC(CC_AE);  // if (exp < 895 || exp >= 1149)

  SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD - 895));
  NEG(32, R(RSCRATCH_EXTRA));
  OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
  SHL(64, R(RSCRATCH_EXTRA), Imm8(52));  // vali = sign | exponent

  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHR(64, R(RSCRATCH), Imm8(37));
  SHR(64, R(RSCRATCH2), Imm8(47));
  AND(32, R(RSCRATCH), Imm32(0x3FF));  // i % 1024
  AND(32, R(RSCRATCH2), Imm8(0x1F));   // i / 1024

  PUSH(RSCRATCH_EXTRA);
  MOV(64, R(RSCRATCH_EXTRA), ImmPtr(GetConstantFromPool(Common::fres_expected)));
  static_assert(sizeof(Common::BaseAndDec) == 8, "Unable to use SCALE_8; incorrect size");

  IMUL(32, RSCRATCH,
       MComplex(RSCRATCH_EXTRA, RSCRATCH2, SCALE_8, offsetof(Common::BaseAndDec, m_dec)));
  ADD(32, R(RSCRATCH), Imm8(1));
  SHR(32, R(RSCRATCH), Imm8(1));

  MOV(32, R(RSCRATCH2),
      MComplex(RSCRATCH_EXTRA, RSCRATCH2, SCALE_8, offsetof(Common::BaseAndDec, m_base)));
  SUB(32, R(RSCRATCH2), R(RSCRATCH));
  SHL(64, R(RSCRATCH2), Imm8(29));

  POP(RSCRATCH_EXTRA);

  OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));  // vali |= (s64)(fres_expected_base[i / 1024] -
                                            // (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2)
                                            // << 29
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  // Exception flags for zero input.
  SetJumpTarget(zero);
  TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
  FixupBranch skip_set_fx1 = J_CC(CC_NZ);
  OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
  SetJumpTarget(skip_set_fx1);

  SetJumpTarget(complex);
  ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  ABI_CallFunction(Common::ApproximateReciprocal);
  ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_Fres");
}

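// Dolphin stores each CR field as a 64-bit value rather than in the 4-bit PowerPC format
// (see ConditionRegister.h): EQ iff the low 32 bits are zero, GT iff the value is positive
// as an s64, SO iff bit 61 is set, LT iff bit 62 is set. A rough C sketch of the repacking
// done below (hedged; names follow the code):
//
//   u32 mfcr = 0;
//   for (int i = 0; i < 8; i++)
//   {
//     const u64 v = cr.fields[i];
//     mfcr <<= 4;
//     mfcr |= (static_cast<u32>(v) == 0) << 1;  // EQ -> flag bit 1
//     mfcr |= (static_cast<s64>(v) > 0) << 2;   // GT -> flag bit 2
//     mfcr |= m_flagTable[v >> 61];             // SO -> flag bit 0, LT -> flag bit 3
//   }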
void CommonAsmRoutines::GenMfcr()
{
  const void* start = GetCodePtr();

  // Input: none
  // Output: RSCRATCH
  // This function clobbers all three RSCRATCH.
  X64Reg dst = RSCRATCH;
  X64Reg tmp = RSCRATCH2;
  X64Reg cr_val = RSCRATCH_EXTRA;
  XOR(32, R(dst), R(dst));
  for (int i = 0; i < 8; i++)
  {
    static const u32 m_flagTable[8] = {0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9};
    if (i != 0)
      SHL(32, R(dst), Imm8(4));

    MOV(64, R(cr_val), PPCSTATE(cr.fields[i]));

    // Upper bits of tmp need to be zeroed.
    // Note: tmp is used later for address calculations and thus
    // can't be zeroed just once. This also prevents partial
    // register stalls due to SETcc.
    XOR(32, R(tmp), R(tmp));
    // EQ: Bits 31-0 == 0; set flag bit 1
    TEST(32, R(cr_val), R(cr_val));
    SETcc(CC_Z, R(tmp));
    LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));

    // GT: Value > 0; set flag bit 2
    TEST(64, R(cr_val), R(cr_val));
    SETcc(CC_G, R(tmp));
    LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));

    // SO: Bit 61 set; set flag bit 0
    // LT: Bit 62 set; set flag bit 3
    SHR(64, R(cr_val), Imm8(61));
    LEA(64, tmp, MConst(m_flagTable));
    OR(32, R(dst), MComplex(tmp, cr_val, SCALE_4, 0));
  }
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr");
}

// Safe + Fast Quantizers, originally from JITIL by magumagu
alignas(16) static const float m_65535[4] = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
alignas(16) static const float m_32767 = 32767.0f;
alignas(16) static const float m_m32768 = -32768.0f;
alignas(16) static const float m_255 = 255.0f;
alignas(16) static const float m_127 = 127.0f;
alignas(16) static const float m_m128 = -128.0f;
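
// Conceptually, a quantized store writes clamp(trunc(ps * 2^scale)) in the target integer
// type, and a quantized load computes float(mem) * 2^-scale (a hedged sketch of the
// semantics; see the interpreter's paired load/store handling for the authoritative
// version). m_quantizeTableS/m_dequantizeTableS hold those powers of two as pairs of
// floats indexed by the 6-bit GQR scale, so the constants above are simply the clamp
// bounds for each integer type.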

// Sizes of the various quantized store types
constexpr std::array<u8, 8> sizes{{32, 0, 0, 0, 8, 16, 8, 16}};

void CommonAsmRoutines::GenQuantizedStores()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_stXX).
  paired_store_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
  {
    paired_store_quantized[type] =
        GenQuantizedStoreRuntime(false, static_cast<EQuantizeType>(type));
  }
}

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_stXX).
  single_store_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
    single_store_quantized[type] = GenQuantizedStoreRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType type)
{
  const void* start = GetCodePtr();
  const u8* load = AlignCode4();
  GenQuantizedStore(single, type, -1);
  RET();
  JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore_%i_%i", type, single);

  return load;
}

void CommonAsmRoutines::GenQuantizedLoads()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_lXX).
  paired_load_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
    paired_load_quantized[type] = GenQuantizedLoadRuntime(false, static_cast<EQuantizeType>(type));
}

void CommonAsmRoutines::GenQuantizedSingleLoads()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_lXX).
  single_load_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
    single_load_quantized[type] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type)
{
  const void* start = GetCodePtr();
  const u8* load = AlignCode4();
  GenQuantizedLoad(single, type, -1);
  RET();
  JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad_%i_%i", type, single);

  return load;
}

void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type, int quantize)
{
  // In: one or two single floats in XMM0; if quantize is -1, a quantization factor in RSCRATCH2
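  // (For the non-inline, ahead-of-time variants, the factor in RSCRATCH2 is assumed to be
  // the GQR scale in the second-lowest byte, i.e. 0x0000xx00; see GenQuantizedLoad below.)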

  int size = sizes[type] * (single ? 1 : 2);
  bool isInline = quantize != -1;

  // illegal
  if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
  {
    UD2();
    return;
  }

  if (type == QUANTIZE_FLOAT)
  {
    GenQuantizedStoreFloat(single, isInline);
  }
  else if (single)
  {
    if (quantize == -1)
    {
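      // (scale << 8) >> 5 == scale * 8, the byte offset of this scale's pair of floats
      // in m_quantizeTableS.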
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_quantizeTableS));
      MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
    }
    else if (quantize > 0)
    {
      MULSS(XMM0, MConst(m_quantizeTableS, quantize * 2));
    }

    switch (type)
    {
    case QUANTIZE_U8:
      XORPS(XMM1, R(XMM1));
      MAXSS(XMM0, R(XMM1));
      MINSS(XMM0, MConst(m_255));
      break;
    case QUANTIZE_S8:
      MAXSS(XMM0, MConst(m_m128));
      MINSS(XMM0, MConst(m_127));
      break;
    case QUANTIZE_U16:
      XORPS(XMM1, R(XMM1));
      MAXSS(XMM0, R(XMM1));
      MINSS(XMM0, MConst(m_65535));
      break;
    case QUANTIZE_S16:
      MAXSS(XMM0, MConst(m_m32768));
      MINSS(XMM0, MConst(m_32767));
      break;
    default:
      break;
    }

    CVTTSS2SI(RSCRATCH, R(XMM0));
  }
  else
  {
    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_quantizeTableS));
      MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
      MULPS(XMM0, R(XMM1));
    }
    else if (quantize > 0)
    {
      MOVQ_xmm(XMM1, MConst(m_quantizeTableS, quantize * 2));
      MULPS(XMM0, R(XMM1));
    }

    bool hasPACKUSDW = cpu_info.bSSE4_1;

    // Special case: if we don't have PACKUSDW, we need to clamp to zero as well, so the
    // shuffle below can work
    if (type == QUANTIZE_U16 && !hasPACKUSDW)
    {
      XORPS(XMM1, R(XMM1));
      MAXPS(XMM0, R(XMM1));
    }

    // According to the Intel docs, CVTPS2DQ writes 0x80000000 if the source floating-point
    // value is out of int32 range. While that's OK for large negatives, it isn't for
    // positives. I don't know whether the overflow actually happens in any games, but it
    // potentially can cause problems, so we need some clamping.
    MINPS(XMM0, MConst(m_65535));
    CVTTPS2DQ(XMM0, R(XMM0));

    switch (type)
    {
    case QUANTIZE_U8:
      PACKSSDW(XMM0, R(XMM0));
      PACKUSWB(XMM0, R(XMM0));
      MOVD_xmm(R(RSCRATCH), XMM0);
      break;
    case QUANTIZE_S8:
      PACKSSDW(XMM0, R(XMM0));
      PACKSSWB(XMM0, R(XMM0));
      MOVD_xmm(R(RSCRATCH), XMM0);
      break;
    case QUANTIZE_U16:
      if (hasPACKUSDW)
      {
        PACKUSDW(XMM0, R(XMM0));         // AAAABBBB CCCCDDDD ... -> AABBCCDD ...
        MOVD_xmm(R(RSCRATCH), XMM0);     // AABBCCDD ... -> AABBCCDD
        BSWAP(32, RSCRATCH);             // AABBCCDD -> DDCCBBAA
        ROL(32, R(RSCRATCH), Imm8(16));  // DDCCBBAA -> BBAADDCC
      }
      else
      {
        // We don't have PACKUSDW so we'll shuffle instead (assumes 32-bit values >= 0 and < 65536)
        PSHUFLW(XMM0, R(XMM0), 2);    // AABB0000 CCDD0000 ... -> CCDDAABB ...
        MOVD_xmm(R(RSCRATCH), XMM0);  // CCDDAABB ... -> CCDDAABB
        BSWAP(32, RSCRATCH);          // CCDDAABB -> BBAADDCC
      }
      break;
    case QUANTIZE_S16:
      PACKSSDW(XMM0, R(XMM0));
      MOVD_xmm(R(RSCRATCH), XMM0);
      BSWAP(32, RSCRATCH);
      ROL(32, R(RSCRATCH), Imm8(16));
      break;
    default:
      break;
    }
  }

  int flags = isInline ? 0 :
                         SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG |
                             SAFE_LOADSTORE_DR_ON | SAFE_LOADSTORE_NO_UPDATE_PC;
  if (!single)
    flags |= SAFE_LOADSTORE_NO_SWAP;

  SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, size, 0, QUANTIZED_REGS_TO_SAVE, flags);
}

void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
{
  if (single)
  {
    // Easy!
    MOVD_xmm(R(RSCRATCH), XMM0);
  }
  else
  {
    if (cpu_info.bSSSE3)
    {
      PSHUFB(XMM0, MConst(pbswapShuffle2x4));
      MOVQ_xmm(R(RSCRATCH), XMM0);
    }
    else
    {
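      // No PSHUFB available: rotate the halves first so BSWAP(64) byte-swaps each 32-bit
      // float in place instead of also exchanging ps0 and ps1.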
      MOVQ_xmm(R(RSCRATCH), XMM0);
      ROL(64, R(RSCRATCH), Imm8(32));
      BSWAP(64, RSCRATCH);
    }
  }
}

void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, int quantize)
{
  // Note that this method assumes that inline methods know the value of quantize ahead of
  // time. The methods generated AOT assume that the quantize flag is placed in RSCRATCH2 in
  // the second-lowest byte, ie: 0x0000xx00

  int size = sizes[type] * (single ? 1 : 2);
  bool isInline = quantize != -1;
  bool safe_access = m_jit.jo.memcheck || !m_jit.jo.fastmem;

  // illegal
  if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
  {
    UD2();
    return;
  }

  // Floats don't use quantization and can generate more optimal code
  if (type == QUANTIZE_FLOAT)
  {
    GenQuantizedLoadFloat(single, isInline);
    return;
  }

  bool extend = single && (type == QUANTIZE_S8 || type == QUANTIZE_S16);

  if (safe_access)
  {
    BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE_LOAD;
    int flags = isInline ? 0 :
                           SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG |
                               SAFE_LOADSTORE_DR_ON | SAFE_LOADSTORE_NO_UPDATE_PC;
    SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
    if (!single && (type == QUANTIZE_U8 || type == QUANTIZE_S8))
    {
      // TODO: Support not swapping in SafeLoadToReg to avoid bswapping twice
      ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
    }
  }
  else
  {
    switch (type)
    {
    case QUANTIZE_U8:
    case QUANTIZE_S8:
      UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
      break;
    case QUANTIZE_U16:
    case QUANTIZE_S16:
      UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
      break;
    default:
      break;
    }
  }

  if (single)
  {
    CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
      MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
    }
    else if (quantize > 0)
    {
      MULSS(XMM0, MConst(m_dequantizeTableS, quantize * 2));
    }
    UNPCKLPS(XMM0, MConst(m_one));
  }
  else
  {
    switch (type)
    {
    case QUANTIZE_U8:
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVZXBD(XMM0, R(XMM0));
      }
      else
      {
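        // Zero-extend the two u8 values to 32-bit lanes by interleaving with zeros
        // (bytes -> words, then words -> dwords).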
        PXOR(XMM1, R(XMM1));
        PUNPCKLBW(XMM0, R(XMM1));
        PUNPCKLWD(XMM0, R(XMM1));
      }
      break;
    case QUANTIZE_S8:
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVSXBD(XMM0, R(XMM0));
      }
      else
      {
        PUNPCKLBW(XMM0, R(XMM0));
        PUNPCKLWD(XMM0, R(XMM0));
        PSRAD(XMM0, 24);
      }
      break;
    case QUANTIZE_U16:
      ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVZXWD(XMM0, R(XMM0));
      }
      else
      {
        PXOR(XMM1, R(XMM1));
        PUNPCKLWD(XMM0, R(XMM1));
      }
      break;
    case QUANTIZE_S16:
      ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVSXWD(XMM0, R(XMM0));
      }
      else
      {
        PUNPCKLWD(XMM0, R(XMM0));
        PSRAD(XMM0, 16);
      }
      break;
    default:
      break;
    }
    CVTDQ2PS(XMM0, R(XMM0));

    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
      MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
      MULPS(XMM0, R(XMM1));
    }
    else if (quantize > 0)
    {
      MOVQ_xmm(XMM1, MConst(m_dequantizeTableS, quantize * 2));
      MULPS(XMM0, R(XMM1));
    }
  }
}

void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
{
  int size = single ? 32 : 64;
  bool extend = false;
  bool safe_access = m_jit.jo.memcheck || !m_jit.jo.fastmem;

  if (safe_access)
  {
    BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE;
    int flags = isInline ? 0 :
                           SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG |
                               SAFE_LOADSTORE_DR_ON | SAFE_LOADSTORE_NO_UPDATE_PC;
    SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
  }

  if (single)
  {
    if (safe_access)
    {
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
    else if (cpu_info.bSSSE3)
    {
      MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
      PSHUFB(XMM0, MConst(pbswapShuffle1x4));
    }
    else
    {
      LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    }

    UNPCKLPS(XMM0, MConst(m_one));
  }
  else
  {
    // FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
    // they don't need hardware access handling. This will definitely crash if paired loads
    // occur from non-RAM areas, but as far as I know, this never happens. I don't know if
    // this is for a good reason, or merely because no game does this.
    // If we find something that actually does do this, maybe this should be changed. How
    // much of a performance hit would it be?
    if (safe_access)
    {
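      // SafeLoadToReg already byte-swapped the whole 64-bit value, which leaves ps0 in
      // the upper half; rotate so ps0 ends up in the low lane of XMM0.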
      ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
      MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
    else if (cpu_info.bSSSE3)
    {
      MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
      PSHUFB(XMM0, MConst(pbswapShuffle2x4));
    }
    else
    {
      LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
      ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
      MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
  }
}
