// Copyright 2008 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"

#include <array>

#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/FloatUtils.h"
#include "Common/Intrinsics.h"
#include "Common/JitRegister.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64Common/Jit64Constants.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
#include "Core/PowerPC/PowerPC.h"

#define QUANTIZED_REGS_TO_SAVE                                                                     \
  (ABI_ALL_CALLER_SAVED & ~BitSet32{RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0 + 16, XMM1 + 16})

#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32{RSCRATCH2})
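// Note: in these register bitsets, GPRs occupy bits 0-15 and XMM registers bits 16-31, which is
// why the XMM entries above are written as "XMM0 + 16" and "XMM1 + 16".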

using namespace Gen;

alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);

// Since the following float conversion functions are used in non-arithmetic PPC float
// instructions, they must convert floats bit-exactly and never flush denormals to zero or turn
// SNaNs into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
// flush-to-zero, so we can use FLD+FSTP even on denormals.
// If the number is a NaN, make sure to set the QNaN bit back to its original value.

// Another problem is that officially, converting a double that doesn't fit in single format
// results in undefined behavior. Relying on undefined behavior is a bug, so no software should
// ever do this. Super Mario 64 (on Wii VC) accidentally relies on this behavior. See issue #11173.

// This is the same algorithm used in the interpreter (and actual hardware).
// The documentation states that the conversion of a double with a value outside the
// valid range for a single (or a single denormal) is undefined.
// But testing on actual hardware shows it always picks bits 0..1 and 5..34
// unless the exponent is in the range of 874 to 896.

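// A rough C sketch of the conversion this routine performs, derived from the comments above and
// the emitted code below (hedged; not a verbatim copy of the interpreter's helper):
//
//   u32 ConvertDoubleToSingleSketch(u64 x)  // hypothetical name, for illustration only
//   {
//     const u32 exp = (x >> 52) & 0x7ff;
//     if (exp >= 874 && exp <= 896)  // the double maps onto a single denormal
//     {
//       u32 t = 0x80000000 | static_cast<u32>((x & 0x000fffffffffffffULL) >> 21);
//       t >>= 905 - exp;
//       return t | static_cast<u32>((x >> 32) & 0x80000000);  // re-attach the sign
//     }
//     // Otherwise keep bits 0..1 and 5..34 of the double.
//     return static_cast<u32>(((x >> 32) & 0xc0000000) | ((x >> 29) & 0x3fffffff));
//   }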
void CommonAsmRoutines::GenConvertDoubleToSingle()
{
  // Input in XMM0, output to RSCRATCH
  // Clobbers RSCRATCH/RSCRATCH2/XMM0/XMM1

  const void* start = GetCodePtr();

  // Grab exponent
  MOVQ_xmm(R(RSCRATCH), XMM0);
  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHR(64, R(RSCRATCH), Imm8(52));
  AND(16, R(RSCRATCH), Imm16(0x7ff));

  // Check if the double is in the range of a valid single subnormal
  SUB(16, R(RSCRATCH), Imm16(874));
  CMP(16, R(RSCRATCH), Imm16(896 - 874));
  FixupBranch Denormalize = J_CC(CC_NA);

  // Don't denormalize

  if (cpu_info.bFastBMI2)
  {
    // Extract bits 0-1 and 5-34
    MOV(64, R(RSCRATCH), Imm64(0xc7ffffffe0000000));
    PEXT(64, RSCRATCH, RSCRATCH2, R(RSCRATCH));
  }
  else
  {
    // We want bits 0, 1
    avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM1, R(XMM0), MConst(double_top_two_bits));
    PSRLQ(XMM1, 32);

    // And 5 through to 34
    PAND(XMM0, MConst(double_bottom_bits));
    PSRLQ(XMM0, 29);

    // OR them together
    POR(XMM0, R(XMM1));
    MOVD_xmm(R(RSCRATCH), XMM0);
  }
  RET();

  // Denormalize
  SetJumpTarget(Denormalize);

  // shift = (905 - exponent) plus the 21-bit double-to-single shift
  NEG(16, R(RSCRATCH));
  ADD(16, R(RSCRATCH), Imm16((905 + 21) - 874));
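  // (RSCRATCH still holds exponent - 874 from the range check above, so negating it and adding
  // (905 + 21) - 874 leaves (905 - exponent) + 21 in RSCRATCH.)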
  MOVQ_xmm(XMM1, R(RSCRATCH));

  // XMM0 = fraction | 0x0010000000000000
  PAND(XMM0, MConst(double_fraction));
  POR(XMM0, MConst(double_explicit_top_bit));

  // fraction >> shift
  PSRLQ(XMM0, R(XMM1));
  MOVD_xmm(R(RSCRATCH), XMM0);

  // OR the sign bit in.
  SHR(64, R(RSCRATCH2), Imm8(32));
  AND(32, R(RSCRATCH2), Imm32(0x80000000));

  OR(32, R(RSCRATCH), R(RSCRATCH2));
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_cdts");
}

void CommonAsmRoutines::GenFrsqrte()
{
  const void* start = GetCodePtr();

  // Assume input in XMM0.
  // This function clobbers all three RSCRATCH.
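  // Fast-path sketch in rough C, pieced together from the inline comments below (hedged; not a
  // verbatim copy of Common::ApproximateReciprocalSquareRoot):
  //
  //   s64 vali = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
  //   int i = static_cast<int>(mantissa >> 37);        // top 15 fraction bits
  //   int index = i / 2048 + (odd_exponent ? 16 : 0);
  //   vali |= (s64)(frsqrte_expected[index].m_base -
  //                 frsqrte_expected[index].m_dec * (i % 2048)) << 26;
  //   return vali reinterpreted as a double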
  MOVQ_xmm(R(RSCRATCH), XMM0);

  // Extract exponent
  MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHR(64, R(RSCRATCH_EXTRA), Imm8(52));

  // Negatives, zeros, denormals, infinities and NaNs take the complex path.
  LEA(32, RSCRATCH2, MDisp(RSCRATCH_EXTRA, -1));
  CMP(32, R(RSCRATCH2), Imm32(0x7FE));
  FixupBranch complex = J_CC(CC_AE, true);

  SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
  SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
  MOV(32, R(RSCRATCH2), Imm32(0x3FF));
  SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
  SHL(64, R(RSCRATCH2), Imm8(52));  // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) /
                                    // 2)) & (0x7FFLL << 52);

  MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
  AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
  XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10));  // int index = i / 2048 + (odd_exponent ? 16 : 0);
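  // (Bits 48-51 of the double are the top four mantissa bits, i.e. i / 2048, and bit 52 is the
  // lowest exponent bit; the XOR with 0x10 flips that bit into the "odd_exponent ? 16 : 0" term.)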

  PUSH(RSCRATCH2);
  MOV(64, R(RSCRATCH2), ImmPtr(GetConstantFromPool(Common::frsqrte_expected)));
  static_assert(sizeof(Common::BaseAndDec) == 8, "Unable to use SCALE_8; incorrect size");

  SHR(64, R(RSCRATCH), Imm8(37));
  AND(32, R(RSCRATCH), Imm32(0x7FF));
  IMUL(32, RSCRATCH,
       MComplex(RSCRATCH2, RSCRATCH_EXTRA, SCALE_8, offsetof(Common::BaseAndDec, m_dec)));
  MOV(32, R(RSCRATCH_EXTRA),
      MComplex(RSCRATCH2, RSCRATCH_EXTRA, SCALE_8, offsetof(Common::BaseAndDec, m_base)));
  SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHL(64, R(RSCRATCH_EXTRA), Imm8(26));

  POP(RSCRATCH2);
  OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));  // vali |= (s64)(frsqrte_expected_base[index] -
                                            // frsqrte_expected_dec[index] * (i % 2048)) << 26;
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  SetJumpTarget(complex);
  AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
  CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
  FixupBranch nan_or_inf = J_CC(CC_E);

  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHL(64, R(RSCRATCH2), Imm8(1));
  FixupBranch nonzero = J_CC(CC_NZ);

  // +0.0 or -0.0
  TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
  FixupBranch skip_set_fx1 = J_CC(CC_NZ);
  OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
  SetJumpTarget(skip_set_fx1);
  MOV(64, R(RSCRATCH2), Imm64(0x7FF0'0000'0000'0000));
  OR(64, R(RSCRATCH2), R(RSCRATCH));
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  // SNaN or QNaN or +Inf or -Inf
  SetJumpTarget(nan_or_inf);
  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHL(64, R(RSCRATCH2), Imm8(12));
  FixupBranch inf = J_CC(CC_Z);
  BTS(64, R(RSCRATCH), Imm8(51));
  MOVQ_xmm(XMM0, R(RSCRATCH));
  RET();
  SetJumpTarget(inf);
  TEST(64, R(RSCRATCH), R(RSCRATCH));
  FixupBranch negative = J_CC(CC_S);
  XORPD(XMM0, R(XMM0));
  RET();

  SetJumpTarget(nonzero);
  FixupBranch denormal = J_CC(CC_NC);
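  // (The SHL(64, RSCRATCH2, 1) above left the sign bit in CF: carry clear means a positive
  // denormal, carry set falls through to the negative-input path below.)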

  // Negative sign
  SetJumpTarget(negative);
  TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
  FixupBranch skip_set_fx2 = J_CC(CC_NZ);
  OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
  SetJumpTarget(skip_set_fx2);
  MOV(64, R(RSCRATCH2), Imm64(0x7FF8'0000'0000'0000));
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  SetJumpTarget(denormal);
  ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  ABI_CallFunction(Common::ApproximateReciprocalSquareRoot);
  ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_Frsqrte");
}

void CommonAsmRoutines::GenFres()
{
  const void* start = GetCodePtr();

  // Assume input in XMM0.
  // This function clobbers all three RSCRATCH.
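  // Fast-path sketch in rough C, pieced together from the inline comments below (hedged; not a
  // verbatim copy of Common::ApproximateReciprocal):
  //
  //   s64 vali = sign | (static_cast<s64>(0x7FD - exp) << 52);  // exp = biased 11-bit exponent
  //   int i = static_cast<int>(mantissa >> 37);                 // top 15 fraction bits
  //   vali |= (s64)(fres_expected[i / 1024].m_base -
  //                 (fres_expected[i / 1024].m_dec * (i % 1024) + 1) / 2) << 29;
  //   return vali reinterpreted as a double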
  MOVQ_xmm(R(RSCRATCH), XMM0);

  // Zero inputs set an exception and take the complex path.
  TEST(64, R(RSCRATCH), R(RSCRATCH));
  FixupBranch zero = J_CC(CC_Z);

  MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
  SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
  MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
  AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));  // exp
  AND(32, R(RSCRATCH2), Imm32(0x800));       // sign
  SUB(32, R(RSCRATCH_EXTRA), Imm32(895));
  CMP(32, R(RSCRATCH_EXTRA), Imm32(1149 - 895));
  // Take the complex path for very large/small exponents.
  FixupBranch complex = J_CC(CC_AE);  // if (exp < 895 || exp >= 1149)

  SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD - 895));
  NEG(32, R(RSCRATCH_EXTRA));
  OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
  SHL(64, R(RSCRATCH_EXTRA), Imm8(52));  // vali = sign | exponent

  MOV(64, R(RSCRATCH2), R(RSCRATCH));
  SHR(64, R(RSCRATCH), Imm8(37));
  SHR(64, R(RSCRATCH2), Imm8(47));
  AND(32, R(RSCRATCH), Imm32(0x3FF));  // i % 1024
  AND(32, R(RSCRATCH2), Imm8(0x1F));   // i / 1024

  PUSH(RSCRATCH_EXTRA);
  MOV(64, R(RSCRATCH_EXTRA), ImmPtr(GetConstantFromPool(Common::fres_expected)));
  static_assert(sizeof(Common::BaseAndDec) == 8, "Unable to use SCALE_8; incorrect size");

  IMUL(32, RSCRATCH,
       MComplex(RSCRATCH_EXTRA, RSCRATCH2, SCALE_8, offsetof(Common::BaseAndDec, m_dec)));
  ADD(32, R(RSCRATCH), Imm8(1));
  SHR(32, R(RSCRATCH), Imm8(1));

  MOV(32, R(RSCRATCH2),
      MComplex(RSCRATCH_EXTRA, RSCRATCH2, SCALE_8, offsetof(Common::BaseAndDec, m_base)));
  SUB(32, R(RSCRATCH2), R(RSCRATCH));
  SHL(64, R(RSCRATCH2), Imm8(29));

  POP(RSCRATCH_EXTRA);

  OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));  // vali |= (s64)(fres_expected_base[i / 1024] -
                                            // (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2)
                                            // << 29
  MOVQ_xmm(XMM0, R(RSCRATCH2));
  RET();

  // Exception flags for zero input.
  SetJumpTarget(zero);
  TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
  FixupBranch skip_set_fx1 = J_CC(CC_NZ);
  OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
  SetJumpTarget(skip_set_fx1);

  SetJumpTarget(complex);
  ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  ABI_CallFunction(Common::ApproximateReciprocal);
  ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_Fres");
}

void CommonAsmRoutines::GenMfcr()
{
  const void* start = GetCodePtr();

  // Input: none
  // Output: RSCRATCH
  // This function clobbers all three RSCRATCH.
  X64Reg dst = RSCRATCH;
  X64Reg tmp = RSCRATCH2;
  X64Reg cr_val = RSCRATCH_EXTRA;
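  // Rough per-field sketch in C (hedged; mirrors the comments inside the loop below):
  //
  //   u32 flags = m_flagTable[(cr_val >> 61) & 7];  // SO (bit 61) -> 0x1, LT (bit 62) -> 0x8
  //   if ((u32)cr_val == 0) flags |= 0x2;           // EQ
  //   if ((s64)cr_val > 0)  flags |= 0x4;           // GT
  //   dst = (dst << 4) | flags;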
  XOR(32, R(dst), R(dst));
  for (int i = 0; i < 8; i++)
  {
    static const u32 m_flagTable[8] = {0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9};
    if (i != 0)
      SHL(32, R(dst), Imm8(4));

    MOV(64, R(cr_val), PPCSTATE(cr.fields[i]));

    // Upper bits of tmp need to be zeroed.
    // Note: tmp is used later for address calculations and thus
    //       can't be zeroed just once. This also prevents partial
    //       register stalls due to SETcc.
    XOR(32, R(tmp), R(tmp));
    // EQ: Bits 31-0 == 0; set flag bit 1
    TEST(32, R(cr_val), R(cr_val));
    SETcc(CC_Z, R(tmp));
    LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));

    // GT: Value > 0; set flag bit 2
    TEST(64, R(cr_val), R(cr_val));
    SETcc(CC_G, R(tmp));
    LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));

    // SO: Bit 61 set; set flag bit 0
    // LT: Bit 62 set; set flag bit 3
    SHR(64, R(cr_val), Imm8(61));
    LEA(64, tmp, MConst(m_flagTable));
    OR(32, R(dst), MComplex(tmp, cr_val, SCALE_4, 0));
  }
  RET();

  JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr");
}

// Safe + Fast Quantizers, originally from JITIL by magumagu
alignas(16) static const float m_65535[4] = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
alignas(16) static const float m_32767 = 32767.0f;
alignas(16) static const float m_m32768 = -32768.0f;
alignas(16) static const float m_255 = 255.0f;
alignas(16) static const float m_127 = 127.0f;
alignas(16) static const float m_m128 = -128.0f;

// Sizes (in bits) of the various quantized load/store types, indexed by EQuantizeType
constexpr std::array<u8, 8> sizes{{32, 0, 0, 0, 8, 16, 8, 16}};

void CommonAsmRoutines::GenQuantizedStores()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_stXX).
  paired_store_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
  {
    paired_store_quantized[type] =
        GenQuantizedStoreRuntime(false, static_cast<EQuantizeType>(type));
  }
}

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_stXX).
  single_store_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
    single_store_quantized[type] = GenQuantizedStoreRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType type)
{
  const void* start = GetCodePtr();
  const u8* load = AlignCode4();
  GenQuantizedStore(single, type, -1);
  RET();
  JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore_%i_%i", type, single);

  return load;
}

void CommonAsmRoutines::GenQuantizedLoads()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_lXX).
  paired_load_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
    paired_load_quantized[type] = GenQuantizedLoadRuntime(false, static_cast<EQuantizeType>(type));
}

void CommonAsmRoutines::GenQuantizedSingleLoads()
{
  // Aligned to 256 bytes as the least significant byte needs to be zero (See: Jit64::psq_lXX).
  single_load_quantized = reinterpret_cast<const u8**>(AlignCodeTo(256));
  ReserveCodeSpace(8 * sizeof(u8*));

  for (int type = 0; type < 8; type++)
    single_load_quantized[type] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type)
{
  const void* start = GetCodePtr();
  const u8* load = AlignCode4();
  GenQuantizedLoad(single, type, -1);
  RET();
  JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad_%i_%i", type, single);

  return load;
}

void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type, int quantize)
{
  // In: one or two single floats in XMM0; if quantize is -1, a quantization factor in RSCRATCH2

  int size = sizes[type] * (single ? 1 : 2);
  bool isInline = quantize != -1;

  // illegal
  if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
  {
    UD2();
    return;
  }

  if (type == QUANTIZE_FLOAT)
  {
    GenQuantizedStoreFloat(single, isInline);
  }
  else if (single)
  {
    if (quantize == -1)
    {
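      // RSCRATCH2 holds the quantization factor in its second-lowest byte (0x0000xx00); each
      // m_quantizeTableS entry is a pair of floats (8 bytes), so the shift right by 5 turns
      // (factor << 8) into factor * 8, the byte offset of that entry.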
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_quantizeTableS));
      MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
    }
    else if (quantize > 0)
    {
      MULSS(XMM0, MConst(m_quantizeTableS, quantize * 2));
    }

    switch (type)
    {
    case QUANTIZE_U8:
      XORPS(XMM1, R(XMM1));
      MAXSS(XMM0, R(XMM1));
      MINSS(XMM0, MConst(m_255));
      break;
    case QUANTIZE_S8:
      MAXSS(XMM0, MConst(m_m128));
      MINSS(XMM0, MConst(m_127));
      break;
    case QUANTIZE_U16:
      XORPS(XMM1, R(XMM1));
      MAXSS(XMM0, R(XMM1));
      MINSS(XMM0, MConst(m_65535));
      break;
    case QUANTIZE_S16:
      MAXSS(XMM0, MConst(m_m32768));
      MINSS(XMM0, MConst(m_32767));
      break;
    default:
      break;
    }

    CVTTSS2SI(RSCRATCH, R(XMM0));
  }
  else
  {
    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_quantizeTableS));
      MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
      MULPS(XMM0, R(XMM1));
    }
    else if (quantize > 0)
    {
      MOVQ_xmm(XMM1, MConst(m_quantizeTableS, quantize * 2));
      MULPS(XMM0, R(XMM1));
    }

    bool hasPACKUSDW = cpu_info.bSSE4_1;

    // Special case: if we don't have PACKUSDW, we need to clamp to zero as well so the shuffle
    // below can work.
    if (type == QUANTIZE_U16 && !hasPACKUSDW)
    {
      XORPS(XMM1, R(XMM1));
      MAXPS(XMM0, R(XMM1));
    }

    // According to the Intel docs, CVTPS2DQ (and CVTTPS2DQ, used below) writes 0x80000000 if the
    // source floating-point value is out of int32 range. While that's OK for large negatives, it
    // isn't for positives. I don't know whether the overflow actually happens in any games, but
    // it potentially can cause problems, so we need some clamping.
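    // (For example, without the MINPS below a huge positive input in the QUANTIZE_U16 case would
    // become 0x80000000, which the unsigned pack would then saturate to 0 instead of 65535.)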
    MINPS(XMM0, MConst(m_65535));
    CVTTPS2DQ(XMM0, R(XMM0));

    switch (type)
    {
    case QUANTIZE_U8:
      PACKSSDW(XMM0, R(XMM0));
      PACKUSWB(XMM0, R(XMM0));
      MOVD_xmm(R(RSCRATCH), XMM0);
      break;
    case QUANTIZE_S8:
      PACKSSDW(XMM0, R(XMM0));
      PACKSSWB(XMM0, R(XMM0));
      MOVD_xmm(R(RSCRATCH), XMM0);
      break;
    case QUANTIZE_U16:
      if (hasPACKUSDW)
      {
        PACKUSDW(XMM0, R(XMM0));         // AAAABBBB CCCCDDDD ... -> AABBCCDD ...
        MOVD_xmm(R(RSCRATCH), XMM0);     // AABBCCDD ... -> AABBCCDD
        BSWAP(32, RSCRATCH);             // AABBCCDD -> DDCCBBAA
        ROL(32, R(RSCRATCH), Imm8(16));  // DDCCBBAA -> BBAADDCC
      }
      else
      {
        // We don't have PACKUSDW so we'll shuffle instead (assumes 32-bit values >= 0 and < 65536)
        PSHUFLW(XMM0, R(XMM0), 2);    // AABB0000 CCDD0000 ... -> CCDDAABB ...
        MOVD_xmm(R(RSCRATCH), XMM0);  // CCDDAABB ... -> CCDDAABB
        BSWAP(32, RSCRATCH);          // CCDDAABB -> BBAADDCC
      }
      break;
    case QUANTIZE_S16:
      PACKSSDW(XMM0, R(XMM0));
      MOVD_xmm(R(RSCRATCH), XMM0);
      BSWAP(32, RSCRATCH);
      ROL(32, R(RSCRATCH), Imm8(16));
      break;
    default:
      break;
    }
  }

  int flags = isInline ? 0 :
                         SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG |
                             SAFE_LOADSTORE_DR_ON | SAFE_LOADSTORE_NO_UPDATE_PC;
  if (!single)
    flags |= SAFE_LOADSTORE_NO_SWAP;

  SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, size, 0, QUANTIZED_REGS_TO_SAVE, flags);
}

void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
{
  if (single)
  {
    // Easy!
    MOVD_xmm(R(RSCRATCH), XMM0);
  }
  else
  {
    if (cpu_info.bSSSE3)
    {
      PSHUFB(XMM0, MConst(pbswapShuffle2x4));
      MOVQ_xmm(R(RSCRATCH), XMM0);
    }
    else
    {
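      // Byte-swap each float while keeping the pair order: ROL by 32 exchanges the two 32-bit
      // halves, then BSWAP(64) reverses all eight bytes, swapping the bytes within each half and
      // undoing the exchange.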
      MOVQ_xmm(R(RSCRATCH), XMM0);
      ROL(64, R(RSCRATCH), Imm8(32));
      BSWAP(64, RSCRATCH);
    }
  }
}

void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, int quantize)
{
  // Note that this method assumes that inline methods know the value of quantize ahead of
  // time. The methods generated AOT assume that the quantize value is placed in RSCRATCH2's
  // second-lowest byte, i.e. 0x0000xx00.

  int size = sizes[type] * (single ? 1 : 2);
  bool isInline = quantize != -1;
  bool safe_access = m_jit.jo.memcheck || !m_jit.jo.fastmem;

  // illegal
  if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
  {
    UD2();
    return;
  }

  // Floats don't use quantization and can generate more optimal code
  if (type == QUANTIZE_FLOAT)
  {
    GenQuantizedLoadFloat(single, isInline);
    return;
  }

  bool extend = single && (type == QUANTIZE_S8 || type == QUANTIZE_S16);

  if (safe_access)
  {
    BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE_LOAD;
    int flags = isInline ? 0 :
                           SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG |
                               SAFE_LOADSTORE_DR_ON | SAFE_LOADSTORE_NO_UPDATE_PC;
    SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
    if (!single && (type == QUANTIZE_U8 || type == QUANTIZE_S8))
    {
      // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
      ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
    }
  }
  else
  {
    switch (type)
    {
    case QUANTIZE_U8:
    case QUANTIZE_S8:
      UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
      break;
    case QUANTIZE_U16:
    case QUANTIZE_S16:
      UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
      break;
    default:
      break;
    }
  }

  if (single)
  {
    CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
      MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
    }
    else if (quantize > 0)
    {
      MULSS(XMM0, MConst(m_dequantizeTableS, quantize * 2));
    }
    UNPCKLPS(XMM0, MConst(m_one));
  }
  else
  {
    switch (type)
    {
    case QUANTIZE_U8:
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVZXBD(XMM0, R(XMM0));
      }
      else
      {
        PXOR(XMM1, R(XMM1));
        PUNPCKLBW(XMM0, R(XMM1));
        PUNPCKLWD(XMM0, R(XMM1));
      }
      break;
    case QUANTIZE_S8:
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVSXBD(XMM0, R(XMM0));
      }
      else
      {
        PUNPCKLBW(XMM0, R(XMM0));
        PUNPCKLWD(XMM0, R(XMM0));
        PSRAD(XMM0, 24);
      }
      break;
    case QUANTIZE_U16:
      ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVZXWD(XMM0, R(XMM0));
      }
      else
      {
        PXOR(XMM1, R(XMM1));
        PUNPCKLWD(XMM0, R(XMM1));
      }
      break;
    case QUANTIZE_S16:
      ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
      if (cpu_info.bSSE4_1)
      {
        PMOVSXWD(XMM0, R(XMM0));
      }
      else
      {
        PUNPCKLWD(XMM0, R(XMM0));
        PSRAD(XMM0, 16);
      }
      break;
    default:
      break;
    }
    CVTDQ2PS(XMM0, R(XMM0));

    if (quantize == -1)
    {
      SHR(32, R(RSCRATCH2), Imm8(5));
      LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
      MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
      MULPS(XMM0, R(XMM1));
    }
    else if (quantize > 0)
    {
      MOVQ_xmm(XMM1, MConst(m_dequantizeTableS, quantize * 2));
      MULPS(XMM0, R(XMM1));
    }
  }
}

void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
{
  int size = single ? 32 : 64;
  bool extend = false;
  bool safe_access = m_jit.jo.memcheck || !m_jit.jo.fastmem;

  if (safe_access)
  {
    BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE;
    int flags = isInline ? 0 :
                           SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG |
                               SAFE_LOADSTORE_DR_ON | SAFE_LOADSTORE_NO_UPDATE_PC;
    SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
  }

  if (single)
  {
    if (safe_access)
    {
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
    else if (cpu_info.bSSSE3)
    {
      MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
      PSHUFB(XMM0, MConst(pbswapShuffle1x4));
    }
    else
    {
      LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
      MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    }

    UNPCKLPS(XMM0, MConst(m_one));
  }
  else
  {
    // FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
    // that they don't need hardware access handling. This will definitely crash if paired loads
    // occur from non-RAM areas, but as far as I know, this never happens. I don't know if this is
    // for a good reason, or merely because no game does this.
    // If we find something that actually does do this, maybe this should be changed. How
    // much of a performance hit would it be?
    if (safe_access)
    {
      ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
      MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
    else if (cpu_info.bSSSE3)
    {
      MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
      PSHUFB(XMM0, MConst(pbswapShuffle2x4));
    }
    else
    {
      LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
      ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
      MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
  }
}