1 // Copyright (c) 2013- PPSSPP Project.
2
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 // GNU General Public License 2.0 for more details.
11
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18 #include "ppsspp_config.h"
19 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
20
21 #include <emmintrin.h>
22
23 #include "Common/CPUDetect.h"
24 #include "Core/Config.h"
25 #include "Core/Reporting.h"
26 #include "GPU/GPUState.h"
27 #include "GPU/Common/VertexDecoderCommon.h"
28
29 // We start out by converting the active matrices into 4x4 which are easier to multiply with
30 // using SSE / NEON and store them here.
31 alignas(16) static float bones[16 * 8];
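// Layout note (derived from the conversion loop in Compile() below): each bone occupies
// 16 floats (one aligned 4x4 matrix), rows stored as (m00 m01 m02 0), (m10 m11 m12 0),
// (m20 m21 m22 0), (tx ty tz 1), for up to 8 bones.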
32
33 using namespace Gen;
34
35 alignas(16) static const float by128[4] = {
36 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
37 };
38 alignas(16) static const float by32768[4] = {
39 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
40 };
41
42 alignas(16) static const float by128_11[4] = {
43 1.0f / 128.0f, 1.0f / 128.0f, 1.0f, 1.0f,
44 };
45 alignas(16) static const float by32768_11[4] = {
46 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f, 1.0f,
47 };
48
49 alignas(16) static const u32 threeMasks[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 };
50 alignas(16) static const u32 aOne[4] = {0, 0, 0, 0x3F800000};
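// threeMasks keeps only the x/y/z lanes of a row; aOne holds 1.0f (0x3F800000) in the w lane.
// Together they turn the PSP's 4x3 bone matrices into proper 4x4 matrices in Compile() below.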
51
52 alignas(16) static const float by16384[4] = {
53 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f,
54 };
55
56 #if PPSSPP_ARCH(AMD64)
57 #ifdef _WIN32
58 static const X64Reg tempReg1 = RAX;
59 static const X64Reg tempReg2 = R9;
60 static const X64Reg tempReg3 = R10;
61 static const X64Reg srcReg = RCX;
62 static const X64Reg dstReg = RDX;
63 static const X64Reg counterReg = R8;
64 #else
65 static const X64Reg tempReg1 = RAX;
66 static const X64Reg tempReg2 = R9;
67 static const X64Reg tempReg3 = R10;
68 static const X64Reg srcReg = RDI;
69 static const X64Reg dstReg = RSI;
70 static const X64Reg counterReg = RDX;
71 #endif
72 #else
73 static const X64Reg tempReg1 = EAX;
74 static const X64Reg tempReg2 = EBX;
75 static const X64Reg tempReg3 = EDX;
76 static const X64Reg srcReg = ESI;
77 static const X64Reg dstReg = EDI;
78 static const X64Reg counterReg = ECX;
79 #endif
80
81 // XMM0-XMM5 are volatile on Windows X64
82 // XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms)
83 static const X64Reg fpScaleOffsetReg = XMM0;
84
85 static const X64Reg fpScratchReg = XMM1;
86 static const X64Reg fpScratchReg2 = XMM2;
87 static const X64Reg fpScratchReg3 = XMM3;
88 static const X64Reg fpScratchReg4 = XMM4;
89
90 // We're gonna keep the current skinning matrix in 4 XMM regs. Fortunately we easily
91 // have space for that now.
92
93 // To debug, just comment them out one at a time until it works. We fall back
94 // on the interpreter if the compiler fails.
95
96 static const JitLookup jitLookup[] = {
97 {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
98 {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
99 {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
100 {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
101 {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
102 {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
103
104 {&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat},
105 {&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat},
106
107 {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
108 {&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
109 {&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
110
111 {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
112 {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
113 {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
114
115 {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
116 {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
117
118 {&VertexDecoder::Step_TcU8MorphToFloat, &VertexDecoderJitCache::Jit_TcU8MorphToFloat},
119 {&VertexDecoder::Step_TcU16MorphToFloat, &VertexDecoderJitCache::Jit_TcU16MorphToFloat},
120 {&VertexDecoder::Step_TcFloatMorph, &VertexDecoderJitCache::Jit_TcFloatMorph},
121 {&VertexDecoder::Step_TcU8PrescaleMorph, &VertexDecoderJitCache::Jit_TcU8PrescaleMorph},
122 {&VertexDecoder::Step_TcU16PrescaleMorph, &VertexDecoderJitCache::Jit_TcU16PrescaleMorph},
123 {&VertexDecoder::Step_TcFloatPrescaleMorph, &VertexDecoderJitCache::Jit_TcFloatPrescaleMorph},
124
125 {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
126 {&VertexDecoder::Step_NormalS8ToFloat, &VertexDecoderJitCache::Jit_NormalS8ToFloat},
127 {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
128 {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
129
130 {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
131 {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
132 {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
133
134 {&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
135 {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
136 {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
137 {&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},
138
139 {&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
140 {&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
141 {&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
142
143 {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
144 {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
145 {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
146
147 {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
148 {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
149 {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
150
151 {&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
152 {&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
153 {&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},
154
155 {&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
156 {&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
157 {&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},
158
159 {&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
160 {&VertexDecoder::Step_Color4444Morph, &VertexDecoderJitCache::Jit_Color4444Morph},
161 {&VertexDecoder::Step_Color565Morph, &VertexDecoderJitCache::Jit_Color565Morph},
162 {&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
163 };
164
165 JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
166 dec_ = &dec;
167 BeginWrite();
168 const u8 *start = this->AlignCode16();
169
170 #if PPSSPP_ARCH(X86)
171 // Store register values
172 PUSH(ESI);
173 PUSH(EDI);
174 PUSH(EBX);
175 PUSH(EBP);
176
177 // Read parameters
178 int offset = 4;
179 MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
180 MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
181 MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
182
183 const uint8_t STACK_FIXED_ALLOC = 64;
184 #else
185 // Parameters automatically fall into place.
186
187 // This will align the stack properly to 16 bytes (the call of this function pushed RIP, which is 8 bytes).
188 const uint8_t STACK_FIXED_ALLOC = 96 + 8;
189 #endif
190
191 // Allocate temporary storage on the stack.
192 SUB(PTRBITS, R(ESP), Imm8(STACK_FIXED_ALLOC));
193 // Save XMM registers. On Win64, XMM6+ (including XMM8/XMM9 below) are callee-saved,
194 // so they must be preserved. XMM4/XMM5 shouldn't strictly need saving, but we store
195 // them too, to be safe.
196 MOVUPS(MDisp(ESP, 0), XMM4);
197 MOVUPS(MDisp(ESP, 16), XMM5);
198 MOVUPS(MDisp(ESP, 32), XMM6);
199 MOVUPS(MDisp(ESP, 48), XMM7);
200 #if PPSSPP_ARCH(AMD64)
201 MOVUPS(MDisp(ESP, 64), XMM8);
202 MOVUPS(MDisp(ESP, 80), XMM9);
203 #endif
204
205 bool prescaleStep = false;
206 // Look for prescaled texcoord steps
207 for (int i = 0; i < dec.numSteps_; i++) {
208 if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
209 dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
210 dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
211 prescaleStep = true;
212 }
213 if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
214 dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
215 dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
216 prescaleStep = true;
217 }
218 }
219
220 // Add code to convert matrices to 4x4.
221 // Later we might want to do this when the matrices are loaded instead.
222 if (dec.weighttype && g_Config.bSoftwareSkinning) {
223 MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks));
224 MOVAPS(XMM4, MatR(tempReg1));
225 MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne));
226 MOVUPS(XMM5, MatR(tempReg1));
227 MOV(PTRBITS, R(tempReg1), ImmPtr(gstate.boneMatrix));
228 MOV(PTRBITS, R(tempReg2), ImmPtr(bones));
229 for (int i = 0; i < dec.nweights; i++) {
230 MOVUPS(XMM0, MDisp(tempReg1, (12 * i) * 4));
231 MOVUPS(XMM1, MDisp(tempReg1, (12 * i + 3) * 4));
232 MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
233 MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
234 ANDPS(XMM0, R(XMM4));
235 ANDPS(XMM1, R(XMM4));
236 ANDPS(XMM2, R(XMM4));
237 ANDPS(XMM3, R(XMM4));
238 ORPS(XMM3, R(XMM5));
239 MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM0);
240 MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM1);
241 MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM2);
242 MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM3);
243 }
244 }
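// For reference, a scalar sketch of the conversion emitted above (illustrative only, not
// executed here; gstate.boneMatrix stores each 4x3 matrix as 12 consecutive floats):
//
//   for (int i = 0; i < dec.nweights; i++) {
//     const float *src = gstate.boneMatrix + 12 * i;
//     float *dst = bones + 16 * i;
//     for (int row = 0; row < 4; row++) {
//       dst[row * 4 + 0] = src[row * 3 + 0];
//       dst[row * 4 + 1] = src[row * 3 + 1];
//       dst[row * 4 + 2] = src[row * 3 + 2];
//       dst[row * 4 + 3] = (row == 3) ? 1.0f : 0.0f;
//     }
//   }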
245
246 // Keep the scale/offset in a few fp registers if we need it.
247 if (prescaleStep) {
248 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.uv));
249 MOVUPS(fpScaleOffsetReg, MatR(tempReg1));
250 if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
251 MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
252 MULPS(fpScaleOffsetReg, MatR(tempReg2));
253 } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
254 MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768_11));
255 MULPS(fpScaleOffsetReg, MatR(tempReg2));
256 }
257 }
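// fpScaleOffsetReg now holds (uScale, vScale, uOff, vOff). For 8-bit and 16-bit texcoords
// the 1/128 or 1/32768 normalization is folded into the scale lanes here, so the prescale
// steps can convert the raw integers straight to float and multiply just once.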
258
259 // Let's not bother with a proper stack frame. We just grab the arguments and go.
260 JumpTarget loopStart = GetCodePtr();
261 for (int i = 0; i < dec.numSteps_; i++) {
262 if (!CompileStep(dec, i)) {
263 EndWrite();
264 // Reset the code ptr and return zero to indicate that we failed.
265 ResetCodePtr(GetOffset(start));
266 return 0;
267 }
268 }
269
270 ADD(PTRBITS, R(srcReg), Imm32(dec.VertexSize()));
271 ADD(PTRBITS, R(dstReg), Imm32(dec.decFmt.stride));
272 SUB(32, R(counterReg), Imm8(1));
273 J_CC(CC_NZ, loopStart, true);
274
275 MOVUPS(XMM4, MDisp(ESP, 0));
276 MOVUPS(XMM5, MDisp(ESP, 16));
277 MOVUPS(XMM6, MDisp(ESP, 32));
278 MOVUPS(XMM7, MDisp(ESP, 48));
279 #if PPSSPP_ARCH(AMD64)
280 MOVUPS(XMM8, MDisp(ESP, 64));
281 MOVUPS(XMM9, MDisp(ESP, 80));
282 #endif
283 ADD(PTRBITS, R(ESP), Imm8(STACK_FIXED_ALLOC));
284
285 #if PPSSPP_ARCH(X86)
286 // Restore register values
287 POP(EBP);
288 POP(EBX);
289 POP(EDI);
290 POP(ESI);
291 #endif
292
293 RET();
294
295 *jittedSize = GetCodePtr() - start;
296 EndWrite();
297 return (JittedVertexDecoder)start;
298 }
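// Caller-side usage, roughly (hypothetical sketch; names like `cache`, `srcPtr`, `dstPtr`
// and `vertexCount` are illustrative, not part of this file):
//
//   int32_t jittedSize = 0;
//   JittedVertexDecoder fn = cache.Compile(dec, &jittedSize);
//   if (fn)
//     fn(srcPtr, dstPtr, vertexCount);  // decode vertexCount vertices in one call
//   // otherwise the caller falls back to the C++ interpreter path in VertexDecoder.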
299
300 void VertexDecoderJitCache::Jit_WeightsU8() {
301 switch (dec_->nweights) {
302 case 1:
303 MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));
304 break;
305 case 2:
306 MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
307 break;
308 case 3:
309 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
310 AND(32, R(tempReg1), Imm32(0x00FFFFFF));
311 break;
312 case 4:
313 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
314 break;
315 case 5:
316 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
317 MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
318 break;
319 case 6:
320 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
321 MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
322 break;
323 case 7:
324 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
325 MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
326 AND(32, R(tempReg2), Imm32(0x00FFFFFF));
327 break;
328 case 8:
329 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
330 MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
331 break;
332 }
333
334 if (dec_->nweights <= 4) {
335 MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
336 } else {
337 MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
338 MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
339 }
340 }
341
342 void VertexDecoderJitCache::Jit_WeightsU16() {
343 switch (dec_->nweights) {
344 case 1:
345 MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
346 MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
347 MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
348 return;
349
350 case 2:
351 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
352 MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
353 MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
354 return;
355
356 case 3:
357 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
358 MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
359 MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
360 MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
361 return;
362
363 case 4:
364 // Anything above 4 will do 4 here, and then the rest after.
365 case 5:
366 case 6:
367 case 7:
368 case 8:
369 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
370 MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
371 MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
372 MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
373 break;
374 }
375
376 // Basic implementation - a short at a time. TODO: Optimize
377 int j;
378 for (j = 4; j < dec_->nweights; j++) {
379 MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
380 MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
381 }
382 while (j & 3) {
383 MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
384 j++;
385 }
386 }
387
388 void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
389 if (dec_->nweights >= 4) {
390 Jit_AnyU8ToFloat(dec_->weightoff, 32);
391 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
392 if (dec_->nweights > 4) {
393 Jit_AnyU8ToFloat(dec_->weightoff + 4, (dec_->nweights - 4) * 8);
394 MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
395 }
396 } else {
397 Jit_AnyU8ToFloat(dec_->weightoff, dec_->nweights * 8);
398 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
399 }
400 }
401
402 void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
403 if (dec_->nweights >= 4) {
404 Jit_AnyU16ToFloat(dec_->weightoff, 64);
405 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
406 if (dec_->nweights > 4) {
407 Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (dec_->nweights - 4) * 16);
408 MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
409 }
410 } else {
411 Jit_AnyU16ToFloat(dec_->weightoff, dec_->nweights * 16);
412 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
413 }
414 }
415
416 void VertexDecoderJitCache::Jit_WeightsFloat() {
417 int j;
418 switch (dec_->nweights) {
419 case 1:
420 // MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s.
421 MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
422 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
423 break;
424
425 case 2:
426 MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff));
427 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
428 break;
429
430 case 4:
431 MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
432 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
433 break;
434
435 case 5:
436 MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
437 MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
438 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
439 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
440 break;
441
442 case 6:
443 MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
444 MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16));
445 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
446 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
447 break;
448
449 case 8:
450 MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
451 MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
452 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
453 MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
454 break;
455
456 default:
457 for (j = 0; j < dec_->nweights; j++) {
458 MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
459 MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
460 }
461 while (j & 3) { // Zero additional weights rounding up to 4.
462 MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
463 j++;
464 }
465 break;
466 }
467 }
468
469 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
470 MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
471
472 #if PPSSPP_ARCH(AMD64)
473 if (dec_->nweights > 4) {
474 // This reads 8 bytes, we split the top 4 so we can expand each set of 4.
475 MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
476 PSHUFD(XMM9, R(XMM8), _MM_SHUFFLE(1, 1, 1, 1));
477 } else {
478 MOVD_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
479 }
480 if (cpu_info.bSSE4_1) {
481 PMOVZXBD(XMM8, R(XMM8));
482 } else {
483 PXOR(fpScratchReg, R(fpScratchReg));
484 PUNPCKLBW(XMM8, R(fpScratchReg));
485 PUNPCKLWD(XMM8, R(fpScratchReg));
486 }
487 if (dec_->nweights > 4) {
488 if (cpu_info.bSSE4_1) {
489 PMOVZXBD(XMM9, R(XMM9));
490 } else {
491 PUNPCKLBW(XMM9, R(fpScratchReg));
492 PUNPCKLWD(XMM9, R(fpScratchReg));
493 }
494 }
495 CVTDQ2PS(XMM8, R(XMM8));
496 if (dec_->nweights > 4)
497 CVTDQ2PS(XMM9, R(XMM9));
498
499 if (RipAccessible(&by128)) {
500 MULPS(XMM8, M(&by128)); // rip accessible
501 if (dec_->nweights > 4)
502 MULPS(XMM9, M(&by128)); // rip accessible
503 } else {
504 MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
505 MULPS(XMM8, MatR(tempReg1));
506 if (dec_->nweights > 4)
507 MULPS(XMM9, MatR(tempReg1));
508 }
509
510 auto weightToAllLanes = [this](X64Reg dst, int lane) {
511 X64Reg src = lane < 4 ? XMM8 : XMM9;
512 if (dst != INVALID_REG && dst != src) {
513 MOVAPS(dst, R(src));
514 } else {
515 // INVALID_REG means ruin the existing src (it's not needed any more.)
516 dst = src;
517 }
518 SHUFPS(dst, R(dst), _MM_SHUFFLE(lane % 4, lane % 4, lane % 4, lane % 4));
519 };
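// SHUFPS with _MM_SHUFFLE(n, n, n, n) on a register with itself broadcasts lane n to all
// four lanes, e.g. (a, b, c, d) -> (c, c, c, c) for lane 2.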
520 #endif
521
522 for (int j = 0; j < dec_->nweights; j++) {
523 X64Reg weight = XMM1;
524 #if PPSSPP_ARCH(AMD64)
525 X64Reg weightSrc = j < 4 ? XMM8 : XMM9;
526 if (j == 3 || j == dec_->nweights - 1) {
527 // In the previous iteration, we already spread this value to all lanes.
528 weight = weightSrc;
529 if (j == 0) {
530 // If there's only the one weight, no one shuffled it for us yet.
531 weightToAllLanes(weight, j);
532 }
533 // If we're on #3, prepare #4 if it's the last (and only for that reg, in fact.)
534 if (j == dec_->nweights - 2) {
535 weightToAllLanes(INVALID_REG, j + 1);
536 }
537 } else {
538 weightToAllLanes(weight, j);
539 // To improve latency, we shuffle in the last weight of the reg.
540 // If we're on slot #2, slot #3 will be the last. Otherwise, nweights - 1 is last.
541 if ((j == 2 && dec_->nweights > 3) || (j == dec_->nweights - 2)) {
542 // Prepare the last one now for better latency.
543 weightToAllLanes(INVALID_REG, j + 1);
544 }
545 }
546 #else
547 MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
548 CVTSI2SS(weight, R(tempReg1));
549 MULSS(weight, M(&by128)); // rip accessible (x86)
550 SHUFPS(weight, R(weight), _MM_SHUFFLE(0, 0, 0, 0));
551 #endif
552 if (j == 0) {
553 MOVAPS(XMM4, MDisp(tempReg2, 0));
554 MOVAPS(XMM5, MDisp(tempReg2, 16));
555 MOVAPS(XMM6, MDisp(tempReg2, 32));
556 MOVAPS(XMM7, MDisp(tempReg2, 48));
557 MULPS(XMM4, R(weight));
558 MULPS(XMM5, R(weight));
559 MULPS(XMM6, R(weight));
560 MULPS(XMM7, R(weight));
561 } else {
562 MOVAPS(XMM2, MDisp(tempReg2, 0));
563 MOVAPS(XMM3, MDisp(tempReg2, 16));
564 MULPS(XMM2, R(weight));
565 MULPS(XMM3, R(weight));
566 ADDPS(XMM4, R(XMM2));
567 ADDPS(XMM5, R(XMM3));
568 MOVAPS(XMM2, MDisp(tempReg2, 32));
569 MOVAPS(XMM3, MDisp(tempReg2, 48));
570 MULPS(XMM2, R(weight));
571 MULPS(XMM3, R(weight));
572 ADDPS(XMM6, R(XMM2));
573 ADDPS(XMM7, R(XMM3));
574 }
575 ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
576 }
577 }
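// In scalar terms the loop above accumulates (illustrative only):
//   skinMatrix = sum_j (weight[j] / 128.0f) * bones[j]
// with the running 4x4 result kept across XMM4-XMM7, one register per row.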
578
579 void VertexDecoderJitCache::Jit_WeightsU16Skin() {
580 MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
581
582 #if PPSSPP_ARCH(AMD64)
583 if (dec_->nweights > 6) {
584 // Since this is probably not aligned, two MOVQs are better than one MOVDQU.
585 MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
586 MOVQ_xmm(XMM9, MDisp(srcReg, dec_->weightoff + 8));
587 } else if (dec_->nweights > 4) {
588 // Since this is probably not aligned, a MOVQ plus a MOVD are better than one MOVDQU.
589 MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
590 MOVD_xmm(XMM9, MDisp(srcReg, dec_->weightoff + 8));
591 } else if (dec_->nweights > 2) {
592 MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
593 } else {
594 MOVD_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
595 }
596 if (cpu_info.bSSE4_1) {
597 PMOVZXWD(XMM8, R(XMM8));
598 } else {
599 PXOR(fpScratchReg, R(fpScratchReg));
600 PUNPCKLWD(XMM8, R(fpScratchReg));
601 }
602 if (dec_->nweights > 4) {
603 if (cpu_info.bSSE4_1) {
604 PMOVZXWD(XMM9, R(XMM9));
605 } else {
606 PUNPCKLWD(XMM9, R(fpScratchReg));
607 }
608 }
609 CVTDQ2PS(XMM8, R(XMM8));
610 if (dec_->nweights > 4)
611 CVTDQ2PS(XMM9, R(XMM9));
612
613 if (RipAccessible(&by32768)) {
614 MULPS(XMM8, M(&by32768)); // rip accessible
615 if (dec_->nweights > 4)
616 MULPS(XMM9, M(&by32768)); // rip accessible
617 } else {
618 MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
619 MULPS(XMM8, MatR(tempReg1));
620 if (dec_->nweights > 4)
621 MULPS(XMM9, MatR(tempReg1));
622 }
623
624 auto weightToAllLanes = [this](X64Reg dst, int lane) {
625 X64Reg src = lane < 4 ? XMM8 : XMM9;
626 if (dst != INVALID_REG && dst != src) {
627 MOVAPS(dst, R(src));
628 } else {
629 // INVALID_REG means ruin the existing src (it's not needed any more.)
630 dst = src;
631 }
632 SHUFPS(dst, R(dst), _MM_SHUFFLE(lane % 4, lane % 4, lane % 4, lane % 4));
633 };
634 #endif
635
636 for (int j = 0; j < dec_->nweights; j++) {
637 X64Reg weight = XMM1;
638 #if PPSSPP_ARCH(AMD64)
639 X64Reg weightSrc = j < 4 ? XMM8 : XMM9;
640 if (j == 3 || j == dec_->nweights - 1) {
641 // In the previous iteration, we already spread this value to all lanes.
642 weight = weightSrc;
643 if (j == 0) {
644 // If there's only the one weight, no one shuffled it for us yet.
645 weightToAllLanes(weight, j);
646 }
647 // If we're on #3, prepare #4 if it's the last (and only for that reg, in fact.)
648 if (j == dec_->nweights - 2) {
649 weightToAllLanes(INVALID_REG, j + 1);
650 }
651 } else {
652 weightToAllLanes(weight, j);
653 // To improve latency, we shuffle in the last weight of the reg.
654 // If we're on slot #2, slot #3 will be the last. Otherwise, nweights - 1 is last.
655 if ((j == 2 && dec_->nweights > 3) || (j == dec_->nweights - 2)) {
656 // Prepare the last one now for better latency.
657 weightToAllLanes(INVALID_REG, j + 1);
658 }
659 }
660 #else
661 MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
662 CVTSI2SS(weight, R(tempReg1));
663 MULSS(weight, M(&by32768)); // rip accessible (x86)
664 SHUFPS(weight, R(weight), _MM_SHUFFLE(0, 0, 0, 0));
665 #endif
666 if (j == 0) {
667 MOVAPS(XMM4, MDisp(tempReg2, 0));
668 MOVAPS(XMM5, MDisp(tempReg2, 16));
669 MOVAPS(XMM6, MDisp(tempReg2, 32));
670 MOVAPS(XMM7, MDisp(tempReg2, 48));
671 MULPS(XMM4, R(weight));
672 MULPS(XMM5, R(weight));
673 MULPS(XMM6, R(weight));
674 MULPS(XMM7, R(weight));
675 } else {
676 MOVAPS(XMM2, MDisp(tempReg2, 0));
677 MOVAPS(XMM3, MDisp(tempReg2, 16));
678 MULPS(XMM2, R(weight));
679 MULPS(XMM3, R(weight));
680 ADDPS(XMM4, R(XMM2));
681 ADDPS(XMM5, R(XMM3));
682 MOVAPS(XMM2, MDisp(tempReg2, 32));
683 MOVAPS(XMM3, MDisp(tempReg2, 48));
684 MULPS(XMM2, R(weight));
685 MULPS(XMM3, R(weight));
686 ADDPS(XMM6, R(XMM2));
687 ADDPS(XMM7, R(XMM3));
688 }
689 ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
690 }
691 }
692
693 void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
694 MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
695 for (int j = 0; j < dec_->nweights; j++) {
696 MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4));
697 SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
698 if (j == 0) {
699 MOVAPS(XMM4, MDisp(tempReg2, 0));
700 MOVAPS(XMM5, MDisp(tempReg2, 16));
701 MOVAPS(XMM6, MDisp(tempReg2, 32));
702 MOVAPS(XMM7, MDisp(tempReg2, 48));
703 MULPS(XMM4, R(XMM1));
704 MULPS(XMM5, R(XMM1));
705 MULPS(XMM6, R(XMM1));
706 MULPS(XMM7, R(XMM1));
707 } else {
708 MOVAPS(XMM2, MDisp(tempReg2, 0));
709 MOVAPS(XMM3, MDisp(tempReg2, 16));
710 MULPS(XMM2, R(XMM1));
711 MULPS(XMM3, R(XMM1));
712 ADDPS(XMM4, R(XMM2));
713 ADDPS(XMM5, R(XMM3));
714 MOVAPS(XMM2, MDisp(tempReg2, 32));
715 MOVAPS(XMM3, MDisp(tempReg2, 48));
716 MULPS(XMM2, R(XMM1));
717 MULPS(XMM3, R(XMM1));
718 ADDPS(XMM6, R(XMM2));
719 ADDPS(XMM7, R(XMM3));
720 }
721 ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
722 }
723 }
724
725 void VertexDecoderJitCache::Jit_TcU8ToFloat() {
726 Jit_AnyU8ToFloat(dec_->tcoff, 16);
727 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), XMM3);
728 }
729
730 void VertexDecoderJitCache::Jit_TcU16ToFloat() {
731 Jit_AnyU16ToFloat(dec_->tcoff, 32);
732 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), XMM3);
733 }
734
735 void VertexDecoderJitCache::Jit_TcFloat() {
736 #if PPSSPP_ARCH(AMD64)
737 MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
738 MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
739 #else
740 MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
741 MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
742 MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
743 MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
744 #endif
745 }
746
747 void VertexDecoderJitCache::Jit_TcU8Prescale() {
748 // TODO: The first five instructions could be done in 1 or 2 in SSE4
749 MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff));
750 MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1));
751 CVTSI2SS(fpScratchReg, R(tempReg1));
752 CVTSI2SS(fpScratchReg2, R(tempReg2));
753 UNPCKLPS(fpScratchReg, R(fpScratchReg2));
754 MULPS(fpScratchReg, R(fpScaleOffsetReg));
755 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
756 ADDPS(fpScratchReg, R(fpScaleOffsetReg));
757 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
758 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
759 }
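// Net effect (illustrative): u' = u * uScale + uOff and v' = v * vScale + vOff, with the
// 1/128 normalization already folded into the scale in Compile(). The two SHUFPS swap the
// (scale, offset) halves of fpScaleOffsetReg into place and then restore the original order.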
760
761 void VertexDecoderJitCache::Jit_TcU16Prescale() {
762 PXOR(fpScratchReg2, R(fpScratchReg2));
763 MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
764 PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
765 CVTDQ2PS(fpScratchReg, R(fpScratchReg));
766 MULPS(fpScratchReg, R(fpScaleOffsetReg));
767 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
768 ADDPS(fpScratchReg, R(fpScaleOffsetReg));
769 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
770 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
771 }
772
773 void VertexDecoderJitCache::Jit_TcFloatPrescale() {
774 MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
775 MULPS(fpScratchReg, R(fpScaleOffsetReg));
776 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
777 ADDPS(fpScratchReg, R(fpScaleOffsetReg));
778 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
779 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
780 }
781
782 void VertexDecoderJitCache::Jit_TcAnyMorph(int bits) {
783 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
784 if (!cpu_info.bSSE4_1) {
785 PXOR(fpScratchReg4, R(fpScratchReg4));
786 }
787
788 bool first = true;
789 for (int n = 0; n < dec_->morphcount; ++n) {
790 const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
791 const OpArg src = MDisp(srcReg, dec_->onesize_ * n + dec_->tcoff);
792
793 // Load the actual values and convert to float.
794 if (bits == 32) {
795 // Two floats: just load as a MOVQ.
796 MOVQ_xmm(reg, src);
797 } else {
798 if (bits == 8) {
799 MOVZX(32, 16, tempReg2, src);
800 MOVD_xmm(reg, R(tempReg2));
801 } else {
802 MOVD_xmm(reg, src);
803 }
804 if (cpu_info.bSSE4_1) {
805 if (bits == 8) {
806 PMOVZXBD(reg, R(reg));
807 } else {
808 PMOVZXWD(reg, R(reg));
809 }
810 } else {
811 if (bits == 8) {
812 PUNPCKLBW(reg, R(fpScratchReg4));
813 }
814 PUNPCKLWD(reg, R(fpScratchReg4));
815 }
816
817 CVTDQ2PS(reg, R(reg));
818 }
819
820 // And now scale by the weight.
821 MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float)));
822 SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
823 MULPS(reg, R(fpScratchReg3));
824
825 if (!first) {
826 ADDPS(fpScratchReg, R(fpScratchReg2));
827 } else {
828 first = false;
829 }
830 }
831 }
832
833 void VertexDecoderJitCache::Jit_TcU8MorphToFloat() {
834 Jit_TcAnyMorph(8);
835 // They were all added (weighted) pre-normalize, we normalize once here.
836 if (RipAccessible(&by128)) {
837 MULPS(fpScratchReg, M(&by128)); // rip accessible
838 } else {
839 MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
840 MULPS(fpScratchReg, MatR(tempReg1));
841 }
842 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
843 }
844
845 void VertexDecoderJitCache::Jit_TcU16MorphToFloat() {
846 Jit_TcAnyMorph(16);
847 // They were all added (weighted) pre-normalize, we normalize once here.
848 if (RipAccessible(&by32768)) {
849 MULPS(fpScratchReg, M(&by32768)); // rip accessible
850 } else {
851 MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
852 MULPS(fpScratchReg, MatR(tempReg1));
853 }
854 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
855 }
856
857 void VertexDecoderJitCache::Jit_TcFloatMorph() {
858 Jit_TcAnyMorph(32);
859 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
860 }
861
862 void VertexDecoderJitCache::Jit_TcU8PrescaleMorph() {
863 Jit_TcAnyMorph(8);
864 // The scale takes into account the u8 normalization.
865 MULPS(fpScratchReg, R(fpScaleOffsetReg));
866 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
867 ADDPS(fpScratchReg, R(fpScaleOffsetReg));
868 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
869 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
870 }
871
872 void VertexDecoderJitCache::Jit_TcU16PrescaleMorph() {
873 Jit_TcAnyMorph(16);
874 // The scale takes into account the u16 normalization.
875 MULPS(fpScratchReg, R(fpScaleOffsetReg));
876 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
877 ADDPS(fpScratchReg, R(fpScaleOffsetReg));
878 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
879 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
880 }
881
882 void VertexDecoderJitCache::Jit_TcFloatPrescaleMorph() {
883 Jit_TcAnyMorph(32);
884 MULPS(fpScratchReg, R(fpScaleOffsetReg));
885 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
886 ADDPS(fpScratchReg, R(fpScaleOffsetReg));
887 SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
888 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
889 }
890
891 void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
892 PXOR(fpScratchReg2, R(fpScratchReg2));
893 MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
894 MOVD_xmm(fpScratchReg, R(tempReg1));
895 PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
896 CVTDQ2PS(fpScratchReg, R(fpScratchReg));
897 MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
898
899 MOV(32, R(tempReg2), R(tempReg1));
900 SHR(32, R(tempReg2), Imm8(16));
901
902 MOV(PTRBITS, R(tempReg3), ImmPtr(&gstate_c.vertBounds));
903 auto updateSide = [&](X64Reg r, CCFlags skipCC, int offset) {
904 CMP(16, R(r), MDisp(tempReg3, offset));
905 FixupBranch skip = J_CC(skipCC);
906 MOV(16, MDisp(tempReg3, offset), R(r));
907 SetJumpTarget(skip);
908 };
909 // TODO: Can this actually be fast? Hmm, floats aren't better.
910 updateSide(tempReg1, CC_GE, offsetof(KnownVertexBounds, minU));
911 updateSide(tempReg1, CC_LE, offsetof(KnownVertexBounds, maxU));
912 updateSide(tempReg2, CC_GE, offsetof(KnownVertexBounds, minV));
913 updateSide(tempReg2, CC_LE, offsetof(KnownVertexBounds, maxV));
914 }
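// Scalar equivalent of the bounds update above (illustrative):
//   minU = std::min(minU, u);  maxU = std::max(maxU, u);
//   minV = std::min(minV, v);  maxV = std::max(maxV, v);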
915
916 void VertexDecoderJitCache::Jit_TcFloatThrough() {
917 #if PPSSPP_ARCH(AMD64)
918 MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
919 MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
920 #else
921 MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
922 MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
923 MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
924 MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
925 #endif
926 }
927
928 void VertexDecoderJitCache::Jit_Color8888() {
929 MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
930 MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
931
932 CMP(32, R(tempReg1), Imm32(0xFF000000));
933 FixupBranch skip = J_CC(CC_AE, false);
934 if (RipAccessible(&gstate_c.vertexFullAlpha)) {
935 MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible
936 } else {
937 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha));
938 MOV(8, MatR(tempReg1), Imm8(0));
939 }
940 SetJumpTarget(skip);
941 }
942
943 alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };
944
945 void VertexDecoderJitCache::Jit_Color4444() {
946 // This over-reads slightly, but we assume pos or another component follows anyway.
947 MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));
948 // Spread to RGBA -> R00GB00A.
949 PUNPCKLBW(fpScratchReg, R(fpScratchReg));
950 if (RipAccessible(&color4444mask[0])) {
951 PAND(fpScratchReg, M(&color4444mask[0])); // rip accessible
952 } else {
953 MOV(PTRBITS, R(tempReg1), ImmPtr(&color4444mask));
954 PAND(fpScratchReg, MatR(tempReg1));
955 }
956 MOVSS(fpScratchReg2, R(fpScratchReg));
957 MOVSS(fpScratchReg3, R(fpScratchReg));
958 // Create 0R000B00 and 00G000A0.
959 PSRLW(fpScratchReg2, 4);
960 PSLLW(fpScratchReg3, 4);
961 // Combine for the complete set: RRGGBBAA.
962 POR(fpScratchReg, R(fpScratchReg2));
963 POR(fpScratchReg, R(fpScratchReg3));
964 MOVD_xmm(R(tempReg1), fpScratchReg);
965 MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
966
967 CMP(32, R(tempReg1), Imm32(0xFF000000));
968 FixupBranch skip = J_CC(CC_AE, false);
969 if (RipAccessible(&gstate_c.vertexFullAlpha)) {
970 MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible
971 } else {
972 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha));
973 MOV(8, MatR(tempReg1), Imm8(0));
974 }
975 SetJumpTarget(skip);
976 }
977
978 void VertexDecoderJitCache::Jit_Color565() {
979 MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));
980
981 MOV(32, R(tempReg2), R(tempReg1));
982 AND(32, R(tempReg2), Imm32(0x0000001F));
983
984 // B (we do R and B at the same time, they're both 5.)
985 MOV(32, R(tempReg3), R(tempReg1));
986 AND(32, R(tempReg3), Imm32(0x0000F800));
987 SHL(32, R(tempReg3), Imm8(5));
988 OR(32, R(tempReg2), R(tempReg3));
989
990 // Expand 5 -> 8. At this point we have 00BB00RR.
991 MOV(32, R(tempReg3), R(tempReg2));
992 SHL(32, R(tempReg2), Imm8(3));
993 SHR(32, R(tempReg3), Imm8(2));
994 OR(32, R(tempReg2), R(tempReg3));
995 AND(32, R(tempReg2), Imm32(0x00FF00FF));
996
997 // Now's as good a time to put in A as any.
998 OR(32, R(tempReg2), Imm32(0xFF000000));
999
1000 // Last, we need to align, extract, and expand G.
1001 // 3 to align to G, and then 2 to expand to 8.
1002 SHL(32, R(tempReg1), Imm8(3 + 2));
1003 AND(32, R(tempReg1), Imm32(0x0000FC00));
1004 MOV(32, R(tempReg3), R(tempReg1));
1005 // 2 to account for tempReg1 being preshifted, 4 for expansion.
1006 SHR(32, R(tempReg3), Imm8(2 + 4));
1007 OR(32, R(tempReg1), R(tempReg3));
1008 AND(32, R(tempReg1), Imm32(0x0000FF00));
1009 OR(32, R(tempReg2), R(tempReg1));
1010
1011 MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
1012 // Never has alpha, no need to update fullAlphaArg.
1013 }
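// Scalar equivalent of the 565 -> 8888 expansion above (illustrative):
//   r8 = (r5 << 3) | (r5 >> 2);  g8 = (g6 << 2) | (g6 >> 4);  b8 = (b5 << 3) | (b5 >> 2);
//   color = 0xFF000000 | (b8 << 16) | (g8 << 8) | r8;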
1014
1015 void VertexDecoderJitCache::Jit_Color5551() {
1016 MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));
1017
1018 MOV(32, R(tempReg2), R(tempReg1));
1019 MOV(32, R(tempReg3), R(tempReg1));
1020 AND(32, R(tempReg2), Imm32(0x0000001F));
1021 AND(32, R(tempReg3), Imm32(0x000003E0));
1022 SHL(32, R(tempReg3), Imm8(3));
1023 OR(32, R(tempReg2), R(tempReg3));
1024
1025 MOV(32, R(tempReg3), R(tempReg1));
1026 AND(32, R(tempReg3), Imm32(0x00007C00));
1027 SHL(32, R(tempReg3), Imm8(6));
1028 OR(32, R(tempReg2), R(tempReg3));
1029
1030 // Expand 5 -> 8. After this is just A.
1031 MOV(32, R(tempReg3), R(tempReg2));
1032 SHL(32, R(tempReg2), Imm8(3));
1033 SHR(32, R(tempReg3), Imm8(2));
1034 // Chop off the bits that were shifted out.
1035 AND(32, R(tempReg3), Imm32(0x00070707));
1036 OR(32, R(tempReg2), R(tempReg3));
1037
1038 // For A, we shift it to a single bit, and then subtract and XOR.
1039 // That's probably the simplest way to expand it...
1040 SHR(32, R(tempReg1), Imm8(15));
1041 // If it was 0, it's now -1, otherwise it's 0. Easy.
1042 SUB(32, R(tempReg1), Imm8(1));
1043 XOR(32, R(tempReg1), Imm32(0xFF000000));
1044 AND(32, R(tempReg1), Imm32(0xFF000000));
1045 OR(32, R(tempReg2), R(tempReg1));
1046
1047 MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
1048
1049 CMP(32, R(tempReg2), Imm32(0xFF000000));
1050 FixupBranch skip = J_CC(CC_AE, false);
1051 if (RipAccessible(&gstate_c.vertexFullAlpha)) {
1052 MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible
1053 } else {
1054 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha));
1055 MOV(8, MatR(tempReg1), Imm8(0));
1056 }
1057 SetJumpTarget(skip);
1058 }
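// The alpha expansion above, in scalar form (illustrative): SHR leaves a1 = bit 15 of the
// input (0 or 1); SUB 1 gives 0 or -1; the XOR/AND with 0xFF000000 then yields 0xFF000000
// when a1 was 1 and 0x00000000 when a1 was 0.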
1059
1060 void VertexDecoderJitCache::Jit_Color8888Morph() {
1061 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1062 if (!cpu_info.bSSE4_1) {
1063 PXOR(fpScratchReg4, R(fpScratchReg4));
1064 }
1065
1066 bool first = true;
1067 for (int n = 0; n < dec_->morphcount; ++n) {
1068 const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1069 MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1070 if (cpu_info.bSSE4_1) {
1071 PMOVZXBD(reg, R(reg));
1072 } else {
1073 PUNPCKLBW(reg, R(fpScratchReg4));
1074 PUNPCKLWD(reg, R(fpScratchReg4));
1075 }
1076
1077 CVTDQ2PS(reg, R(reg));
1078
1079 // And now the weight.
1080 MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float)));
1081 SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1082 MULPS(reg, R(fpScratchReg3));
1083
1084 if (!first) {
1085 ADDPS(fpScratchReg, R(fpScratchReg2));
1086 } else {
1087 first = false;
1088 }
1089 }
1090
1091 Jit_WriteMorphColor(dec_->decFmt.c0off);
1092 }
1093
1094 alignas(16) static const float byColor4444[4] = { 255.0f / 15.0f, 255.0f / 15.0f, 255.0f / 15.0f, 255.0f / 15.0f, };
1095
1096 void VertexDecoderJitCache::Jit_Color4444Morph() {
1097 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1098 if (!cpu_info.bSSE4_1) {
1099 PXOR(fpScratchReg4, R(fpScratchReg4));
1100 }
1101 MOV(PTRBITS, R(tempReg2), ImmPtr(color4444mask));
1102 MOVDQA(XMM5, MatR(tempReg2));
1103 MOV(PTRBITS, R(tempReg2), ImmPtr(byColor4444));
1104 MOVAPS(XMM6, MatR(tempReg2));
1105
1106 bool first = true;
1107 for (int n = 0; n < dec_->morphcount; ++n) {
1108 const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1109 MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1110 PUNPCKLBW(reg, R(reg));
1111 PAND(reg, R(XMM5));
1112 MOVSS(fpScratchReg3, R(reg));
1113 PSLLW(fpScratchReg3, 4);
1114 POR(reg, R(fpScratchReg3));
1115 PSRLW(reg, 4);
1116
1117 if (cpu_info.bSSE4_1) {
1118 PMOVZXBD(reg, R(reg));
1119 } else {
1120 PUNPCKLBW(reg, R(fpScratchReg4));
1121 PUNPCKLWD(reg, R(fpScratchReg4));
1122 }
1123
1124 CVTDQ2PS(reg, R(reg));
1125 MULPS(reg, R(XMM6));
1126
1127 // And now the weight.
1128 MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float)));
1129 SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1130 MULPS(reg, R(fpScratchReg3));
1131
1132 if (!first) {
1133 ADDPS(fpScratchReg, R(fpScratchReg2));
1134 } else {
1135 first = false;
1136 }
1137 }
1138
1139 Jit_WriteMorphColor(dec_->decFmt.c0off);
1140 }
1141
1142 // The mask is intentionally in reverse order (but skips A.)
1143 alignas(16) static const u32 color565Mask[4] = { 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000, };
1144 alignas(16) static const float byColor565[4] = { 255.0f / 31.0f, 255.0f / 63.0f, 255.0f / 31.0f, 255.0f / 1.0f, };
1145
1146 void VertexDecoderJitCache::Jit_Color565Morph() {
1147 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1148 MOV(PTRBITS, R(tempReg2), ImmPtr(color565Mask));
1149 MOVDQA(XMM5, MatR(tempReg2));
1150 MOV(PTRBITS, R(tempReg2), ImmPtr(byColor565));
1151 MOVAPS(XMM6, MatR(tempReg2));
1152
1153 bool first = true;
1154 for (int n = 0; n < dec_->morphcount; ++n) {
1155 const X64Reg reg = first ? fpScratchReg : fpScratchReg3;
1156 MOVD_xmm(fpScratchReg2, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1157 // Spread it out into each lane. We end up with it reversed (R high, A low.)
1158 // Below, we shift out each lane from low to high and reverse them.
1159 PSHUFD(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1160 PAND(fpScratchReg2, R(XMM5));
1161
1162 // Alpha handled in Jit_WriteMorphColor.
1163
1164 // Blue first.
1165 MOVSS(reg, R(fpScratchReg2));
1166 PSRLD(reg, 6);
1167 PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 0, 0, 0));
1168
1169 // Green, let's shift it into the right lane first.
1170 PSRLDQ(fpScratchReg2, 4);
1171 MOVSS(reg, R(fpScratchReg2));
1172 PSRLD(reg, 5);
1173 PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 0));
1174
1175 // Last one, red.
1176 PSRLDQ(fpScratchReg2, 4);
1177 MOVSS(reg, R(fpScratchReg2));
1178
1179 CVTDQ2PS(reg, R(reg));
1180 MULPS(reg, R(XMM6));
1181
1182 // And now the weight.
1183 MOVSS(fpScratchReg2, MDisp(tempReg1, n * sizeof(float)));
1184 SHUFPS(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1185 MULPS(reg, R(fpScratchReg2));
1186
1187 if (!first) {
1188 ADDPS(fpScratchReg, R(fpScratchReg3));
1189 } else {
1190 first = false;
1191 }
1192 }
1193
1194 Jit_WriteMorphColor(dec_->decFmt.c0off, false);
1195 }
1196
1197 // The mask is intentionally in reverse order.
1198 alignas(16) static const u32 color5551Mask[4] = { 0x00008000, 0x00007c00, 0x000003e0, 0x0000001f, };
1199 alignas(16) static const float byColor5551[4] = { 255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 1.0f, };
1200
1201 void VertexDecoderJitCache::Jit_Color5551Morph() {
1202 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1203 MOV(PTRBITS, R(tempReg2), ImmPtr(color5551Mask));
1204 MOVDQA(XMM5, MatR(tempReg2));
1205 MOV(PTRBITS, R(tempReg2), ImmPtr(byColor5551));
1206 MOVAPS(XMM6, MatR(tempReg2));
1207
1208 bool first = true;
1209 for (int n = 0; n < dec_->morphcount; ++n) {
1210 const X64Reg reg = first ? fpScratchReg : fpScratchReg3;
1211 MOVD_xmm(fpScratchReg2, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1212 // Spread it out into each lane.
1213 PSHUFD(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1214 PAND(fpScratchReg2, R(XMM5));
1215
1216 // Alpha first.
1217 MOVSS(reg, R(fpScratchReg2));
1218 PSRLD(reg, 5);
1219 PSHUFD(reg, R(reg), _MM_SHUFFLE(0, 0, 0, 0));
1220
1221 // Blue, let's shift it into the right lane first.
1222 PSRLDQ(fpScratchReg2, 4);
1223 MOVSS(reg, R(fpScratchReg2));
1224 PSRLD(reg, 5);
1225 PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 0, 0, 0));
1226
1227 // Green.
1228 PSRLDQ(fpScratchReg2, 4);
1229 MOVSS(reg, R(fpScratchReg2));
1230 PSRLD(reg, 5);
1231 PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 0));
1232
1233 // Last one, red.
1234 PSRLDQ(fpScratchReg2, 4);
1235 MOVSS(reg, R(fpScratchReg2));
1236
1237 CVTDQ2PS(reg, R(reg));
1238 MULPS(reg, R(XMM6));
1239
1240 // And now the weight.
1241 MOVSS(fpScratchReg2, MDisp(tempReg1, n * sizeof(float)));
1242 SHUFPS(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1243 MULPS(reg, R(fpScratchReg2));
1244
1245 if (!first) {
1246 ADDPS(fpScratchReg, R(fpScratchReg3));
1247 } else {
1248 first = false;
1249 }
1250 }
1251
1252 Jit_WriteMorphColor(dec_->decFmt.c0off);
1253 }
1254
1255 void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
1256 // Pack back into a u32, with saturation.
1257 CVTPS2DQ(fpScratchReg, R(fpScratchReg));
1258 PACKSSDW(fpScratchReg, R(fpScratchReg));
1259 PACKUSWB(fpScratchReg, R(fpScratchReg));
1260 MOVD_xmm(R(tempReg1), fpScratchReg);
1261
1262 // TODO: May be a faster way to do this without the MOVD.
1263 if (checkAlpha) {
1264 CMP(32, R(tempReg1), Imm32(0xFF000000));
1265 FixupBranch skip = J_CC(CC_AE, false);
1266 if (RipAccessible(&gstate_c.vertexFullAlpha)) {
1267 MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible
1268 } else {
1269 MOV(PTRBITS, R(tempReg2), ImmPtr(&gstate_c.vertexFullAlpha));
1270 MOV(8, MatR(tempReg2), Imm8(0));
1271 }
1272 SetJumpTarget(skip);
1273 } else {
1274 // Force alpha to full if we're not checking it.
1275 OR(32, R(tempReg1), Imm32(0xFF000000));
1276 }
1277
1278 MOV(32, MDisp(dstReg, outOff), R(tempReg1));
1279 }
1280
1281 // Copy 3 bytes and then a zero. Might as well copy four.
1282 void VertexDecoderJitCache::Jit_NormalS8() {
1283 MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1284 AND(32, R(tempReg1), Imm32(0x00FFFFFF));
1285 MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1286 }
1287
1288 void VertexDecoderJitCache::Jit_NormalS8ToFloat() {
1289 Jit_AnyS8ToFloat(dec_->nrmoff);
1290 MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM3);
1291 }
1292
1293 // Copy 6 bytes and then 2 zeroes.
1294 void VertexDecoderJitCache::Jit_NormalS16() {
1295 MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1296 MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->nrmoff + 4));
1297 MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1298 MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
1299 }
1300
1301 void VertexDecoderJitCache::Jit_NormalFloat() {
1302 if (cpu_info.Mode64bit) {
1303 MOV(64, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1304 MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8));
1305 MOV(64, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1306 MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
1307 } else {
1308 MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1309 MOV(32, R(tempReg2), MDisp(srcReg, dec_->nrmoff + 4));
1310 MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8));
1311 MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1312 MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
1313 MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
1314 }
1315 }
1316
1317 // This could be a bit shorter with AVX 3-operand instructions and FMA.
1318 void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
1319 MOVAPS(XMM1, R(XMM3));
1320 MOVAPS(XMM2, R(XMM3));
1321 SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
1322 SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
1323 SHUFPS(XMM3, R(XMM3), _MM_SHUFFLE(2, 2, 2, 2));
1324 MULPS(XMM1, R(XMM4));
1325 MULPS(XMM2, R(XMM5));
1326 MULPS(XMM3, R(XMM6));
1327 ADDPS(XMM1, R(XMM2));
1328 ADDPS(XMM1, R(XMM3));
1329 if (pos) {
1330 ADDPS(XMM1, R(XMM7));
1331 }
1332 MOVUPS(MDisp(dstReg, outOff), XMM1);
1333 }
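// What this computes (illustrative): out = x * row0 + y * row1 + z * row2 (+ row3 if pos),
// where XMM4-XMM7 hold the four rows of the blended skin matrix and XMM3 arrives as (x, y, z, _).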
1334
1335 void VertexDecoderJitCache::Jit_NormalS8Skin() {
1336 Jit_AnyS8ToFloat(dec_->nrmoff);
1337 Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1338 }
1339
1340 void VertexDecoderJitCache::Jit_NormalS16Skin() {
1341 Jit_AnyS16ToFloat(dec_->nrmoff);
1342 Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1343 }
1344
1345 void VertexDecoderJitCache::Jit_NormalFloatSkin() {
1346 MOVUPS(XMM3, MDisp(srcReg, dec_->nrmoff));
1347 Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1348 }
1349
1350 // Through expands into floats, always. Might want to look at changing this.
1351 void VertexDecoderJitCache::Jit_PosS8Through() {
1352 DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
1353 // SIMD doesn't really matter since this isn't useful on hardware.
1354 for (int i = 0; i < 3; i++) {
1355 MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
1356 CVTSI2SS(fpScratchReg, R(tempReg1));
1357 MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg);
1358 }
1359 }
1360
1361 // Through expands into floats, always. Might want to look at changing this.
1362 void VertexDecoderJitCache::Jit_PosS16Through() {
1363 if (cpu_info.bSSE4_1) {
1364 MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->posoff));
1365 MOVZX(32, 16, tempReg3, MDisp(srcReg, dec_->posoff + 4));
1366 MOVD_xmm(fpScratchReg2, R(tempReg3));
1367 PMOVSXWD(fpScratchReg, R(fpScratchReg));
1368 PUNPCKLQDQ(fpScratchReg, R(fpScratchReg2));
1369 CVTDQ2PS(fpScratchReg, R(fpScratchReg));
1370 MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), fpScratchReg);
1371 } else {
1372 MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff));
1373 MOVSX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 2));
1374 MOVZX(32, 16, tempReg3, MDisp(srcReg, dec_->posoff + 4)); // NOTE: MOVZX
1375 CVTSI2SS(fpScratchReg, R(tempReg1));
1376 MOVSS(MDisp(dstReg, dec_->decFmt.posoff), fpScratchReg);
1377 CVTSI2SS(fpScratchReg, R(tempReg2));
1378 MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 4), fpScratchReg);
1379 CVTSI2SS(fpScratchReg, R(tempReg3));
1380 MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 8), fpScratchReg);
1381 }
1382 }
1383
1384 void VertexDecoderJitCache::Jit_PosS8() {
1385 Jit_AnyS8ToFloat(dec_->posoff);
1386 MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
1387 }
1388
1389 void VertexDecoderJitCache::Jit_PosS16() {
1390 Jit_AnyS16ToFloat(dec_->posoff);
1391 MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
1392 }
1393
1394 // Just copy 12 bytes.
1395 void VertexDecoderJitCache::Jit_PosFloat() {
1396 if (cpu_info.Mode64bit) {
1397 MOV(64, R(tempReg1), MDisp(srcReg, dec_->posoff));
1398 MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8));
1399 MOV(64, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
1400 MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
1401 } else {
1402 MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
1403 MOV(32, R(tempReg2), MDisp(srcReg, dec_->posoff + 4));
1404 MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8));
1405 MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
1406 MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
1407 MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
1408 }
1409 }
1410
1411 void VertexDecoderJitCache::Jit_PosS8Skin() {
1412 Jit_AnyS8ToFloat(dec_->posoff);
1413 Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1414 }
1415
1416 void VertexDecoderJitCache::Jit_PosS16Skin() {
1417 Jit_AnyS16ToFloat(dec_->posoff);
1418 Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1419 }
1420
1421 void VertexDecoderJitCache::Jit_PosFloatSkin() {
1422 MOVUPS(XMM3, MDisp(srcReg, dec_->posoff));
1423 Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1424 }
1425
1426 void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
1427 if (!cpu_info.bSSE4_1) {
1428 PXOR(XMM3, R(XMM3));
1429 }
1430 MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1431 if (cpu_info.bSSE4_1) {
1432 PMOVSXBD(XMM1, R(XMM1));
1433 } else {
1434 PUNPCKLBW(XMM1, R(XMM3));
1435 PUNPCKLWD(XMM1, R(XMM3));
1436 PSLLD(XMM1, 24);
1437 PSRAD(XMM1, 24);
1438 }
1439 CVTDQ2PS(XMM3, R(XMM1));
1440 if (RipAccessible(&by128)) {
1441 MULPS(XMM3, M(&by128)); // rip accessible
1442 } else {
1443 MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
1444 MULPS(XMM3, MatR(tempReg1));
1445 }
1446 }
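// Scalar equivalent (illustrative): XMM3 ends up with out[i] = (float)(s8)src[srcoff + i] / 128.0f
// for the four byte lanes loaded by the MOVD above.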
1447
1448 void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
1449 if (!cpu_info.bSSE4_1) {
1450 PXOR(XMM3, R(XMM3));
1451 }
1452 MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
1453 if (cpu_info.bSSE4_1) {
1454 PMOVSXWD(XMM1, R(XMM1));
1455 } else {
1456 PUNPCKLWD(XMM1, R(XMM3));
1457 PSLLD(XMM1, 16);
1458 PSRAD(XMM1, 16);
1459 }
1460 CVTDQ2PS(XMM3, R(XMM1));
1461 if (RipAccessible(&by32768)) {
1462 MULPS(XMM3, M(&by32768)); // rip accessible
1463 } else {
1464 MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
1465 MULPS(XMM3, MatR(tempReg1));
1466 }
1467 }
1468
1469 void VertexDecoderJitCache::Jit_AnyU8ToFloat(int srcoff, u32 bits) {
1470 _dbg_assert_msg_((bits & ~(32 | 16 | 8)) == 0, "Bits must be a multiple of 8.");
1471 _dbg_assert_msg_(bits >= 8 && bits <= 32, "Bits must be between 8 and 32.");
1472
1473 if (!cpu_info.bSSE4_1) {
1474 PXOR(XMM3, R(XMM3));
1475 }
1476 if (bits == 32) {
1477 MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1478 } else if (bits == 24) {
1479 MOV(32, R(tempReg1), MDisp(srcReg, srcoff));
1480 AND(32, R(tempReg1), Imm32(0x00FFFFFF));
1481 MOVD_xmm(XMM1, R(tempReg1));
1482 } else {
1483 MOVZX(32, bits, tempReg1, MDisp(srcReg, srcoff));
1484 MOVD_xmm(XMM1, R(tempReg1));
1485 }
1486 if (cpu_info.bSSE4_1) {
1487 PMOVZXBD(XMM1, R(XMM1));
1488 } else {
1489 PUNPCKLBW(XMM1, R(XMM3));
1490 PUNPCKLWD(XMM1, R(XMM3));
1491 }
1492 CVTDQ2PS(XMM3, R(XMM1));
1493 if (RipAccessible(&by128)) {
1494 MULPS(XMM3, M(&by128)); // rip accessible
1495 } else {
1496 MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
1497 MULPS(XMM3, MatR(tempReg1));
1498 }
1499 }
1500
1501 void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff, u32 bits) {
1502 _dbg_assert_msg_((bits & ~(64 | 32 | 16)) == 0, "Bits must be a multiple of 16.");
1503 _dbg_assert_msg_(bits >= 16 && bits <= 64, "Bits must be between 16 and 64.");
1504
1505 if (!cpu_info.bSSE4_1) {
1506 PXOR(XMM3, R(XMM3));
1507 }
1508 if (bits == 64) {
1509 MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
1510 } else if (bits == 48) {
1511 MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1512 PINSRW(XMM1, MDisp(srcReg, srcoff + 4), 2);
1513 } else if (bits == 32) {
1514 MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1515 } else if (bits == 16) {
1516 MOVZX(32, bits, tempReg1, MDisp(srcReg, srcoff));
1517 MOVD_xmm(XMM1, R(tempReg1));
1518 }
1519 if (cpu_info.bSSE4_1) {
1520 PMOVZXWD(XMM1, R(XMM1));
1521 } else {
1522 PUNPCKLWD(XMM1, R(XMM3));
1523 }
1524 CVTDQ2PS(XMM3, R(XMM1));
1525 if (RipAccessible(&by32768)) {
1526 MULPS(XMM3, M(&by32768)); // rip accessible
1527 } else {
1528 MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
1529 MULPS(XMM3, MatR(tempReg1));
1530 }
1531 }
1532
1533 void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
1534 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1535 if (!cpu_info.bSSE4_1) {
1536 PXOR(fpScratchReg4, R(fpScratchReg4));
1537 }
1538 if (RipAccessible(&by128)) {
1539 MOVAPS(XMM5, M(&by128)); // rip accessible
1540 } else {
1541 MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
1542 MOVAPS(XMM5, MatR(tempReg1));
1543 }
1544
1545 // Sum into fpScratchReg.
1546 bool first = true;
1547 for (int n = 0; n < dec_->morphcount; ++n) {
1548 const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1549 // Okay, first convert to floats.
1550 MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
1551 if (cpu_info.bSSE4_1) {
1552 PMOVSXBD(reg, R(reg));
1553 } else {
1554 PUNPCKLBW(reg, R(fpScratchReg4));
1555 PUNPCKLWD(reg, R(fpScratchReg4));
1556 PSLLD(reg, 24);
1557 PSRAD(reg, 24);
1558 }
1559 CVTDQ2PS(reg, R(reg));
1560
1561 // Now it's time to multiply by the weight and 1.0f/128.0f.
1562 MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
1563 MULSS(fpScratchReg3, R(XMM5));
1564 SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1565
1566 MULPS(reg, R(fpScratchReg3));
1567 if (!first) {
1568 ADDPS(fpScratchReg, R(fpScratchReg2));
1569 } else {
1570 first = false;
1571 }
1572 }
1573
1574 MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
1575 }
1576
1577 void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
1578 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1579 if (!cpu_info.bSSE4_1) {
1580 PXOR(fpScratchReg4, R(fpScratchReg4));
1581 }
1582 if (RipAccessible(&by32768)) {
1583 MOVAPS(XMM5, M(&by32768)); // rip accessible
1584 } else {
1585 MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
1586 MOVAPS(XMM5, MatR(tempReg1));
1587 }
1588
1589 // Sum into fpScratchReg.
1590 bool first = true;
1591 for (int n = 0; n < dec_->morphcount; ++n) {
1592 const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1593 // Okay, first convert to floats.
1594 MOVQ_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
1595 if (cpu_info.bSSE4_1) {
1596 PMOVSXWD(reg, R(reg));
1597 } else {
1598 PUNPCKLWD(reg, R(fpScratchReg4));
1599 PSLLD(reg, 16);
1600 PSRAD(reg, 16);
1601 }
1602 CVTDQ2PS(reg, R(reg));
1603
1604 // Now it's time to multiply by the weight and 1.0f/32768.0f.
1605 MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
1606 MULSS(fpScratchReg3, R(XMM5));
1607 SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1608
1609 MULPS(reg, R(fpScratchReg3));
1610 if (!first) {
1611 ADDPS(fpScratchReg, R(fpScratchReg2));
1612 } else {
1613 first = false;
1614 }
1615 }
1616
1617 MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
1618 }
1619
1620 void VertexDecoderJitCache::Jit_AnyFloatMorph(int srcoff, int dstoff) {
1621 MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1622
1623 // Sum into fpScratchReg.
1624 bool first = true;
1625 for (int n = 0; n < dec_->morphcount; ++n) {
1626 const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1627 MOVUPS(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
1628 MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
1629 SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1630 MULPS(reg, R(fpScratchReg3));
1631 if (!first) {
1632 ADDPS(fpScratchReg, R(fpScratchReg2));
1633 } else {
1634 first = false;
1635 }
1636 }
1637
1638 MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
1639 }
1640
1641 void VertexDecoderJitCache::Jit_PosS8Morph() {
1642 Jit_AnyS8Morph(dec_->posoff, dec_->decFmt.posoff);
1643 }
1644
1645 void VertexDecoderJitCache::Jit_PosS16Morph() {
1646 Jit_AnyS16Morph(dec_->posoff, dec_->decFmt.posoff);
1647 }
1648
1649 void VertexDecoderJitCache::Jit_PosFloatMorph() {
1650 Jit_AnyFloatMorph(dec_->posoff, dec_->decFmt.posoff);
1651 }
1652
1653 void VertexDecoderJitCache::Jit_NormalS8Morph() {
1654 Jit_AnyS8Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
1655 }
1656
1657 void VertexDecoderJitCache::Jit_NormalS16Morph() {
1658 Jit_AnyS16Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
1659 }
1660
1661 void VertexDecoderJitCache::Jit_NormalFloatMorph() {
1662 Jit_AnyFloatMorph(dec_->nrmoff, dec_->decFmt.nrmoff);
1663 }
1664
1665 bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
1666 // See if we find a matching JIT function
1667 for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
1668 if (dec.steps_[step] == jitLookup[i].func) {
1669 ((*this).*jitLookup[i].jitFunc)();
1670 return true;
1671 }
1672 }
1673 return false;
1674 }
1675
1676 #endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
1677