1 // Copyright (c) 2013- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include "ppsspp_config.h"
19 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
20 
21 #include <emmintrin.h>
22 
23 #include "Common/CPUDetect.h"
24 #include "Core/Config.h"
25 #include "Core/Reporting.h"
26 #include "GPU/GPUState.h"
27 #include "GPU/Common/VertexDecoderCommon.h"
28 
29 // We start out by converting the active matrices into 4x4 which are easier to multiply with
30 // using SSE / NEON and store them here.
31 alignas(16) static float bones[16 * 8];
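// Layout of bones[]: up to 8 matrices of 16 floats each. Each PSP 4x3 bone matrix is
// expanded to a 4x4 with 0 in the w lane of the first three rows and 1.0 in the w lane
// of the translation row (see the conversion loop in Compile()).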
32 
33 using namespace Gen;
34 
35 alignas(16) static const float by128[4] = {
36 	1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
37 };
38 alignas(16) static const float by32768[4] = {
39 	1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
40 };
41 
42 alignas(16) static const float by128_11[4] = {
43 	1.0f / 128.0f, 1.0f / 128.0f, 1.0f, 1.0f,
44 };
45 alignas(16) static const float by32768_11[4] = {
46 	1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f, 1.0f,
47 };
48 
49 alignas(16) static const u32 threeMasks[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 };
50 alignas(16) static const u32 aOne[4] = {0, 0, 0, 0x3F800000};
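// 0x3F800000 is the IEEE-754 encoding of 1.0f; it gets OR'd into the w lane of the translation row.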
51 
52 alignas(16) static const float by16384[4] = {
53 	1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f,
54 };
55 
56 #if PPSSPP_ARCH(AMD64)
57 #ifdef _WIN32
58 static const X64Reg tempReg1 = RAX;
59 static const X64Reg tempReg2 = R9;
60 static const X64Reg tempReg3 = R10;
61 static const X64Reg srcReg = RCX;
62 static const X64Reg dstReg = RDX;
63 static const X64Reg counterReg = R8;
64 #else
65 static const X64Reg tempReg1 = RAX;
66 static const X64Reg tempReg2 = R9;
67 static const X64Reg tempReg3 = R10;
68 static const X64Reg srcReg = RDI;
69 static const X64Reg dstReg = RSI;
70 static const X64Reg counterReg = RDX;
71 #endif
72 #else
73 static const X64Reg tempReg1 = EAX;
74 static const X64Reg tempReg2 = EBX;
75 static const X64Reg tempReg3 = EDX;
76 static const X64Reg srcReg = ESI;
77 static const X64Reg dstReg = EDI;
78 static const X64Reg counterReg = ECX;
79 #endif
80 
81 // XMM0-XMM5 are volatile on Windows X64
82 // XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms)
83 static const X64Reg fpScaleOffsetReg = XMM0;
84 
85 static const X64Reg fpScratchReg = XMM1;
86 static const X64Reg fpScratchReg2 = XMM2;
87 static const X64Reg fpScratchReg3 = XMM3;
88 static const X64Reg fpScratchReg4 = XMM4;
89 
90 // We're gonna keep the current skinning matrix in 4 XMM regs. Fortunately we easily
91 // have space for that now.
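// Concretely: XMM4-XMM7 hold the four rows of the blended skin matrix while decoding a vertex,
// and on x64 the expanded weights live in XMM8/XMM9.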
92 
93 // To debug, just comment them out one at a time until it works. We fall back
94 // on the interpreter if the compiler fails.
95 
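// Maps each interpreter step function to the JIT emitter that replaces it.
// CompileStep() below scans this table; any step without an entry makes Compile() bail out
// so the decoder falls back to the interpreter.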
96 static const JitLookup jitLookup[] = {
97 	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
98 	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
99 	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
100 	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
101 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
102 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
103 
104 	{&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat},
105 	{&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat},
106 
107 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
108 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
109 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
110 
111 	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
112 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
113 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
114 
115 	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
116 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
117 
118 	{&VertexDecoder::Step_TcU8MorphToFloat, &VertexDecoderJitCache::Jit_TcU8MorphToFloat},
119 	{&VertexDecoder::Step_TcU16MorphToFloat, &VertexDecoderJitCache::Jit_TcU16MorphToFloat},
120 	{&VertexDecoder::Step_TcFloatMorph, &VertexDecoderJitCache::Jit_TcFloatMorph},
121 	{&VertexDecoder::Step_TcU8PrescaleMorph, &VertexDecoderJitCache::Jit_TcU8PrescaleMorph},
122 	{&VertexDecoder::Step_TcU16PrescaleMorph, &VertexDecoderJitCache::Jit_TcU16PrescaleMorph},
123 	{&VertexDecoder::Step_TcFloatPrescaleMorph, &VertexDecoderJitCache::Jit_TcFloatPrescaleMorph},
124 
125 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
126 	{&VertexDecoder::Step_NormalS8ToFloat, &VertexDecoderJitCache::Jit_NormalS8ToFloat},
127 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
128 	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
129 
130 	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
131 	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
132 	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
133 
134 	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
135 	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
136 	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
137 	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},
138 
139 	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
140 	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
141 	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
142 
143 	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
144 	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
145 	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
146 
147 	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
148 	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
149 	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
150 
151 	{&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
152 	{&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
153 	{&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},
154 
155 	{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
156 	{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
157 	{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},
158 
159 	{&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
160 	{&VertexDecoder::Step_Color4444Morph, &VertexDecoderJitCache::Jit_Color4444Morph},
161 	{&VertexDecoder::Step_Color565Morph, &VertexDecoderJitCache::Jit_Color565Morph},
162 	{&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
163 };
164 
165 JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
166 	dec_ = &dec;
167 	BeginWrite();
168 	const u8 *start = this->AlignCode16();
169 
170 #if PPSSPP_ARCH(X86)
171 	// Store register values
172 	PUSH(ESI);
173 	PUSH(EDI);
174 	PUSH(EBX);
175 	PUSH(EBP);
176 
177 	// Read parameters
178 	int offset = 4;
179 	MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
180 	MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
181 	MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
182 
183 	const uint8_t STACK_FIXED_ALLOC = 64;
184 #else
185 	// Parameters automatically fall into place.
186 
187 	// This will align the stack properly to 16 bytes (the call of this function pushed RIP, which is 8 bytes).
188 	const uint8_t STACK_FIXED_ALLOC = 96 + 8;
189 #endif
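	// STACK_FIXED_ALLOC covers the XMM spill area below: 4 registers (64 bytes) on x86,
	// 6 registers (96 bytes) plus 8 bytes of re-alignment on x64.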
190 
191 	// Allocate temporary storage on the stack.
192 	SUB(PTRBITS, R(ESP), Imm8(STACK_FIXED_ALLOC));
193 	// Save XMM4/XMM5 as well, even though they're volatile and shouldn't need it -
194 	// there have been reports of problems, possibly a compiler bug, so we play it safe.
195 	// XMM6+ are callee-saved on Win64 and must be preserved regardless.
196 	MOVUPS(MDisp(ESP, 0), XMM4);
197 	MOVUPS(MDisp(ESP, 16), XMM5);
198 	MOVUPS(MDisp(ESP, 32), XMM6);
199 	MOVUPS(MDisp(ESP, 48), XMM7);
200 #if PPSSPP_ARCH(AMD64)
201 	MOVUPS(MDisp(ESP, 64), XMM8);
202 	MOVUPS(MDisp(ESP, 80), XMM9);
203 #endif
204 
205 	bool prescaleStep = false;
206 	// Look for prescaled texcoord steps
207 	for (int i = 0; i < dec.numSteps_; i++) {
208 		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
209 			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
210 			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
211 			prescaleStep = true;
212 		}
213 		if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
214 			dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
215 			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
216 			prescaleStep = true;
217 		}
218 	}
219 
220 	// Add code to convert matrices to 4x4.
221 	// Later we might want to do this when the matrices are loaded instead.
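	// Each 4x3 matrix is read as four overlapping 3-float rows, masked down to xyz with
	// threeMasks (clearing the stray w lane), and aOne ORs 1.0 into the w of the translation row.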
222 	if (dec.weighttype && g_Config.bSoftwareSkinning) {
223 		MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks));
224 		MOVAPS(XMM4, MatR(tempReg1));
225 		MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne));
226 		MOVUPS(XMM5, MatR(tempReg1));
227 		MOV(PTRBITS, R(tempReg1), ImmPtr(gstate.boneMatrix));
228 		MOV(PTRBITS, R(tempReg2), ImmPtr(bones));
229 		for (int i = 0; i < dec.nweights; i++) {
230 			MOVUPS(XMM0, MDisp(tempReg1, (12 * i) * 4));
231 			MOVUPS(XMM1, MDisp(tempReg1, (12 * i + 3) * 4));
232 			MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
233 			MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
234 			ANDPS(XMM0, R(XMM4));
235 			ANDPS(XMM1, R(XMM4));
236 			ANDPS(XMM2, R(XMM4));
237 			ANDPS(XMM3, R(XMM4));
238 			ORPS(XMM3, R(XMM5));
239 			MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM0);
240 			MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM1);
241 			MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM2);
242 			MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM3);
243 		}
244 	}
245 
246 	// Keep the scale/offset in a few fp registers if we need it.
247 	if (prescaleStep) {
248 		MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.uv));
249 		MOVUPS(fpScaleOffsetReg, MatR(tempReg1));
250 		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
251 			MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
252 			MULPS(fpScaleOffsetReg, MatR(tempReg2));
253 		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
254 			MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768_11));
255 			MULPS(fpScaleOffsetReg, MatR(tempReg2));
256 		}
257 	}
258 
259 	// Let's not bother with a proper stack frame. We just grab the arguments and go.
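	// Decode loop: one iteration per vertex, with every enabled step emitted inline.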
260 	JumpTarget loopStart = GetCodePtr();
261 	for (int i = 0; i < dec.numSteps_; i++) {
262 		if (!CompileStep(dec, i)) {
263 			EndWrite();
264 			// Reset the code ptr and return zero to indicate that we failed.
265 			ResetCodePtr(GetOffset(start));
266 			return 0;
267 		}
268 	}
269 
270 	ADD(PTRBITS, R(srcReg), Imm32(dec.VertexSize()));
271 	ADD(PTRBITS, R(dstReg), Imm32(dec.decFmt.stride));
272 	SUB(32, R(counterReg), Imm8(1));
273 	J_CC(CC_NZ, loopStart, true);
274 
275 	MOVUPS(XMM4, MDisp(ESP, 0));
276 	MOVUPS(XMM5, MDisp(ESP, 16));
277 	MOVUPS(XMM6, MDisp(ESP, 32));
278 	MOVUPS(XMM7, MDisp(ESP, 48));
279 #if PPSSPP_ARCH(AMD64)
280 	MOVUPS(XMM8, MDisp(ESP, 64));
281 	MOVUPS(XMM9, MDisp(ESP, 80));
282 #endif
283 	ADD(PTRBITS, R(ESP), Imm8(STACK_FIXED_ALLOC));
284 
285 #if PPSSPP_ARCH(X86)
286 	// Restore register values
287 	POP(EBP);
288 	POP(EBX);
289 	POP(EDI);
290 	POP(ESI);
291 #endif
292 
293 	RET();
294 
295 	*jittedSize = GetCodePtr() - start;
296 	EndWrite();
297 	return (JittedVertexDecoder)start;
298 }
299 
300 void VertexDecoderJitCache::Jit_WeightsU8() {
301 	switch (dec_->nweights) {
302 	case 1:
303 		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));
304 		break;
305 	case 2:
306 		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
307 		break;
308 	case 3:
309 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
310 		AND(32, R(tempReg1), Imm32(0x00FFFFFF));
311 		break;
312 	case 4:
313 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
314 		break;
315 	case 5:
316 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
317 		MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
318 		break;
319 	case 6:
320 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
321 		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
322 		break;
323 	case 7:
324 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
325 		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
326 		AND(32, R(tempReg2), Imm32(0x00FFFFFF));
327 		break;
328 	case 8:
329 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
330 		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
331 		break;
332 	}
333 
334 	if (dec_->nweights <= 4) {
335 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
336 	} else {
337 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
338 		MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
339 	}
340 }
341 
342 void VertexDecoderJitCache::Jit_WeightsU16() {
343 	switch (dec_->nweights) {
344 	case 1:
345 		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
346 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
347 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
348 		return;
349 
350 	case 2:
351 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
352 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
353 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
354 		return;
355 
356 	case 3:
357 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
358 		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
359 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
360 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
361 		return;
362 
363 	case 4:
364 		// Anything above 4 will do 4 here, and then the rest after.
365 	case 5:
366 	case 6:
367 	case 7:
368 	case 8:
369 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
370 		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
371 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
372 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
373 		break;
374 	}
375 
376 	// Basic implementation - a short at a time. TODO: Optimize
377 	int j;
378 	for (j = 4; j < dec_->nweights; j++) {
379 		MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
380 		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
381 	}
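	// Zero out any remaining weights, rounding the count up to a multiple of 4.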
382 	while (j & 3) {
383 		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
384 		j++;
385 	}
386 }
387 
388 void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
389 	if (dec_->nweights >= 4) {
390 		Jit_AnyU8ToFloat(dec_->weightoff, 32);
391 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
392 		if (dec_->nweights > 4) {
393 			Jit_AnyU8ToFloat(dec_->weightoff + 4, (dec_->nweights - 4) * 8);
394 			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
395 		}
396 	} else {
397 		Jit_AnyU8ToFloat(dec_->weightoff, dec_->nweights * 8);
398 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
399 	}
400 }
401 
402 void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
403 	if (dec_->nweights >= 4) {
404 		Jit_AnyU16ToFloat(dec_->weightoff, 64);
405 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
406 		if (dec_->nweights > 4) {
407 			Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (dec_->nweights - 4) * 16);
408 			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
409 		}
410 	} else {
411 		Jit_AnyU16ToFloat(dec_->weightoff, dec_->nweights * 16);
412 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
413 	}
414 }
415 
416 void VertexDecoderJitCache::Jit_WeightsFloat() {
417 	int j;
418 	switch (dec_->nweights) {
419 	case 1:
420 		// MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s.
421 		MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
422 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
423 		break;
424 
425 	case 2:
426 		MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff));
427 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
428 		break;
429 
430 	case 4:
431 		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
432 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
433 		break;
434 
435 	case 5:
436 		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
437 		MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
438 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
439 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
440 		break;
441 
442 	case 6:
443 		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
444 		MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16));
445 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
446 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
447 		break;
448 
449 	case 8:
450 		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
451 		MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
452 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
453 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
454 		break;
455 
456 	default:
457 		for (j = 0; j < dec_->nweights; j++) {
458 			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
459 			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
460 		}
461 		while (j & 3) {  // Zero additional weights rounding up to 4.
462 			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
463 			j++;
464 		}
465 		break;
466 	}
467 }
468 
469 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
470 	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
471 
472 #if PPSSPP_ARCH(AMD64)
473 	if (dec_->nweights > 4) {
474 		// This reads 8 bytes, we split the top 4 so we can expand each set of 4.
475 		MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
476 		PSHUFD(XMM9, R(XMM8), _MM_SHUFFLE(1, 1, 1, 1));
477 	} else {
478 		MOVD_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
479 	}
480 	if (cpu_info.bSSE4_1) {
481 		PMOVZXBD(XMM8, R(XMM8));
482 	} else {
483 		PXOR(fpScratchReg, R(fpScratchReg));
484 		PUNPCKLBW(XMM8, R(fpScratchReg));
485 		PUNPCKLWD(XMM8, R(fpScratchReg));
486 	}
487 	if (dec_->nweights > 4) {
488 		if (cpu_info.bSSE4_1) {
489 			PMOVZXBD(XMM9, R(XMM9));
490 		} else {
491 			PUNPCKLBW(XMM9, R(fpScratchReg));
492 			PUNPCKLWD(XMM9, R(fpScratchReg));
493 		}
494 	}
495 	CVTDQ2PS(XMM8, R(XMM8));
496 	if (dec_->nweights > 4)
497 		CVTDQ2PS(XMM9, R(XMM9));
498 
499 	if (RipAccessible(&by128)) {
500 		MULPS(XMM8, M(&by128));  // rip accessible
501 		if (dec_->nweights > 4)
502 			MULPS(XMM9, M(&by128));  // rip accessible
503 	} else {
504 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
505 		MULPS(XMM8, MatR(tempReg1));
506 		if (dec_->nweights > 4)
507 			MULPS(XMM9, MatR(tempReg1));
508 	}
509 
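	// Broadcasts weight #lane (from XMM8 or XMM9) into all four lanes of dst;
	// passing INVALID_REG shuffles the source register in place instead.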
510 	auto weightToAllLanes = [this](X64Reg dst, int lane) {
511 		X64Reg src = lane < 4 ? XMM8 : XMM9;
512 		if (dst != INVALID_REG && dst != src) {
513 			MOVAPS(dst, R(src));
514 		} else {
515 			// INVALID_REG means ruin the existing src (it's not needed any more.)
516 			dst = src;
517 		}
518 		SHUFPS(dst, R(dst), _MM_SHUFFLE(lane % 4, lane % 4, lane % 4, lane % 4));
519 	};
520 #endif
521 
522 	for (int j = 0; j < dec_->nweights; j++) {
523 		X64Reg weight = XMM1;
524 #if PPSSPP_ARCH(AMD64)
525 		X64Reg weightSrc = j < 4 ? XMM8 : XMM9;
526 		if (j == 3 || j == dec_->nweights - 1) {
527 			// In the previous iteration, we already spread this value to all lanes.
528 			weight = weightSrc;
529 			if (j == 0) {
530 				// If there's only the one weight, no one shuffled it for us yet.
531 				weightToAllLanes(weight, j);
532 			}
533 			// If we're on #3, prepare #4 if it's the last (and only for that reg, in fact.)
534 			if (j == dec_->nweights - 2) {
535 				weightToAllLanes(INVALID_REG, j + 1);
536 			}
537 		} else {
538 			weightToAllLanes(weight, j);
539 			// To improve latency, we shuffle in the last weight of the reg.
540 			// If we're on slot #2, slot #3 will be the last.  Otherwise, nweights - 1 is last.
541 			if ((j == 2 && dec_->nweights > 3) || (j == dec_->nweights - 2)) {
542 				// Prepare the last one now for better latency.
543 				weightToAllLanes(INVALID_REG, j + 1);
544 			}
545 		}
546 #else
547 		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
548 		CVTSI2SS(weight, R(tempReg1));
549 		MULSS(weight, M(&by128));  // rip accessible (x86)
550 		SHUFPS(weight, R(weight), _MM_SHUFFLE(0, 0, 0, 0));
551 #endif
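		// Accumulate weight * bone matrix, row by row, into XMM4-XMM7.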
552 		if (j == 0) {
553 			MOVAPS(XMM4, MDisp(tempReg2, 0));
554 			MOVAPS(XMM5, MDisp(tempReg2, 16));
555 			MOVAPS(XMM6, MDisp(tempReg2, 32));
556 			MOVAPS(XMM7, MDisp(tempReg2, 48));
557 			MULPS(XMM4, R(weight));
558 			MULPS(XMM5, R(weight));
559 			MULPS(XMM6, R(weight));
560 			MULPS(XMM7, R(weight));
561 		} else {
562 			MOVAPS(XMM2, MDisp(tempReg2, 0));
563 			MOVAPS(XMM3, MDisp(tempReg2, 16));
564 			MULPS(XMM2, R(weight));
565 			MULPS(XMM3, R(weight));
566 			ADDPS(XMM4, R(XMM2));
567 			ADDPS(XMM5, R(XMM3));
568 			MOVAPS(XMM2, MDisp(tempReg2, 32));
569 			MOVAPS(XMM3, MDisp(tempReg2, 48));
570 			MULPS(XMM2, R(weight));
571 			MULPS(XMM3, R(weight));
572 			ADDPS(XMM6, R(XMM2));
573 			ADDPS(XMM7, R(XMM3));
574 		}
575 		ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
576 	}
577 }
578 
579 void VertexDecoderJitCache::Jit_WeightsU16Skin() {
580 	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
581 
582 #if PPSSPP_ARCH(AMD64)
583 	if (dec_->nweights > 6) {
584 		// Since this is probably not aligned, two MOVQs are better than one MOVDQU.
585 		MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
586 		MOVQ_xmm(XMM9, MDisp(srcReg, dec_->weightoff + 8));
587 	} else if (dec_->nweights > 4) {
588 		// Since this is probably not aligned, a MOVQ and a MOVD are better than one MOVDQU.
589 		MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
590 		MOVD_xmm(XMM9, MDisp(srcReg, dec_->weightoff + 8));
591 	} else if (dec_->nweights > 2) {
592 		MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
593 	} else {
594 		MOVD_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
595 	}
596 	if (cpu_info.bSSE4_1) {
597 		PMOVZXWD(XMM8, R(XMM8));
598 	} else {
599 		PXOR(fpScratchReg, R(fpScratchReg));
600 		PUNPCKLWD(XMM8, R(fpScratchReg));
601 	}
602 	if (dec_->nweights > 4) {
603 		if (cpu_info.bSSE4_1) {
604 			PMOVZXWD(XMM9, R(XMM9));
605 		} else {
606 			PUNPCKLWD(XMM9, R(fpScratchReg));
607 		}
608 	}
609 	CVTDQ2PS(XMM8, R(XMM8));
610 	if (dec_->nweights > 4)
611 		CVTDQ2PS(XMM9, R(XMM9));
612 
613 	if (RipAccessible(&by32768)) {
614 		MULPS(XMM8, M(&by32768));  // rip accessible
615 		if (dec_->nweights > 4)
616 			MULPS(XMM9, M(&by32768));  // rip accessible
617 	} else {
618 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
619 		MULPS(XMM8, MatR(tempReg1));
620 		if (dec_->nweights > 4)
621 			MULPS(XMM9, MatR(tempReg1));
622 	}
623 
624 	auto weightToAllLanes = [this](X64Reg dst, int lane) {
625 		X64Reg src = lane < 4 ? XMM8 : XMM9;
626 		if (dst != INVALID_REG && dst != src) {
627 			MOVAPS(dst, R(src));
628 		} else {
629 			// INVALID_REG means ruin the existing src (it's not needed any more.)
630 			dst = src;
631 		}
632 		SHUFPS(dst, R(dst), _MM_SHUFFLE(lane % 4, lane % 4, lane % 4, lane % 4));
633 	};
634 #endif
635 
636 	for (int j = 0; j < dec_->nweights; j++) {
637 		X64Reg weight = XMM1;
638 #if PPSSPP_ARCH(AMD64)
639 		X64Reg weightSrc = j < 4 ? XMM8 : XMM9;
640 		if (j == 3 || j == dec_->nweights - 1) {
641 			// In the previous iteration, we already spread this value to all lanes.
642 			weight = weightSrc;
643 			if (j == 0) {
644 				// If there's only the one weight, no one shuffled it for us yet.
645 				weightToAllLanes(weight, j);
646 			}
647 			// If we're on #3, prepare #4 if it's the last (and only for that reg, in fact.)
648 			if (j == dec_->nweights - 2) {
649 				weightToAllLanes(INVALID_REG, j + 1);
650 			}
651 		} else {
652 			weightToAllLanes(weight, j);
653 			// To improve latency, we shuffle in the last weight of the reg.
654 			// If we're on slot #2, slot #3 will be the last.  Otherwise, nweights - 1 is last.
655 			if ((j == 2 && dec_->nweights > 3) || (j == dec_->nweights - 2)) {
656 				// Prepare the last one now for better latency.
657 				weightToAllLanes(INVALID_REG, j + 1);
658 			}
659 		}
660 #else
661 		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
662 		CVTSI2SS(weight, R(tempReg1));
663 		MULSS(weight, M(&by32768));  // rip accessible (x86)
664 		SHUFPS(weight, R(weight), _MM_SHUFFLE(0, 0, 0, 0));
665 #endif
666 		if (j == 0) {
667 			MOVAPS(XMM4, MDisp(tempReg2, 0));
668 			MOVAPS(XMM5, MDisp(tempReg2, 16));
669 			MOVAPS(XMM6, MDisp(tempReg2, 32));
670 			MOVAPS(XMM7, MDisp(tempReg2, 48));
671 			MULPS(XMM4, R(weight));
672 			MULPS(XMM5, R(weight));
673 			MULPS(XMM6, R(weight));
674 			MULPS(XMM7, R(weight));
675 		} else {
676 			MOVAPS(XMM2, MDisp(tempReg2, 0));
677 			MOVAPS(XMM3, MDisp(tempReg2, 16));
678 			MULPS(XMM2, R(weight));
679 			MULPS(XMM3, R(weight));
680 			ADDPS(XMM4, R(XMM2));
681 			ADDPS(XMM5, R(XMM3));
682 			MOVAPS(XMM2, MDisp(tempReg2, 32));
683 			MOVAPS(XMM3, MDisp(tempReg2, 48));
684 			MULPS(XMM2, R(weight));
685 			MULPS(XMM3, R(weight));
686 			ADDPS(XMM6, R(XMM2));
687 			ADDPS(XMM7, R(XMM3));
688 		}
689 		ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
690 	}
691 }
692 
693 void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
694 	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
695 	for (int j = 0; j < dec_->nweights; j++) {
696 		MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4));
697 		SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
698 		if (j == 0) {
699 			MOVAPS(XMM4, MDisp(tempReg2, 0));
700 			MOVAPS(XMM5, MDisp(tempReg2, 16));
701 			MOVAPS(XMM6, MDisp(tempReg2, 32));
702 			MOVAPS(XMM7, MDisp(tempReg2, 48));
703 			MULPS(XMM4, R(XMM1));
704 			MULPS(XMM5, R(XMM1));
705 			MULPS(XMM6, R(XMM1));
706 			MULPS(XMM7, R(XMM1));
707 		} else {
708 			MOVAPS(XMM2, MDisp(tempReg2, 0));
709 			MOVAPS(XMM3, MDisp(tempReg2, 16));
710 			MULPS(XMM2, R(XMM1));
711 			MULPS(XMM3, R(XMM1));
712 			ADDPS(XMM4, R(XMM2));
713 			ADDPS(XMM5, R(XMM3));
714 			MOVAPS(XMM2, MDisp(tempReg2, 32));
715 			MOVAPS(XMM3, MDisp(tempReg2, 48));
716 			MULPS(XMM2, R(XMM1));
717 			MULPS(XMM3, R(XMM1));
718 			ADDPS(XMM6, R(XMM2));
719 			ADDPS(XMM7, R(XMM3));
720 		}
721 		ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
722 	}
723 }
724 
725 void VertexDecoderJitCache::Jit_TcU8ToFloat() {
726 	Jit_AnyU8ToFloat(dec_->tcoff, 16);
727 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), XMM3);
728 }
729 
730 void VertexDecoderJitCache::Jit_TcU16ToFloat() {
731 	Jit_AnyU16ToFloat(dec_->tcoff, 32);
732 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), XMM3);
733 }
734 
735 void VertexDecoderJitCache::Jit_TcFloat() {
736 #if PPSSPP_ARCH(AMD64)
737 	MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
738 	MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
739 #else
740 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
741 	MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
742 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
743 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
744 #endif
745 }
746 
747 void VertexDecoderJitCache::Jit_TcU8Prescale() {
748 	// TODO: The first five instructions could be done in 1 or 2 in SSE4
749 	MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff));
750 	MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1));
751 	CVTSI2SS(fpScratchReg, R(tempReg1));
752 	CVTSI2SS(fpScratchReg2, R(tempReg2));
753 	UNPCKLPS(fpScratchReg, R(fpScratchReg2));
754 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
755 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
756 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
757 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
758 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
759 }
760 
761 void VertexDecoderJitCache::Jit_TcU16Prescale() {
762 	PXOR(fpScratchReg2, R(fpScratchReg2));
763 	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
764 	PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
765 	CVTDQ2PS(fpScratchReg, R(fpScratchReg));
766 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
767 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
768 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
769 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
770 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
771 }
772 
773 void VertexDecoderJitCache::Jit_TcFloatPrescale() {
774 	MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
775 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
776 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
777 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
778 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
779 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
780 }
781 
782 void VertexDecoderJitCache::Jit_TcAnyMorph(int bits) {
783 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
784 	if (!cpu_info.bSSE4_1) {
785 		PXOR(fpScratchReg4, R(fpScratchReg4));
786 	}
787 
788 	bool first = true;
789 	for (int n = 0; n < dec_->morphcount; ++n) {
790 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
791 		const OpArg src = MDisp(srcReg, dec_->onesize_ * n + dec_->tcoff);
792 
793 		// Load the actual values and convert to float.
794 		if (bits == 32) {
795 			// Two floats: just load as a MOVQ.
796 			MOVQ_xmm(reg, src);
797 		} else {
798 			if (bits == 8) {
799 				MOVZX(32, 16, tempReg2, src);
800 				MOVD_xmm(reg, R(tempReg2));
801 			} else {
802 				MOVD_xmm(reg, src);
803 			}
804 			if (cpu_info.bSSE4_1) {
805 				if (bits == 8) {
806 					PMOVZXBD(reg, R(reg));
807 				} else {
808 					PMOVZXWD(reg, R(reg));
809 				}
810 			} else {
811 				if (bits == 8) {
812 					PUNPCKLBW(reg, R(fpScratchReg4));
813 				}
814 				PUNPCKLWD(reg, R(fpScratchReg4));
815 			}
816 
817 			CVTDQ2PS(reg, R(reg));
818 		}
819 
820 		// And now scale by the weight.
821 		MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float)));
822 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
823 		MULPS(reg, R(fpScratchReg3));
824 
825 		if (!first) {
826 			ADDPS(fpScratchReg, R(fpScratchReg2));
827 		} else {
828 			first = false;
829 		}
830 	}
831 }
832 
833 void VertexDecoderJitCache::Jit_TcU8MorphToFloat() {
834 	Jit_TcAnyMorph(8);
835 	// They were all added (weighted) pre-normalize, we normalize once here.
836 	if (RipAccessible(&by128)) {
837 		MULPS(fpScratchReg, M(&by128));  // rip accessible
838 	} else {
839 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
840 		MULPS(fpScratchReg, MatR(tempReg1));
841 	}
842 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
843 }
844 
845 void VertexDecoderJitCache::Jit_TcU16MorphToFloat() {
846 	Jit_TcAnyMorph(16);
847 	// They were all added (weighted) pre-normalize, we normalize once here.
848 	if (RipAccessible(&by32768)) {
849 		MULPS(fpScratchReg, M(&by32768));  // rip accessible
850 	} else {
851 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
852 		MULPS(fpScratchReg, MatR(tempReg1));
853 	}
854 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
855 }
856 
857 void VertexDecoderJitCache::Jit_TcFloatMorph() {
858 	Jit_TcAnyMorph(32);
859 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
860 }
861 
862 void VertexDecoderJitCache::Jit_TcU8PrescaleMorph() {
863 	Jit_TcAnyMorph(8);
864 	// The scale takes into account the u8 normalization.
865 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
866 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
867 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
868 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
869 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
870 }
871 
872 void VertexDecoderJitCache::Jit_TcU16PrescaleMorph() {
873 	Jit_TcAnyMorph(16);
874 	// The scale takes into account the u16 normalization.
875 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
876 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
877 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
878 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
879 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
880 }
881 
882 void VertexDecoderJitCache::Jit_TcFloatPrescaleMorph() {
883 	Jit_TcAnyMorph(32);
884 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
885 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
886 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
887 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
888 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
889 }
890 
891 void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
892 	PXOR(fpScratchReg2, R(fpScratchReg2));
893 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
894 	MOVD_xmm(fpScratchReg, R(tempReg1));
895 	PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
896 	CVTDQ2PS(fpScratchReg, R(fpScratchReg));
897 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
898 
899 	MOV(32, R(tempReg2), R(tempReg1));
900 	SHR(32, R(tempReg2), Imm8(16));
901 
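	// Track the min/max through-mode texcoords seen so far in gstate_c.vertBounds;
	// updateSide() below does a compare-and-conditionally-store for each side.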
902 	MOV(PTRBITS, R(tempReg3), ImmPtr(&gstate_c.vertBounds));
903 	auto updateSide = [&](X64Reg r, CCFlags skipCC, int offset) {
904 		CMP(16, R(r), MDisp(tempReg3, offset));
905 		FixupBranch skip = J_CC(skipCC);
906 		MOV(16, MDisp(tempReg3, offset), R(r));
907 		SetJumpTarget(skip);
908 	};
909 	// TODO: Can this actually be fast?  Hmm, floats aren't better.
910 	updateSide(tempReg1, CC_GE, offsetof(KnownVertexBounds, minU));
911 	updateSide(tempReg1, CC_LE, offsetof(KnownVertexBounds, maxU));
912 	updateSide(tempReg2, CC_GE, offsetof(KnownVertexBounds, minV));
913 	updateSide(tempReg2, CC_LE, offsetof(KnownVertexBounds, maxV));
914 }
915 
916 void VertexDecoderJitCache::Jit_TcFloatThrough() {
917 #if PPSSPP_ARCH(AMD64)
918 	MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
919 	MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
920 #else
921 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
922 	MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
923 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
924 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
925 #endif
926 }
927 
928 void VertexDecoderJitCache::Jit_Color8888() {
929 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
930 	MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
931 
932 	CMP(32, R(tempReg1), Imm32(0xFF000000));
933 	FixupBranch skip = J_CC(CC_AE, false);
934 	if (RipAccessible(&gstate_c.vertexFullAlpha)) {
935 		MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0));  // rip accessible
936 	} else {
937 		MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha));
938 		MOV(8, MatR(tempReg1), Imm8(0));
939 	}
940 	SetJumpTarget(skip);
941 }
942 
943 alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };
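// After PUNPCKLBW doubles each byte, this mask keeps R and B in the low nibble of one
// copy and G and A in the high nibble of the other; the shifts below expand each to 8 bits.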
944 
945 void VertexDecoderJitCache::Jit_Color4444() {
946 	// This over-reads slightly, but we assume pos or another component follows anyway.
947 	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));
948 	// Spread to RGBA -> R00GB00A.
949 	PUNPCKLBW(fpScratchReg, R(fpScratchReg));
950 	if (RipAccessible(&color4444mask[0])) {
951 		PAND(fpScratchReg, M(&color4444mask[0]));  // rip accessible
952 	} else {
953 		MOV(PTRBITS, R(tempReg1), ImmPtr(&color4444mask));
954 		PAND(fpScratchReg, MatR(tempReg1));
955 	}
956 	MOVSS(fpScratchReg2, R(fpScratchReg));
957 	MOVSS(fpScratchReg3, R(fpScratchReg));
958 	// Create 0R000B00 and 00G000A0.
959 	PSRLW(fpScratchReg2, 4);
960 	PSLLW(fpScratchReg3, 4);
961 	// Combine for the complete set: RRGGBBAA.
962 	POR(fpScratchReg, R(fpScratchReg2));
963 	POR(fpScratchReg, R(fpScratchReg3));
964 	MOVD_xmm(R(tempReg1), fpScratchReg);
965 	MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
966 
967 	CMP(32, R(tempReg1), Imm32(0xFF000000));
968 	FixupBranch skip = J_CC(CC_AE, false);
969 	if (RipAccessible(&gstate_c.vertexFullAlpha)) {
970 		MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0));  // rip accessible
971 	} else {
972 		MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha));
973 		MOV(8, MatR(tempReg1), Imm8(0));
974 	}
975 	SetJumpTarget(skip);
976 }
977 
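// Scalar sketch of what this emits (helper names are illustrative only):
//   c = u16 at coloff; r = c & 0x1F; g = (c >> 5) & 0x3F; b = c >> 11;
//   out = 0xFF000000 | (Exp5(b) << 16) | (Exp6(g) << 8) | Exp5(r);
//   where Exp5(x) = (x << 3) | (x >> 2) and Exp6(x) = (x << 2) | (x >> 4).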
978 void VertexDecoderJitCache::Jit_Color565() {
979 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));
980 
981 	MOV(32, R(tempReg2), R(tempReg1));
982 	AND(32, R(tempReg2), Imm32(0x0000001F));
983 
984 	// B (we do R and B at the same time, they're both 5.)
985 	MOV(32, R(tempReg3), R(tempReg1));
986 	AND(32, R(tempReg3), Imm32(0x0000F800));
987 	SHL(32, R(tempReg3), Imm8(5));
988 	OR(32, R(tempReg2), R(tempReg3));
989 
990 	// Expand 5 -> 8.  At this point we have 00BB00RR.
991 	MOV(32, R(tempReg3), R(tempReg2));
992 	SHL(32, R(tempReg2), Imm8(3));
993 	SHR(32, R(tempReg3), Imm8(2));
994 	OR(32, R(tempReg2), R(tempReg3));
995 	AND(32, R(tempReg2), Imm32(0x00FF00FF));
996 
997 	// Now's as good a time to put in A as any.
998 	OR(32, R(tempReg2), Imm32(0xFF000000));
999 
1000 	// Last, we need to align, extract, and expand G.
1001 	// 3 to align to G, and then 2 to expand to 8.
1002 	SHL(32, R(tempReg1), Imm8(3 + 2));
1003 	AND(32, R(tempReg1), Imm32(0x0000FC00));
1004 	MOV(32, R(tempReg3), R(tempReg1));
1005 	// 2 to account for tempReg1 being preshifted, 4 for expansion.
1006 	SHR(32, R(tempReg3), Imm8(2 + 4));
1007 	OR(32, R(tempReg1), R(tempReg3));
1008 	AND(32, R(tempReg1), Imm32(0x0000FF00));
1009 	OR(32, R(tempReg2), R(tempReg1));
1010 
1011 	MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
1012 	// Never has alpha, no need to update fullAlphaArg.
1013 }
1014 
1015 void VertexDecoderJitCache::Jit_Color5551() {
1016 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));
1017 
1018 	MOV(32, R(tempReg2), R(tempReg1));
1019 	MOV(32, R(tempReg3), R(tempReg1));
1020 	AND(32, R(tempReg2), Imm32(0x0000001F));
1021 	AND(32, R(tempReg3), Imm32(0x000003E0));
1022 	SHL(32, R(tempReg3), Imm8(3));
1023 	OR(32, R(tempReg2), R(tempReg3));
1024 
1025 	MOV(32, R(tempReg3), R(tempReg1));
1026 	AND(32, R(tempReg3), Imm32(0x00007C00));
1027 	SHL(32, R(tempReg3), Imm8(6));
1028 	OR(32, R(tempReg2), R(tempReg3));
1029 
1030 	// Expand 5 -> 8.  After this is just A.
1031 	MOV(32, R(tempReg3), R(tempReg2));
1032 	SHL(32, R(tempReg2), Imm8(3));
1033 	SHR(32, R(tempReg3), Imm8(2));
1034 	// Chop off the bits that were shifted out.
1035 	AND(32, R(tempReg3), Imm32(0x00070707));
1036 	OR(32, R(tempReg2), R(tempReg3));
1037 
1038 	// For A, we shift it to a single bit, and then subtract and XOR.
1039 	// That's probably the simplest way to expand it...
1040 	SHR(32, R(tempReg1), Imm8(15));
1041 	// If it was 0, it's now -1, otherwise it's 0.  Easy.
1042 	SUB(32, R(tempReg1), Imm8(1));
1043 	XOR(32, R(tempReg1), Imm32(0xFF000000));
1044 	AND(32, R(tempReg1), Imm32(0xFF000000));
1045 	OR(32, R(tempReg2), R(tempReg1));
1046 
1047 	MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
1048 
1049 	CMP(32, R(tempReg2), Imm32(0xFF000000));
1050 	FixupBranch skip = J_CC(CC_AE, false);
1051 	if (RipAccessible(&gstate_c.vertexFullAlpha)) {
1052 		MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0));  // rip accessible
1053 	} else {
1054 		MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha));
1055 		MOV(8, MatR(tempReg1), Imm8(0));
1056 	}
1057 	SetJumpTarget(skip);
1058 }
1059 
1060 void VertexDecoderJitCache::Jit_Color8888Morph() {
1061 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1062 	if (!cpu_info.bSSE4_1) {
1063 		PXOR(fpScratchReg4, R(fpScratchReg4));
1064 	}
1065 
1066 	bool first = true;
1067 	for (int n = 0; n < dec_->morphcount; ++n) {
1068 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1069 		MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1070 		if (cpu_info.bSSE4_1) {
1071 			PMOVZXBD(reg, R(reg));
1072 		} else {
1073 			PUNPCKLBW(reg, R(fpScratchReg4));
1074 			PUNPCKLWD(reg, R(fpScratchReg4));
1075 		}
1076 
1077 		CVTDQ2PS(reg, R(reg));
1078 
1079 		// And now the weight.
1080 		MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float)));
1081 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1082 		MULPS(reg, R(fpScratchReg3));
1083 
1084 		if (!first) {
1085 			ADDPS(fpScratchReg, R(fpScratchReg2));
1086 		} else {
1087 			first = false;
1088 		}
1089 	}
1090 
1091 	Jit_WriteMorphColor(dec_->decFmt.c0off);
1092 }
1093 
1094 alignas(16) static const float byColor4444[4] = { 255.0f / 15.0f, 255.0f / 15.0f, 255.0f / 15.0f, 255.0f / 15.0f, };
1095 
1096 void VertexDecoderJitCache::Jit_Color4444Morph() {
1097 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1098 	if (!cpu_info.bSSE4_1) {
1099 		PXOR(fpScratchReg4, R(fpScratchReg4));
1100 	}
1101 	MOV(PTRBITS, R(tempReg2), ImmPtr(color4444mask));
1102 	MOVDQA(XMM5, MatR(tempReg2));
1103 	MOV(PTRBITS, R(tempReg2), ImmPtr(byColor4444));
1104 	MOVAPS(XMM6, MatR(tempReg2));
1105 
1106 	bool first = true;
1107 	for (int n = 0; n < dec_->morphcount; ++n) {
1108 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1109 		MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1110 		PUNPCKLBW(reg, R(reg));
1111 		PAND(reg, R(XMM5));
1112 		MOVSS(fpScratchReg3, R(reg));
1113 		PSLLW(fpScratchReg3, 4);
1114 		POR(reg, R(fpScratchReg3));
1115 		PSRLW(reg, 4);
1116 
1117 		if (cpu_info.bSSE4_1) {
1118 			PMOVZXBD(reg, R(reg));
1119 		} else {
1120 			PUNPCKLBW(reg, R(fpScratchReg4));
1121 			PUNPCKLWD(reg, R(fpScratchReg4));
1122 		}
1123 
1124 		CVTDQ2PS(reg, R(reg));
1125 		MULPS(reg, R(XMM6));
1126 
1127 		// And now the weight.
1128 		MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float)));
1129 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1130 		MULPS(reg, R(fpScratchReg3));
1131 
1132 		if (!first) {
1133 			ADDPS(fpScratchReg, R(fpScratchReg2));
1134 		} else {
1135 			first = false;
1136 		}
1137 	}
1138 
1139 	Jit_WriteMorphColor(dec_->decFmt.c0off);
1140 }
1141 
1142 // The mask is intentionally in reverse order (but skips A.)
1143 alignas(16) static const u32 color565Mask[4] = { 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000, };
1144 alignas(16) static const float byColor565[4] = { 255.0f / 31.0f, 255.0f / 63.0f, 255.0f / 31.0f, 255.0f / 1.0f, };
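// Per-lane scale factors that expand the already-isolated 5/6-bit channels to the 0-255 range.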
1145 
1146 void VertexDecoderJitCache::Jit_Color565Morph() {
1147 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1148 	MOV(PTRBITS, R(tempReg2), ImmPtr(color565Mask));
1149 	MOVDQA(XMM5, MatR(tempReg2));
1150 	MOV(PTRBITS, R(tempReg2), ImmPtr(byColor565));
1151 	MOVAPS(XMM6, MatR(tempReg2));
1152 
1153 	bool first = true;
1154 	for (int n = 0; n < dec_->morphcount; ++n) {
1155 		const X64Reg reg = first ? fpScratchReg : fpScratchReg3;
1156 		MOVD_xmm(fpScratchReg2, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1157 		// Spread it out into each lane.  We end up with it reversed (R high, A low.)
1158 		// Below, we shift out each lane from low to high and reverse them.
1159 		PSHUFD(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1160 		PAND(fpScratchReg2, R(XMM5));
1161 
1162 		// Alpha handled in Jit_WriteMorphColor.
1163 
1164 		// Blue first.
1165 		MOVSS(reg, R(fpScratchReg2));
1166 		PSRLD(reg, 6);
1167 		PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 0, 0, 0));
1168 
1169 		// Green, let's shift it into the right lane first.
1170 		PSRLDQ(fpScratchReg2, 4);
1171 		MOVSS(reg, R(fpScratchReg2));
1172 		PSRLD(reg, 5);
1173 		PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 0));
1174 
1175 		// Last one, red.
1176 		PSRLDQ(fpScratchReg2, 4);
1177 		MOVSS(reg, R(fpScratchReg2));
1178 
1179 		CVTDQ2PS(reg, R(reg));
1180 		MULPS(reg, R(XMM6));
1181 
1182 		// And now the weight.
1183 		MOVSS(fpScratchReg2, MDisp(tempReg1, n * sizeof(float)));
1184 		SHUFPS(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1185 		MULPS(reg, R(fpScratchReg2));
1186 
1187 		if (!first) {
1188 			ADDPS(fpScratchReg, R(fpScratchReg3));
1189 		} else {
1190 			first = false;
1191 		}
1192 	}
1193 
1194 	Jit_WriteMorphColor(dec_->decFmt.c0off, false);
1195 }
1196 
1197 // The mask is intentionally in reverse order.
1198 alignas(16) static const u32 color5551Mask[4] = { 0x00008000, 0x00007c00, 0x000003e0, 0x0000001f, };
1199 alignas(16) static const float byColor5551[4] = { 255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 1.0f, };
1200 
1201 void VertexDecoderJitCache::Jit_Color5551Morph() {
1202 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1203 	MOV(PTRBITS, R(tempReg2), ImmPtr(color5551Mask));
1204 	MOVDQA(XMM5, MatR(tempReg2));
1205 	MOV(PTRBITS, R(tempReg2), ImmPtr(byColor5551));
1206 	MOVAPS(XMM6, MatR(tempReg2));
1207 
1208 	bool first = true;
1209 	for (int n = 0; n < dec_->morphcount; ++n) {
1210 		const X64Reg reg = first ? fpScratchReg : fpScratchReg3;
1211 		MOVD_xmm(fpScratchReg2, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
1212 		// Spread it out into each lane.
1213 		PSHUFD(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1214 		PAND(fpScratchReg2, R(XMM5));
1215 
1216 		// Alpha first.
1217 		MOVSS(reg, R(fpScratchReg2));
1218 		PSRLD(reg, 5);
1219 		PSHUFD(reg, R(reg), _MM_SHUFFLE(0, 0, 0, 0));
1220 
1221 		// Blue, let's shift it into the right lane first.
1222 		PSRLDQ(fpScratchReg2, 4);
1223 		MOVSS(reg, R(fpScratchReg2));
1224 		PSRLD(reg, 5);
1225 		PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 0, 0, 0));
1226 
1227 		// Green.
1228 		PSRLDQ(fpScratchReg2, 4);
1229 		MOVSS(reg, R(fpScratchReg2));
1230 		PSRLD(reg, 5);
1231 		PSHUFD(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 0));
1232 
1233 		// Last one, red.
1234 		PSRLDQ(fpScratchReg2, 4);
1235 		MOVSS(reg, R(fpScratchReg2));
1236 
1237 		CVTDQ2PS(reg, R(reg));
1238 		MULPS(reg, R(XMM6));
1239 
1240 		// And now the weight.
1241 		MOVSS(fpScratchReg2, MDisp(tempReg1, n * sizeof(float)));
1242 		SHUFPS(fpScratchReg2, R(fpScratchReg2), _MM_SHUFFLE(0, 0, 0, 0));
1243 		MULPS(reg, R(fpScratchReg2));
1244 
1245 		if (!first) {
1246 			ADDPS(fpScratchReg, R(fpScratchReg3));
1247 		} else {
1248 			first = false;
1249 		}
1250 	}
1251 
1252 	Jit_WriteMorphColor(dec_->decFmt.c0off);
1253 }
1254 
1255 void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
1256 	// Pack back into a u32, with saturation.
1257 	CVTPS2DQ(fpScratchReg, R(fpScratchReg));
1258 	PACKSSDW(fpScratchReg, R(fpScratchReg));
1259 	PACKUSWB(fpScratchReg, R(fpScratchReg));
1260 	MOVD_xmm(R(tempReg1), fpScratchReg);
1261 
1262 	// TODO: May be a faster way to do this without the MOVD.
1263 	if (checkAlpha) {
1264 		CMP(32, R(tempReg1), Imm32(0xFF000000));
1265 		FixupBranch skip = J_CC(CC_AE, false);
1266 		if (RipAccessible(&gstate_c.vertexFullAlpha)) {
1267 			MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0));  // rip accessible
1268 		} else {
1269 			MOV(PTRBITS, R(tempReg2), ImmPtr(&gstate_c.vertexFullAlpha));
1270 			MOV(8, MatR(tempReg2), Imm8(0));
1271 		}
1272 		SetJumpTarget(skip);
1273 	} else {
1274 		// Force alpha to full if we're not checking it.
1275 		OR(32, R(tempReg1), Imm32(0xFF000000));
1276 	}
1277 
1278 	MOV(32, MDisp(dstReg, outOff), R(tempReg1));
1279 }
1280 
1281 // Copy 3 bytes and then a zero. Might as well copy four.
1282 void VertexDecoderJitCache::Jit_NormalS8() {
1283 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1284 	AND(32, R(tempReg1), Imm32(0x00FFFFFF));
1285 	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1286 }
1287 
1288 void VertexDecoderJitCache::Jit_NormalS8ToFloat() {
1289 	Jit_AnyS8ToFloat(dec_->nrmoff);
1290 	MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM3);
1291 }
1292 
1293 // Copy 6 bytes and then 2 zeroes.
1294 void VertexDecoderJitCache::Jit_NormalS16() {
1295 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1296 	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->nrmoff + 4));
1297 	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1298 	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
1299 }
1300 
1301 void VertexDecoderJitCache::Jit_NormalFloat() {
1302 	if (cpu_info.Mode64bit) {
1303 		MOV(64, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1304 		MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8));
1305 		MOV(64, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1306 		MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
1307 	} else {
1308 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
1309 		MOV(32, R(tempReg2), MDisp(srcReg, dec_->nrmoff + 4));
1310 		MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8));
1311 		MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
1312 		MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
1313 		MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
1314 	}
1315 }
1316 
1317 // This could be a bit shorter with AVX 3-operand instructions and FMA.
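// Computes the vector in XMM3 times the matrix whose rows sit in XMM4-XMM7: broadcast each
// component of XMM3, multiply by the corresponding row, and sum. When pos is set, the
// translation row (XMM7) is added as well.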
1318 void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
1319 	MOVAPS(XMM1, R(XMM3));
1320 	MOVAPS(XMM2, R(XMM3));
1321 	SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
1322 	SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
1323 	SHUFPS(XMM3, R(XMM3), _MM_SHUFFLE(2, 2, 2, 2));
1324 	MULPS(XMM1, R(XMM4));
1325 	MULPS(XMM2, R(XMM5));
1326 	MULPS(XMM3, R(XMM6));
1327 	ADDPS(XMM1, R(XMM2));
1328 	ADDPS(XMM1, R(XMM3));
1329 	if (pos) {
1330 		ADDPS(XMM1, R(XMM7));
1331 	}
1332 	MOVUPS(MDisp(dstReg, outOff), XMM1);
1333 }
1334 
1335 void VertexDecoderJitCache::Jit_NormalS8Skin() {
1336 	Jit_AnyS8ToFloat(dec_->nrmoff);
1337 	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1338 }
1339 
1340 void VertexDecoderJitCache::Jit_NormalS16Skin() {
1341 	Jit_AnyS16ToFloat(dec_->nrmoff);
1342 	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1343 }
1344 
1345 void VertexDecoderJitCache::Jit_NormalFloatSkin() {
1346 	MOVUPS(XMM3, MDisp(srcReg, dec_->nrmoff));
1347 	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1348 }
1349 
1350 // Through expands into floats, always. Might want to look at changing this.
1351 void VertexDecoderJitCache::Jit_PosS8Through() {
1352 	DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
1353 	// SIMD doesn't really matter since this isn't useful on hardware.
1354 	for (int i = 0; i < 3; i++) {
1355 		MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
1356 		CVTSI2SS(fpScratchReg, R(tempReg1));
1357 		MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg);
1358 	}
1359 }
1360 
1361 // Through expands into floats, always. Might want to look at changing this.
1362 void VertexDecoderJitCache::Jit_PosS16Through() {
1363 	if (cpu_info.bSSE4_1) {
1364 		MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->posoff));
1365 		MOVZX(32, 16, tempReg3, MDisp(srcReg, dec_->posoff + 4));
1366 		MOVD_xmm(fpScratchReg2, R(tempReg3));
1367 		PMOVSXWD(fpScratchReg, R(fpScratchReg));
1368 		PUNPCKLQDQ(fpScratchReg, R(fpScratchReg2));
1369 		CVTDQ2PS(fpScratchReg, R(fpScratchReg));
1370 		MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), fpScratchReg);
1371 	} else {
1372 		MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff));
1373 		MOVSX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 2));
1374 		MOVZX(32, 16, tempReg3, MDisp(srcReg, dec_->posoff + 4));  // NOTE: MOVZX
1375 		CVTSI2SS(fpScratchReg, R(tempReg1));
1376 		MOVSS(MDisp(dstReg, dec_->decFmt.posoff), fpScratchReg);
1377 		CVTSI2SS(fpScratchReg, R(tempReg2));
1378 		MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 4), fpScratchReg);
1379 		CVTSI2SS(fpScratchReg, R(tempReg3));
1380 		MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 8), fpScratchReg);
1381 	}
1382 }
1383 
1384 void VertexDecoderJitCache::Jit_PosS8() {
1385 	Jit_AnyS8ToFloat(dec_->posoff);
1386 	MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
1387 }
1388 
1389 void VertexDecoderJitCache::Jit_PosS16() {
1390 	Jit_AnyS16ToFloat(dec_->posoff);
1391 	MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
1392 }
1393 
1394 // Just copy 12 bytes.
1395 void VertexDecoderJitCache::Jit_PosFloat() {
1396 	if (cpu_info.Mode64bit) {
1397 		MOV(64, R(tempReg1), MDisp(srcReg, dec_->posoff));
1398 		MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8));
1399 		MOV(64, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
1400 		MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
1401 	} else {
1402 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
1403 		MOV(32, R(tempReg2), MDisp(srcReg, dec_->posoff + 4));
1404 		MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8));
1405 		MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
1406 		MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
1407 		MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
1408 	}
1409 }
1410 
1411 void VertexDecoderJitCache::Jit_PosS8Skin() {
1412 	Jit_AnyS8ToFloat(dec_->posoff);
1413 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1414 }
1415 
1416 void VertexDecoderJitCache::Jit_PosS16Skin() {
1417 	Jit_AnyS16ToFloat(dec_->posoff);
1418 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1419 }
1420 
1421 void VertexDecoderJitCache::Jit_PosFloatSkin() {
1422 	MOVUPS(XMM3, MDisp(srcReg, dec_->posoff));
1423 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1424 }
1425 
1426 void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
1427 	if (!cpu_info.bSSE4_1) {
1428 		PXOR(XMM3, R(XMM3));
1429 	}
1430 	MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1431 	if (cpu_info.bSSE4_1) {
1432 		PMOVSXBD(XMM1, R(XMM1));
1433 	} else {
1434 		PUNPCKLBW(XMM1, R(XMM3));
1435 		PUNPCKLWD(XMM1, R(XMM3));
1436 		PSLLD(XMM1, 24);
1437 		PSRAD(XMM1, 24);
1438 	}
1439 	CVTDQ2PS(XMM3, R(XMM1));
1440 	if (RipAccessible(&by128)) {
1441 		MULPS(XMM3, M(&by128));  // rip accessible
1442 	} else {
1443 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
1444 		MULPS(XMM3, MatR(tempReg1));
1445 	}
1446 }
1447 
1448 void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
1449 	if (!cpu_info.bSSE4_1) {
1450 		PXOR(XMM3, R(XMM3));
1451 	}
1452 	MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
1453 	if (cpu_info.bSSE4_1) {
1454 		PMOVSXWD(XMM1, R(XMM1));
1455 	} else {
1456 		PUNPCKLWD(XMM1, R(XMM3));
1457 		PSLLD(XMM1, 16);
1458 		PSRAD(XMM1, 16);
1459 	}
1460 	CVTDQ2PS(XMM3, R(XMM1));
1461 	if (RipAccessible(&by32768)) {
1462 		MULPS(XMM3, M(&by32768));  // rip accessible
1463 	} else {
1464 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
1465 		MULPS(XMM3, MatR(tempReg1));
1466 	}
1467 }
1468 
1469 void VertexDecoderJitCache::Jit_AnyU8ToFloat(int srcoff, u32 bits) {
1470 	_dbg_assert_msg_((bits & ~(32 | 16 | 8)) == 0, "Bits must be a multiple of 8.");
1471 	_dbg_assert_msg_(bits >= 8 && bits <= 32, "Bits must be between 8 and 32.");
1472 
1473 	if (!cpu_info.bSSE4_1) {
1474 		PXOR(XMM3, R(XMM3));
1475 	}
1476 	if (bits == 32) {
1477 		MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1478 	} else if (bits == 24) {
1479 		MOV(32, R(tempReg1), MDisp(srcReg, srcoff));
1480 		AND(32, R(tempReg1), Imm32(0x00FFFFFF));
1481 		MOVD_xmm(XMM1, R(tempReg1));
1482 	} else {
1483 		MOVZX(32, bits, tempReg1, MDisp(srcReg, srcoff));
1484 		MOVD_xmm(XMM1, R(tempReg1));
1485 	}
1486 	if (cpu_info.bSSE4_1) {
1487 		PMOVZXBD(XMM1, R(XMM1));
1488 	} else {
1489 		PUNPCKLBW(XMM1, R(XMM3));
1490 		PUNPCKLWD(XMM1, R(XMM3));
1491 	}
1492 	CVTDQ2PS(XMM3, R(XMM1));
1493 	if (RipAccessible(&by128)) {
1494 		MULPS(XMM3, M(&by128));  // rip accessible
1495 	} else {
1496 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
1497 		MULPS(XMM3, MatR(tempReg1));
1498 	}
1499 }
1500 
1501 void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff, u32 bits) {
1502 	_dbg_assert_msg_((bits & ~(64 | 32 | 16)) == 0, "Bits must be a multiple of 16.");
1503 	_dbg_assert_msg_(bits >= 16 && bits <= 64, "Bits must be between 16 and 64.");
1504 
1505 	if (!cpu_info.bSSE4_1) {
1506 		PXOR(XMM3, R(XMM3));
1507 	}
1508 	if (bits == 64) {
1509 		MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
1510 	} else if (bits == 48) {
1511 		MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1512 		PINSRW(XMM1, MDisp(srcReg, srcoff + 4), 2);
1513 	} else if (bits == 32) {
1514 		MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
1515 	} else if (bits == 16) {
1516 		MOVZX(32, bits, tempReg1, MDisp(srcReg, srcoff));
1517 		MOVD_xmm(XMM1, R(tempReg1));
1518 	}
1519 	if (cpu_info.bSSE4_1) {
1520 		PMOVZXWD(XMM1, R(XMM1));
1521 	} else {
1522 		PUNPCKLWD(XMM1, R(XMM3));
1523 	}
1524 	CVTDQ2PS(XMM3, R(XMM1));
1525 	if (RipAccessible(&by32768)) {
1526 		MULPS(XMM3, M(&by32768));  // rip accessible
1527 	} else {
1528 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
1529 		MULPS(XMM3, MatR(tempReg1));
1530 	}
1531 }
1532 
1533 void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
1534 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1535 	if (!cpu_info.bSSE4_1) {
1536 		PXOR(fpScratchReg4, R(fpScratchReg4));
1537 	}
1538 	if (RipAccessible(&by128)) {
1539 		MOVAPS(XMM5, M(&by128));  // rip accessible
1540 	} else {
1541 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by128));
1542 		MOVAPS(XMM5, MatR(tempReg1));
1543 	}
1544 
1545 	// Sum into fpScratchReg.
1546 	bool first = true;
1547 	for (int n = 0; n < dec_->morphcount; ++n) {
1548 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1549 		// Okay, first convert to floats.
1550 		MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
1551 		if (cpu_info.bSSE4_1) {
1552 			PMOVSXBD(reg, R(reg));
1553 		} else {
1554 			PUNPCKLBW(reg, R(fpScratchReg4));
1555 			PUNPCKLWD(reg, R(fpScratchReg4));
1556 			PSLLD(reg, 24);
1557 			PSRAD(reg, 24);
1558 		}
1559 		CVTDQ2PS(reg, R(reg));
1560 
1561 		// Now it's time to multiply by the weight and 1.0f/128.0f.
1562 		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
1563 		MULSS(fpScratchReg3, R(XMM5));
1564 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1565 
1566 		MULPS(reg, R(fpScratchReg3));
1567 		if (!first) {
1568 			ADDPS(fpScratchReg, R(fpScratchReg2));
1569 		} else {
1570 			first = false;
1571 		}
1572 	}
1573 
1574 	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
1575 }
1576 
1577 void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
1578 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1579 	if (!cpu_info.bSSE4_1) {
1580 		PXOR(fpScratchReg4, R(fpScratchReg4));
1581 	}
1582 	if (RipAccessible(&by32768)) {
1583 		MOVAPS(XMM5, M(&by32768));  // rip accessible
1584 	} else {
1585 		MOV(PTRBITS, R(tempReg1), ImmPtr(&by32768));
1586 		MOVAPS(XMM5, MatR(tempReg1));
1587 	}
1588 
1589 	// Sum into fpScratchReg.
1590 	bool first = true;
1591 	for (int n = 0; n < dec_->morphcount; ++n) {
1592 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1593 		// Okay, first convert to floats.
1594 		MOVQ_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
1595 		if (cpu_info.bSSE4_1) {
1596 			PMOVSXWD(reg, R(reg));
1597 		} else {
1598 			PUNPCKLWD(reg, R(fpScratchReg4));
1599 			PSLLD(reg, 16);
1600 			PSRAD(reg, 16);
1601 		}
1602 		CVTDQ2PS(reg, R(reg));
1603 
1604 		// Now it's time to multiply by the weight and 1.0f/32768.0f.
1605 		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
1606 		MULSS(fpScratchReg3, R(XMM5));
1607 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1608 
1609 		MULPS(reg, R(fpScratchReg3));
1610 		if (!first) {
1611 			ADDPS(fpScratchReg, R(fpScratchReg2));
1612 		} else {
1613 			first = false;
1614 		}
1615 	}
1616 
1617 	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
1618 }
1619 
1620 void VertexDecoderJitCache::Jit_AnyFloatMorph(int srcoff, int dstoff) {
1621 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
1622 
1623 	// Sum into fpScratchReg.
1624 	bool first = true;
1625 	for (int n = 0; n < dec_->morphcount; ++n) {
1626 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
1627 		MOVUPS(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
1628 		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
1629 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
1630 		MULPS(reg, R(fpScratchReg3));
1631 		if (!first) {
1632 			ADDPS(fpScratchReg, R(fpScratchReg2));
1633 		} else {
1634 			first = false;
1635 		}
1636 	}
1637 
1638 	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
1639 }
1640 
1641 void VertexDecoderJitCache::Jit_PosS8Morph() {
1642 	Jit_AnyS8Morph(dec_->posoff, dec_->decFmt.posoff);
1643 }
1644 
1645 void VertexDecoderJitCache::Jit_PosS16Morph() {
1646 	Jit_AnyS16Morph(dec_->posoff, dec_->decFmt.posoff);
1647 }
1648 
1649 void VertexDecoderJitCache::Jit_PosFloatMorph() {
1650 	Jit_AnyFloatMorph(dec_->posoff, dec_->decFmt.posoff);
1651 }
1652 
1653 void VertexDecoderJitCache::Jit_NormalS8Morph() {
1654 	Jit_AnyS8Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
1655 }
1656 
1657 void VertexDecoderJitCache::Jit_NormalS16Morph() {
1658 	Jit_AnyS16Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
1659 }
1660 
1661 void VertexDecoderJitCache::Jit_NormalFloatMorph() {
1662 	Jit_AnyFloatMorph(dec_->nrmoff, dec_->decFmt.nrmoff);
1663 }
1664 
1665 bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
1666 	// See if we find a matching JIT function
1667 	for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
1668 		if (dec.steps_[step] == jitLookup[i].func) {
1669 			((*this).*jitLookup[i].jitFunc)();
1670 			return true;
1671 		}
1672 	}
1673 	return false;
1674 }
1675 
1676 #endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
1677