// Copyright (c) 2013- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"

#if PPSSPP_ARCH(ARM)

// This allows highlighting to work.  Yay.
#ifdef __INTELLISENSE__
#define ARM
#endif

#include <stddef.h>

#include "Common/CPUDetect.h"
#include "Core/Config.h"
#include "Core/Reporting.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"

extern void DisassembleArm(const u8 *data, int size);

bool NEONSkinning = false;
bool NEONMorphing = false;

// Used only in non-NEON mode.
alignas(16) static float skinMatrix[12];

// Will be used only in NEON mode.
alignas(16) static float bones[16 * 8];  // First two are kept in registers
alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f};

// NEON register allocation:
// Q0: Texture scaling parameters
// Q1: Temp storage
// Q2: Vector-by-matrix accumulator
// Q3: Unused (multiplier temp when morphing)
//
// When skinning, we'll use Q4-Q7 as the "matrix accumulator".
// First two matrices will be preloaded into Q8-Q11 and Q12-Q15 to reduce
// memory bandwidth requirements.
// The rest will be dumped to bones as on x86.
//
// When morphing, we never skin.  So we're free to use Q4+.
// Q4 is for color shift values, and Q5 is a secondary multiplier inside the morph.
// TODO: Maybe load all morph weights to Q6+ to avoid memory access?


static const float by128 = 1.0f / 128.0f;
static const float by16384 = 1.0f / 16384.0f;
static const float by32768 = 1.0f / 32768.0f;
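
// Reciprocals used to normalize fixed-point vertex data to float: u8/s8
// values are scaled by 1/128 and s16 values by 1/32768, so e.g. an s16
// normal of 16384 decodes to 0.5.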

using namespace ArmGen;

// NOTE: Avoid R9, it's dangerous on iOS.
//
// r0-r3: parameters
// r4-r11: local vars. save, except R9.
// r12: interprocedure scratch
// r13: stack

static const ARMReg tempReg1 = R3;
static const ARMReg tempReg2 = R4;
static const ARMReg tempReg3 = R5;
static const ARMReg scratchReg = R6;
static const ARMReg scratchReg2 = R7;
static const ARMReg scratchReg3 = R8;
static const ARMReg fullAlphaReg = R12;
static const ARMReg srcReg = R0;
static const ARMReg dstReg = R1;
static const ARMReg counterReg = R2;
static const ARMReg fpScratchReg = S4;
static const ARMReg fpScratchReg2 = S5;
static const ARMReg fpScratchReg3 = S6;
static const ARMReg fpScratchReg4 = S7;
static const ARMReg fpUscaleReg = S0;
static const ARMReg fpVscaleReg = S1;
static const ARMReg fpUoffsetReg = S2;
static const ARMReg fpVoffsetReg = S3;

// Simpler aliases for NEON. Overlaps with corresponding VFP regs.
static const ARMReg neonUVScaleReg = D0;
static const ARMReg neonUVOffsetReg = D1;
static const ARMReg neonScratchReg = D2;
static const ARMReg neonScratchReg2 = D3;
static const ARMReg neonScratchRegQ = Q1;  // Overlaps with all the scratch regs

// Everything above S6 is fair game for skinning

// S8-S15 are used during matrix generation

// These only live through the matrix multiplication
static const ARMReg src[3] = {S8, S9, S10};  // skin source
static const ARMReg acc[3] = {S11, S12, S13};  // skin accumulator

static const ARMReg srcNEON = Q2;
static const ARMReg accNEON = Q3;

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},

	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},

	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},

	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},

	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},

	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},

	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},

	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},

	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},

	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},

	{&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
	{&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
	{&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},

	{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
	{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
	{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},

	{&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
	{&VertexDecoder::Step_Color4444Morph, &VertexDecoderJitCache::Jit_Color4444Morph},
	{&VertexDecoder::Step_Color565Morph, &VertexDecoderJitCache::Jit_Color565Morph},
	{&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
};
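
// Compile() walks dec.numSteps_ and, via this table, emits the matching
// Jit_* routine inline for each step function in the decoder.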

JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
	dec_ = &dec;
	BeginWrite();
	const u8 *start = AlignCode16();

	bool prescaleStep = false;
	bool skinning = false;

	NEONSkinning = cpu_info.bNEON;
	NEONMorphing = cpu_info.bNEON;

	// Look for prescaled texcoord steps
	for (int i = 0; i < dec.numSteps_; i++) {
		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
				prescaleStep = true;
		}
		if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
			dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
			dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
				skinning = true;
		}
	}

	// Not used below, but useful for logging.
	(void)skinning;

	SetCC(CC_AL);

	PUSH(8, R4, R5, R6, R7, R8, R10, R11, R_LR);
	if (NEONSkinning || NEONMorphing) {
		VPUSH(D8, 8);
	}

	// Keep the scale/offset in a few fp registers if we need it.
	if (prescaleStep) {
		MOVP2R(R3, &gstate_c.uv);
		if (cpu_info.bNEON) {
			VLD1(F_32, neonUVScaleReg, R3, 2, ALIGN_NONE);
			if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
				VMOV_neon(F_32, neonScratchReg, by128);
				VMUL(F_32, neonUVScaleReg, neonUVScaleReg, neonScratchReg);
			} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
				VMOV_neon(F_32, neonScratchReg, by32768);
				VMUL(F_32, neonUVScaleReg, neonUVScaleReg, neonScratchReg);
			}
		} else {
			VLDMIA(R3, false, fpUscaleReg, 4); // fp{Uscale, Vscale, Uoffset, Voffset}Reg = {S0-S3}
			if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
				MOVI2F(fpScratchReg, by128, scratchReg);
				VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
				VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
			} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
				MOVI2F(fpScratchReg, by32768, scratchReg);
				VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
				VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
			}
		}
	}

	// Add code to convert matrices to 4x4.
	// Later we might want to do this when the matrices are loaded instead.
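	// Each PSP bone matrix is 4x3 (12 floats); we load four rows of three as
	// overlapping 128-bit chunks and multiply by boneMask {1,1,1,0} to zero
	// the stray fourth lane, producing padded 4x4 matrices.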
	if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning) {
		// Copying from R3 to R4
		MOVP2R(R3, gstate.boneMatrix);
		MOVP2R(R4, bones);
		MOVP2R(R5, boneMask);
		VLD1(F_32, Q3, R5, 2, ALIGN_128);
		for (int i = 0; i < dec.nweights; i++) {
			VLD1(F_32, Q4, R3, 2);  // Load 128 bits even though we just want 96
			VMUL(F_32, Q4, Q4, Q3);
			ADD(R3, R3, 12);
			VLD1(F_32, Q5, R3, 2);
			VMUL(F_32, Q5, Q5, Q3);
			ADD(R3, R3, 12);
			VLD1(F_32, Q6, R3, 2);
			VMUL(F_32, Q6, Q6, Q3);
			ADD(R3, R3, 12);
			VLD1(F_32, Q7, R3, 2);
			VMUL(F_32, Q7, Q7, Q3);
			ADD(R3, R3, 12);
			// First two matrices are in registers.
			if (i == 0) {
				VMOV(Q8, Q4);
				VMOV(Q9, Q5);
				VMOV(Q10, Q6);
				VMOV(Q11, Q7);
				ADD(R4, R4, 16 * 4);
			} else if (i == 1) {
				VMOV(Q12, Q4);
				VMOV(Q13, Q5);
				VMOV(Q14, Q6);
				VMOV(Q15, Q7);
				ADD(R4, R4, 16 * 4);
			} else {
				VST1(F_32, Q4, R4, 2, ALIGN_128, REG_UPDATE);
				VST1(F_32, Q5, R4, 2, ALIGN_128, REG_UPDATE);
				VST1(F_32, Q6, R4, 2, ALIGN_128, REG_UPDATE);
				VST1(F_32, Q7, R4, 2, ALIGN_128, REG_UPDATE);
			}
		}
	}

	if (dec.col) {
		// Or LDRB and skip the conditional?  This is probably cheaper.
		MOV(fullAlphaReg, 0xFF);
	}

	JumpTarget loopStart = GetCodePtr();
	// Preload data cache ahead of reading. This offset seems pretty good.
	PLD(srcReg, 64);
	for (int i = 0; i < dec.numSteps_; i++) {
		if (!CompileStep(dec, i)) {
			EndWrite();
			// Reset the code ptr and return zero to indicate that we failed.
			ResetCodePtr(GetOffset(start));
			char temp[1024] = {0};
			dec.ToString(temp);
			INFO_LOG(G3D, "Could not compile vertex decoder: %s", temp);
			return 0;
		}
	}

	ADDI2R(srcReg, srcReg, dec.VertexSize(), scratchReg);
	ADDI2R(dstReg, dstReg, dec.decFmt.stride, scratchReg);
	SUBS(counterReg, counterReg, 1);
	B_CC(CC_NEQ, loopStart);

	if (dec.col) {
		MOVP2R(tempReg1, &gstate_c.vertexFullAlpha);
		CMP(fullAlphaReg, 0);
		SetCC(CC_EQ);
		STRB(fullAlphaReg, tempReg1, 0);
		SetCC(CC_AL);
	}

	if (NEONSkinning || NEONMorphing) {
		VPOP(D8, 8);
	}
	POP(8, R4, R5, R6, R7, R8, R10, R11, R_PC);

	FlushLitPool();
	FlushIcache();

	/*
	DisassembleArm(start, GetCodePtr() - start);
	char temp[1024] = {0};
	dec.ToString(temp);
	INFO_LOG(G3D, "%s", temp);
	*/

	*jittedSize = GetCodePtr() - start;
	EndWrite();
	return (JittedVertexDecoder)start;
}
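
// Judging by the register assignments above (srcReg = R0, dstReg = R1,
// counterReg = R2), the returned code is invoked roughly as:
//
//   JittedVertexDecoder decode = cache.Compile(dec, &size);
//   decode(srcVerts, dstVerts, vertexCount);  // loops once per vertex
//
// This is a sketch inferred from the calling convention, not a quote of the
// actual typedef.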

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRB(tempReg1, srcReg, dec_->weightoff + j);
		STRB(tempReg1, dstReg, dec_->decFmt.w0off + j);
	}
	if (j & 3) {
		// Create a zero register. Might want to make a fixed one.
		EOR(scratchReg, scratchReg, scratchReg);
	}
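	// Pad the remaining weight slots with zeroes, rounding up to a group of 4.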
	while (j & 3) {
		STRB(scratchReg, dstReg, dec_->decFmt.w0off + j);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
		STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
	}
	if (j & 3) {
		// Create a zero register. Might want to make a fixed one.
		EOR(scratchReg, scratchReg, scratchReg);
	}
	while (j & 3) {
		STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDR(tempReg1, srcReg, dec_->weightoff + j * 4);
		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
	}
	if (j & 3) {
		EOR(tempReg1, tempReg1, tempReg1);
	}
	while (j & 3) {  // Zero additional weights rounding up to 4.
		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
		j++;
	}
}

static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };
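
// Blends up to 8 bone matrices, weighted by the decoded vertex weights: the
// result accumulates into Q4-Q7 on NEON (or into skinMatrix in memory on VFP),
// effectively computing skinMatrix = sum(weight[i] * bone[i]).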

void VertexDecoderJitCache::Jit_ApplyWeights() {
	if (NEONSkinning) {
		// We construct a matrix in Q4-Q7
		// We can use Q1 as temp.
		if (dec_->nweights >= 2) {
			MOVP2R(scratchReg, bones + 16 * 2);
		}
		for (int i = 0; i < dec_->nweights; i++) {
			switch (i) {
			case 0:
				VMUL_scalar(F_32, Q4, Q8, QScalar(neonWeightRegsQ[0], 0));
				VMUL_scalar(F_32, Q5, Q9, QScalar(neonWeightRegsQ[0], 0));
				VMUL_scalar(F_32, Q6, Q10, QScalar(neonWeightRegsQ[0], 0));
				VMUL_scalar(F_32, Q7, Q11, QScalar(neonWeightRegsQ[0], 0));
				break;
			case 1:
				// Krait likes VDUP + VFMA better than VMLA, and it's easy to do here.
				if (cpu_info.bVFPv4) {
					VDUP(F_32, Q1, neonWeightRegsQ[i >> 2], i & 1);
					VFMA(F_32, Q4, Q12, Q1);
					VFMA(F_32, Q5, Q13, Q1);
					VFMA(F_32, Q6, Q14, Q1);
					VFMA(F_32, Q7, Q15, Q1);
				} else {
					VMLA_scalar(F_32, Q4, Q12, QScalar(neonWeightRegsQ[0], 1));
					VMLA_scalar(F_32, Q5, Q13, QScalar(neonWeightRegsQ[0], 1));
					VMLA_scalar(F_32, Q6, Q14, QScalar(neonWeightRegsQ[0], 1));
					VMLA_scalar(F_32, Q7, Q15, QScalar(neonWeightRegsQ[0], 1));
				}
				break;
			default:
				// Matrices 2+ need to be loaded from memory.
				// Wonder if we can free up one more register so we could get some parallelism.
				// Actually Q3 is free if there are fewer than 5 weights...
				if (dec_->nweights <= 4) {
					VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VLD1(F_32, Q3, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VMLA_scalar(F_32, Q4, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
					VMLA_scalar(F_32, Q5, Q3, QScalar(neonWeightRegsQ[i >> 2], i & 3));
					VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VLD1(F_32, Q3, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VMLA_scalar(F_32, Q6, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
					VMLA_scalar(F_32, Q7, Q3, QScalar(neonWeightRegsQ[i >> 2], i & 3));
				} else {
					VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VMLA_scalar(F_32, Q4, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
					VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VMLA_scalar(F_32, Q5, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
					VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VMLA_scalar(F_32, Q6, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
					VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
					VMLA_scalar(F_32, Q7, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
				}
				break;
			}
		}
	} else {
		MOVP2R(tempReg2, skinMatrix);
		// This approach saves a few stores but accesses the matrices in a more
		// sparse order.
		const float *bone = &gstate.boneMatrix[0];
		MOVP2R(tempReg1, bone);
		for (int i = 0; i < 12; i++) {
			VLDR(fpScratchReg3, tempReg1, i * 4);
			VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]);
			for (int j = 1; j < dec_->nweights; j++) {
				VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12);
				VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]);
			}
			VSTR(fpScratchReg3, tempReg2, i * 4);
		}
	}
}

void VertexDecoderJitCache::Jit_WeightsU8Skin() {
	if (NEONSkinning) {
		// Weight is first so srcReg is correct.
		switch (dec_->nweights) {
		case 1: VLD1_lane(I_8, neonScratchReg, srcReg, 0, false); break;
		case 2: VLD1_lane(I_16, neonScratchReg, srcReg, 0, false); break;
		default:
			// For 3 weights we over-read; for more than 4, we read the rest later.
			VLD1_lane(I_32, neonScratchReg, srcReg, 0, false);
			break;
		}
		// This can be represented as a constant.
		VMOV_neon(F_32, Q3, by128);
		VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
		VMUL(F_32, neonWeightRegsQ[0], neonScratchRegQ, Q3);

		if (dec_->nweights > 4) {
			ADD(tempReg1, srcReg, 4 * sizeof(u8));
			switch (dec_->nweights) {
			case 5: VLD1_lane(I_8, neonScratchReg, tempReg1, 0, false); break;
			case 6: VLD1_lane(I_16, neonScratchReg, tempReg1, 0, false); break;
			case 7:
			case 8:
				VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false);
				break;
			}
			VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
			VMUL(F_32, neonWeightRegsQ[1], neonScratchRegQ, Q3);
		}
	} else {
		for (int j = 0; j < dec_->nweights; j++) {
			LDRB(tempReg1, srcReg, dec_->weightoff + j);
			VMOV(fpScratchReg, tempReg1);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
			MOVI2F(fpScratchReg2, by128, scratchReg);
			VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
		}
	}
	Jit_ApplyWeights();
}

void VertexDecoderJitCache::Jit_WeightsU16Skin() {
	if (NEONSkinning) {
		switch (dec_->nweights) {
		case 1: VLD1_lane(I_16, neonScratchReg, srcReg, 0, true); break;
		case 2: VLD1_lane(I_32, neonScratchReg, srcReg, 0, false); break;
		default:
			// For 3 weights we over-read; for more than 4, we read the rest later.
			VLD1(I_32, neonScratchReg, srcReg, 1, ALIGN_NONE);
			break;
		}
		// This can be represented as a constant.
		VMOV_neon(F_32, Q3, by32768);
		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
		VMUL(F_32, neonWeightRegsQ[0], neonScratchRegQ, Q3);

		if (dec_->nweights > 4) {
			ADD(tempReg1, srcReg, 4 * sizeof(u16));
			switch (dec_->nweights) {
			case 5: VLD1_lane(I_16, neonScratchReg, tempReg1, 0, true); break;
			case 6: VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false); break;
			case 7:
			case 8:
				VLD1(I_32, neonScratchReg, tempReg1, 1, ALIGN_NONE);
				break;
			}
			VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
			VMUL(F_32, neonWeightRegsQ[1], neonScratchRegQ, Q3);
		}
	} else {
		// Fallback and non-neon
		for (int j = 0; j < dec_->nweights; j++) {
			LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
			VMOV(fpScratchReg, tempReg1);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
			MOVI2F(fpScratchReg2, by32768, scratchReg);
			VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
		}
	}
	Jit_ApplyWeights();
}

void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
	for (int i = 1; i < dec_->nweights; ++i) {
		_dbg_assert_msg_(weightRegs[i - 1] + 1 == weightRegs[i], "VertexDecoder weightRegs must be in order.");
	}

	// Weights are always first, so we can use srcReg directly.
	if (NEONSkinning) {
		// Not using VLDMIA here since that path breaks Daxter - VLDMIA with D registers doesn't seem to work as expected.
		if (dec_->nweights == 1) {
			VLD1_lane(F_32, neonWeightRegsD[0], srcReg, 0, true);
		} else {
			// We may over-read by one float but this is not a tragedy.
			VLD1(F_32, neonWeightRegsD[0], srcReg, (dec_->nweights + 1) / 2);
		}
	} else {
		VLDMIA(srcReg, false, weightRegs[0], dec_->nweights);
	}
	Jit_ApplyWeights();
}

void VertexDecoderJitCache::Jit_TcFloat() {
	LDR(tempReg1, srcReg, dec_->tcoff);
	LDR(tempReg2, srcReg, dec_->tcoff + 4);
	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
	STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
}

void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
	LDRH(tempReg1, srcReg, dec_->tcoff);
	LDRH(tempReg2, srcReg, dec_->tcoff + 2);

	MOVP2R(scratchReg, &gstate_c.vertBounds.minU);

	auto updateSide = [&](ARMReg r, CCFlags cc, u32 off) {
		LDRH(tempReg3, scratchReg, off);
		CMP(r, tempReg3);
		SetCC(cc);
		STRH(r, scratchReg, off);
		SetCC(CC_AL);
	};

	// TODO: Can this actually be fast?  Hmm, floats aren't better.
	updateSide(tempReg1, CC_LT, offsetof(KnownVertexBounds, minU));
	updateSide(tempReg1, CC_GT, offsetof(KnownVertexBounds, maxU));
	updateSide(tempReg2, CC_LT, offsetof(KnownVertexBounds, minV));
	updateSide(tempReg2, CC_GT, offsetof(KnownVertexBounds, maxV));

	if (cpu_info.bNEON) {
		ADD(scratchReg, srcReg, dec_->tcoff);
		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
	} else {
		VMOV(fpScratchReg, tempReg1);
		VMOV(fpScratchReg2, tempReg2);
		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
	}
}

void VertexDecoderJitCache::Jit_TcFloatThrough() {
	LDR(tempReg1, srcReg, dec_->tcoff);
	LDR(tempReg2, srcReg, dec_->tcoff + 4);
	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
	STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
}

void VertexDecoderJitCache::Jit_TcU8Prescale() {
	if (cpu_info.bNEON) {
		// TODO: Needs testing
		ADD(scratchReg, srcReg, dec_->tcoff);
		VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
		VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 16-bit
		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
		VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
		VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
	} else {
		LDRB(tempReg1, srcReg, dec_->tcoff);
		LDRB(tempReg2, srcReg, dec_->tcoff + 1);
		VMOV(fpScratchReg, tempReg1);
		VMOV(fpScratchReg2, tempReg2);
		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
		// Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later.
		VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
		VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
		VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
		VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
	}
}

void VertexDecoderJitCache::Jit_TcU8ToFloat() {
	if (cpu_info.bNEON) {
		// TODO: Needs testing
		ADD(scratchReg, srcReg, dec_->tcoff);
		VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
		VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 16-bit
		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
		VMOV_neon(F_32, neonScratchReg2, by128);
		VMUL(F_32, neonScratchReg, neonScratchReg, neonScratchReg2);
		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
	} else {
		LDRB(tempReg1, srcReg, dec_->tcoff);
		LDRB(tempReg2, srcReg, dec_->tcoff + 1);
		VMOV(fpScratchReg, tempReg1);
		VMOV(fpScratchReg2, tempReg2);
		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
		MOVI2F(S15, by128, scratchReg);
		VMUL(fpScratchReg, fpScratchReg, S15);
		VMUL(fpScratchReg2, fpScratchReg2, S15);
		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
	}
}

void VertexDecoderJitCache::Jit_TcU16Prescale() {
	if (cpu_info.bNEON) {
		// TODO: Needs testing
		ADD(scratchReg, srcReg, dec_->tcoff);
		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
		VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
		VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
	} else {
		LDRH(tempReg1, srcReg, dec_->tcoff);
		LDRH(tempReg2, srcReg, dec_->tcoff + 2);
		VMOV(fpScratchReg, tempReg1);
		VMOV(fpScratchReg2, tempReg2);
		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
		VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
		VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
		VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
		VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
	}
}

void VertexDecoderJitCache::Jit_TcU16ToFloat() {
	if (cpu_info.bNEON) {
		// TODO: Needs testing
		ADD(scratchReg, srcReg, dec_->tcoff);
		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
		VMOV_neon(F_32, neonScratchReg2, by32768);
		VMUL(F_32, neonScratchReg, neonScratchReg, neonScratchReg2);
		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
	} else {
		LDRH(tempReg1, srcReg, dec_->tcoff);
		LDRH(tempReg2, srcReg, dec_->tcoff + 2);
		VMOV(fpScratchReg, tempReg1);
		VMOV(fpScratchReg2, tempReg2);
		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
		MOVI2F(S15, by32768, scratchReg);
		VMUL(fpScratchReg, fpScratchReg, S15);
		VMUL(fpScratchReg2, fpScratchReg2, S15);
		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
	}
}

void VertexDecoderJitCache::Jit_TcFloatPrescale() {
	if (cpu_info.bNEON) {
		ADD(scratchReg, srcReg, dec_->tcoff);
		VLD1(F_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
		VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
		VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
	} else {
		VLDR(fpScratchReg, srcReg, dec_->tcoff);
		VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4);
		VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
		VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
		VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
		VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
	}
}

void VertexDecoderJitCache::Jit_Color8888() {
	LDR(tempReg1, srcReg, dec_->coloff);
	// Set flags to determine if alpha != 0xFF.
	MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
	STR(tempReg1, dstReg, dec_->decFmt.c0off);
	SetCC(CC_NEQ);
	MOV(fullAlphaReg, 0);
	SetCC(CC_AL);
}

void VertexDecoderJitCache::Jit_Color4444() {
	LDRH(tempReg1, srcReg, dec_->coloff);

	// Spread out the components.
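	// e.g. 0x4321 spreads to 0x04030201; the final <<4 OR below then doubles
	// each nibble, giving 0x44332211.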
	ANDI2R(tempReg2, tempReg1, 0x000F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0x00F0, scratchReg);
	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 4));
	ANDI2R(tempReg3, tempReg1, 0x0F00, scratchReg);
	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));
	ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg);
	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12));

	// And expand to 8 bits.
	ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4));

	STR(tempReg1, dstReg, dec_->decFmt.c0off);

	// Set flags to determine if alpha != 0xFF.
	MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
	SetCC(CC_NEQ);
	MOV(fullAlphaReg, 0);
	SetCC(CC_AL);
}

void VertexDecoderJitCache::Jit_Color565() {
	LDRH(tempReg1, srcReg, dec_->coloff);

	// Spread out R and B first.  This puts them in 0x001F001F.
	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg);
	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 5));

	// Expand 5 -> 8.
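	// (x << 3) | (x >> 2) maps 0..31 exactly onto 0..255, applied to both
	// packed channels at once; the 6-bit G below uses (x << 2) | (x >> 4).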
	LSL(tempReg3, tempReg2, 3);
	ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSR, 2));
	ANDI2R(tempReg2, tempReg2, 0xFFFF00FF, scratchReg);

	// Now finally G.  We start by shoving it into a wall.
	LSR(tempReg1, tempReg1, 5);
	ANDI2R(tempReg1, tempReg1, 0x003F, scratchReg);
	LSL(tempReg3, tempReg1, 2);
	// Don't worry, shifts into a wall.
	ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4));
	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));

	// Add in full alpha.  No need to update fullAlphaReg.
	ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);

	STR(tempReg1, dstReg, dec_->decFmt.c0off);
}

void VertexDecoderJitCache::Jit_Color5551() {
	LDRSH(tempReg1, srcReg, dec_->coloff);

	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0x03E0, scratchReg);
	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 3));
	ANDI2R(tempReg3, tempReg1, 0x7C00, scratchReg);
	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 6));

	// Expand 5 -> 8.
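	// All three channels at once: (x << 3) | (x >> 2), with the BICs clearing
	// the bits that bled down from the channel above.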
	LSR(tempReg3, tempReg2, 2);
	// Clean up the bits that were shifted right.
	BIC(tempReg3, tempReg3, AssumeMakeOperand2(0x000000F8));
	BIC(tempReg3, tempReg3, AssumeMakeOperand2(0x0000F800));
	ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSL, 3));

	// Now we just need alpha.  Since we loaded as signed, it'll be extended.
	ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
	ORR(tempReg2, tempReg2, tempReg1);

	// Set flags to determine if alpha != 0xFF.
	MVNS(tempReg3, Operand2(tempReg1, ST_ASR, 24));
	STR(tempReg2, dstReg, dec_->decFmt.c0off);
	SetCC(CC_NEQ);
	MOV(fullAlphaReg, 0);
	SetCC(CC_AL);
}

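// The morph variants below blend dec_->morphcount copies of the attribute,
// weighting each by gstate_c.morphWeights[n] and accumulating in Q2 (NEON)
// or S8-S11 (VFP) before the blended result is written out.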
void VertexDecoderJitCache::Jit_Color8888Morph() {
	const bool useNEON = NEONMorphing;
	ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);

	bool first = true;
	for (int n = 0; n < dec_->morphcount; ++n) {
		if (useNEON) {
			VLD1_lane(I_32, neonScratchReg, tempReg1, 0, true);
			VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);

			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);

			if (first) {
				first = false;
				VMUL(F_32, Q2, neonScratchRegQ, Q3);
			} else if (cpu_info.bVFPv4) {
				VFMA(F_32, Q2, neonScratchRegQ, Q3);
			} else {
				VMLA(F_32, Q2, neonScratchRegQ, Q3);
			}
		} else {
			LDRB(scratchReg, tempReg1, 0);
			LDRB(scratchReg2, tempReg1, 1);
			LDRB(scratchReg3, tempReg1, 2);
			LDRB(tempReg3, tempReg1, 3);
			VMOV(fpScratchReg, scratchReg);
			VMOV(fpScratchReg2, scratchReg2);
			VMOV(fpScratchReg3, scratchReg3);
			VMOV(fpScratchReg4, tempReg3);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
			VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
			VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
			VCVT(fpScratchReg4, fpScratchReg4, TO_FLOAT);

			VLDR(S12, tempReg2, sizeof(float) * n);

			if (first) {
				first = false;
				VMUL(S8, fpScratchReg, S12);
				VMUL(S9, fpScratchReg2, S12);
				VMUL(S10, fpScratchReg3, S12);
				VMUL(S11, fpScratchReg4, S12);
			} else {
				VMLA(S8, fpScratchReg, S12);
				VMLA(S9, fpScratchReg2, S12);
				VMLA(S10, fpScratchReg3, S12);
				VMLA(S11, fpScratchReg4, S12);
			}
		}
	}

	Jit_WriteMorphColor(dec_->decFmt.c0off);
}

// First is the left shift, second is the right shift (against walls, to get the RGBA values.)
alignas(16) static const s16 color4444Shift[2][4] = {{12, 8, 4, 0}, {-12, -12, -12, -12}};
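
// NEON VSHL takes a per-lane signed shift count, and a negative count shifts
// right, so one table supplies both the shift up to the wall and the shift
// back down.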

void VertexDecoderJitCache::Jit_Color4444Morph() {
	const bool useNEON = NEONMorphing;
	ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);

	if (useNEON) {
		MOVP2R(scratchReg, color4444Shift);
		MOVI2FR(scratchReg2, 255.0f / 15.0f);
		VDUP(I_32, Q5, scratchReg2);
		VLD1(I_16, D8, scratchReg, 2, ALIGN_128);
	} else {
		MOVI2F(S13, 255.0f / 15.0f, scratchReg);
	}

	bool first = true;
	for (int n = 0; n < dec_->morphcount; ++n) {
		if (useNEON) {
			VLD1_all_lanes(I_16, neonScratchReg, tempReg1, true);
			VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);

			// Shift against walls and then back to get R, G, B, A in each 16-bit lane.
			VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D8);
			VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D9);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);

			VMUL(F_32, Q3, Q3, Q5);

			if (first) {
				first = false;
				VMUL(F_32, Q2, neonScratchRegQ, Q3);
			} else if (cpu_info.bVFPv4) {
				VFMA(F_32, Q2, neonScratchRegQ, Q3);
			} else {
				VMLA(F_32, Q2, neonScratchRegQ, Q3);
			}
		} else {
			LDRB(scratchReg, tempReg1, 0);
			ANDI2R(scratchReg2, scratchReg, 0x000F, scratchReg3);
			VMOV(fpScratchReg, scratchReg2);

			MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 4));
			VMOV(fpScratchReg2, scratchReg2);

			LDRB(scratchReg, tempReg1, 1);
			ANDI2R(scratchReg2, scratchReg, 0x000F, scratchReg3);
			VMOV(fpScratchReg3, scratchReg2);

			MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 4));
			VMOV(fpScratchReg4, scratchReg2);

			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
			VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
			VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
			VCVT(fpScratchReg4, fpScratchReg4, TO_FLOAT);

			VLDR(S12, tempReg2, sizeof(float) * n);
			VMUL(S12, S12, S13);

			if (first) {
				first = false;
				VMUL(S8, fpScratchReg, S12);
				VMUL(S9, fpScratchReg2, S12);
				VMUL(S10, fpScratchReg3, S12);
				VMUL(S11, fpScratchReg4, S12);
			} else {
				VMLA(S8, fpScratchReg, S12);
				VMLA(S9, fpScratchReg2, S12);
				VMLA(S10, fpScratchReg3, S12);
				VMLA(S11, fpScratchReg4, S12);
			}
		}
	}

	Jit_WriteMorphColor(dec_->decFmt.c0off);
}

// First is the left shift, second is the right shift (against walls, to get the RGBA values.)
alignas(16) static const s16 color565Shift[2][4] = {{11, 5, 0, 0}, {-11, -10, -11, 0}};
alignas(16) static const float byColor565[4] = {255.0f / 31.0f, 255.0f / 63.0f, 255.0f / 31.0f, 0.0f};

void VertexDecoderJitCache::Jit_Color565Morph() {
	const bool useNEON = NEONMorphing;
	ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
	MOVI2FR(tempReg3, 255.0f);

	if (useNEON) {
		MOVP2R(scratchReg, color565Shift);
		MOVP2R(scratchReg2, byColor565);
		VLD1(I_16, D8, scratchReg, 2, ALIGN_128);
		VLD1(F_32, D10, scratchReg2, 2, ALIGN_128);
	} else {
		MOVI2F(S14, 255.0f / 31.0f, scratchReg);
		MOVI2F(S15, 255.0f / 63.0f, scratchReg);
	}

	bool first = true;
	for (int n = 0; n < dec_->morphcount; ++n) {
		if (useNEON) {
			VLD1_all_lanes(I_16, neonScratchReg, tempReg1, true);
			VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);

			VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D8);
			VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D9);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);

			VMUL(F_32, Q3, Q3, Q5);

			if (first) {
				first = false;
				VMUL(F_32, Q2, neonScratchRegQ, Q3);
			} else if (cpu_info.bVFPv4) {
				VFMA(F_32, Q2, neonScratchRegQ, Q3);
			} else {
				VMLA(F_32, Q2, neonScratchRegQ, Q3);
			}
		} else {
			LDRH(scratchReg, tempReg1, 0);
			ANDI2R(scratchReg2, scratchReg, 0x001F, scratchReg3);
			VMOV(fpScratchReg, scratchReg2);

			MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 5));
			ANDI2R(scratchReg2, scratchReg2, 0x003F, scratchReg3);
			VMOV(fpScratchReg2, scratchReg2);

			MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 11));
			VMOV(fpScratchReg3, scratchReg2);

			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
			VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
			VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);

			VLDR(S12, tempReg2, sizeof(float) * n);
			VMUL(S13, S12, S15);
			VMUL(S12, S12, S14);

			if (first) {
				first = false;
				VMUL(S8, fpScratchReg, S12);
				VMUL(S9, fpScratchReg2, S13);
				VMUL(S10, fpScratchReg3, S12);
			} else {
				VMLA(S8, fpScratchReg, S12);
				VMLA(S9, fpScratchReg2, S13);
				VMLA(S10, fpScratchReg3, S12);
			}
		}
	}

	// Overwrite A with 255.0f.
	if (useNEON) {
		VMOV_neon(F_32, D5, tempReg3, 1);
	} else {
		VMOV(S11, tempReg3);
	}
	Jit_WriteMorphColor(dec_->decFmt.c0off, false);
}

// First is the left shift, second is the right shift (against walls, to get the RGBA values.)
alignas(16) static const s16 color5551Shift[2][4] = {{11, 6, 1, 0}, {-11, -11, -11, -15}};
alignas(16) static const float byColor5551[4] = {255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 1.0f};

void VertexDecoderJitCache::Jit_Color5551Morph() {
	const bool useNEON = NEONMorphing;
	ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);

	if (useNEON) {
		MOVP2R(scratchReg, color5551Shift);
		MOVP2R(scratchReg2, byColor5551);
		VLD1(I_16, D8, scratchReg, 2, ALIGN_128);
		VLD1(F_32, D10, scratchReg2, 2, ALIGN_128);
	} else {
		MOVI2F(S14, 255.0f / 31.0f, scratchReg);
		MOVI2F(S15, 255.0f, scratchReg);
	}

	bool first = true;
	for (int n = 0; n < dec_->morphcount; ++n) {
		if (useNEON) {
			VLD1_all_lanes(I_16, neonScratchReg, tempReg1, true);
			VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);

			VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D8);
			VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D9);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);

			VMUL(F_32, Q3, Q3, Q5);

			if (first) {
				first = false;
				VMUL(F_32, Q2, neonScratchRegQ, Q3);
			} else if (cpu_info.bVFPv4) {
				VFMA(F_32, Q2, neonScratchRegQ, Q3);
			} else {
				VMLA(F_32, Q2, neonScratchRegQ, Q3);
			}
		} else {
			LDRH(scratchReg, tempReg1, 0);
			ANDI2R(scratchReg2, scratchReg, 0x001F, scratchReg3);
			VMOV(fpScratchReg, scratchReg2);

			MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 5));
			ANDI2R(scratchReg2, scratchReg2, 0x001F, scratchReg3);
			VMOV(fpScratchReg2, scratchReg2);

			MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 10));
			ANDI2R(scratchReg2, scratchReg2, 0x001F, scratchReg3);
			VMOV(fpScratchReg3, scratchReg2);

			MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 15));
			VMOV(fpScratchReg4, scratchReg2);

			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
			VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
			VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
			VCVT(fpScratchReg4, fpScratchReg4, TO_FLOAT);

			VLDR(S12, tempReg2, sizeof(float) * n);
			VMUL(S13, S12, S15);
			VMUL(S12, S12, S14);

			if (first) {
				first = false;
				VMUL(S8, fpScratchReg, S12);
				VMUL(S9, fpScratchReg2, S12);
				VMUL(S10, fpScratchReg3, S12);
				VMUL(S11, fpScratchReg4, S13);
			} else {
				VMLA(S8, fpScratchReg, S12);
				VMLA(S9, fpScratchReg2, S12);
				VMLA(S10, fpScratchReg3, S12);
				VMLA(S11, fpScratchReg4, S13);
			}
		}
	}

	Jit_WriteMorphColor(dec_->decFmt.c0off);
}

// Expects RGBA color in S8 - S11, which is Q2.
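// In the NEON path the floats are converted to u32 and then saturating-
// narrowed twice (32 -> 16 -> 8 bits, in place through D4) to pack the bytes.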
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
	if (NEONMorphing) {
		ADDI2R(tempReg1, dstReg, outOff, scratchReg);
		VCVT(I_32 | I_UNSIGNED, Q2, Q2);
		VQMOVN(I_32 | I_UNSIGNED, D4, Q2);
		VQMOVN(I_16 | I_UNSIGNED, D4, Q2);
		VST1_lane(I_32, D4, tempReg1, 0, true);
		if (checkAlpha) {
			VMOV_neon(I_32, scratchReg, D4, 0);
		}
	} else {
		VCVT(S8, S8, TO_INT | ROUND_TO_ZERO);
		VCVT(S9, S9, TO_INT | ROUND_TO_ZERO);
		VCVT(S10, S10, TO_INT | ROUND_TO_ZERO);
		VCVT(S11, S11, TO_INT | ROUND_TO_ZERO);
		VMOV(scratchReg, S8);
		VMOV(scratchReg2, S9);
		VMOV(scratchReg3, S10);
		VMOV(tempReg3, S11);
		ORR(scratchReg, scratchReg, Operand2(scratchReg2, ST_LSL, 8));
		ORR(scratchReg, scratchReg, Operand2(scratchReg3, ST_LSL, 16));
		ORR(scratchReg, scratchReg, Operand2(tempReg3, ST_LSL, 24));
		STR(scratchReg, dstReg, outOff);
	}

	// Set flags to determine if alpha != 0xFF.
	if (checkAlpha) {
		MVNS(tempReg2, Operand2(scratchReg, ST_ASR, 24));
		SetCC(CC_NEQ);
		MOV(fullAlphaReg, 0);
		SetCC(CC_AL);
	}
}

void VertexDecoderJitCache::Jit_NormalS8() {
	LDRB(tempReg1, srcReg, dec_->nrmoff);
	LDRB(tempReg2, srcReg, dec_->nrmoff + 1);
	LDRB(tempReg3, srcReg, dec_->nrmoff + 2);
	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
	ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);

	// Copy 3 bytes and then a zero. Might as well copy four.
	// LDR(tempReg1, srcReg, dec_->nrmoff);
	// ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
	// STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
}

// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
	LDRH(tempReg1, srcReg, dec_->nrmoff);
	LDRH(tempReg2, srcReg, dec_->nrmoff + 2);
	LDRH(tempReg3, srcReg, dec_->nrmoff + 4);
	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
	STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 4);
}

void VertexDecoderJitCache::Jit_NormalFloat() {
	ADD(scratchReg, srcReg, dec_->nrmoff);
	LDMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
	ADD(scratchReg, dstReg, dec_->decFmt.nrmoff);
	STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
}

// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
	DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
	_dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
	_dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");

	// TODO: SIMD
	LDRSB(tempReg1, srcReg, dec_->posoff);
	LDRSB(tempReg2, srcReg, dec_->posoff + 1);
	LDRSB(tempReg3, srcReg, dec_->posoff + 2);  // signed?
	static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
	static const ARMReg fr[3] = { fpScratchReg, fpScratchReg2, fpScratchReg3 };
	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
	if (cpu_info.bNEON) {
		VMOV(neonScratchReg, tempReg1, tempReg2);
		VMOV(neonScratchReg2, tempReg3, tempReg3);
		VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
		VST1(F_32, neonScratchReg, scratchReg, 2, ALIGN_NONE);
	} else {
		for (int i = 0; i < 3; i++) {
			VMOV(fr[i], tr[i]);
			VCVT(fr[i], fr[i], TO_FLOAT | IS_SIGNED);
		}
		VSTMIA(scratchReg, false, fr[0], 3);
	}
}

// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
	_dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
	_dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");

	LDRSH(tempReg1, srcReg, dec_->posoff);
	LDRSH(tempReg2, srcReg, dec_->posoff + 2);
	LDRH(tempReg3, srcReg, dec_->posoff + 4);
	static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
	static const ARMReg fr[3] = { fpScratchReg, fpScratchReg2, fpScratchReg3 };
	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
	if (cpu_info.bNEON) {
		VMOV(neonScratchReg, tempReg1, tempReg2);
		VMOV(neonScratchReg2, tempReg3, tempReg3);
		VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
		VST1(F_32, neonScratchReg, scratchReg, 2, ALIGN_NONE);
	} else {
		for (int i = 0; i < 3; i++) {
			VMOV(fr[i], tr[i]);
			VCVT(fr[i], fr[i], TO_FLOAT | IS_SIGNED);
		}
		VSTMIA(scratchReg, false, fr[0], 3);
	}
}

void VertexDecoderJitCache::Jit_PosS8() {
	Jit_AnyS8ToFloat(dec_->posoff);

	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
	if (NEONSkinning) {
		VST1(F_32, srcNEON, scratchReg, 2);
	} else {
		VSTMIA(scratchReg, false, src[0], 3);
	}
}

void VertexDecoderJitCache::Jit_PosS16() {
	Jit_AnyS16ToFloat(dec_->posoff);

	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
	if (NEONSkinning) {
		VST1(F_32, srcNEON, scratchReg, 2);
	} else {
		VSTMIA(scratchReg, false, src[0], 3);
	}
}

// Just copy 12 bytes.
void VertexDecoderJitCache::Jit_PosFloat() {
	ADD(scratchReg, srcReg, dec_->posoff);
	LDMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
	STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
}

void VertexDecoderJitCache::Jit_NormalS8Skin() {
	Jit_AnyS8ToFloat(dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_NormalS16Skin() {
	Jit_AnyS16ToFloat(dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_NormalFloatSkin() {
	for (int i = 1; i < 3; ++i) {
		_dbg_assert_msg_(src[i - 1] + 1 == src[i], "VertexDecoder src regs must be in order.");
	}

	ADD(tempReg1, srcReg, dec_->nrmoff);
	if (NEONSkinning) {
		VLD1(F_32, srcNEON, tempReg1, 2, ALIGN_NONE);
	} else {
		VLDMIA(tempReg1, false, src[0], 3);
	}
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

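// Computes acc = M * src for the 3-vector in src/srcNEON against the 4x4
// matrix in Q4-Q7 (NEON) or skinMatrix (VFP); when pos is set, the fourth
// row is added in as the translation.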
Jit_WriteMatrixMul(int outOff,bool pos)1315 void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
1316 	if (NEONSkinning) {
1317 		// Multiply with the matrix sitting in Q4-Q7.
1318 		ADD(scratchReg, dstReg, outOff);
1319 		VMUL_scalar(F_32, accNEON, Q4, QScalar(srcNEON, 0));
1320 		VMLA_scalar(F_32, accNEON, Q5, QScalar(srcNEON, 1));
1321 		VMLA_scalar(F_32, accNEON, Q6, QScalar(srcNEON, 2));
1322 		if (pos) {
1323 			VADD(F_32, accNEON, accNEON, Q7);
1324 		}
1325 		VST1(F_32, accNEON, scratchReg, 2);
1326 	} else {
1327 		_dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
1328 		_dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
1329 
1330 		MOVP2R(tempReg1, skinMatrix);
1331 		VLDMIA(tempReg1, true, fpScratchReg, 3);
1332 		for (int i = 0; i < 3; i++) {
1333 			VMUL(acc[i], ARMReg(fpScratchReg + i), src[0]);
1334 		}
1335 		VLDMIA(tempReg1, true, fpScratchReg, 3);
1336 		for (int i = 0; i < 3; i++) {
1337 			VMLA(acc[i], ARMReg(fpScratchReg + i), src[1]);
1338 		}
1339 		VLDMIA(tempReg1, true, fpScratchReg, 3);
1340 		for (int i = 0; i < 3; i++) {
1341 			VMLA(acc[i], ARMReg(fpScratchReg + i), src[2]);
1342 		}
1343 		if (pos) {
1344 			VLDMIA(tempReg1, true, fpScratchReg, 3);
1345 			for (int i = 0; i < 3; i++) {
1346 				VADD(acc[i], acc[i], ARMReg(fpScratchReg + i));
1347 			}
1348 		}
1349 		ADD(tempReg1, dstReg, outOff);
1350 		VSTMIA(tempReg1, false, acc[0], 3);
1351 	}
1352 }
1353 
Jit_PosS8Skin()1354 void VertexDecoderJitCache::Jit_PosS8Skin() {
1355 	Jit_AnyS8ToFloat(dec_->posoff);
1356 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1357 }
1358 
Jit_PosS16Skin()1359 void VertexDecoderJitCache::Jit_PosS16Skin() {
1360 	Jit_AnyS16ToFloat(dec_->posoff);
1361 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1362 }
1363 
Jit_PosFloatSkin()1364 void VertexDecoderJitCache::Jit_PosFloatSkin() {
1365 	for (int i = 1; i < 3; ++i) {
1366 		_dbg_assert_msg_(src[i - 1] + 1 == src[i], "VertexDecoder src regs must be in order.");
1367 	}
1368 
1369 	ADD(tempReg1, srcReg, dec_->posoff);
1370 	if (NEONSkinning) {
1371 		VLD1(F_32, srcNEON, tempReg1, 2, ALIGN_NONE);
1372 	} else {
1373 		VLDMIA(tempReg1, false, src[0], 3);
1374 	}
1375 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1376 }
1377 
Jit_AnyS8ToFloat(int srcoff)1378 void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
1379 	if (NEONSkinning) {
1380 		ADD(scratchReg, srcReg, srcoff);
1381 		VMOV_neon(F_32, Q3, by128);
1382 		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
1383 		VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 16-bit
1384 		VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
1385 		VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
1386 		VMUL(F_32, srcNEON, neonScratchReg, Q3);
1387 	} else {
1388 		LDRSB(tempReg1, srcReg, srcoff);
1389 		LDRSB(tempReg2, srcReg, srcoff + 1);
1390 		LDRSB(tempReg3, srcReg, srcoff + 2);
1391 		VMOV(src[0], tempReg1);
1392 		VMOV(src[1], tempReg2);
1393 		VMOV(src[2], tempReg3);
1394 		MOVI2F(S15, by128, scratchReg);
1395 		VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
1396 		VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
1397 		VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
1398 		VMUL(src[0], src[0], S15);
1399 		VMUL(src[1], src[1], S15);
1400 		VMUL(src[2], src[2], S15);
1401 	}
1402 }
1403 
Jit_AnyS16ToFloat(int srcoff)1404 void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
1405 	if (NEONSkinning) {
1406 		ADD(scratchReg, srcReg, srcoff);
1407 		VMOV_neon(F_32, Q3, by32768);
1408 		VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
1409 		VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
1410 		VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
1411 		VMUL(F_32, srcNEON, neonScratchReg, Q3);
1412 	} else {
1413 		LDRSH(tempReg1, srcReg, srcoff);
1414 		LDRSH(tempReg2, srcReg, srcoff + 2);
1415 		LDRSH(tempReg3, srcReg, srcoff + 4);
1416 		VMOV(src[0], tempReg1);
1417 		VMOV(src[1], tempReg2);
1418 		VMOV(src[2], tempReg3);
1419 		MOVI2F(S15, by32768, scratchReg);
1420 		VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
1421 		VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
1422 		VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
1423 		VMUL(src[0], src[0], S15);
1424 		VMUL(src[1], src[1], S15);
1425 		VMUL(src[2], src[2], S15);
1426 	}
1427 }
1428 
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
	const bool useNEON = NEONMorphing;
	ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);

	if (useNEON) {
		MOVI2FR(scratchReg2, by128);
		VDUP(I_32, Q5, scratchReg2);
	} else {
		MOVI2F(S13, by128, scratchReg);
	}

	bool first = true;
	for (int n = 0; n < dec_->morphcount; ++n) {
		if (useNEON) {
			VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false);
			VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);

			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg);
			VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);

			VMUL(F_32, Q3, Q3, Q5);

			if (first) {
				first = false;
				VMUL(F_32, Q2, neonScratchRegQ, Q3);
			} else if (cpu_info.bVFPv4) {
				VFMA(F_32, Q2, neonScratchRegQ, Q3);
			} else {
				VMLA(F_32, Q2, neonScratchRegQ, Q3);
			}
		} else {
			LDRSB(scratchReg, tempReg1, 0);
			LDRSB(scratchReg2, tempReg1, 1);
			LDRSB(scratchReg3, tempReg1, 2);
			VMOV(fpScratchReg, scratchReg);
			VMOV(fpScratchReg2, scratchReg2);
			VMOV(fpScratchReg3, scratchReg3);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
			VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
			VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);

			VLDR(S12, tempReg2, sizeof(float) * n);
			VMUL(S12, S12, S13);

			if (first) {
				first = false;
				VMUL(S8, fpScratchReg, S12);
				VMUL(S9, fpScratchReg2, S12);
				VMUL(S10, fpScratchReg3, S12);
			} else {
				VMLA(S8, fpScratchReg, S12);
				VMLA(S9, fpScratchReg2, S12);
				VMLA(S10, fpScratchReg3, S12);
			}
		}
	}

	ADDI2R(tempReg1, dstReg, dstoff, scratchReg);
	if (useNEON) {
		// TODO: Is it okay that we're overwriting by 4 bytes?  (D4-D5 is 16 bytes; only 12 are needed.)  Probably...
		VSTMIA(tempReg1, false, D4, 2);
	} else {
		VSTMIA(tempReg1, false, S8, 3);
	}
}

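// Same accumulation as Jit_AnyS8Morph, but reading S16 triples and scaling
// the weights by 1/32768 instead of 1/128.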
void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
	const bool useNEON = NEONMorphing;
	ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);

	if (useNEON) {
		MOVI2FR(scratchReg, by32768);
		VDUP(I_32, Q5, scratchReg);
	} else {
		MOVI2F(S13, by32768, scratchReg);
	}

	bool first = true;
	for (int n = 0; n < dec_->morphcount; ++n) {
		if (useNEON) {
			VLD1(I_32, neonScratchReg, tempReg1, 1, ALIGN_NONE);
			VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);

			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);
			VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);

			VMUL(F_32, Q3, Q3, Q5);

			if (first) {
				first = false;
				VMUL(F_32, Q2, neonScratchRegQ, Q3);
			} else if (cpu_info.bVFPv4) {
				VFMA(F_32, Q2, neonScratchRegQ, Q3);
			} else {
				VMLA(F_32, Q2, neonScratchRegQ, Q3);
			}
		} else {
			LDRSH(scratchReg, tempReg1, 0);
			LDRSH(scratchReg2, tempReg1, 2);
			LDRSH(scratchReg3, tempReg1, 4);
			VMOV(fpScratchReg, scratchReg);
			VMOV(fpScratchReg2, scratchReg2);
			VMOV(fpScratchReg3, scratchReg3);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
			VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
			VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
			VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);

			VLDR(S12, tempReg2, sizeof(float) * n);
			VMUL(S12, S12, S13);

			if (first) {
				first = false;
				VMUL(S8, fpScratchReg, S12);
				VMUL(S9, fpScratchReg2, S12);
				VMUL(S10, fpScratchReg3, S12);
			} else {
				VMLA(S8, fpScratchReg, S12);
				VMLA(S9, fpScratchReg2, S12);
				VMLA(S10, fpScratchReg3, S12);
			}
		}
	}

	ADDI2R(tempReg1, dstReg, dstoff, scratchReg);
	if (useNEON) {
		// TODO: Is it okay that we're overwriting by 4 bytes?  (D4-D5 is 16 bytes; only 12 are needed.)  Probably...
		VSTMIA(tempReg1, false, D4, 2);
	} else {
		VSTMIA(tempReg1, false, S8, 3);
	}
}

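// Float version of the morph accumulation above. No scale factor is needed,
// so the morph weights are used as-is.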
void VertexDecoderJitCache::Jit_AnyFloatMorph(int srcoff, int dstoff) {
	const bool useNEON = NEONMorphing;
	ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);

	bool first = true;
	for (int n = 0; n < dec_->morphcount; ++n) {
		if (useNEON) {
			// Load an extra float to stay in NEON mode.
			VLD1(F_32, neonScratchRegQ, tempReg1, 2, ALIGN_NONE);
			VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);

			if (first) {
				first = false;
				VMUL(F_32, Q2, neonScratchRegQ, Q3);
			} else if (cpu_info.bVFPv4) {
				VFMA(F_32, Q2, neonScratchRegQ, Q3);
			} else {
				VMLA(F_32, Q2, neonScratchRegQ, Q3);
			}
		} else {
			// VFP path: load the three source floats directly.
			VLDMIA(tempReg1, false, fpScratchReg, 3);
			// Using VLDMIA to get writeback.
			VLDMIA(tempReg2, true, S12, 1);
			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);

			if (first) {
				first = false;
				VMUL(S8, fpScratchReg, S12);
				VMUL(S9, fpScratchReg2, S12);
				VMUL(S10, fpScratchReg3, S12);
			} else {
				VMLA(S8, fpScratchReg, S12);
				VMLA(S9, fpScratchReg2, S12);
				VMLA(S10, fpScratchReg3, S12);
			}
		}
	}

	ADDI2R(tempReg1, dstReg, dstoff, scratchReg);
	if (useNEON) {
		// TODO: Is it okay that we're overwriting by 4 bytes?  (D4-D5 is 16 bytes; only 12 are needed.)  Probably...
		VSTMIA(tempReg1, false, D4, 2);
	} else {
		VSTMIA(tempReg1, false, S8, 3);
	}
}

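// Thin wrappers that bind the generic morph routines above to the position
// and normal attributes' source and destination offsets.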
void VertexDecoderJitCache::Jit_PosS8Morph() {
	Jit_AnyS8Morph(dec_->posoff, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_PosS16Morph() {
	Jit_AnyS16Morph(dec_->posoff, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_PosFloatMorph() {
	Jit_AnyFloatMorph(dec_->posoff, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_NormalS8Morph() {
	Jit_AnyS8Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
}

void VertexDecoderJitCache::Jit_NormalS16Morph() {
	Jit_AnyS16Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
}

void VertexDecoderJitCache::Jit_NormalFloatMorph() {
	Jit_AnyFloatMorph(dec_->nrmoff, dec_->decFmt.nrmoff);
}

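// Matches one interpreter step against the jitLookup table and emits its JIT
// equivalent. Returning false means no JIT version exists, so the caller can
// fall back to the interpreter for this vertex format.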
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
	// See if we find a matching JIT function.
	for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
		if (dec.steps_[step] == jitLookup[i].func) {
			((*this).*jitLookup[i].jitFunc)();
			return true;
		}
	}
	return false;
}

#endif // PPSSPP_ARCH(ARM)