// Copyright (c) 2013- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(ARM64)

#include "Common/CPUDetect.h"
#include "Common/Log.h"
#include "Core/Config.h"
#include "Core/Reporting.h"
#include "Common/Arm64Emitter.h"
#include "Core/MIPS/JitCommon/JitCommon.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"

alignas(16) static float bones[16 * 8];  // First four are kept in registers
alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f};

static const float by128 = 1.0f / 128.0f;
static const float by32768 = 1.0f / 32768.0f;

using namespace Arm64Gen;

// Pointers, X regs (X0 - X17 safe to use.)
static const ARM64Reg srcReg = X0;
static const ARM64Reg dstReg = X1;

static const ARM64Reg counterReg = W2;
static const ARM64Reg tempReg1 = W3;
static const ARM64Reg tempRegPtr = X3;
static const ARM64Reg tempReg2 = W4;
static const ARM64Reg tempReg3 = W5;
static const ARM64Reg scratchReg = W6;
static const ARM64Reg scratchReg64 = X6;
static const ARM64Reg scratchReg2 = W7;
static const ARM64Reg scratchReg3 = W8;
static const ARM64Reg fullAlphaReg = W12;
static const ARM64Reg boundsMinUReg = W13;
static const ARM64Reg boundsMinVReg = W14;
static const ARM64Reg boundsMaxUReg = W15;
static const ARM64Reg boundsMaxVReg = W16;

static const ARM64Reg fpScratchReg = S4;
static const ARM64Reg fpScratchReg2 = S5;
static const ARM64Reg fpScratchReg3 = S6;
static const ARM64Reg fpScratchReg4 = S7;

static const ARM64Reg neonScratchRegD = D2;
static const ARM64Reg neonScratchRegQ = Q2;

static const ARM64Reg neonUVScaleReg = D0;
static const ARM64Reg neonUVOffsetReg = D1;

static const ARM64Reg src[3] = {S2, S3, S8};
static const ARM64Reg srcD[3] = {D2, D3, D8};
static const ARM64Reg srcQ[3] = {Q2, Q3, Q8};

static const ARM64Reg srcNEON = Q8;
static const ARM64Reg accNEON = Q9;

static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 };  // reverse order to prevent clash with neonScratchReg in Jit_WeightsU*Skin.

// Q4-Q7 is the generated matrix that we multiply things by.
// Q8,Q9 are accumulators/scratch for matrix mul.
// Q10, Q11 are more scratch for matrix mul.
// Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},

	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},

	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},

	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},

	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},

	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},

	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},

	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},

	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},

	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},

	/*
	{&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
	{&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
	{&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},

	{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
	{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
	{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},

	{&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
	{&VertexDecoder::Step_Color4444Morph, &VertexDecoderJitCache::Jit_Color4444Morph},
	{&VertexDecoder::Step_Color565Morph, &VertexDecoderJitCache::Jit_Color565Morph},
	{&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
	*/
};

JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
	dec_ = &dec;

	BeginWrite();
	const u8 *start = AlignCode16();

	bool prescaleStep = false;
	bool skinning = false;

	bool log = false;

	// Look for prescaled texcoord steps
	for (int i = 0; i < dec.numSteps_; i++) {
		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
			prescaleStep = true;
		}
		if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
			dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
			dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
			skinning = true;
		}
	}

	// Not used below, but useful for logging.
	(void)skinning;

	// if (skinning) log = true;

	uint64_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
	uint64_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);

	// Keep the scale/offset in a few fp registers if we need it.
	if (prescaleStep) {
		MOVP2R(X3, &gstate_c.uv);
		fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
		fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
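		// Fold the U8/U16 -> float normalization (1/128 or 1/32768) into the UV scale up front,
		// so the per-vertex prescale steps below don't need an extra multiply.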
		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
			fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg);
			fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
			fp.MOVI2FDUP(neonScratchRegD, by32768, scratchReg);
			fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
		}
	}

	// Add code to convert matrices to 4x4.
	// Later we might want to do this when the matrices are loaded instead.
	if (dec.weighttype && g_Config.bSoftwareSkinning) {
		// X3 points at gstate.boneMatrix (source), X4 at the bones[] buffer (destination).
		MOVP2R(X3, gstate.boneMatrix);
		MOVP2R(X4, bones);
		MOVP2R(X5, boneMask);
		fp.LDR(128, INDEX_UNSIGNED, Q3, X5, 0);
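		// boneMask zeroes lane 3 of every row loaded below, clearing the garbage picked up
		// by the overlapping 128-bit loads of the 4x3 bone matrices.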
		for (int i = 0; i < dec.nweights; i++) {
			// Note that INDEX_UNSIGNED does not support offsets not aligned to the data size so we must use POST.
			fp.LDR(128, INDEX_POST, Q4, X3, 12);  // Load 128 bits even though we just want 96
			fp.LDR(128, INDEX_POST, Q5, X3, 12);
			fp.LDR(128, INDEX_POST, Q6, X3, 12);
			fp.LDR(128, INDEX_POST, Q7, X3, 12);
			// First four matrices are in registers Q16+.
			if (i < 4) {
				fp.FMUL(32, (ARM64Reg)(Q16 + i * 4), Q4, Q3);
				fp.FMUL(32, (ARM64Reg)(Q17 + i * 4), Q5, Q3);
				fp.FMUL(32, (ARM64Reg)(Q18 + i * 4), Q6, Q3);
				fp.FMUL(32, (ARM64Reg)(Q19 + i * 4), Q7, Q3);
				ADDI2R(X4, X4, 16 * 4);
			} else {
				fp.FMUL(32, Q4, Q4, Q3);
				fp.FMUL(32, Q5, Q5, Q3);
				fp.FMUL(32, Q6, Q6, Q3);
				fp.FMUL(32, Q7, Q7, Q3);
				fp.STR(128, INDEX_UNSIGNED, Q4, X4, 0);
				fp.STR(128, INDEX_UNSIGNED, Q5, X4, 16);
				fp.STR(128, INDEX_UNSIGNED, Q6, X4, 32);
				fp.STR(128, INDEX_UNSIGNED, Q7, X4, 48);
				ADDI2R(X4, X4, 16 * 4);
			}
		}
	}

	if (dec.col) {
		// Or LDB and skip the conditional?  This is probably cheaper.
		MOVI2R(fullAlphaReg, 0xFF);
	}

	if (dec.tc && dec.throughmode) {
		// TODO: Smarter, only when doing bounds.
		MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
		LDRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
		LDRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
		LDRH(INDEX_UNSIGNED, boundsMinVReg, scratchReg64, offsetof(KnownVertexBounds, minV));
		LDRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
	}

	const u8 *loopStart = GetCodePtr();
	for (int i = 0; i < dec.numSteps_; i++) {
		if (!CompileStep(dec, i)) {
			EndWrite();
			// Reset the code ptr (effectively undoing what we generated) and return zero to indicate that we failed.
			ResetCodePtr(GetOffset(start));
			char temp[1024] = {0};
			dec.ToString(temp);
			ERROR_LOG(G3D, "Could not compile vertex decoder, failed at step %d: %s", i, temp);
			return nullptr;
		}
	}

	ADDI2R(srcReg, srcReg, dec.VertexSize(), scratchReg);
	ADDI2R(dstReg, dstReg, dec.decFmt.stride, scratchReg);
	SUBS(counterReg, counterReg, 1);
	B(CC_NEQ, loopStart);

	if (dec.col) {
		MOVP2R(tempRegPtr, &gstate_c.vertexFullAlpha);
		CMP(fullAlphaReg, 0);
		FixupBranch skip = B(CC_NEQ);
		STRB(INDEX_UNSIGNED, fullAlphaReg, tempRegPtr, 0);
		SetJumpTarget(skip);
	}

	if (dec.tc && dec.throughmode) {
		// TODO: Smarter, only when doing bounds.
		MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
		STRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
		STRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
		STRH(INDEX_UNSIGNED, boundsMinVReg, scratchReg64, offsetof(KnownVertexBounds, minV));
		STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
	}

	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);

	RET();

	FlushIcache();

	if (log) {
		char temp[1024] = { 0 };
		dec.ToString(temp);
		INFO_LOG(JIT, "=== %s (%d bytes) ===", temp, (int)(GetCodePtr() - start));
		std::vector<std::string> lines = DisassembleArm64(start, (int)(GetCodePtr() - start));
		for (auto line : lines) {
			INFO_LOG(JIT, "%s", line.c_str());
		}
		INFO_LOG(JIT, "==========");
	}

	*jittedSize = (int)(GetCodePtr() - start);
	EndWrite();
	return (JittedVertexDecoder)start;
}

bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
	// See if we find a matching JIT function
	for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
		if (dec.steps_[step] == jitLookup[i].func) {
			((*this).*jitLookup[i].jitFunc)();
			return true;
		}
	}
	return false;
}

void VertexDecoderJitCache::Jit_ApplyWeights() {
	// We construct a matrix in Q4-Q7
	if (dec_->nweights >= 4) {
		MOVP2R(scratchReg64, bones + 16 * 4);
	}
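	// Accumulate the blended matrix row by row: bone matrix i is scaled by weight lane (i & 3)
	// of weight register (i >> 2) and added into Q4-Q7.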
	for (int i = 0; i < dec_->nweights; i++) {
		switch (i) {
		case 0:
			fp.FMUL(32, Q4, Q16, neonWeightRegsQ[0], 0);
			fp.FMUL(32, Q5, Q17, neonWeightRegsQ[0], 0);
			fp.FMUL(32, Q6, Q18, neonWeightRegsQ[0], 0);
			fp.FMUL(32, Q7, Q19, neonWeightRegsQ[0], 0);
			break;
		case 1:
			fp.FMLA(32, Q4, Q20, neonWeightRegsQ[0], 1);
			fp.FMLA(32, Q5, Q21, neonWeightRegsQ[0], 1);
			fp.FMLA(32, Q6, Q22, neonWeightRegsQ[0], 1);
			fp.FMLA(32, Q7, Q23, neonWeightRegsQ[0], 1);
			break;
		case 2:
			fp.FMLA(32, Q4, Q24, neonWeightRegsQ[0], 2);
			fp.FMLA(32, Q5, Q25, neonWeightRegsQ[0], 2);
			fp.FMLA(32, Q6, Q26, neonWeightRegsQ[0], 2);
			fp.FMLA(32, Q7, Q27, neonWeightRegsQ[0], 2);
			break;
		case 3:
			fp.FMLA(32, Q4, Q28, neonWeightRegsQ[0], 3);
			fp.FMLA(32, Q5, Q29, neonWeightRegsQ[0], 3);
			fp.FMLA(32, Q6, Q30, neonWeightRegsQ[0], 3);
			fp.FMLA(32, Q7, Q31, neonWeightRegsQ[0], 3);
			break;
		default:
			// Matrices 4+ need to be loaded from memory.
			fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
			fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
			fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3);
			fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3);
			fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3);
			fp.FMLA(32, Q7, Q11, neonWeightRegsQ[i >> 2], i & 3);
			ADDI2R(scratchReg64, scratchReg64, 4 * 16);
			break;
		}
	}
}

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j);
		STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j);
	}
	while (j & 3) {
		STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 2);
		STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
	}
	while (j & 3) {
		STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 2);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 4);
		STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
	}
	while (j & 3) {  // Zero additional weights rounding up to 4.
		STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 4);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU8Skin() {
	// Weight is first so srcReg is correct.
	switch (dec_->nweights) {
	case 1: fp.LDR(8, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	case 2: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	default:
		// For 3 we overread; for more than 4 we read the rest later.
		fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0);
		break;
	}

	fp.UXTL(8, neonScratchRegQ, neonScratchRegD);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
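	// UCVTF with 7 fractional bits converts to float and divides by 128 in one instruction.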
	fp.UCVTF(32, neonWeightRegsQ[0], neonScratchRegQ, 7);

	if (dec_->nweights > 4) {
		switch (dec_->nweights) {
		case 5: fp.LDR(8, INDEX_UNSIGNED, neonScratchRegD, srcReg, 4); break;
		case 6: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 4); break;
		case 7:
		case 8:
			fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 4);
			break;
		}
		fp.UXTL(8, neonScratchRegQ, neonScratchRegD);
		fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
		fp.UCVTF(32, neonWeightRegsQ[1], neonScratchRegQ, 7);
	}
	Jit_ApplyWeights();
}

void VertexDecoderJitCache::Jit_WeightsU16Skin() {
	switch (dec_->nweights) {
	case 1: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	case 2: fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	default:
		// For 3 we overread; for more than 4 we read the rest later.
		fp.LDR(64, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0);
		break;
	}
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
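	// UCVTF with 15 fractional bits folds the 1/32768 scaling into the conversion.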
	fp.UCVTF(32, neonWeightRegsQ[0], neonScratchRegQ, 15);

	if (dec_->nweights > 4) {
		switch (dec_->nweights) {
		case 5: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 8); break;
		case 6: fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 8); break;
		case 7:
		case 8:
			fp.LDR(64, INDEX_UNSIGNED, neonScratchRegD, srcReg, 8);
			break;
		}
		fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
		fp.UCVTF(32, neonWeightRegsQ[1], neonScratchRegQ, 15);
	}
	Jit_ApplyWeights();
}

void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
	switch (dec_->nweights) {
	case 1:
		fp.LDR(32, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		break;
	case 2:
		fp.LDR(64, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		break;
	case 3:
	case 4:
		fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		break;

	case 5:
		fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		fp.LDR(32, INDEX_UNSIGNED, neonWeightRegsQ[1], srcReg, 16);
		break;
	case 6:
		fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		fp.LDR(64, INDEX_UNSIGNED, neonWeightRegsQ[1], srcReg, 16);
		break;
	case 7:
	case 8:
		fp.LDP(128, INDEX_SIGNED, neonWeightRegsQ[0], neonWeightRegsQ[1], srcReg, 0);
		break;
	}
	Jit_ApplyWeights();
}

void VertexDecoderJitCache::Jit_Color8888() {
	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	// Set flags to determine if alpha != 0xFF.
	ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
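	// ORN gives ~(color ASR 24), which is zero exactly when the alpha byte was 0xFF.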
	CMP(tempReg2, 0);

	// Clear fullAlphaReg when the inverse was not 0.
	// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0;
	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);

	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
}

void VertexDecoderJitCache::Jit_Color4444() {
	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	// Spread out the components.
	ANDI2R(tempReg2, tempReg1, 0x000F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0x00F0, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 4));
	ANDI2R(tempReg3, tempReg1, 0x0F00, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8));
	ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 12));

	// And expand to 8 bits.
	ORR(tempReg1, tempReg2, tempReg2, ArithOption(tempReg2, ST_LSL, 4));

	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);

	// Set flags to determine if alpha != 0xFF.
	ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
	CMP(tempReg2, 0);

	// Clear fullAlphaReg when the inverse was not 0.
	// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0;
	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}

void VertexDecoderJitCache::Jit_Color565() {
	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	// Spread out R and B first.  This puts them in 0x001F001F.
	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 5));

	// Expand 5 -> 8.
	LSL(tempReg3, tempReg2, 3);
	ORR(tempReg2, tempReg3, tempReg2, ArithOption(tempReg2, ST_LSR, 2));
	ANDI2R(tempReg2, tempReg2, 0xFFFF00FF, scratchReg);

	// Now finally G.  We start by shoving it into a wall.
	LSR(tempReg1, tempReg1, 5);
	ANDI2R(tempReg1, tempReg1, 0x003F, scratchReg);
	LSL(tempReg3, tempReg1, 2);
	// Don't worry, shifts into a wall.
	ORR(tempReg3, tempReg3, tempReg1, ArithOption(tempReg1, ST_LSR, 4));
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8));

	// Add in full alpha.  No need to update fullAlphaReg.
	ORRI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);

	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
}

void VertexDecoderJitCache::Jit_Color5551() {
	LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);
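	// Signed load: if the alpha bit (bit 15) is set, sign extension fills the upper bits,
	// so masking with 0xFF000000 below produces full alpha directly.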

	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0x03E0, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 3));
	ANDI2R(tempReg3, tempReg1, 0x7C00, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 6));

	// Expand 5 -> 8.
	LSR(tempReg3, tempReg2, 2);
	// Clean up the bits that were shifted right.
	ANDI2R(tempReg3, tempReg3, ~0x000000F8);
	ANDI2R(tempReg3, tempReg3, ~0x0000F800);
	ORR(tempReg2, tempReg3, tempReg2, ArithOption(tempReg2, ST_LSL, 3));

	// Now we just need alpha.  Since we loaded as signed, it'll be extended.
	ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
	ORR(tempReg2, tempReg2, tempReg1);

	// Set flags to determine if alpha != 0xFF.
	ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
	CMP(tempReg3, 0);

	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off);

	// Clear fullAlphaReg when the inverse was not 0.
	// fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0;
	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}

void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);

	auto updateSide = [&](ARM64Reg src, CCFlags cc, ARM64Reg dst) {
		CMP(src, dst);
		CSEL(dst, src, dst, cc);
	};

	updateSide(tempReg1, CC_LT, boundsMinUReg);
	updateSide(tempReg1, CC_GT, boundsMaxUReg);
	updateSide(tempReg2, CC_LT, boundsMinVReg);
	updateSide(tempReg2, CC_GT, boundsMaxVReg);

	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcFloatThrough() {
	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcFloat() {
	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU8Prescale() {
	fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg);  // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU8ToFloat() {
	fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD, 7);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU16Prescale() {
	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg);  // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU16ToFloat() {
	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD, 15);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcFloatPrescale() {
	fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg);  // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_PosS8() {
	Jit_AnyS8ToFloat(dec_->posoff);
	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_PosS16() {
	Jit_AnyS16ToFloat(dec_->posoff);
	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_PosFloat() {
	// Only need to copy 12 bytes, but copying 16 should be okay (and is faster.)
	if ((dec_->posoff & 7) == 0 && (dec_->decFmt.posoff & 7) == 0) {
		LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->posoff);
		STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.posoff);
	} else {
		LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->posoff);
		STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.posoff);
		LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8);
		STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8);
	}
}

void VertexDecoderJitCache::Jit_PosS8Through() {
	LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
	LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 1);
	LDRSB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 2);  // signed?
	fp.SCVTF(fpScratchReg, tempReg1);
	fp.SCVTF(fpScratchReg2, tempReg2);
	fp.SCVTF(fpScratchReg3, tempReg3);
	STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff);
	STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4);
	STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8);
}

void VertexDecoderJitCache::Jit_PosS16Through() {
	// Start with X and Y (which are signed.)
	fp.LDUR(32, src[0], srcReg, dec_->posoff);
	fp.SXTL(16, srcD[0], src[0]);
	fp.SCVTF(32, srcD[0], srcD[0]);
	fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff);
	// Now load in Z (which is unsigned.)
	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4);
	fp.SCVTF(src[1], tempReg3);
	STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 8);
}

void VertexDecoderJitCache::Jit_NormalS8() {
	LDURH(tempReg1, srcReg, dec_->nrmoff);
	LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 2);
	ORR(tempReg1, tempReg1, tempReg3, ArithOption(tempReg3, ST_LSL, 16));
	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);
}

// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
	// NOTE: Not LDRH, we just copy the raw bytes here.
	LDUR(tempReg1, srcReg, dec_->nrmoff);
	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff);
}

void VertexDecoderJitCache::Jit_NormalFloat() {
	// Only need to copy 12 bytes, but copying 16 should be okay (and is faster.)
	if ((dec_->nrmoff & 7) == 0 && (dec_->decFmt.nrmoff & 7) == 0) {
		LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->nrmoff);
		STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.nrmoff);
	} else {
		LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->nrmoff);
		STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff);
		LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8);
		STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8);
	}
}

void VertexDecoderJitCache::Jit_NormalS8Skin() {
	Jit_AnyS8ToFloat(dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_NormalS16Skin() {
	Jit_AnyS16ToFloat(dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_NormalFloatSkin() {
	fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_PosS8Skin() {
	Jit_AnyS8ToFloat(dec_->posoff);
	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

void VertexDecoderJitCache::Jit_PosS16Skin() {
	Jit_AnyS16ToFloat(dec_->posoff);
	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

void VertexDecoderJitCache::Jit_PosFloatSkin() {
	fp.LDUR(128, srcQ[0], srcReg, dec_->posoff);
	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
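	// Widen s8 -> s16 -> s32, then convert with 7 fractional bits (i.e. scale by 1/128).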
	fp.LDUR(32, src[0], srcReg, srcoff);
	fp.SXTL(8, srcD[0], src[0]);
	fp.SXTL(16, srcQ[0], srcD[0]);
	fp.SCVTF(32, srcQ[0], srcQ[0], 7);
}

void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
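	// Widen s16 -> s32, then convert with 15 fractional bits (i.e. scale by 1/32768).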
	fp.LDUR(64, src[0], srcReg, srcoff);
	fp.SXTL(16, srcQ[0], srcD[0]);
	fp.SCVTF(32, srcQ[0], srcQ[0], 15);
}

void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
	// Multiply with the matrix sitting in Q4-Q7.
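	// acc = Q4 * x + Q5 * y + Q6 * z, plus the translation row Q7 for positions.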
	fp.FMUL(32, accNEON, Q4, srcQ[0], 0);
	fp.FMLA(32, accNEON, Q5, srcQ[0], 1);
	fp.FMLA(32, accNEON, Q6, srcQ[0], 2);
	if (pos) {
		fp.FADD(32, accNEON, accNEON, Q7);
	}
	fp.STUR(128, accNEON, dstReg, outOff);
}

#endif // PPSSPP_ARCH(ARM64)