// Copyright (c) 2013- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(ARM64)

#include "Common/CPUDetect.h"
#include "Common/Log.h"
#include "Core/Config.h"
#include "Core/Reporting.h"
#include "Common/Arm64Emitter.h"
#include "Core/MIPS/JitCommon/JitCommon.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"

alignas(16) static float bones[16 * 8]; // First four are kept in registers
alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f};

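// Scale factors folded into the UV prescale in Compile(): 8-bit texcoords get
// divided by 128 and 16-bit ones by 32768 so they land in the same range as
// the float path.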
static const float by128 = 1.0f / 128.0f;
static const float by32768 = 1.0f / 32768.0f;

using namespace Arm64Gen;

// Pointers, X regs (X0 - X17 safe to use.)
static const ARM64Reg srcReg = X0;
static const ARM64Reg dstReg = X1;

static const ARM64Reg counterReg = W2;
static const ARM64Reg tempReg1 = W3;
static const ARM64Reg tempRegPtr = X3;
static const ARM64Reg tempReg2 = W4;
static const ARM64Reg tempReg3 = W5;
static const ARM64Reg scratchReg = W6;
static const ARM64Reg scratchReg64 = X6;
static const ARM64Reg scratchReg2 = W7;
static const ARM64Reg scratchReg3 = W8;
static const ARM64Reg fullAlphaReg = W12;
static const ARM64Reg boundsMinUReg = W13;
static const ARM64Reg boundsMinVReg = W14;
static const ARM64Reg boundsMaxUReg = W15;
static const ARM64Reg boundsMaxVReg = W16;

static const ARM64Reg fpScratchReg = S4;
static const ARM64Reg fpScratchReg2 = S5;
static const ARM64Reg fpScratchReg3 = S6;
static const ARM64Reg fpScratchReg4 = S7;

static const ARM64Reg neonScratchRegD = D2;
static const ARM64Reg neonScratchRegQ = Q2;

static const ARM64Reg neonUVScaleReg = D0;
static const ARM64Reg neonUVOffsetReg = D1;

static const ARM64Reg src[3] = {S2, S3, S8};
static const ARM64Reg srcD[3] = {D2, D3, D8};
static const ARM64Reg srcQ[3] = {Q2, Q3, Q8};

static const ARM64Reg srcNEON = Q8;
static const ARM64Reg accNEON = Q9;

static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 }; // reverse order to prevent clash with neonScratchReg in Jit_WeightsU*Skin.

// Q4-Q7 is the generated matrix that we multiply things by.
// Q8,Q9 are accumulators/scratch for matrix mul.
// Q10, Q11 are more scratch for matrix mul.
// Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.
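// Concretely, Jit_ApplyWeights expects matrix 0 in Q16-Q19, matrix 1 in Q20-Q23,
// matrix 2 in Q24-Q27 and matrix 3 in Q28-Q31; matrices 4 and up are read back from bones[].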

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},

	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},

	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},

	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},

	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},

	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},

	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},

	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},

	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},

	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},

	/*
	{&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
	{&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
	{&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},

	{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
	{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
	{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},

	{&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
	{&VertexDecoder::Step_Color4444Morph, &VertexDecoderJitCache::Jit_Color4444Morph},
	{&VertexDecoder::Step_Color565Morph, &VertexDecoderJitCache::Jit_Color565Morph},
	{&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
	*/
};


JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
	dec_ = &dec;

	BeginWrite();
	const u8 *start = AlignCode16();

	bool prescaleStep = false;
	bool skinning = false;

	bool log = false;

	// Look for prescaled texcoord steps
	for (int i = 0; i < dec.numSteps_; i++) {
		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
			prescaleStep = true;
		}
		if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
			dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
			dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
			skinning = true;
		}
	}

	// Not used below, but useful for logging.
	(void)skinning;

	// if (skinning) log = true;

	uint64_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
	uint64_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);

	// Keep the scale/offset in a few fp registers if we need it.
	if (prescaleStep) {
		MOVP2R(X3, &gstate_c.uv);
		fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
		fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
			fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg);
			fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
			fp.MOVI2FDUP(neonScratchRegD, by32768, scratchReg);
			fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
		}
	}

	// Add code to convert matrices to 4x4.
	// Later we might want to do this when the matrices are loaded instead.
	if (dec.weighttype && g_Config.bSoftwareSkinning) {
		// Copying from X3 to X4
		MOVP2R(X3, gstate.boneMatrix);
		MOVP2R(X4, bones);
		MOVP2R(X5, boneMask);
		fp.LDR(128, INDEX_UNSIGNED, Q3, X5, 0);
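		// Each 128-bit load below grabs four floats, but the source rows are only three
		// floats apart (POST-increment by 12), so the last lane is really the first element
		// of the next row. Multiplying by boneMask (1,1,1,0) zeroes that lane out.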
		for (int i = 0; i < dec.nweights; i++) {
			// Note that INDEX_UNSIGNED does not support offsets not aligned to the data size so we must use POST.
			fp.LDR(128, INDEX_POST, Q4, X3, 12); // Load 128 bits even though we just want 96
			fp.LDR(128, INDEX_POST, Q5, X3, 12);
			fp.LDR(128, INDEX_POST, Q6, X3, 12);
			fp.LDR(128, INDEX_POST, Q7, X3, 12);
			// First four matrices are in registers Q16+.
			if (i < 4) {
				fp.FMUL(32, (ARM64Reg)(Q16 + i * 4), Q4, Q3);
				fp.FMUL(32, (ARM64Reg)(Q17 + i * 4), Q5, Q3);
				fp.FMUL(32, (ARM64Reg)(Q18 + i * 4), Q6, Q3);
				fp.FMUL(32, (ARM64Reg)(Q19 + i * 4), Q7, Q3);
				ADDI2R(X4, X4, 16 * 4);
			} else {
				fp.FMUL(32, Q4, Q4, Q3);
				fp.FMUL(32, Q5, Q5, Q3);
				fp.FMUL(32, Q6, Q6, Q3);
				fp.FMUL(32, Q7, Q7, Q3);
				fp.STR(128, INDEX_UNSIGNED, Q4, X4, 0);
				fp.STR(128, INDEX_UNSIGNED, Q5, X4, 16);
				fp.STR(128, INDEX_UNSIGNED, Q6, X4, 32);
				fp.STR(128, INDEX_UNSIGNED, Q7, X4, 48);
				ADDI2R(X4, X4, 16 * 4);
			}
		}
	}

	if (dec.col) {
		// Or LDB and skip the conditional? This is probably cheaper.
		MOVI2R(fullAlphaReg, 0xFF);
	}

	if (dec.tc && dec.throughmode) {
		// TODO: Smarter, only when doing bounds.
		MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
		LDRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
		LDRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
		LDRH(INDEX_UNSIGNED, boundsMinVReg, scratchReg64, offsetof(KnownVertexBounds, minV));
		LDRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
	}

	const u8 *loopStart = GetCodePtr();
	for (int i = 0; i < dec.numSteps_; i++) {
		if (!CompileStep(dec, i)) {
			EndWrite();
			// Reset the code ptr (effectively undoing what we generated) and return zero to indicate that we failed.
			ResetCodePtr(GetOffset(start));
			char temp[1024] = {0};
			dec.ToString(temp);
			ERROR_LOG(G3D, "Could not compile vertex decoder, failed at step %d: %s", i, temp);
			return nullptr;
		}
	}

	ADDI2R(srcReg, srcReg, dec.VertexSize(), scratchReg);
	ADDI2R(dstReg, dstReg, dec.decFmt.stride, scratchReg);
	SUBS(counterReg, counterReg, 1);
	B(CC_NEQ, loopStart);

	if (dec.col) {
		MOVP2R(tempRegPtr, &gstate_c.vertexFullAlpha);
		CMP(fullAlphaReg, 0);
		FixupBranch skip = B(CC_NEQ);
		STRB(INDEX_UNSIGNED, fullAlphaReg, tempRegPtr, 0);
		SetJumpTarget(skip);
	}

	if (dec.tc && dec.throughmode) {
		// TODO: Smarter, only when doing bounds.
		MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
		STRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
		STRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
		STRH(INDEX_UNSIGNED, boundsMinVReg, scratchReg64, offsetof(KnownVertexBounds, minV));
		STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
	}

	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);

	RET();

	FlushIcache();

	if (log) {
		char temp[1024] = { 0 };
		dec.ToString(temp);
		INFO_LOG(JIT, "=== %s (%d bytes) ===", temp, (int)(GetCodePtr() - start));
		std::vector<std::string> lines = DisassembleArm64(start, (int)(GetCodePtr() - start));
		for (auto line : lines) {
			INFO_LOG(JIT, "%s", line.c_str());
		}
		INFO_LOG(JIT, "==========");
	}

	*jittedSize = (int)(GetCodePtr() - start);
	EndWrite();
	return (JittedVertexDecoder)start;
}

bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
	// See if we find a matching JIT function
	for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
		if (dec.steps_[step] == jitLookup[i].func) {
			((*this).*jitLookup[i].jitFunc)();
			return true;
		}
	}
	return false;
}

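// Builds the blended bone matrix in Q4-Q7: each weight lane from neonWeightRegsQ scales
// one bone matrix (the first four straight from Q16-Q31, the rest loaded from bones[])
// and is accumulated with FMLA.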
void VertexDecoderJitCache::Jit_ApplyWeights() {
	// We construct a matrix in Q4-Q7
	if (dec_->nweights >= 4) {
		MOVP2R(scratchReg64, bones + 16 * 4);
	}
	for (int i = 0; i < dec_->nweights; i++) {
		switch (i) {
		case 0:
			fp.FMUL(32, Q4, Q16, neonWeightRegsQ[0], 0);
			fp.FMUL(32, Q5, Q17, neonWeightRegsQ[0], 0);
			fp.FMUL(32, Q6, Q18, neonWeightRegsQ[0], 0);
			fp.FMUL(32, Q7, Q19, neonWeightRegsQ[0], 0);
			break;
		case 1:
			fp.FMLA(32, Q4, Q20, neonWeightRegsQ[0], 1);
			fp.FMLA(32, Q5, Q21, neonWeightRegsQ[0], 1);
			fp.FMLA(32, Q6, Q22, neonWeightRegsQ[0], 1);
			fp.FMLA(32, Q7, Q23, neonWeightRegsQ[0], 1);
			break;
		case 2:
			fp.FMLA(32, Q4, Q24, neonWeightRegsQ[0], 2);
			fp.FMLA(32, Q5, Q25, neonWeightRegsQ[0], 2);
			fp.FMLA(32, Q6, Q26, neonWeightRegsQ[0], 2);
			fp.FMLA(32, Q7, Q27, neonWeightRegsQ[0], 2);
			break;
		case 3:
			fp.FMLA(32, Q4, Q28, neonWeightRegsQ[0], 3);
			fp.FMLA(32, Q5, Q29, neonWeightRegsQ[0], 3);
			fp.FMLA(32, Q6, Q30, neonWeightRegsQ[0], 3);
			fp.FMLA(32, Q7, Q31, neonWeightRegsQ[0], 3);
			break;
		default:
			// Matrices 4+ need to be loaded from memory.
			fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
			fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
			fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3);
			fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3);
			fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3);
			fp.FMLA(32, Q7, Q11, neonWeightRegsQ[i >> 2], i & 3);
			ADDI2R(scratchReg64, scratchReg64, 4 * 16);
			break;
		}
	}
}

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j);
		STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j);
	}
	while (j & 3) {
		STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 2);
		STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
	}
	while (j & 3) {
		STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 2);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 4);
		STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
	}
	while (j & 3) { // Zero additional weights rounding up to 4.
		STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 4);
		j++;
	}
}

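// Loads up to 8 unsigned byte weights, widens them to 32-bit and converts to float with
// 7 fractional bits (i.e. divides by 128) into neonWeightRegsQ, then blends the matrices.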
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
	// Weight is first so srcReg is correct.
	switch (dec_->nweights) {
	case 1: fp.LDR(8, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	case 2: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	default:
		// For 3, we over read, for over 4, we read more later.
		fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0);
		break;
	}

	fp.UXTL(8, neonScratchRegQ, neonScratchRegD);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
	fp.UCVTF(32, neonWeightRegsQ[0], neonScratchRegQ, 7);

	if (dec_->nweights > 4) {
		switch (dec_->nweights) {
		case 5: fp.LDR(8, INDEX_UNSIGNED, neonScratchRegD, srcReg, 4); break;
		case 6: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 4); break;
		case 7:
		case 8:
			fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 4);
			break;
		}
		fp.UXTL(8, neonScratchRegQ, neonScratchRegD);
		fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
		fp.UCVTF(32, neonWeightRegsQ[1], neonScratchRegQ, 7);
	}
	Jit_ApplyWeights();
}

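// Same as above for unsigned 16-bit weights: the fixed-point UCVTF with 15 fractional
// bits divides by 32768.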
void VertexDecoderJitCache::Jit_WeightsU16Skin() {
	switch (dec_->nweights) {
	case 1: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	case 2: fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0); break;
	default:
		// For 3, we over read, for over 4, we read more later.
		fp.LDR(64, INDEX_UNSIGNED, neonScratchRegD, srcReg, 0);
		break;
	}
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
	fp.UCVTF(32, neonWeightRegsQ[0], neonScratchRegQ, 15);

	if (dec_->nweights > 4) {
		switch (dec_->nweights) {
		case 5: fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, 8); break;
		case 6: fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, 8); break;
		case 7:
		case 8:
			fp.LDR(64, INDEX_UNSIGNED, neonScratchRegD, srcReg, 8);
			break;
		}
		fp.UXTL(16, neonScratchRegQ, neonScratchRegD);
		fp.UCVTF(32, neonWeightRegsQ[1], neonScratchRegQ, 15);
	}
	Jit_ApplyWeights();
}

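// Float weights load straight into neonWeightRegsQ; 5-8 weights need the second Q register.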
void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
	switch (dec_->nweights) {
	case 1:
		fp.LDR(32, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		break;
	case 2:
		fp.LDR(64, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		break;
	case 3:
	case 4:
		fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		break;

	case 5:
		fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		fp.LDR(32, INDEX_UNSIGNED, neonWeightRegsQ[1], srcReg, 16);
		break;
	case 6:
		fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
		fp.LDR(64, INDEX_UNSIGNED, neonWeightRegsQ[1], srcReg, 16);
		break;
	case 7:
	case 8:
		fp.LDP(128, INDEX_SIGNED, neonWeightRegsQ[0], neonWeightRegsQ[1], srcReg, 0);
		break;
	}
	Jit_ApplyWeights();
}

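// fullAlphaReg starts out as 0xFF and is cleared (via CSEL below) the first time a vertex's
// alpha isn't 0xFF; the epilogue in Compile() then stores the zero to gstate_c.vertexFullAlpha.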
void VertexDecoderJitCache::Jit_Color8888() {
	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	// Set flags to determine if alpha != 0xFF.
	ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
	CMP(tempReg2, 0);

	// Clear fullAlphaReg when the inverse was not 0.
	// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1;
	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);

	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
}

void VertexDecoderJitCache::Jit_Color4444() {
	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	// Spread out the components.
	ANDI2R(tempReg2, tempReg1, 0x000F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0x00F0, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 4));
	ANDI2R(tempReg3, tempReg1, 0x0F00, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8));
	ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 12));

	// And expand to 8 bits.
	ORR(tempReg1, tempReg2, tempReg2, ArithOption(tempReg2, ST_LSL, 4));

	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);

	// Set flags to determine if alpha != 0xFF.
	ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
	CMP(tempReg2, 0);

	// Clear fullAlphaReg when the inverse was not 0.
	// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1;
	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}

void VertexDecoderJitCache::Jit_Color565() {
	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	// Spread out R and B first. This puts them in 0x001F001F.
	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 5));

	// Expand 5 -> 8.
	LSL(tempReg3, tempReg2, 3);
	ORR(tempReg2, tempReg3, tempReg2, ArithOption(tempReg2, ST_LSR, 2));
	ANDI2R(tempReg2, tempReg2, 0xFFFF00FF, scratchReg);

	// Now finally G. We start by shoving it into a wall.
	LSR(tempReg1, tempReg1, 5);
	ANDI2R(tempReg1, tempReg1, 0x003F, scratchReg);
	LSL(tempReg3, tempReg1, 2);
	// Don't worry, shifts into a wall.
	ORR(tempReg3, tempReg3, tempReg1, ArithOption(tempReg1, ST_LSR, 4));
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8));

	// Add in full alpha. No need to update fullAlphaReg.
	ORRI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);

	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
}

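// 5551: the color is loaded sign-extended (LDRSH), so bit 15 (the alpha bit) propagates
// into the top bits; masking with 0xFF000000 then yields an 8-bit alpha of 0x00 or 0xFF.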
void VertexDecoderJitCache::Jit_Color5551() {
	LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
	ANDI2R(tempReg3, tempReg1, 0x03E0, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 3));
	ANDI2R(tempReg3, tempReg1, 0x7C00, scratchReg);
	ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 6));

	// Expand 5 -> 8.
	LSR(tempReg3, tempReg2, 2);
	// Clean up the bits that were shifted right.
	ANDI2R(tempReg3, tempReg3, ~0x000000F8);
	ANDI2R(tempReg3, tempReg3, ~0x0000F800);
	ORR(tempReg2, tempReg3, tempReg2, ArithOption(tempReg2, ST_LSL, 3));

	// Now we just need alpha. Since we loaded as signed, it'll be extended.
	ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
	ORR(tempReg2, tempReg2, tempReg1);

	// Set flags to determine if alpha != 0xFF.
	ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
	CMP(tempReg3, 0);

	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off);

	// Clear fullAlphaReg when the inverse was not 0.
	// fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1;
	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}

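// Through-mode UVs also update the running bounding box: each CMP/CSEL pair below keeps
// the min or max of the incoming U16 coordinate and the current bound.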
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);

	auto updateSide = [&](ARM64Reg src, CCFlags cc, ARM64Reg dst) {
		CMP(src, dst);
		CSEL(dst, src, dst, cc);
	};

	updateSide(tempReg1, CC_LT, boundsMinUReg);
	updateSide(tempReg1, CC_GT, boundsMaxUReg);
	updateSide(tempReg2, CC_LT, boundsMinVReg);
	updateSide(tempReg2, CC_GT, boundsMaxVReg);

	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcFloatThrough() {
	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcFloat() {
	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU8Prescale() {
	fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU8ToFloat() {
	fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD, 7);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU16Prescale() {
	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcU16ToFloat() {
	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD, 15);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_TcFloatPrescale() {
	fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

void VertexDecoderJitCache::Jit_PosS8() {
	Jit_AnyS8ToFloat(dec_->posoff);
	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_PosS16() {
	Jit_AnyS16ToFloat(dec_->posoff);
	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
}

void VertexDecoderJitCache::Jit_PosFloat() {
	// Only need to copy 12 bytes, but copying 16 should be okay (and is faster.)
	if ((dec_->posoff & 7) == 0 && (dec_->decFmt.posoff & 7) == 0) {
		LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->posoff);
		STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.posoff);
	} else {
		LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->posoff);
		STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.posoff);
		LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8);
		STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8);
	}
}

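// Through-mode positions are not normalized; the signed bytes are converted to float
// as-is, one component at a time.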
void VertexDecoderJitCache::Jit_PosS8Through() {
	LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
	LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 1);
	LDRSB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 2); // signed?
	fp.SCVTF(fpScratchReg, tempReg1);
	fp.SCVTF(fpScratchReg2, tempReg2);
	fp.SCVTF(fpScratchReg3, tempReg3);
	STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff);
	STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4);
	STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8);
}

void VertexDecoderJitCache::Jit_PosS16Through() {
	// Start with X and Y (which is signed.)
	fp.LDUR(32, src[0], srcReg, dec_->posoff);
	fp.SXTL(16, srcD[0], src[0]);
	fp.SCVTF(32, srcD[0], srcD[0]);
	fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff);
	// Now load in Z (which is unsigned.)
	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4);
	fp.SCVTF(src[1], tempReg3);
	STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 8);
}

void VertexDecoderJitCache::Jit_NormalS8() {
	LDURH(tempReg1, srcReg, dec_->nrmoff);
	LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 2);
	ORR(tempReg1, tempReg1, tempReg3, ArithOption(tempReg3, ST_LSL, 16));
	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);
}

// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
	// NOTE: Not LDRH, we just copy the raw bytes here.
	LDUR(tempReg1, srcReg, dec_->nrmoff);
	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff);
}

void VertexDecoderJitCache::Jit_NormalFloat() {
	// Only need to copy 12 bytes, but copying 16 should be okay (and is faster.)
	if ((dec_->nrmoff & 7) == 0 && (dec_->decFmt.nrmoff & 7) == 0) {
		LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->nrmoff);
		STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.nrmoff);
	} else {
		LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->nrmoff);
		STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff);
		LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8);
		STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8);
	}
}

void VertexDecoderJitCache::Jit_NormalS8Skin() {
	Jit_AnyS8ToFloat(dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_NormalS16Skin() {
	Jit_AnyS16ToFloat(dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_NormalFloatSkin() {
	fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}

void VertexDecoderJitCache::Jit_PosS8Skin() {
	Jit_AnyS8ToFloat(dec_->posoff);
	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

void VertexDecoderJitCache::Jit_PosS16Skin() {
	Jit_AnyS16ToFloat(dec_->posoff);
	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

void VertexDecoderJitCache::Jit_PosFloatSkin() {
	fp.LDUR(128, srcQ[0], srcReg, dec_->posoff);
	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}

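// Sign-extends four S8 components and converts with 7 fractional bits, so -128..127 maps
// to roughly the -1..1 range (a divide by 128).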
void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
	fp.LDUR(32, src[0], srcReg, srcoff);
	fp.SXTL(8, srcD[0], src[0]);
	fp.SXTL(16, srcQ[0], srcD[0]);
	fp.SCVTF(32, srcQ[0], srcQ[0], 7);
}

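// Same for S16: 15 fractional bits, i.e. a divide by 32768.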
void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
	fp.LDUR(64, src[0], srcReg, srcoff);
	fp.SXTL(16, srcQ[0], srcD[0]);
	fp.SCVTF(32, srcQ[0], srcQ[0], 15);
}

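// Transforms the vector in srcQ[0] by the matrix rows in Q4-Q6; for positions, the
// translation row in Q7 is added as well.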
void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
	// Multiply with the matrix sitting in Q4-Q7.
	fp.FMUL(32, accNEON, Q4, srcQ[0], 0);
	fp.FMLA(32, accNEON, Q5, srcQ[0], 1);
	fp.FMLA(32, accNEON, Q6, srcQ[0], 2);
	if (pos) {
		fp.FADD(32, accNEON, accNEON, Q7);
	}
	fp.STUR(128, accNEON, dstReg, outOff);
}

#endif // PPSSPP_ARCH(ARM64)