1 // Copyright (c) 2013- PPSSPP Project.
2
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 // GNU General Public License 2.0 for more details.
11
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18 #include "ppsspp_config.h"
19
20 #if PPSSPP_ARCH(ARM)
21
22 // This allows highlighting to work. Yay.
23 #ifdef __INTELLISENSE__
24 #define ARM
25 #endif
26
27 #include <stddef.h>
28
29 #include "Common/CPUDetect.h"
30 #include "Core/Config.h"
31 #include "Core/Reporting.h"
32 #include "GPU/GPUState.h"
33 #include "GPU/Common/VertexDecoderCommon.h"
34
35 extern void DisassembleArm(const u8 *data, int size);
36
37 bool NEONSkinning = false;
38 bool NEONMorphing = false;
39
40 // Used only in non-NEON mode.
41 alignas(16) static float skinMatrix[12];
42
43 // Will be used only in NEON mode.
44 alignas(16) static float bones[16 * 8]; // First two are kept in registers
45 alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f};
46
47 // NEON register allocation:
48 // Q0: Texture scaling parameters
49 // Q1: Temp storage
50 // Q2: Vector-by-matrix accumulator
51 // Q3: Unused (multiplier temp when morphing)
52 //
53 // When skinning, we'll use Q4-Q7 as the "matrix accumulator".
54 // First two matrices will be preloaded into Q8-Q11 and Q12-Q15 to reduce
55 // memory bandwidth requirements.
56 // The rest will be dumped to bones as on x86.
57 //
58 // When morphing, we never skin. So we're free to use Q4+.
// Q4 is for color shift values, and Q5 is a secondary multiplier inside the morph.
60 // TODO: Maybe load all morph weights to Q6+ to avoid memory access?
61
62
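// Reciprocal scale factors used to normalize fixed-point vertex data to float:
// 8-bit components are scaled by 1/128 and 16-bit ones by 1/32768, matching the
// PSP's fixed-point conventions. (by16384 is declared but not used in this file.)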
63 static const float by128 = 1.0f / 128.0f;
64 static const float by16384 = 1.0f / 16384.0f;
65 static const float by32768 = 1.0f / 32768.0f;
66
67 using namespace ArmGen;
68
69 // NOTE: Avoid R9, it's dangerous on iOS.
70 //
71 // r0-r3: parameters
// r4-r11: locals, callee-saved (we avoid R9 entirely, see above).
// r12: interprocedure scratch
// r13: stack pointer
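// The emitted JittedVertexDecoder is called as f(src, dst, count), so on entry
// R0 = source vertex data, R1 = decoded output, R2 = vertex count (see the
// srcReg/dstReg/counterReg aliases below).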
75
76 static const ARMReg tempReg1 = R3;
77 static const ARMReg tempReg2 = R4;
78 static const ARMReg tempReg3 = R5;
79 static const ARMReg scratchReg = R6;
80 static const ARMReg scratchReg2 = R7;
81 static const ARMReg scratchReg3 = R8;
82 static const ARMReg fullAlphaReg = R12;
83 static const ARMReg srcReg = R0;
84 static const ARMReg dstReg = R1;
85 static const ARMReg counterReg = R2;
86 static const ARMReg fpScratchReg = S4;
87 static const ARMReg fpScratchReg2 = S5;
88 static const ARMReg fpScratchReg3 = S6;
89 static const ARMReg fpScratchReg4 = S7;
90 static const ARMReg fpUscaleReg = S0;
91 static const ARMReg fpVscaleReg = S1;
92 static const ARMReg fpUoffsetReg = S2;
93 static const ARMReg fpVoffsetReg = S3;
94
95 // Simpler aliases for NEON. Overlaps with corresponding VFP regs.
96 static const ARMReg neonUVScaleReg = D0;
97 static const ARMReg neonUVOffsetReg = D1;
98 static const ARMReg neonScratchReg = D2;
99 static const ARMReg neonScratchReg2 = D3;
100 static const ARMReg neonScratchRegQ = Q1; // Overlaps with all the scratch regs
101
102 // Everything above S6 is fair game for skinning
103
104 // S8-S15 are used during matrix generation
105
106 // These only live through the matrix multiplication
107 static const ARMReg src[3] = {S8, S9, S10}; // skin source
108 static const ARMReg acc[3] = {S11, S12, S13}; // skin accumulator
109
110 static const ARMReg srcNEON = Q2;
111 static const ARMReg accNEON = Q3;
112
113 static const JitLookup jitLookup[] = {
114 {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
115 {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
116 {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
117 {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
118 {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
119 {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
120
121 {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
122 {&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
123 {&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
124
125 {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
126 {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
127 {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
128
129 {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
130 {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
131
132 {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
133 {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
134 {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
135
136 {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
137 {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
138 {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
139
140 {&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
141 {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
142 {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
143 {&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},
144
145 {&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
146 {&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
147 {&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
148
149 {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
150 {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
151 {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
152
153 {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
154 {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
155 {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
156
157 {&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
158 {&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
159 {&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},
160
161 {&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
162 {&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
163 {&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},
164
165 {&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
166 {&VertexDecoder::Step_Color4444Morph, &VertexDecoderJitCache::Jit_Color4444Morph},
167 {&VertexDecoder::Step_Color565Morph, &VertexDecoderJitCache::Jit_Color565Morph},
168 {&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
169 };
170
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
172 dec_ = &dec;
173 BeginWrite();
174 const u8 *start = AlignCode16();
175
176 bool prescaleStep = false;
177 bool skinning = false;
178
179 NEONSkinning = cpu_info.bNEON;
180 NEONMorphing = cpu_info.bNEON;
181
182 // Look for prescaled texcoord steps
183 for (int i = 0; i < dec.numSteps_; i++) {
184 if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
185 dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
186 dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
187 prescaleStep = true;
188 }
189 if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
190 dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
191 dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
192 skinning = true;
193 }
194 }
195
196 // Not used below, but useful for logging.
197 (void)skinning;
198
199 SetCC(CC_AL);
200
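// Prologue. Under the AAPCS, R4-R11 and D8-D15 are callee-saved, so we preserve the
// ones we clobber (R9 is avoided entirely, see the note above). Q8-Q15 map to D16-D31,
// which are caller-saved, so they need no saving.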
201 PUSH(8, R4, R5, R6, R7, R8, R10, R11, R_LR);
202 if (NEONSkinning || NEONMorphing) {
203 VPUSH(D8, 8);
204 }
205
206 // Keep the scale/offset in a few fp registers if we need it.
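// The 1/128 or 1/32768 normalization for 8/16-bit texcoords is folded into the scale
// up front, so the per-vertex prescale path is just a multiply and an add.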
207 if (prescaleStep) {
208 MOVP2R(R3, &gstate_c.uv);
209 if (cpu_info.bNEON) {
210 VLD1(F_32, neonUVScaleReg, R3, 2, ALIGN_NONE);
211 if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
212 VMOV_neon(F_32, neonScratchReg, by128);
213 VMUL(F_32, neonUVScaleReg, neonUVScaleReg, neonScratchReg);
214 } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
215 VMOV_neon(F_32, neonScratchReg, by32768);
216 VMUL(F_32, neonUVScaleReg, neonUVScaleReg, neonScratchReg);
217 }
218 } else {
VLDMIA(R3, false, fpUscaleReg, 4); // fp{Uscale, Vscale, Uoffset, Voffset}Reg = {S0-S3}
220 if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
221 MOVI2F(fpScratchReg, by128, scratchReg);
222 VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
223 VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
224 } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
225 MOVI2F(fpScratchReg, by32768, scratchReg);
226 VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
227 VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
228 }
229 }
230 }
231
232 // Add code to convert matrices to 4x4.
233 // Later we might want to do this when the matrices are loaded instead.
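// Each PSP bone matrix is 4x3 (12 floats); its rows are widened to 4 floats here, with
// boneMask zeroing the extra lane. Matrices 0 and 1 stay resident in Q8-Q11/Q12-Q15,
// the rest are stored to the aligned bones[] buffer (its first two slots are skipped
// so indexing stays matrix-aligned).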
234 if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning) {
235 // Copying from R3 to R4
236 MOVP2R(R3, gstate.boneMatrix);
237 MOVP2R(R4, bones);
238 MOVP2R(R5, boneMask);
239 VLD1(F_32, Q3, R5, 2, ALIGN_128);
240 for (int i = 0; i < dec.nweights; i++) {
241 VLD1(F_32, Q4, R3, 2); // Load 128 bits even though we just want 96
242 VMUL(F_32, Q4, Q4, Q3);
243 ADD(R3, R3, 12);
244 VLD1(F_32, Q5, R3, 2);
245 VMUL(F_32, Q5, Q5, Q3);
246 ADD(R3, R3, 12);
247 VLD1(F_32, Q6, R3, 2);
248 VMUL(F_32, Q6, Q6, Q3);
249 ADD(R3, R3, 12);
250 VLD1(F_32, Q7, R3, 2);
251 VMUL(F_32, Q7, Q7, Q3);
252 ADD(R3, R3, 12);
253 // First two matrices are in registers.
254 if (i == 0) {
255 VMOV(Q8, Q4);
256 VMOV(Q9, Q5);
257 VMOV(Q10, Q6);
258 VMOV(Q11, Q7);
259 ADD(R4, R4, 16 * 4);
260 } else if (i == 1) {
261 VMOV(Q12, Q4);
262 VMOV(Q13, Q5);
263 VMOV(Q14, Q6);
264 VMOV(Q15, Q7);
265 ADD(R4, R4, 16 * 4);
266 } else {
267 VST1(F_32, Q4, R4, 2, ALIGN_128, REG_UPDATE);
268 VST1(F_32, Q5, R4, 2, ALIGN_128, REG_UPDATE);
269 VST1(F_32, Q6, R4, 2, ALIGN_128, REG_UPDATE);
270 VST1(F_32, Q7, R4, 2, ALIGN_128, REG_UPDATE);
271 }
272 }
273 }
274
275 if (dec.col) {
276 // Or LDB and skip the conditional? This is probably cheaper.
277 MOV(fullAlphaReg, 0xFF);
278 }
279
280 JumpTarget loopStart = GetCodePtr();
281 // Preload data cache ahead of reading. This offset seems pretty good.
282 PLD(srcReg, 64);
283 for (int i = 0; i < dec.numSteps_; i++) {
284 if (!CompileStep(dec, i)) {
285 EndWrite();
286 // Reset the code ptr and return zero to indicate that we failed.
287 ResetCodePtr(GetOffset(start));
288 char temp[1024] = {0};
289 dec.ToString(temp);
290 INFO_LOG(G3D, "Could not compile vertex decoder: %s", temp);
291 return 0;
292 }
293 }
294
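// Per-vertex bookkeeping: advance the source pointer by the PSP vertex size, the
// destination by the decoded stride, and loop until the counter hits zero.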
295 ADDI2R(srcReg, srcReg, dec.VertexSize(), scratchReg);
296 ADDI2R(dstReg, dstReg, dec.decFmt.stride, scratchReg);
297 SUBS(counterReg, counterReg, 1);
298 B_CC(CC_NEQ, loopStart);
299
300 if (dec.col) {
301 MOVP2R(tempReg1, &gstate_c.vertexFullAlpha);
302 CMP(fullAlphaReg, 0);
303 SetCC(CC_EQ);
304 STRB(fullAlphaReg, tempReg1, 0);
305 SetCC(CC_AL);
306 }
307
308 if (NEONSkinning || NEONMorphing) {
309 VPOP(D8, 8);
310 }
311 POP(8, R4, R5, R6, R7, R8, R10, R11, R_PC);
312
313 FlushLitPool();
314 FlushIcache();
315
316 /*
317 DisassembleArm(start, GetCodePtr() - start);
318 char temp[1024] = {0};
319 dec.ToString(temp);
320 INFO_LOG(G3D, "%s", temp);
321 */
322
323 *jittedSize = GetCodePtr() - start;
324 EndWrite();
325 return (JittedVertexDecoder)start;
326 }
327
void VertexDecoderJitCache::Jit_WeightsU8() {
329 // Basic implementation - a byte at a time. TODO: Optimize
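// Weights the vertex doesn't supply are zero-filled up to the next multiple of four,
// so the output always contains whole 4-weight groups.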
330 int j;
331 for (j = 0; j < dec_->nweights; j++) {
332 LDRB(tempReg1, srcReg, dec_->weightoff + j);
333 STRB(tempReg1, dstReg, dec_->decFmt.w0off + j);
334 }
335 if (j & 3) {
336 // Create a zero register. Might want to make a fixed one.
337 EOR(scratchReg, scratchReg, scratchReg);
338 }
339 while (j & 3) {
340 STRB(scratchReg, dstReg, dec_->decFmt.w0off + j);
341 j++;
342 }
343 }
344
void VertexDecoderJitCache::Jit_WeightsU16() {
346 // Basic implementation - a short at a time. TODO: Optimize
347 int j;
348 for (j = 0; j < dec_->nweights; j++) {
349 LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
350 STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
351 }
352 if (j & 3) {
353 // Create a zero register. Might want to make a fixed one.
354 EOR(scratchReg, scratchReg, scratchReg);
355 }
356 while (j & 3) {
357 STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2);
358 j++;
359 }
360 }
361
void VertexDecoderJitCache::Jit_WeightsFloat() {
363 int j;
364 for (j = 0; j < dec_->nweights; j++) {
365 LDR(tempReg1, srcReg, dec_->weightoff + j * 4);
366 STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
367 }
368 if (j & 3) {
369 EOR(tempReg1, tempReg1, tempReg1);
370 }
371 while (j & 3) { // Zero additional weights rounding up to 4.
372 STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
373 j++;
374 }
375 }
376
377 static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
378 static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
379 static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };
380
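// Accumulates the blended skin matrix from the already-loaded weights. A rough C sketch
// of what the generated code computes (reference only, not executed here; weight[] and
// boneMatrix[] stand for the decoded weights and gstate.boneMatrix):
//   for (int row = 0; row < 12; row++) {
//     float sum = 0.0f;
//     for (int i = 0; i < nweights; i++)
//       sum += weight[i] * boneMatrix[i * 12 + row];
//     skinMatrix[row] = sum;
//   }
// The NEON path keeps the accumulated 4x4 matrix in Q4-Q7 for Jit_WriteMatrixMul instead
// of round-tripping through skinMatrix.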
void VertexDecoderJitCache::Jit_ApplyWeights() {
382 if (NEONSkinning) {
383 // We construct a matrix in Q4-Q7
384 // We can use Q1 as temp.
385 if (dec_->nweights >= 2) {
386 MOVP2R(scratchReg, bones + 16 * 2);
387 }
388 for (int i = 0; i < dec_->nweights; i++) {
389 switch (i) {
390 case 0:
391 VMUL_scalar(F_32, Q4, Q8, QScalar(neonWeightRegsQ[0], 0));
392 VMUL_scalar(F_32, Q5, Q9, QScalar(neonWeightRegsQ[0], 0));
393 VMUL_scalar(F_32, Q6, Q10, QScalar(neonWeightRegsQ[0], 0));
394 VMUL_scalar(F_32, Q7, Q11, QScalar(neonWeightRegsQ[0], 0));
395 break;
396 case 1:
397 // Krait likes VDUP + VFMA better than VMLA, and it's easy to do here.
398 if (cpu_info.bVFPv4) {
399 VDUP(F_32, Q1, neonWeightRegsQ[i >> 2], i & 1);
400 VFMA(F_32, Q4, Q12, Q1);
401 VFMA(F_32, Q5, Q13, Q1);
402 VFMA(F_32, Q6, Q14, Q1);
403 VFMA(F_32, Q7, Q15, Q1);
404 } else {
405 VMLA_scalar(F_32, Q4, Q12, QScalar(neonWeightRegsQ[0], 1));
406 VMLA_scalar(F_32, Q5, Q13, QScalar(neonWeightRegsQ[0], 1));
407 VMLA_scalar(F_32, Q6, Q14, QScalar(neonWeightRegsQ[0], 1));
408 VMLA_scalar(F_32, Q7, Q15, QScalar(neonWeightRegsQ[0], 1));
409 }
410 break;
411 default:
412 // Matrices 2+ need to be loaded from memory.
413 // Wonder if we can free up one more register so we could get some parallelism.
414 // Actually Q3 is free if there are fewer than 5 weights...
415 if (dec_->nweights <= 4) {
416 VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
417 VLD1(F_32, Q3, scratchReg, 2, ALIGN_128, REG_UPDATE);
418 VMLA_scalar(F_32, Q4, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
419 VMLA_scalar(F_32, Q5, Q3, QScalar(neonWeightRegsQ[i >> 2], i & 3));
420 VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
421 VLD1(F_32, Q3, scratchReg, 2, ALIGN_128, REG_UPDATE);
422 VMLA_scalar(F_32, Q6, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
423 VMLA_scalar(F_32, Q7, Q3, QScalar(neonWeightRegsQ[i >> 2], i & 3));
424 } else {
425 VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
426 VMLA_scalar(F_32, Q4, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
427 VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
428 VMLA_scalar(F_32, Q5, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
429 VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
430 VMLA_scalar(F_32, Q6, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
431 VLD1(F_32, Q1, scratchReg, 2, ALIGN_128, REG_UPDATE);
432 VMLA_scalar(F_32, Q7, Q1, QScalar(neonWeightRegsQ[i >> 2], i & 3));
433 }
434 break;
435 }
436 }
437 } else {
438 MOVP2R(tempReg2, skinMatrix);
439 // This approach saves a few stores but accesses the matrices in a more
440 // sparse order.
441 const float *bone = &gstate.boneMatrix[0];
442 MOVP2R(tempReg1, bone);
443 for (int i = 0; i < 12; i++) {
444 VLDR(fpScratchReg3, tempReg1, i * 4);
445 VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]);
446 for (int j = 1; j < dec_->nweights; j++) {
447 VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12);
448 VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]);
449 }
450 VSTR(fpScratchReg3, tempReg2, i * 4);
451 }
452 }
453 }
454
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
456 if (NEONSkinning) {
457 // Weight is first so srcReg is correct.
458 switch (dec_->nweights) {
459 case 1: VLD1_lane(I_8, neonScratchReg, srcReg, 0, false); break;
460 case 2: VLD1_lane(I_16, neonScratchReg, srcReg, 0, false); break;
461 default:
// For 3 weights we over-read; for more than 4, the rest is read later.
463 VLD1_lane(I_32, neonScratchReg, srcReg, 0, false);
464 break;
465 }
466 // This can be represented as a constant.
467 VMOV_neon(F_32, Q3, by128);
468 VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
469 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
470 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
471 VMUL(F_32, neonWeightRegsQ[0], neonScratchRegQ, Q3);
472
473 if (dec_->nweights > 4) {
474 ADD(tempReg1, srcReg, 4 * sizeof(u8));
475 switch (dec_->nweights) {
476 case 5: VLD1_lane(I_8, neonScratchReg, tempReg1, 0, false); break;
477 case 6: VLD1_lane(I_16, neonScratchReg, tempReg1, 0, false); break;
478 case 7:
479 case 8:
480 VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false);
481 break;
482 }
483 VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
484 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
485 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
486 VMUL(F_32, neonWeightRegsQ[1], neonScratchRegQ, Q3);
487 }
488 } else {
489 for (int j = 0; j < dec_->nweights; j++) {
490 LDRB(tempReg1, srcReg, dec_->weightoff + j);
491 VMOV(fpScratchReg, tempReg1);
492 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
493 MOVI2F(fpScratchReg2, by128, scratchReg);
494 VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
495 }
496 }
497 Jit_ApplyWeights();
498 }
499
void VertexDecoderJitCache::Jit_WeightsU16Skin() {
501 if (NEONSkinning) {
502 switch (dec_->nweights) {
503 case 1: VLD1_lane(I_16, neonScratchReg, srcReg, 0, true); break;
504 case 2: VLD1_lane(I_32, neonScratchReg, srcReg, 0, false); break;
505 default:
// For 3 weights we over-read; for more than 4, the rest is read later.
507 VLD1(I_32, neonScratchReg, srcReg, 1, ALIGN_NONE);
508 break;
509 }
510 // This can be represented as a constant.
511 VMOV_neon(F_32, Q3, by32768);
512 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
513 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
514 VMUL(F_32, neonWeightRegsQ[0], neonScratchRegQ, Q3);
515
516 if (dec_->nweights > 4) {
517 ADD(tempReg1, srcReg, 4 * sizeof(u16));
518 switch (dec_->nweights) {
519 case 5: VLD1_lane(I_16, neonScratchReg, tempReg1, 0, true); break;
520 case 6: VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false); break;
521 case 7:
522 case 8:
523 VLD1(I_32, neonScratchReg, tempReg1, 1, ALIGN_NONE);
524 break;
525 }
526 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
527 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
528 VMUL(F_32, neonWeightRegsQ[1], neonScratchRegQ, Q3);
529 }
530 } else {
// Non-NEON fallback path.
532 for (int j = 0; j < dec_->nweights; j++) {
533 LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
534 VMOV(fpScratchReg, tempReg1);
535 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
536 MOVI2F(fpScratchReg2, by32768, scratchReg);
537 VMUL(weightRegs[j], fpScratchReg, fpScratchReg2);
538 }
539 }
540 Jit_ApplyWeights();
541 }
542
void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
544 for (int i = 1; i < dec_->nweights; ++i) {
545 _dbg_assert_msg_(weightRegs[i - 1] + 1 == weightRegs[i], "VertexDecoder weightRegs must be in order.");
546 }
547
548 // Weights are always first, so we can use srcReg directly.
549 if (NEONSkinning) {
// Not using VLDMIA with D registers here (it was tried behind if (false)): that path breaks Daxter and doesn't seem to work as expected.
551 if (dec_->nweights == 1) {
552 VLD1_lane(F_32, neonWeightRegsD[0], srcReg, 0, true);
553 } else {
554 // We may over-read by one float but this is not a tragedy.
555 VLD1(F_32, neonWeightRegsD[0], srcReg, (dec_->nweights + 1) / 2);
556 }
557 } else {
558 VLDMIA(srcReg, false, weightRegs[0], dec_->nweights);
559 }
560 Jit_ApplyWeights();
561 }
562
void VertexDecoderJitCache::Jit_TcFloat() {
564 LDR(tempReg1, srcReg, dec_->tcoff);
565 LDR(tempReg2, srcReg, dec_->tcoff + 4);
566 STR(tempReg1, dstReg, dec_->decFmt.uvoff);
567 STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
568 }
569
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
571 LDRH(tempReg1, srcReg, dec_->tcoff);
572 LDRH(tempReg2, srcReg, dec_->tcoff + 2);
573
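// Track the min/max UVs seen in throughmode into gstate_c.vertBounds, using conditional
// stores so only the winning comparison writes back.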
574 MOVP2R(scratchReg, &gstate_c.vertBounds.minU);
575
576 auto updateSide = [&](ARMReg r, CCFlags cc, u32 off) {
577 LDRH(tempReg3, scratchReg, off);
578 CMP(r, tempReg3);
579 SetCC(cc);
580 STRH(r, scratchReg, off);
581 SetCC(CC_AL);
582 };
583
584 // TODO: Can this actually be fast? Hmm, floats aren't better.
585 updateSide(tempReg1, CC_LT, offsetof(KnownVertexBounds, minU));
586 updateSide(tempReg1, CC_GT, offsetof(KnownVertexBounds, maxU));
587 updateSide(tempReg2, CC_LT, offsetof(KnownVertexBounds, minV));
588 updateSide(tempReg2, CC_GT, offsetof(KnownVertexBounds, maxV));
589
590 if (cpu_info.bNEON) {
591 ADD(scratchReg, srcReg, dec_->tcoff);
592 VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
593 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
594 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
595 ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
596 VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
597 } else {
598 VMOV(fpScratchReg, tempReg1);
599 VMOV(fpScratchReg2, tempReg2);
600 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
601 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
602 VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
603 VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
604 }
605 }
606
void VertexDecoderJitCache::Jit_TcFloatThrough() {
608 LDR(tempReg1, srcReg, dec_->tcoff);
609 LDR(tempReg2, srcReg, dec_->tcoff + 4);
610 STR(tempReg1, dstReg, dec_->decFmt.uvoff);
611 STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
612 }
613
void VertexDecoderJitCache::Jit_TcU8Prescale() {
615 if (cpu_info.bNEON) {
616 // TODO: Needs testing
617 ADD(scratchReg, srcReg, dec_->tcoff);
618 VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
619 VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
620 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
621 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
622 ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
623 VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
624 VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
625 VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
626 } else {
627 LDRB(tempReg1, srcReg, dec_->tcoff);
628 LDRB(tempReg2, srcReg, dec_->tcoff + 1);
629 VMOV(fpScratchReg, tempReg1);
630 VMOV(fpScratchReg2, tempReg2);
631 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
632 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
633 // Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later.
634 VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
635 VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
636 VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
637 VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
638 VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
639 VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
640 }
641 }
642
void VertexDecoderJitCache::Jit_TcU8ToFloat() {
644 if (cpu_info.bNEON) {
645 // TODO: Needs testing
646 ADD(scratchReg, srcReg, dec_->tcoff);
647 VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
648 VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
649 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
650 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
651 VMOV_neon(F_32, neonScratchReg2, by128);
652 VMUL(F_32, neonScratchReg, neonScratchReg, neonScratchReg2);
653 ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
654 VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
655 } else {
656 LDRB(tempReg1, srcReg, dec_->tcoff);
657 LDRB(tempReg2, srcReg, dec_->tcoff + 1);
658 VMOV(fpScratchReg, tempReg1);
659 VMOV(fpScratchReg2, tempReg2);
660 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
661 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
662 MOVI2F(S15, by128, scratchReg);
663 VMUL(fpScratchReg, fpScratchReg, S15);
664 VMUL(fpScratchReg2, fpScratchReg2, S15);
665 VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
666 VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
667 }
668 }
669
void VertexDecoderJitCache::Jit_TcU16Prescale() {
671 if (cpu_info.bNEON) {
672 // TODO: Needs testing
673 ADD(scratchReg, srcReg, dec_->tcoff);
674 VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
675 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
676 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
677 ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
678 VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
679 VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
680 VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
681 } else {
682 LDRH(tempReg1, srcReg, dec_->tcoff);
683 LDRH(tempReg2, srcReg, dec_->tcoff + 2);
684 VMOV(fpScratchReg, tempReg1);
685 VMOV(fpScratchReg2, tempReg2);
686 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
687 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
688 VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
689 VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
690 VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
691 VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
692 VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
693 VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
694 }
695 }
696
void VertexDecoderJitCache::Jit_TcU16ToFloat() {
698 if (cpu_info.bNEON) {
699 // TODO: Needs testing
700 ADD(scratchReg, srcReg, dec_->tcoff);
701 VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
702 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
703 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
704 ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
705 VMOV_neon(F_32, neonScratchReg2, by32768);
706 VMUL(F_32, neonScratchReg, neonScratchReg, neonScratchReg2);
707 VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
708 } else {
709 LDRH(tempReg1, srcReg, dec_->tcoff);
710 LDRH(tempReg2, srcReg, dec_->tcoff + 2);
711 VMOV(fpScratchReg, tempReg1);
712 VMOV(fpScratchReg2, tempReg2);
713 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
714 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
715 MOVI2F(S15, by32768, scratchReg);
716 VMUL(fpScratchReg, fpScratchReg, S15);
717 VMUL(fpScratchReg2, fpScratchReg2, S15);
718 VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
719 VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
720 }
721 }
722
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
724 if (cpu_info.bNEON) {
725 ADD(scratchReg, srcReg, dec_->tcoff);
726 VLD1(F_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
727 ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
728 VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
729 VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
730 VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
731 } else {
732 VLDR(fpScratchReg, srcReg, dec_->tcoff);
733 VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4);
734 VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
735 VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
736 VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
737 VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
738 VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
739 VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
740 }
741 }
742
void VertexDecoderJitCache::Jit_Color8888() {
744 LDR(tempReg1, srcReg, dec_->coloff);
745 // Set flags to determine if alpha != 0xFF.
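// MVN of (color ASR 24) is zero exactly when the alpha byte is 0xFF, so the conditional
// MOV below clears fullAlphaReg only when a non-opaque alpha is seen.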
746 MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
747 STR(tempReg1, dstReg, dec_->decFmt.c0off);
748 SetCC(CC_NEQ);
749 MOV(fullAlphaReg, 0);
750 SetCC(CC_AL);
751 }
752
void VertexDecoderJitCache::Jit_Color4444() {
754 LDRH(tempReg1, srcReg, dec_->coloff);
755
756 // Spread out the components.
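// Each 4-bit channel lands in the low nibble of its own byte; the final ORR with a
// 4-bit shift replicates the nibble (n -> n * 17), so 0x0..0xF expands to 0x00..0xFF,
// e.g. nibble 0x3 becomes byte 0x33.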
757 ANDI2R(tempReg2, tempReg1, 0x000F, scratchReg);
758 ANDI2R(tempReg3, tempReg1, 0x00F0, scratchReg);
759 ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 4));
760 ANDI2R(tempReg3, tempReg1, 0x0F00, scratchReg);
761 ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));
762 ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg);
763 ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12));
764
765 // And expand to 8 bits.
766 ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4));
767
768 STR(tempReg1, dstReg, dec_->decFmt.c0off);
769
770 // Set flags to determine if alpha != 0xFF.
771 MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
772 SetCC(CC_NEQ);
773 MOV(fullAlphaReg, 0);
774 SetCC(CC_AL);
775 }
776
void VertexDecoderJitCache::Jit_Color565() {
778 LDRH(tempReg1, srcReg, dec_->coloff);
779
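// 565 -> 8888: a 5-bit value x expands to (x << 3) | (x >> 2) and the 6-bit green to
// (g << 2) | (g >> 4), so 31 and 63 both map exactly to 255.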
780 // Spread out R and B first. This puts them in 0x001F001F.
781 ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
782 ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg);
783 ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 5));
784
785 // Expand 5 -> 8.
786 LSL(tempReg3, tempReg2, 3);
787 ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSR, 2));
788 ANDI2R(tempReg2, tempReg2, 0xFFFF00FF, scratchReg);
789
790 // Now finally G. We start by shoving it into a wall.
791 LSR(tempReg1, tempReg1, 5);
792 ANDI2R(tempReg1, tempReg1, 0x003F, scratchReg);
793 LSL(tempReg3, tempReg1, 2);
794 // Don't worry, shifts into a wall.
795 ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4));
796 ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));
797
798 // Add in full alpha. No need to update fullAlphaReg.
799 ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);
800
801 STR(tempReg1, dstReg, dec_->decFmt.c0off);
802 }
803
void VertexDecoderJitCache::Jit_Color5551() {
805 LDRSH(tempReg1, srcReg, dec_->coloff);
806
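// 5551 -> 8888: each 5-bit channel expands as (x << 3) | (x >> 2); the BICs below clear
// bits that the shared >> 2 smears across channel boundaries. Alpha comes from the
// sign-extending load above: bit 15 turns into 0xFF000000 (or zero) for free.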
807 ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
808 ANDI2R(tempReg3, tempReg1, 0x03E0, scratchReg);
809 ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 3));
810 ANDI2R(tempReg3, tempReg1, 0x7C00, scratchReg);
811 ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 6));
812
813 // Expand 5 -> 8.
814 LSR(tempReg3, tempReg2, 2);
815 // Clean up the bits that were shifted right.
816 BIC(tempReg3, tempReg3, AssumeMakeOperand2(0x000000F8));
817 BIC(tempReg3, tempReg3, AssumeMakeOperand2(0x0000F800));
818 ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSL, 3));
819
820 // Now we just need alpha. Since we loaded as signed, it'll be extended.
821 ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
822 ORR(tempReg2, tempReg2, tempReg1);
823
824 // Set flags to determine if alpha != 0xFF.
825 MVNS(tempReg3, Operand2(tempReg1, ST_ASR, 24));
826 STR(tempReg2, dstReg, dec_->decFmt.c0off);
827 SetCC(CC_NEQ);
828 MOV(fullAlphaReg, 0);
829 SetCC(CC_AL);
830 }
831
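// The morph decoders blend dec_->morphcount keyframes: for each component,
// result = sum_n(gstate_c.morphWeights[n] * value_n), with consecutive keyframes
// located onesize_ bytes apart in the source vertex.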
void VertexDecoderJitCache::Jit_Color8888Morph() {
833 const bool useNEON = NEONMorphing;
834 ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
835 MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
836
837 bool first = true;
838 for (int n = 0; n < dec_->morphcount; ++n) {
839 if (useNEON) {
840 VLD1_lane(I_32, neonScratchReg, tempReg1, 0, true);
841 VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
842
843 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
844 VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
845 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
846 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
847
848 if (first) {
849 first = false;
850 VMUL(F_32, Q2, neonScratchRegQ, Q3);
851 } else if (cpu_info.bVFPv4) {
852 VFMA(F_32, Q2, neonScratchRegQ, Q3);
853 } else {
854 VMLA(F_32, Q2, neonScratchRegQ, Q3);
855 }
856 } else {
857 LDRB(scratchReg, tempReg1, 0);
858 LDRB(scratchReg2, tempReg1, 1);
859 LDRB(scratchReg3, tempReg1, 2);
860 LDRB(tempReg3, tempReg1, 3);
861 VMOV(fpScratchReg, scratchReg);
862 VMOV(fpScratchReg2, scratchReg2);
863 VMOV(fpScratchReg3, scratchReg3);
864 VMOV(fpScratchReg4, tempReg3);
865 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
866 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
867 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
868 VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
869 VCVT(fpScratchReg4, fpScratchReg4, TO_FLOAT);
870
871 VLDR(S12, tempReg2, sizeof(float) * n);
872
873 if (first) {
874 first = false;
875 VMUL(S8, fpScratchReg, S12);
876 VMUL(S9, fpScratchReg2, S12);
877 VMUL(S10, fpScratchReg3, S12);
878 VMUL(S11, fpScratchReg4, S12);
879 } else {
880 VMLA(S8, fpScratchReg, S12);
881 VMLA(S9, fpScratchReg2, S12);
882 VMLA(S10, fpScratchReg3, S12);
883 VMLA(S11, fpScratchReg4, S12);
884 }
885 }
886 }
887
888 Jit_WriteMorphColor(dec_->decFmt.c0off);
889 }
890
891 // First is the left shift, second is the right shift (against walls, to get the RGBA values.)
892 alignas(16) static const s16 color4444Shift[2][4] = {{12, 8, 4, 0}, {-12, -12, -12, -12}};
893
void VertexDecoderJitCache::Jit_Color4444Morph() {
895 const bool useNEON = NEONMorphing;
896 ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
897 MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
898
899 if (useNEON) {
900 MOVP2R(scratchReg, color4444Shift);
901 MOVI2FR(scratchReg2, 255.0f / 15.0f);
902 VDUP(I_32, Q5, scratchReg2);
903 VLD1(I_16, D8, scratchReg, 2, ALIGN_128);
904 } else {
905 MOVI2F(S13, 255.0f / 15.0f, scratchReg);
906 }
907
908 bool first = true;
909 for (int n = 0; n < dec_->morphcount; ++n) {
910 if (useNEON) {
911 VLD1_all_lanes(I_16, neonScratchReg, tempReg1, true);
912 VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
913
914 // Shift against walls and then back to get R, G, B, A in each 16-bit lane.
915 VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D8);
916 VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D9);
917 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
918 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
919 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
920
921 VMUL(F_32, Q3, Q3, Q5);
922
923 if (first) {
924 first = false;
925 VMUL(F_32, Q2, neonScratchRegQ, Q3);
926 } else if (cpu_info.bVFPv4) {
927 VFMA(F_32, Q2, neonScratchRegQ, Q3);
928 } else {
929 VMLA(F_32, Q2, neonScratchRegQ, Q3);
930 }
931 } else {
932 LDRB(scratchReg, tempReg1, 0);
933 ANDI2R(scratchReg2, scratchReg, 0x000F, scratchReg3);
934 VMOV(fpScratchReg, scratchReg2);
935
936 MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 4));
937 VMOV(fpScratchReg2, scratchReg2);
938
939 LDRB(scratchReg, tempReg1, 1);
940 ANDI2R(scratchReg2, scratchReg, 0x000F, scratchReg3);
941 VMOV(fpScratchReg3, scratchReg2);
942
943 MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 4));
944 VMOV(fpScratchReg4, scratchReg2);
945
946 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
947 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
948 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
949 VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
950 VCVT(fpScratchReg4, fpScratchReg4, TO_FLOAT);
951
952 VLDR(S12, tempReg2, sizeof(float) * n);
953 VMUL(S12, S12, S13);
954
955 if (first) {
956 first = false;
957 VMUL(S8, fpScratchReg, S12);
958 VMUL(S9, fpScratchReg2, S12);
959 VMUL(S10, fpScratchReg3, S12);
960 VMUL(S11, fpScratchReg4, S12);
961 } else {
962 VMLA(S8, fpScratchReg, S12);
963 VMLA(S9, fpScratchReg2, S12);
964 VMLA(S10, fpScratchReg3, S12);
965 VMLA(S11, fpScratchReg4, S12);
966 }
967 }
968 }
969
970 Jit_WriteMorphColor(dec_->decFmt.c0off);
971 }
972
973 // First is the left shift, second is the right shift (against walls, to get the RGBA values.)
974 alignas(16) static const s16 color565Shift[2][4] = {{11, 5, 0, 0}, {-11, -10, -11, 0}};
975 alignas(16) static const float byColor565[4] = {255.0f / 31.0f, 255.0f / 63.0f, 255.0f / 31.0f, 0.0f};
976
void VertexDecoderJitCache::Jit_Color565Morph() {
978 const bool useNEON = NEONMorphing;
979 ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
980 MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
981 MOVI2FR(tempReg3, 255.0f);
982
983 if (useNEON) {
984 MOVP2R(scratchReg, color565Shift);
985 MOVP2R(scratchReg2, byColor565);
986 VLD1(I_16, D8, scratchReg, 2, ALIGN_128);
987 VLD1(F_32, D10, scratchReg2, 2, ALIGN_128);
988 } else {
989 MOVI2F(S14, 255.0f / 31.0f, scratchReg);
990 MOVI2F(S15, 255.0f / 63.0f, scratchReg);
991 }
992
993 bool first = true;
994 for (int n = 0; n < dec_->morphcount; ++n) {
995 if (useNEON) {
996 VLD1_all_lanes(I_16, neonScratchReg, tempReg1, true);
997 VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
998
999 VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D8);
1000 VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D9);
1001 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1002 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
1003 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
1004
1005 VMUL(F_32, Q3, Q3, Q5);
1006
1007 if (first) {
1008 first = false;
1009 VMUL(F_32, Q2, neonScratchRegQ, Q3);
1010 } else if (cpu_info.bVFPv4) {
1011 VFMA(F_32, Q2, neonScratchRegQ, Q3);
1012 } else {
1013 VMLA(F_32, Q2, neonScratchRegQ, Q3);
1014 }
1015 } else {
1016 LDRH(scratchReg, tempReg1, 0);
1017 ANDI2R(scratchReg2, scratchReg, 0x001F, scratchReg3);
1018 VMOV(fpScratchReg, scratchReg2);
1019
MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 5));  // Green occupies bits 5-10 in 565.
1021 ANDI2R(scratchReg2, scratchReg2, 0x003F, scratchReg3);
1022 VMOV(fpScratchReg2, scratchReg2);
1023
1024 MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 11));
1025 VMOV(fpScratchReg3, scratchReg2);
1026
1027 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1028 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
1029 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
1030 VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
1031
1032 VLDR(S12, tempReg2, sizeof(float) * n);
1033 VMUL(S13, S12, S15);
1034 VMUL(S12, S12, S14);
1035
1036 if (first) {
1037 first = false;
1038 VMUL(S8, fpScratchReg, S12);
1039 VMUL(S9, fpScratchReg2, S13);
1040 VMUL(S10, fpScratchReg3, S12);
1041 } else {
1042 VMLA(S8, fpScratchReg, S12);
1043 VMLA(S9, fpScratchReg2, S13);
1044 VMLA(S10, fpScratchReg3, S12);
1045 }
1046 }
1047 }
1048
1049 // Overwrite A with 255.0f.
1050 if (useNEON) {
1051 VMOV_neon(F_32, D5, tempReg3, 1);
1052 } else {
1053 VMOV(S11, tempReg3);
1054 }
1055 Jit_WriteMorphColor(dec_->decFmt.c0off, false);
1056 }
1057
1058 // First is the left shift, second is the right shift (against walls, to get the RGBA values.)
1059 alignas(16) static const s16 color5551Shift[2][4] = {{11, 6, 1, 0}, {-11, -11, -11, -15}};
1060 alignas(16) static const float byColor5551[4] = {255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 31.0f, 255.0f / 1.0f};
1061
void VertexDecoderJitCache::Jit_Color5551Morph() {
1063 const bool useNEON = NEONMorphing;
1064 ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
1065 MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
1066
1067 if (useNEON) {
1068 MOVP2R(scratchReg, color5551Shift);
1069 MOVP2R(scratchReg2, byColor5551);
1070 VLD1(I_16, D8, scratchReg, 2, ALIGN_128);
1071 VLD1(F_32, D10, scratchReg2, 2, ALIGN_128);
1072 } else {
1073 MOVI2F(S14, 255.0f / 31.0f, scratchReg);
1074 MOVI2F(S15, 255.0f, scratchReg);
1075 }
1076
1077 bool first = true;
1078 for (int n = 0; n < dec_->morphcount; ++n) {
1079 if (useNEON) {
1080 VLD1_all_lanes(I_16, neonScratchReg, tempReg1, true);
1081 VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
1082
1083 VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D8);
1084 VSHL(I_16 | I_UNSIGNED, neonScratchReg, neonScratchReg, D9);
1085 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1086 VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
1087 VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
1088
1089 VMUL(F_32, Q3, Q3, Q5);
1090
1091 if (first) {
1092 first = false;
1093 VMUL(F_32, Q2, neonScratchRegQ, Q3);
1094 } else if (cpu_info.bVFPv4) {
1095 VFMA(F_32, Q2, neonScratchRegQ, Q3);
1096 } else {
1097 VMLA(F_32, Q2, neonScratchRegQ, Q3);
1098 }
1099 } else {
1100 LDRH(scratchReg, tempReg1, 0);
1101 ANDI2R(scratchReg2, scratchReg, 0x001F, scratchReg3);
1102 VMOV(fpScratchReg, scratchReg2);
1103
1104 MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 5));
1105 ANDI2R(scratchReg2, scratchReg2, 0x001F, scratchReg3);
1106 VMOV(fpScratchReg2, scratchReg2);
1107
1108 MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 10));
1109 ANDI2R(scratchReg2, scratchReg2, 0x001F, scratchReg3);
1110 VMOV(fpScratchReg3, scratchReg2);
1111
1112 MOV(scratchReg2, Operand2(scratchReg, ST_LSR, 15));
1113 VMOV(fpScratchReg4, scratchReg2);
1114
1115 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1116 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
1117 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
1118 VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
1119 VCVT(fpScratchReg4, fpScratchReg4, TO_FLOAT);
1120
1121 VLDR(S12, tempReg2, sizeof(float) * n);
1122 VMUL(S13, S12, S15);
1123 VMUL(S12, S12, S14);
1124
1125 if (first) {
1126 first = false;
1127 VMUL(S8, fpScratchReg, S12);
1128 VMUL(S9, fpScratchReg2, S12);
1129 VMUL(S10, fpScratchReg3, S12);
1130 VMUL(S11, fpScratchReg4, S13);
1131 } else {
1132 VMLA(S8, fpScratchReg, S12);
1133 VMLA(S9, fpScratchReg2, S12);
1134 VMLA(S10, fpScratchReg3, S12);
1135 VMLA(S11, fpScratchReg4, S13);
1136 }
1137 }
1138 }
1139
1140 Jit_WriteMorphColor(dec_->decFmt.c0off);
1141 }
1142
1143 // Expects RGBA color in S8 - S11, which is Q2.
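// The NEON path converts to u32 and narrows twice with saturation (32 -> 16 -> 8 bits),
// leaving the packed RGBA8888 value in lane 0 of D4.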
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
1145 if (NEONMorphing) {
1146 ADDI2R(tempReg1, dstReg, outOff, scratchReg);
1147 VCVT(I_32 | I_UNSIGNED, Q2, Q2);
1148 VQMOVN(I_32 | I_UNSIGNED, D4, Q2);
1149 VQMOVN(I_16 | I_UNSIGNED, D4, Q2);
1150 VST1_lane(I_32, D4, tempReg1, 0, true);
1151 if (checkAlpha) {
1152 VMOV_neon(I_32, scratchReg, D4, 0);
1153 }
1154 } else {
1155 VCVT(S8, S8, TO_INT | ROUND_TO_ZERO);
1156 VCVT(S9, S9, TO_INT | ROUND_TO_ZERO);
1157 VCVT(S10, S10, TO_INT | ROUND_TO_ZERO);
1158 VCVT(S11, S11, TO_INT | ROUND_TO_ZERO);
1159 VMOV(scratchReg, S8);
1160 VMOV(scratchReg2, S9);
1161 VMOV(scratchReg3, S10);
1162 VMOV(tempReg3, S11);
1163 ORR(scratchReg, scratchReg, Operand2(scratchReg2, ST_LSL, 8));
1164 ORR(scratchReg, scratchReg, Operand2(scratchReg3, ST_LSL, 16));
1165 ORR(scratchReg, scratchReg, Operand2(tempReg3, ST_LSL, 24));
1166 STR(scratchReg, dstReg, outOff);
1167 }
1168
1169 // Set flags to determine if alpha != 0xFF.
1170 if (checkAlpha) {
1171 MVNS(tempReg2, Operand2(scratchReg, ST_ASR, 24));
1172 SetCC(CC_NEQ);
1173 MOV(fullAlphaReg, 0);
1174 SetCC(CC_AL);
1175 }
1176 }
1177
void VertexDecoderJitCache::Jit_NormalS8() {
1179 LDRB(tempReg1, srcReg, dec_->nrmoff);
1180 LDRB(tempReg2, srcReg, dec_->nrmoff + 1);
1181 LDRB(tempReg3, srcReg, dec_->nrmoff + 2);
1182 ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
1183 ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
1184 STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
1185
1186 // Copy 3 bytes and then a zero. Might as well copy four.
1187 // LDR(tempReg1, srcReg, dec_->nrmoff);
1188 // ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
1189 // STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
1190 }
1191
1192 // Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
1194 LDRH(tempReg1, srcReg, dec_->nrmoff);
1195 LDRH(tempReg2, srcReg, dec_->nrmoff + 2);
1196 LDRH(tempReg3, srcReg, dec_->nrmoff + 4);
1197 ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
1198 STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
1199 STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 4);
1200 }
1201
void VertexDecoderJitCache::Jit_NormalFloat() {
1203 ADD(scratchReg, srcReg, dec_->nrmoff);
1204 LDMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
1205 ADD(scratchReg, dstReg, dec_->decFmt.nrmoff);
1206 STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
1207 }
1208
1209 // Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
1211 DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
1212 _dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
1213 _dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
1214
1215 // TODO: SIMD
1216 LDRSB(tempReg1, srcReg, dec_->posoff);
1217 LDRSB(tempReg2, srcReg, dec_->posoff + 1);
1218 LDRSB(tempReg3, srcReg, dec_->posoff + 2); // signed?
1219 static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
1220 static const ARMReg fr[3] = { fpScratchReg, fpScratchReg2, fpScratchReg3 };
1221 ADD(scratchReg, dstReg, dec_->decFmt.posoff);
1222 if (cpu_info.bNEON) {
1223 VMOV(neonScratchReg, tempReg1, tempReg2);
1224 VMOV(neonScratchReg2, tempReg3, tempReg3);
1225 VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
1226 VST1(F_32, neonScratchReg, scratchReg, 2, ALIGN_NONE);
1227 } else {
1228 for (int i = 0; i < 3; i++) {
1229 VMOV(fr[i], tr[i]);
1230 VCVT(fr[i], fr[i], TO_FLOAT | IS_SIGNED);
1231 }
1232 VSTMIA(scratchReg, false, fr[0], 3);
1233 }
1234 }
1235
1236 // Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
1238 _dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
1239 _dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
1240
1241 LDRSH(tempReg1, srcReg, dec_->posoff);
1242 LDRSH(tempReg2, srcReg, dec_->posoff + 2);
1243 LDRH(tempReg3, srcReg, dec_->posoff + 4);
1244 static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
1245 static const ARMReg fr[3] = { fpScratchReg, fpScratchReg2, fpScratchReg3 };
1246 ADD(scratchReg, dstReg, dec_->decFmt.posoff);
1247 if (cpu_info.bNEON) {
1248 VMOV(neonScratchReg, tempReg1, tempReg2);
1249 VMOV(neonScratchReg2, tempReg3, tempReg3);
1250 VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
1251 VST1(F_32, neonScratchReg, scratchReg, 2, ALIGN_NONE);
1252 } else {
1253 for (int i = 0; i < 3; i++) {
1254 VMOV(fr[i], tr[i]);
1255 VCVT(fr[i], fr[i], TO_FLOAT | IS_SIGNED);
1256 }
1257 VSTMIA(scratchReg, false, fr[0], 3);
1258 }
1259 }
1260
void VertexDecoderJitCache::Jit_PosS8() {
1262 Jit_AnyS8ToFloat(dec_->posoff);
1263
1264 ADD(scratchReg, dstReg, dec_->decFmt.posoff);
1265 if (NEONSkinning) {
1266 VST1(F_32, srcNEON, scratchReg, 2);
1267 } else {
1268 VSTMIA(scratchReg, false, src[0], 3);
1269 }
1270 }
1271
void VertexDecoderJitCache::Jit_PosS16() {
1273 Jit_AnyS16ToFloat(dec_->posoff);
1274
1275 ADD(scratchReg, dstReg, dec_->decFmt.posoff);
1276 if (NEONSkinning) {
1277 VST1(F_32, srcNEON, scratchReg, 2);
1278 } else {
1279 VSTMIA(scratchReg, false, src[0], 3);
1280 }
1281 }
1282
1283 // Just copy 12 bytes.
void VertexDecoderJitCache::Jit_PosFloat() {
1285 ADD(scratchReg, srcReg, dec_->posoff);
1286 LDMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
1287 ADD(scratchReg, dstReg, dec_->decFmt.posoff);
1288 STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
1289 }
1290
void VertexDecoderJitCache::Jit_NormalS8Skin() {
1292 Jit_AnyS8ToFloat(dec_->nrmoff);
1293 Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1294 }
1295
void VertexDecoderJitCache::Jit_NormalS16Skin() {
1297 Jit_AnyS16ToFloat(dec_->nrmoff);
1298 Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1299 }
1300
void VertexDecoderJitCache::Jit_NormalFloatSkin() {
1302 for (int i = 1; i < 3; ++i) {
1303 _dbg_assert_msg_(src[i - 1] + 1 == src[i], "VertexDecoder src regs must be in order.");
1304 }
1305
1306 ADD(tempReg1, srcReg, dec_->nrmoff);
1307 if (NEONSkinning) {
1308 VLD1(F_32, srcNEON, tempReg1, 2, ALIGN_NONE);
1309 } else {
1310 VLDMIA(tempReg1, false, src[0], 3);
1311 }
1312 Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
1313 }
1314
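// Multiplies the vector in srcNEON / src[0..2] by the blended skin matrix and stores the
// result at dstReg + outOff; when pos is set, the translation row is added as well.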
void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
1316 if (NEONSkinning) {
1317 // Multiply with the matrix sitting in Q4-Q7.
1318 ADD(scratchReg, dstReg, outOff);
1319 VMUL_scalar(F_32, accNEON, Q4, QScalar(srcNEON, 0));
1320 VMLA_scalar(F_32, accNEON, Q5, QScalar(srcNEON, 1));
1321 VMLA_scalar(F_32, accNEON, Q6, QScalar(srcNEON, 2));
1322 if (pos) {
1323 VADD(F_32, accNEON, accNEON, Q7);
1324 }
1325 VST1(F_32, accNEON, scratchReg, 2);
1326 } else {
1327 _dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
1328 _dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
1329
1330 MOVP2R(tempReg1, skinMatrix);
1331 VLDMIA(tempReg1, true, fpScratchReg, 3);
1332 for (int i = 0; i < 3; i++) {
1333 VMUL(acc[i], ARMReg(fpScratchReg + i), src[0]);
1334 }
1335 VLDMIA(tempReg1, true, fpScratchReg, 3);
1336 for (int i = 0; i < 3; i++) {
1337 VMLA(acc[i], ARMReg(fpScratchReg + i), src[1]);
1338 }
1339 VLDMIA(tempReg1, true, fpScratchReg, 3);
1340 for (int i = 0; i < 3; i++) {
1341 VMLA(acc[i], ARMReg(fpScratchReg + i), src[2]);
1342 }
1343 if (pos) {
1344 VLDMIA(tempReg1, true, fpScratchReg, 3);
1345 for (int i = 0; i < 3; i++) {
1346 VADD(acc[i], acc[i], ARMReg(fpScratchReg + i));
1347 }
1348 }
1349 ADD(tempReg1, dstReg, outOff);
1350 VSTMIA(tempReg1, false, acc[0], 3);
1351 }
1352 }
1353
void VertexDecoderJitCache::Jit_PosS8Skin() {
1355 Jit_AnyS8ToFloat(dec_->posoff);
1356 Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1357 }
1358
void VertexDecoderJitCache::Jit_PosS16Skin() {
1360 Jit_AnyS16ToFloat(dec_->posoff);
1361 Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1362 }
1363
void VertexDecoderJitCache::Jit_PosFloatSkin() {
1365 for (int i = 1; i < 3; ++i) {
1366 _dbg_assert_msg_(src[i - 1] + 1 == src[i], "VertexDecoder src regs must be in order.");
1367 }
1368
1369 ADD(tempReg1, srcReg, dec_->posoff);
1370 if (NEONSkinning) {
1371 VLD1(F_32, srcNEON, tempReg1, 2, ALIGN_NONE);
1372 } else {
1373 VLDMIA(tempReg1, false, src[0], 3);
1374 }
1375 Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
1376 }
1377
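// Loads three s8 components (reading a full 32 bits), sign-extends 8 -> 16 -> 32 bits,
// converts to float and scales by 1/128 into srcNEON (NEON) or src[0..2] (VFP).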
void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
1379 if (NEONSkinning) {
1380 ADD(scratchReg, srcReg, srcoff);
1381 VMOV_neon(F_32, Q3, by128);
1382 VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
1383 VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
1384 VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
1385 VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, srcNEON, neonScratchRegQ, Q3);
1387 } else {
1388 LDRSB(tempReg1, srcReg, srcoff);
1389 LDRSB(tempReg2, srcReg, srcoff + 1);
1390 LDRSB(tempReg3, srcReg, srcoff + 2);
1391 VMOV(src[0], tempReg1);
1392 VMOV(src[1], tempReg2);
1393 VMOV(src[2], tempReg3);
1394 MOVI2F(S15, by128, scratchReg);
1395 VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
1396 VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
1397 VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
1398 VMUL(src[0], src[0], S15);
1399 VMUL(src[1], src[1], S15);
1400 VMUL(src[2], src[2], S15);
1401 }
1402 }
1403
void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
1405 if (NEONSkinning) {
1406 ADD(scratchReg, srcReg, srcoff);
1407 VMOV_neon(F_32, Q3, by32768);
1408 VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
1409 VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
1410 VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, srcNEON, neonScratchRegQ, Q3);
1412 } else {
1413 LDRSH(tempReg1, srcReg, srcoff);
1414 LDRSH(tempReg2, srcReg, srcoff + 2);
1415 LDRSH(tempReg3, srcReg, srcoff + 4);
1416 VMOV(src[0], tempReg1);
1417 VMOV(src[1], tempReg2);
1418 VMOV(src[2], tempReg3);
1419 MOVI2F(S15, by32768, scratchReg);
1420 VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
1421 VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
1422 VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
1423 VMUL(src[0], src[0], S15);
1424 VMUL(src[1], src[1], S15);
1425 VMUL(src[2], src[2], S15);
1426 }
1427 }
1428
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
1430 const bool useNEON = NEONMorphing;
1431 ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
1432 MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
1433
1434 if (useNEON) {
1435 MOVI2FR(scratchReg2, by128);
1436 VDUP(I_32, Q5, scratchReg2);
1437 } else {
1438 MOVI2F(S13, by128, scratchReg);
1439 }
1440
1441 bool first = true;
1442 for (int n = 0; n < dec_->morphcount; ++n) {
1443 if (useNEON) {
1444 VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false);
1445 VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
1446
1447 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1448 VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg);
1449 VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);
1450 VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
1451
1452 VMUL(F_32, Q3, Q3, Q5);
1453
1454 if (first) {
1455 first = false;
1456 VMUL(F_32, Q2, neonScratchRegQ, Q3);
1457 } else if (cpu_info.bVFPv4) {
1458 VFMA(F_32, Q2, neonScratchRegQ, Q3);
1459 } else {
1460 VMLA(F_32, Q2, neonScratchRegQ, Q3);
1461 }
1462 } else {
1463 LDRSB(scratchReg, tempReg1, 0);
1464 LDRSB(scratchReg2, tempReg1, 1);
1465 LDRSB(scratchReg3, tempReg1, 2);
1466 VMOV(fpScratchReg, scratchReg);
1467 VMOV(fpScratchReg2, scratchReg2);
1468 VMOV(fpScratchReg3, scratchReg3);
1469 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1470 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
1471 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
1472 VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
1473
1474 VLDR(S12, tempReg2, sizeof(float) * n);
1475 VMUL(S12, S12, S13);
1476
1477 if (first) {
1478 first = false;
1479 VMUL(S8, fpScratchReg, S12);
1480 VMUL(S9, fpScratchReg2, S12);
1481 VMUL(S10, fpScratchReg3, S12);
1482 } else {
1483 VMLA(S8, fpScratchReg, S12);
1484 VMLA(S9, fpScratchReg2, S12);
1485 VMLA(S10, fpScratchReg3, S12);
1486 }
1487 }
1488 }
1489
1490 ADDI2R(tempReg1, dstReg, dstoff, scratchReg);
1491 if (useNEON) {
1492 // TODO: Is it okay that we're over-writing by 4 bytes? Probably...
1493 VSTMIA(tempReg1, false, D4, 2);
1494 } else {
1495 VSTMIA(tempReg1, false, S8, 3);
1496 }
1497 }
1498
void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
1500 const bool useNEON = NEONMorphing;
1501 ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
1502 MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
1503
1504 if (useNEON) {
1505 MOVI2FR(scratchReg, by32768);
1506 VDUP(I_32, Q5, scratchReg);
1507 } else {
1508 MOVI2F(S13, by32768, scratchReg);
1509 }
1510
1511 bool first = true;
1512 for (int n = 0; n < dec_->morphcount; ++n) {
1513 if (useNEON) {
1514 VLD1(I_32, neonScratchReg, tempReg1, 1, ALIGN_NONE);
1515 VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
1516
1517 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1518 VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);
1519 VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
1520
1521 VMUL(F_32, Q3, Q3, Q5);
1522
1523 if (first) {
1524 first = false;
1525 VMUL(F_32, Q2, neonScratchRegQ, Q3);
1526 } else if (cpu_info.bVFPv4) {
1527 VFMA(F_32, Q2, neonScratchRegQ, Q3);
1528 } else {
1529 VMLA(F_32, Q2, neonScratchRegQ, Q3);
1530 }
1531 } else {
1532 LDRSH(scratchReg, tempReg1, 0);
1533 LDRSH(scratchReg2, tempReg1, 2);
1534 LDRSH(scratchReg3, tempReg1, 4);
1535 VMOV(fpScratchReg, scratchReg);
1536 VMOV(fpScratchReg2, scratchReg2);
1537 VMOV(fpScratchReg3, scratchReg3);
1538 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1539 VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
1540 VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
1541 VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
1542
1543 VLDR(S12, tempReg2, sizeof(float) * n);
1544 VMUL(S12, S12, S13);
1545
1546 if (first) {
1547 first = false;
1548 VMUL(S8, fpScratchReg, S12);
1549 VMUL(S9, fpScratchReg2, S12);
1550 VMUL(S10, fpScratchReg3, S12);
1551 } else {
1552 VMLA(S8, fpScratchReg, S12);
1553 VMLA(S9, fpScratchReg2, S12);
1554 VMLA(S10, fpScratchReg3, S12);
1555 }
1556 }
1557 }
1558
1559 ADDI2R(tempReg1, dstReg, dstoff, scratchReg);
1560 if (useNEON) {
1561 // TODO: Is it okay that we're over-writing by 4 bytes? Probably...
1562 VSTMIA(tempReg1, false, D4, 2);
1563 } else {
1564 VSTMIA(tempReg1, false, S8, 3);
1565 }
1566 }
1567
void VertexDecoderJitCache::Jit_AnyFloatMorph(int srcoff, int dstoff) {
1569 const bool useNEON = NEONMorphing;
1570 ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
1571 MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
1572
1573 bool first = true;
1574 for (int n = 0; n < dec_->morphcount; ++n) {
1575 if (useNEON) {
1576 // Load an extra float to stay in NEON mode.
1577 VLD1(F_32, neonScratchRegQ, tempReg1, 2, ALIGN_NONE);
1578 VLD1_all_lanes(F_32, Q3, tempReg2, true, REG_UPDATE);
1579 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1580
1581 if (first) {
1582 first = false;
1583 VMUL(F_32, Q2, neonScratchRegQ, Q3);
1584 } else if (cpu_info.bVFPv4) {
1585 VFMA(F_32, Q2, neonScratchRegQ, Q3);
1586 } else {
1587 VMLA(F_32, Q2, neonScratchRegQ, Q3);
1588 }
1589 } else {
// Load the three floats of this morph frame.
1591 VLDMIA(tempReg1, false, fpScratchReg, 3);
1592 // Using VLDMIA to get writeback.
1593 VLDMIA(tempReg2, true, S12, 1);
1594 ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
1595
1596 if (first) {
1597 first = false;
1598 VMUL(S8, fpScratchReg, S12);
1599 VMUL(S9, fpScratchReg2, S12);
1600 VMUL(S10, fpScratchReg3, S12);
1601 } else {
1602 VMLA(S8, fpScratchReg, S12);
1603 VMLA(S9, fpScratchReg2, S12);
1604 VMLA(S10, fpScratchReg3, S12);
1605 }
1606 }
1607 }
1608
1609 ADDI2R(tempReg1, dstReg, dstoff, scratchReg);
1610 if (useNEON) {
1611 // TODO: Is it okay that we're over-writing by 4 bytes? Probably...
1612 VSTMIA(tempReg1, false, D4, 2);
1613 } else {
1614 VSTMIA(tempReg1, false, S8, 3);
1615 }
1616 }
1617
void VertexDecoderJitCache::Jit_PosS8Morph() {
1619 Jit_AnyS8Morph(dec_->posoff, dec_->decFmt.posoff);
1620 }
1621
void VertexDecoderJitCache::Jit_PosS16Morph() {
1623 Jit_AnyS16Morph(dec_->posoff, dec_->decFmt.posoff);
1624 }
1625
void VertexDecoderJitCache::Jit_PosFloatMorph() {
1627 Jit_AnyFloatMorph(dec_->posoff, dec_->decFmt.posoff);
1628 }
1629
void VertexDecoderJitCache::Jit_NormalS8Morph() {
1631 Jit_AnyS8Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
1632 }
1633
void VertexDecoderJitCache::Jit_NormalS16Morph() {
1635 Jit_AnyS16Morph(dec_->nrmoff, dec_->decFmt.nrmoff);
1636 }
1637
void VertexDecoderJitCache::Jit_NormalFloatMorph() {
1639 Jit_AnyFloatMorph(dec_->nrmoff, dec_->decFmt.nrmoff);
1640 }
1641
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
1643 // See if we find a matching JIT function
1644 for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
1645 if (dec.steps_[step] == jitLookup[i].func) {
1646 ((*this).*jitLookup[i].jitFunc)();
1647 return true;
1648 }
1649 }
1650 return false;
1651 }
1652
1653 #endif // PPSSPP_ARCH(ARM)
1654