// Copyright 2015 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include "VideoCommon/VertexLoaderARM64.h"
#include "Common/CommonTypes.h"
#include "VideoCommon/DataReader.h"
#include "VideoCommon/VertexLoaderManager.h"

using namespace Arm64Gen;

constexpr ARM64Reg src_reg = X0;
constexpr ARM64Reg dst_reg = X1;
constexpr ARM64Reg count_reg = W2;
constexpr ARM64Reg skipped_reg = W17;
constexpr ARM64Reg scratch1_reg = W16;
constexpr ARM64Reg scratch2_reg = W15;
constexpr ARM64Reg scratch3_reg = W14;
constexpr ARM64Reg saved_count = W12;

constexpr ARM64Reg stride_reg = X11;
constexpr ARM64Reg arraybase_reg = X10;
constexpr ARM64Reg scale_reg = X9;

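// Dequantization table: entry n holds 1 / 2^n, indexed by the attribute's
// fractional-bit count (the scaling exponent).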
alignas(16) static const float scale_factors[] = {
    1.0 / (1ULL << 0),  1.0 / (1ULL << 1),  1.0 / (1ULL << 2),  1.0 / (1ULL << 3),
    1.0 / (1ULL << 4),  1.0 / (1ULL << 5),  1.0 / (1ULL << 6),  1.0 / (1ULL << 7),
    1.0 / (1ULL << 8),  1.0 / (1ULL << 9),  1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
    1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
    1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
    1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
    1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
    1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
};

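// The loader code is emitted once at construction time; RunVertices() then calls
// the generated code region directly.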
VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_att)
    : VertexLoaderBase(vtx_desc, vtx_att), m_float_emit(this)
{
  if (!IsInitialized())
    return;

  AllocCodeSpace(4096);
  ClearCodeSpace();
  GenerateVertexLoader();
  WriteProtect();
}

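// Computes the source address of an attribute into 'reg'. For indexed attributes the
// index is read from the vertex stream, byteswapped, multiplied by the array stride
// and added to the array base; direct attributes simply use src + offset.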
void VertexLoaderARM64::GetVertexAddr(int array, u64 attribute, ARM64Reg reg)
{
  if (attribute & MASK_INDEXED)
  {
    if (attribute == INDEX8)
    {
      if (m_src_ofs < 4096)
      {
        LDRB(INDEX_UNSIGNED, scratch1_reg, src_reg, m_src_ofs);
      }
      else
      {
        ADD(reg, src_reg, m_src_ofs);
        LDRB(INDEX_UNSIGNED, scratch1_reg, reg, 0);
      }
      m_src_ofs += 1;
    }
    else
    {
      if (m_src_ofs < 256)
      {
        LDURH(scratch1_reg, src_reg, m_src_ofs);
      }
      else if (m_src_ofs <= 8190 && !(m_src_ofs & 1))
      {
        LDRH(INDEX_UNSIGNED, scratch1_reg, src_reg, m_src_ofs);
      }
      else
      {
        ADD(reg, src_reg, m_src_ofs);
        LDRH(INDEX_UNSIGNED, scratch1_reg, reg, 0);
      }
      m_src_ofs += 2;
      REV16(scratch1_reg, scratch1_reg);
    }

    if (array == ARRAY_POSITION)
    {
      EOR(scratch2_reg, scratch1_reg, 0, attribute == INDEX8 ? 7 : 15);  // 0xFF : 0xFFFF
      m_skip_vertex = CBZ(scratch2_reg);
    }

    LDR(INDEX_UNSIGNED, scratch2_reg, stride_reg, array * 4);
    MUL(scratch1_reg, scratch1_reg, scratch2_reg);

    LDR(INDEX_UNSIGNED, EncodeRegTo64(scratch2_reg), arraybase_reg, array * 8);
    ADD(EncodeRegTo64(reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg));
  }
  else
    ADD(reg, src_reg, m_src_ofs);
}

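// Returns an immediate source offset when the attribute is stored directly in the
// vertex and the offset can be encoded; otherwise materializes the address in 'reg'
// via GetVertexAddr() and returns -1.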
s32 VertexLoaderARM64::GetAddressImm(int array, u64 attribute, Arm64Gen::ARM64Reg reg, u32 align)
{
  if (attribute & MASK_INDEXED || (m_src_ofs > 255 && (m_src_ofs & (align - 1))))
    GetVertexAddr(array, attribute, reg);
  else
    return m_src_ofs;
  return -1;
}

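// Loads count_in elements of the given format, sign/zero-extends and byteswaps as
// needed, converts to float (dequantizing by 1 / 2^scaling_exponent when requested),
// and stores count_out floats to the native vertex. Returns the number of source
// bytes read.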
int VertexLoaderARM64::ReadVertex(u64 attribute, int format, int count_in, int count_out,
                                  bool dequantize, u8 scaling_exponent,
                                  AttributeFormat* native_format, s32 offset)
{
  ARM64Reg coords = count_in == 3 ? Q31 : D31;
  ARM64Reg scale = count_in == 3 ? Q30 : D30;

  int elem_size = 1 << (format / 2);
  int load_bytes = elem_size * count_in;
  int load_size =
      load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 8 : 16;
  load_size <<= 3;
  elem_size <<= 3;

  if (offset == -1)
  {
    if (count_in == 1)
      m_float_emit.LDR(elem_size, INDEX_UNSIGNED, coords, EncodeRegTo64(scratch1_reg), 0);
    else
      m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg));
  }
  else if (offset & (load_size - 1))  // Not aligned - unscaled
  {
    m_float_emit.LDUR(load_size, coords, src_reg, offset);
  }
  else
  {
    m_float_emit.LDR(load_size, INDEX_UNSIGNED, coords, src_reg, offset);
  }

  if (format != FORMAT_FLOAT)
  {
    // Extend and convert to float
    switch (format)
    {
    case FORMAT_UBYTE:
      m_float_emit.UXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case FORMAT_BYTE:
      m_float_emit.SXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case FORMAT_USHORT:
      m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case FORMAT_SHORT:
      m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    }

    m_float_emit.SCVTF(32, coords, coords);

    if (dequantize && scaling_exponent)
    {
      m_float_emit.LDR(32, INDEX_UNSIGNED, scale, scale_reg, scaling_exponent * 4);
      m_float_emit.FMUL(32, coords, coords, scale, 0);
    }
  }
  else
  {
    m_float_emit.REV32(8, coords, coords);
  }

  const u32 write_size = count_out == 3 ? 128 : count_out * 32;
  const u32 mask = count_out == 3 ? 0xF : count_out == 2 ? 0x7 : 0x3;
  if (m_dst_ofs < 256)
  {
    m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs);
  }
  else if (!(m_dst_ofs & mask))
  {
    m_float_emit.STR(write_size, INDEX_UNSIGNED, coords, dst_reg, m_dst_ofs);
  }
  else
  {
    ADD(EncodeRegTo64(scratch2_reg), dst_reg, m_dst_ofs);
    m_float_emit.ST1(32, 1, coords, EncodeRegTo64(scratch2_reg));
  }

  // Z-Freeze
  if (native_format == &m_native_vtx_decl.position)
  {
    CMP(count_reg, 3);
    FixupBranch dont_store = B(CC_GT);
    MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_cache);
    ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg), EncodeRegTo64(count_reg),
        ArithOption(EncodeRegTo64(count_reg), ST_LSL, 4));
    m_float_emit.STUR(write_size, coords, EncodeRegTo64(scratch1_reg), -16);
    SetJumpTarget(dont_store);
  }

  native_format->components = count_out;
  native_format->enable = true;
  native_format->offset = m_dst_ofs;
  native_format->type = VAR_FLOAT;
  native_format->integer = false;
  m_dst_ofs += sizeof(float) * count_out;

  if (attribute == DIRECT)
    m_src_ofs += load_bytes;

  return load_bytes;
}

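// Converts a source color of the given format to RGBA8888 and stores it to the
// native vertex, expanding 4/5/6-bit channels by bit replication.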
void VertexLoaderARM64::ReadColor(u64 attribute, int format, s32 offset)
{
  int load_bytes = 0;
  switch (format)
  {
  case FORMAT_24B_888:
  case FORMAT_32B_888x:
  case FORMAT_32B_8888:
    if (offset == -1)
      LDR(INDEX_UNSIGNED, scratch2_reg, EncodeRegTo64(scratch1_reg), 0);
    else if (offset & 3)  // Not aligned - unscaled
      LDUR(scratch2_reg, src_reg, offset);
    else
      LDR(INDEX_UNSIGNED, scratch2_reg, src_reg, offset);

    if (format != FORMAT_32B_8888)
      ORRI2R(scratch2_reg, scratch2_reg, 0xFF000000);
    STR(INDEX_UNSIGNED, scratch2_reg, dst_reg, m_dst_ofs);
    load_bytes = 3 + (format != FORMAT_24B_888);
    break;

  case FORMAT_16B_565:
    //                   RRRRRGGG GGGBBBBB
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
    if (offset == -1)
      LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
    else if (offset & 1)  // Not aligned - unscaled
      LDURH(scratch3_reg, src_reg, offset);
    else
      LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, offset);

    REV16(scratch3_reg, scratch3_reg);

    // B
    AND(scratch2_reg, scratch3_reg, 32, 4);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 3));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 5));
    ORR(scratch1_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16));

    // G
    UBFM(scratch2_reg, scratch3_reg, 5, 10);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8));

    // R
    UBFM(scratch2_reg, scratch3_reg, 11, 15);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 3));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 2));

    // A
    ORRI2R(scratch1_reg, scratch1_reg, 0xFF000000);

    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 2;
    break;

  case FORMAT_16B_4444:
    //                   BBBBAAAA RRRRGGGG
    //           REV16 - RRRRGGGG BBBBAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
    if (offset == -1)
      LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
    else if (offset & 1)  // Not aligned - unscaled
      LDURH(scratch3_reg, src_reg, offset);
    else
      LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, offset);

    // R
    UBFM(scratch1_reg, scratch3_reg, 4, 7);

    // G
    AND(scratch2_reg, scratch3_reg, 32, 3);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8));

    // B
    UBFM(scratch2_reg, scratch3_reg, 12, 15);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16));

    // A
    UBFM(scratch2_reg, scratch3_reg, 8, 11);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 24));

    // Final duplication
    ORR(scratch1_reg, scratch1_reg, scratch1_reg, ArithOption(scratch1_reg, ST_LSL, 4));

    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 2;
    break;

  case FORMAT_24B_6666:
    //          RRRRRRGG GGGGBBBB BBAAAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
    if (offset == -1)
    {
      LDUR(scratch3_reg, EncodeRegTo64(scratch1_reg), -1);
    }
    else
    {
      offset -= 1;
      if (offset & 3)  // Not aligned - unscaled
        LDUR(scratch3_reg, src_reg, offset);
      else
        LDR(INDEX_UNSIGNED, scratch3_reg, src_reg, offset);
    }

    REV32(scratch3_reg, scratch3_reg);

    // A
    UBFM(scratch2_reg, scratch3_reg, 0, 5);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 24));

    // B
    UBFM(scratch2_reg, scratch3_reg, 6, 11);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16));

    // G
    UBFM(scratch2_reg, scratch3_reg, 12, 17);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8));

    // R
    UBFM(scratch2_reg, scratch3_reg, 18, 23);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 4));

    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);

    load_bytes = 3;
    break;
  }
  if (attribute == DIRECT)
    m_src_ofs += load_bytes;
}

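// Emits the per-vertex conversion loop that decodes the enabled attributes (matrix
// indices, position, normals, colors, texture coordinates) into the native vertex
// format.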
void VertexLoaderARM64::GenerateVertexLoader()
{
  // R0 - Source pointer
  // R1 - Destination pointer
  // R2 - Count
  // R30 - LR
  //
  // R0 returns how many vertices were written
  //
  // Registers we don't have to worry about saving:
  // R9-R17 are caller-saved temporaries
  // R18 is a temporary or platform-specific register (iOS)
  //
  // FP/SIMD registers:
  // We can touch all except v8-v15
  // If we need to use those, we need to preserve the lower 64 bits (!) of the register

  const u64 tc[8] = {
      m_VtxDesc.Tex0Coord, m_VtxDesc.Tex1Coord, m_VtxDesc.Tex2Coord, m_VtxDesc.Tex3Coord,
      m_VtxDesc.Tex4Coord, m_VtxDesc.Tex5Coord, m_VtxDesc.Tex6Coord, m_VtxDesc.Tex7Coord,
  };

  bool has_tc = false;
  bool has_tc_scale = false;
  for (int i = 0; i < 8; i++)
  {
    has_tc |= tc[i] != 0;
    has_tc_scale |= !!m_VtxAttr.texCoord[i].Frac;
  }

  bool need_scale =
      (m_VtxAttr.ByteDequant && m_VtxAttr.PosFrac) || (has_tc && has_tc_scale) || m_VtxDesc.Normal;

  AlignCode16();
  if (m_VtxDesc.Position & MASK_INDEXED)
    MOV(skipped_reg, WZR);
  MOV(saved_count, count_reg);

  MOVP2R(stride_reg, g_main_cp_state.array_strides);
  MOVP2R(arraybase_reg, VertexLoaderManager::cached_arraybases);

  if (need_scale)
    MOVP2R(scale_reg, scale_factors);

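  // Top of the per-vertex loop; everything below runs once per input vertex.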
  const u8* loop_start = GetCodePtr();

  if (m_VtxDesc.PosMatIdx)
  {
    LDRB(INDEX_UNSIGNED, scratch1_reg, src_reg, m_src_ofs);
    AND(scratch1_reg, scratch1_reg, 0, 5);
    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);

    // Z-Freeze
    CMP(count_reg, 3);
    FixupBranch dont_store = B(CC_GT);
    MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_matrix_index);
    STR(INDEX_UNSIGNED, scratch1_reg, EncodeRegTo64(scratch2_reg), 0);
    SetJumpTarget(dont_store);

    m_native_components |= VB_HAS_POSMTXIDX;
    m_native_vtx_decl.posmtx.components = 4;
    m_native_vtx_decl.posmtx.enable = true;
    m_native_vtx_decl.posmtx.offset = m_dst_ofs;
    m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
    m_native_vtx_decl.posmtx.integer = true;
    m_src_ofs += sizeof(u8);
    m_dst_ofs += sizeof(u32);
  }

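  // Texture matrix index bytes come next in the source vertex; record their offsets
  // now and read them later, when the matching texture coordinate is written.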
  u32 texmatidx_ofs[8];
  const u64 tm[8] = {
      m_VtxDesc.Tex0MatIdx, m_VtxDesc.Tex1MatIdx, m_VtxDesc.Tex2MatIdx, m_VtxDesc.Tex3MatIdx,
      m_VtxDesc.Tex4MatIdx, m_VtxDesc.Tex5MatIdx, m_VtxDesc.Tex6MatIdx, m_VtxDesc.Tex7MatIdx,
  };
  for (int i = 0; i < 8; i++)
  {
    if (tm[i])
      texmatidx_ofs[i] = m_src_ofs++;
  }

  // Position
  {
    int elem_size = 1 << (m_VtxAttr.PosFormat / 2);
    int load_bytes = elem_size * (m_VtxAttr.PosElements + 2);
    int load_size =
        load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 8 : 16;
    load_size <<= 3;

    s32 offset =
        GetAddressImm(ARRAY_POSITION, m_VtxDesc.Position, EncodeRegTo64(scratch1_reg), load_size);
    int pos_elements = m_VtxAttr.PosElements + 2;
    ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, pos_elements, pos_elements,
               m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position, offset);
  }

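  // Normals: either a single normal or a normal/tangent/binormal triple; with
  // NormalIndex3 set, each of the three vectors has its own index.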
  if (m_VtxDesc.Normal)
  {
    static const u8 map[8] = {7, 6, 15, 14};
    u8 scaling_exponent = map[m_VtxAttr.NormalFormat];

    s32 offset = -1;
    for (int i = 0; i < (m_VtxAttr.NormalElements ? 3 : 1); i++)
    {
      if (!i || m_VtxAttr.NormalIndex3)
      {
        int elem_size = 1 << (m_VtxAttr.NormalFormat / 2);

        int load_bytes = elem_size * 3;
        int load_size = load_bytes == 1 ?
                            1 :
                            load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 8 : 16;

        offset = GetAddressImm(ARRAY_NORMAL, m_VtxDesc.Normal, EncodeRegTo64(scratch1_reg),
                               load_size << 3);

        if (offset == -1)
          ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3);
        else
          offset += i * elem_size * 3;
      }
      int bytes_read = ReadVertex(m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, true,
                                  scaling_exponent, &m_native_vtx_decl.normals[i], offset);

      if (offset == -1)
        ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), bytes_read);
      else
        offset += bytes_read;
    }

    m_native_components |= VB_HAS_NRM0;
    if (m_VtxAttr.NormalElements)
      m_native_components |= VB_HAS_NRM1 | VB_HAS_NRM2;
  }

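  // Vertex colors (up to two channels), converted to RGBA8888 by ReadColor().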
  const u64 col[2] = {m_VtxDesc.Color0, m_VtxDesc.Color1};
  for (int i = 0; i < 2; i++)
  {
    m_native_vtx_decl.colors[i].components = 4;
    m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
    m_native_vtx_decl.colors[i].integer = false;

    if (col[i])
    {
      u32 align = 4;
      if (m_VtxAttr.color[i].Comp == FORMAT_16B_565 || m_VtxAttr.color[i].Comp == FORMAT_16B_4444)
        align = 2;

      s32 offset = GetAddressImm(ARRAY_COLOR + i, col[i], EncodeRegTo64(scratch1_reg), align);
      ReadColor(col[i], m_VtxAttr.color[i].Comp, offset);
      m_native_components |= VB_HAS_COL0 << i;
      m_native_vtx_decl.colors[i].components = 4;
      m_native_vtx_decl.colors[i].enable = true;
      m_native_vtx_decl.colors[i].offset = m_dst_ofs;
      m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
      m_native_vtx_decl.colors[i].integer = false;
      m_dst_ofs += 4;
    }
  }

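  // Texture coordinates. When a texture matrix index is present it is appended as a
  // third float component (with the coordinate zero-filled if the texcoord itself is
  // disabled).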
  for (int i = 0; i < 8; i++)
  {
    m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
    m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
    m_native_vtx_decl.texcoords[i].integer = false;

    int elements = m_VtxAttr.texCoord[i].Elements + 1;
    if (tc[i])
    {
      m_native_components |= VB_HAS_UV0 << i;

      int elem_size = 1 << (m_VtxAttr.texCoord[i].Format / 2);
      int load_bytes = elem_size * (elements + 2);
      int load_size = load_bytes == 1 ?
                          1 :
                          load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 8 : 16;
      load_size <<= 3;

      s32 offset =
          GetAddressImm(ARRAY_TEXCOORD0 + i, tc[i], EncodeRegTo64(scratch1_reg), load_size);
      u8 scaling_exponent = m_VtxAttr.texCoord[i].Frac;
      ReadVertex(tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 2 : elements,
                 m_VtxAttr.ByteDequant, scaling_exponent, &m_native_vtx_decl.texcoords[i], offset);
    }
    if (tm[i])
    {
      m_native_components |= VB_HAS_TEXMTXIDX0 << i;
      m_native_vtx_decl.texcoords[i].components = 3;
      m_native_vtx_decl.texcoords[i].enable = true;
      m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
      m_native_vtx_decl.texcoords[i].integer = false;

      LDRB(INDEX_UNSIGNED, scratch2_reg, src_reg, texmatidx_ofs[i]);
      m_float_emit.UCVTF(S31, scratch2_reg);

      if (tc[i])
      {
        m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs);
        m_dst_ofs += sizeof(float);
      }
      else
      {
        m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;

        if (m_dst_ofs < 256)
        {
          STUR(SP, dst_reg, m_dst_ofs);
        }
        else if (!(m_dst_ofs & 7))
        {
          STR(INDEX_UNSIGNED, SP, dst_reg, m_dst_ofs);
        }
        else
        {
          // If m_dst_ofs isn't 8byte aligned we can't store an 8byte zero register
          // So store two 4byte zero registers
          // The destination is always 4byte aligned
          STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs);
          STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs + 4);
        }
        m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs + 8);

        m_dst_ofs += sizeof(float) * 3;
      }
    }
  }

  // Prepare for the next vertex.
  ADD(dst_reg, dst_reg, m_dst_ofs);
  const u8* cont = GetCodePtr();
  ADD(src_reg, src_reg, m_src_ofs);

  SUB(count_reg, count_reg, 1);
  CBNZ(count_reg, loop_start);

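  // With indexed positions a vertex can be skipped (index 0xFFFF), so the return
  // value is the number of vertices actually written.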
  if (m_VtxDesc.Position & MASK_INDEXED)
  {
    SUB(W0, saved_count, skipped_reg);
    RET(X30);

    SetJumpTarget(m_skip_vertex);
    ADD(skipped_reg, skipped_reg, 1);
    B(cont);
  }
  else
  {
    MOV(W0, saved_count);
    RET(X30);
  }

  FlushIcache();

  m_VertexSize = m_src_ofs;
  m_native_vtx_decl.stride = m_dst_ofs;
}

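// Calls into the generated code region; returns the number of vertices written.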
int VertexLoaderARM64::RunVertices(DataReader src, DataReader dst, int count)
{
  m_numLoadedVertices += count;
  return ((int (*)(u8 * src, u8 * dst, int count)) region)(src.GetPointer(), dst.GetPointer(),
                                                           count);
}