1 // Copyright (c) 2012- PPSSPP Project.
2
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 // GNU General Public License 2.0 for more details.
11
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18 #include <cstring>
19
20 #include "ppsspp_config.h"
21 #include "CPUDetect.h"
22 #include "Common.h"
23
24 #ifdef _M_SSE
25 #include <emmintrin.h>
26 #endif
27 #if PPSSPP_ARCH(ARM_NEON)
28
29 #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
30 #include <arm64_neon.h>
31 #else
32 #include <arm_neon.h>
33 #endif
34 #endif
35 #include "IndexGenerator.h"
36
37 // Points don't need indexing...
38 const u8 IndexGenerator::indexedPrimitiveType[7] = {
39 GE_PRIM_POINTS,
40 GE_PRIM_LINES,
41 GE_PRIM_LINES,
42 GE_PRIM_TRIANGLES,
43 GE_PRIM_TRIANGLES,
44 GE_PRIM_TRIANGLES,
45 GE_PRIM_RECTANGLES,
46 };
47
Setup(u16 * inds)48 void IndexGenerator::Setup(u16 *inds) {
49 this->indsBase_ = inds;
50 Reset();
51 }
52
AddPrim(int prim,int vertexCount,bool clockwise)53 void IndexGenerator::AddPrim(int prim, int vertexCount, bool clockwise) {
54 switch (prim) {
55 case GE_PRIM_POINTS: AddPoints(vertexCount); break;
56 case GE_PRIM_LINES: AddLineList(vertexCount); break;
57 case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount); break;
58 case GE_PRIM_TRIANGLES: AddList(vertexCount, clockwise); break;
59 case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, clockwise); break;
60 case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, clockwise); break;
61 case GE_PRIM_RECTANGLES: AddRectangles(vertexCount); break; // Same
62 }
63 }
64
AddPoints(int numVerts)65 void IndexGenerator::AddPoints(int numVerts) {
66 u16 *outInds = inds_;
67 const int startIndex = index_;
68 for (int i = 0; i < numVerts; i++)
69 *outInds++ = startIndex + i;
70 inds_ = outInds;
71 // ignore overflow verts
72 index_ += numVerts;
73 count_ += numVerts;
74 prim_ = GE_PRIM_POINTS;
75 seenPrims_ |= 1 << GE_PRIM_POINTS;
76 }
77
AddList(int numVerts,bool clockwise)78 void IndexGenerator::AddList(int numVerts, bool clockwise) {
79 u16 *outInds = inds_;
80 const int startIndex = index_;
81 const int v1 = clockwise ? 1 : 2;
82 const int v2 = clockwise ? 2 : 1;
83 for (int i = 0; i < numVerts; i += 3) {
84 *outInds++ = startIndex + i;
85 *outInds++ = startIndex + i + v1;
86 *outInds++ = startIndex + i + v2;
87 }
88 inds_ = outInds;
89 // ignore overflow verts
90 index_ += numVerts;
91 count_ += numVerts;
92 prim_ = GE_PRIM_TRIANGLES;
93 seenPrims_ |= 1 << GE_PRIM_TRIANGLES;
94 if (!clockwise) {
95 // Make sure we don't treat this as pure.
96 seenPrims_ |= 1 << GE_PRIM_TRIANGLE_STRIP;
97 }
98 }
99
100 alignas(16) static const u16 offsets_clockwise[24] = {
101 0, (u16)(0 + 1), (u16)(0 + 2),
102 1, (u16)(1 + 2), (u16)(1 + 1),
103 2, (u16)(2 + 1), (u16)(2 + 2),
104 3, (u16)(3 + 2), (u16)(3 + 1),
105 4, (u16)(4 + 1), (u16)(4 + 2),
106 5, (u16)(5 + 2), (u16)(5 + 1),
107 6, (u16)(6 + 1), (u16)(6 + 2),
108 7, (u16)(7 + 2), (u16)(7 + 1),
109 };
110
111 alignas(16) static const uint16_t offsets_counter_clockwise[24] = {
112 0, (u16)(0 + 2), (u16)(0 + 1),
113 1, (u16)(1 + 1), (u16)(1 + 2),
114 2, (u16)(2 + 2), (u16)(2 + 1),
115 3, (u16)(3 + 1), (u16)(3 + 2),
116 4, (u16)(4 + 2), (u16)(4 + 1),
117 5, (u16)(5 + 1), (u16)(5 + 2),
118 6, (u16)(6 + 2), (u16)(6 + 1),
119 7, (u16)(7 + 1), (u16)(7 + 2),
120 };
121
AddStrip(int numVerts,bool clockwise)122 void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
123 int numTris = numVerts - 2;
124
125 #ifdef _M_SSE
126 // In an SSE2 register we can fit 8 16-bit integers.
127 // However, we need to output a multiple of 3 indices.
128 // The first such multiple is 24, which means we'll generate 24 indices per cycle,
129 // which corresponds to 8 triangles. That's pretty cool.
130
131 // We allow ourselves to write some extra indices to avoid the fallback loop.
132 // That's alright as we're appending to a buffer - they will get overwritten anyway.
133 int numChunks = (numTris + 7) / 8;
134 __m128i ibase8 = _mm_set1_epi16(index_);
135 __m128i increment = _mm_set1_epi16(8);
136 const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise);
137 __m128i offsets0 = _mm_load_si128(offsets);
138 __m128i offsets1 = _mm_load_si128(offsets + 1);
139 __m128i offsets2 = _mm_load_si128(offsets + 2);
140 __m128i *dst = (__m128i *)inds_;
141 for (int i = 0; i < numChunks; i++) {
142 _mm_storeu_si128(dst, _mm_add_epi16(ibase8, offsets0));
143 _mm_storeu_si128(dst + 1, _mm_add_epi16(ibase8, offsets1));
144 _mm_storeu_si128(dst + 2, _mm_add_epi16(ibase8, offsets2));
145 ibase8 = _mm_add_epi16(ibase8, increment);
146 dst += 3;
147 }
148 inds_ += numTris * 3;
149 // wind doesn't need to be updated, an even number of triangles have been drawn.
150 #elif PPSSPP_ARCH(ARM_NEON)
151 int numChunks = (numTris + 7) / 8;
152 uint16x8_t ibase8 = vdupq_n_u16(index_);
153 uint16x8_t increment = vdupq_n_u16(8);
154 const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise;
155 uint16x8_t offsets0 = vld1q_u16(offsets);
156 uint16x8_t offsets1 = vld1q_u16(offsets + 8);
157 uint16x8_t offsets2 = vld1q_u16(offsets + 16);
158 u16 *dst = inds_;
159 for (int i = 0; i < numChunks; i++) {
160 vst1q_u16(dst, vaddq_u16(ibase8, offsets0));
161 vst1q_u16(dst + 8, vaddq_u16(ibase8, offsets1));
162 vst1q_u16(dst + 16, vaddq_u16(ibase8, offsets2));
163 ibase8 = vaddq_u16(ibase8, increment);
164 dst += 3 * 8;
165 }
166 inds_ += numTris * 3;
167 #else
168 // Slow fallback loop.
169 int wind = clockwise ? 1 : 2;
170 int ibase = index_;
171 size_t numPairs = numTris / 2;
172 u16 *outInds = inds_;
173 while (numPairs > 0) {
174 *outInds++ = ibase;
175 *outInds++ = ibase + wind;
176 *outInds++ = ibase + (wind ^ 3);
177 *outInds++ = ibase + 1;
178 *outInds++ = ibase + 1 + (wind ^ 3);
179 *outInds++ = ibase + 1 + wind;
180 ibase += 2;
181 numPairs--;
182 }
183 if (numTris & 1) {
184 *outInds++ = ibase;
185 *outInds++ = ibase + wind;
186 wind ^= 3; // toggle between 1 and 2
187 *outInds++ = ibase + wind;
188 }
189 inds_ = outInds;
190 #endif
191
192 index_ += numVerts;
193 if (numTris > 0)
194 count_ += numTris * 3;
195 // This is so we can detect one single strip by just looking at seenPrims_.
196 if (!seenPrims_ && clockwise) {
197 seenPrims_ = 1 << GE_PRIM_TRIANGLE_STRIP;
198 prim_ = GE_PRIM_TRIANGLE_STRIP;
199 pureCount_ = numVerts;
200 } else {
201 seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | (1 << GE_PRIM_TRIANGLES);
202 prim_ = GE_PRIM_TRIANGLES;
203 pureCount_ = 0;
204 }
205 }
206
AddFan(int numVerts,bool clockwise)207 void IndexGenerator::AddFan(int numVerts, bool clockwise) {
208 const int numTris = numVerts - 2;
209 u16 *outInds = inds_;
210 const int startIndex = index_;
211 const int v1 = clockwise ? 1 : 2;
212 const int v2 = clockwise ? 2 : 1;
213 for (int i = 0; i < numTris; i++) {
214 *outInds++ = startIndex;
215 *outInds++ = startIndex + i + v1;
216 *outInds++ = startIndex + i + v2;
217 }
218 inds_ = outInds;
219 index_ += numVerts;
220 count_ += numTris * 3;
221 prim_ = GE_PRIM_TRIANGLES;
222 seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN;
223 if (!clockwise) {
224 // Make sure we don't treat this as pure.
225 seenPrims_ |= 1 << GE_PRIM_TRIANGLE_STRIP;
226 }
227 }
228
229 //Lines
AddLineList(int numVerts)230 void IndexGenerator::AddLineList(int numVerts) {
231 u16 *outInds = inds_;
232 const int startIndex = index_;
233 for (int i = 0; i < numVerts; i += 2) {
234 *outInds++ = startIndex + i;
235 *outInds++ = startIndex + i + 1;
236 }
237 inds_ = outInds;
238 index_ += numVerts;
239 count_ += numVerts;
240 prim_ = GE_PRIM_LINES;
241 seenPrims_ |= 1 << prim_;
242 }
243
AddLineStrip(int numVerts)244 void IndexGenerator::AddLineStrip(int numVerts) {
245 const int numLines = numVerts - 1;
246 u16 *outInds = inds_;
247 const int startIndex = index_;
248 for (int i = 0; i < numLines; i++) {
249 *outInds++ = startIndex + i;
250 *outInds++ = startIndex + i + 1;
251 }
252 inds_ = outInds;
253 index_ += numVerts;
254 count_ += numLines * 2;
255 prim_ = GE_PRIM_LINES;
256 seenPrims_ |= 1 << GE_PRIM_LINE_STRIP;
257 }
258
AddRectangles(int numVerts)259 void IndexGenerator::AddRectangles(int numVerts) {
260 u16 *outInds = inds_;
261 const int startIndex = index_;
262 //rectangles always need 2 vertices, disregard the last one if there's an odd number
263 numVerts = numVerts & ~1;
264 for (int i = 0; i < numVerts; i += 2) {
265 *outInds++ = startIndex + i;
266 *outInds++ = startIndex + i + 1;
267 }
268 inds_ = outInds;
269 index_ += numVerts;
270 count_ += numVerts;
271 prim_ = GE_PRIM_RECTANGLES;
272 seenPrims_ |= 1 << GE_PRIM_RECTANGLES;
273 }
274
275 template <class ITypeLE, int flag>
TranslatePoints(int numInds,const ITypeLE * inds,int indexOffset)276 void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) {
277 indexOffset = index_ - indexOffset;
278 u16 *outInds = inds_;
279 for (int i = 0; i < numInds; i++)
280 *outInds++ = indexOffset + inds[i];
281 inds_ = outInds;
282 count_ += numInds;
283 prim_ = GE_PRIM_POINTS;
284 seenPrims_ |= (1 << GE_PRIM_POINTS) | flag;
285 }
286
287 template <class ITypeLE, int flag>
TranslateLineList(int numInds,const ITypeLE * inds,int indexOffset)288 void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) {
289 indexOffset = index_ - indexOffset;
290 u16 *outInds = inds_;
291 numInds = numInds & ~1;
292 for (int i = 0; i < numInds; i += 2) {
293 *outInds++ = indexOffset + inds[i];
294 *outInds++ = indexOffset + inds[i + 1];
295 }
296 inds_ = outInds;
297 count_ += numInds;
298 prim_ = GE_PRIM_LINES;
299 seenPrims_ |= (1 << GE_PRIM_LINES) | flag;
300 }
301
302 template <class ITypeLE, int flag>
TranslateLineStrip(int numInds,const ITypeLE * inds,int indexOffset)303 void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) {
304 indexOffset = index_ - indexOffset;
305 int numLines = numInds - 1;
306 u16 *outInds = inds_;
307 for (int i = 0; i < numLines; i++) {
308 *outInds++ = indexOffset + inds[i];
309 *outInds++ = indexOffset + inds[i + 1];
310 }
311 inds_ = outInds;
312 count_ += numLines * 2;
313 prim_ = GE_PRIM_LINES;
314 seenPrims_ |= (1 << GE_PRIM_LINE_STRIP) | flag;
315 }
316
317 template <class ITypeLE, int flag>
TranslateList(int numInds,const ITypeLE * inds,int indexOffset,bool clockwise)318 void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
319 indexOffset = index_ - indexOffset;
320 // We only bother doing this minor optimization in triangle list, since it's by far the most
321 // common operation that can benefit.
322 if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) {
323 memcpy(inds_, inds, numInds * sizeof(ITypeLE));
324 inds_ += numInds;
325 count_ += numInds;
326 } else {
327 u16 *outInds = inds_;
328 int numTris = numInds / 3; // Round to whole triangles
329 numInds = numTris * 3;
330 const int v1 = clockwise ? 1 : 2;
331 const int v2 = clockwise ? 2 : 1;
332 for (int i = 0; i < numInds; i += 3) {
333 *outInds++ = indexOffset + inds[i];
334 *outInds++ = indexOffset + inds[i + v1];
335 *outInds++ = indexOffset + inds[i + v2];
336 }
337 inds_ = outInds;
338 count_ += numInds;
339 }
340 prim_ = GE_PRIM_TRIANGLES;
341 seenPrims_ |= (1 << GE_PRIM_TRIANGLES) | flag;
342 }
343
344 template <class ITypeLE, int flag>
TranslateStrip(int numInds,const ITypeLE * inds,int indexOffset,bool clockwise)345 void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
346 int wind = clockwise ? 1 : 2;
347 indexOffset = index_ - indexOffset;
348 int numTris = numInds - 2;
349 u16 *outInds = inds_;
350 for (int i = 0; i < numTris; i++) {
351 *outInds++ = indexOffset + inds[i];
352 *outInds++ = indexOffset + inds[i + wind];
353 wind ^= 3; // Toggle between 1 and 2
354 *outInds++ = indexOffset + inds[i + wind];
355 }
356 inds_ = outInds;
357 count_ += numTris * 3;
358 prim_ = GE_PRIM_TRIANGLES;
359 seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | flag;
360 }
361
362 template <class ITypeLE, int flag>
TranslateFan(int numInds,const ITypeLE * inds,int indexOffset,bool clockwise)363 void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
364 if (numInds <= 0) return;
365 indexOffset = index_ - indexOffset;
366 int numTris = numInds - 2;
367 u16 *outInds = inds_;
368 const int v1 = clockwise ? 1 : 2;
369 const int v2 = clockwise ? 2 : 1;
370 for (int i = 0; i < numTris; i++) {
371 *outInds++ = indexOffset + inds[0];
372 *outInds++ = indexOffset + inds[i + v1];
373 *outInds++ = indexOffset + inds[i + v2];
374 }
375 inds_ = outInds;
376 count_ += numTris * 3;
377 prim_ = GE_PRIM_TRIANGLES;
378 seenPrims_ |= (1 << GE_PRIM_TRIANGLE_FAN) | flag;
379 }
380
381 template <class ITypeLE, int flag>
TranslateRectangles(int numInds,const ITypeLE * inds,int indexOffset)382 inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) {
383 indexOffset = index_ - indexOffset;
384 u16 *outInds = inds_;
385 //rectangles always need 2 vertices, disregard the last one if there's an odd number
386 numInds = numInds & ~1;
387 for (int i = 0; i < numInds; i += 2) {
388 *outInds++ = indexOffset + inds[i];
389 *outInds++ = indexOffset + inds[i+1];
390 }
391 inds_ = outInds;
392 count_ += numInds;
393 prim_ = GE_PRIM_RECTANGLES;
394 seenPrims_ |= (1 << GE_PRIM_RECTANGLES) | flag;
395 }
396
397 // Could template this too, but would have to define in header.
TranslatePrim(int prim,int numInds,const u8 * inds,int indexOffset,bool clockwise)398 void IndexGenerator::TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset, bool clockwise) {
399 switch (prim) {
400 case GE_PRIM_POINTS: TranslatePoints<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break;
401 case GE_PRIM_LINES: TranslateLineList<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break;
402 case GE_PRIM_LINE_STRIP: TranslateLineStrip<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break;
403 case GE_PRIM_TRIANGLES: TranslateList<u8, SEEN_INDEX8>(numInds, inds, indexOffset, clockwise); break;
404 case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u8, SEEN_INDEX8>(numInds, inds, indexOffset, clockwise); break;
405 case GE_PRIM_TRIANGLE_FAN: TranslateFan<u8, SEEN_INDEX8>(numInds, inds, indexOffset, clockwise); break;
406 case GE_PRIM_RECTANGLES: TranslateRectangles<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break; // Same
407 }
408 }
409
TranslatePrim(int prim,int numInds,const u16_le * inds,int indexOffset,bool clockwise)410 void IndexGenerator::TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise) {
411 switch (prim) {
412 case GE_PRIM_POINTS: TranslatePoints<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break;
413 case GE_PRIM_LINES: TranslateLineList<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break;
414 case GE_PRIM_LINE_STRIP: TranslateLineStrip<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break;
415 case GE_PRIM_TRIANGLES: TranslateList<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset, clockwise); break;
416 case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset, clockwise); break;
417 case GE_PRIM_TRIANGLE_FAN: TranslateFan<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset, clockwise); break;
418 case GE_PRIM_RECTANGLES: TranslateRectangles<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break; // Same
419 }
420 }
421
TranslatePrim(int prim,int numInds,const u32_le * inds,int indexOffset,bool clockwise)422 void IndexGenerator::TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise) {
423 switch (prim) {
424 case GE_PRIM_POINTS: TranslatePoints<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break;
425 case GE_PRIM_LINES: TranslateLineList<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break;
426 case GE_PRIM_LINE_STRIP: TranslateLineStrip<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break;
427 case GE_PRIM_TRIANGLES: TranslateList<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset, clockwise); break;
428 case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset, clockwise); break;
429 case GE_PRIM_TRIANGLE_FAN: TranslateFan<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset, clockwise); break;
430 case GE_PRIM_RECTANGLES: TranslateRectangles<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break; // Same
431 }
432 }
433