1 // Copyright (c) 2012- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include <cstring>
19 
20 #include "ppsspp_config.h"
21 #include "CPUDetect.h"
22 #include "Common.h"
23 
24 #ifdef _M_SSE
25 #include <emmintrin.h>
26 #endif
27 #if PPSSPP_ARCH(ARM_NEON)
28 
29 #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
30 #include <arm64_neon.h>
31 #else
32 #include <arm_neon.h>
33 #endif
34 #endif
35 #include "IndexGenerator.h"
36 
37 // Points don't need indexing...
38 const u8 IndexGenerator::indexedPrimitiveType[7] = {
39 	GE_PRIM_POINTS,
40 	GE_PRIM_LINES,
41 	GE_PRIM_LINES,
42 	GE_PRIM_TRIANGLES,
43 	GE_PRIM_TRIANGLES,
44 	GE_PRIM_TRIANGLES,
45 	GE_PRIM_RECTANGLES,
46 };
47 
Setup(u16 * inds)48 void IndexGenerator::Setup(u16 *inds) {
49 	this->indsBase_ = inds;
50 	Reset();
51 }
52 
AddPrim(int prim,int vertexCount,bool clockwise)53 void IndexGenerator::AddPrim(int prim, int vertexCount, bool clockwise) {
54 	switch (prim) {
55 	case GE_PRIM_POINTS: AddPoints(vertexCount); break;
56 	case GE_PRIM_LINES: AddLineList(vertexCount); break;
57 	case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount); break;
58 	case GE_PRIM_TRIANGLES: AddList(vertexCount, clockwise); break;
59 	case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, clockwise); break;
60 	case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, clockwise); break;
61 	case GE_PRIM_RECTANGLES: AddRectangles(vertexCount); break;  // Same
62 	}
63 }
64 
AddPoints(int numVerts)65 void IndexGenerator::AddPoints(int numVerts) {
66 	u16 *outInds = inds_;
67 	const int startIndex = index_;
68 	for (int i = 0; i < numVerts; i++)
69 		*outInds++ = startIndex + i;
70 	inds_ = outInds;
71 	// ignore overflow verts
72 	index_ += numVerts;
73 	count_ += numVerts;
74 	prim_ = GE_PRIM_POINTS;
75 	seenPrims_ |= 1 << GE_PRIM_POINTS;
76 }
77 
AddList(int numVerts,bool clockwise)78 void IndexGenerator::AddList(int numVerts, bool clockwise) {
79 	u16 *outInds = inds_;
80 	const int startIndex = index_;
81 	const int v1 = clockwise ? 1 : 2;
82 	const int v2 = clockwise ? 2 : 1;
83 	for (int i = 0; i < numVerts; i += 3) {
84 		*outInds++ = startIndex + i;
85 		*outInds++ = startIndex + i + v1;
86 		*outInds++ = startIndex + i + v2;
87 	}
88 	inds_ = outInds;
89 	// ignore overflow verts
90 	index_ += numVerts;
91 	count_ += numVerts;
92 	prim_ = GE_PRIM_TRIANGLES;
93 	seenPrims_ |= 1 << GE_PRIM_TRIANGLES;
94 	if (!clockwise) {
95 		// Make sure we don't treat this as pure.
96 		seenPrims_ |= 1 << GE_PRIM_TRIANGLE_STRIP;
97 	}
98 }
99 
100 alignas(16) static const u16 offsets_clockwise[24] = {
101 	0, (u16)(0 + 1), (u16)(0 + 2),
102 	1, (u16)(1 + 2), (u16)(1 + 1),
103 	2, (u16)(2 + 1), (u16)(2 + 2),
104 	3, (u16)(3 + 2), (u16)(3 + 1),
105 	4, (u16)(4 + 1), (u16)(4 + 2),
106 	5, (u16)(5 + 2), (u16)(5 + 1),
107 	6, (u16)(6 + 1), (u16)(6 + 2),
108 	7, (u16)(7 + 2), (u16)(7 + 1),
109 };
110 
111 alignas(16) static const uint16_t offsets_counter_clockwise[24] = {
112 	0, (u16)(0 + 2), (u16)(0 + 1),
113 	1, (u16)(1 + 1), (u16)(1 + 2),
114 	2, (u16)(2 + 2), (u16)(2 + 1),
115 	3, (u16)(3 + 1), (u16)(3 + 2),
116 	4, (u16)(4 + 2), (u16)(4 + 1),
117 	5, (u16)(5 + 1), (u16)(5 + 2),
118 	6, (u16)(6 + 2), (u16)(6 + 1),
119 	7, (u16)(7 + 1), (u16)(7 + 2),
120 };
121 
AddStrip(int numVerts,bool clockwise)122 void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
123 	int numTris = numVerts - 2;
124 
125 #ifdef _M_SSE
126 	// In an SSE2 register we can fit 8 16-bit integers.
127 	// However, we need to output a multiple of 3 indices.
128 	// The first such multiple is 24, which means we'll generate 24 indices per cycle,
129 	// which corresponds to 8 triangles. That's pretty cool.
130 
131 	// We allow ourselves to write some extra indices to avoid the fallback loop.
132 	// That's alright as we're appending to a buffer - they will get overwritten anyway.
133 	int numChunks = (numTris + 7) / 8;
134 	__m128i ibase8 = _mm_set1_epi16(index_);
135 	__m128i increment = _mm_set1_epi16(8);
136 	const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise);
137 	__m128i offsets0 = _mm_load_si128(offsets);
138 	__m128i offsets1 = _mm_load_si128(offsets + 1);
139 	__m128i offsets2 = _mm_load_si128(offsets + 2);
140 	__m128i *dst = (__m128i *)inds_;
141 	for (int i = 0; i < numChunks; i++) {
142 		_mm_storeu_si128(dst, _mm_add_epi16(ibase8, offsets0));
143 		_mm_storeu_si128(dst + 1, _mm_add_epi16(ibase8, offsets1));
144 		_mm_storeu_si128(dst + 2, _mm_add_epi16(ibase8, offsets2));
145 		ibase8 = _mm_add_epi16(ibase8, increment);
146 		dst += 3;
147 	}
148 	inds_ += numTris * 3;
149 	// wind doesn't need to be updated, an even number of triangles have been drawn.
150 #elif PPSSPP_ARCH(ARM_NEON)
151 	int numChunks = (numTris + 7) / 8;
152 	uint16x8_t ibase8 = vdupq_n_u16(index_);
153 	uint16x8_t increment = vdupq_n_u16(8);
154 	const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise;
155 	uint16x8_t offsets0 = vld1q_u16(offsets);
156 	uint16x8_t offsets1 = vld1q_u16(offsets + 8);
157 	uint16x8_t offsets2 = vld1q_u16(offsets + 16);
158 	u16 *dst = inds_;
159 	for (int i = 0; i < numChunks; i++) {
160 		vst1q_u16(dst, vaddq_u16(ibase8, offsets0));
161 		vst1q_u16(dst + 8, vaddq_u16(ibase8, offsets1));
162 		vst1q_u16(dst + 16, vaddq_u16(ibase8, offsets2));
163 		ibase8 = vaddq_u16(ibase8, increment);
164 		dst += 3 * 8;
165 	}
166 	inds_ += numTris * 3;
167 #else
168 	// Slow fallback loop.
169 	int wind = clockwise ? 1 : 2;
170 	int ibase = index_;
171 	size_t numPairs = numTris / 2;
172 	u16 *outInds = inds_;
173 	while (numPairs > 0) {
174 		*outInds++ = ibase;
175 		*outInds++ = ibase + wind;
176 		*outInds++ = ibase + (wind ^ 3);
177 		*outInds++ = ibase + 1;
178 		*outInds++ = ibase + 1 + (wind ^ 3);
179 		*outInds++ = ibase + 1 + wind;
180 		ibase += 2;
181 		numPairs--;
182 	}
183 	if (numTris & 1) {
184 		*outInds++ = ibase;
185 		*outInds++ = ibase + wind;
186 		wind ^= 3;  // toggle between 1 and 2
187 		*outInds++ = ibase + wind;
188 	}
189 	inds_ = outInds;
190 #endif
191 
192 	index_ += numVerts;
193 	if (numTris > 0)
194 		count_ += numTris * 3;
195 	// This is so we can detect one single strip by just looking at seenPrims_.
196 	if (!seenPrims_ && clockwise) {
197 		seenPrims_ = 1 << GE_PRIM_TRIANGLE_STRIP;
198 		prim_ = GE_PRIM_TRIANGLE_STRIP;
199 		pureCount_ = numVerts;
200 	} else {
201 		seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | (1 << GE_PRIM_TRIANGLES);
202 		prim_ = GE_PRIM_TRIANGLES;
203 		pureCount_ = 0;
204 	}
205 }
206 
AddFan(int numVerts,bool clockwise)207 void IndexGenerator::AddFan(int numVerts, bool clockwise) {
208 	const int numTris = numVerts - 2;
209 	u16 *outInds = inds_;
210 	const int startIndex = index_;
211 	const int v1 = clockwise ? 1 : 2;
212 	const int v2 = clockwise ? 2 : 1;
213 	for (int i = 0; i < numTris; i++) {
214 		*outInds++ = startIndex;
215 		*outInds++ = startIndex + i + v1;
216 		*outInds++ = startIndex + i + v2;
217 	}
218 	inds_ = outInds;
219 	index_ += numVerts;
220 	count_ += numTris * 3;
221 	prim_ = GE_PRIM_TRIANGLES;
222 	seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN;
223 	if (!clockwise) {
224 		// Make sure we don't treat this as pure.
225 		seenPrims_ |= 1 << GE_PRIM_TRIANGLE_STRIP;
226 	}
227 }
228 
229 //Lines
AddLineList(int numVerts)230 void IndexGenerator::AddLineList(int numVerts) {
231 	u16 *outInds = inds_;
232 	const int startIndex = index_;
233 	for (int i = 0; i < numVerts; i += 2) {
234 		*outInds++ = startIndex + i;
235 		*outInds++ = startIndex + i + 1;
236 	}
237 	inds_ = outInds;
238 	index_ += numVerts;
239 	count_ += numVerts;
240 	prim_ = GE_PRIM_LINES;
241 	seenPrims_ |= 1 << prim_;
242 }
243 
AddLineStrip(int numVerts)244 void IndexGenerator::AddLineStrip(int numVerts) {
245 	const int numLines = numVerts - 1;
246 	u16 *outInds = inds_;
247 	const int startIndex = index_;
248 	for (int i = 0; i < numLines; i++) {
249 		*outInds++ = startIndex + i;
250 		*outInds++ = startIndex + i + 1;
251 	}
252 	inds_ = outInds;
253 	index_ += numVerts;
254 	count_ += numLines * 2;
255 	prim_ = GE_PRIM_LINES;
256 	seenPrims_ |= 1 << GE_PRIM_LINE_STRIP;
257 }
258 
AddRectangles(int numVerts)259 void IndexGenerator::AddRectangles(int numVerts) {
260 	u16 *outInds = inds_;
261 	const int startIndex = index_;
262 	//rectangles always need 2 vertices, disregard the last one if there's an odd number
263 	numVerts = numVerts & ~1;
264 	for (int i = 0; i < numVerts; i += 2) {
265 		*outInds++ = startIndex + i;
266 		*outInds++ = startIndex + i + 1;
267 	}
268 	inds_ = outInds;
269 	index_ += numVerts;
270 	count_ += numVerts;
271 	prim_ = GE_PRIM_RECTANGLES;
272 	seenPrims_ |= 1 << GE_PRIM_RECTANGLES;
273 }
274 
275 template <class ITypeLE, int flag>
TranslatePoints(int numInds,const ITypeLE * inds,int indexOffset)276 void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) {
277 	indexOffset = index_ - indexOffset;
278 	u16 *outInds = inds_;
279 	for (int i = 0; i < numInds; i++)
280 		*outInds++ = indexOffset + inds[i];
281 	inds_ = outInds;
282 	count_ += numInds;
283 	prim_ = GE_PRIM_POINTS;
284 	seenPrims_ |= (1 << GE_PRIM_POINTS) | flag;
285 }
286 
287 template <class ITypeLE, int flag>
TranslateLineList(int numInds,const ITypeLE * inds,int indexOffset)288 void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) {
289 	indexOffset = index_ - indexOffset;
290 	u16 *outInds = inds_;
291 	numInds = numInds & ~1;
292 	for (int i = 0; i < numInds; i += 2) {
293 		*outInds++ = indexOffset + inds[i];
294 		*outInds++ = indexOffset + inds[i + 1];
295 	}
296 	inds_ = outInds;
297 	count_ += numInds;
298 	prim_ = GE_PRIM_LINES;
299 	seenPrims_ |= (1 << GE_PRIM_LINES) | flag;
300 }
301 
302 template <class ITypeLE, int flag>
TranslateLineStrip(int numInds,const ITypeLE * inds,int indexOffset)303 void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) {
304 	indexOffset = index_ - indexOffset;
305 	int numLines = numInds - 1;
306 	u16 *outInds = inds_;
307 	for (int i = 0; i < numLines; i++) {
308 		*outInds++ = indexOffset + inds[i];
309 		*outInds++ = indexOffset + inds[i + 1];
310 	}
311 	inds_ = outInds;
312 	count_ += numLines * 2;
313 	prim_ = GE_PRIM_LINES;
314 	seenPrims_ |= (1 << GE_PRIM_LINE_STRIP) | flag;
315 }
316 
317 template <class ITypeLE, int flag>
TranslateList(int numInds,const ITypeLE * inds,int indexOffset,bool clockwise)318 void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
319 	indexOffset = index_ - indexOffset;
320 	// We only bother doing this minor optimization in triangle list, since it's by far the most
321 	// common operation that can benefit.
322 	if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) {
323 		memcpy(inds_, inds, numInds * sizeof(ITypeLE));
324 		inds_ += numInds;
325 		count_ += numInds;
326 	} else {
327 		u16 *outInds = inds_;
328 		int numTris = numInds / 3;  // Round to whole triangles
329 		numInds = numTris * 3;
330 		const int v1 = clockwise ? 1 : 2;
331 		const int v2 = clockwise ? 2 : 1;
332 		for (int i = 0; i < numInds; i += 3) {
333 			*outInds++ = indexOffset + inds[i];
334 			*outInds++ = indexOffset + inds[i + v1];
335 			*outInds++ = indexOffset + inds[i + v2];
336 		}
337 		inds_ = outInds;
338 		count_ += numInds;
339 	}
340 	prim_ = GE_PRIM_TRIANGLES;
341 	seenPrims_ |= (1 << GE_PRIM_TRIANGLES) | flag;
342 }
343 
344 template <class ITypeLE, int flag>
TranslateStrip(int numInds,const ITypeLE * inds,int indexOffset,bool clockwise)345 void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
346 	int wind = clockwise ? 1 : 2;
347 	indexOffset = index_ - indexOffset;
348 	int numTris = numInds - 2;
349 	u16 *outInds = inds_;
350 	for (int i = 0; i < numTris; i++) {
351 		*outInds++ = indexOffset + inds[i];
352 		*outInds++ = indexOffset + inds[i + wind];
353 		wind ^= 3;  // Toggle between 1 and 2
354 		*outInds++ = indexOffset + inds[i + wind];
355 	}
356 	inds_ = outInds;
357 	count_ += numTris * 3;
358 	prim_ = GE_PRIM_TRIANGLES;
359 	seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | flag;
360 }
361 
362 template <class ITypeLE, int flag>
TranslateFan(int numInds,const ITypeLE * inds,int indexOffset,bool clockwise)363 void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
364 	if (numInds <= 0) return;
365 	indexOffset = index_ - indexOffset;
366 	int numTris = numInds - 2;
367 	u16 *outInds = inds_;
368 	const int v1 = clockwise ? 1 : 2;
369 	const int v2 = clockwise ? 2 : 1;
370 	for (int i = 0; i < numTris; i++) {
371 		*outInds++ = indexOffset + inds[0];
372 		*outInds++ = indexOffset + inds[i + v1];
373 		*outInds++ = indexOffset + inds[i + v2];
374 	}
375 	inds_ = outInds;
376 	count_ += numTris * 3;
377 	prim_ = GE_PRIM_TRIANGLES;
378 	seenPrims_ |= (1 << GE_PRIM_TRIANGLE_FAN) | flag;
379 }
380 
381 template <class ITypeLE, int flag>
TranslateRectangles(int numInds,const ITypeLE * inds,int indexOffset)382 inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) {
383 	indexOffset = index_ - indexOffset;
384 	u16 *outInds = inds_;
385 	//rectangles always need 2 vertices, disregard the last one if there's an odd number
386 	numInds = numInds & ~1;
387 	for (int i = 0; i < numInds; i += 2) {
388 		*outInds++ = indexOffset + inds[i];
389 		*outInds++ = indexOffset + inds[i+1];
390 	}
391 	inds_ = outInds;
392 	count_ += numInds;
393 	prim_ = GE_PRIM_RECTANGLES;
394 	seenPrims_ |= (1 << GE_PRIM_RECTANGLES) | flag;
395 }
396 
397 // Could template this too, but would have to define in header.
TranslatePrim(int prim,int numInds,const u8 * inds,int indexOffset,bool clockwise)398 void IndexGenerator::TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset, bool clockwise) {
399 	switch (prim) {
400 	case GE_PRIM_POINTS: TranslatePoints<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break;
401 	case GE_PRIM_LINES: TranslateLineList<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break;
402 	case GE_PRIM_LINE_STRIP: TranslateLineStrip<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break;
403 	case GE_PRIM_TRIANGLES: TranslateList<u8, SEEN_INDEX8>(numInds, inds, indexOffset, clockwise); break;
404 	case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u8, SEEN_INDEX8>(numInds, inds, indexOffset, clockwise); break;
405 	case GE_PRIM_TRIANGLE_FAN: TranslateFan<u8, SEEN_INDEX8>(numInds, inds, indexOffset, clockwise); break;
406 	case GE_PRIM_RECTANGLES: TranslateRectangles<u8, SEEN_INDEX8>(numInds, inds, indexOffset); break;  // Same
407 	}
408 }
409 
TranslatePrim(int prim,int numInds,const u16_le * inds,int indexOffset,bool clockwise)410 void IndexGenerator::TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise) {
411 	switch (prim) {
412 	case GE_PRIM_POINTS: TranslatePoints<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break;
413 	case GE_PRIM_LINES: TranslateLineList<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break;
414 	case GE_PRIM_LINE_STRIP: TranslateLineStrip<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break;
415 	case GE_PRIM_TRIANGLES: TranslateList<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset, clockwise); break;
416 	case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset, clockwise); break;
417 	case GE_PRIM_TRIANGLE_FAN: TranslateFan<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset, clockwise); break;
418 	case GE_PRIM_RECTANGLES: TranslateRectangles<u16_le, SEEN_INDEX16>(numInds, inds, indexOffset); break;  // Same
419 	}
420 }
421 
TranslatePrim(int prim,int numInds,const u32_le * inds,int indexOffset,bool clockwise)422 void IndexGenerator::TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise) {
423 	switch (prim) {
424 	case GE_PRIM_POINTS: TranslatePoints<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break;
425 	case GE_PRIM_LINES: TranslateLineList<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break;
426 	case GE_PRIM_LINE_STRIP: TranslateLineStrip<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break;
427 	case GE_PRIM_TRIANGLES: TranslateList<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset, clockwise); break;
428 	case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset, clockwise); break;
429 	case GE_PRIM_TRIANGLE_FAN: TranslateFan<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset, clockwise); break;
430 	case GE_PRIM_RECTANGLES: TranslateRectangles<u32_le, SEEN_INDEX32>(numInds, inds, indexOffset); break;  // Same
431 	}
432 }
433