1 #include <assert.h>
2 #include <cstring>
3 #include <sys/stat.h>
4 #include <limits.h>
5 #include <algorithm>
6 #include "GSH_OpenGL.h"
7 #include "StdStream.h"
8 #include "bitmap/BMP.h"
9 #include "../GsPixelFormats.h"
10 
11 /////////////////////////////////////////////////////////////
12 // Texture Loading
13 /////////////////////////////////////////////////////////////
14 
SetupTextureUpdaters()15 void CGSH_OpenGL::SetupTextureUpdaters()
16 {
17 	for(unsigned int i = 0; i < PSM_MAX; i++)
18 	{
19 		m_textureUpdater[i] = &CGSH_OpenGL::TexUpdater_Invalid;
20 	}
21 
22 	m_textureUpdater[PSMCT32] = &CGSH_OpenGL::TexUpdater_Psm32;
23 	m_textureUpdater[PSMCT24] = &CGSH_OpenGL::TexUpdater_Psm32;
24 	m_textureUpdater[PSMCT16] = &CGSH_OpenGL::TexUpdater_Psm16<CGsPixelFormats::CPixelIndexorPSMCT16>;
25 	m_textureUpdater[PSMCT32_UNK] = &CGSH_OpenGL::TexUpdater_Psm32;
26 	m_textureUpdater[PSMCT24_UNK] = &CGSH_OpenGL::TexUpdater_Psm32;
27 	m_textureUpdater[PSMCT16S] = &CGSH_OpenGL::TexUpdater_Psm16<CGsPixelFormats::CPixelIndexorPSMCT16S>;
28 	m_textureUpdater[PSMT8] = &CGSH_OpenGL::TexUpdater_Psm8;
29 	m_textureUpdater[PSMT4] = &CGSH_OpenGL::TexUpdater_Psm4;
30 	// original for quick perf testing
31 	//m_textureUpdater[PSMT4] = &CGSH_OpenGL::TexUpdater_Psm48<CGsPixelFormats::CPixelIndexorPSMT4>;
32 
33 	m_textureUpdater[PSMT8H] = &CGSH_OpenGL::TexUpdater_Psm48H<24, 0xFF>;
34 	m_textureUpdater[PSMT4HL] = &CGSH_OpenGL::TexUpdater_Psm48H<24, 0x0F>;
35 	m_textureUpdater[PSMT4HH] = &CGSH_OpenGL::TexUpdater_Psm48H<28, 0x0F>;
36 }
37 
GetFramebufferBitDepth(uint32 psm)38 uint32 CGSH_OpenGL::GetFramebufferBitDepth(uint32 psm)
39 {
40 	if((psm == PSMCT32) || (psm == PSMCT24))
41 	{
42 		return 32;
43 	}
44 	else if((psm == PSMCT16) || (psm == PSMCT16S))
45 	{
46 		return 16;
47 	}
48 	else
49 	{
50 		assert(false);
51 		return 32;
52 	}
53 }
54 
GetTextureFormatInfo(uint32 psm)55 CGSH_OpenGL::TEXTUREFORMAT_INFO CGSH_OpenGL::GetTextureFormatInfo(uint32 psm)
56 {
57 	switch(psm)
58 	{
59 	case PSMCT32:
60 	case PSMCT24:
61 	case PSMCT32_UNK:
62 	case PSMCT24_UNK:
63 		return TEXTUREFORMAT_INFO{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
64 	case PSMCT16:
65 	case PSMCT16S:
66 		return TEXTUREFORMAT_INFO{GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1};
67 	case PSMT8:
68 	case PSMT4:
69 	case PSMT8H:
70 	case PSMT4HL:
71 	case PSMT4HH:
72 		return TEXTUREFORMAT_INFO{GL_R8, GL_RED, GL_UNSIGNED_BYTE};
73 	default:
74 		assert(false);
75 		return TEXTUREFORMAT_INFO{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
76 	}
77 }
78 
SearchTextureFramebuffer(const TEX0 & tex0)79 CGSH_OpenGL::TEXTURE_INFO CGSH_OpenGL::SearchTextureFramebuffer(const TEX0& tex0)
80 {
81 	TEXTURE_INFO texInfo;
82 	FramebufferPtr framebuffer;
83 
84 	//First pass, look for an exact match
85 	for(const auto& candidateFramebuffer : m_framebuffers)
86 	{
87 		//Case: TEX0 points at the start of a frame buffer with the same width
88 		if(candidateFramebuffer->m_basePtr == tex0.GetBufPtr() &&
89 		   candidateFramebuffer->m_width == tex0.GetBufWidth() &&
90 		   IsCompatibleFramebufferPSM(candidateFramebuffer->m_psm, tex0.nPsm))
91 		{
92 			framebuffer = candidateFramebuffer;
93 			break;
94 		}
95 
96 		//Case: TEX0 point at the start of a frame buffer with the same width
97 		//but uses upper 8-bits (alpha) as an indexed texture (used in Yakuza)
98 		else if(candidateFramebuffer->m_basePtr == tex0.GetBufPtr() &&
99 		        candidateFramebuffer->m_width == tex0.GetBufWidth() &&
100 		        candidateFramebuffer->m_psm == CGSHandler::PSMCT32 &&
101 		        tex0.nPsm == CGSHandler::PSMT8H)
102 		{
103 			framebuffer = candidateFramebuffer;
104 			texInfo.alphaAsIndex = true;
105 			break;
106 		}
107 	}
108 
109 	if(!framebuffer)
110 	{
111 		//Second pass, be a bit more flexible
112 		for(const auto& candidateFramebuffer : m_framebuffers)
113 		{
114 			//Another case: TEX0 is pointing to the start of a page within our framebuffer (BGDA does this)
115 			if(candidateFramebuffer->m_basePtr <= tex0.GetBufPtr() &&
116 			   candidateFramebuffer->m_width == tex0.GetBufWidth() &&
117 			   candidateFramebuffer->m_psm == tex0.nPsm)
118 			{
119 				uint32 framebufferOffset = tex0.GetBufPtr() - candidateFramebuffer->m_basePtr;
120 
121 				//Bail if offset is not aligned on a page boundary
122 				if((framebufferOffset & (CGsPixelFormats::PAGESIZE - 1)) != 0) continue;
123 
124 				auto framebufferPageSize = CGsPixelFormats::GetPsmPageSize(candidateFramebuffer->m_psm);
125 				uint32 framebufferPageCountX = candidateFramebuffer->m_width / framebufferPageSize.first;
126 				uint32 framebufferPageIndex = framebufferOffset / CGsPixelFormats::PAGESIZE;
127 
128 				//Bail if pointed page isn't on the first line
129 				if(framebufferPageIndex >= framebufferPageCountX) continue;
130 
131 				framebuffer = candidateFramebuffer;
132 				texInfo.offsetX = static_cast<float>(framebufferPageIndex * framebufferPageSize.first) / static_cast<float>(candidateFramebuffer->m_width);
133 				break;
134 			}
135 		}
136 	}
137 
138 	if(framebuffer)
139 	{
140 		CommitFramebufferDirtyPages(framebuffer, 0, tex0.GetHeight());
141 		if(m_multisampleEnabled)
142 		{
143 			ResolveFramebufferMultisample(framebuffer, m_fbScale);
144 		}
145 
146 		float scaleRatioX = static_cast<float>(tex0.GetWidth()) / static_cast<float>(framebuffer->m_width);
147 		float scaleRatioY = static_cast<float>(tex0.GetHeight()) / static_cast<float>(framebuffer->m_height);
148 
149 		texInfo.textureHandle = framebuffer->m_texture;
150 		texInfo.scaleRatioX = scaleRatioX;
151 		texInfo.scaleRatioY = scaleRatioY;
152 		return texInfo;
153 	}
154 	else
155 	{
156 		return TEXTURE_INFO();
157 	}
158 }
159 
PrepareTexture(const TEX0 & tex0)160 CGSH_OpenGL::TEXTURE_INFO CGSH_OpenGL::PrepareTexture(const TEX0& tex0)
161 {
162 	auto texInfo = SearchTextureFramebuffer(tex0);
163 	if(texInfo.textureHandle != 0)
164 	{
165 		return texInfo;
166 	}
167 
168 	auto texture = m_textureCache.Search(tex0);
169 	if(!texture)
170 	{
171 		//Validate texture dimensions to prevent problems
172 		auto texWidth = tex0.GetWidth();
173 		auto texHeight = tex0.GetHeight();
174 		assert(texWidth <= 1024);
175 		assert(texHeight <= 1024);
176 		texWidth = std::min<uint32>(texWidth, 1024);
177 		texHeight = std::min<uint32>(texHeight, 1024);
178 		auto texFormat = GetTextureFormatInfo(tex0.nPsm);
179 
180 		{
181 			auto textureHandle = Framework::OpenGl::CTexture::Create();
182 			glBindTexture(GL_TEXTURE_2D, textureHandle);
183 			glTexStorage2D(GL_TEXTURE_2D, 1, texFormat.internalFormat, texWidth, texHeight);
184 			CHECKGLERROR();
185 			m_textureCache.Insert(tex0, std::move(textureHandle));
186 		}
187 
188 		texture = m_textureCache.Search(tex0);
189 		texture->m_cachedArea.Invalidate(0, RAMSIZE);
190 	}
191 
192 	texInfo.textureHandle = texture->m_textureHandle;
193 
194 	glBindTexture(GL_TEXTURE_2D, texture->m_textureHandle);
195 	auto& cachedArea = texture->m_cachedArea;
196 	auto texturePageSize = CGsPixelFormats::GetPsmPageSize(tex0.nPsm);
197 	auto areaRect = cachedArea.GetAreaPageRect();
198 
199 	while(cachedArea.HasDirtyPages())
200 	{
201 		auto dirtyRect = cachedArea.GetDirtyPageRect();
202 		assert((dirtyRect.width != 0) && (dirtyRect.height != 0));
203 		cachedArea.ClearDirtyPages(dirtyRect);
204 
205 		uint32 texX = dirtyRect.x * texturePageSize.first;
206 		uint32 texY = dirtyRect.y * texturePageSize.second;
207 		uint32 texWidth = dirtyRect.width * texturePageSize.first;
208 		uint32 texHeight = dirtyRect.height * texturePageSize.second;
209 		if(texX >= tex0.GetWidth()) continue;
210 		if(texY >= tex0.GetHeight()) continue;
211 		//assert(texX < tex0.GetWidth());
212 		//assert(texY < tex0.GetHeight());
213 		if((texX + texWidth) > tex0.GetWidth())
214 		{
215 			texWidth = tex0.GetWidth() - texX;
216 		}
217 		if((texY + texHeight) > tex0.GetHeight())
218 		{
219 			texHeight = tex0.GetHeight() - texY;
220 		}
221 		((this)->*(m_textureUpdater[tex0.nPsm]))(tex0.GetBufPtr(), tex0.nBufWidth, texX, texY, texWidth, texHeight);
222 	}
223 
224 	cachedArea.ClearDirtyPages();
225 
226 	return texInfo;
227 }
228 
PreparePalette(const TEX0 & tex0)229 GLuint CGSH_OpenGL::PreparePalette(const TEX0& tex0)
230 {
231 	GLuint textureHandle = PalCache_Search(tex0);
232 	if(textureHandle != 0)
233 	{
234 		return textureHandle;
235 	}
236 
237 	std::array<uint32, 256> convertedClut;
238 	MakeLinearCLUT(tex0, convertedClut);
239 
240 	unsigned int entryCount = CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm) ? 16 : 256;
241 	textureHandle = PalCache_Search(entryCount, convertedClut.data());
242 	if(textureHandle != 0)
243 	{
244 		return textureHandle;
245 	}
246 
247 	glGenTextures(1, &textureHandle);
248 	glBindTexture(GL_TEXTURE_2D, textureHandle);
249 	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, entryCount, 1, 0, GL_RGBA, GL_UNSIGNED_BYTE, convertedClut.data());
250 
251 	PalCache_Insert(tex0, convertedClut.data(), textureHandle);
252 
253 	return textureHandle;
254 }
255 
DumpTexture(unsigned int nWidth,unsigned int nHeight,uint32 checksum)256 void CGSH_OpenGL::DumpTexture(unsigned int nWidth, unsigned int nHeight, uint32 checksum)
257 {
258 #ifdef _WIN32
259 	char sFilename[256];
260 
261 	for(unsigned int i = 0; i < UINT_MAX; i++)
262 	{
263 		struct _stat Stat;
264 		sprintf(sFilename, "./textures/tex_%08X_%08X.bmp", i, checksum);
265 		if(_stat(sFilename, &Stat) == -1) break;
266 	}
267 
268 	Framework::CBitmap bitmap(nWidth, nHeight, 32);
269 
270 	glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_BYTE, bitmap.GetPixels());
271 	Framework::CStdStream outputStream(fopen(sFilename, "wb"));
272 	Framework::CBMP::WriteBitmap(bitmap, outputStream);
273 #endif
274 }
275 
TexUpdater_Invalid(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)276 void CGSH_OpenGL::TexUpdater_Invalid(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
277 {
278 	assert(0);
279 }
280 
TexUpdater_Psm32(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)281 void CGSH_OpenGL::TexUpdater_Psm32(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
282 {
283 	CGsPixelFormats::CPixelIndexorPSMCT32 indexor(m_pRAM, bufPtr, bufWidth);
284 
285 	uint32* dst = reinterpret_cast<uint32*>(m_pCvtBuffer);
286 	for(unsigned int y = 0; y < texHeight; y++)
287 	{
288 		for(unsigned int x = 0; x < texWidth; x++)
289 		{
290 			dst[x] = indexor.GetPixel(texX + x, texY + y);
291 		}
292 
293 		dst += texWidth;
294 	}
295 
296 	glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_BYTE, m_pCvtBuffer);
297 	CHECKGLERROR();
298 }
299 
300 template <typename IndexorType>
TexUpdater_Psm16(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)301 void CGSH_OpenGL::TexUpdater_Psm16(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
302 {
303 	IndexorType indexor(m_pRAM, bufPtr, bufWidth);
304 
305 	auto dst = reinterpret_cast<uint16*>(m_pCvtBuffer);
306 	for(unsigned int y = 0; y < texHeight; y++)
307 	{
308 		for(unsigned int x = 0; x < texWidth; x++)
309 		{
310 			auto pixel = indexor.GetPixel(texX + x, texY + y);
311 			auto cvtPixel =
312 			    (((pixel & 0x001F) >> 0) << 11) | //R
313 			    (((pixel & 0x03E0) >> 5) << 6) |  //G
314 			    (((pixel & 0x7C00) >> 10) << 1) | //B
315 			    (pixel >> 15);                    //A
316 			dst[x] = cvtPixel;
317 		}
318 
319 		dst += texWidth;
320 	}
321 
322 	glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1, m_pCvtBuffer);
323 	CHECKGLERROR();
324 }
325 
326 #ifdef _WIN32
327 #define USE_SSE
328 #elif defined(__APPLE__)
329 #include <TargetConditionals.h>
330 #if TARGET_CPU_X86_64
331 #define USE_SSE
332 #elif TARGET_CPU_ARM64
333 #define USE_NEON
334 #endif
335 #elif defined(__ANDROID__) || defined(__linux__) || defined(__FreeBSD__)
336 #if defined(__x86_64__) || defined(__i386__)
337 #define USE_SSE
338 #elif defined(__aarch64__) || defined(__arm__)
339 #define USE_NEON
340 #endif
341 #endif
342 
343 #if defined(USE_SSE)
344 #include <xmmintrin.h>
345 #include <emmintrin.h>
346 #include <tmmintrin.h>
347 
// Rearranges one 64-byte column, supplied as four 128-bit registers (a, b, c, d), into
// four 16-byte rows and stores them destStride bytes apart starting at dest.
// - colNum: index of the column within its block; only its parity is used.
// NOTE(review): presumably this undoes the PS2 GS column swizzle for 8-bit textures -
// confirm against the GS memory layout documentation.
// NOTE(review): rows are written through an __m128i pointer indexed in 16-byte units, so
// dest is presumably expected to be 16-byte aligned and destStride a multiple of 16 - confirm.
void convertColumn8(uint8* dest, const int destStride, int colNum, __m128i a, __m128i b, __m128i c, __m128i d)
{
	__m128i* mdest = (__m128i*)dest;

	__m128i temp_a = a;
	__m128i temp_c = c;

	// Stage 1: interleave bytes of (a, b) and of (c, d)
	a = _mm_unpacklo_epi8(temp_a, b);
	c = _mm_unpackhi_epi8(temp_a, b);
	b = _mm_unpacklo_epi8(temp_c, d);
	d = _mm_unpackhi_epi8(temp_c, d);

	temp_a = a;
	temp_c = c;

	// Stage 2: interleave 16-bit pairs
	a = _mm_unpacklo_epi16(temp_a, b);
	c = _mm_unpackhi_epi16(temp_a, b);
	b = _mm_unpacklo_epi16(temp_c, d);
	d = _mm_unpackhi_epi16(temp_c, d);

	temp_a = a;
	__m128i temp_b = b;

	// Stage 3: interleave bytes again
	a = _mm_unpacklo_epi8(temp_a, c);
	b = _mm_unpackhi_epi8(temp_a, c);
	c = _mm_unpacklo_epi8(temp_b, d);
	d = _mm_unpackhi_epi8(temp_b, d);

	temp_a = a;
	temp_c = c;

	// Stage 4: combine 64-bit halves, a, b, c, d now hold the four output rows
	a = _mm_unpacklo_epi64(temp_a, b);
	c = _mm_unpackhi_epi64(temp_a, b);
	b = _mm_unpacklo_epi64(temp_c, d);
	d = _mm_unpackhi_epi64(temp_c, d);

	// _MM_SHUFFLE(2, 3, 0, 1) swaps the two 32-bit lanes inside each 64-bit half.
	// Even columns need it on the last two rows, odd columns on the first two.
	if((colNum & 1) == 0)
	{
		c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1));
		d = _mm_shuffle_epi32(d, _MM_SHUFFLE(2, 3, 0, 1));
	}
	else
	{
		a = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
		b = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 3, 0, 1));
	}

	// Row stride expressed in __m128i (16-byte) units
	int mStride = destStride / 16;

	mdest[0] = a;
	mdest[mStride] = b;
	mdest[mStride * 2] = c;
	mdest[mStride * 3] = d;
}
402 
convertColumn8(uint8 * dest,const int destStride,uint8 * src,int colNum)403 inline void convertColumn8(uint8* dest, const int destStride, uint8* src, int colNum)
404 {
405 	__m128i* mSrc = (__m128i*)src;
406 
407 	__m128i a = mSrc[0];
408 	__m128i b = mSrc[1];
409 	__m128i c = mSrc[2];
410 	__m128i d = mSrc[3];
411 	convertColumn8(dest, destStride, colNum, a, b, c, d);
412 }
413 
convertColumn4(uint8 * dest,const int destStride,uint8 * src,int colNum)414 inline void convertColumn4(uint8* dest, const int destStride, uint8* src, int colNum)
415 {
416 	__m128i* mSrc = (__m128i*)src;
417 
418 	__m128i a = mSrc[0];
419 	__m128i b = mSrc[1];
420 	__m128i c = mSrc[2];
421 	__m128i d = mSrc[3];
422 
423 	// 4 bpp looks like 2 8bpp columns side by side.
424 	// The 4pp are expanded to 8bpp.
425 	// so 01 23 45 67 89 ab cd ef gh ij kl mn op qr st uv expands to
426 	// 00 01 02 03 08 09 0a 0b 0g 0h 0i 0j 0o 0p 0q 0r as the first row on the left hand block.
427 
428 	__m128i perm = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 0x0c, 0x0d, 2, 3, 6, 7, 0x0a, 0x0b, 0x0e, 0x0f);
429 	a = _mm_shuffle_epi8(a, perm);
430 	b = _mm_shuffle_epi8(b, perm);
431 	c = _mm_shuffle_epi8(c, perm);
432 	d = _mm_shuffle_epi8(d, perm);
433 
434 	__m128i a_orig = a;
435 
436 	const __m128i mask = _mm_set1_epi32(0x0f0f0f0f);
437 	const __m128i shiftCount = _mm_set_epi32(0, 0, 0, 4);
438 	__m128i lowNybbles = _mm_and_si128(a, mask);
439 	__m128i highNybbles = _mm_and_si128(_mm_srl_epi32(a, shiftCount), mask);
440 	a = _mm_unpacklo_epi8(lowNybbles, highNybbles);
441 	__m128i a2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
442 
443 	lowNybbles = _mm_and_si128(b, mask);
444 	highNybbles = _mm_and_si128(_mm_srl_epi32(b, shiftCount), mask);
445 	b = _mm_unpacklo_epi8(lowNybbles, highNybbles);
446 	__m128i b2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
447 
448 	lowNybbles = _mm_and_si128(c, mask);
449 	highNybbles = _mm_and_si128(_mm_srl_epi32(c, shiftCount), mask);
450 	c = _mm_unpacklo_epi8(lowNybbles, highNybbles);
451 	__m128i c2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
452 
453 	lowNybbles = _mm_and_si128(d, mask);
454 	highNybbles = _mm_and_si128(_mm_srl_epi32(d, shiftCount), mask);
455 	d = _mm_unpacklo_epi8(lowNybbles, highNybbles);
456 	__m128i d2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
457 
458 	convertColumn8(dest, destStride, colNum, a, b, c, d);
459 	if(destStride > 16)
460 	{
461 		convertColumn8(dest + 16, destStride, colNum, a2, b2, c2, d2);
462 	}
463 }
464 
465 #elif defined(USE_NEON)
466 #include <arm_neon.h>
467 
// Rearranges one column, supplied as four 16-byte vectors (data.val[0..3]), into four
// 16-byte rows and stores them destStride bytes apart starting at dest.
// - colNum: index of the column within its block; only its parity is used.
// NOTE(review): presumably this undoes the PS2 GS column swizzle for 8-bit textures -
// confirm against the GS memory layout documentation.
inline void convertColumn8(uint8x16x4_t data, uint8* dest, const int destStride, int colNum)
{
	// vmovn_u32 keeps the low 16 bits of every 32-bit lane: row0 is made of the low
	// halves of val[0] followed by the low halves of val[2]
	uint16x8_t row0 = vcombine_u16(vmovn_u32(vreinterpretq_u32_u8(data.val[0])), vmovn_u32(vreinterpretq_u32_u8(data.val[2])));
	// vrev32q_u16 swaps the 16-bit halves of every 32-bit lane, so narrowing afterwards
	// picks the original high halves: row1 is made of the high halves of val[0] and val[2]
	uint16x8_t revr0 = vrev32q_u16(vreinterpretq_u16_u8(data.val[0]));
	uint16x8_t revr2 = vrev32q_u16(vreinterpretq_u16_u8(data.val[2]));
	uint16x8_t row1 = vcombine_u16(vmovn_u32(vreinterpretq_u32_u16(revr0)), vmovn_u32(vreinterpretq_u32_u16(revr2)));

	// Same treatment for val[1] and val[3], giving row2 (low halves) and row3 (high halves)
	uint16x8_t row2 = vcombine_u16(vmovn_u32(vreinterpretq_u32_u8(data.val[1])), vmovn_u32(vreinterpretq_u32_u8(data.val[3])));
	uint16x8_t revr1 = vrev32q_u16(vreinterpretq_u16_u8(data.val[1]));
	uint16x8_t revr3 = vrev32q_u16(vreinterpretq_u16_u8(data.val[3]));
	uint16x8_t row3 = vcombine_u16(vmovn_u32(vreinterpretq_u32_u16(revr1)), vmovn_u32(vreinterpretq_u32_u16(revr3)));

	// vrev64q_u32 swaps the two 32-bit lanes inside each 64-bit half.
	// Even columns need it on the last two rows, odd columns on the first two.
	if((colNum & 1) == 0)
	{
		row2 = vreinterpretq_u16_u32(vrev64q_u32(vreinterpretq_u32_u16(row2)));
		row3 = vreinterpretq_u16_u32(vrev64q_u32(vreinterpretq_u32_u16(row3)));
	}
	else
	{
		row0 = vreinterpretq_u16_u32(vrev64q_u32(vreinterpretq_u32_u16(row0)));
		row1 = vreinterpretq_u16_u32(vrev64q_u32(vreinterpretq_u32_u16(row1)));
	}

	// One 16-byte row per store, destStride bytes apart
	vst1q_u8(dest, vreinterpretq_u8_u16(row0));
	vst1q_u8(dest + destStride, vreinterpretq_u8_u16(row1));
	vst1q_u8(dest + 2 * destStride, vreinterpretq_u8_u16(row2));
	vst1q_u8(dest + 3 * destStride, vreinterpretq_u8_u16(row3));
}
496 
// Loads the 64-byte column found at src and forwards it to the register based
// convertColumn8 overload.
inline void convertColumn8(uint8* dest, const int destStride, uint8* src, int colNum)
{
	// vld4q_u8 reads the whole column and de-interleaves it in one go
	convertColumn8(vld4q_u8(src), dest, destStride, colNum);
}
503 
// Expands the 64-byte column of packed 4-bit values found at src into one byte per value
// and hands the result to convertColumn8 as two 8-bit columns: the first is written at
// dest, the second at dest + 16. The second one is skipped when destStride is 16 or less.
inline void convertColumn4(uint8* dest, const int destStride, uint8* src, int colNum)
{
	// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics

	const uint8x16x4_t packed = vld4q_u8(src);
	const auto nybbleMask = vdupq_n_u8(0x0F);

	// First 8-bit column: low then high nybbles of de-interleaved vectors 0 and 1
	uint8x16x4_t expanded;
	expanded.val[0] = vandq_u8(packed.val[0], nybbleMask);
	expanded.val[1] = vshrq_n_u8(packed.val[0], 4);
	expanded.val[2] = vandq_u8(packed.val[1], nybbleMask);
	expanded.val[3] = vshrq_n_u8(packed.val[1], 4);
	convertColumn8(expanded, dest, destStride, colNum);

	if(destStride <= 16)
	{
		return;
	}

	// Second 8-bit column: same treatment for de-interleaved vectors 2 and 3
	expanded.val[0] = vandq_u8(packed.val[2], nybbleMask);
	expanded.val[1] = vshrq_n_u8(packed.val[2], 4);
	expanded.val[2] = vandq_u8(packed.val[3], nybbleMask);
	expanded.val[3] = vshrq_n_u8(packed.val[3], 4);
	convertColumn8(expanded, dest + 16, destStride, colNum);
}
539 
540 #else
541 /*
542 // If we have a platform that does not have SIMD then implement the basic case here.
543 void convertColumn8(uint8* dest, const int destStride, uint8* src, int colNum)
544 {
545 
546 }
547 */
548 #endif
549 
TexUpdater_Psm8(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)550 void CGSH_OpenGL::TexUpdater_Psm8(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
551 {
552 	if(texWidth < 16)
553 	{
554 		// Widths are powers of 2, so anything over 16 will be an integral number of columns wide.
555 		// Note: for small textures it still may be a win to do the SIMD swizzle and then cut out the sub-region to
556 		// correct the row stride.
557 		return CGSH_OpenGL::TexUpdater_Psm48<CGsPixelFormats::CPixelIndexorPSMT8>(bufPtr, bufWidth, texX, texY, texWidth, texHeight);
558 	}
559 
560 	CGsPixelFormats::CPixelIndexorPSMT8 indexor(m_pRAM, bufPtr, bufWidth);
561 	uint8* dst = m_pCvtBuffer;
562 	for(unsigned int y = 0; y < texHeight; y += 16)
563 	{
564 		for(unsigned int x = 0; x < texWidth; x += 16)
565 		{
566 			uint8* colDst = dst;
567 			uint8* src = indexor.GetPixelAddress(texX + x, texY + y);
568 
569 			// process an entire 16x16 block.
570 			// A column (64 bytes) is 16x4 pixels and they stack vertically in a block
571 
572 			int colNum = 0;
573 			for(unsigned int coly = 0; coly < 16; coly += 4)
574 			{
575 				convertColumn8(colDst + x, texWidth, src, colNum++);
576 				src += 64;
577 				colDst += texWidth * 4;
578 			}
579 		}
580 
581 		dst += texWidth * 16;
582 	}
583 
584 	glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
585 	CHECKGLERROR();
586 }
587 
TexUpdater_Psm4(unsigned int bufPtr,unsigned int bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)588 void CGSH_OpenGL::TexUpdater_Psm4(unsigned int bufPtr, unsigned int bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
589 {
590 	if(texWidth < 16)
591 	{
592 		// Widths are powers of 2, so anything over 32 will be an integral number of columns wide.
593 		// 16 wide textures are dealt with as a special case in the SIMD code.
594 		// Note: for small textures it still may be a win to do the SIMD swizzle and then cut out the sub-region to
595 		// correct the row stride.
596 		return CGSH_OpenGL::TexUpdater_Psm48<CGsPixelFormats::CPixelIndexorPSMT4>(bufPtr, bufWidth, texX, texY, texWidth, texHeight);
597 	}
598 
599 	CGsPixelFormats::CPixelIndexorPSMT4 indexor(m_pRAM, bufPtr, bufWidth);
600 
601 	uint8* dst = m_pCvtBuffer;
602 	for(unsigned int y = 0; y < texHeight; y += 16)
603 	{
604 		for(unsigned int x = 0; x < texWidth; x += 32)
605 		{
606 			uint8* colDst = dst + x;
607 			unsigned int nx = texX + x;
608 			unsigned int ny = texY + y;
609 			uint32 colAddr = indexor.GetColumnAddress(nx, ny);
610 			uint8* src = m_pRAM + colAddr;
611 
612 			// process an entire 32x16 block.
613 			// A column (64 bytes) is 32x4 pixels and they stack vertically in a block
614 
615 			for(unsigned int colNum = 0; colNum < 4; ++colNum)
616 			{
617 				convertColumn4(colDst, texWidth, src, colNum);
618 				src += 64;
619 				colDst += texWidth * 4;
620 			}
621 		}
622 
623 		dst += texWidth * 16;
624 	}
625 	glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
626 	CHECKGLERROR();
627 }
628 
629 template <typename IndexorType>
TexUpdater_Psm48(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)630 void CGSH_OpenGL::TexUpdater_Psm48(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
631 {
632 	IndexorType indexor(m_pRAM, bufPtr, bufWidth);
633 
634 	uint8* dst = m_pCvtBuffer;
635 	for(unsigned int y = 0; y < texHeight; y++)
636 	{
637 		for(unsigned int x = 0; x < texWidth; x++)
638 		{
639 			uint8 pixel = indexor.GetPixel(texX + x, texY + y);
640 			dst[x] = pixel;
641 		}
642 
643 		dst += texWidth;
644 	}
645 
646 	glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
647 	CHECKGLERROR();
648 }
649 
650 template <uint32 shiftAmount, uint32 mask>
TexUpdater_Psm48H(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)651 void CGSH_OpenGL::TexUpdater_Psm48H(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
652 {
653 	CGsPixelFormats::CPixelIndexorPSMCT32 indexor(m_pRAM, bufPtr, bufWidth);
654 
655 	uint8* dst = m_pCvtBuffer;
656 	for(unsigned int y = 0; y < texHeight; y++)
657 	{
658 		for(unsigned int x = 0; x < texWidth; x++)
659 		{
660 			uint32 pixel = indexor.GetPixel(texX + x, texY + y);
661 			pixel = (pixel >> shiftAmount) & mask;
662 			dst[x] = static_cast<uint8>(pixel);
663 		}
664 
665 		dst += texWidth;
666 	}
667 
668 	glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
669 	CHECKGLERROR();
670 }
671 
672 /////////////////////////////////////////////////////////////
673 // Palette
674 /////////////////////////////////////////////////////////////
675 
CPalette()676 CGSH_OpenGL::CPalette::CPalette()
677     : m_live(false)
678     , m_isIDTEX4(false)
679     , m_cpsm(0)
680     , m_csa(0)
681     , m_texture(0)
682 {
683 }
684 
//Releases the GL texture held by this palette, if any (see Free).
CGSH_OpenGL::CPalette::~CPalette()
{
	Free();
}
689 
Free()690 void CGSH_OpenGL::CPalette::Free()
691 {
692 	if(m_texture != 0)
693 	{
694 		glDeleteTextures(1, &m_texture);
695 		m_texture = 0;
696 		m_live = false;
697 	}
698 }
699 
Invalidate(uint32 csa)700 void CGSH_OpenGL::CPalette::Invalidate(uint32 csa)
701 {
702 	if(!m_live) return;
703 
704 	m_live = false;
705 }
706 
707 /////////////////////////////////////////////////////////////
708 // Palette Caching
709 /////////////////////////////////////////////////////////////
710 
PalCache_Search(const TEX0 & tex0)711 GLuint CGSH_OpenGL::PalCache_Search(const TEX0& tex0)
712 {
713 	for(auto paletteIterator(m_paletteCache.begin());
714 	    paletteIterator != m_paletteCache.end(); paletteIterator++)
715 	{
716 		auto palette = *paletteIterator;
717 		if(!palette->m_live) continue;
718 		if(CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm) != palette->m_isIDTEX4) continue;
719 		if(tex0.nCPSM != palette->m_cpsm) continue;
720 		if(tex0.nCSA != palette->m_csa) continue;
721 		m_paletteCache.erase(paletteIterator);
722 		m_paletteCache.push_front(palette);
723 		return palette->m_texture;
724 	}
725 
726 	return 0;
727 }
728 
PalCache_Search(unsigned int entryCount,const uint32 * contents)729 GLuint CGSH_OpenGL::PalCache_Search(unsigned int entryCount, const uint32* contents)
730 {
731 	for(auto paletteIterator(m_paletteCache.begin());
732 	    paletteIterator != m_paletteCache.end(); paletteIterator++)
733 	{
734 		auto palette = *paletteIterator;
735 
736 		if(palette->m_texture == 0) continue;
737 
738 		unsigned int palEntryCount = palette->m_isIDTEX4 ? 16 : 256;
739 		if(palEntryCount != entryCount) continue;
740 
741 		if(memcmp(contents, palette->m_contents, sizeof(uint32) * entryCount) != 0) continue;
742 
743 		palette->m_live = true;
744 
745 		m_paletteCache.erase(paletteIterator);
746 		m_paletteCache.push_front(palette);
747 		return palette->m_texture;
748 	}
749 
750 	return 0;
751 }
752 
PalCache_Insert(const TEX0 & tex0,const uint32 * contents,GLuint textureHandle)753 void CGSH_OpenGL::PalCache_Insert(const TEX0& tex0, const uint32* contents, GLuint textureHandle)
754 {
755 	auto texture = *m_paletteCache.rbegin();
756 	texture->Free();
757 
758 	unsigned int entryCount = CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm) ? 16 : 256;
759 
760 	texture->m_isIDTEX4 = CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm);
761 	texture->m_cpsm = tex0.nCPSM;
762 	texture->m_csa = tex0.nCSA;
763 	texture->m_texture = textureHandle;
764 	texture->m_live = true;
765 	memcpy(texture->m_contents, contents, entryCount * sizeof(uint32));
766 
767 	m_paletteCache.pop_back();
768 	m_paletteCache.push_front(texture);
769 }
770 
PalCache_Invalidate(uint32 csa)771 void CGSH_OpenGL::PalCache_Invalidate(uint32 csa)
772 {
773 	std::for_each(std::begin(m_paletteCache), std::end(m_paletteCache),
774 	              [csa](PalettePtr& palette) { palette->Invalidate(csa); });
775 }
776 
PalCache_Flush()777 void CGSH_OpenGL::PalCache_Flush()
778 {
779 	std::for_each(std::begin(m_paletteCache), std::end(m_paletteCache),
780 	              [](PalettePtr& palette) { palette->Free(); });
781 }
782