1 #include <assert.h>
2 #include <cstring>
3 #include <sys/stat.h>
4 #include <limits.h>
5 #include <algorithm>
6 #include "GSH_OpenGL.h"
7 #include "StdStream.h"
8 #include "bitmap/BMP.h"
9 #include "../GsPixelFormats.h"
10
11 /////////////////////////////////////////////////////////////
12 // Texture Loading
13 /////////////////////////////////////////////////////////////
14
SetupTextureUpdaters()15 void CGSH_OpenGL::SetupTextureUpdaters()
16 {
17 for(unsigned int i = 0; i < PSM_MAX; i++)
18 {
19 m_textureUpdater[i] = &CGSH_OpenGL::TexUpdater_Invalid;
20 }
21
22 m_textureUpdater[PSMCT32] = &CGSH_OpenGL::TexUpdater_Psm32;
23 m_textureUpdater[PSMCT24] = &CGSH_OpenGL::TexUpdater_Psm32;
24 m_textureUpdater[PSMCT16] = &CGSH_OpenGL::TexUpdater_Psm16<CGsPixelFormats::CPixelIndexorPSMCT16>;
25 m_textureUpdater[PSMCT32_UNK] = &CGSH_OpenGL::TexUpdater_Psm32;
26 m_textureUpdater[PSMCT24_UNK] = &CGSH_OpenGL::TexUpdater_Psm32;
27 m_textureUpdater[PSMCT16S] = &CGSH_OpenGL::TexUpdater_Psm16<CGsPixelFormats::CPixelIndexorPSMCT16S>;
28 m_textureUpdater[PSMT8] = &CGSH_OpenGL::TexUpdater_Psm8;
29 m_textureUpdater[PSMT4] = &CGSH_OpenGL::TexUpdater_Psm4;
30 // original for quick perf testing
31 //m_textureUpdater[PSMT4] = &CGSH_OpenGL::TexUpdater_Psm48<CGsPixelFormats::CPixelIndexorPSMT4>;
32
33 m_textureUpdater[PSMT8H] = &CGSH_OpenGL::TexUpdater_Psm48H<24, 0xFF>;
34 m_textureUpdater[PSMT4HL] = &CGSH_OpenGL::TexUpdater_Psm48H<24, 0x0F>;
35 m_textureUpdater[PSMT4HH] = &CGSH_OpenGL::TexUpdater_Psm48H<28, 0x0F>;
36 }
37
GetFramebufferBitDepth(uint32 psm)38 uint32 CGSH_OpenGL::GetFramebufferBitDepth(uint32 psm)
39 {
40 if((psm == PSMCT32) || (psm == PSMCT24))
41 {
42 return 32;
43 }
44 else if((psm == PSMCT16) || (psm == PSMCT16S))
45 {
46 return 16;
47 }
48 else
49 {
50 assert(false);
51 return 32;
52 }
53 }
54
GetTextureFormatInfo(uint32 psm)55 CGSH_OpenGL::TEXTUREFORMAT_INFO CGSH_OpenGL::GetTextureFormatInfo(uint32 psm)
56 {
57 switch(psm)
58 {
59 case PSMCT32:
60 case PSMCT24:
61 case PSMCT32_UNK:
62 case PSMCT24_UNK:
63 return TEXTUREFORMAT_INFO{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
64 case PSMCT16:
65 case PSMCT16S:
66 return TEXTUREFORMAT_INFO{GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1};
67 case PSMT8:
68 case PSMT4:
69 case PSMT8H:
70 case PSMT4HL:
71 case PSMT4HH:
72 return TEXTUREFORMAT_INFO{GL_R8, GL_RED, GL_UNSIGNED_BYTE};
73 default:
74 assert(false);
75 return TEXTUREFORMAT_INFO{GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE};
76 }
77 }
78
SearchTextureFramebuffer(const TEX0 & tex0)79 CGSH_OpenGL::TEXTURE_INFO CGSH_OpenGL::SearchTextureFramebuffer(const TEX0& tex0)
80 {
81 TEXTURE_INFO texInfo;
82 FramebufferPtr framebuffer;
83
84 //First pass, look for an exact match
85 for(const auto& candidateFramebuffer : m_framebuffers)
86 {
87 //Case: TEX0 points at the start of a frame buffer with the same width
88 if(candidateFramebuffer->m_basePtr == tex0.GetBufPtr() &&
89 candidateFramebuffer->m_width == tex0.GetBufWidth() &&
90 IsCompatibleFramebufferPSM(candidateFramebuffer->m_psm, tex0.nPsm))
91 {
92 framebuffer = candidateFramebuffer;
93 break;
94 }
95
96 //Case: TEX0 point at the start of a frame buffer with the same width
97 //but uses upper 8-bits (alpha) as an indexed texture (used in Yakuza)
98 else if(candidateFramebuffer->m_basePtr == tex0.GetBufPtr() &&
99 candidateFramebuffer->m_width == tex0.GetBufWidth() &&
100 candidateFramebuffer->m_psm == CGSHandler::PSMCT32 &&
101 tex0.nPsm == CGSHandler::PSMT8H)
102 {
103 framebuffer = candidateFramebuffer;
104 texInfo.alphaAsIndex = true;
105 break;
106 }
107 }
108
109 if(!framebuffer)
110 {
111 //Second pass, be a bit more flexible
112 for(const auto& candidateFramebuffer : m_framebuffers)
113 {
114 //Another case: TEX0 is pointing to the start of a page within our framebuffer (BGDA does this)
115 if(candidateFramebuffer->m_basePtr <= tex0.GetBufPtr() &&
116 candidateFramebuffer->m_width == tex0.GetBufWidth() &&
117 candidateFramebuffer->m_psm == tex0.nPsm)
118 {
119 uint32 framebufferOffset = tex0.GetBufPtr() - candidateFramebuffer->m_basePtr;
120
121 //Bail if offset is not aligned on a page boundary
122 if((framebufferOffset & (CGsPixelFormats::PAGESIZE - 1)) != 0) continue;
123
124 auto framebufferPageSize = CGsPixelFormats::GetPsmPageSize(candidateFramebuffer->m_psm);
125 uint32 framebufferPageCountX = candidateFramebuffer->m_width / framebufferPageSize.first;
126 uint32 framebufferPageIndex = framebufferOffset / CGsPixelFormats::PAGESIZE;
127
128 //Bail if pointed page isn't on the first line
129 if(framebufferPageIndex >= framebufferPageCountX) continue;
130
131 framebuffer = candidateFramebuffer;
132 texInfo.offsetX = static_cast<float>(framebufferPageIndex * framebufferPageSize.first) / static_cast<float>(candidateFramebuffer->m_width);
133 break;
134 }
135 }
136 }
137
138 if(framebuffer)
139 {
140 CommitFramebufferDirtyPages(framebuffer, 0, tex0.GetHeight());
141 if(m_multisampleEnabled)
142 {
143 ResolveFramebufferMultisample(framebuffer, m_fbScale);
144 }
145
146 float scaleRatioX = static_cast<float>(tex0.GetWidth()) / static_cast<float>(framebuffer->m_width);
147 float scaleRatioY = static_cast<float>(tex0.GetHeight()) / static_cast<float>(framebuffer->m_height);
148
149 texInfo.textureHandle = framebuffer->m_texture;
150 texInfo.scaleRatioX = scaleRatioX;
151 texInfo.scaleRatioY = scaleRatioY;
152 return texInfo;
153 }
154 else
155 {
156 return TEXTURE_INFO();
157 }
158 }
159
PrepareTexture(const TEX0 & tex0)160 CGSH_OpenGL::TEXTURE_INFO CGSH_OpenGL::PrepareTexture(const TEX0& tex0)
161 {
162 auto texInfo = SearchTextureFramebuffer(tex0);
163 if(texInfo.textureHandle != 0)
164 {
165 return texInfo;
166 }
167
168 auto texture = m_textureCache.Search(tex0);
169 if(!texture)
170 {
171 //Validate texture dimensions to prevent problems
172 auto texWidth = tex0.GetWidth();
173 auto texHeight = tex0.GetHeight();
174 assert(texWidth <= 1024);
175 assert(texHeight <= 1024);
176 texWidth = std::min<uint32>(texWidth, 1024);
177 texHeight = std::min<uint32>(texHeight, 1024);
178 auto texFormat = GetTextureFormatInfo(tex0.nPsm);
179
180 {
181 auto textureHandle = Framework::OpenGl::CTexture::Create();
182 glBindTexture(GL_TEXTURE_2D, textureHandle);
183 glTexStorage2D(GL_TEXTURE_2D, 1, texFormat.internalFormat, texWidth, texHeight);
184 CHECKGLERROR();
185 m_textureCache.Insert(tex0, std::move(textureHandle));
186 }
187
188 texture = m_textureCache.Search(tex0);
189 texture->m_cachedArea.Invalidate(0, RAMSIZE);
190 }
191
192 texInfo.textureHandle = texture->m_textureHandle;
193
194 glBindTexture(GL_TEXTURE_2D, texture->m_textureHandle);
195 auto& cachedArea = texture->m_cachedArea;
196 auto texturePageSize = CGsPixelFormats::GetPsmPageSize(tex0.nPsm);
197 auto areaRect = cachedArea.GetAreaPageRect();
198
199 while(cachedArea.HasDirtyPages())
200 {
201 auto dirtyRect = cachedArea.GetDirtyPageRect();
202 assert((dirtyRect.width != 0) && (dirtyRect.height != 0));
203 cachedArea.ClearDirtyPages(dirtyRect);
204
205 uint32 texX = dirtyRect.x * texturePageSize.first;
206 uint32 texY = dirtyRect.y * texturePageSize.second;
207 uint32 texWidth = dirtyRect.width * texturePageSize.first;
208 uint32 texHeight = dirtyRect.height * texturePageSize.second;
209 if(texX >= tex0.GetWidth()) continue;
210 if(texY >= tex0.GetHeight()) continue;
211 //assert(texX < tex0.GetWidth());
212 //assert(texY < tex0.GetHeight());
213 if((texX + texWidth) > tex0.GetWidth())
214 {
215 texWidth = tex0.GetWidth() - texX;
216 }
217 if((texY + texHeight) > tex0.GetHeight())
218 {
219 texHeight = tex0.GetHeight() - texY;
220 }
221 ((this)->*(m_textureUpdater[tex0.nPsm]))(tex0.GetBufPtr(), tex0.nBufWidth, texX, texY, texWidth, texHeight);
222 }
223
224 cachedArea.ClearDirtyPages();
225
226 return texInfo;
227 }
228
PreparePalette(const TEX0 & tex0)229 GLuint CGSH_OpenGL::PreparePalette(const TEX0& tex0)
230 {
231 GLuint textureHandle = PalCache_Search(tex0);
232 if(textureHandle != 0)
233 {
234 return textureHandle;
235 }
236
237 std::array<uint32, 256> convertedClut;
238 MakeLinearCLUT(tex0, convertedClut);
239
240 unsigned int entryCount = CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm) ? 16 : 256;
241 textureHandle = PalCache_Search(entryCount, convertedClut.data());
242 if(textureHandle != 0)
243 {
244 return textureHandle;
245 }
246
247 glGenTextures(1, &textureHandle);
248 glBindTexture(GL_TEXTURE_2D, textureHandle);
249 glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, entryCount, 1, 0, GL_RGBA, GL_UNSIGNED_BYTE, convertedClut.data());
250
251 PalCache_Insert(tex0, convertedClut.data(), textureHandle);
252
253 return textureHandle;
254 }
255
DumpTexture(unsigned int nWidth,unsigned int nHeight,uint32 checksum)256 void CGSH_OpenGL::DumpTexture(unsigned int nWidth, unsigned int nHeight, uint32 checksum)
257 {
258 #ifdef _WIN32
259 char sFilename[256];
260
261 for(unsigned int i = 0; i < UINT_MAX; i++)
262 {
263 struct _stat Stat;
264 sprintf(sFilename, "./textures/tex_%08X_%08X.bmp", i, checksum);
265 if(_stat(sFilename, &Stat) == -1) break;
266 }
267
268 Framework::CBitmap bitmap(nWidth, nHeight, 32);
269
270 glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_BYTE, bitmap.GetPixels());
271 Framework::CStdStream outputStream(fopen(sFilename, "wb"));
272 Framework::CBMP::WriteBitmap(bitmap, outputStream);
273 #endif
274 }
275
TexUpdater_Invalid(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)276 void CGSH_OpenGL::TexUpdater_Invalid(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
277 {
278 assert(0);
279 }
280
TexUpdater_Psm32(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)281 void CGSH_OpenGL::TexUpdater_Psm32(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
282 {
283 CGsPixelFormats::CPixelIndexorPSMCT32 indexor(m_pRAM, bufPtr, bufWidth);
284
285 uint32* dst = reinterpret_cast<uint32*>(m_pCvtBuffer);
286 for(unsigned int y = 0; y < texHeight; y++)
287 {
288 for(unsigned int x = 0; x < texWidth; x++)
289 {
290 dst[x] = indexor.GetPixel(texX + x, texY + y);
291 }
292
293 dst += texWidth;
294 }
295
296 glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_BYTE, m_pCvtBuffer);
297 CHECKGLERROR();
298 }
299
300 template <typename IndexorType>
TexUpdater_Psm16(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)301 void CGSH_OpenGL::TexUpdater_Psm16(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
302 {
303 IndexorType indexor(m_pRAM, bufPtr, bufWidth);
304
305 auto dst = reinterpret_cast<uint16*>(m_pCvtBuffer);
306 for(unsigned int y = 0; y < texHeight; y++)
307 {
308 for(unsigned int x = 0; x < texWidth; x++)
309 {
310 auto pixel = indexor.GetPixel(texX + x, texY + y);
311 auto cvtPixel =
312 (((pixel & 0x001F) >> 0) << 11) | //R
313 (((pixel & 0x03E0) >> 5) << 6) | //G
314 (((pixel & 0x7C00) >> 10) << 1) | //B
315 (pixel >> 15); //A
316 dst[x] = cvtPixel;
317 }
318
319 dst += texWidth;
320 }
321
322 glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1, m_pCvtBuffer);
323 CHECKGLERROR();
324 }
325
// Selects the SIMD flavour (USE_SSE or USE_NEON) used by the column converters below.
// The decision is based on the CPU architecture on every platform: Windows also
// runs on ARM64, where the x86 intrinsic headers are not usable.
#ifdef _WIN32
#if defined(_M_X64) || defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
#define USE_SSE
#elif defined(_M_ARM64) || defined(__aarch64__)
#define USE_NEON
#endif
#elif defined(__APPLE__)
#include <TargetConditionals.h>
#if TARGET_CPU_X86_64
#define USE_SSE
#elif TARGET_CPU_ARM64
#define USE_NEON
#endif
#elif defined(__ANDROID__) || defined(__linux__) || defined(__FreeBSD__)
#if defined(__x86_64__) || defined(__i386__)
#define USE_SSE
#elif defined(__aarch64__) || defined(__arm__)
#define USE_NEON
#endif
#endif
342
343 #if defined(USE_SSE)
344 #include <xmmintrin.h>
345 #include <emmintrin.h>
346 #include <tmmintrin.h>
347
convertColumn8(uint8 * dest,const int destStride,int colNum,__m128i a,__m128i b,__m128i c,__m128i d)348 void convertColumn8(uint8* dest, const int destStride, int colNum, __m128i a, __m128i b, __m128i c, __m128i d)
349 {
350 __m128i* mdest = (__m128i*)dest;
351
352 __m128i temp_a = a;
353 __m128i temp_c = c;
354
355 a = _mm_unpacklo_epi8(temp_a, b);
356 c = _mm_unpackhi_epi8(temp_a, b);
357 b = _mm_unpacklo_epi8(temp_c, d);
358 d = _mm_unpackhi_epi8(temp_c, d);
359
360 temp_a = a;
361 temp_c = c;
362
363 a = _mm_unpacklo_epi16(temp_a, b);
364 c = _mm_unpackhi_epi16(temp_a, b);
365 b = _mm_unpacklo_epi16(temp_c, d);
366 d = _mm_unpackhi_epi16(temp_c, d);
367
368 temp_a = a;
369 __m128i temp_b = b;
370
371 a = _mm_unpacklo_epi8(temp_a, c);
372 b = _mm_unpackhi_epi8(temp_a, c);
373 c = _mm_unpacklo_epi8(temp_b, d);
374 d = _mm_unpackhi_epi8(temp_b, d);
375
376 temp_a = a;
377 temp_c = c;
378
379 a = _mm_unpacklo_epi64(temp_a, b);
380 c = _mm_unpackhi_epi64(temp_a, b);
381 b = _mm_unpacklo_epi64(temp_c, d);
382 d = _mm_unpackhi_epi64(temp_c, d);
383
384 if((colNum & 1) == 0)
385 {
386 c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1));
387 d = _mm_shuffle_epi32(d, _MM_SHUFFLE(2, 3, 0, 1));
388 }
389 else
390 {
391 a = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
392 b = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 3, 0, 1));
393 }
394
395 int mStride = destStride / 16;
396
397 mdest[0] = a;
398 mdest[mStride] = b;
399 mdest[mStride * 2] = c;
400 mdest[mStride * 3] = d;
401 }
402
convertColumn8(uint8 * dest,const int destStride,uint8 * src,int colNum)403 inline void convertColumn8(uint8* dest, const int destStride, uint8* src, int colNum)
404 {
405 __m128i* mSrc = (__m128i*)src;
406
407 __m128i a = mSrc[0];
408 __m128i b = mSrc[1];
409 __m128i c = mSrc[2];
410 __m128i d = mSrc[3];
411 convertColumn8(dest, destStride, colNum, a, b, c, d);
412 }
413
convertColumn4(uint8 * dest,const int destStride,uint8 * src,int colNum)414 inline void convertColumn4(uint8* dest, const int destStride, uint8* src, int colNum)
415 {
416 __m128i* mSrc = (__m128i*)src;
417
418 __m128i a = mSrc[0];
419 __m128i b = mSrc[1];
420 __m128i c = mSrc[2];
421 __m128i d = mSrc[3];
422
423 // 4 bpp looks like 2 8bpp columns side by side.
424 // The 4pp are expanded to 8bpp.
425 // so 01 23 45 67 89 ab cd ef gh ij kl mn op qr st uv expands to
426 // 00 01 02 03 08 09 0a 0b 0g 0h 0i 0j 0o 0p 0q 0r as the first row on the left hand block.
427
428 __m128i perm = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 0x0c, 0x0d, 2, 3, 6, 7, 0x0a, 0x0b, 0x0e, 0x0f);
429 a = _mm_shuffle_epi8(a, perm);
430 b = _mm_shuffle_epi8(b, perm);
431 c = _mm_shuffle_epi8(c, perm);
432 d = _mm_shuffle_epi8(d, perm);
433
434 __m128i a_orig = a;
435
436 const __m128i mask = _mm_set1_epi32(0x0f0f0f0f);
437 const __m128i shiftCount = _mm_set_epi32(0, 0, 0, 4);
438 __m128i lowNybbles = _mm_and_si128(a, mask);
439 __m128i highNybbles = _mm_and_si128(_mm_srl_epi32(a, shiftCount), mask);
440 a = _mm_unpacklo_epi8(lowNybbles, highNybbles);
441 __m128i a2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
442
443 lowNybbles = _mm_and_si128(b, mask);
444 highNybbles = _mm_and_si128(_mm_srl_epi32(b, shiftCount), mask);
445 b = _mm_unpacklo_epi8(lowNybbles, highNybbles);
446 __m128i b2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
447
448 lowNybbles = _mm_and_si128(c, mask);
449 highNybbles = _mm_and_si128(_mm_srl_epi32(c, shiftCount), mask);
450 c = _mm_unpacklo_epi8(lowNybbles, highNybbles);
451 __m128i c2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
452
453 lowNybbles = _mm_and_si128(d, mask);
454 highNybbles = _mm_and_si128(_mm_srl_epi32(d, shiftCount), mask);
455 d = _mm_unpacklo_epi8(lowNybbles, highNybbles);
456 __m128i d2 = _mm_unpackhi_epi8(lowNybbles, highNybbles);
457
458 convertColumn8(dest, destStride, colNum, a, b, c, d);
459 if(destStride > 16)
460 {
461 convertColumn8(dest + 16, destStride, colNum, a2, b2, c2, d2);
462 }
463 }
464
465 #elif defined(USE_NEON)
466 #include <arm_neon.h>
467
// Builds four 16-byte rows out of the four vectors in data and stores them at
// dest, each destStride bytes apart.
// Rows 0/1 come from val[0] and val[2], rows 2/3 from val[1] and val[3]. Within
// each pair the first row takes the low 16 bits of every 32-bit lane and the
// second row the high 16 bits.
// Even-numbered columns get adjacent 32-bit words of rows 2/3 swapped,
// odd-numbered columns get the same swap on rows 0/1.
inline void convertColumn8(uint8x16x4_t data, uint8* dest, const int destStride, int colNum)
{
	// Keeps the low 16 bits of every 32-bit lane of 'first', followed by those of 'second'
	const auto packLowHalves = [](uint16x8_t first, uint16x8_t second) {
		return vcombine_u16(vmovn_u32(vreinterpretq_u32_u16(first)), vmovn_u32(vreinterpretq_u32_u16(second)));
	};
	// Swaps the two 32-bit words inside every 64-bit half
	const auto swapWordPairs = [](uint16x8_t row) {
		return vreinterpretq_u16_u32(vrev64q_u32(vreinterpretq_u32_u16(row)));
	};

	const uint16x8_t lanes0 = vreinterpretq_u16_u8(data.val[0]);
	const uint16x8_t lanes1 = vreinterpretq_u16_u8(data.val[1]);
	const uint16x8_t lanes2 = vreinterpretq_u16_u8(data.val[2]);
	const uint16x8_t lanes3 = vreinterpretq_u16_u8(data.val[3]);

	// vrev32q_u16 swaps the 16-bit halves of each 32-bit lane, which moves the
	// high halves into the position packLowHalves keeps
	uint16x8_t row0 = packLowHalves(lanes0, lanes2);
	uint16x8_t row1 = packLowHalves(vrev32q_u16(lanes0), vrev32q_u16(lanes2));
	uint16x8_t row2 = packLowHalves(lanes1, lanes3);
	uint16x8_t row3 = packLowHalves(vrev32q_u16(lanes1), vrev32q_u16(lanes3));

	const bool evenColumn = ((colNum & 1) == 0);
	if(evenColumn)
	{
		row2 = swapWordPairs(row2);
		row3 = swapWordPairs(row3);
	}
	else
	{
		row0 = swapWordPairs(row0);
		row1 = swapWordPairs(row1);
	}

	vst1q_u8(dest, vreinterpretq_u8_u16(row0));
	vst1q_u8(dest + destStride, vreinterpretq_u8_u16(row1));
	vst1q_u8(dest + 2 * destStride, vreinterpretq_u8_u16(row2));
	vst1q_u8(dest + 3 * destStride, vreinterpretq_u8_u16(row3));
}
496
// Loads 64 bytes from src with a 4-way de-interleaving load (vld4q_u8) and passes
// them to the register based convertColumn8, which writes the rows to dest.
inline void convertColumn8(uint8* dest, const int destStride, uint8* src, int colNum)
{
	// One load pulls in the whole column, split into four de-interleaved vectors
	const uint8x16x4_t column = vld4q_u8(src);
	convertColumn8(column, dest, destStride, colNum);
}
503
// Expands 64 bytes of packed 4-bit values at src into one byte per value and
// writes them through convertColumn8: the data from val[0]/val[1] goes to dest,
// the data from val[2]/val[3] to dest + 16. The second half is skipped when
// destStride is not larger than 16.
inline void convertColumn4(uint8* dest, const int destStride, uint8* src, int colNum)
{
	// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics

	const uint8x16x4_t packed = vld4q_u8(src);
	const uint8x16_t nybbleMask = vdupq_n_u8(0x0F);

	// Splits two packed vectors into (low, high, low, high) nybble vectors
	const auto expandNybbles = [&nybbleMask](uint8x16_t first, uint8x16_t second) {
		uint8x16x4_t expanded;
		expanded.val[0] = vandq_u8(first, nybbleMask);
		expanded.val[1] = vshrq_n_u8(first, 4);
		expanded.val[2] = vandq_u8(second, nybbleMask);
		expanded.val[3] = vshrq_n_u8(second, 4);
		return expanded;
	};

	convertColumn8(expandNybbles(packed.val[0], packed.val[1]), dest, destStride, colNum);

	if(destStride > 16)
	{
		convertColumn8(expandNybbles(packed.val[2], packed.val[3]), dest + 16, destStride, colNum);
	}
}
539
540 #else
541 /*
542 // If we have a platform that does not have SIMD then implement the basic case here.
543 void convertColumn8(uint8* dest, const int destStride, uint8* src, int colNum)
544 {
545
546 }
547 */
548 #endif
549
TexUpdater_Psm8(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)550 void CGSH_OpenGL::TexUpdater_Psm8(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
551 {
552 if(texWidth < 16)
553 {
554 // Widths are powers of 2, so anything over 16 will be an integral number of columns wide.
555 // Note: for small textures it still may be a win to do the SIMD swizzle and then cut out the sub-region to
556 // correct the row stride.
557 return CGSH_OpenGL::TexUpdater_Psm48<CGsPixelFormats::CPixelIndexorPSMT8>(bufPtr, bufWidth, texX, texY, texWidth, texHeight);
558 }
559
560 CGsPixelFormats::CPixelIndexorPSMT8 indexor(m_pRAM, bufPtr, bufWidth);
561 uint8* dst = m_pCvtBuffer;
562 for(unsigned int y = 0; y < texHeight; y += 16)
563 {
564 for(unsigned int x = 0; x < texWidth; x += 16)
565 {
566 uint8* colDst = dst;
567 uint8* src = indexor.GetPixelAddress(texX + x, texY + y);
568
569 // process an entire 16x16 block.
570 // A column (64 bytes) is 16x4 pixels and they stack vertically in a block
571
572 int colNum = 0;
573 for(unsigned int coly = 0; coly < 16; coly += 4)
574 {
575 convertColumn8(colDst + x, texWidth, src, colNum++);
576 src += 64;
577 colDst += texWidth * 4;
578 }
579 }
580
581 dst += texWidth * 16;
582 }
583
584 glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
585 CHECKGLERROR();
586 }
587
TexUpdater_Psm4(unsigned int bufPtr,unsigned int bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)588 void CGSH_OpenGL::TexUpdater_Psm4(unsigned int bufPtr, unsigned int bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
589 {
590 if(texWidth < 16)
591 {
592 // Widths are powers of 2, so anything over 32 will be an integral number of columns wide.
593 // 16 wide textures are dealt with as a special case in the SIMD code.
594 // Note: for small textures it still may be a win to do the SIMD swizzle and then cut out the sub-region to
595 // correct the row stride.
596 return CGSH_OpenGL::TexUpdater_Psm48<CGsPixelFormats::CPixelIndexorPSMT4>(bufPtr, bufWidth, texX, texY, texWidth, texHeight);
597 }
598
599 CGsPixelFormats::CPixelIndexorPSMT4 indexor(m_pRAM, bufPtr, bufWidth);
600
601 uint8* dst = m_pCvtBuffer;
602 for(unsigned int y = 0; y < texHeight; y += 16)
603 {
604 for(unsigned int x = 0; x < texWidth; x += 32)
605 {
606 uint8* colDst = dst + x;
607 unsigned int nx = texX + x;
608 unsigned int ny = texY + y;
609 uint32 colAddr = indexor.GetColumnAddress(nx, ny);
610 uint8* src = m_pRAM + colAddr;
611
612 // process an entire 32x16 block.
613 // A column (64 bytes) is 32x4 pixels and they stack vertically in a block
614
615 for(unsigned int colNum = 0; colNum < 4; ++colNum)
616 {
617 convertColumn4(colDst, texWidth, src, colNum);
618 src += 64;
619 colDst += texWidth * 4;
620 }
621 }
622
623 dst += texWidth * 16;
624 }
625 glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
626 CHECKGLERROR();
627 }
628
629 template <typename IndexorType>
TexUpdater_Psm48(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)630 void CGSH_OpenGL::TexUpdater_Psm48(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
631 {
632 IndexorType indexor(m_pRAM, bufPtr, bufWidth);
633
634 uint8* dst = m_pCvtBuffer;
635 for(unsigned int y = 0; y < texHeight; y++)
636 {
637 for(unsigned int x = 0; x < texWidth; x++)
638 {
639 uint8 pixel = indexor.GetPixel(texX + x, texY + y);
640 dst[x] = pixel;
641 }
642
643 dst += texWidth;
644 }
645
646 glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
647 CHECKGLERROR();
648 }
649
650 template <uint32 shiftAmount, uint32 mask>
TexUpdater_Psm48H(uint32 bufPtr,uint32 bufWidth,unsigned int texX,unsigned int texY,unsigned int texWidth,unsigned int texHeight)651 void CGSH_OpenGL::TexUpdater_Psm48H(uint32 bufPtr, uint32 bufWidth, unsigned int texX, unsigned int texY, unsigned int texWidth, unsigned int texHeight)
652 {
653 CGsPixelFormats::CPixelIndexorPSMCT32 indexor(m_pRAM, bufPtr, bufWidth);
654
655 uint8* dst = m_pCvtBuffer;
656 for(unsigned int y = 0; y < texHeight; y++)
657 {
658 for(unsigned int x = 0; x < texWidth; x++)
659 {
660 uint32 pixel = indexor.GetPixel(texX + x, texY + y);
661 pixel = (pixel >> shiftAmount) & mask;
662 dst[x] = static_cast<uint8>(pixel);
663 }
664
665 dst += texWidth;
666 }
667
668 glTexSubImage2D(GL_TEXTURE_2D, 0, texX, texY, texWidth, texHeight, GL_RED, GL_UNSIGNED_BYTE, m_pCvtBuffer);
669 CHECKGLERROR();
670 }
671
672 /////////////////////////////////////////////////////////////
673 // Palette
674 /////////////////////////////////////////////////////////////
675
CPalette()676 CGSH_OpenGL::CPalette::CPalette()
677 : m_live(false)
678 , m_isIDTEX4(false)
679 , m_cpsm(0)
680 , m_csa(0)
681 , m_texture(0)
682 {
683 }
684
~CPalette()685 CGSH_OpenGL::CPalette::~CPalette()
686 {
687 Free();
688 }
689
Free()690 void CGSH_OpenGL::CPalette::Free()
691 {
692 if(m_texture != 0)
693 {
694 glDeleteTextures(1, &m_texture);
695 m_texture = 0;
696 m_live = false;
697 }
698 }
699
Invalidate(uint32 csa)700 void CGSH_OpenGL::CPalette::Invalidate(uint32 csa)
701 {
702 if(!m_live) return;
703
704 m_live = false;
705 }
706
707 /////////////////////////////////////////////////////////////
708 // Palette Caching
709 /////////////////////////////////////////////////////////////
710
PalCache_Search(const TEX0 & tex0)711 GLuint CGSH_OpenGL::PalCache_Search(const TEX0& tex0)
712 {
713 for(auto paletteIterator(m_paletteCache.begin());
714 paletteIterator != m_paletteCache.end(); paletteIterator++)
715 {
716 auto palette = *paletteIterator;
717 if(!palette->m_live) continue;
718 if(CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm) != palette->m_isIDTEX4) continue;
719 if(tex0.nCPSM != palette->m_cpsm) continue;
720 if(tex0.nCSA != palette->m_csa) continue;
721 m_paletteCache.erase(paletteIterator);
722 m_paletteCache.push_front(palette);
723 return palette->m_texture;
724 }
725
726 return 0;
727 }
728
PalCache_Search(unsigned int entryCount,const uint32 * contents)729 GLuint CGSH_OpenGL::PalCache_Search(unsigned int entryCount, const uint32* contents)
730 {
731 for(auto paletteIterator(m_paletteCache.begin());
732 paletteIterator != m_paletteCache.end(); paletteIterator++)
733 {
734 auto palette = *paletteIterator;
735
736 if(palette->m_texture == 0) continue;
737
738 unsigned int palEntryCount = palette->m_isIDTEX4 ? 16 : 256;
739 if(palEntryCount != entryCount) continue;
740
741 if(memcmp(contents, palette->m_contents, sizeof(uint32) * entryCount) != 0) continue;
742
743 palette->m_live = true;
744
745 m_paletteCache.erase(paletteIterator);
746 m_paletteCache.push_front(palette);
747 return palette->m_texture;
748 }
749
750 return 0;
751 }
752
PalCache_Insert(const TEX0 & tex0,const uint32 * contents,GLuint textureHandle)753 void CGSH_OpenGL::PalCache_Insert(const TEX0& tex0, const uint32* contents, GLuint textureHandle)
754 {
755 auto texture = *m_paletteCache.rbegin();
756 texture->Free();
757
758 unsigned int entryCount = CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm) ? 16 : 256;
759
760 texture->m_isIDTEX4 = CGsPixelFormats::IsPsmIDTEX4(tex0.nPsm);
761 texture->m_cpsm = tex0.nCPSM;
762 texture->m_csa = tex0.nCSA;
763 texture->m_texture = textureHandle;
764 texture->m_live = true;
765 memcpy(texture->m_contents, contents, entryCount * sizeof(uint32));
766
767 m_paletteCache.pop_back();
768 m_paletteCache.push_front(texture);
769 }
770
PalCache_Invalidate(uint32 csa)771 void CGSH_OpenGL::PalCache_Invalidate(uint32 csa)
772 {
773 std::for_each(std::begin(m_paletteCache), std::end(m_paletteCache),
774 [csa](PalettePtr& palette) { palette->Invalidate(csa); });
775 }
776
PalCache_Flush()777 void CGSH_OpenGL::PalCache_Flush()
778 {
779 std::for_each(std::begin(m_paletteCache), std::end(m_paletteCache),
780 [](PalettePtr& palette) { palette->Free(); });
781 }
782