1 // Copyright (c) 2017- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include "ppsspp_config.h"
19 #include <unordered_map>
20 #include <mutex>
21 #include "Common/Data/Convert/ColorConv.h"
22 #include "Core/Reporting.h"
23 #include "GPU/Common/TextureDecoder.h"
24 #include "GPU/GPUState.h"
25 #include "GPU/Software/Sampler.h"
26 
27 #if defined(_M_SSE)
28 #include <emmintrin.h>
29 #endif
30 
31 using namespace Math3D;
32 
33 extern u32 clut[4096];
34 
35 namespace Sampler {
36 
37 static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
38 static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
39 
40 std::mutex jitCacheLock;
41 SamplerJitCache *jitCache = nullptr;
42 
Init()43 void Init() {
44 	jitCache = new SamplerJitCache();
45 }
46 
Shutdown()47 void Shutdown() {
48 	delete jitCache;
49 	jitCache = nullptr;
50 }
51 
DescribeCodePtr(const u8 * ptr,std::string & name)52 bool DescribeCodePtr(const u8 *ptr, std::string &name) {
53 	if (!jitCache->IsInSpace(ptr)) {
54 		return false;
55 	}
56 
57 	name = jitCache->DescribeCodePtr(ptr);
58 	return true;
59 }
60 
GetNearestFunc()61 NearestFunc GetNearestFunc() {
62 	SamplerID id;
63 	jitCache->ComputeSamplerID(&id, false);
64 	NearestFunc jitted = jitCache->GetNearest(id);
65 	if (jitted) {
66 		return jitted;
67 	}
68 
69 	return &SampleNearest;
70 }
71 
GetLinearFunc()72 LinearFunc GetLinearFunc() {
73 	SamplerID id;
74 	jitCache->ComputeSamplerID(&id, true);
75 	LinearFunc jitted = jitCache->GetLinear(id);
76 	if (jitted) {
77 		return jitted;
78 	}
79 
80 	return &SampleLinear;
81 }
82 
SamplerJitCache()83 SamplerJitCache::SamplerJitCache()
84 #if PPSSPP_ARCH(ARM64)
85  : fp(this)
86 #endif
87 {
88 	// 256k should be enough.
89 	AllocCodeSpace(1024 * 64 * 4);
90 
91 	// Add some random code to "help" MSVC's buggy disassembler :(
92 #if defined(_WIN32) && (PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)) && !PPSSPP_PLATFORM(UWP)
93 	using namespace Gen;
94 	for (int i = 0; i < 100; i++) {
95 		MOV(32, R(EAX), R(EBX));
96 		RET();
97 	}
98 #elif PPSSPP_ARCH(ARM)
99 	BKPT(0);
100 	BKPT(0);
101 #endif
102 }
103 
Clear()104 void SamplerJitCache::Clear() {
105 	ClearCodeSpace(0);
106 	cache_.clear();
107 	addresses_.clear();
108 }
109 
ComputeSamplerID(SamplerID * id_out,bool linear)110 void SamplerJitCache::ComputeSamplerID(SamplerID *id_out, bool linear) {
111 	SamplerID id{};
112 
113 	id.texfmt = gstate.getTextureFormat();
114 	id.swizzle = gstate.isTextureSwizzled();
115 	// Only CLUT4 can use separate CLUTs per mimap.
116 	id.useSharedClut = gstate.getTextureFormat() != GE_TFMT_CLUT4 || !gstate.isMipmapEnabled() || gstate.isClutSharedForMipmaps();
117 	if (gstate.isTextureFormatIndexed()) {
118 		id.clutfmt = gstate.getClutPaletteFormat();
119 		id.hasClutMask = gstate.getClutIndexMask() != 0xFF;
120 		id.hasClutShift = gstate.getClutIndexShift() != 0;
121 		id.hasClutOffset = gstate.getClutIndexStartPos() != 0;
122 	}
123 	id.linear = linear;
124 	int maxLevel = gstate.isMipmapEnabled() ? gstate.getTextureMaxLevel() : 0;
125 	for (int i = 0; i <= maxLevel; ++i) {
126 		if (gstate.getTextureAddress(i) == 0) {
127 			id.hasInvalidPtr = true;
128 		}
129 	}
130 
131 	*id_out = id;
132 }
133 
DescribeSamplerID(const SamplerID & id)134 std::string SamplerJitCache::DescribeSamplerID(const SamplerID &id) {
135 	std::string name;
136 	switch ((GETextureFormat)id.texfmt) {
137 	case GE_TFMT_5650: name = "5650"; break;
138 	case GE_TFMT_5551: name = "5551"; break;
139 	case GE_TFMT_4444: name = "4444"; break;
140 	case GE_TFMT_8888: name = "8888"; break;
141 	case GE_TFMT_CLUT4: name = "CLUT4"; break;
142 	case GE_TFMT_CLUT8: name = "CLUT8"; break;
143 	case GE_TFMT_CLUT16: name = "CLUT16"; break;
144 	case GE_TFMT_CLUT32: name = "CLUT32"; break;
145 	case GE_TFMT_DXT1: name = "DXT1"; break;
146 	case GE_TFMT_DXT3: name = "DXT3"; break;
147 	case GE_TFMT_DXT5: name = "DXT5"; break;
148 	}
149 	switch ((GEPaletteFormat)id.clutfmt) {
150 	case GE_CMODE_16BIT_BGR5650:
151 		switch ((GETextureFormat)id.texfmt) {
152 		case GE_TFMT_CLUT4:
153 		case GE_TFMT_CLUT8:
154 		case GE_TFMT_CLUT16:
155 		case GE_TFMT_CLUT32:
156 			name += ":C5650";
157 			break;
158 		default:
159 			// Ignore 0 clutfmt when no clut.
160 			break;
161 		}
162 		break;
163 	case GE_CMODE_16BIT_ABGR5551: name += ":C5551"; break;
164 	case GE_CMODE_16BIT_ABGR4444: name += ":C4444"; break;
165 	case GE_CMODE_32BIT_ABGR8888: name += ":C8888"; break;
166 	}
167 	if (id.swizzle) {
168 		name += ":SWZ";
169 	}
170 	if (!id.useSharedClut) {
171 		name += ":MIP";
172 	}
173 	if (id.hasInvalidPtr) {
174 		name += ":INV";
175 	}
176 	if (id.hasClutMask) {
177 		name += ":CMASK";
178 	}
179 	if (id.hasClutShift) {
180 		name += ":CSHF";
181 	}
182 	if (id.hasClutOffset) {
183 		name += ":COFF";
184 	}
185 	if (id.linear) {
186 		name += ":LERP";
187 	}
188 	return name;
189 }
190 
DescribeCodePtr(const u8 * ptr)191 std::string SamplerJitCache::DescribeCodePtr(const u8 *ptr) {
192 	ptrdiff_t dist = 0x7FFFFFFF;
193 	SamplerID found{};
194 	for (const auto &it : addresses_) {
195 		ptrdiff_t it_dist = ptr - it.second;
196 		if (it_dist >= 0 && it_dist < dist) {
197 			found = it.first;
198 			dist = it_dist;
199 		}
200 	}
201 
202 	return DescribeSamplerID(found);
203 }
204 
GetNearest(const SamplerID & id)205 NearestFunc SamplerJitCache::GetNearest(const SamplerID &id) {
206 	std::lock_guard<std::mutex> guard(jitCacheLock);
207 
208 	auto it = cache_.find(id);
209 	if (it != cache_.end()) {
210 		return it->second;
211 	}
212 
213 	// TODO: What should be the min size?  Can we even hit this?
214 	if (GetSpaceLeft() < 16384) {
215 		Clear();
216 	}
217 
218 #if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)
219 	addresses_[id] = GetCodePointer();
220 	NearestFunc func = Compile(id);
221 	cache_[id] = func;
222 	return func;
223 #else
224 	return nullptr;
225 #endif
226 }
227 
GetLinear(const SamplerID & id)228 LinearFunc SamplerJitCache::GetLinear(const SamplerID &id) {
229 	std::lock_guard<std::mutex> guard(jitCacheLock);
230 
231 	auto it = cache_.find(id);
232 	if (it != cache_.end()) {
233 		return (LinearFunc)it->second;
234 	}
235 
236 	// TODO: What should be the min size?  Can we even hit this?
237 	if (GetSpaceLeft() < 16384) {
238 		Clear();
239 	}
240 
241 #if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)
242 	addresses_[id] = GetCodePointer();
243 	LinearFunc func = CompileLinear(id);
244 	cache_[id] = (NearestFunc)func;
245 	return func;
246 #else
247 	return nullptr;
248 #endif
249 }
250 
251 template <unsigned int texel_size_bits>
GetPixelDataOffset(unsigned int row_pitch_pixels,unsigned int u,unsigned int v)252 static inline int GetPixelDataOffset(unsigned int row_pitch_pixels, unsigned int u, unsigned int v)
253 {
254 	if (!gstate.isTextureSwizzled())
255 		return (v * (row_pitch_pixels * texel_size_bits >> 3)) + (u * texel_size_bits >> 3);
256 
257 	const int tile_size_bits = 32;
258 	const int tiles_in_block_horizontal = 4;
259 	const int tiles_in_block_vertical = 8;
260 
261 	int texels_per_tile = tile_size_bits / texel_size_bits;
262 	int tile_u = u / texels_per_tile;
263 	int tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
264 	// TODO: not sure if the *texel_size_bits/8 factor is correct
265 					(v / tiles_in_block_vertical) * ((row_pitch_pixels*texel_size_bits/(tile_size_bits))*tiles_in_block_vertical) +
266 					(tile_u % tiles_in_block_horizontal) +
267 					(tile_u / tiles_in_block_horizontal) * (tiles_in_block_horizontal*tiles_in_block_vertical);
268 
269 	return tile_idx * (tile_size_bits / 8) + ((u % texels_per_tile) * texel_size_bits) / 8;
270 }
271 
LookupColor(unsigned int index,unsigned int level)272 static inline u32 LookupColor(unsigned int index, unsigned int level)
273 {
274 	const bool mipmapShareClut = gstate.isClutSharedForMipmaps();
275 	const int clutSharingOffset = mipmapShareClut ? 0 : level * 16;
276 
277 	switch (gstate.getClutPaletteFormat()) {
278 	case GE_CMODE_16BIT_BGR5650:
279 		return RGB565ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
280 
281 	case GE_CMODE_16BIT_ABGR5551:
282 		return RGBA5551ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
283 
284 	case GE_CMODE_16BIT_ABGR4444:
285 		return RGBA4444ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
286 
287 	case GE_CMODE_32BIT_ABGR8888:
288 		return clut[index + clutSharingOffset];
289 
290 	default:
291 		ERROR_LOG_REPORT(G3D, "Software: Unsupported palette format: %x", gstate.getClutPaletteFormat());
292 		return 0;
293 	}
294 }
295 
296 struct Nearest4 {
297 	alignas(16) u32 v[4];
298 
operator u32Sampler::Nearest4299 	operator u32() const {
300 		return v[0];
301 	}
302 };
303 
304 template <int N>
SampleNearest(int u[N],int v[N],const u8 * srcptr,int texbufw,int level)305 inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level)
306 {
307 	Nearest4 res;
308 	if (!srcptr) {
309 		memset(res.v, 0, sizeof(res.v));
310 		return res;
311 	}
312 
313 	GETextureFormat texfmt = gstate.getTextureFormat();
314 
315 	// TODO: Should probably check if textures are aligned properly...
316 
317 	switch (texfmt) {
318 	case GE_TFMT_4444:
319 		for (int i = 0; i < N; ++i) {
320 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
321 			res.v[i] = RGBA4444ToRGBA8888(*(const u16 *)src);
322 		}
323 		return res;
324 
325 	case GE_TFMT_5551:
326 		for (int i = 0; i < N; ++i) {
327 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
328 			res.v[i] = RGBA5551ToRGBA8888(*(const u16 *)src);
329 		}
330 		return res;
331 
332 	case GE_TFMT_5650:
333 		for (int i = 0; i < N; ++i) {
334 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
335 			res.v[i] = RGB565ToRGBA8888(*(const u16 *)src);
336 		}
337 		return res;
338 
339 	case GE_TFMT_8888:
340 		for (int i = 0; i < N; ++i) {
341 			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i]);
342 			res.v[i] = *(const u32 *)src;
343 		}
344 		return res;
345 
346 	case GE_TFMT_CLUT32:
347 		for (int i = 0; i < N; ++i) {
348 			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i]);
349 			u32 val = src[0] + (src[1] << 8) + (src[2] << 16) + (src[3] << 24);
350 			res.v[i] = LookupColor(gstate.transformClutIndex(val), 0);
351 		}
352 		return res;
353 
354 	case GE_TFMT_CLUT16:
355 		for (int i = 0; i < N; ++i) {
356 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
357 			u16 val = src[0] + (src[1] << 8);
358 			res.v[i] = LookupColor(gstate.transformClutIndex(val), 0);
359 		}
360 		return res;
361 
362 	case GE_TFMT_CLUT8:
363 		for (int i = 0; i < N; ++i) {
364 			const u8 *src = srcptr + GetPixelDataOffset<8>(texbufw, u[i], v[i]);
365 			u8 val = *src;
366 			res.v[i] = LookupColor(gstate.transformClutIndex(val), 0);
367 		}
368 		return res;
369 
370 	case GE_TFMT_CLUT4:
371 		for (int i = 0; i < N; ++i) {
372 			const u8 *src = srcptr + GetPixelDataOffset<4>(texbufw, u[i], v[i]);
373 			u8 val = (u[i] & 1) ? (src[0] >> 4) : (src[0] & 0xF);
374 			// Only CLUT4 uses separate mipmap palettes.
375 			res.v[i] = LookupColor(gstate.transformClutIndex(val), level);
376 		}
377 		return res;
378 
379 	case GE_TFMT_DXT1:
380 		for (int i = 0; i < N; ++i) {
381 			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
382 			res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4);
383 		}
384 		return res;
385 
386 	case GE_TFMT_DXT3:
387 		for (int i = 0; i < N; ++i) {
388 			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
389 			res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4);
390 		}
391 		return res;
392 
393 	case GE_TFMT_DXT5:
394 		for (int i = 0; i < N; ++i) {
395 			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
396 			res.v[i] = GetDXT5Texel(block, u[i] % 4, v[i] % 4);
397 		}
398 		return res;
399 
400 	default:
401 		ERROR_LOG_REPORT(G3D, "Software: Unsupported texture format: %x", texfmt);
402 		memset(res.v, 0, sizeof(res.v));
403 		return res;
404 	}
405 }
406 
SampleNearest(int u,int v,const u8 * tptr,int bufw,int level)407 static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
408 	return SampleNearest<1>(&u, &v, tptr, bufw, level);
409 }
410 
SampleLinear(int u[4],int v[4],int frac_u,int frac_v,const u8 * tptr,int bufw,int texlevel)411 static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
412 	Nearest4 c = SampleNearest<4>(u, v, tptr, bufw, texlevel);
413 
414 	Vec4<int> texcolor_tl = Vec4<int>::FromRGBA(c.v[0]);
415 	Vec4<int> texcolor_tr = Vec4<int>::FromRGBA(c.v[1]);
416 	Vec4<int> texcolor_bl = Vec4<int>::FromRGBA(c.v[2]);
417 	Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
418 	Vec4<int> t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u;
419 	Vec4<int> b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u;
420 	return ((t * (0x100 - frac_v) + b * frac_v) / (256 * 256)).ToRGBA();
421 }
422 
423 };
424