1 // Copyright (c) 2012- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include <algorithm>
19 #include <cstdlib>
20 #include <cstring>
21 #include <cmath>
22 
23 #include "GPU/Common/TextureScalerCommon.h"
24 
25 #include "Core/Config.h"
26 #include "Common/Common.h"
27 #include "Common/Log.h"
28 #include "Common/CommonFuncs.h"
29 #include "Common/Thread/ParallelLoop.h"
30 #include "Core/ThreadPools.h"
31 #include "Common/CPUDetect.h"
32 #include "ext/xbrz/xbrz.h"
33 
34 #if _M_SSE >= 0x401
35 #include <smmintrin.h>
36 #endif
37 
38 // Report the time and throughput for each larger scaling operation in the log
39 //#define SCALING_MEASURE_TIME
40 
41 //#define DEBUG_SCALER_OUTPUT
42 
43 #ifdef SCALING_MEASURE_TIME
44 #include "Common/TimeUtil.h"
45 #endif
46 
47 /////////////////////////////////////// Helper Functions (mostly math for parallelization)
48 
49 namespace {
50 //////////////////////////////////////////////////////////////////// Various image processing
51 
// Channel accessors for a packed 32-bit pixel: R is the lowest byte, A the highest.
// The argument is parenthesized so that any expression (e.g. `a | b`) can be passed safely.
#define R(_col) (((_col) >>  0) & 0xFF)
#define G(_col) (((_col) >>  8) & 0xFF)
#define B(_col) (((_col) >> 16) & 0xFF)
#define A(_col) (((_col) >> 24) & 0xFF)

// Sum of the absolute per-channel differences of two packed pixels (an int in the range 0..1020).
#define DISTANCE(_p1,_p2) ( abs(static_cast<int>(R(_p1)) - static_cast<int>(R(_p2))) + abs(static_cast<int>(G(_p1)) - static_cast<int>(G(_p2))) \
							  + abs(static_cast<int>(B(_p1)) - static_cast<int>(B(_p2))) + abs(static_cast<int>(A(_p1)) - static_cast<int>(A(_p2))) )

// Per-channel weighted mix of two packed pixels: (_p0*_factors[0] + _p1*_factors[1]) / 255.
// The whole expansion is wrapped in parentheses so it can be combined with other operators.
// this is sadly much faster than an inline function with a loop, at least in VC10
#define MIX_PIXELS(_p0, _p1, _factors) \
		( ( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 <<  0 ) | \
		  ( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 <<  8 ) | \
		  ( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
		  ( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 ) )

// Edge length (in pixels) of the tiles the image loops below iterate over.
#define BLOCK_SIZE 32
68 
69 // 3x3 convolution with Neumann boundary conditions, parallelizable
70 // quite slow, could be sped up a lot
71 // especially handling of separable kernels
convolve3x3(u32 * data,u32 * out,const int kernel[3][3],int width,int height,int l,int u)72 void convolve3x3(u32* data, u32* out, const int kernel[3][3], int width, int height, int l, int u) {
73 	for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
74 		for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
75 			for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
76 				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
77 					int val = 0;
78 					for (int yoff = -1; yoff <= 1; ++yoff) {
79 						int yy = std::max(std::min(y + yoff, height - 1), 0);
80 						for (int xoff = -1; xoff <= 1; ++xoff) {
81 							int xx = std::max(std::min(x + xoff, width - 1), 0);
82 							val += data[yy*width + xx] * kernel[yoff + 1][xoff + 1];
83 						}
84 					}
85 					out[y*width + x] = abs(val);
86 				}
87 			}
88 		}
89 	}
90 }
91 
92 // deposterization: smoothes posterized gradients from low-color-depth (e.g. 444, 565, compressed) sources
deposterizeH(u32 * data,u32 * out,int w,int l,int u)93 void deposterizeH(u32* data, u32* out, int w, int l, int u) {
94 	static const int T = 8;
95 	for (int y = l; y < u; ++y) {
96 		for (int x = 0; x < w; ++x) {
97 			int inpos = y*w + x;
98 			u32 center = data[inpos];
99 			if (x == 0 || x == w - 1) {
100 				out[y*w + x] = center;
101 				continue;
102 			}
103 			u32 left = data[inpos - 1];
104 			u32 right = data[inpos + 1];
105 			out[y*w + x] = 0;
106 			for (int c = 0; c < 4; ++c) {
107 				u8 lc = ((left >> c * 8) & 0xFF);
108 				u8 cc = ((center >> c * 8) & 0xFF);
109 				u8 rc = ((right >> c * 8) & 0xFF);
110 				if ((lc != rc) && ((lc == cc && abs((int)((int)rc) - cc) <= T) || (rc == cc && abs((int)((int)lc) - cc) <= T))) {
111 					// blend this component
112 					out[y*w + x] |= ((rc + lc) / 2) << (c * 8);
113 				} else {
114 					// no change for this component
115 					out[y*w + x] |= cc << (c * 8);
116 				}
117 			}
118 		}
119 	}
120 }
deposterizeV(u32 * data,u32 * out,int w,int h,int l,int u)121 void deposterizeV(u32* data, u32* out, int w, int h, int l, int u) {
122 	static const int T = 8;
123 	for (int xb = 0; xb < w / BLOCK_SIZE + 1; ++xb) {
124 		for (int y = l; y < u; ++y) {
125 			for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w; ++x) {
126 				u32 center = data[y    * w + x];
127 				if (y == 0 || y == h - 1) {
128 					out[y*w + x] = center;
129 					continue;
130 				}
131 				u32 upper = data[(y - 1) * w + x];
132 				u32 lower = data[(y + 1) * w + x];
133 				out[y*w + x] = 0;
134 				for (int c = 0; c < 4; ++c) {
135 					u8 uc = ((upper >> c * 8) & 0xFF);
136 					u8 cc = ((center >> c * 8) & 0xFF);
137 					u8 lc = ((lower >> c * 8) & 0xFF);
138 					if ((uc != lc) && ((uc == cc && abs((int)((int)lc) - cc) <= T) || (lc == cc && abs((int)((int)uc) - cc) <= T))) {
139 						// blend this component
140 						out[y*w + x] |= ((lc + uc) / 2) << (c * 8);
141 					} else {
142 						// no change for this component
143 						out[y*w + x] |= cc << (c * 8);
144 					}
145 				}
146 			}
147 		}
148 	}
149 }
150 
151 // generates a distance mask value for each pixel in data
152 // higher values -> larger distance to the surrounding pixels
generateDistanceMask(u32 * data,u32 * out,int width,int height,int l,int u)153 void generateDistanceMask(u32* data, u32* out, int width, int height, int l, int u) {
154 	for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
155 		for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
156 			for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
157 				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
158 					const u32 center = data[y*width + x];
159 					u32 dist = 0;
160 					for (int yoff = -1; yoff <= 1; ++yoff) {
161 						int yy = y + yoff;
162 						if (yy == height || yy == -1) {
163 							dist += 1200; // assume distance at borders, usually makes for better result
164 							continue;
165 						}
166 						for (int xoff = -1; xoff <= 1; ++xoff) {
167 							if (yoff == 0 && xoff == 0) continue;
168 							int xx = x + xoff;
169 							if (xx == width || xx == -1) {
170 								dist += 400; // assume distance at borders, usually makes for better result
171 								continue;
172 							}
173 							dist += DISTANCE(data[yy*width + xx], center);
174 						}
175 					}
176 					out[y*width + x] = dist;
177 				}
178 			}
179 		}
180 	}
181 }
182 
183 // mix two images based on a mask
mix(u32 * data,u32 * source,u32 * mask,u32 maskmax,int width,int l,int u)184 void mix(u32* data, u32* source, u32* mask, u32 maskmax, int width, int l, int u) {
185 	for (int y = l; y < u; ++y) {
186 		for (int x = 0; x < width; ++x) {
187 			int pos = y*width + x;
188 			u8 mixFactors[2] = { 0, static_cast<u8>((std::min(mask[pos], maskmax) * 255) / maskmax) };
189 			mixFactors[0] = 255 - mixFactors[1];
190 			data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors);
191 			if (A(source[pos]) == 0) data[pos] = data[pos] & 0x00FFFFFF; // xBRZ always does a better job with hard alpha
192 		}
193 	}
194 }
195 
196 //////////////////////////////////////////////////////////////////// Bicubic scaling
197 
// generate the value of a Mitchell-Netravali scaling spline at distance x, with parameters B and C
199 // B=1 C=0   : cubic B spline (very smooth)
200 // B=C=1/3   : recommended for general upscaling
201 // B=0 C=1/2 : Catmull-Rom spline (sharp, ringing)
202 // see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics"
// Evaluates the piecewise cubic on |x|, so the result is symmetric around zero.
//   x    : signed distance from the sample position
//   B, C : spline parameters
// Returns 0 for |x| >= 2, the outer polynomial for 1 <= |x| < 2 and the inner one for |x| < 1.
inline float mitchell(float x, float B, float C) {
	const float ax = std::fabs(x);
	if (ax >= 2.0f) return 0.0f;
	// Powers are taken from |x|: feeding the signed value into the odd terms would break
	// the symmetry of the kernel for negative distances.
	const float ax2 = ax*ax;
	const float ax3 = ax2*ax;
	if (ax >= 1.0f) return ((-B - 6 * C)*ax3 + (6 * B + 30 * C)*ax2 + (-12 * B - 48 * C)*ax + (8 * B + 24 * C)) / 6.0f;
	return ((12 - 9 * B - 6 * C)*ax3 + (-18 + 12 * B + 6 * C)*ax2 + (6 - 2 * B)) / 6.0f;
}
209 
// arrays for pre-calculating weights and sums (~20KB)
// Dimensions:
//   0: 0 = BSpline, 1 = mitchell
//   1: 2-5x scaling (index is factor - 2)
// 2,3: 5x5 generated pixels (x, y position inside the scaled-up source pixel)
// 4,5: 5x5 pixels sampled from (source offset + 2 in x and y)
// bicubicInvSums has only the first four dimensions and stores 1 / (sum of the 5x5 weights)
// for that entry. Both tables are filled by initBicubicWeights().
float bicubicWeights[2][4][5][5][5][5];
float bicubicInvSums[2][4][5][5];
218 
219 // initialize pre-computed weights array
initBicubicWeights()220 void initBicubicWeights() {
221 	float B[2] = { 1.0f, 0.334f };
222 	float C[2] = { 0.0f, 0.334f };
223 	for (int type = 0; type < 2; ++type) {
224 		for (int factor = 2; factor <= 5; ++factor) {
225 			for (int x = 0; x < factor; ++x) {
226 				for (int y = 0; y < factor; ++y) {
227 					float sum = 0.0f;
228 					for (int sx = -2; sx <= 2; ++sx) {
229 						for (int sy = -2; sy <= 2; ++sy) {
230 							float dx = (x + 0.5f) / factor - (sx + 0.5f);
231 							float dy = (y + 0.5f) / factor - (sy + 0.5f);
232 							float dist = sqrt(dx*dx + dy*dy);
233 							float weight = mitchell(dist, B[type], C[type]);
234 							bicubicWeights[type][factor - 2][x][y][sx + 2][sy + 2] = weight;
235 							sum += weight;
236 						}
237 					}
238 					bicubicInvSums[type][factor - 2][x][y] = 1.0f / sum;
239 				}
240 			}
241 		}
242 	}
243 }
244 
245 // perform bicubic scaling by factor f, with precomputed spline type T
246 template<int f, int T>
scaleBicubicT(u32 * data,u32 * out,int w,int h,int l,int u)247 void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
248 	int outw = w*f;
249 	for (int yb = 0; yb < (u - l)*f / BLOCK_SIZE + 1; ++yb) {
250 		for (int xb = 0; xb < w*f / BLOCK_SIZE + 1; ++xb) {
251 			for (int y = l*f + yb*BLOCK_SIZE; y < l*f + (yb + 1)*BLOCK_SIZE && y < u*f; ++y) {
252 				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w*f; ++x) {
253 					float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f;
254 					int cx = x / f, cy = y / f;
255 					// sample supporting pixels in original image
256 					for (int sx = -2; sx <= 2; ++sx) {
257 						for (int sy = -2; sy <= 2; ++sy) {
258 							float weight = bicubicWeights[T][f - 2][x%f][y%f][sx + 2][sy + 2];
259 							if (weight != 0.0f) {
260 								// clamp pixel locations
261 								int csy = std::max(std::min(sy + cy, h - 1), 0);
262 								int csx = std::max(std::min(sx + cx, w - 1), 0);
263 								// sample & add weighted components
264 								u32 sample = data[csy*w + csx];
265 								r += weight*R(sample);
266 								g += weight*G(sample);
267 								b += weight*B(sample);
268 								a += weight*A(sample);
269 							}
270 						}
271 					}
272 					// generate and write result
273 					float invSum = bicubicInvSums[T][f - 2][x%f][y%f];
274 					int ri = std::min(std::max(static_cast<int>(ceilf(r*invSum)), 0), 255);
275 					int gi = std::min(std::max(static_cast<int>(ceilf(g*invSum)), 0), 255);
276 					int bi = std::min(std::max(static_cast<int>(ceilf(b*invSum)), 0), 255);
277 					int ai = std::min(std::max(static_cast<int>(ceilf(a*invSum)), 0), 255);
278 					out[y*outw + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri;
279 				}
280 			}
281 		}
282 	}
283 }
284 #if _M_SSE >= 0x401
285 template<int f, int T>
scaleBicubicTSSE41(u32 * data,u32 * out,int w,int h,int l,int u)286 void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
287 	int outw = w*f;
288 	for (int yb = 0; yb < (u - l)*f / BLOCK_SIZE + 1; ++yb) {
289 		for (int xb = 0; xb < w*f / BLOCK_SIZE + 1; ++xb) {
290 			for (int y = l*f + yb*BLOCK_SIZE; y < l*f + (yb + 1)*BLOCK_SIZE && y < u*f; ++y) {
291 				for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w*f; ++x) {
292 					__m128 result = _mm_set1_ps(0.0f);
293 					int cx = x / f, cy = y / f;
294 					// sample supporting pixels in original image
295 					for (int sx = -2; sx <= 2; ++sx) {
296 						for (int sy = -2; sy <= 2; ++sy) {
297 							float weight = bicubicWeights[T][f - 2][x%f][y%f][sx + 2][sy + 2];
298 							if (weight != 0.0f) {
299 								// clamp pixel locations
300 								int csy = std::max(std::min(sy + cy, h - 1), 0);
301 								int csx = std::max(std::min(sx + cx, w - 1), 0);
302 								// sample & add weighted components
303 								__m128i sample = _mm_cvtsi32_si128(data[csy*w + csx]);
304 								sample = _mm_cvtepu8_epi32(sample);
305 								__m128 col = _mm_cvtepi32_ps(sample);
306 								col = _mm_mul_ps(col, _mm_set1_ps(weight));
307 								result = _mm_add_ps(result, col);
308 							}
309 						}
310 					}
311 					// generate and write result
312 					__m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(bicubicInvSums[T][f - 2][x%f][y%f])));
313 					pixel = _mm_packs_epi32(pixel, pixel);
314 					pixel = _mm_packus_epi16(pixel, pixel);
315 					out[y*outw + x] = _mm_cvtsi128_si32(pixel);
316 				}
317 			}
318 		}
319 	}
320 }
321 #endif
322 
scaleBicubicBSpline(int factor,u32 * data,u32 * out,int w,int h,int l,int u)323 void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) {
324 #if _M_SSE >= 0x401
325 	if (cpu_info.bSSE4_1) {
326 		switch (factor) {
327 		case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this,
328 		case 3: scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
329 		case 4: scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
330 		case 5: scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u); break; // any of these break statements
331 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
332 		}
333 	} else {
334 #endif
335 		switch (factor) {
336 		case 2: scaleBicubicT<2, 0>(data, out, w, h, l, u); break; // when I first tested this,
337 		case 3: scaleBicubicT<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
338 		case 4: scaleBicubicT<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
339 		case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements
340 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
341 		}
342 #if _M_SSE >= 0x401
343 	}
344 #endif
345 }
346 
scaleBicubicMitchell(int factor,u32 * data,u32 * out,int w,int h,int l,int u)347 void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) {
348 #if _M_SSE >= 0x401
349 	if (cpu_info.bSSE4_1) {
350 		switch (factor) {
351 		case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break;
352 		case 3: scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u); break;
353 		case 4: scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u); break;
354 		case 5: scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u); break;
355 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
356 		}
357 	} else {
358 #endif
359 		switch (factor) {
360 		case 2: scaleBicubicT<2, 1>(data, out, w, h, l, u); break;
361 		case 3: scaleBicubicT<3, 1>(data, out, w, h, l, u); break;
362 		case 4: scaleBicubicT<4, 1>(data, out, w, h, l, u); break;
363 		case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break;
364 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
365 		}
366 #if _M_SSE >= 0x401
367 	}
368 #endif
369 }
370 
371 //////////////////////////////////////////////////////////////////// Bilinear scaling
372 
// Mixing weights for the bilinear passes, indexed [factor - 2][i].
// Each pair is handed to MIX_PIXELS as { weight of the neighbouring pixel, weight of the
// center pixel }; every pair that is read sums to 255. The bilinear templates only read
// indices 0 .. f/2 + f%2 - 1 of a row, so the trailing { 0, 0 } entries are never used.
const static u8 BILINEAR_FACTORS[4][3][2] = {
		{ { 44, 211 }, { 0, 0 }, { 0, 0 } }, // x2
		{ { 64, 191 }, { 0, 255 }, { 0, 0 } }, // x3
		{ { 77, 178 }, { 26, 229 }, { 0, 0 } }, // x4
		{ { 102, 153 }, { 51, 204 }, { 0, 255 } }, // x5
};
379 // integral bilinear upscaling by factor f, horizontal part
380 template<int f>
bilinearHt(u32 * data,u32 * out,int w,int l,int u)381 void bilinearHt(u32* data, u32* out, int w, int l, int u) {
382 	static_assert(f > 1 && f <= 5, "Bilinear scaling only implemented for factors 2 to 5");
383 	int outw = w*f;
384 	for (int y = l; y < u; ++y) {
385 		for (int x = 0; x < w; ++x) {
386 			int inpos = y*w + x;
387 			u32 left = data[inpos - (x == 0 ? 0 : 1)];
388 			u32 center = data[inpos];
389 			u32 right = data[inpos + (x == w - 1 ? 0 : 1)];
390 			int i = 0;
391 			for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
392 				out[y*outw + x*f + i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f - 2][i]);
393 			}
394 			for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
395 				out[y*outw + x*f + i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
396 			}
397 		}
398 	}
399 }
bilinearH(int factor,u32 * data,u32 * out,int w,int l,int u)400 void bilinearH(int factor, u32* data, u32* out, int w, int l, int u) {
401 	switch (factor) {
402 	case 2: bilinearHt<2>(data, out, w, l, u); break;
403 	case 3: bilinearHt<3>(data, out, w, l, u); break;
404 	case 4: bilinearHt<4>(data, out, w, l, u); break;
405 	case 5: bilinearHt<5>(data, out, w, l, u); break;
406 	default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
407 	}
408 }
409 // integral bilinear upscaling by factor f, vertical part
410 // gl/gu == global lower and upper bound
411 template<int f>
bilinearVt(u32 * data,u32 * out,int w,int gl,int gu,int l,int u)412 void bilinearVt(u32* data, u32* out, int w, int gl, int gu, int l, int u) {
413 	static_assert(f>1 && f <= 5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
414 	int outw = w*f;
415 	for (int xb = 0; xb < outw / BLOCK_SIZE + 1; ++xb) {
416 		for (int y = l; y < u; ++y) {
417 			u32 uy = y - (y == gl ? 0 : 1);
418 			u32 ly = y + (y == gu - 1 ? 0 : 1);
419 			for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < outw; ++x) {
420 				u32 upper = data[uy * outw + x];
421 				u32 center = data[y * outw + x];
422 				u32 lower = data[ly * outw + x];
423 				int i = 0;
424 				for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
425 					out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f - 2][i]);
426 				}
427 				for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
428 					out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
429 				}
430 			}
431 		}
432 	}
433 }
bilinearV(int factor,u32 * data,u32 * out,int w,int gl,int gu,int l,int u)434 void bilinearV(int factor, u32* data, u32* out, int w, int gl, int gu, int l, int u) {
435 	switch (factor) {
436 	case 2: bilinearVt<2>(data, out, w, gl, gu, l, u); break;
437 	case 3: bilinearVt<3>(data, out, w, gl, gu, l, u); break;
438 	case 4: bilinearVt<4>(data, out, w, gl, gu, l, u); break;
439 	case 5: bilinearVt<5>(data, out, w, gl, gu, l, u); break;
440 	default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
441 	}
442 }
443 
444 #undef BLOCK_SIZE
445 #undef MIX_PIXELS
446 #undef DISTANCE
447 #undef R
448 #undef G
449 #undef B
450 #undef A
451 
452 #ifdef DEBUG_SCALER_OUTPUT
453 
454 // used for debugging texture scaling (writing textures to files)
455 static int g_imgCount = 0;
dbgPPM(int w,int h,u8 * pixels,const char * prefix="dbg")456 void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB
457 	char fn[32];
458 	snprintf(fn, 32, "%s%04d.ppm", prefix, g_imgCount++);
459 	FILE *fp = fopen(fn, "wb");
460 	fprintf(fp, "P6\n%d %d\n255\n", w, h);
461 	for (int j = 0; j < h; ++j) {
462 		for (int i = 0; i < w; ++i) {
463 			static unsigned char color[3];
464 			color[0] = pixels[(j*w + i) * 4 + 0];  /* red */
465 			color[1] = pixels[(j*w + i) * 4 + 1];  /* green */
466 			color[2] = pixels[(j*w + i) * 4 + 2];  /* blue */
467 			fwrite(color, 1, 3, fp);
468 		}
469 	}
470 	fclose(fp);
471 }
dbgPGM(int w,int h,u32 * pixels,const char * prefix="dbg")472 void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component
473 	char fn[32];
474 	snprintf(fn, 32, "%s%04d.pgm", prefix, g_imgCount++);
475 	FILE *fp = fopen(fn, "wb");
476 	fprintf(fp, "P5\n%d %d\n65536\n", w, h);
477 	for (int j = 0; j < h; ++j) {
478 		for (int i = 0; i < w; ++i) {
479 			fwrite((pixels + (j*w + i)), 1, 2, fp);
480 		}
481 	}
482 	fclose(fp);
483 }
484 
485 #endif
486 
487 }
488 
489 /////////////////////////////////////// Texture Scaler
490 
// Fills the file-level bicubic weight tables. They are recomputed on every construction,
// and each run writes the same values.
TextureScalerCommon::TextureScalerCommon() {
	initBicubicWeights();
}
494 
// Empty body: nothing is released explicitly here.
TextureScalerCommon::~TextureScalerCommon() {
}
497 
IsEmptyOrFlat(u32 * data,int pixels,int fmt)498 bool TextureScalerCommon::IsEmptyOrFlat(u32* data, int pixels, int fmt) {
499 	int pixelsPerWord = 4 / BytesPerPixel(fmt);
500 	u32 ref = data[0];
501 	if (pixelsPerWord > 1 && (ref & 0x0000FFFF) != (ref >> 16)) {
502 		return false;
503 	}
504 	for (int i = 0; i < pixels / pixelsPerWord; ++i) {
505 		if (data[i] != ref) return false;
506 	}
507 	return true;
508 }
509 
ScaleAlways(u32 * out,u32 * src,u32 & dstFmt,int & width,int & height,int factor)510 void TextureScalerCommon::ScaleAlways(u32 *out, u32 *src, u32 &dstFmt, int &width, int &height, int factor) {
511 	if (IsEmptyOrFlat(src, width*height, dstFmt)) {
512 		// This means it was a flat texture.  Vulkan wants the size up front, so we need to make it happen.
513 		u32 pixel;
514 		// Since it's flat, one pixel is enough.  It might end up pointing to data, though.
515 		u32 *pixelPointer = &pixel;
516 		ConvertTo8888(dstFmt, src, pixelPointer, 1, 1);
517 		if (pixelPointer != &pixel) {
518 			pixel = *pixelPointer;
519 		}
520 
521 		dstFmt = Get8888Format();
522 		width *= factor;
523 		height *= factor;
524 
525 		// ABCD.  If A = D, and AB = CD, then they must all be equal (B = C, etc.)
526 		if ((pixel & 0x000000FF) == (pixel >> 24) && (pixel & 0x0000FFFF) == (pixel >> 16)) {
527 			memset(out, pixel & 0xFF, width * height * sizeof(u32));
528 		} else {
529 			// Let's hope this is vectorized.
530 			for (int i = 0; i < width * height; ++i) {
531 				out[i] = pixel;
532 			}
533 		}
534 	} else {
535 		ScaleInto(out, src, dstFmt, width, height, factor);
536 	}
537 }
538 
ScaleInto(u32 * outputBuf,u32 * src,u32 & dstFmt,int & width,int & height,int factor)539 bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, u32 &dstFmt, int &width, int &height, int factor) {
540 #ifdef SCALING_MEASURE_TIME
541 	double t_start = time_now_d();
542 #endif
543 
544 	bufInput.resize(width*height); // used to store the input image image if it needs to be reformatted
545 	u32 *inputBuf = bufInput.data();
546 
547 	// convert texture to correct format for scaling
548 	ConvertTo8888(dstFmt, src, inputBuf, width, height);
549 
550 	// deposterize
551 	if (g_Config.bTexDeposterize) {
552 		bufDeposter.resize(width*height);
553 		DePosterize(inputBuf, bufDeposter.data(), width, height);
554 		inputBuf = bufDeposter.data();
555 	}
556 
557 	// scale
558 	switch (g_Config.iTexScalingType) {
559 	case XBRZ:
560 		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
561 		break;
562 	case HYBRID:
563 		ScaleHybrid(factor, inputBuf, outputBuf, width, height);
564 		break;
565 	case BICUBIC:
566 		ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
567 		break;
568 	case HYBRID_BICUBIC:
569 		ScaleHybrid(factor, inputBuf, outputBuf, width, height, true);
570 		break;
571 	default:
572 		ERROR_LOG(G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
573 	}
574 
575 	// update values accordingly
576 	dstFmt = Get8888Format();
577 	width *= factor;
578 	height *= factor;
579 
580 #ifdef SCALING_MEASURE_TIME
581 	if (width*height > 64 * 64 * factor*factor) {
582 		double t = time_now_d() - t_start;
583 		NOTICE_LOG(G3D, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)",
584 			width*height, t, (width*height) / (t * 1000 * 1000));
585 	}
586 #endif
587 
588 	return true;
589 }
590 
Scale(u32 * & data,u32 & dstFmt,int & width,int & height,int factor)591 bool TextureScalerCommon::Scale(u32* &data, u32 &dstFmt, int &width, int &height, int factor) {
592 	// prevent processing empty or flat textures (this happens a lot in some games)
593 	// doesn't hurt the standard case, will be very quick for textures with actual texture
594 	if (IsEmptyOrFlat(data, width*height, dstFmt)) {
595 		DEBUG_LOG(G3D, "TextureScaler: early exit -- empty/flat texture");
596 		return false;
597 	}
598 
599 	bufOutput.resize(width*height*factor*factor); // used to store the upscaled image
600 	u32 *outputBuf = bufOutput.data();
601 
602 	if (ScaleInto(outputBuf, data, dstFmt, width, height, factor)) {
603 		data = outputBuf;
604 		return true;
605 	}
606 	return false;
607 }
608 
// Passed as the last argument of every ParallelRangeLoop call in this file.
// NOTE(review): presumably the smallest number of rows handed to one worker -- confirm
// against ParallelLoop.h.
const int MIN_LINES_PER_THREAD = 4;
610 
ScaleXBRZ(int factor,u32 * source,u32 * dest,int width,int height)611 void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
612 	xbrz::ScalerCfg cfg;
613 	ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
614 }
615 
ScaleBilinear(int factor,u32 * source,u32 * dest,int width,int height)616 void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
617 	bufTmp1.resize(width * height * factor);
618 	u32 *tmpBuf = bufTmp1.data();
619 	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
620 	ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
621 }
622 
ScaleBicubicBSpline(int factor,u32 * source,u32 * dest,int width,int height)623 void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
624 	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
625 }
626 
ScaleBicubicMitchell(int factor,u32 * source,u32 * dest,int width,int height)627 void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
628 	ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
629 }
630 
ScaleHybrid(int factor,u32 * source,u32 * dest,int width,int height,bool bicubic)631 void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
632 	// Basic algorithm:
633 	// 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly
634 	// 2) generate 2 scaled images: A - using Bilinear filtering, B - using xBRZ
635 	// 3) output = A*C + B*(1-C)
636 
637 	const static int KERNEL_SPLAT[3][3] = {
638 			{ 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
639 	};
640 
641 	bufTmp1.resize(width*height);
642 	bufTmp2.resize(width*height*factor*factor);
643 	bufTmp3.resize(width*height*factor*factor);
644 
645 	ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
646 	ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
647 	ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
648 	// mask C is now in bufTmp3
649 
650 	ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
651 	// xBRZ upscaled source is in bufTmp2
652 
653 	if (bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
654 	else ScaleBilinear(factor, source, dest, width, height);
655 	// Upscaled source is in dest
656 
657 	// Now we can mix it all together
658 	// The factor 8192 was found through practical testing on a variety of textures
659 	ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
660 }
661 
DePosterize(u32 * source,u32 * dest,int width,int height)662 void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
663 	bufTmp3.resize(width*height);
664 	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
665 	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
666 	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
667 	ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
668 }
669