1 // Copyright (c) 2012- PPSSPP Project.
2
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 // GNU General Public License 2.0 for more details.
11
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18 #include <algorithm>
19 #include <cstdlib>
20 #include <cstring>
21 #include <cmath>
22
23 #include "GPU/Common/TextureScalerCommon.h"
24
25 #include "Core/Config.h"
26 #include "Common/Common.h"
27 #include "Common/Log.h"
28 #include "Common/CommonFuncs.h"
29 #include "Common/Thread/ParallelLoop.h"
30 #include "Core/ThreadPools.h"
31 #include "Common/CPUDetect.h"
32 #include "ext/xbrz/xbrz.h"
33
34 #if _M_SSE >= 0x401
35 #include <smmintrin.h>
36 #endif
37
38 // Report the time and throughput for each larger scaling operation in the log
39 //#define SCALING_MEASURE_TIME
40
41 //#define DEBUG_SCALER_OUTPUT
42
43 #ifdef SCALING_MEASURE_TIME
44 #include "Common/TimeUtil.h"
45 #endif
46
47 /////////////////////////////////////// Helper Functions (mostly math for parallelization)
48
49 namespace {
50 //////////////////////////////////////////////////////////////////// Various image processing
51
// Channel accessors for a packed 32-bit pixel: R is bits 0-7, G bits 8-15,
// B bits 16-23 and A bits 24-31. The argument is parenthesized so that a
// compound expression (e.g. `a | b`) is shifted and masked as a whole.
#define R(_col) (((_col) >>  0) & 0xFF)
#define G(_col) (((_col) >>  8) & 0xFF)
#define B(_col) (((_col) >> 16) & 0xFF)
#define A(_col) (((_col) >> 24) & 0xFF)

// Sum of the absolute per-channel differences between two packed pixels.
// Each channel is converted to int before subtracting so the difference is signed.
#define DISTANCE(_p1, _p2) ( \
	abs(static_cast<int>(R(_p1)) - static_cast<int>(R(_p2))) + \
	abs(static_cast<int>(G(_p1)) - static_cast<int>(G(_p2))) + \
	abs(static_cast<int>(B(_p1)) - static_cast<int>(B(_p2))) + \
	abs(static_cast<int>(A(_p1)) - static_cast<int>(A(_p2))) )

// Per-channel weighted blend of two packed pixels. _factors is a two-element
// array: [0] weights _p0 and [1] weights _p1, on a 0..255 scale (the weighted
// sum is divided by 255). The whole expansion is wrapped in parentheses so it
// can be used safely inside a larger expression.
// this is sadly much faster than an inline function with a loop, at least in VC10
#define MIX_PIXELS(_p0, _p1, _factors) ( \
	( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 <<  0 ) | \
	( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 <<  8 ) | \
	( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
	( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 ) )

// Edge length, in pixels, of the tiles the image loops below iterate over.
#define BLOCK_SIZE 32
68
69 // 3x3 convolution with Neumann boundary conditions, parallelizable
70 // quite slow, could be sped up a lot
71 // especially handling of separable kernels
convolve3x3(u32 * data,u32 * out,const int kernel[3][3],int width,int height,int l,int u)72 void convolve3x3(u32* data, u32* out, const int kernel[3][3], int width, int height, int l, int u) {
73 for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
74 for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
75 for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
76 for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
77 int val = 0;
78 for (int yoff = -1; yoff <= 1; ++yoff) {
79 int yy = std::max(std::min(y + yoff, height - 1), 0);
80 for (int xoff = -1; xoff <= 1; ++xoff) {
81 int xx = std::max(std::min(x + xoff, width - 1), 0);
82 val += data[yy*width + xx] * kernel[yoff + 1][xoff + 1];
83 }
84 }
85 out[y*width + x] = abs(val);
86 }
87 }
88 }
89 }
90 }
91
92 // deposterization: smoothes posterized gradients from low-color-depth (e.g. 444, 565, compressed) sources
deposterizeH(u32 * data,u32 * out,int w,int l,int u)93 void deposterizeH(u32* data, u32* out, int w, int l, int u) {
94 static const int T = 8;
95 for (int y = l; y < u; ++y) {
96 for (int x = 0; x < w; ++x) {
97 int inpos = y*w + x;
98 u32 center = data[inpos];
99 if (x == 0 || x == w - 1) {
100 out[y*w + x] = center;
101 continue;
102 }
103 u32 left = data[inpos - 1];
104 u32 right = data[inpos + 1];
105 out[y*w + x] = 0;
106 for (int c = 0; c < 4; ++c) {
107 u8 lc = ((left >> c * 8) & 0xFF);
108 u8 cc = ((center >> c * 8) & 0xFF);
109 u8 rc = ((right >> c * 8) & 0xFF);
110 if ((lc != rc) && ((lc == cc && abs((int)((int)rc) - cc) <= T) || (rc == cc && abs((int)((int)lc) - cc) <= T))) {
111 // blend this component
112 out[y*w + x] |= ((rc + lc) / 2) << (c * 8);
113 } else {
114 // no change for this component
115 out[y*w + x] |= cc << (c * 8);
116 }
117 }
118 }
119 }
120 }
deposterizeV(u32 * data,u32 * out,int w,int h,int l,int u)121 void deposterizeV(u32* data, u32* out, int w, int h, int l, int u) {
122 static const int T = 8;
123 for (int xb = 0; xb < w / BLOCK_SIZE + 1; ++xb) {
124 for (int y = l; y < u; ++y) {
125 for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w; ++x) {
126 u32 center = data[y * w + x];
127 if (y == 0 || y == h - 1) {
128 out[y*w + x] = center;
129 continue;
130 }
131 u32 upper = data[(y - 1) * w + x];
132 u32 lower = data[(y + 1) * w + x];
133 out[y*w + x] = 0;
134 for (int c = 0; c < 4; ++c) {
135 u8 uc = ((upper >> c * 8) & 0xFF);
136 u8 cc = ((center >> c * 8) & 0xFF);
137 u8 lc = ((lower >> c * 8) & 0xFF);
138 if ((uc != lc) && ((uc == cc && abs((int)((int)lc) - cc) <= T) || (lc == cc && abs((int)((int)uc) - cc) <= T))) {
139 // blend this component
140 out[y*w + x] |= ((lc + uc) / 2) << (c * 8);
141 } else {
142 // no change for this component
143 out[y*w + x] |= cc << (c * 8);
144 }
145 }
146 }
147 }
148 }
149 }
150
151 // generates a distance mask value for each pixel in data
152 // higher values -> larger distance to the surrounding pixels
generateDistanceMask(u32 * data,u32 * out,int width,int height,int l,int u)153 void generateDistanceMask(u32* data, u32* out, int width, int height, int l, int u) {
154 for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
155 for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
156 for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
157 for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
158 const u32 center = data[y*width + x];
159 u32 dist = 0;
160 for (int yoff = -1; yoff <= 1; ++yoff) {
161 int yy = y + yoff;
162 if (yy == height || yy == -1) {
163 dist += 1200; // assume distance at borders, usually makes for better result
164 continue;
165 }
166 for (int xoff = -1; xoff <= 1; ++xoff) {
167 if (yoff == 0 && xoff == 0) continue;
168 int xx = x + xoff;
169 if (xx == width || xx == -1) {
170 dist += 400; // assume distance at borders, usually makes for better result
171 continue;
172 }
173 dist += DISTANCE(data[yy*width + xx], center);
174 }
175 }
176 out[y*width + x] = dist;
177 }
178 }
179 }
180 }
181 }
182
183 // mix two images based on a mask
mix(u32 * data,u32 * source,u32 * mask,u32 maskmax,int width,int l,int u)184 void mix(u32* data, u32* source, u32* mask, u32 maskmax, int width, int l, int u) {
185 for (int y = l; y < u; ++y) {
186 for (int x = 0; x < width; ++x) {
187 int pos = y*width + x;
188 u8 mixFactors[2] = { 0, static_cast<u8>((std::min(mask[pos], maskmax) * 255) / maskmax) };
189 mixFactors[0] = 255 - mixFactors[1];
190 data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors);
191 if (A(source[pos]) == 0) data[pos] = data[pos] & 0x00FFFFFF; // xBRZ always does a better job with hard alpha
192 }
193 }
194 }
195
196 //////////////////////////////////////////////////////////////////// Bicubic scaling
197
// generate the value of a Mitchell-Netravali scaling spline at distance x, with parameters B and C
// B=1 C=0   : cubic B spline (very smooth)
// B=C=1/3   : recommended for general upscaling
// B=0 C=1/2 : Catmull-Rom spline (sharp, ringing)
// see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics"
//
// The kernel is symmetric, so it is evaluated on |x|; negative distances give the
// same value as their positive counterpart. Returns 0 for |x| >= 2.
inline float mitchell(float x, float B, float C) {
	const float ax = std::fabs(x);
	if (ax >= 2.0f) return 0.0f;
	const float ax2 = ax*ax;
	const float ax3 = ax2*ax;
	// outer lobe, 1 <= |x| < 2
	if (ax >= 1.0f) return ((-B - 6 * C)*ax3 + (6 * B + 30 * C)*ax2 + (-12 * B - 48 * C)*ax + (8 * B + 24 * C)) / 6.0f;
	// inner lobe, |x| < 1
	return ((12 - 9 * B - 6 * C)*ax3 + (-18 + 12 * B + 6 * C)*ax2 + (6 - 2 * B)) / 6.0f;
}
209
210 // arrays for pre-calculating weights and sums (~20KB)
211 // Dimensions:
212 // 0: 0 = BSpline, 1 = mitchell
213 // 2: 2-5x scaling
214 // 2,3: 5x5 generated pixels
215 // 4,5: 5x5 pixels sampled from
216 float bicubicWeights[2][4][5][5][5][5];
217 float bicubicInvSums[2][4][5][5];
218
219 // initialize pre-computed weights array
initBicubicWeights()220 void initBicubicWeights() {
221 float B[2] = { 1.0f, 0.334f };
222 float C[2] = { 0.0f, 0.334f };
223 for (int type = 0; type < 2; ++type) {
224 for (int factor = 2; factor <= 5; ++factor) {
225 for (int x = 0; x < factor; ++x) {
226 for (int y = 0; y < factor; ++y) {
227 float sum = 0.0f;
228 for (int sx = -2; sx <= 2; ++sx) {
229 for (int sy = -2; sy <= 2; ++sy) {
230 float dx = (x + 0.5f) / factor - (sx + 0.5f);
231 float dy = (y + 0.5f) / factor - (sy + 0.5f);
232 float dist = sqrt(dx*dx + dy*dy);
233 float weight = mitchell(dist, B[type], C[type]);
234 bicubicWeights[type][factor - 2][x][y][sx + 2][sy + 2] = weight;
235 sum += weight;
236 }
237 }
238 bicubicInvSums[type][factor - 2][x][y] = 1.0f / sum;
239 }
240 }
241 }
242 }
243 }
244
245 // perform bicubic scaling by factor f, with precomputed spline type T
246 template<int f, int T>
scaleBicubicT(u32 * data,u32 * out,int w,int h,int l,int u)247 void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
248 int outw = w*f;
249 for (int yb = 0; yb < (u - l)*f / BLOCK_SIZE + 1; ++yb) {
250 for (int xb = 0; xb < w*f / BLOCK_SIZE + 1; ++xb) {
251 for (int y = l*f + yb*BLOCK_SIZE; y < l*f + (yb + 1)*BLOCK_SIZE && y < u*f; ++y) {
252 for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w*f; ++x) {
253 float r = 0.0f, g = 0.0f, b = 0.0f, a = 0.0f;
254 int cx = x / f, cy = y / f;
255 // sample supporting pixels in original image
256 for (int sx = -2; sx <= 2; ++sx) {
257 for (int sy = -2; sy <= 2; ++sy) {
258 float weight = bicubicWeights[T][f - 2][x%f][y%f][sx + 2][sy + 2];
259 if (weight != 0.0f) {
260 // clamp pixel locations
261 int csy = std::max(std::min(sy + cy, h - 1), 0);
262 int csx = std::max(std::min(sx + cx, w - 1), 0);
263 // sample & add weighted components
264 u32 sample = data[csy*w + csx];
265 r += weight*R(sample);
266 g += weight*G(sample);
267 b += weight*B(sample);
268 a += weight*A(sample);
269 }
270 }
271 }
272 // generate and write result
273 float invSum = bicubicInvSums[T][f - 2][x%f][y%f];
274 int ri = std::min(std::max(static_cast<int>(ceilf(r*invSum)), 0), 255);
275 int gi = std::min(std::max(static_cast<int>(ceilf(g*invSum)), 0), 255);
276 int bi = std::min(std::max(static_cast<int>(ceilf(b*invSum)), 0), 255);
277 int ai = std::min(std::max(static_cast<int>(ceilf(a*invSum)), 0), 255);
278 out[y*outw + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri;
279 }
280 }
281 }
282 }
283 }
#if _M_SSE >= 0x401
// SSE4.1 version of scaleBicubicT: same sampling pattern and weight tables, but
// the four channels of a pixel are accumulated together in one vector register.
// Writes output rows [l*f, u*f) of the (w*f)-wide image `out`.
template<int f, int T>
void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
	const int outw = w*f;
	const int rowBlocks = (u - l)*f / BLOCK_SIZE + 1;
	const int colBlocks = outw / BLOCK_SIZE + 1;
	for (int by = 0; by < rowBlocks; ++by) {
		const int yBegin = l*f + by*BLOCK_SIZE;
		const int yEnd = std::min(yBegin + BLOCK_SIZE, u*f);
		for (int bx = 0; bx < colBlocks; ++bx) {
			const int xBegin = bx*BLOCK_SIZE;
			const int xEnd = std::min(xBegin + BLOCK_SIZE, outw);
			for (int y = yBegin; y < yEnd; ++y) {
				for (int x = xBegin; x < xEnd; ++x) {
					// source pixel containing this output pixel, and the position inside it
					const int cx = x / f, cy = y / f;
					const int px = x % f, py = y % f;
					__m128 sum = _mm_setzero_ps();
					// sample supporting pixels in original image
					for (int sx = -2; sx <= 2; ++sx) {
						for (int sy = -2; sy <= 2; ++sy) {
							const float weight = bicubicWeights[T][f - 2][px][py][sx + 2][sy + 2];
							if (weight == 0.0f) {
								continue;
							}
							// clamp pixel locations
							const int csy = std::max(std::min(sy + cy, h - 1), 0);
							const int csx = std::max(std::min(sx + cx, w - 1), 0);
							// widen the four bytes to four floats, then add the weighted components
							const __m128i texel = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(data[csy*w + csx]));
							const __m128 weighted = _mm_mul_ps(_mm_cvtepi32_ps(texel), _mm_set1_ps(weight));
							sum = _mm_add_ps(sum, weighted);
						}
					}
					// generate and write result: normalize, convert to int, then pack
					// 32 -> 16 -> 8 bits with saturation
					const __m128 normalized = _mm_mul_ps(sum, _mm_set1_ps(bicubicInvSums[T][f - 2][px][py]));
					__m128i pixel = _mm_cvtps_epi32(normalized);
					pixel = _mm_packs_epi32(pixel, pixel);
					pixel = _mm_packus_epi16(pixel, pixel);
					out[y*outw + x] = _mm_cvtsi128_si32(pixel);
				}
			}
		}
	}
}
#endif
322
scaleBicubicBSpline(int factor,u32 * data,u32 * out,int w,int h,int l,int u)323 void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) {
324 #if _M_SSE >= 0x401
325 if (cpu_info.bSSE4_1) {
326 switch (factor) {
327 case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this,
328 case 3: scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
329 case 4: scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
330 case 5: scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u); break; // any of these break statements
331 default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
332 }
333 } else {
334 #endif
335 switch (factor) {
336 case 2: scaleBicubicT<2, 0>(data, out, w, h, l, u); break; // when I first tested this,
337 case 3: scaleBicubicT<3, 0>(data, out, w, h, l, u); break; // it was even slower than I had expected
338 case 4: scaleBicubicT<4, 0>(data, out, w, h, l, u); break; // turns out I had not included
339 case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements
340 default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
341 }
342 #if _M_SSE >= 0x401
343 }
344 #endif
345 }
346
scaleBicubicMitchell(int factor,u32 * data,u32 * out,int w,int h,int l,int u)347 void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) {
348 #if _M_SSE >= 0x401
349 if (cpu_info.bSSE4_1) {
350 switch (factor) {
351 case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break;
352 case 3: scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u); break;
353 case 4: scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u); break;
354 case 5: scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u); break;
355 default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
356 }
357 } else {
358 #endif
359 switch (factor) {
360 case 2: scaleBicubicT<2, 1>(data, out, w, h, l, u); break;
361 case 3: scaleBicubicT<3, 1>(data, out, w, h, l, u); break;
362 case 4: scaleBicubicT<4, 1>(data, out, w, h, l, u); break;
363 case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break;
364 default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
365 }
366 #if _M_SSE >= 0x401
367 }
368 #endif
369 }
370
371 //////////////////////////////////////////////////////////////////// Bilinear scaling
372
373 const static u8 BILINEAR_FACTORS[4][3][2] = {
374 { { 44, 211 }, { 0, 0 }, { 0, 0 } }, // x2
375 { { 64, 191 }, { 0, 255 }, { 0, 0 } }, // x3
376 { { 77, 178 }, { 26, 229 }, { 0, 0 } }, // x4
377 { { 102, 153 }, { 51, 204 }, { 0, 255 } }, // x5
378 };
379 // integral bilinear upscaling by factor f, horizontal part
380 template<int f>
bilinearHt(u32 * data,u32 * out,int w,int l,int u)381 void bilinearHt(u32* data, u32* out, int w, int l, int u) {
382 static_assert(f > 1 && f <= 5, "Bilinear scaling only implemented for factors 2 to 5");
383 int outw = w*f;
384 for (int y = l; y < u; ++y) {
385 for (int x = 0; x < w; ++x) {
386 int inpos = y*w + x;
387 u32 left = data[inpos - (x == 0 ? 0 : 1)];
388 u32 center = data[inpos];
389 u32 right = data[inpos + (x == w - 1 ? 0 : 1)];
390 int i = 0;
391 for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
392 out[y*outw + x*f + i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f - 2][i]);
393 }
394 for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
395 out[y*outw + x*f + i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
396 }
397 }
398 }
399 }
bilinearH(int factor,u32 * data,u32 * out,int w,int l,int u)400 void bilinearH(int factor, u32* data, u32* out, int w, int l, int u) {
401 switch (factor) {
402 case 2: bilinearHt<2>(data, out, w, l, u); break;
403 case 3: bilinearHt<3>(data, out, w, l, u); break;
404 case 4: bilinearHt<4>(data, out, w, l, u); break;
405 case 5: bilinearHt<5>(data, out, w, l, u); break;
406 default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
407 }
408 }
409 // integral bilinear upscaling by factor f, vertical part
410 // gl/gu == global lower and upper bound
411 template<int f>
bilinearVt(u32 * data,u32 * out,int w,int gl,int gu,int l,int u)412 void bilinearVt(u32* data, u32* out, int w, int gl, int gu, int l, int u) {
413 static_assert(f>1 && f <= 5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
414 int outw = w*f;
415 for (int xb = 0; xb < outw / BLOCK_SIZE + 1; ++xb) {
416 for (int y = l; y < u; ++y) {
417 u32 uy = y - (y == gl ? 0 : 1);
418 u32 ly = y + (y == gu - 1 ? 0 : 1);
419 for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < outw; ++x) {
420 u32 upper = data[uy * outw + x];
421 u32 center = data[y * outw + x];
422 u32 lower = data[ly * outw + x];
423 int i = 0;
424 for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
425 out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f - 2][i]);
426 }
427 for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
428 out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
429 }
430 }
431 }
432 }
433 }
bilinearV(int factor,u32 * data,u32 * out,int w,int gl,int gu,int l,int u)434 void bilinearV(int factor, u32* data, u32* out, int w, int gl, int gu, int l, int u) {
435 switch (factor) {
436 case 2: bilinearVt<2>(data, out, w, gl, gu, l, u); break;
437 case 3: bilinearVt<3>(data, out, w, gl, gu, l, u); break;
438 case 4: bilinearVt<4>(data, out, w, gl, gu, l, u); break;
439 case 5: bilinearVt<5>(data, out, w, gl, gu, l, u); break;
440 default: ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
441 }
442 }
443
444 #undef BLOCK_SIZE
445 #undef MIX_PIXELS
446 #undef DISTANCE
447 #undef R
448 #undef G
449 #undef B
450 #undef A
451
#ifdef DEBUG_SCALER_OUTPUT

// used for debugging texture scaling (writing textures to files)
// Running counter that makes every dumped file name unique.
static int g_imgCount = 0;

// Dumps a w x h image with 4 bytes per pixel as a binary PPM ("P6") file named
// "<prefix>NNNN.ppm" in the current directory. Only the first three bytes of
// each pixel are written; the fourth is dropped.
// Logs an error and returns if the file cannot be opened.
void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB
	char fn[32];
	snprintf(fn, sizeof(fn), "%s%04d.ppm", prefix, g_imgCount++);
	FILE *fp = fopen(fn, "wb");
	if (!fp) {
		ERROR_LOG(G3D, "dbgPPM: could not open '%s' for writing", fn);
		return;
	}
	fprintf(fp, "P6\n%d %d\n255\n", w, h);
	for (int j = 0; j < h; ++j) {
		for (int i = 0; i < w; ++i) {
			// red, green, blue
			fwrite(pixels + (j*w + i) * 4, 1, 3, fp);
		}
	}
	fclose(fp);
}

// Dumps a w x h single-channel image as a binary 16-bit PGM ("P5") file named
// "<prefix>NNNN.pgm" in the current directory. Values above 65535 (the largest
// maxval the format allows) are saturated, and each sample is written most
// significant byte first, as the format requires.
// Logs an error and returns if the file cannot be opened.
void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component
	char fn[32];
	snprintf(fn, sizeof(fn), "%s%04d.pgm", prefix, g_imgCount++);
	FILE *fp = fopen(fn, "wb");
	if (!fp) {
		ERROR_LOG(G3D, "dbgPGM: could not open '%s' for writing", fn);
		return;
	}
	fprintf(fp, "P5\n%d %d\n65535\n", w, h);
	for (int j = 0; j < h; ++j) {
		for (int i = 0; i < w; ++i) {
			const u32 value = std::min(pixels[j*w + i], static_cast<u32>(65535));
			const unsigned char sample[2] = {
				static_cast<unsigned char>(value >> 8),
				static_cast<unsigned char>(value & 0xFF),
			};
			fwrite(sample, 1, 2, fp);
		}
	}
	fclose(fp);
}

#endif
486
487 }
488
489 /////////////////////////////////////// Texture Scaler
490
TextureScalerCommon()491 TextureScalerCommon::TextureScalerCommon() {
492 initBicubicWeights();
493 }
494
~TextureScalerCommon()495 TextureScalerCommon::~TextureScalerCommon() {
496 }
497
IsEmptyOrFlat(u32 * data,int pixels,int fmt)498 bool TextureScalerCommon::IsEmptyOrFlat(u32* data, int pixels, int fmt) {
499 int pixelsPerWord = 4 / BytesPerPixel(fmt);
500 u32 ref = data[0];
501 if (pixelsPerWord > 1 && (ref & 0x0000FFFF) != (ref >> 16)) {
502 return false;
503 }
504 for (int i = 0; i < pixels / pixelsPerWord; ++i) {
505 if (data[i] != ref) return false;
506 }
507 return true;
508 }
509
ScaleAlways(u32 * out,u32 * src,u32 & dstFmt,int & width,int & height,int factor)510 void TextureScalerCommon::ScaleAlways(u32 *out, u32 *src, u32 &dstFmt, int &width, int &height, int factor) {
511 if (IsEmptyOrFlat(src, width*height, dstFmt)) {
512 // This means it was a flat texture. Vulkan wants the size up front, so we need to make it happen.
513 u32 pixel;
514 // Since it's flat, one pixel is enough. It might end up pointing to data, though.
515 u32 *pixelPointer = &pixel;
516 ConvertTo8888(dstFmt, src, pixelPointer, 1, 1);
517 if (pixelPointer != &pixel) {
518 pixel = *pixelPointer;
519 }
520
521 dstFmt = Get8888Format();
522 width *= factor;
523 height *= factor;
524
525 // ABCD. If A = D, and AB = CD, then they must all be equal (B = C, etc.)
526 if ((pixel & 0x000000FF) == (pixel >> 24) && (pixel & 0x0000FFFF) == (pixel >> 16)) {
527 memset(out, pixel & 0xFF, width * height * sizeof(u32));
528 } else {
529 // Let's hope this is vectorized.
530 for (int i = 0; i < width * height; ++i) {
531 out[i] = pixel;
532 }
533 }
534 } else {
535 ScaleInto(out, src, dstFmt, width, height, factor);
536 }
537 }
538
ScaleInto(u32 * outputBuf,u32 * src,u32 & dstFmt,int & width,int & height,int factor)539 bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, u32 &dstFmt, int &width, int &height, int factor) {
540 #ifdef SCALING_MEASURE_TIME
541 double t_start = time_now_d();
542 #endif
543
544 bufInput.resize(width*height); // used to store the input image image if it needs to be reformatted
545 u32 *inputBuf = bufInput.data();
546
547 // convert texture to correct format for scaling
548 ConvertTo8888(dstFmt, src, inputBuf, width, height);
549
550 // deposterize
551 if (g_Config.bTexDeposterize) {
552 bufDeposter.resize(width*height);
553 DePosterize(inputBuf, bufDeposter.data(), width, height);
554 inputBuf = bufDeposter.data();
555 }
556
557 // scale
558 switch (g_Config.iTexScalingType) {
559 case XBRZ:
560 ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
561 break;
562 case HYBRID:
563 ScaleHybrid(factor, inputBuf, outputBuf, width, height);
564 break;
565 case BICUBIC:
566 ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
567 break;
568 case HYBRID_BICUBIC:
569 ScaleHybrid(factor, inputBuf, outputBuf, width, height, true);
570 break;
571 default:
572 ERROR_LOG(G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
573 }
574
575 // update values accordingly
576 dstFmt = Get8888Format();
577 width *= factor;
578 height *= factor;
579
580 #ifdef SCALING_MEASURE_TIME
581 if (width*height > 64 * 64 * factor*factor) {
582 double t = time_now_d() - t_start;
583 NOTICE_LOG(G3D, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)",
584 width*height, t, (width*height) / (t * 1000 * 1000));
585 }
586 #endif
587
588 return true;
589 }
590
Scale(u32 * & data,u32 & dstFmt,int & width,int & height,int factor)591 bool TextureScalerCommon::Scale(u32* &data, u32 &dstFmt, int &width, int &height, int factor) {
592 // prevent processing empty or flat textures (this happens a lot in some games)
593 // doesn't hurt the standard case, will be very quick for textures with actual texture
594 if (IsEmptyOrFlat(data, width*height, dstFmt)) {
595 DEBUG_LOG(G3D, "TextureScaler: early exit -- empty/flat texture");
596 return false;
597 }
598
599 bufOutput.resize(width*height*factor*factor); // used to store the upscaled image
600 u32 *outputBuf = bufOutput.data();
601
602 if (ScaleInto(outputBuf, data, dstFmt, width, height, factor)) {
603 data = outputBuf;
604 return true;
605 }
606 return false;
607 }
608
609 const int MIN_LINES_PER_THREAD = 4;
610
ScaleXBRZ(int factor,u32 * source,u32 * dest,int width,int height)611 void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
612 xbrz::ScalerCfg cfg;
613 ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
614 }
615
ScaleBilinear(int factor,u32 * source,u32 * dest,int width,int height)616 void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
617 bufTmp1.resize(width * height * factor);
618 u32 *tmpBuf = bufTmp1.data();
619 ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
620 ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
621 }
622
ScaleBicubicBSpline(int factor,u32 * source,u32 * dest,int width,int height)623 void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
624 ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
625 }
626
ScaleBicubicMitchell(int factor,u32 * source,u32 * dest,int width,int height)627 void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
628 ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
629 }
630
ScaleHybrid(int factor,u32 * source,u32 * dest,int width,int height,bool bicubic)631 void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
632 // Basic algorithm:
633 // 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly
634 // 2) generate 2 scaled images: A - using Bilinear filtering, B - using xBRZ
635 // 3) output = A*C + B*(1-C)
636
637 const static int KERNEL_SPLAT[3][3] = {
638 { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
639 };
640
641 bufTmp1.resize(width*height);
642 bufTmp2.resize(width*height*factor*factor);
643 bufTmp3.resize(width*height*factor*factor);
644
645 ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
646 ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
647 ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
648 // mask C is now in bufTmp3
649
650 ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
651 // xBRZ upscaled source is in bufTmp2
652
653 if (bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
654 else ScaleBilinear(factor, source, dest, width, height);
655 // Upscaled source is in dest
656
657 // Now we can mix it all together
658 // The factor 8192 was found through practical testing on a variety of textures
659 ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
660 }
661
DePosterize(u32 * source,u32 * dest,int width,int height)662 void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
663 bufTmp3.resize(width*height);
664 ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, source, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
665 ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
666 ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeH, dest, bufTmp3.data(), width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
667 ParallelRangeLoop(&g_threadManager,std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
668 }
669