1 /*
2 * This file is part of libplacebo.
3 *
4 * libplacebo is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * libplacebo is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <math.h>
19 #include "shaders.h"
20
// Default debanding parameters: a single iteration with conservative
// threshold / radius / grain values (suitable for typical mild banding)
const struct pl_deband_params pl_deband_default_params = {
    .iterations = 1,
    .threshold = 4.0,
    .radius = 16.0,
    .grain = 6.0,
};
27
src_params(const struct pl_sample_src * src)28 static inline struct pl_tex_params src_params(const struct pl_sample_src *src)
29 {
30 if (src->tex)
31 return src->tex->params;
32
33 return (struct pl_tex_params) {
34 .w = src->tex_w,
35 .h = src->tex_h,
36 };
37 }
38
// Filter behavior requested by the individual sampling helpers. The first
// two values intentionally alias the corresponding pl_tex_sample_mode.
enum filter {
    NEAREST = PL_TEX_SAMPLE_NEAREST,
    LINEAR = PL_TEX_SAMPLE_LINEAR,
    BEST,    // linear if the format supports it, otherwise nearest
    FASTEST, // always nearest (cheapest possible fetch)
};
45
46 // Helper function to compute the src/dst sizes and upscaling ratios
// Helper function to compute the src/dst sizes and upscaling ratios
//
// Binds the source (either a pl_tex or the external sampler2D interface) to
// the shader, requires the matching shader signature / output size, and
// fills in any of the optional out-parameters that are non-NULL:
//   src_tex/pos/size/pt: identifiers for the texture, position, size and
//                        texel size inside the generated GLSL
//   ratio_x/ratio_y:     upscaling ratios (output size / source size)
//   comp_mask:           mask of components both requested and available
//   scale:               linear scaling coefficient (src->scale or 1.0)
//   fn:                  name of the GLSL texture access function
// Returns false (with the shader marked as failed) on error.
static bool setup_src(pl_shader sh, const struct pl_sample_src *src,
                      ident_t *src_tex, ident_t *pos, ident_t *size, ident_t *pt,
                      float *ratio_x, float *ratio_y, uint8_t *comp_mask,
                      float *scale, bool resizeable, const char **fn,
                      enum filter filter)
{
    enum pl_shader_sig sig;
    float src_w, src_h;
    enum pl_tex_sample_mode sample_mode;
    if (src->tex) {
        pl_fmt fmt = src->tex->params.format;
        bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR;
        pl_assert(pl_tex_params_dimension(src->tex->params) == 2);
        sig = PL_SHADER_SIG_NONE;
        src_w = pl_rect_w(src->rect);
        src_h = pl_rect_h(src->rect);
        // Pick the actual sample mode from the requested filter, validating
        // it against the texture format's capabilities
        switch (filter) {
        case FASTEST:
        case NEAREST:
            sample_mode = PL_TEX_SAMPLE_NEAREST;
            break;
        case LINEAR:
            if (!can_linear) {
                SH_FAIL(sh, "Trying to use a shader that requires linear "
                        "sampling with a texture whose format (%s) does not "
                        "support PL_FMT_CAP_LINEAR", fmt->name);
                return false;
            }
            sample_mode = PL_TEX_SAMPLE_LINEAR;
            break;
        case BEST:
            sample_mode = can_linear ? PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST;
            break;
        }
    } else {
        // External sampler interface: the caller must provide the dimensions
        // and the sample mode is fixed by the external sampler itself
        pl_assert(src->tex_w && src->tex_h);
        sig = PL_SHADER_SIG_SAMPLER;
        src_w = src->sampled_w;
        src_h = src->sampled_h;
        if (filter == BEST || filter == FASTEST) {
            sample_mode = src->mode;
        } else {
            sample_mode = (enum pl_tex_sample_mode) filter;
            if (sample_mode != src->mode) {
                SH_FAIL(sh, "Trying to use a shader that requires a different "
                        "filter mode than the external sampler.");
                return false;
            }
        }
    }

    // Fall back to the full texture dimensions if no source rect was given
    src_w = PL_DEF(src_w, src_params(src).w);
    src_h = PL_DEF(src_h, src_params(src).h);
    pl_assert(src_w && src_h);

    // Output size defaults to the (rounded, absolute) source size
    int out_w = PL_DEF(src->new_w, roundf(fabs(src_w)));
    int out_h = PL_DEF(src->new_h, roundf(fabs(src_h)));
    pl_assert(out_w && out_h);

    if (ratio_x)
        *ratio_x = out_w / fabs(src_w);
    if (ratio_y)
        *ratio_y = out_h / fabs(src_h);
    if (scale)
        *scale = PL_DEF(src->scale, 1.0);

    if (comp_mask) {
        uint8_t tex_mask = 0x0Fu;
        if (src->tex) {
            // Mask containing only the number of components in the texture
            tex_mask = (1 << src->tex->params.format->num_components) - 1;
        }

        uint8_t src_mask = src->component_mask;
        if (!src_mask)
            src_mask = (1 << PL_DEF(src->components, 4)) - 1;

        // Only actually sample components that are both requested and
        // available in the texture being sampled
        *comp_mask = tex_mask & src_mask;
    }

    // Resizeable shaders don't fix the output dimensions
    if (resizeable)
        out_w = out_h = 0;
    if (!sh_require(sh, sig, out_w, out_h))
        return false;

    if (src->tex) {
        struct pl_rect2df rect = {
            .x0 = src->rect.x0,
            .y0 = src->rect.y0,
            .x1 = src->rect.x0 + src_w,
            .y1 = src->rect.y0 + src_h,
        };

        if (fn)
            *fn = sh_tex_fn(sh, src->tex->params);

        *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode,
                           "src_tex", &rect, pos, size, pt);
    } else {
        // Sampler interface: emit the size/pt as shader variables instead
        if (size) {
            *size = sh_var(sh, (struct pl_shader_var) {
                .var = pl_var_vec2("tex_size"),
                .data = &(float[2]) { src->tex_w, src->tex_h },
            });
        }

        if (pt) {
            float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h;
            if (src->sampler == PL_SAMPLER_RECT)
                sx = sy = 1.0; // rect samplers use unnormalized coordinates

            *pt = sh_var(sh, (struct pl_shader_var) {
                .var = pl_var_vec2("tex_pt"),
                .data = &(float[2]) { sx, sy },
            });
        }

        if (fn)
            *fn = sh_tex_fn(sh, (struct pl_tex_params) { .w = 1, .h = 1 }); // 2D

        sh->sampler_type = src->sampler;

        // Pick the GLSL sampler prefix ('u'/'s' for integer formats)
        pl_assert(src->format);
        switch (src->format) {
        case PL_FMT_UNKNOWN:
        case PL_FMT_FLOAT:
        case PL_FMT_UNORM:
        case PL_FMT_SNORM: sh->sampler_prefix = ' '; break;
        case PL_FMT_UINT: sh->sampler_prefix = 'u'; break;
        case PL_FMT_SINT: sh->sampler_prefix = 's'; break;
        case PL_FMT_TYPE_COUNT:
            pl_unreachable();
        }

        // These names are hard-coded by the sampler shader signature
        *src_tex = "src_tex";
        *pos = "tex_coord";
    }

    return true;
}
189
// Debands (and optionally grains) the given source. Works by stochastically
// averaging the pixel's surroundings at increasing radii and replacing the
// pixel by that average whenever the difference stays below the threshold.
void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
                      const struct pl_deband_params *params)
{
    float scale;
    ident_t tex, pos, pt;
    const char *fn;
    if (!setup_src(sh, src, &tex, &pos, NULL, &pt, NULL, NULL, NULL, &scale,
                   true, &fn, LINEAR))
        return;

    params = PL_DEF(params, &pl_deband_default_params);
    sh_describe(sh, "debanding");
    GLSL("vec4 color; \n"
         "// pl_shader_deband \n"
         "{ \n");

    // Per-invocation PRNG with persistent state across calls to %s
    ident_t prng, state;
    prng = sh_prng(sh, true, &state);

    GLSL("vec2 pos = %s; \n"
         "vec4 avg, diff; \n"
         "color = %s(%s, pos); \n",
         pos, fn, tex);

    if (params->iterations > 0) {
        // Helper function: Compute a stochastic approximation of the avg color
        // around a pixel, given a specified radius
        ident_t average = sh_fresh(sh, "average");
        GLSLH("vec4 %s(vec2 pos, float range, inout float %s) { \n"
              // Compute a random angle and distance
              " float dist = %s * range; \n"
              " float dir = %s * %f; \n"
              " vec2 o = dist * vec2(cos(dir), sin(dir)); \n"
              // Sample at quarter-turn intervals around the source pixel
              " vec4 sum = vec4(0.0); \n"
              " sum += %s(%s, pos + %s * vec2( o.x, o.y)); \n"
              " sum += %s(%s, pos + %s * vec2(-o.x, o.y)); \n"
              " sum += %s(%s, pos + %s * vec2(-o.x, -o.y)); \n"
              " sum += %s(%s, pos + %s * vec2( o.x, -o.y)); \n"
              // Return the (normalized) average
              " return 0.25 * sum; \n"
              "}\n",
              average, state, prng, prng, M_PI * 2,
              fn, tex, pt, fn, tex, pt, fn, tex, pt, fn, tex, pt);

        ident_t radius = sh_const_float(sh, "radius", params->radius);
        // Threshold is specified in 1/1000th units, relative to the scale
        ident_t threshold = sh_const_float(sh, "threshold",
                                           params->threshold / (1000 * scale));

        // For each iteration, compute the average at a given distance and
        // pick it instead of the color if the difference is below the threshold.
        for (int i = 1; i <= params->iterations; i++) {
            GLSL("avg = %s(pos, %d.0 * %s, %s); \n"
                 "diff = abs(color - avg); \n"
                 "color = mix(avg, color, %s(greaterThan(diff, vec4(%s / %d.0)))); \n",
                 average, i, radius, state, sh_bvec(sh, 4), threshold, i);
        }
    }

    GLSL("color *= vec4(%s);\n", SH_FLOAT(scale));

    // Add some random noise to smooth out residual differences
    if (params->grain > 0) {
        GLSL("vec3 noise = vec3(%s, %s, %s); \n"
             "color.rgb += %s * (noise - vec3(0.5)); \n",
             prng, prng, prng, SH_FLOAT(params->grain / 1000.0));
    }

    GLSL("}\n");
}
260
pl_shader_sample_direct(pl_shader sh,const struct pl_sample_src * src)261 bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src)
262 {
263 float scale;
264 ident_t tex, pos;
265 const char *fn;
266 if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, NULL, &scale,
267 true, &fn, BEST))
268 return false;
269
270 GLSL("// pl_shader_sample_direct \n"
271 "vec4 color = vec4(%s) * %s(%s, %s); \n",
272 SH_FLOAT(scale), fn, tex, pos);
273 return true;
274 }
275
pl_shader_sample_nearest(pl_shader sh,const struct pl_sample_src * src)276 bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src)
277 {
278 float scale;
279 ident_t tex, pos;
280 const char *fn;
281 if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, NULL, &scale,
282 true, &fn, NEAREST))
283 return false;
284
285 sh_describe(sh, "nearest");
286 GLSL("// pl_shader_sample_nearest \n"
287 "vec4 color = vec4(%s) * %s(%s, %s); \n",
288 SH_FLOAT(scale), fn, tex, pos);
289 return true;
290 }
291
pl_shader_sample_bilinear(pl_shader sh,const struct pl_sample_src * src)292 bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src)
293 {
294 float scale;
295 ident_t tex, pos;
296 const char *fn;
297 if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, NULL, &scale,
298 true, &fn, LINEAR))
299 return false;
300
301 sh_describe(sh, "bilinear");
302 GLSL("// pl_shader_sample_bilinear \n"
303 "vec4 color = vec4(%s) * %s(%s, %s); \n",
304 SH_FLOAT(scale), fn, tex, pos);
305 return true;
306 }
307
// Emits GLSL computing the four B-spline weights (and correction offsets)
// for the fractional coordinate `s`, storing the result in a new vec4 `t`.
// `t.xy` ends up holding the two linear-tap offsets, `t.zw` the blend factors.
static void bicubic_calcweights(pl_shader sh, const char *t, const char *s)
{
    // Explanation of how bicubic scaling with only 4 texel fetches is done:
    // http://www.mate.tue.nl/mate/pdfs/10318.pdf
    // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
    GLSL("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s \n"
         " + vec4(1, 0, -0.5, 0.5); \n"
         "%s = %s * %s + vec4(0.0, 0.0, -0.5, 0.5); \n"
         "%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666); \n"
         "%s.xy /= %s.zw; \n"
         "%s.xy += vec2(1.0 + %s, 1.0 - %s); \n",
         t, s,
         t, t, s,
         t, t, s,
         t, t,
         t, s, s);
}
325
// Fast bicubic upscaling using only 4 (bilinear) texel fetches, by exploiting
// the hardware's linear interpolation. Requires PL_FMT_CAP_LINEAR. Not
// appropriate for downscaling (will alias).
bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
{
    ident_t tex, pos, size, pt;
    float rx, ry, scale;
    const char *fn;
    if (!setup_src(sh, src, &tex, &pos, &size, &pt, &rx, &ry, NULL, &scale,
                   true, &fn, LINEAR))
        return false;

    if (rx < 1 || ry < 1) {
        PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
                 "will most likely result in nasty aliasing!");
    }

    sh_describe(sh, "bicubic");
    GLSL("// pl_shader_sample_bicubic \n"
         "vec4 color; \n"
         "{ \n"
         "vec2 pos = %s; \n"
         "vec2 pt = %s; \n"
         "vec2 size = %s; \n"
         "vec2 fcoord = fract(pos * size + vec2(0.5)); \n",
         pos, pt, size);

    // Per-axis B-spline weights / tap offsets for the fractional coordinate
    bicubic_calcweights(sh, "parmx", "fcoord.x");
    bicubic_calcweights(sh, "parmy", "fcoord.y");

    GLSL("vec4 cdelta; \n"
         "cdelta.xz = parmx.rg * vec2(-pt.x, pt.x); \n"
         "cdelta.yw = parmy.rg * vec2(-pt.y, pt.y); \n"
         // first y-interpolation
         "vec4 ar = %s(%s, pos + cdelta.xy); \n"
         "vec4 ag = %s(%s, pos + cdelta.xw); \n"
         "vec4 ab = mix(ag, ar, parmy.b); \n"
         // second y-interpolation
         "vec4 br = %s(%s, pos + cdelta.zy); \n"
         "vec4 bg = %s(%s, pos + cdelta.zw); \n"
         "vec4 aa = mix(bg, br, parmy.b); \n"
         // x-interpolation
         "color = vec4(%s) * mix(aa, ab, parmx.b); \n"
         "} \n",
         fn, tex, fn, tex, fn, tex, fn, tex, SH_FLOAT(scale));

    return true;
}
371
// "Oversample" (aka nearest-neighbour with sub-pixel blending): rounds the
// sampling position to the nearest source pixel, blending linearly only in
// the transition region. `threshold` (0..1) sharpens the transition; values
// above 1 are clamped. Requires PL_FMT_CAP_LINEAR for the blending.
bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
                                 float threshold)
{
    ident_t tex, pos, size, pt;
    float rx, ry, scale;
    const char *fn;
    if (!setup_src(sh, src, &tex, &pos, &size, &pt, &rx, &ry, NULL, &scale,
                   true, &fn, LINEAR))
        return false;

    ident_t ratio = sh_var(sh, (struct pl_shader_var) {
        .var = pl_var_vec2("ratio"),
        .data = &(float[2]) { rx, ry },
    });

    // Round the position to the nearest pixel
    sh_describe(sh, "oversample");
    GLSL("// pl_shader_sample_oversample \n"
         "vec4 color; \n"
         "{ \n"
         "vec2 pt = %s; \n"
         "vec2 pos = %s - vec2(0.5) * pt; \n"
         "vec2 fcoord = fract(pos * %s - vec2(0.5)); \n"
         "vec2 coeff = fcoord * %s; \n",
         pt, pos, size, ratio);

    if (threshold > 0.0) {
        // Narrow the blending region by remapping the coefficient range
        threshold = PL_MIN(threshold, 1.0);
        ident_t thresh = sh_const_float(sh, "threshold", threshold);
        GLSL("coeff = (coeff - %s) / (1.0 - 2.0 * %s); \n",
             thresh, thresh);
    }

    // Compute the right output blend of colors
    GLSL("coeff = clamp(coeff, 0.0, 1.0); \n"
         "pos += (coeff - fcoord) * pt; \n"
         "color = vec4(%s) * %s(%s, pos); \n"
         "} \n",
         SH_FLOAT(scale), fn, tex);

    return true;
}
414
// Check whether a previously generated filter LUT can be reused verbatim for
// the given scaling parameters, so we can skip regenerating it.
static bool filter_compat(pl_filter filter, float inv_scale,
                          int lut_entries, float cutoff,
                          const struct pl_filter_config *params)
{
    if (!filter)
        return false;

    bool same_entries = filter->params.lut_entries == lut_entries;
    // Allow a small tolerance on the scale to avoid spurious regeneration
    bool same_scale = !(fabs(filter->params.filter_scale - inv_scale) > 1e-3);
    bool same_cutoff = filter->params.cutoff == cutoff;

    if (!same_entries || !same_scale || !same_cutoff)
        return false;

    return pl_filter_config_eq(&filter->params.config, params);
}
430
// Subroutine for computing and adding an individual texel contribution
// If `in` is NULL, samples directly
// If `in` is set, takes the pixel from inX[idx] where X is the component,
// `in` is the given identifier, and `idx` must be defined by the caller
static void polar_sample(pl_shader sh, pl_filter filter, const char *fn,
                         ident_t tex, ident_t lut, ident_t cutoff, ident_t radius,
                         int x, int y, uint8_t comp_mask, ident_t in)
{
    // Since we can't know the subpixel position in advance, assume a
    // worst case scenario
    int yy = y > 0 ? y-1 : y;
    int xx = x > 0 ? x-1 : x;
    float dmax = sqrt(xx*xx + yy*yy);
    // Skip samples definitely outside the radius
    if (dmax >= filter->radius_cutoff)
        return;

    GLSL("d = length(vec2(%d.0, %d.0) - fcoord);\n", x, y);
    // Check for samples that might be skippable
    bool maybe_skippable = dmax >= filter->radius_cutoff - M_SQRT2;
    if (maybe_skippable)
        GLSL("if (d < %s) {\n", cutoff);

    // Get the weight for this pixel
    GLSL("w = %s(d * 1.0/%s); \n"
         "wsum += w; \n",
         lut, radius);

    if (in) {
        // Fetch from the caller-provided per-component arrays (e.g. shmem)
        for (uint8_t comps = comp_mask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSL("color[%d] += w * %s%d[idx]; \n", c, in, c);
            comps &= ~(1 << c);
        }
    } else {
        // Sample the texel directly from the source texture
        GLSL("c = %s(%s, base + pt * vec2(%d.0, %d.0)); \n",
             fn, tex, x, y);
        for (uint8_t comps = comp_mask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSL("color[%d] += w * c[%d]; \n", c, c);
            comps &= ~(1 << c);
        }
    }

    if (maybe_skippable)
        GLSL("}\n");
}
478
// Persistent per-sampler state, cached across shader invocations
struct sh_sampler_obj {
    pl_filter filter;     // most recently generated filter kernel
    pl_shader_obj lut;    // LUT object holding the filter weights
    pl_shader_obj pass2; // for pl_shader_sample_ortho
};
484
// Destructor for a sh_sampler_obj: releases all owned resources and resets
// the struct to a clean (zeroed) state.
static void sh_sampler_uninit(pl_gpu gpu, void *ptr)
{
    struct sh_sampler_obj *samp = ptr;

    pl_shader_obj_destroy(&samp->lut);
    pl_shader_obj_destroy(&samp->pass2);
    pl_filter_free(&samp->filter);

    *samp = (struct sh_sampler_obj) {0};
}
493
fill_polar_lut(void * data,const struct sh_lut_params * params)494 static void fill_polar_lut(void *data, const struct sh_lut_params *params)
495 {
496 const struct sh_sampler_obj *obj = params->priv;
497 pl_filter filt = obj->filter;
498
499 pl_assert(params->width == filt->params.lut_entries && params->comps == 1);
500 memcpy(data, filt->weights, params->width * sizeof(float));
501 }
502
// Polar (EWA) resampling. Convolves the source with a radially symmetric
// filter kernel, preferring a compute shader (with shmem-cached texels) when
// available, and falling back to a fragment shader that opportunistically
// uses textureGather to reduce the number of fetches.
bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
                            const struct pl_sample_filter_params *params)
{
    pl_assert(params);
    if (!params->filter.polar) {
        SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?");
        return false;
    }

    bool has_compute = sh_glsl(sh).compute && !params->no_compute;
    has_compute &= sh_glsl(sh).version >= 130; // needed for round()
    if (!src->tex && has_compute) {
        // FIXME: Could maybe solve this by communicating the wbase from
        // invocation 0 to the rest of the workgroup using shmem, which would
        // also allow us to avoid the use of the hacky %s_map below.
        PL_WARN(sh, "Combining pl_shader_sample_polar with the sampler2D "
                "interface prevents the use of compute shaders, which is a "
                "potentially massive performance hit. If you're sure you want "
                "this, set `params.no_compute` to suppress this warning.");
        has_compute = false;
    }

    bool flipped = src->rect.x0 > src->rect.x1 || src->rect.y0 > src->rect.y1;
    if (flipped && has_compute) {
        // FIXME: I'm sure this case could actually be supported with some
        // extra math in the positional calculations, should implement it
        PL_WARN(sh, "Trying to use a flipped src.rect with polar sampling! "
                "This prevents the use of compute shaders, which is a "
                "potentially massive performance hit. If you're really sure you "
                "want this, set `params.no_compute` to suppress this warning.");
        has_compute = false;
    }

    uint8_t comp_mask;
    float rx, ry, scale;
    ident_t src_tex, pos, size, pt;
    const char *fn;
    if (!setup_src(sh, src, &src_tex, &pos, &size, &pt, &rx, &ry, &comp_mask,
                   &scale, false, &fn, FASTEST))
        return false;

    struct sh_sampler_obj *obj;
    obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj,
                 sh_sampler_uninit);
    if (!obj)
        return false;

    // Widen the filter when downscaling (by the inverse of the smaller ratio)
    float inv_scale = 1.0 / PL_MIN(rx, ry);
    inv_scale = PL_MAX(inv_scale, 1.0);

    if (params->no_widening)
        inv_scale = 1.0;

    int lut_entries = PL_DEF(params->lut_entries, 64);
    float cutoff = PL_DEF(params->cutoff, 0.001);
    // Regenerate the filter only if the cached one doesn't match
    bool update = !filter_compat(obj->filter, inv_scale, lut_entries, cutoff,
                                 &params->filter);

    if (update) {
        pl_filter_free(&obj->filter);
        obj->filter = pl_filter_generate(sh->log, &(struct pl_filter_params) {
            .config = params->filter,
            .lut_entries = lut_entries,
            .filter_scale = inv_scale,
            .cutoff = cutoff,
        });

        if (!obj->filter) {
            // This should never happen, but just in case ..
            SH_FAIL(sh, "Failed initializing polar filter!");
            return false;
        }
    }

    sh_describe(sh, "polar scaling");
    GLSL("// pl_shader_sample_polar \n"
         "vec4 color = vec4(0.0); \n"
         "{ \n"
         "vec2 pos = %s, size = %s, pt = %s; \n"
         "vec2 fcoord = fract(pos * size - vec2(0.5)); \n"
         "vec2 base = pos - pt * fcoord; \n"
         "vec2 center = base + pt * vec2(0.5); \n"
         "float w, d, wsum = 0.0; \n"
         "int idx; \n"
         "vec4 c; \n",
         pos, size, pt);

    int bound = ceil(obj->filter->radius_cutoff);
    int offset = bound - 1; // padding top/left
    int padding = offset + bound; // total padding

    // Determined experimentally on modern AMD and Nvidia hardware. 32 is a
    // good tradeoff for the horizontal work group size. Apart from that,
    // just use as many threads as possible.
    const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw;

    // We need to sample everything from base_min to base_max, so make sure
    // we have enough room in shmem
    int iw = (int) ceil(bw / rx) + padding + 1,
        ih = (int) ceil(bh / ry) + padding + 1;

    ident_t in = NULL;
    int num_comps = __builtin_popcount(comp_mask);
    int shmem_req = iw * ih * num_comps * sizeof(float);
    bool is_compute = has_compute && sh_try_compute(sh, bw, bh, false, shmem_req);

    // For compute shaders, which read the input texels primarily from shmem,
    // using a texture-based LUT is better. For the fragment shader fallback
    // code, which is primarily texture bound, the extra cost of LUT
    // interpolation is worth the reduction in texel fetches.
    ident_t lut = sh_lut(sh, &(struct sh_lut_params) {
        .object = &obj->lut,
        .method = is_compute ? SH_LUT_TEXTURE : SH_LUT_AUTO,
        .type = PL_VAR_FLOAT,
        .width = lut_entries,
        .comps = 1,
        .linear = true,
        .update = update,
        .fill = fill_polar_lut,
        .priv = obj,
    });

    if (!lut) {
        SH_FAIL(sh, "Failed initializing polar LUT!");
        return false;
    }

    ident_t cutoff_c = sh_const_float(sh, "radius_cutoff", obj->filter->radius_cutoff);
    ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius);

    if (is_compute) {
        // Compute shader kernel
        GLSL("vec2 wpos = %s_map(gl_WorkGroupID * gl_WorkGroupSize); \n"
             "vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5)); \n"
             "ivec2 rel = ivec2(round((base - wbase) * size)); \n",
             pos);

        ident_t iw_c = sh_const(sh, (struct pl_shader_const) {
            .type = PL_VAR_SINT,
            .compile_time = true,
            .name ="iw",
            .data = &iw,
        });

        ident_t ih_c = sh_const(sh, (struct pl_shader_const) {
            .type = PL_VAR_SINT,
            .compile_time = true,
            .name = "ih",
            .data = &ih,
        });

        // Load all relevant texels into shmem
        GLSL("for (int y = int(gl_LocalInvocationID.y); y < %s; y += %d) { \n"
             "for (int x = int(gl_LocalInvocationID.x); x < %s; x += %d) { \n"
             "c = %s(%s, wbase + pt * vec2(x - %d, y - %d)); \n",
             ih_c, bh, iw_c, bw, fn, src_tex, offset, offset);

        // One shared float array per sampled component
        in = sh_fresh(sh, "in");
        for (uint8_t comps = comp_mask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSLH("shared float %s%d[%s * %s]; \n", in, c, ih_c, iw_c);
            GLSL("%s%d[%s * y + x] = c[%d]; \n", in, c, iw_c, c);
            comps &= ~(1 << c);
        }

        GLSL("}} \n"
             "barrier(); \n");

        // Dispatch the actual samples
        for (int y = 1 - bound; y <= bound; y++) {
            for (int x = 1 - bound; x <= bound; x++) {
                GLSL("idx = %s * rel.y + rel.x + %s * %d + %d; \n",
                     iw_c, iw_c, y + offset, x + offset);
                polar_sample(sh, obj->filter, fn, src_tex, lut, cutoff_c, radius_c,
                             x, y, comp_mask, in);
            }
        }
    } else {
        // Fragment shader sampling
        for (uint8_t comps = comp_mask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSL("vec4 in%d;\n", c);
            comps &= ~(1 << c);
        }

        // For maximum efficiency, we want to use textureGather() if
        // possible, rather than direct sampling. Since this is not
        // always possible/sensible, we need to possibly intermix gathering
        // with regular sampling. This requires keeping track of which
        // pixels in the next row were already gathered by the previous
        // row.
        uint32_t gathered_cur = 0x0, gathered_next = 0x0;
        const float radius2 = PL_SQUARE(obj->filter->radius_cutoff);
        const int base = bound - 1;

        if (base + bound >= 8 * sizeof(gathered_cur)) {
            SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!",
                    obj->filter->radius_cutoff);
            return false;
        }

        for (int y = 1 - bound; y <= bound; y++) {
            for (int x = 1 - bound; x <= bound; x++) {
                // Skip already gathered texels
                uint32_t bit = 1llu << (base + x);
                if (gathered_cur & bit)
                    continue;

                // Using texture gathering is only more efficient than direct
                // sampling in the case where we expect to be able to use all
                // four gathered texels, without having to discard any. So
                // only do it if we suspect it will be a win rather than a
                // loss.
                int xx = x*x, xx1 = (x+1)*(x+1);
                int yy = y*y, yy1 = (y+1)*(y+1);
                bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2;
                use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset;
                use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset;
                use_gather &= !src->tex || src->tex->params.format->gatherable;

                // Gathering from components other than the R channel requires
                // support for GLSL 400, which introduces the overload of
                // textureGather* that allows specifying the component.
                //
                // This is also the minimum requirement if we don't know the
                // texture format capabilities, for the sampler2D interface
                if (comp_mask != 0x1 || !src->tex)
                    use_gather &= sh_glsl(sh).version >= 400;

                if (!use_gather) {
                    // Switch to direct sampling instead
                    polar_sample(sh, obj->filter, fn, src_tex, lut, cutoff_c,
                                 radius_c, x, y, comp_mask, NULL);
                    continue;
                }

                // Gather the four surrounding texels simultaneously
                for (uint8_t comps = comp_mask; comps;) {
                    uint8_t c = __builtin_ctz(comps);
                    if (x || y) {
                        if (c) {
                            GLSL("in%d = textureGatherOffset(%s, center, "
                                 "ivec2(%d, %d), %d);\n",
                                 c, src_tex, x, y, c);
                        } else {
                            GLSL("in0 = textureGatherOffset(%s, center, "
                                 "ivec2(%d, %d));\n", src_tex, x, y);
                        }
                    } else {
                        if (c) {
                            GLSL("in%d = textureGather(%s, center, %d);\n",
                                 c, src_tex, c);
                        } else {
                            GLSL("in0 = textureGather(%s, center);\n", src_tex);
                        }
                    }
                    comps &= ~(1 << c);
                }

                // Mix in all of the points with their weights
                for (int p = 0; p < 4; p++) {
                    // The four texels are gathered counterclockwise starting
                    // from the bottom left
                    static const int xo[4] = {0, 1, 1, 0};
                    static const int yo[4] = {1, 1, 0, 0};
                    if (x+xo[p] > bound || y+yo[p] > bound)
                        continue; // next subpixel

                    GLSL("idx = %d;\n", p);
                    polar_sample(sh, obj->filter, fn, src_tex, lut, cutoff_c,
                                 radius_c, x+xo[p], y+yo[p], comp_mask, "in");
                }

                // Mark the other next row's pixels as already gathered
                gathered_next |= bit | (bit << 1);
                x++; // skip adjacent pixel
            }

            // Prepare for new row
            gathered_cur = gathered_next;
            gathered_next = 0;
        }
    }

    // Normalize by the total weight, apply the scale, and fix up alpha
    GLSL("color = vec4(%s / wsum) * color; \n", SH_FLOAT(scale));
    if (!(comp_mask & (1 << PL_CHANNEL_A)))
        GLSL("color.a = 1.0; \n");

    GLSL("}\n");
    return true;
}
794
fill_ortho_lut(void * data,const struct sh_lut_params * params)795 static void fill_ortho_lut(void *data, const struct sh_lut_params *params)
796 {
797 const struct sh_sampler_obj *obj = params->priv;
798 pl_filter filt = obj->filter;
799 size_t entries = filt->params.lut_entries * filt->row_stride;
800
801 pl_assert(params->width * params->height * params->comps == entries);
802 memcpy(data, filt->weights, entries * sizeof(float));
803 }
804
// One pass of separable (orthogonal) scaling: convolves along a single axis
// (`pass` selects PL_SEP_HORIZ or PL_SEP_VERT), leaving the other axis
// untouched. Callers run this twice (once per axis) for a full 2D scale.
bool pl_shader_sample_ortho(pl_shader sh, int pass,
                            const struct pl_sample_src *src,
                            const struct pl_sample_filter_params *params)
{
    pl_assert(params);
    if (params->filter.polar) {
        SH_FAIL(sh, "Trying to use separated sampling with a polar filter?");
        return false;
    }

    pl_gpu gpu = SH_GPU(sh);
    pl_assert(gpu);

    // Neutralize the src rect / output size along the axis we don't scale
    struct pl_sample_src srcfix = *src;
    switch (pass) {
    case PL_SEP_VERT:
        srcfix.rect.x0 = 0;
        srcfix.rect.x1 = srcfix.new_w = src_params(src).w;
        break;
    case PL_SEP_HORIZ:
        srcfix.rect.y0 = 0;
        srcfix.rect.y1 = srcfix.new_h = src_params(src).h;
        break;
    }

    uint8_t comp_mask;
    float ratio[PL_SEP_PASSES], scale;
    ident_t src_tex, pos, size, pt;
    const char *fn;
    if (!setup_src(sh, &srcfix, &src_tex, &pos, &size, &pt,
                   &ratio[PL_SEP_HORIZ], &ratio[PL_SEP_VERT],
                   &comp_mask, &scale, false, &fn, FASTEST))
        return false;

    // We can store a separate sampler object per dimension, so dispatch the
    // right one. This is needed for two reasons:
    // 1. Anamorphic content can have a different scaling ratio for each
    //    dimension. In particular, you could be upscaling in one and
    //    downscaling in the other.
    // 2. After fixing the source for `setup_src`, we lose information about
    //    the scaling ratio of the other component. (Although this is only a
    //    minor reason and could easily be changed with some boilerplate)
    struct sh_sampler_obj *obj;
    obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER,
                 struct sh_sampler_obj, sh_sampler_uninit);
    if (!obj)
        return false;

    if (pass != 0) {
        obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER,
                     struct sh_sampler_obj, sh_sampler_uninit);
        assert(obj);
    }

    // Widen the filter when downscaling along this axis
    float inv_scale = 1.0 / ratio[pass];
    inv_scale = PL_MAX(inv_scale, 1.0);

    if (params->no_widening)
        inv_scale = 1.0;

    int lut_entries = PL_DEF(params->lut_entries, 64);
    bool update = !filter_compat(obj->filter, inv_scale, lut_entries, 0.0,
                                 &params->filter);

    if (update) {
        pl_filter_free(&obj->filter);
        obj->filter = pl_filter_generate(sh->log, &(struct pl_filter_params) {
            .config = params->filter,
            .lut_entries = lut_entries,
            .filter_scale = inv_scale,
            .max_row_size = gpu->limits.max_tex_2d_dim / 4,
            .row_stride_align = 4,
        });

        if (!obj->filter) {
            // This should never happen, but just in case ..
            SH_FAIL(sh, "Failed initializing separated filter!");
            return false;
        }
    }

    int N = obj->filter->row_size; // number of samples to convolve
    int width = obj->filter->row_stride / 4; // width of the LUT texture
    ident_t lut = sh_lut(sh, &(struct sh_lut_params) {
        .object = &obj->lut,
        .type = PL_VAR_FLOAT,
        .width = width,
        .height = lut_entries,
        .comps = 4,
        .linear = true,
        .update = update,
        .fill = fill_ortho_lut,
        .priv = obj,
    });
    if (!lut) {
        SH_FAIL(sh, "Failed initializing separated LUT!");
        return false;
    }

    const int dir[PL_SEP_PASSES][2] = {
        [PL_SEP_HORIZ] = {1, 0},
        [PL_SEP_VERT] = {0, 1},
    };

    static const char *names[PL_SEP_PASSES] = {
        [PL_SEP_HORIZ] = "ortho scaling (horiz)",
        [PL_SEP_VERT] = "ortho scaling (vert)",
    };

    sh_describe(sh, names[pass]);
    GLSL("// pl_shader_sample_ortho \n"
         "vec4 color = vec4(0.0); \n"
         "{ \n"
         "vec2 pos = %s, size = %s, pt = %s; \n"
         "vec2 dir = vec2(%d.0, %d.0); \n"
         "pt *= dir; \n"
         "vec2 fcoord2 = fract(pos * size - vec2(0.5)); \n"
         "float fcoord = dot(fcoord2, dir); \n"
         "vec2 base = pos - fcoord * pt - pt * vec2(%d.0); \n"
         "float weight; \n"
         "vec4 ws, c; \n",
         pos, size, pt,
         dir[pass][0], dir[pass][1],
         N / 2 - 1);

    // Anti-ringing: clamp the result against the local min/max of the two
    // taps nearest the center, mixed in by the antiring strength
    bool use_ar = params->antiring > 0;
    if (use_ar) {
        GLSL("vec4 hi = vec4(0.0); \n"
             "vec4 lo = vec4(1e9); \n");
    }

    // Dispatch all of the samples
    GLSL("// scaler samples\n");
    for (int n = 0; n < N; n++) {
        // Load the right weight for this instance. For every 4th weight, we
        // need to fetch another LUT entry. Otherwise, just use the previous
        if (n % 4 == 0) {
            float denom = PL_MAX(1, width - 1); // avoid division by zero
            GLSL("ws = %s(vec2(%f, fcoord));\n", lut, (n / 4) / denom);
        }
        GLSL("weight = ws[%d];\n", n % 4);

        // Load the input texel and add it to the running sum
        GLSL("c = %s(%s, base + pt * vec2(%d.0)); \n",
             fn, src_tex, n);

        for (uint8_t comps = comp_mask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSL("color[%d] += weight * c[%d]; \n", c, c);
            comps &= ~(1 << c);

            if (use_ar && (n == N / 2 - 1 || n == N / 2)) {
                GLSL("lo[%d] = min(lo[%d], c[%d]); \n"
                     "hi[%d] = max(hi[%d], c[%d]); \n",
                     c, c, c, c, c, c);
            }
        }
    }

    if (use_ar) {
        GLSL("color = mix(color, clamp(color, lo, hi), %s);\n",
             sh_const_float(sh, "antiring", params->antiring));
    }

    GLSL("color *= vec4(%s);\n", SH_FLOAT(scale));
    if (!(comp_mask & (1 << PL_CHANNEL_A)))
        GLSL("color.a = 1.0; \n");

    GLSL("}\n");
    return true;
}
976