1 /*
2  * This file is part of libplacebo.
3  *
4  * libplacebo is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * libplacebo is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <math.h>
19 #include "shaders.h"
20 
// Default debanding parameters. pl_shader_deband() falls back to these when
// it is called with `params == NULL`.
const struct pl_deband_params pl_deband_default_params = {
    .iterations = 1,    // number of averaging passes emitted
    .threshold  = 4.0,  // divided by (1000 * source scale) before use
    .radius     = 16.0, // base sampling range; pass `i` uses `i * radius`
    .grain      = 6.0,  // noise strength, divided by 1000 before use
};
27 
src_params(const struct pl_sample_src * src)28 static inline struct pl_tex_params src_params(const struct pl_sample_src *src)
29 {
30     if (src->tex)
31         return src->tex->params;
32 
33     return (struct pl_tex_params) {
34         .w = src->tex_w,
35         .h = src->tex_h,
36     };
37 }
38 
// Sampling filter requested from setup_src(). The first two values alias the
// corresponding `pl_tex_sample_mode` constants, so setup_src() can cast them
// directly when comparing against the mode of an external sampler.
enum filter {
    NEAREST = PL_TEX_SAMPLE_NEAREST, // nearest sampling is required
    LINEAR  = PL_TEX_SAMPLE_LINEAR,  // linear sampling is required; fails if unavailable
    BEST,    // textures: linear if the format supports it, else nearest;
             // external samplers: whatever `src->mode` is
    FASTEST, // textures: nearest; external samplers: whatever `src->mode` is
};
45 
46 // Helper function to compute the src/dst sizes and upscaling ratios
setup_src(pl_shader sh,const struct pl_sample_src * src,ident_t * src_tex,ident_t * pos,ident_t * size,ident_t * pt,float * ratio_x,float * ratio_y,uint8_t * comp_mask,float * scale,bool resizeable,const char ** fn,enum filter filter)47 static bool setup_src(pl_shader sh, const struct pl_sample_src *src,
48                       ident_t *src_tex, ident_t *pos, ident_t *size, ident_t *pt,
49                       float *ratio_x, float *ratio_y, uint8_t *comp_mask,
50                       float *scale, bool resizeable, const char **fn,
51                       enum filter filter)
52 {
53     enum pl_shader_sig sig;
54     float src_w, src_h;
55     enum pl_tex_sample_mode sample_mode;
56     if (src->tex) {
57         pl_fmt fmt = src->tex->params.format;
58         bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR;
59         pl_assert(pl_tex_params_dimension(src->tex->params) == 2);
60         sig = PL_SHADER_SIG_NONE;
61         src_w = pl_rect_w(src->rect);
62         src_h = pl_rect_h(src->rect);
63         switch (filter) {
64         case FASTEST:
65         case NEAREST:
66             sample_mode = PL_TEX_SAMPLE_NEAREST;
67             break;
68         case LINEAR:
69             if (!can_linear) {
70                 SH_FAIL(sh, "Trying to use a shader that requires linear "
71                         "sampling with a texture whose format (%s) does not "
72                         "support PL_FMT_CAP_LINEAR", fmt->name);
73                 return false;
74             }
75             sample_mode = PL_TEX_SAMPLE_LINEAR;
76             break;
77         case BEST:
78             sample_mode = can_linear ? PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST;
79             break;
80         }
81     } else {
82         pl_assert(src->tex_w && src->tex_h);
83         sig = PL_SHADER_SIG_SAMPLER;
84         src_w = src->sampled_w;
85         src_h = src->sampled_h;
86         if (filter == BEST || filter == FASTEST) {
87             sample_mode = src->mode;
88         } else {
89             sample_mode = (enum pl_tex_sample_mode) filter;
90             if (sample_mode != src->mode) {
91                 SH_FAIL(sh, "Trying to use a shader that requires a different "
92                         "filter mode than the external sampler.");
93                 return false;
94             }
95         }
96     }
97 
98     src_w = PL_DEF(src_w, src_params(src).w);
99     src_h = PL_DEF(src_h, src_params(src).h);
100     pl_assert(src_w && src_h);
101 
102     int out_w = PL_DEF(src->new_w, roundf(fabs(src_w)));
103     int out_h = PL_DEF(src->new_h, roundf(fabs(src_h)));
104     pl_assert(out_w && out_h);
105 
106     if (ratio_x)
107         *ratio_x = out_w / fabs(src_w);
108     if (ratio_y)
109         *ratio_y = out_h / fabs(src_h);
110     if (scale)
111         *scale = PL_DEF(src->scale, 1.0);
112 
113     if (comp_mask) {
114         uint8_t tex_mask = 0x0Fu;
115         if (src->tex) {
116             // Mask containing only the number of components in the texture
117             tex_mask = (1 << src->tex->params.format->num_components) - 1;
118         }
119 
120         uint8_t src_mask = src->component_mask;
121         if (!src_mask)
122             src_mask = (1 << PL_DEF(src->components, 4)) - 1;
123 
124         // Only actually sample components that are both requested and
125         // available in the texture being sampled
126         *comp_mask = tex_mask & src_mask;
127     }
128 
129     if (resizeable)
130         out_w = out_h = 0;
131     if (!sh_require(sh, sig, out_w, out_h))
132         return false;
133 
134     if (src->tex) {
135         struct pl_rect2df rect = {
136             .x0 = src->rect.x0,
137             .y0 = src->rect.y0,
138             .x1 = src->rect.x0 + src_w,
139             .y1 = src->rect.y0 + src_h,
140         };
141 
142         if (fn)
143             *fn = sh_tex_fn(sh, src->tex->params);
144 
145         *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode,
146                            "src_tex", &rect, pos, size, pt);
147     } else {
148         if (size) {
149             *size = sh_var(sh, (struct pl_shader_var) {
150                 .var = pl_var_vec2("tex_size"),
151                 .data = &(float[2]) { src->tex_w, src->tex_h },
152             });
153         }
154 
155         if (pt) {
156             float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h;
157             if (src->sampler == PL_SAMPLER_RECT)
158                 sx = sy = 1.0;
159 
160             *pt = sh_var(sh, (struct pl_shader_var) {
161                 .var = pl_var_vec2("tex_pt"),
162                 .data = &(float[2]) { sx, sy },
163             });
164         }
165 
166         if (fn)
167             *fn = sh_tex_fn(sh, (struct pl_tex_params) { .w = 1, .h = 1 }); // 2D
168 
169         sh->sampler_type = src->sampler;
170 
171         pl_assert(src->format);
172         switch (src->format) {
173         case PL_FMT_UNKNOWN:
174         case PL_FMT_FLOAT:
175         case PL_FMT_UNORM:
176         case PL_FMT_SNORM: sh->sampler_prefix = ' '; break;
177         case PL_FMT_UINT: sh->sampler_prefix = 'u'; break;
178         case PL_FMT_SINT: sh->sampler_prefix = 's'; break;
179         case PL_FMT_TYPE_COUNT:
180             pl_unreachable();
181         }
182 
183         *src_tex = "src_tex";
184         *pos = "tex_coord";
185     }
186 
187     return true;
188 }
189 
// Emits a debanding pass for `src` into `sh`, leaving the result in the GLSL
// variable `color`. Requires a linearly sampled source; on setup failure
// nothing is emitted. `params` may be NULL, in which case
// `pl_deband_default_params` is used.
void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
                      const struct pl_deband_params *params)
{
    const char *fn;
    ident_t src_tex, coord, texel;
    float mul;
    bool ok = setup_src(sh, src, &src_tex, &coord, NULL, &texel, NULL, NULL,
                        NULL, &mul, true, &fn, LINEAR);
    if (!ok)
        return;

    if (!params)
        params = &pl_deband_default_params;

    sh_describe(sh, "debanding");
    GLSL("vec4 color;           \n"
         "// pl_shader_deband   \n"
         "{                     \n");

    ident_t rng_state;
    ident_t rng = sh_prng(sh, true, &rng_state);

    GLSL("vec2 pos = %s;       \n"
         "vec4 avg, diff;      \n"
         "color = %s(%s, pos); \n",
         coord, fn, src_tex);

    if (params->iterations > 0) {
        // GLSL helper: stochastic estimate of the average color around a
        // pixel. It draws a random distance (up to `range`) and a random
        // angle, then averages the four samples obtained by mirroring that
        // offset across both axes.
        ident_t avg_fn = sh_fresh(sh, "average");
        GLSLH("vec4 %s(vec2 pos, float range, inout float %s) {     \n"
              // Compute a random angle and distance
              "    float dist = %s * range;                         \n"
              "    float dir  = %s * %f;                            \n"
              "    vec2 o = dist * vec2(cos(dir), sin(dir));        \n"
              // Sample at quarter-turn intervals around the source pixel
              "    vec4 sum = vec4(0.0);                            \n"
              "    sum += %s(%s, pos + %s * vec2( o.x,  o.y)); \n"
              "    sum += %s(%s, pos + %s * vec2(-o.x,  o.y)); \n"
              "    sum += %s(%s, pos + %s * vec2(-o.x, -o.y)); \n"
              "    sum += %s(%s, pos + %s * vec2( o.x, -o.y)); \n"
              // Return the (normalized) average
              "    return 0.25 * sum;                               \n"
              "}\n",
              avg_fn, rng_state, rng, rng, M_PI * 2,
              fn, src_tex, texel,
              fn, src_tex, texel,
              fn, src_tex, texel,
              fn, src_tex, texel);

        ident_t radius_c = sh_const_float(sh, "radius", params->radius);
        ident_t thresh_c = sh_const_float(sh, "threshold",
                                          params->threshold / (1000 * mul));

        // Every pass widens the range and tightens the threshold by the pass
        // number; the average replaces the color wherever the two differ by
        // no more than that threshold.
        for (int pass = 1; pass <= params->iterations; pass++) {
            GLSL("avg = %s(pos, %d.0 * %s, %s);                                     \n"
                 "diff = abs(color - avg);                                          \n"
                 "color = mix(avg, color, %s(greaterThan(diff, vec4(%s / %d.0))));  \n",
                 avg_fn, pass, radius_c, rng_state, sh_bvec(sh, 4), thresh_c, pass);
        }
    }

    GLSL("color *= vec4(%s);\n", SH_FLOAT(mul));

    // Optional grain: zero-centered noise hides whatever banding is left
    if (params->grain > 0) {
        GLSL("vec3 noise = vec3(%s, %s, %s);         \n"
             "color.rgb += %s * (noise - vec3(0.5)); \n",
             rng, rng, rng, SH_FLOAT(params->grain / 1000.0));
    }

    GLSL("}\n");
}
260 
// Samples `src` with a single texture fetch, using the BEST filter mode as
// resolved by setup_src(). The scaled result is left in the GLSL variable
// `color`. Returns false if the source could not be set up.
bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src)
{
    const char *sample_fn;
    ident_t sampler, coord;
    float mul;

    bool ok = setup_src(sh, src, &sampler, &coord, NULL, NULL, NULL, NULL,
                        NULL, &mul, true, &sample_fn, BEST);
    if (!ok)
        return false;

    GLSL("// pl_shader_sample_direct          \n"
         "vec4 color = vec4(%s) * %s(%s, %s); \n",
         SH_FLOAT(mul), sample_fn, sampler, coord);

    return true;
}
275 
// Samples `src` with a single texture fetch, requiring the NEAREST filter
// mode. The scaled result is left in the GLSL variable `color`. Returns false
// if the source could not be set up.
bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src)
{
    const char *sample_fn;
    ident_t sampler, coord;
    float mul;

    bool ok = setup_src(sh, src, &sampler, &coord, NULL, NULL, NULL, NULL,
                        NULL, &mul, true, &sample_fn, NEAREST);
    if (!ok)
        return false;

    sh_describe(sh, "nearest");
    GLSL("// pl_shader_sample_nearest         \n"
         "vec4 color = vec4(%s) * %s(%s, %s); \n",
         SH_FLOAT(mul), sample_fn, sampler, coord);

    return true;
}
291 
// Samples `src` with a single texture fetch, requiring the LINEAR filter
// mode. The scaled result is left in the GLSL variable `color`. Returns false
// if the source could not be set up (e.g. linear sampling is unavailable).
bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src)
{
    const char *sample_fn;
    ident_t sampler, coord;
    float mul;

    bool ok = setup_src(sh, src, &sampler, &coord, NULL, NULL, NULL, NULL,
                        NULL, &mul, true, &sample_fn, LINEAR);
    if (!ok)
        return false;

    sh_describe(sh, "bilinear");
    GLSL("// pl_shader_sample_bilinear        \n"
         "vec4 color = vec4(%s) * %s(%s, %s); \n",
         SH_FLOAT(mul), sample_fn, sampler, coord);

    return true;
}
307 
// Emits GLSL that declares a vec4 named `t` and fills it with the bicubic
// weights/offsets derived from the fractional coordinate expression `s`.
//
// Explanation of how bicubic scaling with only 4 texel fetches is done:
//   http://www.mate.tue.nl/mate/pdfs/10318.pdf
//   'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
static void bicubic_calcweights(pl_shader sh, const char *t, const char *s)
{
    const char *parm = t; // name of the vec4 being declared
    const char *frac = s; // GLSL expression of the fractional coordinate

    GLSL("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s \n"
         "          + vec4(1, 0, -0.5, 0.5);                 \n"
         "%s = %s * %s + vec4(0.0, 0.0, -0.5, 0.5);          \n"
         "%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);   \n"
         "%s.xy /= %s.zw;                                    \n"
         "%s.xy += vec2(1.0 + %s, 1.0 - %s);                 \n",
         parm, frac,            // declaration
         parm, parm, frac,      // first multiply-add
         parm, parm, frac,      // second multiply-add
         parm, parm,            // normalization
         parm, frac, frac);     // final offsets
}
325 
// Emits fast bicubic sampling of `src`, built from four linearly filtered
// texture fetches. The scaled result is left in the GLSL variable `color`.
// Returns false if the source could not be set up; downscaling only
// produces a trace message.
bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
{
    const char *fn;
    ident_t sampler, coord, tex_size, texel;
    float ratio_x, ratio_y, mul;

    bool ok = setup_src(sh, src, &sampler, &coord, &tex_size, &texel,
                        &ratio_x, &ratio_y, NULL, &mul, true, &fn, LINEAR);
    if (!ok)
        return false;

    bool downscaling = ratio_x < 1 || ratio_y < 1;
    if (downscaling) {
        PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
                 "will most likely result in nasty aliasing!");
    }

    sh_describe(sh, "bicubic");
    GLSL("// pl_shader_sample_bicubic                   \n"
         "vec4 color;                                   \n"
         "{                                             \n"
         "vec2 pos  = %s;                               \n"
         "vec2 pt   = %s;                               \n"
         "vec2 size = %s;                               \n"
         "vec2 fcoord = fract(pos * size + vec2(0.5));  \n",
         coord, texel, tex_size);

    // One weight vector per axis
    bicubic_calcweights(sh, "parmx", "fcoord.x");
    bicubic_calcweights(sh, "parmy", "fcoord.y");

    GLSL("vec4 cdelta;                              \n"
         "cdelta.xz = parmx.rg * vec2(-pt.x, pt.x); \n"
         "cdelta.yw = parmy.rg * vec2(-pt.y, pt.y); \n"
         // first y-interpolation
         "vec4 ar = %s(%s, pos + cdelta.xy);        \n"
         "vec4 ag = %s(%s, pos + cdelta.xw);        \n"
         "vec4 ab = mix(ag, ar, parmy.b);           \n"
         // second y-interpolation
         "vec4 br = %s(%s, pos + cdelta.zy);        \n"
         "vec4 bg = %s(%s, pos + cdelta.zw);        \n"
         "vec4 aa = mix(bg, br, parmy.b);           \n"
         // x-interpolation
         "color = vec4(%s) * mix(aa, ab, parmx.b);  \n"
         "}                                         \n",
         fn, sampler,
         fn, sampler,
         fn, sampler,
         fn, sampler,
         SH_FLOAT(mul));

    return true;
}
371 
// Emits "oversample" scaling of `src`: the sampling position is snapped
// towards texel centers and a single linearly filtered fetch performs the
// blend between neighbouring texels. The scaled result is left in the GLSL
// variable `color`.
//
// A positive `threshold` (capped at 1.0) remaps the blend coefficient as
// `(coeff - threshold) / (1.0 - 2.0 * threshold)` before it is clamped.
// NOTE(review): a threshold of exactly 0.5 makes that denominator zero and
// larger values make it negative - confirm this is intended.
//
// Returns false if the source could not be set up.
bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
                                 float threshold)
{
    const char *fn;
    ident_t sampler, coord, tex_size, texel;
    float ratio_x, ratio_y, mul;

    bool ok = setup_src(sh, src, &sampler, &coord, &tex_size, &texel,
                        &ratio_x, &ratio_y, NULL, &mul, true, &fn, LINEAR);
    if (!ok)
        return false;

    ident_t ratio_v = sh_var(sh, (struct pl_shader_var) {
        .var = pl_var_vec2("ratio"),
        .data = &(float[2]) { ratio_x, ratio_y },
    });

    // Round the position to the nearest pixel
    sh_describe(sh, "oversample");
    GLSL("// pl_shader_sample_oversample                \n"
         "vec4 color;                                   \n"
         "{                                             \n"
         "vec2 pt = %s;                                 \n"
         "vec2 pos = %s - vec2(0.5) * pt;               \n"
         "vec2 fcoord = fract(pos * %s - vec2(0.5));    \n"
         "vec2 coeff = fcoord * %s;                     \n",
         texel, coord, tex_size, ratio_v);

    if (threshold > 0.0) {
        float capped = PL_MIN(threshold, 1.0);
        ident_t thresh_c = sh_const_float(sh, "threshold", capped);
        GLSL("coeff = (coeff - %s) / (1.0 - 2.0 * %s);  \n",
             thresh_c, thresh_c);
    }

    // Compute the right output blend of colors
    GLSL("coeff = clamp(coeff, 0.0, 1.0);               \n"
         "pos += (coeff - fcoord) * pt;                 \n"
         "color = vec4(%s) * %s(%s, pos);               \n"
         "}                                             \n",
         SH_FLOAT(mul), fn, sampler);

    return true;
}
414 
// Checks whether an already generated `filter` can be reused for the given
// request: it must exist, have the same LUT size and cutoff, a filter scale
// within 1e-3 of `inv_scale`, and an equal filter configuration.
static bool filter_compat(pl_filter filter, float inv_scale,
                          int lut_entries, float cutoff,
                          const struct pl_filter_config *params)
{
    if (!filter)
        return false;

    const bool mismatch =
        filter->params.lut_entries != lut_entries ||
        fabs(filter->params.filter_scale - inv_scale) > 1e-3 ||
        filter->params.cutoff != cutoff;

    if (mismatch)
        return false;

    return pl_filter_config_eq(&filter->params.config, params);
}
430 
431 // Subroutine for computing and adding an individual texel contribution
432 // If `in` is NULL, samples directly
433 // If `in` is set, takes the pixel from inX[idx] where X is the component,
434 // `in` is the given identifier, and `idx` must be defined by the caller
polar_sample(pl_shader sh,pl_filter filter,const char * fn,ident_t tex,ident_t lut,ident_t cutoff,ident_t radius,int x,int y,uint8_t comp_mask,ident_t in)435 static void polar_sample(pl_shader sh, pl_filter filter, const char *fn,
436                          ident_t tex, ident_t lut, ident_t cutoff, ident_t radius,
437                          int x, int y, uint8_t comp_mask, ident_t in)
438 {
439     // Since we can't know the subpixel position in advance, assume a
440     // worst case scenario
441     int yy = y > 0 ? y-1 : y;
442     int xx = x > 0 ? x-1 : x;
443     float dmax = sqrt(xx*xx + yy*yy);
444     // Skip samples definitely outside the radius
445     if (dmax >= filter->radius_cutoff)
446         return;
447 
448     GLSL("d = length(vec2(%d.0, %d.0) - fcoord);\n", x, y);
449     // Check for samples that might be skippable
450     bool maybe_skippable = dmax >= filter->radius_cutoff - M_SQRT2;
451     if (maybe_skippable)
452         GLSL("if (d < %s) {\n", cutoff);
453 
454     // Get the weight for this pixel
455     GLSL("w = %s(d * 1.0/%s); \n"
456          "wsum += w;          \n",
457          lut, radius);
458 
459     if (in) {
460         for (uint8_t comps = comp_mask; comps;) {
461             uint8_t c = __builtin_ctz(comps);
462             GLSL("color[%d] += w * %s%d[idx]; \n", c, in, c);
463             comps &= ~(1 << c);
464         }
465     } else {
466         GLSL("c = %s(%s, base + pt * vec2(%d.0, %d.0)); \n",
467              fn, tex, x, y);
468         for (uint8_t comps = comp_mask; comps;) {
469             uint8_t c = __builtin_ctz(comps);
470             GLSL("color[%d] += w * c[%d]; \n", c, c);
471             comps &= ~(1 << c);
472         }
473     }
474 
475     if (maybe_skippable)
476         GLSL("}\n");
477 }
478 
// State kept in a PL_SHADER_OBJ_SAMPLER shader object; released by
// sh_sampler_uninit().
struct sh_sampler_obj {
    pl_filter filter;    // generated filter, reused while filter_compat() holds
    pl_shader_obj lut;   // LUT object holding the filter weights
    pl_shader_obj pass2; // for pl_shader_sample_ortho
};
484 
// Destructor for `struct sh_sampler_obj`: destroys both nested shader
// objects, frees the filter and zeroes the struct. `gpu` is not used.
static void sh_sampler_uninit(pl_gpu gpu, void *ptr)
{
    struct sh_sampler_obj *sampler = ptr;

    pl_shader_obj_destroy(&sampler->lut);
    pl_shader_obj_destroy(&sampler->pass2);
    pl_filter_free(&sampler->filter);

    // Clear every field so that no stale pointers are left behind
    *sampler = (struct sh_sampler_obj) {0};
}
493 
fill_polar_lut(void * data,const struct sh_lut_params * params)494 static void fill_polar_lut(void *data, const struct sh_lut_params *params)
495 {
496     const struct sh_sampler_obj *obj = params->priv;
497     pl_filter filt = obj->filter;
498 
499     pl_assert(params->width == filt->params.lut_entries && params->comps == 1);
500     memcpy(data, filt->weights, params->width * sizeof(float));
501 }
502 
pl_shader_sample_polar(pl_shader sh,const struct pl_sample_src * src,const struct pl_sample_filter_params * params)503 bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
504                             const struct pl_sample_filter_params *params)
505 {
506     pl_assert(params);
507     if (!params->filter.polar) {
508         SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?");
509         return false;
510     }
511 
512     bool has_compute = sh_glsl(sh).compute && !params->no_compute;
513     has_compute &= sh_glsl(sh).version >= 130; // needed for round()
514     if (!src->tex && has_compute) {
515         // FIXME: Could maybe solve this by communicating the wbase from
516         // invocation 0 to the rest of the workgroup using shmem, which would
517         // also allow us to avoid the use of the hacky %s_map below.
518         PL_WARN(sh, "Combining pl_shader_sample_polar with the sampler2D "
519                 "interface prevents the use of compute shaders, which is a "
520                 "potentially massive performance hit. If you're sure you want "
521                 "this, set `params.no_compute` to suppress this warning.");
522         has_compute = false;
523     }
524 
525     bool flipped = src->rect.x0 > src->rect.x1 || src->rect.y0 > src->rect.y1;
526     if (flipped && has_compute) {
527         // FIXME: I'm sure this case could actually be supported with some
528         // extra math in the positional calculations, should implement it
529         PL_WARN(sh, "Trying to use a flipped src.rect with polar sampling! "
530                 "This prevents the use of compute shaders, which is a "
531                 "potentially massive performance hit. If you're really sure you "
532                 "want this, set `params.no_compute` to suppress this warning.");
533         has_compute = false;
534     }
535 
536     uint8_t comp_mask;
537     float rx, ry, scale;
538     ident_t src_tex, pos, size, pt;
539     const char *fn;
540     if (!setup_src(sh, src, &src_tex, &pos, &size, &pt, &rx, &ry, &comp_mask,
541                    &scale, false, &fn, FASTEST))
542         return false;
543 
544     struct sh_sampler_obj *obj;
545     obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj,
546                  sh_sampler_uninit);
547     if (!obj)
548         return false;
549 
550     float inv_scale = 1.0 / PL_MIN(rx, ry);
551     inv_scale = PL_MAX(inv_scale, 1.0);
552 
553     if (params->no_widening)
554         inv_scale = 1.0;
555 
556     int lut_entries = PL_DEF(params->lut_entries, 64);
557     float cutoff = PL_DEF(params->cutoff, 0.001);
558     bool update = !filter_compat(obj->filter, inv_scale, lut_entries, cutoff,
559                                  &params->filter);
560 
561     if (update) {
562         pl_filter_free(&obj->filter);
563         obj->filter = pl_filter_generate(sh->log, &(struct pl_filter_params) {
564             .config         = params->filter,
565             .lut_entries    = lut_entries,
566             .filter_scale   = inv_scale,
567             .cutoff         = cutoff,
568         });
569 
570         if (!obj->filter) {
571             // This should never happen, but just in case ..
572             SH_FAIL(sh, "Failed initializing polar filter!");
573             return false;
574         }
575     }
576 
577     sh_describe(sh, "polar scaling");
578     GLSL("// pl_shader_sample_polar                     \n"
579          "vec4 color = vec4(0.0);                       \n"
580          "{                                             \n"
581          "vec2 pos = %s, size = %s, pt = %s;            \n"
582          "vec2 fcoord = fract(pos * size - vec2(0.5));  \n"
583          "vec2 base = pos - pt * fcoord;                \n"
584          "vec2 center = base + pt * vec2(0.5);          \n"
585          "float w, d, wsum = 0.0;                       \n"
586          "int idx;                                      \n"
587          "vec4 c;                                       \n",
588          pos, size, pt);
589 
590     int bound   = ceil(obj->filter->radius_cutoff);
591     int offset  = bound - 1; // padding top/left
592     int padding = offset + bound; // total padding
593 
594     // Determined experimentally on modern AMD and Nvidia hardware. 32 is a
595     // good tradeoff for the horizontal work group size. Apart from that,
596     // just use as many threads as possible.
597     const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw;
598 
599     // We need to sample everything from base_min to base_max, so make sure
600     // we have enough room in shmem
601     int iw = (int) ceil(bw / rx) + padding + 1,
602         ih = (int) ceil(bh / ry) + padding + 1;
603 
604     ident_t in = NULL;
605     int num_comps = __builtin_popcount(comp_mask);
606     int shmem_req = iw * ih * num_comps * sizeof(float);
607     bool is_compute = has_compute && sh_try_compute(sh, bw, bh, false, shmem_req);
608 
609     // For compute shaders, which read the input texels primarily from shmem,
610     // using a texture-based LUT is better. For the fragment shader fallback
611     // code, which is primarily texture bound, the extra cost of LUT
612     // interpolation is worth the reduction in texel fetches.
613     ident_t lut = sh_lut(sh, &(struct sh_lut_params) {
614         .object = &obj->lut,
615         .method = is_compute ? SH_LUT_TEXTURE : SH_LUT_AUTO,
616         .type = PL_VAR_FLOAT,
617         .width = lut_entries,
618         .comps = 1,
619         .linear = true,
620         .update = update,
621         .fill = fill_polar_lut,
622         .priv = obj,
623     });
624 
625     if (!lut) {
626         SH_FAIL(sh, "Failed initializing polar LUT!");
627         return false;
628     }
629 
630     ident_t cutoff_c = sh_const_float(sh, "radius_cutoff", obj->filter->radius_cutoff);
631     ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius);
632 
633     if (is_compute) {
634         // Compute shader kernel
635         GLSL("vec2 wpos = %s_map(gl_WorkGroupID * gl_WorkGroupSize);        \n"
636              "vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));      \n"
637              "ivec2 rel = ivec2(round((base - wbase) * size));              \n",
638              pos);
639 
640         ident_t iw_c = sh_const(sh, (struct pl_shader_const) {
641             .type = PL_VAR_SINT,
642             .compile_time = true,
643             .name ="iw",
644             .data = &iw,
645         });
646 
647         ident_t ih_c = sh_const(sh, (struct pl_shader_const) {
648             .type = PL_VAR_SINT,
649             .compile_time = true,
650             .name = "ih",
651             .data = &ih,
652         });
653 
654         // Load all relevant texels into shmem
655         GLSL("for (int y = int(gl_LocalInvocationID.y); y < %s; y += %d) {  \n"
656              "for (int x = int(gl_LocalInvocationID.x); x < %s; x += %d) {  \n"
657              "c = %s(%s, wbase + pt * vec2(x - %d, y - %d));                \n",
658              ih_c, bh, iw_c, bw, fn, src_tex, offset, offset);
659 
660         in = sh_fresh(sh, "in");
661         for (uint8_t comps = comp_mask; comps;) {
662             uint8_t c = __builtin_ctz(comps);
663             GLSLH("shared float %s%d[%s * %s]; \n", in, c, ih_c, iw_c);
664             GLSL("%s%d[%s * y + x] = c[%d]; \n", in, c, iw_c, c);
665             comps &= ~(1 << c);
666         }
667 
668         GLSL("}}                     \n"
669              "barrier();             \n");
670 
671         // Dispatch the actual samples
672         for (int y = 1 - bound; y <= bound; y++) {
673             for (int x = 1 - bound; x <= bound; x++) {
674                 GLSL("idx = %s * rel.y + rel.x + %s * %d + %d; \n",
675                      iw_c, iw_c, y + offset, x + offset);
676                 polar_sample(sh, obj->filter, fn, src_tex, lut, cutoff_c, radius_c,
677                              x, y, comp_mask, in);
678             }
679         }
680     } else {
681         // Fragment shader sampling
682         for (uint8_t comps = comp_mask; comps;) {
683             uint8_t c = __builtin_ctz(comps);
684             GLSL("vec4 in%d;\n", c);
685             comps &= ~(1 << c);
686         }
687 
688         // For maximum efficiency, we want to use textureGather() if
689         // possible, rather than direct sampling. Since this is not
690         // always possible/sensible, we need to possibly intermix gathering
691         // with regular sampling. This requires keeping track of which
692         // pixels in the next row were already gathered by the previous
693         // row.
694         uint32_t gathered_cur = 0x0, gathered_next = 0x0;
695         const float radius2 = PL_SQUARE(obj->filter->radius_cutoff);
696         const int base = bound - 1;
697 
698         if (base + bound >= 8 * sizeof(gathered_cur)) {
699             SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!",
700                     obj->filter->radius_cutoff);
701             return false;
702         }
703 
704         for (int y = 1 - bound; y <= bound; y++) {
705             for (int x = 1 - bound; x <= bound; x++) {
706                 // Skip already gathered texels
707                 uint32_t bit = 1llu << (base + x);
708                 if (gathered_cur & bit)
709                     continue;
710 
711                 // Using texture gathering is only more efficient than direct
712                 // sampling in the case where we expect to be able to use all
713                 // four gathered texels, without having to discard any. So
714                 // only do it if we suspect it will be a win rather than a
715                 // loss.
716                 int xx = x*x, xx1 = (x+1)*(x+1);
717                 int yy = y*y, yy1 = (y+1)*(y+1);
718                 bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2;
719                 use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset;
720                 use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset;
721                 use_gather &= !src->tex || src->tex->params.format->gatherable;
722 
723                 // Gathering from components other than the R channel requires
724                 // support for GLSL 400, which introduces the overload of
725                 // textureGather* that allows specifying the component.
726                 //
727                 // This is also the minimum requirement if we don't know the
728                 // texture format capabilities, for the sampler2D interface
729                 if (comp_mask != 0x1 || !src->tex)
730                     use_gather &= sh_glsl(sh).version >= 400;
731 
732                 if (!use_gather) {
733                     // Switch to direct sampling instead
734                     polar_sample(sh, obj->filter, fn, src_tex, lut, cutoff_c,
735                                  radius_c, x, y, comp_mask, NULL);
736                     continue;
737                 }
738 
739                 // Gather the four surrounding texels simultaneously
740                 for (uint8_t comps = comp_mask; comps;) {
741                     uint8_t c = __builtin_ctz(comps);
742                     if (x || y) {
743                         if (c) {
744                             GLSL("in%d = textureGatherOffset(%s, center, "
745                                  "ivec2(%d, %d), %d);\n",
746                                  c, src_tex, x, y, c);
747                         } else {
748                             GLSL("in0 = textureGatherOffset(%s, center, "
749                                  "ivec2(%d, %d));\n", src_tex, x, y);
750                         }
751                     } else {
752                         if (c) {
753                             GLSL("in%d = textureGather(%s, center, %d);\n",
754                                  c, src_tex, c);
755                         } else {
756                             GLSL("in0 = textureGather(%s, center);\n", src_tex);
757                         }
758                     }
759                     comps &= ~(1 << c);
760                 }
761 
762                 // Mix in all of the points with their weights
763                 for (int p = 0; p < 4; p++) {
764                     // The four texels are gathered counterclockwise starting
765                     // from the bottom left
766                     static const int xo[4] = {0, 1, 1, 0};
767                     static const int yo[4] = {1, 1, 0, 0};
768                     if (x+xo[p] > bound || y+yo[p] > bound)
769                         continue; // next subpixel
770 
771                     GLSL("idx = %d;\n", p);
772                     polar_sample(sh, obj->filter, fn, src_tex, lut, cutoff_c,
773                                  radius_c, x+xo[p], y+yo[p], comp_mask, "in");
774                 }
775 
776                 // Mark the other next row's pixels as already gathered
777                 gathered_next |= bit | (bit << 1);
778                 x++; // skip adjacent pixel
779             }
780 
781             // Prepare for new row
782             gathered_cur = gathered_next;
783             gathered_next = 0;
784         }
785     }
786 
787     GLSL("color = vec4(%s / wsum) * color; \n", SH_FLOAT(scale));
788     if (!(comp_mask & (1 << PL_CHANNEL_A)))
789         GLSL("color.a = 1.0; \n");
790 
791     GLSL("}\n");
792     return true;
793 }
794 
fill_ortho_lut(void * data,const struct sh_lut_params * params)795 static void fill_ortho_lut(void *data, const struct sh_lut_params *params)
796 {
797     const struct sh_sampler_obj *obj = params->priv;
798     pl_filter filt = obj->filter;
799     size_t entries = filt->params.lut_entries * filt->row_stride;
800 
801     pl_assert(params->width * params->height * params->comps == entries);
802     memcpy(data, filt->weights, entries * sizeof(float));
803 }
804 
// Emits the GLSL for a single pass of separated (orthogonal) filter sampling.
//
// sh:     shader to append the sampling code to
// pass:   direction of this pass, PL_SEP_HORIZ or PL_SEP_VERT
// src:    description of the source being sampled
// params: filter configuration; `params->lut` holds the sampler object used
//         to cache the generated filter and its weights LUT between calls
//
// Returns true once the shader code has been emitted, false on any failure
// (polar filter requested, invalid `pass`, or a failed setup/allocation).
bool pl_shader_sample_ortho(pl_shader sh, int pass,
                            const struct pl_sample_src *src,
                            const struct pl_sample_filter_params *params)
{
    pl_assert(params);
    if (params->filter.polar) {
        SH_FAIL(sh, "Trying to use separated sampling with a polar filter?");
        return false;
    }

    // `pass` is used below to index several tables of size PL_SEP_PASSES, so
    // reject anything out of range instead of reading out of bounds
    if (pass < 0 || pass >= PL_SEP_PASSES) {
        SH_FAIL(sh, "Invalid pass for separated sampling!");
        return false;
    }

    pl_gpu gpu = SH_GPU(sh);
    pl_assert(gpu);

    // Only scale along the direction of this pass: force the other dimension
    // to cover the full source size, unscaled
    struct pl_sample_src srcfix = *src;
    switch (pass) {
    case PL_SEP_VERT:
        srcfix.rect.x0 = 0;
        srcfix.rect.x1 = srcfix.new_w = src_params(src).w;
        break;
    case PL_SEP_HORIZ:
        srcfix.rect.y0 = 0;
        srcfix.rect.y1 = srcfix.new_h = src_params(src).h;
        break;
    default:
        break;
    }

    uint8_t comp_mask;
    float ratio[PL_SEP_PASSES], scale;
    ident_t src_tex, pos, size, pt;
    const char *fn;
    if (!setup_src(sh, &srcfix, &src_tex, &pos, &size, &pt,
                   &ratio[PL_SEP_HORIZ], &ratio[PL_SEP_VERT],
                   &comp_mask, &scale, false, &fn, FASTEST))
        return false;

    // We can store a separate sampler object per dimension, so dispatch the
    // right one. This is needed for two reasons:
    // 1. Anamorphic content can have a different scaling ratio for each
    //    dimension. In particular, you could be upscaling in one and
    //    downscaling in the other.
    // 2. After fixing the source for `setup_src`, we lose information about
    //    the scaling ratio of the other component. (Although this is only a
    //    minor reason and could easily be changed with some boilerplate)
    struct sh_sampler_obj *obj;
    obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER,
                 struct sh_sampler_obj, sh_sampler_uninit);
    if (!obj)
        return false;

    if (pass != 0) {
        // Non-zero passes use the sampler object nested inside the first one.
        // Fail gracefully rather than relying on assert(), which is compiled
        // out under NDEBUG and would leave a NULL dereference below.
        obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER,
                     struct sh_sampler_obj, sh_sampler_uninit);
        if (!obj)
            return false;
    }

    // The filter scale is the inverse of the scaling ratio, but never below
    // 1.0; `no_widening` pins it to 1.0 unconditionally
    float inv_scale = 1.0 / ratio[pass];
    inv_scale = PL_MAX(inv_scale, 1.0);

    if (params->no_widening)
        inv_scale = 1.0;

    // Only regenerate the filter if the cached one is not compatible
    int lut_entries = PL_DEF(params->lut_entries, 64);
    bool update = !filter_compat(obj->filter, inv_scale, lut_entries, 0.0,
                                 &params->filter);

    if (update) {
        pl_filter_free(&obj->filter);
        obj->filter = pl_filter_generate(sh->log, &(struct pl_filter_params) {
            .config             = params->filter,
            .lut_entries        = lut_entries,
            .filter_scale       = inv_scale,
            .max_row_size       = gpu->limits.max_tex_2d_dim / 4,
            .row_stride_align   = 4,
        });

        if (!obj->filter) {
            // This should never happen, but just in case ..
            SH_FAIL(sh, "Failed initializing separated filter!");
            return false;
        }
    }

    // Weights are packed four to a LUT texel (one per component), hence the
    // division of the row stride by 4
    int N = obj->filter->row_size; // number of samples to convolve
    int width = obj->filter->row_stride / 4; // width of the LUT texture
    ident_t lut = sh_lut(sh, &(struct sh_lut_params) {
        .object = &obj->lut,
        .type = PL_VAR_FLOAT,
        .width = width,
        .height = lut_entries,
        .comps = 4,
        .linear = true,
        .update = update,
        .fill = fill_ortho_lut,
        .priv = obj,
    });
    if (!lut) {
        SH_FAIL(sh, "Failed initializing separated LUT!");
        return false;
    }

    // Unit step along the axis being filtered in this pass
    static const int dir[PL_SEP_PASSES][2] = {
        [PL_SEP_HORIZ] = {1, 0},
        [PL_SEP_VERT]  = {0, 1},
    };

    static const char *names[PL_SEP_PASSES] = {
        [PL_SEP_HORIZ] = "ortho scaling (horiz)",
        [PL_SEP_VERT]  = "ortho scaling (vert)",
    };

    // `base` is the position of the first tap: the sample position snapped
    // back by the fractional offset, then moved N/2 - 1 texels against `dir`
    sh_describe(sh, names[pass]);
    GLSL("// pl_shader_sample_ortho                        \n"
         "vec4 color = vec4(0.0);                          \n"
         "{                                                \n"
         "vec2 pos = %s, size = %s, pt = %s;               \n"
         "vec2 dir = vec2(%d.0, %d.0);                     \n"
         "pt *= dir;                                       \n"
         "vec2 fcoord2 = fract(pos * size - vec2(0.5));    \n"
         "float fcoord = dot(fcoord2, dir);                \n"
         "vec2 base = pos - fcoord * pt - pt * vec2(%d.0); \n"
         "float weight;                                    \n"
         "vec4 ws, c;                                      \n",
         pos, size, pt,
         dir[pass][0], dir[pass][1],
         N / 2 - 1);

    // Anti-ringing clamps the result towards the range spanned by the two
    // central taps, so track their per-component minimum and maximum
    bool use_ar = params->antiring > 0;
    if (use_ar) {
        GLSL("vec4 hi = vec4(0.0); \n"
             "vec4 lo = vec4(1e9); \n");
    }

    // Dispatch all of the samples
    GLSL("// scaler samples\n");
    for (int n = 0; n < N; n++) {
        // Load the right weight for this instance. For every 4th weight, we
        // need to fetch another LUT entry. Otherwise, just use the previous
        if (n % 4 == 0) {
            float denom = PL_MAX(1, width - 1); // avoid division by zero
            GLSL("ws = %s(vec2(%f, fcoord));\n", lut, (n / 4) / denom);
        }
        GLSL("weight = ws[%d];\n", n % 4);

        // Load the input texel and add it to the running sum
        GLSL("c = %s(%s, base + pt * vec2(%d.0)); \n",
             fn, src_tex, n);

        // Iterate over the set bits of the component mask, lowest first
        for (uint8_t comps = comp_mask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSL("color[%d] += weight * c[%d]; \n", c, c);
            comps &= ~(1 << c);

            // Only the two central taps contribute to the clamping range
            if (use_ar && (n == N / 2 - 1 || n == N / 2)) {
                GLSL("lo[%d] = min(lo[%d], c[%d]); \n"
                     "hi[%d] = max(hi[%d], c[%d]); \n",
                     c, c, c, c, c, c);
            }
        }
    }

    if (use_ar) {
        // Blend between the raw and the clamped result by the antiring
        // strength
        GLSL("color = mix(color, clamp(color, lo, hi), %s);\n",
             sh_const_float(sh, "antiring", params->antiring));
    }

    GLSL("color *= vec4(%s);\n", SH_FLOAT(scale));

    // Without an alpha component in the mask, output opaque alpha
    if (!(comp_mask & (1 << PL_CHANNEL_A)))
        GLSL("color.a = 1.0; \n");

    GLSL("}\n");
    return true;
}
976