/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "common.h"
#include "log.h"
#include "shaders.h"
#include "dispatch.h"
#include "gpu.h"
#include "pl_thread.h"

// Maximum number of passes to keep around at once. If full, passes older than
// MIN_AGE are evicted to make room. (Failing that, the cache size doubles)
#define MAX_PASSES 100
#define MIN_AGE 10

enum {
    TMP_PRELUDE,   // GLSL version, global definitions, etc.
    TMP_MAIN,      // main GLSL shader body
    TMP_VERT_HEAD, // vertex shader inputs/outputs
    TMP_VERT_BODY, // vertex shader body
    TMP_COUNT,
};

struct pl_dispatch {
    pl_mutex lock;
    pl_log log;
    pl_gpu gpu;
    uint8_t current_ident;
    uint8_t current_index;
    bool dynamic_constants;
    int max_passes;

    void (*info_callback)(void *, const struct pl_dispatch_info *);
    void *info_priv;

    PL_ARRAY(pl_shader) shaders;                // to avoid re-allocations
    PL_ARRAY(struct pass *) passes;             // compiled passes
    PL_ARRAY(struct cached_pass) cached_passes; // not-yet-compiled passes

    // temporary buffers to help avoid re-allocations during pass creation
    pl_str tmp[TMP_COUNT];
};

enum pass_var_type {
    PASS_VAR_NONE = 0,
    PASS_VAR_GLOBAL, // regular/global uniforms
    PASS_VAR_UBO,    // uniform buffers
    PASS_VAR_PUSHC   // push constants
};

// Cached metadata about a variable's effective placement / update method
struct pass_var {
    int index; // for pl_var_update
    enum pass_var_type type;
    struct pl_var_layout layout;
    void *cached_data;
};

struct pass {
    uint64_t signature; // as returned by pl_shader_signature
    pl_pass pass;
    int last_index;

    // contains cached data and update metadata, same order as pl_shader
    struct pass_var *vars;

    // for uniform buffer updates
    struct pl_shader_desc ubo_desc; // temporary
    int ubo_index;
    pl_buf ubo;

    // Cached pl_pass_run_params. This will also contain mutable allocations
    // for the push constants, descriptor bindings (including the binding for
    // the UBO pre-filled), vertex array and variable updates
    struct pl_pass_run_params run_params;

    // for pl_dispatch_info
    pl_timer timer;
    uint64_t ts_last;
    uint64_t ts_peak;
    uint64_t ts_sum;
    uint64_t samples[PL_ARRAY_SIZE(((struct pl_dispatch_info *) NULL)->samples)];
    int ts_idx;
};

struct cached_pass {
    uint64_t signature;
    const uint8_t *cached_program;
    size_t cached_program_len;
};

static void pass_destroy(pl_dispatch dp, struct pass *pass)
{
    if (!pass)
        return;

    pl_buf_destroy(dp->gpu, &pass->ubo);
    pl_pass_destroy(dp->gpu, &pass->pass);
    pl_timer_destroy(dp->gpu, &pass->timer);
    pl_free(pass);
}

pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu)
{
    struct pl_dispatch *dp = pl_zalloc_ptr(NULL, dp);
    pl_mutex_init(&dp->lock);
    dp->log = log;
    dp->gpu = gpu;
    dp->max_passes = MAX_PASSES;

    return dp;
}
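
// A minimal usage sketch of the dispatch lifecycle (assuming a valid
// `pl_log`/`pl_gpu` pair and a renderable 2D texture `target`; error handling
// and the actual shader contents are omitted):
//
//     pl_dispatch dp = pl_dispatch_create(log, gpu);
//     pl_shader sh = pl_dispatch_begin(dp);
//     // ... populate `sh` using the pl_shader_* helpers ...
//     pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
//         .shader = &sh,
//         .target = target,
//     });
//     pl_dispatch_destroy(&dp);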

void pl_dispatch_destroy(pl_dispatch *ptr)
{
    pl_dispatch dp = *ptr;
    if (!dp)
        return;

    for (int i = 0; i < dp->passes.num; i++)
        pass_destroy(dp, dp->passes.elem[i]);
    for (int i = 0; i < dp->shaders.num; i++)
        pl_shader_free(&dp->shaders.elem[i]);

    pl_mutex_destroy(&dp->lock);
    pl_free(dp);
    *ptr = NULL;
}

pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique)
{
    pl_mutex_lock(&dp->lock);

    struct pl_shader_params params = {
        .id = unique ? dp->current_ident++ : 0,
        .gpu = dp->gpu,
        .index = dp->current_index,
        .dynamic_constants = dp->dynamic_constants,
    };

    pl_shader sh = NULL;
    PL_ARRAY_POP(dp->shaders, &sh);
    pl_mutex_unlock(&dp->lock);

    if (sh) {
        sh->res.params = params;
        return sh;
    }

    return pl_shader_alloc(dp->log, &params);
}

void pl_dispatch_reset_frame(pl_dispatch dp)
{
    pl_mutex_lock(&dp->lock);
    dp->current_ident = 0;
    dp->current_index++;
    pl_mutex_unlock(&dp->lock);
}

void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic)
{
    dp->dynamic_constants = dynamic;
}

void pl_dispatch_callback(pl_dispatch dp, void *priv,
                          void (*cb)(void *priv, const struct pl_dispatch_info *))
{
    dp->info_callback = cb;
    dp->info_priv = priv;
}

pl_shader pl_dispatch_begin(pl_dispatch dp)
{
    return pl_dispatch_begin_ex(dp, false);
}

static bool add_pass_var(pl_dispatch dp, void *tmp, struct pass *pass,
                         struct pl_pass_params *params,
                         const struct pl_shader_var *sv, struct pass_var *pv,
                         bool greedy)
{
    pl_gpu gpu = dp->gpu;
    if (pv->type)
        return true;

    // Try not to use push constants for "large" values like matrices in the
    // first pass, since this is likely to exceed the VGPR/pushc size budgets
    bool try_pushc = greedy || (sv->var.dim_m == 1 && sv->var.dim_a == 1) || sv->dynamic;
    if (try_pushc && gpu->glsl.vulkan && gpu->limits.max_pushc_size) {
        pv->layout = pl_std430_layout(params->push_constants_size, &sv->var);
        size_t new_size = pv->layout.offset + pv->layout.size;
        if (new_size <= gpu->limits.max_pushc_size) {
            params->push_constants_size = new_size;
            pv->type = PASS_VAR_PUSHC;
            return true;
        }
    }

    // If we haven't placed all PCs yet, don't place anything else, since
    // we want to try and fit more stuff into PCs before "giving up"
    if (!greedy)
        return true;

    // Attempt using a uniform buffer next. The GLSL version 440 check is due
    // to explicit offsets on UBO entries. In theory we could leave out
    // the offsets and support UBOs for older GL as well, but this is a nice
    // safety net for driver bugs (and also rules out potentially buggy drivers)
    // Also avoid UBOs for highly dynamic stuff since that requires synchronizing
    // the UBO writes every frame
    bool try_ubo = params->num_variables == gpu->limits.max_variables || !sv->dynamic;
    if (try_ubo && gpu->glsl.version >= 440 && gpu->limits.max_ubo_size) {
        if (sh_buf_desc_append(tmp, gpu, &pass->ubo_desc, &pv->layout, sv->var)) {
            pv->type = PASS_VAR_UBO;
            return true;
        }
    }

    // Otherwise, use global uniforms
    if (params->num_variables < gpu->limits.max_variables) {
        pv->type = PASS_VAR_GLOBAL;
        pv->index = params->num_variables;
        pv->layout = pl_var_host_layout(0, &sv->var);
        PL_ARRAY_APPEND_RAW(tmp, params->variables, params->num_variables, sv->var);
        return true;
    }

    // Ran out of variable binding methods. The most likely scenario in which
    // this can happen is if we're using a GPU that does not support global
    // input vars and we've exhausted the UBO size limits.
    PL_ERR(dp, "Unable to add input variable '%s': possibly exhausted "
           "variable count / UBO size limits?", sv->var.name);
    return false;
}
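
// Note on placement order: add_pass_var is called twice per variable (see
// finalize_pass). The first, non-greedy pass only considers small or dynamic
// values for push constants; the greedy second pass then tries push constants,
// the UBO, and plain global uniforms, in that order. For example, a dynamic
// vec2 will usually end up in the push constant block, while a value that no
// longer fits in the remaining push constant space falls back to the UBO (or a
// global uniform if UBOs are unavailable). The exact outcome depends on the
// GPU's limits.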

#define ADD(x, ...) pl_str_append_asprintf_c(dp, (x), __VA_ARGS__)
#define ADD_STR(x, s) pl_str_append(dp, (x), (s))

static void add_var(pl_dispatch dp, pl_str *body, const struct pl_var *var)
{
    ADD(body, "%s %s", pl_var_glsl_type_name(*var), var->name);

    if (var->dim_a > 1) {
        ADD(body, "[%d];\n", var->dim_a);
    } else {
        ADD(body, ";\n");
    }
}

static int cmp_buffer_var(const void *pa, const void *pb)
{
    const struct pl_buffer_var * const *a = pa, * const *b = pb;
    return PL_CMP((*a)->layout.offset, (*b)->layout.offset);
}

static void add_buffer_vars(pl_dispatch dp, void *tmp, pl_str *body,
                            const struct pl_buffer_var *vars, int num)
{
    // Sort buffer vars
    const struct pl_buffer_var **sorted_vars = pl_calloc_ptr(tmp, num, sorted_vars);
    for (int i = 0; i < num; i++)
        sorted_vars[i] = &vars[i];
    qsort(sorted_vars, num, sizeof(sorted_vars[0]), cmp_buffer_var);

    ADD(body, "{\n");
    for (int i = 0; i < num; i++) {
        // Add an explicit offset wherever possible
        if (dp->gpu->glsl.version >= 440)
            ADD(body, "    layout(offset=%zu) ", sorted_vars[i]->layout.offset);
        add_var(dp, body, &sorted_vars[i]->var);
    }
    ADD(body, "};\n");
}
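
// For illustration, on GLSL >= 440 a block containing a float `gain` at offset
// 0 and a vec4 `color` at offset 16 would be emitted roughly as (names and
// offsets are examples only; offsets depend on the buffer layout in use):
//
//     {
//         layout(offset=0) float gain;
//         layout(offset=16) vec4 color;
//     };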

static ident_t sh_var_from_va(pl_shader sh, const char *name,
                              const struct pl_vertex_attrib *va,
                              const void *data)
{
    return sh_var(sh, (struct pl_shader_var) {
        .var  = pl_var_from_fmt(va->fmt, name),
        .data = data,
    });
}

static inline struct pl_desc_binding sd_binding(const struct pl_shader_desc sd)
{
    // For backwards compatibility with the deprecated field sd.object
    struct pl_desc_binding binding = sd.binding;
    binding.object = PL_DEF(binding.object, sd.object);
    return binding;
}

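// Generates the GLSL source for a pass: a shared prelude (#version, required
// extensions, precision qualifiers), declarations for push constants,
// specialization constants, descriptors and global uniforms, followed by the
// fragment/compute body (plus a trivial vertex shader for raster passes).
// The resulting strings are hashed into pass->signature.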
static void generate_shaders(pl_dispatch dp, void *tmp, struct pass *pass,
                             struct pl_pass_params *params, pl_shader sh,
                             ident_t vert_pos, ident_t out_proj)
{
    pl_gpu gpu = dp->gpu;
    const struct pl_shader_res *res = pl_shader_finalize(sh);

    pl_str *pre = &dp->tmp[TMP_PRELUDE];
    ADD(pre, "#version %d%s\n", gpu->glsl.version,
        (gpu->glsl.gles && gpu->glsl.version > 100) ? " es" : "");
    if (params->type == PL_PASS_COMPUTE)
        ADD(pre, "#extension GL_ARB_compute_shader : enable\n");

    // Enable this unconditionally if the GPU supports it, since we have no way
    // of knowing whether subgroups are being used or not
    if (gpu->glsl.subgroup_size) {
        ADD(pre, "#extension GL_KHR_shader_subgroup_basic : enable \n"
                 "#extension GL_KHR_shader_subgroup_vote : enable \n"
                 "#extension GL_KHR_shader_subgroup_arithmetic : enable \n"
                 "#extension GL_KHR_shader_subgroup_ballot : enable \n"
                 "#extension GL_KHR_shader_subgroup_shuffle : enable \n");
    }

    // Enable all extensions needed for different types of input
    bool has_ssbo = false, has_ubo = false, has_img = false, has_texel = false,
         has_ext = false, has_nofmt = false, has_gather = false;
    for (int i = 0; i < sh->descs.num; i++) {
        switch (sh->descs.elem[i].desc.type) {
        case PL_DESC_BUF_UNIFORM: has_ubo = true; break;
        case PL_DESC_BUF_STORAGE: has_ssbo = true; break;
        case PL_DESC_BUF_TEXEL_UNIFORM: has_texel = true; break;
        case PL_DESC_BUF_TEXEL_STORAGE: {
            pl_buf buf = sd_binding(res->descriptors[i]).object;
            has_nofmt |= !buf->params.format->glsl_format;
            has_texel = true;
            break;
        }
        case PL_DESC_STORAGE_IMG: {
            pl_tex tex = sd_binding(res->descriptors[i]).object;
            has_nofmt |= !tex->params.format->glsl_format;
            has_img = true;
            break;
        }
        case PL_DESC_SAMPLED_TEX: {
            pl_tex tex = sd_binding(res->descriptors[i]).object;
            has_gather |= tex->params.format->gatherable;
            switch (tex->sampler_type) {
            case PL_SAMPLER_NORMAL: break;
            case PL_SAMPLER_RECT: break;
            case PL_SAMPLER_EXTERNAL: has_ext = true; break;
            case PL_SAMPLER_TYPE_COUNT: pl_unreachable();
            }
            break;
        }

        case PL_DESC_INVALID:
        case PL_DESC_TYPE_COUNT:
            pl_unreachable();
        }
    }

    if (has_img)
        ADD(pre, "#extension GL_ARB_shader_image_load_store : enable\n");
    if (has_ubo)
        ADD(pre, "#extension GL_ARB_uniform_buffer_object : enable\n");
    if (has_ssbo)
        ADD(pre, "#extension GL_ARB_shader_storage_buffer_object : enable\n");
    if (has_texel)
        ADD(pre, "#extension GL_ARB_texture_buffer_object : enable\n");
    if (has_ext)
        ADD(pre, "#extension GL_OES_EGL_image_external : enable\n");
    if (has_nofmt)
        ADD(pre, "#extension GL_EXT_shader_image_load_formatted : enable\n");
    if (has_gather)
        ADD(pre, "#extension GL_ARB_texture_gather : enable\n");

    if (gpu->glsl.gles) {
        // Use 32-bit precision for floats if possible
        ADD(pre, "#ifdef GL_FRAGMENT_PRECISION_HIGH \n"
                 "precision highp float;            \n"
                 "#else                             \n"
                 "precision mediump float;          \n"
                 "#endif                            \n");

        // Always use 16-bit precision for samplers
        ADD(pre, "precision mediump sampler2D; \n");
        if (gpu->limits.max_tex_1d_dim)
            ADD(pre, "precision mediump sampler1D; \n");
        if (gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100)
            ADD(pre, "precision mediump sampler3D; \n");
    }

    // Add all of the push constants as their own element
    if (params->push_constants_size) {
        // We re-use add_buffer_vars to make sure the variables are sorted; this
        // is important because the push constants can be out-of-order in
        // `pass->vars`
        PL_ARRAY(struct pl_buffer_var) pc_bvars = {0};
        for (int i = 0; i < res->num_variables; i++) {
            if (pass->vars[i].type != PASS_VAR_PUSHC)
                continue;

            PL_ARRAY_APPEND(tmp, pc_bvars, (struct pl_buffer_var) {
                .var = res->variables[i].var,
                .layout = pass->vars[i].layout,
            });
        }

        ADD(pre, "layout(std430, push_constant) uniform PushC ");
        add_buffer_vars(dp, tmp, pre, pc_bvars.elem, pc_bvars.num);
    }

    // Add all of the specialization constants
    for (int i = 0; i < res->num_constants; i++) {
        static const char *types[PL_VAR_TYPE_COUNT] = {
            [PL_VAR_SINT]   = "int",
            [PL_VAR_UINT]   = "uint",
            [PL_VAR_FLOAT]  = "float",
        };

        const struct pl_shader_const *sc = &res->constants[i];
        ADD(pre, "layout(constant_id=%"PRIu32") const %s %s = 0; \n",
            params->constants[i].id, types[sc->type], sc->name);
    }

    // Add all of the required descriptors
    for (int i = 0; i < res->num_descriptors; i++) {
        const struct pl_shader_desc *sd = &res->descriptors[i];
        const struct pl_desc *desc = &params->descriptors[i];

        switch (desc->type) {
        case PL_DESC_SAMPLED_TEX: {
            static const char *types[][4] = {
                [PL_SAMPLER_NORMAL][1]  = "sampler1D",
                [PL_SAMPLER_NORMAL][2]  = "sampler2D",
                [PL_SAMPLER_NORMAL][3]  = "sampler3D",
                [PL_SAMPLER_RECT][2]    = "sampler2DRect",
                [PL_SAMPLER_EXTERNAL][2] = "samplerExternalOES",
            };

            pl_tex tex = sd_binding(*sd).object;
            int dims = pl_tex_params_dimension(tex->params);
            const char *type = types[tex->sampler_type][dims];
            pl_assert(type);

            static const char prefixes[PL_FMT_TYPE_COUNT] = {
                [PL_FMT_FLOAT]  = ' ',
                [PL_FMT_UNORM]  = ' ',
                [PL_FMT_SNORM]  = ' ',
                [PL_FMT_UINT]   = 'u',
                [PL_FMT_SINT]   = 'i',
            };

            char prefix = prefixes[tex->params.format->type];
            pl_assert(prefix);

            const char *prec = "";
            if (prefix != ' ' && gpu->glsl.gles)
                prec = "highp ";

            // Vulkan requires explicit bindings; GL always sets the
            // bindings manually to avoid relying on the user doing so
            if (gpu->glsl.vulkan)
                ADD(pre, "layout(binding=%d) ", desc->binding);

            pl_assert(type && prefix);
            ADD(pre, "uniform %s%c%s %s;\n", prec, prefix, type, desc->name);
            break;
        }

        case PL_DESC_STORAGE_IMG: {
            static const char *types[] = {
                [1] = "image1D",
                [2] = "image2D",
                [3] = "image3D",
            };

            // For better compatibility, we have to explicitly label the
            // type of data we will be reading/writing to this image.
            pl_tex tex = sd_binding(*sd).object;
            const char *format = tex->params.format->glsl_format;
            const char *access = pl_desc_access_glsl_name(desc->access);
            int dims = pl_tex_params_dimension(tex->params);
            if (gpu->glsl.vulkan) {
                if (format) {
                    ADD(pre, "layout(binding=%d, %s) ", desc->binding, format);
                } else {
                    ADD(pre, "layout(binding=%d) ", desc->binding);
                }
            } else if (gpu->glsl.version >= 130 && format) {
                ADD(pre, "layout(%s) ", format);
            }

            ADD(pre, "%s%s%s restrict uniform %s %s;\n", access,
                (sd->memory & PL_MEMORY_COHERENT) ? " coherent" : "",
                (sd->memory & PL_MEMORY_VOLATILE) ? " volatile" : "",
                types[dims], desc->name);
            break;
        }

        case PL_DESC_BUF_UNIFORM:
            if (gpu->glsl.vulkan) {
                ADD(pre, "layout(std140, binding=%d) ", desc->binding);
            } else {
                ADD(pre, "layout(std140) ");
            }
            ADD(pre, "uniform %s ", desc->name);
            add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars);
            break;

        case PL_DESC_BUF_STORAGE:
            if (gpu->glsl.vulkan) {
                ADD(pre, "layout(std430, binding=%d) ", desc->binding);
            } else if (gpu->glsl.version >= 140) {
                ADD(pre, "layout(std430) ");
            }
            ADD(pre, "%s%s%s restrict buffer %s ",
                pl_desc_access_glsl_name(desc->access),
                (sd->memory & PL_MEMORY_COHERENT) ? " coherent" : "",
                (sd->memory & PL_MEMORY_VOLATILE) ? " volatile" : "",
                desc->name);
            add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars);
            break;

        case PL_DESC_BUF_TEXEL_UNIFORM:
            if (gpu->glsl.vulkan)
                ADD(pre, "layout(binding=%d) ", desc->binding);
            ADD(pre, "uniform samplerBuffer %s;\n", desc->name);
            break;

        case PL_DESC_BUF_TEXEL_STORAGE: {
            pl_buf buf = sd_binding(*sd).object;
            const char *format = buf->params.format->glsl_format;
            const char *access = pl_desc_access_glsl_name(desc->access);
            if (gpu->glsl.vulkan) {
                if (format) {
                    ADD(pre, "layout(binding=%d, %s) ", desc->binding, format);
                } else {
                    ADD(pre, "layout(binding=%d) ", desc->binding);
                }
            } else if (format) {
                ADD(pre, "layout(%s) ", format);
            }

            ADD(pre, "%s%s%s restrict uniform imageBuffer %s;\n", access,
                (sd->memory & PL_MEMORY_COHERENT) ? " coherent" : "",
                (sd->memory & PL_MEMORY_VOLATILE) ? " volatile" : "",
                desc->name);
            break;
        }

        case PL_DESC_INVALID:
        case PL_DESC_TYPE_COUNT:
            pl_unreachable();
        }
    }

    // Add all of the remaining variables
    for (int i = 0; i < res->num_variables; i++) {
        const struct pl_var *var = &res->variables[i].var;
        const struct pass_var *pv = &pass->vars[i];
        if (pv->type != PASS_VAR_GLOBAL)
            continue;
        ADD(pre, "uniform ");
        add_var(dp, pre, var);
    }

    char *vert_in  = gpu->glsl.version >= 130 ? "in" : "attribute";
    char *vert_out = gpu->glsl.version >= 130 ? "out" : "varying";
    char *frag_in  = gpu->glsl.version >= 130 ? "in" : "varying";

    pl_str *glsl = &dp->tmp[TMP_MAIN];
    ADD_STR(glsl, *pre);

    const char *out_color = "gl_FragColor";
    switch(params->type) {
    case PL_PASS_RASTER: {
        pl_assert(vert_pos);
        pl_str *vert_head = &dp->tmp[TMP_VERT_HEAD];
        pl_str *vert_body = &dp->tmp[TMP_VERT_BODY];

        // Set up a trivial vertex shader
        ADD_STR(vert_head, *pre);
        ADD(vert_body, "void main() {\n");
        for (int i = 0; i < sh->vas.num; i++) {
            const struct pl_vertex_attrib *va = &params->vertex_attribs[i];
            const struct pl_shader_va *sva = &sh->vas.elem[i];
            const char *type = va->fmt->glsl_type;

            // Use the pl_shader_va for the name in the fragment shader since
            // the pl_vertex_attrib is already mangled for the vertex shader
            const char *name = sva->attr.name;

            char loc[32];
            snprintf(loc, sizeof(loc), "layout(location=%d)", va->location);
            // Older GLSL doesn't support the use of explicit locations
            if (gpu->glsl.version < 430)
                loc[0] = '\0';
            ADD(vert_head, "%s %s %s %s;\n", loc, vert_in, type, va->name);

            if (strcmp(name, vert_pos) == 0) {
                pl_assert(va->fmt->num_components == 2);
                if (out_proj) {
                    ADD(vert_body, "gl_Position = vec4((%s * vec3(%s, 1.0)).xy, 0.0, 1.0); \n",
                        out_proj, va->name);
                } else {
                    ADD(vert_body, "gl_Position = vec4(%s, 0.0, 1.0);\n", va->name);
                }
            } else {
                // Everything else is just blindly passed through
                ADD(vert_head, "%s %s %s %s;\n", loc, vert_out, type, name);
                ADD(vert_body, "%s = %s;\n", name, va->name);
                ADD(glsl, "%s %s %s %s;\n", loc, frag_in, type, name);
            }
        }

        ADD(vert_body, "}");
        ADD_STR(vert_head, *vert_body);
        params->vertex_shader = vert_head->buf;
        pl_hash_merge(&pass->signature, pl_str_hash(*vert_head));

        // GLSL 130+ doesn't use the magic gl_FragColor
        if (gpu->glsl.version >= 130) {
            out_color = "out_color";
            ADD(glsl, "%s out vec4 %s;\n",
                gpu->glsl.version >= 430 ? "layout(location=0) " : "",
                out_color);
        }
        break;
    }
    case PL_PASS_COMPUTE:
        ADD(glsl, "layout (local_size_x = %d, local_size_y = %d) in;\n",
            res->compute_group_size[0], res->compute_group_size[1]);
        break;
    case PL_PASS_INVALID:
    case PL_PASS_TYPE_COUNT:
        pl_unreachable();
    }

    // Set up the main shader body
    ADD(glsl, "%s", res->glsl);
    ADD(glsl, "void main() {\n");

    pl_assert(res->input == PL_SHADER_SIG_NONE);
    switch (params->type) {
    case PL_PASS_RASTER:
        pl_assert(res->output == PL_SHADER_SIG_COLOR);
        ADD(glsl, "%s = %s();\n", out_color, res->name);
        break;
    case PL_PASS_COMPUTE:
        ADD(glsl, "%s();\n", res->name);
        break;
    case PL_PASS_INVALID:
    case PL_PASS_TYPE_COUNT:
        pl_unreachable();
    }

    ADD(glsl, "}");
    params->glsl_shader = glsl->buf;
    pl_hash_merge(&pass->signature, pl_str_hash(*glsl));
}
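
// For illustration, under Vulkan the descriptor declarations generated above
// look roughly like this (names and binding numbers are examples only):
//
//     layout(binding=0) uniform sampler2D src_tex;
//     layout(binding=1, rgba16f) writeonly restrict uniform image2D out_image;
//     layout(std140, binding=2) uniform UBO { layout(offset=0) vec2 scale; };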

#undef ADD
#undef ADD_STR

#define pass_age(pass) (dp->current_index - (pass)->last_index)

static int cmp_pass_age(const void *ptra, const void *ptrb)
{
    const struct pass *a = *(const struct pass **) ptra;
    const struct pass *b = *(const struct pass **) ptrb;
    return b->last_index - a->last_index;
}

static void garbage_collect_passes(pl_dispatch dp)
{
    if (dp->passes.num <= dp->max_passes)
        return;

    // Garbage collect oldest passes, starting at the middle
    qsort(dp->passes.elem, dp->passes.num, sizeof(struct pass *), cmp_pass_age);
    int idx = dp->passes.num / 2;
    while (idx < dp->passes.num && pass_age(dp->passes.elem[idx]) < MIN_AGE)
        idx++;

    for (int i = idx; i < dp->passes.num; i++)
        pass_destroy(dp, dp->passes.elem[i]);

    int num_evicted = dp->passes.num - idx;
    dp->passes.num = idx;

    if (num_evicted) {
        PL_DEBUG(dp, "Evicted %d passes from dispatch cache, consider "
                 "using more dynamic shaders", num_evicted);
    } else {
        dp->max_passes *= 2;
    }
}

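// Builds (or re-uses) the pl_pass for a finalized shader: fills in the
// pl_pass_params, assigns vertex attribute locations, places all variables
// (push constants / UBO / globals), generates the GLSL, and then looks the
// result up in the pass cache by signature before creating a new GPU pass.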
static struct pass *finalize_pass(pl_dispatch dp, pl_shader sh,
                                  pl_tex target, ident_t vert_pos,
                                  const struct pl_blend_params *blend, bool load,
                                  const struct pl_dispatch_vertex_params *vparams,
                                  ident_t out_proj)
{
    struct pass *pass = pl_alloc_ptr(dp, pass);
    *pass = (struct pass) {
        .signature = 0x0, // updated incrementally below
        .last_index = dp->current_index,
        .ubo_desc = {
            .desc = {
                .name = "UBO",
                .type = PL_DESC_BUF_UNIFORM,
            },
        },
    };

    // For identifiers tied to the lifetime of this shader
    void *tmp = SH_TMP(sh);

    struct pl_pass_params params = {
        .type = pl_shader_is_compute(sh) ? PL_PASS_COMPUTE : PL_PASS_RASTER,
        .num_descriptors = sh->descs.num,
        .vertex_type = vparams ? vparams->vertex_type : PL_PRIM_TRIANGLE_STRIP,
        .vertex_stride = vparams ? vparams->vertex_stride : 0,
        .blend_params = blend,
    };

    if (params.type == PL_PASS_RASTER) {
        assert(target);
        params.target_dummy = *target;
        params.load_target = load;

        // Fill in the vertex attributes array
        params.num_vertex_attribs = sh->vas.num;
        params.vertex_attribs = pl_calloc_ptr(tmp, sh->vas.num, params.vertex_attribs);

        int va_loc = 0;
        for (int i = 0; i < sh->vas.num; i++) {
            struct pl_vertex_attrib *va = &params.vertex_attribs[i];
            *va = sh->vas.elem[i].attr;

            // Mangle the name to make sure it doesn't conflict with the
            // fragment shader input
            va->name = pl_asprintf(tmp, "%s_v", va->name);

            // Place the vertex attribute
            va->location = va_loc;
            if (!vparams) {
                va->offset = params.vertex_stride;
                params.vertex_stride += va->fmt->texel_size;
            }

            // The number of vertex attribute locations consumed by a vertex
            // attribute is the number of vec4s it consumes, rounded up
            const size_t va_loc_size = sizeof(float[4]);
            va_loc += (va->fmt->texel_size + va_loc_size - 1) / va_loc_size;
        }

        // Hash in the raster state configuration
        pl_hash_merge(&pass->signature, (uint64_t) params.vertex_type);
        pl_hash_merge(&pass->signature, (uint64_t) params.vertex_stride);
        pl_hash_merge(&pass->signature, (uint64_t) params.load_target);
        pl_hash_merge(&pass->signature, (uintptr_t) target->params.format);
        if (blend)
            pl_hash_merge(&pass->signature, pl_mem_hash(blend, sizeof(*blend)));
    }

    // Place all of the compile-time constants
    uint8_t *constant_data = NULL;
    if (sh->consts.num) {
        params.num_constants = sh->consts.num;
        params.constants = pl_alloc(tmp, sh->consts.num * sizeof(struct pl_constant));

        // Compute offsets
        size_t total_size = 0;
        uint32_t const_id = 0;
        for (int i = 0; i < sh->consts.num; i++) {
            params.constants[i] = (struct pl_constant) {
                .type = sh->consts.elem[i].type,
                .id = const_id++,
                .offset = total_size,
            };
            total_size += pl_var_type_size(sh->consts.elem[i].type);
        }

        // Write values into the constants buffer
        params.constant_data = constant_data = pl_alloc(pass, total_size);
        for (int i = 0; i < sh->consts.num; i++) {
            const struct pl_shader_const *sc = &sh->consts.elem[i];
            void *data = constant_data + params.constants[i].offset;
            memcpy(data, sc->data, pl_var_type_size(sc->type));
        }
    }

    // Place all the variables; these will dynamically end up in different
    // locations based on what the underlying GPU supports (UBOs, pushc, etc.)
    //
    // We go through the list twice, once to place stuff that we definitely
    // want inside PCs, and then a second time to opportunistically place the rest.
    pass->vars = pl_calloc_ptr(pass, sh->vars.num, pass->vars);
    for (int i = 0; i < sh->vars.num; i++) {
        if (!add_pass_var(dp, tmp, pass, &params, &sh->vars.elem[i], &pass->vars[i], false))
            goto error;
    }
    for (int i = 0; i < sh->vars.num; i++) {
        if (!add_pass_var(dp, tmp, pass, &params, &sh->vars.elem[i], &pass->vars[i], true))
            goto error;
    }

    // Now that we know the variable placement, finalize pushc/UBO sizes
    params.push_constants_size = PL_ALIGN2(params.push_constants_size, 4);
    size_t ubo_size = sh_buf_desc_size(&pass->ubo_desc);
    if (ubo_size) {
        pass->ubo_index = sh->descs.num;
        sh_desc(sh, pass->ubo_desc);
    };

    // Place and fill in the descriptors
    const int num_descs = sh->descs.num;
    int binding[PL_DESC_TYPE_COUNT] = {0};
    params.num_descriptors = num_descs;
    params.descriptors = pl_calloc_ptr(tmp, num_descs, params.descriptors);
    for (int i = 0; i < num_descs; i++) {
        struct pl_desc *desc = &params.descriptors[i];
        *desc = sh->descs.elem[i].desc;
        desc->binding = binding[pl_desc_namespace(dp->gpu, desc->type)]++;
    }

    // Finalize the shader and look it up in the pass cache
    generate_shaders(dp, tmp, pass, &params, sh, vert_pos, out_proj);
    for (int i = 0; i < dp->passes.num; i++) {
        struct pass *p = dp->passes.elem[i];
        if (p->signature != pass->signature)
            continue;

        // Found existing shader, re-use directly
        if (p->ubo)
            sh->descs.elem[p->ubo_index].binding.object = p->ubo;
        pl_free(p->run_params.constant_data);
        p->run_params.constant_data = pl_steal(p, constant_data);
        p->last_index = dp->current_index;
        pl_free(pass);
        return p;
    }

    // Find and attach the cached program, if any
    for (int i = 0; i < dp->cached_passes.num; i++) {
        if (dp->cached_passes.elem[i].signature == pass->signature) {
            PL_DEBUG(dp, "Re-using cached program with signature 0x%llx",
                     (unsigned long long) pass->signature);

            params.cached_program = dp->cached_passes.elem[i].cached_program;
            params.cached_program_len = dp->cached_passes.elem[i].cached_program_len;
            PL_ARRAY_REMOVE_AT(dp->cached_passes, i);
            break;
        }
    }

    pass->pass = pl_pass_create(dp->gpu, &params);
    if (!pass->pass) {
        PL_ERR(dp, "Failed creating render pass for dispatch");
        // Add it anyway
    }

    struct pl_pass_run_params *rparams = &pass->run_params;
    rparams->pass = pass->pass;
    rparams->constant_data = constant_data;
    rparams->push_constants = pl_zalloc(pass, params.push_constants_size);
    rparams->desc_bindings = pl_calloc_ptr(pass, params.num_descriptors,
                                           rparams->desc_bindings);

    if (ubo_size && pass->pass) {
        // Create the UBO
        pass->ubo = pl_buf_create(dp->gpu, &(struct pl_buf_params) {
            .size = ubo_size,
            .uniform = true,
            .host_writable = true,
        });

        if (!pass->ubo) {
            PL_ERR(dp, "Failed creating uniform buffer for dispatch");
            goto error;
        }

        sh->descs.elem[pass->ubo_index].binding.object = pass->ubo;
    }

    if (params.type == PL_PASS_RASTER && !vparams) {
        // Generate the vertex array placeholder
        rparams->vertex_count = 4; // single quad
        size_t vert_size = rparams->vertex_count * params.vertex_stride;
        rparams->vertex_data = pl_zalloc(pass, vert_size);
    }

    pass->timer = pl_timer_create(dp->gpu);

    garbage_collect_passes(dp);
    PL_ARRAY_APPEND(dp, dp->passes, pass);
    return pass;

error:
    pass_destroy(dp, pass);
    return NULL;
}

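// Uploads a new value for a single shader variable, skipping the update if the
// cached copy is unchanged, and routing it according to how the variable was
// placed: a pl_var_update entry for globals, a pl_buf_write for the UBO, or a
// memcpy into the push constant buffer.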
static void update_pass_var(pl_dispatch dp, struct pass *pass,
                            const struct pl_shader_var *sv, struct pass_var *pv)
{
    struct pl_var_layout host_layout = pl_var_host_layout(0, &sv->var);
    pl_assert(host_layout.size);

    // Use the cache to skip updates if possible
    if (pv->cached_data && !memcmp(sv->data, pv->cached_data, host_layout.size))
        return;
    if (!pv->cached_data)
        pv->cached_data = pl_alloc(pass, host_layout.size);
    memcpy(pv->cached_data, sv->data, host_layout.size);

    struct pl_pass_run_params *rparams = &pass->run_params;
    switch (pv->type) {
    case PASS_VAR_NONE:
        pl_unreachable();
    case PASS_VAR_GLOBAL: {
        struct pl_var_update vu = {
            .index = pv->index,
            .data  = sv->data,
        };
        PL_ARRAY_APPEND_RAW(pass, rparams->var_updates, rparams->num_var_updates, vu);
        break;
    }
    case PASS_VAR_UBO: {
        pl_assert(pass->ubo);
        const size_t offset = pv->layout.offset;
        if (host_layout.stride == pv->layout.stride) {
            pl_assert(host_layout.size == pv->layout.size);
            pl_buf_write(dp->gpu, pass->ubo, offset, sv->data, host_layout.size);
        } else {
            // Coalesce strided UBO write into a single pl_buf_write to avoid
            // unnecessary synchronization overhead by assembling the correctly
            // strided upload in RAM
            pl_grow(dp, &dp->tmp[0].buf, pv->layout.size);
            uint8_t * const tmp = dp->tmp[0].buf;
            const uint8_t *src = sv->data;
            const uint8_t *end = src + host_layout.size;
            uint8_t *dst = tmp;
            while (src < end) {
                memcpy(dst, src, host_layout.stride);
                src += host_layout.stride;
                dst += pv->layout.stride;
            }
            pl_buf_write(dp->gpu, pass->ubo, offset, tmp, pv->layout.size);
        }
        break;
    }
    case PASS_VAR_PUSHC:
        pl_assert(rparams->push_constants);
        memcpy_layout(rparams->push_constants, pv->layout, sv->data, host_layout);
        break;
    };
}

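// Emulates vertex attributes inside a compute shader: each attribute becomes a
// bilinear mix of its four per-vertex values, evaluated at the normalized
// position of the current invocation. For an attribute named `coord` this
// emits, roughly:
//
//     #define coord_map(id) (mix(mix(p0, p1, frag_map(id).x),
//                                mix(p2, p3, frag_map(id).x), frag_map(id).y))
//     #define coord (coord_map(gl_GlobalInvocationID))
//
// where p0..p3 are the (mangled) per-vertex values uploaded as uniforms.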
static void compute_vertex_attribs(pl_dispatch dp, pl_shader sh,
                                   int width, int height, ident_t *out_scale)
{
    // Simulate vertex attributes using global definitions
    *out_scale = sh_var(sh, (struct pl_shader_var) {
        .var     = pl_var_vec2("out_scale"),
        .data    = &(float[2]){ 1.0 / width, 1.0 / height },
        .dynamic = true,
    });

    GLSLP("#define frag_pos(id) (vec2(id) + vec2(0.5)) \n"
          "#define frag_map(id) (%s * frag_pos(id))    \n"
          "#define gl_FragCoord vec4(frag_pos(gl_GlobalInvocationID), 0.0, 1.0) \n",
          *out_scale);

    for (int n = 0; n < sh->vas.num; n++) {
        const struct pl_shader_va *sva = &sh->vas.elem[n];

        ident_t points[4];
        for (int i = 0; i < PL_ARRAY_SIZE(points); i++) {
            char name[4];
            snprintf(name, sizeof(name), "p%d", i);
            points[i] = sh_var_from_va(sh, name, &sva->attr, sva->data[i]);
        }

        GLSLP("#define %s_map(id) "
             "(mix(mix(%s, %s, frag_map(id).x), "
             "     mix(%s, %s, frag_map(id).x), "
             "frag_map(id).y))\n"
             "#define %s (%s_map(gl_GlobalInvocationID))\n",
             sva->attr.name,
             points[0], points[1], points[2], points[3],
             sva->attr.name, sva->attr.name);
    }
}

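// Rewrites a color-producing shader so it can run as a compute pass without a
// raster pipeline: the target is bound as a storage image, the invocation ID
// is mapped to a pixel position inside `rc`, blending (if requested) is
// emulated with an imageLoad, and the result is written with imageStore.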
static void translate_compute_shader(pl_dispatch dp, pl_shader sh,
                                     const struct pl_rect2d *rc,
                                     const struct pl_dispatch_params *params)
{
    int width = abs(pl_rect_w(*rc)), height = abs(pl_rect_h(*rc));
    ident_t out_scale;
    compute_vertex_attribs(dp, sh, width, height, &out_scale);

    // Simulate a framebuffer using storage images
    pl_assert(params->target->params.storable);
    pl_assert(sh->res.output == PL_SHADER_SIG_COLOR);
    ident_t fbo = sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = params->target,
        .desc = {
            .name    = "out_image",
            .type    = PL_DESC_STORAGE_IMG,
            .access  = params->blend_params ? PL_DESC_ACCESS_READWRITE
                                            : PL_DESC_ACCESS_WRITEONLY,
        },
    });

    ident_t base = sh_var(sh, (struct pl_shader_var) {
        .data    = &(int[2]){ rc->x0, rc->y0 },
        .dynamic = true,
        .var     = {
            .name  = "base",
            .type  = PL_VAR_SINT,
            .dim_v = 2,
            .dim_m = 1,
            .dim_a = 1,
        },
    });

    int dx = rc->x0 > rc->x1 ? -1 : 1, dy = rc->y0 > rc->y1 ? -1 : 1;
    GLSL("ivec2 dir = ivec2(%d, %d);\n", dx, dy); // hard-code, not worth var
    GLSL("ivec2 pos = %s + dir * ivec2(gl_GlobalInvocationID);\n", base);
    GLSL("vec2 fpos = %s * vec2(gl_GlobalInvocationID);\n", out_scale);
    GLSL("if (max(fpos.x, fpos.y) < 1.0) {\n");
    if (params->blend_params) {
        GLSL("vec4 orig = imageLoad(%s, pos);\n", fbo);

        static const char *modes[] = {
            [PL_BLEND_ZERO] = "0.0",
            [PL_BLEND_ONE]  = "1.0",
            [PL_BLEND_SRC_ALPHA] = "color.a",
            [PL_BLEND_ONE_MINUS_SRC_ALPHA] = "(1.0 - color.a)",
        };

        GLSL("color = vec4(color.rgb * vec3(%s), color.a * %s) \n"
             "      + vec4(orig.rgb  * vec3(%s), orig.a  * %s);\n",
             modes[params->blend_params->src_rgb],
             modes[params->blend_params->src_alpha],
             modes[params->blend_params->dst_rgb],
             modes[params->blend_params->dst_alpha]);
    }
    GLSL("imageStore(%s, pos, color);\n", fbo);
    GLSL("}\n");
    sh->res.output = PL_SHADER_SIG_NONE;
}

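// Runs the pass and drains any completed GPU timer queries into a small ring
// buffer of samples (tracking the last, peak and running-sum values), which is
// then forwarded to the user's info callback, if one is registered.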
static void run_pass(pl_dispatch dp, pl_shader sh, struct pass *pass)
{
    const struct pl_shader_res *res = pl_shader_finalize(sh);
    pl_pass_run(dp->gpu, &pass->run_params);

    for (uint64_t ts; (ts = pl_timer_query(dp->gpu, pass->timer));) {
        PL_TRACE(dp, "Spent %.3f ms on shader: %s", ts / 1e6, res->description);

        uint64_t old = pass->samples[pass->ts_idx];
        pass->samples[pass->ts_idx] = ts;
        pass->ts_last = ts;
        pass->ts_peak = PL_MAX(pass->ts_peak, ts);
        pass->ts_sum += ts;
        pass->ts_idx = (pass->ts_idx + 1) % PL_ARRAY_SIZE(pass->samples);

        if (old) {
            pass->ts_sum -= old;
            if (old == pass->ts_peak) {
                uint64_t new_peak = 0;
                for (int i = 0; i < PL_ARRAY_SIZE(pass->samples); i++)
                    new_peak = PL_MAX(new_peak, pass->samples[i]);
                pass->ts_peak = new_peak;
            }
        }
    }

    if (!dp->info_callback)
        return;

    struct pl_dispatch_info info;
    info.signature = pass->signature;
    info.shader = res;

    // Test to see if the ring buffer already wrapped around once
    if (pass->samples[pass->ts_idx]) {
        info.num_samples = PL_ARRAY_SIZE(pass->samples);
        int num_wrapped = info.num_samples - pass->ts_idx;
        memcpy(info.samples, &pass->samples[pass->ts_idx],
               num_wrapped * sizeof(info.samples[0]));
        memcpy(&info.samples[num_wrapped], pass->samples,
               pass->ts_idx * sizeof(info.samples[0]));
    } else {
        info.num_samples = pass->ts_idx;
        memcpy(info.samples, pass->samples,
               pass->ts_idx * sizeof(info.samples[0]));
    }

    info.last = pass->ts_last;
    info.peak = pass->ts_peak;
    info.average = pass->ts_sum / PL_MAX(info.num_samples, 1);
    dp->info_callback(dp->info_priv, &info);
}

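// Dispatches a finished color-producing shader to a renderable 2D target. A
// minimal call sketch (assuming `sh` came from pl_dispatch_begin and `target`
// is a renderable 2D texture; `rect` is optional and defaults to the full
// target, the values below are purely hypothetical):
//
//     bool ok = pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
//         .shader = &sh,
//         .target = target,
//         .rect   = { 0, 0, 256, 256 }, // hypothetical sub-rect
//     });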
bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params)
{
    pl_shader sh = *params->shader;
    const struct pl_shader_res *res = &sh->res;
    bool ret = false;
    pl_mutex_lock(&dp->lock);

    if (sh->failed) {
        PL_ERR(sh, "Trying to dispatch a failed shader.");
        goto error;
    }

    if (!sh->mutable) {
        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
        goto error;
    }

    if (res->input != PL_SHADER_SIG_NONE || res->output != PL_SHADER_SIG_COLOR) {
        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
        goto error;
    }

    const struct pl_tex_params *tpars = &params->target->params;
    if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
        PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
               "texture. The target must be a renderable 2D texture.");
        goto error;
    }

    const struct pl_gpu_limits *limits = &dp->gpu->limits;
    bool can_compute = tpars->storable;
    if (can_compute && params->blend_params)
        can_compute = tpars->format->caps & PL_FMT_CAP_READWRITE;

    if (pl_shader_is_compute(sh) && !can_compute) {
        PL_ERR(dp, "Trying to dispatch using a compute shader with a "
               "non-storable or incompatible target texture.");
        goto error;
    } else if (can_compute && limits->compute_queues > limits->fragment_queues) {
        if (sh_try_compute(sh, 16, 16, true, 0))
            PL_TRACE(dp, "Upgrading fragment shader to compute shader.");
    }

    struct pl_rect2d rc = params->rect;
    if (!pl_rect_w(rc)) {
        rc.x0 = 0;
        rc.x1 = tpars->w;
    }
    if (!pl_rect_h(rc)) {
        rc.y0 = 0;
        rc.y1 = tpars->h;
    }

    int w, h, tw = abs(pl_rect_w(rc)), th = abs(pl_rect_h(rc));
    if (pl_shader_output_size(sh, &w, &h) && (w != tw || h != th))
    {
        PL_ERR(dp, "Trying to dispatch a shader with explicit output size "
               "requirements %dx%d using a target rect of size %dx%d.",
               w, h, tw, th);
        goto error;
    }

    ident_t vert_pos = NULL;

    if (pl_shader_is_compute(sh)) {
        // Translate the compute shader to simulate vertices etc.
        translate_compute_shader(dp, sh, &rc, params);
    } else {
        // Add the vertex information encoding the position
        vert_pos = sh_attr_vec2(sh, "position", &(const struct pl_rect2df) {
            .x0 = 2.0 * rc.x0 / tpars->w - 1.0,
            .y0 = 2.0 * rc.y0 / tpars->h - 1.0,
            .x1 = 2.0 * rc.x1 / tpars->w - 1.0,
            .y1 = 2.0 * rc.y1 / tpars->h - 1.0,
        });
    }

    // We need to set pl_pass_params.load_target when either blending is
    // enabled or we're drawing to some scissored sub-rect of the texture
    struct pl_rect2d full = { 0, 0, tpars->w, tpars->h };
    struct pl_rect2d rc_norm = rc;
    pl_rect2d_normalize(&rc_norm);
    rc_norm.x0 = PL_MAX(rc_norm.x0, 0);
    rc_norm.y0 = PL_MAX(rc_norm.y0, 0);
    rc_norm.x1 = PL_MIN(rc_norm.x1, tpars->w);
    rc_norm.y1 = PL_MIN(rc_norm.y1, tpars->h);
    bool load = params->blend_params || !pl_rect2d_eq(rc_norm, full);

    struct pass *pass = finalize_pass(dp, sh, params->target, vert_pos,
                                      params->blend_params, load, NULL, NULL);

    // Silently return on failed passes
    if (!pass || !pass->pass)
        goto error;

    struct pl_pass_run_params *rparams = &pass->run_params;

    // Update the descriptor bindings
    for (int i = 0; i < sh->descs.num; i++)
        rparams->desc_bindings[i] = sd_binding(sh->descs.elem[i]);

    // Update all of the variables (if needed)
    rparams->num_var_updates = 0;
    for (int i = 0; i < sh->vars.num; i++)
        update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);

    // Update the vertex data
    if (rparams->vertex_data) {
        uintptr_t vert_base = (uintptr_t) rparams->vertex_data;
        size_t stride = rparams->pass->params.vertex_stride;
        for (int i = 0; i < sh->vas.num; i++) {
            const struct pl_shader_va *sva = &sh->vas.elem[i];
            struct pl_vertex_attrib *va = &rparams->pass->params.vertex_attribs[i];

            size_t size = sva->attr.fmt->texel_size;
            uintptr_t va_base = vert_base + va->offset; // use placed offset
            for (int n = 0; n < 4; n++)
                memcpy((void *) (va_base + n * stride), sva->data[n], size);
        }
    }

    // For compute shaders: also update the dispatch dimensions
    if (pl_shader_is_compute(sh)) {
        // Round up to make sure we don't leave off a part of the target
        int width = abs(pl_rect_w(rc)),
            height = abs(pl_rect_h(rc)),
            block_w = res->compute_group_size[0],
            block_h = res->compute_group_size[1],
            num_x   = (width  + block_w - 1) / block_w,
            num_y   = (height + block_h - 1) / block_h;

        rparams->compute_groups[0] = num_x;
        rparams->compute_groups[1] = num_y;
        rparams->compute_groups[2] = 1;
    } else {
        // Update the scissors for performance
        rparams->scissors = rc_norm;
    }

    // Dispatch the actual shader
    rparams->target = params->target;
    rparams->timer = PL_DEF(params->timer, pass->timer);
    run_pass(dp, sh, pass);

    ret = true;
    // fall through

error:
    // Reset the temporary buffers which we use to build the shader
    for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
        dp->tmp[i].len = 0;

    pl_mutex_unlock(&dp->lock);
    pl_dispatch_abort(dp, params->shader);
    return ret;
}

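// Dispatches a (targetless) compute shader. The dispatch size can be given
// explicitly via `dispatch_size`; if it is left as zero, it is derived from
// `width`/`height` and the shader's work group size. Shaders that use vertex
// attributes additionally require `width`/`height` to be set.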
pl_dispatch_compute(pl_dispatch dp,const struct pl_dispatch_compute_params * params)1275 bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params)
1276 {
1277     pl_shader sh = *params->shader;
1278     const struct pl_shader_res *res = &sh->res;
1279     bool ret = false;
1280     pl_mutex_lock(&dp->lock);
1281 
1282     if (sh->failed) {
1283         PL_ERR(sh, "Trying to dispatch a failed shader.");
1284         goto error;
1285     }
1286 
1287     if (!sh->mutable) {
1288         PL_ERR(dp, "Trying to dispatch non-mutable shader?");
1289         goto error;
1290     }
1291 
1292     if (res->input != PL_SHADER_SIG_NONE) {
1293         PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
1294         goto error;
1295     }
1296 
1297     if (!pl_shader_is_compute(sh)) {
1298         PL_ERR(dp, "Trying to dispatch a non-compute shader using "
1299                "`pl_dispatch_compute`!");
1300         goto error;
1301     }
1302 
1303     if (sh->vas.num) {
1304         if (!params->width || !params->height) {
1305             PL_ERR(dp, "Trying to dispatch a targetless compute shader that "
1306                    "uses vertex attributes, this requires specifying the size "
1307                    "of the effective rendering area!");
1308             goto error;
1309         }
1310 
1311         compute_vertex_attribs(dp, sh, params->width, params->height,
1312                                &(ident_t){0});
1313     }
1314 
1315     struct pass *pass = finalize_pass(dp, sh, NULL, NULL, NULL, false, NULL, NULL);
1316 
1317     // Silently return on failed passes
1318     if (!pass || !pass->pass)
1319         goto error;
1320 
1321     struct pl_pass_run_params *rparams = &pass->run_params;
1322 
1323     // Update the descriptor bindings
1324     for (int i = 0; i < sh->descs.num; i++)
1325         rparams->desc_bindings[i] = sd_binding(sh->descs.elem[i]);
1326 
1327     // Update all of the variables (if needed)
1328     rparams->num_var_updates = 0;
1329     for (int i = 0; i < sh->vars.num; i++)
1330         update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);
1331 
1332     // Update the dispatch size
1333     int groups = 1;
1334     for (int i = 0; i < 3; i++) {
1335         groups *= params->dispatch_size[i];
1336         rparams->compute_groups[i] = params->dispatch_size[i];
1337     }
1338 
1339     if (!groups) {
1340         pl_assert(params->width && params->height);
1341         int block_w = res->compute_group_size[0],
1342             block_h = res->compute_group_size[1],
1343             num_x   = (params->width  + block_w - 1) / block_w,
1344             num_y   = (params->height + block_h - 1) / block_h;
1345 
1346         rparams->compute_groups[0] = num_x;
1347         rparams->compute_groups[1] = num_y;
1348         rparams->compute_groups[2] = 1;
1349     }
1350 
1351     // Dispatch the actual shader
1352     rparams->timer = PL_DEF(params->timer, pass->timer);
1353     run_pass(dp, sh, pass);
1354 
1355     ret = true;
1356     // fall through
1357 
1358 error:
1359     // Reset the temporary buffers which we use to build the shader
1360     for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
1361         dp->tmp[i].len = 0;
1362 
1363     pl_mutex_unlock(&dp->lock);
1364     pl_dispatch_abort(dp, params->shader);
1365     return ret;
1366 }
1367 
pl_dispatch_vertex(pl_dispatch dp,const struct pl_dispatch_vertex_params * params)1368 bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params)
1369 {
1370     pl_shader sh = *params->shader;
1371     const struct pl_shader_res *res = &sh->res;
1372     bool ret = false;
1373     pl_mutex_lock(&dp->lock);
1374 
1375     if (sh->failed) {
1376         PL_ERR(sh, "Trying to dispatch a failed shader.");
1377         goto error;
1378     }
1379 
1380     if (!sh->mutable) {
1381         PL_ERR(dp, "Trying to dispatch non-mutable shader?");
1382         goto error;
1383     }
1384 
1385     if (res->input != PL_SHADER_SIG_NONE || res->output != PL_SHADER_SIG_COLOR) {
1386         PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
1387         goto error;
1388     }
1389 
1390     const struct pl_tex_params *tpars = &params->target->params;
1391     if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
1392         PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
1393                "texture. The target must be a renderable 2D texture.");
1394         goto error;
1395     }
1396 
1397     if (pl_shader_is_compute(sh)) {
1398         PL_ERR(dp, "Trying to dispatch a compute shader using pl_dispatch_vertex.");
1399         goto error;
1400     }
1401 
1402     if (sh->vas.num) {
1403         PL_ERR(dp, "Trying to dispatch a custom vertex shader with already "
1404                "attached vertex attributes.");
1405         goto error;
1406     }
1407 
1408     int pos_idx = params->vertex_position_idx;
1409     if (pos_idx < 0 || pos_idx >= params->num_vertex_attribs) {
1410         PL_ERR(dp, "Vertex position index out of range?");
1411         goto error;
1412     }
1413 
1414     // Attach all of the vertex attributes to the shader manually
1415     sh->vas.num = params->num_vertex_attribs;
1416     PL_ARRAY_RESIZE(sh, sh->vas, sh->vas.num);
1417     for (int i = 0; i < params->num_vertex_attribs; i++)
1418         sh->vas.elem[i].attr = params->vertex_attribs[i];
1419 
1420     // Compute the coordinate projection matrix
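    // The cases below intentionally fall through: absolute coordinates are
    // first scaled down to relative ([0,1]) coordinates, which are then mapped
    // to normalized device coordinates ([-1,1]); flipped vertex coordinates
    // additionally mirror the y axis.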
1421     struct pl_transform2x2 proj = pl_transform2x2_identity;
1422     switch (params->vertex_coords) {
1423     case PL_COORDS_ABSOLUTE:
1424         proj.mat.m[0][0] /= tpars->w;
1425         proj.mat.m[1][1] /= tpars->h;
1426         // fall through
1427     case PL_COORDS_RELATIVE:
1428         proj.mat.m[0][0] *= 2.0;
1429         proj.mat.m[1][1] *= 2.0;
1430         proj.c[0] -= 1.0;
1431         proj.c[1] -= 1.0;
1432         // fall through
1433     case PL_COORDS_NORMALIZED:
1434         if (params->vertex_flipped) {
1435             proj.mat.m[1][1] = -proj.mat.m[1][1];
1436             proj.c[1] += 2.0;
1437         }
1438         break;
1439     }
1440 
1441     ident_t out_proj = NULL;
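    // Only upload a projection matrix if it differs from the identity. The
    // 2x2 transform plus offset is embedded into the top two rows of an
    // affine 3x3 matrix and exposed to the shader as a mat3 variable.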
1442     if (memcmp(&proj, &pl_transform2x2_identity, sizeof(proj)) != 0) {
1443         struct pl_matrix3x3 mat = {{
1444             {proj.mat.m[0][0], proj.mat.m[0][1], proj.c[0]},
1445             {proj.mat.m[1][0], proj.mat.m[1][1], proj.c[1]},
1446             {0.0, 0.0, 1.0},
1447         }};
1448         out_proj = sh_var(sh, (struct pl_shader_var) {
1449             .var = pl_var_mat3("proj"),
1450             .data = PL_TRANSPOSE_3X3(mat.m),
1451         });
1452     }
1453 
1454     ident_t vert_pos = params->vertex_attribs[pos_idx].name;
1455     struct pass *pass = finalize_pass(dp, sh, params->target, vert_pos,
1456                                       params->blend_params, true, params, out_proj);
1457 
1458     // Silently return on failed passes
1459     if (!pass || !pass->pass)
1460         goto error;
1461 
1462     struct pl_pass_run_params *rparams = &pass->run_params;
1463 
1464     // Update the descriptor bindings
1465     for (int i = 0; i < sh->descs.num; i++)
1466         rparams->desc_bindings[i] = sd_binding(sh->descs.elem[i]);
1467 
1468     // Update all of the variables (if needed)
1469     rparams->num_var_updates = 0;
1470     for (int i = 0; i < sh->vars.num; i++)
1471         update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);
1472 
1473     // Update the scissors
1474     rparams->scissors = params->scissors;
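    // A flipped vertex y axis also flips the scissor rect relative to the
    // target, so mirror it vertically here; normalizing afterwards restores
    // the y0 <= y1 invariant.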
1475     if (params->vertex_flipped) {
1476         rparams->scissors.y0 = tpars->h - rparams->scissors.y0;
1477         rparams->scissors.y1 = tpars->h - rparams->scissors.y1;
1478     }
1479     pl_rect2d_normalize(&rparams->scissors);
1480 
1481     // Dispatch the actual shader
1482     rparams->target = params->target;
1483     rparams->vertex_count = params->vertex_count;
1484     rparams->vertex_data = params->vertex_data;
1485     rparams->vertex_buf = params->vertex_buf;
1486     rparams->buf_offset = params->buf_offset;
1487     rparams->index_data = params->index_data;
1488     rparams->index_buf = params->index_buf;
1489     rparams->index_offset = params->index_offset;
1490     rparams->timer = PL_DEF(params->timer, pass->timer);
1491     run_pass(dp, sh, pass);
1492 
1493     ret = true;
1494     // fall through
1495 
1496 error:
1497     // Reset the temporary buffers which we use to build the shader
1498     for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
1499         dp->tmp[i].len = 0;
1500 
1501     pl_mutex_unlock(&dp->lock);
1502     pl_dispatch_abort(dp, params->shader);
1503     return ret;
1504 }
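/* Minimal usage sketch for pl_dispatch_vertex (illustrative only; the vertex
 * format, attribute layout, dimensions and vertex data below are placeholders
 * chosen for the example, not requirements of this API):
 *
 *     struct pl_vertex_attrib attribs[] = {
 *         { .name = "pos", .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2) },
 *     };
 *     struct pl_dispatch_vertex_params params = {
 *         .shader              = &sh,
 *         .target              = target_tex,
 *         .scissors            = { 0, 0, width, height }, // full target
 *         .vertex_attribs      = attribs,
 *         .num_vertex_attribs  = 1,
 *         .vertex_position_idx = 0,
 *         .vertex_coords       = PL_COORDS_RELATIVE,
 *         .vertex_count        = 3,
 *         .vertex_data         = triangle, // 3 vertices matching `attribs`
 *     };
 *     bool ok = pl_dispatch_vertex(dp, &params);
 */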
1505 
1506 void pl_dispatch_abort(pl_dispatch dp, pl_shader *psh)
1507 {
1508     pl_shader sh = *psh;
1509     if (!sh)
1510         return;
1511 
1512     // Reset this as early as possible to free temporary resources
1513     pl_shader_reset(sh, NULL);
1514 
1515     // Re-add the shader to the internal pool of shaders
1516     pl_mutex_lock(&dp->lock);
1517     PL_ARRAY_APPEND(dp, dp->shaders, sh);
1518     pl_mutex_unlock(&dp->lock);
1519     *psh = NULL;
1520 }
1521 
1522 // Shader program cache: serialization format and load/save helpers
1523 static const char cache_magic[] = {'P', 'L', 'D', 'P'};
1524 static const uint32_t cache_version = 1;
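// Serialized layout produced by pl_dispatch_save (fields are tightly packed,
// integers in native byte order):
//
//   char     magic[4]      = {'P', 'L', 'D', 'P'}
//   uint32_t version       = cache_version
//   uint32_t num_passes
//   num_passes times:
//     uint64_t signature
//     uint64_t cached_program_len
//     uint8_t  cached_program[cached_program_len]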
1525 
1526 static void write_buf(uint8_t *buf, size_t *pos, const void *src, size_t size)
1527 {
1528     assert(size);
1529     if (buf)
1530         memcpy(&buf[*pos], src, size);
1531     *pos += size;
1532 }
1533 
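// WRITE appends a single value of `type` to the local `out` buffer (or merely
// measures the required space when `out` is NULL), advancing the local `size`
// accumulator; LOAD reads a value from the local `cache` pointer and advances
// it. Both macros intentionally rely on those names in the enclosing function.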
1534 #define WRITE(type, var) write_buf(out, &size, &(type){ var }, sizeof(type))
1535 #define LOAD(var)                           \
1536   do {                                      \
1537       memcpy(&(var), cache, sizeof(var));   \
1538       cache += sizeof(var);                 \
1539   } while (0)
1540 
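// Serializes every cached shader program into `out`. Passing out == NULL only
// computes the required size, so a typical caller does two passes
// (illustrative sketch, error handling elided):
//
//     size_t len = pl_dispatch_save(dp, NULL);
//     uint8_t *buf = malloc(len);
//     if (buf)
//         pl_dispatch_save(dp, buf);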
1541 size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out)
1542 {
1543     size_t size = 0;
1544     pl_mutex_lock(&dp->lock);
1545 
1546     write_buf(out, &size, cache_magic, sizeof(cache_magic));
1547     WRITE(uint32_t, cache_version);
1548 
1549     // Remember this position so we can go back and write the actual number of
1550     // cached programs
1551     uint32_t num_passes = 0;
1552     void *out_num = out ? &out[size] : NULL;
1553     size += sizeof(num_passes);
1554 
1555     // Save the cached programs for all compiled passes
1556     for (int i = 0; i < dp->passes.num; i++) {
1557         const struct pass *pass = dp->passes.elem[i];
1558         if (!pass->pass)
1559             continue;
1560 
1561         const struct pl_pass_params *params = &pass->pass->params;
1562         if (!params->cached_program_len)
1563             continue;
1564 
1565         if (out) {
1566             PL_DEBUG(dp, "Saving %zu bytes of cached program with signature 0x%llx",
1567                      params->cached_program_len, (unsigned long long) pass->signature);
1568         }
1569 
1570         num_passes++;
1571         WRITE(uint64_t, pass->signature);
1572         WRITE(uint64_t, params->cached_program_len);
1573         write_buf(out, &size, params->cached_program, params->cached_program_len);
1574     }
1575 
1576     // Re-save the cached programs for all previously loaded (but not yet
1577     // compiled) passes. This is simply to make `pl_dispatch_load` followed
1578     // by `pl_dispatch_save` return the same cache as was previously loaded.
1579     for (int i = 0; i < dp->cached_passes.num; i++) {
1580         const struct cached_pass *pass = &dp->cached_passes.elem[i];
1581         if (!pass->cached_program_len)
1582             continue;
1583 
1584         if (out) {
1585             PL_DEBUG(dp, "Saving %zu bytes of cached program with signature 0x%llx",
1586                      pass->cached_program_len, (unsigned long long) pass->signature);
1587         }
1588 
1589         num_passes++;
1590         WRITE(uint64_t, pass->signature);
1591         WRITE(uint64_t, pass->cached_program_len);
1592         write_buf(out, &size, pass->cached_program, pass->cached_program_len);
1593     }
1594 
1595     if (out)
1596         memcpy(out_num, &num_passes, sizeof(num_passes));
1597 
1598     pl_mutex_unlock(&dp->lock);
1599     return size;
1600 }
1601 
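// Parses a cache blob previously produced by pl_dispatch_save. Entries whose
// pass has already been compiled are skipped; all others are stashed in
// dp->cached_passes, keyed by signature, so the cached program can be reused
// when a matching pass is compiled later.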
1602 void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache)
1603 {
1604     char magic[4];
1605     LOAD(magic);
1606     if (memcmp(magic, cache_magic, sizeof(magic)) != 0) {
1607         PL_ERR(dp, "Failed loading dispatch cache: invalid magic bytes");
1608         return;
1609     }
1610 
1611     uint32_t version;
1612     LOAD(version);
1613     if (version != cache_version) {
1614         PL_WARN(dp, "Failed loading dispatch cache: wrong version");
1615         return;
1616     }
1617 
1618     uint32_t num;
1619     LOAD(num);
1620 
1621     pl_mutex_lock(&dp->lock);
1622     for (int i = 0; i < num; i++) {
1623         uint64_t sig, size;
1624         LOAD(sig);
1625         LOAD(size);
1626         if (!size)
1627             continue;
1628 
        // Skip passes that are already compiled. Note that `continue` inside
        // the search loop below would only continue that inner loop, so
        // record the result in a flag and skip the whole entry afterwards.
        bool compiled = false;
        for (int n = 0; n < dp->passes.num && !compiled; n++)
            compiled = dp->passes.elem[n]->signature == sig;

        if (compiled) {
            PL_DEBUG(dp, "Skipping already compiled pass with signature 0x%llx",
                     (unsigned long long) sig);
            cache += size;
            continue;
        }
1638 
1639         // Find a cached_pass entry with this signature, if any
1640         struct cached_pass *pass = NULL;
1641         for (int n = 0; n < dp->cached_passes.num; n++) {
1642             if (dp->cached_passes.elem[n].signature == sig) {
1643                 pass = &dp->cached_passes.elem[n];
1644                 break;
1645             }
1646         }
1647 
1648         if (!pass) {
1649             // None found, add a new entry
1650             PL_ARRAY_GROW(dp, dp->cached_passes);
1651             pass = &dp->cached_passes.elem[dp->cached_passes.num++];
1652             *pass = (struct cached_pass) { .signature = sig };
1653         }
1654 
1655         PL_DEBUG(dp, "Loading %zu bytes of cached program with signature 0x%llx",
1656                  (size_t) size, (unsigned long long) sig);
1657 
1658         pl_free((void *) pass->cached_program);
1659         pass->cached_program = pl_memdup(dp, cache, size);
1660         pass->cached_program_len = size;
1661         cache += size;
1662     }
1663     pl_mutex_unlock(&dp->lock);
1664 }
1665