/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "common.h"
#include "log.h"
#include "shaders.h"
#include "dispatch.h"
#include "gpu.h"
#include "pl_thread.h"

// Maximum number of passes to keep around at once. If full, passes older than
// MIN_AGE are evicted to make room. (Failing that, the cache size doubles)
#define MAX_PASSES 100
#define MIN_AGE 10

enum {
    TMP_PRELUDE,   // GLSL version, global definitions, etc.
    TMP_MAIN,      // main GLSL shader body
    TMP_VERT_HEAD, // vertex shader inputs/outputs
    TMP_VERT_BODY, // vertex shader body
    TMP_COUNT,
};

struct pl_dispatch {
    pl_mutex lock;
    pl_log log;
    pl_gpu gpu;
    uint8_t current_ident;
    uint8_t current_index;
    bool dynamic_constants;
    int max_passes;

    void (*info_callback)(void *, const struct pl_dispatch_info *);
    void *info_priv;

    PL_ARRAY(pl_shader) shaders;                // to avoid re-allocations
    PL_ARRAY(struct pass *) passes;             // compiled passes
    PL_ARRAY(struct cached_pass) cached_passes; // not-yet-compiled passes

    // temporary buffers to help avoid re-allocations during pass creation
    pl_str tmp[TMP_COUNT];
};

enum pass_var_type {
    PASS_VAR_NONE = 0,
    PASS_VAR_GLOBAL, // regular/global uniforms
    PASS_VAR_UBO,    // uniform buffers
    PASS_VAR_PUSHC   // push constants
};

// Cached metadata about a variable's effective placement / update method
struct pass_var {
    int index; // for pl_var_update
    enum pass_var_type type;
    struct pl_var_layout layout;
    void *cached_data;
};

struct pass {
    uint64_t signature; // as returned by pl_shader_signature
    pl_pass pass;
    int last_index;

    // contains cached data and update metadata, same order as pl_shader
    struct pass_var *vars;

    // for uniform buffer updates
    struct pl_shader_desc ubo_desc; // temporary
    int ubo_index;
    pl_buf ubo;

    // Cached pl_pass_run_params. This will also contain mutable allocations
    // for the push constants, descriptor bindings (including the binding for
    // the UBO pre-filled), vertex array and variable updates
    struct pl_pass_run_params run_params;

    // for pl_dispatch_info
    pl_timer timer;
    uint64_t ts_last;
    uint64_t ts_peak;
    uint64_t ts_sum;
    uint64_t samples[PL_ARRAY_SIZE(((struct pl_dispatch_info *) NULL)->samples)];
    int ts_idx;
};

struct cached_pass {
    uint64_t signature;
    const uint8_t *cached_program;
    size_t cached_program_len;
};

static void pass_destroy(pl_dispatch dp, struct pass *pass)
{
    if (!pass)
        return;

    pl_buf_destroy(dp->gpu, &pass->ubo);
    pl_pass_destroy(dp->gpu, &pass->pass);
    pl_timer_destroy(dp->gpu, &pass->timer);
    pl_free(pass);
}

pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu)
{
    struct pl_dispatch *dp = pl_zalloc_ptr(NULL, dp);
    pl_mutex_init(&dp->lock);
    dp->log = log;
    dp->gpu = gpu;
    dp->max_passes = MAX_PASSES;

    return dp;
}

void pl_dispatch_destroy(pl_dispatch *ptr)
{
    pl_dispatch dp = *ptr;
    if (!dp)
        return;

    for (int i = 0; i < dp->passes.num; i++)
        pass_destroy(dp, dp->passes.elem[i]);
    for (int i = 0; i < dp->shaders.num; i++)
        pl_shader_free(&dp->shaders.elem[i]);

    pl_mutex_destroy(&dp->lock);
    pl_free(dp);
    *ptr = NULL;
}

pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique)
{
    pl_mutex_lock(&dp->lock);

    struct pl_shader_params params = {
        .id = unique ? dp->current_ident++ : 0,
        .gpu = dp->gpu,
        .index = dp->current_index,
        .dynamic_constants = dp->dynamic_constants,
    };

    pl_shader sh = NULL;
    PL_ARRAY_POP(dp->shaders, &sh);
    pl_mutex_unlock(&dp->lock);

    if (sh) {
        sh->res.params = params;
        return sh;
    }

    return pl_shader_alloc(dp->log, &params);
}

void pl_dispatch_reset_frame(pl_dispatch dp)
{
    pl_mutex_lock(&dp->lock);
    dp->current_ident = 0;
    dp->current_index++;
    pl_mutex_unlock(&dp->lock);
}

void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic)
{
    dp->dynamic_constants = dynamic;
}

void pl_dispatch_callback(pl_dispatch dp, void *priv,
                          void (*cb)(void *priv, const struct pl_dispatch_info *))
{
    dp->info_callback = cb;
    dp->info_priv = priv;
}

pl_shader pl_dispatch_begin(pl_dispatch dp)
{
    return pl_dispatch_begin_ex(dp, false);
}
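
// Illustrative sketch of the intended call sequence (not part of the library
// source; `log`, `gpu` and `target` are placeholders, and the comment in the
// middle stands in for any shader-building code):
//
//     pl_dispatch dp = pl_dispatch_create(log, gpu);
//     pl_shader sh = pl_dispatch_begin(dp);
//     // ... append operations to `sh` via the pl_shader_* APIs ...
//     bool ok = pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
//         .shader = &sh,
//         .target = target, // a renderable 2D pl_tex
//     });
//     pl_dispatch_destroy(&dp);
//
// Whether or not the dispatch succeeds, `sh` is reset and recycled into the
// dispatch object's internal shader pool (see pl_dispatch_abort below).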

static bool add_pass_var(pl_dispatch dp, void *tmp, struct pass *pass,
                         struct pl_pass_params *params,
                         const struct pl_shader_var *sv, struct pass_var *pv,
                         bool greedy)
{
    pl_gpu gpu = dp->gpu;
    if (pv->type)
        return true;

    // Try not to use push constants for "large" values like matrices in the
    // first pass, since this is likely to exceed the VGPR/pushc size budgets
    bool try_pushc = greedy || (sv->var.dim_m == 1 && sv->var.dim_a == 1) || sv->dynamic;
    if (try_pushc && gpu->glsl.vulkan && gpu->limits.max_pushc_size) {
        pv->layout = pl_std430_layout(params->push_constants_size, &sv->var);
        size_t new_size = pv->layout.offset + pv->layout.size;
        if (new_size <= gpu->limits.max_pushc_size) {
            params->push_constants_size = new_size;
            pv->type = PASS_VAR_PUSHC;
            return true;
        }
    }

    // If we haven't placed all PCs yet, don't place anything else, since
    // we want to try and fit more stuff into PCs before "giving up"
    if (!greedy)
        return true;

    // Attempt using a uniform buffer next. The GLSL version 440 check is due
    // to the use of explicit offsets on UBO entries. In theory we could leave
    // out the offsets and support UBOs for older GL as well, but this is a
    // nice safety net for driver bugs (and also rules out potentially buggy
    // drivers). Also avoid UBOs for highly dynamic stuff, since that requires
    // synchronizing the UBO writes every frame
    bool try_ubo = params->num_variables == gpu->limits.max_variables || !sv->dynamic;
    if (try_ubo && gpu->glsl.version >= 440 && gpu->limits.max_ubo_size) {
        if (sh_buf_desc_append(tmp, gpu, &pass->ubo_desc, &pv->layout, sv->var)) {
            pv->type = PASS_VAR_UBO;
            return true;
        }
    }

    // Otherwise, use global uniforms
    if (params->num_variables < gpu->limits.max_variables) {
        pv->type = PASS_VAR_GLOBAL;
        pv->index = params->num_variables;
        pv->layout = pl_var_host_layout(0, &sv->var);
        PL_ARRAY_APPEND_RAW(tmp, params->variables, params->num_variables, sv->var);
        return true;
    }

    // Ran out of variable binding methods. The most likely scenario in which
    // this can happen is if we're using a GPU that does not support global
    // input vars and we've exhausted the UBO size limits.
    PL_ERR(dp, "Unable to add input variable '%s': possibly exhausted "
           "variable count / UBO size limits?", sv->var.name);
    return false;
}
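
// Worked example of the two-round placement above (illustrative, assuming a
// Vulkan GPU with the spec-minimum 128 byte push constant budget): in the
// non-greedy round only scalars/vectors and dynamic variables compete for
// push constants, so a dynamic vec2 (8 bytes, std430) is placed there while
// a non-dynamic mat4 is deferred entirely. In the greedy round, the mat4
// (64 bytes) is retried against the remaining push constant space, and
// otherwise falls back to the UBO (GLSL >= 440) or, failing that, to a
// global uniform.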

#define ADD(x, ...) pl_str_append_asprintf_c(dp, (x), __VA_ARGS__)
#define ADD_STR(x, s) pl_str_append(dp, (x), (s))

static void add_var(pl_dispatch dp, pl_str *body, const struct pl_var *var)
{
    ADD(body, "%s %s", pl_var_glsl_type_name(*var), var->name);

    if (var->dim_a > 1) {
        ADD(body, "[%d];\n", var->dim_a);
    } else {
        ADD(body, ";\n");
    }
}

static int cmp_buffer_var(const void *pa, const void *pb)
{
    const struct pl_buffer_var * const *a = pa, * const *b = pb;
    return PL_CMP((*a)->layout.offset, (*b)->layout.offset);
}

static void add_buffer_vars(pl_dispatch dp, void *tmp, pl_str *body,
                            const struct pl_buffer_var *vars, int num)
{
    // Sort buffer vars
    const struct pl_buffer_var **sorted_vars = pl_calloc_ptr(tmp, num, sorted_vars);
    for (int i = 0; i < num; i++)
        sorted_vars[i] = &vars[i];
    qsort(sorted_vars, num, sizeof(sorted_vars[0]), cmp_buffer_var);

    ADD(body, "{\n");
    for (int i = 0; i < num; i++) {
        // Add an explicit offset wherever possible
        if (dp->gpu->glsl.version >= 440)
            ADD(body, "    layout(offset=%zu) ", sorted_vars[i]->layout.offset);
        add_var(dp, body, &sorted_vars[i]->var);
    }
    ADD(body, "};\n");
}
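
// Combined with the header emitted by the caller, this produces GLSL along
// these lines (illustrative names and std140 offsets, not verbatim output):
//
//     layout(std140) uniform UBO {
//         layout(offset=0) mat3 cms;
//         layout(offset=48) float alpha;
//     };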

static ident_t sh_var_from_va(pl_shader sh, const char *name,
                              const struct pl_vertex_attrib *va,
                              const void *data)
{
    return sh_var(sh, (struct pl_shader_var) {
        .var = pl_var_from_fmt(va->fmt, name),
        .data = data,
    });
}

static inline struct pl_desc_binding sd_binding(const struct pl_shader_desc sd)
{
    // For backwards compatibility with the deprecated field sd.object
    struct pl_desc_binding binding = sd.binding;
    binding.object = PL_DEF(binding.object, sd.object);
    return binding;
}

static void generate_shaders(pl_dispatch dp, void *tmp, struct pass *pass,
                             struct pl_pass_params *params, pl_shader sh,
                             ident_t vert_pos, ident_t out_proj)
{
    pl_gpu gpu = dp->gpu;
    const struct pl_shader_res *res = pl_shader_finalize(sh);

    pl_str *pre = &dp->tmp[TMP_PRELUDE];
    ADD(pre, "#version %d%s\n", gpu->glsl.version,
        (gpu->glsl.gles && gpu->glsl.version > 100) ? " es" : "");
    if (params->type == PL_PASS_COMPUTE)
        ADD(pre, "#extension GL_ARB_compute_shader : enable\n");

    // Enable this unconditionally if the GPU supports it, since we have no way
    // of knowing whether subgroups are being used or not
    if (gpu->glsl.subgroup_size) {
        ADD(pre, "#extension GL_KHR_shader_subgroup_basic : enable \n"
                 "#extension GL_KHR_shader_subgroup_vote : enable \n"
                 "#extension GL_KHR_shader_subgroup_arithmetic : enable \n"
                 "#extension GL_KHR_shader_subgroup_ballot : enable \n"
                 "#extension GL_KHR_shader_subgroup_shuffle : enable \n");
    }

    // Enable all extensions needed for different types of input
    bool has_ssbo = false, has_ubo = false, has_img = false, has_texel = false,
         has_ext = false, has_nofmt = false, has_gather = false;
    for (int i = 0; i < sh->descs.num; i++) {
        switch (sh->descs.elem[i].desc.type) {
        case PL_DESC_BUF_UNIFORM: has_ubo = true; break;
        case PL_DESC_BUF_STORAGE: has_ssbo = true; break;
        case PL_DESC_BUF_TEXEL_UNIFORM: has_texel = true; break;
        case PL_DESC_BUF_TEXEL_STORAGE: {
            pl_buf buf = sd_binding(res->descriptors[i]).object;
            has_nofmt |= !buf->params.format->glsl_format;
            has_texel = true;
            break;
        }
        case PL_DESC_STORAGE_IMG: {
            pl_tex tex = sd_binding(res->descriptors[i]).object;
            has_nofmt |= !tex->params.format->glsl_format;
            has_img = true;
            break;
        }
        case PL_DESC_SAMPLED_TEX: {
            pl_tex tex = sd_binding(res->descriptors[i]).object;
            has_gather |= tex->params.format->gatherable;
            switch (tex->sampler_type) {
            case PL_SAMPLER_NORMAL: break;
            case PL_SAMPLER_RECT: break;
            case PL_SAMPLER_EXTERNAL: has_ext = true; break;
            case PL_SAMPLER_TYPE_COUNT: pl_unreachable();
            }
            break;
        }

        case PL_DESC_INVALID:
        case PL_DESC_TYPE_COUNT:
            pl_unreachable();
        }
    }

    if (has_img)
        ADD(pre, "#extension GL_ARB_shader_image_load_store : enable\n");
    if (has_ubo)
        ADD(pre, "#extension GL_ARB_uniform_buffer_object : enable\n");
    if (has_ssbo)
        ADD(pre, "#extension GL_ARB_shader_storage_buffer_object : enable\n");
    if (has_texel)
        ADD(pre, "#extension GL_ARB_texture_buffer_object : enable\n");
    if (has_ext)
        ADD(pre, "#extension GL_OES_EGL_image_external : enable\n");
    if (has_nofmt)
        ADD(pre, "#extension GL_EXT_shader_image_load_formatted : enable\n");
    if (has_gather)
        ADD(pre, "#extension GL_ARB_texture_gather : enable\n");

    if (gpu->glsl.gles) {
        // Use 32-bit precision for floats if possible
        ADD(pre, "#ifdef GL_FRAGMENT_PRECISION_HIGH \n"
                 "precision highp float; \n"
                 "#else \n"
                 "precision mediump float; \n"
                 "#endif \n");

        // Always use 16-bit precision for samplers
        ADD(pre, "precision mediump sampler2D; \n");
        if (gpu->limits.max_tex_1d_dim)
            ADD(pre, "precision mediump sampler1D; \n");
        if (gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100)
            ADD(pre, "precision mediump sampler3D; \n");
    }

    // Add all of the push constants as their own element
    if (params->push_constants_size) {
        // We re-use add_buffer_vars to make sure variables are sorted; this
        // is important because the push constants can be out-of-order in
        // `pass->vars`
        PL_ARRAY(struct pl_buffer_var) pc_bvars = {0};
        for (int i = 0; i < res->num_variables; i++) {
            if (pass->vars[i].type != PASS_VAR_PUSHC)
                continue;

            PL_ARRAY_APPEND(tmp, pc_bvars, (struct pl_buffer_var) {
                .var = res->variables[i].var,
                .layout = pass->vars[i].layout,
            });
        }

        ADD(pre, "layout(std430, push_constant) uniform PushC ");
        add_buffer_vars(dp, tmp, pre, pc_bvars.elem, pc_bvars.num);
    }

    // Add all of the specialization constants
    for (int i = 0; i < res->num_constants; i++) {
        static const char *types[PL_VAR_TYPE_COUNT] = {
            [PL_VAR_SINT]  = "int",
            [PL_VAR_UINT]  = "uint",
            [PL_VAR_FLOAT] = "float",
        };

        const struct pl_shader_const *sc = &res->constants[i];
        ADD(pre, "layout(constant_id=%"PRIu32") const %s %s = 0; \n",
            params->constants[i].id, types[sc->type], sc->name);
    }

    // Add all of the required descriptors
    for (int i = 0; i < res->num_descriptors; i++) {
        const struct pl_shader_desc *sd = &res->descriptors[i];
        const struct pl_desc *desc = &params->descriptors[i];

        switch (desc->type) {
        case PL_DESC_SAMPLED_TEX: {
            static const char *types[][4] = {
                [PL_SAMPLER_NORMAL][1]   = "sampler1D",
                [PL_SAMPLER_NORMAL][2]   = "sampler2D",
                [PL_SAMPLER_NORMAL][3]   = "sampler3D",
                [PL_SAMPLER_RECT][2]     = "sampler2DRect",
                [PL_SAMPLER_EXTERNAL][2] = "samplerExternalOES",
            };

            pl_tex tex = sd_binding(*sd).object;
            int dims = pl_tex_params_dimension(tex->params);
            const char *type = types[tex->sampler_type][dims];
            pl_assert(type);

            static const char prefixes[PL_FMT_TYPE_COUNT] = {
                [PL_FMT_FLOAT] = ' ',
                [PL_FMT_UNORM] = ' ',
                [PL_FMT_SNORM] = ' ',
                [PL_FMT_UINT]  = 'u',
                [PL_FMT_SINT]  = 'i',
            };

            char prefix = prefixes[tex->params.format->type];
            pl_assert(prefix);

            const char *prec = "";
            if (prefix != ' ' && gpu->glsl.gles)
                prec = "highp ";

            // Vulkan requires explicit bindings; GL always sets the
            // bindings manually to avoid relying on the user doing so
            if (gpu->glsl.vulkan)
                ADD(pre, "layout(binding=%d) ", desc->binding);

            pl_assert(type && prefix);
            ADD(pre, "uniform %s%c%s %s;\n", prec, prefix, type, desc->name);
            break;
        }

        case PL_DESC_STORAGE_IMG: {
            static const char *types[] = {
                [1] = "image1D",
                [2] = "image2D",
                [3] = "image3D",
            };

            // For better compatibility, we have to explicitly label the
            // type of data we will be reading/writing to this image.
            pl_tex tex = sd_binding(*sd).object;
            const char *format = tex->params.format->glsl_format;
            const char *access = pl_desc_access_glsl_name(desc->access);
            int dims = pl_tex_params_dimension(tex->params);
            if (gpu->glsl.vulkan) {
                if (format) {
                    ADD(pre, "layout(binding=%d, %s) ", desc->binding, format);
                } else {
                    ADD(pre, "layout(binding=%d) ", desc->binding);
                }
            } else if (gpu->glsl.version >= 130 && format) {
                ADD(pre, "layout(%s) ", format);
            }

            ADD(pre, "%s%s%s restrict uniform %s %s;\n", access,
                (sd->memory & PL_MEMORY_COHERENT) ? " coherent" : "",
                (sd->memory & PL_MEMORY_VOLATILE) ? " volatile" : "",
                types[dims], desc->name);
            break;
        }

        case PL_DESC_BUF_UNIFORM:
            if (gpu->glsl.vulkan) {
                ADD(pre, "layout(std140, binding=%d) ", desc->binding);
            } else {
                ADD(pre, "layout(std140) ");
            }
            ADD(pre, "uniform %s ", desc->name);
            add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars);
            break;

        case PL_DESC_BUF_STORAGE:
            if (gpu->glsl.vulkan) {
                ADD(pre, "layout(std430, binding=%d) ", desc->binding);
            } else if (gpu->glsl.version >= 140) {
                ADD(pre, "layout(std430) ");
            }
            ADD(pre, "%s%s%s restrict buffer %s ",
                pl_desc_access_glsl_name(desc->access),
                (sd->memory & PL_MEMORY_COHERENT) ? " coherent" : "",
                (sd->memory & PL_MEMORY_VOLATILE) ? " volatile" : "",
                desc->name);
            add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars);
            break;

        case PL_DESC_BUF_TEXEL_UNIFORM:
            if (gpu->glsl.vulkan)
                ADD(pre, "layout(binding=%d) ", desc->binding);
            ADD(pre, "uniform samplerBuffer %s;\n", desc->name);
            break;

        case PL_DESC_BUF_TEXEL_STORAGE: {
            pl_buf buf = sd_binding(*sd).object;
            const char *format = buf->params.format->glsl_format;
            const char *access = pl_desc_access_glsl_name(desc->access);
            if (gpu->glsl.vulkan) {
                if (format) {
                    ADD(pre, "layout(binding=%d, %s) ", desc->binding, format);
                } else {
                    ADD(pre, "layout(binding=%d) ", desc->binding);
                }
            } else if (format) {
                ADD(pre, "layout(%s) ", format);
            }

            ADD(pre, "%s%s%s restrict uniform imageBuffer %s;\n", access,
                (sd->memory & PL_MEMORY_COHERENT) ? " coherent" : "",
                (sd->memory & PL_MEMORY_VOLATILE) ? " volatile" : "",
                desc->name);
            break;
        }

        case PL_DESC_INVALID:
        case PL_DESC_TYPE_COUNT:
            pl_unreachable();
        }
    }

    // Add all of the remaining variables
    for (int i = 0; i < res->num_variables; i++) {
        const struct pl_var *var = &res->variables[i].var;
        const struct pass_var *pv = &pass->vars[i];
        if (pv->type != PASS_VAR_GLOBAL)
            continue;
        ADD(pre, "uniform ");
        add_var(dp, pre, var);
    }

    char *vert_in  = gpu->glsl.version >= 130 ? "in" : "attribute";
    char *vert_out = gpu->glsl.version >= 130 ? "out" : "varying";
    char *frag_in  = gpu->glsl.version >= 130 ? "in" : "varying";

    pl_str *glsl = &dp->tmp[TMP_MAIN];
    ADD_STR(glsl, *pre);

    const char *out_color = "gl_FragColor";
    switch (params->type) {
    case PL_PASS_RASTER: {
        pl_assert(vert_pos);
        pl_str *vert_head = &dp->tmp[TMP_VERT_HEAD];
        pl_str *vert_body = &dp->tmp[TMP_VERT_BODY];

        // Set up a trivial vertex shader
        ADD_STR(vert_head, *pre);
        ADD(vert_body, "void main() {\n");
        for (int i = 0; i < sh->vas.num; i++) {
            const struct pl_vertex_attrib *va = &params->vertex_attribs[i];
            const struct pl_shader_va *sva = &sh->vas.elem[i];
            const char *type = va->fmt->glsl_type;

            // Use the pl_shader_va for the name in the fragment shader since
            // the pl_vertex_attrib is already mangled for the vertex shader
            const char *name = sva->attr.name;

            char loc[32];
            snprintf(loc, sizeof(loc), "layout(location=%d)", va->location);
            // Older GLSL doesn't support the use of explicit locations
            if (gpu->glsl.version < 430)
                loc[0] = '\0';
            ADD(vert_head, "%s %s %s %s;\n", loc, vert_in, type, va->name);

            if (strcmp(name, vert_pos) == 0) {
                pl_assert(va->fmt->num_components == 2);
                if (out_proj) {
                    ADD(vert_body, "gl_Position = vec4((%s * vec3(%s, 1.0)).xy, 0.0, 1.0); \n",
                        out_proj, va->name);
                } else {
                    ADD(vert_body, "gl_Position = vec4(%s, 0.0, 1.0);\n", va->name);
                }
            } else {
                // Everything else is just blindly passed through
                ADD(vert_head, "%s %s %s %s;\n", loc, vert_out, type, name);
                ADD(vert_body, "%s = %s;\n", name, va->name);
                ADD(glsl, "%s %s %s %s;\n", loc, frag_in, type, name);
            }
        }

        ADD(vert_body, "}");
        ADD_STR(vert_head, *vert_body);
        params->vertex_shader = vert_head->buf;
        pl_hash_merge(&pass->signature, pl_str_hash(*vert_head));

        // GLSL 130+ doesn't use the magic gl_FragColor
        if (gpu->glsl.version >= 130) {
            out_color = "out_color";
            ADD(glsl, "%s out vec4 %s;\n",
                gpu->glsl.version >= 430 ? "layout(location=0) " : "",
                out_color);
        }
        break;
    }
    case PL_PASS_COMPUTE:
        ADD(glsl, "layout (local_size_x = %d, local_size_y = %d) in;\n",
            res->compute_group_size[0], res->compute_group_size[1]);
        break;
    case PL_PASS_INVALID:
    case PL_PASS_TYPE_COUNT:
        pl_unreachable();
    }

    // Set up the main shader body
    ADD(glsl, "%s", res->glsl);
    ADD(glsl, "void main() {\n");

    pl_assert(res->input == PL_SHADER_SIG_NONE);
    switch (params->type) {
    case PL_PASS_RASTER:
        pl_assert(res->output == PL_SHADER_SIG_COLOR);
        ADD(glsl, "%s = %s();\n", out_color, res->name);
        break;
    case PL_PASS_COMPUTE:
        ADD(glsl, "%s();\n", res->name);
        break;
    case PL_PASS_INVALID:
    case PL_PASS_TYPE_COUNT:
        pl_unreachable();
    }

    ADD(glsl, "}");
    params->glsl_shader = glsl->buf;
    pl_hash_merge(&pass->signature, pl_str_hash(*glsl));
}
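
// Rough shape of the generated sources for a trivial raster pass (version,
// identifiers and body are illustrative, not verbatim output):
//
//     // vertex shader (TMP_PRELUDE + TMP_VERT_HEAD + TMP_VERT_BODY)
//     #version 450
//     layout(location=0) in vec2 position_v;
//     void main() {
//         gl_Position = vec4(position_v, 0.0, 1.0);
//     }
//
//     // fragment shader (TMP_PRELUDE + TMP_MAIN)
//     #version 450
//     layout(location=0) out vec4 out_color;
//     vec4 color_fn_0() { /* ... res->glsl ... */ }
//     void main() {
//         out_color = color_fn_0();
//     }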

#undef ADD
#undef ADD_STR

#define pass_age(pass) (dp->current_index - (pass)->last_index)

static int cmp_pass_age(const void *ptra, const void *ptrb)
{
    const struct pass *a = *(const struct pass **) ptra;
    const struct pass *b = *(const struct pass **) ptrb;
    return b->last_index - a->last_index;
}

static void garbage_collect_passes(pl_dispatch dp)
{
    if (dp->passes.num <= dp->max_passes)
        return;

    // Garbage collect oldest passes, starting at the middle
    qsort(dp->passes.elem, dp->passes.num, sizeof(struct pass *), cmp_pass_age);
    int idx = dp->passes.num / 2;
    while (idx < dp->passes.num && pass_age(dp->passes.elem[idx]) < MIN_AGE)
        idx++;

    for (int i = idx; i < dp->passes.num; i++)
        pass_destroy(dp, dp->passes.elem[i]);

    int num_evicted = dp->passes.num - idx;
    dp->passes.num = idx;

    if (num_evicted) {
        PL_DEBUG(dp, "Evicted %d passes from dispatch cache, consider "
                 "using more dynamic shaders", num_evicted);
    } else {
        dp->max_passes *= 2;
    }
}
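
// Worked example (illustrative): with max_passes = 100 and 101 live passes,
// the array is sorted newest-first and scanning starts at index 50; only the
// tail entries whose age (current_index - last_index) is at least MIN_AGE are
// destroyed. If even the older half was used within the last 10 frames,
// nothing is evicted and max_passes doubles to 200 instead.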

static struct pass *finalize_pass(pl_dispatch dp, pl_shader sh,
                                  pl_tex target, ident_t vert_pos,
                                  const struct pl_blend_params *blend, bool load,
                                  const struct pl_dispatch_vertex_params *vparams,
                                  ident_t out_proj)
{
    struct pass *pass = pl_alloc_ptr(dp, pass);
    *pass = (struct pass) {
        .signature = 0x0, // updated incrementally below
        .last_index = dp->current_index,
        .ubo_desc = {
            .desc = {
                .name = "UBO",
                .type = PL_DESC_BUF_UNIFORM,
            },
        },
    };

    // For identifiers tied to the lifetime of this shader
    void *tmp = SH_TMP(sh);

    struct pl_pass_params params = {
        .type = pl_shader_is_compute(sh) ? PL_PASS_COMPUTE : PL_PASS_RASTER,
        .num_descriptors = sh->descs.num,
        .vertex_type = vparams ? vparams->vertex_type : PL_PRIM_TRIANGLE_STRIP,
        .vertex_stride = vparams ? vparams->vertex_stride : 0,
        .blend_params = blend,
    };

    if (params.type == PL_PASS_RASTER) {
        assert(target);
        params.target_dummy = *target;
        params.load_target = load;

        // Fill in the vertex attributes array
        params.num_vertex_attribs = sh->vas.num;
        params.vertex_attribs = pl_calloc_ptr(tmp, sh->vas.num, params.vertex_attribs);

        int va_loc = 0;
        for (int i = 0; i < sh->vas.num; i++) {
            struct pl_vertex_attrib *va = &params.vertex_attribs[i];
            *va = sh->vas.elem[i].attr;

            // Mangle the name to make sure it doesn't conflict with the
            // fragment shader input
            va->name = pl_asprintf(tmp, "%s_v", va->name);

            // Place the vertex attribute
            va->location = va_loc;
            if (!vparams) {
                va->offset = params.vertex_stride;
                params.vertex_stride += va->fmt->texel_size;
            }

            // The number of vertex attribute locations consumed by a vertex
            // attribute is the number of vec4s it consumes, rounded up
            const size_t va_loc_size = sizeof(float[4]);
            va_loc += (va->fmt->texel_size + va_loc_size - 1) / va_loc_size;
        }

        // Hash in the raster state configuration
        pl_hash_merge(&pass->signature, (uint64_t) params.vertex_type);
        pl_hash_merge(&pass->signature, (uint64_t) params.vertex_stride);
        pl_hash_merge(&pass->signature, (uint64_t) params.load_target);
        pl_hash_merge(&pass->signature, (uintptr_t) target->params.format);
        if (blend)
            pl_hash_merge(&pass->signature, pl_mem_hash(blend, sizeof(*blend)));
    }

    // Place all of the compile-time constants
    uint8_t *constant_data = NULL;
    if (sh->consts.num) {
        params.num_constants = sh->consts.num;
        params.constants = pl_alloc(tmp, sh->consts.num * sizeof(struct pl_constant));

        // Compute offsets
        size_t total_size = 0;
        uint32_t const_id = 0;
        for (int i = 0; i < sh->consts.num; i++) {
            params.constants[i] = (struct pl_constant) {
                .type = sh->consts.elem[i].type,
                .id = const_id++,
                .offset = total_size,
            };
            total_size += pl_var_type_size(sh->consts.elem[i].type);
        }

        // Write values into the constants buffer
        params.constant_data = constant_data = pl_alloc(pass, total_size);
        for (int i = 0; i < sh->consts.num; i++) {
            const struct pl_shader_const *sc = &sh->consts.elem[i];
            void *data = constant_data + params.constants[i].offset;
            memcpy(data, sc->data, pl_var_type_size(sc->type));
        }
    }

    // Place all the variables; these will dynamically end up in different
    // locations based on what the underlying GPU supports (UBOs, pushc, etc.)
    //
    // We go through the list twice, once to place stuff that we definitely
    // want inside PCs, and then a second time to opportunistically place the rest.
    pass->vars = pl_calloc_ptr(pass, sh->vars.num, pass->vars);
    for (int i = 0; i < sh->vars.num; i++) {
        if (!add_pass_var(dp, tmp, pass, &params, &sh->vars.elem[i], &pass->vars[i], false))
            goto error;
    }
    for (int i = 0; i < sh->vars.num; i++) {
        if (!add_pass_var(dp, tmp, pass, &params, &sh->vars.elem[i], &pass->vars[i], true))
            goto error;
    }

    // Now that we know the variable placement, finalize pushc/UBO sizes
    params.push_constants_size = PL_ALIGN2(params.push_constants_size, 4);
    size_t ubo_size = sh_buf_desc_size(&pass->ubo_desc);
    if (ubo_size) {
        pass->ubo_index = sh->descs.num;
        sh_desc(sh, pass->ubo_desc);
    }

    // Place and fill in the descriptors
    const int num_descs = sh->descs.num;
    int binding[PL_DESC_TYPE_COUNT] = {0};
    params.num_descriptors = num_descs;
    params.descriptors = pl_calloc_ptr(tmp, num_descs, params.descriptors);
    for (int i = 0; i < num_descs; i++) {
        struct pl_desc *desc = &params.descriptors[i];
        *desc = sh->descs.elem[i].desc;
        desc->binding = binding[pl_desc_namespace(dp->gpu, desc->type)]++;
    }

    // Finalize the shader and look it up in the pass cache
    generate_shaders(dp, tmp, pass, &params, sh, vert_pos, out_proj);
    for (int i = 0; i < dp->passes.num; i++) {
        struct pass *p = dp->passes.elem[i];
        if (p->signature != pass->signature)
            continue;

        // Found existing shader, re-use directly
        if (p->ubo)
            sh->descs.elem[p->ubo_index].binding.object = p->ubo;
        pl_free(p->run_params.constant_data);
        p->run_params.constant_data = pl_steal(p, constant_data);
        p->last_index = dp->current_index;
        pl_free(pass);
        return p;
    }

    // Find and attach the cached program, if any
    for (int i = 0; i < dp->cached_passes.num; i++) {
        if (dp->cached_passes.elem[i].signature == pass->signature) {
            PL_DEBUG(dp, "Re-using cached program with signature 0x%llx",
                     (unsigned long long) pass->signature);

            params.cached_program = dp->cached_passes.elem[i].cached_program;
            params.cached_program_len = dp->cached_passes.elem[i].cached_program_len;
            PL_ARRAY_REMOVE_AT(dp->cached_passes, i);
            break;
        }
    }

    pass->pass = pl_pass_create(dp->gpu, &params);
    if (!pass->pass) {
        PL_ERR(dp, "Failed creating render pass for dispatch");
        // Add it anyway
    }

    struct pl_pass_run_params *rparams = &pass->run_params;
    rparams->pass = pass->pass;
    rparams->constant_data = constant_data;
    rparams->push_constants = pl_zalloc(pass, params.push_constants_size);
    rparams->desc_bindings = pl_calloc_ptr(pass, params.num_descriptors,
                                           rparams->desc_bindings);

    if (ubo_size && pass->pass) {
        // Create the UBO
        pass->ubo = pl_buf_create(dp->gpu, &(struct pl_buf_params) {
            .size = ubo_size,
            .uniform = true,
            .host_writable = true,
        });

        if (!pass->ubo) {
            PL_ERR(dp, "Failed creating uniform buffer for dispatch");
            goto error;
        }

        sh->descs.elem[pass->ubo_index].binding.object = pass->ubo;
    }

    if (params.type == PL_PASS_RASTER && !vparams) {
        // Generate the vertex array placeholder
        rparams->vertex_count = 4; // single quad
        size_t vert_size = rparams->vertex_count * params.vertex_stride;
        rparams->vertex_data = pl_zalloc(pass, vert_size);
    }

    pass->timer = pl_timer_create(dp->gpu);

    garbage_collect_passes(dp);
    PL_ARRAY_APPEND(dp, dp->passes, pass);
    return pass;

error:
    pass_destroy(dp, pass);
    return NULL;
}

static void update_pass_var(pl_dispatch dp, struct pass *pass,
                            const struct pl_shader_var *sv, struct pass_var *pv)
{
    struct pl_var_layout host_layout = pl_var_host_layout(0, &sv->var);
    pl_assert(host_layout.size);

    // Use the cache to skip updates if possible
    if (pv->cached_data && !memcmp(sv->data, pv->cached_data, host_layout.size))
        return;
    if (!pv->cached_data)
        pv->cached_data = pl_alloc(pass, host_layout.size);
    memcpy(pv->cached_data, sv->data, host_layout.size);

    struct pl_pass_run_params *rparams = &pass->run_params;
    switch (pv->type) {
    case PASS_VAR_NONE:
        pl_unreachable();
    case PASS_VAR_GLOBAL: {
        struct pl_var_update vu = {
            .index = pv->index,
            .data = sv->data,
        };
        PL_ARRAY_APPEND_RAW(pass, rparams->var_updates, rparams->num_var_updates, vu);
        break;
    }
    case PASS_VAR_UBO: {
        pl_assert(pass->ubo);
        const size_t offset = pv->layout.offset;
        if (host_layout.stride == pv->layout.stride) {
            pl_assert(host_layout.size == pv->layout.size);
            pl_buf_write(dp->gpu, pass->ubo, offset, sv->data, host_layout.size);
        } else {
            // Coalesce strided UBO write into a single pl_buf_write to avoid
            // unnecessary synchronization overhead by assembling the correctly
            // strided upload in RAM
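            // (e.g. a host-side vec3[N] array is tightly packed with stride
            // 12, whereas std140 pads each array element to a stride of 16)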
            pl_grow(dp, &dp->tmp[0].buf, pv->layout.size);
            uint8_t * const tmp = dp->tmp[0].buf;
            const uint8_t *src = sv->data;
            const uint8_t *end = src + host_layout.size;
            uint8_t *dst = tmp;
            while (src < end) {
                memcpy(dst, src, host_layout.stride);
                src += host_layout.stride;
                dst += pv->layout.stride;
            }
            pl_buf_write(dp->gpu, pass->ubo, offset, tmp, pv->layout.size);
        }
        break;
    }
    case PASS_VAR_PUSHC:
        pl_assert(rparams->push_constants);
        memcpy_layout(rparams->push_constants, pv->layout, sv->data, host_layout);
        break;
    }
}

static void compute_vertex_attribs(pl_dispatch dp, pl_shader sh,
                                   int width, int height, ident_t *out_scale)
{
    // Simulate vertex attributes using global definitions
    *out_scale = sh_var(sh, (struct pl_shader_var) {
        .var = pl_var_vec2("out_scale"),
        .data = &(float[2]){ 1.0 / width, 1.0 / height },
        .dynamic = true,
    });

    GLSLP("#define frag_pos(id) (vec2(id) + vec2(0.5)) \n"
          "#define frag_map(id) (%s * frag_pos(id)) \n"
          "#define gl_FragCoord vec4(frag_pos(gl_GlobalInvocationID), 0.0, 1.0) \n",
          *out_scale);

    for (int n = 0; n < sh->vas.num; n++) {
        const struct pl_shader_va *sva = &sh->vas.elem[n];

        ident_t points[4];
        for (int i = 0; i < PL_ARRAY_SIZE(points); i++) {
            char name[4];
            snprintf(name, sizeof(name), "p%d", i);
            points[i] = sh_var_from_va(sh, name, &sva->attr, sva->data[i]);
        }

        GLSLP("#define %s_map(id) "
              "(mix(mix(%s, %s, frag_map(id).x), "
              "     mix(%s, %s, frag_map(id).x), "
              "frag_map(id).y))\n"
              "#define %s (%s_map(gl_GlobalInvocationID))\n",
              sva->attr.name,
              points[0], points[1], points[2], points[3],
              sva->attr.name, sva->attr.name);
    }
}
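
// For a vertex attribute named e.g. `coord` (name illustrative, and ignoring
// identifier mangling), the emitted definitions expand roughly to:
//
//     #define coord_map(id) (mix(mix(p0, p1, frag_map(id).x),
//                                mix(p2, p3, frag_map(id).x), frag_map(id).y))
//     #define coord (coord_map(gl_GlobalInvocationID))
//
// i.e. each invocation bilinearly interpolates the four per-vertex values,
// mimicking what fixed-function rasterization would otherwise provide.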

static void translate_compute_shader(pl_dispatch dp, pl_shader sh,
                                     const struct pl_rect2d *rc,
                                     const struct pl_dispatch_params *params)
{
    int width = abs(pl_rect_w(*rc)), height = abs(pl_rect_h(*rc));
    ident_t out_scale;
    compute_vertex_attribs(dp, sh, width, height, &out_scale);

    // Simulate a framebuffer using storage images
    pl_assert(params->target->params.storable);
    pl_assert(sh->res.output == PL_SHADER_SIG_COLOR);
    ident_t fbo = sh_desc(sh, (struct pl_shader_desc) {
        .binding.object = params->target,
        .desc = {
            .name = "out_image",
            .type = PL_DESC_STORAGE_IMG,
            .access = params->blend_params ? PL_DESC_ACCESS_READWRITE
                                           : PL_DESC_ACCESS_WRITEONLY,
        },
    });

    ident_t base = sh_var(sh, (struct pl_shader_var) {
        .data = &(int[2]){ rc->x0, rc->y0 },
        .dynamic = true,
        .var = {
            .name = "base",
            .type = PL_VAR_SINT,
            .dim_v = 2,
            .dim_m = 1,
            .dim_a = 1,
        },
    });

    int dx = rc->x0 > rc->x1 ? -1 : 1, dy = rc->y0 > rc->y1 ? -1 : 1;
    GLSL("ivec2 dir = ivec2(%d, %d);\n", dx, dy); // hard-code, not worth var
    GLSL("ivec2 pos = %s + dir * ivec2(gl_GlobalInvocationID);\n", base);
    GLSL("vec2 fpos = %s * vec2(gl_GlobalInvocationID);\n", out_scale);
    GLSL("if (max(fpos.x, fpos.y) < 1.0) {\n");
    if (params->blend_params) {
        GLSL("vec4 orig = imageLoad(%s, pos);\n", fbo);

        static const char *modes[] = {
            [PL_BLEND_ZERO]                = "0.0",
            [PL_BLEND_ONE]                 = "1.0",
            [PL_BLEND_SRC_ALPHA]           = "color.a",
            [PL_BLEND_ONE_MINUS_SRC_ALPHA] = "(1.0 - color.a)",
        };

        GLSL("color = vec4(color.rgb * vec3(%s), color.a * %s) \n"
             "      + vec4(orig.rgb * vec3(%s), orig.a * %s);\n",
             modes[params->blend_params->src_rgb],
             modes[params->blend_params->src_alpha],
             modes[params->blend_params->dst_rgb],
             modes[params->blend_params->dst_alpha]);
    }
    GLSL("imageStore(%s, pos, color);\n", fbo);
    GLSL("}\n");
    sh->res.output = PL_SHADER_SIG_NONE;
}
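
// For example, with src_rgb = PL_BLEND_SRC_ALPHA, src_alpha = PL_BLEND_ONE
// and both dst factors PL_BLEND_ONE_MINUS_SRC_ALPHA, the emitted GLSL amounts
// to classic "over" compositing:
//
//     color = vec4(color.rgb * vec3(color.a), color.a * 1.0)
//           + vec4(orig.rgb * vec3((1.0 - color.a)), orig.a * (1.0 - color.a));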

static void run_pass(pl_dispatch dp, pl_shader sh, struct pass *pass)
{
    const struct pl_shader_res *res = pl_shader_finalize(sh);
    pl_pass_run(dp->gpu, &pass->run_params);

    for (uint64_t ts; (ts = pl_timer_query(dp->gpu, pass->timer));) {
        PL_TRACE(dp, "Spent %.3f ms on shader: %s", ts / 1e6, res->description);

        uint64_t old = pass->samples[pass->ts_idx];
        pass->samples[pass->ts_idx] = ts;
        pass->ts_last = ts;
        pass->ts_peak = PL_MAX(pass->ts_peak, ts);
        pass->ts_sum += ts;
        pass->ts_idx = (pass->ts_idx + 1) % PL_ARRAY_SIZE(pass->samples);

        if (old) {
            pass->ts_sum -= old;
            if (old == pass->ts_peak) {
                uint64_t new_peak = 0;
                for (int i = 0; i < PL_ARRAY_SIZE(pass->samples); i++)
                    new_peak = PL_MAX(new_peak, pass->samples[i]);
                pass->ts_peak = new_peak;
            }
        }
    }

    if (!dp->info_callback)
        return;

    struct pl_dispatch_info info;
    info.signature = pass->signature;
    info.shader = res;

    // Test to see if the ring buffer already wrapped around once
    if (pass->samples[pass->ts_idx]) {
        info.num_samples = PL_ARRAY_SIZE(pass->samples);
        int num_wrapped = info.num_samples - pass->ts_idx;
        memcpy(info.samples, &pass->samples[pass->ts_idx],
               num_wrapped * sizeof(info.samples[0]));
        memcpy(&info.samples[num_wrapped], pass->samples,
               pass->ts_idx * sizeof(info.samples[0]));
    } else {
        info.num_samples = pass->ts_idx;
        memcpy(info.samples, pass->samples,
               pass->ts_idx * sizeof(info.samples[0]));
    }

    info.last = pass->ts_last;
    info.peak = pass->ts_peak;
    info.average = pass->ts_sum / PL_MAX(info.num_samples, 1);
    dp->info_callback(dp->info_priv, &info);
}

bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params)
{
    pl_shader sh = *params->shader;
    const struct pl_shader_res *res = &sh->res;
    bool ret = false;
    pl_mutex_lock(&dp->lock);

    if (sh->failed) {
        PL_ERR(sh, "Trying to dispatch a failed shader.");
        goto error;
    }

    if (!sh->mutable) {
        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
        goto error;
    }

    if (res->input != PL_SHADER_SIG_NONE || res->output != PL_SHADER_SIG_COLOR) {
        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
        goto error;
    }

    const struct pl_tex_params *tpars = &params->target->params;
    if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
        PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
               "texture. The target must be a renderable 2D texture.");
        goto error;
    }

    const struct pl_gpu_limits *limits = &dp->gpu->limits;
    bool can_compute = tpars->storable;
    if (can_compute && params->blend_params)
        can_compute = tpars->format->caps & PL_FMT_CAP_READWRITE;

    if (pl_shader_is_compute(sh) && !can_compute) {
        PL_ERR(dp, "Trying to dispatch using a compute shader with a "
               "non-storable or incompatible target texture.");
        goto error;
    } else if (can_compute && limits->compute_queues > limits->fragment_queues) {
        if (sh_try_compute(sh, 16, 16, true, 0))
            PL_TRACE(dp, "Upgrading fragment shader to compute shader.");
    }

    struct pl_rect2d rc = params->rect;
    if (!pl_rect_w(rc)) {
        rc.x0 = 0;
        rc.x1 = tpars->w;
    }
    if (!pl_rect_h(rc)) {
        rc.y0 = 0;
        rc.y1 = tpars->h;
    }

    int w, h, tw = abs(pl_rect_w(rc)), th = abs(pl_rect_h(rc));
    if (pl_shader_output_size(sh, &w, &h) && (w != tw || h != th)) {
        PL_ERR(dp, "Trying to dispatch a shader with explicit output size "
               "requirements %dx%d using a target rect of size %dx%d.",
               w, h, tw, th);
        goto error;
    }

    ident_t vert_pos = NULL;

    if (pl_shader_is_compute(sh)) {
        // Translate the compute shader to simulate vertices etc.
        translate_compute_shader(dp, sh, &rc, params);
    } else {
        // Add the vertex information encoding the position
        vert_pos = sh_attr_vec2(sh, "position", &(const struct pl_rect2df) {
            .x0 = 2.0 * rc.x0 / tpars->w - 1.0,
            .y0 = 2.0 * rc.y0 / tpars->h - 1.0,
            .x1 = 2.0 * rc.x1 / tpars->w - 1.0,
            .y1 = 2.0 * rc.y1 / tpars->h - 1.0,
        });
    }

    // We need to set pl_pass_params.load_target when either blending is
    // enabled or we're drawing to some scissored sub-rect of the texture
    struct pl_rect2d full = { 0, 0, tpars->w, tpars->h };
    struct pl_rect2d rc_norm = rc;
    pl_rect2d_normalize(&rc_norm);
    rc_norm.x0 = PL_MAX(rc_norm.x0, 0);
    rc_norm.y0 = PL_MAX(rc_norm.y0, 0);
    rc_norm.x1 = PL_MIN(rc_norm.x1, tpars->w);
    rc_norm.y1 = PL_MIN(rc_norm.y1, tpars->h);
    bool load = params->blend_params || !pl_rect2d_eq(rc_norm, full);

    struct pass *pass = finalize_pass(dp, sh, params->target, vert_pos,
                                      params->blend_params, load, NULL, NULL);

    // Silently return on failed passes
    if (!pass || !pass->pass)
        goto error;

    struct pl_pass_run_params *rparams = &pass->run_params;

    // Update the descriptor bindings
    for (int i = 0; i < sh->descs.num; i++)
        rparams->desc_bindings[i] = sd_binding(sh->descs.elem[i]);

    // Update all of the variables (if needed)
    rparams->num_var_updates = 0;
    for (int i = 0; i < sh->vars.num; i++)
        update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);

    // Update the vertex data
    if (rparams->vertex_data) {
        uintptr_t vert_base = (uintptr_t) rparams->vertex_data;
        size_t stride = rparams->pass->params.vertex_stride;
        for (int i = 0; i < sh->vas.num; i++) {
            const struct pl_shader_va *sva = &sh->vas.elem[i];
            struct pl_vertex_attrib *va = &rparams->pass->params.vertex_attribs[i];

            size_t size = sva->attr.fmt->texel_size;
            uintptr_t va_base = vert_base + va->offset; // use placed offset
            for (int n = 0; n < 4; n++)
                memcpy((void *) (va_base + n * stride), sva->data[n], size);
        }
    }

    // For compute shaders: also update the dispatch dimensions
    if (pl_shader_is_compute(sh)) {
        // Round up to make sure we don't leave off a part of the target
        int width   = abs(pl_rect_w(rc)),
            height  = abs(pl_rect_h(rc)),
            block_w = res->compute_group_size[0],
            block_h = res->compute_group_size[1],
            num_x   = (width + block_w - 1) / block_w,
            num_y   = (height + block_h - 1) / block_h;

        rparams->compute_groups[0] = num_x;
        rparams->compute_groups[1] = num_y;
        rparams->compute_groups[2] = 1;
    } else {
        // Update the scissors for performance
        rparams->scissors = rc_norm;
    }

    // Dispatch the actual shader
    rparams->target = params->target;
    rparams->timer = PL_DEF(params->timer, pass->timer);
    run_pass(dp, sh, pass);

    ret = true;
    // fall through

error:
    // Reset the temporary buffers which we use to build the shader
    for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
        dp->tmp[i].len = 0;

    pl_mutex_unlock(&dp->lock);
    pl_dispatch_abort(dp, params->shader);
    return ret;
}

bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params)
{
    pl_shader sh = *params->shader;
    const struct pl_shader_res *res = &sh->res;
    bool ret = false;
    pl_mutex_lock(&dp->lock);

    if (sh->failed) {
        PL_ERR(sh, "Trying to dispatch a failed shader.");
        goto error;
    }

    if (!sh->mutable) {
        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
        goto error;
    }

    if (res->input != PL_SHADER_SIG_NONE) {
        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
        goto error;
    }

    if (!pl_shader_is_compute(sh)) {
        PL_ERR(dp, "Trying to dispatch a non-compute shader using "
               "`pl_dispatch_compute`!");
        goto error;
    }

    if (sh->vas.num) {
        if (!params->width || !params->height) {
            PL_ERR(dp, "Trying to dispatch a targetless compute shader that "
                   "uses vertex attributes, this requires specifying the size "
                   "of the effective rendering area!");
            goto error;
        }

        compute_vertex_attribs(dp, sh, params->width, params->height,
                               &(ident_t){0});
    }

    struct pass *pass = finalize_pass(dp, sh, NULL, NULL, NULL, false, NULL, NULL);

    // Silently return on failed passes
    if (!pass || !pass->pass)
        goto error;

    struct pl_pass_run_params *rparams = &pass->run_params;

    // Update the descriptor bindings
    for (int i = 0; i < sh->descs.num; i++)
        rparams->desc_bindings[i] = sd_binding(sh->descs.elem[i]);

    // Update all of the variables (if needed)
    rparams->num_var_updates = 0;
    for (int i = 0; i < sh->vars.num; i++)
        update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);

    // Update the dispatch size
    int groups = 1;
    for (int i = 0; i < 3; i++) {
        groups *= params->dispatch_size[i];
        rparams->compute_groups[i] = params->dispatch_size[i];
    }

    if (!groups) {
        pl_assert(params->width && params->height);
        int block_w = res->compute_group_size[0],
            block_h = res->compute_group_size[1],
            num_x   = (params->width + block_w - 1) / block_w,
            num_y   = (params->height + block_h - 1) / block_h;

        rparams->compute_groups[0] = num_x;
        rparams->compute_groups[1] = num_y;
        rparams->compute_groups[2] = 1;
    }

    // Dispatch the actual shader
    rparams->timer = PL_DEF(params->timer, pass->timer);
    run_pass(dp, sh, pass);

    ret = true;
    // fall through

error:
    // Reset the temporary buffers which we use to build the shader
    for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
        dp->tmp[i].len = 0;

    pl_mutex_unlock(&dp->lock);
    pl_dispatch_abort(dp, params->shader);
    return ret;
}

bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params)
{
    pl_shader sh = *params->shader;
    const struct pl_shader_res *res = &sh->res;
    bool ret = false;
    pl_mutex_lock(&dp->lock);

    if (sh->failed) {
        PL_ERR(sh, "Trying to dispatch a failed shader.");
        goto error;
    }

    if (!sh->mutable) {
        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
        goto error;
    }

    if (res->input != PL_SHADER_SIG_NONE || res->output != PL_SHADER_SIG_COLOR) {
        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
        goto error;
    }

    const struct pl_tex_params *tpars = &params->target->params;
    if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
        PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
               "texture. The target must be a renderable 2D texture.");
        goto error;
    }

    if (pl_shader_is_compute(sh)) {
        PL_ERR(dp, "Trying to dispatch a compute shader using pl_dispatch_vertex.");
        goto error;
    }

    if (sh->vas.num) {
        PL_ERR(dp, "Trying to dispatch a custom vertex shader with already "
               "attached vertex attributes.");
        goto error;
    }

    int pos_idx = params->vertex_position_idx;
    if (pos_idx < 0 || pos_idx >= params->num_vertex_attribs) {
        PL_ERR(dp, "Vertex position index out of range?");
        goto error;
    }

    // Attach all of the vertex attributes to the shader manually
    sh->vas.num = params->num_vertex_attribs;
    PL_ARRAY_RESIZE(sh, sh->vas, sh->vas.num);
    for (int i = 0; i < params->num_vertex_attribs; i++)
        sh->vas.elem[i].attr = params->vertex_attribs[i];

    // Compute the coordinate projection matrix
    struct pl_transform2x2 proj = pl_transform2x2_identity;
    switch (params->vertex_coords) {
    case PL_COORDS_ABSOLUTE:
        proj.mat.m[0][0] /= tpars->w;
        proj.mat.m[1][1] /= tpars->h;
        // fall through
    case PL_COORDS_RELATIVE:
        proj.mat.m[0][0] *= 2.0;
        proj.mat.m[1][1] *= 2.0;
        proj.c[0] -= 1.0;
        proj.c[1] -= 1.0;
        // fall through
    case PL_COORDS_NORMALIZED:
        if (params->vertex_flipped) {
            proj.mat.m[1][1] = -proj.mat.m[1][1];
            proj.c[1] += 2.0;
        }
        break;
    }
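
    // e.g. for PL_COORDS_ABSOLUTE the above composes to x' = 2*x/w - 1 and
    // y' = 2*y/h - 1 (or y' = 1 - 2*y/h when vertex_flipped), mapping pixel
    // coordinates onto the [-1, 1] NDC square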

    ident_t out_proj = NULL;
    if (memcmp(&proj, &pl_transform2x2_identity, sizeof(proj)) != 0) {
        struct pl_matrix3x3 mat = {{
            {proj.mat.m[0][0], proj.mat.m[0][1], proj.c[0]},
            {proj.mat.m[1][0], proj.mat.m[1][1], proj.c[1]},
            {0.0, 0.0, 1.0},
        }};
        out_proj = sh_var(sh, (struct pl_shader_var) {
            .var = pl_var_mat3("proj"),
            .data = PL_TRANSPOSE_3X3(mat.m),
        });
    }

    ident_t vert_pos = params->vertex_attribs[pos_idx].name;
    struct pass *pass = finalize_pass(dp, sh, params->target, vert_pos,
                                      params->blend_params, true, params, out_proj);

    // Silently return on failed passes
    if (!pass || !pass->pass)
        goto error;

    struct pl_pass_run_params *rparams = &pass->run_params;

    // Update the descriptor bindings
    for (int i = 0; i < sh->descs.num; i++)
        rparams->desc_bindings[i] = sd_binding(sh->descs.elem[i]);

    // Update all of the variables (if needed)
    rparams->num_var_updates = 0;
    for (int i = 0; i < sh->vars.num; i++)
        update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);

    // Update the scissors
    rparams->scissors = params->scissors;
    if (params->vertex_flipped) {
        rparams->scissors.y0 = tpars->h - rparams->scissors.y0;
        rparams->scissors.y1 = tpars->h - rparams->scissors.y1;
    }
    pl_rect2d_normalize(&rparams->scissors);

    // Dispatch the actual shader
    rparams->target = params->target;
    rparams->vertex_count = params->vertex_count;
    rparams->vertex_data = params->vertex_data;
    rparams->vertex_buf = params->vertex_buf;
    rparams->buf_offset = params->buf_offset;
    rparams->index_data = params->index_data;
    rparams->index_buf = params->index_buf;
    rparams->index_offset = params->index_offset;
    rparams->timer = PL_DEF(params->timer, pass->timer);
    run_pass(dp, sh, pass);

    ret = true;
    // fall through

error:
    // Reset the temporary buffers which we use to build the shader
    for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
        dp->tmp[i].len = 0;

    pl_mutex_unlock(&dp->lock);
    pl_dispatch_abort(dp, params->shader);
    return ret;
}

void pl_dispatch_abort(pl_dispatch dp, pl_shader *psh)
{
    pl_shader sh = *psh;
    if (!sh)
        return;

    // Reset this as early as possible to free temporary resources
    pl_shader_reset(sh, NULL);

    // Re-add the shader to the internal pool of shaders
    pl_mutex_lock(&dp->lock);
    PL_ARRAY_APPEND(dp, dp->shaders, sh);
    pl_mutex_unlock(&dp->lock);
    *psh = NULL;
}

// Stuff related to caching
static const char cache_magic[] = {'P', 'L', 'D', 'P'};
static const uint32_t cache_version = 1;

static void write_buf(uint8_t *buf, size_t *pos, const void *src, size_t size)
{
    assert(size);
    if (buf)
        memcpy(&buf[*pos], src, size);
    *pos += size;
}

#define WRITE(type, var) write_buf(out, &size, &(type){ var }, sizeof(type))
#define LOAD(var)                               \
    do {                                        \
        memcpy(&(var), cache, sizeof(var));     \
        cache += sizeof(var);                   \
    } while (0)
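
// Serialized cache layout, as produced by pl_dispatch_save below (integers
// are memcpy'd in native byte order):
//
//     char     magic[4];        // 'P' 'L' 'D' 'P'
//     uint32_t version;         // cache_version
//     uint32_t num_passes;
//     num_passes repetitions of:
//         uint64_t signature;
//         uint64_t cached_program_len;
//         uint8_t  cached_program[cached_program_len];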

size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out)
{
    size_t size = 0;
    pl_mutex_lock(&dp->lock);

    write_buf(out, &size, cache_magic, sizeof(cache_magic));
    WRITE(uint32_t, cache_version);

    // Remember this position so we can go back and write the actual number of
    // cached programs
    uint32_t num_passes = 0;
    void *out_num = out ? &out[size] : NULL;
    size += sizeof(num_passes);

    // Save the cached programs for all compiled passes
    for (int i = 0; i < dp->passes.num; i++) {
        const struct pass *pass = dp->passes.elem[i];
        if (!pass->pass)
            continue;

        const struct pl_pass_params *params = &pass->pass->params;
        if (!params->cached_program_len)
            continue;

        if (out) {
            PL_DEBUG(dp, "Saving %zu bytes of cached program with signature 0x%llx",
                     params->cached_program_len, (unsigned long long) pass->signature);
        }

        num_passes++;
        WRITE(uint64_t, pass->signature);
        WRITE(uint64_t, params->cached_program_len);
        write_buf(out, &size, params->cached_program, params->cached_program_len);
    }

    // Re-save the cached programs for all previously loaded (but not yet
    // compiled) passes. This is simply to make `pl_dispatch_load` followed
    // by `pl_dispatch_save` return the same cache as was previously loaded.
    for (int i = 0; i < dp->cached_passes.num; i++) {
        const struct cached_pass *pass = &dp->cached_passes.elem[i];
        if (!pass->cached_program_len)
            continue;

        if (out) {
            PL_DEBUG(dp, "Saving %zu bytes of cached program with signature 0x%llx",
                     pass->cached_program_len, (unsigned long long) pass->signature);
        }

        num_passes++;
        WRITE(uint64_t, pass->signature);
        WRITE(uint64_t, pass->cached_program_len);
        write_buf(out, &size, pass->cached_program, pass->cached_program_len);
    }

    if (out)
        memcpy(out_num, &num_passes, sizeof(num_passes));

    pl_mutex_unlock(&dp->lock);
    return size;
}
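
// Illustrative two-call usage sketch (not part of the library): a NULL `out`
// only computes the required size, a second call then serializes in place:
//
//     size_t len = pl_dispatch_save(dp, NULL);
//     uint8_t *buf = malloc(len);
//     if (buf && pl_dispatch_save(dp, buf) == len)
//         store_to_disk(buf, len); // hypothetical helper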

void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache)
{
    char magic[4];
    LOAD(magic);
    if (memcmp(magic, cache_magic, sizeof(magic)) != 0) {
        PL_ERR(dp, "Failed loading dispatch cache: invalid magic bytes");
        return;
    }

    uint32_t version;
    LOAD(version);
    if (version != cache_version) {
        PL_WARN(dp, "Failed loading dispatch cache: wrong version");
        return;
    }

    uint32_t num;
    LOAD(num);

    pl_mutex_lock(&dp->lock);
    for (int i = 0; i < num; i++) {
        uint64_t sig, size;
        LOAD(sig);
        LOAD(size);
        if (!size)
            continue;

        // Skip passes that are already compiled
        bool compiled = false;
        for (int n = 0; n < dp->passes.num; n++) {
            if (dp->passes.elem[n]->signature == sig) {
                PL_DEBUG(dp, "Skipping already compiled pass with signature %llx",
                         (unsigned long long) sig);
                compiled = true;
                break;
            }
        }

        if (compiled) {
            cache += size;
            continue;
        }

        // Find a cached_pass entry with this signature, if any
        struct cached_pass *pass = NULL;
        for (int n = 0; n < dp->cached_passes.num; n++) {
            if (dp->cached_passes.elem[n].signature == sig) {
                pass = &dp->cached_passes.elem[n];
                break;
            }
        }

        if (!pass) {
            // None found, add a new entry
            PL_ARRAY_GROW(dp, dp->cached_passes);
            pass = &dp->cached_passes.elem[dp->cached_passes.num++];
            *pass = (struct cached_pass) { .signature = sig };
        }

        PL_DEBUG(dp, "Loading %zu bytes of cached program with signature 0x%llx",
                 (size_t) size, (unsigned long long) sig);

        pl_free((void *) pass->cached_program);
        pass->cached_program = pl_memdup(dp, cache, size);
        pass->cached_program_len = size;
        cache += size;
    }
    pl_mutex_unlock(&dp->lock);
}