/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "gpu.h"
#include "formats.h"
#include "glsl/spirv.h"

struct stream_buf_slice {
    const void *data;
    unsigned int size;
    unsigned int offset;
};

// Upload one or more slices of single-use data to a suballocated dynamic
// buffer. Only call this once per-buffer per-pass, since it will discard or
// reallocate the buffer when full.
static bool stream_buf_upload(pl_gpu gpu, struct d3d_stream_buf *stream,
                              struct stream_buf_slice *slices, int num_slices)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    unsigned int align = PL_DEF(stream->align, sizeof(float));

    // Get total size, rounded up to the buffer's alignment
    size_t size = 0;
    for (int i = 0; i < num_slices; i++)
        size += PL_ALIGN2(slices[i].size, align);

    if (size > gpu->limits.max_buf_size) {
        PL_ERR(gpu, "Streaming buffer is too large");
        return false;
    }

    // If the data doesn't fit, realloc the buffer
    if (size > stream->size) {
        size_t new_size = stream->size;
        // Arbitrary base size
        if (!new_size)
            new_size = 16 * 1024;
        while (new_size < size)
            new_size *= 2;
        new_size = PL_MIN(new_size, gpu->limits.max_buf_size);

        ID3D11Buffer *new_buf;
        D3D11_BUFFER_DESC vbuf_desc = {
            .ByteWidth = new_size,
            .Usage = D3D11_USAGE_DYNAMIC,
            .BindFlags = stream->bind_flags,
            .CPUAccessFlags = D3D11_CPU_ACCESS_WRITE,
        };
        D3D(ID3D11Device_CreateBuffer(p->dev, &vbuf_desc, NULL, &new_buf));

        SAFE_RELEASE(stream->buf);
        stream->buf = new_buf;
        stream->size = new_size;
        stream->used = 0;
    }

    bool discard = false;
    size_t offset = stream->used;
    if (offset + size > stream->size) {
        // We reached the end of the buffer, so discard and wrap around
        discard = true;
        offset = 0;
    }

    D3D11_MAPPED_SUBRESOURCE map = {0};
    UINT type = discard ? D3D11_MAP_WRITE_DISCARD : D3D11_MAP_WRITE_NO_OVERWRITE;
    D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) stream->buf, 0, type,
                                0, &map));

    // Upload each slice
    char *cdata = map.pData;
    stream->used = offset;
    for (int i = 0; i < num_slices; i++) {
        slices[i].offset = stream->used;
        memcpy(cdata + slices[i].offset, slices[i].data, slices[i].size);
        stream->used += PL_ALIGN2(slices[i].size, align);
    }

    ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource *) stream->buf, 0);

    return true;

error:
    return false;
}

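// Map the device's feature level and the shader stage to the HLSL target
// profile string passed to D3DCompile. Returns NULL for stages that aren't
// available at that feature level (e.g. compute shaders on FL9_x).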
static const char *get_shader_target(pl_gpu gpu, enum glsl_shader_stage stage)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    switch (p->fl) {
    default:
        switch (stage) {
        case GLSL_SHADER_VERTEX:   return "vs_5_0";
        case GLSL_SHADER_FRAGMENT: return "ps_5_0";
        case GLSL_SHADER_COMPUTE:  return "cs_5_0";
        }
        break;
    case D3D_FEATURE_LEVEL_10_1:
        switch (stage) {
        case GLSL_SHADER_VERTEX:   return "vs_4_1";
        case GLSL_SHADER_FRAGMENT: return "ps_4_1";
        case GLSL_SHADER_COMPUTE:  return "cs_4_1";
        }
        break;
    case D3D_FEATURE_LEVEL_10_0:
        switch (stage) {
        case GLSL_SHADER_VERTEX:   return "vs_4_0";
        case GLSL_SHADER_FRAGMENT: return "ps_4_0";
        case GLSL_SHADER_COMPUTE:  return "cs_4_0";
        }
        break;
    case D3D_FEATURE_LEVEL_9_3:
        switch (stage) {
        case GLSL_SHADER_VERTEX:   return "vs_4_0_level_9_3";
        case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_3";
        case GLSL_SHADER_COMPUTE:  return NULL;
        }
        break;
    case D3D_FEATURE_LEVEL_9_2:
    case D3D_FEATURE_LEVEL_9_1:
        switch (stage) {
        case GLSL_SHADER_VERTEX:   return "vs_4_0_level_9_1";
        case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_1";
        case GLSL_SHADER_COMPUTE:  return NULL;
        }
        break;
    }
    return NULL;
}

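// Wrapper for SPIRV-Cross calls: on failure, log the failing call together
// with SPIRV-Cross's last error string and the source location, then jump to
// the enclosing `error` label. Expects `gpu` and `pass_s` to be in scope.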
#define SC(cmd)                                                              \
    do {                                                                     \
        spvc_result res = (cmd);                                             \
        if (res != SPVC_SUCCESS) {                                           \
            PL_ERR(gpu, "%s: %s (%d) (%s:%d)",                               \
                   #cmd, pass_s->sc ?                                        \
                       spvc_context_get_last_error_string(pass_s->sc) : "",  \
                   res, __FILE__, __LINE__);                                 \
            goto error;                                                      \
        }                                                                    \
    } while (0)

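// Mark the pl_descs referred to by a SPIRV-Cross resource list as statically
// used in the given shader stage, so that only those descriptors get HLSL
// registers assigned for that stage later on.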
static spvc_result mark_resources_used(pl_pass pass, spvc_compiler sc_comp,
                                       spvc_resources resources,
                                       spvc_resource_type res_type,
                                       enum glsl_shader_stage stage)
{
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
    const spvc_reflected_resource *res_list;
    size_t res_count;
    spvc_result res;

    res = spvc_resources_get_resource_list_for_type(resources, res_type,
                                                    &res_list, &res_count);
    if (res != SPVC_SUCCESS)
        return res;

    for (int i = 0; i < res_count; i++) {
        unsigned int binding = spvc_compiler_get_decoration(sc_comp,
            res_list[i].id, SpvDecorationBinding);
        unsigned int descriptor_set = spvc_compiler_get_decoration(sc_comp,
            res_list[i].id, SpvDecorationDescriptorSet);
        if (descriptor_set != 0)
            continue;

        // Find the pl_desc with this binding and mark it as used
        for (int j = 0; j < pass->params.num_descriptors; j++) {
            struct pl_desc *desc = &pass->params.descriptors[j];
            if (desc->binding != binding)
                continue;

            struct pl_desc_d3d11 *desc_p = &pass_p->descriptors[j];
            if (stage == GLSL_SHADER_VERTEX) {
                desc_p->vertex.used = true;
            } else {
                desc_p->main.used = true;
            }
        }
    }

    return res;
}

static const char *shader_names[] = {
    [GLSL_SHADER_VERTEX]   = "vertex",
    [GLSL_SHADER_FRAGMENT] = "fragment",
    [GLSL_SHADER_COMPUTE]  = "compute",
};

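// Compile a pass's GLSL source to SPIR-V and set up a SPIRV-Cross HLSL
// compiler for it, recording which descriptors each shader stage actually
// uses. The SPIRV-Cross context is kept alive in `pass_s` for the subsequent
// HLSL translation step.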
static bool shader_compile_glsl(pl_gpu gpu, pl_pass pass,
                                struct d3d_pass_stage *pass_s,
                                enum glsl_shader_stage stage, const char *glsl)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    void *tmp = pl_tmp(NULL);
    bool success = false;

    clock_t start = clock();
    pl_str spirv = spirv_compile_glsl(p->spirv, tmp, &gpu->glsl, stage, glsl);
    if (!spirv.len)
        goto error;

    pl_log_cpu_time(gpu->log, start, clock(), "translating GLSL to SPIR-V");

    SC(spvc_context_create(&pass_s->sc));

    spvc_parsed_ir sc_ir;
    SC(spvc_context_parse_spirv(pass_s->sc, (SpvId *) spirv.buf,
                                spirv.len / sizeof(SpvId), &sc_ir));

    SC(spvc_context_create_compiler(pass_s->sc, SPVC_BACKEND_HLSL, sc_ir,
                                    SPVC_CAPTURE_MODE_TAKE_OWNERSHIP,
                                    &pass_s->sc_comp));

    spvc_compiler_options sc_opts;
    SC(spvc_compiler_create_compiler_options(pass_s->sc_comp, &sc_opts));

    int sc_shader_model;
    if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
        sc_shader_model = 50;
    } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) {
        sc_shader_model = 41;
    } else {
        sc_shader_model = 40;
    }

    SC(spvc_compiler_options_set_uint(sc_opts,
        SPVC_COMPILER_OPTION_HLSL_SHADER_MODEL, sc_shader_model));

    // Unlike Vulkan and OpenGL, in D3D11, the clip-space is "flipped" with
    // respect to framebuffer-space. In other words, if you render to a pixel at
    // (0, -1), you have to sample from (0, 1) to get the value back. We unflip
    // it by setting the following option, which inserts the equivalent of
    // `gl_Position.y = -gl_Position.y` into the vertex shader
    if (stage == GLSL_SHADER_VERTEX) {
        SC(spvc_compiler_options_set_bool(sc_opts,
            SPVC_COMPILER_OPTION_FLIP_VERTEX_Y, SPVC_TRUE));
    }

    // Bind readonly images and imageBuffers as SRVs. This is done because a lot
    // of hardware (especially FL11_x hardware) has very poor format support for
    // reading values from UAVs. It allows the common case of readonly and
    // writeonly images to support more formats, though the less common case of
    // readwrite images still requires format support for UAV loads (represented
    // by the PL_FMT_CAP_READWRITE cap in libplacebo.)
    //
    // Note that setting this option comes at the cost of GLSL support. Readonly
    // and readwrite images are the same type in GLSL, but SRV and UAV bound
    // textures are different types in HLSL, so for example, a GLSL function
    // with an image parameter may fail to compile as HLSL if it's called with a
    // readonly image and a readwrite image at different call sites.
    SC(spvc_compiler_options_set_bool(sc_opts,
        SPVC_COMPILER_OPTION_HLSL_NONWRITABLE_UAV_TEXTURE_AS_SRV, SPVC_TRUE));

    SC(spvc_compiler_install_compiler_options(pass_s->sc_comp, sc_opts));

    spvc_set active = NULL;
    SC(spvc_compiler_get_active_interface_variables(pass_s->sc_comp, &active));
    spvc_resources resources = NULL;
    SC(spvc_compiler_create_shader_resources_for_active_variables(
        pass_s->sc_comp, &resources, active));

    // In D3D11, the vertex shader and fragment shader can have a different set
    // of bindings. At this point, SPIRV-Cross knows which resources are
    // statically used in each stage. We can use this information to optimize
    // HLSL register allocation by not binding resources to shader stages
    // they're not used in.
    mark_resources_used(pass, pass_s->sc_comp, resources,
                        SPVC_RESOURCE_TYPE_UNIFORM_BUFFER, stage);
    mark_resources_used(pass, pass_s->sc_comp, resources,
                        SPVC_RESOURCE_TYPE_STORAGE_BUFFER, stage);
    mark_resources_used(pass, pass_s->sc_comp, resources,
                        SPVC_RESOURCE_TYPE_STORAGE_IMAGE, stage);
    mark_resources_used(pass, pass_s->sc_comp, resources,
                        SPVC_RESOURCE_TYPE_SAMPLED_IMAGE, stage);

    success = true;
error:;
    if (!success) {
        PL_ERR(gpu, "%s shader GLSL source:", shader_names[stage]);
        pl_msg_source(gpu->ctx, PL_LOG_ERR, glsl);

        if (pass_s->sc) {
            spvc_context_destroy(pass_s->sc);
            pass_s->sc = NULL;
        }
    }
    pl_free(tmp);

    return success;
}

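// Translate the SPIR-V parsed by shader_compile_glsl to HLSL, assign HLSL
// register numbers to each used descriptor, and compile the result to DXBC
// bytecode with D3DCompile.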
static bool shader_compile_hlsl(pl_gpu gpu, pl_pass pass,
                                struct d3d_pass_stage *pass_s,
                                enum glsl_shader_stage stage, const char *glsl,
                                ID3DBlob **out)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
    const char *hlsl = NULL;
    ID3DBlob *errors = NULL;
    bool success = false;
    HRESULT hr;

    int max_binding = -1;

    // This should not be called without first calling shader_compile_glsl
    pl_assert(pass_s->sc_comp);

    static const SpvExecutionModel spv_execution_model[] = {
        [GLSL_SHADER_VERTEX]   = SpvExecutionModelVertex,
        [GLSL_SHADER_FRAGMENT] = SpvExecutionModelFragment,
        [GLSL_SHADER_COMPUTE]  = SpvExecutionModelGLCompute,
    };

    // Assign the HLSL register numbers we want to use for each resource
    for (int i = 0; i < pass->params.num_descriptors; i++) {
        struct pl_desc *desc = &pass->params.descriptors[i];
        struct pl_desc_d3d11 *desc_p = &pass_p->descriptors[i];
        struct d3d_desc_stage *desc_s =
            stage == GLSL_SHADER_VERTEX ? &desc_p->vertex : &desc_p->main;

        // Skip resources that aren't in this shader stage
        if (!desc_s->used)
            continue;

        spvc_hlsl_resource_binding binding;
        spvc_hlsl_resource_binding_init(&binding);
        binding.stage = spv_execution_model[stage];
        binding.binding = desc->binding;
        max_binding = PL_MAX(max_binding, desc->binding);
        if (desc_s->cbv_slot > 0)
            binding.cbv.register_binding = desc_s->cbv_slot;
        if (desc_s->srv_slot > 0)
            binding.srv.register_binding = desc_s->srv_slot;
        if (desc_s->sampler_slot > 0)
            binding.sampler.register_binding = desc_s->sampler_slot;
        if (desc_s->uav_slot > 0)
            binding.uav.register_binding = desc_s->uav_slot;
        SC(spvc_compiler_hlsl_add_resource_binding(pass_s->sc_comp, &binding));
    }

    if (stage == GLSL_SHADER_COMPUTE) {
        // Check if the gl_NumWorkGroups builtin is used. If it is, we have to
        // emulate it with a constant buffer, so allocate it a CBV register.
        spvc_variable_id num_workgroups_id =
            spvc_compiler_hlsl_remap_num_workgroups_builtin(pass_s->sc_comp);
        if (num_workgroups_id) {
            pass_p->num_workgroups_used = true;

            spvc_hlsl_resource_binding binding;
            spvc_hlsl_resource_binding_init(&binding);
            binding.stage = spv_execution_model[stage];
            binding.binding = max_binding + 1;

            // Allocate a CBV register for the buffer
            binding.cbv.register_binding = pass_s->cbvs.num;
            PL_ARRAY_APPEND(pass, pass_s->cbvs, HLSL_BINDING_NUM_WORKGROUPS);
            if (pass_s->cbvs.num >
                    D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) {
                PL_ERR(gpu, "Not enough constant buffer slots for gl_NumWorkGroups");
                goto error;
            }

            spvc_compiler_set_decoration(pass_s->sc_comp, num_workgroups_id,
                                         SpvDecorationDescriptorSet, 0);
            spvc_compiler_set_decoration(pass_s->sc_comp, num_workgroups_id,
                                         SpvDecorationBinding, binding.binding);

            SC(spvc_compiler_hlsl_add_resource_binding(pass_s->sc_comp, &binding));
        }
    }

    clock_t start = clock();
    SC(spvc_compiler_compile(pass_s->sc_comp, &hlsl));

    clock_t after_spvc = clock();
    pl_log_cpu_time(gpu->log, start, after_spvc, "translating SPIR-V to HLSL");

    // Check if each resource binding was actually used by SPIRV-Cross in the
    // compiled HLSL. This information can be used to optimize resource binding
    // to the pipeline.
    for (int i = 0; i < pass->params.num_descriptors; i++) {
        struct pl_desc *desc = &pass->params.descriptors[i];
        struct pl_desc_d3d11 *desc_p = &pass_p->descriptors[i];
        struct d3d_desc_stage *desc_s =
            stage == GLSL_SHADER_VERTEX ? &desc_p->vertex : &desc_p->main;

        // Skip resources that aren't in this shader stage
        if (!desc_s->used)
            continue;

        bool used = spvc_compiler_hlsl_is_resource_used(pass_s->sc_comp,
            spv_execution_model[stage], 0, desc->binding);
        if (!used)
            desc_s->used = false;
    }

    hr = p->D3DCompile(hlsl, strlen(hlsl), NULL, NULL, NULL, "main",
        get_shader_target(gpu, stage),
        D3DCOMPILE_SKIP_VALIDATION | D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, out,
        &errors);
    if (FAILED(hr)) {
        PL_ERR(gpu, "D3DCompile failed: %s\n%.*s", pl_hresult_to_str(hr),
               (int) ID3D10Blob_GetBufferSize(errors),
               (char *) ID3D10Blob_GetBufferPointer(errors));
        goto error;
    }

    pl_log_cpu_time(gpu->log, after_spvc, clock(), "translating HLSL to DXBC");

    success = true;
error:;
    int level = success ? PL_LOG_DEBUG : PL_LOG_ERR;
    PL_MSG(gpu, level, "%s shader GLSL source:", shader_names[stage]);
    pl_msg_source(gpu->ctx, level, glsl);
    if (hlsl) {
        PL_MSG(gpu, level, "%s shader HLSL source:", shader_names[stage]);
        pl_msg_source(gpu->ctx, level, hlsl);
    }

    if (pass_s->sc) {
        spvc_context_destroy(pass_s->sc);
        pass_s->sc = NULL;
    }
    SAFE_RELEASE(errors);
    return success;
}

void pl_d3d11_pass_destroy(pl_gpu gpu, pl_pass pass)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);

    if (pass_p->main.sc) {
        spvc_context_destroy(pass_p->main.sc);
        pass_p->main.sc = NULL;
    }
    if (pass_p->vertex.sc) {
        spvc_context_destroy(pass_p->vertex.sc);
        pass_p->vertex.sc = NULL;
    }

    SAFE_RELEASE(pass_p->vs);
    SAFE_RELEASE(pass_p->ps);
    SAFE_RELEASE(pass_p->cs);
    SAFE_RELEASE(pass_p->layout);
    SAFE_RELEASE(pass_p->bstate);

    pl_d3d11_flush_message_queue(ctx, "After pass destroy");

    pl_free((void *) pass);
}

static bool pass_create_raster(pl_gpu gpu, struct pl_pass *pass,
                               const struct pl_pass_params *params)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
    ID3DBlob *vs_blob = NULL;
    ID3DBlob *ps_blob = NULL;
    D3D11_INPUT_ELEMENT_DESC *in_descs = NULL;
    bool success = false;

    if (!shader_compile_hlsl(gpu, pass, &pass_p->vertex, GLSL_SHADER_VERTEX,
                             params->vertex_shader, &vs_blob))
        goto error;

    D3D(ID3D11Device_CreateVertexShader(p->dev,
        ID3D10Blob_GetBufferPointer(vs_blob), ID3D10Blob_GetBufferSize(vs_blob),
        NULL, &pass_p->vs));

    if (!shader_compile_hlsl(gpu, pass, &pass_p->main, GLSL_SHADER_FRAGMENT,
                             params->glsl_shader, &ps_blob))
        goto error;

    D3D(ID3D11Device_CreatePixelShader(p->dev,
        ID3D10Blob_GetBufferPointer(ps_blob), ID3D10Blob_GetBufferSize(ps_blob),
        NULL, &pass_p->ps));

    in_descs = pl_calloc_ptr(pass, params->num_vertex_attribs, in_descs);
    for (int i = 0; i < params->num_vertex_attribs; i++) {
        struct pl_vertex_attrib *va = &params->vertex_attribs[i];

        in_descs[i] = (D3D11_INPUT_ELEMENT_DESC) {
            // The semantic name doesn't mean much and is just used to verify
            // the input description matches the shader. SPIRV-Cross always
            // uses TEXCOORD, so we should too.
            .SemanticName = "TEXCOORD",
            .SemanticIndex = va->location,
            .AlignedByteOffset = va->offset,
            .Format = fmt_to_dxgi(va->fmt),
        };
    }
    D3D(ID3D11Device_CreateInputLayout(p->dev, in_descs,
        params->num_vertex_attribs, ID3D10Blob_GetBufferPointer(vs_blob),
        ID3D10Blob_GetBufferSize(vs_blob), &pass_p->layout));

    static const D3D11_BLEND blend_options[] = {
        [PL_BLEND_ZERO] = D3D11_BLEND_ZERO,
        [PL_BLEND_ONE] = D3D11_BLEND_ONE,
        [PL_BLEND_SRC_ALPHA] = D3D11_BLEND_SRC_ALPHA,
        [PL_BLEND_ONE_MINUS_SRC_ALPHA] = D3D11_BLEND_INV_SRC_ALPHA,
    };

    D3D11_BLEND_DESC bdesc = {
        .RenderTarget[0] = {
            .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL,
        },
    };
    if (params->blend_params) {
        bdesc.RenderTarget[0] = (D3D11_RENDER_TARGET_BLEND_DESC) {
            .BlendEnable = TRUE,
            .SrcBlend = blend_options[params->blend_params->src_rgb],
            .DestBlend = blend_options[params->blend_params->dst_rgb],
            .BlendOp = D3D11_BLEND_OP_ADD,
            .SrcBlendAlpha = blend_options[params->blend_params->src_alpha],
            .DestBlendAlpha = blend_options[params->blend_params->dst_alpha],
            .BlendOpAlpha = D3D11_BLEND_OP_ADD,
            .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL,
        };
    }
    D3D(ID3D11Device_CreateBlendState(p->dev, &bdesc, &pass_p->bstate));

    success = true;
error:
    SAFE_RELEASE(vs_blob);
    SAFE_RELEASE(ps_blob);
    pl_free(in_descs);
    return success;
}

static bool pass_create_compute(pl_gpu gpu, struct pl_pass *pass,
                                const struct pl_pass_params *params)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
    ID3DBlob *cs_blob = NULL;
    bool success = false;

    if (!shader_compile_hlsl(gpu, pass, &pass_p->main, GLSL_SHADER_COMPUTE,
                             params->glsl_shader, &cs_blob))
        goto error;

    D3D(ID3D11Device_CreateComputeShader(p->dev,
        ID3D10Blob_GetBufferPointer(cs_blob), ID3D10Blob_GetBufferSize(cs_blob),
        NULL, &pass_p->cs));

    if (pass_p->num_workgroups_used) {
        D3D11_BUFFER_DESC bdesc = {
            .BindFlags = D3D11_BIND_CONSTANT_BUFFER,
            .ByteWidth = sizeof(pass_p->last_num_wgs),
        };
        D3D(ID3D11Device_CreateBuffer(p->dev, &bdesc, NULL,
                                      &pass_p->num_workgroups_buf));
    }

    success = true;
error:
    SAFE_RELEASE(cs_blob);
    return success;
}

const struct pl_pass *pl_d3d11_pass_create(pl_gpu gpu,
                                           const struct pl_pass_params *params)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;

    struct pl_pass *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_d3d11);
    pass->params = pl_pass_params_copy(pass, params);

    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);

    pass_p->descriptors = pl_calloc_ptr(pass, params->num_descriptors,
                                        pass_p->descriptors);
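    // Initialize all HLSL register slots to -1 (unassigned). Real slots are
    // only allocated further below, for descriptors that are statically used
    // in each shader stage.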
    for (int i = 0; i < params->num_descriptors; i++) {
        struct pl_desc_d3d11 *desc_p = &pass_p->descriptors[i];
        *desc_p = (struct pl_desc_d3d11) {
            .main = {
                .cbv_slot = -1,
                .srv_slot = -1,
                .sampler_slot = -1,
                .uav_slot = -1,
            },
            .vertex = {
                .cbv_slot = -1,
                .srv_slot = -1,
                .sampler_slot = -1,
            },
        };
    }

    // Compile GLSL to SPIR-V. This also sets `desc_stage.used` based on which
    // resources are statically used in the shader for each pass.
    if (params->type == PL_PASS_RASTER) {
        if (!shader_compile_glsl(gpu, pass, &pass_p->vertex, GLSL_SHADER_VERTEX,
                                 params->vertex_shader))
            goto error;
        if (!shader_compile_glsl(gpu, pass, &pass_p->main, GLSL_SHADER_FRAGMENT,
                                 params->glsl_shader))
            goto error;
    } else {
        if (!shader_compile_glsl(gpu, pass, &pass_p->main, GLSL_SHADER_COMPUTE,
                                 params->glsl_shader))
            goto error;
    }

    // In a raster pass, one of the UAV slots is used by the runtime for the RTV
    int uav_offset = params->type == PL_PASS_COMPUTE ? 0 : 1;
    int max_uavs = p->max_uavs - uav_offset;

    for (int desc_idx = 0; desc_idx < params->num_descriptors; desc_idx++) {
        struct pl_desc *desc = &params->descriptors[desc_idx];
        struct pl_desc_d3d11 *desc_p = &pass_p->descriptors[desc_idx];

        bool has_cbv = false, has_srv = false, has_sampler = false, has_uav = false;

        switch (desc->type) {
        case PL_DESC_SAMPLED_TEX:
            has_sampler = true;
            has_srv = true;
            break;
        case PL_DESC_BUF_STORAGE:
        case PL_DESC_STORAGE_IMG:
        case PL_DESC_BUF_TEXEL_STORAGE:
            if (desc->access == PL_DESC_ACCESS_READONLY) {
                has_srv = true;
            } else {
                has_uav = true;
            }
            break;
        case PL_DESC_BUF_UNIFORM:
            has_cbv = true;
            break;
        case PL_DESC_BUF_TEXEL_UNIFORM:
            has_srv = true;
            break;
        case PL_DESC_INVALID:
        case PL_DESC_TYPE_COUNT:
            pl_unreachable();
        }

        // Allocate HLSL register numbers for each shader stage
        struct d3d_pass_stage *stages[] = { &pass_p->main, &pass_p->vertex };
        for (int j = 0; j < PL_ARRAY_SIZE(stages); j++) {
            struct d3d_pass_stage *pass_s = stages[j];
            struct d3d_desc_stage *desc_s =
                pass_s == &pass_p->vertex ? &desc_p->vertex : &desc_p->main;
            if (!desc_s->used)
                continue;

            if (has_cbv) {
                desc_s->cbv_slot = pass_s->cbvs.num;
                PL_ARRAY_APPEND(pass, pass_s->cbvs, desc_idx);
                if (pass_s->cbvs.num > D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) {
                    PL_ERR(gpu, "Too many constant buffers in shader");
                    goto error;
                }
            }

            if (has_srv) {
                desc_s->srv_slot = pass_s->srvs.num;
                PL_ARRAY_APPEND(pass, pass_s->srvs, desc_idx);
                if (pass_s->srvs.num > p->max_srvs) {
                    PL_ERR(gpu, "Too many SRVs in shader");
                    goto error;
                }
            }

            if (has_sampler) {
                desc_s->sampler_slot = pass_s->samplers.num;
                PL_ARRAY_APPEND(pass, pass_s->samplers, desc_idx);
                if (pass_s->samplers.num > D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT) {
                    PL_ERR(gpu, "Too many samplers in shader");
                    goto error;
                }
            }
        }

        // UAV bindings are shared between all shader stages
        if (has_uav && (desc_p->main.used || desc_p->vertex.used)) {
            desc_p->main.uav_slot = pass_p->uavs.num + uav_offset;
            PL_ARRAY_APPEND(pass, pass_p->uavs, desc_idx);
            if (pass_p->uavs.num > max_uavs) {
                PL_ERR(gpu, "Too many UAVs in shader");
                goto error;
            }
        }
    }

    if (params->type == PL_PASS_COMPUTE) {
        if (!pass_create_compute(gpu, pass, params))
            goto error;
    } else {
        if (!pass_create_raster(gpu, pass, params))
            goto error;
    }

    // Pre-allocate resource arrays to use in pl_pass_run
    pass_p->cbv_arr = pl_calloc(pass,
        PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num),
        sizeof(*pass_p->cbv_arr));
    pass_p->srv_arr = pl_calloc(pass,
        PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num),
        sizeof(*pass_p->srv_arr));
    pass_p->sampler_arr = pl_calloc(pass,
        PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num),
        sizeof(*pass_p->sampler_arr));
    pass_p->uav_arr = pl_calloc(pass, pass_p->uavs.num, sizeof(*pass_p->uav_arr));

    pl_d3d11_flush_message_queue(ctx, "After pass create");

    return pass;

error:
    pl_d3d11_pass_destroy(gpu, pass);
    return NULL;
}

// Shared logic between VS, PS and CS for filling the resource arrays that are
// passed to ID3D11DeviceContext methods
static void fill_resources(pl_gpu gpu, pl_pass pass,
                           struct d3d_pass_stage *pass_s,
                           const struct pl_pass_run_params *params,
                           ID3D11Buffer **cbvs, ID3D11ShaderResourceView **srvs,
                           ID3D11SamplerState **samplers)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);

    for (int i = 0; i < pass_s->cbvs.num; i++) {
        int binding = pass_s->cbvs.elem[i];
        if (binding == HLSL_BINDING_NOT_USED) {
            cbvs[i] = NULL;
            continue;
        } else if (binding == HLSL_BINDING_NUM_WORKGROUPS) {
            cbvs[i] = pass_p->num_workgroups_buf;
            continue;
        }

        pl_buf buf = params->desc_bindings[binding].object;
        pl_d3d11_buf_resolve(gpu, buf);
        struct pl_buf_d3d11 *buf_p = PL_PRIV(buf);
        cbvs[i] = buf_p->buf;
    }

    for (int i = 0; i < pass_s->srvs.num; i++) {
        int binding = pass_s->srvs.elem[i];
        if (binding == HLSL_BINDING_NOT_USED) {
            srvs[i] = NULL;
            continue;
        }

        pl_tex tex;
        struct pl_tex_d3d11 *tex_p;
        pl_buf buf;
        struct pl_buf_d3d11 *buf_p;
        switch (pass->params.descriptors[binding].type) {
        case PL_DESC_SAMPLED_TEX:
        case PL_DESC_STORAGE_IMG:
            tex = params->desc_bindings[binding].object;
            tex_p = PL_PRIV(tex);
            srvs[i] = tex_p->srv;
            break;
        case PL_DESC_BUF_STORAGE:
            buf = params->desc_bindings[binding].object;
            buf_p = PL_PRIV(buf);
            srvs[i] = buf_p->raw_srv;
            break;
        case PL_DESC_BUF_TEXEL_UNIFORM:
        case PL_DESC_BUF_TEXEL_STORAGE:
            buf = params->desc_bindings[binding].object;
            buf_p = PL_PRIV(buf);
            srvs[i] = buf_p->texel_srv;
            break;
        default:
            break;
        }
    }

    for (int i = 0; i < pass_s->samplers.num; i++) {
        int binding = pass_s->samplers.elem[i];
        if (binding == HLSL_BINDING_NOT_USED) {
            samplers[i] = NULL;
            continue;
        }

        struct pl_desc_binding *db = &params->desc_bindings[binding];
        samplers[i] = p->samplers[db->sample_mode][db->address_mode];
    }
}

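// Fill the UAV array used by OMSetRenderTargetsAndUnorderedAccessViews and
// CSSetUnorderedAccessViews. Unlike the other resource types, UAV bindings
// are shared between all shader stages.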
static void fill_uavs(pl_pass pass, const struct pl_pass_run_params *params,
                      ID3D11UnorderedAccessView **uavs)
{
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);

    for (int i = 0; i < pass_p->uavs.num; i++) {
        int binding = pass_p->uavs.elem[i];
        if (binding == HLSL_BINDING_NOT_USED) {
            uavs[i] = NULL;
            continue;
        }

        pl_tex tex;
        struct pl_tex_d3d11 *tex_p;
        pl_buf buf;
        struct pl_buf_d3d11 *buf_p;
        switch (pass->params.descriptors[binding].type) {
        case PL_DESC_BUF_STORAGE:
            buf = params->desc_bindings[binding].object;
            buf_p = PL_PRIV(buf);
            uavs[i] = buf_p->raw_uav;
            break;
        case PL_DESC_STORAGE_IMG:
            tex = params->desc_bindings[binding].object;
            tex_p = PL_PRIV(tex);
            uavs[i] = tex_p->uav;
            break;
        case PL_DESC_BUF_TEXEL_STORAGE:
            buf = params->desc_bindings[binding].object;
            buf_p = PL_PRIV(buf);
            uavs[i] = buf_p->texel_uav;
            break;
        default:
            break;
        }
    }
}

static void pass_run_raster(pl_gpu gpu, const struct pl_pass_run_params *params)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    pl_pass pass = params->pass;
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);

    if (p->fl <= D3D_FEATURE_LEVEL_9_3 && params->index_buf) {
        // Index buffers are unsupported because we can't tell if they are an
        // index buffer or a vertex buffer on creation, and FL9_x allows only
        // one binding type per-buffer
        PL_ERR(gpu, "Index buffers are unsupported in FL9_x");
        return;
    }

    // Figure out how much vertex data to upload, if any
    size_t vertex_alloc = 0;
    if (params->vertex_data) {
        int num_vertices = 0;
        if (params->index_data) {
            // Indexed draw, so we need to store all indexed vertices
            for (int i = 0; i < params->vertex_count; i++)
                num_vertices = PL_MAX(num_vertices, params->index_data[i] + 1);
        } else {
            num_vertices = params->vertex_count;
        }
        vertex_alloc = num_vertices * pass->params.vertex_stride;
    }

    // Figure out how much index data to upload, if any
    size_t index_alloc = 0;
    if (params->index_data)
        index_alloc = params->vertex_count * sizeof(uint16_t);

    // Upload vertex data. On >=FL10_0 we use the same buffer for index data, so
    // upload that too.
    bool share_vertex_index_buf = p->fl > D3D_FEATURE_LEVEL_9_3;
    if (vertex_alloc || (share_vertex_index_buf && index_alloc)) {
        struct stream_buf_slice slices[] = {
            { .data = params->vertex_data, .size = vertex_alloc },
            { .data = params->index_data, .size = index_alloc },
        };

        if (!stream_buf_upload(gpu, &p->vbuf, slices,
                               share_vertex_index_buf ? 2 : 1)) {
            PL_ERR(gpu, "Failed to upload vertex data");
            return;
        }

        if (vertex_alloc) {
            ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &p->vbuf.buf,
                &(UINT) { pass->params.vertex_stride }, &slices[0].offset);
        }
        if (share_vertex_index_buf && index_alloc) {
            ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->vbuf.buf,
                DXGI_FORMAT_R16_UINT, slices[1].offset);
        }
    }

    // Upload index data for <=FL9_3, which must be in its own buffer
    if (!share_vertex_index_buf && index_alloc) {
        struct stream_buf_slice slices[] = {
            { .data = params->index_data, .size = index_alloc },
        };

        if (!stream_buf_upload(gpu, &p->ibuf, slices, PL_ARRAY_SIZE(slices))) {
            PL_ERR(gpu, "Failed to upload index data");
            return;
        }

        ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->ibuf.buf,
            DXGI_FORMAT_R16_UINT, slices[0].offset);
    }

    if (params->vertex_buf) {
        struct pl_buf_d3d11 *buf_p = PL_PRIV(params->vertex_buf);
        ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &buf_p->buf,
            &(UINT) { pass->params.vertex_stride },
            &(UINT) { params->buf_offset });
    }

    if (params->index_buf) {
        struct pl_buf_d3d11 *buf_p = PL_PRIV(params->index_buf);
        ID3D11DeviceContext_IASetIndexBuffer(p->imm, buf_p->buf,
            DXGI_FORMAT_R16_UINT, params->index_offset);
    }

    ID3D11DeviceContext_IASetInputLayout(p->imm, pass_p->layout);

    static const D3D_PRIMITIVE_TOPOLOGY prim_topology[] = {
        [PL_PRIM_TRIANGLE_LIST] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST,
        [PL_PRIM_TRIANGLE_STRIP] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP,
    };
    ID3D11DeviceContext_IASetPrimitiveTopology(p->imm,
        prim_topology[pass->params.vertex_type]);

    ID3D11DeviceContext_VSSetShader(p->imm, pass_p->vs, NULL, 0);

    ID3D11Buffer **cbvs = pass_p->cbv_arr;
    ID3D11ShaderResourceView **srvs = pass_p->srv_arr;
    ID3D11SamplerState **samplers = pass_p->sampler_arr;
    ID3D11UnorderedAccessView **uavs = pass_p->uav_arr;

    // Set vertex shader resources. The device context is called conditionally
    // because the debug layer complains if these are called with 0 resources.
    fill_resources(gpu, pass, &pass_p->vertex, params, cbvs, srvs, samplers);
    if (pass_p->vertex.cbvs.num)
        ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs);
    if (pass_p->vertex.srvs.num)
        ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs);
    if (pass_p->vertex.samplers.num)
        ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers);

    ID3D11DeviceContext_RSSetState(p->imm, p->rstate);
    ID3D11DeviceContext_RSSetViewports(p->imm, 1, (&(D3D11_VIEWPORT) {
        .TopLeftX = params->viewport.x0,
        .TopLeftY = params->viewport.y0,
        .Width = pl_rect_w(params->viewport),
        .Height = pl_rect_h(params->viewport),
        .MinDepth = 0,
        .MaxDepth = 1,
    }));
    ID3D11DeviceContext_RSSetScissorRects(p->imm, 1, (&(D3D11_RECT) {
        .left = params->scissors.x0,
        .top = params->scissors.y0,
        .right = params->scissors.x1,
        .bottom = params->scissors.y1,
    }));

    ID3D11DeviceContext_PSSetShader(p->imm, pass_p->ps, NULL, 0);

    // Set pixel shader resources
    fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers);
    if (pass_p->main.cbvs.num)
        ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
    if (pass_p->main.srvs.num)
        ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
    if (pass_p->main.samplers.num)
        ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);

    ID3D11DeviceContext_OMSetBlendState(p->imm, pass_p->bstate, NULL,
                                        D3D11_DEFAULT_SAMPLE_MASK);
    ID3D11DeviceContext_OMSetDepthStencilState(p->imm, p->dsstate, 0);

    fill_uavs(pass, params, uavs);

    struct pl_tex_d3d11 *target_p = PL_PRIV(params->target);
    ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews(
        p->imm, 1, &target_p->rtv, NULL, 1, pass_p->uavs.num, uavs, NULL);

    if (params->index_data || params->index_buf) {
        ID3D11DeviceContext_DrawIndexed(p->imm, params->vertex_count, 0, 0);
    } else {
        ID3D11DeviceContext_Draw(p->imm, params->vertex_count, 0);
    }

    // Unbind everything. It's easier to do this than to actually track state,
    // and if we leave the RTV bound, it could trip up D3D's conflict checker.
    // Also, apparently unbinding SRVs can prevent a 10level9 bug?
    // https://docs.microsoft.com/en-us/windows/win32/direct3d11/overviews-direct3d-11-devices-downlevel-prevent-null-srvs
    for (int i = 0; i < PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num); i++)
        cbvs[i] = NULL;
    for (int i = 0; i < PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num); i++)
        srvs[i] = NULL;
    for (int i = 0; i < PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num); i++)
        samplers[i] = NULL;
    for (int i = 0; i < pass_p->uavs.num; i++)
        uavs[i] = NULL;
    if (pass_p->vertex.cbvs.num)
        ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs);
    if (pass_p->vertex.srvs.num)
        ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs);
    if (pass_p->vertex.samplers.num)
        ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers);
    if (pass_p->main.cbvs.num)
        ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
    if (pass_p->main.srvs.num)
        ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
    if (pass_p->main.samplers.num)
        ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);
    ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews(
        p->imm, 0, NULL, NULL, 1, pass_p->uavs.num, uavs, NULL);
}

static void pass_run_compute(pl_gpu gpu, const struct pl_pass_run_params *params)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    pl_pass pass = params->pass;
    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);

    // Update gl_NumWorkGroups emulation buffer if necessary
    if (pass_p->num_workgroups_used) {
        bool needs_update = false;
        for (int i = 0; i < 3; i++) {
            if (pass_p->last_num_wgs.num_wgs[i] != params->compute_groups[i])
                needs_update = true;
            pass_p->last_num_wgs.num_wgs[i] = params->compute_groups[i];
        }

        if (needs_update) {
            ID3D11DeviceContext_UpdateSubresource(p->imm,
                (ID3D11Resource *) pass_p->num_workgroups_buf, 0, NULL,
                &pass_p->last_num_wgs, 0, 0);
        }
    }

    ID3D11DeviceContext_CSSetShader(p->imm, pass_p->cs, NULL, 0);

    ID3D11Buffer **cbvs = pass_p->cbv_arr;
    ID3D11ShaderResourceView **srvs = pass_p->srv_arr;
    ID3D11UnorderedAccessView **uavs = pass_p->uav_arr;
    ID3D11SamplerState **samplers = pass_p->sampler_arr;

    fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers);
    fill_uavs(pass, params, uavs);

    if (pass_p->main.cbvs.num)
        ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
    if (pass_p->main.srvs.num)
        ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
    if (pass_p->main.samplers.num)
        ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);
    if (pass_p->uavs.num)
        ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL);

    ID3D11DeviceContext_Dispatch(p->imm, params->compute_groups[0],
                                         params->compute_groups[1],
                                         params->compute_groups[2]);

    // Unbind everything
    for (int i = 0; i < pass_p->main.cbvs.num; i++)
        cbvs[i] = NULL;
    for (int i = 0; i < pass_p->main.srvs.num; i++)
        srvs[i] = NULL;
    for (int i = 0; i < pass_p->main.samplers.num; i++)
        samplers[i] = NULL;
    for (int i = 0; i < pass_p->uavs.num; i++)
        uavs[i] = NULL;
    if (pass_p->main.cbvs.num)
        ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs);
    if (pass_p->main.srvs.num)
        ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs);
    if (pass_p->main.samplers.num)
        ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers);
    if (pass_p->uavs.num)
        ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL);
}

void pl_d3d11_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    pl_pass pass = params->pass;

    pl_d3d11_timer_start(gpu, params->timer);

    if (pass->params.type == PL_PASS_COMPUTE) {
        pass_run_compute(gpu, params);
    } else {
        pass_run_raster(gpu, params);
    }

    pl_d3d11_timer_end(gpu, params->timer);
    pl_d3d11_flush_message_queue(ctx, "After pass run");
}