1 /*
2  * This file is part of libplacebo.
3  *
4  * libplacebo is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * libplacebo is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <windows.h>
19 #include <versionhelpers.h>
20 
21 #include "gpu.h"
22 #include "formats.h"
23 #include "glsl/spirv.h"
24 
// Local definitions of two Direct3D/DXGI flag bits used by this backend.
// NOTE(review): presumably provided here because some toolchain headers do
// not declare them - confirm before removing.
//
// Flag bit for typed UAV store support.
// NOTE(review): name suggests it belongs to D3D11_FORMAT_SUPPORT2 - confirm
#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE (0x80)
// Bit tested against DXGI_ADAPTER_DESC3.Flags to detect monitored fences
#define DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES (0x8)
27 
28 struct timer_query {
29     ID3D11Query *ts_start;
30     ID3D11Query *ts_end;
31     ID3D11Query *disjoint;
32 };
33 
34 struct pl_timer {
35     // Ring buffer of timer queries to use
36     int current;
37     int pending;
38     struct timer_query queries[16];
39 };
40 
// Begin a timed region on the immediate context, using the timer's current
// ring buffer slot. Does nothing when `timer` is NULL. If the query objects
// for the slot cannot be created, the slot is left empty (all three pointers
// NULL) and nothing is issued.
void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx; // referenced by the D3D() macro
    struct timer_query *query;

    if (timer == NULL)
        return;
    query = timer->queries + timer->current;

    // Query objects are created lazily, the first time a slot gets used
    const bool slot_ready = query->ts_start != NULL;
    if (!slot_ready) {
        // A duration measurement in D3D11 needs three queries: one timestamp
        // at the start, one at the end, and a "disjoint" query wrapped around
        // both. The disjoint query reports whether a discontinuity (e.g. a
        // power state or clock speed change) made the timestamps unusable,
        // and it also carries the timer frequency, without which the raw
        // timestamps are meaningless.
        D3D(ID3D11Device_CreateQuery(p->dev,
            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_start));
        D3D(ID3D11Device_CreateQuery(p->dev,
            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_end));
        D3D(ID3D11Device_CreateQuery(p->dev,
            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &query->disjoint));
    }

    // Open the disjoint query, then record the start timestamp
    ID3D11DeviceContext_Begin(p->imm, (ID3D11Asynchronous *) query->disjoint);
    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_start);
    return;

error:
    // Drop any partially created set, so that pl_d3d11_timer_end() sees an
    // empty slot (ts_start == NULL) and skips it
    SAFE_RELEASE(query->ts_start);
    SAFE_RELEASE(query->ts_end);
    SAFE_RELEASE(query->disjoint);
}
77 
// End the timed region opened by pl_d3d11_timer_start() and advance the ring
// buffer. Does nothing when `timer` is NULL or when the current slot has no
// query objects.
void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);

    if (!timer)
        return;

    // timer_start may have failed to create the query objects, even when
    // timer_start and timer_end are called in order
    struct timer_query *slot = timer->queries + timer->current;
    if (slot->ts_start == NULL)
        return;

    // Record the end timestamp and close the disjoint query
    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) slot->ts_end);
    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) slot->disjoint);

    // Move on to the next slot (wrapping around) for the next timer_start
    const int capacity = PL_ARRAY_SIZE(timer->queries);
    timer->current = (timer->current + 1) % capacity;

    // One more result is waiting to be read. If the ring is already full,
    // `current` now points at the oldest entry, which simply gets dropped
    // and reused, so the count stays capped.
    if (timer->pending < capacity)
        timer->pending++;
}
106 
// Convert a tick count to nanoseconds, given the tick frequency in Hz.
//
// The whole-second and sub-second parts are converted separately so that the
// multiplication by 1e9 does not overflow for large tick counts. The result
// is truncated towards zero.
//
// Returns 0 when `freq` is 0, since no meaningful conversion exists and the
// division would otherwise be undefined behavior.
static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq)
{
    static const uint64_t ns_per_s = 1000000000llu;

    if (freq == 0)
        return 0;

    const uint64_t whole_seconds = timestamp / freq;
    const uint64_t leftover_ticks = timestamp % freq;
    return whole_seconds * ns_per_s + leftover_ticks * ns_per_s / freq;
}
112 
d3d11_timer_query(pl_gpu gpu,pl_timer timer)113 static uint64_t d3d11_timer_query(pl_gpu gpu, pl_timer timer)
114 {
115     struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
116     struct d3d11_ctx *ctx = p->ctx;
117     HRESULT hr;
118 
119     for (; timer->pending > 0; timer->pending--) {
120         int index = timer->current - timer->pending;
121         if (index < 0)
122             index += PL_ARRAY_SIZE(timer->queries);
123         struct timer_query *query = &timer->queries[index];
124 
125         UINT64 start, end;
126         D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj;
127 
128         // Fetch the results of each query, or on S_FALSE, return 0 to indicate
129         // the queries are still pending
130         D3D(hr = ID3D11DeviceContext_GetData(p->imm,
131             (ID3D11Asynchronous *) query->disjoint, &dj, sizeof(dj),
132             D3D11_ASYNC_GETDATA_DONOTFLUSH));
133         if (hr == S_FALSE)
134             return 0;
135         D3D(hr = ID3D11DeviceContext_GetData(p->imm,
136             (ID3D11Asynchronous *) query->ts_end, &end, sizeof(end),
137             D3D11_ASYNC_GETDATA_DONOTFLUSH));
138         if (hr == S_FALSE)
139             return 0;
140         D3D(hr = ID3D11DeviceContext_GetData(p->imm,
141             (ID3D11Asynchronous *) query->ts_start, &start, sizeof(start),
142             D3D11_ASYNC_GETDATA_DONOTFLUSH));
143         if (hr == S_FALSE)
144             return 0;
145 
146         // There was a discontinuity during the queries, so a timestamp can't be
147         // produced. Skip it and try the next one.
148         if (dj.Disjoint || !dj.Frequency)
149             continue;
150 
151         // We got a result. Return it to the caller.
152         timer->pending--;
153         pl_d3d11_flush_message_queue(ctx, "After timer query");
154 
155         uint64_t ns = timestamp_to_ns(end - start, dj.Frequency);
156         return PL_MAX(ns, 1);
157 
158     error:
159         // There was an error fetching the timer result, so skip it and try the
160         // next one
161         continue;
162     }
163 
164     // No more unprocessed results
165     return 0;
166 }
167 
// Release every query object held by the timer's ring buffer, then free the
// timer itself.
static void d3d11_timer_destroy(pl_gpu gpu, pl_timer timer)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;

    struct timer_query *const first = timer->queries;
    struct timer_query *const past_end = first + PL_ARRAY_SIZE(timer->queries);
    for (struct timer_query *slot = first; slot < past_end; slot++) {
        SAFE_RELEASE(slot->ts_start);
        SAFE_RELEASE(slot->ts_end);
        SAFE_RELEASE(slot->disjoint);
    }

    pl_d3d11_flush_message_queue(ctx, "After timer destroy");

    pl_free(timer);
}
183 
d3d11_timer_create(pl_gpu gpu)184 static pl_timer d3d11_timer_create(pl_gpu gpu)
185 {
186     struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
187     if (!p->has_timestamp_queries)
188         return NULL;
189 
190     struct pl_timer *timer = pl_alloc_ptr(NULL, timer);
191     *timer = (struct pl_timer) {0};
192     return timer;
193 }
194 
// Report the binding namespace for a descriptor type. Every type maps to the
// same namespace (0), i.e. Vulkan-style binding: this is required in order to
// use SPIRV-Cross' HLSL resource mapping API, which targets resources by
// binding number.
static int d3d11_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
{
    enum { shared_namespace = 0 };
    return shared_namespace;
}
202 
// Flush the immediate context, then flush the message queue.
static void d3d11_gpu_flush(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);

    ID3D11DeviceContext_Flush(p->imm);
    pl_d3d11_flush_message_queue(p->ctx, "After gpu flush");
}
211 
// Block until the GPU has finished all previously submitted work.
//
// Two strategies exist, chosen by whether a fence object is available:
// waiting on a fence signalled from the immediate context, or polling an
// event query. On a D3D error the function returns early.
static void d3d11_gpu_finish(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    HRESULT hr;

    if (!p->finish_fence) {
        // Fallback path: mark the end of the command stream with the query
        ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) p->finish_query);

        // D3D11 has no blocking queries, but readback does block. As a
        // performance hack to try to avoid polling, perform a dummy
        // copy/readback between two tiny buffers. With luck this blocks
        // until all earlier commands are done, in which case the very first
        // GetData call below already succeeds.
        pl_buf_copy(gpu, p->finish_buf_dst, 0, p->finish_buf_src, 0, sizeof(uint32_t));
        pl_buf_read(gpu, p->finish_buf_dst, 0, &(uint32_t) {0}, sizeof(uint32_t));

        // Poll the event query until it reports completion
        bool done = false;
        while (!done) {
            BOOL idle;
            D3D(hr = ID3D11DeviceContext_GetData(p->imm,
                (ID3D11Asynchronous *) p->finish_query, &idle, sizeof(idle), 0));
            done = (hr == S_OK && idle);
            if (!done)
                Sleep(1);
        }
    } else {
        // Fence path: arm the event for a new fence value, signal that value
        // from the immediate context, submit, and wait for the event
        p->finish_value++;
        D3D(ID3D11Fence_SetEventOnCompletion(p->finish_fence, p->finish_value,
                                             p->finish_event));
        ID3D11DeviceContext4_Signal(p->imm4, p->finish_fence, p->finish_value);
        ID3D11DeviceContext_Flush(p->imm);
        WaitForSingleObject(p->finish_event, INFINITE);
    }

    pl_d3d11_flush_message_queue(ctx, "After gpu finish");

error:
    return;
}
252 
d3d11_gpu_is_failed(pl_gpu gpu)253 static bool d3d11_gpu_is_failed(pl_gpu gpu)
254 {
255     struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
256     struct d3d11_ctx *ctx = p->ctx;
257 
258     if (ctx->is_failed)
259         return true;
260 
261     // GetDeviceRemovedReason returns S_OK if the device isn't removed
262     HRESULT hr = ID3D11Device_GetDeviceRemovedReason(p->dev);
263     if (FAILED(hr)) {
264         ctx->is_failed = true;
265         pl_d3d11_after_error(ctx, hr);
266     }
267 
268     return ctx->is_failed;
269 }
270 
// Tear down a GPU object created by pl_gpu_create_d3d11().
//
// This is also used to clean up after a failed or partial construction, so
// every member may still be NULL/zero here and must be handled accordingly.
static void d3d11_destroy_gpu(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);

    pl_buf_destroy(gpu, &p->finish_buf_src);
    pl_buf_destroy(gpu, &p->finish_buf_dst);
    pl_dispatch_destroy(&p->dp);

    // Release everything except the immediate context
    SAFE_RELEASE(p->dev);
    SAFE_RELEASE(p->dev1);
    SAFE_RELEASE(p->dev5);
    SAFE_RELEASE(p->imm1);
    SAFE_RELEASE(p->imm4);
    SAFE_RELEASE(p->vbuf.buf);
    SAFE_RELEASE(p->ibuf.buf);
    SAFE_RELEASE(p->rstate);
    SAFE_RELEASE(p->dsstate);
    for (int i = 0; i < PL_TEX_SAMPLE_MODE_COUNT; i++) {
        for (int j = 0; j < PL_TEX_ADDRESS_MODE_COUNT; j++) {
            SAFE_RELEASE(p->samplers[i][j]);
        }
    }
    SAFE_RELEASE(p->finish_fence);
    if (p->finish_event)
        CloseHandle(p->finish_event);
    SAFE_RELEASE(p->finish_query);

    // Destroy the immediate context synchronously so referenced objects don't
    // show up in the leak check. The context may be missing if construction
    // failed before it was obtained, so it must not be dereferenced blindly.
    if (p->imm) {
        ID3D11DeviceContext_ClearState(p->imm);
        ID3D11DeviceContext_Flush(p->imm);
        SAFE_RELEASE(p->imm);
    }

    // The SPIR-V compiler is created by the constructor and owned by this
    // object. It is destroyed only after the dispatch object is gone.
    spirv_compiler_destroy(&p->spirv);

    pl_free((void *) gpu);
}
307 
load_d3d_compiler(pl_gpu gpu)308 static bool load_d3d_compiler(pl_gpu gpu)
309 {
310     struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
311     HMODULE d3dcompiler = NULL;
312 
313     static const struct {
314         const wchar_t *name;
315         bool inbox;
316     } compiler_dlls[] = {
317         // Try the inbox D3DCompiler first (Windows 8.1 and up)
318         { .name = L"d3dcompiler_47.dll", .inbox = true },
319         // Check for a packaged version of d3dcompiler_47.dll
320         { .name = L"d3dcompiler_47.dll" },
321         // Try d3dcompiler_46.dll from the Windows 8 SDK
322         { .name = L"d3dcompiler_46.dll" },
323         // Try d3dcompiler_43.dll from the June 2010 DirectX SDK
324         { .name = L"d3dcompiler_43.dll" },
325     };
326 
327     for (int i = 0; i < PL_ARRAY_SIZE(compiler_dlls); i++) {
328         if (compiler_dlls[i].inbox) {
329             if (!IsWindows8Point1OrGreater())
330                 continue;
331             d3dcompiler = LoadLibraryExW(compiler_dlls[i].name, NULL,
332                                          LOAD_LIBRARY_SEARCH_SYSTEM32);
333         } else {
334             d3dcompiler = LoadLibraryW(compiler_dlls[i].name);
335         }
336         if (!d3dcompiler)
337             continue;
338 
339         p->D3DCompile = (void *) GetProcAddress(d3dcompiler, "D3DCompile");
340         if (!p->D3DCompile)
341             return false;
342         p->d3d_compiler_ver = pl_get_dll_version(compiler_dlls[i].name);
343 
344         return true;
345     }
346 
347     return false;
348 }
349 
350 static struct pl_gpu_fns pl_fns_d3d11 = {
351     .tex_create             = pl_d3d11_tex_create,
352     .tex_destroy            = pl_d3d11_tex_destroy,
353     .tex_invalidate         = pl_d3d11_tex_invalidate,
354     .tex_clear_ex           = pl_d3d11_tex_clear_ex,
355     .tex_blit               = pl_d3d11_tex_blit,
356     .tex_upload             = pl_d3d11_tex_upload,
357     .tex_download           = pl_d3d11_tex_download,
358     .buf_create             = pl_d3d11_buf_create,
359     .buf_destroy            = pl_d3d11_buf_destroy,
360     .buf_write              = pl_d3d11_buf_write,
361     .buf_read               = pl_d3d11_buf_read,
362     .buf_copy               = pl_d3d11_buf_copy,
363     .desc_namespace         = d3d11_desc_namespace,
364     .pass_create            = pl_d3d11_pass_create,
365     .pass_destroy           = pl_d3d11_pass_destroy,
366     .pass_run               = pl_d3d11_pass_run,
367     .timer_create           = d3d11_timer_create,
368     .timer_destroy          = d3d11_timer_destroy,
369     .timer_query            = d3d11_timer_query,
370     .gpu_flush              = d3d11_gpu_flush,
371     .gpu_finish             = d3d11_gpu_finish,
372     .gpu_is_failed          = d3d11_gpu_is_failed,
373     .destroy                = d3d11_destroy_gpu,
374 };
375 
pl_gpu_create_d3d11(struct d3d11_ctx * ctx)376 pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx)
377 {
378     pl_assert(ctx->dev);
379     IDXGIDevice1 *dxgi_dev = NULL;
380     IDXGIAdapter1 *adapter = NULL;
381     IDXGIAdapter4 *adapter4 = NULL;
382     bool success = false;
383     HRESULT hr;
384 
385     struct pl_gpu *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gpu_d3d11);
386     gpu->log = ctx->log;
387     gpu->ctx = gpu->log;
388 
389     struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
390     *p = (struct pl_gpu_d3d11) {
391         .ctx = ctx,
392         .impl = pl_fns_d3d11,
393         .dev = ctx->dev,
394         .spirv = spirv_compiler_create(ctx->log),
395         .vbuf.bind_flags = D3D11_BIND_VERTEX_BUFFER,
396         .ibuf.bind_flags = D3D11_BIND_INDEX_BUFFER,
397     };
398     if (!p->spirv)
399         goto error;
400 
401     ID3D11Device_AddRef(p->dev);
402     ID3D11Device_GetImmediateContext(p->dev, &p->imm);
403 
404     int minor = 0; // The Direct3D 11 minor version number
405 
406     // Check D3D11.1 interfaces
407     hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1,
408                                      (void **) &p->dev1);
409     if (SUCCEEDED(hr)) {
410         minor = 1;
411         ID3D11Device1_GetImmediateContext1(p->dev1, &p->imm1);
412     }
413 
414     // Check D3D11.4 interfaces
415     hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device5,
416                                      (void **) &p->dev5);
417     if (SUCCEEDED(hr)) {
418         // There is no GetImmediateContext4 method
419         hr = ID3D11DeviceContext_QueryInterface(p->imm, &IID_ID3D11DeviceContext4,
420                                                 (void **) &p->imm4);
421         if (SUCCEEDED(hr))
422             minor = 4;
423     }
424 
425     PL_INFO(gpu, "Using Direct3D 11.%d runtime", minor);
426 
427     D3D(ID3D11Device_QueryInterface(p->dev, &IID_IDXGIDevice1, (void **) &dxgi_dev));
428     D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter));
429 
430     DXGI_ADAPTER_DESC1 adapter_desc = {0};
431     IDXGIAdapter1_GetDesc1(adapter, &adapter_desc);
432 
433     // No resource can be larger than max_res_size in bytes
434     unsigned int max_res_size = PL_CLAMP(
435         D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_B_TERM * adapter_desc.DedicatedVideoMemory,
436         D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_A_TERM * 1024u * 1024u,
437         D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_C_TERM * 1024u * 1024u);
438 
439     gpu->glsl = (struct pl_glsl_version) {
440         .version = 450,
441         .vulkan = true,
442     };
443 
444     gpu->limits = (struct pl_gpu_limits) {
445         .max_buf_size = max_res_size,
446         .max_ssbo_size = max_res_size,
447         .max_vbo_size = max_res_size,
448 
449         // Make up some values
450         .align_tex_xfer_offset = 32,
451         .align_tex_xfer_stride = 1,
452         .fragment_queues = 1,
453     };
454 
455     p->fl = ID3D11Device_GetFeatureLevel(p->dev);
456 
457     // If we're not using FL9_x, we can use the same suballocated buffer as a
458     // vertex buffer and index buffer
459     if (p->fl >= D3D_FEATURE_LEVEL_10_0)
460         p->vbuf.bind_flags |= D3D11_BIND_INDEX_BUFFER;
461 
462     if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
463         gpu->limits.max_ubo_size = D3D11_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * CBUF_ELEM;
464     } else {
465         // 10level9 restriction:
466         // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
467         gpu->limits.max_ubo_size = 255 * CBUF_ELEM;
468     }
469 
470     if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
471         gpu->limits.max_tex_1d_dim = D3D11_REQ_TEXTURE1D_U_DIMENSION;
472         gpu->limits.max_tex_2d_dim = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION;
473         gpu->limits.max_tex_3d_dim = D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
474     } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
475         gpu->limits.max_tex_1d_dim = D3D10_REQ_TEXTURE1D_U_DIMENSION;
476         gpu->limits.max_tex_2d_dim = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
477         gpu->limits.max_tex_3d_dim = D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
478     } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) {
479         gpu->limits.max_tex_2d_dim = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION;
480         // Same limit as FL9_1
481         gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
482     } else {
483         gpu->limits.max_tex_2d_dim = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION;
484         gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
485     }
486 
487     if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
488         gpu->limits.max_buffer_texels =
489             1 << D3D11_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
490     }
491 
492     if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
493         gpu->glsl.compute = true;
494         gpu->limits.compute_queues = 1;
495         // Set `gpu->limits.blittable_1d_3d`, since `pl_tex_blit_compute`, which
496         // is used to emulate blits on 11_0 and up, supports 1D and 3D textures
497         gpu->limits.blittable_1d_3d = true;
498 
499         gpu->glsl.max_shmem_size = D3D11_CS_TGSM_REGISTER_COUNT * sizeof(float);
500         gpu->glsl.max_group_threads = D3D11_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP;
501         gpu->glsl.max_group_size[0] = D3D11_CS_THREAD_GROUP_MAX_X;
502         gpu->glsl.max_group_size[1] = D3D11_CS_THREAD_GROUP_MAX_Y;
503         gpu->glsl.max_group_size[2] = D3D11_CS_THREAD_GROUP_MAX_Z;
504         gpu->limits.max_dispatch[0] = gpu->limits.max_dispatch[1] =
505             gpu->limits.max_dispatch[2] =
506             D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION;
507     }
508 
509     if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
510         // The offset limits are defined by HLSL:
511         // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4-po--sm5---asm-
512         gpu->glsl.min_gather_offset = -32;
513         gpu->glsl.max_gather_offset = 31;
514     } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) {
515         // SM4.1 has no gather4_po, so the offset must be specified by an
516         // immediate with a range of [-8, 7]
517         // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4--sm4-1---asm-
518         // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sample--sm4---asm-#address-offset
519         gpu->glsl.min_gather_offset = -8;
520         gpu->glsl.max_gather_offset = 7;
521     }
522 
523     if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
524         p->max_srvs = D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT;
525     } else {
526         // 10level9 restriction:
527         // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
528         p->max_srvs = 8;
529     }
530 
531     if (p->fl >= D3D_FEATURE_LEVEL_11_1) {
532         p->max_uavs = D3D11_1_UAV_SLOT_COUNT;
533     } else {
534         p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT;
535     }
536 
537     if (!load_d3d_compiler(gpu)) {
538         PL_FATAL(gpu, "Could not find D3DCompiler DLL");
539         goto error;
540     }
541     PL_INFO(gpu, "D3DCompiler version: %u.%u.%u.%u",
542             p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor,
543             p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision);
544 
545     // Detect support for timestamp queries. Some FL9_x devices don't support them.
546     hr = ID3D11Device_CreateQuery(p->dev,
547         &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL);
548     p->has_timestamp_queries = SUCCEEDED(hr);
549 
550     pl_d3d11_setup_formats(gpu);
551 
552     // The rasterizer state never changes, so create it here
553     D3D11_RASTERIZER_DESC rdesc = {
554         .FillMode = D3D11_FILL_SOLID,
555         .CullMode = D3D11_CULL_NONE,
556         .FrontCounterClockwise = FALSE,
557         .DepthClipEnable = TRUE, // Required for 10level9
558         .ScissorEnable = TRUE,
559     };
560     D3D(ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &p->rstate));
561 
562     // The depth stencil state never changes either, and we only set it to turn
563     // depth testing off so the debug layer doesn't complain about an unbound
564     // depth buffer
565     D3D11_DEPTH_STENCIL_DESC dsdesc = {
566         .DepthEnable = FALSE,
567         .DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL,
568         .DepthFunc = D3D11_COMPARISON_LESS,
569         .StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK,
570         .StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK,
571         .FrontFace = {
572             .StencilFailOp = D3D11_STENCIL_OP_KEEP,
573             .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
574             .StencilPassOp = D3D11_STENCIL_OP_KEEP,
575             .StencilFunc = D3D11_COMPARISON_ALWAYS,
576         },
577         .BackFace = {
578             .StencilFailOp = D3D11_STENCIL_OP_KEEP,
579             .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
580             .StencilPassOp = D3D11_STENCIL_OP_KEEP,
581             .StencilFunc = D3D11_COMPARISON_ALWAYS,
582         },
583     };
584     D3D(ID3D11Device_CreateDepthStencilState(p->dev, &dsdesc, &p->dsstate));
585 
586     // Initialize the samplers
587     for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) {
588         for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++) {
589             static const D3D11_TEXTURE_ADDRESS_MODE d3d_address_mode[] = {
590                 [PL_TEX_ADDRESS_CLAMP] = D3D11_TEXTURE_ADDRESS_CLAMP,
591                 [PL_TEX_ADDRESS_REPEAT] = D3D11_TEXTURE_ADDRESS_WRAP,
592                 [PL_TEX_ADDRESS_MIRROR] = D3D11_TEXTURE_ADDRESS_MIRROR,
593             };
594             static const D3D11_FILTER d3d_filter[] = {
595                 [PL_TEX_SAMPLE_NEAREST] = D3D11_FILTER_MIN_MAG_MIP_POINT,
596                 [PL_TEX_SAMPLE_LINEAR] = D3D11_FILTER_MIN_MAG_MIP_LINEAR,
597             };
598 
599             D3D11_SAMPLER_DESC sdesc = {
600                 .AddressU = d3d_address_mode[address_mode],
601                 .AddressV = d3d_address_mode[address_mode],
602                 .AddressW = d3d_address_mode[address_mode],
603                 .ComparisonFunc = D3D11_COMPARISON_NEVER,
604                 .MinLOD = 0,
605                 .MaxLOD = D3D11_FLOAT32_MAX,
606                 .MaxAnisotropy = 1,
607                 .Filter = d3d_filter[sample_mode],
608             };
609             D3D(ID3D11Device_CreateSamplerState(p->dev, &sdesc,
610                 &p->samplers[sample_mode][address_mode]));
611         }
612     }
613 
614     hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter4,
615                                       (void **) &adapter4);
616     if (SUCCEEDED(hr)) {
617         DXGI_ADAPTER_DESC3 adapter_desc3 = {0};
618         IDXGIAdapter4_GetDesc3(adapter4, &adapter_desc3);
619 
620         p->has_monitored_fences =
621             adapter_desc3.Flags & DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES;
622     }
623 
624     // Try to create a D3D11.4 fence object to wait on in pl_gpu_finish()
625     if (p->dev5 && p->has_monitored_fences) {
626         hr = ID3D11Device5_CreateFence(p->dev5, 0, D3D11_FENCE_FLAG_NONE,
627                                        &IID_ID3D11Fence,
628                                        (void **) p->finish_fence);
629         if (SUCCEEDED(hr)) {
630             p->finish_event = CreateEventW(NULL, FALSE, FALSE, NULL);
631             if (!p->finish_event) {
632                 PL_ERR(gpu, "Failed to create finish() event");
633                 goto error;
634             }
635         }
636     }
637 
638     // If fences are not available, we will have to poll a event query instead
639     if (!p->finish_fence) {
640         // Buffers for dummy copy/readback (see d3d11_gpu_finish())
641         p->finish_buf_src = pl_buf_create(gpu, &(struct pl_buf_params) {
642             .size = sizeof(uint32_t),
643             .drawable = true, // Make these vertex buffers for 10level9
644             .initial_data = &(uint32_t) {0x11223344},
645         });
646         p->finish_buf_dst = pl_buf_create(gpu, &(struct pl_buf_params) {
647             .size = sizeof(uint32_t),
648             .host_readable = true,
649             .drawable = true,
650         });
651 
652         D3D(ID3D11Device_CreateQuery(p->dev,
653             &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &p->finish_query));
654     }
655 
656     // Create the dispatch last, after any setup of `gpu` is done
657     p->dp = pl_dispatch_create(ctx->log, gpu);
658 
659     pl_d3d11_flush_message_queue(ctx, "After gpu create");
660 
661     success = true;
662 error:
663     SAFE_RELEASE(dxgi_dev);
664     SAFE_RELEASE(adapter);
665     if (success) {
666         return pl_gpu_finalize(gpu);
667     } else {
668         d3d11_destroy_gpu(gpu);
669         return NULL;
670     }
671 }
672