1 /*
2 * This file is part of libplacebo.
3 *
4 * libplacebo is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * libplacebo is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <windows.h>
19 #include <versionhelpers.h>
20
21 #include "gpu.h"
22 #include "formats.h"
23 #include "glsl/spirv.h"
24
// Locally defined flag values.
// NOTE(review): presumably defined here because some SDK/MinGW headers do not
// provide them -- confirm against the supported toolchains.

// NOTE(review): looks like a D3D11_FORMAT_SUPPORT2 bit (typed UAV store);
// not referenced elsewhere in this file -- confirm where it is consumed.
#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE (0x80)
// Tested against DXGI_ADAPTER_DESC3.Flags in pl_gpu_create_d3d11() to decide
// whether monitored fences are available
#define DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES (0x8)
27
// The set of D3D11 query objects used for one timed region, i.e. one
// pl_d3d11_timer_start()/pl_d3d11_timer_end() pair
struct timer_query {
    // Timestamp query issued in pl_d3d11_timer_start()
    ID3D11Query *ts_start;
    // Timestamp query issued in pl_d3d11_timer_end()
    ID3D11Query *ts_end;
    // D3D11_QUERY_TIMESTAMP_DISJOINT query that is begun in timer_start and
    // ended in timer_end, bracketing both timestamps
    ID3D11Query *disjoint;
};
33
// Backend timer object: a fixed-size ring of query sets
struct pl_timer {
    // Ring buffer of timer queries to use
    // Index of the slot used by the next timer_start/timer_end pair
    int current;
    // Number of finished start/end pairs whose results have not been consumed
    // by d3d11_timer_query() yet (capped at the ring size)
    int pending;
    struct timer_query queries[16];
};
40
// Opens a timed region on the immediate context using the current ring buffer
// slot of `timer`. The slot's query objects are created the first time the
// slot is used. A NULL `timer` turns the call into a no-op.
void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx; // presumably consumed by the D3D() macro

    if (!timer)
        return;

    struct timer_query *query = &timer->queries[timer->current];

    if (query->ts_start == NULL) {
        // Lazily create the query objects for this slot. A duration
        // measurement needs three of them: a timestamp at the start, a
        // timestamp at the end, and a "disjoint" query spanning both. The
        // disjoint query reports whether something (e.g. a power state or
        // clock speed change) invalidated the timestamps, and it also carries
        // the timer frequency, without which the timestamps are meaningless.
        D3D(ID3D11Device_CreateQuery(p->dev,
            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_start));
        D3D(ID3D11Device_CreateQuery(p->dev,
            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_end));
        D3D(ID3D11Device_CreateQuery(p->dev,
            &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &query->disjoint));
    }

    // Open the disjoint query, then record the start timestamp
    ID3D11DeviceContext_Begin(p->imm, (ID3D11Asynchronous *) query->disjoint);
    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_start);
    return;

error:
    // Creation failed part-way: drop whatever was created so that the slot
    // looks unused again (pl_d3d11_timer_end() checks ts_start for this)
    SAFE_RELEASE(query->ts_start);
    SAFE_RELEASE(query->ts_end);
    SAFE_RELEASE(query->disjoint);
}
77
// Closes the timed region opened by pl_d3d11_timer_start() and advances the
// ring buffer. A NULL `timer`, or a slot whose queries were never created,
// makes this a no-op.
void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);

    if (!timer)
        return;

    struct timer_query *slot = &timer->queries[timer->current];

    // timer_start may have failed to create the query objects even when the
    // start/end calls are correctly paired; in that case there is nothing to
    // close
    if (!slot->ts_start)
        return;

    // Record the end timestamp, then close the disjoint query
    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) slot->ts_end);
    ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) slot->disjoint);

    // Move on to the next slot for the following timer_start, wrapping at
    // the end of the ring
    const size_t capacity = PL_ARRAY_SIZE(timer->queries);
    const int next = timer->current + 1;
    timer->current = next >= capacity ? 0 : next;

    // One more result is outstanding, unless the ring is already full. When
    // it is full, `current` now refers to the oldest entry, which gets
    // dropped and reused.
    if (timer->pending < capacity)
        timer->pending++;
}
106
// Converts a tick count measured at `freq` ticks per second to nanoseconds.
// Whole seconds and the sub-second remainder are scaled separately so the
// multiplication by 1e9 is only applied to values smaller than `freq`.
// `freq` must be non-zero.
static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq)
{
    static const uint64_t ns_per_s = 1000000000llu;

    const uint64_t whole_seconds = timestamp / freq;
    const uint64_t leftover_ticks = timestamp % freq;

    return whole_seconds * ns_per_s + leftover_ticks * ns_per_s / freq;
}
112
// Returns the duration in nanoseconds (at least 1) of the oldest pending
// measurement whose results are available and valid, consuming it. Returns 0
// when nothing is pending or the oldest measurement has not completed yet.
// Measurements that hit a discontinuity or a fetch error are dropped.
static uint64_t d3d11_timer_query(pl_gpu gpu, pl_timer timer)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    HRESULT hr;

    // Every pass through the loop increment discards one pending entry
    for (; timer->pending > 0; timer->pending--) {
        // The oldest pending slot lies `pending` entries behind `current`
        int oldest = timer->current - timer->pending;
        if (oldest < 0)
            oldest += PL_ARRAY_SIZE(timer->queries);
        struct timer_query *query = &timer->queries[oldest];

        UINT64 start, end;
        D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj;

        // Read back all three queries without forcing a flush. S_FALSE means
        // the data is not ready yet: report "no result" and keep the entry
        // pending for a later call.
        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
            (ID3D11Asynchronous *) query->disjoint, &dj, sizeof(dj),
            D3D11_ASYNC_GETDATA_DONOTFLUSH));
        if (hr == S_FALSE)
            return 0;
        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
            (ID3D11Asynchronous *) query->ts_end, &end, sizeof(end),
            D3D11_ASYNC_GETDATA_DONOTFLUSH));
        if (hr == S_FALSE)
            return 0;
        D3D(hr = ID3D11DeviceContext_GetData(p->imm,
            (ID3D11Asynchronous *) query->ts_start, &start, sizeof(start),
            D3D11_ASYNC_GETDATA_DONOTFLUSH));
        if (hr == S_FALSE)
            return 0;

        if (!dj.Disjoint && dj.Frequency) {
            // Usable measurement: consume it and hand the result back. The
            // value is clamped to at least 1 so it can't be mistaken for the
            // "nothing available" return value.
            timer->pending--;
            pl_d3d11_flush_message_queue(ctx, "After timer query");

            uint64_t ns = timestamp_to_ns(end - start, dj.Frequency);
            return PL_MAX(ns, 1);
        }

        // A discontinuity occurred during the measurement (or no frequency
        // was reported), so no duration can be derived. Drop this entry and
        // look at the next one.
        continue;

    error:
        // Fetching the result failed; drop this entry and try the next one
        continue;
    }

    // Every pending entry has been processed
    return 0;
}
167
// Releases every query object held by `timer` and frees the timer itself
static void d3d11_timer_destroy(pl_gpu gpu, pl_timer timer)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;

    // Slots that were never used hold NULL pointers; SAFE_RELEASE is applied
    // to all of them regardless
    for (int i = 0; i < PL_ARRAY_SIZE(timer->queries); i++) {
        struct timer_query *slot = &timer->queries[i];
        SAFE_RELEASE(slot->ts_start);
        SAFE_RELEASE(slot->ts_end);
        SAFE_RELEASE(slot->disjoint);
    }

    pl_d3d11_flush_message_queue(ctx, "After timer destroy");

    pl_free(timer);
}
183
d3d11_timer_create(pl_gpu gpu)184 static pl_timer d3d11_timer_create(pl_gpu gpu)
185 {
186 struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
187 if (!p->has_timestamp_queries)
188 return NULL;
189
190 struct pl_timer *timer = pl_alloc_ptr(NULL, timer);
191 *timer = (struct pl_timer) {0};
192 return timer;
193 }
194
// Reports the binding namespace for a descriptor type. Every type maps to
// namespace 0: SPIRV-Cross' HLSL resource mapping API addresses resources by
// binding number, which requires Vulkan-style binding where all descriptors
// share a single namespace.
static int d3d11_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
{
    enum { SHARED_NAMESPACE = 0 };
    return SHARED_NAMESPACE;
}
202
// Flushes the immediate context, then drains the debug message queue
static void d3d11_gpu_flush(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);

    ID3D11DeviceContext_Flush(p->imm);
    pl_d3d11_flush_message_queue(p->ctx, "After gpu flush");
}
211
// Blocks until all previously submitted GPU work has completed.
//
// When a fence is available (see pl_gpu_create_d3d11()), the fence is
// signalled with a fresh value and the calling thread waits on the associated
// event. Otherwise an event query is issued and polled, with a dummy
// copy/readback in front of it to try to avoid polling.
//
// If any checked D3D call fails, the function returns early without waiting.
static void d3d11_gpu_finish(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;
    HRESULT hr;

    if (p->finish_fence) {
        p->finish_value++;
        D3D(ID3D11Fence_SetEventOnCompletion(p->finish_fence, p->finish_value,
                                             p->finish_event));
        // Signal() reports failure through its HRESULT. If the signal could
        // not be enqueued, the event would never fire and the wait below
        // would block forever, so bail out instead.
        D3D(ID3D11DeviceContext4_Signal(p->imm4, p->finish_fence,
                                        p->finish_value));
        ID3D11DeviceContext_Flush(p->imm);
        WaitForSingleObject(p->finish_event, INFINITE);
    } else {
        ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) p->finish_query);

        // D3D11 doesn't have blocking queries, but it does have blocking
        // readback. As a performance hack to try to avoid polling, do a dummy
        // copy/readback between two buffers. Hopefully this will block until
        // all prior commands are finished. If it does, the first GetData call
        // will return a result and we won't have to poll.
        pl_buf_copy(gpu, p->finish_buf_dst, 0, p->finish_buf_src, 0, sizeof(uint32_t));
        pl_buf_read(gpu, p->finish_buf_dst, 0, &(uint32_t) {0}, sizeof(uint32_t));

        // Poll the event query until it completes
        for (;;) {
            BOOL idle;
            D3D(hr = ID3D11DeviceContext_GetData(p->imm,
                (ID3D11Asynchronous *) p->finish_query, &idle, sizeof(idle), 0));
            if (hr == S_OK && idle)
                break;
            Sleep(1);
        }
    }

    pl_d3d11_flush_message_queue(ctx, "After gpu finish");

error:
    return;
}
252
// Reports whether the context is in a failed state. The failure flag is
// sticky: once set it is returned without asking the device again. Otherwise
// the device is asked for its removal reason, and a failure HRESULT marks the
// context as failed.
static bool d3d11_gpu_is_failed(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
    struct d3d11_ctx *ctx = p->ctx;

    if (!ctx->is_failed) {
        // S_OK here means the device has not been removed
        HRESULT reason = ID3D11Device_GetDeviceRemovedReason(p->dev);
        if (FAILED(reason)) {
            ctx->is_failed = true;
            pl_d3d11_after_error(ctx, reason);
        }
    }

    return ctx->is_failed;
}
270
// Tears down a GPU object created by pl_gpu_create_d3d11(), releasing the
// helper buffers, the dispatch object, all D3D11 interfaces and state
// objects, and finally the pl_gpu allocation itself.
//
// This is also the cleanup path for a partially constructed GPU, so members
// may still be NULL when it runs.
//
// NOTE(review): p->spirv, created in pl_gpu_create_d3d11(), is not released
// here -- confirm whether it is owned/freed elsewhere or leaks.
static void d3d11_destroy_gpu(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);

    pl_buf_destroy(gpu, &p->finish_buf_src);
    pl_buf_destroy(gpu, &p->finish_buf_dst);
    pl_dispatch_destroy(&p->dp);

    // Release everything except the immediate context
    SAFE_RELEASE(p->dev);
    SAFE_RELEASE(p->dev1);
    SAFE_RELEASE(p->dev5);
    SAFE_RELEASE(p->imm1);
    SAFE_RELEASE(p->imm4);
    SAFE_RELEASE(p->vbuf.buf);
    SAFE_RELEASE(p->ibuf.buf);
    SAFE_RELEASE(p->rstate);
    SAFE_RELEASE(p->dsstate);
    for (int i = 0; i < PL_TEX_SAMPLE_MODE_COUNT; i++) {
        for (int j = 0; j < PL_TEX_ADDRESS_MODE_COUNT; j++) {
            SAFE_RELEASE(p->samplers[i][j]);
        }
    }
    SAFE_RELEASE(p->finish_fence);
    if (p->finish_event)
        CloseHandle(p->finish_event);
    SAFE_RELEASE(p->finish_query);

    // Destroy the immediate context synchronously so referenced objects don't
    // show up in the leak check. The context may not have been obtained yet
    // if creation failed early, in which case there is nothing to clear.
    if (p->imm) {
        ID3D11DeviceContext_ClearState(p->imm);
        ID3D11DeviceContext_Flush(p->imm);
        SAFE_RELEASE(p->imm);
    }

    pl_free((void *) gpu);
}
307
// Locates a D3DCompiler DLL and resolves its D3DCompile entry point.
//
// Candidates are tried in order of preference. On success, p->D3DCompile and
// p->d3d_compiler_ver are filled in, the library stays loaded, and true is
// returned. Returns false if no candidate provides D3DCompile.
static bool load_d3d_compiler(pl_gpu gpu)
{
    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);

    static const struct {
        const wchar_t *name;
        bool inbox;
    } compiler_dlls[] = {
        // Try the inbox D3DCompiler first (Windows 8.1 and up)
        { .name = L"d3dcompiler_47.dll", .inbox = true },
        // Check for a packaged version of d3dcompiler_47.dll
        { .name = L"d3dcompiler_47.dll" },
        // Try d3dcompiler_46.dll from the Windows 8 SDK
        { .name = L"d3dcompiler_46.dll" },
        // Try d3dcompiler_43.dll from the June 2010 DirectX SDK
        { .name = L"d3dcompiler_43.dll" },
    };

    for (int i = 0; i < PL_ARRAY_SIZE(compiler_dlls); i++) {
        HMODULE d3dcompiler = NULL;

        if (compiler_dlls[i].inbox) {
            // The inbox copy is only looked up in the system directory, and
            // only on OS versions that ship it
            if (!IsWindows8Point1OrGreater())
                continue;
            d3dcompiler = LoadLibraryExW(compiler_dlls[i].name, NULL,
                                         LOAD_LIBRARY_SEARCH_SYSTEM32);
        } else {
            d3dcompiler = LoadLibraryW(compiler_dlls[i].name);
        }
        if (!d3dcompiler)
            continue;

        p->D3DCompile = (void *) GetProcAddress(d3dcompiler, "D3DCompile");
        if (!p->D3DCompile) {
            // This DLL is unusable: unload it instead of leaking the module
            // handle, and give the remaining candidates a chance
            FreeLibrary(d3dcompiler);
            continue;
        }
        p->d3d_compiler_ver = pl_get_dll_version(compiler_dlls[i].name);

        // The module is intentionally left loaded: D3DCompile points into it
        return true;
    }

    return false;
}
349
350 static struct pl_gpu_fns pl_fns_d3d11 = {
351 .tex_create = pl_d3d11_tex_create,
352 .tex_destroy = pl_d3d11_tex_destroy,
353 .tex_invalidate = pl_d3d11_tex_invalidate,
354 .tex_clear_ex = pl_d3d11_tex_clear_ex,
355 .tex_blit = pl_d3d11_tex_blit,
356 .tex_upload = pl_d3d11_tex_upload,
357 .tex_download = pl_d3d11_tex_download,
358 .buf_create = pl_d3d11_buf_create,
359 .buf_destroy = pl_d3d11_buf_destroy,
360 .buf_write = pl_d3d11_buf_write,
361 .buf_read = pl_d3d11_buf_read,
362 .buf_copy = pl_d3d11_buf_copy,
363 .desc_namespace = d3d11_desc_namespace,
364 .pass_create = pl_d3d11_pass_create,
365 .pass_destroy = pl_d3d11_pass_destroy,
366 .pass_run = pl_d3d11_pass_run,
367 .timer_create = d3d11_timer_create,
368 .timer_destroy = d3d11_timer_destroy,
369 .timer_query = d3d11_timer_query,
370 .gpu_flush = d3d11_gpu_flush,
371 .gpu_finish = d3d11_gpu_finish,
372 .gpu_is_failed = d3d11_gpu_is_failed,
373 .destroy = d3d11_destroy_gpu,
374 };
375
pl_gpu_create_d3d11(struct d3d11_ctx * ctx)376 pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx)
377 {
378 pl_assert(ctx->dev);
379 IDXGIDevice1 *dxgi_dev = NULL;
380 IDXGIAdapter1 *adapter = NULL;
381 IDXGIAdapter4 *adapter4 = NULL;
382 bool success = false;
383 HRESULT hr;
384
385 struct pl_gpu *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gpu_d3d11);
386 gpu->log = ctx->log;
387 gpu->ctx = gpu->log;
388
389 struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
390 *p = (struct pl_gpu_d3d11) {
391 .ctx = ctx,
392 .impl = pl_fns_d3d11,
393 .dev = ctx->dev,
394 .spirv = spirv_compiler_create(ctx->log),
395 .vbuf.bind_flags = D3D11_BIND_VERTEX_BUFFER,
396 .ibuf.bind_flags = D3D11_BIND_INDEX_BUFFER,
397 };
398 if (!p->spirv)
399 goto error;
400
401 ID3D11Device_AddRef(p->dev);
402 ID3D11Device_GetImmediateContext(p->dev, &p->imm);
403
404 int minor = 0; // The Direct3D 11 minor version number
405
406 // Check D3D11.1 interfaces
407 hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1,
408 (void **) &p->dev1);
409 if (SUCCEEDED(hr)) {
410 minor = 1;
411 ID3D11Device1_GetImmediateContext1(p->dev1, &p->imm1);
412 }
413
414 // Check D3D11.4 interfaces
415 hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device5,
416 (void **) &p->dev5);
417 if (SUCCEEDED(hr)) {
418 // There is no GetImmediateContext4 method
419 hr = ID3D11DeviceContext_QueryInterface(p->imm, &IID_ID3D11DeviceContext4,
420 (void **) &p->imm4);
421 if (SUCCEEDED(hr))
422 minor = 4;
423 }
424
425 PL_INFO(gpu, "Using Direct3D 11.%d runtime", minor);
426
427 D3D(ID3D11Device_QueryInterface(p->dev, &IID_IDXGIDevice1, (void **) &dxgi_dev));
428 D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter));
429
430 DXGI_ADAPTER_DESC1 adapter_desc = {0};
431 IDXGIAdapter1_GetDesc1(adapter, &adapter_desc);
432
433 // No resource can be larger than max_res_size in bytes
434 unsigned int max_res_size = PL_CLAMP(
435 D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_B_TERM * adapter_desc.DedicatedVideoMemory,
436 D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_A_TERM * 1024u * 1024u,
437 D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_C_TERM * 1024u * 1024u);
438
439 gpu->glsl = (struct pl_glsl_version) {
440 .version = 450,
441 .vulkan = true,
442 };
443
444 gpu->limits = (struct pl_gpu_limits) {
445 .max_buf_size = max_res_size,
446 .max_ssbo_size = max_res_size,
447 .max_vbo_size = max_res_size,
448
449 // Make up some values
450 .align_tex_xfer_offset = 32,
451 .align_tex_xfer_stride = 1,
452 .fragment_queues = 1,
453 };
454
455 p->fl = ID3D11Device_GetFeatureLevel(p->dev);
456
457 // If we're not using FL9_x, we can use the same suballocated buffer as a
458 // vertex buffer and index buffer
459 if (p->fl >= D3D_FEATURE_LEVEL_10_0)
460 p->vbuf.bind_flags |= D3D11_BIND_INDEX_BUFFER;
461
462 if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
463 gpu->limits.max_ubo_size = D3D11_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * CBUF_ELEM;
464 } else {
465 // 10level9 restriction:
466 // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
467 gpu->limits.max_ubo_size = 255 * CBUF_ELEM;
468 }
469
470 if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
471 gpu->limits.max_tex_1d_dim = D3D11_REQ_TEXTURE1D_U_DIMENSION;
472 gpu->limits.max_tex_2d_dim = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION;
473 gpu->limits.max_tex_3d_dim = D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
474 } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
475 gpu->limits.max_tex_1d_dim = D3D10_REQ_TEXTURE1D_U_DIMENSION;
476 gpu->limits.max_tex_2d_dim = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
477 gpu->limits.max_tex_3d_dim = D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
478 } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) {
479 gpu->limits.max_tex_2d_dim = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION;
480 // Same limit as FL9_1
481 gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
482 } else {
483 gpu->limits.max_tex_2d_dim = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION;
484 gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
485 }
486
487 if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
488 gpu->limits.max_buffer_texels =
489 1 << D3D11_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
490 }
491
492 if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
493 gpu->glsl.compute = true;
494 gpu->limits.compute_queues = 1;
495 // Set `gpu->limits.blittable_1d_3d`, since `pl_tex_blit_compute`, which
496 // is used to emulate blits on 11_0 and up, supports 1D and 3D textures
497 gpu->limits.blittable_1d_3d = true;
498
499 gpu->glsl.max_shmem_size = D3D11_CS_TGSM_REGISTER_COUNT * sizeof(float);
500 gpu->glsl.max_group_threads = D3D11_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP;
501 gpu->glsl.max_group_size[0] = D3D11_CS_THREAD_GROUP_MAX_X;
502 gpu->glsl.max_group_size[1] = D3D11_CS_THREAD_GROUP_MAX_Y;
503 gpu->glsl.max_group_size[2] = D3D11_CS_THREAD_GROUP_MAX_Z;
504 gpu->limits.max_dispatch[0] = gpu->limits.max_dispatch[1] =
505 gpu->limits.max_dispatch[2] =
506 D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION;
507 }
508
509 if (p->fl >= D3D_FEATURE_LEVEL_11_0) {
510 // The offset limits are defined by HLSL:
511 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4-po--sm5---asm-
512 gpu->glsl.min_gather_offset = -32;
513 gpu->glsl.max_gather_offset = 31;
514 } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) {
515 // SM4.1 has no gather4_po, so the offset must be specified by an
516 // immediate with a range of [-8, 7]
517 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4--sm4-1---asm-
518 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sample--sm4---asm-#address-offset
519 gpu->glsl.min_gather_offset = -8;
520 gpu->glsl.max_gather_offset = 7;
521 }
522
523 if (p->fl >= D3D_FEATURE_LEVEL_10_0) {
524 p->max_srvs = D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT;
525 } else {
526 // 10level9 restriction:
527 // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context
528 p->max_srvs = 8;
529 }
530
531 if (p->fl >= D3D_FEATURE_LEVEL_11_1) {
532 p->max_uavs = D3D11_1_UAV_SLOT_COUNT;
533 } else {
534 p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT;
535 }
536
537 if (!load_d3d_compiler(gpu)) {
538 PL_FATAL(gpu, "Could not find D3DCompiler DLL");
539 goto error;
540 }
541 PL_INFO(gpu, "D3DCompiler version: %u.%u.%u.%u",
542 p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor,
543 p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision);
544
545 // Detect support for timestamp queries. Some FL9_x devices don't support them.
546 hr = ID3D11Device_CreateQuery(p->dev,
547 &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL);
548 p->has_timestamp_queries = SUCCEEDED(hr);
549
550 pl_d3d11_setup_formats(gpu);
551
552 // The rasterizer state never changes, so create it here
553 D3D11_RASTERIZER_DESC rdesc = {
554 .FillMode = D3D11_FILL_SOLID,
555 .CullMode = D3D11_CULL_NONE,
556 .FrontCounterClockwise = FALSE,
557 .DepthClipEnable = TRUE, // Required for 10level9
558 .ScissorEnable = TRUE,
559 };
560 D3D(ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &p->rstate));
561
562 // The depth stencil state never changes either, and we only set it to turn
563 // depth testing off so the debug layer doesn't complain about an unbound
564 // depth buffer
565 D3D11_DEPTH_STENCIL_DESC dsdesc = {
566 .DepthEnable = FALSE,
567 .DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL,
568 .DepthFunc = D3D11_COMPARISON_LESS,
569 .StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK,
570 .StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK,
571 .FrontFace = {
572 .StencilFailOp = D3D11_STENCIL_OP_KEEP,
573 .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
574 .StencilPassOp = D3D11_STENCIL_OP_KEEP,
575 .StencilFunc = D3D11_COMPARISON_ALWAYS,
576 },
577 .BackFace = {
578 .StencilFailOp = D3D11_STENCIL_OP_KEEP,
579 .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP,
580 .StencilPassOp = D3D11_STENCIL_OP_KEEP,
581 .StencilFunc = D3D11_COMPARISON_ALWAYS,
582 },
583 };
584 D3D(ID3D11Device_CreateDepthStencilState(p->dev, &dsdesc, &p->dsstate));
585
586 // Initialize the samplers
587 for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) {
588 for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++) {
589 static const D3D11_TEXTURE_ADDRESS_MODE d3d_address_mode[] = {
590 [PL_TEX_ADDRESS_CLAMP] = D3D11_TEXTURE_ADDRESS_CLAMP,
591 [PL_TEX_ADDRESS_REPEAT] = D3D11_TEXTURE_ADDRESS_WRAP,
592 [PL_TEX_ADDRESS_MIRROR] = D3D11_TEXTURE_ADDRESS_MIRROR,
593 };
594 static const D3D11_FILTER d3d_filter[] = {
595 [PL_TEX_SAMPLE_NEAREST] = D3D11_FILTER_MIN_MAG_MIP_POINT,
596 [PL_TEX_SAMPLE_LINEAR] = D3D11_FILTER_MIN_MAG_MIP_LINEAR,
597 };
598
599 D3D11_SAMPLER_DESC sdesc = {
600 .AddressU = d3d_address_mode[address_mode],
601 .AddressV = d3d_address_mode[address_mode],
602 .AddressW = d3d_address_mode[address_mode],
603 .ComparisonFunc = D3D11_COMPARISON_NEVER,
604 .MinLOD = 0,
605 .MaxLOD = D3D11_FLOAT32_MAX,
606 .MaxAnisotropy = 1,
607 .Filter = d3d_filter[sample_mode],
608 };
609 D3D(ID3D11Device_CreateSamplerState(p->dev, &sdesc,
610 &p->samplers[sample_mode][address_mode]));
611 }
612 }
613
614 hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter4,
615 (void **) &adapter4);
616 if (SUCCEEDED(hr)) {
617 DXGI_ADAPTER_DESC3 adapter_desc3 = {0};
618 IDXGIAdapter4_GetDesc3(adapter4, &adapter_desc3);
619
620 p->has_monitored_fences =
621 adapter_desc3.Flags & DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES;
622 }
623
624 // Try to create a D3D11.4 fence object to wait on in pl_gpu_finish()
625 if (p->dev5 && p->has_monitored_fences) {
626 hr = ID3D11Device5_CreateFence(p->dev5, 0, D3D11_FENCE_FLAG_NONE,
627 &IID_ID3D11Fence,
628 (void **) p->finish_fence);
629 if (SUCCEEDED(hr)) {
630 p->finish_event = CreateEventW(NULL, FALSE, FALSE, NULL);
631 if (!p->finish_event) {
632 PL_ERR(gpu, "Failed to create finish() event");
633 goto error;
634 }
635 }
636 }
637
638 // If fences are not available, we will have to poll a event query instead
639 if (!p->finish_fence) {
640 // Buffers for dummy copy/readback (see d3d11_gpu_finish())
641 p->finish_buf_src = pl_buf_create(gpu, &(struct pl_buf_params) {
642 .size = sizeof(uint32_t),
643 .drawable = true, // Make these vertex buffers for 10level9
644 .initial_data = &(uint32_t) {0x11223344},
645 });
646 p->finish_buf_dst = pl_buf_create(gpu, &(struct pl_buf_params) {
647 .size = sizeof(uint32_t),
648 .host_readable = true,
649 .drawable = true,
650 });
651
652 D3D(ID3D11Device_CreateQuery(p->dev,
653 &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &p->finish_query));
654 }
655
656 // Create the dispatch last, after any setup of `gpu` is done
657 p->dp = pl_dispatch_create(ctx->log, gpu);
658
659 pl_d3d11_flush_message_queue(ctx, "After gpu create");
660
661 success = true;
662 error:
663 SAFE_RELEASE(dxgi_dev);
664 SAFE_RELEASE(adapter);
665 if (success) {
666 return pl_gpu_finalize(gpu);
667 } else {
668 d3d11_destroy_gpu(gpu);
669 return NULL;
670 }
671 }
672