/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef LIBPLACEBO_GPU_H_
#define LIBPLACEBO_GPU_H_

#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>

#include <libplacebo/common.h>

PL_API_BEGIN

// This file contains the definition of an API which is designed to abstract
// away from platform-specific APIs like the various OpenGL variants, Direct3D
// and Vulkan in a common way. It is a much more limited API than those APIs,
// since it only targets the small common subset of features needed to
// implement libplacebo's rendering.
//
// NOTE: Most, but not all, parameter conditions (phrases such as "must" or
// "valid usage") are explicitly tested and result in error messages followed
// by graceful failure. Exceptions are noted where they exist.

// Type of a shader input descriptor.
enum pl_desc_type {
    PL_DESC_INVALID = 0,
    PL_DESC_SAMPLED_TEX,    // C: pl_tex*    GLSL: combined texture sampler
                            // (`pl_tex->params.sampleable` must be set)
    PL_DESC_STORAGE_IMG,    // C: pl_tex*    GLSL: storage image
                            // (`pl_tex->params.storable` must be set)
    PL_DESC_BUF_UNIFORM,    // C: pl_buf*    GLSL: uniform buffer
                            // (`pl_buf->params.uniform` must be set)
    PL_DESC_BUF_STORAGE,    // C: pl_buf*    GLSL: storage buffer
                            // (`pl_buf->params.storable` must be set)
    PL_DESC_BUF_TEXEL_UNIFORM,// C: pl_buf*  GLSL: uniform samplerBuffer
                              // (`pl_buf->params.uniform` and `format` must be set)
    PL_DESC_BUF_TEXEL_STORAGE,// C: pl_buf*  GLSL: uniform imageBuffer
                              // (`pl_buf->params.storable` and `format` must be set)
    PL_DESC_TYPE_COUNT
};

// Structure which wraps metadata describing GLSL capabilities.
struct pl_glsl_version {
    int version;        // GLSL version (e.g. 450), for #version
    bool gles;          // GLSL ES semantics (ESSL)
    bool vulkan;        // GL_KHR_vulkan_glsl semantics

    // Compute shader support and limits. If `compute` is false, then all
    // of the remaining fields in this section are {0}.
    bool compute;
    size_t max_shmem_size;      // maximum compute shader shared memory size
    uint32_t max_group_threads; // maximum number of local threads per work group
    uint32_t max_group_size[3]; // maximum work group size per dimension

    // If nonzero, signals availability of shader subgroups. This guarantees
    // availability of all of the following extensions:
    // - GL_KHR_shader_subgroup_basic
    // - GL_KHR_shader_subgroup_vote
    // - GL_KHR_shader_subgroup_arithmetic
    // - GL_KHR_shader_subgroup_ballot
    // - GL_KHR_shader_subgroup_shuffle
    uint32_t subgroup_size;

    // Miscellaneous shader limits
    int16_t min_gather_offset;  // minimum `textureGatherOffset` offset
    int16_t max_gather_offset;  // maximum `textureGatherOffset` offset
};
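
// For example (an illustrative sketch, not part of this API): before
// dispatching a compute shader, a user might clamp a desired work group
// size against these limits. The 16x16 starting point is arbitrary.
//
// const struct pl_glsl_version *glsl = &gpu->glsl;
// if (glsl->compute) {
//     uint32_t gx = 16 < glsl->max_group_size[0] ? 16 : glsl->max_group_size[0];
//     uint32_t gy = 16 < glsl->max_group_size[1] ? 16 : glsl->max_group_size[1];
//     while (gx * gy > glsl->max_group_threads) {
//         // shrink the larger dimension until within the total thread limit
//         if (gx >= gy) gx >>= 1; else gy >>= 1;
//     }
// }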

// Backwards compatibility alias
#define pl_glsl_desc pl_glsl_version

// Structure defining the physical limits and capabilities of this GPU
// instance. If a limit is given as 0, that means that feature is unsupported.
struct pl_gpu_limits {
    // --- pl_gpu
    bool thread_safe;           // `pl_gpu` calls are thread-safe
    bool callbacks;             // supports asynchronous GPU callbacks

    // --- pl_buf
    size_t max_buf_size;        // maximum size of any buffer
    size_t max_ubo_size;        // maximum size of a `uniform` buffer
    size_t max_ssbo_size;       // maximum size of a `storable` buffer
    size_t max_vbo_size;        // maximum size of a `drawable` buffer
    size_t max_mapped_size;     // maximum size of a `host_mapped` buffer
    uint64_t max_buffer_texels; // maximum number of texels in a texel buffer

    // Required alignment for PL_HANDLE_HOST_PTR imports. This is provided
    // merely as a hint to the user. If the host pointer being imported is
    // misaligned, libplacebo will internally round (over-map) the region.
    size_t align_host_ptr;

    // --- pl_tex
    uint32_t max_tex_1d_dim;    // maximum width for a 1D texture
    uint32_t max_tex_2d_dim;    // maximum width/height for a 2D texture (required)
    uint32_t max_tex_3d_dim;    // maximum width/height/depth for a 3D texture
    bool blittable_1d_3d;       // supports blittable 1D/3D textures
    bool buf_transfer;          // supports `pl_tex_transfer_params.buf`

    // These don't represent hard limits but indicate performance hints for
    // optimal alignment. For best performance, the corresponding field
    // should be aligned to a multiple of these. They will always be a power
    // of two.
    uint32_t align_tex_xfer_stride; // optimal `pl_tex_transfer_params.stride_w/h`
    size_t align_tex_xfer_offset;   // optimal `pl_tex_transfer_params.buf_offset`

    // --- pl_pass
    size_t max_variables;       // maximum `pl_pass_params.num_variables`
    size_t max_constants;       // maximum `pl_pass_params.num_constants`
    size_t max_pushc_size;      // maximum `push_constants_size`
    uint32_t max_dispatch[3];   // maximum dispatch size per dimension

    // Note: At least one of `max_variables` or `max_ubo_size` is guaranteed to
    // be nonzero.

    // As a performance hint, the GPU may signal the number of command queues
    // it has for fragment and compute shaders, respectively. Users may use
    // this information to decide the appropriate type of shader to dispatch.
    uint32_t fragment_queues;
    uint32_t compute_queues;

    // --- Deprecated fields. Provided for backwards compatibility. See the
    // corresponding fields in `pl_glsl_version` for their replacements.
    size_t max_shmem_size PL_DEPRECATED;
    uint32_t max_group_threads PL_DEPRECATED;
    uint32_t max_group_size[3] PL_DEPRECATED;
    uint32_t subgroup_size PL_DEPRECATED;
    int16_t min_gather_offset PL_DEPRECATED;
    int16_t max_gather_offset PL_DEPRECATED;
};

// Backwards compatibility alias
#define max_xfer_size max_buf_size

// Some `pl_gpu` operations allow sharing GPU resources with external APIs -
// examples include interop with other graphics APIs such as CUDA, and also
// various hardware decoding APIs. This defines the mechanism underpinning the
// communication of such an interoperation.
typedef uint64_t pl_handle_caps;
enum pl_handle_type {
    PL_HANDLE_FD        = (1 << 0), // `int fd` for POSIX-style APIs
    PL_HANDLE_WIN32     = (1 << 1), // `HANDLE` for win32 API
    PL_HANDLE_WIN32_KMT = (1 << 2), // `HANDLE` for pre-Windows-8 win32 API
    PL_HANDLE_DMA_BUF   = (1 << 3), // `int fd` for a dma_buf fd
    PL_HANDLE_HOST_PTR  = (1 << 4), // `void *` for a host-allocated pointer
};

struct pl_gpu_handle_caps {
    pl_handle_caps tex;  // supported handles for `pl_tex` + `pl_shared_mem`
    pl_handle_caps buf;  // supported handles for `pl_buf` + `pl_shared_mem`
    pl_handle_caps sync; // supported handles for `pl_sync`
};

// Wrapper for the handle used to communicate a shared resource externally.
// This handle is owned by the `pl_gpu` - if a user wishes to use it in a way
// that takes over ownership (e.g. importing into some APIs), they must clone
// the handle before doing so (e.g. using `dup` for fds). It is important to
// read the external API documentation _very_ carefully, as different handle
// types may be managed in different ways. (e.g. CUDA takes ownership of an fd,
// but does not take ownership of a win32 handle)
union pl_handle {
    int fd;         // PL_HANDLE_FD / PL_HANDLE_DMA_BUF
    void *handle;   // PL_HANDLE_WIN32 / PL_HANDLE_WIN32_KMT
    void *ptr;      // PL_HANDLE_HOST_PTR
};

// Structure encapsulating memory that is shared between libplacebo and the
// user. This memory can be imported into external APIs using the handle.
//
// If the object a `pl_shared_mem` belongs to is destroyed (e.g. via
// `pl_buf_destroy`), the handle becomes undefined, as do the contents of the
// memory it points to, as well as any external API objects imported from it.
struct pl_shared_mem {
    union pl_handle handle;
    size_t size;   // the total size of the memory referenced by this handle
    size_t offset; // the offset of the object within the referenced memory

    // For PL_HANDLE_DMA_BUF, this specifies the DRM format modifier that
    // describes this resource. Note that when importing `pl_buf`, this must
    // be DRM_FORMAT_MOD_LINEAR. For importing `pl_tex`, it can be any
    // format modifier supported by the implementation.
    uint64_t drm_format_mod;

    // When importing a `pl_tex` of type PL_HANDLE_DMA_BUF, this can be used to
    // set the image stride (AKA pitch) in memory. If left as 0, defaults to
    // the image width/height.
    size_t stride_w;
    size_t stride_h;
};
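
// For instance (an illustrative sketch): when handing an exported handle to
// an API that assumes ownership of it, clone it first so the `pl_gpu` keeps
// its own valid copy. For a POSIX fd, this could look like:
//
// #include <unistd.h>
//
// int cloned = dup(shmem.handle.fd); // `shmem` from some exported object
// if (cloned >= 0)
//     external_api_import_fd(cloned); // hypothetical external call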

// Structure grouping PCI bus address fields for GPU devices
struct pl_gpu_pci_address {
    uint32_t domain;
    uint32_t bus;
    uint32_t device;
    uint32_t function;
};

// (Deprecated) Capability bits. Provided for backwards compatibility.
typedef uint64_t pl_gpu_caps;
enum PL_DEPRECATED {
    PL_GPU_CAP_COMPUTE          = 1 << 0, // see `pl_glsl_version.compute`
    PL_GPU_CAP_PARALLEL_COMPUTE = 1 << 1, // see `pl_gpu_limits.compute_queues`
    PL_GPU_CAP_INPUT_VARIABLES  = 1 << 2, // see `pl_gpu_limits.max_variables`
    PL_GPU_CAP_MAPPED_BUFFERS   = 1 << 3, // see `pl_gpu_limits.max_mapped_size`
    PL_GPU_CAP_BLITTABLE_1D_3D  = 1 << 4, // see `pl_gpu_limits.blittable_1d_3d`
    PL_GPU_CAP_SUBGROUPS        = 1 << 5, // see `pl_glsl_version.subgroup_size`
    PL_GPU_CAP_CALLBACKS        = 1 << 6, // see `pl_gpu_limits.callbacks`
    PL_GPU_CAP_THREAD_SAFE      = 1 << 7, // see `pl_gpu_limits.thread_safe`
    PL_GPU_CAP_SPEC_CONSTANTS   = 1 << 8, // see `pl_gpu_limits.max_constants`
};

typedef const PL_STRUCT(pl_fmt) *pl_fmt;

// Abstract device context which wraps an underlying graphics context and can
// be used to dispatch rendering commands.
//
// Thread-safety: Depends on `pl_gpu_limits.thread_safe`
typedef const PL_STRUCT(pl_gpu) {
    pl_log log;

    struct pl_glsl_version glsl; // GLSL features supported by this GPU
    struct pl_gpu_limits limits; // physical device limits and capabilities

    // Fields relevant to external API interop. If the underlying device does
    // not support interop with other APIs, these will all be {0}.
    struct pl_gpu_handle_caps export_caps; // supported handles for exporting
    struct pl_gpu_handle_caps import_caps; // supported handles for importing
    uint8_t uuid[16];                      // underlying device UUID

    // Supported texture formats, in preference order. (If there are multiple
    // similar formats, the "better" ones come first)
    pl_fmt *formats;
    int num_formats;

    // PCI Bus address of the underlying device, to help with interop.
    // This will only be filled in if interop is supported.
    struct pl_gpu_pci_address pci;

    // (Deprecated) Backwards compatibility fields.
    pl_log ctx PL_DEPRECATED;       // equal to `log`
    pl_gpu_caps caps PL_DEPRECATED; // replaced by `glsl` and `limits`
} *pl_gpu;

// Helper function to align the given dimension (e.g. width or height) to a
// multiple of the optimal texture transfer stride.
int pl_optimal_transfer_stride(pl_gpu gpu, int dimension);

enum pl_fmt_type {
    PL_FMT_UNKNOWN = 0, // also used for inconsistent multi-component formats
    PL_FMT_UNORM,       // unsigned, normalized integer format (sampled as float)
    PL_FMT_SNORM,       // signed, normalized integer format (sampled as float)
    PL_FMT_UINT,        // unsigned integer format (sampled as integer)
    PL_FMT_SINT,        // signed integer format (sampled as integer)
    PL_FMT_FLOAT,       // (signed) float formats, any bit size
    PL_FMT_TYPE_COUNT,
};

enum pl_fmt_caps {
    PL_FMT_CAP_SAMPLEABLE    = 1 << 0,  // may be sampled from (PL_DESC_SAMPLED_TEX)
    PL_FMT_CAP_STORABLE      = 1 << 1,  // may be used as storage image (PL_DESC_STORAGE_IMG)
    PL_FMT_CAP_LINEAR        = 1 << 2,  // may be linearly sampled from (PL_TEX_SAMPLE_LINEAR)
    PL_FMT_CAP_RENDERABLE    = 1 << 3,  // may be rendered to (pl_pass_params.target_dummy)
    PL_FMT_CAP_BLENDABLE     = 1 << 4,  // may be blended to (pl_pass_params.blend_params)
    PL_FMT_CAP_BLITTABLE     = 1 << 5,  // may be blitted from/to (pl_tex_blit)
    PL_FMT_CAP_VERTEX        = 1 << 6,  // may be used as a vertex attribute
    PL_FMT_CAP_TEXEL_UNIFORM = 1 << 7,  // may be used as a texel uniform buffer
    PL_FMT_CAP_TEXEL_STORAGE = 1 << 8,  // may be used as a texel storage buffer
    PL_FMT_CAP_HOST_READABLE = 1 << 9,  // may be used with `host_readable` textures
    PL_FMT_CAP_READWRITE     = 1 << 10, // may be used with PL_DESC_ACCESS_READWRITE

    // Notes:
    // - PL_FMT_CAP_LINEAR also implies PL_FMT_CAP_SAMPLEABLE
    // - PL_FMT_CAP_STORABLE also implies `pl_gpu.glsl.compute`
    // - PL_FMT_CAP_BLENDABLE implies PL_FMT_CAP_RENDERABLE
    // - PL_FMT_CAP_VERTEX implies that the format is non-opaque
    // - PL_FMT_CAP_HOST_READABLE implies that the format is non-opaque
};

// Structure describing a texel/vertex format.
PL_STRUCT(pl_fmt) {
    const char *name;       // symbolic name for this format (e.g. rgba32f)

    enum pl_fmt_type type;  // the format's data type and interpretation
    enum pl_fmt_caps caps;  // the features supported by this format
    int num_components;     // number of components for this format
    int component_depth[4]; // meaningful bits per component, texture precision
    size_t internal_size;   // internal texel size (for blit compatibility)

    // This controls the relationship between the data as seen by the host and
    // the way it's interpreted by the texture. The host representation is
    // always tightly packed (no padding bits in between each component).
    //
    // This representation assumes little endian ordering, i.e. components
    // being ordered from LSB to MSB in memory. Note that for oddly packed
    // formats like rgb10a2 or rgb565, this is inconsistent with the naming.
    // (That is to say, rgb565 has sample order {2, 1, 0} under this convention
    // - because rgb565 treats the R channel as the *most* significant bits)
    //
    // If `opaque` is true, then there's no meaningful correspondence between
    // the two, and all of the remaining fields in this section are unset.
    //
    // If `emulated` is true, then this format doesn't actually exist on the
    // GPU as an uploadable texture format - and any apparent support is being
    // emulated (typically using compute shaders in the upload path).
    bool opaque;
    bool emulated;
    size_t texel_size;      // total size in bytes per texel
    int host_bits[4];       // number of meaningful bits in host memory
    int sample_order[4];    // sampled index for each component, e.g.
                            // {2, 1, 0, 3} for BGRA textures

    // For sampleable formats, this bool indicates whether or not the format
    // is compatible with `textureGather()`
    bool gatherable;

    // If usable as a vertex or texel buffer format, this gives the GLSL type
    // corresponding to the data. (e.g. vec4)
    const char *glsl_type;

    // If usable as a storage image or texel storage buffer
    // (PL_FMT_CAP_STORABLE / PL_FMT_CAP_TEXEL_STORAGE), this gives the GLSL
    // texel format corresponding to the format (e.g. rgba16ui), if any. This
    // field may be NULL, in which case the format modifier may be left
    // unspecified.
    const char *glsl_format;

    // If non-opaque, this gives the fourcc associated with the host
    // representation. In particular, this is intended for use with
    // PL_HANDLE_DMA_BUF, where this field will match the DRM format from
    // <drm_fourcc.h>. May be 0, for formats without matching DRM fourcc.
    uint32_t fourcc;

    // If `fourcc` is set, this contains the list of supported drm format
    // modifiers for this format.
    const uint64_t *modifiers;
    int num_modifiers;
};

// Returns whether or not a pl_fmt's components are ordered sequentially
// in memory in the order RGBA.
bool pl_fmt_is_ordered(pl_fmt fmt);

// Returns whether or not a pl_fmt is sampled as a float (e.g. UNORM)
bool pl_fmt_is_float(pl_fmt fmt);

// Helper function to find a format with a given number of components and
// minimum effective precision per component. If `host_bits` is set, then the
// format will always be non-opaque, unpadded, ordered and have exactly this
// bit depth for each component. Finally, all `caps` must be supported.
pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components,
                   int min_depth, int host_bits, enum pl_fmt_caps caps);
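
// For example (an illustrative sketch): looking up a standard 8-bit RGBA
// format suitable for linear sampling:
//
// pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 8, 8,
//                          PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR);
// if (!fmt) { /* no such format; fall back or abort */ }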

// Finds a vertex format for a given configuration. The resulting vertex will
// have a component depth equivalent to the sizeof() of the equivalent host
// type. (e.g. PL_FMT_FLOAT will always have sizeof(float))
pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components);

// Find a format based on its name.
pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name);

// Find a format based on its fourcc.
pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc);

// A generic 'timer query' object. These can be used to measure an
// approximation of the GPU execution time of a given operation. Due to the
// highly asynchronous nature of GPUs, the actual results of any individual
// timer query may be delayed by quite a bit. As such, users should avoid
// trying to pair any particular GPU command with any particular timer query
// result, and only reuse `pl_timer` objects with identical operations. The
// results of timer queries are guaranteed to be in-order, but individual
// queries may be dropped, and some operations might not record timer results
// at all. (For example, if the underlying hardware does not support timer
// queries for a given operation type)
//
// Thread-safety: Unsafe
typedef PL_STRUCT(pl_timer) *pl_timer;

// Creates a new timer object. This may return NULL, for example if the
// implementation does not support timers, but since passing NULL to
// `pl_timer_destroy` and `pl_timer_query` is safe, users generally need not
// concern themselves with handling this.
pl_timer pl_timer_create(pl_gpu gpu);
void pl_timer_destroy(pl_gpu gpu, pl_timer *);

// Queries any results that have been measured since the last execution of
// `pl_timer_query`. There may be more than one result, in which case the user
// should simply call the function again to get the subsequent values. This
// function returns a value of 0 in the event that there are no more
// unprocessed results.
//
// The results are reported in nanoseconds, but the actual precision of the
// timestamp queries may be significantly lower.
//
// Note: Results do not queue up indefinitely. Generally, the implementation
// will only keep track of a small, fixed number of results internally. Make
// sure to include this function as part of your main rendering loop to process
// all of its results, or older results will be overwritten by newer ones.
uint64_t pl_timer_query(pl_gpu gpu, pl_timer);
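
// A typical usage pattern might look like this (a sketch; the timed operation
// and its parameters are arbitrary):
//
// #include <stdio.h>
//
// pl_timer timer = pl_timer_create(gpu); // NULL is fine here, too
// // ... repeatedly run some operation with its `params.timer = timer` ...
// uint64_t ns;
// while ((ns = pl_timer_query(gpu, timer)))
//     printf("operation took %.3f ms\n", ns * 1e-6);
// pl_timer_destroy(gpu, &timer);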

// (Deprecated) Buffer usage type. This defines what types of operations may be
// performed on a buffer. These values are provided merely for backwards
// compatibility, and correspond to enabling the respective usage flags.
enum pl_buf_type {
    PL_BUF_TEX_TRANSFER,  // no extra usage flags
    PL_BUF_UNIFORM,       // enables `uniform`
    PL_BUF_STORAGE,       // enables `storable`
    PL_BUF_TEXEL_UNIFORM, // equivalent to PL_BUF_UNIFORM (when `format` is set)
    PL_BUF_TEXEL_STORAGE, // equivalent to PL_BUF_STORAGE (when `format` is set)
    PL_BUF_TYPE_COUNT,
};

enum pl_buf_mem_type {
    PL_BUF_MEM_AUTO = 0, // use whatever seems most appropriate
    PL_BUF_MEM_HOST,     // try allocating from host memory (RAM)
    PL_BUF_MEM_DEVICE,   // try allocating from device memory (VRAM)
    PL_BUF_MEM_TYPE_COUNT,

    // Note: This distinction only matters for discrete GPUs
};

// Structure describing a buffer.
struct pl_buf_params {
    size_t size;        // size in bytes (must be <= `pl_gpu_limits.max_buf_size`)
    bool host_writable; // contents may be updated via pl_buf_write()
    bool host_readable; // contents may be read back via pl_buf_read()
    bool host_mapped;   // create a persistent, RW mapping (pl_buf.data)

    // May be used as PL_DESC_BUF_UNIFORM or PL_DESC_BUF_TEXEL_UNIFORM.
    // Requires `size <= pl_gpu_limits.max_ubo_size`
    bool uniform;

    // May be used as PL_DESC_BUF_STORAGE or PL_DESC_BUF_TEXEL_STORAGE.
    // Requires `size <= pl_gpu_limits.max_ssbo_size`
    bool storable;

    // May be used as the source of vertex data for `pl_pass_run`.
    bool drawable;

    // Provide a hint for the memory type you want to use when allocating
    // this buffer's memory.
    //
    // Note: Restrictions may apply depending on the usage flags. In
    // particular, allocating buffers with `uniform` or `storable` enabled from
    // non-device memory will almost surely fail.
    enum pl_buf_mem_type memory_type;

    // Setting this to a format with the `PL_FMT_CAP_TEXEL_*` capability allows
    // this buffer to be used as a `PL_DESC_BUF_TEXEL_*`, when `uniform` and
    // `storable` are respectively also enabled.
    pl_fmt format;

    // At most one of `export_handle` and `import_handle` can be set for a
    // buffer.

    // Setting this indicates that the memory backing this buffer should be
    // shared with external APIs. If so, this must be exactly *one* of
    // `pl_gpu.export_caps.buf`.
    enum pl_handle_type export_handle;

    // Setting this indicates that the memory backing this buffer will be
    // imported from an external API. If so, this must be exactly *one* of
    // `pl_gpu.import_caps.buf`.
    enum pl_handle_type import_handle;

    // If the shared memory is being imported, the import handle must be
    // specified here. Otherwise, this is ignored.
    struct pl_shared_mem shared_mem;

    // If non-NULL, the buffer will be created with these contents. Otherwise,
    // the initial data is undefined. Using this does *not* require setting
    // host_writable.
    const void *initial_data;

    // Arbitrary user data. libplacebo does not use this at all.
    void *user_data;

    // Deprecated. Setting a type now effectively just enables some of the
    // buffer usage flags. See `pl_buf_type`. This field will be removed
    // in the future.
    enum pl_buf_type type PL_DEPRECATED;
};

// A generic buffer, which can be used for multiple purposes (texture transfer,
// storage buffer, uniform buffer, etc.)
//
// Note on efficiency: A pl_buf does not necessarily represent a true "buffer"
// object on the underlying graphics API. It may also refer to a sub-slice of
// a larger buffer, depending on the implementation details of the GPU. The
// bottom line is that users do not need to worry about the efficiency of using
// many small pl_buf objects. Having many small pl_bufs, even lots of few-byte
// vertex buffers, is designed to be completely fine.
//
// Thread-safety: Unsafe
typedef const PL_STRUCT(pl_buf) {
    struct pl_buf_params params;
    uint8_t *data; // for persistently mapped buffers, points to the first byte

    // If `params.export_handle` or `params.import_handle` is set, this
    // structure references the shared memory backing this buffer, via the
    // requested handle type.
    //
    // While this buffer is not in an "exported" state, the contents of the
    // memory are undefined. (See: `pl_buf_export`)
    struct pl_shared_mem shared_mem;
} *pl_buf;

// Create a buffer. The type of buffer depends on the parameters. The buffer
// parameters must adhere to the restrictions imposed by the pl_gpu_limits.
// Returns NULL on failure.
//
// For buffers with shared memory, the buffer is considered to be in an
// "exported" state by default, and may be used directly by the external API
// after being created (until the first libplacebo operation on the buffer).
pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params);
void pl_buf_destroy(pl_gpu gpu, pl_buf *buf);
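
// For example (an illustrative sketch): creating a small uniform buffer with
// defined initial contents:
//
// static const float data[4] = {0.0, 0.5, 1.0, 1.0};
// pl_buf buf = pl_buf_create(gpu, &(struct pl_buf_params) {
//     .size         = sizeof(data),
//     .uniform      = true,
//     .initial_data = data,
// });
// if (buf) {
//     // ... bind as a PL_DESC_BUF_UNIFORM descriptor ...
//     pl_buf_destroy(gpu, &buf);
// }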

// This behaves like `pl_buf_create`, but if the buffer already exists and has
// incompatible parameters, it will get destroyed first. A buffer is considered
// "compatible" if it has the same buffer type and texel format, a size greater
// than or equal to the requested size, and it has a superset of the features
// the user requested. After this operation, the contents of the buffer are
// undefined.
//
// Note: Due to its unpredictability, it's not allowed to use this with
// `params->initial_data` being set. Similarly, it's not allowed on a buffer
// with `params->export_handle`, since this may invalidate the corresponding
// external API's handle. Conversely, it *is* allowed on a buffer with
// `params->host_mapped`, and the corresponding `buf->data` pointer *may*
// change as a result of doing so.
//
// Note: If the `user_data` alone changes, this does not trigger a buffer
// recreation. In theory, this can be used to detect when the buffer ended
// up being recreated.
bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params);

// Update the contents of a buffer, starting at a given offset (must be a
// multiple of 4) and up to a given size, with the contents of *data.
//
// This function will block until the buffer is no longer in use. Use
// `pl_buf_poll` to perform non-blocking queries of buffer availability.
//
// Note: This function can incur synchronization overhead, so it shouldn't be
// used in tight loops. If you do need to loop (e.g. to perform a strided
// write), consider using host-mapped buffers, or assembling the data in RAM
// first, before calling this function.
void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset,
                  const void *data, size_t size);

// Read back the contents of a buffer, starting at a given offset, storing the
// data into *dest. Returns whether successful.
//
// This function will block until the buffer is no longer in use. Use
// `pl_buf_poll` to perform non-blocking queries of buffer availability.
bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset,
                 void *dest, size_t size);

// Copy `size` bytes from one buffer to another, reading from and writing to
// the respective offsets.
void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
                 pl_buf src, size_t src_offset, size_t size);

// Initiates a buffer export operation, allowing a buffer to be accessed by an
// external API. This is only valid for buffers with `params.export_handle` or
// `params.import_handle` set. Calling this twice in a row is a harmless no-op.
// Returns whether successful.
//
// There is no corresponding "buffer import" operation; the next libplacebo
// operation that touches the buffer (e.g. pl_tex_upload, but also pl_buf_write
// and pl_buf_read) will implicitly import the buffer back to libplacebo. Users
// must ensure that all pending operations made by the external API are fully
// completed before using it in libplacebo again. (Otherwise, the behaviour
// is undefined)
//
// Please note that this function returning does not mean the memory is
// immediately available as such. In general, it will mark a buffer as "in use"
// in the same way any other buffer operation would, and it is the user's
// responsibility to wait until `pl_buf_poll` returns false before accessing
// the memory from the external API.
//
// In terms of the access performed by this operation, it is not considered a
// "read" or "write" and therefore does not technically conflict with reads or
// writes to the buffer performed by the host (via mapped memory - any use of
// `pl_buf_read` or `pl_buf_write` would defeat the purpose of the export).
// However, restrictions made by the external API may apply that prevent this.
//
// The recommended use pattern is something like this:
//
// while (loop) {
//    pl_buf buf = get_free_buffer(); // or block on pl_buf_poll
//    // write to the buffer using the external API
//    pl_tex_upload(gpu, /* ... buf ... */); // implicitly imports
//    pl_buf_export(gpu, buf);
// }
//
// i.e. perform an external API operation, then use and immediately export the
// buffer in libplacebo, and finally wait until `pl_buf_poll` is false before
// re-using it in the external API. (Or get a new buffer in the meantime)
bool pl_buf_export(pl_gpu gpu, pl_buf buf);

// Returns whether or not a buffer is currently "in use". This can either be
// because of a pending read operation, a pending write operation or a pending
// buffer export operation. Any access to the buffer by external APIs or via
// the host pointer (for host-mapped buffers) is forbidden while a buffer is
// "in use". The only exception to this rule is multiple reads, for example
// reading from a buffer with `pl_tex_upload` while simultaneously reading from
// it using mapped memory.
//
// The `timeout`, specified in nanoseconds, indicates how long to block for
// before returning. If set to 0, this function will never block, and only
// returns the current status of the buffer. The actual precision of the
// timeout may be significantly longer than one nanosecond, and has no upper
// bound. This function does not provide hard latency guarantees. This function
// may also return at any time, even if the buffer is still in use. If the user
// wishes to block until the buffer is definitely no longer in use, the
// recommended usage is:
//
// while (pl_buf_poll(gpu, buf, UINT64_MAX))
//      ; // do nothing
//
// Note: libplacebo operations on buffers are always internally synchronized,
// so this is only needed for host-mapped or externally exported buffers.
// However, it may be used to do non-blocking queries before calling blocking
// functions such as `pl_buf_read`.
//
// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
// synchronized, meaning it can safely be called on a `pl_buf` that is in use
// by another thread.
bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout);

enum pl_tex_sample_mode {
    PL_TEX_SAMPLE_NEAREST,  // nearest neighbour sampling
    PL_TEX_SAMPLE_LINEAR,   // linear filtering, requires PL_FMT_CAP_LINEAR
    PL_TEX_SAMPLE_MODE_COUNT,
};

enum pl_tex_address_mode {
    PL_TEX_ADDRESS_CLAMP,  // clamp the nearest edge texel
    PL_TEX_ADDRESS_REPEAT, // repeat (tile) the texture
    PL_TEX_ADDRESS_MIRROR, // repeat (mirror) the texture
    PL_TEX_ADDRESS_MODE_COUNT,
};

// Structure describing a texture.
struct pl_tex_params {
    int w, h, d;            // physical dimension; unused dimensions must be 0
    pl_fmt format;

    // The following bools describe what operations can be performed. The
    // corresponding pl_fmt capability must be set for every enabled
    // operation type.
    bool sampleable;    // usable as a PL_DESC_SAMPLED_TEX
    bool renderable;    // usable as a render target (pl_pass_run)
                        // (must only be used with 2D textures)
    bool storable;      // usable as a storage image (PL_DESC_STORAGE_IMG)
    bool blit_src;      // usable as a blit source
    bool blit_dst;      // usable as a blit destination
    bool host_writable; // may be updated with pl_tex_upload()
    bool host_readable; // may be fetched with pl_tex_download()

    // Note: For `blit_src`, `blit_dst`, the texture must either be
    // 2-dimensional or `pl_gpu_limits.blittable_1d_3d` must be set.

    // At most one of `export_handle` and `import_handle` can be set for a
    // texture.

    // Setting this indicates that the memory backing this texture should be
    // shared with external APIs. If so, this must be exactly *one* of
    // `pl_gpu.export_caps.tex`.
    enum pl_handle_type export_handle;

    // Setting this indicates that the memory backing this texture will be
    // imported from an external API. If so, this must be exactly *one* of
    // `pl_gpu.import_caps.tex`. Mutually exclusive with `initial_data`.
    enum pl_handle_type import_handle;

    // If the shared memory is being imported, the import handle must be
    // specified here. Otherwise, this is ignored.
    struct pl_shared_mem shared_mem;

    // If non-NULL, the texture will be created with these contents (tightly
    // packed). Using this does *not* require setting host_writable. Otherwise,
    // the initial data is undefined. Mutually exclusive with `import_handle`.
    const void *initial_data;

    // Arbitrary user data. libplacebo does not use this at all.
    void *user_data;

    // Deprecated fields. These are now ignored entirely, and controlled
    // via other mechanisms.
    enum pl_tex_sample_mode sample_mode PL_DEPRECATED;
    enum pl_tex_address_mode address_mode PL_DEPRECATED;
};

static inline int pl_tex_params_dimension(const struct pl_tex_params params)
{
    return params.d ? 3 : params.h ? 2 : 1;
}

enum pl_sampler_type {
    PL_SAMPLER_NORMAL,      // gsampler2D, gsampler3D etc.
    PL_SAMPLER_RECT,        // gsampler2DRect
    PL_SAMPLER_EXTERNAL,    // gsamplerExternalOES
    PL_SAMPLER_TYPE_COUNT,
};

// Conflates the following typical GPU API concepts:
// - texture itself
// - sampler state
// - staging buffers for texture upload
// - framebuffer objects
// - wrappers for swapchain framebuffers
// - synchronization needed for upload/rendering/etc.
//
// Essentially a pl_tex can be anything ranging from a normal texture, a wrapped
// external/real framebuffer, a framebuffer object + texture pair, a mapped
// texture (via pl_hwdec), or other sorts of things that can be sampled from
// and/or rendered to.
//
// Thread-safety: Unsafe
typedef const PL_STRUCT(pl_tex) {
    struct pl_tex_params params;

    // If `params.export_handle` is set, this structure references the shared
    // memory backing this texture, via the requested handle type.
    //
    // While this texture is not in an "exported" state, the contents of the
    // memory are undefined. (See: `pl_tex_export`)
    //
    // Note: Due to Vulkan driver limitations, `shared_mem.drm_format_mod` will
    // currently always be set to DRM_FORMAT_MOD_INVALID. No guarantee can be
    // made about the cross-driver compatibility of textures exported this way.
    struct pl_shared_mem shared_mem;

    // If `params.sampleable` is true, this indicates the correct sampler type
    // to use when sampling from this texture.
    enum pl_sampler_type sampler_type;
} *pl_tex;

// Create a texture (with undefined contents). Returns NULL on failure. This is
// assumed to be an expensive/rare operation, and may need to perform memory
// allocation or framebuffer creation.
pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params);
void pl_tex_destroy(pl_gpu gpu, pl_tex *tex);
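
// For example (an illustrative sketch): creating a sampleable 2D texture with
// tightly packed RGBA contents, where `pixels` points to w*h*4 bytes:
//
// pl_fmt fmt = pl_find_named_fmt(gpu, "rgba8"); // may be NULL
// pl_tex tex = pl_tex_create(gpu, &(struct pl_tex_params) {
//     .w            = w,
//     .h            = h,
//     .format       = fmt,
//     .sampleable   = true,
//     .initial_data = pixels,
// });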

// This works like `pl_tex_create`, but if the texture already exists and has
// incompatible texture parameters, it will get destroyed first. A texture is
// considered "compatible" if it has the same texture format and sample/address
// mode and it supports a superset of the features the user requested.
//
// Even if the texture is not recreated, calling this function will still
// invalidate the contents of the texture. (Note: Because of this,
// `initial_data` may not be used with `pl_tex_recreate`. Doing so is an error)
//
// Note: If the `user_data` alone changes, this does not trigger a texture
// recreation. In theory, this can be used to detect when the texture ended
// up being recreated.
bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params);

// Invalidates the contents of a texture. After this, the contents are fully
// undefined.
void pl_tex_invalidate(pl_gpu gpu, pl_tex tex);

union pl_clear_color {
    float f[4];
    int32_t i[4];
    uint32_t u[4];
};

// Clear the dst texture with the given color (rgba). This is functionally
// identical to a blit operation, which means `dst->params.blit_dst` must be
// set.
void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color);

// Wrapper for `pl_tex_clear_ex` which only works for floating point textures.
void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]);

struct pl_tex_blit_params {
    // The texture to blit from. Must have `params.blit_src` enabled.
    pl_tex src;

    // The texture to blit to. Must have `params.blit_dst` enabled, and a
    // format that is loosely compatible with `src`. This essentially means
    // that they must have the same `internal_size`. Additionally, UINT
    // textures can only be blitted to other UINT textures, and SINT textures
    // can only be blitted to other SINT textures.
    pl_tex dst;

    // The region of the source texture to blit. Must be within the texture
    // bounds of `src`. May be flipped. (Optional)
    struct pl_rect3d src_rc;

    // The region of the destination texture to blit into. Must be within the
    // texture bounds of `dst`. May be flipped. Areas outside of `dst_rc` in
    // `dst` are preserved. (Optional)
    struct pl_rect3d dst_rc;

    // If `src_rc` and `dst_rc` have different sizes, the texture will be
    // scaled using the given texture sampling mode.
    enum pl_tex_sample_mode sample_mode;
};

// Copy a sub-rectangle from one texture to another.
void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params);
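
// For example (an illustrative sketch): scaling the full contents of `src`
// into the top-left quadrant of `dst` with linear filtering:
//
// pl_tex_blit(gpu, &(struct pl_tex_blit_params) {
//     .src = src,
//     .dst = dst,
//     .dst_rc = {.x1 = dst->params.w / 2, .y1 = dst->params.h / 2},
//     .sample_mode = PL_TEX_SAMPLE_LINEAR, // src format needs PL_FMT_CAP_LINEAR
// });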

// Structure describing a texture transfer operation.
struct pl_tex_transfer_params {
    // Texture to transfer to/from. Depending on the type of the operation,
    // this must have params.host_writable (uploads) or params.host_readable
    // (downloads) set, respectively.
    pl_tex tex;

    // Note: Superfluous parameters are ignored, i.e. for a 1D texture, the y
    // and z fields of `rc`, as well as the corresponding strides, are ignored.
    // In all other cases, the stride must be >= the corresponding dimension of
    // `rc`, and the `rc` must be normalized and fully contained within the
    // image dimensions. Missing fields in the `rc` are inferred from the image
    // size. If unset, the strides are inferred from `rc` (that is, it's
    // assumed that the data is tightly packed in the buffer).
    struct pl_rect3d rc;   // region of the texture to transfer
    unsigned int stride_w; // the number of texels per horizontal row (x axis)
    unsigned int stride_h; // the number of texels per vertical column (y axis)

    // An optional timer to report the approximate duration of the texture
    // transfer to. Note that this is only an approximation, since the actual
    // texture transfer may happen entirely in the background (in particular,
    // for implementations with asynchronous transfer capabilities). It's also
    // not guaranteed that all GPUs support this.
    pl_timer timer;

    // An optional callback to fire after the operation completes. If this is
    // specified, then the operation is performed asynchronously. Note that
    // transfers to/from buffers are always asynchronous, even without this
    // field, so it's more useful for `ptr` transfers. (Though it can still be
    // helpful to avoid having to manually poll buffers all the time)
    //
    // When this is *not* specified, uploads from `ptr` are still asynchronous
    // but require a host memcpy, while downloads from `ptr` are blocking. As
    // such, it's recommended to always try using asynchronous texture
    // transfers wherever possible.
    //
    // Note: Requires `pl_gpu_limits.callbacks`
    //
    // Note: Callbacks are implicitly synchronized, meaning that callbacks are
    // guaranteed to never execute concurrently with other callbacks. However,
    // they may execute from any thread that the `pl_gpu` is used on.
    void (*callback)(void *priv);
    void *priv; // arbitrary user data

    // For the data source/target of a transfer operation, there are two valid
    // options:
    //
    // 1. Transferring to/from a buffer: (requires `pl_gpu_limits.buf_transfer`)
    pl_buf buf;         // buffer to use
    size_t buf_offset;  // offset of data within buffer, should be a
                        // multiple of `tex->params.format->texel_size`
    // 2. Transferring to/from host memory directly:
    void *ptr;          // address of data

    // Note: The contents of the memory region / buffer must exactly match the
    // texture format; i.e. there is no explicit conversion between formats.
};

// Upload data to a texture. Returns whether successful.
bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params);

// Download data from a texture. Returns whether successful.
bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params);
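
// For example (an illustrative sketch): uploading a full frame from host
// memory, where each row of `pixels` is padded to `stride` texels:
//
// bool ok = pl_tex_upload(gpu, &(struct pl_tex_transfer_params) {
//     .tex      = tex,    // created with `host_writable` enabled
//     .stride_w = stride, // e.g. computed via pl_optimal_transfer_stride()
//     .ptr      = pixels, // host memory, matching the texture format
// });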

// Returns whether or not a texture is currently "in use". This can either be
// because of a pending read operation, a pending write operation or a pending
// texture export operation. Note that this function's usefulness is extremely
// limited under ordinary circumstances. In practically all cases, textures do
// not need to be directly synchronized by the user, except when interfacing
// with external libraries. This function should NOT, however, be used as a
// crutch to avoid having to implement semaphore-based synchronization. See
// `pl_sync` for a better replacement for external API interop.
//
// A good example of a use case in which this function is required is when
// interoperating with external memory management that needs to know when an
// imported texture is safe to free / reclaim internally, in which case
// semaphores are insufficient because memory management is a host operation.
//
// The `timeout`, specified in nanoseconds, indicates how long to block for
// before returning. If set to 0, this function will never block, and only
// returns the current status of the texture. The actual precision of the
// timeout may be significantly longer than one nanosecond, and has no upper
// bound. This function does not provide hard latency guarantees. This function
// may also return at any time, even if the texture is still in use. If the
// user wishes to block until the texture is definitely no longer in use, the
// recommended usage is:
//
// while (pl_tex_poll(gpu, tex, UINT64_MAX))
//      ; // do nothing
//
// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly
// synchronized, meaning it can safely be called on a `pl_tex` that is in use
// by another thread.
bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout);

// Data type of a shader input variable (e.g. uniform, or UBO member)
enum pl_var_type {
    PL_VAR_INVALID = 0,
    PL_VAR_SINT,        // C: int           GLSL: int/ivec
    PL_VAR_UINT,        // C: unsigned int  GLSL: uint/uvec
    PL_VAR_FLOAT,       // C: float         GLSL: float/vec/mat
    PL_VAR_TYPE_COUNT
};

// Returns the host size (in bytes) of a pl_var_type.
size_t pl_var_type_size(enum pl_var_type type);

// Represents a shader input variable (concrete data, e.g. vector, matrix)
struct pl_var {
    const char *name;       // name as used in the shader
    enum pl_var_type type;
    // The total number of values is given by dim_v * dim_m. For example, a
    // vec2 would have dim_v = 2 and dim_m = 1. A mat3x4 would have dim_v = 4
    // and dim_m = 3.
    int dim_v;              // vector dimension
    int dim_m;              // matrix dimension (number of columns, see below)
    int dim_a;              // array dimension
};

// Helper functions for constructing the most common pl_vars, with names
// matching their corresponding GLSL built-in types.
struct pl_var pl_var_float(const char *name);
struct pl_var pl_var_vec2(const char *name);
struct pl_var pl_var_vec3(const char *name);
struct pl_var pl_var_vec4(const char *name);
struct pl_var pl_var_mat2(const char *name);
struct pl_var pl_var_mat2x3(const char *name);
struct pl_var pl_var_mat2x4(const char *name);
struct pl_var pl_var_mat3(const char *name);
struct pl_var pl_var_mat3x4(const char *name);
struct pl_var pl_var_mat4x2(const char *name);
struct pl_var pl_var_mat4x3(const char *name);
struct pl_var pl_var_mat4(const char *name);
struct pl_var pl_var_int(const char *name);
struct pl_var pl_var_ivec2(const char *name);
struct pl_var pl_var_ivec3(const char *name);
struct pl_var pl_var_ivec4(const char *name);
struct pl_var pl_var_uint(const char *name);
struct pl_var pl_var_uvec2(const char *name);
struct pl_var pl_var_uvec3(const char *name);
struct pl_var pl_var_uvec4(const char *name);

struct pl_named_var {
    const char *glsl_name;
    struct pl_var var;
};

// The same list as above, tagged by name and terminated with a {0} entry.
extern const struct pl_named_var pl_var_glsl_types[];

// Efficient helper function for performing a lookup in the above array.
// Returns NULL if the variable is not legal. Note that the array dimension is
// ignored, since it's usually part of the variable name and not the type name.
const char *pl_var_glsl_type_name(struct pl_var var);

// Converts a pl_fmt to an "equivalent" pl_var. Equivalent in this sense means
// that the pl_var's type will be the same as the vertex's sampled type (e.g.
// PL_FMT_UNORM gets turned into PL_VAR_FLOAT).
struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name);

// Describes the memory layout of a variable, relative to some starting location
// (typically the offset within a uniform/storage/pushconstant buffer)
//
// Note on matrices: All GPUs expect column major matrices, for both buffers and
// input variables. Care needs to be taken to avoid trying to use e.g. a
// pl_matrix3x3 (which is row major) directly as a pl_var_update.data!
//
// In terms of the host layout, a column-major matrix (e.g. matCxR) with C
// columns and R rows is treated like an array vecR[C]. The `stride` here refers
// to the separation between these array elements, i.e. the separation between
// the individual columns.
//
// Visualization of a mat4x3:
//
//       0   1   2   3  <- columns
// 0  [ (A) (D) (G) (J) ]
// 1  [ (B) (E) (H) (K) ]
// 2  [ (C) (F) (I) (L) ]
// ^ rows
//
// Layout in GPU memory: (stride=16, size=60)
//
// [ A B C ] X <- column 0, offset +0
// [ D E F ] X <- column 1, offset +16
// [ G H I ] X <- column 2, offset +32
// [ J K L ]   <- column 3, offset +48
//
// Note the lack of padding on the last column in this example.
// In general: size <= stride * dim_m
//
// C representation: (stride=12, size=48)
//
// { { A, B, C },
//   { D, E, F },
//   { G, H, I },
//   { J, K, L } }
//
// Note on arrays: `stride` represents both the stride between elements of a
// matrix, and the stride between elements of an array. That is, there is no
// distinction between the columns of a matrix and the rows of an array. For
// example, a mat2[10] and a vec2[20] share the same pl_var_layout - the stride
// would be sizeof(vec2) and the size would be sizeof(vec2) * 2 * 10.
//
// For non-array/matrix types, `stride` is equal to `size`.

struct pl_var_layout {
    size_t offset; // the starting offset of the first byte
    size_t stride; // the delta between two elements of an array/matrix
    size_t size;   // the total size of the input
};

// Returns the host layout of an input variable as required for a
// tightly-packed, byte-aligned C data type, given a starting offset.
struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var);

// Returns the GLSL std140 layout of an input variable given a current buffer
// offset, as required for a buffer descriptor of type PL_DESC_BUF_UNIFORM
//
// The normal way to use this function is when calculating the size and offset
// requirements of a uniform buffer in an incremental fashion, to calculate the
// new offset of the next variable in this buffer.
struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var);

// Returns the GLSL std430 layout of an input variable given a current buffer
// offset, as required for a buffer descriptor of type PL_DESC_BUF_STORAGE, and
// for push constants.
struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var);

// Convenience definitions / friendly names for these
#define pl_buf_uniform_layout pl_std140_layout
#define pl_buf_storage_layout pl_std430_layout
#define pl_push_constant_layout pl_std430_layout
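
// For example (an illustrative sketch): computing the std140 size of a small
// uniform buffer incrementally, then copying one member into place using
// `memcpy_layout` (declared just below). `ubo_data` and `mvp_data` are
// hypothetical host allocations.
//
// struct pl_var color = pl_var_vec4("color"), mvp = pl_var_mat4("mvp");
// struct pl_var_layout l0 = pl_std140_layout(0, &color);
// struct pl_var_layout l1 = pl_std140_layout(l0.offset + l0.size, &mvp);
// size_t ubo_size = l1.offset + l1.size;
//
// // copy a column-major float[16] into its std140 slot within the buffer:
// memcpy_layout(ubo_data, l1, mvp_data, pl_var_host_layout(0, &mvp));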

// Like memcpy, but copies bytes from `src` to `dst` in a manner governed by
// the stride and size of `dst_layout` as well as `src_layout`. Also takes
// into account the respective `offset`.
void memcpy_layout(void *dst, struct pl_var_layout dst_layout,
                   const void *src, struct pl_var_layout src_layout);

// Represents a compile-time constant.
struct pl_constant {
    enum pl_var_type type;  // constant data type
    uint32_t id;            // GLSL `constant_id`
    size_t offset;          // byte offset in `constant_data`
};

// Represents a vertex attribute.
struct pl_vertex_attrib {
    const char *name;   // name as used in the shader
    pl_fmt fmt;         // data format (must have PL_FMT_CAP_VERTEX)
    size_t offset;      // byte offset into the vertex struct
    int location;       // vertex location (as used in the shader)
};

// Returns an abstract namespace index for a given descriptor type. This will
// always be a value >= 0 and < PL_DESC_TYPE_COUNT. Implementations can use
// this to figure out which descriptors may share the same value of `binding`.
// Bindings must only be unique for all descriptors within the same namespace.
int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type);

// Access mode of a shader input descriptor.
enum pl_desc_access {
    PL_DESC_ACCESS_READWRITE,
    PL_DESC_ACCESS_READONLY,
    PL_DESC_ACCESS_WRITEONLY,
    PL_DESC_ACCESS_COUNT,
};

// Returns the GLSL syntax for a given access mode (e.g. "readonly").
const char *pl_desc_access_glsl_name(enum pl_desc_access mode);

// Represents a shader descriptor (e.g. texture or buffer binding)
struct pl_desc {
    const char *name;       // name as used in the shader
    enum pl_desc_type type;

    // The binding of this descriptor, as used in the shader. All bindings
    // within a namespace must be unique. (see: pl_desc_namespace)
    int binding;

    // For storage images and storage buffers, this can be used to restrict
    // the type of access that may be performed on the descriptor. Ignored for
    // the other descriptor types (uniform buffers and sampled textures are
    // always read-only).
    enum pl_desc_access access;
};

1105 // Framebuffer blending mode (for raster passes)
1106 enum pl_blend_mode {
1107     PL_BLEND_ZERO,
1108     PL_BLEND_ONE,
1109     PL_BLEND_SRC_ALPHA,
1110     PL_BLEND_ONE_MINUS_SRC_ALPHA,
1111     PL_BLEND_MODE_COUNT,
1112 };
1113 
1114 struct pl_blend_params {
1115     enum pl_blend_mode src_rgb;
1116     enum pl_blend_mode dst_rgb;
1117     enum pl_blend_mode src_alpha;
1118     enum pl_blend_mode dst_alpha;
1119 };
1120 
1121 extern const struct pl_blend_params pl_alpha_overlay; // typical alpha compositing

enum pl_prim_type {
    PL_PRIM_TRIANGLE_LIST,
    PL_PRIM_TRIANGLE_STRIP,
    PL_PRIM_TYPE_COUNT,
};

enum pl_pass_type {
    PL_PASS_INVALID = 0,
    PL_PASS_RASTER,  // vertex+fragment shader
    PL_PASS_COMPUTE, // compute shader (requires `pl_gpu.glsl.compute`)
    PL_PASS_TYPE_COUNT,
};

// Description of a rendering pass. It conflates the following:
//  - GLSL shader(s) and its list of inputs
//  - target parameters (for raster passes)
struct pl_pass_params {
    enum pl_pass_type type;

    // Input variables.
    struct pl_var *variables;
    int num_variables;

    // Input descriptors.
    struct pl_desc *descriptors;
    int num_descriptors;

    // Compile-time specialization constants.
    struct pl_constant *constants;
    int num_constants;

    // Initial data for the specialization constants. Optional. If NULL,
    // specialization constants receive the values from the shader text.
    void *constant_data;

    // Push constant region. Must be a multiple of 4, and <= limits.max_pushc_size.
    size_t push_constants_size;

    // The shader text in GLSL. For PL_PASS_RASTER, this is interpreted
    // as a fragment shader. For PL_PASS_COMPUTE, this is interpreted as
    // a compute shader.
    const char *glsl_shader;

    // Highly implementation-specific byte array storing a compiled version of
    // the same shader. Can be used to speed up pass creation on already
    // known/cached shaders.
    //
    // Note: There are no restrictions on this. Passing an out-of-date cache,
    // passing a cache corresponding to a different program, or passing a cache
    // belonging to a different GPU, are all valid. But obviously, in such cases,
    // there is no benefit in doing so.
    const uint8_t *cached_program;
    size_t cached_program_len;

    // --- type==PL_PASS_RASTER only

    // Describes the interpretation and layout of the vertex data.
    enum pl_prim_type vertex_type;
    struct pl_vertex_attrib *vertex_attribs;
    int num_vertex_attribs;
    size_t vertex_stride;

    // The vertex shader itself.
    const char *vertex_shader;

    // The target texture this render pass is intended to be used with. This
    // doesn't have to come from a real texture - the caller can also invent
    // values or pass a blank struct, as long as `target_dummy.params.format`
    // is set. The format must support `PL_FMT_CAP_RENDERABLE`. If any other
    // fields are set, the GPU may be able to further optimize the render pass
    // for this particular type of texture.
    PL_STRUCT(pl_tex) target_dummy;

    // Target blending mode. If this is NULL, blending is disabled. Otherwise,
    // `target_dummy.params.format` must have PL_FMT_CAP_BLENDABLE.
    const struct pl_blend_params *blend_params;

    // If false, the target's existing contents will be discarded before the
    // pass is run. (Semantically equivalent to calling pl_tex_invalidate
    // before every pl_pass_run, but slightly more efficient)
    //
    // Specifying `blend_params` requires `load_target` to be true.
    bool load_target;
};

// Conflates the following typical GPU API concepts:
// - various kinds of shaders
// - rendering pipelines
// - descriptor sets, uniforms, other bindings
// - all synchronization necessary
// - the current values of all inputs
//
// Thread-safety: Unsafe
typedef const PL_STRUCT(pl_pass) {
    struct pl_pass_params params;
} *pl_pass;

// Compile a shader and create a render pass. This is a rare/expensive
// operation and may take a significant amount of time, even if a cached
// program is used. Returns NULL on failure.
//
// The resulting pl_pass->params.cached_program will be initialized by
// this function to point to a new, valid cached program (if any).
pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params);
void pl_pass_destroy(pl_gpu gpu, pl_pass *pass);
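
// Example (illustrative sketch, continuing the vertex layout example above;
// `gpu`, `fbo_fmt` and the GLSL sources `vert_glsl`/`frag_glsl` are assumed
// to come from elsewhere):
//
//   struct pl_pass_params params = {
//       .type               = PL_PASS_RASTER,
//       .vertex_type        = PL_PRIM_TRIANGLE_STRIP,
//       .vertex_attribs     = attribs,
//       .num_vertex_attribs = 2,
//       .vertex_stride      = sizeof(struct my_vertex),
//       .vertex_shader      = vert_glsl,
//       .glsl_shader        = frag_glsl,
//       .descriptors        = descs,  // e.g. a single PL_DESC_SAMPLED_TEX
//       .num_descriptors    = 1,
//       .target_dummy.params.format = fbo_fmt, // must be PL_FMT_CAP_RENDERABLE
//   };
//
//   pl_pass pass = pl_pass_create(gpu, &params);
//   if (!pass)
//       abort(); // or propagate the error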

struct pl_desc_binding {
    const void *object; // pl_* object with type corresponding to pl_desc_type

    // For PL_DESC_SAMPLED_TEX, this can be used to configure the sampler.
    enum pl_tex_address_mode address_mode;
    enum pl_tex_sample_mode sample_mode;
};

struct pl_var_update {
    int index;        // index into params.variables[]
    const void *data; // pointer to raw byte data corresponding to pl_var_host_layout()
};

struct pl_pass_run_params {
    pl_pass pass;

    // If present, the shader will be re-specialized with the new constants
    // provided. This is a significantly cheaper operation than recompiling a
    // brand new shader, but should still be avoided if possible.
    //
    // Leaving it as NULL re-uses the existing specialization values. Ignored
    // if the shader has no specialization constants. Guaranteed to be a no-op
    // if the values have not changed since the last invocation.
    void *constant_data;

    // This list only contains descriptors/variables which have changed
    // since the previous invocation. All non-mentioned variables implicitly
    // preserve their state from the last invocation.
    struct pl_var_update *var_updates;
    int num_var_updates;

    // This list contains all descriptors used by this pass. It must
    // always be filled, even if the descriptors haven't changed. The order
    // must match that of pass->params.descriptors.
    struct pl_desc_binding *desc_bindings;

    // The push constants for this invocation. This must always be set and
    // fully defined for every invocation if params.push_constants_size > 0.
    void *push_constants;

    // An optional timer to report the approximate runtime of this shader pass
    // invocation to. Note that this is only an approximation, since shaders
    // may overlap their execution times and contend for GPU time.
    pl_timer timer;

    // --- pass->params.type==PL_PASS_RASTER only

    // Target must be a 2D texture, target->params.renderable must be true, and
    // target->params.format must match pass->params.target_dummy.params.format.
    // If the viewport or scissors are left blank, they are inferred from
    // target->params.
    //
    // WARNING: Rendering to a target that is simultaneously being read from
    // by the same shader is undefined behavior. In general, binding the same
    // resource multiple times to the same shader is undefined behavior.
    pl_tex target;
    struct pl_rect2d viewport; // screen space viewport (must be normalized)
    struct pl_rect2d scissors; // target render scissors (must be normalized)

    // Number of vertices to render
    int vertex_count;

    // Vertex data may be provided in one of two forms:
    //
    // 1. Drawing from host memory directly
    const void *vertex_data;
    // 2. Drawing from a vertex buffer (requires `vertex_buf->params.drawable`)
    pl_buf vertex_buf;
    size_t buf_offset;

    // (Optional) Index data may be provided in the form of `uint16_t` indices
    // into the vertex data. These will be used for indexed rendering. Similar
    // to vertex data, this can be provided in two forms:
    // 1. From host memory
    const uint16_t *index_data;
    // 2. From an index buffer (requires `index_buf->params.drawable`)
    pl_buf index_buf;
    size_t index_offset;
    // Note: Drawing from an index buffer requires vertex data to also be
    // present in buffer form, i.e. it's forbidden to mix `index_buf` with
    // `vertex_data` (though vice versa is allowed).

    // --- pass->params.type==PL_PASS_COMPUTE only

    // Number of work groups to dispatch per dimension (X/Y/Z). Must be <= the
    // corresponding index of limits.max_dispatch
    int compute_groups[3];
};

// Execute a render pass.
void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params);
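
// Example (illustrative sketch, running the raster pass created above;
// `fbo` is a renderable 2D texture matching `fbo_fmt`, and `tex` is the
// sampleable texture bound as the pass's only descriptor):
//
//   struct pl_desc_binding bindings[1] = {
//       { .object = tex, .sample_mode = PL_TEX_SAMPLE_LINEAR },
//   };
//
//   struct pl_pass_run_params run = {
//       .pass          = pass,
//       .target        = fbo,
//       .desc_bindings = bindings, // one entry per pass descriptor
//       .vertex_count  = 4,        // a single triangle strip quad
//       .vertex_data   = vertices, // host memory, laid out per `vertex_stride`
//   };
//   pl_pass_run(gpu, &run);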

// A generic synchronization object intended for use with an external API. This
// is not required when solely using libplacebo API functions, as all required
// synchronization is done internally. This comes in the form of a pair of
// semaphores - one to synchronize access in each direction.
//
// Thread-safety: Unsafe
typedef const PL_STRUCT(pl_sync) {
    enum pl_handle_type handle_type;

    // This handle is signalled by the `pl_gpu`, and waited on by the user. It
    // fires when it is safe for the user to access the shared resource.
    union pl_handle wait_handle;

    // This handle is signalled by the user, and waited on by the `pl_gpu`. It
    // must fire when the user has finished accessing the shared resource.
    union pl_handle signal_handle;
} *pl_sync;

// Create a synchronization object. Returns NULL on failure.
//
// `handle_type` must be exactly *one* of the handle types supported by
// `pl_gpu.export_caps.sync`, and indicates which type of handle to generate
// for sharing this sync object.
pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type);

// Destroy a `pl_sync`. Note that this invalidates the externally imported
// semaphores. Users should therefore make sure that all operations that
// wait on or signal either of the semaphores have been fully submitted and
// processed by the external API before destroying the `pl_sync`.
//
// Despite this, it's safe to destroy a `pl_sync` if the only pending
// operations that involve it are internal to libplacebo.
void pl_sync_destroy(pl_gpu gpu, pl_sync *sync);

// Initiates a texture export operation, allowing a texture to be accessed by
// an external API. Returns whether successful. After this operation
// successfully returns, it is guaranteed that `sync->wait_handle` will
// eventually be signalled. For APIs where this is relevant, the image layout
// should be specified as "general", e.g. `GL_LAYOUT_GENERAL_EXT` for OpenGL.
//
// There is no corresponding "import" operation - the next operation that uses
// a texture will implicitly import the texture. Valid API usage requires that
// the user *must* submit a semaphore signal operation on `sync->signal_handle`
// before doing so. Not doing so is undefined behavior and may very well
// deadlock the calling process and/or the graphics card!
//
// Note that despite this restriction, it is always valid to call
// `pl_tex_destroy`, even if the texture is in an exported state, without
// having to signal the corresponding sync object first.
bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync);
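
// Example (illustrative sketch of the expected call sequence, assuming
// `PL_HANDLE_FD` is present in `pl_gpu.export_caps.sync`; the external API
// side is only hinted at in comments):
//
//   pl_sync sync = pl_sync_create(gpu, PL_HANDLE_FD);
//   if (sync && pl_tex_export(gpu, tex, sync)) {
//       // 1. wait on `sync->wait_handle` in the external API
//       // 2. access the texture externally
//       // 3. submit a signal operation on `sync->signal_handle`
//       // 4. only then use `tex` with libplacebo again
//   }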

// This is semantically a no-op, but it provides a hint that you want to flush
// any partially queued up commands and begin execution. There is normally no
// need to call this, because queued commands will always be implicitly flushed
// whenever necessary to make forward progress on commands like `pl_buf_poll`,
// or when submitting a frame to a swapchain for display. In fact, calling this
// function can negatively impact performance, because some GPUs rely on being
// able to re-order and modify queued commands in order to enable optimizations
// retroactively.
//
// The only time this might be beneficial to call explicitly is if you're doing
// lots of offline processing, i.e. you aren't rendering to a swapchain but to
// textures that you download from again. In that case you should call this
// function after each "work item" to ensure good parallelism between them.
//
// It's worth noting that this function may block if you're over-feeding the
// GPU without waiting for existing results to finish.
void pl_gpu_flush(pl_gpu gpu);
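
// Example (illustrative sketch; `process_item` is a hypothetical helper
// that issues the GPU commands for one work item):
//
//   for (int i = 0; i < num_items; i++) {
//       process_item(gpu, i);
//       pl_gpu_flush(gpu); // begin executing item `i` while preparing `i+1`
//   }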

// This is like `pl_gpu_flush` but also blocks until the GPU is fully idle
// before returning. Using this in your rendering loop is strongly discouraged,
// and almost never the right solution. The intended use case is for deinit
// logic, where users may want to force all pending GPU operations to finish
// so they can clean up their state more easily.
//
// After this operation is called, it's guaranteed that all pending buffer
// operations are complete - i.e. `pl_buf_poll` is guaranteed to return false.
// It's also guaranteed that any outstanding timer query results are available.
//
// Note: If you only care about buffer operations, you can accomplish this more
// easily by using `pl_buf_poll` with the timeout set to `UINT64_MAX`. But if
// you have many buffers it may be more convenient to call this function
// instead. The difference is that this function will also affect e.g. renders
// to a `pl_swapchain`.
void pl_gpu_finish(pl_gpu gpu);
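
// Example (illustrative sketch of typical deinit logic, using the
// destruction functions declared throughout this header):
//
//   pl_gpu_finish(gpu);           // all pending GPU work is now complete
//   pl_pass_destroy(gpu, &pass);
//   pl_tex_destroy(gpu, &tex);
//   pl_buf_destroy(gpu, &buf);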

// Returns true if the GPU is considered to be in a "failed" state, which
// during normal operation is typically the result of things like the device
// being lost (due to e.g. power management).
//
// If this returns true, users *should* destroy and recreate the `pl_gpu`,
// including all associated resources, via the appropriate mechanism.
bool pl_gpu_is_failed(pl_gpu gpu);

PL_API_END

#endif // LIBPLACEBO_GPU_H_