/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef TU_PRIVATE_H
#define TU_PRIVATE_H

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#else
#define VG(x) ((void)0)
#endif
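
/* A minimal sketch of how the VG() wrapper is typically used (illustrative,
 * not a call site from this file): wrap Valgrind client requests so they
 * compile away when HAVE_VALGRIND is not set, e.g. telling memcheck that a
 * fresh, driver-owned mapping holds undefined data:
 *
 *    VG(VALGRIND_MAKE_MEM_UNDEFINED(bo->map, bo->size));
 */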

#define MESA_LOG_TAG "TU"

#include "c11/threads.h"
#include "util/rounding.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/sparse_array.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/xmlconfig.h"
#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
#include "vk_device.h"
#include "vk_dispatch_table.h"
#include "vk_extensions.h"
#include "vk_instance.h"
#include "vk_log.h"
#include "vk_physical_device.h"
#include "vk_shader_module.h"
#include "wsi_common.h"

#include "ir3/ir3_compiler.h"
#include "ir3/ir3_shader.h"

#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "a6xx.xml.h"
#include "fdl/freedreno_layout.h"
#include "common/freedreno_dev_info.h"
#include "perfcntrs/freedreno_perfcntr.h"

#include "tu_descriptor_set.h"
#include "tu_autotune.h"
#include "tu_util.h"
#include "tu_perfetto.h"

/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
struct wl_display;
typedef struct xcb_connection_t xcb_connection_t;
typedef uint32_t xcb_visualid_t;
typedef uint32_t xcb_window_t;

#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vulkan.h>

#include "tu_entrypoints.h"

#include "vk_format.h"
#include "vk_image.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_queue.h"
#include "vk_object.h"
#include "vk_sync.h"
#include "vk_fence.h"
#include "vk_semaphore.h"
#include "vk_drm_syncobj.h"
#include "vk_sync_timeline.h"

#define MAX_VBS 32
#define MAX_VERTEX_ATTRIBS 32
#define MAX_RTS 8
#define MAX_VSC_PIPES 32
#define MAX_VIEWPORTS 16
#define MAX_VIEWPORT_SIZE (1 << 14)
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
#define MAX_PUSH_CONSTANTS_SIZE 128
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
#define MAX_DYNAMIC_BUFFERS_SIZE \
   (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
      A6XX_TEX_CONST_DWORDS
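
/* With the limits above this works out to (16 + 2 * 8) * 16 = 512 dwords;
 * the 2x factor on storage buffers is presumably because each dynamic
 * storage buffer consumes a second descriptor slot.
 */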

#define TU_MAX_DRM_DEVICES 8
#define MAX_VIEWS 16
#define MAX_BIND_POINTS 2 /* compute + graphics */
/* The Qualcomm driver exposes 0x20000058 */
#define MAX_STORAGE_BUFFER_RANGE 0x20000000
/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
 * expose the same maximum range.
 * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
 * range might be higher.
 */
#define MAX_UNIFORM_BUFFER_RANGE 0x10000

#define A6XX_TEX_CONST_DWORDS 16
#define A6XX_TEX_SAMP_DWORDS 4

#define COND(bool, val) ((bool) ? (val) : 0)
#define BIT(bit) (1u << (bit))

/* Whenever we generate an error, pass it through this function. Useful for
 * debugging, where we can break on it. Only call at error site, not when
 * propagating errors. Might be useful to plug in a stack trace here.
 */

struct tu_instance;

VkResult
__vk_startup_errorf(struct tu_instance *instance,
                    VkResult error,
                    bool force_print,
                    const char *file,
                    int line,
                    const char *format,
                    ...) PRINTFLIKE(6, 7);

/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
 * build.
 */
#define vk_startup_errorf(instance, error, format, ...) \
   __vk_startup_errorf(instance, error, \
                       instance->debug_flags & TU_DEBUG_STARTUP, \
                       __FILE__, __LINE__, format, ##__VA_ARGS__)
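
/* Illustrative usage (a sketch, not a call site from this driver):
 *
 *    return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
 *                             "device %s is unsupported", path);
 *
 * With TU_DEBUG=startup set, the message is printed before the error is
 * returned.
 */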

void
__tu_finishme(const char *file, int line, const char *format, ...)
   PRINTFLIKE(3, 4);

/**
 * Print a FINISHME message, including its source location.
 */
#define tu_finishme(format, ...) \
   do { \
      static bool reported = false; \
      if (!reported) { \
         __tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \
         reported = true; \
      } \
   } while (0)

#define tu_stub() \
   do { \
      tu_finishme("stub %s", __func__); \
   } while (0)
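
/* Illustrative usage (a sketch): the static `reported` flag above means a
 * hot path can flag missing functionality without flooding the log:
 *
 *    tu_finishme("unhandled tiling mode %u", tile_mode);
 */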

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   VkDeviceSize used __attribute__ ((aligned (8)));
};
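
/* The 8-byte alignment of `used` matters because it is updated atomically as
 * memory is allocated and freed; a sketch of the expected pattern
 * (illustrative, using the p_atomic helpers from util/u_atomic.h):
 *
 *    p_atomic_add(&heap->used, size);   // on allocation
 *    p_atomic_add(&heap->used, -size);  // on free
 */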

uint64_t
tu_get_system_heap_size(void);

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   uint32_t gmem_size;
   uint64_t gmem_base;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;

   struct fd_dev_id dev_id;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* Address space and global fault count for this local_fd with DRM backend */
   uint64_t fault_count;

   /* This is the driver's on-disk cache, used as a fallback as opposed to
    * the pipeline cache defined by apps.
    */
   struct disk_cache *disk_cache;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];
};

enum tu_debug_flags
{
   TU_DEBUG_STARTUP = 1 << 0,
   TU_DEBUG_NIR = 1 << 1,
   TU_DEBUG_NOBIN = 1 << 3,
   TU_DEBUG_SYSMEM = 1 << 4,
   TU_DEBUG_FORCEBIN = 1 << 5,
   TU_DEBUG_NOUBWC = 1 << 6,
   TU_DEBUG_NOMULTIPOS = 1 << 7,
   TU_DEBUG_NOLRZ = 1 << 8,
   TU_DEBUG_PERFC = 1 << 9,
   TU_DEBUG_FLUSHALL = 1 << 10,
   TU_DEBUG_SYNCDRAW = 1 << 11,
   TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
   TU_DEBUG_GMEM = 1 << 13,
   TU_DEBUG_RAST_ORDER = 1 << 14,
};

struct tu_instance
{
   struct vk_instance vk;

   uint32_t api_version;
   int physical_device_count;
   struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   enum tu_debug_flags debug_flags;
};

VkResult
tu_wsi_init(struct tu_physical_device *physical_device);
void
tu_wsi_finish(struct tu_physical_device *physical_device);

bool
tu_instance_extension_supported(const char *name);
uint32_t
tu_physical_device_api_version(struct tu_physical_device *dev);
bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
                                       const char *name);

struct cache_entry;

struct tu_pipeline_cache
{
   struct vk_object_base base;

   struct tu_device *device;
   pthread_mutex_t mutex;

   uint32_t total_size;
   uint32_t table_size;
   uint32_t kernel_count;
   struct cache_entry **hash_table;
   bool modified;

   VkAllocationCallbacks alloc;
};

struct tu_pipeline_key
{
};


/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

/* Keep tu_syncobj until porting to common code for kgsl too */
#ifdef TU_USE_KGSL
struct tu_syncobj;
#endif
struct tu_u_trace_syncobj;

/* Define the tu_timeline_sync type based on a drm syncobj, used as a point
 * type for vk_sync_timeline; the handling logic is mostly copied from
 * anv_bo_sync, since it can be used in a similar way to anv.
 */
enum tu_timeline_sync_state {
   /** Indicates that this is a new (or newly reset) fence */
   TU_TIMELINE_SYNC_STATE_RESET,

   /** Indicates that this fence has been submitted to the GPU but is still
    * (as far as we know) in use by the GPU.
    */
   TU_TIMELINE_SYNC_STATE_SUBMITTED,

   TU_TIMELINE_SYNC_STATE_SIGNALED,
};

struct tu_timeline_sync {
   struct vk_sync base;

   enum tu_timeline_sync_state state;
   uint32_t syncobj;
};

struct tu_queue
{
   struct vk_queue vk;

   struct tu_device *device;

   uint32_t msm_queue_id;
   int fence;
};

struct tu_bo
{
   uint32_t gem_handle;
   uint64_t size;
   uint64_t iova;
   void *map;

#ifndef TU_USE_KGSL
   int32_t refcnt;
   uint32_t bo_list_idx;
#endif

   bool implicit_sync : 1;
};

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 1024

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, starts on a 32-byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   ALIGN16 uint32_t cs_indirect_xyz[3];

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
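
/* Usage sketch (illustrative): these macros let command streams reference a
 * tu6_global field by GPU address, e.g. pointing an indirect dispatch at the
 * stored workgroup counts:
 *
 *    tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz));
 */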

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct tu_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
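
   /* A sketch of the sizing scheme assumed above (illustrative): slot i
    * holds a BO of size 1 << (i + MIN_SCRATCH_BO_SIZE_LOG2), so a request
    * maps onto a slot via
    *
    *    unsigned log2 = MAX2(util_logbase2_ceil64(size), MIN_SCRATCH_BO_SIZE_LOG2);
    *    unsigned index = log2 - MIN_SCRATCH_BO_SIZE_LOG2;
    */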

   struct tu_bo *global_bo;

   uint32_t implicit_sync_bo_count;

   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *bo_list;
   /* map bo handles to bo list index: */
   uint32_t bo_count, bo_list_size;
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The BOs in this map all have a refcnt acting as the reference counter;
    * only self-imported BOs will ever have a refcnt > 1.
    */
   struct util_sparse_array bo_map;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs *perfcntrs_pass_cs;
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

#ifdef ANDROID
   const void *gralloc;
   enum {
      TU_GRALLOC_UNKNOWN,
      TU_GRALLOC_CROS,
      TU_GRALLOC_OTHER,
   } gralloc_type;
#endif

   uint32_t submit_count;

   struct u_trace_context trace_context;

#ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
#endif
};

void tu_init_clear_blit_shaders(struct tu_device *dev);

void tu_destroy_clear_blit_shaders(struct tu_device *dev);

VkResult
tu_device_submit_deferred_locked(struct tu_device *dev);

VkResult
tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

VkResult
tu_device_check_status(struct vk_device *vk_device);

enum tu_bo_alloc_flags
{
   TU_BO_ALLOC_NO_FLAGS = 0,
   TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
   TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
};

VkResult
tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
               enum tu_bo_alloc_flags flags);
VkResult
tu_bo_init_dmabuf(struct tu_device *dev,
                  struct tu_bo **bo,
                  uint64_t size,
                  int fd);
int
tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
void
tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);

static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
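
/* Usage sketch (illustrative; error handling trimmed): since the same BO is
 * returned for similar sizes, treat the result as borrowed and consume it
 * before the next tu_get_scratch_bo() call:
 *
 *    struct tu_bo *scratch;
 *    if (tu_get_scratch_bo(dev, size, &scratch) == VK_SUCCESS) {
 *       // emit GPU work that writes and then reads scratch->iova
 *    }
 */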

struct tu_cs_entry
{
   /* No ownership */
   const struct tu_bo *bo;

   uint32_t size;
   uint32_t offset;
};

struct tu_cs_memory {
   uint32_t *map;
   uint64_t iova;
};

struct tu_draw_state {
   uint64_t iova : 48;
   uint32_t size : 16;
};
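
/* The bitfields above pack a draw state group into a single 64-bit value:
 * 48 bits are enough for the iova under the address-space limits noted
 * elsewhere in this file, and size is a dword count small enough for the
 * count field of a CP_SET_DRAW_STATE packet.
 */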

enum tu_dynamic_state
{
   /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
   TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
   TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
   TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
   TU_DYNAMIC_STATE_VB_STRIDE,
   TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
   TU_DYNAMIC_STATE_COUNT,
   /* no associated draw state: */
   TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
   TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
   /* re-use the line width enum as it uses GRAS_SU_CNTL: */
   TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
};

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_PROGRAM,
   TU_DRAW_STATE_PROGRAM_BINNING,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_VI,
   TU_DRAW_STATE_VI_BINNING,
   TU_DRAW_STATE_RAST,
   TU_DRAW_STATE_BLEND,
   TU_DRAW_STATE_SHADER_GEOM_CONST,
   TU_DRAW_STATE_FS_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,
   TU_DRAW_STATE_PRIM_MODE_SYSMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

enum tu_cs_mode
{

   /*
    * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
    * is full. tu_cs_begin must be called before command packet emission and
    * tu_cs_end must be called after.
    *
    * This mode may create multiple entries internally. The entries must be
    * submitted together.
    */
   TU_CS_MODE_GROW,

   /*
    * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
    * fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no
    * effect on it.
    *
    * This mode does not create any entry or any BO.
    */
   TU_CS_MODE_EXTERNAL,

   /*
    * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
    * command packet emission. tu_cs_begin_sub_stream must be called to get a
    * sub-stream to emit command packets to. When done with the sub-stream,
    * tu_cs_end_sub_stream must be called.
    *
    * This mode does not create any entry internally.
    */
   TU_CS_MODE_SUB_STREAM,
};
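
/* Usage sketch for TU_CS_MODE_GROW (illustrative; error handling omitted):
 *
 *    tu_cs_begin(cs);
 *    tu_cs_emit_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
 *    tu_cs_end(cs);
 *
 * after which cs->entries[0..entry_count) must be submitted together.
 */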

struct tu_cs
{
   uint32_t *start;
   uint32_t *cur;
   uint32_t *reserved_end;
   uint32_t *end;

   struct tu_device *device;
   enum tu_cs_mode mode;
   uint32_t next_bo_size;

   struct tu_cs_entry *entries;
   uint32_t entry_count;
   uint32_t entry_capacity;

   struct tu_bo **bos;
   uint32_t bo_count;
   uint32_t bo_capacity;

   /* state for cond_exec_start/cond_exec_end */
   uint32_t cond_flags;
   uint32_t *cond_dwords;
};

struct tu_device_memory
{
   struct vk_object_base base;

   struct tu_bo *bo;
};

struct tu_descriptor_range
{
   uint64_t va;
   uint32_t size;
};

struct tu_descriptor_set
{
   struct vk_object_base base;

   /* Link to descriptor pool's desc_sets list. */
   struct list_head pool_link;

   struct tu_descriptor_set_layout *layout;
   struct tu_descriptor_pool *pool;
   uint32_t size;

   uint64_t va;
   uint32_t *mapped_ptr;

   uint32_t *dynamic_descriptors;
};

struct tu_descriptor_pool_entry
{
   uint32_t offset;
   uint32_t size;
   struct tu_descriptor_set *set;
};

struct tu_descriptor_pool
{
   struct vk_object_base base;

   struct tu_bo *bo;
   uint64_t current_offset;
   uint64_t size;

   uint8_t *host_memory_base;
   uint8_t *host_memory_ptr;
   uint8_t *host_memory_end;
   uint8_t *host_bo;

   struct list_head desc_sets;

   uint32_t entry_count;
   uint32_t max_entry_count;
   struct tu_descriptor_pool_entry entries[0];
};

struct tu_descriptor_update_template_entry
{
   VkDescriptorType descriptor_type;

   /* The number of descriptors to update */
   uint32_t descriptor_count;

   /* Into mapped_ptr or dynamic_descriptors, in units of the respective array
    */
   uint32_t dst_offset;

   /* In dwords. Not valid/used for dynamic descriptors */
   uint32_t dst_stride;

   uint32_t buffer_offset;

   /* Only valid for combined image samplers and samplers */
   uint16_t has_sampler;

   /* In bytes */
   size_t src_offset;
   size_t src_stride;

   /* For push descriptors */
   const struct tu_sampler *immutable_samplers;
};

struct tu_descriptor_update_template
{
   struct vk_object_base base;

   uint32_t entry_count;
   VkPipelineBindPoint bind_point;
   struct tu_descriptor_update_template_entry entry[0];
};

struct tu_buffer
{
   struct vk_object_base base;

   VkDeviceSize size;

   VkBufferUsageFlags usage;
   VkBufferCreateFlags flags;

   struct tu_bo *bo;
   uint64_t iova;
};

const char *
tu_get_debug_option_name(int id);

const char *
tu_get_perftest_option_name(int id);

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_VB_STRIDE = BIT(1),
   TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
   TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
   TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
   TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
   TU_CMD_DIRTY_LRZ = BIT(8),
   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
   TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
   TU_CMD_DIRTY_VIEWPORTS = BIT(11),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(12)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* Starting with a6xx, the pipeline is split into several "clusters" (really
 * pipeline stages). Each stage has its own pair of register banks and can
 * switch them independently, so that earlier stages can run ahead of later
 * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
 * the same time.
 *
 * As a result of this, we need to insert a WFI when an earlier stage depends
 * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
 * pending WFI's to complete before starting, and usually before reading
 * indirect params even, so a WFI also acts as a full "pipeline stall".
 *
 * Note, the names of the stages come from CLUSTER_* in devcoredump. We
 * include all the stages for completeness, even ones which do not read/write
 * anything.
 */

enum tu_stage {
   /* This doesn't correspond to a cluster, but we need it for tracking
    * indirect draw parameter reads etc.
    */
   TU_STAGE_CP,

   /* - Fetch index buffer
    * - Fetch vertex attributes, dispatch VS
    */
   TU_STAGE_FE,

   /* Execute all geometry stages (VS thru GS) */
   TU_STAGE_SP_VS,

   /* Write to VPC, do primitive assembly. */
   TU_STAGE_PC_VS,

   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
    * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
    * early depth testing is enabled before dispatching fragments? However
    * GRAS reads and writes LRZ directly.
    */
   TU_STAGE_GRAS,

   /* Execute FS */
   TU_STAGE_SP_PS,

   /* - Fragment tests
    * - Write color/depth
    * - Streamout writes (???)
    * - Varying interpolation (???)
    */
   TU_STAGE_PS,
};
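
/* Example of the hazard tu_stage models (illustrative): a draw that renders
 * to a buffer (TU_STAGE_PS) followed by a draw that fetches vertex data from
 * that buffer (TU_STAGE_FE) is an earlier-stage-depends-on-later-stage
 * dependency, so a WFI (plus any required cache flushes) must be inserted
 * between the two draws.
 */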

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE,
};

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   enum tu_cmd_flush_bits pending_flush_bits;
   /* Pending flushes */
   enum tu_cmd_flush_bits flush_bits;
};

enum tu_lrz_force_disable_mask {
   TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
   TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
};

enum tu_lrz_direction {
   TU_LRZ_UNKNOWN,
   /* Depth func less/less-than: */
   TU_LRZ_LESS,
   /* Depth func greater/greater-than: */
   TU_LRZ_GREATER,
};

struct tu_lrz_pipeline
{
   uint32_t force_disable_mask;
   bool fs_has_kill;
   bool force_late_z;
   bool early_fragment_tests;
};

struct tu_lrz_state
{
   /* Depth/Stencil image currently in use for LRZ */
   struct tu_image *image;
   bool valid : 1;
   enum tu_lrz_direction prev_direction;
};

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
};

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_pipeline *pipeline;
   struct tu_pipeline *compute_pipeline;

   /* Vertex buffers, viewports, and scissors.
    * The states for these can be updated partially, so we need to save them
    * to be able to emit a complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
      uint32_t stride;
   } vb[MAX_VBS];
   VkViewport viewport[MAX_VIEWPORTS];
   VkRect2D scissor[MAX_SCISSORS];
   uint32_t max_viewport, max_scissor;

   /* for dynamic states that can't be emitted directly */
   uint32_t dynamic_stencil_mask;
   uint32_t dynamic_stencil_wrmask;
   uint32_t dynamic_stencil_ref;

   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
   uint32_t pc_raster_cntl, vpc_unknown_9107;
   enum pc_di_primtype primtype;
   bool primitive_restart_enable;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const[2];
   struct tu_draw_state desc_sets;

   struct tu_draw_state vs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned,
    * there is an extra offset to deal with when it is
    * unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   VkRect2D render_area;

   const struct tu_image_view **attachments;

   bool xfb_used;
   bool has_tess;
   bool tessfactor_addr_set;
   bool has_subpass_predication;
   bool predication_active;
   bool disable_gmem;
   enum a5xx_line_mode line_mode;
   bool z_negative_one_to_one;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *       cost += num_frag_outputs;
    *       if (blend_enabled)
    *          cost += num_blend_enabled;
    *       if (depth_test_enabled)
    *          cost++;
    *       if (depth_write_enabled)
    *          cost++;
    *    }
    *
    * The idea is that each sample-passed minimally does one write
    * per MRT. If blend is enabled, the hw will additionally do
    * a framebuffer read per sample-passed (for each MRT with blend
    * enabled). If depth-test is enabled, the hw will additionally do
    * a depth buffer read. If depth-write is enabled, the hw will
    * additionally do a depth buffer write.
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t total_drawcalls_cost;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
};

struct tu_cmd_pool
{
   struct vk_command_pool vk;

   struct list_head cmd_buffers;
   struct list_head free_cmd_buffers;
};

enum tu_cmd_buffer_status
{
   TU_CMD_BUFFER_STATUS_INVALID,
   TU_CMD_BUFFER_STATUS_INITIAL,
   TU_CMD_BUFFER_STATUS_RECORDING,
   TU_CMD_BUFFER_STATUS_EXECUTABLE,
   TU_CMD_BUFFER_STATUS_PENDING,
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct tu_cmd_pool *pool;
   struct list_head pool_link;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer *autotune_buffer;

   VkCommandBufferUsageFlags usage_flags;
   enum tu_cmd_buffer_status status;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   VkResult record_result;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
};

/* Temporary struct for tracking a register state to be written, used by
 * a6xx-pack.h and tu_cs_emit_regs()
 */
struct tu_reg_value {
   uint32_t reg;
   uint64_t value;
   bool is_address;
   struct tu_bo *bo;
   bool bo_write;
   uint32_t bo_offset;
   uint32_t bo_shift;
};


void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                                    struct tu_cs *cs);

void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

struct tu_event
{
   struct vk_object_base base;
   struct tu_bo *bo;
};

struct tu_push_constant_range
{
   uint32_t lo;
   uint32_t count;
};

struct tu_shader
{
   struct ir3_shader *ir3_shader;

   struct tu_push_constant_range push_consts;
   uint8_t active_desc_sets;
   bool multi_pos_output;
};

bool
tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
                       struct tu_device *dev);

nir_shader *
tu_spirv_to_nir(struct tu_device *dev,
                void *mem_ctx,
                const VkPipelineShaderStageCreateInfo *stage_info,
                gl_shader_stage stage);

struct tu_shader *
tu_shader_create(struct tu_device *dev,
                 nir_shader *nir,
                 const VkPipelineShaderStageCreateInfo *stage_info,
                 unsigned multiview_mask,
                 struct tu_pipeline_layout *layout,
                 const VkAllocationCallbacks *alloc);

void
tu_shader_destroy(struct tu_device *dev,
                  struct tu_shader *shader,
                  const VkAllocationCallbacks *alloc);

struct tu_program_descriptor_linkage
{
   struct ir3_const_state const_state;

   uint32_t constlen;

   struct tu_push_constant_range push_consts;
};

struct tu_pipeline_executable {
   gl_shader_stage stage;

   struct ir3_info stats;
   bool is_binning;

   char *nir_from_spirv;
   char *nir_final;
   char *disasm;
};

struct tu_pipeline
{
   struct vk_object_base base;

   struct tu_cs cs;

   /* Separate BO for private memory since it should be GPU-writable */
   struct tu_bo *pvtmem_bo;

   struct tu_pipeline_layout *layout;

   bool need_indirect_descriptor_sets;
   VkShaderStageFlags active_stages;
   uint32_t active_desc_sets;

   /* mask of enabled dynamic states
    * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
    */
   uint32_t dynamic_state_mask;
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];

   /* for dynamic states which use the same register: */
   uint32_t gras_su_cntl, gras_su_cntl_mask;
   uint32_t rb_depth_cntl, rb_depth_cntl_mask;
   uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
   uint32_t pc_raster_cntl, pc_raster_cntl_mask;
   uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
   uint32_t stencil_wrmask;

   bool rb_depth_cntl_disable;

   enum a5xx_line_mode line_mode;

   /* draw states for the pipeline */
   struct tu_draw_state load_state, rast_state, blend_state;
   struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;

   /* for vertex buffers state */
   uint32_t num_vbs;

   struct
   {
      struct tu_draw_state config_state;
      struct tu_draw_state state;
      struct tu_draw_state binning_state;

      struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
   } program;

   struct
   {
      struct tu_draw_state state;
      struct tu_draw_state binning_state;
   } vi;

   struct
   {
      enum pc_di_primtype primtype;
      bool primitive_restart;
   } ia;

   struct
   {
      uint32_t patch_type;
      uint32_t param_stride;
      bool upper_left_domain_origin;
   } tess;

   struct
   {
      uint32_t local_size[3];
      uint32_t subgroup_size;
   } compute;

   bool provoking_vertex_last;

   struct tu_lrz_pipeline lrz;

   /* In other words - framebuffer fetch support */
   bool raster_order_attachment_access;
   bool subpass_feedback_loop_ds;

   bool z_negative_one_to_one;

   /* Base drawcall cost for sysmem vs gmem autotuner */
   uint8_t drawcall_base_cost;

   void *executables_mem_ctx;
   /* tu_pipeline_executable */
   struct util_dynarray executables;
};

void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
                  bool z_negative_one_to_one);

void
tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);

void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image, const VkClearValue *value);

void
tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);

void
tu6_emit_depth_bias(struct tu_cs *cs,
                    float constant_factor,
                    float clamp,
                    float slope_factor);

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   enum a5xx_line_mode line_mode);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

struct tu_pvtmem_config {
   uint64_t iova;
   uint32_t per_fiber_size;
   uint32_t per_sp_size;
   bool per_wave;
};

void
tu6_emit_xs_config(struct tu_cs *cs,
                   gl_shader_stage stage,
                   const struct ir3_shader_variant *xs);

void
tu6_emit_xs(struct tu_cs *cs,
            gl_shader_stage stage,
            const struct ir3_shader_variant *xs,
            const struct tu_pvtmem_config *pvtmem,
            uint64_t binary_iova);

void
tu6_emit_vpc(struct tu_cs *cs,
             const struct ir3_shader_variant *vs,
             const struct ir3_shader_variant *hs,
             const struct ir3_shader_variant *ds,
             const struct ir3_shader_variant *gs,
             const struct ir3_shader_variant *fs,
             uint32_t patch_control_points);

void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);

struct tu_image_view;

void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *src,
                  const struct tu_image_view *dst,
                  uint32_t layer_mask,
                  uint32_t layers,
                  const VkRect2D *rect);

void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkRenderPassBeginInfo *info);

void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkRenderPassBeginInfo *info);

void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool force_load);

/* expose this function to be able to emit load without checking LOAD_OP */
void
tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);

/* note: gmem store can also resolve */
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a);

struct tu_native_format
{
   enum a6xx_format fmt : 8;
   enum a3xx_color_swap swap : 8;
   enum a6xx_tile_mode tile_mode : 8;
};

enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
bool tu6_format_vtx_supported(VkFormat format);
struct tu_native_format tu6_format_vtx(VkFormat format);
bool tu6_format_color_supported(enum pipe_format format);
struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode);
bool tu6_format_texture_supported(enum pipe_format format);
struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode);

static inline enum a6xx_format
tu6_base_format(enum pipe_format format)
{
   /* note: tu6_format_color doesn't care about tiling for .fmt field */
   return tu6_format_color(format, TILE6_LINEAR).fmt;
}

struct tu_image
{
   struct vk_object_base base;

   /* The original VkFormat provided by the client. This may not match any
    * of the actual surface formats.
    */
   VkFormat vk_format;
   uint32_t level_count;
   uint32_t layer_count;

   struct fdl_layout layout[3];
   uint32_t total_size;

#ifdef ANDROID
   /* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
   VkDeviceMemory owned_memory;
#endif

   /* Set when bound */
   struct tu_bo *bo;
   uint64_t iova;

   uint32_t lrz_height;
   uint32_t lrz_pitch;
   uint32_t lrz_offset;

   bool shareable;
};

static inline uint32_t
tu_get_layerCount(const struct tu_image *image,
                  const VkImageSubresourceRange *range)
{
   return range->layerCount == VK_REMAINING_ARRAY_LAYERS
             ? image->layer_count - range->baseArrayLayer
             : range->layerCount;
}

static inline uint32_t
tu_get_levelCount(const struct tu_image *image,
                  const VkImageSubresourceRange *range)
{
   return range->levelCount == VK_REMAINING_MIP_LEVELS
             ? image->level_count - range->baseMipLevel
             : range->levelCount;
}

enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane);

uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask);

enum pipe_format tu_format_for_aspect(enum pipe_format format,
                                      VkImageAspectFlags aspect_mask);

struct tu_image_view
{
   struct vk_object_base base;

   struct tu_image *image; /**< VkImageViewCreateInfo::image */

   struct fdl6_view view;

   /* for d32s8 separate stencil */
   uint64_t stencil_base_addr;
   uint32_t stencil_layer_size;
   uint32_t stencil_PITCH;
};

struct tu_sampler_ycbcr_conversion {
   struct vk_object_base base;

   VkFormat format;
   VkSamplerYcbcrModelConversion ycbcr_model;
   VkSamplerYcbcrRange ycbcr_range;
   VkComponentMapping components;
   VkChromaLocation chroma_offsets[2];
   VkFilter chroma_filter;
};

struct tu_sampler {
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
   struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};

void
tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);

void
tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src);

void
tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);

void
tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);

#define tu_image_view_stencil(iview, x) \
   ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
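
/* Usage sketch (illustrative): rewrite a packed register value so the
 * stencil plane of a D32S8 attachment is treated as R8_UINT, e.g.
 *
 *    tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
 *
 * which swaps only the COLOR_FORMAT field of iview->view.RB_MRT_BUF_INFO.
 */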

VkResult
tu_gralloc_info(struct tu_device *device,
                const VkNativeBufferANDROID *gralloc_info,
                int *dma_buf,
                uint64_t *modifier);

VkResult
tu_import_memory_from_gralloc_handle(VkDevice device_h,
                                     int dma_buf,
                                     const VkAllocationCallbacks *alloc,
                                     VkImage image_h);

void
tu_image_view_init(struct tu_image_view *iview,
                   const VkImageViewCreateInfo *pCreateInfo,
                   bool limited_z24s8);

bool
tiling_possible(VkFormat format);

bool
ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
              const struct fd_dev_info *info, VkSampleCountFlagBits samples);

struct tu_buffer_view
{
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_CONST_DWORDS];

   struct tu_buffer *buffer;
};
void
tu_buffer_view_init(struct tu_buffer_view *view,
                    struct tu_device *device,
                    const VkBufferViewCreateInfo *pCreateInfo);

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass);

struct tu_subpass_barrier {
   VkPipelineStageFlags src_stage_mask;
   VkPipelineStageFlags dst_stage_mask;
   VkAccessFlags src_access_mask;
   VkAccessFlags dst_access_mask;
   bool incoherent_ccu_color, incoherent_ccu_depth;
};

struct tu_subpass_attachment
{
   uint32_t attachment;

   /* For input attachments, true if it needs to be patched to refer to GMEM
    * in GMEM mode. This is false if it hasn't already been written as an
    * attachment.
    */
   bool patch_input_gmem;
};

struct tu_subpass
{
   uint32_t input_count;
   uint32_t color_count;
   uint32_t resolve_count;
   bool resolve_depth_stencil;

   bool feedback_loop_color;
   bool feedback_loop_ds;

   /* True if we must invalidate UCHE thanks to a feedback loop. */
   bool feedback_invalidate;

   /* In other words - framebuffer fetch support */
   bool raster_order_attachment_access;

   struct tu_subpass_attachment *input_attachments;
   struct tu_subpass_attachment *color_attachments;
   struct tu_subpass_attachment *resolve_attachments;
   struct tu_subpass_attachment depth_stencil_attachment;

   VkSampleCountFlagBits samples;

   uint32_t srgb_cntl;
   uint32_t multiview_mask;

   struct tu_subpass_barrier start_barrier;
};

struct tu_render_pass_attachment
{
   VkFormat format;
   uint32_t samples;
   uint32_t cpp;
   VkImageAspectFlags clear_mask;
   uint32_t clear_views;
   bool load;
   bool store;
   int32_t gmem_offset;
   /* for D32S8 separate stencil: */
   bool load_stencil;
   bool store_stencil;
   int32_t gmem_offset_stencil;
};

struct tu_render_pass
{
   struct vk_object_base base;

   uint32_t attachment_count;
   uint32_t subpass_count;
   uint32_t gmem_pixels;
   uint32_t tile_align_w;
   struct tu_subpass_attachment *subpass_attachments;
   struct tu_render_pass_attachment *attachments;
   struct tu_subpass_barrier end_barrier;
   struct tu_subpass subpasses[0];
};

#define PERF_CNTRS_REG 4

struct tu_perf_query_data
{
   uint32_t gid;      /* group-id */
   uint32_t cid;      /* countable-id within the group */
   uint32_t cntr_reg; /* counter register within the group */
   uint32_t pass;     /* pass index in which the countable can be requested */
   uint32_t app_idx;  /* index provided by apps */
};

struct tu_query_pool
{
   struct vk_object_base base;

   VkQueryType type;
   uint32_t stride;
   uint64_t size;
   uint32_t pipeline_statistics;
   struct tu_bo *bo;

   /* For performance query */
   const struct fd_perfcntr_group *perf_group;
   uint32_t perf_group_count;
   uint32_t counter_index_count;
   struct tu_perf_query_data perf_query_data[0];
};

uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);

void
tu_update_descriptor_sets(const struct tu_device *device,
                          VkDescriptorSet overrideSet,
                          uint32_t descriptorWriteCount,
                          const VkWriteDescriptorSet *pDescriptorWrites,
                          uint32_t descriptorCopyCount,
                          const VkCopyDescriptorSet *pDescriptorCopies);

void
tu_update_descriptor_set_with_template(
   const struct tu_device *device,
   struct tu_descriptor_set *set,
   VkDescriptorUpdateTemplate descriptorUpdateTemplate,
   const void *pData);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);
VkResult
tu_enumerate_devices(struct tu_instance *instance);

int
tu_device_get_gpu_timestamp(struct tu_device *dev,
                            uint64_t *ts);

int
tu_device_get_suspend_count(struct tu_device *dev,
                            uint64_t *suspend_count);

int
tu_drm_submitqueue_new(const struct tu_device *dev,
                       int priority,
                       uint32_t *queue_id);

void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);

int
tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2);

int
tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync);

VkResult
tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit);

void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);


VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy);

/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and clean all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;
   /* We have to know when timestamps are available,
    * this sync object indicates it.
    */
   struct tu_u_trace_syncobj *syncobj;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

#define TU_FROM_HANDLE(__tu_type, __name, __handle) \
   VK_FROM_HANDLE(__tu_type, __name, __handle)

VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
                               VK_OBJECT_TYPE_COMMAND_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
                               VK_OBJECT_TYPE_BUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
                               VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
                               VK_OBJECT_TYPE_DESCRIPTOR_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
                               VK_OBJECT_TYPE_DESCRIPTOR_SET)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
                               VkDescriptorSetLayout,
                               VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
                               VkDescriptorUpdateTemplate,
                               VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView,
                               VK_OBJECT_TYPE_IMAGE_VIEW);
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
                               VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
                               VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
                               VK_OBJECT_TYPE_PIPELINE_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
                               VK_OBJECT_TYPE_QUERY_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
                               VK_OBJECT_TYPE_RENDER_PASS)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
                               VK_OBJECT_TYPE_SAMPLER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
                               VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)

/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
#define tu_syncobj_from_handle(x) ((struct tu_syncobj *) (uintptr_t) (x))

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

#endif /* TU_PRIVATE_H */