1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  * DEALINGS IN THE SOFTWARE.
26  */
27 
28 #ifndef TU_PRIVATE_H
29 #define TU_PRIVATE_H
30 
31 #include <assert.h>
32 #include <pthread.h>
33 #include <stdbool.h>
34 #include <stdint.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #ifdef HAVE_VALGRIND
39 #include <memcheck.h>
40 #include <valgrind.h>
41 #define VG(x) x
42 #else
43 #define VG(x) ((void)0)
44 #endif
45 
46 #define MESA_LOG_TAG "TU"
47 
48 #include "c11/threads.h"
49 #include "util/rounding.h"
50 #include "util/bitscan.h"
51 #include "util/list.h"
52 #include "util/log.h"
53 #include "util/macros.h"
54 #include "util/sparse_array.h"
55 #include "util/u_atomic.h"
56 #include "util/u_dynarray.h"
57 #include "util/xmlconfig.h"
58 #include "util/perf/u_trace.h"
59 #include "vk_alloc.h"
60 #include "vk_debug_report.h"
61 #include "vk_device.h"
62 #include "vk_dispatch_table.h"
63 #include "vk_extensions.h"
64 #include "vk_instance.h"
65 #include "vk_log.h"
66 #include "vk_physical_device.h"
67 #include "vk_shader_module.h"
68 #include "wsi_common.h"
69 
70 #include "ir3/ir3_compiler.h"
71 #include "ir3/ir3_shader.h"
72 
73 #include "adreno_common.xml.h"
74 #include "adreno_pm4.xml.h"
75 #include "a6xx.xml.h"
76 #include "fdl/freedreno_layout.h"
77 #include "common/freedreno_dev_info.h"
78 #include "perfcntrs/freedreno_perfcntr.h"
79 
80 #include "tu_descriptor_set.h"
81 #include "tu_autotune.h"
82 #include "tu_util.h"
83 #include "tu_perfetto.h"
84 
85 /* Pre-declarations needed for WSI entrypoints */
86 struct wl_surface;
87 struct wl_display;
88 typedef struct xcb_connection_t xcb_connection_t;
89 typedef uint32_t xcb_visualid_t;
90 typedef uint32_t xcb_window_t;
91 
92 #include <vulkan/vk_android_native_buffer.h>
93 #include <vulkan/vk_icd.h>
94 #include <vulkan/vulkan.h>
95 
96 #include "tu_entrypoints.h"
97 
98 #include "vk_format.h"
99 #include "vk_image.h"
100 #include "vk_command_buffer.h"
101 #include "vk_command_pool.h"
102 #include "vk_queue.h"
103 #include "vk_object.h"
104 #include "vk_sync.h"
105 #include "vk_fence.h"
106 #include "vk_semaphore.h"
107 #include "vk_drm_syncobj.h"
108 #include "vk_sync_timeline.h"
109 
110 #define MAX_VBS 32
111 #define MAX_VERTEX_ATTRIBS 32
112 #define MAX_RTS 8
113 #define MAX_VSC_PIPES 32
114 #define MAX_VIEWPORTS 16
115 #define MAX_VIEWPORT_SIZE (1 << 14)
116 #define MAX_SCISSORS 16
117 #define MAX_DISCARD_RECTANGLES 4
118 #define MAX_PUSH_CONSTANTS_SIZE 128
119 #define MAX_PUSH_DESCRIPTORS 32
120 #define MAX_DYNAMIC_UNIFORM_BUFFERS 16
121 #define MAX_DYNAMIC_STORAGE_BUFFERS 8
122 #define MAX_DYNAMIC_BUFFERS_SIZE                                             \
123    (MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) *         \
124    A6XX_TEX_CONST_DWORDS
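
/* Worked out with the limits above and A6XX_TEX_CONST_DWORDS (16, defined
 * further down): (16 + 2 * 8) * 16 = 512 dwords of dynamic descriptor space
 * per bind point (see tu_descriptor_state::dynamic_descriptors below).
 */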
125 
126 #define TU_MAX_DRM_DEVICES 8
127 #define MAX_VIEWS 16
128 #define MAX_BIND_POINTS 2 /* compute + graphics */
129 /* The Qualcomm driver exposes 0x20000058 */
130 #define MAX_STORAGE_BUFFER_RANGE 0x20000000
131 /* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
132  * expose the same maximum range.
133  * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
134  * range might be higher.
135  */
136 #define MAX_UNIFORM_BUFFER_RANGE 0x10000
137 
138 #define A6XX_TEX_CONST_DWORDS 16
139 #define A6XX_TEX_SAMP_DWORDS 4
140 
141 #define COND(bool, val) ((bool) ? (val) : 0)
142 #define BIT(bit) (1u << (bit))
143 
144 /* Whenever we generate an error, pass it through this function. Useful for
145  * debugging, where we can break on it. Only call at error site, not when
146  * propagating errors. Might be useful to plug in a stack trace here.
147  */
148 
149 struct tu_instance;
150 
151 VkResult
152 __vk_startup_errorf(struct tu_instance *instance,
153                     VkResult error,
154                     bool force_print,
155                     const char *file,
156                     int line,
157                     const char *format,
158                     ...) PRINTFLIKE(6, 7);
159 
160 /* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
161  * build.
162  */
163 #define vk_startup_errorf(instance, error, format, ...) \
164    __vk_startup_errorf(instance, error, \
165                        instance->debug_flags & TU_DEBUG_STARTUP, \
166                        __FILE__, __LINE__, format, ##__VA_ARGS__)
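
/* Illustrative usage sketch (hypothetical caller and variables, not part of
 * this header): errors are reported at the site where they are generated so
 * they can be printed with TU_DEBUG=startup or broken on in a debugger.
 *
 *    if (drm_fd < 0)
 *       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
 *                                "failed to open device %s", path);
 */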
167 
168 void
169 __tu_finishme(const char *file, int line, const char *format, ...)
170    PRINTFLIKE(3, 4);
171 
172 /**
173  * Print a FINISHME message, including its source location.
174  */
175 #define tu_finishme(format, ...)                                             \
176    do {                                                                      \
177       static bool reported = false;                                          \
178       if (!reported) {                                                       \
179          __tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__);           \
180          reported = true;                                                    \
181       }                                                                      \
182    } while (0)
183 
184 #define tu_stub()                                                            \
185    do {                                                                      \
186       tu_finishme("stub %s", __func__);                                      \
187    } while (0)
188 
189 struct tu_memory_heap {
190    /* Standard bits passed on to the client */
191    VkDeviceSize      size;
192    VkMemoryHeapFlags flags;
193 
194    /** Copied from ANV:
195     *
196     * Driver-internal book-keeping.
197     *
198     * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
199     */
200    VkDeviceSize      used __attribute__ ((aligned (8)));
201 };
202 
203 uint64_t
204 tu_get_system_heap_size(void);
205 
206 struct tu_physical_device
207 {
208    struct vk_physical_device vk;
209 
210    struct tu_instance *instance;
211 
212    const char *name;
213    uint8_t driver_uuid[VK_UUID_SIZE];
214    uint8_t device_uuid[VK_UUID_SIZE];
215    uint8_t cache_uuid[VK_UUID_SIZE];
216 
217    struct wsi_device wsi_device;
218 
219    int local_fd;
220    bool has_local;
221    int64_t local_major;
222    int64_t local_minor;
223    int master_fd;
224    bool has_master;
225    int64_t master_major;
226    int64_t master_minor;
227 
228    uint32_t gmem_size;
229    uint64_t gmem_base;
230    uint32_t ccu_offset_gmem;
231    uint32_t ccu_offset_bypass;
232 
233    struct fd_dev_id dev_id;
234    const struct fd_dev_info *info;
235 
236    int msm_major_version;
237    int msm_minor_version;
238 
239    /* Address space and global fault count for this local_fd with DRM backend */
240    uint64_t fault_count;
241 
242    /* This is the driver's on-disk cache, used as a fallback as opposed to
243     * the pipeline cache defined by apps.
244     */
245    struct disk_cache *disk_cache;
246 
247    struct tu_memory_heap heap;
248 
249    struct vk_sync_type syncobj_type;
250    struct vk_sync_timeline_type timeline_type;
251    const struct vk_sync_type *sync_types[3];
252 };
253 
254 enum tu_debug_flags
255 {
256    TU_DEBUG_STARTUP = 1 << 0,
257    TU_DEBUG_NIR = 1 << 1,
258    TU_DEBUG_NOBIN = 1 << 3,
259    TU_DEBUG_SYSMEM = 1 << 4,
260    TU_DEBUG_FORCEBIN = 1 << 5,
261    TU_DEBUG_NOUBWC = 1 << 6,
262    TU_DEBUG_NOMULTIPOS = 1 << 7,
263    TU_DEBUG_NOLRZ = 1 << 8,
264    TU_DEBUG_PERFC = 1 << 9,
265    TU_DEBUG_FLUSHALL = 1 << 10,
266    TU_DEBUG_SYNCDRAW = 1 << 11,
267    TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
268    TU_DEBUG_GMEM = 1 << 13,
269    TU_DEBUG_RAST_ORDER = 1 << 14,
270 };
271 
272 struct tu_instance
273 {
274    struct vk_instance vk;
275 
276    uint32_t api_version;
277    int physical_device_count;
278    struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
279 
280    struct driOptionCache dri_options;
281    struct driOptionCache available_dri_options;
282 
283    enum tu_debug_flags debug_flags;
284 };
285 
286 VkResult
287 tu_wsi_init(struct tu_physical_device *physical_device);
288 void
289 tu_wsi_finish(struct tu_physical_device *physical_device);
290 
291 bool
292 tu_instance_extension_supported(const char *name);
293 uint32_t
294 tu_physical_device_api_version(struct tu_physical_device *dev);
295 bool
296 tu_physical_device_extension_supported(struct tu_physical_device *dev,
297                                        const char *name);
298 
299 struct cache_entry;
300 
301 struct tu_pipeline_cache
302 {
303    struct vk_object_base base;
304 
305    struct tu_device *device;
306    pthread_mutex_t mutex;
307 
308    uint32_t total_size;
309    uint32_t table_size;
310    uint32_t kernel_count;
311    struct cache_entry **hash_table;
312    bool modified;
313 
314    VkAllocationCallbacks alloc;
315 };
316 
317 struct tu_pipeline_key
318 {
319 };
320 
321 
322 /* queue types */
323 #define TU_QUEUE_GENERAL 0
324 
325 #define TU_MAX_QUEUE_FAMILIES 1
326 
327 /* Keep tu_syncobj until porting to common code for kgsl too */
328 #ifdef TU_USE_KGSL
329 struct tu_syncobj;
330 #endif
331 struct tu_u_trace_syncobj;
332 
333 /* Define the tu_timeline_sync type based on a drm syncobj as the point type
334  * for vk_sync_timeline. The handling logic is mostly copied from
335  * anv_bo_sync, since it can be used in a similar way to anv.
336  */
337 enum tu_timeline_sync_state {
338    /** Indicates that this is a new (or newly reset) fence */
339    TU_TIMELINE_SYNC_STATE_RESET,
340 
341    /** Indicates that this fence has been submitted to the GPU but is still
342     * (as far as we know) in use by the GPU.
343     */
344    TU_TIMELINE_SYNC_STATE_SUBMITTED,
345 
346    TU_TIMELINE_SYNC_STATE_SIGNALED,
347 };
348 
349 struct tu_timeline_sync {
350    struct vk_sync base;
351 
352    enum tu_timeline_sync_state state;
353    uint32_t syncobj;
354 };
355 
356 struct tu_queue
357 {
358    struct vk_queue vk;
359 
360    struct tu_device *device;
361 
362    uint32_t msm_queue_id;
363    int fence;
364 };
365 
366 struct tu_bo
367 {
368    uint32_t gem_handle;
369    uint64_t size;
370    uint64_t iova;
371    void *map;
372 
373 #ifndef TU_USE_KGSL
374    int32_t refcnt;
375    uint32_t bo_list_idx;
376 #endif
377 
378    bool implicit_sync : 1;
379 };
380 
381 enum global_shader {
382    GLOBAL_SH_VS_BLIT,
383    GLOBAL_SH_VS_CLEAR,
384    GLOBAL_SH_FS_BLIT,
385    GLOBAL_SH_FS_BLIT_ZSCALE,
386    GLOBAL_SH_FS_COPY_MS,
387    GLOBAL_SH_FS_CLEAR0,
388    GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
389    GLOBAL_SH_COUNT,
390 };
391 
392 #define TU_BORDER_COLOR_COUNT 4096
393 #define TU_BORDER_COLOR_BUILTIN 6
394 
395 #define TU_BLIT_SHADER_SIZE 1024
396 
397 /* This struct defines the layout of the global_bo */
398 struct tu6_global
399 {
400    /* clear/blit shaders */
401    uint32_t shaders[TU_BLIT_SHADER_SIZE];
402 
403    uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
404    uint32_t _pad0;
405    volatile uint32_t vsc_draw_overflow;
406    uint32_t _pad1;
407    volatile uint32_t vsc_prim_overflow;
408    uint32_t _pad2;
409    uint64_t predicate;
410 
411    /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, starts on a 32-byte boundary. */
412    struct {
413       uint32_t offset;
414       uint32_t pad[7];
415    } flush_base[4];
416 
417    ALIGN16 uint32_t cs_indirect_xyz[3];
418 
419    /* To know when renderpass stats for autotune are valid */
420    volatile uint32_t autotune_fence;
421 
422    /* note: larger global bo will be used for customBorderColors */
423    struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
424 };
425 #define gb_offset(member) offsetof(struct tu6_global, member)
426 #define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
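
/* Usage sketch: global_iova() resolves to a GPU address inside global_bo,
 * e.g. for the autotune fence defined above:
 *
 *    uint64_t fence_iova = global_iova(cmd, autotune_fence);
 *    // equivalent to:
 *    // cmd->device->global_bo->iova + offsetof(struct tu6_global, autotune_fence)
 */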
427 
428 /* extra space in vsc draw/prim streams */
429 #define VSC_PAD 0x40
430 
431 struct tu_device
432 {
433    struct vk_device vk;
434    struct tu_instance *instance;
435 
436    struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
437    int queue_count[TU_MAX_QUEUE_FAMILIES];
438 
439    struct tu_physical_device *physical_device;
440    int fd;
441 
442    struct ir3_compiler *compiler;
443 
444    /* Backup in-memory cache to be used if the app doesn't provide one */
445    struct tu_pipeline_cache *mem_cache;
446 
447 #define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
448 
449    /* Currently the kernel driver uses a 32-bit GPU address space, but it
450     * should be impossible to go beyond 48 bits.
451     */
452    struct {
453       struct tu_bo *bo;
454       mtx_t construct_mtx;
455       bool initialized;
456    } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
457 
458    struct tu_bo *global_bo;
459 
460    uint32_t implicit_sync_bo_count;
461 
462    /* the blob seems to always use 8K factor and 128K param sizes, copy them */
463 #define TU_TESS_FACTOR_SIZE (8 * 1024)
464 #define TU_TESS_PARAM_SIZE (128 * 1024)
465 #define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
466    /* Lazily allocated, protected by the device mutex. */
467    struct tu_bo *tess_bo;
468 
469    struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
470    uint64_t global_shader_va[GLOBAL_SH_COUNT];
471 
472    uint32_t vsc_draw_strm_pitch;
473    uint32_t vsc_prim_strm_pitch;
474    BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
475    mtx_t mutex;
476 
477    /* bo list for submits: */
478    struct drm_msm_gem_submit_bo *bo_list;
479    /* map bo handles to bo list index: */
480    uint32_t bo_count, bo_list_size;
481    mtx_t bo_mutex;
482    /* protects imported BOs creation/freeing */
483    struct u_rwlock dma_bo_lock;
484 
485    /* This array holds all our 'struct tu_bo' allocations. We use this
486     * so we can add a refcount to our BOs and check if a particular BO
487     * was already allocated in this device using its GEM handle. This is
488     * necessary to properly manage BO imports, because the kernel doesn't
489     * refcount the underlying BO memory.
490     *
491     * Specifically, when self-importing (i.e. importing a BO into the same
492     * device that created it), the kernel will give us the same BO handle
493     * for both BOs and we must only free it once when  both references are
494     * freed. Otherwise, if we are not self-importing, we get two different BO
495     * handles, and we want to free each one individually.
496     *
497     * The BOs in this map are all reference-counted via their refcnt field, and
498     * only self-imported BOs will ever have a refcnt > 1.
499     */
500    struct util_sparse_array bo_map;
501 
502    /* Command streams to set pass index to a scratch reg */
503    struct tu_cs *perfcntrs_pass_cs;
504    struct tu_cs_entry *perfcntrs_pass_cs_entries;
505 
506    /* Condition variable for timeline semaphore to notify waiters when a
507     * new submit is executed. */
508    pthread_cond_t timeline_cond;
509    pthread_mutex_t submit_mutex;
510 
511    struct tu_autotune autotune;
512 
513 #ifdef ANDROID
514    const void *gralloc;
515    enum {
516       TU_GRALLOC_UNKNOWN,
517       TU_GRALLOC_CROS,
518       TU_GRALLOC_OTHER,
519    } gralloc_type;
520 #endif
521 
522    uint32_t submit_count;
523 
524    struct u_trace_context trace_context;
525 
526    #ifdef HAVE_PERFETTO
527    struct tu_perfetto_state perfetto;
528    #endif
529 };
530 
531 void tu_init_clear_blit_shaders(struct tu_device *dev);
532 
533 void tu_destroy_clear_blit_shaders(struct tu_device *dev);
534 
535 VkResult
536 tu_device_submit_deferred_locked(struct tu_device *dev);
537 
538 VkResult
539 tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
540 
541 uint64_t
542 tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
543 
544 VkResult
545 tu_device_check_status(struct vk_device *vk_device);
546 
547 enum tu_bo_alloc_flags
548 {
549    TU_BO_ALLOC_NO_FLAGS = 0,
550    TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
551    TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
552 };
553 
554 VkResult
555 tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
556                enum tu_bo_alloc_flags flags);
557 VkResult
558 tu_bo_init_dmabuf(struct tu_device *dev,
559                   struct tu_bo **bo,
560                   uint64_t size,
561                   int fd);
562 int
563 tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
564 void
565 tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
566 VkResult
567 tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
568 
569 static inline struct tu_bo *
570 tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
571 {
572    return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
573 }
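
/* Simplified sketch of how bo_map is meant to be used on import (see the
 * comment on tu_device::bo_map above); "gem_handle" is a handle returned by
 * the kernel for a dma-buf import, and locking/error handling are omitted:
 *
 *    struct tu_bo *bo = tu_device_lookup_bo(device, gem_handle);
 *    if (bo->refcnt != 0) {
 *       // Self-import: the kernel returned a handle we already track, so
 *       // just take another reference instead of re-initializing the BO.
 *       p_atomic_inc(&bo->refcnt);
 *    }
 */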
574 
575 /* Get a scratch bo for use inside a command buffer. This will always return
576  * the same bo given the same size or similar sizes, so only one scratch bo
577  * can be used at the same time. It's meant for short-lived things where we
578  * need to write to some piece of memory, read from it, and then immediately
579  * discard it.
580  */
581 VkResult
582 tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
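
/* Illustrative use (hypothetical size): grab a scratch BO, emit commands that
 * write to and read from it, and don't rely on its contents after the next
 * tu_get_scratch_bo() call.
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, 4096, &scratch);
 *    if (result != VK_SUCCESS)
 *       return result;
 *    // ... emit packets that write to / read from scratch->iova ...
 */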
583 
584 struct tu_cs_entry
585 {
586    /* No ownership */
587    const struct tu_bo *bo;
588 
589    uint32_t size;
590    uint32_t offset;
591 };
592 
593 struct tu_cs_memory {
594    uint32_t *map;
595    uint64_t iova;
596 };
597 
598 struct tu_draw_state {
599    uint64_t iova : 48;
600    uint32_t size : 16;
601 };
602 
603 enum tu_dynamic_state
604 {
605    /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
606    TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
607    TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
608    TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
609    TU_DYNAMIC_STATE_VB_STRIDE,
610    TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
611    TU_DYNAMIC_STATE_COUNT,
612    /* no associated draw state: */
613    TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
614    TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
615    /* re-use the line width enum as it uses GRAS_SU_CNTL: */
616    TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
617 };
618 
619 enum tu_draw_state_group_id
620 {
621    TU_DRAW_STATE_PROGRAM_CONFIG,
622    TU_DRAW_STATE_PROGRAM,
623    TU_DRAW_STATE_PROGRAM_BINNING,
624    TU_DRAW_STATE_VB,
625    TU_DRAW_STATE_VI,
626    TU_DRAW_STATE_VI_BINNING,
627    TU_DRAW_STATE_RAST,
628    TU_DRAW_STATE_BLEND,
629    TU_DRAW_STATE_SHADER_GEOM_CONST,
630    TU_DRAW_STATE_FS_CONST,
631    TU_DRAW_STATE_DESC_SETS,
632    TU_DRAW_STATE_DESC_SETS_LOAD,
633    TU_DRAW_STATE_VS_PARAMS,
634    TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
635    TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
636    TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
637    TU_DRAW_STATE_PRIM_MODE_GMEM,
638    TU_DRAW_STATE_PRIM_MODE_SYSMEM,
639 
640    /* dynamic state related draw states */
641    TU_DRAW_STATE_DYNAMIC,
642    TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
643 };
644 
645 enum tu_cs_mode
646 {
647 
648    /*
649     * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
650     * is full.  tu_cs_begin must be called before command packet emission and
651     * tu_cs_end must be called after.
652     *
653     * This mode may create multiple entries internally.  The entries must be
654     * submitted together.
655     */
656    TU_CS_MODE_GROW,
657 
658    /*
659     * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
660     * fixed-size buffer.  tu_cs_begin and tu_cs_end are optional and have no
661     * effect on it.
662     *
663     * This mode does not create any entry or any BO.
664     */
665    TU_CS_MODE_EXTERNAL,
666 
667    /*
668     * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
669     * command packet emission.  tu_cs_begin_sub_stream must be called to get a
670     * sub-stream to emit command packets to.  When done with the sub-stream,
671     * tu_cs_end_sub_stream must be called.
672     *
673     * This mode does not create any entry internally.
674     */
675    TU_CS_MODE_SUB_STREAM,
676 };
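
/* Rough usage sketch of the two managed modes. The begin/end helpers named in
 * the comments above live in tu_cs.h; the argument lists here are approximate.
 *
 *    // TU_CS_MODE_GROW: emit directly, the stream grows as needed and the
 *    // resulting entries are submitted together.
 *    tu_cs_begin(&cmd->draw_cs);
 *    ... emit command packets ...
 *    tu_cs_end(&cmd->draw_cs);
 *
 *    // TU_CS_MODE_SUB_STREAM: request a sub-stream, emit into it, then end it
 *    // to obtain a tu_cs_entry describing what was written.
 *    struct tu_cs sub;
 *    tu_cs_begin_sub_stream(&cmd->sub_cs, size, &sub);
 *    ... emit command packets into &sub ...
 *    struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &sub);
 */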
677 
678 struct tu_cs
679 {
680    uint32_t *start;
681    uint32_t *cur;
682    uint32_t *reserved_end;
683    uint32_t *end;
684 
685    struct tu_device *device;
686    enum tu_cs_mode mode;
687    uint32_t next_bo_size;
688 
689    struct tu_cs_entry *entries;
690    uint32_t entry_count;
691    uint32_t entry_capacity;
692 
693    struct tu_bo **bos;
694    uint32_t bo_count;
695    uint32_t bo_capacity;
696 
697    /* state for cond_exec_start/cond_exec_end */
698    uint32_t cond_flags;
699    uint32_t *cond_dwords;
700 };
701 
702 struct tu_device_memory
703 {
704    struct vk_object_base base;
705 
706    struct tu_bo *bo;
707 };
708 
709 struct tu_descriptor_range
710 {
711    uint64_t va;
712    uint32_t size;
713 };
714 
715 struct tu_descriptor_set
716 {
717    struct vk_object_base base;
718 
719    /* Link to descriptor pool's desc_sets list. */
720    struct list_head pool_link;
721 
722    struct tu_descriptor_set_layout *layout;
723    struct tu_descriptor_pool *pool;
724    uint32_t size;
725 
726    uint64_t va;
727    uint32_t *mapped_ptr;
728 
729    uint32_t *dynamic_descriptors;
730 };
731 
732 struct tu_descriptor_pool_entry
733 {
734    uint32_t offset;
735    uint32_t size;
736    struct tu_descriptor_set *set;
737 };
738 
739 struct tu_descriptor_pool
740 {
741    struct vk_object_base base;
742 
743    struct tu_bo *bo;
744    uint64_t current_offset;
745    uint64_t size;
746 
747    uint8_t *host_memory_base;
748    uint8_t *host_memory_ptr;
749    uint8_t *host_memory_end;
750    uint8_t *host_bo;
751 
752    struct list_head desc_sets;
753 
754    uint32_t entry_count;
755    uint32_t max_entry_count;
756    struct tu_descriptor_pool_entry entries[0];
757 };
758 
759 struct tu_descriptor_update_template_entry
760 {
761    VkDescriptorType descriptor_type;
762 
763    /* The number of descriptors to update */
764    uint32_t descriptor_count;
765 
766    /* Into mapped_ptr or dynamic_descriptors, in units of the respective array
767     */
768    uint32_t dst_offset;
769 
770    /* In dwords. Not valid/used for dynamic descriptors */
771    uint32_t dst_stride;
772 
773    uint32_t buffer_offset;
774 
775    /* Only valid for combined image samplers and samplers */
776    uint16_t has_sampler;
777 
778    /* In bytes */
779    size_t src_offset;
780    size_t src_stride;
781 
782    /* For push descriptors */
783    const struct tu_sampler *immutable_samplers;
784 };
785 
786 struct tu_descriptor_update_template
787 {
788    struct vk_object_base base;
789 
790    uint32_t entry_count;
791    VkPipelineBindPoint bind_point;
792    struct tu_descriptor_update_template_entry entry[0];
793 };
794 
795 struct tu_buffer
796 {
797    struct vk_object_base base;
798 
799    VkDeviceSize size;
800 
801    VkBufferUsageFlags usage;
802    VkBufferCreateFlags flags;
803 
804    struct tu_bo *bo;
805    uint64_t iova;
806 };
807 
808 const char *
809 tu_get_debug_option_name(int id);
810 
811 const char *
812 tu_get_perftest_option_name(int id);
813 
814 struct tu_descriptor_state
815 {
816    struct tu_descriptor_set *sets[MAX_SETS];
817    struct tu_descriptor_set push_set;
818    uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
819 };
820 
821 enum tu_cmd_dirty_bits
822 {
823    TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
824    TU_CMD_DIRTY_VB_STRIDE = BIT(1),
825    TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
826    TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
827    TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
828    TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
829    TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
830    TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
831    TU_CMD_DIRTY_LRZ = BIT(8),
832    TU_CMD_DIRTY_VS_PARAMS = BIT(9),
833    TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
834    TU_CMD_DIRTY_VIEWPORTS = BIT(11),
835    /* all draw states were disabled and need to be re-enabled: */
836    TU_CMD_DIRTY_DRAW_STATE = BIT(12)
837 };
838 
839 /* There are only three cache domains we have to care about: the CCU, or
840  * color cache unit, which is used for color and depth/stencil attachments
841  * and copy/blit destinations, and is split conceptually into color and depth,
842  * and the universal cache or UCHE which is used for pretty much everything
843  * else, except for the CP (uncached) and host. We need to flush whenever data
844  * crosses these boundaries.
845  */
846 
847 enum tu_cmd_access_mask {
848    TU_ACCESS_UCHE_READ = 1 << 0,
849    TU_ACCESS_UCHE_WRITE = 1 << 1,
850    TU_ACCESS_CCU_COLOR_READ = 1 << 2,
851    TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
852    TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
853    TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
854 
855    /* Experiments have shown that while it's safe to avoid flushing the CCU
856     * after each blit/renderpass, it's not safe to assume that subsequent
857     * lookups with a different attachment state will hit unflushed cache
858     * entries. That is, the CCU needs to be flushed and possibly invalidated
859     * when accessing memory with a different attachment state. Writing to an
860     * attachment under the following conditions after clearing using the
861     * normal 2d engine path is known to have issues:
862     *
863     * - It isn't the 0'th layer.
864     * - There is more than one attachment, and this isn't the 0'th attachment
865     *   (this seems to also depend on the cpp of the attachments).
866     *
867     * Our best guess is that the layer/MRT state is used when computing
868     * the location of a cache entry in CCU, to avoid conflicts. We assume that
869     * any access in a renderpass after or before an access by a transfer needs
870     * a flush/invalidate, and use the _INCOHERENT variants to represent access
871     * by a renderpass.
872     */
873    TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
874    TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
875    TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
876    TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
877 
878    /* Accesses which bypass any cache, e.g. writes via the host,
879     * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
880     */
881    TU_ACCESS_SYSMEM_READ = 1 << 10,
882    TU_ACCESS_SYSMEM_WRITE = 1 << 11,
883 
884    /* Memory writes from the CP start in-order with draws and event writes,
885     * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
886     */
887    TU_ACCESS_CP_WRITE = 1 << 12,
888 
889    TU_ACCESS_READ =
890       TU_ACCESS_UCHE_READ |
891       TU_ACCESS_CCU_COLOR_READ |
892       TU_ACCESS_CCU_DEPTH_READ |
893       TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
894       TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
895       TU_ACCESS_SYSMEM_READ,
896 
897    TU_ACCESS_WRITE =
898       TU_ACCESS_UCHE_WRITE |
899       TU_ACCESS_CCU_COLOR_WRITE |
900       TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
901       TU_ACCESS_CCU_DEPTH_WRITE |
902       TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
903       TU_ACCESS_SYSMEM_WRITE |
904       TU_ACCESS_CP_WRITE,
905 
906    TU_ACCESS_ALL =
907       TU_ACCESS_READ |
908       TU_ACCESS_WRITE,
909 };
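
/* Rough examples of how the access bits above map onto GPU work (illustrative,
 * not an exhaustive table):
 *
 *  - color attachment write in a renderpass  -> TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE
 *  - 2d-engine blit/clear of a color surface -> TU_ACCESS_CCU_COLOR_WRITE
 *  - shader texture/UBO/SSBO access (UCHE)   -> TU_ACCESS_UCHE_READ / _WRITE
 *  - host writes or CP_EVENT_WRITE::BLIT     -> TU_ACCESS_SYSMEM_WRITE
 */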
910 
911 /* Starting with a6xx, the pipeline is split into several "clusters" (really
912  * pipeline stages). Each stage has its own pair of register banks and can
913  * switch them independently, so that earlier stages can run ahead of later
914  * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
915  * the same time.
916  *
917  * As a result of this, we need to insert a WFI when an earlier stage depends
918  * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
919  * pending WFI's to complete before starting, and usually before reading
920  * indirect params even, so a WFI also acts as a full "pipeline stall".
921  *
922  * Note, the names of the stages come from CLUSTER_* in devcoredump. We
923  * include all the stages for completeness, even ones which do not read/write
924  * anything.
925  */
926 
927 enum tu_stage {
928    /* This doesn't correspond to a cluster, but we need it for tracking
929     * indirect draw parameter reads etc.
930     */
931    TU_STAGE_CP,
932 
933    /* - Fetch index buffer
934     * - Fetch vertex attributes, dispatch VS
935     */
936    TU_STAGE_FE,
937 
938    /* Execute all geometry stages (VS thru GS) */
939    TU_STAGE_SP_VS,
940 
941    /* Write to VPC, do primitive assembly. */
942    TU_STAGE_PC_VS,
943 
944    /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
945     * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
946     * early depth testing is enabled before dispatching fragments? However
947     * GRAS reads and writes LRZ directly.
948     */
949    TU_STAGE_GRAS,
950 
951    /* Execute FS */
952    TU_STAGE_SP_PS,
953 
954    /* - Fragment tests
955     * - Write color/depth
956     * - Streamout writes (???)
957     * - Varying interpolation (???)
958     */
959    TU_STAGE_PS,
960 };
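
/* Illustrative example of the rule above (a sketch, not the real dependency
 * tracking): a WFI is needed when the destination stage is earlier in the
 * pipeline than the source stage it depends on, e.g.
 *
 *    enum tu_stage src = TU_STAGE_PS;  // buffer written by draw N's fragments
 *    enum tu_stage dst = TU_STAGE_CP;  // indirect parameters of draw N + 1
 *    bool needs_wfi = dst < src;       // earlier stage waits on a later one
 */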
961 
962 enum tu_cmd_flush_bits {
963    TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
964    TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
965    TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
966    TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
967    TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
968    TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
969    TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
970    TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
971    TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
972 
973    TU_CMD_FLAG_ALL_FLUSH =
974       TU_CMD_FLAG_CCU_FLUSH_DEPTH |
975       TU_CMD_FLAG_CCU_FLUSH_COLOR |
976       TU_CMD_FLAG_CACHE_FLUSH |
977       /* Treat the CP as a sort of "cache" which may need to be "flushed" via
978        * waiting for writes to land with WAIT_FOR_MEM_WRITES.
979        */
980       TU_CMD_FLAG_WAIT_MEM_WRITES,
981 
982    TU_CMD_FLAG_ALL_INVALIDATE =
983       TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
984       TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
985       TU_CMD_FLAG_CACHE_INVALIDATE,
986 };
987 
988 /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
989  * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
990  * which part of the gmem is used by the CCU. Here we keep track of the
991  * state of the CCU.
992  */
993 enum tu_cmd_ccu_state {
994    TU_CMD_CCU_SYSMEM,
995    TU_CMD_CCU_GMEM,
996    TU_CMD_CCU_UNKNOWN,
997 };
998 
999 struct tu_cache_state {
1000    /* Caches which must be made available (flushed) eventually if there are
1001     * any users outside that cache domain, and caches which must be
1002     * invalidated eventually if there are any reads.
1003     */
1004    enum tu_cmd_flush_bits pending_flush_bits;
1005    /* Pending flushes */
1006    enum tu_cmd_flush_bits flush_bits;
1007 };
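
/* Rough illustration of the intended flow (not the real barrier logic): a CCU
 * color write makes a flush "pending"; once something outside that domain
 * reads the data, the pending flush is promoted to flush_bits together with an
 * invalidate of the reader's cache:
 *
 *    cache->pending_flush_bits |= TU_CMD_FLAG_CCU_FLUSH_COLOR;      // on write
 *    ...
 *    // later, on a UCHE read of the same memory:
 *    cache->flush_bits |= (cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH) |
 *                         TU_CMD_FLAG_CACHE_INVALIDATE;
 *    cache->pending_flush_bits = 0;
 */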
1008 
1009 enum tu_lrz_force_disable_mask {
1010    TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
1011    TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
1012 };
1013 
1014 enum tu_lrz_direction {
1015    TU_LRZ_UNKNOWN,
1016    /* Depth func less/less-than: */
1017    TU_LRZ_LESS,
1018    /* Depth func greater/greater-than: */
1019    TU_LRZ_GREATER,
1020 };
1021 
1022 struct tu_lrz_pipeline
1023 {
1024    uint32_t force_disable_mask;
1025    bool fs_has_kill;
1026    bool force_late_z;
1027    bool early_fragment_tests;
1028 };
1029 
1030 struct tu_lrz_state
1031 {
1032    /* Depth/Stencil image currently in use to do LRZ */
1033    struct tu_image *image;
1034    bool valid : 1;
1035    enum tu_lrz_direction prev_direction;
1036 };
1037 
1038 struct tu_vs_params {
1039    uint32_t vertex_offset;
1040    uint32_t first_instance;
1041 };
1042 
1043 struct tu_cmd_state
1044 {
1045    uint32_t dirty;
1046 
1047    struct tu_pipeline *pipeline;
1048    struct tu_pipeline *compute_pipeline;
1049 
1050    /* Vertex buffers, viewports, and scissors:
1051     * the states for these can be updated partially, so we need to save them
1052     * to be able to emit a complete draw state.
1053     */
1054    struct {
1055       uint64_t base;
1056       uint32_t size;
1057       uint32_t stride;
1058    } vb[MAX_VBS];
1059    VkViewport viewport[MAX_VIEWPORTS];
1060    VkRect2D scissor[MAX_SCISSORS];
1061    uint32_t max_viewport, max_scissor;
1062 
1063    /* for dynamic states that can't be emitted directly */
1064    uint32_t dynamic_stencil_mask;
1065    uint32_t dynamic_stencil_wrmask;
1066    uint32_t dynamic_stencil_ref;
1067 
1068    uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
1069    uint32_t pc_raster_cntl, vpc_unknown_9107;
1070    enum pc_di_primtype primtype;
1071    bool primitive_restart_enable;
1072 
1073    /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
1074    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
1075    struct tu_draw_state vertex_buffers;
1076    struct tu_draw_state shader_const[2];
1077    struct tu_draw_state desc_sets;
1078 
1079    struct tu_draw_state vs_params;
1080 
1081    /* Index buffer */
1082    uint64_t index_va;
1083    uint32_t max_index_count;
1084    uint8_t index_size;
1085 
1086    /* because streamout base has to be 32-byte aligned
1087     * there is an extra offset to deal with when it is
1088     * unaligned
1089     */
1090    uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
1091 
1092    /* Renderpasses are tricky, because we may need to flush differently if
1093     * using sysmem vs. gmem and therefore we have to delay any flushing that
1094     * happens before a renderpass. So we have to have two copies of the flush
1095     * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
1096     * and one for outside a renderpass.
1097     */
1098    struct tu_cache_state cache;
1099    struct tu_cache_state renderpass_cache;
1100 
1101    enum tu_cmd_ccu_state ccu_state;
1102 
1103    const struct tu_render_pass *pass;
1104    const struct tu_subpass *subpass;
1105    const struct tu_framebuffer *framebuffer;
1106    VkRect2D render_area;
1107 
1108    const struct tu_image_view **attachments;
1109 
1110    bool xfb_used;
1111    bool has_tess;
1112    bool tessfactor_addr_set;
1113    bool has_subpass_predication;
1114    bool predication_active;
1115    bool disable_gmem;
1116    enum a5xx_line_mode line_mode;
1117    bool z_negative_one_to_one;
1118 
1119    uint32_t drawcall_count;
1120 
1121    /* A calculated "draw cost" value for renderpass, which tries to
1122     * estimate the bandwidth-per-sample of all the draws according
1123     * to:
1124     *
1125     *    foreach_draw (...) {
1126     *      cost += num_frag_outputs;
1127     *      if (blend_enabled)
1128     *        cost += num_blend_enabled;
1129     *      if (depth_test_enabled)
1130     *        cost++;
1131     *      if (depth_write_enabled)
1132     *        cost++;
1133     *    }
1134     *
1135     * The idea is that each sample-passed minimally does one write
1136     * per MRT.  If blend is enabled, the hw will additionally do
1137     * a framebuffer read per sample-passed (for each MRT with blend
1138     * enabled).  If depth-test is enabled, the hw will additionally
1139     * a depth buffer read.  If depth-write is enable, the hw will
1140     * additionally do a depth buffer write.
1141     *
1142     * This does ignore depth buffer traffic for samples which do not
1143     * pass do to depth-test fail, and some other details.  But it is
1144     * just intended to be a rough estimate that is easy to calculate.
1145     */
1146    uint32_t total_drawcalls_cost;
1147 
1148    struct tu_lrz_state lrz;
1149 
1150    struct tu_draw_state lrz_and_depth_plane_state;
1151 
1152    struct tu_vs_params last_vs_params;
1153 };
1154 
1155 struct tu_cmd_pool
1156 {
1157    struct vk_command_pool vk;
1158 
1159    struct list_head cmd_buffers;
1160    struct list_head free_cmd_buffers;
1161 };
1162 
1163 enum tu_cmd_buffer_status
1164 {
1165    TU_CMD_BUFFER_STATUS_INVALID,
1166    TU_CMD_BUFFER_STATUS_INITIAL,
1167    TU_CMD_BUFFER_STATUS_RECORDING,
1168    TU_CMD_BUFFER_STATUS_EXECUTABLE,
1169    TU_CMD_BUFFER_STATUS_PENDING,
1170 };
1171 
1172 struct tu_cmd_buffer
1173 {
1174    struct vk_command_buffer vk;
1175 
1176    struct tu_device *device;
1177 
1178    struct tu_cmd_pool *pool;
1179    struct list_head pool_link;
1180 
1181    struct u_trace trace;
1182    struct u_trace_iterator trace_renderpass_start;
1183    struct u_trace_iterator trace_renderpass_end;
1184 
1185    struct list_head renderpass_autotune_results;
1186    struct tu_autotune_results_buffer* autotune_buffer;
1187 
1188    VkCommandBufferUsageFlags usage_flags;
1189    enum tu_cmd_buffer_status status;
1190 
1191    struct tu_cmd_state state;
1192    uint32_t queue_family_index;
1193 
1194    uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
1195    VkShaderStageFlags push_constant_stages;
1196    struct tu_descriptor_set meta_push_descriptors;
1197 
1198    struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
1199 
1200    VkResult record_result;
1201 
1202    struct tu_cs cs;
1203    struct tu_cs draw_cs;
1204    struct tu_cs tile_store_cs;
1205    struct tu_cs draw_epilogue_cs;
1206    struct tu_cs sub_cs;
1207 
1208    uint32_t vsc_draw_strm_pitch;
1209    uint32_t vsc_prim_strm_pitch;
1210 };
1211 
1212 /* Temporary struct for tracking a register state to be written, used by
1213  * a6xx-pack.h and tu_cs_emit_regs()
1214  */
1215 struct tu_reg_value {
1216    uint32_t reg;
1217    uint64_t value;
1218    bool is_address;
1219    struct tu_bo *bo;
1220    bool bo_write;
1221    uint32_t bo_offset;
1222    uint32_t bo_shift;
1223 };
1224 
1225 
1226 void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
1227                                     struct tu_cs *cs);
1228 
1229 void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
1230                              struct tu_cs *cs,
1231                              enum tu_cmd_ccu_state ccu_state);
1232 
1233 void
1234 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
1235                      struct tu_cs *cs,
1236                      enum vgt_event_type event);
1237 
1238 static inline struct tu_descriptor_state *
1239 tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
1240                          VkPipelineBindPoint bind_point)
1241 {
1242    return &cmd_buffer->descriptors[bind_point];
1243 }
1244 
1245 struct tu_event
1246 {
1247    struct vk_object_base base;
1248    struct tu_bo *bo;
1249 };
1250 
1251 struct tu_push_constant_range
1252 {
1253    uint32_t lo;
1254    uint32_t count;
1255 };
1256 
1257 struct tu_shader
1258 {
1259    struct ir3_shader *ir3_shader;
1260 
1261    struct tu_push_constant_range push_consts;
1262    uint8_t active_desc_sets;
1263    bool multi_pos_output;
1264 };
1265 
1266 bool
1267 tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
1268                        struct tu_device *dev);
1269 
1270 nir_shader *
1271 tu_spirv_to_nir(struct tu_device *dev,
1272                 void *mem_ctx,
1273                 const VkPipelineShaderStageCreateInfo *stage_info,
1274                 gl_shader_stage stage);
1275 
1276 struct tu_shader *
1277 tu_shader_create(struct tu_device *dev,
1278                  nir_shader *nir,
1279                  const VkPipelineShaderStageCreateInfo *stage_info,
1280                  unsigned multiview_mask,
1281                  struct tu_pipeline_layout *layout,
1282                  const VkAllocationCallbacks *alloc);
1283 
1284 void
1285 tu_shader_destroy(struct tu_device *dev,
1286                   struct tu_shader *shader,
1287                   const VkAllocationCallbacks *alloc);
1288 
1289 struct tu_program_descriptor_linkage
1290 {
1291    struct ir3_const_state const_state;
1292 
1293    uint32_t constlen;
1294 
1295    struct tu_push_constant_range push_consts;
1296 };
1297 
1298 struct tu_pipeline_executable {
1299    gl_shader_stage stage;
1300 
1301    struct ir3_info stats;
1302    bool is_binning;
1303 
1304    char *nir_from_spirv;
1305    char *nir_final;
1306    char *disasm;
1307 };
1308 
1309 struct tu_pipeline
1310 {
1311    struct vk_object_base base;
1312 
1313    struct tu_cs cs;
1314 
1315    /* Separate BO for private memory since it should be GPU writable */
1316    struct tu_bo *pvtmem_bo;
1317 
1318    struct tu_pipeline_layout *layout;
1319 
1320    bool need_indirect_descriptor_sets;
1321    VkShaderStageFlags active_stages;
1322    uint32_t active_desc_sets;
1323 
1324    /* mask of enabled dynamic states
1325     * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
1326     */
1327    uint32_t dynamic_state_mask;
1328    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
1329 
1330    /* for dynamic states which use the same register: */
1331    uint32_t gras_su_cntl, gras_su_cntl_mask;
1332    uint32_t rb_depth_cntl, rb_depth_cntl_mask;
1333    uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
1334    uint32_t pc_raster_cntl, pc_raster_cntl_mask;
1335    uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
1336    uint32_t stencil_wrmask;
1337 
1338    bool rb_depth_cntl_disable;
1339 
1340    enum a5xx_line_mode line_mode;
1341 
1342    /* draw states for the pipeline */
1343    struct tu_draw_state load_state, rast_state, blend_state;
1344    struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;
1345 
1346    /* for vertex buffers state */
1347    uint32_t num_vbs;
1348 
1349    struct
1350    {
1351       struct tu_draw_state config_state;
1352       struct tu_draw_state state;
1353       struct tu_draw_state binning_state;
1354 
1355       struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
1356    } program;
1357 
1358    struct
1359    {
1360       struct tu_draw_state state;
1361       struct tu_draw_state binning_state;
1362    } vi;
1363 
1364    struct
1365    {
1366       enum pc_di_primtype primtype;
1367       bool primitive_restart;
1368    } ia;
1369 
1370    struct
1371    {
1372       uint32_t patch_type;
1373       uint32_t param_stride;
1374       bool upper_left_domain_origin;
1375    } tess;
1376 
1377    struct
1378    {
1379       uint32_t local_size[3];
1380       uint32_t subgroup_size;
1381    } compute;
1382 
1383    bool provoking_vertex_last;
1384 
1385    struct tu_lrz_pipeline lrz;
1386 
1387    /* In other words - framebuffer fetch support */
1388    bool raster_order_attachment_access;
1389    bool subpass_feedback_loop_ds;
1390 
1391    bool z_negative_one_to_one;
1392 
1393    /* Base drawcall cost for sysmem vs gmem autotuner */
1394    uint8_t drawcall_base_cost;
1395 
1396    void *executables_mem_ctx;
1397    /* tu_pipeline_executable */
1398    struct util_dynarray executables;
1399 };
1400 
1401 void
1402 tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
1403                   bool z_negative_one_to_one);
1404 
1405 void
1406 tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
1407 
1408 void
1409 tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
1410 
1411 void
1412 tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
1413 
1414 void
1415 tu6_emit_depth_bias(struct tu_cs *cs,
1416                     float constant_factor,
1417                     float clamp,
1418                     float slope_factor);
1419 
1420 void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
1421                    enum a5xx_line_mode line_mode);
1422 
1423 void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
1424 
1425 void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
1426 
1427 void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1428 
1429 void tu6_apply_depth_bounds_workaround(struct tu_device *device,
1430                                        uint32_t *rb_depth_cntl);
1431 
1432 struct tu_pvtmem_config {
1433    uint64_t iova;
1434    uint32_t per_fiber_size;
1435    uint32_t per_sp_size;
1436    bool per_wave;
1437 };
1438 
1439 void
1440 tu6_emit_xs_config(struct tu_cs *cs,
1441                    gl_shader_stage stage,
1442                    const struct ir3_shader_variant *xs);
1443 
1444 void
1445 tu6_emit_xs(struct tu_cs *cs,
1446             gl_shader_stage stage,
1447             const struct ir3_shader_variant *xs,
1448             const struct tu_pvtmem_config *pvtmem,
1449             uint64_t binary_iova);
1450 
1451 void
1452 tu6_emit_vpc(struct tu_cs *cs,
1453              const struct ir3_shader_variant *vs,
1454              const struct ir3_shader_variant *hs,
1455              const struct ir3_shader_variant *ds,
1456              const struct ir3_shader_variant *gs,
1457              const struct ir3_shader_variant *fs,
1458              uint32_t patch_control_points);
1459 
1460 void
1461 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
1462 
1463 struct tu_image_view;
1464 
1465 void
1466 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1467                   struct tu_cs *cs,
1468                   const struct tu_image_view *src,
1469                   const struct tu_image_view *dst,
1470                   uint32_t layer_mask,
1471                   uint32_t layers,
1472                   const VkRect2D *rect);
1473 
1474 void
1475 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
1476                            struct tu_cs *cs,
1477                            uint32_t a,
1478                            const VkRenderPassBeginInfo *info);
1479 
1480 void
1481 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
1482                          struct tu_cs *cs,
1483                          uint32_t a,
1484                          const VkRenderPassBeginInfo *info);
1485 
1486 void
1487 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
1488                         struct tu_cs *cs,
1489                         uint32_t a,
1490                         bool force_load);
1491 
1492 /* expose this function to be able to emit load without checking LOAD_OP */
1493 void
1494 tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);
1495 
1496 /* note: gmem store can also resolve */
1497 void
1498 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
1499                          struct tu_cs *cs,
1500                          uint32_t a,
1501                          uint32_t gmem_a);
1502 
1503 enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
1504 
1505 struct tu_native_format
1506 {
1507    enum a6xx_format fmt : 8;
1508    enum a3xx_color_swap swap : 8;
1509    enum a6xx_tile_mode tile_mode : 8;
1510 };
1511 
1512 enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
1513 bool tu6_format_vtx_supported(VkFormat format);
1514 struct tu_native_format tu6_format_vtx(VkFormat format);
1515 bool tu6_format_color_supported(enum pipe_format format);
1516 struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode);
1517 bool tu6_format_texture_supported(enum pipe_format format);
1518 struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode);
1519 
1520 static inline enum a6xx_format
1521 tu6_base_format(enum pipe_format format)
1522 {
1523    /* note: tu6_format_color doesn't care about tiling for .fmt field */
1524    return tu6_format_color(format, TILE6_LINEAR).fmt;
1525 }
1526 
1527 struct tu_image
1528 {
1529    struct vk_object_base base;
1530 
1531    /* The original VkFormat provided by the client.  This may not match any
1532     * of the actual surface formats.
1533     */
1534    VkFormat vk_format;
1535    uint32_t level_count;
1536    uint32_t layer_count;
1537 
1538    struct fdl_layout layout[3];
1539    uint32_t total_size;
1540 
1541 #ifdef ANDROID
1542    /* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
1543    VkDeviceMemory owned_memory;
1544 #endif
1545 
1546    /* Set when bound */
1547    struct tu_bo *bo;
1548    uint64_t iova;
1549 
1550    uint32_t lrz_height;
1551    uint32_t lrz_pitch;
1552    uint32_t lrz_offset;
1553 
1554    bool shareable;
1555 };
1556 
1557 static inline uint32_t
1558 tu_get_layerCount(const struct tu_image *image,
1559                   const VkImageSubresourceRange *range)
1560 {
1561    return range->layerCount == VK_REMAINING_ARRAY_LAYERS
1562              ? image->layer_count - range->baseArrayLayer
1563              : range->layerCount;
1564 }
1565 
1566 static inline uint32_t
1567 tu_get_levelCount(const struct tu_image *image,
1568                   const VkImageSubresourceRange *range)
1569 {
1570    return range->levelCount == VK_REMAINING_MIP_LEVELS
1571              ? image->level_count - range->baseMipLevel
1572              : range->levelCount;
1573 }
1574 
1575 enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane);
1576 
1577 uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask);
1578 
1579 enum pipe_format tu_format_for_aspect(enum pipe_format format,
1580                                       VkImageAspectFlags aspect_mask);
1581 
1582 struct tu_image_view
1583 {
1584    struct vk_object_base base;
1585 
1586    struct tu_image *image; /**< VkImageViewCreateInfo::image */
1587 
1588    struct fdl6_view view;
1589 
1590    /* for d32s8 separate stencil */
1591    uint64_t stencil_base_addr;
1592    uint32_t stencil_layer_size;
1593    uint32_t stencil_PITCH;
1594 };
1595 
1596 struct tu_sampler_ycbcr_conversion {
1597    struct vk_object_base base;
1598 
1599    VkFormat format;
1600    VkSamplerYcbcrModelConversion ycbcr_model;
1601    VkSamplerYcbcrRange ycbcr_range;
1602    VkComponentMapping components;
1603    VkChromaLocation chroma_offsets[2];
1604    VkFilter chroma_filter;
1605 };
1606 
1607 struct tu_sampler {
1608    struct vk_object_base base;
1609 
1610    uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
1611    struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
1612 };
1613 
1614 void
1615 tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
1616 
1617 void
1618 tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src);
1619 
1620 void
1621 tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
1622 
1623 void
1624 tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1625 
1626 #define tu_image_view_stencil(iview, x) \
1627    ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
1628 
1629 VkResult
1630 tu_gralloc_info(struct tu_device *device,
1631                 const VkNativeBufferANDROID *gralloc_info,
1632                 int *dma_buf,
1633                 uint64_t *modifier);
1634 
1635 VkResult
1636 tu_import_memory_from_gralloc_handle(VkDevice device_h,
1637                                      int dma_buf,
1638                                      const VkAllocationCallbacks *alloc,
1639                                      VkImage image_h);
1640 
1641 void
1642 tu_image_view_init(struct tu_image_view *iview,
1643                    const VkImageViewCreateInfo *pCreateInfo,
1644                    bool limited_z24s8);
1645 
1646 bool
1647 tiling_possible(VkFormat format);
1648 
1649 bool
1650 ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
1651               const struct fd_dev_info *info, VkSampleCountFlagBits samples);
1652 
1653 struct tu_buffer_view
1654 {
1655    struct vk_object_base base;
1656 
1657    uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
1658 
1659    struct tu_buffer *buffer;
1660 };
1661 void
1662 tu_buffer_view_init(struct tu_buffer_view *view,
1663                     struct tu_device *device,
1664                     const VkBufferViewCreateInfo *pCreateInfo);
1665 
1666 struct tu_attachment_info
1667 {
1668    struct tu_image_view *attachment;
1669 };
1670 
1671 struct tu_framebuffer
1672 {
1673    struct vk_object_base base;
1674 
1675    uint32_t width;
1676    uint32_t height;
1677    uint32_t layers;
1678 
1679    /* size of the first tile */
1680    VkExtent2D tile0;
1681    /* number of tiles */
1682    VkExtent2D tile_count;
1683 
1684    /* size of the first VSC pipe */
1685    VkExtent2D pipe0;
1686    /* number of VSC pipes */
1687    VkExtent2D pipe_count;
1688 
1689    /* pipe register values */
1690    uint32_t pipe_config[MAX_VSC_PIPES];
1691    uint32_t pipe_sizes[MAX_VSC_PIPES];
1692 
1693    uint32_t attachment_count;
1694    struct tu_attachment_info attachments[0];
1695 };
1696 
1697 void
1698 tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
1699                              const struct tu_device *device,
1700                              const struct tu_render_pass *pass);
1701 
1702 struct tu_subpass_barrier {
1703    VkPipelineStageFlags src_stage_mask;
1704    VkPipelineStageFlags dst_stage_mask;
1705    VkAccessFlags src_access_mask;
1706    VkAccessFlags dst_access_mask;
1707    bool incoherent_ccu_color, incoherent_ccu_depth;
1708 };
1709 
1710 struct tu_subpass_attachment
1711 {
1712    uint32_t attachment;
1713 
1714    /* For input attachments, true if it needs to be patched to refer to GMEM
1715     * in GMEM mode. This is false if it hasn't already been written as an
1716     * attachment.
1717     */
1718    bool patch_input_gmem;
1719 };
1720 
1721 struct tu_subpass
1722 {
1723    uint32_t input_count;
1724    uint32_t color_count;
1725    uint32_t resolve_count;
1726    bool resolve_depth_stencil;
1727 
1728    bool feedback_loop_color;
1729    bool feedback_loop_ds;
1730 
1731    /* True if we must invalidate UCHE thanks to a feedback loop. */
1732    bool feedback_invalidate;
1733 
1734    /* In other words - framebuffer fetch support */
1735    bool raster_order_attachment_access;
1736 
1737    struct tu_subpass_attachment *input_attachments;
1738    struct tu_subpass_attachment *color_attachments;
1739    struct tu_subpass_attachment *resolve_attachments;
1740    struct tu_subpass_attachment depth_stencil_attachment;
1741 
1742    VkSampleCountFlagBits samples;
1743 
1744    uint32_t srgb_cntl;
1745    uint32_t multiview_mask;
1746 
1747    struct tu_subpass_barrier start_barrier;
1748 };
1749 
1750 struct tu_render_pass_attachment
1751 {
1752    VkFormat format;
1753    uint32_t samples;
1754    uint32_t cpp;
1755    VkImageAspectFlags clear_mask;
1756    uint32_t clear_views;
1757    bool load;
1758    bool store;
1759    int32_t gmem_offset;
1760    /* for D32S8 separate stencil: */
1761    bool load_stencil;
1762    bool store_stencil;
1763    int32_t gmem_offset_stencil;
1764 };
1765 
struct tu_render_pass
{
   struct vk_object_base base;

   uint32_t attachment_count;
   uint32_t subpass_count;
   uint32_t gmem_pixels;
   uint32_t tile_align_w;
   struct tu_subpass_attachment *subpass_attachments;
   struct tu_render_pass_attachment *attachments;
   struct tu_subpass_barrier end_barrier;
   struct tu_subpass subpasses[0];
};

#define PERF_CNTRS_REG 4

struct tu_perf_query_data
{
   uint32_t gid;      /* group-id */
   uint32_t cid;      /* countable-id within the group */
   uint32_t cntr_reg; /* counter register within the group */
   uint32_t pass;     /* pass index in which this countable is requested */
   uint32_t app_idx;  /* index provided by the app */
};

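/* Query pool; `bo` backs the per-query result/availability slots, each
 * `stride` bytes apart.  For VK_KHR_performance_query pools the trailing
 * array presumably holds one tu_perf_query_data entry per selected counter.
 */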
struct tu_query_pool
{
   struct vk_object_base base;

   VkQueryType type;
   uint32_t stride;
   uint64_t size;
   uint32_t pipeline_statistics;
   struct tu_bo *bo;

   /* For performance query */
   const struct fd_perfcntr_group *perf_group;
   uint32_t perf_group_count;
   uint32_t counter_index_count;
   struct tu_perf_query_data perf_query_data[0];
};

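/* Presumably maps a resolve-attachment slot to the attachment it resolves:
 * color attachment `index`, or the depth/stencil attachment when
 * subpass->resolve_depth_stencil is set and `index` is the last slot.
 */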
uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);

void
tu_update_descriptor_sets(const struct tu_device *device,
                          VkDescriptorSet overrideSet,
                          uint32_t descriptorWriteCount,
                          const VkWriteDescriptorSet *pDescriptorWrites,
                          uint32_t descriptorCopyCount,
                          const VkCopyDescriptorSet *pDescriptorCopies);

void
tu_update_descriptor_set_with_template(
   const struct tu_device *device,
   struct tu_descriptor_set *set,
   VkDescriptorUpdateTemplate descriptorUpdateTemplate,
   const void *pData);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);
VkResult
tu_enumerate_devices(struct tu_instance *instance);

int
tu_device_get_gpu_timestamp(struct tu_device *dev,
                            uint64_t *ts);

int
tu_device_get_suspend_count(struct tu_device *dev,
                            uint64_t *suspend_count);

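/* Creates a kernel submit queue with the given priority and stores its id in
 * *queue_id; presumably returns 0 on success and a negative error code
 * otherwise, like the other DRM helpers here.
 */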
int
tu_drm_submitqueue_new(const struct tu_device *dev,
                       int priority,
                       uint32_t *queue_id);

void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);

int
tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2);

int
tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync);

VkResult
tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit);

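/* u_trace callback that emits commands into `cmdstream` copying `count`
 * timestamp entries from one timestamp buffer to another; presumably used
 * together with tu_create_copy_timestamp_cs() so that traces from
 * re-submittable command buffers get their own snapshot of the timestamps.
 */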
void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);

VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy);

/* If we copy the trace and timestamps, we will have to free them afterwards. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and to clean up all associated
 * resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;
   /* We have to know when the timestamps are available; this sync object
    * signals that.
    */
   struct tu_u_trace_syncobj *syncobj;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

#define TU_FROM_HANDLE(__tu_type, __name, __handle)                          \
   VK_FROM_HANDLE(__tu_type, __name, __handle)
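
/* Typical usage in an entrypoint (illustrative sketch; tu_FooBar is not a
 * real entrypoint):
 *
 *    VKAPI_ATTR void VKAPI_CALL
 *    tu_FooBar(VkDevice _device, VkBuffer _buffer)
 *    {
 *       TU_FROM_HANDLE(tu_device, device, _device);
 *       TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
 *       ...
 *    }
 */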

VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
                               VK_OBJECT_TYPE_COMMAND_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
                               VK_OBJECT_TYPE_BUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
                               VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
                               VK_OBJECT_TYPE_DESCRIPTOR_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
                               VK_OBJECT_TYPE_DESCRIPTOR_SET)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
                               VkDescriptorSetLayout,
                               VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
                               VkDescriptorUpdateTemplate,
                               VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView,
                               VK_OBJECT_TYPE_IMAGE_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
                               VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
                               VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
                               VK_OBJECT_TYPE_PIPELINE_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
                               VK_OBJECT_TYPE_QUERY_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
                               VK_OBJECT_TYPE_RENDER_PASS)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
                               VK_OBJECT_TYPE_SAMPLER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
                               VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)

/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
#define tu_syncobj_from_handle(x) ((struct tu_syncobj *) (uintptr_t) (x))

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

#endif /* TU_PRIVATE_H */