1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  * DEALINGS IN THE SOFTWARE.
26  */
27 
28 #ifndef TU_PRIVATE_H
29 #define TU_PRIVATE_H
30 
31 #include <assert.h>
32 #include <pthread.h>
33 #include <stdbool.h>
34 #include <stdint.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #ifdef HAVE_VALGRIND
39 #include <memcheck.h>
40 #include <valgrind.h>
41 #define VG(x) x
42 #else
43 #define VG(x) ((void)0)
44 #endif
45 
46 #define MESA_LOG_TAG "TU"
47 
48 #include "c11/threads.h"
49 #include "main/macros.h"
50 #include "util/bitscan.h"
51 #include "util/list.h"
52 #include "util/log.h"
53 #include "util/macros.h"
54 #include "util/u_atomic.h"
55 #include "util/u_dynarray.h"
56 #include "util/perf/u_trace.h"
57 #include "vk_alloc.h"
58 #include "vk_debug_report.h"
59 #include "vk_device.h"
60 #include "vk_dispatch_table.h"
61 #include "vk_extensions.h"
62 #include "vk_instance.h"
63 #include "vk_log.h"
64 #include "vk_physical_device.h"
65 #include "vk_shader_module.h"
66 #include "wsi_common.h"
67 
68 #include "ir3/ir3_compiler.h"
69 #include "ir3/ir3_shader.h"
70 
71 #include "adreno_common.xml.h"
72 #include "adreno_pm4.xml.h"
73 #include "a6xx.xml.h"
74 #include "fdl/freedreno_layout.h"
75 #include "common/freedreno_dev_info.h"
76 #include "perfcntrs/freedreno_perfcntr.h"
77 
78 #include "tu_descriptor_set.h"
79 #include "tu_util.h"
80 #include "tu_perfetto.h"
81 
82 /* Pre-declarations needed for WSI entrypoints */
83 struct wl_surface;
84 struct wl_display;
85 typedef struct xcb_connection_t xcb_connection_t;
86 typedef uint32_t xcb_visualid_t;
87 typedef uint32_t xcb_window_t;
88 
89 #include <vulkan/vk_android_native_buffer.h>
90 #include <vulkan/vk_icd.h>
91 #include <vulkan/vulkan.h>
92 
93 #include "tu_entrypoints.h"
94 
95 #include "vk_format.h"
96 #include "vk_command_buffer.h"
97 #include "vk_queue.h"
98 
99 #define MAX_VBS 32
100 #define MAX_VERTEX_ATTRIBS 32
101 #define MAX_RTS 8
102 #define MAX_VSC_PIPES 32
103 #define MAX_VIEWPORTS 16
104 #define MAX_VIEWPORT_SIZE (1 << 14)
105 #define MAX_SCISSORS 16
106 #define MAX_DISCARD_RECTANGLES 4
107 #define MAX_PUSH_CONSTANTS_SIZE 128
108 #define MAX_PUSH_DESCRIPTORS 32
109 #define MAX_DYNAMIC_UNIFORM_BUFFERS 16
110 #define MAX_DYNAMIC_STORAGE_BUFFERS 8
111 #define MAX_DYNAMIC_BUFFERS                                                  \
112    (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS)
113 #define TU_MAX_DRM_DEVICES 8
114 #define MAX_VIEWS 16
115 #define MAX_BIND_POINTS 2 /* compute + graphics */
116 /* The Qualcomm driver exposes 0x20000058 */
117 #define MAX_STORAGE_BUFFER_RANGE 0x20000000
118 /* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
119  * expose the same maximum range.
120  * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
121  * range might be higher.
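 * (A 15-bit count of 4-dword units would allow up to 32768 * 16 = 0x80000
 * bytes.)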
122  */
123 #define MAX_UNIFORM_BUFFER_RANGE 0x10000
124 
125 #define A6XX_TEX_CONST_DWORDS 16
126 #define A6XX_TEX_SAMP_DWORDS 4
127 
128 #define COND(bool, val) ((bool) ? (val) : 0)
129 #define BIT(bit) (1u << (bit))
130 
131 /* Whenever we generate an error, pass it through this function. Useful for
132  * debugging, where we can break on it. Only call at error site, not when
133  * propagating errors. Might be useful to plug in a stack trace here.
134  */
135 
136 struct tu_instance;
137 
138 VkResult
139 __vk_startup_errorf(struct tu_instance *instance,
140                     VkResult error,
141                     bool force_print,
142                     const char *file,
143                     int line,
144                     const char *format,
145                     ...) PRINTFLIKE(6, 7);
146 
147 /* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
148  * build.
149  */
150 #define vk_startup_errorf(instance, error, format, ...) \
151    __vk_startup_errorf(instance, error, \
152                        instance->debug_flags & TU_DEBUG_STARTUP, \
153                        __FILE__, __LINE__, format, ##__VA_ARGS__)
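
/* A hypothetical call site (illustrative only; 'path' is not a real
 * variable in this header):
 *
 *    return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
 *                             "failed to open device %s", path);
 */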
154 
155 void
156 __tu_finishme(const char *file, int line, const char *format, ...)
157    PRINTFLIKE(3, 4);
158 
159 /**
160  * Print a FINISHME message, including its source location.
161  */
162 #define tu_finishme(format, ...)                                             \
163    do {                                                                      \
164       static bool reported = false;                                          \
165       if (!reported) {                                                       \
166          __tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__);           \
167          reported = true;                                                    \
168       }                                                                      \
169    } while (0)
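
/* Illustrative: tu_finishme("unsupported feature %u", bit) logs once per
 * call site, thanks to the static 'reported' flag above.
 */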
170 
171 #define tu_stub()                                                            \
172    do {                                                                      \
173       tu_finishme("stub %s", __func__);                                      \
174    } while (0)
175 
176 struct tu_memory_heap {
177    /* Standard bits passed on to the client */
178    VkDeviceSize      size;
179    VkMemoryHeapFlags flags;
180 
181    /** Copied from ANV:
182     *
183     * Driver-internal book-keeping.
184     *
185     * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
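    * (Allocation paths would typically update it with something like
    *  p_atomic_add(&heap->used, size).)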
186     */
187    VkDeviceSize      used __attribute__ ((aligned (8)));
188 };
189 
190 uint64_t
191 tu_get_system_heap_size(void);
192 
193 struct tu_physical_device
194 {
195    struct vk_physical_device vk;
196 
197    struct tu_instance *instance;
198 
199    const char *name;
200    uint8_t driver_uuid[VK_UUID_SIZE];
201    uint8_t device_uuid[VK_UUID_SIZE];
202    uint8_t cache_uuid[VK_UUID_SIZE];
203 
204    struct wsi_device wsi_device;
205 
206    int local_fd;
207    int master_fd;
208 
209    uint32_t gmem_size;
210    uint64_t gmem_base;
211    uint32_t ccu_offset_gmem;
212    uint32_t ccu_offset_bypass;
213 
214    struct fd_dev_id dev_id;
215    const struct fd_dev_info *info;
216 
217    int msm_major_version;
218    int msm_minor_version;
219 
220    /* This is the driver's on-disk cache, used as a fallback as opposed to
221     * the pipeline cache defined by apps.
222     */
223    struct disk_cache *disk_cache;
224 
225    struct tu_memory_heap heap;
226 };
227 
228 enum tu_debug_flags
229 {
230    TU_DEBUG_STARTUP = 1 << 0,
231    TU_DEBUG_NIR = 1 << 1,
232    TU_DEBUG_NOBIN = 1 << 3,
233    TU_DEBUG_SYSMEM = 1 << 4,
234    TU_DEBUG_FORCEBIN = 1 << 5,
235    TU_DEBUG_NOUBWC = 1 << 6,
236    TU_DEBUG_NOMULTIPOS = 1 << 7,
237    TU_DEBUG_NOLRZ = 1 << 8,
238    TU_DEBUG_PERFC = 1 << 9,
239    TU_DEBUG_FLUSHALL = 1 << 10,
240    TU_DEBUG_SYNCDRAW = 1 << 11,
241 };
242 
243 struct tu_instance
244 {
245    struct vk_instance vk;
246 
247    uint32_t api_version;
248    int physical_device_count;
249    struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
250 
251    enum tu_debug_flags debug_flags;
252 };
253 
254 VkResult
255 tu_wsi_init(struct tu_physical_device *physical_device);
256 void
257 tu_wsi_finish(struct tu_physical_device *physical_device);
258 
259 bool
260 tu_instance_extension_supported(const char *name);
261 uint32_t
262 tu_physical_device_api_version(struct tu_physical_device *dev);
263 bool
264 tu_physical_device_extension_supported(struct tu_physical_device *dev,
265                                        const char *name);
266 
267 struct cache_entry;
268 
269 struct tu_pipeline_cache
270 {
271    struct vk_object_base base;
272 
273    struct tu_device *device;
274    pthread_mutex_t mutex;
275 
276    uint32_t total_size;
277    uint32_t table_size;
278    uint32_t kernel_count;
279    struct cache_entry **hash_table;
280    bool modified;
281 
282    VkAllocationCallbacks alloc;
283 };
284 
285 struct tu_pipeline_key
286 {
287 };
288 
289 
290 /* queue types */
291 #define TU_QUEUE_GENERAL 0
292 
293 #define TU_MAX_QUEUE_FAMILIES 1
294 
295 struct tu_syncobj;
296 struct tu_u_trace_syncobj;
297 
298 struct tu_queue
299 {
300    struct vk_queue vk;
301 
302    struct tu_device *device;
303 
304    uint32_t msm_queue_id;
305    int fence;
306 
307    /* Queue containing deferred submits */
308    struct list_head queued_submits;
309 };
310 
311 struct tu_bo
312 {
313    uint32_t gem_handle;
314    uint64_t size;
315    uint64_t iova;
316    void *map;
317 };
318 
319 enum global_shader {
320    GLOBAL_SH_VS_BLIT,
321    GLOBAL_SH_VS_CLEAR,
322    GLOBAL_SH_FS_BLIT,
323    GLOBAL_SH_FS_BLIT_ZSCALE,
324    GLOBAL_SH_FS_COPY_MS,
325    GLOBAL_SH_FS_CLEAR0,
326    GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
327    GLOBAL_SH_COUNT,
328 };
329 
330 #define TU_BORDER_COLOR_COUNT 4096
331 #define TU_BORDER_COLOR_BUILTIN 6
332 
333 #define TU_BLIT_SHADER_SIZE 1024
334 
335 /* This struct defines the layout of the global_bo */
336 struct tu6_global
337 {
338    /* clear/blit shaders */
339    uint32_t shaders[TU_BLIT_SHADER_SIZE];
340 
341    uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
342    uint32_t _pad0;
343    volatile uint32_t vsc_draw_overflow;
344    uint32_t _pad1;
345    volatile uint32_t vsc_prim_overflow;
346    uint32_t _pad2;
347    uint64_t predicate;
348 
349    /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, starts on a 32-byte boundary. */
350    struct {
351       uint32_t offset;
352       uint32_t pad[7];
353    } flush_base[4];
354 
355    ALIGN16 uint32_t cs_indirect_xyz[3];
356 
357    /* note: larger global bo will be used for customBorderColors */
358    struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
359 };
360 #define gb_offset(member) offsetof(struct tu6_global, member)
361 #define global_iova(cmd, member) ((cmd)->device->global_bo.iova + gb_offset(member))
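
/* Illustrative use of the macros above: the GPU address of the dummy seqno
 * used for CP_EVENT_WRITE would be
 *
 *    uint64_t seqno_iova = global_iova(cmd, seqno_dummy);
 *
 * which expands to cmd->device->global_bo.iova +
 * offsetof(struct tu6_global, seqno_dummy).
 */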
362 
363 /* extra space in vsc draw/prim streams */
364 #define VSC_PAD 0x40
365 
366 struct tu_device
367 {
368    struct vk_device vk;
369    struct tu_instance *instance;
370 
371    struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
372    int queue_count[TU_MAX_QUEUE_FAMILIES];
373 
374    struct tu_physical_device *physical_device;
375    int fd;
376    int _lost;
377 
378    struct ir3_compiler *compiler;
379 
380    /* Backup in-memory cache to be used if the app doesn't provide one */
381    struct tu_pipeline_cache *mem_cache;
382 
383 #define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
384 
385    /* Currently the kernel driver uses a 32-bit GPU address space, but it
386     * should be impossible to go beyond 48 bits.
387     */
388    struct {
389       struct tu_bo bo;
390       mtx_t construct_mtx;
391       bool initialized;
392    } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
393 
394    struct tu_bo global_bo;
395 
396    struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
397    uint64_t global_shader_va[GLOBAL_SH_COUNT];
398 
399    uint32_t vsc_draw_strm_pitch;
400    uint32_t vsc_prim_strm_pitch;
401    BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
402    mtx_t mutex;
403 
404    /* bo list for submits: */
405    struct drm_msm_gem_submit_bo *bo_list;
406    /* map bo handles to bo list index: */
407    uint32_t *bo_idx;
408    uint32_t bo_count, bo_list_size, bo_idx_size;
409    mtx_t bo_mutex;
410 
411    /* Command streams to set pass index to a scratch reg */
412    struct tu_cs *perfcntrs_pass_cs;
413    struct tu_cs_entry *perfcntrs_pass_cs_entries;
414 
415    /* Condition variable for timeline semaphore to notify waiters when a
416     * new submit is executed. */
417    pthread_cond_t timeline_cond;
418    pthread_mutex_t submit_mutex;
419 
420 #ifdef ANDROID
421    const void *gralloc;
422    enum {
423       TU_GRALLOC_UNKNOWN,
424       TU_GRALLOC_CROS,
425       TU_GRALLOC_OTHER,
426    } gralloc_type;
427 #endif
428 
429    uint32_t submit_count;
430 
431    struct u_trace_context trace_context;
432 
433    #ifdef HAVE_PERFETTO
434    struct tu_perfetto_state perfetto;
435    #endif
436 };
437 
438 void tu_init_clear_blit_shaders(struct tu_device *dev);
439 
440 void tu_destroy_clear_blit_shaders(struct tu_device *dev);
441 
442 VkResult _tu_device_set_lost(struct tu_device *device,
443                              const char *msg, ...) PRINTFLIKE(2, 3);
444 #define tu_device_set_lost(dev, ...) \
445    _tu_device_set_lost(dev, __VA_ARGS__)
446 
447 static inline bool
448 tu_device_is_lost(struct tu_device *device)
449 {
450    return unlikely(p_atomic_read(&device->_lost));
451 }
452 
453 VkResult
454 tu_device_submit_deferred_locked(struct tu_device *dev);
455 
456 VkResult
457 tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
458 
459 uint64_t
460 tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
461 
462 enum tu_bo_alloc_flags
463 {
464    TU_BO_ALLOC_NO_FLAGS = 0,
465    TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
466    TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
467 };
468 
469 VkResult
470 tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size,
471                enum tu_bo_alloc_flags flags);
472 VkResult
473 tu_bo_init_dmabuf(struct tu_device *dev,
474                   struct tu_bo *bo,
475                   uint64_t size,
476                   int fd);
477 int
478 tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
479 void
480 tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
481 VkResult
482 tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
483 
484 /* Get a scratch bo for use inside a command buffer. This will always return
485  * the same bo given the same size or similar sizes, so only one scratch bo
486  * can be used at the same time. It's meant for short-lived things where we
487  * need to write to some piece of memory, read from it, and then immediately
488  * discard it.
489  */
490 VkResult
491 tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
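
/* A minimal usage sketch (error handling and caller context are
 * hypothetical):
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, size, &scratch);
 *    if (result != VK_SUCCESS)
 *       return result;
 *    ... emit commands that read/write scratch->iova ...
 *
 * Since the same bo is returned for similar sizes, its contents must not be
 * relied upon once any other user may have obtained it.
 */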
492 
493 struct tu_cs_entry
494 {
495    /* No ownership */
496    const struct tu_bo *bo;
497 
498    uint32_t size;
499    uint32_t offset;
500 };
501 
502 struct tu_cs_memory {
503    uint32_t *map;
504    uint64_t iova;
505 };
506 
507 struct tu_draw_state {
508    uint64_t iova : 48;
509    uint32_t size : 16;
510 };
511 
512 enum tu_dynamic_state
513 {
514    /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
515    TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
516    TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
517    TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
518    TU_DYNAMIC_STATE_VB_STRIDE,
519    TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
520    TU_DYNAMIC_STATE_COUNT,
521    /* no associated draw state: */
522    TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
523    TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
524    /* re-use the line width enum as it uses GRAS_SU_CNTL: */
525    TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
526 };
527 
528 enum tu_draw_state_group_id
529 {
530    TU_DRAW_STATE_PROGRAM_CONFIG,
531    TU_DRAW_STATE_PROGRAM,
532    TU_DRAW_STATE_PROGRAM_BINNING,
533    TU_DRAW_STATE_TESS,
534    TU_DRAW_STATE_VB,
535    TU_DRAW_STATE_VI,
536    TU_DRAW_STATE_VI_BINNING,
537    TU_DRAW_STATE_RAST,
538    TU_DRAW_STATE_BLEND,
539    TU_DRAW_STATE_SHADER_GEOM_CONST,
540    TU_DRAW_STATE_FS_CONST,
541    TU_DRAW_STATE_DESC_SETS,
542    TU_DRAW_STATE_DESC_SETS_LOAD,
543    TU_DRAW_STATE_VS_PARAMS,
544    TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
545    TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
546    TU_DRAW_STATE_LRZ,
547    TU_DRAW_STATE_DEPTH_PLANE,
548 
549    /* dynamic state related draw states */
550    TU_DRAW_STATE_DYNAMIC,
551    TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
552 };
553 
554 enum tu_cs_mode
555 {
556 
557    /*
558     * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
559     * is full.  tu_cs_begin must be called before command packet emission and
560     * tu_cs_end must be called after.
561     *
562     * This mode may create multiple entries internally.  The entries must be
563     * submitted together.
564     */
565    TU_CS_MODE_GROW,
566 
567    /*
568     * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
569     * fixed-size buffer.  tu_cs_begin and tu_cs_end are optional and have no
570     * effect on it.
571     *
572     * This mode does not create any entry or any BO.
573     */
574    TU_CS_MODE_EXTERNAL,
575 
576    /*
577     * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
578     * command packet emission.  tu_cs_begin_sub_stream must be called to get a
579     * sub-stream to emit command packets to.  When done with the sub-stream,
580     * tu_cs_end_sub_stream must be called.
581     *
582     * This mode does not create any entry internally.
583     */
584    TU_CS_MODE_SUB_STREAM,
585 };
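
/* Rough sketch of TU_CS_MODE_GROW usage (using the begin/end helpers
 * referenced above; shown here for illustration only):
 *
 *    tu_cs_begin(cs);
 *    ... emit command packets; the stream grows as needed and may be split
 *        into multiple struct tu_cs_entry internally ...
 *    tu_cs_end(cs);
 *
 * All resulting entries must then be submitted together.  TU_CS_MODE_SUB_STREAM
 * instead brackets emission with tu_cs_begin_sub_stream/tu_cs_end_sub_stream,
 * as described above.
 */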
586 
587 struct tu_cs
588 {
589    uint32_t *start;
590    uint32_t *cur;
591    uint32_t *reserved_end;
592    uint32_t *end;
593 
594    struct tu_device *device;
595    enum tu_cs_mode mode;
596    uint32_t next_bo_size;
597 
598    struct tu_cs_entry *entries;
599    uint32_t entry_count;
600    uint32_t entry_capacity;
601 
602    struct tu_bo **bos;
603    uint32_t bo_count;
604    uint32_t bo_capacity;
605 
606    /* state for cond_exec_start/cond_exec_end */
607    uint32_t cond_flags;
608    uint32_t *cond_dwords;
609 };
610 
611 struct tu_device_memory
612 {
613    struct vk_object_base base;
614 
615    struct tu_bo bo;
616 };
617 
618 struct tu_descriptor_range
619 {
620    uint64_t va;
621    uint32_t size;
622 };
623 
624 struct tu_descriptor_set
625 {
626    struct vk_object_base base;
627 
628    const struct tu_descriptor_set_layout *layout;
629    struct tu_descriptor_pool *pool;
630    uint32_t size;
631 
632    uint64_t va;
633    uint32_t *mapped_ptr;
634 
635    uint32_t *dynamic_descriptors;
636 };
637 
638 struct tu_descriptor_pool_entry
639 {
640    uint32_t offset;
641    uint32_t size;
642    struct tu_descriptor_set *set;
643 };
644 
645 struct tu_descriptor_pool
646 {
647    struct vk_object_base base;
648 
649    struct tu_bo bo;
650    uint64_t current_offset;
651    uint64_t size;
652 
653    uint8_t *host_memory_base;
654    uint8_t *host_memory_ptr;
655    uint8_t *host_memory_end;
656    uint8_t *host_bo;
657 
658    uint32_t entry_count;
659    uint32_t max_entry_count;
660    struct tu_descriptor_pool_entry entries[0];
661 };
662 
663 struct tu_descriptor_update_template_entry
664 {
665    VkDescriptorType descriptor_type;
666 
667    /* The number of descriptors to update */
668    uint32_t descriptor_count;
669 
670    /* Into mapped_ptr or dynamic_descriptors, in units of the respective array
671     */
672    uint32_t dst_offset;
673 
674    /* In dwords. Not valid/used for dynamic descriptors */
675    uint32_t dst_stride;
676 
677    uint32_t buffer_offset;
678 
679    /* Only valid for combined image samplers and samplers */
680    uint16_t has_sampler;
681 
682    /* In bytes */
683    size_t src_offset;
684    size_t src_stride;
685 
686    /* For push descriptors */
687    const struct tu_sampler *immutable_samplers;
688 };
689 
690 struct tu_descriptor_update_template
691 {
692    struct vk_object_base base;
693 
694    uint32_t entry_count;
695    VkPipelineBindPoint bind_point;
696    struct tu_descriptor_update_template_entry entry[0];
697 };
698 
699 struct tu_buffer
700 {
701    struct vk_object_base base;
702 
703    VkDeviceSize size;
704 
705    VkBufferUsageFlags usage;
706    VkBufferCreateFlags flags;
707 
708    struct tu_bo *bo;
709    VkDeviceSize bo_offset;
710 };
711 
712 static inline uint64_t
713 tu_buffer_iova(struct tu_buffer *buffer)
714 {
715    return buffer->bo->iova + buffer->bo_offset;
716 }
717 
718 const char *
719 tu_get_debug_option_name(int id);
720 
721 const char *
722 tu_get_perftest_option_name(int id);
723 
724 struct tu_descriptor_state
725 {
726    struct tu_descriptor_set *sets[MAX_SETS];
727    struct tu_descriptor_set push_set;
728    uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS];
729 };
730 
731 enum tu_cmd_dirty_bits
732 {
733    TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
734    TU_CMD_DIRTY_VB_STRIDE = BIT(1),
735    TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
736    TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
737    TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
738    TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
739    TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
740    TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
741    TU_CMD_DIRTY_LRZ = BIT(8),
742    TU_CMD_DIRTY_VS_PARAMS = BIT(9),
743    TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
744    /* all draw states were disabled and need to be re-enabled: */
745    TU_CMD_DIRTY_DRAW_STATE = BIT(11)
746 };
747 
748 /* There are only three cache domains we have to care about. Two belong to
749  * the CCU, or color cache unit, which is used for color and depth/stencil
750  * attachments and copy/blit destinations and is split conceptually into
751  * color and depth. The third is the universal cache, or UCHE, which is used
752  * for pretty much everything else, except for the CP (uncached) and the
753  * host. We need to flush whenever data crosses these boundaries.
754  */
755 
756 enum tu_cmd_access_mask {
757    TU_ACCESS_UCHE_READ = 1 << 0,
758    TU_ACCESS_UCHE_WRITE = 1 << 1,
759    TU_ACCESS_CCU_COLOR_READ = 1 << 2,
760    TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
761    TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
762    TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
763 
764    /* Experiments have shown that while it's safe to avoid flushing the CCU
765     * after each blit/renderpass, it's not safe to assume that subsequent
766     * lookups with a different attachment state will hit unflushed cache
767     * entries. That is, the CCU needs to be flushed and possibly invalidated
768     * when accessing memory with a different attachment state. Writing to an
769     * attachment under the following conditions after clearing using the
770     * normal 2d engine path is known to have issues:
771     *
772     * - It isn't the 0'th layer.
773     * - There is more than one attachment, and this isn't the 0'th attachment
774     *   (this seems to also depend on the cpp of the attachments).
775     *
776     * Our best guess is that the layer/MRT state is used when computing
777     * the location of a cache entry in CCU, to avoid conflicts. We assume that
778     * any access in a renderpass after or before an access by a transfer needs
779     * a flush/invalidate, and use the _INCOHERENT variants to represent access
780     * by a renderpass.
781     */
782    TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
783    TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
784    TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
785    TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
786 
787    /* Accesses which bypass any cache, e.g. writes via the host,
788     * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
789     */
790    TU_ACCESS_SYSMEM_READ = 1 << 10,
791    TU_ACCESS_SYSMEM_WRITE = 1 << 11,
792 
793    /* Memory writes from the CP start in-order with draws and event writes,
794     * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
795     */
796    TU_ACCESS_CP_WRITE = 1 << 12,
797 
798    TU_ACCESS_READ =
799       TU_ACCESS_UCHE_READ |
800       TU_ACCESS_CCU_COLOR_READ |
801       TU_ACCESS_CCU_DEPTH_READ |
802       TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
803       TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
804       TU_ACCESS_SYSMEM_READ,
805 
806    TU_ACCESS_WRITE =
807       TU_ACCESS_UCHE_WRITE |
808       TU_ACCESS_CCU_COLOR_WRITE |
809       TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
810       TU_ACCESS_CCU_DEPTH_WRITE |
811       TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
812       TU_ACCESS_SYSMEM_WRITE |
813       TU_ACCESS_CP_WRITE,
814 
815    TU_ACCESS_ALL =
816       TU_ACCESS_READ |
817       TU_ACCESS_WRITE,
818 };
819 
820 /* Starting with a6xx, the pipeline is split into several "clusters" (really
821  * pipeline stages). Each stage has its own pair of register banks and can
822  * switch them independently, so that earlier stages can run ahead of later
823  * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
824  * the same time.
825  *
826  * As a result of this, we need to insert a WFI when an earlier stage depends
827  * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
828  * pending WFI's to complete before starting, and usually before reading
829  * indirect params even, so a WFI also acts as a full "pipeline stall".
830  *
831  * Note, the names of the stages come from CLUSTER_* in devcoredump. We
832  * include all the stages for completeness, even ones which do not read/write
833  * anything.
834  */
835 
836 enum tu_stage {
837    /* This doesn't correspond to a cluster, but we need it for tracking
838     * indirect draw parameter reads etc.
839     */
840    TU_STAGE_CP,
841 
842    /* - Fetch index buffer
843     * - Fetch vertex attributes, dispatch VS
844     */
845    TU_STAGE_FE,
846 
847    /* Execute all geometry stages (VS thru GS) */
848    TU_STAGE_SP_VS,
849 
850    /* Write to VPC, do primitive assembly. */
851    TU_STAGE_PC_VS,
852 
853    /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
854     * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
855     * early depth testing is enabled before dispatching fragments? However
856     * GRAS reads and writes LRZ directly.
857     */
858    TU_STAGE_GRAS,
859 
860    /* Execute FS */
861    TU_STAGE_SP_PS,
862 
863    /* - Fragment tests
864     * - Write color/depth
865     * - Streamout writes (???)
866     * - Varying interpolation (???)
867     */
868    TU_STAGE_PS,
869 };
870 
871 enum tu_cmd_flush_bits {
872    TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
873    TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
874    TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
875    TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
876    TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
877    TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
878    TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
879    TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
880    TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
881 
882    TU_CMD_FLAG_ALL_FLUSH =
883       TU_CMD_FLAG_CCU_FLUSH_DEPTH |
884       TU_CMD_FLAG_CCU_FLUSH_COLOR |
885       TU_CMD_FLAG_CACHE_FLUSH |
886       /* Treat the CP as a sort of "cache" which may need to be "flushed" via
887        * waiting for writes to land with WAIT_FOR_MEM_WRITES.
888        */
889       TU_CMD_FLAG_WAIT_MEM_WRITES,
890 
891    TU_CMD_FLAG_ALL_INVALIDATE =
892       TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
893       TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
894       TU_CMD_FLAG_CACHE_INVALIDATE,
895 };
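
/* For example, under the model above, a render target written through the
 * color CCU and then sampled as a texture (a UCHE read) would require
 * TU_CMD_FLAG_CCU_FLUSH_COLOR followed by TU_CMD_FLAG_CACHE_INVALIDATE
 * before the read; the actual mapping from accesses to flush bits is handled
 * elsewhere in the driver.
 */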
896 
897 /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
898  * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
899  * which part of the gmem is used by the CCU. Here we keep track of the
900  * state of the CCU.
901  */
902 enum tu_cmd_ccu_state {
903    TU_CMD_CCU_SYSMEM,
904    TU_CMD_CCU_GMEM,
905    TU_CMD_CCU_UNKNOWN,
906 };
907 
908 struct tu_cache_state {
909    /* Caches which must be made available (flushed) eventually if there are
910     * any users outside that cache domain, and caches which must be
911     * invalidated eventually if there are any reads.
912     */
913    enum tu_cmd_flush_bits pending_flush_bits;
914    /* Pending flushes */
915    enum tu_cmd_flush_bits flush_bits;
916 };
917 
918 enum tu_lrz_force_disable_mask {
919    TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
920    TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
921 };
922 
923 enum tu_lrz_direction {
924    TU_LRZ_UNKNOWN,
925    /* Depth func less/less-than: */
926    TU_LRZ_LESS,
927    /* Depth func greater/greater-than: */
928    TU_LRZ_GREATER,
929 };
930 
931 struct tu_lrz_pipeline
932 {
933    uint32_t force_disable_mask;
934    bool fs_has_kill;
935    bool force_late_z;
936    bool early_fragment_tests;
937 };
938 
939 struct tu_lrz_state
940 {
941    /* Depth/Stencil image currently in use for LRZ */
942    struct tu_image *image;
943    bool valid : 1;
944    struct tu_draw_state state;
945    enum tu_lrz_direction prev_direction;
946 };
947 
948 struct tu_vs_params {
949    uint32_t vertex_offset;
950    uint32_t first_instance;
951 };
952 
953 struct tu_cmd_state
954 {
955    uint32_t dirty;
956 
957    struct tu_pipeline *pipeline;
958    struct tu_pipeline *compute_pipeline;
959 
960    /* Vertex buffers, viewports, and scissors:
961     * the states for these can be updated partially, so we need to save them
962     * to be able to emit a complete draw state.
963     */
964    struct {
965       uint64_t base;
966       uint32_t size;
967       uint32_t stride;
968    } vb[MAX_VBS];
969    VkViewport viewport[MAX_VIEWPORTS];
970    VkRect2D scissor[MAX_SCISSORS];
971    uint32_t max_viewport, max_scissor;
972 
973    /* for dynamic states that can't be emitted directly */
974    uint32_t dynamic_stencil_mask;
975    uint32_t dynamic_stencil_wrmask;
976    uint32_t dynamic_stencil_ref;
977 
978    uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
979    uint32_t pc_raster_cntl, vpc_unknown_9107;
980    enum pc_di_primtype primtype;
981    bool primitive_restart_enable;
982 
983    /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
984    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
985    struct tu_draw_state vertex_buffers;
986    struct tu_draw_state shader_const[2];
987    struct tu_draw_state desc_sets;
988 
989    struct tu_draw_state vs_params;
990 
991    /* Index buffer */
992    uint64_t index_va;
993    uint32_t max_index_count;
994    uint8_t index_size;
995 
996    /* because streamout base has to be 32-byte aligned,
997     * there is an extra offset to deal with when it is
998     * unaligned
999     */
1000    uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
1001 
1002    /* Renderpasses are tricky, because we may need to flush differently if
1003     * using sysmem vs. gmem and therefore we have to delay any flushing that
1004     * happens before a renderpass. So we have to have two copies of the flush
1005     * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
1006     * and one for outside a renderpass.
1007     */
1008    struct tu_cache_state cache;
1009    struct tu_cache_state renderpass_cache;
1010 
1011    enum tu_cmd_ccu_state ccu_state;
1012 
1013    const struct tu_render_pass *pass;
1014    const struct tu_subpass *subpass;
1015    const struct tu_framebuffer *framebuffer;
1016    VkRect2D render_area;
1017 
1018    const struct tu_image_view **attachments;
1019 
1020    bool xfb_used;
1021    bool has_tess;
1022    bool has_subpass_predication;
1023    bool predication_active;
1024    bool disable_gmem;
1025    enum a5xx_line_mode line_mode;
1026 
1027    struct tu_lrz_state lrz;
1028 
1029    struct tu_draw_state depth_plane_state;
1030 
1031    struct tu_vs_params last_vs_params;
1032 };
1033 
1034 struct tu_cmd_pool
1035 {
1036    struct vk_object_base base;
1037 
1038    VkAllocationCallbacks alloc;
1039    struct list_head cmd_buffers;
1040    struct list_head free_cmd_buffers;
1041    uint32_t queue_family_index;
1042 };
1043 
1044 enum tu_cmd_buffer_status
1045 {
1046    TU_CMD_BUFFER_STATUS_INVALID,
1047    TU_CMD_BUFFER_STATUS_INITIAL,
1048    TU_CMD_BUFFER_STATUS_RECORDING,
1049    TU_CMD_BUFFER_STATUS_EXECUTABLE,
1050    TU_CMD_BUFFER_STATUS_PENDING,
1051 };
1052 
1053 struct tu_cmd_buffer
1054 {
1055    struct vk_command_buffer vk;
1056 
1057    struct tu_device *device;
1058 
1059    struct tu_cmd_pool *pool;
1060    struct list_head pool_link;
1061 
1062    struct u_trace trace;
1063    struct u_trace_iterator trace_renderpass_start;
1064    struct u_trace_iterator trace_renderpass_end;
1065 
1066    VkCommandBufferUsageFlags usage_flags;
1067    VkCommandBufferLevel level;
1068    enum tu_cmd_buffer_status status;
1069 
1070    struct tu_cmd_state state;
1071    uint32_t queue_family_index;
1072 
1073    uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
1074    VkShaderStageFlags push_constant_stages;
1075    struct tu_descriptor_set meta_push_descriptors;
1076 
1077    struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
1078 
1079    VkResult record_result;
1080 
1081    struct tu_cs cs;
1082    struct tu_cs draw_cs;
1083    struct tu_cs tile_store_cs;
1084    struct tu_cs draw_epilogue_cs;
1085    struct tu_cs sub_cs;
1086 
1087    uint32_t vsc_draw_strm_pitch;
1088    uint32_t vsc_prim_strm_pitch;
1089 };
1090 
1091 /* Temporary struct for tracking a register state to be written, used by
1092  * a6xx-pack.h and tu_cs_emit_regs()
1093  */
1094 struct tu_reg_value {
1095    uint32_t reg;
1096    uint64_t value;
1097    bool is_address;
1098    struct tu_bo *bo;
1099    bool bo_write;
1100    uint32_t bo_offset;
1101    uint32_t bo_shift;
1102 };
1103 
1104 
1105 void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
1106                                     struct tu_cs *cs);
1107 
1108 void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
1109                              struct tu_cs *cs,
1110                              enum tu_cmd_ccu_state ccu_state);
1111 
1112 void
1113 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
1114                      struct tu_cs *cs,
1115                      enum vgt_event_type event);
1116 
1117 static inline struct tu_descriptor_state *
1118 tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
1119                          VkPipelineBindPoint bind_point)
1120 {
1121    return &cmd_buffer->descriptors[bind_point];
1122 }
1123 
1124 struct tu_event
1125 {
1126    struct vk_object_base base;
1127    struct tu_bo bo;
1128 };
1129 
1130 struct tu_push_constant_range
1131 {
1132    uint32_t lo;
1133    uint32_t count;
1134 };
1135 
1136 struct tu_shader
1137 {
1138    struct ir3_shader *ir3_shader;
1139 
1140    struct tu_push_constant_range push_consts;
1141    uint8_t active_desc_sets;
1142    bool multi_pos_output;
1143 };
1144 
1145 bool
1146 tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
1147                        struct tu_device *dev);
1148 
1149 nir_shader *
1150 tu_spirv_to_nir(struct tu_device *dev,
1151                 const VkPipelineShaderStageCreateInfo *stage_info,
1152                 gl_shader_stage stage);
1153 
1154 struct tu_shader *
1155 tu_shader_create(struct tu_device *dev,
1156                  nir_shader *nir,
1157                  unsigned multiview_mask,
1158                  struct tu_pipeline_layout *layout,
1159                  const VkAllocationCallbacks *alloc);
1160 
1161 void
1162 tu_shader_destroy(struct tu_device *dev,
1163                   struct tu_shader *shader,
1164                   const VkAllocationCallbacks *alloc);
1165 
1166 struct tu_program_descriptor_linkage
1167 {
1168    struct ir3_const_state const_state;
1169 
1170    uint32_t constlen;
1171 
1172    struct tu_push_constant_range push_consts;
1173 };
1174 
1175 struct tu_pipeline_executable {
1176    gl_shader_stage stage;
1177 
1178    struct ir3_info stats;
1179    bool is_binning;
1180 
1181    char *nir_from_spirv;
1182    char *nir_final;
1183    char *disasm;
1184 };
1185 
1186 struct tu_pipeline
1187 {
1188    struct vk_object_base base;
1189 
1190    struct tu_cs cs;
1191 
1192    /* Separate BO for private memory since it should be GPU-writable */
1193    struct tu_bo pvtmem_bo;
1194 
1195    struct tu_pipeline_layout *layout;
1196 
1197    bool need_indirect_descriptor_sets;
1198    VkShaderStageFlags active_stages;
1199    uint32_t active_desc_sets;
1200 
1201    /* mask of enabled dynamic states
1202     * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
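    * (e.g. dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE) set means the
    *  vertex binding stride comes from the command buffer state instead)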
1203     */
1204    uint32_t dynamic_state_mask;
1205    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
1206 
1207    /* for dynamic states which use the same register: */
1208    uint32_t gras_su_cntl, gras_su_cntl_mask;
1209    uint32_t rb_depth_cntl, rb_depth_cntl_mask;
1210    uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
1211    uint32_t pc_raster_cntl, pc_raster_cntl_mask;
1212    uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
1213    uint32_t stencil_wrmask;
1214 
1215    bool rb_depth_cntl_disable;
1216 
1217    enum a5xx_line_mode line_mode;
1218 
1219    /* draw states for the pipeline */
1220    struct tu_draw_state load_state, rast_state, blend_state;
1221 
1222    /* for vertex buffer state */
1223    uint32_t num_vbs;
1224 
1225    struct
1226    {
1227       struct tu_draw_state config_state;
1228       struct tu_draw_state state;
1229       struct tu_draw_state binning_state;
1230 
1231       struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
1232    } program;
1233 
1234    struct
1235    {
1236       struct tu_draw_state state;
1237       struct tu_draw_state binning_state;
1238    } vi;
1239 
1240    struct
1241    {
1242       enum pc_di_primtype primtype;
1243       bool primitive_restart;
1244    } ia;
1245 
1246    struct
1247    {
1248       uint32_t patch_type;
1249       uint32_t param_stride;
1250       uint32_t hs_bo_regid;
1251       uint32_t ds_bo_regid;
1252       bool upper_left_domain_origin;
1253    } tess;
1254 
1255    struct
1256    {
1257       uint32_t local_size[3];
1258       uint32_t subgroup_size;
1259    } compute;
1260 
1261    bool provoking_vertex_last;
1262 
1263    struct tu_lrz_pipeline lrz;
1264 
1265    void *executables_mem_ctx;
1266    /* tu_pipeline_executable */
1267    struct util_dynarray executables;
1268 };
1269 
1270 void
1271 tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport);
1272 
1273 void
1274 tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
1275 
1276 void
1277 tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
1278 
1279 void
1280 tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
1281 
1282 void
1283 tu6_emit_depth_bias(struct tu_cs *cs,
1284                     float constant_factor,
1285                     float clamp,
1286                     float slope_factor);
1287 
1288 void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
1289                    enum a5xx_line_mode line_mode);
1290 
1291 void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
1292 
1293 void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
1294 
1295 void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1296 
1297 void tu6_apply_depth_bounds_workaround(struct tu_device *device,
1298                                        uint32_t *rb_depth_cntl);
1299 
1300 struct tu_pvtmem_config {
1301    uint64_t iova;
1302    uint32_t per_fiber_size;
1303    uint32_t per_sp_size;
1304    bool per_wave;
1305 };
1306 
1307 void
1308 tu6_emit_xs_config(struct tu_cs *cs,
1309                    gl_shader_stage stage,
1310                    const struct ir3_shader_variant *xs);
1311 
1312 void
1313 tu6_emit_xs(struct tu_cs *cs,
1314             gl_shader_stage stage,
1315             const struct ir3_shader_variant *xs,
1316             const struct tu_pvtmem_config *pvtmem,
1317             uint64_t binary_iova);
1318 
1319 void
1320 tu6_emit_vpc(struct tu_cs *cs,
1321              const struct ir3_shader_variant *vs,
1322              const struct ir3_shader_variant *hs,
1323              const struct ir3_shader_variant *ds,
1324              const struct ir3_shader_variant *gs,
1325              const struct ir3_shader_variant *fs,
1326              uint32_t patch_control_points);
1327 
1328 void
1329 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
1330 
1331 struct tu_image_view;
1332 
1333 void
1334 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1335                   struct tu_cs *cs,
1336                   const struct tu_image_view *src,
1337                   const struct tu_image_view *dst,
1338                   uint32_t layer_mask,
1339                   uint32_t layers,
1340                   const VkRect2D *rect);
1341 
1342 void
1343 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
1344                            struct tu_cs *cs,
1345                            uint32_t a,
1346                            const VkRenderPassBeginInfo *info);
1347 
1348 void
1349 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
1350                          struct tu_cs *cs,
1351                          uint32_t a,
1352                          const VkRenderPassBeginInfo *info);
1353 
1354 void
1355 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
1356                         struct tu_cs *cs,
1357                         uint32_t a,
1358                         bool force_load);
1359 
1360 /* expose this function to be able to emit load without checking LOAD_OP */
1361 void
1362 tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);
1363 
1364 /* note: gmem store can also resolve */
1365 void
1366 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
1367                          struct tu_cs *cs,
1368                          uint32_t a,
1369                          uint32_t gmem_a);
1370 
1371 struct tu_native_format
1372 {
1373    enum a6xx_format fmt : 8;
1374    enum a3xx_color_swap swap : 8;
1375    enum a6xx_tile_mode tile_mode : 8;
1376 };
1377 
1378 bool tu6_format_vtx_supported(VkFormat format);
1379 struct tu_native_format tu6_format_vtx(VkFormat format);
1380 bool tu6_format_color_supported(VkFormat format);
1381 struct tu_native_format tu6_format_color(VkFormat format, enum a6xx_tile_mode tile_mode);
1382 bool tu6_format_texture_supported(VkFormat format);
1383 struct tu_native_format tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode);
1384 
1385 static inline enum a6xx_format
1386 tu6_base_format(VkFormat format)
1387 {
1388    /* note: tu6_format_color doesn't care about tiling for .fmt field */
1389    return tu6_format_color(format, TILE6_LINEAR).fmt;
1390 }
1391 
1392 struct tu_image
1393 {
1394    struct vk_object_base base;
1395 
1396    /* The original VkFormat provided by the client.  This may not match any
1397     * of the actual surface formats.
1398     */
1399    VkFormat vk_format;
1400    uint32_t level_count;
1401    uint32_t layer_count;
1402 
1403    struct fdl_layout layout[3];
1404    uint32_t total_size;
1405 
1406 #ifdef ANDROID
1407    /* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
1408    VkDeviceMemory owned_memory;
1409 #endif
1410 
1411    /* Set when bound */
1412    struct tu_bo *bo;
1413    VkDeviceSize bo_offset;
1414 
1415    uint32_t lrz_height;
1416    uint32_t lrz_pitch;
1417    uint32_t lrz_offset;
1418 
1419    bool shareable;
1420 };
1421 
1422 static inline uint32_t
1423 tu_get_layerCount(const struct tu_image *image,
1424                   const VkImageSubresourceRange *range)
1425 {
1426    return range->layerCount == VK_REMAINING_ARRAY_LAYERS
1427              ? image->layer_count - range->baseArrayLayer
1428              : range->layerCount;
1429 }
1430 
1431 static inline uint32_t
1432 tu_get_levelCount(const struct tu_image *image,
1433                   const VkImageSubresourceRange *range)
1434 {
1435    return range->levelCount == VK_REMAINING_MIP_LEVELS
1436              ? image->level_count - range->baseMipLevel
1437              : range->levelCount;
1438 }
1439 
1440 struct tu_image_view
1441 {
1442    struct vk_object_base base;
1443 
1444    struct tu_image *image; /**< VkImageViewCreateInfo::image */
1445 
1446    uint64_t base_addr;
1447    uint64_t ubwc_addr;
1448    uint32_t layer_size;
1449    uint32_t ubwc_layer_size;
1450 
1451    /* used to determine if fast gmem store path can be used */
1452    VkExtent2D extent;
1453    bool need_y2_align;
1454 
1455    bool ubwc_enabled;
1456 
1457    uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
1458 
1459    /* Descriptor for use as a storage image as opposed to a sampled image.
1460     * This has a few differences for cube maps (e.g. type).
1461     */
1462    uint32_t storage_descriptor[A6XX_TEX_CONST_DWORDS];
1463 
1464    /* pre-filled register values */
1465    uint32_t PITCH;
1466    uint32_t FLAG_BUFFER_PITCH;
1467 
1468    uint32_t RB_MRT_BUF_INFO;
1469    uint32_t SP_FS_MRT_REG;
1470 
1471    uint32_t SP_PS_2D_SRC_INFO;
1472    uint32_t SP_PS_2D_SRC_SIZE;
1473 
1474    uint32_t RB_2D_DST_INFO;
1475 
1476    uint32_t RB_BLIT_DST_INFO;
1477 
1478    /* for d32s8 separate stencil */
1479    uint64_t stencil_base_addr;
1480    uint32_t stencil_layer_size;
1481    uint32_t stencil_PITCH;
1482 };
1483 
1484 struct tu_sampler_ycbcr_conversion {
1485    struct vk_object_base base;
1486 
1487    VkFormat format;
1488    VkSamplerYcbcrModelConversion ycbcr_model;
1489    VkSamplerYcbcrRange ycbcr_range;
1490    VkComponentMapping components;
1491    VkChromaLocation chroma_offsets[2];
1492    VkFilter chroma_filter;
1493 };
1494 
1495 struct tu_sampler {
1496    struct vk_object_base base;
1497 
1498    uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
1499    struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
1500 };
1501 
1502 void
1503 tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1504 
1505 void
1506 tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src);
1507 
1508 void
1509 tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1510 
1511 void
1512 tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1513 
1514 #define tu_image_view_stencil(iview, x) \
1515    ((iview->x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
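
/* e.g. tu_image_view_stencil(iview, RB_2D_DST_INFO) yields the value of
 * iview->RB_2D_DST_INFO with its COLOR_FORMAT field replaced by FMT6_8_UINT,
 * as used when accessing the separate stencil plane of a D32S8 attachment.
 */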
1516 
1517 VkResult
1518 tu_gralloc_info(struct tu_device *device,
1519                 const VkNativeBufferANDROID *gralloc_info,
1520                 int *dma_buf,
1521                 uint64_t *modifier);
1522 
1523 VkResult
1524 tu_import_memory_from_gralloc_handle(VkDevice device_h,
1525                                      int dma_buf,
1526                                      const VkAllocationCallbacks *alloc,
1527                                      VkImage image_h);
1528 
1529 void
1530 tu_image_view_init(struct tu_image_view *iview,
1531                    const VkImageViewCreateInfo *pCreateInfo,
1532                    bool limited_z24s8);
1533 
1534 bool
1535 ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
1536               const struct fd_dev_info *info, VkSampleCountFlagBits samples);
1537 
1538 struct tu_buffer_view
1539 {
1540    struct vk_object_base base;
1541 
1542    uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
1543 
1544    struct tu_buffer *buffer;
1545 };
1546 void
1547 tu_buffer_view_init(struct tu_buffer_view *view,
1548                     struct tu_device *device,
1549                     const VkBufferViewCreateInfo *pCreateInfo);
1550 
1551 struct tu_attachment_info
1552 {
1553    struct tu_image_view *attachment;
1554 };
1555 
1556 struct tu_framebuffer
1557 {
1558    struct vk_object_base base;
1559 
1560    uint32_t width;
1561    uint32_t height;
1562    uint32_t layers;
1563 
1564    /* size of the first tile */
1565    VkExtent2D tile0;
1566    /* number of tiles */
1567    VkExtent2D tile_count;
1568 
1569    /* size of the first VSC pipe */
1570    VkExtent2D pipe0;
1571    /* number of VSC pipes */
1572    VkExtent2D pipe_count;
1573 
1574    /* pipe register values */
1575    uint32_t pipe_config[MAX_VSC_PIPES];
1576    uint32_t pipe_sizes[MAX_VSC_PIPES];
1577 
1578    uint32_t attachment_count;
1579    struct tu_attachment_info attachments[0];
1580 };
1581 
1582 void
1583 tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
1584                              const struct tu_device *device,
1585                              const struct tu_render_pass *pass);
1586 
1587 struct tu_subpass_barrier {
1588    VkPipelineStageFlags src_stage_mask;
1589    VkPipelineStageFlags dst_stage_mask;
1590    VkAccessFlags src_access_mask;
1591    VkAccessFlags dst_access_mask;
1592    bool incoherent_ccu_color, incoherent_ccu_depth;
1593 };
1594 
1595 struct tu_subpass_attachment
1596 {
1597    uint32_t attachment;
1598 
1599    /* For input attachments, true if it needs to be patched to refer to GMEM
1600     * in GMEM mode. This is false if it hasn't already been written as an
1601     * attachment.
1602     */
1603    bool patch_input_gmem;
1604 };
1605 
1606 struct tu_subpass
1607 {
1608    uint32_t input_count;
1609    uint32_t color_count;
1610    uint32_t resolve_count;
1611    bool resolve_depth_stencil;
1612 
1613    /* True if there is any feedback loop at all. */
1614    bool feedback;
1615 
1616    /* True if we must invalidate UCHE thanks to a feedback loop. */
1617    bool feedback_invalidate;
1618 
1619    struct tu_subpass_attachment *input_attachments;
1620    struct tu_subpass_attachment *color_attachments;
1621    struct tu_subpass_attachment *resolve_attachments;
1622    struct tu_subpass_attachment depth_stencil_attachment;
1623 
1624    VkSampleCountFlagBits samples;
1625 
1626    uint32_t srgb_cntl;
1627    uint32_t multiview_mask;
1628 
1629    struct tu_subpass_barrier start_barrier;
1630 };
1631 
1632 struct tu_render_pass_attachment
1633 {
1634    VkFormat format;
1635    uint32_t samples;
1636    uint32_t cpp;
1637    VkImageAspectFlags clear_mask;
1638    uint32_t clear_views;
1639    bool load;
1640    bool store;
1641    int32_t gmem_offset;
1642    /* for D32S8 separate stencil: */
1643    bool load_stencil;
1644    bool store_stencil;
1645    int32_t gmem_offset_stencil;
1646 };
1647 
1648 struct tu_render_pass
1649 {
1650    struct vk_object_base base;
1651 
1652    uint32_t attachment_count;
1653    uint32_t subpass_count;
1654    uint32_t gmem_pixels;
1655    uint32_t tile_align_w;
1656    struct tu_subpass_attachment *subpass_attachments;
1657    struct tu_render_pass_attachment *attachments;
1658    struct tu_subpass_barrier end_barrier;
1659    struct tu_subpass subpasses[0];
1660 };
1661 
1662 #define PERF_CNTRS_REG 4
1663 
1664 struct tu_perf_query_data
1665 {
1666    uint32_t gid;      /* group-id */
1667    uint32_t cid;      /* countable-id within the group */
1668    uint32_t cntr_reg; /* counter register within the group */
1669    uint32_t pass;     /* pass index in which the countables can be requested */
1670    uint32_t app_idx;  /* index provided by apps */
1671 };
1672 
1673 struct tu_query_pool
1674 {
1675    struct vk_object_base base;
1676 
1677    VkQueryType type;
1678    uint32_t stride;
1679    uint64_t size;
1680    uint32_t pipeline_statistics;
1681    struct tu_bo bo;
1682 
1683    /* For performance query */
1684    const struct fd_perfcntr_group *perf_group;
1685    uint32_t perf_group_count;
1686    uint32_t counter_index_count;
1687    struct tu_perf_query_data perf_query_data[0];
1688 };
1689 
1690 uint32_t
1691 tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);
1692 
1693 void
1694 tu_update_descriptor_sets(const struct tu_device *device,
1695                           VkDescriptorSet overrideSet,
1696                           uint32_t descriptorWriteCount,
1697                           const VkWriteDescriptorSet *pDescriptorWrites,
1698                           uint32_t descriptorCopyCount,
1699                           const VkCopyDescriptorSet *pDescriptorCopies);
1700 
1701 void
1702 tu_update_descriptor_set_with_template(
1703    const struct tu_device *device,
1704    struct tu_descriptor_set *set,
1705    VkDescriptorUpdateTemplate descriptorUpdateTemplate,
1706    const void *pData);
1707 
1708 VkResult
1709 tu_physical_device_init(struct tu_physical_device *device,
1710                         struct tu_instance *instance);
1711 VkResult
1712 tu_enumerate_devices(struct tu_instance *instance);
1713 
1714 int
1715 tu_drm_get_timestamp(struct tu_physical_device *device,
1716                      uint64_t *ts);
1717 
1718 int
1719 tu_drm_submitqueue_new(const struct tu_device *dev,
1720                        int priority,
1721                        uint32_t *queue_id);
1722 
1723 void
1724 tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);
1725 
1726 int
1727 tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_syncobj *fence2);
1728 
1729 int
1730 tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync);
1731 
1732 
1733 void
1734 tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
1735                          void *ts_from, uint32_t from_offset,
1736                          void *ts_to, uint32_t to_offset,
1737                          uint32_t count);
1738 
1739 
1740 VkResult
1741 tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
1742                             struct u_trace **trace_copy);
1743 
1744 struct tu_u_trace_cmd_data
1745 {
1746    struct tu_cs *timestamp_copy_cs;
1747    struct u_trace *trace;
1748 };
1749 
1750 void
1751 tu_u_trace_cmd_data_finish(struct tu_device *device,
1752                            struct tu_u_trace_cmd_data *trace_data,
1753                            uint32_t entry_count);
1754 
1755 struct tu_u_trace_flush_data
1756 {
1757    uint32_t submission_id;
1758    struct tu_u_trace_syncobj *syncobj;
1759    uint32_t trace_count;
1760    struct tu_u_trace_cmd_data *cmd_trace_data;
1761 };
1762 
1763 #define TU_FROM_HANDLE(__tu_type, __name, __handle)                          \
1764    VK_FROM_HANDLE(__tu_type, __name, __handle)
1765 
1766 VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
1767                        VK_OBJECT_TYPE_COMMAND_BUFFER)
1768 VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
1769 VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
1770                        VK_OBJECT_TYPE_INSTANCE)
1771 VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
1772                        VK_OBJECT_TYPE_PHYSICAL_DEVICE)
1773 VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
1774 
1775 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, base, VkCommandPool,
1776                                VK_OBJECT_TYPE_COMMAND_POOL)
1777 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
1778                                VK_OBJECT_TYPE_BUFFER)
1779 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
1780                                VK_OBJECT_TYPE_BUFFER_VIEW)
1781 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
1782                                VK_OBJECT_TYPE_DESCRIPTOR_POOL)
1783 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
1784                                VK_OBJECT_TYPE_DESCRIPTOR_SET)
1785 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
1786                                VkDescriptorSetLayout,
1787                                VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
1788 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
1789                                VkDescriptorUpdateTemplate,
1790                                VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
1791 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
1792                                VK_OBJECT_TYPE_DEVICE_MEMORY)
1793 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
1794 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
1795                                VK_OBJECT_TYPE_FRAMEBUFFER)
1796 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE)
1797 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView,
1798                                VK_OBJECT_TYPE_IMAGE_VIEW);
1799 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
1800                                VK_OBJECT_TYPE_PIPELINE_CACHE)
1801 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
1802                                VK_OBJECT_TYPE_PIPELINE)
1803 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
1804                                VK_OBJECT_TYPE_PIPELINE_LAYOUT)
1805 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
1806                                VK_OBJECT_TYPE_QUERY_POOL)
1807 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
1808                                VK_OBJECT_TYPE_RENDER_PASS)
1809 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
1810                                VK_OBJECT_TYPE_SAMPLER)
1811 VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
1812                                VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
1813 
1814 /* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
1815 #define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
1816 
1817 void
1818 update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
1819 
1820 #endif /* TU_PRIVATE_H */
1821