1 /*
2  * Copyright 2016 Red Hat Inc.
3  * Based on anv:
4  * Copyright © 2015 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23  * DEALINGS IN THE SOFTWARE.
24  */
25 
26 #include "tu_private.h"
27 
28 #include <assert.h>
29 #include <fcntl.h>
30 #include <stdbool.h>
31 #include <string.h>
32 #include <unistd.h>
33 
34 #include "adreno_pm4.xml.h"
35 #include "adreno_common.xml.h"
36 #include "a6xx.xml.h"
37 
38 #include "nir/nir_builder.h"
39 #include "util/os_time.h"
40 
41 #include "tu_cs.h"
42 #include "vk_util.h"
43 
44 #define NSEC_PER_SEC 1000000000ull
45 #define WAIT_TIMEOUT 5
46 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
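/* Assuming the RBBM_PRIMCTR registers are laid out as contiguous LO/HI pairs
 * (two dwords each), STAT_COUNT evaluates to 11 counters, PRIMCTR_0 through
 * PRIMCTR_10, matching the indices returned by statistics_index() below.
 */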
47 
48 struct PACKED query_slot {
49    uint64_t available;
50 };
51 
52 struct PACKED occlusion_slot_value {
53    /* The sample counters appear to be written 16-byte aligned, even
54     * though this query only needs an 8-byte slot. */
55    uint64_t value;
56    uint64_t _padding;
57 };
58 
59 struct PACKED occlusion_query_slot {
60    struct query_slot common;
61    uint64_t result;
62 
63    struct occlusion_slot_value begin;
64    struct occlusion_slot_value end;
65 };
66 
67 struct PACKED timestamp_query_slot {
68    struct query_slot common;
69    uint64_t result;
70 };
71 
72 struct PACKED primitive_slot_value {
73    uint64_t values[2];
74 };
75 
76 struct PACKED pipeline_stat_query_slot {
77    struct query_slot common;
78    uint64_t results[STAT_COUNT];
79 
80    uint64_t begin[STAT_COUNT];
81    uint64_t end[STAT_COUNT];
82 };
83 
84 struct PACKED primitive_query_slot {
85    struct query_slot common;
86    /* The result of transform feedback queries is two integer values:
87     *   results[0] is the count of primitives written,
88     *   results[1] is the count of primitives generated.
89     * Additionally, a result for each of the 4 streams is stored in its own slot.
90     */
91    uint64_t results[2];
92 
93    /* Primitive counters also need to be 16-byte aligned. */
94    uint64_t _padding;
95 
96    struct primitive_slot_value begin[4];
97    struct primitive_slot_value end[4];
98 };
99 
100 struct PACKED perfcntr_query_slot {
101    uint64_t result;
102    uint64_t begin;
103    uint64_t end;
104 };
105 
106 struct PACKED perf_query_slot {
107    struct query_slot common;
108    struct perfcntr_query_slot perfcntr;
109 };
110 
111 /* Returns the IOVA of a given uint64_t field in a given slot of a query
112  * pool. */
113 #define query_iova(type, pool, query, field)                         \
114    pool->bo.iova + pool->stride * (query) + offsetof(type, field)
115 
116 #define occlusion_query_iova(pool, query, field)                     \
117    query_iova(struct occlusion_query_slot, pool, query, field)
118 
119 #define pipeline_stat_query_iova(pool, query, field)                 \
120    pool->bo.iova + pool->stride * (query) +                            \
121    offsetof(struct pipeline_stat_query_slot, field)
122 
123 #define primitive_query_iova(pool, query, field, i)                  \
124    query_iova(struct primitive_query_slot, pool, query, field) +     \
125    offsetof(struct primitive_slot_value, values[i])
126 
127 #define perf_query_iova(pool, query, field, i)                          \
128    pool->bo.iova + pool->stride * (query) +                             \
129    sizeof(struct query_slot) +                                   \
130    sizeof(struct perfcntr_query_slot) * (i) +                          \
131    offsetof(struct perfcntr_query_slot, field)
132 
133 #define query_available_iova(pool, query)                            \
134    query_iova(struct query_slot, pool, query, available)
135 
136 #define query_result_iova(pool, query, type, i)                            \
137    pool->bo.iova + pool->stride * (query) +                          \
138    sizeof(struct query_slot) + sizeof(type) * (i)
139 
140 #define query_result_addr(pool, query, type, i)                            \
141    pool->bo.map + pool->stride * (query) +                             \
142    sizeof(struct query_slot) + sizeof(type) * (i)
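/* Worked example (illustrative, not used by the code): for an occlusion query
 * pool, the GPU address of query 2's "begin" sample counter expands to
 *
 *    pool->bo.iova + pool->stride * 2 +
 *    offsetof(struct occlusion_query_slot, begin)
 *
 * which is exactly what occlusion_query_iova(pool, 2, begin) produces, while
 * query_result_iova(pool, 2, uint64_t, 0) points at the "result" field that
 * immediately follows the common availability slot.
 */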
143 
144 #define query_is_available(slot) slot->available
145 
146 static const VkPerformanceCounterUnitKHR
147 fd_perfcntr_type_to_vk_unit[] = {
148    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
149    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
150    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
151    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
152    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
153    /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert the value */
154    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
155    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
156    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
157    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
158    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
159    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
160    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
161 };
162 
163 /* TODO: This mapping comes from the freedreno implementation, where only
164  * UINT64 is used. It should be confirmed against the blob Vulkan driver
165  * once it supports performance queries.
166  */
167 static const VkPerformanceCounterStorageKHR
168 fd_perfcntr_type_to_vk_storage[] = {
169    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
170    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
171    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
172    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
173    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
174    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
175    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
176    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
177    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
178    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
179    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
180    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
181 };
182 
183 /*
184  * Returns a pointer to a given slot in a query pool.
185  */
186 static void* slot_address(struct tu_query_pool *pool, uint32_t query)
187 {
188    return (char*)pool->bo.map + query * pool->stride;
189 }
190 
191 static void
192 perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
193                uint32_t index, uint32_t *gid, uint32_t *cid)
194 
195 {
196    uint32_t i;
197 
198    for (i = 0; i < group_count; i++) {
199       if (group[i].num_countables > index) {
200          *gid = i;
201          *cid = index;
202          break;
203       }
204       index -= group[i].num_countables;
205    }
206 
207    assert(i < group_count);
208 }
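/* Worked example (hypothetical group sizes): if group 0 exposes 3 countables
 * and group 1 exposes 5, a flat index of 4 yields *gid = 1 and *cid = 1,
 * since the first 3 indices fall into group 0 and the remainder indexes into
 * group 1.
 */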
209 
210 static int
211 compare_perfcntr_pass(const void *a, const void *b)
212 {
213    return ((struct tu_perf_query_data *)a)->pass -
214           ((struct tu_perf_query_data *)b)->pass;
215 }
216 
217 VKAPI_ATTR VkResult VKAPI_CALL
218 tu_CreateQueryPool(VkDevice _device,
219                    const VkQueryPoolCreateInfo *pCreateInfo,
220                    const VkAllocationCallbacks *pAllocator,
221                    VkQueryPool *pQueryPool)
222 {
223    TU_FROM_HANDLE(tu_device, device, _device);
224    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
225    assert(pCreateInfo->queryCount > 0);
226 
227    uint32_t pool_size, slot_size;
228    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
229 
230    pool_size = sizeof(struct tu_query_pool);
231 
232    switch (pCreateInfo->queryType) {
233    case VK_QUERY_TYPE_OCCLUSION:
234       slot_size = sizeof(struct occlusion_query_slot);
235       break;
236    case VK_QUERY_TYPE_TIMESTAMP:
237       slot_size = sizeof(struct timestamp_query_slot);
238       break;
239    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
240       slot_size = sizeof(struct primitive_query_slot);
241       break;
242    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
243       perf_query_info =
244             vk_find_struct_const(pCreateInfo->pNext,
245                                  QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
246       assert(perf_query_info);
247 
248       slot_size = sizeof(struct perf_query_slot) +
249                   sizeof(struct perfcntr_query_slot) *
250                   (perf_query_info->counterIndexCount - 1);
251 
252       /* Size of the pool->perf_query_data array */
253       pool_size += sizeof(struct tu_perf_query_data) *
254                    perf_query_info->counterIndexCount;
255       break;
256    }
257    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
258       slot_size = sizeof(struct pipeline_stat_query_slot);
259       break;
260    default:
261       unreachable("Invalid query type");
262    }
263 
264    struct tu_query_pool *pool =
265          vk_object_alloc(&device->vk, pAllocator, pool_size,
266                          VK_OBJECT_TYPE_QUERY_POOL);
267    if (!pool)
268       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
269 
270    if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
271       pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
272                                       &pool->perf_group_count);
273 
274       pool->counter_index_count = perf_query_info->counterIndexCount;
275 
276       /* Build the data for all requested perf counters, so that the correct
277        * group id, countable id, counter register and pass index can be derived
278        * from just the counter index the application provides at submit time.
279        *
280        * Also, since this data will later be sorted by pass index, we keep the
281        * original indices and store perfcntr results according to them so apps
282        * get correct results with their own indices.
283        */
284       uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
285       memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
286       memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
287 
288       for (uint32_t i = 0; i < pool->counter_index_count; i++) {
289          uint32_t gid = 0, cid = 0;
290 
291          perfcntr_index(pool->perf_group, pool->perf_group_count,
292                         perf_query_info->pCounterIndices[i], &gid, &cid);
293 
294          pool->perf_query_data[i].gid = gid;
295          pool->perf_query_data[i].cid = cid;
296          pool->perf_query_data[i].app_idx = i;
297 
298          /* When the counter registers of a group are exhausted
299           * (num_counters), start a new pass and reset the register index.
300           */
301          if (regs[gid] < pool->perf_group[gid].num_counters) {
302             pool->perf_query_data[i].cntr_reg = regs[gid]++;
303             pool->perf_query_data[i].pass = pass[gid];
304          } else {
305             pool->perf_query_data[i].pass = ++pass[gid];
306             pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
307             regs[gid]++;
308          }
309       }
310 
311       /* Sort by pass index so we can easily prepare the command stream
312        * in ascending pass order.
313        */
314       qsort(pool->perf_query_data, pool->counter_index_count,
315             sizeof(pool->perf_query_data[0]),
316             compare_perfcntr_pass);
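      /* Worked example (hypothetical sizes): if a group has num_counters == 2
       * and the app requests three countables from it, the first two get
       * cntr_reg 0 and 1 in pass 0 and the third wraps to cntr_reg 0 in pass 1.
       * After the qsort above, perf_query_data is ordered with all pass 0
       * entries first, while app_idx still records where each result must be
       * stored for the application.
       */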
317    }
318 
319    VkResult result = tu_bo_init_new(device, &pool->bo,
320          pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
321    if (result != VK_SUCCESS) {
322       vk_object_free(&device->vk, pAllocator, pool);
323       return result;
324    }
325 
326    result = tu_bo_map(device, &pool->bo);
327    if (result != VK_SUCCESS) {
328       tu_bo_finish(device, &pool->bo);
329       vk_object_free(&device->vk, pAllocator, pool);
330       return result;
331    }
332 
333    /* Initialize all query statuses to unavailable */
334    memset(pool->bo.map, 0, pool->bo.size);
335 
336    pool->type = pCreateInfo->queryType;
337    pool->stride = slot_size;
338    pool->size = pCreateInfo->queryCount;
339    pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
340    *pQueryPool = tu_query_pool_to_handle(pool);
341 
342    return VK_SUCCESS;
343 }
344 
345 VKAPI_ATTR void VKAPI_CALL
346 tu_DestroyQueryPool(VkDevice _device,
347                     VkQueryPool _pool,
348                     const VkAllocationCallbacks *pAllocator)
349 {
350    TU_FROM_HANDLE(tu_device, device, _device);
351    TU_FROM_HANDLE(tu_query_pool, pool, _pool);
352 
353    if (!pool)
354       return;
355 
356    tu_bo_finish(device, &pool->bo);
357    vk_object_free(&device->vk, pAllocator, pool);
358 }
359 
360 static uint32_t
361 get_result_count(struct tu_query_pool *pool)
362 {
363    switch (pool->type) {
364    /* Occlusion and timestamp queries write one integer value */
365    case VK_QUERY_TYPE_OCCLUSION:
366    case VK_QUERY_TYPE_TIMESTAMP:
367       return 1;
368    /* Transform feedback queries write two integer values */
369    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
370       return 2;
371    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
372       return util_bitcount(pool->pipeline_statistics);
373    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
374       return pool->counter_index_count;
375    default:
376       assert(!"Invalid query type");
377       return 0;
378    }
379 }
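/* For example, a pipeline statistics pool created with pipelineStatistics =
 * INPUT_ASSEMBLY_VERTICES_BIT | CLIPPING_INVOCATIONS_BIT reports two values
 * per query, i.e. util_bitcount() of the two set bits.
 */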
380 
381 static uint32_t
382 statistics_index(uint32_t *statistics)
383 {
384    uint32_t stat;
385    stat = u_bit_scan(statistics);
386 
387    switch (1 << stat) {
388    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
389    case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
390       return 0;
391    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
392       return 1;
393    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
394       return 2;
395    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
396       return 4;
397    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
398       return 5;
399    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
400       return 6;
401    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
402       return 7;
403    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
404       return 8;
405    case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
406       return 9;
407    case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
408       return 10;
409    default:
410       return 0;
411    }
412 }
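/* Usage sketch: callers copy pool->pipeline_statistics into a local mask and
 * call statistics_index() once per enabled statistic; u_bit_scan() consumes
 * the lowest set bit each time. E.g. with INPUT_ASSEMBLY_VERTICES and
 * CLIPPING_INVOCATIONS enabled, successive calls return 0 and then 7, the
 * slots in results[] where those counters are accumulated.
 */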
413 
414 /* Wait on the availability status of a query, up to a timeout. */
415 static VkResult
416 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
417                    uint32_t query)
418 {
419    /* TODO: Once the MSM_IOVA_WAIT ioctl lands upstream, use it to wait on
420     * the available bit in a scheduler-friendly way instead of busy
421     * polling. */
422    struct query_slot *slot = slot_address(pool, query);
423    uint64_t abs_timeout = os_time_get_absolute_timeout(
424          WAIT_TIMEOUT * NSEC_PER_SEC);
425    while(os_time_get_nano() < abs_timeout) {
426       if (query_is_available(slot))
427          return VK_SUCCESS;
428    }
429    return vk_error(device, VK_TIMEOUT);
430 }
431 
432 /* Writes a query value to a buffer from the CPU. */
433 static void
434 write_query_value_cpu(char* base,
435                       uint32_t offset,
436                       uint64_t value,
437                       VkQueryResultFlags flags)
438 {
439    if (flags & VK_QUERY_RESULT_64_BIT) {
440       *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
441    } else {
442       *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
443    }
444 }
445 
446 static VkResult
447 get_query_pool_results(struct tu_device *device,
448                        struct tu_query_pool *pool,
449                        uint32_t firstQuery,
450                        uint32_t queryCount,
451                        size_t dataSize,
452                        void *pData,
453                        VkDeviceSize stride,
454                        VkQueryResultFlags flags)
455 {
456    assert(dataSize >= stride * queryCount);
457 
458    char *result_base = pData;
459    VkResult result = VK_SUCCESS;
460    for (uint32_t i = 0; i < queryCount; i++) {
461       uint32_t query = firstQuery + i;
462       struct query_slot *slot = slot_address(pool, query);
463       bool available = query_is_available(slot);
464       uint32_t result_count = get_result_count(pool);
465       uint32_t statistics = pool->pipeline_statistics;
466 
467       if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
468          VkResult wait_result = wait_for_available(device, pool, query);
469          if (wait_result != VK_SUCCESS)
470             return wait_result;
471          available = true;
472       } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
473          /* From the Vulkan 1.1.130 spec:
474           *
475           *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
476           *    both not set then no result values are written to pData for
477           *    queries that are in the unavailable state at the time of the
478           *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
479           *    availability state is still written to pData for those queries
480           *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
481           */
482          result = VK_NOT_READY;
483          if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
484             result_base += stride;
485             continue;
486          }
487       }
488 
489       for (uint32_t k = 0; k < result_count; k++) {
490          if (available) {
491             uint64_t *result;
492 
493             if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
494                uint32_t stat_idx = statistics_index(&statistics);
495                result = query_result_addr(pool, query, uint64_t, stat_idx);
496             } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
497                result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
498             } else {
499                result = query_result_addr(pool, query, uint64_t, k);
500             }
501 
502             write_query_value_cpu(result_base, k, *result, flags);
503          } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
504              /* From the Vulkan 1.1.130 spec:
505               *
506               *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
507               *   is not set, and the query’s status is unavailable, an
508               *   intermediate result value between zero and the final result
509               *   value is written to pData for that query.
510               *
511               * Just return 0 here for simplicity since it's a valid result.
512               */
513             write_query_value_cpu(result_base, k, 0, flags);
514       }
515 
516       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
517          /* From the Vulkan 1.1.130 spec:
518           *
519           *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
520           *    integer value written for each query is non-zero if the query’s
521           *    status was available or zero if the status was unavailable.
522           */
523          write_query_value_cpu(result_base, result_count, available, flags);
524 
525       result_base += stride;
526    }
527    return result;
528 }
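/* Layout sketch (illustrative): with VK_QUERY_RESULT_64_BIT |
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT on a transform feedback pool
 * (result_count == 2), each query consumes three uint64_t values in pData:
 *
 *    data[0] = primitives written
 *    data[1] = primitives generated
 *    data[2] = availability (non-zero if available)
 *
 * and the next query's values start `stride` bytes later.
 */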
529 
530 VKAPI_ATTR VkResult VKAPI_CALL
531 tu_GetQueryPoolResults(VkDevice _device,
532                        VkQueryPool queryPool,
533                        uint32_t firstQuery,
534                        uint32_t queryCount,
535                        size_t dataSize,
536                        void *pData,
537                        VkDeviceSize stride,
538                        VkQueryResultFlags flags)
539 {
540    TU_FROM_HANDLE(tu_device, device, _device);
541    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
542    assert(firstQuery + queryCount <= pool->size);
543 
544    if (tu_device_is_lost(device))
545       return VK_ERROR_DEVICE_LOST;
546 
547    switch (pool->type) {
548    case VK_QUERY_TYPE_OCCLUSION:
549    case VK_QUERY_TYPE_TIMESTAMP:
550    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
551    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
552    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
553       return get_query_pool_results(device, pool, firstQuery, queryCount,
554                                     dataSize, pData, stride, flags);
555    default:
556       assert(!"Invalid query type");
557    }
558    return VK_SUCCESS;
559 }
560 
561 /* Copies a query value from one buffer to another from the GPU. */
562 static void
563 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
564                      struct tu_cs *cs,
565                      uint64_t src_iova,
566                      uint64_t base_write_iova,
567                      uint32_t offset,
568                      VkQueryResultFlags flags) {
569    uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
570          sizeof(uint64_t) : sizeof(uint32_t);
571    uint64_t write_iova = base_write_iova + (offset * element_size);
572 
573    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
574    uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
575          CP_MEM_TO_MEM_0_DOUBLE : 0;
576    tu_cs_emit(cs, mem_to_mem_flags);
577    tu_cs_emit_qw(cs, write_iova);
578    tu_cs_emit_qw(cs, src_iova);
579 }
580 
581 static void
582 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
583                              struct tu_cs *cs,
584                              struct tu_query_pool *pool,
585                              uint32_t firstQuery,
586                              uint32_t queryCount,
587                              struct tu_buffer *buffer,
588                              VkDeviceSize dstOffset,
589                              VkDeviceSize stride,
590                              VkQueryResultFlags flags)
591 {
592    /* From the Vulkan 1.1.130 spec:
593     *
594     *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
595     *    uses of vkCmdResetQueryPool in the same queue, without any additional
596     *    synchronization.
597     *
598     * To ensure that previous writes to the available bit are coherent, first
599     * wait for all writes to complete.
600     */
601    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
602 
603    for (uint32_t i = 0; i < queryCount; i++) {
604       uint32_t query = firstQuery + i;
605       uint64_t available_iova = query_available_iova(pool, query);
606       uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
607       uint32_t result_count = get_result_count(pool);
608       uint32_t statistics = pool->pipeline_statistics;
609 
610       /* Wait for the available bit to be set if executed with the
611        * VK_QUERY_RESULT_WAIT_BIT flag. */
612       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
613          tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
614          tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
615                         CP_WAIT_REG_MEM_0_POLL_MEMORY);
616          tu_cs_emit_qw(cs, available_iova);
617          tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
618          tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
619          tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
620       }
621 
622       for (uint32_t k = 0; k < result_count; k++) {
623          uint64_t result_iova;
624 
625          if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
626             uint32_t stat_idx = statistics_index(&statistics);
627             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
628          } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
629             result_iova = query_result_iova(pool, query,
630                                             struct perfcntr_query_slot, k);
631          } else {
632             result_iova = query_result_iova(pool, query, uint64_t, k);
633          }
634 
635          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
636             /* Unconditionally copying the bo->result into the buffer here is
637              * valid because we only set bo->result on vkCmdEndQuery. Thus, even
638              * if the query is unavailable, this will copy the correct partial
639              * value of 0.
640              */
641             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
642                                  k /* offset */, flags);
643          } else {
644             /* Conditionally copy bo->result into the buffer based on whether the
645              * query is available.
646              *
647              * NOTE: For the conditional packets to be executed, CP_COND_EXEC
648              * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
649              * that 0 < available < 2, aka available == 1.
650              */
651             tu_cs_reserve(cs, 7 + 6);
652             tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
653             tu_cs_emit_qw(cs, available_iova);
654             tu_cs_emit_qw(cs, available_iova);
655             tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
656             tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
657 
658             /* Start of conditional execution */
659             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
660                               k /* offset */, flags);
661             /* End of conditional execution */
662          }
663       }
664 
665       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
666          copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
667                               result_count /* offset */, flags);
668       }
669    }
670 }
671 
672 VKAPI_ATTR void VKAPI_CALL
673 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
674                            VkQueryPool queryPool,
675                            uint32_t firstQuery,
676                            uint32_t queryCount,
677                            VkBuffer dstBuffer,
678                            VkDeviceSize dstOffset,
679                            VkDeviceSize stride,
680                            VkQueryResultFlags flags)
681 {
682    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
683    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
684    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
685    struct tu_cs *cs = &cmdbuf->cs;
686    assert(firstQuery + queryCount <= pool->size);
687 
688    switch (pool->type) {
689    case VK_QUERY_TYPE_OCCLUSION:
690    case VK_QUERY_TYPE_TIMESTAMP:
691    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
692    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
693       return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
694                queryCount, buffer, dstOffset, stride, flags);
695    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
696       unreachable("allowCommandBufferQueryCopies is false");
697    default:
698       assert(!"Invalid query type");
699    }
700 }
701 
702 static void
703 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
704                       struct tu_query_pool *pool,
705                       uint32_t firstQuery,
706                       uint32_t queryCount)
707 {
708    struct tu_cs *cs = &cmdbuf->cs;
709 
710    for (uint32_t i = 0; i < queryCount; i++) {
711       uint32_t query = firstQuery + i;
712       uint32_t statistics = pool->pipeline_statistics;
713 
714       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
715       tu_cs_emit_qw(cs, query_available_iova(pool, query));
716       tu_cs_emit_qw(cs, 0x0);
717 
718       for (uint32_t k = 0; k < get_result_count(pool); k++) {
719          uint64_t result_iova;
720 
721          if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
722             uint32_t stat_idx = statistics_index(&statistics);
723             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
724          } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
725             result_iova = query_result_iova(pool, query,
726                                             struct perfcntr_query_slot, k);
727          } else {
728             result_iova = query_result_iova(pool, query, uint64_t, k);
729          }
730 
731          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
732          tu_cs_emit_qw(cs, result_iova);
733          tu_cs_emit_qw(cs, 0x0);
734       }
735    }
736 
737 }
738 
739 VKAPI_ATTR void VKAPI_CALL
740 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
741                      VkQueryPool queryPool,
742                      uint32_t firstQuery,
743                      uint32_t queryCount)
744 {
745    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
746    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
747 
748    switch (pool->type) {
749    case VK_QUERY_TYPE_TIMESTAMP:
750    case VK_QUERY_TYPE_OCCLUSION:
751    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
752    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
753    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
754       emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
755       break;
756    default:
757       assert(!"Invalid query type");
758    }
759 }
760 
761 VKAPI_ATTR void VKAPI_CALL
762 tu_ResetQueryPool(VkDevice device,
763                   VkQueryPool queryPool,
764                   uint32_t firstQuery,
765                   uint32_t queryCount)
766 {
767    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
768 
769    for (uint32_t i = 0; i < queryCount; i++) {
770       struct query_slot *slot = slot_address(pool, i + firstQuery);
771       slot->available = 0;
772 
773       for (uint32_t k = 0; k < get_result_count(pool); k++) {
774          uint64_t *res;
775 
776          if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
777             res = query_result_addr(pool, i + firstQuery,
778                                     struct perfcntr_query_slot, k);
779          } else {
780             res = query_result_addr(pool, i + firstQuery, uint64_t, k);
781          }
782 
783          *res = 0;
784       }
785    }
786 }
787 
788 static void
789 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
790                            struct tu_query_pool *pool,
791                            uint32_t query)
792 {
793    /* From the Vulkan 1.1.130 spec:
794     *
795     *    A query must begin and end inside the same subpass of a render pass
796     *    instance, or must both begin and end outside of a render pass
797     *    instance.
798     *
799     * Unlike on an immediate-mode renderer, Turnip renders all tiles on
800     * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
801     * query begins/ends inside the same subpass of a render pass, we need to
802     * record the packets on the secondary draw command stream. cmdbuf->draw_cs
803     * is then run on every tile during render, so we just need to accumulate
804     * sample counts in slot->result to compute the query result.
805     */
806    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
807 
808    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
809 
810    tu_cs_emit_regs(cs,
811                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
812 
813    tu_cs_emit_regs(cs,
814                    A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
815 
816    tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
817    tu_cs_emit(cs, ZPASS_DONE);
818 }
819 
820 static void
821 emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
822                       struct tu_query_pool *pool,
823                       uint32_t query)
824 {
825    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
826    uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);
827 
828    tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
829    tu6_emit_event_write(cmdbuf, cs, RST_PIX_CNT);
830    tu6_emit_event_write(cmdbuf, cs, TILE_FLUSH);
831 
832    tu_cs_emit_wfi(cs);
833 
834    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
835    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
836                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
837                   CP_REG_TO_MEM_0_64B);
838    tu_cs_emit_qw(cs, begin_iova);
839 }
840 
841 static void
842 emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
843 {
844    tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
845    tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
846                         REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
847                   A6XX_CP_REG_TEST_0_BIT(pass) |
848                   A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
849    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
850 }
851 
852 static void
853 emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
854                            struct tu_query_pool *pool,
855                            uint32_t query)
856 {
857    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
858    uint32_t last_pass = ~0;
859 
860    /* Querying perf counters happens in these steps:
861     *
862     *  0) There's a scratch reg to set a pass index for perf counters query.
863     *     Prepare cmd streams to set each pass index to the reg at device
864     *     creation time. See tu_CreateDevice in tu_device.c
865     *  1) Emit command streams to read all requested perf counters at all
866     *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
867     *     reads the scratch reg where pass index is set.
868     *     See emit_perfcntrs_pass_start.
869     *  2) Pick the cs that sets the proper pass index in the reg and
870     *     prepend it to the command buffer at each submit time.
871     *     See tu_QueueSubmit in tu_drm.c
872     *  3) If the pass index bit in the reg is set, the command stream under
873     *     CP_COND_REG_EXEC is executed.
874     */
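   /* Minimal sketch of steps 0/2 (illustrative names; the real streams are
    * built in tu_CreateDevice): each per-pass cs just sets one bit in the
    * scratch reg, e.g.
    *
    *    tu_cs_emit_pkt4(&pass_cs[i], REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG), 1);
    *    tu_cs_emit(&pass_cs[i], 1u << i);
    *
    * so the CP_REG_TEST emitted below only passes for the counters recorded
    * in the pass selected at submit time.
    */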
875 
876    tu_cs_emit_wfi(cs);
877 
878    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
879       struct tu_perf_query_data *data = &pool->perf_query_data[i];
880 
881       if (last_pass != data->pass) {
882          last_pass = data->pass;
883 
884          if (data->pass != 0)
885             tu_cond_exec_end(cs);
886          emit_perfcntrs_pass_start(cs, data->pass);
887       }
888 
889       const struct fd_perfcntr_counter *counter =
890             &pool->perf_group[data->gid].counters[data->cntr_reg];
891       const struct fd_perfcntr_countable *countable =
892             &pool->perf_group[data->gid].countables[data->cid];
893 
894       tu_cs_emit_pkt4(cs, counter->select_reg, 1);
895       tu_cs_emit(cs, countable->selector);
896    }
897    tu_cond_exec_end(cs);
898 
899    last_pass = ~0;
900    tu_cs_emit_wfi(cs);
901 
902    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
903       struct tu_perf_query_data *data = &pool->perf_query_data[i];
904 
905       if (last_pass != data->pass) {
906          last_pass = data->pass;
907 
908          if (data->pass != 0)
909             tu_cond_exec_end(cs);
910          emit_perfcntrs_pass_start(cs, data->pass);
911       }
912 
913       const struct fd_perfcntr_counter *counter =
914             &pool->perf_group[data->gid].counters[data->cntr_reg];
915 
916       uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
917 
918       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
919       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
920                      CP_REG_TO_MEM_0_64B);
921       tu_cs_emit_qw(cs, begin_iova);
922    }
923    tu_cond_exec_end(cs);
924 }
925 
926 static void
927 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
928                      struct tu_query_pool *pool,
929                      uint32_t query,
930                      uint32_t stream_id)
931 {
932    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
933    uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);
934 
935    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
936    tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
937 }
938 
939 VKAPI_ATTR void VKAPI_CALL
940 tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
941                  VkQueryPool queryPool,
942                  uint32_t query,
943                  VkQueryControlFlags flags)
944 {
945    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
946    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
947    assert(query < pool->size);
948 
949    switch (pool->type) {
950    case VK_QUERY_TYPE_OCCLUSION:
951       /* In freedreno, there is no implementation difference between
952        * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
953        * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
954        */
955       emit_begin_occlusion_query(cmdbuf, pool, query);
956       break;
957    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
958       emit_begin_xfb_query(cmdbuf, pool, query, 0);
959       break;
960    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
961       emit_begin_perf_query(cmdbuf, pool, query);
962       break;
963    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
964       emit_begin_stat_query(cmdbuf, pool, query);
965       break;
966    case VK_QUERY_TYPE_TIMESTAMP:
967       unreachable("Unimplemented query type");
968    default:
969       assert(!"Invalid query type");
970    }
971 }
972 
973 VKAPI_ATTR void VKAPI_CALL
974 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
975                            VkQueryPool queryPool,
976                            uint32_t query,
977                            VkQueryControlFlags flags,
978                            uint32_t index)
979 {
980    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
981    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
982    assert(query < pool->size);
983 
984    switch (pool->type) {
985    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
986       emit_begin_xfb_query(cmdbuf, pool, query, index);
987       break;
988    default:
989       assert(!"Invalid query type");
990    }
991 }
992 
993 static void
994 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
995                          struct tu_query_pool *pool,
996                          uint32_t query)
997 {
998    /* Ending an occlusion query happens in a few steps:
999     *    1) Set the slot->end to UINT64_MAX.
1000     *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1001     *       write the current sample count value into slot->end.
1002     *    3) Since (2) is asynchronous, wait until slot->end is not equal to
1003     *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1004     *    4) Accumulate the results of the query (slot->end - slot->begin) into
1005     *       slot->result.
1006     *    5) If vkCmdEndQuery is *not* called from within the scope of a render
1007     *       pass, set the slot's available bit since the query is now done.
1008     *    6) If vkCmdEndQuery *is* called from within the scope of a render
1009     *       pass, we cannot mark as available yet since the commands in
1010     *       draw_cs are not run until vkCmdEndRenderPass.
1011     */
1012    const struct tu_render_pass *pass = cmdbuf->state.pass;
1013    struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1014 
1015    uint64_t available_iova = query_available_iova(pool, query);
1016    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1017    uint64_t end_iova = occlusion_query_iova(pool, query, end);
1018    uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
1019    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1020    tu_cs_emit_qw(cs, end_iova);
1021    tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1022 
1023    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1024 
1025    tu_cs_emit_regs(cs,
1026                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1027 
1028    tu_cs_emit_regs(cs,
1029                    A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1030 
1031    tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1032    tu_cs_emit(cs, ZPASS_DONE);
1033 
1034    tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1035    tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1036                   CP_WAIT_REG_MEM_0_POLL_MEMORY);
1037    tu_cs_emit_qw(cs, end_iova);
1038    tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1039    tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1040    tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1041 
1042    /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1043    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1044    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1045    tu_cs_emit_qw(cs, result_iova);
1046    tu_cs_emit_qw(cs, result_iova);
1047    tu_cs_emit_qw(cs, end_iova);
1048    tu_cs_emit_qw(cs, begin_iova);
1049 
1050    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1051 
1052    if (pass)
1053       /* Technically, queries should be tracked per-subpass, but here we track
1054        * at the render pass level to simplify the code a bit. This is safe
1055        * because the only commands that use the available bit are
1056        * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1057        * cannot be invoked from inside a render pass scope.
1058        */
1059       cs = &cmdbuf->draw_epilogue_cs;
1060 
1061    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1062    tu_cs_emit_qw(cs, available_iova);
1063    tu_cs_emit_qw(cs, 0x1);
1064 }
1065 
1066 static void
1067 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1068                     struct tu_query_pool *pool,
1069                     uint32_t query)
1070 {
1071    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1072    uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
1073    uint64_t available_iova = query_available_iova(pool, query);
1074    uint64_t result_iova;
1075    uint64_t stat_start_iova;
1076    uint64_t stat_stop_iova;
1077 
1078    tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
1079    tu6_emit_event_write(cmdbuf, cs, RST_VTX_CNT);
1080    tu6_emit_event_write(cmdbuf, cs, STAT_EVENT);
1081 
1082    tu_cs_emit_wfi(cs);
1083 
1084    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1085    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1086                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1087                   CP_REG_TO_MEM_0_64B);
1088    tu_cs_emit_qw(cs, end_iova);
1089 
1090    for (int i = 0; i < STAT_COUNT; i++) {
1091       result_iova = query_result_iova(pool, query, uint64_t, i);
1092       stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
1093       stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);
1094 
1095       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1096       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1097                      CP_MEM_TO_MEM_0_DOUBLE |
1098                      CP_MEM_TO_MEM_0_NEG_C);
1099 
1100       tu_cs_emit_qw(cs, result_iova);
1101       tu_cs_emit_qw(cs, result_iova);
1102       tu_cs_emit_qw(cs, stat_stop_iova);
1103       tu_cs_emit_qw(cs, stat_start_iova);
1104    }
1105 
1106    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1107 
1108    if (cmdbuf->state.pass)
1109       cs = &cmdbuf->draw_epilogue_cs;
1110 
1111    /* Set the availability to 1 */
1112    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1113    tu_cs_emit_qw(cs, available_iova);
1114    tu_cs_emit_qw(cs, 0x1);
1115 }
1116 
1117 static void
1118 emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1119                          struct tu_query_pool *pool,
1120                          uint32_t query)
1121 {
1122    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1123    uint64_t available_iova = query_available_iova(pool, query);
1124    uint64_t end_iova;
1125    uint64_t begin_iova;
1126    uint64_t result_iova;
1127    uint32_t last_pass = ~0;
1128 
1129    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1130       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1131 
1132       if (last_pass != data->pass) {
1133          last_pass = data->pass;
1134 
1135          if (data->pass != 0)
1136             tu_cond_exec_end(cs);
1137          emit_perfcntrs_pass_start(cs, data->pass);
1138       }
1139 
1140       const struct fd_perfcntr_counter *counter =
1141             &pool->perf_group[data->gid].counters[data->cntr_reg];
1142 
1143       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1144 
1145       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1146       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1147                      CP_REG_TO_MEM_0_64B);
1148       tu_cs_emit_qw(cs, end_iova);
1149    }
1150    tu_cond_exec_end(cs);
1151 
1152    last_pass = ~0;
1153    tu_cs_emit_wfi(cs);
1154 
1155    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1156       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1157 
1158       if (last_pass != data->pass) {
1159          last_pass = data->pass;
1160 
1161 
1162          if (data->pass != 0)
1163             tu_cond_exec_end(cs);
1164          emit_perfcntrs_pass_start(cs, data->pass);
1165       }
1166 
1167       result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1168              data->app_idx);
1169       begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1170       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1171 
1172       /* result += end - begin */
1173       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1174       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1175                      CP_MEM_TO_MEM_0_DOUBLE |
1176                      CP_MEM_TO_MEM_0_NEG_C);
1177 
1178       tu_cs_emit_qw(cs, result_iova);
1179       tu_cs_emit_qw(cs, result_iova);
1180       tu_cs_emit_qw(cs, end_iova);
1181       tu_cs_emit_qw(cs, begin_iova);
1182    }
1183    tu_cond_exec_end(cs);
1184 
1185    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1186 
1187    if (cmdbuf->state.pass)
1188       cs = &cmdbuf->draw_epilogue_cs;
1189 
1190    /* Set the availability to 1 */
1191    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1192    tu_cs_emit_qw(cs, available_iova);
1193    tu_cs_emit_qw(cs, 0x1);
1194 }
1195 
1196 static void
1197 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1198                    struct tu_query_pool *pool,
1199                    uint32_t query,
1200                    uint32_t stream_id)
1201 {
1202    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1203 
1204    uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
1205    uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1206    uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1207    uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
1208    uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
1209    uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
1210    uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
1211    uint64_t available_iova = query_available_iova(pool, query);
1212 
1213    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1214    tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
1215 
1216    tu_cs_emit_wfi(cs);
1217    tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1218 
1219    /* Set the count of written primitives */
1220    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1221    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1222                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1223    tu_cs_emit_qw(cs, result_written_iova);
1224    tu_cs_emit_qw(cs, result_written_iova);
1225    tu_cs_emit_qw(cs, end_written_iova);
1226    tu_cs_emit_qw(cs, begin_written_iova);
1227 
1228    tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1229 
1230    /* Set the count of generated primitives */
1231    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1232    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1233                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1234    tu_cs_emit_qw(cs, result_generated_iova);
1235    tu_cs_emit_qw(cs, result_generated_iova);
1236    tu_cs_emit_qw(cs, end_generated_iova);
1237    tu_cs_emit_qw(cs, begin_generated_iova);
1238 
1239    /* Set the availability to 1 */
1240    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1241    tu_cs_emit_qw(cs, available_iova);
1242    tu_cs_emit_qw(cs, 0x1);
1243 }
1244 
1245 /* Implement this bit of spec text from section 17.2 "Query Operation":
1246  *
1247  *     If queries are used while executing a render pass instance that has
1248  *     multiview enabled, the query uses N consecutive query indices in the
1249  *     query pool (starting at query) where N is the number of bits set in the
1250  *     view mask in the subpass the query is used in. How the numerical
1251  *     results of the query are distributed among the queries is
1252  *     implementation-dependent. For example, some implementations may write
1253  *     each view’s results to a distinct query, while other implementations
1254  *     may write the total result to the first query and write zero to the
1255  *     other queries. However, the sum of the results in all the queries must
1256  *     accurately reflect the total result of the query summed over all views.
1257  *     Applications can sum the results from all the queries to compute the
1258  *     total result.
1259  *
1260  * Since we execute all views at once, we write zero to the other queries.
1261  * Furthermore, because queries must be reset before use, and we set the
1262  * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
1263  */
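/* For example, with a subpass view mask of 0b101 (two views) and query index
 * Q, entries Q and Q+1 of the pool are consumed: the full result is
 * accumulated into Q, while handle_multiview_queries() below only marks Q+1
 * available, leaving its result at the 0 written by the reset.
 */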
1264 
1265 static void
1266 handle_multiview_queries(struct tu_cmd_buffer *cmd,
1267                          struct tu_query_pool *pool,
1268                          uint32_t query)
1269 {
1270    if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
1271       return;
1272 
1273    unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
1274    struct tu_cs *cs = &cmd->draw_epilogue_cs;
1275 
1276    for (uint32_t i = 1; i < views; i++) {
1277       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1278       tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
1279       tu_cs_emit_qw(cs, 0x1);
1280    }
1281 }
1282 
1283 VKAPI_ATTR void VKAPI_CALL
1284 tu_CmdEndQuery(VkCommandBuffer commandBuffer,
1285                VkQueryPool queryPool,
1286                uint32_t query)
1287 {
1288    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1289    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1290    assert(query < pool->size);
1291 
1292    switch (pool->type) {
1293    case VK_QUERY_TYPE_OCCLUSION:
1294       emit_end_occlusion_query(cmdbuf, pool, query);
1295       break;
1296    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1297       emit_end_xfb_query(cmdbuf, pool, query, 0);
1298       break;
1299    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1300       emit_end_perf_query(cmdbuf, pool, query);
1301       break;
1302    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1303       emit_end_stat_query(cmdbuf, pool, query);
1304       break;
1305    case VK_QUERY_TYPE_TIMESTAMP:
1306       unreachable("Unimplemented query type");
1307    default:
1308       assert(!"Invalid query type");
1309    }
1310 
1311    handle_multiview_queries(cmdbuf, pool, query);
1312 }

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Only four transform feedback streams exist (slots begin[4]/end[4]),
       * so valid indices are 0..3.
       */
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

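   /* Snapshot the GPU's always-on counter into the query's result slot:
    * CNT(2) copies the two 32-bit halves starting at
    * REG_A6XX_CP_ALWAYS_ON_COUNTER_LO, and 64B requests a 64-bit write to
    * the destination IOVA.
    */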
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
    VkPhysicalDevice                            physicalDevice,
    uint32_t                                    queueFamilyIndex,
    uint32_t*                                   pCounterCount,
    VkPerformanceCounterKHR*                    pCounters,
    VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);

   uint32_t desc_count = *pCounterCount;
   uint32_t group_count;
   const struct fd_perfcntr_group *group =
         fd_perfcntrs(&phydev->dev_id, &group_count);

   VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
   VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);

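   /* Expose every countable of every perf counter group as one
    * VkPerformanceCounterKHR, filling the counter and description arrays in
    * lockstep so matching indices describe the same counter.
    */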
   for (int i = 0; i < group_count; i++) {
      for (int j = 0; j < group[i].num_countables; j++) {

         vk_outarray_append(&out, counter) {
            counter->scope = VK_QUERY_SCOPE_COMMAND_BUFFER_KHR;
            counter->unit =
                  fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
            counter->storage =
                  fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];

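            /* Derive the counter UUID from a SHA-1 of the countable name,
             * truncated to sizeof(counter->uuid) bytes, so it stays stable
             * across runs.
             */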
            unsigned char sha1_result[20];
            _mesa_sha1_compute(group[i].countables[j].name,
                               strlen(group[i].countables[j].name),
                               sha1_result);
            memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
         }

         vk_outarray_append(&out_desc, desc) {
            desc->flags = 0;

            snprintf(desc->name, sizeof(desc->name),
                     "%s", group[i].countables[j].name);
            snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
            snprintf(desc->description, sizeof(desc->description),
                     "%s: %s performance counter",
                     group[i].name, group[i].countables[j].name);
         }
      }
   }

   return vk_outarray_status(&out);
}

VKAPI_ATTR void VKAPI_CALL
tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
      VkPhysicalDevice                            physicalDevice,
      const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
      uint32_t*                                   pNumPasses)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
   uint32_t group_count = 0;
   uint32_t gid = 0, cid = 0, n_passes;
   const struct fd_perfcntr_group *group =
         fd_perfcntrs(&phydev->dev_id, &group_count);

   uint32_t counters_requested[group_count];
   memset(counters_requested, 0x0, sizeof(counters_requested));
   *pNumPasses = 1;

   for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
      perfcntr_index(group, group_count,
                     pPerformanceQueryCreateInfo->pCounterIndices[i],
                     &gid, &cid);

      counters_requested[gid]++;
   }

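   /* Each group can only service group[i].num_counters counters per pass, so
    * the pass count is the worst case over all groups. For example
    * (illustrative numbers): requesting 7 countables from a group with 4
    * counters needs DIV_ROUND_UP(7, 4) = 2 passes.
    */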
   for (uint32_t i = 0; i < group_count; i++) {
      n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
      *pNumPasses = MAX2(*pNumPasses, n_passes);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AcquireProfilingLockKHR(VkDevice device,
                           const VkAcquireProfilingLockInfoKHR* pInfo)
{
   /* TODO: there is probably something to do here for the kgsl backend. */
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_ReleaseProfilingLockKHR(VkDevice device)
{
   /* TODO: there is probably something to do here for the kgsl backend. */
   return;
}