/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve:
 *    - GPR 14 for perf queries
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"
#include "perf/intel_perf.h"
#include "perf/intel_perf_mdapi.h"
#include "perf/intel_perf_regs.h"

#include "vk_util.h"

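/* Address of the first qword of a query slot within the pool's BO.  Slots
 * are pool->stride bytes apart.
 */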
static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
#if GFX_VER >= 8
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
   struct intel_perf_counter_pass *counter_pass;
   struct intel_perf_query_info **pass_query;
   uint32_t n_passes = 0;
#endif
   uint32_t data_offset = 0;
   VK_MULTIALLOC(ma);
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is the
    * "available" bit, which is 0 when the query is unavailable and 1 when it
    * is available. The 64-bit values that follow are determined by the type
    * of query.
    *
    * For performance queries, we have a requirement to align OA reports to
    * 64 bytes, so we put those first and keep the "available" bit at the end
    * together with some other counters.
    */
   uint32_t uint64s_per_slot = 0;

   VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot = 1 + 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot = 1 + 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end value for every statistic */
      uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * written/available.
       */
      uint64s_per_slot = 1 + 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      uint64s_per_slot = 2; /* availability + marker */
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      break;
   }
#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
                                             QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      n_passes = intel_perf_get_n_passes(pdevice->perf,
                                         perf_query_info->pCounterIndices,
                                         perf_query_info->counterIndexCount,
                                         NULL);
      vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
                             perf_query_info->counterIndexCount);
      vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
                             n_passes);
      uint64s_per_slot = 4 /* availability + small batch */;
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      /* Multiply by the number of passes */
      uint64s_per_slot *= n_passes;
      break;
   }
#endif
   default:
      assert(!"Invalid query type");
   }

   if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
                             VK_OBJECT_TYPE_QUERY_POOL))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->stride - data_offset) / 2;
   }
#if GFX_VER >= 8
   else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->pass_size = pool->stride / n_passes;
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->pass_size - data_offset) / 2;
      pool->n_counters = perf_query_info->counterIndexCount;
      pool->counter_pass = counter_pass;
      intel_perf_get_counters_passes(pdevice->perf,
                                     perf_query_info->pCounterIndices,
                                     perf_query_info->counterIndexCount,
                                     pool->counter_pass);
      pool->n_passes = n_passes;
      pool->pass_query = pass_query;
      intel_perf_get_n_passes(pdevice->perf,
                              perf_query_info->pCounterIndices,
                              perf_query_info->counterIndexCount,
                              pool->pass_query);
   }
#endif

   uint64_t size = pool->slots * (uint64_t)pool->stride;
   result = anv_device_alloc_bo(device, "query-pool", size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         struct mi_builder b;
         struct anv_batch batch = {
            .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
            .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
         };
         batch.next = batch.start;

         mi_builder_init(&b, &device->info, &batch);
         mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
                      mi_imm(p * (uint64_t)pool->pass_size));
         anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
      }
   }
#endif

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->vk.alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

#if GFX_VER >= 8
/**
 * VK_KHR_performance_query layout:
 *
 * --------------------------------------------
 * |       availability (8b)       | |        |
 * |-------------------------------| |        |
 * |      Small batch loading      | |        |
 * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
 * |            (24b)              | | Pass 0 |
 * |-------------------------------| |        |
 * |       some padding (see       | |        |
 * | query_field_layout:alignment) | |        |
 * |-------------------------------| |        |
 * |           query data          | |        |
 * | (2 * query_field_layout:size) | |        |
 * |-------------------------------|--        | Query 0
 * |       availability (8b)       | |        |
 * |-------------------------------| |        |
 * |      Small batch loading      | |        |
 * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
 * |            (24b)              | | Pass 1 |
 * |-------------------------------| |        |
 * |       some padding (see       | |        |
 * | query_field_layout:alignment) | |        |
 * |-------------------------------| |        |
 * |           query data          | |        |
 * | (2 * query_field_layout:size) | |        |
 * |-------------------------------|-----------
 * |       availability (8b)       | |        |
 * |-------------------------------| |        |
 * |      Small batch loading      | |        |
 * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
 * |            (24b)              | | Pass 0 |
 * |-------------------------------| |        |
 * |       some padding (see       | |        |
 * | query_field_layout:alignment) | |        |
 * |-------------------------------| |        |
 * |           query data          | |        |
 * | (2 * query_field_layout:size) | |        |
 * |-------------------------------|--        | Query 1
 * |               ...             | |        |
 * --------------------------------------------
 */

static uint64_t
khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
}

static uint64_t
khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
      pool->data_offset + (end ? pool->snapshot_size : 0);
}

static struct anv_address
khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_availability_offset(pool, query, pass));
}

static struct anv_address
khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_data_offset(pool, query, pass, end));
}

static bool
khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return false;

   if (cmd_buffer->self_mod_locations)
      return true;

   struct anv_device *device = cmd_buffer->device;
   const struct anv_physical_device *pdevice = device->physical;

   cmd_buffer->self_mod_locations =
      vk_alloc(&cmd_buffer->pool->alloc,
               pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!cmd_buffer->self_mod_locations) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return false;
   }

   return true;
}
#endif

/**
 * VK_INTEL_performance_query layout:
 *
 * ---------------------------------
 * |       availability (8b)       |
 * |-------------------------------|
 * |          marker (8b)          |
 * |-------------------------------|
 * |       some padding (see       |
 * | query_field_layout:alignment) |
 * |-------------------------------|
 * |           query data          |
 * | (2 * query_field_layout:size) |
 * ---------------------------------
 */

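/* Byte offset of the 8-byte marker within an INTEL perf query slot (right
 * after the availability qword, see the layout above).
 */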
static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

static uint32_t
intel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
{
   return pool->data_offset + (end ? pool->snapshot_size : 0);
}

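/* Write one result value into the client-provided buffer, as either a
 * 32-bit or 64-bit value depending on VK_QUERY_RESULT_64_BIT.
 */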
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

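/* CPU pointer to the given query slot through the pool BO's mapping. */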
static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

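/* Read availability through the (snooped) CPU mapping.  A KHR performance
 * query is only considered available once every pass has completed.
 */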
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         volatile uint64_t *slot =
            pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
         if (!slot[0])
            return false;
      }
      return true;
   }
#endif

   return *(volatile uint64_t *)query_slot(pool, query);
}

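/* Poll for availability for up to 2 seconds, bailing out early if the
 * device is lost.
 */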
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS) {
            return status;
         }

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       *
       * From VK_KHR_performance_query:
       *
       *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
       *     that the result should contain the number of counters that were recorded
       *     into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero and
             *    the final result value is written to pData for that query."
             */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.ver == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         const struct anv_physical_device *pdevice = device->physical;
         assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                          VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            const struct intel_perf_query_info *query = pool->pass_query[p];
            struct intel_perf_query_result result;
            intel_perf_query_result_clear(&result);
            intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
                                                      false /* no_oa_accumulate */);
            anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
         }
         break;
      }
#endif

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
         struct intel_perf_query_result result;
         intel_perf_query_result_clear(&result);
         intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                   query_data + intel_perf_query_data_offset(pool, false),
                                                   query_data + intel_perf_query_data_offset(pool, true),
                                                   false /* no_oa_accumulate */);
         intel_perf_query_result_write_mdapi(pData, stride,
                                             &device->info,
                                             query, &result);
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

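/* Emit a depth-stalling PIPE_CONTROL that writes the accumulated PS depth
 * count (the occlusion counter) to addr.  Used for both the begin and end
 * snapshots of occlusion queries.
 */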
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

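/* Write the availability qword from the command streamer with an MI store. */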
static void
emit_query_mi_availability(struct mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   mi_store(b, mi_mem64(addr), mi_imm(available));
}

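/* Same as above, but using a PIPE_CONTROL post-sync write so that
 * availability is ordered with query values that are themselves written by
 * PIPE_CONTROL (occlusion & timestamp queries).
 */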
static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting all element values to 0 and marking them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      for (uint32_t i = 0; i < num_queries; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
                         0, 2 * pool->snapshot_size);
            emit_query_mi_availability(b,
                                       khr_perf_query_availability_address(pool, first_index + i, p),
                                       true);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            emit_query_mi_availability(
               &b,
               khr_perf_query_availability_address(pool, firstQuery + i, p),
               false);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

void genX(ResetQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
#if GFX_VER >= 8
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            uint64_t *pass_slot = pool->bo->map +
               khr_perf_query_availability_offset(pool, firstQuery + i, p);
            *pass_slot = 0;
         }
#endif
      } else {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         *slot = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

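/* Store the pipeline statistics counter register for the statistic at bit
 * position `stat` (indexing the table above) to memory.
 */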
static void
emit_pipeline_stat(struct mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

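/* Snapshot the transform feedback counters for one stream: primitives
 * written at addr and primitives needed (storage) at addr + 16, matching
 * the begin/end interleaving of the XFB slot layout.
 */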
static void
emit_xfb_query(struct mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   mi_store(b, mi_mem64(anv_address_add(addr, 0)),
               mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   mi_store(b, mi_mem64(anv_address_add(addr, 16)),
               mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

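/* Emit the snapshot commands for an INTEL performance query: an
 * MI_REPORT_PERF_COUNT for the OA report and SRM stores for the other
 * counters.  Note the fields are walked in reverse order for the begin
 * snapshot, presumably to keep each field's begin/end captures as close
 * together as possible.
 */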
static void
emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
                      struct anv_query_pool *pool,
                      struct mi_builder *b,
                      struct anv_address query_addr,
                      bool end)
{
   const struct intel_perf_query_field_layout *layout =
      &cmd_buffer->device->physical->perf->query_layout;
   struct anv_address data_addr =
      anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));

   for (uint32_t f = 0; f < layout->n_fields; f++) {
      const struct intel_perf_query_field *field =
         &layout->fields[end ? f : (layout->n_fields - 1 - f)];

      switch (field->type) {
      case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
            rpc.MemoryAddress = anv_address_add(data_addr, field->location);
         }
         break;

      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
         struct anv_address addr = anv_address_add(data_addr, field->location);
         struct mi_value src = field->size == 8 ?
            mi_reg64(field->mmio_offset) :
            mi_reg32(field->mmio_offset);
         struct mi_value dst = field->size == 8 ?
            mi_mem64(addr) : mi_mem32(addr);
         mi_store(b, dst, src);
         break;
      }

      default:
         unreachable("Invalid query field");
         break;
      }
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      uint32_t reloc_idx = 0;
      for (uint32_t end = 0; end < 2; end++) {
         for (uint32_t r = 0; r < layout->n_fields; r++) {
            const struct intel_perf_query_field *field =
               &layout->fields[end ? r : (layout->n_fields - 1 - r)];
            struct mi_value reg_addr =
               mi_iadd(
                  &b,
                  mi_imm(intel_canonical_address(pool->bo->offset +
                                                 khr_perf_query_data_offset(pool, query, 0, end) +
                                                 field->location)),
                  mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
            cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);

            if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
                field->size == 8) {
               reg_addr =
                  mi_iadd(
                     &b,
                     mi_imm(intel_canonical_address(pool->bo->offset +
                                                    khr_perf_query_data_offset(pool, query, 0, end) +
                                                    field->location + 4)),
                     mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
               cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
            }
         }
      }

      struct mi_value availability_write_offset =
         mi_iadd(
            &b,
            mi_imm(
               intel_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
            mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
      cmd_buffer->self_mod_locations[reloc_idx++] =
         mi_store_address(&b, availability_write_offset);

      assert(reloc_idx == pdevice->n_perf_query_commands);

      mi_self_mod_barrier(&b);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      cmd_buffer->perf_reloc_idx = 0;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field =
            &layout->fields[layout->n_fields - 1 - r];
         void *dws;

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      void *dws;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field = &layout->fields[r];

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }

      dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_STORE_DATA_IMM_length),
                         GENX(MI_STORE_DATA_IMM),
                         .ImmediateData = true);
      _mi_resolve_address_token(&b,
                                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                dws +
                                GENX(MI_STORE_DATA_IMM_Address_start) / 8);

      assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
                   mi_imm(cmd_buffer->intel_perf_marker));
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlags2KHR                    stage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) {
      mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
                   mi_reg64(TIMESTAMP));
   } else {
      /* Everything else is bottom-of-pipe */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GFX_VERx10 >= 75

#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

/**
 * Writes the results of a query to dst_addr if the value at poll_addr is equal
 * to the reference value.
 */
static void
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
                            struct mi_builder *b,
                            struct anv_address poll_addr,
                            struct anv_address dst_addr,
                            uint64_t ref_value,
                            VkQueryResultFlags flags,
                            uint32_t value_index,
                            struct mi_value query_result)
{
   mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
   mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      mi_store_if(b, mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      mi_store_if(b, mi_mem32(res_addr), query_result);
   }
}

1317 static void
1318 gpu_write_query_result(struct mi_builder *b,
1319                        struct anv_address dst_addr,
1320                        VkQueryResultFlags flags,
1321                        uint32_t value_index,
1322                        struct mi_value query_result)
1323 {
1324    if (flags & VK_QUERY_RESULT_64_BIT) {
1325       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1326       mi_store(b, mi_mem64(res_addr), query_result);
1327    } else {
1328       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1329       mi_store(b, mi_mem32(res_addr), query_result);
1330    }
1331 }
1332 
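/* Query slots store begin/end counter snapshots as two consecutive 64-bit
 * values; the result is simply end minus begin.
 */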
1333 static struct mi_value
1334 compute_query_result(struct mi_builder *b, struct anv_address addr)
1335 {
1336    return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
1337                      mi_mem64(anv_address_add(addr, 0)));
1338 }
1339 
1340 void genX(CmdCopyQueryPoolResults)(
1341     VkCommandBuffer                             commandBuffer,
1342     VkQueryPool                                 queryPool,
1343     uint32_t                                    firstQuery,
1344     uint32_t                                    queryCount,
1345     VkBuffer                                    destBuffer,
1346     VkDeviceSize                                destOffset,
1347     VkDeviceSize                                destStride,
1348     VkQueryResultFlags                          flags)
1349 {
1350    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1351    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1352    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1353 
1354    struct mi_builder b;
1355    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1356    struct mi_value result;
1357 
1358    /* If render target writes are ongoing, request a render target cache flush
1359     * to ensure proper ordering of the commands from the 3d pipe and the
1360     * command streamer.
1361     */
1362    if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
1363       anv_add_pending_pipe_bits(cmd_buffer,
1364                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
1365                                 "CopyQueryPoolResults");
1366    }
1367 
1368    if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
1369        (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
1370        /* Occlusion & timestamp queries are written using a PIPE_CONTROL.
1371         * Because we're about to copy their values with MI commands, we need
1372         * to stall the command streamer to make sure the PIPE_CONTROL writes
1373         * have landed; otherwise we could see inconsistent values & availability.
1374         *
1375         * From the Vulkan spec:
1376         *
1377         *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1378         *     previous uses of vkCmdResetQueryPool in the same queue, without
1379         *     any additional synchronization."
1380         */
1381        pool->type == VK_QUERY_TYPE_OCCLUSION ||
1382        pool->type == VK_QUERY_TYPE_TIMESTAMP) {
1383       anv_add_pending_pipe_bits(cmd_buffer,
1384                                 ANV_PIPE_CS_STALL_BIT,
1385                                 "CopyQueryPoolResults");
1386       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1387    }
1388 
1389    struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
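   /* Results for query i land at destOffset + i * destStride, packed as
    * consecutive 32-bit or 64-bit values depending on VK_QUERY_RESULT_64_BIT.
    */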
1390    for (uint32_t i = 0; i < queryCount; i++) {
1391       struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
1392       uint32_t idx = 0;
1393       switch (pool->type) {
1394       case VK_QUERY_TYPE_OCCLUSION:
1395          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1396          /* Like in the case of vkGetQueryPoolResults, if the query is
1397           * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
1398           * conservatively write 0 as the query result. If the
1399           * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
1400           */
1401          gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1402                1 /* available */, flags, idx, result);
1403          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
1404             gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1405                   0 /* unavailable */, flags, idx, mi_imm(0));
1406          }
1407          idx++;
1408          break;
1409 
1410       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1411          uint32_t statistics = pool->pipeline_statistics;
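         /* Each enabled statistic occupies a begin/end pair of 64-bit
          * snapshots in the query slot, starting at offset 8; pair idx lives
          * at offsets idx * 16 + 8 and idx * 16 + 16.
          */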
1412          while (statistics) {
1413             uint32_t stat = u_bit_scan(&statistics);
1414 
1415             result = compute_query_result(&b, anv_address_add(query_addr,
1416                                                               idx * 16 + 8));
1417 
1418             /* WaDividePSInvocationCountBy4:HSW,BDW */
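            /* The reported fragment shader invocation count must be divided
             * by 4 on these parts; the right shift by 2 below does the divide.
             */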
1419             if ((cmd_buffer->device->info.ver == 8 ||
1420                  cmd_buffer->device->info.is_haswell) &&
1421                 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1422                result = mi_ushr32_imm(&b, result, 2);
1423             }
1424 
1425             gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1426          }
1427          assert(idx == util_bitcount(pool->pipeline_statistics));
1428          break;
1429       }
1430 
1431       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
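         /* Per VK_EXT_transform_feedback, the first value is the number of
          * primitives written and the second the number of primitives needed;
          * their begin/end pairs live at offsets 8/16 and 24/32 of the slot.
          */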
1432          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1433          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1434          result = compute_query_result(&b, anv_address_add(query_addr, 24));
1435          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1436          break;
1437 
1438       case VK_QUERY_TYPE_TIMESTAMP:
1439          result = mi_mem64(anv_address_add(query_addr, 8));
1440          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1441          break;
1442 
1443 #if GFX_VER >= 8
1444       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1445          unreachable("Copy KHR performance query results not implemented");
1446          break;
1447 #endif
1448 
1449       default:
1450          unreachable("unhandled query type");
1451       }
1452 
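      /* With VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, the availability qword
       * from the start of the query slot is appended as one extra value.
       */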
1453       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1454          gpu_write_query_result(&b, dest_addr, flags, idx,
1455                                 mi_mem64(query_addr));
1456       }
1457 
1458       dest_addr = anv_address_add(dest_addr, destStride);
1459    }
1460 }
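
/* Illustrative only (not driver code): a sketch of how an application might
 * consume the layout produced above. With VK_QUERY_RESULT_64_BIT and
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT each query occupies 16 bytes in the
 * destination buffer (8-byte result followed by 8-byte availability), so a
 * stride of 16 packs them tightly:
 *
 *    vkCmdCopyQueryPoolResults(cmd, pool, 0, 2, buffer, 0, 16,
 *                              VK_QUERY_RESULT_64_BIT |
 *                              VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
 */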
1461 
1462 #else
1463 void genX(CmdCopyQueryPoolResults)(
1464     VkCommandBuffer                             commandBuffer,
1465     VkQueryPool                                 queryPool,
1466     uint32_t                                    firstQuery,
1467     uint32_t                                    queryCount,
1468     VkBuffer                                    destBuffer,
1469     VkDeviceSize                                destOffset,
1470     VkDeviceSize                                destStride,
1471     VkQueryResultFlags                          flags)
1472 {
1473    anv_finishme("Queries not yet supported on Ivy Bridge");
1474 }
1475 #endif
1476