1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 
30 #include "anv_private.h"
31 
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34 
35 /* We reserve:
36  *    - GPR 14 for perf queries
37  *    - GPR 15 for conditional rendering
38  */
39 #define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
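/* Writing into the batch from the command streamer is only supported on
 * Gen8+; the multi-pass VK_KHR_performance_query paths below rely on it and
 * are compiled out on older gens.
 */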
40 #define GEN_MI_BUILDER_CAN_WRITE_BATCH GEN_GEN >= 8
41 #define __gen_get_batch_dwords anv_batch_emit_dwords
42 #define __gen_address_offset anv_address_add
43 #define __gen_get_batch_address(b, a) anv_address_physical(anv_batch_address(b, a))
44 #include "common/gen_mi_builder.h"
45 #include "perf/gen_perf.h"
46 #include "perf/gen_perf_mdapi.h"
47 
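/* A single OA report is 256 bytes, i.e. 32 uint64_t. */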
48 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
49 
50 #include "vk_util.h"
51 
52 static struct anv_address
53 anv_query_address(struct anv_query_pool *pool, uint32_t query)
54 {
55    return (struct anv_address) {
56       .bo = pool->bo,
57       .offset = query * pool->stride,
58    };
59 }
60 
61 VkResult genX(CreateQueryPool)(
62     VkDevice                                    _device,
63     const VkQueryPoolCreateInfo*                pCreateInfo,
64     const VkAllocationCallbacks*                pAllocator,
65     VkQueryPool*                                pQueryPool)
66 {
67    ANV_FROM_HANDLE(anv_device, device, _device);
68    const struct anv_physical_device *pdevice = device->physical;
69    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
70    struct anv_query_pool *pool;
71    struct gen_perf_counter_pass *counter_pass;
72    struct gen_perf_query_info **pass_query;
73    ANV_MULTIALLOC(ma);
74    VkResult result;
75 
76    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
77 
78    /* Query pool slots are made up of some number of 64-bit values packed
79     * tightly together. For most query types, the first 64-bit value is
80     * the "available" bit which is 0 when the query is unavailable and 1 when
81     * it is available. The 64-bit values that follow are determined by the
82     * type of query.
83     *
84     * For performance queries, we have a requirement to align OA reports to
85     * 64 bytes, so we put those first and place the "available" bit after
86     * them, together with some other counters.
87     */
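   /* For example, with the encoding below an occlusion query slot is
    * { availability, begin, end } = 3 uint64_t (a 24 byte stride) and a
    * timestamp slot is just { availability, timestamp } = 16 bytes.
    */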
88    uint32_t uint64s_per_slot = 0;
89    UNUSED uint32_t n_passes = 0;
90 
91    anv_multialloc_add(&ma, &pool, 1);
92 
93    VkQueryPipelineStatisticFlags pipeline_statistics = 0;
94    switch (pCreateInfo->queryType) {
95    case VK_QUERY_TYPE_OCCLUSION:
96       /* Occlusion queries have two values: begin and end. */
97       uint64s_per_slot = 1 + 2;
98       break;
99    case VK_QUERY_TYPE_TIMESTAMP:
100       /* Timestamps just have the one timestamp value */
101       uint64s_per_slot = 1 + 1;
102       break;
103    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
104       pipeline_statistics = pCreateInfo->pipelineStatistics;
105       /* We're going to trust this field implicitly so we need to ensure that
106        * no unhandled extension bits leak in.
107        */
108       pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
109 
110       /* Statistics queries have a min and max for every statistic */
111       uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
112       break;
113    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
114       /* Transform feedback queries are 4 values, begin/end for
115        * primitives written and primitives needed.
116        */
117       uint64s_per_slot = 1 + 4;
118       break;
119    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
120       uint64s_per_slot = 72; /* 576 bytes, see layout below */
121       break;
122    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
123       perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
124                                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
125       n_passes = gen_perf_get_n_passes(pdevice->perf,
126                                        perf_query_info->pCounterIndices,
127                                        perf_query_info->counterIndexCount,
128                                        NULL);
129       anv_multialloc_add(&ma, &counter_pass, perf_query_info->counterIndexCount);
130       anv_multialloc_add(&ma, &pass_query, n_passes);
131       STATIC_ASSERT(ANV_KHR_PERF_QUERY_SIZE % sizeof(uint64_t) == 0);
132       uint64s_per_slot = (ANV_KHR_PERF_QUERY_SIZE / sizeof(uint64_t)) * n_passes;
133       break;
134    default:
135       assert(!"Invalid query type");
136    }
137 
138    if (!anv_multialloc_alloc2(&ma, &device->vk.alloc,
139                               pAllocator,
140                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
141       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
142 
143    vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
144    pool->type = pCreateInfo->queryType;
145    pool->pipeline_statistics = pipeline_statistics;
146    pool->stride = uint64s_per_slot * sizeof(uint64_t);
147    pool->slots = pCreateInfo->queryCount;
148 
149    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
150       pool->n_counters = perf_query_info->counterIndexCount;
151       pool->counter_pass = counter_pass;
152       gen_perf_get_counters_passes(pdevice->perf,
153                                    perf_query_info->pCounterIndices,
154                                    perf_query_info->counterIndexCount,
155                                    pool->counter_pass);
156       pool->n_passes = n_passes;
157       pool->pass_query = pass_query;
158       gen_perf_get_n_passes(pdevice->perf,
159                             perf_query_info->pCounterIndices,
160                             perf_query_info->counterIndexCount,
161                             pool->pass_query);
162    }
163 
164    uint32_t bo_flags = 0;
165    if (pdevice->supports_48bit_addresses)
166       bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
167 
168    if (pdevice->use_softpin)
169       bo_flags |= EXEC_OBJECT_PINNED;
170 
171    if (pdevice->has_exec_async)
172       bo_flags |= EXEC_OBJECT_ASYNC;
173 
174    uint64_t size = pool->slots * pool->stride;
175    result = anv_device_alloc_bo(device, size,
176                                 ANV_BO_ALLOC_MAPPED |
177                                 ANV_BO_ALLOC_SNOOPED,
178                                 0 /* explicit_address */,
179                                 &pool->bo);
180    if (result != VK_SUCCESS)
181       goto fail;
182 
183    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
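      /* For each pass, write a small batch into the first query's slot that
       * loads the pass's byte offset into ANV_PERF_QUERY_OFFSET_REG. The
       * packets emitted in genX(CmdBeginQueryIndexedEXT) /
       * genX(CmdEndQueryIndexedEXT) add that register to addresses computed
       * for pass 0, so the MI_RPC/MI_SDI writes can be redirected to the
       * slot of whichever pass is selected when the batch is replayed.
       */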
184       for (uint32_t p = 0; p < pool->n_passes; p++) {
185          struct gen_mi_builder b;
186          struct anv_batch batch = {
187             .start = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 8,
188             .end = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64,
189          };
190          batch.next = batch.start;
191 
192          gen_mi_builder_init(&b, &batch);
193          gen_mi_store(&b, gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
194                       gen_mi_imm(p * ANV_KHR_PERF_QUERY_SIZE));
195          anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
196          assert(batch.next <= (pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64));
197       }
198    }
199 
200    *pQueryPool = anv_query_pool_to_handle(pool);
201 
202    return VK_SUCCESS;
203 
204  fail:
205    vk_free2(&device->vk.alloc, pAllocator, pool);
206 
207    return result;
208 }
209 
210 void genX(DestroyQueryPool)(
211     VkDevice                                    _device,
212     VkQueryPool                                 _pool,
213     const VkAllocationCallbacks*                pAllocator)
214 {
215    ANV_FROM_HANDLE(anv_device, device, _device);
216    ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
217 
218    if (!pool)
219       return;
220 
221    anv_device_release_bo(device, pool->bo);
222    vk_object_base_finish(&pool->base);
223    vk_free2(&device->vk.alloc, pAllocator, pool);
224 }
225 
226 /**
227  * VK_KHR_performance_query layout (576 bytes * number of passes):
228  *
229  * -----------------------------------------
230  * |       availability (8b)    | |        |
231  * |----------------------------| |        |
232  * |     Small batch loading    | |        |
233  * |  ANV_PERF_QUERY_OFFSET_REG | |        |
234  * |          (56b)             | | Pass 0 |
235  * |----------------------------| |        |
236  * |     begin MI_RPC (256b)    | |        |
237  * |----------------------------| |        |
238  * |       end MI_RPC (256b)    | |        |
239  * |----------------------------|--        | Query 0
240  * |       availability (8b)    | |        |
241  * |----------------------------| |        |
242  * |     Small batch loading    | |        |
243  * |  ANV_PERF_QUERY_OFFSET_REG | |        |
244  * |          (56b)             | | Pass 1 |
245  * |----------------------------| |        |
246  * |     begin MI_RPC (256b)    | |        |
247  * |----------------------------| |        |
248  * |       end MI_RPC (256b)    | |        |
249  * |----------------------------|-----------
250  * |       availability (8b)    | |        |
251  * |----------------------------| |        |
252  * |        Unused (56b)        | |        |
253  * |----------------------------| | Pass 0 |
254  * |     begin MI_RPC (256b)    | |        |
255  * |----------------------------| |        | Query 1
256  * |       end MI_RPC (256b)    | |        |
257  * |----------------------------|--        |
258  * |             ...            | |        |
259  * -----------------------------------------
260  */
261 UNUSED static uint64_t
262 khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
263 {
264    return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) +
265       pass * ANV_KHR_PERF_QUERY_SIZE;
266 }
267 
268 UNUSED static uint64_t
269 khr_perf_query_oa_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
270 {
271    return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) +
272       pass * ANV_KHR_PERF_QUERY_SIZE +
273       64 + (end ? OA_SNAPSHOT_SIZE : 0);
274 }
275 
276 UNUSED static struct anv_address
277 khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
278 {
279    return anv_address_add(
280       (struct anv_address) { .bo = pool->bo, },
281       khr_perf_query_availability_offset(pool, query, pass));
282 }
283 
284 UNUSED static struct anv_address
285 khr_perf_query_oa_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
286 {
287    return anv_address_add(
288       (struct anv_address) { .bo = pool->bo, },
289       khr_perf_query_oa_offset(pool, query, pass, end));
290 }
291 
292 
293 /**
294  * VK_INTEL_performance_query layout (576 bytes):
295  *
296  * ------------------------------
297  * |       availability (8b)    |
298  * |----------------------------|
299  * |         marker (8b)        |
300  * |----------------------------|
301  * | begin RPSTAT register (4b) |
302  * |----------------------------|
303  * |  end RPSTAT register (4b)  |
304  * |----------------------------|
305  * | begin perfcntr 1 & 2 (16b) |
306  * |----------------------------|
307  * |  end perfcntr 1 & 2 (16b)  |
308  * |----------------------------|
309  * |          Unused (8b)       |
310  * |----------------------------|
311  * |     begin MI_RPC (256b)    |
312  * |----------------------------|
313  * |       end MI_RPC (256b)    |
314  * ------------------------------
315  */
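/* The helpers below encode that layout as byte offsets: marker at 8, RPSTAT
 * snapshots at 16/20, PERFCNT1 & 2 pairs at 24/40, and the two 256-byte
 * MI_RPC reports at 64/320, for 576 bytes total.
 */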
316 
317 static uint32_t
318 intel_perf_marker_offset(void)
319 {
320    return 8;
321 }
322 
323 static uint32_t
324 intel_perf_rpstart_offset(bool end)
325 {
326    return 16 + (end ? sizeof(uint32_t) : 0);
327 }
328 
329 #if GEN_GEN >= 8 && GEN_GEN <= 11
330 static uint32_t
331 intel_perf_counter(bool end)
332 {
333    return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
334 }
335 #endif
336 
337 static uint32_t
338 intel_perf_mi_rpc_offset(bool end)
339 {
340    return 64 + (end ? 256 : 0);
341 }
342 
343 static void
344 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
345                        uint32_t value_index, uint64_t result)
346 {
347    if (flags & VK_QUERY_RESULT_64_BIT) {
348       uint64_t *dst64 = dst_slot;
349       dst64[value_index] = result;
350    } else {
351       uint32_t *dst32 = dst_slot;
352       dst32[value_index] = result;
353    }
354 }
355 
356 static void *
357 query_slot(struct anv_query_pool *pool, uint32_t query)
358 {
359    return pool->bo->map + query * pool->stride;
360 }
361 
362 static bool
363 query_is_available(struct anv_query_pool *pool, uint32_t query)
364 {
365    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
366       for (uint32_t p = 0; p < pool->n_passes; p++) {
367          volatile uint64_t *slot =
368             pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
369          if (!slot[0])
370             return false;
371       }
372       return true;
373    } else {
374       return *(volatile uint64_t *)query_slot(pool, query);
375    }
376 }
377 
378 static VkResult
379 wait_for_available(struct anv_device *device,
380                    struct anv_query_pool *pool, uint32_t query)
381 {
382    uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);
383 
384    while (anv_gettime_ns() < abs_timeout) {
385       if (query_is_available(pool, query))
386          return VK_SUCCESS;
387       VkResult status = anv_device_query_status(device);
388       if (status != VK_SUCCESS)
389          return status;
390    }
391 
392    return anv_device_set_lost(device, "query timeout");
393 }
394 
395 VkResult genX(GetQueryPoolResults)(
396     VkDevice                                    _device,
397     VkQueryPool                                 queryPool,
398     uint32_t                                    firstQuery,
399     uint32_t                                    queryCount,
400     size_t                                      dataSize,
401     void*                                       pData,
402     VkDeviceSize                                stride,
403     VkQueryResultFlags                          flags)
404 {
405    ANV_FROM_HANDLE(anv_device, device, _device);
406    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
407 
408    assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
409           pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
410           pool->type == VK_QUERY_TYPE_TIMESTAMP ||
411           pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
412           pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
413           pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
414 
415    if (anv_device_is_lost(device))
416       return VK_ERROR_DEVICE_LOST;
417 
418    if (pData == NULL)
419       return VK_SUCCESS;
420 
421    void *data_end = pData + dataSize;
422 
423    VkResult status = VK_SUCCESS;
424    for (uint32_t i = 0; i < queryCount; i++) {
425       bool available = query_is_available(pool, firstQuery + i);
426 
427       if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
428          status = wait_for_available(device, pool, firstQuery + i);
429          if (status != VK_SUCCESS)
430             return status;
431 
432          available = true;
433       }
434 
435       /* From the Vulkan 1.0.42 spec:
436        *
437        *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
438        *    both not set then no result values are written to pData for
439        *    queries that are in the unavailable state at the time of the call,
440        *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
441        *    availability state is still written to pData for those queries if
442        *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
443        *
444        * From VK_KHR_performance_query:
445        *
446        *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
447        *     that the result should contain the number of counters that were recorded
448        *     into a query pool of type VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
449        */
450       bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
451 
452       uint32_t idx = 0;
453       switch (pool->type) {
454       case VK_QUERY_TYPE_OCCLUSION: {
455          uint64_t *slot = query_slot(pool, firstQuery + i);
456          if (write_results) {
457             /* From the Vulkan 1.2.132 spec:
458              *
459              *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
460              *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
461              *    is unavailable, an intermediate result value between zero and
462              *    the final result value is written to pData for that query."
463              */
464             uint64_t result = available ? slot[2] - slot[1] : 0;
465             cpu_write_query_result(pData, flags, idx, result);
466          }
467          idx++;
468          break;
469       }
470 
471       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
472          uint64_t *slot = query_slot(pool, firstQuery + i);
473          uint32_t statistics = pool->pipeline_statistics;
474          while (statistics) {
475             uint32_t stat = u_bit_scan(&statistics);
476             if (write_results) {
477                uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
478 
479                /* WaDividePSInvocationCountBy4:HSW,BDW */
480                if ((device->info.gen == 8 || device->info.is_haswell) &&
481                    (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
482                   result >>= 2;
483 
484                cpu_write_query_result(pData, flags, idx, result);
485             }
486             idx++;
487          }
488          assert(idx == util_bitcount(pool->pipeline_statistics));
489          break;
490       }
491 
492       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
493          uint64_t *slot = query_slot(pool, firstQuery + i);
494          if (write_results)
495             cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
496          idx++;
497          if (write_results)
498             cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
499          idx++;
500          break;
501       }
502 
503       case VK_QUERY_TYPE_TIMESTAMP: {
504          uint64_t *slot = query_slot(pool, firstQuery + i);
505          if (write_results)
506             cpu_write_query_result(pData, flags, idx, slot[1]);
507          idx++;
508          break;
509       }
510 
511 #if GEN_GEN >= 8
512       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
513          const struct anv_physical_device *pdevice = device->physical;
514          assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
515                           VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
516          for (uint32_t p = 0; p < pool->n_passes; p++) {
517             const uint32_t *begin = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, false);
518             const uint32_t *end = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, true);
519             struct gen_perf_query_result result;
520             gen_perf_query_result_clear(&result);
521             gen_perf_query_result_accumulate(&result, pool->pass_query[p], begin, end);
522             anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
523          }
524          break;
525       }
526 #endif
527 
528       case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
529          if (!write_results)
530             break;
531          const void *query_data = query_slot(pool, firstQuery + i);
532          const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
533          const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
534          const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
535          const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
536          struct gen_perf_query_result result;
537          uint32_t core_freq[2];
538 #if GEN_GEN < 9
539          core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
540          core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
541 #else
542          core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
543          core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
544 #endif
545          gen_perf_query_result_clear(&result);
546          gen_perf_query_result_accumulate(&result, &device->physical->perf->queries[0],
547                                           oa_begin, oa_end);
548          gen_perf_query_result_read_frequencies(&result, &device->info,
549                                                 oa_begin, oa_end);
550          gen_perf_query_result_write_mdapi(pData, stride,
551                                            &device->info,
552                                            &result,
553                                            core_freq[0], core_freq[1]);
554 #if GEN_GEN >= 8 && GEN_GEN <= 11
555          gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
556                                              query_data + intel_perf_counter(false),
557                                              query_data + intel_perf_counter(true));
558 #endif
559          const uint64_t *marker = query_data + intel_perf_marker_offset();
560          gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
561          break;
562       }
563 
564       default:
565          unreachable("invalid pool type");
566       }
567 
568       if (!write_results)
569          status = VK_NOT_READY;
570 
571       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
572          cpu_write_query_result(pData, flags, idx, available);
573 
574       pData += stride;
575       if (pData >= data_end)
576          break;
577    }
578 
579    return status;
580 }
581 
582 static void
583 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
584                     struct anv_address addr)
585 {
586    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
587    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
588 
589    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
590       pc.DestinationAddressType  = DAT_PPGTT;
591       pc.PostSyncOperation       = WritePSDepthCount;
592       pc.DepthStallEnable        = true;
593       pc.Address                 = addr;
594 
595       if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
596          pc.CommandStreamerStallEnable = true;
597    }
598 }
599 
600 static void
601 emit_query_mi_availability(struct gen_mi_builder *b,
602                            struct anv_address addr,
603                            bool available)
604 {
605    gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
606 }
607 
608 static void
609 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
610                            struct anv_address addr,
611                            bool available)
612 {
613    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
614    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
615 
616    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
617       pc.DestinationAddressType  = DAT_PPGTT;
618       pc.PostSyncOperation       = WriteImmediateData;
619       pc.Address                 = addr;
620       pc.ImmediateData           = available;
621    }
622 }
623 
624 /**
625  * Goes through a series of consecutive query indices in the given pool,
626  * setting all element values to 0 and marking them as available.
627  */
628 static void
629 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
630                   struct gen_mi_builder *b, struct anv_query_pool *pool,
631                   uint32_t first_index, uint32_t num_queries)
632 {
633    switch (pool->type) {
634    case VK_QUERY_TYPE_OCCLUSION:
635    case VK_QUERY_TYPE_TIMESTAMP:
636       /* These queries are written with a PIPE_CONTROL so clear them using the
637        * PIPE_CONTROL as well so we don't have to synchronize between 2 types
638        * of operations.
639        */
640       assert((pool->stride % 8) == 0);
641       for (uint32_t i = 0; i < num_queries; i++) {
642          struct anv_address slot_addr =
643             anv_query_address(pool, first_index + i);
644 
645          for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
646             emit_query_pc_availability(cmd_buffer,
647                                        anv_address_add(slot_addr, qword * 8),
648                                        false);
649          }
650          emit_query_pc_availability(cmd_buffer, slot_addr, true);
651       }
652       break;
653 
654    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
655    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
656       for (uint32_t i = 0; i < num_queries; i++) {
657          struct anv_address slot_addr =
658             anv_query_address(pool, first_index + i);
659          gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
660          emit_query_mi_availability(b, slot_addr, true);
661       }
662       break;
663 
664 #if GEN_GEN >= 8
665    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
666       for (uint32_t i = 0; i < num_queries; i++) {
667          for (uint32_t p = 0; p < pool->n_passes; p++) {
668             gen_mi_memset(b,
669                           khr_perf_query_oa_address(pool,
670                                                     first_index + i, p, false),
671                           0, 2 * OA_SNAPSHOT_SIZE);
672             emit_query_mi_availability(b,
673                                        khr_perf_query_availability_address(pool, first_index + i, p),
674                                        true);
675          }
676       }
677       break;
678    }
679 #endif
680 
681    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
682       for (uint32_t i = 0; i < num_queries; i++) {
683          struct anv_address slot_addr =
684             anv_query_address(pool, first_index + i);
685          gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
686          emit_query_mi_availability(b, slot_addr, true);
687       }
688       break;
689 
690    default:
691       unreachable("Unsupported query type");
692    }
693 }
694 
695 void genX(CmdResetQueryPool)(
696     VkCommandBuffer                             commandBuffer,
697     VkQueryPool                                 queryPool,
698     uint32_t                                    firstQuery,
699     uint32_t                                    queryCount)
700 {
701    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
702    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
703 
704    switch (pool->type) {
705    case VK_QUERY_TYPE_OCCLUSION:
706    case VK_QUERY_TYPE_TIMESTAMP:
707       for (uint32_t i = 0; i < queryCount; i++) {
708          emit_query_pc_availability(cmd_buffer,
709                                     anv_query_address(pool, firstQuery + i),
710                                     false);
711       }
712       break;
713 
714    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
715    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
716       struct gen_mi_builder b;
717       gen_mi_builder_init(&b, &cmd_buffer->batch);
718 
719       for (uint32_t i = 0; i < queryCount; i++)
720          emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
721       break;
722    }
723 
724 #if GEN_GEN >= 8
725    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
726       struct gen_mi_builder b;
727       gen_mi_builder_init(&b, &cmd_buffer->batch);
728 
729       for (uint32_t i = 0; i < queryCount; i++) {
730          for (uint32_t p = 0; p < pool->n_passes; p++) {
731             emit_query_mi_availability(
732                &b,
733                khr_perf_query_availability_address(pool, firstQuery + i, p),
734                false);
735          }
736       }
737       break;
738    }
739 #endif
740 
741    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
742       struct gen_mi_builder b;
743       gen_mi_builder_init(&b, &cmd_buffer->batch);
744 
745       for (uint32_t i = 0; i < queryCount; i++)
746          emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
747       break;
748    }
749 
750    default:
751       unreachable("Unsupported query type");
752    }
753 }
754 
755 void genX(ResetQueryPool)(
756     VkDevice                                    _device,
757     VkQueryPool                                 queryPool,
758     uint32_t                                    firstQuery,
759     uint32_t                                    queryCount)
760 {
761    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
762 
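   /* A host-side reset only needs to clear the availability value(s); the
    * begin/end data is overwritten by the next Begin/End or timestamp write.
    */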
763    for (uint32_t i = 0; i < queryCount; i++) {
764       if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
765          for (uint32_t p = 0; p < pool->n_passes; p++) {
766             uint64_t *pass_slot = pool->bo->map +
767                khr_perf_query_availability_offset(pool, firstQuery + i, p);
768             *pass_slot = 0;
769          }
770       } else {
771          uint64_t *slot = query_slot(pool, firstQuery + i);
772          *slot = 0;
773       }
774    }
775 }
776 
777 static const uint32_t vk_pipeline_stat_to_reg[] = {
778    GENX(IA_VERTICES_COUNT_num),
779    GENX(IA_PRIMITIVES_COUNT_num),
780    GENX(VS_INVOCATION_COUNT_num),
781    GENX(GS_INVOCATION_COUNT_num),
782    GENX(GS_PRIMITIVES_COUNT_num),
783    GENX(CL_INVOCATION_COUNT_num),
784    GENX(CL_PRIMITIVES_COUNT_num),
785    GENX(PS_INVOCATION_COUNT_num),
786    GENX(HS_INVOCATION_COUNT_num),
787    GENX(DS_INVOCATION_COUNT_num),
788    GENX(CS_INVOCATION_COUNT_num),
789 };
790 
791 static void
792 emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
793                    struct anv_address addr)
794 {
795    STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
796                  (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
797 
798    assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
799    gen_mi_store(b, gen_mi_mem64(addr),
800                 gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
801 }
802 
803 static void
804 emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
805                struct anv_address addr)
806 {
807    assert(stream < MAX_XFB_STREAMS);
808 
809    gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
810                 gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
811    gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
812                 gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
813 }
814 
815 void genX(CmdBeginQuery)(
816     VkCommandBuffer                             commandBuffer,
817     VkQueryPool                                 queryPool,
818     uint32_t                                    query,
819     VkQueryControlFlags                         flags)
820 {
821    genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
822 }
823 
824 void genX(CmdBeginQueryIndexedEXT)(
825     VkCommandBuffer                             commandBuffer,
826     VkQueryPool                                 queryPool,
827     uint32_t                                    query,
828     VkQueryControlFlags                         flags,
829     uint32_t                                    index)
830 {
831    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
832    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
833    struct anv_address query_addr = anv_query_address(pool, query);
834 
835    struct gen_mi_builder b;
836    gen_mi_builder_init(&b, &cmd_buffer->batch);
837 
838    switch (pool->type) {
839    case VK_QUERY_TYPE_OCCLUSION:
840       emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
841       break;
842 
843    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
844       /* TODO: This might only be necessary for certain stats */
845       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
846          pc.CommandStreamerStallEnable = true;
847          pc.StallAtPixelScoreboard = true;
848       }
849 
850       uint32_t statistics = pool->pipeline_statistics;
851       uint32_t offset = 8;
852       while (statistics) {
853          uint32_t stat = u_bit_scan(&statistics);
854          emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
855          offset += 16;
856       }
857       break;
858    }
859 
860    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
861       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
862          pc.CommandStreamerStallEnable = true;
863          pc.StallAtPixelScoreboard = true;
864       }
865       emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
866       break;
867 
868 #if GEN_GEN >= 8
869    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
870       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
871          pc.CommandStreamerStallEnable = true;
872          pc.StallAtPixelScoreboard = true;
873       }
874       cmd_buffer->perf_query_pool = pool;
875 
876       /* We know the bottom bits of the address are 0s which match what we
877        * want in the MI_RPC packet.
878        */
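      /* The computation below produces (pass 0 address +
       * ANV_PERF_QUERY_OFFSET_REG) in a GPR; gen_mi_store_address() then has
       * the command streamer write that value into the MI_RPC packet's
       * MemoryAddress field just before the packet executes, which is why a
       * self-modifying-batch barrier is required.
       */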
879       struct gen_mi_value mi_rpc_write_offset =
880          gen_mi_iadd(
881             &b,
882             gen_mi_imm(
883                gen_canonical_address(
884                   pool->bo->offset +
885                   khr_perf_query_oa_offset(pool, query, 0 /* pass */, false))),
886             gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
887       struct gen_mi_address_token mi_rpc_addr_dest =
888          gen_mi_store_address(&b, mi_rpc_write_offset);
889       gen_mi_self_mod_barrier(&b);
890 
891       void *mi_rpc_dws =
892          anv_batch_emitn(&cmd_buffer->batch,
893                          GENX(MI_REPORT_PERF_COUNT_length),
894                          GENX(MI_REPORT_PERF_COUNT),
895                          .MemoryAddress = query_addr /* Will be overwritten */ );
896       _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest,
897                                     mi_rpc_dws +
898                                     GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
899       break;
900    }
901 #endif
902 
903    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
904       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
905          pc.CommandStreamerStallEnable = true;
906          pc.StallAtPixelScoreboard = true;
907       }
908       anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
909          rpc.MemoryAddress =
910             anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
911       }
912 #if GEN_GEN < 9
913       gen_mi_store(&b,
914                    gen_mi_mem32(anv_address_add(query_addr,
915                                                 intel_perf_rpstart_offset(false))),
916                    gen_mi_reg32(GENX(RPSTAT1_num)));
917 #else
918       gen_mi_store(&b,
919                    gen_mi_mem32(anv_address_add(query_addr,
920                                                 intel_perf_rpstart_offset(false))),
921                    gen_mi_reg32(GENX(RPSTAT0_num)));
922 #endif
923 #if GEN_GEN >= 8 && GEN_GEN <= 11
924       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
925                                                     intel_perf_counter(false))),
926                    gen_mi_reg64(GENX(PERFCNT1_num)));
927       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
928                                                     intel_perf_counter(false) + 8)),
929                    gen_mi_reg64(GENX(PERFCNT2_num)));
930 #endif
931       break;
932    }
933 
934    default:
935       unreachable("");
936    }
937 }
938 
939 void genX(CmdEndQuery)(
940     VkCommandBuffer                             commandBuffer,
941     VkQueryPool                                 queryPool,
942     uint32_t                                    query)
943 {
944    genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
945 }
946 
947 void genX(CmdEndQueryIndexedEXT)(
948     VkCommandBuffer                             commandBuffer,
949     VkQueryPool                                 queryPool,
950     uint32_t                                    query,
951     uint32_t                                    index)
952 {
953    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
954    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
955    struct anv_address query_addr = anv_query_address(pool, query);
956 
957    struct gen_mi_builder b;
958    gen_mi_builder_init(&b, &cmd_buffer->batch);
959 
960    switch (pool->type) {
961    case VK_QUERY_TYPE_OCCLUSION:
962       emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
963       emit_query_pc_availability(cmd_buffer, query_addr, true);
964       break;
965 
966    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
967       /* TODO: This might only be necessary for certain stats */
968       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
969          pc.CommandStreamerStallEnable = true;
970          pc.StallAtPixelScoreboard = true;
971       }
972 
973       uint32_t statistics = pool->pipeline_statistics;
974       uint32_t offset = 16;
975       while (statistics) {
976          uint32_t stat = u_bit_scan(&statistics);
977          emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
978          offset += 16;
979       }
980 
981       emit_query_mi_availability(&b, query_addr, true);
982       break;
983    }
984 
985    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
986       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
987          pc.CommandStreamerStallEnable = true;
988          pc.StallAtPixelScoreboard = true;
989       }
990 
991       emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
992       emit_query_mi_availability(&b, query_addr, true);
993       break;
994 
995 #if GEN_GEN >= 8
996    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
997       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
998          pc.CommandStreamerStallEnable = true;
999          pc.StallAtPixelScoreboard = true;
1000       }
1001 
1002       /* We know the bottom bits of the address are 0s which match what we
1003        * want in the MI_RPC/MI_SDI packets.
1004        */
1005       struct gen_mi_value mi_rpc_write_offset =
1006          gen_mi_iadd(
1007             &b,
1008             gen_mi_imm(
1009                gen_canonical_address(
1010                   pool->bo->offset +
1011                   khr_perf_query_oa_offset(pool, query, 0 /* pass*/, true))),
1012             gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
1013       struct gen_mi_value availability_write_offset =
1014          gen_mi_iadd(
1015             &b,
1016             gen_mi_imm(
1017                gen_canonical_address(
1018                   pool->bo->offset +
1019                   khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
1020             gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
1021 
1022       struct gen_mi_address_token mi_rpc_addr_dest =
1023          gen_mi_store_address(&b, mi_rpc_write_offset);
1024       struct gen_mi_address_token availability_addr_dest =
1025          gen_mi_store_address(&b, availability_write_offset);
1026       gen_mi_self_mod_barrier(&b);
1027 
1028       void *mi_rpc_dws =
1029          anv_batch_emitn(&cmd_buffer->batch,
1030                          GENX(MI_REPORT_PERF_COUNT_length),
1031                          GENX(MI_REPORT_PERF_COUNT),
1032                          .MemoryAddress = query_addr /* Will be overwritten */ );
1033       _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest,
1034                                     mi_rpc_dws +
1035                                     GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
1036 
1037       void *availability_dws =
1038          anv_batch_emitn(&cmd_buffer->batch,
1039                          GENX(MI_STORE_DATA_IMM_length),
1040                          GENX(MI_STORE_DATA_IMM),
1041                          .ImmediateData = true);
1042       _gen_mi_resolve_address_token(&b, availability_addr_dest,
1043                                     availability_dws +
1044                                     GENX(MI_STORE_DATA_IMM_Address_start) / 8);
1045       break;
1046    }
1047 #endif
1048 
1049    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
1050       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1051          pc.CommandStreamerStallEnable = true;
1052          pc.StallAtPixelScoreboard = true;
1053       }
1054       uint32_t marker_offset = intel_perf_marker_offset();
1055       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
1056                    gen_mi_imm(cmd_buffer->intel_perf_marker));
1057 #if GEN_GEN >= 8 && GEN_GEN <= 11
1058       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
1059                    gen_mi_reg64(GENX(PERFCNT1_num)));
1060       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
1061                    gen_mi_reg64(GENX(PERFCNT2_num)));
1062 #endif
1063 #if GEN_GEN < 9
1064       gen_mi_store(&b,
1065                    gen_mi_mem32(anv_address_add(query_addr,
1066                                                 intel_perf_rpstart_offset(true))),
1067                    gen_mi_reg32(GENX(RPSTAT1_num)));
1068 #else
1069       gen_mi_store(&b,
1070                    gen_mi_mem32(anv_address_add(query_addr,
1071                                                 intel_perf_rpstart_offset(true))),
1072                    gen_mi_reg32(GENX(RPSTAT0_num)));
1073 #endif
1074       /* Position the last OA snapshot at the beginning of the query so that
1075        * we can tell whether it's ready.
1076        */
1077       anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
1078          rpc.MemoryAddress = anv_address_add(query_addr,
1079                                              intel_perf_mi_rpc_offset(true));
1080          rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
1081       }
1082       emit_query_mi_availability(&b, query_addr, true);
1083       break;
1084    }
1085 
1086    default:
1087       unreachable("");
1088    }
1089 
1090    /* When multiview is active the spec requires that N consecutive query
1091     * indices are used, where N is the number of active views in the subpass.
1092     * The spec allows us to write the results to only one of the queries,
1093     * but we still need to manage result availability for all the query indices.
1094     * Since we only emit a single query for all active views in the
1095     * first index, mark the other query indices as being already available
1096     * with result 0.
1097     */
1098    if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1099       const uint32_t num_queries =
1100          util_bitcount(cmd_buffer->state.subpass->view_mask);
1101       if (num_queries > 1)
1102          emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1103    }
1104 }
1105 
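/* MMIO offset of the command streamer timestamp register, read directly via
 * the MI builder for top-of-pipe timestamps below.
 */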
1106 #define TIMESTAMP 0x2358
1107 
1108 void genX(CmdWriteTimestamp)(
1109     VkCommandBuffer                             commandBuffer,
1110     VkPipelineStageFlagBits                     pipelineStage,
1111     VkQueryPool                                 queryPool,
1112     uint32_t                                    query)
1113 {
1114    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1115    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1116    struct anv_address query_addr = anv_query_address(pool, query);
1117 
1118    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1119 
1120    struct gen_mi_builder b;
1121    gen_mi_builder_init(&b, &cmd_buffer->batch);
1122 
1123    switch (pipelineStage) {
1124    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1125       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
1126                        gen_mi_reg64(TIMESTAMP));
1127       break;
1128 
1129    default:
1130       /* Everything else is bottom-of-pipe */
1131       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
1132       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1133 
1134       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1135          pc.DestinationAddressType  = DAT_PPGTT;
1136          pc.PostSyncOperation       = WriteTimestamp;
1137          pc.Address                 = anv_address_add(query_addr, 8);
1138 
1139          if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
1140             pc.CommandStreamerStallEnable = true;
1141       }
1142       break;
1143    }
1144 
1145    emit_query_pc_availability(cmd_buffer, query_addr, true);
1146 
1147    /* When multiview is active the spec requires that N consecutive query
1148     * indices are used, where N is the number of active views in the subpass.
1149     * The spec allows us to write the results to only one of the queries,
1150     * but we still need to manage result availability for all the query indices.
1151     * Since we only emit a single query for all active views in the
1152     * first index, mark the other query indices as being already available
1153     * with result 0.
1154     */
1155    if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1156       const uint32_t num_queries =
1157          util_bitcount(cmd_buffer->state.subpass->view_mask);
1158       if (num_queries > 1)
1159          emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1160    }
1161 }
1162 
1163 #if GEN_GEN > 7 || GEN_IS_HASWELL
1164 
1165 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1166 
1167 #define MI_PREDICATE_SRC0    0x2400
1168 #define MI_PREDICATE_SRC1    0x2408
1169 #define MI_PREDICATE_RESULT  0x2418
1170 
1171 /**
1172  * Writes the results of a query to dst_addr if the value at poll_addr is equal
1173  * to the reference value.
1174  */
1175 static void
1176 gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
1177                             struct gen_mi_builder *b,
1178                             struct anv_address poll_addr,
1179                             struct anv_address dst_addr,
1180                             uint64_t ref_value,
1181                             VkQueryResultFlags flags,
1182                             uint32_t value_index,
1183                             struct gen_mi_value query_result)
1184 {
1185    gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr));
1186    gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value));
1187    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1188       mip.LoadOperation    = LOAD_LOAD;
1189       mip.CombineOperation = COMBINE_SET;
1190       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1191    }
1192 
1193    if (flags & VK_QUERY_RESULT_64_BIT) {
1194       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1195       gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result);
1196    } else {
1197       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1198       gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result);
1199    }
1200 }
1201 
1202 #endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */
1203 
1204 static void
1205 gpu_write_query_result(struct gen_mi_builder *b,
1206                        struct anv_address dst_addr,
1207                        VkQueryResultFlags flags,
1208                        uint32_t value_index,
1209                        struct gen_mi_value query_result)
1210 {
1211    if (flags & VK_QUERY_RESULT_64_BIT) {
1212       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1213       gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
1214    } else {
1215       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1216       gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
1217    }
1218 }
1219 
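/* Returns (value at addr + 8) - (value at addr), i.e. end - begin for a
 * query value pair starting at addr.
 */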
1220 static struct gen_mi_value
1221 compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
1222 {
1223    return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
1224                          gen_mi_mem64(anv_address_add(addr, 0)));
1225 }
1226 
1227 void genX(CmdCopyQueryPoolResults)(
1228     VkCommandBuffer                             commandBuffer,
1229     VkQueryPool                                 queryPool,
1230     uint32_t                                    firstQuery,
1231     uint32_t                                    queryCount,
1232     VkBuffer                                    destBuffer,
1233     VkDeviceSize                                destOffset,
1234     VkDeviceSize                                destStride,
1235     VkQueryResultFlags                          flags)
1236 {
1237    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1238    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1239    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1240 
1241    struct gen_mi_builder b;
1242    gen_mi_builder_init(&b, &cmd_buffer->batch);
1243    struct gen_mi_value result;
1244 
1245    /* If render target writes are ongoing, request a render target cache flush
1246     * to ensure proper ordering of the commands from the 3d pipe and the
1247     * command streamer.
1248     */
1249    if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
1250       cmd_buffer->state.pending_pipe_bits |=
1251          ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
1252    }
1253 
1254    if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
1255        (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
1256        /* Occlusion & timestamp queries are written using a PIPE_CONTROL, and
1257         * because we're about to copy their values with MI commands, we need to
1258         * stall the command streamer to make sure the PIPE_CONTROL writes have
1259         * landed; otherwise we could see inconsistent values & availability.
1260         *
1261         *  From the Vulkan spec:
1262         *
1263         *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1264         *     previous uses of vkCmdResetQueryPool in the same queue, without
1265         *     any additional synchronization."
1266         */
1267        pool->type == VK_QUERY_TYPE_OCCLUSION ||
1268        pool->type == VK_QUERY_TYPE_TIMESTAMP) {
1269       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
1270       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1271    }
1272 
1273    struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
1274    for (uint32_t i = 0; i < queryCount; i++) {
1275       struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
1276       uint32_t idx = 0;
1277       switch (pool->type) {
1278       case VK_QUERY_TYPE_OCCLUSION:
1279          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1280 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1281          /* Like in the case of vkGetQueryPoolResults, if the query is
1282           * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
1283           * conservatively write 0 as the query result. If the
1284           * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
1285           */
1286          gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1287                1 /* available */, flags, idx, result);
1288          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
1289             gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1290                   0 /* unavailable */, flags, idx, gen_mi_imm(0));
1291          }
1292          idx++;
1293 #else /* GEN_GEN < 8 && !GEN_IS_HASWELL */
1294          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1295 #endif
1296          break;
1297 
1298       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1299          uint32_t statistics = pool->pipeline_statistics;
1300          while (statistics) {
1301             uint32_t stat = u_bit_scan(&statistics);
1302 
1303             result = compute_query_result(&b, anv_address_add(query_addr,
1304                                                               idx * 16 + 8));
1305 
1306             /* WaDividePSInvocationCountBy4:HSW,BDW */
1307             if ((cmd_buffer->device->info.gen == 8 ||
1308                  cmd_buffer->device->info.is_haswell) &&
1309                 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1310                result = gen_mi_ushr32_imm(&b, result, 2);
1311             }
1312 
1313             gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1314          }
1315          assert(idx == util_bitcount(pool->pipeline_statistics));
1316          break;
1317       }
1318 
1319       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1320          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1321          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1322          result = compute_query_result(&b, anv_address_add(query_addr, 24));
1323          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1324          break;
1325 
1326       case VK_QUERY_TYPE_TIMESTAMP:
1327          result = gen_mi_mem64(anv_address_add(query_addr, 8));
1328          gpu_write_query_result(&b, dest_addr, flags, 0, result);
1329          break;
1330 
1331 #if GEN_GEN >= 8
1332       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1333          unreachable("Copy KHR performance query results not implemented");
1334          break;
1335 #endif
1336 
1337       default:
1338          unreachable("unhandled query type");
1339       }
1340 
1341       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1342          gpu_write_query_result(&b, dest_addr, flags, idx,
1343                                 gen_mi_mem64(query_addr));
1344       }
1345 
1346       dest_addr = anv_address_add(dest_addr, destStride);
1347    }
1348 }
1349 
1350 #else
1351 void genX(CmdCopyQueryPoolResults)(
1352     VkCommandBuffer                             commandBuffer,
1353     VkQueryPool                                 queryPool,
1354     uint32_t                                    firstQuery,
1355     uint32_t                                    queryCount,
1356     VkBuffer                                    destBuffer,
1357     VkDeviceSize                                destOffset,
1358     VkDeviceSize                                destStride,
1359     VkQueryResultFlags                          flags)
1360 {
1361    anv_finishme("Queries not yet supported on Ivy Bridge");
1362 }
1363 #endif
1364