1 /*
2  * Copyright 2015 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "si_build_pm4.h"
26 #include "si_query.h"
27 #include "util/u_memory.h"
28 
29 #include "ac_perfcounter.h"
30 
/* One set of counters that shares a hardware block and one SE/instance
 * subdivision; all counters in a group are programmed under a single
 * GRBM_GFX_INDEX setting. Groups form a singly-linked list per query. */
struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;       /* shader engine index, or -1 for broadcast to all SEs */
   int instance; /* block instance index, or -1 for broadcast */
   unsigned num_counters;
   unsigned selectors[AC_QUERY_MAX_COUNTERS]; /* event selector per counter */
};
41 
/* Maps one user-visible counter to its location(s) in the 64-bit result
 * buffer. A broadcast-sampled counter occupies several slots (one per
 * SE/instance) that are summed when results are read back. */
struct si_query_counter {
   unsigned base;   /* index of the first result slot, in uint64s */
   unsigned qwords; /* number of slots to accumulate */
   unsigned stride; /* in uint64s */
};
47 
/* A batch performance-counter query. */
struct si_query_pc {
   struct si_query b; /* base class; must be first (code casts si_query* to si_query_pc*) */
   struct si_query_buffer buffer; /* GPU memory receiving raw counter readbacks */

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders; /* SQ shader-stage mask; 0 = no masking requested */
   unsigned num_counters;
   struct si_query_counter *counters; /* one per user-supplied query type */
   struct si_query_group *groups;     /* list of hardware groups to program */
};
60 
si_pc_emit_instance(struct si_context * sctx,int se,int instance)61 static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
62 {
63    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
64    unsigned value = S_030800_SH_BROADCAST_WRITES(1);
65 
66    if (se >= 0) {
67       value |= S_030800_SE_INDEX(se);
68    } else {
69       value |= S_030800_SE_BROADCAST_WRITES(1);
70    }
71 
72    if (sctx->chip_class >= GFX10) {
73       /* TODO: Expose counters from each shader array separately if needed. */
74       value |= S_030800_SA_BROADCAST_WRITES(1);
75    }
76 
77    if (instance >= 0) {
78       value |= S_030800_INSTANCE_INDEX(instance);
79    } else {
80       value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
81    }
82 
83    radeon_begin(cs);
84    radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
85    radeon_end();
86 }
87 
si_pc_emit_shaders(struct si_context * sctx,unsigned shaders)88 static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
89 {
90    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
91 
92    radeon_begin(cs);
93    radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false);
94    radeon_emit(shaders & 0x7f);
95    radeon_emit(0xffffffff);
96    radeon_end();
97 }
98 
si_pc_emit_select(struct si_context * sctx,struct ac_pc_block * block,unsigned count,unsigned * selectors)99 static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
100                               unsigned *selectors)
101 {
102    struct ac_pc_block_base *regs = block->b->b;
103    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
104    unsigned idx;
105 
106    assert(count <= regs->num_counters);
107 
108    /* Fake counters. */
109    if (!regs->select0)
110       return;
111 
112    radeon_begin(cs);
113 
114    for (idx = 0; idx < count; ++idx) {
115       radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false);
116       radeon_emit(selectors[idx] | regs->select_or);
117    }
118 
119    for (idx = 0; idx < regs->num_spm_counters; idx++) {
120       radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false);
121       radeon_emit(0);
122    }
123 
124    radeon_end();
125 }
126 
/* Emit commands that reset the perfmon state and start counting.
 *
 * The dword at "va" is used as a fence: it is set to 1 here and later
 * written to 0 and waited on by si_pc_emit_stop. */
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   /* Initialize the fence at va to 1 (the "not yet stopped" value). */
   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   /* Reset all counters before starting a fresh sample. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}
143 
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here.
 *
 * Emits a bottom-of-pipe fence that writes 0 to "va" and waits for it (so
 * all prior GPU work has drained), then samples and stops the counters. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   /* Fence: va was set to 1 in si_pc_emit_start; write 0 at end-of-pipe
    * and wait until it reads back as 0. */
   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   /* Latch current counter values, then stop counting. */
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      /* On chips with never_stop_sq_perf_counters set, keep counting instead
       * of stopping — NOTE(review): presumably a hardware workaround; the
       * SAMPLE event above already latched the values. */
      S_036020_PERFMON_STATE(sctx->screen->info.never_stop_sq_perf_counters ?
                                V_036020_CP_PERFMON_STATE_START_COUNTING :
                                V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
      S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}
167 
/* Emit COPY_DATA packets that copy the current values of "count" counters of
 * "block" to GPU memory at "va", one 64-bit slot per counter.
 *
 * Must run after GRBM_GFX_INDEX was pointed at a single SE/instance
 * (si_pc_emit_instance) so per-instance values are read. */
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8; /* consecutive counters are 8 bytes (LO+HI) apart */

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         /* Some blocks list their counter registers explicitly instead of
          * using a regular stride from counter0_lo. */
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2); /* perf-register source address is in dwords */
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters: write immediate zeros so the result buffer layout
       * stays uniform for blocks with no readable registers. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}
209 
si_pc_query_destroy(struct si_context * sctx,struct si_query * squery)210 static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
211 {
212    struct si_query_pc *query = (struct si_query_pc *)squery;
213 
214    while (query->groups) {
215       struct si_query_group *group = query->groups;
216       query->groups = group->next;
217       FREE(group);
218    }
219 
220    FREE(query->counters);
221 
222    si_query_buffer_destroy(sctx->screen, &query->buffer);
223    FREE(query);
224 }
225 
/* Toggle the RLC perfmon clock state while counters are active (GFX8+ only;
 * older chips have no such control and this is a no-op there).
 *
 * NOTE(review): the "cs" parameter is unused — radeon_begin always targets
 * sctx->gfx_cs directly; confirm whether callers ever pass a different CS. */
void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
   radeon_begin(&sctx->gfx_cs);

   if (sctx->chip_class >= GFX10) {
      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->chip_class >= GFX8) {
      /* The register moved between GFX8 and GFX10; same field semantics. */
      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   radeon_end();
}
239 
si_pc_query_resume(struct si_context * sctx,struct si_query * squery)240 static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
241 /*
242                                    struct si_query_hw *hwquery,
243                                    struct si_resource *buffer, uint64_t va)*/
244 {
245    struct si_query_pc *query = (struct si_query_pc *)squery;
246    int current_se = -1;
247    int current_instance = -1;
248 
249    if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
250       return;
251    si_need_gfx_cs_space(sctx, 0);
252 
253    if (query->shaders)
254       si_pc_emit_shaders(sctx, query->shaders);
255 
256    si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);
257 
258    for (struct si_query_group *group = query->groups; group; group = group->next) {
259       struct ac_pc_block *block = group->block;
260 
261       if (group->se != current_se || group->instance != current_instance) {
262          current_se = group->se;
263          current_instance = group->instance;
264          si_pc_emit_instance(sctx, group->se, group->instance);
265       }
266 
267       si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
268    }
269 
270    if (current_se != -1 || current_instance != -1)
271       si_pc_emit_instance(sctx, -1, -1);
272 
273    uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
274    si_pc_emit_start(sctx, query->buffer.buf, va);
275 }
276 
/* Stop counting and read every counter value back into the result buffer.
 * Advances buffer.results_end past this sample so a subsequent resume
 * appends a new sample instead of overwriting. */
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   /* Resume may have failed to allocate the buffer; nothing to do then. */
   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   /* Read each group once per targeted SE/instance. Broadcast groups
    * (se/instance < 0) iterate over all SEs and/or instances, matching the
    * result layout computed in si_create_batch_query. */
   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   /* Restore GRBM broadcast mode. */
   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}
312 
si_pc_query_begin(struct si_context * ctx,struct si_query * squery)313 static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
314 {
315    struct si_query_pc *query = (struct si_query_pc *)squery;
316 
317    si_query_buffer_reset(ctx, &query->buffer);
318 
319    list_addtail(&query->b.active_list, &ctx->active_queries);
320    ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
321 
322    si_pc_query_resume(ctx, squery);
323 
324    return true;
325 }
326 
si_pc_query_end(struct si_context * ctx,struct si_query * squery)327 static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
328 {
329    struct si_query_pc *query = (struct si_query_pc *)squery;
330 
331    si_pc_query_suspend(ctx, squery);
332 
333    list_del(&squery->active_list);
334    ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
335 
336    return query->buffer.buf != NULL;
337 }
338 
/* Accumulate one sample from the GPU result buffer into the user-visible
 * result array (one batch entry per user counter).
 *
 * Each counter occupies "qwords" 64-bit slots (one per SE/instance when it
 * was sampled in broadcast mode), spaced "stride" slots apart; the slots are
 * summed.
 *
 * NOTE(review): only the low 32 bits of each 64-bit slot are added (uint32_t
 * below) — presumably the hardware counters are 32 bits wide even though
 * they are copied as 64-bit values; confirm before changing. */
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}
354 
/* Map all result buffers of the query and sum every stored sample into
 * "result". Returns false if a buffer could not be mapped (including the
 * non-blocking case when wait == false). */
static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   /* Start from zero; samples are accumulated below. */
   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   /* Walk the chain of result buffers (current first, older via ->previous). */
   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      /* If the CS was already flushed, map directly through the winsys and
       * skip the extra flush logic in si_buffer_map. */
      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      /* Accumulate every sample of size result_size stored in this buffer. */
      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}
383 
/* Vtable hooking perfcounter batch queries into the common si_query path. */
static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};
393 
/* Find or create the si_query_group for (block, sub_gid).
 *
 * sub_gid encodes the block's SE/instance/shader-stage subdivision; it is
 * decoded here into group->se, group->instance, and (for shader blocks)
 * query->shaders. Returns NULL on allocation failure or when the requested
 * shader stage is incompatible with counters already added to the query. */
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   /* Reuse an existing group if the same block/subdivision was already
    * requested by another counter. */
   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      /* The shader stage is encoded in the high part of sub_gid; split it
       * off before the SE/instance decode below. */
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      /* All shader-qualified counters in one query must agree on the mask
       * (SQ_PERFCOUNTER_CTRL is programmed once per query). */
      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1; /* broadcast across all SEs */
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1; /* broadcast across all instances */
   }

   /* Prepend the new group to the query's list. */
   group->next = query->groups;
   query->groups = group;

   return group;
}
459 
si_create_batch_query(struct pipe_context * ctx,unsigned num_queries,unsigned * query_types)460 struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
461                                          unsigned *query_types)
462 {
463    struct si_screen *screen = (struct si_screen *)ctx->screen;
464    struct si_perfcounters *pc = screen->perfcounters;
465    struct ac_pc_block *block;
466    struct si_query_group *group;
467    struct si_query_pc *query;
468    unsigned base_gid, sub_gid, sub_index;
469    unsigned i, j;
470 
471    if (!pc)
472       return NULL;
473 
474    query = CALLOC_STRUCT(si_query_pc);
475    if (!query)
476       return NULL;
477 
478    query->b.ops = &batch_query_ops;
479 
480    query->num_counters = num_queries;
481 
482    /* Collect selectors per group */
483    for (i = 0; i < num_queries; ++i) {
484       unsigned sub_gid;
485 
486       if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
487          goto error;
488 
489       block =
490          ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
491       if (!block)
492          goto error;
493 
494       sub_gid = sub_index / block->b->selectors;
495       sub_index = sub_index % block->b->selectors;
496 
497       group = get_group_state(screen, query, block, sub_gid);
498       if (!group)
499          goto error;
500 
501       if (group->num_counters >= block->b->b->num_counters) {
502          fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
503          goto error;
504       }
505       group->selectors[group->num_counters] = sub_index;
506       ++group->num_counters;
507    }
508 
509    /* Compute result bases and CS size per group */
510    query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
511    query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
512 
513    i = 0;
514    for (group = query->groups; group; group = group->next) {
515       struct ac_pc_block *block = group->block;
516       unsigned read_dw;
517       unsigned instances = 1;
518 
519       if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
520          instances = screen->info.max_se;
521       if (group->instance < 0)
522          instances *= block->num_instances;
523 
524       group->result_base = i;
525       query->result_size += sizeof(uint64_t) * instances * group->num_counters;
526       i += instances * group->num_counters;
527 
528       read_dw = 6 * group->num_counters;
529       query->b.num_cs_dw_suspend += instances * read_dw;
530       query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
531    }
532 
533    if (query->shaders) {
534       if (query->shaders == AC_PC_SHADERS_WINDOWING)
535          query->shaders = 0xffffffff;
536    }
537 
538    /* Map user-supplied query array to result indices */
539    query->counters = CALLOC(num_queries, sizeof(*query->counters));
540    for (i = 0; i < num_queries; ++i) {
541       struct si_query_counter *counter = &query->counters[i];
542       struct ac_pc_block *block;
543 
544       block =
545          ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
546 
547       sub_gid = sub_index / block->b->selectors;
548       sub_index = sub_index % block->b->selectors;
549 
550       group = get_group_state(screen, query, block, sub_gid);
551       assert(group != NULL);
552 
553       for (j = 0; j < group->num_counters; ++j) {
554          if (group->selectors[j] == sub_index)
555             break;
556       }
557 
558       counter->base = group->result_base + j;
559       counter->stride = group->num_counters;
560 
561       counter->qwords = 1;
562       if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
563          counter->qwords = screen->info.max_se;
564       if (group->instance < 0)
565          counter->qwords *= block->num_instances;
566    }
567 
568    return (struct pipe_query *)query;
569 
570 error:
571    si_pc_query_destroy((struct si_context *)ctx, &query->b);
572    return NULL;
573 }
574 
/* Gallium driver-query hook: describe perfcounter query "index".
 * With info == NULL, returns the total number of perfcounter queries
 * instead; returns 0 when perfcounters are unavailable or index is bad. */
int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      /* Count selectors across all groups of all blocks. */
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
   if (!block)
      return 0;

   /* Selector name tables are built lazily on first use. */
   if (!block->selector_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   /* Hide all but the first selector of each group from query listings. */
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}
614 
si_get_perfcounter_group_info(struct si_screen * screen,unsigned index,struct pipe_driver_query_group_info * info)615 int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
616                                   struct pipe_driver_query_group_info *info)
617 {
618    struct si_perfcounters *pc = screen->perfcounters;
619    struct ac_pc_block *block;
620 
621    if (!pc)
622       return 0;
623 
624    if (!info)
625       return pc->base.num_groups;
626 
627    block = ac_lookup_group(&pc->base, &index);
628    if (!block)
629       return 0;
630 
631    if (!block->group_names) {
632       if (!ac_init_block_names(&screen->info, &pc->base, block))
633          return 0;
634    }
635    info->name = block->group_names + index * block->group_name_stride;
636    info->num_queries = block->b->selectors;
637    info->max_active_queries = block->b->b->num_counters;
638    return 1;
639 }
640 
si_destroy_perfcounters(struct si_screen * screen)641 void si_destroy_perfcounters(struct si_screen *screen)
642 {
643    struct si_perfcounters *pc = screen->perfcounters;
644 
645    if (!pc)
646       return;
647 
648    ac_destroy_perfcounters(&pc->base);
649    FREE(pc);
650    screen->perfcounters = NULL;
651 }
652 
si_init_perfcounters(struct si_screen * screen)653 void si_init_perfcounters(struct si_screen *screen)
654 {
655    bool separate_se, separate_instance;
656 
657    separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
658    separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
659 
660    screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
661    if (!screen->perfcounters)
662       return;
663 
664    screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
665    screen->perfcounters->num_instance_cs_dwords = 3;
666 
667    if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
668                              &screen->perfcounters->base)) {
669       si_destroy_perfcounters(screen);
670    }
671 }
672