1 /*
2  * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 /* NOTE: see https://github.com/freedreno/freedreno/wiki/A5xx-Queries */
28 
29 #include "freedreno_query_acc.h"
30 #include "freedreno_resource.h"
31 
32 #include "fd5_context.h"
33 #include "fd5_emit.h"
34 #include "fd5_format.h"
35 #include "fd5_query.h"
36 
/* Layout of one query sample slot in the query's backing BO.  The CP/RB
 * write 'start'/'stop' snapshots directly into this memory and CP_MEM_TO_MEM
 * accumulates into 'result', so the layout must match what the command
 * stream below emits (PACKED: no compiler-inserted padding).
 */
struct PACKED fd5_query_sample {
   uint64_t start;    /* value snapshotted at query resume */
   uint64_t result;   /* accumulated (stop - start) across resume/pause pairs */
   uint64_t stop;     /* value snapshotted at query pause */
};
42 
/* Expands to the (bo, offset, ...) argument tuple used with OUT_RELOC(),
 * addressing a single field of the idx'th element of an array of
 * fd5_query_sample in the query's backing buffer.  The trailing "0, 0"
 * are the remaining reloc arguments (presumably or-value/shift — confirm
 * against OUT_RELOC's signature).
 *
 * Note: 'idx' is parenthesized so callers may pass expressions without
 * precedence surprises (eg. "i + 1").
 */
#define query_sample_idx(aq, idx, field)                                       \
   fd_resource((aq)->prsc)->bo,                                                \
      ((idx) * sizeof(struct fd5_query_sample)) +                              \
         offsetof(struct fd5_query_sample, field),                             \
      0, 0

/* offset of a single field of fd5_query_sample: */
#define query_sample(aq, field) query_sample_idx(aq, 0, field)
52 
53 /*
54  * Occlusion Query:
55  *
56  * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
57  * interpret results
58  */
59 
/* Start (or restart after a batch flush) an occlusion query: arm the RB
 * sample-counter copy and snapshot the current sample count into 'start'.
 */
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   /* Tell the RB to copy its sample counter to memory on ZPASS_DONE: */
   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   /* Destination address (lo/hi pair) for the copy: the 'start' field: */
   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
   OUT_RELOC(ring, query_sample(aq, start));

   /* Trigger the snapshot: */
   fd5_event_write(batch, ring, ZPASS_DONE, false);
   fd_reset_wfi(batch);

   /* Track # of active samples-passed queries for the context: */
   fd5_context(batch->ctx)->samples_passed_queries++;
}
76 
/* Stop an occlusion query at a batch boundary: snapshot the sample count
 * into 'stop', wait for the snapshot to land, and accumulate the delta
 * into 'result'.
 */
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   /* Seed 'stop' (64b, two dwords) with a sentinel value so we can detect
    * below when the ZPASS_DONE copy has actually been written:
    */
   OUT_PKT7(ring, CP_MEM_WRITE, 4);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);

   /* Ensure the sentinel write is visible before we poll on it: */
   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

   /* Arm the RB sample-counter copy, targeting the 'stop' field: */
   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
   OUT_RELOC(ring, query_sample(aq, stop));

   fd5_event_write(batch, ring, ZPASS_DONE, false);
   fd_reset_wfi(batch);

   /* Stall the CP until 'stop' no longer holds the sentinel, ie. until the
    * ZPASS_DONE snapshot has landed.  (The 0x14/0x10 magics are carried
    * over from downstream blobs; exact bitfield meaning unconfirmed.)
    */
   OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
   OUT_RING(ring, 0x00000014); // XXX
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0x00000010); // XXX

   /* result += stop - start: (DOUBLE = 64b operands, NEG_C negates srcC,
    * so dst = srcA + srcB - srcC)
    */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */

   fd5_context(batch->ctx)->samples_passed_queries--;
}
115 
116 static void
occlusion_counter_result(struct fd_acc_query * aq,void * buf,union pipe_query_result * result)117 occlusion_counter_result(struct fd_acc_query *aq, void *buf,
118                          union pipe_query_result *result)
119 {
120    struct fd5_query_sample *sp = buf;
121    result->u64 = sp->result;
122 }
123 
124 static void
occlusion_predicate_result(struct fd_acc_query * aq,void * buf,union pipe_query_result * result)125 occlusion_predicate_result(struct fd_acc_query *aq, void *buf,
126                            union pipe_query_result *result)
127 {
128    struct fd5_query_sample *sp = buf;
129    result->b = !!sp->result;
130 }
131 
/* All three occlusion query types share the same command-stream emission;
 * they differ only in how the accumulated count is interpreted at
 * get-result time:
 */
static const struct fd_acc_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_counter_result,
};

static const struct fd_acc_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
};

/* Conservative predicate uses the exact counter too (no cheaper HW path
 * is used here), so it shares occlusion_predicate's result handler:
 */
static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
};
155 
156 /*
157  * Timestamp Queries:
158  */
159 
/* Snapshot a GPU timestamp into 'start' when the query is resumed.  The
 * TIMESTAMP flag asks the CP to write the timer value (per ticks_to_ns()
 * below, the 19.2MHz always-on RBBM timer) once RB_DONE_TS fires.
 */
static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, start));
   OUT_RING(ring, 0x00000000);

   fd_reset_wfi(batch);
}
173 
/* Snapshot a GPU timestamp into 'stop' at pause, then accumulate the
 * elapsed tick delta into 'result'.
 */
static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0x00000000);

   /* Wait for the timestamp write to land before the accumulate below
    * reads it:
    */
   fd_reset_wfi(batch);
   fd_wfi(batch, ring);

   /* result += stop - start: (DOUBLE = 64b operands, NEG_C negates srcC) */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}
196 
/* Convert ticks of the 19.2MHz always-on rbbm timer to nanoseconds.
 *
 * Takes a 64b tick count: both callers pass the accumulated 64b
 * sample value, which a 32b parameter would silently truncate, and
 * the multiply below must not be performed in 32b arithmetic (which
 * could overflow before widening).  1e9 / 19.2e6 reduces exactly to
 * 625 / 12, so multiply first for an exact conversion (overflows only
 * after ~2.9e16 ticks, i.e. decades of uptime).
 *
 * TODO we should probably query this value from kernel..
 */
static uint64_t
ticks_to_ns(uint64_t ts)
{
   return ts * 625 / 12;
}
206 
207 static void
time_elapsed_accumulate_result(struct fd_acc_query * aq,void * buf,union pipe_query_result * result)208 time_elapsed_accumulate_result(struct fd_acc_query *aq, void *buf,
209                                union pipe_query_result *result)
210 {
211    struct fd5_query_sample *sp = buf;
212    result->u64 = ticks_to_ns(sp->result);
213 }
214 
215 static void
timestamp_accumulate_result(struct fd_acc_query * aq,void * buf,union pipe_query_result * result)216 timestamp_accumulate_result(struct fd_acc_query *aq, void *buf,
217                             union pipe_query_result *result)
218 {
219    struct fd5_query_sample *sp = buf;
220    result->u64 = ticks_to_ns(sp->result);
221 }
222 
/* .always: timestamp queries sample on every resume/pause regardless of
 * whether the query is "active" in the gallium sense:
 */
static const struct fd_acc_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .size = sizeof(struct fd5_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = time_elapsed_accumulate_result,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */

static const struct fd_acc_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .size = sizeof(struct fd5_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,
};
247 
248 /*
249  * Performance Counter (batch) queries:
250  *
251  * Only one of these is active at a time, per design of the gallium
252  * batch_query API design.  On perfcntr query tracks N query_types,
253  * each of which has a 'fd_batch_query_entry' that maps it back to
254  * the associated group and counter.
255  */
256 
/* Maps one tracked query_type back to its perfcntr group and countable: */
struct fd_batch_query_entry {
   uint8_t gid; /* group-id */
   uint8_t cid; /* countable-id within the group */
};

/* Per-query driver data for a batch query, one entry per requested
 * query_type (flexible array member, sized at allocation):
 */
struct fd_batch_query_data {
   struct fd_screen *screen;
   unsigned num_query_entries;
   struct fd_batch_query_entry query_entries[];
};
267 
/* Program the selected countables into the HW counters and snapshot each
 * counter's starting value.
 */
static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   /* Next free counter index within each group; entries were validated
    * against num_counters at create time:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   fd_wfi(batch, ring);

   /* configure performance counters for the requested queries: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;

      debug_assert(counter_idx < g->num_counters);

      OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
      OUT_RING(ring, g->countables[entry->cid].selector);
   }

   /* Reset so the second pass assigns the same counter_idx per entry: */
   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* and snapshot the start values */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      /* 64b copy of the counter register pair into start[i]: */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, start));
   }
}
307 
/* Snapshot each counter's ending value and accumulate the deltas into
 * the per-entry results.
 */
static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   /* Next free counter index within each group — must walk entries in the
    * same order as perfcntr_resume() so each entry maps to the same HW
    * counter:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   fd_wfi(batch, ring);

   /* TODO do we need to bother to turn anything off? */

   /* snapshot the end values: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      /* 64b copy of the counter register pair into stop[i]: */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));
   }

   /* and compute the result: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      /* result += stop - start: (DOUBLE = 64b operands, NEG_C negates srcC) */
      OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
      OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
      OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
   }
}
346 
347 static void
perfcntr_accumulate_result(struct fd_acc_query * aq,void * buf,union pipe_query_result * result)348 perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf,
349                            union pipe_query_result *result)
350 {
351    struct fd_batch_query_data *data = aq->query_data;
352    struct fd5_query_sample *sp = buf;
353 
354    for (unsigned i = 0; i < data->num_query_entries; i++) {
355       result->batch[i].u64 = sp[i].result;
356    }
357 }
358 
/* Provider for batch (perfcntr) queries.  Note: no .size here — the sample
 * buffer size depends on the number of tracked query_types, so aq->size is
 * set per-query in fd5_create_batch_query().
 */
static const struct fd_acc_sample_provider perfcntr = {
   .query_type = FD_QUERY_FIRST_PERFCNTR,
   .always = true,
   .resume = perfcntr_resume,
   .pause = perfcntr_pause,
   .result = perfcntr_accumulate_result,
};
366 
367 static struct pipe_query *
fd5_create_batch_query(struct pipe_context * pctx,unsigned num_queries,unsigned * query_types)368 fd5_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
369                        unsigned *query_types)
370 {
371    struct fd_context *ctx = fd_context(pctx);
372    struct fd_screen *screen = ctx->screen;
373    struct fd_query *q;
374    struct fd_acc_query *aq;
375    struct fd_batch_query_data *data;
376 
377    data = CALLOC_VARIANT_LENGTH_STRUCT(
378       fd_batch_query_data, num_queries * sizeof(data->query_entries[0]));
379 
380    data->screen = screen;
381    data->num_query_entries = num_queries;
382 
383    /* validate the requested query_types and ensure we don't try
384     * to request more query_types of a given group than we have
385     * counters:
386     */
387    unsigned counters_per_group[screen->num_perfcntr_groups];
388    memset(counters_per_group, 0, sizeof(counters_per_group));
389 
390    for (unsigned i = 0; i < num_queries; i++) {
391       unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
392 
393       /* verify valid query_type, ie. is it actually a perfcntr? */
394       if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
395           (idx >= screen->num_perfcntr_queries)) {
396          mesa_loge("invalid batch query query_type: %u", query_types[i]);
397          goto error;
398       }
399 
400       struct fd_batch_query_entry *entry = &data->query_entries[i];
401       struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];
402 
403       entry->gid = pq->group_id;
404 
405       /* the perfcntr_queries[] table flattens all the countables
406        * for each group in series, ie:
407        *
408        *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
409        *
410        * So to find the countable index just step back through the
411        * table to find the first entry with the same group-id.
412        */
413       while (pq > screen->perfcntr_queries) {
414          pq--;
415          if (pq->group_id == entry->gid)
416             entry->cid++;
417       }
418 
419       if (counters_per_group[entry->gid] >=
420           screen->perfcntr_groups[entry->gid].num_counters) {
421          mesa_loge("too many counters for group %u\n", entry->gid);
422          goto error;
423       }
424 
425       counters_per_group[entry->gid]++;
426    }
427 
428    q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
429    aq = fd_acc_query(q);
430 
431    /* sample buffer size is based on # of queries: */
432    aq->size = num_queries * sizeof(struct fd5_query_sample);
433    aq->query_data = data;
434 
435    return (struct pipe_query *)q;
436 
437 error:
438    free(data);
439    return NULL;
440 }
441 
442 void
fd5_query_context_init(struct pipe_context * pctx)443 fd5_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
444 {
445    struct fd_context *ctx = fd_context(pctx);
446 
447    ctx->create_query = fd_acc_create_query;
448    ctx->query_update_batch = fd_acc_query_update_batch;
449 
450    pctx->create_batch_query = fd5_create_batch_query;
451 
452    fd_acc_query_register_provider(pctx, &occlusion_counter);
453    fd_acc_query_register_provider(pctx, &occlusion_predicate);
454    fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);
455 
456    fd_acc_query_register_provider(pctx, &time_elapsed);
457    fd_acc_query_register_provider(pctx, &timestamp);
458 }
459