1 /*
2  * Copyright 2018 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_query.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29 #include "util/u_suballoc.h"
30 
31 #include <stddef.h>
32 
emit_shader_query(struct si_context * sctx)33 static void emit_shader_query(struct si_context *sctx)
34 {
35    assert(!list_is_empty(&sctx->shader_query_buffers));
36 
37    struct gfx10_sh_query_buffer *qbuf =
38       list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
39    qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
40 }
41 
gfx10_release_query_buffers(struct si_context * sctx,struct gfx10_sh_query_buffer * first,struct gfx10_sh_query_buffer * last)42 static void gfx10_release_query_buffers(struct si_context *sctx,
43                                         struct gfx10_sh_query_buffer *first,
44                                         struct gfx10_sh_query_buffer *last)
45 {
46    while (first) {
47       struct gfx10_sh_query_buffer *qbuf = first;
48       if (first != last)
49          first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
50       else
51          first = NULL;
52 
53       qbuf->refcount--;
54       if (qbuf->refcount)
55          continue;
56 
57       if (qbuf->list.next == &sctx->shader_query_buffers)
58          continue; /* keep the most recent buffer; it may not be full yet */
59       if (qbuf->list.prev == &sctx->shader_query_buffers)
60          continue; /* keep the oldest buffer for recycling */
61 
62       list_del(&qbuf->list);
63       si_resource_reference(&qbuf->buf, NULL);
64       FREE(qbuf);
65    }
66 }
67 
gfx10_alloc_query_buffer(struct si_context * sctx)68 static bool gfx10_alloc_query_buffer(struct si_context *sctx)
69 {
70    if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
71       return true;
72 
73    struct gfx10_sh_query_buffer *qbuf = NULL;
74 
75    if (!list_is_empty(&sctx->shader_query_buffers)) {
76       qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
77       if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
78          goto success;
79 
80       qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
81       if (!qbuf->refcount &&
82           !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
83           sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
84          /* Can immediately re-use the oldest buffer */
85          list_del(&qbuf->list);
86       } else {
87          qbuf = NULL;
88       }
89    }
90 
91    if (!qbuf) {
92       qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
93       if (unlikely(!qbuf))
94          return false;
95 
96       struct si_screen *screen = sctx->screen;
97       unsigned buf_size =
98          MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
99       qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
100       if (unlikely(!qbuf->buf)) {
101          FREE(qbuf);
102          return false;
103       }
104    }
105 
106    /* The buffer is currently unused by the GPU. Initialize it.
107     *
108     * We need to set the high bit of all the primitive counters for
109     * compatibility with the SET_PREDICATION packet.
110     */
111    uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
112                                             PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
113    assert(results);
114 
115    for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
116         ++i) {
117       for (unsigned j = 0; j < 16; ++j)
118          results[32 * i + j] = (uint64_t)1 << 63;
119       results[32 * i + 16] = 0;
120    }
121 
122    list_addtail(&qbuf->list, &sctx->shader_query_buffers);
123    qbuf->head = 0;
124    qbuf->refcount = sctx->num_active_shader_queries;
125 
126 success:;
127    struct pipe_shader_buffer sbuf;
128    sbuf.buffer = &qbuf->buf->b.b;
129    sbuf.buffer_offset = qbuf->head;
130    sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
131    si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
132    sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
133 
134    si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
135    return true;
136 }
137 
gfx10_sh_query_destroy(struct si_context * sctx,struct si_query * rquery)138 static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
139 {
140    struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
141    gfx10_release_query_buffers(sctx, query->first, query->last);
142    FREE(query);
143 }
144 
gfx10_sh_query_begin(struct si_context * sctx,struct si_query * rquery)145 static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
146 {
147    struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
148 
149    gfx10_release_query_buffers(sctx, query->first, query->last);
150    query->first = query->last = NULL;
151 
152    if (unlikely(!gfx10_alloc_query_buffer(sctx)))
153       return false;
154 
155    query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
156    query->first_begin = query->first->head;
157 
158    sctx->num_active_shader_queries++;
159    query->first->refcount++;
160 
161    return true;
162 }
163 
gfx10_sh_query_end(struct si_context * sctx,struct si_query * rquery)164 static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
165 {
166    struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
167 
168    if (unlikely(!query->first))
169       return false; /* earlier out of memory error */
170 
171    query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
172    query->last_end = query->last->head;
173 
174    /* Signal the fence of the previous chunk */
175    if (query->last_end != 0) {
176       uint64_t fence_va = query->last->buf->gpu_address;
177       fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
178       fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
179       si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
180                         EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
181                         0xffffffff, PIPE_QUERY_GPU_FINISHED);
182    }
183 
184    sctx->num_active_shader_queries--;
185 
186    if (sctx->num_active_shader_queries <= 0 || !si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) {
187       si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
188       sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;
189 
190       /* If a query_begin is followed by a query_end without a draw
191        * in-between, we need to clear the atom to ensure that the
192        * next query_begin will re-initialize the shader buffer. */
193       si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
194    }
195 
196    return true;
197 }
198 
gfx10_sh_query_add_result(struct gfx10_sh_query * query,struct gfx10_sh_query_buffer_mem * qmem,union pipe_query_result * result)199 static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
200                                       struct gfx10_sh_query_buffer_mem *qmem,
201                                       union pipe_query_result *result)
202 {
203    static const uint64_t mask = ((uint64_t)1 << 63) - 1;
204 
205    switch (query->b.type) {
206    case PIPE_QUERY_PRIMITIVES_EMITTED:
207       result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
208       break;
209    case PIPE_QUERY_PRIMITIVES_GENERATED:
210       result->u64 += qmem->stream[query->stream].generated_primitives & mask;
211       break;
212    case PIPE_QUERY_SO_STATISTICS:
213       result->so_statistics.num_primitives_written +=
214          qmem->stream[query->stream].emitted_primitives & mask;
215       result->so_statistics.primitives_storage_needed +=
216          qmem->stream[query->stream].generated_primitives & mask;
217       break;
218    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
219       result->b |= qmem->stream[query->stream].emitted_primitives !=
220                    qmem->stream[query->stream].generated_primitives;
221       break;
222    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
223       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
224          result->b |= qmem->stream[stream].emitted_primitives !=
225                       qmem->stream[stream].generated_primitives;
226       }
227       break;
228    default:
229       assert(0);
230    }
231 }
232 
gfx10_sh_query_get_result(struct si_context * sctx,struct si_query * rquery,bool wait,union pipe_query_result * result)233 static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
234                                       union pipe_query_result *result)
235 {
236    struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
237 
238    util_query_clear_result(result, query->b.type);
239 
240    if (unlikely(!query->first))
241       return false; /* earlier out of memory error */
242    assert(query->last);
243 
244    for (struct gfx10_sh_query_buffer *qbuf = query->last;;
245         qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
246       unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
247       void *map;
248 
249       if (rquery->b.flushed)
250          map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
251       else
252          map = si_buffer_map(sctx, qbuf->buf, usage);
253 
254       if (!map)
255          return false;
256 
257       unsigned results_begin = 0;
258       unsigned results_end = qbuf->head;
259       if (qbuf == query->first)
260          results_begin = query->first_begin;
261       if (qbuf == query->last)
262          results_end = query->last_end;
263 
264       while (results_begin != results_end) {
265          struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
266          results_begin += sizeof(*qmem);
267 
268          gfx10_sh_query_add_result(query, qmem, result);
269       }
270 
271       if (qbuf == query->first)
272          break;
273    }
274 
275    return true;
276 }
277 
gfx10_sh_query_get_result_resource(struct si_context * sctx,struct si_query * rquery,enum pipe_query_flags flags,enum pipe_query_value_type result_type,int index,struct pipe_resource * resource,unsigned offset)278 static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
279                                                enum pipe_query_flags flags,
280                                                enum pipe_query_value_type result_type,
281                                                int index, struct pipe_resource *resource,
282                                                unsigned offset)
283 {
284    struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
285    struct si_qbo_state saved_state = {};
286    struct pipe_resource *tmp_buffer = NULL;
287    unsigned tmp_buffer_offset = 0;
288 
289    if (!sctx->sh_query_result_shader) {
290       sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
291       if (!sctx->sh_query_result_shader)
292          return;
293    }
294 
295    if (query->first != query->last) {
296       u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
297       if (!tmp_buffer)
298          return;
299    }
300 
301    si_save_qbo_state(sctx, &saved_state);
302 
303    /* Pre-fill the constants configuring the shader behavior. */
304    struct {
305       uint32_t config;
306       uint32_t offset;
307       uint32_t chain;
308       uint32_t result_count;
309    } consts;
310    struct pipe_constant_buffer constant_buffer = {};
311 
312    if (index >= 0) {
313       switch (query->b.type) {
314       case PIPE_QUERY_PRIMITIVES_GENERATED:
315          consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
316          consts.config = 0;
317          break;
318       case PIPE_QUERY_PRIMITIVES_EMITTED:
319          consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
320          consts.config = 0;
321          break;
322       case PIPE_QUERY_SO_STATISTICS:
323          consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
324          consts.config = 0;
325          break;
326       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
327          consts.offset = 4 * sizeof(uint64_t) * query->stream;
328          consts.config = 2;
329          break;
330       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
331          consts.offset = 0;
332          consts.config = 3;
333          break;
334       default:
335          unreachable("bad query type");
336       }
337    } else {
338       /* Check result availability. */
339       consts.offset = 0;
340       consts.config = 1;
341    }
342 
343    if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
344       consts.config |= 8;
345 
346    constant_buffer.buffer_size = sizeof(consts);
347    constant_buffer.user_buffer = &consts;
348 
349    /* Pre-fill the SSBOs and grid. */
350    struct pipe_shader_buffer ssbo[3];
351    struct pipe_grid_info grid = {};
352 
353    ssbo[1].buffer = tmp_buffer;
354    ssbo[1].buffer_offset = tmp_buffer_offset;
355    ssbo[1].buffer_size = 16;
356 
357    ssbo[2] = ssbo[1];
358 
359    grid.block[0] = 1;
360    grid.block[1] = 1;
361    grid.block[2] = 1;
362    grid.grid[0] = 1;
363    grid.grid[1] = 1;
364    grid.grid[2] = 1;
365 
366    struct gfx10_sh_query_buffer *qbuf = query->first;
367    for (;;) {
368       unsigned begin = qbuf == query->first ? query->first_begin : 0;
369       unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
370       if (!end)
371          continue;
372 
373       ssbo[0].buffer = &qbuf->buf->b.b;
374       ssbo[0].buffer_offset = begin;
375       ssbo[0].buffer_size = end - begin;
376 
377       consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
378       consts.chain = 0;
379       if (qbuf != query->first)
380          consts.chain |= 1;
381       if (qbuf != query->last)
382          consts.chain |= 2;
383 
384       if (qbuf == query->last) {
385          ssbo[2].buffer = resource;
386          ssbo[2].buffer_offset = offset;
387          ssbo[2].buffer_size = 8;
388       }
389 
390       sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
391 
392       if (flags & PIPE_QUERY_WAIT) {
393          uint64_t va;
394 
395          /* Wait for result availability. Wait only for readiness
396           * of the last entry, since the fence writes should be
397           * serialized in the CP.
398           */
399          va = qbuf->buf->gpu_address;
400          va += end - sizeof(struct gfx10_sh_query_buffer_mem);
401          va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
402 
403          si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
404       }
405 
406       si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader,
407                                     SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
408                                     3, ssbo, 0x6);
409 
410       if (qbuf == query->last)
411          break;
412       qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
413    }
414 
415    si_restore_qbo_state(sctx, &saved_state);
416    pipe_resource_reference(&tmp_buffer, NULL);
417 }
418 
419 static const struct si_query_ops gfx10_sh_query_ops = {
420    .destroy = gfx10_sh_query_destroy,
421    .begin = gfx10_sh_query_begin,
422    .end = gfx10_sh_query_end,
423    .get_result = gfx10_sh_query_get_result,
424    .get_result_resource = gfx10_sh_query_get_result_resource,
425 };
426 
gfx10_sh_query_create(struct si_screen * screen,enum pipe_query_type query_type,unsigned index)427 struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
428                                          unsigned index)
429 {
430    struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
431    if (unlikely(!query))
432       return NULL;
433 
434    query->b.ops = &gfx10_sh_query_ops;
435    query->b.type = query_type;
436    query->stream = index;
437 
438    return (struct pipe_query *)query;
439 }
440 
gfx10_init_query(struct si_context * sctx)441 void gfx10_init_query(struct si_context *sctx)
442 {
443    list_inithead(&sctx->shader_query_buffers);
444    sctx->atoms.s.shader_query.emit = emit_shader_query;
445 }
446 
gfx10_destroy_query(struct si_context * sctx)447 void gfx10_destroy_query(struct si_context *sctx)
448 {
449    while (!list_is_empty(&sctx->shader_query_buffers)) {
450       struct gfx10_sh_query_buffer *qbuf =
451          list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
452       list_del(&qbuf->list);
453 
454       assert(!qbuf->refcount);
455       si_resource_reference(&qbuf->buf, NULL);
456       FREE(qbuf);
457    }
458 }
459