/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd together for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints an error message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

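/* Report whether the GPU has been reset since this context last checked,
 * by comparing the kernel's GPU reset counter with the value cached at
 * context creation (or at the previous query). */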
static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

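/* Initialize a CS context: set up the three ioctl chunks (IB, relocs, flags)
 * that the DRM_RADEON_CS ioctl expects and clear the relocation hash list. */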
static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}

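/* Release all buffer references held by the CS context and reset it to an
 * empty state so it can be reused for the next command stream. */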
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}


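/* Create a command stream. Two CS contexts are allocated: 'csc' is the one
 * currently being recorded, 'cst' is the one being submitted; they are
 * swapped at flush time so recording can continue while the submission
 * ioctl runs on the CS thread. */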
static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
   cs->ring_type = ring_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}

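/* Return the index of 'bo' in the buffer list of 'csc' (the real-buffer list
 * if the BO has a kernel handle, the slab-buffer list otherwise), or -1 if it
 * is not present. The hash list makes the common case O(1). */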
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }

   /* not found or found */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}

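/* Look up a real (non-slab) buffer and add it to the relocation list if it
 * is not there yet, growing the backing arrays as needed. Returns the index
 * of the relocation. */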
static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is due to the fact
       * the DMA CS checker doesn't use NOP packets for offset patching,
       * but always uses the i-th buffer from the list to patch the i-th
       * offset. If there are N offsets in a DMA CS, there must also be N
       * buffers in the relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

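/* Look up a slab (suballocated) buffer and add it to the slab-buffer list if
 * needed. The backing real buffer is added to the relocation list as well,
 * and its index is recorded in u.slab.real_idx. Returns the slab index, or
 * -1 on allocation failure. */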
static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

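/* Add a buffer to the CS: record the requested read/write domains in its
 * relocation entry and charge its size to used_vram_kb/used_gart_kb the
 * first time a domain is referenced (see the file comment above). */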
static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;
   reloc->flags = MAX2(reloc->flags, priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;

   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

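/* Check that the buffers referenced by the CS still fit in 80% of VRAM/GTT.
 * On failure, drop the buffers added since the last successful validation
 * and either flush the CS (if it already references buffers) or reset it. */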
static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   bool status =
         rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
         rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove the recently-added buffers. The validation failed with them
       * and the CS is about to be flushed because of that. Keep only
       * the already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}

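/* Return whether 'dw' more dwords fit in the current IB. This winsys does
 * not chain IBs, so force_chaining is ignored. */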
static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
                                      bool force_chaining)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

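/* Fill 'list' (if non-NULL) with the real buffers referenced by the current
 * CS and return how many there are. */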
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

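/* Submit the CS to the kernel via the DRM_RADEON_CS ioctl. This runs either
 * directly or as a job on the winsys CS thread queue; afterwards it drops
 * the per-buffer active-ioctl counts and resets the submitted context. */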
void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                         "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a bo participates in submissions
 * on two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

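/* Flush the command stream: pad the IB to the ring's alignment, attach a
 * fence to any slab buffers, swap the recording and submission contexts,
 * and hand the submission context to the kernel (asynchronously when the
 * CS thread queue is available). Finally, prepare an empty IB for reuse. */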
static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

   switch (cs->ring_type) {
   case RING_DMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.chip_class <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case RING_GFX:
      /* Pad the GFX ring to 8 DWs to meet CP fetch alignment requirements.
       * r6xx requires at least 4 DW alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case RING_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;

   /* If the CS is neither empty nor overflowed, submit it, in a separate
    * thread if possible. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ring_type) {
      case RING_DMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case RING_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case RING_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case RING_GFX:
      case RING_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ring_type == RING_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ring_type == RING_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ring_type == RING_DMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

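/* Destroy the command stream: wait for any in-flight submission, then free
 * both CS contexts and the pending fence. */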
static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->csc1);
   radeon_cs_context_cleanup(&cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->csc1);
   radeon_destroy_cs_context(&cs->csc2);
   radeon_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

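/* Return whether the current CS references '_buf' with the given usage,
 * based on the read/write domains recorded in its relocation entry. */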
static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

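/* Fences are implemented as dummy 1-byte GTT buffers added to the CS;
 * waiting on a fence is waiting for that buffer to become idle. */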
static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                              RADEON_PRIO_FENCE);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1                   Thread 2 / Context 2
    *  --------------------                   --------------------
    *  f = cs_get_next_fence()
    *                                         cs_add_fence_dependency(f)
    *                                         cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

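/* Plug the CS and fence entry points into the winsys vtable. */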
void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}