1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 /*
29 This file replaces libdrm's radeon_cs_gem with our own implementation.
30 It's optimized specifically for Radeon DRM.
31 Adding buffers and space checking are faster and simpler than their
32 counterparts in libdrm (the time complexity of all the functions
33 is O(1) in nearly all scenarios, thanks to hashing).
34
35 It works like this:
36
37 cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
38 also adds the size of 'buf' to the used_gart and used_vram winsys variables
39 based on the domains, which are simply or'd for the accounting purposes.
40 The adding is skipped if the reloc is already present in the list, but it
41 accounts any newly-referenced domains.
42
43 cs_validate is then called, which just checks:
44 used_vram/gart < vram/gart_size * 0.8
45 The 0.8 number allows for some memory fragmentation. If the validation
46 fails, the pipe driver flushes CS and tries to do the validation again,
47 i.e. it validates only that one operation. If it fails again, it drops
48 the operation on the floor and prints some nasty message to stderr.
49 (done in the pipe driver)
50
51 cs_write_reloc(cs, buf) just writes a reloc that has been added using
52 cs_add_buffer. The read_domain and write_domain parameters have been removed,
53 because we already specify them in cs_add_buffer.
54 */
55
56 #include "radeon_drm_cs.h"
57
58 #include "util/u_memory.h"
59 #include "util/os_time.h"
60
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <stdint.h>
64 #include <xf86drm.h>
65
66
/* Size of one kernel relocation entry, expressed in 32-bit dwords. */
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

/* Forward declarations of fence helpers that are used before their
 * definitions in this file. */
static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);
72
radeon_drm_ctx_create(struct radeon_winsys * ws)73 static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
74 {
75 struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
76 if (!ctx)
77 return NULL;
78
79 ctx->ws = (struct radeon_drm_winsys*)ws;
80 ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
81 return (struct radeon_winsys_ctx*)ctx;
82 }
83
/** Release a context created by radeon_drm_ctx_create(). */
static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   /* The context owns no other resources; freeing the struct is enough. */
   FREE(ctx);
}
88
89 static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx * rctx,bool full_reset_only,bool * needs_reset)90 radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
91 bool *needs_reset)
92 {
93 struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;
94
95 unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);
96
97 if (ctx->gpu_reset_counter == latest) {
98 if (needs_reset)
99 *needs_reset = false;
100 return PIPE_NO_RESET;
101 }
102
103 if (needs_reset)
104 *needs_reset = true;
105
106 ctx->gpu_reset_counter = latest;
107 return PIPE_UNKNOWN_CONTEXT_RESET;
108 }
109
radeon_init_cs_context(struct radeon_cs_context * csc,struct radeon_drm_winsys * ws)110 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
111 struct radeon_drm_winsys *ws)
112 {
113 int i;
114
115 csc->fd = ws->fd;
116
117 csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
118 csc->chunks[0].length_dw = 0;
119 csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
120 csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
121 csc->chunks[1].length_dw = 0;
122 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
123 csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
124 csc->chunks[2].length_dw = 2;
125 csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
126
127 csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
128 csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
129 csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
130
131 csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
132
133 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
134 csc->reloc_indices_hashlist[i] = -1;
135 }
136 return true;
137 }
138
radeon_cs_context_cleanup(struct radeon_cs_context * csc)139 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
140 {
141 unsigned i;
142
143 for (i = 0; i < csc->num_relocs; i++) {
144 p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
145 radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
146 }
147 for (i = 0; i < csc->num_slab_buffers; ++i) {
148 p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
149 radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);
150 }
151
152 csc->num_relocs = 0;
153 csc->num_validated_relocs = 0;
154 csc->num_slab_buffers = 0;
155 csc->chunks[0].length_dw = 0;
156 csc->chunks[1].length_dw = 0;
157
158 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
159 csc->reloc_indices_hashlist[i] = -1;
160 }
161 }
162
radeon_destroy_cs_context(struct radeon_cs_context * csc)163 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
164 {
165 radeon_cs_context_cleanup(csc);
166 FREE(csc->slab_buffers);
167 FREE(csc->relocs_bo);
168 FREE(csc->relocs);
169 }
170
171
172 static bool
radeon_drm_cs_create(struct radeon_cmdbuf * rcs,struct radeon_winsys_ctx * ctx,enum ring_type ring_type,void (* flush)(void * ctx,unsigned flags,struct pipe_fence_handle ** fence),void * flush_ctx,bool stop_exec_on_failure)173 radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
174 struct radeon_winsys_ctx *ctx,
175 enum ring_type ring_type,
176 void (*flush)(void *ctx, unsigned flags,
177 struct pipe_fence_handle **fence),
178 void *flush_ctx,
179 bool stop_exec_on_failure)
180 {
181 struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
182 struct radeon_drm_cs *cs;
183
184 cs = CALLOC_STRUCT(radeon_drm_cs);
185 if (!cs) {
186 return false;
187 }
188 util_queue_fence_init(&cs->flush_completed);
189
190 cs->ws = ws;
191 cs->flush_cs = flush;
192 cs->flush_data = flush_ctx;
193
194 if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
195 FREE(cs);
196 return false;
197 }
198 if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
199 radeon_destroy_cs_context(&cs->csc1);
200 FREE(cs);
201 return false;
202 }
203
204 /* Set the first command buffer as current. */
205 cs->csc = &cs->csc1;
206 cs->cst = &cs->csc2;
207 cs->ring_type = ring_type;
208
209 memset(rcs, 0, sizeof(*rcs));
210 rcs->current.buf = cs->csc->buf;
211 rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
212 rcs->priv = cs;
213
214 p_atomic_inc(&ws->num_cs);
215 return true;
216 }
217
radeon_lookup_buffer(struct radeon_cs_context * csc,struct radeon_bo * bo)218 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
219 {
220 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
221 struct radeon_bo_item *buffers;
222 unsigned num_buffers;
223 int i = csc->reloc_indices_hashlist[hash];
224
225 if (bo->handle) {
226 buffers = csc->relocs_bo;
227 num_buffers = csc->num_relocs;
228 } else {
229 buffers = csc->slab_buffers;
230 num_buffers = csc->num_slab_buffers;
231 }
232
233 /* not found or found */
234 if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
235 return i;
236
237 /* Hash collision, look for the BO in the list of relocs linearly. */
238 for (i = num_buffers - 1; i >= 0; i--) {
239 if (buffers[i].bo == bo) {
240 /* Put this reloc in the hash list.
241 * This will prevent additional hash collisions if there are
242 * several consecutive lookup_buffer calls for the same buffer.
243 *
244 * Example: Assuming buffers A,B,C collide in the hash list,
245 * the following sequence of relocs:
246 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
247 * will collide here: ^ and here: ^,
248 * meaning that we should get very few collisions in the end. */
249 csc->reloc_indices_hashlist[hash] = i;
250 return i;
251 }
252 }
253 return -1;
254 }
255
radeon_lookup_or_add_real_buffer(struct radeon_drm_cs * cs,struct radeon_bo * bo)256 static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
257 struct radeon_bo *bo)
258 {
259 struct radeon_cs_context *csc = cs->csc;
260 struct drm_radeon_cs_reloc *reloc;
261 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
262 int i = -1;
263
264 i = radeon_lookup_buffer(csc, bo);
265
266 if (i >= 0) {
267 /* For async DMA, every add_buffer call must add a buffer to the list
268 * no matter how many duplicates there are. This is due to the fact
269 * the DMA CS checker doesn't use NOP packets for offset patching,
270 * but always uses the i-th buffer from the list to patch the i-th
271 * offset. If there are N offsets in a DMA CS, there must also be N
272 * buffers in the relocation list.
273 *
274 * This doesn't have to be done if virtual memory is enabled,
275 * because there is no offset patching with virtual memory.
276 */
277 if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
278 return i;
279 }
280 }
281
282 /* New relocation, check if the backing array is large enough. */
283 if (csc->num_relocs >= csc->max_relocs) {
284 uint32_t size;
285 csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));
286
287 size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
288 csc->relocs_bo = realloc(csc->relocs_bo, size);
289
290 size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
291 csc->relocs = realloc(csc->relocs, size);
292
293 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
294 }
295
296 /* Initialize the new relocation. */
297 csc->relocs_bo[csc->num_relocs].bo = NULL;
298 csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
299 radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
300 p_atomic_inc(&bo->num_cs_references);
301 reloc = &csc->relocs[csc->num_relocs];
302 reloc->handle = bo->handle;
303 reloc->read_domains = 0;
304 reloc->write_domain = 0;
305 reloc->flags = 0;
306
307 csc->reloc_indices_hashlist[hash] = csc->num_relocs;
308
309 csc->chunks[1].length_dw += RELOC_DWORDS;
310
311 return csc->num_relocs++;
312 }
313
radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs * cs,struct radeon_bo * bo)314 static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
315 struct radeon_bo *bo)
316 {
317 struct radeon_cs_context *csc = cs->csc;
318 unsigned hash;
319 struct radeon_bo_item *item;
320 int idx;
321 int real_idx;
322
323 idx = radeon_lookup_buffer(csc, bo);
324 if (idx >= 0)
325 return idx;
326
327 real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
328
329 /* Check if the backing array is large enough. */
330 if (csc->num_slab_buffers >= csc->max_slab_buffers) {
331 unsigned new_max = MAX2(csc->max_slab_buffers + 16,
332 (unsigned)(csc->max_slab_buffers * 1.3));
333 struct radeon_bo_item *new_buffers =
334 REALLOC(csc->slab_buffers,
335 csc->max_slab_buffers * sizeof(*new_buffers),
336 new_max * sizeof(*new_buffers));
337 if (!new_buffers) {
338 fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
339 return -1;
340 }
341
342 csc->max_slab_buffers = new_max;
343 csc->slab_buffers = new_buffers;
344 }
345
346 /* Initialize the new relocation. */
347 idx = csc->num_slab_buffers++;
348 item = &csc->slab_buffers[idx];
349
350 item->bo = NULL;
351 item->u.slab.real_idx = real_idx;
352 radeon_ws_bo_reference(&item->bo, bo);
353 p_atomic_inc(&bo->num_cs_references);
354
355 hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
356 csc->reloc_indices_hashlist[hash] = idx;
357
358 return idx;
359 }
360
radeon_drm_cs_add_buffer(struct radeon_cmdbuf * rcs,struct pb_buffer * buf,enum radeon_bo_usage usage,enum radeon_bo_domain domains,enum radeon_bo_priority priority)361 static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
362 struct pb_buffer *buf,
363 enum radeon_bo_usage usage,
364 enum radeon_bo_domain domains,
365 enum radeon_bo_priority priority)
366 {
367 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
368 struct radeon_bo *bo = (struct radeon_bo*)buf;
369 enum radeon_bo_domain added_domains;
370
371 /* If VRAM is just stolen system memory, allow both VRAM and
372 * GTT, whichever has free space. If a buffer is evicted from
373 * VRAM to GTT, it will stay there.
374 */
375 if (!cs->ws->info.has_dedicated_vram)
376 domains |= RADEON_DOMAIN_GTT;
377
378 enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
379 enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
380 struct drm_radeon_cs_reloc *reloc;
381 int index;
382
383 if (!bo->handle) {
384 index = radeon_lookup_or_add_slab_buffer(cs, bo);
385 if (index < 0)
386 return 0;
387
388 index = cs->csc->slab_buffers[index].u.slab.real_idx;
389 } else {
390 index = radeon_lookup_or_add_real_buffer(cs, bo);
391 }
392
393 reloc = &cs->csc->relocs[index];
394 added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
395 reloc->read_domains |= rd;
396 reloc->write_domain |= wd;
397 reloc->flags = MAX2(reloc->flags, priority);
398 cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;
399
400 if (added_domains & RADEON_DOMAIN_VRAM)
401 rcs->used_vram_kb += bo->base.size / 1024;
402 else if (added_domains & RADEON_DOMAIN_GTT)
403 rcs->used_gart_kb += bo->base.size / 1024;
404
405 return index;
406 }
407
radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf * rcs,struct pb_buffer * buf)408 static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
409 struct pb_buffer *buf)
410 {
411 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
412
413 return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
414 }
415
radeon_drm_cs_validate(struct radeon_cmdbuf * rcs)416 static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
417 {
418 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
419 bool status =
420 rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
421 rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;
422
423 if (status) {
424 cs->csc->num_validated_relocs = cs->csc->num_relocs;
425 } else {
426 /* Remove lately-added buffers. The validation failed with them
427 * and the CS is about to be flushed because of that. Keep only
428 * the already-validated buffers. */
429 unsigned i;
430
431 for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
432 p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
433 radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
434 }
435 cs->csc->num_relocs = cs->csc->num_validated_relocs;
436
437 /* Flush if there are any relocs. Clean up otherwise. */
438 if (cs->csc->num_relocs) {
439 cs->flush_cs(cs->flush_data,
440 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
441 } else {
442 radeon_cs_context_cleanup(cs->csc);
443 rcs->used_vram_kb = 0;
444 rcs->used_gart_kb = 0;
445
446 assert(rcs->current.cdw == 0);
447 if (rcs->current.cdw != 0) {
448 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
449 }
450 }
451 }
452 return status;
453 }
454
radeon_drm_cs_check_space(struct radeon_cmdbuf * rcs,unsigned dw,bool force_chaining)455 static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
456 bool force_chaining)
457 {
458 assert(rcs->current.cdw <= rcs->current.max_dw);
459 return rcs->current.max_dw - rcs->current.cdw >= dw;
460 }
461
radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf * rcs,struct radeon_bo_list_item * list)462 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
463 struct radeon_bo_list_item *list)
464 {
465 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
466 int i;
467
468 if (list) {
469 for (i = 0; i < cs->csc->num_relocs; i++) {
470 list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
471 list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
472 list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
473 }
474 }
475 return cs->csc->num_relocs;
476 }
477
radeon_drm_cs_emit_ioctl_oneshot(void * job,void * gdata,int thread_index)478 void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
479 {
480 struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
481 unsigned i;
482 int r;
483
484 r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
485 &csc->cs, sizeof(struct drm_radeon_cs));
486 if (r) {
487 if (r == -ENOMEM)
488 fprintf(stderr, "radeon: Not enough memory for command submission.\n");
489 else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
490 unsigned i;
491
492 fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
493 for (i = 0; i < csc->chunks[0].length_dw; i++) {
494 fprintf(stderr, "0x%08X\n", csc->buf[i]);
495 }
496 } else {
497 fprintf(stderr, "radeon: The kernel rejected CS, "
498 "see dmesg for more information (%i).\n", r);
499 }
500 }
501
502 for (i = 0; i < csc->num_relocs; i++)
503 p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
504 for (i = 0; i < csc->num_slab_buffers; i++)
505 p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
506
507 radeon_cs_context_cleanup(csc);
508 }
509
510 /*
511 * Make sure previous submission of this cs are completed
512 */
radeon_drm_cs_sync_flush(struct radeon_cmdbuf * rcs)513 void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
514 {
515 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
516
517 /* Wait for any pending ioctl of this CS to complete. */
518 if (util_queue_is_initialized(&cs->ws->cs_queue))
519 util_queue_fence_wait(&cs->flush_completed);
520 }
521
522 /* Add the given fence to a slab buffer fence list.
523 *
524 * There is a potential race condition when bo participates in submissions on
525 * two or more threads simultaneously. Since we do not know which of the
526 * submissions will be sent to the GPU first, we have to keep the fences
527 * of all submissions.
528 *
529 * However, fences that belong to submissions that have already returned from
530 * their respective ioctl do not have to be kept, because we know that they
531 * will signal earlier.
532 */
radeon_bo_slab_fence(struct radeon_bo * bo,struct radeon_bo * fence)533 static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
534 {
535 unsigned dst;
536
537 assert(fence->num_cs_references);
538
539 /* Cleanup older fences */
540 dst = 0;
541 for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
542 if (bo->u.slab.fences[src]->num_cs_references) {
543 bo->u.slab.fences[dst] = bo->u.slab.fences[src];
544 dst++;
545 } else {
546 radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
547 }
548 }
549 bo->u.slab.num_fences = dst;
550
551 /* Check available space for the new fence */
552 if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
553 unsigned new_max_fences = bo->u.slab.max_fences + 1;
554 struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
555 bo->u.slab.max_fences * sizeof(*new_fences),
556 new_max_fences * sizeof(*new_fences));
557 if (!new_fences) {
558 fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
559 return;
560 }
561
562 bo->u.slab.fences = new_fences;
563 bo->u.slab.max_fences = new_max_fences;
564 }
565
566 /* Add the new fence */
567 bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
568 radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
569 bo->u.slab.num_fences++;
570 }
571
radeon_drm_cs_flush(struct radeon_cmdbuf * rcs,unsigned flags,struct pipe_fence_handle ** pfence)572 static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
573 unsigned flags,
574 struct pipe_fence_handle **pfence)
575 {
576 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
577 struct radeon_cs_context *tmp;
578
579 switch (cs->ring_type) {
580 case RING_DMA:
581 /* pad DMA ring to 8 DWs */
582 if (cs->ws->info.chip_class <= GFX6) {
583 while (rcs->current.cdw & 7)
584 radeon_emit(rcs, 0xf0000000); /* NOP packet */
585 } else {
586 while (rcs->current.cdw & 7)
587 radeon_emit(rcs, 0x00000000); /* NOP packet */
588 }
589 break;
590 case RING_GFX:
591 /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
592 * r6xx, requires at least 4 dw alignment to avoid a hw bug.
593 */
594 if (cs->ws->info.gfx_ib_pad_with_type2) {
595 while (rcs->current.cdw & 7)
596 radeon_emit(rcs, 0x80000000); /* type2 nop packet */
597 } else {
598 while (rcs->current.cdw & 7)
599 radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
600 }
601 break;
602 case RING_UVD:
603 while (rcs->current.cdw & 15)
604 radeon_emit(rcs, 0x80000000); /* type2 nop packet */
605 break;
606 default:
607 break;
608 }
609
610 if (rcs->current.cdw > rcs->current.max_dw) {
611 fprintf(stderr, "radeon: command stream overflowed\n");
612 }
613
614 if (pfence || cs->csc->num_slab_buffers) {
615 struct pipe_fence_handle *fence;
616
617 if (cs->next_fence) {
618 fence = cs->next_fence;
619 cs->next_fence = NULL;
620 } else {
621 fence = radeon_cs_create_fence(rcs);
622 }
623
624 if (fence) {
625 if (pfence)
626 radeon_fence_reference(pfence, fence);
627
628 mtx_lock(&cs->ws->bo_fence_lock);
629 for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
630 struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
631 p_atomic_inc(&bo->num_active_ioctls);
632 radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
633 }
634 mtx_unlock(&cs->ws->bo_fence_lock);
635
636 radeon_fence_reference(&fence, NULL);
637 }
638 } else {
639 radeon_fence_reference(&cs->next_fence, NULL);
640 }
641
642 radeon_drm_cs_sync_flush(rcs);
643
644 /* Swap command streams. */
645 tmp = cs->csc;
646 cs->csc = cs->cst;
647 cs->cst = tmp;
648
649 /* If the CS is not empty or overflowed, emit it in a separate thread. */
650 if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
651 !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
652 unsigned i, num_relocs;
653
654 num_relocs = cs->cst->num_relocs;
655
656 cs->cst->chunks[0].length_dw = rcs->current.cdw;
657
658 for (i = 0; i < num_relocs; i++) {
659 /* Update the number of active asynchronous CS ioctls for the buffer. */
660 p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
661 }
662
663 switch (cs->ring_type) {
664 case RING_DMA:
665 cs->cst->flags[0] = 0;
666 cs->cst->flags[1] = RADEON_CS_RING_DMA;
667 cs->cst->cs.num_chunks = 3;
668 if (cs->ws->info.r600_has_virtual_memory) {
669 cs->cst->flags[0] |= RADEON_CS_USE_VM;
670 }
671 break;
672
673 case RING_UVD:
674 cs->cst->flags[0] = 0;
675 cs->cst->flags[1] = RADEON_CS_RING_UVD;
676 cs->cst->cs.num_chunks = 3;
677 break;
678
679 case RING_VCE:
680 cs->cst->flags[0] = 0;
681 cs->cst->flags[1] = RADEON_CS_RING_VCE;
682 cs->cst->cs.num_chunks = 3;
683 break;
684
685 default:
686 case RING_GFX:
687 case RING_COMPUTE:
688 cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
689 cs->cst->flags[1] = RADEON_CS_RING_GFX;
690 cs->cst->cs.num_chunks = 3;
691
692 if (cs->ws->info.r600_has_virtual_memory) {
693 cs->cst->flags[0] |= RADEON_CS_USE_VM;
694 cs->cst->cs.num_chunks = 3;
695 }
696 if (flags & PIPE_FLUSH_END_OF_FRAME) {
697 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
698 cs->cst->cs.num_chunks = 3;
699 }
700 if (cs->ring_type == RING_COMPUTE) {
701 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
702 cs->cst->cs.num_chunks = 3;
703 }
704 break;
705 }
706
707 if (util_queue_is_initialized(&cs->ws->cs_queue)) {
708 util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
709 radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
710 if (!(flags & PIPE_FLUSH_ASYNC))
711 radeon_drm_cs_sync_flush(rcs);
712 } else {
713 radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
714 }
715 } else {
716 radeon_cs_context_cleanup(cs->cst);
717 }
718
719 /* Prepare a new CS. */
720 rcs->current.buf = cs->csc->buf;
721 rcs->current.cdw = 0;
722 rcs->used_vram_kb = 0;
723 rcs->used_gart_kb = 0;
724
725 if (cs->ring_type == RING_GFX)
726 cs->ws->num_gfx_IBs++;
727 else if (cs->ring_type == RING_DMA)
728 cs->ws->num_sdma_IBs++;
729 return 0;
730 }
731
radeon_drm_cs_destroy(struct radeon_cmdbuf * rcs)732 static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
733 {
734 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
735
736 if (!cs)
737 return;
738
739 radeon_drm_cs_sync_flush(rcs);
740 util_queue_fence_destroy(&cs->flush_completed);
741 radeon_cs_context_cleanup(&cs->csc1);
742 radeon_cs_context_cleanup(&cs->csc2);
743 p_atomic_dec(&cs->ws->num_cs);
744 radeon_destroy_cs_context(&cs->csc1);
745 radeon_destroy_cs_context(&cs->csc2);
746 radeon_fence_reference(&cs->next_fence, NULL);
747 FREE(cs);
748 }
749
radeon_bo_is_referenced(struct radeon_cmdbuf * rcs,struct pb_buffer * _buf,enum radeon_bo_usage usage)750 static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
751 struct pb_buffer *_buf,
752 enum radeon_bo_usage usage)
753 {
754 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
755 struct radeon_bo *bo = (struct radeon_bo*)_buf;
756 int index;
757
758 if (!bo->num_cs_references)
759 return false;
760
761 index = radeon_lookup_buffer(cs->csc, bo);
762 if (index == -1)
763 return false;
764
765 if (!bo->handle)
766 index = cs->csc->slab_buffers[index].u.slab.real_idx;
767
768 if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
769 return true;
770 if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
771 return true;
772
773 return false;
774 }
775
776 /* FENCES */
777
radeon_cs_create_fence(struct radeon_cmdbuf * rcs)778 static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
779 {
780 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
781 struct pb_buffer *fence;
782
783 /* Create a fence, which is a dummy BO. */
784 fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
785 RADEON_DOMAIN_GTT,
786 RADEON_FLAG_NO_SUBALLOC
787 | RADEON_FLAG_NO_INTERPROCESS_SHARING);
788 if (!fence)
789 return NULL;
790
791 /* Add the fence as a dummy relocation. */
792 cs->ws->base.cs_add_buffer(rcs, fence,
793 RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
794 RADEON_PRIO_FENCE);
795 return (struct pipe_fence_handle*)fence;
796 }
797
radeon_fence_wait(struct radeon_winsys * ws,struct pipe_fence_handle * fence,uint64_t timeout)798 static bool radeon_fence_wait(struct radeon_winsys *ws,
799 struct pipe_fence_handle *fence,
800 uint64_t timeout)
801 {
802 return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
803 RADEON_USAGE_READWRITE);
804 }
805
/**
 * Make \p *dst refer to \p src. Fences are buffers, so this forwards to
 * pb_reference() on the underlying buffer pointers.
 */
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   struct pb_buffer **dst_buf = (struct pb_buffer**)dst;
   struct pb_buffer *src_buf = (struct pb_buffer*)src;

   pb_reference(dst_buf, src_buf);
}
811
radeon_drm_cs_get_next_fence(struct radeon_cmdbuf * rcs)812 static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
813 {
814 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
815 struct pipe_fence_handle *fence = NULL;
816
817 if (cs->next_fence) {
818 radeon_fence_reference(&fence, cs->next_fence);
819 return fence;
820 }
821
822 fence = radeon_cs_create_fence(rcs);
823 if (!fence)
824 return NULL;
825
826 radeon_fence_reference(&cs->next_fence, fence);
827 return fence;
828 }
829
/**
 * Winsys entry point for adding a fence dependency to a CS.
 *
 * This implementation intentionally does nothing; all parameters are unused.
 */
static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1                   Thread 2 / Context 2
    *  --------------------                   --------------------
    *  f = cs_get_next_fence()
    *                                         cs_add_fence_dependency(f)
    *                                         cs_flush()
    *  cs_flush()
    *
    * The dependency is on a fence whose CS has not been flushed yet.
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}
848
radeon_drm_cs_init_functions(struct radeon_drm_winsys * ws)849 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
850 {
851 ws->base.ctx_create = radeon_drm_ctx_create;
852 ws->base.ctx_destroy = radeon_drm_ctx_destroy;
853 ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
854 ws->base.cs_create = radeon_drm_cs_create;
855 ws->base.cs_destroy = radeon_drm_cs_destroy;
856 ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
857 ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
858 ws->base.cs_validate = radeon_drm_cs_validate;
859 ws->base.cs_check_space = radeon_drm_cs_check_space;
860 ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
861 ws->base.cs_flush = radeon_drm_cs_flush;
862 ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
863 ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
864 ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
865 ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
866 ws->base.fence_wait = radeon_fence_wait;
867 ws->base.fence_reference = radeon_fence_reference;
868 }
869