1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #include <amdgpu.h>
26 #include <assert.h>
27 #include <pthread.h>
28 #include <stdlib.h>
29 #include "drm-uapi/amdgpu_drm.h"
30 
31 #include "util/u_memory.h"
32 #include "ac_debug.h"
33 #include "radv_amdgpu_bo.h"
34 #include "radv_amdgpu_cs.h"
35 #include "radv_amdgpu_winsys.h"
36 #include "radv_debug.h"
37 #include "radv_radeon_winsys.h"
38 #include "sid.h"
39 
40 #define GFX6_MAX_CS_SIZE 0xffff8 /* in dwords */
41 
42 enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 };
43 
44 struct radv_amdgpu_ib {
45    struct radeon_winsys_bo *bo;
46    unsigned cdw;
47 };
48 
49 struct radv_amdgpu_cs {
50    struct radeon_cmdbuf base;
51    struct radv_amdgpu_winsys *ws;
52 
53    struct amdgpu_cs_ib_info ib;
54 
55    struct radeon_winsys_bo *ib_buffer;
56    uint8_t *ib_mapped;
57    unsigned max_num_buffers;
58    unsigned num_buffers;
59    struct drm_amdgpu_bo_list_entry *handles;
60 
61    struct radv_amdgpu_ib *old_ib_buffers;
62    unsigned num_old_ib_buffers;
63    unsigned max_num_old_ib_buffers;
64    unsigned *ib_size_ptr;
65    VkResult status;
66    bool is_chained;
67 
68    int buffer_hash_table[1024];
69    unsigned hw_ip;
70 
71    unsigned num_virtual_buffers;
72    unsigned max_num_virtual_buffers;
73    struct radeon_winsys_bo **virtual_buffers;
74    int *virtual_buffer_hash_table;
75 
76    /* For chips that don't support chaining. */
77    struct radeon_cmdbuf *old_cs_buffers;
78    unsigned num_old_cs_buffers;
79 };
80 
81 static inline struct radv_amdgpu_cs *
82 radv_amdgpu_cs(struct radeon_cmdbuf *base)
83 {
84    return (struct radv_amdgpu_cs *)base;
85 }
86 
87 static int
88 ring_to_hw_ip(enum ring_type ring)
89 {
90    switch (ring) {
91    case RING_GFX:
92       return AMDGPU_HW_IP_GFX;
93    case RING_DMA:
94       return AMDGPU_HW_IP_DMA;
95    case RING_COMPUTE:
96       return AMDGPU_HW_IP_COMPUTE;
97    default:
98       unreachable("unsupported ring");
99    }
100 }
101 
102 struct radv_amdgpu_cs_request {
103    /** Specify HW IP block type to which to send the IB. */
104    unsigned ip_type;
105 
106    /** IP instance index if there are several IPs of the same type. */
107    unsigned ip_instance;
108 
109    /**
110     * Ring index of the IP. The same IP block can expose several
111     * rings, e.g. 0 for SDMA0 and 1 for SDMA1.
112     */
113    uint32_t ring;
114 
115    /**
116     * BO list handles used by this request.
117     */
118    struct drm_amdgpu_bo_list_entry *handles;
119    uint32_t num_handles;
120 
121    /** Number of IBs to submit in the field ibs. */
122    uint32_t number_of_ibs;
123 
124    /**
125     * IBs to submit. These IBs will be submitted together as a single entity.
126     */
127    struct amdgpu_cs_ib_info *ibs;
128 
129    /**
130     * The returned sequence number for the command submission.
131     */
132    uint64_t seq_no;
133 };
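
/* A minimal, hypothetical sketch (not part of the driver) of how such a
 * request is typically filled in before radv_amdgpu_cs_submit(); the field
 * names match the struct above, the values are illustrative only:
 *
 *    struct radv_amdgpu_cs_request request = {
 *       .ip_type = cs->hw_ip,
 *       .ip_instance = 0,
 *       .ring = queue_idx,
 *       .handles = handles,
 *       .num_handles = num_handles,
 *       .number_of_ibs = 1,
 *       .ibs = &cs->ib,
 *    };
 *    result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
 */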
134 
135 static int radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
136                                  struct radv_amdgpu_cs_request *request,
137                                  struct radv_winsys_sem_info *sem_info);
138 
139 static void
140 radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_fence *fence,
141                              struct radv_amdgpu_cs_request *req)
142 {
143    fence->fence.context = ctx->ctx;
144    fence->fence.ip_type = req->ip_type;
145    fence->fence.ip_instance = req->ip_instance;
146    fence->fence.ring = req->ring;
147    fence->fence.fence = req->seq_no;
148    fence->user_ptr =
149       (volatile uint64_t *)(ctx->fence_map + req->ip_type * MAX_RINGS_PER_TYPE + req->ring);
150 }
151 
152 static void
153 radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
154 {
155    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
156 
157    if (cs->ib_buffer)
158       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
159    else
160       free(cs->base.buf);
161 
162    for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
163       cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i].bo);
164 
165    for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
166       free(cs->old_cs_buffers[i].buf);
167    }
168 
169    free(cs->old_cs_buffers);
170    free(cs->old_ib_buffers);
171    free(cs->virtual_buffers);
172    free(cs->virtual_buffer_hash_table);
173    free(cs->handles);
174    free(cs);
175 }
176 
177 static void
178 radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs, enum ring_type ring_type)
179 {
180    for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
181       cs->buffer_hash_table[i] = -1;
182 
183    cs->hw_ip = ring_to_hw_ip(ring_type);
184 }
185 
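/* Pick the memory domain for command buffers: CPU-visible VRAM when enough
 * of it is available (or SAM is forced via perftest), GTT otherwise.
 */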
186 static enum radeon_bo_domain
187 radv_amdgpu_cs_domain(const struct radeon_winsys *_ws)
188 {
189    const struct radv_amdgpu_winsys *ws = (const struct radv_amdgpu_winsys *)_ws;
190 
191    bool enough_vram = ws->info.all_vram_visible ||
192                       p_atomic_read_relaxed(&ws->allocated_vram_vis) * 2 <= ws->info.vram_vis_size;
193    bool use_sam =
194       (enough_vram && ws->info.has_dedicated_vram && !(ws->perftest & RADV_PERFTEST_NO_SAM)) ||
195       (ws->perftest & RADV_PERFTEST_SAM);
196    return use_sam ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
197 }
198 
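/* Create a command buffer. With IB BOs, commands are written directly into
 * a GPU-visible, CPU-mapped buffer; otherwise they are staged in system
 * memory and copied into GPU buffers at submit time.
 */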
199 static struct radeon_cmdbuf *
200 radv_amdgpu_cs_create(struct radeon_winsys *ws, enum ring_type ring_type)
201 {
202    struct radv_amdgpu_cs *cs;
203    uint32_t ib_size = 20 * 1024 * 4;
204    cs = calloc(1, sizeof(struct radv_amdgpu_cs));
205    if (!cs)
206       return NULL;
207 
208    cs->ws = radv_amdgpu_winsys(ws);
209    radv_amdgpu_init_cs(cs, ring_type);
210 
211    if (cs->ws->use_ib_bos) {
212       VkResult result =
213          ws->buffer_create(ws, ib_size, 0, radv_amdgpu_cs_domain(ws),
214                            RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
215                               RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC,
216                            RADV_BO_PRIORITY_CS, 0, &cs->ib_buffer);
217       if (result != VK_SUCCESS) {
218          free(cs);
219          return NULL;
220       }
221 
222       cs->ib_mapped = ws->buffer_map(cs->ib_buffer);
223       if (!cs->ib_mapped) {
224          ws->buffer_destroy(ws, cs->ib_buffer);
225          free(cs);
226          return NULL;
227       }
228 
229       cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
230       cs->base.buf = (uint32_t *)cs->ib_mapped;
231       cs->base.max_dw = ib_size / 4 - 4;
232       cs->ib_size_ptr = &cs->ib.size;
233       cs->ib.size = 0;
234 
235       ws->cs_add_buffer(&cs->base, cs->ib_buffer);
236    } else {
237       uint32_t *buf = malloc(16384);
238       if (!buf) {
239          free(cs);
240          return NULL;
241       }
242       cs->base.buf = buf;
243       cs->base.max_dw = 4096;
244    }
245 
246    return &cs->base;
247 }
248 
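/* Grow the command buffer so that at least min_size more dwords fit. With
 * IB BOs, the current IB is saved in old_ib_buffers and a new one is
 * allocated and chained to with an INDIRECT_BUFFER packet; without IB BOs,
 * the CPU buffer is reallocated, or pushed to old_cs_buffers once
 * GFX6_MAX_CS_SIZE is reached.
 */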
249 static void
250 radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
251 {
252    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
253 
254    if (cs->status != VK_SUCCESS) {
255       cs->base.cdw = 0;
256       return;
257    }
258 
259    if (!cs->ws->use_ib_bos) {
260       const uint64_t limit_dws = GFX6_MAX_CS_SIZE;
261       uint64_t ib_dws = MAX2(cs->base.cdw + min_size, MIN2(cs->base.max_dw * 2, limit_dws));
262 
263       /* The total ib size cannot exceed limit_dws dwords. */
264       if (ib_dws > limit_dws) {
265          /* The maximum size in dwords has been reached; save the
266           * current buffer and allocate a new one.
267           */
268          struct radeon_cmdbuf *old_cs_buffers =
269             realloc(cs->old_cs_buffers, (cs->num_old_cs_buffers + 1) * sizeof(*cs->old_cs_buffers));
270          if (!old_cs_buffers) {
271             cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
272             cs->base.cdw = 0;
273             return;
274          }
275          cs->old_cs_buffers = old_cs_buffers;
276 
277          /* Store the current one for submitting it later. */
278          cs->old_cs_buffers[cs->num_old_cs_buffers].cdw = cs->base.cdw;
279          cs->old_cs_buffers[cs->num_old_cs_buffers].max_dw = cs->base.max_dw;
280          cs->old_cs_buffers[cs->num_old_cs_buffers].buf = cs->base.buf;
281          cs->num_old_cs_buffers++;
282 
283          /* Reset the cs, it will be re-allocated below. */
284          cs->base.cdw = 0;
285          cs->base.buf = NULL;
286 
287          /* Re-compute the number of dwords to allocate. */
288          ib_dws = MAX2(cs->base.cdw + min_size, MIN2(cs->base.max_dw * 2, limit_dws));
289          if (ib_dws > limit_dws) {
290             fprintf(stderr, "amdgpu: Too many dwords "
291                             "to allocate\n");
292             cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
293             return;
294          }
295       }
296 
297       uint32_t *new_buf = realloc(cs->base.buf, ib_dws * 4);
298       if (new_buf) {
299          cs->base.buf = new_buf;
300          cs->base.max_dw = ib_dws;
301       } else {
302          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
303          cs->base.cdw = 0;
304       }
305       return;
306    }
307 
308    uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
309 
310    /* max that fits in the chain size field. */
311    ib_size = MIN2(ib_size, 0xfffff);
312 
313    while (!cs->base.cdw || (cs->base.cdw & 7) != 4)
314       radeon_emit(&cs->base, PKT3_NOP_PAD);
315 
316    *cs->ib_size_ptr |= cs->base.cdw + 4;
317 
318    if (cs->num_old_ib_buffers == cs->max_num_old_ib_buffers) {
319       unsigned max_num_old_ib_buffers = MAX2(1, cs->max_num_old_ib_buffers * 2);
320       struct radv_amdgpu_ib *old_ib_buffers =
321          realloc(cs->old_ib_buffers, max_num_old_ib_buffers * sizeof(*old_ib_buffers));
322       if (!old_ib_buffers) {
323          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
324          return;
325       }
326       cs->max_num_old_ib_buffers = max_num_old_ib_buffers;
327       cs->old_ib_buffers = old_ib_buffers;
328    }
329 
330    cs->old_ib_buffers[cs->num_old_ib_buffers].bo = cs->ib_buffer;
331    cs->old_ib_buffers[cs->num_old_ib_buffers++].cdw = cs->base.cdw;
332 
333    VkResult result =
334       cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0, radv_amdgpu_cs_domain(&cs->ws->base),
335                                  RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
336                                     RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC,
337                                  RADV_BO_PRIORITY_CS, 0, &cs->ib_buffer);
338 
339    if (result != VK_SUCCESS) {
340       cs->base.cdw = 0;
341       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
342       cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers].bo;
343    }
344 
345    cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
346    if (!cs->ib_mapped) {
347       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
348       cs->base.cdw = 0;
349 
350       /* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */
351       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
352       cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers].bo;
353    }
354 
355    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
356 
357    radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
358    radeon_emit(&cs->base, radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va);
359    radeon_emit(&cs->base, radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32);
360    radeon_emit(&cs->base, S_3F2_CHAIN(1) | S_3F2_VALID(1));
361 
362    cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
363 
364    cs->base.buf = (uint32_t *)cs->ib_mapped;
365    cs->base.cdw = 0;
366    cs->base.max_dw = ib_size / 4 - 4;
367 }
368 
369 static VkResult
370 radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
371 {
372    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
373 
374    if (cs->ws->use_ib_bos) {
375       while (!cs->base.cdw || (cs->base.cdw & 7) != 0)
376          radeon_emit(&cs->base, PKT3_NOP_PAD);
377 
378       *cs->ib_size_ptr |= cs->base.cdw;
379 
380       cs->is_chained = false;
381    }
382 
383    return cs->status;
384 }
385 
386 static void
387 radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
388 {
389    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
390    cs->base.cdw = 0;
391    cs->status = VK_SUCCESS;
392 
393    for (unsigned i = 0; i < cs->num_buffers; ++i) {
394       unsigned hash = cs->handles[i].bo_handle & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
395       cs->buffer_hash_table[hash] = -1;
396    }
397 
398    for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
399       unsigned hash =
400          ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
401       cs->virtual_buffer_hash_table[hash] = -1;
402    }
403 
404    cs->num_buffers = 0;
405    cs->num_virtual_buffers = 0;
406 
407    if (cs->ws->use_ib_bos) {
408       cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
409 
410       for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
411          cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i].bo);
412 
413       cs->num_old_ib_buffers = 0;
414       cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
415       cs->ib_size_ptr = &cs->ib.size;
416       cs->ib.size = 0;
417    } else {
418       for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
419          struct radeon_cmdbuf *rcs = &cs->old_cs_buffers[i];
420          free(rcs->buf);
421       }
422 
423       free(cs->old_cs_buffers);
424       cs->old_cs_buffers = NULL;
425       cs->num_old_cs_buffers = 0;
426    }
427 }
428 
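/* Look up a BO handle in the CS buffer list. The hash table is a
 * direct-mapped cache; on a miss the list is scanned linearly and the
 * cached index is refreshed.
 */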
429 static int
430 radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs, uint32_t bo)
431 {
432    unsigned hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
433    int index = cs->buffer_hash_table[hash];
434 
435    if (index == -1)
436       return -1;
437 
438    if (cs->handles[index].bo_handle == bo)
439       return index;
440 
441    for (unsigned i = 0; i < cs->num_buffers; ++i) {
442       if (cs->handles[i].bo_handle == bo) {
443          cs->buffer_hash_table[hash] = i;
444          return i;
445       }
446    }
447 
448    return -1;
449 }
450 
451 static void
452 radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, uint32_t bo, uint8_t priority)
453 {
454    unsigned hash;
455    int index = radv_amdgpu_cs_find_buffer(cs, bo);
456 
457    if (index != -1)
458       return;
459 
460    if (cs->num_buffers == cs->max_num_buffers) {
461       unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
462       struct drm_amdgpu_bo_list_entry *new_entries =
463          realloc(cs->handles, new_count * sizeof(struct drm_amdgpu_bo_list_entry));
464       if (new_entries) {
465          cs->max_num_buffers = new_count;
466          cs->handles = new_entries;
467       } else {
468          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
469          return;
470       }
471    }
472 
473    cs->handles[cs->num_buffers].bo_handle = bo;
474    cs->handles[cs->num_buffers].bo_priority = priority;
475 
476    hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
477    cs->buffer_hash_table[hash] = cs->num_buffers;
478 
479    ++cs->num_buffers;
480 }
481 
482 static void
483 radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo)
484 {
485    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
486    unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
487 
488    if (!cs->virtual_buffer_hash_table) {
489       int *virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int));
490       if (!virtual_buffer_hash_table) {
491          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
492          return;
493       }
494       cs->virtual_buffer_hash_table = virtual_buffer_hash_table;
495 
496       for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i)
497          cs->virtual_buffer_hash_table[i] = -1;
498    }
499 
500    if (cs->virtual_buffer_hash_table[hash] >= 0) {
501       int idx = cs->virtual_buffer_hash_table[hash];
502       if (cs->virtual_buffers[idx] == bo) {
503          return;
504       }
505       for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
506          if (cs->virtual_buffers[i] == bo) {
507             cs->virtual_buffer_hash_table[hash] = i;
508             return;
509          }
510       }
511    }
512 
513    if (cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
514       unsigned max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
515       struct radeon_winsys_bo **virtual_buffers =
516          realloc(cs->virtual_buffers, sizeof(struct radeon_winsys_bo *) * max_num_virtual_buffers);
517       if (!virtual_buffers) {
518          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
519          return;
520       }
521       cs->max_num_virtual_buffers = max_num_virtual_buffers;
522       cs->virtual_buffers = virtual_buffers;
523    }
524 
525    cs->virtual_buffers[cs->num_virtual_buffers] = bo;
526 
527    cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
528    ++cs->num_virtual_buffers;
529 }
530 
531 static void
532 radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_bo)
533 {
534    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
535    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
536 
537    if (cs->status != VK_SUCCESS)
538       return;
539 
540    if (bo->is_virtual) {
541       radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
542       return;
543    }
544 
545    radv_amdgpu_cs_add_buffer_internal(cs, bo->bo_handle, bo->priority);
546 }
547 
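/* Execute a secondary command buffer from a primary one. When IB2 is
 * allowed, a single INDIRECT_BUFFER packet launches the child IB;
 * otherwise the child's contents are copied into the parent (including
 * old IB/CS buffers for very large secondaries).
 */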
548 static void
549 radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child,
550                                  bool allow_ib2)
551 {
552    struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
553    struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
554    struct radv_amdgpu_winsys *ws = parent->ws;
555    bool use_ib2 = ws->use_ib_bos && allow_ib2;
556 
557    if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS)
558       return;
559 
560    for (unsigned i = 0; i < child->num_buffers; ++i) {
561       radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i].bo_handle,
562                                          child->handles[i].bo_priority);
563    }
564 
565    for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
566       radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
567    }
568 
569    if (use_ib2) {
570       if (parent->base.cdw + 4 > parent->base.max_dw)
571          radv_amdgpu_cs_grow(&parent->base, 4);
572 
573       /* Not setting the CHAIN bit will launch an IB2. */
574       radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
575       radeon_emit(&parent->base, child->ib.ib_mc_address);
576       radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
577       radeon_emit(&parent->base, child->ib.size);
578    } else {
579       if (parent->ws->use_ib_bos) {
580          /* Copy and chain old IB buffers from the child to the parent IB. */
581          for (unsigned i = 0; i < child->num_old_ib_buffers; i++) {
582             struct radv_amdgpu_ib *ib = &child->old_ib_buffers[i];
583             uint8_t *mapped;
584 
585             if (parent->base.cdw + ib->cdw > parent->base.max_dw)
586                radv_amdgpu_cs_grow(&parent->base, ib->cdw);
587 
588             mapped = ws->base.buffer_map(ib->bo);
589             if (!mapped) {
590                parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
591                return;
592             }
593 
594             /* Copy the IB data without the original chain link. */
595             memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * ib->cdw);
596             parent->base.cdw += ib->cdw;
597          }
598       } else {
599          /* When the secondary command buffer is huge we have to copy the list of CS buffers to the
600           * parent to submit multiple IBs.
601           */
602          if (child->num_old_cs_buffers > 0) {
603             unsigned num_cs_buffers;
604             uint32_t *new_buf;
605 
606             /* Compute the total number of CS buffers needed. */
607             num_cs_buffers = parent->num_old_cs_buffers + child->num_old_cs_buffers + 1;
608 
609             struct radeon_cmdbuf *old_cs_buffers =
610                realloc(parent->old_cs_buffers, num_cs_buffers * sizeof(*parent->old_cs_buffers));
611             if (!old_cs_buffers) {
612                parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
613                parent->base.cdw = 0;
614                return;
615             }
616             parent->old_cs_buffers = old_cs_buffers;
617 
618             /* Copy the parent CS to its list of CS buffers, so submission ordering is maintained. */
619             new_buf = malloc(parent->base.max_dw * 4);
620             if (!new_buf) {
621                parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
622                parent->base.cdw = 0;
623                return;
624             }
625             memcpy(new_buf, parent->base.buf, parent->base.max_dw * 4);
626 
627             parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = parent->base.cdw;
628             parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = parent->base.max_dw;
629             parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf;
630             parent->num_old_cs_buffers++;
631 
632             /* Then, copy all child CS buffers to the parent list. */
633             for (unsigned i = 0; i < child->num_old_cs_buffers; i++) {
634                new_buf = malloc(child->old_cs_buffers[i].max_dw * 4);
635                if (!new_buf) {
636                   parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
637                   parent->base.cdw = 0;
638                   return;
639                }
640                memcpy(new_buf, child->old_cs_buffers[i].buf, child->old_cs_buffers[i].max_dw * 4);
641 
642                parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = child->old_cs_buffers[i].cdw;
643                parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = child->old_cs_buffers[i].max_dw;
644                parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf;
645                parent->num_old_cs_buffers++;
646             }
647 
648             /* Reset the parent CS before copying the child CS into it. */
649             parent->base.cdw = 0;
650          }
651       }
652 
653       if (parent->base.cdw + child->base.cdw > parent->base.max_dw)
654          radv_amdgpu_cs_grow(&parent->base, child->base.cdw);
655 
656       memcpy(parent->base.buf + parent->base.cdw, child->base.buf, 4 * child->base.cdw);
657       parent->base.cdw += child->base.cdw;
658    }
659 }
660 
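/* Build the BO list for a submission. Three paths: with debug_all_bos the
 * global BO list is used as-is; a single CS without virtual buffers or
 * global BOs has its handles copied directly; otherwise the per-CS lists,
 * virtual buffer backings and the global list are merged with duplicates
 * removed.
 */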
661 static VkResult
662 radv_amdgpu_get_bo_list(struct radv_amdgpu_winsys *ws, struct radeon_cmdbuf **cs_array,
663                         unsigned count, struct radv_amdgpu_winsys_bo **extra_bo_array,
664                         unsigned num_extra_bo, struct radeon_cmdbuf *extra_cs,
665                         unsigned *rnum_handles, struct drm_amdgpu_bo_list_entry **rhandles)
666 {
667    struct drm_amdgpu_bo_list_entry *handles = NULL;
668    unsigned num_handles = 0;
669 
670    if (ws->debug_all_bos) {
671       handles = malloc(sizeof(handles[0]) * ws->global_bo_list.count);
672       if (!handles) {
673          return VK_ERROR_OUT_OF_HOST_MEMORY;
674       }
675 
676       for (uint32_t i = 0; i < ws->global_bo_list.count; i++) {
677          handles[i].bo_handle = ws->global_bo_list.bos[i]->bo_handle;
678          handles[i].bo_priority = ws->global_bo_list.bos[i]->priority;
679          num_handles++;
680       }
681    } else if (count == 1 && !num_extra_bo && !extra_cs &&
682               !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers && !ws->global_bo_list.count) {
683       struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)cs_array[0];
684       if (cs->num_buffers == 0)
685          return VK_SUCCESS;
686 
687       handles = malloc(sizeof(handles[0]) * cs->num_buffers);
688       if (!handles)
689          return VK_ERROR_OUT_OF_HOST_MEMORY;
690 
691       memcpy(handles, cs->handles, sizeof(handles[0]) * cs->num_buffers);
692       num_handles = cs->num_buffers;
693    } else {
694       unsigned total_buffer_count = num_extra_bo;
695       num_handles = num_extra_bo;
696       for (unsigned i = 0; i < count; ++i) {
697          struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)cs_array[i];
698          total_buffer_count += cs->num_buffers;
699          for (unsigned j = 0; j < cs->num_virtual_buffers; ++j)
700             total_buffer_count += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count;
701       }
702 
703       if (extra_cs) {
704          total_buffer_count += ((struct radv_amdgpu_cs *)extra_cs)->num_buffers;
705       }
706 
707       total_buffer_count += ws->global_bo_list.count;
708 
709       if (total_buffer_count == 0)
710          return VK_SUCCESS;
711 
712       handles = malloc(sizeof(handles[0]) * total_buffer_count);
713       if (!handles)
714          return VK_ERROR_OUT_OF_HOST_MEMORY;
715 
716       for (unsigned i = 0; i < num_extra_bo; i++) {
717          handles[i].bo_handle = extra_bo_array[i]->bo_handle;
718          handles[i].bo_priority = extra_bo_array[i]->priority;
719       }
720 
721       for (unsigned i = 0; i < count + !!extra_cs; ++i) {
722          struct radv_amdgpu_cs *cs;
723 
724          if (i == count)
725             cs = (struct radv_amdgpu_cs *)extra_cs;
726          else
727             cs = (struct radv_amdgpu_cs *)cs_array[i];
728 
729          if (!cs->num_buffers)
730             continue;
731 
732          if (num_handles == 0 && !cs->num_virtual_buffers) {
733             memcpy(handles, cs->handles, cs->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
734             num_handles = cs->num_buffers;
735             continue;
736          }
737          int unique_bo_so_far = num_handles;
738          for (unsigned j = 0; j < cs->num_buffers; ++j) {
739             bool found = false;
740             for (unsigned k = 0; k < unique_bo_so_far; ++k) {
741                if (handles[k].bo_handle == cs->handles[j].bo_handle) {
742                   found = true;
743                   break;
744                }
745             }
746             if (!found) {
747                handles[num_handles] = cs->handles[j];
748                ++num_handles;
749             }
750          }
751          for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) {
752             struct radv_amdgpu_winsys_bo *virtual_bo =
753                radv_amdgpu_winsys_bo(cs->virtual_buffers[j]);
754             for (unsigned k = 0; k < virtual_bo->bo_count; ++k) {
755                struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k];
756                bool found = false;
757                for (unsigned m = 0; m < num_handles; ++m) {
758                   if (handles[m].bo_handle == bo->bo_handle) {
759                      found = true;
760                      break;
761                   }
762                }
763                if (!found) {
764                   handles[num_handles].bo_handle = bo->bo_handle;
765                   handles[num_handles].bo_priority = bo->priority;
766                   ++num_handles;
767                }
768             }
769          }
770       }
771 
772       unsigned unique_bo_so_far = num_handles;
773       for (unsigned i = 0; i < ws->global_bo_list.count; ++i) {
774          struct radv_amdgpu_winsys_bo *bo = ws->global_bo_list.bos[i];
775          bool found = false;
776          for (unsigned j = 0; j < unique_bo_so_far; ++j) {
777             if (bo->bo_handle == handles[j].bo_handle) {
778                found = true;
779                break;
780             }
781          }
782          if (!found) {
783             handles[num_handles].bo_handle = bo->bo_handle;
784             handles[num_handles].bo_priority = bo->priority;
785             ++num_handles;
786          }
787       }
788    }
789 
790    *rhandles = handles;
791    *rnum_handles = num_handles;
792 
793    return VK_SUCCESS;
794 }
795 
796 static void
797 radv_assign_last_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request)
798 {
799    radv_amdgpu_request_to_fence(ctx, &ctx->last_submission[request->ip_type][request->ring],
800                                 request);
801 }
802 
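/* Submit command buffers by patching each one to chain to the next with an
 * INDIRECT_BUFFER packet, so the kernel sees at most two IBs (optional
 * preamble plus the first CS).
 */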
803 static VkResult
804 radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx, int queue_idx,
805                                      struct radv_winsys_sem_info *sem_info,
806                                      struct radeon_cmdbuf **cs_array, unsigned cs_count,
807                                      struct radeon_cmdbuf *initial_preamble_cs)
808 {
809    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
810    struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
811    struct radv_amdgpu_winsys *aws = cs0->ws;
812    struct drm_amdgpu_bo_list_entry *handles = NULL;
813    struct radv_amdgpu_cs_request request;
814    struct amdgpu_cs_ib_info ibs[2];
815    unsigned number_of_ibs = 1;
816    unsigned num_handles = 0;
817    VkResult result;
818 
819    for (unsigned i = cs_count; i--;) {
820       struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
821 
822       if (cs->is_chained) {
823          *cs->ib_size_ptr -= 4;
824          cs->is_chained = false;
825       }
826 
827       if (i + 1 < cs_count) {
828          struct radv_amdgpu_cs *next = radv_amdgpu_cs(cs_array[i + 1]);
829          assert(cs->base.cdw + 4 <= cs->base.max_dw);
830 
831          cs->is_chained = true;
832          *cs->ib_size_ptr += 4;
833 
834          cs->base.buf[cs->base.cdw + 0] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
835          cs->base.buf[cs->base.cdw + 1] = next->ib.ib_mc_address;
836          cs->base.buf[cs->base.cdw + 2] = next->ib.ib_mc_address >> 32;
837          cs->base.buf[cs->base.cdw + 3] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | next->ib.size;
838       }
839    }
840 
841    u_rwlock_rdlock(&aws->global_bo_list.lock);
842 
843    /* Get the BO list. */
844    result = radv_amdgpu_get_bo_list(cs0->ws, cs_array, cs_count, NULL, 0, initial_preamble_cs,
845                                     &num_handles, &handles);
846    if (result != VK_SUCCESS)
847       goto fail;
848 
849    /* Configure the CS request. */
850    if (initial_preamble_cs) {
851       ibs[0] = radv_amdgpu_cs(initial_preamble_cs)->ib;
852       ibs[1] = cs0->ib;
853       number_of_ibs++;
854    } else {
855       ibs[0] = cs0->ib;
856    }
857 
858    request.ip_type = cs0->hw_ip;
859    request.ip_instance = 0;
860    request.ring = queue_idx;
861    request.number_of_ibs = number_of_ibs;
862    request.ibs = ibs;
863    request.handles = handles;
864    request.num_handles = num_handles;
865 
866    /* Submit the CS. */
867    result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
868 
869    free(request.handles);
870 
871    if (result != VK_SUCCESS)
872       goto fail;
873 
874    radv_assign_last_submit(ctx, &request);
875 
876 fail:
877    u_rwlock_rdunlock(&aws->global_bo_list.lock);
878    return result;
879 }
880 
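/* Submit command buffers without chaining: every CS (plus the optional
 * preamble) becomes its own IB in a single kernel submission.
 */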
881 static VkResult
882 radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx, int queue_idx,
883                                       struct radv_winsys_sem_info *sem_info,
884                                       struct radeon_cmdbuf **cs_array, unsigned cs_count,
885                                       struct radeon_cmdbuf *initial_preamble_cs)
886 {
887    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
888    struct drm_amdgpu_bo_list_entry *handles = NULL;
889    struct radv_amdgpu_cs_request request;
890    struct amdgpu_cs_ib_info *ibs;
891    struct radv_amdgpu_cs *cs0;
892    struct radv_amdgpu_winsys *aws;
893    unsigned num_handles = 0;
894    unsigned number_of_ibs;
895    VkResult result;
896 
897    assert(cs_count);
898    cs0 = radv_amdgpu_cs(cs_array[0]);
899    aws = cs0->ws;
900 
901    /* Compute the number of IBs for this submit. */
902    number_of_ibs = cs_count + !!initial_preamble_cs;
903 
904    u_rwlock_rdlock(&aws->global_bo_list.lock);
905 
906    /* Get the BO list. */
907    result = radv_amdgpu_get_bo_list(cs0->ws, &cs_array[0], cs_count, NULL, 0, initial_preamble_cs,
908                                     &num_handles, &handles);
909    if (result != VK_SUCCESS) {
910       goto fail;
911    }
912 
913    ibs = malloc(number_of_ibs * sizeof(*ibs));
914    if (!ibs) {
915       free(handles);
916       result = VK_ERROR_OUT_OF_HOST_MEMORY;
917       goto fail;
918    }
919 
920    /* Configure the CS request. */
921    if (initial_preamble_cs)
922       ibs[0] = radv_amdgpu_cs(initial_preamble_cs)->ib;
923 
924    for (unsigned i = 0; i < cs_count; i++) {
925       struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
926 
927       ibs[i + !!initial_preamble_cs] = cs->ib;
928 
929       if (cs->is_chained) {
930          *cs->ib_size_ptr -= 4;
931          cs->is_chained = false;
932       }
933    }
934 
935    request.ip_type = cs0->hw_ip;
936    request.ip_instance = 0;
937    request.ring = queue_idx;
938    request.handles = handles;
939    request.num_handles = num_handles;
940    request.number_of_ibs = number_of_ibs;
941    request.ibs = ibs;
942 
943    /* Submit the CS. */
944    result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
945 
946    free(request.handles);
947    free(ibs);
948 
949    if (result != VK_SUCCESS)
950       goto fail;
951 
952    radv_assign_last_submit(ctx, &request);
953 
954 fail:
955    u_rwlock_rdunlock(&aws->global_bo_list.lock);
956    return result;
957 }
958 
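/* Submit path for chips without IB BO support: CPU-side command streams are
 * copied into freshly allocated, padded GPU buffers and submitted in groups
 * that fit within GFX6_MAX_CS_SIZE.
 */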
959 static VkResult
960 radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx, int queue_idx,
961                                     struct radv_winsys_sem_info *sem_info,
962                                     struct radeon_cmdbuf **cs_array, unsigned cs_count,
963                                     struct radeon_cmdbuf *initial_preamble_cs,
964                                     struct radeon_cmdbuf *continue_preamble_cs)
965 {
966    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
967    struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
968    struct radeon_winsys *ws = (struct radeon_winsys *)cs0->ws;
969    struct radv_amdgpu_winsys *aws = cs0->ws;
970    struct radv_amdgpu_cs_request request;
971    uint32_t pad_word = PKT3_NOP_PAD;
972    bool emit_signal_sem = sem_info->cs_emit_signal;
973    VkResult result;
974 
975    if (radv_amdgpu_winsys(ws)->info.chip_class == GFX6)
976       pad_word = 0x80000000;
977 
978    assert(cs_count);
979 
980    for (unsigned i = 0; i < cs_count;) {
981       struct amdgpu_cs_ib_info *ibs;
982       struct radeon_winsys_bo **bos;
983       struct radeon_cmdbuf *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
984       struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
985       struct drm_amdgpu_bo_list_entry *handles = NULL;
986       unsigned num_handles = 0;
987       unsigned number_of_ibs;
988       uint32_t *ptr;
989       unsigned cnt = 0;
990 
991       /* Compute the number of IBs for this submit. */
992       number_of_ibs = cs->num_old_cs_buffers + 1;
993 
994       ibs = malloc(number_of_ibs * sizeof(*ibs));
995       if (!ibs)
996          return VK_ERROR_OUT_OF_HOST_MEMORY;
997 
998       bos = malloc(number_of_ibs * sizeof(*bos));
999       if (!bos) {
1000          free(ibs);
1001          return VK_ERROR_OUT_OF_HOST_MEMORY;
1002       }
1003 
1004       if (number_of_ibs > 1) {
1005          /* Special path when the maximum size in dwords has
1006           * been reached because we need to handle more than one
1007           * IB per submit.
1008           */
1009          struct radeon_cmdbuf **new_cs_array;
1010          unsigned idx = 0;
1011 
1012          new_cs_array = malloc(number_of_ibs * sizeof(*new_cs_array));
1013          assert(new_cs_array);
1014 
1015          for (unsigned j = 0; j < cs->num_old_cs_buffers; j++)
1016             new_cs_array[idx++] = &cs->old_cs_buffers[j];
1017          new_cs_array[idx++] = cs_array[i];
1018 
1019          for (unsigned j = 0; j < number_of_ibs; j++) {
1020             struct radeon_cmdbuf *rcs = new_cs_array[j];
1021             bool needs_preamble = preamble_cs && j == 0;
1022             unsigned pad_words = 0;
1023             unsigned size = 0;
1024 
1025             if (needs_preamble)
1026                size += preamble_cs->cdw;
1027             size += rcs->cdw;
1028 
1029             assert(size < GFX6_MAX_CS_SIZE);
1030 
1031             while (!size || (size & 7)) {
1032                size++;
1033                pad_words++;
1034             }
1035 
1036             ws->buffer_create(
1037                ws, 4 * size, 4096, radv_amdgpu_cs_domain(ws),
1038                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY,
1039                RADV_BO_PRIORITY_CS, 0, &bos[j]);
1040             ptr = ws->buffer_map(bos[j]);
1041 
1042             if (needs_preamble) {
1043                memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
1044                ptr += preamble_cs->cdw;
1045             }
1046 
1047             memcpy(ptr, rcs->buf, 4 * rcs->cdw);
1048             ptr += rcs->cdw;
1049 
1050             for (unsigned k = 0; k < pad_words; ++k)
1051                *ptr++ = pad_word;
1052 
1053             ibs[j].size = size;
1054             ibs[j].ib_mc_address = radv_buffer_get_va(bos[j]);
1055             ibs[j].flags = 0;
1056          }
1057 
1058          cnt++;
1059          free(new_cs_array);
1060       } else {
1061          unsigned pad_words = 0;
1062          unsigned size = 0;
1063 
1064          if (preamble_cs)
1065             size += preamble_cs->cdw;
1066 
1067          while (i + cnt < cs_count &&
1068                 GFX6_MAX_CS_SIZE - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
1069             size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
1070             ++cnt;
1071          }
1072 
1073          while (!size || (size & 7)) {
1074             size++;
1075             pad_words++;
1076          }
1077          assert(cnt);
1078 
1079          ws->buffer_create(
1080             ws, 4 * size, 4096, radv_amdgpu_cs_domain(ws),
1081             RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY,
1082             RADV_BO_PRIORITY_CS, 0, &bos[0]);
1083          ptr = ws->buffer_map(bos[0]);
1084 
1085          if (preamble_cs) {
1086             memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
1087             ptr += preamble_cs->cdw;
1088          }
1089 
1090          for (unsigned j = 0; j < cnt; ++j) {
1091             struct radv_amdgpu_cs *cs2 = radv_amdgpu_cs(cs_array[i + j]);
1092             memcpy(ptr, cs2->base.buf, 4 * cs2->base.cdw);
1093             ptr += cs2->base.cdw;
1094          }
1095 
1096          for (unsigned j = 0; j < pad_words; ++j)
1097             *ptr++ = pad_word;
1098 
1099          ibs[0].size = size;
1100          ibs[0].ib_mc_address = radv_buffer_get_va(bos[0]);
1101          ibs[0].flags = 0;
1102       }
1103 
1104       u_rwlock_rdlock(&aws->global_bo_list.lock);
1105 
1106       result =
1107          radv_amdgpu_get_bo_list(cs0->ws, &cs_array[i], cnt, (struct radv_amdgpu_winsys_bo **)bos,
1108                                  number_of_ibs, preamble_cs, &num_handles, &handles);
1109       if (result != VK_SUCCESS) {
1110          free(ibs);
1111          free(bos);
1112          u_rwlock_rdunlock(&aws->global_bo_list.lock);
1113          return result;
1114       }
1115 
1116       request.ip_type = cs0->hw_ip;
1117       request.ip_instance = 0;
1118       request.ring = queue_idx;
1119       request.handles = handles;
1120       request.num_handles = num_handles;
1121       request.number_of_ibs = number_of_ibs;
1122       request.ibs = ibs;
1123 
1124       sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false;
1125       result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
1126 
1127       free(request.handles);
1128       u_rwlock_rdunlock(&aws->global_bo_list.lock);
1129 
1130       for (unsigned j = 0; j < number_of_ibs; j++) {
1131          ws->buffer_destroy(ws, bos[j]);
1132       }
1133 
1134       free(ibs);
1135       free(bos);
1136 
1137       if (result != VK_SUCCESS)
1138          return result;
1139 
1140       i += cnt;
1141    }
1142 
1143    radv_assign_last_submit(ctx, &request);
1144 
1145    return VK_SUCCESS;
1146 }
1147 
1148 static VkResult
1149 radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx, int queue_idx,
1150                              struct radeon_cmdbuf **cs_array, unsigned cs_count,
1151                              struct radeon_cmdbuf *initial_preamble_cs,
1152                              struct radeon_cmdbuf *continue_preamble_cs,
1153                              struct radv_winsys_sem_info *sem_info, bool can_patch)
1154 {
1155    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[0]);
1156    VkResult result;
1157 
1158    assert(sem_info);
1159    if (!cs->ws->use_ib_bos) {
1160       result = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, cs_array, cs_count,
1161                                                    initial_preamble_cs, continue_preamble_cs);
1162    } else if (can_patch) {
1163       result = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, cs_array, cs_count,
1164                                                     initial_preamble_cs);
1165    } else {
1166       result = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, cs_array, cs_count,
1167                                                      initial_preamble_cs);
1168    }
1169 
1170    return result;
1171 }
1172 
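/* Translate a GPU virtual address into a CPU pointer by searching the
 * current and old IB buffers, then the global BO list. Used when parsing
 * IBs for debug dumps.
 */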
1173 static void *
1174 radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr)
1175 {
1176    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1177    void *ret = NULL;
1178 
1179    if (!cs->ib_buffer)
1180       return NULL;
1181    for (unsigned i = 0; i <= cs->num_old_ib_buffers; ++i) {
1182       struct radv_amdgpu_winsys_bo *bo;
1183 
1184       bo = (struct radv_amdgpu_winsys_bo *)(i == cs->num_old_ib_buffers ? cs->ib_buffer
1185                                                                         : cs->old_ib_buffers[i].bo);
1186       if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
1187          if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0)
1188             return (char *)ret + (addr - bo->base.va);
1189       }
1190    }
1191    u_rwlock_rdlock(&cs->ws->global_bo_list.lock);
1192    for (uint32_t i = 0; i < cs->ws->global_bo_list.count; i++) {
1193       struct radv_amdgpu_winsys_bo *bo = cs->ws->global_bo_list.bos[i];
1194       if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
1195          if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0) {
1196             u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1197             return (char *)ret + (addr - bo->base.va);
1198          }
1199       }
1200    }
1201    u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1202 
1203    return ret;
1204 }
1205 
1206 static void
1207 radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs, FILE *file, const int *trace_ids,
1208                            int trace_id_count)
1209 {
1210    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1211    void *ib = cs->base.buf;
1212    int num_dw = cs->base.cdw;
1213 
1214    if (cs->ws->use_ib_bos) {
1215       ib = radv_amdgpu_winsys_get_cpu_addr(cs, cs->ib.ib_mc_address);
1216       num_dw = cs->ib.size;
1217    }
1218    assert(ib);
1219    ac_parse_ib(file, ib, num_dw, trace_ids, trace_id_count, "main IB", cs->ws->info.chip_class,
1220                radv_amdgpu_winsys_get_cpu_addr, cs);
1221 }
1222 
1223 static uint32_t
1224 radv_to_amdgpu_priority(enum radeon_ctx_priority radv_priority)
1225 {
1226    switch (radv_priority) {
1227    case RADEON_CTX_PRIORITY_REALTIME:
1228       return AMDGPU_CTX_PRIORITY_VERY_HIGH;
1229    case RADEON_CTX_PRIORITY_HIGH:
1230       return AMDGPU_CTX_PRIORITY_HIGH;
1231    case RADEON_CTX_PRIORITY_MEDIUM:
1232       return AMDGPU_CTX_PRIORITY_NORMAL;
1233    case RADEON_CTX_PRIORITY_LOW:
1234       return AMDGPU_CTX_PRIORITY_LOW;
1235    default:
1236       unreachable("Invalid context priority");
1237    }
1238 }
1239 
1240 static VkResult
1241 radv_amdgpu_ctx_create(struct radeon_winsys *_ws, enum radeon_ctx_priority priority,
1242                        struct radeon_winsys_ctx **rctx)
1243 {
1244    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1245    struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
1246    uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority);
1247    VkResult result;
1248    int r;
1249 
1250    if (!ctx)
1251       return VK_ERROR_OUT_OF_HOST_MEMORY;
1252 
1253    r = amdgpu_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx);
1254    if (r && r == -EACCES) {
1255       result = VK_ERROR_NOT_PERMITTED_EXT;
1256       goto fail_create;
1257    } else if (r) {
1258       fprintf(stderr, "amdgpu: radv_amdgpu_cs_ctx_create2 failed. (%i)\n", r);
1259       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1260       goto fail_create;
1261    }
1262    ctx->ws = ws;
1263 
1264    assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * sizeof(uint64_t) <= 4096);
1265    result = ws->base.buffer_create(&ws->base, 4096, 8, RADEON_DOMAIN_GTT,
1266                                    RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING,
1267                                    RADV_BO_PRIORITY_CS, 0, &ctx->fence_bo);
1268    if (result != VK_SUCCESS) {
1269       goto fail_alloc;
1270    }
1271 
1272    ctx->fence_map = (uint64_t *)ws->base.buffer_map(ctx->fence_bo);
1273    if (!ctx->fence_map) {
1274       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
1275       goto fail_map;
1276    }
1277 
1278    memset(ctx->fence_map, 0, 4096);
1279 
1280    *rctx = (struct radeon_winsys_ctx *)ctx;
1281    return VK_SUCCESS;
1282 
1283 fail_map:
1284    ws->base.buffer_destroy(&ws->base, ctx->fence_bo);
1285 fail_alloc:
1286    amdgpu_cs_ctx_free(ctx->ctx);
1287 fail_create:
1288    FREE(ctx);
1289    return result;
1290 }
1291 
1292 static void
1293 radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
1294 {
1295    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1296    ctx->ws->base.buffer_destroy(&ctx->ws->base, ctx->fence_bo);
1297    amdgpu_cs_ctx_free(ctx->ctx);
1298    FREE(ctx);
1299 }
1300 
1301 static bool
1302 radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum ring_type ring_type, int ring_index)
1303 {
1304    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1305    int ip_type = ring_to_hw_ip(ring_type);
1306 
1307    if (ctx->last_submission[ip_type][ring_index].fence.fence) {
1308       uint32_t expired;
1309       int ret = amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index].fence,
1310                                              1000000000ull, 0, &expired);
1311 
1312       if (ret || !expired)
1313          return false;
1314    }
1315 
1316    return true;
1317 }
1318 
1319 static void *
1320 radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
1321                                    const uint32_t *syncobj_override,
1322                                    struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1323 {
1324    const uint32_t *src = syncobj_override ? syncobj_override : counts->syncobj;
1325    struct drm_amdgpu_cs_chunk_sem *syncobj =
1326       malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * counts->syncobj_count);
1327    if (!syncobj)
1328       return NULL;
1329 
1330    for (unsigned i = 0; i < counts->syncobj_count; i++) {
1331       struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
1332       sem->handle = src[i];
1333    }
1334 
1335    chunk->chunk_id = chunk_id;
1336    chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * counts->syncobj_count;
1337    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1338    return syncobj;
1339 }
1340 
1341 static void *
1342 radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *counts,
1343                                             const uint32_t *syncobj_override,
1344                                             struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1345 {
1346    const uint32_t *src = syncobj_override ? syncobj_override : counts->syncobj;
1347    struct drm_amdgpu_cs_chunk_syncobj *syncobj =
1348       malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) *
1349              (counts->syncobj_count + counts->timeline_syncobj_count));
1350    if (!syncobj)
1351       return NULL;
1352 
1353    for (unsigned i = 0; i < counts->syncobj_count; i++) {
1354       struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i];
1355       sem->handle = src[i];
1356       sem->flags = 0;
1357       sem->point = 0;
1358    }
1359 
1360    for (unsigned i = 0; i < counts->timeline_syncobj_count; i++) {
1361       struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i + counts->syncobj_count];
1362       sem->handle = counts->syncobj[i + counts->syncobj_count];
1363       sem->flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
1364       sem->point = counts->points[i];
1365    }
1366 
1367    chunk->chunk_id = chunk_id;
1368    chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 *
1369                       (counts->syncobj_count + counts->timeline_syncobj_count);
1370    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1371    return syncobj;
1372 }
1373 
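/* Take count syncobjs from the per-winsys cache, creating new ones on
 * demand; protected by syncobj_lock.
 */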
1374 static int
1375 radv_amdgpu_cache_alloc_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *dst)
1376 {
1377    pthread_mutex_lock(&ws->syncobj_lock);
1378    if (count > ws->syncobj_capacity) {
1379       if (ws->syncobj_capacity > UINT32_MAX / 2)
1380          goto fail;
1381 
1382       unsigned new_capacity = MAX2(count, ws->syncobj_capacity * 2);
1383       uint32_t *n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
1384       if (!n)
1385          goto fail;
1386       ws->syncobj_capacity = new_capacity;
1387       ws->syncobj = n;
1388    }
1389 
1390    while (ws->syncobj_count < count) {
1391       int r = amdgpu_cs_create_syncobj(ws->dev, ws->syncobj + ws->syncobj_count);
1392       if (r)
1393          goto fail;
1394       ++ws->syncobj_count;
1395    }
1396 
1397    for (unsigned i = 0; i < count; ++i)
1398       dst[i] = ws->syncobj[--ws->syncobj_count];
1399 
1400    pthread_mutex_unlock(&ws->syncobj_lock);
1401    return 0;
1402 
1403 fail:
1404    pthread_mutex_unlock(&ws->syncobj_lock);
1405    return -ENOMEM;
1406 }
1407 
1408 static void
1409 radv_amdgpu_cache_free_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *src)
1410 {
1411    pthread_mutex_lock(&ws->syncobj_lock);
1412 
1413    uint32_t cache_count = MIN2(count, UINT32_MAX - ws->syncobj_count);
1414    if (cache_count + ws->syncobj_count > ws->syncobj_capacity) {
1415       unsigned new_capacity = MAX2(ws->syncobj_count + cache_count, ws->syncobj_capacity * 2);
1416       uint32_t *n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
1417       if (n) {
1418          ws->syncobj_capacity = new_capacity;
1419          ws->syncobj = n;
1420       }
1421    }
1422 
1423    for (unsigned i = 0; i < count; ++i) {
1424       if (ws->syncobj_count < ws->syncobj_capacity)
1425          ws->syncobj[ws->syncobj_count++] = src[i];
1426       else
1427          amdgpu_cs_destroy_syncobj(ws->dev, src[i]);
1428    }
1429 
1430    pthread_mutex_unlock(&ws->syncobj_lock);
1431 }
1432 
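/* With timeline syncobj support, transfer wait semaphores into temporary
 * syncobjs (with WAIT_FOR_SUBMIT semantics) so the originals can be reset
 * before the submission is built.
 */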
1433 static int
1434 radv_amdgpu_cs_prepare_syncobjs(struct radv_amdgpu_winsys *ws,
1435                                 struct radv_winsys_sem_counts *counts, uint32_t **out_syncobjs)
1436 {
1437    int r = 0;
1438 
1439    if (!ws->info.has_timeline_syncobj || !counts->syncobj_count) {
1440       *out_syncobjs = NULL;
1441       return 0;
1442    }
1443 
1444    *out_syncobjs = malloc(counts->syncobj_count * sizeof(**out_syncobjs));
1445    if (!*out_syncobjs)
1446       return -ENOMEM;
1447 
1448    r = radv_amdgpu_cache_alloc_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
1449    if (r) {
           free(*out_syncobjs);
           *out_syncobjs = NULL;
1450       return r;
        }
1451 
1452    for (unsigned i = 0; i < counts->syncobj_count; ++i) {
1453       r = amdgpu_cs_syncobj_transfer(ws->dev, (*out_syncobjs)[i], 0, counts->syncobj[i], 0,
1454                                      DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT);
1455       if (r)
1456          goto fail;
1457    }
1458 
1459    r = amdgpu_cs_syncobj_reset(ws->dev, counts->syncobj, counts->syncobj_reset_count);
1460    if (r)
1461       goto fail;
1462 
1463    return 0;
1464 fail:
1465    radv_amdgpu_cache_free_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
1466    free(*out_syncobjs);
1467    *out_syncobjs = NULL;
1468    return r;
1469 }
1470 
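/* Build and submit a raw CS: one IB chunk per IB, a user-fence chunk,
 * optional wait/signal syncobj chunks, plus either a BO-handles chunk
 * (drm_minor >= 27) or a legacy pre-created buffer list handle, all passed
 * to a single amdgpu_cs_submit_raw2() call.
 */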
1471 static VkResult
1472 radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
1473                       struct radv_winsys_sem_info *sem_info)
1474 {
1475    int r;
1476    int num_chunks;
1477    int size;
1478    struct drm_amdgpu_cs_chunk *chunks;
1479    struct drm_amdgpu_cs_chunk_data *chunk_data;
1480    bool use_bo_list_create = ctx->ws->info.drm_minor < 27;
1481    struct drm_amdgpu_bo_list_in bo_list_in;
1482    void *wait_syncobj = NULL, *signal_syncobj = NULL;
1483    uint32_t *in_syncobjs = NULL;
1484    int i;
1485    uint32_t bo_list = 0;
1486    VkResult result = VK_SUCCESS;
1487 
1488    size = request->number_of_ibs + 2 /* user fence */ + (!use_bo_list_create ? 1 : 0) + 3;
1489 
1490    chunks = malloc(sizeof(chunks[0]) * size);
1491    if (!chunks)
1492       return VK_ERROR_OUT_OF_HOST_MEMORY;
1493 
1494    size = request->number_of_ibs + 1 /* user fence */;
1495 
1496    chunk_data = malloc(sizeof(chunk_data[0]) * size);
1497    if (!chunk_data) {
1498       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1499       goto error_out;
1500    }
1501 
1502    num_chunks = request->number_of_ibs;
1503    for (i = 0; i < request->number_of_ibs; i++) {
1504       struct amdgpu_cs_ib_info *ib;
1505       chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
1506       chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1507       chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1508 
1509       ib = &request->ibs[i];
1510 
1511       chunk_data[i].ib_data._pad = 0;
1512       chunk_data[i].ib_data.va_start = ib->ib_mc_address;
1513       chunk_data[i].ib_data.ib_bytes = ib->size * 4;
1514       chunk_data[i].ib_data.ip_type = request->ip_type;
1515       chunk_data[i].ib_data.ip_instance = request->ip_instance;
1516       chunk_data[i].ib_data.ring = request->ring;
1517       chunk_data[i].ib_data.flags = ib->flags;
1518    }
1519 
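   /* Append the user fence chunk; the kernel writes the fence value into
    * ctx->fence_bo at a fixed slot per (IP type, ring) pair.
    */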
1520    i = num_chunks++;
1521    chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1522    chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1523    chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1524 
1525    struct amdgpu_cs_fence_info fence_info;
1526    fence_info.handle = radv_amdgpu_winsys_bo(ctx->fence_bo)->bo;
1527    fence_info.offset = (request->ip_type * MAX_RINGS_PER_TYPE + request->ring) * sizeof(uint64_t);
1528    amdgpu_cs_chunk_fence_info_to_data(&fence_info, &chunk_data[i]);
1529 
1530    if ((sem_info->wait.syncobj_count || sem_info->wait.timeline_syncobj_count) &&
1531        sem_info->cs_emit_wait) {
1532       r = radv_amdgpu_cs_prepare_syncobjs(ctx->ws, &sem_info->wait, &in_syncobjs);
1533       if (r)
1534          goto error_out;
1535 
1536       if (ctx->ws->info.has_timeline_syncobj) {
1537          wait_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
1538             &sem_info->wait, in_syncobjs, &chunks[num_chunks],
1539             AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT);
1540       } else {
1541          wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(
1542             &sem_info->wait, in_syncobjs, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_IN);
1543       }
1544       if (!wait_syncobj) {
1545          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1546          goto error_out;
1547       }
1548       num_chunks++;
1549 
1550       sem_info->cs_emit_wait = false;
1551    }
1552 
1553    if ((sem_info->signal.syncobj_count || sem_info->signal.timeline_syncobj_count) &&
1554        sem_info->cs_emit_signal) {
1555       if (ctx->ws->info.has_timeline_syncobj) {
1556          signal_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
1557             &sem_info->signal, NULL, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
1558       } else {
1559          signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(
1560             &sem_info->signal, NULL, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
1561       }
1562       if (!signal_syncobj) {
1563          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1564          goto error_out;
1565       }
1566       num_chunks++;
1567    }
1568 
1569    if (use_bo_list_create) {
1570       /* Legacy path creating the buffer list handle and passing it
1571        * to the CS ioctl.
1572        */
1573       r = amdgpu_bo_list_create_raw(ctx->ws->dev, request->num_handles,
1574                                     request->handles, &bo_list);
1575       if (r) {
1576          if (r == -ENOMEM) {
1577             fprintf(stderr, "amdgpu: Not enough memory for buffer list creation.\n");
1578             result = VK_ERROR_OUT_OF_HOST_MEMORY;
1579          } else {
1580             fprintf(stderr, "amdgpu: buffer list creation failed (%d).\n", r);
1581             result = VK_ERROR_UNKNOWN;
1582          }
1583          goto error_out;
1584       }
1585    } else {
1586       /* Standard path passing the buffer list via the CS ioctl. */
1587       bo_list_in.operation = ~0;
1588       bo_list_in.list_handle = ~0;
1589       bo_list_in.bo_number = request->num_handles;
1590       bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1591       bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)request->handles;
1592 
1593       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1594       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1595       chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1596       num_chunks++;
1597    }
1598 
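   /* Submit everything in one CS ioctl; seq_no returns the submission's
    * sequence number for later fence waits.
    */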
1599    r = amdgpu_cs_submit_raw2(ctx->ws->dev, ctx->ctx, bo_list, num_chunks, chunks, &request->seq_no);
1600 
1601    if (r) {
1602       if (r == -ENOMEM) {
1603          fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
1604          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1605       } else if (r == -ECANCELED) {
1606          fprintf(stderr, "amdgpu: The CS has been cancelled because the context is lost.\n");
1607          result = VK_ERROR_DEVICE_LOST;
1608       } else {
1609          fprintf(stderr,
1610                  "amdgpu: The CS has been rejected, "
1611                  "see dmesg for more information (%i).\n",
1612                  r);
1613          result = VK_ERROR_UNKNOWN;
1614       }
1615    }
1616 
1617    if (bo_list)
1618       amdgpu_bo_list_destroy_raw(ctx->ws->dev, bo_list);
1619 
1620 error_out:
1621    if (in_syncobjs) {
1622       radv_amdgpu_cache_free_syncobjs(ctx->ws, sem_info->wait.syncobj_count, in_syncobjs);
1623       free(in_syncobjs);
1624    }
1625    free(chunks);
1626    free(chunk_data);
1627    free(wait_syncobj);
1628    free(signal_syncobj);
1629    return result;
1630 }
1631 
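/* The remaining entry points are thin wrappers over the libdrm_amdgpu syncobj
 * helpers; they are hooked into the winsys function table at the bottom of
 * this file.
 */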
1632 static int
1633 radv_amdgpu_create_syncobj(struct radeon_winsys *_ws, bool create_signaled, uint32_t *handle)
1634 {
1635    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1636    uint32_t flags = 0;
1637 
1638    if (create_signaled)
1639       flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
1640 
1641    return amdgpu_cs_create_syncobj2(ws->dev, flags, handle);
1642 }
1643 
1644 static void
1645 radv_amdgpu_destroy_syncobj(struct radeon_winsys *_ws, uint32_t handle)
1646 {
1647    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1648    amdgpu_cs_destroy_syncobj(ws->dev, handle);
1649 }
1650 
1651 static void
1652 radv_amdgpu_reset_syncobj(struct radeon_winsys *_ws, uint32_t handle)
1653 {
1654    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1655    amdgpu_cs_syncobj_reset(ws->dev, &handle, 1);
1656 }
1657 
1658 static void
1659 radv_amdgpu_signal_syncobj(struct radeon_winsys *_ws, uint32_t handle, uint64_t point)
1660 {
1661    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1662    if (point)
1663       amdgpu_cs_syncobj_timeline_signal(ws->dev, &handle, &point, 1);
1664    else
1665       amdgpu_cs_syncobj_signal(ws->dev, &handle, 1);
1666 }
1667 
1668 static VkResult
1669 radv_amdgpu_query_syncobj(struct radeon_winsys *_ws, uint32_t handle, uint64_t *point)
1670 {
1671    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1672    int ret = amdgpu_cs_syncobj_query(ws->dev, &handle, point, 1);
1673    if (ret == 0)
1674       return VK_SUCCESS;
1675    else if (ret == -ENOMEM)
1676       return VK_ERROR_OUT_OF_HOST_MEMORY;
1677    else {
1678       /* Remaining errors are driver-internal issues: EFAULT for
1679        * dangling pointers and ENOENT for a non-existent syncobj. */
1680       fprintf(stderr, "amdgpu: internal error in radv_amdgpu_query_syncobj. (%d)\n", ret);
1681       return VK_ERROR_UNKNOWN;
1682    }
1683 }
1684 
1685 static bool
1686 radv_amdgpu_wait_syncobj(struct radeon_winsys *_ws, const uint32_t *handles, uint32_t handle_count,
1687                          bool wait_all, uint64_t timeout)
1688 {
1689    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1690    uint32_t tmp;
1691 
1692    /* The kernel timeouts are signed 64-bit values, while Vulkan timeouts are unsigned, so clamp. */
1693    timeout = MIN2(timeout, INT64_MAX);
1694 
1695    int ret = amdgpu_cs_syncobj_wait(
1696       ws->dev, (uint32_t *)handles, handle_count, timeout,
1697       DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | (wait_all ? DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL : 0),
1698       &tmp);
1699    if (ret == 0) {
1700       return true;
1701    } else if (ret == -ETIME) {
1702       return false;
1703    } else {
1704       fprintf(stderr, "amdgpu: radv_amdgpu_wait_syncobj failed! (%d)\n", ret);
1705       return false;
1706    }
1707 }
1708 
1709 static bool
1710 radv_amdgpu_wait_timeline_syncobj(struct radeon_winsys *_ws, const uint32_t *handles,
1711                                   const uint64_t *points, uint32_t handle_count, bool wait_all,
1712                                   bool available, uint64_t timeout)
1713 {
1714    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1715 
1716    /* The kernel timeouts are signed 64-bit values, while Vulkan timeouts are unsigned, so clamp. */
1717    timeout = MIN2(timeout, INT64_MAX);
1718 
1719    int ret = amdgpu_cs_syncobj_timeline_wait(
1720       ws->dev, (uint32_t *)handles, (uint64_t *)points, handle_count, timeout,
1721       DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | (wait_all ? DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL : 0) |
1722          (available ? DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE : 0),
1723       NULL);
1724    if (ret == 0) {
1725       return true;
1726    } else if (ret == -ETIME) {
1727       return false;
1728    } else {
1729       fprintf(stderr, "amdgpu: radv_amdgpu_wait_timeline_syncobj failed! (%d)\n", ret);
1730       return false;
1731    }
1732 }
1733 
1734 static int
1735 radv_amdgpu_export_syncobj(struct radeon_winsys *_ws, uint32_t syncobj, int *fd)
1736 {
1737    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1738 
1739    return amdgpu_cs_export_syncobj(ws->dev, syncobj, fd);
1740 }
1741 
1742 static int
1743 radv_amdgpu_import_syncobj(struct radeon_winsys *_ws, int fd, uint32_t *syncobj)
1744 {
1745    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1746 
1747    return amdgpu_cs_import_syncobj(ws->dev, fd, syncobj);
1748 }
1749 
1750 static int
1751 radv_amdgpu_export_syncobj_to_sync_file(struct radeon_winsys *_ws, uint32_t syncobj, int *fd)
1752 {
1753    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1754 
1755    return amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, fd);
1756 }
1757 
1758 static int
1759 radv_amdgpu_import_syncobj_from_sync_file(struct radeon_winsys *_ws, uint32_t syncobj, int fd)
1760 {
1761    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1762 
1763    return amdgpu_cs_syncobj_import_sync_file(ws->dev, syncobj, fd);
1764 }
1765 
1766 void
1767 radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
1768 {
1769    ws->base.ctx_create = radv_amdgpu_ctx_create;
1770    ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
1771    ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
1772    ws->base.cs_domain = radv_amdgpu_cs_domain;
1773    ws->base.cs_create = radv_amdgpu_cs_create;
1774    ws->base.cs_destroy = radv_amdgpu_cs_destroy;
1775    ws->base.cs_grow = radv_amdgpu_cs_grow;
1776    ws->base.cs_finalize = radv_amdgpu_cs_finalize;
1777    ws->base.cs_reset = radv_amdgpu_cs_reset;
1778    ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
1779    ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
1780    ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
1781    ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
1782    ws->base.create_syncobj = radv_amdgpu_create_syncobj;
1783    ws->base.destroy_syncobj = radv_amdgpu_destroy_syncobj;
1784    ws->base.reset_syncobj = radv_amdgpu_reset_syncobj;
1785    ws->base.signal_syncobj = radv_amdgpu_signal_syncobj;
1786    ws->base.query_syncobj = radv_amdgpu_query_syncobj;
1787    ws->base.wait_syncobj = radv_amdgpu_wait_syncobj;
1788    ws->base.wait_timeline_syncobj = radv_amdgpu_wait_timeline_syncobj;
1789    ws->base.export_syncobj = radv_amdgpu_export_syncobj;
1790    ws->base.import_syncobj = radv_amdgpu_import_syncobj;
1791    ws->base.export_syncobj_to_sync_file = radv_amdgpu_export_syncobj_to_sync_file;
1792    ws->base.import_syncobj_from_sync_file = radv_amdgpu_import_syncobj_from_sync_file;
1793 }
1794