1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 /**
24  * @file crocus_batch.c
25  *
26  * Batchbuffer and command submission module.
27  *
28  * Every API draw call results in a number of GPU commands, which we
29  * collect into a "batch buffer".  Typically, many draw calls are grouped
30  * into a single batch to amortize command submission overhead.
31  *
32  * We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl.
33  * One critical piece of data is the "validation list", which contains a
34  * list of the buffer objects (BOs) which the commands in the GPU need.
35  * The kernel will make sure these are resident and pinned at the correct
36  * virtual memory address before executing our batch.  If a BO is not in
37  * the validation list, it effectively does not exist, so take care.
38  */
39 
40 #include "crocus_batch.h"
41 #include "crocus_bufmgr.h"
42 #include "crocus_context.h"
43 #include "crocus_fence.h"
44 
45 #include "drm-uapi/i915_drm.h"
46 
47 #include "intel/common/intel_gem.h"
48 #include "main/macros.h"
49 #include "util/hash_table.h"
50 #include "util/set.h"
51 #include "util/u_upload_mgr.h"
52 
53 #include <errno.h>
54 #include <xf86drm.h>
55 
56 #if HAVE_VALGRIND
57 #include <memcheck.h>
58 #include <valgrind.h>
59 #define VG(x) x
60 #else
61 #define VG(x)
62 #endif
63 
64 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
65 
/* Bytes held back at the tail of the command buffer for terminating it.
 *
 * Ending a batch takes 4 bytes for MI_BATCH_BUFFER_END, or 12 bytes for
 * MI_BATCH_BUFFER_START (when chaining), plus possibly 4 more bytes of
 * padding to reach the nearest QWord: 16 bytes in total.  On Haswell the
 * reservation is doubled to 32 bytes.
 */
#define BATCH_RESERVED(devinfo) (!(devinfo)->is_haswell ? 16 : 32)
71 
72 static void crocus_batch_reset(struct crocus_batch *batch);
73 
74 static unsigned
num_fences(struct crocus_batch * batch)75 num_fences(struct crocus_batch *batch)
76 {
77    return util_dynarray_num_elements(&batch->exec_fences,
78                                      struct drm_i915_gem_exec_fence);
79 }
80 
81 /**
82  * Debugging code to dump the fence list, used by INTEL_DEBUG=submit.
83  */
84 static void
dump_fence_list(struct crocus_batch * batch)85 dump_fence_list(struct crocus_batch *batch)
86 {
87    fprintf(stderr, "Fence list (length %u):      ", num_fences(batch));
88 
89    util_dynarray_foreach(&batch->exec_fences,
90                          struct drm_i915_gem_exec_fence, f) {
91       fprintf(stderr, "%s%u%s ",
92               (f->flags & I915_EXEC_FENCE_WAIT) ? "..." : "",
93               f->handle,
94               (f->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : "");
95    }
96 
97    fprintf(stderr, "\n");
98 }
99 
100 /**
101  * Debugging code to dump the validation list, used by INTEL_DEBUG=submit.
102  */
103 static void
dump_validation_list(struct crocus_batch * batch)104 dump_validation_list(struct crocus_batch *batch)
105 {
106    fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
107 
108    for (int i = 0; i < batch->exec_count; i++) {
109       uint64_t flags = batch->validation_list[i].flags;
110       assert(batch->validation_list[i].handle ==
111              batch->exec_bos[i]->gem_handle);
112       fprintf(stderr,
113               "[%2d]: %2d %-14s @ 0x%"PRIx64" (%" PRIu64 "B)\t %2d refs %s\n", i,
114               batch->validation_list[i].handle, batch->exec_bos[i]->name,
115               (uint64_t)batch->validation_list[i].offset, batch->exec_bos[i]->size,
116               batch->exec_bos[i]->refcount,
117               (flags & EXEC_OBJECT_WRITE) ? " (write)" : "");
118    }
119 }
120 
121 /**
122  * Return BO information to the batch decoder (for debugging).
123  */
124 static struct intel_batch_decode_bo
decode_get_bo(void * v_batch,bool ppgtt,uint64_t address)125 decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
126 {
127    struct crocus_batch *batch = v_batch;
128 
129    for (int i = 0; i < batch->exec_count; i++) {
130       struct crocus_bo *bo = batch->exec_bos[i];
131       /* The decoder zeroes out the top 16 bits, so we need to as well */
132       uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
133 
134       if (address >= bo_address && address < bo_address + bo->size) {
135          return (struct intel_batch_decode_bo){
136             .addr = address,
137             .size = bo->size,
138             .map = crocus_bo_map(batch->dbg, bo, MAP_READ) +
139                    (address - bo_address),
140          };
141       }
142    }
143 
144    return (struct intel_batch_decode_bo) { };
145 }
146 
147 static unsigned
decode_get_state_size(void * v_batch,uint64_t address,uint64_t base_address)148 decode_get_state_size(void *v_batch, uint64_t address,
149                       uint64_t base_address)
150 {
151    struct crocus_batch *batch = v_batch;
152 
153    /* The decoder gives us offsets from a base address, which is not great.
154     * Binding tables are relative to surface state base address, and other
155     * state is relative to dynamic state base address.  These could alias,
156     * but in practice it's unlikely because surface offsets are always in
157     * the [0, 64K) range, and we assign dynamic state addresses starting at
158     * the top of the 4GB range.  We should fix this but it's likely good
159     * enough for now.
160     */
161    unsigned size = (uintptr_t)
162       _mesa_hash_table_u64_search(batch->state_sizes, address - base_address);
163 
164    return size;
165 }
166 
167 /**
168  * Decode the current batch.
169  */
170 static void
decode_batch(struct crocus_batch * batch)171 decode_batch(struct crocus_batch *batch)
172 {
173    void *map = crocus_bo_map(batch->dbg, batch->exec_bos[0], MAP_READ);
174    intel_print_batch(&batch->decoder, map, batch->primary_batch_size,
175                      batch->exec_bos[0]->gtt_offset, false);
176 }
177 
178 static void
init_reloc_list(struct crocus_reloc_list * rlist,int count)179 init_reloc_list(struct crocus_reloc_list *rlist, int count)
180 {
181    rlist->reloc_count = 0;
182    rlist->reloc_array_size = count;
183    rlist->relocs = malloc(rlist->reloc_array_size *
184                           sizeof(struct drm_i915_gem_relocation_entry));
185 }
186 
187 void
crocus_init_batch(struct crocus_context * ice,enum crocus_batch_name name,int priority)188 crocus_init_batch(struct crocus_context *ice,
189                   enum crocus_batch_name name,
190                   int priority)
191 {
192    struct crocus_batch *batch = &ice->batches[name];
193    struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
194    struct intel_device_info *devinfo = &screen->devinfo;
195 
196    batch->ice = ice;
197    batch->screen = screen;
198    batch->dbg = &ice->dbg;
199    batch->reset = &ice->reset;
200    batch->name = name;
201    batch->contains_fence_signal = false;
202 
203    if (devinfo->ver >= 7) {
204       batch->fine_fences.uploader =
205          u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM,
206                          PIPE_USAGE_STAGING, 0);
207    }
208    crocus_fine_fence_init(batch);
209 
210    batch->hw_ctx_id = crocus_create_hw_context(screen->bufmgr);
211    assert(batch->hw_ctx_id);
212 
213    crocus_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
214 
215    batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
216    if (devinfo->ver == 6)
217       batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
218 
219    if (INTEL_DEBUG(DEBUG_BATCH)) {
220       /* The shadow doesn't get relocs written so state decode fails. */
221       batch->use_shadow_copy = false;
222    } else
223       batch->use_shadow_copy = !devinfo->has_llc;
224 
225    util_dynarray_init(&batch->exec_fences, ralloc_context(NULL));
226    util_dynarray_init(&batch->syncobjs, ralloc_context(NULL));
227 
228    init_reloc_list(&batch->command.relocs, 250);
229    init_reloc_list(&batch->state.relocs, 250);
230 
231    batch->exec_count = 0;
232    batch->exec_array_size = 100;
233    batch->exec_bos =
234       malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
235    batch->validation_list =
236       malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
237 
238    batch->cache.render = _mesa_hash_table_create(NULL, NULL,
239                                                  _mesa_key_pointer_equal);
240    batch->cache.depth = _mesa_set_create(NULL, NULL,
241                                          _mesa_key_pointer_equal);
242 
243    memset(batch->other_batches, 0, sizeof(batch->other_batches));
244 
245    for (int i = 0, j = 0; i < ice->batch_count; i++) {
246       if (i != name)
247          batch->other_batches[j++] = &ice->batches[i];
248    }
249 
250    if (INTEL_DEBUG(DEBUG_BATCH)) {
251 
252       batch->state_sizes = _mesa_hash_table_u64_create(NULL);
253       const unsigned decode_flags =
254          INTEL_BATCH_DECODE_FULL |
255          (INTEL_DEBUG(DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
256          INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS;
257 
258       intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo, stderr,
259                                   decode_flags, NULL, decode_get_bo,
260                                   decode_get_state_size, batch);
261       batch->decoder.max_vbo_decoded_lines = 32;
262    }
263 
264    crocus_batch_reset(batch);
265 }
266 
267 static int
find_exec_index(struct crocus_batch * batch,struct crocus_bo * bo)268 find_exec_index(struct crocus_batch *batch, struct crocus_bo *bo)
269 {
270    unsigned index = READ_ONCE(bo->index);
271 
272    if (index < batch->exec_count && batch->exec_bos[index] == bo)
273       return index;
274 
275    /* May have been shared between multiple active batches */
276    for (index = 0; index < batch->exec_count; index++) {
277       if (batch->exec_bos[index] == bo)
278 	 return index;
279    }
280    return -1;
281 }
282 
283 static struct drm_i915_gem_exec_object2 *
find_validation_entry(struct crocus_batch * batch,struct crocus_bo * bo)284 find_validation_entry(struct crocus_batch *batch, struct crocus_bo *bo)
285 {
286    int index = find_exec_index(batch, bo);
287 
288    if (index == -1)
289       return NULL;
290    return &batch->validation_list[index];
291 }
292 
293 static void
ensure_exec_obj_space(struct crocus_batch * batch,uint32_t count)294 ensure_exec_obj_space(struct crocus_batch *batch, uint32_t count)
295 {
296    while (batch->exec_count + count > batch->exec_array_size) {
297       batch->exec_array_size *= 2;
298       batch->exec_bos = realloc(
299          batch->exec_bos, batch->exec_array_size * sizeof(batch->exec_bos[0]));
300       batch->validation_list =
301          realloc(batch->validation_list,
302                  batch->exec_array_size * sizeof(batch->validation_list[0]));
303    }
304 }
305 
306 static struct drm_i915_gem_exec_object2 *
crocus_use_bo(struct crocus_batch * batch,struct crocus_bo * bo,bool writable)307 crocus_use_bo(struct crocus_batch *batch, struct crocus_bo *bo, bool writable)
308 {
309    assert(bo->bufmgr == batch->command.bo->bufmgr);
310 
311    struct drm_i915_gem_exec_object2 *existing_entry =
312       find_validation_entry(batch, bo);
313 
314    if (existing_entry) {
315       /* The BO is already in the validation list; mark it writable */
316       if (writable)
317          existing_entry->flags |= EXEC_OBJECT_WRITE;
318       return existing_entry;
319    }
320 
321    if (bo != batch->command.bo && bo != batch->state.bo) {
322       /* This is the first time our batch has seen this BO.  Before we use it,
323        * we may need to flush and synchronize with other batches.
324        */
325       for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
326 
327          if (!batch->other_batches[b])
328             continue;
329          struct drm_i915_gem_exec_object2 *other_entry =
330             find_validation_entry(batch->other_batches[b], bo);
331 
332          /* If the buffer is referenced by another batch, and either batch
333           * intends to write it, then flush the other batch and synchronize.
334           *
335           * Consider these cases:
336           *
337           * 1. They read, we read   =>  No synchronization required.
338           * 2. They read, we write  =>  Synchronize (they need the old value)
339           * 3. They write, we read  =>  Synchronize (we need their new value)
340           * 4. They write, we write =>  Synchronize (order writes)
341           *
342           * The read/read case is very common, as multiple batches usually
343           * share a streaming state buffer or shader assembly buffer, and
344           * we want to avoid synchronizing in this case.
345           */
346          if (other_entry &&
347              ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) {
348             crocus_batch_flush(batch->other_batches[b]);
349             crocus_batch_add_syncobj(batch,
350                                      batch->other_batches[b]->last_fence->syncobj,
351                                      I915_EXEC_FENCE_WAIT);
352          }
353       }
354    }
355 
356    /* Bump the ref count since the batch is now using this bo. */
357    crocus_bo_reference(bo);
358 
359    ensure_exec_obj_space(batch, 1);
360 
361    batch->validation_list[batch->exec_count] =
362       (struct drm_i915_gem_exec_object2) {
363          .handle = bo->gem_handle,
364          .offset = bo->gtt_offset,
365          .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0),
366       };
367 
368    bo->index = batch->exec_count;
369    batch->exec_bos[batch->exec_count] = bo;
370    batch->aperture_space += bo->size;
371 
372    batch->exec_count++;
373 
374    return &batch->validation_list[batch->exec_count - 1];
375 }
376 
377 static uint64_t
emit_reloc(struct crocus_batch * batch,struct crocus_reloc_list * rlist,uint32_t offset,struct crocus_bo * target,int32_t target_offset,unsigned int reloc_flags)378 emit_reloc(struct crocus_batch *batch,
379            struct crocus_reloc_list *rlist, uint32_t offset,
380            struct crocus_bo *target, int32_t target_offset,
381            unsigned int reloc_flags)
382 {
383    assert(target != NULL);
384 
385    if (target == batch->ice->workaround_bo)
386       reloc_flags &= ~RELOC_WRITE;
387 
388    bool writable = reloc_flags & RELOC_WRITE;
389 
390    struct drm_i915_gem_exec_object2 *entry =
391       crocus_use_bo(batch, target, writable);
392 
393    if (rlist->reloc_count == rlist->reloc_array_size) {
394       rlist->reloc_array_size *= 2;
395       rlist->relocs = realloc(rlist->relocs,
396                               rlist->reloc_array_size *
397                               sizeof(struct drm_i915_gem_relocation_entry));
398    }
399 
400    if (reloc_flags & RELOC_32BIT) {
401       /* Restrict this buffer to the low 32 bits of the address space.
402        *
403        * Altering the validation list flags restricts it for this batch,
404        * but we also alter the BO's kflags to restrict it permanently
405        * (until the BO is destroyed and put back in the cache).  Buffers
406        * may stay bound across batches, and we want keep it constrained.
407        */
408       target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
409       entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
410 
411       /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
412       reloc_flags &= ~RELOC_32BIT;
413    }
414 
415    if (reloc_flags)
416       entry->flags |= reloc_flags & batch->valid_reloc_flags;
417 
418    rlist->relocs[rlist->reloc_count++] =
419       (struct drm_i915_gem_relocation_entry) {
420          .offset = offset,
421          .delta = target_offset,
422          .target_handle = find_exec_index(batch, target),
423          .presumed_offset = entry->offset,
424       };
425 
426    /* Using the old buffer offset, write in what the right data would be, in
427     * case the buffer doesn't move and we can short-circuit the relocation
428     * processing in the kernel
429     */
430    return entry->offset + target_offset;
431 }
432 
433 uint64_t
crocus_command_reloc(struct crocus_batch * batch,uint32_t batch_offset,struct crocus_bo * target,uint32_t target_offset,unsigned int reloc_flags)434 crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
435                      struct crocus_bo *target, uint32_t target_offset,
436                      unsigned int reloc_flags)
437 {
438    assert(batch_offset <= batch->command.bo->size - sizeof(uint32_t));
439 
440    return emit_reloc(batch, &batch->command.relocs, batch_offset,
441                      target, target_offset, reloc_flags);
442 }
443 
444 uint64_t
crocus_state_reloc(struct crocus_batch * batch,uint32_t state_offset,struct crocus_bo * target,uint32_t target_offset,unsigned int reloc_flags)445 crocus_state_reloc(struct crocus_batch *batch, uint32_t state_offset,
446                    struct crocus_bo *target, uint32_t target_offset,
447                    unsigned int reloc_flags)
448 {
449    assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
450 
451    return emit_reloc(batch, &batch->state.relocs, state_offset,
452                      target, target_offset, reloc_flags);
453 }
454 
455 static void
recreate_growing_buffer(struct crocus_batch * batch,struct crocus_growing_bo * grow,const char * name,unsigned size)456 recreate_growing_buffer(struct crocus_batch *batch,
457                         struct crocus_growing_bo *grow,
458                         const char *name, unsigned size)
459 {
460    struct crocus_screen *screen = batch->screen;
461    struct crocus_bufmgr *bufmgr = screen->bufmgr;
462    grow->bo = crocus_bo_alloc(bufmgr, name, size);
463    grow->bo->kflags |= EXEC_OBJECT_CAPTURE;
464    grow->partial_bo = NULL;
465    grow->partial_bo_map = NULL;
466    grow->partial_bytes = 0;
467    if (batch->use_shadow_copy)
468       grow->map = realloc(grow->map, grow->bo->size);
469    else
470       grow->map = crocus_bo_map(NULL, grow->bo, MAP_READ | MAP_WRITE);
471    grow->map_next = grow->map;
472 }
473 
474 static void
create_batch(struct crocus_batch * batch)475 create_batch(struct crocus_batch *batch)
476 {
477    struct crocus_screen *screen = batch->screen;
478 
479    recreate_growing_buffer(batch, &batch->command,
480                            "command buffer",
481                            BATCH_SZ + BATCH_RESERVED(&screen->devinfo));
482 
483    crocus_use_bo(batch, batch->command.bo, false);
484 
485    /* Always add workaround_bo which contains a driver identifier to be
486     * recorded in error states.
487     */
488    crocus_use_bo(batch, batch->ice->workaround_bo, false);
489 
490    recreate_growing_buffer(batch, &batch->state,
491                            "state buffer",
492                            STATE_SZ);
493 
494    batch->state.used = 1;
495    crocus_use_bo(batch, batch->state.bo, false);
496 }
497 
498 static void
crocus_batch_maybe_noop(struct crocus_batch * batch)499 crocus_batch_maybe_noop(struct crocus_batch *batch)
500 {
501    /* We only insert the NOOP at the beginning of the batch. */
502    assert(crocus_batch_bytes_used(batch) == 0);
503 
504    if (batch->noop_enabled) {
505       /* Emit MI_BATCH_BUFFER_END to prevent any further command to be
506        * executed.
507        */
508       uint32_t *map = batch->command.map_next;
509 
510       map[0] = (0xA << 23);
511 
512       batch->command.map_next += 4;
513    }
514 }
515 
516 static void
crocus_batch_reset(struct crocus_batch * batch)517 crocus_batch_reset(struct crocus_batch *batch)
518 {
519    struct crocus_screen *screen = batch->screen;
520 
521    crocus_bo_unreference(batch->command.bo);
522    crocus_bo_unreference(batch->state.bo);
523    batch->primary_batch_size = 0;
524    batch->contains_draw = false;
525    batch->contains_fence_signal = false;
526    batch->state_base_address_emitted = false;
527    batch->screen->vtbl.batch_reset_dirty(batch);
528 
529    create_batch(batch);
530    assert(batch->command.bo->index == 0);
531 
532    if (batch->state_sizes)
533       _mesa_hash_table_u64_clear(batch->state_sizes);
534    struct crocus_syncobj *syncobj = crocus_create_syncobj(screen);
535    crocus_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL);
536    crocus_syncobj_reference(screen, &syncobj, NULL);
537 
538    crocus_cache_sets_clear(batch);
539 }
540 
541 void
crocus_batch_free(struct crocus_batch * batch)542 crocus_batch_free(struct crocus_batch *batch)
543 {
544    struct crocus_screen *screen = batch->screen;
545    struct crocus_bufmgr *bufmgr = screen->bufmgr;
546 
547    if (batch->use_shadow_copy) {
548       free(batch->command.map);
549       free(batch->state.map);
550    }
551 
552    for (int i = 0; i < batch->exec_count; i++) {
553       crocus_bo_unreference(batch->exec_bos[i]);
554    }
555 
556    pipe_resource_reference(&batch->fine_fences.ref.res, NULL);
557 
558    free(batch->command.relocs.relocs);
559    free(batch->state.relocs.relocs);
560    free(batch->exec_bos);
561    free(batch->validation_list);
562 
563    ralloc_free(batch->exec_fences.mem_ctx);
564 
565    util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
566       crocus_syncobj_reference(screen, s, NULL);
567    ralloc_free(batch->syncobjs.mem_ctx);
568 
569    crocus_fine_fence_reference(batch->screen, &batch->last_fence, NULL);
570    if (batch_has_fine_fence(batch))
571       u_upload_destroy(batch->fine_fences.uploader);
572 
573    crocus_bo_unreference(batch->command.bo);
574    crocus_bo_unreference(batch->state.bo);
575    batch->command.bo = NULL;
576    batch->command.map = NULL;
577    batch->command.map_next = NULL;
578 
579    crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
580 
581    _mesa_hash_table_destroy(batch->cache.render, NULL);
582    _mesa_set_destroy(batch->cache.depth, NULL);
583 
584    if (batch->state_sizes) {
585       _mesa_hash_table_u64_destroy(batch->state_sizes);
586       intel_batch_decode_ctx_finish(&batch->decoder);
587    }
588 }
589 
590 /**
591  * If we've chained to a secondary batch, or are getting near to the end,
592  * then flush.  This should only be called between draws.
593  */
594 void
crocus_batch_maybe_flush(struct crocus_batch * batch,unsigned estimate)595 crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate)
596 {
597    if (batch->command.bo != batch->exec_bos[0] ||
598        crocus_batch_bytes_used(batch) + estimate >= BATCH_SZ) {
599       crocus_batch_flush(batch);
600    }
601 }
602 
603 /**
604  * Finish copying the old batch/state buffer's contents to the new one
605  * after we tried to "grow" the buffer in an earlier operation.
606  */
607 static void
finish_growing_bos(struct crocus_growing_bo * grow)608 finish_growing_bos(struct crocus_growing_bo *grow)
609 {
610    struct crocus_bo *old_bo = grow->partial_bo;
611    if (!old_bo)
612       return;
613 
614    memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
615 
616    grow->partial_bo = NULL;
617    grow->partial_bo_map = NULL;
618    grow->partial_bytes = 0;
619 
620    crocus_bo_unreference(old_bo);
621 }
622 
623 void
crocus_grow_buffer(struct crocus_batch * batch,bool grow_state,unsigned used,unsigned new_size)624 crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
625                    unsigned used,
626                    unsigned new_size)
627 {
628    struct crocus_screen *screen = batch->screen;
629    struct crocus_bufmgr *bufmgr = screen->bufmgr;
630    struct crocus_growing_bo *grow = grow_state ? &batch->state : &batch->command;
631    struct crocus_bo *bo = grow->bo;
632 
633    if (grow->partial_bo) {
634       /* We've already grown once, and now we need to do it again.
635        * Finish our last grow operation so we can start a new one.
636        * This should basically never happen.
637        */
638       finish_growing_bos(grow);
639    }
640 
641    struct crocus_bo *new_bo = crocus_bo_alloc(bufmgr, bo->name, new_size);
642 
643    /* Copy existing data to the new larger buffer */
644    grow->partial_bo_map = grow->map;
645 
646    if (batch->use_shadow_copy) {
647       /* We can't safely use realloc, as it may move the existing buffer,
648        * breaking existing pointers the caller may still be using.  Just
649        * malloc a new copy and memcpy it like the normal BO path.
650        *
651        * Use bo->size rather than new_size because the bufmgr may have
652        * rounded up the size, and we want the shadow size to match.
653        */
654       grow->map = malloc(new_bo->size);
655    } else {
656       grow->map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE);
657    }
658    /* Try to put the new BO at the same GTT offset as the old BO (which
659     * we're throwing away, so it doesn't need to be there).
660     *
661     * This guarantees that our relocations continue to work: values we've
662     * already written into the buffer, values we're going to write into the
663     * buffer, and the validation/relocation lists all will match.
664     *
665     * Also preserve kflags for EXEC_OBJECT_CAPTURE.
666     */
667    new_bo->gtt_offset = bo->gtt_offset;
668    new_bo->index = bo->index;
669    new_bo->kflags = bo->kflags;
670 
671    /* Batch/state buffers are per-context, and if we've run out of space,
672     * we must have actually used them before, so...they will be in the list.
673     */
674    assert(bo->index < batch->exec_count);
675    assert(batch->exec_bos[bo->index] == bo);
676 
677    /* Update the validation list to use the new BO. */
678    batch->validation_list[bo->index].handle = new_bo->gem_handle;
679    /* Exchange the two BOs...without breaking pointers to the old BO.
680     *
681     * Consider this scenario:
682     *
683     * 1. Somebody calls brw_state_batch() to get a region of memory, and
684     *    and then creates a brw_address pointing to brw->batch.state.bo.
685     * 2. They then call brw_state_batch() a second time, which happens to
686     *    grow and replace the state buffer.  They then try to emit a
687     *    relocation to their first section of memory.
688     *
689     * If we replace the brw->batch.state.bo pointer at step 2, we would
690     * break the address created in step 1.  They'd have a pointer to the
691     * old destroyed BO.  Emitting a relocation would add this dead BO to
692     * the validation list...causing /both/ statebuffers to be in the list,
693     * and all kinds of disasters.
694     *
695     * This is not a contrived case - BLORP vertex data upload hits this.
696     *
697     * There are worse scenarios too.  Fences for GL sync objects reference
698     * brw->batch.batch.bo.  If we replaced the batch pointer when growing,
699     * we'd need to chase down every fence and update it to point to the
700     * new BO.  Otherwise, it would refer to a "batch" that never actually
701     * gets submitted, and would fail to trigger.
702     *
703     * To work around both of these issues, we transmutate the buffers in
704     * place, making the existing struct brw_bo represent the new buffer,
705     * and "new_bo" represent the old BO.  This is highly unusual, but it
706     * seems like a necessary evil.
707     *
708     * We also defer the memcpy of the existing batch's contents.  Callers
709     * may make multiple brw_state_batch calls, and retain pointers to the
710     * old BO's map.  We'll perform the memcpy in finish_growing_bo() when
711     * we finally submit the batch, at which point we've finished uploading
712     * state, and nobody should have any old references anymore.
713     *
714     * To do that, we keep a reference to the old BO in grow->partial_bo,
715     * and store the number of bytes to copy in grow->partial_bytes.  We
716     * can monkey with the refcounts directly without atomics because these
717     * are per-context BOs and they can only be touched by this thread.
718     */
719    assert(new_bo->refcount == 1);
720    new_bo->refcount = bo->refcount;
721    bo->refcount = 1;
722 
723    struct crocus_bo tmp;
724    memcpy(&tmp, bo, sizeof(struct crocus_bo));
725    memcpy(bo, new_bo, sizeof(struct crocus_bo));
726    memcpy(new_bo, &tmp, sizeof(struct crocus_bo));
727 
728    grow->partial_bo = new_bo; /* the one reference of the OLD bo */
729    grow->partial_bytes = used;
730 }
731 
732 static void
finish_seqno(struct crocus_batch * batch)733 finish_seqno(struct crocus_batch *batch)
734 {
735    struct crocus_fine_fence *sq = crocus_fine_fence_new(batch, CROCUS_FENCE_END);
736    if (!sq)
737       return;
738 
739    crocus_fine_fence_reference(batch->screen, &batch->last_fence, sq);
740    crocus_fine_fence_reference(batch->screen, &sq, NULL);
741 }
742 
743 /**
744  * Terminate a batch with MI_BATCH_BUFFER_END.
745  */
746 static void
crocus_finish_batch(struct crocus_batch * batch)747 crocus_finish_batch(struct crocus_batch *batch)
748 {
749 
750    batch->no_wrap = true;
751    if (batch->screen->vtbl.finish_batch)
752       batch->screen->vtbl.finish_batch(batch);
753 
754    finish_seqno(batch);
755 
756    /* Emit MI_BATCH_BUFFER_END to finish our batch. */
757    uint32_t *map = batch->command.map_next;
758 
759    map[0] = (0xA << 23);
760 
761    batch->command.map_next += 4;
762    VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->command.map, crocus_batch_bytes_used(batch)));
763 
764    if (batch->command.bo == batch->exec_bos[0])
765       batch->primary_batch_size = crocus_batch_bytes_used(batch);
766    batch->no_wrap = false;
767 }
768 
769 /**
770  * Replace our current GEM context with a new one (in case it got banned).
771  */
772 static bool
replace_hw_ctx(struct crocus_batch * batch)773 replace_hw_ctx(struct crocus_batch *batch)
774 {
775    struct crocus_screen *screen = batch->screen;
776    struct crocus_bufmgr *bufmgr = screen->bufmgr;
777 
778    uint32_t new_ctx = crocus_clone_hw_context(bufmgr, batch->hw_ctx_id);
779    if (!new_ctx)
780       return false;
781 
782    crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
783    batch->hw_ctx_id = new_ctx;
784 
785    /* Notify the context that state must be re-initialized. */
786    crocus_lost_context_state(batch);
787 
788    return true;
789 }
790 
791 enum pipe_reset_status
crocus_batch_check_for_reset(struct crocus_batch * batch)792 crocus_batch_check_for_reset(struct crocus_batch *batch)
793 {
794    struct crocus_screen *screen = batch->screen;
795    enum pipe_reset_status status = PIPE_NO_RESET;
796    struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id };
797 
798    if (drmIoctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
799       DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno));
800 
801    if (stats.batch_active != 0) {
802       /* A reset was observed while a batch from this hardware context was
803        * executing.  Assume that this context was at fault.
804        */
805       status = PIPE_GUILTY_CONTEXT_RESET;
806    } else if (stats.batch_pending != 0) {
807       /* A reset was observed while a batch from this context was in progress,
808        * but the batch was not executing.  In this case, assume that the
809        * context was not at fault.
810        */
811       status = PIPE_INNOCENT_CONTEXT_RESET;
812    }
813 
814    if (status != PIPE_NO_RESET) {
815       /* Our context is likely banned, or at least in an unknown state.
816        * Throw it away and start with a fresh context.  Ideally this may
817        * catch the problem before our next execbuf fails with -EIO.
818        */
819       replace_hw_ctx(batch);
820    }
821 
822    return status;
823 }
824 
825 /**
826  * Submit the batch to the GPU via execbuffer2.
827  */
828 static int
submit_batch(struct crocus_batch * batch)829 submit_batch(struct crocus_batch *batch)
830 {
831 
832    if (batch->use_shadow_copy) {
833       void *bo_map = crocus_bo_map(batch->dbg, batch->command.bo, MAP_WRITE);
834       memcpy(bo_map, batch->command.map, crocus_batch_bytes_used(batch));
835 
836       bo_map = crocus_bo_map(batch->dbg, batch->state.bo, MAP_WRITE);
837       memcpy(bo_map, batch->state.map, batch->state.used);
838    }
839 
840    crocus_bo_unmap(batch->command.bo);
841    crocus_bo_unmap(batch->state.bo);
842 
843    /* The requirement for using I915_EXEC_NO_RELOC are:
844     *
845     *   The addresses written in the objects must match the corresponding
846     *   reloc.gtt_offset which in turn must match the corresponding
847     *   execobject.offset.
848     *
849     *   Any render targets written to in the batch must be flagged with
850     *   EXEC_OBJECT_WRITE.
851     *
852     *   To avoid stalling, execobject.offset should match the current
853     *   address of that object within the active context.
854     */
855    /* Set statebuffer relocations */
856    const unsigned state_index = batch->state.bo->index;
857    if (state_index < batch->exec_count &&
858        batch->exec_bos[state_index] == batch->state.bo) {
859       struct drm_i915_gem_exec_object2 *entry =
860          &batch->validation_list[state_index];
861       assert(entry->handle == batch->state.bo->gem_handle);
862       entry->relocation_count = batch->state.relocs.reloc_count;
863       entry->relocs_ptr = (uintptr_t)batch->state.relocs.relocs;
864    }
865 
866    /* Set batchbuffer relocations */
867    struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
868    assert(entry->handle == batch->command.bo->gem_handle);
869    entry->relocation_count = batch->command.relocs.reloc_count;
870    entry->relocs_ptr = (uintptr_t)batch->command.relocs.relocs;
871 
872    struct drm_i915_gem_execbuffer2 execbuf = {
873       .buffers_ptr = (uintptr_t)batch->validation_list,
874       .buffer_count = batch->exec_count,
875       .batch_start_offset = 0,
876       /* This must be QWord aligned. */
877       .batch_len = ALIGN(batch->primary_batch_size, 8),
878       .flags = I915_EXEC_RENDER |
879                I915_EXEC_NO_RELOC |
880                I915_EXEC_BATCH_FIRST |
881                I915_EXEC_HANDLE_LUT,
882       .rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */
883    };
884 
885    if (num_fences(batch)) {
886       execbuf.flags |= I915_EXEC_FENCE_ARRAY;
887       execbuf.num_cliprects = num_fences(batch);
888       execbuf.cliprects_ptr =
889          (uintptr_t)util_dynarray_begin(&batch->exec_fences);
890    }
891 
892    int ret = 0;
893    if (!batch->screen->devinfo.no_hw &&
894        intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
895       ret = -errno;
896 
897    for (int i = 0; i < batch->exec_count; i++) {
898       struct crocus_bo *bo = batch->exec_bos[i];
899 
900       bo->idle = false;
901       bo->index = -1;
902 
903       /* Update brw_bo::gtt_offset */
904       if (batch->validation_list[i].offset != bo->gtt_offset) {
905          DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
906              bo->gem_handle, bo->gtt_offset,
907              (uint64_t)batch->validation_list[i].offset);
908          assert(!(bo->kflags & EXEC_OBJECT_PINNED));
909          bo->gtt_offset = batch->validation_list[i].offset;
910       }
911    }
912 
913    return ret;
914 }
915 
916 static const char *
batch_name_to_string(enum crocus_batch_name name)917 batch_name_to_string(enum crocus_batch_name name)
918 {
919    const char *names[CROCUS_BATCH_COUNT] = {
920       [CROCUS_BATCH_RENDER] = "render",
921       [CROCUS_BATCH_COMPUTE] = "compute",
922    };
923    return names[name];
924 }
925 
/**
 * Flush the batch buffer, submitting it to the GPU and resetting it so
 * we're ready to emit the next batch.
 *
 * Nothing happens if the batch holds no commands and does not contain a
 * fence signal.
 *
 * \param batch the batch to terminate, submit and reset.
 *
 * \param file source file name of the caller; only used in the debug
 * flush message.
 *
 * \param line source line of the caller; only used in the debug flush
 * message.
 *
 * If submission fails with -EIO and the hardware context can be replaced,
 * the reset callback (if any) is notified and the failure is treated as
 * success.  Any other failure aborts the process.
 */
void
_crocus_batch_flush(struct crocus_batch *batch, const char *file, int line)
{
   struct crocus_screen *screen = batch->screen;

   /* If a fence signals we need to flush it. */
   if (crocus_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal)
      return;

   assert(!batch->no_wrap);
   crocus_finish_batch(batch);

   finish_growing_bos(&batch->command);
   finish_growing_bos(&batch->state);
   int ret = submit_batch(batch);

   /* Optional debug output; this runs after submission but before any of
    * the batch bookkeeping below is cleared, so the counters are intact.
    */
   if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) {
      int bytes_for_commands = crocus_batch_bytes_used(batch);
      int second_bytes = 0;
      /* If the current command BO is not the first exec BO, report its
       * bytes separately and add the primary batch size to the total.
       */
      if (batch->command.bo != batch->exec_bos[0]) {
         second_bytes = bytes_for_commands;
         bytes_for_commands += batch->primary_batch_size;
      }
      fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) "
              "(cmds), %4d BOs (%0.1fMb aperture),"
              " %4d command relocs, %4d state relocs\n",
              file, line, batch_name_to_string(batch->name), batch->hw_ctx_id,
              batch->primary_batch_size, second_bytes,
              100.0f * bytes_for_commands / BATCH_SZ,
              batch->exec_count,
              (float) batch->aperture_space / (1024 * 1024),
              batch->command.relocs.reloc_count,
              batch->state.relocs.reloc_count);

      if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT)) {
         dump_fence_list(batch);
         dump_validation_list(batch);
      }

      if (INTEL_DEBUG(DEBUG_BATCH)) {
         decode_batch(batch);
      }
   }

   /* Drop one reference on every BO that was in the exec list. */
   for (int i = 0; i < batch->exec_count; i++) {
      struct crocus_bo *bo = batch->exec_bos[i];
      crocus_bo_unreference(bo);
   }

   /* Clear the relocation lists, exec list and aperture accounting. */
   batch->command.relocs.reloc_count = 0;
   batch->state.relocs.reloc_count = 0;
   batch->exec_count = 0;
   batch->aperture_space = 0;

   /* Point every stored syncobj reference at NULL (presumably releasing
    * it -- confirm against crocus_syncobj_reference), then empty the list.
    */
   util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
      crocus_syncobj_reference(screen, s, NULL);
   util_dynarray_clear(&batch->syncobjs);

   util_dynarray_clear(&batch->exec_fences);

   if (INTEL_DEBUG(DEBUG_SYNC)) {
      dbg_printf("waiting for idle\n");
      crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed; this is a nop */
   }

   /* Start a new batch buffer. */
   crocus_batch_reset(batch);

   /* EIO means our context is banned.  In this case, try and replace it
    * with a new logical context, and inform crocus_context that all state
    * has been lost and needs to be re-initialized.  If this succeeds,
    * dubiously claim success...
    */
   if (ret == -EIO && replace_hw_ctx(batch)) {
      if (batch->reset->reset) {
         /* Tell the state tracker the device is lost and it was our fault. */
         batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET);
      }

      ret = 0;
   }

   /* Any failure still pending here is fatal.  The diagnostic is only
    * compiled into DEBUG builds; the abort() happens regardless.
    */
   if (ret < 0) {
#ifdef DEBUG
      const bool color = INTEL_DEBUG(DEBUG_COLOR);
      fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n",
              color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
#endif
      abort();
   }
}
1027 
/**
 * Report whether the given BO is referenced by the current batch, i.e.
 * whether it has an entry in the batch's validation list.
 *
 * \return true if find_validation_entry() locates the BO, false otherwise.
 */
bool
crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo)
{
   const bool in_validation_list = find_validation_entry(batch, bo) != NULL;

   return in_validation_list;
}
1038 
1039 /**
1040  * Updates the state of the noop feature.  Returns true if there was a noop
1041  * transition that led to state invalidation.
1042  */
1043 bool
crocus_batch_prepare_noop(struct crocus_batch * batch,bool noop_enable)1044 crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable)
1045 {
1046    if (batch->noop_enabled == noop_enable)
1047       return 0;
1048 
1049    batch->noop_enabled = noop_enable;
1050 
1051    crocus_batch_flush(batch);
1052 
1053    /* If the batch was empty, flush had no effect, so insert our noop. */
1054    if (crocus_batch_bytes_used(batch) == 0)
1055       crocus_batch_maybe_noop(batch);
1056 
1057    /* We only need to update the entire state if we transition from noop ->
1058     * not-noop.
1059     */
1060    return !batch->noop_enabled;
1061 }
1062