/*
 * Copyright (C) 2012-2018 Rob Clark
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark
 */

#ifndef FREEDRENO_RINGBUFFER_H_
#define FREEDRENO_RINGBUFFER_H_

#include <stdio.h>

#include "util/u_atomic.h"
#include "util/u_debug.h"
#include "util/u_queue.h"

#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "freedreno_drmif.h"
#include "freedreno_pm4.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_submit;
struct fd_ringbuffer;

enum fd_ringbuffer_flags {
   /* Primary ringbuffer for a submit, ie. an IB1 level rb for which the
    * kernel must set up the RB->IB1 CP_INDIRECT_BRANCH packets.
    */
   FD_RINGBUFFER_PRIMARY = 0x1,

   /* Hint that the stateobj will be used for streaming state that is used
    * once or a few times and then discarded.
    *
    * For sub-allocation, non-streaming stateobjs should be sub-allocated
    * from a page-size buffer, so one long-lived stateobj doesn't prevent
    * other pages from being freed.  (Ie. it would be no worse than
    * allocating a page-sized bo for each small non-streaming stateobj.)
    *
    * But streaming stateobjs could be sub-allocated from a larger buffer
    * to reduce the alloc/del overhead.
    */
   FD_RINGBUFFER_STREAMING = 0x2,

   /* Indicates that "growable" cmdstream can be used, consisting of
    * multiple physical cmdstream buffers.
    */
   FD_RINGBUFFER_GROWABLE = 0x4,

   /* Internal use only: */
   _FD_RINGBUFFER_OBJECT = 0x8,
};

/* A submit object manages/tracks all the state buildup for a "submit"
 * ioctl to the kernel.  Additionally, with the exception of long-lived
 * non-STREAMING stateobj rb's, rb's are allocated from the submit.
 */
struct fd_submit *fd_submit_new(struct fd_pipe *pipe);

/* NOTE: all ringbuffers created from the submit should be unref'd
 * before destroying the submit.
 */
void fd_submit_del(struct fd_submit *submit);

struct fd_submit *fd_submit_ref(struct fd_submit *submit);

/* Allocate a new rb from the submit. */
struct fd_ringbuffer *fd_submit_new_ringbuffer(struct fd_submit *submit,
                                               uint32_t size,
                                               enum fd_ringbuffer_flags flags);
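/*
 * A minimal usage sketch (illustrative only, not part of this header's
 * contract): it shows the submit/ringbuffer lifecycle described above.
 * The pipe pointer, buffer size, and flag combination are assumptions
 * chosen for illustration.
 *
 *    struct fd_submit *submit = fd_submit_new(pipe);
 *    struct fd_ringbuffer *ring =
 *       fd_submit_new_ringbuffer(submit, 0x1000,
 *                                FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
 *
 *    ... emit cmdstream into ring (see OUT_PKT*()/OUT_RING() below) ...
 *
 *    fd_submit_flush(submit, -1, NULL);   // no in-fence, no out-fence needed
 *    fd_ringbuffer_del(ring);             // unref rb's before destroying submit
 *    fd_submit_del(submit);
 */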
/**
 * Encapsulates submit out-fence(s), which consist of a 'timestamp' (per-
 * pipe (submitqueue) sequence number) and optionally, if requested, an
 * out-fence-fd.
 */
struct fd_submit_fence {
   /**
    * The ready fence is signaled once the submit is actually flushed down
    * to the kernel, and fence/fence_fd are populated.  You must wait for
    * this fence to be signaled before reading fence/fence_fd.
    */
   struct util_queue_fence ready;

   struct fd_fence fence;

   /**
    * Optional dma_fence fd, returned by submit if use_fence_fd is true
    */
   int fence_fd;
   bool use_fence_fd;
};

/* in_fence_fd: -1 for no in-fence, else fence fd
 * out_fence can be NULL if no output fence is required
 */
int fd_submit_flush(struct fd_submit *submit, int in_fence_fd,
                    struct fd_submit_fence *out_fence);
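/*
 * A hedged sketch of consuming the out-fence (illustrative only): per the
 * comments above, the caller must wait on 'ready' before reading
 * fence/fence_fd.  Assumes 'submit' is a fully built submit and that the
 * flush path populates the fence as documented.
 *
 *    struct fd_submit_fence out = { .use_fence_fd = true };
 *    fd_submit_flush(submit, -1, &out);
 *    util_queue_fence_wait(&out.ready);   // fence/fence_fd valid after this
 *    ... out.fence_fd now holds the dma_fence fd for this submit ...
 */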
struct fd_ringbuffer;
struct fd_reloc;

struct fd_ringbuffer_funcs {
   void (*grow)(struct fd_ringbuffer *ring, uint32_t size);
   void (*emit_reloc)(struct fd_ringbuffer *ring, const struct fd_reloc *reloc);
   uint32_t (*emit_reloc_ring)(struct fd_ringbuffer *ring,
                               struct fd_ringbuffer *target, uint32_t cmd_idx);
   uint32_t (*cmd_count)(struct fd_ringbuffer *ring);
   bool (*check_size)(struct fd_ringbuffer *ring);
   void (*destroy)(struct fd_ringbuffer *ring);
};

/* the ringbuffer object is not opaque so that OUT_RING() type stuff
 * can be inlined.  Note that users should not make assumptions about
 * the size of this struct.
 */
struct fd_ringbuffer {
   uint32_t *cur, *end, *start;
   const struct fd_ringbuffer_funcs *funcs;

   // size or end could probably go away
   int size;
   int32_t refcnt;
   enum fd_ringbuffer_flags flags;
};

/* Allocate a new long-lived state object, not associated with
 * a submit:
 */
struct fd_ringbuffer *fd_ringbuffer_new_object(struct fd_pipe *pipe,
                                               uint32_t size);

static inline void
fd_ringbuffer_del(struct fd_ringbuffer *ring)
{
   if (!p_atomic_dec_zero(&ring->refcnt))
      return;

   ring->funcs->destroy(ring);
}

static inline struct fd_ringbuffer *
fd_ringbuffer_ref(struct fd_ringbuffer *ring)
{
   p_atomic_inc(&ring->refcnt);
   return ring;
}

static inline void
fd_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   assert(ring->funcs->grow); /* unsupported on kgsl */

   /* there is an upper bound on IB size, which appears to be 0x0fffff */
   ring->size = MIN2(ring->size << 1, 0x0fffff);

   ring->funcs->grow(ring, ring->size);
}

static inline bool
fd_ringbuffer_check_size(struct fd_ringbuffer *ring)
{
   return ring->funcs->check_size(ring);
}

static inline void
fd_ringbuffer_emit(struct fd_ringbuffer *ring, uint32_t data)
{
   (*ring->cur++) = data;
}

struct fd_reloc {
   struct fd_bo *bo;
   uint64_t iova;
#define FD_RELOC_READ  0x0001
#define FD_RELOC_WRITE 0x0002
#define FD_RELOC_DUMP  0x0004
   uint32_t offset;
   uint32_t orlo;
   int32_t shift;
   uint32_t orhi; /* used for a5xx+ */
};

/* We always mark BOs for write, instead of tracking it across reloc
 * sources in userspace.  On the kernel side, this means we track a single
 * excl fence in the BO instead of a set of read fences, which is cheaper.
 * The downside is that a dmabuf-shared device won't be able to read in
 * parallel with a read-only access by freedreno, but most other drivers
 * have decided that that usecase isn't important enough to do this
 * tracking, as well.
 */
#define FD_RELOC_FLAGS_INIT (FD_RELOC_READ | FD_RELOC_WRITE)

/* NOTE: relocs are 2 dwords on a5xx+ */
static inline void
fd_ringbuffer_reloc(struct fd_ringbuffer *ring, const struct fd_reloc *reloc)
{
   ring->funcs->emit_reloc(ring, reloc);
}

static inline uint32_t
fd_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
{
   if (!ring->funcs->cmd_count)
      return 1;
   return ring->funcs->cmd_count(ring);
}

static inline uint32_t
fd_ringbuffer_emit_reloc_ring_full(struct fd_ringbuffer *ring,
                                   struct fd_ringbuffer *target,
                                   uint32_t cmd_idx)
{
   return ring->funcs->emit_reloc_ring(ring, target, cmd_idx);
}

static inline uint32_t
offset_bytes(void *end, void *start)
{
   return ((char *)end) - ((char *)start);
}

static inline uint32_t
fd_ringbuffer_size(struct fd_ringbuffer *ring)
{
   /* only really needed for stateobj ringbuffers, and won't really do what
    * you expect for growable rb's.. so let's just restrict this to
    * stateobjs for now:
    */
   debug_assert(!(ring->flags & FD_RINGBUFFER_GROWABLE));
   return offset_bytes(ring->cur, ring->start);
}

static inline bool
fd_ringbuffer_empty(struct fd_ringbuffer *ring)
{
   return (fd_ringbuffer_cmd_count(ring) == 1) &&
          (offset_bytes(ring->cur, ring->start) == 0);
}

#define LOG_DWORDS 0

static inline void
OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RING   %04x:  %08x", ring,
              (uint32_t)(ring->cur - ring->start), data);
   }
   fd_ringbuffer_emit(ring, data);
}

/*
 * NOTE: OUT_RELOC() is 2 dwords (64b) on a5xx+
 */
#ifndef __cplusplus
static inline void
OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset,
          uint64_t or, int32_t shift)
{
   if (LOG_DWORDS) {
      fprintf(stderr, "ring[%p]: OUT_RELOC   %04x:  %p+%u << %d", ring,
              (uint32_t)(ring->cur - ring->start), bo, offset, shift);
   }
   debug_assert(offset < fd_bo_size(bo));

   uint64_t iova = fd_bo_get_iova(bo) + offset;

   if (shift < 0)
      iova >>= -shift;
   else
      iova <<= shift;

   iova |= or;

   fd_ringbuffer_reloc(ring, &(struct fd_reloc){
      .bo = bo,
      .iova = iova,
      .offset = offset,
      .orlo = or,
      .shift = shift,
      .orhi = or >> 32,
   });
}
#endif

static inline void
OUT_RB(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
{
   fd_ringbuffer_emit_reloc_ring_full(ring, target, 0);
}

static inline void
BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
{
   if (unlikely(ring->cur + ndwords > ring->end))
      fd_ringbuffer_grow(ring, ndwords);
}

static inline void
OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt0_hdr(regindx, cnt));
}

static inline void
OUT_PKT2(struct fd_ringbuffer *ring)
{
   BEGIN_RING(ring, 1);
   OUT_RING(ring, CP_TYPE2_PKT);
}

static inline void
OUT_PKT3(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, CP_TYPE3_PKT | ((cnt - 1) << 16) | ((opcode & 0xFF) << 8));
}

/*
 * Starting with a5xx, pkt4/pkt7 are used instead of pkt0/pkt3
 */

static inline void
OUT_PKT4(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt4_hdr(regindx, cnt));
}

static inline void
OUT_PKT7(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
   BEGIN_RING(ring, cnt + 1);
   OUT_RING(ring, pm4_pkt7_hdr(opcode, cnt));
}

static inline void
OUT_WFI(struct fd_ringbuffer *ring)
{
   OUT_PKT3(ring, CP_WAIT_FOR_IDLE, 1);
   OUT_RING(ring, 0x00000000);
}

static inline void
OUT_WFI5(struct fd_ringbuffer *ring)
{
   OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
}

#ifdef __cplusplus
} /* end of extern "C" */
#endif

#endif /* FREEDRENO_RINGBUFFER_H_ */
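/*
 * A small packet-emission sketch (illustrative only; the register offset and
 * value below are made-up placeholders, not real a5xx+ state): OUT_PKT4()
 * reserves space via BEGIN_RING() and emits the pkt4 header, then the payload
 * dwords follow with OUT_RING().
 *
 *    OUT_PKT4(ring, 0x1234, 1);    // hypothetical register offset, 1 dword
 *    OUT_RING(ring, 0x00000001);   // register value
 *    OUT_WFI5(ring);               // CP_WAIT_FOR_IDLE (a5xx+ encoding)
 */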