xref: /openbsd/sys/dev/pci/drm/i915/gt/gen2_engine_cs.c (revision f005ef32)
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen2_engine_cs.h"
#include "i915_drv.h"
#include "i915_reg.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_ring.h"

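/*
 * Emit a flush on the oldest ring generations: issue MI_FLUSH (adding
 * MI_READ_FLUSH when the read caches must also be invalidated), pad with
 * a run of scratch writes and no-write flushes so the flush has time to
 * take effect, then issue the flush command once more.
 */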
int gen2_emit_flush(struct i915_request *rq, u32 mode)
{
	unsigned int num_store_dw = 12;
	u32 cmd, *cs;

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE)
		cmd |= MI_READ_FLUSH;

	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;
	while (num_store_dw--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = 0;
		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
	}
	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

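/*
 * Full flush/invalidate for the gen4/g4x/gen5 render ring; see the cache
 * domain notes below for which caches each MI_FLUSH variant touches.
 */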
int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;
	int i;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_EXE_FLUSH;
		if (IS_G4X(rq->i915) || GRAPHICS_VER(rq->i915) == 5)
			cmd |= MI_INVALIDATE_ISP;
	}

	i = 2;
	if (mode & EMIT_INVALIDATE)
		i += 20;

	cs = intel_ring_begin(rq, i);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;

	/*
	 * A random delay to let the CS invalidate take effect? Without this
	 * delay, the GPU relocation path fails as the CS does not see
	 * the updated contents. Just as important, if we apply the flushes
	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
	 * write and before the invalidate on the next batch), the relocations
	 * still fail. This implies that it is a delay following the
	 * invalidation that is required to reset the caches, as opposed to a
	 * delay to ensure the memory is written.
	 */
	if (mode & EMIT_INVALIDATE) {
		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;

		for (i = 0; i < 12; i++)
			*cs++ = MI_FLUSH;

		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;
	}

	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

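/*
 * The video ring only needs a plain MI_FLUSH, padded to an even number
 * of dwords with MI_NOOP.
 */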
int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_FLUSH;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

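/*
 * Breadcrumb emission: an MI_FLUSH, a run of @flush seqno writes into the
 * HWS scratch slot (apparently to let the flush settle), @post writes of
 * the fence seqno into the HWS seqno slot, and finally a user interrupt
 * to wake any waiters.
 */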
static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
				   int flush, int post)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH;

	while (flush--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = rq->fence.seqno;
	}

	while (post--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
}

u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
}

/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT SZ_256K
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)
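/*
 * i830 workaround for its TLB invalidation bug: a small blit first evicts
 * stale PTE TLB entries; then, unless the batch is dispatched pinned, the
 * batch is blitted into a pinned scratch area and executed from there so
 * the CS never runs from pages whose TLB entries may be invalid.
 */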
int i830_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs, cs_offset =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT);

	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Evict the invalid PTE TLBs */
	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
	*cs++ = cs_offset;
	*cs++ = 0xdeadbeef;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

		cs = intel_ring_begin(rq, 6 + 2);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Blit the batch (which now has all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
		*cs++ = cs_offset;
		*cs++ = 4096;
		*cs++ = offset;

		*cs++ = MI_FLUSH;
		*cs++ = MI_NOOP;
		intel_ring_advance(rq, cs);

		/* ... and execute it. */
		offset = cs_offset;
	}

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

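/*
 * On these older generations a non-privileged batch is marked by setting
 * MI_BATCH_NON_SECURE directly in the batch address dword, rather than in
 * the MI_BATCH_BUFFER_START command itself.
 */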
int gen3_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs;

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

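/*
 * From gen4 (965) onwards the security bit lives in the
 * MI_BATCH_BUFFER_START command dword (MI_BATCH_NON_SECURE_I965) instead
 * of in the batch address.
 */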
int gen4_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 length,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

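/*
 * gen2 has a 16-bit interrupt mask register, hence the 16-bit uncore
 * accessors; the posting read on enable flushes the write before any
 * interrupt is expected.
 */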
void gen2_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
	ENGINE_POSTING_READ16(engine, RING_IMR);
}

void gen2_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
}

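/* gen3 onwards uses full-width IMR writes; again only enable needs a posting read. */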
void gen3_irq_enable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
}

void gen3_irq_disable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
}

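/* On gen5 the engine interrupt mask is managed via the GT interrupt helpers. */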
void gen5_irq_enable(struct intel_engine_cs *engine)
{
	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen5_irq_disable(struct intel_engine_cs *engine)
{
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}