/*	$NetBSD: radeon_cik_sdma.c,v 1.3 2021/12/18 23:45:43 riastradh Exp $	*/

/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Alex Deucher
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: radeon_cik_sdma.c,v 1.3 2021/12/18 23:45:43 riastradh Exp $");

#include <linux/firmware.h>

#include "radeon.h"
#include "radeon_ucode.h"
#include "radeon_asic.h"
#include "radeon_trace.h"
#include "cikd.h"

/* sdma */
#define CIK_SDMA_UCODE_SIZE 1050
#define CIK_SDMA_UCODE_VERSION 64

u32 cik_gpu_check_soft_reset(struct radeon_device *rdev);

/*
 * sDMA - System DMA
 * Starting with CIK, the GPU has new asynchronous
 * DMA engines.  These engines are used for compute
 * and gfx.  There are two DMA engines (SDMA0, SDMA1)
 * and each one supports 1 ring buffer used for gfx
 * and 2 queues used for compute.
 *
 * The programming model is very similar to the CP
 * (ring buffer, IBs, etc.), but sDMA has its own
 * packet format that is different from the PM4 format
 * used by the CP. sDMA supports copying data, writing
 * embedded data, solid fills, and a number of other
 * things.  It also has support for tiling/detiling of
 * buffers.
 */
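
/*
 * Illustrative note (not part of the original driver): every sDMA packet in
 * this file starts with a 32-bit header assembled by the SDMA_PACKET() macro
 * from cikd.h, which packs an opcode, a sub-opcode and opcode-specific
 * "extra" bits into one dword, followed by opcode-specific payload dwords.
 * A minimal sketch of emitting a single-dword linear write, mirroring how
 * cik_sdma_ring_test() below drives the ring API (dst_gpu_addr here is a
 * hypothetical GPU address used only for illustration):
 *
 *	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_WRITE,
 *					    SDMA_WRITE_SUB_OPCODE_LINEAR, 0));
 *	radeon_ring_write(ring, lower_32_bits(dst_gpu_addr));
 *	radeon_ring_write(ring, upper_32_bits(dst_gpu_addr));
 *	radeon_ring_write(ring, 1);		number of dwords to follow
 *	radeon_ring_write(ring, 0xDEADBEEF);	payload
 */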

/**
 * cik_sdma_get_rptr - get the current read pointer
 *
 * @rdev: radeon_device pointer
 * @ring: radeon ring pointer
 *
 * Get the current rptr from the hardware (CIK+).
 */
uint32_t cik_sdma_get_rptr(struct radeon_device *rdev,
			   struct radeon_ring *ring)
{
	u32 rptr, reg;

	if (rdev->wb.enabled) {
		rptr = rdev->wb.wb[ring->rptr_offs/4];
	} else {
		if (ring->idx == R600_RING_TYPE_DMA_INDEX)
			reg = SDMA0_GFX_RB_RPTR + SDMA0_REGISTER_OFFSET;
		else
			reg = SDMA0_GFX_RB_RPTR + SDMA1_REGISTER_OFFSET;

		rptr = RREG32(reg);
	}

	return (rptr & 0x3fffc) >> 2;
}

/**
 * cik_sdma_get_wptr - get the current write pointer
 *
 * @rdev: radeon_device pointer
 * @ring: radeon ring pointer
 *
 * Get the current wptr from the hardware (CIK+).
 */
uint32_t cik_sdma_get_wptr(struct radeon_device *rdev,
			   struct radeon_ring *ring)
{
	u32 reg;

	if (ring->idx == R600_RING_TYPE_DMA_INDEX)
		reg = SDMA0_GFX_RB_WPTR + SDMA0_REGISTER_OFFSET;
	else
		reg = SDMA0_GFX_RB_WPTR + SDMA1_REGISTER_OFFSET;

	return (RREG32(reg) & 0x3fffc) >> 2;
}

/**
 * cik_sdma_set_wptr - commit the write pointer
 *
 * @rdev: radeon_device pointer
 * @ring: radeon ring pointer
 *
 * Write the wptr back to the hardware (CIK+).
 */
void cik_sdma_set_wptr(struct radeon_device *rdev,
		       struct radeon_ring *ring)
{
	u32 reg;

	if (ring->idx == R600_RING_TYPE_DMA_INDEX)
		reg = SDMA0_GFX_RB_WPTR + SDMA0_REGISTER_OFFSET;
	else
		reg = SDMA0_GFX_RB_WPTR + SDMA1_REGISTER_OFFSET;

	WREG32(reg, (ring->wptr << 2) & 0x3fffc);
	(void)RREG32(reg);
}

/**
 * cik_sdma_ring_ib_execute - Schedule an IB on the DMA engine
 *
 * @rdev: radeon_device pointer
 * @ib: IB object to schedule
 *
 * Schedule an IB in the DMA ring (CIK).
 */
void cik_sdma_ring_ib_execute(struct radeon_device *rdev,
			      struct radeon_ib *ib)
{
	struct radeon_ring *ring = &rdev->ring[ib->ring];
	u32 extra_bits = (ib->vm ? ib->vm->ids[ib->ring].id : 0) & 0xf;

	if (rdev->wb.enabled) {
		u32 next_rptr = ring->wptr + 5;
		while ((next_rptr & 7) != 4)
			next_rptr++;
		next_rptr += 4;
		radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_WRITE, SDMA_WRITE_SUB_OPCODE_LINEAR, 0));
		radeon_ring_write(ring, ring->next_rptr_gpu_addr & 0xfffffffc);
		radeon_ring_write(ring, upper_32_bits(ring->next_rptr_gpu_addr));
		radeon_ring_write(ring, 1); /* number of DWs to follow */
		radeon_ring_write(ring, next_rptr);
	}

	/* IB packet must end on an 8 DW boundary */
	while ((ring->wptr & 7) != 4)
		radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
	radeon_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
	radeon_ring_write(ring, upper_32_bits(ib->gpu_addr));
	radeon_ring_write(ring, ib->length_dw);

}
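
/*
 * Illustrative note (not in the original source): the INDIRECT_BUFFER packet
 * emitted above is 4 dwords long (header, base lo, base hi, length), so
 * padding the ring with NOPs until (wptr & 7) == 4 makes the packet end
 * exactly on an 8-dword boundary.  For example, with wptr % 8 == 6 the loop
 * emits 6 NOPs (wptr advances from 6 to 12) and the IB packet then occupies
 * dwords 12..15.  The next_rptr value written in the writeback path above
 * uses the same rounding to predict where the read pointer will land once
 * this packet has been consumed.
 */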

/**
 * cik_sdma_hdp_flush_ring_emit - emit an hdp flush on the DMA ring
 *
 * @rdev: radeon_device pointer
 * @ridx: radeon ring index
 *
 * Emit an hdp flush packet on the requested DMA ring.
 */
static void cik_sdma_hdp_flush_ring_emit(struct radeon_device *rdev,
					 int ridx)
{
	struct radeon_ring *ring = &rdev->ring[ridx];
	u32 extra_bits = (SDMA_POLL_REG_MEM_EXTRA_OP(1) |
			  SDMA_POLL_REG_MEM_EXTRA_FUNC(3)); /* == */
	u32 ref_and_mask;

	if (ridx == R600_RING_TYPE_DMA_INDEX)
		ref_and_mask = SDMA0;
	else
		ref_and_mask = SDMA1;

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_POLL_REG_MEM, 0, extra_bits));
	radeon_ring_write(ring, GPU_HDP_FLUSH_DONE);
	radeon_ring_write(ring, GPU_HDP_FLUSH_REQ);
	radeon_ring_write(ring, ref_and_mask); /* reference */
	radeon_ring_write(ring, ref_and_mask); /* mask */
	radeon_ring_write(ring, (0xfff << 16) | 10); /* retry count, poll interval */
}
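
/*
 * Illustrative note (not in the original source): the POLL_REG_MEM packet
 * above carries two register addresses plus a reference/mask pair.  The
 * extra bits request a register poll with an "equal" compare (per the
 * inline comment), so the engine kicks the HDP flush via GPU_HDP_FLUSH_REQ
 * and then re-reads GPU_HDP_FLUSH_DONE until (value & mask) == reference,
 * i.e. until this engine's flush bit is acknowledged.  The final dword
 * encodes the retry count and poll interval, as noted inline.
 */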

/**
 * cik_sdma_fence_ring_emit - emit a fence on the DMA ring
 *
 * @rdev: radeon_device pointer
 * @fence: radeon fence object
 *
 * Add a DMA fence packet to the ring to write
 * the fence seq number and DMA trap packet to generate
 * an interrupt if needed (CIK).
 */
void cik_sdma_fence_ring_emit(struct radeon_device *rdev,
			      struct radeon_fence *fence)
{
	struct radeon_ring *ring = &rdev->ring[fence->ring];
	u64 addr = rdev->fence_drv[fence->ring].gpu_addr;

	/* write the fence */
	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_FENCE, 0, 0));
	radeon_ring_write(ring, lower_32_bits(addr));
	radeon_ring_write(ring, upper_32_bits(addr));
	radeon_ring_write(ring, fence->seq);
	/* generate an interrupt */
	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_TRAP, 0, 0));
	/* flush HDP */
	cik_sdma_hdp_flush_ring_emit(rdev, fence->ring);
}

/**
 * cik_sdma_semaphore_ring_emit - emit a semaphore on the dma ring
 *
 * @rdev: radeon_device pointer
 * @ring: radeon_ring structure holding ring information
 * @semaphore: radeon semaphore object
 * @emit_wait: wait or signal semaphore
 *
 * Add a DMA semaphore packet to the ring to wait on or signal
 * other rings (CIK).
 */
bool cik_sdma_semaphore_ring_emit(struct radeon_device *rdev,
				  struct radeon_ring *ring,
				  struct radeon_semaphore *semaphore,
				  bool emit_wait)
{
	u64 addr = semaphore->gpu_addr;
	u32 extra_bits = emit_wait ? 0 : SDMA_SEMAPHORE_EXTRA_S;

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SEMAPHORE, 0, extra_bits));
	radeon_ring_write(ring, addr & 0xfffffff8);
	radeon_ring_write(ring, upper_32_bits(addr));

	return true;
}

/**
 * cik_sdma_gfx_stop - stop the gfx async dma engines
 *
 * @rdev: radeon_device pointer
 *
 * Stop the gfx async dma ring buffers (CIK).
 */
static void cik_sdma_gfx_stop(struct radeon_device *rdev)
{
	u32 rb_cntl, reg_offset;
	int i;

	if ((rdev->asic->copy.copy_ring_index == R600_RING_TYPE_DMA_INDEX) ||
	    (rdev->asic->copy.copy_ring_index == CAYMAN_RING_TYPE_DMA1_INDEX))
		radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size);

	for (i = 0; i < 2; i++) {
		if (i == 0)
			reg_offset = SDMA0_REGISTER_OFFSET;
		else
			reg_offset = SDMA1_REGISTER_OFFSET;
		rb_cntl = RREG32(SDMA0_GFX_RB_CNTL + reg_offset);
		rb_cntl &= ~SDMA_RB_ENABLE;
		WREG32(SDMA0_GFX_RB_CNTL + reg_offset, rb_cntl);
		WREG32(SDMA0_GFX_IB_CNTL + reg_offset, 0);
	}
	rdev->ring[R600_RING_TYPE_DMA_INDEX].ready = false;
	rdev->ring[CAYMAN_RING_TYPE_DMA1_INDEX].ready = false;

	/* FIXME: use something other than this big hammer.  After several
	 * days we could not find a better combination, so soft-reset the
	 * SDMA blocks since we do not seem to shut them down properly.
	 * This fixes hibernation and does not affect suspend to RAM.
	 */
	WREG32(SRBM_SOFT_RESET, SOFT_RESET_SDMA | SOFT_RESET_SDMA1);
	(void)RREG32(SRBM_SOFT_RESET);
	udelay(50);
	WREG32(SRBM_SOFT_RESET, 0);
	(void)RREG32(SRBM_SOFT_RESET);
}

/**
 * cik_sdma_rlc_stop - stop the compute async dma engines
 *
 * @rdev: radeon_device pointer
 *
 * Stop the compute async dma queues (CIK).
 */
static void cik_sdma_rlc_stop(struct radeon_device *rdev)
{
	/* XXX todo */
}

/**
 * cik_sdma_ctx_switch_enable - enable/disable sdma engine context switching
 *
 * @rdev: radeon_device pointer
 * @enable: enable/disable automatic context switching.
 *
 * Enable or disable automatic context switching for the sDMA engines (CIK).
 */
static void cik_sdma_ctx_switch_enable(struct radeon_device *rdev, bool enable)
{
	uint32_t reg_offset, value;
	int i;

	for (i = 0; i < 2; i++) {
		if (i == 0)
			reg_offset = SDMA0_REGISTER_OFFSET;
		else
			reg_offset = SDMA1_REGISTER_OFFSET;
		value = RREG32(SDMA0_CNTL + reg_offset);
		if (enable)
			value |= AUTO_CTXSW_ENABLE;
		else
			value &= ~AUTO_CTXSW_ENABLE;
		WREG32(SDMA0_CNTL + reg_offset, value);
	}
}

/**
 * cik_sdma_enable - enable/disable the async dma engines
 *
 * @rdev: radeon_device pointer
 * @enable: enable/disable the DMA MEs.
 *
 * Halt or unhalt the async dma engines (CIK).
 */
void cik_sdma_enable(struct radeon_device *rdev, bool enable)
{
	u32 me_cntl, reg_offset;
	int i;

	if (!enable) {
		cik_sdma_gfx_stop(rdev);
		cik_sdma_rlc_stop(rdev);
	}

	for (i = 0; i < 2; i++) {
		if (i == 0)
			reg_offset = SDMA0_REGISTER_OFFSET;
		else
			reg_offset = SDMA1_REGISTER_OFFSET;
		me_cntl = RREG32(SDMA0_ME_CNTL + reg_offset);
		if (enable)
			me_cntl &= ~SDMA_HALT;
		else
			me_cntl |= SDMA_HALT;
		WREG32(SDMA0_ME_CNTL + reg_offset, me_cntl);
	}

	cik_sdma_ctx_switch_enable(rdev, enable);
}

/**
 * cik_sdma_gfx_resume - setup and start the async dma engines
 *
 * @rdev: radeon_device pointer
 *
 * Set up the gfx DMA ring buffers and enable them (CIK).
 * Returns 0 for success, error for failure.
 */
static int cik_sdma_gfx_resume(struct radeon_device *rdev)
{
	struct radeon_ring *ring;
	u32 rb_cntl, ib_cntl;
	u32 rb_bufsz;
	u32 reg_offset, wb_offset;
	int i, r;

	for (i = 0; i < 2; i++) {
		if (i == 0) {
			ring = &rdev->ring[R600_RING_TYPE_DMA_INDEX];
			reg_offset = SDMA0_REGISTER_OFFSET;
			wb_offset = R600_WB_DMA_RPTR_OFFSET;
		} else {
			ring = &rdev->ring[CAYMAN_RING_TYPE_DMA1_INDEX];
			reg_offset = SDMA1_REGISTER_OFFSET;
			wb_offset = CAYMAN_WB_DMA1_RPTR_OFFSET;
		}

		WREG32(SDMA0_SEM_INCOMPLETE_TIMER_CNTL + reg_offset, 0);
		WREG32(SDMA0_SEM_WAIT_FAIL_TIMER_CNTL + reg_offset, 0);

		/* Set ring buffer size in dwords */
		rb_bufsz = order_base_2(ring->ring_size / 4);
		rb_cntl = rb_bufsz << 1;
#ifdef __BIG_ENDIAN
		rb_cntl |= SDMA_RB_SWAP_ENABLE | SDMA_RPTR_WRITEBACK_SWAP_ENABLE;
#endif
		WREG32(SDMA0_GFX_RB_CNTL + reg_offset, rb_cntl);

		/* Initialize the ring buffer's read and write pointers */
		WREG32(SDMA0_GFX_RB_RPTR + reg_offset, 0);
		WREG32(SDMA0_GFX_RB_WPTR + reg_offset, 0);

		/* set the wb address whether it's enabled or not */
		WREG32(SDMA0_GFX_RB_RPTR_ADDR_HI + reg_offset,
		       upper_32_bits(rdev->wb.gpu_addr + wb_offset) & 0xFFFFFFFF);
		WREG32(SDMA0_GFX_RB_RPTR_ADDR_LO + reg_offset,
		       ((rdev->wb.gpu_addr + wb_offset) & 0xFFFFFFFC));

		if (rdev->wb.enabled)
			rb_cntl |= SDMA_RPTR_WRITEBACK_ENABLE;

		WREG32(SDMA0_GFX_RB_BASE + reg_offset, ring->gpu_addr >> 8);
		WREG32(SDMA0_GFX_RB_BASE_HI + reg_offset, ring->gpu_addr >> 40);

		ring->wptr = 0;
		WREG32(SDMA0_GFX_RB_WPTR + reg_offset, ring->wptr << 2);

		/* enable DMA RB */
		WREG32(SDMA0_GFX_RB_CNTL + reg_offset, rb_cntl | SDMA_RB_ENABLE);

		ib_cntl = SDMA_IB_ENABLE;
#ifdef __BIG_ENDIAN
		ib_cntl |= SDMA_IB_SWAP_ENABLE;
#endif
		/* enable DMA IBs */
		WREG32(SDMA0_GFX_IB_CNTL + reg_offset, ib_cntl);

		ring->ready = true;

		r = radeon_ring_test(rdev, ring->idx, ring);
		if (r) {
			ring->ready = false;
			return r;
		}
	}

	if ((rdev->asic->copy.copy_ring_index == R600_RING_TYPE_DMA_INDEX) ||
	    (rdev->asic->copy.copy_ring_index == CAYMAN_RING_TYPE_DMA1_INDEX))
		radeon_ttm_set_active_vram_size(rdev, rdev->mc.real_vram_size);

	return 0;
}
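
/*
 * Illustrative note (not in the original source): the ring buffer size is
 * programmed as a power-of-two exponent in dwords.  For example, assuming
 * the ring was created with a 256 KiB buffer, ring_size / 4 is 65536 dwords,
 * order_base_2() yields 16, and rb_cntl starts out as 16 << 1 = 0x20 before
 * the endian-swap, writeback and enable bits are OR'd in above.
 */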

/**
 * cik_sdma_rlc_resume - setup and start the async dma engines
 *
 * @rdev: radeon_device pointer
 *
 * Set up the compute DMA queues and enable them (CIK).
 * Returns 0 for success, error for failure.
 */
static int cik_sdma_rlc_resume(struct radeon_device *rdev)
{
	/* XXX todo */
	return 0;
}

/**
 * cik_sdma_load_microcode - load the sDMA ME ucode
 *
 * @rdev: radeon_device pointer
 *
 * Loads the sDMA0/1 ucode.
 * Returns 0 for success, -EINVAL if the ucode is not available.
 */
static int cik_sdma_load_microcode(struct radeon_device *rdev)
{
	int i;

	if (!rdev->sdma_fw)
		return -EINVAL;

	/* halt the MEs */
	cik_sdma_enable(rdev, false);

	if (rdev->new_fw) {
		const struct sdma_firmware_header_v1_0 *hdr =
			(const struct sdma_firmware_header_v1_0 *)rdev->sdma_fw->data;
		const __le32 *fw_data;
		u32 fw_size;

		radeon_ucode_print_sdma_hdr(&hdr->header);

		/* sdma0 */
		fw_data = (const __le32 *)
			(rdev->sdma_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes));
		fw_size = le32_to_cpu(hdr->header.ucode_size_bytes) / 4;
		WREG32(SDMA0_UCODE_ADDR + SDMA0_REGISTER_OFFSET, 0);
		for (i = 0; i < fw_size; i++)
			WREG32(SDMA0_UCODE_DATA + SDMA0_REGISTER_OFFSET, le32_to_cpup(fw_data++));
		WREG32(SDMA0_UCODE_DATA + SDMA0_REGISTER_OFFSET, CIK_SDMA_UCODE_VERSION);

		/* sdma1 */
		fw_data = (const __le32 *)
			(rdev->sdma_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes));
		fw_size = le32_to_cpu(hdr->header.ucode_size_bytes) / 4;
		WREG32(SDMA0_UCODE_ADDR + SDMA1_REGISTER_OFFSET, 0);
		for (i = 0; i < fw_size; i++)
			WREG32(SDMA0_UCODE_DATA + SDMA1_REGISTER_OFFSET, le32_to_cpup(fw_data++));
		WREG32(SDMA0_UCODE_DATA + SDMA1_REGISTER_OFFSET, CIK_SDMA_UCODE_VERSION);
	} else {
		const __be32 *fw_data;

		/* sdma0 */
		fw_data = (const __be32 *)rdev->sdma_fw->data;
		WREG32(SDMA0_UCODE_ADDR + SDMA0_REGISTER_OFFSET, 0);
		for (i = 0; i < CIK_SDMA_UCODE_SIZE; i++)
			WREG32(SDMA0_UCODE_DATA + SDMA0_REGISTER_OFFSET, be32_to_cpup(fw_data++));
		WREG32(SDMA0_UCODE_DATA + SDMA0_REGISTER_OFFSET, CIK_SDMA_UCODE_VERSION);

		/* sdma1 */
		fw_data = (const __be32 *)rdev->sdma_fw->data;
		WREG32(SDMA0_UCODE_ADDR + SDMA1_REGISTER_OFFSET, 0);
		for (i = 0; i < CIK_SDMA_UCODE_SIZE; i++)
			WREG32(SDMA0_UCODE_DATA + SDMA1_REGISTER_OFFSET, be32_to_cpup(fw_data++));
		WREG32(SDMA0_UCODE_DATA + SDMA1_REGISTER_OFFSET, CIK_SDMA_UCODE_VERSION);
	}

	WREG32(SDMA0_UCODE_ADDR + SDMA0_REGISTER_OFFSET, 0);
	WREG32(SDMA0_UCODE_ADDR + SDMA1_REGISTER_OFFSET, 0);
	return 0;
}

/**
 * cik_sdma_resume - setup and start the async dma engines
 *
 * @rdev: radeon_device pointer
 *
 * Set up the DMA engines and enable them (CIK).
 * Returns 0 for success, error for failure.
 */
int cik_sdma_resume(struct radeon_device *rdev)
{
	int r;

	r = cik_sdma_load_microcode(rdev);
	if (r)
		return r;

	/* unhalt the MEs */
	cik_sdma_enable(rdev, true);

	/* start the gfx rings and rlc compute queues */
	r = cik_sdma_gfx_resume(rdev);
	if (r)
		return r;
	r = cik_sdma_rlc_resume(rdev);
	if (r)
		return r;

	return 0;
}

/**
 * cik_sdma_fini - tear down the async dma engines
 *
 * @rdev: radeon_device pointer
 *
 * Stop the async dma engines and free the rings (CIK).
 */
void cik_sdma_fini(struct radeon_device *rdev)
{
	/* halt the MEs */
	cik_sdma_enable(rdev, false);
	radeon_ring_fini(rdev, &rdev->ring[R600_RING_TYPE_DMA_INDEX]);
	radeon_ring_fini(rdev, &rdev->ring[CAYMAN_RING_TYPE_DMA1_INDEX]);
	/* XXX - compute dma queue tear down */
}

/**
 * cik_copy_dma - copy pages using the DMA engine
 *
 * @rdev: radeon_device pointer
 * @src_offset: src GPU address
 * @dst_offset: dst GPU address
 * @num_gpu_pages: number of GPU pages to xfer
 * @resv: reservation object to sync to
 *
 * Copy GPU pages using the DMA engine (CIK).
 * Used by the radeon ttm implementation to move pages if
 * registered as the asic copy callback.
 */
struct radeon_fence *cik_copy_dma(struct radeon_device *rdev,
				  uint64_t src_offset, uint64_t dst_offset,
				  unsigned num_gpu_pages,
				  struct dma_resv *resv)
{
	struct radeon_fence *fence;
	struct radeon_sync sync;
	int ring_index = rdev->asic->copy.dma_ring_index;
	struct radeon_ring *ring = &rdev->ring[ring_index];
	u32 size_in_bytes, cur_size_in_bytes;
	int i, num_loops;
	int r = 0;

	radeon_sync_create(&sync);

	size_in_bytes = (num_gpu_pages << RADEON_GPU_PAGE_SHIFT);
	num_loops = DIV_ROUND_UP(size_in_bytes, 0x1fffff);
	r = radeon_ring_lock(rdev, ring, num_loops * 7 + 14);
	if (r) {
		DRM_ERROR("radeon: moving bo (%d).\n", r);
		radeon_sync_free(rdev, &sync, NULL);
		return ERR_PTR(r);
	}

	radeon_sync_resv(rdev, &sync, resv, false);
	radeon_sync_rings(rdev, &sync, ring->idx);

	for (i = 0; i < num_loops; i++) {
		cur_size_in_bytes = size_in_bytes;
		if (cur_size_in_bytes > 0x1fffff)
			cur_size_in_bytes = 0x1fffff;
		size_in_bytes -= cur_size_in_bytes;
		radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0));
		radeon_ring_write(ring, cur_size_in_bytes);
		radeon_ring_write(ring, 0); /* src/dst endian swap */
		radeon_ring_write(ring, lower_32_bits(src_offset));
		radeon_ring_write(ring, upper_32_bits(src_offset));
		radeon_ring_write(ring, lower_32_bits(dst_offset));
		radeon_ring_write(ring, upper_32_bits(dst_offset));
		src_offset += cur_size_in_bytes;
		dst_offset += cur_size_in_bytes;
	}

	r = radeon_fence_emit(rdev, &fence, ring->idx);
	if (r) {
		radeon_ring_unlock_undo(rdev, ring);
		radeon_sync_free(rdev, &sync, NULL);
		return ERR_PTR(r);
	}

	radeon_ring_unlock_commit(rdev, ring, false);
	radeon_sync_free(rdev, &sync, fence);

	return fence;
}
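
/*
 * Illustrative note (not in the original source): each linear COPY packet
 * emitted in the loop above is 7 dwords and moves at most 0x1fffff bytes, so
 * the ring is locked for num_loops * 7 dwords plus 14 dwords of headroom for
 * the synchronization and fence packets emitted around the copy.  For
 * example, copying 1024 GPU pages (4 MiB with 4 KiB pages) gives
 * size_in_bytes = 0x400000, num_loops = DIV_ROUND_UP(0x400000, 0x1fffff) = 3,
 * and a reservation of 3 * 7 + 14 = 35 dwords.
 */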

/**
 * cik_sdma_ring_test - simple async dma engine test
 *
 * @rdev: radeon_device pointer
 * @ring: radeon_ring structure holding ring information
 *
 * Test the DMA engine by using it to write a value to memory (CIK).
 * Returns 0 for success, error for failure.
 */
int cik_sdma_ring_test(struct radeon_device *rdev,
		       struct radeon_ring *ring)
{
	unsigned i;
	int r;
	unsigned index;
	u32 tmp;
	u64 gpu_addr;

	if (ring->idx == R600_RING_TYPE_DMA_INDEX)
		index = R600_WB_DMA_RING_TEST_OFFSET;
	else
		index = CAYMAN_WB_DMA1_RING_TEST_OFFSET;

	gpu_addr = rdev->wb.gpu_addr + index;

	tmp = 0xCAFEDEAD;
	rdev->wb.wb[index/4] = cpu_to_le32(tmp);

	r = radeon_ring_lock(rdev, ring, 5);
	if (r) {
		DRM_ERROR("radeon: dma failed to lock ring %d (%d).\n", ring->idx, r);
		return r;
	}
	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_WRITE, SDMA_WRITE_SUB_OPCODE_LINEAR, 0));
	radeon_ring_write(ring, lower_32_bits(gpu_addr));
	radeon_ring_write(ring, upper_32_bits(gpu_addr));
	radeon_ring_write(ring, 1); /* number of DWs to follow */
	radeon_ring_write(ring, 0xDEADBEEF);
	radeon_ring_unlock_commit(rdev, ring, false);

	for (i = 0; i < rdev->usec_timeout; i++) {
		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
		if (tmp == 0xDEADBEEF)
			break;
		udelay(1);
	}

	if (i < rdev->usec_timeout) {
		DRM_INFO("ring test on %d succeeded in %d usecs\n", ring->idx, i);
	} else {
		DRM_ERROR("radeon: ring %d test failed (0x%08X)\n",
			  ring->idx, tmp);
		r = -EINVAL;
	}
	return r;
}

/**
 * cik_sdma_ib_test - test an IB on the DMA engine
 *
 * @rdev: radeon_device pointer
 * @ring: radeon_ring structure holding ring information
 *
 * Test a simple IB in the DMA ring (CIK).
 * Returns 0 on success, error on failure.
 */
int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
{
	struct radeon_ib ib;
	unsigned i;
	unsigned index;
	int r;
	u32 tmp = 0;
	u64 gpu_addr;

	if (ring->idx == R600_RING_TYPE_DMA_INDEX)
		index = R600_WB_DMA_RING_TEST_OFFSET;
	else
		index = CAYMAN_WB_DMA1_RING_TEST_OFFSET;

	gpu_addr = rdev->wb.gpu_addr + index;

	tmp = 0xCAFEDEAD;
	rdev->wb.wb[index/4] = cpu_to_le32(tmp);

	r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
	if (r) {
		DRM_ERROR("radeon: failed to get ib (%d).\n", r);
		return r;
	}

	ib.ptr[0] = SDMA_PACKET(SDMA_OPCODE_WRITE, SDMA_WRITE_SUB_OPCODE_LINEAR, 0);
	ib.ptr[1] = lower_32_bits(gpu_addr);
	ib.ptr[2] = upper_32_bits(gpu_addr);
	ib.ptr[3] = 1;
	ib.ptr[4] = 0xDEADBEEF;
	ib.length_dw = 5;

	r = radeon_ib_schedule(rdev, &ib, NULL, false);
	if (r) {
		radeon_ib_free(rdev, &ib);
		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
		return r;
	}
	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
		RADEON_USEC_IB_TEST_TIMEOUT));
	if (r < 0) {
		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
		return r;
	} else if (r == 0) {
		DRM_ERROR("radeon: fence wait timed out.\n");
		return -ETIMEDOUT;
	}
	r = 0;
	for (i = 0; i < rdev->usec_timeout; i++) {
		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
		if (tmp == 0xDEADBEEF)
			break;
		udelay(1);
	}
	if (i < rdev->usec_timeout) {
		DRM_INFO("ib test on ring %d succeeded in %u usecs\n", ib.fence->ring, i);
	} else {
		DRM_ERROR("radeon: ib test failed (0x%08X)\n", tmp);
		r = -EINVAL;
	}
	radeon_ib_free(rdev, &ib);
	return r;
}

/**
 * cik_sdma_is_lockup - Check if the DMA engine is locked up
 *
 * @rdev: radeon_device pointer
 * @ring: radeon_ring structure holding ring information
 *
 * Check if the async DMA engine is locked up (CIK).
 * Returns true if the engine appears to be locked up, false if not.
 */
bool cik_sdma_is_lockup(struct radeon_device *rdev, struct radeon_ring *ring)
{
	u32 reset_mask = cik_gpu_check_soft_reset(rdev);
	u32 mask;

	if (ring->idx == R600_RING_TYPE_DMA_INDEX)
		mask = RADEON_RESET_DMA;
	else
		mask = RADEON_RESET_DMA1;

	if (!(reset_mask & mask)) {
		radeon_ring_lockup_update(rdev, ring);
		return false;
	}
	return radeon_ring_test_lockup(rdev, ring);
}

/**
 * cik_sdma_vm_copy_pages - update PTEs by copying them from the GART
 *
 * @rdev: radeon_device pointer
 * @ib: indirect buffer to fill with commands
 * @pe: addr of the page entry
 * @src: src addr to copy from
 * @count: number of page entries to update
 *
 * Update PTEs by copying them from the GART using sDMA (CIK).
 */
void cik_sdma_vm_copy_pages(struct radeon_device *rdev,
			    struct radeon_ib *ib,
			    uint64_t pe, uint64_t src,
			    unsigned count)
{
	while (count) {
		unsigned bytes = count * 8;
		if (bytes > 0x1FFFF8)
			bytes = 0x1FFFF8;

		ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_COPY,
			SDMA_WRITE_SUB_OPCODE_LINEAR, 0);
		ib->ptr[ib->length_dw++] = bytes;
		ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
		ib->ptr[ib->length_dw++] = lower_32_bits(src);
		ib->ptr[ib->length_dw++] = upper_32_bits(src);
		ib->ptr[ib->length_dw++] = lower_32_bits(pe);
		ib->ptr[ib->length_dw++] = upper_32_bits(pe);

		pe += bytes;
		src += bytes;
		count -= bytes / 8;
	}
}
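
/*
 * Illustrative note (not in the original source): each PTE is 8 bytes, and a
 * single linear copy packet moves at most 0x1fffff bytes (see cik_copy_dma()
 * above), so the chunk size is capped at 0x1FFFF8, the largest multiple of 8
 * below that limit.  For example, count = 300000 PTEs is 2400000 bytes and
 * is split into a 0x1FFFF8-byte (262143-PTE) chunk followed by a
 * 302856-byte remainder chunk.
 */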

/**
 * cik_sdma_vm_write_pages - update PTEs by writing them manually
 *
 * @rdev: radeon_device pointer
 * @ib: indirect buffer to fill with commands
 * @pe: addr of the page entry
 * @addr: dst addr to write into pe
 * @count: number of page entries to update
 * @incr: increase next addr by incr bytes
 * @flags: access flags
 *
 * Update PTEs by writing them manually using sDMA (CIK).
 */
void cik_sdma_vm_write_pages(struct radeon_device *rdev,
			     struct radeon_ib *ib,
			     uint64_t pe,
			     uint64_t addr, unsigned count,
			     uint32_t incr, uint32_t flags)
{
	uint64_t value;
	unsigned ndw;

	while (count) {
		ndw = count * 2;
		if (ndw > 0xFFFFE)
			ndw = 0xFFFFE;

		/* for non-physically contiguous pages (system) */
		ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_WRITE,
			SDMA_WRITE_SUB_OPCODE_LINEAR, 0);
		ib->ptr[ib->length_dw++] = pe;
		ib->ptr[ib->length_dw++] = upper_32_bits(pe);
		ib->ptr[ib->length_dw++] = ndw;
		for (; ndw > 0; ndw -= 2, --count, pe += 8) {
			if (flags & R600_PTE_SYSTEM) {
				value = radeon_vm_map_gart(rdev, addr);
			} else if (flags & R600_PTE_VALID) {
				value = addr;
			} else {
				value = 0;
			}
			addr += incr;
			value |= flags;
			ib->ptr[ib->length_dw++] = value;
			ib->ptr[ib->length_dw++] = upper_32_bits(value);
		}
	}
}

/**
 * cik_sdma_vm_set_pages - update the page tables using sDMA
 *
 * @rdev: radeon_device pointer
 * @ib: indirect buffer to fill with commands
 * @pe: addr of the page entry
 * @addr: dst addr to write into pe
 * @count: number of page entries to update
 * @incr: increase next addr by incr bytes
 * @flags: access flags
 *
 * Update the page tables using sDMA (CIK).
 */
void cik_sdma_vm_set_pages(struct radeon_device *rdev,
			   struct radeon_ib *ib,
			   uint64_t pe,
			   uint64_t addr, unsigned count,
			   uint32_t incr, uint32_t flags)
{
	uint64_t value;
	unsigned ndw;

	while (count) {
		ndw = count;
		if (ndw > 0x7FFFF)
			ndw = 0x7FFFF;

		if (flags & R600_PTE_VALID)
			value = addr;
		else
			value = 0;

		/* for physically contiguous pages (vram) */
		ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_GENERATE_PTE_PDE, 0, 0);
		ib->ptr[ib->length_dw++] = pe; /* dst addr */
		ib->ptr[ib->length_dw++] = upper_32_bits(pe);
		ib->ptr[ib->length_dw++] = flags; /* mask */
		ib->ptr[ib->length_dw++] = 0;
		ib->ptr[ib->length_dw++] = value; /* value */
		ib->ptr[ib->length_dw++] = upper_32_bits(value);
		ib->ptr[ib->length_dw++] = incr; /* increment size */
		ib->ptr[ib->length_dw++] = 0;
		ib->ptr[ib->length_dw++] = ndw; /* number of entries */

		pe += ndw * 8;
		addr += ndw * incr;
		count -= ndw;
	}
}

/**
 * cik_sdma_vm_pad_ib - pad the IB to the required number of dw
 *
 * @ib: indirect buffer to fill with padding
 *
 * Pad the IB with NOPs so that its size is a multiple of 8 dwords (CIK).
 */
void cik_sdma_vm_pad_ib(struct radeon_ib *ib)
{
	while (ib->length_dw & 0x7)
		ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0);
}
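
/*
 * Illustrative note (not in the original source): the VM update IB is padded
 * to a multiple of 8 dwords to satisfy the sDMA alignment rule also handled
 * in cik_sdma_ring_ib_execute() above.  For example, an IB with
 * length_dw == 5 gets three NOP headers appended so that length_dw becomes 8.
 */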

/**
 * cik_dma_vm_flush - cik vm flush using sDMA
 *
 * @rdev: radeon_device pointer
 * @ring: radeon_ring pointer
 * @vm_id: VMID to flush
 * @pd_addr: page directory base address
 *
 * Update the page table base and flush the VM TLB
 * using sDMA (CIK).
 */
void cik_dma_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring,
		      unsigned vm_id, uint64_t pd_addr)
{
	u32 extra_bits = (SDMA_POLL_REG_MEM_EXTRA_OP(0) |
			  SDMA_POLL_REG_MEM_EXTRA_FUNC(0)); /* always */

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	if (vm_id < 8) {
		radeon_ring_write(ring, (VM_CONTEXT0_PAGE_TABLE_BASE_ADDR + (vm_id << 2)) >> 2);
	} else {
		radeon_ring_write(ring, (VM_CONTEXT8_PAGE_TABLE_BASE_ADDR + ((vm_id - 8) << 2)) >> 2);
	}
	radeon_ring_write(ring, pd_addr >> 12);

	/* update SH_MEM_* regs */
	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	radeon_ring_write(ring, SRBM_GFX_CNTL >> 2);
	radeon_ring_write(ring, VMID(vm_id));

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	radeon_ring_write(ring, SH_MEM_BASES >> 2);
	radeon_ring_write(ring, 0);

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	radeon_ring_write(ring, SH_MEM_CONFIG >> 2);
	radeon_ring_write(ring, 0);

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	radeon_ring_write(ring, SH_MEM_APE1_BASE >> 2);
	radeon_ring_write(ring, 1);

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	radeon_ring_write(ring, SH_MEM_APE1_LIMIT >> 2);
	radeon_ring_write(ring, 0);

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	radeon_ring_write(ring, SRBM_GFX_CNTL >> 2);
	radeon_ring_write(ring, VMID(0));

	/* flush HDP */
	cik_sdma_hdp_flush_ring_emit(rdev, ring->idx);

	/* flush TLB */
	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000));
	radeon_ring_write(ring, VM_INVALIDATE_REQUEST >> 2);
	radeon_ring_write(ring, 1 << vm_id);

	radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_POLL_REG_MEM, 0, extra_bits));
	radeon_ring_write(ring, VM_INVALIDATE_REQUEST >> 2);
	radeon_ring_write(ring, 0);
	radeon_ring_write(ring, 0); /* reference */
	radeon_ring_write(ring, 0); /* mask */
	radeon_ring_write(ring, (0xfff << 16) | 10); /* retry count, poll interval */
}
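
/*
 * Illustrative note (not in the original source): each SRBM_WRITE packet
 * above is three dwords: the header, the register offset in dwords (hence
 * the ">> 2"), and the value to write.  The 0xf000 passed as the extra bits
 * lands in the top nibble of the header and serves as the byte-enable mask,
 * so all four bytes of the target register are written.  For example, the
 * TLB flush writes (1 << vm_id) to VM_INVALIDATE_REQUEST, and the trailing
 * POLL_REG_MEM (compare function "always") re-reads that register before
 * later packets execute.
 */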