xref: /openbsd/sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c (revision 4a4ef11e)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39 
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69 
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72 
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78 
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82 
83 #include <drm/drm_drv.h>
84 
85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__)
86 #include <asm/intel-family.h>
87 #endif
88 
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96 
97 #define AMDGPU_RESUME_MS		2000
98 #define AMDGPU_MAX_RETRY_LIMIT		2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100 
101 static const struct drm_driver amdgpu_kms_driver;
102 
103 const char *amdgpu_asic_name[] = {
104 	"TAHITI",
105 	"PITCAIRN",
106 	"VERDE",
107 	"OLAND",
108 	"HAINAN",
109 	"BONAIRE",
110 	"KAVERI",
111 	"KABINI",
112 	"HAWAII",
113 	"MULLINS",
114 	"TOPAZ",
115 	"TONGA",
116 	"FIJI",
117 	"CARRIZO",
118 	"STONEY",
119 	"POLARIS10",
120 	"POLARIS11",
121 	"POLARIS12",
122 	"VEGAM",
123 	"VEGA10",
124 	"VEGA12",
125 	"VEGA20",
126 	"RAVEN",
127 	"ARCTURUS",
128 	"RENOIR",
129 	"ALDEBARAN",
130 	"NAVI10",
131 	"CYAN_SKILLFISH",
132 	"NAVI14",
133 	"NAVI12",
134 	"SIENNA_CICHLID",
135 	"NAVY_FLOUNDER",
136 	"VANGOGH",
137 	"DIMGREY_CAVEFISH",
138 	"BEIGE_GOBY",
139 	"YELLOW_CARP",
140 	"IP DISCOVERY",
141 	"LAST",
142 };
143 
144 /**
145  * DOC: pcie_replay_count
146  *
147  * The amdgpu driver provides a sysfs API for reporting the total number
148  * of PCIe replays (NAKs).
149  * The file pcie_replay_count is used for this and returns the total
150  * number of replays as a sum of the NAKs generated and NAKs received.
151  */
152 
153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
154 		struct device_attribute *attr, char *buf)
155 {
156 	struct drm_device *ddev = dev_get_drvdata(dev);
157 	struct amdgpu_device *adev = drm_to_adev(ddev);
158 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
159 
160 	return sysfs_emit(buf, "%llu\n", cnt);
161 }
162 
163 static DEVICE_ATTR(pcie_replay_count, 0444,
164 		amdgpu_device_get_pcie_replay_count, NULL);
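
/*
 * Illustrative sketch (not part of the original file): userspace reads the
 * attribute above like any other sysfs file.  The card0 path below is an
 * assumption and depends on how the DRM devices were enumerated.
 *
 *	char buf[32] = {};
 *	int fd = open("/sys/class/drm/card0/device/pcie_replay_count", O_RDONLY);
 *
 *	if (fd >= 0 && read(fd, buf, sizeof(buf) - 1) > 0)
 *		printf("PCIe replays: %s", buf);
 */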
165 
166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
167 
168 
169 /**
170  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
171  *
172  * @dev: drm_device pointer
173  *
174  * Returns true if the device is a dGPU with ATPX power control,
175  * otherwise return false.
176  */
177 bool amdgpu_device_supports_px(struct drm_device *dev)
178 {
179 	struct amdgpu_device *adev = drm_to_adev(dev);
180 
181 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
182 		return true;
183 	return false;
184 }
185 
186 /**
187  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
188  *
189  * @dev: drm_device pointer
190  *
191  * Returns true if the device is a dGPU with ACPI power control,
192  * otherwise return false.
193  */
194 bool amdgpu_device_supports_boco(struct drm_device *dev)
195 {
196 	struct amdgpu_device *adev = drm_to_adev(dev);
197 
198 	if (adev->has_pr3 ||
199 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
200 		return true;
201 	return false;
202 }
203 
204 /**
205  * amdgpu_device_supports_baco - Does the device support BACO
206  *
207  * @dev: drm_device pointer
208  *
209  * Returns true if the device supports BACO,
210  * otherwise return false.
211  */
212 bool amdgpu_device_supports_baco(struct drm_device *dev)
213 {
214 	struct amdgpu_device *adev = drm_to_adev(dev);
215 
216 	return amdgpu_asic_supports_baco(adev);
217 }
218 
219 /**
220  * amdgpu_device_supports_smart_shift - Is the device a dGPU with
221  * Smart Shift support
222  *
223  * @dev: drm_device pointer
224  *
225  * Returns true if the device is a dGPU with Smart Shift support,
226  * otherwise returns false.
227  */
228 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
229 {
230 	return (amdgpu_device_supports_boco(dev) &&
231 		amdgpu_acpi_is_power_shift_control_supported());
232 }
233 
234 /*
235  * VRAM access helper functions
236  */
237 
238 /**
239  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
240  *
241  * @adev: amdgpu_device pointer
242  * @pos: offset of the buffer in vram
243  * @buf: virtual address of the buffer in system memory
244  * @size: read/write size, sizeof(@buf) must be > @size
245  * @write: true - write to vram, otherwise - read from vram
246  */
247 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
248 			     void *buf, size_t size, bool write)
249 {
250 	unsigned long flags;
251 	uint32_t hi = ~0, tmp = 0;
252 	uint32_t *data = buf;
253 	uint64_t last;
254 	int idx;
255 
256 	if (!drm_dev_enter(adev_to_drm(adev), &idx))
257 		return;
258 
259 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
260 
261 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
262 	for (last = pos + size; pos < last; pos += 4) {
263 		tmp = pos >> 31;
264 
265 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
266 		if (tmp != hi) {
267 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
268 			hi = tmp;
269 		}
270 		if (write)
271 			WREG32_NO_KIQ(mmMM_DATA, *data++);
272 		else
273 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
274 	}
275 
276 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
277 	drm_dev_exit(idx);
278 }
279 
280 /**
281  * amdgpu_device_aper_access - access vram by the vram aperture
282  *
283  * @adev: amdgpu_device pointer
284  * @pos: offset of the buffer in vram
285  * @buf: virtual address of the buffer in system memory
286  * @size: read/write size, sizeof(@buf) must be > @size
287  * @write: true - write to vram, otherwise - read from vram
288  *
289  * The return value means how many bytes have been transferred.
290  */
291 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
292 				 void *buf, size_t size, bool write)
293 {
294 #ifdef CONFIG_64BIT
295 	void __iomem *addr;
296 	size_t count = 0;
297 	uint64_t last;
298 
299 	if (!adev->mman.aper_base_kaddr)
300 		return 0;
301 
302 	last = min(pos + size, adev->gmc.visible_vram_size);
303 	if (last > pos) {
304 		addr = adev->mman.aper_base_kaddr + pos;
305 		count = last - pos;
306 
307 		if (write) {
308 			memcpy_toio(addr, buf, count);
309 			/* Make sure HDP write cache flush happens without any reordering
310 			 * after the system memory contents are sent over PCIe to the device
311 			 */
312 			mb();
313 			amdgpu_device_flush_hdp(adev, NULL);
314 		} else {
315 			amdgpu_device_invalidate_hdp(adev, NULL);
316 			/* Make sure HDP read cache is invalidated before issuing a read
317 			 * to the PCIe device
318 			 */
319 			mb();
320 			memcpy_fromio(buf, addr, count);
321 		}
322 
323 	}
324 
325 	return count;
326 #else
327 	return 0;
328 #endif
329 }
330 
331 /**
332  * amdgpu_device_vram_access - read/write a buffer in vram
333  *
334  * @adev: amdgpu_device pointer
335  * @pos: offset of the buffer in vram
336  * @buf: virtual address of the buffer in system memory
337  * @size: read/write size, sizeof(@buf) must be > @size
338  * @write: true - write to vram, otherwise - read from vram
339  */
340 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
341 			       void *buf, size_t size, bool write)
342 {
343 	size_t count;
344 
345 	/* try using the vram aperture to access vram first */
346 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
347 	size -= count;
348 	if (size) {
349 		/* use MM to access the rest of the vram */
350 		pos += count;
351 		buf += count;
352 		amdgpu_device_mm_access(adev, pos, buf, size, write);
353 	}
354 }
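
/*
 * Illustrative sketch (not part of the original file): callers do not need to
 * know whether the aperture or the MM_INDEX/MM_DATA fallback is used, they
 * only have to keep @pos and @size dword aligned.  "offset" below stands for
 * a hypothetical 4-byte aligned VRAM offset.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, offset, &val, sizeof(val), false);
 *	val |= 0x1;
 *	amdgpu_device_vram_access(adev, offset, &val, sizeof(val), true);
 */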
355 
356 /*
357  * register access helper functions.
358  */
359 
360 /* Check if hw access should be skipped because of hotplug or device error */
361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
362 {
363 	if (adev->no_hw_access)
364 		return true;
365 
366 #ifdef CONFIG_LOCKDEP
367 	/*
368 	 * This is a bit complicated to understand, so worth a comment. What we assert
369 	 * here is that the GPU reset is not running on another thread in parallel.
370 	 *
371 	 * For this we trylock the read side of the reset semaphore, if that succeeds
372 	 * we know that the reset is not running in parallel.
373 	 *
374 	 * If the trylock fails we assert that we are either already holding the read
375 	 * side of the lock or are the reset thread itself and hold the write side of
376 	 * the lock.
377 	 */
378 	if (in_task()) {
379 		if (down_read_trylock(&adev->reset_domain->sem))
380 			up_read(&adev->reset_domain->sem);
381 		else
382 			lockdep_assert_held(&adev->reset_domain->sem);
383 	}
384 #endif
385 	return false;
386 }
387 
388 /**
389  * amdgpu_device_rreg - read a memory mapped IO or indirect register
390  *
391  * @adev: amdgpu_device pointer
392  * @reg: dword aligned register offset
393  * @acc_flags: access flags which require special behavior
394  *
395  * Returns the 32 bit value from the offset specified.
396  */
397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
398 			    uint32_t reg, uint32_t acc_flags)
399 {
400 	uint32_t ret;
401 
402 	if (amdgpu_device_skip_hw_access(adev))
403 		return 0;
404 
405 	if ((reg * 4) < adev->rmmio_size) {
406 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
407 		    amdgpu_sriov_runtime(adev) &&
408 		    down_read_trylock(&adev->reset_domain->sem)) {
409 			ret = amdgpu_kiq_rreg(adev, reg);
410 			up_read(&adev->reset_domain->sem);
411 		} else {
412 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
413 		}
414 	} else {
415 		ret = adev->pcie_rreg(adev, reg * 4);
416 	}
417 
418 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
419 
420 	return ret;
421 }
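
/*
 * Illustrative sketch (not part of the original file): most callers go through
 * the RREG32()/RREG32_NO_KIQ() style macros in amdgpu.h, which are expected to
 * expand to this helper.  mmSOME_REG is a placeholder offset, not a real
 * register define.
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_rreg(adev, mmSOME_REG, 0);
 *	tmp = amdgpu_device_rreg(adev, mmSOME_REG, AMDGPU_REGS_NO_KIQ);
 */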
422 
423 /*
424  * MMIO register read with bytes helper functions
425  * @offset: byte offset from MMIO start
426  */
427 
428 /**
429  * amdgpu_mm_rreg8 - read a memory mapped IO register
430  *
431  * @adev: amdgpu_device pointer
432  * @offset: byte aligned register offset
433  *
434  * Returns the 8 bit value from the offset specified.
435  */
436 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
437 {
438 	if (amdgpu_device_skip_hw_access(adev))
439 		return 0;
440 
441 	if (offset < adev->rmmio_size)
442 		return (readb(adev->rmmio + offset));
443 	BUG();
444 }
445 
446 /*
447  * MMIO register write with bytes helper functions
448  * @offset: byte offset from MMIO start
449  * @value: the value to be written to the register
450  */
451 
452 /**
453  * amdgpu_mm_wreg8 - write a memory mapped IO register
454  *
455  * @adev: amdgpu_device pointer
456  * @offset: byte aligned register offset
457  * @value: 8 bit value to write
458  *
459  * Writes the value specified to the offset specified.
460  */
461 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
462 {
463 	if (amdgpu_device_skip_hw_access(adev))
464 		return;
465 
466 	if (offset < adev->rmmio_size)
467 		writeb(value, adev->rmmio + offset);
468 	else
469 		BUG();
470 }
471 
472 /**
473  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
474  *
475  * @adev: amdgpu_device pointer
476  * @reg: dword aligned register offset
477  * @v: 32 bit value to write to the register
478  * @acc_flags: access flags which require special behavior
479  *
480  * Writes the value specified to the offset specified.
481  */
482 void amdgpu_device_wreg(struct amdgpu_device *adev,
483 			uint32_t reg, uint32_t v,
484 			uint32_t acc_flags)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return;
488 
489 	if ((reg * 4) < adev->rmmio_size) {
490 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
491 		    amdgpu_sriov_runtime(adev) &&
492 		    down_read_trylock(&adev->reset_domain->sem)) {
493 			amdgpu_kiq_wreg(adev, reg, v);
494 			up_read(&adev->reset_domain->sem);
495 		} else {
496 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
497 		}
498 	} else {
499 		adev->pcie_wreg(adev, reg * 4, v);
500 	}
501 
502 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
503 }
504 
505 /**
506  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
507  *
508  * @adev: amdgpu_device pointer
509  * @reg: mmio/rlc register
510  * @v: value to write
511  *
512  * this function is invoked only for the debugfs register access
513  */
514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
515 			     uint32_t reg, uint32_t v,
516 			     uint32_t xcc_id)
517 {
518 	if (amdgpu_device_skip_hw_access(adev))
519 		return;
520 
521 	if (amdgpu_sriov_fullaccess(adev) &&
522 	    adev->gfx.rlc.funcs &&
523 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
524 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
525 			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
526 	} else if ((reg * 4) >= adev->rmmio_size) {
527 		adev->pcie_wreg(adev, reg * 4, v);
528 	} else {
529 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
530 	}
531 }
532 
533 /**
534  * amdgpu_device_indirect_rreg - read an indirect register
535  *
536  * @adev: amdgpu_device pointer
537  * @reg_addr: indirect register address to read from
538  *
539  * Returns the value of indirect register @reg_addr
540  */
541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
542 				u32 reg_addr)
543 {
544 	unsigned long flags, pcie_index, pcie_data;
545 	void __iomem *pcie_index_offset;
546 	void __iomem *pcie_data_offset;
547 	u32 r;
548 
549 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
550 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
551 
552 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
553 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
554 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
555 
556 	writel(reg_addr, pcie_index_offset);
557 	readl(pcie_index_offset);
558 	r = readl(pcie_data_offset);
559 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
560 
561 	return r;
562 }
563 
564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
565 				    u64 reg_addr)
566 {
567 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
568 	u32 r;
569 	void __iomem *pcie_index_offset;
570 	void __iomem *pcie_index_hi_offset;
571 	void __iomem *pcie_data_offset;
572 
573 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
574 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
575 	if (adev->nbio.funcs->get_pcie_index_hi_offset)
576 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
577 	else
578 		pcie_index_hi = 0;
579 
580 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
581 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
582 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
583 	if (pcie_index_hi != 0)
584 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
585 				pcie_index_hi * 4;
586 
587 	writel(reg_addr, pcie_index_offset);
588 	readl(pcie_index_offset);
589 	if (pcie_index_hi != 0) {
590 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
591 		readl(pcie_index_hi_offset);
592 	}
593 	r = readl(pcie_data_offset);
594 
595 	/* clear the high bits */
596 	if (pcie_index_hi != 0) {
597 		writel(0, pcie_index_hi_offset);
598 		readl(pcie_index_hi_offset);
599 	}
600 
601 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602 
603 	return r;
604 }
605 
606 /**
607  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
608  *
609  * @adev: amdgpu_device pointer
610  * @reg_addr: indirect register address to read from
611  *
612  * Returns the value of indirect register @reg_addr
613  */
614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
615 				  u32 reg_addr)
616 {
617 	unsigned long flags, pcie_index, pcie_data;
618 	void __iomem *pcie_index_offset;
619 	void __iomem *pcie_data_offset;
620 	u64 r;
621 
622 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
623 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
624 
625 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
626 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
627 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
628 
629 	/* read low 32 bits */
630 	writel(reg_addr, pcie_index_offset);
631 	readl(pcie_index_offset);
632 	r = readl(pcie_data_offset);
633 	/* read high 32 bits */
634 	writel(reg_addr + 4, pcie_index_offset);
635 	readl(pcie_index_offset);
636 	r |= ((u64)readl(pcie_data_offset) << 32);
637 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
638 
639 	return r;
640 }
641 
642 /**
643  * amdgpu_device_indirect_wreg - write an indirect register address
644  *
645  * @adev: amdgpu_device pointer
646  * @reg_addr: indirect register offset
647  * @reg_data: indirect register data
648  *
649  */
650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
651 				 u32 reg_addr, u32 reg_data)
652 {
653 	unsigned long flags, pcie_index, pcie_data;
654 	void __iomem *pcie_index_offset;
655 	void __iomem *pcie_data_offset;
656 
657 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
658 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
659 
660 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
661 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
662 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
663 
664 	writel(reg_addr, pcie_index_offset);
665 	readl(pcie_index_offset);
666 	writel(reg_data, pcie_data_offset);
667 	readl(pcie_data_offset);
668 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
669 }
670 
671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
672 				     u64 reg_addr, u32 reg_data)
673 {
674 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
675 	void __iomem *pcie_index_offset;
676 	void __iomem *pcie_index_hi_offset;
677 	void __iomem *pcie_data_offset;
678 
679 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
680 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
681 	if (adev->nbio.funcs->get_pcie_index_hi_offset)
682 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
683 	else
684 		pcie_index_hi = 0;
685 
686 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
687 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
688 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
689 	if (pcie_index_hi != 0)
690 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
691 				pcie_index_hi * 4;
692 
693 	writel(reg_addr, pcie_index_offset);
694 	readl(pcie_index_offset);
695 	if (pcie_index_hi != 0) {
696 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
697 		readl(pcie_index_hi_offset);
698 	}
699 	writel(reg_data, pcie_data_offset);
700 	readl(pcie_data_offset);
701 
702 	/* clear the high bits */
703 	if (pcie_index_hi != 0) {
704 		writel(0, pcie_index_hi_offset);
705 		readl(pcie_index_hi_offset);
706 	}
707 
708 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
709 }
710 
711 /**
712  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
713  *
714  * @adev: amdgpu_device pointer
715  * @reg_addr: indirect register offset
716  * @reg_data: indirect register data
717  *
718  */
719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
720 				   u32 reg_addr, u64 reg_data)
721 {
722 	unsigned long flags, pcie_index, pcie_data;
723 	void __iomem *pcie_index_offset;
724 	void __iomem *pcie_data_offset;
725 
726 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
727 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
728 
729 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
730 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
731 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
732 
733 	/* write low 32 bits */
734 	writel(reg_addr, pcie_index_offset);
735 	readl(pcie_index_offset);
736 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
737 	readl(pcie_data_offset);
738 	/* write high 32 bits */
739 	writel(reg_addr + 4, pcie_index_offset);
740 	readl(pcie_index_offset);
741 	writel((u32)(reg_data >> 32), pcie_data_offset);
742 	readl(pcie_data_offset);
743 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
744 }
745 
746 /**
747  * amdgpu_device_get_rev_id - query device rev_id
748  *
749  * @adev: amdgpu_device pointer
750  *
751  * Return device rev_id
752  */
753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
754 {
755 	return adev->nbio.funcs->get_rev_id(adev);
756 }
757 
758 /**
759  * amdgpu_invalid_rreg - dummy reg read function
760  *
761  * @adev: amdgpu_device pointer
762  * @reg: offset of register
763  *
764  * Dummy register read function.  Used for register blocks
765  * that certain asics don't have (all asics).
766  * Returns the value in the register.
767  */
768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
769 {
770 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
771 	BUG();
772 	return 0;
773 }
774 
775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
776 {
777 	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
778 	BUG();
779 	return 0;
780 }
781 
782 /**
783  * amdgpu_invalid_wreg - dummy reg write function
784  *
785  * @adev: amdgpu_device pointer
786  * @reg: offset of register
787  * @v: value to write to the register
788  *
789  * Dummy register write function.  Used for register blocks
790  * that certain asics don't have (all asics).
791  */
792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
793 {
794 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
795 		  reg, v);
796 	BUG();
797 }
798 
799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
800 {
801 	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
802 		  reg, v);
803 	BUG();
804 }
805 
806 /**
807  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
808  *
809  * @adev: amdgpu_device pointer
810  * @reg: offset of register
811  *
812  * Dummy register read function.  Used for register blocks
813  * that certain asics don't have (all asics).
814  * Returns the value in the register.
815  */
816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
817 {
818 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
819 	BUG();
820 	return 0;
821 }
822 
823 /**
824  * amdgpu_invalid_wreg64 - dummy reg write function
825  *
826  * @adev: amdgpu_device pointer
827  * @reg: offset of register
828  * @v: value to write to the register
829  *
830  * Dummy register write function.  Used for register blocks
831  * that certain asics don't have (all asics).
832  */
833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
834 {
835 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
836 		  reg, v);
837 	BUG();
838 }
839 
840 /**
841  * amdgpu_block_invalid_rreg - dummy reg read function
842  *
843  * @adev: amdgpu_device pointer
844  * @block: offset of instance
845  * @reg: offset of register
846  *
847  * Dummy register read function.  Used for register blocks
848  * that certain asics don't have (all asics).
849  * Returns the value in the register.
850  */
851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
852 					  uint32_t block, uint32_t reg)
853 {
854 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
855 		  reg, block);
856 	BUG();
857 	return 0;
858 }
859 
860 /**
861  * amdgpu_block_invalid_wreg - dummy reg write function
862  *
863  * @adev: amdgpu_device pointer
864  * @block: offset of instance
865  * @reg: offset of register
866  * @v: value to write to the register
867  *
868  * Dummy register write function.  Used for register blocks
869  * that certain asics don't have (all asics).
870  */
871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
872 				      uint32_t block,
873 				      uint32_t reg, uint32_t v)
874 {
875 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
876 		  reg, block, v);
877 	BUG();
878 }
879 
880 /**
881  * amdgpu_device_asic_init - Wrapper for atom asic_init
882  *
883  * @adev: amdgpu_device pointer
884  *
885  * Does any asic specific work and then calls atom asic init.
886  */
887 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
888 {
889 	int ret;
890 
891 	amdgpu_asic_pre_asic_init(adev);
892 
893 	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
894 	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
895 		amdgpu_psp_wait_for_bootloader(adev);
896 		ret = amdgpu_atomfirmware_asic_init(adev, true);
897 		return ret;
898 	} else {
899 		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
900 	}
901 
902 	return 0;
903 }
904 
905 /**
906  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
907  *
908  * @adev: amdgpu_device pointer
909  *
910  * Allocates a scratch page of VRAM for use by various things in the
911  * driver.
912  */
913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
914 {
915 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
916 				       AMDGPU_GEM_DOMAIN_VRAM |
917 				       AMDGPU_GEM_DOMAIN_GTT,
918 				       &adev->mem_scratch.robj,
919 				       &adev->mem_scratch.gpu_addr,
920 				       (void **)&adev->mem_scratch.ptr);
921 }
922 
923 /**
924  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
925  *
926  * @adev: amdgpu_device pointer
927  *
928  * Frees the VRAM scratch page.
929  */
930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
931 {
932 	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
933 }
934 
935 /**
936  * amdgpu_device_program_register_sequence - program an array of registers.
937  *
938  * @adev: amdgpu_device pointer
939  * @registers: pointer to the register array
940  * @array_size: size of the register array
941  *
942  * Programs an array of registers with and/or masks.
943  * This is a helper for setting golden registers.
944  */
945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
946 					     const u32 *registers,
947 					     const u32 array_size)
948 {
949 	u32 tmp, reg, and_mask, or_mask;
950 	int i;
951 
952 	if (array_size % 3)
953 		return;
954 
955 	for (i = 0; i < array_size; i += 3) {
956 		reg = registers[i + 0];
957 		and_mask = registers[i + 1];
958 		or_mask = registers[i + 2];
959 
960 		if (and_mask == 0xffffffff) {
961 			tmp = or_mask;
962 		} else {
963 			tmp = RREG32(reg);
964 			tmp &= ~and_mask;
965 			if (adev->family >= AMDGPU_FAMILY_AI)
966 				tmp |= (or_mask & and_mask);
967 			else
968 				tmp |= or_mask;
969 		}
970 		WREG32(reg, tmp);
971 	}
972 }
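
/*
 * Illustrative sketch (not part of the original file): golden register tables
 * are flat arrays of (register, and_mask, or_mask) triplets.  The register
 * name and values below are made up for the example.
 *
 *	static const u32 golden_settings_example[] = {
 *		mmSOME_REG, 0x0000ffff, 0x00000042,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 */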
973 
974 /**
975  * amdgpu_device_pci_config_reset - reset the GPU
976  *
977  * @adev: amdgpu_device pointer
978  *
979  * Resets the GPU using the pci config reset sequence.
980  * Only applicable to asics prior to vega10.
981  */
982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
983 {
984 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
985 }
986 
987 /**
988  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
989  *
990  * @adev: amdgpu_device pointer
991  *
992  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
993  */
994 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
995 {
996 	STUB();
997 	return -ENOSYS;
998 #ifdef notyet
999 	return pci_reset_function(adev->pdev);
1000 #endif
1001 }
1002 
1003 /*
1004  * amdgpu_device_wb_*()
1005  * Writeback is the method by which the GPU updates special pages in memory
1006  * with the status of certain GPU events (fences, ring pointers, etc.).
1007  */
1008 
1009 /**
1010  * amdgpu_device_wb_fini - Disable Writeback and free memory
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Disables Writeback and frees the Writeback memory (all asics).
1015  * Used at driver shutdown.
1016  */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 	if (adev->wb.wb_obj) {
1020 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 				      &adev->wb.gpu_addr,
1022 				      (void **)&adev->wb.wb);
1023 		adev->wb.wb_obj = NULL;
1024 	}
1025 }
1026 
1027 /**
1028  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1029  *
1030  * @adev: amdgpu_device pointer
1031  *
1032  * Initializes writeback and allocates writeback memory (all asics).
1033  * Used at driver startup.
1034  * Returns 0 on success or an -error on failure.
1035  */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 	int r;
1039 
1040 	if (adev->wb.wb_obj == NULL) {
1041 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 					    (void **)&adev->wb.wb);
1046 		if (r) {
1047 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 			return r;
1049 		}
1050 
1051 		adev->wb.num_wb = AMDGPU_MAX_WB;
1052 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053 
1054 		/* clear wb memory */
1055 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 	}
1057 
1058 	return 0;
1059 }
1060 
1061 /**
1062  * amdgpu_device_wb_get - Allocate a wb entry
1063  *
1064  * @adev: amdgpu_device pointer
1065  * @wb: wb index
1066  *
1067  * Allocate a wb slot for use by the driver (all asics).
1068  * Returns 0 on success or -EINVAL on failure.
1069  */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073 
1074 	if (offset < adev->wb.num_wb) {
1075 		__set_bit(offset, adev->wb.used);
1076 		*wb = offset << 3; /* convert to dw offset */
1077 		return 0;
1078 	} else {
1079 		return -EINVAL;
1080 	}
1081 }
1082 
1083 /**
1084  * amdgpu_device_wb_free - Free a wb entry
1085  *
1086  * @adev: amdgpu_device pointer
1087  * @wb: wb index
1088  *
1089  * Free a wb slot allocated for use by the driver (all asics)
1090  */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 	wb >>= 3;
1094 	if (wb < adev->wb.num_wb)
1095 		__clear_bit(wb, adev->wb.used);
1096 }
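
/*
 * Illustrative sketch (not part of the original file): the index handed out by
 * amdgpu_device_wb_get() is a dword offset, so the matching CPU and GPU
 * addresses of a writeback slot are derived as shown below.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *		// ... let the GPU write status to gpu_addr, poll *cpu_ptr ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */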
1097 
1098 /**
1099  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100  *
1101  * @adev: amdgpu_device pointer
1102  *
1103  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1104  * to fail, but if any of the BARs is not accessible after the resize we abort
1105  * driver loading by returning -ENODEV.
1106  */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 #ifdef __linux__
1110 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1111 	struct pci_bus *root;
1112 	struct resource *res;
1113 	unsigned int i;
1114 	u16 cmd;
1115 	int r;
1116 
1117 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1118 		return 0;
1119 
1120 	/* Bypass for VF */
1121 	if (amdgpu_sriov_vf(adev))
1122 		return 0;
1123 
1124 	/* skip if the bios has already enabled large BAR */
1125 	if (adev->gmc.real_vram_size &&
1126 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1127 		return 0;
1128 
1129 	/* Check if the root BUS has 64bit memory resources */
1130 	root = adev->pdev->bus;
1131 	while (root->parent)
1132 		root = root->parent;
1133 
1134 	pci_bus_for_each_resource(root, res, i) {
1135 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1136 		    res->start > 0x100000000ull)
1137 			break;
1138 	}
1139 
1140 	/* Trying to resize is pointless without a root hub window above 4GB */
1141 	if (!res)
1142 		return 0;
1143 
1144 	/* Limit the BAR size to what is available */
1145 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1146 			rbar_size);
1147 
1148 	/* Disable memory decoding while we change the BAR addresses and size */
1149 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1150 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1151 			      cmd & ~PCI_COMMAND_MEMORY);
1152 
1153 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1154 	amdgpu_doorbell_fini(adev);
1155 	if (adev->asic_type >= CHIP_BONAIRE)
1156 		pci_release_resource(adev->pdev, 2);
1157 
1158 	pci_release_resource(adev->pdev, 0);
1159 
1160 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1161 	if (r == -ENOSPC)
1162 		DRM_INFO("Not enough PCI address space for a large BAR.");
1163 	else if (r && r != -ENOTSUPP)
1164 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1165 
1166 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1167 
1168 	/* When the doorbell or fb BAR isn't available we have no chance of
1169 	 * using the device.
1170 	 */
1171 	r = amdgpu_doorbell_init(adev);
1172 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1173 		return -ENODEV;
1174 
1175 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1176 #endif /* __linux__ */
1177 
1178 	return 0;
1179 }
1180 
1181 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1182 {
1183 	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1184 		return false;
1185 
1186 	return true;
1187 }
1188 
1189 /*
1190  * GPU helpers function.
1191  */
1192 /**
1193  * amdgpu_device_need_post - check if the hw need post or not
1194  *
1195  * @adev: amdgpu_device pointer
1196  *
1197  * Check if the asic has been initialized (all asics) at driver startup
1198  * or if post is needed because a hw reset was performed.
1199  * Returns true if post is needed, false if not.
1200  */
1201 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1202 {
1203 	uint32_t reg;
1204 
1205 	if (amdgpu_sriov_vf(adev))
1206 		return false;
1207 
1208 	if (!amdgpu_device_read_bios(adev))
1209 		return false;
1210 
1211 	if (amdgpu_passthrough(adev)) {
1212 		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1213 		 * reboot some old SMC firmware still needs the driver to do a vPost or the
1214 		 * GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1215 		 * force vPost for SMC versions below 22.15.
1216 		 */
1217 		if (adev->asic_type == CHIP_FIJI) {
1218 			int err;
1219 			uint32_t fw_ver;
1220 
1221 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1222 			/* force vPost if error occurred */
1223 			if (err)
1224 				return true;
1225 
1226 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1227 			release_firmware(adev->pm.fw);
1228 			if (fw_ver < 0x00160e00)
1229 				return true;
1230 		}
1231 	}
1232 
1233 	/* Don't post if we need to reset whole hive on init */
1234 	if (adev->gmc.xgmi.pending_reset)
1235 		return false;
1236 
1237 	if (adev->has_hw_reset) {
1238 		adev->has_hw_reset = false;
1239 		return true;
1240 	}
1241 
1242 	/* bios scratch used on CIK+ */
1243 	if (adev->asic_type >= CHIP_BONAIRE)
1244 		return amdgpu_atombios_scratch_need_asic_init(adev);
1245 
1246 	/* check MEM_SIZE for older asics */
1247 	reg = amdgpu_asic_get_config_memsize(adev);
1248 
1249 	if ((reg != 0) && (reg != 0xffffffff))
1250 		return false;
1251 
1252 	return true;
1253 }
1254 
1255 /*
1256  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1257  * speed switching. Until we have confirmation from Intel that a specific host
1258  * supports it, it's safer that we keep it disabled for all.
1259  *
1260  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1261  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1262  */
1263 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1264 {
1265 #if IS_ENABLED(CONFIG_X86)
1266 #ifdef __linux__
1267 	struct cpuinfo_x86 *c = &cpu_data(0);
1268 
1269 	if (c->x86_vendor == X86_VENDOR_INTEL)
1270 #else
1271 	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
1272 #endif
1273 		return false;
1274 #endif
1275 	return true;
1276 }
1277 
1278 /**
1279  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1280  *
1281  * @adev: amdgpu_device pointer
1282  *
1283  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1284  * be set for this device.
1285  *
1286  * Returns true if it should be used or false if not.
1287  */
1288 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1289 {
1290 	switch (amdgpu_aspm) {
1291 	case -1:
1292 		break;
1293 	case 0:
1294 		return false;
1295 	case 1:
1296 		return true;
1297 	default:
1298 		return false;
1299 	}
1300 	return pcie_aspm_enabled(adev->pdev);
1301 }
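
/*
 * Illustrative note (not part of the original file): the decision above is
 * driven by the amdgpu.aspm module parameter, e.g. booting with
 * "amdgpu.aspm=0" forces ASPM off, "amdgpu.aspm=1" forces it on, and the
 * default of -1 defers to whatever the PCIe bridge already negotiated.
 */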
1302 
1303 bool amdgpu_device_aspm_support_quirk(void)
1304 {
1305 #if IS_ENABLED(CONFIG_X86)
1306 	struct cpu_info *ci = curcpu();
1307 
1308 	return !(ci->ci_family == 6 && ci->ci_model == 0x97);
1309 #else
1310 	return true;
1311 #endif
1312 }
1313 
1314 /* if we get transitioned to only one device, take VGA back */
1315 /**
1316  * amdgpu_device_vga_set_decode - enable/disable vga decode
1317  *
1318  * @pdev: PCI device pointer
1319  * @state: enable/disable vga decode
1320  *
1321  * Enable/disable vga decode (all asics).
1322  * Returns VGA resource flags.
1323  */
1324 #ifdef notyet
1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1326 		bool state)
1327 {
1328 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1329 
1330 	amdgpu_asic_set_vga_state(adev, state);
1331 	if (state)
1332 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1333 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1334 	else
1335 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1336 }
1337 #endif
1338 
1339 /**
1340  * amdgpu_device_check_block_size - validate the vm block size
1341  *
1342  * @adev: amdgpu_device pointer
1343  *
1344  * Validates the vm block size specified via module parameter.
1345  * The vm block size defines number of bits in page table versus page directory,
1346  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1347  * page table and the remaining bits are in the page directory.
1348  */
1349 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1350 {
1351 	/* defines number of bits in page table versus page directory,
1352 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1353 	 * page table and the remaining bits are in the page directory
1354 	 */
1355 	if (amdgpu_vm_block_size == -1)
1356 		return;
1357 
1358 	if (amdgpu_vm_block_size < 9) {
1359 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1360 			 amdgpu_vm_block_size);
1361 		amdgpu_vm_block_size = -1;
1362 	}
1363 }
1364 
1365 /**
1366  * amdgpu_device_check_vm_size - validate the vm size
1367  *
1368  * @adev: amdgpu_device pointer
1369  *
1370  * Validates the vm size in GB specified via module parameter.
1371  * The VM size is the size of the GPU virtual memory space in GB.
1372  */
1373 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1374 {
1375 	/* no need to check the default value */
1376 	if (amdgpu_vm_size == -1)
1377 		return;
1378 
1379 	if (amdgpu_vm_size < 1) {
1380 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1381 			 amdgpu_vm_size);
1382 		amdgpu_vm_size = -1;
1383 	}
1384 }
1385 
1386 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1387 {
1388 #ifdef __linux__
1389 	struct sysinfo si;
1390 #endif
1391 	bool is_os_64 = (sizeof(void *) == 8);
1392 	uint64_t total_memory;
1393 	uint64_t dram_size_seven_GB = 0x1B8000000;
1394 	uint64_t dram_size_three_GB = 0xB8000000;
1395 
1396 	if (amdgpu_smu_memory_pool_size == 0)
1397 		return;
1398 
1399 	if (!is_os_64) {
1400 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1401 		goto def_value;
1402 	}
1403 #ifdef __linux__
1404 	si_meminfo(&si);
1405 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1406 #else
1407 	total_memory = ptoa(physmem);
1408 #endif
1409 
1410 	if ((amdgpu_smu_memory_pool_size == 1) ||
1411 		(amdgpu_smu_memory_pool_size == 2)) {
1412 		if (total_memory < dram_size_three_GB)
1413 			goto def_value1;
1414 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1415 		(amdgpu_smu_memory_pool_size == 8)) {
1416 		if (total_memory < dram_size_seven_GB)
1417 			goto def_value1;
1418 	} else {
1419 		DRM_WARN("Smu memory pool size not supported\n");
1420 		goto def_value;
1421 	}
1422 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1423 
1424 	return;
1425 
1426 def_value1:
1427 	DRM_WARN("No enough system memory\n");
1428 def_value:
1429 	adev->pm.smu_prv_buffer_size = 0;
1430 }
1431 
1432 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1433 {
1434 	if (!(adev->flags & AMD_IS_APU) ||
1435 	    adev->asic_type < CHIP_RAVEN)
1436 		return 0;
1437 
1438 	switch (adev->asic_type) {
1439 	case CHIP_RAVEN:
1440 		if (adev->pdev->device == 0x15dd)
1441 			adev->apu_flags |= AMD_APU_IS_RAVEN;
1442 		if (adev->pdev->device == 0x15d8)
1443 			adev->apu_flags |= AMD_APU_IS_PICASSO;
1444 		break;
1445 	case CHIP_RENOIR:
1446 		if ((adev->pdev->device == 0x1636) ||
1447 		    (adev->pdev->device == 0x164c))
1448 			adev->apu_flags |= AMD_APU_IS_RENOIR;
1449 		else
1450 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1451 		break;
1452 	case CHIP_VANGOGH:
1453 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
1454 		break;
1455 	case CHIP_YELLOW_CARP:
1456 		break;
1457 	case CHIP_CYAN_SKILLFISH:
1458 		if ((adev->pdev->device == 0x13FE) ||
1459 		    (adev->pdev->device == 0x143F))
1460 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1461 		break;
1462 	default:
1463 		break;
1464 	}
1465 
1466 	return 0;
1467 }
1468 
1469 /**
1470  * amdgpu_device_check_arguments - validate module params
1471  *
1472  * @adev: amdgpu_device pointer
1473  *
1474  * Validates certain module parameters and updates
1475  * the associated values used by the driver (all asics).
1476  */
1477 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1478 {
1479 	if (amdgpu_sched_jobs < 4) {
1480 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1481 			 amdgpu_sched_jobs);
1482 		amdgpu_sched_jobs = 4;
1483 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1484 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1485 			 amdgpu_sched_jobs);
1486 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1487 	}
1488 
1489 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1490 		/* gart size must be greater than or equal to 32M */
1491 		dev_warn(adev->dev, "gart size (%d) too small\n",
1492 			 amdgpu_gart_size);
1493 		amdgpu_gart_size = -1;
1494 	}
1495 
1496 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1497 		/* gtt size must be greater than or equal to 32M */
1498 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1499 				 amdgpu_gtt_size);
1500 		amdgpu_gtt_size = -1;
1501 	}
1502 
1503 	/* valid range is between 4 and 9 inclusive */
1504 	if (amdgpu_vm_fragment_size != -1 &&
1505 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1506 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1507 		amdgpu_vm_fragment_size = -1;
1508 	}
1509 
1510 	if (amdgpu_sched_hw_submission < 2) {
1511 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1512 			 amdgpu_sched_hw_submission);
1513 		amdgpu_sched_hw_submission = 2;
1514 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1515 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1516 			 amdgpu_sched_hw_submission);
1517 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1518 	}
1519 
1520 	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1521 		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1522 		amdgpu_reset_method = -1;
1523 	}
1524 
1525 	amdgpu_device_check_smu_prv_buffer_size(adev);
1526 
1527 	amdgpu_device_check_vm_size(adev);
1528 
1529 	amdgpu_device_check_block_size(adev);
1530 
1531 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1532 
1533 	return 0;
1534 }
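
/*
 * Illustrative note (not part of the original file): out-of-range module
 * parameters are clamped rather than rejected, e.g. booting with
 * "amdgpu.sched_jobs=5" warns and rounds the value up to 8, and
 * "amdgpu.vm_size=0" warns and falls back to the default of -1.
 */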
1535 
1536 #ifdef __linux__
1537 /**
1538  * amdgpu_switcheroo_set_state - set switcheroo state
1539  *
1540  * @pdev: pci dev pointer
1541  * @state: vga_switcheroo state
1542  *
1543  * Callback for the switcheroo driver.  Suspends or resumes
1544  * the asics before or after it is powered up using ACPI methods.
1545  */
1546 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1547 					enum vga_switcheroo_state state)
1548 {
1549 	struct drm_device *dev = pci_get_drvdata(pdev);
1550 	int r;
1551 
1552 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1553 		return;
1554 
1555 	if (state == VGA_SWITCHEROO_ON) {
1556 		pr_info("switched on\n");
1557 		/* don't suspend or resume card normally */
1558 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1559 
1560 		pci_set_power_state(pdev, PCI_D0);
1561 		amdgpu_device_load_pci_state(pdev);
1562 		r = pci_enable_device(pdev);
1563 		if (r)
1564 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1565 		amdgpu_device_resume(dev, true);
1566 
1567 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1568 	} else {
1569 		pr_info("switched off\n");
1570 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1571 		amdgpu_device_prepare(dev);
1572 		amdgpu_device_suspend(dev, true);
1573 		amdgpu_device_cache_pci_state(pdev);
1574 		/* Shut down the device */
1575 		pci_disable_device(pdev);
1576 		pci_set_power_state(pdev, PCI_D3cold);
1577 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1578 	}
1579 }
1580 
1581 /**
1582  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1583  *
1584  * @pdev: pci dev pointer
1585  *
1586  * Callback for the switcheroo driver.  Check if the switcheroo
1587  * state can be changed.
1588  * Returns true if the state can be changed, false if not.
1589  */
1590 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1591 {
1592 	struct drm_device *dev = pci_get_drvdata(pdev);
1593 
1594        /*
1595 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1596 	* locking inversion with the driver load path. And the access here is
1597 	* completely racy anyway. So don't bother with locking for now.
1598 	*/
1599 	return atomic_read(&dev->open_count) == 0;
1600 }
1601 #endif /* __linux__ */
1602 
1603 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1604 #ifdef notyet
1605 	.set_gpu_state = amdgpu_switcheroo_set_state,
1606 	.reprobe = NULL,
1607 	.can_switch = amdgpu_switcheroo_can_switch,
1608 #endif
1609 };
1610 
1611 /**
1612  * amdgpu_device_ip_set_clockgating_state - set the CG state
1613  *
1614  * @dev: amdgpu_device pointer
1615  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1616  * @state: clockgating state (gate or ungate)
1617  *
1618  * Sets the requested clockgating state for all instances of
1619  * the hardware IP specified.
1620  * Returns the error code from the last instance.
1621  */
1622 int amdgpu_device_ip_set_clockgating_state(void *dev,
1623 					   enum amd_ip_block_type block_type,
1624 					   enum amd_clockgating_state state)
1625 {
1626 	struct amdgpu_device *adev = dev;
1627 	int i, r = 0;
1628 
1629 	for (i = 0; i < adev->num_ip_blocks; i++) {
1630 		if (!adev->ip_blocks[i].status.valid)
1631 			continue;
1632 		if (adev->ip_blocks[i].version->type != block_type)
1633 			continue;
1634 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1635 			continue;
1636 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1637 			(void *)adev, state);
1638 		if (r)
1639 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1640 				  adev->ip_blocks[i].version->funcs->name, r);
1641 	}
1642 	return r;
1643 }
1644 
1645 /**
1646  * amdgpu_device_ip_set_powergating_state - set the PG state
1647  *
1648  * @dev: amdgpu_device pointer
1649  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1650  * @state: powergating state (gate or ungate)
1651  *
1652  * Sets the requested powergating state for all instances of
1653  * the hardware IP specified.
1654  * Returns the error code from the last instance.
1655  */
1656 int amdgpu_device_ip_set_powergating_state(void *dev,
1657 					   enum amd_ip_block_type block_type,
1658 					   enum amd_powergating_state state)
1659 {
1660 	struct amdgpu_device *adev = dev;
1661 	int i, r = 0;
1662 
1663 	for (i = 0; i < adev->num_ip_blocks; i++) {
1664 		if (!adev->ip_blocks[i].status.valid)
1665 			continue;
1666 		if (adev->ip_blocks[i].version->type != block_type)
1667 			continue;
1668 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1669 			continue;
1670 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1671 			(void *)adev, state);
1672 		if (r)
1673 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1674 				  adev->ip_blocks[i].version->funcs->name, r);
1675 	}
1676 	return r;
1677 }
1678 
1679 /**
1680  * amdgpu_device_ip_get_clockgating_state - get the CG state
1681  *
1682  * @adev: amdgpu_device pointer
1683  * @flags: clockgating feature flags
1684  *
1685  * Walks the list of IPs on the device and updates the clockgating
1686  * flags for each IP.
1687  * Updates @flags with the feature flags for each hardware IP where
1688  * clockgating is enabled.
1689  */
1690 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1691 					    u64 *flags)
1692 {
1693 	int i;
1694 
1695 	for (i = 0; i < adev->num_ip_blocks; i++) {
1696 		if (!adev->ip_blocks[i].status.valid)
1697 			continue;
1698 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1699 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1700 	}
1701 }
1702 
1703 /**
1704  * amdgpu_device_ip_wait_for_idle - wait for idle
1705  *
1706  * @adev: amdgpu_device pointer
1707  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1708  *
1709  * Waits for the requested hardware IP to be idle.
1710  * Returns 0 for success or a negative error code on failure.
1711  */
1712 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1713 				   enum amd_ip_block_type block_type)
1714 {
1715 	int i, r;
1716 
1717 	for (i = 0; i < adev->num_ip_blocks; i++) {
1718 		if (!adev->ip_blocks[i].status.valid)
1719 			continue;
1720 		if (adev->ip_blocks[i].version->type == block_type) {
1721 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1722 			if (r)
1723 				return r;
1724 			break;
1725 		}
1726 	}
1727 	return 0;
1728 
1729 }
1730 
1731 /**
1732  * amdgpu_device_ip_is_idle - is the hardware IP idle
1733  *
1734  * @adev: amdgpu_device pointer
1735  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1736  *
1737  * Check if the hardware IP is idle or not.
1738  * Returns true if the IP is idle, false if not.
1739  */
1740 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1741 			      enum amd_ip_block_type block_type)
1742 {
1743 	int i;
1744 
1745 	for (i = 0; i < adev->num_ip_blocks; i++) {
1746 		if (!adev->ip_blocks[i].status.valid)
1747 			continue;
1748 		if (adev->ip_blocks[i].version->type == block_type)
1749 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1750 	}
1751 	return true;
1752 
1753 }
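
/*
 * A minimal caller sketch (not part of this file), assuming an initialized
 * adev: if the GMC block reports busy, wait for it to drain before
 * proceeding.  The block type and error handling are illustrative only.
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GMC)) {
 *		r = amdgpu_device_ip_wait_for_idle(adev,
 *						   AMD_IP_BLOCK_TYPE_GMC);
 *		if (r)
 *			DRM_ERROR("GMC did not reach idle: %d\n", r);
 *	}
 */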
1754 
1755 /**
1756  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1757  *
1758  * @adev: amdgpu_device pointer
1759  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760  *
1761  * Returns a pointer to the hardware IP block structure
1762  * if it exists for the asic, otherwise NULL.
1763  */
1764 struct amdgpu_ip_block *
1765 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1766 			      enum amd_ip_block_type type)
1767 {
1768 	int i;
1769 
1770 	for (i = 0; i < adev->num_ip_blocks; i++)
1771 		if (adev->ip_blocks[i].version->type == type)
1772 			return &adev->ip_blocks[i];
1773 
1774 	return NULL;
1775 }
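
/*
 * A minimal lookup sketch (not part of this file), assuming an initialized
 * adev with its IP blocks already added: fetch the GFX block and report
 * its version.
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip)
 *		DRM_INFO("GFX IP v%d.%d\n", ip->version->major,
 *			 ip->version->minor);
 */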
1776 
1777 /**
1778  * amdgpu_device_ip_block_version_cmp
1779  *
1780  * @adev: amdgpu_device pointer
1781  * @type: enum amd_ip_block_type
1782  * @major: major version
1783  * @minor: minor version
1784  *
1785  * Returns 0 if the installed version is equal to or greater than the
1786  * requested one, 1 if it is smaller or the ip_block doesn't exist.
1787  */
1788 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1789 				       enum amd_ip_block_type type,
1790 				       u32 major, u32 minor)
1791 {
1792 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1793 
1794 	if (ip_block && ((ip_block->version->major > major) ||
1795 			((ip_block->version->major == major) &&
1796 			(ip_block->version->minor >= minor))))
1797 		return 0;
1798 
1799 	return 1;
1800 }
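
/*
 * Note the inverted return convention: 0 means the requirement is met.
 * A hypothetical check (the block type and version numbers are
 * illustrative only):
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *						7, 0))
 *		DRM_DEBUG("SMC 7.0 or newer is present\n");
 */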
1801 
1802 /**
1803  * amdgpu_device_ip_block_add
1804  *
1805  * @adev: amdgpu_device pointer
1806  * @ip_block_version: pointer to the IP to add
1807  *
1808  * Adds the IP block driver information to the collection of IPs
1809  * on the asic.
1810  */
1811 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1812 			       const struct amdgpu_ip_block_version *ip_block_version)
1813 {
1814 	if (!ip_block_version)
1815 		return -EINVAL;
1816 
1817 	switch (ip_block_version->type) {
1818 	case AMD_IP_BLOCK_TYPE_VCN:
1819 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1820 			return 0;
1821 		break;
1822 	case AMD_IP_BLOCK_TYPE_JPEG:
1823 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1824 			return 0;
1825 		break;
1826 	default:
1827 		break;
1828 	}
1829 
1830 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1831 		  ip_block_version->funcs->name);
1832 
1833 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1834 
1835 	return 0;
1836 }
1837 
1838 /**
1839  * amdgpu_device_enable_virtual_display - enable virtual display feature
1840  *
1841  * @adev: amdgpu_device pointer
1842  *
1843  * Enables the virtual display feature if the user has enabled it via
1844  * the module parameter virtual_display.  This feature provides virtual
1845  * display hardware on headless boards or in virtualized environments.
1846  * This function parses and validates the configuration string specified by
1847  * the user and configures the virtual display configuration (number of
1848  * virtual connectors, crtcs, etc.) specified.
1849  */
1850 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1851 {
1852 	adev->enable_virtual_display = false;
1853 
1854 #ifdef notyet
1855 	if (amdgpu_virtual_display) {
1856 		const char *pci_address_name = pci_name(adev->pdev);
1857 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1858 
1859 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1860 		pciaddstr_tmp = pciaddstr;
1861 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1862 			pciaddname = strsep(&pciaddname_tmp, ",");
1863 			if (!strcmp("all", pciaddname)
1864 			    || !strcmp(pci_address_name, pciaddname)) {
1865 				long num_crtc;
1866 				int res = -1;
1867 
1868 				adev->enable_virtual_display = true;
1869 
1870 				if (pciaddname_tmp)
1871 					res = kstrtol(pciaddname_tmp, 10,
1872 						      &num_crtc);
1873 
1874 				if (!res) {
1875 					if (num_crtc < 1)
1876 						num_crtc = 1;
1877 					if (num_crtc > 6)
1878 						num_crtc = 6;
1879 					adev->mode_info.num_crtc = num_crtc;
1880 				} else {
1881 					adev->mode_info.num_crtc = 1;
1882 				}
1883 				break;
1884 			}
1885 		}
1886 
1887 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1888 			 amdgpu_virtual_display, pci_address_name,
1889 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1890 
1891 		kfree(pciaddstr);
1892 	}
1893 #endif
1894 }
1895 
1896 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1897 {
1898 	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1899 		adev->mode_info.num_crtc = 1;
1900 		adev->enable_virtual_display = true;
1901 		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1902 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1903 	}
1904 }
1905 
1906 /**
1907  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1908  *
1909  * @adev: amdgpu_device pointer
1910  *
1911  * Parses the asic configuration parameters specified in the gpu info
1912  * firmware and makes them available to the driver for use in configuring
1913  * the asic.
1914  * Returns 0 on success, -EINVAL on failure.
1915  */
1916 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1917 {
1918 	const char *chip_name;
1919 	char fw_name[40];
1920 	int err;
1921 	const struct gpu_info_firmware_header_v1_0 *hdr;
1922 
1923 	adev->firmware.gpu_info_fw = NULL;
1924 
1925 	if (adev->mman.discovery_bin)
1926 		return 0;
1927 
1928 	switch (adev->asic_type) {
1929 	default:
1930 		return 0;
1931 	case CHIP_VEGA10:
1932 		chip_name = "vega10";
1933 		break;
1934 	case CHIP_VEGA12:
1935 		chip_name = "vega12";
1936 		break;
1937 	case CHIP_RAVEN:
1938 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1939 			chip_name = "raven2";
1940 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1941 			chip_name = "picasso";
1942 		else
1943 			chip_name = "raven";
1944 		break;
1945 	case CHIP_ARCTURUS:
1946 		chip_name = "arcturus";
1947 		break;
1948 	case CHIP_NAVI12:
1949 		chip_name = "navi12";
1950 		break;
1951 	}
1952 
1953 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1954 	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1955 	if (err) {
1956 		dev_err(adev->dev,
1957 			"Failed to get gpu_info firmware \"%s\"\n",
1958 			fw_name);
1959 		goto out;
1960 	}
1961 
1962 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1963 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1964 
1965 	switch (hdr->version_major) {
1966 	case 1:
1967 	{
1968 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1969 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1970 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1971 
1972 		/*
1973 		 * Should be dropped when DAL no longer needs it.
1974 		 */
1975 		if (adev->asic_type == CHIP_NAVI12)
1976 			goto parse_soc_bounding_box;
1977 
1978 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1979 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1980 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1981 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1982 		adev->gfx.config.max_texture_channel_caches =
1983 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1984 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1985 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1986 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1987 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1988 		adev->gfx.config.double_offchip_lds_buf =
1989 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1990 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1991 		adev->gfx.cu_info.max_waves_per_simd =
1992 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1993 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1994 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1995 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1996 		if (hdr->version_minor >= 1) {
1997 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1998 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1999 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2000 			adev->gfx.config.num_sc_per_sh =
2001 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2002 			adev->gfx.config.num_packer_per_sc =
2003 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2004 		}
2005 
2006 parse_soc_bounding_box:
2007 		/*
2008 		 * soc bounding box info is not integrated in the discovery table,
2009 		 * we always need to parse it from gpu info firmware if needed.
2010 		 */
2011 		if (hdr->version_minor == 2) {
2012 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2013 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2014 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2015 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2016 		}
2017 		break;
2018 	}
2019 	default:
2020 		dev_err(adev->dev,
2021 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2022 		err = -EINVAL;
2023 		goto out;
2024 	}
2025 out:
2026 	return err;
2027 }
2028 
2029 /**
2030  * amdgpu_device_ip_early_init - run early init for hardware IPs
2031  *
2032  * @adev: amdgpu_device pointer
2033  *
2034  * Early initialization pass for hardware IPs.  The hardware IPs that make
2035  * up each asic are discovered and each IP's early_init callback is run.  This
2036  * is the first stage in initializing the asic.
2037  * Returns 0 on success, negative error code on failure.
2038  */
2039 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2040 {
2041 	struct pci_dev *parent;
2042 	int i, r;
2043 	bool total;
2044 
2045 	amdgpu_device_enable_virtual_display(adev);
2046 
2047 	if (amdgpu_sriov_vf(adev)) {
2048 		r = amdgpu_virt_request_full_gpu(adev, true);
2049 		if (r)
2050 			return r;
2051 	}
2052 
2053 	switch (adev->asic_type) {
2054 #ifdef CONFIG_DRM_AMDGPU_SI
2055 	case CHIP_VERDE:
2056 	case CHIP_TAHITI:
2057 	case CHIP_PITCAIRN:
2058 	case CHIP_OLAND:
2059 	case CHIP_HAINAN:
2060 		adev->family = AMDGPU_FAMILY_SI;
2061 		r = si_set_ip_blocks(adev);
2062 		if (r)
2063 			return r;
2064 		break;
2065 #endif
2066 #ifdef CONFIG_DRM_AMDGPU_CIK
2067 	case CHIP_BONAIRE:
2068 	case CHIP_HAWAII:
2069 	case CHIP_KAVERI:
2070 	case CHIP_KABINI:
2071 	case CHIP_MULLINS:
2072 		if (adev->flags & AMD_IS_APU)
2073 			adev->family = AMDGPU_FAMILY_KV;
2074 		else
2075 			adev->family = AMDGPU_FAMILY_CI;
2076 
2077 		r = cik_set_ip_blocks(adev);
2078 		if (r)
2079 			return r;
2080 		break;
2081 #endif
2082 	case CHIP_TOPAZ:
2083 	case CHIP_TONGA:
2084 	case CHIP_FIJI:
2085 	case CHIP_POLARIS10:
2086 	case CHIP_POLARIS11:
2087 	case CHIP_POLARIS12:
2088 	case CHIP_VEGAM:
2089 	case CHIP_CARRIZO:
2090 	case CHIP_STONEY:
2091 		if (adev->flags & AMD_IS_APU)
2092 			adev->family = AMDGPU_FAMILY_CZ;
2093 		else
2094 			adev->family = AMDGPU_FAMILY_VI;
2095 
2096 		r = vi_set_ip_blocks(adev);
2097 		if (r)
2098 			return r;
2099 		break;
2100 	default:
2101 		r = amdgpu_discovery_set_ip_blocks(adev);
2102 		if (r)
2103 			return r;
2104 		break;
2105 	}
2106 
2107 	if (amdgpu_has_atpx() &&
2108 	    (amdgpu_is_atpx_hybrid() ||
2109 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2110 	    ((adev->flags & AMD_IS_APU) == 0) &&
2111 	    !dev_is_removable(&adev->pdev->dev))
2112 		adev->flags |= AMD_IS_PX;
2113 
2114 	if (!(adev->flags & AMD_IS_APU)) {
2115 #ifdef notyet
2116 		parent = pcie_find_root_port(adev->pdev);
2117 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2118 #else
2119 		adev->has_pr3 = false;
2120 #endif
2121 	}
2122 
2123 
2124 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2125 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2126 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2127 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2128 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2129 	if (!amdgpu_device_pcie_dynamic_switching_supported())
2130 		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2131 
2132 	total = true;
2133 	for (i = 0; i < adev->num_ip_blocks; i++) {
2134 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2135 			DRM_WARN("disabled ip block: %d <%s>\n",
2136 				  i, adev->ip_blocks[i].version->funcs->name);
2137 			adev->ip_blocks[i].status.valid = false;
2138 		} else {
2139 			if (adev->ip_blocks[i].version->funcs->early_init) {
2140 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2141 				if (r == -ENOENT) {
2142 					adev->ip_blocks[i].status.valid = false;
2143 				} else if (r) {
2144 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2145 						  adev->ip_blocks[i].version->funcs->name, r);
2146 					total = false;
2147 				} else {
2148 					adev->ip_blocks[i].status.valid = true;
2149 				}
2150 			} else {
2151 				adev->ip_blocks[i].status.valid = true;
2152 			}
2153 		}
2154 		/* get the vbios after the asic_funcs are set up */
2155 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2156 			r = amdgpu_device_parse_gpu_info_fw(adev);
2157 			if (r)
2158 				return r;
2159 
2160 			/* Read BIOS */
2161 			if (amdgpu_device_read_bios(adev)) {
2162 				if (!amdgpu_get_bios(adev))
2163 					return -EINVAL;
2164 
2165 				r = amdgpu_atombios_init(adev);
2166 				if (r) {
2167 					dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2168 					amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2169 					return r;
2170 				}
2171 			}
2172 
2173 			/* get pf2vf msg info at its earliest time */
2174 			if (amdgpu_sriov_vf(adev))
2175 				amdgpu_virt_init_data_exchange(adev);
2176 
2177 		}
2178 	}
2179 	if (!total)
2180 		return -ENODEV;
2181 
2182 	amdgpu_amdkfd_device_probe(adev);
2183 	adev->cg_flags &= amdgpu_cg_mask;
2184 	adev->pg_flags &= amdgpu_pg_mask;
2185 
2186 	return 0;
2187 }
2188 
2189 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2190 {
2191 	int i, r;
2192 
2193 	for (i = 0; i < adev->num_ip_blocks; i++) {
2194 		if (!adev->ip_blocks[i].status.sw)
2195 			continue;
2196 		if (adev->ip_blocks[i].status.hw)
2197 			continue;
2198 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2199 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2200 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2201 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2202 			if (r) {
2203 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2204 					  adev->ip_blocks[i].version->funcs->name, r);
2205 				return r;
2206 			}
2207 			adev->ip_blocks[i].status.hw = true;
2208 		}
2209 	}
2210 
2211 	return 0;
2212 }
2213 
2214 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2215 {
2216 	int i, r;
2217 
2218 	for (i = 0; i < adev->num_ip_blocks; i++) {
2219 		if (!adev->ip_blocks[i].status.sw)
2220 			continue;
2221 		if (adev->ip_blocks[i].status.hw)
2222 			continue;
2223 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2224 		if (r) {
2225 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2226 				  adev->ip_blocks[i].version->funcs->name, r);
2227 			return r;
2228 		}
2229 		adev->ip_blocks[i].status.hw = true;
2230 	}
2231 
2232 	return 0;
2233 }
2234 
2235 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2236 {
2237 	int r = 0;
2238 	int i;
2239 	uint32_t smu_version;
2240 
2241 	if (adev->asic_type >= CHIP_VEGA10) {
2242 		for (i = 0; i < adev->num_ip_blocks; i++) {
2243 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2244 				continue;
2245 
2246 			if (!adev->ip_blocks[i].status.sw)
2247 				continue;
2248 
2249 			/* no need to do the fw loading again if already done*/
2250 			if (adev->ip_blocks[i].status.hw == true)
2251 				break;
2252 
2253 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2254 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2255 				if (r) {
2256 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2257 							  adev->ip_blocks[i].version->funcs->name, r);
2258 					return r;
2259 				}
2260 			} else {
2261 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2262 				if (r) {
2263 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2264 							  adev->ip_blocks[i].version->funcs->name, r);
2265 					return r;
2266 				}
2267 			}
2268 
2269 			adev->ip_blocks[i].status.hw = true;
2270 			break;
2271 		}
2272 	}
2273 
2274 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2275 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2276 
2277 	return r;
2278 }
2279 
2280 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2281 {
2282 	long timeout;
2283 	int r, i;
2284 
2285 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2286 		struct amdgpu_ring *ring = adev->rings[i];
2287 
2288 		/* No need to setup the GPU scheduler for rings that don't need it */
2289 		if (!ring || ring->no_scheduler)
2290 			continue;
2291 
2292 		switch (ring->funcs->type) {
2293 		case AMDGPU_RING_TYPE_GFX:
2294 			timeout = adev->gfx_timeout;
2295 			break;
2296 		case AMDGPU_RING_TYPE_COMPUTE:
2297 			timeout = adev->compute_timeout;
2298 			break;
2299 		case AMDGPU_RING_TYPE_SDMA:
2300 			timeout = adev->sdma_timeout;
2301 			break;
2302 		default:
2303 			timeout = adev->video_timeout;
2304 			break;
2305 		}
2306 
2307 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2308 				   ring->num_hw_submission, 0,
2309 				   timeout, adev->reset_domain->wq,
2310 				   ring->sched_score, ring->name,
2311 				   adev->dev);
2312 		if (r) {
2313 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
2314 				  ring->name);
2315 			return r;
2316 		}
2317 	}
2318 
2319 	amdgpu_xcp_update_partition_sched_list(adev);
2320 
2321 	return 0;
2322 }
2323 
2324 
2325 /**
2326  * amdgpu_device_ip_init - run init for hardware IPs
2327  *
2328  * @adev: amdgpu_device pointer
2329  *
2330  * Main initialization pass for hardware IPs.  The list of all the hardware
2331  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2332  * are run.  sw_init initializes the software state associated with each IP
2333  * and hw_init initializes the hardware associated with each IP.
2334  * Returns 0 on success, negative error code on failure.
2335  */
2336 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2337 {
2338 	int i, r;
2339 
2340 	r = amdgpu_ras_init(adev);
2341 	if (r)
2342 		return r;
2343 
2344 	for (i = 0; i < adev->num_ip_blocks; i++) {
2345 		if (!adev->ip_blocks[i].status.valid)
2346 			continue;
2347 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2348 		if (r) {
2349 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2350 				  adev->ip_blocks[i].version->funcs->name, r);
2351 			goto init_failed;
2352 		}
2353 		adev->ip_blocks[i].status.sw = true;
2354 
2355 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2356 			/* need to do common hw init early so everything is set up for gmc */
2357 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2358 			if (r) {
2359 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2360 				goto init_failed;
2361 			}
2362 			adev->ip_blocks[i].status.hw = true;
2363 		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2364 			/* need to do gmc hw init early so we can allocate gpu mem */
2365 			/* Try to reserve bad pages early */
2366 			if (amdgpu_sriov_vf(adev))
2367 				amdgpu_virt_exchange_data(adev);
2368 
2369 			r = amdgpu_device_mem_scratch_init(adev);
2370 			if (r) {
2371 				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2372 				goto init_failed;
2373 			}
2374 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2375 			if (r) {
2376 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2377 				goto init_failed;
2378 			}
2379 			r = amdgpu_device_wb_init(adev);
2380 			if (r) {
2381 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2382 				goto init_failed;
2383 			}
2384 			adev->ip_blocks[i].status.hw = true;
2385 
2386 			/* right after GMC hw init, we create CSA */
2387 			if (adev->gfx.mcbp) {
2388 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2389 							       AMDGPU_GEM_DOMAIN_VRAM |
2390 							       AMDGPU_GEM_DOMAIN_GTT,
2391 							       AMDGPU_CSA_SIZE);
2392 				if (r) {
2393 					DRM_ERROR("allocate CSA failed %d\n", r);
2394 					goto init_failed;
2395 				}
2396 			}
2397 		}
2398 	}
2399 
2400 	if (amdgpu_sriov_vf(adev))
2401 		amdgpu_virt_init_data_exchange(adev);
2402 
2403 	r = amdgpu_ib_pool_init(adev);
2404 	if (r) {
2405 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2406 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2407 		goto init_failed;
2408 	}
2409 
2410 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2411 	if (r)
2412 		goto init_failed;
2413 
2414 	r = amdgpu_device_ip_hw_init_phase1(adev);
2415 	if (r)
2416 		goto init_failed;
2417 
2418 	r = amdgpu_device_fw_loading(adev);
2419 	if (r)
2420 		goto init_failed;
2421 
2422 	r = amdgpu_device_ip_hw_init_phase2(adev);
2423 	if (r)
2424 		goto init_failed;
2425 
2426 	/*
2427 	 * Retired pages will be loaded from eeprom and reserved here;
2428 	 * this should be called after amdgpu_device_ip_hw_init_phase2, since
2429 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2430 	 * functional for I2C communication, which is only true at this point.
2431 	 *
2432 	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2433 	 * about failures caused by a bad gpu situation and stops the amdgpu
2434 	 * init process accordingly.  For other failures, it still releases all
2435 	 * the resources and prints an error message, rather than returning a
2436 	 * negative value to the upper level.
2437 	 *
2438 	 * Note: theoretically, this should be called before all vram allocations
2439 	 * to protect retired pages from abuse.
2440 	 */
2441 	r = amdgpu_ras_recovery_init(adev);
2442 	if (r)
2443 		goto init_failed;
2444 
2445 	/**
2446 	 * In case of XGMI grab extra reference for reset domain for this device
2447 	 */
2448 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2449 		if (amdgpu_xgmi_add_device(adev) == 0) {
2450 			if (!amdgpu_sriov_vf(adev)) {
2451 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2452 
2453 				if (WARN_ON(!hive)) {
2454 					r = -ENOENT;
2455 					goto init_failed;
2456 				}
2457 
2458 				if (!hive->reset_domain ||
2459 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2460 					r = -ENOENT;
2461 					amdgpu_put_xgmi_hive(hive);
2462 					goto init_failed;
2463 				}
2464 
2465 				/* Drop the early temporary reset domain we created for device */
2466 				amdgpu_reset_put_reset_domain(adev->reset_domain);
2467 				adev->reset_domain = hive->reset_domain;
2468 				amdgpu_put_xgmi_hive(hive);
2469 			}
2470 		}
2471 	}
2472 
2473 	r = amdgpu_device_init_schedulers(adev);
2474 	if (r)
2475 		goto init_failed;
2476 
2477 	/* Don't init kfd if whole hive need to be reset during init */
2478 	if (!adev->gmc.xgmi.pending_reset) {
2479 		kgd2kfd_init_zone_device(adev);
2480 		amdgpu_amdkfd_device_init(adev);
2481 	}
2482 
2483 	amdgpu_fru_get_product_info(adev);
2484 
2485 init_failed:
2486 
2487 	return r;
2488 }
2489 
2490 /**
2491  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2492  *
2493  * @adev: amdgpu_device pointer
2494  *
2495  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2496  * this function before a GPU reset.  If the value is retained after a
2497  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2498  */
2499 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2500 {
2501 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2502 }
2503 
2504 /**
2505  * amdgpu_device_check_vram_lost - check if vram is valid
2506  *
2507  * @adev: amdgpu_device pointer
2508  *
2509  * Checks the reset magic value written to the gart pointer in VRAM.
2510  * The driver calls this after a GPU reset to see if the contents of
2511  * VRAM are lost or not.
2512  * returns true if vram is lost, false if not.
2513  */
2514 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2515 {
2516 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2517 			AMDGPU_RESET_MAGIC_NUM))
2518 		return true;
2519 
2520 	if (!amdgpu_in_reset(adev))
2521 		return false;
2522 
2523 	/*
2524 	 * For all ASICs with baco/mode1 reset, the VRAM is
2525 	 * always assumed to be lost.
2526 	 */
2527 	switch (amdgpu_asic_reset_method(adev)) {
2528 	case AMD_RESET_METHOD_BACO:
2529 	case AMD_RESET_METHOD_MODE1:
2530 		return true;
2531 	default:
2532 		return false;
2533 	}
2534 }
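
/*
 * Sketch of how the reset-magic pair above is meant to be used around a
 * reset (simplified; the real flow lives in the reset handlers elsewhere
 * in this file):
 *
 *	amdgpu_device_fill_reset_magic(adev);		// before the reset
 *	...						// perform ASIC reset
 *	if (amdgpu_device_check_vram_lost(adev))
 *		...					// re-init VRAM contents
 */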
2535 
2536 /**
2537  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2538  *
2539  * @adev: amdgpu_device pointer
2540  * @state: clockgating state (gate or ungate)
2541  *
2542  * The list of all the hardware IPs that make up the asic is walked and the
2543  * set_clockgating_state callbacks are run.
2544  * During late init this enables clockgating for hardware IPs; during
2545  * fini or suspend it disables clockgating for hardware IPs.
2546  * Returns 0 on success, negative error code on failure.
2547  */
2548 
2549 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2550 			       enum amd_clockgating_state state)
2551 {
2552 	int i, j, r;
2553 
2554 	if (amdgpu_emu_mode == 1)
2555 		return 0;
2556 
2557 	for (j = 0; j < adev->num_ip_blocks; j++) {
2558 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2559 		if (!adev->ip_blocks[i].status.late_initialized)
2560 			continue;
2561 		/* skip CG for GFX, SDMA on S0ix */
2562 		if (adev->in_s0ix &&
2563 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2564 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2565 			continue;
2566 		/* skip CG for VCE/UVD, it's handled specially */
2567 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2568 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2569 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2570 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2571 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2572 			/* enable clockgating to save power */
2573 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2574 										     state);
2575 			if (r) {
2576 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2577 					  adev->ip_blocks[i].version->funcs->name, r);
2578 				return r;
2579 			}
2580 		}
2581 	}
2582 
2583 	return 0;
2584 }
2585 
2586 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2587 			       enum amd_powergating_state state)
2588 {
2589 	int i, j, r;
2590 
2591 	if (amdgpu_emu_mode == 1)
2592 		return 0;
2593 
2594 	for (j = 0; j < adev->num_ip_blocks; j++) {
2595 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2596 		if (!adev->ip_blocks[i].status.late_initialized)
2597 			continue;
2598 		/* skip PG for GFX, SDMA on S0ix */
2599 		if (adev->in_s0ix &&
2600 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2601 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2602 			continue;
2603 		/* skip CG for VCE/UVD, it's handled specially */
2604 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2605 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2606 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2607 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2608 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2609 			/* enable powergating to save power */
2610 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2611 											state);
2612 			if (r) {
2613 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2614 					  adev->ip_blocks[i].version->funcs->name, r);
2615 				return r;
2616 			}
2617 		}
2618 	}
2619 	return 0;
2620 }
2621 
2622 static int amdgpu_device_enable_mgpu_fan_boost(void)
2623 {
2624 	struct amdgpu_gpu_instance *gpu_ins;
2625 	struct amdgpu_device *adev;
2626 	int i, ret = 0;
2627 
2628 	mutex_lock(&mgpu_info.mutex);
2629 
2630 	/*
2631 	 * MGPU fan boost feature should be enabled
2632 	 * only when there are two or more dGPUs in
2633 	 * the system
2634 	 */
2635 	if (mgpu_info.num_dgpu < 2)
2636 		goto out;
2637 
2638 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2639 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2640 		adev = gpu_ins->adev;
2641 		if (!(adev->flags & AMD_IS_APU) &&
2642 		    !gpu_ins->mgpu_fan_enabled) {
2643 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2644 			if (ret)
2645 				break;
2646 
2647 			gpu_ins->mgpu_fan_enabled = 1;
2648 		}
2649 	}
2650 
2651 out:
2652 	mutex_unlock(&mgpu_info.mutex);
2653 
2654 	return ret;
2655 }
2656 
2657 /**
2658  * amdgpu_device_ip_late_init - run late init for hardware IPs
2659  *
2660  * @adev: amdgpu_device pointer
2661  *
2662  * Late initialization pass for hardware IPs.  The list of all the hardware
2663  * IPs that make up the asic is walked and the late_init callbacks are run.
2664  * late_init covers any special initialization that an IP requires
2665  * after all of them have been initialized or something that needs to happen
2666  * late in the init process.
2667  * Returns 0 on success, negative error code on failure.
2668  */
2669 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2670 {
2671 	struct amdgpu_gpu_instance *gpu_instance;
2672 	int i = 0, r;
2673 
2674 	for (i = 0; i < adev->num_ip_blocks; i++) {
2675 		if (!adev->ip_blocks[i].status.hw)
2676 			continue;
2677 		if (adev->ip_blocks[i].version->funcs->late_init) {
2678 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2679 			if (r) {
2680 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2681 					  adev->ip_blocks[i].version->funcs->name, r);
2682 				return r;
2683 			}
2684 		}
2685 		adev->ip_blocks[i].status.late_initialized = true;
2686 	}
2687 
2688 	r = amdgpu_ras_late_init(adev);
2689 	if (r) {
2690 		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2691 		return r;
2692 	}
2693 
2694 	amdgpu_ras_set_error_query_ready(adev, true);
2695 
2696 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2697 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2698 
2699 	amdgpu_device_fill_reset_magic(adev);
2700 
2701 	r = amdgpu_device_enable_mgpu_fan_boost();
2702 	if (r)
2703 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2704 
2705 	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2706 	if (amdgpu_passthrough(adev) &&
2707 	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2708 	     adev->asic_type == CHIP_ALDEBARAN))
2709 		amdgpu_dpm_handle_passthrough_sbr(adev, true);
2710 
2711 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2712 		mutex_lock(&mgpu_info.mutex);
2713 
2714 		/*
2715 		 * Reset device p-state to low as this was booted with high.
2716 		 *
2717 		 * This should be performed only after all devices from the same
2718 		 * hive get initialized.
2719 		 *
2720 		 * However, the number of devices in the hive is not known in
2721 		 * advance; it is counted one by one as devices initialize.
2722 		 *
2723 		 * So, we wait for all XGMI interlinked devices initialized.
2724 		 * This may bring some delays as those devices may come from
2725 		 * different hives. But that should be OK.
2726 		 */
2727 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2728 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2729 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2730 				if (gpu_instance->adev->flags & AMD_IS_APU)
2731 					continue;
2732 
2733 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2734 						AMDGPU_XGMI_PSTATE_MIN);
2735 				if (r) {
2736 					DRM_ERROR("pstate setting failed (%d).\n", r);
2737 					break;
2738 				}
2739 			}
2740 		}
2741 
2742 		mutex_unlock(&mgpu_info.mutex);
2743 	}
2744 
2745 	return 0;
2746 }
2747 
2748 /**
2749  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2750  *
2751  * @adev: amdgpu_device pointer
2752  *
2753  * For ASICs that need to disable SMC first
2754  */
2755 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2756 {
2757 	int i, r;
2758 
2759 	if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2760 		return;
2761 
2762 	for (i = 0; i < adev->num_ip_blocks; i++) {
2763 		if (!adev->ip_blocks[i].status.hw)
2764 			continue;
2765 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2766 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2767 			/* XXX handle errors */
2768 			if (r) {
2769 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2770 					  adev->ip_blocks[i].version->funcs->name, r);
2771 			}
2772 			adev->ip_blocks[i].status.hw = false;
2773 			break;
2774 		}
2775 	}
2776 }
2777 
2778 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2779 {
2780 	int i, r;
2781 
2782 	for (i = 0; i < adev->num_ip_blocks; i++) {
2783 		if (!adev->ip_blocks[i].version->funcs->early_fini)
2784 			continue;
2785 
2786 		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2787 		if (r) {
2788 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2789 				  adev->ip_blocks[i].version->funcs->name, r);
2790 		}
2791 	}
2792 
2793 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2794 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2795 
2796 	amdgpu_amdkfd_suspend(adev, false);
2797 
2798 	/* Workaround for ASICs that need to disable SMC first */
2799 	amdgpu_device_smu_fini_early(adev);
2800 
2801 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2802 		if (!adev->ip_blocks[i].status.hw)
2803 			continue;
2804 
2805 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2806 		/* XXX handle errors */
2807 		if (r) {
2808 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2809 				  adev->ip_blocks[i].version->funcs->name, r);
2810 		}
2811 
2812 		adev->ip_blocks[i].status.hw = false;
2813 	}
2814 
2815 	if (amdgpu_sriov_vf(adev)) {
2816 		if (amdgpu_virt_release_full_gpu(adev, false))
2817 			DRM_ERROR("failed to release exclusive mode on fini\n");
2818 	}
2819 
2820 	return 0;
2821 }
2822 
2823 /**
2824  * amdgpu_device_ip_fini - run fini for hardware IPs
2825  *
2826  * @adev: amdgpu_device pointer
2827  *
2828  * Main teardown pass for hardware IPs.  The list of all the hardware
2829  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2830  * are run.  hw_fini tears down the hardware associated with each IP
2831  * and sw_fini tears down any software state associated with each IP.
2832  * Returns 0 on success, negative error code on failure.
2833  */
2834 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2835 {
2836 	int i, r;
2837 
2838 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2839 		amdgpu_virt_release_ras_err_handler_data(adev);
2840 
2841 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2842 		amdgpu_xgmi_remove_device(adev);
2843 
2844 	amdgpu_amdkfd_device_fini_sw(adev);
2845 
2846 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2847 		if (!adev->ip_blocks[i].status.sw)
2848 			continue;
2849 
2850 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2851 			amdgpu_ucode_free_bo(adev);
2852 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2853 			amdgpu_device_wb_fini(adev);
2854 			amdgpu_device_mem_scratch_fini(adev);
2855 			amdgpu_ib_pool_fini(adev);
2856 		}
2857 
2858 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2859 		/* XXX handle errors */
2860 		if (r) {
2861 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2862 				  adev->ip_blocks[i].version->funcs->name, r);
2863 		}
2864 		adev->ip_blocks[i].status.sw = false;
2865 		adev->ip_blocks[i].status.valid = false;
2866 	}
2867 
2868 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2869 		if (!adev->ip_blocks[i].status.late_initialized)
2870 			continue;
2871 		if (adev->ip_blocks[i].version->funcs->late_fini)
2872 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2873 		adev->ip_blocks[i].status.late_initialized = false;
2874 	}
2875 
2876 	amdgpu_ras_fini(adev);
2877 
2878 	return 0;
2879 }
2880 
2881 /**
2882  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2883  *
2884  * @work: work_struct.
2885  */
2886 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2887 {
2888 	struct amdgpu_device *adev =
2889 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2890 	int r;
2891 
2892 	r = amdgpu_ib_ring_tests(adev);
2893 	if (r)
2894 		DRM_ERROR("ib ring test failed (%d).\n", r);
2895 }
2896 
2897 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2898 {
2899 	struct amdgpu_device *adev =
2900 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2901 
2902 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2903 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2904 
2905 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2906 		adev->gfx.gfx_off_state = true;
2907 }
2908 
2909 /**
2910  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2911  *
2912  * @adev: amdgpu_device pointer
2913  *
2914  * Main suspend function for hardware IPs.  The list of all the hardware
2915  * IPs that make up the asic is walked, clockgating is disabled and the
2916  * suspend callbacks are run.  suspend puts the hardware and software state
2917  * in each IP into a state suitable for suspend.
2918  * Returns 0 on success, negative error code on failure.
2919  */
2920 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2921 {
2922 	int i, r;
2923 
2924 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2925 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2926 
2927 	/*
2928 	 * Per the PMFW team's suggestion, the driver needs to handle gfxoff
2929 	 * and df cstate feature disablement for the gpu reset (e.g. Mode1Reset)
2930 	 * scenario. Add the missing df cstate disablement here.
2931 	 */
2932 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2933 		dev_warn(adev->dev, "Failed to disallow df cstate");
2934 
2935 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2936 		if (!adev->ip_blocks[i].status.valid)
2937 			continue;
2938 
2939 		/* displays are handled separately */
2940 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2941 			continue;
2942 
2943 		/* XXX handle errors */
2944 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2945 		/* XXX handle errors */
2946 		if (r) {
2947 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2948 				  adev->ip_blocks[i].version->funcs->name, r);
2949 			return r;
2950 		}
2951 
2952 		adev->ip_blocks[i].status.hw = false;
2953 	}
2954 
2955 	return 0;
2956 }
2957 
2958 /**
2959  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2960  *
2961  * @adev: amdgpu_device pointer
2962  *
2963  * Main suspend function for hardware IPs.  The list of all the hardware
2964  * IPs that make up the asic is walked, clockgating is disabled and the
2965  * suspend callbacks are run.  suspend puts the hardware and software state
2966  * in each IP into a state suitable for suspend.
2967  * Returns 0 on success, negative error code on failure.
2968  */
2969 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2970 {
2971 	int i, r;
2972 
2973 	if (adev->in_s0ix)
2974 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2975 
2976 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2977 		if (!adev->ip_blocks[i].status.valid)
2978 			continue;
2979 		/* displays are handled in phase1 */
2980 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2981 			continue;
2982 		/* PSP lost connection when err_event_athub occurs */
2983 		if (amdgpu_ras_intr_triggered() &&
2984 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2985 			adev->ip_blocks[i].status.hw = false;
2986 			continue;
2987 		}
2988 
2989 		/* skip unnecessary suspend if we have not initialized them yet */
2990 		if (adev->gmc.xgmi.pending_reset &&
2991 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2992 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2993 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2994 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2995 			adev->ip_blocks[i].status.hw = false;
2996 			continue;
2997 		}
2998 
2999 		/* skip suspend of gfx/mes and psp for S0ix
3000 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3001 		 * like at runtime. PSP is also part of the always on hardware
3002 		 * so no need to suspend it.
3003 		 */
3004 		if (adev->in_s0ix &&
3005 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3006 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3007 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3008 			continue;
3009 
3010 		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3011 		if (adev->in_s0ix &&
3012 		    (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3013 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3014 			continue;
3015 
3016 		/* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3017 		 * These live in TMR, so PSP-TOS is expected to reuse them from that
3018 		 * location, and RLC Autoload is also reloaded from there based on the
3019 		 * PMFW -> PSP message during the re-init sequence.
3020 		 * Therefore, psp suspend & resume should be skipped to avoid destroying
3021 		 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3022 		 */
3023 		if (amdgpu_in_reset(adev) &&
3024 		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3025 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3026 			continue;
3027 
3028 		/* XXX handle errors */
3029 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
3030 		/* XXX handle errors */
3031 		if (r) {
3032 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
3033 				  adev->ip_blocks[i].version->funcs->name, r);
3034 		}
3035 		adev->ip_blocks[i].status.hw = false;
3036 		/* handle putting the SMC in the appropriate state */
3037 		if (!amdgpu_sriov_vf(adev)) {
3038 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3039 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3040 				if (r) {
3041 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3042 							adev->mp1_state, r);
3043 					return r;
3044 				}
3045 			}
3046 		}
3047 	}
3048 
3049 	return 0;
3050 }
3051 
3052 /**
3053  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3054  *
3055  * @adev: amdgpu_device pointer
3056  *
3057  * Main suspend function for hardware IPs.  The list of all the hardware
3058  * IPs that make up the asic is walked, clockgating is disabled and the
3059  * suspend callbacks are run.  suspend puts the hardware and software state
3060  * in each IP into a state suitable for suspend.
3061  * Returns 0 on success, negative error code on failure.
3062  */
3063 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3064 {
3065 	int r;
3066 
3067 	if (amdgpu_sriov_vf(adev)) {
3068 		amdgpu_virt_fini_data_exchange(adev);
3069 		amdgpu_virt_request_full_gpu(adev, false);
3070 	}
3071 
3072 	r = amdgpu_device_ip_suspend_phase1(adev);
3073 	if (r)
3074 		return r;
3075 	r = amdgpu_device_ip_suspend_phase2(adev);
3076 
3077 	if (amdgpu_sriov_vf(adev))
3078 		amdgpu_virt_release_full_gpu(adev, false);
3079 
3080 	return r;
3081 }
3082 
3083 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3084 {
3085 	int i, r;
3086 
3087 	static enum amd_ip_block_type ip_order[] = {
3088 		AMD_IP_BLOCK_TYPE_COMMON,
3089 		AMD_IP_BLOCK_TYPE_GMC,
3090 		AMD_IP_BLOCK_TYPE_PSP,
3091 		AMD_IP_BLOCK_TYPE_IH,
3092 	};
3093 
3094 	for (i = 0; i < adev->num_ip_blocks; i++) {
3095 		int j;
3096 		struct amdgpu_ip_block *block;
3097 
3098 		block = &adev->ip_blocks[i];
3099 		block->status.hw = false;
3100 
3101 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3102 
3103 			if (block->version->type != ip_order[j] ||
3104 				!block->status.valid)
3105 				continue;
3106 
3107 			r = block->version->funcs->hw_init(adev);
3108 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3109 			if (r)
3110 				return r;
3111 			block->status.hw = true;
3112 		}
3113 	}
3114 
3115 	return 0;
3116 }
3117 
3118 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3119 {
3120 	int i, r;
3121 
3122 	static enum amd_ip_block_type ip_order[] = {
3123 		AMD_IP_BLOCK_TYPE_SMC,
3124 		AMD_IP_BLOCK_TYPE_DCE,
3125 		AMD_IP_BLOCK_TYPE_GFX,
3126 		AMD_IP_BLOCK_TYPE_SDMA,
3127 		AMD_IP_BLOCK_TYPE_MES,
3128 		AMD_IP_BLOCK_TYPE_UVD,
3129 		AMD_IP_BLOCK_TYPE_VCE,
3130 		AMD_IP_BLOCK_TYPE_VCN,
3131 		AMD_IP_BLOCK_TYPE_JPEG
3132 	};
3133 
3134 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3135 		int j;
3136 		struct amdgpu_ip_block *block;
3137 
3138 		for (j = 0; j < adev->num_ip_blocks; j++) {
3139 			block = &adev->ip_blocks[j];
3140 
3141 			if (block->version->type != ip_order[i] ||
3142 				!block->status.valid ||
3143 				block->status.hw)
3144 				continue;
3145 
3146 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3147 				r = block->version->funcs->resume(adev);
3148 			else
3149 				r = block->version->funcs->hw_init(adev);
3150 
3151 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3152 			if (r)
3153 				return r;
3154 			block->status.hw = true;
3155 		}
3156 	}
3157 
3158 	return 0;
3159 }
3160 
3161 /**
3162  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3163  *
3164  * @adev: amdgpu_device pointer
3165  *
3166  * First resume function for hardware IPs.  The list of all the hardware
3167  * IPs that make up the asic is walked and the resume callbacks are run for
3168  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3169  * after a suspend and updates the software state as necessary.  This
3170  * function is also used for restoring the GPU after a GPU reset.
3171  * Returns 0 on success, negative error code on failure.
3172  */
3173 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3174 {
3175 	int i, r;
3176 
3177 	for (i = 0; i < adev->num_ip_blocks; i++) {
3178 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3179 			continue;
3180 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3181 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3182 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3183 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3184 
3185 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3186 			if (r) {
3187 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3188 					  adev->ip_blocks[i].version->funcs->name, r);
3189 				return r;
3190 			}
3191 			adev->ip_blocks[i].status.hw = true;
3192 		}
3193 	}
3194 
3195 	return 0;
3196 }
3197 
3198 /**
3199  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3200  *
3201  * @adev: amdgpu_device pointer
3202  *
3203  * Second resume function for hardware IPs.  The list of all the hardware
3204  * IPs that make up the asic is walked and the resume callbacks are run for
3205  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3206  * functional state after a suspend and updates the software state as
3207  * necessary.  This function is also used for restoring the GPU after a GPU
3208  * reset.
3209  * Returns 0 on success, negative error code on failure.
3210  */
3211 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3212 {
3213 	int i, r;
3214 
3215 	for (i = 0; i < adev->num_ip_blocks; i++) {
3216 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3217 			continue;
3218 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3219 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3220 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3221 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3222 			continue;
3223 		r = adev->ip_blocks[i].version->funcs->resume(adev);
3224 		if (r) {
3225 			DRM_ERROR("resume of IP block <%s> failed %d\n",
3226 				  adev->ip_blocks[i].version->funcs->name, r);
3227 			return r;
3228 		}
3229 		adev->ip_blocks[i].status.hw = true;
3230 	}
3231 
3232 	return 0;
3233 }
3234 
3235 /**
3236  * amdgpu_device_ip_resume - run resume for hardware IPs
3237  *
3238  * @adev: amdgpu_device pointer
3239  *
3240  * Main resume function for hardware IPs.  The hardware IPs
3241  * are split into two resume functions because they are
3242  * also used in recovering from a GPU reset and some additional
3243  * steps need to be taken between them.  In this case (S3/S4) they are
3244  * run sequentially.
3245  * Returns 0 on success, negative error code on failure.
3246  */
3247 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3248 {
3249 	int r;
3250 
3251 	r = amdgpu_device_ip_resume_phase1(adev);
3252 	if (r)
3253 		return r;
3254 
3255 	r = amdgpu_device_fw_loading(adev);
3256 	if (r)
3257 		return r;
3258 
3259 	r = amdgpu_device_ip_resume_phase2(adev);
3260 
3261 	return r;
3262 }
3263 
3264 /**
3265  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3266  *
3267  * @adev: amdgpu_device pointer
3268  *
3269  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3270  */
3271 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3272 {
3273 	if (amdgpu_sriov_vf(adev)) {
3274 		if (adev->is_atom_fw) {
3275 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3276 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3277 		} else {
3278 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3279 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3280 		}
3281 
3282 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3283 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3284 	}
3285 }
3286 
3287 /**
3288  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3289  *
3290  * @asic_type: AMD asic type
3291  *
3292  * Check if there is DC (new modesetting infrastructure) support for an asic.
3293  * returns true if DC has support, false if not.
3294  */
3295 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3296 {
3297 	switch (asic_type) {
3298 #ifdef CONFIG_DRM_AMDGPU_SI
3299 	case CHIP_HAINAN:
3300 #endif
3301 	case CHIP_TOPAZ:
3302 		/* chips with no display hardware */
3303 		return false;
3304 #if defined(CONFIG_DRM_AMD_DC)
3305 	case CHIP_TAHITI:
3306 	case CHIP_PITCAIRN:
3307 	case CHIP_VERDE:
3308 	case CHIP_OLAND:
3309 		/*
3310 		 * We have systems in the wild with these ASICs that require
3311 		 * LVDS and VGA support which is not supported with DC.
3312 		 *
3313 		 * Fallback to the non-DC driver here by default so as not to
3314 		 * cause regressions.
3315 		 */
3316 #if defined(CONFIG_DRM_AMD_DC_SI)
3317 		return amdgpu_dc > 0;
3318 #else
3319 		return false;
3320 #endif
3321 	case CHIP_BONAIRE:
3322 	case CHIP_KAVERI:
3323 	case CHIP_KABINI:
3324 	case CHIP_MULLINS:
3325 		/*
3326 		 * We have systems in the wild with these ASICs that require
3327 		 * VGA support which is not supported with DC.
3328 		 *
3329 		 * Fallback to the non-DC driver here by default so as not to
3330 		 * cause regressions.
3331 		 */
3332 		return amdgpu_dc > 0;
3333 	default:
3334 		return amdgpu_dc != 0;
3335 #else
3336 	default:
3337 		if (amdgpu_dc > 0)
3338 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3339 		return false;
3340 #endif
3341 	}
3342 }
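
/*
 * A hypothetical probe-time check (the chip choice is illustrative only):
 * decide whether the DC display path is available for a given ASIC.
 *
 *	if (amdgpu_device_asic_has_dc_support(CHIP_NAVI12))
 *		DRM_DEBUG("DC display path available\n");
 */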
3343 
3344 /**
3345  * amdgpu_device_has_dc_support - check if dc is supported
3346  *
3347  * @adev: amdgpu_device pointer
3348  *
3349  * Returns true for supported, false for not supported
3350  */
3351 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3352 {
3353 	if (adev->enable_virtual_display ||
3354 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3355 		return false;
3356 
3357 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3358 }
3359 
3360 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3361 {
3362 	struct amdgpu_device *adev =
3363 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3364 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3365 
3366 	/* It's a bug to not have a hive within this function */
3367 	if (WARN_ON(!hive))
3368 		return;
3369 
3370 	/*
3371 	 * Use task barrier to synchronize all xgmi reset works across the
3372 	 * hive. task_barrier_enter and task_barrier_exit will block
3373 	 * until all the threads running the xgmi reset works reach
3374 	 * those points. task_barrier_full will do both blocks.
3375 	 */
3376 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3377 
3378 		task_barrier_enter(&hive->tb);
3379 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3380 
3381 		if (adev->asic_reset_res)
3382 			goto fail;
3383 
3384 		task_barrier_exit(&hive->tb);
3385 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3386 
3387 		if (adev->asic_reset_res)
3388 			goto fail;
3389 
3390 		if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3391 		    adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3392 			adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3393 	} else {
3394 
3395 		task_barrier_full(&hive->tb);
3396 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3397 	}
3398 
3399 fail:
3400 	if (adev->asic_reset_res)
3401 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3402 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3403 	amdgpu_put_xgmi_hive(hive);
3404 }
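/*
 * Sketch of the ordering the task barrier enforces above (illustrative
 * only): for the BACO method every node in the hive must have entered
 * BACO before any node leaves it,
 *
 *	task_barrier_enter(&hive->tb);   all nodes rendezvous here
 *	amdgpu_device_baco_enter(...);
 *	task_barrier_exit(&hive->tb);    all nodes rendezvous again
 *	amdgpu_device_baco_exit(...);
 *
 * whereas the non-BACO path only needs the single task_barrier_full().
 */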
3405 
3406 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3407 {
3408 	char *input = amdgpu_lockup_timeout;
3409 	char *timeout_setting = NULL;
3410 	int index = 0;
3411 	long timeout;
3412 	int ret = 0;
3413 
3414 	/*
3415 	 * By default the timeout for non-compute jobs is 10000 ms
3416 	 * and 60000 ms for compute jobs.
3417 	 * Under SR-IOV the compute timeout is 60000 ms only in
3418 	 * pp_one_vf mode, otherwise 10000 ms.
3419 	 */
3420 	adev->gfx_timeout = msecs_to_jiffies(10000);
3421 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3422 	if (amdgpu_sriov_vf(adev))
3423 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3424 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3425 	else
3426 		adev->compute_timeout =  msecs_to_jiffies(60000);
3427 
3428 #ifdef notyet
3429 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3430 		while ((timeout_setting = strsep(&input, ",")) &&
3431 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3432 			ret = kstrtol(timeout_setting, 0, &timeout);
3433 			if (ret)
3434 				return ret;
3435 
3436 			if (timeout == 0) {
3437 				index++;
3438 				continue;
3439 			} else if (timeout < 0) {
3440 				timeout = MAX_SCHEDULE_TIMEOUT;
3441 				dev_warn(adev->dev, "lockup timeout disabled");
3442 				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3443 			} else {
3444 				timeout = msecs_to_jiffies(timeout);
3445 			}
3446 
3447 			switch (index++) {
3448 			case 0:
3449 				adev->gfx_timeout = timeout;
3450 				break;
3451 			case 1:
3452 				adev->compute_timeout = timeout;
3453 				break;
3454 			case 2:
3455 				adev->sdma_timeout = timeout;
3456 				break;
3457 			case 3:
3458 				adev->video_timeout = timeout;
3459 				break;
3460 			default:
3461 				break;
3462 			}
3463 		}
3464 		/*
3465 		 * There is only one value specified and
3466 		 * it should apply to all non-compute jobs.
3467 		 */
3468 		if (index == 1) {
3469 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3470 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3471 				adev->compute_timeout = adev->gfx_timeout;
3472 		}
3473 	}
3474 #endif
3475 
3476 	return ret;
3477 }
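/*
 * Example (hypothetical parameter value): with the Linux parsing above
 * enabled, a setting such as lockup_timeout=10000,60000,10000,10000 would
 * assign the gfx, compute, sdma and video timeouts in that order, while a
 * single value like lockup_timeout=5000 applies to all non-compute jobs.
 * Since the block is under "notyet" here, only the defaults are used.
 */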
3478 
3479 /**
3480  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3481  *
3482  * @adev: amdgpu_device pointer
3483  *
3484  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3485  */
3486 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3487 {
3488 #ifdef notyet
3489 	struct iommu_domain *domain;
3490 
3491 	domain = iommu_get_domain_for_dev(adev->dev);
3492 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3493 #endif
3494 		adev->ram_is_direct_mapped = true;
3495 }
3496 
3497 static const struct attribute *amdgpu_dev_attributes[] = {
3498 	&dev_attr_pcie_replay_count.attr,
3499 	NULL
3500 };
3501 
3502 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3503 {
3504 	if (amdgpu_mcbp == 1)
3505 		adev->gfx.mcbp = true;
3506 	else if (amdgpu_mcbp == 0)
3507 		adev->gfx.mcbp = false;
3508 
3509 	if (amdgpu_sriov_vf(adev))
3510 		adev->gfx.mcbp = true;
3511 
3512 	if (adev->gfx.mcbp)
3513 		DRM_INFO("MCBP is enabled\n");
3514 }
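/*
 * Illustrative summary (assuming the usual tri-state convention for the
 * module parameter): amdgpu_mcbp = 1 forces mid-command-buffer preemption
 * on, 0 forces it off, and any other value leaves the default; SR-IOV
 * always turns it on regardless.
 */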
3515 
3516 /**
3517  * amdgpu_device_init - initialize the driver
3518  *
3519  * @adev: amdgpu_device pointer
3520  * @flags: driver flags
3521  *
3522  * Initializes the driver info and hw (all asics).
3523  * Returns 0 for success or an error on failure.
3524  * Called at driver startup.
3525  */
3526 int amdgpu_device_init(struct amdgpu_device *adev,
3527 		       uint32_t flags)
3528 {
3529 	struct drm_device *ddev = adev_to_drm(adev);
3530 	struct pci_dev *pdev = adev->pdev;
3531 	int r, i;
3532 	bool px = false;
3533 	u32 max_MBps;
3534 	int tmp;
3535 
3536 	adev->shutdown = false;
3537 	adev->flags = flags;
3538 
3539 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3540 		adev->asic_type = amdgpu_force_asic_type;
3541 	else
3542 		adev->asic_type = flags & AMD_ASIC_MASK;
3543 
3544 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3545 	if (amdgpu_emu_mode == 1)
3546 		adev->usec_timeout *= 10;
3547 	adev->gmc.gart_size = 512 * 1024 * 1024;
3548 	adev->accel_working = false;
3549 	adev->num_rings = 0;
3550 	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3551 	adev->mman.buffer_funcs = NULL;
3552 	adev->mman.buffer_funcs_ring = NULL;
3553 	adev->vm_manager.vm_pte_funcs = NULL;
3554 	adev->vm_manager.vm_pte_num_scheds = 0;
3555 	adev->gmc.gmc_funcs = NULL;
3556 	adev->harvest_ip_mask = 0x0;
3557 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3558 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3559 
3560 	adev->smc_rreg = &amdgpu_invalid_rreg;
3561 	adev->smc_wreg = &amdgpu_invalid_wreg;
3562 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3563 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3564 	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3565 	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3566 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3567 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3568 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3569 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3570 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3571 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3572 	adev->didt_rreg = &amdgpu_invalid_rreg;
3573 	adev->didt_wreg = &amdgpu_invalid_wreg;
3574 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3575 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3576 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3577 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3578 
3579 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3580 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3581 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3582 
3583 	/* mutex initialization is all done here so we
3584 	 * can recall functions without having locking issues
3585 	 */
3586 	rw_init(&adev->firmware.mutex, "agfw");
3587 	rw_init(&adev->pm.mutex, "agpm");
3588 	rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3589 	rw_init(&adev->srbm_mutex, "srbm");
3590 	rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3591 	rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3592 	rw_init(&adev->gfx.partition_mutex, "gfxpar");
3593 	rw_init(&adev->grbm_idx_mutex, "grbmidx");
3594 	rw_init(&adev->mn_lock, "agpumn");
3595 	rw_init(&adev->virt.vf_errors.lock, "vferr");
3596 	hash_init(adev->mn_hash);
3597 	rw_init(&adev->psp.mutex, "agpsp");
3598 	rw_init(&adev->notifier_lock, "agnf");
3599 	rw_init(&adev->pm.stable_pstate_ctx_lock, "agps");
3600 	rw_init(&adev->benchmark_mutex, "agbm");
3601 
3602 	amdgpu_device_init_apu_flags(adev);
3603 
3604 	r = amdgpu_device_check_arguments(adev);
3605 	if (r)
3606 		return r;
3607 
3608 	mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3609 	mtx_init(&adev->smc_idx_lock, IPL_TTY);
3610 	mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3611 	mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3612 	mtx_init(&adev->didt_idx_lock, IPL_TTY);
3613 	mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3614 	mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3615 	mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3616 	mtx_init(&adev->mm_stats.lock, IPL_NONE);
3617 
3618 	INIT_LIST_HEAD(&adev->shadow_list);
3619 	rw_init(&adev->shadow_list_lock, "sdwlst");
3620 
3621 	INIT_LIST_HEAD(&adev->reset_list);
3622 
3623 	INIT_LIST_HEAD(&adev->ras_list);
3624 
3625 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3626 			  amdgpu_device_delayed_init_work_handler);
3627 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3628 			  amdgpu_device_delay_enable_gfx_off);
3629 
3630 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3631 
3632 	adev->gfx.gfx_off_req_count = 1;
3633 	adev->gfx.gfx_off_residency = 0;
3634 	adev->gfx.gfx_off_entrycount = 0;
3635 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3636 
3637 	atomic_set(&adev->throttling_logging_enabled, 1);
3638 	/*
3639 	 * If throttling continues, logging will be performed every minute
3640 	 * to avoid log flooding. "-1" is subtracted since the thermal
3641 	 * throttling interrupt comes every second. Thus, the total logging
3642 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3643 	 * for throttling interrupt) = 60 seconds.
3644 	 */
3645 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3646 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3647 
3648 #ifdef __linux__
3649 	/* Registers mapping */
3650 	/* TODO: block userspace mapping of io register */
3651 	if (adev->asic_type >= CHIP_BONAIRE) {
3652 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3653 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3654 	} else {
3655 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3656 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3657 	}
3658 #endif
3659 
3660 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3661 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3662 
3663 #ifdef __linux__
3664 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3665 	if (!adev->rmmio)
3666 		return -ENOMEM;
3667 #endif
3668 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3669 	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3670 
3671 	/*
3672 	 * The reset domain needs to be present early, before the XGMI hive is
3673 	 * discovered (if any) and initialized, so the reset sem and in_gpu
3674 	 * reset flag can be used early on during init and before calling RREG32.
3675 	 */
3676 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3677 	if (!adev->reset_domain)
3678 		return -ENOMEM;
3679 
3680 	/* detect hw virtualization here */
3681 	amdgpu_detect_virtualization(adev);
3682 
3683 	amdgpu_device_get_pcie_info(adev);
3684 
3685 	r = amdgpu_device_get_job_timeout_settings(adev);
3686 	if (r) {
3687 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3688 		return r;
3689 	}
3690 
3691 	/* early init functions */
3692 	r = amdgpu_device_ip_early_init(adev);
3693 	if (r)
3694 		return r;
3695 
3696 	amdgpu_device_set_mcbp(adev);
3697 
3698 	/* Get rid of things like offb */
3699 	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3700 	if (r)
3701 		return r;
3702 
3703 	/* Enable TMZ based on IP_VERSION */
3704 	amdgpu_gmc_tmz_set(adev);
3705 
3706 	amdgpu_gmc_noretry_set(adev);
3707 	/* Need to get xgmi info early to decide the reset behavior */
3708 	if (adev->gmc.xgmi.supported) {
3709 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
3710 		if (r)
3711 			return r;
3712 	}
3713 
3714 	/* enable PCIE atomic ops */
3715 #ifdef notyet
3716 	if (amdgpu_sriov_vf(adev)) {
3717 		if (adev->virt.fw_reserve.p_pf2vf)
3718 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3719 						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3720 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3721 	/* APUs with gfx9 onwards don't rely on PCIe atomics; their internal
3722 	 * path natively supports atomics, so set have_atomics_support to true.
3723 	 */
3724 	} else if ((adev->flags & AMD_IS_APU) &&
3725 		   (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3726 		adev->have_atomics_support = true;
3727 	} else {
3728 		adev->have_atomics_support =
3729 			!pci_enable_atomic_ops_to_root(adev->pdev,
3730 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3731 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3732 	}
3733 
3734 	if (!adev->have_atomics_support)
3735 		dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3736 #else
3737 	/* APUs with gfx9 onwards don't rely on PCIe atomics; their internal
3738 	 * path natively supports atomics, so set have_atomics_support to true.
3739 	 */
3740 	if ((adev->flags & AMD_IS_APU) &&
3741 		(adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)))
3742 		adev->have_atomics_support = true;
3743 	else
3744 		adev->have_atomics_support = false;
3745 #endif
3746 
3747 	/* doorbell bar mapping and doorbell index init */
3748 	amdgpu_doorbell_init(adev);
3749 
3750 	if (amdgpu_emu_mode == 1) {
3751 		/* post the asic on emulation mode */
3752 		emu_soc_asic_init(adev);
3753 		goto fence_driver_init;
3754 	}
3755 
3756 	amdgpu_reset_init(adev);
3757 
3758 	/* detect if we are with an SRIOV vbios */
3759 	if (adev->bios)
3760 		amdgpu_device_detect_sriov_bios(adev);
3761 
3762 	/* check if we need to reset the asic
3763 	 *  E.g., driver was not cleanly unloaded previously, etc.
3764 	 */
3765 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3766 		if (adev->gmc.xgmi.num_physical_nodes) {
3767 			dev_info(adev->dev, "Pending hive reset.\n");
3768 			adev->gmc.xgmi.pending_reset = true;
3769 			/* Only need to init necessary block for SMU to handle the reset */
3770 			for (i = 0; i < adev->num_ip_blocks; i++) {
3771 				if (!adev->ip_blocks[i].status.valid)
3772 					continue;
3773 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3774 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3775 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3776 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3777 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3778 						adev->ip_blocks[i].version->funcs->name);
3779 					adev->ip_blocks[i].status.hw = true;
3780 				}
3781 			}
3782 		} else {
3783 			tmp = amdgpu_reset_method;
3784 			/* It should do a default reset when loading or reloading the driver,
3785 			 * regardless of the module parameter reset_method.
3786 			 */
3787 			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3788 			r = amdgpu_asic_reset(adev);
3789 			amdgpu_reset_method = tmp;
3790 			if (r) {
3791 				dev_err(adev->dev, "asic reset on init failed\n");
3792 				goto failed;
3793 			}
3794 		}
3795 	}
3796 
3797 	/* Post card if necessary */
3798 	if (amdgpu_device_need_post(adev)) {
3799 		if (!adev->bios) {
3800 			dev_err(adev->dev, "no vBIOS found\n");
3801 			r = -EINVAL;
3802 			goto failed;
3803 		}
3804 		DRM_INFO("GPU posting now...\n");
3805 		r = amdgpu_device_asic_init(adev);
3806 		if (r) {
3807 			dev_err(adev->dev, "gpu post error!\n");
3808 			goto failed;
3809 		}
3810 	}
3811 
3812 	if (adev->bios) {
3813 		if (adev->is_atom_fw) {
3814 			/* Initialize clocks */
3815 			r = amdgpu_atomfirmware_get_clock_info(adev);
3816 			if (r) {
3817 				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3818 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3819 				goto failed;
3820 			}
3821 		} else {
3822 			/* Initialize clocks */
3823 			r = amdgpu_atombios_get_clock_info(adev);
3824 			if (r) {
3825 				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3826 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3827 				goto failed;
3828 			}
3829 			/* init i2c buses */
3830 			if (!amdgpu_device_has_dc_support(adev))
3831 				amdgpu_atombios_i2c_init(adev);
3832 		}
3833 	}
3834 
3835 fence_driver_init:
3836 	/* Fence driver */
3837 	r = amdgpu_fence_driver_sw_init(adev);
3838 	if (r) {
3839 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3840 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3841 		goto failed;
3842 	}
3843 
3844 	/* init the mode config */
3845 	drm_mode_config_init(adev_to_drm(adev));
3846 
3847 	r = amdgpu_device_ip_init(adev);
3848 	if (r) {
3849 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3850 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3851 		goto release_ras_con;
3852 	}
3853 
3854 	amdgpu_fence_driver_hw_init(adev);
3855 
3856 	dev_info(adev->dev,
3857 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3858 			adev->gfx.config.max_shader_engines,
3859 			adev->gfx.config.max_sh_per_se,
3860 			adev->gfx.config.max_cu_per_sh,
3861 			adev->gfx.cu_info.number);
3862 
3863 #ifdef __OpenBSD__
3864 {
3865 	const char *chip_name;
3866 	uint32_t version = adev->ip_versions[GC_HWIP][0];
3867 	int maj, min, rev;
3868 
3869 	switch (adev->asic_type) {
3870 	case CHIP_RAVEN:
3871 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3872 			chip_name = "RAVEN2";
3873 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3874 			chip_name = "PICASSO";
3875 		else
3876 			chip_name = "RAVEN";
3877 		break;
3878 	case CHIP_RENOIR:
3879 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
3880 			chip_name = "RENOIR";
3881 		else
3882 			chip_name = "GREEN_SARDINE";
3883 		break;
3884 	default:
3885 		chip_name = amdgpu_asic_name[adev->asic_type];
3886 	}
3887 
3888 	printf("%s: %s", adev->self.dv_xname, chip_name);
3889 	/* show graphics/compute ip block version, not set on < GFX9 */
3890 	if (version) {
3891 		maj = IP_VERSION_MAJ(version);
3892 		min = IP_VERSION_MIN(version);
3893 		rev = IP_VERSION_REV(version);
3894 		printf(" GC %d.%d.%d", maj, min, rev);
3895 	}
3896 	printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id);
3897 }
3898 #endif
3899 
3900 	adev->accel_working = true;
3901 
3902 	amdgpu_vm_check_compute_bug(adev);
3903 
3904 	/* Initialize the buffer migration limit. */
3905 	if (amdgpu_moverate >= 0)
3906 		max_MBps = amdgpu_moverate;
3907 	else
3908 		max_MBps = 8; /* Allow 8 MB/s. */
3909 	/* Get a log2 for easy divisions. */
3910 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
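	/*
	 * Worked example (illustrative): max_MBps = 8 yields
	 * log2_max_MBps = 3, so later byte-count to budget conversions can
	 * use a shift instead of a division.
	 */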
3911 
3912 	r = amdgpu_atombios_sysfs_init(adev);
3913 	if (r)
3914 		drm_err(&adev->ddev,
3915 			"registering atombios sysfs failed (%d).\n", r);
3916 
3917 	r = amdgpu_pm_sysfs_init(adev);
3918 	if (r)
3919 		DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3920 
3921 	r = amdgpu_ucode_sysfs_init(adev);
3922 	if (r) {
3923 		adev->ucode_sysfs_en = false;
3924 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3925 	} else
3926 		adev->ucode_sysfs_en = true;
3927 
3928 	/*
3929 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3930 	 * Otherwise the mgpu fan boost feature will be skipped because the
3931 	 * gpu instance count will be too low.
3932 	 */
3933 	amdgpu_register_gpu_instance(adev);
3934 
3935 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3936 	 * explicit gating rather than handling it automatically.
3937 	 */
3938 	if (!adev->gmc.xgmi.pending_reset) {
3939 		r = amdgpu_device_ip_late_init(adev);
3940 		if (r) {
3941 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3942 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3943 			goto release_ras_con;
3944 		}
3945 		/* must succeed. */
3946 		amdgpu_ras_resume(adev);
3947 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3948 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3949 	}
3950 
3951 	if (amdgpu_sriov_vf(adev)) {
3952 		amdgpu_virt_release_full_gpu(adev, true);
3953 		flush_delayed_work(&adev->delayed_init_work);
3954 	}
3955 
3956 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3957 	if (r)
3958 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3959 
3960 	amdgpu_fru_sysfs_init(adev);
3961 
3962 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3963 		r = amdgpu_pmu_init(adev);
3964 	if (r)
3965 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3966 
3967 	/* Have stored pci confspace at hand for restore in sudden PCI error */
3968 	if (amdgpu_device_cache_pci_state(adev->pdev))
3969 		pci_restore_state(pdev);
3970 
3971 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3972 	/* this will fail for cards that aren't VGA class devices, just
3973 	 * ignore it
3974 	 */
3975 #ifdef notyet
3976 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3977 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3978 #endif
3979 
3980 	px = amdgpu_device_supports_px(ddev);
3981 
3982 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
3983 				apple_gmux_detect(NULL, NULL)))
3984 		vga_switcheroo_register_client(adev->pdev,
3985 					       &amdgpu_switcheroo_ops, px);
3986 
3987 	if (px)
3988 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3989 
3990 	if (adev->gmc.xgmi.pending_reset)
3991 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3992 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3993 
3994 	amdgpu_device_check_iommu_direct_map(adev);
3995 
3996 	return 0;
3997 
3998 release_ras_con:
3999 	if (amdgpu_sriov_vf(adev))
4000 		amdgpu_virt_release_full_gpu(adev, true);
4001 
4002 	/* failed in exclusive mode due to timeout */
4003 	if (amdgpu_sriov_vf(adev) &&
4004 		!amdgpu_sriov_runtime(adev) &&
4005 		amdgpu_virt_mmio_blocked(adev) &&
4006 		!amdgpu_virt_wait_reset(adev)) {
4007 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4008 		/* Don't send request since VF is inactive. */
4009 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4010 		adev->virt.ops = NULL;
4011 		r = -EAGAIN;
4012 	}
4013 	amdgpu_release_ras_context(adev);
4014 
4015 failed:
4016 	amdgpu_vf_error_trans_all(adev);
4017 
4018 	return r;
4019 }
4020 
4021 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4022 {
4023 	STUB();
4024 #ifdef notyet
4025 
4026 	/* Clear all CPU mappings pointing to this device */
4027 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4028 #endif
4029 
4030 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
4031 	amdgpu_doorbell_fini(adev);
4032 
4033 #ifdef __linux__
4034 	iounmap(adev->rmmio);
4035 	adev->rmmio = NULL;
4036 	if (adev->mman.aper_base_kaddr)
4037 		iounmap(adev->mman.aper_base_kaddr);
4038 	adev->mman.aper_base_kaddr = NULL;
4039 #else
4040 	if (adev->rmmio_size > 0)
4041 		bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4042 		    adev->rmmio_size);
4043 	adev->rmmio_size = 0;
4044 	adev->rmmio = NULL;
4045 	if (adev->mman.aper_base_kaddr)
4046 		bus_space_unmap(adev->memt, adev->mman.aper_bsh,
4047 		    adev->gmc.visible_vram_size);
4048 	adev->mman.aper_base_kaddr = NULL;
4049 #endif
4050 
4051 	/* Memory manager related */
4052 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4053 #ifdef __linux__
4054 		arch_phys_wc_del(adev->gmc.vram_mtrr);
4055 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4056 #else
4057 		drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
4058 #endif
4059 	}
4060 }
4061 
4062 /**
4063  * amdgpu_device_fini_hw - tear down the driver
4064  *
4065  * @adev: amdgpu_device pointer
4066  *
4067  * Tear down the driver info (all asics).
4068  * Called at driver shutdown.
4069  */
4070 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4071 {
4072 	dev_info(adev->dev, "amdgpu: finishing device.\n");
4073 	flush_delayed_work(&adev->delayed_init_work);
4074 	adev->shutdown = true;
4075 
4076 	/* make sure IB test finished before entering exclusive mode
4077 	 * to avoid preemption on IB test
4078 	 */
4079 	if (amdgpu_sriov_vf(adev)) {
4080 		amdgpu_virt_request_full_gpu(adev, false);
4081 		amdgpu_virt_fini_data_exchange(adev);
4082 	}
4083 
4084 	/* disable all interrupts */
4085 	amdgpu_irq_disable_all(adev);
4086 	if (adev->mode_info.mode_config_initialized) {
4087 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4088 			drm_helper_force_disable_all(adev_to_drm(adev));
4089 		else
4090 			drm_atomic_helper_shutdown(adev_to_drm(adev));
4091 	}
4092 	amdgpu_fence_driver_hw_fini(adev);
4093 
4094 	if (adev->mman.initialized)
4095 		drain_workqueue(adev->mman.bdev.wq);
4096 
4097 	if (adev->pm.sysfs_initialized)
4098 		amdgpu_pm_sysfs_fini(adev);
4099 	if (adev->ucode_sysfs_en)
4100 		amdgpu_ucode_sysfs_fini(adev);
4101 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4102 	amdgpu_fru_sysfs_fini(adev);
4103 
4104 	/* disable ras feature must before hw fini */
4105 	/* ras features must be disabled before hw fini */
4106 
4107 	amdgpu_device_ip_fini_early(adev);
4108 
4109 	amdgpu_irq_fini_hw(adev);
4110 
4111 	if (adev->mman.initialized)
4112 		ttm_device_clear_dma_mappings(&adev->mman.bdev);
4113 
4114 	amdgpu_gart_dummy_page_fini(adev);
4115 
4116 	if (drm_dev_is_unplugged(adev_to_drm(adev)))
4117 		amdgpu_device_unmap_mmio(adev);
4118 
4119 }
4120 
4121 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4122 {
4123 	int idx;
4124 	bool px;
4125 
4126 	amdgpu_fence_driver_sw_fini(adev);
4127 	amdgpu_device_ip_fini(adev);
4128 	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4129 	adev->accel_working = false;
4130 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4131 
4132 	amdgpu_reset_fini(adev);
4133 
4134 	/* free i2c buses */
4135 	if (!amdgpu_device_has_dc_support(adev))
4136 		amdgpu_i2c_fini(adev);
4137 
4138 	if (amdgpu_emu_mode != 1)
4139 		amdgpu_atombios_fini(adev);
4140 
4141 	kfree(adev->bios);
4142 	adev->bios = NULL;
4143 
4144 	px = amdgpu_device_supports_px(adev_to_drm(adev));
4145 
4146 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4147 				apple_gmux_detect(NULL, NULL)))
4148 		vga_switcheroo_unregister_client(adev->pdev);
4149 
4150 	if (px)
4151 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4152 
4153 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4154 		vga_client_unregister(adev->pdev);
4155 
4156 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4157 #ifdef __linux__
4158 		iounmap(adev->rmmio);
4159 		adev->rmmio = NULL;
4160 #else
4161 		if (adev->rmmio_size > 0)
4162 			bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4163 			    adev->rmmio_size);
4164 		adev->rmmio_size = 0;
4165 		adev->rmmio = NULL;
4166 #endif
4167 		amdgpu_doorbell_fini(adev);
4168 		drm_dev_exit(idx);
4169 	}
4170 
4171 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4172 		amdgpu_pmu_fini(adev);
4173 	if (adev->mman.discovery_bin)
4174 		amdgpu_discovery_fini(adev);
4175 
4176 	amdgpu_reset_put_reset_domain(adev->reset_domain);
4177 	adev->reset_domain = NULL;
4178 
4179 	kfree(adev->pci_state);
4180 
4181 }
4182 
4183 /**
4184  * amdgpu_device_evict_resources - evict device resources
4185  * @adev: amdgpu device object
4186  *
4187  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4188  * of the vram memory type. Mainly used for evicting device resources
4189  * at suspend time.
4190  *
4191  */
4192 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4193 {
4194 	int ret;
4195 
4196 	/* No need to evict vram on APUs for suspend to ram or s2idle */
4197 	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4198 		return 0;
4199 
4200 	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4201 	if (ret)
4202 		DRM_WARN("evicting device resources failed\n");
4203 	return ret;
4204 }
4205 
4206 /*
4207  * Suspend & resume.
4208  */
4209 /**
4210  * amdgpu_device_prepare - prepare for device suspend
4211  *
4212  * @dev: drm dev pointer
4213  *
4214  * Prepare to put the hw in the suspend state (all asics).
4215  * Returns 0 for success or an error on failure.
4216  * Called at driver suspend.
4217  */
4218 int amdgpu_device_prepare(struct drm_device *dev)
4219 {
4220 	struct amdgpu_device *adev = drm_to_adev(dev);
4221 	int i, r;
4222 
4223 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4224 		return 0;
4225 
4226 	/* Evict the majority of BOs before starting suspend sequence */
4227 	r = amdgpu_device_evict_resources(adev);
4228 	if (r)
4229 		return r;
4230 
4231 	flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4232 
4233 	for (i = 0; i < adev->num_ip_blocks; i++) {
4234 		if (!adev->ip_blocks[i].status.valid)
4235 			continue;
4236 		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4237 			continue;
4238 		r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4239 		if (r)
4240 			return r;
4241 	}
4242 
4243 	return 0;
4244 }
4245 
4246 /**
4247  * amdgpu_device_suspend - initiate device suspend
4248  *
4249  * @dev: drm dev pointer
4250  * @fbcon : notify the fbdev of suspend
4251  *
4252  * Puts the hw in the suspend state (all asics).
4253  * Returns 0 for success or an error on failure.
4254  * Called at driver suspend.
4255  */
4256 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4257 {
4258 	struct amdgpu_device *adev = drm_to_adev(dev);
4259 	int r = 0;
4260 
4261 	if (adev->shutdown)
4262 		return 0;
4263 
4264 #ifdef notyet
4265 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4266 		return 0;
4267 #endif
4268 
4269 	adev->in_suspend = true;
4270 
4271 	if (amdgpu_sriov_vf(adev)) {
4272 		amdgpu_virt_fini_data_exchange(adev);
4273 		r = amdgpu_virt_request_full_gpu(adev, false);
4274 		if (r)
4275 			return r;
4276 	}
4277 
4278 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4279 		DRM_WARN("smart shift update failed\n");
4280 
4281 	if (fbcon)
4282 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4283 
4284 	cancel_delayed_work_sync(&adev->delayed_init_work);
4285 
4286 	amdgpu_ras_suspend(adev);
4287 
4288 	amdgpu_device_ip_suspend_phase1(adev);
4289 
4290 	if (!adev->in_s0ix)
4291 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4292 
4293 	r = amdgpu_device_evict_resources(adev);
4294 	if (r)
4295 		return r;
4296 
4297 	amdgpu_fence_driver_hw_fini(adev);
4298 
4299 	amdgpu_device_ip_suspend_phase2(adev);
4300 
4301 	if (amdgpu_sriov_vf(adev))
4302 		amdgpu_virt_release_full_gpu(adev, false);
4303 
4304 	return 0;
4305 }
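/*
 * Illustrative pairing (a sketch, not a guarantee): the suspend path above
 * runs ip_suspend_phase1, kfd suspend, eviction, fence hw_fini and then
 * ip_suspend_phase2, and amdgpu_device_resume() below undoes this in
 * roughly the reverse order via amdgpu_device_ip_resume() and
 * amdgpu_fence_driver_hw_init().
 */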
4306 
4307 /**
4308  * amdgpu_device_resume - initiate device resume
4309  *
4310  * @dev: drm dev pointer
4311  * @fbcon : notify the fbdev of resume
4312  *
4313  * Bring the hw back to operating state (all asics).
4314  * Returns 0 for success or an error on failure.
4315  * Called at driver resume.
4316  */
4317 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4318 {
4319 	struct amdgpu_device *adev = drm_to_adev(dev);
4320 	int r = 0;
4321 
4322 	if (amdgpu_sriov_vf(adev)) {
4323 		r = amdgpu_virt_request_full_gpu(adev, true);
4324 		if (r)
4325 			return r;
4326 	}
4327 
4328 #ifdef notyet
4329 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4330 		return 0;
4331 #endif
4332 
4333 	if (adev->in_s0ix)
4334 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4335 
4336 	/* post card */
4337 	if (amdgpu_device_need_post(adev)) {
4338 		r = amdgpu_device_asic_init(adev);
4339 		if (r)
4340 			dev_err(adev->dev, "amdgpu asic init failed\n");
4341 	}
4342 
4343 	r = amdgpu_device_ip_resume(adev);
4344 
4345 	if (r) {
4346 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4347 		goto exit;
4348 	}
4349 	amdgpu_fence_driver_hw_init(adev);
4350 
4351 	r = amdgpu_device_ip_late_init(adev);
4352 	if (r)
4353 		goto exit;
4354 
4355 	queue_delayed_work(system_wq, &adev->delayed_init_work,
4356 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
4357 
4358 	if (!adev->in_s0ix) {
4359 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4360 		if (r)
4361 			goto exit;
4362 	}
4363 
4364 exit:
4365 	if (amdgpu_sriov_vf(adev)) {
4366 		amdgpu_virt_init_data_exchange(adev);
4367 		amdgpu_virt_release_full_gpu(adev, true);
4368 	}
4369 
4370 	if (r)
4371 		return r;
4372 
4373 	/* Make sure IB tests flushed */
4374 	flush_delayed_work(&adev->delayed_init_work);
4375 
4376 	if (fbcon)
4377 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4378 
4379 	amdgpu_ras_resume(adev);
4380 
4381 	if (adev->mode_info.num_crtc) {
4382 		/*
4383 		 * Most of the connector probing functions try to acquire runtime pm
4384 		 * refs to ensure that the GPU is powered on when connector polling is
4385 		 * performed. Since we're calling this from a runtime PM callback,
4386 		 * trying to acquire rpm refs will cause us to deadlock.
4387 		 *
4388 		 * Since we're guaranteed to be holding the rpm lock, it's safe to
4389 		 * temporarily disable the rpm helpers so this doesn't deadlock us.
4390 		 */
4391 #if defined(CONFIG_PM) && defined(__linux__)
4392 		dev->dev->power.disable_depth++;
4393 #endif
4394 		if (!adev->dc_enabled)
4395 			drm_helper_hpd_irq_event(dev);
4396 		else
4397 			drm_kms_helper_hotplug_event(dev);
4398 #if defined(CONFIG_PM) && defined(__linux__)
4399 		dev->dev->power.disable_depth--;
4400 #endif
4401 	}
4402 	adev->in_suspend = false;
4403 
4404 	if (adev->enable_mes)
4405 		amdgpu_mes_self_test(adev);
4406 
4407 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4408 		DRM_WARN("smart shift update failed\n");
4409 
4410 	return 0;
4411 }
4412 
4413 /**
4414  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4415  *
4416  * @adev: amdgpu_device pointer
4417  *
4418  * The list of all the hardware IPs that make up the asic is walked and
4419  * the check_soft_reset callbacks are run.  check_soft_reset determines
4420  * if the asic is still hung or not.
4421  * Returns true if any of the IPs are still in a hung state, false if not.
4422  */
4423 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4424 {
4425 	int i;
4426 	bool asic_hang = false;
4427 
4428 	if (amdgpu_sriov_vf(adev))
4429 		return true;
4430 
4431 	if (amdgpu_asic_need_full_reset(adev))
4432 		return true;
4433 
4434 	for (i = 0; i < adev->num_ip_blocks; i++) {
4435 		if (!adev->ip_blocks[i].status.valid)
4436 			continue;
4437 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4438 			adev->ip_blocks[i].status.hang =
4439 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4440 		if (adev->ip_blocks[i].status.hang) {
4441 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4442 			asic_hang = true;
4443 		}
4444 	}
4445 	return asic_hang;
4446 }
4447 
4448 /**
4449  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4450  *
4451  * @adev: amdgpu_device pointer
4452  *
4453  * The list of all the hardware IPs that make up the asic is walked and the
4454  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4455  * handles any IP specific hardware or software state changes that are
4456  * necessary for a soft reset to succeed.
4457  * Returns 0 on success, negative error code on failure.
4458  */
4459 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4460 {
4461 	int i, r = 0;
4462 
4463 	for (i = 0; i < adev->num_ip_blocks; i++) {
4464 		if (!adev->ip_blocks[i].status.valid)
4465 			continue;
4466 		if (adev->ip_blocks[i].status.hang &&
4467 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4468 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4469 			if (r)
4470 				return r;
4471 		}
4472 	}
4473 
4474 	return 0;
4475 }
4476 
4477 /**
4478  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4479  *
4480  * @adev: amdgpu_device pointer
4481  *
4482  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4483  * reset is necessary to recover.
4484  * Returns true if a full asic reset is required, false if not.
4485  */
4486 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4487 {
4488 	int i;
4489 
4490 	if (amdgpu_asic_need_full_reset(adev))
4491 		return true;
4492 
4493 	for (i = 0; i < adev->num_ip_blocks; i++) {
4494 		if (!adev->ip_blocks[i].status.valid)
4495 			continue;
4496 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4497 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4498 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4499 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4500 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4501 			if (adev->ip_blocks[i].status.hang) {
4502 				dev_info(adev->dev, "Some block need full reset!\n");
4503 				return true;
4504 			}
4505 		}
4506 	}
4507 	return false;
4508 }
4509 
4510 /**
4511  * amdgpu_device_ip_soft_reset - do a soft reset
4512  *
4513  * @adev: amdgpu_device pointer
4514  *
4515  * The list of all the hardware IPs that make up the asic is walked and the
4516  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4517  * IP specific hardware or software state changes that are necessary to soft
4518  * reset the IP.
4519  * Returns 0 on success, negative error code on failure.
4520  */
4521 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4522 {
4523 	int i, r = 0;
4524 
4525 	for (i = 0; i < adev->num_ip_blocks; i++) {
4526 		if (!adev->ip_blocks[i].status.valid)
4527 			continue;
4528 		if (adev->ip_blocks[i].status.hang &&
4529 		    adev->ip_blocks[i].version->funcs->soft_reset) {
4530 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4531 			if (r)
4532 				return r;
4533 		}
4534 	}
4535 
4536 	return 0;
4537 }
4538 
4539 /**
4540  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4541  *
4542  * @adev: amdgpu_device pointer
4543  *
4544  * The list of all the hardware IPs that make up the asic is walked and the
4545  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4546  * handles any IP specific hardware or software state changes that are
4547  * necessary after the IP has been soft reset.
4548  * Returns 0 on success, negative error code on failure.
4549  */
4550 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4551 {
4552 	int i, r = 0;
4553 
4554 	for (i = 0; i < adev->num_ip_blocks; i++) {
4555 		if (!adev->ip_blocks[i].status.valid)
4556 			continue;
4557 		if (adev->ip_blocks[i].status.hang &&
4558 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4559 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4560 		if (r)
4561 			return r;
4562 	}
4563 
4564 	return 0;
4565 }
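/*
 * Illustrative sketch: the four soft reset helpers above are expected to be
 * driven together, as amdgpu_device_pre_asic_reset() below does:
 *
 *	if (amdgpu_device_ip_check_soft_reset(adev)) {
 *		amdgpu_device_ip_pre_soft_reset(adev);
 *		amdgpu_device_ip_soft_reset(adev);
 *		amdgpu_device_ip_post_soft_reset(adev);
 *	}
 */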
4566 
4567 /**
4568  * amdgpu_device_recover_vram - Recover some VRAM contents
4569  *
4570  * @adev: amdgpu_device pointer
4571  *
4572  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4573  * restore things like GPUVM page tables after a GPU reset where
4574  * the contents of VRAM might be lost.
4575  *
4576  * Returns:
4577  * 0 on success, negative error code on failure.
4578  */
4579 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4580 {
4581 	struct dma_fence *fence = NULL, *next = NULL;
4582 	struct amdgpu_bo *shadow;
4583 	struct amdgpu_bo_vm *vmbo;
4584 	long r = 1, tmo;
4585 
4586 	if (amdgpu_sriov_runtime(adev))
4587 		tmo = msecs_to_jiffies(8000);
4588 	else
4589 		tmo = msecs_to_jiffies(100);
4590 
4591 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4592 	mutex_lock(&adev->shadow_list_lock);
4593 	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4594 		/* If vm is compute context or adev is APU, shadow will be NULL */
4595 		if (!vmbo->shadow)
4596 			continue;
4597 		shadow = vmbo->shadow;
4598 
4599 		/* No need to recover an evicted BO */
4600 		if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4601 		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4602 		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4603 			continue;
4604 
4605 		r = amdgpu_bo_restore_shadow(shadow, &next);
4606 		if (r)
4607 			break;
4608 
4609 		if (fence) {
4610 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4611 			dma_fence_put(fence);
4612 			fence = next;
4613 			if (tmo == 0) {
4614 				r = -ETIMEDOUT;
4615 				break;
4616 			} else if (tmo < 0) {
4617 				r = tmo;
4618 				break;
4619 			}
4620 		} else {
4621 			fence = next;
4622 		}
4623 	}
4624 	mutex_unlock(&adev->shadow_list_lock);
4625 
4626 	if (fence)
4627 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4628 	dma_fence_put(fence);
4629 
4630 	if (r < 0 || tmo <= 0) {
4631 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4632 		return -EIO;
4633 	}
4634 
4635 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4636 	return 0;
4637 }
4638 
4639 
4640 /**
4641  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4642  *
4643  * @adev: amdgpu_device pointer
4644  * @from_hypervisor: request from hypervisor
4645  *
4646  * Do a VF FLR and reinitialize the ASIC.
4647  * Returns 0 on success, a negative error code otherwise.
4648  */
4649 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4650 				     bool from_hypervisor)
4651 {
4652 	int r;
4653 	struct amdgpu_hive_info *hive = NULL;
4654 	int retry_limit = 0;
4655 
4656 retry:
4657 	amdgpu_amdkfd_pre_reset(adev);
4658 
4659 	if (from_hypervisor)
4660 		r = amdgpu_virt_request_full_gpu(adev, true);
4661 	else
4662 		r = amdgpu_virt_reset_gpu(adev);
4663 	if (r)
4664 		return r;
4665 	amdgpu_irq_gpu_reset_resume_helper(adev);
4666 
4667 	/* some sw clean up VF needs to do before recover */
4668 	amdgpu_virt_post_reset(adev);
4669 
4670 	/* Resume IP prior to SMC */
4671 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4672 	if (r)
4673 		goto error;
4674 
4675 	amdgpu_virt_init_data_exchange(adev);
4676 
4677 	r = amdgpu_device_fw_loading(adev);
4678 	if (r)
4679 		return r;
4680 
4681 	/* now we are okay to resume SMC/CP/SDMA */
4682 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4683 	if (r)
4684 		goto error;
4685 
4686 	hive = amdgpu_get_xgmi_hive(adev);
4687 	/* Update PSP FW topology after reset */
4688 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4689 		r = amdgpu_xgmi_update_topology(hive, adev);
4690 
4691 	if (hive)
4692 		amdgpu_put_xgmi_hive(hive);
4693 
4694 	if (!r) {
4695 		r = amdgpu_ib_ring_tests(adev);
4696 
4697 		amdgpu_amdkfd_post_reset(adev);
4698 	}
4699 
4700 error:
4701 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4702 		amdgpu_inc_vram_lost(adev);
4703 		r = amdgpu_device_recover_vram(adev);
4704 	}
4705 	amdgpu_virt_release_full_gpu(adev, true);
4706 
4707 	if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4708 		if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4709 			retry_limit++;
4710 			goto retry;
4711 		} else
4712 			DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4713 	}
4714 
4715 	return r;
4716 }
4717 
4718 /**
4719  * amdgpu_device_has_job_running - check if there is any job in mirror list
4720  *
4721  * @adev: amdgpu_device pointer
4722  *
4723  * check if there is any job in mirror list
4724  */
4725 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4726 {
4727 	int i;
4728 	struct drm_sched_job *job;
4729 
4730 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4731 		struct amdgpu_ring *ring = adev->rings[i];
4732 
4733 		if (!ring || !ring->sched.thread)
4734 			continue;
4735 
4736 		spin_lock(&ring->sched.job_list_lock);
4737 		job = list_first_entry_or_null(&ring->sched.pending_list,
4738 					       struct drm_sched_job, list);
4739 		spin_unlock(&ring->sched.job_list_lock);
4740 		if (job)
4741 			return true;
4742 	}
4743 	return false;
4744 }
4745 
4746 /**
4747  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4748  *
4749  * @adev: amdgpu_device pointer
4750  *
4751  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4752  * a hung GPU.
4753  */
4754 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4755 {
4756 
4757 	if (amdgpu_gpu_recovery == 0)
4758 		goto disabled;
4759 
4760 	/* Skip soft reset check in fatal error mode */
4761 	if (!amdgpu_ras_is_poison_mode_supported(adev))
4762 		return true;
4763 
4764 	if (amdgpu_sriov_vf(adev))
4765 		return true;
4766 
4767 	if (amdgpu_gpu_recovery == -1) {
4768 		switch (adev->asic_type) {
4769 #ifdef CONFIG_DRM_AMDGPU_SI
4770 		case CHIP_VERDE:
4771 		case CHIP_TAHITI:
4772 		case CHIP_PITCAIRN:
4773 		case CHIP_OLAND:
4774 		case CHIP_HAINAN:
4775 #endif
4776 #ifdef CONFIG_DRM_AMDGPU_CIK
4777 		case CHIP_KAVERI:
4778 		case CHIP_KABINI:
4779 		case CHIP_MULLINS:
4780 #endif
4781 		case CHIP_CARRIZO:
4782 		case CHIP_STONEY:
4783 		case CHIP_CYAN_SKILLFISH:
4784 			goto disabled;
4785 		default:
4786 			break;
4787 		}
4788 	}
4789 
4790 	return true;
4791 
4792 disabled:
4793 		dev_info(adev->dev, "GPU recovery disabled.\n");
4794 		return false;
4795 }
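/*
 * Hedged summary of the amdgpu_gpu_recovery parameter as handled above:
 * 0 disables recovery outright, -1 (auto) disables it only on the legacy
 * ASICs listed in the switch, and any other value (or SR-IOV, or a RAS
 * setup without poison mode) allows recovery to proceed.
 */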
4796 
4797 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4798 {
4799 	u32 i;
4800 	int ret = 0;
4801 
4802 	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4803 
4804 	dev_info(adev->dev, "GPU mode1 reset\n");
4805 
4806 	/* disable BM */
4807 	pci_clear_master(adev->pdev);
4808 
4809 	amdgpu_device_cache_pci_state(adev->pdev);
4810 
4811 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4812 		dev_info(adev->dev, "GPU smu mode1 reset\n");
4813 		ret = amdgpu_dpm_mode1_reset(adev);
4814 	} else {
4815 		dev_info(adev->dev, "GPU psp mode1 reset\n");
4816 		ret = psp_gpu_reset(adev);
4817 	}
4818 
4819 	if (ret)
4820 		goto mode1_reset_failed;
4821 
4822 	amdgpu_device_load_pci_state(adev->pdev);
4823 	ret = amdgpu_psp_wait_for_bootloader(adev);
4824 	if (ret)
4825 		goto mode1_reset_failed;
4826 
4827 	/* wait for asic to come out of reset */
4828 	for (i = 0; i < adev->usec_timeout; i++) {
4829 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
4830 
4831 		if (memsize != 0xffffffff)
4832 			break;
4833 		udelay(1);
4834 	}
4835 
4836 	if (i >= adev->usec_timeout) {
4837 		ret = -ETIMEDOUT;
4838 		goto mode1_reset_failed;
4839 	}
4840 
4841 	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4842 
4843 	return 0;
4844 
4845 mode1_reset_failed:
4846 	dev_err(adev->dev, "GPU mode1 reset failed\n");
4847 	return ret;
4848 }
4849 
4850 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4851 				 struct amdgpu_reset_context *reset_context)
4852 {
4853 	int i, r = 0;
4854 	struct amdgpu_job *job = NULL;
4855 	bool need_full_reset =
4856 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4857 
4858 	if (reset_context->reset_req_dev == adev)
4859 		job = reset_context->job;
4860 
4861 	if (amdgpu_sriov_vf(adev)) {
4862 		/* stop the data exchange thread */
4863 		amdgpu_virt_fini_data_exchange(adev);
4864 	}
4865 
4866 	amdgpu_fence_driver_isr_toggle(adev, true);
4867 
4868 	/* block all schedulers and reset given job's ring */
4869 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4870 		struct amdgpu_ring *ring = adev->rings[i];
4871 
4872 		if (!ring || !ring->sched.thread)
4873 			continue;
4874 
4875 		/* Clear job fence from fence drv to avoid force_completion
4876 		 * leave NULL and vm flush fence in fence drv
4877 		 */
4878 		amdgpu_fence_driver_clear_job_fences(ring);
4879 
4880 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4881 		amdgpu_fence_driver_force_completion(ring);
4882 	}
4883 
4884 	amdgpu_fence_driver_isr_toggle(adev, false);
4885 
4886 	if (job && job->vm)
4887 		drm_sched_increase_karma(&job->base);
4888 
4889 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4890 	/* If reset handler not implemented, continue; otherwise return */
4891 	if (r == -EOPNOTSUPP)
4892 		r = 0;
4893 	else
4894 		return r;
4895 
4896 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4897 	if (!amdgpu_sriov_vf(adev)) {
4898 
4899 		if (!need_full_reset)
4900 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4901 
4902 		if (!need_full_reset && amdgpu_gpu_recovery &&
4903 		    amdgpu_device_ip_check_soft_reset(adev)) {
4904 			amdgpu_device_ip_pre_soft_reset(adev);
4905 			r = amdgpu_device_ip_soft_reset(adev);
4906 			amdgpu_device_ip_post_soft_reset(adev);
4907 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4908 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4909 				need_full_reset = true;
4910 			}
4911 		}
4912 
4913 		if (need_full_reset)
4914 			r = amdgpu_device_ip_suspend(adev);
4915 		if (need_full_reset)
4916 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4917 		else
4918 			clear_bit(AMDGPU_NEED_FULL_RESET,
4919 				  &reset_context->flags);
4920 	}
4921 
4922 	return r;
4923 }
4924 
4925 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4926 {
4927 	int i;
4928 
4929 	lockdep_assert_held(&adev->reset_domain->sem);
4930 
4931 	for (i = 0; i < adev->num_regs; i++) {
4932 		adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4933 		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4934 					     adev->reset_dump_reg_value[i]);
4935 	}
4936 
4937 	return 0;
4938 }
4939 
4940 #ifdef CONFIG_DEV_COREDUMP
4941 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4942 		size_t count, void *data, size_t datalen)
4943 {
4944 	struct drm_printer p;
4945 	struct amdgpu_device *adev = data;
4946 	struct drm_print_iterator iter;
4947 	int i;
4948 
4949 	iter.data = buffer;
4950 	iter.offset = 0;
4951 	iter.start = offset;
4952 	iter.remain = count;
4953 
4954 	p = drm_coredump_printer(&iter);
4955 
4956 	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4957 	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4958 	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4959 	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4960 	if (adev->reset_task_info.pid)
4961 		drm_printf(&p, "process_name: %s PID: %d\n",
4962 			   adev->reset_task_info.process_name,
4963 			   adev->reset_task_info.pid);
4964 
4965 	if (adev->reset_vram_lost)
4966 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4967 	if (adev->num_regs) {
4968 		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4969 
4970 		for (i = 0; i < adev->num_regs; i++)
4971 			drm_printf(&p, "0x%08x: 0x%08x\n",
4972 				   adev->reset_dump_reg_list[i],
4973 				   adev->reset_dump_reg_value[i]);
4974 	}
4975 
4976 	return count - iter.remain;
4977 }
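/*
 * Example of the text produced above (illustrative values only), as it
 * would typically be read back through the devcoredump interface:
 *
 *	**** AMDGPU Device Coredump ****
 *	kernel: 6.1.0
 *	module: amdgpu
 *	time: 1700000000.000000000
 *	process_name: glxgears PID: 4242
 *	AMDGPU register dumps:
 *	Offset:     Value:
 *	0x00001234: 0xdeadbeef
 */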
4978 
4979 static void amdgpu_devcoredump_free(void *data)
4980 {
4981 }
4982 
4983 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4984 {
4985 	struct drm_device *dev = adev_to_drm(adev);
4986 
4987 	ktime_get_ts64(&adev->reset_time);
4988 	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
4989 		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4990 }
4991 #endif
4992 
4993 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4994 			 struct amdgpu_reset_context *reset_context)
4995 {
4996 	struct amdgpu_device *tmp_adev = NULL;
4997 	bool need_full_reset, skip_hw_reset, vram_lost = false;
4998 	int r = 0;
4999 	bool gpu_reset_for_dev_remove = 0;
5000 
5001 	/* Try reset handler method first */
5002 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5003 				    reset_list);
5004 	amdgpu_reset_reg_dumps(tmp_adev);
5005 
5006 	reset_context->reset_device_list = device_list_handle;
5007 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5008 	/* If reset handler not implemented, continue; otherwise return */
5009 	if (r == -EOPNOTSUPP)
5010 		r = 0;
5011 	else
5012 		return r;
5013 
5014 	/* Reset handler not implemented, use the default method */
5015 	need_full_reset =
5016 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5017 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5018 
5019 	gpu_reset_for_dev_remove =
5020 		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5021 			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5022 
5023 	/*
5024 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
5025 	 * to allow proper links negotiation in FW (within 1 sec)
5026 	 */
5027 	if (!skip_hw_reset && need_full_reset) {
5028 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5029 			/* For XGMI run all resets in parallel to speed up the process */
5030 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5031 				tmp_adev->gmc.xgmi.pending_reset = false;
5032 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5033 					r = -EALREADY;
5034 			} else
5035 				r = amdgpu_asic_reset(tmp_adev);
5036 
5037 			if (r) {
5038 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5039 					 r, adev_to_drm(tmp_adev)->unique);
5040 				break;
5041 			}
5042 		}
5043 
5044 		/* For XGMI wait for all resets to complete before proceed */
5045 		if (!r) {
5046 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5047 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5048 					flush_work(&tmp_adev->xgmi_reset_work);
5049 					r = tmp_adev->asic_reset_res;
5050 					if (r)
5051 						break;
5052 				}
5053 			}
5054 		}
5055 	}
5056 
5057 	if (!r && amdgpu_ras_intr_triggered()) {
5058 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5059 			if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5060 			    tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5061 				tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5062 		}
5063 
5064 		amdgpu_ras_intr_cleared();
5065 	}
5066 
5067 	/* Since the mode1 reset affects base ip blocks, the
5068 	 * phase1 ip blocks need to be resumed. Otherwise there
5069 	 * will be a BIOS signature error and the psp bootloader
5070 	 * can't load kdb on the next amdgpu install.
5071 	 */
5072 	if (gpu_reset_for_dev_remove) {
5073 		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5074 			amdgpu_device_ip_resume_phase1(tmp_adev);
5075 
5076 		goto end;
5077 	}
5078 
5079 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5080 		if (need_full_reset) {
5081 			/* post card */
5082 			r = amdgpu_device_asic_init(tmp_adev);
5083 			if (r) {
5084 				dev_warn(tmp_adev->dev, "asic atom init failed!");
5085 			} else {
5086 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5087 
5088 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
5089 				if (r)
5090 					goto out;
5091 
5092 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5093 #ifdef CONFIG_DEV_COREDUMP
5094 				tmp_adev->reset_vram_lost = vram_lost;
5095 				memset(&tmp_adev->reset_task_info, 0,
5096 						sizeof(tmp_adev->reset_task_info));
5097 				if (reset_context->job && reset_context->job->vm)
5098 					tmp_adev->reset_task_info =
5099 						reset_context->job->vm->task_info;
5100 				amdgpu_reset_capture_coredumpm(tmp_adev);
5101 #endif
5102 				if (vram_lost) {
5103 					DRM_INFO("VRAM is lost due to GPU reset!\n");
5104 					amdgpu_inc_vram_lost(tmp_adev);
5105 				}
5106 
5107 				r = amdgpu_device_fw_loading(tmp_adev);
5108 				if (r)
5109 					return r;
5110 
5111 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
5112 				if (r)
5113 					goto out;
5114 
5115 				if (vram_lost)
5116 					amdgpu_device_fill_reset_magic(tmp_adev);
5117 
5118 				/*
5119 				 * Add this ASIC back as tracked since the reset
5120 				 * has already completed successfully.
5121 				 */
5122 				amdgpu_register_gpu_instance(tmp_adev);
5123 
5124 				if (!reset_context->hive &&
5125 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5126 					amdgpu_xgmi_add_device(tmp_adev);
5127 
5128 				r = amdgpu_device_ip_late_init(tmp_adev);
5129 				if (r)
5130 					goto out;
5131 
5132 				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5133 
5134 				/*
5135 				 * The GPU enters a bad state once the number of
5136 				 * faulty pages detected by ECC reaches the
5137 				 * threshold, and RAS recovery is scheduled next.
5138 				 * Add a check here to abort recovery if the bad
5139 				 * page threshold is indeed exceeded, and remind
5140 				 * the user to either retire this GPU or set a
5141 				 * larger bad_page_threshold value before probing
5142 				 * the driver again.
5143 				 */
5144 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5145 					/* must succeed. */
5146 					amdgpu_ras_resume(tmp_adev);
5147 				} else {
5148 					r = -EINVAL;
5149 					goto out;
5150 				}
5151 
5152 				/* Update PSP FW topology after reset */
5153 				if (reset_context->hive &&
5154 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5155 					r = amdgpu_xgmi_update_topology(
5156 						reset_context->hive, tmp_adev);
5157 			}
5158 		}
5159 
5160 out:
5161 		if (!r) {
5162 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5163 			r = amdgpu_ib_ring_tests(tmp_adev);
5164 			if (r) {
5165 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5166 				need_full_reset = true;
5167 				r = -EAGAIN;
5168 				goto end;
5169 			}
5170 		}
5171 
5172 		if (!r)
5173 			r = amdgpu_device_recover_vram(tmp_adev);
5174 		else
5175 			tmp_adev->asic_reset_res = r;
5176 	}
5177 
5178 end:
5179 	if (need_full_reset)
5180 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5181 	else
5182 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5183 	return r;
5184 }
5185 
5186 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5187 {
5188 
5189 	switch (amdgpu_asic_reset_method(adev)) {
5190 	case AMD_RESET_METHOD_MODE1:
5191 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5192 		break;
5193 	case AMD_RESET_METHOD_MODE2:
5194 		adev->mp1_state = PP_MP1_STATE_RESET;
5195 		break;
5196 	default:
5197 		adev->mp1_state = PP_MP1_STATE_NONE;
5198 		break;
5199 	}
5200 
5202 }
5203 
5204 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5205 {
5206 	amdgpu_vf_error_trans_all(adev);
5207 	adev->mp1_state = PP_MP1_STATE_NONE;
5208 }
5209 
5210 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5211 {
5212 	STUB();
5213 #ifdef notyet
5214 	struct pci_dev *p = NULL;
5215 
5216 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5217 			adev->pdev->bus->number, 1);
5218 	if (p) {
5219 		pm_runtime_enable(&(p->dev));
5220 		pm_runtime_resume(&(p->dev));
5221 	}
5222 #endif
5223 }
5224 
5225 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5226 {
5227 	enum amd_reset_method reset_method;
5228 	struct pci_dev *p = NULL;
5229 	u64 expires;
5230 
5231 	/*
5232 	 * For now, only BACO and mode1 reset are confirmed
5233 	 * to suffer from the audio issue if it is not properly suspended.
5234 	 */
5235 	reset_method = amdgpu_asic_reset_method(adev);
5236 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
5237 	     (reset_method != AMD_RESET_METHOD_MODE1))
5238 		return -EINVAL;
5239 
5240 	STUB();
5241 	return -ENOSYS;
5242 #ifdef notyet
5243 
5244 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5245 			adev->pdev->bus->number, 1);
5246 	if (!p)
5247 		return -ENODEV;
5248 
5249 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
5250 	if (!expires)
5251 		/*
5252 		 * If we cannot get the audio device autosuspend delay,
5253 		 * a fixed 4S interval will be used. Since 3S is the
5254 		 * audio controller's default autosuspend delay setting,
5255 		 * the 4S used here is guaranteed to cover it.
5256 		 */
5257 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5258 
5259 	while (!pm_runtime_status_suspended(&(p->dev))) {
5260 		if (!pm_runtime_suspend(&(p->dev)))
5261 			break;
5262 
5263 		if (expires < ktime_get_mono_fast_ns()) {
5264 			dev_warn(adev->dev, "failed to suspend display audio\n");
5265 			pci_dev_put(p);
5266 			/* TODO: abort the succeeding gpu reset? */
5267 			return -ETIMEDOUT;
5268 		}
5269 	}
5270 
5271 	pm_runtime_disable(&(p->dev));
5272 
5273 	pci_dev_put(p);
5274 	return 0;
5275 #endif
5276 }
5277 
5278 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5279 {
5280 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5281 
5282 #if defined(CONFIG_DEBUG_FS)
5283 	if (!amdgpu_sriov_vf(adev))
5284 		cancel_work(&adev->reset_work);
5285 #endif
5286 
5287 	if (adev->kfd.dev)
5288 		cancel_work(&adev->kfd.reset_work);
5289 
5290 	if (amdgpu_sriov_vf(adev))
5291 		cancel_work(&adev->virt.flr_work);
5292 
5293 	if (con && adev->ras_enabled)
5294 		cancel_work(&con->recovery_work);
5295 
5296 }
5297 
5298 /**
5299  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5300  *
5301  * @adev: amdgpu_device pointer
5302  * @job: the job that triggered the hang
5303  * @reset_context: amdgpu reset context pointer
5304  *
5305  * Attempt to reset the GPU if it has hung (all asics).
5306  * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
5307  * Returns 0 for success or an error on failure.
5308  */
5309 
5310 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5311 			      struct amdgpu_job *job,
5312 			      struct amdgpu_reset_context *reset_context)
5313 {
5314 	struct list_head device_list, *device_list_handle =  NULL;
5315 	bool job_signaled = false;
5316 	struct amdgpu_hive_info *hive = NULL;
5317 	struct amdgpu_device *tmp_adev = NULL;
5318 	int i, r = 0;
5319 	bool need_emergency_restart = false;
5320 	bool audio_suspended = false;
5321 	bool gpu_reset_for_dev_remove = false;
5322 
5323 	gpu_reset_for_dev_remove =
5324 			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5325 				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5326 
5327 	/*
5328 	 * Special case: RAS triggered and full reset isn't supported
5329 	 */
5330 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5331 
5332 	/*
5333 	 * Flush RAM to disk so that after reboot
5334 	 * the user can read the log and see why the system rebooted.
5335 	 */
5336 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5337 		amdgpu_ras_get_context(adev)->reboot) {
5338 		DRM_WARN("Emergency reboot.");
5339 
5340 #ifdef notyet
5341 		ksys_sync_helper();
5342 		emergency_restart();
5343 #else
5344 		panic("emergency_restart");
5345 #endif
5346 	}
5347 
5348 	dev_info(adev->dev, "GPU %s begin!\n",
5349 		need_emergency_restart ? "jobs stop":"reset");
5350 
5351 	if (!amdgpu_sriov_vf(adev))
5352 		hive = amdgpu_get_xgmi_hive(adev);
5353 	if (hive)
5354 		mutex_lock(&hive->hive_lock);
5355 
5356 	reset_context->job = job;
5357 	reset_context->hive = hive;
5358 	/*
5359 	 * Build list of devices to reset.
5360 	 * In case we are in XGMI hive mode, resort the device list
5361 	 * to put adev in the 1st position.
5362 	 */
5363 	INIT_LIST_HEAD(&device_list);
5364 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5365 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5366 			list_add_tail(&tmp_adev->reset_list, &device_list);
5367 			if (gpu_reset_for_dev_remove && adev->shutdown)
5368 				tmp_adev->shutdown = true;
5369 		}
5370 		if (!list_is_first(&adev->reset_list, &device_list))
5371 			list_rotate_to_front(&adev->reset_list, &device_list);
5372 		device_list_handle = &device_list;
5373 	} else {
5374 		list_add_tail(&adev->reset_list, &device_list);
5375 		device_list_handle = &device_list;
5376 	}
5377 
5378 	/* We need to lock the reset domain only once, both for XGMI and single device */
5379 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5380 				    reset_list);
5381 	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5382 
5383 	/* block all schedulers and reset given job's ring */
5384 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5385 
5386 		amdgpu_device_set_mp1_state(tmp_adev);
5387 
5388 		/*
5389 		 * Try to put the audio codec into suspend state
5390 		 * before the gpu reset starts.
5391 		 *
5392 		 * The power domain of the graphics device is shared
5393 		 * with the AZ power domain. Without taking care of
5394 		 * this, we may change the audio hardware from behind
5395 		 * the audio driver's back and trigger audio codec
5396 		 * errors.
5397 		 */
5398 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
5399 			audio_suspended = true;
5400 
5401 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
5402 
5403 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5404 
5405 		if (!amdgpu_sriov_vf(tmp_adev))
5406 			amdgpu_amdkfd_pre_reset(tmp_adev);
5407 
5408 		/*
5409 		 * Mark these ASICs to be reset as untracked first,
5410 		 * and add them back after the reset completes.
5411 		 */
5412 		amdgpu_unregister_gpu_instance(tmp_adev);
5413 
5414 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5415 
5416 		/* disable ras on ALL IPs */
5417 		if (!need_emergency_restart &&
5418 		      amdgpu_device_ip_need_full_reset(tmp_adev))
5419 			amdgpu_ras_suspend(tmp_adev);
5420 
5421 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5422 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5423 
5424 			if (!ring || !ring->sched.thread)
5425 				continue;
5426 
5427 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5428 
5429 			if (need_emergency_restart)
5430 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5431 		}
5432 		atomic_inc(&tmp_adev->gpu_reset_counter);
5433 	}
5434 
5435 	if (need_emergency_restart)
5436 		goto skip_sched_resume;
5437 
5438 	/*
5439 	 * Must check guilty signal here since after this point all old
5440 	 * HW fences are force signaled.
5441 	 *
5442 	 * job->base holds a reference to parent fence
5443 	 */
5444 	if (job && dma_fence_is_signaled(&job->hw_fence)) {
5445 		job_signaled = true;
5446 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5447 		goto skip_hw_reset;
5448 	}
5449 
5450 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
5451 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5452 		if (gpu_reset_for_dev_remove) {
5453 			/* Workaround for ASICs that need to disable SMC first */
5454 			amdgpu_device_smu_fini_early(tmp_adev);
5455 		}
5456 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5457 		/* TODO: Should we stop? */
5458 		if (r) {
5459 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5460 				  r, adev_to_drm(tmp_adev)->unique);
5461 			tmp_adev->asic_reset_res = r;
5462 		}
5463 
5464 		/*
5465 		 * Drop all pending non-scheduler resets. Scheduler resets
5466 		 * were already dropped during drm_sched_stop
5467 		 */
5468 		amdgpu_device_stop_pending_resets(tmp_adev);
5469 	}
5470 
5471 	/* Actual ASIC resets if needed. */
5472 	/* Host driver will handle XGMI hive reset for SRIOV */
5473 	if (amdgpu_sriov_vf(adev)) {
5474 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
5475 		if (r)
5476 			adev->asic_reset_res = r;
5477 
5478 		/* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so resume RAS during reset */
5479 		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5480 		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5481 			amdgpu_ras_resume(adev);
5482 	} else {
5483 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5484 		if (r && r == -EAGAIN)
5485 			goto retry;
5486 
5487 		if (!r && gpu_reset_for_dev_remove)
5488 			goto recover_end;
5489 	}
5490 
5491 skip_hw_reset:
5492 
5493 	/* Post ASIC reset for all devs. */
5494 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5495 
5496 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5497 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5498 
5499 			if (!ring || !ring->sched.thread)
5500 				continue;
5501 
5502 			drm_sched_start(&ring->sched, true);
5503 		}
5504 
5505 		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5506 			amdgpu_mes_self_test(tmp_adev);
5507 
5508 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5509 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5510 
5511 		if (tmp_adev->asic_reset_res)
5512 			r = tmp_adev->asic_reset_res;
5513 
5514 		tmp_adev->asic_reset_res = 0;
5515 
5516 		if (r) {
5517 			/* bad news, how do we tell it to userspace? */
5518 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5519 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5520 		} else {
5521 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5522 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5523 				DRM_WARN("smart shift update failed\n");
5524 		}
5525 	}
5526 
5527 skip_sched_resume:
5528 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5529 		/* unlock kfd: SRIOV would do it separately */
5530 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5531 			amdgpu_amdkfd_post_reset(tmp_adev);
5532 
5533 		/* kfd_post_reset will do nothing if kfd device is not initialized,
5534 		 * need to bring up kfd here if it was not initialized before
5535 		 */
5536 		if (!adev->kfd.init_complete)
5537 			amdgpu_amdkfd_device_init(adev);
5538 
5539 		if (audio_suspended)
5540 			amdgpu_device_resume_display_audio(tmp_adev);
5541 
5542 		amdgpu_device_unset_mp1_state(tmp_adev);
5543 
5544 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
5545 	}
5546 
5547 recover_end:
5548 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5549 					    reset_list);
5550 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5551 
5552 	if (hive) {
5553 		mutex_unlock(&hive->hive_lock);
5554 		amdgpu_put_xgmi_hive(hive);
5555 	}
5556 
5557 	if (r)
5558 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5559 
5560 	atomic_set(&adev->reset_domain->reset_res, r);
5561 	return r;
5562 }
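
/*
 * Editorial sketch (not part of the driver): a minimal, hypothetical caller
 * of amdgpu_device_gpu_recover(), shown only to illustrate how the reset
 * context is typically filled in. The function name and the flag choice
 * below are assumptions; the real entry points (e.g. the job timeout
 * handler) live elsewhere.
 */
#if 0	/* illustrative only */
static void example_recover_from_hang(struct amdgpu_device *adev,
				      struct amdgpu_job *bad_job)
{
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));
	reset_context.method = AMD_RESET_METHOD_NONE;	/* let the driver pick */
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	if (amdgpu_device_gpu_recover(adev, bad_job, &reset_context))
		dev_err(adev->dev, "example: GPU recovery failed\n");
}
#endif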
5563 
5564 /**
5565  * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
5566  *
5567  * @adev: amdgpu_device pointer
5568  *
5569  * Fetches and stores in the driver the PCIE capabilities (gen speed
5570  * and lanes) of the slot the device is in. Handles APUs and
5571  * virtualized environments where PCIE config space may not be available.
5572  */
5573 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5574 {
5575 	struct pci_dev *pdev;
5576 	enum pci_bus_speed speed_cap, platform_speed_cap;
5577 	enum pcie_link_width platform_link_width;
5578 
5579 	if (amdgpu_pcie_gen_cap)
5580 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5581 
5582 	if (amdgpu_pcie_lane_cap)
5583 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5584 
5585 	/* covers APUs as well */
5586 	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5587 		if (adev->pm.pcie_gen_mask == 0)
5588 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5589 		if (adev->pm.pcie_mlw_mask == 0)
5590 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5591 		return;
5592 	}
5593 
5594 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5595 		return;
5596 
5597 	pcie_bandwidth_available(adev->pdev, NULL,
5598 				 &platform_speed_cap, &platform_link_width);
5599 
5600 	if (adev->pm.pcie_gen_mask == 0) {
5601 		/* asic caps */
5602 		pdev = adev->pdev;
5603 		speed_cap = pcie_get_speed_cap(pdev);
5604 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5605 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5606 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5607 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5608 		} else {
5609 			if (speed_cap == PCIE_SPEED_32_0GT)
5610 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5611 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5612 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5613 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5614 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5615 			else if (speed_cap == PCIE_SPEED_16_0GT)
5616 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5617 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5618 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5619 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5620 			else if (speed_cap == PCIE_SPEED_8_0GT)
5621 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5622 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5623 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5624 			else if (speed_cap == PCIE_SPEED_5_0GT)
5625 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5626 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5627 			else
5628 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5629 		}
5630 		/* platform caps */
5631 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5632 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5633 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5634 		} else {
5635 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5636 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5637 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5638 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5639 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5640 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5641 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5642 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5643 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5644 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5645 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5646 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5647 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5648 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5649 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5650 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5651 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5652 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5653 			else
5654 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5655 
5656 		}
5657 	}
5658 	if (adev->pm.pcie_mlw_mask == 0) {
5659 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5660 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5661 		} else {
5662 			switch (platform_link_width) {
5663 			case PCIE_LNK_X32:
5664 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5665 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5666 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5667 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5668 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5669 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5670 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5671 				break;
5672 			case PCIE_LNK_X16:
5673 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5674 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5675 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5676 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5677 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5678 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5679 				break;
5680 			case PCIE_LNK_X12:
5681 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5682 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5683 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5684 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5685 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5686 				break;
5687 			case PCIE_LNK_X8:
5688 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5689 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5690 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5691 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5692 				break;
5693 			case PCIE_LNK_X4:
5694 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5695 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5696 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5697 				break;
5698 			case PCIE_LNK_X2:
5699 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5700 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5701 				break;
5702 			case PCIE_LNK_X1:
5703 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5704 				break;
5705 			default:
5706 				break;
5707 			}
5708 		}
5709 	}
5710 }
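
/*
 * Editorial sketch: once amdgpu_device_get_pcie_info() has populated
 * adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask, callers can test the
 * individual CAIL_* bits. The helper name below is hypothetical.
 */
#if 0	/* illustrative only */
static bool example_platform_supports_pcie_gen4(struct amdgpu_device *adev)
{
	return !!(adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
}
#endif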
5711 
5712 /**
5713  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5714  *
5715  * @adev: amdgpu_device pointer
5716  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5717  *
5718  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5719  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5720  * @peer_adev.
5721  */
5722 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5723 				      struct amdgpu_device *peer_adev)
5724 {
5725 #ifdef CONFIG_HSA_AMD_P2P
5726 	uint64_t address_mask = peer_adev->dev->dma_mask ?
5727 		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5728 	resource_size_t aper_limit =
5729 		adev->gmc.aper_base + adev->gmc.aper_size - 1;
5730 	bool p2p_access =
5731 		!adev->gmc.xgmi.connected_to_cpu &&
5732 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5733 
5734 	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5735 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5736 		!(adev->gmc.aper_base & address_mask ||
5737 		  aper_limit & address_mask));
5738 #else
5739 	return false;
5740 #endif
5741 }
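
/*
 * Editorial sketch: peer access is directional, so a hypothetical caller
 * that wants bidirectional P2P DMA between two devices would check both
 * directions before enabling it. The helper name is an assumption.
 */
#if 0	/* illustrative only */
static bool example_p2p_possible(struct amdgpu_device *a,
				 struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}
#endif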
5742 
5743 int amdgpu_device_baco_enter(struct drm_device *dev)
5744 {
5745 	struct amdgpu_device *adev = drm_to_adev(dev);
5746 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5747 
5748 	if (!amdgpu_device_supports_baco(dev))
5749 		return -ENOTSUPP;
5750 
5751 	if (ras && adev->ras_enabled &&
5752 	    adev->nbio.funcs->enable_doorbell_interrupt)
5753 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5754 
5755 	return amdgpu_dpm_baco_enter(adev);
5756 }
5757 
5758 int amdgpu_device_baco_exit(struct drm_device *dev)
5759 {
5760 	struct amdgpu_device *adev = drm_to_adev(dev);
5761 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5762 	int ret = 0;
5763 
5764 	if (!amdgpu_device_supports_baco(dev))
5765 		return -ENOTSUPP;
5766 
5767 	ret = amdgpu_dpm_baco_exit(adev);
5768 	if (ret)
5769 		return ret;
5770 
5771 	if (ras && adev->ras_enabled &&
5772 	    adev->nbio.funcs->enable_doorbell_interrupt)
5773 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5774 
5775 	if (amdgpu_passthrough(adev) &&
5776 	    adev->nbio.funcs->clear_doorbell_interrupt)
5777 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
5778 
5779 	return 0;
5780 }
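
/*
 * Editorial sketch: BACO entry and exit are intended to be used as a pair,
 * for example around a runtime suspend/resume cycle. The wrapper below is
 * only an illustration of that pairing, not an existing driver function.
 */
#if 0	/* illustrative only */
static int example_baco_power_cycle(struct drm_device *dev)
{
	int r = amdgpu_device_baco_enter(dev);

	if (r)
		return r;
	/* ... the device sits in BACO here ... */
	return amdgpu_device_baco_exit(dev);
}
#endif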
5781 
5782 /**
5783  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5784  * @pdev: PCI device struct
5785  * @state: PCI channel state
5786  *
5787  * Description: Called when a PCI error is detected.
5788  *
5789  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5790  */
5791 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5792 {
5793 	STUB();
5794 	return 0;
5795 #ifdef notyet
5796 	struct drm_device *dev = pci_get_drvdata(pdev);
5797 	struct amdgpu_device *adev = drm_to_adev(dev);
5798 	int i;
5799 
5800 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5801 
5802 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5803 		DRM_WARN("No support for XGMI hive yet...");
5804 		return PCI_ERS_RESULT_DISCONNECT;
5805 	}
5806 
5807 	adev->pci_channel_state = state;
5808 
5809 	switch (state) {
5810 	case pci_channel_io_normal:
5811 		return PCI_ERS_RESULT_CAN_RECOVER;
5812 	/* Fatal error, prepare for slot reset */
5813 	case pci_channel_io_frozen:
5814 		/*
5815 		 * Locking adev->reset_domain->sem will prevent any external access
5816 		 * to GPU during PCI error recovery
5817 		 */
5818 		amdgpu_device_lock_reset_domain(adev->reset_domain);
5819 		amdgpu_device_set_mp1_state(adev);
5820 
5821 		/*
5822 		 * Block any work scheduling as we do for regular GPU reset
5823 		 * for the duration of the recovery
5824 		 */
5825 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5826 			struct amdgpu_ring *ring = adev->rings[i];
5827 
5828 			if (!ring || !ring->sched.thread)
5829 				continue;
5830 
5831 			drm_sched_stop(&ring->sched, NULL);
5832 		}
5833 		atomic_inc(&adev->gpu_reset_counter);
5834 		return PCI_ERS_RESULT_NEED_RESET;
5835 	case pci_channel_io_perm_failure:
5836 		/* Permanent error, prepare for device removal */
5837 		return PCI_ERS_RESULT_DISCONNECT;
5838 	}
5839 
5840 	return PCI_ERS_RESULT_NEED_RESET;
5841 #endif
5842 }
5843 
5844 /**
5845  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5846  * @pdev: pointer to PCI device
5847  */
5848 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5849 {
5850 
5851 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5852 
5853 	/* TODO - dump whatever for debugging purposes */
5854 
5855 	/* This is called only if amdgpu_pci_error_detected returns
5856 	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
5857 	 * work, so there is no need to reset the slot.
5858 	 */
5859 
5860 	return PCI_ERS_RESULT_RECOVERED;
5861 }
5862 
5863 /**
5864  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5865  * @pdev: PCI device struct
5866  *
5867  * Description: This routine is called by the pci error recovery
5868  * code after the PCI slot has been reset, just before we
5869  * should resume normal operations.
5870  */
5871 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5872 {
5873 	STUB();
5874 	return PCI_ERS_RESULT_RECOVERED;
5875 #ifdef notyet
5876 	struct drm_device *dev = pci_get_drvdata(pdev);
5877 	struct amdgpu_device *adev = drm_to_adev(dev);
5878 	int r, i;
5879 	struct amdgpu_reset_context reset_context;
5880 	u32 memsize;
5881 	struct list_head device_list;
5882 
5883 	DRM_INFO("PCI error: slot reset callback!!\n");
5884 
5885 	memset(&reset_context, 0, sizeof(reset_context));
5886 
5887 	INIT_LIST_HEAD(&device_list);
5888 	list_add_tail(&adev->reset_list, &device_list);
5889 
5890 	/* wait for asic to come out of reset */
5891 	drm_msleep(500);
5892 
5893 	/* Restore PCI confspace */
5894 	amdgpu_device_load_pci_state(pdev);
5895 
5896 	/* confirm  ASIC came out of reset */
5897 	for (i = 0; i < adev->usec_timeout; i++) {
5898 		memsize = amdgpu_asic_get_config_memsize(adev);
5899 
5900 		if (memsize != 0xffffffff)
5901 			break;
5902 		udelay(1);
5903 	}
5904 	if (memsize == 0xffffffff) {
5905 		r = -ETIME;
5906 		goto out;
5907 	}
5908 
5909 	reset_context.method = AMD_RESET_METHOD_NONE;
5910 	reset_context.reset_req_dev = adev;
5911 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5912 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5913 
5914 	adev->no_hw_access = true;
5915 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5916 	adev->no_hw_access = false;
5917 	if (r)
5918 		goto out;
5919 
5920 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5921 
5922 out:
5923 	if (!r) {
5924 		if (amdgpu_device_cache_pci_state(adev->pdev))
5925 			pci_restore_state(adev->pdev);
5926 
5927 		DRM_INFO("PCIe error recovery succeeded\n");
5928 	} else {
5929 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5930 		amdgpu_device_unset_mp1_state(adev);
5931 		amdgpu_device_unlock_reset_domain(adev->reset_domain);
5932 	}
5933 
5934 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5935 #endif
5936 }
5937 
5938 /**
5939  * amdgpu_pci_resume() - resume normal ops after PCI reset
5940  * @pdev: pointer to PCI device
5941  *
5942  * Called when the error recovery driver tells us that its
5943  * OK to resume normal operation.
5944  */
5945 void amdgpu_pci_resume(struct pci_dev *pdev)
5946 {
5947 	STUB();
5948 #ifdef notyet
5949 	struct drm_device *dev = pci_get_drvdata(pdev);
5950 	struct amdgpu_device *adev = drm_to_adev(dev);
5951 	int i;
5952 
5953 
5954 	DRM_INFO("PCI error: resume callback!!\n");
5955 
5956 	/* Only continue execution for the case of pci_channel_io_frozen */
5957 	if (adev->pci_channel_state != pci_channel_io_frozen)
5958 		return;
5959 
5960 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5961 		struct amdgpu_ring *ring = adev->rings[i];
5962 
5963 		if (!ring || !ring->sched.thread)
5964 			continue;
5965 
5966 		drm_sched_start(&ring->sched, true);
5967 	}
5968 
5969 	amdgpu_device_unset_mp1_state(adev);
5970 	amdgpu_device_unlock_reset_domain(adev->reset_domain);
5971 #endif
5972 }
5973 
5974 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5975 {
5976 	return false;
5977 #ifdef notyet
5978 	struct drm_device *dev = pci_get_drvdata(pdev);
5979 	struct amdgpu_device *adev = drm_to_adev(dev);
5980 	int r;
5981 
5982 	r = pci_save_state(pdev);
5983 	if (!r) {
5984 		kfree(adev->pci_state);
5985 
5986 		adev->pci_state = pci_store_saved_state(pdev);
5987 
5988 		if (!adev->pci_state) {
5989 			DRM_ERROR("Failed to store PCI saved state");
5990 			return false;
5991 		}
5992 	} else {
5993 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5994 		return false;
5995 	}
5996 
5997 	return true;
5998 #endif
5999 }
6000 
6001 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6002 {
6003 	STUB();
6004 	return false;
6005 #ifdef notyet
6006 	struct drm_device *dev = pci_get_drvdata(pdev);
6007 	struct amdgpu_device *adev = drm_to_adev(dev);
6008 	int r;
6009 
6010 	if (!adev->pci_state)
6011 		return false;
6012 
6013 	r = pci_load_saved_state(pdev, adev->pci_state);
6014 
6015 	if (!r) {
6016 		pci_restore_state(pdev);
6017 	} else {
6018 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
6019 		return false;
6020 	}
6021 
6022 	return true;
6023 #endif
6024 }
6025 
6026 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6027 		struct amdgpu_ring *ring)
6028 {
6029 #ifdef CONFIG_X86_64
6030 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6031 		return;
6032 #endif
6033 	if (adev->gmc.xgmi.connected_to_cpu)
6034 		return;
6035 
6036 	if (ring && ring->funcs->emit_hdp_flush)
6037 		amdgpu_ring_emit_hdp_flush(ring);
6038 	else
6039 		amdgpu_asic_flush_hdp(adev, ring);
6040 }
6041 
6042 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6043 		struct amdgpu_ring *ring)
6044 {
6045 #ifdef CONFIG_X86_64
6046 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6047 		return;
6048 #endif
6049 	if (adev->gmc.xgmi.connected_to_cpu)
6050 		return;
6051 
6052 	amdgpu_asic_invalidate_hdp(adev, ring);
6053 }
6054 
6055 int amdgpu_in_reset(struct amdgpu_device *adev)
6056 {
6057 	return atomic_read(&adev->reset_domain->in_gpu_reset);
6058 }
6059 
6060 /**
6061  * amdgpu_device_halt() - bring hardware to some kind of halt state
6062  *
6063  * @adev: amdgpu_device pointer
6064  *
6065  * Bring hardware to some kind of halt state so that no one can touch it
6066  * any more. It helps to preserve the error context when an error occurs.
6067  * Compared to a simple hang, the system will stay stable at least for SSH
6068  * access. Then it should be trivial to inspect the hardware state and
6069  * see what's going on. Implemented as follows:
6070  *
6071  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6072  *    clears all CPU mappings to device, disallows remappings through page faults
6073  * 2. amdgpu_irq_disable_all() disables all interrupts
6074  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6075  * 4. set adev->no_hw_access to avoid potential crashes after step 5
6076  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6077  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6078  *    flush any in flight DMA operations
6079  */
6080 void amdgpu_device_halt(struct amdgpu_device *adev)
6081 {
6082 	struct pci_dev *pdev = adev->pdev;
6083 	struct drm_device *ddev = adev_to_drm(adev);
6084 
6085 	amdgpu_xcp_dev_unplug(adev);
6086 	drm_dev_unplug(ddev);
6087 
6088 	amdgpu_irq_disable_all(adev);
6089 
6090 	amdgpu_fence_driver_hw_fini(adev);
6091 
6092 	adev->no_hw_access = true;
6093 
6094 	amdgpu_device_unmap_mmio(adev);
6095 
6096 	pci_disable_device(pdev);
6097 	pci_wait_for_pending_transaction(pdev);
6098 }
6099 
6100 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6101 				u32 reg)
6102 {
6103 	unsigned long flags, address, data;
6104 	u32 r;
6105 
6106 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6107 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6108 
6109 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6110 	WREG32(address, reg * 4);
6111 	(void)RREG32(address);
6112 	r = RREG32(data);
6113 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6114 	return r;
6115 }
6116 
6117 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6118 				u32 reg, u32 v)
6119 {
6120 	unsigned long flags, address, data;
6121 
6122 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6123 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6124 
6125 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6126 	WREG32(address, reg * 4);
6127 	(void)RREG32(address);
6128 	WREG32(data, v);
6129 	(void)RREG32(data);
6130 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6131 }
6132 
6133 /**
6134  * amdgpu_device_switch_gang - switch to a new gang
6135  * @adev: amdgpu_device pointer
6136  * @gang: the gang to switch to
6137  *
6138  * Try to switch to a new gang.
6139  * Returns: NULL if we switched to the new gang or a reference to the current
6140  * gang leader.
6141  */
6142 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6143 					    struct dma_fence *gang)
6144 {
6145 	struct dma_fence *old = NULL;
6146 
6147 	do {
6148 		dma_fence_put(old);
6149 		rcu_read_lock();
6150 		old = dma_fence_get_rcu_safe(&adev->gang_submit);
6151 		rcu_read_unlock();
6152 
6153 		if (old == gang)
6154 			break;
6155 
6156 		if (!dma_fence_is_signaled(old))
6157 			return old;
6158 
6159 	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6160 			 old, gang) != old);
6161 
6162 	dma_fence_put(old);
6163 	return NULL;
6164 }
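
/*
 * Editorial sketch: per the comment above, a non-NULL return from
 * amdgpu_device_switch_gang() means the previous gang leader is still
 * running; a hypothetical caller would wait on that fence and retry
 * rather than submit. The helper name and retry policy are assumptions.
 */
#if 0	/* illustrative only */
static int example_wait_for_gang_switch(struct amdgpu_device *adev,
					struct dma_fence *gang)
{
	struct dma_fence *old = amdgpu_device_switch_gang(adev, gang);

	if (!old)
		return 0;		/* switched, safe to submit */

	dma_fence_wait(old, false);	/* wait for the running leader */
	dma_fence_put(old);
	return -EAGAIN;			/* caller should retry the switch */
}
#endif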
6165 
6166 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6167 {
6168 	switch (adev->asic_type) {
6169 #ifdef CONFIG_DRM_AMDGPU_SI
6170 	case CHIP_HAINAN:
6171 #endif
6172 	case CHIP_TOPAZ:
6173 		/* chips with no display hardware */
6174 		return false;
6175 #ifdef CONFIG_DRM_AMDGPU_SI
6176 	case CHIP_TAHITI:
6177 	case CHIP_PITCAIRN:
6178 	case CHIP_VERDE:
6179 	case CHIP_OLAND:
6180 #endif
6181 #ifdef CONFIG_DRM_AMDGPU_CIK
6182 	case CHIP_BONAIRE:
6183 	case CHIP_HAWAII:
6184 	case CHIP_KAVERI:
6185 	case CHIP_KABINI:
6186 	case CHIP_MULLINS:
6187 #endif
6188 	case CHIP_TONGA:
6189 	case CHIP_FIJI:
6190 	case CHIP_POLARIS10:
6191 	case CHIP_POLARIS11:
6192 	case CHIP_POLARIS12:
6193 	case CHIP_VEGAM:
6194 	case CHIP_CARRIZO:
6195 	case CHIP_STONEY:
6196 		/* chips with display hardware */
6197 		return true;
6198 	default:
6199 		/* IP discovery */
6200 		if (!adev->ip_versions[DCE_HWIP][0] ||
6201 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6202 			return false;
6203 		return true;
6204 	}
6205 }
6206 
6207 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6208 		uint32_t inst, uint32_t reg_addr, char reg_name[],
6209 		uint32_t expected_value, uint32_t mask)
6210 {
6211 	uint32_t ret = 0;
6212 	uint32_t old_ = 0;
6213 	uint32_t tmp_ = RREG32(reg_addr);
6214 	uint32_t loop = adev->usec_timeout;
6215 
6216 	while ((tmp_ & (mask)) != (expected_value)) {
6217 		if (old_ != tmp_) {
6218 			loop = adev->usec_timeout;
6219 			old_ = tmp_;
6220 		} else
6221 			udelay(1);
6222 		tmp_ = RREG32(reg_addr);
6223 		loop--;
6224 		if (!loop) {
6225 			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6226 				  inst, reg_name, (uint32_t)expected_value,
6227 				  (uint32_t)(tmp_ & (mask)));
6228 			ret = -ETIMEDOUT;
6229 			break;
6230 		}
6231 	}
6232 	return ret;
6233 }
6234