xref: /openbsd/sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c (revision b2e9d6ba)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39 
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69 
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72 
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78 
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82 
83 #include <drm/drm_drv.h>
84 
85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__)
86 #include <asm/intel-family.h>
87 #endif
88 
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96 
97 #define AMDGPU_RESUME_MS		2000
98 #define AMDGPU_MAX_RETRY_LIMIT		2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100 
101 static const struct drm_driver amdgpu_kms_driver;
102 
103 const char *amdgpu_asic_name[] = {
104 	"TAHITI",
105 	"PITCAIRN",
106 	"VERDE",
107 	"OLAND",
108 	"HAINAN",
109 	"BONAIRE",
110 	"KAVERI",
111 	"KABINI",
112 	"HAWAII",
113 	"MULLINS",
114 	"TOPAZ",
115 	"TONGA",
116 	"FIJI",
117 	"CARRIZO",
118 	"STONEY",
119 	"POLARIS10",
120 	"POLARIS11",
121 	"POLARIS12",
122 	"VEGAM",
123 	"VEGA10",
124 	"VEGA12",
125 	"VEGA20",
126 	"RAVEN",
127 	"ARCTURUS",
128 	"RENOIR",
129 	"ALDEBARAN",
130 	"NAVI10",
131 	"CYAN_SKILLFISH",
132 	"NAVI14",
133 	"NAVI12",
134 	"SIENNA_CICHLID",
135 	"NAVY_FLOUNDER",
136 	"VANGOGH",
137 	"DIMGREY_CAVEFISH",
138 	"BEIGE_GOBY",
139 	"YELLOW_CARP",
140 	"IP DISCOVERY",
141 	"LAST",
142 };
143 
144 /**
145  * DOC: pcie_replay_count
146  *
147  * The amdgpu driver provides a sysfs API for reporting the total number
148  * of PCIe replays (NAKs).
149  * The file pcie_replay_count is used for this and returns the total
150  * number of replays as the sum of the NAKs generated and NAKs received.
151  */
152 
153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
154 		struct device_attribute *attr, char *buf)
155 {
156 	struct drm_device *ddev = dev_get_drvdata(dev);
157 	struct amdgpu_device *adev = drm_to_adev(ddev);
158 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
159 
160 	return sysfs_emit(buf, "%llu\n", cnt);
161 }
162 
163 static DEVICE_ATTR(pcie_replay_count, 0444,
164 		amdgpu_device_get_pcie_replay_count, NULL);
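
/*
 * Illustrative sketch (not part of this file): userspace reads the
 * attribute defined above through sysfs.  The card index and the exact
 * path are assumptions that depend on the system; kept under #if 0 so it
 * does not affect the build.
 */
#if 0
#include <stdio.h>

static long example_read_pcie_replay_count(void)
{
	/* hypothetical path; the card number varies per system */
	FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
	long cnt = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%ld", &cnt) != 1)
		cnt = -1;
	fclose(f);
	return cnt;
}
#endif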
165 
166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
167 
168 
169 /**
170  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
171  *
172  * @dev: drm_device pointer
173  *
174  * Returns true if the device is a dGPU with ATPX power control,
175  * otherwise returns false.
176  */
177 bool amdgpu_device_supports_px(struct drm_device *dev)
178 {
179 	struct amdgpu_device *adev = drm_to_adev(dev);
180 
181 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
182 		return true;
183 	return false;
184 }
185 
186 /**
187  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
188  *
189  * @dev: drm_device pointer
190  *
191  * Returns true if the device is a dGPU with ACPI power control,
192  * otherwise returns false.
193  */
194 bool amdgpu_device_supports_boco(struct drm_device *dev)
195 {
196 	struct amdgpu_device *adev = drm_to_adev(dev);
197 
198 	if (adev->has_pr3 ||
199 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
200 		return true;
201 	return false;
202 }
203 
204 /**
205  * amdgpu_device_supports_baco - Does the device support BACO
206  *
207  * @dev: drm_device pointer
208  *
209  * Returns true if the device supports BACO,
210  * otherwise returns false.
211  */
212 bool amdgpu_device_supports_baco(struct drm_device *dev)
213 {
214 	struct amdgpu_device *adev = drm_to_adev(dev);
215 
216 	return amdgpu_asic_supports_baco(adev);
217 }
218 
219 /**
220  * amdgpu_device_supports_smart_shift - Is the device a dGPU with
221  * smart shift support
222  *
223  * @dev: drm_device pointer
224  *
225  * Returns true if the device is a dGPU with Smart Shift support,
226  * otherwise returns false.
227  */
228 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
229 {
230 	return (amdgpu_device_supports_boco(dev) &&
231 		amdgpu_acpi_is_power_shift_control_supported());
232 }
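
/*
 * Illustrative sketch (not part of this file): how a caller might pick a
 * runtime power-control scheme from the four helpers above.  The enum and
 * function names are hypothetical; the real driver makes this decision in
 * its runtime-PM setup paths.
 */
#if 0
enum example_pm_scheme { EX_PM_NONE, EX_PM_PX, EX_PM_BOCO, EX_PM_BACO };

static enum example_pm_scheme example_pick_pm_scheme(struct drm_device *dev)
{
	if (amdgpu_device_supports_px(dev))
		return EX_PM_PX;	/* ATPX dGPU power control */
	if (amdgpu_device_supports_boco(dev))
		return EX_PM_BOCO;	/* ACPI power resources (PR3/hybrid) */
	if (amdgpu_device_supports_baco(dev))
		return EX_PM_BACO;	/* Bus Active, Chip Off */
	return EX_PM_NONE;
}
#endif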
233 
234 /*
235  * VRAM access helper functions
236  */
237 
238 /**
239  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
240  *
241  * @adev: amdgpu_device pointer
242  * @pos: offset of the buffer in vram
243  * @buf: virtual address of the buffer in system memory
244  * @size: read/write size; @buf must point to at least @size bytes
245  * @write: true - write to vram, otherwise - read from vram
246  */
247 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
248 			     void *buf, size_t size, bool write)
249 {
250 	unsigned long flags;
251 	uint32_t hi = ~0, tmp = 0;
252 	uint32_t *data = buf;
253 	uint64_t last;
254 	int idx;
255 
256 	if (!drm_dev_enter(adev_to_drm(adev), &idx))
257 		return;
258 
259 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
260 
261 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
262 	for (last = pos + size; pos < last; pos += 4) {
263 		tmp = pos >> 31;
264 
265 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
266 		if (tmp != hi) {
267 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
268 			hi = tmp;
269 		}
270 		if (write)
271 			WREG32_NO_KIQ(mmMM_DATA, *data++);
272 		else
273 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
274 	}
275 
276 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
277 	drm_dev_exit(idx);
278 }
279 
280 /**
281  * amdgpu_device_aper_access - access vram via the vram aperture
282  *
283  * @adev: amdgpu_device pointer
284  * @pos: offset of the buffer in vram
285  * @buf: virtual address of the buffer in system memory
286  * @size: read/write size; @buf must point to at least @size bytes
287  * @write: true - write to vram, otherwise - read from vram
288  *
289  * Returns the number of bytes transferred.
290  */
291 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
292 				 void *buf, size_t size, bool write)
293 {
294 #ifdef CONFIG_64BIT
295 	void __iomem *addr;
296 	size_t count = 0;
297 	uint64_t last;
298 
299 	if (!adev->mman.aper_base_kaddr)
300 		return 0;
301 
302 	last = min(pos + size, adev->gmc.visible_vram_size);
303 	if (last > pos) {
304 		addr = adev->mman.aper_base_kaddr + pos;
305 		count = last - pos;
306 
307 		if (write) {
308 			memcpy_toio(addr, buf, count);
309 			/* Make sure the HDP write cache flush happens without any reordering
310 			 * after the system memory contents are sent over PCIe to the device
311 			 */
312 			mb();
313 			amdgpu_device_flush_hdp(adev, NULL);
314 		} else {
315 			amdgpu_device_invalidate_hdp(adev, NULL);
316 			/* Make sure HDP read cache is invalidated before issuing a read
317 			 * to the PCIe device
318 			 */
319 			mb();
320 			memcpy_fromio(buf, addr, count);
321 		}
322 
323 	}
324 
325 	return count;
326 #else
327 	return 0;
328 #endif
329 }
330 
331 /**
332  * amdgpu_device_vram_access - read/write a buffer in vram
333  *
334  * @adev: amdgpu_device pointer
335  * @pos: offset of the buffer in vram
336  * @buf: virtual address of the buffer in system memory
337  * @size: read/write size; @buf must point to at least @size bytes
338  * @write: true - write to vram, otherwise - read from vram
339  */
340 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
341 			       void *buf, size_t size, bool write)
342 {
343 	size_t count;
344 
345 	/* try using the vram aperture to access vram first */
346 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
347 	size -= count;
348 	if (size) {
349 		/* use MM_INDEX/MM_DATA to access the rest of vram */
350 		pos += count;
351 		buf += count;
352 		amdgpu_device_mm_access(adev, pos, buf, size, write);
353 	}
354 }
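
/*
 * Illustrative sketch (not part of this file): a round trip through the
 * helper above.  The VRAM offset is arbitrary; pos and size must be dword
 * aligned in case the MM_INDEX/MM_DATA fallback path is taken.
 */
#if 0
static void example_vram_round_trip(struct amdgpu_device *adev)
{
	uint32_t buf[4];

	/* read 16 bytes starting at VRAM offset 0x1000 */
	amdgpu_device_vram_access(adev, 0x1000, buf, sizeof(buf), false);

	/* write zeros back to the same region */
	memset(buf, 0, sizeof(buf));
	amdgpu_device_vram_access(adev, 0x1000, buf, sizeof(buf), true);
}
#endif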
355 
356 /*
357  * register access helper functions.
358  */
359 
360 /* Check if hw access should be skipped because of hotplug or device error */
361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
362 {
363 	if (adev->no_hw_access)
364 		return true;
365 
366 #ifdef CONFIG_LOCKDEP
367 	/*
368 	 * This is a bit complicated to understand, so worth a comment. What we assert
369 	 * here is that the GPU reset is not running on another thread in parallel.
370 	 *
371 	 * For this we trylock the read side of the reset semaphore, if that succeeds
372  * we know that the reset is not running in parallel.
373 	 *
374 	 * If the trylock fails we assert that we are either already holding the read
375 	 * side of the lock or are the reset thread itself and hold the write side of
376 	 * the lock.
377 	 */
378 	if (in_task()) {
379 		if (down_read_trylock(&adev->reset_domain->sem))
380 			up_read(&adev->reset_domain->sem);
381 		else
382 			lockdep_assert_held(&adev->reset_domain->sem);
383 	}
384 #endif
385 	return false;
386 }
387 
388 /**
389  * amdgpu_device_rreg - read a memory mapped IO or indirect register
390  *
391  * @adev: amdgpu_device pointer
392  * @reg: dword aligned register offset
393  * @acc_flags: access flags which require special behavior
394  *
395  * Returns the 32 bit value from the offset specified.
396  */
397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
398 			    uint32_t reg, uint32_t acc_flags)
399 {
400 	uint32_t ret;
401 
402 	if (amdgpu_device_skip_hw_access(adev))
403 		return 0;
404 
405 	if ((reg * 4) < adev->rmmio_size) {
406 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
407 		    amdgpu_sriov_runtime(adev) &&
408 		    down_read_trylock(&adev->reset_domain->sem)) {
409 			ret = amdgpu_kiq_rreg(adev, reg);
410 			up_read(&adev->reset_domain->sem);
411 		} else {
412 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
413 		}
414 	} else {
415 		ret = adev->pcie_rreg(adev, reg * 4);
416 	}
417 
418 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
419 
420 	return ret;
421 }
422 
423 /*
424  * Byte-wide MMIO register read helper
425  * @offset: byte offset from MMIO start
426  */
427 
428 /**
429  * amdgpu_mm_rreg8 - read a memory mapped IO register
430  *
431  * @adev: amdgpu_device pointer
432  * @offset: byte aligned register offset
433  *
434  * Returns the 8 bit value from the offset specified.
435  */
436 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
437 {
438 	if (amdgpu_device_skip_hw_access(adev))
439 		return 0;
440 
441 	if (offset < adev->rmmio_size)
442 		return (readb(adev->rmmio + offset));
443 	BUG();
444 }
445 
446 /*
447  * Byte-wide MMIO register write helper
448  * @offset: byte offset from MMIO start
449  * @value: the value to be written to the register
450  */
451 
452 /**
453  * amdgpu_mm_wreg8 - write a memory mapped IO register
454  *
455  * @adev: amdgpu_device pointer
456  * @offset: byte aligned register offset
457  * @value: 8 bit value to write
458  *
459  * Writes the value specified to the offset specified.
460  */
461 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
462 {
463 	if (amdgpu_device_skip_hw_access(adev))
464 		return;
465 
466 	if (offset < adev->rmmio_size)
467 		writeb(value, adev->rmmio + offset);
468 	else
469 		BUG();
470 }
471 
472 /**
473  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
474  *
475  * @adev: amdgpu_device pointer
476  * @reg: dword aligned register offset
477  * @v: 32 bit value to write to the register
478  * @acc_flags: access flags which require special behavior
479  *
480  * Writes the value specified to the offset specified.
481  */
482 void amdgpu_device_wreg(struct amdgpu_device *adev,
483 			uint32_t reg, uint32_t v,
484 			uint32_t acc_flags)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return;
488 
489 	if ((reg * 4) < adev->rmmio_size) {
490 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
491 		    amdgpu_sriov_runtime(adev) &&
492 		    down_read_trylock(&adev->reset_domain->sem)) {
493 			amdgpu_kiq_wreg(adev, reg, v);
494 			up_read(&adev->reset_domain->sem);
495 		} else {
496 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
497 		}
498 	} else {
499 		adev->pcie_wreg(adev, reg * 4, v);
500 	}
501 
502 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
503 }
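
/*
 * Illustrative sketch (not part of this file): a read-modify-write cycle
 * built from the two helpers above.  In-tree code normally goes through the
 * RREG32()/WREG32() wrappers, which expand to these functions; the register
 * offset and mask here are placeholders.
 */
#if 0
static void example_rmw_register(struct amdgpu_device *adev,
				 uint32_t reg, uint32_t mask, uint32_t val)
{
	uint32_t tmp = amdgpu_device_rreg(adev, reg, 0);

	tmp &= ~mask;
	tmp |= (val & mask);
	amdgpu_device_wreg(adev, reg, tmp, 0);
}
#endif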
504 
505 /**
506  * amdgpu_mm_wreg_mmio_rlc - write a register with direct/indirect mmio or via the RLC path if in range
507  *
508  * @adev: amdgpu_device pointer
509  * @reg: mmio/rlc register
510  * @v: value to write
511  *
512  * This function is invoked only for debugfs register access.
513  */
514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
515 			     uint32_t reg, uint32_t v,
516 			     uint32_t xcc_id)
517 {
518 	if (amdgpu_device_skip_hw_access(adev))
519 		return;
520 
521 	if (amdgpu_sriov_fullaccess(adev) &&
522 	    adev->gfx.rlc.funcs &&
523 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
524 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
525 			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
526 	} else if ((reg * 4) >= adev->rmmio_size) {
527 		adev->pcie_wreg(adev, reg * 4, v);
528 	} else {
529 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
530 	}
531 }
532 
533 /**
534  * amdgpu_device_indirect_rreg - read an indirect register
535  *
536  * @adev: amdgpu_device pointer
537  * @reg_addr: indirect register address to read from
538  *
539  * Returns the value of indirect register @reg_addr
540  */
541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
542 				u32 reg_addr)
543 {
544 	unsigned long flags, pcie_index, pcie_data;
545 	void __iomem *pcie_index_offset;
546 	void __iomem *pcie_data_offset;
547 	u32 r;
548 
549 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
550 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
551 
552 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
553 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
554 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
555 
556 	writel(reg_addr, pcie_index_offset);
557 	readl(pcie_index_offset);
558 	r = readl(pcie_data_offset);
559 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
560 
561 	return r;
562 }
563 
564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
565 				    u64 reg_addr)
566 {
567 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
568 	u32 r;
569 	void __iomem *pcie_index_offset;
570 	void __iomem *pcie_index_hi_offset;
571 	void __iomem *pcie_data_offset;
572 
573 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
574 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
575 	if (adev->nbio.funcs->get_pcie_index_hi_offset)
576 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
577 	else
578 		pcie_index_hi = 0;
579 
580 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
581 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
582 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
583 	if (pcie_index_hi != 0)
584 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
585 				pcie_index_hi * 4;
586 
587 	writel(reg_addr, pcie_index_offset);
588 	readl(pcie_index_offset);
589 	if (pcie_index_hi != 0) {
590 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
591 		readl(pcie_index_hi_offset);
592 	}
593 	r = readl(pcie_data_offset);
594 
595 	/* clear the high bits */
596 	if (pcie_index_hi != 0) {
597 		writel(0, pcie_index_hi_offset);
598 		readl(pcie_index_hi_offset);
599 	}
600 
601 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602 
603 	return r;
604 }
605 
606 /**
607  * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
608  *
609  * @adev: amdgpu_device pointer
610  * @reg_addr: indirect register address to read from
611  *
612  * Returns the value of indirect register @reg_addr
613  */
614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
615 				  u32 reg_addr)
616 {
617 	unsigned long flags, pcie_index, pcie_data;
618 	void __iomem *pcie_index_offset;
619 	void __iomem *pcie_data_offset;
620 	u64 r;
621 
622 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
623 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
624 
625 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
626 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
627 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
628 
629 	/* read low 32 bits */
630 	writel(reg_addr, pcie_index_offset);
631 	readl(pcie_index_offset);
632 	r = readl(pcie_data_offset);
633 	/* read high 32 bits */
634 	writel(reg_addr + 4, pcie_index_offset);
635 	readl(pcie_index_offset);
636 	r |= ((u64)readl(pcie_data_offset) << 32);
637 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
638 
639 	return r;
640 }
641 
642 /**
643  * amdgpu_device_indirect_wreg - write to an indirect register
644  *
645  * @adev: amdgpu_device pointer
646  * @reg_addr: indirect register offset
647  * @reg_data: indirect register data
648  *
649  */
650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
651 				 u32 reg_addr, u32 reg_data)
652 {
653 	unsigned long flags, pcie_index, pcie_data;
654 	void __iomem *pcie_index_offset;
655 	void __iomem *pcie_data_offset;
656 
657 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
658 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
659 
660 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
661 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
662 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
663 
664 	writel(reg_addr, pcie_index_offset);
665 	readl(pcie_index_offset);
666 	writel(reg_data, pcie_data_offset);
667 	readl(pcie_data_offset);
668 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
669 }
670 
671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
672 				     u64 reg_addr, u32 reg_data)
673 {
674 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
675 	void __iomem *pcie_index_offset;
676 	void __iomem *pcie_index_hi_offset;
677 	void __iomem *pcie_data_offset;
678 
679 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
680 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
681 	if (adev->nbio.funcs->get_pcie_index_hi_offset)
682 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
683 	else
684 		pcie_index_hi = 0;
685 
686 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
687 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
688 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
689 	if (pcie_index_hi != 0)
690 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
691 				pcie_index_hi * 4;
692 
693 	writel(reg_addr, pcie_index_offset);
694 	readl(pcie_index_offset);
695 	if (pcie_index_hi != 0) {
696 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
697 		readl(pcie_index_hi_offset);
698 	}
699 	writel(reg_data, pcie_data_offset);
700 	readl(pcie_data_offset);
701 
702 	/* clear the high bits */
703 	if (pcie_index_hi != 0) {
704 		writel(0, pcie_index_hi_offset);
705 		readl(pcie_index_hi_offset);
706 	}
707 
708 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
709 }
710 
711 /**
712  * amdgpu_device_indirect_wreg64 - write to a 64 bit indirect register
713  *
714  * @adev: amdgpu_device pointer
715  * @reg_addr: indirect register offset
716  * @reg_data: indirect register data
717  *
718  */
719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
720 				   u32 reg_addr, u64 reg_data)
721 {
722 	unsigned long flags, pcie_index, pcie_data;
723 	void __iomem *pcie_index_offset;
724 	void __iomem *pcie_data_offset;
725 
726 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
727 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
728 
729 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
730 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
731 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
732 
733 	/* write low 32 bits */
734 	writel(reg_addr, pcie_index_offset);
735 	readl(pcie_index_offset);
736 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
737 	readl(pcie_data_offset);
738 	/* write high 32 bits */
739 	writel(reg_addr + 4, pcie_index_offset);
740 	readl(pcie_index_offset);
741 	writel((u32)(reg_data >> 32), pcie_data_offset);
742 	readl(pcie_data_offset);
743 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
744 }
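
/*
 * Illustrative sketch (not part of this file): SoC init code points the
 * per-device indirect accessors at the helpers above so that the
 * RREG32_PCIE()/WREG32_PCIE() style macros work.  The assignments mirror
 * what the per-SoC files (e.g. soc15.c, nv.c) do; treat the exact hook-up
 * shown here as an assumption.
 */
#if 0
static void example_init_pcie_callbacks(struct amdgpu_device *adev)
{
	adev->pcie_rreg = &amdgpu_device_indirect_rreg;
	adev->pcie_wreg = &amdgpu_device_indirect_wreg;
	adev->pcie_rreg64 = &amdgpu_device_indirect_rreg64;
	adev->pcie_wreg64 = &amdgpu_device_indirect_wreg64;
}
#endif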
745 
746 /**
747  * amdgpu_device_get_rev_id - query device rev_id
748  *
749  * @adev: amdgpu_device pointer
750  *
751  * Return device rev_id
752  */
753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
754 {
755 	return adev->nbio.funcs->get_rev_id(adev);
756 }
757 
758 /**
759  * amdgpu_invalid_rreg - dummy reg read function
760  *
761  * @adev: amdgpu_device pointer
762  * @reg: offset of register
763  *
764  * Dummy register read function.  Used for register blocks
765  * that certain asics don't have (all asics).
766  * Returns the value in the register.
767  */
768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
769 {
770 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
771 	BUG();
772 	return 0;
773 }
774 
775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
776 {
777 	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
778 	BUG();
779 	return 0;
780 }
781 
782 /**
783  * amdgpu_invalid_wreg - dummy reg write function
784  *
785  * @adev: amdgpu_device pointer
786  * @reg: offset of register
787  * @v: value to write to the register
788  *
789  * Dummy register write function.  Used for register blocks
790  * that certain asics don't have (all asics).
791  */
792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
793 {
794 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
795 		  reg, v);
796 	BUG();
797 }
798 
799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
800 {
801 	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
802 		  reg, v);
803 	BUG();
804 }
805 
806 /**
807  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
808  *
809  * @adev: amdgpu_device pointer
810  * @reg: offset of register
811  *
812  * Dummy register read function.  Used for register blocks
813  * that certain asics don't have (all asics).
814  * Returns the value in the register.
815  */
816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
817 {
818 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
819 	BUG();
820 	return 0;
821 }
822 
823 /**
824  * amdgpu_invalid_wreg64 - dummy reg write function
825  *
826  * @adev: amdgpu_device pointer
827  * @reg: offset of register
828  * @v: value to write to the register
829  *
830  * Dummy register write function.  Used for register blocks
831  * that certain asics don't have (all asics).
832  */
833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
834 {
835 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
836 		  reg, v);
837 	BUG();
838 }
839 
840 /**
841  * amdgpu_block_invalid_rreg - dummy reg read function
842  *
843  * @adev: amdgpu_device pointer
844  * @block: offset of instance
845  * @reg: offset of register
846  *
847  * Dummy register read function.  Used for register blocks
848  * that certain asics don't have (all asics).
849  * Returns the value in the register.
850  */
851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
852 					  uint32_t block, uint32_t reg)
853 {
854 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
855 		  reg, block);
856 	BUG();
857 	return 0;
858 }
859 
860 /**
861  * amdgpu_block_invalid_wreg - dummy reg write function
862  *
863  * @adev: amdgpu_device pointer
864  * @block: offset of instance
865  * @reg: offset of register
866  * @v: value to write to the register
867  *
868  * Dummy register write function.  Used for register blocks
869  * that certain asics don't have (all asics).
870  */
871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
872 				      uint32_t block,
873 				      uint32_t reg, uint32_t v)
874 {
875 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
876 		  reg, block, v);
877 	BUG();
878 }
879 
880 /**
881  * amdgpu_device_asic_init - Wrapper for atom asic_init
882  *
883  * @adev: amdgpu_device pointer
884  *
885  * Does any asic specific work and then calls atom asic init.
886  */
887 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
888 {
889 	int ret;
890 
891 	amdgpu_asic_pre_asic_init(adev);
892 
893 	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
894 	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
895 		amdgpu_psp_wait_for_bootloader(adev);
896 		ret = amdgpu_atomfirmware_asic_init(adev, true);
897 		return ret;
898 	} else {
899 		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
900 	}
901 
902 	return 0;
903 }
904 
905 /**
906  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
907  *
908  * @adev: amdgpu_device pointer
909  *
910  * Allocates a scratch page of VRAM for use by various things in the
911  * driver.
912  */
913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
914 {
915 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
916 				       AMDGPU_GEM_DOMAIN_VRAM |
917 				       AMDGPU_GEM_DOMAIN_GTT,
918 				       &adev->mem_scratch.robj,
919 				       &adev->mem_scratch.gpu_addr,
920 				       (void **)&adev->mem_scratch.ptr);
921 }
922 
923 /**
924  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
925  *
926  * @adev: amdgpu_device pointer
927  *
928  * Frees the VRAM scratch page.
929  */
930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
931 {
932 	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
933 }
934 
935 /**
936  * amdgpu_device_program_register_sequence - program an array of registers.
937  *
938  * @adev: amdgpu_device pointer
939  * @registers: pointer to the register array
940  * @array_size: size of the register array
941  *
942  * Programs an array of registers with AND/OR masks.
943  * This is a helper for setting golden registers.
944  */
945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
946 					     const u32 *registers,
947 					     const u32 array_size)
948 {
949 	u32 tmp, reg, and_mask, or_mask;
950 	int i;
951 
952 	if (array_size % 3)
953 		return;
954 
955 	for (i = 0; i < array_size; i += 3) {
956 		reg = registers[i + 0];
957 		and_mask = registers[i + 1];
958 		or_mask = registers[i + 2];
959 
960 		if (and_mask == 0xffffffff) {
961 			tmp = or_mask;
962 		} else {
963 			tmp = RREG32(reg);
964 			tmp &= ~and_mask;
965 			if (adev->family >= AMDGPU_FAMILY_AI)
966 				tmp |= (or_mask & and_mask);
967 			else
968 				tmp |= or_mask;
969 		}
970 		WREG32(reg, tmp);
971 	}
972 }
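
/*
 * Illustrative sketch (not part of this file): the expected layout of such
 * an array is {offset, AND mask, OR value} triples.  An AND mask of
 * 0xffffffff overwrites the register with the OR value outright; otherwise
 * the masked field is replaced.  The offsets and values below are made up;
 * real tables are the per-ASIC golden_settings_* arrays.
 */
#if 0
static const u32 example_golden_settings[] = {
	/* reg        and_mask    or_mask */
	0x0000262c, 0xffffffff, 0x00000800,	/* full overwrite */
	0x00009508, 0x00010000, 0x00010000,	/* update one field */
};

static void example_program_golden(struct amdgpu_device *adev)
{
	amdgpu_device_program_register_sequence(adev, example_golden_settings,
						ARRAY_SIZE(example_golden_settings));
}
#endif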
973 
974 /**
975  * amdgpu_device_pci_config_reset - reset the GPU
976  *
977  * @adev: amdgpu_device pointer
978  *
979  * Resets the GPU using the pci config reset sequence.
980  * Only applicable to asics prior to vega10.
981  */
982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
983 {
984 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
985 }
986 
987 /**
988  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
989  *
990  * @adev: amdgpu_device pointer
991  *
992  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
993  */
994 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
995 {
996 	STUB();
997 	return -ENOSYS;
998 #ifdef notyet
999 	return pci_reset_function(adev->pdev);
1000 #endif
1001 }
1002 
1003 /*
1004  * amdgpu_device_wb_*()
1005  * Writeback is the method by which the GPU updates special pages in memory
1006  * with the status of certain GPU events (fences, ring pointers, etc.).
1007  */
1008 
1009 /**
1010  * amdgpu_device_wb_fini - Disable Writeback and free memory
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Disables Writeback and frees the Writeback memory (all asics).
1015  * Used at driver shutdown.
1016  */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 	if (adev->wb.wb_obj) {
1020 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 				      &adev->wb.gpu_addr,
1022 				      (void **)&adev->wb.wb);
1023 		adev->wb.wb_obj = NULL;
1024 	}
1025 }
1026 
1027 /**
1028  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1029  *
1030  * @adev: amdgpu_device pointer
1031  *
1032  * Initializes writeback and allocates writeback memory (all asics).
1033  * Used at driver startup.
1034  * Returns 0 on success or an -error on failure.
1035  */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 	int r;
1039 
1040 	if (adev->wb.wb_obj == NULL) {
1041 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 					    (void **)&adev->wb.wb);
1046 		if (r) {
1047 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 			return r;
1049 		}
1050 
1051 		adev->wb.num_wb = AMDGPU_MAX_WB;
1052 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053 
1054 		/* clear wb memory */
1055 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 	}
1057 
1058 	return 0;
1059 }
1060 
1061 /**
1062  * amdgpu_device_wb_get - Allocate a wb entry
1063  *
1064  * @adev: amdgpu_device pointer
1065  * @wb: wb index
1066  *
1067  * Allocate a wb slot for use by the driver (all asics).
1068  * Returns 0 on success or -EINVAL on failure.
1069  */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073 
1074 	if (offset < adev->wb.num_wb) {
1075 		__set_bit(offset, adev->wb.used);
1076 		*wb = offset << 3; /* convert to dw offset */
1077 		return 0;
1078 	} else {
1079 		return -EINVAL;
1080 	}
1081 }
1082 
1083 /**
1084  * amdgpu_device_wb_free - Free a wb entry
1085  *
1086  * @adev: amdgpu_device pointer
1087  * @wb: wb index
1088  *
1089  * Free a wb slot allocated for use by the driver (all asics)
1090  */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 	wb >>= 3;
1094 	if (wb < adev->wb.num_wb)
1095 		__clear_bit(wb, adev->wb.used);
1096 }
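
/*
 * Illustrative sketch (not part of this file): a typical writeback user
 * allocates a slot, derives the CPU and GPU views of the same dword, and
 * frees the slot when done.  The returned index is a dword offset into the
 * writeback buffer (see amdgpu_device_wb_get() above).
 */
#if 0
static int example_use_wb_slot(struct amdgpu_device *adev)
{
	volatile uint32_t *cpu_addr;
	uint64_t gpu_addr;
	u32 index;
	int r;

	r = amdgpu_device_wb_get(adev, &index);
	if (r)
		return r;

	cpu_addr = &adev->wb.wb[index];			/* CPU view */
	gpu_addr = adev->wb.gpu_addr + (index * 4);	/* GPU address */

	/* ... hand gpu_addr to the GPU, poll *cpu_addr ... */

	amdgpu_device_wb_free(adev, index);
	return 0;
}
#endif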
1097 
1098 /**
1099  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100  *
1101  * @adev: amdgpu_device pointer
1102  *
1103  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1104  * to fail, but if any of the BARs is not accessible after the resize we abort
1105  * driver loading by returning -ENODEV.
1106  */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 #ifdef __linux__
1110 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1111 	struct pci_bus *root;
1112 	struct resource *res;
1113 	unsigned int i;
1114 	u16 cmd;
1115 	int r;
1116 
1117 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1118 		return 0;
1119 
1120 	/* Bypass for VF */
1121 	if (amdgpu_sriov_vf(adev))
1122 		return 0;
1123 
1124 	/* skip if the bios has already enabled large BAR */
1125 	if (adev->gmc.real_vram_size &&
1126 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1127 		return 0;
1128 
1129 	/* Check if the root BUS has 64bit memory resources */
1130 	root = adev->pdev->bus;
1131 	while (root->parent)
1132 		root = root->parent;
1133 
1134 	pci_bus_for_each_resource(root, res, i) {
1135 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1136 		    res->start > 0x100000000ull)
1137 			break;
1138 	}
1139 
1140 	/* Trying to resize is pointless without a root hub window above 4GB */
1141 	if (!res)
1142 		return 0;
1143 
1144 	/* Limit the BAR size to what is available */
1145 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1146 			rbar_size);
1147 
1148 	/* Disable memory decoding while we change the BAR addresses and size */
1149 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1150 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1151 			      cmd & ~PCI_COMMAND_MEMORY);
1152 
1153 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1154 	amdgpu_doorbell_fini(adev);
1155 	if (adev->asic_type >= CHIP_BONAIRE)
1156 		pci_release_resource(adev->pdev, 2);
1157 
1158 	pci_release_resource(adev->pdev, 0);
1159 
1160 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1161 	if (r == -ENOSPC)
1162 		DRM_INFO("Not enough PCI address space for a large BAR.");
1163 	else if (r && r != -ENOTSUPP)
1164 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1165 
1166 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1167 
1168 	/* When the doorbell or fb BAR isn't available we have no chance of
1169 	 * using the device.
1170 	 */
1171 	r = amdgpu_doorbell_init(adev);
1172 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1173 		return -ENODEV;
1174 
1175 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1176 #endif /* __linux__ */
1177 
1178 	return 0;
1179 }
1180 
1181 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1182 {
1183 	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1184 		return false;
1185 
1186 	return true;
1187 }
1188 
1189 /*
1190  * GPU helpers function.
1191  */
1192 /**
1193  * amdgpu_device_need_post - check if the hw need post or not
1194  *
1195  * @adev: amdgpu_device pointer
1196  *
1197  * Check if the asic has been initialized (all asics) at driver startup
1198  * or if post is needed because a hw reset was performed.
1199  * Returns true if need or false if not.
1200  */
1201 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1202 {
1203 	uint32_t reg;
1204 
1205 	if (amdgpu_sriov_vf(adev))
1206 		return false;
1207 
1208 	if (!amdgpu_device_read_bios(adev))
1209 		return false;
1210 
1211 	if (amdgpu_passthrough(adev)) {
1212 		/* for FIJI: in the whole-GPU pass-through virtualization case, after a
1213 		 * VM reboot some old SMC firmware still needs the driver to do a vPost,
1214 		 * otherwise the GPU hangs.  SMC firmware versions above 22.15 don't have
1215 		 * this flaw, so we force vPost only for SMC versions below 22.15.
1216 		 */
1217 		if (adev->asic_type == CHIP_FIJI) {
1218 			int err;
1219 			uint32_t fw_ver;
1220 
1221 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1222 			/* force vPost if an error occurred */
1223 			if (err)
1224 				return true;
1225 
1226 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1227 			release_firmware(adev->pm.fw);
1228 			if (fw_ver < 0x00160e00)
1229 				return true;
1230 		}
1231 	}
1232 
1233 	/* Don't post if we need to reset whole hive on init */
1234 	if (adev->gmc.xgmi.pending_reset)
1235 		return false;
1236 
1237 	if (adev->has_hw_reset) {
1238 		adev->has_hw_reset = false;
1239 		return true;
1240 	}
1241 
1242 	/* bios scratch used on CIK+ */
1243 	if (adev->asic_type >= CHIP_BONAIRE)
1244 		return amdgpu_atombios_scratch_need_asic_init(adev);
1245 
1246 	/* check MEM_SIZE for older asics */
1247 	reg = amdgpu_asic_get_config_memsize(adev);
1248 
1249 	if ((reg != 0) && (reg != 0xffffffff))
1250 		return false;
1251 
1252 	return true;
1253 }
1254 
1255 /*
1256  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1257  * speed switching. Until we have confirmation from Intel that a specific host
1258  * supports it, it's safer that we keep it disabled for all.
1259  *
1260  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1261  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1262  */
1263 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1264 {
1265 #if IS_ENABLED(CONFIG_X86)
1266 #ifdef __linux__
1267 	struct cpuinfo_x86 *c = &cpu_data(0);
1268 
1269 	if (c->x86_vendor == X86_VENDOR_INTEL)
1270 #else
1271 	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
1272 #endif
1273 		return false;
1274 #endif
1275 	return true;
1276 }
1277 
1278 /**
1279  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1280  *
1281  * @adev: amdgpu_device pointer
1282  *
1283  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1284  * be set for this device.
1285  *
1286  * Returns true if it should be used or false if not.
1287  */
1288 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1289 {
1290 	switch (amdgpu_aspm) {
1291 	case -1:
1292 		break;
1293 	case 0:
1294 		return false;
1295 	case 1:
1296 		return true;
1297 	default:
1298 		return false;
1299 	}
1300 	return pcie_aspm_enabled(adev->pdev);
1301 }
1302 
1303 bool amdgpu_device_aspm_support_quirk(void)
1304 {
1305 #if IS_ENABLED(CONFIG_X86)
1306 	struct cpu_info *ci = curcpu();
1307 
1308 	return !(ci->ci_family == 6 && ci->ci_model == 0x97);
1309 #else
1310 	return true;
1311 #endif
1312 }
1313 
1314 /* if we get transitioned to only one device, take VGA back */
1315 /**
1316  * amdgpu_device_vga_set_decode - enable/disable vga decode
1317  *
1318  * @pdev: PCI device pointer
1319  * @state: enable/disable vga decode
1320  *
1321  * Enable/disable vga decode (all asics).
1322  * Returns VGA resource flags.
1323  */
1324 #ifdef notyet
1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1326 		bool state)
1327 {
1328 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1329 
1330 	amdgpu_asic_set_vga_state(adev, state);
1331 	if (state)
1332 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1333 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1334 	else
1335 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1336 }
1337 #endif
1338 
1339 /**
1340  * amdgpu_device_check_block_size - validate the vm block size
1341  *
1342  * @adev: amdgpu_device pointer
1343  *
1344  * Validates the vm block size specified via module parameter.
1345  * The vm block size defines number of bits in page table versus page directory,
1346  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1347  * page table and the remaining bits are in the page directory.
1348  */
1349 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1350 {
1351 	/* defines number of bits in page table versus page directory,
1352 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1353 	 * page table and the remaining bits are in the page directory
1354 	 */
1355 	if (amdgpu_vm_block_size == -1)
1356 		return;
1357 
1358 	if (amdgpu_vm_block_size < 9) {
1359 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1360 			 amdgpu_vm_block_size);
1361 		amdgpu_vm_block_size = -1;
1362 	}
1363 }
1364 
1365 /**
1366  * amdgpu_device_check_vm_size - validate the vm size
1367  *
1368  * @adev: amdgpu_device pointer
1369  *
1370  * Validates the vm size in GB specified via module parameter.
1371  * The VM size is the size of the GPU virtual memory space in GB.
1372  */
1373 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1374 {
1375 	/* no need to check the default value */
1376 	if (amdgpu_vm_size == -1)
1377 		return;
1378 
1379 	if (amdgpu_vm_size < 1) {
1380 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1381 			 amdgpu_vm_size);
1382 		amdgpu_vm_size = -1;
1383 	}
1384 }
1385 
1386 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1387 {
1388 #ifdef __linux__
1389 	struct sysinfo si;
1390 #endif
1391 	bool is_os_64 = (sizeof(void *) == 8);
1392 	uint64_t total_memory;
1393 	uint64_t dram_size_seven_GB = 0x1B8000000;
1394 	uint64_t dram_size_three_GB = 0xB8000000;
1395 
1396 	if (amdgpu_smu_memory_pool_size == 0)
1397 		return;
1398 
1399 	if (!is_os_64) {
1400 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1401 		goto def_value;
1402 	}
1403 #ifdef __linux__
1404 	si_meminfo(&si);
1405 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1406 #else
1407 	total_memory = ptoa(physmem);
1408 #endif
1409 
1410 	if ((amdgpu_smu_memory_pool_size == 1) ||
1411 		(amdgpu_smu_memory_pool_size == 2)) {
1412 		if (total_memory < dram_size_three_GB)
1413 			goto def_value1;
1414 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1415 		(amdgpu_smu_memory_pool_size == 8)) {
1416 		if (total_memory < dram_size_seven_GB)
1417 			goto def_value1;
1418 	} else {
1419 		DRM_WARN("Smu memory pool size not supported\n");
1420 		goto def_value;
1421 	}
1422 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1423 
1424 	return;
1425 
1426 def_value1:
1427 	DRM_WARN("Not enough system memory\n");
1428 def_value:
1429 	adev->pm.smu_prv_buffer_size = 0;
1430 }
1431 
1432 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1433 {
1434 	if (!(adev->flags & AMD_IS_APU) ||
1435 	    adev->asic_type < CHIP_RAVEN)
1436 		return 0;
1437 
1438 	switch (adev->asic_type) {
1439 	case CHIP_RAVEN:
1440 		if (adev->pdev->device == 0x15dd)
1441 			adev->apu_flags |= AMD_APU_IS_RAVEN;
1442 		if (adev->pdev->device == 0x15d8)
1443 			adev->apu_flags |= AMD_APU_IS_PICASSO;
1444 		break;
1445 	case CHIP_RENOIR:
1446 		if ((adev->pdev->device == 0x1636) ||
1447 		    (adev->pdev->device == 0x164c))
1448 			adev->apu_flags |= AMD_APU_IS_RENOIR;
1449 		else
1450 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1451 		break;
1452 	case CHIP_VANGOGH:
1453 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
1454 		break;
1455 	case CHIP_YELLOW_CARP:
1456 		break;
1457 	case CHIP_CYAN_SKILLFISH:
1458 		if ((adev->pdev->device == 0x13FE) ||
1459 		    (adev->pdev->device == 0x143F))
1460 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1461 		break;
1462 	default:
1463 		break;
1464 	}
1465 
1466 	return 0;
1467 }
1468 
1469 /**
1470  * amdgpu_device_check_arguments - validate module params
1471  *
1472  * @adev: amdgpu_device pointer
1473  *
1474  * Validates certain module parameters and updates
1475  * the associated values used by the driver (all asics).
1476  */
1477 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1478 {
1479 	if (amdgpu_sched_jobs < 4) {
1480 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1481 			 amdgpu_sched_jobs);
1482 		amdgpu_sched_jobs = 4;
1483 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1484 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1485 			 amdgpu_sched_jobs);
1486 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1487 	}
1488 
1489 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1490 		/* gart size must be greater than or equal to 32M */
1491 		dev_warn(adev->dev, "gart size (%d) too small\n",
1492 			 amdgpu_gart_size);
1493 		amdgpu_gart_size = -1;
1494 	}
1495 
1496 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1497 		/* gtt size must be greater than or equal to 32M */
1498 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1499 				 amdgpu_gtt_size);
1500 		amdgpu_gtt_size = -1;
1501 	}
1502 
1503 	/* valid range is between 4 and 9 inclusive */
1504 	if (amdgpu_vm_fragment_size != -1 &&
1505 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1506 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1507 		amdgpu_vm_fragment_size = -1;
1508 	}
1509 
1510 	if (amdgpu_sched_hw_submission < 2) {
1511 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1512 			 amdgpu_sched_hw_submission);
1513 		amdgpu_sched_hw_submission = 2;
1514 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1515 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1516 			 amdgpu_sched_hw_submission);
1517 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1518 	}
1519 
1520 	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1521 		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1522 		amdgpu_reset_method = -1;
1523 	}
1524 
1525 	amdgpu_device_check_smu_prv_buffer_size(adev);
1526 
1527 	amdgpu_device_check_vm_size(adev);
1528 
1529 	amdgpu_device_check_block_size(adev);
1530 
1531 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1532 
1533 	return 0;
1534 }
1535 
1536 #ifdef __linux__
1537 /**
1538  * amdgpu_switcheroo_set_state - set switcheroo state
1539  *
1540  * @pdev: pci dev pointer
1541  * @state: vga_switcheroo state
1542  *
1543  * Callback for the switcheroo driver.  Suspends or resumes
1544  * the asics before or after it is powered up using ACPI methods.
1545  */
1546 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1547 					enum vga_switcheroo_state state)
1548 {
1549 	struct drm_device *dev = pci_get_drvdata(pdev);
1550 	int r;
1551 
1552 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1553 		return;
1554 
1555 	if (state == VGA_SWITCHEROO_ON) {
1556 		pr_info("switched on\n");
1557 		/* don't suspend or resume card normally */
1558 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1559 
1560 		pci_set_power_state(pdev, PCI_D0);
1561 		amdgpu_device_load_pci_state(pdev);
1562 		r = pci_enable_device(pdev);
1563 		if (r)
1564 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1565 		amdgpu_device_resume(dev, true);
1566 
1567 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1568 	} else {
1569 		pr_info("switched off\n");
1570 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1571 		amdgpu_device_prepare(dev);
1572 		amdgpu_device_suspend(dev, true);
1573 		amdgpu_device_cache_pci_state(pdev);
1574 		/* Shut down the device */
1575 		pci_disable_device(pdev);
1576 		pci_set_power_state(pdev, PCI_D3cold);
1577 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1578 	}
1579 }
1580 
1581 /**
1582  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1583  *
1584  * @pdev: pci dev pointer
1585  *
1586  * Callback for the switcheroo driver.  Checks whether the switcheroo
1587  * state can be changed.
1588  * Returns true if the state can be changed, false if not.
1589  */
1590 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1591 {
1592 	struct drm_device *dev = pci_get_drvdata(pdev);
1593 
1594        /*
1595 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1596 	* locking inversion with the driver load path. And the access here is
1597 	* completely racy anyway. So don't bother with locking for now.
1598 	*/
1599 	return atomic_read(&dev->open_count) == 0;
1600 }
1601 #endif /* __linux__ */
1602 
1603 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1604 #ifdef notyet
1605 	.set_gpu_state = amdgpu_switcheroo_set_state,
1606 	.reprobe = NULL,
1607 	.can_switch = amdgpu_switcheroo_can_switch,
1608 #endif
1609 };
1610 
1611 /**
1612  * amdgpu_device_ip_set_clockgating_state - set the CG state
1613  *
1614  * @dev: amdgpu_device pointer
1615  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1616  * @state: clockgating state (gate or ungate)
1617  *
1618  * Sets the requested clockgating state for all instances of
1619  * the hardware IP specified.
1620  * Returns the error code from the last instance.
1621  */
1622 int amdgpu_device_ip_set_clockgating_state(void *dev,
1623 					   enum amd_ip_block_type block_type,
1624 					   enum amd_clockgating_state state)
1625 {
1626 	struct amdgpu_device *adev = dev;
1627 	int i, r = 0;
1628 
1629 	for (i = 0; i < adev->num_ip_blocks; i++) {
1630 		if (!adev->ip_blocks[i].status.valid)
1631 			continue;
1632 		if (adev->ip_blocks[i].version->type != block_type)
1633 			continue;
1634 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1635 			continue;
1636 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1637 			(void *)adev, state);
1638 		if (r)
1639 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1640 				  adev->ip_blocks[i].version->funcs->name, r);
1641 	}
1642 	return r;
1643 }
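
/*
 * Illustrative sketch (not part of this file): gating clocks for one IP
 * type via the helper above.  The block type chosen here is arbitrary;
 * callers pass any amd_ip_block_type and an AMD_CG_STATE_* value.
 */
#if 0
static void example_gate_gfx_clocks(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_set_clockgating_state(adev,
						   AMD_IP_BLOCK_TYPE_GFX,
						   AMD_CG_STATE_GATE);
	if (r)
		DRM_WARN("GFX clockgating failed (%d)\n", r);
}
#endif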
1644 
1645 /**
1646  * amdgpu_device_ip_set_powergating_state - set the PG state
1647  *
1648  * @dev: amdgpu_device pointer
1649  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1650  * @state: powergating state (gate or ungate)
1651  *
1652  * Sets the requested powergating state for all instances of
1653  * the hardware IP specified.
1654  * Returns the error code from the last instance.
1655  */
1656 int amdgpu_device_ip_set_powergating_state(void *dev,
1657 					   enum amd_ip_block_type block_type,
1658 					   enum amd_powergating_state state)
1659 {
1660 	struct amdgpu_device *adev = dev;
1661 	int i, r = 0;
1662 
1663 	for (i = 0; i < adev->num_ip_blocks; i++) {
1664 		if (!adev->ip_blocks[i].status.valid)
1665 			continue;
1666 		if (adev->ip_blocks[i].version->type != block_type)
1667 			continue;
1668 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1669 			continue;
1670 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1671 			(void *)adev, state);
1672 		if (r)
1673 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1674 				  adev->ip_blocks[i].version->funcs->name, r);
1675 	}
1676 	return r;
1677 }
1678 
1679 /**
1680  * amdgpu_device_ip_get_clockgating_state - get the CG state
1681  *
1682  * @adev: amdgpu_device pointer
1683  * @flags: clockgating feature flags
1684  *
1685  * Walks the list of IPs on the device and updates the clockgating
1686  * flags for each IP.
1687  * Updates @flags with the feature flags for each hardware IP where
1688  * clockgating is enabled.
1689  */
1690 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1691 					    u64 *flags)
1692 {
1693 	int i;
1694 
1695 	for (i = 0; i < adev->num_ip_blocks; i++) {
1696 		if (!adev->ip_blocks[i].status.valid)
1697 			continue;
1698 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1699 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1700 	}
1701 }
1702 
1703 /**
1704  * amdgpu_device_ip_wait_for_idle - wait for idle
1705  *
1706  * @adev: amdgpu_device pointer
1707  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1708  *
1709  * Waits for the requested hardware IP to be idle.
1710  * Returns 0 for success or a negative error code on failure.
1711  */
1712 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1713 				   enum amd_ip_block_type block_type)
1714 {
1715 	int i, r;
1716 
1717 	for (i = 0; i < adev->num_ip_blocks; i++) {
1718 		if (!adev->ip_blocks[i].status.valid)
1719 			continue;
1720 		if (adev->ip_blocks[i].version->type == block_type) {
1721 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1722 			if (r)
1723 				return r;
1724 			break;
1725 		}
1726 	}
1727 	return 0;
1728 
1729 }
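/*
 * Usage sketch (illustrative only): callers typically quiesce a block
 * before reprogramming it, e.g.
 *
 *	r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
 *	if (r)
 *		return r;
 *
 * Note that only the first valid instance of the block type is waited on,
 * and that a missing block type is not an error (0 is returned).
 */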
1730 
1731 /**
1732  * amdgpu_device_ip_is_idle - is the hardware IP idle
1733  *
1734  * @adev: amdgpu_device pointer
1735  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1736  *
1737  * Check if the hardware IP is idle or not.
1738  * Returns true if the IP is idle, false if not.
1739  */
1740 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1741 			      enum amd_ip_block_type block_type)
1742 {
1743 	int i;
1744 
1745 	for (i = 0; i < adev->num_ip_blocks; i++) {
1746 		if (!adev->ip_blocks[i].status.valid)
1747 			continue;
1748 		if (adev->ip_blocks[i].version->type == block_type)
1749 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1750 	}
1751 	return true;
1752 
1753 }
1754 
1755 /**
1756  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1757  *
1758  * @adev: amdgpu_device pointer
1759  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760  *
1761  * Returns a pointer to the hardware IP block structure
1762  * if it exists for the asic, otherwise NULL.
1763  */
1764 struct amdgpu_ip_block *
1765 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1766 			      enum amd_ip_block_type type)
1767 {
1768 	int i;
1769 
1770 	for (i = 0; i < adev->num_ip_blocks; i++)
1771 		if (adev->ip_blocks[i].version->type == type)
1772 			return &adev->ip_blocks[i];
1773 
1774 	return NULL;
1775 }
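/*
 * Usage sketch (illustrative only): fetch the GFX block and inspect its
 * version before taking a version-dependent path.
 *
 *	struct amdgpu_ip_block *ip_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (ip_block && ip_block->version->major >= 9)
 *		DRM_DEBUG("gfx v%d.%d\n", ip_block->version->major,
 *			  ip_block->version->minor);
 */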
1776 
1777 /**
1778  * amdgpu_device_ip_block_version_cmp
1779  *
1780  * @adev: amdgpu_device pointer
1781  * @type: enum amd_ip_block_type
1782  * @major: major version
1783  * @minor: minor version
1784  *
1785  * Returns 0 if the block's version is equal to or greater than the
1786  * one given, 1 if it is smaller or the ip_block doesn't exist.
1787  */
1788 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1789 				       enum amd_ip_block_type type,
1790 				       u32 major, u32 minor)
1791 {
1792 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1793 
1794 	if (ip_block && ((ip_block->version->major > major) ||
1795 			((ip_block->version->major == major) &&
1796 			(ip_block->version->minor >= minor))))
1797 		return 0;
1798 
1799 	return 1;
1800 }
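/*
 * Usage sketch (illustrative only): note the inverted convention, where
 * 0 means "the asic has at least this version of the block".
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       8, 1) == 0)
 *		enable_gfx_8_1_feature(adev);	// hypothetical helper
 */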
1801 
1802 /**
1803  * amdgpu_device_ip_block_add
1804  *
1805  * @adev: amdgpu_device pointer
1806  * @ip_block_version: pointer to the IP to add
1807  *
1808  * Adds the IP block driver information to the collection of IPs
1809  * on the asic.
1810  */
1811 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1812 			       const struct amdgpu_ip_block_version *ip_block_version)
1813 {
1814 	if (!ip_block_version)
1815 		return -EINVAL;
1816 
1817 	switch (ip_block_version->type) {
1818 	case AMD_IP_BLOCK_TYPE_VCN:
1819 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1820 			return 0;
1821 		break;
1822 	case AMD_IP_BLOCK_TYPE_JPEG:
1823 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1824 			return 0;
1825 		break;
1826 	default:
1827 		break;
1828 	}
1829 
1830 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1831 		  ip_block_version->funcs->name);
1832 
1833 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1834 
1835 	return 0;
1836 }
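/*
 * Usage sketch (illustrative only): asic setup code such as
 * vi_set_ip_blocks() registers its blocks in hardware order, e.g.
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 *
 * Harvested VCN/JPEG blocks are silently skipped (0 is returned) as
 * handled above.
 */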
1837 
1838 /**
1839  * amdgpu_device_enable_virtual_display - enable virtual display feature
1840  *
1841  * @adev: amdgpu_device pointer
1842  *
1843  * Enables the virtual display feature if the user has enabled it via
1844  * the module parameter virtual_display.  This feature provides virtual
1845  * display hardware on headless boards or in virtualized environments.
1846  * This function parses and validates the configuration string specified by
1847  * the user and configures the virtual display configuration (number of
1848  * virtual connectors, crtcs, etc.) specified.
1849  */
1850 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1851 {
1852 	adev->enable_virtual_display = false;
1853 
1854 #ifdef notyet
1855 	if (amdgpu_virtual_display) {
1856 		const char *pci_address_name = pci_name(adev->pdev);
1857 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1858 
1859 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1860 		pciaddstr_tmp = pciaddstr;
1861 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1862 			pciaddname = strsep(&pciaddname_tmp, ",");
1863 			if (!strcmp("all", pciaddname)
1864 			    || !strcmp(pci_address_name, pciaddname)) {
1865 				long num_crtc;
1866 				int res = -1;
1867 
1868 				adev->enable_virtual_display = true;
1869 
1870 				if (pciaddname_tmp)
1871 					res = kstrtol(pciaddname_tmp, 10,
1872 						      &num_crtc);
1873 
1874 				if (!res) {
1875 					if (num_crtc < 1)
1876 						num_crtc = 1;
1877 					if (num_crtc > 6)
1878 						num_crtc = 6;
1879 					adev->mode_info.num_crtc = num_crtc;
1880 				} else {
1881 					adev->mode_info.num_crtc = 1;
1882 				}
1883 				break;
1884 			}
1885 		}
1886 
1887 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1888 			 amdgpu_virtual_display, pci_address_name,
1889 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1890 
1891 		kfree(pciaddstr);
1892 	}
1893 #endif
1894 }
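/*
 * The virtual_display string parsed above is a semicolon separated list
 * of PCI addresses (or "all"), each optionally followed by a crtc count,
 * e.g. (illustrative values):
 *
 *	amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0
 *
 * which would enable two virtual crtcs on the first device and one (the
 * default) on the second; the count is clamped to the range 1..6.
 */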
1895 
1896 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1897 {
1898 	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1899 		adev->mode_info.num_crtc = 1;
1900 		adev->enable_virtual_display = true;
1901 		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1902 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1903 	}
1904 }
1905 
1906 /**
1907  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1908  *
1909  * @adev: amdgpu_device pointer
1910  *
1911  * Parses the asic configuration parameters specified in the gpu info
1912  * firmware and makes them available to the driver for use in configuring
1913  * the asic.
1914  * Returns 0 on success, -EINVAL on failure.
1915  */
1916 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1917 {
1918 	const char *chip_name;
1919 	char fw_name[40];
1920 	int err;
1921 	const struct gpu_info_firmware_header_v1_0 *hdr;
1922 
1923 	adev->firmware.gpu_info_fw = NULL;
1924 
1925 	if (adev->mman.discovery_bin)
1926 		return 0;
1927 
1928 	switch (adev->asic_type) {
1929 	default:
1930 		return 0;
1931 	case CHIP_VEGA10:
1932 		chip_name = "vega10";
1933 		break;
1934 	case CHIP_VEGA12:
1935 		chip_name = "vega12";
1936 		break;
1937 	case CHIP_RAVEN:
1938 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1939 			chip_name = "raven2";
1940 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1941 			chip_name = "picasso";
1942 		else
1943 			chip_name = "raven";
1944 		break;
1945 	case CHIP_ARCTURUS:
1946 		chip_name = "arcturus";
1947 		break;
1948 	case CHIP_NAVI12:
1949 		chip_name = "navi12";
1950 		break;
1951 	}
1952 
1953 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1954 	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1955 	if (err) {
1956 		dev_err(adev->dev,
1957 			"Failed to get gpu_info firmware \"%s\"\n",
1958 			fw_name);
1959 		goto out;
1960 	}
1961 
1962 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1963 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1964 
1965 	switch (hdr->version_major) {
1966 	case 1:
1967 	{
1968 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1969 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1970 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1971 
1972 		/*
1973 		 * Should be dropped when DAL no longer needs it.
1974 		 */
1975 		if (adev->asic_type == CHIP_NAVI12)
1976 			goto parse_soc_bounding_box;
1977 
1978 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1979 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1980 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1981 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1982 		adev->gfx.config.max_texture_channel_caches =
1983 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1984 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1985 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1986 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1987 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1988 		adev->gfx.config.double_offchip_lds_buf =
1989 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1990 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1991 		adev->gfx.cu_info.max_waves_per_simd =
1992 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1993 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1994 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1995 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1996 		if (hdr->version_minor >= 1) {
1997 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1998 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1999 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2000 			adev->gfx.config.num_sc_per_sh =
2001 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2002 			adev->gfx.config.num_packer_per_sc =
2003 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2004 		}
2005 
2006 parse_soc_bounding_box:
2007 		/*
2008 		 * soc bounding box info is not integrated in the discovery table,
2009 		 * so we always need to parse it from the gpu info firmware if needed.
2010 		 */
2011 		if (hdr->version_minor == 2) {
2012 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2013 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2014 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2015 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2016 		}
2017 		break;
2018 	}
2019 	default:
2020 		dev_err(adev->dev,
2021 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2022 		err = -EINVAL;
2023 		goto out;
2024 	}
2025 out:
2026 	return err;
2027 }
2028 
2029 /**
2030  * amdgpu_device_ip_early_init - run early init for hardware IPs
2031  *
2032  * @adev: amdgpu_device pointer
2033  *
2034  * Early initialization pass for hardware IPs.  The hardware IPs that make
2035  * up each asic are discovered and each IP's early_init callback is run.  This
2036  * is the first stage in initializing the asic.
2037  * Returns 0 on success, negative error code on failure.
2038  */
2039 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2040 {
2041 	struct pci_dev *parent;
2042 	int i, r;
2043 	bool total;
2044 
2045 	amdgpu_device_enable_virtual_display(adev);
2046 
2047 	if (amdgpu_sriov_vf(adev)) {
2048 		r = amdgpu_virt_request_full_gpu(adev, true);
2049 		if (r)
2050 			return r;
2051 	}
2052 
2053 	switch (adev->asic_type) {
2054 #ifdef CONFIG_DRM_AMDGPU_SI
2055 	case CHIP_VERDE:
2056 	case CHIP_TAHITI:
2057 	case CHIP_PITCAIRN:
2058 	case CHIP_OLAND:
2059 	case CHIP_HAINAN:
2060 		adev->family = AMDGPU_FAMILY_SI;
2061 		r = si_set_ip_blocks(adev);
2062 		if (r)
2063 			return r;
2064 		break;
2065 #endif
2066 #ifdef CONFIG_DRM_AMDGPU_CIK
2067 	case CHIP_BONAIRE:
2068 	case CHIP_HAWAII:
2069 	case CHIP_KAVERI:
2070 	case CHIP_KABINI:
2071 	case CHIP_MULLINS:
2072 		if (adev->flags & AMD_IS_APU)
2073 			adev->family = AMDGPU_FAMILY_KV;
2074 		else
2075 			adev->family = AMDGPU_FAMILY_CI;
2076 
2077 		r = cik_set_ip_blocks(adev);
2078 		if (r)
2079 			return r;
2080 		break;
2081 #endif
2082 	case CHIP_TOPAZ:
2083 	case CHIP_TONGA:
2084 	case CHIP_FIJI:
2085 	case CHIP_POLARIS10:
2086 	case CHIP_POLARIS11:
2087 	case CHIP_POLARIS12:
2088 	case CHIP_VEGAM:
2089 	case CHIP_CARRIZO:
2090 	case CHIP_STONEY:
2091 		if (adev->flags & AMD_IS_APU)
2092 			adev->family = AMDGPU_FAMILY_CZ;
2093 		else
2094 			adev->family = AMDGPU_FAMILY_VI;
2095 
2096 		r = vi_set_ip_blocks(adev);
2097 		if (r)
2098 			return r;
2099 		break;
2100 	default:
2101 		r = amdgpu_discovery_set_ip_blocks(adev);
2102 		if (r)
2103 			return r;
2104 		break;
2105 	}
2106 
2107 	if (amdgpu_has_atpx() &&
2108 	    (amdgpu_is_atpx_hybrid() ||
2109 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2110 	    ((adev->flags & AMD_IS_APU) == 0) &&
2111 	    !dev_is_removable(&adev->pdev->dev))
2112 		adev->flags |= AMD_IS_PX;
2113 
2114 	if (!(adev->flags & AMD_IS_APU)) {
2115 #ifdef notyet
2116 		parent = pcie_find_root_port(adev->pdev);
2117 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2118 #else
2119 		adev->has_pr3 = false;
2120 #endif
2121 	}
2122 
2123 
2124 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2125 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2126 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2127 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2128 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2129 	if (!amdgpu_device_pcie_dynamic_switching_supported())
2130 		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2131 
2132 	total = true;
2133 	for (i = 0; i < adev->num_ip_blocks; i++) {
2134 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2135 			DRM_WARN("disabled ip block: %d <%s>\n",
2136 				  i, adev->ip_blocks[i].version->funcs->name);
2137 			adev->ip_blocks[i].status.valid = false;
2138 		} else {
2139 			if (adev->ip_blocks[i].version->funcs->early_init) {
2140 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2141 				if (r == -ENOENT) {
2142 					adev->ip_blocks[i].status.valid = false;
2143 				} else if (r) {
2144 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2145 						  adev->ip_blocks[i].version->funcs->name, r);
2146 					total = false;
2147 				} else {
2148 					adev->ip_blocks[i].status.valid = true;
2149 				}
2150 			} else {
2151 				adev->ip_blocks[i].status.valid = true;
2152 			}
2153 		}
2154 		/* get the vbios after the asic_funcs are set up */
2155 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2156 			r = amdgpu_device_parse_gpu_info_fw(adev);
2157 			if (r)
2158 				return r;
2159 
2160 			/* Read BIOS */
2161 			if (amdgpu_device_read_bios(adev)) {
2162 				if (!amdgpu_get_bios(adev))
2163 					return -EINVAL;
2164 
2165 				r = amdgpu_atombios_init(adev);
2166 				if (r) {
2167 					dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2168 					amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2169 					return r;
2170 				}
2171 			}
2172 
2173 			/* get pf2vf msg info at its earliest time */
2174 			if (amdgpu_sriov_vf(adev))
2175 				amdgpu_virt_init_data_exchange(adev);
2176 
2177 		}
2178 	}
2179 	if (!total)
2180 		return -ENODEV;
2181 
2182 	amdgpu_amdkfd_device_probe(adev);
2183 	adev->cg_flags &= amdgpu_cg_mask;
2184 	adev->pg_flags &= amdgpu_pg_mask;
2185 
2186 	return 0;
2187 }
2188 
2189 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2190 {
2191 	int i, r;
2192 
2193 	for (i = 0; i < adev->num_ip_blocks; i++) {
2194 		if (!adev->ip_blocks[i].status.sw)
2195 			continue;
2196 		if (adev->ip_blocks[i].status.hw)
2197 			continue;
2198 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2199 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2200 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2201 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2202 			if (r) {
2203 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2204 					  adev->ip_blocks[i].version->funcs->name, r);
2205 				return r;
2206 			}
2207 			adev->ip_blocks[i].status.hw = true;
2208 		}
2209 	}
2210 
2211 	return 0;
2212 }
2213 
2214 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2215 {
2216 	int i, r;
2217 
2218 	for (i = 0; i < adev->num_ip_blocks; i++) {
2219 		if (!adev->ip_blocks[i].status.sw)
2220 			continue;
2221 		if (adev->ip_blocks[i].status.hw)
2222 			continue;
2223 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2224 		if (r) {
2225 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2226 				  adev->ip_blocks[i].version->funcs->name, r);
2227 			return r;
2228 		}
2229 		adev->ip_blocks[i].status.hw = true;
2230 	}
2231 
2232 	return 0;
2233 }
2234 
2235 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2236 {
2237 	int r = 0;
2238 	int i;
2239 	uint32_t smu_version;
2240 
2241 	if (adev->asic_type >= CHIP_VEGA10) {
2242 		for (i = 0; i < adev->num_ip_blocks; i++) {
2243 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2244 				continue;
2245 
2246 			if (!adev->ip_blocks[i].status.sw)
2247 				continue;
2248 
2249 			/* no need to do the fw loading again if already done */
2250 			if (adev->ip_blocks[i].status.hw == true)
2251 				break;
2252 
2253 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2254 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2255 				if (r) {
2256 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2257 							  adev->ip_blocks[i].version->funcs->name, r);
2258 					return r;
2259 				}
2260 			} else {
2261 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2262 				if (r) {
2263 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2264 							  adev->ip_blocks[i].version->funcs->name, r);
2265 					return r;
2266 				}
2267 			}
2268 
2269 			adev->ip_blocks[i].status.hw = true;
2270 			break;
2271 		}
2272 	}
2273 
2274 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2275 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2276 
2277 	return r;
2278 }
2279 
2280 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2281 {
2282 	long timeout;
2283 	int r, i;
2284 
2285 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2286 		struct amdgpu_ring *ring = adev->rings[i];
2287 
2288 		/* No need to set up the GPU scheduler for rings that don't need it */
2289 		if (!ring || ring->no_scheduler)
2290 			continue;
2291 
2292 		switch (ring->funcs->type) {
2293 		case AMDGPU_RING_TYPE_GFX:
2294 			timeout = adev->gfx_timeout;
2295 			break;
2296 		case AMDGPU_RING_TYPE_COMPUTE:
2297 			timeout = adev->compute_timeout;
2298 			break;
2299 		case AMDGPU_RING_TYPE_SDMA:
2300 			timeout = adev->sdma_timeout;
2301 			break;
2302 		default:
2303 			timeout = adev->video_timeout;
2304 			break;
2305 		}
2306 
2307 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2308 				   ring->num_hw_submission, 0,
2309 				   timeout, adev->reset_domain->wq,
2310 				   ring->sched_score, ring->name,
2311 				   adev->dev);
2312 		if (r) {
2313 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
2314 				  ring->name);
2315 			return r;
2316 		}
2317 	}
2318 
2319 	amdgpu_xcp_update_partition_sched_list(adev);
2320 
2321 	return 0;
2322 }
2323 
2324 
2325 /**
2326  * amdgpu_device_ip_init - run init for hardware IPs
2327  *
2328  * @adev: amdgpu_device pointer
2329  *
2330  * Main initialization pass for hardware IPs.  The list of all the hardware
2331  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2332  * are run.  sw_init initializes the software state associated with each IP
2333  * and hw_init initializes the hardware associated with each IP.
2334  * Returns 0 on success, negative error code on failure.
2335  */
2336 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2337 {
2338 	int i, r;
2339 
2340 	r = amdgpu_ras_init(adev);
2341 	if (r)
2342 		return r;
2343 
2344 	for (i = 0; i < adev->num_ip_blocks; i++) {
2345 		if (!adev->ip_blocks[i].status.valid)
2346 			continue;
2347 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2348 		if (r) {
2349 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2350 				  adev->ip_blocks[i].version->funcs->name, r);
2351 			goto init_failed;
2352 		}
2353 		adev->ip_blocks[i].status.sw = true;
2354 
2355 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2356 			/* need to do common hw init early so everything is set up for gmc */
2357 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2358 			if (r) {
2359 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2360 				goto init_failed;
2361 			}
2362 			adev->ip_blocks[i].status.hw = true;
2363 		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2364 			/* need to do gmc hw init early so we can allocate gpu mem */
2365 			/* Try to reserve bad pages early */
2366 			if (amdgpu_sriov_vf(adev))
2367 				amdgpu_virt_exchange_data(adev);
2368 
2369 			r = amdgpu_device_mem_scratch_init(adev);
2370 			if (r) {
2371 				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2372 				goto init_failed;
2373 			}
2374 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2375 			if (r) {
2376 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2377 				goto init_failed;
2378 			}
2379 			r = amdgpu_device_wb_init(adev);
2380 			if (r) {
2381 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2382 				goto init_failed;
2383 			}
2384 			adev->ip_blocks[i].status.hw = true;
2385 
2386 			/* right after GMC hw init, we create CSA */
2387 			if (adev->gfx.mcbp) {
2388 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2389 							       AMDGPU_GEM_DOMAIN_VRAM |
2390 							       AMDGPU_GEM_DOMAIN_GTT,
2391 							       AMDGPU_CSA_SIZE);
2392 				if (r) {
2393 					DRM_ERROR("allocate CSA failed %d\n", r);
2394 					goto init_failed;
2395 				}
2396 			}
2397 		}
2398 	}
2399 
2400 	if (amdgpu_sriov_vf(adev))
2401 		amdgpu_virt_init_data_exchange(adev);
2402 
2403 	r = amdgpu_ib_pool_init(adev);
2404 	if (r) {
2405 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2406 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2407 		goto init_failed;
2408 	}
2409 
2410 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2411 	if (r)
2412 		goto init_failed;
2413 
2414 	r = amdgpu_device_ip_hw_init_phase1(adev);
2415 	if (r)
2416 		goto init_failed;
2417 
2418 	r = amdgpu_device_fw_loading(adev);
2419 	if (r)
2420 		goto init_failed;
2421 
2422 	r = amdgpu_device_ip_hw_init_phase2(adev);
2423 	if (r)
2424 		goto init_failed;
2425 
2426 	/*
2427 	 * Retired pages will be loaded from eeprom and reserved here.
2428 	 * This should be called after amdgpu_device_ip_hw_init_phase2, since
2429 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2430 	 * functional for I2C communication, which is only true at this point.
2431 	 *
2432 	 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2433 	 * about failures caused by a bad gpu situation and stop the amdgpu
2434 	 * init process accordingly.  For other failure cases it still releases
2435 	 * all the resources and prints an error message, rather than returning
2436 	 * a negative value to the upper level.
2437 	 *
2438 	 * Note: theoretically, this should be called before all vram allocations
2439 	 * to protect retired pages from being reused.
2440 	 */
2441 	r = amdgpu_ras_recovery_init(adev);
2442 	if (r)
2443 		goto init_failed;
2444 
2445 	/*
2446 	 * In case of XGMI grab extra reference for reset domain for this device
2447 	 */
2448 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2449 		if (amdgpu_xgmi_add_device(adev) == 0) {
2450 			if (!amdgpu_sriov_vf(adev)) {
2451 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2452 
2453 				if (WARN_ON(!hive)) {
2454 					r = -ENOENT;
2455 					goto init_failed;
2456 				}
2457 
2458 				if (!hive->reset_domain ||
2459 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2460 					r = -ENOENT;
2461 					amdgpu_put_xgmi_hive(hive);
2462 					goto init_failed;
2463 				}
2464 
2465 				/* Drop the early temporary reset domain we created for device */
2466 				amdgpu_reset_put_reset_domain(adev->reset_domain);
2467 				adev->reset_domain = hive->reset_domain;
2468 				amdgpu_put_xgmi_hive(hive);
2469 			}
2470 		}
2471 	}
2472 
2473 	r = amdgpu_device_init_schedulers(adev);
2474 	if (r)
2475 		goto init_failed;
2476 
2477 	/* Don't init kfd if whole hive need to be reset during init */
2478 	if (!adev->gmc.xgmi.pending_reset) {
2479 		kgd2kfd_init_zone_device(adev);
2480 		amdgpu_amdkfd_device_init(adev);
2481 	}
2482 
2483 	amdgpu_fru_get_product_info(adev);
2484 
2485 init_failed:
2486 
2487 	return r;
2488 }
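/*
 * Summary of the ordering enforced by amdgpu_device_ip_init() above
 * (a reading aid, not additional code):
 *
 *	ras_init
 *	-> sw_init for every valid block, with early hw_init for COMMON
 *	   and GMC (plus scratch/wb memory and the CSA)
 *	-> IB pool and ucode BO creation
 *	-> hw_init phase1 (IH, and PSP on SR-IOV; COMMON/GMC are already up)
 *	-> firmware loading (PSP/SMU)
 *	-> hw_init phase2 (all remaining blocks)
 *	-> RAS recovery init, XGMI reset-domain handover, scheduler init,
 *	   KFD init
 */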
2489 
2490 /**
2491  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2492  *
2493  * @adev: amdgpu_device pointer
2494  *
2495  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2496  * this function before a GPU reset.  If the value is retained after a
2497  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2498  */
2499 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2500 {
2501 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2502 }
2503 
2504 /**
2505  * amdgpu_device_check_vram_lost - check if vram is valid
2506  *
2507  * @adev: amdgpu_device pointer
2508  *
2509  * Checks the reset magic value written to the gart pointer in VRAM.
2510  * The driver calls this after a GPU reset to see if the contents of
2511  * VRAM have been lost or not.
2512  * Returns true if vram is lost, false if not.
2513  */
2514 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2515 {
2516 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2517 			AMDGPU_RESET_MAGIC_NUM))
2518 		return true;
2519 
2520 	if (!amdgpu_in_reset(adev))
2521 		return false;
2522 
2523 	/*
2524 	 * For all ASICs with baco/mode1 reset, the VRAM is
2525 	 * always assumed to be lost.
2526 	 */
2527 	switch (amdgpu_asic_reset_method(adev)) {
2528 	case AMD_RESET_METHOD_BACO:
2529 	case AMD_RESET_METHOD_MODE1:
2530 		return true;
2531 	default:
2532 		return false;
2533 	}
2534 }
2535 
2536 /**
2537  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2538  *
2539  * @adev: amdgpu_device pointer
2540  * @state: clockgating state (gate or ungate)
2541  *
2542  * The list of all the hardware IPs that make up the asic is walked and the
2543  * set_clockgating_state callbacks are run.
2544  * During the late init pass this enables clockgating for hardware IPs;
2545  * during fini or suspend it disables clockgating.
2546  * Returns 0 on success, negative error code on failure.
2547  */
2548 
2549 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2550 			       enum amd_clockgating_state state)
2551 {
2552 	int i, j, r;
2553 
2554 	if (amdgpu_emu_mode == 1)
2555 		return 0;
2556 
2557 	for (j = 0; j < adev->num_ip_blocks; j++) {
2558 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2559 		if (!adev->ip_blocks[i].status.late_initialized)
2560 			continue;
2561 		/* skip CG for GFX, SDMA on S0ix */
2562 		if (adev->in_s0ix &&
2563 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2564 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2565 			continue;
2566 		/* skip CG for VCE/UVD, it's handled specially */
2567 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2568 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2569 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2570 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2571 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2572 			/* enable clockgating to save power */
2573 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2574 										     state);
2575 			if (r) {
2576 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2577 					  adev->ip_blocks[i].version->funcs->name, r);
2578 				return r;
2579 			}
2580 		}
2581 	}
2582 
2583 	return 0;
2584 }
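/*
 * The index arithmetic above walks the block list forward when gating
 * and backward when ungating; e.g. with num_ip_blocks = 4 (illustrative):
 *
 *	gating   (AMD_CG_STATE_GATE):   i = 0, 1, 2, 3
 *	ungating (AMD_CG_STATE_UNGATE): i = 3, 2, 1, 0
 *
 * so clocks are re-enabled in the reverse of the order they were gated.
 * amdgpu_device_set_pg_state() below uses the same pattern.
 */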
2585 
2586 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2587 			       enum amd_powergating_state state)
2588 {
2589 	int i, j, r;
2590 
2591 	if (amdgpu_emu_mode == 1)
2592 		return 0;
2593 
2594 	for (j = 0; j < adev->num_ip_blocks; j++) {
2595 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2596 		if (!adev->ip_blocks[i].status.late_initialized)
2597 			continue;
2598 		/* skip PG for GFX, SDMA on S0ix */
2599 		if (adev->in_s0ix &&
2600 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2601 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2602 			continue;
2603 		/* skip PG for VCE/UVD, it's handled specially */
2604 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2605 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2606 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2607 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2608 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2609 			/* enable powergating to save power */
2610 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2611 											state);
2612 			if (r) {
2613 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2614 					  adev->ip_blocks[i].version->funcs->name, r);
2615 				return r;
2616 			}
2617 		}
2618 	}
2619 	return 0;
2620 }
2621 
2622 static int amdgpu_device_enable_mgpu_fan_boost(void)
2623 {
2624 	struct amdgpu_gpu_instance *gpu_ins;
2625 	struct amdgpu_device *adev;
2626 	int i, ret = 0;
2627 
2628 	mutex_lock(&mgpu_info.mutex);
2629 
2630 	/*
2631 	 * MGPU fan boost feature should be enabled
2632 	 * only when there are two or more dGPUs in
2633 	 * the system
2634 	 */
2635 	if (mgpu_info.num_dgpu < 2)
2636 		goto out;
2637 
2638 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2639 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2640 		adev = gpu_ins->adev;
2641 		if (!(adev->flags & AMD_IS_APU) &&
2642 		    !gpu_ins->mgpu_fan_enabled) {
2643 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2644 			if (ret)
2645 				break;
2646 
2647 			gpu_ins->mgpu_fan_enabled = 1;
2648 		}
2649 	}
2650 
2651 out:
2652 	mutex_unlock(&mgpu_info.mutex);
2653 
2654 	return ret;
2655 }
2656 
2657 /**
2658  * amdgpu_device_ip_late_init - run late init for hardware IPs
2659  *
2660  * @adev: amdgpu_device pointer
2661  *
2662  * Late initialization pass for hardware IPs.  The list of all the hardware
2663  * IPs that make up the asic is walked and the late_init callbacks are run.
2664  * late_init covers any special initialization that an IP requires
2665  * after all of the IPs have been initialized or something that needs to happen
2666  * late in the init process.
2667  * Returns 0 on success, negative error code on failure.
2668  */
2669 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2670 {
2671 	struct amdgpu_gpu_instance *gpu_instance;
2672 	int i = 0, r;
2673 
2674 	for (i = 0; i < adev->num_ip_blocks; i++) {
2675 		if (!adev->ip_blocks[i].status.hw)
2676 			continue;
2677 		if (adev->ip_blocks[i].version->funcs->late_init) {
2678 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2679 			if (r) {
2680 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2681 					  adev->ip_blocks[i].version->funcs->name, r);
2682 				return r;
2683 			}
2684 		}
2685 		adev->ip_blocks[i].status.late_initialized = true;
2686 	}
2687 
2688 	r = amdgpu_ras_late_init(adev);
2689 	if (r) {
2690 		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2691 		return r;
2692 	}
2693 
2694 	amdgpu_ras_set_error_query_ready(adev, true);
2695 
2696 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2697 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2698 
2699 	amdgpu_device_fill_reset_magic(adev);
2700 
2701 	r = amdgpu_device_enable_mgpu_fan_boost();
2702 	if (r)
2703 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2704 
2705 	/* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
2706 	if (amdgpu_passthrough(adev) &&
2707 	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2708 	     adev->asic_type == CHIP_ALDEBARAN))
2709 		amdgpu_dpm_handle_passthrough_sbr(adev, true);
2710 
2711 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2712 		mutex_lock(&mgpu_info.mutex);
2713 
2714 		/*
2715 		 * Reset device p-state to low as this was booted with high.
2716 		 *
2717 		 * This should be performed only after all devices from the same
2718 		 * hive get initialized.
2719 		 *
2720 		 * However, the number of devices in the hive is unknown in
2721 		 * advance, as it is counted one by one as devices initialize.
2722 		 *
2723 		 * So, we wait for all XGMI interlinked devices initialized.
2724 		 * This may bring some delays as those devices may come from
2725 		 * different hives. But that should be OK.
2726 		 */
2727 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2728 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2729 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2730 				if (gpu_instance->adev->flags & AMD_IS_APU)
2731 					continue;
2732 
2733 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2734 						AMDGPU_XGMI_PSTATE_MIN);
2735 				if (r) {
2736 					DRM_ERROR("pstate setting failed (%d).\n", r);
2737 					break;
2738 				}
2739 			}
2740 		}
2741 
2742 		mutex_unlock(&mgpu_info.mutex);
2743 	}
2744 
2745 	return 0;
2746 }
2747 
2748 /**
2749  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2750  *
2751  * @adev: amdgpu_device pointer
2752  *
2753  * For ASICs that need to disable the SMC first
2754  */
2755 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2756 {
2757 	int i, r;
2758 
2759 	if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2760 		return;
2761 
2762 	for (i = 0; i < adev->num_ip_blocks; i++) {
2763 		if (!adev->ip_blocks[i].status.hw)
2764 			continue;
2765 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2766 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2767 			/* XXX handle errors */
2768 			if (r) {
2769 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2770 					  adev->ip_blocks[i].version->funcs->name, r);
2771 			}
2772 			adev->ip_blocks[i].status.hw = false;
2773 			break;
2774 		}
2775 	}
2776 }
2777 
2778 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2779 {
2780 	int i, r;
2781 
2782 	for (i = 0; i < adev->num_ip_blocks; i++) {
2783 		if (!adev->ip_blocks[i].version->funcs->early_fini)
2784 			continue;
2785 
2786 		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2787 		if (r) {
2788 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2789 				  adev->ip_blocks[i].version->funcs->name, r);
2790 		}
2791 	}
2792 
2793 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2794 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2795 
2796 	amdgpu_amdkfd_suspend(adev, false);
2797 
2798 	/* Workaround for ASICs that need to disable the SMC first */
2799 	amdgpu_device_smu_fini_early(adev);
2800 
2801 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2802 		if (!adev->ip_blocks[i].status.hw)
2803 			continue;
2804 
2805 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2806 		/* XXX handle errors */
2807 		if (r) {
2808 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2809 				  adev->ip_blocks[i].version->funcs->name, r);
2810 		}
2811 
2812 		adev->ip_blocks[i].status.hw = false;
2813 	}
2814 
2815 	if (amdgpu_sriov_vf(adev)) {
2816 		if (amdgpu_virt_release_full_gpu(adev, false))
2817 			DRM_ERROR("failed to release exclusive mode on fini\n");
2818 	}
2819 
2820 	return 0;
2821 }
2822 
2823 /**
2824  * amdgpu_device_ip_fini - run fini for hardware IPs
2825  *
2826  * @adev: amdgpu_device pointer
2827  *
2828  * Main teardown pass for hardware IPs.  The list of all the hardware
2829  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2830  * are run.  hw_fini tears down the hardware associated with each IP
2831  * and sw_fini tears down any software state associated with each IP.
2832  * Returns 0 on success, negative error code on failure.
2833  */
2834 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2835 {
2836 	int i, r;
2837 
2838 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2839 		amdgpu_virt_release_ras_err_handler_data(adev);
2840 
2841 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2842 		amdgpu_xgmi_remove_device(adev);
2843 
2844 	amdgpu_amdkfd_device_fini_sw(adev);
2845 
2846 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2847 		if (!adev->ip_blocks[i].status.sw)
2848 			continue;
2849 
2850 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2851 			amdgpu_ucode_free_bo(adev);
2852 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2853 			amdgpu_device_wb_fini(adev);
2854 			amdgpu_device_mem_scratch_fini(adev);
2855 			amdgpu_ib_pool_fini(adev);
2856 		}
2857 
2858 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2859 		/* XXX handle errors */
2860 		if (r) {
2861 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2862 				  adev->ip_blocks[i].version->funcs->name, r);
2863 		}
2864 		adev->ip_blocks[i].status.sw = false;
2865 		adev->ip_blocks[i].status.valid = false;
2866 	}
2867 
2868 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2869 		if (!adev->ip_blocks[i].status.late_initialized)
2870 			continue;
2871 		if (adev->ip_blocks[i].version->funcs->late_fini)
2872 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2873 		adev->ip_blocks[i].status.late_initialized = false;
2874 	}
2875 
2876 	amdgpu_ras_fini(adev);
2877 
2878 	return 0;
2879 }
2880 
2881 /**
2882  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2883  *
2884  * @work: work_struct.
2885  */
2886 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2887 {
2888 	struct amdgpu_device *adev =
2889 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2890 	int r;
2891 
2892 	r = amdgpu_ib_ring_tests(adev);
2893 	if (r)
2894 		DRM_ERROR("ib ring test failed (%d).\n", r);
2895 }
2896 
2897 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2898 {
2899 	struct amdgpu_device *adev =
2900 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2901 
2902 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2903 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2904 
2905 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2906 		adev->gfx.gfx_off_state = true;
2907 }
2908 
2909 /**
2910  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2911  *
2912  * @adev: amdgpu_device pointer
2913  *
2914  * Main suspend function for hardware IPs.  The list of all the hardware
2915  * IPs that make up the asic is walked, clockgating is disabled and the
2916  * suspend callbacks are run.  suspend puts the hardware and software state
2917  * in each IP into a state suitable for suspend.
2918  * Returns 0 on success, negative error code on failure.
2919  */
2920 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2921 {
2922 	int i, r;
2923 
2924 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2925 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2926 
2927 	/*
2928 	 * Per PMFW team's suggestion, driver needs to handle gfxoff
2929 	 * and df cstate feature disablement for gpu reset (e.g. Mode1Reset)
2930 	 * scenarios.  Add the missing df cstate disablement here.
2931 	 */
2932 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2933 		dev_warn(adev->dev, "Failed to disallow df cstate");
2934 
2935 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2936 		if (!adev->ip_blocks[i].status.valid)
2937 			continue;
2938 
2939 		/* displays are handled separately */
2940 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2941 			continue;
2942 
2943 		/* XXX handle errors */
2944 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2945 		/* XXX handle errors */
2946 		if (r) {
2947 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2948 				  adev->ip_blocks[i].version->funcs->name, r);
2949 			return r;
2950 		}
2951 
2952 		adev->ip_blocks[i].status.hw = false;
2953 	}
2954 
2955 	return 0;
2956 }
2957 
2958 /**
2959  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2960  *
2961  * @adev: amdgpu_device pointer
2962  *
2963  * Main suspend function for hardware IPs.  The list of all the hardware
2964  * IPs that make up the asic is walked, clockgating is disabled and the
2965  * suspend callbacks are run.  suspend puts the hardware and software state
2966  * in each IP into a state suitable for suspend.
2967  * Returns 0 on success, negative error code on failure.
2968  */
2969 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2970 {
2971 	int i, r;
2972 
2973 	if (adev->in_s0ix)
2974 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2975 
2976 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2977 		if (!adev->ip_blocks[i].status.valid)
2978 			continue;
2979 		/* displays are handled in phase1 */
2980 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2981 			continue;
2982 		/* PSP lost connection when err_event_athub occurs */
2983 		if (amdgpu_ras_intr_triggered() &&
2984 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2985 			adev->ip_blocks[i].status.hw = false;
2986 			continue;
2987 		}
2988 
2989 		/* skip unnecessary suspends for blocks we have not initialized yet */
2990 		if (adev->gmc.xgmi.pending_reset &&
2991 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2992 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2993 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2994 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2995 			adev->ip_blocks[i].status.hw = false;
2996 			continue;
2997 		}
2998 
2999 		/* skip suspend of gfx/mes and psp for S0ix:
3000 		 * gfx is in the gfxoff state, so on resume it will exit gfxoff
3001 		 * just like at runtime.  PSP is also part of the always-on
3002 		 * hardware, so there is no need to suspend it.
3003 		 */
3004 		if (adev->in_s0ix &&
3005 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3006 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3007 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3008 			continue;
3009 
3010 		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3011 		if (adev->in_s0ix &&
3012 		    (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3013 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3014 			continue;
3015 
3016 		/* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3017 		 * These live in the TMR, hence are expected to be reused by PSP-TOS to
3018 		 * reload from this location, and RLC autoload also gets loaded from
3019 		 * here based on the PMFW -> PSP message during the re-init sequence.
3020 		 * Therefore, the psp suspend & resume should be skipped to avoid
3021 		 * destroying the TMR and reloading FWs again for IMU enabled APU ASICs.
3022 		 */
3023 		if (amdgpu_in_reset(adev) &&
3024 		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3025 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3026 			continue;
3027 
3028 		/* XXX handle errors */
3029 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
3030 		/* XXX handle errors */
3031 		if (r) {
3032 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
3033 				  adev->ip_blocks[i].version->funcs->name, r);
3034 		}
3035 		adev->ip_blocks[i].status.hw = false;
3036 		/* handle putting the SMC in the appropriate state */
3037 		if (!amdgpu_sriov_vf(adev)) {
3038 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3039 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3040 				if (r) {
3041 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3042 							adev->mp1_state, r);
3043 					return r;
3044 				}
3045 			}
3046 		}
3047 	}
3048 
3049 	return 0;
3050 }
3051 
3052 /**
3053  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3054  *
3055  * @adev: amdgpu_device pointer
3056  *
3057  * Main suspend function for hardware IPs.  The list of all the hardware
3058  * IPs that make up the asic is walked, clockgating is disabled and the
3059  * suspend callbacks are run.  suspend puts the hardware and software state
3060  * in each IP into a state suitable for suspend.
3061  * Returns 0 on success, negative error code on failure.
3062  */
3063 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3064 {
3065 	int r;
3066 
3067 	if (amdgpu_sriov_vf(adev)) {
3068 		amdgpu_virt_fini_data_exchange(adev);
3069 		amdgpu_virt_request_full_gpu(adev, false);
3070 	}
3071 
3072 	r = amdgpu_device_ip_suspend_phase1(adev);
3073 	if (r)
3074 		return r;
3075 	r = amdgpu_device_ip_suspend_phase2(adev);
3076 
3077 	if (amdgpu_sriov_vf(adev))
3078 		amdgpu_virt_release_full_gpu(adev, false);
3079 
3080 	return r;
3081 }
3082 
3083 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3084 {
3085 	int i, r;
3086 
3087 	static enum amd_ip_block_type ip_order[] = {
3088 		AMD_IP_BLOCK_TYPE_COMMON,
3089 		AMD_IP_BLOCK_TYPE_GMC,
3090 		AMD_IP_BLOCK_TYPE_PSP,
3091 		AMD_IP_BLOCK_TYPE_IH,
3092 	};
3093 
3094 	for (i = 0; i < adev->num_ip_blocks; i++) {
3095 		int j;
3096 		struct amdgpu_ip_block *block;
3097 
3098 		block = &adev->ip_blocks[i];
3099 		block->status.hw = false;
3100 
3101 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3102 
3103 			if (block->version->type != ip_order[j] ||
3104 				!block->status.valid)
3105 				continue;
3106 
3107 			r = block->version->funcs->hw_init(adev);
3108 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3109 			if (r)
3110 				return r;
3111 			block->status.hw = true;
3112 		}
3113 	}
3114 
3115 	return 0;
3116 }
3117 
3118 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3119 {
3120 	int i, r;
3121 
3122 	static enum amd_ip_block_type ip_order[] = {
3123 		AMD_IP_BLOCK_TYPE_SMC,
3124 		AMD_IP_BLOCK_TYPE_DCE,
3125 		AMD_IP_BLOCK_TYPE_GFX,
3126 		AMD_IP_BLOCK_TYPE_SDMA,
3127 		AMD_IP_BLOCK_TYPE_MES,
3128 		AMD_IP_BLOCK_TYPE_UVD,
3129 		AMD_IP_BLOCK_TYPE_VCE,
3130 		AMD_IP_BLOCK_TYPE_VCN,
3131 		AMD_IP_BLOCK_TYPE_JPEG
3132 	};
3133 
3134 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3135 		int j;
3136 		struct amdgpu_ip_block *block;
3137 
3138 		for (j = 0; j < adev->num_ip_blocks; j++) {
3139 			block = &adev->ip_blocks[j];
3140 
3141 			if (block->version->type != ip_order[i] ||
3142 				!block->status.valid ||
3143 				block->status.hw)
3144 				continue;
3145 
3146 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3147 				r = block->version->funcs->resume(adev);
3148 			else
3149 				r = block->version->funcs->hw_init(adev);
3150 
3151 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3152 			if (r)
3153 				return r;
3154 			block->status.hw = true;
3155 		}
3156 	}
3157 
3158 	return 0;
3159 }
3160 
3161 /**
3162  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3163  *
3164  * @adev: amdgpu_device pointer
3165  *
3166  * First resume function for hardware IPs.  The list of all the hardware
3167  * IPs that make up the asic is walked and the resume callbacks are run for
3168  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3169  * after a suspend and updates the software state as necessary.  This
3170  * function is also used for restoring the GPU after a GPU reset.
3171  * Returns 0 on success, negative error code on failure.
3172  */
3173 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3174 {
3175 	int i, r;
3176 
3177 	for (i = 0; i < adev->num_ip_blocks; i++) {
3178 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3179 			continue;
3180 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3181 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3182 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3183 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3184 
3185 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3186 			if (r) {
3187 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3188 					  adev->ip_blocks[i].version->funcs->name, r);
3189 				return r;
3190 			}
3191 			adev->ip_blocks[i].status.hw = true;
3192 		}
3193 	}
3194 
3195 	return 0;
3196 }
3197 
3198 /**
3199  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3200  *
3201  * @adev: amdgpu_device pointer
3202  *
3203  * Second resume function for hardware IPs.  The list of all the hardware
3204  * IPs that make up the asic is walked and the resume callbacks are run for
3205  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3206  * functional state after a suspend and updates the software state as
3207  * necessary.  This function is also used for restoring the GPU after a GPU
3208  * reset.
3209  * Returns 0 on success, negative error code on failure.
3210  */
3211 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3212 {
3213 	int i, r;
3214 
3215 	for (i = 0; i < adev->num_ip_blocks; i++) {
3216 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3217 			continue;
3218 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3219 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3220 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3221 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3222 			continue;
3223 		r = adev->ip_blocks[i].version->funcs->resume(adev);
3224 		if (r) {
3225 			DRM_ERROR("resume of IP block <%s> failed %d\n",
3226 				  adev->ip_blocks[i].version->funcs->name, r);
3227 			return r;
3228 		}
3229 		adev->ip_blocks[i].status.hw = true;
3230 	}
3231 
3232 	return 0;
3233 }
3234 
3235 /**
3236  * amdgpu_device_ip_resume - run resume for hardware IPs
3237  *
3238  * @adev: amdgpu_device pointer
3239  *
3240  * Main resume function for hardware IPs.  The hardware IPs
3241  * are split into two resume functions because they are
3242  * also used in recovering from a GPU reset and some additional
3243  * steps need to be taken between them.  In this case (S3/S4) they are
3244  * run sequentially.
3245  * Returns 0 on success, negative error code on failure.
3246  */
3247 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3248 {
3249 	int r;
3250 
3251 	r = amdgpu_device_ip_resume_phase1(adev);
3252 	if (r)
3253 		return r;
3254 
3255 	r = amdgpu_device_fw_loading(adev);
3256 	if (r)
3257 		return r;
3258 
3259 	r = amdgpu_device_ip_resume_phase2(adev);
3260 
3261 	return r;
3262 }
3263 
3264 /**
3265  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3266  *
3267  * @adev: amdgpu_device pointer
3268  *
3269  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3270  */
3271 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3272 {
3273 	if (amdgpu_sriov_vf(adev)) {
3274 		if (adev->is_atom_fw) {
3275 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3276 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3277 		} else {
3278 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3279 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3280 		}
3281 
3282 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3283 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3284 	}
3285 }
3286 
3287 /**
3288  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3289  *
3290  * @asic_type: AMD asic type
3291  *
3292  * Check if there is DC (new modesetting infrastructure) support for an asic.
3293  * returns true if DC has support, false if not.
3294  */
3295 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3296 {
3297 	switch (asic_type) {
3298 #ifdef CONFIG_DRM_AMDGPU_SI
3299 	case CHIP_HAINAN:
3300 #endif
3301 	case CHIP_TOPAZ:
3302 		/* chips with no display hardware */
3303 		return false;
3304 #if defined(CONFIG_DRM_AMD_DC)
3305 	case CHIP_TAHITI:
3306 	case CHIP_PITCAIRN:
3307 	case CHIP_VERDE:
3308 	case CHIP_OLAND:
3309 		/*
3310 		 * We have systems in the wild with these ASICs that require
3311 		 * LVDS and VGA support which is not supported with DC.
3312 		 *
3313 		 * Fall back to the non-DC driver here by default so as not to
3314 		 * cause regressions.
3315 		 */
3316 #if defined(CONFIG_DRM_AMD_DC_SI)
3317 		return amdgpu_dc > 0;
3318 #else
3319 		return false;
3320 #endif
3321 	case CHIP_BONAIRE:
3322 	case CHIP_KAVERI:
3323 	case CHIP_KABINI:
3324 	case CHIP_MULLINS:
3325 		/*
3326 		 * We have systems in the wild with these ASICs that require
3327 		 * VGA support which is not supported with DC.
3328 		 *
3329 		 * Fall back to the non-DC driver here by default so as not to
3330 		 * cause regressions.
3331 		 */
3332 		return amdgpu_dc > 0;
3333 	default:
3334 		return amdgpu_dc != 0;
3335 #else
3336 	default:
3337 		if (amdgpu_dc > 0)
3338 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3339 		return false;
3340 #endif
3341 	}
3342 }
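/*
 * The amdgpu_dc module parameter consulted above is effectively a
 * tri-state (illustrative summary): -1 (the default) selects DC wherever
 * it is supported, 0 forces the legacy display path, and 1 forces DC on,
 * including on the SI/CIK parts that otherwise default to the non-DC
 * driver.  For example, booting with:
 *
 *	amdgpu.dc=1
 *
 * on a CHIP_BONAIRE system opts into DC despite the VGA caveat noted
 * above.
 */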
3343 
3344 /**
3345  * amdgpu_device_has_dc_support - check if dc is supported
3346  *
3347  * @adev: amdgpu_device pointer
3348  *
3349  * Returns true for supported, false for not supported
3350  */
3351 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3352 {
3353 	if (adev->enable_virtual_display ||
3354 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3355 		return false;
3356 
3357 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3358 }
3359 
3360 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3361 {
3362 	struct amdgpu_device *adev =
3363 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3364 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3365 
3366 	/* It's a bug to not have a hive within this function */
3367 	if (WARN_ON(!hive))
3368 		return;
3369 
3370 	/*
3371 	 * Use task barrier to synchronize all xgmi reset works across the
3372 	 * hive. task_barrier_enter and task_barrier_exit will block
3373 	 * until all the threads running the xgmi reset works reach
3374 	 * those points. task_barrier_full will do both blocks.
3375 	 */
3376 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3377 
3378 		task_barrier_enter(&hive->tb);
3379 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3380 
3381 		if (adev->asic_reset_res)
3382 			goto fail;
3383 
3384 		task_barrier_exit(&hive->tb);
3385 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3386 
3387 		if (adev->asic_reset_res)
3388 			goto fail;
3389 
3390 		if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3391 		    adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3392 			adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3393 	} else {
3394 
3395 		task_barrier_full(&hive->tb);
3396 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3397 	}
3398 
3399 fail:
3400 	if (adev->asic_reset_res)
3401 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3402 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3403 	amdgpu_put_xgmi_hive(hive);
3404 }
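
/*
 * Illustration (not driver code): a minimal sketch of the task-barrier
 * pattern used above, assuming one xgmi_reset_work worker per hive
 * member.
 */
#if 0
static void example_hive_step(struct task_barrier *tb)
{
	task_barrier_enter(tb);	/* block until every node has arrived */
	/* ... step all nodes must begin together, e.g. BACO enter ... */
	task_barrier_exit(tb);	/* block until every node has finished */
	/* task_barrier_full() performs both waits with no step between */
}
#endif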
3405 
3406 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3407 {
3408 	char *input = amdgpu_lockup_timeout;
3409 	char *timeout_setting = NULL;
3410 	int index = 0;
3411 	long timeout;
3412 	int ret = 0;
3413 
3414 	/*
3415 	 * By default the timeout for non-compute jobs is 10000 ms
3416 	 * and 60000 ms for compute jobs.
3417 	 * In SR-IOV or passthrough mode, the timeout for compute
3418 	 * jobs is 60000 ms by default.
3419 	 */
3420 	adev->gfx_timeout = msecs_to_jiffies(10000);
3421 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3422 	if (amdgpu_sriov_vf(adev))
3423 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3424 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3425 	else
3426 		adev->compute_timeout =  msecs_to_jiffies(60000);
3427 
3428 #ifdef notyet
3429 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3430 		while ((timeout_setting = strsep(&input, ",")) &&
3431 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3432 			ret = kstrtol(timeout_setting, 0, &timeout);
3433 			if (ret)
3434 				return ret;
3435 
3436 			if (timeout == 0) {
3437 				index++;
3438 				continue;
3439 			} else if (timeout < 0) {
3440 				timeout = MAX_SCHEDULE_TIMEOUT;
3441 				dev_warn(adev->dev, "lockup timeout disabled");
3442 				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3443 			} else {
3444 				timeout = msecs_to_jiffies(timeout);
3445 			}
3446 
3447 			switch (index++) {
3448 			case 0:
3449 				adev->gfx_timeout = timeout;
3450 				break;
3451 			case 1:
3452 				adev->compute_timeout = timeout;
3453 				break;
3454 			case 2:
3455 				adev->sdma_timeout = timeout;
3456 				break;
3457 			case 3:
3458 				adev->video_timeout = timeout;
3459 				break;
3460 			default:
3461 				break;
3462 			}
3463 		}
3464 		/*
3465 		 * There is only one value specified and
3466 		 * it should apply to all non-compute jobs.
3467 		 */
3468 		if (index == 1) {
3469 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3470 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3471 				adev->compute_timeout = adev->gfx_timeout;
3472 		}
3473 	}
3474 #endif
3475 
3476 	return ret;
3477 }
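
/*
 * Note (illustrative): the parser above (compiled out here under
 * "notyet") accepts up to four comma-separated values, e.g. on Linux:
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *	                      (gfx, compute, sdma, video; in milliseconds)
 *
 * A single value applies to all non-compute queues, 0 keeps the default,
 * and a negative value disables that timeout entirely.
 */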
3478 
3479 /**
3480  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3481  *
3482  * @adev: amdgpu_device pointer
3483  *
3484  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode.
3485  */
3486 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3487 {
3488 #ifdef notyet
3489 	struct iommu_domain *domain;
3490 
3491 	domain = iommu_get_domain_for_dev(adev->dev);
3492 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3493 #endif
3494 		adev->ram_is_direct_mapped = true;
3495 }
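
/*
 * Note: on the Linux side an IOMMU_DOMAIN_IDENTITY (passthrough) domain
 * means DMA addresses equal physical addresses, so RAM is treated as
 * direct mapped; with the iommu query under "notyet", this build sets
 * ram_is_direct_mapped unconditionally.
 */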
3496 
3497 static const struct attribute *amdgpu_dev_attributes[] = {
3498 	&dev_attr_pcie_replay_count.attr,
3499 	NULL
3500 };
3501 
3502 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3503 {
3504 	if (amdgpu_mcbp == 1)
3505 		adev->gfx.mcbp = true;
3506 	else if (amdgpu_mcbp == 0)
3507 		adev->gfx.mcbp = false;
3508 
3509 	if (amdgpu_sriov_vf(adev))
3510 		adev->gfx.mcbp = true;
3511 
3512 	if (adev->gfx.mcbp)
3513 		DRM_INFO("MCBP is enabled\n");
3514 }
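
/*
 * Note (illustrative): MCBP is mid-command-buffer preemption. The
 * amdgpu_mcbp module parameter is effectively tri-state (1 = force on,
 * 0 = force off, anything else keeps the per-ASIC default), and SR-IOV
 * VFs force it on regardless, as the code above shows.
 */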
3515 
3516 /**
3517  * amdgpu_device_init - initialize the driver
3518  *
3519  * @adev: amdgpu_device pointer
3520  * @flags: driver flags
3521  *
3522  * Initializes the driver info and hw (all asics).
3523  * Returns 0 for success or an error on failure.
3524  * Called at driver startup.
3525  */
3526 int amdgpu_device_init(struct amdgpu_device *adev,
3527 		       uint32_t flags)
3528 {
3529 	struct drm_device *ddev = adev_to_drm(adev);
3530 	struct pci_dev *pdev = adev->pdev;
3531 	int r, i;
3532 	bool px = false;
3533 	u32 max_MBps;
3534 	int tmp;
3535 
3536 	adev->shutdown = false;
3537 	adev->flags = flags;
3538 
3539 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3540 		adev->asic_type = amdgpu_force_asic_type;
3541 	else
3542 		adev->asic_type = flags & AMD_ASIC_MASK;
3543 
3544 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3545 	if (amdgpu_emu_mode == 1)
3546 		adev->usec_timeout *= 10;
3547 	adev->gmc.gart_size = 512 * 1024 * 1024;
3548 	adev->accel_working = false;
3549 	adev->num_rings = 0;
3550 	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3551 	adev->mman.buffer_funcs = NULL;
3552 	adev->mman.buffer_funcs_ring = NULL;
3553 	adev->vm_manager.vm_pte_funcs = NULL;
3554 	adev->vm_manager.vm_pte_num_scheds = 0;
3555 	adev->gmc.gmc_funcs = NULL;
3556 	adev->harvest_ip_mask = 0x0;
3557 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3558 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3559 
3560 	adev->smc_rreg = &amdgpu_invalid_rreg;
3561 	adev->smc_wreg = &amdgpu_invalid_wreg;
3562 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3563 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3564 	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3565 	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3566 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3567 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3568 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3569 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3570 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3571 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3572 	adev->didt_rreg = &amdgpu_invalid_rreg;
3573 	adev->didt_wreg = &amdgpu_invalid_wreg;
3574 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3575 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3576 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3577 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3578 
3579 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3580 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3581 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3582 
3583 	/* mutex initialization is all done here so we
3584 	 * can call functions without running into locking issues
3585 	 */
3586 	rw_init(&adev->firmware.mutex, "agfw");
3587 	rw_init(&adev->pm.mutex, "agpm");
3588 	rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3589 	rw_init(&adev->srbm_mutex, "srbm");
3590 	rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3591 	rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3592 	rw_init(&adev->gfx.partition_mutex, "gfxpar");
3593 	rw_init(&adev->grbm_idx_mutex, "grbmidx");
3594 	rw_init(&adev->mn_lock, "agpumn");
3595 	rw_init(&adev->virt.vf_errors.lock, "vferr");
3596 	rw_init(&adev->virt.rlcg_reg_lock, "vrlcg");
3597 	hash_init(adev->mn_hash);
3598 	rw_init(&adev->psp.mutex, "agpsp");
3599 	rw_init(&adev->notifier_lock, "agnf");
3600 	rw_init(&adev->pm.stable_pstate_ctx_lock, "agps");
3601 	rw_init(&adev->benchmark_mutex, "agbm");
3602 
3603 	amdgpu_device_init_apu_flags(adev);
3604 
3605 	r = amdgpu_device_check_arguments(adev);
3606 	if (r)
3607 		return r;
3608 
3609 	mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3610 	mtx_init(&adev->smc_idx_lock, IPL_TTY);
3611 	mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3612 	mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3613 	mtx_init(&adev->didt_idx_lock, IPL_TTY);
3614 	mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3615 	mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3616 	mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3617 	mtx_init(&adev->mm_stats.lock, IPL_NONE);
3618 
3619 	INIT_LIST_HEAD(&adev->shadow_list);
3620 	rw_init(&adev->shadow_list_lock, "sdwlst");
3621 
3622 	INIT_LIST_HEAD(&adev->reset_list);
3623 
3624 	INIT_LIST_HEAD(&adev->ras_list);
3625 
3626 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3627 			  amdgpu_device_delayed_init_work_handler);
3628 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3629 			  amdgpu_device_delay_enable_gfx_off);
3630 
3631 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3632 
3633 	adev->gfx.gfx_off_req_count = 1;
3634 	adev->gfx.gfx_off_residency = 0;
3635 	adev->gfx.gfx_off_entrycount = 0;
3636 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3637 
3638 	atomic_set(&adev->throttling_logging_enabled, 1);
3639 	/*
3640 	 * If throttling continues, logging will be performed every minute
3641 	 * to avoid log flooding. "-1" is subtracted since the thermal
3642 	 * throttling interrupt comes every second. Thus, the total logging
3643 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3644 	 * for throttling interrupt) = 60 seconds.
3645 	 */
3646 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3647 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3648 
3649 #ifdef __linux__
3650 	/* Registers mapping */
3651 	/* TODO: block userspace mapping of io register */
3652 	if (adev->asic_type >= CHIP_BONAIRE) {
3653 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3654 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3655 	} else {
3656 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3657 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3658 	}
3659 #endif
3660 
3661 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3662 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3663 
3664 #ifdef __linux__
3665 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3666 	if (!adev->rmmio)
3667 		return -ENOMEM;
3668 #endif
3669 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3670 	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3671 
3672 	/*
3673 	 * The reset domain needs to be present early, before the XGMI hive is
3674 	 * discovered (if any) and initialized, so the reset sem and in_gpu_reset
3675 	 * flag can be used early during init and before any call to RREG32.
3676 	 */
3677 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3678 	if (!adev->reset_domain)
3679 		return -ENOMEM;
3680 
3681 	/* detect hw virtualization here */
3682 	amdgpu_detect_virtualization(adev);
3683 
3684 	amdgpu_device_get_pcie_info(adev);
3685 
3686 	r = amdgpu_device_get_job_timeout_settings(adev);
3687 	if (r) {
3688 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3689 		return r;
3690 	}
3691 
3692 	/* early init functions */
3693 	r = amdgpu_device_ip_early_init(adev);
3694 	if (r)
3695 		return r;
3696 
3697 	amdgpu_device_set_mcbp(adev);
3698 
3699 	/* Get rid of things like offb */
3700 	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3701 	if (r)
3702 		return r;
3703 
3704 	/* Enable TMZ based on IP_VERSION */
3705 	amdgpu_gmc_tmz_set(adev);
3706 
3707 	amdgpu_gmc_noretry_set(adev);
3708 	/* Need to get xgmi info early to decide the reset behavior */
3709 	if (adev->gmc.xgmi.supported) {
3710 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
3711 		if (r)
3712 			return r;
3713 	}
3714 
3715 	/* enable PCIE atomic ops */
3716 #ifdef notyet
3717 	if (amdgpu_sriov_vf(adev)) {
3718 		if (adev->virt.fw_reserve.p_pf2vf)
3719 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3720 						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3721 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3722 	/* APUs with gfx9 onwards don't rely on PCIe atomics; the internal
3723 	 * path natively supports atomics, so set have_atomics_support to true.
3724 	 */
3725 	} else if ((adev->flags & AMD_IS_APU) &&
3726 		   (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3727 		adev->have_atomics_support = true;
3728 	} else {
3729 		adev->have_atomics_support =
3730 			!pci_enable_atomic_ops_to_root(adev->pdev,
3731 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3732 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3733 	}
3734 
3735 	if (!adev->have_atomics_support)
3736 		dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3737 #else
3738 	/* APUs with gfx9 onwards don't rely on PCIe atomics; the internal
3739 	 * path natively supports atomics, so set have_atomics_support to true.
3740 	 */
3741 	if ((adev->flags & AMD_IS_APU) &&
3742 		(adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)))
3743 		adev->have_atomics_support = true;
3744 	else
3745 		adev->have_atomics_support = false;
3746 #endif
3747 
3748 	/* doorbell bar mapping and doorbell index init */
3749 	amdgpu_doorbell_init(adev);
3750 
3751 	if (amdgpu_emu_mode == 1) {
3752 		/* post the asic on emulation mode */
3753 		emu_soc_asic_init(adev);
3754 		goto fence_driver_init;
3755 	}
3756 
3757 	amdgpu_reset_init(adev);
3758 
3759 	/* detect if we are with an SRIOV vbios */
3760 	if (adev->bios)
3761 		amdgpu_device_detect_sriov_bios(adev);
3762 
3763 	/* check if we need to reset the asic
3764 	 *  E.g., driver was not cleanly unloaded previously, etc.
3765 	 */
3766 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3767 		if (adev->gmc.xgmi.num_physical_nodes) {
3768 			dev_info(adev->dev, "Pending hive reset.\n");
3769 			adev->gmc.xgmi.pending_reset = true;
3770 			/* Only need to init necessary block for SMU to handle the reset */
3771 			for (i = 0; i < adev->num_ip_blocks; i++) {
3772 				if (!adev->ip_blocks[i].status.valid)
3773 					continue;
3774 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3775 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3776 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3777 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3778 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3779 						adev->ip_blocks[i].version->funcs->name);
3780 					adev->ip_blocks[i].status.hw = true;
3781 				}
3782 			}
3783 		} else {
3784 			tmp = amdgpu_reset_method;
3785 			/* It should do a default reset when loading or reloading the driver,
3786 			 * regardless of the module parameter reset_method.
3787 			 */
3788 			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3789 			r = amdgpu_asic_reset(adev);
3790 			amdgpu_reset_method = tmp;
3791 			if (r) {
3792 				dev_err(adev->dev, "asic reset on init failed\n");
3793 				goto failed;
3794 			}
3795 		}
3796 	}
3797 
3798 	/* Post card if necessary */
3799 	if (amdgpu_device_need_post(adev)) {
3800 		if (!adev->bios) {
3801 			dev_err(adev->dev, "no vBIOS found\n");
3802 			r = -EINVAL;
3803 			goto failed;
3804 		}
3805 		DRM_INFO("GPU posting now...\n");
3806 		r = amdgpu_device_asic_init(adev);
3807 		if (r) {
3808 			dev_err(adev->dev, "gpu post error!\n");
3809 			goto failed;
3810 		}
3811 	}
3812 
3813 	if (adev->bios) {
3814 		if (adev->is_atom_fw) {
3815 			/* Initialize clocks */
3816 			r = amdgpu_atomfirmware_get_clock_info(adev);
3817 			if (r) {
3818 				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3819 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3820 				goto failed;
3821 			}
3822 		} else {
3823 			/* Initialize clocks */
3824 			r = amdgpu_atombios_get_clock_info(adev);
3825 			if (r) {
3826 				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3827 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3828 				goto failed;
3829 			}
3830 			/* init i2c buses */
3831 			if (!amdgpu_device_has_dc_support(adev))
3832 				amdgpu_atombios_i2c_init(adev);
3833 		}
3834 	}
3835 
3836 fence_driver_init:
3837 	/* Fence driver */
3838 	r = amdgpu_fence_driver_sw_init(adev);
3839 	if (r) {
3840 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3841 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3842 		goto failed;
3843 	}
3844 
3845 	/* init the mode config */
3846 	drm_mode_config_init(adev_to_drm(adev));
3847 
3848 	r = amdgpu_device_ip_init(adev);
3849 	if (r) {
3850 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3851 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3852 		goto release_ras_con;
3853 	}
3854 
3855 	amdgpu_fence_driver_hw_init(adev);
3856 
3857 	dev_info(adev->dev,
3858 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3859 			adev->gfx.config.max_shader_engines,
3860 			adev->gfx.config.max_sh_per_se,
3861 			adev->gfx.config.max_cu_per_sh,
3862 			adev->gfx.cu_info.number);
3863 
3864 #ifdef __OpenBSD__
3865 {
3866 	const char *chip_name;
3867 	uint32_t version = adev->ip_versions[GC_HWIP][0];
3868 	int maj, min, rev;
3869 
3870 	switch (adev->asic_type) {
3871 	case CHIP_RAVEN:
3872 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3873 			chip_name = "RAVEN2";
3874 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3875 			chip_name = "PICASSO";
3876 		else
3877 			chip_name = "RAVEN";
3878 		break;
3879 	case CHIP_RENOIR:
3880 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
3881 			chip_name = "RENOIR";
3882 		else
3883 			chip_name = "GREEN_SARDINE";
3884 		break;
3885 	default:
3886 		chip_name = amdgpu_asic_name[adev->asic_type];
3887 	}
3888 
3889 	printf("%s: %s", adev->self.dv_xname, chip_name);
3890 	/* show graphics/compute ip block version, not set on < GFX9 */
3891 	if (version) {
3892 		maj = IP_VERSION_MAJ(version);
3893 		min = IP_VERSION_MIN(version);
3894 		rev = IP_VERSION_REV(version);
3895 		printf(" GC %d.%d.%d", maj, min, rev);
3896 	}
3897 	printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id);
3898 }
3899 #endif
3900 
3901 	adev->accel_working = true;
3902 
3903 	amdgpu_vm_check_compute_bug(adev);
3904 
3905 	/* Initialize the buffer migration limit. */
3906 	if (amdgpu_moverate >= 0)
3907 		max_MBps = amdgpu_moverate;
3908 	else
3909 		max_MBps = 8; /* Allow 8 MB/s. */
3910 	/* Get a log2 for easy divisions. */
3911 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3912 
3913 	r = amdgpu_atombios_sysfs_init(adev);
3914 	if (r)
3915 		drm_err(&adev->ddev,
3916 			"registering atombios sysfs failed (%d).\n", r);
3917 
3918 	r = amdgpu_pm_sysfs_init(adev);
3919 	if (r)
3920 		DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3921 
3922 	r = amdgpu_ucode_sysfs_init(adev);
3923 	if (r) {
3924 		adev->ucode_sysfs_en = false;
3925 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3926 	} else
3927 		adev->ucode_sysfs_en = true;
3928 
3929 	/*
3930 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3931 	 * Otherwise the mgpu fan boost feature will be skipped because the
3932 	 * gpu instance count would be too low.
3933 	 */
3934 	amdgpu_register_gpu_instance(adev);
3935 
3936 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3937 	 * explicit gating rather than handling it automatically.
3938 	 */
3939 	if (!adev->gmc.xgmi.pending_reset) {
3940 		r = amdgpu_device_ip_late_init(adev);
3941 		if (r) {
3942 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3943 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3944 			goto release_ras_con;
3945 		}
3946 		/* must succeed. */
3947 		amdgpu_ras_resume(adev);
3948 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3949 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3950 	}
3951 
3952 	if (amdgpu_sriov_vf(adev)) {
3953 		amdgpu_virt_release_full_gpu(adev, true);
3954 		flush_delayed_work(&adev->delayed_init_work);
3955 	}
3956 
3957 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3958 	if (r)
3959 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3960 
3961 	amdgpu_fru_sysfs_init(adev);
3962 
3963 	r = IS_ENABLED(CONFIG_PERF_EVENTS) ?
3964 		amdgpu_pmu_init(adev) : 0;	/* don't test a stale r below */
3965 	if (r)
3966 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3967 
3968 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3969 	if (amdgpu_device_cache_pci_state(adev->pdev))
3970 		pci_restore_state(pdev);
3971 
3972 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3973 	/* this will fail for cards that aren't VGA class devices, just
3974 	 * ignore it
3975 	 */
3976 #ifdef notyet
3977 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3978 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3979 #endif
3980 
3981 	px = amdgpu_device_supports_px(ddev);
3982 
3983 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
3984 				apple_gmux_detect(NULL, NULL)))
3985 		vga_switcheroo_register_client(adev->pdev,
3986 					       &amdgpu_switcheroo_ops, px);
3987 
3988 	if (px)
3989 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3990 
3991 	if (adev->gmc.xgmi.pending_reset)
3992 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3993 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3994 
3995 	amdgpu_device_check_iommu_direct_map(adev);
3996 
3997 	return 0;
3998 
3999 release_ras_con:
4000 	if (amdgpu_sriov_vf(adev))
4001 		amdgpu_virt_release_full_gpu(adev, true);
4002 
4003 	/* failed in exclusive mode due to timeout */
4004 	if (amdgpu_sriov_vf(adev) &&
4005 		!amdgpu_sriov_runtime(adev) &&
4006 		amdgpu_virt_mmio_blocked(adev) &&
4007 		!amdgpu_virt_wait_reset(adev)) {
4008 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4009 		/* Don't send request since VF is inactive. */
4010 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4011 		adev->virt.ops = NULL;
4012 		r = -EAGAIN;
4013 	}
4014 	amdgpu_release_ras_context(adev);
4015 
4016 failed:
4017 	amdgpu_vf_error_trans_all(adev);
4018 
4019 	return r;
4020 }
4021 
4022 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4023 {
4024 	STUB();
4025 #ifdef notyet
4026 
4027 	/* Clear all CPU mappings pointing to this device */
4028 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4029 #endif
4030 
4031 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
4032 	amdgpu_doorbell_fini(adev);
4033 
4034 #ifdef __linux__
4035 	iounmap(adev->rmmio);
4036 	adev->rmmio = NULL;
4037 	if (adev->mman.aper_base_kaddr)
4038 		iounmap(adev->mman.aper_base_kaddr);
4039 	adev->mman.aper_base_kaddr = NULL;
4040 #else
4041 	if (adev->rmmio_size > 0)
4042 		bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4043 		    adev->rmmio_size);
4044 	adev->rmmio_size = 0;
4045 	adev->rmmio = NULL;
4046 	if (adev->mman.aper_base_kaddr)
4047 		bus_space_unmap(adev->memt, adev->mman.aper_bsh,
4048 		    adev->gmc.visible_vram_size);
4049 	adev->mman.aper_base_kaddr = NULL;
4050 #endif
4051 
4052 	/* Memory manager related */
4053 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4054 #ifdef __linux__
4055 		arch_phys_wc_del(adev->gmc.vram_mtrr);
4056 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4057 #else
4058 		drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
4059 #endif
4060 	}
4061 }
4062 
4063 /**
4064  * amdgpu_device_fini_hw - tear down the driver
4065  *
4066  * @adev: amdgpu_device pointer
4067  *
4068  * Tear down the driver info (all asics).
4069  * Called at driver shutdown.
4070  */
4071 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4072 {
4073 	dev_info(adev->dev, "amdgpu: finishing device.\n");
4074 	flush_delayed_work(&adev->delayed_init_work);
4075 	adev->shutdown = true;
4076 
4077 	/* make sure IB test finished before entering exclusive mode
4078 	 * to avoid preemption on IB test
4079 	 */
4080 	if (amdgpu_sriov_vf(adev)) {
4081 		amdgpu_virt_request_full_gpu(adev, false);
4082 		amdgpu_virt_fini_data_exchange(adev);
4083 	}
4084 
4085 	/* disable all interrupts */
4086 	amdgpu_irq_disable_all(adev);
4087 	if (adev->mode_info.mode_config_initialized) {
4088 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4089 			drm_helper_force_disable_all(adev_to_drm(adev));
4090 		else
4091 			drm_atomic_helper_shutdown(adev_to_drm(adev));
4092 	}
4093 	amdgpu_fence_driver_hw_fini(adev);
4094 
4095 	if (adev->mman.initialized)
4096 		drain_workqueue(adev->mman.bdev.wq);
4097 
4098 	if (adev->pm.sysfs_initialized)
4099 		amdgpu_pm_sysfs_fini(adev);
4100 	if (adev->ucode_sysfs_en)
4101 		amdgpu_ucode_sysfs_fini(adev);
4102 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4103 	amdgpu_fru_sysfs_fini(adev);
4104 
4105 	/* disable ras feature must before hw fini */
4106 	amdgpu_ras_pre_fini(adev);
4107 
4108 	amdgpu_device_ip_fini_early(adev);
4109 
4110 	amdgpu_irq_fini_hw(adev);
4111 
4112 	if (adev->mman.initialized)
4113 		ttm_device_clear_dma_mappings(&adev->mman.bdev);
4114 
4115 	amdgpu_gart_dummy_page_fini(adev);
4116 
4117 	if (drm_dev_is_unplugged(adev_to_drm(adev)))
4118 		amdgpu_device_unmap_mmio(adev);
4119 
4120 }
4121 
4122 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4123 {
4124 	int idx;
4125 	bool px;
4126 
4127 	amdgpu_fence_driver_sw_fini(adev);
4128 	amdgpu_device_ip_fini(adev);
4129 	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4130 	adev->accel_working = false;
4131 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4132 
4133 	amdgpu_reset_fini(adev);
4134 
4135 	/* free i2c buses */
4136 	if (!amdgpu_device_has_dc_support(adev))
4137 		amdgpu_i2c_fini(adev);
4138 
4139 	if (amdgpu_emu_mode != 1)
4140 		amdgpu_atombios_fini(adev);
4141 
4142 	kfree(adev->bios);
4143 	adev->bios = NULL;
4144 
4145 	px = amdgpu_device_supports_px(adev_to_drm(adev));
4146 
4147 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4148 				apple_gmux_detect(NULL, NULL)))
4149 		vga_switcheroo_unregister_client(adev->pdev);
4150 
4151 	if (px)
4152 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4153 
4154 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4155 		vga_client_unregister(adev->pdev);
4156 
4157 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4158 #ifdef __linux__
4159 		iounmap(adev->rmmio);
4160 		adev->rmmio = NULL;
4161 #else
4162 		if (adev->rmmio_size > 0)
4163 			bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4164 			    adev->rmmio_size);
4165 		adev->rmmio_size = 0;
4166 		adev->rmmio = NULL;
4167 #endif
4168 		amdgpu_doorbell_fini(adev);
4169 		drm_dev_exit(idx);
4170 	}
4171 
4172 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4173 		amdgpu_pmu_fini(adev);
4174 	if (adev->mman.discovery_bin)
4175 		amdgpu_discovery_fini(adev);
4176 
4177 	amdgpu_reset_put_reset_domain(adev->reset_domain);
4178 	adev->reset_domain = NULL;
4179 
4180 	kfree(adev->pci_state);
4181 
4182 }
4183 
4184 /**
4185  * amdgpu_device_evict_resources - evict device resources
4186  * @adev: amdgpu device object
4187  *
4188  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4189  * of the vram memory type. Mainly used for evicting device resources
4190  * at suspend time.
4191  *
4192  */
4193 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4194 {
4195 	int ret;
4196 
4197 	/* No need to evict vram on APUs for suspend to ram or s2idle */
4198 	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4199 		return 0;
4200 
4201 	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4202 	if (ret)
4203 		DRM_WARN("evicting device resources failed\n");
4204 	return ret;
4205 }
4206 
4207 /*
4208  * Suspend & resume.
4209  */
4210 /**
4211  * amdgpu_device_prepare - prepare for device suspend
4212  *
4213  * @dev: drm dev pointer
4214  *
4215  * Prepare to put the hw in the suspend state (all asics).
4216  * Returns 0 for success or an error on failure.
4217  * Called at driver suspend.
4218  */
4219 int amdgpu_device_prepare(struct drm_device *dev)
4220 {
4221 	struct amdgpu_device *adev = drm_to_adev(dev);
4222 	int i, r;
4223 
4224 	amdgpu_choose_low_power_state(adev);
4225 
4226 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4227 		return 0;
4228 
4229 	/* Evict the majority of BOs before starting suspend sequence */
4230 	r = amdgpu_device_evict_resources(adev);
4231 	if (r)
4232 		goto unprepare;
4233 
4234 	flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4235 
4236 	for (i = 0; i < adev->num_ip_blocks; i++) {
4237 		if (!adev->ip_blocks[i].status.valid)
4238 			continue;
4239 		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4240 			continue;
4241 		r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4242 		if (r)
4243 			goto unprepare;
4244 	}
4245 
4246 	return 0;
4247 
4248 unprepare:
4249 	adev->in_s0ix = adev->in_s3 = false;
4250 
4251 	return r;
4252 }
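
/*
 * Note (illustrative): a hedged summary of the suspend-path ordering
 * implied by the functions in this region (PM-core plumbing omitted):
 *
 *	amdgpu_device_prepare() -> evict most BOs, per-IP prepare_suspend()
 *	amdgpu_device_suspend() -> ip_suspend_phase1, evict, fence hw fini,
 *	                           ip_suspend_phase2
 *	amdgpu_device_resume()  -> post card, ip_resume(), fence hw init,
 *	                           ip_late_init
 */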
4253 
4254 /**
4255  * amdgpu_device_suspend - initiate device suspend
4256  *
4257  * @dev: drm dev pointer
4258  * @fbcon : notify the fbdev of suspend
4259  *
4260  * Puts the hw in the suspend state (all asics).
4261  * Returns 0 for success or an error on failure.
4262  * Called at driver suspend.
4263  */
4264 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4265 {
4266 	struct amdgpu_device *adev = drm_to_adev(dev);
4267 	int r = 0;
4268 
4269 	if (adev->shutdown)
4270 		return 0;
4271 
4272 #ifdef notyet
4273 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4274 		return 0;
4275 #endif
4276 
4277 	adev->in_suspend = true;
4278 
4279 	if (amdgpu_sriov_vf(adev)) {
4280 		amdgpu_virt_fini_data_exchange(adev);
4281 		r = amdgpu_virt_request_full_gpu(adev, false);
4282 		if (r)
4283 			return r;
4284 	}
4285 
4286 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4287 		DRM_WARN("smart shift update failed\n");
4288 
4289 	if (fbcon)
4290 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4291 
4292 	cancel_delayed_work_sync(&adev->delayed_init_work);
4293 
4294 	amdgpu_ras_suspend(adev);
4295 
4296 	amdgpu_device_ip_suspend_phase1(adev);
4297 
4298 	if (!adev->in_s0ix)
4299 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4300 
4301 	r = amdgpu_device_evict_resources(adev);
4302 	if (r)
4303 		return r;
4304 
4305 	amdgpu_fence_driver_hw_fini(adev);
4306 
4307 	amdgpu_device_ip_suspend_phase2(adev);
4308 
4309 	if (amdgpu_sriov_vf(adev))
4310 		amdgpu_virt_release_full_gpu(adev, false);
4311 
4312 	return 0;
4313 }
4314 
4315 /**
4316  * amdgpu_device_resume - initiate device resume
4317  *
4318  * @dev: drm dev pointer
4319  * @fbcon : notify the fbdev of resume
4320  *
4321  * Bring the hw back to operating state (all asics).
4322  * Returns 0 for success or an error on failure.
4323  * Called at driver resume.
4324  */
4325 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4326 {
4327 	struct amdgpu_device *adev = drm_to_adev(dev);
4328 	int r = 0;
4329 
4330 	if (amdgpu_sriov_vf(adev)) {
4331 		r = amdgpu_virt_request_full_gpu(adev, true);
4332 		if (r)
4333 			return r;
4334 	}
4335 
4336 #ifdef notyet
4337 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4338 		return 0;
4339 #endif
4340 
4341 	if (adev->in_s0ix)
4342 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4343 
4344 	/* post card */
4345 	if (amdgpu_device_need_post(adev)) {
4346 		r = amdgpu_device_asic_init(adev);
4347 		if (r)
4348 			dev_err(adev->dev, "amdgpu asic init failed\n");
4349 	}
4350 
4351 	r = amdgpu_device_ip_resume(adev);
4352 
4353 	if (r) {
4354 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4355 		goto exit;
4356 	}
4357 	amdgpu_fence_driver_hw_init(adev);
4358 
4359 	r = amdgpu_device_ip_late_init(adev);
4360 	if (r)
4361 		goto exit;
4362 
4363 	queue_delayed_work(system_wq, &adev->delayed_init_work,
4364 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
4365 
4366 	if (!adev->in_s0ix) {
4367 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4368 		if (r)
4369 			goto exit;
4370 	}
4371 
4372 exit:
4373 	if (amdgpu_sriov_vf(adev)) {
4374 		amdgpu_virt_init_data_exchange(adev);
4375 		amdgpu_virt_release_full_gpu(adev, true);
4376 	}
4377 
4378 	if (r)
4379 		return r;
4380 
4381 	/* Make sure IB tests flushed */
4382 	flush_delayed_work(&adev->delayed_init_work);
4383 
4384 	if (fbcon)
4385 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4386 
4387 	amdgpu_ras_resume(adev);
4388 
4389 	if (adev->mode_info.num_crtc) {
4390 		/*
4391 		 * Most of the connector probing functions try to acquire runtime pm
4392 		 * refs to ensure that the GPU is powered on when connector polling is
4393 		 * performed. Since we're calling this from a runtime PM callback,
4394 		 * trying to acquire rpm refs will cause us to deadlock.
4395 		 *
4396 		 * Since we're guaranteed to be holding the rpm lock, it's safe to
4397 		 * temporarily disable the rpm helpers so this doesn't deadlock us.
4398 		 */
4399 #if defined(CONFIG_PM) && defined(__linux__)
4400 		dev->dev->power.disable_depth++;
4401 #endif
4402 		if (!adev->dc_enabled)
4403 			drm_helper_hpd_irq_event(dev);
4404 		else
4405 			drm_kms_helper_hotplug_event(dev);
4406 #if defined(CONFIG_PM) && defined(__linux__)
4407 		dev->dev->power.disable_depth--;
4408 #endif
4409 	}
4410 	adev->in_suspend = false;
4411 
4412 	if (adev->enable_mes)
4413 		amdgpu_mes_self_test(adev);
4414 
4415 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4416 		DRM_WARN("smart shift update failed\n");
4417 
4418 	return 0;
4419 }
4420 
4421 /**
4422  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4423  *
4424  * @adev: amdgpu_device pointer
4425  *
4426  * The list of all the hardware IPs that make up the asic is walked and
4427  * the check_soft_reset callbacks are run.  check_soft_reset determines
4428  * if the asic is still hung or not.
4429  * Returns true if any of the IPs are still in a hung state, false if not.
4430  */
4431 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4432 {
4433 	int i;
4434 	bool asic_hang = false;
4435 
4436 	if (amdgpu_sriov_vf(adev))
4437 		return true;
4438 
4439 	if (amdgpu_asic_need_full_reset(adev))
4440 		return true;
4441 
4442 	for (i = 0; i < adev->num_ip_blocks; i++) {
4443 		if (!adev->ip_blocks[i].status.valid)
4444 			continue;
4445 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4446 			adev->ip_blocks[i].status.hang =
4447 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4448 		if (adev->ip_blocks[i].status.hang) {
4449 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4450 			asic_hang = true;
4451 		}
4452 	}
4453 	return asic_hang;
4454 }
4455 
4456 /**
4457  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4458  *
4459  * @adev: amdgpu_device pointer
4460  *
4461  * The list of all the hardware IPs that make up the asic is walked and the
4462  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4463  * handles any IP specific hardware or software state changes that are
4464  * necessary for a soft reset to succeed.
4465  * Returns 0 on success, negative error code on failure.
4466  */
4467 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4468 {
4469 	int i, r = 0;
4470 
4471 	for (i = 0; i < adev->num_ip_blocks; i++) {
4472 		if (!adev->ip_blocks[i].status.valid)
4473 			continue;
4474 		if (adev->ip_blocks[i].status.hang &&
4475 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4476 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4477 			if (r)
4478 				return r;
4479 		}
4480 	}
4481 
4482 	return 0;
4483 }
4484 
4485 /**
4486  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4487  *
4488  * @adev: amdgpu_device pointer
4489  *
4490  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4491  * reset is necessary to recover.
4492  * Returns true if a full asic reset is required, false if not.
4493  */
4494 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4495 {
4496 	int i;
4497 
4498 	if (amdgpu_asic_need_full_reset(adev))
4499 		return true;
4500 
4501 	for (i = 0; i < adev->num_ip_blocks; i++) {
4502 		if (!adev->ip_blocks[i].status.valid)
4503 			continue;
4504 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4505 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4506 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4507 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4508 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4509 			if (adev->ip_blocks[i].status.hang) {
4510 				dev_info(adev->dev, "Some block need full reset!\n");
4511 				return true;
4512 			}
4513 		}
4514 	}
4515 	return false;
4516 }
4517 
4518 /**
4519  * amdgpu_device_ip_soft_reset - do a soft reset
4520  *
4521  * @adev: amdgpu_device pointer
4522  *
4523  * The list of all the hardware IPs that make up the asic is walked and the
4524  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4525  * IP specific hardware or software state changes that are necessary to soft
4526  * reset the IP.
4527  * Returns 0 on success, negative error code on failure.
4528  */
4529 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4530 {
4531 	int i, r = 0;
4532 
4533 	for (i = 0; i < adev->num_ip_blocks; i++) {
4534 		if (!adev->ip_blocks[i].status.valid)
4535 			continue;
4536 		if (adev->ip_blocks[i].status.hang &&
4537 		    adev->ip_blocks[i].version->funcs->soft_reset) {
4538 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4539 			if (r)
4540 				return r;
4541 		}
4542 	}
4543 
4544 	return 0;
4545 }
4546 
4547 /**
4548  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4549  *
4550  * @adev: amdgpu_device pointer
4551  *
4552  * The list of all the hardware IPs that make up the asic is walked and the
4553  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4554  * handles any IP specific hardware or software state changes that are
4555  * necessary after the IP has been soft reset.
4556  * Returns 0 on success, negative error code on failure.
4557  */
4558 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4559 {
4560 	int i, r = 0;
4561 
4562 	for (i = 0; i < adev->num_ip_blocks; i++) {
4563 		if (!adev->ip_blocks[i].status.valid)
4564 			continue;
4565 		if (adev->ip_blocks[i].status.hang &&
4566 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4567 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4568 		if (r)
4569 			return r;
4570 	}
4571 
4572 	return 0;
4573 }
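
/*
 * Illustration (not driver code): how the four soft-reset helpers above
 * compose, mirroring their use in amdgpu_device_pre_asic_reset() below;
 * assumes the caller already holds the reset domain.
 */
#if 0
static int example_try_soft_reset(struct amdgpu_device *adev)
{
	int r;

	if (!amdgpu_device_ip_check_soft_reset(adev))
		return 0;	/* nothing is hung */
	amdgpu_device_ip_pre_soft_reset(adev);
	r = amdgpu_device_ip_soft_reset(adev);
	amdgpu_device_ip_post_soft_reset(adev);
	if (r || amdgpu_device_ip_check_soft_reset(adev))
		return -EAGAIN;	/* escalate to a full reset */
	return 0;
}
#endif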
4574 
4575 /**
4576  * amdgpu_device_recover_vram - Recover some VRAM contents
4577  *
4578  * @adev: amdgpu_device pointer
4579  *
4580  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4581  * restore things like GPUVM page tables after a GPU reset where
4582  * the contents of VRAM might be lost.
4583  *
4584  * Returns:
4585  * 0 on success, negative error code on failure.
4586  */
4587 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4588 {
4589 	struct dma_fence *fence = NULL, *next = NULL;
4590 	struct amdgpu_bo *shadow;
4591 	struct amdgpu_bo_vm *vmbo;
4592 	long r = 1, tmo;
4593 
4594 	if (amdgpu_sriov_runtime(adev))
4595 		tmo = msecs_to_jiffies(8000);
4596 	else
4597 		tmo = msecs_to_jiffies(100);
4598 
4599 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4600 	mutex_lock(&adev->shadow_list_lock);
4601 	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4602 		/* If vm is compute context or adev is APU, shadow will be NULL */
4603 		if (!vmbo->shadow)
4604 			continue;
4605 		shadow = vmbo->shadow;
4606 
4607 		/* No need to recover an evicted BO */
4608 		if (!shadow->tbo.resource ||
4609 		    shadow->tbo.resource->mem_type != TTM_PL_TT ||
4610 		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4611 		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4612 			continue;
4613 
4614 		r = amdgpu_bo_restore_shadow(shadow, &next);
4615 		if (r)
4616 			break;
4617 
4618 		if (fence) {
4619 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4620 			dma_fence_put(fence);
4621 			fence = next;
4622 			if (tmo == 0) {
4623 				r = -ETIMEDOUT;
4624 				break;
4625 			} else if (tmo < 0) {
4626 				r = tmo;
4627 				break;
4628 			}
4629 		} else {
4630 			fence = next;
4631 		}
4632 	}
4633 	mutex_unlock(&adev->shadow_list_lock);
4634 
4635 	if (fence)
4636 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4637 	dma_fence_put(fence);
4638 
4639 	if (r < 0 || tmo <= 0) {
4640 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4641 		return -EIO;
4642 	}
4643 
4644 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4645 	return 0;
4646 }
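
/*
 * Note: the loop above pipelines the copies; the restore for buffer N+1
 * is issued before waiting on buffer N's fence, so the copy engine stays
 * busy while at most one fence wait at a time throttles the loop.
 */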
4647 
4648 
4649 /**
4650  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4651  *
4652  * @adev: amdgpu_device pointer
4653  * @from_hypervisor: request from hypervisor
4654  *
4655  * Do a VF FLR and reinitialize the ASIC.
4656  * Returns 0 on success, negative error code on failure.
4657  */
4658 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4659 				     bool from_hypervisor)
4660 {
4661 	int r;
4662 	struct amdgpu_hive_info *hive = NULL;
4663 	int retry_limit = 0;
4664 
4665 retry:
4666 	amdgpu_amdkfd_pre_reset(adev);
4667 
4668 	if (from_hypervisor)
4669 		r = amdgpu_virt_request_full_gpu(adev, true);
4670 	else
4671 		r = amdgpu_virt_reset_gpu(adev);
4672 	if (r)
4673 		return r;
4674 	amdgpu_irq_gpu_reset_resume_helper(adev);
4675 
4676 	/* some sw clean up VF needs to do before recover */
4677 	amdgpu_virt_post_reset(adev);
4678 
4679 	/* Resume IP prior to SMC */
4680 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4681 	if (r)
4682 		goto error;
4683 
4684 	amdgpu_virt_init_data_exchange(adev);
4685 
4686 	r = amdgpu_device_fw_loading(adev);
4687 	if (r)
4688 		return r;
4689 
4690 	/* now we are okay to resume SMC/CP/SDMA */
4691 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4692 	if (r)
4693 		goto error;
4694 
4695 	hive = amdgpu_get_xgmi_hive(adev);
4696 	/* Update PSP FW topology after reset */
4697 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4698 		r = amdgpu_xgmi_update_topology(hive, adev);
4699 
4700 	if (hive)
4701 		amdgpu_put_xgmi_hive(hive);
4702 
4703 	if (!r) {
4704 		r = amdgpu_ib_ring_tests(adev);
4705 
4706 		amdgpu_amdkfd_post_reset(adev);
4707 	}
4708 
4709 error:
4710 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4711 		amdgpu_inc_vram_lost(adev);
4712 		r = amdgpu_device_recover_vram(adev);
4713 	}
4714 	amdgpu_virt_release_full_gpu(adev, true);
4715 
4716 	if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4717 		if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4718 			retry_limit++;
4719 			goto retry;
4720 		} else
4721 			DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4722 	}
4723 
4724 	return r;
4725 }
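
/*
 * Note: AMDGPU_RETRY_SRIOV_RESET() classifies certain errors as
 * transient, so the FLR above is retried up to AMDGPU_MAX_RETRY_LIMIT
 * times before the reset is reported as failed.
 */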
4726 
4727 /**
4728  * amdgpu_device_has_job_running - check if there is any job in the pending list
4729  *
4730  * @adev: amdgpu_device pointer
4731  *
4732  * Check if there is any job in any ring scheduler's pending list.
4733  */
4734 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4735 {
4736 	int i;
4737 	struct drm_sched_job *job;
4738 
4739 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4740 		struct amdgpu_ring *ring = adev->rings[i];
4741 
4742 		if (!ring || !ring->sched.thread)
4743 			continue;
4744 
4745 		spin_lock(&ring->sched.job_list_lock);
4746 		job = list_first_entry_or_null(&ring->sched.pending_list,
4747 					       struct drm_sched_job, list);
4748 		spin_unlock(&ring->sched.job_list_lock);
4749 		if (job)
4750 			return true;
4751 	}
4752 	return false;
4753 }
4754 
4755 /**
4756  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4757  *
4758  * @adev: amdgpu_device pointer
4759  *
4760  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4761  * a hung GPU.
4762  */
4763 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4764 {
4765 
4766 	if (amdgpu_gpu_recovery == 0)
4767 		goto disabled;
4768 
4769 	/* Skip soft reset check in fatal error mode */
4770 	if (!amdgpu_ras_is_poison_mode_supported(adev))
4771 		return true;
4772 
4773 	if (amdgpu_sriov_vf(adev))
4774 		return true;
4775 
4776 	if (amdgpu_gpu_recovery == -1) {
4777 		switch (adev->asic_type) {
4778 #ifdef CONFIG_DRM_AMDGPU_SI
4779 		case CHIP_VERDE:
4780 		case CHIP_TAHITI:
4781 		case CHIP_PITCAIRN:
4782 		case CHIP_OLAND:
4783 		case CHIP_HAINAN:
4784 #endif
4785 #ifdef CONFIG_DRM_AMDGPU_CIK
4786 		case CHIP_KAVERI:
4787 		case CHIP_KABINI:
4788 		case CHIP_MULLINS:
4789 #endif
4790 		case CHIP_CARRIZO:
4791 		case CHIP_STONEY:
4792 		case CHIP_CYAN_SKILLFISH:
4793 			goto disabled;
4794 		default:
4795 			break;
4796 		}
4797 	}
4798 
4799 	return true;
4800 
4801 disabled:
4802 	dev_info(adev->dev, "GPU recovery disabled.\n");
4803 	return false;
4804 }
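
/*
 * Note (illustrative): amdgpu_gpu_recovery is tri-state, as the checks
 * above show: 0 forces recovery off, -1 ("auto") disables it only on
 * the legacy ASICs listed, and any other value leaves recovery enabled.
 */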
4805 
4806 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4807 {
4808 	u32 i;
4809 	int ret = 0;
4810 
4811 	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4812 
4813 	dev_info(adev->dev, "GPU mode1 reset\n");
4814 
4815 	/* Cache the state before bus master disable. The saved config space
4816 	 * values are used in other cases like restore after mode-2 reset.
4817 	 */
4818 	amdgpu_device_cache_pci_state(adev->pdev);
4819 
4820 	/* disable BM */
4821 	pci_clear_master(adev->pdev);
4822 
4823 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4824 		dev_info(adev->dev, "GPU smu mode1 reset\n");
4825 		ret = amdgpu_dpm_mode1_reset(adev);
4826 	} else {
4827 		dev_info(adev->dev, "GPU psp mode1 reset\n");
4828 		ret = psp_gpu_reset(adev);
4829 	}
4830 
4831 	if (ret)
4832 		goto mode1_reset_failed;
4833 
4834 	amdgpu_device_load_pci_state(adev->pdev);
4835 	ret = amdgpu_psp_wait_for_bootloader(adev);
4836 	if (ret)
4837 		goto mode1_reset_failed;
4838 
4839 	/* wait for asic to come out of reset */
4840 	for (i = 0; i < adev->usec_timeout; i++) {
4841 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
4842 
4843 		if (memsize != 0xffffffff)
4844 			break;
4845 		udelay(1);
4846 	}
4847 
4848 	if (i >= adev->usec_timeout) {
4849 		ret = -ETIMEDOUT;
4850 		goto mode1_reset_failed;
4851 	}
4852 
4853 	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4854 
4855 	return 0;
4856 
4857 mode1_reset_failed:
4858 	dev_err(adev->dev, "GPU mode1 reset failed\n");
4859 	return ret;
4860 }
4861 
4862 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4863 				 struct amdgpu_reset_context *reset_context)
4864 {
4865 	int i, r = 0;
4866 	struct amdgpu_job *job = NULL;
4867 	bool need_full_reset =
4868 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4869 
4870 	if (reset_context->reset_req_dev == adev)
4871 		job = reset_context->job;
4872 
4873 	if (amdgpu_sriov_vf(adev)) {
4874 		/* stop the data exchange thread */
4875 		amdgpu_virt_fini_data_exchange(adev);
4876 	}
4877 
4878 	amdgpu_fence_driver_isr_toggle(adev, true);
4879 
4880 	/* block all schedulers and reset given job's ring */
4881 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4882 		struct amdgpu_ring *ring = adev->rings[i];
4883 
4884 		if (!ring || !ring->sched.thread)
4885 			continue;
4886 
4887 		/* Clear job fences from the fence drv to avoid force_completion
4888 		 * leaving NULL and vm flush fences in the fence drv
4889 		 */
4890 		amdgpu_fence_driver_clear_job_fences(ring);
4891 
4892 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4893 		amdgpu_fence_driver_force_completion(ring);
4894 	}
4895 
4896 	amdgpu_fence_driver_isr_toggle(adev, false);
4897 
4898 	if (job && job->vm)
4899 		drm_sched_increase_karma(&job->base);
4900 
4901 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4902 	/* If reset handler not implemented, continue; otherwise return */
4903 	if (r == -EOPNOTSUPP)
4904 		r = 0;
4905 	else
4906 		return r;
4907 
4908 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4909 	if (!amdgpu_sriov_vf(adev)) {
4910 
4911 		if (!need_full_reset)
4912 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4913 
4914 		if (!need_full_reset && amdgpu_gpu_recovery &&
4915 		    amdgpu_device_ip_check_soft_reset(adev)) {
4916 			amdgpu_device_ip_pre_soft_reset(adev);
4917 			r = amdgpu_device_ip_soft_reset(adev);
4918 			amdgpu_device_ip_post_soft_reset(adev);
4919 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4920 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4921 				need_full_reset = true;
4922 			}
4923 		}
4924 
4925 		if (need_full_reset)
4926 			r = amdgpu_device_ip_suspend(adev);
4927 		if (need_full_reset)
4928 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4929 		else
4930 			clear_bit(AMDGPU_NEED_FULL_RESET,
4931 				  &reset_context->flags);
4932 	}
4933 
4934 	return r;
4935 }
4936 
4937 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4938 {
4939 	int i;
4940 
4941 	lockdep_assert_held(&adev->reset_domain->sem);
4942 
4943 	for (i = 0; i < adev->num_regs; i++) {
4944 		adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4945 		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4946 					     adev->reset_dump_reg_value[i]);
4947 	}
4948 
4949 	return 0;
4950 }
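
/*
 * Note (assumption): adev->reset_dump_reg_list is typically populated
 * from user space before a reset (via debugfs in upstream Linux), so
 * the dump above samples exactly the registers that were requested.
 */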
4951 
4952 #ifdef CONFIG_DEV_COREDUMP
4953 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4954 		size_t count, void *data, size_t datalen)
4955 {
4956 	struct drm_printer p;
4957 	struct amdgpu_device *adev = data;
4958 	struct drm_print_iterator iter;
4959 	int i;
4960 
4961 	iter.data = buffer;
4962 	iter.offset = 0;
4963 	iter.start = offset;
4964 	iter.remain = count;
4965 
4966 	p = drm_coredump_printer(&iter);
4967 
4968 	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4969 	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4970 	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4971 	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4972 	if (adev->reset_task_info.pid)
4973 		drm_printf(&p, "process_name: %s PID: %d\n",
4974 			   adev->reset_task_info.process_name,
4975 			   adev->reset_task_info.pid);
4976 
4977 	if (adev->reset_vram_lost)
4978 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4979 	if (adev->num_regs) {
4980 		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4981 
4982 		for (i = 0; i < adev->num_regs; i++)
4983 			drm_printf(&p, "0x%08x: 0x%08x\n",
4984 				   adev->reset_dump_reg_list[i],
4985 				   adev->reset_dump_reg_value[i]);
4986 	}
4987 
4988 	return count - iter.remain;
4989 }
4990 
4991 static void amdgpu_devcoredump_free(void *data)
4992 {
4993 }
4994 
4995 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4996 {
4997 	struct drm_device *dev = adev_to_drm(adev);
4998 
4999 	ktime_get_ts64(&adev->reset_time);
5000 	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
5001 		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
5002 }
5003 #endif
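
/*
 * Note: with CONFIG_DEV_COREDUMP the dump registered above can be read
 * from user space through the devcoredump class device (on Linux, e.g.
 * /sys/class/devcoredump/devcd<N>/data).
 */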
5004 
5005 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5006 			 struct amdgpu_reset_context *reset_context)
5007 {
5008 	struct amdgpu_device *tmp_adev = NULL;
5009 	bool need_full_reset, skip_hw_reset, vram_lost = false;
5010 	int r = 0;
5011 	bool gpu_reset_for_dev_remove = 0;
5012 
5013 	/* Try reset handler method first */
5014 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5015 				    reset_list);
5016 	amdgpu_reset_reg_dumps(tmp_adev);
5017 
5018 	reset_context->reset_device_list = device_list_handle;
5019 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5020 	/* If reset handler not implemented, continue; otherwise return */
5021 	if (r == -EOPNOTSUPP)
5022 		r = 0;
5023 	else
5024 		return r;
5025 
5026 	/* Reset handler not implemented, use the default method */
5027 	need_full_reset =
5028 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5029 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5030 
5031 	gpu_reset_for_dev_remove =
5032 		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5033 			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5034 
5035 	/*
5036 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
5037 	 * to allow proper link negotiation in FW (within 1 sec)
5038 	 */
5039 	if (!skip_hw_reset && need_full_reset) {
5040 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5041 			/* For XGMI run all resets in parallel to speed up the process */
5042 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5043 				tmp_adev->gmc.xgmi.pending_reset = false;
5044 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5045 					r = -EALREADY;
5046 			} else
5047 				r = amdgpu_asic_reset(tmp_adev);
5048 
5049 			if (r) {
5050 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5051 					 r, adev_to_drm(tmp_adev)->unique);
5052 				break;
5053 			}
5054 		}
5055 
5056 		/* For XGMI wait for all resets to complete before proceed */
5057 		if (!r) {
5058 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5059 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5060 					flush_work(&tmp_adev->xgmi_reset_work);
5061 					r = tmp_adev->asic_reset_res;
5062 					if (r)
5063 						break;
5064 				}
5065 			}
5066 		}
5067 	}
5068 
5069 	if (!r && amdgpu_ras_intr_triggered()) {
5070 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5071 			if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5072 			    tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5073 				tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5074 		}
5075 
5076 		amdgpu_ras_intr_cleared();
5077 	}
5078 
5079 	/* Since the mode1 reset affects base ip blocks, the
5080 	 * phase1 ip blocks need to be resumed. Otherwise there
5081 	 * will be a BIOS signature error and the psp bootloader
5082 	 * can't load kdb on the next amdgpu install.
5083 	 */
5084 	if (gpu_reset_for_dev_remove) {
5085 		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5086 			amdgpu_device_ip_resume_phase1(tmp_adev);
5087 
5088 		goto end;
5089 	}
5090 
5091 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5092 		if (need_full_reset) {
5093 			/* post card */
5094 			r = amdgpu_device_asic_init(tmp_adev);
5095 			if (r) {
5096 				dev_warn(tmp_adev->dev, "asic atom init failed!");
5097 			} else {
5098 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5099 
5100 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
5101 				if (r)
5102 					goto out;
5103 
5104 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5105 #ifdef CONFIG_DEV_COREDUMP
5106 				tmp_adev->reset_vram_lost = vram_lost;
5107 				memset(&tmp_adev->reset_task_info, 0,
5108 						sizeof(tmp_adev->reset_task_info));
5109 				if (reset_context->job && reset_context->job->vm)
5110 					tmp_adev->reset_task_info =
5111 						reset_context->job->vm->task_info;
5112 				amdgpu_reset_capture_coredumpm(tmp_adev);
5113 #endif
5114 				if (vram_lost) {
5115 					DRM_INFO("VRAM is lost due to GPU reset!\n");
5116 					amdgpu_inc_vram_lost(tmp_adev);
5117 				}
5118 
5119 				r = amdgpu_device_fw_loading(tmp_adev);
5120 				if (r)
5121 					return r;
5122 
5123 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
5124 				if (r)
5125 					goto out;
5126 
5127 				if (vram_lost)
5128 					amdgpu_device_fill_reset_magic(tmp_adev);
5129 
5130 				/*
5131 				 * Add this ASIC back as tracked since the reset
5132 				 * has already completed successfully.
5133 				 */
5134 				amdgpu_register_gpu_instance(tmp_adev);
5135 
5136 				if (!reset_context->hive &&
5137 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5138 					amdgpu_xgmi_add_device(tmp_adev);
5139 
5140 				r = amdgpu_device_ip_late_init(tmp_adev);
5141 				if (r)
5142 					goto out;
5143 
5144 				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5145 
5146 				/*
5147 				 * The GPU enters a bad state once the number of
5148 				 * faulty pages reported by ECC reaches the
5149 				 * threshold, and RAS recovery is scheduled next.
5150 				 * So add a check here to break recovery if it
5151 				 * indeed exceeds the bad page threshold, and
5152 				 * remind the user to retire this GPU or set a
5153 				 * bigger bad_page_threshold value when probing
5154 				 * the driver again.
5155 				 */
5156 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5157 					/* must succeed. */
5158 					amdgpu_ras_resume(tmp_adev);
5159 				} else {
5160 					r = -EINVAL;
5161 					goto out;
5162 				}
5163 
5164 				/* Update PSP FW topology after reset */
5165 				if (reset_context->hive &&
5166 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5167 					r = amdgpu_xgmi_update_topology(
5168 						reset_context->hive, tmp_adev);
5169 			}
5170 		}
5171 
5172 out:
5173 		if (!r) {
5174 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5175 			r = amdgpu_ib_ring_tests(tmp_adev);
5176 			if (r) {
5177 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5178 				need_full_reset = true;
5179 				r = -EAGAIN;
5180 				goto end;
5181 			}
5182 		}
5183 
5184 		if (!r)
5185 			r = amdgpu_device_recover_vram(tmp_adev);
5186 		else
5187 			tmp_adev->asic_reset_res = r;
5188 	}
5189 
5190 end:
5191 	if (need_full_reset)
5192 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5193 	else
5194 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5195 	return r;
5196 }
5197 
5198 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5199 {
5200 
5201 	switch (amdgpu_asic_reset_method(adev)) {
5202 	case AMD_RESET_METHOD_MODE1:
5203 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5204 		break;
5205 	case AMD_RESET_METHOD_MODE2:
5206 		adev->mp1_state = PP_MP1_STATE_RESET;
5207 		break;
5208 	default:
5209 		adev->mp1_state = PP_MP1_STATE_NONE;
5210 		break;
5211 	}
5214 }
5215 
5216 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5217 {
5218 	amdgpu_vf_error_trans_all(adev);
5219 	adev->mp1_state = PP_MP1_STATE_NONE;
5220 }
5221 
5222 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5223 {
5224 	STUB();
5225 #ifdef notyet
5226 	struct pci_dev *p = NULL;
5227 
5228 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5229 			adev->pdev->bus->number, 1);
5230 	if (p) {
5231 		pm_runtime_enable(&(p->dev));
5232 		pm_runtime_resume(&(p->dev));
5233 	}
5234 #endif
5235 }
5236 
5237 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5238 {
5239 	enum amd_reset_method reset_method;
5240 	struct pci_dev *p = NULL;
5241 	u64 expires;
5242 
5243 	/*
5244 	 * For now, only BACO and mode1 reset are confirmed
5245 	 * to suffer from the audio issue if not properly suspended.
5246 	 */
5247 	reset_method = amdgpu_asic_reset_method(adev);
5248 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
5249 	     (reset_method != AMD_RESET_METHOD_MODE1))
5250 		return -EINVAL;
5251 
5252 	STUB();
5253 	return -ENOSYS;
5254 #ifdef notyet
5255 
5256 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5257 			adev->pdev->bus->number, 1);
5258 	if (!p)
5259 		return -ENODEV;
5260 
5261 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
5262 	if (!expires)
5263 		/*
5264 		 * If we cannot get the audio device autosuspend delay,
5265 		 * a fixed 4s interval will be used, since 3s is the
5266 		 * audio controller's default autosuspend delay setting
5267 		 * and the 4s used here is guaranteed to cover it.
5268 		 */
5269 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5270 
5271 	while (!pm_runtime_status_suspended(&(p->dev))) {
5272 		if (!pm_runtime_suspend(&(p->dev)))
5273 			break;
5274 
5275 		if (expires < ktime_get_mono_fast_ns()) {
5276 			dev_warn(adev->dev, "failed to suspend display audio\n");
5277 			pci_dev_put(p);
5278 			/* TODO: abort the succeeding gpu reset? */
5279 			return -ETIMEDOUT;
5280 		}
5281 	}
5282 
5283 	pm_runtime_disable(&(p->dev));
5284 
5285 	pci_dev_put(p);
5286 	return 0;
5287 #endif
5288 }
5289 
5290 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5291 {
5292 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5293 
5294 #if defined(CONFIG_DEBUG_FS)
5295 	if (!amdgpu_sriov_vf(adev))
5296 		cancel_work(&adev->reset_work);
5297 #endif
5298 
5299 	if (adev->kfd.dev)
5300 		cancel_work(&adev->kfd.reset_work);
5301 
5302 	if (amdgpu_sriov_vf(adev))
5303 		cancel_work(&adev->virt.flr_work);
5304 
5305 	if (con && adev->ras_enabled)
5306 		cancel_work(&con->recovery_work);
5307 
5308 }
5309 
5310 /**
5311  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5312  *
5313  * @adev: amdgpu_device pointer
5314  * @job: the job that triggered the hang
5315  * @reset_context: amdgpu reset context pointer
5316  *
5317  * Attempt to reset the GPU if it has hung (all asics).
5318  * Attempt a soft reset or a full reset and reinitialize the ASIC.
5319  * Returns 0 for success or an error on failure.
5320  */
5321 
5322 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5323 			      struct amdgpu_job *job,
5324 			      struct amdgpu_reset_context *reset_context)
5325 {
5326 	struct list_head device_list, *device_list_handle = NULL;
5327 	bool job_signaled = false;
5328 	struct amdgpu_hive_info *hive = NULL;
5329 	struct amdgpu_device *tmp_adev = NULL;
5330 	int i, r = 0;
5331 	bool need_emergency_restart = false;
5332 	bool audio_suspended = false;
5333 	bool gpu_reset_for_dev_remove = false;
5334 
5335 	gpu_reset_for_dev_remove =
5336 			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5337 				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5338 
5339 	/*
5340 	 * Special case: RAS triggered and full reset isn't supported
5341 	 */
5342 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5343 
5344 	/*
5345 	 * Flush RAM to disk so that after reboot
5346 	 * the user can read the log and see why the system rebooted.
5347 	 */
5348 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5349 		amdgpu_ras_get_context(adev)->reboot) {
5350 		DRM_WARN("Emergency reboot.");
5351 
5352 #ifdef notyet
5353 		ksys_sync_helper();
5354 		emergency_restart();
5355 #else
5356 		panic("emergency_restart");
5357 #endif
5358 	}
5359 
5360 	dev_info(adev->dev, "GPU %s begin!\n",
5361 		need_emergency_restart ? "jobs stop":"reset");
5362 
5363 	if (!amdgpu_sriov_vf(adev))
5364 		hive = amdgpu_get_xgmi_hive(adev);
5365 	if (hive)
5366 		mutex_lock(&hive->hive_lock);
5367 
5368 	reset_context->job = job;
5369 	reset_context->hive = hive;
5370 	/*
5371 	 * Build list of devices to reset.
5372 	 * In case we are in XGMI hive mode, resort the device list
5373 	 * to put adev in the 1st position.
5374 	 */
5375 	INIT_LIST_HEAD(&device_list);
5376 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5377 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5378 			list_add_tail(&tmp_adev->reset_list, &device_list);
5379 			if (gpu_reset_for_dev_remove && adev->shutdown)
5380 				tmp_adev->shutdown = true;
5381 		}
5382 		if (!list_is_first(&adev->reset_list, &device_list))
5383 			list_rotate_to_front(&adev->reset_list, &device_list);
5384 		device_list_handle = &device_list;
5385 	} else {
5386 		list_add_tail(&adev->reset_list, &device_list);
5387 		device_list_handle = &device_list;
5388 	}
5389 
5390 	/* We need to lock reset domain only once both for XGMI and single device */
5391 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5392 				    reset_list);
5393 	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5394 
5395 	/* block all schedulers and reset given job's ring */
5396 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5397 
5398 		amdgpu_device_set_mp1_state(tmp_adev);
5399 
5400 		/*
5401 		 * Try to put the audio codec into suspend state
5402 		 * before the gpu reset starts.
5403 		 *
5404 		 * Because the power domain of the graphics device
5405 		 * is shared with the AZ power domain, we could
5406 		 * otherwise change the audio hardware behind the
5407 		 * audio driver's back, which would trigger
5408 		 * audio codec errors.
5409 		 */
5410 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
5411 			audio_suspended = true;
5412 
5413 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
5414 
5415 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5416 
5417 		if (!amdgpu_sriov_vf(tmp_adev))
5418 			amdgpu_amdkfd_pre_reset(tmp_adev);
5419 
5420 		/*
5421 		 * Mark these ASICs, which are to be reset, as untracked
5422 		 * first, and add them back after the reset completes.
5423 		 */
5424 		amdgpu_unregister_gpu_instance(tmp_adev);
5425 
5426 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5427 
5428 		/* disable ras on ALL IPs */
5429 		if (!need_emergency_restart &&
5430 		      amdgpu_device_ip_need_full_reset(tmp_adev))
5431 			amdgpu_ras_suspend(tmp_adev);
5432 
5433 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5434 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5435 
5436 			if (!ring || !ring->sched.thread)
5437 				continue;
5438 
5439 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5440 
5441 			if (need_emergency_restart)
5442 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5443 		}
5444 		atomic_inc(&tmp_adev->gpu_reset_counter);
5445 	}
5446 
5447 	if (need_emergency_restart)
5448 		goto skip_sched_resume;
5449 
5450 	/*
5451 	 * Must check guilty signal here since after this point all old
5452 	 * HW fences are force signaled.
5453 	 *
5454 	 * job->base holds a reference to parent fence
5455 	 */
5456 	if (job && dma_fence_is_signaled(&job->hw_fence)) {
5457 		job_signaled = true;
5458 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5459 		goto skip_hw_reset;
5460 	}
5461 
5462 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
5463 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5464 		if (gpu_reset_for_dev_remove) {
5465 			/* Workaround for ASICs that need to disable SMC first */
5466 			amdgpu_device_smu_fini_early(tmp_adev);
5467 		}
5468 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5469 		/* TODO: should we stop? */
5470 		if (r) {
5471 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5472 				  r, adev_to_drm(tmp_adev)->unique);
5473 			tmp_adev->asic_reset_res = r;
5474 		}
5475 
5476 		/*
5477 		 * Drop all pending non scheduler resets. Scheduler resets
5478 		 * were already dropped during drm_sched_stop
5479 		 */
5480 		amdgpu_device_stop_pending_resets(tmp_adev);
5481 	}
5482 
5483 	/* Actual ASIC resets if needed. */
5484 	/* Host driver will handle XGMI hive reset for SRIOV */
5485 	if (amdgpu_sriov_vf(adev)) {
5486 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
5487 		if (r)
5488 			adev->asic_reset_res = r;
5489 
5490 		/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5491 		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5492 		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5493 			amdgpu_ras_resume(adev);
5494 	} else {
5495 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5496 		if (r && r == -EAGAIN)
5497 			goto retry;
5498 
5499 		if (!r && gpu_reset_for_dev_remove)
5500 			goto recover_end;
5501 	}
5502 
5503 skip_hw_reset:
5504 
5505 	/* Post ASIC reset for all devs. */
5506 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5507 
5508 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5509 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5510 
5511 			if (!ring || !ring->sched.thread)
5512 				continue;
5513 
5514 			drm_sched_start(&ring->sched, true);
5515 		}
5516 
5517 		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5518 			amdgpu_mes_self_test(tmp_adev);
5519 
5520 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5521 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5522 
5523 		if (tmp_adev->asic_reset_res)
5524 			r = tmp_adev->asic_reset_res;
5525 
5526 		tmp_adev->asic_reset_res = 0;
5527 
5528 		if (r) {
5529 			/* bad news, how to tell it to userspace ? */
5530 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5531 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5532 		} else {
5533 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5534 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5535 				DRM_WARN("smart shift update failed\n");
5536 		}
5537 	}
5538 
5539 skip_sched_resume:
5540 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5541 		/* unlock kfd: SRIOV would do it separately */
5542 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5543 			amdgpu_amdkfd_post_reset(tmp_adev);
5544 
5545 		/* kfd_post_reset will do nothing if the kfd device is not
5546 		 * initialized, so bring up kfd here if it wasn't initialized before
5547 		 */
5548 		if (!adev->kfd.init_complete)
5549 			amdgpu_amdkfd_device_init(adev);
5550 
5551 		if (audio_suspended)
5552 			amdgpu_device_resume_display_audio(tmp_adev);
5553 
5554 		amdgpu_device_unset_mp1_state(tmp_adev);
5555 
5556 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
5557 	}
5558 
5559 recover_end:
5560 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5561 					    reset_list);
5562 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5563 
5564 	if (hive) {
5565 		mutex_unlock(&hive->hive_lock);
5566 		amdgpu_put_xgmi_hive(hive);
5567 	}
5568 
5569 	if (r)
5570 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5571 
5572 	atomic_set(&adev->reset_domain->reset_res, r);
5573 	return r;
5574 }
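
/*
 * Illustrative sketch (not part of the driver): how a ring timeout
 * handler would typically build a reset context and hand the hung job
 * to amdgpu_device_gpu_recover(). The handler name is hypothetical;
 * the fields match struct amdgpu_reset_context as used in this file.
 */
#if 0
static void example_job_timedout(struct amdgpu_ring *ring,
				 struct amdgpu_job *job)
{
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));
	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = ring->adev;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
}
#endif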
5575 
5576 /**
5577  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5578  *
5579  * @adev: amdgpu_device pointer
5580  *
5581  * Fetches and stores in the driver the PCIe capabilities (gen speed
5582  * and lanes) of the slot the device is in. Handles APUs and
5583  * virtualized environments where PCIE config space may not be available.
5584  */
5585 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5586 {
5587 	struct pci_dev *pdev;
5588 	enum pci_bus_speed speed_cap, platform_speed_cap;
5589 	enum pcie_link_width platform_link_width;
5590 
5591 	if (amdgpu_pcie_gen_cap)
5592 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5593 
5594 	if (amdgpu_pcie_lane_cap)
5595 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5596 
5597 	/* covers APUs as well */
5598 	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5599 		if (adev->pm.pcie_gen_mask == 0)
5600 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5601 		if (adev->pm.pcie_mlw_mask == 0)
5602 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5603 		return;
5604 	}
5605 
5606 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5607 		return;
5608 
5609 	pcie_bandwidth_available(adev->pdev, NULL,
5610 				 &platform_speed_cap, &platform_link_width);
5611 
5612 	if (adev->pm.pcie_gen_mask == 0) {
5613 		/* asic caps */
5614 		pdev = adev->pdev;
5615 		speed_cap = pcie_get_speed_cap(pdev);
5616 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5617 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5618 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5619 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5620 		} else {
5621 			if (speed_cap == PCIE_SPEED_32_0GT)
5622 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5623 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5624 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5625 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5626 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5627 			else if (speed_cap == PCIE_SPEED_16_0GT)
5628 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5629 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5630 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5631 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5632 			else if (speed_cap == PCIE_SPEED_8_0GT)
5633 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5634 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5635 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5636 			else if (speed_cap == PCIE_SPEED_5_0GT)
5637 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5638 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5639 			else
5640 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5641 		}
5642 		/* platform caps */
5643 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5644 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5645 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5646 		} else {
5647 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5648 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5649 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5650 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5651 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5652 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5653 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5654 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5655 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5656 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5657 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5658 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5659 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5660 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5661 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5662 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5663 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5664 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5665 			else
5666 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5667 
5668 		}
5669 	}
5670 	if (adev->pm.pcie_mlw_mask == 0) {
5671 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5672 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5673 		} else {
5674 			switch (platform_link_width) {
5675 			case PCIE_LNK_X32:
5676 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5677 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5678 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5679 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5680 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5681 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5682 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5683 				break;
5684 			case PCIE_LNK_X16:
5685 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5686 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5687 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5688 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5689 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5690 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5691 				break;
5692 			case PCIE_LNK_X12:
5693 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5694 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5695 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5696 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5697 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5698 				break;
5699 			case PCIE_LNK_X8:
5700 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5701 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5702 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5703 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5704 				break;
5705 			case PCIE_LNK_X4:
5706 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5707 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5708 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5709 				break;
5710 			case PCIE_LNK_X2:
5711 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5712 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5713 				break;
5714 			case PCIE_LNK_X1:
5715 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5716 				break;
5717 			default:
5718 				break;
5719 			}
5720 		}
5721 	}
5722 }
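
/*
 * Illustrative sketch (not part of the driver): consumers of the masks
 * computed above test individual CAIL_* bits; the helper below is
 * hypothetical and only shows the intended decoding.
 */
#if 0
static bool example_supports_pcie_gen3(struct amdgpu_device *adev)
{
	return (adev->pm.pcie_gen_mask &
		CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3) != 0;
}
#endif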
5723 
5724 /**
5725  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5726  *
5727  * @adev: amdgpu_device pointer
5728  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5729  *
5730  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5731  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5732  * @peer_adev.
5733  */
5734 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5735 				      struct amdgpu_device *peer_adev)
5736 {
5737 #ifdef CONFIG_HSA_AMD_P2P
5738 	uint64_t address_mask = peer_adev->dev->dma_mask ?
5739 		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5740 	resource_size_t aper_limit =
5741 		adev->gmc.aper_base + adev->gmc.aper_size - 1;
5742 	bool p2p_access =
5743 		!adev->gmc.xgmi.connected_to_cpu &&
5744 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5745 
5746 	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5747 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5748 		!(adev->gmc.aper_base & address_mask ||
5749 		  aper_limit & address_mask));
5750 #else
5751 	return false;
5752 #endif
5753 }
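
/*
 * Illustrative sketch (not part of the driver): for a bidirectional
 * P2P mapping the check above has to hold in both directions, since
 * BAR visibility and DMA masks are per device. The helper name is
 * hypothetical.
 */
#if 0
static bool example_can_use_p2p(struct amdgpu_device *a,
				struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}
#endif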
5754 
5755 int amdgpu_device_baco_enter(struct drm_device *dev)
5756 {
5757 	struct amdgpu_device *adev = drm_to_adev(dev);
5758 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5759 
5760 	if (!amdgpu_device_supports_baco(dev))
5761 		return -ENOTSUPP;
5762 
5763 	if (ras && adev->ras_enabled &&
5764 	    adev->nbio.funcs->enable_doorbell_interrupt)
5765 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5766 
5767 	return amdgpu_dpm_baco_enter(adev);
5768 }
5769 
5770 int amdgpu_device_baco_exit(struct drm_device *dev)
5771 {
5772 	struct amdgpu_device *adev = drm_to_adev(dev);
5773 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5774 	int ret = 0;
5775 
5776 	if (!amdgpu_device_supports_baco(dev))
5777 		return -ENOTSUPP;
5778 
5779 	ret = amdgpu_dpm_baco_exit(adev);
5780 	if (ret)
5781 		return ret;
5782 
5783 	if (ras && adev->ras_enabled &&
5784 	    adev->nbio.funcs->enable_doorbell_interrupt)
5785 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5786 
5787 	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
5788 	    adev->nbio.funcs->clear_doorbell_interrupt)
5789 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
5790 
5791 	return 0;
5792 }
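
/*
 * Illustrative sketch (not part of the driver): BACO enter/exit are
 * used as a pair, e.g. around a runtime-suspend cycle. Error handling
 * is condensed and the wrapper is hypothetical.
 */
#if 0
static int example_baco_cycle(struct drm_device *dev)
{
	int r = amdgpu_device_baco_enter(dev);

	if (r)
		return r;

	/* ... the device sits in BACO until it is needed again ... */

	return amdgpu_device_baco_exit(dev);
}
#endif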
5793 
5794 /**
5795  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5796  * @pdev: PCI device struct
5797  * @state: PCI channel state
5798  *
5799  * Description: Called when a PCI error is detected.
5800  *
5801  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5802  */
5803 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5804 {
5805 	STUB();
5806 	return 0;
5807 #ifdef notyet
5808 	struct drm_device *dev = pci_get_drvdata(pdev);
5809 	struct amdgpu_device *adev = drm_to_adev(dev);
5810 	int i;
5811 
5812 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5813 
5814 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5815 		DRM_WARN("No support for XGMI hive yet...");
5816 		return PCI_ERS_RESULT_DISCONNECT;
5817 	}
5818 
5819 	adev->pci_channel_state = state;
5820 
5821 	switch (state) {
5822 	case pci_channel_io_normal:
5823 		return PCI_ERS_RESULT_CAN_RECOVER;
5824 	/* Fatal error, prepare for slot reset */
5825 	case pci_channel_io_frozen:
5826 		/*
5827 		 * Locking adev->reset_domain->sem will prevent any external access
5828 		 * to GPU during PCI error recovery
5829 		 */
5830 		amdgpu_device_lock_reset_domain(adev->reset_domain);
5831 		amdgpu_device_set_mp1_state(adev);
5832 
5833 		/*
5834 		 * Block any work scheduling as we do for regular GPU reset
5835 		 * for the duration of the recovery
5836 		 */
5837 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5838 			struct amdgpu_ring *ring = adev->rings[i];
5839 
5840 			if (!ring || !ring->sched.thread)
5841 				continue;
5842 
5843 			drm_sched_stop(&ring->sched, NULL);
5844 		}
5845 		atomic_inc(&adev->gpu_reset_counter);
5846 		return PCI_ERS_RESULT_NEED_RESET;
5847 	case pci_channel_io_perm_failure:
5848 		/* Permanent error, prepare for device removal */
5849 		return PCI_ERS_RESULT_DISCONNECT;
5850 	}
5851 
5852 	return PCI_ERS_RESULT_NEED_RESET;
5853 #endif
5854 }
5855 
5856 /**
5857  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5858  * @pdev: pointer to PCI device
5859  */
5860 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5861 {
5862 
5863 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5864 
5865 	/* TODO - dump whatever for debugging purposes */
5866 
5867 	/* This is called only if amdgpu_pci_error_detected returns
5868 	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
5869 	 * work, so there is no need to reset the slot.
5870 	 */
5871 
5872 	return PCI_ERS_RESULT_RECOVERED;
5873 }
5874 
5875 /**
5876  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5877  * @pdev: PCI device struct
5878  *
5879  * Description: This routine is called by the pci error recovery
5880  * code after the PCI slot has been reset, just before we
5881  * should resume normal operations.
5882  */
5883 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5884 {
5885 	STUB();
5886 	return PCI_ERS_RESULT_RECOVERED;
5887 #ifdef notyet
5888 	struct drm_device *dev = pci_get_drvdata(pdev);
5889 	struct amdgpu_device *adev = drm_to_adev(dev);
5890 	int r, i;
5891 	struct amdgpu_reset_context reset_context;
5892 	u32 memsize;
5893 	struct list_head device_list;
5894 
5895 	DRM_INFO("PCI error: slot reset callback!!\n");
5896 
5897 	memset(&reset_context, 0, sizeof(reset_context));
5898 
5899 	INIT_LIST_HEAD(&device_list);
5900 	list_add_tail(&adev->reset_list, &device_list);
5901 
5902 	/* wait for asic to come out of reset */
5903 	drm_msleep(500);
5904 
5905 	/* Restore PCI config space */
5906 	amdgpu_device_load_pci_state(pdev);
5907 
5908 	/* confirm ASIC came out of reset */
5909 	for (i = 0; i < adev->usec_timeout; i++) {
5910 		memsize = amdgpu_asic_get_config_memsize(adev);
5911 
5912 		if (memsize != 0xffffffff)
5913 			break;
5914 		udelay(1);
5915 	}
5916 	if (memsize == 0xffffffff) {
5917 		r = -ETIME;
5918 		goto out;
5919 	}
5920 
5921 	reset_context.method = AMD_RESET_METHOD_NONE;
5922 	reset_context.reset_req_dev = adev;
5923 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5924 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5925 
5926 	adev->no_hw_access = true;
5927 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5928 	adev->no_hw_access = false;
5929 	if (r)
5930 		goto out;
5931 
5932 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5933 
5934 out:
5935 	if (!r) {
5936 		if (amdgpu_device_cache_pci_state(adev->pdev))
5937 			pci_restore_state(adev->pdev);
5938 
5939 		DRM_INFO("PCIe error recovery succeeded\n");
5940 	} else {
5941 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5942 		amdgpu_device_unset_mp1_state(adev);
5943 		amdgpu_device_unlock_reset_domain(adev->reset_domain);
5944 	}
5945 
5946 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5947 #endif
5948 }
5949 
5950 /**
5951  * amdgpu_pci_resume() - resume normal ops after PCI reset
5952  * @pdev: pointer to PCI device
5953  *
5954  * Called when the error recovery driver tells us that its
5955  * Called when the error recovery driver tells us that it's
5956  */
5957 void amdgpu_pci_resume(struct pci_dev *pdev)
5958 {
5959 	STUB();
5960 #ifdef notyet
5961 	struct drm_device *dev = pci_get_drvdata(pdev);
5962 	struct amdgpu_device *adev = drm_to_adev(dev);
5963 	int i;
5964 
5965 
5966 	DRM_INFO("PCI error: resume callback!!\n");
5967 
5968 	/* Only continue execution for the case of pci_channel_io_frozen */
5969 	if (adev->pci_channel_state != pci_channel_io_frozen)
5970 		return;
5971 
5972 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5973 		struct amdgpu_ring *ring = adev->rings[i];
5974 
5975 		if (!ring || !ring->sched.thread)
5976 			continue;
5977 
5978 		drm_sched_start(&ring->sched, true);
5979 	}
5980 
5981 	amdgpu_device_unset_mp1_state(adev);
5982 	amdgpu_device_unlock_reset_domain(adev->reset_domain);
5983 #endif
5984 }
5985 
5986 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5987 {
5988 	return false;
5989 #ifdef notyet
5990 	struct drm_device *dev = pci_get_drvdata(pdev);
5991 	struct amdgpu_device *adev = drm_to_adev(dev);
5992 	int r;
5993 
5994 	r = pci_save_state(pdev);
5995 	if (!r) {
5996 		kfree(adev->pci_state);
5997 
5998 		adev->pci_state = pci_store_saved_state(pdev);
5999 
6000 		if (!adev->pci_state) {
6001 			DRM_ERROR("Failed to store PCI saved state");
6002 			return false;
6003 		}
6004 	} else {
6005 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
6006 		return false;
6007 	}
6008 
6009 	return true;
6010 #endif
6011 }
6012 
6013 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6014 {
6015 	STUB();
6016 	return false;
6017 #ifdef notyet
6018 	struct drm_device *dev = pci_get_drvdata(pdev);
6019 	struct amdgpu_device *adev = drm_to_adev(dev);
6020 	int r;
6021 
6022 	if (!adev->pci_state)
6023 		return false;
6024 
6025 	r = pci_load_saved_state(pdev, adev->pci_state);
6026 
6027 	if (!r) {
6028 		pci_restore_state(pdev);
6029 	} else {
6030 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
6031 		return false;
6032 	}
6033 
6034 	return true;
6035 #endif
6036 }
6037 
6038 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6039 		struct amdgpu_ring *ring)
6040 {
6041 #ifdef CONFIG_X86_64
6042 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6043 		return;
6044 #endif
6045 	if (adev->gmc.xgmi.connected_to_cpu)
6046 		return;
6047 
6048 	if (ring && ring->funcs->emit_hdp_flush)
6049 		amdgpu_ring_emit_hdp_flush(ring);
6050 	else
6051 		amdgpu_asic_flush_hdp(adev, ring);
6052 }
6053 
6054 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6055 		struct amdgpu_ring *ring)
6056 {
6057 #ifdef CONFIG_X86_64
6058 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6059 		return;
6060 #endif
6061 	if (adev->gmc.xgmi.connected_to_cpu)
6062 		return;
6063 
6064 	amdgpu_asic_invalidate_hdp(adev, ring);
6065 }
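
/*
 * Illustrative sketch (not part of the driver): the HDP helpers above
 * bracket CPU access to VRAM through the BAR: flush after CPU writes
 * so the GPU observes them, and invalidate before the CPU reads back
 * GPU output. The wrapper is hypothetical.
 */
#if 0
static void example_cpu_writes_then_gpu_reads(struct amdgpu_device *adev,
					      struct amdgpu_ring *ring)
{
	/* ... CPU fills a VRAM buffer through the BAR ... */
	amdgpu_device_flush_hdp(adev, ring);
	/* ... now it is safe to submit GPU work that reads the buffer ... */
}
#endif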
6066 
6067 int amdgpu_in_reset(struct amdgpu_device *adev)
6068 {
6069 	return atomic_read(&adev->reset_domain->in_gpu_reset);
6070 }
6071 
6072 /**
6073  * amdgpu_device_halt() - bring hardware to some kind of halt state
6074  *
6075  * @adev: amdgpu_device pointer
6076  *
6077  * Bring hardware to some kind of halt state so that no one can touch it
6078  * any more. It helps to preserve the error context when an error occurs.
6079  * Compared to a simple hang, the system stays stable at least for SSH
6080  * access. It should then be trivial to inspect the hardware state and
6081  * see what's going on. Implemented as follows:
6082  *
6083  * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.),
6084  *    clears all CPU mappings to device, disallows remappings through page faults
6085  * 2. amdgpu_irq_disable_all() disables all interrupts
6086  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6087  * 4. set adev->no_hw_access to avoid potential crashes after step 5
6088  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6089  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6090  *    flush any in flight DMA operations
6091  */
6092 void amdgpu_device_halt(struct amdgpu_device *adev)
6093 {
6094 	struct pci_dev *pdev = adev->pdev;
6095 	struct drm_device *ddev = adev_to_drm(adev);
6096 
6097 	amdgpu_xcp_dev_unplug(adev);
6098 	drm_dev_unplug(ddev);
6099 
6100 	amdgpu_irq_disable_all(adev);
6101 
6102 	amdgpu_fence_driver_hw_fini(adev);
6103 
6104 	adev->no_hw_access = true;
6105 
6106 	amdgpu_device_unmap_mmio(adev);
6107 
6108 	pci_disable_device(pdev);
6109 	pci_wait_for_pending_transaction(pdev);
6110 }
6111 
6112 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6113 				u32 reg)
6114 {
6115 	unsigned long flags, address, data;
6116 	u32 r;
6117 
6118 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6119 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6120 
6121 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6122 	WREG32(address, reg * 4);
6123 	(void)RREG32(address);
6124 	r = RREG32(data);
6125 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6126 	return r;
6127 }
6128 
6129 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6130 				u32 reg, u32 v)
6131 {
6132 	unsigned long flags, address, data;
6133 
6134 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6135 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6136 
6137 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6138 	WREG32(address, reg * 4);
6139 	(void)RREG32(address);
6140 	WREG32(data, v);
6141 	(void)RREG32(data);
6142 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6143 }
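
/*
 * Illustrative sketch (not part of the driver): the index/data helpers
 * above make read-modify-write of PCIe port registers straightforward.
 * The register offset and masks are hypothetical.
 */
#if 0
static void example_pcie_port_rmw(struct amdgpu_device *adev,
				  u32 reg, u32 clr, u32 set)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v = (v & ~clr) | set;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}
#endif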
6144 
6145 /**
6146  * amdgpu_device_switch_gang - switch to a new gang
6147  * @adev: amdgpu_device pointer
6148  * @gang: the gang to switch to
6149  *
6150  * Try to switch to a new gang.
6151  * Returns: NULL if we switched to the new gang or a reference to the current
6152  * gang leader.
6153  */
6154 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6155 					    struct dma_fence *gang)
6156 {
6157 	struct dma_fence *old = NULL;
6158 
6159 	do {
6160 		dma_fence_put(old);
6161 		rcu_read_lock();
6162 		old = dma_fence_get_rcu_safe(&adev->gang_submit);
6163 		rcu_read_unlock();
6164 
6165 		if (old == gang)
6166 			break;
6167 
6168 		if (!dma_fence_is_signaled(old))
6169 			return old;
6170 
6171 	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6172 			 old, gang) != old);
6173 
6174 	dma_fence_put(old);
6175 	return NULL;
6176 }
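
/*
 * Illustrative sketch (not part of the driver): a submitter retries
 * the switch after waiting for the previous gang leader returned by
 * amdgpu_device_switch_gang(). The wrapper is hypothetical.
 */
#if 0
static int example_switch_gang(struct amdgpu_device *adev,
			       struct dma_fence *gang)
{
	struct dma_fence *old;
	int r;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		r = dma_fence_wait(old, true);
		dma_fence_put(old);
		if (r)
			return r;
	}
	return 0;
}
#endif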
6177 
6178 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6179 {
6180 	switch (adev->asic_type) {
6181 #ifdef CONFIG_DRM_AMDGPU_SI
6182 	case CHIP_HAINAN:
6183 #endif
6184 	case CHIP_TOPAZ:
6185 		/* chips with no display hardware */
6186 		return false;
6187 #ifdef CONFIG_DRM_AMDGPU_SI
6188 	case CHIP_TAHITI:
6189 	case CHIP_PITCAIRN:
6190 	case CHIP_VERDE:
6191 	case CHIP_OLAND:
6192 #endif
6193 #ifdef CONFIG_DRM_AMDGPU_CIK
6194 	case CHIP_BONAIRE:
6195 	case CHIP_HAWAII:
6196 	case CHIP_KAVERI:
6197 	case CHIP_KABINI:
6198 	case CHIP_MULLINS:
6199 #endif
6200 	case CHIP_TONGA:
6201 	case CHIP_FIJI:
6202 	case CHIP_POLARIS10:
6203 	case CHIP_POLARIS11:
6204 	case CHIP_POLARIS12:
6205 	case CHIP_VEGAM:
6206 	case CHIP_CARRIZO:
6207 	case CHIP_STONEY:
6208 		/* chips with display hardware */
6209 		return true;
6210 	default:
6211 		/* IP discovery */
6212 		if (!adev->ip_versions[DCE_HWIP][0] ||
6213 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6214 			return false;
6215 		return true;
6216 	}
6217 }
6218 
6219 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6220 		uint32_t inst, uint32_t reg_addr, char reg_name[],
6221 		uint32_t expected_value, uint32_t mask)
6222 {
6223 	uint32_t ret = 0;
6224 	uint32_t old_ = 0;
6225 	uint32_t tmp_ = RREG32(reg_addr);
6226 	uint32_t loop = adev->usec_timeout;
6227 
6228 	while ((tmp_ & (mask)) != (expected_value)) {
6229 		if (old_ != tmp_) {
6230 			loop = adev->usec_timeout;
6231 			old_ = tmp_;
6232 		} else
6233 			udelay(1);
6234 		tmp_ = RREG32(reg_addr);
6235 		loop--;
6236 		if (!loop) {
6237 			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6238 				  inst, reg_name, (uint32_t)expected_value,
6239 				  (uint32_t)(tmp_ & (mask)));
6240 			ret = -ETIMEDOUT;
6241 			break;
6242 		}
6243 	}
6244 	return ret;
6245 }
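
/*
 * Illustrative sketch (not part of the driver): polling a hypothetical
 * status register until a ready bit is set, with the timeout handling
 * done by amdgpu_device_wait_on_rreg() above.
 */
#if 0
static int example_wait_ready(struct amdgpu_device *adev, uint32_t reg_addr)
{
	const uint32_t ready_mask = 0x1;

	return amdgpu_device_wait_on_rreg(adev, 0, reg_addr, "STATUS",
					  ready_mask, ready_mask);
}
#endif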
6246