xref: /openbsd/sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c (revision ac374fd8)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39 
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69 
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72 
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78 
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82 
83 #include <drm/drm_drv.h>
84 
85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__)
86 #include <asm/intel-family.h>
87 #endif
88 
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96 
97 #define AMDGPU_RESUME_MS		2000
98 #define AMDGPU_MAX_RETRY_LIMIT		2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100 
101 static const struct drm_driver amdgpu_kms_driver;
102 
103 const char *amdgpu_asic_name[] = {
104 	"TAHITI",
105 	"PITCAIRN",
106 	"VERDE",
107 	"OLAND",
108 	"HAINAN",
109 	"BONAIRE",
110 	"KAVERI",
111 	"KABINI",
112 	"HAWAII",
113 	"MULLINS",
114 	"TOPAZ",
115 	"TONGA",
116 	"FIJI",
117 	"CARRIZO",
118 	"STONEY",
119 	"POLARIS10",
120 	"POLARIS11",
121 	"POLARIS12",
122 	"VEGAM",
123 	"VEGA10",
124 	"VEGA12",
125 	"VEGA20",
126 	"RAVEN",
127 	"ARCTURUS",
128 	"RENOIR",
129 	"ALDEBARAN",
130 	"NAVI10",
131 	"CYAN_SKILLFISH",
132 	"NAVI14",
133 	"NAVI12",
134 	"SIENNA_CICHLID",
135 	"NAVY_FLOUNDER",
136 	"VANGOGH",
137 	"DIMGREY_CAVEFISH",
138 	"BEIGE_GOBY",
139 	"YELLOW_CARP",
140 	"IP DISCOVERY",
141 	"LAST",
142 };
143 
144 /**
145  * DOC: pcie_replay_count
146  *
147  * The amdgpu driver provides a sysfs API for reporting the total number
148  * of PCIe replays (NAKs).
149  * The file pcie_replay_count is used for this and returns the total
150  * number of replays as a sum of the NAKs generated and the NAKs received.
151  */
152 
153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
154 		struct device_attribute *attr, char *buf)
155 {
156 	struct drm_device *ddev = dev_get_drvdata(dev);
157 	struct amdgpu_device *adev = drm_to_adev(ddev);
158 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
159 
160 	return sysfs_emit(buf, "%llu\n", cnt);
161 }
162 
163 static DEVICE_ATTR(pcie_replay_count, 0444,
164 		amdgpu_device_get_pcie_replay_count, NULL);
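
/*
 * Illustrative sketch (not part of the driver): userspace reads the
 * attribute defined above through sysfs.  The exact card index in the
 * path below is an assumption and depends on the system:
 *
 *	$ cat /sys/class/drm/card0/device/pcie_replay_count
 *	0
 */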
165 
166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
167 
168 
169 /**
170  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
171  *
172  * @dev: drm_device pointer
173  *
174  * Returns true if the device is a dGPU with ATPX power control,
175  * otherwise return false.
176  */
177 bool amdgpu_device_supports_px(struct drm_device *dev)
178 {
179 	struct amdgpu_device *adev = drm_to_adev(dev);
180 
181 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
182 		return true;
183 	return false;
184 }
185 
186 /**
187  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
188  *
189  * @dev: drm_device pointer
190  *
191  * Returns true if the device is a dGPU with ACPI power control,
192  * otherwise return false.
193  */
194 bool amdgpu_device_supports_boco(struct drm_device *dev)
195 {
196 	struct amdgpu_device *adev = drm_to_adev(dev);
197 
198 	if (adev->has_pr3 ||
199 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
200 		return true;
201 	return false;
202 }
203 
204 /**
205  * amdgpu_device_supports_baco - Does the device support BACO
206  *
207  * @dev: drm_device pointer
208  *
209  * Returns true if the device supports BACO,
210  * otherwise returns false.
211  */
212 bool amdgpu_device_supports_baco(struct drm_device *dev)
213 {
214 	struct amdgpu_device *adev = drm_to_adev(dev);
215 
216 	return amdgpu_asic_supports_baco(adev);
217 }
218 
219 /**
220  * amdgpu_device_supports_smart_shift - Is the device dGPU with
221  * smart shift support
222  *
223  * @dev: drm_device pointer
224  *
225  * Returns true if the device is a dGPU with Smart Shift support,
226  * otherwise returns false.
227  */
228 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
229 {
230 	return (amdgpu_device_supports_boco(dev) &&
231 		amdgpu_acpi_is_power_shift_control_supported());
232 }
233 
234 /*
235  * VRAM access helper functions
236  */
237 
238 /**
239  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
240  *
241  * @adev: amdgpu_device pointer
242  * @pos: offset of the buffer in vram
243  * @buf: virtual address of the buffer in system memory
244  * @size: read/write size, sizeof(@buf) must be >= @size
245  * @write: true - write to vram, otherwise - read from vram
246  */
247 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
248 			     void *buf, size_t size, bool write)
249 {
250 	unsigned long flags;
251 	uint32_t hi = ~0, tmp = 0;
252 	uint32_t *data = buf;
253 	uint64_t last;
254 	int idx;
255 
256 	if (!drm_dev_enter(adev_to_drm(adev), &idx))
257 		return;
258 
259 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
260 
261 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
262 	for (last = pos + size; pos < last; pos += 4) {
263 		tmp = pos >> 31;
264 
265 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
266 		if (tmp != hi) {
267 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
268 			hi = tmp;
269 		}
270 		if (write)
271 			WREG32_NO_KIQ(mmMM_DATA, *data++);
272 		else
273 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
274 	}
275 
276 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
277 	drm_dev_exit(idx);
278 }
279 
280 /**
281  * amdgpu_device_aper_access - access vram by the vram aperture
282  *
283  * @adev: amdgpu_device pointer
284  * @pos: offset of the buffer in vram
285  * @buf: virtual address of the buffer in system memory
286  * @size: read/write size, sizeof(@buf) must be >= @size
287  * @write: true - write to vram, otherwise - read from vram
288  *
289  * The return value means how many bytes have been transferred.
290  */
291 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
292 				 void *buf, size_t size, bool write)
293 {
294 #ifdef CONFIG_64BIT
295 	void __iomem *addr;
296 	size_t count = 0;
297 	uint64_t last;
298 
299 	if (!adev->mman.aper_base_kaddr)
300 		return 0;
301 
302 	last = min(pos + size, adev->gmc.visible_vram_size);
303 	if (last > pos) {
304 		addr = adev->mman.aper_base_kaddr + pos;
305 		count = last - pos;
306 
307 		if (write) {
308 			memcpy_toio(addr, buf, count);
309 			/* Make sure HDP write cache flush happens without any reordering
310 			 * after the system memory contents are sent over PCIe device
311 			 */
312 			mb();
313 			amdgpu_device_flush_hdp(adev, NULL);
314 		} else {
315 			amdgpu_device_invalidate_hdp(adev, NULL);
316 			/* Make sure HDP read cache is invalidated before issuing a read
317 			 * to the PCIe device
318 			 */
319 			mb();
320 			memcpy_fromio(buf, addr, count);
321 		}
322 
323 	}
324 
325 	return count;
326 #else
327 	return 0;
328 #endif
329 }
330 
331 /**
332  * amdgpu_device_vram_access - read/write a buffer in vram
333  *
334  * @adev: amdgpu_device pointer
335  * @pos: offset of the buffer in vram
336  * @buf: virtual address of the buffer in system memory
337  * @size: read/write size, sizeof(@buf) must be >= @size
338  * @write: true - write to vram, otherwise - read from vram
339  */
340 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
341 			       void *buf, size_t size, bool write)
342 {
343 	size_t count;
344 
345 	/* try using the vram aperture to access vram first */
346 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
347 	size -= count;
348 	if (size) {
349 		/* use MM to access the rest of vram */
350 		pos += count;
351 		buf += count;
352 		amdgpu_device_mm_access(adev, pos, buf, size, write);
353 	}
354 }
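
/*
 * Illustrative sketch (hypothetical caller, not part of this file): copy
 * 16 bytes from the start of VRAM into system memory with the helper
 * above.  Both the offset and the size must be dword aligned, as
 * amdgpu_device_mm_access() enforces with a BUG_ON.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */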
355 
356 /*
357  * register access helper functions.
358  */
359 
360 /* Check if hw access should be skipped because of hotplug or device error */
361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
362 {
363 	if (adev->no_hw_access)
364 		return true;
365 
366 #ifdef CONFIG_LOCKDEP
367 	/*
368 	 * This is a bit complicated to understand, so worth a comment. What we assert
369 	 * here is that the GPU reset is not running on another thread in parallel.
370 	 *
371 	 * For this we trylock the read side of the reset semaphore, if that succeeds
372 	 * we know that the reset is not running in parallel.
373 	 *
374 	 * If the trylock fails we assert that we are either already holding the read
375 	 * side of the lock or are the reset thread itself and hold the write side of
376 	 * the lock.
377 	 */
378 	if (in_task()) {
379 		if (down_read_trylock(&adev->reset_domain->sem))
380 			up_read(&adev->reset_domain->sem);
381 		else
382 			lockdep_assert_held(&adev->reset_domain->sem);
383 	}
384 #endif
385 	return false;
386 }
387 
388 /**
389  * amdgpu_device_rreg - read a memory mapped IO or indirect register
390  *
391  * @adev: amdgpu_device pointer
392  * @reg: dword aligned register offset
393  * @acc_flags: access flags which require special behavior
394  *
395  * Returns the 32 bit value from the offset specified.
396  */
397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
398 			    uint32_t reg, uint32_t acc_flags)
399 {
400 	uint32_t ret;
401 
402 	if (amdgpu_device_skip_hw_access(adev))
403 		return 0;
404 
405 	if ((reg * 4) < adev->rmmio_size) {
406 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
407 		    amdgpu_sriov_runtime(adev) &&
408 		    down_read_trylock(&adev->reset_domain->sem)) {
409 			ret = amdgpu_kiq_rreg(adev, reg);
410 			up_read(&adev->reset_domain->sem);
411 		} else {
412 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
413 		}
414 	} else {
415 		ret = adev->pcie_rreg(adev, reg * 4);
416 	}
417 
418 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
419 
420 	return ret;
421 }
422 
423 /*
424  * MMIO register read with bytes helper functions
425  * @offset: byte offset from MMIO start
426  */
427 
428 /**
429  * amdgpu_mm_rreg8 - read a memory mapped IO register
430  *
431  * @adev: amdgpu_device pointer
432  * @offset: byte aligned register offset
433  *
434  * Returns the 8 bit value from the offset specified.
435  */
436 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
437 {
438 	if (amdgpu_device_skip_hw_access(adev))
439 		return 0;
440 
441 	if (offset < adev->rmmio_size)
442 		return (readb(adev->rmmio + offset));
443 	BUG();
444 }
445 
446 /*
447  * MMIO register write with bytes helper functions
448  * @offset: byte offset from MMIO start
449  * @value: the value to be written to the register
450  */
451 
452 /**
453  * amdgpu_mm_wreg8 - write to a memory mapped IO register
454  *
455  * @adev: amdgpu_device pointer
456  * @offset: byte aligned register offset
457  * @value: 8 bit value to write
458  *
459  * Writes the value specified to the offset specified.
460  */
461 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
462 {
463 	if (amdgpu_device_skip_hw_access(adev))
464 		return;
465 
466 	if (offset < adev->rmmio_size)
467 		writeb(value, adev->rmmio + offset);
468 	else
469 		BUG();
470 }
471 
472 /**
473  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
474  *
475  * @adev: amdgpu_device pointer
476  * @reg: dword aligned register offset
477  * @v: 32 bit value to write to the register
478  * @acc_flags: access flags which require special behavior
479  *
480  * Writes the value specified to the offset specified.
481  */
482 void amdgpu_device_wreg(struct amdgpu_device *adev,
483 			uint32_t reg, uint32_t v,
484 			uint32_t acc_flags)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return;
488 
489 	if ((reg * 4) < adev->rmmio_size) {
490 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
491 		    amdgpu_sriov_runtime(adev) &&
492 		    down_read_trylock(&adev->reset_domain->sem)) {
493 			amdgpu_kiq_wreg(adev, reg, v);
494 			up_read(&adev->reset_domain->sem);
495 		} else {
496 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
497 		}
498 	} else {
499 		adev->pcie_wreg(adev, reg * 4, v);
500 	}
501 
502 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
503 }
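
/*
 * Sketch of typical usage (hypothetical, not part of this file): the
 * RREG32()/WREG32() style macros in the driver headers are expected to
 * resolve to amdgpu_device_rreg()/amdgpu_device_wreg() above, so a
 * read-modify-write of a dword register looks roughly like:
 *
 *	uint32_t tmp = amdgpu_device_rreg(adev, reg, 0);
 *
 *	tmp |= mask;
 *	amdgpu_device_wreg(adev, reg, tmp, 0);
 */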
504 
505 /**
506  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
507  *
508  * @adev: amdgpu_device pointer
509  * @reg: mmio/rlc register
510  * @v: value to write
511  *
512  * this function is invoked only for the debugfs register access
513  */
514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
515 			     uint32_t reg, uint32_t v,
516 			     uint32_t xcc_id)
517 {
518 	if (amdgpu_device_skip_hw_access(adev))
519 		return;
520 
521 	if (amdgpu_sriov_fullaccess(adev) &&
522 	    adev->gfx.rlc.funcs &&
523 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
524 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
525 			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
526 	} else if ((reg * 4) >= adev->rmmio_size) {
527 		adev->pcie_wreg(adev, reg * 4, v);
528 	} else {
529 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
530 	}
531 }
532 
533 /**
534  * amdgpu_device_indirect_rreg - read an indirect register
535  *
536  * @adev: amdgpu_device pointer
537  * @reg_addr: indirect register address to read from
538  *
539  * Returns the value of indirect register @reg_addr
540  */
541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
542 				u32 reg_addr)
543 {
544 	unsigned long flags, pcie_index, pcie_data;
545 	void __iomem *pcie_index_offset;
546 	void __iomem *pcie_data_offset;
547 	u32 r;
548 
549 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
550 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
551 
552 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
553 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
554 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
555 
556 	writel(reg_addr, pcie_index_offset);
557 	readl(pcie_index_offset);
558 	r = readl(pcie_data_offset);
559 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
560 
561 	return r;
562 }
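
/*
 * Note on the pattern above: the target register address is written to the
 * PCIE index register and read back to flush the posted write, then the
 * PCIE data register is accessed.  Everything happens under pcie_idx_lock
 * because the index/data pair is a single shared window into the register
 * space beyond the direct MMIO aperture.
 */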
563 
564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
565 				    u64 reg_addr)
566 {
567 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
568 	u32 r;
569 	void __iomem *pcie_index_offset;
570 	void __iomem *pcie_index_hi_offset;
571 	void __iomem *pcie_data_offset;
572 
573 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
574 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
575 	if (adev->nbio.funcs->get_pcie_index_hi_offset)
576 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
577 	else
578 		pcie_index_hi = 0;
579 
580 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
581 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
582 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
583 	if (pcie_index_hi != 0)
584 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
585 				pcie_index_hi * 4;
586 
587 	writel(reg_addr, pcie_index_offset);
588 	readl(pcie_index_offset);
589 	if (pcie_index_hi != 0) {
590 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
591 		readl(pcie_index_hi_offset);
592 	}
593 	r = readl(pcie_data_offset);
594 
595 	/* clear the high bits */
596 	if (pcie_index_hi != 0) {
597 		writel(0, pcie_index_hi_offset);
598 		readl(pcie_index_hi_offset);
599 	}
600 
601 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602 
603 	return r;
604 }
605 
606 /**
607  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
608  *
609  * @adev: amdgpu_device pointer
610  * @reg_addr: indirect register address to read from
611  *
612  * Returns the value of indirect register @reg_addr
613  */
614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
615 				  u32 reg_addr)
616 {
617 	unsigned long flags, pcie_index, pcie_data;
618 	void __iomem *pcie_index_offset;
619 	void __iomem *pcie_data_offset;
620 	u64 r;
621 
622 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
623 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
624 
625 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
626 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
627 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
628 
629 	/* read low 32 bits */
630 	writel(reg_addr, pcie_index_offset);
631 	readl(pcie_index_offset);
632 	r = readl(pcie_data_offset);
633 	/* read high 32 bits */
634 	writel(reg_addr + 4, pcie_index_offset);
635 	readl(pcie_index_offset);
636 	r |= ((u64)readl(pcie_data_offset) << 32);
637 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
638 
639 	return r;
640 }
641 
642 /**
643  * amdgpu_device_indirect_wreg - write an indirect register address
644  *
645  * @adev: amdgpu_device pointer
646  * @reg_addr: indirect register offset
647  * @reg_data: indirect register data
648  *
649  */
650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
651 				 u32 reg_addr, u32 reg_data)
652 {
653 	unsigned long flags, pcie_index, pcie_data;
654 	void __iomem *pcie_index_offset;
655 	void __iomem *pcie_data_offset;
656 
657 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
658 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
659 
660 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
661 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
662 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
663 
664 	writel(reg_addr, pcie_index_offset);
665 	readl(pcie_index_offset);
666 	writel(reg_data, pcie_data_offset);
667 	readl(pcie_data_offset);
668 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
669 }
670 
671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
672 				     u64 reg_addr, u32 reg_data)
673 {
674 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
675 	void __iomem *pcie_index_offset;
676 	void __iomem *pcie_index_hi_offset;
677 	void __iomem *pcie_data_offset;
678 
679 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
680 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
681 	if (adev->nbio.funcs->get_pcie_index_hi_offset)
682 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
683 	else
684 		pcie_index_hi = 0;
685 
686 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
687 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
688 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
689 	if (pcie_index_hi != 0)
690 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
691 				pcie_index_hi * 4;
692 
693 	writel(reg_addr, pcie_index_offset);
694 	readl(pcie_index_offset);
695 	if (pcie_index_hi != 0) {
696 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
697 		readl(pcie_index_hi_offset);
698 	}
699 	writel(reg_data, pcie_data_offset);
700 	readl(pcie_data_offset);
701 
702 	/* clear the high bits */
703 	if (pcie_index_hi != 0) {
704 		writel(0, pcie_index_hi_offset);
705 		readl(pcie_index_hi_offset);
706 	}
707 
708 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
709 }
710 
711 /**
712  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
713  *
714  * @adev: amdgpu_device pointer
715  * @reg_addr: indirect register offset
716  * @reg_data: indirect register data
717  *
718  */
719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
720 				   u32 reg_addr, u64 reg_data)
721 {
722 	unsigned long flags, pcie_index, pcie_data;
723 	void __iomem *pcie_index_offset;
724 	void __iomem *pcie_data_offset;
725 
726 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
727 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
728 
729 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
730 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
731 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
732 
733 	/* write low 32 bits */
734 	writel(reg_addr, pcie_index_offset);
735 	readl(pcie_index_offset);
736 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
737 	readl(pcie_data_offset);
738 	/* write high 32 bits */
739 	writel(reg_addr + 4, pcie_index_offset);
740 	readl(pcie_index_offset);
741 	writel((u32)(reg_data >> 32), pcie_data_offset);
742 	readl(pcie_data_offset);
743 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
744 }
745 
746 /**
747  * amdgpu_device_get_rev_id - query device rev_id
748  *
749  * @adev: amdgpu_device pointer
750  *
751  * Return device rev_id
752  */
753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
754 {
755 	return adev->nbio.funcs->get_rev_id(adev);
756 }
757 
758 /**
759  * amdgpu_invalid_rreg - dummy reg read function
760  *
761  * @adev: amdgpu_device pointer
762  * @reg: offset of register
763  *
764  * Dummy register read function.  Used for register blocks
765  * that certain asics don't have (all asics).
766  * Returns the value in the register.
767  */
768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
769 {
770 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
771 	BUG();
772 	return 0;
773 }
774 
775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
776 {
777 	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
778 	BUG();
779 	return 0;
780 }
781 
782 /**
783  * amdgpu_invalid_wreg - dummy reg write function
784  *
785  * @adev: amdgpu_device pointer
786  * @reg: offset of register
787  * @v: value to write to the register
788  *
789  * Dummy register write function.  Used for register blocks
790  * that certain asics don't have (all asics).
791  */
792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
793 {
794 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
795 		  reg, v);
796 	BUG();
797 }
798 
799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
800 {
801 	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
802 		  reg, v);
803 	BUG();
804 }
805 
806 /**
807  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
808  *
809  * @adev: amdgpu_device pointer
810  * @reg: offset of register
811  *
812  * Dummy register read function.  Used for register blocks
813  * that certain asics don't have (all asics).
814  * Returns the value in the register.
815  */
816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
817 {
818 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
819 	BUG();
820 	return 0;
821 }
822 
823 /**
824  * amdgpu_invalid_wreg64 - dummy reg write function
825  *
826  * @adev: amdgpu_device pointer
827  * @reg: offset of register
828  * @v: value to write to the register
829  *
830  * Dummy register write function.  Used for register blocks
831  * that certain asics don't have (all asics).
832  */
833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
834 {
835 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
836 		  reg, v);
837 	BUG();
838 }
839 
840 /**
841  * amdgpu_block_invalid_rreg - dummy reg read function
842  *
843  * @adev: amdgpu_device pointer
844  * @block: offset of instance
845  * @reg: offset of register
846  *
847  * Dummy register read function.  Used for register blocks
848  * that certain asics don't have (all asics).
849  * Returns the value in the register.
850  */
851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
852 					  uint32_t block, uint32_t reg)
853 {
854 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
855 		  reg, block);
856 	BUG();
857 	return 0;
858 }
859 
860 /**
861  * amdgpu_block_invalid_wreg - dummy reg write function
862  *
863  * @adev: amdgpu_device pointer
864  * @block: offset of instance
865  * @reg: offset of register
866  * @v: value to write to the register
867  *
868  * Dummy register write function.  Used for register blocks
869  * that certain asics don't have (all asics).
870  */
871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
872 				      uint32_t block,
873 				      uint32_t reg, uint32_t v)
874 {
875 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
876 		  reg, block, v);
877 	BUG();
878 }
879 
880 /**
881  * amdgpu_device_asic_init - Wrapper for atom asic_init
882  *
883  * @adev: amdgpu_device pointer
884  *
885  * Does any asic specific work and then calls atom asic init.
886  */
887 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
888 {
889 	int ret;
890 
891 	amdgpu_asic_pre_asic_init(adev);
892 
893 	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
894 	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
895 		amdgpu_psp_wait_for_bootloader(adev);
896 		ret = amdgpu_atomfirmware_asic_init(adev, true);
897 		return ret;
898 	} else {
899 		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
900 	}
901 
902 	return 0;
903 }
904 
905 /**
906  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
907  *
908  * @adev: amdgpu_device pointer
909  *
910  * Allocates a scratch page of VRAM for use by various things in the
911  * driver.
912  */
913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
914 {
915 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
916 				       AMDGPU_GEM_DOMAIN_VRAM |
917 				       AMDGPU_GEM_DOMAIN_GTT,
918 				       &adev->mem_scratch.robj,
919 				       &adev->mem_scratch.gpu_addr,
920 				       (void **)&adev->mem_scratch.ptr);
921 }
922 
923 /**
924  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
925  *
926  * @adev: amdgpu_device pointer
927  *
928  * Frees the VRAM scratch page.
929  */
930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
931 {
932 	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
933 }
934 
935 /**
936  * amdgpu_device_program_register_sequence - program an array of registers.
937  *
938  * @adev: amdgpu_device pointer
939  * @registers: pointer to the register array
940  * @array_size: size of the register array
941  *
942  * Programs an array of registers with AND/OR masks.
943  * This is a helper for setting golden registers.
944  */
945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
946 					     const u32 *registers,
947 					     const u32 array_size)
948 {
949 	u32 tmp, reg, and_mask, or_mask;
950 	int i;
951 
952 	if (array_size % 3)
953 		return;
954 
955 	for (i = 0; i < array_size; i += 3) {
956 		reg = registers[i + 0];
957 		and_mask = registers[i + 1];
958 		or_mask = registers[i + 2];
959 
960 		if (and_mask == 0xffffffff) {
961 			tmp = or_mask;
962 		} else {
963 			tmp = RREG32(reg);
964 			tmp &= ~and_mask;
965 			if (adev->family >= AMDGPU_FAMILY_AI)
966 				tmp |= (or_mask & and_mask);
967 			else
968 				tmp |= or_mask;
969 		}
970 		WREG32(reg, tmp);
971 	}
972 }
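
/*
 * Illustrative sketch (hypothetical values, not taken from any ASIC): the
 * register array consumed above is a flat list of {offset, and_mask,
 * or_mask} triples, so a caller would look roughly like:
 *
 *	static const u32 golden_settings_example[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *		0x5678, 0x0000ff00, 0x00003400,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 */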
973 
974 /**
975  * amdgpu_device_pci_config_reset - reset the GPU
976  *
977  * @adev: amdgpu_device pointer
978  *
979  * Resets the GPU using the pci config reset sequence.
980  * Only applicable to asics prior to vega10.
981  */
982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
983 {
984 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
985 }
986 
987 /**
988  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
989  *
990  * @adev: amdgpu_device pointer
991  *
992  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
993  */
994 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
995 {
996 	STUB();
997 	return -ENOSYS;
998 #ifdef notyet
999 	return pci_reset_function(adev->pdev);
1000 #endif
1001 }
1002 
1003 /*
1004  * amdgpu_device_wb_*()
1005  * Writeback is the method by which the GPU updates special pages in memory
1006  * with the status of certain GPU events (fences, ring pointers, etc.).
1007  */
1008 
1009 /**
1010  * amdgpu_device_wb_fini - Disable Writeback and free memory
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Disables Writeback and frees the Writeback memory (all asics).
1015  * Used at driver shutdown.
1016  */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 	if (adev->wb.wb_obj) {
1020 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 				      &adev->wb.gpu_addr,
1022 				      (void **)&adev->wb.wb);
1023 		adev->wb.wb_obj = NULL;
1024 	}
1025 }
1026 
1027 /**
1028  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1029  *
1030  * @adev: amdgpu_device pointer
1031  *
1032  * Initializes writeback and allocates writeback memory (all asics).
1033  * Used at driver startup.
1034  * Returns 0 on success or an -error on failure.
1035  */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 	int r;
1039 
1040 	if (adev->wb.wb_obj == NULL) {
1041 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 					    (void **)&adev->wb.wb);
1046 		if (r) {
1047 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 			return r;
1049 		}
1050 
1051 		adev->wb.num_wb = AMDGPU_MAX_WB;
1052 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053 
1054 		/* clear wb memory */
1055 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 	}
1057 
1058 	return 0;
1059 }
1060 
1061 /**
1062  * amdgpu_device_wb_get - Allocate a wb entry
1063  *
1064  * @adev: amdgpu_device pointer
1065  * @wb: wb index
1066  *
1067  * Allocate a wb slot for use by the driver (all asics).
1068  * Returns 0 on success or -EINVAL on failure.
1069  */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073 
1074 	if (offset < adev->wb.num_wb) {
1075 		__set_bit(offset, adev->wb.used);
1076 		*wb = offset << 3; /* convert to dw offset */
1077 		return 0;
1078 	} else {
1079 		return -EINVAL;
1080 	}
1081 }
1082 
1083 /**
1084  * amdgpu_device_wb_free - Free a wb entry
1085  *
1086  * @adev: amdgpu_device pointer
1087  * @wb: wb index
1088  *
1089  * Free a wb slot allocated for use by the driver (all asics)
1090  */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 	wb >>= 3;
1094 	if (wb < adev->wb.num_wb)
1095 		__clear_bit(wb, adev->wb.used);
1096 }
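
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a ring
 * or fence user allocates a writeback slot, derives its CPU and GPU views,
 * and releases the slot on teardown:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile uint32_t *cpu_ptr = &adev->wb.wb[wb];
 *		uint64_t gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *		... let the GPU write status to gpu_addr, poll *cpu_ptr ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */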
1097 
1098 /**
1099  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100  *
1101  * @adev: amdgpu_device pointer
1102  *
1103  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1104  * to fail, but if any of the BARs is not accessible after the size we abort
1105  * driver loading by returning -ENODEV.
1106  */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 #ifdef __linux__
1110 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1111 	struct pci_bus *root;
1112 	struct resource *res;
1113 	unsigned int i;
1114 	u16 cmd;
1115 	int r;
1116 
1117 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1118 		return 0;
1119 
1120 	/* Bypass for VF */
1121 	if (amdgpu_sriov_vf(adev))
1122 		return 0;
1123 
1124 	/* skip if the bios has already enabled large BAR */
1125 	if (adev->gmc.real_vram_size &&
1126 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1127 		return 0;
1128 
1129 	/* Check if the root BUS has 64bit memory resources */
1130 	root = adev->pdev->bus;
1131 	while (root->parent)
1132 		root = root->parent;
1133 
1134 	pci_bus_for_each_resource(root, res, i) {
1135 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1136 		    res->start > 0x100000000ull)
1137 			break;
1138 	}
1139 
1140 	/* Trying to resize is pointless without a root hub window above 4GB */
1141 	if (!res)
1142 		return 0;
1143 
1144 	/* Limit the BAR size to what is available */
1145 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1146 			rbar_size);
1147 
1148 	/* Disable memory decoding while we change the BAR addresses and size */
1149 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1150 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1151 			      cmd & ~PCI_COMMAND_MEMORY);
1152 
1153 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1154 	amdgpu_doorbell_fini(adev);
1155 	if (adev->asic_type >= CHIP_BONAIRE)
1156 		pci_release_resource(adev->pdev, 2);
1157 
1158 	pci_release_resource(adev->pdev, 0);
1159 
1160 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1161 	if (r == -ENOSPC)
1162 		DRM_INFO("Not enough PCI address space for a large BAR.");
1163 	else if (r && r != -ENOTSUPP)
1164 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1165 
1166 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1167 
1168 	/* When the doorbell or fb BAR isn't available we have no chance of
1169 	 * using the device.
1170 	 */
1171 	r = amdgpu_doorbell_init(adev);
1172 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1173 		return -ENODEV;
1174 
1175 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1176 #endif /* __linux__ */
1177 
1178 	return 0;
1179 }
1180 
1181 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1182 {
1183 	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1184 		return false;
1185 
1186 	return true;
1187 }
1188 
1189 /*
1190  * GPU helpers function.
1191  */
1192 /**
1193  * amdgpu_device_need_post - check if the hw needs post or not
1194  *
1195  * @adev: amdgpu_device pointer
1196  *
1197  * Check if the asic has been initialized (all asics) at driver startup
1198  * or whether post is needed if a hw reset was performed.
1199  * Returns true if post is needed, false if not.
1200  */
1201 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1202 {
1203 	uint32_t reg;
1204 
1205 	if (amdgpu_sriov_vf(adev))
1206 		return false;
1207 
1208 	if (!amdgpu_device_read_bios(adev))
1209 		return false;
1210 
1211 	if (amdgpu_passthrough(adev)) {
1212 		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1213 		 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
1214 		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1215 		 * force vPost for SMC versions below 22.15.
1216 		 */
1217 		if (adev->asic_type == CHIP_FIJI) {
1218 			int err;
1219 			uint32_t fw_ver;
1220 
1221 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1222 			/* force vPost if an error occurred */
1223 			if (err)
1224 				return true;
1225 
1226 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1227 			release_firmware(adev->pm.fw);
1228 			if (fw_ver < 0x00160e00)
1229 				return true;
1230 		}
1231 	}
1232 
1233 	/* Don't post if we need to reset whole hive on init */
1234 	if (adev->gmc.xgmi.pending_reset)
1235 		return false;
1236 
1237 	if (adev->has_hw_reset) {
1238 		adev->has_hw_reset = false;
1239 		return true;
1240 	}
1241 
1242 	/* bios scratch used on CIK+ */
1243 	if (adev->asic_type >= CHIP_BONAIRE)
1244 		return amdgpu_atombios_scratch_need_asic_init(adev);
1245 
1246 	/* check MEM_SIZE for older asics */
1247 	reg = amdgpu_asic_get_config_memsize(adev);
1248 
1249 	if ((reg != 0) && (reg != 0xffffffff))
1250 		return false;
1251 
1252 	return true;
1253 }
1254 
1255 /*
1256  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1257  * speed switching. Until we have confirmation from Intel that a specific host
1258  * supports it, it's safer to keep it disabled for all.
1259  *
1260  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1261  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1262  */
1263 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1264 {
1265 #if IS_ENABLED(CONFIG_X86)
1266 #ifdef __linux__
1267 	struct cpuinfo_x86 *c = &cpu_data(0);
1268 
1269 	if (c->x86_vendor == X86_VENDOR_INTEL)
1270 #else
1271 	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
1272 #endif
1273 		return false;
1274 #endif
1275 	return true;
1276 }
1277 
1278 /**
1279  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1280  *
1281  * @adev: amdgpu_device pointer
1282  *
1283  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1284  * be set for this device.
1285  *
1286  * Returns true if it should be used or false if not.
1287  */
1288 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1289 {
1290 	switch (amdgpu_aspm) {
1291 	case -1:
1292 		break;
1293 	case 0:
1294 		return false;
1295 	case 1:
1296 		return true;
1297 	default:
1298 		return false;
1299 	}
1300 	return pcie_aspm_enabled(adev->pdev);
1301 }
1302 
1303 bool amdgpu_device_aspm_support_quirk(void)
1304 {
1305 #if IS_ENABLED(CONFIG_X86)
1306 	struct cpu_info *ci = curcpu();
1307 
1308 	return !(ci->ci_family == 6 && ci->ci_model == 0x97);
1309 #else
1310 	return true;
1311 #endif
1312 }
1313 
1314 /* if we get transitioned to only one device, take VGA back */
1315 /**
1316  * amdgpu_device_vga_set_decode - enable/disable vga decode
1317  *
1318  * @pdev: PCI device pointer
1319  * @state: enable/disable vga decode
1320  *
1321  * Enable/disable vga decode (all asics).
1322  * Returns VGA resource flags.
1323  */
1324 #ifdef notyet
1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1326 		bool state)
1327 {
1328 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1329 
1330 	amdgpu_asic_set_vga_state(adev, state);
1331 	if (state)
1332 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1333 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1334 	else
1335 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1336 }
1337 #endif
1338 
1339 /**
1340  * amdgpu_device_check_block_size - validate the vm block size
1341  *
1342  * @adev: amdgpu_device pointer
1343  *
1344  * Validates the vm block size specified via module parameter.
1345  * The vm block size defines number of bits in page table versus page directory,
1346  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1347  * page table and the remaining bits are in the page directory.
1348  */
1349 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1350 {
1351 	/* defines number of bits in page table versus page directory,
1352 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1353 	 * page table and the remaining bits are in the page directory
1354 	 */
1355 	if (amdgpu_vm_block_size == -1)
1356 		return;
1357 
1358 	if (amdgpu_vm_block_size < 9) {
1359 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1360 			 amdgpu_vm_block_size);
1361 		amdgpu_vm_block_size = -1;
1362 	}
1363 }
1364 
1365 /**
1366  * amdgpu_device_check_vm_size - validate the vm size
1367  *
1368  * @adev: amdgpu_device pointer
1369  *
1370  * Validates the vm size in GB specified via module parameter.
1371  * The VM size is the size of the GPU virtual memory space in GB.
1372  */
1373 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1374 {
1375 	/* no need to check the default value */
1376 	if (amdgpu_vm_size == -1)
1377 		return;
1378 
1379 	if (amdgpu_vm_size < 1) {
1380 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1381 			 amdgpu_vm_size);
1382 		amdgpu_vm_size = -1;
1383 	}
1384 }
1385 
1386 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1387 {
1388 #ifdef __linux__
1389 	struct sysinfo si;
1390 #endif
1391 	bool is_os_64 = (sizeof(void *) == 8);
1392 	uint64_t total_memory;
1393 	uint64_t dram_size_seven_GB = 0x1B8000000;
1394 	uint64_t dram_size_three_GB = 0xB8000000;
1395 
1396 	if (amdgpu_smu_memory_pool_size == 0)
1397 		return;
1398 
1399 	if (!is_os_64) {
1400 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1401 		goto def_value;
1402 	}
1403 #ifdef __linux__
1404 	si_meminfo(&si);
1405 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1406 #else
1407 	total_memory = ptoa(physmem);
1408 #endif
1409 
1410 	if ((amdgpu_smu_memory_pool_size == 1) ||
1411 		(amdgpu_smu_memory_pool_size == 2)) {
1412 		if (total_memory < dram_size_three_GB)
1413 			goto def_value1;
1414 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1415 		(amdgpu_smu_memory_pool_size == 8)) {
1416 		if (total_memory < dram_size_seven_GB)
1417 			goto def_value1;
1418 	} else {
1419 		DRM_WARN("Smu memory pool size not supported\n");
1420 		goto def_value;
1421 	}
1422 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1423 
1424 	return;
1425 
1426 def_value1:
1427 	DRM_WARN("Not enough system memory\n");
1428 def_value:
1429 	adev->pm.smu_prv_buffer_size = 0;
1430 }
1431 
1432 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1433 {
1434 	if (!(adev->flags & AMD_IS_APU) ||
1435 	    adev->asic_type < CHIP_RAVEN)
1436 		return 0;
1437 
1438 	switch (adev->asic_type) {
1439 	case CHIP_RAVEN:
1440 		if (adev->pdev->device == 0x15dd)
1441 			adev->apu_flags |= AMD_APU_IS_RAVEN;
1442 		if (adev->pdev->device == 0x15d8)
1443 			adev->apu_flags |= AMD_APU_IS_PICASSO;
1444 		break;
1445 	case CHIP_RENOIR:
1446 		if ((adev->pdev->device == 0x1636) ||
1447 		    (adev->pdev->device == 0x164c))
1448 			adev->apu_flags |= AMD_APU_IS_RENOIR;
1449 		else
1450 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1451 		break;
1452 	case CHIP_VANGOGH:
1453 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
1454 		break;
1455 	case CHIP_YELLOW_CARP:
1456 		break;
1457 	case CHIP_CYAN_SKILLFISH:
1458 		if ((adev->pdev->device == 0x13FE) ||
1459 		    (adev->pdev->device == 0x143F))
1460 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1461 		break;
1462 	default:
1463 		break;
1464 	}
1465 
1466 	return 0;
1467 }
1468 
1469 /**
1470  * amdgpu_device_check_arguments - validate module params
1471  *
1472  * @adev: amdgpu_device pointer
1473  *
1474  * Validates certain module parameters and updates
1475  * the associated values used by the driver (all asics).
1476  */
1477 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1478 {
1479 	if (amdgpu_sched_jobs < 4) {
1480 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1481 			 amdgpu_sched_jobs);
1482 		amdgpu_sched_jobs = 4;
1483 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1484 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1485 			 amdgpu_sched_jobs);
1486 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1487 	}
1488 
1489 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1490 		/* gart size must be greater or equal to 32M */
1491 		dev_warn(adev->dev, "gart size (%d) too small\n",
1492 			 amdgpu_gart_size);
1493 		amdgpu_gart_size = -1;
1494 	}
1495 
1496 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1497 		/* gtt size must be greater or equal to 32M */
1498 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1499 				 amdgpu_gtt_size);
1500 		amdgpu_gtt_size = -1;
1501 	}
1502 
1503 	/* valid range is between 4 and 9 inclusive */
1504 	if (amdgpu_vm_fragment_size != -1 &&
1505 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1506 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1507 		amdgpu_vm_fragment_size = -1;
1508 	}
1509 
1510 	if (amdgpu_sched_hw_submission < 2) {
1511 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1512 			 amdgpu_sched_hw_submission);
1513 		amdgpu_sched_hw_submission = 2;
1514 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1515 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1516 			 amdgpu_sched_hw_submission);
1517 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1518 	}
1519 
1520 	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1521 		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1522 		amdgpu_reset_method = -1;
1523 	}
1524 
1525 	amdgpu_device_check_smu_prv_buffer_size(adev);
1526 
1527 	amdgpu_device_check_vm_size(adev);
1528 
1529 	amdgpu_device_check_block_size(adev);
1530 
1531 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1532 
1533 	return 0;
1534 }
1535 
1536 #ifdef __linux__
1537 /**
1538  * amdgpu_switcheroo_set_state - set switcheroo state
1539  *
1540  * @pdev: pci dev pointer
1541  * @state: vga_switcheroo state
1542  *
1543  * Callback for the switcheroo driver.  Suspends or resumes
1544  * the asics before or after it is powered up using ACPI methods.
1545  */
1546 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1547 					enum vga_switcheroo_state state)
1548 {
1549 	struct drm_device *dev = pci_get_drvdata(pdev);
1550 	int r;
1551 
1552 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1553 		return;
1554 
1555 	if (state == VGA_SWITCHEROO_ON) {
1556 		pr_info("switched on\n");
1557 		/* don't suspend or resume card normally */
1558 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1559 
1560 		pci_set_power_state(pdev, PCI_D0);
1561 		amdgpu_device_load_pci_state(pdev);
1562 		r = pci_enable_device(pdev);
1563 		if (r)
1564 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1565 		amdgpu_device_resume(dev, true);
1566 
1567 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1568 	} else {
1569 		pr_info("switched off\n");
1570 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1571 		amdgpu_device_prepare(dev);
1572 		amdgpu_device_suspend(dev, true);
1573 		amdgpu_device_cache_pci_state(pdev);
1574 		/* Shut down the device */
1575 		pci_disable_device(pdev);
1576 		pci_set_power_state(pdev, PCI_D3cold);
1577 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1578 	}
1579 }
1580 
1581 /**
1582  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1583  *
1584  * @pdev: pci dev pointer
1585  *
1586  * Callback for the switcheroo driver.  Check of the switcheroo
1587  * state can be changed.
1588  * Returns true if the state can be changed, false if not.
1589  */
1590 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1591 {
1592 	struct drm_device *dev = pci_get_drvdata(pdev);
1593 
1594        /*
1595 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1596 	* locking inversion with the driver load path. And the access here is
1597 	* completely racy anyway. So don't bother with locking for now.
1598 	*/
1599 	return atomic_read(&dev->open_count) == 0;
1600 }
1601 #endif /* __linux__ */
1602 
1603 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1604 #ifdef notyet
1605 	.set_gpu_state = amdgpu_switcheroo_set_state,
1606 	.reprobe = NULL,
1607 	.can_switch = amdgpu_switcheroo_can_switch,
1608 #endif
1609 };
1610 
1611 /**
1612  * amdgpu_device_ip_set_clockgating_state - set the CG state
1613  *
1614  * @dev: amdgpu_device pointer
1615  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1616  * @state: clockgating state (gate or ungate)
1617  *
1618  * Sets the requested clockgating state for all instances of
1619  * the hardware IP specified.
1620  * Returns the error code from the last instance.
1621  */
1622 int amdgpu_device_ip_set_clockgating_state(void *dev,
1623 					   enum amd_ip_block_type block_type,
1624 					   enum amd_clockgating_state state)
1625 {
1626 	struct amdgpu_device *adev = dev;
1627 	int i, r = 0;
1628 
1629 	for (i = 0; i < adev->num_ip_blocks; i++) {
1630 		if (!adev->ip_blocks[i].status.valid)
1631 			continue;
1632 		if (adev->ip_blocks[i].version->type != block_type)
1633 			continue;
1634 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1635 			continue;
1636 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1637 			(void *)adev, state);
1638 		if (r)
1639 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1640 				  adev->ip_blocks[i].version->funcs->name, r);
1641 	}
1642 	return r;
1643 }
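
/*
 * Illustrative sketch (hypothetical caller, not part of this file): gate
 * the clocks of every GFX IP instance on the device:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */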
1644 
1645 /**
1646  * amdgpu_device_ip_set_powergating_state - set the PG state
1647  *
1648  * @dev: amdgpu_device pointer
1649  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1650  * @state: powergating state (gate or ungate)
1651  *
1652  * Sets the requested powergating state for all instances of
1653  * the hardware IP specified.
1654  * Returns the error code from the last instance.
1655  */
1656 int amdgpu_device_ip_set_powergating_state(void *dev,
1657 					   enum amd_ip_block_type block_type,
1658 					   enum amd_powergating_state state)
1659 {
1660 	struct amdgpu_device *adev = dev;
1661 	int i, r = 0;
1662 
1663 	for (i = 0; i < adev->num_ip_blocks; i++) {
1664 		if (!adev->ip_blocks[i].status.valid)
1665 			continue;
1666 		if (adev->ip_blocks[i].version->type != block_type)
1667 			continue;
1668 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1669 			continue;
1670 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1671 			(void *)adev, state);
1672 		if (r)
1673 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1674 				  adev->ip_blocks[i].version->funcs->name, r);
1675 	}
1676 	return r;
1677 }
1678 
1679 /**
1680  * amdgpu_device_ip_get_clockgating_state - get the CG state
1681  *
1682  * @adev: amdgpu_device pointer
1683  * @flags: clockgating feature flags
1684  *
1685  * Walks the list of IPs on the device and updates the clockgating
1686  * flags for each IP.
1687  * Updates @flags with the feature flags for each hardware IP where
1688  * clockgating is enabled.
1689  */
1690 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1691 					    u64 *flags)
1692 {
1693 	int i;
1694 
1695 	for (i = 0; i < adev->num_ip_blocks; i++) {
1696 		if (!adev->ip_blocks[i].status.valid)
1697 			continue;
1698 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1699 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1700 	}
1701 }
1702 
1703 /**
1704  * amdgpu_device_ip_wait_for_idle - wait for idle
1705  *
1706  * @adev: amdgpu_device pointer
1707  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1708  *
1709  * Waits for the requested hardware IP to be idle.
1710  * Returns 0 for success or a negative error code on failure.
1711  */
1712 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1713 				   enum amd_ip_block_type block_type)
1714 {
1715 	int i, r;
1716 
1717 	for (i = 0; i < adev->num_ip_blocks; i++) {
1718 		if (!adev->ip_blocks[i].status.valid)
1719 			continue;
1720 		if (adev->ip_blocks[i].version->type == block_type) {
1721 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1722 			if (r)
1723 				return r;
1724 			break;
1725 		}
1726 	}
1727 	return 0;
1728 
1729 }
1730 
1731 /**
1732  * amdgpu_device_ip_is_idle - is the hardware IP idle
1733  *
1734  * @adev: amdgpu_device pointer
1735  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1736  *
1737  * Check if the hardware IP is idle or not.
1738  * Returns true if the IP is idle, false if not.
1739  */
1740 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1741 			      enum amd_ip_block_type block_type)
1742 {
1743 	int i;
1744 
1745 	for (i = 0; i < adev->num_ip_blocks; i++) {
1746 		if (!adev->ip_blocks[i].status.valid)
1747 			continue;
1748 		if (adev->ip_blocks[i].version->type == block_type)
1749 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1750 	}
1751 	return true;
1752 
1753 }
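
/*
 * Illustrative sketch (not part of the driver): before reprogramming a
 * block it is common to check or wait for idleness with the two helpers
 * above, for example:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX)) {
 *		int r = amdgpu_device_ip_wait_for_idle(adev,
 *						       AMD_IP_BLOCK_TYPE_GFX);
 *		if (r)
 *			dev_err(adev->dev, "GFX did not go idle: %d\n", r);
 *	}
 */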
1754 
1755 /**
1756  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1757  *
1758  * @adev: amdgpu_device pointer
1759  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760  *
1761  * Returns a pointer to the hardware IP block structure
1762  * if it exists for the asic, otherwise NULL.
1763  */
1764 struct amdgpu_ip_block *
1765 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1766 			      enum amd_ip_block_type type)
1767 {
1768 	int i;
1769 
1770 	for (i = 0; i < adev->num_ip_blocks; i++)
1771 		if (adev->ip_blocks[i].version->type == type)
1772 			return &adev->ip_blocks[i];
1773 
1774 	return NULL;
1775 }
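
/*
 * Illustrative sketch (not part of the driver): IP-specific code often
 * looks up its block to inspect the running major/minor version, e.g.:
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
 *
 *	if (ip)
 *		DRM_DEBUG("GMC v%d.%d\n", ip->version->major,
 *			  ip->version->minor);
 */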
1776 
1777 /**
1778  * amdgpu_device_ip_block_version_cmp
1779  *
1780  * @adev: amdgpu_device pointer
1781  * @type: enum amd_ip_block_type
1782  * @major: major version
1783  * @minor: minor version
1784  *
1785  * return 0 if equal or greater
1786  * return 1 if smaller or the ip_block doesn't exist
1787  */
1788 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1789 				       enum amd_ip_block_type type,
1790 				       u32 major, u32 minor)
1791 {
1792 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1793 
1794 	if (ip_block && ((ip_block->version->major > major) ||
1795 			((ip_block->version->major == major) &&
1796 			(ip_block->version->minor >= minor))))
1797 		return 0;
1798 
1799 	return 1;
1800 }
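
/*
 * Illustrative sketch (not part of the driver): note the return
 * convention above, where 0 means "at least this version".  Gating a
 * feature on SMC >= 7.1 would therefore read:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 1) == 0) {
 *		// the SMC block is version 7.1 or newer
 *	}
 */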
1801 
1802 /**
1803  * amdgpu_device_ip_block_add
1804  *
1805  * @adev: amdgpu_device pointer
1806  * @ip_block_version: pointer to the IP to add
1807  *
1808  * Adds the IP block driver information to the collection of IPs
1809  * on the asic.
1810  */
1811 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1812 			       const struct amdgpu_ip_block_version *ip_block_version)
1813 {
1814 	if (!ip_block_version)
1815 		return -EINVAL;
1816 
1817 	switch (ip_block_version->type) {
1818 	case AMD_IP_BLOCK_TYPE_VCN:
1819 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1820 			return 0;
1821 		break;
1822 	case AMD_IP_BLOCK_TYPE_JPEG:
1823 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1824 			return 0;
1825 		break;
1826 	default:
1827 		break;
1828 	}
1829 
1830 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1831 		  ip_block_version->funcs->name);
1832 
1833 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1834 
1835 	return 0;
1836 }
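
/*
 * Illustrative sketch (not part of the driver): ASIC setup code such as
 * vi_set_ip_blocks() registers its blocks in initialization order with
 * repeated calls of this form (the block names here are only indicative):
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 *	r = amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *	if (r)
 *		return r;
 */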
1837 
1838 /**
1839  * amdgpu_device_enable_virtual_display - enable virtual display feature
1840  *
1841  * @adev: amdgpu_device pointer
1842  *
1843  * Enables the virtual display feature if the user has enabled it via
1844  * the module parameter virtual_display.  This feature provides a virtual
1845  * display hardware on headless boards or in virtualized environments.
1846  * This function parses and validates the configuration string specified by
1847  * the user and configures the virtual display configuration (number of
1848  * virtual connectors, crtcs, etc.) specified.
1849  */
1850 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1851 {
1852 	adev->enable_virtual_display = false;
1853 
1854 #ifdef notyet
1855 	if (amdgpu_virtual_display) {
1856 		const char *pci_address_name = pci_name(adev->pdev);
1857 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1858 
1859 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1860 		pciaddstr_tmp = pciaddstr;
1861 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1862 			pciaddname = strsep(&pciaddname_tmp, ",");
1863 			if (!strcmp("all", pciaddname)
1864 			    || !strcmp(pci_address_name, pciaddname)) {
1865 				long num_crtc;
1866 				int res = -1;
1867 
1868 				adev->enable_virtual_display = true;
1869 
1870 				if (pciaddname_tmp)
1871 					res = kstrtol(pciaddname_tmp, 10,
1872 						      &num_crtc);
1873 
1874 				if (!res) {
1875 					if (num_crtc < 1)
1876 						num_crtc = 1;
1877 					if (num_crtc > 6)
1878 						num_crtc = 6;
1879 					adev->mode_info.num_crtc = num_crtc;
1880 				} else {
1881 					adev->mode_info.num_crtc = 1;
1882 				}
1883 				break;
1884 			}
1885 		}
1886 
1887 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1888 			 amdgpu_virtual_display, pci_address_name,
1889 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1890 
1891 		kfree(pciaddstr);
1892 	}
1893 #endif
1894 }
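
/*
 * Illustrative note (not part of the driver): the virtual_display module
 * parameter takes a semicolon-separated list of "pci-address,crtc-count"
 * entries (or "all"), matching the parsing above, for example:
 *
 *	amdgpu.virtual_display=0000:01:00.0,2
 *
 * which would enable two virtual CRTCs on the device at 0000:01:00.0.
 */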
1895 
1896 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1897 {
1898 	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1899 		adev->mode_info.num_crtc = 1;
1900 		adev->enable_virtual_display = true;
1901 		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1902 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1903 	}
1904 }
1905 
1906 /**
1907  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1908  *
1909  * @adev: amdgpu_device pointer
1910  *
1911  * Parses the asic configuration parameters specified in the gpu info
1912  * firmware and makes them available to the driver for use in configuring
1913  * the asic.
1914  * Returns 0 on success, -EINVAL on failure.
1915  */
1916 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1917 {
1918 	const char *chip_name;
1919 	char fw_name[40];
1920 	int err;
1921 	const struct gpu_info_firmware_header_v1_0 *hdr;
1922 
1923 	adev->firmware.gpu_info_fw = NULL;
1924 
1925 	if (adev->mman.discovery_bin)
1926 		return 0;
1927 
1928 	switch (adev->asic_type) {
1929 	default:
1930 		return 0;
1931 	case CHIP_VEGA10:
1932 		chip_name = "vega10";
1933 		break;
1934 	case CHIP_VEGA12:
1935 		chip_name = "vega12";
1936 		break;
1937 	case CHIP_RAVEN:
1938 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1939 			chip_name = "raven2";
1940 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1941 			chip_name = "picasso";
1942 		else
1943 			chip_name = "raven";
1944 		break;
1945 	case CHIP_ARCTURUS:
1946 		chip_name = "arcturus";
1947 		break;
1948 	case CHIP_NAVI12:
1949 		chip_name = "navi12";
1950 		break;
1951 	}
1952 
1953 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1954 	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1955 	if (err) {
1956 		dev_err(adev->dev,
1957 			"Failed to get gpu_info firmware \"%s\"\n",
1958 			fw_name);
1959 		goto out;
1960 	}
1961 
1962 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1963 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1964 
1965 	switch (hdr->version_major) {
1966 	case 1:
1967 	{
1968 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1969 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1970 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1971 
1972 		/*
1973 		 * Should be dropped when DAL no longer needs it.
1974 		 */
1975 		if (adev->asic_type == CHIP_NAVI12)
1976 			goto parse_soc_bounding_box;
1977 
1978 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1979 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1980 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1981 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1982 		adev->gfx.config.max_texture_channel_caches =
1983 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1984 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1985 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1986 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1987 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1988 		adev->gfx.config.double_offchip_lds_buf =
1989 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1990 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1991 		adev->gfx.cu_info.max_waves_per_simd =
1992 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1993 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1994 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1995 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1996 		if (hdr->version_minor >= 1) {
1997 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1998 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1999 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2000 			adev->gfx.config.num_sc_per_sh =
2001 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2002 			adev->gfx.config.num_packer_per_sc =
2003 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2004 		}
2005 
2006 parse_soc_bounding_box:
2007 		/*
2008 		 * soc bounding box info is not integrated in the discovery table,
2009 		 * we always need to parse it from gpu info firmware if needed.
2010 		 */
2011 		if (hdr->version_minor == 2) {
2012 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2013 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2014 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2015 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2016 		}
2017 		break;
2018 	}
2019 	default:
2020 		dev_err(adev->dev,
2021 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2022 		err = -EINVAL;
2023 		goto out;
2024 	}
2025 out:
2026 	return err;
2027 }
2028 
2029 /**
2030  * amdgpu_device_ip_early_init - run early init for hardware IPs
2031  *
2032  * @adev: amdgpu_device pointer
2033  *
2034  * Early initialization pass for hardware IPs.  The hardware IPs that make
2035  * up each asic are discovered each IP's early_init callback is run.  This
2036  * is the first stage in initializing the asic.
2037  * Returns 0 on success, negative error code on failure.
2038  */
2039 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2040 {
2041 	struct pci_dev *parent;
2042 	int i, r;
2043 	bool total;
2044 
2045 	amdgpu_device_enable_virtual_display(adev);
2046 
2047 	if (amdgpu_sriov_vf(adev)) {
2048 		r = amdgpu_virt_request_full_gpu(adev, true);
2049 		if (r)
2050 			return r;
2051 	}
2052 
2053 	switch (adev->asic_type) {
2054 #ifdef CONFIG_DRM_AMDGPU_SI
2055 	case CHIP_VERDE:
2056 	case CHIP_TAHITI:
2057 	case CHIP_PITCAIRN:
2058 	case CHIP_OLAND:
2059 	case CHIP_HAINAN:
2060 		adev->family = AMDGPU_FAMILY_SI;
2061 		r = si_set_ip_blocks(adev);
2062 		if (r)
2063 			return r;
2064 		break;
2065 #endif
2066 #ifdef CONFIG_DRM_AMDGPU_CIK
2067 	case CHIP_BONAIRE:
2068 	case CHIP_HAWAII:
2069 	case CHIP_KAVERI:
2070 	case CHIP_KABINI:
2071 	case CHIP_MULLINS:
2072 		if (adev->flags & AMD_IS_APU)
2073 			adev->family = AMDGPU_FAMILY_KV;
2074 		else
2075 			adev->family = AMDGPU_FAMILY_CI;
2076 
2077 		r = cik_set_ip_blocks(adev);
2078 		if (r)
2079 			return r;
2080 		break;
2081 #endif
2082 	case CHIP_TOPAZ:
2083 	case CHIP_TONGA:
2084 	case CHIP_FIJI:
2085 	case CHIP_POLARIS10:
2086 	case CHIP_POLARIS11:
2087 	case CHIP_POLARIS12:
2088 	case CHIP_VEGAM:
2089 	case CHIP_CARRIZO:
2090 	case CHIP_STONEY:
2091 		if (adev->flags & AMD_IS_APU)
2092 			adev->family = AMDGPU_FAMILY_CZ;
2093 		else
2094 			adev->family = AMDGPU_FAMILY_VI;
2095 
2096 		r = vi_set_ip_blocks(adev);
2097 		if (r)
2098 			return r;
2099 		break;
2100 	default:
2101 		r = amdgpu_discovery_set_ip_blocks(adev);
2102 		if (r)
2103 			return r;
2104 		break;
2105 	}
2106 
2107 	if (amdgpu_has_atpx() &&
2108 	    (amdgpu_is_atpx_hybrid() ||
2109 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2110 	    ((adev->flags & AMD_IS_APU) == 0) &&
2111 	    !dev_is_removable(&adev->pdev->dev))
2112 		adev->flags |= AMD_IS_PX;
2113 
2114 	if (!(adev->flags & AMD_IS_APU)) {
2115 #ifdef notyet
2116 		parent = pcie_find_root_port(adev->pdev);
2117 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2118 #else
2119 		adev->has_pr3 = false;
2120 #endif
2121 	}
2122 
2123 
2124 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2125 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2126 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2127 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2128 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2129 	if (!amdgpu_device_pcie_dynamic_switching_supported())
2130 		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2131 
2132 	total = true;
2133 	for (i = 0; i < adev->num_ip_blocks; i++) {
2134 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2135 			DRM_WARN("disabled ip block: %d <%s>\n",
2136 				  i, adev->ip_blocks[i].version->funcs->name);
2137 			adev->ip_blocks[i].status.valid = false;
2138 		} else {
2139 			if (adev->ip_blocks[i].version->funcs->early_init) {
2140 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2141 				if (r == -ENOENT) {
2142 					adev->ip_blocks[i].status.valid = false;
2143 				} else if (r) {
2144 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2145 						  adev->ip_blocks[i].version->funcs->name, r);
2146 					total = false;
2147 				} else {
2148 					adev->ip_blocks[i].status.valid = true;
2149 				}
2150 			} else {
2151 				adev->ip_blocks[i].status.valid = true;
2152 			}
2153 		}
2154 		/* get the vbios after the asic_funcs are set up */
2155 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2156 			r = amdgpu_device_parse_gpu_info_fw(adev);
2157 			if (r)
2158 				return r;
2159 
2160 			/* Read BIOS */
2161 			if (amdgpu_device_read_bios(adev)) {
2162 				if (!amdgpu_get_bios(adev))
2163 					return -EINVAL;
2164 
2165 				r = amdgpu_atombios_init(adev);
2166 				if (r) {
2167 					dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2168 					amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2169 					return r;
2170 				}
2171 			}
2172 
2173 			/* get pf2vf msg info at its earliest time */
2174 			if (amdgpu_sriov_vf(adev))
2175 				amdgpu_virt_init_data_exchange(adev);
2176 
2177 		}
2178 	}
2179 	if (!total)
2180 		return -ENODEV;
2181 
2182 	amdgpu_amdkfd_device_probe(adev);
2183 	adev->cg_flags &= amdgpu_cg_mask;
2184 	adev->pg_flags &= amdgpu_pg_mask;
2185 
2186 	return 0;
2187 }
2188 
2189 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2190 {
2191 	int i, r;
2192 
2193 	for (i = 0; i < adev->num_ip_blocks; i++) {
2194 		if (!adev->ip_blocks[i].status.sw)
2195 			continue;
2196 		if (adev->ip_blocks[i].status.hw)
2197 			continue;
2198 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2199 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2200 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2201 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2202 			if (r) {
2203 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2204 					  adev->ip_blocks[i].version->funcs->name, r);
2205 				return r;
2206 			}
2207 			adev->ip_blocks[i].status.hw = true;
2208 		}
2209 	}
2210 
2211 	return 0;
2212 }
2213 
2214 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2215 {
2216 	int i, r;
2217 
2218 	for (i = 0; i < adev->num_ip_blocks; i++) {
2219 		if (!adev->ip_blocks[i].status.sw)
2220 			continue;
2221 		if (adev->ip_blocks[i].status.hw)
2222 			continue;
2223 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2224 		if (r) {
2225 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2226 				  adev->ip_blocks[i].version->funcs->name, r);
2227 			return r;
2228 		}
2229 		adev->ip_blocks[i].status.hw = true;
2230 	}
2231 
2232 	return 0;
2233 }
2234 
2235 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2236 {
2237 	int r = 0;
2238 	int i;
2239 	uint32_t smu_version;
2240 
2241 	if (adev->asic_type >= CHIP_VEGA10) {
2242 		for (i = 0; i < adev->num_ip_blocks; i++) {
2243 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2244 				continue;
2245 
2246 			if (!adev->ip_blocks[i].status.sw)
2247 				continue;
2248 
2249 			/* no need to do the fw loading again if already done */
2250 			if (adev->ip_blocks[i].status.hw == true)
2251 				break;
2252 
2253 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2254 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2255 				if (r) {
2256 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2257 							  adev->ip_blocks[i].version->funcs->name, r);
2258 					return r;
2259 				}
2260 			} else {
2261 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2262 				if (r) {
2263 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2264 							  adev->ip_blocks[i].version->funcs->name, r);
2265 					return r;
2266 				}
2267 			}
2268 
2269 			adev->ip_blocks[i].status.hw = true;
2270 			break;
2271 		}
2272 	}
2273 
2274 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2275 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2276 
2277 	return r;
2278 }
2279 
2280 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2281 {
2282 	long timeout;
2283 	int r, i;
2284 
2285 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2286 		struct amdgpu_ring *ring = adev->rings[i];
2287 
2288 		/* No need to setup the GPU scheduler for rings that don't need it */
2289 		if (!ring || ring->no_scheduler)
2290 			continue;
2291 
2292 		switch (ring->funcs->type) {
2293 		case AMDGPU_RING_TYPE_GFX:
2294 			timeout = adev->gfx_timeout;
2295 			break;
2296 		case AMDGPU_RING_TYPE_COMPUTE:
2297 			timeout = adev->compute_timeout;
2298 			break;
2299 		case AMDGPU_RING_TYPE_SDMA:
2300 			timeout = adev->sdma_timeout;
2301 			break;
2302 		default:
2303 			timeout = adev->video_timeout;
2304 			break;
2305 		}
2306 
2307 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2308 				   ring->num_hw_submission, 0,
2309 				   timeout, adev->reset_domain->wq,
2310 				   ring->sched_score, ring->name,
2311 				   adev->dev);
2312 		if (r) {
2313 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
2314 				  ring->name);
2315 			return r;
2316 		}
2317 	}
2318 
2319 	amdgpu_xcp_update_partition_sched_list(adev);
2320 
2321 	return 0;
2322 }
2323 
2324 
2325 /**
2326  * amdgpu_device_ip_init - run init for hardware IPs
2327  *
2328  * @adev: amdgpu_device pointer
2329  *
2330  * Main initialization pass for hardware IPs.  The list of all the hardware
2331  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2332  * are run.  sw_init initializes the software state associated with each IP
2333  * and hw_init initializes the hardware associated with each IP.
2334  * Returns 0 on success, negative error code on failure.
2335  */
2336 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2337 {
2338 	int i, r;
2339 
2340 	r = amdgpu_ras_init(adev);
2341 	if (r)
2342 		return r;
2343 
2344 	for (i = 0; i < adev->num_ip_blocks; i++) {
2345 		if (!adev->ip_blocks[i].status.valid)
2346 			continue;
2347 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2348 		if (r) {
2349 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2350 				  adev->ip_blocks[i].version->funcs->name, r);
2351 			goto init_failed;
2352 		}
2353 		adev->ip_blocks[i].status.sw = true;
2354 
2355 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2356 			/* need to do common hw init early so everything is set up for gmc */
2357 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2358 			if (r) {
2359 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2360 				goto init_failed;
2361 			}
2362 			adev->ip_blocks[i].status.hw = true;
2363 		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2364 			/* need to do gmc hw init early so we can allocate gpu mem */
2365 			/* Try to reserve bad pages early */
2366 			if (amdgpu_sriov_vf(adev))
2367 				amdgpu_virt_exchange_data(adev);
2368 
2369 			r = amdgpu_device_mem_scratch_init(adev);
2370 			if (r) {
2371 				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2372 				goto init_failed;
2373 			}
2374 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2375 			if (r) {
2376 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2377 				goto init_failed;
2378 			}
2379 			r = amdgpu_device_wb_init(adev);
2380 			if (r) {
2381 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2382 				goto init_failed;
2383 			}
2384 			adev->ip_blocks[i].status.hw = true;
2385 
2386 			/* right after GMC hw init, we create CSA */
2387 			if (adev->gfx.mcbp) {
2388 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2389 							       AMDGPU_GEM_DOMAIN_VRAM |
2390 							       AMDGPU_GEM_DOMAIN_GTT,
2391 							       AMDGPU_CSA_SIZE);
2392 				if (r) {
2393 					DRM_ERROR("allocate CSA failed %d\n", r);
2394 					goto init_failed;
2395 				}
2396 			}
2397 		}
2398 	}
2399 
2400 	if (amdgpu_sriov_vf(adev))
2401 		amdgpu_virt_init_data_exchange(adev);
2402 
2403 	r = amdgpu_ib_pool_init(adev);
2404 	if (r) {
2405 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2406 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2407 		goto init_failed;
2408 	}
2409 
2410 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2411 	if (r)
2412 		goto init_failed;
2413 
2414 	r = amdgpu_device_ip_hw_init_phase1(adev);
2415 	if (r)
2416 		goto init_failed;
2417 
2418 	r = amdgpu_device_fw_loading(adev);
2419 	if (r)
2420 		goto init_failed;
2421 
2422 	r = amdgpu_device_ip_hw_init_phase2(adev);
2423 	if (r)
2424 		goto init_failed;
2425 
2426 	/*
2427 	 * retired pages will be loaded from eeprom and reserved here,
2428 	 * it should be called after amdgpu_device_ip_hw_init_phase2  since
2429 	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2430 	 * for I2C communication which only true at this point.
2431 	 *
2432 	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
2433 	 * failure from bad gpu situation and stop amdgpu init process
2434 	 * accordingly. For other failed cases, it will still release all
2435 	 * the resource and print error message, rather than returning one
2436 	 * negative value to upper level.
2437 	 *
2438 	 * Note: theoretically, this should be called before all vram allocations
2439 	 * to protect retired page from abusing
2440 	 */
2441 	r = amdgpu_ras_recovery_init(adev);
2442 	if (r)
2443 		goto init_failed;
2444 
2445 	/**
2446 	 * In case of XGMI grab extra reference for reset domain for this device
2447 	 */
2448 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2449 		if (amdgpu_xgmi_add_device(adev) == 0) {
2450 			if (!amdgpu_sriov_vf(adev)) {
2451 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2452 
2453 				if (WARN_ON(!hive)) {
2454 					r = -ENOENT;
2455 					goto init_failed;
2456 				}
2457 
2458 				if (!hive->reset_domain ||
2459 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2460 					r = -ENOENT;
2461 					amdgpu_put_xgmi_hive(hive);
2462 					goto init_failed;
2463 				}
2464 
2465 				/* Drop the early temporary reset domain we created for device */
2466 				amdgpu_reset_put_reset_domain(adev->reset_domain);
2467 				adev->reset_domain = hive->reset_domain;
2468 				amdgpu_put_xgmi_hive(hive);
2469 			}
2470 		}
2471 	}
2472 
2473 	r = amdgpu_device_init_schedulers(adev);
2474 	if (r)
2475 		goto init_failed;
2476 
2477 	/* Don't init kfd if the whole hive needs to be reset during init */
2478 	if (!adev->gmc.xgmi.pending_reset) {
2479 		kgd2kfd_init_zone_device(adev);
2480 		amdgpu_amdkfd_device_init(adev);
2481 	}
2482 
2483 	amdgpu_fru_get_product_info(adev);
2484 
2485 init_failed:
2486 
2487 	return r;
2488 }
2489 
2490 /**
2491  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2492  *
2493  * @adev: amdgpu_device pointer
2494  *
2495  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2496  * this function before a GPU reset.  If the value is retained after a
2497  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2498  */
2499 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2500 {
2501 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2502 }
2503 
2504 /**
2505  * amdgpu_device_check_vram_lost - check if vram is valid
2506  *
2507  * @adev: amdgpu_device pointer
2508  *
2509  * Checks the reset magic value written to the gart pointer in VRAM.
2510  * The driver calls this after a GPU reset to see if the contents of
2511  * VRAM are lost or not.
2512  * returns true if vram is lost, false if not.
2513  */
2514 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2515 {
2516 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2517 			AMDGPU_RESET_MAGIC_NUM))
2518 		return true;
2519 
2520 	if (!amdgpu_in_reset(adev))
2521 		return false;
2522 
2523 	/*
2524 	 * For all ASICs with baco/mode1 reset, the VRAM is
2525 	 * always assumed to be lost.
2526 	 */
2527 	switch (amdgpu_asic_reset_method(adev)) {
2528 	case AMD_RESET_METHOD_BACO:
2529 	case AMD_RESET_METHOD_MODE1:
2530 		return true;
2531 	default:
2532 		return false;
2533 	}
2534 }
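
/*
 * Illustrative sketch (not part of the driver): a recovery path would
 * typically pair the two helpers above around a reset, roughly:
 *
 *	amdgpu_device_fill_reset_magic(adev);    // before the reset
 *	// ... perform the GPU reset ...
 *	if (amdgpu_device_check_vram_lost(adev)) // after the reset
 *		// re-upload firmware and restore lost buffer contents
 */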
2535 
2536 /**
2537  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2538  *
2539  * @adev: amdgpu_device pointer
2540  * @state: clockgating state (gate or ungate)
2541  *
2542  * The list of all the hardware IPs that make up the asic is walked and the
2543  * set_clockgating_state callbacks are run.
2544  * During the late initialization pass it enables clockgating for hardware IPs;
2545  * during fini or suspend it disables clockgating for hardware IPs.
2546  * Returns 0 on success, negative error code on failure.
2547  */
2548 
2549 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2550 			       enum amd_clockgating_state state)
2551 {
2552 	int i, j, r;
2553 
2554 	if (amdgpu_emu_mode == 1)
2555 		return 0;
2556 
2557 	for (j = 0; j < adev->num_ip_blocks; j++) {
2558 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2559 		if (!adev->ip_blocks[i].status.late_initialized)
2560 			continue;
2561 		/* skip CG for GFX, SDMA on S0ix */
2562 		if (adev->in_s0ix &&
2563 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2564 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2565 			continue;
2566 		/* skip CG for VCE/UVD, it's handled specially */
2567 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2568 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2569 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2570 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2571 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2572 			/* enable clockgating to save power */
2573 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2574 										     state);
2575 			if (r) {
2576 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2577 					  adev->ip_blocks[i].version->funcs->name, r);
2578 				return r;
2579 			}
2580 		}
2581 	}
2582 
2583 	return 0;
2584 }
2585 
2586 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2587 			       enum amd_powergating_state state)
2588 {
2589 	int i, j, r;
2590 
2591 	if (amdgpu_emu_mode == 1)
2592 		return 0;
2593 
2594 	for (j = 0; j < adev->num_ip_blocks; j++) {
2595 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2596 		if (!adev->ip_blocks[i].status.late_initialized)
2597 			continue;
2598 		/* skip PG for GFX, SDMA on S0ix */
2599 		if (adev->in_s0ix &&
2600 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2601 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2602 			continue;
2603 		/* skip PG for VCE/UVD, it's handled specially */
2604 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2605 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2606 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2607 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2608 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2609 			/* enable powergating to save power */
2610 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2611 											state);
2612 			if (r) {
2613 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2614 					  adev->ip_blocks[i].version->funcs->name, r);
2615 				return r;
2616 			}
2617 		}
2618 	}
2619 	return 0;
2620 }
2621 
2622 static int amdgpu_device_enable_mgpu_fan_boost(void)
2623 {
2624 	struct amdgpu_gpu_instance *gpu_ins;
2625 	struct amdgpu_device *adev;
2626 	int i, ret = 0;
2627 
2628 	mutex_lock(&mgpu_info.mutex);
2629 
2630 	/*
2631 	 * MGPU fan boost feature should be enabled
2632 	 * only when there are two or more dGPUs in
2633 	 * the system
2634 	 */
2635 	if (mgpu_info.num_dgpu < 2)
2636 		goto out;
2637 
2638 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2639 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2640 		adev = gpu_ins->adev;
2641 		if (!(adev->flags & AMD_IS_APU) &&
2642 		    !gpu_ins->mgpu_fan_enabled) {
2643 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2644 			if (ret)
2645 				break;
2646 
2647 			gpu_ins->mgpu_fan_enabled = 1;
2648 		}
2649 	}
2650 
2651 out:
2652 	mutex_unlock(&mgpu_info.mutex);
2653 
2654 	return ret;
2655 }
2656 
2657 /**
2658  * amdgpu_device_ip_late_init - run late init for hardware IPs
2659  *
2660  * @adev: amdgpu_device pointer
2661  *
2662  * Late initialization pass for hardware IPs.  The list of all the hardware
2663  * IPs that make up the asic is walked and the late_init callbacks are run.
2664  * late_init covers any special initialization that an IP requires
2665  * after all of the other IPs have been initialized or something that needs to happen
2666  * late in the init process.
2667  * Returns 0 on success, negative error code on failure.
2668  */
2669 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2670 {
2671 	struct amdgpu_gpu_instance *gpu_instance;
2672 	int i = 0, r;
2673 
2674 	for (i = 0; i < adev->num_ip_blocks; i++) {
2675 		if (!adev->ip_blocks[i].status.hw)
2676 			continue;
2677 		if (adev->ip_blocks[i].version->funcs->late_init) {
2678 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2679 			if (r) {
2680 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2681 					  adev->ip_blocks[i].version->funcs->name, r);
2682 				return r;
2683 			}
2684 		}
2685 		adev->ip_blocks[i].status.late_initialized = true;
2686 	}
2687 
2688 	r = amdgpu_ras_late_init(adev);
2689 	if (r) {
2690 		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2691 		return r;
2692 	}
2693 
2694 	amdgpu_ras_set_error_query_ready(adev, true);
2695 
2696 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2697 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2698 
2699 	amdgpu_device_fill_reset_magic(adev);
2700 
2701 	r = amdgpu_device_enable_mgpu_fan_boost();
2702 	if (r)
2703 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2704 
2705 	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2706 	if (amdgpu_passthrough(adev) &&
2707 	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2708 	     adev->asic_type == CHIP_ALDEBARAN))
2709 		amdgpu_dpm_handle_passthrough_sbr(adev, true);
2710 
2711 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2712 		mutex_lock(&mgpu_info.mutex);
2713 
2714 		/*
2715 		 * Reset device p-state to low, as the device was booted with it high.
2716 		 *
2717 		 * This should be performed only after all devices from the same
2718 		 * hive get initialized.
2719 		 *
2720 		 * However, the number of devices in the hive is not known in advance,
2721 		 * as it is counted one by one while the devices initialize.
2722 		 *
2723 		 * So, we wait for all XGMI interlinked devices initialized.
2724 		 * This may bring some delays as those devices may come from
2725 		 * different hives. But that should be OK.
2726 		 */
2727 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2728 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2729 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2730 				if (gpu_instance->adev->flags & AMD_IS_APU)
2731 					continue;
2732 
2733 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2734 						AMDGPU_XGMI_PSTATE_MIN);
2735 				if (r) {
2736 					DRM_ERROR("pstate setting failed (%d).\n", r);
2737 					break;
2738 				}
2739 			}
2740 		}
2741 
2742 		mutex_unlock(&mgpu_info.mutex);
2743 	}
2744 
2745 	return 0;
2746 }
2747 
2748 /**
2749  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2750  *
2751  * @adev: amdgpu_device pointer
2752  *
2753  * For ASICs that need to disable the SMC first
2754  */
2755 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2756 {
2757 	int i, r;
2758 
2759 	if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2760 		return;
2761 
2762 	for (i = 0; i < adev->num_ip_blocks; i++) {
2763 		if (!adev->ip_blocks[i].status.hw)
2764 			continue;
2765 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2766 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2767 			/* XXX handle errors */
2768 			if (r) {
2769 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2770 					  adev->ip_blocks[i].version->funcs->name, r);
2771 			}
2772 			adev->ip_blocks[i].status.hw = false;
2773 			break;
2774 		}
2775 	}
2776 }
2777 
2778 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2779 {
2780 	int i, r;
2781 
2782 	for (i = 0; i < adev->num_ip_blocks; i++) {
2783 		if (!adev->ip_blocks[i].version->funcs->early_fini)
2784 			continue;
2785 
2786 		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2787 		if (r) {
2788 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2789 				  adev->ip_blocks[i].version->funcs->name, r);
2790 		}
2791 	}
2792 
2793 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2794 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2795 
2796 	amdgpu_amdkfd_suspend(adev, false);
2797 
2798 	/* Workaround for ASICs that need to disable the SMC first */
2799 	amdgpu_device_smu_fini_early(adev);
2800 
2801 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2802 		if (!adev->ip_blocks[i].status.hw)
2803 			continue;
2804 
2805 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2806 		/* XXX handle errors */
2807 		if (r) {
2808 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2809 				  adev->ip_blocks[i].version->funcs->name, r);
2810 		}
2811 
2812 		adev->ip_blocks[i].status.hw = false;
2813 	}
2814 
2815 	if (amdgpu_sriov_vf(adev)) {
2816 		if (amdgpu_virt_release_full_gpu(adev, false))
2817 			DRM_ERROR("failed to release exclusive mode on fini\n");
2818 	}
2819 
2820 	return 0;
2821 }
2822 
2823 /**
2824  * amdgpu_device_ip_fini - run fini for hardware IPs
2825  *
2826  * @adev: amdgpu_device pointer
2827  *
2828  * Main teardown pass for hardware IPs.  The list of all the hardware
2829  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2830  * are run.  hw_fini tears down the hardware associated with each IP
2831  * and sw_fini tears down any software state associated with each IP.
2832  * Returns 0 on success, negative error code on failure.
2833  */
2834 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2835 {
2836 	int i, r;
2837 
2838 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2839 		amdgpu_virt_release_ras_err_handler_data(adev);
2840 
2841 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2842 		amdgpu_xgmi_remove_device(adev);
2843 
2844 	amdgpu_amdkfd_device_fini_sw(adev);
2845 
2846 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2847 		if (!adev->ip_blocks[i].status.sw)
2848 			continue;
2849 
2850 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2851 			amdgpu_ucode_free_bo(adev);
2852 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2853 			amdgpu_device_wb_fini(adev);
2854 			amdgpu_device_mem_scratch_fini(adev);
2855 			amdgpu_ib_pool_fini(adev);
2856 		}
2857 
2858 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2859 		/* XXX handle errors */
2860 		if (r) {
2861 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2862 				  adev->ip_blocks[i].version->funcs->name, r);
2863 		}
2864 		adev->ip_blocks[i].status.sw = false;
2865 		adev->ip_blocks[i].status.valid = false;
2866 	}
2867 
2868 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2869 		if (!adev->ip_blocks[i].status.late_initialized)
2870 			continue;
2871 		if (adev->ip_blocks[i].version->funcs->late_fini)
2872 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2873 		adev->ip_blocks[i].status.late_initialized = false;
2874 	}
2875 
2876 	amdgpu_ras_fini(adev);
2877 
2878 	return 0;
2879 }
2880 
2881 /**
2882  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2883  *
2884  * @work: work_struct.
2885  */
2886 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2887 {
2888 	struct amdgpu_device *adev =
2889 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2890 	int r;
2891 
2892 	r = amdgpu_ib_ring_tests(adev);
2893 	if (r)
2894 		DRM_ERROR("ib ring test failed (%d).\n", r);
2895 }
2896 
2897 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2898 {
2899 	struct amdgpu_device *adev =
2900 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2901 
2902 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2903 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2904 
2905 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2906 		adev->gfx.gfx_off_state = true;
2907 }
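
/*
 * Illustrative sketch (not part of the driver): IP code does not schedule
 * this work directly; it brackets GFX register access with the gfxoff
 * refcount helper, which re-arms the delayed work once the request count
 * drops back to zero, roughly:
 *
 *	amdgpu_gfx_off_ctrl(adev, false);   // disallow GFXOFF while we touch GFX
 *	// ... access GFX registers safely ...
 *	amdgpu_gfx_off_ctrl(adev, true);    // allow GFXOFF again (via this work)
 */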
2908 
2909 /**
2910  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2911  *
2912  * @adev: amdgpu_device pointer
2913  *
2914  * Main suspend function for hardware IPs.  The list of all the hardware
2915  * IPs that make up the asic is walked, clockgating is disabled and the
2916  * suspend callbacks are run.  suspend puts the hardware and software state
2917  * in each IP into a state suitable for suspend.
2918  * Returns 0 on success, negative error code on failure.
2919  */
2920 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2921 {
2922 	int i, r;
2923 
2924 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2925 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2926 
2927 	/*
2928 	 * Per the PMFW team's suggestion, the driver needs to handle gfxoff
2929 	 * and df cstate feature disablement for the gpu reset (e.g. Mode1Reset)
2930 	 * scenario. Add the missing df cstate disablement here.
2931 	 */
2932 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2933 		dev_warn(adev->dev, "Failed to disallow df cstate");
2934 
2935 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2936 		if (!adev->ip_blocks[i].status.valid)
2937 			continue;
2938 
2939 		/* displays are handled separately */
2940 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2941 			continue;
2942 
2943 		/* XXX handle errors */
2944 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2945 		/* XXX handle errors */
2946 		if (r) {
2947 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2948 				  adev->ip_blocks[i].version->funcs->name, r);
2949 			return r;
2950 		}
2951 
2952 		adev->ip_blocks[i].status.hw = false;
2953 	}
2954 
2955 	return 0;
2956 }
2957 
2958 /**
2959  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2960  *
2961  * @adev: amdgpu_device pointer
2962  *
2963  * Main suspend function for hardware IPs.  The list of all the hardware
2964  * IPs that make up the asic is walked, clockgating is disabled and the
2965  * suspend callbacks are run.  suspend puts the hardware and software state
2966  * in each IP into a state suitable for suspend.
2967  * Returns 0 on success, negative error code on failure.
2968  */
2969 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2970 {
2971 	int i, r;
2972 
2973 	if (adev->in_s0ix)
2974 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2975 
2976 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2977 		if (!adev->ip_blocks[i].status.valid)
2978 			continue;
2979 		/* displays are handled in phase1 */
2980 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2981 			continue;
2982 		/* PSP lost connection when err_event_athub occurs */
2983 		if (amdgpu_ras_intr_triggered() &&
2984 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2985 			adev->ip_blocks[i].status.hw = false;
2986 			continue;
2987 		}
2988 
2989 		/* skip unnecessary suspend if we have not initialized them yet */
2990 		if (adev->gmc.xgmi.pending_reset &&
2991 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2992 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2993 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2994 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2995 			adev->ip_blocks[i].status.hw = false;
2996 			continue;
2997 		}
2998 
2999 		/* skip suspend of gfx/mes and psp for S0ix
3000 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3001 		 * like at runtime. PSP is also part of the always on hardware
3002 		 * so no need to suspend it.
3003 		 */
3004 		if (adev->in_s0ix &&
3005 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3006 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3007 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3008 			continue;
3009 
3010 		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3011 		if (adev->in_s0ix &&
3012 		    (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3013 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3014 			continue;
3015 
3016 		/* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3017 		 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3018 		 * from this location and RLC Autoload automatically also gets loaded
3019 		 * from here based on PMFW -> PSP message during re-init sequence.
3020 		 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3021 		 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3022 		 */
3023 		if (amdgpu_in_reset(adev) &&
3024 		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3025 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3026 			continue;
3027 
3028 		/* XXX handle errors */
3029 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
3030 		/* XXX handle errors */
3031 		if (r) {
3032 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
3033 				  adev->ip_blocks[i].version->funcs->name, r);
3034 		}
3035 		adev->ip_blocks[i].status.hw = false;
3036 		/* handle putting the SMC in the appropriate state */
3037 		if (!amdgpu_sriov_vf(adev)) {
3038 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3039 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3040 				if (r) {
3041 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3042 							adev->mp1_state, r);
3043 					return r;
3044 				}
3045 			}
3046 		}
3047 	}
3048 
3049 	return 0;
3050 }
3051 
3052 /**
3053  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3054  *
3055  * @adev: amdgpu_device pointer
3056  *
3057  * Main suspend function for hardware IPs.  The list of all the hardware
3058  * IPs that make up the asic is walked, clockgating is disabled and the
3059  * suspend callbacks are run.  suspend puts the hardware and software state
3060  * in each IP into a state suitable for suspend.
3061  * Returns 0 on success, negative error code on failure.
3062  */
3063 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3064 {
3065 	int r;
3066 
3067 	if (amdgpu_sriov_vf(adev)) {
3068 		amdgpu_virt_fini_data_exchange(adev);
3069 		amdgpu_virt_request_full_gpu(adev, false);
3070 	}
3071 
3072 	r = amdgpu_device_ip_suspend_phase1(adev);
3073 	if (r)
3074 		return r;
3075 	r = amdgpu_device_ip_suspend_phase2(adev);
3076 
3077 	if (amdgpu_sriov_vf(adev))
3078 		amdgpu_virt_release_full_gpu(adev, false);
3079 
3080 	return r;
3081 }
3082 
3083 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3084 {
3085 	int i, r;
3086 
3087 	static enum amd_ip_block_type ip_order[] = {
3088 		AMD_IP_BLOCK_TYPE_COMMON,
3089 		AMD_IP_BLOCK_TYPE_GMC,
3090 		AMD_IP_BLOCK_TYPE_PSP,
3091 		AMD_IP_BLOCK_TYPE_IH,
3092 	};
3093 
3094 	for (i = 0; i < adev->num_ip_blocks; i++) {
3095 		int j;
3096 		struct amdgpu_ip_block *block;
3097 
3098 		block = &adev->ip_blocks[i];
3099 		block->status.hw = false;
3100 
3101 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3102 
3103 			if (block->version->type != ip_order[j] ||
3104 				!block->status.valid)
3105 				continue;
3106 
3107 			r = block->version->funcs->hw_init(adev);
3108 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3109 			if (r)
3110 				return r;
3111 			block->status.hw = true;
3112 		}
3113 	}
3114 
3115 	return 0;
3116 }
3117 
3118 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3119 {
3120 	int i, r;
3121 
3122 	static enum amd_ip_block_type ip_order[] = {
3123 		AMD_IP_BLOCK_TYPE_SMC,
3124 		AMD_IP_BLOCK_TYPE_DCE,
3125 		AMD_IP_BLOCK_TYPE_GFX,
3126 		AMD_IP_BLOCK_TYPE_SDMA,
3127 		AMD_IP_BLOCK_TYPE_MES,
3128 		AMD_IP_BLOCK_TYPE_UVD,
3129 		AMD_IP_BLOCK_TYPE_VCE,
3130 		AMD_IP_BLOCK_TYPE_VCN,
3131 		AMD_IP_BLOCK_TYPE_JPEG
3132 	};
3133 
3134 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3135 		int j;
3136 		struct amdgpu_ip_block *block;
3137 
3138 		for (j = 0; j < adev->num_ip_blocks; j++) {
3139 			block = &adev->ip_blocks[j];
3140 
3141 			if (block->version->type != ip_order[i] ||
3142 				!block->status.valid ||
3143 				block->status.hw)
3144 				continue;
3145 
3146 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3147 				r = block->version->funcs->resume(adev);
3148 			else
3149 				r = block->version->funcs->hw_init(adev);
3150 
3151 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3152 			if (r)
3153 				return r;
3154 			block->status.hw = true;
3155 		}
3156 	}
3157 
3158 	return 0;
3159 }
3160 
3161 /**
3162  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3163  *
3164  * @adev: amdgpu_device pointer
3165  *
3166  * First resume function for hardware IPs.  The list of all the hardware
3167  * IPs that make up the asic is walked and the resume callbacks are run for
3168  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3169  * after a suspend and updates the software state as necessary.  This
3170  * function is also used for restoring the GPU after a GPU reset.
3171  * Returns 0 on success, negative error code on failure.
3172  */
3173 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3174 {
3175 	int i, r;
3176 
3177 	for (i = 0; i < adev->num_ip_blocks; i++) {
3178 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3179 			continue;
3180 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3181 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3182 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3183 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3184 
3185 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3186 			if (r) {
3187 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3188 					  adev->ip_blocks[i].version->funcs->name, r);
3189 				return r;
3190 			}
3191 			adev->ip_blocks[i].status.hw = true;
3192 		}
3193 	}
3194 
3195 	return 0;
3196 }
3197 
3198 /**
3199  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3200  *
3201  * @adev: amdgpu_device pointer
3202  *
3203  * Second resume function for hardware IPs.  The list of all the hardware
3204  * IPs that make up the asic is walked and the resume callbacks are run for
3205  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3206  * functional state after a suspend and updates the software state as
3207  * necessary.  This function is also used for restoring the GPU after a GPU
3208  * reset.
3209  * Returns 0 on success, negative error code on failure.
3210  */
3211 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3212 {
3213 	int i, r;
3214 
3215 	for (i = 0; i < adev->num_ip_blocks; i++) {
3216 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3217 			continue;
3218 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3219 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3220 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3221 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3222 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3223 			continue;
3224 		r = adev->ip_blocks[i].version->funcs->resume(adev);
3225 		if (r) {
3226 			DRM_ERROR("resume of IP block <%s> failed %d\n",
3227 				  adev->ip_blocks[i].version->funcs->name, r);
3228 			return r;
3229 		}
3230 		adev->ip_blocks[i].status.hw = true;
3231 	}
3232 
3233 	return 0;
3234 }
3235 
3236 /**
3237  * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3238  *
3239  * @adev: amdgpu_device pointer
3240  *
3241  * Third resume function for hardware IPs.  The list of all the hardware
3242  * IPs that make up the asic is walked and the resume callbacks are run for
3243  * all DCE.  resume puts the hardware into a functional state after a suspend
3244  * and updates the software state as necessary.  This function is also used
3245  * for restoring the GPU after a GPU reset.
3246  *
3247  * Returns 0 on success, negative error code on failure.
3248  */
3249 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3250 {
3251 	int i, r;
3252 
3253 	for (i = 0; i < adev->num_ip_blocks; i++) {
3254 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3255 			continue;
3256 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3257 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3258 			if (r) {
3259 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3260 					  adev->ip_blocks[i].version->funcs->name, r);
3261 				return r;
3262 			}
3263 			adev->ip_blocks[i].status.hw = true;
3264 		}
3265 	}
3266 
3267 	return 0;
3268 }
3269 
3270 /**
3271  * amdgpu_device_ip_resume - run resume for hardware IPs
3272  *
3273  * @adev: amdgpu_device pointer
3274  *
3275  * Main resume function for hardware IPs.  The hardware IPs
3276  * are split into two resume functions because they are
3277  * also used in recovering from a GPU reset and some additional
3278  * steps need to be taken between them.  In this case (S3/S4) they are
3279  * run sequentially.
3280  * Returns 0 on success, negative error code on failure.
3281  */
3282 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3283 {
3284 	int r;
3285 
3286 	r = amdgpu_device_ip_resume_phase1(adev);
3287 	if (r)
3288 		return r;
3289 
3290 	r = amdgpu_device_fw_loading(adev);
3291 	if (r)
3292 		return r;
3293 
3294 	r = amdgpu_device_ip_resume_phase2(adev);
3295 
3296 	if (r)
3297 		return r;
3298 
3299 	amdgpu_fence_driver_hw_init(adev);
3300 
3301 	r = amdgpu_device_ip_resume_phase3(adev);
3302 
3303 	return r;
3304 }
3305 
3306 /**
3307  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3308  *
3309  * @adev: amdgpu_device pointer
3310  *
3311  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3312  */
3313 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3314 {
3315 	if (amdgpu_sriov_vf(adev)) {
3316 		if (adev->is_atom_fw) {
3317 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3318 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3319 		} else {
3320 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3321 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3322 		}
3323 
3324 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3325 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3326 	}
3327 }
3328 
3329 /**
3330  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3331  *
3332  * @asic_type: AMD asic type
3333  *
3334  * Check if there is DC (new modesetting infrastructure) support for an asic.
3335  * returns true if DC has support, false if not.
3336  */
3337 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3338 {
3339 	switch (asic_type) {
3340 #ifdef CONFIG_DRM_AMDGPU_SI
3341 	case CHIP_HAINAN:
3342 #endif
3343 	case CHIP_TOPAZ:
3344 		/* chips with no display hardware */
3345 		return false;
3346 #if defined(CONFIG_DRM_AMD_DC)
3347 	case CHIP_TAHITI:
3348 	case CHIP_PITCAIRN:
3349 	case CHIP_VERDE:
3350 	case CHIP_OLAND:
3351 		/*
3352 		 * We have systems in the wild with these ASICs that require
3353 		 * LVDS and VGA support which is not supported with DC.
3354 		 *
3355 		 * Fall back to the non-DC driver here by default so as not to
3356 		 * cause regressions.
3357 		 */
3358 #if defined(CONFIG_DRM_AMD_DC_SI)
3359 		return amdgpu_dc > 0;
3360 #else
3361 		return false;
3362 #endif
3363 	case CHIP_BONAIRE:
3364 	case CHIP_KAVERI:
3365 	case CHIP_KABINI:
3366 	case CHIP_MULLINS:
3367 		/*
3368 		 * We have systems in the wild with these ASICs that require
3369 		 * VGA support which is not supported with DC.
3370 		 *
3371 		 * Fall back to the non-DC driver here by default so as not to
3372 		 * cause regressions.
3373 		 */
3374 		return amdgpu_dc > 0;
3375 	default:
3376 		return amdgpu_dc != 0;
3377 #else
3378 	default:
3379 		if (amdgpu_dc > 0)
3380 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3381 		return false;
3382 #endif
3383 	}
3384 }
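
/*
 * Usage note (editorial sketch; the module parameter name and its default
 * are assumptions inferred from the checks above, not taken verbatim from
 * this file):
 *
 *   amdgpu.dc=1   opt in to DC even on the SI/CIK parts listed above
 *   amdgpu.dc=0   force the legacy (non-DC) display path
 *   default       use DC wherever CONFIG_DRM_AMD_DC supports the ASIC
 */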
3385 
3386 /**
3387  * amdgpu_device_has_dc_support - check if dc is supported
3388  *
3389  * @adev: amdgpu_device pointer
3390  *
3391  * Returns true for supported, false for not supported
3392  */
3393 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3394 {
3395 	if (adev->enable_virtual_display ||
3396 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3397 		return false;
3398 
3399 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3400 }
3401 
3402 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3403 {
3404 	struct amdgpu_device *adev =
3405 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3406 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3407 
3408 	/* It's a bug to not have a hive within this function */
3409 	if (WARN_ON(!hive))
3410 		return;
3411 
3412 	/*
3413 	 * Use task barrier to synchronize all xgmi reset works across the
3414 	 * hive. task_barrier_enter and task_barrier_exit will block
3415 	 * until all the threads running the xgmi reset works reach
3416 	 * those points. task_barrier_full will do both blocks.
3417 	 */
3418 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3419 
3420 		task_barrier_enter(&hive->tb);
3421 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3422 
3423 		if (adev->asic_reset_res)
3424 			goto fail;
3425 
3426 		task_barrier_exit(&hive->tb);
3427 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3428 
3429 		if (adev->asic_reset_res)
3430 			goto fail;
3431 
3432 		if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3433 		    adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3434 			adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3435 	} else {
3436 
3437 		task_barrier_full(&hive->tb);
3438 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3439 	}
3440 
3441 fail:
3442 	if (adev->asic_reset_res)
3443 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3444 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3445 	amdgpu_put_xgmi_hive(hive);
3446 }
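
/*
 * Minimal sketch of the task_barrier pattern used above (illustrative only;
 * example_hive_step() is hypothetical and not part of the driver, hence the
 * #if 0): every device in the hive runs the same worker, and the barrier
 * makes a step start only after all nodes have arrived and lets the next
 * step begin only after all nodes have finished.
 */
#if 0
static void example_hive_step(struct amdgpu_hive_info *hive)
{
	task_barrier_enter(&hive->tb);	/* block until every node gets here */
	/* ... step that must start on all nodes together, e.g. BACO entry ... */
	task_barrier_exit(&hive->tb);	/* block until every node has finished */
	/* ... step that must not start before the previous one is hive-wide ... */
}
#endif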
3447 
3448 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3449 {
3450 	char *input = amdgpu_lockup_timeout;
3451 	char *timeout_setting = NULL;
3452 	int index = 0;
3453 	long timeout;
3454 	int ret = 0;
3455 
3456 	/*
3457 	 * By default the timeout for non-compute jobs is 10000 ms
3458 	 * and 60000 ms for compute jobs.
3459 	 * In SR-IOV or passthrough mode, the timeout for compute
3460 	 * jobs is 60000 ms by default.
3461 	 */
3462 	adev->gfx_timeout = msecs_to_jiffies(10000);
3463 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3464 	if (amdgpu_sriov_vf(adev))
3465 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3466 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3467 	else
3468 		adev->compute_timeout =  msecs_to_jiffies(60000);
3469 
3470 #ifdef notyet
3471 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3472 		while ((timeout_setting = strsep(&input, ",")) &&
3473 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3474 			ret = kstrtol(timeout_setting, 0, &timeout);
3475 			if (ret)
3476 				return ret;
3477 
3478 			if (timeout == 0) {
3479 				index++;
3480 				continue;
3481 			} else if (timeout < 0) {
3482 				timeout = MAX_SCHEDULE_TIMEOUT;
3483 				dev_warn(adev->dev, "lockup timeout disabled");
3484 				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3485 			} else {
3486 				timeout = msecs_to_jiffies(timeout);
3487 			}
3488 
3489 			switch (index++) {
3490 			case 0:
3491 				adev->gfx_timeout = timeout;
3492 				break;
3493 			case 1:
3494 				adev->compute_timeout = timeout;
3495 				break;
3496 			case 2:
3497 				adev->sdma_timeout = timeout;
3498 				break;
3499 			case 3:
3500 				adev->video_timeout = timeout;
3501 				break;
3502 			default:
3503 				break;
3504 			}
3505 		}
3506 		/*
3507 		 * There is only one value specified and
3508 		 * it should apply to all non-compute jobs.
3509 		 */
3510 		if (index == 1) {
3511 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3512 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3513 				adev->compute_timeout = adev->gfx_timeout;
3514 		}
3515 	}
3516 #endif
3517 
3518 	return ret;
3519 }
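
/*
 * Worked example (editorial; assumes the Linux "amdgpu.lockup_timeout"
 * module parameter, whose parsing is compiled out above on OpenBSD):
 * "lockup_timeout=5000,120000,8000,8000" yields gfx=5000 ms,
 * compute=120000 ms, sdma=8000 ms and video=8000 ms; a single value such as
 * "lockup_timeout=20000" applies to all non-compute queues, and to compute
 * as well under SR-IOV or passthrough.  A value of 0 keeps the default and
 * a negative value disables the timeout.
 */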
3520 
3521 /**
3522  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3523  *
3524  * @adev: amdgpu_device pointer
3525  *
3526  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3527  */
3528 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3529 {
3530 #ifdef notyet
3531 	struct iommu_domain *domain;
3532 
3533 	domain = iommu_get_domain_for_dev(adev->dev);
3534 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3535 #endif
3536 		adev->ram_is_direct_mapped = true;
3537 }
3538 
3539 static const struct attribute *amdgpu_dev_attributes[] = {
3540 	&dev_attr_pcie_replay_count.attr,
3541 	NULL
3542 };
3543 
3544 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3545 {
3546 	if (amdgpu_mcbp == 1)
3547 		adev->gfx.mcbp = true;
3548 	else if (amdgpu_mcbp == 0)
3549 		adev->gfx.mcbp = false;
3550 
3551 	if (amdgpu_sriov_vf(adev))
3552 		adev->gfx.mcbp = true;
3553 
3554 	if (adev->gfx.mcbp)
3555 		DRM_INFO("MCBP is enabled\n");
3556 }
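
/*
 * Example (editorial sketch; the parameter default is an assumption):
 * mid-command-buffer preemption follows the "amdgpu_mcbp" module parameter
 * as handled above: 1 forces it on, 0 forces it off, any other value
 * (presumably the auto default) keeps the per-ASIC setting, and an SR-IOV
 * VF always runs with MCBP enabled.
 */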
3557 
3558 /**
3559  * amdgpu_device_init - initialize the driver
3560  *
3561  * @adev: amdgpu_device pointer
3562  * @flags: driver flags
3563  *
3564  * Initializes the driver info and hw (all asics).
3565  * Returns 0 for success or an error on failure.
3566  * Called at driver startup.
3567  */
3568 int amdgpu_device_init(struct amdgpu_device *adev,
3569 		       uint32_t flags)
3570 {
3571 	struct drm_device *ddev = adev_to_drm(adev);
3572 	struct pci_dev *pdev = adev->pdev;
3573 	int r, i;
3574 	bool px = false;
3575 	u32 max_MBps;
3576 	int tmp;
3577 
3578 	adev->shutdown = false;
3579 	adev->flags = flags;
3580 
3581 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3582 		adev->asic_type = amdgpu_force_asic_type;
3583 	else
3584 		adev->asic_type = flags & AMD_ASIC_MASK;
3585 
3586 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3587 	if (amdgpu_emu_mode == 1)
3588 		adev->usec_timeout *= 10;
3589 	adev->gmc.gart_size = 512 * 1024 * 1024;
3590 	adev->accel_working = false;
3591 	adev->num_rings = 0;
3592 	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3593 	adev->mman.buffer_funcs = NULL;
3594 	adev->mman.buffer_funcs_ring = NULL;
3595 	adev->vm_manager.vm_pte_funcs = NULL;
3596 	adev->vm_manager.vm_pte_num_scheds = 0;
3597 	adev->gmc.gmc_funcs = NULL;
3598 	adev->harvest_ip_mask = 0x0;
3599 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3600 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3601 
3602 	adev->smc_rreg = &amdgpu_invalid_rreg;
3603 	adev->smc_wreg = &amdgpu_invalid_wreg;
3604 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3605 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3606 	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3607 	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3608 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3609 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3610 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3611 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3612 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3613 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3614 	adev->didt_rreg = &amdgpu_invalid_rreg;
3615 	adev->didt_wreg = &amdgpu_invalid_wreg;
3616 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3617 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3618 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3619 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3620 
3621 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3622 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3623 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3624 
3625 	/* mutex initialization is all done here so we
3626 	 * can call these functions again without locking issues
3627 	 */
3628 	rw_init(&adev->firmware.mutex, "agfw");
3629 	rw_init(&adev->pm.mutex, "agpm");
3630 	rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3631 	rw_init(&adev->srbm_mutex, "srbm");
3632 	rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3633 	rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3634 	rw_init(&adev->gfx.partition_mutex, "gfxpar");
3635 	rw_init(&adev->grbm_idx_mutex, "grbmidx");
3636 	rw_init(&adev->mn_lock, "agpumn");
3637 	rw_init(&adev->virt.vf_errors.lock, "vferr");
3638 	rw_init(&adev->virt.rlcg_reg_lock, "vrlcg");
3639 	hash_init(adev->mn_hash);
3640 	rw_init(&adev->psp.mutex, "agpsp");
3641 	rw_init(&adev->notifier_lock, "agnf");
3642 	rw_init(&adev->pm.stable_pstate_ctx_lock, "agps");
3643 	rw_init(&adev->benchmark_mutex, "agbm");
3644 
3645 	amdgpu_device_init_apu_flags(adev);
3646 
3647 	r = amdgpu_device_check_arguments(adev);
3648 	if (r)
3649 		return r;
3650 
3651 	mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3652 	mtx_init(&adev->smc_idx_lock, IPL_TTY);
3653 	mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3654 	mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3655 	mtx_init(&adev->didt_idx_lock, IPL_TTY);
3656 	mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3657 	mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3658 	mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3659 	mtx_init(&adev->mm_stats.lock, IPL_NONE);
3660 
3661 	INIT_LIST_HEAD(&adev->shadow_list);
3662 	rw_init(&adev->shadow_list_lock, "sdwlst");
3663 
3664 	INIT_LIST_HEAD(&adev->reset_list);
3665 
3666 	INIT_LIST_HEAD(&adev->ras_list);
3667 
3668 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3669 			  amdgpu_device_delayed_init_work_handler);
3670 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3671 			  amdgpu_device_delay_enable_gfx_off);
3672 
3673 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3674 
3675 	adev->gfx.gfx_off_req_count = 1;
3676 	adev->gfx.gfx_off_residency = 0;
3677 	adev->gfx.gfx_off_entrycount = 0;
3678 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3679 
3680 	atomic_set(&adev->throttling_logging_enabled, 1);
3681 	/*
3682 	 * If throttling continues, logging will be performed every minute
3683 	 * to avoid log flooding. "-1" is subtracted since the thermal
3684 	 * throttling interrupt comes every second. Thus, the total logging
3685 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3686 	 * for throttling interrupt) = 60 seconds.
3687 	 */
3688 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3689 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3690 
3691 #ifdef __linux__
3692 	/* Registers mapping */
3693 	/* TODO: block userspace mapping of io register */
3694 	if (adev->asic_type >= CHIP_BONAIRE) {
3695 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3696 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3697 	} else {
3698 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3699 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3700 	}
3701 #endif
3702 
3703 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3704 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3705 
3706 #ifdef __linux__
3707 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3708 	if (!adev->rmmio)
3709 		return -ENOMEM;
3710 #endif
3711 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3712 	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3713 
3714 	/*
3715 	 * The reset domain needs to be present early, before the XGMI hive is
3716 	 * discovered (if any) and initialized, so the reset sem and in_gpu reset
3717 	 * flag can be used early on during init and before calling RREG32.
3718 	 */
3719 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3720 	if (!adev->reset_domain)
3721 		return -ENOMEM;
3722 
3723 	/* detect hw virtualization here */
3724 	amdgpu_detect_virtualization(adev);
3725 
3726 	amdgpu_device_get_pcie_info(adev);
3727 
3728 	r = amdgpu_device_get_job_timeout_settings(adev);
3729 	if (r) {
3730 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3731 		return r;
3732 	}
3733 
3734 	/* early init functions */
3735 	r = amdgpu_device_ip_early_init(adev);
3736 	if (r)
3737 		return r;
3738 
3739 	amdgpu_device_set_mcbp(adev);
3740 
3741 	/* Get rid of things like offb */
3742 	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3743 	if (r)
3744 		return r;
3745 
3746 	/* Enable TMZ based on IP_VERSION */
3747 	amdgpu_gmc_tmz_set(adev);
3748 
3749 	amdgpu_gmc_noretry_set(adev);
3750 	/* Need to get xgmi info early to decide the reset behavior*/
3751 	if (adev->gmc.xgmi.supported) {
3752 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
3753 		if (r)
3754 			return r;
3755 	}
3756 
3757 	/* enable PCIE atomic ops */
3758 #ifdef notyet
3759 	if (amdgpu_sriov_vf(adev)) {
3760 		if (adev->virt.fw_reserve.p_pf2vf)
3761 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3762 						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3763 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3764 	/* APUs with gfx9 and newer don't rely on PCIe atomics; an internal
3765 	 * path natively supports atomics, so set have_atomics_support to true.
3766 	 */
3767 	} else if ((adev->flags & AMD_IS_APU) &&
3768 		   (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3769 		adev->have_atomics_support = true;
3770 	} else {
3771 		adev->have_atomics_support =
3772 			!pci_enable_atomic_ops_to_root(adev->pdev,
3773 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3774 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3775 	}
3776 
3777 	if (!adev->have_atomics_support)
3778 		dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3779 #else
3780 	/* APUs with gfx9 and newer don't rely on PCIe atomics; an internal
3781 	 * path natively supports atomics, so set have_atomics_support to true.
3782 	 */
3783 	if ((adev->flags & AMD_IS_APU) &&
3784 		(adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)))
3785 		adev->have_atomics_support = true;
3786 	else
3787 		adev->have_atomics_support = false;
3788 #endif
3789 
3790 	/* doorbell bar mapping and doorbell index init*/
3791 	amdgpu_doorbell_init(adev);
3792 
3793 	if (amdgpu_emu_mode == 1) {
3794 		/* post the asic on emulation mode */
3795 		emu_soc_asic_init(adev);
3796 		goto fence_driver_init;
3797 	}
3798 
3799 	amdgpu_reset_init(adev);
3800 
3801 	/* detect if we are with an SRIOV vbios */
3802 	if (adev->bios)
3803 		amdgpu_device_detect_sriov_bios(adev);
3804 
3805 	/* check if we need to reset the asic
3806 	 *  E.g., driver was not cleanly unloaded previously, etc.
3807 	 */
3808 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3809 		if (adev->gmc.xgmi.num_physical_nodes) {
3810 			dev_info(adev->dev, "Pending hive reset.\n");
3811 			adev->gmc.xgmi.pending_reset = true;
3812 			/* Only need to init the necessary blocks for SMU to handle the reset */
3813 			for (i = 0; i < adev->num_ip_blocks; i++) {
3814 				if (!adev->ip_blocks[i].status.valid)
3815 					continue;
3816 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3817 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3818 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3819 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3820 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3821 						adev->ip_blocks[i].version->funcs->name);
3822 					adev->ip_blocks[i].status.hw = true;
3823 				}
3824 			}
3825 		} else {
3826 			tmp = amdgpu_reset_method;
3827 			/* It should do a default reset when loading or reloading the driver,
3828 			 * regardless of the module parameter reset_method.
3829 			 */
3830 			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3831 			r = amdgpu_asic_reset(adev);
3832 			amdgpu_reset_method = tmp;
3833 			if (r) {
3834 				dev_err(adev->dev, "asic reset on init failed\n");
3835 				goto failed;
3836 			}
3837 		}
3838 	}
3839 
3840 	/* Post card if necessary */
3841 	if (amdgpu_device_need_post(adev)) {
3842 		if (!adev->bios) {
3843 			dev_err(adev->dev, "no vBIOS found\n");
3844 			r = -EINVAL;
3845 			goto failed;
3846 		}
3847 		DRM_INFO("GPU posting now...\n");
3848 		r = amdgpu_device_asic_init(adev);
3849 		if (r) {
3850 			dev_err(adev->dev, "gpu post error!\n");
3851 			goto failed;
3852 		}
3853 	}
3854 
3855 	if (adev->bios) {
3856 		if (adev->is_atom_fw) {
3857 			/* Initialize clocks */
3858 			r = amdgpu_atomfirmware_get_clock_info(adev);
3859 			if (r) {
3860 				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3861 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3862 				goto failed;
3863 			}
3864 		} else {
3865 			/* Initialize clocks */
3866 			r = amdgpu_atombios_get_clock_info(adev);
3867 			if (r) {
3868 				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3869 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3870 				goto failed;
3871 			}
3872 			/* init i2c buses */
3873 			if (!amdgpu_device_has_dc_support(adev))
3874 				amdgpu_atombios_i2c_init(adev);
3875 		}
3876 	}
3877 
3878 fence_driver_init:
3879 	/* Fence driver */
3880 	r = amdgpu_fence_driver_sw_init(adev);
3881 	if (r) {
3882 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3883 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3884 		goto failed;
3885 	}
3886 
3887 	/* init the mode config */
3888 	drm_mode_config_init(adev_to_drm(adev));
3889 
3890 	r = amdgpu_device_ip_init(adev);
3891 	if (r) {
3892 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3893 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3894 		goto release_ras_con;
3895 	}
3896 
3897 	amdgpu_fence_driver_hw_init(adev);
3898 
3899 	dev_info(adev->dev,
3900 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3901 			adev->gfx.config.max_shader_engines,
3902 			adev->gfx.config.max_sh_per_se,
3903 			adev->gfx.config.max_cu_per_sh,
3904 			adev->gfx.cu_info.number);
3905 
3906 #ifdef __OpenBSD__
3907 {
3908 	const char *chip_name;
3909 	uint32_t version = adev->ip_versions[GC_HWIP][0];
3910 	int maj, min, rev;
3911 
3912 	switch (adev->asic_type) {
3913 	case CHIP_RAVEN:
3914 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3915 			chip_name = "RAVEN2";
3916 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3917 			chip_name = "PICASSO";
3918 		else
3919 			chip_name = "RAVEN";
3920 		break;
3921 	case CHIP_RENOIR:
3922 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
3923 			chip_name = "RENOIR";
3924 		else
3925 			chip_name = "GREEN_SARDINE";
3926 		break;
3927 	default:
3928 		chip_name = amdgpu_asic_name[adev->asic_type];
3929 	}
3930 
3931 	printf("%s: %s", adev->self.dv_xname, chip_name);
3932 	/* show graphics/compute ip block version, not set on < GFX9 */
3933 	if (version) {
3934 		maj = IP_VERSION_MAJ(version);
3935 		min = IP_VERSION_MIN(version);
3936 		rev = IP_VERSION_REV(version);
3937 		printf(" GC %d.%d.%d", maj, min, rev);
3938 	}
3939 	printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id);
3940 }
3941 #endif
3942 
3943 	adev->accel_working = true;
3944 
3945 	amdgpu_vm_check_compute_bug(adev);
3946 
3947 	/* Initialize the buffer migration limit. */
3948 	if (amdgpu_moverate >= 0)
3949 		max_MBps = amdgpu_moverate;
3950 	else
3951 		max_MBps = 8; /* Allow 8 MB/s. */
3952 	/* Get a log2 for easy divisions. */
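	/*
	 * Worked example (editorial): with the default max_MBps = 8,
	 * log2_max_MBps = ilog2(8) = 3, so a per-second byte budget of
	 * 8 MiB can be derived as (1ULL << (20 + 3)) with a shift rather
	 * than a division.
	 */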
3953 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3954 
3955 	r = amdgpu_atombios_sysfs_init(adev);
3956 	if (r)
3957 		drm_err(&adev->ddev,
3958 			"registering atombios sysfs failed (%d).\n", r);
3959 
3960 	r = amdgpu_pm_sysfs_init(adev);
3961 	if (r)
3962 		DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3963 
3964 	r = amdgpu_ucode_sysfs_init(adev);
3965 	if (r) {
3966 		adev->ucode_sysfs_en = false;
3967 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3968 	} else
3969 		adev->ucode_sysfs_en = true;
3970 
3971 	/*
3972 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3973 	 * Otherwise the mgpu fan boost feature will be skipped because the
3974 	 * gpu instance count would be too low.
3975 	 */
3976 	amdgpu_register_gpu_instance(adev);
3977 
3978 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3979 	 * explicit gating rather than handling it automatically.
3980 	 */
3981 	if (!adev->gmc.xgmi.pending_reset) {
3982 		r = amdgpu_device_ip_late_init(adev);
3983 		if (r) {
3984 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3985 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3986 			goto release_ras_con;
3987 		}
3988 		/* must succeed. */
3989 		amdgpu_ras_resume(adev);
3990 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3991 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3992 	}
3993 
3994 	if (amdgpu_sriov_vf(adev)) {
3995 		amdgpu_virt_release_full_gpu(adev, true);
3996 		flush_delayed_work(&adev->delayed_init_work);
3997 	}
3998 
3999 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4000 	if (r)
4001 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
4002 
4003 	amdgpu_fru_sysfs_init(adev);
4004 
4005 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4006 		r = amdgpu_pmu_init(adev);
4007 	if (r)
4008 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4009 
4010 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4011 	if (amdgpu_device_cache_pci_state(adev->pdev))
4012 		pci_restore_state(pdev);
4013 
4014 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4015 	/* this will fail for cards that aren't VGA class devices, just
4016 	 * ignore it
4017 	 */
4018 #ifdef notyet
4019 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4020 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4021 #endif
4022 
4023 	px = amdgpu_device_supports_px(ddev);
4024 
4025 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4026 				apple_gmux_detect(NULL, NULL)))
4027 		vga_switcheroo_register_client(adev->pdev,
4028 					       &amdgpu_switcheroo_ops, px);
4029 
4030 	if (px)
4031 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4032 
4033 	if (adev->gmc.xgmi.pending_reset)
4034 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4035 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
4036 
4037 	amdgpu_device_check_iommu_direct_map(adev);
4038 
4039 	return 0;
4040 
4041 release_ras_con:
4042 	if (amdgpu_sriov_vf(adev))
4043 		amdgpu_virt_release_full_gpu(adev, true);
4044 
4045 	/* failed in exclusive mode due to timeout */
4046 	if (amdgpu_sriov_vf(adev) &&
4047 		!amdgpu_sriov_runtime(adev) &&
4048 		amdgpu_virt_mmio_blocked(adev) &&
4049 		!amdgpu_virt_wait_reset(adev)) {
4050 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4051 		/* Don't send request since VF is inactive. */
4052 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4053 		adev->virt.ops = NULL;
4054 		r = -EAGAIN;
4055 	}
4056 	amdgpu_release_ras_context(adev);
4057 
4058 failed:
4059 	amdgpu_vf_error_trans_all(adev);
4060 
4061 	return r;
4062 }
4063 
4064 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4065 {
4066 	STUB();
4067 #ifdef notyet
4068 
4069 	/* Clear all CPU mappings pointing to this device */
4070 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4071 #endif
4072 
4073 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
4074 	amdgpu_doorbell_fini(adev);
4075 
4076 #ifdef __linux__
4077 	iounmap(adev->rmmio);
4078 	adev->rmmio = NULL;
4079 	if (adev->mman.aper_base_kaddr)
4080 		iounmap(adev->mman.aper_base_kaddr);
4081 	adev->mman.aper_base_kaddr = NULL;
4082 #else
4083 	if (adev->rmmio_size > 0)
4084 		bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4085 		    adev->rmmio_size);
4086 	adev->rmmio_size = 0;
4087 	adev->rmmio = NULL;
4088 	if (adev->mman.aper_base_kaddr)
4089 		bus_space_unmap(adev->memt, adev->mman.aper_bsh,
4090 		    adev->gmc.visible_vram_size);
4091 	adev->mman.aper_base_kaddr = NULL;
4092 #endif
4093 
4094 	/* Memory manager related */
4095 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4096 #ifdef __linux__
4097 		arch_phys_wc_del(adev->gmc.vram_mtrr);
4098 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4099 #else
4100 		drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
4101 #endif
4102 	}
4103 }
4104 
4105 /**
4106  * amdgpu_device_fini_hw - tear down the driver
4107  *
4108  * @adev: amdgpu_device pointer
4109  *
4110  * Tear down the driver info (all asics).
4111  * Called at driver shutdown.
4112  */
4113 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4114 {
4115 	dev_info(adev->dev, "amdgpu: finishing device.\n");
4116 	flush_delayed_work(&adev->delayed_init_work);
4117 	adev->shutdown = true;
4118 
4119 	/* make sure IB tests have finished before entering exclusive mode
4120 	 * to avoid preemption during the IB tests
4121 	 */
4122 	if (amdgpu_sriov_vf(adev)) {
4123 		amdgpu_virt_request_full_gpu(adev, false);
4124 		amdgpu_virt_fini_data_exchange(adev);
4125 	}
4126 
4127 	/* disable all interrupts */
4128 	amdgpu_irq_disable_all(adev);
4129 	if (adev->mode_info.mode_config_initialized) {
4130 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4131 			drm_helper_force_disable_all(adev_to_drm(adev));
4132 		else
4133 			drm_atomic_helper_shutdown(adev_to_drm(adev));
4134 	}
4135 	amdgpu_fence_driver_hw_fini(adev);
4136 
4137 	if (adev->mman.initialized)
4138 		drain_workqueue(adev->mman.bdev.wq);
4139 
4140 	if (adev->pm.sysfs_initialized)
4141 		amdgpu_pm_sysfs_fini(adev);
4142 	if (adev->ucode_sysfs_en)
4143 		amdgpu_ucode_sysfs_fini(adev);
4144 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4145 	amdgpu_fru_sysfs_fini(adev);
4146 
4147 	/* disable ras feature must before hw fini */
4148 	amdgpu_ras_pre_fini(adev);
4149 
4150 	amdgpu_device_ip_fini_early(adev);
4151 
4152 	amdgpu_irq_fini_hw(adev);
4153 
4154 	if (adev->mman.initialized)
4155 		ttm_device_clear_dma_mappings(&adev->mman.bdev);
4156 
4157 	amdgpu_gart_dummy_page_fini(adev);
4158 
4159 	if (drm_dev_is_unplugged(adev_to_drm(adev)))
4160 		amdgpu_device_unmap_mmio(adev);
4161 
4162 }
4163 
4164 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4165 {
4166 	int idx;
4167 	bool px;
4168 
4169 	amdgpu_device_ip_fini(adev);
4170 	amdgpu_fence_driver_sw_fini(adev);
4171 	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4172 	adev->accel_working = false;
4173 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4174 
4175 	amdgpu_reset_fini(adev);
4176 
4177 	/* free i2c buses */
4178 	if (!amdgpu_device_has_dc_support(adev))
4179 		amdgpu_i2c_fini(adev);
4180 
4181 	if (amdgpu_emu_mode != 1)
4182 		amdgpu_atombios_fini(adev);
4183 
4184 	kfree(adev->bios);
4185 	adev->bios = NULL;
4186 
4187 	px = amdgpu_device_supports_px(adev_to_drm(adev));
4188 
4189 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4190 				apple_gmux_detect(NULL, NULL)))
4191 		vga_switcheroo_unregister_client(adev->pdev);
4192 
4193 	if (px)
4194 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4195 
4196 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4197 		vga_client_unregister(adev->pdev);
4198 
4199 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4200 #ifdef __linux__
4201 		iounmap(adev->rmmio);
4202 		adev->rmmio = NULL;
4203 #else
4204 		if (adev->rmmio_size > 0)
4205 			bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4206 			    adev->rmmio_size);
4207 		adev->rmmio_size = 0;
4208 		adev->rmmio = NULL;
4209 #endif
4210 		amdgpu_doorbell_fini(adev);
4211 		drm_dev_exit(idx);
4212 	}
4213 
4214 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4215 		amdgpu_pmu_fini(adev);
4216 	if (adev->mman.discovery_bin)
4217 		amdgpu_discovery_fini(adev);
4218 
4219 	amdgpu_reset_put_reset_domain(adev->reset_domain);
4220 	adev->reset_domain = NULL;
4221 
4222 	kfree(adev->pci_state);
4223 
4224 }
4225 
4226 /**
4227  * amdgpu_device_evict_resources - evict device resources
4228  * @adev: amdgpu device object
4229  *
4230  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4231  * of the vram memory type. Mainly used for evicting device resources
4232  * at suspend time.
4233  *
4234  */
4235 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4236 {
4237 	int ret;
4238 
4239 	/* No need to evict vram on APUs for suspend to ram or s2idle */
4240 	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4241 		return 0;
4242 
4243 	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4244 	if (ret)
4245 		DRM_WARN("evicting device resources failed\n");
4246 	return ret;
4247 }
4248 
4249 /*
4250  * Suspend & resume.
4251  */
4252 /**
4253  * amdgpu_device_prepare - prepare for device suspend
4254  *
4255  * @dev: drm dev pointer
4256  *
4257  * Prepare to put the hw in the suspend state (all asics).
4258  * Returns 0 for success or an error on failure.
4259  * Called at driver suspend.
4260  */
4261 int amdgpu_device_prepare(struct drm_device *dev)
4262 {
4263 	struct amdgpu_device *adev = drm_to_adev(dev);
4264 	int i, r;
4265 
4266 	amdgpu_choose_low_power_state(adev);
4267 
4268 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4269 		return 0;
4270 
4271 	/* Evict the majority of BOs before starting suspend sequence */
4272 	r = amdgpu_device_evict_resources(adev);
4273 	if (r)
4274 		goto unprepare;
4275 
4276 	flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4277 
4278 	for (i = 0; i < adev->num_ip_blocks; i++) {
4279 		if (!adev->ip_blocks[i].status.valid)
4280 			continue;
4281 		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4282 			continue;
4283 		r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4284 		if (r)
4285 			goto unprepare;
4286 	}
4287 
4288 	return 0;
4289 
4290 unprepare:
4291 	adev->in_s0ix = adev->in_s3 = false;
4292 
4293 	return r;
4294 }
4295 
4296 /**
4297  * amdgpu_device_suspend - initiate device suspend
4298  *
4299  * @dev: drm dev pointer
4300  * @fbcon : notify the fbdev of suspend
4301  *
4302  * Puts the hw in the suspend state (all asics).
4303  * Returns 0 for success or an error on failure.
4304  * Called at driver suspend.
4305  */
4306 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4307 {
4308 	struct amdgpu_device *adev = drm_to_adev(dev);
4309 	int r = 0;
4310 
4311 	if (adev->shutdown)
4312 		return 0;
4313 
4314 #ifdef notyet
4315 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4316 		return 0;
4317 #endif
4318 
4319 	adev->in_suspend = true;
4320 
4321 	if (amdgpu_sriov_vf(adev)) {
4322 		amdgpu_virt_fini_data_exchange(adev);
4323 		r = amdgpu_virt_request_full_gpu(adev, false);
4324 		if (r)
4325 			return r;
4326 	}
4327 
4328 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4329 		DRM_WARN("smart shift update failed\n");
4330 
4331 	if (fbcon)
4332 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4333 
4334 	cancel_delayed_work_sync(&adev->delayed_init_work);
4335 
4336 	amdgpu_ras_suspend(adev);
4337 
4338 	amdgpu_device_ip_suspend_phase1(adev);
4339 
4340 	if (!adev->in_s0ix)
4341 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4342 
4343 	r = amdgpu_device_evict_resources(adev);
4344 	if (r)
4345 		return r;
4346 
4347 	amdgpu_fence_driver_hw_fini(adev);
4348 
4349 	amdgpu_device_ip_suspend_phase2(adev);
4350 
4351 	if (amdgpu_sriov_vf(adev))
4352 		amdgpu_virt_release_full_gpu(adev, false);
4353 
4354 	return 0;
4355 }
4356 
4357 /**
4358  * amdgpu_device_resume - initiate device resume
4359  *
4360  * @dev: drm dev pointer
4361  * @fbcon : notify the fbdev of resume
4362  *
4363  * Bring the hw back to operating state (all asics).
4364  * Returns 0 for success or an error on failure.
4365  * Called at driver resume.
4366  */
4367 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4368 {
4369 	struct amdgpu_device *adev = drm_to_adev(dev);
4370 	int r = 0;
4371 
4372 	if (amdgpu_sriov_vf(adev)) {
4373 		r = amdgpu_virt_request_full_gpu(adev, true);
4374 		if (r)
4375 			return r;
4376 	}
4377 
4378 #ifdef notyet
4379 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4380 		return 0;
4381 #endif
4382 
4383 	if (adev->in_s0ix)
4384 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4385 
4386 	/* post card */
4387 	if (amdgpu_device_need_post(adev)) {
4388 		r = amdgpu_device_asic_init(adev);
4389 		if (r)
4390 			dev_err(adev->dev, "amdgpu asic init failed\n");
4391 	}
4392 
4393 	r = amdgpu_device_ip_resume(adev);
4394 
4395 	if (r) {
4396 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4397 		goto exit;
4398 	}
4399 
4400 	r = amdgpu_device_ip_late_init(adev);
4401 	if (r)
4402 		goto exit;
4403 
4404 	queue_delayed_work(system_wq, &adev->delayed_init_work,
4405 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
4406 
4407 	if (!adev->in_s0ix) {
4408 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4409 		if (r)
4410 			goto exit;
4411 	}
4412 
4413 exit:
4414 	if (amdgpu_sriov_vf(adev)) {
4415 		amdgpu_virt_init_data_exchange(adev);
4416 		amdgpu_virt_release_full_gpu(adev, true);
4417 	}
4418 
4419 	if (r)
4420 		return r;
4421 
4422 	/* Make sure IB tests flushed */
4423 	flush_delayed_work(&adev->delayed_init_work);
4424 
4425 	if (fbcon)
4426 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4427 
4428 	amdgpu_ras_resume(adev);
4429 
4430 	if (adev->mode_info.num_crtc) {
4431 		/*
4432 		 * Most of the connector probing functions try to acquire runtime pm
4433 		 * refs to ensure that the GPU is powered on when connector polling is
4434 		 * performed. Since we're calling this from a runtime PM callback,
4435 		 * trying to acquire rpm refs will cause us to deadlock.
4436 		 *
4437 		 * Since we're guaranteed to be holding the rpm lock, it's safe to
4438 		 * temporarily disable the rpm helpers so this doesn't deadlock us.
4439 		 */
4440 #if defined(CONFIG_PM) && defined(__linux__)
4441 		dev->dev->power.disable_depth++;
4442 #endif
4443 		if (!adev->dc_enabled)
4444 			drm_helper_hpd_irq_event(dev);
4445 		else
4446 			drm_kms_helper_hotplug_event(dev);
4447 #if defined(CONFIG_PM) && defined(__linux__)
4448 		dev->dev->power.disable_depth--;
4449 #endif
4450 	}
4451 	adev->in_suspend = false;
4452 
4453 	if (adev->enable_mes)
4454 		amdgpu_mes_self_test(adev);
4455 
4456 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4457 		DRM_WARN("smart shift update failed\n");
4458 
4459 	return 0;
4460 }
4461 
4462 /**
4463  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4464  *
4465  * @adev: amdgpu_device pointer
4466  *
4467  * The list of all the hardware IPs that make up the asic is walked and
4468  * the check_soft_reset callbacks are run.  check_soft_reset determines
4469  * if the asic is still hung or not.
4470  * Returns true if any of the IPs are still in a hung state, false if not.
4471  */
4472 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4473 {
4474 	int i;
4475 	bool asic_hang = false;
4476 
4477 	if (amdgpu_sriov_vf(adev))
4478 		return true;
4479 
4480 	if (amdgpu_asic_need_full_reset(adev))
4481 		return true;
4482 
4483 	for (i = 0; i < adev->num_ip_blocks; i++) {
4484 		if (!adev->ip_blocks[i].status.valid)
4485 			continue;
4486 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4487 			adev->ip_blocks[i].status.hang =
4488 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4489 		if (adev->ip_blocks[i].status.hang) {
4490 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4491 			asic_hang = true;
4492 		}
4493 	}
4494 	return asic_hang;
4495 }
4496 
4497 /**
4498  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4499  *
4500  * @adev: amdgpu_device pointer
4501  *
4502  * The list of all the hardware IPs that make up the asic is walked and the
4503  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4504  * handles any IP specific hardware or software state changes that are
4505  * necessary for a soft reset to succeed.
4506  * Returns 0 on success, negative error code on failure.
4507  */
4508 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4509 {
4510 	int i, r = 0;
4511 
4512 	for (i = 0; i < adev->num_ip_blocks; i++) {
4513 		if (!adev->ip_blocks[i].status.valid)
4514 			continue;
4515 		if (adev->ip_blocks[i].status.hang &&
4516 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4517 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4518 			if (r)
4519 				return r;
4520 		}
4521 	}
4522 
4523 	return 0;
4524 }
4525 
4526 /**
4527  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4528  *
4529  * @adev: amdgpu_device pointer
4530  *
4531  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4532  * reset is necessary to recover.
4533  * Returns true if a full asic reset is required, false if not.
4534  */
4535 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4536 {
4537 	int i;
4538 
4539 	if (amdgpu_asic_need_full_reset(adev))
4540 		return true;
4541 
4542 	for (i = 0; i < adev->num_ip_blocks; i++) {
4543 		if (!adev->ip_blocks[i].status.valid)
4544 			continue;
4545 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4546 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4547 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4548 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4549 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4550 			if (adev->ip_blocks[i].status.hang) {
4551 				dev_info(adev->dev, "Some block need full reset!\n");
4552 				return true;
4553 			}
4554 		}
4555 	}
4556 	return false;
4557 }
4558 
4559 /**
4560  * amdgpu_device_ip_soft_reset - do a soft reset
4561  *
4562  * @adev: amdgpu_device pointer
4563  *
4564  * The list of all the hardware IPs that make up the asic is walked and the
4565  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4566  * IP specific hardware or software state changes that are necessary to soft
4567  * reset the IP.
4568  * Returns 0 on success, negative error code on failure.
4569  */
4570 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4571 {
4572 	int i, r = 0;
4573 
4574 	for (i = 0; i < adev->num_ip_blocks; i++) {
4575 		if (!adev->ip_blocks[i].status.valid)
4576 			continue;
4577 		if (adev->ip_blocks[i].status.hang &&
4578 		    adev->ip_blocks[i].version->funcs->soft_reset) {
4579 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4580 			if (r)
4581 				return r;
4582 		}
4583 	}
4584 
4585 	return 0;
4586 }
4587 
4588 /**
4589  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4590  *
4591  * @adev: amdgpu_device pointer
4592  *
4593  * The list of all the hardware IPs that make up the asic is walked and the
4594  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4595  * handles any IP specific hardware or software state changes that are
4596  * necessary after the IP has been soft reset.
4597  * Returns 0 on success, negative error code on failure.
4598  */
4599 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4600 {
4601 	int i, r = 0;
4602 
4603 	for (i = 0; i < adev->num_ip_blocks; i++) {
4604 		if (!adev->ip_blocks[i].status.valid)
4605 			continue;
4606 		if (adev->ip_blocks[i].status.hang &&
4607 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4608 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4609 		if (r)
4610 			return r;
4611 	}
4612 
4613 	return 0;
4614 }
4615 
4616 /**
4617  * amdgpu_device_recover_vram - Recover some VRAM contents
4618  *
4619  * @adev: amdgpu_device pointer
4620  *
4621  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4622  * restore things like GPUVM page tables after a GPU reset where
4623  * the contents of VRAM might be lost.
4624  *
4625  * Returns:
4626  * 0 on success, negative error code on failure.
4627  */
4628 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4629 {
4630 	struct dma_fence *fence = NULL, *next = NULL;
4631 	struct amdgpu_bo *shadow;
4632 	struct amdgpu_bo_vm *vmbo;
4633 	long r = 1, tmo;
4634 
4635 	if (amdgpu_sriov_runtime(adev))
4636 		tmo = msecs_to_jiffies(8000);
4637 	else
4638 		tmo = msecs_to_jiffies(100);
4639 
4640 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4641 	mutex_lock(&adev->shadow_list_lock);
4642 	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4643 		/* If vm is compute context or adev is APU, shadow will be NULL */
4644 		if (!vmbo->shadow)
4645 			continue;
4646 		shadow = vmbo->shadow;
4647 
4648 		/* No need to recover an evicted BO */
4649 		if (!shadow->tbo.resource ||
4650 		    shadow->tbo.resource->mem_type != TTM_PL_TT ||
4651 		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4652 		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4653 			continue;
4654 
4655 		r = amdgpu_bo_restore_shadow(shadow, &next);
4656 		if (r)
4657 			break;
4658 
4659 		if (fence) {
4660 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4661 			dma_fence_put(fence);
4662 			fence = next;
4663 			if (tmo == 0) {
4664 				r = -ETIMEDOUT;
4665 				break;
4666 			} else if (tmo < 0) {
4667 				r = tmo;
4668 				break;
4669 			}
4670 		} else {
4671 			fence = next;
4672 		}
4673 	}
4674 	mutex_unlock(&adev->shadow_list_lock);
4675 
4676 	if (fence)
4677 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4678 	dma_fence_put(fence);
4679 
4680 	if (r < 0 || tmo <= 0) {
4681 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4682 		return -EIO;
4683 	}
4684 
4685 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4686 	return 0;
4687 }
4688 
4689 
4690 /**
4691  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4692  *
4693  * @adev: amdgpu_device pointer
4694  * @from_hypervisor: request from hypervisor
4695  *
4696  * do VF FLR and reinitialize Asic
4697  * Do a VF FLR and reinitialize the ASIC.
4698  * Returns 0 on success, negative error code on failure.
4699 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4700 				     bool from_hypervisor)
4701 {
4702 	int r;
4703 	struct amdgpu_hive_info *hive = NULL;
4704 	int retry_limit = 0;
4705 
4706 retry:
4707 	amdgpu_amdkfd_pre_reset(adev);
4708 
4709 	if (from_hypervisor)
4710 		r = amdgpu_virt_request_full_gpu(adev, true);
4711 	else
4712 		r = amdgpu_virt_reset_gpu(adev);
4713 	if (r)
4714 		return r;
4715 	amdgpu_irq_gpu_reset_resume_helper(adev);
4716 
4717 	/* some SW cleanup the VF needs to do before recovery */
4718 	amdgpu_virt_post_reset(adev);
4719 
4720 	/* Resume IP prior to SMC */
4721 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4722 	if (r)
4723 		goto error;
4724 
4725 	amdgpu_virt_init_data_exchange(adev);
4726 
4727 	r = amdgpu_device_fw_loading(adev);
4728 	if (r)
4729 		return r;
4730 
4731 	/* now we are okay to resume SMC/CP/SDMA */
4732 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4733 	if (r)
4734 		goto error;
4735 
4736 	hive = amdgpu_get_xgmi_hive(adev);
4737 	/* Update PSP FW topology after reset */
4738 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4739 		r = amdgpu_xgmi_update_topology(hive, adev);
4740 
4741 	if (hive)
4742 		amdgpu_put_xgmi_hive(hive);
4743 
4744 	if (!r) {
4745 		r = amdgpu_ib_ring_tests(adev);
4746 
4747 		amdgpu_amdkfd_post_reset(adev);
4748 	}
4749 
4750 error:
4751 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4752 		amdgpu_inc_vram_lost(adev);
4753 		r = amdgpu_device_recover_vram(adev);
4754 	}
4755 	amdgpu_virt_release_full_gpu(adev, true);
4756 
4757 	if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4758 		if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4759 			retry_limit++;
4760 			goto retry;
4761 		} else
4762 			DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4763 	}
4764 
4765 	return r;
4766 }
4767 
4768 /**
4769  * amdgpu_device_has_job_running - check if there is any job in mirror list
4770  * amdgpu_device_has_job_running - check if there is any job in the pending list
4771  * @adev: amdgpu_device pointer
4772  *
4773  * check if there is any job in mirror list
4774  * check if there is any job in the pending list
4775 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4776 {
4777 	int i;
4778 	struct drm_sched_job *job;
4779 
4780 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4781 		struct amdgpu_ring *ring = adev->rings[i];
4782 
4783 		if (!ring || !ring->sched.thread)
4784 			continue;
4785 
4786 		spin_lock(&ring->sched.job_list_lock);
4787 		job = list_first_entry_or_null(&ring->sched.pending_list,
4788 					       struct drm_sched_job, list);
4789 		spin_unlock(&ring->sched.job_list_lock);
4790 		if (job)
4791 			return true;
4792 	}
4793 	return false;
4794 }
4795 
4796 /**
4797  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4798  *
4799  * @adev: amdgpu_device pointer
4800  *
4801  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4802  * a hung GPU.
4803  */
4804 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4805 {
4806 
4807 	if (amdgpu_gpu_recovery == 0)
4808 		goto disabled;
4809 
4810 	/* Skip soft reset check in fatal error mode */
4811 	if (!amdgpu_ras_is_poison_mode_supported(adev))
4812 		return true;
4813 
4814 	if (amdgpu_sriov_vf(adev))
4815 		return true;
4816 
4817 	if (amdgpu_gpu_recovery == -1) {
4818 		switch (adev->asic_type) {
4819 #ifdef CONFIG_DRM_AMDGPU_SI
4820 		case CHIP_VERDE:
4821 		case CHIP_TAHITI:
4822 		case CHIP_PITCAIRN:
4823 		case CHIP_OLAND:
4824 		case CHIP_HAINAN:
4825 #endif
4826 #ifdef CONFIG_DRM_AMDGPU_CIK
4827 		case CHIP_KAVERI:
4828 		case CHIP_KABINI:
4829 		case CHIP_MULLINS:
4830 #endif
4831 		case CHIP_CARRIZO:
4832 		case CHIP_STONEY:
4833 		case CHIP_CYAN_SKILLFISH:
4834 			goto disabled;
4835 		default:
4836 			break;
4837 		}
4838 	}
4839 
4840 	return true;
4841 
4842 disabled:
4843 		dev_info(adev->dev, "GPU recovery disabled.\n");
4844 		return false;
4845 }
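
/*
 * Example (editorial sketch; the "auto" default of -1 is an assumption):
 * amdgpu_gpu_recovery=0 always refuses recovery, =1 always attempts it, and
 * the auto setting attempts it everywhere except on the older ASICs listed
 * in the switch above.  SR-IOV VFs and fatal-error (non-poison) RAS modes
 * always attempt recovery.
 */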
4846 
4847 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4848 {
4849 	u32 i;
4850 	int ret = 0;
4851 
4852 	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4853 
4854 	dev_info(adev->dev, "GPU mode1 reset\n");
4855 
4856 	/* Cache the state before bus master disable. The saved config space
4857 	 * values are used in other cases like restore after mode-2 reset.
4858 	 */
4859 	amdgpu_device_cache_pci_state(adev->pdev);
4860 
4861 	/* disable BM */
4862 	pci_clear_master(adev->pdev);
4863 
4864 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4865 		dev_info(adev->dev, "GPU smu mode1 reset\n");
4866 		ret = amdgpu_dpm_mode1_reset(adev);
4867 	} else {
4868 		dev_info(adev->dev, "GPU psp mode1 reset\n");
4869 		ret = psp_gpu_reset(adev);
4870 	}
4871 
4872 	if (ret)
4873 		goto mode1_reset_failed;
4874 
4875 	amdgpu_device_load_pci_state(adev->pdev);
4876 	ret = amdgpu_psp_wait_for_bootloader(adev);
4877 	if (ret)
4878 		goto mode1_reset_failed;
4879 
4880 	/* wait for asic to come out of reset */
4881 	for (i = 0; i < adev->usec_timeout; i++) {
4882 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
4883 
4884 		if (memsize != 0xffffffff)
4885 			break;
4886 		udelay(1);
4887 	}
4888 
4889 	if (i >= adev->usec_timeout) {
4890 		ret = -ETIMEDOUT;
4891 		goto mode1_reset_failed;
4892 	}
4893 
4894 	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4895 
4896 	return 0;
4897 
4898 mode1_reset_failed:
4899 	dev_err(adev->dev, "GPU mode1 reset failed\n");
4900 	return ret;
4901 }
4902 
4903 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4904 				 struct amdgpu_reset_context *reset_context)
4905 {
4906 	int i, r = 0;
4907 	struct amdgpu_job *job = NULL;
4908 	bool need_full_reset =
4909 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4910 
4911 	if (reset_context->reset_req_dev == adev)
4912 		job = reset_context->job;
4913 
4914 	if (amdgpu_sriov_vf(adev)) {
4915 		/* stop the data exchange thread */
4916 		amdgpu_virt_fini_data_exchange(adev);
4917 	}
4918 
4919 	amdgpu_fence_driver_isr_toggle(adev, true);
4920 
4921 	/* block all schedulers and reset given job's ring */
4922 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4923 		struct amdgpu_ring *ring = adev->rings[i];
4924 
4925 		if (!ring || !ring->sched.thread)
4926 			continue;
4927 
4928 		/* Clear the job fences from the fence driver to avoid
4929 		 * force_completion leaving NULL and vm flush fences in the fence driver
4930 		 */
4931 		amdgpu_fence_driver_clear_job_fences(ring);
4932 
4933 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4934 		amdgpu_fence_driver_force_completion(ring);
4935 	}
4936 
4937 	amdgpu_fence_driver_isr_toggle(adev, false);
4938 
4939 	if (job && job->vm)
4940 		drm_sched_increase_karma(&job->base);
4941 
4942 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4943 	/* If reset handler not implemented, continue; otherwise return */
4944 	if (r == -EOPNOTSUPP)
4945 		r = 0;
4946 	else
4947 		return r;
4948 
4949 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4950 	if (!amdgpu_sriov_vf(adev)) {
4951 
4952 		if (!need_full_reset)
4953 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4954 
4955 		if (!need_full_reset && amdgpu_gpu_recovery &&
4956 		    amdgpu_device_ip_check_soft_reset(adev)) {
4957 			amdgpu_device_ip_pre_soft_reset(adev);
4958 			r = amdgpu_device_ip_soft_reset(adev);
4959 			amdgpu_device_ip_post_soft_reset(adev);
4960 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4961 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4962 				need_full_reset = true;
4963 			}
4964 		}
4965 
4966 		if (need_full_reset)
4967 			r = amdgpu_device_ip_suspend(adev);
4968 		if (need_full_reset)
4969 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4970 		else
4971 			clear_bit(AMDGPU_NEED_FULL_RESET,
4972 				  &reset_context->flags);
4973 	}
4974 
4975 	return r;
4976 }
4977 
4978 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4979 {
4980 	int i;
4981 
4982 	lockdep_assert_held(&adev->reset_domain->sem);
4983 
4984 	for (i = 0; i < adev->num_regs; i++) {
4985 		adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4986 		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4987 					     adev->reset_dump_reg_value[i]);
4988 	}
4989 
4990 	return 0;
4991 }
4992 
4993 #ifdef CONFIG_DEV_COREDUMP
4994 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4995 		size_t count, void *data, size_t datalen)
4996 {
4997 	struct drm_printer p;
4998 	struct amdgpu_device *adev = data;
4999 	struct drm_print_iterator iter;
5000 	int i;
5001 
5002 	iter.data = buffer;
5003 	iter.offset = 0;
5004 	iter.start = offset;
5005 	iter.remain = count;
5006 
5007 	p = drm_coredump_printer(&iter);
5008 
5009 	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
5010 	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
5011 	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
5012 	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
5013 	if (adev->reset_task_info.pid)
5014 		drm_printf(&p, "process_name: %s PID: %d\n",
5015 			   adev->reset_task_info.process_name,
5016 			   adev->reset_task_info.pid);
5017 
5018 	if (adev->reset_vram_lost)
5019 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
5020 	if (adev->num_regs) {
5021 		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
5022 
5023 		for (i = 0; i < adev->num_regs; i++)
5024 			drm_printf(&p, "0x%08x: 0x%08x\n",
5025 				   adev->reset_dump_reg_list[i],
5026 				   adev->reset_dump_reg_value[i]);
5027 	}
5028 
5029 	return count - iter.remain;
5030 }
5031 
5032 static void amdgpu_devcoredump_free(void *data)
5033 {
5034 }
5035 
5036 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
5037 {
5038 	struct drm_device *dev = adev_to_drm(adev);
5039 
5040 	ktime_get_ts64(&adev->reset_time);
5041 	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
5042 		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
5043 }
5044 #endif
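
/*
 * Editorial note (Linux-side usage, stated as an assumption rather than
 * taken from this file): a dump registered through dev_coredumpm() above is
 * typically read back from /sys/class/devcoredump/devcdN/data, and writing
 * anything to that file discards the dump.
 */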
5045 
5046 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5047 			 struct amdgpu_reset_context *reset_context)
5048 {
5049 	struct amdgpu_device *tmp_adev = NULL;
5050 	bool need_full_reset, skip_hw_reset, vram_lost = false;
5051 	int r = 0;
5052 	bool gpu_reset_for_dev_remove = 0;
5053 
5054 	/* Try reset handler method first */
5055 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5056 				    reset_list);
5057 	amdgpu_reset_reg_dumps(tmp_adev);
5058 
5059 	reset_context->reset_device_list = device_list_handle;
5060 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5061 	/* If reset handler not implemented, continue; otherwise return */
5062 	if (r == -EOPNOTSUPP)
5063 		r = 0;
5064 	else
5065 		return r;
5066 
5067 	/* Reset handler not implemented, use the default method */
5068 	need_full_reset =
5069 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5070 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5071 
5072 	gpu_reset_for_dev_remove =
5073 		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5074 			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5075 
5076 	/*
5077 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
5078 	 * to allow proper link negotiation in FW (within 1 sec)
5079 	 */
5080 	if (!skip_hw_reset && need_full_reset) {
5081 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5082 			/* For XGMI run all resets in parallel to speed up the process */
5083 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5084 				tmp_adev->gmc.xgmi.pending_reset = false;
5085 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5086 					r = -EALREADY;
5087 			} else
5088 				r = amdgpu_asic_reset(tmp_adev);
5089 
5090 			if (r) {
5091 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5092 					 r, adev_to_drm(tmp_adev)->unique);
5093 				break;
5094 			}
5095 		}
5096 
5097 		/* For XGMI wait for all resets to complete before proceeding */
5098 		if (!r) {
5099 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5100 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5101 					flush_work(&tmp_adev->xgmi_reset_work);
5102 					r = tmp_adev->asic_reset_res;
5103 					if (r)
5104 						break;
5105 				}
5106 			}
5107 		}
5108 	}
5109 
5110 	if (!r && amdgpu_ras_intr_triggered()) {
5111 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5112 			if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5113 			    tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5114 				tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5115 		}
5116 
5117 		amdgpu_ras_intr_cleared();
5118 	}
5119 
5120 	/* Since the mode1 reset affects base ip blocks, the
5121 	 * phase1 ip blocks need to be resumed. Otherwise there
5122 	 * will be a BIOS signature error and the psp bootloader
5123 	 * can't load kdb on the next amdgpu install.
5124 	 */
5125 	if (gpu_reset_for_dev_remove) {
5126 		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5127 			amdgpu_device_ip_resume_phase1(tmp_adev);
5128 
5129 		goto end;
5130 	}
5131 
5132 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5133 		if (need_full_reset) {
5134 			/* post card */
5135 			r = amdgpu_device_asic_init(tmp_adev);
5136 			if (r) {
5137 				dev_warn(tmp_adev->dev, "asic atom init failed!");
5138 			} else {
5139 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5140 
5141 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
5142 				if (r)
5143 					goto out;
5144 
5145 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5146 #ifdef CONFIG_DEV_COREDUMP
5147 				tmp_adev->reset_vram_lost = vram_lost;
5148 				memset(&tmp_adev->reset_task_info, 0,
5149 						sizeof(tmp_adev->reset_task_info));
5150 				if (reset_context->job && reset_context->job->vm)
5151 					tmp_adev->reset_task_info =
5152 						reset_context->job->vm->task_info;
5153 				amdgpu_reset_capture_coredumpm(tmp_adev);
5154 #endif
5155 				if (vram_lost) {
5156 					DRM_INFO("VRAM is lost due to GPU reset!\n");
5157 					amdgpu_inc_vram_lost(tmp_adev);
5158 				}
5159 
5160 				r = amdgpu_device_fw_loading(tmp_adev);
5161 				if (r)
5162 					return r;
5163 
5164 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
5165 				if (r)
5166 					goto out;
5167 
5168 				r = amdgpu_device_ip_resume_phase3(tmp_adev);
5169 				if (r)
5170 					goto out;
5171 
5172 				if (vram_lost)
5173 					amdgpu_device_fill_reset_magic(tmp_adev);
5174 
5175 				/*
5176 				 * Add this ASIC back as a tracked instance now
5177 				 * that the reset has completed successfully.
5178 				 */
5179 				amdgpu_register_gpu_instance(tmp_adev);
5180 
5181 				if (!reset_context->hive &&
5182 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5183 					amdgpu_xgmi_add_device(tmp_adev);
5184 
5185 				r = amdgpu_device_ip_late_init(tmp_adev);
5186 				if (r)
5187 					goto out;
5188 
5189 				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5190 
5191 				/*
5192 				 * The GPU enters a bad state once the number of
5193 				 * faulty pages detected by ECC reaches the
5194 				 * threshold, and RAS recovery is scheduled next.
5195 				 * Check here and break recovery if the bad page
5196 				 * threshold has indeed been exceeded, reminding
5197 				 * the user to retire this GPU or to set a bigger
5198 				 * bad_page_threshold value before probing the
5199 				 * driver again.
5200 				 */
5201 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5202 					/* must succeed. */
5203 					amdgpu_ras_resume(tmp_adev);
5204 				} else {
5205 					r = -EINVAL;
5206 					goto out;
5207 				}
5208 
5209 				/* Update PSP FW topology after reset */
5210 				if (reset_context->hive &&
5211 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5212 					r = amdgpu_xgmi_update_topology(
5213 						reset_context->hive, tmp_adev);
5214 			}
5215 		}
5216 
5217 out:
5218 		if (!r) {
5219 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5220 			r = amdgpu_ib_ring_tests(tmp_adev);
5221 			if (r) {
5222 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5223 				need_full_reset = true;
5224 				r = -EAGAIN;
5225 				goto end;
5226 			}
5227 		}
5228 
5229 		if (!r)
5230 			r = amdgpu_device_recover_vram(tmp_adev);
5231 		else
5232 			tmp_adev->asic_reset_res = r;
5233 	}
5234 
5235 end:
5236 	if (need_full_reset)
5237 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5238 	else
5239 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5240 	return r;
5241 }
5242 
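/*
 * Record which MP1 state matches the chosen reset method: SHUTDOWN for
 * mode1, RESET for mode2, NONE otherwise.
 */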
5243 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5244 {
5245 
5246 	switch (amdgpu_asic_reset_method(adev)) {
5247 	case AMD_RESET_METHOD_MODE1:
5248 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5249 		break;
5250 	case AMD_RESET_METHOD_MODE2:
5251 		adev->mp1_state = PP_MP1_STATE_RESET;
5252 		break;
5253 	default:
5254 		adev->mp1_state = PP_MP1_STATE_NONE;
5255 		break;
5256 	}
5257 
5259 }
5260 
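/* Flush any pending VF error records and put the MP1 state back to NONE. */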
5261 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5262 {
5263 	amdgpu_vf_error_trans_all(adev);
5264 	adev->mp1_state = PP_MP1_STATE_NONE;
5265 }
5266 
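/*
 * Re-enable runtime PM on the GPU's display audio (HDA) function and resume
 * it after the reset; stubbed out on OpenBSD for now (notyet).
 */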
5267 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5268 {
5269 	STUB();
5270 #ifdef notyet
5271 	struct pci_dev *p = NULL;
5272 
5273 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5274 			adev->pdev->bus->number, 1);
5275 	if (p) {
5276 		pm_runtime_enable(&(p->dev));
5277 		pm_runtime_resume(&(p->dev));
5278 	}
5279 #endif
5280 }
5281 
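/*
 * Suspend the GPU's display audio (HDA) function before a BACO or mode1
 * reset so the reset does not change the audio hardware behind the audio
 * driver's back. Stubbed out on OpenBSD (returns -ENOSYS); the Linux path
 * is kept under notyet.
 */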
5282 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5283 {
5284 	enum amd_reset_method reset_method;
5285 	struct pci_dev *p = NULL;
5286 	u64 expires;
5287 
5288 	/*
5289 	 * For now, only BACO and mode1 reset are confirmed to
5290 	 * hit the audio issue if the codec is not properly suspended.
5291 	 */
5292 	reset_method = amdgpu_asic_reset_method(adev);
5293 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
5294 	     (reset_method != AMD_RESET_METHOD_MODE1))
5295 		return -EINVAL;
5296 
5297 	STUB();
5298 	return -ENOSYS;
5299 #ifdef notyet
5300 
5301 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5302 			adev->pdev->bus->number, 1);
5303 	if (!p)
5304 		return -ENODEV;
5305 
5306 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
5307 	if (!expires)
5308 		/*
5309 		 * If we cannot get the audio device autosuspend delay,
5310 		 * fall back to a fixed 4 second interval. Since the audio
5311 		 * controller's default autosuspend delay is 3 seconds,
5312 		 * 4 seconds is guaranteed to cover it.
5313 		 */
5314 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5315 
5316 	while (!pm_runtime_status_suspended(&(p->dev))) {
5317 		if (!pm_runtime_suspend(&(p->dev)))
5318 			break;
5319 
5320 		if (expires < ktime_get_mono_fast_ns()) {
5321 			dev_warn(adev->dev, "failed to suspend display audio\n");
5322 			pci_dev_put(p);
5323 			/* TODO: abort the succeeding gpu reset? */
5324 			return -ETIMEDOUT;
5325 		}
5326 	}
5327 
5328 	pm_runtime_disable(&(p->dev));
5329 
5330 	pci_dev_put(p);
5331 	return 0;
5332 #endif
5333 }
5334 
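/*
 * Cancel every reset work item that may still be queued (the debugfs-
 * triggered reset, KFD reset, SR-IOV FLR and RAS recovery work) so they do
 * not race with the reset that is already in progress.
 */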
5335 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5336 {
5337 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5338 
5339 #if defined(CONFIG_DEBUG_FS)
5340 	if (!amdgpu_sriov_vf(adev))
5341 		cancel_work(&adev->reset_work);
5342 #endif
5343 
5344 	if (adev->kfd.dev)
5345 		cancel_work(&adev->kfd.reset_work);
5346 
5347 	if (amdgpu_sriov_vf(adev))
5348 		cancel_work(&adev->virt.flr_work);
5349 
5350 	if (con && adev->ras_enabled)
5351 		cancel_work(&con->recovery_work);
5352 
5353 }
5354 
5355 /**
5356  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5357  *
5358  * @adev: amdgpu_device pointer
5359  * @job: the job that triggered the hang
5360  * @reset_context: amdgpu reset context pointer
5361  *
5362  * Attempt to reset the GPU if it has hung (all asics).
5363  * Attempts a soft reset or full reset and reinitializes the ASIC.
5364  * Returns 0 for success or an error on failure.
5365  */
5366 
5367 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5368 			      struct amdgpu_job *job,
5369 			      struct amdgpu_reset_context *reset_context)
5370 {
5371 	struct list_head device_list, *device_list_handle =  NULL;
5372 	bool job_signaled = false;
5373 	struct amdgpu_hive_info *hive = NULL;
5374 	struct amdgpu_device *tmp_adev = NULL;
5375 	int i, r = 0;
5376 	bool need_emergency_restart = false;
5377 	bool audio_suspended = false;
5378 	bool gpu_reset_for_dev_remove = false;
5379 
5380 	gpu_reset_for_dev_remove =
5381 			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5382 				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5383 
5384 	/*
5385 	 * Special case: RAS triggered and full reset isn't supported
5386 	 */
5387 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5388 
5389 	/*
5390 	 * Flush RAM to disk so that after reboot
5391 	 * the user can read the log and see why the system rebooted.
5392 	 */
5393 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5394 		amdgpu_ras_get_context(adev)->reboot) {
5395 		DRM_WARN("Emergency reboot.");
5396 
5397 #ifdef notyet
5398 		ksys_sync_helper();
5399 		emergency_restart();
5400 #else
5401 		panic("emergency_restart");
5402 #endif
5403 	}
5404 
5405 	dev_info(adev->dev, "GPU %s begin!\n",
5406 		need_emergency_restart ? "jobs stop":"reset");
5407 
5408 	if (!amdgpu_sriov_vf(adev))
5409 		hive = amdgpu_get_xgmi_hive(adev);
5410 	if (hive)
5411 		mutex_lock(&hive->hive_lock);
5412 
5413 	reset_context->job = job;
5414 	reset_context->hive = hive;
5415 	/*
5416 	 * Build list of devices to reset.
5417 	 * In case we are in XGMI hive mode, resort the device list
5418 	 * to put adev in the 1st position.
5419 	 */
5420 	INIT_LIST_HEAD(&device_list);
5421 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5422 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5423 			list_add_tail(&tmp_adev->reset_list, &device_list);
5424 			if (gpu_reset_for_dev_remove && adev->shutdown)
5425 				tmp_adev->shutdown = true;
5426 		}
5427 		if (!list_is_first(&adev->reset_list, &device_list))
5428 			list_rotate_to_front(&adev->reset_list, &device_list);
5429 		device_list_handle = &device_list;
5430 	} else {
5431 		list_add_tail(&adev->reset_list, &device_list);
5432 		device_list_handle = &device_list;
5433 	}
5434 
5435 	/* We need to lock reset domain only once both for XGMI and single device */
5436 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5437 				    reset_list);
5438 	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5439 
5440 	/* block all schedulers and reset given job's ring */
5441 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5442 
5443 		amdgpu_device_set_mp1_state(tmp_adev);
5444 
5445 		/*
5446 		 * Try to put the audio codec into suspend state
5447 		 * before the GPU reset starts.
5448 		 *
5449 		 * The graphics device shares its power domain with
5450 		 * the AZ (audio) power domain. Without this, we may
5451 		 * end up changing the audio hardware from behind the
5452 		 * audio driver's back, which triggers audio codec
5453 		 * errors.
5454 		 */
5455 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
5456 			audio_suspended = true;
5457 
5458 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
5459 
5460 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5461 
5462 		if (!amdgpu_sriov_vf(tmp_adev))
5463 			amdgpu_amdkfd_pre_reset(tmp_adev);
5464 
5465 		/*
5466 		 * Mark these ASICs as untracked first, then add
5467 		 * them back once the reset has completed.
5468 		 */
5469 		amdgpu_unregister_gpu_instance(tmp_adev);
5470 
5471 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5472 
5473 		/* disable ras on ALL IPs */
5474 		if (!need_emergency_restart &&
5475 		      amdgpu_device_ip_need_full_reset(tmp_adev))
5476 			amdgpu_ras_suspend(tmp_adev);
5477 
5478 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5479 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5480 
5481 			if (!ring || !ring->sched.thread)
5482 				continue;
5483 
5484 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5485 
5486 			if (need_emergency_restart)
5487 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5488 		}
5489 		atomic_inc(&tmp_adev->gpu_reset_counter);
5490 	}
5491 
5492 	if (need_emergency_restart)
5493 		goto skip_sched_resume;
5494 
5495 	/*
5496 	 * Must check guilty signal here since after this point all old
5497 	 * HW fences are force signaled.
5498 	 *
5499 	 * job->base holds a reference to parent fence
5500 	 */
5501 	if (job && dma_fence_is_signaled(&job->hw_fence)) {
5502 		job_signaled = true;
5503 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5504 		goto skip_hw_reset;
5505 	}
5506 
5507 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
5508 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5509 		if (gpu_reset_for_dev_remove) {
5510 			/* Workaround for ASICs that need to disable SMC first */
5511 			amdgpu_device_smu_fini_early(tmp_adev);
5512 		}
5513 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5514 		/* TODO: Should we stop? */
5515 		if (r) {
5516 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5517 				  r, adev_to_drm(tmp_adev)->unique);
5518 			tmp_adev->asic_reset_res = r;
5519 		}
5520 
5521 		/*
5522 		 * Drop all pending non-scheduler resets. Scheduler resets
5523 		 * were already dropped during drm_sched_stop.
5524 		 */
5525 		amdgpu_device_stop_pending_resets(tmp_adev);
5526 	}
5527 
5528 	/* Actual ASIC resets if needed. */
5529 	/* Host driver will handle XGMI hive reset for SRIOV */
5530 	if (amdgpu_sriov_vf(adev)) {
5531 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
5532 		if (r)
5533 			adev->asic_reset_res = r;
5534 
5535 		/* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */
5536 		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5537 		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5538 			amdgpu_ras_resume(adev);
5539 	} else {
5540 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5541 		if (r && r == -EAGAIN)
5542 			goto retry;
5543 
5544 		if (!r && gpu_reset_for_dev_remove)
5545 			goto recover_end;
5546 	}
5547 
5548 skip_hw_reset:
5549 
5550 	/* Post ASIC reset for all devs. */
5551 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5552 
5553 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5554 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5555 
5556 			if (!ring || !ring->sched.thread)
5557 				continue;
5558 
5559 			drm_sched_start(&ring->sched, true);
5560 		}
5561 
5562 		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5563 			amdgpu_mes_self_test(tmp_adev);
5564 
5565 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5566 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5567 
5568 		if (tmp_adev->asic_reset_res)
5569 			r = tmp_adev->asic_reset_res;
5570 
5571 		tmp_adev->asic_reset_res = 0;
5572 
5573 		if (r) {
5574 			/* bad news, how to tell it to userspace ? */
5575 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5576 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5577 		} else {
5578 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5579 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5580 				DRM_WARN("smart shift update failed\n");
5581 		}
5582 	}
5583 
5584 skip_sched_resume:
5585 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5586 		/* unlock kfd: SRIOV would do it separately */
5587 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5588 			amdgpu_amdkfd_post_reset(tmp_adev);
5589 
5590 		/* kfd_post_reset will do nothing if kfd device is not initialized,
5591 		 * so bring up kfd here if it was not initialized before.
5592 		 */
5593 		if (!adev->kfd.init_complete)
5594 			amdgpu_amdkfd_device_init(adev);
5595 
5596 		if (audio_suspended)
5597 			amdgpu_device_resume_display_audio(tmp_adev);
5598 
5599 		amdgpu_device_unset_mp1_state(tmp_adev);
5600 
5601 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
5602 	}
5603 
5604 recover_end:
5605 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5606 					    reset_list);
5607 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5608 
5609 	if (hive) {
5610 		mutex_unlock(&hive->hive_lock);
5611 		amdgpu_put_xgmi_hive(hive);
5612 	}
5613 
5614 	if (r)
5615 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5616 
5617 	atomic_set(&adev->reset_domain->reset_res, r);
5618 	return r;
5619 }
5620 
5621 /**
5622  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5623  *
5624  * @adev: amdgpu_device pointer
5625  *
5626  * Fetches and stores in the driver the PCIE capabilities (gen speed
5627  * and lanes) of the slot the device is in. Handles APUs and
5628  * virtualized environments where PCIE config space may not be available.
5629  */
5630 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5631 {
5632 	struct pci_dev *pdev;
5633 	enum pci_bus_speed speed_cap, platform_speed_cap;
5634 	enum pcie_link_width platform_link_width;
5635 
5636 	if (amdgpu_pcie_gen_cap)
5637 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5638 
5639 	if (amdgpu_pcie_lane_cap)
5640 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5641 
5642 	/* covers APUs as well */
5643 	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5644 		if (adev->pm.pcie_gen_mask == 0)
5645 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5646 		if (adev->pm.pcie_mlw_mask == 0)
5647 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5648 		return;
5649 	}
5650 
5651 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5652 		return;
5653 
5654 	pcie_bandwidth_available(adev->pdev, NULL,
5655 				 &platform_speed_cap, &platform_link_width);
5656 
5657 	if (adev->pm.pcie_gen_mask == 0) {
5658 		/* asic caps */
5659 		pdev = adev->pdev;
5660 		speed_cap = pcie_get_speed_cap(pdev);
5661 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5662 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5663 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5664 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5665 		} else {
5666 			if (speed_cap == PCIE_SPEED_32_0GT)
5667 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5668 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5669 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5670 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5671 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5672 			else if (speed_cap == PCIE_SPEED_16_0GT)
5673 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5674 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5675 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5676 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5677 			else if (speed_cap == PCIE_SPEED_8_0GT)
5678 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5679 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5680 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5681 			else if (speed_cap == PCIE_SPEED_5_0GT)
5682 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5683 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5684 			else
5685 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5686 		}
5687 		/* platform caps */
5688 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5689 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5690 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5691 		} else {
5692 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5693 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5694 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5695 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5696 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5697 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5698 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5699 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5700 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5701 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5702 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5703 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5704 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5705 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5706 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5707 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5708 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5709 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5710 			else
5711 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5712 
5713 		}
5714 	}
5715 	if (adev->pm.pcie_mlw_mask == 0) {
5716 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5717 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5718 		} else {
5719 			switch (platform_link_width) {
5720 			case PCIE_LNK_X32:
5721 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5722 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5723 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5724 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5725 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5726 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5727 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5728 				break;
5729 			case PCIE_LNK_X16:
5730 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5731 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5732 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5733 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5734 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5735 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5736 				break;
5737 			case PCIE_LNK_X12:
5738 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5739 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5740 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5741 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5742 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5743 				break;
5744 			case PCIE_LNK_X8:
5745 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5746 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5747 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5748 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5749 				break;
5750 			case PCIE_LNK_X4:
5751 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5752 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5753 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5754 				break;
5755 			case PCIE_LNK_X2:
5756 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5757 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5758 				break;
5759 			case PCIE_LNK_X1:
5760 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5761 				break;
5762 			default:
5763 				break;
5764 			}
5765 		}
5766 	}
5767 }
5768 
5769 /**
5770  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5771  *
5772  * @adev: amdgpu_device pointer
5773  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5774  *
5775  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5776  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5777  * @peer_adev.
5778  */
5779 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5780 				      struct amdgpu_device *peer_adev)
5781 {
5782 #ifdef CONFIG_HSA_AMD_P2P
5783 	uint64_t address_mask = peer_adev->dev->dma_mask ?
5784 		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5785 	resource_size_t aper_limit =
5786 		adev->gmc.aper_base + adev->gmc.aper_size - 1;
5787 	bool p2p_access =
5788 		!adev->gmc.xgmi.connected_to_cpu &&
5789 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5790 
5791 	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5792 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5793 		!(adev->gmc.aper_base & address_mask ||
5794 		  aper_limit & address_mask));
5795 #else
5796 	return false;
5797 #endif
5798 }
5799 
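/*
 * Enter BACO. When RAS is enabled, doorbell interrupts are disabled first;
 * they are re-enabled again on BACO exit.
 */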
5800 int amdgpu_device_baco_enter(struct drm_device *dev)
5801 {
5802 	struct amdgpu_device *adev = drm_to_adev(dev);
5803 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5804 
5805 	if (!amdgpu_device_supports_baco(dev))
5806 		return -ENOTSUPP;
5807 
5808 	if (ras && adev->ras_enabled &&
5809 	    adev->nbio.funcs->enable_doorbell_interrupt)
5810 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5811 
5812 	return amdgpu_dpm_baco_enter(adev);
5813 }
5814 
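/*
 * Leave BACO, re-enable RAS doorbell interrupts and, in passthrough mode,
 * clear any stale doorbell interrupt state.
 */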
5815 int amdgpu_device_baco_exit(struct drm_device *dev)
5816 {
5817 	struct amdgpu_device *adev = drm_to_adev(dev);
5818 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5819 	int ret = 0;
5820 
5821 	if (!amdgpu_device_supports_baco(dev))
5822 		return -ENOTSUPP;
5823 
5824 	ret = amdgpu_dpm_baco_exit(adev);
5825 	if (ret)
5826 		return ret;
5827 
5828 	if (ras && adev->ras_enabled &&
5829 	    adev->nbio.funcs->enable_doorbell_interrupt)
5830 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5831 
5832 	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
5833 	    adev->nbio.funcs->clear_doorbell_interrupt)
5834 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
5835 
5836 	return 0;
5837 }
5838 
5839 /**
5840  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5841  * @pdev: PCI device struct
5842  * @state: PCI channel state
5843  *
5844  * Description: Called when a PCI error is detected.
5845  *
5846  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5847  */
5848 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5849 {
5850 	STUB();
5851 	return 0;
5852 #ifdef notyet
5853 	struct drm_device *dev = pci_get_drvdata(pdev);
5854 	struct amdgpu_device *adev = drm_to_adev(dev);
5855 	int i;
5856 
5857 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5858 
5859 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5860 		DRM_WARN("No support for XGMI hive yet...");
5861 		return PCI_ERS_RESULT_DISCONNECT;
5862 	}
5863 
5864 	adev->pci_channel_state = state;
5865 
5866 	switch (state) {
5867 	case pci_channel_io_normal:
5868 		return PCI_ERS_RESULT_CAN_RECOVER;
5869 	/* Fatal error, prepare for slot reset */
5870 	case pci_channel_io_frozen:
5871 		/*
5872 		 * Locking adev->reset_domain->sem will prevent any external access
5873 		 * to GPU during PCI error recovery
5874 		 */
5875 		amdgpu_device_lock_reset_domain(adev->reset_domain);
5876 		amdgpu_device_set_mp1_state(adev);
5877 
5878 		/*
5879 		 * Block any work scheduling as we do for regular GPU reset
5880 		 * for the duration of the recovery
5881 		 */
5882 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5883 			struct amdgpu_ring *ring = adev->rings[i];
5884 
5885 			if (!ring || !ring->sched.thread)
5886 				continue;
5887 
5888 			drm_sched_stop(&ring->sched, NULL);
5889 		}
5890 		atomic_inc(&adev->gpu_reset_counter);
5891 		return PCI_ERS_RESULT_NEED_RESET;
5892 	case pci_channel_io_perm_failure:
5893 		/* Permanent error, prepare for device removal */
5894 		return PCI_ERS_RESULT_DISCONNECT;
5895 	}
5896 
5897 	return PCI_ERS_RESULT_NEED_RESET;
5898 #endif
5899 }
5900 
5901 /**
5902  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5903  * @pdev: pointer to PCI device
5904  */
5905 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5906 {
5907 
5908 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5909 
5910 	/* TODO - dump whatever for debugging purposes */
5911 
5912 	/* This is called only if amdgpu_pci_error_detected returns
5913 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5914 	 * works, no need to reset slot.
5915 	 */
5916 
5917 	return PCI_ERS_RESULT_RECOVERED;
5918 }
5919 
5920 /**
5921  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5922  * @pdev: PCI device struct
5923  *
5924  * Description: This routine is called by the pci error recovery
5925  * code after the PCI slot has been reset, just before we
5926  * should resume normal operations.
5927  */
5928 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5929 {
5930 	STUB();
5931 	return PCI_ERS_RESULT_RECOVERED;
5932 #ifdef notyet
5933 	struct drm_device *dev = pci_get_drvdata(pdev);
5934 	struct amdgpu_device *adev = drm_to_adev(dev);
5935 	int r, i;
5936 	struct amdgpu_reset_context reset_context;
5937 	u32 memsize;
5938 	struct list_head device_list;
5939 
5940 	DRM_INFO("PCI error: slot reset callback!!\n");
5941 
5942 	memset(&reset_context, 0, sizeof(reset_context));
5943 
5944 	INIT_LIST_HEAD(&device_list);
5945 	list_add_tail(&adev->reset_list, &device_list);
5946 
5947 	/* wait for asic to come out of reset */
5948 	drm_msleep(500);
5949 
5950 	/* Restore PCI confspace */
5951 	amdgpu_device_load_pci_state(pdev);
5952 
5953 	/* confirm ASIC came out of reset */
5954 	for (i = 0; i < adev->usec_timeout; i++) {
5955 		memsize = amdgpu_asic_get_config_memsize(adev);
5956 
5957 		if (memsize != 0xffffffff)
5958 			break;
5959 		udelay(1);
5960 	}
5961 	if (memsize == 0xffffffff) {
5962 		r = -ETIME;
5963 		goto out;
5964 	}
5965 
5966 	reset_context.method = AMD_RESET_METHOD_NONE;
5967 	reset_context.reset_req_dev = adev;
5968 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5969 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5970 
5971 	adev->no_hw_access = true;
5972 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5973 	adev->no_hw_access = false;
5974 	if (r)
5975 		goto out;
5976 
5977 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5978 
5979 out:
5980 	if (!r) {
5981 		if (amdgpu_device_cache_pci_state(adev->pdev))
5982 			pci_restore_state(adev->pdev);
5983 
5984 		DRM_INFO("PCIe error recovery succeeded\n");
5985 	} else {
5986 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5987 		amdgpu_device_unset_mp1_state(adev);
5988 		amdgpu_device_unlock_reset_domain(adev->reset_domain);
5989 	}
5990 
5991 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5992 #endif
5993 }
5994 
5995 /**
5996  * amdgpu_pci_resume() - resume normal ops after PCI reset
5997  * @pdev: pointer to PCI device
5998  *
5999  * Called when the error recovery driver tells us that it's
6000  * OK to resume normal operation.
6001  */
6002 void amdgpu_pci_resume(struct pci_dev *pdev)
6003 {
6004 	STUB();
6005 #ifdef notyet
6006 	struct drm_device *dev = pci_get_drvdata(pdev);
6007 	struct amdgpu_device *adev = drm_to_adev(dev);
6008 	int i;
6009 
6010 
6011 	DRM_INFO("PCI error: resume callback!!\n");
6012 
6013 	/* Only continue execution for the case of pci_channel_io_frozen */
6014 	if (adev->pci_channel_state != pci_channel_io_frozen)
6015 		return;
6016 
6017 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6018 		struct amdgpu_ring *ring = adev->rings[i];
6019 
6020 		if (!ring || !ring->sched.thread)
6021 			continue;
6022 
6023 		drm_sched_start(&ring->sched, true);
6024 	}
6025 
6026 	amdgpu_device_unset_mp1_state(adev);
6027 	amdgpu_device_unlock_reset_domain(adev->reset_domain);
6028 #endif
6029 }
6030 
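/*
 * Save and cache the device's PCI config space so it can be restored after
 * a reset. Currently disabled in this port: the function returns false and
 * the Linux implementation is kept under notyet.
 */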
6031 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6032 {
6033 	return false;
6034 #ifdef notyet
6035 	struct drm_device *dev = pci_get_drvdata(pdev);
6036 	struct amdgpu_device *adev = drm_to_adev(dev);
6037 	int r;
6038 
6039 	if (amdgpu_sriov_vf(adev))
6040 		return false;
6041 
6042 	r = pci_save_state(pdev);
6043 	if (!r) {
6044 		kfree(adev->pci_state);
6045 
6046 		adev->pci_state = pci_store_saved_state(pdev);
6047 
6048 		if (!adev->pci_state) {
6049 			DRM_ERROR("Failed to store PCI saved state");
6050 			return false;
6051 		}
6052 	} else {
6053 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
6054 		return false;
6055 	}
6056 
6057 	return true;
6058 #endif
6059 }
6060 
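/* Restore the previously cached PCI config space; stubbed out here (notyet). */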
6061 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6062 {
6063 	STUB();
6064 	return false;
6065 #ifdef notyet
6066 	struct drm_device *dev = pci_get_drvdata(pdev);
6067 	struct amdgpu_device *adev = drm_to_adev(dev);
6068 	int r;
6069 
6070 	if (!adev->pci_state)
6071 		return false;
6072 
6073 	r = pci_load_saved_state(pdev, adev->pci_state);
6074 
6075 	if (!r) {
6076 		pci_restore_state(pdev);
6077 	} else {
6078 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
6079 		return false;
6080 	}
6081 
6082 	return true;
6083 #endif
6084 }
6085 
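/*
 * Flush the HDP (host data path) write cache so CPU writes become visible
 * to the GPU. Skipped for APUs not running in passthrough and for GPUs
 * whose XGMI link is connected to the CPU; otherwise use a ring packet when
 * the ring provides one, or the ASIC MMIO flush.
 */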
6086 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6087 		struct amdgpu_ring *ring)
6088 {
6089 #ifdef CONFIG_X86_64
6090 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6091 		return;
6092 #endif
6093 	if (adev->gmc.xgmi.connected_to_cpu)
6094 		return;
6095 
6096 	if (ring && ring->funcs->emit_hdp_flush)
6097 		amdgpu_ring_emit_hdp_flush(ring);
6098 	else
6099 		amdgpu_asic_flush_hdp(adev, ring);
6100 }
6101 
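/* Invalidate the HDP read cache; skipped in the same cases as the flush above. */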
6102 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6103 		struct amdgpu_ring *ring)
6104 {
6105 #ifdef CONFIG_X86_64
6106 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6107 		return;
6108 #endif
6109 	if (adev->gmc.xgmi.connected_to_cpu)
6110 		return;
6111 
6112 	amdgpu_asic_invalidate_hdp(adev, ring);
6113 }
6114 
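/* Non-zero while a GPU reset is in progress on this device's reset domain. */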
6115 int amdgpu_in_reset(struct amdgpu_device *adev)
6116 {
6117 	return atomic_read(&adev->reset_domain->in_gpu_reset);
6118 }
6119 
6120 /**
6121  * amdgpu_device_halt() - bring hardware to some kind of halt state
6122  *
6123  * @adev: amdgpu_device pointer
6124  *
6125  * Bring hardware to some kind of halt state so that no one can touch it
6126  * any more. This helps preserve the error context when an error occurs.
6127  * Compared to a simple hang, the system stays stable at least for SSH
6128  * access, so it should be trivial to inspect the hardware state and
6129  * see what's going on. Implemented as follows:
6130  *
6131  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6132  *    clears all CPU mappings to device, disallows remappings through page faults
6133  * 2. amdgpu_irq_disable_all() disables all interrupts
6134  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6135  * 4. set adev->no_hw_access to avoid potential crashes after step 5
6136  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6137  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6138  *    flush any in flight DMA operations
6139  */
6140 void amdgpu_device_halt(struct amdgpu_device *adev)
6141 {
6142 	struct pci_dev *pdev = adev->pdev;
6143 	struct drm_device *ddev = adev_to_drm(adev);
6144 
6145 	amdgpu_xcp_dev_unplug(adev);
6146 	drm_dev_unplug(ddev);
6147 
6148 	amdgpu_irq_disable_all(adev);
6149 
6150 	amdgpu_fence_driver_hw_fini(adev);
6151 
6152 	adev->no_hw_access = true;
6153 
6154 	amdgpu_device_unmap_mmio(adev);
6155 
6156 	pci_disable_device(pdev);
6157 	pci_wait_for_pending_transaction(pdev);
6158 }
6159 
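/*
 * Indirect PCIe port register read: under pcie_idx_lock, write the register
 * offset (reg * 4) to the NBIO index register, read it back to post the
 * write, then read the data register.
 */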
6160 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6161 				u32 reg)
6162 {
6163 	unsigned long flags, address, data;
6164 	u32 r;
6165 
6166 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6167 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6168 
6169 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6170 	WREG32(address, reg * 4);
6171 	(void)RREG32(address);
6172 	r = RREG32(data);
6173 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6174 	return r;
6175 }
6176 
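/* Indirect PCIe port register write; mirrors the read path above. */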
6177 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6178 				u32 reg, u32 v)
6179 {
6180 	unsigned long flags, address, data;
6181 
6182 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6183 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6184 
6185 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6186 	WREG32(address, reg * 4);
6187 	(void)RREG32(address);
6188 	WREG32(data, v);
6189 	(void)RREG32(data);
6190 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6191 }
6192 
6193 /**
6194  * amdgpu_device_switch_gang - switch to a new gang
6195  * @adev: amdgpu_device pointer
6196  * @gang: the gang to switch to
6197  *
6198  * Try to switch to a new gang.
6199  * Returns: NULL if we switched to the new gang or a reference to the current
6200  * gang leader.
6201  */
6202 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6203 					    struct dma_fence *gang)
6204 {
6205 	struct dma_fence *old = NULL;
6206 
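	/*
	 * Lock-free update of adev->gang_submit: fetch the current leader under
	 * RCU; if it is already @gang we are done, if it has not signaled yet
	 * the caller gets it back, otherwise try to install @gang with cmpxchg
	 * and retry when another thread changed the leader in the meantime.
	 */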
6207 	do {
6208 		dma_fence_put(old);
6209 		rcu_read_lock();
6210 		old = dma_fence_get_rcu_safe(&adev->gang_submit);
6211 		rcu_read_unlock();
6212 
6213 		if (old == gang)
6214 			break;
6215 
6216 		if (!dma_fence_is_signaled(old))
6217 			return old;
6218 
6219 	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6220 			 old, gang) != old);
6221 
6222 	dma_fence_put(old);
6223 	return NULL;
6224 }
6225 
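/*
 * Whether this ASIC has any display hardware: older chips are listed
 * explicitly, newer ones are decided from IP discovery (a DCE IP version
 * must be present and the DMU block must not be harvested).
 */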
6226 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6227 {
6228 	switch (adev->asic_type) {
6229 #ifdef CONFIG_DRM_AMDGPU_SI
6230 	case CHIP_HAINAN:
6231 #endif
6232 	case CHIP_TOPAZ:
6233 		/* chips with no display hardware */
6234 		return false;
6235 #ifdef CONFIG_DRM_AMDGPU_SI
6236 	case CHIP_TAHITI:
6237 	case CHIP_PITCAIRN:
6238 	case CHIP_VERDE:
6239 	case CHIP_OLAND:
6240 #endif
6241 #ifdef CONFIG_DRM_AMDGPU_CIK
6242 	case CHIP_BONAIRE:
6243 	case CHIP_HAWAII:
6244 	case CHIP_KAVERI:
6245 	case CHIP_KABINI:
6246 	case CHIP_MULLINS:
6247 #endif
6248 	case CHIP_TONGA:
6249 	case CHIP_FIJI:
6250 	case CHIP_POLARIS10:
6251 	case CHIP_POLARIS11:
6252 	case CHIP_POLARIS12:
6253 	case CHIP_VEGAM:
6254 	case CHIP_CARRIZO:
6255 	case CHIP_STONEY:
6256 		/* chips with display hardware */
6257 		return true;
6258 	default:
6259 		/* IP discovery */
6260 		if (!adev->ip_versions[DCE_HWIP][0] ||
6261 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6262 			return false;
6263 		return true;
6264 	}
6265 }
6266 
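/*
 * Poll @reg_addr until (value & @mask) == @expected_value. The timeout is
 * restarted whenever the register value changes; returns 0 on success or
 * -ETIMEDOUT if the value never settles within adev->usec_timeout.
 */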
6267 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6268 		uint32_t inst, uint32_t reg_addr, char reg_name[],
6269 		uint32_t expected_value, uint32_t mask)
6270 {
6271 	uint32_t ret = 0;
6272 	uint32_t old_ = 0;
6273 	uint32_t tmp_ = RREG32(reg_addr);
6274 	uint32_t loop = adev->usec_timeout;
6275 
6276 	while ((tmp_ & (mask)) != (expected_value)) {
6277 		if (old_ != tmp_) {
6278 			loop = adev->usec_timeout;
6279 			old_ = tmp_;
6280 		} else
6281 			udelay(1);
6282 		tmp_ = RREG32(reg_addr);
6283 		loop--;
6284 		if (!loop) {
6285 			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6286 				  inst, reg_name, (uint32_t)expected_value,
6287 				  (uint32_t)(tmp_ & (mask)));
6288 			ret = -ETIMEDOUT;
6289 			break;
6290 		}
6291 	}
6292 	return ret;
6293 }
6294