1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82
83 #include <drm/drm_drv.h>
84
85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__)
86 #include <asm/intel-family.h>
87 #endif
88
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96
97 #define AMDGPU_RESUME_MS 2000
98 #define AMDGPU_MAX_RETRY_LIMIT 2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100
101 static const struct drm_driver amdgpu_kms_driver;
102
103 const char *amdgpu_asic_name[] = {
104 "TAHITI",
105 "PITCAIRN",
106 "VERDE",
107 "OLAND",
108 "HAINAN",
109 "BONAIRE",
110 "KAVERI",
111 "KABINI",
112 "HAWAII",
113 "MULLINS",
114 "TOPAZ",
115 "TONGA",
116 "FIJI",
117 "CARRIZO",
118 "STONEY",
119 "POLARIS10",
120 "POLARIS11",
121 "POLARIS12",
122 "VEGAM",
123 "VEGA10",
124 "VEGA12",
125 "VEGA20",
126 "RAVEN",
127 "ARCTURUS",
128 "RENOIR",
129 "ALDEBARAN",
130 "NAVI10",
131 "CYAN_SKILLFISH",
132 "NAVI14",
133 "NAVI12",
134 "SIENNA_CICHLID",
135 "NAVY_FLOUNDER",
136 "VANGOGH",
137 "DIMGREY_CAVEFISH",
138 "BEIGE_GOBY",
139 "YELLOW_CARP",
140 "IP DISCOVERY",
141 "LAST",
142 };
143
144 /**
145 * DOC: pcie_replay_count
146 *
147 * The amdgpu driver provides a sysfs API for reporting the total number
148 * of PCIe replays (NAKs).
149 * The file pcie_replay_count is used for this and returns the total
150 * number of replays as a sum of the NAKs generated and the NAKs received.
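 *
 * Reading the value from user space is an ordinary sysfs read, for
 * example (the card index below is illustrative):
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count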
151 */
152
153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
154 struct device_attribute *attr, char *buf)
155 {
156 struct drm_device *ddev = dev_get_drvdata(dev);
157 struct amdgpu_device *adev = drm_to_adev(ddev);
158 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
159
160 return sysfs_emit(buf, "%llu\n", cnt);
161 }
162
163 static DEVICE_ATTR(pcie_replay_count, 0444,
164 amdgpu_device_get_pcie_replay_count, NULL);
165
166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
167
168
169 /**
170 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
171 *
172 * @dev: drm_device pointer
173 *
174 * Returns true if the device is a dGPU with ATPX power control,
175 * otherwise return false.
176 */
177 bool amdgpu_device_supports_px(struct drm_device *dev)
178 {
179 struct amdgpu_device *adev = drm_to_adev(dev);
180
181 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
182 return true;
183 return false;
184 }
185
186 /**
187 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
188 *
189 * @dev: drm_device pointer
190 *
191 * Returns true if the device is a dGPU with ACPI power control,
192 * otherwise return false.
193 */
194 bool amdgpu_device_supports_boco(struct drm_device *dev)
195 {
196 struct amdgpu_device *adev = drm_to_adev(dev);
197
198 if (adev->has_pr3 ||
199 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
200 return true;
201 return false;
202 }
203
204 /**
205 * amdgpu_device_supports_baco - Does the device support BACO
206 *
207 * @dev: drm_device pointer
208 *
209 * Returns true if the device supports BACO,
210 * otherwise returns false.
211 */
212 bool amdgpu_device_supports_baco(struct drm_device *dev)
213 {
214 struct amdgpu_device *adev = drm_to_adev(dev);
215
216 return amdgpu_asic_supports_baco(adev);
217 }
218
219 /**
220 * amdgpu_device_supports_smart_shift - Is the device dGPU with
221 * smart shift support
222 *
223 * @dev: drm_device pointer
224 *
225 * Returns true if the device is a dGPU with Smart Shift support,
226 * otherwise returns false.
227 */
228 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
229 {
230 return (amdgpu_device_supports_boco(dev) &&
231 amdgpu_acpi_is_power_shift_control_supported());
232 }
233
234 /*
235 * VRAM access helper functions
236 */
237
238 /**
239 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
240 *
241 * @adev: amdgpu_device pointer
242 * @pos: offset of the buffer in vram
243 * @buf: virtual address of the buffer in system memory
244 * @size: read/write size; the buffer at @buf must be at least @size bytes
245 * @write: true - write to vram, otherwise - read from vram
246 */
247 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
248 void *buf, size_t size, bool write)
249 {
250 unsigned long flags;
251 uint32_t hi = ~0, tmp = 0;
252 uint32_t *data = buf;
253 uint64_t last;
254 int idx;
255
256 if (!drm_dev_enter(adev_to_drm(adev), &idx))
257 return;
258
259 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
260
261 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
262 for (last = pos + size; pos < last; pos += 4) {
263 tmp = pos >> 31;
264
265 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
266 if (tmp != hi) {
267 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
268 hi = tmp;
269 }
270 if (write)
271 WREG32_NO_KIQ(mmMM_DATA, *data++);
272 else
273 *data++ = RREG32_NO_KIQ(mmMM_DATA);
274 }
275
276 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
277 drm_dev_exit(idx);
278 }
279
280 /**
281 * amdgpu_device_aper_access - access vram through the vram aperture
282 *
283 * @adev: amdgpu_device pointer
284 * @pos: offset of the buffer in vram
285 * @buf: virtual address of the buffer in system memory
286 * @size: read/write size; the buffer at @buf must be at least @size bytes
287 * @write: true - write to vram, otherwise - read from vram
288 *
289 * The return value means how many bytes have been transferred.
290 */
291 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
292 void *buf, size_t size, bool write)
293 {
294 #ifdef CONFIG_64BIT
295 void __iomem *addr;
296 size_t count = 0;
297 uint64_t last;
298
299 if (!adev->mman.aper_base_kaddr)
300 return 0;
301
302 last = min(pos + size, adev->gmc.visible_vram_size);
303 if (last > pos) {
304 addr = adev->mman.aper_base_kaddr + pos;
305 count = last - pos;
306
307 if (write) {
308 memcpy_toio(addr, buf, count);
309 /* Make sure HDP write cache flush happens without any reordering
310 * after the system memory contents are sent over PCIe to the device
311 */
312 mb();
313 amdgpu_device_flush_hdp(adev, NULL);
314 } else {
315 amdgpu_device_invalidate_hdp(adev, NULL);
316 /* Make sure HDP read cache is invalidated before issuing a read
317 * to the PCIe device
318 */
319 mb();
320 memcpy_fromio(buf, addr, count);
321 }
322
323 }
324
325 return count;
326 #else
327 return 0;
328 #endif
329 }
330
331 /**
332 * amdgpu_device_vram_access - read/write a buffer in vram
333 *
334 * @adev: amdgpu_device pointer
335 * @pos: offset of the buffer in vram
336 * @buf: virtual address of the buffer in system memory
337 * @size: read/write size; the buffer at @buf must be at least @size bytes
338 * @write: true - write to vram, otherwise - read from vram
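 *
 * A minimal usage sketch (illustrative): read the first dword of VRAM
 * into a local variable; @pos and @size must stay 4-byte aligned:
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, 0, &val, sizeof(val), false);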
339 */
340 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
341 void *buf, size_t size, bool write)
342 {
343 size_t count;
344
345 /* try using the vram aperture to access vram first */
346 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
347 size -= count;
348 if (size) {
349 /* use MM_INDEX/MM_DATA to access the rest of vram */
350 pos += count;
351 buf += count;
352 amdgpu_device_mm_access(adev, pos, buf, size, write);
353 }
354 }
355
356 /*
357 * register access helper functions.
358 */
359
360 /* Check if hw access should be skipped because of hotplug or device error */
361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
362 {
363 if (adev->no_hw_access)
364 return true;
365
366 #ifdef CONFIG_LOCKDEP
367 /*
368 * This is a bit complicated to understand, so worth a comment. What we assert
369 * here is that the GPU reset is not running on another thread in parallel.
370 *
371 * For this we trylock the read side of the reset semaphore, if that succeeds
372 * we know that the reset is not running in parallel.
373 *
374 * If the trylock fails we assert that we are either already holding the read
375 * side of the lock or are the reset thread itself and hold the write side of
376 * the lock.
377 */
378 if (in_task()) {
379 if (down_read_trylock(&adev->reset_domain->sem))
380 up_read(&adev->reset_domain->sem);
381 else
382 lockdep_assert_held(&adev->reset_domain->sem);
383 }
384 #endif
385 return false;
386 }
387
388 /**
389 * amdgpu_device_rreg - read a memory mapped IO or indirect register
390 *
391 * @adev: amdgpu_device pointer
392 * @reg: dword aligned register offset
393 * @acc_flags: access flags which require special behavior
394 *
395 * Returns the 32 bit value from the offset specified.
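 *
 * A direct call is sketched below; most code reaches this through the
 * register access convenience macros (mmFOO_STATUS is a placeholder
 * register name):
 *
 *   uint32_t val = amdgpu_device_rreg(adev, mmFOO_STATUS, 0);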
396 */
397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
398 uint32_t reg, uint32_t acc_flags)
399 {
400 uint32_t ret;
401
402 if (amdgpu_device_skip_hw_access(adev))
403 return 0;
404
405 if ((reg * 4) < adev->rmmio_size) {
406 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
407 amdgpu_sriov_runtime(adev) &&
408 down_read_trylock(&adev->reset_domain->sem)) {
409 ret = amdgpu_kiq_rreg(adev, reg);
410 up_read(&adev->reset_domain->sem);
411 } else {
412 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
413 }
414 } else {
415 ret = adev->pcie_rreg(adev, reg * 4);
416 }
417
418 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
419
420 return ret;
421 }
422
423 /*
424 * MMIO register read with byte offset helper function
425 * @offset: byte offset from MMIO start
426 */
427
428 /**
429 * amdgpu_mm_rreg8 - read a memory mapped IO register
430 *
431 * @adev: amdgpu_device pointer
432 * @offset: byte aligned register offset
433 *
434 * Returns the 8 bit value from the offset specified.
435 */
436 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
437 {
438 if (amdgpu_device_skip_hw_access(adev))
439 return 0;
440
441 if (offset < adev->rmmio_size)
442 return (readb(adev->rmmio + offset));
443 BUG();
444 }
445
446 /*
447 * MMIO register write with byte offset helper function
448 * @offset: byte offset from MMIO start
449 * @value: the value to be written to the register
450 */
451
452 /**
453 * amdgpu_mm_wreg8 - write a memory mapped IO register
454 *
455 * @adev: amdgpu_device pointer
456 * @offset: byte aligned register offset
457 * @value: 8 bit value to write
458 *
459 * Writes the value specified to the offset specified.
460 */
461 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
462 {
463 if (amdgpu_device_skip_hw_access(adev))
464 return;
465
466 if (offset < adev->rmmio_size)
467 writeb(value, adev->rmmio + offset);
468 else
469 BUG();
470 }
471
472 /**
473 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
474 *
475 * @adev: amdgpu_device pointer
476 * @reg: dword aligned register offset
477 * @v: 32 bit value to write to the register
478 * @acc_flags: access flags which require special behavior
479 *
480 * Writes the value specified to the offset specified.
481 */
482 void amdgpu_device_wreg(struct amdgpu_device *adev,
483 uint32_t reg, uint32_t v,
484 uint32_t acc_flags)
485 {
486 if (amdgpu_device_skip_hw_access(adev))
487 return;
488
489 if ((reg * 4) < adev->rmmio_size) {
490 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
491 amdgpu_sriov_runtime(adev) &&
492 down_read_trylock(&adev->reset_domain->sem)) {
493 amdgpu_kiq_wreg(adev, reg, v);
494 up_read(&adev->reset_domain->sem);
495 } else {
496 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
497 }
498 } else {
499 adev->pcie_wreg(adev, reg * 4, v);
500 }
501
502 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
503 }
504
505 /**
506 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
507 *
508 * @adev: amdgpu_device pointer
509 * @reg: mmio/rlc register
510 * @v: value to write
511 *
512 * This function is invoked only for debugfs register access.
513 */
514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
515 uint32_t reg, uint32_t v,
516 uint32_t xcc_id)
517 {
518 if (amdgpu_device_skip_hw_access(adev))
519 return;
520
521 if (amdgpu_sriov_fullaccess(adev) &&
522 adev->gfx.rlc.funcs &&
523 adev->gfx.rlc.funcs->is_rlcg_access_range) {
524 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
525 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
526 } else if ((reg * 4) >= adev->rmmio_size) {
527 adev->pcie_wreg(adev, reg * 4, v);
528 } else {
529 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
530 }
531 }
532
533 /**
534 * amdgpu_device_indirect_rreg - read an indirect register
535 *
536 * @adev: amdgpu_device pointer
537 * @reg_addr: indirect register address to read from
538 *
539 * Returns the value of indirect register @reg_addr
540 */
541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
542 u32 reg_addr)
543 {
544 unsigned long flags, pcie_index, pcie_data;
545 void __iomem *pcie_index_offset;
546 void __iomem *pcie_data_offset;
547 u32 r;
548
549 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
550 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
551
552 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
553 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
554 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
555
556 writel(reg_addr, pcie_index_offset);
557 readl(pcie_index_offset);
558 r = readl(pcie_data_offset);
559 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
560
561 return r;
562 }
563
564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
565 u64 reg_addr)
566 {
567 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
568 u32 r;
569 void __iomem *pcie_index_offset;
570 void __iomem *pcie_index_hi_offset;
571 void __iomem *pcie_data_offset;
572
573 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
574 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
575 if (adev->nbio.funcs->get_pcie_index_hi_offset)
576 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
577 else
578 pcie_index_hi = 0;
579
580 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
581 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
582 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
583 if (pcie_index_hi != 0)
584 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
585 pcie_index_hi * 4;
586
587 writel(reg_addr, pcie_index_offset);
588 readl(pcie_index_offset);
589 if (pcie_index_hi != 0) {
590 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
591 readl(pcie_index_hi_offset);
592 }
593 r = readl(pcie_data_offset);
594
595 /* clear the high bits */
596 if (pcie_index_hi != 0) {
597 writel(0, pcie_index_hi_offset);
598 readl(pcie_index_hi_offset);
599 }
600
601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602
603 return r;
604 }
605
606 /**
607 * amdgpu_device_indirect_rreg64 - read a 64-bit indirect register
608 *
609 * @adev: amdgpu_device pointer
610 * @reg_addr: indirect register address to read from
611 *
612 * Returns the value of indirect register @reg_addr
613 */
614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
615 u32 reg_addr)
616 {
617 unsigned long flags, pcie_index, pcie_data;
618 void __iomem *pcie_index_offset;
619 void __iomem *pcie_data_offset;
620 u64 r;
621
622 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
623 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
624
625 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
626 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
627 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
628
629 /* read low 32 bits */
630 writel(reg_addr, pcie_index_offset);
631 readl(pcie_index_offset);
632 r = readl(pcie_data_offset);
633 /* read high 32 bits */
634 writel(reg_addr + 4, pcie_index_offset);
635 readl(pcie_index_offset);
636 r |= ((u64)readl(pcie_data_offset) << 32);
637 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
638
639 return r;
640 }
641
642 /**
643 * amdgpu_device_indirect_wreg - write to an indirect register
644 *
645 * @adev: amdgpu_device pointer
646 * @reg_addr: indirect register offset
647 * @reg_data: indirect register data
648 *
649 */
650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
651 u32 reg_addr, u32 reg_data)
652 {
653 unsigned long flags, pcie_index, pcie_data;
654 void __iomem *pcie_index_offset;
655 void __iomem *pcie_data_offset;
656
657 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
658 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
659
660 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
661 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
662 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
663
664 writel(reg_addr, pcie_index_offset);
665 readl(pcie_index_offset);
666 writel(reg_data, pcie_data_offset);
667 readl(pcie_data_offset);
668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
669 }
670
671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
672 u64 reg_addr, u32 reg_data)
673 {
674 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
675 void __iomem *pcie_index_offset;
676 void __iomem *pcie_index_hi_offset;
677 void __iomem *pcie_data_offset;
678
679 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
680 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
681 if (adev->nbio.funcs->get_pcie_index_hi_offset)
682 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
683 else
684 pcie_index_hi = 0;
685
686 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
687 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
688 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
689 if (pcie_index_hi != 0)
690 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
691 pcie_index_hi * 4;
692
693 writel(reg_addr, pcie_index_offset);
694 readl(pcie_index_offset);
695 if (pcie_index_hi != 0) {
696 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
697 readl(pcie_index_hi_offset);
698 }
699 writel(reg_data, pcie_data_offset);
700 readl(pcie_data_offset);
701
702 /* clear the high bits */
703 if (pcie_index_hi != 0) {
704 writel(0, pcie_index_hi_offset);
705 readl(pcie_index_hi_offset);
706 }
707
708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
709 }
710
711 /**
712 * amdgpu_device_indirect_wreg64 - write to a 64-bit indirect register
713 *
714 * @adev: amdgpu_device pointer
715 * @reg_addr: indirect register offset
716 * @reg_data: indirect register data
717 *
718 */
719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
720 u32 reg_addr, u64 reg_data)
721 {
722 unsigned long flags, pcie_index, pcie_data;
723 void __iomem *pcie_index_offset;
724 void __iomem *pcie_data_offset;
725
726 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
727 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
728
729 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
730 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
731 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
732
733 /* write low 32 bits */
734 writel(reg_addr, pcie_index_offset);
735 readl(pcie_index_offset);
736 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
737 readl(pcie_data_offset);
738 /* write high 32 bits */
739 writel(reg_addr + 4, pcie_index_offset);
740 readl(pcie_index_offset);
741 writel((u32)(reg_data >> 32), pcie_data_offset);
742 readl(pcie_data_offset);
743 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
744 }
745
746 /**
747 * amdgpu_device_get_rev_id - query device rev_id
748 *
749 * @adev: amdgpu_device pointer
750 *
751 * Return device rev_id
752 */
753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
754 {
755 return adev->nbio.funcs->get_rev_id(adev);
756 }
757
758 /**
759 * amdgpu_invalid_rreg - dummy reg read function
760 *
761 * @adev: amdgpu_device pointer
762 * @reg: offset of register
763 *
764 * Dummy register read function. Used for register blocks
765 * that certain asics don't have (all asics).
766 * Returns the value in the register.
767 */
768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
769 {
770 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
771 BUG();
772 return 0;
773 }
774
775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
776 {
777 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
778 BUG();
779 return 0;
780 }
781
782 /**
783 * amdgpu_invalid_wreg - dummy reg write function
784 *
785 * @adev: amdgpu_device pointer
786 * @reg: offset of register
787 * @v: value to write to the register
788 *
789 * Dummy register write function. Used for register blocks
790 * that certain asics don't have (all asics).
791 */
792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
793 {
794 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
795 reg, v);
796 BUG();
797 }
798
799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
800 {
801 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
802 reg, v);
803 BUG();
804 }
805
806 /**
807 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
808 *
809 * @adev: amdgpu_device pointer
810 * @reg: offset of register
811 *
812 * Dummy register read function. Used for register blocks
813 * that certain asics don't have (all asics).
814 * Returns the value in the register.
815 */
816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
817 {
818 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
819 BUG();
820 return 0;
821 }
822
823 /**
824 * amdgpu_invalid_wreg64 - dummy reg write function
825 *
826 * @adev: amdgpu_device pointer
827 * @reg: offset of register
828 * @v: value to write to the register
829 *
830 * Dummy register write function. Used for register blocks
831 * that certain asics don't have (all asics).
832 */
833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
834 {
835 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
836 reg, v);
837 BUG();
838 }
839
840 /**
841 * amdgpu_block_invalid_rreg - dummy reg read function
842 *
843 * @adev: amdgpu_device pointer
844 * @block: offset of instance
845 * @reg: offset of register
846 *
847 * Dummy register read function. Used for register blocks
848 * that certain asics don't have (all asics).
849 * Returns the value in the register.
850 */
851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
852 uint32_t block, uint32_t reg)
853 {
854 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
855 reg, block);
856 BUG();
857 return 0;
858 }
859
860 /**
861 * amdgpu_block_invalid_wreg - dummy reg write function
862 *
863 * @adev: amdgpu_device pointer
864 * @block: offset of instance
865 * @reg: offset of register
866 * @v: value to write to the register
867 *
868 * Dummy register write function. Used for register blocks
869 * that certain asics don't have (all asics).
870 */
871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
872 uint32_t block,
873 uint32_t reg, uint32_t v)
874 {
875 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
876 reg, block, v);
877 BUG();
878 }
879
880 /**
881 * amdgpu_device_asic_init - Wrapper for atom asic_init
882 *
883 * @adev: amdgpu_device pointer
884 *
885 * Does any asic specific work and then calls atom asic init.
886 */
887 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
888 {
889 int ret;
890
891 amdgpu_asic_pre_asic_init(adev);
892
893 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
894 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
895 amdgpu_psp_wait_for_bootloader(adev);
896 ret = amdgpu_atomfirmware_asic_init(adev, true);
897 return ret;
898 } else {
899 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
900 }
901
902 return 0;
903 }
904
905 /**
906 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
907 *
908 * @adev: amdgpu_device pointer
909 *
910 * Allocates a scratch page of VRAM for use by various things in the
911 * driver.
912 */
913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
914 {
915 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
916 AMDGPU_GEM_DOMAIN_VRAM |
917 AMDGPU_GEM_DOMAIN_GTT,
918 &adev->mem_scratch.robj,
919 &adev->mem_scratch.gpu_addr,
920 (void **)&adev->mem_scratch.ptr);
921 }
922
923 /**
924 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
925 *
926 * @adev: amdgpu_device pointer
927 *
928 * Frees the VRAM scratch page.
929 */
930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
931 {
932 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
933 }
934
935 /**
936 * amdgpu_device_program_register_sequence - program an array of registers.
937 *
938 * @adev: amdgpu_device pointer
939 * @registers: pointer to the register array
940 * @array_size: size of the register array
941 *
942 * Programs an array of registers with AND/OR masks.
943 * This is a helper for setting golden registers.
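 *
 * The array is a flat list of {offset, and_mask, or_mask} triplets, e.g.
 * (the register name and values below are illustrative only):
 *
 *   static const u32 golden_settings[] = {
 *           mmFOO_CNTL, 0x0000000f, 0x00000001,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings,
 *                                           ARRAY_SIZE(golden_settings));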
944 */
945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
946 const u32 *registers,
947 const u32 array_size)
948 {
949 u32 tmp, reg, and_mask, or_mask;
950 int i;
951
952 if (array_size % 3)
953 return;
954
955 for (i = 0; i < array_size; i += 3) {
956 reg = registers[i + 0];
957 and_mask = registers[i + 1];
958 or_mask = registers[i + 2];
959
960 if (and_mask == 0xffffffff) {
961 tmp = or_mask;
962 } else {
963 tmp = RREG32(reg);
964 tmp &= ~and_mask;
965 if (adev->family >= AMDGPU_FAMILY_AI)
966 tmp |= (or_mask & and_mask);
967 else
968 tmp |= or_mask;
969 }
970 WREG32(reg, tmp);
971 }
972 }
973
974 /**
975 * amdgpu_device_pci_config_reset - reset the GPU
976 *
977 * @adev: amdgpu_device pointer
978 *
979 * Resets the GPU using the pci config reset sequence.
980 * Only applicable to asics prior to vega10.
981 */
982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
983 {
984 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
985 }
986
987 /**
988 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
989 *
990 * @adev: amdgpu_device pointer
991 *
992 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
993 */
994 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
995 {
996 STUB();
997 return -ENOSYS;
998 #ifdef notyet
999 return pci_reset_function(adev->pdev);
1000 #endif
1001 }
1002
1003 /*
1004 * amdgpu_device_wb_*()
1005 * Writeback is the method by which the GPU updates special pages in memory
1006 * with the status of certain GPU events (fences, ring pointers, etc.).
1007 */
1008
1009 /**
1010 * amdgpu_device_wb_fini - Disable Writeback and free memory
1011 *
1012 * @adev: amdgpu_device pointer
1013 *
1014 * Disables Writeback and frees the Writeback memory (all asics).
1015 * Used at driver shutdown.
1016 */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 if (adev->wb.wb_obj) {
1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 &adev->wb.gpu_addr,
1022 (void **)&adev->wb.wb);
1023 adev->wb.wb_obj = NULL;
1024 }
1025 }
1026
1027 /**
1028 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1029 *
1030 * @adev: amdgpu_device pointer
1031 *
1032 * Initializes writeback and allocates writeback memory (all asics).
1033 * Used at driver startup.
1034 * Returns 0 on success or a negative error code on failure.
1035 */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 int r;
1039
1040 if (adev->wb.wb_obj == NULL) {
1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 (void **)&adev->wb.wb);
1046 if (r) {
1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 return r;
1049 }
1050
1051 adev->wb.num_wb = AMDGPU_MAX_WB;
1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053
1054 /* clear wb memory */
1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 }
1057
1058 return 0;
1059 }
1060
1061 /**
1062 * amdgpu_device_wb_get - Allocate a wb entry
1063 *
1064 * @adev: amdgpu_device pointer
1065 * @wb: wb index
1066 *
1067 * Allocate a wb slot for use by the driver (all asics).
1068 * Returns 0 on success or -EINVAL on failure.
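 *
 * A minimal usage sketch (illustrative): the returned index is a dword
 * offset into the writeback page, so the CPU and GPU addresses of the
 * slot are derived as below; point the engine at gpu_addr, poll
 * *cpu_addr from the CPU, then release the slot again:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }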
1069 */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073
1074 if (offset < adev->wb.num_wb) {
1075 __set_bit(offset, adev->wb.used);
1076 *wb = offset << 3; /* convert to dw offset */
1077 return 0;
1078 } else {
1079 return -EINVAL;
1080 }
1081 }
1082
1083 /**
1084 * amdgpu_device_wb_free - Free a wb entry
1085 *
1086 * @adev: amdgpu_device pointer
1087 * @wb: wb index
1088 *
1089 * Free a wb slot allocated for use by the driver (all asics)
1090 */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 wb >>= 3;
1094 if (wb < adev->wb.num_wb)
1095 __clear_bit(wb, adev->wb.used);
1096 }
1097
1098 /**
1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100 *
1101 * @adev: amdgpu_device pointer
1102 *
1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1104 * to fail, but if any of the BARs is not accessible after the resize we abort
1105 * driver loading by returning -ENODEV.
1106 */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 #ifdef __linux__
1110 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1111 struct pci_bus *root;
1112 struct resource *res;
1113 unsigned int i;
1114 u16 cmd;
1115 int r;
1116
1117 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1118 return 0;
1119
1120 /* Bypass for VF */
1121 if (amdgpu_sriov_vf(adev))
1122 return 0;
1123
1124 /* skip if the bios has already enabled large BAR */
1125 if (adev->gmc.real_vram_size &&
1126 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1127 return 0;
1128
1129 /* Check if the root BUS has 64bit memory resources */
1130 root = adev->pdev->bus;
1131 while (root->parent)
1132 root = root->parent;
1133
1134 pci_bus_for_each_resource(root, res, i) {
1135 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1136 res->start > 0x100000000ull)
1137 break;
1138 }
1139
1140 /* Trying to resize is pointless without a root hub window above 4GB */
1141 if (!res)
1142 return 0;
1143
1144 /* Limit the BAR size to what is available */
1145 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1146 rbar_size);
1147
1148 /* Disable memory decoding while we change the BAR addresses and size */
1149 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1150 pci_write_config_word(adev->pdev, PCI_COMMAND,
1151 cmd & ~PCI_COMMAND_MEMORY);
1152
1153 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1154 amdgpu_doorbell_fini(adev);
1155 if (adev->asic_type >= CHIP_BONAIRE)
1156 pci_release_resource(adev->pdev, 2);
1157
1158 pci_release_resource(adev->pdev, 0);
1159
1160 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1161 if (r == -ENOSPC)
1162 DRM_INFO("Not enough PCI address space for a large BAR.");
1163 else if (r && r != -ENOTSUPP)
1164 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1165
1166 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1167
1168 /* When the doorbell or fb BAR isn't available we have no chance of
1169 * using the device.
1170 */
1171 r = amdgpu_doorbell_init(adev);
1172 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1173 return -ENODEV;
1174
1175 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1176 #endif /* __linux__ */
1177
1178 return 0;
1179 }
1180
1181 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1182 {
1183 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1184 return false;
1185
1186 return true;
1187 }
1188
1189 /*
1190 * GPU helpers function.
1191 */
1192 /**
1193 * amdgpu_device_need_post - check if the hw need post or not
1194 *
1195 * @adev: amdgpu_device pointer
1196 *
1197 * Check if the asic needs to be posted, i.e. if it has not been initialized
1198 * by the vbios at driver startup or if a hw reset has been performed.
1199 * Returns true if post is needed, false if not.
1200 */
1201 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1202 {
1203 uint32_t reg;
1204
1205 if (amdgpu_sriov_vf(adev))
1206 return false;
1207
1208 if (!amdgpu_device_read_bios(adev))
1209 return false;
1210
1211 if (amdgpu_passthrough(adev)) {
1212 /* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
1213 * reboot some old SMC firmware versions still need the driver to do a vPost,
1214 * otherwise the GPU hangs. Versions above 22.15 don't have this flaw, so
1215 * force vPost for SMC firmware below 22.15.
1216 */
1217 if (adev->asic_type == CHIP_FIJI) {
1218 int err;
1219 uint32_t fw_ver;
1220
1221 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1222 /* force vPost if an error occurred */
1223 if (err)
1224 return true;
1225
1226 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1227 release_firmware(adev->pm.fw);
1228 if (fw_ver < 0x00160e00)
1229 return true;
1230 }
1231 }
1232
1233 /* Don't post if we need to reset whole hive on init */
1234 if (adev->gmc.xgmi.pending_reset)
1235 return false;
1236
1237 if (adev->has_hw_reset) {
1238 adev->has_hw_reset = false;
1239 return true;
1240 }
1241
1242 /* bios scratch used on CIK+ */
1243 if (adev->asic_type >= CHIP_BONAIRE)
1244 return amdgpu_atombios_scratch_need_asic_init(adev);
1245
1246 /* check MEM_SIZE for older asics */
1247 reg = amdgpu_asic_get_config_memsize(adev);
1248
1249 if ((reg != 0) && (reg != 0xffffffff))
1250 return false;
1251
1252 return true;
1253 }
1254
1255 /*
1256 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1257 * speed switching. Until we have confirmation from Intel that a specific host
1258 * supports it, it's safer that we keep it disabled for all.
1259 *
1260 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1261 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1262 */
1263 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1264 {
1265 #if IS_ENABLED(CONFIG_X86)
1266 #ifdef __linux__
1267 struct cpuinfo_x86 *c = &cpu_data(0);
1268
1269 if (c->x86_vendor == X86_VENDOR_INTEL)
1270 #else
1271 if (strcmp(cpu_vendor, "GenuineIntel") == 0)
1272 #endif
1273 return false;
1274 #endif
1275 return true;
1276 }
1277
1278 /**
1279 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1280 *
1281 * @adev: amdgpu_device pointer
1282 *
1283 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1284 * be set for this device.
1285 *
1286 * Returns true if it should be used or false if not.
1287 */
1288 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1289 {
1290 switch (amdgpu_aspm) {
1291 case -1:
1292 break;
1293 case 0:
1294 return false;
1295 case 1:
1296 return true;
1297 default:
1298 return false;
1299 }
1300 return pcie_aspm_enabled(adev->pdev);
1301 }
1302
1303 bool amdgpu_device_aspm_support_quirk(void)
1304 {
1305 #if IS_ENABLED(CONFIG_X86)
1306 struct cpu_info *ci = curcpu();
1307
1308 return !(ci->ci_family == 6 && ci->ci_model == 0x97);
1309 #else
1310 return true;
1311 #endif
1312 }
1313
1314 /* if we get transitioned to only one device, take VGA back */
1315 /**
1316 * amdgpu_device_vga_set_decode - enable/disable vga decode
1317 *
1318 * @pdev: PCI device pointer
1319 * @state: enable/disable vga decode
1320 *
1321 * Enable/disable vga decode (all asics).
1322 * Returns VGA resource flags.
1323 */
1324 #ifdef notyet
1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1326 bool state)
1327 {
1328 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1329
1330 amdgpu_asic_set_vga_state(adev, state);
1331 if (state)
1332 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1333 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1334 else
1335 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1336 }
1337 #endif
1338
1339 /**
1340 * amdgpu_device_check_block_size - validate the vm block size
1341 *
1342 * @adev: amdgpu_device pointer
1343 *
1344 * Validates the vm block size specified via module parameter.
1345 * The vm block size defines number of bits in page table versus page directory,
1346 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1347 * page table and the remaining bits are in the page directory.
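 * For example, with amdgpu_vm_block_size = 9 each page table covers
 * 2^9 pages * 4KB = 2MB of address space per page directory entry.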
1348 */
1349 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1350 {
1351 /* defines number of bits in page table versus page directory,
1352 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1353 * page table and the remaining bits are in the page directory
1354 */
1355 if (amdgpu_vm_block_size == -1)
1356 return;
1357
1358 if (amdgpu_vm_block_size < 9) {
1359 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1360 amdgpu_vm_block_size);
1361 amdgpu_vm_block_size = -1;
1362 }
1363 }
1364
1365 /**
1366 * amdgpu_device_check_vm_size - validate the vm size
1367 *
1368 * @adev: amdgpu_device pointer
1369 *
1370 * Validates the vm size in GB specified via module parameter.
1371 * The VM size is the size of the GPU virtual memory space in GB.
1372 */
1373 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1374 {
1375 /* no need to check the default value */
1376 if (amdgpu_vm_size == -1)
1377 return;
1378
1379 if (amdgpu_vm_size < 1) {
1380 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1381 amdgpu_vm_size);
1382 amdgpu_vm_size = -1;
1383 }
1384 }
1385
1386 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1387 {
1388 #ifdef __linux__
1389 struct sysinfo si;
1390 #endif
1391 bool is_os_64 = (sizeof(void *) == 8);
1392 uint64_t total_memory;
1393 uint64_t dram_size_seven_GB = 0x1B8000000;
1394 uint64_t dram_size_three_GB = 0xB8000000;
1395
1396 if (amdgpu_smu_memory_pool_size == 0)
1397 return;
1398
1399 if (!is_os_64) {
1400 DRM_WARN("Not 64-bit OS, feature not supported\n");
1401 goto def_value;
1402 }
1403 #ifdef __linux__
1404 si_meminfo(&si);
1405 total_memory = (uint64_t)si.totalram * si.mem_unit;
1406 #else
1407 total_memory = ptoa(physmem);
1408 #endif
1409
1410 if ((amdgpu_smu_memory_pool_size == 1) ||
1411 (amdgpu_smu_memory_pool_size == 2)) {
1412 if (total_memory < dram_size_three_GB)
1413 goto def_value1;
1414 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1415 (amdgpu_smu_memory_pool_size == 8)) {
1416 if (total_memory < dram_size_seven_GB)
1417 goto def_value1;
1418 } else {
1419 DRM_WARN("Smu memory pool size not supported\n");
1420 goto def_value;
1421 }
1422 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1423
1424 return;
1425
1426 def_value1:
1427 DRM_WARN("No enough system memory\n");
1428 def_value:
1429 adev->pm.smu_prv_buffer_size = 0;
1430 }
1431
1432 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1433 {
1434 if (!(adev->flags & AMD_IS_APU) ||
1435 adev->asic_type < CHIP_RAVEN)
1436 return 0;
1437
1438 switch (adev->asic_type) {
1439 case CHIP_RAVEN:
1440 if (adev->pdev->device == 0x15dd)
1441 adev->apu_flags |= AMD_APU_IS_RAVEN;
1442 if (adev->pdev->device == 0x15d8)
1443 adev->apu_flags |= AMD_APU_IS_PICASSO;
1444 break;
1445 case CHIP_RENOIR:
1446 if ((adev->pdev->device == 0x1636) ||
1447 (adev->pdev->device == 0x164c))
1448 adev->apu_flags |= AMD_APU_IS_RENOIR;
1449 else
1450 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1451 break;
1452 case CHIP_VANGOGH:
1453 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1454 break;
1455 case CHIP_YELLOW_CARP:
1456 break;
1457 case CHIP_CYAN_SKILLFISH:
1458 if ((adev->pdev->device == 0x13FE) ||
1459 (adev->pdev->device == 0x143F))
1460 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1461 break;
1462 default:
1463 break;
1464 }
1465
1466 return 0;
1467 }
1468
1469 /**
1470 * amdgpu_device_check_arguments - validate module params
1471 *
1472 * @adev: amdgpu_device pointer
1473 *
1474 * Validates certain module parameters and updates
1475 * the associated values used by the driver (all asics).
1476 */
1477 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1478 {
1479 if (amdgpu_sched_jobs < 4) {
1480 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1481 amdgpu_sched_jobs);
1482 amdgpu_sched_jobs = 4;
1483 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1484 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1485 amdgpu_sched_jobs);
1486 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1487 }
1488
1489 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1490 /* gart size must be greater or equal to 32M */
1491 dev_warn(adev->dev, "gart size (%d) too small\n",
1492 amdgpu_gart_size);
1493 amdgpu_gart_size = -1;
1494 }
1495
1496 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1497 /* gtt size must be greater or equal to 32M */
1498 dev_warn(adev->dev, "gtt size (%d) too small\n",
1499 amdgpu_gtt_size);
1500 amdgpu_gtt_size = -1;
1501 }
1502
1503 /* valid range is between 4 and 9 inclusive */
1504 if (amdgpu_vm_fragment_size != -1 &&
1505 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1506 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1507 amdgpu_vm_fragment_size = -1;
1508 }
1509
1510 if (amdgpu_sched_hw_submission < 2) {
1511 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1512 amdgpu_sched_hw_submission);
1513 amdgpu_sched_hw_submission = 2;
1514 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1515 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1516 amdgpu_sched_hw_submission);
1517 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1518 }
1519
1520 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1521 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1522 amdgpu_reset_method = -1;
1523 }
1524
1525 amdgpu_device_check_smu_prv_buffer_size(adev);
1526
1527 amdgpu_device_check_vm_size(adev);
1528
1529 amdgpu_device_check_block_size(adev);
1530
1531 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1532
1533 return 0;
1534 }
1535
1536 #ifdef __linux__
1537 /**
1538 * amdgpu_switcheroo_set_state - set switcheroo state
1539 *
1540 * @pdev: pci dev pointer
1541 * @state: vga_switcheroo state
1542 *
1543 * Callback for the switcheroo driver. Suspends or resumes
1544 * the asics before or after it is powered up using ACPI methods.
1545 */
1546 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1547 enum vga_switcheroo_state state)
1548 {
1549 struct drm_device *dev = pci_get_drvdata(pdev);
1550 int r;
1551
1552 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1553 return;
1554
1555 if (state == VGA_SWITCHEROO_ON) {
1556 pr_info("switched on\n");
1557 /* don't suspend or resume card normally */
1558 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1559
1560 pci_set_power_state(pdev, PCI_D0);
1561 amdgpu_device_load_pci_state(pdev);
1562 r = pci_enable_device(pdev);
1563 if (r)
1564 DRM_WARN("pci_enable_device failed (%d)\n", r);
1565 amdgpu_device_resume(dev, true);
1566
1567 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1568 } else {
1569 pr_info("switched off\n");
1570 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1571 amdgpu_device_prepare(dev);
1572 amdgpu_device_suspend(dev, true);
1573 amdgpu_device_cache_pci_state(pdev);
1574 /* Shut down the device */
1575 pci_disable_device(pdev);
1576 pci_set_power_state(pdev, PCI_D3cold);
1577 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1578 }
1579 }
1580
1581 /**
1582 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1583 *
1584 * @pdev: pci dev pointer
1585 *
1586 * Callback for the switcheroo driver. Check of the switcheroo
1587 * state can be changed.
1588 * Returns true if the state can be changed, false if not.
1589 */
1590 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1591 {
1592 struct drm_device *dev = pci_get_drvdata(pdev);
1593
1594 /*
1595 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1596 * locking inversion with the driver load path. And the access here is
1597 * completely racy anyway. So don't bother with locking for now.
1598 */
1599 return atomic_read(&dev->open_count) == 0;
1600 }
1601 #endif /* __linux__ */
1602
1603 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1604 #ifdef notyet
1605 .set_gpu_state = amdgpu_switcheroo_set_state,
1606 .reprobe = NULL,
1607 .can_switch = amdgpu_switcheroo_can_switch,
1608 #endif
1609 };
1610
1611 /**
1612 * amdgpu_device_ip_set_clockgating_state - set the CG state
1613 *
1614 * @dev: amdgpu_device pointer
1615 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1616 * @state: clockgating state (gate or ungate)
1617 *
1618 * Sets the requested clockgating state for all instances of
1619 * the hardware IP specified.
1620 * Returns the error code from the last instance.
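 *
 * A typical call gates or ungates clockgating for one IP type, for
 * example (illustrative):
 *
 *   amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                          AMD_CG_STATE_GATE);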
1621 */
1622 int amdgpu_device_ip_set_clockgating_state(void *dev,
1623 enum amd_ip_block_type block_type,
1624 enum amd_clockgating_state state)
1625 {
1626 struct amdgpu_device *adev = dev;
1627 int i, r = 0;
1628
1629 for (i = 0; i < adev->num_ip_blocks; i++) {
1630 if (!adev->ip_blocks[i].status.valid)
1631 continue;
1632 if (adev->ip_blocks[i].version->type != block_type)
1633 continue;
1634 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1635 continue;
1636 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1637 (void *)adev, state);
1638 if (r)
1639 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1640 adev->ip_blocks[i].version->funcs->name, r);
1641 }
1642 return r;
1643 }
1644
1645 /**
1646 * amdgpu_device_ip_set_powergating_state - set the PG state
1647 *
1648 * @dev: amdgpu_device pointer
1649 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1650 * @state: powergating state (gate or ungate)
1651 *
1652 * Sets the requested powergating state for all instances of
1653 * the hardware IP specified.
1654 * Returns the error code from the last instance.
1655 */
1656 int amdgpu_device_ip_set_powergating_state(void *dev,
1657 enum amd_ip_block_type block_type,
1658 enum amd_powergating_state state)
1659 {
1660 struct amdgpu_device *adev = dev;
1661 int i, r = 0;
1662
1663 for (i = 0; i < adev->num_ip_blocks; i++) {
1664 if (!adev->ip_blocks[i].status.valid)
1665 continue;
1666 if (adev->ip_blocks[i].version->type != block_type)
1667 continue;
1668 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1669 continue;
1670 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1671 (void *)adev, state);
1672 if (r)
1673 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1674 adev->ip_blocks[i].version->funcs->name, r);
1675 }
1676 return r;
1677 }
1678
1679 /**
1680 * amdgpu_device_ip_get_clockgating_state - get the CG state
1681 *
1682 * @adev: amdgpu_device pointer
1683 * @flags: clockgating feature flags
1684 *
1685 * Walks the list of IPs on the device and updates the clockgating
1686 * flags for each IP.
1687 * Updates @flags with the feature flags for each hardware IP where
1688 * clockgating is enabled.
1689 */
1690 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1691 u64 *flags)
1692 {
1693 int i;
1694
1695 for (i = 0; i < adev->num_ip_blocks; i++) {
1696 if (!adev->ip_blocks[i].status.valid)
1697 continue;
1698 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1699 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1700 }
1701 }
1702
1703 /**
1704 * amdgpu_device_ip_wait_for_idle - wait for idle
1705 *
1706 * @adev: amdgpu_device pointer
1707 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1708 *
1709 * Waits for the requested hardware IP to be idle.
1710 * Returns 0 for success or a negative error code on failure.
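 *
 * Example (illustrative):
 *
 *   r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
 *   if (r)
 *           return r;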
1711 */
1712 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1713 enum amd_ip_block_type block_type)
1714 {
1715 int i, r;
1716
1717 for (i = 0; i < adev->num_ip_blocks; i++) {
1718 if (!adev->ip_blocks[i].status.valid)
1719 continue;
1720 if (adev->ip_blocks[i].version->type == block_type) {
1721 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1722 if (r)
1723 return r;
1724 break;
1725 }
1726 }
1727 return 0;
1728
1729 }
1730
1731 /**
1732 * amdgpu_device_ip_is_idle - is the hardware IP idle
1733 *
1734 * @adev: amdgpu_device pointer
1735 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1736 *
1737 * Check if the hardware IP is idle or not.
1738 * Returns true if the IP is idle, false if not.
1739 */
1740 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1741 enum amd_ip_block_type block_type)
1742 {
1743 int i;
1744
1745 for (i = 0; i < adev->num_ip_blocks; i++) {
1746 if (!adev->ip_blocks[i].status.valid)
1747 continue;
1748 if (adev->ip_blocks[i].version->type == block_type)
1749 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1750 }
1751 return true;
1752
1753 }
1754
1755 /**
1756 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1757 *
1758 * @adev: amdgpu_device pointer
1759 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760 *
1761 * Returns a pointer to the hardware IP block structure
1762 * if it exists for the asic, otherwise NULL.
1763 */
1764 struct amdgpu_ip_block *
1765 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1766 enum amd_ip_block_type type)
1767 {
1768 int i;
1769
1770 for (i = 0; i < adev->num_ip_blocks; i++)
1771 if (adev->ip_blocks[i].version->type == type)
1772 return &adev->ip_blocks[i];
1773
1774 return NULL;
1775 }
1776
1777 /**
1778 * amdgpu_device_ip_block_version_cmp
1779 *
1780 * @adev: amdgpu_device pointer
1781 * @type: enum amd_ip_block_type
1782 * @major: major version
1783 * @minor: minor version
1784 *
1785 * Returns 0 if the IP block version is equal to or greater than the requested one,
1786 * 1 if it is smaller or the ip_block doesn't exist.
1787 */
1788 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1789 enum amd_ip_block_type type,
1790 u32 major, u32 minor)
1791 {
1792 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1793
1794 if (ip_block && ((ip_block->version->major > major) ||
1795 ((ip_block->version->major == major) &&
1796 (ip_block->version->minor >= minor))))
1797 return 0;
1798
1799 return 1;
1800 }
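/*
 * Illustrative sketch (not part of the driver): gating a feature on a
 * minimum IP block version.  The block type and version checked here are
 * assumptions for the example only.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC, 7, 0))
 *		;	// SMC 7.0 or newer is present
 *	else
 *		;	// older SMC, or no SMC block at all
 */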
1801
1802 /**
1803 * amdgpu_device_ip_block_add
1804 *
1805 * @adev: amdgpu_device pointer
1806 * @ip_block_version: pointer to the IP to add
1807 *
1808 * Adds the IP block driver information to the collection of IPs
1809 * on the asic.
1810 */
1811 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1812 const struct amdgpu_ip_block_version *ip_block_version)
1813 {
1814 if (!ip_block_version)
1815 return -EINVAL;
1816
1817 switch (ip_block_version->type) {
1818 case AMD_IP_BLOCK_TYPE_VCN:
1819 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1820 return 0;
1821 break;
1822 case AMD_IP_BLOCK_TYPE_JPEG:
1823 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1824 return 0;
1825 break;
1826 default:
1827 break;
1828 }
1829
1830 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1831 ip_block_version->funcs->name);
1832
1833 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1834
1835 return 0;
1836 }
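/*
 * Illustrative sketch (not part of the driver): SoC setup code registers its
 * IP blocks in initialization order; the block names below are examples only.
 *
 *	amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 */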
1837
1838 /**
1839 * amdgpu_device_enable_virtual_display - enable virtual display feature
1840 *
1841 * @adev: amdgpu_device pointer
1842 *
1843 * Enables the virtual display feature if the user has enabled it via
1844 * the module parameter virtual_display. This feature provides a virtual
1845 * display hardware on headless boards or in virtualized environments.
1846 * This function parses and validates the configuration string specified by
1847 * the user and configures the virtual display configuration (number of
1848 * virtual connectors, crtcs, etc.) specified.
1849 */
1850 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1851 {
1852 adev->enable_virtual_display = false;
1853
1854 #ifdef notyet
1855 if (amdgpu_virtual_display) {
1856 const char *pci_address_name = pci_name(adev->pdev);
1857 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1858
1859 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1860 pciaddstr_tmp = pciaddstr;
1861 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1862 pciaddname = strsep(&pciaddname_tmp, ",");
1863 if (!strcmp("all", pciaddname)
1864 || !strcmp(pci_address_name, pciaddname)) {
1865 long num_crtc;
1866 int res = -1;
1867
1868 adev->enable_virtual_display = true;
1869
1870 if (pciaddname_tmp)
1871 res = kstrtol(pciaddname_tmp, 10,
1872 &num_crtc);
1873
1874 if (!res) {
1875 if (num_crtc < 1)
1876 num_crtc = 1;
1877 if (num_crtc > 6)
1878 num_crtc = 6;
1879 adev->mode_info.num_crtc = num_crtc;
1880 } else {
1881 adev->mode_info.num_crtc = 1;
1882 }
1883 break;
1884 }
1885 }
1886
1887 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1888 amdgpu_virtual_display, pci_address_name,
1889 adev->enable_virtual_display, adev->mode_info.num_crtc);
1890
1891 kfree(pciaddstr);
1892 }
1893 #endif
1894 }
1895
1896 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1897 {
1898 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1899 adev->mode_info.num_crtc = 1;
1900 adev->enable_virtual_display = true;
1901 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1902 adev->enable_virtual_display, adev->mode_info.num_crtc);
1903 }
1904 }
1905
1906 /**
1907 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1908 *
1909 * @adev: amdgpu_device pointer
1910 *
1911 * Parses the asic configuration parameters specified in the gpu info
1912 * firmware and makes them available to the driver for use in configuring
1913 * the asic.
1914 * Returns 0 on success, -EINVAL on failure.
1915 */
1916 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1917 {
1918 const char *chip_name;
1919 char fw_name[40];
1920 int err;
1921 const struct gpu_info_firmware_header_v1_0 *hdr;
1922
1923 adev->firmware.gpu_info_fw = NULL;
1924
1925 if (adev->mman.discovery_bin)
1926 return 0;
1927
1928 switch (adev->asic_type) {
1929 default:
1930 return 0;
1931 case CHIP_VEGA10:
1932 chip_name = "vega10";
1933 break;
1934 case CHIP_VEGA12:
1935 chip_name = "vega12";
1936 break;
1937 case CHIP_RAVEN:
1938 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1939 chip_name = "raven2";
1940 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1941 chip_name = "picasso";
1942 else
1943 chip_name = "raven";
1944 break;
1945 case CHIP_ARCTURUS:
1946 chip_name = "arcturus";
1947 break;
1948 case CHIP_NAVI12:
1949 chip_name = "navi12";
1950 break;
1951 }
1952
1953 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1954 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1955 if (err) {
1956 dev_err(adev->dev,
1957 "Failed to get gpu_info firmware \"%s\"\n",
1958 fw_name);
1959 goto out;
1960 }
1961
1962 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1963 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1964
1965 switch (hdr->version_major) {
1966 case 1:
1967 {
1968 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1969 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1970 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1971
1972 /*
1973 * Should be dropped when DAL no longer needs it.
1974 */
1975 if (adev->asic_type == CHIP_NAVI12)
1976 goto parse_soc_bounding_box;
1977
1978 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1979 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1980 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1981 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1982 adev->gfx.config.max_texture_channel_caches =
1983 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1984 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1985 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1986 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1987 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1988 adev->gfx.config.double_offchip_lds_buf =
1989 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1990 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1991 adev->gfx.cu_info.max_waves_per_simd =
1992 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1993 adev->gfx.cu_info.max_scratch_slots_per_cu =
1994 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1995 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1996 if (hdr->version_minor >= 1) {
1997 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1998 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1999 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2000 adev->gfx.config.num_sc_per_sh =
2001 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2002 adev->gfx.config.num_packer_per_sc =
2003 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2004 }
2005
2006 parse_soc_bounding_box:
2007 /*
2008 * soc bounding box info is not integrated in the discovery table,
2009 * so we always need to parse it from the gpu info firmware if needed.
2010 */
2011 if (hdr->version_minor == 2) {
2012 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2013 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2014 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2015 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2016 }
2017 break;
2018 }
2019 default:
2020 dev_err(adev->dev,
2021 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2022 err = -EINVAL;
2023 goto out;
2024 }
2025 out:
2026 return err;
2027 }
2028
2029 /**
2030 * amdgpu_device_ip_early_init - run early init for hardware IPs
2031 *
2032 * @adev: amdgpu_device pointer
2033 *
2034 * Early initialization pass for hardware IPs. The hardware IPs that make
2035 * up each asic are discovered and each IP's early_init callback is run. This
2036 * is the first stage in initializing the asic.
2037 * Returns 0 on success, negative error code on failure.
2038 */
2039 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2040 {
2041 struct pci_dev *parent;
2042 int i, r;
2043 bool total;
2044
2045 amdgpu_device_enable_virtual_display(adev);
2046
2047 if (amdgpu_sriov_vf(adev)) {
2048 r = amdgpu_virt_request_full_gpu(adev, true);
2049 if (r)
2050 return r;
2051 }
2052
2053 switch (adev->asic_type) {
2054 #ifdef CONFIG_DRM_AMDGPU_SI
2055 case CHIP_VERDE:
2056 case CHIP_TAHITI:
2057 case CHIP_PITCAIRN:
2058 case CHIP_OLAND:
2059 case CHIP_HAINAN:
2060 adev->family = AMDGPU_FAMILY_SI;
2061 r = si_set_ip_blocks(adev);
2062 if (r)
2063 return r;
2064 break;
2065 #endif
2066 #ifdef CONFIG_DRM_AMDGPU_CIK
2067 case CHIP_BONAIRE:
2068 case CHIP_HAWAII:
2069 case CHIP_KAVERI:
2070 case CHIP_KABINI:
2071 case CHIP_MULLINS:
2072 if (adev->flags & AMD_IS_APU)
2073 adev->family = AMDGPU_FAMILY_KV;
2074 else
2075 adev->family = AMDGPU_FAMILY_CI;
2076
2077 r = cik_set_ip_blocks(adev);
2078 if (r)
2079 return r;
2080 break;
2081 #endif
2082 case CHIP_TOPAZ:
2083 case CHIP_TONGA:
2084 case CHIP_FIJI:
2085 case CHIP_POLARIS10:
2086 case CHIP_POLARIS11:
2087 case CHIP_POLARIS12:
2088 case CHIP_VEGAM:
2089 case CHIP_CARRIZO:
2090 case CHIP_STONEY:
2091 if (adev->flags & AMD_IS_APU)
2092 adev->family = AMDGPU_FAMILY_CZ;
2093 else
2094 adev->family = AMDGPU_FAMILY_VI;
2095
2096 r = vi_set_ip_blocks(adev);
2097 if (r)
2098 return r;
2099 break;
2100 default:
2101 r = amdgpu_discovery_set_ip_blocks(adev);
2102 if (r)
2103 return r;
2104 break;
2105 }
2106
2107 if (amdgpu_has_atpx() &&
2108 (amdgpu_is_atpx_hybrid() ||
2109 amdgpu_has_atpx_dgpu_power_cntl()) &&
2110 ((adev->flags & AMD_IS_APU) == 0) &&
2111 !dev_is_removable(&adev->pdev->dev))
2112 adev->flags |= AMD_IS_PX;
2113
2114 if (!(adev->flags & AMD_IS_APU)) {
2115 #ifdef notyet
2116 parent = pcie_find_root_port(adev->pdev);
2117 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2118 #else
2119 adev->has_pr3 = false;
2120 #endif
2121 }
2122
2123
2124 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2125 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2126 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2127 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2128 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2129 if (!amdgpu_device_pcie_dynamic_switching_supported())
2130 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2131
2132 total = true;
2133 for (i = 0; i < adev->num_ip_blocks; i++) {
2134 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2135 DRM_WARN("disabled ip block: %d <%s>\n",
2136 i, adev->ip_blocks[i].version->funcs->name);
2137 adev->ip_blocks[i].status.valid = false;
2138 } else {
2139 if (adev->ip_blocks[i].version->funcs->early_init) {
2140 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2141 if (r == -ENOENT) {
2142 adev->ip_blocks[i].status.valid = false;
2143 } else if (r) {
2144 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2145 adev->ip_blocks[i].version->funcs->name, r);
2146 total = false;
2147 } else {
2148 adev->ip_blocks[i].status.valid = true;
2149 }
2150 } else {
2151 adev->ip_blocks[i].status.valid = true;
2152 }
2153 }
2154 /* get the vbios after the asic_funcs are set up */
2155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2156 r = amdgpu_device_parse_gpu_info_fw(adev);
2157 if (r)
2158 return r;
2159
2160 /* Read BIOS */
2161 if (amdgpu_device_read_bios(adev)) {
2162 if (!amdgpu_get_bios(adev))
2163 return -EINVAL;
2164
2165 r = amdgpu_atombios_init(adev);
2166 if (r) {
2167 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2168 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2169 return r;
2170 }
2171 }
2172
2173 /* get pf2vf msg info at its earliest time */
2174 if (amdgpu_sriov_vf(adev))
2175 amdgpu_virt_init_data_exchange(adev);
2176
2177 }
2178 }
2179 if (!total)
2180 return -ENODEV;
2181
2182 amdgpu_amdkfd_device_probe(adev);
2183 adev->cg_flags &= amdgpu_cg_mask;
2184 adev->pg_flags &= amdgpu_pg_mask;
2185
2186 return 0;
2187 }
2188
2189 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2190 {
2191 int i, r;
2192
2193 for (i = 0; i < adev->num_ip_blocks; i++) {
2194 if (!adev->ip_blocks[i].status.sw)
2195 continue;
2196 if (adev->ip_blocks[i].status.hw)
2197 continue;
2198 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2199 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2200 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2201 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2202 if (r) {
2203 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2204 adev->ip_blocks[i].version->funcs->name, r);
2205 return r;
2206 }
2207 adev->ip_blocks[i].status.hw = true;
2208 }
2209 }
2210
2211 return 0;
2212 }
2213
2214 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2215 {
2216 int i, r;
2217
2218 for (i = 0; i < adev->num_ip_blocks; i++) {
2219 if (!adev->ip_blocks[i].status.sw)
2220 continue;
2221 if (adev->ip_blocks[i].status.hw)
2222 continue;
2223 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2224 if (r) {
2225 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2226 adev->ip_blocks[i].version->funcs->name, r);
2227 return r;
2228 }
2229 adev->ip_blocks[i].status.hw = true;
2230 }
2231
2232 return 0;
2233 }
2234
2235 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2236 {
2237 int r = 0;
2238 int i;
2239 uint32_t smu_version;
2240
2241 if (adev->asic_type >= CHIP_VEGA10) {
2242 for (i = 0; i < adev->num_ip_blocks; i++) {
2243 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2244 continue;
2245
2246 if (!adev->ip_blocks[i].status.sw)
2247 continue;
2248
2249 /* no need to do the fw loading again if already done*/
2250 if (adev->ip_blocks[i].status.hw == true)
2251 break;
2252
2253 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2254 r = adev->ip_blocks[i].version->funcs->resume(adev);
2255 if (r) {
2256 DRM_ERROR("resume of IP block <%s> failed %d\n",
2257 adev->ip_blocks[i].version->funcs->name, r);
2258 return r;
2259 }
2260 } else {
2261 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2262 if (r) {
2263 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2264 adev->ip_blocks[i].version->funcs->name, r);
2265 return r;
2266 }
2267 }
2268
2269 adev->ip_blocks[i].status.hw = true;
2270 break;
2271 }
2272 }
2273
2274 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2275 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2276
2277 return r;
2278 }
2279
2280 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2281 {
2282 long timeout;
2283 int r, i;
2284
2285 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2286 struct amdgpu_ring *ring = adev->rings[i];
2287
2288 /* No need to setup the GPU scheduler for rings that don't need it */
2289 if (!ring || ring->no_scheduler)
2290 continue;
2291
2292 switch (ring->funcs->type) {
2293 case AMDGPU_RING_TYPE_GFX:
2294 timeout = adev->gfx_timeout;
2295 break;
2296 case AMDGPU_RING_TYPE_COMPUTE:
2297 timeout = adev->compute_timeout;
2298 break;
2299 case AMDGPU_RING_TYPE_SDMA:
2300 timeout = adev->sdma_timeout;
2301 break;
2302 default:
2303 timeout = adev->video_timeout;
2304 break;
2305 }
2306
2307 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2308 ring->num_hw_submission, 0,
2309 timeout, adev->reset_domain->wq,
2310 ring->sched_score, ring->name,
2311 adev->dev);
2312 if (r) {
2313 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2314 ring->name);
2315 return r;
2316 }
2317 }
2318
2319 amdgpu_xcp_update_partition_sched_list(adev);
2320
2321 return 0;
2322 }
2323
2324
2325 /**
2326 * amdgpu_device_ip_init - run init for hardware IPs
2327 *
2328 * @adev: amdgpu_device pointer
2329 *
2330 * Main initialization pass for hardware IPs. The list of all the hardware
2331 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2332 * are run. sw_init initializes the software state associated with each IP
2333 * and hw_init initializes the hardware associated with each IP.
2334 * Returns 0 on success, negative error code on failure.
2335 */
2336 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2337 {
2338 int i, r;
2339
2340 r = amdgpu_ras_init(adev);
2341 if (r)
2342 return r;
2343
2344 for (i = 0; i < adev->num_ip_blocks; i++) {
2345 if (!adev->ip_blocks[i].status.valid)
2346 continue;
2347 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2348 if (r) {
2349 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2350 adev->ip_blocks[i].version->funcs->name, r);
2351 goto init_failed;
2352 }
2353 adev->ip_blocks[i].status.sw = true;
2354
2355 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2356 /* need to do common hw init early so everything is set up for gmc */
2357 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2358 if (r) {
2359 DRM_ERROR("hw_init %d failed %d\n", i, r);
2360 goto init_failed;
2361 }
2362 adev->ip_blocks[i].status.hw = true;
2363 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2364 /* need to do gmc hw init early so we can allocate gpu mem */
2365 /* Try to reserve bad pages early */
2366 if (amdgpu_sriov_vf(adev))
2367 amdgpu_virt_exchange_data(adev);
2368
2369 r = amdgpu_device_mem_scratch_init(adev);
2370 if (r) {
2371 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2372 goto init_failed;
2373 }
2374 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2375 if (r) {
2376 DRM_ERROR("hw_init %d failed %d\n", i, r);
2377 goto init_failed;
2378 }
2379 r = amdgpu_device_wb_init(adev);
2380 if (r) {
2381 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2382 goto init_failed;
2383 }
2384 adev->ip_blocks[i].status.hw = true;
2385
2386 /* right after GMC hw init, we create CSA */
2387 if (adev->gfx.mcbp) {
2388 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2389 AMDGPU_GEM_DOMAIN_VRAM |
2390 AMDGPU_GEM_DOMAIN_GTT,
2391 AMDGPU_CSA_SIZE);
2392 if (r) {
2393 DRM_ERROR("allocate CSA failed %d\n", r);
2394 goto init_failed;
2395 }
2396 }
2397 }
2398 }
2399
2400 if (amdgpu_sriov_vf(adev))
2401 amdgpu_virt_init_data_exchange(adev);
2402
2403 r = amdgpu_ib_pool_init(adev);
2404 if (r) {
2405 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2406 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2407 goto init_failed;
2408 }
2409
2410 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2411 if (r)
2412 goto init_failed;
2413
2414 r = amdgpu_device_ip_hw_init_phase1(adev);
2415 if (r)
2416 goto init_failed;
2417
2418 r = amdgpu_device_fw_loading(adev);
2419 if (r)
2420 goto init_failed;
2421
2422 r = amdgpu_device_ip_hw_init_phase2(adev);
2423 if (r)
2424 goto init_failed;
2425
2426 /*
2427 * retired pages will be loaded from eeprom and reserved here;
2428 * this should be called after amdgpu_device_ip_hw_init_phase2 since
2429 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2430 * functional for I2C communication, which is only true at this point.
2431 *
2432 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2433 * about failures caused by a bad gpu state and stops the amdgpu init
2434 * process accordingly. For other failure cases it will still release
2435 * all the resources and print an error message, rather than returning
2436 * a negative value to the upper level.
2437 *
2438 * Note: theoretically, this should be called before all vram allocations
2439 * to protect retired pages from being abused.
2440 */
2441 r = amdgpu_ras_recovery_init(adev);
2442 if (r)
2443 goto init_failed;
2444
2445 /*
2446 * In case of XGMI grab extra reference for reset domain for this device
2447 */
2448 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2449 if (amdgpu_xgmi_add_device(adev) == 0) {
2450 if (!amdgpu_sriov_vf(adev)) {
2451 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2452
2453 if (WARN_ON(!hive)) {
2454 r = -ENOENT;
2455 goto init_failed;
2456 }
2457
2458 if (!hive->reset_domain ||
2459 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2460 r = -ENOENT;
2461 amdgpu_put_xgmi_hive(hive);
2462 goto init_failed;
2463 }
2464
2465 /* Drop the early temporary reset domain we created for device */
2466 amdgpu_reset_put_reset_domain(adev->reset_domain);
2467 adev->reset_domain = hive->reset_domain;
2468 amdgpu_put_xgmi_hive(hive);
2469 }
2470 }
2471 }
2472
2473 r = amdgpu_device_init_schedulers(adev);
2474 if (r)
2475 goto init_failed;
2476
2477 /* Don't init kfd if whole hive need to be reset during init */
2478 if (!adev->gmc.xgmi.pending_reset) {
2479 kgd2kfd_init_zone_device(adev);
2480 amdgpu_amdkfd_device_init(adev);
2481 }
2482
2483 amdgpu_fru_get_product_info(adev);
2484
2485 init_failed:
2486
2487 return r;
2488 }
2489
2490 /**
2491 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2492 *
2493 * @adev: amdgpu_device pointer
2494 *
2495 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2496 * this function before a GPU reset. If the value is retained after a
2497 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2498 */
2499 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2500 {
2501 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2502 }
2503
2504 /**
2505 * amdgpu_device_check_vram_lost - check if vram is valid
2506 *
2507 * @adev: amdgpu_device pointer
2508 *
2509 * Checks the reset magic value written to the gart pointer in VRAM.
2510 * The driver calls this after a GPU reset to see if the contents of
2511 * VRAM are lost or not.
2512 * returns true if vram is lost, false if not.
2513 */
2514 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2515 {
2516 if (memcmp(adev->gart.ptr, adev->reset_magic,
2517 AMDGPU_RESET_MAGIC_NUM))
2518 return true;
2519
2520 if (!amdgpu_in_reset(adev))
2521 return false;
2522
2523 /*
2524 * For all ASICs with baco/mode1 reset, the VRAM is
2525 * always assumed to be lost.
2526 */
2527 switch (amdgpu_asic_reset_method(adev)) {
2528 case AMD_RESET_METHOD_BACO:
2529 case AMD_RESET_METHOD_MODE1:
2530 return true;
2531 default:
2532 return false;
2533 }
2534 }
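/*
 * Illustrative sketch (not part of the driver): how the reset magic is meant
 * to be used around a GPU reset.  The actual reset call is elided.
 *
 *	amdgpu_device_fill_reset_magic(adev);
 *	// ... perform the ASIC reset ...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		;	// VRAM contents must be treated as lost and restored
 */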
2535
2536 /**
2537 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2538 *
2539 * @adev: amdgpu_device pointer
2540 * @state: clockgating state (gate or ungate)
2541 *
2542 * The list of all the hardware IPs that make up the asic is walked and the
2543 * set_clockgating_state callbacks are run.
2544 * During late initialization this pass enables clockgating for hardware IPs;
2545 * during fini or suspend it disables clockgating.
2546 * Returns 0 on success, negative error code on failure.
2547 */
2548
2549 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2550 enum amd_clockgating_state state)
2551 {
2552 int i, j, r;
2553
2554 if (amdgpu_emu_mode == 1)
2555 return 0;
2556
2557 for (j = 0; j < adev->num_ip_blocks; j++) {
2558 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2559 if (!adev->ip_blocks[i].status.late_initialized)
2560 continue;
2561 /* skip CG for GFX, SDMA on S0ix */
2562 if (adev->in_s0ix &&
2563 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2564 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2565 continue;
2566 /* skip CG for VCE/UVD, it's handled specially */
2567 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2568 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2569 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2570 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2571 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2572 /* enable clockgating to save power */
2573 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2574 state);
2575 if (r) {
2576 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2577 adev->ip_blocks[i].version->funcs->name, r);
2578 return r;
2579 }
2580 }
2581 }
2582
2583 return 0;
2584 }
2585
2586 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2587 enum amd_powergating_state state)
2588 {
2589 int i, j, r;
2590
2591 if (amdgpu_emu_mode == 1)
2592 return 0;
2593
2594 for (j = 0; j < adev->num_ip_blocks; j++) {
2595 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2596 if (!adev->ip_blocks[i].status.late_initialized)
2597 continue;
2598 /* skip PG for GFX, SDMA on S0ix */
2599 if (adev->in_s0ix &&
2600 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2601 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2602 continue;
2603 /* skip PG for VCE/UVD, it's handled specially */
2604 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2605 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2606 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2607 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2608 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2609 /* enable powergating to save power */
2610 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2611 state);
2612 if (r) {
2613 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2614 adev->ip_blocks[i].version->funcs->name, r);
2615 return r;
2616 }
2617 }
2618 }
2619 return 0;
2620 }
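/*
 * Illustrative sketch (not part of the driver): the expected ordering of the
 * CG/PG helpers.  Gating is enabled at late init and undone in the reverse
 * order on suspend/fini, mirroring the calls elsewhere in this file.
 *
 *	// late init: enable gating to save power
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *
 *	// suspend/fini: disable gating before tearing down the hardware
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 */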
2621
2622 static int amdgpu_device_enable_mgpu_fan_boost(void)
2623 {
2624 struct amdgpu_gpu_instance *gpu_ins;
2625 struct amdgpu_device *adev;
2626 int i, ret = 0;
2627
2628 mutex_lock(&mgpu_info.mutex);
2629
2630 /*
2631 * MGPU fan boost feature should be enabled
2632 * only when there are two or more dGPUs in
2633 * the system
2634 */
2635 if (mgpu_info.num_dgpu < 2)
2636 goto out;
2637
2638 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2639 gpu_ins = &(mgpu_info.gpu_ins[i]);
2640 adev = gpu_ins->adev;
2641 if (!(adev->flags & AMD_IS_APU) &&
2642 !gpu_ins->mgpu_fan_enabled) {
2643 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2644 if (ret)
2645 break;
2646
2647 gpu_ins->mgpu_fan_enabled = 1;
2648 }
2649 }
2650
2651 out:
2652 mutex_unlock(&mgpu_info.mutex);
2653
2654 return ret;
2655 }
2656
2657 /**
2658 * amdgpu_device_ip_late_init - run late init for hardware IPs
2659 *
2660 * @adev: amdgpu_device pointer
2661 *
2662 * Late initialization pass for hardware IPs. The list of all the hardware
2663 * IPs that make up the asic is walked and the late_init callbacks are run.
2664 * late_init covers any special initialization that an IP requires
2665 * after all of the IPs have been initialized or something that needs to happen
2666 * late in the init process.
2667 * Returns 0 on success, negative error code on failure.
2668 */
2669 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2670 {
2671 struct amdgpu_gpu_instance *gpu_instance;
2672 int i = 0, r;
2673
2674 for (i = 0; i < adev->num_ip_blocks; i++) {
2675 if (!adev->ip_blocks[i].status.hw)
2676 continue;
2677 if (adev->ip_blocks[i].version->funcs->late_init) {
2678 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2679 if (r) {
2680 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2681 adev->ip_blocks[i].version->funcs->name, r);
2682 return r;
2683 }
2684 }
2685 adev->ip_blocks[i].status.late_initialized = true;
2686 }
2687
2688 r = amdgpu_ras_late_init(adev);
2689 if (r) {
2690 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2691 return r;
2692 }
2693
2694 amdgpu_ras_set_error_query_ready(adev, true);
2695
2696 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2697 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2698
2699 amdgpu_device_fill_reset_magic(adev);
2700
2701 r = amdgpu_device_enable_mgpu_fan_boost();
2702 if (r)
2703 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2704
2705 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
2706 if (amdgpu_passthrough(adev) &&
2707 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2708 adev->asic_type == CHIP_ALDEBARAN))
2709 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2710
2711 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2712 mutex_lock(&mgpu_info.mutex);
2713
2714 /*
2715 * Reset device p-state to low as this was booted with high.
2716 *
2717 * This should be performed only after all devices from the same
2718 * hive get initialized.
2719 *
2720 * However, the number of devices in the hive is unknown in advance,
2721 * as it is counted one by one as the devices initialize.
2722 *
2723 * So, we wait for all XGMI interlinked devices to be initialized.
2724 * This may bring some delays as those devices may come from
2725 * different hives. But that should be OK.
2726 */
2727 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2728 for (i = 0; i < mgpu_info.num_gpu; i++) {
2729 gpu_instance = &(mgpu_info.gpu_ins[i]);
2730 if (gpu_instance->adev->flags & AMD_IS_APU)
2731 continue;
2732
2733 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2734 AMDGPU_XGMI_PSTATE_MIN);
2735 if (r) {
2736 DRM_ERROR("pstate setting failed (%d).\n", r);
2737 break;
2738 }
2739 }
2740 }
2741
2742 mutex_unlock(&mgpu_info.mutex);
2743 }
2744
2745 return 0;
2746 }
2747
2748 /**
2749 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2750 *
2751 * @adev: amdgpu_device pointer
2752 *
2753 * For ASICs that need to disable the SMC first
2754 */
2755 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2756 {
2757 int i, r;
2758
2759 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2760 return;
2761
2762 for (i = 0; i < adev->num_ip_blocks; i++) {
2763 if (!adev->ip_blocks[i].status.hw)
2764 continue;
2765 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2766 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2767 /* XXX handle errors */
2768 if (r) {
2769 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2770 adev->ip_blocks[i].version->funcs->name, r);
2771 }
2772 adev->ip_blocks[i].status.hw = false;
2773 break;
2774 }
2775 }
2776 }
2777
2778 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2779 {
2780 int i, r;
2781
2782 for (i = 0; i < adev->num_ip_blocks; i++) {
2783 if (!adev->ip_blocks[i].version->funcs->early_fini)
2784 continue;
2785
2786 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2787 if (r) {
2788 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2789 adev->ip_blocks[i].version->funcs->name, r);
2790 }
2791 }
2792
2793 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2794 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2795
2796 amdgpu_amdkfd_suspend(adev, false);
2797
2798 /* Workaround for ASICs that need to disable the SMC first */
2799 amdgpu_device_smu_fini_early(adev);
2800
2801 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2802 if (!adev->ip_blocks[i].status.hw)
2803 continue;
2804
2805 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2806 /* XXX handle errors */
2807 if (r) {
2808 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2810 }
2811
2812 adev->ip_blocks[i].status.hw = false;
2813 }
2814
2815 if (amdgpu_sriov_vf(adev)) {
2816 if (amdgpu_virt_release_full_gpu(adev, false))
2817 DRM_ERROR("failed to release exclusive mode on fini\n");
2818 }
2819
2820 return 0;
2821 }
2822
2823 /**
2824 * amdgpu_device_ip_fini - run fini for hardware IPs
2825 *
2826 * @adev: amdgpu_device pointer
2827 *
2828 * Main teardown pass for hardware IPs. The list of all the hardware
2829 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2830 * are run. hw_fini tears down the hardware associated with each IP
2831 * and sw_fini tears down any software state associated with each IP.
2832 * Returns 0 on success, negative error code on failure.
2833 */
2834 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2835 {
2836 int i, r;
2837
2838 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2839 amdgpu_virt_release_ras_err_handler_data(adev);
2840
2841 if (adev->gmc.xgmi.num_physical_nodes > 1)
2842 amdgpu_xgmi_remove_device(adev);
2843
2844 amdgpu_amdkfd_device_fini_sw(adev);
2845
2846 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2847 if (!adev->ip_blocks[i].status.sw)
2848 continue;
2849
2850 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2851 amdgpu_ucode_free_bo(adev);
2852 amdgpu_free_static_csa(&adev->virt.csa_obj);
2853 amdgpu_device_wb_fini(adev);
2854 amdgpu_device_mem_scratch_fini(adev);
2855 amdgpu_ib_pool_fini(adev);
2856 }
2857
2858 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2859 /* XXX handle errors */
2860 if (r) {
2861 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2862 adev->ip_blocks[i].version->funcs->name, r);
2863 }
2864 adev->ip_blocks[i].status.sw = false;
2865 adev->ip_blocks[i].status.valid = false;
2866 }
2867
2868 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2869 if (!adev->ip_blocks[i].status.late_initialized)
2870 continue;
2871 if (adev->ip_blocks[i].version->funcs->late_fini)
2872 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2873 adev->ip_blocks[i].status.late_initialized = false;
2874 }
2875
2876 amdgpu_ras_fini(adev);
2877
2878 return 0;
2879 }
2880
2881 /**
2882 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2883 *
2884 * @work: work_struct.
2885 */
2886 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2887 {
2888 struct amdgpu_device *adev =
2889 container_of(work, struct amdgpu_device, delayed_init_work.work);
2890 int r;
2891
2892 r = amdgpu_ib_ring_tests(adev);
2893 if (r)
2894 DRM_ERROR("ib ring test failed (%d).\n", r);
2895 }
2896
2897 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2898 {
2899 struct amdgpu_device *adev =
2900 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2901
2902 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2903 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2904
2905 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2906 adev->gfx.gfx_off_state = true;
2907 }
2908
2909 /**
2910 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2911 *
2912 * @adev: amdgpu_device pointer
2913 *
2914 * Main suspend function for hardware IPs. The list of all the hardware
2915 * IPs that make up the asic is walked, clockgating is disabled and the
2916 * suspend callbacks are run. suspend puts the hardware and software state
2917 * in each IP into a state suitable for suspend.
2918 * Returns 0 on success, negative error code on failure.
2919 */
2920 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2921 {
2922 int i, r;
2923
2924 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2925 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2926
2927 /*
2928 * Per PMFW team's suggestion, the driver needs to handle gfxoff
2929 * and df cstate feature disablement for the gpu reset (e.g. Mode1Reset)
2930 * scenario. Add the missing df cstate disablement here.
2931 */
2932 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2933 dev_warn(adev->dev, "Failed to disallow df cstate");
2934
2935 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2936 if (!adev->ip_blocks[i].status.valid)
2937 continue;
2938
2939 /* displays are handled separately */
2940 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2941 continue;
2942
2943 /* XXX handle errors */
2944 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2945 /* XXX handle errors */
2946 if (r) {
2947 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2948 adev->ip_blocks[i].version->funcs->name, r);
2949 return r;
2950 }
2951
2952 adev->ip_blocks[i].status.hw = false;
2953 }
2954
2955 return 0;
2956 }
2957
2958 /**
2959 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2960 *
2961 * @adev: amdgpu_device pointer
2962 *
2963 * Main suspend function for hardware IPs. The list of all the hardware
2964 * IPs that make up the asic is walked, clockgating is disabled and the
2965 * suspend callbacks are run. suspend puts the hardware and software state
2966 * in each IP into a state suitable for suspend.
2967 * Returns 0 on success, negative error code on failure.
2968 */
2969 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2970 {
2971 int i, r;
2972
2973 if (adev->in_s0ix)
2974 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2975
2976 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2977 if (!adev->ip_blocks[i].status.valid)
2978 continue;
2979 /* displays are handled in phase1 */
2980 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2981 continue;
2982 /* PSP lost connection when err_event_athub occurs */
2983 if (amdgpu_ras_intr_triggered() &&
2984 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2985 adev->ip_blocks[i].status.hw = false;
2986 continue;
2987 }
2988
2989 /* skip unnecessary suspend if we have not initialized them yet */
2990 if (adev->gmc.xgmi.pending_reset &&
2991 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2992 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2995 adev->ip_blocks[i].status.hw = false;
2996 continue;
2997 }
2998
2999 /* skip suspend of gfx/mes and psp for S0ix
3000 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3001 * like at runtime. PSP is also part of the always on hardware
3002 * so no need to suspend it.
3003 */
3004 if (adev->in_s0ix &&
3005 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3006 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3008 continue;
3009
3010 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3011 if (adev->in_s0ix &&
3012 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3013 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3014 continue;
3015
3016 /* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3017 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3018 * from this location and RLC Autoload automatically also gets loaded
3019 * from here based on PMFW -> PSP message during re-init sequence.
3020 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3021 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3022 */
3023 if (amdgpu_in_reset(adev) &&
3024 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3025 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3026 continue;
3027
3028 /* XXX handle errors */
3029 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3030 /* XXX handle errors */
3031 if (r) {
3032 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3033 adev->ip_blocks[i].version->funcs->name, r);
3034 }
3035 adev->ip_blocks[i].status.hw = false;
3036 /* handle putting the SMC in the appropriate state */
3037 if (!amdgpu_sriov_vf(adev)) {
3038 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3039 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3040 if (r) {
3041 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3042 adev->mp1_state, r);
3043 return r;
3044 }
3045 }
3046 }
3047 }
3048
3049 return 0;
3050 }
3051
3052 /**
3053 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3054 *
3055 * @adev: amdgpu_device pointer
3056 *
3057 * Main suspend function for hardware IPs. The list of all the hardware
3058 * IPs that make up the asic is walked, clockgating is disabled and the
3059 * suspend callbacks are run. suspend puts the hardware and software state
3060 * in each IP into a state suitable for suspend.
3061 * Returns 0 on success, negative error code on failure.
3062 */
3063 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3064 {
3065 int r;
3066
3067 if (amdgpu_sriov_vf(adev)) {
3068 amdgpu_virt_fini_data_exchange(adev);
3069 amdgpu_virt_request_full_gpu(adev, false);
3070 }
3071
3072 r = amdgpu_device_ip_suspend_phase1(adev);
3073 if (r)
3074 return r;
3075 r = amdgpu_device_ip_suspend_phase2(adev);
3076
3077 if (amdgpu_sriov_vf(adev))
3078 amdgpu_virt_release_full_gpu(adev, false);
3079
3080 return r;
3081 }
3082
3083 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3084 {
3085 int i, r;
3086
3087 static enum amd_ip_block_type ip_order[] = {
3088 AMD_IP_BLOCK_TYPE_COMMON,
3089 AMD_IP_BLOCK_TYPE_GMC,
3090 AMD_IP_BLOCK_TYPE_PSP,
3091 AMD_IP_BLOCK_TYPE_IH,
3092 };
3093
3094 for (i = 0; i < adev->num_ip_blocks; i++) {
3095 int j;
3096 struct amdgpu_ip_block *block;
3097
3098 block = &adev->ip_blocks[i];
3099 block->status.hw = false;
3100
3101 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3102
3103 if (block->version->type != ip_order[j] ||
3104 !block->status.valid)
3105 continue;
3106
3107 r = block->version->funcs->hw_init(adev);
3108 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3109 if (r)
3110 return r;
3111 block->status.hw = true;
3112 }
3113 }
3114
3115 return 0;
3116 }
3117
3118 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3119 {
3120 int i, r;
3121
3122 static enum amd_ip_block_type ip_order[] = {
3123 AMD_IP_BLOCK_TYPE_SMC,
3124 AMD_IP_BLOCK_TYPE_DCE,
3125 AMD_IP_BLOCK_TYPE_GFX,
3126 AMD_IP_BLOCK_TYPE_SDMA,
3127 AMD_IP_BLOCK_TYPE_MES,
3128 AMD_IP_BLOCK_TYPE_UVD,
3129 AMD_IP_BLOCK_TYPE_VCE,
3130 AMD_IP_BLOCK_TYPE_VCN,
3131 AMD_IP_BLOCK_TYPE_JPEG
3132 };
3133
3134 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3135 int j;
3136 struct amdgpu_ip_block *block;
3137
3138 for (j = 0; j < adev->num_ip_blocks; j++) {
3139 block = &adev->ip_blocks[j];
3140
3141 if (block->version->type != ip_order[i] ||
3142 !block->status.valid ||
3143 block->status.hw)
3144 continue;
3145
3146 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3147 r = block->version->funcs->resume(adev);
3148 else
3149 r = block->version->funcs->hw_init(adev);
3150
3151 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3152 if (r)
3153 return r;
3154 block->status.hw = true;
3155 }
3156 }
3157
3158 return 0;
3159 }
3160
3161 /**
3162 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3163 *
3164 * @adev: amdgpu_device pointer
3165 *
3166 * First resume function for hardware IPs. The list of all the hardware
3167 * IPs that make up the asic is walked and the resume callbacks are run for
3168 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3169 * after a suspend and updates the software state as necessary. This
3170 * function is also used for restoring the GPU after a GPU reset.
3171 * Returns 0 on success, negative error code on failure.
3172 */
3173 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3174 {
3175 int i, r;
3176
3177 for (i = 0; i < adev->num_ip_blocks; i++) {
3178 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3179 continue;
3180 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3181 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3182 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3183 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3184
3185 r = adev->ip_blocks[i].version->funcs->resume(adev);
3186 if (r) {
3187 DRM_ERROR("resume of IP block <%s> failed %d\n",
3188 adev->ip_blocks[i].version->funcs->name, r);
3189 return r;
3190 }
3191 adev->ip_blocks[i].status.hw = true;
3192 }
3193 }
3194
3195 return 0;
3196 }
3197
3198 /**
3199 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3200 *
3201 * @adev: amdgpu_device pointer
3202 *
3203 * Second resume function for hardware IPs. The list of all the hardware
3204 * IPs that make up the asic is walked and the resume callbacks are run for
3205 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3206 * functional state after a suspend and updates the software state as
3207 * necessary. This function is also used for restoring the GPU after a GPU
3208 * reset.
3209 * Returns 0 on success, negative error code on failure.
3210 */
3211 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3212 {
3213 int i, r;
3214
3215 for (i = 0; i < adev->num_ip_blocks; i++) {
3216 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3217 continue;
3218 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3222 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3223 continue;
3224 r = adev->ip_blocks[i].version->funcs->resume(adev);
3225 if (r) {
3226 DRM_ERROR("resume of IP block <%s> failed %d\n",
3227 adev->ip_blocks[i].version->funcs->name, r);
3228 return r;
3229 }
3230 adev->ip_blocks[i].status.hw = true;
3231 }
3232
3233 return 0;
3234 }
3235
3236 /**
3237 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3238 *
3239 * @adev: amdgpu_device pointer
3240 *
3241 * Third resume function for hardware IPs. The list of all the hardware
3242 * IPs that make up the asic is walked and the resume callbacks are run for
3243 * all DCE. resume puts the hardware into a functional state after a suspend
3244 * and updates the software state as necessary. This function is also used
3245 * for restoring the GPU after a GPU reset.
3246 *
3247 * Returns 0 on success, negative error code on failure.
3248 */
3249 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3250 {
3251 int i, r;
3252
3253 for (i = 0; i < adev->num_ip_blocks; i++) {
3254 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3255 continue;
3256 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3257 r = adev->ip_blocks[i].version->funcs->resume(adev);
3258 if (r) {
3259 DRM_ERROR("resume of IP block <%s> failed %d\n",
3260 adev->ip_blocks[i].version->funcs->name, r);
3261 return r;
3262 }
3263 adev->ip_blocks[i].status.hw = true;
3264 }
3265 }
3266
3267 return 0;
3268 }
3269
3270 /**
3271 * amdgpu_device_ip_resume - run resume for hardware IPs
3272 *
3273 * @adev: amdgpu_device pointer
3274 *
3275 * Main resume function for hardware IPs. The hardware IPs
3276 * are split into multiple resume functions because they are
3277 * also used in recovering from a GPU reset and some additional
3278 * steps need to be taken between them. In this case (S3/S4) they are
3279 * run sequentially.
3280 * Returns 0 on success, negative error code on failure.
3281 */
3282 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3283 {
3284 int r;
3285
3286 r = amdgpu_device_ip_resume_phase1(adev);
3287 if (r)
3288 return r;
3289
3290 r = amdgpu_device_fw_loading(adev);
3291 if (r)
3292 return r;
3293
3294 r = amdgpu_device_ip_resume_phase2(adev);
3295
3296 if (r)
3297 return r;
3298
3299 amdgpu_fence_driver_hw_init(adev);
3300
3301 r = amdgpu_device_ip_resume_phase3(adev);
3302
3303 return r;
3304 }
3305
3306 /**
3307 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3308 *
3309 * @adev: amdgpu_device pointer
3310 *
3311 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3312 */
3313 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3314 {
3315 if (amdgpu_sriov_vf(adev)) {
3316 if (adev->is_atom_fw) {
3317 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3318 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3319 } else {
3320 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3321 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3322 }
3323
3324 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3325 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3326 }
3327 }
3328
3329 /**
3330 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3331 *
3332 * @asic_type: AMD asic type
3333 *
3334 * Check if there is DC (new modesetting infrastructure) support for an asic.
3335 * returns true if DC has support, false if not.
3336 */
3337 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3338 {
3339 switch (asic_type) {
3340 #ifdef CONFIG_DRM_AMDGPU_SI
3341 case CHIP_HAINAN:
3342 #endif
3343 case CHIP_TOPAZ:
3344 /* chips with no display hardware */
3345 return false;
3346 #if defined(CONFIG_DRM_AMD_DC)
3347 case CHIP_TAHITI:
3348 case CHIP_PITCAIRN:
3349 case CHIP_VERDE:
3350 case CHIP_OLAND:
3351 /*
3352 * We have systems in the wild with these ASICs that require
3353 * LVDS and VGA support which is not supported with DC.
3354 *
3355 * Fallback to the non-DC driver here by default so as not to
3356 * cause regressions.
3357 */
3358 #if defined(CONFIG_DRM_AMD_DC_SI)
3359 return amdgpu_dc > 0;
3360 #else
3361 return false;
3362 #endif
3363 case CHIP_BONAIRE:
3364 case CHIP_KAVERI:
3365 case CHIP_KABINI:
3366 case CHIP_MULLINS:
3367 /*
3368 * We have systems in the wild with these ASICs that require
3369 * VGA support which is not supported with DC.
3370 *
3371 * Fallback to the non-DC driver here by default so as not to
3372 * cause regressions.
3373 */
3374 return amdgpu_dc > 0;
3375 default:
3376 return amdgpu_dc != 0;
3377 #else
3378 default:
3379 if (amdgpu_dc > 0)
3380 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3381 return false;
3382 #endif
3383 }
3384 }
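/*
 * Illustrative sketch (not part of the driver): choosing a display path based
 * on DC support for the probed ASIC type.
 *
 *	if (amdgpu_device_asic_has_dc_support(adev->asic_type))
 *		;	// use the DC (display core) modesetting path
 *	else
 *		;	// fall back to the legacy display code
 */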
3385
3386 /**
3387 * amdgpu_device_has_dc_support - check if dc is supported
3388 *
3389 * @adev: amdgpu_device pointer
3390 *
3391 * Returns true for supported, false for not supported
3392 */
3393 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3394 {
3395 if (adev->enable_virtual_display ||
3396 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3397 return false;
3398
3399 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3400 }
3401
3402 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3403 {
3404 struct amdgpu_device *adev =
3405 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3406 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3407
3408 /* It's a bug to not have a hive within this function */
3409 if (WARN_ON(!hive))
3410 return;
3411
3412 /*
3413 * Use task barrier to synchronize all xgmi reset works across the
3414 * hive. task_barrier_enter and task_barrier_exit will block
3415 * until all the threads running the xgmi reset works reach
3416 * those points. task_barrier_full will do both blocks.
3417 */
3418 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3419
3420 task_barrier_enter(&hive->tb);
3421 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3422
3423 if (adev->asic_reset_res)
3424 goto fail;
3425
3426 task_barrier_exit(&hive->tb);
3427 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3428
3429 if (adev->asic_reset_res)
3430 goto fail;
3431
3432 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3433 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3434 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3435 } else {
3436
3437 task_barrier_full(&hive->tb);
3438 adev->asic_reset_res = amdgpu_asic_reset(adev);
3439 }
3440
3441 fail:
3442 if (adev->asic_reset_res)
3443 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3444 adev->asic_reset_res, adev_to_drm(adev)->unique);
3445 amdgpu_put_xgmi_hive(hive);
3446 }
3447
3448 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3449 {
3450 char *input = amdgpu_lockup_timeout;
3451 char *timeout_setting = NULL;
3452 int index = 0;
3453 long timeout;
3454 int ret = 0;
3455
3456 /*
3457 * By default the timeout for non-compute jobs is 10000 ms
3458 * and 60000 ms for compute jobs.
3459 * In SR-IOV or passthrough mode, the timeout for compute
3460 * jobs is 60000 ms by default.
3461 */
3462 adev->gfx_timeout = msecs_to_jiffies(10000);
3463 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3464 if (amdgpu_sriov_vf(adev))
3465 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3466 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3467 else
3468 adev->compute_timeout = msecs_to_jiffies(60000);
3469
3470 #ifdef notyet
3471 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3472 while ((timeout_setting = strsep(&input, ",")) &&
3473 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3474 ret = kstrtol(timeout_setting, 0, &timeout);
3475 if (ret)
3476 return ret;
3477
3478 if (timeout == 0) {
3479 index++;
3480 continue;
3481 } else if (timeout < 0) {
3482 timeout = MAX_SCHEDULE_TIMEOUT;
3483 dev_warn(adev->dev, "lockup timeout disabled");
3484 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3485 } else {
3486 timeout = msecs_to_jiffies(timeout);
3487 }
3488
3489 switch (index++) {
3490 case 0:
3491 adev->gfx_timeout = timeout;
3492 break;
3493 case 1:
3494 adev->compute_timeout = timeout;
3495 break;
3496 case 2:
3497 adev->sdma_timeout = timeout;
3498 break;
3499 case 3:
3500 adev->video_timeout = timeout;
3501 break;
3502 default:
3503 break;
3504 }
3505 }
3506 /*
3507 * There is only one value specified and
3508 * it should apply to all non-compute jobs.
3509 */
3510 if (index == 1) {
3511 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3512 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3513 adev->compute_timeout = adev->gfx_timeout;
3514 }
3515 }
3516 #endif
3517
3518 return ret;
3519 }
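/*
 * Illustrative sketch (not part of the driver): the lockup_timeout module
 * parameter parsed above takes up to four comma-separated values in
 * milliseconds, in the order gfx, compute, sdma, video; 0 keeps the default
 * and a negative value disables the timeout.  A hypothetical invocation:
 *
 *	modprobe amdgpu lockup_timeout=10000,60000,10000,10000
 *
 * A single value applies to all non-compute queues.
 */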
3520
3521 /**
3522 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3523 *
3524 * @adev: amdgpu_device pointer
3525 *
3526 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3527 */
3528 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3529 {
3530 #ifdef notyet
3531 struct iommu_domain *domain;
3532
3533 domain = iommu_get_domain_for_dev(adev->dev);
3534 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3535 #endif
3536 adev->ram_is_direct_mapped = true;
3537 }
3538
3539 static const struct attribute *amdgpu_dev_attributes[] = {
3540 &dev_attr_pcie_replay_count.attr,
3541 NULL
3542 };
3543
3544 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3545 {
3546 if (amdgpu_mcbp == 1)
3547 adev->gfx.mcbp = true;
3548 else if (amdgpu_mcbp == 0)
3549 adev->gfx.mcbp = false;
3550
3551 if (amdgpu_sriov_vf(adev))
3552 adev->gfx.mcbp = true;
3553
3554 if (adev->gfx.mcbp)
3555 DRM_INFO("MCBP is enabled\n");
3556 }
3557
3558 /**
3559 * amdgpu_device_init - initialize the driver
3560 *
3561 * @adev: amdgpu_device pointer
3562 * @flags: driver flags
3563 *
3564 * Initializes the driver info and hw (all asics).
3565 * Returns 0 for success or an error on failure.
3566 * Called at driver startup.
3567 */
3568 int amdgpu_device_init(struct amdgpu_device *adev,
3569 uint32_t flags)
3570 {
3571 struct drm_device *ddev = adev_to_drm(adev);
3572 struct pci_dev *pdev = adev->pdev;
3573 int r, i;
3574 bool px = false;
3575 u32 max_MBps;
3576 int tmp;
3577
3578 adev->shutdown = false;
3579 adev->flags = flags;
3580
3581 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3582 adev->asic_type = amdgpu_force_asic_type;
3583 else
3584 adev->asic_type = flags & AMD_ASIC_MASK;
3585
3586 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3587 if (amdgpu_emu_mode == 1)
3588 adev->usec_timeout *= 10;
3589 adev->gmc.gart_size = 512 * 1024 * 1024;
3590 adev->accel_working = false;
3591 adev->num_rings = 0;
3592 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3593 adev->mman.buffer_funcs = NULL;
3594 adev->mman.buffer_funcs_ring = NULL;
3595 adev->vm_manager.vm_pte_funcs = NULL;
3596 adev->vm_manager.vm_pte_num_scheds = 0;
3597 adev->gmc.gmc_funcs = NULL;
3598 adev->harvest_ip_mask = 0x0;
3599 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3600 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3601
3602 adev->smc_rreg = &amdgpu_invalid_rreg;
3603 adev->smc_wreg = &amdgpu_invalid_wreg;
3604 adev->pcie_rreg = &amdgpu_invalid_rreg;
3605 adev->pcie_wreg = &amdgpu_invalid_wreg;
3606 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3607 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3608 adev->pciep_rreg = &amdgpu_invalid_rreg;
3609 adev->pciep_wreg = &amdgpu_invalid_wreg;
3610 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3611 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3612 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3613 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3614 adev->didt_rreg = &amdgpu_invalid_rreg;
3615 adev->didt_wreg = &amdgpu_invalid_wreg;
3616 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3617 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3618 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3619 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3620
3621 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3622 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3623 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3624
3625 	/* mutex initializations are all done here so we
3626 	 * can call these functions again without locking issues
3627 */
3628 rw_init(&adev->firmware.mutex, "agfw");
3629 rw_init(&adev->pm.mutex, "agpm");
3630 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3631 rw_init(&adev->srbm_mutex, "srbm");
3632 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3633 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3634 rw_init(&adev->gfx.partition_mutex, "gfxpar");
3635 rw_init(&adev->grbm_idx_mutex, "grbmidx");
3636 rw_init(&adev->mn_lock, "agpumn");
3637 rw_init(&adev->virt.vf_errors.lock, "vferr");
3638 rw_init(&adev->virt.rlcg_reg_lock, "vrlcg");
3639 hash_init(adev->mn_hash);
3640 rw_init(&adev->psp.mutex, "agpsp");
3641 rw_init(&adev->notifier_lock, "agnf");
3642 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps");
3643 rw_init(&adev->benchmark_mutex, "agbm");
3644
3645 amdgpu_device_init_apu_flags(adev);
3646
3647 r = amdgpu_device_check_arguments(adev);
3648 if (r)
3649 return r;
3650
3651 mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3652 mtx_init(&adev->smc_idx_lock, IPL_TTY);
3653 mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3654 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3655 mtx_init(&adev->didt_idx_lock, IPL_TTY);
3656 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3657 mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3658 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3659 mtx_init(&adev->mm_stats.lock, IPL_NONE);
3660
3661 INIT_LIST_HEAD(&adev->shadow_list);
3662 rw_init(&adev->shadow_list_lock, "sdwlst");
3663
3664 INIT_LIST_HEAD(&adev->reset_list);
3665
3666 INIT_LIST_HEAD(&adev->ras_list);
3667
3668 INIT_DELAYED_WORK(&adev->delayed_init_work,
3669 amdgpu_device_delayed_init_work_handler);
3670 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3671 amdgpu_device_delay_enable_gfx_off);
3672
3673 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3674
3675 adev->gfx.gfx_off_req_count = 1;
3676 adev->gfx.gfx_off_residency = 0;
3677 adev->gfx.gfx_off_entrycount = 0;
3678 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3679
3680 atomic_set(&adev->throttling_logging_enabled, 1);
3681 /*
3682 * If throttling continues, logging will be performed every minute
3683 * to avoid log flooding. "-1" is subtracted since the thermal
3684 * throttling interrupt comes every second. Thus, the total logging
3685 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3686 	 * for the throttling interrupt) = 60 seconds.
3687 */
3688 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3689 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3690
3691 #ifdef __linux__
3692 /* Registers mapping */
3693 /* TODO: block userspace mapping of io register */
3694 if (adev->asic_type >= CHIP_BONAIRE) {
3695 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3696 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3697 } else {
3698 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3699 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3700 }
3701 #endif
3702
3703 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3704 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3705
3706 #ifdef __linux__
3707 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3708 if (!adev->rmmio)
3709 return -ENOMEM;
3710 #endif
3711 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3712 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3713
3714 /*
3715 	 * The reset domain needs to be present early, before the XGMI hive is
3716 	 * discovered (if any) and initialized, so the reset sem and in_gpu_reset
3717 	 * flag can be used early during init and before any call to RREG32.
3718 */
3719 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3720 if (!adev->reset_domain)
3721 return -ENOMEM;
3722
3723 /* detect hw virtualization here */
3724 amdgpu_detect_virtualization(adev);
3725
3726 amdgpu_device_get_pcie_info(adev);
3727
3728 r = amdgpu_device_get_job_timeout_settings(adev);
3729 if (r) {
3730 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3731 return r;
3732 }
3733
3734 /* early init functions */
3735 r = amdgpu_device_ip_early_init(adev);
3736 if (r)
3737 return r;
3738
3739 amdgpu_device_set_mcbp(adev);
3740
3741 /* Get rid of things like offb */
3742 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3743 if (r)
3744 return r;
3745
3746 /* Enable TMZ based on IP_VERSION */
3747 amdgpu_gmc_tmz_set(adev);
3748
3749 amdgpu_gmc_noretry_set(adev);
3750 	/* Need to get xgmi info early to decide the reset behavior */
3751 if (adev->gmc.xgmi.supported) {
3752 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3753 if (r)
3754 return r;
3755 }
3756
3757 /* enable PCIE atomic ops */
3758 #ifdef notyet
3759 if (amdgpu_sriov_vf(adev)) {
3760 if (adev->virt.fw_reserve.p_pf2vf)
3761 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3762 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3763 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3764 		/* APUs with gfx9 onwards don't rely on PCIe atomics; an internal
3765 		 * path natively supports atomics, so set have_atomics_support to true.
3766 */
3767 } else if ((adev->flags & AMD_IS_APU) &&
3768 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3769 adev->have_atomics_support = true;
3770 } else {
3771 adev->have_atomics_support =
3772 !pci_enable_atomic_ops_to_root(adev->pdev,
3773 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3774 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3775 }
3776
3777 if (!adev->have_atomics_support)
3778 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3779 #else
3780 	/* APUs with gfx9 onwards don't rely on PCIe atomics; an internal
3781 	 * path natively supports atomics, so set have_atomics_support to true.
3782 */
3783 if ((adev->flags & AMD_IS_APU) &&
3784 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)))
3785 adev->have_atomics_support = true;
3786 else
3787 adev->have_atomics_support = false;
3788 #endif
3789
3790 	/* doorbell bar mapping and doorbell index init */
3791 amdgpu_doorbell_init(adev);
3792
3793 if (amdgpu_emu_mode == 1) {
3794 /* post the asic on emulation mode */
3795 emu_soc_asic_init(adev);
3796 goto fence_driver_init;
3797 }
3798
3799 amdgpu_reset_init(adev);
3800
3801 /* detect if we are with an SRIOV vbios */
3802 if (adev->bios)
3803 amdgpu_device_detect_sriov_bios(adev);
3804
3805 /* check if we need to reset the asic
3806 * E.g., driver was not cleanly unloaded previously, etc.
3807 */
3808 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3809 if (adev->gmc.xgmi.num_physical_nodes) {
3810 dev_info(adev->dev, "Pending hive reset.\n");
3811 adev->gmc.xgmi.pending_reset = true;
3812 /* Only need to init necessary block for SMU to handle the reset */
3813 for (i = 0; i < adev->num_ip_blocks; i++) {
3814 if (!adev->ip_blocks[i].status.valid)
3815 continue;
3816 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3817 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3818 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3819 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3820 DRM_DEBUG("IP %s disabled for hw_init.\n",
3821 adev->ip_blocks[i].version->funcs->name);
3822 adev->ip_blocks[i].status.hw = true;
3823 }
3824 }
3825 } else {
3826 tmp = amdgpu_reset_method;
3827 /* It should do a default reset when loading or reloading the driver,
3828 * regardless of the module parameter reset_method.
3829 */
3830 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3831 r = amdgpu_asic_reset(adev);
3832 amdgpu_reset_method = tmp;
3833 if (r) {
3834 dev_err(adev->dev, "asic reset on init failed\n");
3835 goto failed;
3836 }
3837 }
3838 }
3839
3840 /* Post card if necessary */
3841 if (amdgpu_device_need_post(adev)) {
3842 if (!adev->bios) {
3843 dev_err(adev->dev, "no vBIOS found\n");
3844 r = -EINVAL;
3845 goto failed;
3846 }
3847 DRM_INFO("GPU posting now...\n");
3848 r = amdgpu_device_asic_init(adev);
3849 if (r) {
3850 dev_err(adev->dev, "gpu post error!\n");
3851 goto failed;
3852 }
3853 }
3854
3855 if (adev->bios) {
3856 if (adev->is_atom_fw) {
3857 /* Initialize clocks */
3858 r = amdgpu_atomfirmware_get_clock_info(adev);
3859 if (r) {
3860 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3861 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3862 goto failed;
3863 }
3864 } else {
3865 /* Initialize clocks */
3866 r = amdgpu_atombios_get_clock_info(adev);
3867 if (r) {
3868 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3869 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3870 goto failed;
3871 }
3872 /* init i2c buses */
3873 if (!amdgpu_device_has_dc_support(adev))
3874 amdgpu_atombios_i2c_init(adev);
3875 }
3876 }
3877
3878 fence_driver_init:
3879 /* Fence driver */
3880 r = amdgpu_fence_driver_sw_init(adev);
3881 if (r) {
3882 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3883 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3884 goto failed;
3885 }
3886
3887 /* init the mode config */
3888 drm_mode_config_init(adev_to_drm(adev));
3889
3890 r = amdgpu_device_ip_init(adev);
3891 if (r) {
3892 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3893 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3894 goto release_ras_con;
3895 }
3896
3897 amdgpu_fence_driver_hw_init(adev);
3898
3899 dev_info(adev->dev,
3900 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3901 adev->gfx.config.max_shader_engines,
3902 adev->gfx.config.max_sh_per_se,
3903 adev->gfx.config.max_cu_per_sh,
3904 adev->gfx.cu_info.number);
3905
3906 #ifdef __OpenBSD__
3907 {
3908 const char *chip_name;
3909 uint32_t version = adev->ip_versions[GC_HWIP][0];
3910 int maj, min, rev;
3911
3912 switch (adev->asic_type) {
3913 case CHIP_RAVEN:
3914 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3915 chip_name = "RAVEN2";
3916 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3917 chip_name = "PICASSO";
3918 else
3919 chip_name = "RAVEN";
3920 break;
3921 case CHIP_RENOIR:
3922 if (adev->apu_flags & AMD_APU_IS_RENOIR)
3923 chip_name = "RENOIR";
3924 else
3925 chip_name = "GREEN_SARDINE";
3926 break;
3927 default:
3928 chip_name = amdgpu_asic_name[adev->asic_type];
3929 }
3930
3931 printf("%s: %s", adev->self.dv_xname, chip_name);
3932 /* show graphics/compute ip block version, not set on < GFX9 */
3933 if (version) {
3934 maj = IP_VERSION_MAJ(version);
3935 min = IP_VERSION_MIN(version);
3936 rev = IP_VERSION_REV(version);
3937 printf(" GC %d.%d.%d", maj, min, rev);
3938 }
3939 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id);
3940 }
3941 #endif
3942
3943 adev->accel_working = true;
3944
3945 amdgpu_vm_check_compute_bug(adev);
3946
3947 /* Initialize the buffer migration limit. */
3948 if (amdgpu_moverate >= 0)
3949 max_MBps = amdgpu_moverate;
3950 else
3951 max_MBps = 8; /* Allow 8 MB/s. */
3952 /* Get a log2 for easy divisions. */
3953 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
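	/*
	 * Illustrative: with the default max_MBps = 8, log2_max_MBps = 3, so
	 * consumers of mm_stats can scale by the allowed rate with a shift by
	 * 3 instead of a multiply or divide by 8.
	 */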
3954
3955 r = amdgpu_atombios_sysfs_init(adev);
3956 if (r)
3957 drm_err(&adev->ddev,
3958 "registering atombios sysfs failed (%d).\n", r);
3959
3960 r = amdgpu_pm_sysfs_init(adev);
3961 if (r)
3962 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3963
3964 r = amdgpu_ucode_sysfs_init(adev);
3965 if (r) {
3966 adev->ucode_sysfs_en = false;
3967 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3968 } else
3969 adev->ucode_sysfs_en = true;
3970
3971 /*
3972 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3973 	 * Otherwise the mgpu fan boost feature will be skipped because the
3974 	 * gpu instance count would be too low.
3975 */
3976 amdgpu_register_gpu_instance(adev);
3977
3978 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3979 * explicit gating rather than handling it automatically.
3980 */
3981 if (!adev->gmc.xgmi.pending_reset) {
3982 r = amdgpu_device_ip_late_init(adev);
3983 if (r) {
3984 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3985 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3986 goto release_ras_con;
3987 }
3988 /* must succeed. */
3989 amdgpu_ras_resume(adev);
3990 queue_delayed_work(system_wq, &adev->delayed_init_work,
3991 msecs_to_jiffies(AMDGPU_RESUME_MS));
3992 }
3993
3994 if (amdgpu_sriov_vf(adev)) {
3995 amdgpu_virt_release_full_gpu(adev, true);
3996 flush_delayed_work(&adev->delayed_init_work);
3997 }
3998
3999 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4000 if (r)
4001 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4002
4003 amdgpu_fru_sysfs_init(adev);
4004
4005 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4006 r = amdgpu_pmu_init(adev);
4007 if (r)
4008 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4009
4010 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4011 if (amdgpu_device_cache_pci_state(adev->pdev))
4012 pci_restore_state(pdev);
4013
4014 	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
4015 /* this will fail for cards that aren't VGA class devices, just
4016 * ignore it
4017 */
4018 #ifdef notyet
4019 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4020 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4021 #endif
4022
4023 px = amdgpu_device_supports_px(ddev);
4024
4025 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4026 apple_gmux_detect(NULL, NULL)))
4027 vga_switcheroo_register_client(adev->pdev,
4028 &amdgpu_switcheroo_ops, px);
4029
4030 if (px)
4031 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4032
4033 if (adev->gmc.xgmi.pending_reset)
4034 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4035 msecs_to_jiffies(AMDGPU_RESUME_MS));
4036
4037 amdgpu_device_check_iommu_direct_map(adev);
4038
4039 return 0;
4040
4041 release_ras_con:
4042 if (amdgpu_sriov_vf(adev))
4043 amdgpu_virt_release_full_gpu(adev, true);
4044
4045 /* failed in exclusive mode due to timeout */
4046 if (amdgpu_sriov_vf(adev) &&
4047 !amdgpu_sriov_runtime(adev) &&
4048 amdgpu_virt_mmio_blocked(adev) &&
4049 !amdgpu_virt_wait_reset(adev)) {
4050 dev_err(adev->dev, "VF exclusive mode timeout\n");
4051 /* Don't send request since VF is inactive. */
4052 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4053 adev->virt.ops = NULL;
4054 r = -EAGAIN;
4055 }
4056 amdgpu_release_ras_context(adev);
4057
4058 failed:
4059 amdgpu_vf_error_trans_all(adev);
4060
4061 return r;
4062 }
4063
4064 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4065 {
4066 STUB();
4067 #ifdef notyet
4068
4069 /* Clear all CPU mappings pointing to this device */
4070 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4071 #endif
4072
4073 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4074 amdgpu_doorbell_fini(adev);
4075
4076 #ifdef __linux__
4077 iounmap(adev->rmmio);
4078 adev->rmmio = NULL;
4079 if (adev->mman.aper_base_kaddr)
4080 iounmap(adev->mman.aper_base_kaddr);
4081 adev->mman.aper_base_kaddr = NULL;
4082 #else
4083 if (adev->rmmio_size > 0)
4084 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4085 adev->rmmio_size);
4086 adev->rmmio_size = 0;
4087 adev->rmmio = NULL;
4088 if (adev->mman.aper_base_kaddr)
4089 bus_space_unmap(adev->memt, adev->mman.aper_bsh,
4090 adev->gmc.visible_vram_size);
4091 adev->mman.aper_base_kaddr = NULL;
4092 #endif
4093
4094 /* Memory manager related */
4095 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4096 #ifdef __linux__
4097 arch_phys_wc_del(adev->gmc.vram_mtrr);
4098 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4099 #else
4100 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
4101 #endif
4102 }
4103 }
4104
4105 /**
4106 * amdgpu_device_fini_hw - tear down the driver
4107 *
4108 * @adev: amdgpu_device pointer
4109 *
4110 * Tear down the driver info (all asics).
4111 * Called at driver shutdown.
4112 */
4113 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4114 {
4115 dev_info(adev->dev, "amdgpu: finishing device.\n");
4116 flush_delayed_work(&adev->delayed_init_work);
4117 adev->shutdown = true;
4118
4119 /* make sure IB test finished before entering exclusive mode
4120 * to avoid preemption on IB test
4121 */
4122 if (amdgpu_sriov_vf(adev)) {
4123 amdgpu_virt_request_full_gpu(adev, false);
4124 amdgpu_virt_fini_data_exchange(adev);
4125 }
4126
4127 /* disable all interrupts */
4128 amdgpu_irq_disable_all(adev);
4129 if (adev->mode_info.mode_config_initialized) {
4130 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4131 drm_helper_force_disable_all(adev_to_drm(adev));
4132 else
4133 drm_atomic_helper_shutdown(adev_to_drm(adev));
4134 }
4135 amdgpu_fence_driver_hw_fini(adev);
4136
4137 if (adev->mman.initialized)
4138 drain_workqueue(adev->mman.bdev.wq);
4139
4140 if (adev->pm.sysfs_initialized)
4141 amdgpu_pm_sysfs_fini(adev);
4142 if (adev->ucode_sysfs_en)
4143 amdgpu_ucode_sysfs_fini(adev);
4144 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4145 amdgpu_fru_sysfs_fini(adev);
4146
4147 /* disable ras feature must before hw fini */
4148 amdgpu_ras_pre_fini(adev);
4149
4150 amdgpu_device_ip_fini_early(adev);
4151
4152 amdgpu_irq_fini_hw(adev);
4153
4154 if (adev->mman.initialized)
4155 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4156
4157 amdgpu_gart_dummy_page_fini(adev);
4158
4159 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4160 amdgpu_device_unmap_mmio(adev);
4161
4162 }
4163
4164 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4165 {
4166 int idx;
4167 bool px;
4168
4169 amdgpu_device_ip_fini(adev);
4170 amdgpu_fence_driver_sw_fini(adev);
4171 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4172 adev->accel_working = false;
4173 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4174
4175 amdgpu_reset_fini(adev);
4176
4177 /* free i2c buses */
4178 if (!amdgpu_device_has_dc_support(adev))
4179 amdgpu_i2c_fini(adev);
4180
4181 if (amdgpu_emu_mode != 1)
4182 amdgpu_atombios_fini(adev);
4183
4184 kfree(adev->bios);
4185 adev->bios = NULL;
4186
4187 px = amdgpu_device_supports_px(adev_to_drm(adev));
4188
4189 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4190 apple_gmux_detect(NULL, NULL)))
4191 vga_switcheroo_unregister_client(adev->pdev);
4192
4193 if (px)
4194 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4195
4196 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4197 vga_client_unregister(adev->pdev);
4198
4199 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4200 #ifdef __linux__
4201 iounmap(adev->rmmio);
4202 adev->rmmio = NULL;
4203 #else
4204 if (adev->rmmio_size > 0)
4205 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4206 adev->rmmio_size);
4207 adev->rmmio_size = 0;
4208 adev->rmmio = NULL;
4209 #endif
4210 amdgpu_doorbell_fini(adev);
4211 drm_dev_exit(idx);
4212 }
4213
4214 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4215 amdgpu_pmu_fini(adev);
4216 if (adev->mman.discovery_bin)
4217 amdgpu_discovery_fini(adev);
4218
4219 amdgpu_reset_put_reset_domain(adev->reset_domain);
4220 adev->reset_domain = NULL;
4221
4222 kfree(adev->pci_state);
4223
4224 }
4225
4226 /**
4227 * amdgpu_device_evict_resources - evict device resources
4228 * @adev: amdgpu device object
4229 *
4230  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4231 * of the vram memory type. Mainly used for evicting device resources
4232 * at suspend time.
4233 *
4234 */
4235 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4236 {
4237 int ret;
4238
4239 /* No need to evict vram on APUs for suspend to ram or s2idle */
4240 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4241 return 0;
4242
4243 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4244 if (ret)
4245 DRM_WARN("evicting device resources failed\n");
4246 return ret;
4247 }
4248
4249 /*
4250 * Suspend & resume.
4251 */
4252 /**
4253 * amdgpu_device_prepare - prepare for device suspend
4254 *
4255 * @dev: drm dev pointer
4256 *
4257 * Prepare to put the hw in the suspend state (all asics).
4258 * Returns 0 for success or an error on failure.
4259 * Called at driver suspend.
4260 */
4261 int amdgpu_device_prepare(struct drm_device *dev)
4262 {
4263 struct amdgpu_device *adev = drm_to_adev(dev);
4264 int i, r;
4265
4266 amdgpu_choose_low_power_state(adev);
4267
4268 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4269 return 0;
4270
4271 /* Evict the majority of BOs before starting suspend sequence */
4272 r = amdgpu_device_evict_resources(adev);
4273 if (r)
4274 goto unprepare;
4275
4276 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4277
4278 for (i = 0; i < adev->num_ip_blocks; i++) {
4279 if (!adev->ip_blocks[i].status.valid)
4280 continue;
4281 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4282 continue;
4283 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4284 if (r)
4285 goto unprepare;
4286 }
4287
4288 return 0;
4289
4290 unprepare:
4291 adev->in_s0ix = adev->in_s3 = false;
4292
4293 return r;
4294 }
4295
4296 /**
4297 * amdgpu_device_suspend - initiate device suspend
4298 *
4299 * @dev: drm dev pointer
4300 * @fbcon : notify the fbdev of suspend
4301 *
4302 * Puts the hw in the suspend state (all asics).
4303 * Returns 0 for success or an error on failure.
4304 * Called at driver suspend.
4305 */
4306 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4307 {
4308 struct amdgpu_device *adev = drm_to_adev(dev);
4309 int r = 0;
4310
4311 if (adev->shutdown)
4312 return 0;
4313
4314 #ifdef notyet
4315 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4316 return 0;
4317 #endif
4318
4319 adev->in_suspend = true;
4320
4321 if (amdgpu_sriov_vf(adev)) {
4322 amdgpu_virt_fini_data_exchange(adev);
4323 r = amdgpu_virt_request_full_gpu(adev, false);
4324 if (r)
4325 return r;
4326 }
4327
4328 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4329 DRM_WARN("smart shift update failed\n");
4330
4331 if (fbcon)
4332 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4333
4334 cancel_delayed_work_sync(&adev->delayed_init_work);
4335
4336 amdgpu_ras_suspend(adev);
4337
4338 amdgpu_device_ip_suspend_phase1(adev);
4339
4340 if (!adev->in_s0ix)
4341 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4342
4343 r = amdgpu_device_evict_resources(adev);
4344 if (r)
4345 return r;
4346
4347 amdgpu_fence_driver_hw_fini(adev);
4348
4349 amdgpu_device_ip_suspend_phase2(adev);
4350
4351 if (amdgpu_sriov_vf(adev))
4352 amdgpu_virt_release_full_gpu(adev, false);
4353
4354 return 0;
4355 }
4356
4357 /**
4358 * amdgpu_device_resume - initiate device resume
4359 *
4360 * @dev: drm dev pointer
4361 * @fbcon : notify the fbdev of resume
4362 *
4363 * Bring the hw back to operating state (all asics).
4364 * Returns 0 for success or an error on failure.
4365 * Called at driver resume.
4366 */
4367 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4368 {
4369 struct amdgpu_device *adev = drm_to_adev(dev);
4370 int r = 0;
4371
4372 if (amdgpu_sriov_vf(adev)) {
4373 r = amdgpu_virt_request_full_gpu(adev, true);
4374 if (r)
4375 return r;
4376 }
4377
4378 #ifdef notyet
4379 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4380 return 0;
4381 #endif
4382
4383 if (adev->in_s0ix)
4384 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4385
4386 /* post card */
4387 if (amdgpu_device_need_post(adev)) {
4388 r = amdgpu_device_asic_init(adev);
4389 if (r)
4390 dev_err(adev->dev, "amdgpu asic init failed\n");
4391 }
4392
4393 r = amdgpu_device_ip_resume(adev);
4394
4395 if (r) {
4396 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4397 goto exit;
4398 }
4399
4400 r = amdgpu_device_ip_late_init(adev);
4401 if (r)
4402 goto exit;
4403
4404 queue_delayed_work(system_wq, &adev->delayed_init_work,
4405 msecs_to_jiffies(AMDGPU_RESUME_MS));
4406
4407 if (!adev->in_s0ix) {
4408 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4409 if (r)
4410 goto exit;
4411 }
4412
4413 exit:
4414 if (amdgpu_sriov_vf(adev)) {
4415 amdgpu_virt_init_data_exchange(adev);
4416 amdgpu_virt_release_full_gpu(adev, true);
4417 }
4418
4419 if (r)
4420 return r;
4421
4422 /* Make sure IB tests flushed */
4423 flush_delayed_work(&adev->delayed_init_work);
4424
4425 if (fbcon)
4426 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4427
4428 amdgpu_ras_resume(adev);
4429
4430 if (adev->mode_info.num_crtc) {
4431 /*
4432 * Most of the connector probing functions try to acquire runtime pm
4433 * refs to ensure that the GPU is powered on when connector polling is
4434 * performed. Since we're calling this from a runtime PM callback,
4435 * trying to acquire rpm refs will cause us to deadlock.
4436 *
4437 * Since we're guaranteed to be holding the rpm lock, it's safe to
4438 * temporarily disable the rpm helpers so this doesn't deadlock us.
4439 */
4440 #if defined(CONFIG_PM) && defined(__linux__)
4441 dev->dev->power.disable_depth++;
4442 #endif
4443 if (!adev->dc_enabled)
4444 drm_helper_hpd_irq_event(dev);
4445 else
4446 drm_kms_helper_hotplug_event(dev);
4447 #if defined(CONFIG_PM) && defined(__linux__)
4448 dev->dev->power.disable_depth--;
4449 #endif
4450 }
4451 adev->in_suspend = false;
4452
4453 if (adev->enable_mes)
4454 amdgpu_mes_self_test(adev);
4455
4456 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4457 DRM_WARN("smart shift update failed\n");
4458
4459 return 0;
4460 }
4461
4462 /**
4463 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4464 *
4465 * @adev: amdgpu_device pointer
4466 *
4467 * The list of all the hardware IPs that make up the asic is walked and
4468 * the check_soft_reset callbacks are run. check_soft_reset determines
4469 * if the asic is still hung or not.
4470 * Returns true if any of the IPs are still in a hung state, false if not.
4471 */
4472 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4473 {
4474 int i;
4475 bool asic_hang = false;
4476
4477 if (amdgpu_sriov_vf(adev))
4478 return true;
4479
4480 if (amdgpu_asic_need_full_reset(adev))
4481 return true;
4482
4483 for (i = 0; i < adev->num_ip_blocks; i++) {
4484 if (!adev->ip_blocks[i].status.valid)
4485 continue;
4486 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4487 adev->ip_blocks[i].status.hang =
4488 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4489 if (adev->ip_blocks[i].status.hang) {
4490 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4491 asic_hang = true;
4492 }
4493 }
4494 return asic_hang;
4495 }
4496
4497 /**
4498 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4499 *
4500 * @adev: amdgpu_device pointer
4501 *
4502 * The list of all the hardware IPs that make up the asic is walked and the
4503 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4504 * handles any IP specific hardware or software state changes that are
4505 * necessary for a soft reset to succeed.
4506 * Returns 0 on success, negative error code on failure.
4507 */
4508 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4509 {
4510 int i, r = 0;
4511
4512 for (i = 0; i < adev->num_ip_blocks; i++) {
4513 if (!adev->ip_blocks[i].status.valid)
4514 continue;
4515 if (adev->ip_blocks[i].status.hang &&
4516 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4517 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4518 if (r)
4519 return r;
4520 }
4521 }
4522
4523 return 0;
4524 }
4525
4526 /**
4527 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4528 *
4529 * @adev: amdgpu_device pointer
4530 *
4531 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4532 * reset is necessary to recover.
4533 * Returns true if a full asic reset is required, false if not.
4534 */
4535 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4536 {
4537 int i;
4538
4539 if (amdgpu_asic_need_full_reset(adev))
4540 return true;
4541
4542 for (i = 0; i < adev->num_ip_blocks; i++) {
4543 if (!adev->ip_blocks[i].status.valid)
4544 continue;
4545 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4546 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4547 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4548 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4549 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4550 if (adev->ip_blocks[i].status.hang) {
4551 dev_info(adev->dev, "Some block need full reset!\n");
4552 return true;
4553 }
4554 }
4555 }
4556 return false;
4557 }
4558
4559 /**
4560 * amdgpu_device_ip_soft_reset - do a soft reset
4561 *
4562 * @adev: amdgpu_device pointer
4563 *
4564 * The list of all the hardware IPs that make up the asic is walked and the
4565 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4566 * IP specific hardware or software state changes that are necessary to soft
4567 * reset the IP.
4568 * Returns 0 on success, negative error code on failure.
4569 */
4570 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4571 {
4572 int i, r = 0;
4573
4574 for (i = 0; i < adev->num_ip_blocks; i++) {
4575 if (!adev->ip_blocks[i].status.valid)
4576 continue;
4577 if (adev->ip_blocks[i].status.hang &&
4578 adev->ip_blocks[i].version->funcs->soft_reset) {
4579 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4580 if (r)
4581 return r;
4582 }
4583 }
4584
4585 return 0;
4586 }
4587
4588 /**
4589 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4590 *
4591 * @adev: amdgpu_device pointer
4592 *
4593 * The list of all the hardware IPs that make up the asic is walked and the
4594 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4595 * handles any IP specific hardware or software state changes that are
4596 * necessary after the IP has been soft reset.
4597 * Returns 0 on success, negative error code on failure.
4598 */
4599 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4600 {
4601 int i, r = 0;
4602
4603 for (i = 0; i < adev->num_ip_blocks; i++) {
4604 if (!adev->ip_blocks[i].status.valid)
4605 continue;
4606 if (adev->ip_blocks[i].status.hang &&
4607 adev->ip_blocks[i].version->funcs->post_soft_reset)
4608 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4609 if (r)
4610 return r;
4611 }
4612
4613 return 0;
4614 }
4615
4616 /**
4617 * amdgpu_device_recover_vram - Recover some VRAM contents
4618 *
4619 * @adev: amdgpu_device pointer
4620 *
4621 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4622 * restore things like GPUVM page tables after a GPU reset where
4623 * the contents of VRAM might be lost.
4624 *
4625 * Returns:
4626 * 0 on success, negative error code on failure.
4627 */
4628 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4629 {
4630 struct dma_fence *fence = NULL, *next = NULL;
4631 struct amdgpu_bo *shadow;
4632 struct amdgpu_bo_vm *vmbo;
4633 long r = 1, tmo;
4634
4635 if (amdgpu_sriov_runtime(adev))
4636 tmo = msecs_to_jiffies(8000);
4637 else
4638 tmo = msecs_to_jiffies(100);
4639
4640 dev_info(adev->dev, "recover vram bo from shadow start\n");
4641 mutex_lock(&adev->shadow_list_lock);
4642 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4643 /* If vm is compute context or adev is APU, shadow will be NULL */
4644 if (!vmbo->shadow)
4645 continue;
4646 shadow = vmbo->shadow;
4647
4648 /* No need to recover an evicted BO */
4649 if (!shadow->tbo.resource ||
4650 shadow->tbo.resource->mem_type != TTM_PL_TT ||
4651 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4652 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4653 continue;
4654
4655 r = amdgpu_bo_restore_shadow(shadow, &next);
4656 if (r)
4657 break;
4658
4659 if (fence) {
4660 tmo = dma_fence_wait_timeout(fence, false, tmo);
4661 dma_fence_put(fence);
4662 fence = next;
4663 if (tmo == 0) {
4664 r = -ETIMEDOUT;
4665 break;
4666 } else if (tmo < 0) {
4667 r = tmo;
4668 break;
4669 }
4670 } else {
4671 fence = next;
4672 }
4673 }
4674 mutex_unlock(&adev->shadow_list_lock);
4675
4676 if (fence)
4677 tmo = dma_fence_wait_timeout(fence, false, tmo);
4678 dma_fence_put(fence);
4679
4680 if (r < 0 || tmo <= 0) {
4681 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4682 return -EIO;
4683 }
4684
4685 dev_info(adev->dev, "recover vram bo from shadow done\n");
4686 return 0;
4687 }
4688
4689
4690 /**
4691 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4692 *
4693 * @adev: amdgpu_device pointer
4694 * @from_hypervisor: request from hypervisor
4695 *
4696  * Do a VF FLR and reinitialize the ASIC.
4697  * Returns 0 on success, negative error code on failure.
4698 */
4699 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4700 bool from_hypervisor)
4701 {
4702 int r;
4703 struct amdgpu_hive_info *hive = NULL;
4704 int retry_limit = 0;
4705
4706 retry:
4707 amdgpu_amdkfd_pre_reset(adev);
4708
4709 if (from_hypervisor)
4710 r = amdgpu_virt_request_full_gpu(adev, true);
4711 else
4712 r = amdgpu_virt_reset_gpu(adev);
4713 if (r)
4714 return r;
4715 amdgpu_irq_gpu_reset_resume_helper(adev);
4716
4717 /* some sw clean up VF needs to do before recover */
4718 amdgpu_virt_post_reset(adev);
4719
4720 /* Resume IP prior to SMC */
4721 r = amdgpu_device_ip_reinit_early_sriov(adev);
4722 if (r)
4723 goto error;
4724
4725 amdgpu_virt_init_data_exchange(adev);
4726
4727 r = amdgpu_device_fw_loading(adev);
4728 if (r)
4729 return r;
4730
4731 /* now we are okay to resume SMC/CP/SDMA */
4732 r = amdgpu_device_ip_reinit_late_sriov(adev);
4733 if (r)
4734 goto error;
4735
4736 hive = amdgpu_get_xgmi_hive(adev);
4737 /* Update PSP FW topology after reset */
4738 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4739 r = amdgpu_xgmi_update_topology(hive, adev);
4740
4741 if (hive)
4742 amdgpu_put_xgmi_hive(hive);
4743
4744 if (!r) {
4745 r = amdgpu_ib_ring_tests(adev);
4746
4747 amdgpu_amdkfd_post_reset(adev);
4748 }
4749
4750 error:
4751 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4752 amdgpu_inc_vram_lost(adev);
4753 r = amdgpu_device_recover_vram(adev);
4754 }
4755 amdgpu_virt_release_full_gpu(adev, true);
4756
4757 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4758 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4759 retry_limit++;
4760 goto retry;
4761 } else
4762 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4763 }
4764
4765 return r;
4766 }
4767
4768 /**
4769  * amdgpu_device_has_job_running - check if there is any job in the pending list
4770 *
4771 * @adev: amdgpu_device pointer
4772 *
4773  * Check if there is any job in the pending list.
4774 */
4775 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4776 {
4777 int i;
4778 struct drm_sched_job *job;
4779
4780 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4781 struct amdgpu_ring *ring = adev->rings[i];
4782
4783 if (!ring || !ring->sched.thread)
4784 continue;
4785
4786 spin_lock(&ring->sched.job_list_lock);
4787 job = list_first_entry_or_null(&ring->sched.pending_list,
4788 struct drm_sched_job, list);
4789 spin_unlock(&ring->sched.job_list_lock);
4790 if (job)
4791 return true;
4792 }
4793 return false;
4794 }
4795
4796 /**
4797 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4798 *
4799 * @adev: amdgpu_device pointer
4800 *
4801 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4802 * a hung GPU.
4803 */
4804 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4805 {
4806
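	/*
	 * Summary of the checks below (illustrative): amdgpu_gpu_recovery == 0
	 * disables recovery outright, -1 lets the driver decide per ASIC
	 * (disabled for the SI/CIK/APU chips listed), and any other value
	 * enables it.  SR-IOV and RAS fatal-error mode skip the per-ASIC check.
	 */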
4807 if (amdgpu_gpu_recovery == 0)
4808 goto disabled;
4809
4810 /* Skip soft reset check in fatal error mode */
4811 if (!amdgpu_ras_is_poison_mode_supported(adev))
4812 return true;
4813
4814 if (amdgpu_sriov_vf(adev))
4815 return true;
4816
4817 if (amdgpu_gpu_recovery == -1) {
4818 switch (adev->asic_type) {
4819 #ifdef CONFIG_DRM_AMDGPU_SI
4820 case CHIP_VERDE:
4821 case CHIP_TAHITI:
4822 case CHIP_PITCAIRN:
4823 case CHIP_OLAND:
4824 case CHIP_HAINAN:
4825 #endif
4826 #ifdef CONFIG_DRM_AMDGPU_CIK
4827 case CHIP_KAVERI:
4828 case CHIP_KABINI:
4829 case CHIP_MULLINS:
4830 #endif
4831 case CHIP_CARRIZO:
4832 case CHIP_STONEY:
4833 case CHIP_CYAN_SKILLFISH:
4834 goto disabled;
4835 default:
4836 break;
4837 }
4838 }
4839
4840 return true;
4841
4842 disabled:
4843 dev_info(adev->dev, "GPU recovery disabled.\n");
4844 return false;
4845 }
4846
4847 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4848 {
4849 u32 i;
4850 int ret = 0;
4851
4852 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4853
4854 dev_info(adev->dev, "GPU mode1 reset\n");
4855
4856 /* Cache the state before bus master disable. The saved config space
4857 * values are used in other cases like restore after mode-2 reset.
4858 */
4859 amdgpu_device_cache_pci_state(adev->pdev);
4860
4861 /* disable BM */
4862 pci_clear_master(adev->pdev);
4863
4864 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4865 dev_info(adev->dev, "GPU smu mode1 reset\n");
4866 ret = amdgpu_dpm_mode1_reset(adev);
4867 } else {
4868 dev_info(adev->dev, "GPU psp mode1 reset\n");
4869 ret = psp_gpu_reset(adev);
4870 }
4871
4872 if (ret)
4873 goto mode1_reset_failed;
4874
4875 amdgpu_device_load_pci_state(adev->pdev);
4876 ret = amdgpu_psp_wait_for_bootloader(adev);
4877 if (ret)
4878 goto mode1_reset_failed;
4879
4880 /* wait for asic to come out of reset */
4881 for (i = 0; i < adev->usec_timeout; i++) {
4882 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4883
4884 if (memsize != 0xffffffff)
4885 break;
4886 udelay(1);
4887 }
4888
4889 if (i >= adev->usec_timeout) {
4890 ret = -ETIMEDOUT;
4891 goto mode1_reset_failed;
4892 }
4893
4894 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4895
4896 return 0;
4897
4898 mode1_reset_failed:
4899 dev_err(adev->dev, "GPU mode1 reset failed\n");
4900 return ret;
4901 }
4902
4903 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4904 struct amdgpu_reset_context *reset_context)
4905 {
4906 int i, r = 0;
4907 struct amdgpu_job *job = NULL;
4908 bool need_full_reset =
4909 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4910
4911 if (reset_context->reset_req_dev == adev)
4912 job = reset_context->job;
4913
4914 if (amdgpu_sriov_vf(adev)) {
4915 /* stop the data exchange thread */
4916 amdgpu_virt_fini_data_exchange(adev);
4917 }
4918
4919 amdgpu_fence_driver_isr_toggle(adev, true);
4920
4921 /* block all schedulers and reset given job's ring */
4922 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4923 struct amdgpu_ring *ring = adev->rings[i];
4924
4925 if (!ring || !ring->sched.thread)
4926 continue;
4927
4928 		/* Clear the job fences from the fence drv to avoid force_completion
4929 		 * leaving NULL and vm flush fences in the fence drv
4930 */
4931 amdgpu_fence_driver_clear_job_fences(ring);
4932
4933 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4934 amdgpu_fence_driver_force_completion(ring);
4935 }
4936
4937 amdgpu_fence_driver_isr_toggle(adev, false);
4938
4939 if (job && job->vm)
4940 drm_sched_increase_karma(&job->base);
4941
4942 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4943 /* If reset handler not implemented, continue; otherwise return */
4944 if (r == -EOPNOTSUPP)
4945 r = 0;
4946 else
4947 return r;
4948
4949 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4950 if (!amdgpu_sriov_vf(adev)) {
4951
4952 if (!need_full_reset)
4953 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4954
4955 if (!need_full_reset && amdgpu_gpu_recovery &&
4956 amdgpu_device_ip_check_soft_reset(adev)) {
4957 amdgpu_device_ip_pre_soft_reset(adev);
4958 r = amdgpu_device_ip_soft_reset(adev);
4959 amdgpu_device_ip_post_soft_reset(adev);
4960 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4961 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4962 need_full_reset = true;
4963 }
4964 }
4965
4966 if (need_full_reset)
4967 r = amdgpu_device_ip_suspend(adev);
4968 if (need_full_reset)
4969 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4970 else
4971 clear_bit(AMDGPU_NEED_FULL_RESET,
4972 &reset_context->flags);
4973 }
4974
4975 return r;
4976 }
4977
4978 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4979 {
4980 int i;
4981
4982 lockdep_assert_held(&adev->reset_domain->sem);
4983
4984 for (i = 0; i < adev->num_regs; i++) {
4985 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4986 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4987 adev->reset_dump_reg_value[i]);
4988 }
4989
4990 return 0;
4991 }
4992
4993 #ifdef CONFIG_DEV_COREDUMP
4994 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4995 size_t count, void *data, size_t datalen)
4996 {
4997 struct drm_printer p;
4998 struct amdgpu_device *adev = data;
4999 struct drm_print_iterator iter;
5000 int i;
5001
5002 iter.data = buffer;
5003 iter.offset = 0;
5004 iter.start = offset;
5005 iter.remain = count;
5006
5007 p = drm_coredump_printer(&iter);
5008
5009 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
5010 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
5011 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
5012 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
5013 if (adev->reset_task_info.pid)
5014 drm_printf(&p, "process_name: %s PID: %d\n",
5015 adev->reset_task_info.process_name,
5016 adev->reset_task_info.pid);
5017
5018 if (adev->reset_vram_lost)
5019 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
5020 if (adev->num_regs) {
5021 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
5022
5023 for (i = 0; i < adev->num_regs; i++)
5024 drm_printf(&p, "0x%08x: 0x%08x\n",
5025 adev->reset_dump_reg_list[i],
5026 adev->reset_dump_reg_value[i]);
5027 }
5028
5029 return count - iter.remain;
5030 }
5031
5032 static void amdgpu_devcoredump_free(void *data)
5033 {
5034 }
5035
5036 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
5037 {
5038 struct drm_device *dev = adev_to_drm(adev);
5039
5040 ktime_get_ts64(&adev->reset_time);
5041 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
5042 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
5043 }
5044 #endif
5045
5046 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5047 struct amdgpu_reset_context *reset_context)
5048 {
5049 struct amdgpu_device *tmp_adev = NULL;
5050 bool need_full_reset, skip_hw_reset, vram_lost = false;
5051 int r = 0;
5052 bool gpu_reset_for_dev_remove = 0;
5053
5054 /* Try reset handler method first */
5055 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5056 reset_list);
5057 amdgpu_reset_reg_dumps(tmp_adev);
5058
5059 reset_context->reset_device_list = device_list_handle;
5060 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5061 /* If reset handler not implemented, continue; otherwise return */
5062 if (r == -EOPNOTSUPP)
5063 r = 0;
5064 else
5065 return r;
5066
5067 /* Reset handler not implemented, use the default method */
5068 need_full_reset =
5069 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5070 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5071
5072 gpu_reset_for_dev_remove =
5073 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5074 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5075
5076 /*
5077 * ASIC reset has to be done on all XGMI hive nodes ASAP
5078 	 * to allow proper link negotiation in the FW (within 1 sec)
5079 */
5080 if (!skip_hw_reset && need_full_reset) {
5081 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5082 /* For XGMI run all resets in parallel to speed up the process */
5083 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5084 tmp_adev->gmc.xgmi.pending_reset = false;
5085 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5086 r = -EALREADY;
5087 } else
5088 r = amdgpu_asic_reset(tmp_adev);
5089
5090 if (r) {
5091 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5092 r, adev_to_drm(tmp_adev)->unique);
5093 break;
5094 }
5095 }
5096
5097 /* For XGMI wait for all resets to complete before proceed */
5098 if (!r) {
5099 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5100 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5101 flush_work(&tmp_adev->xgmi_reset_work);
5102 r = tmp_adev->asic_reset_res;
5103 if (r)
5104 break;
5105 }
5106 }
5107 }
5108 }
5109
5110 if (!r && amdgpu_ras_intr_triggered()) {
5111 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5112 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5113 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5114 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5115 }
5116
5117 amdgpu_ras_intr_cleared();
5118 }
5119
5120 /* Since the mode1 reset affects base ip blocks, the
5121 * phase1 ip blocks need to be resumed. Otherwise there
5122 * will be a BIOS signature error and the psp bootloader
5123 * can't load kdb on the next amdgpu install.
5124 */
5125 if (gpu_reset_for_dev_remove) {
5126 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5127 amdgpu_device_ip_resume_phase1(tmp_adev);
5128
5129 goto end;
5130 }
5131
5132 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5133 if (need_full_reset) {
5134 /* post card */
5135 r = amdgpu_device_asic_init(tmp_adev);
5136 if (r) {
5137 dev_warn(tmp_adev->dev, "asic atom init failed!");
5138 } else {
5139 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5140
5141 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5142 if (r)
5143 goto out;
5144
5145 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5146 #ifdef CONFIG_DEV_COREDUMP
5147 tmp_adev->reset_vram_lost = vram_lost;
5148 memset(&tmp_adev->reset_task_info, 0,
5149 sizeof(tmp_adev->reset_task_info));
5150 if (reset_context->job && reset_context->job->vm)
5151 tmp_adev->reset_task_info =
5152 reset_context->job->vm->task_info;
5153 amdgpu_reset_capture_coredumpm(tmp_adev);
5154 #endif
5155 if (vram_lost) {
5156 DRM_INFO("VRAM is lost due to GPU reset!\n");
5157 amdgpu_inc_vram_lost(tmp_adev);
5158 }
5159
5160 r = amdgpu_device_fw_loading(tmp_adev);
5161 if (r)
5162 return r;
5163
5164 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5165 if (r)
5166 goto out;
5167
5168 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5169 if (r)
5170 goto out;
5171
5172 if (vram_lost)
5173 amdgpu_device_fill_reset_magic(tmp_adev);
5174
5175 /*
5176 * Add this ASIC as tracked as reset was already
5177 * complete successfully.
5178 */
5179 amdgpu_register_gpu_instance(tmp_adev);
5180
5181 if (!reset_context->hive &&
5182 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5183 amdgpu_xgmi_add_device(tmp_adev);
5184
5185 r = amdgpu_device_ip_late_init(tmp_adev);
5186 if (r)
5187 goto out;
5188
5189 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5190
5191 /*
5192 				 * The GPU enters a bad state once the number of
5193 				 * faulty pages detected by ECC reaches the
5194 				 * threshold, and RAS recovery is scheduled next.
5195 				 * So add a check here to break recovery if it
5196 				 * indeed exceeds the bad page threshold, and
5197 				 * remind the user to retire this GPU or set a
5198 				 * bigger bad_page_threshold value when probing
5199 				 * the driver again.
5200 */
5201 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5202 /* must succeed. */
5203 amdgpu_ras_resume(tmp_adev);
5204 } else {
5205 r = -EINVAL;
5206 goto out;
5207 }
5208
5209 /* Update PSP FW topology after reset */
5210 if (reset_context->hive &&
5211 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5212 r = amdgpu_xgmi_update_topology(
5213 reset_context->hive, tmp_adev);
5214 }
5215 }
5216
5217 out:
5218 if (!r) {
5219 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5220 r = amdgpu_ib_ring_tests(tmp_adev);
5221 if (r) {
5222 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5223 need_full_reset = true;
5224 r = -EAGAIN;
5225 goto end;
5226 }
5227 }
5228
5229 if (!r)
5230 r = amdgpu_device_recover_vram(tmp_adev);
5231 else
5232 tmp_adev->asic_reset_res = r;
5233 }
5234
5235 end:
5236 if (need_full_reset)
5237 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5238 else
5239 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5240 return r;
5241 }
5242
5243 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5244 {
5245
5246 switch (amdgpu_asic_reset_method(adev)) {
5247 case AMD_RESET_METHOD_MODE1:
5248 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5249 break;
5250 case AMD_RESET_METHOD_MODE2:
5251 adev->mp1_state = PP_MP1_STATE_RESET;
5252 break;
5253 default:
5254 adev->mp1_state = PP_MP1_STATE_NONE;
5255 break;
5256 }
5259 }
5260
5261 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5262 {
5263 amdgpu_vf_error_trans_all(adev);
5264 adev->mp1_state = PP_MP1_STATE_NONE;
5265 }
5266
5267 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5268 {
5269 STUB();
5270 #ifdef notyet
5271 struct pci_dev *p = NULL;
5272
5273 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5274 adev->pdev->bus->number, 1);
5275 if (p) {
5276 pm_runtime_enable(&(p->dev));
5277 pm_runtime_resume(&(p->dev));
5278 }
5279 #endif
5280 }
5281
5282 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5283 {
5284 enum amd_reset_method reset_method;
5285 struct pci_dev *p = NULL;
5286 u64 expires;
5287
5288 /*
5289 * For now, only BACO and mode1 reset are confirmed
5290 	 * to suffer the audio issue if not properly suspended.
5291 */
5292 reset_method = amdgpu_asic_reset_method(adev);
5293 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5294 (reset_method != AMD_RESET_METHOD_MODE1))
5295 return -EINVAL;
5296
5297 STUB();
5298 return -ENOSYS;
5299 #ifdef notyet
5300
5301 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5302 adev->pdev->bus->number, 1);
5303 if (!p)
5304 return -ENODEV;
5305
5306 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5307 if (!expires)
5308 /*
5309 * If we cannot get the audio device autosuspend delay,
5310 		 * a fixed 4s interval will be used. Since 3s is the audio
5311 		 * controller's default autosuspend delay setting, the 4s
5312 		 * used here is guaranteed to cover that.
5313 */
5314 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5315
5316 while (!pm_runtime_status_suspended(&(p->dev))) {
5317 if (!pm_runtime_suspend(&(p->dev)))
5318 break;
5319
5320 if (expires < ktime_get_mono_fast_ns()) {
5321 dev_warn(adev->dev, "failed to suspend display audio\n");
5322 pci_dev_put(p);
5323 /* TODO: abort the succeeding gpu reset? */
5324 return -ETIMEDOUT;
5325 }
5326 }
5327
5328 pm_runtime_disable(&(p->dev));
5329
5330 pci_dev_put(p);
5331 return 0;
5332 #endif
5333 }
5334
5335 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5336 {
5337 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5338
5339 #if defined(CONFIG_DEBUG_FS)
5340 if (!amdgpu_sriov_vf(adev))
5341 cancel_work(&adev->reset_work);
5342 #endif
5343
5344 if (adev->kfd.dev)
5345 cancel_work(&adev->kfd.reset_work);
5346
5347 if (amdgpu_sriov_vf(adev))
5348 cancel_work(&adev->virt.flr_work);
5349
5350 if (con && adev->ras_enabled)
5351 cancel_work(&con->recovery_work);
5352
5353 }
5354
5355 /**
5356 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5357 *
5358 * @adev: amdgpu_device pointer
5359 * @job: which job trigger hang
5360 * @reset_context: amdgpu reset context pointer
5361 *
5362 * Attempt to reset the GPU if it has hung (all asics).
5363  * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5364 * Returns 0 for success or an error on failure.
5365 */
5366
5367 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5368 struct amdgpu_job *job,
5369 struct amdgpu_reset_context *reset_context)
5370 {
5371 struct list_head device_list, *device_list_handle = NULL;
5372 bool job_signaled = false;
5373 struct amdgpu_hive_info *hive = NULL;
5374 struct amdgpu_device *tmp_adev = NULL;
5375 int i, r = 0;
5376 bool need_emergency_restart = false;
5377 bool audio_suspended = false;
5378 bool gpu_reset_for_dev_remove = false;
5379
5380 gpu_reset_for_dev_remove =
5381 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5382 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5383
5384 /*
5385 * Special case: RAS triggered and full reset isn't supported
5386 */
5387 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5388
5389 /*
5390 * Flush RAM to disk so that after reboot
5391 * the user can read log and see why the system rebooted.
5392 */
5393 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5394 amdgpu_ras_get_context(adev)->reboot) {
5395 DRM_WARN("Emergency reboot.");
5396
5397 #ifdef notyet
5398 ksys_sync_helper();
5399 emergency_restart();
5400 #else
5401 panic("emergency_restart");
5402 #endif
5403 }
5404
5405 dev_info(adev->dev, "GPU %s begin!\n",
5406 need_emergency_restart ? "jobs stop":"reset");
5407
5408 if (!amdgpu_sriov_vf(adev))
5409 hive = amdgpu_get_xgmi_hive(adev);
5410 if (hive)
5411 mutex_lock(&hive->hive_lock);
5412
5413 reset_context->job = job;
5414 reset_context->hive = hive;
5415 /*
5416 * Build list of devices to reset.
5417 * In case we are in XGMI hive mode, resort the device list
5418 * to put adev in the 1st position.
5419 */
5420 INIT_LIST_HEAD(&device_list);
5421 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5422 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5423 list_add_tail(&tmp_adev->reset_list, &device_list);
5424 if (gpu_reset_for_dev_remove && adev->shutdown)
5425 tmp_adev->shutdown = true;
5426 }
5427 if (!list_is_first(&adev->reset_list, &device_list))
5428 list_rotate_to_front(&adev->reset_list, &device_list);
5429 device_list_handle = &device_list;
5430 } else {
5431 list_add_tail(&adev->reset_list, &device_list);
5432 device_list_handle = &device_list;
5433 }
5434
5435 /* We need to lock reset domain only once both for XGMI and single device */
5436 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5437 reset_list);
5438 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5439
5440 /* block all schedulers and reset given job's ring */
5441 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5442
5443 amdgpu_device_set_mp1_state(tmp_adev);
5444
5445 /*
5446 		 * Try to put the audio codec into suspend state
5447 		 * before the gpu reset starts.
5448 		 *
5449 		 * The power domain of the graphics device is shared
5450 		 * with the AZ (audio) power domain. Without this,
5451 		 * we may change the audio hardware from behind
5452 		 * the audio driver's back, which will trigger
5453 		 * audio codec errors.
5454 */
5455 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5456 audio_suspended = true;
5457
5458 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5459
5460 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5461
5462 if (!amdgpu_sriov_vf(tmp_adev))
5463 amdgpu_amdkfd_pre_reset(tmp_adev);
5464
5465 /*
5466 		 * Mark these ASICs to be reset as untracked first,
5467 		 * and add them back after the reset completes.
5468 */
5469 amdgpu_unregister_gpu_instance(tmp_adev);
5470
5471 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5472
5473 /* disable ras on ALL IPs */
5474 if (!need_emergency_restart &&
5475 amdgpu_device_ip_need_full_reset(tmp_adev))
5476 amdgpu_ras_suspend(tmp_adev);
5477
5478 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5479 struct amdgpu_ring *ring = tmp_adev->rings[i];
5480
5481 if (!ring || !ring->sched.thread)
5482 continue;
5483
5484 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5485
5486 if (need_emergency_restart)
5487 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5488 }
5489 atomic_inc(&tmp_adev->gpu_reset_counter);
5490 }
5491
5492 if (need_emergency_restart)
5493 goto skip_sched_resume;
5494
5495 /*
5496 * Must check guilty signal here since after this point all old
5497 * HW fences are force signaled.
5498 *
5499 * job->base holds a reference to parent fence
5500 */
5501 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5502 job_signaled = true;
5503 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5504 goto skip_hw_reset;
5505 }
5506
5507 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5508 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5509 if (gpu_reset_for_dev_remove) {
5510 			/* Workaround for ASICs that need to disable SMC first */
5511 amdgpu_device_smu_fini_early(tmp_adev);
5512 }
5513 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5514 		/* TODO: Should we stop? */
5515 if (r) {
5516 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5517 r, adev_to_drm(tmp_adev)->unique);
5518 tmp_adev->asic_reset_res = r;
5519 }
5520
5521 /*
5522 * Drop all pending non scheduler resets. Scheduler resets
5523 * were already dropped during drm_sched_stop
5524 */
5525 amdgpu_device_stop_pending_resets(tmp_adev);
5526 }
5527
5528 /* Actual ASIC resets if needed.*/
5529 /* Host driver will handle XGMI hive reset for SRIOV */
5530 if (amdgpu_sriov_vf(adev)) {
5531 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5532 if (r)
5533 adev->asic_reset_res = r;
5534
5535 		/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so we need to resume ras during reset */
5536 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5537 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5538 amdgpu_ras_resume(adev);
5539 } else {
5540 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5541 if (r && r == -EAGAIN)
5542 goto retry;
5543
5544 if (!r && gpu_reset_for_dev_remove)
5545 goto recover_end;
5546 }
5547
5548 skip_hw_reset:
5549
5550 	/* Post ASIC reset for all devs. */
5551 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5552
5553 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5554 struct amdgpu_ring *ring = tmp_adev->rings[i];
5555
5556 if (!ring || !ring->sched.thread)
5557 continue;
5558
5559 drm_sched_start(&ring->sched, true);
5560 }
5561
5562 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5563 amdgpu_mes_self_test(tmp_adev);
5564
5565 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5566 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5567
5568 if (tmp_adev->asic_reset_res)
5569 r = tmp_adev->asic_reset_res;
5570
5571 tmp_adev->asic_reset_res = 0;
5572
5573 if (r) {
5574 /* bad news, how to tell it to userspace ? */
5575 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5576 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5577 } else {
5578 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5579 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5580 DRM_WARN("smart shift update failed\n");
5581 }
5582 }
5583
5584 skip_sched_resume:
5585 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5586 /* unlock kfd: SRIOV would do it separately */
5587 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5588 amdgpu_amdkfd_post_reset(tmp_adev);
5589
5590 		/* kfd_post_reset will do nothing if the kfd device is not initialized;
5591 		 * bring up kfd here if it was not initialized before.
5592 */
5593 if (!adev->kfd.init_complete)
5594 amdgpu_amdkfd_device_init(adev);
5595
5596 if (audio_suspended)
5597 amdgpu_device_resume_display_audio(tmp_adev);
5598
5599 amdgpu_device_unset_mp1_state(tmp_adev);
5600
5601 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5602 }
5603
5604 recover_end:
5605 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5606 reset_list);
5607 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5608
5609 if (hive) {
5610 mutex_unlock(&hive->hive_lock);
5611 amdgpu_put_xgmi_hive(hive);
5612 }
5613
5614 if (r)
5615 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5616
5617 atomic_set(&adev->reset_domain->reset_res, r);
5618 return r;
5619 }
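
/*
 * Example (illustrative sketch, not an exact upstream call site): a caller
 * such as a job timeout handler would fill an amdgpu_reset_context the same
 * way the PCI slot-reset path below does and hand it to
 * amdgpu_device_gpu_recover().  "adev" and "job" are assumed to be in scope.
 *
 *	struct amdgpu_reset_context reset_context;
 *	int r;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 *	if (r)
 *		dev_err(adev->dev, "GPU recovery failed: %d\n", r);
 */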
5620
5621 /**
5622  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5623 *
5624 * @adev: amdgpu_device pointer
5625 *
5626  * Fetches and stores in the driver the PCIe capabilities (gen speed
5627 * and lanes) of the slot the device is in. Handles APUs and
5628 * virtualized environments where PCIE config space may not be available.
5629 */
5630 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5631 {
5632 struct pci_dev *pdev;
5633 enum pci_bus_speed speed_cap, platform_speed_cap;
5634 enum pcie_link_width platform_link_width;
5635
5636 if (amdgpu_pcie_gen_cap)
5637 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5638
5639 if (amdgpu_pcie_lane_cap)
5640 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5641
5642 /* covers APUs as well */
5643 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5644 if (adev->pm.pcie_gen_mask == 0)
5645 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5646 if (adev->pm.pcie_mlw_mask == 0)
5647 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5648 return;
5649 }
5650
5651 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5652 return;
5653
5654 pcie_bandwidth_available(adev->pdev, NULL,
5655 &platform_speed_cap, &platform_link_width);
5656
5657 if (adev->pm.pcie_gen_mask == 0) {
5658 /* asic caps */
5659 pdev = adev->pdev;
5660 speed_cap = pcie_get_speed_cap(pdev);
5661 if (speed_cap == PCI_SPEED_UNKNOWN) {
5662 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5663 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5664 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5665 } else {
5666 if (speed_cap == PCIE_SPEED_32_0GT)
5667 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5668 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5669 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5670 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5671 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5672 else if (speed_cap == PCIE_SPEED_16_0GT)
5673 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5674 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5675 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5676 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5677 else if (speed_cap == PCIE_SPEED_8_0GT)
5678 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5679 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5680 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5681 else if (speed_cap == PCIE_SPEED_5_0GT)
5682 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5683 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5684 else
5685 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5686 }
5687 /* platform caps */
5688 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5689 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5690 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5691 } else {
5692 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5693 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5694 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5695 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5696 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5697 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5698 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5699 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5700 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5701 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5702 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5703 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5704 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5705 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5706 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5707 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5708 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5709 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5710 else
5711 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5712
5713 }
5714 }
5715 if (adev->pm.pcie_mlw_mask == 0) {
5716 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5717 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5718 } else {
5719 switch (platform_link_width) {
5720 case PCIE_LNK_X32:
5721 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5722 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5723 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5724 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5725 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5726 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5727 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5728 break;
5729 case PCIE_LNK_X16:
5730 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5731 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5732 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5733 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5734 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5735 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5736 break;
5737 case PCIE_LNK_X12:
5738 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5739 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5740 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5741 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5742 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5743 break;
5744 case PCIE_LNK_X8:
5745 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5746 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5747 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5748 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5749 break;
5750 case PCIE_LNK_X4:
5751 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5752 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5753 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5754 break;
5755 case PCIE_LNK_X2:
5756 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5757 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5758 break;
5759 case PCIE_LNK_X1:
5760 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5761 break;
5762 default:
5763 break;
5764 }
5765 }
5766 }
5767 }
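
/*
 * Example (illustrative): for an ASIC capped at 8.0 GT/s sitting in a
 * Gen3-capable x8 slot, the masks built above would contain
 * CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1..GEN3, the matching
 * CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1..GEN3 platform bits and
 * CAIL_PCIE_LINK_WIDTH_SUPPORT_X1..X8.  Consumers typically test
 * individual bits, e.g.:
 *
 *	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *		... the platform link can run at Gen3 ...
 *
 * Both masks can also be forced through the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameters checked at the top of this
 * function.
 */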
5768
5769 /**
5770 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5771 *
5772 * @adev: amdgpu_device pointer
5773 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5774 *
5775 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5776 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5777 * @peer_adev.
5778 */
5779 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5780 struct amdgpu_device *peer_adev)
5781 {
5782 #ifdef CONFIG_HSA_AMD_P2P
5783 uint64_t address_mask = peer_adev->dev->dma_mask ?
5784 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5785 resource_size_t aper_limit =
5786 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5787 bool p2p_access =
5788 !adev->gmc.xgmi.connected_to_cpu &&
5789 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5790
5791 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5792 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5793 !(adev->gmc.aper_base & address_mask ||
5794 aper_limit & address_mask));
5795 #else
5796 return false;
5797 #endif
5798 }
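
/*
 * Example (sketch): callers that set up peer-to-peer DMA would normally
 * require the mapping to work in both directions before enabling it;
 * "adev" and "peer_adev" are assumed to be the two devices involved:
 *
 *	bool p2p_ok = amdgpu_device_is_peer_accessible(adev, peer_adev) &&
 *		      amdgpu_device_is_peer_accessible(peer_adev, adev);
 *	if (p2p_ok)
 *		... expose peer VRAM through the large BAR ...
 */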
5799
5800 int amdgpu_device_baco_enter(struct drm_device *dev)
5801 {
5802 struct amdgpu_device *adev = drm_to_adev(dev);
5803 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5804
5805 if (!amdgpu_device_supports_baco(dev))
5806 return -ENOTSUPP;
5807
5808 if (ras && adev->ras_enabled &&
5809 adev->nbio.funcs->enable_doorbell_interrupt)
5810 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5811
5812 return amdgpu_dpm_baco_enter(adev);
5813 }
5814
5815 int amdgpu_device_baco_exit(struct drm_device *dev)
5816 {
5817 struct amdgpu_device *adev = drm_to_adev(dev);
5818 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5819 int ret = 0;
5820
5821 if (!amdgpu_device_supports_baco(dev))
5822 return -ENOTSUPP;
5823
5824 ret = amdgpu_dpm_baco_exit(adev);
5825 if (ret)
5826 return ret;
5827
5828 if (ras && adev->ras_enabled &&
5829 adev->nbio.funcs->enable_doorbell_interrupt)
5830 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5831
5832 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
5833 adev->nbio.funcs->clear_doorbell_interrupt)
5834 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5835
5836 return 0;
5837 }
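
/*
 * Example (sketch, not the exact runtime-PM code): BACO entry and exit are
 * meant to be used as a pair around a low-power period, with "dev" being
 * the drm_device of the GPU:
 *
 *	r = amdgpu_device_baco_enter(dev);
 *	if (r)
 *		return r;	(ASIC stays fully powered)
 *	... device sits in BACO while idle ...
 *	r = amdgpu_device_baco_exit(dev);
 *	if (r)
 *		return r;	(resume failed, report it)
 */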
5838
5839 /**
5840 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5841 * @pdev: PCI device struct
5842 * @state: PCI channel state
5843 *
5844 * Description: Called when a PCI error is detected.
5845 *
5846 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5847 */
5848 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5849 {
5850 STUB();
5851 return 0;
5852 #ifdef notyet
5853 struct drm_device *dev = pci_get_drvdata(pdev);
5854 struct amdgpu_device *adev = drm_to_adev(dev);
5855 int i;
5856
5857 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5858
5859 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5860 DRM_WARN("No support for XGMI hive yet...");
5861 return PCI_ERS_RESULT_DISCONNECT;
5862 }
5863
5864 adev->pci_channel_state = state;
5865
5866 switch (state) {
5867 case pci_channel_io_normal:
5868 return PCI_ERS_RESULT_CAN_RECOVER;
5869 /* Fatal error, prepare for slot reset */
5870 case pci_channel_io_frozen:
5871 /*
5872 * Locking adev->reset_domain->sem will prevent any external access
5873 * to GPU during PCI error recovery
5874 */
5875 amdgpu_device_lock_reset_domain(adev->reset_domain);
5876 amdgpu_device_set_mp1_state(adev);
5877
5878 /*
5879 * Block any work scheduling as we do for regular GPU reset
5880 * for the duration of the recovery
5881 */
5882 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5883 struct amdgpu_ring *ring = adev->rings[i];
5884
5885 if (!ring || !ring->sched.thread)
5886 continue;
5887
5888 drm_sched_stop(&ring->sched, NULL);
5889 }
5890 atomic_inc(&adev->gpu_reset_counter);
5891 return PCI_ERS_RESULT_NEED_RESET;
5892 case pci_channel_io_perm_failure:
5893 /* Permanent error, prepare for device removal */
5894 return PCI_ERS_RESULT_DISCONNECT;
5895 }
5896
5897 return PCI_ERS_RESULT_NEED_RESET;
5898 #endif
5899 }
5900
5901 /**
5902 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5903 * @pdev: pointer to PCI device
5904 */
5905 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5906 {
5907
5908 DRM_INFO("PCI error: mmio enabled callback!!\n");
5909
5910 /* TODO - dump whatever for debugging purposes */
5911
5912 	/* This is called only if amdgpu_pci_error_detected returns
5913 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5914 * works, no need to reset slot.
5915 */
5916
5917 return PCI_ERS_RESULT_RECOVERED;
5918 }
5919
5920 /**
5921 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5922 * @pdev: PCI device struct
5923 *
5924 * Description: This routine is called by the pci error recovery
5925 * code after the PCI slot has been reset, just before we
5926 * should resume normal operations.
5927 */
5928 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5929 {
5930 STUB();
5931 return PCI_ERS_RESULT_RECOVERED;
5932 #ifdef notyet
5933 struct drm_device *dev = pci_get_drvdata(pdev);
5934 struct amdgpu_device *adev = drm_to_adev(dev);
5935 int r, i;
5936 struct amdgpu_reset_context reset_context;
5937 u32 memsize;
5938 struct list_head device_list;
5939
5940 DRM_INFO("PCI error: slot reset callback!!\n");
5941
5942 memset(&reset_context, 0, sizeof(reset_context));
5943
5944 INIT_LIST_HEAD(&device_list);
5945 list_add_tail(&adev->reset_list, &device_list);
5946
5947 /* wait for asic to come out of reset */
5948 drm_msleep(500);
5949
5950 /* Restore PCI confspace */
5951 amdgpu_device_load_pci_state(pdev);
5952
5953 /* confirm ASIC came out of reset */
5954 for (i = 0; i < adev->usec_timeout; i++) {
5955 memsize = amdgpu_asic_get_config_memsize(adev);
5956
5957 if (memsize != 0xffffffff)
5958 break;
5959 udelay(1);
5960 }
5961 if (memsize == 0xffffffff) {
5962 r = -ETIME;
5963 goto out;
5964 }
5965
5966 reset_context.method = AMD_RESET_METHOD_NONE;
5967 reset_context.reset_req_dev = adev;
5968 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5969 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5970
5971 adev->no_hw_access = true;
5972 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5973 adev->no_hw_access = false;
5974 if (r)
5975 goto out;
5976
5977 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5978
5979 out:
5980 if (!r) {
5981 if (amdgpu_device_cache_pci_state(adev->pdev))
5982 pci_restore_state(adev->pdev);
5983
5984 DRM_INFO("PCIe error recovery succeeded\n");
5985 } else {
5986 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5987 amdgpu_device_unset_mp1_state(adev);
5988 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5989 }
5990
5991 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5992 #endif
5993 }
5994
5995 /**
5996 * amdgpu_pci_resume() - resume normal ops after PCI reset
5997 * @pdev: pointer to PCI device
5998 *
5999 * Called when the error recovery driver tells us that its
6000 * OK to resume normal operation.
6001 */
6002 void amdgpu_pci_resume(struct pci_dev *pdev)
6003 {
6004 STUB();
6005 #ifdef notyet
6006 struct drm_device *dev = pci_get_drvdata(pdev);
6007 struct amdgpu_device *adev = drm_to_adev(dev);
6008 int i;
6009
6010
6011 DRM_INFO("PCI error: resume callback!!\n");
6012
6013 /* Only continue execution for the case of pci_channel_io_frozen */
6014 if (adev->pci_channel_state != pci_channel_io_frozen)
6015 return;
6016
6017 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6018 struct amdgpu_ring *ring = adev->rings[i];
6019
6020 if (!ring || !ring->sched.thread)
6021 continue;
6022
6023 drm_sched_start(&ring->sched, true);
6024 }
6025
6026 amdgpu_device_unset_mp1_state(adev);
6027 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6028 #endif
6029 }
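
/*
 * The four callbacks above plug into the PCI error recovery framework
 * through a struct pci_error_handlers.  A sketch of the wiring (the real
 * table lives with the pci_driver definition; the table name below is
 * only illustrative):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * and the pci_driver then points its .err_handler at that table.
 */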
6030
6031 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6032 {
6033 return false;
6034 #ifdef notyet
6035 struct drm_device *dev = pci_get_drvdata(pdev);
6036 struct amdgpu_device *adev = drm_to_adev(dev);
6037 int r;
6038
6039 if (amdgpu_sriov_vf(adev))
6040 return false;
6041
6042 r = pci_save_state(pdev);
6043 if (!r) {
6044 kfree(adev->pci_state);
6045
6046 adev->pci_state = pci_store_saved_state(pdev);
6047
6048 if (!adev->pci_state) {
6049 DRM_ERROR("Failed to store PCI saved state");
6050 return false;
6051 }
6052 } else {
6053 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6054 return false;
6055 }
6056
6057 return true;
6058 #endif
6059 }
6060
6061 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6062 {
6063 STUB();
6064 return false;
6065 #ifdef notyet
6066 struct drm_device *dev = pci_get_drvdata(pdev);
6067 struct amdgpu_device *adev = drm_to_adev(dev);
6068 int r;
6069
6070 if (!adev->pci_state)
6071 return false;
6072
6073 r = pci_load_saved_state(pdev, adev->pci_state);
6074
6075 if (!r) {
6076 pci_restore_state(pdev);
6077 } else {
6078 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6079 return false;
6080 }
6081
6082 return true;
6083 #endif
6084 }
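
/*
 * Example (sketch): the two helpers above form a save/restore pair around
 * events that clobber PCI config space, such as the slot reset handled
 * earlier in this file:
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);	before the reset
 *	... reset happens ...
 *	if (!amdgpu_device_load_pci_state(adev->pdev))	afterwards
 *		dev_warn(adev->dev, "could not restore PCI config space\n");
 */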
6085
6086 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6087 struct amdgpu_ring *ring)
6088 {
6089 #ifdef CONFIG_X86_64
6090 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6091 return;
6092 #endif
6093 if (adev->gmc.xgmi.connected_to_cpu)
6094 return;
6095
6096 if (ring && ring->funcs->emit_hdp_flush)
6097 amdgpu_ring_emit_hdp_flush(ring);
6098 else
6099 amdgpu_asic_flush_hdp(adev, ring);
6100 }
6101
6102 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6103 struct amdgpu_ring *ring)
6104 {
6105 #ifdef CONFIG_X86_64
6106 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6107 return;
6108 #endif
6109 if (adev->gmc.xgmi.connected_to_cpu)
6110 return;
6111
6112 amdgpu_asic_invalidate_hdp(adev, ring);
6113 }
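
/*
 * Example (sketch): the usual pattern is to flush HDP after the CPU has
 * written data the GPU is about to read, and to invalidate HDP before the
 * CPU reads data the GPU has just written.  The ring argument may be NULL
 * when there is no ring to emit the flush on:
 *
 *	... CPU fills a buffer for the GPU ...
 *	amdgpu_device_flush_hdp(adev, NULL);
 *	... GPU work runs and writes results ...
 *	amdgpu_device_invalidate_hdp(adev, NULL);
 *	... CPU can now read the results ...
 */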
6114
6115 int amdgpu_in_reset(struct amdgpu_device *adev)
6116 {
6117 return atomic_read(&adev->reset_domain->in_gpu_reset);
6118 }
6119
6120 /**
6121 * amdgpu_device_halt() - bring hardware to some kind of halt state
6122 *
6123 * @adev: amdgpu_device pointer
6124 *
6125 * Bring hardware to some kind of halt state so that no one can touch it
6126  * any more. It helps to maintain the error context when an error occurs.
6127  * Compared to a simple hang, the system will stay stable at least for SSH
6128  * access. Then it should be trivial to inspect the hardware state and
6129  * see what's going on. Implemented as follows:
6130 *
6131 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6132 * clears all CPU mappings to device, disallows remappings through page faults
6133 * 2. amdgpu_irq_disable_all() disables all interrupts
6134 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6135  * 4. set adev->no_hw_access to avoid potential crashes after step 5
6136 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6137 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6138 * flush any in flight DMA operations
6139 */
6140 void amdgpu_device_halt(struct amdgpu_device *adev)
6141 {
6142 struct pci_dev *pdev = adev->pdev;
6143 struct drm_device *ddev = adev_to_drm(adev);
6144
6145 amdgpu_xcp_dev_unplug(adev);
6146 drm_dev_unplug(ddev);
6147
6148 amdgpu_irq_disable_all(adev);
6149
6150 amdgpu_fence_driver_hw_fini(adev);
6151
6152 adev->no_hw_access = true;
6153
6154 amdgpu_device_unmap_mmio(adev);
6155
6156 pci_disable_device(pdev);
6157 pci_wait_for_pending_transaction(pdev);
6158 }
6159
6160 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6161 u32 reg)
6162 {
6163 unsigned long flags, address, data;
6164 u32 r;
6165
6166 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6167 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6168
6169 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6170 WREG32(address, reg * 4);
6171 (void)RREG32(address);
6172 r = RREG32(data);
6173 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6174 return r;
6175 }
6176
6177 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6178 u32 reg, u32 v)
6179 {
6180 unsigned long flags, address, data;
6181
6182 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6183 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6184
6185 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6186 WREG32(address, reg * 4);
6187 (void)RREG32(address);
6188 WREG32(data, v);
6189 (void)RREG32(data);
6190 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6191 }
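
/*
 * Example (sketch): a read-modify-write of a PCIe port register through the
 * two helpers above.  "reg" and "MY_BIT" are placeholders, not real
 * register definitions:
 *
 *	u32 val = amdgpu_device_pcie_port_rreg(adev, reg);
 *	val |= MY_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, reg, val);
 *
 * The index/data pair is serialized by pcie_idx_lock inside the helpers,
 * but a caller doing read-modify-write still owns the atomicity of the
 * whole sequence.
 */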
6192
6193 /**
6194 * amdgpu_device_switch_gang - switch to a new gang
6195 * @adev: amdgpu_device pointer
6196 * @gang: the gang to switch to
6197 *
6198 * Try to switch to a new gang.
6199 * Returns: NULL if we switched to the new gang or a reference to the current
6200 * gang leader.
6201 */
6202 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6203 struct dma_fence *gang)
6204 {
6205 struct dma_fence *old = NULL;
6206
6207 do {
6208 dma_fence_put(old);
6209 rcu_read_lock();
6210 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6211 rcu_read_unlock();
6212
6213 if (old == gang)
6214 break;
6215
6216 if (!dma_fence_is_signaled(old))
6217 return old;
6218
6219 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6220 old, gang) != old);
6221
6222 dma_fence_put(old);
6223 return NULL;
6224 }
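
/*
 * Example (sketch): per the contract above, a NULL return means the switch
 * happened; a non-NULL return is a reference to the still-running gang
 * leader that the caller must order behind before retrying.  "gang_fence"
 * is assumed to be the new gang's leader fence:
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang_fence))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 */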
6225
6226 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6227 {
6228 switch (adev->asic_type) {
6229 #ifdef CONFIG_DRM_AMDGPU_SI
6230 case CHIP_HAINAN:
6231 #endif
6232 case CHIP_TOPAZ:
6233 /* chips with no display hardware */
6234 return false;
6235 #ifdef CONFIG_DRM_AMDGPU_SI
6236 case CHIP_TAHITI:
6237 case CHIP_PITCAIRN:
6238 case CHIP_VERDE:
6239 case CHIP_OLAND:
6240 #endif
6241 #ifdef CONFIG_DRM_AMDGPU_CIK
6242 case CHIP_BONAIRE:
6243 case CHIP_HAWAII:
6244 case CHIP_KAVERI:
6245 case CHIP_KABINI:
6246 case CHIP_MULLINS:
6247 #endif
6248 case CHIP_TONGA:
6249 case CHIP_FIJI:
6250 case CHIP_POLARIS10:
6251 case CHIP_POLARIS11:
6252 case CHIP_POLARIS12:
6253 case CHIP_VEGAM:
6254 case CHIP_CARRIZO:
6255 case CHIP_STONEY:
6256 /* chips with display hardware */
6257 return true;
6258 default:
6259 /* IP discovery */
6260 if (!adev->ip_versions[DCE_HWIP][0] ||
6261 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6262 return false;
6263 return true;
6264 }
6265 }
6266
6267 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6268 uint32_t inst, uint32_t reg_addr, char reg_name[],
6269 uint32_t expected_value, uint32_t mask)
6270 {
6271 uint32_t ret = 0;
6272 uint32_t old_ = 0;
6273 uint32_t tmp_ = RREG32(reg_addr);
6274 uint32_t loop = adev->usec_timeout;
6275
6276 while ((tmp_ & (mask)) != (expected_value)) {
6277 if (old_ != tmp_) {
6278 loop = adev->usec_timeout;
6279 old_ = tmp_;
6280 } else
6281 udelay(1);
6282 tmp_ = RREG32(reg_addr);
6283 loop--;
6284 if (!loop) {
6285 			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6286 inst, reg_name, (uint32_t)expected_value,
6287 (uint32_t)(tmp_ & (mask)));
6288 ret = -ETIMEDOUT;
6289 break;
6290 }
6291 }
6292 return ret;
6293 }
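
/*
 * Example (sketch): polling a status register until a bit is set, with
 * placeholder names for the register offset, mask and expected value:
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, status_reg, "STATUS",
 *				       STATUS_READY, STATUS_READY))
 *		dev_err(adev->dev, "block never reported ready\n");
 *
 * The timeout is adev->usec_timeout and is restarted every time the
 * register value changes, so it bounds how long the value may stay
 * unchanged rather than the total wait.
 */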
6294