/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86) && defined(__linux__)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);
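
/*
 * Usage sketch (illustrative, not part of the driver): the attribute above
 * appears in the PCI device's sysfs directory, so userspace can read the
 * replay count with something like (the card index depends on the system):
 *
 *	cat /sys/class/drm/card0/device/pcie_replay_count
 */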

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO (Bus Active, Chip Off),
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}
	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
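
/*
 * Usage sketch (illustrative only): reading one dword back from the start
 * of VRAM. The helper transparently falls back to MM_INDEX/MM_DATA for any
 * part of the buffer that lies outside the CPU-visible aperture:
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, 0, &val, sizeof(val), false);
 */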

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if (adev->nbio.funcs->get_pcie_index_hi_offset)
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Returns the device rev_id.
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
	    adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		return amdgpu_atomfirmware_asic_init(adev, true);
	}

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
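
/*
 * Usage sketch (mmEXAMPLE_REG and the mask values are illustrative only):
 * golden settings are laid out as {reg, and_mask, or_mask} triples. Here the
 * bits selected by 0x0000000f are cleared and bit 1 is set within that field:
 *
 *	static const u32 golden_settings_example[] = {
 *		mmEXAMPLE_REG, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 */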

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	STUB();
	return -ENOSYS;
#ifdef notyet
	return pci_reset_function(adev->pdev);
#endif
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
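
/*
 * Typical pairing (sketch, simplified from how the rings use these helpers):
 * a slot is allocated once, the returned index is a dword offset, and the
 * slot is freed on teardown. The CPU view is adev->wb.wb[wb] and the GPU
 * address is adev->wb.gpu_addr + wb * 4:
 *
 *	u32 wb;
 *
 *	if (amdgpu_device_wb_get(adev, &wb))
 *		return -EINVAL;
 *	...
 *	amdgpu_device_wb_free(adev, wb);
 */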

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
#ifdef __linux__
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
#endif /* __linux__ */

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if posting is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do vPost, otherwise the gpu
		 * hangs. smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
#ifdef __linux__
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
#else
	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
#endif
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpu_info *ci = curcpu();

	return !(ci->ci_family == 6 && ci->ci_model == 0x97);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
#ifdef notyet
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
#endif

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
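
/*
 * Worked example (sketch): with the minimum block size of 9 bits, one page
 * table covers 2^9 = 512 entries of 4KB pages, i.e. 2MB of virtual address
 * space per page table; the remaining VA bits above those 9 + 12 select the
 * page directory entry.
 */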

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
#ifdef __linux__
	struct sysinfo si;
#endif
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
#ifdef __linux__
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;
#else
	total_memory = ptoa(physmem);
#endif

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}
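
/*
 * Example (sketch): these checks sanitize module parameters rather than
 * rejecting them. Booting with, say, "amdgpu.sched_jobs=3" on the kernel
 * command line is corrected to the minimum of 4, and a non-power-of-two
 * value such as 48 is rounded up to 64 before the scheduler uses it.
 */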

#ifdef __linux__
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}
#endif /* __linux__ */

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
#ifdef notyet
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
#endif
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}
1754
1755 /**
1756 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1757 *
1758 * @adev: amdgpu_device pointer
1759 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760 *
1761 * Returns a pointer to the hardware IP block structure
1762 * if it exists for the asic, otherwise NULL.
1763 */
1764 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device * adev,enum amd_ip_block_type type)1765 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1766 enum amd_ip_block_type type)
1767 {
1768 int i;
1769
1770 for (i = 0; i < adev->num_ip_blocks; i++)
1771 if (adev->ip_blocks[i].version->type == type)
1772 return &adev->ip_blocks[i];
1773
1774 return NULL;
1775 }
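/*
 * Illustrative sketch: looking up the GMC block and logging its version
 * (assumes the block was registered for this asic):
 *
 *	struct amdgpu_ip_block *gmc =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
 *
 *	if (gmc)
 *		DRM_INFO("GMC v%u.%u\n", gmc->version->major,
 *			 gmc->version->minor);
 */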
1776
1777 /**
1778 * amdgpu_device_ip_block_version_cmp
1779 *
1780 * @adev: amdgpu_device pointer
1781 * @type: enum amd_ip_block_type
1782 * @major: major version
1783 * @minor: minor version
1784 *
1785 * return 0 if equal or greater
1786 * return 1 if smaller or the ip_block doesn't exist
1787 */
1788 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1789 enum amd_ip_block_type type,
1790 u32 major, u32 minor)
1791 {
1792 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1793
1794 if (ip_block && ((ip_block->version->major > major) ||
1795 ((ip_block->version->major == major) &&
1796 (ip_block->version->minor >= minor))))
1797 return 0;
1798
1799 return 1;
1800 }
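/*
 * Illustrative sketch: gating a code path on a minimum IP version.
 * Note the inverted return convention above: 0 means the block exists
 * and is at least the requested version.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *						7, 1)) {
 *		... SMC 7.1+ only path ...
 *	}
 */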
1801
1802 /**
1803 * amdgpu_device_ip_block_add
1804 *
1805 * @adev: amdgpu_device pointer
1806 * @ip_block_version: pointer to the IP to add
1807 *
1808 * Adds the IP block driver information to the collection of IPs
1809 * on the asic.
1810 */
1811 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1812 const struct amdgpu_ip_block_version *ip_block_version)
1813 {
1814 if (!ip_block_version)
1815 return -EINVAL;
1816
1817 switch (ip_block_version->type) {
1818 case AMD_IP_BLOCK_TYPE_VCN:
1819 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1820 return 0;
1821 break;
1822 case AMD_IP_BLOCK_TYPE_JPEG:
1823 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1824 return 0;
1825 break;
1826 default:
1827 break;
1828 }
1829
1830 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1831 ip_block_version->funcs->name);
1832
1833 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1834
1835 return 0;
1836 }
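/*
 * Illustrative sketch: soc files register their IP blocks in order with
 * this helper during early init; a vi_set_ip_blocks()-style sequence
 * looks roughly like this (hedged, the exact blocks vary per asic):
 *
 *	amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *	amdgpu_device_ip_block_add(adev, &tonga_ih_ip_block);
 *	...
 */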
1837
1838 /**
1839 * amdgpu_device_enable_virtual_display - enable virtual display feature
1840 *
1841 * @adev: amdgpu_device pointer
1842 *
1843  * Enables the virtual display feature if the user has enabled it via
1844 * the module parameter virtual_display. This feature provides a virtual
1845 * display hardware on headless boards or in virtualized environments.
1846 * This function parses and validates the configuration string specified by
1847  * the user and configures the virtual display configuration (number of
1848 * virtual connectors, crtcs, etc.) specified.
1849 */
1850 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1851 {
1852 adev->enable_virtual_display = false;
1853
1854 #ifdef notyet
1855 if (amdgpu_virtual_display) {
1856 const char *pci_address_name = pci_name(adev->pdev);
1857 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1858
1859 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1860 pciaddstr_tmp = pciaddstr;
1861 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1862 pciaddname = strsep(&pciaddname_tmp, ",");
1863 if (!strcmp("all", pciaddname)
1864 || !strcmp(pci_address_name, pciaddname)) {
1865 long num_crtc;
1866 int res = -1;
1867
1868 adev->enable_virtual_display = true;
1869
1870 if (pciaddname_tmp)
1871 res = kstrtol(pciaddname_tmp, 10,
1872 &num_crtc);
1873
1874 if (!res) {
1875 if (num_crtc < 1)
1876 num_crtc = 1;
1877 if (num_crtc > 6)
1878 num_crtc = 6;
1879 adev->mode_info.num_crtc = num_crtc;
1880 } else {
1881 adev->mode_info.num_crtc = 1;
1882 }
1883 break;
1884 }
1885 }
1886
1887 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1888 amdgpu_virtual_display, pci_address_name,
1889 adev->enable_virtual_display, adev->mode_info.num_crtc);
1890
1891 kfree(pciaddstr);
1892 }
1893 #endif
1894 }
1895
1896 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1897 {
1898 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1899 adev->mode_info.num_crtc = 1;
1900 adev->enable_virtual_display = true;
1901 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1902 adev->enable_virtual_display, adev->mode_info.num_crtc);
1903 }
1904 }
1905
1906 /**
1907 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1908 *
1909 * @adev: amdgpu_device pointer
1910 *
1911 * Parses the asic configuration parameters specified in the gpu info
1912  * firmware and makes them available to the driver for use in configuring
1913 * the asic.
1914 * Returns 0 on success, -EINVAL on failure.
1915 */
1916 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1917 {
1918 const char *chip_name;
1919 char fw_name[40];
1920 int err;
1921 const struct gpu_info_firmware_header_v1_0 *hdr;
1922
1923 adev->firmware.gpu_info_fw = NULL;
1924
1925 if (adev->mman.discovery_bin)
1926 return 0;
1927
1928 switch (adev->asic_type) {
1929 default:
1930 return 0;
1931 case CHIP_VEGA10:
1932 chip_name = "vega10";
1933 break;
1934 case CHIP_VEGA12:
1935 chip_name = "vega12";
1936 break;
1937 case CHIP_RAVEN:
1938 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1939 chip_name = "raven2";
1940 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1941 chip_name = "picasso";
1942 else
1943 chip_name = "raven";
1944 break;
1945 case CHIP_ARCTURUS:
1946 chip_name = "arcturus";
1947 break;
1948 case CHIP_NAVI12:
1949 chip_name = "navi12";
1950 break;
1951 }
1952
1953 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1954 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1955 if (err) {
1956 dev_err(adev->dev,
1957 "Failed to get gpu_info firmware \"%s\"\n",
1958 fw_name);
1959 goto out;
1960 }
1961
1962 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1963 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1964
1965 switch (hdr->version_major) {
1966 case 1:
1967 {
1968 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1969 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1970 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1971
1972 /*
1973 	 * Should be dropped when DAL no longer needs it.
1974 */
1975 if (adev->asic_type == CHIP_NAVI12)
1976 goto parse_soc_bounding_box;
1977
1978 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1979 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1980 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1981 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1982 adev->gfx.config.max_texture_channel_caches =
1983 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1984 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1985 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1986 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1987 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1988 adev->gfx.config.double_offchip_lds_buf =
1989 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1990 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1991 adev->gfx.cu_info.max_waves_per_simd =
1992 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1993 adev->gfx.cu_info.max_scratch_slots_per_cu =
1994 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1995 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1996 if (hdr->version_minor >= 1) {
1997 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1998 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1999 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2000 adev->gfx.config.num_sc_per_sh =
2001 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2002 adev->gfx.config.num_packer_per_sc =
2003 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2004 }
2005
2006 parse_soc_bounding_box:
2007 /*
2008 	 * soc bounding box info is not integrated in the discovery table,
2009 	 * so we always need to parse it from the gpu info firmware if needed.
2010 */
2011 if (hdr->version_minor == 2) {
2012 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2013 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2014 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2015 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2016 }
2017 break;
2018 }
2019 default:
2020 dev_err(adev->dev,
2021 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2022 err = -EINVAL;
2023 goto out;
2024 }
2025 out:
2026 return err;
2027 }
2028
2029 /**
2030 * amdgpu_device_ip_early_init - run early init for hardware IPs
2031 *
2032 * @adev: amdgpu_device pointer
2033 *
2034 * Early initialization pass for hardware IPs. The hardware IPs that make
2035  * up each asic are discovered and each IP's early_init callback is run. This
2036 * is the first stage in initializing the asic.
2037 * Returns 0 on success, negative error code on failure.
2038 */
2039 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2040 {
2041 struct pci_dev *parent;
2042 int i, r;
2043 bool total;
2044
2045 amdgpu_device_enable_virtual_display(adev);
2046
2047 if (amdgpu_sriov_vf(adev)) {
2048 r = amdgpu_virt_request_full_gpu(adev, true);
2049 if (r)
2050 return r;
2051 }
2052
2053 switch (adev->asic_type) {
2054 #ifdef CONFIG_DRM_AMDGPU_SI
2055 case CHIP_VERDE:
2056 case CHIP_TAHITI:
2057 case CHIP_PITCAIRN:
2058 case CHIP_OLAND:
2059 case CHIP_HAINAN:
2060 adev->family = AMDGPU_FAMILY_SI;
2061 r = si_set_ip_blocks(adev);
2062 if (r)
2063 return r;
2064 break;
2065 #endif
2066 #ifdef CONFIG_DRM_AMDGPU_CIK
2067 case CHIP_BONAIRE:
2068 case CHIP_HAWAII:
2069 case CHIP_KAVERI:
2070 case CHIP_KABINI:
2071 case CHIP_MULLINS:
2072 if (adev->flags & AMD_IS_APU)
2073 adev->family = AMDGPU_FAMILY_KV;
2074 else
2075 adev->family = AMDGPU_FAMILY_CI;
2076
2077 r = cik_set_ip_blocks(adev);
2078 if (r)
2079 return r;
2080 break;
2081 #endif
2082 case CHIP_TOPAZ:
2083 case CHIP_TONGA:
2084 case CHIP_FIJI:
2085 case CHIP_POLARIS10:
2086 case CHIP_POLARIS11:
2087 case CHIP_POLARIS12:
2088 case CHIP_VEGAM:
2089 case CHIP_CARRIZO:
2090 case CHIP_STONEY:
2091 if (adev->flags & AMD_IS_APU)
2092 adev->family = AMDGPU_FAMILY_CZ;
2093 else
2094 adev->family = AMDGPU_FAMILY_VI;
2095
2096 r = vi_set_ip_blocks(adev);
2097 if (r)
2098 return r;
2099 break;
2100 default:
2101 r = amdgpu_discovery_set_ip_blocks(adev);
2102 if (r)
2103 return r;
2104 break;
2105 }
2106
2107 if (amdgpu_has_atpx() &&
2108 (amdgpu_is_atpx_hybrid() ||
2109 amdgpu_has_atpx_dgpu_power_cntl()) &&
2110 ((adev->flags & AMD_IS_APU) == 0) &&
2111 !dev_is_removable(&adev->pdev->dev))
2112 adev->flags |= AMD_IS_PX;
2113
2114 if (!(adev->flags & AMD_IS_APU)) {
2115 #ifdef notyet
2116 parent = pcie_find_root_port(adev->pdev);
2117 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2118 #else
2119 adev->has_pr3 = false;
2120 #endif
2121 }
2122
2123
2124 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2125 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2126 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2127 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2128 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2129 if (!amdgpu_device_pcie_dynamic_switching_supported())
2130 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2131
2132 total = true;
2133 for (i = 0; i < adev->num_ip_blocks; i++) {
2134 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2135 DRM_WARN("disabled ip block: %d <%s>\n",
2136 i, adev->ip_blocks[i].version->funcs->name);
2137 adev->ip_blocks[i].status.valid = false;
2138 } else {
2139 if (adev->ip_blocks[i].version->funcs->early_init) {
2140 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2141 if (r == -ENOENT) {
2142 adev->ip_blocks[i].status.valid = false;
2143 } else if (r) {
2144 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2145 adev->ip_blocks[i].version->funcs->name, r);
2146 total = false;
2147 } else {
2148 adev->ip_blocks[i].status.valid = true;
2149 }
2150 } else {
2151 adev->ip_blocks[i].status.valid = true;
2152 }
2153 }
2154 /* get the vbios after the asic_funcs are set up */
2155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2156 r = amdgpu_device_parse_gpu_info_fw(adev);
2157 if (r)
2158 return r;
2159
2160 /* Read BIOS */
2161 if (amdgpu_device_read_bios(adev)) {
2162 if (!amdgpu_get_bios(adev))
2163 return -EINVAL;
2164
2165 r = amdgpu_atombios_init(adev);
2166 if (r) {
2167 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2168 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2169 return r;
2170 }
2171 }
2172
2173 		/* get pf2vf msg info at its earliest time */
2174 if (amdgpu_sriov_vf(adev))
2175 amdgpu_virt_init_data_exchange(adev);
2176
2177 }
2178 }
2179 if (!total)
2180 return -ENODEV;
2181
2182 amdgpu_amdkfd_device_probe(adev);
2183 adev->cg_flags &= amdgpu_cg_mask;
2184 adev->pg_flags &= amdgpu_pg_mask;
2185
2186 return 0;
2187 }
2188
2189 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2190 {
2191 int i, r;
2192
2193 for (i = 0; i < adev->num_ip_blocks; i++) {
2194 if (!adev->ip_blocks[i].status.sw)
2195 continue;
2196 if (adev->ip_blocks[i].status.hw)
2197 continue;
2198 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2199 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2200 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2201 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2202 if (r) {
2203 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2204 adev->ip_blocks[i].version->funcs->name, r);
2205 return r;
2206 }
2207 adev->ip_blocks[i].status.hw = true;
2208 }
2209 }
2210
2211 return 0;
2212 }
2213
2214 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2215 {
2216 int i, r;
2217
2218 for (i = 0; i < adev->num_ip_blocks; i++) {
2219 if (!adev->ip_blocks[i].status.sw)
2220 continue;
2221 if (adev->ip_blocks[i].status.hw)
2222 continue;
2223 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2224 if (r) {
2225 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2226 adev->ip_blocks[i].version->funcs->name, r);
2227 return r;
2228 }
2229 adev->ip_blocks[i].status.hw = true;
2230 }
2231
2232 return 0;
2233 }
2234
2235 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2236 {
2237 int r = 0;
2238 int i;
2239 uint32_t smu_version;
2240
2241 if (adev->asic_type >= CHIP_VEGA10) {
2242 for (i = 0; i < adev->num_ip_blocks; i++) {
2243 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2244 continue;
2245
2246 if (!adev->ip_blocks[i].status.sw)
2247 continue;
2248
2249 			/* no need to do the fw loading again if already done */
2250 			if (adev->ip_blocks[i].status.hw)
2251 break;
2252
2253 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2254 r = adev->ip_blocks[i].version->funcs->resume(adev);
2255 if (r) {
2256 DRM_ERROR("resume of IP block <%s> failed %d\n",
2257 adev->ip_blocks[i].version->funcs->name, r);
2258 return r;
2259 }
2260 } else {
2261 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2262 if (r) {
2263 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2264 adev->ip_blocks[i].version->funcs->name, r);
2265 return r;
2266 }
2267 }
2268
2269 adev->ip_blocks[i].status.hw = true;
2270 break;
2271 }
2272 }
2273
2274 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2275 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2276
2277 return r;
2278 }
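/*
 * Illustrative note: this runs between the two hw_init phases, so the
 * overall init sequence is (see amdgpu_device_ip_init() below):
 *
 *	amdgpu_device_ip_hw_init_phase1(adev);	COMMON, IH, PSP (SR-IOV)
 *	amdgpu_device_fw_loading(adev);		PSP and SMU firmware
 *	amdgpu_device_ip_hw_init_phase2(adev);	all remaining IP blocks
 */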
2279
2280 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2281 {
2282 long timeout;
2283 int r, i;
2284
2285 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2286 struct amdgpu_ring *ring = adev->rings[i];
2287
2288 		/* No need to set up the GPU scheduler for rings that don't need it */
2289 if (!ring || ring->no_scheduler)
2290 continue;
2291
2292 switch (ring->funcs->type) {
2293 case AMDGPU_RING_TYPE_GFX:
2294 timeout = adev->gfx_timeout;
2295 break;
2296 case AMDGPU_RING_TYPE_COMPUTE:
2297 timeout = adev->compute_timeout;
2298 break;
2299 case AMDGPU_RING_TYPE_SDMA:
2300 timeout = adev->sdma_timeout;
2301 break;
2302 default:
2303 timeout = adev->video_timeout;
2304 break;
2305 }
2306
2307 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2308 ring->num_hw_submission, 0,
2309 timeout, adev->reset_domain->wq,
2310 ring->sched_score, ring->name,
2311 adev->dev);
2312 if (r) {
2313 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2314 ring->name);
2315 return r;
2316 }
2317 }
2318
2319 amdgpu_xcp_update_partition_sched_list(adev);
2320
2321 return 0;
2322 }
2323
2324
2325 /**
2326 * amdgpu_device_ip_init - run init for hardware IPs
2327 *
2328 * @adev: amdgpu_device pointer
2329 *
2330 * Main initialization pass for hardware IPs. The list of all the hardware
2331 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2332 * are run. sw_init initializes the software state associated with each IP
2333 * and hw_init initializes the hardware associated with each IP.
2334 * Returns 0 on success, negative error code on failure.
2335 */
2336 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2337 {
2338 int i, r;
2339
2340 r = amdgpu_ras_init(adev);
2341 if (r)
2342 return r;
2343
2344 for (i = 0; i < adev->num_ip_blocks; i++) {
2345 if (!adev->ip_blocks[i].status.valid)
2346 continue;
2347 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2348 if (r) {
2349 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2350 adev->ip_blocks[i].version->funcs->name, r);
2351 goto init_failed;
2352 }
2353 adev->ip_blocks[i].status.sw = true;
2354
2355 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2356 /* need to do common hw init early so everything is set up for gmc */
2357 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2358 if (r) {
2359 DRM_ERROR("hw_init %d failed %d\n", i, r);
2360 goto init_failed;
2361 }
2362 adev->ip_blocks[i].status.hw = true;
2363 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2364 /* need to do gmc hw init early so we can allocate gpu mem */
2365 /* Try to reserve bad pages early */
2366 if (amdgpu_sriov_vf(adev))
2367 amdgpu_virt_exchange_data(adev);
2368
2369 r = amdgpu_device_mem_scratch_init(adev);
2370 if (r) {
2371 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2372 goto init_failed;
2373 }
2374 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2375 if (r) {
2376 DRM_ERROR("hw_init %d failed %d\n", i, r);
2377 goto init_failed;
2378 }
2379 r = amdgpu_device_wb_init(adev);
2380 if (r) {
2381 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2382 goto init_failed;
2383 }
2384 adev->ip_blocks[i].status.hw = true;
2385
2386 /* right after GMC hw init, we create CSA */
2387 if (adev->gfx.mcbp) {
2388 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2389 AMDGPU_GEM_DOMAIN_VRAM |
2390 AMDGPU_GEM_DOMAIN_GTT,
2391 AMDGPU_CSA_SIZE);
2392 if (r) {
2393 DRM_ERROR("allocate CSA failed %d\n", r);
2394 goto init_failed;
2395 }
2396 }
2397 }
2398 }
2399
2400 if (amdgpu_sriov_vf(adev))
2401 amdgpu_virt_init_data_exchange(adev);
2402
2403 r = amdgpu_ib_pool_init(adev);
2404 if (r) {
2405 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2406 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2407 goto init_failed;
2408 }
2409
2410 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2411 if (r)
2412 goto init_failed;
2413
2414 r = amdgpu_device_ip_hw_init_phase1(adev);
2415 if (r)
2416 goto init_failed;
2417
2418 r = amdgpu_device_fw_loading(adev);
2419 if (r)
2420 goto init_failed;
2421
2422 r = amdgpu_device_ip_hw_init_phase2(adev);
2423 if (r)
2424 goto init_failed;
2425
2426 /*
2427 	 * Retired pages will be loaded from eeprom and reserved here.
2428 	 * This must be called after amdgpu_device_ip_hw_init_phase2, since
2429 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2430 	 * functional for I2C communication, which is only true at this point.
2431 	 *
2432 	 * amdgpu_ras_recovery_init may fail, but the caller only acts on
2433 	 * failures caused by a bad gpu situation and stops the amdgpu init
2434 	 * process accordingly. For other failure cases it still releases all
2435 	 * the resources and prints an error message, rather than returning a
2436 	 * negative value to the upper level.
2437 	 *
2438 	 * Note: theoretically, this should be called before all vram allocations
2439 	 * to protect retired pages from being abused.
2440 */
2441 r = amdgpu_ras_recovery_init(adev);
2442 if (r)
2443 goto init_failed;
2444
2445 	/*
2446 	 * In case of XGMI, grab an extra reference on the reset domain for this device
2447 */
2448 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2449 if (amdgpu_xgmi_add_device(adev) == 0) {
2450 if (!amdgpu_sriov_vf(adev)) {
2451 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2452
2453 if (WARN_ON(!hive)) {
2454 r = -ENOENT;
2455 goto init_failed;
2456 }
2457
2458 if (!hive->reset_domain ||
2459 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2460 r = -ENOENT;
2461 amdgpu_put_xgmi_hive(hive);
2462 goto init_failed;
2463 }
2464
2465 /* Drop the early temporary reset domain we created for device */
2466 amdgpu_reset_put_reset_domain(adev->reset_domain);
2467 adev->reset_domain = hive->reset_domain;
2468 amdgpu_put_xgmi_hive(hive);
2469 }
2470 }
2471 }
2472
2473 r = amdgpu_device_init_schedulers(adev);
2474 if (r)
2475 goto init_failed;
2476
2477 /* Don't init kfd if whole hive need to be reset during init */
2478 if (!adev->gmc.xgmi.pending_reset) {
2479 kgd2kfd_init_zone_device(adev);
2480 amdgpu_amdkfd_device_init(adev);
2481 }
2482
2483 amdgpu_fru_get_product_info(adev);
2484
2485 init_failed:
2486
2487 return r;
2488 }
2489
2490 /**
2491 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2492 *
2493 * @adev: amdgpu_device pointer
2494 *
2495 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2496 * this function before a GPU reset. If the value is retained after a
2497  * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2498 */
2499 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2500 {
2501 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2502 }
2503
2504 /**
2505 * amdgpu_device_check_vram_lost - check if vram is valid
2506 *
2507 * @adev: amdgpu_device pointer
2508 *
2509 * Checks the reset magic value written to the gart pointer in VRAM.
2510 * The driver calls this after a GPU reset to see if the contents of
2511  * VRAM is lost or not.
2512 * returns true if vram is lost, false if not.
2513 */
2514 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2515 {
2516 if (memcmp(adev->gart.ptr, adev->reset_magic,
2517 AMDGPU_RESET_MAGIC_NUM))
2518 return true;
2519
2520 if (!amdgpu_in_reset(adev))
2521 return false;
2522
2523 /*
2524 * For all ASICs with baco/mode1 reset, the VRAM is
2525 * always assumed to be lost.
2526 */
2527 switch (amdgpu_asic_reset_method(adev)) {
2528 case AMD_RESET_METHOD_BACO:
2529 case AMD_RESET_METHOD_MODE1:
2530 return true;
2531 default:
2532 return false;
2533 }
2534 }
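/*
 * Illustrative sketch of how the reset-magic pair above is used around
 * a GPU reset (simplified; the real flow lives in the reset handlers):
 *
 *	amdgpu_device_fill_reset_magic(adev);
 *	... perform the asic reset ...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost)
 *		... re-upload firmware/buffers and re-init VRAM state ...
 */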
2535
2536 /**
2537 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2538 *
2539 * @adev: amdgpu_device pointer
2540 * @state: clockgating state (gate or ungate)
2541 *
2542 * The list of all the hardware IPs that make up the asic is walked and the
2543 * set_clockgating_state callbacks are run.
2544  * The late initialization pass enables clockgating for hardware IPs;
2545  * the fini and suspend passes disable it.
2546 * Returns 0 on success, negative error code on failure.
2547 */
2548
2549 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2550 enum amd_clockgating_state state)
2551 {
2552 int i, j, r;
2553
2554 if (amdgpu_emu_mode == 1)
2555 return 0;
2556
2557 for (j = 0; j < adev->num_ip_blocks; j++) {
2558 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2559 if (!adev->ip_blocks[i].status.late_initialized)
2560 continue;
2561 /* skip CG for GFX, SDMA on S0ix */
2562 if (adev->in_s0ix &&
2563 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2564 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2565 continue;
2566 /* skip CG for VCE/UVD, it's handled specially */
2567 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2568 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2569 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2570 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2571 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2572 /* enable clockgating to save power */
2573 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2574 state);
2575 if (r) {
2576 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2577 adev->ip_blocks[i].version->funcs->name, r);
2578 return r;
2579 }
2580 }
2581 }
2582
2583 return 0;
2584 }
2585
2586 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2587 enum amd_powergating_state state)
2588 {
2589 int i, j, r;
2590
2591 if (amdgpu_emu_mode == 1)
2592 return 0;
2593
2594 for (j = 0; j < adev->num_ip_blocks; j++) {
2595 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2596 if (!adev->ip_blocks[i].status.late_initialized)
2597 continue;
2598 /* skip PG for GFX, SDMA on S0ix */
2599 if (adev->in_s0ix &&
2600 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2601 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2602 continue;
2603 		/* skip PG for VCE/UVD, it's handled specially */
2604 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2605 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2606 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2607 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2608 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2609 /* enable powergating to save power */
2610 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2611 state);
2612 if (r) {
2613 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2614 adev->ip_blocks[i].version->funcs->name, r);
2615 return r;
2616 }
2617 }
2618 }
2619 return 0;
2620 }
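/*
 * Illustrative note on ordering: both helpers above gate in
 * registration order and ungate in reverse order, so the suspend/fini
 * paths ungate PG before CG, mirroring the late-init gating sequence:
 *
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 */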
2621
2622 static int amdgpu_device_enable_mgpu_fan_boost(void)
2623 {
2624 struct amdgpu_gpu_instance *gpu_ins;
2625 struct amdgpu_device *adev;
2626 int i, ret = 0;
2627
2628 mutex_lock(&mgpu_info.mutex);
2629
2630 /*
2631 * MGPU fan boost feature should be enabled
2632 * only when there are two or more dGPUs in
2633 * the system
2634 */
2635 if (mgpu_info.num_dgpu < 2)
2636 goto out;
2637
2638 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2639 gpu_ins = &(mgpu_info.gpu_ins[i]);
2640 adev = gpu_ins->adev;
2641 if (!(adev->flags & AMD_IS_APU) &&
2642 !gpu_ins->mgpu_fan_enabled) {
2643 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2644 if (ret)
2645 break;
2646
2647 gpu_ins->mgpu_fan_enabled = 1;
2648 }
2649 }
2650
2651 out:
2652 mutex_unlock(&mgpu_info.mutex);
2653
2654 return ret;
2655 }
2656
2657 /**
2658 * amdgpu_device_ip_late_init - run late init for hardware IPs
2659 *
2660 * @adev: amdgpu_device pointer
2661 *
2662 * Late initialization pass for hardware IPs. The list of all the hardware
2663 * IPs that make up the asic is walked and the late_init callbacks are run.
2664 * late_init covers any special initialization that an IP requires
2665  * after all of them have been initialized or something that needs to happen
2666 * late in the init process.
2667 * Returns 0 on success, negative error code on failure.
2668 */
2669 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2670 {
2671 struct amdgpu_gpu_instance *gpu_instance;
2672 int i = 0, r;
2673
2674 for (i = 0; i < adev->num_ip_blocks; i++) {
2675 if (!adev->ip_blocks[i].status.hw)
2676 continue;
2677 if (adev->ip_blocks[i].version->funcs->late_init) {
2678 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2679 if (r) {
2680 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2681 adev->ip_blocks[i].version->funcs->name, r);
2682 return r;
2683 }
2684 }
2685 adev->ip_blocks[i].status.late_initialized = true;
2686 }
2687
2688 r = amdgpu_ras_late_init(adev);
2689 if (r) {
2690 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2691 return r;
2692 }
2693
2694 amdgpu_ras_set_error_query_ready(adev, true);
2695
2696 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2697 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2698
2699 amdgpu_device_fill_reset_magic(adev);
2700
2701 r = amdgpu_device_enable_mgpu_fan_boost();
2702 if (r)
2703 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2704
2705 	/* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */
2706 if (amdgpu_passthrough(adev) &&
2707 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2708 adev->asic_type == CHIP_ALDEBARAN))
2709 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2710
2711 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2712 mutex_lock(&mgpu_info.mutex);
2713
2714 /*
2715 * Reset device p-state to low as this was booted with high.
2716 *
2717 * This should be performed only after all devices from the same
2718 * hive get initialized.
2719 *
2720 		 * However, the number of devices in a hive is not known in
2721 		 * advance, as it is counted one by one during device
2722 		 * initialization.
2723 		 *
2724 		 * So, we wait until all XGMI interlinked devices are initialized.
2725 		 * This may bring some delay as those devices may come from
2726 		 * different hives, but that should be OK.
2726 */
2727 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2728 for (i = 0; i < mgpu_info.num_gpu; i++) {
2729 gpu_instance = &(mgpu_info.gpu_ins[i]);
2730 if (gpu_instance->adev->flags & AMD_IS_APU)
2731 continue;
2732
2733 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2734 AMDGPU_XGMI_PSTATE_MIN);
2735 if (r) {
2736 DRM_ERROR("pstate setting failed (%d).\n", r);
2737 break;
2738 }
2739 }
2740 }
2741
2742 mutex_unlock(&mgpu_info.mutex);
2743 }
2744
2745 return 0;
2746 }
2747
2748 /**
2749 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2750 *
2751 * @adev: amdgpu_device pointer
2752 *
2753  * For ASICs that need to disable the SMC first
2754 */
2755 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2756 {
2757 int i, r;
2758
2759 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2760 return;
2761
2762 for (i = 0; i < adev->num_ip_blocks; i++) {
2763 if (!adev->ip_blocks[i].status.hw)
2764 continue;
2765 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2766 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2767 /* XXX handle errors */
2768 if (r) {
2769 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2770 adev->ip_blocks[i].version->funcs->name, r);
2771 }
2772 adev->ip_blocks[i].status.hw = false;
2773 break;
2774 }
2775 }
2776 }
2777
2778 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2779 {
2780 int i, r;
2781
2782 for (i = 0; i < adev->num_ip_blocks; i++) {
2783 if (!adev->ip_blocks[i].version->funcs->early_fini)
2784 continue;
2785
2786 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2787 if (r) {
2788 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2789 adev->ip_blocks[i].version->funcs->name, r);
2790 }
2791 }
2792
2793 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2794 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2795
2796 amdgpu_amdkfd_suspend(adev, false);
2797
2798 	/* Workaround for ASICs that need to disable the SMC first */
2799 amdgpu_device_smu_fini_early(adev);
2800
2801 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2802 if (!adev->ip_blocks[i].status.hw)
2803 continue;
2804
2805 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2806 /* XXX handle errors */
2807 if (r) {
2808 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2810 }
2811
2812 adev->ip_blocks[i].status.hw = false;
2813 }
2814
2815 if (amdgpu_sriov_vf(adev)) {
2816 if (amdgpu_virt_release_full_gpu(adev, false))
2817 DRM_ERROR("failed to release exclusive mode on fini\n");
2818 }
2819
2820 return 0;
2821 }
2822
2823 /**
2824 * amdgpu_device_ip_fini - run fini for hardware IPs
2825 *
2826 * @adev: amdgpu_device pointer
2827 *
2828 * Main teardown pass for hardware IPs. The list of all the hardware
2829 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2830 * are run. hw_fini tears down the hardware associated with each IP
2831 * and sw_fini tears down any software state associated with each IP.
2832 * Returns 0 on success, negative error code on failure.
2833 */
2834 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2835 {
2836 int i, r;
2837
2838 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2839 amdgpu_virt_release_ras_err_handler_data(adev);
2840
2841 if (adev->gmc.xgmi.num_physical_nodes > 1)
2842 amdgpu_xgmi_remove_device(adev);
2843
2844 amdgpu_amdkfd_device_fini_sw(adev);
2845
2846 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2847 if (!adev->ip_blocks[i].status.sw)
2848 continue;
2849
2850 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2851 amdgpu_ucode_free_bo(adev);
2852 amdgpu_free_static_csa(&adev->virt.csa_obj);
2853 amdgpu_device_wb_fini(adev);
2854 amdgpu_device_mem_scratch_fini(adev);
2855 amdgpu_ib_pool_fini(adev);
2856 }
2857
2858 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2859 /* XXX handle errors */
2860 if (r) {
2861 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2862 adev->ip_blocks[i].version->funcs->name, r);
2863 }
2864 adev->ip_blocks[i].status.sw = false;
2865 adev->ip_blocks[i].status.valid = false;
2866 }
2867
2868 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2869 if (!adev->ip_blocks[i].status.late_initialized)
2870 continue;
2871 if (adev->ip_blocks[i].version->funcs->late_fini)
2872 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2873 adev->ip_blocks[i].status.late_initialized = false;
2874 }
2875
2876 amdgpu_ras_fini(adev);
2877
2878 return 0;
2879 }
2880
2881 /**
2882 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2883 *
2884 * @work: work_struct.
2885 */
2886 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2887 {
2888 struct amdgpu_device *adev =
2889 container_of(work, struct amdgpu_device, delayed_init_work.work);
2890 int r;
2891
2892 r = amdgpu_ib_ring_tests(adev);
2893 if (r)
2894 DRM_ERROR("ib ring test failed (%d).\n", r);
2895 }
2896
2897 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2898 {
2899 struct amdgpu_device *adev =
2900 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2901
2902 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2903 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2904
2905 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2906 adev->gfx.gfx_off_state = true;
2907 }
2908
2909 /**
2910 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2911 *
2912 * @adev: amdgpu_device pointer
2913 *
2914 * Main suspend function for hardware IPs. The list of all the hardware
2915 * IPs that make up the asic is walked, clockgating is disabled and the
2916 * suspend callbacks are run. suspend puts the hardware and software state
2917 * in each IP into a state suitable for suspend.
2918 * Returns 0 on success, negative error code on failure.
2919 */
2920 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2921 {
2922 int i, r;
2923
2924 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2925 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2926
2927 /*
2928 	 * Per the PMFW team's suggestion, the driver needs to handle gfxoff
2929 	 * and df cstate feature disablement for the gpu reset (e.g. Mode1Reset)
2930 	 * scenario. Add the missing df cstate disablement here.
2931 */
2932 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2933 dev_warn(adev->dev, "Failed to disallow df cstate");
2934
2935 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2936 if (!adev->ip_blocks[i].status.valid)
2937 continue;
2938
2939 /* displays are handled separately */
2940 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2941 continue;
2942
2944 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2945 		/* XXX handle errors */
2946 if (r) {
2947 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2948 adev->ip_blocks[i].version->funcs->name, r);
2949 return r;
2950 }
2951
2952 adev->ip_blocks[i].status.hw = false;
2953 }
2954
2955 return 0;
2956 }
2957
2958 /**
2959 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2960 *
2961 * @adev: amdgpu_device pointer
2962 *
2963 * Main suspend function for hardware IPs. The list of all the hardware
2964 * IPs that make up the asic is walked, clockgating is disabled and the
2965 * suspend callbacks are run. suspend puts the hardware and software state
2966 * in each IP into a state suitable for suspend.
2967 * Returns 0 on success, negative error code on failure.
2968 */
2969 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2970 {
2971 int i, r;
2972
2973 if (adev->in_s0ix)
2974 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2975
2976 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2977 if (!adev->ip_blocks[i].status.valid)
2978 continue;
2979 /* displays are handled in phase1 */
2980 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2981 continue;
2982 /* PSP lost connection when err_event_athub occurs */
2983 if (amdgpu_ras_intr_triggered() &&
2984 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2985 adev->ip_blocks[i].status.hw = false;
2986 continue;
2987 }
2988
2989 		/* skip unnecessary suspend if we have not initialized them yet */
2990 if (adev->gmc.xgmi.pending_reset &&
2991 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2992 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2995 adev->ip_blocks[i].status.hw = false;
2996 continue;
2997 }
2998
2999 /* skip suspend of gfx/mes and psp for S0ix
3000 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3001 * like at runtime. PSP is also part of the always on hardware
3002 * so no need to suspend it.
3003 */
3004 if (adev->in_s0ix &&
3005 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3006 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3008 continue;
3009
3010 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3011 if (adev->in_s0ix &&
3012 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3013 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3014 continue;
3015
3016 		/* During cold boot, swPSP provides the IMU and RLC FW binaries to the TOS.
3017 		 * These are in the TMR, hence are expected to be reused by PSP-TOS to reload
3018 		 * from this location, and RLC Autoload also gets loaded automatically
3019 		 * from here based on the PMFW -> PSP message during the re-init sequence.
3020 		 * Therefore, psp suspend & resume should be skipped to avoid destroying
3021 		 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3022 */
3023 if (amdgpu_in_reset(adev) &&
3024 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3025 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3026 continue;
3027
3029 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
3030 		/* XXX handle errors */
3031 if (r) {
3032 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3033 adev->ip_blocks[i].version->funcs->name, r);
3034 }
3035 adev->ip_blocks[i].status.hw = false;
3036 /* handle putting the SMC in the appropriate state */
3037 if (!amdgpu_sriov_vf(adev)) {
3038 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3039 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3040 if (r) {
3041 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3042 adev->mp1_state, r);
3043 return r;
3044 }
3045 }
3046 }
3047 }
3048
3049 return 0;
3050 }
3051
3052 /**
3053 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3054 *
3055 * @adev: amdgpu_device pointer
3056 *
3057 * Main suspend function for hardware IPs. The list of all the hardware
3058 * IPs that make up the asic is walked, clockgating is disabled and the
3059 * suspend callbacks are run. suspend puts the hardware and software state
3060 * in each IP into a state suitable for suspend.
3061 * Returns 0 on success, negative error code on failure.
3062 */
3063 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3064 {
3065 int r;
3066
3067 if (amdgpu_sriov_vf(adev)) {
3068 amdgpu_virt_fini_data_exchange(adev);
3069 amdgpu_virt_request_full_gpu(adev, false);
3070 }
3071
3072 r = amdgpu_device_ip_suspend_phase1(adev);
3073 if (r)
3074 return r;
3075 r = amdgpu_device_ip_suspend_phase2(adev);
3076
3077 if (amdgpu_sriov_vf(adev))
3078 amdgpu_virt_release_full_gpu(adev, false);
3079
3080 return r;
3081 }
3082
3083 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3084 {
3085 int i, r;
3086
3087 static enum amd_ip_block_type ip_order[] = {
3088 AMD_IP_BLOCK_TYPE_COMMON,
3089 AMD_IP_BLOCK_TYPE_GMC,
3090 AMD_IP_BLOCK_TYPE_PSP,
3091 AMD_IP_BLOCK_TYPE_IH,
3092 };
3093
3094 for (i = 0; i < adev->num_ip_blocks; i++) {
3095 int j;
3096 struct amdgpu_ip_block *block;
3097
3098 block = &adev->ip_blocks[i];
3099 block->status.hw = false;
3100
3101 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3102
3103 if (block->version->type != ip_order[j] ||
3104 !block->status.valid)
3105 continue;
3106
3107 r = block->version->funcs->hw_init(adev);
3108 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3109 if (r)
3110 return r;
3111 block->status.hw = true;
3112 }
3113 }
3114
3115 return 0;
3116 }
3117
3118 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3119 {
3120 int i, r;
3121
3122 static enum amd_ip_block_type ip_order[] = {
3123 AMD_IP_BLOCK_TYPE_SMC,
3124 AMD_IP_BLOCK_TYPE_DCE,
3125 AMD_IP_BLOCK_TYPE_GFX,
3126 AMD_IP_BLOCK_TYPE_SDMA,
3127 AMD_IP_BLOCK_TYPE_MES,
3128 AMD_IP_BLOCK_TYPE_UVD,
3129 AMD_IP_BLOCK_TYPE_VCE,
3130 AMD_IP_BLOCK_TYPE_VCN,
3131 AMD_IP_BLOCK_TYPE_JPEG
3132 };
3133
3134 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3135 int j;
3136 struct amdgpu_ip_block *block;
3137
3138 for (j = 0; j < adev->num_ip_blocks; j++) {
3139 block = &adev->ip_blocks[j];
3140
3141 if (block->version->type != ip_order[i] ||
3142 !block->status.valid ||
3143 block->status.hw)
3144 continue;
3145
3146 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3147 r = block->version->funcs->resume(adev);
3148 else
3149 r = block->version->funcs->hw_init(adev);
3150
3151 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3152 if (r)
3153 return r;
3154 block->status.hw = true;
3155 }
3156 }
3157
3158 return 0;
3159 }
3160
3161 /**
3162 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3163 *
3164 * @adev: amdgpu_device pointer
3165 *
3166 * First resume function for hardware IPs. The list of all the hardware
3167 * IPs that make up the asic is walked and the resume callbacks are run for
3168 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3169 * after a suspend and updates the software state as necessary. This
3170 * function is also used for restoring the GPU after a GPU reset.
3171 * Returns 0 on success, negative error code on failure.
3172 */
3173 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3174 {
3175 int i, r;
3176
3177 for (i = 0; i < adev->num_ip_blocks; i++) {
3178 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3179 continue;
3180 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3181 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3182 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3183 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3184
3185 r = adev->ip_blocks[i].version->funcs->resume(adev);
3186 if (r) {
3187 DRM_ERROR("resume of IP block <%s> failed %d\n",
3188 adev->ip_blocks[i].version->funcs->name, r);
3189 return r;
3190 }
3191 adev->ip_blocks[i].status.hw = true;
3192 }
3193 }
3194
3195 return 0;
3196 }
3197
3198 /**
3199 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3200 *
3201 * @adev: amdgpu_device pointer
3202 *
3203  * Second resume function for hardware IPs. The list of all the hardware
3204 * IPs that make up the asic is walked and the resume callbacks are run for
3205 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3206 * functional state after a suspend and updates the software state as
3207 * necessary. This function is also used for restoring the GPU after a GPU
3208 * reset.
3209 * Returns 0 on success, negative error code on failure.
3210 */
3211 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3212 {
3213 int i, r;
3214
3215 for (i = 0; i < adev->num_ip_blocks; i++) {
3216 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3217 continue;
3218 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3222 continue;
3223 r = adev->ip_blocks[i].version->funcs->resume(adev);
3224 if (r) {
3225 DRM_ERROR("resume of IP block <%s> failed %d\n",
3226 adev->ip_blocks[i].version->funcs->name, r);
3227 return r;
3228 }
3229 adev->ip_blocks[i].status.hw = true;
3230 }
3231
3232 return 0;
3233 }
3234
3235 /**
3236 * amdgpu_device_ip_resume - run resume for hardware IPs
3237 *
3238 * @adev: amdgpu_device pointer
3239 *
3240 * Main resume function for hardware IPs. The hardware IPs
3241 * are split into two resume functions because they are
3242 * also used in recovering from a GPU reset and some additional
3243  * steps need to be taken between them. In this case (S3/S4) they are
3244 * run sequentially.
3245 * Returns 0 on success, negative error code on failure.
3246 */
3247 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3248 {
3249 int r;
3250
3251 r = amdgpu_device_ip_resume_phase1(adev);
3252 if (r)
3253 return r;
3254
3255 r = amdgpu_device_fw_loading(adev);
3256 if (r)
3257 return r;
3258
3259 r = amdgpu_device_ip_resume_phase2(adev);
3260
3261 return r;
3262 }
3263
3264 /**
3265 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3266 *
3267 * @adev: amdgpu_device pointer
3268 *
3269 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3270 */
3271 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3272 {
3273 if (amdgpu_sriov_vf(adev)) {
3274 if (adev->is_atom_fw) {
3275 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3276 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3277 } else {
3278 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3279 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3280 }
3281
3282 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3283 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3284 }
3285 }
3286
3287 /**
3288 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3289 *
3290 * @asic_type: AMD asic type
3291 *
3292  * Check if there is DC (new modesetting infrastructure) support for an asic.
3293 * returns true if DC has support, false if not.
3294 */
3295 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3296 {
3297 switch (asic_type) {
3298 #ifdef CONFIG_DRM_AMDGPU_SI
3299 case CHIP_HAINAN:
3300 #endif
3301 case CHIP_TOPAZ:
3302 /* chips with no display hardware */
3303 return false;
3304 #if defined(CONFIG_DRM_AMD_DC)
3305 case CHIP_TAHITI:
3306 case CHIP_PITCAIRN:
3307 case CHIP_VERDE:
3308 case CHIP_OLAND:
3309 /*
3310 * We have systems in the wild with these ASICs that require
3311 * LVDS and VGA support which is not supported with DC.
3312 *
3313 		 * Fall back to the non-DC driver here by default so as not to
3314 * cause regressions.
3315 */
3316 #if defined(CONFIG_DRM_AMD_DC_SI)
3317 return amdgpu_dc > 0;
3318 #else
3319 return false;
3320 #endif
3321 case CHIP_BONAIRE:
3322 case CHIP_KAVERI:
3323 case CHIP_KABINI:
3324 case CHIP_MULLINS:
3325 /*
3326 * We have systems in the wild with these ASICs that require
3327 * VGA support which is not supported with DC.
3328 *
3329 		 * Fall back to the non-DC driver here by default so as not to
3330 * cause regressions.
3331 */
3332 return amdgpu_dc > 0;
3333 default:
3334 return amdgpu_dc != 0;
3335 #else
3336 default:
3337 if (amdgpu_dc > 0)
3338 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3339 return false;
3340 #endif
3341 }
3342 }
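/*
 * Illustrative note: amdgpu_dc is the "amdgpu.dc" module parameter
 * (-1 auto, 0 force off, 1 force on), so "amdgpu_dc > 0" above enables
 * DC on those asics only when it is explicitly requested, e.g.:
 *
 *	modprobe amdgpu dc=1
 */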
3343
3344 /**
3345 * amdgpu_device_has_dc_support - check if dc is supported
3346 *
3347 * @adev: amdgpu_device pointer
3348 *
3349 * Returns true for supported, false for not supported
3350 */
3351 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3352 {
3353 if (adev->enable_virtual_display ||
3354 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3355 return false;
3356
3357 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3358 }
3359
3360 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3361 {
3362 struct amdgpu_device *adev =
3363 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3364 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3365
3366 /* It's a bug to not have a hive within this function */
3367 if (WARN_ON(!hive))
3368 return;
3369
3370 /*
3371 * Use task barrier to synchronize all xgmi reset works across the
3372 * hive. task_barrier_enter and task_barrier_exit will block
3373 * until all the threads running the xgmi reset works reach
3374 * those points. task_barrier_full will do both blocks.
3375 */
3376 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3377
3378 task_barrier_enter(&hive->tb);
3379 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3380
3381 if (adev->asic_reset_res)
3382 goto fail;
3383
3384 task_barrier_exit(&hive->tb);
3385 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3386
3387 if (adev->asic_reset_res)
3388 goto fail;
3389
3390 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3391 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3392 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3393 } else {
3394
3395 task_barrier_full(&hive->tb);
3396 adev->asic_reset_res = amdgpu_asic_reset(adev);
3397 }
3398
3399 fail:
3400 if (adev->asic_reset_res)
3401 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3402 adev->asic_reset_res, adev_to_drm(adev)->unique);
3403 amdgpu_put_xgmi_hive(hive);
3404 }
3405
3406 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3407 {
3408 char *input = amdgpu_lockup_timeout;
3409 char *timeout_setting = NULL;
3410 int index = 0;
3411 long timeout;
3412 int ret = 0;
3413
3414 /*
3415 	 * By default the timeout for non-compute jobs is 10000
3416 	 * and 60000 for compute jobs.
3417 	 * In SR-IOV or passthrough mode, the timeout for compute
3418 	 * jobs is 60000 by default.
3419 */
3420 adev->gfx_timeout = msecs_to_jiffies(10000);
3421 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3422 if (amdgpu_sriov_vf(adev))
3423 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3424 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3425 else
3426 adev->compute_timeout = msecs_to_jiffies(60000);
3427
3428 #ifdef notyet
3429 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3430 while ((timeout_setting = strsep(&input, ",")) &&
3431 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3432 ret = kstrtol(timeout_setting, 0, &timeout);
3433 if (ret)
3434 return ret;
3435
3436 if (timeout == 0) {
3437 index++;
3438 continue;
3439 } else if (timeout < 0) {
3440 timeout = MAX_SCHEDULE_TIMEOUT;
3441 dev_warn(adev->dev, "lockup timeout disabled");
3442 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3443 } else {
3444 timeout = msecs_to_jiffies(timeout);
3445 }
3446
3447 switch (index++) {
3448 case 0:
3449 adev->gfx_timeout = timeout;
3450 break;
3451 case 1:
3452 adev->compute_timeout = timeout;
3453 break;
3454 case 2:
3455 adev->sdma_timeout = timeout;
3456 break;
3457 case 3:
3458 adev->video_timeout = timeout;
3459 break;
3460 default:
3461 break;
3462 }
3463 }
3464 /*
3465 * There is only one value specified and
3466 * it should apply to all non-compute jobs.
3467 */
3468 if (index == 1) {
3469 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3470 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3471 adev->compute_timeout = adev->gfx_timeout;
3472 }
3473 }
3474 #endif
3475
3476 return ret;
3477 }
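/*
 * Illustrative note: on Linux the lockup_timeout module parameter
 * parsed above is a comma separated list in the order gfx, compute,
 * sdma, video, in msec (0 keeps the default, a negative value disables
 * the timeout), e.g.:
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A single value applies to all non-compute queues.
 */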
3478
3479 /**
3480 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3481 *
3482 * @adev: amdgpu_device pointer
3483 *
3484  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3485 */
3486 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3487 {
3488 #ifdef notyet
3489 struct iommu_domain *domain;
3490
3491 domain = iommu_get_domain_for_dev(adev->dev);
3492 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3493 #endif
3494 adev->ram_is_direct_mapped = true;
3495 }
3496
3497 static const struct attribute *amdgpu_dev_attributes[] = {
3498 &dev_attr_pcie_replay_count.attr,
3499 NULL
3500 };
3501
3502 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3503 {
3504 if (amdgpu_mcbp == 1)
3505 adev->gfx.mcbp = true;
3506 else if (amdgpu_mcbp == 0)
3507 adev->gfx.mcbp = false;
3508
3509 if (amdgpu_sriov_vf(adev))
3510 adev->gfx.mcbp = true;
3511
3512 if (adev->gfx.mcbp)
3513 DRM_INFO("MCBP is enabled\n");
3514 }
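/*
 * Illustrative note: amdgpu_mcbp is the "amdgpu.mcbp" module parameter
 * (-1 auto, 0 disable, 1 enable); SR-IOV VFs force mid-command-buffer
 * preemption on regardless, as seen above. E.g.:
 *
 *	modprobe amdgpu mcbp=1
 */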
3515
3516 /**
3517 * amdgpu_device_init - initialize the driver
3518 *
3519 * @adev: amdgpu_device pointer
3520 * @flags: driver flags
3521 *
3522 * Initializes the driver info and hw (all asics).
3523 * Returns 0 for success or an error on failure.
3524 * Called at driver startup.
3525 */
3526 int amdgpu_device_init(struct amdgpu_device *adev,
3527 uint32_t flags)
3528 {
3529 struct drm_device *ddev = adev_to_drm(adev);
3530 struct pci_dev *pdev = adev->pdev;
3531 int r, i;
3532 bool px = false;
3533 u32 max_MBps;
3534 int tmp;
3535
3536 adev->shutdown = false;
3537 adev->flags = flags;
3538
3539 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3540 adev->asic_type = amdgpu_force_asic_type;
3541 else
3542 adev->asic_type = flags & AMD_ASIC_MASK;
3543
3544 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3545 if (amdgpu_emu_mode == 1)
3546 adev->usec_timeout *= 10;
3547 adev->gmc.gart_size = 512 * 1024 * 1024;
3548 adev->accel_working = false;
3549 adev->num_rings = 0;
3550 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3551 adev->mman.buffer_funcs = NULL;
3552 adev->mman.buffer_funcs_ring = NULL;
3553 adev->vm_manager.vm_pte_funcs = NULL;
3554 adev->vm_manager.vm_pte_num_scheds = 0;
3555 adev->gmc.gmc_funcs = NULL;
3556 adev->harvest_ip_mask = 0x0;
3557 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3558 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3559
3560 adev->smc_rreg = &amdgpu_invalid_rreg;
3561 adev->smc_wreg = &amdgpu_invalid_wreg;
3562 adev->pcie_rreg = &amdgpu_invalid_rreg;
3563 adev->pcie_wreg = &amdgpu_invalid_wreg;
3564 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3565 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3566 adev->pciep_rreg = &amdgpu_invalid_rreg;
3567 adev->pciep_wreg = &amdgpu_invalid_wreg;
3568 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3569 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3570 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3571 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3572 adev->didt_rreg = &amdgpu_invalid_rreg;
3573 adev->didt_wreg = &amdgpu_invalid_wreg;
3574 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3575 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3576 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3577 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3578
3579 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3580 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3581 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3582
3583 	/* mutex initialization is all done here so we
3584 	 * can call these functions again without locking issues
3585 */
3586 rw_init(&adev->firmware.mutex, "agfw");
3587 rw_init(&adev->pm.mutex, "agpm");
3588 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3589 rw_init(&adev->srbm_mutex, "srbm");
3590 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3591 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3592 rw_init(&adev->gfx.partition_mutex, "gfxpar");
3593 rw_init(&adev->grbm_idx_mutex, "grbmidx");
3594 rw_init(&adev->mn_lock, "agpumn");
3595 rw_init(&adev->virt.vf_errors.lock, "vferr");
3596 hash_init(adev->mn_hash);
3597 rw_init(&adev->psp.mutex, "agpsp");
3598 rw_init(&adev->notifier_lock, "agnf");
3599 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps");
3600 rw_init(&adev->benchmark_mutex, "agbm");
3601
3602 amdgpu_device_init_apu_flags(adev);
3603
3604 r = amdgpu_device_check_arguments(adev);
3605 if (r)
3606 return r;
3607
3608 mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3609 mtx_init(&adev->smc_idx_lock, IPL_TTY);
3610 mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3611 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3612 mtx_init(&adev->didt_idx_lock, IPL_TTY);
3613 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3614 mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3615 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3616 mtx_init(&adev->mm_stats.lock, IPL_NONE);
3617
3618 INIT_LIST_HEAD(&adev->shadow_list);
3619 rw_init(&adev->shadow_list_lock, "sdwlst");
3620
3621 INIT_LIST_HEAD(&adev->reset_list);
3622
3623 INIT_LIST_HEAD(&adev->ras_list);
3624
3625 INIT_DELAYED_WORK(&adev->delayed_init_work,
3626 amdgpu_device_delayed_init_work_handler);
3627 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3628 amdgpu_device_delay_enable_gfx_off);
3629
3630 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3631
3632 adev->gfx.gfx_off_req_count = 1;
3633 adev->gfx.gfx_off_residency = 0;
3634 adev->gfx.gfx_off_entrycount = 0;
3635 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3636
3637 atomic_set(&adev->throttling_logging_enabled, 1);
3638 /*
3639 * If throttling continues, logging will be performed every minute
3640 * to avoid log flooding. "-1" is subtracted since the thermal
3641 * throttling interrupt comes every second. Thus, the total logging
3642 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3643 	 * for the throttling interrupt) = 60 seconds.
3644 */
3645 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3646 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3647
3648 #ifdef __linux__
3649 /* Registers mapping */
3650 /* TODO: block userspace mapping of io register */
3651 if (adev->asic_type >= CHIP_BONAIRE) {
3652 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3653 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3654 } else {
3655 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3656 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3657 }
3658 #endif
3659
3660 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3661 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3662
3663 #ifdef __linux__
3664 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3665 if (!adev->rmmio)
3666 return -ENOMEM;
3667 #endif
3668 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3669 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3670
3671 /*
3672 	 * The reset domain needs to be present early, before the XGMI hive is
3673 	 * discovered (if any) and initialized, to provide the reset sem and
3674 	 * in_gpu_reset flag early during init and before calling RREG32.
3675 */
3676 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3677 if (!adev->reset_domain)
3678 return -ENOMEM;
3679
3680 /* detect hw virtualization here */
3681 amdgpu_detect_virtualization(adev);
3682
3683 amdgpu_device_get_pcie_info(adev);
3684
3685 r = amdgpu_device_get_job_timeout_settings(adev);
3686 if (r) {
3687 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3688 return r;
3689 }
3690
3691 /* early init functions */
3692 r = amdgpu_device_ip_early_init(adev);
3693 if (r)
3694 return r;
3695
3696 amdgpu_device_set_mcbp(adev);
3697
3698 /* Get rid of things like offb */
3699 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3700 if (r)
3701 return r;
3702
3703 /* Enable TMZ based on IP_VERSION */
3704 amdgpu_gmc_tmz_set(adev);
3705
3706 amdgpu_gmc_noretry_set(adev);
3707 	/* Need to get xgmi info early to decide the reset behavior */
3708 if (adev->gmc.xgmi.supported) {
3709 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3710 if (r)
3711 return r;
3712 }
3713
3714 /* enable PCIE atomic ops */
3715 #ifdef notyet
3716 if (amdgpu_sriov_vf(adev)) {
3717 if (adev->virt.fw_reserve.p_pf2vf)
3718 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3719 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3720 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3721 		/* APUs with gfx9 and onward don't rely on PCIe atomics; an internal
3722 		 * path natively supports atomics, so set have_atomics_support to true.
3723 		 */
3724 } else if ((adev->flags & AMD_IS_APU) &&
3725 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3726 adev->have_atomics_support = true;
3727 } else {
3728 adev->have_atomics_support =
3729 !pci_enable_atomic_ops_to_root(adev->pdev,
3730 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3731 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3732 }
3733
3734 if (!adev->have_atomics_support)
3735 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3736 #else
3737 	/* APUs with gfx9 and onward don't rely on PCIe atomics; an internal
3738 	 * path natively supports atomics, so set have_atomics_support to true.
3739 	 */
3740 if ((adev->flags & AMD_IS_APU) &&
3741 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)))
3742 adev->have_atomics_support = true;
3743 else
3744 adev->have_atomics_support = false;
3745 #endif
3746
3747 	/* doorbell bar mapping and doorbell index init */
3748 amdgpu_doorbell_init(adev);
3749
3750 if (amdgpu_emu_mode == 1) {
3751 /* post the asic on emulation mode */
3752 emu_soc_asic_init(adev);
3753 goto fence_driver_init;
3754 }
3755
3756 amdgpu_reset_init(adev);
3757
3758 /* detect if we are with an SRIOV vbios */
3759 if (adev->bios)
3760 amdgpu_device_detect_sriov_bios(adev);
3761
3762 /* check if we need to reset the asic
3763 * E.g., driver was not cleanly unloaded previously, etc.
3764 */
3765 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3766 if (adev->gmc.xgmi.num_physical_nodes) {
3767 dev_info(adev->dev, "Pending hive reset.\n");
3768 adev->gmc.xgmi.pending_reset = true;
3769 /* Only need to init necessary block for SMU to handle the reset */
3770 for (i = 0; i < adev->num_ip_blocks; i++) {
3771 if (!adev->ip_blocks[i].status.valid)
3772 continue;
3773 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3774 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3775 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3777 DRM_DEBUG("IP %s disabled for hw_init.\n",
3778 adev->ip_blocks[i].version->funcs->name);
3779 adev->ip_blocks[i].status.hw = true;
3780 }
3781 }
3782 } else {
3783 tmp = amdgpu_reset_method;
3784 /* It should do a default reset when loading or reloading the driver,
3785 * regardless of the module parameter reset_method.
3786 */
3787 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3788 r = amdgpu_asic_reset(adev);
3789 amdgpu_reset_method = tmp;
3790 if (r) {
3791 dev_err(adev->dev, "asic reset on init failed\n");
3792 goto failed;
3793 }
3794 }
3795 }
3796
3797 /* Post card if necessary */
3798 if (amdgpu_device_need_post(adev)) {
3799 if (!adev->bios) {
3800 dev_err(adev->dev, "no vBIOS found\n");
3801 r = -EINVAL;
3802 goto failed;
3803 }
3804 DRM_INFO("GPU posting now...\n");
3805 r = amdgpu_device_asic_init(adev);
3806 if (r) {
3807 dev_err(adev->dev, "gpu post error!\n");
3808 goto failed;
3809 }
3810 }
3811
3812 if (adev->bios) {
3813 if (adev->is_atom_fw) {
3814 /* Initialize clocks */
3815 r = amdgpu_atomfirmware_get_clock_info(adev);
3816 if (r) {
3817 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3818 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3819 goto failed;
3820 }
3821 } else {
3822 /* Initialize clocks */
3823 r = amdgpu_atombios_get_clock_info(adev);
3824 if (r) {
3825 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3826 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3827 goto failed;
3828 }
3829 /* init i2c buses */
3830 if (!amdgpu_device_has_dc_support(adev))
3831 amdgpu_atombios_i2c_init(adev);
3832 }
3833 }
3834
3835 fence_driver_init:
3836 /* Fence driver */
3837 r = amdgpu_fence_driver_sw_init(adev);
3838 if (r) {
3839 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3840 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3841 goto failed;
3842 }
3843
3844 /* init the mode config */
3845 drm_mode_config_init(adev_to_drm(adev));
3846
3847 r = amdgpu_device_ip_init(adev);
3848 if (r) {
3849 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3850 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3851 goto release_ras_con;
3852 }
3853
3854 amdgpu_fence_driver_hw_init(adev);
3855
3856 dev_info(adev->dev,
3857 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3858 adev->gfx.config.max_shader_engines,
3859 adev->gfx.config.max_sh_per_se,
3860 adev->gfx.config.max_cu_per_sh,
3861 adev->gfx.cu_info.number);
3862
3863 #ifdef __OpenBSD__
3864 {
3865 const char *chip_name;
3866 uint32_t version = adev->ip_versions[GC_HWIP][0];
3867 int maj, min, rev;
3868
3869 switch (adev->asic_type) {
3870 case CHIP_RAVEN:
3871 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3872 chip_name = "RAVEN2";
3873 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3874 chip_name = "PICASSO";
3875 else
3876 chip_name = "RAVEN";
3877 break;
3878 case CHIP_RENOIR:
3879 if (adev->apu_flags & AMD_APU_IS_RENOIR)
3880 chip_name = "RENOIR";
3881 else
3882 chip_name = "GREEN_SARDINE";
3883 break;
3884 default:
3885 chip_name = amdgpu_asic_name[adev->asic_type];
3886 }
3887
3888 printf("%s: %s", adev->self.dv_xname, chip_name);
3889 /* show graphics/compute ip block version, not set on < GFX9 */
3890 if (version) {
3891 maj = IP_VERSION_MAJ(version);
3892 min = IP_VERSION_MIN(version);
3893 rev = IP_VERSION_REV(version);
3894 printf(" GC %d.%d.%d", maj, min, rev);
3895 }
3896 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id);
3897 }
3898 #endif
3899
3900 adev->accel_working = true;
3901
3902 amdgpu_vm_check_compute_bug(adev);
3903
3904 /* Initialize the buffer migration limit. */
3905 if (amdgpu_moverate >= 0)
3906 max_MBps = amdgpu_moverate;
3907 else
3908 max_MBps = 8; /* Allow 8 MB/s. */
3909 /* Get a log2 for easy divisions. */
3910 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3911
3912 r = amdgpu_atombios_sysfs_init(adev);
3913 if (r)
3914 drm_err(&adev->ddev,
3915 "registering atombios sysfs failed (%d).\n", r);
3916
3917 r = amdgpu_pm_sysfs_init(adev);
3918 if (r)
3919 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3920
3921 r = amdgpu_ucode_sysfs_init(adev);
3922 if (r) {
3923 adev->ucode_sysfs_en = false;
3924 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3925 } else
3926 adev->ucode_sysfs_en = true;
3927
3928 /*
3929 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3930 	 * Otherwise the mgpu fan boost feature will be skipped because the
3931 	 * gpu instance count would be too low.
3932 */
3933 amdgpu_register_gpu_instance(adev);
3934
3935 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3936 * explicit gating rather than handling it automatically.
3937 */
3938 if (!adev->gmc.xgmi.pending_reset) {
3939 r = amdgpu_device_ip_late_init(adev);
3940 if (r) {
3941 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3942 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3943 goto release_ras_con;
3944 }
3945 /* must succeed. */
3946 amdgpu_ras_resume(adev);
3947 queue_delayed_work(system_wq, &adev->delayed_init_work,
3948 msecs_to_jiffies(AMDGPU_RESUME_MS));
3949 }
3950
3951 if (amdgpu_sriov_vf(adev)) {
3952 amdgpu_virt_release_full_gpu(adev, true);
3953 flush_delayed_work(&adev->delayed_init_work);
3954 }
3955
3956 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3957 if (r)
3958 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3959
3960 amdgpu_fru_sysfs_init(adev);
3961
3962 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3963 		r = amdgpu_pmu_init(adev);
3964 		if (r)
3965 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
3966
3967 	/* Keep the stored PCI config space at hand to restore after a sudden PCI error */
3968 if (amdgpu_device_cache_pci_state(adev->pdev))
3969 pci_restore_state(pdev);
3970
3971 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3972 /* this will fail for cards that aren't VGA class devices, just
3973 * ignore it
3974 */
3975 #ifdef notyet
3976 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3977 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3978 #endif
3979
3980 px = amdgpu_device_supports_px(ddev);
3981
3982 if (px || (!dev_is_removable(&adev->pdev->dev) &&
3983 apple_gmux_detect(NULL, NULL)))
3984 vga_switcheroo_register_client(adev->pdev,
3985 &amdgpu_switcheroo_ops, px);
3986
3987 if (px)
3988 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3989
3990 if (adev->gmc.xgmi.pending_reset)
3991 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3992 msecs_to_jiffies(AMDGPU_RESUME_MS));
3993
3994 amdgpu_device_check_iommu_direct_map(adev);
3995
3996 return 0;
3997
3998 release_ras_con:
3999 if (amdgpu_sriov_vf(adev))
4000 amdgpu_virt_release_full_gpu(adev, true);
4001
4002 /* failed in exclusive mode due to timeout */
4003 if (amdgpu_sriov_vf(adev) &&
4004 !amdgpu_sriov_runtime(adev) &&
4005 amdgpu_virt_mmio_blocked(adev) &&
4006 !amdgpu_virt_wait_reset(adev)) {
4007 dev_err(adev->dev, "VF exclusive mode timeout\n");
4008 /* Don't send request since VF is inactive. */
4009 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4010 adev->virt.ops = NULL;
4011 r = -EAGAIN;
4012 }
4013 amdgpu_release_ras_context(adev);
4014
4015 failed:
4016 amdgpu_vf_error_trans_all(adev);
4017
4018 return r;
4019 }
4020
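/**
 * amdgpu_device_unmap_mmio - unmap all device MMIO resources
 *
 * @adev: amdgpu_device pointer
 *
 * Unmap the doorbell, register and VRAM aperture mappings and release the
 * write-combining setup on the VRAM aperture. Called when the device has
 * been unplugged.
 */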
4021 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4022 {
4023 STUB();
4024 #ifdef notyet
4025
4026 /* Clear all CPU mappings pointing to this device */
4027 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4028 #endif
4029
4030 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4031 amdgpu_doorbell_fini(adev);
4032
4033 #ifdef __linux__
4034 iounmap(adev->rmmio);
4035 adev->rmmio = NULL;
4036 if (adev->mman.aper_base_kaddr)
4037 iounmap(adev->mman.aper_base_kaddr);
4038 adev->mman.aper_base_kaddr = NULL;
4039 #else
4040 if (adev->rmmio_size > 0)
4041 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4042 adev->rmmio_size);
4043 adev->rmmio_size = 0;
4044 adev->rmmio = NULL;
4045 if (adev->mman.aper_base_kaddr)
4046 bus_space_unmap(adev->memt, adev->mman.aper_bsh,
4047 adev->gmc.visible_vram_size);
4048 adev->mman.aper_base_kaddr = NULL;
4049 #endif
4050
4051 /* Memory manager related */
4052 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4053 #ifdef __linux__
4054 arch_phys_wc_del(adev->gmc.vram_mtrr);
4055 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4056 #else
4057 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
4058 #endif
4059 }
4060 }
4061
4062 /**
4063 * amdgpu_device_fini_hw - tear down the driver
4064 *
4065 * @adev: amdgpu_device pointer
4066 *
4067 * Tear down the driver info (all asics).
4068 * Called at driver shutdown.
4069 */
4070 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4071 {
4072 dev_info(adev->dev, "amdgpu: finishing device.\n");
4073 flush_delayed_work(&adev->delayed_init_work);
4074 adev->shutdown = true;
4075
4076 	/* make sure the IB tests have finished before entering exclusive mode
4077 	 * to avoid preemption during the IB tests
4078 */
4079 if (amdgpu_sriov_vf(adev)) {
4080 amdgpu_virt_request_full_gpu(adev, false);
4081 amdgpu_virt_fini_data_exchange(adev);
4082 }
4083
4084 /* disable all interrupts */
4085 amdgpu_irq_disable_all(adev);
4086 if (adev->mode_info.mode_config_initialized) {
4087 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4088 drm_helper_force_disable_all(adev_to_drm(adev));
4089 else
4090 drm_atomic_helper_shutdown(adev_to_drm(adev));
4091 }
4092 amdgpu_fence_driver_hw_fini(adev);
4093
4094 if (adev->mman.initialized)
4095 drain_workqueue(adev->mman.bdev.wq);
4096
4097 if (adev->pm.sysfs_initialized)
4098 amdgpu_pm_sysfs_fini(adev);
4099 if (adev->ucode_sysfs_en)
4100 amdgpu_ucode_sysfs_fini(adev);
4101 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4102 amdgpu_fru_sysfs_fini(adev);
4103
4104 	/* RAS features must be disabled before hw fini */
4105 amdgpu_ras_pre_fini(adev);
4106
4107 amdgpu_device_ip_fini_early(adev);
4108
4109 amdgpu_irq_fini_hw(adev);
4110
4111 if (adev->mman.initialized)
4112 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4113
4114 amdgpu_gart_dummy_page_fini(adev);
4115
4116 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4117 amdgpu_device_unmap_mmio(adev);
4118
4119 }
4120
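/**
 * amdgpu_device_fini_sw - tear down the driver software state
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the software side of the driver (all asics) after
 * amdgpu_device_fini_hw() has shut down the hardware.
 * Called at driver shutdown.
 */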
4121 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4122 {
4123 int idx;
4124 bool px;
4125
4126 amdgpu_fence_driver_sw_fini(adev);
4127 amdgpu_device_ip_fini(adev);
4128 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4129 adev->accel_working = false;
4130 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4131
4132 amdgpu_reset_fini(adev);
4133
4134 /* free i2c buses */
4135 if (!amdgpu_device_has_dc_support(adev))
4136 amdgpu_i2c_fini(adev);
4137
4138 if (amdgpu_emu_mode != 1)
4139 amdgpu_atombios_fini(adev);
4140
4141 kfree(adev->bios);
4142 adev->bios = NULL;
4143
4144 px = amdgpu_device_supports_px(adev_to_drm(adev));
4145
4146 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4147 apple_gmux_detect(NULL, NULL)))
4148 vga_switcheroo_unregister_client(adev->pdev);
4149
4150 if (px)
4151 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4152
4153 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4154 vga_client_unregister(adev->pdev);
4155
4156 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4157 #ifdef __linux__
4158 iounmap(adev->rmmio);
4159 adev->rmmio = NULL;
4160 #else
4161 if (adev->rmmio_size > 0)
4162 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4163 adev->rmmio_size);
4164 adev->rmmio_size = 0;
4165 adev->rmmio = NULL;
4166 #endif
4167 amdgpu_doorbell_fini(adev);
4168 drm_dev_exit(idx);
4169 }
4170
4171 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4172 amdgpu_pmu_fini(adev);
4173 if (adev->mman.discovery_bin)
4174 amdgpu_discovery_fini(adev);
4175
4176 amdgpu_reset_put_reset_domain(adev->reset_domain);
4177 adev->reset_domain = NULL;
4178
4179 kfree(adev->pci_state);
4180
4181 }
4182
4183 /**
4184 * amdgpu_device_evict_resources - evict device resources
4185 * @adev: amdgpu device object
4186 *
4187  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4188 * of the vram memory type. Mainly used for evicting device resources
4189 * at suspend time.
4190 *
4191 */
4192 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4193 {
4194 int ret;
4195
4196 /* No need to evict vram on APUs for suspend to ram or s2idle */
4197 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4198 return 0;
4199
4200 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4201 if (ret)
4202 DRM_WARN("evicting device resources failed\n");
4203 return ret;
4204 }
4205
4206 /*
4207 * Suspend & resume.
4208 */
4209 /**
4210 * amdgpu_device_prepare - prepare for device suspend
4211 *
4212 * @dev: drm dev pointer
4213 *
4214 * Prepare to put the hw in the suspend state (all asics).
4215 * Returns 0 for success or an error on failure.
4216 * Called at driver suspend.
4217 */
4218 int amdgpu_device_prepare(struct drm_device *dev)
4219 {
4220 struct amdgpu_device *adev = drm_to_adev(dev);
4221 int i, r;
4222
4223 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4224 return 0;
4225
4226 /* Evict the majority of BOs before starting suspend sequence */
4227 r = amdgpu_device_evict_resources(adev);
4228 if (r)
4229 return r;
4230
4231 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4232
4233 for (i = 0; i < adev->num_ip_blocks; i++) {
4234 if (!adev->ip_blocks[i].status.valid)
4235 continue;
4236 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4237 continue;
4238 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4239 if (r)
4240 return r;
4241 }
4242
4243 return 0;
4244 }
4245
4246 /**
4247 * amdgpu_device_suspend - initiate device suspend
4248 *
4249 * @dev: drm dev pointer
4250 * @fbcon : notify the fbdev of suspend
4251 *
4252 * Puts the hw in the suspend state (all asics).
4253 * Returns 0 for success or an error on failure.
4254 * Called at driver suspend.
4255 */
4256 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4257 {
4258 struct amdgpu_device *adev = drm_to_adev(dev);
4259 int r = 0;
4260
4261 if (adev->shutdown)
4262 return 0;
4263
4264 #ifdef notyet
4265 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4266 return 0;
4267 #endif
4268
4269 adev->in_suspend = true;
4270
4271 if (amdgpu_sriov_vf(adev)) {
4272 amdgpu_virt_fini_data_exchange(adev);
4273 r = amdgpu_virt_request_full_gpu(adev, false);
4274 if (r)
4275 return r;
4276 }
4277
4278 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4279 DRM_WARN("smart shift update failed\n");
4280
4281 if (fbcon)
4282 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4283
4284 cancel_delayed_work_sync(&adev->delayed_init_work);
4285
4286 amdgpu_ras_suspend(adev);
4287
4288 amdgpu_device_ip_suspend_phase1(adev);
4289
4290 if (!adev->in_s0ix)
4291 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4292
4293 r = amdgpu_device_evict_resources(adev);
4294 if (r)
4295 return r;
4296
4297 amdgpu_fence_driver_hw_fini(adev);
4298
4299 amdgpu_device_ip_suspend_phase2(adev);
4300
4301 if (amdgpu_sriov_vf(adev))
4302 amdgpu_virt_release_full_gpu(adev, false);
4303
4304 return 0;
4305 }
4306
4307 /**
4308 * amdgpu_device_resume - initiate device resume
4309 *
4310 * @dev: drm dev pointer
4311 * @fbcon : notify the fbdev of resume
4312 *
4313 * Bring the hw back to operating state (all asics).
4314 * Returns 0 for success or an error on failure.
4315 * Called at driver resume.
4316 */
4317 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4318 {
4319 struct amdgpu_device *adev = drm_to_adev(dev);
4320 int r = 0;
4321
4322 if (amdgpu_sriov_vf(adev)) {
4323 r = amdgpu_virt_request_full_gpu(adev, true);
4324 if (r)
4325 return r;
4326 }
4327
4328 #ifdef notyet
4329 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4330 return 0;
4331 #endif
4332
4333 if (adev->in_s0ix)
4334 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4335
4336 /* post card */
4337 if (amdgpu_device_need_post(adev)) {
4338 r = amdgpu_device_asic_init(adev);
4339 if (r)
4340 dev_err(adev->dev, "amdgpu asic init failed\n");
4341 }
4342
4343 r = amdgpu_device_ip_resume(adev);
4344
4345 if (r) {
4346 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4347 goto exit;
4348 }
4349 amdgpu_fence_driver_hw_init(adev);
4350
4351 r = amdgpu_device_ip_late_init(adev);
4352 if (r)
4353 goto exit;
4354
4355 queue_delayed_work(system_wq, &adev->delayed_init_work,
4356 msecs_to_jiffies(AMDGPU_RESUME_MS));
4357
4358 if (!adev->in_s0ix) {
4359 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4360 if (r)
4361 goto exit;
4362 }
4363
4364 exit:
4365 if (amdgpu_sriov_vf(adev)) {
4366 amdgpu_virt_init_data_exchange(adev);
4367 amdgpu_virt_release_full_gpu(adev, true);
4368 }
4369
4370 if (r)
4371 return r;
4372
4373 /* Make sure IB tests flushed */
4374 flush_delayed_work(&adev->delayed_init_work);
4375
4376 if (fbcon)
4377 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4378
4379 amdgpu_ras_resume(adev);
4380
4381 if (adev->mode_info.num_crtc) {
4382 /*
4383 * Most of the connector probing functions try to acquire runtime pm
4384 * refs to ensure that the GPU is powered on when connector polling is
4385 * performed. Since we're calling this from a runtime PM callback,
4386 * trying to acquire rpm refs will cause us to deadlock.
4387 *
4388 * Since we're guaranteed to be holding the rpm lock, it's safe to
4389 * temporarily disable the rpm helpers so this doesn't deadlock us.
4390 */
4391 #if defined(CONFIG_PM) && defined(__linux__)
4392 dev->dev->power.disable_depth++;
4393 #endif
4394 if (!adev->dc_enabled)
4395 drm_helper_hpd_irq_event(dev);
4396 else
4397 drm_kms_helper_hotplug_event(dev);
4398 #if defined(CONFIG_PM) && defined(__linux__)
4399 dev->dev->power.disable_depth--;
4400 #endif
4401 }
4402 adev->in_suspend = false;
4403
4404 if (adev->enable_mes)
4405 amdgpu_mes_self_test(adev);
4406
4407 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4408 DRM_WARN("smart shift update failed\n");
4409
4410 return 0;
4411 }
4412
4413 /**
4414 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4415 *
4416 * @adev: amdgpu_device pointer
4417 *
4418 * The list of all the hardware IPs that make up the asic is walked and
4419 * the check_soft_reset callbacks are run. check_soft_reset determines
4420 * if the asic is still hung or not.
4421 * Returns true if any of the IPs are still in a hung state, false if not.
4422 */
4423 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4424 {
4425 int i;
4426 bool asic_hang = false;
4427
4428 if (amdgpu_sriov_vf(adev))
4429 return true;
4430
4431 if (amdgpu_asic_need_full_reset(adev))
4432 return true;
4433
4434 for (i = 0; i < adev->num_ip_blocks; i++) {
4435 if (!adev->ip_blocks[i].status.valid)
4436 continue;
4437 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4438 adev->ip_blocks[i].status.hang =
4439 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4440 if (adev->ip_blocks[i].status.hang) {
4441 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4442 asic_hang = true;
4443 }
4444 }
4445 return asic_hang;
4446 }
4447
4448 /**
4449 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4450 *
4451 * @adev: amdgpu_device pointer
4452 *
4453 * The list of all the hardware IPs that make up the asic is walked and the
4454 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4455 * handles any IP specific hardware or software state changes that are
4456 * necessary for a soft reset to succeed.
4457 * Returns 0 on success, negative error code on failure.
4458 */
4459 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4460 {
4461 int i, r = 0;
4462
4463 for (i = 0; i < adev->num_ip_blocks; i++) {
4464 if (!adev->ip_blocks[i].status.valid)
4465 continue;
4466 if (adev->ip_blocks[i].status.hang &&
4467 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4468 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4469 if (r)
4470 return r;
4471 }
4472 }
4473
4474 return 0;
4475 }
4476
4477 /**
4478 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4479 *
4480 * @adev: amdgpu_device pointer
4481 *
4482 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4483 * reset is necessary to recover.
4484 * Returns true if a full asic reset is required, false if not.
4485 */
4486 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4487 {
4488 int i;
4489
4490 if (amdgpu_asic_need_full_reset(adev))
4491 return true;
4492
4493 for (i = 0; i < adev->num_ip_blocks; i++) {
4494 if (!adev->ip_blocks[i].status.valid)
4495 continue;
4496 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4497 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4498 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4499 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4500 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4501 if (adev->ip_blocks[i].status.hang) {
4502 dev_info(adev->dev, "Some block need full reset!\n");
4503 return true;
4504 }
4505 }
4506 }
4507 return false;
4508 }
4509
4510 /**
4511 * amdgpu_device_ip_soft_reset - do a soft reset
4512 *
4513 * @adev: amdgpu_device pointer
4514 *
4515 * The list of all the hardware IPs that make up the asic is walked and the
4516 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4517 * IP specific hardware or software state changes that are necessary to soft
4518 * reset the IP.
4519 * Returns 0 on success, negative error code on failure.
4520 */
4521 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4522 {
4523 int i, r = 0;
4524
4525 for (i = 0; i < adev->num_ip_blocks; i++) {
4526 if (!adev->ip_blocks[i].status.valid)
4527 continue;
4528 if (adev->ip_blocks[i].status.hang &&
4529 adev->ip_blocks[i].version->funcs->soft_reset) {
4530 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4531 if (r)
4532 return r;
4533 }
4534 }
4535
4536 return 0;
4537 }
4538
4539 /**
4540 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4541 *
4542 * @adev: amdgpu_device pointer
4543 *
4544 * The list of all the hardware IPs that make up the asic is walked and the
4545 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4546 * handles any IP specific hardware or software state changes that are
4547 * necessary after the IP has been soft reset.
4548 * Returns 0 on success, negative error code on failure.
4549 */
4550 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4551 {
4552 int i, r = 0;
4553
4554 for (i = 0; i < adev->num_ip_blocks; i++) {
4555 if (!adev->ip_blocks[i].status.valid)
4556 continue;
4557 if (adev->ip_blocks[i].status.hang &&
4558 adev->ip_blocks[i].version->funcs->post_soft_reset)
4559 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4560 if (r)
4561 return r;
4562 }
4563
4564 return 0;
4565 }
4566
4567 /**
4568 * amdgpu_device_recover_vram - Recover some VRAM contents
4569 *
4570 * @adev: amdgpu_device pointer
4571 *
4572 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4573 * restore things like GPUVM page tables after a GPU reset where
4574 * the contents of VRAM might be lost.
4575 *
4576 * Returns:
4577 * 0 on success, negative error code on failure.
4578 */
4579 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4580 {
4581 struct dma_fence *fence = NULL, *next = NULL;
4582 struct amdgpu_bo *shadow;
4583 struct amdgpu_bo_vm *vmbo;
4584 long r = 1, tmo;
4585
4586 if (amdgpu_sriov_runtime(adev))
4587 tmo = msecs_to_jiffies(8000);
4588 else
4589 tmo = msecs_to_jiffies(100);
4590
4591 dev_info(adev->dev, "recover vram bo from shadow start\n");
4592 mutex_lock(&adev->shadow_list_lock);
4593 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4594 /* If vm is compute context or adev is APU, shadow will be NULL */
4595 if (!vmbo->shadow)
4596 continue;
4597 shadow = vmbo->shadow;
4598
4599 /* No need to recover an evicted BO */
4600 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4601 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4602 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4603 continue;
4604
4605 r = amdgpu_bo_restore_shadow(shadow, &next);
4606 if (r)
4607 break;
4608
4609 if (fence) {
4610 tmo = dma_fence_wait_timeout(fence, false, tmo);
4611 dma_fence_put(fence);
4612 fence = next;
4613 if (tmo == 0) {
4614 r = -ETIMEDOUT;
4615 break;
4616 } else if (tmo < 0) {
4617 r = tmo;
4618 break;
4619 }
4620 } else {
4621 fence = next;
4622 }
4623 }
4624 mutex_unlock(&adev->shadow_list_lock);
4625
4626 if (fence)
4627 tmo = dma_fence_wait_timeout(fence, false, tmo);
4628 dma_fence_put(fence);
4629
4630 if (r < 0 || tmo <= 0) {
4631 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4632 return -EIO;
4633 }
4634
4635 dev_info(adev->dev, "recover vram bo from shadow done\n");
4636 return 0;
4637 }
4638
4639
4640 /**
4641 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4642 *
4643 * @adev: amdgpu_device pointer
4644 * @from_hypervisor: request from hypervisor
4645 *
4646  * Do a VF function-level reset (FLR) and reinitialize the ASIC.
4647  * Returns 0 on success, an error code otherwise.
4648 */
4649 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4650 bool from_hypervisor)
4651 {
4652 int r;
4653 struct amdgpu_hive_info *hive = NULL;
4654 int retry_limit = 0;
4655
4656 retry:
4657 amdgpu_amdkfd_pre_reset(adev);
4658
4659 if (from_hypervisor)
4660 r = amdgpu_virt_request_full_gpu(adev, true);
4661 else
4662 r = amdgpu_virt_reset_gpu(adev);
4663 if (r)
4664 return r;
4665 amdgpu_irq_gpu_reset_resume_helper(adev);
4666
4667 	/* some SW cleanup that the VF needs to do before recovery */
4668 amdgpu_virt_post_reset(adev);
4669
4670 /* Resume IP prior to SMC */
4671 r = amdgpu_device_ip_reinit_early_sriov(adev);
4672 if (r)
4673 goto error;
4674
4675 amdgpu_virt_init_data_exchange(adev);
4676
4677 r = amdgpu_device_fw_loading(adev);
4678 if (r)
4679 return r;
4680
4681 /* now we are okay to resume SMC/CP/SDMA */
4682 r = amdgpu_device_ip_reinit_late_sriov(adev);
4683 if (r)
4684 goto error;
4685
4686 hive = amdgpu_get_xgmi_hive(adev);
4687 /* Update PSP FW topology after reset */
4688 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4689 r = amdgpu_xgmi_update_topology(hive, adev);
4690
4691 if (hive)
4692 amdgpu_put_xgmi_hive(hive);
4693
4694 if (!r) {
4695 r = amdgpu_ib_ring_tests(adev);
4696
4697 amdgpu_amdkfd_post_reset(adev);
4698 }
4699
4700 error:
4701 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4702 amdgpu_inc_vram_lost(adev);
4703 r = amdgpu_device_recover_vram(adev);
4704 }
4705 amdgpu_virt_release_full_gpu(adev, true);
4706
4707 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4708 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4709 retry_limit++;
4710 goto retry;
4711 } else
4712 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4713 }
4714
4715 return r;
4716 }
4717
4718 /**
4719  * amdgpu_device_has_job_running - check if there is any job in the pending list
4720 *
4721 * @adev: amdgpu_device pointer
4722 *
4723  * check if there is any job in the pending list
4724 */
4725 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4726 {
4727 int i;
4728 struct drm_sched_job *job;
4729
4730 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4731 struct amdgpu_ring *ring = adev->rings[i];
4732
4733 if (!ring || !ring->sched.thread)
4734 continue;
4735
4736 spin_lock(&ring->sched.job_list_lock);
4737 job = list_first_entry_or_null(&ring->sched.pending_list,
4738 struct drm_sched_job, list);
4739 spin_unlock(&ring->sched.job_list_lock);
4740 if (job)
4741 return true;
4742 }
4743 return false;
4744 }
4745
4746 /**
4747 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4748 *
4749 * @adev: amdgpu_device pointer
4750 *
4751 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4752 * a hung GPU.
4753 */
4754 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4755 {
4756
4757 if (amdgpu_gpu_recovery == 0)
4758 goto disabled;
4759
4760 /* Skip soft reset check in fatal error mode */
4761 if (!amdgpu_ras_is_poison_mode_supported(adev))
4762 return true;
4763
4764 if (amdgpu_sriov_vf(adev))
4765 return true;
4766
4767 if (amdgpu_gpu_recovery == -1) {
4768 switch (adev->asic_type) {
4769 #ifdef CONFIG_DRM_AMDGPU_SI
4770 case CHIP_VERDE:
4771 case CHIP_TAHITI:
4772 case CHIP_PITCAIRN:
4773 case CHIP_OLAND:
4774 case CHIP_HAINAN:
4775 #endif
4776 #ifdef CONFIG_DRM_AMDGPU_CIK
4777 case CHIP_KAVERI:
4778 case CHIP_KABINI:
4779 case CHIP_MULLINS:
4780 #endif
4781 case CHIP_CARRIZO:
4782 case CHIP_STONEY:
4783 case CHIP_CYAN_SKILLFISH:
4784 goto disabled;
4785 default:
4786 break;
4787 }
4788 }
4789
4790 return true;
4791
4792 disabled:
4793 dev_info(adev->dev, "GPU recovery disabled.\n");
4794 return false;
4795 }
4796
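/**
 * amdgpu_device_mode1_reset - perform a mode1 (full chip) reset
 *
 * @adev: amdgpu_device pointer
 *
 * Reset the whole ASIC through the SMU or PSP, restore the cached PCI
 * config space afterwards and wait for the ASIC to come back.
 * Returns 0 on success, negative error code on failure.
 */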
4797 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4798 {
4799 u32 i;
4800 int ret = 0;
4801
4802 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4803
4804 dev_info(adev->dev, "GPU mode1 reset\n");
4805
4806 /* disable BM */
4807 pci_clear_master(adev->pdev);
4808
4809 amdgpu_device_cache_pci_state(adev->pdev);
4810
4811 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4812 dev_info(adev->dev, "GPU smu mode1 reset\n");
4813 ret = amdgpu_dpm_mode1_reset(adev);
4814 } else {
4815 dev_info(adev->dev, "GPU psp mode1 reset\n");
4816 ret = psp_gpu_reset(adev);
4817 }
4818
4819 if (ret)
4820 goto mode1_reset_failed;
4821
4822 amdgpu_device_load_pci_state(adev->pdev);
4823 ret = amdgpu_psp_wait_for_bootloader(adev);
4824 if (ret)
4825 goto mode1_reset_failed;
4826
4827 /* wait for asic to come out of reset */
4828 for (i = 0; i < adev->usec_timeout; i++) {
4829 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4830
4831 if (memsize != 0xffffffff)
4832 break;
4833 udelay(1);
4834 }
4835
4836 if (i >= adev->usec_timeout) {
4837 ret = -ETIMEDOUT;
4838 goto mode1_reset_failed;
4839 }
4840
4841 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4842
4843 return 0;
4844
4845 mode1_reset_failed:
4846 dev_err(adev->dev, "GPU mode1 reset failed\n");
4847 return ret;
4848 }
4849
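/**
 * amdgpu_device_pre_asic_reset - prepare the ASIC for reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Force-complete the hardware fences of all rings, punish the guilty job,
 * then either soft reset the hung IP blocks or suspend them in preparation
 * for a full reset.
 * Returns 0 on success, negative error code on failure.
 */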
4850 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4851 struct amdgpu_reset_context *reset_context)
4852 {
4853 int i, r = 0;
4854 struct amdgpu_job *job = NULL;
4855 bool need_full_reset =
4856 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4857
4858 if (reset_context->reset_req_dev == adev)
4859 job = reset_context->job;
4860
4861 if (amdgpu_sriov_vf(adev)) {
4862 /* stop the data exchange thread */
4863 amdgpu_virt_fini_data_exchange(adev);
4864 }
4865
4866 amdgpu_fence_driver_isr_toggle(adev, true);
4867
4868 /* block all schedulers and reset given job's ring */
4869 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4870 struct amdgpu_ring *ring = adev->rings[i];
4871
4872 if (!ring || !ring->sched.thread)
4873 continue;
4874
4875 		/* Clear the job fences from the fence driver to avoid force_completion
4876 		 * on them; this leaves the NULL and vm flush fences in the fence driver.
4877 		 */
4878 amdgpu_fence_driver_clear_job_fences(ring);
4879
4880 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4881 amdgpu_fence_driver_force_completion(ring);
4882 }
4883
4884 amdgpu_fence_driver_isr_toggle(adev, false);
4885
4886 if (job && job->vm)
4887 drm_sched_increase_karma(&job->base);
4888
4889 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4890 /* If reset handler not implemented, continue; otherwise return */
4891 if (r == -EOPNOTSUPP)
4892 r = 0;
4893 else
4894 return r;
4895
4896 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4897 if (!amdgpu_sriov_vf(adev)) {
4898
4899 if (!need_full_reset)
4900 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4901
4902 if (!need_full_reset && amdgpu_gpu_recovery &&
4903 amdgpu_device_ip_check_soft_reset(adev)) {
4904 amdgpu_device_ip_pre_soft_reset(adev);
4905 r = amdgpu_device_ip_soft_reset(adev);
4906 amdgpu_device_ip_post_soft_reset(adev);
4907 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4908 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4909 need_full_reset = true;
4910 }
4911 }
4912
4913 if (need_full_reset)
4914 r = amdgpu_device_ip_suspend(adev);
4915 if (need_full_reset)
4916 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4917 else
4918 clear_bit(AMDGPU_NEED_FULL_RESET,
4919 &reset_context->flags);
4920 }
4921
4922 return r;
4923 }
4924
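/*
 * Snapshot the user-selected list of registers while the reset domain is
 * held, so their values can be included in the devcoredump after a reset.
 */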
4925 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4926 {
4927 int i;
4928
4929 lockdep_assert_held(&adev->reset_domain->sem);
4930
4931 for (i = 0; i < adev->num_regs; i++) {
4932 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4933 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4934 adev->reset_dump_reg_value[i]);
4935 }
4936
4937 return 0;
4938 }
4939
4940 #ifdef CONFIG_DEV_COREDUMP
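/*
 * Device coredump support: on GPU reset, capture basic information about
 * the failure (kernel and module version, offending process, register
 * snapshot, VRAM-lost status) and expose it through the dev_coredump
 * interface.
 */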
4941 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4942 size_t count, void *data, size_t datalen)
4943 {
4944 struct drm_printer p;
4945 struct amdgpu_device *adev = data;
4946 struct drm_print_iterator iter;
4947 int i;
4948
4949 iter.data = buffer;
4950 iter.offset = 0;
4951 iter.start = offset;
4952 iter.remain = count;
4953
4954 p = drm_coredump_printer(&iter);
4955
4956 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4957 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4958 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4959 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4960 if (adev->reset_task_info.pid)
4961 drm_printf(&p, "process_name: %s PID: %d\n",
4962 adev->reset_task_info.process_name,
4963 adev->reset_task_info.pid);
4964
4965 if (adev->reset_vram_lost)
4966 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4967 if (adev->num_regs) {
4968 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4969
4970 for (i = 0; i < adev->num_regs; i++)
4971 drm_printf(&p, "0x%08x: 0x%08x\n",
4972 adev->reset_dump_reg_list[i],
4973 adev->reset_dump_reg_value[i]);
4974 }
4975
4976 return count - iter.remain;
4977 }
4978
4979 static void amdgpu_devcoredump_free(void *data)
4980 {
4981 }
4982
4983 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4984 {
4985 struct drm_device *dev = adev_to_drm(adev);
4986
4987 ktime_get_ts64(&adev->reset_time);
4988 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
4989 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4990 }
4991 #endif
4992
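/**
 * amdgpu_do_asic_reset - perform the ASIC reset and bring the devices back up
 *
 * @device_list_handle: list of devices to reset (all nodes of an XGMI hive,
 *                      or a single device)
 * @reset_context: amdgpu reset context pointer
 *
 * Try a registered reset handler first; if none is implemented, fall back
 * to the default flow: reset every device in the list, re-post the cards
 * and resume their IP blocks.
 * Returns 0 on success, negative error code on failure.
 */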
4993 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4994 struct amdgpu_reset_context *reset_context)
4995 {
4996 struct amdgpu_device *tmp_adev = NULL;
4997 bool need_full_reset, skip_hw_reset, vram_lost = false;
4998 int r = 0;
4999 bool gpu_reset_for_dev_remove = 0;
5000
5001 /* Try reset handler method first */
5002 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5003 reset_list);
5004 amdgpu_reset_reg_dumps(tmp_adev);
5005
5006 reset_context->reset_device_list = device_list_handle;
5007 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5008 /* If reset handler not implemented, continue; otherwise return */
5009 if (r == -EOPNOTSUPP)
5010 r = 0;
5011 else
5012 return r;
5013
5014 /* Reset handler not implemented, use the default method */
5015 need_full_reset =
5016 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5017 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5018
5019 gpu_reset_for_dev_remove =
5020 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5021 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5022
5023 /*
5024 * ASIC reset has to be done on all XGMI hive nodes ASAP
5025 * to allow proper links negotiation in FW (within 1 sec)
5026 */
5027 if (!skip_hw_reset && need_full_reset) {
5028 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5029 /* For XGMI run all resets in parallel to speed up the process */
5030 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5031 tmp_adev->gmc.xgmi.pending_reset = false;
5032 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5033 r = -EALREADY;
5034 } else
5035 r = amdgpu_asic_reset(tmp_adev);
5036
5037 if (r) {
5038 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5039 r, adev_to_drm(tmp_adev)->unique);
5040 break;
5041 }
5042 }
5043
5044 /* For XGMI wait for all resets to complete before proceed */
5045 if (!r) {
5046 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5047 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5048 flush_work(&tmp_adev->xgmi_reset_work);
5049 r = tmp_adev->asic_reset_res;
5050 if (r)
5051 break;
5052 }
5053 }
5054 }
5055 }
5056
5057 if (!r && amdgpu_ras_intr_triggered()) {
5058 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5059 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5060 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5061 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5062 }
5063
5064 amdgpu_ras_intr_cleared();
5065 }
5066
5067 /* Since the mode1 reset affects base ip blocks, the
5068 * phase1 ip blocks need to be resumed. Otherwise there
5069 * will be a BIOS signature error and the psp bootloader
5070 * can't load kdb on the next amdgpu install.
5071 */
5072 if (gpu_reset_for_dev_remove) {
5073 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5074 amdgpu_device_ip_resume_phase1(tmp_adev);
5075
5076 goto end;
5077 }
5078
5079 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5080 if (need_full_reset) {
5081 /* post card */
5082 r = amdgpu_device_asic_init(tmp_adev);
5083 if (r) {
5084 dev_warn(tmp_adev->dev, "asic atom init failed!");
5085 } else {
5086 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5087
5088 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5089 if (r)
5090 goto out;
5091
5092 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5093 #ifdef CONFIG_DEV_COREDUMP
5094 tmp_adev->reset_vram_lost = vram_lost;
5095 memset(&tmp_adev->reset_task_info, 0,
5096 sizeof(tmp_adev->reset_task_info));
5097 if (reset_context->job && reset_context->job->vm)
5098 tmp_adev->reset_task_info =
5099 reset_context->job->vm->task_info;
5100 amdgpu_reset_capture_coredumpm(tmp_adev);
5101 #endif
5102 if (vram_lost) {
5103 DRM_INFO("VRAM is lost due to GPU reset!\n");
5104 amdgpu_inc_vram_lost(tmp_adev);
5105 }
5106
5107 r = amdgpu_device_fw_loading(tmp_adev);
5108 if (r)
5109 return r;
5110
5111 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5112 if (r)
5113 goto out;
5114
5115 if (vram_lost)
5116 amdgpu_device_fill_reset_magic(tmp_adev);
5117
5118 /*
5119 				 * Add this ASIC back as tracked, since the reset
5120 				 * already completed successfully.
5121 */
5122 amdgpu_register_gpu_instance(tmp_adev);
5123
5124 if (!reset_context->hive &&
5125 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5126 amdgpu_xgmi_add_device(tmp_adev);
5127
5128 r = amdgpu_device_ip_late_init(tmp_adev);
5129 if (r)
5130 goto out;
5131
5132 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5133
5134 				/*
5135 				 * The GPU enters a bad state once the number of faulty
5136 				 * pages flagged by ECC reaches the threshold, and RAS
5137 				 * recovery is scheduled next. So add a check here to
5138 				 * abort the recovery if the bad page threshold is
5139 				 * indeed exceeded, and remind the user to retire this
5140 				 * GPU or set a bigger bad_page_threshold so the driver
5141 				 * can be probed again.
5142 				 */
5144 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5145 /* must succeed. */
5146 amdgpu_ras_resume(tmp_adev);
5147 } else {
5148 r = -EINVAL;
5149 goto out;
5150 }
5151
5152 /* Update PSP FW topology after reset */
5153 if (reset_context->hive &&
5154 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5155 r = amdgpu_xgmi_update_topology(
5156 reset_context->hive, tmp_adev);
5157 }
5158 }
5159
5160 out:
5161 if (!r) {
5162 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5163 r = amdgpu_ib_ring_tests(tmp_adev);
5164 if (r) {
5165 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5166 need_full_reset = true;
5167 r = -EAGAIN;
5168 goto end;
5169 }
5170 }
5171
5172 if (!r)
5173 r = amdgpu_device_recover_vram(tmp_adev);
5174 else
5175 tmp_adev->asic_reset_res = r;
5176 }
5177
5178 end:
5179 if (need_full_reset)
5180 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5181 else
5182 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5183 return r;
5184 }
5185
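/*
 * Record which state MP1 (the SMU) will be in across the chosen reset
 * method, so the power-management code knows whether the SMU can be
 * messaged during the reset.
 */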
5186 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5187 {
5188
5189 switch (amdgpu_asic_reset_method(adev)) {
5190 case AMD_RESET_METHOD_MODE1:
5191 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5192 break;
5193 case AMD_RESET_METHOD_MODE2:
5194 adev->mp1_state = PP_MP1_STATE_RESET;
5195 break;
5196 default:
5197 adev->mp1_state = PP_MP1_STATE_NONE;
5198 break;
5199 }
5202 }
5203
5204 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5205 {
5206 amdgpu_vf_error_trans_all(adev);
5207 adev->mp1_state = PP_MP1_STATE_NONE;
5208 }
5209
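/* Re-enable runtime PM on the display audio function and wake it back up. */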
5210 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5211 {
5212 STUB();
5213 #ifdef notyet
5214 struct pci_dev *p = NULL;
5215
5216 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5217 adev->pdev->bus->number, 1);
5218 if (p) {
5219 pm_runtime_enable(&(p->dev));
5220 pm_runtime_resume(&(p->dev));
5221 }
5222 #endif
5223 }
5224
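/*
 * Suspend the display audio function (function 1 of the GPU's PCI slot)
 * before a BACO or mode1 reset, since those resets would otherwise change
 * the audio hardware behind the audio driver's back.
 */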
5225 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5226 {
5227 enum amd_reset_method reset_method;
5228 struct pci_dev *p = NULL;
5229 u64 expires;
5230
5231 /*
5232 * For now, only BACO and mode1 reset are confirmed
5233 	 * to suffer the audio issue if the audio device is not properly suspended.
5234 */
5235 reset_method = amdgpu_asic_reset_method(adev);
5236 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5237 (reset_method != AMD_RESET_METHOD_MODE1))
5238 return -EINVAL;
5239
5240 STUB();
5241 return -ENOSYS;
5242 #ifdef notyet
5243
5244 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5245 adev->pdev->bus->number, 1);
5246 if (!p)
5247 return -ENODEV;
5248
5249 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5250 if (!expires)
5251 		/*
5252 		 * If we cannot get the audio device autosuspend delay,
5253 		 * a fixed 4s interval is used. Since 3s is the audio
5254 		 * controller's default autosuspend delay, the 4s used
5255 		 * here is guaranteed to cover it.
5256 		 */
5257 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5258
5259 while (!pm_runtime_status_suspended(&(p->dev))) {
5260 if (!pm_runtime_suspend(&(p->dev)))
5261 break;
5262
5263 if (expires < ktime_get_mono_fast_ns()) {
5264 dev_warn(adev->dev, "failed to suspend display audio\n");
5265 pci_dev_put(p);
5266 /* TODO: abort the succeeding gpu reset? */
5267 return -ETIMEDOUT;
5268 }
5269 }
5270
5271 pm_runtime_disable(&(p->dev));
5272
5273 pci_dev_put(p);
5274 return 0;
5275 #endif
5276 }
5277
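/*
 * Cancel any reset work that is still queued (debugfs-triggered, KFD,
 * SR-IOV FLR and RAS recovery), since the recovery about to run
 * supersedes it.
 */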
5278 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5279 {
5280 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5281
5282 #if defined(CONFIG_DEBUG_FS)
5283 if (!amdgpu_sriov_vf(adev))
5284 cancel_work(&adev->reset_work);
5285 #endif
5286
5287 if (adev->kfd.dev)
5288 cancel_work(&adev->kfd.reset_work);
5289
5290 if (amdgpu_sriov_vf(adev))
5291 cancel_work(&adev->virt.flr_work);
5292
5293 if (con && adev->ras_enabled)
5294 cancel_work(&con->recovery_work);
5295
5296 }
5297
5298 /**
5299 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5300 *
5301 * @adev: amdgpu_device pointer
5302 * @job: which job trigger hang
5303 * @reset_context: amdgpu reset context pointer
5304 *
5305 * Attempt to reset the GPU if it has hung (all asics).
5306  * Attempts a soft reset or full reset and reinitializes the ASIC.
5307 * Returns 0 for success or an error on failure.
5308 */
5309
5310 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5311 struct amdgpu_job *job,
5312 struct amdgpu_reset_context *reset_context)
5313 {
5314 struct list_head device_list, *device_list_handle = NULL;
5315 bool job_signaled = false;
5316 struct amdgpu_hive_info *hive = NULL;
5317 struct amdgpu_device *tmp_adev = NULL;
5318 int i, r = 0;
5319 bool need_emergency_restart = false;
5320 bool audio_suspended = false;
5321 bool gpu_reset_for_dev_remove = false;
5322
5323 gpu_reset_for_dev_remove =
5324 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5325 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5326
5327 /*
5328 * Special case: RAS triggered and full reset isn't supported
5329 */
5330 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5331
5332 /*
5333 * Flush RAM to disk so that after reboot
5334 * the user can read log and see why the system rebooted.
5335 */
5336 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5337 amdgpu_ras_get_context(adev)->reboot) {
5338 DRM_WARN("Emergency reboot.");
5339
5340 #ifdef notyet
5341 ksys_sync_helper();
5342 emergency_restart();
5343 #else
5344 panic("emergency_restart");
5345 #endif
5346 }
5347
5348 dev_info(adev->dev, "GPU %s begin!\n",
5349 need_emergency_restart ? "jobs stop":"reset");
5350
5351 if (!amdgpu_sriov_vf(adev))
5352 hive = amdgpu_get_xgmi_hive(adev);
5353 if (hive)
5354 mutex_lock(&hive->hive_lock);
5355
5356 reset_context->job = job;
5357 reset_context->hive = hive;
5358 /*
5359 * Build list of devices to reset.
5360 * If we are in XGMI hive mode, reorder the device list
5361 * so that adev is in the first position.
5362 */
5363 INIT_LIST_HEAD(&device_list);
5364 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5365 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5366 list_add_tail(&tmp_adev->reset_list, &device_list);
5367 if (gpu_reset_for_dev_remove && adev->shutdown)
5368 tmp_adev->shutdown = true;
5369 }
5370 if (!list_is_first(&adev->reset_list, &device_list))
5371 list_rotate_to_front(&adev->reset_list, &device_list);
5372 device_list_handle = &device_list;
5373 } else {
5374 list_add_tail(&adev->reset_list, &device_list);
5375 device_list_handle = &device_list;
5376 }
5377
5378 /* We need to lock reset domain only once both for XGMI and single device */
5379 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5380 reset_list);
5381 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5382
5383 /* block all schedulers and reset given job's ring */
5384 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5385
5386 amdgpu_device_set_mp1_state(tmp_adev);
5387
5388 /*
5389 * Try to put the audio codec into suspend state
5390 * before the GPU reset starts.
5391 *
5392 * The power domain of the graphics device is
5393 * shared with the AZ power domain, so without
5394 * this we may change the audio hardware from
5395 * behind the audio driver's back and trigger
5396 * audio codec errors.
5397 */
5398 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5399 audio_suspended = true;
5400
5401 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5402
5403 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5404
5405 if (!amdgpu_sriov_vf(tmp_adev))
5406 amdgpu_amdkfd_pre_reset(tmp_adev);
5407
5408 /*
5409 * Mark the ASICs to be reset as untracked first,
5410 * and add them back after the reset completes.
5411 */
5412 amdgpu_unregister_gpu_instance(tmp_adev);
5413
5414 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5415
5416 /* disable ras on ALL IPs */
5417 if (!need_emergency_restart &&
5418 amdgpu_device_ip_need_full_reset(tmp_adev))
5419 amdgpu_ras_suspend(tmp_adev);
5420
5421 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5422 struct amdgpu_ring *ring = tmp_adev->rings[i];
5423
5424 if (!ring || !ring->sched.thread)
5425 continue;
5426
5427 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5428
5429 if (need_emergency_restart)
5430 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5431 }
5432 atomic_inc(&tmp_adev->gpu_reset_counter);
5433 }
5434
5435 if (need_emergency_restart)
5436 goto skip_sched_resume;
5437
5438 /*
5439 * Must check guilty signal here since after this point all old
5440 * HW fences are force signaled.
5441 *
5442 * job->base holds a reference to parent fence
5443 */
5444 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5445 job_signaled = true;
5446 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5447 goto skip_hw_reset;
5448 }
5449
5450 retry: /* Pre-ASIC reset for the remaining adevs in the XGMI hive. */
5451 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5452 if (gpu_reset_for_dev_remove) {
5453 /* Workaround for ASICs that need to disable the SMC first */
5454 amdgpu_device_smu_fini_early(tmp_adev);
5455 }
5456 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5457 /* TODO: should we stop here? */
5458 if (r) {
5459 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5460 r, adev_to_drm(tmp_adev)->unique);
5461 tmp_adev->asic_reset_res = r;
5462 }
5463
5464 /*
5465 * Drop all pending non-scheduler resets. Scheduler resets
5466 * were already dropped during drm_sched_stop
5467 */
5468 amdgpu_device_stop_pending_resets(tmp_adev);
5469 }
5470
5471 /* Actual ASIC resets if needed. */
5472 /* Host driver will handle XGMI hive reset for SRIOV */
5473 if (amdgpu_sriov_vf(adev)) {
5474 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5475 if (r)
5476 adev->asic_reset_res = r;
5477
5478 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so we need to resume RAS during reset */
5479 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5480 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5481 amdgpu_ras_resume(adev);
5482 } else {
5483 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5484 if (r && r == -EAGAIN)
5485 goto retry;
5486
5487 if (!r && gpu_reset_for_dev_remove)
5488 goto recover_end;
5489 }
5490
5491 skip_hw_reset:
5492
5493 /* Post-ASIC reset for all devs. */
5494 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5495
5496 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5497 struct amdgpu_ring *ring = tmp_adev->rings[i];
5498
5499 if (!ring || !ring->sched.thread)
5500 continue;
5501
5502 drm_sched_start(&ring->sched, true);
5503 }
5504
5505 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5506 amdgpu_mes_self_test(tmp_adev);
5507
5508 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5509 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5510
5511 if (tmp_adev->asic_reset_res)
5512 r = tmp_adev->asic_reset_res;
5513
5514 tmp_adev->asic_reset_res = 0;
5515
5516 if (r) {
5517 /* bad news, how do we tell this to userspace? */
5518 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5519 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5520 } else {
5521 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5522 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5523 DRM_WARN("smart shift update failed\n");
5524 }
5525 }
5526
5527 skip_sched_resume:
5528 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5529 /* unlock kfd: SRIOV would do it separately */
5530 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5531 amdgpu_amdkfd_post_reset(tmp_adev);
5532
5533 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5534 * so bring up kfd here if it wasn't initialized before
5535 */
5536 if (!adev->kfd.init_complete)
5537 amdgpu_amdkfd_device_init(adev);
5538
5539 if (audio_suspended)
5540 amdgpu_device_resume_display_audio(tmp_adev);
5541
5542 amdgpu_device_unset_mp1_state(tmp_adev);
5543
5544 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5545 }
5546
5547 recover_end:
5548 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5549 reset_list);
5550 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5551
5552 if (hive) {
5553 mutex_unlock(&hive->hive_lock);
5554 amdgpu_put_xgmi_hive(hive);
5555 }
5556
5557 if (r)
5558 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5559
5560 atomic_set(&adev->reset_domain->reset_res, r);
5561 return r;
5562 }
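/*
 * Illustrative caller sketch (not part of this file): a typical user of
 * amdgpu_device_gpu_recover() is a scheduler timeout handler, which fills
 * an amdgpu_reset_context and hands over the hung job. This is modeled on
 * the job timeout path; the helper name below is hypothetical.
 *
 *	static void example_handle_job_hang(struct amdgpu_ring *ring,
 *					    struct amdgpu_job *job)
 *	{
 *		struct amdgpu_device *adev = ring->adev;
 *		struct amdgpu_reset_context reset_context;
 *
 *		memset(&reset_context, 0, sizeof(reset_context));
 *		reset_context.method = AMD_RESET_METHOD_NONE;
 *		reset_context.reset_req_dev = adev;
 *		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *		if (amdgpu_device_gpu_recover(adev, job, &reset_context))
 *			dev_err(adev->dev, "GPU recovery failed\n");
 *	}
 */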
5563
5564 /**
5565 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5566 *
5567 * @adev: amdgpu_device pointer
5568 *
5569 * Fetches and stores in the driver the PCIe capabilities (gen speed
5570 * and lanes) of the slot the device is in. Handles APUs and
5571 * virtualized environments where PCIe config space may not be available.
5572 */
5573 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5574 {
5575 struct pci_dev *pdev;
5576 enum pci_bus_speed speed_cap, platform_speed_cap;
5577 enum pcie_link_width platform_link_width;
5578
5579 if (amdgpu_pcie_gen_cap)
5580 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5581
5582 if (amdgpu_pcie_lane_cap)
5583 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5584
5585 /* covers APUs as well */
5586 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5587 if (adev->pm.pcie_gen_mask == 0)
5588 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5589 if (adev->pm.pcie_mlw_mask == 0)
5590 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5591 return;
5592 }
5593
5594 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5595 return;
5596
5597 pcie_bandwidth_available(adev->pdev, NULL,
5598 &platform_speed_cap, &platform_link_width);
5599
5600 if (adev->pm.pcie_gen_mask == 0) {
5601 /* asic caps */
5602 pdev = adev->pdev;
5603 speed_cap = pcie_get_speed_cap(pdev);
5604 if (speed_cap == PCI_SPEED_UNKNOWN) {
5605 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5606 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5607 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5608 } else {
5609 if (speed_cap == PCIE_SPEED_32_0GT)
5610 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5611 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5612 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5613 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5614 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5615 else if (speed_cap == PCIE_SPEED_16_0GT)
5616 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5617 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5618 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5619 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5620 else if (speed_cap == PCIE_SPEED_8_0GT)
5621 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5622 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5623 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5624 else if (speed_cap == PCIE_SPEED_5_0GT)
5625 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5626 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5627 else
5628 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5629 }
5630 /* platform caps */
5631 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5632 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5633 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5634 } else {
5635 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5636 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5637 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5638 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5639 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5640 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5641 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5642 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5643 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5644 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5645 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5646 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5647 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5648 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5649 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5650 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5651 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5652 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5653 else
5654 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5655
5656 }
5657 }
5658 if (adev->pm.pcie_mlw_mask == 0) {
5659 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5660 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5661 } else {
5662 switch (platform_link_width) {
5663 case PCIE_LNK_X32:
5664 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5665 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5666 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5667 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5668 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5669 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5670 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5671 break;
5672 case PCIE_LNK_X16:
5673 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5674 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5675 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5676 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5677 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5678 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5679 break;
5680 case PCIE_LNK_X12:
5681 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5682 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5683 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5684 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5685 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5686 break;
5687 case PCIE_LNK_X8:
5688 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5689 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5690 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5691 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5692 break;
5693 case PCIE_LNK_X4:
5694 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5695 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5696 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5697 break;
5698 case PCIE_LNK_X2:
5699 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5700 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5701 break;
5702 case PCIE_LNK_X1:
5703 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5704 break;
5705 default:
5706 break;
5707 }
5708 }
5709 }
5710 }
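/*
 * Illustrative sketch (an assumption, not driver code): once the masks are
 * cached, power-management code can test them instead of re-reading PCIe
 * config space. The helper name below is hypothetical.
 *
 *	static bool example_platform_supports_gen3(struct amdgpu_device *adev)
 *	{
 *		return !!(adev->pm.pcie_gen_mask &
 *			  CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
 *	}
 */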
5711
5712 /**
5713 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5714 *
5715 * @adev: amdgpu_device pointer
5716 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5717 *
5718 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5719 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5720 * @peer_adev.
5721 */
5722 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5723 struct amdgpu_device *peer_adev)
5724 {
5725 #ifdef CONFIG_HSA_AMD_P2P
5726 uint64_t address_mask = peer_adev->dev->dma_mask ?
5727 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5728 resource_size_t aper_limit =
5729 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5730 bool p2p_access =
5731 !adev->gmc.xgmi.connected_to_cpu &&
5732 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5733
5734 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5735 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5736 !(adev->gmc.aper_base & address_mask ||
5737 aper_limit & address_mask));
5738 #else
5739 return false;
5740 #endif
5741 }
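/*
 * Worked example (illustrative): if the peer advertises a 40-bit DMA mask,
 * address_mask is ~((1ULL << 40) - 1), i.e. only bits 40..63 are set. The
 * check above then fails as soon as either the aperture base or the
 * aperture limit has any of those bits set, i.e. any part of the BAR lies
 * at or above 2^40 (1 TiB) and would be unreachable for the peer's DMA
 * engine.
 */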
5742
5743 int amdgpu_device_baco_enter(struct drm_device *dev)
5744 {
5745 struct amdgpu_device *adev = drm_to_adev(dev);
5746 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5747
5748 if (!amdgpu_device_supports_baco(dev))
5749 return -ENOTSUPP;
5750
5751 if (ras && adev->ras_enabled &&
5752 adev->nbio.funcs->enable_doorbell_interrupt)
5753 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5754
5755 return amdgpu_dpm_baco_enter(adev);
5756 }
5757
5758 int amdgpu_device_baco_exit(struct drm_device *dev)
5759 {
5760 struct amdgpu_device *adev = drm_to_adev(dev);
5761 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5762 int ret = 0;
5763
5764 if (!amdgpu_device_supports_baco(dev))
5765 return -ENOTSUPP;
5766
5767 ret = amdgpu_dpm_baco_exit(adev);
5768 if (ret)
5769 return ret;
5770
5771 if (ras && adev->ras_enabled &&
5772 adev->nbio.funcs->enable_doorbell_interrupt)
5773 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5774
5775 if (amdgpu_passthrough(adev) &&
5776 adev->nbio.funcs->clear_doorbell_interrupt)
5777 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5778
5779 return 0;
5780 }
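/*
 * Illustrative pairing (an assumption, modeled on the runtime-PM path):
 * BACO entry and exit are expected to bracket the low-power window, e.g.:
 *
 *	r = amdgpu_device_baco_enter(dev);	// runtime suspend
 *	if (r)
 *		return r;
 *	// ...device sits in BACO with most rails off...
 *	r = amdgpu_device_baco_exit(dev);	// runtime resume
 */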
5781
5782 /**
5783 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5784 * @pdev: PCI device struct
5785 * @state: PCI channel state
5786 *
5787 * Description: Called when a PCI error is detected.
5788 *
5789 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5790 */
5791 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5792 {
5793 STUB();
5794 return 0;
5795 #ifdef notyet
5796 struct drm_device *dev = pci_get_drvdata(pdev);
5797 struct amdgpu_device *adev = drm_to_adev(dev);
5798 int i;
5799
5800 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5801
5802 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5803 DRM_WARN("No support for XGMI hive yet...");
5804 return PCI_ERS_RESULT_DISCONNECT;
5805 }
5806
5807 adev->pci_channel_state = state;
5808
5809 switch (state) {
5810 case pci_channel_io_normal:
5811 return PCI_ERS_RESULT_CAN_RECOVER;
5812 /* Fatal error, prepare for slot reset */
5813 case pci_channel_io_frozen:
5814 /*
5815 * Locking adev->reset_domain->sem will prevent any external access
5816 * to GPU during PCI error recovery
5817 */
5818 amdgpu_device_lock_reset_domain(adev->reset_domain);
5819 amdgpu_device_set_mp1_state(adev);
5820
5821 /*
5822 * Block any work scheduling as we do for regular GPU reset
5823 * for the duration of the recovery
5824 */
5825 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5826 struct amdgpu_ring *ring = adev->rings[i];
5827
5828 if (!ring || !ring->sched.thread)
5829 continue;
5830
5831 drm_sched_stop(&ring->sched, NULL);
5832 }
5833 atomic_inc(&adev->gpu_reset_counter);
5834 return PCI_ERS_RESULT_NEED_RESET;
5835 case pci_channel_io_perm_failure:
5836 /* Permanent error, prepare for device removal */
5837 return PCI_ERS_RESULT_DISCONNECT;
5838 }
5839
5840 return PCI_ERS_RESULT_NEED_RESET;
5841 #endif
5842 }
5843
5844 /**
5845 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5846 * @pdev: pointer to PCI device
5847 */
5848 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5849 {
5850
5851 DRM_INFO("PCI error: mmio enabled callback!!\n");
5852
5853 /* TODO - dump whatever for debugging purposes */
5854
5855 /* This is called only if amdgpu_pci_error_detected returns
5856 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5857 * works, so there is no need to reset the slot.
5858 */
5859
5860 return PCI_ERS_RESULT_RECOVERED;
5861 }
5862
5863 /**
5864 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5865 * @pdev: PCI device struct
5866 *
5867 * Description: This routine is called by the pci error recovery
5868 * code after the PCI slot has been reset, just before we
5869 * should resume normal operations.
5870 */
5871 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5872 {
5873 STUB();
5874 return PCI_ERS_RESULT_RECOVERED;
5875 #ifdef notyet
5876 struct drm_device *dev = pci_get_drvdata(pdev);
5877 struct amdgpu_device *adev = drm_to_adev(dev);
5878 int r, i;
5879 struct amdgpu_reset_context reset_context;
5880 u32 memsize;
5881 struct list_head device_list;
5882
5883 DRM_INFO("PCI error: slot reset callback!!\n");
5884
5885 memset(&reset_context, 0, sizeof(reset_context));
5886
5887 INIT_LIST_HEAD(&device_list);
5888 list_add_tail(&adev->reset_list, &device_list);
5889
5890 /* wait for asic to come out of reset */
5891 drm_msleep(500);
5892
5893 /* Restore PCI config space */
5894 amdgpu_device_load_pci_state(pdev);
5895
5896 /* confirm ASIC came out of reset */
5897 for (i = 0; i < adev->usec_timeout; i++) {
5898 memsize = amdgpu_asic_get_config_memsize(adev);
5899
5900 if (memsize != 0xffffffff)
5901 break;
5902 udelay(1);
5903 }
5904 if (memsize == 0xffffffff) {
5905 r = -ETIME;
5906 goto out;
5907 }
5908
5909 reset_context.method = AMD_RESET_METHOD_NONE;
5910 reset_context.reset_req_dev = adev;
5911 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5912 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5913
5914 adev->no_hw_access = true;
5915 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5916 adev->no_hw_access = false;
5917 if (r)
5918 goto out;
5919
5920 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5921
5922 out:
5923 if (!r) {
5924 if (amdgpu_device_cache_pci_state(adev->pdev))
5925 pci_restore_state(adev->pdev);
5926
5927 DRM_INFO("PCIe error recovery succeeded\n");
5928 } else {
5929 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5930 amdgpu_device_unset_mp1_state(adev);
5931 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5932 }
5933
5934 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5935 #endif
5936 }
5937
5938 /**
5939 * amdgpu_pci_resume() - resume normal ops after PCI reset
5940 * @pdev: pointer to PCI device
5941 *
5942 * Called when the error recovery driver tells us that it's
5943 * OK to resume normal operation.
5944 */
5945 void amdgpu_pci_resume(struct pci_dev *pdev)
5946 {
5947 STUB();
5948 #ifdef notyet
5949 struct drm_device *dev = pci_get_drvdata(pdev);
5950 struct amdgpu_device *adev = drm_to_adev(dev);
5951 int i;
5952
5953
5954 DRM_INFO("PCI error: resume callback!!\n");
5955
5956 /* Only continue execution for the case of pci_channel_io_frozen */
5957 if (adev->pci_channel_state != pci_channel_io_frozen)
5958 return;
5959
5960 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5961 struct amdgpu_ring *ring = adev->rings[i];
5962
5963 if (!ring || !ring->sched.thread)
5964 continue;
5965
5966 drm_sched_start(&ring->sched, true);
5967 }
5968
5969 amdgpu_device_unset_mp1_state(adev);
5970 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5971 #endif
5972 }
5973
5974 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5975 {
5976 return false;
5977 #ifdef notyet
5978 struct drm_device *dev = pci_get_drvdata(pdev);
5979 struct amdgpu_device *adev = drm_to_adev(dev);
5980 int r;
5981
5982 r = pci_save_state(pdev);
5983 if (!r) {
5984 kfree(adev->pci_state);
5985
5986 adev->pci_state = pci_store_saved_state(pdev);
5987
5988 if (!adev->pci_state) {
5989 DRM_ERROR("Failed to store PCI saved state");
5990 return false;
5991 }
5992 } else {
5993 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5994 return false;
5995 }
5996
5997 return true;
5998 #endif
5999 }
6000
6001 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6002 {
6003 STUB();
6004 return false;
6005 #ifdef notyet
6006 struct drm_device *dev = pci_get_drvdata(pdev);
6007 struct amdgpu_device *adev = drm_to_adev(dev);
6008 int r;
6009
6010 if (!adev->pci_state)
6011 return false;
6012
6013 r = pci_load_saved_state(pdev, adev->pci_state);
6014
6015 if (!r) {
6016 pci_restore_state(pdev);
6017 } else {
6018 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6019 return false;
6020 }
6021
6022 return true;
6023 #endif
6024 }
6025
6026 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6027 struct amdgpu_ring *ring)
6028 {
6029 #ifdef CONFIG_X86_64
6030 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6031 return;
6032 #endif
6033 if (adev->gmc.xgmi.connected_to_cpu)
6034 return;
6035
6036 if (ring && ring->funcs->emit_hdp_flush)
6037 amdgpu_ring_emit_hdp_flush(ring);
6038 else
6039 amdgpu_asic_flush_hdp(adev, ring);
6040 }
6041
6042 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6043 struct amdgpu_ring *ring)
6044 {
6045 #ifdef CONFIG_X86_64
6046 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6047 return;
6048 #endif
6049 if (adev->gmc.xgmi.connected_to_cpu)
6050 return;
6051
6052 amdgpu_asic_invalidate_hdp(adev, ring);
6053 }
6054
6055 int amdgpu_in_reset(struct amdgpu_device *adev)
6056 {
6057 return atomic_read(&adev->reset_domain->in_gpu_reset);
6058 }
6059
6060 /**
6061 * amdgpu_device_halt() - bring hardware to some kind of halt state
6062 *
6063 * @adev: amdgpu_device pointer
6064 *
6065 * Bring hardware to some kind of halt state so that no one can touch it
6066 * any more. It helps to maintain error context when an error occurs.
6067 * Compared to a simple hang, the system will stay stable at least for SSH
6068 * access. Then it should be trivial to inspect the hardware state and
6069 * see what's going on. Implemented as follows:
6070 *
6071 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
6072 * clears all CPU mappings to the device, disallows remappings through page faults
6073 * 2. amdgpu_irq_disable_all() disables all interrupts
6074 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6075 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6076 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6077 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6078 * flush any in-flight DMA operations
6079 */
6080 void amdgpu_device_halt(struct amdgpu_device *adev)
6081 {
6082 struct pci_dev *pdev = adev->pdev;
6083 struct drm_device *ddev = adev_to_drm(adev);
6084
6085 amdgpu_xcp_dev_unplug(adev);
6086 drm_dev_unplug(ddev);
6087
6088 amdgpu_irq_disable_all(adev);
6089
6090 amdgpu_fence_driver_hw_fini(adev);
6091
6092 adev->no_hw_access = true;
6093
6094 amdgpu_device_unmap_mmio(adev);
6095
6096 pci_disable_device(pdev);
6097 pci_wait_for_pending_transaction(pdev);
6098 }
6099
6100 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6101 u32 reg)
6102 {
6103 unsigned long flags, address, data;
6104 u32 r;
6105
6106 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6107 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6108
6109 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6110 WREG32(address, reg * 4);
6111 (void)RREG32(address);
6112 r = RREG32(data);
6113 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6114 return r;
6115 }
6116
6117 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6118 u32 reg, u32 v)
6119 {
6120 unsigned long flags, address, data;
6121
6122 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6123 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6124
6125 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6126 WREG32(address, reg * 4);
6127 (void)RREG32(address);
6128 WREG32(data, v);
6129 (void)RREG32(data);
6130 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6131 }
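/*
 * Illustrative sketch: the two helpers above form the usual index/data
 * indirect access pair, so a read-modify-write of a PCIe port register
 * looks like this (the register offset and bit below are hypothetical):
 *
 *	u32 v = amdgpu_device_pcie_port_rreg(adev, example_reg);
 *	v |= EXAMPLE_BIT;	// hypothetical field
 *	amdgpu_device_pcie_port_wreg(adev, example_reg, v);
 *
 * The spinlock inside each helper protects a single index/data
 * transaction; a caller that needs an atomic read-modify-write must add
 * its own serialization around the pair.
 */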
6132
6133 /**
6134 * amdgpu_device_switch_gang - switch to a new gang
6135 * @adev: amdgpu_device pointer
6136 * @gang: the gang to switch to
6137 *
6138 * Try to switch to a new gang.
6139 * Returns: NULL if we switched to the new gang or a reference to the current
6140 * gang leader.
6141 */
6142 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6143 struct dma_fence *gang)
6144 {
6145 struct dma_fence *old = NULL;
6146
6147 do {
6148 dma_fence_put(old);
6149 rcu_read_lock();
6150 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6151 rcu_read_unlock();
6152
6153 if (old == gang)
6154 break;
6155
6156 if (!dma_fence_is_signaled(old))
6157 return old;
6158
6159 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6160 old, gang) != old);
6161
6162 dma_fence_put(old);
6163 return NULL;
6164 }
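/*
 * Illustrative caller sketch (an assumption): a submission path keeps
 * calling amdgpu_device_switch_gang() until it returns NULL, treating any
 * returned fence as a dependency of the new work:
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *		dma_fence_wait(old, false);	// or add it as a scheduler dependency
 *		dma_fence_put(old);
 *	}
 */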
6165
6166 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6167 {
6168 switch (adev->asic_type) {
6169 #ifdef CONFIG_DRM_AMDGPU_SI
6170 case CHIP_HAINAN:
6171 #endif
6172 case CHIP_TOPAZ:
6173 /* chips with no display hardware */
6174 return false;
6175 #ifdef CONFIG_DRM_AMDGPU_SI
6176 case CHIP_TAHITI:
6177 case CHIP_PITCAIRN:
6178 case CHIP_VERDE:
6179 case CHIP_OLAND:
6180 #endif
6181 #ifdef CONFIG_DRM_AMDGPU_CIK
6182 case CHIP_BONAIRE:
6183 case CHIP_HAWAII:
6184 case CHIP_KAVERI:
6185 case CHIP_KABINI:
6186 case CHIP_MULLINS:
6187 #endif
6188 case CHIP_TONGA:
6189 case CHIP_FIJI:
6190 case CHIP_POLARIS10:
6191 case CHIP_POLARIS11:
6192 case CHIP_POLARIS12:
6193 case CHIP_VEGAM:
6194 case CHIP_CARRIZO:
6195 case CHIP_STONEY:
6196 /* chips with display hardware */
6197 return true;
6198 default:
6199 /* IP discovery */
6200 if (!adev->ip_versions[DCE_HWIP][0] ||
6201 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6202 return false;
6203 return true;
6204 }
6205 }
6206
6207 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6208 uint32_t inst, uint32_t reg_addr, char reg_name[],
6209 uint32_t expected_value, uint32_t mask)
6210 {
6211 uint32_t ret = 0;
6212 uint32_t old_ = 0;
6213 uint32_t tmp_ = RREG32(reg_addr);
6214 uint32_t loop = adev->usec_timeout;
6215
6216 while ((tmp_ & (mask)) != (expected_value)) {
6217 if (old_ != tmp_) {
6218 loop = adev->usec_timeout;
6219 old_ = tmp_;
6220 } else
6221 udelay(1);
6222 tmp_ = RREG32(reg_addr);
6223 loop--;
6224 if (!loop) {
6225 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6226 inst, reg_name, (uint32_t)expected_value,
6227 (uint32_t)(tmp_ & (mask)));
6228 ret = -ETIMEDOUT;
6229 break;
6230 }
6231 }
6232 return ret;
6233 }
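/*
 * Illustrative use (the register and mask names are hypothetical): poll a
 * status register until its BUSY bit clears; the helper restarts the
 * timeout whenever the register value changes.
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, example_status_reg,
 *				       "EXAMPLE_STATUS",
 *				       0, EXAMPLE_STATUS__BUSY_MASK);
 *	if (r)
 *		dev_err(adev->dev, "block did not go idle\n");
 */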
6234