1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/device.h>
47 #include <linux/vgaarb.h>
48 #include <linux/vga_switcheroo.h>
49 #include <linux/efi.h>
50 #include "amdgpu.h"
51 #include "amdgpu_trace.h"
52 #include "amdgpu_i2c.h"
53 #include "atom.h"
54 #include "amdgpu_atombios.h"
55 #include "amdgpu_atomfirmware.h"
56 #include "amd_pcie.h"
57 #ifdef CONFIG_DRM_AMDGPU_SI
58 #include "si.h"
59 #endif
60 #ifdef CONFIG_DRM_AMDGPU_CIK
61 #include "cik.h"
62 #endif
63 #include "vi.h"
64 #include "soc15.h"
65 #include "nv.h"
66 #include "bif/bif_4_1_d.h"
67 #include <linux/firmware.h>
68 #include "amdgpu_vf_error.h"
69
70 #include "amdgpu_amdkfd.h"
71 #include "amdgpu_pm.h"
72
73 #include "amdgpu_xgmi.h"
74 #include "amdgpu_ras.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82
83 #include <drm/drm_drv.h>
84
85 #if IS_ENABLED(CONFIG_X86) && defined(__linux__)
86 #include <asm/intel-family.h>
87 #endif
88
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96
97 #define AMDGPU_RESUME_MS 2000
98 #define AMDGPU_MAX_RETRY_LIMIT 2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100
101 static const struct drm_driver amdgpu_kms_driver;
102
103 const char *amdgpu_asic_name[] = {
104 "TAHITI",
105 "PITCAIRN",
106 "VERDE",
107 "OLAND",
108 "HAINAN",
109 "BONAIRE",
110 "KAVERI",
111 "KABINI",
112 "HAWAII",
113 "MULLINS",
114 "TOPAZ",
115 "TONGA",
116 "FIJI",
117 "CARRIZO",
118 "STONEY",
119 "POLARIS10",
120 "POLARIS11",
121 "POLARIS12",
122 "VEGAM",
123 "VEGA10",
124 "VEGA12",
125 "VEGA20",
126 "RAVEN",
127 "ARCTURUS",
128 "RENOIR",
129 "ALDEBARAN",
130 "NAVI10",
131 "CYAN_SKILLFISH",
132 "NAVI14",
133 "NAVI12",
134 "SIENNA_CICHLID",
135 "NAVY_FLOUNDER",
136 "VANGOGH",
137 "DIMGREY_CAVEFISH",
138 "BEIGE_GOBY",
139 "YELLOW_CARP",
140 "IP DISCOVERY",
141 "LAST",
142 };
143
144 /**
145 * DOC: pcie_replay_count
146 *
147 * The amdgpu driver provides a sysfs API for reporting the total number
148 * of PCIe replays (NAKs)
149 * The file pcie_replay_count is used for this and returns the total
150 * number of replays as a sum of the NAKs generated and NAKs received
151 */
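/*
 * Usage sketch (not part of this file; the card index is an assumption): the
 * attribute is created per device, so assuming card0 is an amdgpu device the
 * count can be read from user space with e.g.
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 *
 * The value is a single decimal count and is read-only (mode 0444 below).
 */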
152
153 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
154 struct device_attribute *attr, char *buf)
155 {
156 struct drm_device *ddev = dev_get_drvdata(dev);
157 struct amdgpu_device *adev = drm_to_adev(ddev);
158 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
159
160 return sysfs_emit(buf, "%llu\n", cnt);
161 }
162
163 static DEVICE_ATTR(pcie_replay_count, 0444,
164 amdgpu_device_get_pcie_replay_count, NULL);
165
166 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
167
168
169 /**
170 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
171 *
172 * @dev: drm_device pointer
173 *
174 * Returns true if the device is a dGPU with ATPX power control,
175 * otherwise returns false.
176 */
177 bool amdgpu_device_supports_px(struct drm_device *dev)
178 {
179 struct amdgpu_device *adev = drm_to_adev(dev);
180
181 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
182 return true;
183 return false;
184 }
185
186 /**
187 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
188 *
189 * @dev: drm_device pointer
190 *
191 * Returns true if the device is a dGPU with ACPI power control,
192 * otherwise returns false.
193 */
194 bool amdgpu_device_supports_boco(struct drm_device *dev)
195 {
196 struct amdgpu_device *adev = drm_to_adev(dev);
197
198 if (adev->has_pr3 ||
199 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
200 return true;
201 return false;
202 }
203
204 /**
205 * amdgpu_device_supports_baco - Does the device support BACO
206 *
207 * @dev: drm_device pointer
208 *
209 * Returns true if the device supports BACO,
210 * otherwise returns false.
211 */
212 bool amdgpu_device_supports_baco(struct drm_device *dev)
213 {
214 struct amdgpu_device *adev = drm_to_adev(dev);
215
216 return amdgpu_asic_supports_baco(adev);
217 }
218
219 /**
220 * amdgpu_device_supports_smart_shift - Is the device dGPU with
221 * smart shift support
222 *
223 * @dev: drm_device pointer
224 *
225 * Returns true if the device is a dGPU with Smart Shift support,
226 * otherwise returns false.
227 */
228 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
229 {
230 return (amdgpu_device_supports_boco(dev) &&
231 amdgpu_acpi_is_power_shift_control_supported());
232 }
233
234 /*
235 * VRAM access helper functions
236 */
237
238 /**
239 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
240 *
241 * @adev: amdgpu_device pointer
242 * @pos: offset of the buffer in vram
243 * @buf: virtual address of the buffer in system memory
244 * @size: read/write size; @buf must hold at least @size bytes
245 * @write: true - write to vram, otherwise - read from vram
246 */
247 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
248 void *buf, size_t size, bool write)
249 {
250 unsigned long flags;
251 uint32_t hi = ~0, tmp = 0;
252 uint32_t *data = buf;
253 uint64_t last;
254 int idx;
255
256 if (!drm_dev_enter(adev_to_drm(adev), &idx))
257 return;
258
259 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
260
261 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
262 for (last = pos + size; pos < last; pos += 4) {
263 tmp = pos >> 31;
264
265 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
266 if (tmp != hi) {
267 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
268 hi = tmp;
269 }
270 if (write)
271 WREG32_NO_KIQ(mmMM_DATA, *data++);
272 else
273 *data++ = RREG32_NO_KIQ(mmMM_DATA);
274 }
275
276 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
277 drm_dev_exit(idx);
278 }
279
280 /**
281 * amdgpu_device_aper_access - access vram by the vram aperture
282 *
283 * @adev: amdgpu_device pointer
284 * @pos: offset of the buffer in vram
285 * @buf: virtual address of the buffer in system memory
286 * @size: read/write size; @buf must hold at least @size bytes
287 * @write: true - write to vram, otherwise - read from vram
288 *
289 * Returns the number of bytes actually transferred.
290 */
291 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
292 void *buf, size_t size, bool write)
293 {
294 #ifdef CONFIG_64BIT
295 void __iomem *addr;
296 size_t count = 0;
297 uint64_t last;
298
299 if (!adev->mman.aper_base_kaddr)
300 return 0;
301
302 last = min(pos + size, adev->gmc.visible_vram_size);
303 if (last > pos) {
304 addr = adev->mman.aper_base_kaddr + pos;
305 count = last - pos;
306
307 if (write) {
308 memcpy_toio(addr, buf, count);
309 /* Make sure HDP write cache flush happens without any reordering
310 * after the system memory contents are sent over PCIe device
311 */
312 mb();
313 amdgpu_device_flush_hdp(adev, NULL);
314 } else {
315 amdgpu_device_invalidate_hdp(adev, NULL);
316 /* Make sure HDP read cache is invalidated before issuing a read
317 * to the PCIe device
318 */
319 mb();
320 memcpy_fromio(buf, addr, count);
321 }
322
323 }
324
325 return count;
326 #else
327 return 0;
328 #endif
329 }
330
331 /**
332 * amdgpu_device_vram_access - read/write a buffer in vram
333 *
334 * @adev: amdgpu_device pointer
335 * @pos: offset of the buffer in vram
336 * @buf: virtual address of the buffer in system memory
337 * @size: read/write size; @buf must hold at least @size bytes
338 * @write: true - write to vram, otherwise - read from vram
339 */
340 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
341 void *buf, size_t size, bool write)
342 {
343 size_t count;
344
345 /* try using the vram aperture to access vram first */
346 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
347 size -= count;
348 if (size) {
349 /* use MM to access the rest of vram */
350 pos += count;
351 buf += count;
352 amdgpu_device_mm_access(adev, pos, buf, size, write);
353 }
354 }
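/*
 * Minimal usage sketch for the helpers above (illustrative only; the buffer,
 * offset and size are made up). Reads the first 256 bytes of VRAM into a
 * stack buffer, falling back from the CPU aperture to MM_INDEX/MM_DATA
 * automatically:
 *
 *   u32 tmp[64];
 *
 *   amdgpu_device_vram_access(adev, 0, tmp, sizeof(tmp), false);
 *
 * Both @pos and @size must be dword aligned for the MM path (see the
 * BUG_ON() in amdgpu_device_mm_access()).
 */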
355
356 /*
357 * register access helper functions.
358 */
359
360 /* Check if hw access should be skipped because of hotplug or device error */
361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
362 {
363 if (adev->no_hw_access)
364 return true;
365
366 #ifdef CONFIG_LOCKDEP
367 /*
368 * This is a bit complicated to understand, so worth a comment. What we assert
369 * here is that the GPU reset is not running on another thread in parallel.
370 *
371 * For this we trylock the read side of the reset semaphore, if that succeeds
372 * we know that the reset is not running in parallel.
373 *
374 * If the trylock fails we assert that we are either already holding the read
375 * side of the lock or are the reset thread itself and hold the write side of
376 * the lock.
377 */
378 if (in_task()) {
379 if (down_read_trylock(&adev->reset_domain->sem))
380 up_read(&adev->reset_domain->sem);
381 else
382 lockdep_assert_held(&adev->reset_domain->sem);
383 }
384 #endif
385 return false;
386 }
387
388 /**
389 * amdgpu_device_rreg - read a memory mapped IO or indirect register
390 *
391 * @adev: amdgpu_device pointer
392 * @reg: dword aligned register offset
393 * @acc_flags: access flags which require special behavior
394 *
395 * Returns the 32 bit value from the offset specified.
396 */
397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
398 uint32_t reg, uint32_t acc_flags)
399 {
400 uint32_t ret;
401
402 if (amdgpu_device_skip_hw_access(adev))
403 return 0;
404
405 if ((reg * 4) < adev->rmmio_size) {
406 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
407 amdgpu_sriov_runtime(adev) &&
408 down_read_trylock(&adev->reset_domain->sem)) {
409 ret = amdgpu_kiq_rreg(adev, reg);
410 up_read(&adev->reset_domain->sem);
411 } else {
412 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
413 }
414 } else {
415 ret = adev->pcie_rreg(adev, reg * 4);
416 }
417
418 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
419
420 return ret;
421 }
422
423 /*
424 * MMIO register read with byte offset helper functions
425 * @offset: byte offset from MMIO start
426 */
427
428 /**
429 * amdgpu_mm_rreg8 - read a memory mapped IO register
430 *
431 * @adev: amdgpu_device pointer
432 * @offset: byte aligned register offset
433 *
434 * Returns the 8 bit value from the offset specified.
435 */
436 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
437 {
438 if (amdgpu_device_skip_hw_access(adev))
439 return 0;
440
441 if (offset < adev->rmmio_size)
442 return (readb(adev->rmmio + offset));
443 BUG();
444 }
445
446 /*
447 * MMIO register write with byte offset helper functions
448 * @offset: byte offset from MMIO start
449 * @value: the value to be written to the register
450 */
451
452 /**
453 * amdgpu_mm_wreg8 - write a memory mapped IO register
454 *
455 * @adev: amdgpu_device pointer
456 * @offset: byte aligned register offset
457 * @value: 8 bit value to write
458 *
459 * Writes the value specified to the offset specified.
460 */
461 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
462 {
463 if (amdgpu_device_skip_hw_access(adev))
464 return;
465
466 if (offset < adev->rmmio_size)
467 writeb(value, adev->rmmio + offset);
468 else
469 BUG();
470 }
471
472 /**
473 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
474 *
475 * @adev: amdgpu_device pointer
476 * @reg: dword aligned register offset
477 * @v: 32 bit value to write to the register
478 * @acc_flags: access flags which require special behavior
479 *
480 * Writes the value specified to the offset specified.
481 */
482 void amdgpu_device_wreg(struct amdgpu_device *adev,
483 uint32_t reg, uint32_t v,
484 uint32_t acc_flags)
485 {
486 if (amdgpu_device_skip_hw_access(adev))
487 return;
488
489 if ((reg * 4) < adev->rmmio_size) {
490 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
491 amdgpu_sriov_runtime(adev) &&
492 down_read_trylock(&adev->reset_domain->sem)) {
493 amdgpu_kiq_wreg(adev, reg, v);
494 up_read(&adev->reset_domain->sem);
495 } else {
496 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
497 }
498 } else {
499 adev->pcie_wreg(adev, reg * 4, v);
500 }
501
502 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
503 }
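/*
 * Note: most callers do not use amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly; they go through the RREG32()/WREG32() family of macros (see
 * amdgpu.h), which pass the appropriate acc_flags. For instance the *_NO_KIQ
 * variants set AMDGPU_REGS_NO_KIQ to bypass the KIQ path under SR-IOV.
 */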
504
505 /**
506 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
507 *
508 * @adev: amdgpu_device pointer
509 * @reg: mmio/rlc register
510 * @v: value to write
511 *
512 * This function is invoked only for debugfs register access.
513 */
514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
515 uint32_t reg, uint32_t v,
516 uint32_t xcc_id)
517 {
518 if (amdgpu_device_skip_hw_access(adev))
519 return;
520
521 if (amdgpu_sriov_fullaccess(adev) &&
522 adev->gfx.rlc.funcs &&
523 adev->gfx.rlc.funcs->is_rlcg_access_range) {
524 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
525 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
526 } else if ((reg * 4) >= adev->rmmio_size) {
527 adev->pcie_wreg(adev, reg * 4, v);
528 } else {
529 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
530 }
531 }
532
533 /**
534 * amdgpu_device_indirect_rreg - read an indirect register
535 *
536 * @adev: amdgpu_device pointer
537 * @reg_addr: indirect register address to read from
538 *
539 * Returns the value of indirect register @reg_addr
540 */
541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
542 u32 reg_addr)
543 {
544 unsigned long flags, pcie_index, pcie_data;
545 void __iomem *pcie_index_offset;
546 void __iomem *pcie_data_offset;
547 u32 r;
548
549 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
550 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
551
552 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
553 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
554 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
555
556 writel(reg_addr, pcie_index_offset);
557 readl(pcie_index_offset);
558 r = readl(pcie_data_offset);
559 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
560
561 return r;
562 }
563
564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
565 u64 reg_addr)
566 {
567 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
568 u32 r;
569 void __iomem *pcie_index_offset;
570 void __iomem *pcie_index_hi_offset;
571 void __iomem *pcie_data_offset;
572
573 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
574 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
575 if (adev->nbio.funcs->get_pcie_index_hi_offset)
576 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
577 else
578 pcie_index_hi = 0;
579
580 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
581 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
582 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
583 if (pcie_index_hi != 0)
584 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
585 pcie_index_hi * 4;
586
587 writel(reg_addr, pcie_index_offset);
588 readl(pcie_index_offset);
589 if (pcie_index_hi != 0) {
590 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
591 readl(pcie_index_hi_offset);
592 }
593 r = readl(pcie_data_offset);
594
595 /* clear the high bits */
596 if (pcie_index_hi != 0) {
597 writel(0, pcie_index_hi_offset);
598 readl(pcie_index_hi_offset);
599 }
600
601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602
603 return r;
604 }
605
606 /**
607 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
608 *
609 * @adev: amdgpu_device pointer
610 * @reg_addr: indirect register address to read from
611 *
612 * Returns the value of indirect register @reg_addr
613 */
614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
615 u32 reg_addr)
616 {
617 unsigned long flags, pcie_index, pcie_data;
618 void __iomem *pcie_index_offset;
619 void __iomem *pcie_data_offset;
620 u64 r;
621
622 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
623 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
624
625 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
626 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
627 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
628
629 /* read low 32 bits */
630 writel(reg_addr, pcie_index_offset);
631 readl(pcie_index_offset);
632 r = readl(pcie_data_offset);
633 /* read high 32 bits */
634 writel(reg_addr + 4, pcie_index_offset);
635 readl(pcie_index_offset);
636 r |= ((u64)readl(pcie_data_offset) << 32);
637 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
638
639 return r;
640 }
641
642 /**
643 * amdgpu_device_indirect_wreg - write an indirect register
644 *
645 * @adev: amdgpu_device pointer
646 * @reg_addr: indirect register offset
647 * @reg_data: indirect register data
648 *
649 */
650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
651 u32 reg_addr, u32 reg_data)
652 {
653 unsigned long flags, pcie_index, pcie_data;
654 void __iomem *pcie_index_offset;
655 void __iomem *pcie_data_offset;
656
657 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
658 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
659
660 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
661 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
662 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
663
664 writel(reg_addr, pcie_index_offset);
665 readl(pcie_index_offset);
666 writel(reg_data, pcie_data_offset);
667 readl(pcie_data_offset);
668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
669 }
670
671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
672 u64 reg_addr, u32 reg_data)
673 {
674 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
675 void __iomem *pcie_index_offset;
676 void __iomem *pcie_index_hi_offset;
677 void __iomem *pcie_data_offset;
678
679 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
680 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
681 if (adev->nbio.funcs->get_pcie_index_hi_offset)
682 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
683 else
684 pcie_index_hi = 0;
685
686 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
687 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
688 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
689 if (pcie_index_hi != 0)
690 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
691 pcie_index_hi * 4;
692
693 writel(reg_addr, pcie_index_offset);
694 readl(pcie_index_offset);
695 if (pcie_index_hi != 0) {
696 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
697 readl(pcie_index_hi_offset);
698 }
699 writel(reg_data, pcie_data_offset);
700 readl(pcie_data_offset);
701
702 /* clear the high bits */
703 if (pcie_index_hi != 0) {
704 writel(0, pcie_index_hi_offset);
705 readl(pcie_index_hi_offset);
706 }
707
708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
709 }
710
711 /**
712 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
713 *
714 * @adev: amdgpu_device pointer
715 * @reg_addr: indirect register offset
716 * @reg_data: indirect register data
717 *
718 */
719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
720 u32 reg_addr, u64 reg_data)
721 {
722 unsigned long flags, pcie_index, pcie_data;
723 void __iomem *pcie_index_offset;
724 void __iomem *pcie_data_offset;
725
726 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
727 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
728
729 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
730 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
731 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
732
733 /* write low 32 bits */
734 writel(reg_addr, pcie_index_offset);
735 readl(pcie_index_offset);
736 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
737 readl(pcie_data_offset);
738 /* write high 32 bits */
739 writel(reg_addr + 4, pcie_index_offset);
740 readl(pcie_index_offset);
741 writel((u32)(reg_data >> 32), pcie_data_offset);
742 readl(pcie_data_offset);
743 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
744 }
745
746 /**
747 * amdgpu_device_get_rev_id - query device rev_id
748 *
749 * @adev: amdgpu_device pointer
750 *
751 * Returns the device rev_id.
752 */
753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
754 {
755 return adev->nbio.funcs->get_rev_id(adev);
756 }
757
758 /**
759 * amdgpu_invalid_rreg - dummy reg read function
760 *
761 * @adev: amdgpu_device pointer
762 * @reg: offset of register
763 *
764 * Dummy register read function. Used for register blocks
765 * that certain asics don't have (all asics).
766 * Returns the value in the register.
767 */
768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
769 {
770 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
771 BUG();
772 return 0;
773 }
774
775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
776 {
777 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
778 BUG();
779 return 0;
780 }
781
782 /**
783 * amdgpu_invalid_wreg - dummy reg write function
784 *
785 * @adev: amdgpu_device pointer
786 * @reg: offset of register
787 * @v: value to write to the register
788 *
789 * Dummy register write function. Used for register blocks
790 * that certain asics don't have (all asics).
791 */
792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
793 {
794 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
795 reg, v);
796 BUG();
797 }
798
799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
800 {
801 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
802 reg, v);
803 BUG();
804 }
805
806 /**
807 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
808 *
809 * @adev: amdgpu_device pointer
810 * @reg: offset of register
811 *
812 * Dummy register read function. Used for register blocks
813 * that certain asics don't have (all asics).
814 * Returns the value in the register.
815 */
816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
817 {
818 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
819 BUG();
820 return 0;
821 }
822
823 /**
824 * amdgpu_invalid_wreg64 - dummy reg write function
825 *
826 * @adev: amdgpu_device pointer
827 * @reg: offset of register
828 * @v: value to write to the register
829 *
830 * Dummy register write function. Used for register blocks
831 * that certain asics don't have (all asics).
832 */
833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
834 {
835 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
836 reg, v);
837 BUG();
838 }
839
840 /**
841 * amdgpu_block_invalid_rreg - dummy reg read function
842 *
843 * @adev: amdgpu_device pointer
844 * @block: offset of instance
845 * @reg: offset of register
846 *
847 * Dummy register read function. Used for register blocks
848 * that certain asics don't have (all asics).
849 * Returns the value in the register.
850 */
851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
852 uint32_t block, uint32_t reg)
853 {
854 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
855 reg, block);
856 BUG();
857 return 0;
858 }
859
860 /**
861 * amdgpu_block_invalid_wreg - dummy reg write function
862 *
863 * @adev: amdgpu_device pointer
864 * @block: offset of instance
865 * @reg: offset of register
866 * @v: value to write to the register
867 *
868 * Dummy register write function. Used for register blocks
869 * that certain asics don't have (all asics).
870 */
871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
872 uint32_t block,
873 uint32_t reg, uint32_t v)
874 {
875 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
876 reg, block, v);
877 BUG();
878 }
879
880 /**
881 * amdgpu_device_asic_init - Wrapper for atom asic_init
882 *
883 * @adev: amdgpu_device pointer
884 *
885 * Does any asic specific work and then calls atom asic init.
886 */
887 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
888 {
889 int ret;
890
891 amdgpu_asic_pre_asic_init(adev);
892
893 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
894 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
895 amdgpu_psp_wait_for_bootloader(adev);
896 ret = amdgpu_atomfirmware_asic_init(adev, true);
897 return ret;
898 } else {
899 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
900 }
901
902 return 0;
903 }
904
905 /**
906 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
907 *
908 * @adev: amdgpu_device pointer
909 *
910 * Allocates a scratch page of VRAM for use by various things in the
911 * driver.
912 */
913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
914 {
915 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
916 AMDGPU_GEM_DOMAIN_VRAM |
917 AMDGPU_GEM_DOMAIN_GTT,
918 &adev->mem_scratch.robj,
919 &adev->mem_scratch.gpu_addr,
920 (void **)&adev->mem_scratch.ptr);
921 }
922
923 /**
924 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
925 *
926 * @adev: amdgpu_device pointer
927 *
928 * Frees the VRAM scratch page.
929 */
930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
931 {
932 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
933 }
934
935 /**
936 * amdgpu_device_program_register_sequence - program an array of registers.
937 *
938 * @adev: amdgpu_device pointer
939 * @registers: pointer to the register array
940 * @array_size: size of the register array
941 *
942 * Programs an array of registers with AND/OR masks.
943 * This is a helper for setting golden registers.
944 */
945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
946 const u32 *registers,
947 const u32 array_size)
948 {
949 u32 tmp, reg, and_mask, or_mask;
950 int i;
951
952 if (array_size % 3)
953 return;
954
955 for (i = 0; i < array_size; i += 3) {
956 reg = registers[i + 0];
957 and_mask = registers[i + 1];
958 or_mask = registers[i + 2];
959
960 if (and_mask == 0xffffffff) {
961 tmp = or_mask;
962 } else {
963 tmp = RREG32(reg);
964 tmp &= ~and_mask;
965 if (adev->family >= AMDGPU_FAMILY_AI)
966 tmp |= (or_mask & and_mask);
967 else
968 tmp |= or_mask;
969 }
970 WREG32(reg, tmp);
971 }
972 }
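/*
 * Illustrative sketch of the register array layout this helper expects (the
 * register name and mask values below are made up): entries come in
 * {offset, and_mask, or_mask} triplets, so array_size must be a multiple of
 * three:
 *
 *   static const u32 example_golden_regs[] = {
 *           mmEXAMPLE_REG, 0x0000000f, 0x00000002,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 *
 * With and_mask == 0xffffffff the register is simply overwritten with
 * or_mask; otherwise it is read-modify-written as coded above.
 */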
973
974 /**
975 * amdgpu_device_pci_config_reset - reset the GPU
976 *
977 * @adev: amdgpu_device pointer
978 *
979 * Resets the GPU using the pci config reset sequence.
980 * Only applicable to asics prior to vega10.
981 */
982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
983 {
984 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
985 }
986
987 /**
988 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
989 *
990 * @adev: amdgpu_device pointer
991 *
992 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
993 */
994 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
995 {
996 STUB();
997 return -ENOSYS;
998 #ifdef notyet
999 return pci_reset_function(adev->pdev);
1000 #endif
1001 }
1002
1003 /*
1004 * amdgpu_device_wb_*()
1005 * Writeback is the method by which the GPU updates special pages in memory
1006 * with the status of certain GPU events (fences, ring pointers,etc.).
1007 */
1008
1009 /**
1010 * amdgpu_device_wb_fini - Disable Writeback and free memory
1011 *
1012 * @adev: amdgpu_device pointer
1013 *
1014 * Disables Writeback and frees the Writeback memory (all asics).
1015 * Used at driver shutdown.
1016 */
1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1018 {
1019 if (adev->wb.wb_obj) {
1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1021 &adev->wb.gpu_addr,
1022 (void **)&adev->wb.wb);
1023 adev->wb.wb_obj = NULL;
1024 }
1025 }
1026
1027 /**
1028 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1029 *
1030 * @adev: amdgpu_device pointer
1031 *
1032 * Initializes writeback and allocates writeback memory (all asics).
1033 * Used at driver startup.
1034 * Returns 0 on success or a negative error code on failure.
1035 */
1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1037 {
1038 int r;
1039
1040 if (adev->wb.wb_obj == NULL) {
1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1044 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1045 (void **)&adev->wb.wb);
1046 if (r) {
1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1048 return r;
1049 }
1050
1051 adev->wb.num_wb = AMDGPU_MAX_WB;
1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1053
1054 /* clear wb memory */
1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1056 }
1057
1058 return 0;
1059 }
1060
1061 /**
1062 * amdgpu_device_wb_get - Allocate a wb entry
1063 *
1064 * @adev: amdgpu_device pointer
1065 * @wb: wb index
1066 *
1067 * Allocate a wb slot for use by the driver (all asics).
1068 * Returns 0 on success or -EINVAL on failure.
1069 */
1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1071 {
1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1073
1074 if (offset < adev->wb.num_wb) {
1075 __set_bit(offset, adev->wb.used);
1076 *wb = offset << 3; /* convert to dw offset */
1077 return 0;
1078 } else {
1079 return -EINVAL;
1080 }
1081 }
1082
1083 /**
1084 * amdgpu_device_wb_free - Free a wb entry
1085 *
1086 * @adev: amdgpu_device pointer
1087 * @wb: wb index
1088 *
1089 * Free a wb slot allocated for use by the driver (all asics)
1090 */
1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1092 {
1093 wb >>= 3;
1094 if (wb < adev->wb.num_wb)
1095 __clear_bit(wb, adev->wb.used);
1096 }
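/*
 * Typical writeback slot lifecycle, as a hedged sketch (error handling and
 * the actual GPU-side writer are omitted): a caller grabs a slot, derives its
 * CPU and GPU addresses from the returned dword offset, and releases it when
 * done.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *           ... let the GPU write status to gpu_addr, poll *cpu_addr ...
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */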
1097
1098 /**
1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1100 *
1101 * @adev: amdgpu_device pointer
1102 *
1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1104 * to fail, but if any of the BARs is not accessible after the resize we abort
1105 * driver loading by returning -ENODEV.
1106 */
1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1108 {
1109 #ifdef __linux__
1110 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1111 struct pci_bus *root;
1112 struct resource *res;
1113 unsigned int i;
1114 u16 cmd;
1115 int r;
1116
1117 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1118 return 0;
1119
1120 /* Bypass for VF */
1121 if (amdgpu_sriov_vf(adev))
1122 return 0;
1123
1124 /* skip if the bios has already enabled large BAR */
1125 if (adev->gmc.real_vram_size &&
1126 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1127 return 0;
1128
1129 /* Check if the root BUS has 64bit memory resources */
1130 root = adev->pdev->bus;
1131 while (root->parent)
1132 root = root->parent;
1133
1134 pci_bus_for_each_resource(root, res, i) {
1135 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1136 res->start > 0x100000000ull)
1137 break;
1138 }
1139
1140 /* Trying to resize is pointless without a root hub window above 4GB */
1141 if (!res)
1142 return 0;
1143
1144 /* Limit the BAR size to what is available */
1145 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1146 rbar_size);
1147
1148 /* Disable memory decoding while we change the BAR addresses and size */
1149 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1150 pci_write_config_word(adev->pdev, PCI_COMMAND,
1151 cmd & ~PCI_COMMAND_MEMORY);
1152
1153 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1154 amdgpu_doorbell_fini(adev);
1155 if (adev->asic_type >= CHIP_BONAIRE)
1156 pci_release_resource(adev->pdev, 2);
1157
1158 pci_release_resource(adev->pdev, 0);
1159
1160 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1161 if (r == -ENOSPC)
1162 DRM_INFO("Not enough PCI address space for a large BAR.");
1163 else if (r && r != -ENOTSUPP)
1164 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1165
1166 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1167
1168 /* When the doorbell or fb BAR isn't available we have no chance of
1169 * using the device.
1170 */
1171 r = amdgpu_doorbell_init(adev);
1172 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1173 return -ENODEV;
1174
1175 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1176 #endif /* __linux__ */
1177
1178 return 0;
1179 }
1180
1181 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1182 {
1183 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1184 return false;
1185
1186 return true;
1187 }
1188
1189 /*
1190 * GPU helpers function.
1191 */
1192 /**
1193 * amdgpu_device_need_post - check if the hw need post or not
1194 *
1195 * @adev: amdgpu_device pointer
1196 *
1197 * Check if the asic has been initialized (all asics) at driver startup,
1198 * or whether a post is needed because a hw reset was performed.
1199 * Returns true if post is needed, false if not.
1200 */
1201 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1202 {
1203 uint32_t reg;
1204
1205 if (amdgpu_sriov_vf(adev))
1206 return false;
1207
1208 if (!amdgpu_device_read_bios(adev))
1209 return false;
1210
1211 if (amdgpu_passthrough(adev)) {
1212 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM reboot
1213 * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1214 * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force
1215 * vPost for SMC versions below 22.15.
1216 */
1217 if (adev->asic_type == CHIP_FIJI) {
1218 int err;
1219 uint32_t fw_ver;
1220
1221 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1222 /* force vPost if an error occurred */
1223 if (err)
1224 return true;
1225
1226 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1227 release_firmware(adev->pm.fw);
1228 if (fw_ver < 0x00160e00)
1229 return true;
1230 }
1231 }
1232
1233 /* Don't post if we need to reset whole hive on init */
1234 if (adev->gmc.xgmi.pending_reset)
1235 return false;
1236
1237 if (adev->has_hw_reset) {
1238 adev->has_hw_reset = false;
1239 return true;
1240 }
1241
1242 /* bios scratch used on CIK+ */
1243 if (adev->asic_type >= CHIP_BONAIRE)
1244 return amdgpu_atombios_scratch_need_asic_init(adev);
1245
1246 /* check MEM_SIZE for older asics */
1247 reg = amdgpu_asic_get_config_memsize(adev);
1248
1249 if ((reg != 0) && (reg != 0xffffffff))
1250 return false;
1251
1252 return true;
1253 }
1254
1255 /*
1256 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1257 * speed switching. Until we have confirmation from Intel that a specific host
1258 * supports it, it's safer that we keep it disabled for all.
1259 *
1260 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1261 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1262 */
1263 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1264 {
1265 #if IS_ENABLED(CONFIG_X86)
1266 #ifdef __linux__
1267 struct cpuinfo_x86 *c = &cpu_data(0);
1268
1269 if (c->x86_vendor == X86_VENDOR_INTEL)
1270 #else
1271 if (strcmp(cpu_vendor, "GenuineIntel") == 0)
1272 #endif
1273 return false;
1274 #endif
1275 return true;
1276 }
1277
1278 /**
1279 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1280 *
1281 * @adev: amdgpu_device pointer
1282 *
1283 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1284 * be set for this device.
1285 *
1286 * Returns true if it should be used or false if not.
1287 */
1288 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1289 {
1290 switch (amdgpu_aspm) {
1291 case -1:
1292 break;
1293 case 0:
1294 return false;
1295 case 1:
1296 return true;
1297 default:
1298 return false;
1299 }
1300 return pcie_aspm_enabled(adev->pdev);
1301 }
1302
1303 bool amdgpu_device_aspm_support_quirk(void)
1304 {
1305 #if IS_ENABLED(CONFIG_X86)
1306 struct cpu_info *ci = curcpu();
1307
1308 return !(ci->ci_family == 6 && ci->ci_model == 0x97);
1309 #else
1310 return true;
1311 #endif
1312 }
1313
1314 /* if we get transitioned to only one device, take VGA back */
1315 /**
1316 * amdgpu_device_vga_set_decode - enable/disable vga decode
1317 *
1318 * @pdev: PCI device pointer
1319 * @state: enable/disable vga decode
1320 *
1321 * Enable/disable vga decode (all asics).
1322 * Returns VGA resource flags.
1323 */
1324 #ifdef notyet
1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1326 bool state)
1327 {
1328 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1329
1330 amdgpu_asic_set_vga_state(adev, state);
1331 if (state)
1332 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1333 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1334 else
1335 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1336 }
1337 #endif
1338
1339 /**
1340 * amdgpu_device_check_block_size - validate the vm block size
1341 *
1342 * @adev: amdgpu_device pointer
1343 *
1344 * Validates the vm block size specified via module parameter.
1345 * The vm block size defines number of bits in page table versus page directory,
1346 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1347 * page table and the remaining bits are in the page directory.
1348 */
1349 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1350 {
1351 /* defines number of bits in page table versus page directory,
1352 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1353 * page table and the remaining bits are in the page directory
1354 */
1355 if (amdgpu_vm_block_size == -1)
1356 return;
1357
1358 if (amdgpu_vm_block_size < 9) {
1359 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1360 amdgpu_vm_block_size);
1361 amdgpu_vm_block_size = -1;
1362 }
1363 }
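/*
 * Worked example for the math in the comment above: with a 4KB page there are
 * 12 offset bits, so the minimum block size of 9 means one page-table block
 * maps 2^9 pages = 2^(9+12) bytes = 2MB of address space; the address bits
 * above that select page-directory entries.
 */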
1364
1365 /**
1366 * amdgpu_device_check_vm_size - validate the vm size
1367 *
1368 * @adev: amdgpu_device pointer
1369 *
1370 * Validates the vm size in GB specified via module parameter.
1371 * The VM size is the size of the GPU virtual memory space in GB.
1372 */
1373 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1374 {
1375 /* no need to check the default value */
1376 if (amdgpu_vm_size == -1)
1377 return;
1378
1379 if (amdgpu_vm_size < 1) {
1380 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1381 amdgpu_vm_size);
1382 amdgpu_vm_size = -1;
1383 }
1384 }
1385
1386 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1387 {
1388 #ifdef __linux__
1389 struct sysinfo si;
1390 #endif
1391 bool is_os_64 = (sizeof(void *) == 8);
1392 uint64_t total_memory;
1393 uint64_t dram_size_seven_GB = 0x1B8000000;
1394 uint64_t dram_size_three_GB = 0xB8000000;
1395
1396 if (amdgpu_smu_memory_pool_size == 0)
1397 return;
1398
1399 if (!is_os_64) {
1400 DRM_WARN("Not 64-bit OS, feature not supported\n");
1401 goto def_value;
1402 }
1403 #ifdef __linux__
1404 si_meminfo(&si);
1405 total_memory = (uint64_t)si.totalram * si.mem_unit;
1406 #else
1407 total_memory = ptoa(physmem);
1408 #endif
1409
1410 if ((amdgpu_smu_memory_pool_size == 1) ||
1411 (amdgpu_smu_memory_pool_size == 2)) {
1412 if (total_memory < dram_size_three_GB)
1413 goto def_value1;
1414 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1415 (amdgpu_smu_memory_pool_size == 8)) {
1416 if (total_memory < dram_size_seven_GB)
1417 goto def_value1;
1418 } else {
1419 DRM_WARN("Smu memory pool size not supported\n");
1420 goto def_value;
1421 }
1422 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1423
1424 return;
1425
1426 def_value1:
1427 DRM_WARN("No enough system memory\n");
1428 def_value:
1429 adev->pm.smu_prv_buffer_size = 0;
1430 }
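/*
 * Note on the shift above: amdgpu_smu_memory_pool_size is given in units of
 * 256MB (1 << 28 bytes), so the accepted module parameter values 1/2/4/8
 * request a 256MB/512MB/1GB/2GB pool, gated on roughly 3GB or 7GB of system
 * DRAM as checked above.
 */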
1431
1432 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1433 {
1434 if (!(adev->flags & AMD_IS_APU) ||
1435 adev->asic_type < CHIP_RAVEN)
1436 return 0;
1437
1438 switch (adev->asic_type) {
1439 case CHIP_RAVEN:
1440 if (adev->pdev->device == 0x15dd)
1441 adev->apu_flags |= AMD_APU_IS_RAVEN;
1442 if (adev->pdev->device == 0x15d8)
1443 adev->apu_flags |= AMD_APU_IS_PICASSO;
1444 break;
1445 case CHIP_RENOIR:
1446 if ((adev->pdev->device == 0x1636) ||
1447 (adev->pdev->device == 0x164c))
1448 adev->apu_flags |= AMD_APU_IS_RENOIR;
1449 else
1450 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1451 break;
1452 case CHIP_VANGOGH:
1453 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1454 break;
1455 case CHIP_YELLOW_CARP:
1456 break;
1457 case CHIP_CYAN_SKILLFISH:
1458 if ((adev->pdev->device == 0x13FE) ||
1459 (adev->pdev->device == 0x143F))
1460 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1461 break;
1462 default:
1463 break;
1464 }
1465
1466 return 0;
1467 }
1468
1469 /**
1470 * amdgpu_device_check_arguments - validate module params
1471 *
1472 * @adev: amdgpu_device pointer
1473 *
1474 * Validates certain module parameters and updates
1475 * the associated values used by the driver (all asics).
1476 */
1477 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1478 {
1479 if (amdgpu_sched_jobs < 4) {
1480 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1481 amdgpu_sched_jobs);
1482 amdgpu_sched_jobs = 4;
1483 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1484 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1485 amdgpu_sched_jobs);
1486 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1487 }
1488
1489 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1490 /* gart size must be greater or equal to 32M */
1491 dev_warn(adev->dev, "gart size (%d) too small\n",
1492 amdgpu_gart_size);
1493 amdgpu_gart_size = -1;
1494 }
1495
1496 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1497 /* gtt size must be greater or equal to 32M */
1498 dev_warn(adev->dev, "gtt size (%d) too small\n",
1499 amdgpu_gtt_size);
1500 amdgpu_gtt_size = -1;
1501 }
1502
1503 /* valid range is between 4 and 9 inclusive */
1504 if (amdgpu_vm_fragment_size != -1 &&
1505 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1506 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1507 amdgpu_vm_fragment_size = -1;
1508 }
1509
1510 if (amdgpu_sched_hw_submission < 2) {
1511 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1512 amdgpu_sched_hw_submission);
1513 amdgpu_sched_hw_submission = 2;
1514 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1515 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1516 amdgpu_sched_hw_submission);
1517 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1518 }
1519
1520 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1521 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1522 amdgpu_reset_method = -1;
1523 }
1524
1525 amdgpu_device_check_smu_prv_buffer_size(adev);
1526
1527 amdgpu_device_check_vm_size(adev);
1528
1529 amdgpu_device_check_block_size(adev);
1530
1531 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1532
1533 return 0;
1534 }
1535
1536 #ifdef __linux__
1537 /**
1538 * amdgpu_switcheroo_set_state - set switcheroo state
1539 *
1540 * @pdev: pci dev pointer
1541 * @state: vga_switcheroo state
1542 *
1543 * Callback for the switcheroo driver. Suspends or resumes
1544 * the asics before or after it is powered up using ACPI methods.
1545 */
1546 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1547 enum vga_switcheroo_state state)
1548 {
1549 struct drm_device *dev = pci_get_drvdata(pdev);
1550 int r;
1551
1552 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1553 return;
1554
1555 if (state == VGA_SWITCHEROO_ON) {
1556 pr_info("switched on\n");
1557 /* don't suspend or resume card normally */
1558 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1559
1560 pci_set_power_state(pdev, PCI_D0);
1561 amdgpu_device_load_pci_state(pdev);
1562 r = pci_enable_device(pdev);
1563 if (r)
1564 DRM_WARN("pci_enable_device failed (%d)\n", r);
1565 amdgpu_device_resume(dev, true);
1566
1567 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1568 } else {
1569 pr_info("switched off\n");
1570 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1571 amdgpu_device_prepare(dev);
1572 amdgpu_device_suspend(dev, true);
1573 amdgpu_device_cache_pci_state(pdev);
1574 /* Shut down the device */
1575 pci_disable_device(pdev);
1576 pci_set_power_state(pdev, PCI_D3cold);
1577 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1578 }
1579 }
1580
1581 /**
1582 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1583 *
1584 * @pdev: pci dev pointer
1585 *
1586 * Callback for the switcheroo driver. Checks if the switcheroo
1587 * state can be changed.
1588 * Returns true if the state can be changed, false if not.
1589 */
1590 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1591 {
1592 struct drm_device *dev = pci_get_drvdata(pdev);
1593
1594 /*
1595 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1596 * locking inversion with the driver load path. And the access here is
1597 * completely racy anyway. So don't bother with locking for now.
1598 */
1599 return atomic_read(&dev->open_count) == 0;
1600 }
1601 #endif /* __linux__ */
1602
1603 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1604 #ifdef notyet
1605 .set_gpu_state = amdgpu_switcheroo_set_state,
1606 .reprobe = NULL,
1607 .can_switch = amdgpu_switcheroo_can_switch,
1608 #endif
1609 };
1610
1611 /**
1612 * amdgpu_device_ip_set_clockgating_state - set the CG state
1613 *
1614 * @dev: amdgpu_device pointer
1615 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1616 * @state: clockgating state (gate or ungate)
1617 *
1618 * Sets the requested clockgating state for all instances of
1619 * the hardware IP specified.
1620 * Returns the error code from the last instance.
1621 */
1622 int amdgpu_device_ip_set_clockgating_state(void *dev,
1623 enum amd_ip_block_type block_type,
1624 enum amd_clockgating_state state)
1625 {
1626 struct amdgpu_device *adev = dev;
1627 int i, r = 0;
1628
1629 for (i = 0; i < adev->num_ip_blocks; i++) {
1630 if (!adev->ip_blocks[i].status.valid)
1631 continue;
1632 if (adev->ip_blocks[i].version->type != block_type)
1633 continue;
1634 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1635 continue;
1636 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1637 (void *)adev, state);
1638 if (r)
1639 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1640 adev->ip_blocks[i].version->funcs->name, r);
1641 }
1642 return r;
1643 }
1644
1645 /**
1646 * amdgpu_device_ip_set_powergating_state - set the PG state
1647 *
1648 * @dev: amdgpu_device pointer
1649 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1650 * @state: powergating state (gate or ungate)
1651 *
1652 * Sets the requested powergating state for all instances of
1653 * the hardware IP specified.
1654 * Returns the error code from the last instance.
1655 */
1656 int amdgpu_device_ip_set_powergating_state(void *dev,
1657 enum amd_ip_block_type block_type,
1658 enum amd_powergating_state state)
1659 {
1660 struct amdgpu_device *adev = dev;
1661 int i, r = 0;
1662
1663 for (i = 0; i < adev->num_ip_blocks; i++) {
1664 if (!adev->ip_blocks[i].status.valid)
1665 continue;
1666 if (adev->ip_blocks[i].version->type != block_type)
1667 continue;
1668 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1669 continue;
1670 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1671 (void *)adev, state);
1672 if (r)
1673 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1674 adev->ip_blocks[i].version->funcs->name, r);
1675 }
1676 return r;
1677 }
1678
1679 /**
1680 * amdgpu_device_ip_get_clockgating_state - get the CG state
1681 *
1682 * @adev: amdgpu_device pointer
1683 * @flags: clockgating feature flags
1684 *
1685 * Walks the list of IPs on the device and updates the clockgating
1686 * flags for each IP.
1687 * Updates @flags with the feature flags for each hardware IP where
1688 * clockgating is enabled.
1689 */
1690 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1691 u64 *flags)
1692 {
1693 int i;
1694
1695 for (i = 0; i < adev->num_ip_blocks; i++) {
1696 if (!adev->ip_blocks[i].status.valid)
1697 continue;
1698 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1699 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1700 }
1701 }
1702
1703 /**
1704 * amdgpu_device_ip_wait_for_idle - wait for idle
1705 *
1706 * @adev: amdgpu_device pointer
1707 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1708 *
1709 * Waits for the requested hardware IP to be idle.
1710 * Returns 0 for success or a negative error code on failure.
1711 */
1712 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1713 enum amd_ip_block_type block_type)
1714 {
1715 int i, r;
1716
1717 for (i = 0; i < adev->num_ip_blocks; i++) {
1718 if (!adev->ip_blocks[i].status.valid)
1719 continue;
1720 if (adev->ip_blocks[i].version->type == block_type) {
1721 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1722 if (r)
1723 return r;
1724 break;
1725 }
1726 }
1727 return 0;
1728
1729 }
1730
1731 /**
1732 * amdgpu_device_ip_is_idle - is the hardware IP idle
1733 *
1734 * @adev: amdgpu_device pointer
1735 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1736 *
1737 * Check if the hardware IP is idle or not.
1738 * Returns true if the IP is idle, false if not.
1739 */
1740 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1741 enum amd_ip_block_type block_type)
1742 {
1743 int i;
1744
1745 for (i = 0; i < adev->num_ip_blocks; i++) {
1746 if (!adev->ip_blocks[i].status.valid)
1747 continue;
1748 if (adev->ip_blocks[i].version->type == block_type)
1749 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1750 }
1751 return true;
1752
1753 }
1754
1755 /**
1756 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1757 *
1758 * @adev: amdgpu_device pointer
1759 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760 *
1761 * Returns a pointer to the hardware IP block structure
1762 * if it exists for the asic, otherwise NULL.
1763 */
1764 struct amdgpu_ip_block *
1765 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1766 enum amd_ip_block_type type)
1767 {
1768 int i;
1769
1770 for (i = 0; i < adev->num_ip_blocks; i++)
1771 if (adev->ip_blocks[i].version->type == type)
1772 return &adev->ip_blocks[i];
1773
1774 return NULL;
1775 }
1776
1777 /**
1778 * amdgpu_device_ip_block_version_cmp
1779 *
1780 * @adev: amdgpu_device pointer
1781 * @type: enum amd_ip_block_type
1782 * @major: major version
1783 * @minor: minor version
1784 *
1785  * Returns 0 if the IP block's version is equal to or greater than @major.@minor,
1786  * or 1 if it is smaller or the ip_block doesn't exist.
1787 */
1788 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1789 enum amd_ip_block_type type,
1790 u32 major, u32 minor)
1791 {
1792 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1793
1794 if (ip_block && ((ip_block->version->major > major) ||
1795 ((ip_block->version->major == major) &&
1796 (ip_block->version->minor >= minor))))
1797 return 0;
1798
1799 return 1;
1800 }
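
/*
 * Illustrative sketch only: fetching the GFX IP block descriptor and
 * checking that it is at least version 9.0.  The helper name is made up
 * for illustration; wrapped in #if 0 so it is never compiled.
 */
#if 0
static bool example_has_gfx_v9_or_newer(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *gfx =
		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);

	/* version_cmp returns 0 when the block exists and is >= major.minor */
	return gfx && !amdgpu_device_ip_block_version_cmp(adev,
							  AMD_IP_BLOCK_TYPE_GFX,
							  9, 0);
}
#endif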
1801
1802 /**
1803 * amdgpu_device_ip_block_add
1804 *
1805 * @adev: amdgpu_device pointer
1806 * @ip_block_version: pointer to the IP to add
1807 *
1808 * Adds the IP block driver information to the collection of IPs
1809 * on the asic.
1810 */
1811 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1812 const struct amdgpu_ip_block_version *ip_block_version)
1813 {
1814 if (!ip_block_version)
1815 return -EINVAL;
1816
1817 switch (ip_block_version->type) {
1818 case AMD_IP_BLOCK_TYPE_VCN:
1819 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1820 return 0;
1821 break;
1822 case AMD_IP_BLOCK_TYPE_JPEG:
1823 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1824 return 0;
1825 break;
1826 default:
1827 break;
1828 }
1829
1830 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1831 ip_block_version->funcs->name);
1832
1833 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1834
1835 return 0;
1836 }
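
/*
 * Illustrative sketch only: an asic-specific setup routine (such as the
 * real si_set_ip_blocks()/vi_set_ip_blocks()) registers its IP blocks in
 * initialization order with amdgpu_device_ip_block_add().  The
 * example_*_ip_block names are hypothetical; wrapped in #if 0.
 */
#if 0
extern const struct amdgpu_ip_block_version example_common_ip_block; /* hypothetical */
extern const struct amdgpu_ip_block_version example_gmc_ip_block;    /* hypothetical */

static int example_set_ip_blocks(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
	if (r)
		return r;

	/* GMC comes early so later blocks can allocate GPU memory */
	return amdgpu_device_ip_block_add(adev, &example_gmc_ip_block);
}
#endif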
1837
1838 /**
1839 * amdgpu_device_enable_virtual_display - enable virtual display feature
1840 *
1841 * @adev: amdgpu_device pointer
1842 *
1843  * Enables the virtual display feature if the user has enabled it via
1844 * the module parameter virtual_display. This feature provides a virtual
1845 * display hardware on headless boards or in virtualized environments.
1846 * This function parses and validates the configuration string specified by
1847  * the user and configures the virtual display configuration (number of
1848 * virtual connectors, crtcs, etc.) specified.
1849 */
1850 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1851 {
1852 adev->enable_virtual_display = false;
1853
1854 #ifdef notyet
1855 if (amdgpu_virtual_display) {
1856 const char *pci_address_name = pci_name(adev->pdev);
1857 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1858
1859 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1860 pciaddstr_tmp = pciaddstr;
1861 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1862 pciaddname = strsep(&pciaddname_tmp, ",");
1863 if (!strcmp("all", pciaddname)
1864 || !strcmp(pci_address_name, pciaddname)) {
1865 long num_crtc;
1866 int res = -1;
1867
1868 adev->enable_virtual_display = true;
1869
1870 if (pciaddname_tmp)
1871 res = kstrtol(pciaddname_tmp, 10,
1872 &num_crtc);
1873
1874 if (!res) {
1875 if (num_crtc < 1)
1876 num_crtc = 1;
1877 if (num_crtc > 6)
1878 num_crtc = 6;
1879 adev->mode_info.num_crtc = num_crtc;
1880 } else {
1881 adev->mode_info.num_crtc = 1;
1882 }
1883 break;
1884 }
1885 }
1886
1887 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1888 amdgpu_virtual_display, pci_address_name,
1889 adev->enable_virtual_display, adev->mode_info.num_crtc);
1890
1891 kfree(pciaddstr);
1892 }
1893 #endif
1894 }
1895
1896 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1897 {
1898 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1899 adev->mode_info.num_crtc = 1;
1900 adev->enable_virtual_display = true;
1901 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1902 adev->enable_virtual_display, adev->mode_info.num_crtc);
1903 }
1904 }
1905
1906 /**
1907 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1908 *
1909 * @adev: amdgpu_device pointer
1910 *
1911 * Parses the asic configuration parameters specified in the gpu info
1912  * firmware and makes them available to the driver for use in configuring
1913 * the asic.
1914 * Returns 0 on success, -EINVAL on failure.
1915 */
1916 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1917 {
1918 const char *chip_name;
1919 char fw_name[40];
1920 int err;
1921 const struct gpu_info_firmware_header_v1_0 *hdr;
1922
1923 adev->firmware.gpu_info_fw = NULL;
1924
1925 if (adev->mman.discovery_bin)
1926 return 0;
1927
1928 switch (adev->asic_type) {
1929 default:
1930 return 0;
1931 case CHIP_VEGA10:
1932 chip_name = "vega10";
1933 break;
1934 case CHIP_VEGA12:
1935 chip_name = "vega12";
1936 break;
1937 case CHIP_RAVEN:
1938 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1939 chip_name = "raven2";
1940 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1941 chip_name = "picasso";
1942 else
1943 chip_name = "raven";
1944 break;
1945 case CHIP_ARCTURUS:
1946 chip_name = "arcturus";
1947 break;
1948 case CHIP_NAVI12:
1949 chip_name = "navi12";
1950 break;
1951 }
1952
1953 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1954 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1955 if (err) {
1956 dev_err(adev->dev,
1957 "Failed to get gpu_info firmware \"%s\"\n",
1958 fw_name);
1959 goto out;
1960 }
1961
1962 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1963 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1964
1965 switch (hdr->version_major) {
1966 case 1:
1967 {
1968 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1969 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1970 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1971
1972 /*
1973 	 * Should be dropped when DAL no longer needs it.
1974 */
1975 if (adev->asic_type == CHIP_NAVI12)
1976 goto parse_soc_bounding_box;
1977
1978 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1979 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1980 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1981 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1982 adev->gfx.config.max_texture_channel_caches =
1983 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1984 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1985 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1986 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1987 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1988 adev->gfx.config.double_offchip_lds_buf =
1989 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1990 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1991 adev->gfx.cu_info.max_waves_per_simd =
1992 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1993 adev->gfx.cu_info.max_scratch_slots_per_cu =
1994 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1995 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1996 if (hdr->version_minor >= 1) {
1997 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1998 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1999 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2000 adev->gfx.config.num_sc_per_sh =
2001 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2002 adev->gfx.config.num_packer_per_sc =
2003 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2004 }
2005
2006 parse_soc_bounding_box:
2007 /*
2008 		 * soc bounding box info is not integrated in the discovery table,
2009 * we always need to parse it from gpu info firmware if needed.
2010 */
2011 if (hdr->version_minor == 2) {
2012 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2013 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2014 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2015 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2016 }
2017 break;
2018 }
2019 default:
2020 dev_err(adev->dev,
2021 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2022 err = -EINVAL;
2023 goto out;
2024 }
2025 out:
2026 return err;
2027 }
2028
2029 /**
2030 * amdgpu_device_ip_early_init - run early init for hardware IPs
2031 *
2032 * @adev: amdgpu_device pointer
2033 *
2034 * Early initialization pass for hardware IPs. The hardware IPs that make
2035  * up each asic are discovered and each IP's early_init callback is run.  This
2036 * is the first stage in initializing the asic.
2037 * Returns 0 on success, negative error code on failure.
2038 */
2039 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2040 {
2041 struct pci_dev *parent;
2042 int i, r;
2043 bool total;
2044
2045 amdgpu_device_enable_virtual_display(adev);
2046
2047 if (amdgpu_sriov_vf(adev)) {
2048 r = amdgpu_virt_request_full_gpu(adev, true);
2049 if (r)
2050 return r;
2051 }
2052
2053 switch (adev->asic_type) {
2054 #ifdef CONFIG_DRM_AMDGPU_SI
2055 case CHIP_VERDE:
2056 case CHIP_TAHITI:
2057 case CHIP_PITCAIRN:
2058 case CHIP_OLAND:
2059 case CHIP_HAINAN:
2060 adev->family = AMDGPU_FAMILY_SI;
2061 r = si_set_ip_blocks(adev);
2062 if (r)
2063 return r;
2064 break;
2065 #endif
2066 #ifdef CONFIG_DRM_AMDGPU_CIK
2067 case CHIP_BONAIRE:
2068 case CHIP_HAWAII:
2069 case CHIP_KAVERI:
2070 case CHIP_KABINI:
2071 case CHIP_MULLINS:
2072 if (adev->flags & AMD_IS_APU)
2073 adev->family = AMDGPU_FAMILY_KV;
2074 else
2075 adev->family = AMDGPU_FAMILY_CI;
2076
2077 r = cik_set_ip_blocks(adev);
2078 if (r)
2079 return r;
2080 break;
2081 #endif
2082 case CHIP_TOPAZ:
2083 case CHIP_TONGA:
2084 case CHIP_FIJI:
2085 case CHIP_POLARIS10:
2086 case CHIP_POLARIS11:
2087 case CHIP_POLARIS12:
2088 case CHIP_VEGAM:
2089 case CHIP_CARRIZO:
2090 case CHIP_STONEY:
2091 if (adev->flags & AMD_IS_APU)
2092 adev->family = AMDGPU_FAMILY_CZ;
2093 else
2094 adev->family = AMDGPU_FAMILY_VI;
2095
2096 r = vi_set_ip_blocks(adev);
2097 if (r)
2098 return r;
2099 break;
2100 default:
2101 r = amdgpu_discovery_set_ip_blocks(adev);
2102 if (r)
2103 return r;
2104 break;
2105 }
2106
2107 if (amdgpu_has_atpx() &&
2108 (amdgpu_is_atpx_hybrid() ||
2109 amdgpu_has_atpx_dgpu_power_cntl()) &&
2110 ((adev->flags & AMD_IS_APU) == 0) &&
2111 !dev_is_removable(&adev->pdev->dev))
2112 adev->flags |= AMD_IS_PX;
2113
2114 if (!(adev->flags & AMD_IS_APU)) {
2115 #ifdef notyet
2116 parent = pcie_find_root_port(adev->pdev);
2117 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2118 #else
2119 adev->has_pr3 = false;
2120 #endif
2121 }
2122
2123
2124 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2125 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2126 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2127 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2128 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2129 if (!amdgpu_device_pcie_dynamic_switching_supported())
2130 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2131
2132 total = true;
2133 for (i = 0; i < adev->num_ip_blocks; i++) {
2134 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2135 DRM_WARN("disabled ip block: %d <%s>\n",
2136 i, adev->ip_blocks[i].version->funcs->name);
2137 adev->ip_blocks[i].status.valid = false;
2138 } else {
2139 if (adev->ip_blocks[i].version->funcs->early_init) {
2140 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2141 if (r == -ENOENT) {
2142 adev->ip_blocks[i].status.valid = false;
2143 } else if (r) {
2144 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2145 adev->ip_blocks[i].version->funcs->name, r);
2146 total = false;
2147 } else {
2148 adev->ip_blocks[i].status.valid = true;
2149 }
2150 } else {
2151 adev->ip_blocks[i].status.valid = true;
2152 }
2153 }
2154 /* get the vbios after the asic_funcs are set up */
2155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2156 r = amdgpu_device_parse_gpu_info_fw(adev);
2157 if (r)
2158 return r;
2159
2160 /* Read BIOS */
2161 if (amdgpu_device_read_bios(adev)) {
2162 if (!amdgpu_get_bios(adev))
2163 return -EINVAL;
2164
2165 r = amdgpu_atombios_init(adev);
2166 if (r) {
2167 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2168 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2169 return r;
2170 }
2171 }
2172
2173 			/* get pf2vf msg info at its earliest time */
2174 if (amdgpu_sriov_vf(adev))
2175 amdgpu_virt_init_data_exchange(adev);
2176
2177 }
2178 }
2179 if (!total)
2180 return -ENODEV;
2181
2182 amdgpu_amdkfd_device_probe(adev);
2183 adev->cg_flags &= amdgpu_cg_mask;
2184 adev->pg_flags &= amdgpu_pg_mask;
2185
2186 return 0;
2187 }
2188
2189 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2190 {
2191 int i, r;
2192
2193 for (i = 0; i < adev->num_ip_blocks; i++) {
2194 if (!adev->ip_blocks[i].status.sw)
2195 continue;
2196 if (adev->ip_blocks[i].status.hw)
2197 continue;
2198 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2199 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2200 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2201 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2202 if (r) {
2203 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2204 adev->ip_blocks[i].version->funcs->name, r);
2205 return r;
2206 }
2207 adev->ip_blocks[i].status.hw = true;
2208 }
2209 }
2210
2211 return 0;
2212 }
2213
2214 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2215 {
2216 int i, r;
2217
2218 for (i = 0; i < adev->num_ip_blocks; i++) {
2219 if (!adev->ip_blocks[i].status.sw)
2220 continue;
2221 if (adev->ip_blocks[i].status.hw)
2222 continue;
2223 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2224 if (r) {
2225 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2226 adev->ip_blocks[i].version->funcs->name, r);
2227 return r;
2228 }
2229 adev->ip_blocks[i].status.hw = true;
2230 }
2231
2232 return 0;
2233 }
2234
2235 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2236 {
2237 int r = 0;
2238 int i;
2239 uint32_t smu_version;
2240
2241 if (adev->asic_type >= CHIP_VEGA10) {
2242 for (i = 0; i < adev->num_ip_blocks; i++) {
2243 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2244 continue;
2245
2246 if (!adev->ip_blocks[i].status.sw)
2247 continue;
2248
2249 /* no need to do the fw loading again if already done*/
2250 if (adev->ip_blocks[i].status.hw == true)
2251 break;
2252
2253 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2254 r = adev->ip_blocks[i].version->funcs->resume(adev);
2255 if (r) {
2256 DRM_ERROR("resume of IP block <%s> failed %d\n",
2257 adev->ip_blocks[i].version->funcs->name, r);
2258 return r;
2259 }
2260 } else {
2261 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2262 if (r) {
2263 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2264 adev->ip_blocks[i].version->funcs->name, r);
2265 return r;
2266 }
2267 }
2268
2269 adev->ip_blocks[i].status.hw = true;
2270 break;
2271 }
2272 }
2273
2274 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2275 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2276
2277 return r;
2278 }
2279
2280 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2281 {
2282 long timeout;
2283 int r, i;
2284
2285 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2286 struct amdgpu_ring *ring = adev->rings[i];
2287
2288 /* No need to setup the GPU scheduler for rings that don't need it */
2289 if (!ring || ring->no_scheduler)
2290 continue;
2291
2292 switch (ring->funcs->type) {
2293 case AMDGPU_RING_TYPE_GFX:
2294 timeout = adev->gfx_timeout;
2295 break;
2296 case AMDGPU_RING_TYPE_COMPUTE:
2297 timeout = adev->compute_timeout;
2298 break;
2299 case AMDGPU_RING_TYPE_SDMA:
2300 timeout = adev->sdma_timeout;
2301 break;
2302 default:
2303 timeout = adev->video_timeout;
2304 break;
2305 }
2306
2307 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2308 ring->num_hw_submission, 0,
2309 timeout, adev->reset_domain->wq,
2310 ring->sched_score, ring->name,
2311 adev->dev);
2312 if (r) {
2313 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2314 ring->name);
2315 return r;
2316 }
2317 }
2318
2319 amdgpu_xcp_update_partition_sched_list(adev);
2320
2321 return 0;
2322 }
2323
2324
2325 /**
2326 * amdgpu_device_ip_init - run init for hardware IPs
2327 *
2328 * @adev: amdgpu_device pointer
2329 *
2330 * Main initialization pass for hardware IPs. The list of all the hardware
2331 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2332 * are run. sw_init initializes the software state associated with each IP
2333 * and hw_init initializes the hardware associated with each IP.
2334 * Returns 0 on success, negative error code on failure.
2335 */
2336 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2337 {
2338 int i, r;
2339
2340 r = amdgpu_ras_init(adev);
2341 if (r)
2342 return r;
2343
2344 for (i = 0; i < adev->num_ip_blocks; i++) {
2345 if (!adev->ip_blocks[i].status.valid)
2346 continue;
2347 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2348 if (r) {
2349 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2350 adev->ip_blocks[i].version->funcs->name, r);
2351 goto init_failed;
2352 }
2353 adev->ip_blocks[i].status.sw = true;
2354
2355 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2356 /* need to do common hw init early so everything is set up for gmc */
2357 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2358 if (r) {
2359 DRM_ERROR("hw_init %d failed %d\n", i, r);
2360 goto init_failed;
2361 }
2362 adev->ip_blocks[i].status.hw = true;
2363 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2364 /* need to do gmc hw init early so we can allocate gpu mem */
2365 /* Try to reserve bad pages early */
2366 if (amdgpu_sriov_vf(adev))
2367 amdgpu_virt_exchange_data(adev);
2368
2369 r = amdgpu_device_mem_scratch_init(adev);
2370 if (r) {
2371 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2372 goto init_failed;
2373 }
2374 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2375 if (r) {
2376 DRM_ERROR("hw_init %d failed %d\n", i, r);
2377 goto init_failed;
2378 }
2379 r = amdgpu_device_wb_init(adev);
2380 if (r) {
2381 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2382 goto init_failed;
2383 }
2384 adev->ip_blocks[i].status.hw = true;
2385
2386 /* right after GMC hw init, we create CSA */
2387 if (adev->gfx.mcbp) {
2388 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2389 AMDGPU_GEM_DOMAIN_VRAM |
2390 AMDGPU_GEM_DOMAIN_GTT,
2391 AMDGPU_CSA_SIZE);
2392 if (r) {
2393 DRM_ERROR("allocate CSA failed %d\n", r);
2394 goto init_failed;
2395 }
2396 }
2397 }
2398 }
2399
2400 if (amdgpu_sriov_vf(adev))
2401 amdgpu_virt_init_data_exchange(adev);
2402
2403 r = amdgpu_ib_pool_init(adev);
2404 if (r) {
2405 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2406 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2407 goto init_failed;
2408 }
2409
2410 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2411 if (r)
2412 goto init_failed;
2413
2414 r = amdgpu_device_ip_hw_init_phase1(adev);
2415 if (r)
2416 goto init_failed;
2417
2418 r = amdgpu_device_fw_loading(adev);
2419 if (r)
2420 goto init_failed;
2421
2422 r = amdgpu_device_ip_hw_init_phase2(adev);
2423 if (r)
2424 goto init_failed;
2425
2426 /*
2427 * retired pages will be loaded from eeprom and reserved here,
2428 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2429 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2430 	 * functional for I2C communication, which is only true at this point.
2431 *
2432 	 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2433 	 * about the failure caused by a bad gpu situation and stop the amdgpu
2434 	 * init process accordingly. For other failed cases, it will still
2435 	 * release all the resources and print an error message, rather than
2436 	 * returning a negative value to the upper level.
2437 *
2438 	 * Note: theoretically, this should be called before all vram allocations
2439 	 * to prevent retired pages from being reused.
2440 */
2441 r = amdgpu_ras_recovery_init(adev);
2442 if (r)
2443 goto init_failed;
2444
2445 	/*
2446 * In case of XGMI grab extra reference for reset domain for this device
2447 */
2448 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2449 if (amdgpu_xgmi_add_device(adev) == 0) {
2450 if (!amdgpu_sriov_vf(adev)) {
2451 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2452
2453 if (WARN_ON(!hive)) {
2454 r = -ENOENT;
2455 goto init_failed;
2456 }
2457
2458 if (!hive->reset_domain ||
2459 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2460 r = -ENOENT;
2461 amdgpu_put_xgmi_hive(hive);
2462 goto init_failed;
2463 }
2464
2465 /* Drop the early temporary reset domain we created for device */
2466 amdgpu_reset_put_reset_domain(adev->reset_domain);
2467 adev->reset_domain = hive->reset_domain;
2468 amdgpu_put_xgmi_hive(hive);
2469 }
2470 }
2471 }
2472
2473 r = amdgpu_device_init_schedulers(adev);
2474 if (r)
2475 goto init_failed;
2476
2477 /* Don't init kfd if whole hive need to be reset during init */
2478 if (!adev->gmc.xgmi.pending_reset) {
2479 kgd2kfd_init_zone_device(adev);
2480 amdgpu_amdkfd_device_init(adev);
2481 }
2482
2483 amdgpu_fru_get_product_info(adev);
2484
2485 init_failed:
2486
2487 return r;
2488 }
2489
2490 /**
2491 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2492 *
2493 * @adev: amdgpu_device pointer
2494 *
2495 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2496 * this function before a GPU reset. If the value is retained after a
2497  * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2498 */
2499 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2500 {
2501 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2502 }
2503
2504 /**
2505 * amdgpu_device_check_vram_lost - check if vram is valid
2506 *
2507 * @adev: amdgpu_device pointer
2508 *
2509 * Checks the reset magic value written to the gart pointer in VRAM.
2510 * The driver calls this after a GPU reset to see if the contents of
2511  * VRAM are lost or not.
2512  * Returns true if VRAM is lost, false if not.
2513 */
2514 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2515 {
2516 if (memcmp(adev->gart.ptr, adev->reset_magic,
2517 AMDGPU_RESET_MAGIC_NUM))
2518 return true;
2519
2520 if (!amdgpu_in_reset(adev))
2521 return false;
2522
2523 /*
2524 * For all ASICs with baco/mode1 reset, the VRAM is
2525 * always assumed to be lost.
2526 */
2527 switch (amdgpu_asic_reset_method(adev)) {
2528 case AMD_RESET_METHOD_BACO:
2529 case AMD_RESET_METHOD_MODE1:
2530 return true;
2531 default:
2532 return false;
2533 }
2534 }
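
/*
 * Illustrative sketch only: a reset path records the magic before the reset
 * and compares it afterwards to decide whether VRAM contents survived.
 * Hedged example; the reset step itself is elided.  Wrapped in #if 0.
 */
#if 0
static void example_reset_and_check_vram(struct amdgpu_device *adev)
{
	amdgpu_device_fill_reset_magic(adev);

	/* ... perform the actual GPU reset here ... */

	if (amdgpu_device_check_vram_lost(adev))
		DRM_INFO("VRAM contents were lost across the reset\n");
}
#endif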
2535
2536 /**
2537 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2538 *
2539 * @adev: amdgpu_device pointer
2540 * @state: clockgating state (gate or ungate)
2541 *
2542 * The list of all the hardware IPs that make up the asic is walked and the
2543 * set_clockgating_state callbacks are run.
2544  * The late initialization pass enables clockgating for the hardware IPs;
2545  * the fini or suspend pass disables it.
2546 * Returns 0 on success, negative error code on failure.
2547 */
2548
2549 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2550 enum amd_clockgating_state state)
2551 {
2552 int i, j, r;
2553
2554 if (amdgpu_emu_mode == 1)
2555 return 0;
2556
2557 for (j = 0; j < adev->num_ip_blocks; j++) {
2558 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2559 if (!adev->ip_blocks[i].status.late_initialized)
2560 continue;
2561 /* skip CG for GFX, SDMA on S0ix */
2562 if (adev->in_s0ix &&
2563 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2564 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2565 continue;
2566 /* skip CG for VCE/UVD, it's handled specially */
2567 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2568 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2569 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2570 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2571 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2572 /* enable clockgating to save power */
2573 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2574 state);
2575 if (r) {
2576 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2577 adev->ip_blocks[i].version->funcs->name, r);
2578 return r;
2579 }
2580 }
2581 }
2582
2583 return 0;
2584 }
2585
2586 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2587 enum amd_powergating_state state)
2588 {
2589 int i, j, r;
2590
2591 if (amdgpu_emu_mode == 1)
2592 return 0;
2593
2594 for (j = 0; j < adev->num_ip_blocks; j++) {
2595 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2596 if (!adev->ip_blocks[i].status.late_initialized)
2597 continue;
2598 /* skip PG for GFX, SDMA on S0ix */
2599 if (adev->in_s0ix &&
2600 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2601 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2602 continue;
2603 		/* skip PG for VCE/UVD, it's handled specially */
2604 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2605 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2606 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2607 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2608 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2609 /* enable powergating to save power */
2610 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2611 state);
2612 if (r) {
2613 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2614 adev->ip_blocks[i].version->funcs->name, r);
2615 return r;
2616 }
2617 }
2618 }
2619 return 0;
2620 }
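
/*
 * Illustrative sketch only: these two helpers are called in gate order on
 * the late-init path and in the reverse (ungate) order on the fini/suspend
 * paths, mirroring amdgpu_device_ip_late_init() and
 * amdgpu_device_ip_fini_early() below.  Wrapped in #if 0.
 */
#if 0
	/* late init: enable clock- and powergating */
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	/* early fini / suspend: disable them again, PG first */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
#endif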
2621
2622 static int amdgpu_device_enable_mgpu_fan_boost(void)
2623 {
2624 struct amdgpu_gpu_instance *gpu_ins;
2625 struct amdgpu_device *adev;
2626 int i, ret = 0;
2627
2628 mutex_lock(&mgpu_info.mutex);
2629
2630 /*
2631 * MGPU fan boost feature should be enabled
2632 * only when there are two or more dGPUs in
2633 * the system
2634 */
2635 if (mgpu_info.num_dgpu < 2)
2636 goto out;
2637
2638 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2639 gpu_ins = &(mgpu_info.gpu_ins[i]);
2640 adev = gpu_ins->adev;
2641 if (!(adev->flags & AMD_IS_APU) &&
2642 !gpu_ins->mgpu_fan_enabled) {
2643 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2644 if (ret)
2645 break;
2646
2647 gpu_ins->mgpu_fan_enabled = 1;
2648 }
2649 }
2650
2651 out:
2652 mutex_unlock(&mgpu_info.mutex);
2653
2654 return ret;
2655 }
2656
2657 /**
2658 * amdgpu_device_ip_late_init - run late init for hardware IPs
2659 *
2660 * @adev: amdgpu_device pointer
2661 *
2662 * Late initialization pass for hardware IPs. The list of all the hardware
2663 * IPs that make up the asic is walked and the late_init callbacks are run.
2664 * late_init covers any special initialization that an IP requires
2665  * after all of the IPs have been initialized or something that needs to happen
2666 * late in the init process.
2667 * Returns 0 on success, negative error code on failure.
2668 */
2669 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2670 {
2671 struct amdgpu_gpu_instance *gpu_instance;
2672 int i = 0, r;
2673
2674 for (i = 0; i < adev->num_ip_blocks; i++) {
2675 if (!adev->ip_blocks[i].status.hw)
2676 continue;
2677 if (adev->ip_blocks[i].version->funcs->late_init) {
2678 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2679 if (r) {
2680 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2681 adev->ip_blocks[i].version->funcs->name, r);
2682 return r;
2683 }
2684 }
2685 adev->ip_blocks[i].status.late_initialized = true;
2686 }
2687
2688 r = amdgpu_ras_late_init(adev);
2689 if (r) {
2690 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2691 return r;
2692 }
2693
2694 amdgpu_ras_set_error_query_ready(adev, true);
2695
2696 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2697 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2698
2699 amdgpu_device_fill_reset_magic(adev);
2700
2701 r = amdgpu_device_enable_mgpu_fan_boost();
2702 if (r)
2703 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2704
2705 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2706 if (amdgpu_passthrough(adev) &&
2707 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2708 adev->asic_type == CHIP_ALDEBARAN))
2709 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2710
2711 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2712 mutex_lock(&mgpu_info.mutex);
2713
2714 /*
2715 * Reset device p-state to low as this was booted with high.
2716 *
2717 * This should be performed only after all devices from the same
2718 * hive get initialized.
2719 *
2720 		 * However, the number of devices in the hive is not known in advance,
2721 		 * as they are counted one by one during device initialization.
2722 *
2723 * So, we wait for all XGMI interlinked devices initialized.
2724 * This may bring some delays as those devices may come from
2725 * different hives. But that should be OK.
2726 */
2727 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2728 for (i = 0; i < mgpu_info.num_gpu; i++) {
2729 gpu_instance = &(mgpu_info.gpu_ins[i]);
2730 if (gpu_instance->adev->flags & AMD_IS_APU)
2731 continue;
2732
2733 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2734 AMDGPU_XGMI_PSTATE_MIN);
2735 if (r) {
2736 DRM_ERROR("pstate setting failed (%d).\n", r);
2737 break;
2738 }
2739 }
2740 }
2741
2742 mutex_unlock(&mgpu_info.mutex);
2743 }
2744
2745 return 0;
2746 }
2747
2748 /**
2749 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2750 *
2751 * @adev: amdgpu_device pointer
2752 *
2753  * For ASICs that need to disable the SMC first
2754 */
2755 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2756 {
2757 int i, r;
2758
2759 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2760 return;
2761
2762 for (i = 0; i < adev->num_ip_blocks; i++) {
2763 if (!adev->ip_blocks[i].status.hw)
2764 continue;
2765 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2766 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2767 /* XXX handle errors */
2768 if (r) {
2769 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2770 adev->ip_blocks[i].version->funcs->name, r);
2771 }
2772 adev->ip_blocks[i].status.hw = false;
2773 break;
2774 }
2775 }
2776 }
2777
2778 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2779 {
2780 int i, r;
2781
2782 for (i = 0; i < adev->num_ip_blocks; i++) {
2783 if (!adev->ip_blocks[i].version->funcs->early_fini)
2784 continue;
2785
2786 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2787 if (r) {
2788 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2789 adev->ip_blocks[i].version->funcs->name, r);
2790 }
2791 }
2792
2793 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2794 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2795
2796 amdgpu_amdkfd_suspend(adev, false);
2797
2798 	/* Workaround for ASICs that need to disable the SMC first */
2799 amdgpu_device_smu_fini_early(adev);
2800
2801 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2802 if (!adev->ip_blocks[i].status.hw)
2803 continue;
2804
2805 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2806 /* XXX handle errors */
2807 if (r) {
2808 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2810 }
2811
2812 adev->ip_blocks[i].status.hw = false;
2813 }
2814
2815 if (amdgpu_sriov_vf(adev)) {
2816 if (amdgpu_virt_release_full_gpu(adev, false))
2817 DRM_ERROR("failed to release exclusive mode on fini\n");
2818 }
2819
2820 return 0;
2821 }
2822
2823 /**
2824 * amdgpu_device_ip_fini - run fini for hardware IPs
2825 *
2826 * @adev: amdgpu_device pointer
2827 *
2828 * Main teardown pass for hardware IPs. The list of all the hardware
2829 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2830 * are run. hw_fini tears down the hardware associated with each IP
2831 * and sw_fini tears down any software state associated with each IP.
2832 * Returns 0 on success, negative error code on failure.
2833 */
2834 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2835 {
2836 int i, r;
2837
2838 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2839 amdgpu_virt_release_ras_err_handler_data(adev);
2840
2841 if (adev->gmc.xgmi.num_physical_nodes > 1)
2842 amdgpu_xgmi_remove_device(adev);
2843
2844 amdgpu_amdkfd_device_fini_sw(adev);
2845
2846 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2847 if (!adev->ip_blocks[i].status.sw)
2848 continue;
2849
2850 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2851 amdgpu_ucode_free_bo(adev);
2852 amdgpu_free_static_csa(&adev->virt.csa_obj);
2853 amdgpu_device_wb_fini(adev);
2854 amdgpu_device_mem_scratch_fini(adev);
2855 amdgpu_ib_pool_fini(adev);
2856 }
2857
2858 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2859 /* XXX handle errors */
2860 if (r) {
2861 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2862 adev->ip_blocks[i].version->funcs->name, r);
2863 }
2864 adev->ip_blocks[i].status.sw = false;
2865 adev->ip_blocks[i].status.valid = false;
2866 }
2867
2868 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2869 if (!adev->ip_blocks[i].status.late_initialized)
2870 continue;
2871 if (adev->ip_blocks[i].version->funcs->late_fini)
2872 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2873 adev->ip_blocks[i].status.late_initialized = false;
2874 }
2875
2876 amdgpu_ras_fini(adev);
2877
2878 return 0;
2879 }
2880
2881 /**
2882 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2883 *
2884 * @work: work_struct.
2885 */
2886 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2887 {
2888 struct amdgpu_device *adev =
2889 container_of(work, struct amdgpu_device, delayed_init_work.work);
2890 int r;
2891
2892 r = amdgpu_ib_ring_tests(adev);
2893 if (r)
2894 DRM_ERROR("ib ring test failed (%d).\n", r);
2895 }
2896
2897 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2898 {
2899 struct amdgpu_device *adev =
2900 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2901
2902 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2903 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2904
2905 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2906 adev->gfx.gfx_off_state = true;
2907 }
2908
2909 /**
2910 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2911 *
2912 * @adev: amdgpu_device pointer
2913 *
2914  * First phase of suspend for hardware IPs. Clockgating and powergating
2915  * are disabled and the suspend callbacks are run for the display IPs;
2916  * the remaining hardware IPs are suspended in phase 2. suspend puts the
2917  * hardware and software state of each IP into a state suitable for suspend.
2918 * Returns 0 on success, negative error code on failure.
2919 */
2920 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2921 {
2922 int i, r;
2923
2924 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2925 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2926
2927 /*
2928 * Per PMFW team's suggestion, driver needs to handle gfxoff
2929 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2930 * scenario. Add the missing df cstate disablement here.
2931 */
2932 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2933 dev_warn(adev->dev, "Failed to disallow df cstate");
2934
2935 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2936 if (!adev->ip_blocks[i].status.valid)
2937 continue;
2938
2939 /* displays are handled separately */
2940 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2941 continue;
2942
2943 /* XXX handle errors */
2944 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2945 /* XXX handle errors */
2946 if (r) {
2947 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2948 adev->ip_blocks[i].version->funcs->name, r);
2949 return r;
2950 }
2951
2952 adev->ip_blocks[i].status.hw = false;
2953 }
2954
2955 return 0;
2956 }
2957
2958 /**
2959 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2960 *
2961 * @adev: amdgpu_device pointer
2962 *
2963  * Second phase of suspend for hardware IPs. The list of all the hardware
2964  * IPs that make up the asic is walked and the suspend callbacks are run
2965  * for every IP except the display IPs, which were handled in phase 1.
2966  * suspend puts the hardware and software state of each IP into a state suitable for suspend.
2967 * Returns 0 on success, negative error code on failure.
2968 */
2969 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2970 {
2971 int i, r;
2972
2973 if (adev->in_s0ix)
2974 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2975
2976 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2977 if (!adev->ip_blocks[i].status.valid)
2978 continue;
2979 /* displays are handled in phase1 */
2980 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2981 continue;
2982 /* PSP lost connection when err_event_athub occurs */
2983 if (amdgpu_ras_intr_triggered() &&
2984 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2985 adev->ip_blocks[i].status.hw = false;
2986 continue;
2987 }
2988
2989 		/* skip unnecessary suspend if we have not initialized them yet */
2990 if (adev->gmc.xgmi.pending_reset &&
2991 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2992 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2995 adev->ip_blocks[i].status.hw = false;
2996 continue;
2997 }
2998
2999 /* skip suspend of gfx/mes and psp for S0ix
3000 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3001 		 * like at runtime. PSP is also part of the always-on hardware,
3002 		 * so there is no need to suspend it.
3003 */
3004 if (adev->in_s0ix &&
3005 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3006 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3008 continue;
3009
3010 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3011 if (adev->in_s0ix &&
3012 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3013 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3014 continue;
3015
3016 		/* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3017 		 * These live in the TMR and are expected to be reused by PSP-TOS to reload
3018 		 * from that location; RLC autoload is likewise loaded from there based on
3019 		 * the PMFW -> PSP message during the re-init sequence.
3020 		 * Therefore, PSP suspend & resume should be skipped to avoid destroying
3021 		 * the TMR and reloading the FWs again on IMU-enabled APU ASICs.
3022 		 */
3023 if (amdgpu_in_reset(adev) &&
3024 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3025 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3026 continue;
3027
3028 /* XXX handle errors */
3029 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3030 /* XXX handle errors */
3031 if (r) {
3032 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3033 adev->ip_blocks[i].version->funcs->name, r);
3034 }
3035 adev->ip_blocks[i].status.hw = false;
3036 /* handle putting the SMC in the appropriate state */
3037 if (!amdgpu_sriov_vf(adev)) {
3038 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3039 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3040 if (r) {
3041 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3042 adev->mp1_state, r);
3043 return r;
3044 }
3045 }
3046 }
3047 }
3048
3049 return 0;
3050 }
3051
3052 /**
3053 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3054 *
3055 * @adev: amdgpu_device pointer
3056 *
3057 * Main suspend function for hardware IPs. The list of all the hardware
3058 * IPs that make up the asic is walked, clockgating is disabled and the
3059 * suspend callbacks are run. suspend puts the hardware and software state
3060 * in each IP into a state suitable for suspend.
3061 * Returns 0 on success, negative error code on failure.
3062 */
3063 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3064 {
3065 int r;
3066
3067 if (amdgpu_sriov_vf(adev)) {
3068 amdgpu_virt_fini_data_exchange(adev);
3069 amdgpu_virt_request_full_gpu(adev, false);
3070 }
3071
3072 r = amdgpu_device_ip_suspend_phase1(adev);
3073 if (r)
3074 return r;
3075 r = amdgpu_device_ip_suspend_phase2(adev);
3076
3077 if (amdgpu_sriov_vf(adev))
3078 amdgpu_virt_release_full_gpu(adev, false);
3079
3080 return r;
3081 }
3082
3083 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3084 {
3085 int i, r;
3086
3087 static enum amd_ip_block_type ip_order[] = {
3088 AMD_IP_BLOCK_TYPE_COMMON,
3089 AMD_IP_BLOCK_TYPE_GMC,
3090 AMD_IP_BLOCK_TYPE_PSP,
3091 AMD_IP_BLOCK_TYPE_IH,
3092 };
3093
3094 for (i = 0; i < adev->num_ip_blocks; i++) {
3095 int j;
3096 struct amdgpu_ip_block *block;
3097
3098 block = &adev->ip_blocks[i];
3099 block->status.hw = false;
3100
3101 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3102
3103 if (block->version->type != ip_order[j] ||
3104 !block->status.valid)
3105 continue;
3106
3107 r = block->version->funcs->hw_init(adev);
3108 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3109 if (r)
3110 return r;
3111 block->status.hw = true;
3112 }
3113 }
3114
3115 return 0;
3116 }
3117
3118 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3119 {
3120 int i, r;
3121
3122 static enum amd_ip_block_type ip_order[] = {
3123 AMD_IP_BLOCK_TYPE_SMC,
3124 AMD_IP_BLOCK_TYPE_DCE,
3125 AMD_IP_BLOCK_TYPE_GFX,
3126 AMD_IP_BLOCK_TYPE_SDMA,
3127 AMD_IP_BLOCK_TYPE_MES,
3128 AMD_IP_BLOCK_TYPE_UVD,
3129 AMD_IP_BLOCK_TYPE_VCE,
3130 AMD_IP_BLOCK_TYPE_VCN,
3131 AMD_IP_BLOCK_TYPE_JPEG
3132 };
3133
3134 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3135 int j;
3136 struct amdgpu_ip_block *block;
3137
3138 for (j = 0; j < adev->num_ip_blocks; j++) {
3139 block = &adev->ip_blocks[j];
3140
3141 if (block->version->type != ip_order[i] ||
3142 !block->status.valid ||
3143 block->status.hw)
3144 continue;
3145
3146 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3147 r = block->version->funcs->resume(adev);
3148 else
3149 r = block->version->funcs->hw_init(adev);
3150
3151 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3152 if (r)
3153 return r;
3154 block->status.hw = true;
3155 }
3156 }
3157
3158 return 0;
3159 }
3160
3161 /**
3162 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3163 *
3164 * @adev: amdgpu_device pointer
3165 *
3166 * First resume function for hardware IPs. The list of all the hardware
3167 * IPs that make up the asic is walked and the resume callbacks are run for
3168 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3169 * after a suspend and updates the software state as necessary. This
3170 * function is also used for restoring the GPU after a GPU reset.
3171 * Returns 0 on success, negative error code on failure.
3172 */
3173 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3174 {
3175 int i, r;
3176
3177 for (i = 0; i < adev->num_ip_blocks; i++) {
3178 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3179 continue;
3180 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3181 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3182 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3183 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3184
3185 r = adev->ip_blocks[i].version->funcs->resume(adev);
3186 if (r) {
3187 DRM_ERROR("resume of IP block <%s> failed %d\n",
3188 adev->ip_blocks[i].version->funcs->name, r);
3189 return r;
3190 }
3191 adev->ip_blocks[i].status.hw = true;
3192 }
3193 }
3194
3195 return 0;
3196 }
3197
3198 /**
3199 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3200 *
3201 * @adev: amdgpu_device pointer
3202 *
3203  * Second resume function for hardware IPs. The list of all the hardware
3204 * IPs that make up the asic is walked and the resume callbacks are run for
3205 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3206 * functional state after a suspend and updates the software state as
3207 * necessary. This function is also used for restoring the GPU after a GPU
3208 * reset.
3209 * Returns 0 on success, negative error code on failure.
3210 */
3211 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3212 {
3213 int i, r;
3214
3215 for (i = 0; i < adev->num_ip_blocks; i++) {
3216 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3217 continue;
3218 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3222 continue;
3223 r = adev->ip_blocks[i].version->funcs->resume(adev);
3224 if (r) {
3225 DRM_ERROR("resume of IP block <%s> failed %d\n",
3226 adev->ip_blocks[i].version->funcs->name, r);
3227 return r;
3228 }
3229 adev->ip_blocks[i].status.hw = true;
3230 }
3231
3232 return 0;
3233 }
3234
3235 /**
3236 * amdgpu_device_ip_resume - run resume for hardware IPs
3237 *
3238 * @adev: amdgpu_device pointer
3239 *
3240 * Main resume function for hardware IPs. The hardware IPs
3241 * are split into two resume functions because they are
3242 * also used in recovering from a GPU reset and some additional
3243  * steps need to be taken between them. In this case (S3/S4) they are
3244 * run sequentially.
3245 * Returns 0 on success, negative error code on failure.
3246 */
3247 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3248 {
3249 int r;
3250
3251 r = amdgpu_device_ip_resume_phase1(adev);
3252 if (r)
3253 return r;
3254
3255 r = amdgpu_device_fw_loading(adev);
3256 if (r)
3257 return r;
3258
3259 r = amdgpu_device_ip_resume_phase2(adev);
3260
3261 return r;
3262 }
3263
3264 /**
3265 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3266 *
3267 * @adev: amdgpu_device pointer
3268 *
3269 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3270 */
3271 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3272 {
3273 if (amdgpu_sriov_vf(adev)) {
3274 if (adev->is_atom_fw) {
3275 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3276 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3277 } else {
3278 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3279 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3280 }
3281
3282 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3283 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3284 }
3285 }
3286
3287 /**
3288 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3289 *
3290 * @asic_type: AMD asic type
3291 *
3292  * Check if there is DC (new modesetting infrastructure) support for an asic.
3293 * returns true if DC has support, false if not.
3294 */
3295 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3296 {
3297 switch (asic_type) {
3298 #ifdef CONFIG_DRM_AMDGPU_SI
3299 case CHIP_HAINAN:
3300 #endif
3301 case CHIP_TOPAZ:
3302 /* chips with no display hardware */
3303 return false;
3304 #if defined(CONFIG_DRM_AMD_DC)
3305 case CHIP_TAHITI:
3306 case CHIP_PITCAIRN:
3307 case CHIP_VERDE:
3308 case CHIP_OLAND:
3309 /*
3310 * We have systems in the wild with these ASICs that require
3311 * LVDS and VGA support which is not supported with DC.
3312 *
3313 * Fallback to the non-DC driver here by default so as not to
3314 * cause regressions.
3315 */
3316 #if defined(CONFIG_DRM_AMD_DC_SI)
3317 return amdgpu_dc > 0;
3318 #else
3319 return false;
3320 #endif
3321 case CHIP_BONAIRE:
3322 case CHIP_KAVERI:
3323 case CHIP_KABINI:
3324 case CHIP_MULLINS:
3325 /*
3326 * We have systems in the wild with these ASICs that require
3327 * VGA support which is not supported with DC.
3328 *
3329 * Fallback to the non-DC driver here by default so as not to
3330 * cause regressions.
3331 */
3332 return amdgpu_dc > 0;
3333 default:
3334 return amdgpu_dc != 0;
3335 #else
3336 default:
3337 if (amdgpu_dc > 0)
3338 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3339 return false;
3340 #endif
3341 }
3342 }
3343
3344 /**
3345 * amdgpu_device_has_dc_support - check if dc is supported
3346 *
3347 * @adev: amdgpu_device pointer
3348 *
3349 * Returns true for supported, false for not supported
3350 */
3351 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3352 {
3353 if (adev->enable_virtual_display ||
3354 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3355 return false;
3356
3357 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3358 }
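
/*
 * Illustrative sketch only: display setup code could pick between the DC
 * and the legacy modesetting paths like this.  The example_init_*()
 * helpers are hypothetical; wrapped in #if 0 so it is never compiled.
 */
#if 0
static int example_init_display(struct amdgpu_device *adev)
{
	if (amdgpu_device_has_dc_support(adev))
		return example_init_dc(adev);		/* hypothetical */

	return example_init_legacy_display(adev);	/* hypothetical */
}
#endif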
3359
3360 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3361 {
3362 struct amdgpu_device *adev =
3363 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3364 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3365
3366 /* It's a bug to not have a hive within this function */
3367 if (WARN_ON(!hive))
3368 return;
3369
3370 /*
3371 * Use task barrier to synchronize all xgmi reset works across the
3372 * hive. task_barrier_enter and task_barrier_exit will block
3373 * until all the threads running the xgmi reset works reach
3374 * those points. task_barrier_full will do both blocks.
3375 */
3376 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3377
3378 task_barrier_enter(&hive->tb);
3379 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3380
3381 if (adev->asic_reset_res)
3382 goto fail;
3383
3384 task_barrier_exit(&hive->tb);
3385 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3386
3387 if (adev->asic_reset_res)
3388 goto fail;
3389
3390 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3391 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3392 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3393 } else {
3394
3395 task_barrier_full(&hive->tb);
3396 adev->asic_reset_res = amdgpu_asic_reset(adev);
3397 }
3398
3399 fail:
3400 if (adev->asic_reset_res)
3401 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3402 adev->asic_reset_res, adev_to_drm(adev)->unique);
3403 amdgpu_put_xgmi_hive(hive);
3404 }
3405
3406 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3407 {
3408 char *input = amdgpu_lockup_timeout;
3409 char *timeout_setting = NULL;
3410 int index = 0;
3411 long timeout;
3412 int ret = 0;
3413
3414 /*
3415 	 * By default the timeout for non-compute jobs is 10000 ms
3416 	 * and 60000 ms for compute jobs.
3417 	 * In SR-IOV or passthrough mode, the timeout for compute
3418 	 * jobs is 60000 ms by default.
3419 */
3420 adev->gfx_timeout = msecs_to_jiffies(10000);
3421 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3422 if (amdgpu_sriov_vf(adev))
3423 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3424 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3425 else
3426 adev->compute_timeout = msecs_to_jiffies(60000);
3427
3428 #ifdef notyet
3429 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3430 while ((timeout_setting = strsep(&input, ",")) &&
3431 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3432 ret = kstrtol(timeout_setting, 0, &timeout);
3433 if (ret)
3434 return ret;
3435
3436 if (timeout == 0) {
3437 index++;
3438 continue;
3439 } else if (timeout < 0) {
3440 timeout = MAX_SCHEDULE_TIMEOUT;
3441 dev_warn(adev->dev, "lockup timeout disabled");
3442 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3443 } else {
3444 timeout = msecs_to_jiffies(timeout);
3445 }
3446
3447 switch (index++) {
3448 case 0:
3449 adev->gfx_timeout = timeout;
3450 break;
3451 case 1:
3452 adev->compute_timeout = timeout;
3453 break;
3454 case 2:
3455 adev->sdma_timeout = timeout;
3456 break;
3457 case 3:
3458 adev->video_timeout = timeout;
3459 break;
3460 default:
3461 break;
3462 }
3463 }
3464 /*
3465 * There is only one value specified and
3466 * it should apply to all non-compute jobs.
3467 */
3468 if (index == 1) {
3469 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3470 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3471 adev->compute_timeout = adev->gfx_timeout;
3472 }
3473 }
3474 #endif
3475
3476 return ret;
3477 }
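
/*
 * Illustrative note only: with the full parsing path enabled, the
 * amdgpu.lockup_timeout module parameter is a comma-separated list applied
 * in the order gfx,compute,sdma,video (milliseconds; 0 keeps the default,
 * a negative value disables the timeout).  A hypothetical setting of
 * "10000,60000,10000,10000" would therefore end up as shown below.
 * Wrapped in #if 0 so it is never compiled.
 */
#if 0
	adev->gfx_timeout     = msecs_to_jiffies(10000);
	adev->compute_timeout = msecs_to_jiffies(60000);
	adev->sdma_timeout    = msecs_to_jiffies(10000);
	adev->video_timeout   = msecs_to_jiffies(10000);
#endif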
3478
3479 /**
3480 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3481 *
3482 * @adev: amdgpu_device pointer
3483 *
3484  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3485 */
3486 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3487 {
3488 #ifdef notyet
3489 struct iommu_domain *domain;
3490
3491 domain = iommu_get_domain_for_dev(adev->dev);
3492 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3493 #endif
3494 adev->ram_is_direct_mapped = true;
3495 }
3496
3497 static const struct attribute *amdgpu_dev_attributes[] = {
3498 &dev_attr_pcie_replay_count.attr,
3499 NULL
3500 };
3501
3502 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3503 {
3504 if (amdgpu_mcbp == 1)
3505 adev->gfx.mcbp = true;
3506 else if (amdgpu_mcbp == 0)
3507 adev->gfx.mcbp = false;
3508
3509 if (amdgpu_sriov_vf(adev))
3510 adev->gfx.mcbp = true;
3511
3512 if (adev->gfx.mcbp)
3513 DRM_INFO("MCBP is enabled\n");
3514 }
3515
3516 /**
3517 * amdgpu_device_init - initialize the driver
3518 *
3519 * @adev: amdgpu_device pointer
3520 * @flags: driver flags
3521 *
3522 * Initializes the driver info and hw (all asics).
3523 * Returns 0 for success or an error on failure.
3524 * Called at driver startup.
3525 */
3526 int amdgpu_device_init(struct amdgpu_device *adev,
3527 uint32_t flags)
3528 {
3529 struct drm_device *ddev = adev_to_drm(adev);
3530 struct pci_dev *pdev = adev->pdev;
3531 int r, i;
3532 bool px = false;
3533 u32 max_MBps;
3534 int tmp;
3535
3536 adev->shutdown = false;
3537 adev->flags = flags;
3538
3539 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3540 adev->asic_type = amdgpu_force_asic_type;
3541 else
3542 adev->asic_type = flags & AMD_ASIC_MASK;
3543
3544 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3545 if (amdgpu_emu_mode == 1)
3546 adev->usec_timeout *= 10;
3547 adev->gmc.gart_size = 512 * 1024 * 1024;
3548 adev->accel_working = false;
3549 adev->num_rings = 0;
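/* gang_submit starts out as the always-signaled stub fence, so early
 * users see a valid fence before the first real gang submission
 * replaces it.
 */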
3550 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3551 adev->mman.buffer_funcs = NULL;
3552 adev->mman.buffer_funcs_ring = NULL;
3553 adev->vm_manager.vm_pte_funcs = NULL;
3554 adev->vm_manager.vm_pte_num_scheds = 0;
3555 adev->gmc.gmc_funcs = NULL;
3556 adev->harvest_ip_mask = 0x0;
3557 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3558 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3559
3560 adev->smc_rreg = &amdgpu_invalid_rreg;
3561 adev->smc_wreg = &amdgpu_invalid_wreg;
3562 adev->pcie_rreg = &amdgpu_invalid_rreg;
3563 adev->pcie_wreg = &amdgpu_invalid_wreg;
3564 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3565 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3566 adev->pciep_rreg = &amdgpu_invalid_rreg;
3567 adev->pciep_wreg = &amdgpu_invalid_wreg;
3568 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3569 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3570 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3571 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3572 adev->didt_rreg = &amdgpu_invalid_rreg;
3573 adev->didt_wreg = &amdgpu_invalid_wreg;
3574 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3575 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3576 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3577 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3578
3579 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3580 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3581 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3582
3583 /* mutex initialization is all done here so we
3584 * can call this function again without locking issues
3585 */
3586 rw_init(&adev->firmware.mutex, "agfw");
3587 rw_init(&adev->pm.mutex, "agpm");
3588 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3589 rw_init(&adev->srbm_mutex, "srbm");
3590 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3591 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3592 rw_init(&adev->gfx.partition_mutex, "gfxpar");
3593 rw_init(&adev->grbm_idx_mutex, "grbmidx");
3594 rw_init(&adev->mn_lock, "agpumn");
3595 rw_init(&adev->virt.vf_errors.lock, "vferr");
3596 rw_init(&adev->virt.rlcg_reg_lock, "vrlcg");
3597 hash_init(adev->mn_hash);
3598 rw_init(&adev->psp.mutex, "agpsp");
3599 rw_init(&adev->notifier_lock, "agnf");
3600 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps");
3601 rw_init(&adev->benchmark_mutex, "agbm");
3602
3603 amdgpu_device_init_apu_flags(adev);
3604
3605 r = amdgpu_device_check_arguments(adev);
3606 if (r)
3607 return r;
3608
3609 mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3610 mtx_init(&adev->smc_idx_lock, IPL_TTY);
3611 mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3612 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3613 mtx_init(&adev->didt_idx_lock, IPL_TTY);
3614 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3615 mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3616 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3617 mtx_init(&adev->mm_stats.lock, IPL_NONE);
3618
3619 INIT_LIST_HEAD(&adev->shadow_list);
3620 rw_init(&adev->shadow_list_lock, "sdwlst");
3621
3622 INIT_LIST_HEAD(&adev->reset_list);
3623
3624 INIT_LIST_HEAD(&adev->ras_list);
3625
3626 INIT_DELAYED_WORK(&adev->delayed_init_work,
3627 amdgpu_device_delayed_init_work_handler);
3628 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3629 amdgpu_device_delay_enable_gfx_off);
3630
3631 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3632
3633 adev->gfx.gfx_off_req_count = 1;
3634 adev->gfx.gfx_off_residency = 0;
3635 adev->gfx.gfx_off_entrycount = 0;
3636 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3637
3638 atomic_set(&adev->throttling_logging_enabled, 1);
3639 /*
3640 * If throttling continues, logging will be performed every minute
3641 * to avoid log flooding. "-1" is subtracted since the thermal
3642 * throttling interrupt comes every second. Thus, the total logging
3643 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3644 * for the throttling interrupt) = 60 seconds.
3645 */
3646 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3647 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3648
3649 #ifdef __linux__
3650 /* Registers mapping */
3651 /* TODO: block userspace mapping of io register */
3652 if (adev->asic_type >= CHIP_BONAIRE) {
3653 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3654 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3655 } else {
3656 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3657 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3658 }
3659 #endif
3660
3661 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3662 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3663
3664 #ifdef __linux__
3665 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3666 if (!adev->rmmio)
3667 return -ENOMEM;
3668 #endif
3669 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3670 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3671
3672 /*
3673 * The reset domain needs to be present early, before the XGMI hive is
3674 * discovered (if any) and initialized, so the reset semaphore and
3675 * in-gpu-reset flag can be used early during init and before any RREG32 call.
3676 */
3677 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3678 if (!adev->reset_domain)
3679 return -ENOMEM;
3680
3681 /* detect hw virtualization here */
3682 amdgpu_detect_virtualization(adev);
3683
3684 amdgpu_device_get_pcie_info(adev);
3685
3686 r = amdgpu_device_get_job_timeout_settings(adev);
3687 if (r) {
3688 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3689 return r;
3690 }
3691
3692 /* early init functions */
3693 r = amdgpu_device_ip_early_init(adev);
3694 if (r)
3695 return r;
3696
3697 amdgpu_device_set_mcbp(adev);
3698
3699 /* Get rid of things like offb */
3700 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3701 if (r)
3702 return r;
3703
3704 /* Enable TMZ based on IP_VERSION */
3705 amdgpu_gmc_tmz_set(adev);
3706
3707 amdgpu_gmc_noretry_set(adev);
3708 /* Need to get xgmi info early to decide the reset behavior */
3709 if (adev->gmc.xgmi.supported) {
3710 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3711 if (r)
3712 return r;
3713 }
3714
3715 /* enable PCIE atomic ops */
3716 #ifdef notyet
3717 if (amdgpu_sriov_vf(adev)) {
3718 if (adev->virt.fw_reserve.p_pf2vf)
3719 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3720 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3721 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3722 /* APUs with gfx9 onwards don't rely on PCIe atomics; their
3723 * internal path natively supports atomics, so set have_atomics_support to true.
3724 */
3725 } else if ((adev->flags & AMD_IS_APU) &&
3726 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3727 adev->have_atomics_support = true;
3728 } else {
3729 adev->have_atomics_support =
3730 !pci_enable_atomic_ops_to_root(adev->pdev,
3731 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3732 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3733 }
3734
3735 if (!adev->have_atomics_support)
3736 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3737 #else
3738 /* APUs with gfx9 onwards don't rely on PCIe atomics; their
3739 * internal path natively supports atomics, so set have_atomics_support to true.
3740 */
3741 if ((adev->flags & AMD_IS_APU) &&
3742 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)))
3743 adev->have_atomics_support = true;
3744 else
3745 adev->have_atomics_support = false;
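/* Note: on this port the pci_enable_atomic_ops_to_root() probe above is
 * compiled out, so discrete GPUs are treated as having no PCIe atomics
 * support.
 */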
3746 #endif
3747
3748 /* doorbell bar mapping and doorbell index init */
3749 amdgpu_doorbell_init(adev);
3750
3751 if (amdgpu_emu_mode == 1) {
3752 /* post the asic on emulation mode */
3753 emu_soc_asic_init(adev);
3754 goto fence_driver_init;
3755 }
3756
3757 amdgpu_reset_init(adev);
3758
3759 /* detect if we are with an SRIOV vbios */
3760 if (adev->bios)
3761 amdgpu_device_detect_sriov_bios(adev);
3762
3763 /* check if we need to reset the asic
3764 * E.g., driver was not cleanly unloaded previously, etc.
3765 */
3766 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3767 if (adev->gmc.xgmi.num_physical_nodes) {
3768 dev_info(adev->dev, "Pending hive reset.\n");
3769 adev->gmc.xgmi.pending_reset = true;
3770 /* Only need to init necessary block for SMU to handle the reset */
3771 for (i = 0; i < adev->num_ip_blocks; i++) {
3772 if (!adev->ip_blocks[i].status.valid)
3773 continue;
3774 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3775 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3778 DRM_DEBUG("IP %s disabled for hw_init.\n",
3779 adev->ip_blocks[i].version->funcs->name);
3780 adev->ip_blocks[i].status.hw = true;
3781 }
3782 }
3783 } else {
3784 tmp = amdgpu_reset_method;
3785 /* It should do a default reset when loading or reloading the driver,
3786 * regardless of the module parameter reset_method.
3787 */
3788 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3789 r = amdgpu_asic_reset(adev);
3790 amdgpu_reset_method = tmp;
3791 if (r) {
3792 dev_err(adev->dev, "asic reset on init failed\n");
3793 goto failed;
3794 }
3795 }
3796 }
3797
3798 /* Post card if necessary */
3799 if (amdgpu_device_need_post(adev)) {
3800 if (!adev->bios) {
3801 dev_err(adev->dev, "no vBIOS found\n");
3802 r = -EINVAL;
3803 goto failed;
3804 }
3805 DRM_INFO("GPU posting now...\n");
3806 r = amdgpu_device_asic_init(adev);
3807 if (r) {
3808 dev_err(adev->dev, "gpu post error!\n");
3809 goto failed;
3810 }
3811 }
3812
3813 if (adev->bios) {
3814 if (adev->is_atom_fw) {
3815 /* Initialize clocks */
3816 r = amdgpu_atomfirmware_get_clock_info(adev);
3817 if (r) {
3818 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3819 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3820 goto failed;
3821 }
3822 } else {
3823 /* Initialize clocks */
3824 r = amdgpu_atombios_get_clock_info(adev);
3825 if (r) {
3826 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3827 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3828 goto failed;
3829 }
3830 /* init i2c buses */
3831 if (!amdgpu_device_has_dc_support(adev))
3832 amdgpu_atombios_i2c_init(adev);
3833 }
3834 }
3835
3836 fence_driver_init:
3837 /* Fence driver */
3838 r = amdgpu_fence_driver_sw_init(adev);
3839 if (r) {
3840 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3841 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3842 goto failed;
3843 }
3844
3845 /* init the mode config */
3846 drm_mode_config_init(adev_to_drm(adev));
3847
3848 r = amdgpu_device_ip_init(adev);
3849 if (r) {
3850 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3851 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3852 goto release_ras_con;
3853 }
3854
3855 amdgpu_fence_driver_hw_init(adev);
3856
3857 dev_info(adev->dev,
3858 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3859 adev->gfx.config.max_shader_engines,
3860 adev->gfx.config.max_sh_per_se,
3861 adev->gfx.config.max_cu_per_sh,
3862 adev->gfx.cu_info.number);
3863
3864 #ifdef __OpenBSD__
3865 {
3866 const char *chip_name;
3867 uint32_t version = adev->ip_versions[GC_HWIP][0];
3868 int maj, min, rev;
3869
3870 switch (adev->asic_type) {
3871 case CHIP_RAVEN:
3872 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3873 chip_name = "RAVEN2";
3874 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3875 chip_name = "PICASSO";
3876 else
3877 chip_name = "RAVEN";
3878 break;
3879 case CHIP_RENOIR:
3880 if (adev->apu_flags & AMD_APU_IS_RENOIR)
3881 chip_name = "RENOIR";
3882 else
3883 chip_name = "GREEN_SARDINE";
3884 break;
3885 default:
3886 chip_name = amdgpu_asic_name[adev->asic_type];
3887 }
3888
3889 printf("%s: %s", adev->self.dv_xname, chip_name);
3890 /* show graphics/compute ip block version, not set on < GFX9 */
3891 if (version) {
3892 maj = IP_VERSION_MAJ(version);
3893 min = IP_VERSION_MIN(version);
3894 rev = IP_VERSION_REV(version);
3895 printf(" GC %d.%d.%d", maj, min, rev);
3896 }
3897 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id);
3898 }
3899 #endif
3900
3901 adev->accel_working = true;
3902
3903 amdgpu_vm_check_compute_bug(adev);
3904
3905 /* Initialize the buffer migration limit. */
3906 if (amdgpu_moverate >= 0)
3907 max_MBps = amdgpu_moverate;
3908 else
3909 max_MBps = 8; /* Allow 8 MB/s. */
3910 /* Get a log2 for easy divisions. */
3911 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
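/* e.g. the default of 8 MB/s gives log2_max_MBps = 3, letting the
 * buffer-migration throttling code use shifts instead of divisions.
 */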
3912
3913 r = amdgpu_atombios_sysfs_init(adev);
3914 if (r)
3915 drm_err(&adev->ddev,
3916 "registering atombios sysfs failed (%d).\n", r);
3917
3918 r = amdgpu_pm_sysfs_init(adev);
3919 if (r)
3920 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3921
3922 r = amdgpu_ucode_sysfs_init(adev);
3923 if (r) {
3924 adev->ucode_sysfs_en = false;
3925 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3926 } else
3927 adev->ucode_sysfs_en = true;
3928
3929 /*
3930 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3931 * Otherwise the mgpu fan boost feature will be skipped because the
3932 * gpu instance count would be too low.
3933 */
3934 amdgpu_register_gpu_instance(adev);
3935
3936 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3937 * explicit gating rather than handling it automatically.
3938 */
3939 if (!adev->gmc.xgmi.pending_reset) {
3940 r = amdgpu_device_ip_late_init(adev);
3941 if (r) {
3942 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3943 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3944 goto release_ras_con;
3945 }
3946 /* must succeed. */
3947 amdgpu_ras_resume(adev);
3948 queue_delayed_work(system_wq, &adev->delayed_init_work,
3949 msecs_to_jiffies(AMDGPU_RESUME_MS));
3950 }
3951
3952 if (amdgpu_sriov_vf(adev)) {
3953 amdgpu_virt_release_full_gpu(adev, true);
3954 flush_delayed_work(&adev->delayed_init_work);
3955 }
3956
3957 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3958 if (r)
3959 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3960
3961 amdgpu_fru_sysfs_init(adev);
3962
3963 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3964 r = amdgpu_pmu_init(adev);
3965 if (r)
3966 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
}
3967
3968 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3969 if (amdgpu_device_cache_pci_state(adev->pdev))
3970 pci_restore_state(pdev);
3971
3972 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3973 /* this will fail for cards that aren't VGA class devices, just
3974 * ignore it
3975 */
3976 #ifdef notyet
3977 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3978 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3979 #endif
3980
3981 px = amdgpu_device_supports_px(ddev);
3982
3983 if (px || (!dev_is_removable(&adev->pdev->dev) &&
3984 apple_gmux_detect(NULL, NULL)))
3985 vga_switcheroo_register_client(adev->pdev,
3986 &amdgpu_switcheroo_ops, px);
3987
3988 if (px)
3989 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3990
3991 if (adev->gmc.xgmi.pending_reset)
3992 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3993 msecs_to_jiffies(AMDGPU_RESUME_MS));
3994
3995 amdgpu_device_check_iommu_direct_map(adev);
3996
3997 return 0;
3998
3999 release_ras_con:
4000 if (amdgpu_sriov_vf(adev))
4001 amdgpu_virt_release_full_gpu(adev, true);
4002
4003 /* failed in exclusive mode due to timeout */
4004 if (amdgpu_sriov_vf(adev) &&
4005 !amdgpu_sriov_runtime(adev) &&
4006 amdgpu_virt_mmio_blocked(adev) &&
4007 !amdgpu_virt_wait_reset(adev)) {
4008 dev_err(adev->dev, "VF exclusive mode timeout\n");
4009 /* Don't send request since VF is inactive. */
4010 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4011 adev->virt.ops = NULL;
4012 r = -EAGAIN;
4013 }
4014 amdgpu_release_ras_context(adev);
4015
4016 failed:
4017 amdgpu_vf_error_trans_all(adev);
4018
4019 return r;
4020 }
4021
4022 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4023 {
4024 STUB();
4025 #ifdef notyet
4026
4027 /* Clear all CPU mappings pointing to this device */
4028 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4029 #endif
4030
4031 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4032 amdgpu_doorbell_fini(adev);
4033
4034 #ifdef __linux__
4035 iounmap(adev->rmmio);
4036 adev->rmmio = NULL;
4037 if (adev->mman.aper_base_kaddr)
4038 iounmap(adev->mman.aper_base_kaddr);
4039 adev->mman.aper_base_kaddr = NULL;
4040 #else
4041 if (adev->rmmio_size > 0)
4042 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4043 adev->rmmio_size);
4044 adev->rmmio_size = 0;
4045 adev->rmmio = NULL;
4046 if (adev->mman.aper_base_kaddr)
4047 bus_space_unmap(adev->memt, adev->mman.aper_bsh,
4048 adev->gmc.visible_vram_size);
4049 adev->mman.aper_base_kaddr = NULL;
4050 #endif
4051
4052 /* Memory manager related */
4053 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4054 #ifdef __linux__
4055 arch_phys_wc_del(adev->gmc.vram_mtrr);
4056 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4057 #else
4058 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
4059 #endif
4060 }
4061 }
4062
4063 /**
4064 * amdgpu_device_fini_hw - tear down the driver
4065 *
4066 * @adev: amdgpu_device pointer
4067 *
4068 * Tear down the driver info (all asics).
4069 * Called at driver shutdown.
4070 */
4071 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4072 {
4073 dev_info(adev->dev, "amdgpu: finishing device.\n");
4074 flush_delayed_work(&adev->delayed_init_work);
4075 adev->shutdown = true;
4076
4077 /* make sure the IB tests have finished before entering exclusive mode
4078 * to avoid preemption of the IB tests
4079 */
4080 if (amdgpu_sriov_vf(adev)) {
4081 amdgpu_virt_request_full_gpu(adev, false);
4082 amdgpu_virt_fini_data_exchange(adev);
4083 }
4084
4085 /* disable all interrupts */
4086 amdgpu_irq_disable_all(adev);
4087 if (adev->mode_info.mode_config_initialized) {
4088 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4089 drm_helper_force_disable_all(adev_to_drm(adev));
4090 else
4091 drm_atomic_helper_shutdown(adev_to_drm(adev));
4092 }
4093 amdgpu_fence_driver_hw_fini(adev);
4094
4095 if (adev->mman.initialized)
4096 drain_workqueue(adev->mman.bdev.wq);
4097
4098 if (adev->pm.sysfs_initialized)
4099 amdgpu_pm_sysfs_fini(adev);
4100 if (adev->ucode_sysfs_en)
4101 amdgpu_ucode_sysfs_fini(adev);
4102 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4103 amdgpu_fru_sysfs_fini(adev);
4104
4105 /* disable ras feature must before hw fini */
4106 amdgpu_ras_pre_fini(adev);
4107
4108 amdgpu_device_ip_fini_early(adev);
4109
4110 amdgpu_irq_fini_hw(adev);
4111
4112 if (adev->mman.initialized)
4113 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4114
4115 amdgpu_gart_dummy_page_fini(adev);
4116
4117 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4118 amdgpu_device_unmap_mmio(adev);
4119
4120 }
4121
4122 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4123 {
4124 int idx;
4125 bool px;
4126
4127 amdgpu_fence_driver_sw_fini(adev);
4128 amdgpu_device_ip_fini(adev);
4129 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4130 adev->accel_working = false;
4131 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4132
4133 amdgpu_reset_fini(adev);
4134
4135 /* free i2c buses */
4136 if (!amdgpu_device_has_dc_support(adev))
4137 amdgpu_i2c_fini(adev);
4138
4139 if (amdgpu_emu_mode != 1)
4140 amdgpu_atombios_fini(adev);
4141
4142 kfree(adev->bios);
4143 adev->bios = NULL;
4144
4145 px = amdgpu_device_supports_px(adev_to_drm(adev));
4146
4147 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4148 apple_gmux_detect(NULL, NULL)))
4149 vga_switcheroo_unregister_client(adev->pdev);
4150
4151 if (px)
4152 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4153
4154 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4155 vga_client_unregister(adev->pdev);
4156
4157 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4158 #ifdef __linux__
4159 iounmap(adev->rmmio);
4160 adev->rmmio = NULL;
4161 #else
4162 if (adev->rmmio_size > 0)
4163 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
4164 adev->rmmio_size);
4165 adev->rmmio_size = 0;
4166 adev->rmmio = NULL;
4167 #endif
4168 amdgpu_doorbell_fini(adev);
4169 drm_dev_exit(idx);
4170 }
4171
4172 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4173 amdgpu_pmu_fini(adev);
4174 if (adev->mman.discovery_bin)
4175 amdgpu_discovery_fini(adev);
4176
4177 amdgpu_reset_put_reset_domain(adev->reset_domain);
4178 adev->reset_domain = NULL;
4179
4180 kfree(adev->pci_state);
4181
4182 }
4183
4184 /**
4185 * amdgpu_device_evict_resources - evict device resources
4186 * @adev: amdgpu device object
4187 *
4188 * Evicts all ttm device resources (VRAM BOs, GART table) from the LRU list
4189 * of the VRAM memory type. Mainly used for evicting device resources
4190 * at suspend time.
4191 *
4192 */
4193 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4194 {
4195 int ret;
4196
4197 /* No need to evict VRAM on APUs for suspend to RAM or s2idle; APU VRAM lives in system memory, which is preserved across those states */
4198 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4199 return 0;
4200
4201 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4202 if (ret)
4203 DRM_WARN("evicting device resources failed\n");
4204 return ret;
4205 }
4206
4207 /*
4208 * Suspend & resume.
4209 */
4210 /**
4211 * amdgpu_device_prepare - prepare for device suspend
4212 *
4213 * @dev: drm dev pointer
4214 *
4215 * Prepare to put the hw in the suspend state (all asics).
4216 * Returns 0 for success or an error on failure.
4217 * Called at driver suspend.
4218 */
4219 int amdgpu_device_prepare(struct drm_device *dev)
4220 {
4221 struct amdgpu_device *adev = drm_to_adev(dev);
4222 int i, r;
4223
4224 amdgpu_choose_low_power_state(adev);
4225
4226 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4227 return 0;
4228
4229 /* Evict the majority of BOs before starting suspend sequence */
4230 r = amdgpu_device_evict_resources(adev);
4231 if (r)
4232 goto unprepare;
4233
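/* Flush any pending deferred GFXOFF-enable work so it does not run in
 * the middle of the suspend sequence.
 */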
4234 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4235
4236 for (i = 0; i < adev->num_ip_blocks; i++) {
4237 if (!adev->ip_blocks[i].status.valid)
4238 continue;
4239 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4240 continue;
4241 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4242 if (r)
4243 goto unprepare;
4244 }
4245
4246 return 0;
4247
4248 unprepare:
4249 adev->in_s0ix = adev->in_s3 = false;
4250
4251 return r;
4252 }
4253
4254 /**
4255 * amdgpu_device_suspend - initiate device suspend
4256 *
4257 * @dev: drm dev pointer
4258 * @fbcon : notify the fbdev of suspend
4259 *
4260 * Puts the hw in the suspend state (all asics).
4261 * Returns 0 for success or an error on failure.
4262 * Called at driver suspend.
4263 */
4264 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4265 {
4266 struct amdgpu_device *adev = drm_to_adev(dev);
4267 int r = 0;
4268
4269 if (adev->shutdown)
4270 return 0;
4271
4272 #ifdef notyet
4273 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4274 return 0;
4275 #endif
4276
4277 adev->in_suspend = true;
4278
4279 if (amdgpu_sriov_vf(adev)) {
4280 amdgpu_virt_fini_data_exchange(adev);
4281 r = amdgpu_virt_request_full_gpu(adev, false);
4282 if (r)
4283 return r;
4284 }
4285
4286 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4287 DRM_WARN("smart shift update failed\n");
4288
4289 if (fbcon)
4290 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4291
4292 cancel_delayed_work_sync(&adev->delayed_init_work);
4293
4294 amdgpu_ras_suspend(adev);
4295
4296 amdgpu_device_ip_suspend_phase1(adev);
4297
4298 if (!adev->in_s0ix)
4299 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4300
4301 r = amdgpu_device_evict_resources(adev);
4302 if (r)
4303 return r;
4304
4305 amdgpu_fence_driver_hw_fini(adev);
4306
4307 amdgpu_device_ip_suspend_phase2(adev);
4308
4309 if (amdgpu_sriov_vf(adev))
4310 amdgpu_virt_release_full_gpu(adev, false);
4311
4312 return 0;
4313 }
4314
4315 /**
4316 * amdgpu_device_resume - initiate device resume
4317 *
4318 * @dev: drm dev pointer
4319 * @fbcon : notify the fbdev of resume
4320 *
4321 * Bring the hw back to operating state (all asics).
4322 * Returns 0 for success or an error on failure.
4323 * Called at driver resume.
4324 */
4325 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4326 {
4327 struct amdgpu_device *adev = drm_to_adev(dev);
4328 int r = 0;
4329
4330 if (amdgpu_sriov_vf(adev)) {
4331 r = amdgpu_virt_request_full_gpu(adev, true);
4332 if (r)
4333 return r;
4334 }
4335
4336 #ifdef notyet
4337 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4338 return 0;
4339 #endif
4340
4341 if (adev->in_s0ix)
4342 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4343
4344 /* post card */
4345 if (amdgpu_device_need_post(adev)) {
4346 r = amdgpu_device_asic_init(adev);
4347 if (r)
4348 dev_err(adev->dev, "amdgpu asic init failed\n");
4349 }
4350
4351 r = amdgpu_device_ip_resume(adev);
4352
4353 if (r) {
4354 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4355 goto exit;
4356 }
4357 amdgpu_fence_driver_hw_init(adev);
4358
4359 r = amdgpu_device_ip_late_init(adev);
4360 if (r)
4361 goto exit;
4362
4363 queue_delayed_work(system_wq, &adev->delayed_init_work,
4364 msecs_to_jiffies(AMDGPU_RESUME_MS));
4365
4366 if (!adev->in_s0ix) {
4367 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4368 if (r)
4369 goto exit;
4370 }
4371
4372 exit:
4373 if (amdgpu_sriov_vf(adev)) {
4374 amdgpu_virt_init_data_exchange(adev);
4375 amdgpu_virt_release_full_gpu(adev, true);
4376 }
4377
4378 if (r)
4379 return r;
4380
4381 /* Make sure IB tests flushed */
4382 flush_delayed_work(&adev->delayed_init_work);
4383
4384 if (fbcon)
4385 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4386
4387 amdgpu_ras_resume(adev);
4388
4389 if (adev->mode_info.num_crtc) {
4390 /*
4391 * Most of the connector probing functions try to acquire runtime pm
4392 * refs to ensure that the GPU is powered on when connector polling is
4393 * performed. Since we're calling this from a runtime PM callback,
4394 * trying to acquire rpm refs will cause us to deadlock.
4395 *
4396 * Since we're guaranteed to be holding the rpm lock, it's safe to
4397 * temporarily disable the rpm helpers so this doesn't deadlock us.
4398 */
4399 #if defined(CONFIG_PM) && defined(__linux__)
4400 dev->dev->power.disable_depth++;
4401 #endif
4402 if (!adev->dc_enabled)
4403 drm_helper_hpd_irq_event(dev);
4404 else
4405 drm_kms_helper_hotplug_event(dev);
4406 #if defined(CONFIG_PM) && defined(__linux__)
4407 dev->dev->power.disable_depth--;
4408 #endif
4409 }
4410 adev->in_suspend = false;
4411
4412 if (adev->enable_mes)
4413 amdgpu_mes_self_test(adev);
4414
4415 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4416 DRM_WARN("smart shift update failed\n");
4417
4418 return 0;
4419 }
4420
4421 /**
4422 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4423 *
4424 * @adev: amdgpu_device pointer
4425 *
4426 * The list of all the hardware IPs that make up the asic is walked and
4427 * the check_soft_reset callbacks are run. check_soft_reset determines
4428 * if the asic is still hung or not.
4429 * Returns true if any of the IPs are still in a hung state, false if not.
4430 */
4431 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4432 {
4433 int i;
4434 bool asic_hang = false;
4435
4436 if (amdgpu_sriov_vf(adev))
4437 return true;
4438
4439 if (amdgpu_asic_need_full_reset(adev))
4440 return true;
4441
4442 for (i = 0; i < adev->num_ip_blocks; i++) {
4443 if (!adev->ip_blocks[i].status.valid)
4444 continue;
4445 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4446 adev->ip_blocks[i].status.hang =
4447 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4448 if (adev->ip_blocks[i].status.hang) {
4449 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4450 asic_hang = true;
4451 }
4452 }
4453 return asic_hang;
4454 }
4455
4456 /**
4457 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4458 *
4459 * @adev: amdgpu_device pointer
4460 *
4461 * The list of all the hardware IPs that make up the asic is walked and the
4462 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4463 * handles any IP specific hardware or software state changes that are
4464 * necessary for a soft reset to succeed.
4465 * Returns 0 on success, negative error code on failure.
4466 */
4467 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4468 {
4469 int i, r = 0;
4470
4471 for (i = 0; i < adev->num_ip_blocks; i++) {
4472 if (!adev->ip_blocks[i].status.valid)
4473 continue;
4474 if (adev->ip_blocks[i].status.hang &&
4475 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4476 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4477 if (r)
4478 return r;
4479 }
4480 }
4481
4482 return 0;
4483 }
4484
4485 /**
4486 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4487 *
4488 * @adev: amdgpu_device pointer
4489 *
4490 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4491 * reset is necessary to recover.
4492 * Returns true if a full asic reset is required, false if not.
4493 */
4494 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4495 {
4496 int i;
4497
4498 if (amdgpu_asic_need_full_reset(adev))
4499 return true;
4500
4501 for (i = 0; i < adev->num_ip_blocks; i++) {
4502 if (!adev->ip_blocks[i].status.valid)
4503 continue;
4504 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4505 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4506 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4507 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4508 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4509 if (adev->ip_blocks[i].status.hang) {
4510 dev_info(adev->dev, "Some block need full reset!\n");
4511 return true;
4512 }
4513 }
4514 }
4515 return false;
4516 }
4517
4518 /**
4519 * amdgpu_device_ip_soft_reset - do a soft reset
4520 *
4521 * @adev: amdgpu_device pointer
4522 *
4523 * The list of all the hardware IPs that make up the asic is walked and the
4524 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4525 * IP specific hardware or software state changes that are necessary to soft
4526 * reset the IP.
4527 * Returns 0 on success, negative error code on failure.
4528 */
4529 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4530 {
4531 int i, r = 0;
4532
4533 for (i = 0; i < adev->num_ip_blocks; i++) {
4534 if (!adev->ip_blocks[i].status.valid)
4535 continue;
4536 if (adev->ip_blocks[i].status.hang &&
4537 adev->ip_blocks[i].version->funcs->soft_reset) {
4538 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4539 if (r)
4540 return r;
4541 }
4542 }
4543
4544 return 0;
4545 }
4546
4547 /**
4548 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4549 *
4550 * @adev: amdgpu_device pointer
4551 *
4552 * The list of all the hardware IPs that make up the asic is walked and the
4553 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4554 * handles any IP specific hardware or software state changes that are
4555 * necessary after the IP has been soft reset.
4556 * Returns 0 on success, negative error code on failure.
4557 */
4558 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4559 {
4560 int i, r = 0;
4561
4562 for (i = 0; i < adev->num_ip_blocks; i++) {
4563 if (!adev->ip_blocks[i].status.valid)
4564 continue;
4565 if (adev->ip_blocks[i].status.hang &&
4566 adev->ip_blocks[i].version->funcs->post_soft_reset)
4567 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4568 if (r)
4569 return r;
4570 }
4571
4572 return 0;
4573 }
4574
4575 /**
4576 * amdgpu_device_recover_vram - Recover some VRAM contents
4577 *
4578 * @adev: amdgpu_device pointer
4579 *
4580 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4581 * restore things like GPUVM page tables after a GPU reset where
4582 * the contents of VRAM might be lost.
4583 *
4584 * Returns:
4585 * 0 on success, negative error code on failure.
4586 */
4587 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4588 {
4589 struct dma_fence *fence = NULL, *next = NULL;
4590 struct amdgpu_bo *shadow;
4591 struct amdgpu_bo_vm *vmbo;
4592 long r = 1, tmo;
4593
4594 if (amdgpu_sriov_runtime(adev))
4595 tmo = msecs_to_jiffies(8000);
4596 else
4597 tmo = msecs_to_jiffies(100);
4598
4599 dev_info(adev->dev, "recover vram bo from shadow start\n");
4600 mutex_lock(&adev->shadow_list_lock);
4601 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4602 /* If vm is compute context or adev is APU, shadow will be NULL */
4603 if (!vmbo->shadow)
4604 continue;
4605 shadow = vmbo->shadow;
4606
4607 /* No need to recover an evicted BO */
4608 if (!shadow->tbo.resource ||
4609 shadow->tbo.resource->mem_type != TTM_PL_TT ||
4610 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4611 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4612 continue;
4613
4614 r = amdgpu_bo_restore_shadow(shadow, &next);
4615 if (r)
4616 break;
4617
4618 if (fence) {
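/* dma_fence_wait_timeout() returns the remaining jiffies on success,
 * 0 on timeout and a negative errno on error, hence the three-way
 * handling below.
 */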
4619 tmo = dma_fence_wait_timeout(fence, false, tmo);
4620 dma_fence_put(fence);
4621 fence = next;
4622 if (tmo == 0) {
4623 r = -ETIMEDOUT;
4624 break;
4625 } else if (tmo < 0) {
4626 r = tmo;
4627 break;
4628 }
4629 } else {
4630 fence = next;
4631 }
4632 }
4633 mutex_unlock(&adev->shadow_list_lock);
4634
4635 if (fence)
4636 tmo = dma_fence_wait_timeout(fence, false, tmo);
4637 dma_fence_put(fence);
4638
4639 if (r < 0 || tmo <= 0) {
4640 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4641 return -EIO;
4642 }
4643
4644 dev_info(adev->dev, "recover vram bo from shadow done\n");
4645 return 0;
4646 }
4647
4648
4649 /**
4650 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4651 *
4652 * @adev: amdgpu_device pointer
4653 * @from_hypervisor: request from hypervisor
4654 *
4655 * Do a VF FLR and reinitialize the ASIC.
4656 * Returns 0 on success, negative error code on failure.
4657 */
4658 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4659 bool from_hypervisor)
4660 {
4661 int r;
4662 struct amdgpu_hive_info *hive = NULL;
4663 int retry_limit = 0;
4664
4665 retry:
4666 amdgpu_amdkfd_pre_reset(adev);
4667
4668 if (from_hypervisor)
4669 r = amdgpu_virt_request_full_gpu(adev, true);
4670 else
4671 r = amdgpu_virt_reset_gpu(adev);
4672 if (r)
4673 return r;
4674 amdgpu_irq_gpu_reset_resume_helper(adev);
4675
4676 /* some sw clean up VF needs to do before recover */
4677 amdgpu_virt_post_reset(adev);
4678
4679 /* Resume IP prior to SMC */
4680 r = amdgpu_device_ip_reinit_early_sriov(adev);
4681 if (r)
4682 goto error;
4683
4684 amdgpu_virt_init_data_exchange(adev);
4685
4686 r = amdgpu_device_fw_loading(adev);
4687 if (r)
4688 return r;
4689
4690 /* now we are okay to resume SMC/CP/SDMA */
4691 r = amdgpu_device_ip_reinit_late_sriov(adev);
4692 if (r)
4693 goto error;
4694
4695 hive = amdgpu_get_xgmi_hive(adev);
4696 /* Update PSP FW topology after reset */
4697 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4698 r = amdgpu_xgmi_update_topology(hive, adev);
4699
4700 if (hive)
4701 amdgpu_put_xgmi_hive(hive);
4702
4703 if (!r) {
4704 r = amdgpu_ib_ring_tests(adev);
4705
4706 amdgpu_amdkfd_post_reset(adev);
4707 }
4708
4709 error:
4710 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4711 amdgpu_inc_vram_lost(adev);
4712 r = amdgpu_device_recover_vram(adev);
4713 }
4714 amdgpu_virt_release_full_gpu(adev, true);
4715
4716 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4717 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4718 retry_limit++;
4719 goto retry;
4720 } else
4721 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4722 }
4723
4724 return r;
4725 }
4726
4727 /**
4728 * amdgpu_device_has_job_running - check if there is any job in mirror list
4729 *
4730 * @adev: amdgpu_device pointer
4731 *
4732 * check if there is any job in mirror list
4733 */
4734 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4735 {
4736 int i;
4737 struct drm_sched_job *job;
4738
4739 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4740 struct amdgpu_ring *ring = adev->rings[i];
4741
4742 if (!ring || !ring->sched.thread)
4743 continue;
4744
4745 spin_lock(&ring->sched.job_list_lock);
4746 job = list_first_entry_or_null(&ring->sched.pending_list,
4747 struct drm_sched_job, list);
4748 spin_unlock(&ring->sched.job_list_lock);
4749 if (job)
4750 return true;
4751 }
4752 return false;
4753 }
4754
4755 /**
4756 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4757 *
4758 * @adev: amdgpu_device pointer
4759 *
4760 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4761 * a hung GPU.
4762 */
4763 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4764 {
4765
4766 if (amdgpu_gpu_recovery == 0)
4767 goto disabled;
4768
4769 /* Skip soft reset check in fatal error mode */
4770 if (!amdgpu_ras_is_poison_mode_supported(adev))
4771 return true;
4772
4773 if (amdgpu_sriov_vf(adev))
4774 return true;
4775
4776 if (amdgpu_gpu_recovery == -1) {
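/* In auto mode (amdgpu_gpu_recovery == -1) recovery stays disabled on
 * the older ASICs listed below, where GPU reset support is limited.
 */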
4777 switch (adev->asic_type) {
4778 #ifdef CONFIG_DRM_AMDGPU_SI
4779 case CHIP_VERDE:
4780 case CHIP_TAHITI:
4781 case CHIP_PITCAIRN:
4782 case CHIP_OLAND:
4783 case CHIP_HAINAN:
4784 #endif
4785 #ifdef CONFIG_DRM_AMDGPU_CIK
4786 case CHIP_KAVERI:
4787 case CHIP_KABINI:
4788 case CHIP_MULLINS:
4789 #endif
4790 case CHIP_CARRIZO:
4791 case CHIP_STONEY:
4792 case CHIP_CYAN_SKILLFISH:
4793 goto disabled;
4794 default:
4795 break;
4796 }
4797 }
4798
4799 return true;
4800
4801 disabled:
4802 dev_info(adev->dev, "GPU recovery disabled.\n");
4803 return false;
4804 }
4805
4806 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4807 {
4808 u32 i;
4809 int ret = 0;
4810
4811 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4812
4813 dev_info(adev->dev, "GPU mode1 reset\n");
4814
4815 /* Cache the state before bus master disable. The saved config space
4816 * values are used in other cases like restore after mode-2 reset.
4817 */
4818 amdgpu_device_cache_pci_state(adev->pdev);
4819
4820 /* disable BM */
4821 pci_clear_master(adev->pdev);
4822
4823 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4824 dev_info(adev->dev, "GPU smu mode1 reset\n");
4825 ret = amdgpu_dpm_mode1_reset(adev);
4826 } else {
4827 dev_info(adev->dev, "GPU psp mode1 reset\n");
4828 ret = psp_gpu_reset(adev);
4829 }
4830
4831 if (ret)
4832 goto mode1_reset_failed;
4833
4834 amdgpu_device_load_pci_state(adev->pdev);
4835 ret = amdgpu_psp_wait_for_bootloader(adev);
4836 if (ret)
4837 goto mode1_reset_failed;
4838
4839 /* wait for asic to come out of reset */
4840 for (i = 0; i < adev->usec_timeout; i++) {
4841 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4842
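/* A read of 0xffffffff means the register space is still inaccessible,
 * i.e. the ASIC has not come back from reset yet.
 */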
4843 if (memsize != 0xffffffff)
4844 break;
4845 udelay(1);
4846 }
4847
4848 if (i >= adev->usec_timeout) {
4849 ret = -ETIMEDOUT;
4850 goto mode1_reset_failed;
4851 }
4852
4853 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4854
4855 return 0;
4856
4857 mode1_reset_failed:
4858 dev_err(adev->dev, "GPU mode1 reset failed\n");
4859 return ret;
4860 }
4861
4862 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4863 struct amdgpu_reset_context *reset_context)
4864 {
4865 int i, r = 0;
4866 struct amdgpu_job *job = NULL;
4867 bool need_full_reset =
4868 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4869
4870 if (reset_context->reset_req_dev == adev)
4871 job = reset_context->job;
4872
4873 if (amdgpu_sriov_vf(adev)) {
4874 /* stop the data exchange thread */
4875 amdgpu_virt_fini_data_exchange(adev);
4876 }
4877
4878 amdgpu_fence_driver_isr_toggle(adev, true);
4879
4880 /* block all schedulers and reset given job's ring */
4881 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4882 struct amdgpu_ring *ring = adev->rings[i];
4883
4884 if (!ring || !ring->sched.thread)
4885 continue;
4886
4887 /* Clear the job fences from the fence driver to keep force_completion
4888 * from signalling them; only NULL and VM flush fences are left behind
4889 */
4890 amdgpu_fence_driver_clear_job_fences(ring);
4891
4892 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4893 amdgpu_fence_driver_force_completion(ring);
4894 }
4895
4896 amdgpu_fence_driver_isr_toggle(adev, false);
4897
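/* Bump the offending job's karma so the scheduler can mark its
 * context as guilty if it keeps causing hangs.
 */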
4898 if (job && job->vm)
4899 drm_sched_increase_karma(&job->base);
4900
4901 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4902 /* If reset handler not implemented, continue; otherwise return */
4903 if (r == -EOPNOTSUPP)
4904 r = 0;
4905 else
4906 return r;
4907
4908 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4909 if (!amdgpu_sriov_vf(adev)) {
4910
4911 if (!need_full_reset)
4912 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4913
4914 if (!need_full_reset && amdgpu_gpu_recovery &&
4915 amdgpu_device_ip_check_soft_reset(adev)) {
4916 amdgpu_device_ip_pre_soft_reset(adev);
4917 r = amdgpu_device_ip_soft_reset(adev);
4918 amdgpu_device_ip_post_soft_reset(adev);
4919 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4920 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4921 need_full_reset = true;
4922 }
4923 }
4924
4925 if (need_full_reset)
4926 r = amdgpu_device_ip_suspend(adev);
4927 if (need_full_reset)
4928 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4929 else
4930 clear_bit(AMDGPU_NEED_FULL_RESET,
4931 &reset_context->flags);
4932 }
4933
4934 return r;
4935 }
4936
4937 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4938 {
4939 int i;
4940
4941 lockdep_assert_held(&adev->reset_domain->sem);
4942
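/* Snapshot the user-selected registers (configured through the
 * reset-dump register list debugfs interface, where available) so they
 * can be reported in the devcoredump after the reset.
 */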
4943 for (i = 0; i < adev->num_regs; i++) {
4944 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4945 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4946 adev->reset_dump_reg_value[i]);
4947 }
4948
4949 return 0;
4950 }
4951
4952 #ifdef CONFIG_DEV_COREDUMP
4953 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4954 size_t count, void *data, size_t datalen)
4955 {
4956 struct drm_printer p;
4957 struct amdgpu_device *adev = data;
4958 struct drm_print_iterator iter;
4959 int i;
4960
4961 iter.data = buffer;
4962 iter.offset = 0;
4963 iter.start = offset;
4964 iter.remain = count;
4965
4966 p = drm_coredump_printer(&iter);
4967
4968 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4969 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4970 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4971 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4972 if (adev->reset_task_info.pid)
4973 drm_printf(&p, "process_name: %s PID: %d\n",
4974 adev->reset_task_info.process_name,
4975 adev->reset_task_info.pid);
4976
4977 if (adev->reset_vram_lost)
4978 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4979 if (adev->num_regs) {
4980 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4981
4982 for (i = 0; i < adev->num_regs; i++)
4983 drm_printf(&p, "0x%08x: 0x%08x\n",
4984 adev->reset_dump_reg_list[i],
4985 adev->reset_dump_reg_value[i]);
4986 }
4987
4988 return count - iter.remain;
4989 }
4990
4991 static void amdgpu_devcoredump_free(void *data)
4992 {
4993 }
4994
4995 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4996 {
4997 struct drm_device *dev = adev_to_drm(adev);
4998
4999 ktime_get_ts64(&adev->reset_time);
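/* Hand the snapshot to the devcoredump framework; user space can then
 * retrieve it (typically under /sys/class/devcoredump), with the
 * contents formatted by amdgpu_devcoredump_read().
 */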
5000 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
5001 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
5002 }
5003 #endif
5004
5005 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5006 struct amdgpu_reset_context *reset_context)
5007 {
5008 struct amdgpu_device *tmp_adev = NULL;
5009 bool need_full_reset, skip_hw_reset, vram_lost = false;
5010 int r = 0;
5011 bool gpu_reset_for_dev_remove = false;
5012
5013 /* Try reset handler method first */
5014 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5015 reset_list);
5016 amdgpu_reset_reg_dumps(tmp_adev);
5017
5018 reset_context->reset_device_list = device_list_handle;
5019 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5020 /* If reset handler not implemented, continue; otherwise return */
5021 if (r == -EOPNOTSUPP)
5022 r = 0;
5023 else
5024 return r;
5025
5026 /* Reset handler not implemented, use the default method */
5027 need_full_reset =
5028 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5029 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5030
5031 gpu_reset_for_dev_remove =
5032 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5033 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5034
5035 /*
5036 * ASIC reset has to be done on all XGMI hive nodes ASAP
5037 * to allow proper link negotiation in FW (within 1 sec)
5038 */
5039 if (!skip_hw_reset && need_full_reset) {
5040 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5041 /* For XGMI run all resets in parallel to speed up the process */
5042 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5043 tmp_adev->gmc.xgmi.pending_reset = false;
5044 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5045 r = -EALREADY;
5046 } else
5047 r = amdgpu_asic_reset(tmp_adev);
5048
5049 if (r) {
5050 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5051 r, adev_to_drm(tmp_adev)->unique);
5052 break;
5053 }
5054 }
5055
5056 /* For XGMI wait for all resets to complete before proceeding */
5057 if (!r) {
5058 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5059 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5060 flush_work(&tmp_adev->xgmi_reset_work);
5061 r = tmp_adev->asic_reset_res;
5062 if (r)
5063 break;
5064 }
5065 }
5066 }
5067 }
5068
5069 if (!r && amdgpu_ras_intr_triggered()) {
5070 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5071 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5072 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5073 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5074 }
5075
5076 amdgpu_ras_intr_cleared();
5077 }
5078
5079 /* Since a mode1 reset affects the base IP blocks, the
5080 * phase1 IP blocks need to be resumed. Otherwise there
5081 * will be a BIOS signature error and the PSP bootloader
5082 * can't load the kdb on the next amdgpu driver load.
5083 */
5084 if (gpu_reset_for_dev_remove) {
5085 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5086 amdgpu_device_ip_resume_phase1(tmp_adev);
5087
5088 goto end;
5089 }
5090
5091 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5092 if (need_full_reset) {
5093 /* post card */
5094 r = amdgpu_device_asic_init(tmp_adev);
5095 if (r) {
5096 dev_warn(tmp_adev->dev, "asic atom init failed!");
5097 } else {
5098 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5099
5100 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5101 if (r)
5102 goto out;
5103
5104 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5105 #ifdef CONFIG_DEV_COREDUMP
5106 tmp_adev->reset_vram_lost = vram_lost;
5107 memset(&tmp_adev->reset_task_info, 0,
5108 sizeof(tmp_adev->reset_task_info));
5109 if (reset_context->job && reset_context->job->vm)
5110 tmp_adev->reset_task_info =
5111 reset_context->job->vm->task_info;
5112 amdgpu_reset_capture_coredumpm(tmp_adev);
5113 #endif
5114 if (vram_lost) {
5115 DRM_INFO("VRAM is lost due to GPU reset!\n");
5116 amdgpu_inc_vram_lost(tmp_adev);
5117 }
5118
5119 r = amdgpu_device_fw_loading(tmp_adev);
5120 if (r)
5121 return r;
5122
5123 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5124 if (r)
5125 goto out;
5126
5127 if (vram_lost)
5128 amdgpu_device_fill_reset_magic(tmp_adev);
5129
5130 /*
5131 * Add this ASIC back as tracked since the reset
5132 * already completed successfully.
5133 */
5134 amdgpu_register_gpu_instance(tmp_adev);
5135
5136 if (!reset_context->hive &&
5137 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5138 amdgpu_xgmi_add_device(tmp_adev);
5139
5140 r = amdgpu_device_ip_late_init(tmp_adev);
5141 if (r)
5142 goto out;
5143
5144 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5145
5146 /*
5147 * The GPU enters a bad state once the number of faulty
5148 * pages recorded by ECC reaches the threshold, and RAS
5149 * recovery is scheduled next. So add a check here to
5150 * break out of recovery if the bad page threshold has
5151 * indeed been exceeded, and remind the user to either
5152 * retire this GPU or set a bigger bad_page_threshold
5153 * value to work around this the next time the driver
5154 * is probed.
5155 */
5156 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5157 /* must succeed. */
5158 amdgpu_ras_resume(tmp_adev);
5159 } else {
5160 r = -EINVAL;
5161 goto out;
5162 }
5163
5164 /* Update PSP FW topology after reset */
5165 if (reset_context->hive &&
5166 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5167 r = amdgpu_xgmi_update_topology(
5168 reset_context->hive, tmp_adev);
5169 }
5170 }
5171
5172 out:
5173 if (!r) {
5174 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5175 r = amdgpu_ib_ring_tests(tmp_adev);
5176 if (r) {
5177 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5178 need_full_reset = true;
5179 r = -EAGAIN;
5180 goto end;
5181 }
5182 }
5183
5184 if (!r)
5185 r = amdgpu_device_recover_vram(tmp_adev);
5186 else
5187 tmp_adev->asic_reset_res = r;
5188 }
5189
5190 end:
5191 if (need_full_reset)
5192 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5193 else
5194 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5195 return r;
5196 }
5197
5198 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5199 {
5200
5201 switch (amdgpu_asic_reset_method(adev)) {
5202 case AMD_RESET_METHOD_MODE1:
5203 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5204 break;
5205 case AMD_RESET_METHOD_MODE2:
5206 adev->mp1_state = PP_MP1_STATE_RESET;
5207 break;
5208 default:
5209 adev->mp1_state = PP_MP1_STATE_NONE;
5210 break;
5211 }
5212
5214 }
5215
5216 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5217 {
5218 amdgpu_vf_error_trans_all(adev);
5219 adev->mp1_state = PP_MP1_STATE_NONE;
5220 }
5221
5222 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5223 {
5224 STUB();
5225 #ifdef notyet
5226 struct pci_dev *p = NULL;
5227
5228 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5229 adev->pdev->bus->number, 1);
5230 if (p) {
5231 pm_runtime_enable(&(p->dev));
5232 pm_runtime_resume(&(p->dev));
5233 }
5234 #endif
5235 }
5236
5237 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5238 {
5239 enum amd_reset_method reset_method;
5240 struct pci_dev *p = NULL;
5241 u64 expires;
5242
5243 /*
5244 * For now, only BACO and mode1 reset are confirmed to
5245 * suffer from the audio issue if the audio device is not properly suspended.
5246 */
5247 reset_method = amdgpu_asic_reset_method(adev);
5248 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5249 (reset_method != AMD_RESET_METHOD_MODE1))
5250 return -EINVAL;
5251
5252 STUB();
5253 return -ENOSYS;
5254 #ifdef notyet
5255
5256 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5257 adev->pdev->bus->number, 1);
5258 if (!p)
5259 return -ENODEV;
5260
5261 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5262 if (!expires)
5263 /*
5264 * If we cannot get the audio device's autosuspend delay,
5265 * use a fixed 4 s interval. Since 3 s is the audio
5266 * controller's default autosuspend delay setting, the
5267 * 4 s used here is guaranteed to cover it.
5268 */
5269 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5270
5271 while (!pm_runtime_status_suspended(&(p->dev))) {
5272 if (!pm_runtime_suspend(&(p->dev)))
5273 break;
5274
5275 if (expires < ktime_get_mono_fast_ns()) {
5276 dev_warn(adev->dev, "failed to suspend display audio\n");
5277 pci_dev_put(p);
5278 /* TODO: abort the succeeding gpu reset? */
5279 return -ETIMEDOUT;
5280 }
5281 }
5282
5283 pm_runtime_disable(&(p->dev));
5284
5285 pci_dev_put(p);
5286 return 0;
5287 #endif
5288 }
5289
5290 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5291 {
5292 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5293
5294 #if defined(CONFIG_DEBUG_FS)
5295 if (!amdgpu_sriov_vf(adev))
5296 cancel_work(&adev->reset_work);
5297 #endif
5298
5299 if (adev->kfd.dev)
5300 cancel_work(&adev->kfd.reset_work);
5301
5302 if (amdgpu_sriov_vf(adev))
5303 cancel_work(&adev->virt.flr_work);
5304
5305 if (con && adev->ras_enabled)
5306 cancel_work(&con->recovery_work);
5307
5308 }
5309
5310 /**
5311 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5312 *
5313 * @adev: amdgpu_device pointer
5314 * @job: which job trigger hang
5315 * @reset_context: amdgpu reset context pointer
5316 *
5317 * Attempt to reset the GPU if it has hung (all asics).
5318 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5319 * Returns 0 for success or an error on failure.
5320 */
5321
5322 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5323 struct amdgpu_job *job,
5324 struct amdgpu_reset_context *reset_context)
5325 {
5326 struct list_head device_list, *device_list_handle = NULL;
5327 bool job_signaled = false;
5328 struct amdgpu_hive_info *hive = NULL;
5329 struct amdgpu_device *tmp_adev = NULL;
5330 int i, r = 0;
5331 bool need_emergency_restart = false;
5332 bool audio_suspended = false;
5333 bool gpu_reset_for_dev_remove = false;
5334
5335 gpu_reset_for_dev_remove =
5336 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5337 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5338
5339 /*
5340 * Special case: RAS triggered and full reset isn't supported
5341 */
5342 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5343
5344 /*
5345 * Flush RAM to disk so that after reboot
5346 * the user can read the log and see why the system rebooted.
5347 */
5348 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5349 amdgpu_ras_get_context(adev)->reboot) {
5350 DRM_WARN("Emergency reboot.");
5351
5352 #ifdef notyet
5353 ksys_sync_helper();
5354 emergency_restart();
5355 #else
5356 panic("emergency_restart");
5357 #endif
5358 }
5359
5360 dev_info(adev->dev, "GPU %s begin!\n",
5361 need_emergency_restart ? "jobs stop":"reset");
5362
5363 if (!amdgpu_sriov_vf(adev))
5364 hive = amdgpu_get_xgmi_hive(adev);
5365 if (hive)
5366 mutex_lock(&hive->hive_lock);
5367
5368 reset_context->job = job;
5369 reset_context->hive = hive;
5370 /*
5371 * Build list of devices to reset.
5372 * In case we are in XGMI hive mode, re-sort the device list
5373 * to put adev in the first position.
5374 */
5375 INIT_LIST_HEAD(&device_list);
5376 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5377 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5378 list_add_tail(&tmp_adev->reset_list, &device_list);
5379 if (gpu_reset_for_dev_remove && adev->shutdown)
5380 tmp_adev->shutdown = true;
5381 }
5382 if (!list_is_first(&adev->reset_list, &device_list))
5383 list_rotate_to_front(&adev->reset_list, &device_list);
5384 device_list_handle = &device_list;
5385 } else {
5386 list_add_tail(&adev->reset_list, &device_list);
5387 device_list_handle = &device_list;
5388 }
5389
5390 /* We need to lock the reset domain only once, for both XGMI and single-device resets */
5391 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5392 reset_list);
5393 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5394
5395 /* block all schedulers and reset given job's ring */
5396 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5397
5398 amdgpu_device_set_mp1_state(tmp_adev);
5399
5400 /*
5401 * Try to put the audio codec into suspend state
5402 * before the gpu reset starts.
5403 *
5404 * The power domain of the graphics device is shared
5405 * with the AZ power domain. Without this, we may
5406 * change the audio hardware from behind the audio
5407 * driver's back, which will trigger audio codec
5408 * errors.
5409 */
5410 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5411 audio_suspended = true;
5412
5413 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5414
5415 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5416
5417 if (!amdgpu_sriov_vf(tmp_adev))
5418 amdgpu_amdkfd_pre_reset(tmp_adev);
5419
5420 /*
5421 * Mark these ASICs to be reset as untracked first,
5422 * and add them back after the reset completes.
5423 */
5424 amdgpu_unregister_gpu_instance(tmp_adev);
5425
5426 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5427
5428 /* disable ras on ALL IPs */
5429 if (!need_emergency_restart &&
5430 amdgpu_device_ip_need_full_reset(tmp_adev))
5431 amdgpu_ras_suspend(tmp_adev);
5432
5433 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5434 struct amdgpu_ring *ring = tmp_adev->rings[i];
5435
5436 if (!ring || !ring->sched.thread)
5437 continue;
5438
5439 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5440
5441 if (need_emergency_restart)
5442 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5443 }
5444 atomic_inc(&tmp_adev->gpu_reset_counter);
5445 }
5446
5447 if (need_emergency_restart)
5448 goto skip_sched_resume;
5449
5450 /*
5451 * Must check guilty signal here since after this point all old
5452 * HW fences are force signaled.
5453 *
5454 * job->base holds a reference to parent fence
5455 */
5456 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5457 job_signaled = true;
5458 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5459 goto skip_hw_reset;
5460 }
5461
5462 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5463 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5464 if (gpu_reset_for_dev_remove) {
5465 /* Workaround for ASICs that need to disable SMC first */
5466 amdgpu_device_smu_fini_early(tmp_adev);
5467 }
5468 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5469 /* TODO: Should we stop here? */
5470 if (r) {
5471 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5472 r, adev_to_drm(tmp_adev)->unique);
5473 tmp_adev->asic_reset_res = r;
5474 }
5475
5476 /*
5477 * Drop all pending non-scheduler resets. Scheduler resets
5478 * were already dropped during drm_sched_stop
5479 */
5480 amdgpu_device_stop_pending_resets(tmp_adev);
5481 }
5482
5483 /* Actual ASIC resets if needed.*/
5484 /* Host driver will handle XGMI hive reset for SRIOV */
5485 if (amdgpu_sriov_vf(adev)) {
5486 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5487 if (r)
5488 adev->asic_reset_res = r;
5489
5490 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so we need to resume RAS during reset */
5491 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5492 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5493 amdgpu_ras_resume(adev);
5494 } else {
5495 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5496 if (r && r == -EAGAIN)
5497 goto retry;
5498
5499 if (!r && gpu_reset_for_dev_remove)
5500 goto recover_end;
5501 }
5502
5503 skip_hw_reset:
5504
5505 /* Post ASIC reset for all devs. */
5506 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5507
5508 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5509 struct amdgpu_ring *ring = tmp_adev->rings[i];
5510
5511 if (!ring || !ring->sched.thread)
5512 continue;
5513
5514 drm_sched_start(&ring->sched, true);
5515 }
5516
5517 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5518 amdgpu_mes_self_test(tmp_adev);
5519
5520 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5521 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5522
5523 if (tmp_adev->asic_reset_res)
5524 r = tmp_adev->asic_reset_res;
5525
5526 tmp_adev->asic_reset_res = 0;
5527
5528 if (r) {
5529 /* bad news, how do we tell it to userspace? */
5530 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5531 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5532 } else {
5533 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5534 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5535 DRM_WARN("smart shift update failed\n");
5536 }
5537 }
5538
5539 skip_sched_resume:
5540 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5541 /* unlock kfd: SRIOV would do it separately */
5542 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5543 amdgpu_amdkfd_post_reset(tmp_adev);
5544
5545 /* kfd_post_reset will do nothing if the kfd device is not initialized;
5546 * we need to bring up kfd here if it was not initialized before.
5547 */
5548 if (!adev->kfd.init_complete)
5549 amdgpu_amdkfd_device_init(adev);
5550
5551 if (audio_suspended)
5552 amdgpu_device_resume_display_audio(tmp_adev);
5553
5554 amdgpu_device_unset_mp1_state(tmp_adev);
5555
5556 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5557 }
5558
5559 recover_end:
5560 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5561 reset_list);
5562 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5563
5564 if (hive) {
5565 mutex_unlock(&hive->hive_lock);
5566 amdgpu_put_xgmi_hive(hive);
5567 }
5568
5569 if (r)
5570 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5571
5572 atomic_set(&adev->reset_domain->reset_res, r);
5573 return r;
5574 }
5575
5576 /**
5577 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5578 *
5579 * @adev: amdgpu_device pointer
5580 *
5581 * Fetches and stores in the driver the PCIe capabilities (gen speed
5582 * and lanes) of the slot the device is in. Handles APUs and
5583 * virtualized environments where PCIe config space may not be available.
5584 */
5585 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5586 {
5587 struct pci_dev *pdev;
5588 enum pci_bus_speed speed_cap, platform_speed_cap;
5589 enum pcie_link_width platform_link_width;
5590
5591 if (amdgpu_pcie_gen_cap)
5592 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5593
5594 if (amdgpu_pcie_lane_cap)
5595 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5596
5597 /* covers APUs as well */
5598 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5599 if (adev->pm.pcie_gen_mask == 0)
5600 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5601 if (adev->pm.pcie_mlw_mask == 0)
5602 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5603 return;
5604 }
5605
5606 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5607 return;
5608
5609 pcie_bandwidth_available(adev->pdev, NULL,
5610 &platform_speed_cap, &platform_link_width);
5611
5612 if (adev->pm.pcie_gen_mask == 0) {
5613 /* asic caps */
5614 pdev = adev->pdev;
5615 speed_cap = pcie_get_speed_cap(pdev);
5616 if (speed_cap == PCI_SPEED_UNKNOWN) {
5617 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5618 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5619 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5620 } else {
5621 if (speed_cap == PCIE_SPEED_32_0GT)
5622 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5623 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5624 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5625 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5626 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5627 else if (speed_cap == PCIE_SPEED_16_0GT)
5628 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5629 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5630 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5631 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5632 else if (speed_cap == PCIE_SPEED_8_0GT)
5633 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5634 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5635 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5636 else if (speed_cap == PCIE_SPEED_5_0GT)
5637 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5638 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5639 else
5640 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5641 }
5642 /* platform caps */
5643 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5644 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5645 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5646 } else {
5647 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5648 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5649 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5650 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5651 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5652 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5653 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5654 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5655 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5656 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5657 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5658 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5659 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5660 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5661 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5662 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5663 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5664 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5665 else
5666 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5667
5668 }
5669 }
5670 if (adev->pm.pcie_mlw_mask == 0) {
5671 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5672 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5673 } else {
5674 switch (platform_link_width) {
5675 case PCIE_LNK_X32:
5676 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5677 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5678 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5679 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5680 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5681 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5682 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5683 break;
5684 case PCIE_LNK_X16:
5685 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5686 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5687 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5688 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5689 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5690 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5691 break;
5692 case PCIE_LNK_X12:
5693 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5694 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5695 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5696 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5697 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5698 break;
5699 case PCIE_LNK_X8:
5700 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5701 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5702 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5703 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5704 break;
5705 case PCIE_LNK_X4:
5706 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5707 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5708 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5709 break;
5710 case PCIE_LNK_X2:
5711 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5712 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5713 break;
5714 case PCIE_LNK_X1:
5715 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5716 break;
5717 default:
5718 break;
5719 }
5720 }
5721 }
5722 }
5723
5724 /**
5725 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5726 *
5727 * @adev: amdgpu_device pointer
5728 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5729 *
5730 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5731 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5732 * @peer_adev.
5733 */
5734 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5735 struct amdgpu_device *peer_adev)
5736 {
5737 #ifdef CONFIG_HSA_AMD_P2P
5738 uint64_t address_mask = peer_adev->dev->dma_mask ?
5739 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5740 resource_size_t aper_limit =
5741 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5742 bool p2p_access =
5743 !adev->gmc.xgmi.connected_to_cpu &&
5744 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5745
5746 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5747 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5748 !(adev->gmc.aper_base & address_mask ||
5749 aper_limit & address_mask));
5750 #else
5751 return false;
5752 #endif
5753 }
5754
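/**
 * amdgpu_device_baco_enter - enter the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Disable the RAS doorbell interrupt if RAS is enabled, then ask the
 * SMU to put the chip into BACO. Returns 0 on success, -ENOTSUPP if
 * BACO is not supported, or a negative error code from the DPM layer.
 */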
5755 int amdgpu_device_baco_enter(struct drm_device *dev)
5756 {
5757 struct amdgpu_device *adev = drm_to_adev(dev);
5758 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5759
5760 if (!amdgpu_device_supports_baco(dev))
5761 return -ENOTSUPP;
5762
5763 if (ras && adev->ras_enabled &&
5764 adev->nbio.funcs->enable_doorbell_interrupt)
5765 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5766
5767 return amdgpu_dpm_baco_enter(adev);
5768 }
5769
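/**
 * amdgpu_device_baco_exit - leave the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Bring the chip back out of BACO, re-enable the RAS doorbell
 * interrupt and, for passthrough configurations, clear any stale
 * doorbell interrupt state. Returns 0 on success or a negative error
 * code.
 */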
5770 int amdgpu_device_baco_exit(struct drm_device *dev)
5771 {
5772 struct amdgpu_device *adev = drm_to_adev(dev);
5773 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5774 int ret = 0;
5775
5776 if (!amdgpu_device_supports_baco(dev))
5777 return -ENOTSUPP;
5778
5779 ret = amdgpu_dpm_baco_exit(adev);
5780 if (ret)
5781 return ret;
5782
5783 if (ras && adev->ras_enabled &&
5784 adev->nbio.funcs->enable_doorbell_interrupt)
5785 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5786
5787 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
5788 adev->nbio.funcs->clear_doorbell_interrupt)
5789 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5790
5791 return 0;
5792 }
5793
5794 /**
5795 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5796 * @pdev: PCI device struct
5797 * @state: PCI channel state
5798 *
5799 * Description: Called when a PCI error is detected.
5800 *
5801 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5802 */
5803 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5804 {
5805 STUB();
5806 return 0;
5807 #ifdef notyet
5808 struct drm_device *dev = pci_get_drvdata(pdev);
5809 struct amdgpu_device *adev = drm_to_adev(dev);
5810 int i;
5811
5812 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5813
5814 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5815 DRM_WARN("No support for XGMI hive yet...");
5816 return PCI_ERS_RESULT_DISCONNECT;
5817 }
5818
5819 adev->pci_channel_state = state;
5820
5821 switch (state) {
5822 case pci_channel_io_normal:
5823 return PCI_ERS_RESULT_CAN_RECOVER;
5824 /* Fatal error, prepare for slot reset */
5825 case pci_channel_io_frozen:
5826 /*
5827 * Locking adev->reset_domain->sem will prevent any external access
5828 * to GPU during PCI error recovery
5829 */
5830 amdgpu_device_lock_reset_domain(adev->reset_domain);
5831 amdgpu_device_set_mp1_state(adev);
5832
5833 /*
5834 * Block any work scheduling as we do for regular GPU reset
5835 * for the duration of the recovery
5836 */
5837 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5838 struct amdgpu_ring *ring = adev->rings[i];
5839
5840 if (!ring || !ring->sched.thread)
5841 continue;
5842
5843 drm_sched_stop(&ring->sched, NULL);
5844 }
5845 atomic_inc(&adev->gpu_reset_counter);
5846 return PCI_ERS_RESULT_NEED_RESET;
5847 case pci_channel_io_perm_failure:
5848 /* Permanent error, prepare for device removal */
5849 return PCI_ERS_RESULT_DISCONNECT;
5850 }
5851
5852 return PCI_ERS_RESULT_NEED_RESET;
5853 #endif
5854 }
5855
5856 /**
5857 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5858 * @pdev: pointer to PCI device
5859 */
5860 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5861 {
5862
5863 DRM_INFO("PCI error: mmio enabled callback!!\n");
5864
5865 /* TODO - dump whatever for debugging purposes */
5866
5867 /* This is called only if amdgpu_pci_error_detected returns
5868 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5869 * works, so there is no need to reset the slot.
5870 */
5871
5872 return PCI_ERS_RESULT_RECOVERED;
5873 }
5874
5875 /**
5876 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5877 * @pdev: PCI device struct
5878 *
5879 * Description: This routine is called by the pci error recovery
5880 * code after the PCI slot has been reset, just before we
5881 * should resume normal operations.
5882 */
5883 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5884 {
5885 STUB();
5886 return PCI_ERS_RESULT_RECOVERED;
5887 #ifdef notyet
5888 struct drm_device *dev = pci_get_drvdata(pdev);
5889 struct amdgpu_device *adev = drm_to_adev(dev);
5890 int r, i;
5891 struct amdgpu_reset_context reset_context;
5892 u32 memsize;
5893 struct list_head device_list;
5894
5895 DRM_INFO("PCI error: slot reset callback!!\n");
5896
5897 memset(&reset_context, 0, sizeof(reset_context));
5898
5899 INIT_LIST_HEAD(&device_list);
5900 list_add_tail(&adev->reset_list, &device_list);
5901
5902 /* wait for asic to come out of reset */
5903 drm_msleep(500);
5904
5905 /* Restore PCI confspace */
5906 amdgpu_device_load_pci_state(pdev);
5907
5908 /* confirm ASIC came out of reset */
5909 for (i = 0; i < adev->usec_timeout; i++) {
5910 memsize = amdgpu_asic_get_config_memsize(adev);
5911
5912 if (memsize != 0xffffffff)
5913 break;
5914 udelay(1);
5915 }
5916 if (memsize == 0xffffffff) {
5917 r = -ETIME;
5918 goto out;
5919 }
5920
5921 reset_context.method = AMD_RESET_METHOD_NONE;
5922 reset_context.reset_req_dev = adev;
5923 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5924 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5925
5926 adev->no_hw_access = true;
5927 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5928 adev->no_hw_access = false;
5929 if (r)
5930 goto out;
5931
5932 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5933
5934 out:
5935 if (!r) {
5936 if (amdgpu_device_cache_pci_state(adev->pdev))
5937 pci_restore_state(adev->pdev);
5938
5939 DRM_INFO("PCIe error recovery succeeded\n");
5940 } else {
5941 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5942 amdgpu_device_unset_mp1_state(adev);
5943 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5944 }
5945
5946 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5947 #endif
5948 }
5949
5950 /**
5951 * amdgpu_pci_resume() - resume normal ops after PCI reset
5952 * @pdev: pointer to PCI device
5953 *
5954 * Called when the error recovery driver tells us that it's
5955 * OK to resume normal operation.
5956 */
5957 void amdgpu_pci_resume(struct pci_dev *pdev)
5958 {
5959 STUB();
5960 #ifdef notyet
5961 struct drm_device *dev = pci_get_drvdata(pdev);
5962 struct amdgpu_device *adev = drm_to_adev(dev);
5963 int i;
5964
5965
5966 DRM_INFO("PCI error: resume callback!!\n");
5967
5968 /* Only continue execution for the case of pci_channel_io_frozen */
5969 if (adev->pci_channel_state != pci_channel_io_frozen)
5970 return;
5971
5972 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5973 struct amdgpu_ring *ring = adev->rings[i];
5974
5975 if (!ring || !ring->sched.thread)
5976 continue;
5977
5978 drm_sched_start(&ring->sched, true);
5979 }
5980
5981 amdgpu_device_unset_mp1_state(adev);
5982 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5983 #endif
5984 }
5985
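/**
 * amdgpu_device_cache_pci_state - save the PCI config space of the device
 *
 * @pdev: PCI device struct
 *
 * Save the current PCI config space so it can be restored after a GPU
 * reset. Stubbed out in this port; the Linux path keeps the saved
 * state in adev->pci_state.
 */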
5986 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5987 {
5988 return false;
5989 #ifdef notyet
5990 struct drm_device *dev = pci_get_drvdata(pdev);
5991 struct amdgpu_device *adev = drm_to_adev(dev);
5992 int r;
5993
5994 r = pci_save_state(pdev);
5995 if (!r) {
5996 kfree(adev->pci_state);
5997
5998 adev->pci_state = pci_store_saved_state(pdev);
5999
6000 if (!adev->pci_state) {
6001 DRM_ERROR("Failed to store PCI saved state");
6002 return false;
6003 }
6004 } else {
6005 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6006 return false;
6007 }
6008
6009 return true;
6010 #endif
6011 }
6012
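/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Reload the PCI config space previously saved by
 * amdgpu_device_cache_pci_state(). Stubbed out in this port; the
 * Linux path restores it from adev->pci_state.
 */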
6013 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6014 {
6015 STUB();
6016 return false;
6017 #ifdef notyet
6018 struct drm_device *dev = pci_get_drvdata(pdev);
6019 struct amdgpu_device *adev = drm_to_adev(dev);
6020 int r;
6021
6022 if (!adev->pci_state)
6023 return false;
6024
6025 r = pci_load_saved_state(pdev, adev->pci_state);
6026
6027 if (!r) {
6028 pci_restore_state(pdev);
6029 } else {
6030 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6031 return false;
6032 }
6033
6034 return true;
6035 #endif
6036 }
6037
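/**
 * amdgpu_device_flush_hdp - flush the HDP (host data path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring to emit the flush on, may be NULL
 *
 * Skipped for bare-metal APUs and for GPUs whose memory is connected
 * to the CPU. Otherwise the flush is emitted on @ring if the ring
 * supports it, or performed through the ASIC callback directly.
 */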
6038 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6039 struct amdgpu_ring *ring)
6040 {
6041 #ifdef CONFIG_X86_64
6042 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6043 return;
6044 #endif
6045 if (adev->gmc.xgmi.connected_to_cpu)
6046 return;
6047
6048 if (ring && ring->funcs->emit_hdp_flush)
6049 amdgpu_ring_emit_hdp_flush(ring);
6050 else
6051 amdgpu_asic_flush_hdp(adev, ring);
6052 }
6053
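/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (host data path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: ring the invalidation is associated with, may be NULL
 *
 * Skipped under the same conditions as amdgpu_device_flush_hdp();
 * otherwise forwarded to the ASIC-specific invalidate callback.
 */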
6054 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6055 struct amdgpu_ring *ring)
6056 {
6057 #ifdef CONFIG_X86_64
6058 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6059 return;
6060 #endif
6061 if (adev->gmc.xgmi.connected_to_cpu)
6062 return;
6063
6064 amdgpu_asic_invalidate_hdp(adev, ring);
6065 }
6066
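/**
 * amdgpu_in_reset - check whether a GPU reset is in progress
 *
 * @adev: amdgpu_device pointer
 *
 * Returns a non-zero value while the reset domain owning this device
 * is performing a GPU reset.
 */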
6067 int amdgpu_in_reset(struct amdgpu_device *adev)
6068 {
6069 return atomic_read(&adev->reset_domain->in_gpu_reset);
6070 }
6071
6072 /**
6073 * amdgpu_device_halt() - bring hardware to some kind of halt state
6074 *
6075 * @adev: amdgpu_device pointer
6076 *
6077 * Bring hardware to some kind of halt state so that no one can touch it
6078 * any more. It helps to maintain the error context when an error occurs.
6079 * Compared to a simple hang, the system stays stable at least for SSH
6080 * access, so it should be trivial to inspect the hardware state and
6081 * see what's going on. Implemented as follows:
6082 *
6083 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6084 * clears all CPU mappings to device, disallows remappings through page faults
6085 * 2. amdgpu_irq_disable_all() disables all interrupts
6086 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6087 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6088 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6089 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6090 * flush any in-flight DMA operations
6091 */
6092 void amdgpu_device_halt(struct amdgpu_device *adev)
6093 {
6094 struct pci_dev *pdev = adev->pdev;
6095 struct drm_device *ddev = adev_to_drm(adev);
6096
6097 amdgpu_xcp_dev_unplug(adev);
6098 drm_dev_unplug(ddev);
6099
6100 amdgpu_irq_disable_all(adev);
6101
6102 amdgpu_fence_driver_hw_fini(adev);
6103
6104 adev->no_hw_access = true;
6105
6106 amdgpu_device_unmap_mmio(adev);
6107
6108 pci_disable_device(pdev);
6109 pci_wait_for_pending_transaction(pdev);
6110 }
6111
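/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword index of the register to read
 *
 * Indirect read through the NBIO index/data register pair, serialized
 * by pcie_idx_lock. The posting read of the index register makes sure
 * the index write has landed before the data register is read.
 */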
6112 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6113 u32 reg)
6114 {
6115 unsigned long flags, address, data;
6116 u32 r;
6117
6118 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6119 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6120
6121 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6122 WREG32(address, reg * 4);
6123 (void)RREG32(address);
6124 r = RREG32(data);
6125 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6126 return r;
6127 }
6128
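/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword index of the register to write
 * @v: value to write
 *
 * Indirect write through the NBIO index/data register pair, serialized
 * by pcie_idx_lock, with posting reads after both the index and the
 * data writes.
 */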
6129 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6130 u32 reg, u32 v)
6131 {
6132 unsigned long flags, address, data;
6133
6134 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6135 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6136
6137 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6138 WREG32(address, reg * 4);
6139 (void)RREG32(address);
6140 WREG32(data, v);
6141 (void)RREG32(data);
6142 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6143 }
6144
6145 /**
6146 * amdgpu_device_switch_gang - switch to a new gang
6147 * @adev: amdgpu_device pointer
6148 * @gang: the gang to switch to
6149 *
6150 * Try to switch to a new gang.
6151 * Returns: NULL if we switched to the new gang or a reference to the current
6152 * gang leader.
6153 */
6154 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6155 struct dma_fence *gang)
6156 {
6157 struct dma_fence *old = NULL;
6158
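	/*
	 * Atomically replace the current gang leader. Retry if the leader
	 * changed under us; if the old leader has not signaled yet, return
	 * it so the caller can wait on it before switching.
	 */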
6159 do {
6160 dma_fence_put(old);
6161 rcu_read_lock();
6162 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6163 rcu_read_unlock();
6164
6165 if (old == gang)
6166 break;
6167
6168 if (!dma_fence_is_signaled(old))
6169 return old;
6170
6171 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6172 old, gang) != old);
6173
6174 dma_fence_put(old);
6175 return NULL;
6176 }
6177
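/**
 * amdgpu_device_has_display_hardware - check whether the ASIC has display engines
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC has display hardware, based on a fixed list
 * for legacy chips and on IP discovery (a DCE IP version being present
 * and the DMU block not being harvested) for newer ones.
 */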
6178 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6179 {
6180 switch (adev->asic_type) {
6181 #ifdef CONFIG_DRM_AMDGPU_SI
6182 case CHIP_HAINAN:
6183 #endif
6184 case CHIP_TOPAZ:
6185 /* chips with no display hardware */
6186 return false;
6187 #ifdef CONFIG_DRM_AMDGPU_SI
6188 case CHIP_TAHITI:
6189 case CHIP_PITCAIRN:
6190 case CHIP_VERDE:
6191 case CHIP_OLAND:
6192 #endif
6193 #ifdef CONFIG_DRM_AMDGPU_CIK
6194 case CHIP_BONAIRE:
6195 case CHIP_HAWAII:
6196 case CHIP_KAVERI:
6197 case CHIP_KABINI:
6198 case CHIP_MULLINS:
6199 #endif
6200 case CHIP_TONGA:
6201 case CHIP_FIJI:
6202 case CHIP_POLARIS10:
6203 case CHIP_POLARIS11:
6204 case CHIP_POLARIS12:
6205 case CHIP_VEGAM:
6206 case CHIP_CARRIZO:
6207 case CHIP_STONEY:
6208 /* chips with display hardware */
6209 return true;
6210 default:
6211 /* IP discovery */
6212 if (!adev->ip_versions[DCE_HWIP][0] ||
6213 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6214 return false;
6215 return true;
6216 }
6217 }
6218
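/**
 * amdgpu_device_wait_on_rreg - poll a register until it reaches an expected value
 *
 * @adev: amdgpu_device pointer
 * @inst: instance number, only used in the timeout warning
 * @reg_addr: register offset to poll
 * @reg_name: human-readable register name for the timeout warning
 * @expected_value: value to wait for
 * @mask: bits of the register to compare against @expected_value
 *
 * Busy-poll @reg_addr until (value & @mask) == @expected_value. The
 * timeout counter restarts whenever the register value changes, so the
 * wait only gives up after adev->usec_timeout polls without any change.
 * Returns 0 on success or -ETIMEDOUT.
 *
 * Example (hypothetical register and bit names, for illustration only):
 *
 *   r = amdgpu_device_wait_on_rreg(adev, 0, mmFOO_STATUS, "FOO_STATUS",
 *                                  FOO_STATUS__IDLE_MASK,
 *                                  FOO_STATUS__IDLE_MASK);
 *   if (r)
 *       dev_err(adev->dev, "FOO block did not go idle\n");
 */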
6219 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6220 uint32_t inst, uint32_t reg_addr, char reg_name[],
6221 uint32_t expected_value, uint32_t mask)
6222 {
6223 uint32_t ret = 0;
6224 uint32_t old_ = 0;
6225 uint32_t tmp_ = RREG32(reg_addr);
6226 uint32_t loop = adev->usec_timeout;
6227
6228 while ((tmp_ & (mask)) != (expected_value)) {
6229 if (old_ != tmp_) {
6230 loop = adev->usec_timeout;
6231 old_ = tmp_;
6232 } else
6233 udelay(1);
6234 tmp_ = RREG32(reg_addr);
6235 loop--;
6236 if (!loop) {
6237 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6238 inst, reg_name, (uint32_t)expected_value,
6239 (uint32_t)(tmp_ & (mask)));
6240 ret = -ETIMEDOUT;
6241 break;
6242 }
6243 }
6244 return ret;
6245 }
6246