1 /* $NetBSD: amdgpu_device.c,v 1.19 2023/05/25 12:07:43 riastradh Exp $ */
2
3 /*
4 * Copyright 2008 Advanced Micro Devices, Inc.
5 * Copyright 2008 Red Hat Inc.
6 * Copyright 2009 Jerome Glisse.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
22 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
23 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
25 *
26 * Authors: Dave Airlie
27 * Alex Deucher
28 * Jerome Glisse
29 */
30 #include <sys/cdefs.h>
31 __KERNEL_RCSID(0, "$NetBSD: amdgpu_device.c,v 1.19 2023/05/25 12:07:43 riastradh Exp $");
32
33 #include <linux/power_supply.h>
34 #include <linux/kthread.h>
35 #include <linux/module.h>
36 #include <linux/console.h>
37 #include <linux/slab.h>
38 #include <linux/reboot.h>
39
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_probe_helper.h>
42 #include <drm/amdgpu_drm.h>
43 #include <linux/vgaarb.h>
44 #include <linux/vga_switcheroo.h>
45 #include <linux/efi.h>
46 #include "amdgpu.h"
47 #include "amdgpu_trace.h"
48 #include "amdgpu_i2c.h"
49 #include "atom.h"
50 #include "amdgpu_atombios.h"
51 #include "amdgpu_atomfirmware.h"
52 #include "amd_pcie.h"
53 #ifdef CONFIG_DRM_AMDGPU_SI
54 #include "si.h"
55 #endif
56 #ifdef CONFIG_DRM_AMDGPU_CIK
57 #include "cik.h"
58 #endif
59 #include "vi.h"
60 #include "soc15.h"
61 #include "nv.h"
62 #include "bif/bif_4_1_d.h"
63 #include <linux/pci.h>
64 #include <linux/firmware.h>
65 #include "amdgpu_vf_error.h"
66
67 #include "amdgpu_amdkfd.h"
68 #include "amdgpu_pm.h"
69
70 #include "amdgpu_xgmi.h"
71 #include "amdgpu_ras.h"
72 #include "amdgpu_pmu.h"
73
74 #include <linux/suspend.h>
75 #include <drm/task_barrier.h>
76 #include <linux/nbsd-namespace.h>
77
78 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
85 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
86 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
87 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
88
89 #define AMDGPU_RESUME_MS 2000
90
91 const char *amdgpu_asic_name[] = {
92 "TAHITI",
93 "PITCAIRN",
94 "VERDE",
95 "OLAND",
96 "HAINAN",
97 "BONAIRE",
98 "KAVERI",
99 "KABINI",
100 "HAWAII",
101 "MULLINS",
102 "TOPAZ",
103 "TONGA",
104 "FIJI",
105 "CARRIZO",
106 "STONEY",
107 "POLARIS10",
108 "POLARIS11",
109 "POLARIS12",
110 "VEGAM",
111 "VEGA10",
112 "VEGA12",
113 "VEGA20",
114 "RAVEN",
115 "ARCTURUS",
116 "RENOIR",
117 "NAVI10",
118 "NAVI14",
119 "NAVI12",
120 "LAST",
121 };
122
123 #ifndef __NetBSD__ /* XXX amdgpu sysfs */
124
125 /**
126 * DOC: pcie_replay_count
127 *
128 * The amdgpu driver provides a sysfs API for reporting the total number
129 * of PCIe replays (NAKs)
130 * The file pcie_replay_count is used for this and returns the total
131 * number of replays as a sum of the NAKs generated and NAKs received
132 */
133
/*
 * sysfs "show" callback for pcie_replay_count: formats the ASIC's total
 * PCIe replay (NAK) counter into @buf, returns the number of bytes written.
 */
static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

/* Read-only attribute node, backed by the show callback above. */
static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);
146
147 #endif /* __NetBSD__ */
148
149 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
150
151 /**
152 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
153 *
154 * @dev: drm_device pointer
155 *
156 * Returns true if the device is a dGPU with HG/PX power control,
157 * otherwise return false.
158 */
amdgpu_device_supports_boco(struct drm_device * dev)159 bool amdgpu_device_supports_boco(struct drm_device *dev)
160 {
161 struct amdgpu_device *adev = dev->dev_private;
162
163 if (adev->flags & AMD_IS_PX)
164 return true;
165 return false;
166 }
167
168 /**
169 * amdgpu_device_supports_baco - Does the device support BACO
170 *
171 * @dev: drm_device pointer
172 *
 * Returns true if the device supports BACO,
174 * otherwise return false.
175 */
amdgpu_device_supports_baco(struct drm_device * dev)176 bool amdgpu_device_supports_baco(struct drm_device *dev)
177 {
178 struct amdgpu_device *adev = dev->dev_private;
179
180 return amdgpu_asic_supports_baco(adev);
181 }
182
183 /**
184 * VRAM access helper functions.
185 *
186 * amdgpu_device_vram_access - read/write a buffer in vram
187 *
188 * @adev: amdgpu_device pointer
189 * @pos: offset of the buffer in vram
190 * @buf: virtual address of the buffer in system memory
191 * @size: read/write size, sizeof(@buf) must > @size
192 * @write: true - write to vram, otherwise - read from vram
193 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	uint64_t last;
	unsigned long flags;

	/* Walk the range one dword at a time; 'last' is the offset of the
	 * final dword to transfer. */
	last = size - 4;
	for (last += pos; pos <= last; pos += 4) {
		/* Each dword goes through the MM_INDEX/MM_DATA indirect
		 * window; the lock keeps the index/data pair from being
		 * interleaved with other indirect accesses. */
		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		/* Bit 31 of MM_INDEX selects the VRAM aperture. */
		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31);
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
}
212
213 /*
214 * MMIO register access helper functions.
215 */
216 /**
217 * amdgpu_mm_rreg - read a memory mapped IO register
218 *
219 * @adev: amdgpu_device pointer
220 * @reg: dword aligned register offset
221 * @acc_flags: access flags which require special behavior
222 *
223 * Returns the 32 bit value from the offset specified.
224 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	/* Route through the KIQ when explicitly requested, or when running
	 * as an SR-IOV guest (unless the caller opted out with NO_KIQ). */
	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
		return amdgpu_kiq_rreg(adev, reg);

	/* Fast path: register lies within the direct MMIO mapping.
	 * NOTE(review): the NetBSD branch returns here, skipping the
	 * trace_amdgpu_mm_rreg call that the Linux branch still reaches. */
	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
#ifdef __NetBSD__
		return bus_space_read_4(adev->rmmiot, adev->rmmioh, 4*reg);
#else
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
#endif
	else {
		unsigned long flags;

		/* Slow path: indirect access via the MM_INDEX/MM_DATA
		 * window, serialized by mmio_idx_lock. */
		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
		    4*reg);
		ret = bus_space_read_4(adev->rmmiot, adev->rmmioh,
		    4*mmMM_DATA);
#else
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
#endif
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}
257
258 /*
259 * MMIO register read with bytes helper functions
260 * @offset:bytes offset from MMIO start
261 *
262 */
263
264 /**
265 * amdgpu_mm_rreg8 - read a memory mapped IO register
266 *
267 * @adev: amdgpu_device pointer
268 * @offset: byte aligned register offset
269 *
270 * Returns the 8 bit value from the offset specified.
271 */
amdgpu_mm_rreg8(struct amdgpu_device * adev,uint32_t offset)272 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
273 if (offset < adev->rmmio_size)
274 #ifdef __NetBSD__
275 return bus_space_read_1(adev->rmmiot, adev->rmmioh, offset);
276 #else
277 return (readb(adev->rmmio + offset));
278 #endif
279 BUG();
280 }
281
282 /*
283 * MMIO register write with bytes helper functions
284 * @offset:bytes offset from MMIO start
285 * @value: the value want to be written to the register
286 *
287 */
288 /**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
290 *
291 * @adev: amdgpu_device pointer
292 * @offset: byte aligned register offset
293 * @value: 8 bit value to write
294 *
295 * Writes the value specified to the offset specified.
296 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	/* An out-of-range byte offset is a driver bug. */
	if (offset >= adev->rmmio_size)
		BUG();
#ifdef __NetBSD__
	bus_space_write_1(adev->rmmiot, adev->rmmioh, offset, value);
#else
	writeb(value, adev->rmmio + offset);
#endif
}
307
308 /**
309 * amdgpu_mm_wreg - write to a memory mapped IO register
310 *
311 * @adev: amdgpu_device pointer
312 * @reg: dword aligned register offset
313 * @v: 32 bit value to write to the register
314 * @acc_flags: access flags which require special behavior
315 *
316 * Writes the value specified to the offset specified.
317 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

	/* Vega10+: remember the last value written to register 0
	 * (MM_INDEX); used below to detect the 0x5702C sequence. */
	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	/* Route through the KIQ when requested, or when running as an
	 * SR-IOV guest (unless the caller opted out with NO_KIQ). */
	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
		return amdgpu_kiq_wreg(adev, reg, v);

	/* Fast path: direct MMIO write within the mapped range. */
	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*reg, v);
#else
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
#endif
	else {
		unsigned long flags;

		/* Slow path: indirect write via MM_INDEX/MM_DATA,
		 * serialized by mmio_idx_lock. */
		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
		    reg*4);
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_DATA, v);
#else
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
#endif
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}

	/* Vega10+: a write to MM_DATA (reg 1) following MM_INDEX=0x5702C
	 * needs a settle delay. */
	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}
355
356 /**
357 * amdgpu_io_rreg - read an IO register
358 *
359 * @adev: amdgpu_device pointer
360 * @reg: dword aligned register offset
361 *
362 * Returns the 32 bit value from the offset specified.
363 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	/* Direct access when the register lies within the I/O BAR... */
	if ((reg * 4) < adev->rio_mem_size)
#ifdef __NetBSD__
		return bus_space_read_4(adev->rio_memt, adev->rio_memh, 4*reg);
#else
		return ioread32(adev->rio_mem + (reg * 4));
#endif
	else {
		/* ...otherwise use the MM_INDEX/MM_DATA indirect window.
		 * NOTE(review): unlike the MMIO path, no mmio_idx_lock is
		 * taken around the index/data pair here. */
#ifdef __NetBSD__
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX,
		    4*reg);
		return bus_space_read_4(adev->rio_memt, adev->rio_memh,
		    4*mmMM_DATA);
#else
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
#endif
	}
}
384
385 /**
386 * amdgpu_io_wreg - write to an IO register
387 *
388 * @adev: amdgpu_device pointer
389 * @reg: dword aligned register offset
390 * @v: 32 bit value to write to the register
391 *
392 * Writes the value specified to the offset specified.
393 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	/* Vega10+: remember the last value written to register 0
	 * (MM_INDEX); used below to detect the 0x5702C sequence. */
	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	/* Direct access when the register lies within the I/O BAR... */
	if ((reg * 4) < adev->rio_mem_size)
#ifdef __NetBSD__
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*reg, v);
#else
		iowrite32(v, adev->rio_mem + (reg * 4));
#endif
	else {
		/* ...otherwise go through the MM_INDEX/MM_DATA window. */
#ifdef __NetBSD__
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX,
		    4*reg);
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_DATA,
		    v);
#else
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
#endif
	}

	/* Vega10+: a write to MM_DATA (reg 1) following MM_INDEX=0x5702C
	 * needs a settle delay. */
	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}
422
423 /**
424 * amdgpu_mm_rdoorbell - read a doorbell dword
425 *
426 * @adev: amdgpu_device pointer
427 * @index: doorbell index
428 *
429 * Returns the value in the doorbell aperture at the
430 * requested doorbell index (CIK).
431 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
		/* index counts dwords; bus_space offset is in bytes. */
		return bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
#else
		return readl(adev->doorbell.ptr + index);
#endif
	} else {
		/* Out-of-range reads are reported and return 0. */
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}
446
447 /**
448 * amdgpu_mm_wdoorbell - write a doorbell dword
449 *
450 * @adev: amdgpu_device pointer
451 * @index: doorbell index
452 * @v: value to write
453 *
454 * Writes @v to the doorbell aperture at the
455 * requested doorbell index (CIK).
456 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
		/* index counts dwords; bus_space offset is in bytes. */
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v);
#else
		writel(v, adev->doorbell.ptr + index);
#endif
	} else {
		/* Out-of-range writes are reported and dropped. */
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}
470
471 /**
472 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
473 *
474 * @adev: amdgpu_device pointer
475 * @index: doorbell index
476 *
477 * Returns the value in the doorbell aperture at the
478 * requested doorbell index (VEGA10+).
479 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
#ifdef _LP64
		/* 64-bit hosts: single 8-byte read. */
		return bus_space_read_8(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
#else
		/* 32-bit hosts: two 4-byte reads, halves ordered by host
		 * endianness.  Not a single atomic access. */
		uint64_t lo, hi;
#if _BYTE_ORDER == _LITTLE_ENDIAN
		lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
		hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4);
#else
		hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
		lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4);
#endif
		return lo | (hi << 32);
#endif
#else
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
#endif
	} else {
		/* Out-of-range reads are reported and return 0. */
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}
510
511 /**
512 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
513 *
514 * @adev: amdgpu_device pointer
515 * @index: doorbell index
516 * @v: value to write
517 *
518 * Writes @v to the doorbell aperture at the
519 * requested doorbell index (VEGA10+).
520 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
#ifdef _LP64
		/* 64-bit hosts: single 8-byte write. */
		bus_space_write_8(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v);
#else
		/*
		 * XXX This might not be as atomic as one might hope...
		 * (two 4-byte writes, halves ordered by host endianness)
		 */
#if _BYTE_ORDER == _LITTLE_ENDIAN
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v & 0xffffffffU);
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4, v >> 32);
#else
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v >> 32);
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4, v & 0xffffffffU);
#endif
#endif
#else
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
#endif
	} else {
		/* Out-of-range writes are reported and dropped. */
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}
551
552 /**
553 * amdgpu_invalid_rreg - dummy reg read function
554 *
555 * @adev: amdgpu device pointer
556 * @reg: offset of register
557 *
558 * Dummy register read function. Used for register blocks
559 * that certain asics don't have (all asics).
560 * Returns the value in the register.
561 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	/* Reaching this stub is a driver bug: the asic has no such block. */
	BUG();
	return 0;
}
568
569 /**
570 * amdgpu_invalid_wreg - dummy reg write function
571 *
572 * @adev: amdgpu device pointer
573 * @reg: offset of register
574 * @v: value to write to the register
575 *
 * Dummy register write function. Used for register blocks
577 * that certain asics don't have (all asics).
578 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	/* Reaching this stub is a driver bug: the asic has no such block. */
	BUG();
}
585
586 /**
587 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
588 *
589 * @adev: amdgpu device pointer
590 * @reg: offset of register
591 *
592 * Dummy register read function. Used for register blocks
593 * that certain asics don't have (all asics).
594 * Returns the value in the register.
595 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	/* Reaching this stub is a driver bug: the asic has no such block. */
	BUG();
	return 0;
}
602
603 /**
604 * amdgpu_invalid_wreg64 - dummy reg write function
605 *
606 * @adev: amdgpu device pointer
607 * @reg: offset of register
608 * @v: value to write to the register
609 *
 * Dummy register write function. Used for register blocks
611 * that certain asics don't have (all asics).
612 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08"PRIX64"\n",
		  reg, v);
	/* Reaching this stub is a driver bug: the asic has no such block. */
	BUG();
}
619
620 /**
621 * amdgpu_block_invalid_rreg - dummy reg read function
622 *
623 * @adev: amdgpu device pointer
624 * @block: offset of instance
625 * @reg: offset of register
626 *
627 * Dummy register read function. Used for register blocks
628 * that certain asics don't have (all asics).
629 * Returns the value in the register.
630 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	/* Reaching this stub is a driver bug: the asic has no such block. */
	BUG();
	return 0;
}
639
640 /**
641 * amdgpu_block_invalid_wreg - dummy reg write function
642 *
643 * @adev: amdgpu device pointer
644 * @block: offset of instance
645 * @reg: offset of register
646 * @v: value to write to the register
647 *
 * Dummy register write function. Used for register blocks
649 * that certain asics don't have (all asics).
650 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	/* Reaching this stub is a driver bug: the asic has no such block. */
	BUG();
}
659
660 /**
661 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
662 *
663 * @adev: amdgpu device pointer
664 *
665 * Allocates a scratch page of VRAM for use by various things in the
666 * driver.
667 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	/* Allocate one GPU page of VRAM; BO handle, GPU address and CPU
	 * pointer land in adev->vram_scratch.  Returns 0 or -errno. */
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)__UNVOLATILE(&adev->vram_scratch.ptr));
}
676
677 /**
678 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
679 *
680 * @adev: amdgpu device pointer
681 *
682 * Frees the VRAM scratch page.
683 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	/* Releases the BO allocated by amdgpu_device_vram_scratch_init. */
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}
688
689 /**
690 * amdgpu_device_program_register_sequence - program an array of registers.
691 *
692 * @adev: amdgpu_device pointer
693 * @registers: pointer to the register array
694 * @array_size: size of the register array
695 *
 * Programs an array of registers with AND and OR masks.
697 * This is a helper for setting golden registers.
698 */
amdgpu_device_program_register_sequence(struct amdgpu_device * adev,const u32 * registers,const u32 array_size)699 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
700 const u32 *registers,
701 const u32 array_size)
702 {
703 u32 tmp, reg, and_mask, or_mask;
704 int i;
705
706 if (array_size % 3)
707 return;
708
709 for (i = 0; i < array_size; i +=3) {
710 reg = registers[i + 0];
711 and_mask = registers[i + 1];
712 or_mask = registers[i + 2];
713
714 if (and_mask == 0xffffffff) {
715 tmp = or_mask;
716 } else {
717 tmp = RREG32(reg);
718 tmp &= ~and_mask;
719 if (adev->family >= AMDGPU_FAMILY_AI)
720 tmp |= (or_mask & and_mask);
721 else
722 tmp |= or_mask;
723 }
724 WREG32(reg, tmp);
725 }
726 }
727
728 /**
729 * amdgpu_device_pci_config_reset - reset the GPU
730 *
731 * @adev: amdgpu_device pointer
732 *
733 * Resets the GPU using the pci config reset sequence.
734 * Only applicable to asics prior to vega10.
735 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	/* Writing the reset magic to PCI config offset 0x7c triggers the
	 * asic reset (pre-vega10 parts only, per the comment above). */
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}
740
741 /*
742 * GPU doorbell aperture helpers function.
743 */
744 /**
745 * amdgpu_device_doorbell_init - Init doorbell driver information.
746 *
747 * @adev: amdgpu_device pointer
748 *
749 * Init doorbell driver information (CIK)
750 * Returns 0 on success, error on failure.
751 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
#ifndef __NetBSD__
		adev->doorbell.ptr = NULL;
#endif
		return 0;
	}

	/* The doorbell aperture is PCI BAR 2; bail if it is unassigned. */
	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	/* Cap the doorbell count by both the BAR size and the highest
	 * index the asic actually assigns. */
	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should + 1 page (0x400 in dword)
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

#ifdef __NetBSD__
	int r;
	adev->doorbell.bst = adev->pdev->pd_pa.pa_memt;
	/* XXX errno NetBSD->Linux */
	r = -bus_space_map(adev->doorbell.bst, adev->doorbell.base,
	    adev->doorbell.num_doorbells * sizeof(u32), 0,
	    &adev->doorbell.bsh);
	if (r)
		return r;
#else
	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;
#endif

	return 0;
}
808
809 /**
810 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
811 *
812 * @adev: amdgpu_device pointer
813 *
814 * Tear down doorbell driver information (CIK)
815 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
#ifdef __NetBSD__
	/* Only unmap if doorbell_init actually mapped the aperture;
	 * num_doorbells doubles as the "is mapped" flag here. */
	if (adev->doorbell.num_doorbells) {
		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
		    adev->doorbell.num_doorbells * sizeof(u32));
		adev->doorbell.num_doorbells = 0;
	}
#else
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
#endif
}
829
830
831
832 /*
833 * amdgpu_device_wb_*()
834 * Writeback is the method by which the GPU updates special pages in memory
835 * with the status of certain GPU events (fences, ring pointers,etc.).
836 */
837
838 /**
839 * amdgpu_device_wb_fini - Disable Writeback and free memory
840 *
841 * @adev: amdgpu_device pointer
842 *
843 * Disables Writeback and frees the Writeback memory (all asics).
844 * Used at driver shutdown.
845 */
amdgpu_device_wb_fini(struct amdgpu_device * adev)846 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
847 {
848 if (adev->wb.wb_obj) {
849 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
850 &adev->wb.gpu_addr,
851 (void **)__UNVOLATILE(&adev->wb.wb));
852 adev->wb.wb_obj = NULL;
853 }
854 }
855
856 /**
857 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
858 *
859 * @adev: amdgpu_device pointer
860 *
861 * Initializes writeback and allocates writeback memory (all asics).
862 * Used at driver startup.
863 * Returns 0 on success or an -error on failure.
864 */
amdgpu_device_wb_init(struct amdgpu_device * adev)865 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
866 {
867 int r;
868
869 if (adev->wb.wb_obj == NULL) {
870 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
871 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
872 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
873 &adev->wb.wb_obj, &adev->wb.gpu_addr,
874 (void **)__UNVOLATILE(&adev->wb.wb));
875 if (r) {
876 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
877 return r;
878 }
879
880 adev->wb.num_wb = AMDGPU_MAX_WB;
881 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
882
883 /* clear wb memory */
884 memset(__UNVOLATILE(adev->wb.wb), 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
885 }
886
887 return 0;
888 }
889
890 /**
891 * amdgpu_device_wb_get - Allocate a wb entry
892 *
893 * @adev: amdgpu_device pointer
894 * @wb: wb index
895 *
896 * Allocate a wb slot for use by the driver (all asics).
897 * Returns 0 on success or -EINVAL on failure.
898 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long slot = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	/* All slots in use. */
	if (slot >= adev->wb.num_wb)
		return -EINVAL;

	__set_bit(slot, adev->wb.used);
	*wb = slot << 3;	/* slot index -> dword offset (8 dwords/slot) */
	return 0;
}
911
912 /**
913 * amdgpu_device_wb_free - Free a wb entry
914 *
915 * @adev: amdgpu_device pointer
916 * @wb: wb index
917 *
918 * Free a wb slot allocated for use by the driver (all asics)
919 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	const u32 slot = wb >> 3;	/* dword offset -> slot index */

	if (slot < adev->wb.num_wb)
		__clear_bit(slot, adev->wb.used);
}
926
927 /**
928 * amdgpu_device_resize_fb_bar - try to resize FB BAR
929 *
930 * @adev: amdgpu_device pointer
931 *
932 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
933 * to fail, but if any of the BARs is not accessible after the size we abort
934 * driver loading by returning -ENODEV.
935 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

#ifdef __NetBSD__ /* XXX amdgpu fb resize */
	/* BAR resizing is not implemented on NetBSD: consume the locals to
	 * silence unused-variable warnings and report success. */
	__USE(space_needed);
	__USE(rbar_size);
	__USE(root);
	__USE(res);
	__USE(i);
	__USE(cmd);
	__USE(r);
#else

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* Re-enable memory decoding with the original command word. */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

#endif

	return 0;
}
1008
1009 /*
1010 * GPU helpers function.
1011 */
1012 /**
1013 * amdgpu_device_need_post - check if the hw need post or not
1014 *
1015 * @adev: amdgpu_device pointer
1016 *
1017 * Check if the asic has been initialized (all asics) at driver startup
1018 * or post is needed if hw reset is performed.
1019 * Returns true if need or false if not.
1020 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	/* The host posts the card for SR-IOV guests. */
	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occured */
			if (err)
				return true;

			/* SMC firmware version dword sits at dword offset 69
			 * of the image. */
			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* One-shot flag set elsewhere when a hw reset was performed. */
	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	/* A sane, nonzero memsize means the card is already posted. */
	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}
1065
1066 #ifndef __NetBSD__ /* XXX amdgpu vga */
1067 /* if we get transitioned to only one device, take VGA back */
1068 /**
1069 * amdgpu_device_vga_set_decode - enable/disable vga decode
1070 *
1071 * @cookie: amdgpu_device pointer
1072 * @state: enable/disable vga decode
1073 *
1074 * Enable/disable vga decode (all asics).
1075 * Returns VGA resource flags.
1076 */
amdgpu_device_vga_set_decode(void * cookie,bool state)1077 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1078 {
1079 struct amdgpu_device *adev = cookie;
1080 amdgpu_asic_set_vga_state(adev, state);
1081 if (state)
1082 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1083 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1084 else
1085 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1086 }
1087 #endif /* __NetBSD__ */
1088
1089 /**
1090 * amdgpu_device_check_block_size - validate the vm block size
1091 *
1092 * @adev: amdgpu_device pointer
1093 *
1094 * Validates the vm block size specified via module parameter.
1095 * The vm block size defines number of bits in page table versus page directory,
1096 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1097 * page table and the remaining bits are in the page directory.
1098 */
amdgpu_device_check_block_size(struct amdgpu_device * adev)1099 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1100 {
1101 /* defines number of bits in page table versus page directory,
1102 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1103 * page table and the remaining bits are in the page directory */
1104 if (amdgpu_vm_block_size == -1)
1105 return;
1106
1107 if (amdgpu_vm_block_size < 9) {
1108 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1109 amdgpu_vm_block_size);
1110 amdgpu_vm_block_size = -1;
1111 }
1112 }
1113
1114 /**
1115 * amdgpu_device_check_vm_size - validate the vm size
1116 *
1117 * @adev: amdgpu_device pointer
1118 *
1119 * Validates the vm size in GB specified via module parameter.
1120 * The VM size is the size of the GPU virtual memory space in GB.
1121 */
amdgpu_device_check_vm_size(struct amdgpu_device * adev)1122 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1123 {
1124 /* no need to check the default value */
1125 if (amdgpu_vm_size == -1)
1126 return;
1127
1128 if (amdgpu_vm_size < 1) {
1129 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1130 amdgpu_vm_size);
1131 amdgpu_vm_size = -1;
1132 }
1133 }
1134
amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device * adev)1135 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1136 {
1137 struct sysinfo si;
1138 bool is_os_64 = (sizeof(void *) == 8);
1139 uint64_t total_memory;
1140 uint64_t dram_size_seven_GB = 0x1B8000000;
1141 uint64_t dram_size_three_GB = 0xB8000000;
1142
1143 if (amdgpu_smu_memory_pool_size == 0)
1144 return;
1145
1146 if (!is_os_64) {
1147 DRM_WARN("Not 64-bit OS, feature not supported\n");
1148 goto def_value;
1149 }
1150 si_meminfo(&si);
1151 total_memory = (uint64_t)si.totalram * si.mem_unit;
1152
1153 if ((amdgpu_smu_memory_pool_size == 1) ||
1154 (amdgpu_smu_memory_pool_size == 2)) {
1155 if (total_memory < dram_size_three_GB)
1156 goto def_value1;
1157 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1158 (amdgpu_smu_memory_pool_size == 8)) {
1159 if (total_memory < dram_size_seven_GB)
1160 goto def_value1;
1161 } else {
1162 DRM_WARN("Smu memory pool size not supported\n");
1163 goto def_value;
1164 }
1165 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1166
1167 return;
1168
1169 def_value1:
1170 DRM_WARN("No enough system memory\n");
1171 def_value:
1172 adev->pm.smu_prv_buffer_size = 0;
1173 }
1174
/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	/* scheduler needs at least 4 jobs and a power-of-2 count */
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)){
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;	/* -1 selects the driver default */
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;	/* -1 selects the driver default */
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	/* always succeeds: invalid values are clamped or reset above */
	return 0;
}
1226
#ifndef __NetBSD__ /* XXX amdgpu vga */
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	/* BOCO-capable boards are powered down via runtime PM instead */
	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("amdgpu: switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

#ifndef __NetBSD__ /* pmf handles this for us. */
		pci_set_power_state(dev->pdev, PCI_D0);
		pci_restore_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
#endif
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("amdgpu: switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
#ifndef __NetBSD__ /* pmf handles this for us. */
		pci_save_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		/* NOTE(review): the POWER_OFF assignment sits inside this
		 * inner #ifndef, so a NetBSD build would leave the state at
		 * CHANGING — moot while the whole function is !__NetBSD__,
		 * but worth confirming if the outer guard is ever removed. */
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
#endif
	}
}
1275
1276 /**
1277 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1278 *
1279 * @pdev: pci dev pointer
1280 *
1281 * Callback for the switcheroo driver. Check of the switcheroo
1282 * state can be changed.
1283 * Returns true if the state can be changed, false if not.
1284 */
amdgpu_switcheroo_can_switch(struct pci_dev * pdev)1285 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1286 {
1287 struct drm_device *dev = pci_get_drvdata(pdev);
1288
1289 /*
1290 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1291 * locking inversion with the driver load path. And the access here is
1292 * completely racy anyway. So don't bother with locking for now.
1293 */
1294 return dev->open_count == 0;
1295 }
1296
/* vga_switcheroo client callbacks: power-state changes and switch gating
 * (no reprobe hook needed) */
static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};
#endif /* __NetBSD__ */
1303
1304 /**
1305 * amdgpu_device_ip_set_clockgating_state - set the CG state
1306 *
1307 * @dev: amdgpu_device pointer
1308 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1309 * @state: clockgating state (gate or ungate)
1310 *
1311 * Sets the requested clockgating state for all instances of
1312 * the hardware IP specified.
1313 * Returns the error code from the last instance.
1314 */
amdgpu_device_ip_set_clockgating_state(void * dev,enum amd_ip_block_type block_type,enum amd_clockgating_state state)1315 int amdgpu_device_ip_set_clockgating_state(void *dev,
1316 enum amd_ip_block_type block_type,
1317 enum amd_clockgating_state state)
1318 {
1319 struct amdgpu_device *adev = dev;
1320 int i, r = 0;
1321
1322 for (i = 0; i < adev->num_ip_blocks; i++) {
1323 if (!adev->ip_blocks[i].status.valid)
1324 continue;
1325 if (adev->ip_blocks[i].version->type != block_type)
1326 continue;
1327 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1328 continue;
1329 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1330 (void *)adev, state);
1331 if (r)
1332 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1333 adev->ip_blocks[i].version->funcs->name, r);
1334 }
1335 return r;
1336 }
1337
1338 /**
1339 * amdgpu_device_ip_set_powergating_state - set the PG state
1340 *
1341 * @dev: amdgpu_device pointer
1342 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1343 * @state: powergating state (gate or ungate)
1344 *
1345 * Sets the requested powergating state for all instances of
1346 * the hardware IP specified.
1347 * Returns the error code from the last instance.
1348 */
amdgpu_device_ip_set_powergating_state(void * dev,enum amd_ip_block_type block_type,enum amd_powergating_state state)1349 int amdgpu_device_ip_set_powergating_state(void *dev,
1350 enum amd_ip_block_type block_type,
1351 enum amd_powergating_state state)
1352 {
1353 struct amdgpu_device *adev = dev;
1354 int i, r = 0;
1355
1356 for (i = 0; i < adev->num_ip_blocks; i++) {
1357 if (!adev->ip_blocks[i].status.valid)
1358 continue;
1359 if (adev->ip_blocks[i].version->type != block_type)
1360 continue;
1361 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1362 continue;
1363 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1364 (void *)adev, state);
1365 if (r)
1366 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1367 adev->ip_blocks[i].version->funcs->name, r);
1368 }
1369 return r;
1370 }
1371
1372 /**
1373 * amdgpu_device_ip_get_clockgating_state - get the CG state
1374 *
1375 * @adev: amdgpu_device pointer
1376 * @flags: clockgating feature flags
1377 *
1378 * Walks the list of IPs on the device and updates the clockgating
1379 * flags for each IP.
1380 * Updates @flags with the feature flags for each hardware IP where
1381 * clockgating is enabled.
1382 */
amdgpu_device_ip_get_clockgating_state(struct amdgpu_device * adev,u32 * flags)1383 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1384 u32 *flags)
1385 {
1386 int i;
1387
1388 for (i = 0; i < adev->num_ip_blocks; i++) {
1389 if (!adev->ip_blocks[i].status.valid)
1390 continue;
1391 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1392 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1393 }
1394 }
1395
1396 /**
1397 * amdgpu_device_ip_wait_for_idle - wait for idle
1398 *
1399 * @adev: amdgpu_device pointer
1400 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1401 *
1402 * Waits for the request hardware IP to be idle.
1403 * Returns 0 for success or a negative error code on failure.
1404 */
amdgpu_device_ip_wait_for_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1405 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1406 enum amd_ip_block_type block_type)
1407 {
1408 int i, r;
1409
1410 for (i = 0; i < adev->num_ip_blocks; i++) {
1411 if (!adev->ip_blocks[i].status.valid)
1412 continue;
1413 if (adev->ip_blocks[i].version->type == block_type) {
1414 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1415 if (r)
1416 return r;
1417 break;
1418 }
1419 }
1420 return 0;
1421
1422 }
1423
1424 /**
1425 * amdgpu_device_ip_is_idle - is the hardware IP idle
1426 *
1427 * @adev: amdgpu_device pointer
1428 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1429 *
1430 * Check if the hardware IP is idle or not.
1431 * Returns true if it the IP is idle, false if not.
1432 */
amdgpu_device_ip_is_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1433 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1434 enum amd_ip_block_type block_type)
1435 {
1436 int i;
1437
1438 for (i = 0; i < adev->num_ip_blocks; i++) {
1439 if (!adev->ip_blocks[i].status.valid)
1440 continue;
1441 if (adev->ip_blocks[i].version->type == block_type)
1442 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1443 }
1444 return true;
1445
1446 }
1447
1448 /**
1449 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1450 *
1451 * @adev: amdgpu_device pointer
1452 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1453 *
1454 * Returns a pointer to the hardware IP block structure
1455 * if it exists for the asic, otherwise NULL.
1456 */
1457 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device * adev,enum amd_ip_block_type type)1458 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1459 enum amd_ip_block_type type)
1460 {
1461 int i;
1462
1463 for (i = 0; i < adev->num_ip_blocks; i++)
1464 if (adev->ip_blocks[i].version->type == type)
1465 return &adev->ip_blocks[i];
1466
1467 return NULL;
1468 }
1469
1470 /**
1471 * amdgpu_device_ip_block_version_cmp
1472 *
1473 * @adev: amdgpu_device pointer
1474 * @type: enum amd_ip_block_type
1475 * @major: major version
1476 * @minor: minor version
1477 *
1478 * return 0 if equal or greater
1479 * return 1 if smaller or the ip_block doesn't exist
1480 */
amdgpu_device_ip_block_version_cmp(struct amdgpu_device * adev,enum amd_ip_block_type type,u32 major,u32 minor)1481 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1482 enum amd_ip_block_type type,
1483 u32 major, u32 minor)
1484 {
1485 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1486
1487 if (ip_block && ((ip_block->version->major > major) ||
1488 ((ip_block->version->major == major) &&
1489 (ip_block->version->minor >= minor))))
1490 return 0;
1491
1492 return 1;
1493 }
1494
1495 /**
1496 * amdgpu_device_ip_block_add
1497 *
1498 * @adev: amdgpu_device pointer
1499 * @ip_block_version: pointer to the IP to add
1500 *
1501 * Adds the IP block driver information to the collection of IPs
1502 * on the asic.
1503 */
amdgpu_device_ip_block_add(struct amdgpu_device * adev,const struct amdgpu_ip_block_version * ip_block_version)1504 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1505 const struct amdgpu_ip_block_version *ip_block_version)
1506 {
1507 if (!ip_block_version)
1508 return -EINVAL;
1509
1510 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1511 ip_block_version->funcs->name);
1512
1513 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1514
1515 return 0;
1516 }
1517
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enabled the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configues the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev->ddev;
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		/* parameter format: "addr[,crtcs];addr[,crtcs];..." where
		 * addr is a PCI address or "all"; strsep() modifies the
		 * string in place, so work on a writable copy */
		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				/* optional ",N" suffix selects the crtc count */
				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					/* clamp to the supported 1..6 range */
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					/* missing or unparsable count: one crtc */
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}
1574
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them availale to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[30];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	/* asics with no gpu_info firmware image just return success here */
	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_VEGA20:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		/* raven2 / picasso / raven share the asic type but need
		 * different firmware images */
		if (adev->rev_id >= 8)
			chip_name = "raven2";
		else if (adev->pdev->device == 0x15d8)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	/* sanity-check the loaded image against its header */
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/* with IP discovery the gfx config comes from the discovery
		 * table instead; only the soc bounding box is taken below */
		if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		/* v1.1 appended two extra gfx config fields */
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in disocovery table,
		 * we always need to parse it from gpu info firmware.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}
1731
/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	/* determine the family and register the per-family IP block lists */
	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
			adev->family = AMDGPU_FAMILY_CI;
		else
			adev->family = AMDGPU_FAMILY_KV;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
	case CHIP_RAVEN:
	case CHIP_ARCTURUS:
	case CHIP_RENOIR:
		if (adev->asic_type == CHIP_RAVEN ||
		    adev->asic_type == CHIP_RENOIR)
			adev->family = AMDGPU_FAMILY_RV;
		else
			adev->family = AMDGPU_FAMILY_AI;

		r = soc15_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
		adev->family = AMDGPU_FAMILY_NV;

		r = nv_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* FIXME: not supported yet */
		return -EINVAL;
	}

	r = amdgpu_device_parse_gpu_info_fw(adev);
	if (r)
		return r;

	/* on navi+ the gfx configuration comes from the IP discovery table */
	if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
		amdgpu_discovery_get_gfx_info(adev);

	amdgpu_amdkfd_device_probe(adev);

	if (amdgpu_sriov_vf(adev)) {
		/* ask the host for exclusive gpu access for the init phase */
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return -EAGAIN;
	}

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	/* GFXOFF is disabled under SR-IOV and with HWS-less KFD scheduling */
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		/* amdgpu_ip_block_mask is a debug knob to disable IPs by index */
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					/* -ENOENT means "IP not present on this asic",
					 * which is not a fatal error */
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}
		}
	}

	/* apply the clockgating/powergating debug masks */
	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}
1885
amdgpu_device_ip_hw_init_phase1(struct amdgpu_device * adev)1886 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1887 {
1888 int i, r;
1889
1890 for (i = 0; i < adev->num_ip_blocks; i++) {
1891 if (!adev->ip_blocks[i].status.sw)
1892 continue;
1893 if (adev->ip_blocks[i].status.hw)
1894 continue;
1895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1896 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1898 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1899 if (r) {
1900 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1901 adev->ip_blocks[i].version->funcs->name, r);
1902 return r;
1903 }
1904 adev->ip_blocks[i].status.hw = true;
1905 }
1906 }
1907
1908 return 0;
1909 }
1910
amdgpu_device_ip_hw_init_phase2(struct amdgpu_device * adev)1911 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1912 {
1913 int i, r;
1914
1915 for (i = 0; i < adev->num_ip_blocks; i++) {
1916 if (!adev->ip_blocks[i].status.sw)
1917 continue;
1918 if (adev->ip_blocks[i].status.hw)
1919 continue;
1920 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1921 if (r) {
1922 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1923 adev->ip_blocks[i].version->funcs->name, r);
1924 return r;
1925 }
1926 adev->ip_blocks[i].status.hw = true;
1927 }
1928
1929 return 0;
1930 }
1931
/* Load device firmware: bring up (or resume) the PSP block on VEGA10+
 * so it can load the microcode, then load the SMU firmware. */
static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
				continue;

			/* no need to do the fw loading again if already done*/
			if (adev->ip_blocks[i].status.hw == true)
				break;

			/* after a reset or on resume, re-run PSP resume
			 * instead of a full hw_init */
			if (adev->in_gpu_reset || adev->in_suspend) {
				r = adev->ip_blocks[i].version->funcs->resume(adev);
				if (r) {
					DRM_ERROR("resume of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				}
			} else {
				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
				if (r) {
					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				}
			}

			adev->ip_blocks[i].status.hw = true;
			/* only the first PSP-typed block is handled */
			break;
		}
	}

	/* SMU firmware is driver-loaded, except under SR-IOV — with TONGA
	 * as the exception to that exception */
	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);

	return r;
}
1973
/**
 * amdgpu_device_ip_init - run init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run. sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	/* first pass: sw_init every valid block; GMC additionally gets its
	 * hw_init here because later allocations need VRAM access */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r) {
			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			goto init_failed;
		}
		adev->ip_blocks[i].status.sw = true;

		/* need to do gmc hw init early so we can allocate gpu mem */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			r = amdgpu_device_vram_scratch_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
			if (r) {
				DRM_ERROR("hw_init %d failed %d\n", i, r);
				goto init_failed;
			}
			/* writeback slots also need gpu-accessible memory */
			r = amdgpu_device_wb_init(adev);
			if (r) {
				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM,
							       AMDGPU_CSA_SIZE);
				if (r) {
					DRM_ERROR("allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	/* hw init proper runs in three stages: phase 1 (COMMON/IH/PSP),
	 * firmware loading, then phase 2 for all remaining blocks */
	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 * recovery_init may fail, but it can free all resources allocated by
	 * itself and its failure should not stop amdgpu init process.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	amdgpu_ras_recovery_init(adev);

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);
	amdgpu_amdkfd_device_init(adev);

init_failed:
	/* reached on success too: under SR-IOV, always give full gpu
	 * access back to the host after init */
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	return r;
}
2085
2086 /**
2087 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2088 *
2089 * @adev: amdgpu_device pointer
2090 *
2091 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2092 * this function before a GPU reset. If the value is retained after a
2093 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents.
2094 */
amdgpu_device_fill_reset_magic(struct amdgpu_device * adev)2095 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2096 {
2097 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2098 }
2099
2100 /**
2101 * amdgpu_device_check_vram_lost - check if vram is valid
2102 *
2103 * @adev: amdgpu_device pointer
2104 *
2105 * Checks the reset magic value written to the gart pointer in VRAM.
2106 * The driver calls this after a GPU reset to see if the contents of
2107 * VRAM is lost or now.
2108 * returns true if vram is lost, false if not.
2109 */
amdgpu_device_check_vram_lost(struct amdgpu_device * adev)2110 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2111 {
2112 return !!memcmp(adev->gart.ptr, adev->reset_magic,
2113 AMDGPU_RESET_MAGIC_NUM);
2114 }
2115
2116 /**
2117 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2118 *
2119 * @adev: amdgpu_device pointer
2120 * @state: clockgating state (gate or ungate)
2121 *
2122 * The list of all the hardware IPs that make up the asic is walked and the
2123 * set_clockgating_state callbacks are run.
2124 * Late initialization pass enabling clockgating for hardware IPs.
2125 * Fini or suspend, pass disabling clockgating for hardware IPs.
2126 * Returns 0 on success, negative error code on failure.
2127 */
2128
amdgpu_device_set_cg_state(struct amdgpu_device * adev,enum amd_clockgating_state state)2129 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2130 enum amd_clockgating_state state)
2131 {
2132 int i, j, r;
2133
2134 if (amdgpu_emu_mode == 1)
2135 return 0;
2136
2137 for (j = 0; j < adev->num_ip_blocks; j++) {
2138 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2139 if (!adev->ip_blocks[i].status.late_initialized)
2140 continue;
2141 /* skip CG for VCE/UVD, it's handled specially */
2142 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2143 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2144 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2145 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2146 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2147 /* enable clockgating to save power */
2148 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2149 state);
2150 if (r) {
2151 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2152 adev->ip_blocks[i].version->funcs->name, r);
2153 return r;
2154 }
2155 }
2156 }
2157
2158 return 0;
2159 }
2160
amdgpu_device_set_pg_state(struct amdgpu_device * adev,enum amd_powergating_state state)2161 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2162 {
2163 int i, j, r;
2164
2165 if (amdgpu_emu_mode == 1)
2166 return 0;
2167
2168 for (j = 0; j < adev->num_ip_blocks; j++) {
2169 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2170 if (!adev->ip_blocks[i].status.late_initialized)
2171 continue;
2172 /* skip CG for VCE/UVD, it's handled specially */
2173 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2174 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2177 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2178 /* enable powergating to save power */
2179 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2180 state);
2181 if (r) {
2182 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2183 adev->ip_blocks[i].version->funcs->name, r);
2184 return r;
2185 }
2186 }
2187 }
2188 return 0;
2189 }
2190
amdgpu_device_enable_mgpu_fan_boost(void)2191 static int amdgpu_device_enable_mgpu_fan_boost(void)
2192 {
2193 struct amdgpu_gpu_instance *gpu_ins;
2194 struct amdgpu_device *adev;
2195 int i, ret = 0;
2196
2197 mutex_lock(&mgpu_info.mutex);
2198
2199 /*
2200 * MGPU fan boost feature should be enabled
2201 * only when there are two or more dGPUs in
2202 * the system
2203 */
2204 if (mgpu_info.num_dgpu < 2)
2205 goto out;
2206
2207 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2208 gpu_ins = &(mgpu_info.gpu_ins[i]);
2209 adev = gpu_ins->adev;
2210 if (!(adev->flags & AMD_IS_APU) &&
2211 !gpu_ins->mgpu_fan_enabled &&
2212 adev->powerplay.pp_funcs &&
2213 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2214 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2215 if (ret)
2216 break;
2217
2218 gpu_ins->mgpu_fan_enabled = 1;
2219 }
2220 }
2221
2222 out:
2223 mutex_unlock(&mgpu_info.mutex);
2224
2225 return ret;
2226 }
2227
2228 /**
2229 * amdgpu_device_ip_late_init - run late init for hardware IPs
2230 *
2231 * @adev: amdgpu_device pointer
2232 *
2233 * Late initialization pass for hardware IPs. The list of all the hardware
2234 * IPs that make up the asic is walked and the late_init callbacks are run.
2235 * late_init covers any special initialization that an IP requires
2236 * after all of the have been initialized or something that needs to happen
2237 * late in the init process.
2238 * Returns 0 on success, negative error code on failure.
2239 */
amdgpu_device_ip_late_init(struct amdgpu_device * adev)2240 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2241 {
2242 struct amdgpu_gpu_instance *gpu_instance;
2243 int i = 0, r;
2244
2245 for (i = 0; i < adev->num_ip_blocks; i++) {
2246 if (!adev->ip_blocks[i].status.hw)
2247 continue;
2248 if (adev->ip_blocks[i].version->funcs->late_init) {
2249 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2250 if (r) {
2251 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2252 adev->ip_blocks[i].version->funcs->name, r);
2253 return r;
2254 }
2255 }
2256 adev->ip_blocks[i].status.late_initialized = true;
2257 }
2258
2259 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2260 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2261
2262 amdgpu_device_fill_reset_magic(adev);
2263
2264 r = amdgpu_device_enable_mgpu_fan_boost();
2265 if (r)
2266 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2267
2268
2269 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2270 mutex_lock(&mgpu_info.mutex);
2271
2272 /*
2273 * Reset device p-state to low as this was booted with high.
2274 *
2275 * This should be performed only after all devices from the same
2276 * hive get initialized.
2277 *
2278 * However, it's unknown how many device in the hive in advance.
2279 * As this is counted one by one during devices initializations.
2280 *
2281 * So, we wait for all XGMI interlinked devices initialized.
2282 * This may bring some delays as those devices may come from
2283 * different hives. But that should be OK.
2284 */
2285 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2286 for (i = 0; i < mgpu_info.num_gpu; i++) {
2287 gpu_instance = &(mgpu_info.gpu_ins[i]);
2288 if (gpu_instance->adev->flags & AMD_IS_APU)
2289 continue;
2290
2291 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
2292 if (r) {
2293 DRM_ERROR("pstate setting failed (%d).\n", r);
2294 break;
2295 }
2296 }
2297 }
2298
2299 mutex_unlock(&mgpu_info.mutex);
2300 }
2301
2302 return 0;
2303 }
2304
2305 /**
2306 * amdgpu_device_ip_fini - run fini for hardware IPs
2307 *
2308 * @adev: amdgpu_device pointer
2309 *
2310 * Main teardown pass for hardware IPs. The list of all the hardware
2311 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2312 * are run. hw_fini tears down the hardware associated with each IP
2313 * and sw_fini tears down any software state associated with each IP.
2314 * Returns 0 on success, negative error code on failure.
2315 */
amdgpu_device_ip_fini(struct amdgpu_device * adev)2316 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2317 {
2318 int i, r;
2319
2320 amdgpu_ras_pre_fini(adev);
2321
2322 if (adev->gmc.xgmi.num_physical_nodes > 1)
2323 amdgpu_xgmi_remove_device(adev);
2324
2325 amdgpu_amdkfd_device_fini(adev);
2326
2327 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2328 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2329
2330 /* need to disable SMC first */
2331 for (i = 0; i < adev->num_ip_blocks; i++) {
2332 if (!adev->ip_blocks[i].status.hw)
2333 continue;
2334 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2335 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2336 /* XXX handle errors */
2337 if (r) {
2338 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2339 adev->ip_blocks[i].version->funcs->name, r);
2340 }
2341 adev->ip_blocks[i].status.hw = false;
2342 break;
2343 }
2344 }
2345
2346 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2347 if (!adev->ip_blocks[i].status.hw)
2348 continue;
2349
2350 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2351 /* XXX handle errors */
2352 if (r) {
2353 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2354 adev->ip_blocks[i].version->funcs->name, r);
2355 }
2356
2357 adev->ip_blocks[i].status.hw = false;
2358 }
2359
2360
2361 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2362 if (!adev->ip_blocks[i].status.sw)
2363 continue;
2364
2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2366 amdgpu_ucode_free_bo(adev);
2367 amdgpu_free_static_csa(&adev->virt.csa_obj);
2368 amdgpu_device_wb_fini(adev);
2369 amdgpu_device_vram_scratch_fini(adev);
2370 amdgpu_ib_pool_fini(adev);
2371 }
2372
2373 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2374 /* XXX handle errors */
2375 if (r) {
2376 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2377 adev->ip_blocks[i].version->funcs->name, r);
2378 }
2379 adev->ip_blocks[i].status.sw = false;
2380 adev->ip_blocks[i].status.valid = false;
2381 }
2382
2383 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2384 if (!adev->ip_blocks[i].status.late_initialized)
2385 continue;
2386 if (adev->ip_blocks[i].version->funcs->late_fini)
2387 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2388 adev->ip_blocks[i].status.late_initialized = false;
2389 }
2390
2391 amdgpu_ras_fini(adev);
2392
2393 if (amdgpu_sriov_vf(adev))
2394 if (amdgpu_virt_release_full_gpu(adev, false))
2395 DRM_ERROR("failed to release exclusive mode on fini\n");
2396
2397 return 0;
2398 }
2399
2400 /**
2401 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2402 *
2403 * @work: work_struct.
2404 */
amdgpu_device_delayed_init_work_handler(struct work_struct * work)2405 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2406 {
2407 struct amdgpu_device *adev =
2408 container_of(work, struct amdgpu_device, delayed_init_work.work);
2409 int r;
2410
2411 r = amdgpu_ib_ring_tests(adev);
2412 if (r)
2413 DRM_ERROR("ib ring test failed (%d).\n", r);
2414 }
2415
amdgpu_device_delay_enable_gfx_off(struct work_struct * work)2416 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2417 {
2418 struct amdgpu_device *adev =
2419 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2420
2421 mutex_lock(&adev->gfx.gfx_off_mutex);
2422 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2423 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2424 adev->gfx.gfx_off_state = true;
2425 }
2426 mutex_unlock(&adev->gfx.gfx_off_mutex);
2427 }
2428
2429 /**
2430 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2431 *
2432 * @adev: amdgpu_device pointer
2433 *
2434 * Main suspend function for hardware IPs. The list of all the hardware
2435 * IPs that make up the asic is walked, clockgating is disabled and the
2436 * suspend callbacks are run. suspend puts the hardware and software state
2437 * in each IP into a state suitable for suspend.
2438 * Returns 0 on success, negative error code on failure.
2439 */
amdgpu_device_ip_suspend_phase1(struct amdgpu_device * adev)2440 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2441 {
2442 int i, r;
2443
2444 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2445 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2446
2447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2448 if (!adev->ip_blocks[i].status.valid)
2449 continue;
2450 /* displays are handled separately */
2451 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
2452 /* XXX handle errors */
2453 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2454 /* XXX handle errors */
2455 if (r) {
2456 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2457 adev->ip_blocks[i].version->funcs->name, r);
2458 return r;
2459 }
2460 adev->ip_blocks[i].status.hw = false;
2461 }
2462 }
2463
2464 return 0;
2465 }
2466
2467 /**
2468 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2469 *
2470 * @adev: amdgpu_device pointer
2471 *
2472 * Main suspend function for hardware IPs. The list of all the hardware
2473 * IPs that make up the asic is walked, clockgating is disabled and the
2474 * suspend callbacks are run. suspend puts the hardware and software state
2475 * in each IP into a state suitable for suspend.
2476 * Returns 0 on success, negative error code on failure.
2477 */
amdgpu_device_ip_suspend_phase2(struct amdgpu_device * adev)2478 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2479 {
2480 int i, r __unused;
2481
2482 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2483 if (!adev->ip_blocks[i].status.valid)
2484 continue;
2485 /* displays are handled in phase1 */
2486 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2487 continue;
2488 /* PSP lost connection when err_event_athub occurs */
2489 if (amdgpu_ras_intr_triggered() &&
2490 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2491 adev->ip_blocks[i].status.hw = false;
2492 continue;
2493 }
2494 /* XXX handle errors */
2495 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2496 /* XXX handle errors */
2497 if (r) {
2498 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2499 adev->ip_blocks[i].version->funcs->name, r);
2500 }
2501 adev->ip_blocks[i].status.hw = false;
2502 /* handle putting the SMC in the appropriate state */
2503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2504 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2505 if (r) {
2506 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2507 adev->mp1_state, r);
2508 return r;
2509 }
2510 }
2511
2512 adev->ip_blocks[i].status.hw = false;
2513 }
2514
2515 return 0;
2516 }
2517
2518 /**
2519 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2520 *
2521 * @adev: amdgpu_device pointer
2522 *
2523 * Main suspend function for hardware IPs. The list of all the hardware
2524 * IPs that make up the asic is walked, clockgating is disabled and the
2525 * suspend callbacks are run. suspend puts the hardware and software state
2526 * in each IP into a state suitable for suspend.
2527 * Returns 0 on success, negative error code on failure.
2528 */
amdgpu_device_ip_suspend(struct amdgpu_device * adev)2529 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2530 {
2531 int r;
2532
2533 if (amdgpu_sriov_vf(adev))
2534 amdgpu_virt_request_full_gpu(adev, false);
2535
2536 r = amdgpu_device_ip_suspend_phase1(adev);
2537 if (r)
2538 return r;
2539 r = amdgpu_device_ip_suspend_phase2(adev);
2540
2541 if (amdgpu_sriov_vf(adev))
2542 amdgpu_virt_release_full_gpu(adev, false);
2543
2544 return r;
2545 }
2546
/*
 * Re-run hw_init for the early SR-IOV recovery set (GMC, COMMON, PSP, IH)
 * in the fixed order given by ip_order, used when recovering a virtual
 * function. Returns 0 on success, the failing hw_init's error otherwise.
 *
 * NOTE(review): status.hw is cleared for *every* block on each outer pass,
 * even blocks whose type does not match ip_order[i] — only matching blocks
 * get re-initialized and re-marked. Confirm this blanket clearing is
 * intended before restructuring.
 */
static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_GMC,
		AMD_IP_BLOCK_TYPE_COMMON,
		AMD_IP_BLOCK_TYPE_PSP,
		AMD_IP_BLOCK_TYPE_IH,
	};

	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
		int j;
		struct amdgpu_ip_block *block;

		for (j = 0; j < adev->num_ip_blocks; j++) {
			block = &adev->ip_blocks[j];

			/* cleared unconditionally, before the type filter */
			block->status.hw = false;
			if (block->version->type != ip_order[i] ||
			    !block->status.valid)
				continue;

			r = block->version->funcs->hw_init(adev);
			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}
2580
/*
 * Re-run init for the late SR-IOV recovery set (SMC, DCE, GFX, SDMA, UVD,
 * VCE, VCN) in the fixed order given by ip_order. The SMC block is brought
 * back via its resume callback; all others via hw_init. Blocks already
 * marked hw are skipped. Returns 0 on success, the first callback error
 * otherwise.
 */
static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
{
	int i, r;

	static enum amd_ip_block_type ip_order[] = {
		AMD_IP_BLOCK_TYPE_SMC,
		AMD_IP_BLOCK_TYPE_DCE,
		AMD_IP_BLOCK_TYPE_GFX,
		AMD_IP_BLOCK_TYPE_SDMA,
		AMD_IP_BLOCK_TYPE_UVD,
		AMD_IP_BLOCK_TYPE_VCE,
		AMD_IP_BLOCK_TYPE_VCN
	};

	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
		int j;
		struct amdgpu_ip_block *block;

		for (j = 0; j < adev->num_ip_blocks; j++) {
			block = &adev->ip_blocks[j];

			if (block->version->type != ip_order[i] ||
			    !block->status.valid ||
			    block->status.hw)
				continue;

			/* SMC is resumed rather than re-hw_init'ed */
			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
				r = block->version->funcs->resume(adev);
			else
				r = block->version->funcs->hw_init(adev);

			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}
2621
2622 /**
2623 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2624 *
2625 * @adev: amdgpu_device pointer
2626 *
2627 * First resume function for hardware IPs. The list of all the hardware
2628 * IPs that make up the asic is walked and the resume callbacks are run for
2629 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2630 * after a suspend and updates the software state as necessary. This
2631 * function is also used for restoring the GPU after a GPU reset.
2632 * Returns 0 on success, negative error code on failure.
2633 */
amdgpu_device_ip_resume_phase1(struct amdgpu_device * adev)2634 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2635 {
2636 int i, r;
2637
2638 for (i = 0; i < adev->num_ip_blocks; i++) {
2639 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2640 continue;
2641 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2642 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2643 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2644
2645 r = adev->ip_blocks[i].version->funcs->resume(adev);
2646 if (r) {
2647 DRM_ERROR("resume of IP block <%s> failed %d\n",
2648 adev->ip_blocks[i].version->funcs->name, r);
2649 return r;
2650 }
2651 adev->ip_blocks[i].status.hw = true;
2652 }
2653 }
2654
2655 return 0;
2656 }
2657
2658 /**
2659 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2660 *
2661 * @adev: amdgpu_device pointer
2662 *
2663 * First resume function for hardware IPs. The list of all the hardware
2664 * IPs that make up the asic is walked and the resume callbacks are run for
2665 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2666 * functional state after a suspend and updates the software state as
2667 * necessary. This function is also used for restoring the GPU after a GPU
2668 * reset.
2669 * Returns 0 on success, negative error code on failure.
2670 */
amdgpu_device_ip_resume_phase2(struct amdgpu_device * adev)2671 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2672 {
2673 int i, r;
2674
2675 for (i = 0; i < adev->num_ip_blocks; i++) {
2676 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2677 continue;
2678 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2679 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2680 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2681 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2682 continue;
2683 r = adev->ip_blocks[i].version->funcs->resume(adev);
2684 if (r) {
2685 DRM_ERROR("resume of IP block <%s> failed %d\n",
2686 adev->ip_blocks[i].version->funcs->name, r);
2687 return r;
2688 }
2689 adev->ip_blocks[i].status.hw = true;
2690 }
2691
2692 return 0;
2693 }
2694
2695 /**
2696 * amdgpu_device_ip_resume - run resume for hardware IPs
2697 *
2698 * @adev: amdgpu_device pointer
2699 *
2700 * Main resume function for hardware IPs. The hardware IPs
2701 * are split into two resume functions because they are
2702 * are also used in in recovering from a GPU reset and some additional
2703 * steps need to be take between them. In this case (S3/S4) they are
2704 * run sequentially.
2705 * Returns 0 on success, negative error code on failure.
2706 */
amdgpu_device_ip_resume(struct amdgpu_device * adev)2707 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2708 {
2709 int r;
2710
2711 r = amdgpu_device_ip_resume_phase1(adev);
2712 if (r)
2713 return r;
2714
2715 r = amdgpu_device_fw_loading(adev);
2716 if (r)
2717 return r;
2718
2719 r = amdgpu_device_ip_resume_phase2(adev);
2720
2721 return r;
2722 }
2723
2724 /**
2725 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2726 *
2727 * @adev: amdgpu_device pointer
2728 *
2729 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2730 */
amdgpu_device_detect_sriov_bios(struct amdgpu_device * adev)2731 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2732 {
2733 if (amdgpu_sriov_vf(adev)) {
2734 if (adev->is_atom_fw) {
2735 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2736 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2737 } else {
2738 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2739 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2740 }
2741
2742 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2744 }
2745 }
2746
2747 /**
2748 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2749 *
2750 * @asic_type: AMD asic type
2751 *
2752 * Check if there is DC (new modesetting infrastructre) support for an asic.
2753 * returns true if DC has support, false if not.
2754 */
amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)2755 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2756 {
2757 switch (asic_type) {
2758 #if defined(CONFIG_DRM_AMD_DC)
2759 case CHIP_BONAIRE:
2760 case CHIP_KAVERI:
2761 case CHIP_KABINI:
2762 case CHIP_MULLINS:
2763 /*
2764 * We have systems in the wild with these ASICs that require
2765 * LVDS and VGA support which is not supported with DC.
2766 *
2767 * Fallback to the non-DC driver here by default so as not to
2768 * cause regressions.
2769 */
2770 return amdgpu_dc > 0;
2771 case CHIP_HAWAII:
2772 case CHIP_CARRIZO:
2773 case CHIP_STONEY:
2774 case CHIP_POLARIS10:
2775 case CHIP_POLARIS11:
2776 case CHIP_POLARIS12:
2777 case CHIP_VEGAM:
2778 case CHIP_TONGA:
2779 case CHIP_FIJI:
2780 case CHIP_VEGA10:
2781 case CHIP_VEGA12:
2782 case CHIP_VEGA20:
2783 #if defined(CONFIG_DRM_AMD_DC_DCN)
2784 case CHIP_RAVEN:
2785 case CHIP_NAVI10:
2786 case CHIP_NAVI14:
2787 case CHIP_NAVI12:
2788 case CHIP_RENOIR:
2789 #endif
2790 return amdgpu_dc != 0;
2791 #endif
2792 default:
2793 if (amdgpu_dc > 0)
2794 DRM_INFO("Display Core has been requested via kernel parameter "
2795 "but isn't supported by ASIC, ignoring\n");
2796 return false;
2797 }
2798 }
2799
2800 /**
2801 * amdgpu_device_has_dc_support - check if dc is supported
2802 *
2803 * @adev: amdgpu_device_pointer
2804 *
2805 * Returns true for supported, false for not supported
2806 */
amdgpu_device_has_dc_support(struct amdgpu_device * adev)2807 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2808 {
2809 if (amdgpu_sriov_vf(adev))
2810 return false;
2811
2812 return amdgpu_device_asic_has_dc_support(adev->asic_type);
2813 }
2814
2815
/*
 * Work handler that resets one device of an XGMI hive, coordinated with the
 * other devices' reset works via the hive's task barrier. Stores the result
 * in adev->asic_reset_res and logs on failure; does not return a value.
 */
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);

	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		/* BACO: all devices enter together, then all exit together */
		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;
	} else {

		/* non-BACO: synchronize fully, then each device resets itself */
		task_barrier_full(&hive->tb);
		adev->asic_reset_res = amdgpu_asic_reset(adev);
	}

fail:
	/* reached on success too; only logs when a reset step failed */
	if (adev->asic_reset_res)
		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev->ddev->unique);
}
2856
/*
 * Parse the amdgpu_lockup_timeout module parameter into per-engine job
 * timeouts (jiffies) on @adev. The string is a comma-separated list whose
 * positions map, via the switch below, to gfx, compute, sdma, and video;
 * a zero entry keeps that engine's default, a negative entry means no
 * timeout (MAX_SCHEDULE_TIMEOUT). Returns 0 on success or the kstrtol
 * error for an unparsable entry.
 */
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
	char *input = amdgpu_lockup_timeout;
	char *timeout_setting = NULL;
	int index = 0;
	long timeout;
	int ret = 0;

	/*
	 * By default timeout for non compute jobs is 10000.
	 * And there is no timeout enforced on compute jobs.
	 * In SR-IOV or passthrough mode, timeout for compute
	 * jobs are 10000 by default.
	 */
	adev->gfx_timeout = msecs_to_jiffies(10000);
	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
		adev->compute_timeout = adev->gfx_timeout;
	else
		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;

	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		/* strsep consumes the parameter string in place */
		while ((timeout_setting = strsep(&input, ",")) &&
		       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;

			if (timeout == 0) {
				/* zero keeps the default but still advances
				 * the positional index */
				index++;
				continue;
			} else if (timeout < 0) {
				timeout = MAX_SCHEDULE_TIMEOUT;
			} else {
				timeout = msecs_to_jiffies(timeout);
			}

			/* position in the list selects the engine */
			switch (index++) {
			case 0:
				adev->gfx_timeout = timeout;
				break;
			case 1:
				adev->compute_timeout = timeout;
				break;
			case 2:
				adev->sdma_timeout = timeout;
				break;
			case 3:
				adev->video_timeout = timeout;
				break;
			default:
				break;
			}
		}
		/*
		 * There is only one value specified and
		 * it should apply to all non-compute jobs.
		 */
		if (index == 1) {
			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
				adev->compute_timeout = adev->gfx_timeout;
		}
	}

	return ret;
}
2924
2925 /**
2926 * amdgpu_device_init - initialize the driver
2927 *
2928 * @adev: amdgpu_device pointer
2929 * @ddev: drm dev pointer
2930 * @pdev: pci dev pointer
2931 * @flags: driver flags
2932 *
2933 * Initializes the driver info and hw (all asics).
2934 * Returns 0 for success or an error on failure.
2935 * Called at driver startup.
2936 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       struct drm_device *ddev,
		       struct pci_dev *pdev,
		       uint32_t flags)
{
	int r, i;
	bool boco = false;
	u32 max_MBps;

	adev->shutdown = false;
	adev->dev = pci_dev_dev(pdev);
	adev->ddev = ddev;
	adev->pdev = pdev;
	adev->flags = flags;

	/* The module parameter can force a specific ASIC for debugging;
	 * otherwise the type is encoded in the PCI match flags. */
	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 2;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	/* Point every register accessor at a loud stub until the IP blocks
	 * install the real callbacks; catches accesses that happen too early. */
	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initialization are all done here so we
	 * can recall function without having locking issues */
	atomic_set(&adev->irq.ih.lock, 0);
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->lock_reset);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);

	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->ring_lru_list);
	spin_lock_init(&adev->ring_lru_list_lock);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	adev->gfx.gfx_off_req_count = 1;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

#ifdef __NetBSD__
	/* Map the register BAR through bus_space instead of ioremap. */
	if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(5),
		pci_mapreg_type(adev->pdev->pd_pa.pa_pc,
		    adev->pdev->pd_pa.pa_tag, PCI_BAR(5)),
		0,
		&adev->rmmiot, &adev->rmmioh,
		&adev->rmmio_base, &adev->rmmio_size))
		return -EIO;
	DRM_INFO("register mmio base: 0x%8"PRIXMAX"\n",
	    (uintmax_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %"PRIuMAX"\n",
	    (uintmax_t)adev->rmmio_size);
#else
	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (adev->rmmio == NULL) {
		return -ENOMEM;
	}
	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
#endif

	/* io port mapping */
	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
#ifdef __NetBSD__
		if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(i),
			PCI_MAPREG_TYPE_IO, 0,
			&adev->rio_memt, &adev->rio_memh,
			NULL, &adev->rio_mem_size) == 0)
			break;
#else
		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
			break;
		}
#endif
	}
#ifdef __NetBSD__
	if (i == DEVICE_COUNT_RESOURCE)
#else
	if (adev->rio_mem == NULL)
#endif
		DRM_INFO("PCI I/O BAR is not found.\n");

	/* enable PCIE atomic ops */
#ifndef __NetBSD__		/* XXX amdgpu pcie atomics */
	r = pci_enable_atomic_ops_to_root(adev->pdev,
					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	if (r) {
		adev->have_atomics_support = false;
		DRM_INFO("PCIE atomic ops is not supported\n");
	} else {
		adev->have_atomics_support = true;
	}
#endif

	amdgpu_device_get_pcie_info(adev);

	if (amdgpu_mcbp)
		DRM_INFO("MCBP is enabled\n");

	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
		adev->enable_mes = true;

	/* Navi and newer describe their IP blocks in a discovery table. */
	if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
		r = amdgpu_discovery_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_discovery_init failed\n");
			return r;
		}
	}

	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_device_doorbell_init(adev);

#ifndef __NetBSD__		/* XXX amdgpu vga */
	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it */
	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);

	if (amdgpu_device_supports_boco(ddev))
		boco = true;
	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    !pci_is_thunderbolt_attached(adev->pdev))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, boco);
	if (boco)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
#endif

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	/* detect if we are with an SRIOV vbios */
	amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 *  E.g., driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		r = amdgpu_asic_reset(adev);
		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		DRM_INFO("GPU posting now...\n");
		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->is_atom_fw) {
		/* Initialize clocks */
		r = amdgpu_atomfirmware_get_clock_info(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
			goto failed;
		}
	} else {
		/* Initialize clocks */
		r = amdgpu_atombios_get_clock_info(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
			goto failed;
		}
		/* init i2c buses */
		if (!amdgpu_device_has_dc_support(adev))
			amdgpu_atombios_i2c_init(adev);
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev->ddev);

	r = amdgpu_device_ip_init(adev);
	if (r) {
		/* failed in exclusive mode due to timeout */
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    amdgpu_virt_mmio_blocked(adev) &&
		    !amdgpu_virt_wait_reset(adev)) {
			dev_err(adev->dev, "VF exclusive mode timeout\n");
			/* Don't send request since VF is inactive. */
			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
			adev->virt.ops = NULL;
			r = -EAGAIN;
			goto failed;
		}
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto failed;
	}

	DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
			adev->gfx.config.max_shader_engines,
			adev->gfx.config.max_sh_per_se,
			adev->gfx.config.max_cu_per_sh,
			adev->gfx.cu_info.number);

	amdgpu_ctx_init_sched(adev);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	amdgpu_fbdev_init(adev);

	r = amdgpu_pm_sysfs_init(adev);
	if (r) {
		adev->pm_sysfs_en = false;
		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
	} else
		adev->pm_sysfs_en = true;

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	/* debugfs registration failures are logged but not fatal */
	r = amdgpu_debugfs_gem_init(adev);
	if (r)
		DRM_ERROR("registering gem debugfs failed (%d).\n", r);

	r = amdgpu_debugfs_regs_init(adev);
	if (r)
		DRM_ERROR("registering register debugfs failed (%d).\n", r);

	r = amdgpu_debugfs_firmware_init(adev);
	if (r)
		DRM_ERROR("registering firmware debugfs failed (%d).\n", r);

	r = amdgpu_debugfs_init(adev);
	if (r)
		DRM_ERROR("Creating debugfs files failed (%d).\n", r);

	if ((amdgpu_testing & 1)) {
		if (adev->accel_working)
			amdgpu_test_moves(adev);
		else
			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
	}
	if (amdgpu_benchmarking) {
		if (adev->accel_working)
			amdgpu_benchmark(adev, amdgpu_benchmarking);
		else
			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
	}

	/*
	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped due to the
	 * gpu instance is counted less.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	r = amdgpu_device_ip_late_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
		goto failed;
	}

	/* must succeed. */
	amdgpu_ras_resume(adev);

	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));

#ifndef __NetBSD__		/* XXX amdgpu sysfs */
	r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
	if (r) {
		dev_err(adev->dev, "Could not create pcie_replay_count");
		return r;
	}
#endif

	/*
	 * Brace the whole tail: previously the error check sat outside the
	 * IS_ENABLED() condition and could test a stale r when
	 * CONFIG_PERF_EVENTS was disabled.
	 */
	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
		r = amdgpu_pmu_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}

	return 0;

failed:
	amdgpu_vf_error_trans_all(adev);
	if (boco)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);

	return r;
}
3351
3352 /**
3353 * amdgpu_device_fini - tear down the driver
3354 *
3355 * @adev: amdgpu_device pointer
3356 *
3357 * Tear down the driver info (all asics).
3358 * Called at driver shutdown.
3359 */
void amdgpu_device_fini(struct amdgpu_device *adev)
{
	int r __unused;

	DRM_INFO("amdgpu: finishing device.\n");
	/* Let any still-pending delayed init work complete before teardown. */
	flush_delayed_work(&adev->delayed_init_work);
	adev->shutdown = true;

	/* disable all interrupts */
	amdgpu_irq_disable_all(adev);
	if (adev->mode_info.mode_config_initialized){
		/* Shut all displays down before the hardware goes away. */
		if (!amdgpu_device_has_dc_support(adev))
			drm_helper_force_disable_all(adev->ddev);
		else
			drm_atomic_helper_shutdown(adev->ddev);
	}
	amdgpu_fence_driver_fini(adev);
	/* Only undo the sysfs registrations that succeeded at init time. */
	if (adev->pm_sysfs_en)
		amdgpu_pm_sysfs_fini(adev);
	amdgpu_fbdev_fini(adev);
	r = amdgpu_device_ip_fini(adev);
	if (adev->firmware.gpu_info_fw) {
		release_firmware(adev->firmware.gpu_info_fw);
		adev->firmware.gpu_info_fw = NULL;
	}
	adev->accel_working = false;
	/* free i2c buses */
	if (!amdgpu_device_has_dc_support(adev))
		amdgpu_i2c_fini(adev);

	if (amdgpu_emu_mode != 1)
		amdgpu_atombios_fini(adev);

	kfree(adev->bios);
	adev->bios = NULL;
#ifndef __NetBSD__ /* XXX amdgpu vga */
	if (amdgpu_has_atpx() &&
	    (amdgpu_is_atpx_hybrid() ||
	     amdgpu_has_atpx_dgpu_power_cntl()) &&
	    !pci_is_thunderbolt_attached(adev->pdev))
		vga_switcheroo_unregister_client(adev->pdev);
	if (amdgpu_device_supports_boco(adev->ddev))
		vga_switcheroo_fini_domain_pm_ops(adev->dev);
	vga_client_register(adev->pdev, NULL, NULL, NULL);
#endif
#ifdef __NetBSD__
	/* Unmap the I/O-port BAR (if one was found at init) and the MMIO BAR. */
	if (adev->rio_mem_size)
		bus_space_unmap(adev->rio_memt, adev->rio_memh,
		    adev->rio_mem_size);
	adev->rio_mem_size = 0;
	bus_space_unmap(adev->rmmiot, adev->rmmioh, adev->rmmio_size);
#else
	if (adev->rio_mem)
		pci_iounmap(adev->pdev, adev->rio_mem);
	adev->rio_mem = NULL;
	iounmap(adev->rmmio);
	adev->rmmio = NULL;
#endif
	amdgpu_device_doorbell_fini(adev);

	amdgpu_debugfs_regs_cleanup(adev);
#ifndef __NetBSD__ /* XXX amdgpu sysfs */
	device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
#endif
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		amdgpu_pmu_fini(adev);
	amdgpu_debugfs_preempt_cleanup(adev);
	if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
		amdgpu_discovery_fini(adev);
	/* NetBSD: destroy the locks created in amdgpu_device_init, in
	 * roughly the reverse order of their creation. */
	spin_lock_destroy(&adev->ring_lru_list_lock);
	mutex_destroy(&adev->shadow_list_lock);
	spin_lock_destroy(&adev->mm_stats.lock);
	spin_lock_destroy(&adev->audio_endpt_idx_lock);
	spin_lock_destroy(&adev->se_cac_idx_lock);
	spin_lock_destroy(&adev->gc_cac_idx_lock);
	spin_lock_destroy(&adev->didt_idx_lock);
	spin_lock_destroy(&adev->uvd_ctx_idx_lock);
	spin_lock_destroy(&adev->pcie_idx_lock);
	spin_lock_destroy(&adev->smc_idx_lock);
	spin_lock_destroy(&adev->mmio_idx_lock);
	mutex_destroy(&adev->notifier_lock);
	mutex_destroy(&adev->psp.mutex);
	mutex_destroy(&adev->lock_reset);
	/* hash_destroy(adev->mn_hash)? */
	mutex_destroy(&adev->virt.vf_errors.lock);
	mutex_destroy(&adev->mn_lock);
	mutex_destroy(&adev->grbm_idx_mutex);
	mutex_destroy(&adev->gfx.gfx_off_mutex);
	mutex_destroy(&adev->gfx.pipe_reserve_mutex);
	mutex_destroy(&adev->srbm_mutex);
	mutex_destroy(&adev->gfx.gpu_clock_mutex);
	mutex_destroy(&adev->pm.mutex);
	mutex_destroy(&adev->firmware.mutex);
}
3456
3457
3458 /*
3459 * Suspend & resume.
3460 */
/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @fbcon: notify the fbdev of suspend
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
{
	struct amdgpu_device *adev;
	struct drm_crtc *crtc;
	struct drm_connector *connector;
	struct drm_connector_list_iter iter;
	int r;

	if (dev == NULL || dev->dev_private == NULL) {
		return -ENODEV;
	}

	adev = dev->dev_private;

	/* Nothing to do when vga_switcheroo already powered us off. */
	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;
	drm_kms_helper_poll_disable(dev);

	if (fbcon)
		amdgpu_fbdev_set_suspend(adev, 1);

	/* Make sure delayed init (IB tests etc.) is not still in flight. */
	cancel_delayed_work_sync(&adev->delayed_init_work);

	if (!amdgpu_device_has_dc_support(adev)) {
		/* turn off display hw */
		drm_modeset_lock_all(dev);
		drm_connector_list_iter_begin(dev, &iter);
		drm_for_each_connector_iter(connector, &iter)
			drm_helper_connector_dpms(connector,
						  DRM_MODE_DPMS_OFF);
		drm_connector_list_iter_end(&iter);
		drm_modeset_unlock_all(dev);
		/* unpin the front buffers and cursors */
		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
			struct drm_framebuffer *fb = crtc->primary->fb;
			struct amdgpu_bo *robj;

			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
				r = amdgpu_bo_reserve(aobj, true);
				/* best-effort: a failed reserve just leaves the BO pinned */
				if (r == 0) {
					amdgpu_bo_unpin(aobj);
					amdgpu_bo_unreserve(aobj);
				}
			}

			if (fb == NULL || fb->obj[0] == NULL) {
				continue;
			}
			robj = gem_to_amdgpu_bo(fb->obj[0]);
			/* don't unpin kernel fb objects */
			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
				r = amdgpu_bo_reserve(robj, true);
				if (r == 0) {
					amdgpu_bo_unpin(robj);
					amdgpu_bo_unreserve(robj);
				}
			}
		}
	}

	amdgpu_amdkfd_suspend(adev);

	amdgpu_ras_suspend(adev);

	/* NOTE(review): phase1/phase2 errors are deliberately ignored —
	 * suspend is best-effort and always reports success. */
	r = amdgpu_device_ip_suspend_phase1(adev);

	/* evict vram memory */
	amdgpu_bo_evict_vram(adev);

	amdgpu_fence_driver_suspend(adev);

	r = amdgpu_device_ip_suspend_phase2(adev);

	/* evict remaining vram memory
	 * This second call to evict vram is to evict the gart page table
	 * using the CPU.
	 */
	amdgpu_bo_evict_vram(adev);

	return 0;
}
3557
/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @fbcon: notify the fbdev of resume
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
{
	struct drm_connector *connector;
	struct drm_connector_list_iter iter;
	struct amdgpu_device *adev = dev->dev_private;
	struct drm_crtc *crtc;
	int r = 0;

	/* Nothing to do when vga_switcheroo has powered us off. */
	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
		if (r)
			DRM_ERROR("amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);
	if (r) {
		DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
		return r;
	}
	amdgpu_fence_driver_resume(adev);


	r = amdgpu_device_ip_late_init(adev);
	if (r)
		return r;

	/* Kick off delayed init (IB tests etc.), as at first init. */
	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));

	if (!amdgpu_device_has_dc_support(adev)) {
		/* pin cursors */
		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);

			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
				r = amdgpu_bo_reserve(aobj, true);
				if (r == 0) {
					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
					if (r != 0)
						DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
					amdgpu_bo_unreserve(aobj);
				}
			}
		}
	}
	r = amdgpu_amdkfd_resume(adev);
	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	/* blat the mode back in */
	if (fbcon) {
		if (!amdgpu_device_has_dc_support(adev)) {
			/* pre DCE11 */
			drm_helper_resume_force_mode(dev);

			/* turn on display hw */
			drm_modeset_lock_all(dev);

			drm_connector_list_iter_begin(dev, &iter);
			drm_for_each_connector_iter(connector, &iter)
				drm_helper_connector_dpms(connector,
							  DRM_MODE_DPMS_ON);
			drm_connector_list_iter_end(&iter);

			drm_modeset_unlock_all(dev);
		}
		amdgpu_fbdev_set_suspend(adev, 0);
	}

	drm_kms_helper_poll_enable(dev);

	amdgpu_ras_resume(adev);

	/*
	 * Most of the connector probing functions try to acquire runtime pm
	 * refs to ensure that the GPU is powered on when connector polling is
	 * performed. Since we're calling this from a runtime PM callback,
	 * trying to acquire rpm refs will cause us to deadlock.
	 *
	 * Since we're guaranteed to be holding the rpm lock, it's safe to
	 * temporarily disable the rpm helpers so this doesn't deadlock us.
	 */
#ifdef CONFIG_PM
	dev->dev->power.disable_depth++;
#endif
	if (!amdgpu_device_has_dc_support(adev))
		drm_helper_hpd_irq_event(dev);
	else
		drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
	dev->dev->power.disable_depth--;
#endif
	adev->in_suspend = false;

	return 0;
}
3674
3675 /**
3676 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3677 *
3678 * @adev: amdgpu_device pointer
3679 *
3680 * The list of all the hardware IPs that make up the asic is walked and
3681 * the check_soft_reset callbacks are run. check_soft_reset determines
3682 * if the asic is still hung or not.
3683 * Returns true if any of the IPs are still in a hung state, false if not.
3684 */
amdgpu_device_ip_check_soft_reset(struct amdgpu_device * adev)3685 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3686 {
3687 int i;
3688 bool asic_hang = false;
3689
3690 if (amdgpu_sriov_vf(adev))
3691 return true;
3692
3693 if (amdgpu_asic_need_full_reset(adev))
3694 return true;
3695
3696 for (i = 0; i < adev->num_ip_blocks; i++) {
3697 if (!adev->ip_blocks[i].status.valid)
3698 continue;
3699 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3700 adev->ip_blocks[i].status.hang =
3701 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3702 if (adev->ip_blocks[i].status.hang) {
3703 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3704 asic_hang = true;
3705 }
3706 }
3707 return asic_hang;
3708 }
3709
3710 /**
3711 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3712 *
3713 * @adev: amdgpu_device pointer
3714 *
3715 * The list of all the hardware IPs that make up the asic is walked and the
3716 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3717 * handles any IP specific hardware or software state changes that are
3718 * necessary for a soft reset to succeed.
3719 * Returns 0 on success, negative error code on failure.
3720 */
amdgpu_device_ip_pre_soft_reset(struct amdgpu_device * adev)3721 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3722 {
3723 int i, r = 0;
3724
3725 for (i = 0; i < adev->num_ip_blocks; i++) {
3726 if (!adev->ip_blocks[i].status.valid)
3727 continue;
3728 if (adev->ip_blocks[i].status.hang &&
3729 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3730 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3731 if (r)
3732 return r;
3733 }
3734 }
3735
3736 return 0;
3737 }
3738
3739 /**
3740 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3741 *
3742 * @adev: amdgpu_device pointer
3743 *
3744 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3745 * reset is necessary to recover.
3746 * Returns true if a full asic reset is required, false if not.
3747 */
amdgpu_device_ip_need_full_reset(struct amdgpu_device * adev)3748 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3749 {
3750 int i;
3751
3752 if (amdgpu_asic_need_full_reset(adev))
3753 return true;
3754
3755 for (i = 0; i < adev->num_ip_blocks; i++) {
3756 if (!adev->ip_blocks[i].status.valid)
3757 continue;
3758 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3759 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3760 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3761 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3762 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3763 if (adev->ip_blocks[i].status.hang) {
3764 DRM_INFO("Some block need full reset!\n");
3765 return true;
3766 }
3767 }
3768 }
3769 return false;
3770 }
3771
3772 /**
3773 * amdgpu_device_ip_soft_reset - do a soft reset
3774 *
3775 * @adev: amdgpu_device pointer
3776 *
3777 * The list of all the hardware IPs that make up the asic is walked and the
3778 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3779 * IP specific hardware or software state changes that are necessary to soft
3780 * reset the IP.
3781 * Returns 0 on success, negative error code on failure.
3782 */
amdgpu_device_ip_soft_reset(struct amdgpu_device * adev)3783 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3784 {
3785 int i, r = 0;
3786
3787 for (i = 0; i < adev->num_ip_blocks; i++) {
3788 if (!adev->ip_blocks[i].status.valid)
3789 continue;
3790 if (adev->ip_blocks[i].status.hang &&
3791 adev->ip_blocks[i].version->funcs->soft_reset) {
3792 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3793 if (r)
3794 return r;
3795 }
3796 }
3797
3798 return 0;
3799 }
3800
3801 /**
3802 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3803 *
3804 * @adev: amdgpu_device pointer
3805 *
3806 * The list of all the hardware IPs that make up the asic is walked and the
3807 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3808 * handles any IP specific hardware or software state changes that are
3809 * necessary after the IP has been soft reset.
3810 * Returns 0 on success, negative error code on failure.
3811 */
amdgpu_device_ip_post_soft_reset(struct amdgpu_device * adev)3812 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3813 {
3814 int i, r = 0;
3815
3816 for (i = 0; i < adev->num_ip_blocks; i++) {
3817 if (!adev->ip_blocks[i].status.valid)
3818 continue;
3819 if (adev->ip_blocks[i].status.hang &&
3820 adev->ip_blocks[i].version->funcs->post_soft_reset)
3821 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3822 if (r)
3823 return r;
3824 }
3825
3826 return 0;
3827 }
3828
3829 /**
3830 * amdgpu_device_recover_vram - Recover some VRAM contents
3831 *
3832 * @adev: amdgpu_device pointer
3833 *
3834 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3835 * restore things like GPUVM page tables after a GPU reset where
3836 * the contents of VRAM might be lost.
3837 *
3838 * Returns:
3839 * 0 on success, negative error code on failure.
3840 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	long r = 1, tmo;

	/* SR-IOV guests share the GPU with other VFs, so allow the copy
	 * engine much longer per fence. */
	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	DRM_INFO("recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {

		/* No need to recover an evicted BO */
		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		/* Pipeline the copies: wait on the previous restore's fence
		 * while the current one is in flight.  dma_fence_wait_timeout
		 * returns 0 on timeout, <0 on error, else remaining jiffies. */
		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	/* Wait for the last outstanding restore to complete. */
	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	DRM_INFO("recover vram bo from shadow done\n");
	return 0;
}
3895
3896
3897 /**
3898 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3899 *
3900 * @adev: amdgpu device pointer
3901 * @from_hypervisor: request from hypervisor
3902 *
3903 * do VF FLR and reinitialize Asic
3904 * return 0 means succeeded otherwise failed
3905 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;

	/* Acquire exclusive (full) GPU access for the duration of the reset. */
	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);
	/* we need recover gart prior to run SMC/CP/SDMA resume */
	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		/* was "return r", which skipped the error path below and
		 * leaked the exclusive full-GPU grant */
		goto error;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	amdgpu_irq_gpu_reset_resume_helper(adev);
	r = amdgpu_ib_ring_tests(adev);
	amdgpu_amdkfd_post_reset(adev);

error:
	/* Always hand exclusive access back to the host, success or not. */
	amdgpu_virt_release_full_gpu(adev, true);
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}

	return r;
}
3949
3950 /**
3951 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3952 *
3953 * @adev: amdgpu device pointer
3954 *
3955 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3956 * a hung GPU.
3957 */
amdgpu_device_should_recover_gpu(struct amdgpu_device * adev)3958 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3959 {
3960 if (!amdgpu_device_ip_check_soft_reset(adev)) {
3961 DRM_INFO("Timeout, but no hardware hang detected.\n");
3962 return false;
3963 }
3964
3965 if (amdgpu_gpu_recovery == 0)
3966 goto disabled;
3967
3968 if (amdgpu_sriov_vf(adev))
3969 return true;
3970
3971 if (amdgpu_gpu_recovery == -1) {
3972 switch (adev->asic_type) {
3973 case CHIP_BONAIRE:
3974 case CHIP_HAWAII:
3975 case CHIP_TOPAZ:
3976 case CHIP_TONGA:
3977 case CHIP_FIJI:
3978 case CHIP_POLARIS10:
3979 case CHIP_POLARIS11:
3980 case CHIP_POLARIS12:
3981 case CHIP_VEGAM:
3982 case CHIP_VEGA20:
3983 case CHIP_VEGA10:
3984 case CHIP_VEGA12:
3985 case CHIP_RAVEN:
3986 case CHIP_ARCTURUS:
3987 case CHIP_RENOIR:
3988 case CHIP_NAVI10:
3989 case CHIP_NAVI14:
3990 case CHIP_NAVI12:
3991 break;
3992 default:
3993 goto disabled;
3994 }
3995 }
3996
3997 return true;
3998
3999 disabled:
4000 DRM_INFO("GPU recovery disabled.\n");
4001 return false;
4002 }
4003
4004
/*
 * amdgpu_device_pre_asic_reset - quiesce the GPU before an ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @job: the job that triggered the reset, or NULL
 * @need_full_reset_arg: in/out; set true when a soft reset is insufficient
 *
 * Forces completion of outstanding hardware fences, bumps the guilty
 * job's karma (if any), and either soft-resets the hung IP blocks or
 * suspends all IPs in preparation for a full ASIC reset.
 */
static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if(job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			/* Try the lighter-weight per-IP soft reset first;
			 * fall back to a full reset if it fails or the hang
			 * persists afterwards. */
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				DRM_INFO("soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}
4050
/*
 * amdgpu_do_asic_reset - perform the ASIC reset(s) and bring devices back up
 *
 * @hive: XGMI hive the devices belong to, or NULL
 * @device_list_handle: devices to reset (a single entry unless in a hive)
 * @need_full_reset_arg: in/out - whether a full ASIC reset is required
 *
 * Resets every device on the list (in parallel for XGMI hives so FW link
 * negotiation completes in time), then on a full reset re-posts each card
 * and resumes its IP blocks.  Returns 0 on success, -EAGAIN when the caller
 * should retry with a full reset, or a negative error code.
 */
static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
					 r, tmp_adev->ddev->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered())
		amdgpu_ras_intr_cleared();

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
				DRM_WARN("asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(
					&tmp_adev->mman.bdev.man[TTM_PL_TT]);
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					/*
					 * Was "return r;": use goto out like every
					 * other error path here, so asic_reset_res
					 * and *need_full_reset_arg are still updated.
					 */
					goto out;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked as reset was already
				 * complete successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				/* must succeed. */
				amdgpu_ras_resume(tmp_adev);

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}


out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}
4173
/*
 * amdgpu_device_lock_adev - take the per-device reset lock
 *
 * @adev: amdgpu device pointer
 * @trylock: if true, fail instead of sleeping when the lock is contended
 *
 * On success the device is marked as being in GPU reset, its reset counter
 * is bumped, and the MP1 state matching the chosen reset method is latched.
 * Returns false only when @trylock was requested and the lock was already
 * held; the caller must then skip the reset.
 */
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
{
	int method;

	if (trylock) {
		if (!mutex_trylock(&adev->lock_reset))
			return false;
	} else {
		mutex_lock(&adev->lock_reset);
	}

	atomic_inc(&adev->gpu_reset_counter);
	adev->in_gpu_reset = true;

	method = amdgpu_asic_reset_method(adev);
	if (method == AMD_RESET_METHOD_MODE1)
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
	else if (method == AMD_RESET_METHOD_MODE2)
		adev->mp1_state = PP_MP1_STATE_RESET;
	else
		adev->mp1_state = PP_MP1_STATE_NONE;

	return true;
}
4198
/*
 * amdgpu_device_unlock_adev - release the per-device reset lock
 *
 * Counterpart of amdgpu_device_lock_adev(): clears the in-reset flag and
 * MP1 state set there, then drops adev->lock_reset.
 */
static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	/* NOTE(review): presumably forwards VF errors accumulated during the
	 * reset to the host before state is cleared — confirm in amdgpu_virt. */
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	adev->in_gpu_reset = false;
	mutex_unlock(&adev->lock_reset);
}
4206
/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: the job that triggered the hang, or NULL if unknown
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize Asic
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset, job_signaled;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	/* RAS fatal-error handling may preempt the normal reset flow below. */
	bool in_ras_intr = amdgpu_ras_intr_triggered();
	bool use_baco =
		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
		true : false;

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read log and see why the system rebooted.
	 */
	if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {

		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	need_full_reset = job_signaled = false;
	INIT_LIST_HEAD(&device_list);

	dev_info(adev->dev, "GPU %s begin!\n",
		(in_ras_intr && !use_baco) ? "jobs stop":"reset");

	/* Make sure deferred init work is not racing with the reset. */
	cancel_delayed_work_sync(&adev->delayed_init_work);

	hive = amdgpu_get_xgmi_hive(adev, false);

	/*
	 * Here we trylock to avoid chain of resets executing from
	 * either trigger by jobs on different adevs in XGMI hive or jobs on
	 * different schedulers for same device while this TO handler is running.
	 * We always reset all schedulers for device and all devices for XGMI
	 * hive so that should take care of them too.
	 */

	if (hive && !mutex_trylock(&hive->reset_lock)) {
		DRM_INFO("Bailing on TDR for s_job:%"PRIx64", hive: %"PRIx64" as another already in progress",
			  job ? job->base.id : -1, hive->hive_id);
		return 0;
	}

	/* Start with adev pre asic reset first for soft reset check.*/
	/* In a hive we hold reset_lock, so a blocking lock is safe here. */
	if (!amdgpu_device_lock_adev(adev, !hive)) {
		DRM_INFO("Bailing on TDR for s_job:%"PRIx64", as another already in progress",
			  job ? job->base.id : -1);
		return 0;
	}

	/* Block kfd: SRIOV would do it separately */
	if (!amdgpu_sriov_vf(adev))
                amdgpu_amdkfd_pre_reset(adev);

	/* Build list of devices to reset */
	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			/*unlock kfd: SRIOV would do it separately */
			if (!amdgpu_sriov_vf(adev))
		                amdgpu_amdkfd_post_reset(adev);
			amdgpu_device_unlock_adev(adev);
			return -ENODEV;
		}

		/*
		 * In case we are in XGMI hive mode device reset is done for all the
		 * nodes in the hive to retrain all XGMI links and hence the reset
		 * sequence is executed in loop on all nodes.
		 */
		device_list_handle = &hive->device_list;
	} else {
		/* Single-GPU case: a one-entry list containing just adev. */
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* adev itself is already locked and kfd-blocked above. */
		if (tmp_adev != adev) {
			amdgpu_device_lock_adev(tmp_adev, false);
			if (!amdgpu_sriov_vf(tmp_adev))
			        amdgpu_amdkfd_pre_reset(tmp_adev);
		}

		/*
		 * Mark these ASICs to be reseted as untracked first
		 * And add them back after reset completed
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		/* disable ras on ALL IPs */
		if (!(in_ras_intr && !use_baco) &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			/* On a RAS fatal error, drain all pending jobs too. */
			if (in_ras_intr && !use_baco)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}


	/* RAS fatal-error path: jobs were stopped, no HW reset or resume. */
	if (in_ras_intr && !use_baco)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent))
		job_signaled = true;

	if (job_signaled) {
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}


	/* Guilty job will be freed after this*/
	r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
	if (r) {
		/*TODO Should we stop ?*/
		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
			  r, adev->ddev->unique);
		adev->asic_reset_res = r;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		if (tmp_adev == adev)
			continue;

		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 NULL,
						 &need_full_reset);
		/*TODO Should we stop ?*/
		if (r) {
			DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, tmp_adev->ddev->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed.*/
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		/* -EAGAIN means the IB ring test failed; retry with full reset. */
		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs .*/
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point to resubmit jobs if we didn't HW reset*/
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		/* Non-DC display paths need an explicit modeset restore. */
		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(tmp_adev->ddev);
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/*unlock kfd: SRIOV would do it separately */
		if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
	                amdgpu_amdkfd_post_reset(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

	if (hive)
		mutex_unlock(&hive->reset_lock);

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}
4437
/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	/* Module parameters override anything probed from the hardware. */
	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	/* Both masks already set (by module parameters) - nothing to probe. */
	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			/* Unknown: assume the common gen1-gen3 baseline. */
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			/* Each width implies support for all narrower widths. */
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
4573
amdgpu_device_baco_enter(struct drm_device * dev)4574 int amdgpu_device_baco_enter(struct drm_device *dev)
4575 {
4576 struct amdgpu_device *adev = dev->dev_private;
4577 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4578
4579 if (!amdgpu_device_supports_baco(adev->ddev))
4580 return -ENOTSUPP;
4581
4582 if (ras && ras->supported)
4583 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4584
4585 return amdgpu_dpm_baco_enter(adev);
4586 }
4587
amdgpu_device_baco_exit(struct drm_device * dev)4588 int amdgpu_device_baco_exit(struct drm_device *dev)
4589 {
4590 struct amdgpu_device *adev = dev->dev_private;
4591 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4592 int ret = 0;
4593
4594 if (!amdgpu_device_supports_baco(adev->ddev))
4595 return -ENOTSUPP;
4596
4597 ret = amdgpu_dpm_baco_exit(adev);
4598 if (ret)
4599 return ret;
4600
4601 if (ras && ras->supported)
4602 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4603
4604 return 0;
4605 }
4606