1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2020-2024 Intel Corporation 4 */ 5 6 #include <linux/highmem.h> 7 #include <linux/moduleparam.h> 8 #include <linux/pci.h> 9 #include <linux/pm_runtime.h> 10 #include <linux/reboot.h> 11 12 #include "vpu_boot_api.h" 13 #include "ivpu_drv.h" 14 #include "ivpu_hw.h" 15 #include "ivpu_fw.h" 16 #include "ivpu_fw_log.h" 17 #include "ivpu_ipc.h" 18 #include "ivpu_job.h" 19 #include "ivpu_jsm_msg.h" 20 #include "ivpu_mmu.h" 21 #include "ivpu_pm.h" 22 23 static bool ivpu_disable_recovery; 24 module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644); 25 MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected"); 26 27 static unsigned long ivpu_tdr_timeout_ms; 28 module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644); 29 MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default"); 30 31 #define PM_RESCHEDULE_LIMIT 5 32 33 static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev) 34 { 35 struct ivpu_fw_info *fw = vdev->fw; 36 37 ivpu_cmdq_reset_all_contexts(vdev); 38 ivpu_ipc_reset(vdev); 39 ivpu_fw_load(vdev); 40 fw->entry_point = fw->cold_boot_entry_point; 41 } 42 43 static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev) 44 { 45 struct ivpu_fw_info *fw = vdev->fw; 46 struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem); 47 48 if (!bp->save_restore_ret_address) { 49 ivpu_pm_prepare_cold_boot(vdev); 50 return; 51 } 52 53 ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address); 54 fw->entry_point = bp->save_restore_ret_address; 55 } 56 57 static int ivpu_suspend(struct ivpu_device *vdev) 58 { 59 int ret; 60 61 ivpu_prepare_for_reset(vdev); 62 63 ret = ivpu_shutdown(vdev); 64 if (ret) 65 ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); 66 67 return ret; 68 } 69 70 static int ivpu_resume(struct ivpu_device *vdev) 71 { 72 int ret; 73 74 retry: 75 pci_restore_state(to_pci_dev(vdev->drm.dev)); 76 pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); 77 78 ret = ivpu_hw_power_up(vdev); 79 if (ret) { 80 ivpu_err(vdev, "Failed to power up HW: %d\n", ret); 81 goto err_power_down; 82 } 83 84 ret = ivpu_mmu_enable(vdev); 85 if (ret) { 86 ivpu_err(vdev, "Failed to resume MMU: %d\n", ret); 87 goto err_power_down; 88 } 89 90 ret = ivpu_boot(vdev); 91 if (ret) 92 goto err_mmu_disable; 93 94 return 0; 95 96 err_mmu_disable: 97 ivpu_mmu_disable(vdev); 98 err_power_down: 99 ivpu_hw_power_down(vdev); 100 101 if (!ivpu_fw_is_cold_boot(vdev)) { 102 ivpu_pm_prepare_cold_boot(vdev); 103 goto retry; 104 } else { 105 ivpu_err(vdev, "Failed to resume the FW: %d\n", ret); 106 } 107 108 return ret; 109 } 110 111 static void ivpu_pm_recovery_work(struct work_struct *work) 112 { 113 struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); 114 struct ivpu_device *vdev = pm->vdev; 115 char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; 116 int ret; 117 118 ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); 119 120 ret = pm_runtime_resume_and_get(vdev->drm.dev); 121 if (ret) 122 ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); 123 124 ivpu_fw_log_dump(vdev); 125 126 atomic_inc(&vdev->pm->reset_counter); 127 atomic_set(&vdev->pm->reset_pending, 1); 128 down_write(&vdev->pm->reset_lock); 129 130 ivpu_suspend(vdev); 131 ivpu_pm_prepare_cold_boot(vdev); 132 ivpu_jobs_abort_all(vdev); 133 134 ret = ivpu_resume(vdev); 135 if (ret) 136 ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); 137 138 up_write(&vdev->pm->reset_lock); 139 atomic_set(&vdev->pm->reset_pending, 0); 140 141 kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); 142 pm_runtime_mark_last_busy(vdev->drm.dev); 143 pm_runtime_put_autosuspend(vdev->drm.dev); 144 } 145 146 void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) 147 { 148 ivpu_err(vdev, "Recovery triggered by %s\n", reason); 149 150 if (ivpu_disable_recovery) { 151 ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n"); 152 return; 153 } 154 155 if (ivpu_is_fpga(vdev)) { 156 ivpu_err(vdev, "Recovery not available on FPGA\n"); 157 return; 158 } 159 160 /* Trigger recovery if it's not in progress */ 161 if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) { 162 ivpu_hw_diagnose_failure(vdev); 163 ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */ 164 queue_work(system_long_wq, &vdev->pm->recovery_work); 165 } 166 } 167 168 static void ivpu_job_timeout_work(struct work_struct *work) 169 { 170 struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work); 171 struct ivpu_device *vdev = pm->vdev; 172 173 ivpu_pm_trigger_recovery(vdev, "TDR"); 174 } 175 176 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev) 177 { 178 unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr; 179 180 /* No-op if already queued */ 181 queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms)); 182 } 183 184 void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev) 185 { 186 cancel_delayed_work_sync(&vdev->pm->job_timeout_work); 187 } 188 189 int ivpu_pm_suspend_cb(struct device *dev) 190 { 191 struct drm_device *drm = dev_get_drvdata(dev); 192 struct ivpu_device *vdev = to_ivpu_device(drm); 193 unsigned long timeout; 194 195 ivpu_dbg(vdev, PM, "Suspend..\n"); 196 197 timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr); 198 while (!ivpu_hw_is_idle(vdev)) { 199 cond_resched(); 200 if (time_after_eq(jiffies, timeout)) { 201 ivpu_err(vdev, "Failed to enter idle on system suspend\n"); 202 return -EBUSY; 203 } 204 } 205 206 ivpu_jsm_pwr_d0i3_enter(vdev); 207 208 ivpu_suspend(vdev); 209 ivpu_pm_prepare_warm_boot(vdev); 210 211 ivpu_dbg(vdev, PM, "Suspend done.\n"); 212 213 return 0; 214 } 215 216 int ivpu_pm_resume_cb(struct device *dev) 217 { 218 struct drm_device *drm = dev_get_drvdata(dev); 219 struct ivpu_device *vdev = to_ivpu_device(drm); 220 int ret; 221 222 ivpu_dbg(vdev, PM, "Resume..\n"); 223 224 ret = ivpu_resume(vdev); 225 if (ret) 226 ivpu_err(vdev, "Failed to resume: %d\n", ret); 227 228 ivpu_dbg(vdev, PM, "Resume done.\n"); 229 230 return ret; 231 } 232 233 int ivpu_pm_runtime_suspend_cb(struct device *dev) 234 { 235 struct drm_device *drm = dev_get_drvdata(dev); 236 struct ivpu_device *vdev = to_ivpu_device(drm); 237 bool hw_is_idle = true; 238 int ret; 239 240 drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa)); 241 drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work)); 242 243 ivpu_dbg(vdev, PM, "Runtime suspend..\n"); 244 245 if (!ivpu_hw_is_idle(vdev) && vdev->pm->suspend_reschedule_counter) { 246 ivpu_dbg(vdev, PM, "Failed to enter idle, rescheduling suspend, retries left %d\n", 247 vdev->pm->suspend_reschedule_counter); 248 pm_schedule_suspend(dev, vdev->timeout.reschedule_suspend); 249 vdev->pm->suspend_reschedule_counter--; 250 return -EAGAIN; 251 } 252 253 if (!vdev->pm->suspend_reschedule_counter) 254 hw_is_idle = false; 255 else if (ivpu_jsm_pwr_d0i3_enter(vdev)) 256 hw_is_idle = false; 257 258 ret = ivpu_suspend(vdev); 259 if (ret) 260 ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret); 261 262 if (!hw_is_idle) { 263 ivpu_err(vdev, "NPU failed to enter idle, force suspended.\n"); 264 ivpu_fw_log_dump(vdev); 265 ivpu_pm_prepare_cold_boot(vdev); 266 } else { 267 ivpu_pm_prepare_warm_boot(vdev); 268 } 269 270 vdev->pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT; 271 272 ivpu_dbg(vdev, PM, "Runtime suspend done.\n"); 273 274 return 0; 275 } 276 277 int ivpu_pm_runtime_resume_cb(struct device *dev) 278 { 279 struct drm_device *drm = dev_get_drvdata(dev); 280 struct ivpu_device *vdev = to_ivpu_device(drm); 281 int ret; 282 283 ivpu_dbg(vdev, PM, "Runtime resume..\n"); 284 285 ret = ivpu_resume(vdev); 286 if (ret) 287 ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret); 288 289 ivpu_dbg(vdev, PM, "Runtime resume done.\n"); 290 291 return ret; 292 } 293 294 int ivpu_rpm_get(struct ivpu_device *vdev) 295 { 296 int ret; 297 298 ret = pm_runtime_resume_and_get(vdev->drm.dev); 299 if (!drm_WARN_ON(&vdev->drm, ret < 0)) 300 vdev->pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT; 301 302 return ret; 303 } 304 305 int ivpu_rpm_get_if_active(struct ivpu_device *vdev) 306 { 307 int ret; 308 309 ret = pm_runtime_get_if_in_use(vdev->drm.dev); 310 drm_WARN_ON(&vdev->drm, ret < 0); 311 312 return ret; 313 } 314 315 void ivpu_rpm_put(struct ivpu_device *vdev) 316 { 317 pm_runtime_mark_last_busy(vdev->drm.dev); 318 pm_runtime_put_autosuspend(vdev->drm.dev); 319 } 320 321 void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev) 322 { 323 struct ivpu_device *vdev = pci_get_drvdata(pdev); 324 325 ivpu_dbg(vdev, PM, "Pre-reset..\n"); 326 atomic_inc(&vdev->pm->reset_counter); 327 atomic_set(&vdev->pm->reset_pending, 1); 328 329 pm_runtime_get_sync(vdev->drm.dev); 330 down_write(&vdev->pm->reset_lock); 331 ivpu_prepare_for_reset(vdev); 332 ivpu_hw_reset(vdev); 333 ivpu_pm_prepare_cold_boot(vdev); 334 ivpu_jobs_abort_all(vdev); 335 ivpu_dbg(vdev, PM, "Pre-reset done.\n"); 336 } 337 338 void ivpu_pm_reset_done_cb(struct pci_dev *pdev) 339 { 340 struct ivpu_device *vdev = pci_get_drvdata(pdev); 341 int ret; 342 343 ivpu_dbg(vdev, PM, "Post-reset..\n"); 344 ret = ivpu_resume(vdev); 345 if (ret) 346 ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret); 347 up_write(&vdev->pm->reset_lock); 348 atomic_set(&vdev->pm->reset_pending, 0); 349 ivpu_dbg(vdev, PM, "Post-reset done.\n"); 350 351 pm_runtime_mark_last_busy(vdev->drm.dev); 352 pm_runtime_put_autosuspend(vdev->drm.dev); 353 } 354 355 void ivpu_pm_init(struct ivpu_device *vdev) 356 { 357 struct device *dev = vdev->drm.dev; 358 struct ivpu_pm_info *pm = vdev->pm; 359 int delay; 360 361 pm->vdev = vdev; 362 pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT; 363 364 init_rwsem(&pm->reset_lock); 365 atomic_set(&pm->reset_pending, 0); 366 atomic_set(&pm->reset_counter, 0); 367 368 INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work); 369 INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work); 370 371 if (ivpu_disable_recovery) 372 delay = -1; 373 else 374 delay = vdev->timeout.autosuspend; 375 376 pm_runtime_use_autosuspend(dev); 377 pm_runtime_set_autosuspend_delay(dev, delay); 378 379 ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay); 380 } 381 382 void ivpu_pm_cancel_recovery(struct ivpu_device *vdev) 383 { 384 drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work)); 385 cancel_work_sync(&vdev->pm->recovery_work); 386 } 387 388 void ivpu_pm_enable(struct ivpu_device *vdev) 389 { 390 struct device *dev = vdev->drm.dev; 391 392 pm_runtime_set_active(dev); 393 pm_runtime_allow(dev); 394 pm_runtime_mark_last_busy(dev); 395 pm_runtime_put_autosuspend(dev); 396 } 397 398 void ivpu_pm_disable(struct ivpu_device *vdev) 399 { 400 pm_runtime_get_noresume(vdev->drm.dev); 401 pm_runtime_forbid(vdev->drm.dev); 402 } 403