xref: /openbsd/sys/dev/pci/drm/amd/amdgpu/sienna_cichlid.c (revision f005ef32)
11bb76ff1Sjsg /*
21bb76ff1Sjsg  * Copyright 2021 Advanced Micro Devices, Inc.
31bb76ff1Sjsg  *
41bb76ff1Sjsg  * Permission is hereby granted, free of charge, to any person obtaining a
51bb76ff1Sjsg  * copy of this software and associated documentation files (the "Software"),
61bb76ff1Sjsg  * to deal in the Software without restriction, including without limitation
71bb76ff1Sjsg  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
81bb76ff1Sjsg  * and/or sell copies of the Software, and to permit persons to whom the
91bb76ff1Sjsg  * Software is furnished to do so, subject to the following conditions:
101bb76ff1Sjsg  *
111bb76ff1Sjsg  * The above copyright notice and this permission notice shall be included in
121bb76ff1Sjsg  * all copies or substantial portions of the Software.
131bb76ff1Sjsg  *
141bb76ff1Sjsg  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
151bb76ff1Sjsg  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
161bb76ff1Sjsg  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
171bb76ff1Sjsg  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
181bb76ff1Sjsg  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
191bb76ff1Sjsg  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
201bb76ff1Sjsg  * OTHER DEALINGS IN THE SOFTWARE.
211bb76ff1Sjsg  *
221bb76ff1Sjsg  */
231bb76ff1Sjsg 
241bb76ff1Sjsg #include "sienna_cichlid.h"
251bb76ff1Sjsg #include "amdgpu_reset.h"
261bb76ff1Sjsg #include "amdgpu_amdkfd.h"
271bb76ff1Sjsg #include "amdgpu_dpm.h"
281bb76ff1Sjsg #include "amdgpu_job.h"
291bb76ff1Sjsg #include "amdgpu_ring.h"
301bb76ff1Sjsg #include "amdgpu_ras.h"
311bb76ff1Sjsg #include "amdgpu_psp.h"
321bb76ff1Sjsg #include "amdgpu_xgmi.h"
331bb76ff1Sjsg 
sienna_cichlid_is_mode2_default(struct amdgpu_reset_control * reset_ctl)341bb76ff1Sjsg static bool sienna_cichlid_is_mode2_default(struct amdgpu_reset_control *reset_ctl)
351bb76ff1Sjsg {
361bb76ff1Sjsg #if 0
371bb76ff1Sjsg 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
381bb76ff1Sjsg 
391bb76ff1Sjsg 	if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(11, 0, 7) &&
401bb76ff1Sjsg 	    adev->pm.fw_version >= 0x3a5500 && !amdgpu_sriov_vf(adev))
411bb76ff1Sjsg 		return true;
421bb76ff1Sjsg #endif
43*f005ef32Sjsg 	return amdgpu_reset_method == AMD_RESET_METHOD_MODE2;
441bb76ff1Sjsg }
451bb76ff1Sjsg 
461bb76ff1Sjsg static struct amdgpu_reset_handler *
sienna_cichlid_get_reset_handler(struct amdgpu_reset_control * reset_ctl,struct amdgpu_reset_context * reset_context)471bb76ff1Sjsg sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
481bb76ff1Sjsg 			    struct amdgpu_reset_context *reset_context)
491bb76ff1Sjsg {
501bb76ff1Sjsg 	struct amdgpu_reset_handler *handler;
511bb76ff1Sjsg 
521bb76ff1Sjsg 	if (reset_context->method != AMD_RESET_METHOD_NONE) {
531bb76ff1Sjsg 		list_for_each_entry(handler, &reset_ctl->reset_handlers,
541bb76ff1Sjsg 				     handler_list) {
551bb76ff1Sjsg 			if (handler->reset_method == reset_context->method)
561bb76ff1Sjsg 				return handler;
571bb76ff1Sjsg 		}
581bb76ff1Sjsg 	}
591bb76ff1Sjsg 
601bb76ff1Sjsg 	if (sienna_cichlid_is_mode2_default(reset_ctl)) {
611bb76ff1Sjsg 		list_for_each_entry (handler, &reset_ctl->reset_handlers,
621bb76ff1Sjsg 				     handler_list) {
631bb76ff1Sjsg 			if (handler->reset_method == AMD_RESET_METHOD_MODE2)
641bb76ff1Sjsg 				return handler;
651bb76ff1Sjsg 		}
661bb76ff1Sjsg 	}
671bb76ff1Sjsg 
681bb76ff1Sjsg 	return NULL;
691bb76ff1Sjsg }
701bb76ff1Sjsg 
sienna_cichlid_mode2_suspend_ip(struct amdgpu_device * adev)711bb76ff1Sjsg static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)
721bb76ff1Sjsg {
731bb76ff1Sjsg 	int r, i;
741bb76ff1Sjsg 
751bb76ff1Sjsg 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
761bb76ff1Sjsg 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
771bb76ff1Sjsg 
781bb76ff1Sjsg 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
791bb76ff1Sjsg 		if (!(adev->ip_blocks[i].version->type ==
801bb76ff1Sjsg 			      AMD_IP_BLOCK_TYPE_GFX ||
811bb76ff1Sjsg 		      adev->ip_blocks[i].version->type ==
821bb76ff1Sjsg 			      AMD_IP_BLOCK_TYPE_SDMA))
831bb76ff1Sjsg 			continue;
841bb76ff1Sjsg 
851bb76ff1Sjsg 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
861bb76ff1Sjsg 
871bb76ff1Sjsg 		if (r) {
881bb76ff1Sjsg 			dev_err(adev->dev,
891bb76ff1Sjsg 				"suspend of IP block <%s> failed %d\n",
901bb76ff1Sjsg 				adev->ip_blocks[i].version->funcs->name, r);
911bb76ff1Sjsg 			return r;
921bb76ff1Sjsg 		}
931bb76ff1Sjsg 		adev->ip_blocks[i].status.hw = false;
941bb76ff1Sjsg 	}
951bb76ff1Sjsg 
961bb76ff1Sjsg 	return r;
971bb76ff1Sjsg }
981bb76ff1Sjsg 
991bb76ff1Sjsg static int
sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control * reset_ctl,struct amdgpu_reset_context * reset_context)1001bb76ff1Sjsg sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
1011bb76ff1Sjsg 				  struct amdgpu_reset_context *reset_context)
1021bb76ff1Sjsg {
1031bb76ff1Sjsg 	int r = 0;
1041bb76ff1Sjsg 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
1051bb76ff1Sjsg 
1061bb76ff1Sjsg 	if (!amdgpu_sriov_vf(adev)) {
1071bb76ff1Sjsg 		if (adev->gfxhub.funcs->mode2_save_regs)
1081bb76ff1Sjsg 			adev->gfxhub.funcs->mode2_save_regs(adev);
1091bb76ff1Sjsg 		if (adev->gfxhub.funcs->halt)
1101bb76ff1Sjsg 			adev->gfxhub.funcs->halt(adev);
1111bb76ff1Sjsg 		r = sienna_cichlid_mode2_suspend_ip(adev);
1121bb76ff1Sjsg 	}
1131bb76ff1Sjsg 
1141bb76ff1Sjsg 	return r;
1151bb76ff1Sjsg }
1161bb76ff1Sjsg 
sienna_cichlid_async_reset(struct work_struct * work)1171bb76ff1Sjsg static void sienna_cichlid_async_reset(struct work_struct *work)
1181bb76ff1Sjsg {
1191bb76ff1Sjsg 	struct amdgpu_reset_handler *handler;
1201bb76ff1Sjsg 	struct amdgpu_reset_control *reset_ctl =
1211bb76ff1Sjsg 		container_of(work, struct amdgpu_reset_control, reset_work);
1221bb76ff1Sjsg 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
1231bb76ff1Sjsg 
1241bb76ff1Sjsg 	list_for_each_entry(handler, &reset_ctl->reset_handlers,
1251bb76ff1Sjsg 			     handler_list) {
1261bb76ff1Sjsg 		if (handler->reset_method == reset_ctl->active_reset) {
1271bb76ff1Sjsg 			dev_dbg(adev->dev, "Resetting device\n");
1281bb76ff1Sjsg 			handler->do_reset(adev);
1291bb76ff1Sjsg 			break;
1301bb76ff1Sjsg 		}
1311bb76ff1Sjsg 	}
1321bb76ff1Sjsg }
1331bb76ff1Sjsg 
sienna_cichlid_mode2_reset(struct amdgpu_device * adev)1341bb76ff1Sjsg static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev)
1351bb76ff1Sjsg {
1361bb76ff1Sjsg 	/* disable BM */
1371bb76ff1Sjsg 	pci_clear_master(adev->pdev);
1381bb76ff1Sjsg 	return amdgpu_dpm_mode2_reset(adev);
1391bb76ff1Sjsg }
1401bb76ff1Sjsg 
1411bb76ff1Sjsg static int
sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control * reset_ctl,struct amdgpu_reset_context * reset_context)1421bb76ff1Sjsg sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
1431bb76ff1Sjsg 			      struct amdgpu_reset_context *reset_context)
1441bb76ff1Sjsg {
1451bb76ff1Sjsg 	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
1461bb76ff1Sjsg 	int r;
1471bb76ff1Sjsg 
1481bb76ff1Sjsg 	r = sienna_cichlid_mode2_reset(adev);
1491bb76ff1Sjsg 	if (r) {
1501bb76ff1Sjsg 		dev_err(adev->dev,
1511bb76ff1Sjsg 			"ASIC reset failed with error, %d ", r);
1521bb76ff1Sjsg 	}
1531bb76ff1Sjsg 	return r;
1541bb76ff1Sjsg }
1551bb76ff1Sjsg 
sienna_cichlid_mode2_restore_ip(struct amdgpu_device * adev)1561bb76ff1Sjsg static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
1571bb76ff1Sjsg {
1581bb76ff1Sjsg 	int i, r;
1591bb76ff1Sjsg 	struct psp_context *psp = &adev->psp;
1601bb76ff1Sjsg 
1611bb76ff1Sjsg 	r = psp_rlc_autoload_start(psp);
1621bb76ff1Sjsg 	if (r) {
1631bb76ff1Sjsg 		dev_err(adev->dev, "Failed to start rlc autoload\n");
1641bb76ff1Sjsg 		return r;
1651bb76ff1Sjsg 	}
1661bb76ff1Sjsg 
1671bb76ff1Sjsg 	/* Reinit GFXHUB */
1681bb76ff1Sjsg 	if (adev->gfxhub.funcs->mode2_restore_regs)
1691bb76ff1Sjsg 		adev->gfxhub.funcs->mode2_restore_regs(adev);
1701bb76ff1Sjsg 	adev->gfxhub.funcs->init(adev);
1711bb76ff1Sjsg 	r = adev->gfxhub.funcs->gart_enable(adev);
1721bb76ff1Sjsg 	if (r) {
1731bb76ff1Sjsg 		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
1741bb76ff1Sjsg 		return r;
1751bb76ff1Sjsg 	}
1761bb76ff1Sjsg 
1771bb76ff1Sjsg 	for (i = 0; i < adev->num_ip_blocks; i++) {
1781bb76ff1Sjsg 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1791bb76ff1Sjsg 			r = adev->ip_blocks[i].version->funcs->resume(adev);
1801bb76ff1Sjsg 			if (r) {
1811bb76ff1Sjsg 				dev_err(adev->dev,
1821bb76ff1Sjsg 					"resume of IP block <%s> failed %d\n",
1831bb76ff1Sjsg 					adev->ip_blocks[i].version->funcs->name, r);
1841bb76ff1Sjsg 				return r;
1851bb76ff1Sjsg 			}
1861bb76ff1Sjsg 
1871bb76ff1Sjsg 			adev->ip_blocks[i].status.hw = true;
1881bb76ff1Sjsg 		}
1891bb76ff1Sjsg 	}
1901bb76ff1Sjsg 
1911bb76ff1Sjsg 	for (i = 0; i < adev->num_ip_blocks; i++) {
1921bb76ff1Sjsg 		if (!(adev->ip_blocks[i].version->type ==
1931bb76ff1Sjsg 			      AMD_IP_BLOCK_TYPE_GFX ||
1941bb76ff1Sjsg 		      adev->ip_blocks[i].version->type ==
1951bb76ff1Sjsg 			      AMD_IP_BLOCK_TYPE_SDMA))
1961bb76ff1Sjsg 			continue;
1971bb76ff1Sjsg 		r = adev->ip_blocks[i].version->funcs->resume(adev);
1981bb76ff1Sjsg 		if (r) {
1991bb76ff1Sjsg 			dev_err(adev->dev,
2001bb76ff1Sjsg 				"resume of IP block <%s> failed %d\n",
2011bb76ff1Sjsg 				adev->ip_blocks[i].version->funcs->name, r);
2021bb76ff1Sjsg 			return r;
2031bb76ff1Sjsg 		}
2041bb76ff1Sjsg 
2051bb76ff1Sjsg 		adev->ip_blocks[i].status.hw = true;
2061bb76ff1Sjsg 	}
2071bb76ff1Sjsg 
2081bb76ff1Sjsg 	for (i = 0; i < adev->num_ip_blocks; i++) {
2091bb76ff1Sjsg 		if (!(adev->ip_blocks[i].version->type ==
2101bb76ff1Sjsg 			      AMD_IP_BLOCK_TYPE_GFX ||
2111bb76ff1Sjsg 		      adev->ip_blocks[i].version->type ==
2121bb76ff1Sjsg 			      AMD_IP_BLOCK_TYPE_SDMA))
2131bb76ff1Sjsg 			continue;
2141bb76ff1Sjsg 
2151bb76ff1Sjsg 		if (adev->ip_blocks[i].version->funcs->late_init) {
2161bb76ff1Sjsg 			r = adev->ip_blocks[i].version->funcs->late_init(
2171bb76ff1Sjsg 				(void *)adev);
2181bb76ff1Sjsg 			if (r) {
2191bb76ff1Sjsg 				dev_err(adev->dev,
2201bb76ff1Sjsg 					"late_init of IP block <%s> failed %d after reset\n",
2211bb76ff1Sjsg 					adev->ip_blocks[i].version->funcs->name,
2221bb76ff1Sjsg 					r);
2231bb76ff1Sjsg 				return r;
2241bb76ff1Sjsg 			}
2251bb76ff1Sjsg 		}
2261bb76ff1Sjsg 		adev->ip_blocks[i].status.late_initialized = true;
2271bb76ff1Sjsg 	}
2281bb76ff1Sjsg 
2291bb76ff1Sjsg 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2301bb76ff1Sjsg 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2311bb76ff1Sjsg 
2321bb76ff1Sjsg 	return r;
2331bb76ff1Sjsg }
2341bb76ff1Sjsg 
2351bb76ff1Sjsg static int
sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control * reset_ctl,struct amdgpu_reset_context * reset_context)2361bb76ff1Sjsg sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
2371bb76ff1Sjsg 				  struct amdgpu_reset_context *reset_context)
2381bb76ff1Sjsg {
2391bb76ff1Sjsg 	int r;
2401bb76ff1Sjsg 	struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
2411bb76ff1Sjsg 
2421bb76ff1Sjsg 	dev_info(tmp_adev->dev,
2431bb76ff1Sjsg 			"GPU reset succeeded, trying to resume\n");
2441bb76ff1Sjsg 	r = sienna_cichlid_mode2_restore_ip(tmp_adev);
2451bb76ff1Sjsg 	if (r)
2461bb76ff1Sjsg 		goto end;
2471bb76ff1Sjsg 
2481bb76ff1Sjsg 	/*
2491bb76ff1Sjsg 	* Add this ASIC as tracked as reset was already
2501bb76ff1Sjsg 	* complete successfully.
2511bb76ff1Sjsg 	*/
2521bb76ff1Sjsg 	amdgpu_register_gpu_instance(tmp_adev);
2531bb76ff1Sjsg 
2541bb76ff1Sjsg 	/* Resume RAS */
2551bb76ff1Sjsg 	amdgpu_ras_resume(tmp_adev);
2561bb76ff1Sjsg 
2571bb76ff1Sjsg 	amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
2581bb76ff1Sjsg 
2591bb76ff1Sjsg 	r = amdgpu_ib_ring_tests(tmp_adev);
2601bb76ff1Sjsg 	if (r) {
2611bb76ff1Sjsg 		dev_err(tmp_adev->dev,
2621bb76ff1Sjsg 			"ib ring test failed (%d).\n", r);
2631bb76ff1Sjsg 		r = -EAGAIN;
2641bb76ff1Sjsg 		goto end;
2651bb76ff1Sjsg 	}
2661bb76ff1Sjsg 
2671bb76ff1Sjsg end:
2681bb76ff1Sjsg 	if (r)
2691bb76ff1Sjsg 		return -EAGAIN;
2701bb76ff1Sjsg 	else
2711bb76ff1Sjsg 		return r;
2721bb76ff1Sjsg }
2731bb76ff1Sjsg 
2741bb76ff1Sjsg static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = {
2751bb76ff1Sjsg 	.reset_method		= AMD_RESET_METHOD_MODE2,
2761bb76ff1Sjsg 	.prepare_env		= NULL,
2771bb76ff1Sjsg 	.prepare_hwcontext	= sienna_cichlid_mode2_prepare_hwcontext,
2781bb76ff1Sjsg 	.perform_reset		= sienna_cichlid_mode2_perform_reset,
2791bb76ff1Sjsg 	.restore_hwcontext	= sienna_cichlid_mode2_restore_hwcontext,
2801bb76ff1Sjsg 	.restore_env		= NULL,
2811bb76ff1Sjsg 	.do_reset		= sienna_cichlid_mode2_reset,
2821bb76ff1Sjsg };
2831bb76ff1Sjsg 
sienna_cichlid_reset_init(struct amdgpu_device * adev)2841bb76ff1Sjsg int sienna_cichlid_reset_init(struct amdgpu_device *adev)
2851bb76ff1Sjsg {
2861bb76ff1Sjsg 	struct amdgpu_reset_control *reset_ctl;
2871bb76ff1Sjsg 
2881bb76ff1Sjsg 	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
2891bb76ff1Sjsg 	if (!reset_ctl)
2901bb76ff1Sjsg 		return -ENOMEM;
2911bb76ff1Sjsg 
2921bb76ff1Sjsg 	reset_ctl->handle = adev;
2931bb76ff1Sjsg 	reset_ctl->async_reset = sienna_cichlid_async_reset;
2941bb76ff1Sjsg 	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
2951bb76ff1Sjsg 	reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler;
2961bb76ff1Sjsg 
2971bb76ff1Sjsg 	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
2981bb76ff1Sjsg 	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
2991bb76ff1Sjsg 	/* Only mode2 is handled through reset control now */
3001bb76ff1Sjsg 	amdgpu_reset_add_handler(reset_ctl, &sienna_cichlid_mode2_handler);
3011bb76ff1Sjsg 
3021bb76ff1Sjsg 	adev->reset_cntl = reset_ctl;
3031bb76ff1Sjsg 
3041bb76ff1Sjsg 	return 0;
3051bb76ff1Sjsg }
3061bb76ff1Sjsg 
sienna_cichlid_reset_fini(struct amdgpu_device * adev)3071bb76ff1Sjsg int sienna_cichlid_reset_fini(struct amdgpu_device *adev)
3081bb76ff1Sjsg {
3091bb76ff1Sjsg 	kfree(adev->reset_cntl);
3101bb76ff1Sjsg 	adev->reset_cntl = NULL;
3111bb76ff1Sjsg 	return 0;
3121bb76ff1Sjsg }
313