1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 * SPDX-License-Identifier: MIT 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include "kernel/gpu/rc/kernel_rc.h" 25 26 #include "kernel/core/locks.h" 27 #include "kernel/diagnostics/journal.h" 28 #include "kernel/gpu/device/device.h" 29 #include "kernel/gpu/fifo/kernel_channel_group.h" 30 #include "kernel/gpu/fifo/kernel_channel_group_api.h" 31 #include "kernel/gpu/mmu/kern_gmmu.h" 32 #include "kernel/os/os.h" 33 #include "rmapi/client.h" 34 #include "rmapi/rs_utils.h" 35 36 #include "ctrl/ctrl506f.h" 37 38 #include "libraries/utils/nvprintf.h" 39 #include "nverror.h" 40 #include "nvtypes.h" 41 #include "objtmr.h" 42 #include "vgpu/rpc.h" 43 44 45 static NV_STATUS 46 _vgpuRcResetCallback 47 ( 48 NvHandle hClient, 49 NvHandle hDevice, 50 NvHandle hChannel, 51 RC_ERROR_CONTEXT *pRcErrorContext 52 ) 53 { 54 OBJSYS *pSys = SYS_GET_INSTANCE(); 55 NV_STATUS status = NV_OK; 56 57 if (osCondAcquireRmSema(pSys->pSema) == NV_OK) 58 { 59 if (rmGpuLocksAcquire(GPU_LOCK_FLAGS_COND_ACQUIRE, 60 RM_LOCK_MODULES_RC) == NV_OK) 61 { 62 THREAD_STATE_NODE threadState; 63 NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL_PARAMS params = {0}; 64 65 threadStateInitISRAndDeferredIntHandler( 66 &threadState, 67 pRcErrorContext->pGpu, 68 THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER); 69 70 params.engineID = pRcErrorContext->EngineId; 71 params.exceptType = pRcErrorContext->exceptType; 72 73 NV_RM_RPC_CONTROL(pRcErrorContext->pGpu, 74 hClient, 75 hChannel, 76 NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL, 77 ¶ms, 78 sizeof params, 79 status); 80 81 threadStateFreeISRAndDeferredIntHandler( 82 &threadState, 83 pRcErrorContext->pGpu, 84 THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER); 85 86 portMemFree(pRcErrorContext); 87 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL); 88 } 89 else 90 { 91 status = NV_ERR_STATE_IN_USE; 92 } 93 osReleaseRmSema(pSys->pSema, NULL); 94 } 95 else 96 { 97 status = NV_ERR_STATE_IN_USE; 98 } 99 100 return status; 101 } 102 103 104 // 105 // krcResetCallback is called by both LDDM and MODS 106 // When adding more function parameters make sure to use datatypes that are 107 // defined in nvtypes.h and also update the RC_RESET_CALLBACK typedef in 108 // sdk/nvidia/inc/rmcd.h 109 // 110 NvU32 111 krcResetCallback 112 ( 113 NvHandle hClient, 114 NvHandle hDevice, 115 NvHandle hFifo, 116 NvHandle hChannel, 117 void *pContext, 118 NvBool bClearRc 119 ) 120 { 121 THREAD_STATE_NODE threadState; 122 RC_ERROR_CONTEXT *pRcErrorContext = (RC_ERROR_CONTEXT *)pContext; 123 OBJSYS *pSys = SYS_GET_INSTANCE(); 124 NV_STATUS status = NV_ERR_GENERIC; 125 126 if (pRcErrorContext != NULL) 127 { 128 if (bClearRc) 129 { 130 // 131 // This is an error condition encountered where the caller 132 // wants to free the RC allocated data structure and nothing 133 // else. Currently, only called by the KMD when a TDR occurs 134 // and there are pending RCs that needs to be cancelled. 135 // 136 portMemFree(pRcErrorContext); 137 status = NV_OK; 138 } 139 else if (IS_VIRTUAL(pRcErrorContext->pGpu)) 140 { 141 status = _vgpuRcResetCallback(hClient, 142 hDevice, 143 hChannel, 144 pRcErrorContext); 145 } 146 else if (osCondAcquireRmSema(pSys->pSema) == NV_OK) 147 { 148 if (rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_RC) == 149 NV_OK) 150 { 151 RsClient *pClient; 152 KernelChannel *pKernelChannel = NULL; 153 154 threadStateInitISRAndDeferredIntHandler( 155 &threadState, 156 pRcErrorContext->pGpu, 157 THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER); 158 159 NV_ASSERT_OK_OR_GOTO( 160 status, 161 serverGetClientUnderLock(&g_resServ, hClient, &pClient), 162 error_cleanup); 163 NV_ASSERT_OK_OR_GOTO( 164 status, 165 CliGetKernelChannel(pClient, hChannel, &pKernelChannel), 166 error_cleanup); 167 168 NV_ASSERT_OR_ELSE(pKernelChannel != NULL, 169 status = NV_ERR_INVALID_STATE; 170 goto error_cleanup); 171 172 { 173 NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL_PARAMS params = {0}; 174 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pRcErrorContext->pGpu); 175 176 // Client lock is already obtained above. 177 status = pRmApi->Control(pRmApi, 178 RES_GET_CLIENT_HANDLE(pKernelChannel), 179 RES_GET_HANDLE(pKernelChannel), 180 NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL, 181 ¶ms, 182 sizeof params); 183 } 184 185 threadStateFreeISRAndDeferredIntHandler( 186 &threadState, 187 pRcErrorContext->pGpu, 188 THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER); 189 190 portMemFree(pRcErrorContext); 191 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL); 192 } 193 else 194 { 195 status = NV_ERR_STATE_IN_USE; 196 } 197 osReleaseRmSema(pSys->pSema, NULL); 198 } 199 else 200 { 201 status = NV_ERR_STATE_IN_USE; 202 } 203 } 204 else 205 { 206 // If no context then just skip.... 207 NV_PRINTF(LEVEL_ERROR, "-- No context skipping reset of channel...\n"); 208 status = NV_OK; 209 } 210 211 return status; 212 213 error_cleanup: 214 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL); 215 osReleaseRmSema(pSys->pSema, NULL); 216 return status; 217 } 218 219 220 NvBool 221 krcErrorInvokeCallback_IMPL 222 ( 223 OBJGPU *pGpu, 224 KernelRc *pKernelRc, 225 KernelChannel *pKernelChannel, 226 FIFO_MMU_EXCEPTION_DATA *pMmuExceptionData, 227 NvU32 exceptType, 228 NvU32 exceptLevel, 229 RM_ENGINE_TYPE rmEngineType, 230 NvU32 rcDiagRecStart 231 ) 232 { 233 OBJSYS *pSys = SYS_GET_INSTANCE(); 234 Journal *pRcDB = SYS_GET_RCDB(pSys); 235 OBJOS *pOS = SYS_GET_OS(pSys); 236 KernelMIGManager *pKernelMigManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu); 237 RmClient *pClient = NULL; 238 RC_CALLBACK_STATUS clientAction; 239 RM_ENGINE_TYPE localRmEngineType = rmEngineType; 240 NvU32 rcDiagRecOwner = RCDB_RCDIAG_DEFAULT_OWNER; 241 NV_STATUS status; 242 NvBool bReturn = NV_TRUE; 243 244 NV_ASSERT_OR_RETURN(!gpumgrGetBcEnabledStatus(pGpu), bReturn); 245 NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelChannel != NULL, bReturn); 246 247 pClient = dynamicCast(RES_GET_CLIENT(pKernelChannel), RmClient); 248 if (pClient == NULL) 249 return bReturn; 250 251 // 252 // If SMC is enabled, RM need to notify partition local engineIds. 253 // Convert global ID to partition local 254 // 255 if (IS_MIG_IN_USE(pGpu) && RM_ENGINE_TYPE_IS_VALID(rmEngineType) && 256 kmigmgrIsEnginePartitionable(pGpu, pKernelMigManager, rmEngineType)) 257 { 258 MIG_INSTANCE_REF ref; 259 status = kmigmgrGetInstanceRefFromClient(pGpu, 260 pKernelMigManager, 261 RES_GET_CLIENT_HANDLE(pKernelChannel), 262 &ref); 263 if (status != NV_OK) 264 return bReturn; 265 266 if (!kmigmgrIsEngineInInstance(pGpu, pKernelMigManager, rmEngineType, ref)) 267 { 268 // Notifier is requested for an unsupported engine 269 NV_PRINTF(LEVEL_ERROR, "RcErroCallback requested for an unsupported engine 0x%x (0x%x)\n", 270 gpuGetNv2080EngineType(rmEngineType), rmEngineType); 271 return bReturn; 272 } 273 274 // Override the engine type with the local engine idx 275 status = kmigmgrGetGlobalToLocalEngineType(pGpu, 276 pKernelMigManager, 277 ref, 278 rmEngineType, 279 &localRmEngineType); 280 if (status != NV_OK) 281 return bReturn; 282 } 283 284 if (pOS->osCheckCallback(pGpu)) 285 { 286 NvHandle hDevice, hFifo; 287 RC_ERROR_CONTEXT *pRcErrorContext = NULL; 288 Device *pDevice; 289 290 NV_ASSERT_OK_OR_RETURN( 291 deviceGetByGpu(RES_GET_CLIENT(pKernelChannel), pGpu, NV_TRUE, &pDevice)); 292 293 hDevice = RES_GET_HANDLE(pDevice); 294 295 296 if (!pKernelChannel->pKernelChannelGroupApi->pKernelChannelGroup 297 ->bAllocatedByRm) 298 { 299 hFifo = RES_GET_PARENT_HANDLE(pKernelChannel); 300 } 301 else 302 { 303 hFifo = RES_GET_HANDLE(pKernelChannel); 304 } 305 306 pRcErrorContext = portMemAllocNonPaged(sizeof *pRcErrorContext); 307 if (pRcErrorContext != NULL) 308 { 309 portMemSet(pRcErrorContext, 0, sizeof *pRcErrorContext); 310 311 pRcErrorContext->pGpu = pGpu; 312 pRcErrorContext->ChId = pKernelChannel->ChID; 313 pRcErrorContext->secChId = 0xFFFFFFFF; 314 pRcErrorContext->sechClient = RES_GET_CLIENT_HANDLE(pKernelChannel); 315 pRcErrorContext->exceptType = exceptType; 316 pRcErrorContext->EngineId = gpuGetNv2080EngineType(localRmEngineType); 317 pRcErrorContext->subdeviceInstance = pGpu->subdeviceInstance; 318 319 if (pMmuExceptionData != NULL) 320 { 321 pRcErrorContext->addrLo = pMmuExceptionData->addrLo; 322 pRcErrorContext->addrHi = pMmuExceptionData->addrHi; 323 pRcErrorContext->faultType = pMmuExceptionData->faultType; 324 pRcErrorContext->faultStr = kgmmuGetFaultTypeString_HAL( 325 GPU_GET_KERNEL_GMMU(pGpu), 326 pMmuExceptionData->faultType); 327 } 328 } 329 330 clientAction = pOS->osRCCallback(pGpu, 331 RES_GET_CLIENT_HANDLE(pKernelChannel), 332 hDevice, 333 hFifo, 334 RES_GET_HANDLE(pKernelChannel), 335 exceptLevel, 336 exceptType, 337 (NvU32 *)pRcErrorContext, 338 &krcResetCallback); 339 340 if (clientAction == RC_CALLBACK_IGNORE || 341 clientAction == RC_CALLBACK_ISOLATE_NO_RESET) 342 { 343 if (clientAction == RC_CALLBACK_IGNORE) 344 { 345 NV_PRINTF(LEVEL_ERROR, "-- Drivers tells RM to ignore\n"); 346 } 347 348 // 349 // if osRCCallback returns RC_HANDLER_ISOLATE_NO_RESET or 350 // IGNORE, client won't call rcResetChannel to put channel back 351 // pRcErrorContext has to be released here 352 // 353 portMemFree(pRcErrorContext); 354 } 355 bReturn = (clientAction != RC_CALLBACK_IGNORE); 356 } 357 else 358 { 359 // use the new CliNotifyDeviceFifoEvent() notification method 360 NvRcNotification params; 361 OBJTMR *pTmr = GPU_GET_TIMER(pGpu); 362 NvU64 time; 363 CLI_CHANNEL_CLASS_INFO classInfo; 364 365 tmrGetCurrentTime(pTmr, &time); 366 367 params.timeStamp.nanoseconds[0] = NvU64_HI32(time); 368 params.timeStamp.nanoseconds[1] = NvU64_LO32(time); 369 params.exceptLevel = exceptLevel; 370 params.exceptType = exceptType; 371 372 // Get rc notifier index from class info 373 374 CliGetChannelClassInfo(RES_GET_EXT_CLASS_ID(pKernelChannel), 375 &classInfo); 376 377 // notify the Fifo channel based event listeners 378 kchannelNotifyGeneric(pKernelChannel, 379 classInfo.rcNotifierIndex, 380 ¶ms, 381 sizeof(params)); 382 } 383 384 // update RC diagnostic records with process id and owner 385 if (rcDiagRecStart != INVALID_RCDB_RCDIAG_INDEX) 386 { 387 rcdbUpdateRcDiagRecContext(pRcDB, 388 rcDiagRecStart, 389 pRcDB->RcErrRptNextIdx - 1, 390 pClient->ProcID, 391 rcDiagRecOwner); 392 } 393 return bReturn; 394 } 395