1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "kernel/gpu/rc/kernel_rc.h"
25 
26 #include "kernel/core/locks.h"
27 #include "kernel/diagnostics/journal.h"
28 #include "kernel/gpu/device/device.h"
29 #include "kernel/gpu/fifo/kernel_channel_group.h"
30 #include "kernel/gpu/fifo/kernel_channel_group_api.h"
31 #include "kernel/gpu/mmu/kern_gmmu.h"
32 #include "kernel/os/os.h"
33 #include "rmapi/client.h"
34 #include "rmapi/rs_utils.h"
35 
36 #include "ctrl/ctrl506f.h"
37 
38 #include "libraries/utils/nvprintf.h"
39 #include "nverror.h"
40 #include "nvtypes.h"
41 #include "objtmr.h"
42 #include "vgpu/rpc.h"
43 
44 
45 static NV_STATUS
46 _vgpuRcResetCallback
47 (
48     NvHandle          hClient,
49     NvHandle          hDevice,
50     NvHandle          hChannel,
51     RC_ERROR_CONTEXT *pRcErrorContext
52 )
53 {
54     OBJSYS   *pSys   = SYS_GET_INSTANCE();
55     NV_STATUS status = NV_OK;
56 
57     if (osCondAcquireRmSema(pSys->pSema) == NV_OK)
58     {
59         if (rmGpuLocksAcquire(GPU_LOCK_FLAGS_COND_ACQUIRE,
60                               RM_LOCK_MODULES_RC) == NV_OK)
61         {
62             THREAD_STATE_NODE                             threadState;
63             NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL_PARAMS params = {0};
64 
65             threadStateInitISRAndDeferredIntHandler(
66                 &threadState,
67                 pRcErrorContext->pGpu,
68                 THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER);
69 
70             params.engineID   = pRcErrorContext->EngineId;
71             params.exceptType = pRcErrorContext->exceptType;
72 
73             NV_RM_RPC_CONTROL(pRcErrorContext->pGpu,
74                               hClient,
75                               hChannel,
76                               NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL,
77                               &params,
78                               sizeof params,
79                               status);
80 
81             threadStateFreeISRAndDeferredIntHandler(
82                 &threadState,
83                 pRcErrorContext->pGpu,
84                 THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER);
85 
86             portMemFree(pRcErrorContext);
87             rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
88         }
89         else
90         {
91             status = NV_ERR_STATE_IN_USE;
92         }
93         osReleaseRmSema(pSys->pSema, NULL);
94     }
95     else
96     {
97         status = NV_ERR_STATE_IN_USE;
98     }
99 
100     return status;
101 }
102 
103 
104 //
105 // krcResetCallback is called by both LDDM and MODS
106 // When adding more function parameters make sure to use datatypes that are
107 // defined in nvtypes.h and also update the RC_RESET_CALLBACK typedef in
108 // sdk/nvidia/inc/rmcd.h
109 //
110 NvU32
111 krcResetCallback
112 (
113     NvHandle hClient,
114     NvHandle hDevice,
115     NvHandle hFifo,
116     NvHandle hChannel,
117     void    *pContext,
118     NvBool   bClearRc
119 )
120 {
121     THREAD_STATE_NODE threadState;
122     RC_ERROR_CONTEXT *pRcErrorContext = (RC_ERROR_CONTEXT *)pContext;
123     OBJSYS           *pSys            = SYS_GET_INSTANCE();
124     NV_STATUS         status          = NV_ERR_GENERIC;
125 
126     if (pRcErrorContext != NULL)
127     {
128         if (bClearRc)
129         {
130             //
131             // This is an error condition encountered where the caller
132             // wants to free the RC allocated data structure and nothing
133             // else.  Currently, only called by the KMD when a TDR occurs
134             // and there are pending RCs that needs to be cancelled.
135             //
136             portMemFree(pRcErrorContext);
137             status = NV_OK;
138         }
139         else if (IS_VIRTUAL(pRcErrorContext->pGpu))
140         {
141             status = _vgpuRcResetCallback(hClient,
142                                           hDevice,
143                                           hChannel,
144                                           pRcErrorContext);
145         }
146         else if (osCondAcquireRmSema(pSys->pSema) == NV_OK)
147         {
148             if (rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_RC) ==
149                 NV_OK)
150             {
151                 RsClient      *pClient;
152                 KernelChannel *pKernelChannel = NULL;
153 
154                 threadStateInitISRAndDeferredIntHandler(
155                     &threadState,
156                     pRcErrorContext->pGpu,
157                     THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER);
158 
159                 NV_ASSERT_OK_OR_GOTO(
160                     status,
161                     serverGetClientUnderLock(&g_resServ, hClient, &pClient),
162                     error_cleanup);
163                 NV_ASSERT_OK_OR_GOTO(
164                     status,
165                     CliGetKernelChannel(pClient, hChannel, &pKernelChannel),
166                     error_cleanup);
167 
168                 NV_ASSERT_OR_ELSE(pKernelChannel != NULL,
169                                   status = NV_ERR_INVALID_STATE;
170                                   goto error_cleanup);
171 
172                 {
173                     NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL_PARAMS params = {0};
174                     RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pRcErrorContext->pGpu);
175 
176                     // Client lock is already obtained above.
177                     status = pRmApi->Control(pRmApi,
178                         RES_GET_CLIENT_HANDLE(pKernelChannel),
179                         RES_GET_HANDLE(pKernelChannel),
180                         NV506F_CTRL_CMD_RESET_ISOLATED_CHANNEL,
181                         &params,
182                         sizeof params);
183                 }
184 
185                 threadStateFreeISRAndDeferredIntHandler(
186                     &threadState,
187                     pRcErrorContext->pGpu,
188                     THREAD_STATE_FLAGS_IS_DEFERRED_INT_HANDLER);
189 
190                 portMemFree(pRcErrorContext);
191                 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
192             }
193             else
194             {
195                 status = NV_ERR_STATE_IN_USE;
196             }
197             osReleaseRmSema(pSys->pSema, NULL);
198         }
199         else
200         {
201             status = NV_ERR_STATE_IN_USE;
202         }
203     }
204     else
205     {
206         // If no context then just skip....
207         NV_PRINTF(LEVEL_ERROR, "-- No context skipping reset of channel...\n");
208         status = NV_OK;
209     }
210 
211     return status;
212 
213 error_cleanup:
214     rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
215     osReleaseRmSema(pSys->pSema, NULL);
216     return status;
217 }
218 
219 
220 NvBool
221 krcErrorInvokeCallback_IMPL
222 (
223     OBJGPU                  *pGpu,
224     KernelRc                *pKernelRc,
225     KernelChannel           *pKernelChannel,
226     FIFO_MMU_EXCEPTION_DATA *pMmuExceptionData,
227     NvU32                    exceptType,
228     NvU32                    exceptLevel,
229     RM_ENGINE_TYPE           rmEngineType,
230     NvU32                    rcDiagRecStart
231 )
232 {
233     OBJSYS             *pSys              = SYS_GET_INSTANCE();
234     Journal            *pRcDB             = SYS_GET_RCDB(pSys);
235     OBJOS              *pOS               = SYS_GET_OS(pSys);
236     KernelMIGManager   *pKernelMigManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
237     RmClient           *pClient           = NULL;
238     RC_CALLBACK_STATUS  clientAction;
239     RM_ENGINE_TYPE      localRmEngineType  = rmEngineType;
240     NvU32               rcDiagRecOwner = RCDB_RCDIAG_DEFAULT_OWNER;
241     NV_STATUS           status;
242     NvBool              bReturn = NV_TRUE;
243 
244     NV_ASSERT_OR_RETURN(!gpumgrGetBcEnabledStatus(pGpu), bReturn);
245     NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelChannel != NULL, bReturn);
246 
247     pClient = dynamicCast(RES_GET_CLIENT(pKernelChannel), RmClient);
248     if (pClient == NULL)
249         return bReturn;
250 
251     //
252     // If SMC is enabled, RM need to notify partition local engineIds.
253     // Convert global ID to partition local
254     //
255     if (IS_MIG_IN_USE(pGpu) && RM_ENGINE_TYPE_IS_VALID(rmEngineType) &&
256         kmigmgrIsEnginePartitionable(pGpu, pKernelMigManager, rmEngineType))
257     {
258         MIG_INSTANCE_REF ref;
259         status = kmigmgrGetInstanceRefFromClient(pGpu,
260                                                  pKernelMigManager,
261                                                  RES_GET_CLIENT_HANDLE(pKernelChannel),
262                                                  &ref);
263         if (status != NV_OK)
264             return bReturn;
265 
266         if (!kmigmgrIsEngineInInstance(pGpu, pKernelMigManager, rmEngineType, ref))
267         {
268             // Notifier is requested for an unsupported engine
269             NV_PRINTF(LEVEL_ERROR, "RcErroCallback requested for an unsupported engine 0x%x (0x%x)\n",
270                                     gpuGetNv2080EngineType(rmEngineType), rmEngineType);
271             return bReturn;
272         }
273 
274         // Override the engine type with the local engine idx
275         status = kmigmgrGetGlobalToLocalEngineType(pGpu,
276                                                    pKernelMigManager,
277                                                    ref,
278                                                    rmEngineType,
279                                                    &localRmEngineType);
280         if (status != NV_OK)
281             return bReturn;
282     }
283 
284     if (pOS->osCheckCallback(pGpu))
285     {
286         NvHandle          hDevice, hFifo;
287         RC_ERROR_CONTEXT *pRcErrorContext = NULL;
288         Device           *pDevice;
289 
290         NV_ASSERT_OK_OR_RETURN(
291             deviceGetByGpu(RES_GET_CLIENT(pKernelChannel), pGpu, NV_TRUE, &pDevice));
292 
293         hDevice = RES_GET_HANDLE(pDevice);
294 
295 
296         if (!pKernelChannel->pKernelChannelGroupApi->pKernelChannelGroup
297                  ->bAllocatedByRm)
298         {
299             hFifo = RES_GET_PARENT_HANDLE(pKernelChannel);
300         }
301         else
302         {
303             hFifo = RES_GET_HANDLE(pKernelChannel);
304         }
305 
306         pRcErrorContext = portMemAllocNonPaged(sizeof *pRcErrorContext);
307         if (pRcErrorContext != NULL)
308         {
309             portMemSet(pRcErrorContext, 0, sizeof *pRcErrorContext);
310 
311             pRcErrorContext->pGpu       = pGpu;
312             pRcErrorContext->ChId       = pKernelChannel->ChID;
313             pRcErrorContext->secChId    = 0xFFFFFFFF;
314             pRcErrorContext->sechClient = RES_GET_CLIENT_HANDLE(pKernelChannel);
315             pRcErrorContext->exceptType = exceptType;
316             pRcErrorContext->EngineId   = gpuGetNv2080EngineType(localRmEngineType);
317             pRcErrorContext->subdeviceInstance = pGpu->subdeviceInstance;
318 
319             if (pMmuExceptionData != NULL)
320             {
321                 pRcErrorContext->addrLo    = pMmuExceptionData->addrLo;
322                 pRcErrorContext->addrHi    = pMmuExceptionData->addrHi;
323                 pRcErrorContext->faultType = pMmuExceptionData->faultType;
324                 pRcErrorContext->faultStr  = kgmmuGetFaultTypeString_HAL(
325                     GPU_GET_KERNEL_GMMU(pGpu),
326                     pMmuExceptionData->faultType);
327             }
328         }
329 
330         clientAction = pOS->osRCCallback(pGpu,
331                                          RES_GET_CLIENT_HANDLE(pKernelChannel),
332                                          hDevice,
333                                          hFifo,
334                                          RES_GET_HANDLE(pKernelChannel),
335                                          exceptLevel,
336                                          exceptType,
337                                          (NvU32 *)pRcErrorContext,
338                                          &krcResetCallback);
339 
340         if (clientAction == RC_CALLBACK_IGNORE ||
341             clientAction == RC_CALLBACK_ISOLATE_NO_RESET)
342         {
343             if (clientAction == RC_CALLBACK_IGNORE)
344             {
345                 NV_PRINTF(LEVEL_ERROR, "-- Drivers tells RM to ignore\n");
346             }
347 
348             //
349             // if osRCCallback returns RC_HANDLER_ISOLATE_NO_RESET or
350             // IGNORE, client won't call rcResetChannel to put channel back
351             // pRcErrorContext has to be released here
352             //
353             portMemFree(pRcErrorContext);
354         }
355         bReturn = (clientAction != RC_CALLBACK_IGNORE);
356     }
357     else
358     {
359         // use the new CliNotifyDeviceFifoEvent() notification method
360         NvRcNotification       params;
361         OBJTMR                *pTmr = GPU_GET_TIMER(pGpu);
362         NvU64                  time;
363         CLI_CHANNEL_CLASS_INFO classInfo;
364 
365         tmrGetCurrentTime(pTmr, &time);
366 
367         params.timeStamp.nanoseconds[0] = NvU64_HI32(time);
368         params.timeStamp.nanoseconds[1] = NvU64_LO32(time);
369         params.exceptLevel              = exceptLevel;
370         params.exceptType               = exceptType;
371 
372         // Get rc notifier index from class info
373 
374         CliGetChannelClassInfo(RES_GET_EXT_CLASS_ID(pKernelChannel),
375                                &classInfo);
376 
377         // notify the Fifo channel based event listeners
378         kchannelNotifyGeneric(pKernelChannel,
379                               classInfo.rcNotifierIndex,
380                               &params,
381                               sizeof(params));
382     }
383 
384     // update RC diagnostic records with process id and owner
385     if (rcDiagRecStart != INVALID_RCDB_RCDIAG_INDEX)
386     {
387         rcdbUpdateRcDiagRecContext(pRcDB,
388                                    rcDiagRecStart,
389                                    pRcDB->RcErrRptNextIdx - 1,
390                                    pClient->ProcID,
391                                    rcDiagRecOwner);
392     }
393     return bReturn;
394 }
395