1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "kernel/gpu/rc/kernel_rc.h"
25 
26 #include "kernel/core/locks.h"
27 #include "kernel/core/system.h"
28 #include "kernel/gpu/bif/kernel_bif.h"
29 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
30 #include "kernel/os/os.h"
31 #include "kernel/platform/chipset/chipset.h"
32 #include "kernel/rmapi/client.h"
33 
34 
35 #include "libraries/utils/nvprintf.h"
36 #include "nvRmReg.h"
37 #include "nverror.h"
38 #include "nvtypes.h"
39 #include "objtmr.h"
40 
41 
42 static void _krcInitRegistryOverrides(OBJGPU *pGpu, KernelRc *pKernelRc);
43 static void _krcLogUuidOnce(OBJGPU *pGpu, KernelRc *pKernelRc);
44 
45 
46 NV_STATUS
krcConstructEngine_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,ENGDESCRIPTOR engDescriptor)47 krcConstructEngine_IMPL
48 (
49     OBJGPU        *pGpu,
50     KernelRc      *pKernelRc,
51     ENGDESCRIPTOR  engDescriptor
52 )
53 {
54     _krcInitRegistryOverrides(pGpu, pKernelRc);
55 
56     return NV_OK;
57 }
58 
59 
60 void
krcInitRegistryOverridesDelayed_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc)61 krcInitRegistryOverridesDelayed_IMPL
62 (
63     OBJGPU   *pGpu,
64     KernelRc *pKernelRc
65 )
66 {
67     KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
68     NvU32 dword = 0;
69     (void) dword;
70 
71 
72     dword = 0;
73     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_ROBUST_CHANNELS, &dword) !=
74         NV_OK)
75     {
76 #if RMCFG_FEATURE_PLATFORM_WINDOWS || RMCFG_FEATURE_PLATFORM_GSP || \
77     RMCFG_FEATURE_PLATFORM_UNIX
78         dword = NV_REG_STR_RM_ROBUST_CHANNELS_ENABLE;
79 #else
80 #error "unrecognized platform"
81 #endif
82     }
83     pKernelRc->bRobustChannelsEnabled = (dword ==
84                                          NV_REG_STR_RM_ROBUST_CHANNELS_ENABLE);
85 
86 
87     dword = 0;
88     //
89     // Force uncached pushbuffers for robust channel.
90     //
91     // We used to allocate the recovery channel as uncached, which is achieved
92     // by allocating physically contiguous memory then remap that uncached.
93     // However, this caused allocations issues in cases which shares a channel
94     // with the robust channel, and ended up requesting sizeof(RC + pushbuffer)
95     // of contiguous memory (bug 73669).
96     //
97     // We therefore switched to cached allocations, with a few exceptions where
98     // an uncached pushbuffer is still needed:
99     // - When the system does not support CPU cache snooping (bugs 292461 and
100     // 976485).
101     //
102     if ((osReadRegistryDword(pGpu,
103                              NV_REG_STR_USE_UNCACHED_PCI_MAPPINGS,
104                              &dword) == NV_OK &&
105          dword != 0) ||
106         ((pKernelBif != NULL) &&
107          !kbifIsSnoopDmaCapable(pGpu, pKernelBif)))
108     {
109         pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_ALLOC_UNCACHED_PCI;
110     }
111 }
112 
113 
114 static void
_krcInitRegistryOverrides(OBJGPU * pGpu,KernelRc * pKernelRc)115 _krcInitRegistryOverrides
116 (
117     OBJGPU   *pGpu,
118     KernelRc *pKernelRc
119 )
120 {
121     NvU32 dword = 0;
122     (void) dword;
123 
124     dword = 0;
125     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_BREAK_ON_RC, &dword) != NV_OK)
126     {
127         dword = NV_REG_STR_RM_BREAK_ON_RC_DEFAULT;
128     }
129 
130     pKernelRc->bBreakOnRc = (dword == NV_REG_STR_RM_BREAK_ON_RC_ENABLE);
131 
132     // Allow driver registry key RmBreak to override Device Key
133     if (DRF_VAL(_DEBUG, _BREAK_FLAGS, _RC, SYS_GET_INSTANCE()->debugFlags) ==
134         NV_DEBUG_BREAK_FLAGS_RC_ENABLE)
135     {
136         pKernelRc->bBreakOnRc = NV_TRUE;
137     }
138 
139     if (pKernelRc->bBreakOnRc)
140     {
141         NV_PRINTF(LEVEL_INFO, "Breakpoint on RC Error is enabled\n");
142     }
143     else
144     {
145         NV_PRINTF(LEVEL_INFO, "Breakpoint on RC Error is disabled\n");
146     }
147 
148 
149     if (osReadRegistryDword(pGpu,
150                             NV_REG_STR_RM_WATCHDOG_TIMEOUT,
151                             &pKernelRc->watchdogPersistent.timeoutSecs) !=
152             NV_OK ||
153         pKernelRc->watchdogPersistent.timeoutSecs == 0)
154     {
155         pKernelRc->watchdogPersistent.timeoutSecs =
156             NV_REG_STR_RM_WATCHDOG_TIMEOUT_DEFAULT;
157     }
158     if (osReadRegistryDword(pGpu,
159                             NV_REG_STR_RM_WATCHDOG_INTERVAL,
160                             &pKernelRc->watchdogPersistent.intervalSecs) !=
161             NV_OK ||
162         pKernelRc->watchdogPersistent.intervalSecs == 0)
163     {
164         pKernelRc->watchdogPersistent.intervalSecs =
165             NV_REG_STR_RM_WATCHDOG_INTERVAL_DEFAULT;
166     }
167 
168     if (pKernelRc->watchdogPersistent.intervalSecs >
169         pKernelRc->watchdogPersistent.timeoutSecs)
170     {
171         pKernelRc->watchdogPersistent.intervalSecs =
172             pKernelRc->watchdogPersistent.timeoutSecs;
173     }
174 
175 
176     dword = 0;
177     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RC_WATCHDOG, &dword) == NV_OK)
178     {
179         if (dword == NV_REG_STR_RM_RC_WATCHDOG_DISABLE)
180         {
181             pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
182         }
183     }
184     else if (IS_EMULATION(pGpu) || IS_SIMULATION(pGpu))
185     {
186         pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
187     }
188     else if (gpuIsCCFeatureEnabled(pGpu))
189     {
190         pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
191     }
192 
193     dword = 0;
194     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_DO_LOG_RC_EVENTS, &dword) ==
195         NV_OK)
196     {
197         pKernelRc->bLogEvents = (dword == NV_REG_STR_RM_DO_LOG_RC_ENABLE);
198         if (pKernelRc->bLogEvents)
199         {
200             NV_PRINTF(LEVEL_INFO, "RC Error Logging is enabled\n");
201 #if defined(DEBUG)
202             // Don't print out the initialization log on a retail build
203             osErrorLog(pGpu, ROBUST_CHANNEL_RC_LOGGING_ENABLED, "");
204 #endif
205         }
206     }
207 
208     //
209     // Do RC on BAR faults by default (For bug 1842228).
210     // Only applicable to Volta+ chips.
211     //
212     pKernelRc->bRcOnBar2Fault = NV_TRUE;
213 
214 }
215 
216 
217 static void
_krcLogUuidOnce(OBJGPU * pGpu,KernelRc * pKernelRc)218 _krcLogUuidOnce
219 (
220     OBJGPU   *pGpu,
221     KernelRc *pKernelRc
222 )
223 {
224     if (!pKernelRc->bGpuUuidLoggedOnce)
225     {
226         NvU8 *gidString = NULL;
227         NvU32 gidStrlen;
228 
229         if (gpuGetGidInfo(pGpu,
230                 &gidString,
231                 &gidStrlen,
232                 (DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _FORMAT, _ASCII) |
233                  DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _TYPE,   _SHA1))) ==
234             NV_OK)
235         {
236             portDbgPrintf("NVRM: GPU at PCI:%04x:%02x:%02x: %s\n",
237                           gpuGetDomain(pGpu),
238                           gpuGetBus(pGpu),
239                           gpuGetDevice(pGpu),
240                           gidString);
241             portMemFree(gidString);
242         }
243 
244         if (pGpu->boardInfo != NULL && pGpu->boardInfo->serialNumber[0] != '\0')
245         {
246             portDbgPrintf("NVRM: GPU Board Serial Number: %s\n",
247                           pGpu->boardInfo->serialNumber);
248         }
249 
250         pKernelRc->bGpuUuidLoggedOnce = NV_TRUE;
251     }
252 }
253 
254 
255 void
krcGetMigAttributionForError_KERNEL(KernelRc * pKernelRc,NvU32 exceptType,NvU16 * pGpuPartitionId,NvU16 * pComputeInstanceId)256 krcGetMigAttributionForError_KERNEL
257 (
258     KernelRc *pKernelRc,
259     NvU32     exceptType,
260     NvU16    *pGpuPartitionId,
261     NvU16    *pComputeInstanceId
262 )
263 {
264     if (pGpuPartitionId != NULL)
265     {
266         *pGpuPartitionId = KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID;
267     }
268     if (pComputeInstanceId != NULL)
269     {
270         *pComputeInstanceId = KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID;
271     }
272 }
273 
274 
275 void
krcReportXid_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,NvU32 exceptType,const char * pMsg)276 krcReportXid_IMPL
277 (
278     OBJGPU     *pGpu,
279     KernelRc   *pKernelRc,
280     NvU32       exceptType,
281     const char *pMsg
282 )
283 {
284     //
285     // Log the RC error to the OS
286     //
287     // Enforce the policy of gating the log output by "RmLogonRC" regkey.
288     // Some of our callers do not abide by this rule.
289     // That is how they want it under Windows.
290     //
291     if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)
292     {
293         NvU16          gpuPartitionId;
294         NvU16          computeInstanceId;
295         KernelChannel *pKernelChannel = krcGetChannelInError(pKernelRc);
296         char          *current_procname = NULL;
297 
298         // Channels are populated with osGetCurrentProcessName() and pid of
299         // their process at creation-time. If no channel was found, mark unknown
300         const char *procname = "<unknown>";
301         char pid_string[12] = "'<unknown>'";
302 
303         //
304         // Get PID of channel creator if available, or get the current PID for
305         // exception types that never have an associated channel
306         //
307         // Check for API lock since this can be called from parallel init
308         // path without API lock, and RES_GET_CLIENT requires API lock
309         //
310         if (rmapiLockIsOwner() && (pKernelChannel != NULL))
311         {
312             RsClient *pClient = RES_GET_CLIENT(pKernelChannel);
313             RmClient *pRmClient = dynamicCast(pClient, RmClient);
314             procname = pRmClient->name;
315             nvDbgSnprintf(pid_string, sizeof(pid_string), "%u", pKernelChannel->ProcessID);
316         }
317         else if (exceptType == GSP_RPC_TIMEOUT)
318         {
319             NvU32 current_pid = osGetCurrentProcess();
320 
321             nvDbgSnprintf(pid_string, sizeof(pid_string), "%u", current_pid);
322 
323             current_procname = portMemAllocNonPaged(NV_PROC_NAME_MAX_LENGTH);
324             if (current_procname != NULL)
325             {
326                 osGetCurrentProcessName(current_procname, NV_PROC_NAME_MAX_LENGTH);
327                 procname = current_procname;
328             }
329         }
330 
331         _krcLogUuidOnce(pGpu, pKernelRc);
332 
333         krcGetMigAttributionForError_HAL(pKernelRc,
334                                          exceptType,
335                                          &gpuPartitionId,
336                                          &computeInstanceId);
337 
338         if (gpuPartitionId    != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID &&
339             computeInstanceId != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID)
340         {
341             // Attribute this XID to both GPU / Compute instance
342             portDbgPrintf(
343                 "NVRM: Xid (PCI:%04x:%02x:%02x GPU-I:%02u GPU-CI:%02u): %d, pid=%s, name=%s, %s\n",
344                 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
345                 gpuPartitionId, computeInstanceId,
346                 exceptType,
347                 pid_string,
348                 procname,
349                 pMsg != NULL ? pMsg : "");
350         }
351         else if (gpuPartitionId != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID)
352         {
353             // Attribute this XID to GPU instance only
354             portDbgPrintf(
355                 "NVRM: Xid (PCI:%04x:%02x:%02x GPU-I:%02u): %d, pid=%s, name=%s, %s\n",
356                 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
357                 gpuPartitionId,
358                 exceptType,
359                 pid_string,
360                 procname,
361                 pMsg != NULL ? pMsg : "");
362         }
363         else
364         {
365             // Legacy (no attribution) XID reporting
366             portDbgPrintf("NVRM: Xid (PCI:%04x:%02x:%02x): %d, pid=%s, name=%s, %s\n",
367                 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
368                 exceptType,
369                 pid_string,
370                 procname,
371                 pMsg != NULL ? pMsg : "");
372         }
373 
374         portMemFree(current_procname);
375     }
376 }
377 
378 
379 NvBool
krcTestAllowAlloc_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,NvU32 failMask)380 krcTestAllowAlloc_IMPL
381 (
382     OBJGPU   *pGpu,
383     KernelRc *pKernelRc,
384     NvU32     failMask
385 )
386 {
387     if (pKernelRc->bRobustChannelsEnabled &&
388         (pKernelRc->watchdog.allocFailMask & failMask))
389     {
390         OBJTMR   *pTmr = GPU_GET_TIMER(pGpu);
391         NvU64     time;
392         NV_STATUS status = tmrGetCurrentTime(pTmr, &time);
393 
394         //
395         // randomly fail this alloc based on NV timer
396         // assuming here that we don't get allocations within 128ns of each
397         // other
398         //
399         if (status == NV_OK && ((time & 0xff) > (0xffu / 2)))
400             return NV_FALSE;
401     }
402 
403     return NV_TRUE;
404 }
405 
406 
407 NV_STATUS
krcCheckBusError_KERNEL(OBJGPU * pGpu,KernelRc * pKernelRc)408 krcCheckBusError_KERNEL
409 (
410     OBJGPU   *pGpu,
411     KernelRc *pKernelRc
412 )
413 {
414     KernelBif        *pKernelBif               = GPU_GET_KERNEL_BIF(pGpu);
415     OBJCL            *pCl                      = SYS_GET_CL(SYS_GET_INSTANCE());
416     NvU32             clDevCtrlStatusFlags     = 0;
417     NvU32             clDevCtrlStatusFlags_Org = 0;
418     NvU32             clDevCtrlStatus          = 0;
419     PcieAerCapability clAer;
420 
421 
422     // PCI-E provides extended error reporting
423     if (pKernelBif == NULL || kbifGetBusIntfType_HAL(pKernelBif) !=
424                                   NV2080_CTRL_BUS_INFO_TYPE_PCI_EXPRESS)
425     {
426         return NV_OK;
427     }
428 
429     // Clear PCIe dev ctrl/status errors and AER errors
430     kbifClearConfigErrors(pGpu, pKernelBif, NV_TRUE,
431                           KBIF_CLEAR_XVE_AER_ALL_MASK);
432 
433     // Corelogic device control status
434     if (pCl != NULL &&
435         clPcieReadDevCtrlStatus(pGpu, pCl,
436                                 &clDevCtrlStatusFlags,
437                                 &clDevCtrlStatus) == NV_OK &&
438         clDevCtrlStatusFlags != 0)
439     {
440         NV_PRINTF(LEVEL_ERROR,
441             "PCI-E corelogic status has pending errors (CL_PCIE_DEV_CTRL_STATUS = %08X):\n",
442             clDevCtrlStatus);
443 
444         clDevCtrlStatusFlags_Org = clDevCtrlStatusFlags;
445 
446         if (clDevCtrlStatusFlags &
447             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_CORR_ERROR)
448         {
449             NV_PRINTF(LEVEL_ERROR, "     _CORR_ERROR_DETECTED\n");
450             // not much interested in this one
451             clDevCtrlStatusFlags &=
452                 ~NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_CORR_ERROR;
453         }
454         if (clDevCtrlStatusFlags &
455             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_NON_FATAL_ERROR)
456         {
457             NV_PRINTF(LEVEL_ERROR, "     _NON_FATAL_ERROR_DETECTED\n");
458         }
459         if (clDevCtrlStatusFlags &
460             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_FATAL_ERROR)
461         {
462             NV_PRINTF(LEVEL_ERROR, "     _FATAL_ERROR_DETECTED\n");
463         }
464         if (clDevCtrlStatusFlags &
465             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_UNSUPP_REQUEST)
466         {
467             NV_PRINTF(LEVEL_ERROR, "     _UNSUPP_REQUEST_DETECTED\n");
468         }
469     }
470 
471     // Corelogic AER
472     if (pCl != NULL && clPcieReadAerCapability(pGpu, pCl, &clAer) == NV_OK &&
473         (clAer.UncorrErrStatusReg != 0 || clAer.RooErrStatus != 0))
474     {
475         NV_PRINTF(LEVEL_ERROR,
476                   "PCE-I Advanced Error Reporting Corelogic Info:\n");
477         NV_PRINTF(LEVEL_ERROR,
478                   "     Uncorr Error Status Register    : %08X\n",
479                   clAer.UncorrErrStatusReg);
480         NV_PRINTF(LEVEL_ERROR,
481                   "     Uncorr Error Mask Register      : %08X\n",
482                   clAer.UncorrErrMaskReg);
483         NV_PRINTF(LEVEL_ERROR,
484                   "     Uncorr Error Severity Register  : %08X\n",
485                   clAer.UncorrErrSeverityReg);
486         NV_PRINTF(LEVEL_ERROR,
487                   "     Corr Error Status Register      : %08X\n",
488                   clAer.CorrErrStatusReg);
489         NV_PRINTF(LEVEL_ERROR,
490                   "     Corr Error Mask Register        : %08X\n",
491                   clAer.CorrErrMaskReg);
492         NV_PRINTF(LEVEL_ERROR,
493                   "     Advanced Err Cap & Ctrl Register: %08X\n",
494                   clAer.AEcapCrtlReg);
495         NV_PRINTF(LEVEL_ERROR,
496                   "     Header Log [0-3]                : %08X\n",
497                   clAer.HeaderLogReg.Header[0]);
498         NV_PRINTF(LEVEL_ERROR,
499                   "     Header Log [4-7]                : %08X\n",
500                   clAer.HeaderLogReg.Header[1]);
501         NV_PRINTF(LEVEL_ERROR,
502                   "     Header Log [8-B]                : %08X\n",
503                   clAer.HeaderLogReg.Header[2]);
504         NV_PRINTF(LEVEL_ERROR,
505                   "     Header Log [C-F]                : %08X\n",
506                   clAer.HeaderLogReg.Header[3]);
507         NV_PRINTF(LEVEL_ERROR,
508                   "     Root Error Command Register     : %08X\n",
509                   clAer.RootErrCmd);
510         NV_PRINTF(LEVEL_ERROR,
511                   "     Root Error Status               : %08X\n",
512                   clAer.RooErrStatus);
513         NV_PRINTF(LEVEL_ERROR,
514                   "     Error Source ID Register        : %08X\n",
515                   clAer.ErrSrcReg);
516 
517         //
518         // if you hit this case with some AER errors reported please refer to
519         // PCI-E manual for detailed bits spec
520         // TODO: add details bits here
521         //
522     }
523 
524     if (clDevCtrlStatusFlags_Org)
525     {
526         // clear the corelogic status after we had a chance to examine it
527         clPcieClearDevCtrlStatus(pGpu, pCl, &clDevCtrlStatus);
528     }
529 
530     return NV_OK;
531 }
532 
533 KernelChannel *
krcGetChannelInError_FWCLIENT(KernelRc * pKernelRc)534 krcGetChannelInError_FWCLIENT
535 (
536     KernelRc *pKernelRc
537 )
538 {
539     NV_ASSERT_OR_RETURN(IS_GSP_CLIENT(ENG_GET_GPU(pKernelRc)), NULL);
540     return pKernelRc->pPreviousChannelInError;
541 }
542