1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 * SPDX-License-Identifier: MIT
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "kernel/gpu/rc/kernel_rc.h"
25
26 #include "kernel/core/locks.h"
27 #include "kernel/core/system.h"
28 #include "kernel/gpu/bif/kernel_bif.h"
29 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
30 #include "kernel/os/os.h"
31 #include "kernel/platform/chipset/chipset.h"
32 #include "kernel/rmapi/client.h"
33
34
35 #include "libraries/utils/nvprintf.h"
36 #include "nvRmReg.h"
37 #include "nverror.h"
38 #include "nvtypes.h"
39 #include "objtmr.h"
40
41
42 static void _krcInitRegistryOverrides(OBJGPU *pGpu, KernelRc *pKernelRc);
43 static void _krcLogUuidOnce(OBJGPU *pGpu, KernelRc *pKernelRc);
44
45
46 NV_STATUS
krcConstructEngine_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,ENGDESCRIPTOR engDescriptor)47 krcConstructEngine_IMPL
48 (
49 OBJGPU *pGpu,
50 KernelRc *pKernelRc,
51 ENGDESCRIPTOR engDescriptor
52 )
53 {
54 _krcInitRegistryOverrides(pGpu, pKernelRc);
55
56 return NV_OK;
57 }
58
59
/*!
 * Apply KernelRc registry overrides that are deferred past engine
 * construction ("delayed"): robust-channel enablement and the uncached-PCI
 * watchdog allocation flag. Reads osReadRegistryDword() keys and updates
 * pKernelRc state; no return value.
 *
 * @param[in]     pGpu       OBJGPU pointer
 * @param[in,out] pKernelRc  KernelRc whose flags are updated
 */
void
krcInitRegistryOverridesDelayed_IMPL
(
    OBJGPU   *pGpu,
    KernelRc *pKernelRc
)
{
    KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
    NvU32 dword = 0;
    (void) dword;  // silences unused warnings on configs that skip a read


    // Robust channels: if the regkey is absent, default to enabled on every
    // recognized platform (build fails on unrecognized platforms).
    dword = 0;
    if (osReadRegistryDword(pGpu, NV_REG_STR_RM_ROBUST_CHANNELS, &dword) !=
        NV_OK)
    {
#if RMCFG_FEATURE_PLATFORM_WINDOWS || RMCFG_FEATURE_PLATFORM_GSP || \
    RMCFG_FEATURE_PLATFORM_UNIX
        dword = NV_REG_STR_RM_ROBUST_CHANNELS_ENABLE;
#else
#error "unrecognized platform"
#endif
    }
    pKernelRc->bRobustChannelsEnabled = (dword ==
                                         NV_REG_STR_RM_ROBUST_CHANNELS_ENABLE);


    dword = 0;
    //
    // Force uncached pushbuffers for robust channel.
    //
    // We used to allocate the recovery channel as uncached, which is achieved
    // by allocating physically contiguous memory then remap that uncached.
    // However, this caused allocations issues in cases which shares a channel
    // with the robust channel, and ended up requesting sizeof(RC + pushbuffer)
    // of contiguous memory (bug 73669).
    //
    // We therefore switched to cached allocations, with a few exceptions where
    // an uncached pushbuffer is still needed:
    // - When the system does not support CPU cache snooping (bugs 292461 and
    //   976485).
    //
    if ((osReadRegistryDword(pGpu,
                             NV_REG_STR_USE_UNCACHED_PCI_MAPPINGS,
                             &dword) == NV_OK &&
         dword != 0) ||
        ((pKernelBif != NULL) &&
         !kbifIsSnoopDmaCapable(pGpu, pKernelBif)))
    {
        pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_ALLOC_UNCACHED_PCI;
    }
}
112
113
/*!
 * Construction-time registry overrides for KernelRc: break-on-RC policy,
 * watchdog timeout/interval, watchdog disable, and RC event logging.
 * Called from krcConstructEngine_IMPL(); no return value.
 *
 * @param[in]     pGpu       OBJGPU pointer
 * @param[in,out] pKernelRc  KernelRc whose policy fields are populated
 */
static void
_krcInitRegistryOverrides
(
    OBJGPU   *pGpu,
    KernelRc *pKernelRc
)
{
    NvU32 dword = 0;
    (void) dword;  // silences unused warnings on configs that skip a read

    // Break-on-RC: absent regkey falls back to the compile-time default.
    dword = 0;
    if (osReadRegistryDword(pGpu, NV_REG_STR_RM_BREAK_ON_RC, &dword) != NV_OK)
    {
        dword = NV_REG_STR_RM_BREAK_ON_RC_DEFAULT;
    }

    pKernelRc->bBreakOnRc = (dword == NV_REG_STR_RM_BREAK_ON_RC_ENABLE);

    // Allow driver registry key RmBreak to override Device Key
    if (DRF_VAL(_DEBUG, _BREAK_FLAGS, _RC, SYS_GET_INSTANCE()->debugFlags) ==
        NV_DEBUG_BREAK_FLAGS_RC_ENABLE)
    {
        pKernelRc->bBreakOnRc = NV_TRUE;
    }

    if (pKernelRc->bBreakOnRc)
    {
        NV_PRINTF(LEVEL_INFO, "Breakpoint on RC Error is enabled\n");
    }
    else
    {
        NV_PRINTF(LEVEL_INFO, "Breakpoint on RC Error is disabled\n");
    }


    // Watchdog timing: a missing key or an explicit 0 both fall back to the
    // default (0 would make the watchdog degenerate).
    if (osReadRegistryDword(pGpu,
                            NV_REG_STR_RM_WATCHDOG_TIMEOUT,
                            &pKernelRc->watchdogPersistent.timeoutSecs) !=
            NV_OK ||
        pKernelRc->watchdogPersistent.timeoutSecs == 0)
    {
        pKernelRc->watchdogPersistent.timeoutSecs =
            NV_REG_STR_RM_WATCHDOG_TIMEOUT_DEFAULT;
    }
    if (osReadRegistryDword(pGpu,
                            NV_REG_STR_RM_WATCHDOG_INTERVAL,
                            &pKernelRc->watchdogPersistent.intervalSecs) !=
            NV_OK ||
        pKernelRc->watchdogPersistent.intervalSecs == 0)
    {
        pKernelRc->watchdogPersistent.intervalSecs =
            NV_REG_STR_RM_WATCHDOG_INTERVAL_DEFAULT;
    }

    // Clamp so the run interval never exceeds the timeout.
    if (pKernelRc->watchdogPersistent.intervalSecs >
        pKernelRc->watchdogPersistent.timeoutSecs)
    {
        pKernelRc->watchdogPersistent.intervalSecs =
            pKernelRc->watchdogPersistent.timeoutSecs;
    }


    // Watchdog enable/disable: explicit regkey wins; otherwise the watchdog
    // is disabled on emulation/simulation and when the CC feature is enabled.
    dword = 0;
    if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RC_WATCHDOG, &dword) == NV_OK)
    {
        if (dword == NV_REG_STR_RM_RC_WATCHDOG_DISABLE)
        {
            pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
        }
    }
    else if (IS_EMULATION(pGpu) || IS_SIMULATION(pGpu))
    {
        pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
    }
    else if (gpuIsCCFeatureEnabled(pGpu))
    {
        pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
    }

    // RC event logging (gates the Xid prints in krcReportXid)
    dword = 0;
    if (osReadRegistryDword(pGpu, NV_REG_STR_RM_DO_LOG_RC_EVENTS, &dword) ==
        NV_OK)
    {
        pKernelRc->bLogEvents = (dword == NV_REG_STR_RM_DO_LOG_RC_ENABLE);
        if (pKernelRc->bLogEvents)
        {
            NV_PRINTF(LEVEL_INFO, "RC Error Logging is enabled\n");
#if defined(DEBUG)
            // Don't print out the initialization log on a retail build
            osErrorLog(pGpu, ROBUST_CHANNEL_RC_LOGGING_ENABLED, "");
#endif
        }
    }

    //
    // Do RC on BAR faults by default (For bug 1842228).
    // Only applicable to Volta+ chips.
    //
    pKernelRc->bRcOnBar2Fault = NV_TRUE;

}
215
216
217 static void
_krcLogUuidOnce(OBJGPU * pGpu,KernelRc * pKernelRc)218 _krcLogUuidOnce
219 (
220 OBJGPU *pGpu,
221 KernelRc *pKernelRc
222 )
223 {
224 if (!pKernelRc->bGpuUuidLoggedOnce)
225 {
226 NvU8 *gidString = NULL;
227 NvU32 gidStrlen;
228
229 if (gpuGetGidInfo(pGpu,
230 &gidString,
231 &gidStrlen,
232 (DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _FORMAT, _ASCII) |
233 DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _TYPE, _SHA1))) ==
234 NV_OK)
235 {
236 portDbgPrintf("NVRM: GPU at PCI:%04x:%02x:%02x: %s\n",
237 gpuGetDomain(pGpu),
238 gpuGetBus(pGpu),
239 gpuGetDevice(pGpu),
240 gidString);
241 portMemFree(gidString);
242 }
243
244 if (pGpu->boardInfo != NULL && pGpu->boardInfo->serialNumber[0] != '\0')
245 {
246 portDbgPrintf("NVRM: GPU Board Serial Number: %s\n",
247 pGpu->boardInfo->serialNumber);
248 }
249
250 pKernelRc->bGpuUuidLoggedOnce = NV_TRUE;
251 }
252 }
253
254
255 void
krcGetMigAttributionForError_KERNEL(KernelRc * pKernelRc,NvU32 exceptType,NvU16 * pGpuPartitionId,NvU16 * pComputeInstanceId)256 krcGetMigAttributionForError_KERNEL
257 (
258 KernelRc *pKernelRc,
259 NvU32 exceptType,
260 NvU16 *pGpuPartitionId,
261 NvU16 *pComputeInstanceId
262 )
263 {
264 if (pGpuPartitionId != NULL)
265 {
266 *pGpuPartitionId = KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID;
267 }
268 if (pComputeInstanceId != NULL)
269 {
270 *pComputeInstanceId = KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID;
271 }
272 }
273
274
275 void
krcReportXid_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,NvU32 exceptType,const char * pMsg)276 krcReportXid_IMPL
277 (
278 OBJGPU *pGpu,
279 KernelRc *pKernelRc,
280 NvU32 exceptType,
281 const char *pMsg
282 )
283 {
284 //
285 // Log the RC error to the OS
286 //
287 // Enforce the policy of gating the log output by "RmLogonRC" regkey.
288 // Some of our callers do not abide by this rule.
289 // That is how they want it under Windows.
290 //
291 if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)
292 {
293 NvU16 gpuPartitionId;
294 NvU16 computeInstanceId;
295 KernelChannel *pKernelChannel = krcGetChannelInError(pKernelRc);
296 char *current_procname = NULL;
297
298 // Channels are populated with osGetCurrentProcessName() and pid of
299 // their process at creation-time. If no channel was found, mark unknown
300 const char *procname = "<unknown>";
301 char pid_string[12] = "'<unknown>'";
302
303 //
304 // Get PID of channel creator if available, or get the current PID for
305 // exception types that never have an associated channel
306 //
307 // Check for API lock since this can be called from parallel init
308 // path without API lock, and RES_GET_CLIENT requires API lock
309 //
310 if (rmapiLockIsOwner() && (pKernelChannel != NULL))
311 {
312 RsClient *pClient = RES_GET_CLIENT(pKernelChannel);
313 RmClient *pRmClient = dynamicCast(pClient, RmClient);
314 procname = pRmClient->name;
315 nvDbgSnprintf(pid_string, sizeof(pid_string), "%u", pKernelChannel->ProcessID);
316 }
317 else if (exceptType == GSP_RPC_TIMEOUT)
318 {
319 NvU32 current_pid = osGetCurrentProcess();
320
321 nvDbgSnprintf(pid_string, sizeof(pid_string), "%u", current_pid);
322
323 current_procname = portMemAllocNonPaged(NV_PROC_NAME_MAX_LENGTH);
324 if (current_procname != NULL)
325 {
326 osGetCurrentProcessName(current_procname, NV_PROC_NAME_MAX_LENGTH);
327 procname = current_procname;
328 }
329 }
330
331 _krcLogUuidOnce(pGpu, pKernelRc);
332
333 krcGetMigAttributionForError_HAL(pKernelRc,
334 exceptType,
335 &gpuPartitionId,
336 &computeInstanceId);
337
338 if (gpuPartitionId != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID &&
339 computeInstanceId != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID)
340 {
341 // Attribute this XID to both GPU / Compute instance
342 portDbgPrintf(
343 "NVRM: Xid (PCI:%04x:%02x:%02x GPU-I:%02u GPU-CI:%02u): %d, pid=%s, name=%s, %s\n",
344 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
345 gpuPartitionId, computeInstanceId,
346 exceptType,
347 pid_string,
348 procname,
349 pMsg != NULL ? pMsg : "");
350 }
351 else if (gpuPartitionId != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID)
352 {
353 // Attribute this XID to GPU instance only
354 portDbgPrintf(
355 "NVRM: Xid (PCI:%04x:%02x:%02x GPU-I:%02u): %d, pid=%s, name=%s, %s\n",
356 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
357 gpuPartitionId,
358 exceptType,
359 pid_string,
360 procname,
361 pMsg != NULL ? pMsg : "");
362 }
363 else
364 {
365 // Legacy (no attribution) XID reporting
366 portDbgPrintf("NVRM: Xid (PCI:%04x:%02x:%02x): %d, pid=%s, name=%s, %s\n",
367 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
368 exceptType,
369 pid_string,
370 procname,
371 pMsg != NULL ? pMsg : "");
372 }
373
374 portMemFree(current_procname);
375 }
376 }
377
378
379 NvBool
krcTestAllowAlloc_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,NvU32 failMask)380 krcTestAllowAlloc_IMPL
381 (
382 OBJGPU *pGpu,
383 KernelRc *pKernelRc,
384 NvU32 failMask
385 )
386 {
387 if (pKernelRc->bRobustChannelsEnabled &&
388 (pKernelRc->watchdog.allocFailMask & failMask))
389 {
390 OBJTMR *pTmr = GPU_GET_TIMER(pGpu);
391 NvU64 time;
392 NV_STATUS status = tmrGetCurrentTime(pTmr, &time);
393
394 //
395 // randomly fail this alloc based on NV timer
396 // assuming here that we don't get allocations within 128ns of each
397 // other
398 //
399 if (status == NV_OK && ((time & 0xff) > (0xffu / 2)))
400 return NV_FALSE;
401 }
402
403 return NV_TRUE;
404 }
405
406
407 NV_STATUS
krcCheckBusError_KERNEL(OBJGPU * pGpu,KernelRc * pKernelRc)408 krcCheckBusError_KERNEL
409 (
410 OBJGPU *pGpu,
411 KernelRc *pKernelRc
412 )
413 {
414 KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
415 OBJCL *pCl = SYS_GET_CL(SYS_GET_INSTANCE());
416 NvU32 clDevCtrlStatusFlags = 0;
417 NvU32 clDevCtrlStatusFlags_Org = 0;
418 NvU32 clDevCtrlStatus = 0;
419 PcieAerCapability clAer;
420
421
422 // PCI-E provides extended error reporting
423 if (pKernelBif == NULL || kbifGetBusIntfType_HAL(pKernelBif) !=
424 NV2080_CTRL_BUS_INFO_TYPE_PCI_EXPRESS)
425 {
426 return NV_OK;
427 }
428
429 // Clear PCIe dev ctrl/status errors and AER errors
430 kbifClearConfigErrors(pGpu, pKernelBif, NV_TRUE,
431 KBIF_CLEAR_XVE_AER_ALL_MASK);
432
433 // Corelogic device control status
434 if (pCl != NULL &&
435 clPcieReadDevCtrlStatus(pGpu, pCl,
436 &clDevCtrlStatusFlags,
437 &clDevCtrlStatus) == NV_OK &&
438 clDevCtrlStatusFlags != 0)
439 {
440 NV_PRINTF(LEVEL_ERROR,
441 "PCI-E corelogic status has pending errors (CL_PCIE_DEV_CTRL_STATUS = %08X):\n",
442 clDevCtrlStatus);
443
444 clDevCtrlStatusFlags_Org = clDevCtrlStatusFlags;
445
446 if (clDevCtrlStatusFlags &
447 NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_CORR_ERROR)
448 {
449 NV_PRINTF(LEVEL_ERROR, " _CORR_ERROR_DETECTED\n");
450 // not much interested in this one
451 clDevCtrlStatusFlags &=
452 ~NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_CORR_ERROR;
453 }
454 if (clDevCtrlStatusFlags &
455 NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_NON_FATAL_ERROR)
456 {
457 NV_PRINTF(LEVEL_ERROR, " _NON_FATAL_ERROR_DETECTED\n");
458 }
459 if (clDevCtrlStatusFlags &
460 NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_FATAL_ERROR)
461 {
462 NV_PRINTF(LEVEL_ERROR, " _FATAL_ERROR_DETECTED\n");
463 }
464 if (clDevCtrlStatusFlags &
465 NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_UNSUPP_REQUEST)
466 {
467 NV_PRINTF(LEVEL_ERROR, " _UNSUPP_REQUEST_DETECTED\n");
468 }
469 }
470
471 // Corelogic AER
472 if (pCl != NULL && clPcieReadAerCapability(pGpu, pCl, &clAer) == NV_OK &&
473 (clAer.UncorrErrStatusReg != 0 || clAer.RooErrStatus != 0))
474 {
475 NV_PRINTF(LEVEL_ERROR,
476 "PCE-I Advanced Error Reporting Corelogic Info:\n");
477 NV_PRINTF(LEVEL_ERROR,
478 " Uncorr Error Status Register : %08X\n",
479 clAer.UncorrErrStatusReg);
480 NV_PRINTF(LEVEL_ERROR,
481 " Uncorr Error Mask Register : %08X\n",
482 clAer.UncorrErrMaskReg);
483 NV_PRINTF(LEVEL_ERROR,
484 " Uncorr Error Severity Register : %08X\n",
485 clAer.UncorrErrSeverityReg);
486 NV_PRINTF(LEVEL_ERROR,
487 " Corr Error Status Register : %08X\n",
488 clAer.CorrErrStatusReg);
489 NV_PRINTF(LEVEL_ERROR,
490 " Corr Error Mask Register : %08X\n",
491 clAer.CorrErrMaskReg);
492 NV_PRINTF(LEVEL_ERROR,
493 " Advanced Err Cap & Ctrl Register: %08X\n",
494 clAer.AEcapCrtlReg);
495 NV_PRINTF(LEVEL_ERROR,
496 " Header Log [0-3] : %08X\n",
497 clAer.HeaderLogReg.Header[0]);
498 NV_PRINTF(LEVEL_ERROR,
499 " Header Log [4-7] : %08X\n",
500 clAer.HeaderLogReg.Header[1]);
501 NV_PRINTF(LEVEL_ERROR,
502 " Header Log [8-B] : %08X\n",
503 clAer.HeaderLogReg.Header[2]);
504 NV_PRINTF(LEVEL_ERROR,
505 " Header Log [C-F] : %08X\n",
506 clAer.HeaderLogReg.Header[3]);
507 NV_PRINTF(LEVEL_ERROR,
508 " Root Error Command Register : %08X\n",
509 clAer.RootErrCmd);
510 NV_PRINTF(LEVEL_ERROR,
511 " Root Error Status : %08X\n",
512 clAer.RooErrStatus);
513 NV_PRINTF(LEVEL_ERROR,
514 " Error Source ID Register : %08X\n",
515 clAer.ErrSrcReg);
516
517 //
518 // if you hit this case with some AER errors reported please refer to
519 // PCI-E manual for detailed bits spec
520 // TODO: add details bits here
521 //
522 }
523
524 if (clDevCtrlStatusFlags_Org)
525 {
526 // clear the corelogic status after we had a chance to examine it
527 clPcieClearDevCtrlStatus(pGpu, pCl, &clDevCtrlStatus);
528 }
529
530 return NV_OK;
531 }
532
533 KernelChannel *
krcGetChannelInError_FWCLIENT(KernelRc * pKernelRc)534 krcGetChannelInError_FWCLIENT
535 (
536 KernelRc *pKernelRc
537 )
538 {
539 NV_ASSERT_OR_RETURN(IS_GSP_CLIENT(ENG_GET_GPU(pKernelRc)), NULL);
540 return pKernelRc->pPreviousChannelInError;
541 }
542