11739a20eSAndy Ritger /*
2337e28efSBernhard Stoeckner  * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
31739a20eSAndy Ritger  * SPDX-License-Identifier: MIT
41739a20eSAndy Ritger  *
51739a20eSAndy Ritger  * Permission is hereby granted, free of charge, to any person obtaining a
61739a20eSAndy Ritger  * copy of this software and associated documentation files (the "Software"),
71739a20eSAndy Ritger  * to deal in the Software without restriction, including without limitation
81739a20eSAndy Ritger  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
91739a20eSAndy Ritger  * and/or sell copies of the Software, and to permit persons to whom the
101739a20eSAndy Ritger  * Software is furnished to do so, subject to the following conditions:
111739a20eSAndy Ritger  *
121739a20eSAndy Ritger  * The above copyright notice and this permission notice shall be included in
131739a20eSAndy Ritger  * all copies or substantial portions of the Software.
141739a20eSAndy Ritger  *
151739a20eSAndy Ritger  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
161739a20eSAndy Ritger  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
171739a20eSAndy Ritger  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
181739a20eSAndy Ritger  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
191739a20eSAndy Ritger  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
201739a20eSAndy Ritger  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
211739a20eSAndy Ritger  * DEALINGS IN THE SOFTWARE.
221739a20eSAndy Ritger  */
231739a20eSAndy Ritger 
241739a20eSAndy Ritger #include "kernel/gpu/rc/kernel_rc.h"
251739a20eSAndy Ritger 
26*91676d66SBernhard Stoeckner #include "kernel/core/locks.h"
271739a20eSAndy Ritger #include "kernel/core/system.h"
281739a20eSAndy Ritger #include "kernel/gpu/bif/kernel_bif.h"
291739a20eSAndy Ritger #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
301739a20eSAndy Ritger #include "kernel/os/os.h"
311739a20eSAndy Ritger #include "kernel/platform/chipset/chipset.h"
321739a20eSAndy Ritger #include "kernel/rmapi/client.h"
331739a20eSAndy Ritger 
341739a20eSAndy Ritger 
351739a20eSAndy Ritger #include "libraries/utils/nvprintf.h"
361739a20eSAndy Ritger #include "nvRmReg.h"
371739a20eSAndy Ritger #include "nverror.h"
381739a20eSAndy Ritger #include "nvtypes.h"
391739a20eSAndy Ritger #include "objtmr.h"
401739a20eSAndy Ritger 
411739a20eSAndy Ritger 
421739a20eSAndy Ritger static void _krcInitRegistryOverrides(OBJGPU *pGpu, KernelRc *pKernelRc);
431739a20eSAndy Ritger static void _krcLogUuidOnce(OBJGPU *pGpu, KernelRc *pKernelRc);
441739a20eSAndy Ritger 
451739a20eSAndy Ritger 
461739a20eSAndy Ritger NV_STATUS
krcConstructEngine_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,ENGDESCRIPTOR engDescriptor)471739a20eSAndy Ritger krcConstructEngine_IMPL
481739a20eSAndy Ritger (
491739a20eSAndy Ritger     OBJGPU        *pGpu,
501739a20eSAndy Ritger     KernelRc      *pKernelRc,
511739a20eSAndy Ritger     ENGDESCRIPTOR  engDescriptor
521739a20eSAndy Ritger )
531739a20eSAndy Ritger {
541739a20eSAndy Ritger     _krcInitRegistryOverrides(pGpu, pKernelRc);
551739a20eSAndy Ritger 
561739a20eSAndy Ritger     return NV_OK;
571739a20eSAndy Ritger }
581739a20eSAndy Ritger 
591739a20eSAndy Ritger 
601739a20eSAndy Ritger void
krcInitRegistryOverridesDelayed_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc)611739a20eSAndy Ritger krcInitRegistryOverridesDelayed_IMPL
621739a20eSAndy Ritger (
631739a20eSAndy Ritger     OBJGPU   *pGpu,
641739a20eSAndy Ritger     KernelRc *pKernelRc
651739a20eSAndy Ritger )
661739a20eSAndy Ritger {
671739a20eSAndy Ritger     KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
681739a20eSAndy Ritger     NvU32 dword = 0;
691739a20eSAndy Ritger     (void) dword;
701739a20eSAndy Ritger 
711739a20eSAndy Ritger 
721739a20eSAndy Ritger     dword = 0;
731739a20eSAndy Ritger     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_ROBUST_CHANNELS, &dword) !=
741739a20eSAndy Ritger         NV_OK)
751739a20eSAndy Ritger     {
761739a20eSAndy Ritger #if RMCFG_FEATURE_PLATFORM_WINDOWS || RMCFG_FEATURE_PLATFORM_GSP || \
771739a20eSAndy Ritger     RMCFG_FEATURE_PLATFORM_UNIX
781739a20eSAndy Ritger         dword = NV_REG_STR_RM_ROBUST_CHANNELS_ENABLE;
791739a20eSAndy Ritger #else
801739a20eSAndy Ritger #error "unrecognized platform"
811739a20eSAndy Ritger #endif
821739a20eSAndy Ritger     }
831739a20eSAndy Ritger     pKernelRc->bRobustChannelsEnabled = (dword ==
841739a20eSAndy Ritger                                          NV_REG_STR_RM_ROBUST_CHANNELS_ENABLE);
851739a20eSAndy Ritger 
861739a20eSAndy Ritger 
871739a20eSAndy Ritger     dword = 0;
881739a20eSAndy Ritger     //
891739a20eSAndy Ritger     // Force uncached pushbuffers for robust channel.
901739a20eSAndy Ritger     //
911739a20eSAndy Ritger     // We used to allocate the recovery channel as uncached, which is achieved
921739a20eSAndy Ritger     // by allocating physically contiguous memory then remap that uncached.
931739a20eSAndy Ritger     // However, this caused allocations issues in cases which shares a channel
941739a20eSAndy Ritger     // with the robust channel, and ended up requesting sizeof(RC + pushbuffer)
951739a20eSAndy Ritger     // of contiguous memory (bug 73669).
961739a20eSAndy Ritger     //
971739a20eSAndy Ritger     // We therefore switched to cached allocations, with a few exceptions where
981739a20eSAndy Ritger     // an uncached pushbuffer is still needed:
991739a20eSAndy Ritger     // - When the system does not support CPU cache snooping (bugs 292461 and
1001739a20eSAndy Ritger     // 976485).
1011739a20eSAndy Ritger     //
1021739a20eSAndy Ritger     if ((osReadRegistryDword(pGpu,
1031739a20eSAndy Ritger                              NV_REG_STR_USE_UNCACHED_PCI_MAPPINGS,
1041739a20eSAndy Ritger                              &dword) == NV_OK &&
1051739a20eSAndy Ritger          dword != 0) ||
1061739a20eSAndy Ritger         ((pKernelBif != NULL) &&
1071739a20eSAndy Ritger          !kbifIsSnoopDmaCapable(pGpu, pKernelBif)))
1081739a20eSAndy Ritger     {
1091739a20eSAndy Ritger         pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_ALLOC_UNCACHED_PCI;
1101739a20eSAndy Ritger     }
1111739a20eSAndy Ritger }
1121739a20eSAndy Ritger 
1131739a20eSAndy Ritger 
1141739a20eSAndy Ritger static void
_krcInitRegistryOverrides(OBJGPU * pGpu,KernelRc * pKernelRc)1151739a20eSAndy Ritger _krcInitRegistryOverrides
1161739a20eSAndy Ritger (
1171739a20eSAndy Ritger     OBJGPU   *pGpu,
1181739a20eSAndy Ritger     KernelRc *pKernelRc
1191739a20eSAndy Ritger )
1201739a20eSAndy Ritger {
1211739a20eSAndy Ritger     NvU32 dword = 0;
1221739a20eSAndy Ritger     (void) dword;
1231739a20eSAndy Ritger 
1241739a20eSAndy Ritger     dword = 0;
1251739a20eSAndy Ritger     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_BREAK_ON_RC, &dword) != NV_OK)
1261739a20eSAndy Ritger     {
1271739a20eSAndy Ritger         dword = NV_REG_STR_RM_BREAK_ON_RC_DEFAULT;
1281739a20eSAndy Ritger     }
1291739a20eSAndy Ritger 
1301739a20eSAndy Ritger     pKernelRc->bBreakOnRc = (dword == NV_REG_STR_RM_BREAK_ON_RC_ENABLE);
1311739a20eSAndy Ritger 
1321739a20eSAndy Ritger     // Allow driver registry key RmBreak to override Device Key
1331739a20eSAndy Ritger     if (DRF_VAL(_DEBUG, _BREAK_FLAGS, _RC, SYS_GET_INSTANCE()->debugFlags) ==
1341739a20eSAndy Ritger         NV_DEBUG_BREAK_FLAGS_RC_ENABLE)
1351739a20eSAndy Ritger     {
1361739a20eSAndy Ritger         pKernelRc->bBreakOnRc = NV_TRUE;
1371739a20eSAndy Ritger     }
1381739a20eSAndy Ritger 
1391739a20eSAndy Ritger     if (pKernelRc->bBreakOnRc)
1401739a20eSAndy Ritger     {
1411739a20eSAndy Ritger         NV_PRINTF(LEVEL_INFO, "Breakpoint on RC Error is enabled\n");
1421739a20eSAndy Ritger     }
1431739a20eSAndy Ritger     else
1441739a20eSAndy Ritger     {
1451739a20eSAndy Ritger         NV_PRINTF(LEVEL_INFO, "Breakpoint on RC Error is disabled\n");
1461739a20eSAndy Ritger     }
1471739a20eSAndy Ritger 
1481739a20eSAndy Ritger 
1491739a20eSAndy Ritger     if (osReadRegistryDword(pGpu,
1501739a20eSAndy Ritger                             NV_REG_STR_RM_WATCHDOG_TIMEOUT,
1511739a20eSAndy Ritger                             &pKernelRc->watchdogPersistent.timeoutSecs) !=
1521739a20eSAndy Ritger             NV_OK ||
1531739a20eSAndy Ritger         pKernelRc->watchdogPersistent.timeoutSecs == 0)
1541739a20eSAndy Ritger     {
1551739a20eSAndy Ritger         pKernelRc->watchdogPersistent.timeoutSecs =
1561739a20eSAndy Ritger             NV_REG_STR_RM_WATCHDOG_TIMEOUT_DEFAULT;
1571739a20eSAndy Ritger     }
1581739a20eSAndy Ritger     if (osReadRegistryDword(pGpu,
1591739a20eSAndy Ritger                             NV_REG_STR_RM_WATCHDOG_INTERVAL,
1601739a20eSAndy Ritger                             &pKernelRc->watchdogPersistent.intervalSecs) !=
1611739a20eSAndy Ritger             NV_OK ||
1621739a20eSAndy Ritger         pKernelRc->watchdogPersistent.intervalSecs == 0)
1631739a20eSAndy Ritger     {
1641739a20eSAndy Ritger         pKernelRc->watchdogPersistent.intervalSecs =
1651739a20eSAndy Ritger             NV_REG_STR_RM_WATCHDOG_INTERVAL_DEFAULT;
1661739a20eSAndy Ritger     }
1671739a20eSAndy Ritger 
1681739a20eSAndy Ritger     if (pKernelRc->watchdogPersistent.intervalSecs >
1691739a20eSAndy Ritger         pKernelRc->watchdogPersistent.timeoutSecs)
1701739a20eSAndy Ritger     {
1711739a20eSAndy Ritger         pKernelRc->watchdogPersistent.intervalSecs =
1721739a20eSAndy Ritger             pKernelRc->watchdogPersistent.timeoutSecs;
1731739a20eSAndy Ritger     }
1741739a20eSAndy Ritger 
1751739a20eSAndy Ritger 
1761739a20eSAndy Ritger     dword = 0;
1771739a20eSAndy Ritger     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RC_WATCHDOG, &dword) == NV_OK)
1781739a20eSAndy Ritger     {
1791739a20eSAndy Ritger         if (dword == NV_REG_STR_RM_RC_WATCHDOG_DISABLE)
1801739a20eSAndy Ritger         {
1811739a20eSAndy Ritger             pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
1821739a20eSAndy Ritger         }
1831739a20eSAndy Ritger     }
18490eb1077SAndy Ritger     else if (IS_EMULATION(pGpu) || IS_SIMULATION(pGpu))
1851739a20eSAndy Ritger     {
1861739a20eSAndy Ritger         pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
1871739a20eSAndy Ritger     }
188*91676d66SBernhard Stoeckner     else if (gpuIsCCFeatureEnabled(pGpu))
189337e28efSBernhard Stoeckner     {
190337e28efSBernhard Stoeckner         pKernelRc->watchdog.flags |= WATCHDOG_FLAGS_DISABLED;
191337e28efSBernhard Stoeckner     }
1921739a20eSAndy Ritger 
1931739a20eSAndy Ritger     dword = 0;
1941739a20eSAndy Ritger     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_DO_LOG_RC_EVENTS, &dword) ==
1951739a20eSAndy Ritger         NV_OK)
1961739a20eSAndy Ritger     {
1971739a20eSAndy Ritger         pKernelRc->bLogEvents = (dword == NV_REG_STR_RM_DO_LOG_RC_ENABLE);
1981739a20eSAndy Ritger         if (pKernelRc->bLogEvents)
1991739a20eSAndy Ritger         {
2001739a20eSAndy Ritger             NV_PRINTF(LEVEL_INFO, "RC Error Logging is enabled\n");
2011739a20eSAndy Ritger #if defined(DEBUG)
2021739a20eSAndy Ritger             // Don't print out the initialization log on a retail build
2031739a20eSAndy Ritger             osErrorLog(pGpu, ROBUST_CHANNEL_RC_LOGGING_ENABLED, "");
2041739a20eSAndy Ritger #endif
2051739a20eSAndy Ritger         }
2061739a20eSAndy Ritger     }
207b5bf85a8SAndy Ritger 
208b5bf85a8SAndy Ritger     //
209b5bf85a8SAndy Ritger     // Do RC on BAR faults by default (For bug 1842228).
210b5bf85a8SAndy Ritger     // Only applicable to Volta+ chips.
211b5bf85a8SAndy Ritger     //
212b5bf85a8SAndy Ritger     pKernelRc->bRcOnBar2Fault = NV_TRUE;
213b5bf85a8SAndy Ritger 
2141739a20eSAndy Ritger }
2151739a20eSAndy Ritger 
2161739a20eSAndy Ritger 
2171739a20eSAndy Ritger static void
_krcLogUuidOnce(OBJGPU * pGpu,KernelRc * pKernelRc)2181739a20eSAndy Ritger _krcLogUuidOnce
2191739a20eSAndy Ritger (
2201739a20eSAndy Ritger     OBJGPU   *pGpu,
2211739a20eSAndy Ritger     KernelRc *pKernelRc
2221739a20eSAndy Ritger )
2231739a20eSAndy Ritger {
2241739a20eSAndy Ritger     if (!pKernelRc->bGpuUuidLoggedOnce)
2251739a20eSAndy Ritger     {
2261739a20eSAndy Ritger         NvU8 *gidString = NULL;
2271739a20eSAndy Ritger         NvU32 gidStrlen;
2281739a20eSAndy Ritger 
2291739a20eSAndy Ritger         if (gpuGetGidInfo(pGpu,
2301739a20eSAndy Ritger                 &gidString,
2311739a20eSAndy Ritger                 &gidStrlen,
2321739a20eSAndy Ritger                 (DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _FORMAT, _ASCII) |
2331739a20eSAndy Ritger                  DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _TYPE,   _SHA1))) ==
2341739a20eSAndy Ritger             NV_OK)
2351739a20eSAndy Ritger         {
2361739a20eSAndy Ritger             portDbgPrintf("NVRM: GPU at PCI:%04x:%02x:%02x: %s\n",
2371739a20eSAndy Ritger                           gpuGetDomain(pGpu),
2381739a20eSAndy Ritger                           gpuGetBus(pGpu),
2391739a20eSAndy Ritger                           gpuGetDevice(pGpu),
2401739a20eSAndy Ritger                           gidString);
2411739a20eSAndy Ritger             portMemFree(gidString);
2421739a20eSAndy Ritger         }
2431739a20eSAndy Ritger 
2441739a20eSAndy Ritger         if (pGpu->boardInfo != NULL && pGpu->boardInfo->serialNumber[0] != '\0')
2451739a20eSAndy Ritger         {
2461739a20eSAndy Ritger             portDbgPrintf("NVRM: GPU Board Serial Number: %s\n",
2471739a20eSAndy Ritger                           pGpu->boardInfo->serialNumber);
2481739a20eSAndy Ritger         }
2491739a20eSAndy Ritger 
2501739a20eSAndy Ritger         pKernelRc->bGpuUuidLoggedOnce = NV_TRUE;
2511739a20eSAndy Ritger     }
2521739a20eSAndy Ritger }
2531739a20eSAndy Ritger 
2541739a20eSAndy Ritger 
2551739a20eSAndy Ritger void
krcGetMigAttributionForError_KERNEL(KernelRc * pKernelRc,NvU32 exceptType,NvU16 * pGpuPartitionId,NvU16 * pComputeInstanceId)2561739a20eSAndy Ritger krcGetMigAttributionForError_KERNEL
2571739a20eSAndy Ritger (
2581739a20eSAndy Ritger     KernelRc *pKernelRc,
2591739a20eSAndy Ritger     NvU32     exceptType,
2601739a20eSAndy Ritger     NvU16    *pGpuPartitionId,
2611739a20eSAndy Ritger     NvU16    *pComputeInstanceId
2621739a20eSAndy Ritger )
2631739a20eSAndy Ritger {
2641739a20eSAndy Ritger     if (pGpuPartitionId != NULL)
2651739a20eSAndy Ritger     {
2661739a20eSAndy Ritger         *pGpuPartitionId = KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID;
2671739a20eSAndy Ritger     }
2681739a20eSAndy Ritger     if (pComputeInstanceId != NULL)
2691739a20eSAndy Ritger     {
2701739a20eSAndy Ritger         *pComputeInstanceId = KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID;
2711739a20eSAndy Ritger     }
2721739a20eSAndy Ritger }
2731739a20eSAndy Ritger 
2741739a20eSAndy Ritger 
2751739a20eSAndy Ritger void
krcReportXid_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,NvU32 exceptType,const char * pMsg)2761739a20eSAndy Ritger krcReportXid_IMPL
2771739a20eSAndy Ritger (
2781739a20eSAndy Ritger     OBJGPU     *pGpu,
2791739a20eSAndy Ritger     KernelRc   *pKernelRc,
2801739a20eSAndy Ritger     NvU32       exceptType,
2811739a20eSAndy Ritger     const char *pMsg
2821739a20eSAndy Ritger )
2831739a20eSAndy Ritger {
2841739a20eSAndy Ritger     //
2851739a20eSAndy Ritger     // Log the RC error to the OS
2861739a20eSAndy Ritger     //
2871739a20eSAndy Ritger     // Enforce the policy of gating the log output by "RmLogonRC" regkey.
2881739a20eSAndy Ritger     // Some of our callers do not abide by this rule.
2891739a20eSAndy Ritger     // That is how they want it under Windows.
2901739a20eSAndy Ritger     //
2911739a20eSAndy Ritger     if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)
2921739a20eSAndy Ritger     {
2931739a20eSAndy Ritger         NvU16          gpuPartitionId;
2941739a20eSAndy Ritger         NvU16          computeInstanceId;
2951739a20eSAndy Ritger         KernelChannel *pKernelChannel = krcGetChannelInError(pKernelRc);
296758b4ee8SAndy Ritger         char          *current_procname = NULL;
2971739a20eSAndy Ritger 
2981739a20eSAndy Ritger         // Channels are populated with osGetCurrentProcessName() and pid of
2991739a20eSAndy Ritger         // their process at creation-time. If no channel was found, mark unknown
3001739a20eSAndy Ritger         const char *procname = "<unknown>";
3011739a20eSAndy Ritger         char pid_string[12] = "'<unknown>'";
3021739a20eSAndy Ritger 
3031739a20eSAndy Ritger         //
304758b4ee8SAndy Ritger         // Get PID of channel creator if available, or get the current PID for
305758b4ee8SAndy Ritger         // exception types that never have an associated channel
3061739a20eSAndy Ritger         //
307*91676d66SBernhard Stoeckner         // Check for API lock since this can be called from parallel init
308*91676d66SBernhard Stoeckner         // path without API lock, and RES_GET_CLIENT requires API lock
309*91676d66SBernhard Stoeckner         //
310*91676d66SBernhard Stoeckner         if (rmapiLockIsOwner() && (pKernelChannel != NULL))
3111739a20eSAndy Ritger         {
3121739a20eSAndy Ritger             RsClient *pClient = RES_GET_CLIENT(pKernelChannel);
3131739a20eSAndy Ritger             RmClient *pRmClient = dynamicCast(pClient, RmClient);
3141739a20eSAndy Ritger             procname = pRmClient->name;
3151739a20eSAndy Ritger             nvDbgSnprintf(pid_string, sizeof(pid_string), "%u", pKernelChannel->ProcessID);
3161739a20eSAndy Ritger         }
317758b4ee8SAndy Ritger         else if (exceptType == GSP_RPC_TIMEOUT)
318758b4ee8SAndy Ritger         {
319758b4ee8SAndy Ritger             NvU32 current_pid = osGetCurrentProcess();
320758b4ee8SAndy Ritger 
321758b4ee8SAndy Ritger             nvDbgSnprintf(pid_string, sizeof(pid_string), "%u", current_pid);
322758b4ee8SAndy Ritger 
323758b4ee8SAndy Ritger             current_procname = portMemAllocNonPaged(NV_PROC_NAME_MAX_LENGTH);
324758b4ee8SAndy Ritger             if (current_procname != NULL)
325758b4ee8SAndy Ritger             {
326758b4ee8SAndy Ritger                 osGetCurrentProcessName(current_procname, NV_PROC_NAME_MAX_LENGTH);
327758b4ee8SAndy Ritger                 procname = current_procname;
328758b4ee8SAndy Ritger             }
329758b4ee8SAndy Ritger         }
3301739a20eSAndy Ritger 
3311739a20eSAndy Ritger         _krcLogUuidOnce(pGpu, pKernelRc);
3321739a20eSAndy Ritger 
3331739a20eSAndy Ritger         krcGetMigAttributionForError_HAL(pKernelRc,
3341739a20eSAndy Ritger                                          exceptType,
3351739a20eSAndy Ritger                                          &gpuPartitionId,
3361739a20eSAndy Ritger                                          &computeInstanceId);
3371739a20eSAndy Ritger 
3381739a20eSAndy Ritger         if (gpuPartitionId    != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID &&
3391739a20eSAndy Ritger             computeInstanceId != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID)
3401739a20eSAndy Ritger         {
3411739a20eSAndy Ritger             // Attribute this XID to both GPU / Compute instance
3421739a20eSAndy Ritger             portDbgPrintf(
3431739a20eSAndy Ritger                 "NVRM: Xid (PCI:%04x:%02x:%02x GPU-I:%02u GPU-CI:%02u): %d, pid=%s, name=%s, %s\n",
3441739a20eSAndy Ritger                 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
3451739a20eSAndy Ritger                 gpuPartitionId, computeInstanceId,
3461739a20eSAndy Ritger                 exceptType,
3471739a20eSAndy Ritger                 pid_string,
3481739a20eSAndy Ritger                 procname,
3491739a20eSAndy Ritger                 pMsg != NULL ? pMsg : "");
3501739a20eSAndy Ritger         }
3511739a20eSAndy Ritger         else if (gpuPartitionId != KMIGMGR_INSTANCE_ATTRIBUTION_ID_INVALID)
3521739a20eSAndy Ritger         {
3531739a20eSAndy Ritger             // Attribute this XID to GPU instance only
3541739a20eSAndy Ritger             portDbgPrintf(
3551739a20eSAndy Ritger                 "NVRM: Xid (PCI:%04x:%02x:%02x GPU-I:%02u): %d, pid=%s, name=%s, %s\n",
3561739a20eSAndy Ritger                 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
3571739a20eSAndy Ritger                 gpuPartitionId,
3581739a20eSAndy Ritger                 exceptType,
3591739a20eSAndy Ritger                 pid_string,
3601739a20eSAndy Ritger                 procname,
3611739a20eSAndy Ritger                 pMsg != NULL ? pMsg : "");
3621739a20eSAndy Ritger         }
3631739a20eSAndy Ritger         else
3641739a20eSAndy Ritger         {
3651739a20eSAndy Ritger             // Legacy (no attribution) XID reporting
3661739a20eSAndy Ritger             portDbgPrintf("NVRM: Xid (PCI:%04x:%02x:%02x): %d, pid=%s, name=%s, %s\n",
3671739a20eSAndy Ritger                 gpuGetDomain(pGpu), gpuGetBus(pGpu), gpuGetDevice(pGpu),
3681739a20eSAndy Ritger                 exceptType,
3691739a20eSAndy Ritger                 pid_string,
3701739a20eSAndy Ritger                 procname,
3711739a20eSAndy Ritger                 pMsg != NULL ? pMsg : "");
3721739a20eSAndy Ritger         }
373758b4ee8SAndy Ritger 
374758b4ee8SAndy Ritger         portMemFree(current_procname);
3751739a20eSAndy Ritger     }
3761739a20eSAndy Ritger }
3771739a20eSAndy Ritger 
3781739a20eSAndy Ritger 
3791739a20eSAndy Ritger NvBool
krcTestAllowAlloc_IMPL(OBJGPU * pGpu,KernelRc * pKernelRc,NvU32 failMask)3801739a20eSAndy Ritger krcTestAllowAlloc_IMPL
3811739a20eSAndy Ritger (
3821739a20eSAndy Ritger     OBJGPU   *pGpu,
3831739a20eSAndy Ritger     KernelRc *pKernelRc,
3841739a20eSAndy Ritger     NvU32     failMask
3851739a20eSAndy Ritger )
3861739a20eSAndy Ritger {
3871739a20eSAndy Ritger     if (pKernelRc->bRobustChannelsEnabled &&
3881739a20eSAndy Ritger         (pKernelRc->watchdog.allocFailMask & failMask))
3891739a20eSAndy Ritger     {
3901739a20eSAndy Ritger         OBJTMR   *pTmr = GPU_GET_TIMER(pGpu);
3911739a20eSAndy Ritger         NvU64     time;
3921739a20eSAndy Ritger         NV_STATUS status = tmrGetCurrentTime(pTmr, &time);
3931739a20eSAndy Ritger 
3941739a20eSAndy Ritger         //
3951739a20eSAndy Ritger         // randomly fail this alloc based on NV timer
3961739a20eSAndy Ritger         // assuming here that we don't get allocations within 128ns of each
3971739a20eSAndy Ritger         // other
3981739a20eSAndy Ritger         //
3991739a20eSAndy Ritger         if (status == NV_OK && ((time & 0xff) > (0xffu / 2)))
4001739a20eSAndy Ritger             return NV_FALSE;
4011739a20eSAndy Ritger     }
4021739a20eSAndy Ritger 
4031739a20eSAndy Ritger     return NV_TRUE;
4041739a20eSAndy Ritger }
4051739a20eSAndy Ritger 
4061739a20eSAndy Ritger 
4071739a20eSAndy Ritger NV_STATUS
krcCheckBusError_KERNEL(OBJGPU * pGpu,KernelRc * pKernelRc)4081739a20eSAndy Ritger krcCheckBusError_KERNEL
4091739a20eSAndy Ritger (
4101739a20eSAndy Ritger     OBJGPU   *pGpu,
4111739a20eSAndy Ritger     KernelRc *pKernelRc
4121739a20eSAndy Ritger )
4131739a20eSAndy Ritger {
4141739a20eSAndy Ritger     KernelBif        *pKernelBif               = GPU_GET_KERNEL_BIF(pGpu);
4151739a20eSAndy Ritger     OBJCL            *pCl                      = SYS_GET_CL(SYS_GET_INSTANCE());
4161739a20eSAndy Ritger     NvU32             clDevCtrlStatusFlags     = 0;
4171739a20eSAndy Ritger     NvU32             clDevCtrlStatusFlags_Org = 0;
4181739a20eSAndy Ritger     NvU32             clDevCtrlStatus          = 0;
4191739a20eSAndy Ritger     PcieAerCapability clAer;
4201739a20eSAndy Ritger 
4211739a20eSAndy Ritger 
4221739a20eSAndy Ritger     // PCI-E provides extended error reporting
4231739a20eSAndy Ritger     if (pKernelBif == NULL || kbifGetBusIntfType_HAL(pKernelBif) !=
4241739a20eSAndy Ritger                                   NV2080_CTRL_BUS_INFO_TYPE_PCI_EXPRESS)
4251739a20eSAndy Ritger     {
4261739a20eSAndy Ritger         return NV_OK;
4271739a20eSAndy Ritger     }
4281739a20eSAndy Ritger 
4291739a20eSAndy Ritger     // Clear PCIe dev ctrl/status errors and AER errors
4301739a20eSAndy Ritger     kbifClearConfigErrors(pGpu, pKernelBif, NV_TRUE,
4311739a20eSAndy Ritger                           KBIF_CLEAR_XVE_AER_ALL_MASK);
4321739a20eSAndy Ritger 
4331739a20eSAndy Ritger     // Corelogic device control status
4341739a20eSAndy Ritger     if (pCl != NULL &&
4351739a20eSAndy Ritger         clPcieReadDevCtrlStatus(pGpu, pCl,
4361739a20eSAndy Ritger                                 &clDevCtrlStatusFlags,
4371739a20eSAndy Ritger                                 &clDevCtrlStatus) == NV_OK &&
4381739a20eSAndy Ritger         clDevCtrlStatusFlags != 0)
4391739a20eSAndy Ritger     {
4401739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4411739a20eSAndy Ritger             "PCI-E corelogic status has pending errors (CL_PCIE_DEV_CTRL_STATUS = %08X):\n",
4421739a20eSAndy Ritger             clDevCtrlStatus);
4431739a20eSAndy Ritger 
4441739a20eSAndy Ritger         clDevCtrlStatusFlags_Org = clDevCtrlStatusFlags;
4451739a20eSAndy Ritger 
4461739a20eSAndy Ritger         if (clDevCtrlStatusFlags &
4471739a20eSAndy Ritger             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_CORR_ERROR)
4481739a20eSAndy Ritger         {
4491739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR, "     _CORR_ERROR_DETECTED\n");
4501739a20eSAndy Ritger             // not much interested in this one
4511739a20eSAndy Ritger             clDevCtrlStatusFlags &=
4521739a20eSAndy Ritger                 ~NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_CORR_ERROR;
4531739a20eSAndy Ritger         }
4541739a20eSAndy Ritger         if (clDevCtrlStatusFlags &
4551739a20eSAndy Ritger             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_NON_FATAL_ERROR)
4561739a20eSAndy Ritger         {
4571739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR, "     _NON_FATAL_ERROR_DETECTED\n");
4581739a20eSAndy Ritger         }
4591739a20eSAndy Ritger         if (clDevCtrlStatusFlags &
4601739a20eSAndy Ritger             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_FATAL_ERROR)
4611739a20eSAndy Ritger         {
4621739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR, "     _FATAL_ERROR_DETECTED\n");
4631739a20eSAndy Ritger         }
4641739a20eSAndy Ritger         if (clDevCtrlStatusFlags &
4651739a20eSAndy Ritger             NV2080_CTRL_BUS_INFO_PCIE_LINK_ERRORS_UNSUPP_REQUEST)
4661739a20eSAndy Ritger         {
4671739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR, "     _UNSUPP_REQUEST_DETECTED\n");
4681739a20eSAndy Ritger         }
4691739a20eSAndy Ritger     }
4701739a20eSAndy Ritger 
4711739a20eSAndy Ritger     // Corelogic AER
4721739a20eSAndy Ritger     if (pCl != NULL && clPcieReadAerCapability(pGpu, pCl, &clAer) == NV_OK &&
4731739a20eSAndy Ritger         (clAer.UncorrErrStatusReg != 0 || clAer.RooErrStatus != 0))
4741739a20eSAndy Ritger     {
4751739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4761739a20eSAndy Ritger                   "PCE-I Advanced Error Reporting Corelogic Info:\n");
4771739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4781739a20eSAndy Ritger                   "     Uncorr Error Status Register    : %08X\n",
4791739a20eSAndy Ritger                   clAer.UncorrErrStatusReg);
4801739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4811739a20eSAndy Ritger                   "     Uncorr Error Mask Register      : %08X\n",
4821739a20eSAndy Ritger                   clAer.UncorrErrMaskReg);
4831739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4841739a20eSAndy Ritger                   "     Uncorr Error Severity Register  : %08X\n",
4851739a20eSAndy Ritger                   clAer.UncorrErrSeverityReg);
4861739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4871739a20eSAndy Ritger                   "     Corr Error Status Register      : %08X\n",
4881739a20eSAndy Ritger                   clAer.CorrErrStatusReg);
4891739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4901739a20eSAndy Ritger                   "     Corr Error Mask Register        : %08X\n",
4911739a20eSAndy Ritger                   clAer.CorrErrMaskReg);
4921739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4931739a20eSAndy Ritger                   "     Advanced Err Cap & Ctrl Register: %08X\n",
4941739a20eSAndy Ritger                   clAer.AEcapCrtlReg);
4951739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4961739a20eSAndy Ritger                   "     Header Log [0-3]                : %08X\n",
4971739a20eSAndy Ritger                   clAer.HeaderLogReg.Header[0]);
4981739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
4991739a20eSAndy Ritger                   "     Header Log [4-7]                : %08X\n",
5001739a20eSAndy Ritger                   clAer.HeaderLogReg.Header[1]);
5011739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
5021739a20eSAndy Ritger                   "     Header Log [8-B]                : %08X\n",
5031739a20eSAndy Ritger                   clAer.HeaderLogReg.Header[2]);
5041739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
5051739a20eSAndy Ritger                   "     Header Log [C-F]                : %08X\n",
5061739a20eSAndy Ritger                   clAer.HeaderLogReg.Header[3]);
5071739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
5081739a20eSAndy Ritger                   "     Root Error Command Register     : %08X\n",
5091739a20eSAndy Ritger                   clAer.RootErrCmd);
5101739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
5111739a20eSAndy Ritger                   "     Root Error Status               : %08X\n",
5121739a20eSAndy Ritger                   clAer.RooErrStatus);
5131739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
5141739a20eSAndy Ritger                   "     Error Source ID Register        : %08X\n",
5151739a20eSAndy Ritger                   clAer.ErrSrcReg);
5161739a20eSAndy Ritger 
5171739a20eSAndy Ritger         //
5181739a20eSAndy Ritger         // if you hit this case with some AER errors reported please refer to
5191739a20eSAndy Ritger         // PCI-E manual for detailed bits spec
5201739a20eSAndy Ritger         // TODO: add details bits here
5211739a20eSAndy Ritger         //
5221739a20eSAndy Ritger     }
5231739a20eSAndy Ritger 
5241739a20eSAndy Ritger     if (clDevCtrlStatusFlags_Org)
5251739a20eSAndy Ritger     {
5261739a20eSAndy Ritger         // clear the corelogic status after we had a chance to examine it
5271739a20eSAndy Ritger         clPcieClearDevCtrlStatus(pGpu, pCl, &clDevCtrlStatus);
5281739a20eSAndy Ritger     }
5291739a20eSAndy Ritger 
5301739a20eSAndy Ritger     return NV_OK;
5311739a20eSAndy Ritger }
53294eaea97SAndy Ritger 
53394eaea97SAndy Ritger KernelChannel *
krcGetChannelInError_FWCLIENT(KernelRc * pKernelRc)53494eaea97SAndy Ritger krcGetChannelInError_FWCLIENT
53594eaea97SAndy Ritger (
53694eaea97SAndy Ritger     KernelRc *pKernelRc
53794eaea97SAndy Ritger )
53894eaea97SAndy Ritger {
53994eaea97SAndy Ritger     NV_ASSERT_OR_RETURN(IS_GSP_CLIENT(ENG_GET_GPU(pKernelRc)), NULL);
54094eaea97SAndy Ritger     return pKernelRc->pPreviousChannelInError;
54194eaea97SAndy Ritger }
542