1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "gpu/gpu.h"
25 #include "gpu/gpu_child_class_defs.h"
26 #include "kernel/gpu/intr/intr.h"
27 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
28 #include "gpu/mem_mgr/mem_mgr.h"
29 #include "published/ampere/ga100/dev_fb.h"
30 #include "published/ampere/ga100/dev_vm.h"
31 #include "published/ampere/ga100/dev_fuse.h"
32 #include "virtualization/hypervisor/hypervisor.h"
33 
// Human-readable descriptions for NV_ERROR_CONT_ERR_ID codes, indexed by the
// error code value (see usage in _gpuGenerateErrorLog_GA100).
const char *ppErrContErrorIdStr[] = NV_ERROR_CONT_ERR_ID_STRING_PUBLIC;

// Error containment policy table: per-errorCode settings (RC error code,
// NV2080 notifier, reset requirements) consulted by
// gpuUpdateErrorContainmentState_GA100.
NV_ERROR_CONT_STATE_TABLE g_errContStateTable[] = NV_ERROR_CONT_STATE_TABLE_SETTINGS;
39 
40 /*!
41  * @brief Get index of specified errorCode in the Error Containment state table
42  *
43  * @param[in] pGpu              OBJGPU pointer
44  * @param[in] errorCode         Error Containment error code
45  * @param[in] pTableIndex       Index of specified errorCode in the Error Containment State table
46  *
47  * @returns NV_STATUS
48  */
49 static NV_STATUS _gpuGetErrorContStateTableIndex_GA100(OBJGPU              *pGpu,
50                                                        NV_ERROR_CONT_ERR_ID errorCode,
51                                                        NvU32                *pTableIndex);
52 
53 /*!
54  * @brief Send NV2080_NOTIFIER*
55  *
56  * @param[in]  pGpu                   OBJGPU pointer
57  * @param[in]  errorCode              Error Containment error code
58  * @param[in]  loc                    Location, SubLocation information
59  * @param[in]  nv2080Notifier         NV2080_NOTIFIER*
60  *
61  * @returns NV_STATUS
62  */
63 static NV_STATUS _gpuNotifySubDeviceEventNotifier_GA100(OBJGPU                 *pGpu,
64                                                         NV_ERROR_CONT_ERR_ID    errorCode,
65                                                         NV_ERROR_CONT_LOCATION  loc,
66                                                         NvU32                   nv2080Notifier);
67 
68 /*!
69  * @brief Generate error log for corresponding error containment error code.
70  *
71  * @param[in]  pGpu                   OBJGPU pointer
72  * @param[in]  errorCode              Error Containment error code
73  * @param[in]  loc                    Location, SubLocation information
74  * @param[in]  pErrorContSmcSetting   Error containment SMC Disable / Enable settings
75  *
76  * @returns NV_STATUS
77  */
78 static NV_STATUS _gpuGenerateErrorLog_GA100(OBJGPU                           *pGpu,
79                                             NV_ERROR_CONT_ERR_ID              errorCode,
80                                             NV_ERROR_CONT_LOCATION            loc,
81                                             NV_ERROR_CONT_SMC_DIS_EN_SETTING *pErrorContSmcSetting);
82 
83 /*!
84  * @brief Read fuse for display supported status.
85  *        Some chips not marked displayless do not support display
86  */
87 NvBool
gpuFuseSupportsDisplay_GA100(OBJGPU * pGpu)88 gpuFuseSupportsDisplay_GA100
89 (
90     OBJGPU *pGpu
91 )
92 {
93     return GPU_FLD_TEST_DRF_DEF(pGpu, _FUSE, _STATUS_OPT_DISPLAY, _DATA, _ENABLE);
94 }
95 
96 /*!
97  * @brief Clear FBHUB POISON Interrupt state for Bug 2924523.
98  * This HAL handles the CPU interrupt tree
99  *
100  * @param[in]      pGpu           OBJGPU pointer
101  *
102  * @return NV_OK if success, else appropriate NV_STATUS code
103  */
104 NV_STATUS
gpuClearFbhubPoisonIntrForBug2924523_GA100(OBJGPU * pGpu)105 gpuClearFbhubPoisonIntrForBug2924523_GA100
106 (
107     OBJGPU *pGpu
108 )
109 {
110     // INTR module is not stateloaded at gpuPostConstruct, so use HW default
111     NvU32 intrVector = NV_PFB_FBHUB_POISON_INTR_VECTOR_HW_INIT;
112 
113     if (pGpu == NULL)
114         return NV_OK;
115 
116     //
117     // Check if FBHUB Poison interrupt got triggered before RM Init due
118     // to VBIOS IFR on GA100. If yes, clear the FBHUB Interrupt. This WAR is
119     // required for Bug 2924523 as VBIOS IFR causes FBHUB Poison intr.
120     //
121     if (intrIsVectorPending_HAL(pGpu, GPU_GET_INTR(pGpu), intrVector, NULL))
122     {
123         NV_PRINTF(LEVEL_ERROR, "FBHUB Interrupt detected. Clearing it.\n");
124         intrClearLeafVector_HAL(pGpu, GPU_GET_INTR(pGpu), intrVector, NULL);
125     }
126 
127     return NV_OK;
128 }
129 
130 /*!
131  * @brief Returns FLA VASpace Size for Ampere
132  *
133  * @param[in] pGpu                        OBJGPU pointer
134  * @param[in] bNvSwitchVirtualization     boolean
135  *
136  * @returns NvU64 -> size of FLA VASpace
137  */
138 NvU64
gpuGetFlaVasSize_GA100(OBJGPU * pGpu,NvBool bNvswitchVirtualization)139 gpuGetFlaVasSize_GA100
140 (
141     OBJGPU *pGpu,
142     NvBool  bNvswitchVirtualization
143 )
144 {
145     MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
146     NvU64  totalFbSize = (pMemoryManager->Ram.fbTotalMemSizeMb << 20);
147 
148     if (bNvswitchVirtualization || totalFbSize <= NVBIT64(36))
149     {
150         return 0x2000000000;  // 128GB
151     }
152     else
153     {
154         return (totalFbSize * 2);
155     }
156 }
157 
158 /*!
159  * @brief Is ctx buffer allocation in PMA supported
160  */
161 NvBool
gpuIsCtxBufAllocInPmaSupported_GA100(OBJGPU * pGpu)162 gpuIsCtxBufAllocInPmaSupported_GA100
163 (
164     OBJGPU *pGpu
165 )
166 {
167     //
168     // This is supported by default on baremetal RM.
169     // This has no impact in guest-RM since ctxBufPools are disabled on guest.
170     // We leave this disabled on host-RM. TODO: Bug 4066846
171     //
172     if (!hypervisorIsVgxHyper())
173         return NV_TRUE;
174     return NV_FALSE;
175 }
176 
177 //
178 // List of GPU children that present for the chip. List entries contain$
179 // {CLASS-ID, # of instances} pairs, e.g.: {CE, 2} is 2 instance of OBJCE. This$
180 // list controls only engine presence. Order is defined by$
181 // gpuGetChildrenOrder_HAL.$
182 //
183 // IMPORTANT: This function is to be deleted. Engine removal should instead be$
184 // handled by <eng>ConstructEngine returning NV_ERR_NOT_SUPPORTED. PLEASE DO NOT$
185 // FORK THIS LIST!$
186 //
187 // List entries contain {CLASS-ID, # of instances} pairs.
188 //
189 
// Child classes present on GA100, with per-class instance counts.
// This table controls presence only; construction order is defined by
// gpuGetChildrenOrder_HAL.
static const GPUCHILDPRESENT gpuChildrenPresent_GA100[] =
{
    GPU_CHILD_PRESENT(OBJTMR, 1),
    GPU_CHILD_PRESENT(KernelMIGManager, 1),
    GPU_CHILD_PRESENT(KernelGraphicsManager, 1),
    GPU_CHILD_PRESENT(KernelRc, 1),
    GPU_CHILD_PRESENT(Intr, 1),
    GPU_CHILD_PRESENT(NvDebugDump, 1),
    GPU_CHILD_PRESENT(OBJGPUMON, 1),
    GPU_CHILD_PRESENT(OBJSWENG, 1),
    GPU_CHILD_PRESENT(OBJUVM, 1),
    GPU_CHILD_PRESENT(KernelBif, 1),
    GPU_CHILD_PRESENT(KernelBus, 1),
    GPU_CHILD_PRESENT(KernelCE, 10),
    GPU_CHILD_PRESENT(KernelDisplay, 1),
    GPU_CHILD_PRESENT(VirtMemAllocator, 1),
    GPU_CHILD_PRESENT(KernelMemorySystem, 1),
    GPU_CHILD_PRESENT(MemoryManager, 1),
    GPU_CHILD_PRESENT(KernelFifo, 1),
    GPU_CHILD_PRESENT(KernelGmmu, 1),
    GPU_CHILD_PRESENT(KernelGraphics, 8),
    GPU_CHILD_PRESENT(KernelHwpm, 1),
    GPU_CHILD_PRESENT(KernelMc, 1),
    GPU_CHILD_PRESENT(SwIntr, 1),
    GPU_CHILD_PRESENT(KernelNvlink, 1),
    GPU_CHILD_PRESENT(KernelPerf, 1),
    GPU_CHILD_PRESENT(KernelPmu, 1),
    GPU_CHILD_PRESENT(KernelSec2, 1),
    GPU_CHILD_PRESENT(KernelGsp, 1),
    GPU_CHILD_PRESENT(ConfidentialCompute, 1),
};
221 
/*!
 * @brief Return the static table of GPU children present on GA100.
 *
 * @param[in]  pGpu         OBJGPU pointer (unused here)
 * @param[out] pNumEntries  Number of entries in the returned table
 *
 * @returns pointer to the GA100 child-presence table
 */
const GPUCHILDPRESENT *
gpuGetChildrenPresent_GA100(OBJGPU *pGpu, NvU32 *pNumEntries)
{
    *pNumEntries = NV_ARRAY_ELEMENTS(gpuChildrenPresent_GA100);
    return gpuChildrenPresent_GA100;
}
228 
229 //
230 // List of GPU children that present for the chip. List entries contain$
231 // {CLASS-ID, # of instances} pairs, e.g.: {CE, 2} is 2 instance of OBJCE. This$
232 // list controls only engine presence. Order is defined by$
233 // gpuGetChildrenOrder_HAL.$
234 //
235 // IMPORTANT: This function is to be deleted. Engine removal should instead be$
236 // handled by <eng>ConstructEngine returning NV_ERR_NOT_SUPPORTED. PLEASE DO NOT$
237 // FORK THIS LIST!$
238 //
239 // List entries contain {CLASS-ID, # of instances} pairs.
240 //
// Child classes present on GA102, with per-class instance counts.
// Differs from the GA100 table only in instance counts: KernelCE (5 vs 10)
// and KernelGraphics (1 vs 8).
static const GPUCHILDPRESENT gpuChildrenPresent_GA102[] =
{
    GPU_CHILD_PRESENT(OBJTMR, 1),
    GPU_CHILD_PRESENT(KernelMIGManager, 1),
    GPU_CHILD_PRESENT(KernelGraphicsManager, 1),
    GPU_CHILD_PRESENT(KernelRc, 1),
    GPU_CHILD_PRESENT(Intr, 1),
    GPU_CHILD_PRESENT(NvDebugDump, 1),
    GPU_CHILD_PRESENT(OBJGPUMON, 1),
    GPU_CHILD_PRESENT(OBJSWENG, 1),
    GPU_CHILD_PRESENT(OBJUVM, 1),
    GPU_CHILD_PRESENT(KernelBif, 1),
    GPU_CHILD_PRESENT(KernelBus, 1),
    GPU_CHILD_PRESENT(KernelCE, 5),
    GPU_CHILD_PRESENT(KernelDisplay, 1),
    GPU_CHILD_PRESENT(VirtMemAllocator, 1),
    GPU_CHILD_PRESENT(KernelMemorySystem, 1),
    GPU_CHILD_PRESENT(MemoryManager, 1),
    GPU_CHILD_PRESENT(KernelFifo, 1),
    GPU_CHILD_PRESENT(KernelGmmu, 1),
    GPU_CHILD_PRESENT(KernelGraphics, 1),
    GPU_CHILD_PRESENT(KernelHwpm, 1),
    GPU_CHILD_PRESENT(KernelMc, 1),
    GPU_CHILD_PRESENT(SwIntr, 1),
    GPU_CHILD_PRESENT(KernelNvlink, 1),
    GPU_CHILD_PRESENT(KernelPerf, 1),
    GPU_CHILD_PRESENT(KernelPmu, 1),
    GPU_CHILD_PRESENT(KernelSec2, 1),
    GPU_CHILD_PRESENT(KernelGsp, 1),
    GPU_CHILD_PRESENT(ConfidentialCompute, 1),
};
272 
/*!
 * @brief Return the static table of GPU children present on GA102.
 *
 * @param[in]  pGpu         OBJGPU pointer (unused here)
 * @param[out] pNumEntries  Number of entries in the returned table
 *
 * @returns pointer to the GA102 child-presence table
 */
const GPUCHILDPRESENT *
gpuGetChildrenPresent_GA102(OBJGPU *pGpu, NvU32 *pNumEntries)
{
    *pNumEntries = NV_ARRAY_ELEMENTS(gpuChildrenPresent_GA102);
    return gpuChildrenPresent_GA102;
}
279 
280 /*! @brief Returns if a P2P object is allocated in SRIOV mode.
281  *
282  *  @param[in]   pGpu     OBJGPU pointer
283  *
284  *  @returns for baremetal, this should just return NV_TRUE
285              for SRIOV, return the SRIOV Info
286  */
287 NvBool
gpuCheckIsP2PAllocated_GA100(OBJGPU * pGpu)288 gpuCheckIsP2PAllocated_GA100
289 (
290     OBJGPU *pGpu
291 )
292 {
293     if (!IS_VIRTUAL(pGpu) && !gpuIsSriovEnabled(pGpu))
294         return NV_TRUE;
295 
296     return pGpu->sriovState.bP2PAllocated;
297 }
298 
299 /*!
300  * @brief Get index of specified errorCode in the Error Containment state table
301  *
302  * @param[in] pGpu              OBJGPU pointer
303  * @param[in] errorCode         Error Containment error code
304  * @param[in] pTableIndex       Index of specified errorCode in the Error Containment state table
305  *
306  * @returns NV_STATUS
307  */
308 static
309 NV_STATUS
_gpuGetErrorContStateTableIndex_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NvU32 * pTableIndex)310 _gpuGetErrorContStateTableIndex_GA100
311 (
312     OBJGPU              *pGpu,
313     NV_ERROR_CONT_ERR_ID errorCode,
314     NvU32                *pTableIndex
315 )
316 {
317     NvU32 index;
318     NvU32 tableSize = NV_ARRAY_ELEMENTS(g_errContStateTable);
319 
320     NV_ASSERT_OR_RETURN(pTableIndex != NULL, NV_ERR_INVALID_ARGUMENT);
321 
322     for (index = 0; index < tableSize; index++)
323     {
324         if (errorCode == g_errContStateTable[index].errorCode)
325         {
326             *pTableIndex = index;
327             return NV_OK;
328         }
329     }
330 
331     return NV_ERR_INVALID_ARGUMENT;
332 }
333 
334 /*!
335  * @brief Send NV2080_NOTIFIER*
336  *
337  * @param[in]  pGpu                   OBJGPU pointer
338  * @param[in]  errorCode              Error Containment error code
339  * @param[in]  loc                    Location, SubLocation information
340  * @param[in]  nv2080Notifier         NV2080_NOTIFIER*
341  *
342  * @returns NV_STATUS
343  */
344 static
345 NV_STATUS
_gpuNotifySubDeviceEventNotifier_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NV_ERROR_CONT_LOCATION loc,NvU32 nv2080Notifier)346 _gpuNotifySubDeviceEventNotifier_GA100
347 (
348     OBJGPU                 *pGpu,
349     NV_ERROR_CONT_ERR_ID    errorCode,
350     NV_ERROR_CONT_LOCATION  loc,
351     NvU32                   nv2080Notifier
352 )
353 {
354     NvV16 info16 = 0;
355     NvV32 info32 = 0;
356     RM_ENGINE_TYPE localRmEngineType = 0;
357 
358     // Return if no notifier needs to be sent for this errorCode.
359     if (nv2080Notifier == NO_NV2080_NOTIFIER)
360     {
361         return NV_OK;
362     }
363 
364     switch (errorCode)
365     {
366         // Intentional fall-through
367         case NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED:
368         case NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE:
369         case NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON:
370         case NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON:
371             info16 = FB_MEMORY_ERROR;
372             break;
373 
374         // Intentional fall-through
375         case NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG:
376         case NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON:
377         case NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG:
378         case NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG:
379             info16 = LTC_ERROR;
380             break;
381 
382         case NV_ERROR_CONT_ERR_ID_E10_SM_POISON:
383         case NV_ERROR_CONT_ERR_ID_E16_GCC_POISON:
384         case NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON:
385             info16 = ROBUST_CHANNEL_GR_EXCEPTION;
386             break;
387 
388         // Intentional fall-through
389         case NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL:
390         case NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL:
391             NV_ASSERT_OR_RETURN(loc.locType == NV_ERROR_CONT_LOCATION_TYPE_ENGINE, NV_ERR_INVALID_ARGUMENT);
392             //
393             // If SMC is enabled, RM need to notify partition local engineId. Convert
394             // global ID to partition local if client has filled proper engineIDs
395             //
396             localRmEngineType = loc.locInfo.engineLoc.rmEngineId;
397             if (IS_MIG_IN_USE(pGpu) &&
398                 RM_ENGINE_TYPE_IS_VALID(loc.locInfo.engineLoc.rmEngineId))
399             {
400                 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
401                 MIG_INSTANCE_REF ref;
402 
403                 NV_ASSERT_OK_OR_RETURN(kmigmgrGetInstanceRefFromDevice(pGpu,
404                                                                        pKernelMIGManager,
405                                                                        loc.locInfo.engineLoc.pDevice,
406                                                                        &ref));
407 
408                 if (!kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, loc.locInfo.engineLoc.rmEngineId, ref))
409                 {
410                     // Notifier is requested for an unsupported engine
411                     NV_PRINTF(LEVEL_ERROR,
412                               "Notifier requested for an unsupported rm engine id (0x%x)\n",
413                               loc.locInfo.engineLoc.rmEngineId);
414                     return NV_ERR_INVALID_ARGUMENT;
415                 }
416 
417                 // Override the engine type with the local engine idx
418                 NV_ASSERT_OK_OR_RETURN(kmigmgrGetGlobalToLocalEngineType(pGpu,
419                                                                          pKernelMIGManager,
420                                                                          ref,
421                                                                          loc.locInfo.engineLoc.rmEngineId,
422                                                                          &localRmEngineType));
423             }
424 
425             info16 = ROBUST_CHANNEL_CE_ERROR(NV2080_ENGINE_TYPE_COPY_IDX(localRmEngineType));
426             break;
427 
428         case NV_ERROR_CONT_ERR_ID_E13_MMU_POISON:
429             info16 = ROBUST_CHANNEL_FIFO_ERROR_MMU_ERR_FLT;
430             break;
431     }
432 
433     gpuNotifySubDeviceEvent(pGpu,
434                             nv2080Notifier,
435                             NULL,
436                             0,
437                             info32,         // Unused
438                             info16);
439 
440     return NV_OK;
441 }
442 
443 /*!
444  * @brief Generate error log for corresponding error containment error code.
445  *
446  * Format / example :
447  *    1) Contained Error with SMC Partition attribution (Error attributable to SMC Partition or process in SMC partition):
448  *    2) Contained Error with no SMC partitioning  (Error attributable to process on GPU):
449  *    3) Uncontaned Error
450  *
451  * >> NVRM: Xid (PCI:0000:01:00 GPU-I:05): 94, pid=7194, Contained: CE User Channel (0x9). RST: No, D-RST: No
452  * >> NVRM: Xid (PCI:0000:01:00): 94, pid=7062, Contained: CE User Channel (0x9). RST: No, D-RST: No
453  * >> NVRM: Xid (PCI:0000:01:00): 95, pid=7062, Uncontained: LTC TAG (0x2,0x1). RST: Yes, D-RST: No
454  *
455  * @param[in]  pGpu                   OBJGPU pointer
456  * @param[in]  errorCode              Error Containment error code
457  * @param[in]  loc                    Location, SubLocation information
458  * @param[in]  pErrorContSmcSetting   Error containment SMC Disable / Enable settings
459  *
460  * @returns NV_STATUS
461  */
462 static
463 NV_STATUS
_gpuGenerateErrorLog_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NV_ERROR_CONT_LOCATION loc,NV_ERROR_CONT_SMC_DIS_EN_SETTING * pErrorContSmcSetting)464 _gpuGenerateErrorLog_GA100(OBJGPU                           *pGpu,
465                            NV_ERROR_CONT_ERR_ID              errorCode,
466                            NV_ERROR_CONT_LOCATION            loc,
467                            NV_ERROR_CONT_SMC_DIS_EN_SETTING *pErrorContSmcSetting)
468 {
469     RM_ENGINE_TYPE localRmEngineType;
470     NvU32 rcErrorCode = pErrorContSmcSetting->rcErrorCode;
471 
472     NV_ASSERT_OR_RETURN((pErrorContSmcSetting != NULL), NV_ERR_INVALID_ARGUMENT);
473 
474     switch (loc.locType)
475     {
476         case NV_ERROR_CONT_LOCATION_TYPE_DRAM:
477             nvErrorLog_va((void *)pGpu,
478                           rcErrorCode,
479                           "%s: %s (0x%x,0x%x). physAddr: 0x%08llx RST: %s, D-RST: %s",
480                           rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
481                               ROBUST_CHANNEL_CONTAINED_ERROR_STR :
482                               ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
483                           ppErrContErrorIdStr[errorCode],
484                           loc.locInfo.dramLoc.partition,
485                           loc.locInfo.dramLoc.subPartition,
486                           loc.locInfo.dramLoc.physicalAddress,
487                           pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
488                           pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
489             break;
490 
491         case NV_ERROR_CONT_LOCATION_TYPE_LTC:
492             nvErrorLog_va((void *)pGpu,
493                           rcErrorCode,
494                           "%s: %s (0x%x,0x%x). RST: %s, D-RST: %s",
495                           rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
496                               ROBUST_CHANNEL_CONTAINED_ERROR_STR :
497                               ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
498                           ppErrContErrorIdStr[errorCode],
499                           loc.locInfo.ltcLoc.partition,
500                           loc.locInfo.ltcLoc.slice,
501                           pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
502                           pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
503             break;
504 
505         case NV_ERROR_CONT_LOCATION_TYPE_ENGINE:
506             NV_ASSERT_OR_RETURN(loc.locType == NV_ERROR_CONT_LOCATION_TYPE_ENGINE, NV_ERR_INVALID_ARGUMENT);
507             //
508             // If SMC is enabled, RM need to notify partition local engineId. Convert
509             // global ID to partition local if client has filled proper engineIDs
510             //
511             localRmEngineType = loc.locInfo.engineLoc.rmEngineId;
512             if (IS_MIG_IN_USE(pGpu) &&
513                 RM_ENGINE_TYPE_IS_VALID(loc.locInfo.engineLoc.rmEngineId))
514             {
515                 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
516                 MIG_INSTANCE_REF ref;
517                 NV_ASSERT_OK_OR_RETURN(kmigmgrGetMIGReferenceFromEngineType(pGpu,
518                                                                             pKernelMIGManager,
519                                                                             loc.locInfo.engineLoc.rmEngineId,
520                                                                             &ref));
521                 // Override the engine type with the local engine idx
522                 NV_ASSERT_OK_OR_RETURN(kmigmgrGetGlobalToLocalEngineType(pGpu,
523                                                                          pKernelMIGManager,
524                                                                          ref,
525                                                                          loc.locInfo.engineLoc.rmEngineId,
526                                                                          &localRmEngineType));
527             }
528 
529             nvErrorLog_va((void *)pGpu,
530                           rcErrorCode,
531                           "%s: %s (0x%x). RST: %s, D-RST: %s",
532                           rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
533                               ROBUST_CHANNEL_CONTAINED_ERROR_STR :
534                               ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
535                           ppErrContErrorIdStr[errorCode],
536                           gpuGetNv2080EngineType(localRmEngineType),
537                           pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
538                           pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
539             break;
540 
541         case NV_ERROR_CONT_LOCATION_TYPE_NONE:
542             nvErrorLog_va((void *)pGpu,
543                           rcErrorCode,
544                           "%s: %s. RST: %s, D-RST: %s",
545                           rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
546                               ROBUST_CHANNEL_CONTAINED_ERROR_STR :
547                               ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
548                           ppErrContErrorIdStr[errorCode],
549                           pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
550                           pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
551             break;
552     }
553 
554     return NV_OK;
555 }
556 
557 /*!
558  * @brief Determine Error Containment RC code, print Xid, send NV2080_NOTIFIER*,
559  * mark device for reset or mark device for drain and reset as indicated in
560  * error containment state table (refer gpu/error_cont.h).
561  *
562  * @param[in]  pGpu              OBJGPU pointer
563  * @param[in]  errorCode         Error Containment error code
564  * @param[in]  loc               Location, SubLocation information
565  * @param[out] pRcErrorCode      RC Error code
566  *
567  * @returns NV_STATUS
568  */
569 NV_STATUS
gpuUpdateErrorContainmentState_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NV_ERROR_CONT_LOCATION loc,NvU32 * pRcErrorCode)570 gpuUpdateErrorContainmentState_GA100
571 (
572     OBJGPU                 *pGpu,
573     NV_ERROR_CONT_ERR_ID    errorCode,
574     NV_ERROR_CONT_LOCATION  loc,
575     NvU32                  *pRcErrorCode
576 )
577 {
578     NvU32 tableIndex = 0;
579     NvBool bIsSmcEnabled = NV_FALSE;
580     NvU32 smcDisEnSettingIndex = 0;
581     NvU32 rcErrorCode = 0;
582     KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
583     NV_ERROR_CONT_SMC_DIS_EN_SETTING *pErrorContSmcSetting = NULL;
584 
585     if (!gpuIsGlobalPoisonFuseEnabled(pGpu))
586     {
587         return NV_ERR_NOT_SUPPORTED;
588     }
589 
590     NV_ASSERT_OK_OR_RETURN(_gpuGetErrorContStateTableIndex_GA100(pGpu, errorCode, &tableIndex));
591 
592     // Check if MIG GPU partitioning is enabled
593     if (IS_MIG_IN_USE(pGpu))
594     {
595         bIsSmcEnabled = NV_TRUE;
596     }
597 
598     // MIG Memory partitioning config entry index.
599     if (pKernelMIGManager != NULL && kmigmgrIsMIGMemPartitioningEnabled(pGpu, pKernelMIGManager))
600     {
601         smcDisEnSettingIndex = 1;
602     }
603 
604     pErrorContSmcSetting = &(g_errContStateTable[tableIndex].smcDisEnSetting[smcDisEnSettingIndex]);
605 
606     rcErrorCode = pErrorContSmcSetting->rcErrorCode;
607 
608     // Pass RC Error code if user requested it.
609     if (pRcErrorCode != NULL)
610     {
611         *pRcErrorCode = rcErrorCode;
612     }
613 
614     // Update partition attribution for this exception only if SMC is enabled.
615     if (pErrorContSmcSetting->bPrintSmcPartitionInfo && bIsSmcEnabled)
616     {
617         // Fall through on error.
618         gpuSetPartitionErrorAttribution_HAL(pGpu,
619                                             errorCode,
620                                             loc,
621                                             rcErrorCode);
622     }
623 
624     // Print Xid only if Ampere Error Containment XIDs printing is enabled and rcErrorCode is valid
625     if (gpuIsAmpereErrorContainmentXidEnabled(pGpu) && rcErrorCode != NO_XID)
626     {
627         NV_ASSERT_OK_OR_RETURN(_gpuGenerateErrorLog_GA100(pGpu,
628                                                           errorCode,
629                                                           loc,
630                                                           pErrorContSmcSetting));
631     }
632 
633     // Send NV2080_NOTIFIER*
634     if (pErrorContSmcSetting->nv2080Notifier != NO_NV2080_NOTIFIER)
635     {
636         NV_ASSERT_OK(_gpuNotifySubDeviceEventNotifier_GA100(pGpu,
637                                                             errorCode,
638                                                             loc,
639                                                             pErrorContSmcSetting->nv2080Notifier));
640     }
641 
642     // Set the scratch bit to indicate the GPU needs to be reset.
643     if ((pErrorContSmcSetting->bGpuResetReqd) &&
644         (gpuMarkDeviceForReset(pGpu) != NV_OK))
645     {
646         NV_PRINTF(LEVEL_ERROR, "Failed to mark GPU for pending reset");
647     }
648 
649     // Set the scratch bit to indicate the GPU needs to be reset.
650     if (pErrorContSmcSetting->bGpuDrainAndResetReqd &&
651         gpuMarkDeviceForDrainAndReset(pGpu) != NV_OK)
652     {
653         NV_PRINTF(LEVEL_ERROR, "Failed to mark GPU for pending drain and reset");
654     }
655 
656     return NV_OK;
657 }
658 
659 NvBool
gpuCheckIfFbhubPoisonIntrPending_GA100(OBJGPU * pGpu)660 gpuCheckIfFbhubPoisonIntrPending_GA100
661 (
662     OBJGPU *pGpu
663 )
664 {
665     return intrIsVectorPending_HAL(pGpu, GPU_GET_INTR(pGpu), NV_PFB_FBHUB_POISON_INTR_VECTOR_HW_INIT, NULL);
666 }
667