1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 * SPDX-License-Identifier: MIT
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "gpu/gpu.h"
25 #include "gpu/gpu_child_class_defs.h"
26 #include "kernel/gpu/intr/intr.h"
27 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
28 #include "gpu/mem_mgr/mem_mgr.h"
29 #include "published/ampere/ga100/dev_fb.h"
30 #include "published/ampere/ga100/dev_vm.h"
31 #include "published/ampere/ga100/dev_fuse.h"
32 #include "virtualization/hypervisor/hypervisor.h"
33
// Human-readable string for each error containment error id; indexed by
// NV_ERROR_CONT_ERR_ID (see usage in _gpuGenerateErrorLog_GA100).
const char *ppErrContErrorIdStr[] = NV_ERROR_CONT_ERR_ID_STRING_PUBLIC;

// Error containment state table: per-error-code RC code, notifier and reset
// policy. Searched linearly by _gpuGetErrorContStateTableIndex_GA100.
NV_ERROR_CONT_STATE_TABLE g_errContStateTable[] = NV_ERROR_CONT_STATE_TABLE_SETTINGS;
39
40 /*!
41 * @brief Get index of specified errorCode in the Error Containment state table
42 *
43 * @param[in] pGpu OBJGPU pointer
44 * @param[in] errorCode Error Containment error code
45 * @param[in] pTableIndex Index of specified errorCode in the Error Containment State table
46 *
47 * @returns NV_STATUS
48 */
49 static NV_STATUS _gpuGetErrorContStateTableIndex_GA100(OBJGPU *pGpu,
50 NV_ERROR_CONT_ERR_ID errorCode,
51 NvU32 *pTableIndex);
52
53 /*!
54 * @brief Send NV2080_NOTIFIER*
55 *
56 * @param[in] pGpu OBJGPU pointer
57 * @param[in] errorCode Error Containment error code
58 * @param[in] loc Location, SubLocation information
59 * @param[in] nv2080Notifier NV2080_NOTIFIER*
60 *
61 * @returns NV_STATUS
62 */
63 static NV_STATUS _gpuNotifySubDeviceEventNotifier_GA100(OBJGPU *pGpu,
64 NV_ERROR_CONT_ERR_ID errorCode,
65 NV_ERROR_CONT_LOCATION loc,
66 NvU32 nv2080Notifier);
67
68 /*!
69 * @brief Generate error log for corresponding error containment error code.
70 *
71 * @param[in] pGpu OBJGPU pointer
72 * @param[in] errorCode Error Containment error code
73 * @param[in] loc Location, SubLocation information
74 * @param[in] pErrorContSmcSetting Error containment SMC Disable / Enable settings
75 *
76 * @returns NV_STATUS
77 */
78 static NV_STATUS _gpuGenerateErrorLog_GA100(OBJGPU *pGpu,
79 NV_ERROR_CONT_ERR_ID errorCode,
80 NV_ERROR_CONT_LOCATION loc,
81 NV_ERROR_CONT_SMC_DIS_EN_SETTING *pErrorContSmcSetting);
82
83 /*!
84 * @brief Read fuse for display supported status.
85 * Some chips not marked displayless do not support display
86 */
87 NvBool
gpuFuseSupportsDisplay_GA100(OBJGPU * pGpu)88 gpuFuseSupportsDisplay_GA100
89 (
90 OBJGPU *pGpu
91 )
92 {
93 return GPU_FLD_TEST_DRF_DEF(pGpu, _FUSE, _STATUS_OPT_DISPLAY, _DATA, _ENABLE);
94 }
95
96 /*!
97 * @brief Clear FBHUB POISON Interrupt state for Bug 2924523.
98 * This HAL handles the CPU interrupt tree
99 *
100 * @param[in] pGpu OBJGPU pointer
101 *
102 * @return NV_OK if success, else appropriate NV_STATUS code
103 */
104 NV_STATUS
gpuClearFbhubPoisonIntrForBug2924523_GA100(OBJGPU * pGpu)105 gpuClearFbhubPoisonIntrForBug2924523_GA100
106 (
107 OBJGPU *pGpu
108 )
109 {
110 // INTR module is not stateloaded at gpuPostConstruct, so use HW default
111 NvU32 intrVector = NV_PFB_FBHUB_POISON_INTR_VECTOR_HW_INIT;
112
113 if (pGpu == NULL)
114 return NV_OK;
115
116 //
117 // Check if FBHUB Poison interrupt got triggered before RM Init due
118 // to VBIOS IFR on GA100. If yes, clear the FBHUB Interrupt. This WAR is
119 // required for Bug 2924523 as VBIOS IFR causes FBHUB Poison intr.
120 //
121 if (intrIsVectorPending_HAL(pGpu, GPU_GET_INTR(pGpu), intrVector, NULL))
122 {
123 NV_PRINTF(LEVEL_ERROR, "FBHUB Interrupt detected. Clearing it.\n");
124 intrClearLeafVector_HAL(pGpu, GPU_GET_INTR(pGpu), intrVector, NULL);
125 }
126
127 return NV_OK;
128 }
129
130 /*!
131 * @brief Returns FLA VASpace Size for Ampere
132 *
133 * @param[in] pGpu OBJGPU pointer
134 * @param[in] bNvSwitchVirtualization boolean
135 *
136 * @returns NvU64 -> size of FLA VASpace
137 */
138 NvU64
gpuGetFlaVasSize_GA100(OBJGPU * pGpu,NvBool bNvswitchVirtualization)139 gpuGetFlaVasSize_GA100
140 (
141 OBJGPU *pGpu,
142 NvBool bNvswitchVirtualization
143 )
144 {
145 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
146 NvU64 totalFbSize = (pMemoryManager->Ram.fbTotalMemSizeMb << 20);
147
148 if (bNvswitchVirtualization || totalFbSize <= NVBIT64(36))
149 {
150 return 0x2000000000; // 128GB
151 }
152 else
153 {
154 return (totalFbSize * 2);
155 }
156 }
157
158 /*!
159 * @brief Is ctx buffer allocation in PMA supported
160 */
161 NvBool
gpuIsCtxBufAllocInPmaSupported_GA100(OBJGPU * pGpu)162 gpuIsCtxBufAllocInPmaSupported_GA100
163 (
164 OBJGPU *pGpu
165 )
166 {
167 //
168 // This is supported by default on baremetal RM.
169 // This has no impact in guest-RM since ctxBufPools are disabled on guest.
170 // We leave this disabled on host-RM. TODO: Bug 4066846
171 //
172 if (!hypervisorIsVgxHyper())
173 return NV_TRUE;
174 return NV_FALSE;
175 }
176
177 //
178 // List of GPU children that present for the chip. List entries contain$
179 // {CLASS-ID, # of instances} pairs, e.g.: {CE, 2} is 2 instance of OBJCE. This$
180 // list controls only engine presence. Order is defined by$
181 // gpuGetChildrenOrder_HAL.$
182 //
183 // IMPORTANT: This function is to be deleted. Engine removal should instead be$
184 // handled by <eng>ConstructEngine returning NV_ERR_NOT_SUPPORTED. PLEASE DO NOT$
185 // FORK THIS LIST!$
186 //
187 // List entries contain {CLASS-ID, # of instances} pairs.
188 //
189
// GA100 engine-presence table. Note the instance counts that differ from
// GA102: 10 KernelCE and 8 KernelGraphics instances here.
static const GPUCHILDPRESENT gpuChildrenPresent_GA100[] =
{
    GPU_CHILD_PRESENT(OBJTMR, 1),
    GPU_CHILD_PRESENT(KernelMIGManager, 1),
    GPU_CHILD_PRESENT(KernelGraphicsManager, 1),
    GPU_CHILD_PRESENT(KernelRc, 1),
    GPU_CHILD_PRESENT(Intr, 1),
    GPU_CHILD_PRESENT(NvDebugDump, 1),
    GPU_CHILD_PRESENT(OBJGPUMON, 1),
    GPU_CHILD_PRESENT(OBJSWENG, 1),
    GPU_CHILD_PRESENT(OBJUVM, 1),
    GPU_CHILD_PRESENT(KernelBif, 1),
    GPU_CHILD_PRESENT(KernelBus, 1),
    GPU_CHILD_PRESENT(KernelCE, 10),
    GPU_CHILD_PRESENT(KernelDisplay, 1),
    GPU_CHILD_PRESENT(VirtMemAllocator, 1),
    GPU_CHILD_PRESENT(KernelMemorySystem, 1),
    GPU_CHILD_PRESENT(MemoryManager, 1),
    GPU_CHILD_PRESENT(KernelFifo, 1),
    GPU_CHILD_PRESENT(KernelGmmu, 1),
    GPU_CHILD_PRESENT(KernelGraphics, 8),
    GPU_CHILD_PRESENT(KernelHwpm, 1),
    GPU_CHILD_PRESENT(KernelMc, 1),
    GPU_CHILD_PRESENT(SwIntr, 1),
    GPU_CHILD_PRESENT(KernelNvlink, 1),
    GPU_CHILD_PRESENT(KernelPerf, 1),
    GPU_CHILD_PRESENT(KernelPmu, 1),
    GPU_CHILD_PRESENT(KernelSec2, 1),
    GPU_CHILD_PRESENT(KernelGsp, 1),
    GPU_CHILD_PRESENT(ConfidentialCompute, 1),
};
221
222 const GPUCHILDPRESENT *
gpuGetChildrenPresent_GA100(OBJGPU * pGpu,NvU32 * pNumEntries)223 gpuGetChildrenPresent_GA100(OBJGPU *pGpu, NvU32 *pNumEntries)
224 {
225 *pNumEntries = NV_ARRAY_ELEMENTS(gpuChildrenPresent_GA100);
226 return gpuChildrenPresent_GA100;
227 }
228
229 //
230 // List of GPU children that present for the chip. List entries contain$
231 // {CLASS-ID, # of instances} pairs, e.g.: {CE, 2} is 2 instance of OBJCE. This$
232 // list controls only engine presence. Order is defined by$
233 // gpuGetChildrenOrder_HAL.$
234 //
235 // IMPORTANT: This function is to be deleted. Engine removal should instead be$
236 // handled by <eng>ConstructEngine returning NV_ERR_NOT_SUPPORTED. PLEASE DO NOT$
237 // FORK THIS LIST!$
238 //
239 // List entries contain {CLASS-ID, # of instances} pairs.
240 //
// GA102 engine-presence table. Differs from GA100 only in instance counts:
// 5 KernelCE and 1 KernelGraphics instance here.
static const GPUCHILDPRESENT gpuChildrenPresent_GA102[] =
{
    GPU_CHILD_PRESENT(OBJTMR, 1),
    GPU_CHILD_PRESENT(KernelMIGManager, 1),
    GPU_CHILD_PRESENT(KernelGraphicsManager, 1),
    GPU_CHILD_PRESENT(KernelRc, 1),
    GPU_CHILD_PRESENT(Intr, 1),
    GPU_CHILD_PRESENT(NvDebugDump, 1),
    GPU_CHILD_PRESENT(OBJGPUMON, 1),
    GPU_CHILD_PRESENT(OBJSWENG, 1),
    GPU_CHILD_PRESENT(OBJUVM, 1),
    GPU_CHILD_PRESENT(KernelBif, 1),
    GPU_CHILD_PRESENT(KernelBus, 1),
    GPU_CHILD_PRESENT(KernelCE, 5),
    GPU_CHILD_PRESENT(KernelDisplay, 1),
    GPU_CHILD_PRESENT(VirtMemAllocator, 1),
    GPU_CHILD_PRESENT(KernelMemorySystem, 1),
    GPU_CHILD_PRESENT(MemoryManager, 1),
    GPU_CHILD_PRESENT(KernelFifo, 1),
    GPU_CHILD_PRESENT(KernelGmmu, 1),
    GPU_CHILD_PRESENT(KernelGraphics, 1),
    GPU_CHILD_PRESENT(KernelHwpm, 1),
    GPU_CHILD_PRESENT(KernelMc, 1),
    GPU_CHILD_PRESENT(SwIntr, 1),
    GPU_CHILD_PRESENT(KernelNvlink, 1),
    GPU_CHILD_PRESENT(KernelPerf, 1),
    GPU_CHILD_PRESENT(KernelPmu, 1),
    GPU_CHILD_PRESENT(KernelSec2, 1),
    GPU_CHILD_PRESENT(KernelGsp, 1),
    GPU_CHILD_PRESENT(ConfidentialCompute, 1),
};
272
273 const GPUCHILDPRESENT *
gpuGetChildrenPresent_GA102(OBJGPU * pGpu,NvU32 * pNumEntries)274 gpuGetChildrenPresent_GA102(OBJGPU *pGpu, NvU32 *pNumEntries)
275 {
276 *pNumEntries = NV_ARRAY_ELEMENTS(gpuChildrenPresent_GA102);
277 return gpuChildrenPresent_GA102;
278 }
279
280 /*! @brief Returns if a P2P object is allocated in SRIOV mode.
281 *
282 * @param[in] pGpu OBJGPU pointer
283 *
284 * @returns for baremetal, this should just return NV_TRUE
285 for SRIOV, return the SRIOV Info
286 */
287 NvBool
gpuCheckIsP2PAllocated_GA100(OBJGPU * pGpu)288 gpuCheckIsP2PAllocated_GA100
289 (
290 OBJGPU *pGpu
291 )
292 {
293 if (!IS_VIRTUAL(pGpu) && !gpuIsSriovEnabled(pGpu))
294 return NV_TRUE;
295
296 return pGpu->sriovState.bP2PAllocated;
297 }
298
299 /*!
300 * @brief Get index of specified errorCode in the Error Containment state table
301 *
302 * @param[in] pGpu OBJGPU pointer
303 * @param[in] errorCode Error Containment error code
304 * @param[in] pTableIndex Index of specified errorCode in the Error Containment state table
305 *
306 * @returns NV_STATUS
307 */
308 static
309 NV_STATUS
_gpuGetErrorContStateTableIndex_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NvU32 * pTableIndex)310 _gpuGetErrorContStateTableIndex_GA100
311 (
312 OBJGPU *pGpu,
313 NV_ERROR_CONT_ERR_ID errorCode,
314 NvU32 *pTableIndex
315 )
316 {
317 NvU32 index;
318 NvU32 tableSize = NV_ARRAY_ELEMENTS(g_errContStateTable);
319
320 NV_ASSERT_OR_RETURN(pTableIndex != NULL, NV_ERR_INVALID_ARGUMENT);
321
322 for (index = 0; index < tableSize; index++)
323 {
324 if (errorCode == g_errContStateTable[index].errorCode)
325 {
326 *pTableIndex = index;
327 return NV_OK;
328 }
329 }
330
331 return NV_ERR_INVALID_ARGUMENT;
332 }
333
334 /*!
335 * @brief Send NV2080_NOTIFIER*
336 *
337 * @param[in] pGpu OBJGPU pointer
338 * @param[in] errorCode Error Containment error code
339 * @param[in] loc Location, SubLocation information
340 * @param[in] nv2080Notifier NV2080_NOTIFIER*
341 *
342 * @returns NV_STATUS
343 */
344 static
345 NV_STATUS
_gpuNotifySubDeviceEventNotifier_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NV_ERROR_CONT_LOCATION loc,NvU32 nv2080Notifier)346 _gpuNotifySubDeviceEventNotifier_GA100
347 (
348 OBJGPU *pGpu,
349 NV_ERROR_CONT_ERR_ID errorCode,
350 NV_ERROR_CONT_LOCATION loc,
351 NvU32 nv2080Notifier
352 )
353 {
354 NvV16 info16 = 0;
355 NvV32 info32 = 0;
356 RM_ENGINE_TYPE localRmEngineType = 0;
357
358 // Return if no notifier needs to be sent for this errorCode.
359 if (nv2080Notifier == NO_NV2080_NOTIFIER)
360 {
361 return NV_OK;
362 }
363
364 switch (errorCode)
365 {
366 // Intentional fall-through
367 case NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED:
368 case NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE:
369 case NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON:
370 case NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON:
371 info16 = FB_MEMORY_ERROR;
372 break;
373
374 // Intentional fall-through
375 case NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG:
376 case NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON:
377 case NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG:
378 case NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG:
379 info16 = LTC_ERROR;
380 break;
381
382 case NV_ERROR_CONT_ERR_ID_E10_SM_POISON:
383 case NV_ERROR_CONT_ERR_ID_E16_GCC_POISON:
384 case NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON:
385 info16 = ROBUST_CHANNEL_GR_EXCEPTION;
386 break;
387
388 // Intentional fall-through
389 case NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL:
390 case NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL:
391 NV_ASSERT_OR_RETURN(loc.locType == NV_ERROR_CONT_LOCATION_TYPE_ENGINE, NV_ERR_INVALID_ARGUMENT);
392 //
393 // If SMC is enabled, RM need to notify partition local engineId. Convert
394 // global ID to partition local if client has filled proper engineIDs
395 //
396 localRmEngineType = loc.locInfo.engineLoc.rmEngineId;
397 if (IS_MIG_IN_USE(pGpu) &&
398 RM_ENGINE_TYPE_IS_VALID(loc.locInfo.engineLoc.rmEngineId))
399 {
400 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
401 MIG_INSTANCE_REF ref;
402
403 NV_ASSERT_OK_OR_RETURN(kmigmgrGetInstanceRefFromDevice(pGpu,
404 pKernelMIGManager,
405 loc.locInfo.engineLoc.pDevice,
406 &ref));
407
408 if (!kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, loc.locInfo.engineLoc.rmEngineId, ref))
409 {
410 // Notifier is requested for an unsupported engine
411 NV_PRINTF(LEVEL_ERROR,
412 "Notifier requested for an unsupported rm engine id (0x%x)\n",
413 loc.locInfo.engineLoc.rmEngineId);
414 return NV_ERR_INVALID_ARGUMENT;
415 }
416
417 // Override the engine type with the local engine idx
418 NV_ASSERT_OK_OR_RETURN(kmigmgrGetGlobalToLocalEngineType(pGpu,
419 pKernelMIGManager,
420 ref,
421 loc.locInfo.engineLoc.rmEngineId,
422 &localRmEngineType));
423 }
424
425 info16 = ROBUST_CHANNEL_CE_ERROR(NV2080_ENGINE_TYPE_COPY_IDX(localRmEngineType));
426 break;
427
428 case NV_ERROR_CONT_ERR_ID_E13_MMU_POISON:
429 info16 = ROBUST_CHANNEL_FIFO_ERROR_MMU_ERR_FLT;
430 break;
431 }
432
433 gpuNotifySubDeviceEvent(pGpu,
434 nv2080Notifier,
435 NULL,
436 0,
437 info32, // Unused
438 info16);
439
440 return NV_OK;
441 }
442
443 /*!
444 * @brief Generate error log for corresponding error containment error code.
445 *
446 * Format / example :
447 * 1) Contained Error with SMC Partition attribution (Error attributable to SMC Partition or process in SMC partition):
448 * 2) Contained Error with no SMC partitioning (Error attributable to process on GPU):
449 * 3) Uncontaned Error
450 *
451 * >> NVRM: Xid (PCI:0000:01:00 GPU-I:05): 94, pid=7194, Contained: CE User Channel (0x9). RST: No, D-RST: No
452 * >> NVRM: Xid (PCI:0000:01:00): 94, pid=7062, Contained: CE User Channel (0x9). RST: No, D-RST: No
453 * >> NVRM: Xid (PCI:0000:01:00): 95, pid=7062, Uncontained: LTC TAG (0x2,0x1). RST: Yes, D-RST: No
454 *
455 * @param[in] pGpu OBJGPU pointer
456 * @param[in] errorCode Error Containment error code
457 * @param[in] loc Location, SubLocation information
458 * @param[in] pErrorContSmcSetting Error containment SMC Disable / Enable settings
459 *
460 * @returns NV_STATUS
461 */
462 static
463 NV_STATUS
_gpuGenerateErrorLog_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NV_ERROR_CONT_LOCATION loc,NV_ERROR_CONT_SMC_DIS_EN_SETTING * pErrorContSmcSetting)464 _gpuGenerateErrorLog_GA100(OBJGPU *pGpu,
465 NV_ERROR_CONT_ERR_ID errorCode,
466 NV_ERROR_CONT_LOCATION loc,
467 NV_ERROR_CONT_SMC_DIS_EN_SETTING *pErrorContSmcSetting)
468 {
469 RM_ENGINE_TYPE localRmEngineType;
470 NvU32 rcErrorCode = pErrorContSmcSetting->rcErrorCode;
471
472 NV_ASSERT_OR_RETURN((pErrorContSmcSetting != NULL), NV_ERR_INVALID_ARGUMENT);
473
474 switch (loc.locType)
475 {
476 case NV_ERROR_CONT_LOCATION_TYPE_DRAM:
477 nvErrorLog_va((void *)pGpu,
478 rcErrorCode,
479 "%s: %s (0x%x,0x%x). physAddr: 0x%08llx RST: %s, D-RST: %s",
480 rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
481 ROBUST_CHANNEL_CONTAINED_ERROR_STR :
482 ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
483 ppErrContErrorIdStr[errorCode],
484 loc.locInfo.dramLoc.partition,
485 loc.locInfo.dramLoc.subPartition,
486 loc.locInfo.dramLoc.physicalAddress,
487 pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
488 pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
489 break;
490
491 case NV_ERROR_CONT_LOCATION_TYPE_LTC:
492 nvErrorLog_va((void *)pGpu,
493 rcErrorCode,
494 "%s: %s (0x%x,0x%x). RST: %s, D-RST: %s",
495 rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
496 ROBUST_CHANNEL_CONTAINED_ERROR_STR :
497 ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
498 ppErrContErrorIdStr[errorCode],
499 loc.locInfo.ltcLoc.partition,
500 loc.locInfo.ltcLoc.slice,
501 pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
502 pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
503 break;
504
505 case NV_ERROR_CONT_LOCATION_TYPE_ENGINE:
506 NV_ASSERT_OR_RETURN(loc.locType == NV_ERROR_CONT_LOCATION_TYPE_ENGINE, NV_ERR_INVALID_ARGUMENT);
507 //
508 // If SMC is enabled, RM need to notify partition local engineId. Convert
509 // global ID to partition local if client has filled proper engineIDs
510 //
511 localRmEngineType = loc.locInfo.engineLoc.rmEngineId;
512 if (IS_MIG_IN_USE(pGpu) &&
513 RM_ENGINE_TYPE_IS_VALID(loc.locInfo.engineLoc.rmEngineId))
514 {
515 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
516 MIG_INSTANCE_REF ref;
517 NV_ASSERT_OK_OR_RETURN(kmigmgrGetMIGReferenceFromEngineType(pGpu,
518 pKernelMIGManager,
519 loc.locInfo.engineLoc.rmEngineId,
520 &ref));
521 // Override the engine type with the local engine idx
522 NV_ASSERT_OK_OR_RETURN(kmigmgrGetGlobalToLocalEngineType(pGpu,
523 pKernelMIGManager,
524 ref,
525 loc.locInfo.engineLoc.rmEngineId,
526 &localRmEngineType));
527 }
528
529 nvErrorLog_va((void *)pGpu,
530 rcErrorCode,
531 "%s: %s (0x%x). RST: %s, D-RST: %s",
532 rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
533 ROBUST_CHANNEL_CONTAINED_ERROR_STR :
534 ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
535 ppErrContErrorIdStr[errorCode],
536 gpuGetNv2080EngineType(localRmEngineType),
537 pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
538 pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
539 break;
540
541 case NV_ERROR_CONT_LOCATION_TYPE_NONE:
542 nvErrorLog_va((void *)pGpu,
543 rcErrorCode,
544 "%s: %s. RST: %s, D-RST: %s",
545 rcErrorCode == ROBUST_CHANNEL_CONTAINED_ERROR ?
546 ROBUST_CHANNEL_CONTAINED_ERROR_STR :
547 ROBUST_CHANNEL_UNCONTAINED_ERROR_STR,
548 ppErrContErrorIdStr[errorCode],
549 pErrorContSmcSetting->bGpuResetReqd ? "Yes" : "No",
550 pErrorContSmcSetting->bGpuDrainAndResetReqd ? "Yes" : "No");
551 break;
552 }
553
554 return NV_OK;
555 }
556
557 /*!
558 * @brief Determine Error Containment RC code, print Xid, send NV2080_NOTIFIER*,
559 * mark device for reset or mark device for drain and reset as indicated in
560 * error containment state table (refer gpu/error_cont.h).
561 *
562 * @param[in] pGpu OBJGPU pointer
563 * @param[in] errorCode Error Containment error code
564 * @param[in] loc Location, SubLocation information
565 * @param[out] pRcErrorCode RC Error code
566 *
567 * @returns NV_STATUS
568 */
569 NV_STATUS
gpuUpdateErrorContainmentState_GA100(OBJGPU * pGpu,NV_ERROR_CONT_ERR_ID errorCode,NV_ERROR_CONT_LOCATION loc,NvU32 * pRcErrorCode)570 gpuUpdateErrorContainmentState_GA100
571 (
572 OBJGPU *pGpu,
573 NV_ERROR_CONT_ERR_ID errorCode,
574 NV_ERROR_CONT_LOCATION loc,
575 NvU32 *pRcErrorCode
576 )
577 {
578 NvU32 tableIndex = 0;
579 NvBool bIsSmcEnabled = NV_FALSE;
580 NvU32 smcDisEnSettingIndex = 0;
581 NvU32 rcErrorCode = 0;
582 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
583 NV_ERROR_CONT_SMC_DIS_EN_SETTING *pErrorContSmcSetting = NULL;
584
585 if (!gpuIsGlobalPoisonFuseEnabled(pGpu))
586 {
587 return NV_ERR_NOT_SUPPORTED;
588 }
589
590 NV_ASSERT_OK_OR_RETURN(_gpuGetErrorContStateTableIndex_GA100(pGpu, errorCode, &tableIndex));
591
592 // Check if MIG GPU partitioning is enabled
593 if (IS_MIG_IN_USE(pGpu))
594 {
595 bIsSmcEnabled = NV_TRUE;
596 }
597
598 // MIG Memory partitioning config entry index.
599 if (pKernelMIGManager != NULL && kmigmgrIsMIGMemPartitioningEnabled(pGpu, pKernelMIGManager))
600 {
601 smcDisEnSettingIndex = 1;
602 }
603
604 pErrorContSmcSetting = &(g_errContStateTable[tableIndex].smcDisEnSetting[smcDisEnSettingIndex]);
605
606 rcErrorCode = pErrorContSmcSetting->rcErrorCode;
607
608 // Pass RC Error code if user requested it.
609 if (pRcErrorCode != NULL)
610 {
611 *pRcErrorCode = rcErrorCode;
612 }
613
614 // Update partition attribution for this exception only if SMC is enabled.
615 if (pErrorContSmcSetting->bPrintSmcPartitionInfo && bIsSmcEnabled)
616 {
617 // Fall through on error.
618 gpuSetPartitionErrorAttribution_HAL(pGpu,
619 errorCode,
620 loc,
621 rcErrorCode);
622 }
623
624 // Print Xid only if Ampere Error Containment XIDs printing is enabled and rcErrorCode is valid
625 if (gpuIsAmpereErrorContainmentXidEnabled(pGpu) && rcErrorCode != NO_XID)
626 {
627 NV_ASSERT_OK_OR_RETURN(_gpuGenerateErrorLog_GA100(pGpu,
628 errorCode,
629 loc,
630 pErrorContSmcSetting));
631 }
632
633 // Send NV2080_NOTIFIER*
634 if (pErrorContSmcSetting->nv2080Notifier != NO_NV2080_NOTIFIER)
635 {
636 NV_ASSERT_OK(_gpuNotifySubDeviceEventNotifier_GA100(pGpu,
637 errorCode,
638 loc,
639 pErrorContSmcSetting->nv2080Notifier));
640 }
641
642 // Set the scratch bit to indicate the GPU needs to be reset.
643 if ((pErrorContSmcSetting->bGpuResetReqd) &&
644 (gpuMarkDeviceForReset(pGpu) != NV_OK))
645 {
646 NV_PRINTF(LEVEL_ERROR, "Failed to mark GPU for pending reset");
647 }
648
649 // Set the scratch bit to indicate the GPU needs to be reset.
650 if (pErrorContSmcSetting->bGpuDrainAndResetReqd &&
651 gpuMarkDeviceForDrainAndReset(pGpu) != NV_OK)
652 {
653 NV_PRINTF(LEVEL_ERROR, "Failed to mark GPU for pending drain and reset");
654 }
655
656 return NV_OK;
657 }
658
659 NvBool
gpuCheckIfFbhubPoisonIntrPending_GA100(OBJGPU * pGpu)660 gpuCheckIfFbhubPoisonIntrPending_GA100
661 (
662 OBJGPU *pGpu
663 )
664 {
665 return intrIsVectorPending_HAL(pGpu, GPU_GET_INTR(pGpu), NV_PFB_FBHUB_POISON_INTR_VECTOR_HW_INIT, NULL);
666 }
667