/*
 * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#define NVOC_KERN_GMMU_H_PRIVATE_ACCESS_ALLOWED

#include "gpu/mmu/kern_gmmu.h"
#include "gpu/gpu.h"
#include "gpu/bus/kern_bus.h"
#include "gpu/intr/intr.h"
#include "gpu/device/device.h"
#include "gpu/subdevice/subdevice.h"
#include "gpu/uvm/uvm.h"
#include "vgpu/vgpu_events.h"
#include "vgpu/rpc.h"
#include "kernel/gpu/rc/kernel_rc.h"
#include "nverror.h"

#include "published/volta/gv100/dev_fb.h"
#include "published/volta/gv100/dev_ram.h"
#include "published/volta/gv100/dev_fault.h"

// Static function definitions
static NV_STATUS _kgmmuCreateExceptionDataFromPriv_GV100(OBJGPU *pGpu, KernelGmmu *pKernelGmmu,
                                                         MMU_FAULT_BUFFER_ENTRY *pParsedFaultEntry,
                                                         FIFO_MMU_EXCEPTION_DATA *pMmuExceptionData);
static NvU32 _kgmmuResetFaultBufferError_GV100(NvU32 faultBufType);
static NV_STATUS _kgmmuServiceBar2Faults_GV100(OBJGPU *pGpu, KernelGmmu *pKernelGmmu, NvU32 faultStatus, NvU32 mmuFaultClientId);
static NV_STATUS _kgmmuHandleReplayablePrivFault_GV100(OBJGPU *pGpu, KernelGmmu *pKernelGmmu,
                                                       MMU_FAULT_BUFFER_ENTRY *pParsedFaultEntry);

/**
 * @brief Initialize the supported GMMU HW format structures.
 * @details GV100+ supports the ATS NV4K 64K PTE encoding.
 *
 * @param pGpu        The GPU
 * @param pKernelGmmu The KernelGmmu
 */
NV_STATUS
kgmmuFmtFamiliesInit_GV100(OBJGPU *pGpu, KernelGmmu *pKernelGmmu)
{
    extern NV_STATUS kgmmuFmtFamiliesInit_GM200(OBJGPU *pGpu, KernelGmmu *pKernelGmmu);
    NvU32 v;
    NV_STATUS result;
    GMMU_FMT_FAMILY *pFam;

    // setup nv4kPte encoding: v - 0, vol - 1, priv - 1
    for (v = 0; v < GMMU_FMT_MAX_VERSION_COUNT; ++v)
    {
        pFam = pKernelGmmu->pFmtFamilies[v];
        if (NULL != pFam)
        {
            nvFieldSetBool(&pFam->pte.fldValid, NV_FALSE, pFam->nv4kPte.v8);
            nvFieldSetBool(&pFam->pte.fldVolatile, NV_TRUE, pFam->nv4kPte.v8);
            nvFieldSetBool(&pFam->pte.fldPrivilege, NV_TRUE, pFam->nv4kPte.v8);
        }
    }

    // inherit former FmtFamilies setup procedure
    result = kgmmuFmtFamiliesInit_GM200(pGpu, pKernelGmmu);

    return result;
}

NV_STATUS
kgmmuChangeReplayableFaultOwnership_GV100(OBJGPU *pGpu, KernelGmmu *pKernelGmmu, NvBool bOwnedByRm)
{
    //
    // Disable the interrupt when RM loses the ownership and enable it back when
    // RM regains it. At least nvUvmInterfaceOwnPageFaultIntr() relies on that behavior.
    //
    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu))
        return NV_OK;

    if (bOwnedByRm)
        pKernelGmmu->uvmSharedIntrRmOwnsMask |= RM_UVM_SHARED_INTR_MASK_MMU_REPLAYABLE_FAULT_NOTIFY;
    else
        pKernelGmmu->uvmSharedIntrRmOwnsMask &= ~RM_UVM_SHARED_INTR_MASK_MMU_REPLAYABLE_FAULT_NOTIFY;

    //
    // Notably don't set the PDB_PROP_GPU_IGNORE_REPLAYABLE_FAULTS property as
    // on Volta that would mean masking out all MMU faults from pending
    // interrupts.
    //

    return NV_OK;
}

/*!
 * @brief Creates the shadow fault buffer for client handling of replayable/non-replayable faults
 *
 * @param[in] pGpu
 * @param[in] pKernelGmmu
 *
 * @returns
 */
NV_STATUS
kgmmuClientShadowFaultBufferAlloc_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    FAULT_BUFFER_TYPE index
)
{
    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu))
        return NV_OK;

    if (pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED))
    {
        NV_PRINTF(LEVEL_ERROR, "Fault-Buffer is disabled. ShadowBuffer cannot be created\n");
        NV_ASSERT_OR_RETURN(0, NV_ERR_INVALID_STATE);
    }

    return kgmmuClientShadowFaultBufferAllocate(pGpu, pKernelGmmu, index);
}

/*!
 * @brief Frees the shadow fault buffer for client handling of non-replayable faults
 *
 * @param[in] pGpu
 * @param[in] pKernelGmmu
 *
 * @returns
 */
NV_STATUS
kgmmuClientShadowFaultBufferFree_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    FAULT_BUFFER_TYPE index
)
{
    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu))
        return NV_OK;

    return kgmmuClientShadowFaultBufferDestroy(pGpu, pKernelGmmu, index);
}

/*!
 * @brief Writes the ATS properties to the instance block
 *
 * @param[in] pKernelGmmu
 * @param[in] pVAS          OBJVASPACE pointer
 * @param[in] subctxId      subctxId value
 * @param[in] pInstBlkDesc  Memory descriptor for the instance block of the engine
 *
 * @returns NV_STATUS
 */
NV_STATUS
kgmmuInstBlkAtsGet_GV100
(
    KernelGmmu *pKernelGmmu,
    OBJVASPACE *pVAS,
    NvU32 subctxId,
    NvU32 *pOffset,
    NvU32 *pData
)
{
    NvU32 pasid = 0;

    if (subctxId == FIFO_PDB_IDX_BASE)
    {
        // A channel is setting the base PDB, which requires a valid VAS. Otherwise, it should fail.
        if (pVAS != NULL)
        {
            // ct_assert is evaluated at compile time, so it must sit at the top of the scope; otherwise the build fails.
            ct_assert(SF_WIDTH(NV_RAMIN_PASID) <= 32);

            //
            // The PASID value is provided by the OS and out of client control,
            // but if the PASID value is invalid the ATS feature will not function
            // as expected, so sanity-check it and fail early.
            //
            NV_ASSERT_OR_RETURN(NV_OK == vaspaceGetPasid(pVAS, &pasid),
                                NV_ERR_INVALID_DATA);
            if (pasid > MASK_BITS(SF_WIDTH(NV_RAMIN_PASID)))
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Invalid PASID %d (max width %d bits)\n", pasid,
                          SF_WIDTH(NV_RAMIN_PASID));
                return NV_ERR_OPERATING_SYSTEM;
            }

            *pOffset = SF_OFFSET(NV_RAMIN_ENABLE_ATS);
            *pData   = SF_NUM(_RAMIN, _ENABLE_ATS, vaspaceIsAtsEnabled(pVAS)) |
                       SF_NUM(_RAMIN, _PASID, pasid);
        }
        else
        {
            // We cannot set the base PDB without a pVAS!
            NV_ASSERT_OR_RETURN(pVAS != NULL, NV_ERR_INVALID_STATE);
        }
    }
    else
    {
        // In a subcontext-supported PDB, we set valid values with a non-NULL VAS. Otherwise, the PDB entry is invalid.
        if (pVAS != NULL)
        {
            ct_assert(SF_WIDTH(NV_RAMIN_SC_PASID(0)) <= 32);

            //
            // set ATS for legacy PDB if SubctxId is set to be FIFO_PDB_IDX_BASE
            // Otherwise, set PDB with given SubctxId.
            //
            NV_ASSERT_OR_RETURN(NV_OK == vaspaceGetPasid(pVAS, &pasid),
                                NV_ERR_INVALID_DATA);

            if (pasid > MASK_BITS(SF_WIDTH(NV_RAMIN_SC_PASID(subctxId))))
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Invalid PASID %d (max width %d bits)\n", pasid,
                          SF_WIDTH(NV_RAMIN_SC_PASID(subctxId)));
                return NV_ERR_OPERATING_SYSTEM;
            }

            *pData = SF_IDX_NUM(_RAMIN_SC, _ENABLE_ATS, vaspaceIsAtsEnabled(pVAS), subctxId) |
                     SF_IDX_NUM(_RAMIN_SC, _PASID, pasid, subctxId);
        }
        else
        {
            //
            // If pVAS is NULL, that means the PDB of this SubctxId is set to Invalid.
            // In this case, ATS should be Disabled.
            //
            *pData = NV_RAMIN_ENABLE_ATS_FALSE;
        }

        *pOffset = SF_OFFSET(NV_RAMIN_SC_ENABLE_ATS(subctxId));
    }

    return NV_OK;
}

/*!
 * @brief This gets the offset and data for vaLimit
 *
 * @param[in]  pKernelGmmu
 * @param[in]  pVAS      OBJVASPACE pointer
 * @param[in]  subctxId  subctxId value
 * @param[in]  pParams   Pointer to the structure containing parameters passed by the engine
 * @param[out] pOffset   Pointer to offset of NV_RAMIN_ADR_LIMIT_LO:NV_RAMIN_ADR_LIMIT_HI pair
 * @param[out] pData     Pointer to value to write
 *
 * @returns NV_STATUS
 */
NV_STATUS
kgmmuInstBlkVaLimitGet_GV100
(
    KernelGmmu *pKernelGmmu,
    OBJVASPACE *pVAS,
    NvU32 subctxId,
    INST_BLK_INIT_PARAMS *pParams,
    NvU32 *pOffset,
    NvU64 *pData
)
{
    extern NV_STATUS kgmmuInstBlkVaLimitGet_GP100(KernelGmmu *pKernelGmmu, OBJVASPACE *pVAS, NvU32 subctxId, INST_BLK_INIT_PARAMS *pParams, NvU32 *pOffset, NvU64 *pData);

    if (subctxId == FIFO_PDB_IDX_BASE)
    {
        return kgmmuInstBlkVaLimitGet_GP100(pKernelGmmu, pVAS, subctxId, pParams,
                                            pOffset, pData);
    }

    *pOffset = 0;
    *pData   = 0;

    return NV_OK;
}

/*!
 * @brief This gets the offsets and data for the PDB limit
 *
 * @param[in]  pGpu
 * @param[in]  pKernelGmmu
 * @param[in]  pVAS       OBJVASPACE pointer
 * @param[in]  pParams    Pointer to the structure containing parameters passed by the engine
 * @param[in]  subctxId   subctxId value
 * @param[out] pOffsetLo  Pointer to low offset
 * @param[out] pDataLo    Pointer to data written at above offset
 * @param[out] pOffsetHi  Pointer to high offset
 * @param[out] pDataHi    Pointer to data written at above offset
 *
 * @returns
 */
NV_STATUS
kgmmuInstBlkPageDirBaseGet_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    OBJVASPACE *pVAS,
    INST_BLK_INIT_PARAMS *pParams,
    NvU32 subctxId,
    NvU32 *pOffsetLo,
    NvU32 *pDataLo,
    NvU32 *pOffsetHi,
    NvU32 *pDataHi
)
{
    extern NV_STATUS kgmmuInstBlkPageDirBaseGet_GP100(OBJGPU *pGpu, KernelGmmu *pKernelGmmu, OBJVASPACE *pVAS, INST_BLK_INIT_PARAMS *pParams, NvU32 subctxid, NvU32 *pOffsetLo, NvU32 *pDataLo, NvU32 *pOffsetHi, NvU32 *pDataHi);

    if (subctxId == FIFO_PDB_IDX_BASE)
    {
        return kgmmuInstBlkPageDirBaseGet_GP100(pGpu, pKernelGmmu, pVAS,
                                                pParams, subctxId, pOffsetLo, pDataLo, pOffsetHi, pDataHi);
    }
    else
    {
        KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
        MEMORY_DESCRIPTOR *pPDB = NULL;

        if (pParams->bIsZombieSubctx)
        {
            pPDB = kfifoGetDummyPageMemDesc(pKernelFifo);

            NV_ASSERT_OR_RETURN((pPDB != NULL), NV_ERR_INVALID_STATE);
        }
        else if (pVAS != NULL)
        {
            pPDB = (pParams->bIsClientAdmin) ?
                       vaspaceGetKernelPageDirBase(pVAS, pGpu) :
                       vaspaceGetPageDirBase(pVAS, pGpu);
        }

        if (pPDB == NULL)
        {
            //
            // The teardown model for subcontext with UVM + CUDA is as follows:
            //
            // Step 1: Unregister(vas) --> UnsetPageDirectory(vas)
            // Step 2: FreeSubcontext(vas)
            //
            // But a new subcontext can be added between step 1 & step 2.
            // Currently RM doesn't support the notion of a subcontext with a NULL PDB.
            // This results in RM failing subsequent subcontext allocation, causing the UNBOUND instance block failure in bug 1823795.
            // To fix this, we will allow a subcontext to exist with an invalid PDB until it is freed later.
            // This shouldn't cause any functional issue, as no memory accesses should happen from this subcontext.

            *pDataLo = NV_RAMIN_SC_PAGE_DIR_BASE_TARGET_INVALID;
            *pDataHi = NV_RAMIN_SC_PAGE_DIR_BASE_TARGET_INVALID;
        }
        else
        {
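            //
            // The LO field of the PAGE_DIR_BASE pair holds the PDB address bits
            // above NV_RAMIN_BASE_SHIFT; the HI field holds the upper 32 bits.
            //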
            RmPhysAddr physAdd = memdescGetPhysAddr(pPDB, AT_GPU, 0);
            NvU32 aperture = kgmmuGetHwPteApertureFromMemdesc(pKernelGmmu, pPDB);
            NvU32 addrLo = NvU64_LO32(physAdd >> NV_RAMIN_BASE_SHIFT);

            //
            // Volta only supports the new page table format and 64KB big page size, so
            // force _USE_VER2_PT_FORMAT to _TRUE and _BIG_PAGE_SIZE to 64KB.
            //
            *pDataLo =
                SF_IDX_NUM(_RAMIN_SC_PAGE_DIR_BASE, _TARGET, aperture, subctxId) |
                ((pParams->bIsFaultReplayable) ?
                    SF_IDX_DEF(_RAMIN_SC_PAGE_DIR_BASE, _FAULT_REPLAY_TEX, _ENABLED, subctxId) |
                    SF_IDX_DEF(_RAMIN_SC_PAGE_DIR_BASE, _FAULT_REPLAY_GCC, _ENABLED, subctxId) :
                    SF_IDX_DEF(_RAMIN_SC_PAGE_DIR_BASE, _FAULT_REPLAY_TEX, _DISABLED, subctxId) |
                    SF_IDX_DEF(_RAMIN_SC_PAGE_DIR_BASE, _FAULT_REPLAY_GCC, _DISABLED, subctxId)) |
                SF_IDX_DEF(_RAMIN_SC, _USE_VER2_PT_FORMAT, _TRUE, subctxId) |
                SF_IDX_DEF(_RAMIN_SC, _BIG_PAGE_SIZE, _64KB, subctxId) |
                SF_IDX_NUM(_RAMIN_SC_PAGE_DIR_BASE, _VOL, memdescGetVolatility(pPDB), subctxId) |
                SF_IDX_NUM(_RAMIN_SC_PAGE_DIR_BASE, _LO, addrLo, subctxId);

            *pDataHi = SF_IDX_NUM(_RAMIN_SC_PAGE_DIR_BASE, _HI, NvU64_HI32(physAdd), subctxId);
        }

        *pOffsetLo = SF_OFFSET(NV_RAMIN_SC_PAGE_DIR_BASE_LO(subctxId));
        *pOffsetHi = SF_OFFSET(NV_RAMIN_SC_PAGE_DIR_BASE_HI(subctxId));
    }

    return NV_OK;
}

/**
 * @brief Report MMU fault buffer overflow errors. MMU fault
 *        buffer overflow is a fatal error. Raise an assert and
 *        any client notifications if registered, to ensure
 *        overflow is debugged properly.
 *
 * @param[in] pGpu
 * @param[in] pKernelGmmu
 *
 * @returns
 */
NV_STATUS
kgmmuReportFaultBufferOverflow_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu
)
{
    NV_STATUS rmStatus = NV_OK;
    NvU32 faultStatus = kgmmuReadMmuFaultStatus_HAL(pGpu, pKernelGmmu, GPU_GFID_PF);
    NvU32 faultBufferGet;
    NvU32 faultBufferPut;
    PEVENTNOTIFICATION *ppEventNotification = NULL;
    NvU32 faultBufferSize;

    kgmmuReadFaultBufferGetPtr_HAL(pGpu, pKernelGmmu, NON_REPLAYABLE_FAULT_BUFFER,
                                   &faultBufferGet, NULL);
    faultBufferGet = DRF_VAL(_PFB_PRI, _MMU_FAULT_BUFFER_GET, _PTR, faultBufferGet);

    kgmmuReadFaultBufferPutPtr_HAL(pGpu, pKernelGmmu, NON_REPLAYABLE_FAULT_BUFFER,
                                   &faultBufferPut, NULL);
    faultBufferPut = DRF_VAL(_PFB_PRI, _MMU_FAULT_BUFFER_PUT, _PTR, faultBufferPut);

    faultBufferSize = kgmmuReadMmuFaultBufferSize_HAL(pGpu, pKernelGmmu, NON_REPLAYABLE_FAULT_BUFFER, GPU_GFID_PF);

    if (kgmmuIsNonReplayableFaultPending_HAL(pGpu, pKernelGmmu, NULL))
    {
        if (IsVOLTA(pGpu))
        {
            //
            // Check if the non-replayable interrupt is set when overflow is seen.
            // This shouldn't happen, as it can cause a live-lock: the top half
            // will keep coming and will not let the overflow interrupt be
            // serviced. HW should disable FAULT_INTR when overflow is
            // detected.
            //
            NV_PRINTF(LEVEL_ERROR, "MMU Fault: GPU %d: HW-BUG : "
                      "NON_REPLAYABLE_INTR is high when OVERFLOW is detected\n",
                      pGpu->gpuInstance);
            NV_ASSERT(0);
        }
        else
        {
            //
            // With message-based MMU interrupts (Turing onwards), it is
            // possible for us to get here - a real fault can happen while an
            // overflow happens, and there is no ordering guarantee about the
            // order of these interrupts in HW. However, if we write the GET pointer
            // with GET != PUT while overflow is detected, the fault interrupt
            // will not be sent. Instead, the overflow interrupt will be sent,
            // so this will not cause an interrupt storm with message-based
            // interrupts. If HW does have a bug though, we'll see the below
            // print repeatedly, which can point to a HW bug where it isn't
            // behaving the way it is designed to.
            //
            NV_PRINTF(LEVEL_INFO, "MMU Fault: GPU %d: NON_REPLAYABLE_INTR "
                      "is high when OVERFLOW is detected\n", pGpu->gpuInstance);
        }
    }

    // Check if overflow is due to incorrect fault buffer size or GET > SIZE
    if (FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _NON_REPLAYABLE_GETPTR_CORRUPTED, _SET, faultStatus) ||
        FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _REPLAYABLE_GETPTR_CORRUPTED, _SET, faultStatus))
    {
        NV_PRINTF(LEVEL_ERROR,
                  "MMU Fault: GPU %d: Buffer overflow detected due to GET > SIZE\n",
                  pGpu->gpuInstance);
    }
    else
    {
        NV_PRINTF(LEVEL_ERROR,
                  "MMU Fault: GPU %d: Buffer overflow detected due to incorrect SIZE\n",
                  pGpu->gpuInstance);

        NV_PRINTF(LEVEL_ERROR,
                  "MMU Fault: GPU %d: Buffer SIZE is expected to handle max faults "
                  "possible in system\n", pGpu->gpuInstance);
    }

    NV_PRINTF(LEVEL_ERROR,
              "MMU Fault: GPU %d: STATUS - 0x%x GET - 0x%x, PUT - 0x%x SIZE - 0x%x\n",
              pGpu->gpuInstance, faultStatus, faultBufferGet, faultBufferPut,
              faultBufferSize);

    // Raise an event for Mods if registered, as Mods checks for overflow
    if ((NV_OK == CliGetEventNotificationList(pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].hFaultBufferClient,
                                              pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].hFaultBufferObject, NULL, &ppEventNotification)) && ppEventNotification)
    {
        MODS_ARCH_ERROR_PRINTF("MMU Fault Buffer overflow detected\n");
        rmStatus = notifyEvents(pGpu, *ppEventNotification, NVC369_NOTIFIER_MMU_FAULT_ERROR,
                                0, 0, NV_OK, NV_OS_WRITE_THEN_AWAKEN);
        //
        // Mods will check the error and clear the error status. As Mods uses async
        // events, clearing the error in RM will cause a race with Mods.
        //
        if (RMCFG_FEATURE_MODS_FEATURES)
            return rmStatus;
    }

    krcBreakpoint(GPU_GET_KERNEL_RC(pGpu));

    faultStatus = kgmmuReadMmuFaultStatus_HAL(pGpu, pKernelGmmu, GPU_GFID_PF);
    faultStatus = FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _NON_REPLAYABLE_OVERFLOW, _RESET,
                              faultStatus);
    kgmmuWriteMmuFaultStatus_HAL(pGpu, pKernelGmmu, faultStatus);

    gpuMarkDeviceForReset(pGpu);
    return rmStatus;
}

/*!
 * @brief Get the engine ID associated with the Graphics Engine
 */
NvU32
kgmmuGetGraphicsEngineId_GV100
(
    KernelGmmu *pKernelGmmu
)
{
    return NV_PFAULT_MMU_ENG_ID_GRAPHICS;
}

/*!
 * @brief Reinit GMMU Peer PTE format to handle 47-bit peer addressing.
 *        This is controlled by NVSWITCH discovery and will not be enabled
 *        outside of specialized compute configurations.
 *
 * @param[in] pKernelGmmu The valid gmmu
 */
NV_STATUS
kgmmuEnableNvlinkComputePeerAddressing_GV100(KernelGmmu *pKernelGmmu)
{
    NvU32 v;

    //
    // Recalculate the format structures
    //
    for (v = 0; v < GMMU_FMT_MAX_VERSION_COUNT; ++v)
    {
        if (!kgmmuFmtIsVersionSupported_HAL(pKernelGmmu, g_gmmuFmtVersions[v]))
            continue;

        kgmmuFmtInitPeerPteFld_HAL(pKernelGmmu, &pKernelGmmu->pFmtFamilies[v]->pte,
                                   g_gmmuFmtVersions[v]);
    }

    return NV_OK;
}

/**
 * @brief Print information about a MMU fault
 *
 * @param[in] pGpu               OBJGPU pointer
 * @param[in] pKernelGmmu        KernelGmmu pointer
 * @param[in] mmuFaultEngineId   Engine ID of the faulted engine
 * @param[in] pMmuExceptionData  FIFO exception packet used
 *                               for printing fault info
 * @returns
 */
void
kgmmuPrintFaultInfo_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU32 mmuFaultEngineId,
    FIFO_MMU_EXCEPTION_DATA *pMmuExceptionData
)
{
    KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);

    NV_PRINTF(LEVEL_ERROR, "MMU Fault: inst:0x%x dev:0x%x subdev:0x%x\n",
              gpuGetInstance(pGpu), gpuGetDeviceInstance(pGpu),
              pGpu->subdeviceInstance);

    NV_PRINTF(LEVEL_ERROR,
              "MMU Fault: ENGINE 0x%x (%s %s) faulted @ 0x%x_%08x. Fault is of type 0x%x (%s). Access type is 0x%x (%s)",
              mmuFaultEngineId,
              kfifoPrintInternalEngine_HAL(pGpu, pKernelFifo, mmuFaultEngineId),
              kfifoGetClientIdString_HAL(pGpu, pKernelFifo, pMmuExceptionData),
              pMmuExceptionData->addrHi,
              pMmuExceptionData->addrLo,
              pMmuExceptionData->faultType,
              kgmmuGetFaultTypeString_HAL(GPU_GET_KERNEL_GMMU(pGpu),
                                          pMmuExceptionData->faultType),
              pMmuExceptionData->accessType,
              kfifoGetFaultAccessTypeString_HAL(pGpu, pKernelFifo,
                                                pMmuExceptionData->accessType));

    MODS_ARCH_ERROR_PRINTF("MMU Fault : ENGINE_%s %s %s %s",
                           kfifoPrintInternalEngine_HAL(pGpu, pKernelFifo, mmuFaultEngineId),
                           kgmmuGetFaultTypeString_HAL(GPU_GET_KERNEL_GMMU(pGpu),
                                                       pMmuExceptionData->faultType),
                           kfifoGetClientIdString_HAL(pGpu, pKernelFifo, pMmuExceptionData),
                           kfifoGetFaultAccessTypeString_HAL(pGpu, pKernelFifo,
                                                             pMmuExceptionData->accessType));

    {
        NvU32 engTag;
        NV_STATUS status = kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo, ENGINE_INFO_TYPE_MMU_FAULT_ID,
                                                    mmuFaultEngineId, ENGINE_INFO_TYPE_ENG_DESC, &engTag);

        if ((status == NV_OK) && (IS_GR(engTag)))
        {
            NvU32 baseFaultId;

            if (pMmuExceptionData->bGpc)
            {
                NV_PRINTF_EX(NV_PRINTF_MODULE, LEVEL_ERROR, " on GPC %d", pMmuExceptionData->gpcId);
            }

            status = kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo, ENGINE_INFO_TYPE_ENG_DESC,
                                              engTag, ENGINE_INFO_TYPE_MMU_FAULT_ID, &baseFaultId);
            if (status == NV_OK)
            {
                NvU32 subctxId = (mmuFaultEngineId - baseFaultId);

                NV_PRINTF_EX(NV_PRINTF_MODULE, LEVEL_ERROR, " on VEID %d", subctxId);
            }
            else
            {
                DBG_BREAKPOINT();
            }
        }
    }

    NV_PRINTF_EX(NV_PRINTF_MODULE, LEVEL_ERROR, "\n");
    MODS_ARCH_ERROR_PRINTF("\n");
}

static NvBool
_kgmmuFaultBufferHasMapping(struct HW_FAULT_BUFFER *pFaultBuffer)
{
    return (pFaultBuffer->kernelVaddr || pFaultBuffer->pBufferPages) ? NV_TRUE : NV_FALSE;
}

static void
_kgmmuFaultEntryRmServiceable_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    KernelFifo *pKernelFifo,
    struct HW_FAULT_BUFFER *pFaultBuffer,
    NvU32 entryIndex,
    NvU32 hwBufferPut,
    NvBool bClientBufEnabled,
    NvBool *bRmServiceable,
    NvBool *bFaultValid,
    NvBool bPollForValidBit
)
{
    NvU32 faultEntry;
    NvBool bUvmHandledNonFatal, bEngineCE, bPbdmaFault;
    NvU32 engineId;
    NvU32 mwValid, mwUvmHandledNonFatal, mwEngineId;
    NvBool bUvmHandledReplayable;

    *bRmServiceable = NV_FALSE;

    // Get the MW which contains Valid, Replayable and Client Type
    mwValid = DRF_WORD_MW(NVC369_BUF_ENTRY_VALID);
    mwUvmHandledNonFatal = DRF_WORD_MW(NVC369_BUF_ENTRY_REPLAYABLE_FAULT_EN);
    mwEngineId = DRF_WORD_MW(NVC369_BUF_ENTRY_ENGINE_ID);

    //
    // Currently they are all in the same DWORD, so we will read only one DWORD.
    // Add assert for future HW changes
    //
    NV_ASSERT(mwValid == mwUvmHandledNonFatal);

    //
    // The caller specified that all packets between HW GET and HW PUT
    // need to be copied to the shadow buffer. RM will not optimize
    // the copy by skipping packets that are not marked valid and
    // rely on HW triggering a reentry to the top-half to service
    // the remaining faults.
    //
    if (bPollForValidBit == NV_TRUE)
    {
        RMTIMEOUT timeout;
        NV_STATUS status;

        //
        // For the replayable buffer we read HW PUT.
        // If the GET value equals PUT, we know there are no more faults to process.
        //
        if (entryIndex == hwBufferPut)
        {
            *bFaultValid = NV_FALSE;
            return;
        }

        gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

        do
        {
            faultEntry = MEM_RD32(((NvU32 *)kgmmuFaultBufferGetFault_HAL(pGpu, pKernelGmmu, pFaultBuffer, entryIndex) + mwValid));
            *bFaultValid = !!(NVBIT32(DRF_EXTENT_MW(NVC369_BUF_ENTRY_VALID)) & faultEntry);

            status = gpuCheckTimeout(pGpu, &timeout);
            if (status != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "Timed out while waiting for valid bit.\n");
                gpuMarkDeviceForReset(pGpu);
                break;
            }
        } while (*bFaultValid != NV_TRUE);
    }
    else
    {
        faultEntry = MEM_RD32(((NvU32 *)kgmmuFaultBufferGetFault_HAL(pGpu, pKernelGmmu, pFaultBuffer, entryIndex) + mwValid));
        *bFaultValid = !!(NVBIT32(DRF_EXTENT_MW(NVC369_BUF_ENTRY_VALID)) & faultEntry);
    }

    bUvmHandledNonFatal = !!(NVBIT32(DRF_EXTENT_MW(NVC369_BUF_ENTRY_REPLAYABLE_FAULT_EN)) & faultEntry);
    bUvmHandledReplayable = !!(NVBIT32(DRF_EXTENT_MW(NVC369_BUF_ENTRY_REPLAYABLE_FAULT)) & faultEntry);
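
    //
    // REPLAYABLE_FAULT_EN marks packets that the client (e.g. UVM) handles;
    // REPLAYABLE_FAULT marks the fault itself as replayable, which matters
    // when GSP owns the fault buffers (see the Confidential Compute check below).
    //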

    //
    // Check the engine ID. RM doesn't service CE faults with the replayable_en bit set; such faults are serviced by
    // clients. In case a client wants to cancel such faults, it would need to make an RM control call for RCing
    // the channel.
    //
    if (*bFaultValid)
    {
        //
        // GPU is now done writing to this fault entry. A read memory barrier
        // here ensures that fault entry values are not read before the valid
        // bit is set. It is needed on architectures like PowerPC and ARM where
        // read instructions can be reordered.
        //
        portAtomicMemoryFenceLoad();

        faultEntry = MEM_RD32(((NvU32 *)kgmmuFaultBufferGetFault_HAL(pGpu, pKernelGmmu, pFaultBuffer, entryIndex) + mwEngineId));
        engineId = (faultEntry >> DRF_SHIFT_MW(NVC369_BUF_ENTRY_ENGINE_ID))
                   & DRF_MASK_MW(NVC369_BUF_ENTRY_ENGINE_ID);

        bEngineCE = ((engineId >= kgmmuGetMinCeEngineId_HAL(pKernelGmmu)) && (engineId <= kgmmuGetMaxCeEngineId_HAL(pGpu, pKernelGmmu)));
        bPbdmaFault = kfifoIsMmuFaultEngineIdPbdma(pGpu, pKernelFifo, engineId);

        *bRmServiceable = (!bClientBufEnabled || !bUvmHandledNonFatal || !(bEngineCE || bPbdmaFault));

        if (gpuIsCCFeatureEnabled(pGpu) && gpuIsGspOwnedFaultBuffersEnabled(pGpu))
        {
            *bRmServiceable &= !bUvmHandledReplayable;
        }
    }
}

static inline NV_STATUS
_kgmmuCopyFaultPktInShadowBuf_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    FAULT_BUFFER_TYPE type,
    struct GMMU_FAULT_BUFFER *pFaultBuffer,
    NvU32 getIndex,
    NvU32 entriesToCopy,
    NvU32 maxBufferEntries,
    NvBool bRmServiceable,
    THREAD_STATE_NODE *pThreadState
)
{
    GMMU_SHADOW_FAULT_BUF *pRmShadowFaultBuf = NULL;
    GMMU_CLIENT_SHADOW_FAULT_BUFFER *pClientShadowFaultBuf = NULL;
    NvU32 copiedCount = 0;
    NvU32 copyIndex = 0;
    NvU32 origShadowBufPutIndex = 0;
    NvU32 shadowBufPutIndex = 0;
    struct HW_FAULT_BUFFER *pHwFaultBuffer;
    NV_STATUS status = NV_OK;

    pHwFaultBuffer = &pFaultBuffer->hwFaultBuffers[type];

    if (entriesToCopy == 0)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (bRmServiceable)
    {
        pRmShadowFaultBuf = KERNEL_POINTER_FROM_NvP64(GMMU_SHADOW_FAULT_BUF *, pFaultBuffer->pRmShadowFaultBuffer);
    }
    else
    {
        pClientShadowFaultBuf = pFaultBuffer->pClientShadowFaultBuffer[type];
        // Fetch the current put index for the appropriate shadow buffer
        shadowBufPutIndex = kgmmuReadClientShadowBufPutIndex_HAL(pGpu, pKernelGmmu,
                                                                 GPU_GFID_PF, type);
        origShadowBufPutIndex = shadowBufPutIndex;
    }

    if ((bRmServiceable && pRmShadowFaultBuf == NULL) ||
        (!bRmServiceable && pClientShadowFaultBuf == NULL))
    {
        return NV_ERR_INVALID_POINTER;
    }

    copyIndex = getIndex;

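    //
    // Copy one packet per iteration; copyIndex wraps around the circular
    // HW buffer when it reaches maxBufferEntries.
    //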
    while (entriesToCopy > 0)
    {
        if (bRmServiceable)
        {
            copiedCount = queuePush(pRmShadowFaultBuf,
                                    kgmmuFaultBufferGetFault_HAL(pGpu, pKernelGmmu, pHwFaultBuffer, copyIndex), 1);
            if (copiedCount != 1)
            {
                return NV_ERR_NO_MEMORY;
            }
        }
        else
        {
            status = kgmmuCopyFaultPacketToClientShadowBuffer_HAL(pGpu, pKernelGmmu,
                                                                  pFaultBuffer,
                                                                  type,
                                                                  copyIndex,
                                                                  shadowBufPutIndex,
                                                                  maxBufferEntries,
                                                                  pThreadState,
                                                                  &copiedCount);
            if (status != NV_OK)
            {
                return status;
            }

            // If nothing was copied, but the status is NV_OK, check if PUT needs to be updated.
            if (copiedCount == 0)
            {
                //
                // Assert we only end up here in case of replayable faults.
                // Non-replayable copies always need to succeed.
                //
                NV_ASSERT_OR_RETURN(type == REPLAYABLE_FAULT_BUFFER, NV_ERR_INVALID_STATE);

                if (shadowBufPutIndex != origShadowBufPutIndex)
                {
                    goto update_client_put;
                }

                // Signal the caller that no copies have taken place.
                return NV_WARN_NOTHING_TO_DO;
            }

            shadowBufPutIndex = (shadowBufPutIndex + 1) % maxBufferEntries;
        }

        entriesToCopy--;
        copyIndex++;
        if (copyIndex == maxBufferEntries)
        {
            copyIndex = 0;
        }
    }

update_client_put:
    // Update the put index for the CPU driver
    if (!bRmServiceable)
    {
        // Make sure the packet reaches memory before writing the PUT.
        portAtomicMemoryFenceStore();
        kgmmuWriteClientShadowBufPutIndex_HAL(pGpu, pKernelGmmu, GPU_GFID_PF,
                                              type, shadowBufPutIndex);
    }

    return status;
}

/*!
 * @brief Copy fault packets from the RM-owned HW fault buffer to the shadow fault buffers
 *
 * @param[in]  pGpu             OBJGPU pointer
 * @param[in]  pKernelGmmu      KernelGmmu pointer
 * @param[in]  pThreadState     THREAD_STATE_NODE pointer
 * @param[out] entriesCopied    Number of fault packets copied into the shadow buffer
 * @param[in]  type             Fault buffer type [replayable/non-replayable]
 * @param[in]  bPollForValidBit Whether to poll for the valid bit on every packet up to HW PUT
 *
 * @returns
 */
NV_STATUS
kgmmuCopyMmuFaults_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    THREAD_STATE_NODE *pThreadState,
    NvU32 *entriesCopied,
    FAULT_BUFFER_TYPE type,
    NvBool bPollForValidBit
)
{
    NV_STATUS rmStatus = NV_OK;
    KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    Intr *pIntr = GPU_GET_INTR(pGpu);
    GMMU_SHADOW_FAULT_BUF *pRmShadowFaultBuf = NULL;
    GMMU_CLIENT_SHADOW_FAULT_BUFFER *pClientShadowFaultBuf = NULL;
    NvU32 nextGetIndex = 0, curGetIndex = 0;
    NvU32 totalCount = 0, curCount = 0, rmPktCount = 0;
    NvU32 maxBufferEntries = 0;
    NvBool bRmServiceable = NV_FALSE, bPrevRmServiceable = NV_FALSE;
    NvBool bFaultValid = NV_FALSE;
    struct HW_FAULT_BUFFER *pHwFaultBuffer;
    struct GMMU_FAULT_BUFFER *pFaultBuffer;
    NvU32 hwBufferPut = 0;

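    //
    // 'type' is used to index both SW structures and HW buffer registers,
    // so the SW enum values must match the HW buffer IDs (asserted below).
    //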
    ct_assert(NV_PFB_PRI_MMU_NON_REPLAY_FAULT_BUFFER == NON_REPLAYABLE_FAULT_BUFFER);
    ct_assert(NV_PFB_PRI_MMU_REPLAY_FAULT_BUFFER == REPLAYABLE_FAULT_BUFFER);

    NV_ASSERT_OR_RETURN(type == REPLAYABLE_FAULT_BUFFER ||
                        type == NON_REPLAYABLE_FAULT_BUFFER,
                        NV_ERR_INVALID_ARGUMENT);

    *entriesCopied = 0;
    pFaultBuffer = &pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF];
    pHwFaultBuffer = &pFaultBuffer->hwFaultBuffers[type];

    //
    // Sanity checks to see if SW is ready to handle interrupts. If interrupts are
    // not enabled in SW, return fine as we don't want to error out the top half.
    //
    if ((pIntr == NULL) || (intrGetIntrEn(pIntr) == INTERRUPT_TYPE_DISABLED) ||
        pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED) ||
        !pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pShadowFaultBufLock)
    {
        return NV_OK;
    }

    if ((!gpuIsCCFeatureEnabled(pGpu) || !gpuIsGspOwnedFaultBuffersEnabled(pGpu)) &&
        (type == REPLAYABLE_FAULT_BUFFER))
    {
        return NV_OK;
    }

    // If there is no replayable buffer registered, then there is no work to be done.
    if (type == REPLAYABLE_FAULT_BUFFER &&
        !pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_REPLAYABLE_FAULT_BUFFER_IN_USE))
    {
        // RM still needs to clear the interrupt to avoid an interrupt storm.
        kgmmuClearReplayableFaultIntr_HAL(pGpu, pKernelGmmu, pThreadState);
        return NV_OK;
    }

    portSyncSpinlockAcquire(pFaultBuffer->pShadowFaultBufLock);

    pRmShadowFaultBuf = KERNEL_POINTER_FROM_NvP64(GMMU_SHADOW_FAULT_BUF *, pFaultBuffer->pRmShadowFaultBuffer);
    pClientShadowFaultBuf = KERNEL_POINTER_FROM_NvP64(GMMU_CLIENT_SHADOW_FAULT_BUFFER *,
                                                      pFaultBuffer->pClientShadowFaultBuffer[type]);

    // Max entries exposed in the register takes care of 1 entry used for Full detection
    NV_ASSERT(pHwFaultBuffer->faultBufferSize);
    maxBufferEntries = (pHwFaultBuffer->faultBufferSize / NVC369_BUF_SIZE);

    if (!_kgmmuFaultBufferHasMapping(pHwFaultBuffer))
    {
        NV_PRINTF(LEVEL_ERROR,
                  "GPU %d HW's fault buffer doesn't have kernel mappings\n",
                  pGpu->gpuInstance);
        rmStatus = NV_ERR_INVALID_STATE;
        goto done;
    }

    if (pRmShadowFaultBuf == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "GPU %d RM's shadow buffer should be setup\n",
                  pGpu->gpuInstance);
        rmStatus = NV_ERR_INVALID_STATE;
        goto done;
    }

    //
    // For SRIOV vGPU, the GET and PUT registers get reset to zero during migration.
    // This results in the wrong fault buffer entry being referenced by RM after
    // migration for the next fault entry read. So for an SRIOV vGPU setup, instead
    // of using RM's cached value, always read the
    // NV_VIRTUAL_FUNCTION_PRIV_MMU_FAULT_BUFFER_GET(index) register in order to
    // identify the next fault entry to read.
    //
    if (IS_VIRTUAL_WITH_SRIOV(pGpu))
    {
        kgmmuReadFaultBufferGetPtr_HAL(pGpu, pKernelGmmu, type,
                                       &pHwFaultBuffer->cachedGetIndex, pThreadState);
    }

    nextGetIndex = curGetIndex = pHwFaultBuffer->cachedGetIndex;

    if (bPollForValidBit == NV_TRUE)
    {
        kgmmuReadFaultBufferPutPtr_HAL(pGpu, pKernelGmmu, type,
                                       &hwBufferPut, pThreadState);
    }

    if (type == NON_REPLAYABLE_FAULT_BUFFER)
    {
        // Clear the non-replayable fault pulse interrupt
        kgmmuClearNonReplayableFaultIntr_HAL(pGpu, pKernelGmmu, pThreadState);
    }
    else
    {
        // Clear the replayable fault pulse interrupt
        kgmmuClearReplayableFaultIntr_HAL(pGpu, pKernelGmmu, pThreadState);
    }

    //
    // Check how many entries are valid and serviceable by one driver in the HW fault buffer.
    // We copy everything in one go to optimize copy performance.
    //
    while (totalCount < maxBufferEntries)
    {
        _kgmmuFaultEntryRmServiceable_GV100(pGpu, pKernelGmmu, pKernelFifo,
                                            pHwFaultBuffer, nextGetIndex, hwBufferPut, (pClientShadowFaultBuf != NULL),
                                            &bRmServiceable, &bFaultValid, bPollForValidBit);

        if (!bFaultValid)
            break;

        // Non-replayable fault copy path.
        if (type == NON_REPLAYABLE_FAULT_BUFFER)
        {
            // Check if the servicing entity has changed
            if ((bPrevRmServiceable != bRmServiceable) && (curCount > 0))
            {
                rmStatus = _kgmmuCopyFaultPktInShadowBuf_GV100(pGpu, pKernelGmmu,
                                                               type,
                                                               pFaultBuffer,
                                                               curGetIndex,
                                                               curCount,
                                                               maxBufferEntries,
                                                               bPrevRmServiceable,
                                                               pThreadState);
                if (rmStatus != NV_OK)
                {
                    NV_PRINTF(LEVEL_ERROR,
                              "Failed to copy faults into GPU %d's %s shadow buffer\n",
                              pGpu->gpuInstance,
                              (bPrevRmServiceable ? "RM" : "Client"));
                    gpuMarkDeviceForReset(pGpu);
                    goto done;
                }

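                // Start a new run: advance past the entries just flushed and
                // reset the count for the new servicing entity.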
                curGetIndex = (curGetIndex + curCount) % maxBufferEntries;
                curCount = 0;
            }
        }

        //
        // A replayable fault is never RM serviceable.
        // As a result, we will never perform the fault copy within this loop,
        // but only count the number of entries.
        //
        // The number of entries is used outside of the loop to
        // determine the number of packets to copy.
        //
        if (type == REPLAYABLE_FAULT_BUFFER)
        {
            NV_ASSERT_OR_RETURN(!bRmServiceable, NV_ERR_INVALID_STATE);
        }

        if (curCount == 0)
        {
            bPrevRmServiceable = bRmServiceable;
        }

        if (bRmServiceable)
        {
            rmPktCount++;
        }

        curCount++;
        totalCount++;
        nextGetIndex++;
        if (nextGetIndex == maxBufferEntries)
        {
            nextGetIndex = 0;
            if (type == NON_REPLAYABLE_FAULT_BUFFER)
            {
                pFaultBuffer->faultBufferGenerationCounter++;
            }
        }
    }

    // Nothing to copy
    if (totalCount == 0)
    {
        //
        // Writing to GET will cause the GET != PUT condition to get reevaluated
        // and the interrupt signal will be sent again if they are not equal.
        // This is necessary because the non-replayable fault interrupt was cleared
        // earlier. Once the packet is ready and it is copied, GET == PUT and the
        // interrupt will not be retriggered.
        //

        NvU32 val;
        val = pHwFaultBuffer->cachedGetIndex;

        kgmmuWriteFaultBufferGetPtr_HAL(pGpu, pKernelGmmu, type,
                                        val, pThreadState);
        goto done;
    }

    // Copy all remaining entries
    if (curCount > 0)
    {
        rmStatus = _kgmmuCopyFaultPktInShadowBuf_GV100(pGpu, pKernelGmmu,
                                                       type,
                                                       pFaultBuffer,
                                                       curGetIndex,
                                                       curCount,
                                                       maxBufferEntries,
                                                       bPrevRmServiceable,
                                                       pThreadState);
        //
        // NV_WARN_NOTHING_TO_DO signals that no faults have been copied
        // in the replayable fault handling case. There is no need to notify the client.
        //
        if (rmStatus == NV_WARN_NOTHING_TO_DO)
        {
            rmStatus = NV_OK;
            goto done;
        }
        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR,
                      "Failed to copy faults into GPU %d's %s shadow buffer\n",
                      pGpu->gpuInstance,
                      (bPrevRmServiceable ? "RM" : "Client"));
            gpuMarkDeviceForReset(pGpu);
            goto done;
        }
    }

    //
    // Reset the valid bit in all these fault packets. It's easier to zero the
    // packets wholesale than to write INVALID one packet at a time.
    //
    curGetIndex = pHwFaultBuffer->cachedGetIndex;
    curCount = totalCount;

    if ((curGetIndex + curCount) > maxBufferEntries)
    {
        kgmmuFaultBufferClearPackets_HAL(pGpu, pKernelGmmu, pHwFaultBuffer, curGetIndex, maxBufferEntries - curGetIndex);
        curCount = totalCount - (maxBufferEntries - curGetIndex);
        curGetIndex = 0;
    }

    kgmmuFaultBufferClearPackets_HAL(pGpu, pKernelGmmu, pHwFaultBuffer, curGetIndex, curCount);

    //
    // Ensure fatalFaultIntrPending is set only after fault buffer entries have been
    // copied into the shadow buffer.
    //
    portAtomicMemoryFenceStore();

    //
    // Set the SW flag needed for interrupt processing. This should be set before we write the GET pointer, as moving
    // the GET pointer can disable the top-level interrupt, and we should not have a window where an interrupt pending state is lost.
    // As we don't mask these interrupts, top-level interrupts will be reset as soon as GET == PUT after the copy. McService
    // in RM's bottom half relies on top-level interrupts for servicing, hence we need a SW flag to state that there is
    // an intr pending for servicing.
    //
    portAtomicSetS32(&pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].fatalFaultIntrPending, 1);

    //
    // Ensure all writes to the current entry are completed before updating the
    // GET pointer.
    //
    portAtomicMemoryFenceStore();

    // Increment the GET pointer to enable HW to write new fault packets
    kgmmuWriteFaultBufferGetPtr_HAL(pGpu, pKernelGmmu, type, nextGetIndex, pThreadState);

    *entriesCopied = rmPktCount;
    pHwFaultBuffer->cachedGetIndex = nextGetIndex;

done:
    portSyncSpinlockRelease(pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pShadowFaultBufLock);
    return rmStatus;
}

void
kgmmuFaultBufferClearPackets_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    struct HW_FAULT_BUFFER *pFaultBuffer,
    NvU32 beginIdx,
    NvU32 numFaultPackets
)
{
    if (pFaultBuffer->kernelVaddr)
    {
        void *bufferAddr = (void *)(NvUPtr)(KERNEL_POINTER_FROM_NvP64(NvU64, pFaultBuffer->kernelVaddr) +
                                            (NvU64)(beginIdx * sizeof(GMMU_FAULT_PACKET)));
        portMemSet(bufferAddr, 0, sizeof(GMMU_FAULT_PACKET) * numFaultPackets);
    }
    else
    {
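        //
        // The buffer is backed by an array of discontiguous RM pages, so clear
        // packets one page at a time.
        //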
        NvU32 pktPerPage = RM_PAGE_SIZE / sizeof(GMMU_FAULT_PACKET);

        NvU32 pageNumber = beginIdx * sizeof(GMMU_FAULT_PACKET) / RM_PAGE_SIZE;
        NvU32 fstPktInPage = beginIdx % (RM_PAGE_SIZE / sizeof(GMMU_FAULT_PACKET));
        NvU32 clearInThisPage = NV_MIN(numFaultPackets, pktPerPage - fstPktInPage);

        while (numFaultPackets > 0)
        {
            NvP64 pAddress = pFaultBuffer->pBufferPages[pageNumber].pAddress;

            void *bufferAddr = (void *)(NvUPtr)(KERNEL_POINTER_FROM_NvP64(NvU64, pAddress) + fstPktInPage * sizeof(GMMU_FAULT_PACKET));

            portMemSet(bufferAddr, 0, clearInThisPage * sizeof(GMMU_FAULT_PACKET));

            pageNumber++;
            fstPktInPage = 0;
            numFaultPackets -= clearInThisPage;
            clearInThisPage = NV_MIN(pktPerPage, numFaultPackets);
        }
    }
}

GMMU_FAULT_PACKET *
kgmmuFaultBufferGetFault_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    struct HW_FAULT_BUFFER *pFaultBuffer,
    NvU32 idx
)
{
    if (pFaultBuffer->kernelVaddr)
    {
        void *pFault = (void *)(NvUPtr)(KERNEL_POINTER_FROM_NvP64(NvU64, pFaultBuffer->kernelVaddr) +
                                        (NvU64)(idx * sizeof(GMMU_FAULT_PACKET)));
        return pFault;
    }
    else
    {
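        // Page-backed buffer: locate the page holding this packet and the
        // packet's offset within that page.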
        NvU32 pageNumber = idx * sizeof(GMMU_FAULT_PACKET) / RM_PAGE_SIZE;
        NvP64 pAddress = pFaultBuffer->pBufferPages[pageNumber].pAddress;
        NvU32 inPageIdx = idx % (RM_PAGE_SIZE / sizeof(GMMU_FAULT_PACKET));
        void *pFault = (void *)(NvUPtr)(KERNEL_POINTER_FROM_NvP64(NvU64, pAddress) +
                                        inPageIdx * sizeof(GMMU_FAULT_PACKET));

        return pFault;
    }
}

/*
 * @brief Copies a single fault packet from the replayable/non-replayable
 *        HW fault buffer to the corresponding client shadow buffer
 *
 * @param[in]  pFaultBuffer       Pointer to GMMU_FAULT_BUFFER
 * @param[in]  type               Replayable/Non-replayable fault type
 * @param[in]  getIndex           Get pointer of the HW fault buffer
 * @param[in]  shadowBufPutIndex  Put pointer of the shadow buffer
 * @param[in]  maxBufferEntries   Maximum possible entries in the HW buffer
 * @param[in]  pThreadState       Pointer to THREAD_STATE_NODE
 * @param[out] pFaultsCopied      Number of fault packets copied by the function
 *
 * @returns NV_STATUS
 */
NV_STATUS
kgmmuCopyFaultPacketToClientShadowBuffer_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    struct GMMU_FAULT_BUFFER *pFaultBuffer,
    FAULT_BUFFER_TYPE type,
    NvU32 getIndex,
    NvU32 shadowBufPutIndex,
    NvU32 maxBufferEntries,
    THREAD_STATE_NODE *pThreadState,
    NvU32 *pFaultsCopied
)
{
    GMMU_SHADOW_FAULT_BUF *pQueue = NULL;
    struct HW_FAULT_BUFFER *pHwFaultBuffer = NULL;
    GMMU_CLIENT_SHADOW_FAULT_BUFFER *pClientShadowFaultBuf = NULL;

    pHwFaultBuffer = &pFaultBuffer->hwFaultBuffers[type];
    pClientShadowFaultBuf = pFaultBuffer->pClientShadowFaultBuffer[type];

    pQueue = (GMMU_SHADOW_FAULT_BUF *) pClientShadowFaultBuf->pQueueAddress;

    *pFaultsCopied = queuePushNonManaged(pQueue, &pClientShadowFaultBuf->queueContext,
                                         kgmmuFaultBufferGetFault_HAL(pGpu, pKernelGmmu,
                                                                      pHwFaultBuffer,
                                                                      getIndex),
                                         1);
    if (*pFaultsCopied == 0)
    {
        return NV_ERR_BUFFER_TOO_SMALL;
    }

    return NV_OK;
}

/*!
 * @brief Get the engine ID associated with the min CE
 *
 * @param[in] pKernelGmmu KernelGmmu object
 *
 * return engine ID of the min CE
 */
NvU32
kgmmuGetMinCeEngineId_GV100
(
    KernelGmmu *pKernelGmmu
)
{
    return NV_PFAULT_MMU_ENG_ID_CE0;
}

/*!
 * @brief Get the engine ID associated with the max CE
 *
 * @param[in] pGpu        OBJGPU object
 * @param[in] pKernelGmmu KernelGmmu object
 *
 * return engine ID of the max CE
 */
NvU32
kgmmuGetMaxCeEngineId_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu
)
{
    return NV_PFAULT_MMU_ENG_ID_CE8;
}

/*!
 * @brief Creates the shadow fault buffer for RM fatal fault handling
 *
 * @param[in] pGpu        OBJGPU pointer
 * @param[in] pKernelGmmu KernelGmmu pointer
 *
 * @returns
 */
static NV_STATUS
_kgmmuAllocShadowFaultBuffer_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu
)
{
    NV_STATUS rmStatus = NV_OK;
    GMMU_SHADOW_FAULT_BUF *gmmuShadowFaultBuf = NULL;
    NvU32 queueMaxEntries = 0;
    struct HW_FAULT_BUFFER *pFaultBuffer;

    // NvPort should be initialized in RM before these calls are made
    if (!portIsInitialized())
    {
        NV_PRINTF(LEVEL_ERROR, "NvPort needed but not initialized\n");
        NV_ASSERT(0);
    }

    if (pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED))
    {
        NV_PRINTF(LEVEL_ERROR, "Fault-Buffer is disabled. ShadowBuffer cannot be created\n");
        NV_ASSERT_OR_RETURN(0, NV_ERR_INVALID_STATE);
    }

    pFaultBuffer = &pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].hwFaultBuffers[NV_PFB_PRI_MMU_NON_REPLAY_FAULT_BUFFER];
    NV_ASSERT_OR_RETURN(pFaultBuffer->faultBufferSize != 0, NV_ERR_INVALID_ARGUMENT);

    // Allocate memory for the queue data structure and initialize the queue
    gmmuShadowFaultBuf = portMemAllocNonPaged(sizeof(GMMU_SHADOW_FAULT_BUF));
    NV_ASSERT_OR_RETURN(gmmuShadowFaultBuf != NULL, NV_ERR_NO_MEMORY);

    queueMaxEntries = (pFaultBuffer->faultBufferSize / NVC369_BUF_SIZE);
    rmStatus = queueInit(gmmuShadowFaultBuf, portMemAllocatorGetGlobalNonPaged(), queueMaxEntries);
    if (rmStatus != NV_OK)
        goto error;

    //
    // A spinlock is needed to protect shadow buffer setup, as it gets accessed
    // from the top half and the clientShadowBuffer can get set up/destroyed later.
    //
    pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pShadowFaultBufLock = portSyncSpinlockCreate(portMemAllocatorGetGlobalNonPaged());
    if (!pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pShadowFaultBufLock)
    {
        rmStatus = NV_ERR_INSUFFICIENT_RESOURCES;
        goto error;
    }

    pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pRmShadowFaultBuffer = NV_PTR_TO_NvP64(gmmuShadowFaultBuf);
    return NV_OK;

error:
    portMemFree(gmmuShadowFaultBuf);
    return rmStatus;
}

NV_STATUS
kgmmuFaultBufferInit_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu
)
{
    NV_STATUS status;
    NvU32 faultBufferSize;

    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu) ||
        pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED))
        return NV_OK;

    ct_assert(NV_PFB_PRI_MMU_REPLAY_FAULT_BUFFER == REPLAYABLE_FAULT_BUFFER);
    ct_assert(NV_PFB_PRI_MMU_NON_REPLAY_FAULT_BUFFER == NON_REPLAYABLE_FAULT_BUFFER);

    faultBufferSize = kgmmuSetAndGetDefaultFaultBufferSize_HAL(pGpu, pKernelGmmu,
                                                               NON_REPLAYABLE_FAULT_BUFFER,
                                                               GPU_GFID_PF);
    status = kgmmuFaultBufferAlloc(pGpu, pKernelGmmu,
                                   NON_REPLAYABLE_FAULT_BUFFER,
                                   faultBufferSize);
    if (status != NV_OK)
    {
        return status;
    }

    // Allocate the shadow fault buffer for fatal fault handling inside RM
    status = _kgmmuAllocShadowFaultBuffer_GV100(pGpu, pKernelGmmu);
    if (status != NV_OK)
    {
        (void) kgmmuFaultBufferFree(pGpu, pKernelGmmu, NON_REPLAYABLE_FAULT_BUFFER);
        return status;
    }

    return NV_OK;
}

NV_STATUS
kgmmuFaultBufferLoad_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU32 index,
    NvU32 gfid
)
{
    NV_STATUS status;

    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu) ||
        pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED))
        return NV_OK;

    // Skip the map if this is a GC6 cycle as FB is preserved.
    if (!IS_GPU_GC6_STATE_EXITING(pGpu))
    {
        status = kgmmuFaultBufferMap(pGpu, pKernelGmmu, index, gfid);
        if (status != NV_OK)
        {
            return status;
        }
        // Init the PUT pointer PRI before use. Applicable only to Hopper CC.
        kgmmuWriteClientShadowBufPutIndex_HAL(pGpu, pKernelGmmu, gfid, index, 0);
    }

    status = kgmmuEnableFaultBuffer_HAL(pGpu, pKernelGmmu, index, NV_FALSE, gfid);
    if (status != NV_OK)
    {
        (void) kgmmuFaultBufferUnmap(pGpu, pKernelGmmu, index, gfid);
        return status;
    }

    return NV_OK;
}

NV_STATUS
kgmmuEnableFaultBuffer_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU32 index,
    NvBool bIsErrorRecovery,
    NvU32 gfid
)
{
    NvU32 faultBufferHi;
    NvU32 faultBufferLo;
    NvU32 regVal;
    struct HW_FAULT_BUFFER *pFaultBuffer;

    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu) ||
        (IS_VIRTUAL(pGpu) && gpuIsWarBug200577889SriovHeavyEnabled(pGpu)) ||
        pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED))
    {
        return NV_OK;
    }

    NV_ASSERT_OR_RETURN((index < NUM_FAULT_BUFFERS), NV_ERR_INVALID_ARGUMENT);

    pFaultBuffer = &pKernelGmmu->mmuFaultBuffer[gfid].hwFaultBuffers[index];

    faultBufferHi = NvU64_HI32(pFaultBuffer->bar2FaultBufferAddr);
    faultBufferLo = NvU64_LO32(pFaultBuffer->bar2FaultBufferAddr);
    kgmmuWriteMmuFaultBufferHiLo_HAL(pGpu, pKernelGmmu, index, faultBufferLo, faultBufferHi, gfid);

    kgmmuSetAndGetDefaultFaultBufferSize_HAL(pGpu, pKernelGmmu, index, gfid);

    regVal = kgmmuReadMmuFaultBufferSize_HAL(pGpu, pKernelGmmu, index, gfid);
    regVal = FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_BUFFER_SIZE, _ENABLE, _TRUE, regVal);

    if (index == NON_REPLAYABLE_FAULT_BUFFER)
    {
        //
        // Non-Replayable Fault buffer needs overflow interrupt reporting
        // as overflow is considered fatal due to fault packet loss. Also
        // this interrupt will disable non_replayable interrupt when raised.
        //
        regVal = FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_BUFFER_SIZE, _OVERFLOW_INTR, _ENABLE, regVal);
    }
    else
    {
        //
        // Replayable Fault buffer does not need overflow interrupt reporting
        // as overflow is not considered fatal. There is no fault packet loss
        // due to replays.
        //
        regVal = FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_BUFFER_SIZE, _OVERFLOW_INTR, _DISABLE, regVal);
    }

    kgmmuWriteMmuFaultBufferSize_HAL(pGpu, pKernelGmmu, index, regVal, gfid);

    // Don't touch interrupts if called in the error recovery path
    if (!bIsErrorRecovery)
    {
        kgmmuEnableMmuFaultInterrupts_HAL(pGpu, pKernelGmmu, index);
        kgmmuEnableMmuFaultOverflowIntr_HAL(pGpu, pKernelGmmu, index);
    }
    return NV_OK;
}

NV_STATUS
kgmmuDisableFaultBuffer_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU32 index,
    NvBool bIsErrorRecovery,
    NvU32 gfid
)
{
    NV_STATUS rmStatus = NV_OK;
    NvU32 faultBufferSize;

    NV_ASSERT_OR_RETURN((index < NUM_FAULT_BUFFERS), NV_ERR_INVALID_ARGUMENT);

    if (pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED) ||
        (IS_VIRTUAL(pGpu) && gpuIsWarBug200577889SriovHeavyEnabled(pGpu)))
    {
        return NV_OK;
    }

    rmStatus = gpuSanityCheckRegisterAccess(pGpu, 0, NULL);
    if (rmStatus != NV_OK)
        return rmStatus;

    //
    // Before disabling the fault buffer, make sure that no packets are pending in the pipe.
    // The status register here provides status for all fault buffers (replayable/non-replayable),
    // and ideally we should have a separate status register for these buffers. This is tracked in
    // Bug 1848948.
    //
    if (!bIsErrorRecovery)
    {
        RMTIMEOUT timeout;
        gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

        while (FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _BUSY, _TRUE,
                            kgmmuReadMmuFaultStatus_HAL(pGpu, pKernelGmmu, gfid)))
        {
            rmStatus = gpuCheckTimeout(pGpu, &timeout);
            if (rmStatus == NV_ERR_TIMEOUT)
            {
                NV_PRINTF(LEVEL_ERROR, "HW couldn't flush %s buffer.\n",
                          (index == REPLAYABLE_FAULT_BUFFER) ? "REPLAYABLE_FAULT_BUFFER" : "NON_REPLAYABLE_FAULT_BUFFER");
                DBG_BREAKPOINT();
                break;
            }
        }
    }

    faultBufferSize = kgmmuReadMmuFaultBufferSize_HAL(pGpu, pKernelGmmu, index, gfid);
    faultBufferSize = FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_BUFFER_SIZE, _ENABLE, _FALSE,
                                  faultBufferSize);
    kgmmuWriteMmuFaultBufferSize_HAL(pGpu, pKernelGmmu, index, faultBufferSize, gfid);

    return rmStatus;
}

NV_STATUS
kgmmuFaultBufferDestroy_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu
)
{
    NV_STATUS status;

    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu) ||
        pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED))
        return NV_OK;

    status = kgmmuClientShadowFaultBufferFree_HAL(pGpu, pKernelGmmu, NON_REPLAYABLE_FAULT_BUFFER);
    if (status != NV_OK)
        return status;

    // Free the RM shadow fault buffer created for fatal fault handling
    if (pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pRmShadowFaultBuffer)
    {
        queueDestroy(KERNEL_POINTER_FROM_NvP64(GMMU_SHADOW_FAULT_BUF *,
                                               pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pRmShadowFaultBuffer));
        portMemFree(NvP64_VALUE(pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pRmShadowFaultBuffer));
        pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pRmShadowFaultBuffer = NvP64_NULL;
    }

    if (pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pShadowFaultBufLock)
    {
        portSyncSpinlockDestroy(pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pShadowFaultBufLock);
        pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pShadowFaultBufLock = NULL;
    }

    (void) kgmmuFaultBufferFree(pGpu, pKernelGmmu, NON_REPLAYABLE_FAULT_BUFFER);

    return NV_OK;
}

NV_STATUS
kgmmuFaultBufferUnload_GV100
(
    OBJGPU *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU32 index,
    NvU32 gfid
)
{
    if (IS_VIRTUAL_WITHOUT_SRIOV(pGpu) ||
        pKernelGmmu->getProperty(pKernelGmmu, PDB_PROP_KGMMU_FAULT_BUFFER_DISABLED))
        return NV_OK;

    kgmmuDisableFaultBuffer_HAL(pGpu, pKernelGmmu, index, NV_FALSE, gfid);

    // Skip the unmap if this is a GC6 cycle as FB is preserved.
    if (!IS_GPU_GC6_STATE_ENTERING(pGpu))
        kgmmuFaultBufferUnmap(pGpu, pKernelGmmu, index, gfid);

    return NV_OK;
}

/**
 * @brief Sign extend a fault address to a supported width as per UVM requirements
 */
void
kgmmuSignExtendFaultAddress_GV100
(
    OBJGPU     *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU64      *pMmuFaultAddress
)
{
    NvU32 cpuAddrShift   = osGetCpuVaAddrShift();
    NvU32 gpuVaAddrShift = portUtilCountTrailingZeros64(pKernelGmmu->maxVASize);
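
    //
    // Canonical-form refresher (illustrative, not from HW docs): on x86-64
    // with 48 implemented VA bits, bits 63:48 of a canonical address must all
    // equal bit 47. Left-shifting a signed value by (64 - 48) and
    // arithmetically shifting it back replicates bit 47 upward, e.g.
    // 0x0000800000000000 becomes 0xFFFF800000000000.
    //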
    // Sign extend VA to ensure it's in canonical form if required
    if (gpuVaAddrShift >= cpuAddrShift)
    {
        switch (pGpu->busInfo.oorArch)
        {
            case OOR_ARCH_X86_64:
            case OOR_ARCH_ARM:
            case OOR_ARCH_AARCH64:
                *pMmuFaultAddress = (NvU64)(((NvS64)*pMmuFaultAddress << (64 - cpuAddrShift)) >>
                                            (64 - cpuAddrShift));
                break;
            case OOR_ARCH_PPC64LE:
                break;
            case OOR_ARCH_NONE:
                NV_ASSERT_FAILED("Invalid oor address mode type.");
                break;
        }
    }
    else
    {
        switch (pGpu->busInfo.oorArch)
        {
            case OOR_ARCH_X86_64:
                *pMmuFaultAddress = (NvU64)(((NvS64)*pMmuFaultAddress << (64 - 48)) >>
                                            (64 - 48));
                break;
            case OOR_ARCH_ARM:
            case OOR_ARCH_AARCH64:
                *pMmuFaultAddress = (NvU64)(((NvS64)*pMmuFaultAddress << (64 - 49)) >>
                                            (64 - 49));
                break;
            case OOR_ARCH_PPC64LE:
                break;
            case OOR_ARCH_NONE:
                NV_ASSERT_FAILED("Invalid oor address mode type.");
                break;
        }
    }
}

/*!
 * @brief Parses a fault buffer entry and returns the fault type
 *
 * @param[in]  fault          Fault value
 * @param[out] pMmuFaultType  Fault type
 *
 * @returns NV_OK on success, NV_ERR_INVALID_ARGUMENT if the fault value is
 *          not recognized.
 */
NV_STATUS
kgmmuGetFaultType_GV100
(
    OBJGPU     *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU32       fault,
    FAULT_TYPE *pMmuFaultType
)
{
    NV_ASSERT_OR_RETURN(pMmuFaultType != NULL, NV_ERR_INVALID_POINTER);
    switch (fault)
    {
        case NV_PFAULT_FAULT_TYPE_PDE:
            *pMmuFaultType = fault_invalidPde;
            break;

        case NV_PFAULT_FAULT_TYPE_PDE_SIZE:
            *pMmuFaultType = fault_invalidPdeSize;
            break;

        case NV_PFAULT_FAULT_TYPE_PTE:
            *pMmuFaultType = fault_invalidPte;
            break;

        case NV_PFAULT_FAULT_TYPE_VA_LIMIT_VIOLATION:
            *pMmuFaultType = fault_limitViolation;
            break;

        case NV_PFAULT_FAULT_TYPE_UNBOUND_INST_BLOCK:
            *pMmuFaultType = fault_unboundInstBlock;
            break;

        case NV_PFAULT_FAULT_TYPE_PRIV_VIOLATION:
            *pMmuFaultType = fault_privViolation;
            break;

        case NV_PFAULT_FAULT_TYPE_RO_VIOLATION:
            *pMmuFaultType = fault_write;
            break;

        case NV_PFAULT_FAULT_TYPE_WO_VIOLATION:
            *pMmuFaultType = fault_read;
            break;

        case NV_PFAULT_FAULT_TYPE_PITCH_MASK_VIOLATION:
            *pMmuFaultType = fault_pitchMaskViolation;
            break;

        case NV_PFAULT_FAULT_TYPE_WORK_CREATION:
            *pMmuFaultType = fault_workCreation;
            break;

        case NV_PFAULT_FAULT_TYPE_UNSUPPORTED_APERTURE:
            *pMmuFaultType = fault_unsupportedAperture;
            break;

        case NV_PFAULT_FAULT_TYPE_COMPRESSION_FAILURE:
            *pMmuFaultType = fault_compressionFailure;
            break;

        case NV_PFAULT_FAULT_TYPE_UNSUPPORTED_KIND:
            *pMmuFaultType = fault_unsupportedKind;
            break;

        case NV_PFAULT_FAULT_TYPE_REGION_VIOLATION:
            *pMmuFaultType = fault_regionViolation;
            break;

        case NV_PFAULT_FAULT_TYPE_POISONED:
            *pMmuFaultType = fault_poison;
            break;

        case NV_PFAULT_FAULT_TYPE_ATOMIC_VIOLATION:
            *pMmuFaultType = fault_atomic;
            break;

        default:
            return NV_ERR_INVALID_ARGUMENT;
    }

    return NV_OK;
}

/*!
 * @brief Check if the given engineID is PHYSICAL
 *
 * @param[in] pKernelGmmu  KernelGmmu object
 * @param[in] engineID     Engine ID
 *
 * @return True if PHYSICAL
 */
NvBool
kgmmuIsFaultEnginePhysical_GV100
(
    KernelGmmu *pKernelGmmu,
    NvU32       engineID
)
{
    return (engineID == NV_PFAULT_MMU_ENG_ID_PHYSICAL);
}

/*!
 * @brief Parse a raw fault buffer packet
 *
 * @param[in]  pGpu               OBJGPU pointer
 * @param[in]  pKernelGmmu        KernelGmmu pointer
 * @param[in]  pFaultPacket       Raw fault packet data
 * @param[out] pParsedFaultEntry  Parsed fault entry
 *
 * @returns NV_OK on success, or an error if the fault type cannot be parsed.
 */
NV_STATUS
kgmmuParseFaultPacket_GV100
(
    OBJGPU     *pGpu,
    KernelGmmu *pKernelGmmu,
    NvP64       pFaultPacket,
    NvP64       pParsedFaultEntry
)
{
    NV_STATUS rmStatus = NV_OK;
    NvU32 *faultEntry = KERNEL_POINTER_FROM_NvP64(NvU32 *, pFaultPacket);
    MMU_FAULT_BUFFER_ENTRY *pParsedEntry = KERNEL_POINTER_FROM_NvP64(MMU_FAULT_BUFFER_ENTRY *, pParsedFaultEntry);
    NvU64 addrHi, addrLo;
    NvU64 timestampLo, timestampHi;
    NvU32 tmp;
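
    //
    // The packet fields are multi-word values extracted with DRF_VAL_MW. The
    // instance pointer arrives as a 4KB-aligned value split across the
    // _INST_LO/_INST_HI fields; the two halves are concatenated and shifted
    // left by 12 to recover the byte address.
    //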
    addrLo = DRF_VAL_MW(C369, _BUF_ENTRY, _INST_LO, faultEntry);
    addrHi = DRF_VAL_MW(C369, _BUF_ENTRY, _INST_HI, faultEntry);
    pParsedEntry->mmuFaultInstBlock.address = addrLo + (addrHi << DRF_SIZE_MW(NVC369_BUF_ENTRY_INST_LO));
    pParsedEntry->mmuFaultInstBlock.address <<= 12;

    tmp = DRF_VAL_MW(C369, _BUF_ENTRY, _INST_APERTURE, faultEntry);
    pParsedEntry->mmuFaultInstBlock.aperture = tmp;
    pParsedEntry->mmuFaultInstBlock.gfid = GPU_GFID_PF;

    VERIFY_INST_BLOCK_APERTURE(NVC369_BUF_ENTRY_INST_APERTURE_VID_MEM,
                               NVC369_BUF_ENTRY_INST_APERTURE_SYS_MEM_COHERENT,
                               NVC369_BUF_ENTRY_INST_APERTURE_SYS_MEM_NONCOHERENT);

    addrLo = DRF_VAL_MW(C369, _BUF_ENTRY, _ADDR_LO, faultEntry);
    addrHi = DRF_VAL_MW(C369, _BUF_ENTRY, _ADDR_HI, faultEntry);
    pParsedEntry->mmuFaultAddress = (addrLo + (addrHi << DRF_SIZE_MW(NVC369_BUF_ENTRY_ADDR_LO))) <<
                                    (DRF_BASE_MW(NVC369_BUF_ENTRY_ADDR_LO));

    kgmmuSignExtendFaultAddress_HAL(pGpu, pKernelGmmu, &pParsedEntry->mmuFaultAddress);

    timestampLo = DRF_VAL_MW(C369, _BUF_ENTRY, _TIMESTAMP_LO, faultEntry);
    timestampHi = DRF_VAL_MW(C369, _BUF_ENTRY, _TIMESTAMP_HI, faultEntry);
    pParsedEntry->mmuFaultTimestamp = timestampLo + (timestampHi << DRF_SIZE_MW(NVC369_BUF_ENTRY_TIMESTAMP_LO));

    tmp = DRF_VAL_MW(C369, _BUF_ENTRY, _FAULT_TYPE, faultEntry);
    rmStatus = kgmmuGetFaultType_HAL(pGpu, pKernelGmmu, tmp, &pParsedEntry->mmuFaultType);
    NV_ASSERT_OR_RETURN(rmStatus == NV_OK, rmStatus);

    pParsedEntry->mmuFaultAccessType = DRF_VAL_MW(C369, _BUF_ENTRY, _ACCESS_TYPE, faultEntry);

    pParsedEntry->mmuFaultEngineId = DRF_VAL_MW(C369, _BUF_ENTRY, _ENGINE_ID, faultEntry);
    pParsedEntry->mmuFaultClientId = DRF_VAL_MW(C369, _BUF_ENTRY, _CLIENT, faultEntry);
    pParsedEntry->mmuFaultClientType = DRF_VAL_MW(C369, _BUF_ENTRY, _MMU_CLIENT_TYPE, faultEntry);
    pParsedEntry->mmuFaultGpcId = DRF_VAL_MW(C369, _BUF_ENTRY, _GPC_ID, faultEntry);
    pParsedEntry->bFaultEntryValid = DRF_VAL_MW(C369, _BUF_ENTRY, _VALID, faultEntry);
    pParsedEntry->bFaultInProtectedMode = DRF_VAL_MW(C369, _BUF_ENTRY, _PROTECTED_MODE, faultEntry);
    pParsedEntry->bFaultTypeReplayable = DRF_VAL_MW(C369, _BUF_ENTRY, _REPLAYABLE_FAULT, faultEntry);
    pParsedEntry->bReplayableFaultEn = DRF_VAL_MW(C369, _BUF_ENTRY, _REPLAYABLE_FAULT_EN, faultEntry);
    return rmStatus;
}

/*!
 * @brief Handles a non-replayable fault based on a fault packet
 *
 * @param[in] pGpu          OBJGPU pointer
 * @param[in] pKernelGmmu   KernelGmmu pointer
 * @param[in] pFaultPacket  Raw fault packet data
 *
 * @returns NV_OK on success, or
 *          NV_ERR_INVALID_DATA if the fault packet data is not valid.
 */
NV_STATUS
kgmmuHandleNonReplayableFaultPacket_GV100
(
    OBJGPU            *pGpu,
    KernelGmmu        *pKernelGmmu,
    GMMU_FAULT_PACKET *pFaultPacket
)
{
    NV_STATUS rmStatus = NV_OK;
    MMU_FAULT_BUFFER_ENTRY parsedFaultEntry;
    FIFO_MMU_EXCEPTION_DATA MmuExceptionData = {0};

    // Parse the fault packet. The RM fault buffer holds only RM-serviceable packets.
    rmStatus = kgmmuParseFaultPacket_HAL(pGpu, pKernelGmmu, NV_PTR_TO_NvP64(pFaultPacket),
                                         NV_PTR_TO_NvP64(&parsedFaultEntry));
    if (rmStatus != NV_OK)
        return NV_ERR_INVALID_DATA;

    // Create the exception data, used only for printing fault info
    MmuExceptionData.addrLo = (NvU32)(parsedFaultEntry.mmuFaultAddress & 0xFFFFFFFFUL);
    MmuExceptionData.addrHi = (NvU32)(parsedFaultEntry.mmuFaultAddress >> 32);
    MmuExceptionData.faultType = parsedFaultEntry.mmuFaultType;
    MmuExceptionData.clientId = parsedFaultEntry.mmuFaultClientId;
    if (parsedFaultEntry.mmuFaultClientType == NV_PFAULT_MMU_CLIENT_TYPE_GPC)
    {
        MmuExceptionData.bGpc = NV_TRUE;
        MmuExceptionData.gpcId = parsedFaultEntry.mmuFaultGpcId;
    }
    else
    {
        MmuExceptionData.bGpc = NV_FALSE;
        MmuExceptionData.gpcId = 0;
    }
    MmuExceptionData.accessType = parsedFaultEntry.mmuFaultAccessType;
    MmuExceptionData.faultEngineId = parsedFaultEntry.mmuFaultEngineId;

    kgmmuPrintFaultInfo_HAL(pGpu, pKernelGmmu, parsedFaultEntry.mmuFaultEngineId, &MmuExceptionData);

    rmStatus = kgmmuServiceMmuFault_HAL(pGpu, pKernelGmmu, NV_PTR_TO_NvP64(&parsedFaultEntry),
                                        &MmuExceptionData);
    NV_ASSERT(rmStatus == NV_OK);

    // Send an event notifier in response to an MMU poison fault.
    if ((parsedFaultEntry.mmuFaultType == fault_poison) &&
        gpuIsGlobalPoisonFuseEnabled(pGpu))
    {
        NV_ERROR_CONT_LOCATION loc = {0};

        // Error containment location information
        loc.locType = NV_ERROR_CONT_LOCATION_TYPE_NONE;

        // Generate the error containment Xid, send NV2080_NOTIFIER*, and mark
        // the device for Reset or Drain-And-Reset.
        NV_ASSERT_OK_OR_RETURN(
            gpuUpdateErrorContainmentState_HAL(pGpu,
                                               NV_ERROR_CONT_ERR_ID_E13_MMU_POISON, // Error type
                                               loc,                                 // Location
                                               NULL));                              // RC error code
    }

    return rmStatus;
}

NV_STATUS
kgmmuNotifyNonReplayableFault_GV100
(
    OBJGPU     *pGpu,
    KernelGmmu *pKernelGmmu,
    NvBool      bNotifyClient
)
{
    NV_STATUS rmStatus = NV_OK;
    PEVENTNOTIFICATION *ppEventNotification = NULL;
    GMMU_CLIENT_SHADOW_FAULT_BUFFER *pClientShadowBuffer = NULL;

    //
    // Clients are notified unconditionally, as they are expected to walk
    // their shadow buffer and filter out any redundant faults.
    //
    if (CliGetEventNotificationList(pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].hFaultBufferClient,
            pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].hFaultBufferObject, NULL, &ppEventNotification) == NV_OK &&
        ppEventNotification != NULL)
    {
        //
        // Check whether the client has registered a shadow buffer. If not,
        // notify the client only when RM has serviced a fault packet. This
        // reduces duplicate notifications and speeds up tests.
        //
        pClientShadowBuffer =
            pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pClientShadowFaultBuffer[NON_REPLAYABLE_FAULT_BUFFER];

        if (pClientShadowBuffer != NULL || bNotifyClient)
        {
            rmStatus = notifyEvents(pGpu, *ppEventNotification, NVC369_NOTIFIER_MMU_FAULT_NON_REPLAYABLE,
                                    0, 0, NV_OK, NV_OS_WRITE_THEN_AWAKEN);
            NV_ASSERT(rmStatus == NV_OK);
        }
    }

    return rmStatus;
}

NV_STATUS
kgmmuServiceNonReplayableFault_GV100
(
    OBJGPU     *pGpu,
    KernelGmmu *pKernelGmmu
)
{
    NV_STATUS rmStatus = NV_OK;
    GMMU_SHADOW_FAULT_BUF *pShadowFaultBuf;
    GMMU_FAULT_PACKET faultPacket;
    NvBool bNotifyClient = NV_FALSE;

    pShadowFaultBuf = KERNEL_POINTER_FROM_NvP64(GMMU_SHADOW_FAULT_BUF *, pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].pRmShadowFaultBuffer);
    NV_ASSERT_OR_RETURN(pShadowFaultBuf != NULL, NV_ERR_INVALID_POINTER);

    //
    // Non-replayable interrupts are left enabled: all data has already been
    // copied out of the HW buffers, and interrupts must remain armed in case
    // a new fault is generated.
    // Walk the fault packets in the shadow fault buffer and service anything
    // RM owns.
    //
    while (!queueIsEmpty(pShadowFaultBuf))
    {
        portMemSet((void *)&faultPacket, 0, sizeof(GMMU_FAULT_PACKET));
        queuePopAndCopy(pShadowFaultBuf, (void *)&faultPacket);

        rmStatus = kgmmuHandleNonReplayableFaultPacket_HAL(pGpu, pKernelGmmu, &faultPacket);
        if (rmStatus != NV_OK)
            return rmStatus;

        bNotifyClient = NV_TRUE;
    }

    rmStatus = kgmmuNotifyNonReplayableFault_HAL(pGpu, pKernelGmmu, bNotifyClient);
    return rmStatus;
}

/**
 * @brief Handles an engine or PBDMA MMU fault
 *
 * An "engine" here is an engine downstream of host (graphics, CE, etc.).
 *
 * @param[in] pGpu               OBJGPU pointer
 * @param[in] pKernelGmmu        KernelGmmu pointer
 * @param[in] pParsedFaultInfo   Parsed fault entry
 * @param[in] pMmuExceptionData  FIFO exception packet, used
 *                               for printing fault info
 *
 * @returns NV_OK on success, or the first error encountered while servicing
 *          the fault.
 */
NV_STATUS
kgmmuServiceMmuFault_GV100
(
    OBJGPU                  *pGpu,
    KernelGmmu              *pKernelGmmu,
    NvP64                    pParsedFaultInfo,
    FIFO_MMU_EXCEPTION_DATA *pMmuExceptionData
)
{
    NV_STATUS rmStatus = NV_OK;
    KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    KernelChannel *pKernelChannel = NULL;
    MMU_FAULT_BUFFER_ENTRY *pParsedFaultEntry = KERNEL_POINTER_FROM_NvP64(MMU_FAULT_BUFFER_ENTRY *, pParsedFaultInfo);

    // Find channel ID from instance pointer
    rmStatus = kfifoConvertInstToKernelChannel_HAL(pGpu, pKernelFifo, &pParsedFaultEntry->mmuFaultInstBlock, &pKernelChannel);

    if (rmStatus != NV_OK || pKernelChannel == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Could not get chid from inst addr\n");
        DBG_BREAKPOINT();
        return rmStatus;
    }
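
    //
    // On SR-IOV vGPU the guest RM performs channel recovery itself: unless
    // MMU debug mode is enabled for a GR channel, the channel is reset via an
    // RPC to the host and the error notifier is updated here, and we return
    // without reaching the shared servicing path below.
    //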
    // Reset channel
    if (IS_VIRTUAL_WITH_SRIOV(pGpu))
    {
        NvBool bIsMmuDebugModeEnabled = NV_FALSE;
        NvU32 engDesc;

        rmStatus = kchannelGetEngine_HAL(pGpu, pKernelChannel, &engDesc);
        if ((rmStatus == NV_OK) && IS_GR(engDesc))
        {
            NV0090_CTRL_GET_MMU_DEBUG_MODE_PARAMS params;

            portMemSet(&params, 0, sizeof(params));
            NV_RM_RPC_CONTROL(pGpu,
                              RES_GET_CLIENT_HANDLE(pKernelChannel),
                              RES_GET_HANDLE(pKernelChannel),
                              NV0090_CTRL_CMD_GET_MMU_DEBUG_MODE,
                              &params,
                              sizeof(params),
                              rmStatus);

            if (rmStatus != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR,
                          "RM control call to read MMU debug mode failed, rmStatus 0x%x\n",
                          rmStatus);
                DBG_BREAKPOINT();
            }
            else
            {
                bIsMmuDebugModeEnabled = params.bMode;
            }
        }

        NV_PRINTF(LEVEL_INFO, "bIsMmuDebugModeEnabled: %s\n",
                  bIsMmuDebugModeEnabled ? "TRUE" : "FALSE");

        if (!bIsMmuDebugModeEnabled)
        {
            RmCtrlParams rmCtrlParams = {0};
            NV906F_CTRL_CMD_RESET_CHANNEL_PARAMS resetChannelParams = {0};
            RsClient *pClient = RES_GET_CLIENT(pKernelChannel);
            Device *pDevice = GPU_RES_GET_DEVICE(pKernelChannel);
            NvU32 subdeviceInstance = gpumgrGetSubDeviceInstanceFromGpu(pGpu);
            Subdevice *pSubDevice;
            RM_ENGINE_TYPE rmEngineType;

            rmStatus = subdeviceGetByInstance(pClient, RES_GET_HANDLE(pDevice),
                                              subdeviceInstance, &pSubDevice);
            if (rmStatus != NV_OK)
                return rmStatus;

            GPU_RES_SET_THREAD_BC_STATE(pSubDevice);

            rmCtrlParams.hClient = RES_GET_CLIENT_HANDLE(pKernelChannel);
            rmCtrlParams.hObject = RES_GET_HANDLE(pKernelChannel);
            rmCtrlParams.cmd = NV906F_CTRL_CMD_RESET_CHANNEL;
            rmCtrlParams.pParams = &resetChannelParams;
            rmCtrlParams.paramsSize = sizeof(NV906F_CTRL_CMD_RESET_CHANNEL_PARAMS);

            if (kfifoIsMmuFaultEngineIdPbdma(pGpu, pKernelFifo, pParsedFaultEntry->mmuFaultEngineId))
            {
                rmEngineType = RM_ENGINE_TYPE_HOST;
            }
            else
            {
                rmStatus = kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo,
                    ENGINE_INFO_TYPE_MMU_FAULT_ID, pParsedFaultEntry->mmuFaultEngineId,
                    ENGINE_INFO_TYPE_RM_ENGINE_TYPE, (NvU32 *)&rmEngineType);
                NV_ASSERT(rmStatus == NV_OK);
            }

            resetChannelParams.engineID = gpuGetNv2080EngineType(rmEngineType);
            resetChannelParams.subdeviceInstance = pSubDevice->subDeviceInst;
            resetChannelParams.resetReason = NV906F_CTRL_CMD_RESET_CHANNEL_REASON_MMU_FLT;

            // Update the per-channel error notifier before performing the RC
            rmStatus = krcErrorSetNotifier(pGpu, GPU_GET_KERNEL_RC(pGpu),
                                           pKernelChannel,
                                           ROBUST_CHANNEL_FIFO_ERROR_MMU_ERR_FLT,
                                           rmEngineType,
                                           RC_NOTIFIER_SCOPE_TSG);
            if (rmStatus != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Failed to set error notifier, rmStatus 0x%x\n",
                          rmStatus);
                DBG_BREAKPOINT();
            }

            //
            // Reset rmStatus before issuing the reset-channel RPC: the status
            // we return should be that of the RPC, which performs the actual
            // channel reset.
            //
            rmStatus = NV_OK;

            NV_RM_RPC_CONTROL(pGpu,
                              rmCtrlParams.hClient,
                              rmCtrlParams.hObject,
                              rmCtrlParams.cmd,
                              rmCtrlParams.pParams,
                              rmCtrlParams.paramsSize,
                              rmStatus);
            if (rmStatus != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR,
                          "RM control call to reset channel failed, rmStatus 0x%x\n",
                          rmStatus);
                DBG_BREAKPOINT();
            }
        }

        //
        // Fill the exception info in FifoData, and mark this exception as
        // notified to prevent a duplicate notification in vgpuService once
        // the PF has completed its RC.
        //
        kchannelFillMmuExceptionInfo(pKernelChannel, pMmuExceptionData);

        if (IS_GR(engDesc) && pMmuExceptionData->bGpc)
        {
            KernelGraphicsContext *pKernelGraphicsContext;

            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
                kgrctxFromKernelChannel(pKernelChannel,
                                        &pKernelGraphicsContext));
            if (rmStatus == NV_OK)
            {
                kgrctxRecordMmuFault(pGpu, pKernelGraphicsContext,
                    kgmmuGetFaultInfoFromFaultPckt_HAL(pKernelGmmu, pParsedFaultEntry),
                    pParsedFaultEntry->mmuFaultAddress,
                    pParsedFaultEntry->mmuFaultType,
                    pParsedFaultEntry->mmuFaultAccessType);
            }
        }

        return rmStatus;
    }

    rmStatus = kgmmuServiceChannelMmuFault_HAL(pGpu, pKernelGmmu, pParsedFaultEntry,
                                               pMmuExceptionData, pKernelChannel);
    if (NV_OK != rmStatus)
    {
        NV_PRINTF(LEVEL_ERROR, "Could not service MMU fault for chid %x\n",
                  kchannelGetDebugTag(pKernelChannel));
    }

    return rmStatus;
}

NvU32
kgmmuGetFaultInfoFromFaultPckt_GV100
(
    KernelGmmu             *pKernelGmmu,
    MMU_FAULT_BUFFER_ENTRY *pParsedFaultEntry
)
{
    NvU32 faultInfo = 0;

    if (pParsedFaultEntry == NULL)
        return 0;

    //
    // This is fragile: nothing protects us against the bit positions
    // changing. The bits are copied anyway, relying on the
    // NV_PFB_PRI_MMU_FAULT_INFO layout staying consistent.
    //
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _FAULT_TYPE, pParsedFaultEntry->mmuFaultType, faultInfo);
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _CLIENT, pParsedFaultEntry->mmuFaultClientId, faultInfo);
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _ACCESS_TYPE, pParsedFaultEntry->mmuFaultAccessType, faultInfo);
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _CLIENT_TYPE, pParsedFaultEntry->mmuFaultClientType, faultInfo);
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _GPC_ID, pParsedFaultEntry->mmuFaultGpcId, faultInfo);
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _PROTECTED_MODE, pParsedFaultEntry->bFaultInProtectedMode, faultInfo);
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _REPLAYABLE_FAULT, pParsedFaultEntry->bFaultTypeReplayable, faultInfo);
    faultInfo = FLD_SET_DRF_NUM(_PFB_PRI, _MMU_FAULT_INFO, _VALID, pParsedFaultEntry->bFaultEntryValid, faultInfo);

    return faultInfo;
}

static NV_STATUS
_kgmmuCreateExceptionDataFromPriv_GV100
(
    OBJGPU                  *pGpu,
    KernelGmmu              *pKernelGmmu,
    MMU_FAULT_BUFFER_ENTRY  *pParsedFaultEntry,
    FIFO_MMU_EXCEPTION_DATA *pMmuExceptionData
)
{
    NV_STATUS rmStatus = NV_OK;
    NvU32 regDataLo, regDataHi;
    NvU64 tempLo = 0;
    NvU64 tempHi = 0;
    NV_ASSERT_OR_RETURN(pParsedFaultEntry != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pMmuExceptionData != NULL, NV_ERR_INVALID_ARGUMENT);
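
    //
    // The PRI path mirrors the fault buffer packet layout: the same instance
    // pointer, fault address, and fault info fields are snapped into the
    // NV_PFB_PRI_MMU_FAULT_* registers, so the parsed entry is assembled
    // field by field from those registers instead of from a buffer entry.
    //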
    // FaultInstPtr
    {
        kgmmuReadMmuFaultInstHiLo_HAL(pGpu, pKernelGmmu, &regDataHi, &regDataLo);

        tempLo = DRF_VAL(_PFB_PRI, _MMU_FAULT_INST_LO, _ADDR, regDataLo);
        tempHi = DRF_VAL(_PFB_PRI, _MMU_FAULT_INST_HI, _ADDR, regDataHi);
        pParsedFaultEntry->mmuFaultInstBlock.address = tempLo + (tempHi << DRF_SIZE(NV_PFB_PRI_MMU_FAULT_INST_LO_ADDR));
        pParsedFaultEntry->mmuFaultInstBlock.address <<= DRF_BASE(NV_PFB_PRI_MMU_FAULT_INST_LO_ADDR);
        pParsedFaultEntry->mmuFaultInstBlock.aperture = DRF_VAL(_PFB_PRI, _MMU_FAULT_INST_LO, _APERTURE, regDataLo);
        pParsedFaultEntry->mmuFaultInstBlock.gfid = GPU_GFID_PF;
        pParsedFaultEntry->mmuFaultEngineId = DRF_VAL(_PFB_PRI, _MMU_FAULT_INST_LO, _ENGINE_ID, regDataLo);
    }

    // Fault Addr
    {
        kgmmuReadMmuFaultAddrHiLo_HAL(pGpu, pKernelGmmu, &regDataHi, &regDataLo);

        tempLo = DRF_VAL(_PFB_PRI, _MMU_FAULT_ADDR_LO, _ADDR, regDataLo);
        tempHi = DRF_VAL(_PFB_PRI, _MMU_FAULT_ADDR_HI, _ADDR, regDataHi);

        pParsedFaultEntry->mmuFaultAddress = (tempLo + (tempHi << DRF_SIZE(NV_PFB_PRI_MMU_FAULT_ADDR_LO_ADDR))) <<
                                             (DRF_BASE(NV_PFB_PRI_MMU_FAULT_ADDR_LO_ADDR));

        kgmmuSignExtendFaultAddress_HAL(pGpu, pKernelGmmu, &pParsedFaultEntry->mmuFaultAddress);
    }

    // FaultInfo
    {
        regDataLo = kgmmuReadMmuFaultInfo_HAL(pGpu, pKernelGmmu);

        rmStatus = kgmmuGetFaultType_HAL(pGpu, pKernelGmmu, DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _FAULT_TYPE, regDataLo),
                                         &pParsedFaultEntry->mmuFaultType);
        NV_ASSERT_OR_RETURN(rmStatus == NV_OK, rmStatus);
        pParsedFaultEntry->mmuFaultAccessType = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _ACCESS_TYPE, regDataLo);
        pParsedFaultEntry->mmuFaultClientId = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _CLIENT, regDataLo);
        pParsedFaultEntry->mmuFaultClientType = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _CLIENT_TYPE, regDataLo);
        pParsedFaultEntry->mmuFaultGpcId = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _GPC_ID, regDataLo);
        pParsedFaultEntry->bFaultEntryValid = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _VALID, regDataLo);
        pParsedFaultEntry->bFaultInProtectedMode = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _PROTECTED_MODE, regDataLo);
        pParsedFaultEntry->bFaultTypeReplayable = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _REPLAYABLE_FAULT, regDataLo);
        pParsedFaultEntry->bReplayableFaultEn = DRF_VAL(_PFB_PRI, _MMU_FAULT_INFO, _REPLAYABLE_FAULT_EN, regDataLo);
    }

    pMmuExceptionData->addrLo = (NvU32)(pParsedFaultEntry->mmuFaultAddress & 0xFFFFFFFFUL);
    pMmuExceptionData->addrHi = (NvU32)(pParsedFaultEntry->mmuFaultAddress >> 32);
    pMmuExceptionData->faultType = pParsedFaultEntry->mmuFaultType;
    pMmuExceptionData->clientId = pParsedFaultEntry->mmuFaultClientId;
    if (pParsedFaultEntry->mmuFaultClientType == NV_PFAULT_MMU_CLIENT_TYPE_GPC)
    {
        pMmuExceptionData->bGpc = NV_TRUE;
        pMmuExceptionData->gpcId = pParsedFaultEntry->mmuFaultGpcId;
    }
    else
    {
        pMmuExceptionData->bGpc = NV_FALSE;
        pMmuExceptionData->gpcId = 0;
    }
    pMmuExceptionData->accessType = pParsedFaultEntry->mmuFaultAccessType;
    pMmuExceptionData->faultEngineId = pParsedFaultEntry->mmuFaultEngineId;

    return rmStatus;
}

/**
 * @brief Resets the REPLAYABLE/NON_REPLAYABLE fault error status
 *
 * @param[in] faultBufType  Fault buffer type
 *
 * @returns Register value that resets the corresponding error status field
 */
static inline NvU32
_kgmmuResetFaultBufferError_GV100
(
    NvU32 faultBufType
)
{
    NvU32 faultBufStatus = 0;
    NV_ASSERT_OR_RETURN((faultBufType < NUM_FAULT_BUFFERS), NV_ERR_INVALID_ARGUMENT);
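
    //
    // The error bits in NV_PFB_PRI_MMU_FAULT_STATUS appear to be
    // acknowledged by writing the _RESET field value back (write-1-to-clear
    // style, per the _RESET encoding); the caller ORs the returned value
    // into its status write (see _kgmmuServiceBar2Faults_GV100).
    //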
    if (faultBufType == REPLAYABLE_FAULT_BUFFER)
        return FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _REPLAYABLE_ERROR, _RESET, faultBufStatus);
    else
        return FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _NON_REPLAYABLE_ERROR, _RESET, faultBufStatus);
}

/**
 * @brief Handles all BAR2 faults, including fault buffer and
 *        access counter BAR2 errors.
 */
static NV_STATUS
_kgmmuServiceBar2Faults_GV100
(
    OBJGPU     *pGpu,
    KernelGmmu *pKernelGmmu,
    NvU32       faultStatus,
    NvU32       mmuFaultClientId
)
{
    NV_STATUS rmStatus = NV_OK;
    OBJUVM *pUvm = GPU_GET_UVM(pGpu);
    NvU32 i;

    NvBool replayableFaultError = FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_STATUS,
                                               _REPLAYABLE_ERROR, _SET, faultStatus);
    NvBool nonReplayableFaultError = FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_STATUS,
                                                  _NON_REPLAYABLE_ERROR, _SET, faultStatus);
    NvBool accessCntrError = kgmmuTestAccessCounterWriteNak_HAL(pGpu, pKernelGmmu);

    NvBool bVidmemAccessBitBufError = kgmmuTestVidmemAccessBitBufferError_HAL(pGpu, pKernelGmmu, faultStatus);
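
    //
    // Recovery sequence: quiesce every BAR2 client that reported an error
    // (fault buffers, access counters, vidmem access bit buffer), rebind
    // BAR2 so its mappings are valid again, then re-enable those clients and
    // acknowledge the error status bits.
    //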
    //
    // If this is an MMU fault buffer BAR2 error, disable and re-enable the
    // fault buffer after the BAR2 rebind. If both fault buffers take a BAR2
    // fault, RM sees two error interrupts, so only one buffer is handled
    // here; the other is reset when the next interrupt arrives.
    //
    NvU32 faultBufType = replayableFaultError ? REPLAYABLE_FAULT_BUFFER :
                                                NON_REPLAYABLE_FAULT_BUFFER;
    if (replayableFaultError || nonReplayableFaultError)
    {
        rmStatus = kgmmuDisableFaultBuffer_HAL(pGpu, pKernelGmmu, faultBufType, NV_TRUE, GPU_GFID_PF);
        if (rmStatus != NV_OK)
            return rmStatus;
    }

    // Access counter BAR2 fault handling
    if (accessCntrError)
    {
        for (i = 0; i < pUvm->accessCounterBufferCount; i++)
        {
            rmStatus = uvmDisableAccessCntr_HAL(pGpu, pUvm, i, NV_TRUE);
            if (rmStatus != NV_OK)
                return rmStatus;
        }
    }

    // Vidmem access bit buffer BAR2 fault handling
    if (bVidmemAccessBitBufError)
    {
        kgmmuDisableVidmemAccessBitBuf_HAL(pGpu, pKernelGmmu);
    }

    // Do a rebind of BAR2
    kbusBindBar2_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu), BAR2_MODE_VIRTUAL);

    kgmmuServiceMthdBuffFaultInBar2Fault(pGpu, pKernelGmmu);

    if (replayableFaultError || nonReplayableFaultError)
    {
        rmStatus = kgmmuEnableFaultBuffer_HAL(pGpu, pKernelGmmu, faultBufType, NV_TRUE, GPU_GFID_PF);
        if (rmStatus != NV_OK)
            return rmStatus;

        faultStatus |= _kgmmuResetFaultBufferError_GV100(faultBufType);
    }

    if (accessCntrError)
    {
        for (i = 0; i < pUvm->accessCounterBufferCount; i++)
        {
            rmStatus = uvmEnableAccessCntr_HAL(pGpu, pUvm, i, NV_TRUE);
            if (rmStatus != NV_OK)
                return rmStatus;
        }

        kgmmuClearAccessCounterWriteNak_HAL(pGpu, pKernelGmmu);
    }

    if (bVidmemAccessBitBufError)
    {
        rmStatus = kgmmuEnableVidmemAccessBitBuf_HAL(pGpu, pKernelGmmu);
        if (rmStatus != NV_OK)
            return rmStatus;
    }

    return rmStatus;
}

static NV_STATUS
_kgmmuHandleReplayablePrivFault_GV100
(
    OBJGPU                 *pGpu,
    KernelGmmu             *pKernelGmmu,
    MMU_FAULT_BUFFER_ENTRY *pParsedFaultEntry
)
{
    GMMU_FAULT_CANCEL_INFO cancelInfo;
    NvU32 faultStatus;
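
    //
    // RM does not replay faults itself, so the fault is canceled in a
    // targeted fashion; the cancel brings it back as a non-replayable fault,
    // which then takes the normal RC path (see the fault-buffer-disabled
    // handling in kgmmuServicePriFaults_GV100).
    //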
    // Fill in the structure used by the cancel routine to cancel the fault
    cancelInfo.clientId = pParsedFaultEntry->mmuFaultClientId;
    cancelInfo.gpcId = pParsedFaultEntry->mmuFaultGpcId;
    cancelInfo.instBlock = pParsedFaultEntry->mmuFaultInstBlock;

    //
    // Clear the VALID bit to ensure the non-replayable fault shows up when
    // the replayable fault is canceled
    //
    faultStatus = kgmmuReadMmuFaultStatus_HAL(pGpu, pKernelGmmu, GPU_GFID_PF);
    faultStatus = FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _VALID, _CLEAR, faultStatus);
    kgmmuWriteMmuFaultStatus_HAL(pGpu, pKernelGmmu, faultStatus);

    return kgmmuFaultCancelTargeted_HAL(pGpu, pKernelGmmu, &cancelInfo);
}

/**
 * @brief Handles PRI MMU faults
 *        PRI MMU faults cover BAR1/BAR2 and physical faults, and capture
 *        faults in the event that the fault buffers are disabled.
 *
 * @param[in] pGpu         OBJGPU pointer
 * @param[in] pKernelGmmu  KernelGmmu pointer
 *
 * @returns NV_OK on success, or the first error encountered while servicing
 *          the fault.
 */
NV_STATUS
kgmmuServicePriFaults_GV100
(
    OBJGPU     *pGpu,
    KernelGmmu *pKernelGmmu
)
{
    NV_STATUS rmStatus = NV_OK;
    KernelRc *pKernelRC = GPU_GET_KERNEL_RC(pGpu);
    KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu);
    NvU32 faultStatus = kgmmuReadMmuFaultStatus_HAL(pGpu, pKernelGmmu, GPU_GFID_PF);
    PEVENTNOTIFICATION *ppEventNotification = NULL;
    MMU_FAULT_BUFFER_ENTRY parsedFaultEntry = {{0}, 0};
    FIFO_MMU_EXCEPTION_DATA mmuExceptionData = {0};
    NvBool bBarFault = NV_FALSE;
    NvU32 vfFaultType = NV2080_CTRL_CMD_GPU_HANDLE_VF_PRI_FAULT_TYPE_INVALID;
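
    //
    // Dispatch order below: BAR1 fault, BAR2 fault, physical fault, unbound
    // instance block, then a fallback path for faults snapped in the priv
    // registers while a fault buffer is disabled.
    //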
    if (FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _VALID, _SET, faultStatus))
    {
        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, _kgmmuCreateExceptionDataFromPriv_GV100(pGpu, pKernelGmmu, &parsedFaultEntry, &mmuExceptionData));

        //
        // BAR faults indicate incorrect BAR mappings. They are usually fatal,
        // and a full-chip reset typically follows.
        //
        if (kgmmuIsFaultEngineBar1_HAL(pKernelGmmu, parsedFaultEntry.mmuFaultEngineId))
        {
            vfFaultType = NV2080_CTRL_CMD_GPU_HANDLE_VF_PRI_FAULT_TYPE_BAR1;

            // Do a rebind of BAR1
            kbusBar1InstBlkBind_HAL(pGpu, pKernelBus);
            NV_PRINTF(LEVEL_ERROR, "BAR1 MMU Fault\n");
            kgmmuPrintFaultInfo_HAL(pGpu, pKernelGmmu, parsedFaultEntry.mmuFaultEngineId, &mmuExceptionData);
            krcBreakpoint(GPU_GET_KERNEL_RC(pGpu));
        }
        else if (kgmmuIsFaultEngineBar2_HAL(pKernelGmmu, parsedFaultEntry.mmuFaultEngineId) &&
                 !kgmmuIsP2PUnboundInstFault_HAL(pKernelGmmu, parsedFaultEntry.mmuFaultType, parsedFaultEntry.mmuFaultClientId))
        {
            vfFaultType = NV2080_CTRL_CMD_GPU_HANDLE_VF_PRI_FAULT_TYPE_BAR2;

            NV_PRINTF(LEVEL_ERROR, "BAR2 MMU Fault\n");
            kgmmuPrintFaultInfo_HAL(pGpu, pKernelGmmu, parsedFaultEntry.mmuFaultEngineId, &mmuExceptionData);
            rmStatus = _kgmmuServiceBar2Faults_GV100(pGpu, pKernelGmmu, faultStatus, parsedFaultEntry.mmuFaultClientId);
            NV_ASSERT(rmStatus == NV_OK);
            krcBreakpoint(GPU_GET_KERNEL_RC(pGpu));

            bBarFault = NV_TRUE;
        }
        else if (kgmmuIsFaultEnginePhysical_HAL(pKernelGmmu, parsedFaultEntry.mmuFaultEngineId))
        {
            vfFaultType = NV2080_CTRL_CMD_GPU_HANDLE_VF_PRI_FAULT_TYPE_PHYSICAL;
            //
            // This fault usually means VPR or out-of-bounds physical
            // accesses. There is not much we can do except notify clients
            // and wait for a TDR.
            //
            NV_PRINTF(LEVEL_ERROR, "Physical MMU fault\n");
            kgmmuPrintFaultInfo_HAL(pGpu, pKernelGmmu, parsedFaultEntry.mmuFaultEngineId, &mmuExceptionData);
            krcBreakpoint(GPU_GET_KERNEL_RC(pGpu));

            gpuNotifySubDeviceEvent(pGpu,
                NV2080_NOTIFIERS_PHYSICAL_PAGE_FAULT, NULL, 0, 0, 0);
        }
        else if (mmuExceptionData.faultType == NV_PFAULT_FAULT_TYPE_UNBOUND_INST_BLOCK)
        {
            NV_PRINTF(LEVEL_ERROR, "Unbound Instance MMU fault\n");
            kgmmuPrintFaultInfo_HAL(pGpu, pKernelGmmu, parsedFaultEntry.mmuFaultEngineId, &mmuExceptionData);
            krcBreakpoint(GPU_GET_KERNEL_RC(pGpu));
            vfFaultType = NV2080_CTRL_CMD_GPU_HANDLE_VF_PRI_FAULT_TYPE_UNBOUND_INSTANCE;
            NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgmmuServiceUnboundInstBlockFault_HAL(pGpu, pKernelGmmu, NV_PTR_TO_NvP64(&parsedFaultEntry),
                                                                                     &mmuExceptionData));
        }
        else
        {
            //
            // Check whether any fault buffer is disabled. If so, service the
            // fault snapped in the priv register.
            // Non-replayable faults: serviced as fault buffer faults, i.e.
            // the channel will be RCed.
            // Replayable faults: canceled, as RM does not support replaying
            // such faults. Canceling brings them back as non-replayable
            // faults.
            //
            NvBool bReplayableBufDis = FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_BUFFER_SIZE, _ENABLE, _FALSE,
                kgmmuReadMmuFaultBufferSize_HAL(pGpu, pKernelGmmu, REPLAYABLE_FAULT_BUFFER, GPU_GFID_PF));
            NvBool bNonReplayBufDis = FLD_TEST_DRF(_PFB_PRI, _MMU_FAULT_BUFFER_SIZE, _ENABLE, _FALSE,
                kgmmuReadMmuFaultBufferSize_HAL(pGpu, pKernelGmmu, NON_REPLAYABLE_FAULT_BUFFER, GPU_GFID_PF));
            if (bReplayableBufDis || bNonReplayBufDis)
            {
                if (parsedFaultEntry.bFaultEntryValid)
                {
                    // Replayable faults snapped in privs are handled differently, so check if replayable
                    if (parsedFaultEntry.bFaultTypeReplayable)
                    {
                        NV_PRINTF(LEVEL_ERROR,
                                  "MMU Fault : Replayable fault with fault-buffer disabled. Initiating cancel\n");
                        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, _kgmmuHandleReplayablePrivFault_GV100(pGpu, pKernelGmmu, &parsedFaultEntry));
                    }
                    else
                    {
                        kgmmuPrintFaultInfo_HAL(pGpu, pKernelGmmu, parsedFaultEntry.mmuFaultEngineId,
                                                &mmuExceptionData);
                        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgmmuServiceMmuFault_HAL(pGpu, pKernelGmmu, NV_PTR_TO_NvP64(&parsedFaultEntry),
                                                                                    &mmuExceptionData));
                    }
                }
            }
            else
            {
                kgmmuPrintFaultInfo_HAL(pGpu, pKernelGmmu, parsedFaultEntry.mmuFaultEngineId, &mmuExceptionData);
                krcBreakpoint(GPU_GET_KERNEL_RC(pGpu));
            }
        }

        // In case of SR-IOV vgpu, the fault needs to be cleared from the host side.
        if (IS_VIRTUAL_WITH_SRIOV(pGpu) && (vfFaultType != NV2080_CTRL_CMD_GPU_HANDLE_VF_PRI_FAULT_TYPE_INVALID))
            rmStatus = kgmmuServiceVfPriFaults(pGpu, pKernelGmmu, vfFaultType);

        // Clear the VALID bit to indicate we have seen this.
        faultStatus = FLD_SET_DRF(_PFB_PRI, _MMU_FAULT_STATUS, _VALID, _CLEAR, faultStatus);
        kgmmuWriteMmuFaultStatus_HAL(pGpu, pKernelGmmu, faultStatus);

        if (bBarFault && pKernelRC != NULL && pKernelRC->bRcOnBar2Fault)
        {
            pGpu->setProperty(pGpu, PDB_PROP_GPU_IN_FATAL_ERROR, NV_TRUE);
            (void)kfifoRecoverAllChannels_HAL(pGpu, GPU_GET_KERNEL_FIFO(pGpu), GPU_GFID_PF);
        }

        if ((NV_OK == CliGetEventNotificationList(pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].hFaultBufferClient,
                pKernelGmmu->mmuFaultBuffer[GPU_GFID_PF].hFaultBufferObject, NULL, &ppEventNotification)) && ppEventNotification != NULL)
        {
            rmStatus = notifyEvents(pGpu, *ppEventNotification, NVC369_NOTIFIER_MMU_FAULT_NON_REPLAYABLE_IN_PRIV,
                                    0, 0, NV_OK, NV_OS_WRITE_THEN_AWAKEN);
        }

        // Send event notifier in response to MMU Poison Fault.
        if ((parsedFaultEntry.mmuFaultType == fault_poison) &&
            gpuIsGlobalPoisonFuseEnabled(pGpu))
        {
            NV_ERROR_CONT_LOCATION loc = {0};

            // Error containment location information
            loc.locType = NV_ERROR_CONT_LOCATION_TYPE_NONE;

            // Generate Error Containment Xid, send NV2080_NOTIFIER*, mark device for Reset or Drain And Reset
            NV_ASSERT_OK_OR_RETURN(
                gpuUpdateErrorContainmentState_HAL(pGpu,
                                                   NV_ERROR_CONT_ERR_ID_E13_MMU_POISON, // Error type
                                                   loc,                                 // Loc
                                                   NULL));                              // RC Error Code
        }
    }
    return rmStatus;
}