1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2013-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "gpu/gpu.h"
25 #include "gpu/mem_mgr/mem_mgr.h"
26 #include "mem_mgr/gpu_vaspace.h"
27 #include "gpu/mmu/kern_gmmu.h"
28 #include "kernel/gpu/nvlink/kernel_nvlink.h"
29 #include "gpu/mem_mgr/mem_desc.h"
30 #include "nvRmReg.h"  // NV_REG_STR_RM_*
31 
32 #include "mmu/gmmu_fmt.h"
33 #include "mmu/mmu_fmt.h"
34 
35 /*!
36  * @file
 * @brief The MMU_WALK_CALLBACKS table g_gmmuWalkCallbacks and the callback
 *        function implementations it references.
39  */
40 
41 /**
42  * See @ref MMU_WALK_FILL_STATE
43  */
44 #if NV_PRINTF_STRINGS_ALLOWED
45 const char *g_gmmuFillStateStrings[]           = { "INVALID", "SPARSE", "NV4K" };
46 const char *g_gmmuUVMMirroringDirStrings[]     = { "[User Root] ", "[Mirrored Root] " };
47 #else // NV_PRINTF_STRINGS_ALLOWED
48 static const char _gmmuFillStateString[]       = "XS4";
49 static const char _gmmuUVMMirroringDirString[] = "UM";
50 #endif // NV_PRINTF_STRINGS_ALLOWED
51 
52 static PMEMORY_DESCRIPTOR
53 _gmmuMemDescCacheCreate(MMU_WALK_USER_CTX *pUserCtx,
54                         MEMORY_DESCRIPTOR *pMemDesc,
55                         NvU32 memSize);
56 
57 static PMEMORY_DESCRIPTOR
58 _gmmuMemDescCacheAlloc(MMU_WALK_USER_CTX *pUserCtx);
59 
60 /*!
61  * Utility function to decide if a level should be mirrored.
62  * Used by MMU callbacks.
63  */
64 static NvBool NV_FORCEINLINE
65 _mirrorLevel
66 (
67     MMU_WALK_USER_CTX   *pUserCtx,
68     const MMU_FMT_LEVEL *pLevelFmt
69 )
70 {
71     return (pLevelFmt == pUserCtx->pGpuState->pFmt->pRoot) && pUserCtx->pGVAS->bIsMirrored;
72 }
73 
74 /*!
75  * Utility function to get the number of Page Dirs to loop over.
76  * Used by MMU callbacks.
77  */
78 static NvU8 NV_FORCEINLINE
79 _getMaxPageDirs(NvBool bMirror)
80 {
81     return bMirror ? GMMU_MAX_PAGE_DIR_INDEX_COUNT :
82                      GMMU_MAX_PAGE_DIR_INDEX_COUNT - 1;
83 }
84 
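/*!
 * Zero-fill an entire page level allocation through the memory manager so the
 * GMMU never fetches stale or uninitialized entries from it.
 */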
85 static NV_STATUS
86 _gmmuScrubMemDesc
87 (
88     OBJGPU              *pGpu,
89     MEMORY_DESCRIPTOR   *pMemDesc
90 )
91 {
92     TRANSFER_SURFACE dest = {0};
93 
94     dest.pMemDesc = pMemDesc;
95     dest.offset = 0;
96 
97     NV_ASSERT_OK_OR_RETURN(memmgrMemSet(GPU_GET_MEMORY_MANAGER(pGpu), &dest, 0,
98                                         (NvU32)memdescGetSize(pMemDesc),
99                                         TRANSFER_FLAGS_NONE));
100 
101     return NV_OK;
102 }
103 
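/*!
 * MMU walker callback that allocates (or grows) the backing memory of a page
 * level.
 *
 * Chooses the size, alignment, aperture, and attributes for the level based on
 * the VA space flags, with optional fallback between vidmem and sysmem.
 * Partial page tables are shrunk when the format and VA space allow it,
 * sub-4K vidmem allocations may be packed into a shared page via the memdesc
 * cache, and the root page directory is mirrored into a kernel copy for UVM
 * mirrored VA spaces. The root allocation is also scrubbed to zero.
 */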
104 static NV_STATUS
105 _gmmuWalkCBLevelAlloc
106 (
107     MMU_WALK_USER_CTX       *pUserCtx,
108     const MMU_FMT_LEVEL     *pLevelFmt,
109     const NvU64              vaBase,
110     const NvU64              vaLimit,
111     const NvBool             bTarget,
112     MMU_WALK_MEMDESC       **ppMemDesc,
113     NvU32                   *pMemSize,
114     NvBool                  *pBChanged
115 )
116 {
117     OBJGPU              *pGpu     = pUserCtx->pGpu;
118     KernelGmmu          *pKernelGmmu = GPU_GET_KERNEL_GMMU(pGpu);
119     OBJGVASPACE         *pGVAS    = pUserCtx->pGVAS;
120     const GVAS_BLOCK    *pBlock   = pUserCtx->pBlock;
121     const GMMU_FMT      *pFmt     = pUserCtx->pGpuState->pFmt;
122     MEMORY_DESCRIPTOR   *pMemDesc[GMMU_MAX_PAGE_DIR_INDEX_COUNT] = {NULL};
123     const NvU32          minMemSize = (mmuFmtVirtAddrToEntryIndex(pLevelFmt, vaLimit) + 1) *
124                                       pLevelFmt->entrySize;
125     NvU32                newMemSize;
126     NV_STATUS            status   = NV_OK;
127     NvU32                alignment;
128     NvU32                aperture;
129     NvU32                attr;
130     NvU64                memDescFlags = MEMDESC_FLAGS_NONE;
131     NvU32                memPoolListCount = 0;
132     NvU32                memPoolList[4];
133     NvBool               bAllowSysmem;
134     NvBool               bPacked     = NV_FALSE;
135     NvBool               bPartialTbl = NV_FALSE;
136     NvBool               bPmaManaged = !!(pGVAS->flags & VASPACE_FLAGS_PTETABLE_PMA_MANAGED);
137     NvBool               bMirror     = _mirrorLevel(pUserCtx, pLevelFmt);
138     NvU8                 maxPgDirs   = _getMaxPageDirs(bMirror);
139     NvU8                 i = 0, j = 0;
140 
141     // Abort early if level is not targeted or already sufficiently sized.
142     if (((NULL == *ppMemDesc) && !bTarget) ||
143         ((NULL != *ppMemDesc) && (minMemSize <= *pMemSize)))
144     {
145         return NV_OK;
146     }
147 
148     // Check if this level is the root page directory.
149     if (pLevelFmt == pFmt->pRoot)
150     {
151         newMemSize = kgmmuGetPDBAllocSize_HAL(pKernelGmmu, pLevelFmt, pGVAS->vaLimitInternal);
152 
        // TODO: PDB alignment.
154         alignment = RM_PAGE_SIZE;
155 
156         // Determine level aperture and memory attributes.
157         if (pGVAS->flags & VASPACE_FLAGS_BAR)
158         {
159             aperture     = kgmmuGetPDEBAR1Aperture(pKernelGmmu);
160             attr         = kgmmuGetPDEBAR1Attr(pKernelGmmu);
161             bAllowSysmem = !FLD_TEST_DRF(_REG_STR_RM, _INST_LOC, _BAR_PDE, _VID,
162                                          pGpu->instLocOverrides);
163         }
164         else
165         {
166             aperture     = kgmmuGetPDEAperture(pKernelGmmu);
167             attr         = kgmmuGetPDEAttr(pKernelGmmu);
168             bAllowSysmem = !FLD_TEST_DRF(_REG_STR_RM, _INST_LOC, _PDE, _VID,
169                                          pGpu->instLocOverrides);
170         }
171 
172         // Default aperture.
173         memPoolList[memPoolListCount++] = aperture;
174 
175         // Fallback to sysmem if allowed.
176         if (bAllowSysmem &&
177             (aperture != ADDR_SYSMEM) && !(pGVAS->flags & VASPACE_FLAGS_BAR))
178         {
179             memPoolList[memPoolListCount++] = ADDR_SYSMEM;
180         }
181     }
182     else
183     {
184         const MMU_FMT_LEVEL       *pParent;
185         const GMMU_FMT_PDE_MULTI  *pPdeMulti = pFmt->pPdeMulti;
186         const GMMU_FMT_PDE        *pPde;
187         NvU32                      subLevel;
188 
189         // Find the level's parent format.
190         pParent = mmuFmtFindLevelParent(pFmt->pRoot, pLevelFmt, &subLevel);
191         NV_ASSERT_OR_RETURN(NULL != pParent, NV_ERR_INVALID_ARGUMENT);
192 
193         // Get the alignment from the parent PDE address shift.
194         pPde = gmmuFmtGetPde(pFmt, pParent, subLevel);
195 
196         if (pPde->version == GMMU_FMT_VERSION_3)
197         {
198             alignment = NVBIT(pPde->fldAddr.shift);
199         }
200         else
201         {
202             alignment = NVBIT(pPde->fldAddrSysmem.shift);
203         }
204 
205         // Initially assume full size.
206         newMemSize = mmuFmtLevelSize(pLevelFmt);
207 
208         // Shrink size if partial page tables are supported.
209         if ((pGVAS->flags & VASPACE_FLAGS_MINIMIZE_PTETABLE_SIZE) &&
210             (pParent->numSubLevels > 1) &&
211             nvFieldIsValid32(&pPdeMulti->fldSizeRecipExp))
212         {
213             NvU32  i;
214             //
215             // Only a fixed set of PDE ranges are allowed to have partial size.
216             // Partial VA holes of these PDEs are blocked at VAS creation time.
217             // See @ref gvaspaceConstructHal_IMPL for details.
218             //
219             for (i = 0; i < pGVAS->numPartialPtRanges; ++i)
220             {
221                 if ((vaBase >= pGVAS->partialPtVaRangeBase[i]) &&
222                     (vaBase <= (pGVAS->partialPtVaRangeBase[i] +
223                                 pGVAS->partialPtVaRangeSize - 1)))
224                 {
225                     const NvU32 recipExpMax = pPdeMulti->fldSizeRecipExp.maskPos >>
226                                               pPdeMulti->fldSizeRecipExp.shift;
227                     const NvU32 fracMemSize = nvNextPow2_U32(minMemSize);
228                     const NvU32 recipExpTgt = BIT_IDX_32(newMemSize / fracMemSize);
229                     const NvU32 recipExp    = NV_MIN(recipExpMax, recipExpTgt);
230                     newMemSize >>= recipExp;
231                     bPartialTbl = NV_TRUE;
232                     break;
233                 }
234             }
235         }
236 
237         // New size must satisfy the minimum size.
238         NV_ASSERT(newMemSize >= minMemSize);
        // The new size must be larger than the old size; otherwise we would have aborted earlier.
240         NV_ASSERT(newMemSize > *pMemSize);
241 
242         // Determine level aperture and memory attributes.
243         if (pGVAS->flags & VASPACE_FLAGS_BAR)
244         {
245             aperture     = kgmmuGetPTEBAR1Aperture(pKernelGmmu);
246             attr         = kgmmuGetPTEBAR1Attr(pKernelGmmu);
247             bAllowSysmem = !FLD_TEST_DRF(_REG_STR_RM, _INST_LOC, _BAR_PTE, _VID,
248                                          pGpu->instLocOverrides);
249         }
250         else
251         {
252             aperture     = kgmmuGetPTEAperture(pKernelGmmu);
253             attr         = kgmmuGetPTEAttr(pKernelGmmu);
254             bAllowSysmem = !FLD_TEST_DRF(_REG_STR_RM, _INST_LOC, _PTE, _VID,
255                                          pGpu->instLocOverrides);
256         }
257 
258         //
        // BAR PDEs/PTEs are not allowed in sysmem since that can cause deadlocks
        // during PCIe transactions.
        // PMU PDEs/PTEs must be in vidmem so that the PMU can access virtually
        // mapped memory during GC6 exit.
263         //
264         bAllowSysmem = bAllowSysmem &&
265                        !(pGVAS->flags & VASPACE_FLAGS_BAR) &&
266                        !(pGVAS->flags & VASPACE_FLAGS_PMU);
267 
268         // Prefer sysmem if requested and allowed.
269         if (bAllowSysmem &&
270             (NULL != pBlock && pBlock->flags.bPreferSysmemPageTables))
271         {
272             memPoolList[memPoolListCount++] = ADDR_SYSMEM;
273         }
274 
275         // Default aperture.
276         memPoolList[memPoolListCount++] = aperture;
277 
278         // Fallback to sysmem if requested and allowed.
279         if (bAllowSysmem &&
280             (pGVAS->flags & VASPACE_FLAGS_RETRY_PTE_ALLOC_IN_SYS))
281         {
282             memPoolList[memPoolListCount++] = ADDR_SYSMEM;
283         }
284     }
285 
    // Terminate memPoolList with an end-of-list entry.
287     memPoolList[memPoolListCount++] = ADDR_UNKNOWN;
288     NV_ASSERT(memPoolListCount <= NV_ARRAY_ELEMENTS(memPoolList));
289 
290     // MEMDESC flags
291     memDescFlags = MEMDESC_FLAGS_LOCKLESS_SYSMEM_ALLOC  |
292                    MEMDESC_FLAGS_PAGE_SIZE_ALIGN_IGNORE;
293 
294     if (pGVAS->flags & VASPACE_FLAGS_ALLOW_PAGES_IN_PHYS_MEM_SUBALLOCATOR)
295     {
296         memDescFlags |= MEMDESC_FLAGS_OWNED_BY_CURRENT_DEVICE;
297     }
298 
299     // Create the level memdesc.
300     for (i = 0; i < maxPgDirs; i++)
301     {
302         MEMORY_DESCRIPTOR *pMemDescTemp;
303 
304         status = memdescCreate(&pMemDescTemp, pGpu,
305                                (((newMemSize < RM_PAGE_SIZE) && !bPartialTbl && !bPmaManaged) ?
306                                 RM_PAGE_SIZE : newMemSize),
307                                alignment,
308                                NV_TRUE,
309                                ADDR_UNKNOWN,
310                                attr,
311                                memDescFlags);
312         NV_ASSERT_OR_GOTO(NV_OK == status, done);
313 
314         // Page levels always use 4KB swizzle.
315         memdescSetPageSize(pMemDescTemp, AT_GPU, RM_PAGE_SIZE);
316 
317         //
        // Allocate the page level memory from the reserved pool if the aperture
        // is vidmem and the page tables are PMA-managed. Otherwise, allocate it
        // directly; that path is identical for vidmem and sysmem.
321         //
322         while (memPoolList[j] != ADDR_UNKNOWN)
323         {
324             memdescSetAddressSpace(pMemDescTemp, memPoolList[j]);
325             switch (memPoolList[j])
326             {
327                 case ADDR_FBMEM:
328                     if (RMCFG_FEATURE_PMA &&
329                         (pGVAS->flags & VASPACE_FLAGS_PTETABLE_PMA_MANAGED) &&
330                         (pGVAS->pPageTableMemPool != NULL))
331                     {
332                         pMemDescTemp->ActualSize = RM_ALIGN_UP(newMemSize, alignment);
333                         status = rmMemPoolAllocate(pGVAS->pPageTableMemPool,
334                                          (RM_POOL_ALLOC_MEMDESC*)pMemDescTemp);
335                         break;
336                     }
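                    //
                    // Otherwise fall through: vidmem without a PMA-managed
                    // pool is allocated the same way as sysmem.
                    //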
337                 case ADDR_SYSMEM:
338                     status = memdescAlloc(pMemDescTemp);
339                     break;
340                 default:
341                     NV_ASSERT_OR_GOTO(0, done);
342             }
343             if (NV_OK == status)
344             {
345                 //
                // Always scrub the root page directory (PDB) allocation in case
                // the GMMU prefetches uninitialized entries.
348                 //
349                 if (pLevelFmt == pFmt->pRoot)
350                 {
351                     status = _gmmuScrubMemDesc(pGpu, pMemDescTemp);
352                 }
353 
354                 memdescSetName(pGpu, pMemDescTemp, NV_RM_SURF_NAME_PAGE_TABLE, mmuFmtConvertLevelIdToSuffix(pLevelFmt));
355                 break;
356             }
357             j++;
358         }
359 
360         if (NV_OK != status)
361         {
362             memdescDestroy(pMemDescTemp);
363             goto done;
364         }
365 
366         //
367         // The packing optimization is only needed for allocations in vidmem since
368         // the 4K granularity is not applicable to allocations in sysmem.
369         //
370         bPacked = ((memdescGetAddressSpace(pMemDescTemp) == ADDR_FBMEM) &&
371                    (alignment < RM_PAGE_SIZE) && !bPmaManaged);
372 
373         if (bPacked)
374         {
375             // Try to allocate from the free list of packed memdescs
376             pMemDesc[i] = _gmmuMemDescCacheAlloc(pUserCtx);
377             if (NULL != pMemDesc[i])
378             {
379                 // Free this if we have already allocated from the list.
380                 memdescFree(pMemDescTemp);
381                 memdescDestroy(pMemDescTemp);
382             }
383             else
384             {
385                 // Add another page to the cache and then alloc.
386                 pMemDesc[i] = _gmmuMemDescCacheCreate(pUserCtx,
387                                                       pMemDescTemp,
388                                                       newMemSize);
                if (NULL == pMemDesc[i])
                {
                    // Propagate a real failure so the cleanup path runs.
                    status = NV_ERR_NO_MEMORY;
                    memdescFree(pMemDescTemp);
                    memdescDestroy(pMemDescTemp);
                    goto done;
                }
395             }
396         }
397         else
398         {
399             pMemDesc[i] = pMemDescTemp;
400         }
401 
402 #if NV_PRINTF_STRINGS_ALLOWED
403         NV_PRINTF(LEVEL_INFO,
404                   "[GPU%u]: [%s] %sPA 0x%llX (0x%X bytes) for VA 0x%llX-0x%llX\n",
405                   pUserCtx->pGpu->gpuInstance,
406                   bPacked ? "Packed" : "Unpacked",
407                   bMirror ? g_gmmuUVMMirroringDirStrings[i] : "",
408                   memdescGetPhysAddr(pMemDesc[i], AT_GPU, 0), newMemSize,
409                   mmuFmtLevelVirtAddrLo(pLevelFmt, vaBase),
410                   mmuFmtLevelVirtAddrHi(pLevelFmt, vaLimit));
411 #else // NV_PRINTF_STRINGS_ALLOWED
412         NV_PRINTF(LEVEL_INFO,
413                   "[GPU%u]:  [Packed: %c] %sPA 0x%llX (0x%X bytes) for VA 0x%llX-0x%llX\n",
414                   pUserCtx->pGpu->gpuInstance,
415                   bPacked ? 'Y' : 'N',
416                   bMirror ? _gmmuUVMMirroringDirString[i] : ' ',
417                   memdescGetPhysAddr(pMemDesc[i], AT_GPU, 0), newMemSize,
418                   mmuFmtLevelVirtAddrLo(pLevelFmt, vaBase),
419                   mmuFmtLevelVirtAddrHi(pLevelFmt, vaLimit));
420 #endif // NV_PRINTF_STRINGS_ALLOWED
421     }
422 
423     // Commit return values.
424     *ppMemDesc = (MMU_WALK_MEMDESC*)pMemDesc[GMMU_USER_PAGE_DIR_INDEX];
425     *pMemSize  = newMemSize;
426     *pBChanged = NV_TRUE;
427 
428 done:
429     if (NV_OK == status)
430     {
431         // Commit mirrored root desc.
432         if (bMirror)
433         {
434             pUserCtx->pGpuState->pMirroredRoot =
435                 (MMU_WALK_MEMDESC*)pMemDesc[GMMU_KERNEL_PAGE_DIR_INDEX];
436         }
437     }
438     else
439     {
440         for (i = 0; i < maxPgDirs; i++)
441         {
442             memdescFree(pMemDesc[i]);
443             memdescDestroy(pMemDesc[i]);
444         }
445     }
446     return status;
447 }
448 
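/*!
 * Seed the packed memdesc cache from a freshly allocated unpacked page.
 *
 * Splits the parent memdesc into submemdescs of @p memSize bytes, keeps them
 * on the parent's submemdesc list as a free list, tracks the parent on the
 * per-GPU unpacked list, and returns one submemdesc for immediate use.
 *
 * @returns A packed submemdesc popped from the free list, or NULL on failure.
 */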
449 static PMEMORY_DESCRIPTOR
450 _gmmuMemDescCacheCreate
451 (
452     MMU_WALK_USER_CTX  *pUserCtx,
453     MEMORY_DESCRIPTOR  *pMemDesc,
454     NvU32               memSize
455 )
456 {
457     NV_STATUS status = NV_OK;
458     MEMORY_DESCRIPTOR* pMemDescTmp;
459     NvU32 i;
460 
461     NV_ASSERT_OR_RETURN((NULL != pMemDesc), NULL);
462     NV_ASSERT_OR_RETURN((memSize <= pMemDesc->ActualSize), NULL);
463 
464     if (pMemDesc->pSubMemDescList == NULL)
465     {
466         pMemDesc->pSubMemDescList = portMemAllocNonPaged(sizeof(MEMORY_DESCRIPTOR_LIST));
467         NV_ASSERT_OR_RETURN(pMemDesc->pSubMemDescList != NULL, NULL);
468     }
469 
470     // Initialize the list head of the unpacked memdesc
471     listInitIntrusive(pMemDesc->pSubMemDescList);
472 
473     // Form the list of submemdescs with the parent memdesc as the head
474     for (i = 0; i < (pMemDesc->ActualSize / memSize); i++)
475     {
476         MEMORY_DESCRIPTOR *pSubMemDesc = NULL;
477         status = memdescCreateSubMem(&pSubMemDesc,
478                                      pMemDesc,
479                                      pUserCtx->pGpu,
480                                      i * memSize,
481                                      memSize);
482         NV_ASSERT_OR_RETURN((NV_OK == status), NULL);
483         listAppendExisting(pMemDesc->pSubMemDescList, pSubMemDesc);
484     }
485 
486     // Add the parent memdesc to the per VAS/per GPU list of unpacked memdescs
487     listAppendExisting(&pUserCtx->pGpuState->unpackedMemDescList, pMemDesc);
488 
489     // Pop the free list of packed memdescs and return one
490     pMemDescTmp = listTail(pMemDesc->pSubMemDescList);
491     listRemove(pMemDesc->pSubMemDescList, pMemDescTmp);
492     return pMemDescTmp;
493 }
494 
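/*!
 * Pop a packed submemdesc from the first cached unpacked page that still has
 * one free, or return NULL if the cache is exhausted.
 */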
495 static PMEMORY_DESCRIPTOR
496 _gmmuMemDescCacheAlloc
497 (
498     MMU_WALK_USER_CTX *pUserCtx
499 )
500 {
501     MEMORY_DESCRIPTOR *pParentMemDesc;
502     MEMORY_DESCRIPTOR *pParentMemDescNext;
503 
504     for (pParentMemDesc = listHead(&pUserCtx->pGpuState->unpackedMemDescList);
505          pParentMemDesc != NULL;
506          pParentMemDesc = pParentMemDescNext)
507     {
508         pParentMemDescNext = listNext(&pUserCtx->pGpuState->unpackedMemDescList, pParentMemDesc);
        MEMORY_DESCRIPTOR *pChild = listTail(pParentMemDesc->pSubMemDescList);
        if (NULL != pChild)
        {
            // Detach the submemdesc only once we know this parent has one free.
            listRemove(pParentMemDesc->pSubMemDescList, pChild);
            return pChild;
        }
516     }
517     return NULL;
518 }
519 
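/*!
 * Tear down the packed memdesc cache of a per-GPU VA space state, destroying
 * the cached submemdescs and freeing their unpacked parent allocations.
 */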
520 void
521 gmmuMemDescCacheFree
522 (
523     GVAS_GPU_STATE *pGpuState
524 )
525 {
526     NV_ASSERT_OR_RETURN_VOID(NULL != pGpuState);
527 
528     while (listCount(&pGpuState->unpackedMemDescList) > 0)
529     {
530         MEMORY_DESCRIPTOR *pTmp;
531         MEMORY_DESCRIPTOR *pParentMemDesc;
532         pParentMemDesc = listTail(&pGpuState->unpackedMemDescList);
533 
        // Assert that all submemdescs have been returned to the parent.
535         NV_ASSERT(pParentMemDesc->RefCount - listCount(pParentMemDesc->pSubMemDescList) == 1);
536 
537         while(listCount(pParentMemDesc->pSubMemDescList) > 0)
538         {
539             pTmp = listTail(pParentMemDesc->pSubMemDescList);
540             listRemove(pParentMemDesc->pSubMemDescList, pTmp);
541             memdescDestroy(pTmp);
542         }
543         listRemove(&pGpuState->unpackedMemDescList, pParentMemDesc);
544         memdescFree(pParentMemDesc);
545         memdescDestroy(pParentMemDesc);
546     }
547 }
548 
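/*!
 * MMU walker callback that releases the backing memory of a page level.
 *
 * Packed submemdescs are returned to their parent's free list once only one
 * reference remains; other allocations are returned to the page table memory
 * pool when PMA-managed and then freed and destroyed. For mirrored root
 * directories both the user and kernel copies are released.
 */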
549 static void
550 _gmmuWalkCBLevelFree
551 (
552     MMU_WALK_USER_CTX   *pUserCtx,
553     const MMU_FMT_LEVEL *pLevelFmt,
554     const NvU64          vaBase,
555     MMU_WALK_MEMDESC    *pOldMem
556 )
557 {
558     NvU8               i;
559     NvBool             bMirror   = _mirrorLevel(pUserCtx, pLevelFmt);
560     NvU8               maxPgDirs = _getMaxPageDirs(bMirror);
561     MEMORY_DESCRIPTOR *pMemDesc[GMMU_MAX_PAGE_DIR_INDEX_COUNT] = {NULL};
562 
563     pMemDesc[GMMU_USER_PAGE_DIR_INDEX] = (MEMORY_DESCRIPTOR*)pOldMem;
564     if (bMirror)
565     {
566         pMemDesc[GMMU_KERNEL_PAGE_DIR_INDEX] =
567                 (MEMORY_DESCRIPTOR*)pUserCtx->pGpuState->pMirroredRoot;
568         pUserCtx->pGpuState->pMirroredRoot = NULL;
569     }
570 
571     for (i = 0; i < maxPgDirs; i++)
572     {
573         if (NULL == pMemDesc[i])
574         {
575             continue;
576         }
577 
578 #if NV_PRINTF_STRINGS_ALLOWED
579         NV_PRINTF(LEVEL_INFO,
580                   "[GPU%u]: %sPA 0x%llX for VA 0x%llX-0x%llX\n",
581                   pUserCtx->pGpu->gpuInstance,
582                   bMirror ? g_gmmuUVMMirroringDirStrings[i] : "",
583                   memdescGetPhysAddr(pMemDesc[i], AT_GPU, 0),
584                   mmuFmtLevelVirtAddrLo(pLevelFmt, vaBase),
585                   mmuFmtLevelVirtAddrHi(pLevelFmt, vaBase));
586 #else // NV_PRINTF_STRINGS_ALLOWED
587         NV_PRINTF(LEVEL_INFO,
588                   "[GPU%u]: %cPA 0x%llX for VA 0x%llX-0x%llX\n",
589                   pUserCtx->pGpu->gpuInstance,
590                   bMirror ? _gmmuUVMMirroringDirString[i] : ' ',
591                   memdescGetPhysAddr(pMemDesc[i], AT_GPU, 0),
592                   mmuFmtLevelVirtAddrLo(pLevelFmt, vaBase),
593                   mmuFmtLevelVirtAddrHi(pLevelFmt, vaBase));
594 #endif // NV_PRINTF_STRINGS_ALLOWED
595 
596         //
597         // If this is a submemdesc, return it to its free list only when
598         // the refcount is 1. A refcount greater than 1 implies that 2 or
599         // more GPUs in SLI are using it. GPUs in SLI can share a page level
600         // instance.
601         //
602         if (memdescIsSubMemoryMemDesc(pMemDesc[i]) &&
603            (pMemDesc[i]->RefCount == 1))
604         {
605             // Return this to the free list from which it was borrowed
606             listAppendExisting(memdescGetParentDescriptor(pMemDesc[i])->pSubMemDescList, pMemDesc[i]);
607         }
608         else
609         {
610             if (RMCFG_FEATURE_PMA &&
611                 (pUserCtx->pGVAS->flags & VASPACE_FLAGS_PTETABLE_PMA_MANAGED) &&
612                 (pMemDesc[i]->pPageHandleList != NULL) &&
613                 (listCount(pMemDesc[i]->pPageHandleList) != 0) &&
614                 (pUserCtx->pGVAS->pPageTableMemPool != NULL))
615             {
616                 rmMemPoolFree(pUserCtx->pGVAS->pPageTableMemPool,
617                               (RM_POOL_ALLOC_MEMDESC*)pMemDesc[i],
618                               pUserCtx->pGVAS->flags);
619             }
620 
621             if (!memdescIsSubMemoryMemDesc(pMemDesc[i]))
622             {
623                 memdescFree(pMemDesc[i]);
624             }
625             memdescDestroy(pMemDesc[i]);
626         }
627     }
628 }
629 
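/*!
 * MMU walker callback invoked when the root page directory (PDB) changes.
 *
 * The new PDB is logged and success is reported; BAR1, HDA, SR-IOV guest, and
 * GSP client VA spaces are called out explicitly because their instance
 * memory is managed elsewhere.
 */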
630 static NvBool
631 _gmmuWalkCBUpdatePdb
632 (
633     MMU_WALK_USER_CTX       *pUserCtx,
634     const MMU_FMT_LEVEL     *pRootFmt,
635     const MMU_WALK_MEMDESC  *pRootMem,
636     const NvBool             bIgnoreChannelBusy
637 )
638 {
639     OBJGPU            *pGpu = pUserCtx->pGpu;
640     MEMORY_DESCRIPTOR *pPDB = (MEMORY_DESCRIPTOR*)pRootMem;
641 
642     NV_PRINTF(LEVEL_INFO, "[GPU%u]: PA 0x%llX (%s)\n",
643               pUserCtx->pGpu->gpuInstance,
644               (NULL != pPDB) ? memdescGetPhysAddr(pPDB, AT_GPU, 0) : 0,
645               (NULL != pPDB) ? "valid" : "null");
646 
647     if (pUserCtx->pGVAS->flags & VASPACE_FLAGS_BAR_BAR1)
648     {
649         //
        // Do nothing: the BAR1 PDB is static and is created and destroyed
        // only along with the VA space itself. The BAR1 instance memory is
        // updated at those points, so this callback has nothing to do when
        // it is invoked for mmuWalkSparsify and mmuWalkUnmap.
655         //
656         return NV_TRUE;
657     }
658     else if ((pUserCtx->pGVAS->flags & VASPACE_FLAGS_HDA))
659     {
660         // Instance Block set up once by caller.
661         return NV_TRUE;
662     }
663     else if (IS_VIRTUAL_WITH_SRIOV(pGpu) || IS_GSP_CLIENT(pGpu))
664     {
665         // Noop inside a guest or CPU RM.
666         return NV_TRUE;
667     }
    return NV_TRUE;
669 }
670 
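/*!
 * MMU walker callback that writes a single PDE, including the mirrored kernel
 * copy for UVM mirrored root directories.
 *
 * The entry value is assembled from the sub-level memdescs: aperture, encoded
 * physical address, volatility or PDE PCF bits depending on the format
 * version, and the partial page table size exponent when applicable. The
 * entry is then written into each page directory through the memory manager.
 */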
671 static NvBool
672 _gmmuWalkCBUpdatePde
673 (
674     MMU_WALK_USER_CTX       *pUserCtx,
675     const MMU_FMT_LEVEL     *pLevelFmt,
676     const MMU_WALK_MEMDESC  *pLevelMem,
677     const NvU32              entryIndex,
678     const MMU_WALK_MEMDESC **pSubLevels
679 )
680 {
681     NvU32              i;
682     GMMU_ENTRY_VALUE   entry;
683     NvBool             bMirror     = _mirrorLevel(pUserCtx, pLevelFmt);
684     NvU8               maxPgDirs   = _getMaxPageDirs(bMirror);
685     OBJGPU            *pGpu        = pUserCtx->pGpu;
686     OBJGVASPACE       *pGVAS       = pUserCtx->pGVAS;
687     KernelGmmu        *pKernelGmmu = GPU_GET_KERNEL_GMMU(pGpu);
688     const GMMU_FMT    *pFmt        = pUserCtx->pGpuState->pFmt;
689     MEMORY_DESCRIPTOR *pMemDesc[GMMU_MAX_PAGE_DIR_INDEX_COUNT] = {NULL};
690     NvU32                      recipExp  = NV_U32_MAX;
691     const GMMU_FMT_PDE_MULTI  *pPdeMulti = pFmt->pPdeMulti;
692 
693     pMemDesc[GMMU_USER_PAGE_DIR_INDEX] = (MEMORY_DESCRIPTOR*)pLevelMem;
694     if (bMirror)
695     {
696         pMemDesc[GMMU_KERNEL_PAGE_DIR_INDEX] =
697             (MEMORY_DESCRIPTOR*)pUserCtx->pGpuState->pMirroredRoot;
698     }
699 
700     for (i = 0; i < maxPgDirs; i++)
701     {
702 #if NV_PRINTF_STRINGS_ALLOWED
703         NV_PRINTF(LEVEL_INFO, "[GPU%u]: %sPA 0x%llX, Entry 0x%X\n",
704                   pUserCtx->pGpu->gpuInstance,
705                   bMirror ? g_gmmuUVMMirroringDirStrings[i] : "",
706                   memdescGetPhysAddr(pMemDesc[i], AT_GPU, 0), entryIndex);
707 #else // NV_PRINTF_STRINGS_ALLOWED
708         NV_PRINTF(LEVEL_INFO, "[GPU%u]: %cPA 0x%llX, Entry 0x%X\n",
709                   pUserCtx->pGpu->gpuInstance,
710                   bMirror ? _gmmuUVMMirroringDirString[i] : ' ',
711                   memdescGetPhysAddr(pMemDesc[i], AT_GPU, 0), entryIndex);
712 #endif // NV_PRINTF_STRINGS_ALLOWED
713     }
714 
715     portMemSet(entry.v8, 0, pLevelFmt->entrySize);
716 
717     for (i = 0; i < pLevelFmt->numSubLevels; ++i)
718     {
719         const GMMU_FMT_PDE *pPde        = gmmuFmtGetPde(pFmt, pLevelFmt, i);
720         MEMORY_DESCRIPTOR  *pSubMemDesc = (MEMORY_DESCRIPTOR*)pSubLevels[i];
721 
722         if (NULL != pSubMemDesc)
723         {
724             const GMMU_APERTURE       aperture = kgmmuGetMemAperture(pKernelGmmu, pSubMemDesc);
725             const GMMU_FIELD_ADDRESS *pFldAddr = gmmuFmtPdePhysAddrFld(pPde, aperture);
726             const NvU64               physAddr = memdescGetPhysAddr(pSubMemDesc, AT_GPU, 0);
727 
728             if (pFmt->version == GMMU_FMT_VERSION_3)
729             {
730                 NvU32 pdePcfHw    = 0;
731                 NvU32 pdePcfSw    = 0;
732 
733                 pdePcfSw |= gvaspaceIsAtsEnabled(pGVAS) ? (1 << SW_MMU_PCF_ATS_ALLOWED_IDX) : 0;
734                 pdePcfSw |= memdescGetVolatility(pSubMemDesc) ? (1 << SW_MMU_PCF_UNCACHED_IDX) : 0;
735 
                // This callback returns NvBool, so report failure as NV_FALSE
                // rather than as an NV_STATUS code.
                NV_ASSERT_OR_RETURN((kgmmuTranslatePdePcfFromSw_HAL(pKernelGmmu, pdePcfSw, &pdePcfHw) == NV_OK),
                                    NV_FALSE);
738                 nvFieldSet32(&pPde->fldPdePcf, pdePcfHw, entry.v8);
739             }
740             else
741             {
742                 nvFieldSetBool(&pPde->fldVolatile, memdescGetVolatility(pSubMemDesc), entry.v8);
743             }
744 
745             gmmuFieldSetAperture(&pPde->fldAperture, aperture, entry.v8);
746             gmmuFieldSetAddress(pFldAddr,
747                 kgmmuEncodePhysAddr(pKernelGmmu, aperture, physAddr,
748                     NVLINK_INVALID_FABRIC_ADDR),
749                 entry.v8);
750 
751             // Calculate partial page table size if supported.
752             if ((pGVAS->flags & VASPACE_FLAGS_MINIMIZE_PTETABLE_SIZE) &&
753                 (pLevelFmt->numSubLevels > 1) &&
754                 nvFieldIsValid32(&pPdeMulti->fldSizeRecipExp))
755             {
756                 const NvU32 maxMemSize  = mmuFmtLevelSize(&pLevelFmt->subLevels[i]);
757                 const NvU32 curMemSize  = (NvU32)pSubMemDesc->Size;
758                 const NvU32 minRecipExp = BIT_IDX_32(maxMemSize / curMemSize);
759 
760                 // We should have allocated on a fractional (pow2) boundary.
761                 NV_ASSERT(ONEBITSET(curMemSize));
762 
763                 if (recipExp == NV_U32_MAX)
764                 {
765                     // Save exponent if not set yet.
766                     recipExp = minRecipExp;
767                 }
768                 else
769                 {
770                     // Otherwise ensure parallel sub-levels match.
771                     NV_ASSERT(recipExp == minRecipExp);
772                 }
773             }
774 
775             NV_PRINTF(LEVEL_INFO, "    SubLevel %u = PA 0x%llX\n", i,
776                       physAddr);
777         }
778         else
779         {
780             NV_PRINTF(LEVEL_INFO, "    SubLevel %u = INVALID\n", i);
781         }
782     }
783 
784     // Set partial page table size exponent if needed.
785     if (recipExp != NV_U32_MAX)
786     {
787         nvFieldSet32(&pPdeMulti->fldSizeRecipExp, recipExp, entry.v8);
788     }
789 
790     for (i = 0; i < maxPgDirs; i++)
791     {
792         TRANSFER_SURFACE dest = {0};
793 
794         dest.pMemDesc = pMemDesc[i];
795         dest.offset = entryIndex * pLevelFmt->entrySize;
796         NV_ASSERT_OK(memmgrMemWrite(GPU_GET_MEMORY_MANAGER(pGpu), &dest,
797                                     entry.v8, pLevelFmt->entrySize,
798                                     TRANSFER_FLAGS_NONE));
799     }
800 
801     return NV_TRUE;
802 }
803 
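/*!
 * MMU walker callback that fills a range of entries with a constant state:
 * invalid (zero), sparse, or NV4K, using the entry templates of the format
 * family. Writes go through a memory manager transfer surface with shadow
 * buffering, and mirrored page directories are filled in lockstep.
 */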
804 static void
805 _gmmuWalkCBFillEntries
806 (
807     MMU_WALK_USER_CTX         *pUserCtx,
808     const MMU_FMT_LEVEL       *pLevelFmt,
809     const MMU_WALK_MEMDESC    *pLevelMem,
810     const NvU32                entryIndexLo,
811     const NvU32                entryIndexHi,
812     const MMU_WALK_FILL_STATE  fillState,
813     NvU32                     *pProgress
814 )
815 {
816     NvU32              i;
817     NvU32              j;
818     OBJGPU            *pGpu           = pUserCtx->pGpu;
819     KernelGmmu        *pKernelGmmu    = GPU_GET_KERNEL_GMMU(pGpu);
820     MemoryManager     *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
821     KernelBus         *pKernelBus     = GPU_GET_KERNEL_BUS(pGpu);
822     const GMMU_FMT    *pFmt      = pUserCtx->pGpuState->pFmt;
823     NvBool             bMirror   = _mirrorLevel(pUserCtx, pLevelFmt);
824     NvU8               maxPgDirs = _getMaxPageDirs(bMirror);
825     MEMORY_DESCRIPTOR *pMemDesc[GMMU_MAX_PAGE_DIR_INDEX_COUNT] = {NULL};
826     NvU32              sizeOfEntries = (entryIndexHi - entryIndexLo + 1) *
827                                         pLevelFmt->entrySize;
828     NvU8              *pEntries;
829 
830     pMemDesc[GMMU_USER_PAGE_DIR_INDEX] = (MEMORY_DESCRIPTOR*)pLevelMem;
831     if (bMirror)
832     {
833         pMemDesc[GMMU_KERNEL_PAGE_DIR_INDEX] =
834             (MEMORY_DESCRIPTOR*)pUserCtx->pGpuState->pMirroredRoot;
835     }
836 
837     for (j = 0; j < maxPgDirs; j++)
838     {
839         TRANSFER_SURFACE dest = {0};
840 
841         dest.pMemDesc = pMemDesc[j];
842         dest.offset = entryIndexLo * pLevelFmt->entrySize;
843 
844         //
        // A shadow buffer is allocated to stage the entries when the write goes
        // through CE or the GSP DMA task. This code can be called at high IRQL
        // on Windows, where the shadow buffer allocation may fail.
848         //
849         pEntries = memmgrMemBeginTransfer(pMemoryManager, &dest, sizeOfEntries,
850                                           TRANSFER_FLAGS_SHADOW_ALLOC);
851         NV_ASSERT_OR_RETURN_VOID(pEntries != NULL);
852 
853 #if NV_PRINTF_STRINGS_ALLOWED
854         NV_PRINTF(LEVEL_INFO,
855                   "[GPU%u]: %sPA 0x%llX, Entries 0x%X-0x%X = %s\n",
856                   pUserCtx->pGpu->gpuInstance,
857                   bMirror ? g_gmmuUVMMirroringDirStrings[j] : "",
858                   memdescGetPhysAddr(pMemDesc[j], AT_GPU, 0),
859                   entryIndexLo, entryIndexHi,
860                   g_gmmuFillStateStrings[fillState]);
861 #else // NV_PRINTF_STRINGS_ALLOWED
862         NV_PRINTF(LEVEL_INFO,
863                   "[GPU%u] %cPA 0x%llX, Entries 0x%X-0x%X = %c\n",
864                   pUserCtx->pGpu->gpuInstance,
865                   bMirror ? _gmmuUVMMirroringDirString[j] : ' ',
866                   memdescGetPhysAddr(pMemDesc[j], AT_GPU, 0),
867                   entryIndexLo, entryIndexHi,
868                   _gmmuFillStateString[fillState]);
869 #endif // NV_PRINTF_STRINGS_ALLOWED
870 
871         switch (fillState)
872         {
873             case MMU_WALK_FILL_INVALID:
874                 portMemSet(pEntries, 0, sizeOfEntries);
875                 break;
876             case MMU_WALK_FILL_SPARSE:
877             {
878                 const GMMU_FMT_FAMILY  *pFam = kgmmuFmtGetFamily(pKernelGmmu, pFmt->version);
879                 const GMMU_ENTRY_VALUE *pSparseEntry;
880 
881                 // Select sparse entry template based on number of sub-levels.
882                 if (pLevelFmt->numSubLevels > 1)
883                 {
884                     pSparseEntry = &pFam->sparsePdeMulti;
885                 }
886                 else if (pLevelFmt->numSubLevels == 1)
887                 {
888                     pSparseEntry = &pFam->sparsePde;
889                 }
890                 else
891                 {
892                     if (kbusIsFlaDummyPageEnabled(pKernelBus) &&
893                         (pUserCtx->pGVAS->flags & VASPACE_FLAGS_FLA))
894                         pSparseEntry = &pUserCtx->pGpuState->flaDummyPage.pte;
895                     else
896                         pSparseEntry = &pFam->sparsePte;
897                 }
898 
899                 // Copy sparse template to each entry.
900                 for (i = entryIndexLo; i <= entryIndexHi; ++i)
901                 {
902                     NvU32 entryIndex = (i - entryIndexLo) * pLevelFmt->entrySize;
903                     portMemCopy(&pEntries[entryIndex],
904                                 pLevelFmt->entrySize,
905                                 pSparseEntry->v8,
906                                 pLevelFmt->entrySize);
907                 }
908                 break;
909             }
910             case MMU_WALK_FILL_NV4K:
911             {
912                 const GMMU_FMT_FAMILY  *pFam =
913                     kgmmuFmtGetFamily(pKernelGmmu, pFmt->version);
914                 const GMMU_ENTRY_VALUE *pNv4kEntry = &pFam->nv4kPte;
915 
                // Debug check, to be removed when the code is robust enough:
                // NV4K is only valid for 64K PTE levels in ATS-enabled VA spaces.
917                 if (!gvaspaceIsAtsEnabled(pUserCtx->pGVAS) ||
918                      mmuFmtLevelPageSize(pLevelFmt) != RM_PAGE_SIZE_64K)
919                 {
920 #if NV_PRINTF_STRINGS_ALLOWED
921                     NV_PRINTF(LEVEL_ERROR,
922                               "[GPU%u]: %sPA 0x%llX, Entries 0x%X-0x%X = %s FAIL\n",
923                               pUserCtx->pGpu->gpuInstance,
924                               bMirror ? g_gmmuUVMMirroringDirStrings[j] : "",
925                               memdescGetPhysAddr(pMemDesc[j], AT_GPU, 0),
926                               entryIndexLo, entryIndexHi,
927                               g_gmmuFillStateStrings[fillState]);
928 #else // NV_PRINTF_STRINGS_ALLOWED
929                     NV_PRINTF(LEVEL_ERROR,
930                               "[GPU%u]: %cPA 0x%llX, Entries 0x%X-0x%X = %c FAIL\n",
931                               pUserCtx->pGpu->gpuInstance,
932                               bMirror ? _gmmuUVMMirroringDirString[j] : ' ',
933                               memdescGetPhysAddr(pMemDesc[j], AT_GPU, 0),
934                               entryIndexLo, entryIndexHi,
935                               _gmmuFillStateString[fillState]);
936 #endif // NV_PRINTF_STRINGS_ALLOWED
937 
938                     DBG_BREAKPOINT();
939                     return;
940                 }
941 
942                 // Copy nv4k template to each entry
943                 for (i = entryIndexLo; i <= entryIndexHi; ++i)
944                 {
945                     NvU32 entryIndex = (i - entryIndexLo) * pLevelFmt->entrySize;
946                     portMemCopy(&pEntries[entryIndex],
947                                 pLevelFmt->entrySize,
948                                 pNv4kEntry->v8,
949                                 pLevelFmt->entrySize);
950                 }
951                 break;
952             }
953             default:
954                 NV_ASSERT(0);
955                 break;
956         }
957 
958         memmgrMemEndTransfer(pMemoryManager, &dest, sizeOfEntries,
959                              TRANSFER_FLAGS_SHADOW_ALLOC);
960     }
961 
962     *pProgress = entryIndexHi - entryIndexLo + 1;
963 }
964 
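/*!
 * MMU walker callback that copies a range of entries from one page level
 * instance to another, e.g. when a level's backing memory is reallocated.
 * The copy is skipped when source and destination describe the same memory.
 */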
965 static void
966 _gmmuWalkCBCopyEntries
967 (
968     MMU_WALK_USER_CTX         *pUserCtx,
969     const MMU_FMT_LEVEL       *pLevelFmt,
970     const MMU_WALK_MEMDESC    *pSrcMem,
971     const MMU_WALK_MEMDESC    *pDstMem,
972     const NvU32                entryIndexLo,
973     const NvU32                entryIndexHi,
974     NvU32                     *pProgress
975 )
976 {
977     MEMORY_DESCRIPTOR *pSrcDesc = (MEMORY_DESCRIPTOR *)pSrcMem;
978     MEMORY_DESCRIPTOR *pDstDesc = (MEMORY_DESCRIPTOR *)pDstMem;
979     TRANSFER_SURFACE   src      = {0};
980     TRANSFER_SURFACE   dest     = {0};
981 
982     src.pMemDesc = pSrcDesc;
983     src.offset = entryIndexLo * pLevelFmt->entrySize;
984     dest.pMemDesc = pDstDesc;
985     dest.offset = entryIndexLo * pLevelFmt->entrySize;
986 
987     // Only copy if different source and destination memory.
988     if (!memdescDescIsEqual(pSrcDesc, pDstDesc))
989     {
990         OBJGPU *pGpu = pUserCtx->pGpu;
991         NvU32   sizeOfEntries = (entryIndexHi - entryIndexLo + 1) *
992                                  pLevelFmt->entrySize;
993 
994         NV_PRINTF(LEVEL_INFO,
995                   "[GPU%u]: GVAS(%p) PA 0x%llX -> PA 0x%llX, Entries 0x%X-0x%X\n",
996                   pGpu->gpuInstance, pUserCtx->pGVAS,
997                   memdescGetPhysAddr(pSrcDesc, AT_GPU, 0),
998                   memdescGetPhysAddr(pDstDesc, AT_GPU, 0), entryIndexLo,
999                   entryIndexHi);
1000 
1001         NV_ASSERT_OK(memmgrMemCopy(GPU_GET_MEMORY_MANAGER(pGpu), &dest, &src,
1002                                    sizeOfEntries, TRANSFER_FLAGS_NONE));
1003     }
1004 
1005     // Report full range complete.
1006     *pProgress = entryIndexHi - entryIndexLo + 1;
1007 }
1008 
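/*!
 * Callback table handed to the generic MMU walker for GMMU-managed VA spaces.
 * The final, optional callback slot is left NULL.
 */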
1009 const MMU_WALK_CALLBACKS g_gmmuWalkCallbacks =
1010 {
1011     _gmmuWalkCBLevelAlloc,
1012     _gmmuWalkCBLevelFree,
1013     _gmmuWalkCBUpdatePdb,
1014     _gmmuWalkCBUpdatePde,
1015     _gmmuWalkCBFillEntries,
1016     _gmmuWalkCBCopyEntries,
1017     NULL,
1018 };
1019