/*
 * SPDX-FileCopyrightText: Copyright (c) 2005-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


/*!
 * @file
 * @brief The FERMI specific HAL VMA routines reside in this file
 *
 * ===========================================================================
 * GLOSSARY OF INCONSISTENCIES
 * ===========================================================================
 *
 * --------
 * LOW, MIN
 * --------
 * (1) Synonyms for the first address or index in a range.
 *     e.g. In the inclusive range 37 to 509, the "low" or "min" is 37.
 *
 * ---------------------
 * HIGH, MAX, LIMIT, END
 * ---------------------
 * (1) Synonyms for the last address or index in a range.
 *     e.g. In the inclusive range 37 to 509, the "limit" is 509.
 * (2) Sometimes "high" or "end" are used for the "limit plus one" - e.g. 510.
 *     Currently this can only be determined by context.
 *     TODO: Decide on consistent terms and clean this up.
 *
 * ---
 * PDB
 * ---
 * (1) Page Directory Base
 *     The base address of a page directory,
 *     e.g. written to the PD_BASE field of an instance block.
 * (2) Page Directory Block
 *     The entire physical memory block of a page directory,
 *     e.g. described by a memdesc associated with a VASPACE object.
 * (3) Property DataBase - e.g. in PDB_PROP_*
 *     The common meaning to the rest of RM - boolean values associated
 *     with an object. Completely unrelated to (1) and (2).
 *
 * ---
 * PDE
 * ---
 * (1) Page Directory Entry
 *     An *ENTRY* within a page directory, containing the physical
 *     addresses and attributes of a single small/big page table pair.
 * (2) !!!WRONG!!! The page directory itself
 *     Sometimes also used in the plural form "PDEs".
 *     Use "page directory" or "PD" instead.
 *
 * --------------------------
 * PDE ENTRY !!!DO NOT USE!!!
 * --------------------------
 * (1) !!!WRONG!!! Page Directory Entry Entry(?!)
 *     This is redundant - just use "PDE".
 * (2) Page Dir*E*ctory Entry
 *     Desperate bacronym to justify current usage.
 *
 * --------
 * PDE SIZE
 * --------
 * (1) Size or index corresponding to the NV_MMU_PDE_SIZE field of a PDE.
 *     This refers to the size of *page tables* that this
 *     PDE points to (1/8, 1/4, 1/2, full), not the size of the PDE itself.
 *     The more accurate term is "PT size" - most code has been cleaned up
 *     to use this instead, but some API params remain.
 * (2) Size of the PDE itself (8 bytes), defined by the constant NV_MMU_PDE__SIZE.
 *
 * ---
 * PTE
 * ---
 * (1) Page Table Entry
 *     An *ENTRY* within a page table, containing the physical
 *     address and attributes of a single page (small or big).
 * (2) !!!WRONG!!! The page table itself
 *     Sometimes also used in the plural form "PTEs".
 *     Use "page table" or "PT" instead.
 *
 * --------------------------
 * PTE ENTRY !!!DO NOT USE!!!
 * --------------------------
 * (1) !!!WRONG!!! Page Table Entry Entry(?!)
 *     This is redundant - just use "PTE".
 * (2) Page Tabl*E* Entry
 *     Desperate bacronym to justify current usage.
 *
 */
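
//
// For orientation, a sketch of how the terms above relate during translation
// (illustrative only - the real level count and field layout depend on the
// GMMU format version, see mmu_fmt.h):
//
//   VA --index--> PD (located at the PDB) --PDE--> PT --PTE--> page + offset
//
// e.g. with 64KB big pages, the upper VA bits select a PDE from the page
// directory, that PDE points to a page table, the next VA bits select a PTE,
// and the low 16 bits of the VA are the byte offset within the page.
//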

#include "core/core.h"
#include "gpu/gpu.h"
#include "lib/base_utils.h"
#include "gpu/mem_mgr/heap.h"
#include "os/os.h"
#include "rmapi/client.h"
#include "nvRmReg.h"
#include "gpu/mem_mgr/virt_mem_allocator.h"
#include "gpu/bif/kernel_bif.h"
#include "core/system.h"
#include "core/thread_state.h"
#include "mem_mgr/vaspace.h"
#include "mem_mgr/fabric_vaspace.h"
#include "mem_mgr/virt_mem_mgr.h"
#include "platform/sli/sli.h"

#include "mem_mgr/fla_mem.h"

#include "gpu/mmu/kern_gmmu.h"
#include "gpu/mem_sys/kern_mem_sys.h"
#include "gpu_mgr/gpu_group.h"
#include "mmu/mmu_fmt.h"
#include "gpu/device/device.h"
#include "gpu/nvlink/kernel_nvlink.h"
#include "gpu/bus/kern_bus.h"
#include "gpu/mem_mgr/mem_mgr.h"

#include "gpu/mem_mgr/fermi_dma.h"

#include "published/maxwell/gm107/dev_mmu.h"
#include "published/maxwell/gm107/dev_bus.h"

#include "ctrl/ctrl0002.h"

#include "vgpu/rpc.h"

#define _MMUXLATEVADDR_FLAG_SHOW_INVALID        NVBIT(0)
#define _MMUXLATEVADDR_FLAG_VALIDATE_ONLY       NVBIT(1) // incomplete
#define _MMUXLATEVADDR_FLAG_VALIDATE_TERSELY    NVBIT(2) // incomplete
// no trace output
#define _MMUXLATEVADDR_FLAG_XLATE_ONLY          _MMUXLATEVADDR_FLAG_VALIDATE_TERSELY

static NV_STATUS _dmaGetFabricAddress(OBJGPU *pGpu, NvU32 aperture, NvU32 kind,
                                      NvU64 *fabricAddr);
static NV_STATUS _dmaGetFabricEgmAddress(OBJGPU *pGpu, NvU32 aperture, NvU32 kind,
                                         NvU64 *fabricEgmAddr);

static NV_STATUS
_dmaApplyWarForBug2720120
(
    OBJGVASPACE *pGVAS,
    OBJGPU      *pGpu,
    const NvU64  vaLo,
    const NvU64  vaHi
);

//
// Virtual Address Space Block - Data tracked per virtual allocation
//
// Only used with NV_REG_STR_RESERVE_PTE_SYSMEM_MB. Potentially dead code.
//
typedef struct VASINFO_MAXWELL
{
    PNODE           pMapTree;     // Tree of current mappings.
    NvU32           pageSizeMask; // Mask of page size indices supported.
                                  // See VAS_PAGESIZE_MASK.
    VAS_ALLOC_FLAGS flags;
    VA_MANAGEMENT   management;   // Level of management.
} VASINFO_MAXWELL, *PVASINFO_MAXWELL;

/*!
 * @brief Allocate virtual memory and map it to physical memory.
 *
 * The virtual memory may already be allocated, in which case it is just
 * initialized (backing page tables allocated).
 *
 * VMM-TODO: If possible remove overloading - e.g. just map, never allocate.
 *           Definitely move MMU stuff down.
 *
 * @param[in]     pGpu         OBJGPU pointer
 * @param[in]     pDma         VirtMemAllocator pointer
 * @param[in]     pVAS         OBJVASPACE pointer
 * @param[in]     pMemDesc     Physical memory descriptor
 * @param[in,out] pVaddr       Pointer to virtual memory base address
 * @param[in]     flags        Mapping options
 * @param[in]     pCliMapInfo  CLI_DMA_ALLOC_MAP_INFO pointer (for RM client mappings)
 * @param[in]     swizzId      SMC swizzId (only used for BAR1 mapping)
 *
 * @returns NV_OK on success, or an error status upon failure.
 */
NV_STATUS
dmaAllocMapping_GM107
(
    OBJGPU                 *pGpu,
    VirtMemAllocator       *pDma,
    OBJVASPACE             *pVAS,
    MEMORY_DESCRIPTOR      *pMemDesc,
    NvU64                  *pVaddr,
    NvU32                   flags,
    CLI_DMA_ALLOC_MAP_INFO *pCliMapInfo,
    NvU32                   swizzId
)
{
    NV_STATUS           status = NV_OK;
    MemoryManager      *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
    KernelMIGManager   *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
    OBJEHEAP           *pVASpaceHeap = NULL;
    KernelGmmu         *pKernelGmmu = GPU_GET_KERNEL_GMMU(pGpu);
    KernelBus          *pKernelBus = GPU_GET_KERNEL_BUS(pGpu);
    FABRIC_VASPACE     *pFabricVAS = dynamicCast(pGpu->pFabricVAS, FABRIC_VASPACE);
    MEMORY_DESCRIPTOR  *pAdjustedMemDesc = pMemDesc;
    ADDRESS_TRANSLATION addressTranslation;
    NvU32               gfid;
    NvBool              bCallingContextPlugin;
    const MEMORY_SYSTEM_STATIC_CONFIG *pMemorySystemConfig =
        kmemsysGetStaticConfig(pGpu, pKernelMemorySystem);
    OBJGVASPACE        *pGVAS = NULL;

    struct
    {
        NvU32              pteCount;
        NvU32              pageCount;
        NvU32              overMap;
        NvU64              vaLo;
        NvU64              vaHi;
        NvU64              mapLength;
        NvU64              pageOffset;
        NvU64              pageSize;
        NvU64              vaRangeLo;
        NvU64              vaRangeHi;
        NvU32              kind;
        NvU32              priv;
        NvU32              cacheSnoop;
        COMPR_INFO         comprInfo;
        NvU32              aperture;
        NvU32              tlbLock;
        NvU32              p2p;
        NvU32              writeOnly;
        NvU32              readOnly;
        NvU32              subDevIdSrc;
        NvU32              deferInvalidate;
        NODE              *pMapNode;
        NvU32              shaderFlags;
        NvU32              disableEncryption;
        VASINFO_MAXWELL   *pVASInfo;
        OBJGPU            *pSrcGpu;
        NvU32              peerNumber;
        NvBool             bAllocVASpace;
        NvBool             bIsBarOrPerf;
        NvBool             bIsBar1;
        NvBool             bIsMIGMemPartitioningEnabled;
        RmPhysAddr        *pPteArray;
        DMA_PAGE_ARRAY     pageArray;
        NvU64              vaspaceBigPageSize;
        NvBool             bIsMemContiguous;
        NvU64              fabricAddr;
        NvU32              indirectPeer;
        NvBool             bFlaImport;
        NV_RANGE           totalVaRange;
        MEMORY_DESCRIPTOR *pRootMemDesc;
        MEMORY_DESCRIPTOR *pTempMemDesc;
        Memory            *pMemory;
        NvU32              pageArrayGranularity;
        NvU8               pageShift;
        NvU64              physPageSize;
        NvU64              pageArrayFlags;
    } *pLocals = portMemAllocNonPaged(sizeof(*pLocals));
    // Heap allocate to avoid stack overflow

    if (pLocals == NULL)
        return NV_ERR_NO_MEMORY;

    portMemSet(pLocals, 0, sizeof(*pLocals));
    pLocals->pSrcGpu = pGpu;
    pLocals->peerNumber = BUS_INVALID_PEER;
    pLocals->totalVaRange = NV_RANGE_EMPTY;

    if (pCliMapInfo != NULL)
        pLocals->pMemory = pCliMapInfo->pMemory;

    pLocals->vaspaceBigPageSize = vaspaceGetBigPageSize(pVAS);
    pLocals->bIsBarOrPerf = (vaspaceGetFlags(pVAS) &
                             (VASPACE_FLAGS_BAR | VASPACE_FLAGS_PERFMON | VASPACE_FLAGS_HDA)) != 0;
    pLocals->p2p = DRF_VAL(OS46, _FLAGS, _P2P_ENABLE, flags);
    pLocals->subDevIdSrc = DRF_VAL(OS46, _FLAGS, _P2P_SUBDEV_ID_SRC, flags);
    pLocals->deferInvalidate = FLD_TEST_DRF(OS46, _FLAGS, _DEFER_TLB_INVALIDATION, _TRUE, flags) ?
                               DMA_DEFER_TLB_INVALIDATE : DMA_TLB_INVALIDATE;
    pLocals->bAllocVASpace = FLD_TEST_DRF(OS46, _FLAGS, _DMA_UNICAST_REUSE_ALLOC, _FALSE, flags);
    pLocals->bIsBar1 = (vaspaceGetFlags(pVAS) & VASPACE_FLAGS_BAR_BAR1) != 0;
    pLocals->bIsMIGMemPartitioningEnabled = (pKernelMIGManager != NULL) &&
                                            kmigmgrIsMIGMemPartitioningEnabled(pGpu, pKernelMIGManager);

    pLocals->cacheSnoop = (NVOS46_FLAGS_CACHE_SNOOP_ENABLE == DRF_VAL(OS46, _FLAGS, _CACHE_SNOOP, flags));
    pLocals->writeOnly  = (NVOS46_FLAGS_ACCESS_WRITE_ONLY == DRF_VAL(OS46, _FLAGS, _ACCESS, flags));
    pLocals->readOnly   = (NVOS46_FLAGS_ACCESS_READ_ONLY == DRF_VAL(OS46, _FLAGS, _ACCESS, flags)) ?
                          DMA_UPDATE_VASPACE_FLAGS_READ_ONLY : 0;
    pLocals->tlbLock    = (NVOS46_FLAGS_TLB_LOCK_ENABLE == DRF_VAL(OS46, _FLAGS, _TLB_LOCK, flags)) ?
                          DMA_UPDATE_VASPACE_FLAGS_TLB_LOCK : 0;

    switch (DRF_VAL(OS46, _FLAGS, _SHADER_ACCESS, flags))
    {
        default:
        case NVOS46_FLAGS_SHADER_ACCESS_DEFAULT:
            // In the default (0) case we pick up SHADER_ACCESS from ACCESS.
            pLocals->shaderFlags = 0;
            if (pLocals->readOnly)
                pLocals->shaderFlags |= DMA_UPDATE_VASPACE_FLAGS_SHADER_READ_ONLY;
            if (pLocals->writeOnly)
                pLocals->shaderFlags |= DMA_UPDATE_VASPACE_FLAGS_SHADER_WRITE_ONLY;
            break;
        case NVOS46_FLAGS_SHADER_ACCESS_READ_WRITE:
            pLocals->shaderFlags = 0;
            break;
        case NVOS46_FLAGS_SHADER_ACCESS_READ_ONLY:
            pLocals->shaderFlags = DMA_UPDATE_VASPACE_FLAGS_SHADER_READ_ONLY;
            break;
        case NVOS46_FLAGS_SHADER_ACCESS_WRITE_ONLY:
            pLocals->shaderFlags = DMA_UPDATE_VASPACE_FLAGS_SHADER_WRITE_ONLY;
            break;
    }

    addressTranslation = VAS_ADDRESS_TRANSLATION(pVAS);
    // In SRIOV-heavy, the plugin may map subheap allocations for itself using BAR1.
    NV_ASSERT_OK_OR_GOTO(status, vgpuIsCallingContextPlugin(pGpu, &bCallingContextPlugin), cleanup);
    if (bCallingContextPlugin)
        addressTranslation = FORCE_VMMU_TRANSLATION(pMemDesc, addressTranslation);

    if (pFabricVAS != NULL)
    {
        status = fabricvaspaceGetGpaMemdesc(pFabricVAS, pMemDesc, pGpu, &pAdjustedMemDesc);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Failed to get the adjusted memdesc for the fabric memdesc\n");
            goto cleanup;
        }
    }

    // Get pageSize
    pLocals->pTempMemDesc = memdescGetMemDescFromGpu(pAdjustedMemDesc, pGpu);

    // Get physical allocation granularity and page size.
    pLocals->pageArrayGranularity = pLocals->pTempMemDesc->pageArrayGranularity;
    pLocals->physPageSize = memdescGetPageSize(pLocals->pTempMemDesc, addressTranslation);

    // Retrieve the mapping page size from the flags.
    switch (DRF_VAL(OS46, _FLAGS, _PAGE_SIZE, flags))
    {
        case NVOS46_FLAGS_PAGE_SIZE_DEFAULT:
        case NVOS46_FLAGS_PAGE_SIZE_BOTH:
            pLocals->pageSize = memdescGetPageSize(pLocals->pTempMemDesc, addressTranslation);
            break;
        case NVOS46_FLAGS_PAGE_SIZE_4KB:
            pLocals->pageSize = RM_PAGE_SIZE;
            break;
        case NVOS46_FLAGS_PAGE_SIZE_BIG:
            // Arch-specific big page size (64K or 128K).
            pLocals->pageSize = pLocals->vaspaceBigPageSize;
            break;
        case NVOS46_FLAGS_PAGE_SIZE_HUGE:
            pLocals->pageSize = RM_PAGE_SIZE_HUGE;
            break;
        default:
            NV_PRINTF(LEVEL_ERROR, "Unknown page size flag encountered during mapping\n");
            status = NV_ERR_INVALID_ARGUMENT;
            goto cleanup;
    }

    NV_PRINTF(LEVEL_INFO, "Picked Page size based on flags: 0x%llx flagVal: 0x%x\n",
              pLocals->pageSize, DRF_VAL(OS46, _FLAGS, _PAGE_SIZE, flags));

    if (pLocals->physPageSize < pLocals->pageSize)
    {
        NV_PRINTF(LEVEL_WARNING, "Requested mapping at larger page size than the physical granularity "
                  "PhysPageSize = 0x%llx MapPageSize = 0x%llx. Overriding to physical page granularity...\n",
                  pLocals->physPageSize, pLocals->pageSize);
        pLocals->pageSize = pLocals->physPageSize;
    }

    if (memdescGetFlag(pLocals->pTempMemDesc, MEMDESC_FLAGS_DEVICE_READ_ONLY))
    {
        NV_ASSERT_OR_ELSE((pLocals->readOnly == DMA_UPDATE_VASPACE_FLAGS_READ_ONLY),
                          status = NV_ERR_INVALID_ARGUMENT; goto cleanup);
    }

    //
    // Force BAR1 VA pageSize at bigPageSize only if the total BAR1 size is less
    // than the threshold (default: 256MB), so as not to waste BAR1.
    // For large-BAR1 SKUs, avoid forcing 64KB size and use the page size of
    // the memdesc.
    //
    if (kgmmuIsVaspaceInteropSupported(pKernelGmmu) &&
        pLocals->bIsBar1)
    {
        if ((pLocals->pageSize > pLocals->vaspaceBigPageSize) &&
            kbusIsBar1Force64KBMappingEnabled(pKernelBus))
        {
            pLocals->pageSize = pLocals->vaspaceBigPageSize;

            SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY | SLI_LOOP_FLAGS_IGNORE_REENTRANCY)
            memdescSetPageSize(memdescGetMemDescFromGpu(pAdjustedMemDesc, pGpu),
                               addressTranslation, (NvU32)pLocals->pageSize);
            SLI_LOOP_END
        }
    }

    pLocals->pageShift = BIT_IDX_32(pLocals->pageArrayGranularity);

    // Get mapping params on current gpu memdesc
    pLocals->pageOffset = memdescGetPhysAddr(pLocals->pTempMemDesc, addressTranslation, 0) & (pLocals->pageSize - 1);
    pLocals->mapLength = RM_ALIGN_UP(pLocals->pageOffset + pLocals->pTempMemDesc->Size, pLocals->pageSize);
    pLocals->pageCount = NvU64_LO32(pLocals->mapLength >> pLocals->pageShift);
    pLocals->bIsMemContiguous = memdescGetContiguity(pLocals->pTempMemDesc, addressTranslation);

    pLocals->kind = NV_MMU_PTE_KIND_PITCH;

    // Get compression/pte pLocals->kind on current gpu memdesc
    status = memmgrGetKindComprFromMemDesc(pMemoryManager,
                                           pLocals->pTempMemDesc,
                                           0,
                                           &pLocals->kind, &pLocals->comprInfo);

    if (NV_OK != status)
        goto cleanup;

    //
    // When compression is enabled, mapping at 4K is not supported because
    // RM allocates one comptagline per 64KB allocation (from Pascal to Turing).
    // See bug 3909010
    //
    // Skipping it for Raw mode, see bug 4036809
    //
    if ((pLocals->pageSize == RM_PAGE_SIZE) &&
        memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, pLocals->kind) &&
        !(pMemorySystemConfig->bUseRawModeComptaglineAllocation))
    {
        NV_PRINTF(LEVEL_WARNING, "Requested 4K mapping on compressible surface. Overriding to physical page granularity...\n");
        pLocals->pageSize = pLocals->physPageSize;
    }

#ifdef DEBUG
    // Check for subdevice consistency if a broadcast memdesc is passed in
    if (memdescHasSubDeviceMemDescs(pAdjustedMemDesc))
    {
        // Check pageOffset, pageSize consistency across subdevices
        memdescCheckSubDevicePageSizeConsistency(pGpu, pAdjustedMemDesc, pVAS, pLocals->pageSize, pLocals->pageOffset);

        // Check mem contiguity consistency across subdevices
        memdescCheckSubDeviceMemContiguityConsistency(pGpu, pAdjustedMemDesc, pVAS, pLocals->bIsMemContiguous);

        // Check compression/pte pLocals->kind consistency across subdevices
        status = memdescCheckSubDeviceKindComprConsistency(pGpu, pAdjustedMemDesc, pVAS,
                                                           pLocals->kind, &pLocals->comprInfo);
        NV_ASSERT(!status);
    }
#endif

    //
    //               +-- +-- +------------+ --+
    //               |   |   |            |   |==> pageOffset
    //  pageSize <===+   |   |   Page 0   | --+
    //               |   |   |            |   |
    //               +-- |   +------------+   |
    //                   |   |            |   |
    //                   |   |   Page 1   |   |
    //                   |   |            |   |
    // mapLength <===----+   +------------+   |==> pMemDesc->Size
    //                   |   |            |   |
    //                   |   |    ...     |   |
    //                   |   |            |   |
    //                   |   +------------+   |
    //                   |   |            |   |
    //                   |   |  Page N-1  | --+
    //                   |   |            |
    //               +-- +-- +------------+
    //

    if (pLocals->bIsMemContiguous)
    {
        // FIXME: Throwing away physical length information is dangerous.
        pLocals->pteCount = 1;
    }
    else
    {
        // FIXME: This is broken for page size > 4KB and a page offset
        //        that crosses a page boundary (can overrun pPteArray).
        // --
        // The page count is rounded up so that, when a page offset is present,
        // it is one more than the plain integral division.
        pLocals->pteCount = RM_ALIGN_UP((pLocals->pTempMemDesc->Size + pLocals->pageOffset), pLocals->pageArrayGranularity) >> BIT_IDX_32(pLocals->pageArrayGranularity);
    }
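
    //
    // Worked example (illustrative numbers only): for a 200KB (0x32000) surface
    // starting at physical offset 0xA123 within a 64KB mapping page:
    //   pageOffset = physAddr & 0xFFFF                      = 0xA123
    //   mapLength  = ALIGN_UP(0xA123 + 0x32000, 0x10000)    = 0x40000 (4 big pages)
    //   pteCount (non-contig, 4K granularity)
    //              = ALIGN_UP(0x32000 + 0xA123, 0x1000) >> 12 = 0x3D entries
    //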

    // Disable PLC compression for FLA->PA mappings because of HW bug 3046774.
    if (pMemorySystemConfig->bUseRawModeComptaglineAllocation &&
        pKernelMemorySystem->bDisablePlcForCertainOffsetsBug3046774)
    {
        MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

        if (((vaspaceGetFlags(pVAS) & VASPACE_FLAGS_FLA) || (dynamicCast(pVAS, FABRIC_VASPACE) != NULL)) &&
            memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, pLocals->kind) &&
            !memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_DISALLOW_PLC, pLocals->kind))
        {
            memmgrGetDisablePlcKind_HAL(pMemoryManager, &pLocals->kind);
        }
    }

    if (pLocals->bIsBarOrPerf)
    {
        pLocals->totalVaRange = rangeMake(vaspaceGetVaStart(pVAS), vaspaceGetVaLimit(pVAS));

        // !!!! Nasty hack
        //
        // NVOS46_FLAGS_PTE_COALESCE_LEVEL_CAP is reused to pass encryption info
        // to _busMapAperture_GF100(), since we have no bit fields left in
        // NVOS46_FLAGS_* to specify encryption info.
        // This is applicable to FERMI+ chips.
        //
        // NVOS46_FLAGS_PTE_COALESCE_LEVEL_CAP is _NV50 specific, and is not used in FERMI+.
        // NVOS46_FLAGS_PTE_COALESCE_LEVEL_CAP_DEFAULT means use the default encryption status.
        // NVOS46_FLAGS_PTE_COALESCE_LEVEL_CAP_1 means disable encryption.
        //
        // VMM-TODO: Add meaningful alias defines or just expand flag bits?
        //
        pLocals->disableEncryption = FLD_TEST_DRF(OS46, _FLAGS, _PTE_COALESCE_LEVEL_CAP, _1, flags) ?
                                     DMA_UPDATE_VASPACE_FLAGS_DISABLE_ENCRYPTION : 0;

        if (pLocals->bIsMemContiguous)
        {
            pLocals->overMap = pLocals->pageCount + NvU64_LO32((pLocals->pageOffset + (pLocals->pageSize - 1)) / pLocals->pageSize);
        }
        else
        {
            pLocals->overMap = pLocals->pageCount;
        }
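
        //
        // Illustrative example (not from a real allocation): with the numbers
        // above (pageOffset = 0xA123, pageSize = 0x10000), the contiguous case
        // adds ceil(pageOffset / pageSize) = 1 extra page of overmap to cover
        // the leading partial page.
        //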

        NV_ASSERT_OK_OR_GOTO(status, vgpuGetCallingContextGfid(pGpu, &gfid), cleanup);

        // BAR1 VA space is split when MIG mem partitioning is enabled
        if (pLocals->bIsBar1 && pLocals->bIsMIGMemPartitioningEnabled && IS_GFID_PF(gfid))
        {
            KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);

            pLocals->totalVaRange = memmgrGetMIGPartitionableBAR1Range(pGpu, pMemoryManager);
            NV_ASSERT_OK_OR_GOTO(status,
                kmemsysSwizzIdToMIGMemRange(pGpu, pKernelMemorySystem, swizzId, pLocals->totalVaRange, &pLocals->totalVaRange),
                cleanup);
        }

        if (!FLD_TEST_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE, flags))
        {
            pLocals->vaRangeLo = pLocals->totalVaRange.lo;
            pLocals->vaRangeHi = pLocals->totalVaRange.hi;
        }
    }
    else
    {
        NvU64 targetSpaceLength, targetSpaceBase, targetSpaceLimit;

        NV_ASSERT((pLocals->pageSize == pLocals->vaspaceBigPageSize) ||
                  (pLocals->pageSize == RM_PAGE_SIZE) ||
                  (pLocals->pageSize == RM_PAGE_SIZE_HUGE) ||
                  (pLocals->pageSize == RM_PAGE_SIZE_512M));

        pLocals->overMap = 0;

        if (pCliMapInfo != NULL)
        {
            VirtualMemory *pVirtualMemory = pCliMapInfo->pVirtualMemory;

            virtmemGetAddressAndSize(pVirtualMemory, &targetSpaceBase, &targetSpaceLength);
            targetSpaceLimit = targetSpaceBase + targetSpaceLength - 1;
        }
        else
        {
            // RM internal mappings. Alternative to dmaMapBuffer_HAL().
            targetSpaceBase = vaspaceGetVaStart(pVAS);
            targetSpaceLimit = vaspaceGetVaLimit(pVAS);
            targetSpaceLength = targetSpaceLimit - targetSpaceBase + 1;
        }

        if (pLocals->pteCount > ((targetSpaceLength + (pLocals->pageArrayGranularity - 1)) / pLocals->pageArrayGranularity))
        {
            NV_ASSERT(0);
            status = NV_ERR_INVALID_ARGUMENT;
            goto cleanup;
        }

        pVASpaceHeap = vaspaceGetHeap(pVAS);

        if (!FLD_TEST_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE, flags))
        {
            // The offset of the context dma is passed in when the ctxdma is allocated.
            // Virtual memory doesn't have any SMMU mapping. It is still OK to use
            // the engine MMU context; it doesn't have any effect.

            pLocals->vaRangeLo = NV_MAX(targetSpaceBase, vaspaceGetVaStart(pVAS));
            pLocals->vaRangeHi = NV_MIN(targetSpaceLimit, vaspaceGetVaLimit(pVAS));

            //
            // Handle 32-bit pointer requests. 32-bit pointers are forced below
            // the 32-bit boundary on all chips. Non-32-bit requests are only
            // forced on some chips, typically Kepler, and only if there are no
            // other address hints.
            //
            if (DRF_VAL(OS46, _FLAGS, _32BIT_POINTER, flags) ==
                NVOS46_FLAGS_32BIT_POINTER_ENABLE)
            {
                pLocals->vaRangeHi = NV_MIN(0xffffffff, pLocals->vaRangeHi);
            }
            else if (pDma->getProperty(pDma, PDB_PROP_DMA_ENFORCE_32BIT_POINTER) &&
                     (pVASpaceHeap->free > NVBIT64(32))) // Pressured address spaces are exempt
            {
                pLocals->vaRangeLo = NV_MAX(NVBIT64(32), pLocals->vaRangeLo);
            }
        }
    }

    //
    // Align the virtual address passed in down to the page size.
    //
    // There is no requirement that the physical offset of a mapping
    // be page-aligned, so we need to map the entire page that contains
    // the desired offset. We then add the page offset
    // onto the returned virtual address.
    //
    pLocals->vaLo = RM_ALIGN_DOWN(*pVaddr, pLocals->pageSize);
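
    //
    // For example (illustrative values only): with *pVaddr = 0x1F2A123 and a
    // 64KB page size, vaLo = 0x1F20000. The physical pageOffset computed above
    // is added back to produce the final *pVaddr at the end of this function.
    //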

    if (pLocals->bAllocVASpace)
    {
        //
        // TODO: This flag handling logic should be consolidated with dmaMapBuffer_GM107
        //       when the old path is removed.
        //
        VAS_ALLOC_FLAGS allocFlags = {0};
        NvU64 compAlign = NVBIT64(pLocals->comprInfo.compPageShift);
        NvU64 vaAlign   = NV_MAX(pLocals->pageSize, compAlign);
        NvU64 vaSize    = RM_ALIGN_UP(pLocals->mapLength, vaAlign);
        NvU64 pageSizeLockMask = 0;
        pGVAS = dynamicCast(pVAS, OBJGVASPACE);

        if (FLD_TEST_DRF(OS46, _FLAGS, _PAGE_SIZE, _BOTH, flags))
        {
            vaAlign = NV_MAX(vaAlign, pLocals->vaspaceBigPageSize);
            vaSize  = RM_ALIGN_UP(pLocals->mapLength, vaAlign);
        }
        //
        // The third-party code path, nvidia_p2p_get_pages, expects BAR1 VA to
        // always be aligned at 64K.
        //
        // Also, RmMapMemory on PPC64LE expects BAR1 VA to be aligned at 64K.
        //
        if (pLocals->bIsBar1)
        {
            vaAlign = NV_MAX(vaAlign, pLocals->vaspaceBigPageSize);
            vaSize  = RM_ALIGN_UP(pLocals->mapLength, vaAlign);
        }
        if (FLD_TEST_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE, flags))
        {
            pLocals->vaRangeLo = pLocals->vaLo;
            pLocals->vaRangeHi = pLocals->vaLo + vaSize - 1;
            if (pLocals->bIsBar1)
            {
                NV_RANGE requestedRange = rangeMake(pLocals->vaRangeLo, pLocals->vaRangeHi);
                if (!rangeContains(pLocals->totalVaRange, requestedRange))
                {
                    NV_PRINTF(LEVEL_ERROR, "Requested BAR1 VA Lo=0x%llx Hi=0x%llx\n"
                              "total BAR1 VA range Lo=0x%llx Hi=0x%llx\n",
                              requestedRange.lo, requestedRange.hi,
                              pLocals->totalVaRange.lo, pLocals->totalVaRange.hi);
                    status = NV_ERR_INVALID_ARGUMENT;
                    DBG_BREAKPOINT();
                    goto cleanup;
                }
            }
            if (pGVAS != NULL && gvaspaceIsInternalVaRestricted(pGVAS))
            {
                if ((pLocals->vaRangeLo >= pGVAS->vaStartInternal && pLocals->vaRangeLo <= pGVAS->vaLimitInternal) ||
                    (pLocals->vaRangeHi <= pGVAS->vaLimitInternal && pLocals->vaRangeHi >= pGVAS->vaStartInternal))
                {
                    status = NV_ERR_INVALID_PARAMETER;
                    goto cleanup;
                }
            }
        }
        else if (pDma->getProperty(pDma, PDB_PROP_DMA_RESTRICT_VA_RANGE))
        {
            // See comments in vaspaceFillAllocParams_IMPL.
            pLocals->vaRangeHi = NV_MIN(pLocals->vaRangeHi, NVBIT64(40) - 1);
        }

        if (FLD_TEST_DRF(OS46, _FLAGS, _PAGE_SIZE, _BOTH, flags))
        {
            NV_ASSERT(pLocals->pageSize <= pLocals->vaspaceBigPageSize);
            pageSizeLockMask |= RM_PAGE_SIZE;
            pageSizeLockMask |= pLocals->vaspaceBigPageSize;
        }
        else
        {
            pageSizeLockMask |= pLocals->pageSize;
        }

        allocFlags.bReverse = FLD_TEST_DRF(OS46, _FLAGS, _DMA_OFFSET_GROWS, _DOWN, flags);

        //
        // Feature requested for RM unlinked SLI:
        // Clients can pass an allocation flag to the device or VA space constructor
        // so that mappings and allocations will fail without an explicit address.
        //
        if (pGVAS != NULL)
        {
            if ((pGVAS->flags & VASPACE_FLAGS_REQUIRE_FIXED_OFFSET) &&
                !FLD_TEST_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE, flags))
            {
                status = NV_ERR_INVALID_ARGUMENT;
                NV_PRINTF(LEVEL_ERROR, "The VA space requires all allocations to specify a fixed address\n");
                goto cleanup;
            }

            //
            // Bug 3610538: clients can allocate GPU VA during mapping for ctx dma.
            // But if clients enable RM to map internal buffers in a reserved
            // range of VA for unlinked SLI in Linux, we want to tag these
            // allocations as "client allocated", so that they fall outside of
            // the RM internal region.
            //
            if (gvaspaceIsInternalVaRestricted(pGVAS))
            {
                allocFlags.bClientAllocation = NV_TRUE;
            }
        }

        status = vaspaceAlloc(pVAS, vaSize, vaAlign, pLocals->vaRangeLo, pLocals->vaRangeHi,
                              pageSizeLockMask, allocFlags, &pLocals->vaLo);
        if (NV_OK != status)
        {
            NV_PRINTF(LEVEL_ERROR, "can't alloc VA space for mapping.\n");
            goto cleanup;
        }
        NV_ASSERT_OR_ELSE(0 == (pLocals->vaLo & (pLocals->pageSize - 1)),
                          status = NV_ERR_INVALID_STATE;
                          goto cleanup; );
        NV_ASSERT_OR_ELSE(vaSize >= pLocals->mapLength,
                          status = NV_ERR_INVALID_STATE;
                          goto cleanup; );

        //
        // Handle overmapping for BAR1.
        //
        // BAR1 VA is allocated at big page size granularity
        // regardless of the physical memory size being mapped.
        // Unmapped regions of BAR1 need to be mapped to dummy
        // pages (or sparse) to avoid faults on PCIe prefetch.
        //
        // Overmap solves this by wrapping around the target physical
        // memory for the remainder of the last big page so
        // any left over 4K pages are "scratch invalidated."
        //
        // When this is used, the mapLength must be extended to
        // the entire VA range, and dmaUpdateVASpace
        // takes care of the overMap modulus.
        //
        // TODO: With VMM enabled, BAR1 scratch invalidate is handled
        //       transparently with SW (or HW) sparse support.
        //       Removing this special overmap logic should be
        //       possible when the old VAS path is fully
        //       deprecated.
        //
        // See Bug 200090426.
        //
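        //
        // Worked example (illustrative): mapping 0xA000 bytes of contiguous
        // memory into a 64K-granular BAR1 VA allocation leaves 0x6000 of the
        // big page unbacked; overmap wraps the PTE writes around modulo the
        // physical page count, so those trailing 4K PTEs point back into the
        // mapped surface instead of faulting on PCIe prefetch.
        //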
        if (pLocals->overMap != 0)
        {
            pLocals->mapLength = vaSize;
        }
    }
    else
    {
        //
        // We are mapping to an existing virtual memory allocation.
        //
        // The virtual offset passed in may or may not account for
        // the page offset. Check for either the page-aligned case or
        // the adjusted case to ensure clients are not requesting
        // bogus offsets.
        //
        if (((*pVaddr - pLocals->vaLo) != 0) &&
            ((*pVaddr - pLocals->vaLo) != pLocals->pageOffset))
        {
            NV_PRINTF(LEVEL_ERROR,
                      "Virtual address 0x%llX is not compatible with page size 0x%llX or page"
                      " offset 0x%llX.\n", *pVaddr, pLocals->pageSize, pLocals->pageOffset);
            DBG_BREAKPOINT();
            status = NV_ERR_INVALID_OFFSET;
            goto cleanup;
        }
    }

    //
    // Calculate the mapping's virtual address limit from the mapping
    // length, which is derived from the number of physical pages to map.
    //
    pLocals->vaHi = pLocals->vaLo + pLocals->mapLength - 1;

    if (pLocals->p2p == NVOS46_FLAGS_P2P_ENABLE_NOSLI)
    {
        NV_ASSERT_OR_GOTO(pLocals->pMemory != NULL, fail_post_register);

        FlaMemory *pFlaMemory = dynamicCast(pLocals->pMemory, FlaMemory);
        if (pFlaMemory != NULL)
        {
            pLocals->pSrcGpu    = gpumgrGetGpu(pFlaMemory->peerGpuInst);
            pLocals->bFlaImport = NV_TRUE;

            if (!pLocals->pSrcGpu)
            {
                NV_PRINTF(LEVEL_ERROR, "Cannot map FLA Memory without a valid srcGpu, failing....\n");
                status = NV_ERR_INVALID_ARGUMENT;
                DBG_BREAKPOINT();
                goto fail_post_register;
            }
        }
        else
        {
            pLocals->pSrcGpu = pLocals->pMemory->pGpu;

            // XXX - is this required here if we disable SLI BC below?
            GPU_RES_SET_THREAD_BC_STATE(pLocals->pMemory->pDevice);
        }

        if (IsSLIEnabled(pLocals->pSrcGpu))
        {
            NvU32 deviceInstance = gpuGetDeviceInstance(pLocals->pSrcGpu);

            pLocals->pSrcGpu = gpumgrGetGpuFromSubDeviceInst(deviceInstance, pLocals->subDevIdSrc);
            gpumgrSetBcEnabledStatus(pLocals->pSrcGpu, NV_FALSE);
        }

        pLocals->peerNumber = kbusGetPeerId_HAL(pGpu, pKernelBus, pLocals->pSrcGpu);

        // We only needed pLocals->pSrcGpu for the one line above; swap back now.
        if (IsSLIEnabled(pLocals->pSrcGpu))
        {
            pLocals->pSrcGpu = gpumgrGetParentGPU(pLocals->pSrcGpu);
            gpumgrSetBcEnabledStatus(pLocals->pSrcGpu, NV_TRUE);
        }

        NV_PRINTF(LEVEL_INFO,
                  "P2P LOOPBACK setup with physical vidmem at 0x%llx and virtual address "
                  "at 0x%llx\n",
                  memdescGetPhysAddr(pAdjustedMemDesc, addressTranslation, 0), pLocals->vaLo);
    }
    else if (pLocals->p2p == NVOS46_FLAGS_P2P_ENABLE_SLI)
    {
        //
        // All the peer GPUs will have valid PTEs written as
        // P2P mappings. The local GPU will have this region marked as
        // invalid.
        //
        const NvU32 deviceInst = gpuGetDeviceInstance(pGpu);
        pLocals->pSrcGpu = gpumgrGetGpuFromSubDeviceInst(deviceInst, pLocals->subDevIdSrc);
    }

    pLocals->pRootMemDesc = memdescGetRootMemDesc(pAdjustedMemDesc, NULL);
    if (memdescGetAddressSpace(pLocals->pRootMemDesc) == ADDR_FBMEM)
    {
        if (gpumgrCheckIndirectPeer(pGpu, pLocals->pRootMemDesc->pGpu))
        {
            pLocals->indirectPeer = DMA_UPDATE_VASPACE_FLAGS_INDIRECT_PEER;
        }
    }

    SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY | SLI_LOOP_FLAGS_IGNORE_REENTRANCY)
    {
        if (pLocals->p2p)
        {
            if (pLocals->bFlaImport)
            {
                pLocals->pTempMemDesc = memdescGetMemDescFromGpu(pAdjustedMemDesc, pGpu);
            }
            else
            {
                pLocals->pTempMemDesc = memdescGetMemDescFromGpu(pAdjustedMemDesc, pLocals->pSrcGpu);
            }
        }
        else
        {
            pLocals->pTempMemDesc = memdescGetMemDescFromGpu(pAdjustedMemDesc, pGpu);
        }

        // Commit the mapping update
        pLocals->pPteArray = memdescGetPteArray(pLocals->pTempMemDesc, addressTranslation);

        dmaPageArrayInitWithFlags(&pLocals->pageArray, pLocals->pPteArray, pLocals->pteCount,
                                  pLocals->pageArrayFlags);

        // Get pLocals->aperture
        if (memdescGetAddressSpace(pLocals->pTempMemDesc) == ADDR_FBMEM)
        {
            if (pLocals->p2p)
            {
                pLocals->aperture = NV_MMU_PTE_APERTURE_PEER_MEMORY;
            }
            else if (pLocals->indirectPeer)
            {
                pLocals->aperture = NV_MMU_PTE_APERTURE_SYSTEM_COHERENT_MEMORY;
            }
            else
            {
                pLocals->aperture = NV_MMU_PTE_APERTURE_VIDEO_MEMORY;
            }
        }
        else if (
                 (memdescGetAddressSpace(pLocals->pTempMemDesc) == ADDR_FABRIC_MC) ||
                 (memdescGetAddressSpace(pLocals->pTempMemDesc) == ADDR_FABRIC_V2))
        {
            OBJGPU *pMappingGpu = pGpu;
            OBJGPU *pPeerGpu;
            pLocals->peerNumber = BUS_INVALID_PEER;

            if (pLocals->pMemory == NULL)
            {
                status = NV_ERR_INVALID_STATE;
                DBG_BREAKPOINT();
                SLI_LOOP_BREAK;
            }

            pPeerGpu = pLocals->pMemory->pGpu;

            if (memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, pLocals->kind))
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Fabric memory should not be compressible.\n");
                status = NV_ERR_INVALID_STATE;
                DBG_BREAKPOINT();
                SLI_LOOP_BREAK;
            }

            pLocals->aperture = NV_MMU_PTE_APERTURE_PEER_MEMORY;

            if (!memIsGpuMapAllowed(pLocals->pMemory, pMappingGpu))
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Mapping Gpu is not attached to the given memory object\n");
                status = NV_ERR_INVALID_STATE;
                DBG_BREAKPOINT();
                SLI_LOOP_BREAK;
            }

            if (pPeerGpu != NULL)
            {
                if (IS_VIRTUAL_WITH_SRIOV(pMappingGpu) &&
                    !gpuIsWarBug200577889SriovHeavyEnabled(pMappingGpu))
                {
                    pLocals->peerNumber = kbusGetNvlinkPeerId_HAL(pMappingGpu,
                                                                  GPU_GET_KERNEL_BUS(pMappingGpu),
                                                                  pPeerGpu);
                }
                else
                {
                    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pMappingGpu);

                    if ((pKernelNvlink != NULL) &&
                        knvlinkIsNvlinkP2pSupported(pMappingGpu, pKernelNvlink, pPeerGpu))
                    {
                        pLocals->peerNumber = kbusGetPeerId_HAL(pMappingGpu, GPU_GET_KERNEL_BUS(pMappingGpu),
                                                                pPeerGpu);
                    }
                }
            }
            else
            {
                pLocals->peerNumber = kbusGetNvSwitchPeerId_HAL(pMappingGpu,
                                                                GPU_GET_KERNEL_BUS(pMappingGpu));
            }

            if (pLocals->peerNumber == BUS_INVALID_PEER)
            {
                status = NV_ERR_INVALID_STATE;
                DBG_BREAKPOINT();
                SLI_LOOP_BREAK;
            }
        }
        else if (memdescIsEgm(pLocals->pTempMemDesc))
        {
            pLocals->aperture = NV_MMU_PTE_APERTURE_PEER_MEMORY;

            if (pLocals->p2p)
            {
                OBJGPU *pMappingGpu = pGpu;
                OBJGPU *pPeerGpu;

                NV_ASSERT_OR_ELSE(pLocals->pMemory != NULL, status = NV_ERR_INVALID_STATE; goto cleanup);

                pPeerGpu = pLocals->pMemory->pGpu;
                pLocals->peerNumber = kbusGetEgmPeerId_HAL(pMappingGpu, GPU_GET_KERNEL_BUS(pMappingGpu), pPeerGpu);
            }
            else
            {
                //
                // Make sure that we receive a mapping request for EGM memory
                // only if local EGM is enabled.
                //
                NV_ASSERT_OR_ELSE(pMemoryManager->bLocalEgmEnabled, status = NV_ERR_INVALID_STATE; goto cleanup);
                pLocals->peerNumber = pMemoryManager->localEgmPeerId;
            }
        }
        else
        {
            // No P2P for system memory
            if (pLocals->p2p)
            {
                status = NV_ERR_INVALID_ARGUMENT;
                NV_PRINTF(LEVEL_ERROR, "No P2P for system memory.\n");
                SLI_LOOP_BREAK;
            }

            if (pLocals->cacheSnoop || memdescGetFlag(pAdjustedMemDesc, MEMDESC_FLAGS_MAP_SYSCOH_OVER_BAR1))
            {
                pLocals->aperture = NV_MMU_PTE_APERTURE_SYSTEM_COHERENT_MEMORY;
            }
            else
            {
                pLocals->aperture = NV_MMU_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY;
            }
        }

        if (pLocals->p2p == NVOS46_FLAGS_P2P_ENABLE_SLI)
        {
            if (pLocals->pSrcGpu == pGpu)
            {
                // Leave the local GPU VA range unmapped (invalid).
                SLI_LOOP_CONTINUE;
            }
            else
            {
                pLocals->peerNumber = kbusGetPeerId_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu), pLocals->pSrcGpu);
            }
        }

        if (pLocals->aperture == NV_MMU_PTE_APERTURE_PEER_MEMORY &&
            pLocals->peerNumber == BUS_INVALID_PEER)
        {
            status = NV_ERR_INVALID_STATE;
            DBG_BREAKPOINT();
            SLI_LOOP_BREAK;
        }

        //
        // Fabric memory descriptors are pre-encoded with the fabric base address;
        // use NVLINK_INVALID_FABRIC_ADDR to avoid encoding twice.
        //
        // Skip the fabric base address for local EGM, as it uses the peer aperture
        // but doesn't require a fabric address.
        //
        if (pLocals->bFlaImport ||
            (memdescGetAddressSpace(pLocals->pTempMemDesc) == ADDR_FABRIC_MC) ||
            (memdescGetAddressSpace(pLocals->pTempMemDesc) == ADDR_FABRIC_V2) ||
            (memdescIsEgm(pLocals->pTempMemDesc) && (pGpu == pLocals->pSrcGpu)))
        {
            pLocals->fabricAddr = NVLINK_INVALID_FABRIC_ADDR;
        }
        else
        {
            // Get the EGM fabric address for remote EGM.
            if (memdescIsEgm(pLocals->pTempMemDesc))
            {
                status = _dmaGetFabricEgmAddress(pLocals->pSrcGpu, pLocals->aperture,
                                                 pLocals->kind, &pLocals->fabricAddr);
            }
            else
            {
                status = _dmaGetFabricAddress(pLocals->pSrcGpu, pLocals->aperture,
                                              pLocals->kind, &pLocals->fabricAddr);
            }

            if (status != NV_OK)
            {
                DBG_BREAKPOINT();
                SLI_LOOP_BREAK;
            }
        }

        pDma = GPU_GET_DMA(pGpu);

        status = dmaUpdateVASpace_HAL(pGpu, pDma,
                                      pVAS,
                                      pLocals->pTempMemDesc,
                                      NULL,
                                      pLocals->vaLo, pLocals->vaHi,
                                      DMA_UPDATE_VASPACE_FLAGS_UPDATE_ALL | pLocals->readOnly | pLocals->priv |
                                          pLocals->tlbLock | pLocals->shaderFlags | pLocals->disableEncryption |
                                          pLocals->indirectPeer,
                                      &pLocals->pageArray, pLocals->overMap,
                                      &pLocals->comprInfo,
                                      0,
                                      NV_MMU_PTE_VALID_TRUE,
                                      pLocals->aperture,
                                      pLocals->peerNumber,
                                      pLocals->fabricAddr,
                                      pLocals->deferInvalidate,
                                      NV_FALSE,
                                      pLocals->pageSize);
        if (NV_OK != status)
        {
            NV_PRINTF(LEVEL_ERROR,
                      "can't update VA space for mapping @vaddr=0x%llx\n",
                      pLocals->vaLo);
            DBG_BREAKPOINT();
            SLI_LOOP_BREAK;
        }
    }
    SLI_LOOP_END

    if (NV_OK == status)
    {
        //
        // Fill in the final virtual address of this mapping.
        //
        // This accounts for the page offset in all cases, whether or not
        // the input *pVaddr accounted for it.
        //
        *pVaddr = pLocals->vaLo + pLocals->pageOffset;

        // Fill in the final mapping page size for client mappings.
        if (pCliMapInfo != NULL)
        {
            pCliMapInfo->pDmaMappingInfo->mapPageSize = pLocals->pageSize;
        }

        SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY | SLI_LOOP_FLAGS_IGNORE_REENTRANCY)
        // This is needed for cliDB tracking of the map.
        memdescSetPageSize(memdescGetMemDescFromGpu(pAdjustedMemDesc, pGpu), addressTranslation, pLocals->pageSize);
        SLI_LOOP_END
    }
    else
    {
fail_post_register:
        if (pLocals->pMapNode)
            btreeUnlink(pLocals->pMapNode, &pLocals->pVASInfo->pMapTree);

        portMemFree(pLocals->pMapNode);

        // Only free the VA allocation if we allocated it here.
        if (pLocals->bAllocVASpace)
        {
            vaspaceFree(pVAS, pLocals->vaLo);
        }
    }

cleanup:

    if (pAdjustedMemDesc != pMemDesc)
        fabricvaspacePutGpaMemdesc(pFabricVAS, pAdjustedMemDesc);

    portMemFree(pLocals);

    return status;
}

/*!
 * @brief Unmap a virtual allocation.
 *
 * For client allocations, invalidate the page tables, but don't bother freeing.
 * For internal allocations, free the allocation, but don't bother invalidating.
 * Wait, what?
 *
 * VMM-TODO: Split into two APIs - one for clients, one for internal?
 *
 * @param[in] pGpu        OBJGPU pointer
 * @param[in] pDma        VirtMemAllocator pointer
 * @param[in] pVAS        OBJVASPACE pointer
 * @param[in] vAddr       Virtual memory base address
 * @param[in] pMemDesc    Physical memory descriptor
 * @param[in] flags       Unmap options
 * @param[in] pCliMapInfo CLI_DMA_ALLOC_MAP_INFO pointer (for RM client mappings)
 *
 * @returns NV_OK on success, or an error status upon failure.
 */
NV_STATUS
dmaFreeMapping_GM107
(
    OBJGPU                 *pGpu,
    VirtMemAllocator       *pDma,
    OBJVASPACE             *pVAS,
    NvU64                   vAddr,
    MEMORY_DESCRIPTOR      *pMemDesc,
    NvU32                   flags,
    CLI_DMA_ALLOC_MAP_INFO *pCliMapInfo
)
{
    VirtualMemory *pVirtualMemory = NULL;
    NvU32          p2p = NVOS46_FLAGS_P2P_ENABLE_NONE;
    NvU64          vaLo;
    NvU64          vaHi;
    NvU64          mapLength;
    NvU64          pageOffset;
    NvU64          pageSize;
    NvU32          deferInvalidate;
    NvU32          subDevIdSrc;
    OBJGPU        *pLocalGpu = NULL;

    NV_STATUS status = NV_OK;
    MEMORY_DESCRIPTOR *pTempMemDesc = NULL;

    NV_ASSERT_OR_RETURN(NULL != pMemDesc, NV_ERR_INVALID_ARGUMENT);

    SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY | SLI_LOOP_FLAGS_IGNORE_REENTRANCY)
    // Ensure the page size has been set before continuing.
    NV_ASSERT(memdescGetPageSize(memdescGetMemDescFromGpu(pMemDesc, pGpu), VAS_ADDRESS_TRANSLATION(pVAS)) != 0);
    SLI_LOOP_END

    if (pCliMapInfo)
    {
        p2p = DRF_VAL(OS46, _FLAGS, _P2P_ENABLE, pCliMapInfo->pDmaMappingInfo->Flags);
        subDevIdSrc = DRF_VAL(OS46, _FLAGS, _P2P_SUBDEV_ID_SRC, pCliMapInfo->pDmaMappingInfo->Flags);
        pVirtualMemory = pCliMapInfo->pVirtualMemory;
    }

    if (p2p == NVOS46_FLAGS_P2P_ENABLE_SLI)
    {
        const NvU32 deviceInst = gpuGetDeviceInstance(pGpu);
        pLocalGpu = gpumgrGetGpuFromSubDeviceInst(deviceInst, subDevIdSrc);
    }

    deferInvalidate = DRF_VAL(OS47, _FLAGS, _DEFER_TLB_INVALIDATION, flags) ? DMA_DEFER_TLB_INVALIDATE : DMA_TLB_INVALIDATE;

    // Handle the NV50_MEMORY_VIRTUAL use case
    if ((pVirtualMemory != NULL) && pVirtualMemory->bReserveVaOnAlloc)
    {
        SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY | SLI_LOOP_FLAGS_IGNORE_REENTRANCY)
        {
            if (p2p == NVOS46_FLAGS_P2P_ENABLE_SLI)
            {
                if (pLocalGpu == pGpu)
                {
                    SLI_LOOP_CONTINUE;
                }
            }

            pTempMemDesc = memdescGetMemDescFromGpu(pMemDesc, pGpu);

            NV_ASSERT_OR_RETURN(pCliMapInfo != NULL, NV_ERR_INVALID_STATE);
            NV_ASSERT_OR_RETURN(pCliMapInfo->pDmaMappingInfo->mapPageSize != 0, NV_ERR_INVALID_STATE);

            pageSize   = pCliMapInfo->pDmaMappingInfo->mapPageSize;
            pageOffset = memdescGetPhysAddr(pTempMemDesc, VAS_ADDRESS_TRANSLATION(pVAS), 0) & (pageSize - 1);
            mapLength  = RM_ALIGN_UP(pageOffset + pTempMemDesc->Size, pageSize);
            vaLo       = RM_ALIGN_DOWN(vAddr, pageSize);
            vaHi       = vaLo + mapLength - 1;

            pDma = GPU_GET_DMA(pGpu);
            if (vaspaceGetFlags(pVAS) & VASPACE_FLAGS_BAR_BAR1)
            {
                NV_PRINTF(LEVEL_ERROR, "Using dmaFreeMapping with sparse == False in BAR1 path!\n");
                NV_ASSERT(0);
                return status;
            }

            status = dmaUpdateVASpace_HAL(pGpu, pDma,
                                          pVAS,
                                          pTempMemDesc,
                                          NULL,
                                          vaLo, vaHi,
                                          DMA_UPDATE_VASPACE_FLAGS_UPDATE_VALID, // only change validity
                                          NULL, 0,
                                          NULL, 0,
                                          NV_MMU_PTE_VALID_FALSE,
                                          kgmmuGetHwPteApertureFromMemdesc(GPU_GET_KERNEL_GMMU(pGpu), pTempMemDesc), 0,
                                          NVLINK_INVALID_FABRIC_ADDR,
                                          deferInvalidate,
                                          NV_FALSE,
                                          pageSize);
            if (status != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "error updating VA space.\n");
                vaspaceFree(pVAS, vaLo);
                return status;
            }
        }
        SLI_LOOP_END
    }
    else
    {
        vaspaceFree(pVAS, vAddr);
    }

    //
    // Invalidate any cached peer data if this memory was mapped p2p cached.
    // For the SLI case, kmemsysCacheOp loops through all GPUs.
    // For the non-SLI case, pGpu points to the P2P mapped GPU, so this
    // invalidates only on that GPU.
    //
    SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY | SLI_LOOP_FLAGS_IGNORE_REENTRANCY)
    if ((memdescGetGpuP2PCacheAttrib(memdescGetMemDescFromGpu(pMemDesc, pGpu)) == NV_MEMORY_CACHED) &&
        (p2p != NVOS46_FLAGS_P2P_ENABLE_NONE))
    {
        KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
        kmemsysCacheOp_HAL(pGpu, pKernelMemorySystem, pMemDesc, FB_CACHE_PEER_MEMORY,
                           FB_CACHE_INVALIDATE);
    }
    SLI_LOOP_END

    return status;
}

/*!
 * Defines the data needed to iterate over the last level during a map VA op.
 * Note: Used only in the new VMM code path.
 */
struct MMU_MAP_ITERATOR
{
    /*!
     * @copydoc GMMU_FMT
     */
    const GMMU_FMT *pFmt;

    /*!
     * Physical aperture of the pages.
     */
    GMMU_APERTURE aperture;

    /*!
     * Opaque array of physical memory to map. Always points to 4K sized pages.
     */
    DMA_PAGE_ARRAY *pPageArray;

    /*!
     * Points to the index in pPageArray that needs to be mapped.
     */
    NvU32 currIdx;

    /*!
     * Base offset in bytes into the logical surface being mapped.
     */
    NvU64 surfaceOffset;

    /*!
     * Physical address of the last page mapped.
     */
    NvU64 physAddr;

    /*!
     * NvLink fabric address. Used for NVSwitch systems only!
     */
    NvU64 fabricAddr;

    /*!
     * @copydoc COMPR_INFO
     */
    COMPR_INFO comprInfo;

    /*!
     * Non-compressed kind.
     */
    NvU32 kindNoCompr;

    /*!
     * Indicates whether compression is enabled.
     */
    NvBool bCompr;

    /*!
     * Template used to initialize the actual PTEs. Will have values that do not
     * change across one map operation.
     */
    NvU8 pteTemplate[GMMU_FMT_MAX_ENTRY_SIZE] NV_ALIGN_BYTES(8);

    /*!
     * The addr field that needs to be filled out, based on the
     * aperture.
     */
    const GMMU_FIELD_ADDRESS *pAddrField;

    /*!
     * Indicates after how many indices in pPageArray the map should
     * wrap around to the first mapped page.
     */
    NvU32 overMapModulus;

    /*!
     * Indicates to read-modify-write each PTE instead of
     * using the pteTemplate as the base value.
     */
    NvBool bReadPtes;

    /*!
     * Indicates to update the physical address field of each PTE.
     */
    NvBool bUpdatePhysAddr;

    /*!
     * Indicates to update the comptag line and kind of each PTE
     * that points to a compressed page.
     */
    NvBool bUpdateCompr;

    /*!
     * Indicates that we are writing PDEs for Bug 2720120.
     * Applicable only to GA100.
     */
    NvBool bApplyWarForBug2720120;

    /*!
     * Current page table BAR2 aperture mapping (or user buffer).
     */
    NvU8 *pMap;
};

static void
_gmmuWalkCBMapNextEntries_Direct
(
    MMU_WALK_USER_CTX       *pUserCtx,
    const MMU_MAP_TARGET    *pTarget,
    const MMU_WALK_MEMDESC  *pLevelMem,
    const NvU32              entryIndexLo,
    const NvU32              entryIndexHi,
    NvU32                   *pProgress
)
{
    NvU32 i;
    const MMU_FMT_LEVEL *pLevelFmt = pTarget->pLevelFmt;
    MMU_MAP_ITERATOR    *pIter     = pTarget->pIter;
    NvU8                *pMap      = pIter->pMap;
    const NvU64          pageSize  = mmuFmtLevelPageSize(pLevelFmt);
    GMMU_ENTRY_VALUE     entry;

    NV_ASSERT_OR_RETURN_VOID(pMap != NULL);

    //
    // This function will always write the caller supplied buffer
    // at offset 0. The onus of writing the buffer out to the target
    // location in memory at the appropriate offset is on the caller.
    //

    for (i = entryIndexLo; i <= entryIndexHi; ++i)
    {
        NvU32 entryOffset = (i - entryIndexLo) * pLevelFmt->entrySize;

        // Copy out the current PTE if we are overwriting (read-modify-write).
        if (pIter->bReadPtes)
        {
            portMemCopy(entry.v8, pLevelFmt->entrySize,
                        &pMap[entryOffset], pLevelFmt->entrySize);
        }
        else
        {
            // Copy the static fields passed in, if we aren't overwriting a subset of fields.
            portMemCopy(entry.v8, pLevelFmt->entrySize,
                        pIter->pteTemplate, pLevelFmt->entrySize);
        }

        if (pIter->bApplyWarForBug2720120)
        {
            // Commit to memory.
            portMemCopy(&pMap[entryOffset], pLevelFmt->entrySize,
                        entry.v8, pLevelFmt->entrySize);
            continue;
        }

        // Calculate the new physical address for the compression check below.
        if (pIter->bUpdatePhysAddr)
        {
            NvU32 currIdxMod = pIter->currIdx;

            // Wrap the current index to the start offset for BAR1 overmapping.
            if (0 != pIter->overMapModulus)
            {
                currIdxMod %= pIter->overMapModulus;
            }

            // Extract the physical address of the page to map.
            if (currIdxMod < pIter->pPageArray->count)
            {
                pIter->physAddr = dmaPageArrayGetPhysAddr(pIter->pPageArray, currIdxMod);
                // Hack to WAR submemdesc mappings
                pIter->physAddr = NV_ALIGN_DOWN64(pIter->physAddr, pageSize);
            }
            else
            {
                //
                // Physically contiguous memory just increments physAddr.
                // This should not be the first page (currIdxMod == 0) being mapped.
                //
                NV_ASSERT_OR_RETURN_VOID((pIter->pPageArray->count == 1) &&
                                         (currIdxMod > 0));
                pIter->physAddr += pageSize;
            }
        }

        // Init comptag
        if (pIter->bUpdateCompr)
        {
            OBJGPU        *pGpu           = pUserCtx->pGpu;
            MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
            NvBool         bCompressible  = NV_TRUE;

            //
            // Check if the FB physical address lands in a segment that
            // supports compression.
            // On WDDM, neither RM nor KMD has the physical address
            // information at compression allocation time.
            // On non-WDDM platforms, RM allocates compression before the
            // actual physical allocation. For non-contig allocations, the
            // physical pages can be spread across multiple regions.
            // Therefore compression tags are always allocated and compression must
            // be disabled on a per-PTE basis at map time.
            //
            if ((pMemoryManager->Ram.numFBRegions > 1) &&
                (gmmuFieldGetAperture(&pIter->pFmt->pPte->fldAperture, entry.v8) ==
                 GMMU_APERTURE_VIDEO))
            {
                NvU32 iRegion;
                // Find the region in which the candidate block resides.
                for (iRegion = 0; iRegion < pMemoryManager->Ram.numFBRegions; iRegion++)
                {
                    // Does the block reside within this region? If so, we are done searching.
                    if ((pIter->physAddr >= pMemoryManager->Ram.fbRegion[iRegion].base) &&
                        (pIter->physAddr <= pMemoryManager->Ram.fbRegion[iRegion].limit))
                    {
                        // Check if the region supports compression.
                        bCompressible = pMemoryManager->Ram.fbRegion[iRegion].bSupportCompressed;
                        break;
                    }
                }
            }

            //
            // TODO: The flags that enable compression are confusing -
            //       complicated by memsysReleaseReacquireCompr_GF100 usage.
            //       Clean this up when removing the old path and simplifying
            //       the primitive "map" interface.
            //
            if (pIter->bCompr && bCompressible)
            {
                //
                // For VF, HW does 1-to-1 FB-comptag mapping. HW manages comptag
                // allocation, hence RM can skip the comptagline assignment to the PTE.
                // Just updating the compressed kind is sufficient for VF.
                //
                if (!IS_VIRTUAL_WITH_SRIOV(pGpu) && pIter->pFmt->version <= GMMU_FMT_VERSION_2)
                {
                    NvBool bIsWarApplied = NV_FALSE;
                    NvU32  savedKind = pIter->comprInfo.kind;
                    KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
                    const MEMORY_SYSTEM_STATIC_CONFIG *pMemorySystemConfig =
                        kmemsysGetStaticConfig(pGpu, pKernelMemorySystem);

                    if (pMemorySystemConfig->bUseRawModeComptaglineAllocation &&
                        pKernelMemorySystem->bDisablePlcForCertainOffsetsBug3046774 &&
                        !memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_DISALLOW_PLC, pIter->comprInfo.kind) &&
                        !kmemsysIsPagePLCable_HAL(pGpu, pKernelMemorySystem, (pIter->surfaceOffset + pIter->currIdx * pTarget->pageArrayGranularity), pageSize))
                    {
                        bIsWarApplied = NV_TRUE;
                        memmgrGetDisablePlcKind_HAL(pMemoryManager, &pIter->comprInfo.kind);
                    }
                    kgmmuFieldSetKindCompTags(GPU_GET_KERNEL_GMMU(pGpu), pIter->pFmt, pLevelFmt, &pIter->comprInfo, pIter->physAddr,
                                              pIter->surfaceOffset + pIter->currIdx * pTarget->pageArrayGranularity,
                                              i, entry.v8);
                    //
                    // Restore the kind to PLC if changed, since the kind is associated
                    // with the entire surface while the WAR applies to individual
                    // pages in the surface.
                    //
                    if (bIsWarApplied)
                        pIter->comprInfo.kind = savedKind;
                }
                else
                {
                    nvFieldSet32(&pIter->pFmt->pPte->fldKind, pIter->comprInfo.kind, entry.v8);
                }
            }
            else
            {
                nvFieldSet32(&pIter->pFmt->pPte->fldKind, pIter->kindNoCompr, entry.v8);

                if (pIter->pFmt->version <= GMMU_FMT_VERSION_2)
                {
                    nvFieldSet32(&pIter->pFmt->pPte->fldCompTagLine, 0, entry.v8);
                    if (nvFieldIsValid32(&pIter->pFmt->pPte->fldCompTagSubIndex))
                    {
                        nvFieldSet32(&pIter->pFmt->pPte->fldCompTagSubIndex, 0, entry.v8);
                    }
                }
            }
        }

        // Fill the physical address field.
        if (pIter->bUpdatePhysAddr && (pIter->pAddrField != NULL))
        {
            // Update the PTE with the physical address.
            gmmuFieldSetAddress(pIter->pAddrField,
                kgmmuEncodePhysAddr(GPU_GET_KERNEL_GMMU(pUserCtx->pGpu), pIter->aperture, pIter->physAddr,
                    pIter->fabricAddr),
                entry.v8);
        }

        // Commit to memory.
        portMemCopy(&pMap[entryOffset], pLevelFmt->entrySize,
                    entry.v8, pLevelFmt->entrySize);

        //
        // pPageArray deals in 4K pages,
        // so increment by the ratio of the mapping page size to 4K.
        // --
        // The above assumption will be invalid upon implementation of memdesc dynamic page arrays.
        //
        pIter->currIdx += (NvU32)(pageSize / pTarget->pageArrayGranularity);
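
        //
        // For instance (illustrative): at a 64K mapping page size with a 4K
        // page array granularity, each PTE written here consumes 16 entries
        // of pPageArray.
        //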
    }

    *pProgress = entryIndexHi - entryIndexLo + 1;
}

static void
_gmmuWalkCBMapNextEntries_RmAperture
(
    MMU_WALK_USER_CTX       *pUserCtx,
    const MMU_MAP_TARGET    *pTarget,
    const MMU_WALK_MEMDESC  *pLevelMem,
    const NvU32              entryIndexLo,
    const NvU32              entryIndexHi,
    NvU32                   *pProgress
)
{
    OBJGPU              *pGpu           = pUserCtx->pGpu;
    MemoryManager       *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    MMU_MAP_ITERATOR    *pIter          = pTarget->pIter;
    MEMORY_DESCRIPTOR   *pMemDesc       = (MEMORY_DESCRIPTOR*)pLevelMem;
    const MMU_FMT_LEVEL *pLevelFmt      = pTarget->pLevelFmt;
    TRANSFER_SURFACE     surf           = {0};
    NvU32                sizeOfEntries;

    NV_PRINTF(LEVEL_INFO, "[GPU%u]: PA 0x%llX, Entries 0x%X-0x%X\n",
              pUserCtx->pGpu->gpuInstance,
              memdescGetPhysAddr(pMemDesc, AT_GPU, 0), entryIndexLo,
              entryIndexHi);

    surf.pMemDesc = pMemDesc;
    surf.offset = entryIndexLo * pLevelFmt->entrySize;

    sizeOfEntries = (entryIndexHi - entryIndexLo + 1) * pLevelFmt->entrySize;

    pIter->pMap = memmgrMemBeginTransfer(pMemoryManager, &surf, sizeOfEntries,
                                         TRANSFER_FLAGS_SHADOW_ALLOC |
                                         TRANSFER_FLAGS_SHADOW_INIT_MEM);
    NV_ASSERT_OR_RETURN_VOID(NULL != pIter->pMap);

    _gmmuWalkCBMapNextEntries_Direct(pUserCtx, pTarget, pLevelMem,
                                     entryIndexLo, entryIndexHi, pProgress);

    memmgrMemEndTransfer(pMemoryManager, &surf, sizeOfEntries,
                         TRANSFER_FLAGS_SHADOW_ALLOC |
                         TRANSFER_FLAGS_SHADOW_INIT_MEM);
}

static NV_STATUS _dmaGetFabricAddress
(
    OBJGPU *pGpu,
    NvU32   aperture,
    NvU32   kind,
    NvU64  *fabricAddr
)
1663 {
1664 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
1665 KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
1666
1667 *fabricAddr = NVLINK_INVALID_FABRIC_ADDR;
1668
1669 if (pKernelNvlink == NULL)
1670 {
1671 return NV_OK;
1672 }
1673
1674 if (aperture != NV_MMU_PTE_APERTURE_PEER_MEMORY)
1675 {
1676 return NV_OK;
1677 }
1678
1679 //
1680 // Fabric address should be available for NVSwitch connected GPUs,
1681 // otherwise it is a NOP.
1682 //
1683 *fabricAddr = knvlinkGetUniqueFabricBaseAddress(pGpu, pKernelNvlink);
1684 if (*fabricAddr == NVLINK_INVALID_FABRIC_ADDR)
1685 {
1686 return NV_OK;
1687 }
1688
1689 if (memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, kind))
1690 {
1691         NV_PRINTF(LEVEL_ERROR,
1692                   "NVSwitch systems don't support compression.\n");
1693 return NV_ERR_NOT_SUPPORTED;
1694 }
1695
1696 return NV_OK;
1697 }
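//
// Illustrative sketch (not part of the original source): how a caller would
// resolve the fabric address before encoding PTEs. On non-NVSwitch systems
// the helper succeeds and leaves NVLINK_INVALID_FABRIC_ADDR in place, so the
// result can be passed straight through. The wrapper name is hypothetical.
//
#if 0
static NV_STATUS _exampleSketchResolveFabricAddr(OBJGPU *pGpu, NvU32 aperture,
                                                 NvU32 kind, NvU64 *pFabricAddr)
{
    // Fails only for compressible kinds on NVSwitch systems; otherwise
    // *pFabricAddr is either a valid fabric base or NVLINK_INVALID_FABRIC_ADDR.
    return _dmaGetFabricAddress(pGpu, aperture, kind, pFabricAddr);
}
#endif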
1698
1699 static NV_STATUS _dmaGetFabricEgmAddress
1700 (
1701 OBJGPU *pGpu,
1702 NvU32 aperture,
1703 NvU32 kind,
1704 NvU64 *fabricEgmAddr
1705 )
1706 {
1707 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
1708 KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
1709
1710 *fabricEgmAddr = NVLINK_INVALID_FABRIC_ADDR;
1711
1712 if (pKernelNvlink == NULL)
1713 {
1714 return NV_OK;
1715 }
1716
1717 if (aperture != NV_MMU_PTE_APERTURE_PEER_MEMORY)
1718 {
1719 return NV_OK;
1720 }
1721
1722 //
1723 // Fabric address should be available for NVSwitch connected GPUs,
1724 // otherwise it is a NOP.
1725 //
1726 *fabricEgmAddr = knvlinkGetUniqueFabricEgmBaseAddress(pGpu, pKernelNvlink);
1727 if (*fabricEgmAddr == NVLINK_INVALID_FABRIC_ADDR)
1728 {
1729 return NV_OK;
1730 }
1731
1732 if (memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, kind))
1733 {
1734         NV_PRINTF(LEVEL_ERROR,
1735                   "NVSwitch systems don't support compression.\n");
1736 return NV_ERR_NOT_SUPPORTED;
1737 }
1738
1739 return NV_OK;
1740 }
1741
1742 // VMM-TODO: PL(N) mmuPageLevelUpdate - but major splits
1743 NV_STATUS
1744 dmaUpdateVASpace_GF100
1745 (
1746 OBJGPU *pGpu,
1747 VirtMemAllocator *pDma,
1748 OBJVASPACE *pVAS,
1749 MEMORY_DESCRIPTOR *pMemDesc,
1750 NvU8 *pTgtPteMem, // CPU pointer to PTE memory for Vista updates
1751 NvU64 vAddr,
1752 NvU64 vAddrLimit,
1753 NvU32 flags,
1754 DMA_PAGE_ARRAY *pPageArray,
1755 NvU32 overmapPteMod,
1756 COMPR_INFO *pComprInfo,
1757 NvU64 surfaceOffset,
1758 NvU32 valid,
1759 NvU32 aperture,
1760 NvU32 peer,
1761 NvU64 fabricAddr,
1762 NvU32 deferInvalidate,
1763 NvBool bSparse,
1764 NvU64 pageSize
1765 )
1766 {
1767 KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu);
1768 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
1769 NvBool readPte = NV_FALSE;
1770 NV_STATUS status = NV_OK;
1771 NvBool isVolatile = NV_TRUE;
1772 NvU32 encrypted = 0;
1773 NvU32 tlbLock;
1774 NvU32 readOnly;
1775 NvU32 priv;
1776 NvU32 writeDisable;
1777 NvU32 readDisable;
1778 NvU64 vaSpaceBigPageSize = 0;
1779 KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
1780 const MEMORY_SYSTEM_STATIC_CONFIG *pMemorySystemConfig =
1781 kmemsysGetStaticConfig(pGpu, pKernelMemorySystem);
1782 NvU32 alignSize = pMemorySystemConfig->comprPageSize;
1783 KernelGmmu *pKernelGmmu = GPU_GET_KERNEL_GMMU(pGpu);
1784 NvBool bFillPteMem = !!(flags & DMA_UPDATE_VASPACE_FLAGS_FILL_PTE_MEM);
1785 NvBool bUnmap = !bFillPteMem &&
1786 (flags & DMA_UPDATE_VASPACE_FLAGS_UPDATE_VALID) &&
1787 (SF_VAL(_MMU, _PTE_VALID, valid) == NV_MMU_PTE_VALID_FALSE);
1788 NvBool bIsIndirectPeer;
1789 VAS_PTE_UPDATE_TYPE update_type;
1790
1791 {
1792 OBJGVASPACE *pGVAS = dynamicCast(pVAS, OBJGVASPACE);
1793 if (bFillPteMem &&
1794 (pGVAS->flags & VASPACE_FLAGS_BAR_BAR1) &&
1795 (flags & DMA_UPDATE_VASPACE_FLAGS_UPDATE_VALID) &&
1796 (SF_VAL(_MMU, _PTE_VALID, valid) == NV_MMU_PTE_VALID_FALSE))
1797 {
1798 bSparse = NV_TRUE;
1799 }
1800 }
1801
1802 priv = (flags & DMA_UPDATE_VASPACE_FLAGS_PRIV) ? NV_MMU_PTE_PRIVILEGE_TRUE : NV_MMU_PTE_PRIVILEGE_FALSE;
1803 tlbLock = (flags & DMA_UPDATE_VASPACE_FLAGS_TLB_LOCK) ? NV_MMU_PTE_LOCK_TRUE : NV_MMU_PTE_LOCK_FALSE;
1804 readOnly = (flags & DMA_UPDATE_VASPACE_FLAGS_READ_ONLY) ? NV_MMU_PTE_READ_ONLY_TRUE : NV_MMU_PTE_READ_ONLY_FALSE;
1805 writeDisable = !!(flags & DMA_UPDATE_VASPACE_FLAGS_SHADER_READ_ONLY);
1806 readDisable = !!(flags & DMA_UPDATE_VASPACE_FLAGS_SHADER_WRITE_ONLY);
1807 bIsIndirectPeer = !!(flags & DMA_UPDATE_VASPACE_FLAGS_INDIRECT_PEER);
1808
1809 NV_ASSERT_OR_RETURN(pageSize, NV_ERR_INVALID_ARGUMENT);
1810
1811 vaSpaceBigPageSize = vaspaceGetBigPageSize(pVAS);
1812 if ((pageSize == RM_PAGE_SIZE_64K) || (pageSize == RM_PAGE_SIZE_128K))
1813 {
1814 NV_ASSERT_OR_RETURN(pageSize == vaSpaceBigPageSize, NV_ERR_INVALID_STATE);
1815 }
1816
1817 //
1818 // Determine whether we are invalidating or revoking privileges, so we know
1819 // whether to flush page accesses or not. ReadDisable and writeDisable have
1820 // been deprecated on Pascal+, and we don't have the capability to guarantee
1821 // coherency post TLB invalidate on pre-Pascal, so we ignore them here.
1822 //
1823 update_type = (bUnmap || (NV_MMU_PTE_LOCK_FALSE == tlbLock)
1824 || (NV_MMU_PTE_READ_ONLY_TRUE == readOnly)) ? PTE_DOWNGRADE : PTE_UPGRADE;
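    //
    // For illustration, the predicate above resolves as follows:
    //   unmap                          -> PTE_DOWNGRADE
    //   map without TLB lock           -> PTE_DOWNGRADE (lock == FALSE)
    //   map read-only                  -> PTE_DOWNGRADE
    //   map RW with TLB lock held      -> PTE_UPGRADE
    // i.e. anything that may revoke access or release a lock is treated as a
    // downgrade so the TLB invalidate can flush outstanding accesses.
    //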
1825
1826 if (pMemDesc == NULL)
1827 {
1828 NV_ASSERT(pMemDesc);
1829 return NV_ERR_INVALID_ARGUMENT;
1830 }
1831
1832 switch (aperture)
1833 {
1834 case NV_MMU_PTE_APERTURE_PEER_MEMORY:
1835 isVolatile = (memdescGetGpuP2PCacheAttrib(pMemDesc) == NV_MEMORY_UNCACHED) ? NV_TRUE : NV_FALSE;
1836 break;
1837 case NV_MMU_PTE_APERTURE_SYSTEM_COHERENT_MEMORY:
1838 case NV_MMU_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY:
1839 if (bIsIndirectPeer)
1840 isVolatile = (memdescGetGpuP2PCacheAttrib(pMemDesc) == NV_MEMORY_UNCACHED) ? NV_TRUE : NV_FALSE;
1841 else
1842 isVolatile = (memdescGetGpuCacheAttrib(pMemDesc) == NV_MEMORY_UNCACHED) ? NV_TRUE : NV_FALSE;
1843
1844 break;
1845 default:
1846 case NV_MMU_PTE_APERTURE_VIDEO_MEMORY:
1847 isVolatile = NV_FALSE;
1848 break;
1849 }
1850
1851 encrypted = (flags & DMA_UPDATE_VASPACE_FLAGS_DISABLE_ENCRYPTION) ? 0 :
1852 memdescGetFlag(pMemDesc, MEMDESC_FLAGS_ENCRYPTED);
1853
1854     // Check this here so we don't have to in the loop(s) below.
1855 if ((flags & DMA_UPDATE_VASPACE_FLAGS_UPDATE_PADDR) && (pPageArray == NULL))
1856 {
1857 return NV_ERR_INVALID_ARGUMENT;
1858 }
1859
1860 //
1861     // Some attributes must be read from the existing PTE; the read can be
1862     // skipped only if the PTE is about to be invalidated.
1863 //
1864 if (DMA_UPDATE_VASPACE_FLAGS_UPDATE_ALL != (flags & DMA_UPDATE_VASPACE_FLAGS_UPDATE_ALL))
1865 {
1866 readPte = !((flags & DMA_UPDATE_VASPACE_FLAGS_UPDATE_VALID) &&
1867 (SF_VAL(_MMU, _PTE_VALID, valid) == NV_MMU_PTE_VALID_FALSE));
1868 }
1869
1870 //
1871     // Compressed surfaces must be aligned to the compression page size,
1872     // but skip the check for LDDM, which may pass in unaligned surfaces incrementally.
1873     // Chips that support full comp tag lines allow the VA to be aligned to
1874     // the big page size. This is because the PA alignment chooses between even/odd pages
1875     // and SW programs the PA alignment.
1876 //
1877 if (pDma->getProperty(pDma, PDB_PROP_DMA_ENABLE_FULL_COMP_TAG_LINE))
1878 {
1879 alignSize = vaSpaceBigPageSize;
1880 }
1881
1882 //
1883 // If we have dynamic granularity page arrays enabled we will never
1884 // encounter a case where a larger page granularity physical surface gets
1885 // represented by a smaller granularity pageArray.
1886 //
1887 if (!pMemoryManager->bEnableDynamicGranularityPageArrays)
1888 {
1889 //
1890         // VMM-TODO: Merge into PL1 traversal.
1891 //
1892 // If the pageSize of the mapping != 4K then be sure that the 4k pages
1893 // making up the big physical page are contiguous. This is currently
1894 // necessary since pMemDesc->PteArray is always in terms of 4KB pages.
1895 // Different large pages do not have to be contiguous with each other.
1896 // This check isn't needed for contig allocations.
1897 //
1898 if (pPageArray && (pageSize != RM_PAGE_SIZE) && (pPageArray->count > 1) &&
1899 !(flags & DMA_UPDATE_VASPACE_FLAGS_SKIP_4K_PTE_CHECK))
1900 {
1901 NvU32 i, j;
1902 RmPhysAddr pageAddr, pagePrevAddr;
1903
1904 for (i = 0; i < pPageArray->count; i += j)
1905 {
1906 for (j = i + 1; j < pPageArray->count; j++)
1907 {
1908 pagePrevAddr = dmaPageArrayGetPhysAddr(pPageArray, j - 1);
1909 pageAddr = dmaPageArrayGetPhysAddr(pPageArray, j);
1910
1911 if ((1 + (pagePrevAddr/(RM_PAGE_SIZE))) !=
1912 (pageAddr/(RM_PAGE_SIZE)))
1913 {
1914 NV_PRINTF(LEVEL_ERROR,
1915 "MMU: given non-contig 4KB pages for %lldkB mapping\n",
1916 pageSize / 1024);
1917 DBG_BREAKPOINT();
1918 return NV_ERR_GENERIC;
1919 }
1920
1921 // Are we at the pageSize boundary yet?
1922 if ((pageAddr + RM_PAGE_SIZE)
1923 % pageSize == 0)
1924 {
1925 j++;
1926 break;
1927 }
1928 }
1929 }
1930 }
1931 }
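//
// Illustrative sketch (not part of the original source): the contiguity
// requirement enforced by the nested loop above. For a 64K mapping, each group
// of 16 consecutive 4K entries in the page array must be physically
// contiguous, e.g. 0x100000, 0x101000, ..., 0x10F000; the next group may start
// anywhere, e.g. 0x400000. The helper name is hypothetical.
//
#if 0
static NvBool _exampleSketchGroupIsContig(DMA_PAGE_ARRAY *pPageArray,
                                          NvU32 first, NvU32 count)
{
    NvU32 i;
    for (i = 1; i < count; i++)
    {
        // Each 4K page must directly follow its predecessor.
        if (dmaPageArrayGetPhysAddr(pPageArray, first + i) !=
            dmaPageArrayGetPhysAddr(pPageArray, first + i - 1) + RM_PAGE_SIZE)
        {
            return NV_FALSE;
        }
    }
    return NV_TRUE;
}
#endif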
1932
1933 // Zero peer on non-peer requests to simplify pte construction
1934 if (aperture != NV_MMU_PTE_APERTURE_PEER_MEMORY)
1935 {
1936 peer = 0;
1937 }
1938
1939 MMU_MAP_TARGET mapTarget = {0};
1940 MMU_MAP_ITERATOR mapIter = {0};
1941 OBJGVASPACE *pGVAS = dynamicCast(pVAS, OBJGVASPACE);
1942 const NvU64 vaLo = NV_ALIGN_DOWN64(vAddr, pageSize);
1943 const NvU64 vaHi = NV_ALIGN_DOWN64(vAddrLimit + pageSize, pageSize) - 1;
1944 GVAS_GPU_STATE *pGpuState = gvaspaceGetGpuState(pGVAS, pGpu);
1945 const GMMU_FMT *pFmt = pGpuState->pFmt;
1946
1947 // Enforce unicast.
1948 NV_ASSERT_OR_RETURN(!gpumgrGetBcEnabledStatus(pGpu), NV_ERR_INVALID_STATE);
1949
1950 if (bUnmap)
1951 {
1952 gvaspaceUnmap(pGVAS, pGpu, vaLo, vaHi);
1953 }
1954 else
1955 {
1956 NvU32 kind = pComprInfo->kind;
1957 NvU32 kindNoCompression;
1958
1959 //
1960         // If the original kind is compressible, we need to know what the non-compressible
1961 // kind is so we can fall back to that if we run out of compression tags.
1962 //
1963 if (memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, kind))
1964 {
1965 kindNoCompression = memmgrGetUncompressedKind_HAL(pGpu, pMemoryManager, kind, NV_FALSE);
1966 }
1967 else
1968 {
1969 kindNoCompression = kind;
1970 }
1971
1972 if (!RMCFG_FEATURE_PLATFORM_WINDOWS &&
1973 memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, pComprInfo->kind) &&
1974 ((vAddr & (alignSize-1)) != 0) &&
1975 !(flags & DMA_UPDATE_VASPACE_FLAGS_UNALIGNED_COMP))
1976 {
1977 return NV_ERR_INVALID_ARGUMENT;
1978 }
1979
1980 // MMU_MAP_CTX
1981 mapTarget.pLevelFmt = mmuFmtFindLevelWithPageShift(pFmt->pRoot,
1982 BIT_IDX_64(pageSize));
1983 mapTarget.pIter = &mapIter;
1984 mapTarget.MapNextEntries = _gmmuWalkCBMapNextEntries_RmAperture;
1985 mapTarget.pageArrayGranularity = pMemDesc->pageArrayGranularity;
1986
1987 //MMU_MAP_ITER
1988 mapIter.pFmt = pFmt;
1989 mapIter.pPageArray = pPageArray;
1990 mapIter.surfaceOffset = surfaceOffset;
1991 mapIter.comprInfo = *pComprInfo;
1992 mapIter.overMapModulus = overmapPteMod;
1993 mapIter.bReadPtes = readPte;
1994 mapIter.kindNoCompr = kindNoCompression;
1995 mapIter.bCompr = memmgrIsKind_HAL(pMemoryManager, FB_IS_KIND_COMPRESSIBLE, pComprInfo->kind);
1996 mapIter.bUpdatePhysAddr = !!(flags & DMA_UPDATE_VASPACE_FLAGS_UPDATE_PADDR);
1997 mapIter.bUpdateCompr = !!(flags & DMA_UPDATE_VASPACE_FLAGS_UPDATE_COMPR);
1998 mapIter.fabricAddr = fabricAddr;
1999
2000 if ((pageSize == RM_PAGE_SIZE_512M) && kgmmuIsBug2720120WarEnabled(pKernelGmmu))
2001 {
2002 NV_ASSERT_OK_OR_RETURN(_dmaApplyWarForBug2720120(pGVAS, pGpu,
2003 vaLo, vaHi));
2004 }
2005
2006 if (kmemsysNeedInvalidateGpuCacheOnMap_HAL(pGpu, pKernelMemorySystem, isVolatile, aperture))
2007 {
2008 kmemsysCacheOp_HAL(pGpu, pKernelMemorySystem, pMemDesc,
2009 (aperture == NV_MMU_PTE_APERTURE_PEER_MEMORY) ? FB_CACHE_PEER_MEMORY : FB_CACHE_SYSTEM_MEMORY,
2010 FB_CACHE_EVICT);
2011 }
2012
2013 // Build PTE template
2014 if (pFmt->version == GMMU_FMT_VERSION_3)
2015 {
2016 NvU32 ptePcfHw = 0;
2017 NvU32 ptePcfSw = 0;
2018
2019 // Set the new PTE PCF bits
2020 if (valid)
2021 {
2022 nvFieldSetBool(&pFmt->pPte->fldValid, NV_TRUE,
2023 mapIter.pteTemplate);
2024 nvFieldSet32(&pFmt->pPte->fldAperture._enum.desc,
2025 aperture, mapIter.pteTemplate);
2026 nvFieldSet32(&pFmt->pPte->fldPeerIndex, peer,
2027 mapIter.pteTemplate);
2028 nvFieldSet32(&pFmt->pPte->fldKind, kindNoCompression,
2029 mapIter.pteTemplate);
2030 //
2031                 // Initializing the PTE V3 PCF bits, whose default values are as follows:
2032                 // 1. Regular vs Privileged : Regular (controlled by the priv flag)
2033                 // 2. RO vs RW : RW (controlled by the readOnly flag)
2034                 // 3. Atomic Enabled vs Atomic Disabled : Atomic Enabled
2035                 // 4. Cached vs Uncached : Cached (controlled by the isVolatile flag)
2036                 // 5. ACE vs ACD : ACD
2037 //
2038 ptePcfSw |= isVolatile ? (1 << SW_MMU_PCF_UNCACHED_IDX) : 0;
2039 ptePcfSw |= readOnly ? (1 << SW_MMU_PCF_RO_IDX) : 0;
2040 ptePcfSw |= tlbLock ? (1 << SW_MMU_PCF_NOATOMIC_IDX) : 0;
2041 ptePcfSw |= !priv ? (1 << SW_MMU_PCF_REGULAR_IDX) : 0;
2042 if ((memdescGetAddressSpace(pMemDesc) == ADDR_FABRIC_MC))
2043 {
2044 ptePcfSw |= (1 << SW_MMU_PCF_ACE_IDX);
2045 }
2046 }
2047 else
2048 {
2049 // NV4K and NOMAPPING are not supported right now
2050 if (bSparse)
2051 {
2052 ptePcfSw |= (1 << SW_MMU_PCF_SPARSE_IDX);
2053 }
2054 else
2055 {
2056 ptePcfSw |= (1 << SW_MMU_PCF_INVALID_IDX);
2057 }
2058 }
2059 NV_CHECK_OR_RETURN(LEVEL_ERROR,
2060 (kgmmuTranslatePtePcfFromSw_HAL(pKernelGmmu, ptePcfSw, &ptePcfHw) == NV_OK),
2061 NV_ERR_INVALID_ARGUMENT);
2062
2063 nvFieldSet32(&pFmt->pPte->fldPtePcf, ptePcfHw, mapIter.pteTemplate);
2064 }
2065 else
2066 {
2067 if (bSparse)
2068 {
2069 const GMMU_FMT_FAMILY *pFmtFamily =
2070 kgmmuFmtGetFamily(pKernelGmmu, pFmt->version);
2071 NV_ASSERT_OR_RETURN(NULL != pFmtFamily, NV_ERR_INVALID_DATA);
2072 portMemCopy(mapIter.pteTemplate,
2073 mapTarget.pLevelFmt->entrySize, pFmtFamily->sparsePte.v8,
2074 mapTarget.pLevelFmt->entrySize);
2075 }
2076 else
2077 {
2078 nvFieldSetBool(&pFmt->pPte->fldValid, !!valid,
2079 mapIter.pteTemplate);
2080 nvFieldSet32(&pFmt->pPte->fldAperture._enum.desc,
2081 aperture, mapIter.pteTemplate);
2082 nvFieldSet32(&pFmt->pPte->fldPeerIndex, peer,
2083 mapIter.pteTemplate);
2084
2085 nvFieldSetBool(&pFmt->pPte->fldVolatile, !!isVolatile,
2086 mapIter.pteTemplate);
2087 nvFieldSet32(&pFmt->pPte->fldKind, kindNoCompression,
2088 mapIter.pteTemplate);
2089 nvFieldSetBool(&pFmt->pPte->fldReadOnly, !!readOnly,
2090 mapIter.pteTemplate);
2091 nvFieldSetBool(&pFmt->pPte->fldPrivilege, !!priv,
2092 mapIter.pteTemplate);
2093 nvFieldSetBool(&pFmt->pPte->fldEncrypted, !!encrypted,
2094 mapIter.pteTemplate);
2095 if (nvFieldIsValid32(&pFmt->pPte->fldReadDisable.desc))
2096 {
2097 nvFieldSetBool(&pFmt->pPte->fldReadDisable, !!readDisable,
2098 mapIter.pteTemplate);
2099 }
2100 if (nvFieldIsValid32(&pFmt->pPte->fldWriteDisable.desc))
2101 {
2102 nvFieldSetBool(&pFmt->pPte->fldWriteDisable, !!writeDisable,
2103 mapIter.pteTemplate);
2104 }
2105 if (nvFieldIsValid32(&pFmt->pPte->fldLocked.desc))
2106 {
2107 nvFieldSetBool(&pFmt->pPte->fldLocked, !!tlbLock,
2108 mapIter.pteTemplate);
2109 }
2110 if (nvFieldIsValid32(&pFmt->pPte->fldAtomicDisable.desc))
2111 {
2112 // tlbLock is overridden by atomic_disable
2113 nvFieldSetBool(&pFmt->pPte->fldAtomicDisable, !!tlbLock,
2114 mapIter.pteTemplate);
2115 }
2116 }
2117 }
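        //
        // Illustrative example (not part of the original source): composing the
        // V3 SW PCF for a valid, volatile, read-only, non-privileged mapping
        // with atomics enabled, following the SW_MMU_PCF_* indices used above.
        // The helper name is hypothetical.
        //
#if 0
        static NvU32 _exampleSketchPcfVolatileReadOnly(void)
        {
            NvU32 pcfSw = 0;
            pcfSw |= (1 << SW_MMU_PCF_UNCACHED_IDX); // isVolatile
            pcfSw |= (1 << SW_MMU_PCF_RO_IDX);       // readOnly
            pcfSw |= (1 << SW_MMU_PCF_REGULAR_IDX);  // !priv
            // Atomics stay enabled because SW_MMU_PCF_NOATOMIC_IDX is not set.
            // kgmmuTranslatePtePcfFromSw_HAL() then maps this to the HW encoding.
            return pcfSw;
        }
#endif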
2118
2119 // Extract the physical address field based on aperture.
2120 mapIter.aperture =
2121 gmmuFieldGetAperture(&pFmt->pPte->fldAperture,
2122 mapIter.pteTemplate);
2123 if (mapIter.aperture != GMMU_APERTURE_INVALID)
2124 {
2125 mapIter.pAddrField =
2126 gmmuFmtPtePhysAddrFld(pFmt->pPte, mapIter.aperture);
2127 }
2128
2129 //
2130 // FillPteMem case must be handled specially as it violates
2131         // internal VAS alignment and consistency checks.
2132 //
2133 if (bFillPteMem)
2134 {
2135 // If caller supplies buffer to write PTEs to, use that
2136 if (NULL != pTgtPteMem)
2137 {
2138 MMU_WALK_USER_CTX userCtx = {0};
2139 NvU32 progress = 0;
2140 NvU32 entryIndexLo = mmuFmtVirtAddrToEntryIndex(mapTarget.pLevelFmt, vaLo);
2141 // Calculated to allow cross-page-table-boundary updates.
2142 NvU32 entryIndexHi = (NvU32)(vaHi >> mapTarget.pLevelFmt->virtAddrBitLo) -
2143 (NvU32)(vaLo >> mapTarget.pLevelFmt->virtAddrBitLo) +
2144 entryIndexLo;
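                //
                // Worked example (illustrative): with 64K PTEs
                // (virtAddrBitLo == 16), vaLo = 0xFC0000 and vaHi = 0x103FFFF
                // cover 8 pages, so entryIndexLo = 0xFC gives
                // entryIndexHi = 0xFC + (0x103 - 0xFC) = 0x103, which for a
                // 256-entry table runs past the 0xFF boundary - valid only in
                // this buffered pTgtPteMem mode.
                //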
2145
2146 //
2147 // Use pTgtPteMem directly as mapping and pass NULL memdesc to
2148 // indicate buffered mode.
2149 //
2150 userCtx.pGpu = pGpu;
2151 mapIter.pMap = pTgtPteMem;
2152 _gmmuWalkCBMapNextEntries_Direct(&userCtx, &mapTarget, NULL,
2153 entryIndexLo, entryIndexHi, &progress);
2154 NV_ASSERT(progress == entryIndexHi - entryIndexLo + 1);
2155 }
2156 // Otherwise use walker directly.
2157 else
2158 {
2159 GVAS_BLOCK *pVASBlock = NULL;
2160 EMEMBLOCK *pMemBlock = NULL;
2161 MMU_WALK_USER_CTX userCtx = {0};
2162
2163 pMemBlock = pGVAS->pHeap->eheapGetBlock(pGVAS->pHeap, vaLo, 0);
2164 NV_ASSERT_OR_RETURN(NULL != pMemBlock, NV_ERR_INVALID_ARGUMENT);
2165 pVASBlock = pMemBlock->pData;
2166
2167 gvaspaceWalkUserCtxAcquire(pGVAS, pGpu, pVASBlock, &userCtx);
2168 status = mmuWalkMap(userCtx.pGpuState->pWalk, vaLo, vaHi, &mapTarget);
2169 NV_ASSERT(NV_OK == status);
2170 gvaspaceWalkUserCtxRelease(pGVAS, &userCtx);
2171 }
2172 }
2173 else
2174 {
2175 VAS_MAP_FLAGS mapFlags = {0};
2176 mapFlags.bRemap = readPte ||
2177 (flags & DMA_UPDATE_VASPACE_FLAGS_ALLOW_REMAP);
2178 status = gvaspaceMap(pGVAS, pGpu, vaLo, vaHi, &mapTarget, mapFlags);
2179 NV_ASSERT(NV_OK == status);
2180 }
2181 }
2182
2183 // Invalidate VAS TLB entries.
2184 if ((NULL == pTgtPteMem) && DMA_TLB_INVALIDATE == deferInvalidate)
2185 {
2186 kbusFlush_HAL(pGpu, pKernelBus, BUS_FLUSH_VIDEO_MEMORY |
2187 BUS_FLUSH_SYSTEM_MEMORY);
2188 gvaspaceInvalidateTlb(pGVAS, pGpu, update_type);
2189 }
2190
2191 #if NV_PRINTF_LEVEL_ENABLED(LEVEL_INFO)
2192 if (DBG_RMMSG_CHECK(LEVEL_INFO))
2193 {
2194 MMU_TRACE_ARG arg = {0};
2195 MMU_TRACE_PARAM params = {0};
2196 params.mode = MMU_TRACE_MODE_TRACE_VERBOSE;
2197 params.va = vAddr;
2198 params.vaLimit = vAddrLimit;
2199 params.pArg = &arg;
2200
2201         mmuTrace(pGpu, pVAS, &params);
2202 }
2203 #endif
2204 return status;
2205 }
2206
2207 NV_STATUS
2208 dmaInit_GM107(OBJGPU *pGpu, VirtMemAllocator *pDma)
2209 {
2210 DMAHALINFO_FERMI *pDHPI = NULL;
2211 NvU32 data;
2212
2213 // Allocate and link in an 'info block' for this engine.
2214 if (NULL == (pDHPI = (PDMAHALINFO_FERMI)addInfoPtr(&pDma->infoList, HAL_IMPL_GF100,
2215 sizeof(DMAHALINFO_FERMI))))
2216 {
2217 return NV_ERR_NO_MEMORY;
2218 }
2219
2220 if (IS_VIRTUAL(pGpu) || IS_GSP_CLIENT(pGpu))
2221 {
2222 pGpu->optimizeUseCaseOverride =
2223 NV_REG_STR_RM_OPTIMIZE_COMPUTE_OR_SPARSE_TEX_DEFAULT;
2224 }
2225
2226 pDHPI->vasReverse = !(!pDHPI->vasReverse);
2227
2228 pDHPI->compTagLineMultiplier = 1;
2229
2230 if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RESTRICT_VA_RANGE, &data)
2231 == NV_OK)
2232 {
2233 if (NV_REG_STR_RM_RESTRICT_VA_RANGE_ON == data)
2234 {
2235 pDma->setProperty(pDma, PDB_PROP_DMA_RESTRICT_VA_RANGE, NV_TRUE);
2236 }
2237 }
2238
2239 return NV_OK;
2240 }
2241
2242 void
2243 dmaDestruct_GM107(VirtMemAllocator *pDma)
2244 {
2245 deleteInfoPtr(&pDma->infoList, HAL_IMPL_GF100);
2246 }
2247
2248 // Called when IsSLI = NV_TRUE and all linked GPUs are loaded
2249 NV_STATUS
2250 dmaStatePostLoad_GM107(OBJGPU *pGpu, VirtMemAllocator *pDma, NvU32 flags)
2251 {
2252 #ifdef DEBUG
2253 DMAHALINFO_FERMI *pDHPI = DMA_GET_FERMI_INFOBLK(pDma);
2254 DMAHALINFO_FERMI *pDHPIPeer;
2255 VirtMemAllocator *pPeerDma;
2256
2257 pPeerDma = GPU_GET_DMA(pGpu);
2258 pDHPIPeer = DMA_GET_FERMI_INFOBLK(pPeerDma);
2259
2260 //
2261 // Require these attributes to be symmetric for now. If we need to support
2262 // heterogeneous SLI across GPUs that don't match here we'll need to implement
2263 // dma[Get|Set]TunableState.
2264 //
2265 NV_ASSERT(pDHPIPeer->vasReverse == pDHPI->vasReverse);
2266 NV_ASSERT(pDHPIPeer->compTagLineMultiplier == pDHPI->compTagLineMultiplier);
2267 #endif
2268 return NV_OK;
2269 }
2270
2271 // VMM-TODO: Remove or merge with dmaAllocMapping_GF100.
2272 NV_STATUS
2273 dmaMapBuffer_GM107
2274 (
2275 OBJGPU *pGpu,
2276 VirtMemAllocator *pDma,
2277 OBJVASPACE *pVAS,
2278 MEMORY_DESCRIPTOR *pMemDesc,
2279 NvU64 *pVaddr,
2280 NvU32 flagsForAlloc,
2281 NvU32 flagsForUpdate
2282 )
2283 {
2284 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
2285
2286 KernelGmmu *pKernelGmmu = GPU_GET_KERNEL_GMMU(pGpu);
2287 NvU32 kind;
2288 COMPR_INFO comprInfo;
2289 NvU32 pteCount, aperture;
2290 NvU64 mapLength;
2291 NvU64 vaddr;
2292 NV_STATUS status = NV_OK;
2293 NvU64 rangeLo = 0;
2294 NvU64 rangeHi = 0;
2295 NvU64 compAlign;
2296 NvU64 vaSize;
2297 NvU64 vaAlign;
2298 OBJEHEAP *pVASpaceHeap = vaspaceGetHeap(pVAS);
2299 NvU64 pageSize = 0;
2300 NvU64 pageSizeSubDev = 0;
2301 NvU64 pageOffs = 0;
2302 NvU64 pageOffsSubDev = 0;
2303 NvU32 flags;
2304
2305 DMA_PAGE_ARRAY pageArray;
2306 MEMORY_DESCRIPTOR *pSubDevMemDesc = NULL;
2307 VAS_ALLOC_FLAGS allocFlags = {0};
2308
2309 NV_ASSERT(pVaddr);
2310 NV_ASSERT(pVAS);
2311
2312 //
2313 // Sets the page size for all subdevice memdescs when present. Since we don't support
2314 // different page size per subdevice, it asserts when the page size differs.
2315 //
2316 SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY)
2317 pSubDevMemDesc = memdescGetMemDescFromGpu(pMemDesc, pGpu);
2318 if (memmgrSetMemDescPageSize_HAL(pGpu, pMemoryManager, pSubDevMemDesc, VAS_ADDRESS_TRANSLATION(pVAS),
2319 RM_ATTR_PAGE_SIZE_DEFAULT) != NV_OK)
2320 {
2321 SLI_LOOP_RETURN(NV_ERR_INVALID_ARGUMENT);
2322 }
2323 pageSizeSubDev = memdescGetPageSize(pSubDevMemDesc, VAS_ADDRESS_TRANSLATION(pVAS));
2324 pageOffsSubDev = memdescGetPhysAddr(pSubDevMemDesc, VAS_ADDRESS_TRANSLATION(pVAS), 0) &
2325 (pageSizeSubDev - 1);
2326 if (0 == pageSize)
2327 {
2328 pageSize = pageSizeSubDev;
2329 pageOffs = pageOffsSubDev;
2330 }
2331 else
2332 {
2333 NV_ASSERT(pageSize == pageSizeSubDev);
2334 NV_ASSERT(pageOffs == pageOffsSubDev);
2335 }
2336 SLI_LOOP_END
2337
2338 status = memmgrGetKindComprFromMemDesc(pMemoryManager, pMemDesc, 0, &kind, &comprInfo);
2339 if (status != NV_OK)
2340 {
2341 NV_PRINTF(LEVEL_ERROR, "memmgrGetKindComprFromMemDesc failed\n");
2342 return NV_ERR_GENERIC;
2343 }
2344
2345 if (kgmmuIsPerVaspaceBigPageEn(pKernelGmmu) &&
2346 (pageSize >= RM_PAGE_SIZE_64K))
2347 {
2348 NV_ASSERT(pageSize != RM_PAGE_SIZE_HUGE);
2349 pageSize = vaspaceGetBigPageSize(pVAS);
2350 }
2351
2352 mapLength = RM_ALIGN_UP(pageOffs + memdescGetSize(pMemDesc), pageSize);
2353
2354 vaddr = 0;
2355 compAlign = NVBIT64(comprInfo.compPageShift);
2356 vaAlign = NV_MAX(pageSize, compAlign);
2357 vaSize = RM_ALIGN_UP(mapLength, vaAlign);
2358
2359 if (flagsForAlloc & DMA_ALLOC_VASPACE_SIZE_ALIGNED)
2360 {
2361 NvU64 temp = vaSize;
2362 ROUNDUP_POW2_U64(temp);
2363 vaAlign = NV_MAX(vaAlign, temp);
2364 }
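    //
    // Worked example (illustrative): a 96K request (vaSize = 0x18000) rounds
    // up to the next power of two, 0x20000, so vaAlign becomes at least 128K
    // when DMA_ALLOC_VASPACE_SIZE_ALIGNED is set.
    //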
2365
2366 // Set this first in case we ignore DMA_ALLOC_VASPACE_USE_RM_INTERNAL_VALIMITS next
2367 rangeLo = vaspaceGetVaStart(pVAS);
2368 rangeHi = vaspaceGetVaLimit(pVAS);
2369
2370 if (flagsForAlloc & DMA_ALLOC_VASPACE_USE_RM_INTERNAL_VALIMITS)
2371 {
2372 OBJGVASPACE *pGVAS = dynamicCast(pVAS, OBJGVASPACE);
2373 if (pGVAS)
2374 {
2375 rangeLo = pGVAS->vaStartInternal;
2376 rangeHi = pGVAS->vaLimitInternal;
2377 }
2378 }
2379
2380 // If trying to conserve 32bit address space, map RM buffers at 4GB+
2381 if (pDma->getProperty(pDma, PDB_PROP_DMA_ENFORCE_32BIT_POINTER) &&
2382 (pVASpaceHeap->free > NVBIT64(32)))
2383 {
2384 rangeLo = NV_MAX(NVBIT64(32), rangeLo);
2385 }
2386
2387 if (flagsForAlloc & DMA_VA_LIMIT_57B)
2388 {
2389 rangeHi = NV_MIN(rangeHi, NVBIT64(57) - 1);
2390 }
2391 else if (flagsForAlloc & DMA_VA_LIMIT_49B)
2392 {
2393 rangeHi = NV_MIN(rangeHi, NVBIT64(49) - 1);
2394 }
2395 else if (pDma->getProperty(pDma, PDB_PROP_DMA_RESTRICT_VA_RANGE))
2396 {
2397 // See comments in vaspaceFillAllocParams_IMPL.
2398 rangeHi = NV_MIN(rangeHi, NVBIT64(40) - 1);
2399 }
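    //
    // For illustration: DMA_VA_LIMIT_49B clamps rangeHi to
    // NVBIT64(49) - 1 = 0x1FFFFFFFFFFFF, and PDB_PROP_DMA_RESTRICT_VA_RANGE
    // clamps it to NVBIT64(40) - 1 = 0xFFFFFFFFFF.
    //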
2400
2401 status = vaspaceAlloc(pVAS, vaSize, vaAlign, rangeLo, rangeHi,
2402 pageSize, allocFlags, &vaddr);
2403 if (status != NV_OK)
2404 {
2405 NV_PRINTF(LEVEL_ERROR, "vaspaceAlloc failed\n");
2406 return status;
2407 }
2408
2409 SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY)
2410
2411 pSubDevMemDesc = memdescGetMemDescFromGpu(pMemDesc, pGpu);
2412
2413 pteCount = memdescGetContiguity(pSubDevMemDesc, VAS_ADDRESS_TRANSLATION(pVAS)) ? 1 :
2414 (NvU32)(mapLength >> RM_PAGE_SHIFT);
2415
2416 dmaPageArrayInit(&pageArray,
2417 memdescGetPteArray(pSubDevMemDesc, VAS_ADDRESS_TRANSLATION(pVAS)),
2418 pteCount);
2419 flags = flagsForUpdate;
2420 flags |= memdescGetFlag(pSubDevMemDesc, MEMDESC_FLAGS_GPU_PRIVILEGED) ?
2421 DMA_UPDATE_VASPACE_FLAGS_PRIV : 0;
2422
2423 if (memdescGetAddressSpace(pSubDevMemDesc) == ADDR_FBMEM)
2424 {
2425 aperture = NV_MMU_PTE_APERTURE_VIDEO_MEMORY;
2426 }
2427 else if (memdescGetCpuCacheAttrib(pSubDevMemDesc) == NV_MEMORY_CACHED)
2428 {
2429 aperture = NV_MMU_PTE_APERTURE_SYSTEM_COHERENT_MEMORY;
2430 }
2431 else
2432 {
2433 aperture = NV_MMU_PTE_APERTURE_SYSTEM_NON_COHERENT_MEMORY;
2434 }
2435
2436 status = dmaUpdateVASpace_HAL(pGpu, pDma, pVAS,
2437 pSubDevMemDesc,
2438 NULL,
2439 vaddr, vaddr + mapLength - 1,
2440 flags | DMA_UPDATE_VASPACE_FLAGS_UPDATE_ALL,
2441 &pageArray, 0, &comprInfo,
2442 0,
2443 NV_MMU_PTE_VALID_TRUE,
2444 aperture, 0,
2445 NVLINK_INVALID_FABRIC_ADDR,
2446 NV_FALSE, NV_FALSE,
2447 pageSize);
2448
2449 if (status != NV_OK)
2450 {
2451 SLI_LOOP_BREAK;
2452 }
2453
2454 SLI_LOOP_END
2455
2456 if (status != NV_OK)
2457 {
2458 NV_PRINTF(LEVEL_ERROR, "dmaUpdateVASpace_GF100 failed\n");
2459 vaspaceFree(pVAS, vaddr);
2460 return NV_ERR_GENERIC;
2461 }
2462
2463 if (pVaddr)
2464 {
2465 *pVaddr = vaddr;
2466 }
2467
2468 return NV_OK;
2469 }
2470
2471 void dmaUnmapBuffer_GM107(OBJGPU *pGpu, VirtMemAllocator *pDma, OBJVASPACE *pVAS, NvU64 vaddr)
2472 {
2473 NV_ASSERT_OR_RETURN_VOID(NULL != pVAS);
2474
2475 vaspaceFree(pVAS, vaddr);
2476 }
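//
// Illustrative sketch (not part of the original source): a typical RM-internal
// buffer map/use/unmap pairing with the two routines above. Flags and error
// handling are minimal and the wrapper name is hypothetical.
//
#if 0
static NV_STATUS _exampleSketchMapUseUnmap(OBJGPU *pGpu, VirtMemAllocator *pDma,
                                           OBJVASPACE *pVAS,
                                           MEMORY_DESCRIPTOR *pMemDesc)
{
    NvU64 vaddr = 0;
    NV_STATUS status = dmaMapBuffer_GM107(pGpu, pDma, pVAS, pMemDesc,
                                          &vaddr, 0 /* flagsForAlloc */,
                                          0 /* flagsForUpdate */);
    if (status != NV_OK)
        return status;

    // ... access the buffer through vaddr ...

    dmaUnmapBuffer_GM107(pGpu, pDma, pVAS, vaddr);
    return NV_OK;
}
#endif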
2477
2478 #ifdef DEBUG
2479 /*
2480  * These routines are not used by the RM proper. They are meant to be used by
2481  * external debuggers. Because of this we do not have a global prototype.
2482 */
2483 NvU32 _mmuReadFb32(OBJGPU *pGpu, RmPhysAddr addr, NvU32 aperture);
2484 void _mmuWriteFb32(OBJGPU *pGpu, RmPhysAddr addr, NvU32 data, NvU32 aperture);
2485
2486 NvU32 _mmuReadFb32(OBJGPU *pGpu, RmPhysAddr addr, NvU32 aperture)
2487 {
2488 MEMORY_DESCRIPTOR memDesc = {0};
2489 NvU8 *pOffset = NULL;
2490 NvU32 data = 0;
2491
2492 if (aperture == 0)
2493 aperture = ADDR_FBMEM;
2494 memdescCreateExisting(&memDesc, pGpu, 4, aperture, NV_MEMORY_UNCACHED, MEMDESC_FLAGS_NONE);
2495 memdescDescribe(&memDesc, aperture, addr, 4); // Note that this will probably fail with MODS/sysmem
2496 pOffset = kbusMapRmAperture_HAL(pGpu, &memDesc);
2497 if (pOffset == NULL)
2498 {
2499 NV_ASSERT(pOffset != NULL);
2500 goto _mmuReadFb32_failed;
2501 }
2502
2503 data = MEM_RD32(pOffset);
2504
2505 kbusUnmapRmAperture_HAL(pGpu, &memDesc, &pOffset, NV_TRUE);
2506 _mmuReadFb32_failed:
2507 memdescDestroy(&memDesc);
2508
2509 return data;
2510 }
2511
2512 void _mmuWriteFb32(OBJGPU *pGpu, RmPhysAddr addr, NvU32 data, NvU32 aperture)
2513 {
2514 MEMORY_DESCRIPTOR memDesc = {0};
2515 NvU8 *pOffset = NULL;
2516
2517 if (aperture == 0)
2518 aperture = ADDR_FBMEM;
2519 memdescCreateExisting(&memDesc, pGpu, 4, aperture, NV_MEMORY_UNCACHED, MEMDESC_FLAGS_NONE);
2520 memdescDescribe(&memDesc, aperture, addr, 4); // Note that this will probably fail with MODS/sysmem
2521 pOffset = kbusMapRmAperture_HAL(pGpu, &memDesc);
2522 if (pOffset == NULL)
2523 {
2524 NV_ASSERT(pOffset != NULL);
2525 goto _mmuWriteFb32_failed;
2526 }
2527
2528 MEM_WR32(pOffset, data);
2529
2530 kbusUnmapRmAperture_HAL(pGpu, &memDesc, &pOffset, NV_TRUE);
2531 _mmuWriteFb32_failed:
2532 memdescDestroy(&memDesc);
2533 }
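//
// Illustrative sketch (not part of the original source): a read-modify-write
// of an FB dword as an external debugger might issue it. Aperture 0 defaults
// to ADDR_FBMEM per the helpers above; the address and helper name are
// hypothetical.
//
#if 0
static void _exampleSketchSetBit0(OBJGPU *pGpu, RmPhysAddr addr)
{
    NvU32 val = _mmuReadFb32(pGpu, addr, 0); // aperture 0 => ADDR_FBMEM
    _mmuWriteFb32(pGpu, addr, val | 0x1, 0);
}
#endif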
2534
2535 #endif // DEBUG
2536
2537 //--------------------------------------------------------------------------------
2538 // dmaXlateVAtoPAforChannel_GM107 - this function translates virtual address
2539 // to physical address through page table walk for a given channel id.
2540 //
2541 // Returns NV_OK if translation was successful, NV_ERR_GENERIC otherwise.
2542 //
2543 // Output parameters:
2544 // pAddr - physical address
2545 // memType - memory type where this physical address belongs to
2546 // (ADDR_SYSMEM or ADDR_FBMEM)
2547 //
2548 //--------------------------------------------------------------------------------
2549 NV_STATUS
2550 dmaXlateVAtoPAforChannel_GM107
2551 (
2552 OBJGPU *pGpu,
2553 VirtMemAllocator *pDma,
2554 KernelChannel *pKernelChannel,
2555 NvU64 vAddr,
2556 NvU64 *pAddr,
2557 NvU32 *memType
2558 )
2559 {
2560 NV_ASSERT_OR_RETURN(pKernelChannel != NULL, NV_ERR_INVALID_ARGUMENT);
2561 NV_ASSERT_OR_RETURN(pAddr != NULL, NV_ERR_INVALID_ARGUMENT);
2562 NV_ASSERT_OR_RETURN(memType != NULL, NV_ERR_INVALID_ARGUMENT);
2563
2564 MMU_TRACE_ARG arg = {0};
2565 MMU_TRACE_PARAM params = {0};
2566 NV_STATUS status;
2567
2568 params.mode = MMU_TRACE_MODE_TRANSLATE;
2569 params.va = vAddr;
2570 params.vaLimit = vAddr;
2571 params.pArg = &arg;
2572
2573     status = mmuTrace(pGpu, pKernelChannel->pVAS, &params);
2574 if (status == NV_OK)
2575 {
2576 *memType = arg.aperture;
2577 *pAddr = arg.pa;
2578 }
2579
2580 return status;
2581 }
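//
// Illustrative sketch (not part of the original source): translating a channel
// VA and dispatching on the resulting aperture. The wrapper name is
// hypothetical.
//
#if 0
static NV_STATUS _exampleSketchTranslate(OBJGPU *pGpu, VirtMemAllocator *pDma,
                                         KernelChannel *pKernelChannel, NvU64 va)
{
    NvU64 pa = 0;
    NvU32 memType = 0;
    NV_STATUS status = dmaXlateVAtoPAforChannel_GM107(pGpu, pDma,
                                                      pKernelChannel,
                                                      va, &pa, &memType);
    if (status == NV_OK)
    {
        // memType is ADDR_FBMEM or ADDR_SYSMEM; pa is the backing address.
    }
    return status;
}
#endif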
2582
2583 static NV_STATUS
2584 _dmaApplyWarForBug2720120
2585 (
2586 OBJGVASPACE *pGVAS,
2587 OBJGPU *pGpu,
2588 const NvU64 vaLo,
2589 const NvU64 vaHi
2590 )
2591 {
2592 KernelGmmu *pKernelGmmu = GPU_GET_KERNEL_GMMU(pGpu);
2593 KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu);
2594 GVAS_GPU_STATE *pGpuState = gvaspaceGetGpuState(pGVAS, pGpu);
2595 const GMMU_FMT *pFmt = pGpuState->pFmt;
2596 const GMMU_FMT_FAMILY *pFmtFamily = kgmmuFmtGetFamily(pKernelGmmu, pFmt->version);
2597 GVAS_BLOCK *pVASBlock = NULL;
2598 EMEMBLOCK *pMemBlock = NULL;
2599 MMU_WALK_USER_CTX userCtx = {0};
2600 MMU_MAP_TARGET mapTarget = {0};
2601 MMU_MAP_ITERATOR mapIter = {0};
2602
2603 // MMU_MAP_CTX
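    // A page shift of 29 selects the 512MB level (2^29 bytes = 512M), the
    // PDE granularity targeted by the bug 2720120 WAR.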
2604 mapTarget.pLevelFmt = mmuFmtFindLevelWithPageShift(pFmt->pRoot, 29);
2605 mapTarget.pIter = &mapIter;
2606 mapTarget.MapNextEntries = _gmmuWalkCBMapNextEntries_RmAperture;
2607
2608 //MMU_MAP_ITER
2609 mapIter.pFmt = pFmt;
2610 mapIter.bApplyWarForBug2720120 = NV_TRUE;
2611
2612 // Copy the template
2613 portMemCopy(mapIter.pteTemplate,
2614 mapTarget.pLevelFmt->entrySize, pFmtFamily->bug2720120WarPde1.v8,
2615 mapTarget.pLevelFmt->entrySize);
2616
2617 pMemBlock = pGVAS->pHeap->eheapGetBlock(pGVAS->pHeap, vaLo, 0);
2618 NV_ASSERT_OR_RETURN(pMemBlock != NULL, NV_ERR_INVALID_ARGUMENT);
2619 pVASBlock = pMemBlock->pData;
2620
2621 gvaspaceWalkUserCtxAcquire(pGVAS, pGpu, pVASBlock, &userCtx);
2622 NV_ASSERT_OK_OR_RETURN(mmuWalkMap(userCtx.pGpuState->pWalk,
2623 vaLo, vaHi, &mapTarget));
2624 gvaspaceWalkUserCtxRelease(pGVAS, &userCtx);
2625
2626 // Flush PTE writes to vidmem and issue TLB invalidate
2627 kbusFlush_HAL(pGpu, pKernelBus, BUS_FLUSH_VIDEO_MEMORY |
2628 BUS_FLUSH_SYSTEM_MEMORY);
2629 gvaspaceInvalidateTlb(pGVAS, pGpu, PTE_UPGRADE);
2630
2631 return NV_OK;
2632 }
2633
2634 NV_STATUS
2635 dmaInitGart_GM107(OBJGPU *pGpu, VirtMemAllocator *pDma)
2636 {
2637 KernelBif *pKernelBif = GPU_GET_KERNEL_BIF(pGpu);
2638 pDma->gpuGartCaps = DMA_GPU_GART_CAPS_NOSNOOP;
2639
2640 if ((pKernelBif != NULL) && FLD_TEST_REF(BIF_DMA_CAPS_SNOOP, _CTXDMA,
2641 kbifGetDmaCaps(pGpu, pKernelBif)))
2642 {
2643 pDma->gpuGartCaps |= DMA_GPU_GART_CAPS_SNOOP;
2644 }
2645
2646 return NV_OK;
2647 }
2648
2649 /*!
2650  * @brief This function returns the size of a GPU PTE
2651 *
2652 * @param[in] pGpu OBJGPU pointer
2653 *
2654  * @returns The size of a GPU PTE, in bytes
2655 */
2656 NvU32
2657 dmaGetPTESize_GM107(OBJGPU *pGpu, VirtMemAllocator *pDma)
2658 {
2659 return NV_MMU_PTE__SIZE;
2660 }
2661