/*
 * SPDX-FileCopyrightText: Copyright (c) 2012-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "core/core.h"
#include "gpu/gpu.h"
#include "gpu/device/device.h"
#include "os/os.h"
#include "gpu/bus/kern_bus.h"
#include "gpu/mem_mgr/mem_mgr.h"
#include "gpu/mem_mgr/heap.h"
#include "gpu/mem_mgr/mem_scrub.h"
#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "gpu/mem_mgr/mem_desc.h"
#include "gpu/ce/kernel_ce.h"
#include "gpu/ce/kernel_ce_private.h"
#include "mem_mgr/gpu_vaspace.h"
#include "core/locks.h"
#include "nvRmReg.h"
#include "rmapi/rs_utils.h"
#include "mem_mgr/ctx_buf_pool.h"
#include "vgpu/rpc.h"
#include "kernel/gpu/fifo/kernel_channel.h"
#include "platform/chipset/chipset.h"
#include "platform/sli/sli.h"

#include "class/clc0b5sw.h"
#include "class/cla06fsubch.h" // NVA06F_SUBCHANNEL_COPY_ENGINE
#include "class/cl003e.h"      // NV01_MEMORY_SYSTEM
#include "class/cl0040.h"      // NV01_MEMORY_LOCAL_USER
#include "class/cl50a0.h"      // NV50_MEMORY_VIRTUAL
#include "class/cl00c2.h"      // NV01_MEMORY_LOCAL_PHYSICAL
#include "class/clb0b5.h"      // MAXWELL_DMA_COPY_A
#include "class/clc8b5.h"      // HOPPER_DMA_COPY_A
#include "class/cl90f1.h"      // FERMI_VASPACE_A

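//
// Sizes (in bytes) of the optional methods that may be appended after a
// block's copy/scrub methods: a non-stall interrupt and a semaphore release.
// _ceChannelScheduleWork_GM107() reserves MAX_EXTRA_PAYLOAD of headroom per
// block when checking for pushbuffer space.
//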
#define NONSTALL_METHOD_SIZE            8
#define SEMAPHORE_ONLY_METHOD_SIZE      32
#define MAX_EXTRA_PAYLOAD               (NONSTALL_METHOD_SIZE + SEMAPHORE_ONLY_METHOD_SIZE)


static NV_STATUS _memUtilsChannelAllocatePB_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsAllocateChannel(OBJGPU *pGpu, MemoryManager *pMemoryManager, NvHandle hClientId,
                                          NvHandle hDeviceId, NvHandle hChannelId, NvHandle hObjectError,
                                          NvHandle hObjectBuffer, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsAllocCe_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                        NvHandle hClientId, NvHandle hDeviceId, NvHandle hChannelId, NvHandle hCopyObjectId);
static NV_STATUS _memUtilsAllocateUserD(OBJGPU *pGpu, MemoryManager *pMemoryManager, NvHandle hClientId,
                                        NvHandle hDeviceId, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsMapUserd_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager,
                                         OBJCHANNEL *pChannel, NvHandle hClientId, NvHandle hDeviceId,
                                         NvHandle hChannelId, NvBool bUseRmApiForBar1);
static NV_STATUS _memUtilsAllocateReductionSema(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel);
static NvU32 _ceChannelScheduleWork_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                          RmPhysAddr src, NV_ADDRESS_SPACE srcAddressSpace, NvU32 srcCpuCacheAttrib,
                                          RmPhysAddr dst, NV_ADDRESS_SPACE dstAddressSpace, NvU32 dstCpuCacheAttrib,
                                          NvU64 size, NvBool blocking, NvBool insertFinishPayload, NvBool bMemcopy);
static void _ceChannelUpdateGpFifo_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                         NvU32 gpOffset, NvU32 gpSize);
static NvU32 _ceChannelPushMethodsBlock_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                              RmPhysAddr src, NV_ADDRESS_SPACE srcAddressSpace, NvU32 srcCpuCacheAttrib,
                                              RmPhysAddr dst, NV_ADDRESS_SPACE dstAddressSpace, NvU32 dstCpuCacheAttrib,
                                              NvU64 size, NvU32 **pPtr, NvBool addPayloadSema,
                                              NvBool addNonStallIntr, NvBool addFinishPayload, NvBool bMemcopy);
static NvU32 _getSpaceInPb(OBJCHANNEL *pChannel);
static NvBool _checkSynchronization(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel, NvU32 block);

static NV_STATUS
_memUtilsAllocateReductionSema
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{

    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NV_STATUS                   rmStatus;
    NvU32                       i;
    NV_STATUS                   lockStatus;
    RM_API                     *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
    // allocate physical memory for a bit map semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));

    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
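    // One bit per block, rounded up to whole 32-bit words (4 bytes each)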
    memAllocParams.size      = (((pChannel->blockCount + 31) / 32) * 4);
    memAllocParams.attr      = 0;
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    // memAllocParams.attr  |= NVOS32_ATTR_COHERENCY_WRITE_COMBINE;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;

    //
    // When the APM feature is enabled, all RM internal sysmem allocations must
    // be in unprotected memory.
    // When Hopper CC is enabled, RM internal sysmem allocations that need to
    // be accessed from the GPU should be in unprotected memory, while sysmem
    // allocations that do not need GPU access should be in protected memory.
    //

    NV_ASSERT_OK_OR_RETURN(
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                pChannel->deviceId,
                                pChannel->bitMapSemPhysId,
                                NV01_MEMORY_SYSTEM,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate virtual memory for a bit map semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = (((pChannel->blockCount + 31) / 32) * 4);
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;

    NV_ASSERT_OK_OR_RETURN(
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                pChannel->deviceId,
                                pChannel->bitMapSemVirtId,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);
    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating reduction sema");
        return NV_ERR_GENERIC;
    }

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        pRmApi->Map(pRmApi,
                    pChannel->hClient,
                    pChannel->deviceId,
                    pChannel->bitMapSemVirtId,
                    pChannel->bitMapSemPhysId, // hMemory
                    0,
                    (((pChannel->blockCount + 31) / 32) * 4),
                    NV04_MAP_MEMORY_FLAGS_NONE,
                    &pChannel->pbGpuBitMapVA),
        exit_sema_creation);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        pRmApi->MapToCpu(pRmApi,
                         pChannel->hClient,
                         pChannel->deviceId,
                         pChannel->bitMapSemPhysId,
                         0,
                         (((pChannel->blockCount + 31) / 32) * 4),
                         (void **)&pChannel->pbBitMapVA,
                         0),
        exit_sema_creation);

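    // Clear the bitmap so no block is initially marked as scrubbed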
    for (i = 0; i < (((pChannel->blockCount + 31) / 32) * 4); i += 4)
    {
        MEM_WR32((NvU8*)pChannel->pbBitMapVA + i, 0);
    }

    return NV_OK;
exit_sema_creation:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    NV_PRINTF(LEVEL_INFO, "end NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}

static NV_STATUS
_memUtilsChannelAllocatePB_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel

    // OBJMEMUTILS *to be added here
)
{
    NV_STATUS                   rmStatus = NV_OK;
    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NvHandle                    hDevice;
    NvHandle                    hPhysMem;
    NvU64                       size;
    NvHandle                    hVirtMem;
    NvU32                       hClass;
    NvU32                       attr;
    NvU32                       flags        = 0;
    NvU32                       attrNotifier = NVOS32_ATTR_NONE;
    RM_API                     *pRmApi       = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    // Apply registry overrides to channel pushbuffer.
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC_4, _CHANNEL_PUSHBUFFER, pGpu->instLocOverrides4))
    {
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_VID:
            hClass = NV01_MEMORY_LOCAL_USER;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM) |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);

            flags = NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
            if (!IS_MIG_IN_USE(pGpu))
            {
                attr |= DRF_DEF(OS32, _ATTR, _ALLOCATE_FROM_RESERVED_HEAP, _YES);
            }
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_COH:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED);
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_NCOH:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_DEFAULT:
        default:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);

            //
            // The work submit token is read from notifier memory to support
            // VM migration for the memory scrubber. The token is read from
            // the notifier memory every time the scrubber submits work, so
            // making the notifier memory cached by default helps performance.
            //
            attrNotifier = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) |
                           DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED);
            break;
    }

    hDevice  = pChannel->deviceId;
    hPhysMem = pChannel->physMemId;
    hVirtMem = pChannel->pushBufferId;
    size     = pChannel->channelSize;

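    // The allocations below must be made without the GPU lock held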
    LOCK_ASSERT_AND_RETURN(!rmGpuLockIsOwner());
    // allocate the physical memory
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner         = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type          = NVOS32_TYPE_IMAGE;
    memAllocParams.size          = size;
    memAllocParams.attr          = attr;
    memAllocParams.attr2         = NVOS32_ATTR2_NONE;
    memAllocParams.flags         = flags;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, RM internal sysmem allocations that need to
    // be accessed from the GPU should be in unprotected memory, but all
    // vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                hPhysMem,
                                hClass,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate the virtual memory
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner    = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type     = NVOS32_TYPE_IMAGE;
    memAllocParams.size     = size;
    memAllocParams.attr     = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2    = NVOS32_ATTR2_NONE;
    memAllocParams.flags   |= NVOS32_ALLOC_FLAGS_VIRTUAL;
    memAllocParams.hVASpace = pChannel->hVASpaceId;

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                hVirtMem,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate the physmem for the notifier

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        //
        // Force the error notifier to non-coherent sysmem when CC is enabled,
        // since the key rotation notifier is part of the error notifier and
        // needs to be in sysmem so we can create a persistent mapping for it.
        // We cannot create mappings on the fly, since this notifier is
        // written as part of a 1-second callback where creating mappings is
        // not allowed.
        //
        hClass       = NV01_MEMORY_SYSTEM;
        attrNotifier = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) |
                       DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
    }

    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner         = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type          = NVOS32_TYPE_IMAGE;
    memAllocParams.size          = pChannel->channelNotifierSize;
    memAllocParams.attr          = attrNotifier;
    memAllocParams.attr2         = NVOS32_ATTR2_NONE;
    memAllocParams.flags         = 0;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, RM internal sysmem allocations that need to
    // be accessed from the GPU should be in unprotected memory, but all
    // vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                pChannel->errNotifierIdPhys,
                                hClass,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate virtual memory for the notifier
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner    = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type     = NVOS32_TYPE_IMAGE;
    memAllocParams.size     = pChannel->channelNotifierSize;
    memAllocParams.attr     = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2    = NVOS32_ATTR2_NONE;
    memAllocParams.flags   |= NVOS32_ALLOC_FLAGS_VIRTUAL;
    memAllocParams.hVASpace = pChannel->hVASpaceId;

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                pChannel->errNotifierIdVirt,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    return rmStatus;
}

NV_STATUS
memmgrMemUtilsChannelInitialize_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS         rmStatus;
    NV_STATUS         lockStatus;
    RsClient         *pRsClient         = pChannel->pRsClient;
    NvHandle          hClient           = pChannel->hClient;
    NvHandle          hDevice           = pChannel->deviceId;
    NvHandle          hPhysMem          = pChannel->physMemId;
    NvU64             size              = pChannel->channelSize;
    NvHandle          hChannel          = pChannel->channelId;
    NvHandle          hErrNotifierVirt  = pChannel->errNotifierIdVirt;
    NvHandle          hErrNotifierPhys  = pChannel->errNotifierIdPhys;
    NvHandle          hPushBuffer       = pChannel->pushBufferId;
    RM_API           *pRmApi            = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NvBool            bMIGInUse         = IS_MIG_IN_USE(pGpu);
    NvU8             *pErrNotifierCpuVA = NULL;
    NV_ADDRESS_SPACE  userdAddrSpace;
    NV_ADDRESS_SPACE  pushBuffAddrSpace;
    NV_ADDRESS_SPACE  gpFifoAddrSpace;
    OBJSYS           *pSys              = SYS_GET_INSTANCE();
    OBJCL            *pCl               = SYS_GET_CL(pSys);
    NvU32             cacheSnoopFlag    = 0;
    NvBool            bUseRmApiForBar1  = NV_FALSE;

    //
    // Heap alloc one chunk of memory to hold all of our alloc parameters to
    // reduce stack usage
    //
    union
    {
        NV_VASPACE_ALLOCATION_PARAMETERS va;
        NV_MEMORY_ALLOCATION_PARAMS      mem;
    } *pParams = NULL;

    if (pCl->getProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT))
    {
        cacheSnoopFlag = DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE);
    }

    pParams = portMemAllocNonPaged(sizeof(*pParams));
    if (pParams == NULL)
    {
        rmStatus = NV_ERR_NO_MEMORY;
        goto exit_free_client;
    }

    //
    // Client-allocated USERD is only supported on Volta+.
    // TODO: Use a property to check if client-allocated USERD is supported.
    //
    pChannel->bClientUserd = IsVOLTAorBetter(pGpu);

    //
    // We need to allocate a VAS to use for CE copies, but also for
    // GSP-RM + MIG, so that it doesn't get the device
    // default VAS during channel bind (which is not properly handled
    // by split VAS in MIG currently). We only need the identity mapping
    // when actually using the VAS for copies.
    //
    if (pChannel->bUseVasForCeCopy ||
        (IS_GSP_CLIENT(pGpu) && bMIGInUse))
    {
        NvBool bAcquireLock  = NV_FALSE;
        NvU64  startFbOffset = GPU_GET_HEAP(pGpu)->base;
        NvU64  fbSize        = GPU_GET_HEAP(pGpu)->total;
        NvU64  vaStartOffset = startFbOffset;

        NV_PRINTF(LEVEL_INFO, "Channel VAS heap base: %llx total: %llx \n", GPU_GET_HEAP(pGpu)->base,
                  GPU_GET_HEAP(pGpu)->total);

        pChannel->startFbOffset = startFbOffset;
        pChannel->fbSize        = fbSize;

        if (pChannel->bUseVasForCeCopy)
        {
            NV_ASSERT_OK_OR_GOTO(rmStatus,
                clientGenResourceHandle(pRsClient, &pChannel->hFbAlias),
                exit_free_client);

            rmStatus = memmgrMemUtilsCreateMemoryAlias_HAL(pGpu, pMemoryManager, pChannel);
            if (rmStatus != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "Setting Identity mapping failed, status: %x\n", rmStatus);
                goto exit_free_client;
            }
        }

        {
            NV_VASPACE_ALLOCATION_PARAMETERS *pVa = &pParams->va;

            portMemSet(pVa, 0, sizeof(*pVa));
            pVa->index  = NV_VASPACE_ALLOCATION_INDEX_GPU_NEW;
            pVa->vaBase = pChannel->startFbOffset;
            //
            // How large should we go here? We definitely need more than heapSize to allocate
            // other metadata related to the channel. Also need to account for the discontiguous VA range
            // for split VAS, where we allocate 4GB to (4GB + 512MB) for Server VAS (mirrored).
            // Rough VASpace layout will be documented here:
            //
            //
            if (gpuIsSplitVasManagementServerClientRmEnabled(pGpu))
            {
                pVa->vaSize += (SPLIT_VAS_SERVER_RM_MANAGED_VA_START +
                                SPLIT_VAS_SERVER_RM_MANAGED_VA_SIZE);
            }
            pVa->vaSize += fbSize + pChannel->channelSize + SCRUBBER_VASPACE_BUFFER_SIZE;

            //
            // We definitely need ALLOW_ZERO_ADDRESS, but SKIP_SCRUB_MEMPOOL is a patch
            // until we figure out the right place for Scrubber page tables
            //
            pVa->flags |= NV_VASPACE_ALLOCATION_FLAGS_ALLOW_ZERO_ADDRESS |
                          NV_VASPACE_ALLOCATION_FLAGS_SKIP_SCRUB_MEMPOOL |
                          NV_VASPACE_ALLOCATION_FLAGS_OPTIMIZE_PTETABLE_MEMPOOL_USAGE;

            if (!IS_MIG_IN_USE(pGpu))
            {
                pVa->flags |= NV_VASPACE_ALLOCATION_FLAGS_PTETABLE_HEAP_MANAGED;
            }

            if (rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
            {
                rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
                bAcquireLock = NV_TRUE;
                pRmApi = rmapiGetInterface(RMAPI_API_LOCK_INTERNAL);
            }

            rmStatus = pRmApi->AllocWithHandle(pRmApi, hClient, pChannel->deviceId,
                                               pChannel->hVASpaceId, FERMI_VASPACE_A,
                                               pVa, sizeof(*pVa));
        }
        if (bAcquireLock)
        {
            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus, rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));
            bAcquireLock = NV_FALSE;
            pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        }

        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "failed allocating scrubber vaspace, status=0x%x\n",
                      rmStatus);
            goto exit_free_client;
        }

        rmStatus = vaspaceGetByHandleOrDeviceDefault(pRsClient,
                                                     pChannel->deviceId,
                                                     pChannel->hVASpaceId,
                                                     &pChannel->pVAS);
        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR,
                      "failed getting the scrubber vaspace from handle, status=0x%x\n",
                      rmStatus);
            goto exit_free_client;
        }

        if (pChannel->bUseVasForCeCopy)
        {
            if (!gpuIsWarBug200577889SriovHeavyEnabled(pGpu))
            {
                rmStatus = vaspacePinRootPageDir(pChannel->pVAS, pGpu);
                if (rmStatus != NV_OK)
                {
                    NV_PRINTF(LEVEL_ERROR, "failed pinning down Scrubber VAS, status=0x%x\n",
                              rmStatus);
                    goto exit_free_client;
                }
            }

            NV_ASSERT_OK_OR_GOTO(rmStatus,
                clientGenResourceHandle(pRsClient, &pChannel->hFbAliasVA), exit_free_client);
        }

        if (gpuIsSplitVasManagementServerClientRmEnabled(pGpu))
        {
            OBJGVASPACE *pGVAS = dynamicCast(pChannel->pVAS, OBJGVASPACE);
            vaStartOffset += pGVAS->vaLimitServerRMOwned + 1;
            pChannel->vaStartOffset = vaStartOffset;
        }

        if (rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
        {
            rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
            bAcquireLock = NV_TRUE;
            pRmApi = rmapiGetInterface(RMAPI_API_LOCK_INTERNAL);
        }

        // Allocate virtual memory for Identity Mapping
        if (pChannel->bUseVasForCeCopy)
        {
            NV_MEMORY_ALLOCATION_PARAMS *pMem = &pParams->mem;
            portMemSet(pMem, 0, sizeof(*pMem));
            pMem->owner  = NVOS32_TYPE_OWNER_RM;
            pMem->type   = NVOS32_TYPE_IMAGE;
            pMem->size   = pChannel->fbSize;
            pMem->attr   = (DRF_DEF(OS32, _ATTR, _LOCATION, _PCI) |
                            DRF_DEF(OS32, _ATTR, _PAGE_SIZE, _BIG));
            pMem->attr2  = NVOS32_ATTR2_NONE;
            pMem->offset = vaStartOffset;
            pMem->flags  = 0;
            pMem->flags |= NVOS32_ALLOC_FLAGS_VIRTUAL |
                           NVOS32_ALLOC_FLAGS_FIXED_ADDRESS_ALLOCATE |
                           NVOS32_ALLOC_FLAGS_LAZY;
            pMem->hVASpace = pChannel->hVASpaceId;

            rmStatus = pRmApi->AllocWithHandle(pRmApi,
                                               hClient,
                                               pChannel->deviceId,
                                               pChannel->hFbAliasVA,
                                               NV50_MEMORY_VIRTUAL,
                                               pMem,
                                               sizeof(*pMem));
        }

        if (bAcquireLock)
        {
            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus, rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));
            bAcquireLock = NV_FALSE;
            pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        }

        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Allocating VASpace for (base, size): (%llx, %llx) failed,"
                      " with status: %x\n", vaStartOffset, pChannel->fbSize, rmStatus);
            goto exit_free_client;
        }

        // set up mapping of VA -> PA
        if (pChannel->bUseVasForCeCopy)
        {
            NV_CHECK_OK_OR_GOTO(
                rmStatus,
                LEVEL_ERROR,
                pRmApi->Map(pRmApi,
                            hClient,
                            pChannel->deviceId,
                            pChannel->hFbAliasVA,
                            pChannel->hFbAlias,
                            0,
                            pChannel->fbSize,
                            DRF_DEF(OS46, _FLAGS, _ACCESS, _READ_WRITE) |
                            DRF_DEF(OS46, _FLAGS, _PAGE_SIZE, _BIG) |
                            DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE),
                            &pChannel->fbAliasVA),
                exit_free_client);

            NV_PRINTF(LEVEL_INFO, "Scrubber VAS :%x identity mapped with start addr: %llx, size: %llx\n",
                      pChannel->hFbAliasVA, pChannel->fbAliasVA, pChannel->fbSize);
        }
    }

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    //
    // Fetch the physical location of the push buffer
    //
    // Bug 3434881 filed to track the following:
    // a. Implementation of the utility function to parse the
    //    push buffer and USERD regkeys
    // b. Replace all instances of pushbuffer/USERD regkey
    //    parsing with the utility function
    //
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC_4, _CHANNEL_PUSHBUFFER, pGpu->instLocOverrides4))
    {
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_VID:
            pushBuffAddrSpace = ADDR_FBMEM;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_COH:
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_NCOH:
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_DEFAULT:
        default:
            pushBuffAddrSpace = ADDR_SYSMEM;
            break;
    }

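    // The GPFIFO ring lives in the same allocation as the pushbuffer, so it
    // shares the pushbuffer's address space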
    gpFifoAddrSpace = pushBuffAddrSpace;

    // Fetch the physical location of USERD
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC, _USERD, pGpu->instLocOverrides))
    {
        case NV_REG_STR_RM_INST_LOC_USERD_NCOH:
        case NV_REG_STR_RM_INST_LOC_USERD_COH:
            userdAddrSpace = ADDR_SYSMEM;
            break;

        case NV_REG_STR_RM_INST_LOC_USERD_VID:
        case NV_REG_STR_RM_INST_LOC_USERD_DEFAULT:
        default:
            userdAddrSpace = ADDR_FBMEM;
            break;
    }

    // RM WAR for Bug 3313719
    // Disallow USERD in sysmem and (GPFIFO or pushbuffer) in vidmem
    rmStatus = kfifoCheckChannelAllocAddrSpaces_HAL(GPU_GET_KERNEL_FIFO(pGpu),
                                                    userdAddrSpace,
                                                    pushBuffAddrSpace,
                                                    gpFifoAddrSpace);
    if (rmStatus != NV_OK)
    {
        NV_ASSERT_FAILED("USERD in sysmem and PushBuffer/GPFIFO in vidmem not allowed");
        goto exit_free_client;
    }

    _memUtilsChannelAllocatePB_GM107(pGpu, pMemoryManager, pChannel);
    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);
    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating Push Buffer sema");
        goto exit_free_client;
    }

    // map the pushbuffer
    rmStatus = pRmApi->Map(pRmApi, hClient, hDevice,
                           hPushBuffer,
                           hPhysMem, // hMemory
                           0,
                           size,
                           cacheSnoopFlag,
                           &pChannel->pbGpuVA);
    // map the error notifier
    rmStatus = pRmApi->Map(pRmApi, hClient, hDevice,
                           hErrNotifierVirt,
                           hErrNotifierPhys, // hMemory
                           0,
                           pChannel->channelNotifierSize,
                           DRF_DEF(OS46, _FLAGS, _KERNEL_MAPPING, _ENABLE) | cacheSnoopFlag,
                           &pChannel->pbGpuNotifierVA);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsAllocateChannel(pGpu,
                                 pMemoryManager,
                                 hClient,
                                 hDevice,
                                 hChannel,
                                 hErrNotifierVirt,
                                 hPushBuffer,
                                 pChannel),
        exit_free_client);

    // _memUtilsMapUserd
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsMapUserd_GM107(pGpu, pMemoryManager, pChannel,
                                hClient, hDevice, hChannel, bUseRmApiForBar1),
        exit_free_client);

    // Set up pushbuffer and semaphore memdesc and memset the buffer
    pChannel->pChannelBufferMemdesc =
        memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, pChannel->hClient, hPhysMem);
    NV_ASSERT_OR_GOTO(pChannel->pChannelBufferMemdesc != NULL, exit_free_client);

    // Set up notifier memory
    pChannel->pErrNotifierMemdesc =
        memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, pChannel->hClient, hErrNotifierPhys);
    NV_ASSERT_OR_GOTO(pChannel->pErrNotifierMemdesc != NULL, exit_free_client);

    if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
    {
        rmStatus = memmgrMemDescMemSet(pMemoryManager, pChannel->pChannelBufferMemdesc, 0,
                                       (TRANSFER_FLAGS_SHADOW_ALLOC | TRANSFER_FLAGS_SHADOW_INIT_MEM));
        NV_ASSERT_OR_GOTO(rmStatus == NV_OK, exit_free_client);

        pChannel->pbCpuVA            = NULL;
        pChannel->pTokenFromNotifier = NULL;
    }
    else
    {
        if (bUseRmApiForBar1)
        {
            NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
                pRmApi->MapToCpu(pRmApi, hClient, hDevice, hPhysMem, 0, size,
                                 (void **)&pChannel->pbCpuVA, 0),
                exit_free_client);

            NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
                pRmApi->MapToCpu(pRmApi, hClient, hDevice, hErrNotifierPhys, 0,
                                 pChannel->channelNotifierSize, (void **)&pErrNotifierCpuVA, 0),
                exit_free_client);
        }
        else
        {
            //
            // Most use cases can migrate to the internal memdescMap path for BAR1,
            // and it is preferred because the external path will not work with CC.
            //
            pChannel->pbCpuVA = memmgrMemDescBeginTransfer(pMemoryManager,
                                    pChannel->pChannelBufferMemdesc, TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_GOTO(pChannel->pbCpuVA != NULL, exit_free_client);

            pErrNotifierCpuVA = memmgrMemDescBeginTransfer(pMemoryManager,
                                    pChannel->pErrNotifierMemdesc, TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_GOTO(pErrNotifierCpuVA != NULL, exit_free_client);
        }

        portMemSet(pChannel->pbCpuVA, 0, (NvLength)size);

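        // The work submit token lives at the WORK_SUBMIT_TOKEN notification
        // type's slot within the error notifier array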
        pChannel->pTokenFromNotifier =
            (NvNotification *)(pErrNotifierCpuVA +
                               (NV_CHANNELGPFIFO_NOTIFICATION_TYPE_WORK_SUBMIT_TOKEN *
                                sizeof(NvNotification)));
    }

    //
    // Allocate and map the doorbell region to use in scrub-on-free.
    // Set the doorbell register to false, since pre-Volta chips don't support it.
    //
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        memmgrScrubMapDoorbellRegion_HAL(pGpu, pMemoryManager, pChannel),
        exit_free_client);

    portMemFree(pParams);
    return NV_OK;

exit_free_client:
    if (!pChannel->bClientAllocated)
    {
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    }
    portMemFree(pParams);
    NV_PRINTF(LEVEL_INFO, "end NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}


/** memmgrMemUtilsCreateMemoryAlias_GM107
 *
 * @brief Creates an alias for the FB region.
 *        This function doesn't allocate any memory; it just creates a memory
 *        handle that refers to the FB range. It is supported on both
 *        bare metal and vGPU.
 * @param[in] pChannel CHANNEL Pointer
 *
 * @returns NV_OK on success
 */
NV_STATUS
memmgrMemUtilsCreateMemoryAlias_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    RM_API   *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NV_STATUS status = NV_OK;

    NV_PHYSICAL_MEMORY_ALLOCATION_PARAMS physMemParams = {0};

    memmgrGetPteKindForScrubber_HAL(pMemoryManager, &physMemParams.format);

    status = pRmApi->AllocWithHandle(pRmApi,
                                     pChannel->hClient,
                                     pChannel->deviceId,
                                     pChannel->hFbAlias,
                                     NV01_MEMORY_LOCAL_PHYSICAL,
                                     &physMemParams,
                                     sizeof(physMemParams));
    if (status != NV_OK)
    {
        NV_CHECK_OK_FAILED(LEVEL_WARNING, "Aliasing FbListMem", status);
        return status;
    }

    NV_PRINTF(LEVEL_INFO, "Allocating FbAlias: %x for size: %llx, kind: %x\n", pChannel->hFbAlias,
              pChannel->fbSize, physMemParams.format);

    return NV_OK;
}

NV_STATUS
memmgrMemUtilsCopyEngineInitialize_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS rmStatus = NV_OK;
    RM_API   *pRmApi   = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    // Allocate the copy engine object
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsAllocCe_GM107(pGpu,
                               pMemoryManager,
                               pChannel,
                               pChannel->hClient,
                               pChannel->deviceId,
                               pChannel->channelId,
                               pChannel->engineObjectId),
        exit_free);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        memmgrMemUtilsChannelSchedulingSetup(pGpu, pMemoryManager, pChannel), exit_free);

    return NV_OK;

exit_free:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    NV_PRINTF(LEVEL_INFO, "end NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}

static NV_STATUS _memUtilsAllocCe_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvHandle       hCopyObjectId

)
{
    NVC0B5_ALLOCATION_PARAMETERS createParams = {0};
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    createParams.version    = NVC0B5_ALLOCATION_PARAMETERS_VERSION_1;
    createParams.engineType = NV2080_ENGINE_TYPE_COPY(pChannel->ceId);
    memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &pChannel->hTdCopyClass);
    pChannel->engineType    = gpuGetRmEngineType(createParams.engineType);

    if (!pChannel->hTdCopyClass)
    {
        NV_PRINTF(LEVEL_ERROR, "Unable to determine CE's engine class.\n");
        return NV_ERR_GENERIC;
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                hClientId,
                                hChannelId,
                                hCopyObjectId,
                                pChannel->hTdCopyClass,
                                &createParams,
                                sizeof(createParams)));

    return NV_OK;
}

static NV_STATUS
_memUtilsMapUserd_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvBool         bUseRmApiForBar1
)
{
    //
    // The memTransfer API only works for client-allocated USERD
    // because otherwise we are calling MapToCpu using the channel
    // handle instead.
    //
    if (pChannel->bClientUserd && !bUseRmApiForBar1)
    {
        pChannel->pUserdMemdesc =
            memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, hClientId, pChannel->hUserD);
        NV_ASSERT_OR_RETURN(pChannel->pUserdMemdesc != NULL, NV_ERR_GENERIC);

        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            //
            // GPFIFO access will not be set up in order to facilitate memTransfer APIs,
            // which will use GSP-DMA/CE with shadow buffers
            //
            pChannel->pControlGPFifo = NULL;
        }
        else
        {
            pChannel->pControlGPFifo =
                (void *)memmgrMemDescBeginTransfer(pMemoryManager, pChannel->pUserdMemdesc,
                                                   TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);
        }
    }
    else
    {
        NvU32 userdSize = 0;
        RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        kfifoGetUserdSizeAlign_HAL(GPU_GET_KERNEL_FIFO(pGpu), &userdSize, NULL);

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            pRmApi->MapToCpu(pRmApi, hClientId, hDeviceId,
                             pChannel->bClientUserd ? pChannel->hUserD : hChannelId, 0,
                             userdSize, (void **)&pChannel->pControlGPFifo, 0));
    }
    return NV_OK;
}

static NV_STATUS
_memUtilsAllocateUserD
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS                   rmStatus      = NV_OK;
    KernelFifo                 *pKernelFifo   = GPU_GET_KERNEL_FIFO(pGpu);
    RM_API                     *pRmApi        = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NvU32                       userdMemClass = NV01_MEMORY_LOCAL_USER;

    // Ensure that call is not made with lock held
    LOCK_ASSERT_AND_RETURN(!rmGpuLockIsOwner());

    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));

    memAllocParams.owner = HEAP_OWNER_RM_CLIENT_GENERIC;
    kfifoGetUserdSizeAlign_HAL(pKernelFifo, (NvU32 *)&memAllocParams.size, NULL);
    memAllocParams.type          = NVOS32_TYPE_IMAGE;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    // Apply registry overrides to USERD.
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC, _USERD, pGpu->instLocOverrides))
    {
        case NV_REG_STR_RM_INST_LOC_USERD_NCOH:
        case NV_REG_STR_RM_INST_LOC_USERD_COH:
            userdMemClass = NV01_MEMORY_SYSTEM;
            memAllocParams.attr = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
            break;

        case NV_REG_STR_RM_INST_LOC_USERD_VID:
        case NV_REG_STR_RM_INST_LOC_USERD_DEFAULT:
            userdMemClass = NV01_MEMORY_LOCAL_USER;
            memAllocParams.attr   = DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM);
            memAllocParams.flags |= NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
            if (!IS_MIG_IN_USE(pGpu))
            {
                memAllocParams.attr |= DRF_DEF(OS32, _ATTR, _ALLOCATE_FROM_RESERVED_HEAP, _YES);
            }
            break;
    }

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, RM internal sysmem allocations that need to
    // be accessed from the GPU should be in unprotected memory, but all
    // vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_ASSERT_OK_OR_RETURN(pRmApi->AllocWithHandle(pRmApi, hClientId, hDeviceId,
                                                   pChannel->hUserD,
                                                   userdMemClass,
                                                   &memAllocParams,
                                                   sizeof(memAllocParams)));

    return rmStatus;
}

static NV_STATUS
_memUtilsAllocateChannel
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvHandle       hObjectError,
    NvHandle       hObjectBuffer,
    OBJCHANNEL    *pChannel
)
{
    NV_CHANNEL_ALLOC_PARAMS channelGPFIFOAllocParams;
    NV_STATUS               rmStatus   = NV_OK;
    NvU32                   hClass;
    RM_API                 *pRmApi     = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NvBool                  bMIGInUse  = IS_MIG_IN_USE(pGpu);
    NvU32                   flags      = DRF_DEF(OS04, _FLAGS, _CHANNEL_SKIP_SCRUBBER, _TRUE);
    RM_ENGINE_TYPE          engineType = (pChannel->type == SWL_SCRUBBER_CHANNEL) ?
                                         RM_ENGINE_TYPE_SEC2 : RM_ENGINE_TYPE_COPY(pChannel->ceId);

    if (pChannel->bSecure)
    {
        flags |= DRF_DEF(OS04, _FLAGS, _CC_SECURE, _TRUE);
    }

    portMemSet(&channelGPFIFOAllocParams, 0, sizeof(NV_CHANNEL_ALLOC_PARAMS));
    channelGPFIFOAllocParams.hObjectError  = hObjectError;
    channelGPFIFOAllocParams.hObjectBuffer = hObjectBuffer;
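    // The GPFIFO ring is carved out of the same allocation as the pushbuffer,
    // immediately after the pushbuffer methods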
    channelGPFIFOAllocParams.gpFifoOffset  = pChannel->pbGpuVA + pChannel->channelPbSize;
    channelGPFIFOAllocParams.gpFifoEntries = pChannel->channelNumGpFifioEntries;
    channelGPFIFOAllocParams.hContextShare = NV01_NULL_OBJECT;
    channelGPFIFOAllocParams.flags         = flags;
    channelGPFIFOAllocParams.hVASpace      = pChannel->hVASpaceId;

    //
    // Use GPU instance local Id if MIG is enabled
    // TODO: Maybe we need a VAS for each GPU instance ?
    //
    if (bMIGInUse && (pChannel->pKernelMIGGpuInstance != NULL))
    {
        KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
        MIG_INSTANCE_REF  ref;
        RM_ENGINE_TYPE    localCe;
        RsClient         *pClient;
        Device           *pDevice;

        NV_ASSERT_OK_OR_RETURN(
            serverGetClientUnderLock(&g_resServ, hClientId, &pClient));

        NV_ASSERT_OK_OR_RETURN(
            deviceGetByHandle(pClient, hDeviceId, &pDevice));

        NV_ASSERT_OK_OR_RETURN(
            kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref));
        // Clear the Compute instance portion, if present
        ref = kmigmgrMakeGIReference(ref.pKernelMIGGpuInstance);
        NV_ASSERT_OK_OR_RETURN(
            kmigmgrGetGlobalToLocalEngineType(pGpu, pKernelMIGManager, ref,
                                              engineType,
                                              &localCe));
        channelGPFIFOAllocParams.engineType = gpuGetNv2080EngineType(localCe);
    }
    else
    {
        channelGPFIFOAllocParams.engineType = gpuGetNv2080EngineType(engineType);
    }

    hClass = kfifoGetChannelClassId(pGpu, GPU_GET_KERNEL_FIFO(pGpu));
    if (!hClass)
    {
        NV_PRINTF(LEVEL_ERROR, "Unable to determine CE's channel class.\n");
        return NV_ERR_GENERIC;
    }

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    if (pChannel->bClientUserd)
    {
        NV_ASSERT_OK_OR_GOTO(
            rmStatus,
            _memUtilsAllocateUserD(pGpu,
                                   pMemoryManager,
                                   hClientId,
                                   hDeviceId,
                                   pChannel),
            cleanup);

        SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY)
        channelGPFIFOAllocParams.hUserdMemory[gpumgrGetSubDeviceInstanceFromGpu(pGpu)] = pChannel->hUserD;
        channelGPFIFOAllocParams.userdOffset[gpumgrGetSubDeviceInstanceFromGpu(pGpu)]  = 0;
        SLI_LOOP_END
    }

    NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(
        rmStatus,
        pRmApi->AllocWithHandle(pRmApi,
                                hClientId,
                                hDeviceId,
                                hChannelId,
                                hClass,
                                &channelGPFIFOAllocParams,
                                sizeof(channelGPFIFOAllocParams)));

cleanup:
    NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
        rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));

    return rmStatus;
}

/*!
 * Do a non-blocking memset
 *
 * @param[in]  pChannel   OBJCHANNEL pointer
 * @param[in]  base       Offset in FB
 * @param[in]  size       Size to scrub
 * @param[in]  freeToken  Payload returned through the semaphore
 * @param[out] pNumBlocks Returns the number of blocks that were scrubbed
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsMemSet_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    RmPhysAddr     base,
    NvU64          size,
    NvU32          freeToken,
    NvU32         *pNumBlocks
)
{
    NvU32 blocksPushed = 0;

    if ((size % pChannel->minBlockSize) != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Size should be a multiple of %d\n",
                  pChannel->minBlockSize);
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);
    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);

    if (pChannel->isProgressChecked)
    {
        // if progress is checked, insert the semaphore with freeToken as payload
        pChannel->finishPayload = freeToken;
        _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                                     0, 0, 0,             // src parameters
                                     base, ADDR_FBMEM, 0, // dst parameters
                                     size,
                                     NV_FALSE,            // blocking
                                     NV_TRUE,             // insertFinishPayload
                                     NV_FALSE);           // memcopy
    }
    else
    {
        // issue a standard async scrub
        blocksPushed = _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                           0, 0, 0,             // src parameters
                           base, ADDR_FBMEM, 0, // dst parameters
                           size,
                           NV_FALSE,            // blocking
                           NV_FALSE,            // insertFinishPayload
                           NV_FALSE);           // memcopy
    }
    *pNumBlocks = blocksPushed;
    return NV_OK;
}

/*!
 * Do a blocking memset
 *
 * @param[in] pChannel OBJCHANNEL pointer
 * @param[in] base     Offset in FB
 * @param[in] size     Size to scrub
 * @returns NV_STATUS
 */

NV_STATUS
memmgrMemUtilsMemSetBlocking_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    RmPhysAddr     base,
    NvU64          size
)
{
    NvU32 blocksPushed = 0;

    if ((size % pChannel->minBlockSize) != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Size should be a multiple of %d\n",
                  pChannel->minBlockSize);
        DBG_BREAKPOINT();
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);
    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);

    blocksPushed = _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                       0, 0, 0,             // src parameters
                       base, ADDR_FBMEM, 0, // dst parameters
                       size,
                       NV_TRUE,             // blocking
                       NV_FALSE,            // insertFinishPayload
                       NV_FALSE);           // memcopy

    if (blocksPushed > 0)
    {
        NvU8     *semAddr       = pChannel->pbCpuVA + pChannel->semaOffset;
        NV_STATUS timeoutStatus = NV_OK;
        RMTIMEOUT timeout;

        gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

        while (MEM_RD32(semAddr) != pChannel->lastPayloadPushed)
        {
            NV_PRINTF(LEVEL_INFO, "Semaphore Payload is 0x%x last is 0x%x\n",
                      MEM_RD32(semAddr), pChannel->lastPayloadPushed);

            if (timeoutStatus == NV_ERR_TIMEOUT)
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Timed Out waiting for CE semaphore\n");

                NV_PRINTF(LEVEL_ERROR,
                          "GET=0x%x, PUT=0x%x, GPGET=0x%x, GPPUT=0x%x\n",
                          pChannel->pControlGPFifo->Get,
                          pChannel->pControlGPFifo->Put,
                          pChannel->pControlGPFifo->GPGet,
                          pChannel->pControlGPFifo->GPPut);

                DBG_BREAKPOINT_REASON(NV_ERR_TIMEOUT);
                return NV_ERR_GENERIC;
            }

            timeoutStatus = gpuCheckTimeout(pGpu, &timeout);
        }
    }

    return NV_OK;
}

/*!
 * This function allocates the ECC scrubber
 *
 * @param[in] pChannel OBJCHANNEL pointer
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsAllocateEccScrubber_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_ASSERT_OK_OR_RETURN(channelAllocSubdevice(pGpu, pChannel));

    memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pChannel);

    memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pChannel);

    _memUtilsAllocateReductionSema(pGpu, pMemoryManager, pChannel);

    return NV_OK;
}

/*!
 * This function allocates the ECC allocation scrubber and
 * dupes the bitmap semaphore which is used for sync
 *
 * @param[in] pChannel OBJCHANNEL pointer
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsAllocateEccAllocScrubber_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    OBJSCRUB                   *pEccTD           = &pMemoryManager->eccScrubberState;
    OBJCHANNEL                 *pEccSyncChannel  = &pEccTD->allocationScrubberState;
    OBJCHANNEL                 *pEccAsyncChannel = &pEccTD->tdHeapState;
    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NV_STATUS                   lockStatus;
    RM_API                     *pRmApi           = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    NV_ASSERT_OK_OR_RETURN(channelAllocSubdevice(pGpu, pChannel));

    memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pEccSyncChannel);
    memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pEccSyncChannel);

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    // dup the reduction sema bitmap object
    NV_ASSERT_OK(
        pRmApi->DupObject(pRmApi,
                          pEccSyncChannel->hClient,
                          pEccSyncChannel->deviceId,
                          &pEccSyncChannel->bitMapSemPhysId,
                          pEccAsyncChannel->hClient,
                          pEccAsyncChannel->bitMapSemPhysId,
                          0));

    // allocate virtual memory for a bit map semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner  = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type   = NVOS32_TYPE_IMAGE;
    memAllocParams.size   = (((pEccSyncChannel->blockCount + 31) / 32) * 4);
    memAllocParams.attr   = NVOS32_ATTR_NONE;
    memAllocParams.attr2  = NVOS32_ATTR2_NONE;
    memAllocParams.flags  = 0;
    memAllocParams.flags |= NVOS32_ALLOC_FLAGS_VIRTUAL;

    NV_ASSERT_OK(
        pRmApi->AllocWithHandle(pRmApi,
                                pEccSyncChannel->hClient,
                                pEccSyncChannel->deviceId,
                                pEccSyncChannel->bitMapSemVirtId,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);

    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating reduction sema");
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OK(
        pRmApi->Map(pRmApi,
                    pEccSyncChannel->hClient,
                    pEccSyncChannel->deviceId,
                    pEccSyncChannel->bitMapSemVirtId,
                    pEccSyncChannel->bitMapSemPhysId, // hMemory
                    0,
                    (((pEccSyncChannel->blockCount + 31) / 32) * 4),
                    NV04_MAP_MEMORY_FLAGS_NONE,
                    &pEccSyncChannel->pbGpuBitMapVA));

    pEccSyncChannel->pbBitMapVA = pEccAsyncChannel->pbBitMapVA;

    return NV_OK;
}

/*!
 * Calculates the available space in the pushbuffer.
 * This is based on reading the semaphore that holds the
 * previous PUT pointer where methods were inserted.
 *
 * @param[in] pChannel OBJCHANNEL pointer
 * @returns size
 */
static NvU32
_getSpaceInPb(OBJCHANNEL *pChannel)
{
    NvU32 filledSpace;
    NvU32 avlblSpace;

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, 0);

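    // The semaphore at semaOffset holds the pushbuffer offset of the last
    // methods the CE has consumed; free space is the gap between PUT and it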
    if (pChannel->channelPutOffset >= MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset))
    {
        filledSpace = (pChannel->channelPutOffset - MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset));
        avlblSpace  = pChannel->channelPbSize - filledSpace;
    }
    else
    {
        avlblSpace = (MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset) - pChannel->channelPutOffset);
    }

    NV_PRINTF(LEVEL_INFO, "Space in PB is %d\n", avlblSpace);

    return avlblSpace;
}

/*!
 * This function manages the pushbuffer.
 * It will insert methods into the PB, manage wrap-around,
 * and decide when we need to add non-stall interrupts
 * and extra token semaphores.
 *
 * @param[in] pChannel            OBJCHANNEL pointer
 * @param[in] src                 Offset of src to copy from
 * @param[in] srcAddressSpace     source surface address space type
 * @param[in] srcCpuCacheAttrib   source surface address space attributes
 * @param[in] dst                 Offset of dst to scrub/copy to
 * @param[in] dstAddressSpace     destination surface address space type
 * @param[in] dstCpuCacheAttrib   destination surface address space attributes
 * @param[in] size                size to scrub/copy
 * @param[in] blocking            blocking will not insert non-stall
 * @param[in] insertFinishPayload will insert a token for the last block submitted
 * @param[in] bMemcopy            NV_TRUE for memory copy / NV_FALSE for scrubbing
 * @returns the number of blocks pushed
 */
static NvU32
_ceChannelScheduleWork_GM107
(
    OBJGPU          *pGpu,
    MemoryManager   *pMemoryManager,
    OBJCHANNEL      *pChannel,
    RmPhysAddr       src,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    RmPhysAddr       dst,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU64            size,
    NvBool           blocking,
    NvBool           insertFinishPayload,
    NvBool           bMemcopy
)
{
    RMTIMEOUT timeout;
    NvU32     spaceInPb;
    NvU32     numBytes;
    NvU32    *ptr;
    NvU32     gpBase;
    NvU32     semaCount       = 0;
    NvBool    addNonStallIntr = NV_FALSE;
    NvU32     blocksPushed    = 0;
    NvBool    addFinishPayload;
    NvU32     blockSize       = 0;

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, 0);
    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, 0);

    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

    spaceInPb = _getSpaceInPb(pChannel);

    NV_PRINTF(LEVEL_INFO, "Space in PB is %d and starting fill at 0x%x\n",
              spaceInPb, pChannel->channelPutOffset);

    ptr      = (NvU32 *)(pChannel->pbCpuVA + pChannel->channelPutOffset);
    gpBase   = pChannel->channelPutOffset;
    numBytes = 0;
    do
    {
        // while we have space greater than one block
        while (spaceInPb > (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD))
        {
            // if inserting one more block would exceed the PB size, wrap around to the beginning
            if ((pChannel->channelPutOffset + (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD)) > pChannel->channelPbSize)
            {
                NV_PRINTF(LEVEL_INFO, "Wrap numBytes %d\n", numBytes);
                // submit to gpfifo with numBytes and wrap around the PutOffset
                if (numBytes > 0)
                {
                    _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, (gpBase), numBytes);
                }
                pChannel->channelPutOffset = 0;
                ptr = (NvU32 *)(pChannel->pbCpuVA + pChannel->channelPutOffset);
                gpBase   = 0;
                numBytes = 0;
                // update the available space
                spaceInPb = _getSpaceInPb(pChannel);
                NV_PRINTF(LEVEL_INFO, "Wrapping PB around\n");
                continue;
            }

            blockSize = (size > pChannel->maxBlockSize) ?
                        pChannel->maxBlockSize : (NvU32) size;

            // add a non-stall interrupt every (8th of the size) or when we insert the last block
            if ((semaCount > (pChannel->channelPbSize >> 3)) || (size <= pChannel->maxBlockSize))
            {
                addNonStallIntr = NV_TRUE;
                semaCount       = 0;
            }
            else
            {
                addNonStallIntr = NV_FALSE;
            }
            // the finish payload corresponds to inserting a token for every call to scrub that finishes
            if ((insertFinishPayload) && (size <= pChannel->maxBlockSize))
            {
                addFinishPayload = NV_TRUE;
                NV_PRINTF(LEVEL_INFO, "Inserting Finish Payload!!!!!!!!!!\n");
            }
            else
            {
                addFinishPayload = NV_FALSE;
            }
            if (_checkSynchronization(pGpu, pMemoryManager, pChannel, BLOCK_INDEX_FROM_ADDR(dst, pChannel->blockShift)))
            {
                NvU32 bytesPushed = _ceChannelPushMethodsBlock_GM107(pGpu, pMemoryManager, pChannel,
                    src, srcAddressSpace, srcCpuCacheAttrib, // src parameters
                    dst, dstAddressSpace, dstCpuCacheAttrib, // dst parameters
                    blockSize, &ptr, NV_TRUE, (addNonStallIntr && !blocking),
                    addFinishPayload, bMemcopy);
                spaceInPb = spaceInPb - bytesPushed;
                numBytes  = numBytes + bytesPushed;
                semaCount = semaCount + bytesPushed;
                blocksPushed++;
                // we are done pushing all methods
            }

            dst += (NvU64) blockSize;
            if (bMemcopy)
                src += (NvU64) blockSize;
            size -= (NvU64) blockSize;

            if (size == 0)
            {
                _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, gpBase, numBytes);
                return blocksPushed;
            }
        }
        spaceInPb = _getSpaceInPb(pChannel);
        if (spaceInPb <= (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD))
        {
            // no space in the PB to push all blocks, so put what we have and wait for space
            if (numBytes > 0)
            {
                _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, gpBase, numBytes);
            }
            gpBase   = pChannel->channelPutOffset;
            numBytes = 0;
        }
        if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT)
        {
            NV_ASSERT_FAILED("Timed out waiting for Space in PB!");
            return NV_ERR_GENERIC;
        }
    } while (1);
}


/*!
 * This function checks if the block has already been submitted
 * or scrubbed, based on 2 bitmaps: a pending bitmap updated by
 * the CPU and a "finished" bitmap updated by the GPU.
 *
 * @param[in] pChannel OBJCHANNEL pointer
 * @param[in] block    block number
 *
 * @returns NV_TRUE if the block still needs to be submitted
 */
static NvBool
_checkSynchronization
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvU32          block
)
{
    NvU32 blockSema;

    if (!pChannel->isChannelSynchronized)
    {
        // synchronization is not required for this channel
        return NV_TRUE;
    }

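    // Read the 32-bit bitmap word containing this block's completion bit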
    blockSema = MEM_RD32((NvU8*)pChannel->pbBitMapVA + ((block / 32) * 4));

    if (((blockSema) & (1 << (block % 32))) == 0)
    {
        if (((pChannel->pBlockPendingState[block / 32] & (1 << (block % 32))) == 0) &&
            ((pChannel->pBlockDoneState[block / 32]    & (1 << (block % 32))) == 0))
        {
            pChannel->pBlockPendingState[block / 32] |= (1 << (block % 32));
            return NV_TRUE;
        }
    }

    return NV_FALSE;
}

/*!
 * Updates the GPFIFO with the methods in the PB for
 * the given channel
 * @param[in] pChannel OBJCHANNEL pointer
 * @param[in] gpOffset Offset in the PB
 * @param[in] gpSize   Size of segment
 * @returns None
 */
static void
_ceChannelUpdateGpFifo_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvU32          gpOffset,
    NvU32          gpSize

)
1691 {
1692 RMTIMEOUT timeout;
1693 NvU32 GPPut;
1694 NvU32 GPGet;
1695 NvU64 get;
1696 NvU32 length;
1697 NvU32 *pGpEntry;
1698 NvU32 GpEntry0;
1699 NvU32 GpEntry1;
1700 NvU32 GPPutNext;
1701 NvU32 workSubmitToken = 0;
1702 KernelChannel *pFifoKernelChannel;
1703 KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
1704
1705 NV_ASSERT_OR_RETURN_VOID(pChannel->pbCpuVA != NULL);
1706 NV_ASSERT_OR_RETURN_VOID(pChannel->pControlGPFifo != NULL);
1707
1708 gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
1709 GPPut = MEM_RD32(&pChannel->pControlGPFifo->GPPut);
1710 GPGet = MEM_RD32(&pChannel->pControlGPFifo->GPGet);
1711
1712 GPPutNext = (GPPut + 1) % pChannel->channelNumGpFifioEntries;
1713
1714 NV_PRINTF(LEVEL_INFO, "Put %d Get %d PutNext%d\n", GPPut, GPGet,
1715 GPPutNext);
1716
1717 NV_PRINTF(LEVEL_INFO, "gp Base 0x%x, Size %d\n", (NvU32)(gpOffset),
1718 gpSize);
1719
1720 // if the size passed is zero do not update gpput
1721 if (gpSize == 0)
1722 return;
1723
1724 if (GPPut >= pChannel->channelNumGpFifioEntries)
1725 {
1726 // if the Put pointer is invalid, the GPU is likely inaccessible
1727 NV_PRINTF(LEVEL_INFO, "invalid Put %u >= %u\n", GPPut,
1728 pChannel->channelNumGpFifioEntries);
1729 return;
1730 }
1731
1732 while (GPPutNext == GPGet)
1733 {
1734 // need to wait for space
1735 GPGet = MEM_RD32(&pChannel->pControlGPFifo->GPGet);
1736
1737 if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT)
1738 {
1739 NV_ASSERT_FAILED("Timed Out waiting for space in GPFIFIO!");
1740 return;
1741 }
1742 else if (GPGet >= pChannel->channelNumGpFifioEntries)
1743 {
1744 // if the Get pointer is invalid, the GPU is likely inaccessible
1745 NV_PRINTF(LEVEL_INFO, "invalid Get %u >= %u\n", GPGet,
1746 pChannel->channelNumGpFifioEntries);
1747 return;
1748 }
1749 }
1750
1751 get = pChannel->pbGpuVA + gpOffset;
1752 length = gpSize;
1753
1754 GpEntry0 =
1755 DRF_DEF(906F, _GP_ENTRY0, _NO_CONTEXT_SWITCH, _FALSE) |
1756 DRF_NUM(906F, _GP_ENTRY0, _GET, NvU64_LO32(get) >> 2);
1757 GpEntry1 =
1758 DRF_NUM(906F, _GP_ENTRY1, _GET_HI, NvU64_HI32(get)) |
1759 DRF_NUM(906F, _GP_ENTRY1, _LENGTH, length >> 2) |
1760 DRF_DEF(906F, _GP_ENTRY1, _PRIV, _USER) |
1761 DRF_DEF(906F, _GP_ENTRY1, _LEVEL, _MAIN);
1762
1763
1764 pGpEntry = (NvU32 *)(((NvU8*)pChannel->pbCpuVA) + pChannel->channelPbSize +
1765 GPPut*NV906F_GP_ENTRY__SIZE);
1766
1767 MEM_WR32(&pGpEntry[0], GpEntry0);
1768 MEM_WR32(&pGpEntry[1], GpEntry1);
1769
1770 // need to flush WRC buffer
1771 osFlushCpuWriteCombineBuffer();
1772
1773 // write gpput
1774 MEM_WR32(&pChannel->pControlGPFifo->GPPut, GPPutNext);
1775 osFlushCpuWriteCombineBuffer();
1776
1777 if (kfifoIsLiteModeEnabled_HAL(pGpu, pKernelFifo))
1778 {
1779 NV_ASSERT_OR_RETURN_VOID(0);
1780 }
1781 else
1782 {
1783 workSubmitToken = pChannel->workSubmitToken;
1784 NV_ASSERT_OR_RETURN_VOID(CliGetKernelChannelWithDevice(pChannel->pRsClient,
1785 pChannel->deviceId, pChannel->channelId,
1786 &pFifoKernelChannel) == NV_OK);
1787 }
1788 if (!kchannelIsRunlistSet(pGpu, pFifoKernelChannel))
1789 {
1790 NV_PRINTF(LEVEL_ERROR,
1791 "FAILED Channel 0x%x is not assigned to runlist yet\n",
1792 kchannelGetDebugTag(pFifoKernelChannel));
1793 return;
1794 }
1795 // update doorbell register
1796 kfifoUpdateUsermodeDoorbell_HAL(pGpu, pKernelFifo, workSubmitToken, kchannelGetRunlistId(pFifoKernelChannel));
1797 }
1798
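/*
 * Sketch of the GP entry encoding above (per the NV906F GP_ENTRY fields
 * used here): a GPFIFO entry is a 64-bit (GPU VA, length) pair packed
 * into two 32-bit words,
 *
 *     GpEntry0.GET    = PB GPU VA bits 31:2  (the VA is 4-byte aligned)
 *     GpEntry1.GET_HI = PB GPU VA bits 63:32
 *     GpEntry1.LENGTH = method segment size in 32-bit words (gpSize >> 2)
 *
 * followed by a write-combine flush so the entry is visible before GPPut
 * advances, and a doorbell write so host fetches the new segment.
 */
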
/*!
 * Pushes the source and destination physical-mode (aperture) methods for
 * one copy/scrub block
 *
 * @param[in]     pChannel          OBJCHANNEL pointer
 * @param[in]     srcAddressSpace   source surface address space type
 * @param[in]     srcCpuCacheAttrib source surface CPU cache attributes
 * @param[in]     dstAddressSpace   destination surface address space type
 * @param[in]     dstCpuCacheAttrib destination surface CPU cache attributes
 * @param[in,out] pPtr              double pointer to the PB write offset
 * @returns None
 */
static void
_ceChannelPushMethodAperture_GM107
(
    OBJCHANNEL      *pChannel,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU32          **pPtr
)
{
    NvU32 *ptr  = *pPtr;
    NvU32  data = 0;

    // Set source parameters
    data = ((srcAddressSpace == ADDR_FBMEM) ? DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _LOCAL_FB) :
            (srcCpuCacheAttrib == NV_MEMORY_CACHED) ? DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _COHERENT_SYSMEM) :
            DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _NONCOHERENT_SYSMEM));

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_SRC_PHYS_MODE, data);

    // Set destination parameters
    data = ((dstAddressSpace == ADDR_FBMEM) ? DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _LOCAL_FB) :
            (dstCpuCacheAttrib == NV_MEMORY_CACHED) ? DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _COHERENT_SYSMEM) :
            DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _NONCOHERENT_SYSMEM));

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_DST_PHYS_MODE, data);

    *pPtr = ptr;
}

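/*
 * The aperture selection above reduces to a small decision table, applied
 * independently to SET_SRC_PHYS_MODE and SET_DST_PHYS_MODE:
 *
 *     ADDR_FBMEM                        -> _TARGET_LOCAL_FB
 *     sysmem, NV_MEMORY_CACHED          -> _TARGET_COHERENT_SYSMEM
 *     sysmem, any other cache attribute -> _TARGET_NONCOHERENT_SYSMEM
 */
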
/*!
 * Inserts methods into the push buffer for one block
 *
 * @param[in]     pGpu              OBJGPU pointer
 * @param[in]     pMemoryManager    MemoryManager pointer
 * @param[in]     channel           OBJCHANNEL pointer
 * @param[in]     src               Offset of src to copy from
 * @param[in]     srcAddressSpace   source surface address space type
 * @param[in]     srcCpuCacheAttrib source surface CPU cache attributes
 * @param[in]     dst               Offset of dst to scrub/copy to
 * @param[in]     dstAddressSpace   destination surface address space type
 * @param[in]     dstCpuCacheAttrib destination surface CPU cache attributes
 * @param[in]     size              size of the region to scrub/copy
 * @param[in,out] pPtr              Double pointer to PB offset
 * @param[in]     addPayloadSema    Bool to add the default payload semaphore release
 * @param[in]     addNonStallIntr   Bool to add a non-stall interrupt at the end
 * @param[in]     addFinishPayload  Bool to add an extra sema release for the token
 * @param[in]     bMemcopy          NV_TRUE for memcopy / NV_FALSE for scrubbing
 * @returns the number of bytes of methods pushed
 */
static NvU32
_ceChannelPushMethodsBlock_GM107
(
    OBJGPU          *pGpu,
    MemoryManager   *pMemoryManager,
    OBJCHANNEL      *channel,
    RmPhysAddr       src,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    RmPhysAddr       dst,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU64            size,
    NvU32          **pPtr,
    NvBool           addPayloadSema,
    NvBool           addNonStallIntr,
    NvBool           addFinishPayload,
    NvBool           bMemcopy
)
{
    NvU32  launchParams       = 0;
    NvU32 *ptr                = *pPtr;
    NvU32 *pStartPtr          = ptr;
    NvBool addReductionOp     = channel->isChannelSynchronized;
    NvBool bMemoryScrubEnable = NV_FALSE;
    NvU32  remapConstB        = 0;
    NvU32  remapComponentSize = 0;

    NV_PRINTF(LEVEL_INFO, "Base = 0x%llx, Size = 0x%llx, PB location = %p\n",
              dst, size, ptr);

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NV906F_SET_OBJECT, channel->classEngineID);

    if (size > 0)
    {
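        //
        // The semaphore payload below is the PB offset this block will have
        // consumed once it completes: the current PUT offset plus this
        // block's methods and any extra semaphore/interrupt methods appended
        // at the end.
        //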
        NvU32 payLoad = channel->channelPutOffset + channel->methodSizePerBlock;

        if (addNonStallIntr)  payLoad = payLoad + NONSTALL_METHOD_SIZE;
        if (addReductionOp)   payLoad = payLoad + SEMAPHORE_ONLY_METHOD_SIZE;
        if (addFinishPayload) payLoad = payLoad + SEMAPHORE_ONLY_METHOD_SIZE;

        if (addPayloadSema)
        {
            memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
                NVA06F_SUBCHANNEL_COPY_ENGINE,
                channel->pbGpuVA+channel->semaOffset, payLoad, &ptr);

            NV_PRINTF(LEVEL_INFO, "Pushing Semaphore Payload 0x%x\n", payLoad);
            channel->lastPayloadPushed = payLoad;
        }

        if (IS_SIMULATION(pGpu))
        {
            //
            // fmodel CE is slow (compared to emulation) so we don't bother
            // scrubbing the whole block. Fmodel already scrubs memory via ramif
            // so we'll never get exceptions
            //
            size = NV_MIN(size, 0x20);
        }

        memmgrChannelPushAddressMethodsBlock_HAL(pMemoryManager, NV_FALSE,
            NVA06F_SUBCHANNEL_COPY_ENGINE, dst, &ptr);

        if (bMemcopy)
        {
            memmgrChannelPushAddressMethodsBlock_HAL(pMemoryManager, NV_TRUE,
                NVA06F_SUBCHANNEL_COPY_ENGINE, src, &ptr);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size));
        }
        else
        {
            bMemoryScrubEnable = memmgrMemUtilsCheckMemoryFastScrubEnable_HAL(pGpu,
                                                    pMemoryManager,
                                                    channel->hTdCopyClass,
                                                    channel->bUseVasForCeCopy,
                                                    dst,
                                                    NvU64_LO32(size),
                                                    dstAddressSpace);
            if (bMemoryScrubEnable)
            {
                NV_PRINTF(LEVEL_INFO, "Using fast memory scrubber\n");
                remapConstB = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _DST_X, _CONST_B);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_CONST_B, 0x00000000);

                remapComponentSize = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _COMPONENT_SIZE, _ONE);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size));
            }
            else
            {
                remapComponentSize = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _COMPONENT_SIZE, _FOUR);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size >> 2));
            }

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_COMPONENTS,
                      DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _DST_X, _CONST_A)          |
                      DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _NUM_SRC_COMPONENTS, _ONE) |
                      DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _NUM_DST_COMPONENTS, _ONE) |
                      remapComponentSize                                              |
                      remapConstB);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_CONST_A, 0x00000000);

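            //
            // Scrub requests carry no real source surface: srcAddressSpace
            // arrives as 0, so report FB for the (ignored) source aperture
            // to keep the methods below consistent.
            //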
            NV_ASSERT(srcAddressSpace == 0);
            NV_ASSERT(dstAddressSpace == ADDR_FBMEM);

            srcAddressSpace = ADDR_FBMEM;
        }

        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_COUNT, 1);

        _ceChannelPushMethodAperture_GM107(channel, srcAddressSpace, srcCpuCacheAttrib, dstAddressSpace, dstCpuCacheAttrib, &ptr);

        launchParams = DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE)     |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_TYPE, _PHYSICAL)       |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_TYPE, _PHYSICAL)       |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _PIPELINED);

        if (addPayloadSema)
        {
            launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                            DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE);
        }
        else
        {
            launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _NONE);
        }

        if (bMemoryScrubEnable)
        {
            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVC8B5_SET_MEMORY_SCRUB_PARAMETERS,
                      DRF_DEF(C8B5, _SET_MEMORY_SCRUB_PARAMETERS, _DISCARDABLE, _FALSE));

            launchParams |= DRF_DEF(C8B5, _LAUNCH_DMA, _MEMORY_SCRUB_ENABLE, _TRUE);
            launchParams |= DRF_DEF(C8B5, _LAUNCH_DMA, _REMAP_ENABLE, _FALSE);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVC8B5_LAUNCH_DMA, launchParams);
        }
        else
        {
            if (!bMemcopy)
            {
                launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE);
            }

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
        }
    }

    if (addReductionOp)
    {
        NvU32 currentBlock = BLOCK_INDEX_FROM_ADDR((dst), channel->blockShift);
        NvU32 blockOffset;
        NvU32 bitFlip;

        blockOffset = (currentBlock / 32) * 4;
        bitFlip     = ((NvU32)1 << (currentBlock % 32));
        memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
            NVA06F_SUBCHANNEL_COPY_ENGINE,
            channel->pbGpuBitMapVA+(blockOffset), bitFlip, &ptr);

        launchParams = DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE)                       |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE)                         |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE)                         |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH)                   |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH)                   |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION_ENABLE, _TRUE)           |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION_SIGN, _UNSIGNED)         |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION, _IOR)                   |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _NONE);
        // push only the second semaphore release
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
    }

    if (addFinishPayload)
    {
        memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
            NVA06F_SUBCHANNEL_COPY_ENGINE,
            channel->pbGpuVA+channel->finishPayloadOffset,
            channel->finishPayload, &ptr);

        launchParams = DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE)                       |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE)                         |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE)                         |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH)                   |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH)                   |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _NONE);
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
        NV_PRINTF(LEVEL_INFO, "Pushing finish semaphore payload 0x%x\n",
                  channel->finishPayload);
    }

    if (addNonStallIntr)
    {
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NV906F_NON_STALL_INTERRUPT, 0);
    }

    channel->channelPutOffset = (NvU32)((NvU8 *)ptr - (NvU8 *)channel->pbCpuVA);
    *pPtr = ptr;

    // return the length of methods inserted
    return (NvU32)((NvU8*)ptr - (NvU8*)pStartPtr);
}

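/*
 * Note on the two scrub paths above (informational): without fast scrub,
 * the CE writes a constant instead of reading a source. REMAP_ENABLE with
 * DST_X = CONST_A (programmed to 0 via SET_REMAP_CONST_A) and
 * COMPONENT_SIZE = FOUR makes every destination element a 4-byte zero,
 * which is why LINE_LENGTH_IN is programmed in 4-byte units (size >> 2).
 * The HOPPER_DMA_COPY_A fast-scrub path (MEMORY_SCRUB_ENABLE) takes the
 * byte count directly and disables remap.
 */
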
/*!
 * Returns the preferred (highest-numbered) Copy Engine class supported by
 * the first available CE
 *
 * @param[in]  pGpu           OBJGPU pointer
 * @param[in]  pMemoryManager MemoryManager pointer
 * @param[out] pClass         pointer to the returned class
 */
NV_STATUS
memmgrMemUtilsGetCopyEngineClass_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvU32         *pClass
)
{
    NV_STATUS status;
    NvU32     numClasses;
    NvU32    *pClassList = NULL;
    NvU32     i;
    NvU32     class = 0;
    NvU32     eng;

    //
    // Pascal+ chips will have any combination of the 6 CEs
    // available. Loop over all the CEs to get the CE class
    // for the first available CE instead of using ENG_CE(0)
    //
    for (eng = 0; eng < ENG_CE__SIZE_1; eng++)
    {
        NV_ASSERT_OK_OR_ELSE(
            status,
            gpuGetClassList(pGpu, &numClasses, NULL, ENG_CE(eng)),
            return status);

        if (numClasses > 0)
        {
            break;
        }
    }

    pClassList = portMemAllocNonPaged(sizeof(*pClassList) * numClasses);
    NV_ASSERT_OR_RETURN((pClassList != NULL), NV_ERR_NO_MEMORY);

    if (NV_OK == gpuGetClassList(pGpu, &numClasses, pClassList, ENG_CE(eng)))
    {
        for (i = 0; i < numClasses; i++)
        {
            class = NV_MAX(class, pClassList[i]);
        }
    }

    NV_ASSERT(class != 0);
    portMemFree(pClassList);
    *pClass = class;

    return NV_OK;
}

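/*
 * Example call site (sketch, assuming a valid pGpu/pMemoryManager pair):
 *
 *     NvU32 ceClass = 0;
 *     NV_ASSERT_OK_OR_RETURN(
 *         memmgrMemUtilsGetCopyEngineClass_GM107(pGpu, pMemoryManager,
 *                                                &ceClass));
 *     // ceClass now holds the highest DMA copy class reported for the
 *     // first CE with a non-empty class list, e.g. MAXWELL_DMA_COPY_A
 *     // (0xB0B5) or HOPPER_DMA_COPY_A (0xC8B5).
 */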