/*
 * SPDX-FileCopyrightText: Copyright (c) 2012-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "core/core.h"
#include "gpu/gpu.h"
#include "gpu/device/device.h"
#include "os/os.h"
#include "gpu/bus/kern_bus.h"
#include "gpu/mem_mgr/mem_mgr.h"
#include "gpu/mem_mgr/heap.h"
#include "gpu/mem_mgr/mem_scrub.h"
#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "gpu/mem_mgr/mem_desc.h"
#include "gpu/ce/kernel_ce.h"
#include "gpu/ce/kernel_ce_private.h"
#include "mem_mgr/gpu_vaspace.h"
#include "core/locks.h"
#include "nvRmReg.h"
#include "rmapi/rs_utils.h"
#include "mem_mgr/ctx_buf_pool.h"
#include "vgpu/rpc.h"
#include "kernel/gpu/fifo/kernel_channel.h"
#include "platform/chipset/chipset.h"
#include "platform/sli/sli.h"

#include "class/clc0b5sw.h"
#include "class/cla06fsubch.h" // NVA06F_SUBCHANNEL_COPY_ENGINE
#include "class/cl003e.h"      // NV01_MEMORY_SYSTEM
#include "class/cl0040.h"      // NV01_MEMORY_LOCAL_USER
#include "class/cl50a0.h"      // NV50_MEMORY_VIRTUAL
#include "class/cl00c2.h"      // NV01_MEMORY_LOCAL_PHYSICAL
#include "class/clb0b5.h"      // MAXWELL_DMA_COPY_A
#include "class/clc8b5.h"      // HOPPER_DMA_COPY_A
#include "class/cl90f1.h"      // FERMI_VASPACE_A

#define NONSTALL_METHOD_SIZE            8
#define SEMAPHORE_ONLY_METHOD_SIZE      32
#define MAX_EXTRA_PAYLOAD               (NONSTALL_METHOD_SIZE + SEMAPHORE_ONLY_METHOD_SIZE)
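//
// Illustrative note (an assumption drawn from the parameter names used in
// _ceChannelPushMethodsBlock_GM107 below, not a statement of the exact
// accounting): these sizes are in bytes of pushbuffer space, so a block that
// requests both a non-stall interrupt and a semaphore release can need up to
// MAX_EXTRA_PAYLOAD = 8 + 32 = 40 bytes of methods beyond the copy itself.
//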


static NV_STATUS _memUtilsChannelAllocatePB_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsAllocateChannel(OBJGPU *pGpu, MemoryManager *pMemoryManager, NvHandle hClientId,
                                    NvHandle hDeviceId, NvHandle hChannelId, NvHandle hObjectError,
                                    NvHandle hObjectBuffer, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsAllocCe_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                             NvHandle hClientId, NvHandle hDeviceId, NvHandle hChannelId, NvHandle hCopyObjectId);
static NV_STATUS _memUtilsAllocateUserD(OBJGPU *pGpu, MemoryManager *pMemoryManager, NvHandle hClientId,
                                        NvHandle hDeviceId, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsMapUserd_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager,
                           OBJCHANNEL *pChannel, NvHandle hClientId, NvHandle hDeviceId,
                           NvHandle hChannelId, NvBool bUseRmApiForBar1);
static NV_STATUS _memUtilsAllocateReductionSema(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel);
static NvU32 _ceChannelScheduleWork_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                          RmPhysAddr src, NV_ADDRESS_SPACE srcAddressSpace, NvU32 srcCpuCacheAttrib,
                                          RmPhysAddr dst, NV_ADDRESS_SPACE dstAddressSpace, NvU32 dstCpuCacheAttrib,
                                          NvU64 size, NvBool blocking, NvBool insertFinishPayload, NvBool bMemcopy);
static void  _ceChannelUpdateGpFifo_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                          NvU32 gpOffset, NvU32 gpSize);
static NvU32 _ceChannelPushMethodsBlock_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                              RmPhysAddr src, NV_ADDRESS_SPACE srcAddressSpace, NvU32 srcCpuCacheAttrib,
                                              RmPhysAddr dst, NV_ADDRESS_SPACE dstAddressSpace, NvU32 dstCpuCacheAttrib,
                                              NvU64 size, NvU32 **pPtr, NvBool addPayloadSema,
                                              NvBool addNonStallIntr, NvBool addFinishPayload, NvBool bMemcopy);
static NvU32 _getSpaceInPb(OBJCHANNEL *pChannel);
static NvBool _checkSynchronization(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel, NvU32 block);

static NV_STATUS
_memUtilsAllocateReductionSema
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{

    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NV_STATUS                   rmStatus;
    NvU32                       i;
    NV_STATUS                   lockStatus;
    RM_API                     *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
    // allocate physical memory for a bit map semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));

    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = (((pChannel->blockCount + 31)/32)*4);
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    //    memAllocParams.attr     |= NVOS32_ATTR_COHERENCY_WRITE_COMBINE;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
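    //
    // Size math: the bitmap stores one bit per scrub block, rounded up to
    // whole 32-bit words. E.g. blockCount = 100 -> (100 + 31) / 32 = 4
    // words -> 16 bytes.
    //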

    //
    // When the APM feature is enabled, all RM internal sysmem allocations must
    // be in unprotected memory.
    // When Hopper CC is enabled, all RM internal sysmem allocations that
    // need to be accessed from the GPU should be in unprotected memory,
    // but sysmem allocations that do not need GPU access should be in
    // protected memory.
    //

    NV_ASSERT_OK_OR_RETURN(
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                pChannel->deviceId,
                                pChannel->bitMapSemPhysId,
                                NV01_MEMORY_SYSTEM,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate virtual memory for a bit map semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = (((pChannel->blockCount + 31)/32)*4);
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;

    NV_ASSERT_OK_OR_RETURN(
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                pChannel->deviceId,
                                pChannel->bitMapSemVirtId,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);
    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating reduction sema");
        return NV_ERR_GENERIC;
    }

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        pRmApi->Map(pRmApi,
                    pChannel->hClient,
                    pChannel->deviceId,
                    pChannel->bitMapSemVirtId,
                    pChannel->bitMapSemPhysId, //hMemory,
                    0,
                    (((pChannel->blockCount + 31)/32)*4),
                    NV04_MAP_MEMORY_FLAGS_NONE,
                    &pChannel->pbGpuBitMapVA),
        exit_sema_creation);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        pRmApi->MapToCpu(pRmApi,
                         pChannel->hClient,
                         pChannel->deviceId,
                         pChannel->bitMapSemPhysId,
                         0,
                         (((pChannel->blockCount + 31)/32)*4),
                         (void **)&pChannel->pbBitMapVA,
                         0),
        exit_sema_creation);

    for (i = 0; i < (((pChannel->blockCount + 31) / 32) * 4); i += 4)
    {
        MEM_WR32((NvU8*)pChannel->pbBitMapVA + i, 0);
    }

    return NV_OK;
exit_sema_creation:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    NV_PRINTF(LEVEL_INFO, "end  NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}

static NV_STATUS
_memUtilsChannelAllocatePB_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel

    //  OBJMEMUTILS *to be added here
)
{
    NV_STATUS                   rmStatus = NV_OK;
    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NvHandle                    hDevice;
    NvHandle                    hPhysMem;
    NvU64                       size;
    NvHandle                    hVirtMem;
    NvU32                       hClass;
    NvU32                       attr;
    NvU32                       flags        = 0;
    NvU32                       attrNotifier = NVOS32_ATTR_NONE;
    RM_API                     *pRmApi       = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    // Apply registry overrides to channel pushbuffer.
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC_4, _CHANNEL_PUSHBUFFER, pGpu->instLocOverrides4))
    {
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_VID:
            hClass = NV01_MEMORY_LOCAL_USER;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _VIDMEM)     |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);

            flags = NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
            if (!IS_MIG_IN_USE(pGpu))
            {
                attr |= DRF_DEF(OS32, _ATTR, _ALLOCATE_FROM_RESERVED_HEAP, _YES);
            }
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_COH:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)        |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED);
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_NCOH:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)        |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_DEFAULT:
        default:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)        |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);

            //
            // The work submit token is read from notifier memory to support
            // VM migration for the memory scrubber; it is re-read every time
            // the scrubber submits work. Making the notifier memory cached by
            // default helps the performance of those reads.
            //
            attrNotifier = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)  |
                           DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED);
            break;
    }

    hDevice           =  pChannel->deviceId;
    hPhysMem          =  pChannel->physMemId;
    hVirtMem          =  pChannel->pushBufferId;
    size              =  pChannel->channelSize;

    LOCK_ASSERT_AND_RETURN(!rmGpuLockIsOwner());
    // allocate the physical memory
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = size;
    memAllocParams.attr      = attr;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = flags;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, all RM internal sysmem allocations that
    // need to be accessed from the GPU should be in unprotected memory,
    // but all vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }
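    //
    // For reference, the resulting cases (a summary of the rule above, not
    // new policy): APM + vidmem or sysmem -> unprotected; Hopper CC +
    // sysmem -> unprotected (the GPU must access it); Hopper CC + vidmem ->
    // protected (attr2 left at its default).
    //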

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                hPhysMem,
                                hClass,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate the virtual memory
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = size;
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;
    memAllocParams.hVASpace = pChannel->hVASpaceId;

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                hVirtMem,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate the physmem for the notifier

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        //
        // Force the error notifier to non-coherent sysmem when CC is enabled,
        // since the key rotation notifier is part of the error notifier and
        // needs to be in sysmem so we can create a persistent mapping for it.
        // We cannot create mappings on the fly because this notifier is
        // written as part of a 1-second callback, where creating mappings is
        // not allowed.
        //
        hClass = NV01_MEMORY_SYSTEM;
        attrNotifier   = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)        |
                         DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
    }

    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = pChannel->channelNotifierSize;
    memAllocParams.attr      = attrNotifier;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, all RM internal sysmem allocations that
    // need to be accessed from the GPU should be in unprotected memory,
    // but all vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                pChannel->errNotifierIdPhys,
                                hClass,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate virtual memory for the notifier
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = pChannel->channelNotifierSize;
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;
    memAllocParams.hVASpace = pChannel->hVASpaceId;

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                pChannel->errNotifierIdVirt,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    return rmStatus;
}

NV_STATUS
memmgrMemUtilsChannelInitialize_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS         rmStatus;
    NV_STATUS         lockStatus;
    RsClient         *pRsClient           = pChannel->pRsClient;
    NvHandle          hClient             = pChannel->hClient;
    NvHandle          hDevice             = pChannel->deviceId;
    NvHandle          hPhysMem            = pChannel->physMemId;
    NvU64             size                = pChannel->channelSize;
    NvHandle          hChannel            = pChannel->channelId;
    NvHandle          hErrNotifierVirt    = pChannel->errNotifierIdVirt;
    NvHandle          hErrNotifierPhys    = pChannel->errNotifierIdPhys;
    NvHandle          hPushBuffer         = pChannel->pushBufferId;
    RM_API           *pRmApi              = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NvBool            bMIGInUse           = IS_MIG_IN_USE(pGpu);
    NvU8             *pErrNotifierCpuVA   = NULL;
    NV_ADDRESS_SPACE  userdAddrSpace;
    NV_ADDRESS_SPACE  pushBuffAddrSpace;
    NV_ADDRESS_SPACE  gpFifoAddrSpace;
    OBJSYS           *pSys                = SYS_GET_INSTANCE();
    OBJCL            *pCl                 = SYS_GET_CL(pSys);
    NvU32             cacheSnoopFlag      = 0;
    NvBool            bUseRmApiForBar1    = NV_FALSE;

    //
    // Heap alloc one chunk of memory to hold all of our alloc parameters to
    // reduce stack usage
    //
    union
    {
        NV_VASPACE_ALLOCATION_PARAMETERS va;
        NV_MEMORY_ALLOCATION_PARAMS      mem;
    } *pParams = NULL;

    if (pCl->getProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT))
    {
        cacheSnoopFlag = DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE);
    }

    pParams = portMemAllocNonPaged(sizeof(*pParams));
    if (pParams == NULL)
    {
        rmStatus = NV_ERR_NO_MEMORY;
        goto exit_free_client;
    }

    //
    // Client-allocated USERD is only supported on Volta+.
    // TODO: Use a property to check if client-allocated USERD is supported.
    //
    pChannel->bClientUserd = IsVOLTAorBetter(pGpu);

    //
    // We need to allocate a VAS to use for CE copies, but also for
    // GSP-RM + MIG, so that it doesn't get the device
    // default VAS during channel bind (which is not properly handled
    // by split VAS in MIG currently). We only need the identity mapping
    // when actually using the VAS for copies.
    //
    if (pChannel->bUseVasForCeCopy ||
        (IS_GSP_CLIENT(pGpu) && bMIGInUse))
    {
        NvBool bAcquireLock = NV_FALSE;
        NvU64 startFbOffset = GPU_GET_HEAP(pGpu)->base;
        NvU64 fbSize        = GPU_GET_HEAP(pGpu)->total;
        NvU64 vaStartOffset = startFbOffset;

        NV_PRINTF(LEVEL_INFO, "Channel VAS heap base: %llx total: %llx \n", GPU_GET_HEAP(pGpu)->base,
                  GPU_GET_HEAP(pGpu)->total);

        pChannel->startFbOffset = startFbOffset;
        pChannel->fbSize = fbSize;

        if (pChannel->bUseVasForCeCopy)
        {
            NV_ASSERT_OK_OR_GOTO(rmStatus,
                clientGenResourceHandle(pRsClient, &pChannel->hFbAlias),
                exit_free_client);

            rmStatus = memmgrMemUtilsCreateMemoryAlias_HAL(pGpu, pMemoryManager, pChannel);
            if (rmStatus != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "Setting identity mapping failed, status: %x\n", rmStatus);
                goto exit_free_client;
            }
        }

        {
            NV_VASPACE_ALLOCATION_PARAMETERS *pVa = &pParams->va;

            portMemSet(pVa, 0, sizeof(*pVa));
            pVa->index  = NV_VASPACE_ALLOCATION_INDEX_GPU_NEW;
            pVa->vaBase = pChannel->startFbOffset;
            //
            // How large should we go here? We definitely need more than heapSize
            // to allocate other metadata related to the channel. We also need to
            // account for the discontiguous VA range for split VAS, where we
            // allocate 4GB to (4GB + 512MB) for the Server VAS (mirrored).
            // A rough VASpace layout will be documented here:
            //
            if (gpuIsSplitVasManagementServerClientRmEnabled(pGpu))
            {
                pVa->vaSize += (SPLIT_VAS_SERVER_RM_MANAGED_VA_START +
                                SPLIT_VAS_SERVER_RM_MANAGED_VA_SIZE);
            }
            pVa->vaSize += fbSize + pChannel->channelSize + SCRUBBER_VASPACE_BUFFER_SIZE;
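            //
            // Worked example (illustrative numbers only, not a guarantee of
            // the actual layout): on a 16GB board with split VAS enabled,
            // vaSize becomes (4GB + 512MB) for the mirrored Server RM range,
            // plus 16GB for the FB identity range, plus the channel size,
            // plus SCRUBBER_VASPACE_BUFFER_SIZE.
            //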

            //
            // We definitely need ALLOW_ZERO_ADDRESS, but SKIP_SCRUB_MEMPOOL is a patch
            // until we figure out the right place for Scrubber page tables
            //
            pVa->flags |= NV_VASPACE_ALLOCATION_FLAGS_ALLOW_ZERO_ADDRESS |
                          NV_VASPACE_ALLOCATION_FLAGS_SKIP_SCRUB_MEMPOOL |
                          NV_VASPACE_ALLOCATION_FLAGS_OPTIMIZE_PTETABLE_MEMPOOL_USAGE;

            if (!IS_MIG_IN_USE(pGpu))
            {
                pVa->flags |= NV_VASPACE_ALLOCATION_FLAGS_PTETABLE_HEAP_MANAGED;
            }

            if (rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
            {
                rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
                bAcquireLock = NV_TRUE;
                pRmApi = rmapiGetInterface(RMAPI_API_LOCK_INTERNAL);
            }

            rmStatus = pRmApi->AllocWithHandle(pRmApi, hClient, pChannel->deviceId,
                                               pChannel->hVASpaceId, FERMI_VASPACE_A,
                                               pVa, sizeof(*pVa));
        }
        if (bAcquireLock)
        {
            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus, rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));
            bAcquireLock = NV_FALSE;
            pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        }

        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "failed allocating scrubber vaspace, status=0x%x\n",
                    rmStatus);
            goto exit_free_client;
        }

        rmStatus = vaspaceGetByHandleOrDeviceDefault(pRsClient,
                                                     pChannel->deviceId,
                                                     pChannel->hVASpaceId,
                                                     &pChannel->pVAS);
        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR,
                    "failed getting the scrubber vaspace from handle, status=0x%x\n",
                    rmStatus);
            goto exit_free_client;
        }

        if (pChannel->bUseVasForCeCopy)
        {
            if (!gpuIsWarBug200577889SriovHeavyEnabled(pGpu))
            {
                rmStatus = vaspacePinRootPageDir(pChannel->pVAS, pGpu);
                if (rmStatus != NV_OK)
                {
                    NV_PRINTF(LEVEL_ERROR, "failed pinning down Scrubber VAS, status=0x%x\n",
                            rmStatus);
                    goto exit_free_client;
                }
            }

            NV_ASSERT_OK_OR_GOTO(rmStatus,
                 clientGenResourceHandle(pRsClient, &pChannel->hFbAliasVA), exit_free_client);
        }

        if (gpuIsSplitVasManagementServerClientRmEnabled(pGpu))
        {
            OBJGVASPACE *pGVAS = dynamicCast(pChannel->pVAS, OBJGVASPACE);
            vaStartOffset += pGVAS->vaLimitServerRMOwned + 1;
            pChannel->vaStartOffset = vaStartOffset;
        }

        if (rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
        {
            rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
            bAcquireLock = NV_TRUE;
            pRmApi = rmapiGetInterface(RMAPI_API_LOCK_INTERNAL);
        }

        // Allocate virtual memory for Identity Mapping
        if (pChannel->bUseVasForCeCopy)
        {
            NV_MEMORY_ALLOCATION_PARAMS *pMem = &pParams->mem;
            portMemSet(pMem, 0, sizeof(*pMem));
            pMem->owner     = NVOS32_TYPE_OWNER_RM;
            pMem->type      = NVOS32_TYPE_IMAGE;
            pMem->size      = pChannel->fbSize;
            pMem->attr      = (DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI) |
                               DRF_DEF(OS32, _ATTR, _PAGE_SIZE, _BIG));
            pMem->attr2     = NVOS32_ATTR2_NONE;
            pMem->offset    = vaStartOffset;
            pMem->flags     = 0;
            pMem->flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL |
                              NVOS32_ALLOC_FLAGS_FIXED_ADDRESS_ALLOCATE |
                              NVOS32_ALLOC_FLAGS_LAZY;
            pMem->hVASpace = pChannel->hVASpaceId;

            rmStatus = pRmApi->AllocWithHandle(pRmApi,
                                               hClient,
                                               pChannel->deviceId,
                                               pChannel->hFbAliasVA,
                                               NV50_MEMORY_VIRTUAL,
                                               pMem,
                                               sizeof(*pMem));
        }

        if (bAcquireLock)
        {
            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus, rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));
            bAcquireLock = NV_FALSE;
            pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        }

        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Allocating VASpace for (base, size): (%llx, %llx) failed,"
                                   " with status: %x\n", vaStartOffset, pChannel->fbSize, rmStatus);
            goto exit_free_client;
        }

        // set up mapping of VA -> PA
        if (pChannel->bUseVasForCeCopy)
        {
            NV_CHECK_OK_OR_GOTO(
                rmStatus,
                LEVEL_ERROR,
                pRmApi->Map(pRmApi,
                            hClient,
                            pChannel->deviceId,
                            pChannel->hFbAliasVA,
                            pChannel->hFbAlias,
                            0,
                            pChannel->fbSize,
                            DRF_DEF(OS46, _FLAGS, _ACCESS,           _READ_WRITE) |
                            DRF_DEF(OS46, _FLAGS, _PAGE_SIZE,        _BIG)        |
                            DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP,      _ENABLE),
                            &pChannel->fbAliasVA),
                exit_free_client);

            NV_PRINTF(LEVEL_INFO, "Scrubber VAS :%x identity mapped with start addr: %llx, size: %llx\n",
                      pChannel->hFbAliasVA, pChannel->fbAliasVA, pChannel->fbSize);
        }
    }

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    //
    // Fetch the physical location of the push buffer
    //
    // Bug 3434881 filed to track the following
    // a. Implementation of the utility function to parse the
    //    push buffer and userd regkeys
    // b. Replace all instances of regkey pushbuffer/userd regkey
    //    parsing with the utility function
    //
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC_4, _CHANNEL_PUSHBUFFER, pGpu->instLocOverrides4))
    {
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_VID:
            pushBuffAddrSpace = ADDR_FBMEM;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_COH:
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_NCOH:
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_DEFAULT:
        default:
            pushBuffAddrSpace = ADDR_SYSMEM;
            break;
    }

    gpFifoAddrSpace = pushBuffAddrSpace;

    // Fetch the physical location of USERD
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC, _USERD, pGpu->instLocOverrides))
    {
        case NV_REG_STR_RM_INST_LOC_USERD_NCOH:
        case NV_REG_STR_RM_INST_LOC_USERD_COH:
            userdAddrSpace = ADDR_SYSMEM;
            break;

        case NV_REG_STR_RM_INST_LOC_USERD_VID:
        case NV_REG_STR_RM_INST_LOC_USERD_DEFAULT:
        default:
            userdAddrSpace = ADDR_FBMEM;
            break;
    }

    // RM WAR for Bug 3313719
    // Disallow USERD in sysmem and (GPFIFO or pushbuffer) in vidmem
    rmStatus = kfifoCheckChannelAllocAddrSpaces_HAL(GPU_GET_KERNEL_FIFO(pGpu),
                                                    userdAddrSpace,
                                                    pushBuffAddrSpace,
                                                    gpFifoAddrSpace);
    if (rmStatus != NV_OK)
    {
        NV_ASSERT_FAILED("USERD in sysmem and PushBuffer/GPFIFO in vidmem not allowed");
        goto exit_free_client;
    }

    _memUtilsChannelAllocatePB_GM107(pGpu, pMemoryManager, pChannel);
    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);
    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating push buffer");
        goto exit_free_client;
    }

    // map the pushbuffer
    rmStatus = pRmApi->Map(pRmApi, hClient, hDevice,
                           hPushBuffer,
                           hPhysMem, //hMemory,
                           0,
                           size,
                           cacheSnoopFlag,
                           &pChannel->pbGpuVA);
    // map the error notifier
    rmStatus = pRmApi->Map(pRmApi, hClient, hDevice,
                           hErrNotifierVirt,
                           hErrNotifierPhys, //hMemory,
                           0,
                           pChannel->channelNotifierSize,
                           DRF_DEF(OS46, _FLAGS, _KERNEL_MAPPING, _ENABLE) | cacheSnoopFlag,
                           &pChannel->pbGpuNotifierVA);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsAllocateChannel(pGpu,
                                 pMemoryManager,
                                 hClient,
                                 hDevice,
                                 hChannel,
                                 hErrNotifierVirt,
                                 hPushBuffer,
                                 pChannel),
        exit_free_client);

    // _memUtilsMapUserd
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsMapUserd_GM107(pGpu, pMemoryManager, pChannel,
                                hClient, hDevice, hChannel, bUseRmApiForBar1),
        exit_free_client);

    // Set up pushbuffer and semaphore memdesc and memset the buffer
    pChannel->pChannelBufferMemdesc =
        memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, pChannel->hClient, hPhysMem);
    NV_ASSERT_OR_GOTO(pChannel->pChannelBufferMemdesc != NULL, exit_free_client);

    // Set up notifier memory
    pChannel->pErrNotifierMemdesc =
        memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, pChannel->hClient, hErrNotifierPhys);
    NV_ASSERT_OR_GOTO(pChannel->pErrNotifierMemdesc != NULL, exit_free_client);

    if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
    {
        rmStatus = memmgrMemDescMemSet(pMemoryManager, pChannel->pChannelBufferMemdesc, 0,
                                       (TRANSFER_FLAGS_SHADOW_ALLOC | TRANSFER_FLAGS_SHADOW_INIT_MEM));
        NV_ASSERT_OR_GOTO(rmStatus == NV_OK, exit_free_client);

        pChannel->pbCpuVA = NULL;
        pChannel->pTokenFromNotifier = NULL;
    }
    else
    {
        if (bUseRmApiForBar1)
        {
            NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
                pRmApi->MapToCpu(pRmApi, hClient, hDevice, hPhysMem, 0, size,
                                 (void **)&pChannel->pbCpuVA, 0),
                exit_free_client);

            NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
                pRmApi->MapToCpu(pRmApi, hClient, hDevice, hErrNotifierPhys, 0,
                    pChannel->channelNotifierSize, (void **)&pErrNotifierCpuVA, 0),
                exit_free_client);
        }
        else
        {
            //
            // Most use cases can migrate to the internal memdescMap path for BAR1,
            // which is preferred because the external path will not work with CC
            //
            pChannel->pbCpuVA = memmgrMemDescBeginTransfer(pMemoryManager,
                                    pChannel->pChannelBufferMemdesc, TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_GOTO(pChannel->pbCpuVA != NULL, exit_free_client);

            pErrNotifierCpuVA = memmgrMemDescBeginTransfer(pMemoryManager,
                                    pChannel->pErrNotifierMemdesc, TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_GOTO(pErrNotifierCpuVA != NULL, exit_free_client);
        }

        portMemSet(pChannel->pbCpuVA, 0, (NvLength)size);

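        //
        // The notifier block is laid out as an array of NvNotification
        // entries, so the work submit token lives at index
        // NV_CHANNELGPFIFO_NOTIFICATION_TYPE_WORK_SUBMIT_TOKEN; the pointer
        // arithmetic below simply indexes that entry.
        //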
        pChannel->pTokenFromNotifier =
            (NvNotification *)(pErrNotifierCpuVA +
                               (NV_CHANNELGPFIFO_NOTIFICATION_TYPE_WORK_SUBMIT_TOKEN *
                                sizeof(NvNotification)));
    }

    //
    // Allocate and map the doorbell region for use in scrub-on-free.
    // Set the doorbell register to false, since pre-Volta chips don't
    // support it.
    //
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        memmgrScrubMapDoorbellRegion_HAL(pGpu, pMemoryManager, pChannel),
        exit_free_client);

    portMemFree(pParams);
    return NV_OK;

exit_free_client:
    if (!pChannel->bClientAllocated)
    {
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    }
    portMemFree(pParams);
    NV_PRINTF(LEVEL_INFO, "end  NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}


/** memmgrMemUtilsCreateMemoryAlias_GM107
 *
 *  @brief Creates an alias for the FB region.
 *         This function doesn't allocate any memory; it just creates a memory
 *         handle that refers to the FB range. This call is supported on both
 *         bare metal and vGPU.
 *  @param[in] pChannel             CHANNEL Pointer
 *
 *  @returns NV_OK on success
 */
NV_STATUS
memmgrMemUtilsCreateMemoryAlias_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    RM_API  *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NV_STATUS status = NV_OK;

    NV_PHYSICAL_MEMORY_ALLOCATION_PARAMS physMemParams = {0};

    memmgrGetPteKindForScrubber_HAL(pMemoryManager, &physMemParams.format);

    status = pRmApi->AllocWithHandle(pRmApi,
                                    pChannel->hClient,
                                    pChannel->deviceId,
                                    pChannel->hFbAlias,
                                    NV01_MEMORY_LOCAL_PHYSICAL,
                                    &physMemParams,
                                    sizeof(physMemParams));
    if (status != NV_OK)
    {
        NV_CHECK_OK_FAILED(LEVEL_WARNING, "Aliasing FbListMem", status);
        return status;
    }

    NV_PRINTF(LEVEL_INFO, "Allocating FbAlias: %x for size: %llx, kind: %x\n", pChannel->hFbAlias,
              pChannel->fbSize, physMemParams.format);

    return NV_OK;
}

NV_STATUS
memmgrMemUtilsCopyEngineInitialize_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS rmStatus = NV_OK;
    RM_API   *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    // allocate the CE object
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsAllocCe_GM107(pGpu,
                               pMemoryManager,
                               pChannel,
                               pChannel->hClient,
                               pChannel->deviceId,
                               pChannel->channelId,
                               pChannel->engineObjectId),
        exit_free);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        memmgrMemUtilsChannelSchedulingSetup(pGpu, pMemoryManager, pChannel), exit_free);

    return NV_OK;

 exit_free:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    NV_PRINTF(LEVEL_INFO, "end  NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}

static NV_STATUS _memUtilsAllocCe_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvHandle       hCopyObjectId
)
{
    NVC0B5_ALLOCATION_PARAMETERS  createParams = {0};
    RM_API                       *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    createParams.version = NVC0B5_ALLOCATION_PARAMETERS_VERSION_1;
    createParams.engineType = NV2080_ENGINE_TYPE_COPY(pChannel->ceId);
    memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &pChannel->hTdCopyClass);
    pChannel->engineType = gpuGetRmEngineType(createParams.engineType);

    if (!pChannel->hTdCopyClass)
    {
        NV_PRINTF(LEVEL_ERROR, "Unable to determine CE's engine class.\n");
        return NV_ERR_GENERIC;
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                hClientId,
                                hChannelId,
                                hCopyObjectId,
                                pChannel->hTdCopyClass,
                                &createParams,
                                sizeof(createParams)));

    return NV_OK;
}

static NV_STATUS
_memUtilsMapUserd_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvBool         bUseRmApiForBar1
)
{
    //
    // The memTransfer API only works for client-allocated USERD
    // because otherwise we are calling MapToCpu using the channel
    // handle instead.
    //
    if (pChannel->bClientUserd && !bUseRmApiForBar1)
    {
        pChannel->pUserdMemdesc =
            memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, hClientId, pChannel->hUserD);
        NV_ASSERT_OR_RETURN(pChannel->pUserdMemdesc != NULL, NV_ERR_GENERIC);

        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            //
            // GPFIFO access will not be set up in order to facilitate memTransfer APIs,
            // which will use GSP-DMA/CE with shadow buffers
            //
            pChannel->pControlGPFifo = NULL;
        }
        else
        {
            pChannel->pControlGPFifo =
                (void *)memmgrMemDescBeginTransfer(pMemoryManager, pChannel->pUserdMemdesc,
                                                   TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);
        }
    }
    else
    {
        NvU32   userdSize = 0;
        RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        kfifoGetUserdSizeAlign_HAL(GPU_GET_KERNEL_FIFO(pGpu), &userdSize, NULL);

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            pRmApi->MapToCpu(pRmApi, hClientId, hDeviceId,
                             pChannel->bClientUserd ? pChannel->hUserD : hChannelId, 0,
                             userdSize, (void **)&pChannel->pControlGPFifo, 0));
    }
    return NV_OK;
}

static NV_STATUS
_memUtilsAllocateUserD
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS                    rmStatus = NV_OK;
    KernelFifo                  *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    RM_API                      *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NV_MEMORY_ALLOCATION_PARAMS  memAllocParams;
    NvU32                        userdMemClass = NV01_MEMORY_LOCAL_USER;

    // Ensure that call is not made with lock held
    LOCK_ASSERT_AND_RETURN(!rmGpuLockIsOwner());

    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));

    memAllocParams.owner = HEAP_OWNER_RM_CLIENT_GENERIC;
    kfifoGetUserdSizeAlign_HAL(pKernelFifo, (NvU32 *)&memAllocParams.size, NULL);
    memAllocParams.type  = NVOS32_TYPE_IMAGE;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    // Apply registry overrides to USERD.
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC, _USERD, pGpu->instLocOverrides))
    {
        case NV_REG_STR_RM_INST_LOC_USERD_NCOH:
        case NV_REG_STR_RM_INST_LOC_USERD_COH:
            userdMemClass = NV01_MEMORY_SYSTEM;
            memAllocParams.attr = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
            break;

        case NV_REG_STR_RM_INST_LOC_USERD_VID:
        case NV_REG_STR_RM_INST_LOC_USERD_DEFAULT:
            userdMemClass = NV01_MEMORY_LOCAL_USER;
            memAllocParams.attr = DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM);
            memAllocParams.flags |= NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
            if (!IS_MIG_IN_USE(pGpu))
            {
                memAllocParams.attr |= DRF_DEF(OS32, _ATTR, _ALLOCATE_FROM_RESERVED_HEAP, _YES);
            }
            break;
    }

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, all RM internal sysmem allocations that
    // need to be accessed from the GPU should be in unprotected memory,
    // but all vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_ASSERT_OK_OR_RETURN(pRmApi->AllocWithHandle(pRmApi, hClientId, hDeviceId,
                                                   pChannel->hUserD,
                                                   userdMemClass,
                                                   &memAllocParams,
                                                   sizeof(memAllocParams)));

    return rmStatus;
}

static NV_STATUS
_memUtilsAllocateChannel
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvHandle       hObjectError,
    NvHandle       hObjectBuffer,
    OBJCHANNEL    *pChannel
)
{
    NV_CHANNEL_ALLOC_PARAMS channelGPFIFOAllocParams;
    NV_STATUS               rmStatus = NV_OK;
    NvU32                   hClass;
    RM_API                 *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NvBool                  bMIGInUse = IS_MIG_IN_USE(pGpu);
    NvU32                   flags = DRF_DEF(OS04, _FLAGS, _CHANNEL_SKIP_SCRUBBER, _TRUE);
    RM_ENGINE_TYPE          engineType = (pChannel->type == SWL_SCRUBBER_CHANNEL) ?
                                RM_ENGINE_TYPE_SEC2 : RM_ENGINE_TYPE_COPY(pChannel->ceId);

    if (pChannel->bSecure)
    {
        flags |= DRF_DEF(OS04, _FLAGS, _CC_SECURE, _TRUE);
    }

    portMemSet(&channelGPFIFOAllocParams, 0, sizeof(NV_CHANNEL_ALLOC_PARAMS));
    channelGPFIFOAllocParams.hObjectError  = hObjectError;
    channelGPFIFOAllocParams.hObjectBuffer = hObjectBuffer;
    channelGPFIFOAllocParams.gpFifoOffset  = pChannel->pbGpuVA + pChannel->channelPbSize;
    channelGPFIFOAllocParams.gpFifoEntries = pChannel->channelNumGpFifioEntries;
    channelGPFIFOAllocParams.hContextShare = NV01_NULL_OBJECT;
    channelGPFIFOAllocParams.flags         = flags;
    channelGPFIFOAllocParams.hVASpace      = pChannel->hVASpaceId;

    //
    // Use the GPU instance local engine ID if MIG is enabled
    // TODO: Maybe we need a VAS for each GPU instance?
    //
    if (bMIGInUse && (pChannel->pKernelMIGGpuInstance != NULL))
    {
        KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
        MIG_INSTANCE_REF ref;
        RM_ENGINE_TYPE localCe;
        RsClient *pClient;
        Device *pDevice;

        NV_ASSERT_OK_OR_RETURN(
            serverGetClientUnderLock(&g_resServ, hClientId, &pClient));

        NV_ASSERT_OK_OR_RETURN(
            deviceGetByHandle(pClient, hDeviceId, &pDevice));

        NV_ASSERT_OK_OR_RETURN(
            kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref));
        // Clear the Compute instance portion, if present
        ref = kmigmgrMakeGIReference(ref.pKernelMIGGpuInstance);
        NV_ASSERT_OK_OR_RETURN(
            kmigmgrGetGlobalToLocalEngineType(pGpu, pKernelMIGManager, ref,
                                              engineType,
                                              &localCe));
        channelGPFIFOAllocParams.engineType = gpuGetNv2080EngineType(localCe);
    }
    else
    {
        channelGPFIFOAllocParams.engineType = gpuGetNv2080EngineType(engineType);
    }

    hClass = kfifoGetChannelClassId(pGpu, GPU_GET_KERNEL_FIFO(pGpu));
    if (!hClass)
    {
        NV_PRINTF(LEVEL_ERROR, "Unable to determine CE's channel class.\n");
        return NV_ERR_GENERIC;
    }

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    if (pChannel->bClientUserd)
    {
        NV_ASSERT_OK_OR_GOTO(
            rmStatus,
            _memUtilsAllocateUserD(pGpu,
                                   pMemoryManager,
                                   hClientId,
                                   hDeviceId,
                                   pChannel),
            cleanup);

        SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY)
        channelGPFIFOAllocParams.hUserdMemory[gpumgrGetSubDeviceInstanceFromGpu(pGpu)] = pChannel->hUserD;
        channelGPFIFOAllocParams.userdOffset[gpumgrGetSubDeviceInstanceFromGpu(pGpu)] = 0;
        SLI_LOOP_END
    }

    NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(
        rmStatus,
        pRmApi->AllocWithHandle(pRmApi,
                                hClientId,
                                hDeviceId,
                                hChannelId,
                                hClass,
                                &channelGPFIFOAllocParams,
                                sizeof(channelGPFIFOAllocParams)));

cleanup:
    NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
                                        rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));

    return rmStatus;
}

/*!
 * Do a Non-Blocking Memset
 *
 * @param[in]     pChannel    OBJCHANNEL pointer
 * @param[in]     base        Offset in FB
 * @param[in]     size        Size to scrub
 * @param[in]     freeToken   Will be returned as the semaphore payload
 * @param[out]    pNumBlocks  Returns the number of blocks that were scrubbed
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsMemSet_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    RmPhysAddr     base,
    NvU64          size,
    NvU32          freeToken,
    NvU32         *pNumBlocks
)
{
    NvU32 blocksPushed = 0;

    if ((size % pChannel->minBlockSize) != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Size should be a multiple of %d\n",
                  pChannel->minBlockSize);
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);
    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);

    if (pChannel->isProgressChecked)
    {
        // if progress is checked, insert the semaphore with freeToken as payload
        pChannel->finishPayload = freeToken;
        _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                                     0, 0, 0,             // src parameters
                                     base, ADDR_FBMEM, 0, // dst parameters
                                     size,
                                     NV_FALSE,            // blocking
                                     NV_TRUE,             // insertFinishPayload
                                     NV_FALSE);           // memcopy
    }
    else
    {
        // issue a standard async scrub
        blocksPushed = _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                           0, 0, 0,             // src parameters
                           base, ADDR_FBMEM, 0, // dst parameters
                           size,
                           NV_FALSE,            // blocking
                           NV_FALSE,            // insertFinishPayload
                           NV_FALSE);           // memcopy
    }
    *pNumBlocks = blocksPushed;
    return NV_OK;
}
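
//
// Illustrative usage (hypothetical values, not a call site in this file):
// scrub `size` bytes at FB offset `base`, where `size` must be a multiple
// of pChannel->minBlockSize:
//
//     NvU32 blocksPushed;
//     NV_STATUS status = memmgrMemUtilsMemSet_GM107(pGpu, pMemoryManager,
//                                                   pChannel, base, size,
//                                                   freeToken, &blocksPushed);
//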
1258 
1259 /*!
1260  * Do a Blocking Memset
1261  *
1262  * @param[in]     pChannel   OBJCHANNEL pointer
1263  * @param[in]     base       Offset in FB
1264  * @param[in]     size       size to scrub
1265  * @returns NV_STATUS
1266  */
1267 
1268 NV_STATUS
memmgrMemUtilsMemSetBlocking_GM107(OBJGPU * pGpu,MemoryManager * pMemoryManager,OBJCHANNEL * pChannel,RmPhysAddr base,NvU64 size)1269 memmgrMemUtilsMemSetBlocking_GM107
1270 (
1271     OBJGPU        *pGpu,
1272     MemoryManager *pMemoryManager,
1273     OBJCHANNEL    *pChannel,
1274     RmPhysAddr     base,
1275     NvU64          size
1276 )
1277 {
    NvU32 blocksPushed = 0;

    if ((size % pChannel->minBlockSize) != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Size should be a multiple of %d\n",
                  pChannel->minBlockSize);
        DBG_BREAKPOINT();
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);
    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);

    blocksPushed = _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                       0, 0, 0,              // src parameters
                       base, ADDR_FBMEM, 0,  // dst parameters
                       size,
                       NV_TRUE,              // blocking
                       NV_FALSE,             // insertFinishPayload
                       NV_FALSE);            // memcopy

    if (blocksPushed > 0)
    {
        NvU8     *semAddr       = pChannel->pbCpuVA + pChannel->semaOffset;
        NV_STATUS timeoutStatus = NV_OK;
        RMTIMEOUT timeout;

        gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

        while (MEM_RD32(semAddr) != pChannel->lastPayloadPushed)
        {
            NV_PRINTF(LEVEL_INFO, "Semaphore Payload is 0x%x last is 0x%x\n",
                      MEM_RD32(semAddr), pChannel->lastPayloadPushed);

            if (timeoutStatus == NV_ERR_TIMEOUT)
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Timed out waiting for CE semaphore\n");

                NV_PRINTF(LEVEL_ERROR,
                          "GET=0x%x, PUT=0x%x, GPGET=0x%x, GPPUT=0x%x\n",
                          pChannel->pControlGPFifo->Get,
                          pChannel->pControlGPFifo->Put,
                          pChannel->pControlGPFifo->GPGet,
                          pChannel->pControlGPFifo->GPPut);

                DBG_BREAKPOINT_REASON(NV_ERR_TIMEOUT);
                return NV_ERR_GENERIC;
            }

            timeoutStatus = gpuCheckTimeout(pGpu, &timeout);
        }
    }

    return NV_OK;
}
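
/*
 * Example (illustrative sketch only, not part of this file): the blocking
 * variant pushes the same scrub methods but then spins on the channel
 * semaphore until the last pushed payload is observed, so the caller can
 * treat the range as scrubbed on return.
 *
 *     // Hypothetical caller; "scrubBase" is an FB offset aligned to
 *     // pChannel->minBlockSize.
 *     NV_STATUS status = memmgrMemUtilsMemSetBlocking_GM107(pGpu,
 *         pMemoryManager, pChannel, scrubBase, 2 * 1024 * 1024);
 *     // On NV_OK the CE has signaled completion for every pushed block.
 */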

/*!
 * This function allocates the ECC scrubber
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsAllocateEccScrubber_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_ASSERT_OK_OR_RETURN(channelAllocSubdevice(pGpu, pChannel));

    memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pChannel);

    memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pChannel);

    _memUtilsAllocateReductionSema(pGpu, pMemoryManager, pChannel);

    return NV_OK;
}

/*!
 * This function allocates the ECC allocation scrubber and dups the
 * bitmap semaphore that is used for synchronization
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsAllocateEccAllocScrubber_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    OBJSCRUB                   *pEccTD           = &pMemoryManager->eccScrubberState;
    OBJCHANNEL                 *pEccSyncChannel  = &pEccTD->allocationScrubberState;
    OBJCHANNEL                 *pEccAsyncChannel = &pEccTD->tdHeapState;
    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NV_STATUS                   lockStatus;
    RM_API                     *pRmApi           = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    NV_ASSERT_OK_OR_RETURN(channelAllocSubdevice(pGpu, pChannel));

    memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pEccSyncChannel);
    memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pEccSyncChannel);

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    // dup the reduction sema bitmap object
    NV_ASSERT_OK(
        pRmApi->DupObject(pRmApi,
                          pEccSyncChannel->hClient,
                          pEccSyncChannel->deviceId,
                          &pEccSyncChannel->bitMapSemPhysId,
                          pEccAsyncChannel->hClient,
                          pEccAsyncChannel->bitMapSemPhysId,
                          0));

    // allocate virtual memory for the bitmap semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = (((pEccSyncChannel->blockCount + 31) / 32) * 4);
    memAllocParams.attr      = NVOS32_ATTR_NONE;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;

    NV_ASSERT_OK(
        pRmApi->AllocWithHandle(pRmApi,
                                pEccSyncChannel->hClient,
                                pEccSyncChannel->deviceId,
                                pEccSyncChannel->bitMapSemVirtId,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);

    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating reduction sema");
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OK(
        pRmApi->Map(pRmApi,
                    pEccSyncChannel->hClient,
                    pEccSyncChannel->deviceId,
                    pEccSyncChannel->bitMapSemVirtId,
                    pEccSyncChannel->bitMapSemPhysId, // hMemory
                    0,
                    (((pEccSyncChannel->blockCount + 31) / 32) * 4),
                    NV04_MAP_MEMORY_FLAGS_NONE,
                    &pEccSyncChannel->pbGpuBitMapVA));

    pEccSyncChannel->pbBitMapVA = pEccAsyncChannel->pbBitMapVA;

    return NV_OK;
}
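
/*
 * Example (illustrative note, not part of this file): the bitmap semaphore
 * carries one bit per scrub block, rounded up to whole 32-bit words, so its
 * size in bytes is ((blockCount + 31) / 32) * 4. For instance:
 *
 *     // blockCount = 100 blocks
 *     //   words = (100 + 31) / 32 = 4    // 4 x 32 = 128 bits >= 100
 *     //   bytes = 4 * 4          = 16
 */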

/*!
 * Calculates the available space in the pushbuffer.
 * This is based on reading the semaphore that holds the
 * previous PUT pointer where methods were inserted.
 *
 * @param[in]     pChannel  OBJCHANNEL pointer
 * @returns size
 */
static NvU32
_getSpaceInPb(OBJCHANNEL *pChannel)
{
    NvU32 filledSpace;
    NvU32 avlblSpace;

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, 0);

    if (pChannel->channelPutOffset >= MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset))
    {
        filledSpace = (pChannel->channelPutOffset - MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset));
        avlblSpace  = pChannel->channelPbSize - filledSpace;
    }
    else
    {
        avlblSpace = (MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset) - pChannel->channelPutOffset);
    }

    NV_PRINTF(LEVEL_INFO, "Space in PB is %d\n", avlblSpace);

    return avlblSpace;
}
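
/*
 * Example (illustrative note, not part of this file): the pushbuffer is a
 * ring, so free space depends on where PUT sits relative to the
 * semaphore-reported GET:
 *
 *     // channelPbSize = 0x1000
 *     // PUT = 0x0C00, GET (from semaphore) = 0x0400
 *     //   filled = 0x0C00 - 0x0400 = 0x0800
 *     //   free   = 0x1000 - 0x0800 = 0x0800
 *     // PUT = 0x0200, GET = 0x0A00 (PUT has wrapped)
 *     //   free   = 0x0A00 - 0x0200 = 0x0800
 */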

/*!
 * This function manages the pushbuffer.
 * It inserts methods into the PB, manages wrap-around,
 * and decides when to add non-stall interrupts
 * and extra token semaphores.
 *
 * @param[in]     pChannel          OBJCHANNEL pointer
 * @param[in]     src               Offset of src to copy from
 * @param[in]     srcAddressSpace   source surface address space type
 * @param[in]     srcCpuCacheAttrib source surface address space attributes
 * @param[in]     dst               Offset of dst to scrub/copy to
 * @param[in]     dstAddressSpace   destination surface address space type
 * @param[in]     dstCpuCacheAttrib destination surface address space attributes
 * @param[in]     size              size to scrub/copy
 * @param[in]     blocking          blocking will not insert non-stall
 * @param[in]     insertFinishPayload will insert a token for the last block submitted
 * @param[in]     bMemcopy          NV_TRUE for memory copy / NV_FALSE for scrubbing
 * @returns the number of blocks pushed
 */
static NvU32
_ceChannelScheduleWork_GM107
(
    OBJGPU          *pGpu,
    MemoryManager   *pMemoryManager,
    OBJCHANNEL      *pChannel,
    RmPhysAddr       src,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    RmPhysAddr       dst,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU64            size,
    NvBool           blocking,
    NvBool           insertFinishPayload,
    NvBool           bMemcopy
)
{
    RMTIMEOUT        timeout;
    NvU32            spaceInPb;
    NvU32            numBytes;
    NvU32            *ptr;
    NvU32            gpBase;
    NvU32            semaCount = 0;
    NvBool           addNonStallIntr = NV_FALSE;
    NvU32            blocksPushed = 0;
    NvBool           addFinishPayload;
    NvU32            blockSize = 0;

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, 0);
    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, 0);

    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

    spaceInPb = _getSpaceInPb(pChannel);

    NV_PRINTF(LEVEL_INFO, "Space in PB is %d and starting fill at 0x%x\n",
              spaceInPb, pChannel->channelPutOffset);

    ptr = (NvU32 *)(pChannel->pbCpuVA + pChannel->channelPutOffset);
    gpBase = pChannel->channelPutOffset;
    numBytes = 0;
    do
    {
        // while we have space greater than one block
        while (spaceInPb > (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD))
        {
            // if inserting one more block would exceed the PB size, wrap around to the beginning
            if ((pChannel->channelPutOffset + (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD)) > pChannel->channelPbSize)
            {
                NV_PRINTF(LEVEL_INFO, "Wrap numBytes %d\n", numBytes);
                // submit to gpfifo with numBytes and wrap around the PutOffset
                if (numBytes > 0)
                {
                    _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, (gpBase), numBytes);
                }
                pChannel->channelPutOffset = 0;
                ptr = (NvU32 *)(pChannel->pbCpuVA + pChannel->channelPutOffset);
                gpBase = 0;
                numBytes = 0;
                // update the available space
                spaceInPb = _getSpaceInPb(pChannel);
                NV_PRINTF(LEVEL_INFO, "Wrapping PB around\n");
                continue;
            }

            blockSize  = (size > pChannel->maxBlockSize) ?
                         pChannel->maxBlockSize : (NvU32) size;

            // add a non-stall interrupt every (8th of the PB size) or when we insert the last block
            if ((semaCount > (pChannel->channelPbSize >> 3)) || (size <= pChannel->maxBlockSize))
            {
                addNonStallIntr = NV_TRUE;
                semaCount = 0;
            }
            else
            {
                addNonStallIntr = NV_FALSE;
            }
            // the finish payload corresponds to inserting a token for every call to scrub that finishes
            if ((insertFinishPayload) && (size <= pChannel->maxBlockSize))
            {
                addFinishPayload = NV_TRUE;
                NV_PRINTF(LEVEL_INFO, "Inserting Finish Payload!\n");
            }
            else
            {
                addFinishPayload = NV_FALSE;
            }
            if (_checkSynchronization(pGpu, pMemoryManager, pChannel, BLOCK_INDEX_FROM_ADDR(dst, pChannel->blockShift)))
            {
                NvU32 bytesPushed = _ceChannelPushMethodsBlock_GM107(pGpu, pMemoryManager, pChannel,
                    src, srcAddressSpace, srcCpuCacheAttrib, // src parameters
                    dst, dstAddressSpace, dstCpuCacheAttrib, // dst parameters
                    blockSize, &ptr, NV_TRUE, (addNonStallIntr && !blocking),
                    addFinishPayload, bMemcopy);
                spaceInPb = spaceInPb - bytesPushed;
                numBytes  = numBytes + bytesPushed;
                semaCount = semaCount + bytesPushed;
                blocksPushed++;
                // we are done pushing all methods
            }

            dst += (NvU64) blockSize;
            if (bMemcopy)
                src += (NvU64) blockSize;
            size -= (NvU64) blockSize;

            if (size == 0)
            {
                _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, gpBase, numBytes);
                return blocksPushed;
            }
        }
        spaceInPb = _getSpaceInPb(pChannel);
        if (spaceInPb <= (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD))
        {
            // no space in PB to push all blocks, so submit what we have and wait for space
            if (numBytes > 0)
            {
                _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, gpBase, numBytes);
            }
            gpBase = pChannel->channelPutOffset;
            numBytes = 0;
        }
        if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT)
        {
            NV_ASSERT_FAILED("Timed out waiting for space in PB!");
            // this function returns a block count, so report no further progress on timeout
            return blocksPushed;
        }
    } while (1);
}
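
/*
 * Example (illustrative note, not part of this file): work is chunked into
 * maxBlockSize pieces, so a single schedule call expands to several method
 * blocks:
 *
 *     // maxBlockSize = 0x100000 (1 MB), size = 0x280000 (2.5 MB)
 *     //   block 0: 1 MB, block 1: 1 MB, block 2: 0.5 MB -> blocksPushed = 3
 *     // A non-stall interrupt rides on the last block (size <= maxBlockSize),
 *     // and, when requested, the finish payload token does too.
 */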


/*!
 * This function checks if the block has already been submitted
 * or scrubbed, based on two bitmaps: a pending bitmap
 * updated by the CPU and a "finished" bitmap updated by
 * the GPU.
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @param[in]     block      block number
 *
 * @returns Bool
 */
static NvBool
_checkSynchronization
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvU32          block
)
{
    NvU32 blockSema;

    if (!pChannel->isChannelSynchronized)
    {
        // synchronization is not required for this channel
        return NV_TRUE;
    }

    blockSema = MEM_RD32((NvU8*)pChannel->pbBitMapVA + ((block / 32) * 4));

    if ((blockSema & (1 << (block % 32))) == 0)
    {
        if (((pChannel->pBlockPendingState[block / 32] & (1 << (block % 32))) == 0) &&
            ((pChannel->pBlockDoneState[block / 32] & (1 << (block % 32))) == 0))
        {
            pChannel->pBlockPendingState[block / 32] |= (1 << (block % 32));
            return NV_TRUE;
        }
    }

    return NV_FALSE;
}
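
/*
 * Example (illustrative note, not part of this file): each block maps to one
 * bit, addressed as a word offset plus a bit position within the word:
 *
 *     // block 70
 *     //   word offset = (70 / 32) * 4 = 8   // byte offset of the NvU32 word
 *     //   bit mask    = 1 << (70 % 32) = 1 << 6
 *     // The block is pushed only if its bit is clear in the GPU bitmap,
 *     // the pending bitmap, and the done bitmap.
 */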

/*!
 * Updates the GPFIFO with the methods in the PB for
 * the given channel
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @param[in]     gpOffset   Offset in the PB
 * @param[in]     gpSize     Size of segment
 * @returns None
 */
static void
_ceChannelUpdateGpFifo_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvU32          gpOffset,
    NvU32          gpSize
)
{
    RMTIMEOUT        timeout;
    NvU32            GPPut;
    NvU32            GPGet;
    NvU64            get;
    NvU32            length;
    NvU32            *pGpEntry;
    NvU32            GpEntry0;
    NvU32            GpEntry1;
    NvU32            GPPutNext;
    NvU32            workSubmitToken = 0;
    KernelChannel   *pFifoKernelChannel;
    KernelFifo      *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);

    NV_ASSERT_OR_RETURN_VOID(pChannel->pbCpuVA != NULL);
    NV_ASSERT_OR_RETURN_VOID(pChannel->pControlGPFifo != NULL);

    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
    GPPut = MEM_RD32(&pChannel->pControlGPFifo->GPPut);
    GPGet = MEM_RD32(&pChannel->pControlGPFifo->GPGet);

    GPPutNext = (GPPut + 1) % pChannel->channelNumGpFifioEntries;

    NV_PRINTF(LEVEL_INFO, "Put %d Get %d PutNext %d\n", GPPut, GPGet,
              GPPutNext);

    NV_PRINTF(LEVEL_INFO, "gp Base 0x%x, Size %d\n", (NvU32)(gpOffset),
              gpSize);

    // if the size passed is zero do not update gpput
    if (gpSize == 0)
        return;

    if (GPPut >= pChannel->channelNumGpFifioEntries)
    {
        // if the Put pointer is invalid, the GPU is likely inaccessible
        NV_PRINTF(LEVEL_INFO, "invalid Put %u >= %u\n", GPPut,
                  pChannel->channelNumGpFifioEntries);
        return;
    }

    while (GPPutNext == GPGet)
    {
        // need to wait for space
        GPGet = MEM_RD32(&pChannel->pControlGPFifo->GPGet);

        if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT)
        {
            NV_ASSERT_FAILED("Timed out waiting for space in GPFIFO!");
            return;
        }
        else if (GPGet >= pChannel->channelNumGpFifioEntries)
        {
            // if the Get pointer is invalid, the GPU is likely inaccessible
            NV_PRINTF(LEVEL_INFO, "invalid Get %u >= %u\n", GPGet,
                      pChannel->channelNumGpFifioEntries);
            return;
        }
    }

    get = pChannel->pbGpuVA + gpOffset;
    length = gpSize;

    GpEntry0 =
       DRF_DEF(906F, _GP_ENTRY0, _NO_CONTEXT_SWITCH, _FALSE) |
       DRF_NUM(906F, _GP_ENTRY0, _GET, NvU64_LO32(get) >> 2);
    GpEntry1 =
       DRF_NUM(906F, _GP_ENTRY1, _GET_HI, NvU64_HI32(get)) |
       DRF_NUM(906F, _GP_ENTRY1, _LENGTH, length >> 2) |
       DRF_DEF(906F, _GP_ENTRY1, _PRIV, _USER) |
       DRF_DEF(906F, _GP_ENTRY1, _LEVEL, _MAIN);


    pGpEntry = (NvU32 *)(((NvU8*)pChannel->pbCpuVA) + pChannel->channelPbSize +
        GPPut*NV906F_GP_ENTRY__SIZE);

    MEM_WR32(&pGpEntry[0], GpEntry0);
    MEM_WR32(&pGpEntry[1], GpEntry1);

    // need to flush the write-combine buffer
    osFlushCpuWriteCombineBuffer();

    // write gpput
    MEM_WR32(&pChannel->pControlGPFifo->GPPut, GPPutNext);
    osFlushCpuWriteCombineBuffer();

    if (kfifoIsLiteModeEnabled_HAL(pGpu, pKernelFifo))
    {
        NV_ASSERT_OR_RETURN_VOID(0);
    }
    else
    {
        workSubmitToken = pChannel->workSubmitToken;
        NV_ASSERT_OR_RETURN_VOID(CliGetKernelChannelWithDevice(pChannel->pRsClient,
                                 pChannel->deviceId, pChannel->channelId,
                                 &pFifoKernelChannel) == NV_OK);
    }
    if (!kchannelIsRunlistSet(pGpu, pFifoKernelChannel))
    {
        NV_PRINTF(LEVEL_ERROR,
                  "FAILED Channel 0x%x is not assigned to runlist yet\n",
                  kchannelGetDebugTag(pFifoKernelChannel));
        return;
    }
    // update doorbell register
    kfifoUpdateUsermodeDoorbell_HAL(pGpu, pKernelFifo, workSubmitToken, kchannelGetRunlistId(pFifoKernelChannel));
}
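
/*
 * Example (illustrative note, not part of this file): a GPFIFO entry packs
 * the pushbuffer segment's GPU VA and its length in words into two 32-bit
 * values:
 *
 *     // get = pbGpuVA + gpOffset = 0x2_0000_0100, gpSize = 0x40 bytes
 *     //   GP_ENTRY0._GET    = lo32(get) >> 2
 *     //   GP_ENTRY1._GET_HI = hi32(get) = 0x2
 *     //   GP_ENTRY1._LENGTH = 0x40 >> 2 = 0x10 words
 *     // The entry is written at GPPut, then GPPut advances modulo
 *     // channelNumGpFifioEntries and the doorbell is rung.
 */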

/*!
 * Inserts the source and destination aperture methods into the
 * push buffer for one block
 *
 * @param[in]     pChannel          OBJCHANNEL pointer
 * @param[in]     srcAddressSpace   source surface address space type
 * @param[in]     srcCpuCacheAttrib source surface address space attributes
 * @param[in]     dstAddressSpace   destination surface address space type
 * @param[in]     dstCpuCacheAttrib destination surface address space attributes
 * @param[in]     pPtr              Double pointer to PB offset
 * @returns None
 */
static void
_ceChannelPushMethodAperture_GM107
(
    OBJCHANNEL      *pChannel,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU32          **pPtr
)
{
    NvU32 *ptr  = *pPtr;
    NvU32  data = 0;

    // Set source parameters
    data = ((srcAddressSpace == ADDR_FBMEM) ? DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _LOCAL_FB) :
        (srcCpuCacheAttrib == NV_MEMORY_CACHED) ? DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _COHERENT_SYSMEM) :
            DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _NONCOHERENT_SYSMEM));

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_SRC_PHYS_MODE, data);

    // Set destination parameters
    data = ((dstAddressSpace == ADDR_FBMEM) ? DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _LOCAL_FB) :
        (dstCpuCacheAttrib == NV_MEMORY_CACHED) ? DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _COHERENT_SYSMEM) :
            DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _NONCOHERENT_SYSMEM));

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_DST_PHYS_MODE, data);

    *pPtr = ptr;
}
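
/*
 * Example (illustrative note, not part of this file): the aperture target is
 * chosen by a simple two-level decision, applied per direction:
 *
 *     // ADDR_FBMEM                    -> _LOCAL_FB
 *     // sysmem + NV_MEMORY_CACHED     -> _COHERENT_SYSMEM
 *     // sysmem + any other attribute  -> _NONCOHERENT_SYSMEM
 */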

/*!
 * Inserts methods into the push buffer for one block
 *
 * @param[in]     pChannel          OBJCHANNEL pointer
 * @param[in]     src               Offset of src to copy from
 * @param[in]     srcAddressSpace   source surface address space type
 * @param[in]     srcCpuCacheAttrib source surface address space attributes
 * @param[in]     dst               Offset of dst to scrub/copy to
 * @param[in]     dstAddressSpace   destination surface address space type
 * @param[in]     dstCpuCacheAttrib destination surface address space attributes
 * @param[in]     size              size of the region to scrub/copy
 * @param[in]     pPtr              Double pointer to PB offset
 * @param[in]     addPayloadSema    Bool to add default payload
 * @param[in]     addNonStallIntr   Bool to add a non-stall interrupt at the end
 * @param[in]     addFinishPayload  Bool to add an extra sema release for the token
 * @param[in]     bMemcopy          NV_TRUE for memcopy / NV_FALSE for scrubbing
 * @returns the number of bytes of methods inserted
 */
static NvU32
_ceChannelPushMethodsBlock_GM107
(
    OBJGPU          *pGpu,
    MemoryManager   *pMemoryManager,
    OBJCHANNEL      *channel,
    RmPhysAddr       src,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    RmPhysAddr       dst,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU64            size,
    NvU32          **pPtr,
    NvBool           addPayloadSema,
    NvBool           addNonStallIntr,
    NvBool           addFinishPayload,
    NvBool           bMemcopy
)
{
    NvU32  launchParams       = 0;
    NvU32 *ptr                = *pPtr;
    NvU32 *pStartPtr          = ptr;
    NvBool addReductionOp     = channel->isChannelSynchronized;
    NvBool bMemoryScrubEnable = NV_FALSE;
    NvU32  remapConstB        = 0;
    NvU32  remapComponentSize = 0;

    NV_PRINTF(LEVEL_INFO, "Base = 0x%llx, Size = 0x%llx, PB location = %p\n",
              dst, size, ptr);

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NV906F_SET_OBJECT, channel->classEngineID);

    if (size > 0)
    {
        NvU32 payLoad = channel->channelPutOffset + channel->methodSizePerBlock;

        if (addNonStallIntr)  payLoad = payLoad + NONSTALL_METHOD_SIZE;
        if (addReductionOp)   payLoad = payLoad + SEMAPHORE_ONLY_METHOD_SIZE;
        if (addFinishPayload) payLoad = payLoad + SEMAPHORE_ONLY_METHOD_SIZE;

        if (addPayloadSema)
        {
            memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
                NVA06F_SUBCHANNEL_COPY_ENGINE,
                channel->pbGpuVA+channel->semaOffset, payLoad, &ptr);

            NV_PRINTF(LEVEL_INFO, "Pushing Semaphore Payload 0x%x\n", payLoad);
            channel->lastPayloadPushed = payLoad;
        }

        if (IS_SIMULATION(pGpu))
        {
            //
            // fmodel CE is slow (compared to emulation) so we don't bother
            // scrubbing the whole block. Fmodel already scrubs memory via ramif
            // so we'll never get exceptions
            //
            size = NV_MIN(size, 0x20);
        }

        memmgrChannelPushAddressMethodsBlock_HAL(pMemoryManager, NV_FALSE,
            NVA06F_SUBCHANNEL_COPY_ENGINE, dst, &ptr);

        if (bMemcopy)
        {
            memmgrChannelPushAddressMethodsBlock_HAL(pMemoryManager, NV_TRUE,
                NVA06F_SUBCHANNEL_COPY_ENGINE, src, &ptr);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size));
        }
        else
        {
            bMemoryScrubEnable = memmgrMemUtilsCheckMemoryFastScrubEnable_HAL(pGpu,
                                                   pMemoryManager,
                                                   channel->hTdCopyClass,
                                                   channel->bUseVasForCeCopy,
                                                   dst,
                                                   NvU64_LO32(size),
                                                   dstAddressSpace);
            if (bMemoryScrubEnable)
            {
                NV_PRINTF(LEVEL_INFO, "Using Fast memory scrubber\n");
                remapConstB        = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _DST_X, _CONST_B);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_CONST_B, 0x00000000);

                remapComponentSize = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _COMPONENT_SIZE, _ONE);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size));
            }
            else
            {
                remapComponentSize = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _COMPONENT_SIZE, _FOUR);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size >> 2));
            }

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_COMPONENTS,
                                            DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _DST_X, _CONST_A)              |
                                            DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _NUM_SRC_COMPONENTS, _ONE)     |
                                            DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _NUM_DST_COMPONENTS, _ONE)     |
                                            remapComponentSize                                                  |
                                            remapConstB);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_CONST_A, 0x00000000);

            NV_ASSERT(srcAddressSpace == 0);
            NV_ASSERT(dstAddressSpace == ADDR_FBMEM);

            srcAddressSpace = ADDR_FBMEM;
        }

        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_COUNT, 1);

        _ceChannelPushMethodAperture_GM107(channel, srcAddressSpace, srcCpuCacheAttrib, dstAddressSpace, dstCpuCacheAttrib, &ptr);

        launchParams = DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_TYPE, _PHYSICAL) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_TYPE, _PHYSICAL) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _PIPELINED);

        if (addPayloadSema)
        {
            launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                            DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE);
        }
        else
        {
            launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _NONE);
        }

        if (bMemoryScrubEnable)
        {
            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVC8B5_SET_MEMORY_SCRUB_PARAMETERS,
                          DRF_DEF(C8B5, _SET_MEMORY_SCRUB_PARAMETERS, _DISCARDABLE, _FALSE));

            launchParams |= DRF_DEF(C8B5, _LAUNCH_DMA, _MEMORY_SCRUB_ENABLE, _TRUE);
            launchParams |= DRF_DEF(C8B5, _LAUNCH_DMA, _REMAP_ENABLE, _FALSE);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVC8B5_LAUNCH_DMA, launchParams);
        }
        else
        {
            if (!bMemcopy)
            {
                launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE);
            }

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
        }
    }

    if (addReductionOp)
    {
        NvU32 currentBlock = BLOCK_INDEX_FROM_ADDR((dst), channel->blockShift);
        NvU32 blockOffset;
        NvU32 bitFlip;

        blockOffset = (currentBlock / 32) * 4;
        bitFlip     = ((NvU32)1 << (currentBlock % 32));
        memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
            NVA06F_SUBCHANNEL_COPY_ENGINE,
            channel->pbGpuBitMapVA+(blockOffset), bitFlip, &ptr);

        launchParams = DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION_ENABLE, _TRUE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION_SIGN, _UNSIGNED) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION, _IOR) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _NONE);
        // push only the second semaphore release
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
    }

    if (addFinishPayload)
    {
        memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
                NVA06F_SUBCHANNEL_COPY_ENGINE,
                channel->pbGpuVA+channel->finishPayloadOffset,
                channel->finishPayload, &ptr);

        launchParams =  DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _NONE);
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
        NV_PRINTF(LEVEL_INFO, "Pushing Finishing Semaphore Payload 0x%x\n",
                  channel->finishPayload);
    }

    if (addNonStallIntr)
    {
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NV906F_NON_STALL_INTERRUPT, 0);
    }

    channel->channelPutOffset = (NvU32)((NvU8 *)ptr - (NvU8 *)channel->pbCpuVA);
    *pPtr = ptr;

    // return length of methods inserted
    return (NvU32)((NvU8*)ptr - (NvU8*)pStartPtr);
}
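
/*
 * Example (illustrative note, not part of this file): for the non-fast scrub
 * path the CE remaps a 4-byte constant across the destination, so the line
 * length is expressed in 4-byte elements:
 *
 *     // size = 0x1000 bytes, COMPONENT_SIZE = _FOUR
 *     //   LINE_LENGTH_IN = size >> 2 = 0x400 elements
 *     // With the fast scrubber (COMPONENT_SIZE = _ONE) the length stays in
 *     // bytes: LINE_LENGTH_IN = 0x1000.
 */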

/*!
 * Gets the Copy Engine class
 *
 * @param[in]     pGpu         OBJGPU pointer
 * @param[out]    pClass       pointer to class
 */
NV_STATUS
memmgrMemUtilsGetCopyEngineClass_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvU32         *pClass
)
{
    NV_STATUS status;
    NvU32 numClasses;
    NvU32 *pClassList = NULL;
    NvU32 i;
    NvU32 class = 0;
    NvU32 eng;

    //
    // Pascal+ chips will have any combination of the 6 CEs
    // available. Loop over all the CEs to get the CE class
    // for the first available CE instead of using ENG_CE(0)
    //
    for (eng = 0; eng < ENG_CE__SIZE_1; eng++)
    {
        NV_ASSERT_OK_OR_ELSE(
            status,
            gpuGetClassList(pGpu, &numClasses, NULL, ENG_CE(eng)),
            return 0);

        if (numClasses > 0)
        {
            break;
        }
    }

    pClassList = portMemAllocNonPaged(sizeof(*pClassList) * numClasses);
    NV_ASSERT_OR_RETURN((pClassList != NULL), 0);

    if (NV_OK == gpuGetClassList(pGpu, &numClasses, pClassList, ENG_CE(eng)))
    {
        for (i = 0; i < numClasses; i++)
        {
            class = NV_MAX(class, pClassList[i]);
        }
    }

    NV_ASSERT(class != 0);
    portMemFree(pClassList);
    *pClass = class;

    return NV_OK;
}

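/*
 * Example (illustrative note, not part of this file): the class list is
 * queried with the two-pass pattern used above, and the highest-numbered
 * class is taken as the most capable DMA copy class:
 *
 *     // pass 1: gpuGetClassList(pGpu, &numClasses, NULL, ENG_CE(eng))
 *     //         -> numClasses only
 *     // pass 2: gpuGetClassList(pGpu, &numClasses, pClassList, ENG_CE(eng))
 *     //         -> fills pClassList
 *     // e.g. { MAXWELL_DMA_COPY_A (0xB0B5), HOPPER_DMA_COPY_A (0xC8B5) }
 *     //   NV_MAX over the list picks 0xC8B5
 */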