/*
 * SPDX-FileCopyrightText: Copyright (c) 2012-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "core/core.h"
#include "gpu/gpu.h"
#include "gpu/device/device.h"
#include "os/os.h"
#include "gpu/bus/kern_bus.h"
#include "gpu/mem_mgr/mem_mgr.h"
#include "gpu/mem_mgr/heap.h"
#include "gpu/mem_mgr/mem_scrub.h"
#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "gpu/mem_mgr/mem_desc.h"
#include "gpu/ce/kernel_ce.h"
#include "gpu/ce/kernel_ce_private.h"
#include "mem_mgr/gpu_vaspace.h"
#include "core/locks.h"
#include "nvRmReg.h"
#include "rmapi/rs_utils.h"
#include "mem_mgr/ctx_buf_pool.h"
#include "vgpu/rpc.h"
#include "kernel/gpu/fifo/kernel_channel.h"
#include "platform/chipset/chipset.h"
#include "platform/sli/sli.h"

#include "class/clc0b5sw.h"
#include "class/cla06fsubch.h" // NVA06F_SUBCHANNEL_COPY_ENGINE
#include "class/cl003e.h"      // NV01_MEMORY_SYSTEM
#include "class/cl0040.h"      // NV01_MEMORY_LOCAL_USER
#include "class/cl50a0.h"      // NV50_MEMORY_VIRTUAL
#include "class/cl00c2.h"      // NV01_MEMORY_LOCAL_PHYSICAL
#include "class/clb0b5.h"      // MAXWELL_DMA_COPY_A
#include "class/clc8b5.h"      // HOPPER_DMA_COPY_A
#include "class/cl90f1.h"      // FERMI_VASPACE_A

#define NONSTALL_METHOD_SIZE            8
#define SEMAPHORE_ONLY_METHOD_SIZE      32
#define MAX_EXTRA_PAYLOAD               (NONSTALL_METHOD_SIZE + SEMAPHORE_ONLY_METHOD_SIZE)
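
//
// Worst-case extra method bytes appended to a block of CE work: a
// non-stall interrupt method plus a semaphore-release method (see the
// addNonStallIntr / addFinishPayload parameters of
// _ceChannelPushMethodsBlock_GM107).
//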

static NV_STATUS _memUtilsChannelAllocatePB_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsAllocateChannel(OBJGPU *pGpu, MemoryManager *pMemoryManager, NvHandle hClientId,
                                    NvHandle hDeviceId, NvHandle hChannelId, NvHandle hObjectError,
                                    NvHandle hObjectBuffer, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsAllocCe_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                             NvHandle hClientId, NvHandle hDeviceId, NvHandle hChannelId, NvHandle hCopyObjectId);
static NV_STATUS _memUtilsAllocateUserD(OBJGPU *pGpu, MemoryManager *pMemoryManager, NvHandle hClientId,
                                        NvHandle hDeviceId, OBJCHANNEL *pChannel);
static NV_STATUS _memUtilsMapUserd_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager,
                           OBJCHANNEL *pChannel, NvHandle hClientId, NvHandle hDeviceId,
                           NvHandle hChannelId, NvBool bUseRmApiForBar1);
static NV_STATUS _memUtilsAllocateReductionSema(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel);
static NvU32 _ceChannelScheduleWork_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                          RmPhysAddr src, NV_ADDRESS_SPACE srcAddressSpace, NvU32 srcCpuCacheAttrib,
                                          RmPhysAddr dst, NV_ADDRESS_SPACE dstAddressSpace, NvU32 dstCpuCacheAttrib,
                                          NvU64 size, NvBool blocking, NvBool insertFinishPayload, NvBool bMemcopy);
static void  _ceChannelUpdateGpFifo_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                          NvU32 gpOffset, NvU32 gpSize);
static NvU32 _ceChannelPushMethodsBlock_GM107(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel,
                                              RmPhysAddr src, NV_ADDRESS_SPACE srcAddressSpace, NvU32 srcCpuCacheAttrib,
                                              RmPhysAddr dst, NV_ADDRESS_SPACE dstAddressSpace, NvU32 dstCpuCacheAttrib,
                                              NvU64 size, NvU32 **pPtr, NvBool addPayloadSema,
                                              NvBool addNonStallIntr, NvBool addFinishPayload, NvBool bMemcopy);
static NvU32 _getSpaceInPb(OBJCHANNEL *pChannel);
static NvBool _checkSynchronization(OBJGPU *pGpu, MemoryManager *pMemoryManager, OBJCHANNEL *pChannel, NvU32 block);

static NV_STATUS
_memUtilsAllocateReductionSema
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{

    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NV_STATUS                   rmStatus;
    NvU32                       i;
    NV_STATUS                   lockStatus;
    RM_API                     *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
    // allocate physical memory for a bit map semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));

    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
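    //
    // Bitmap semaphore size: one bit per block, rounded up to whole
    // 32-bit words, i.e. ((blockCount + 31) / 32) words * 4 bytes.
    //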
    memAllocParams.size      = (((pChannel->blockCount + 31)/32)*4);
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;

    //
    // When the APM feature is enabled, all RM internal sysmem allocations
    // must be in unprotected memory.
    // When Hopper CC is enabled, RM internal sysmem allocations that need
    // to be accessed by the GPU should be in unprotected memory, while
    // sysmem allocations the GPU never accesses should be in protected
    // memory.
    //

    NV_ASSERT_OK_OR_RETURN(
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                pChannel->deviceId,
                                pChannel->bitMapSemPhysId,
                                NV01_MEMORY_SYSTEM,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate virtual memory for a bit map semaphore
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = (((pChannel->blockCount + 31)/32)*4);
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;

    NV_ASSERT_OK_OR_RETURN(
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                pChannel->deviceId,
                                pChannel->bitMapSemVirtId,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);
    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating reduction sema");
        return NV_ERR_GENERIC;
    }

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        pRmApi->Map(pRmApi,
                    pChannel->hClient,
                    pChannel->deviceId,
                    pChannel->bitMapSemVirtId,
                    pChannel->bitMapSemPhysId, //hMemory,
                    0,
                    (((pChannel->blockCount + 31)/32)*4),
                    NV04_MAP_MEMORY_FLAGS_NONE,
                    &pChannel->pbGpuBitMapVA),
        exit_sema_creation);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        pRmApi->MapToCpu(pRmApi,
                         pChannel->hClient,
                         pChannel->deviceId,
                         pChannel->bitMapSemPhysId,
                         0,
                         (((pChannel->blockCount + 31)/32)*4),
                         (void **)&pChannel->pbBitMapVA,
                         0),
        exit_sema_creation);

    for (i = 0; i < (((pChannel->blockCount + 31) / 32) * 4); i += 4)
    {
        MEM_WR32((NvU8 *)pChannel->pbBitMapVA + i, 0);
    }

    return NV_OK;
exit_sema_creation:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    NV_PRINTF(LEVEL_INFO, "end  NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}

static NV_STATUS
_memUtilsChannelAllocatePB_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel

    //  OBJMEMUTILS *to be added here
)
{
    NV_STATUS                   rmStatus = NV_OK;
    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NvHandle                    hDevice;
    NvHandle                    hPhysMem;
    NvU64                       size;
    NvHandle                    hVirtMem;
    NvU32                       hClass;
    NvU32                       attr;
    NvU32                       flags        = 0;
    NvU32                       attrNotifier = NVOS32_ATTR_NONE;
    RM_API                     *pRmApi       = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    // Apply registry overrides to channel pushbuffer.
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC_4, _CHANNEL_PUSHBUFFER, pGpu->instLocOverrides4))
    {
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_VID:
            hClass = NV01_MEMORY_LOCAL_USER;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _VIDMEM)     |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);

            flags = NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
            if (!IS_MIG_IN_USE(pGpu))
            {
                attr |= DRF_DEF(OS32, _ATTR, _ALLOCATE_FROM_RESERVED_HEAP, _YES);
            }
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_COH:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)        |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED);
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_NCOH:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)        |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
            attrNotifier = attr;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_DEFAULT:
        default:
            hClass = NV01_MEMORY_SYSTEM;
            attr   = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)        |
                     DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);

            //
            // The work submit token is read from notifier memory to support
            // VM migration for the memory scrubber; the token is re-read
            // from notifier memory on every scrubber work submission.
            // Making the notifier memory cached by default helps
            // performance.
            //
            attrNotifier = DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI)  |
                           DRF_DEF(OS32, _ATTR, _COHERENCY, _CACHED);
            break;
    }

    hDevice           =  pChannel->deviceId;
    hPhysMem          =  pChannel->physMemId;
    hVirtMem          =  pChannel->pushBufferId;
    size              =  pChannel->channelSize;

    LOCK_ASSERT_AND_RETURN(!rmGpuLockIsOwner());
    // allocate the physical memory
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = size;
    memAllocParams.attr      = attr;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = flags;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, RM internal sysmem allocations that need
    // to be accessed by the GPU should be in unprotected memory, but all
    // vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                hPhysMem,
                                hClass,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate the Virtual memory
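    //
    // NVOS32_ALLOC_FLAGS_VIRTUAL only reserves a GPU VA range in the
    // channel's VAS here; the physical pushbuffer pages are attached to
    // it later via pRmApi->Map() in memmgrMemUtilsChannelInitialize_GM107.
    //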
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = size;
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;
    memAllocParams.hVASpace = pChannel->hVASpaceId;

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                hVirtMem,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate the physmem for the notifier
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = pChannel->channelNotifierSize;
    memAllocParams.attr      = attrNotifier;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, RM internal sysmem allocations that need
    // to be accessed by the GPU should be in unprotected memory, but all
    // vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                pChannel->errNotifierIdPhys,
                                hClass,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    // allocate Virtual Memory for the notifier
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = pChannel->channelNotifierSize;
    memAllocParams.attr      = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;
    memAllocParams.hVASpace = pChannel->hVASpaceId;

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                pChannel->hClient,
                                hDevice,
                                pChannel->errNotifierIdVirt,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    return rmStatus;
}

NV_STATUS
memmgrMemUtilsChannelInitialize_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS         rmStatus;
    NV_STATUS         lockStatus;
    RsClient         *pRsClient           = pChannel->pRsClient;
    NvHandle          hClient             = pChannel->hClient;
    NvHandle          hDevice             = pChannel->deviceId;
    NvHandle          hPhysMem            = pChannel->physMemId;
    NvU64             size                = pChannel->channelSize;
    NvHandle          hChannel            = pChannel->channelId;
    NvHandle          hErrNotifierVirt    = pChannel->errNotifierIdVirt;
    NvHandle          hErrNotifierPhys    = pChannel->errNotifierIdPhys;
    NvHandle          hPushBuffer         = pChannel->pushBufferId;
    RM_API           *pRmApi              = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NvBool            bMIGInUse           = IS_MIG_IN_USE(pGpu);
    NvU8             *pErrNotifierCpuVA   = NULL;
    NV_ADDRESS_SPACE  userdAddrSpace;
    NV_ADDRESS_SPACE  pushBuffAddrSpace;
    NV_ADDRESS_SPACE  gpFifoAddrSpace;
    OBJSYS           *pSys                = SYS_GET_INSTANCE();
    OBJCL            *pCl                 = SYS_GET_CL(pSys);
    NvU32             cacheSnoopFlag      = 0;
    NvBool            bUseRmApiForBar1    = NV_FALSE;

    //
    // Heap alloc one chunk of memory to hold all of our alloc parameters to
    // reduce stack usage
    //
    union
    {
        NV_VASPACE_ALLOCATION_PARAMETERS va;
        NV_MEMORY_ALLOCATION_PARAMS      mem;
    } *pParams = NULL;

    if (pCl->getProperty(pCl, PDB_PROP_CL_IS_CHIPSET_IO_COHERENT))
    {
        cacheSnoopFlag = DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE);
    }

    pParams = portMemAllocNonPaged(sizeof(*pParams));
    if (pParams == NULL)
    {
        rmStatus = NV_ERR_NO_MEMORY;
        goto exit_free_client;
    }

    //
    // Client-allocated USERD is only supported on Volta and later.
    // TODO: Use a property to check whether client-allocated USERD is
    // supported.
    //
    pChannel->bClientUserd = IsVOLTAorBetter(pGpu);

    //
    // We need to allocate a VAS to use for CE copies, but also for
    // GSP-RM + MIG, so that it doesn't get the device
    // default VAS during channel bind (which is not properly handled
    // by split VAS in MIG currently). We only need the identity mapping
    // when actually using the VAS for copies.
    //
    if (pChannel->bUseVasForCeCopy ||
        (IS_GSP_CLIENT(pGpu) && bMIGInUse))
    {
        NvBool bAcquireLock = NV_FALSE;
        NvU64 startFbOffset = GPU_GET_HEAP(pGpu)->base;
        NvU64 fbSize        = GPU_GET_HEAP(pGpu)->total;
        NvU64 vaStartOffset = startFbOffset;

        NV_PRINTF(LEVEL_INFO, "Channel VAS heap base: %llx total: %llx\n", GPU_GET_HEAP(pGpu)->base,
                  GPU_GET_HEAP(pGpu)->total);

        pChannel->startFbOffset = startFbOffset;
        pChannel->fbSize = fbSize;

        if (pChannel->bUseVasForCeCopy)
        {
            NV_ASSERT_OK_OR_GOTO(rmStatus,
                clientGenResourceHandle(pRsClient, &pChannel->hFbAlias),
                exit_free_client);

            rmStatus = memmgrMemUtilsCreateMemoryAlias_HAL(pGpu, pMemoryManager, pChannel);
            if (rmStatus != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "Setting identity mapping failed, status: %x\n", rmStatus);
                goto exit_free_client;
            }
        }

        {
            NV_VASPACE_ALLOCATION_PARAMETERS *pVa = &pParams->va;

            portMemSet(pVa, 0, sizeof(*pVa));
            pVa->index  = NV_VASPACE_ALLOCATION_INDEX_GPU_NEW;
            pVa->vaBase = pChannel->startFbOffset;
            //
            // How large should this VAS be? We definitely need more than
            // heapSize, to allocate other metadata related to the channel;
            // we must also account for the discontiguous VA range used by
            // split VAS, where 4GB to (4GB + 512MB) is allocated for the
            // server VAS (mirrored).
            //
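            //
            // Net VA budget, per the statements below: fbSize + channelSize
            // + SCRUBBER_VASPACE_BUFFER_SIZE, plus the server-RM-managed
            // carveout when split VAS is enabled.
            //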
            if (gpuIsSplitVasManagementServerClientRmEnabled(pGpu))
            {
                pVa->vaSize += (SPLIT_VAS_SERVER_RM_MANAGED_VA_START +
                                SPLIT_VAS_SERVER_RM_MANAGED_VA_SIZE);
            }
            pVa->vaSize += fbSize + pChannel->channelSize + SCRUBBER_VASPACE_BUFFER_SIZE;

            //
            // We definitely need ALLOW_ZERO_ADDRESS, but SKIP_SCRUB_MEMPOOL is a patch
            // until we figure out the right place for Scrubber page tables
            //
            pVa->flags |= NV_VASPACE_ALLOCATION_FLAGS_ALLOW_ZERO_ADDRESS |
                          NV_VASPACE_ALLOCATION_FLAGS_SKIP_SCRUB_MEMPOOL |
                          NV_VASPACE_ALLOCATION_FLAGS_OPTIMIZE_PTETABLE_MEMPOOL_USAGE;

            if (!IS_MIG_IN_USE(pGpu))
            {
                pVa->flags |= NV_VASPACE_ALLOCATION_FLAGS_PTETABLE_HEAP_MANAGED;
            }

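            //
            // If this thread holds the GPU lock, drop it and switch to the
            // API-lock-internal RMAPI for the VAS allocation; the GPU lock
            // is re-acquired immediately after the allocation below.
            //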
            if (rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
            {
                rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
                bAcquireLock = NV_TRUE;
                pRmApi = rmapiGetInterface(RMAPI_API_LOCK_INTERNAL);
            }

            rmStatus = pRmApi->AllocWithHandle(pRmApi, hClient, pChannel->deviceId,
                                               pChannel->hVASpaceId, FERMI_VASPACE_A,
                                               pVa, sizeof(*pVa));
        }
        if (bAcquireLock)
        {
            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus, rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));
            bAcquireLock = NV_FALSE;
            pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        }

        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "failed allocating scrubber vaspace, status=0x%x\n",
                    rmStatus);
            goto exit_free_client;
        }

        rmStatus = vaspaceGetByHandleOrDeviceDefault(pRsClient,
                                                     pChannel->deviceId,
                                                     pChannel->hVASpaceId,
                                                     &pChannel->pVAS);
        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR,
                    "failed getting the scrubber vaspace from handle, status=0x%x\n",
                    rmStatus);
            goto exit_free_client;
        }

        if (pChannel->bUseVasForCeCopy)
        {
            if (!gpuIsWarBug200577889SriovHeavyEnabled(pGpu))
            {
                rmStatus = vaspacePinRootPageDir(pChannel->pVAS, pGpu);
                if (rmStatus != NV_OK)
                {
                    NV_PRINTF(LEVEL_ERROR, "failed pinning down Scrubber VAS, status=0x%x\n",
                            rmStatus);
                    goto exit_free_client;
                }
            }

            NV_ASSERT_OK_OR_GOTO(rmStatus,
                 clientGenResourceHandle(pRsClient, &pChannel->hFbAliasVA), exit_free_client);
        }

        if (gpuIsSplitVasManagementServerClientRmEnabled(pGpu))
        {
            OBJGVASPACE *pGVAS = dynamicCast(pChannel->pVAS, OBJGVASPACE);
            vaStartOffset += pGVAS->vaLimitServerRMOwned + 1;
            pChannel->vaStartOffset = vaStartOffset;
        }

        if (rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
        {
            rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
            bAcquireLock = NV_TRUE;
            pRmApi = rmapiGetInterface(RMAPI_API_LOCK_INTERNAL);
        }

        // Allocate virtual memory for Identity Mapping
        if (pChannel->bUseVasForCeCopy)
        {
            NV_MEMORY_ALLOCATION_PARAMS *pMem = &pParams->mem;
            portMemSet(pMem, 0, sizeof(*pMem));
            pMem->owner     = NVOS32_TYPE_OWNER_RM;
            pMem->type      = NVOS32_TYPE_IMAGE;
            pMem->size      = pChannel->fbSize;
            pMem->attr      = (DRF_DEF(OS32, _ATTR, _LOCATION,  _PCI) |
                               DRF_DEF(OS32, _ATTR, _PAGE_SIZE, _BIG));
            pMem->attr2     = NVOS32_ATTR2_NONE;
            pMem->offset    = vaStartOffset;
            pMem->flags     = 0;
            pMem->flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL |
                              NVOS32_ALLOC_FLAGS_FIXED_ADDRESS_ALLOCATE |
                              NVOS32_ALLOC_FLAGS_LAZY;
            pMem->hVASpace = pChannel->hVASpaceId;

            rmStatus = pRmApi->AllocWithHandle(pRmApi,
                                               hClient,
                                               pChannel->deviceId,
                                               pChannel->hFbAliasVA,
                                               NV50_MEMORY_VIRTUAL,
                                               pMem,
                                               sizeof(*pMem));
        }

        if (bAcquireLock)
        {
            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus, rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));
            bAcquireLock = NV_FALSE;
            pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        }

        if (rmStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Allocating VASpace for (base, size): (%llx, %llx) failed,"
                                   " with status: %x\n", vaStartOffset, pChannel->fbSize, rmStatus);
            goto exit_free_client;
        }

        // set up mapping of VA -> PA
        if (pChannel->bUseVasForCeCopy)
        {
            NV_CHECK_OK_OR_GOTO(
                rmStatus,
                LEVEL_ERROR,
                pRmApi->Map(pRmApi,
                            hClient,
                            pChannel->deviceId,
                            pChannel->hFbAliasVA,
                            pChannel->hFbAlias,
                            0,
                            pChannel->fbSize,
                            DRF_DEF(OS46, _FLAGS, _ACCESS,           _READ_WRITE) |
                            DRF_DEF(OS46, _FLAGS, _PAGE_SIZE,        _BIG)        |
                            DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP,      _ENABLE),
                            &pChannel->fbAliasVA),
                exit_free_client);

            NV_PRINTF(LEVEL_INFO, "Scrubber VAS :%x identity mapped with start addr: %llx, size: %llx\n",
                      pChannel->hFbAliasVA, pChannel->fbAliasVA, pChannel->fbSize);
        }
    }

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    //
    // Fetch the physical location of the push buffer
    //
    // Bug 3434881 filed to track the following
    // a. Implementation of the utility function to parse the
    //    push buffer and userd regkeys
    // b. Replace all instances of regkey pushbuffer/userd regkey
    //    parsing with the utility function
    //
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC_4, _CHANNEL_PUSHBUFFER, pGpu->instLocOverrides4))
    {
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_VID:
            pushBuffAddrSpace = ADDR_FBMEM;
            break;

        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_COH:
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_NCOH:
        case NV_REG_STR_RM_INST_LOC_4_CHANNEL_PUSHBUFFER_DEFAULT:
        default:
            pushBuffAddrSpace = ADDR_SYSMEM;
            break;
    }

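    //
    // The GPFIFO ring lives in the same buffer as the pushbuffer (its
    // offset is pushbuffer VA + channelPbSize; see _memUtilsAllocateChannel),
    // so it inherits the pushbuffer's address space.
    //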
    gpFifoAddrSpace = pushBuffAddrSpace;

    // Fetch the physical location of USERD
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC, _USERD, pGpu->instLocOverrides))
    {
        case NV_REG_STR_RM_INST_LOC_USERD_NCOH:
        case NV_REG_STR_RM_INST_LOC_USERD_COH:
            userdAddrSpace = ADDR_SYSMEM;
            break;

        case NV_REG_STR_RM_INST_LOC_USERD_VID:
        case NV_REG_STR_RM_INST_LOC_USERD_DEFAULT:
        default:
            userdAddrSpace = ADDR_FBMEM;
            break;
    }

    // RM WAR for Bug 3313719
    // Disallow USERD in sysmem and (GPFIFO or pushbuffer) in vidmem
    rmStatus = kfifoCheckChannelAllocAddrSpaces_HAL(GPU_GET_KERNEL_FIFO(pGpu),
                                                    userdAddrSpace,
                                                    pushBuffAddrSpace,
                                                    gpFifoAddrSpace);
    if (rmStatus != NV_OK)
    {
        NV_ASSERT_FAILED("USERD in sysmem and PushBuffer/GPFIFO in vidmem not allowed");
        goto exit_free_client;
    }

    rmStatus = _memUtilsChannelAllocatePB_GM107(pGpu, pMemoryManager, pChannel);
    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);
    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating Push Buffer sema");
        goto exit_free_client;
    }
    if (rmStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Push buffer allocation failed, status=0x%x\n", rmStatus);
        goto exit_free_client;
    }

    // map the pushbuffer
    rmStatus = pRmApi->Map(pRmApi, hClient, hDevice,
                           hPushBuffer,
                           hPhysMem, //hMemory,
                           0,
                           size,
                           cacheSnoopFlag,
                           &pChannel->pbGpuVA);
    if (rmStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Mapping the pushbuffer failed, status=0x%x\n", rmStatus);
        goto exit_free_client;
    }

    // map the error notifier
    rmStatus = pRmApi->Map(pRmApi, hClient, hDevice,
                           hErrNotifierVirt,
                           hErrNotifierPhys, //hMemory,
                           0,
                           pChannel->channelNotifierSize,
                           DRF_DEF(OS46, _FLAGS, _KERNEL_MAPPING, _ENABLE) | cacheSnoopFlag,
                           &pChannel->pbGpuNotifierVA);
    if (rmStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Mapping the error notifier failed, status=0x%x\n", rmStatus);
        goto exit_free_client;
    }

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsAllocateChannel(pGpu,
                                 pMemoryManager,
                                 hClient,
                                 hDevice,
                                 hChannel,
                                 hErrNotifierVirt,
                                 hPushBuffer,
                                 pChannel),
        exit_free_client);

    // _memUtilsMapUserd
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsMapUserd_GM107(pGpu, pMemoryManager, pChannel,
                                hClient, hDevice, hChannel, bUseRmApiForBar1),
        exit_free_client);

    // Set up pushbuffer and semaphore memdesc and memset the buffer
    pChannel->pChannelBufferMemdesc =
        memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, pChannel->hClient, hPhysMem);
    NV_ASSERT_OR_GOTO(pChannel->pChannelBufferMemdesc != NULL, exit_free_client);

    // Set up notifier memory
    pChannel->pErrNotifierMemdesc =
        memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, pChannel->hClient, hErrNotifierPhys);
    NV_ASSERT_OR_GOTO(pChannel->pErrNotifierMemdesc != NULL, exit_free_client);

    if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
    {
        rmStatus = memmgrMemDescMemSet(pMemoryManager, pChannel->pChannelBufferMemdesc, 0,
                                       (TRANSFER_FLAGS_SHADOW_ALLOC | TRANSFER_FLAGS_SHADOW_INIT_MEM));
        NV_ASSERT_OR_GOTO(rmStatus == NV_OK, exit_free_client);

        pChannel->pbCpuVA = NULL;
        pChannel->pTokenFromNotifier = NULL;
    }
    else
    {
        if (bUseRmApiForBar1)
        {
            NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
                pRmApi->MapToCpu(pRmApi, hClient, hDevice, hPhysMem, 0, size,
                                 (void **)&pChannel->pbCpuVA, 0),
                exit_free_client);

            NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
                pRmApi->MapToCpu(pRmApi, hClient, hDevice, hErrNotifierPhys, 0,
                    pChannel->channelNotifierSize, (void **)&pErrNotifierCpuVA, 0),
                exit_free_client);
        }
        else
        {
            //
            // Most use cases can migrate to the internal memdescMap path for BAR1
            // And it is preferred because external path will not work with CC
            //
            pChannel->pbCpuVA = memmgrMemDescBeginTransfer(pMemoryManager,
                                    pChannel->pChannelBufferMemdesc, TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_GOTO(pChannel->pbCpuVA != NULL, exit_free_client);

            pErrNotifierCpuVA = memmgrMemDescBeginTransfer(pMemoryManager,
                                    pChannel->pErrNotifierMemdesc, TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_GOTO(pErrNotifierCpuVA != NULL, exit_free_client);
        }

        portMemSet(pChannel->pbCpuVA, 0, (NvLength)size);

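        //
        // The error notifier is an array of NvNotification entries; the
        // WORK_SUBMIT_TOKEN entry holds the token used to ring the
        // channel's doorbell on work submission.
        //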
        pChannel->pTokenFromNotifier =
            (NvNotification *)(pErrNotifierCpuVA +
                               (NV_CHANNELGPFIFO_NOTIFICATION_TYPE_WORK_SUBMIT_TOKEN *
                                sizeof(NvNotification)));
    }

    //
    // Allocate and map the doorbell region to use in scrub on free.
    // Set the doorbell register to false, since pre-Volta chips don't
    // support it.
    //
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        memmgrScrubMapDoorbellRegion_HAL(pGpu, pMemoryManager, pChannel),
        exit_free_client);

    portMemFree(pParams);
    return NV_OK;

exit_free_client:
    if (!pChannel->bClientAllocated)
    {
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    }
    portMemFree(pParams);
    NV_PRINTF(LEVEL_INFO, "end  NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}


/** memmgrMemUtilsCreateMemoryAlias_GM107
 *
 *  @brief Creates an alias for the FB region.
 *         This function doesn't allocate any memory; it just creates a
 *         memory handle that refers to the FB range. It supports both
 *         bare metal and vGPU.
 *  @param[in] pChannel             CHANNEL Pointer
 *
 *  @returns NV_OK on success
 */
NV_STATUS
memmgrMemUtilsCreateMemoryAlias_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    RM_API  *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NV_STATUS status = NV_OK;

    NV_PHYSICAL_MEMORY_ALLOCATION_PARAMS physMemParams = {0};

    memmgrGetPteKindForScrubber_HAL(pMemoryManager, &physMemParams.format);

    status = pRmApi->AllocWithHandle(pRmApi,
                                    pChannel->hClient,
                                    pChannel->deviceId,
                                    pChannel->hFbAlias,
                                    NV01_MEMORY_LOCAL_PHYSICAL,
                                    &physMemParams,
                                    sizeof(physMemParams));
    if (status != NV_OK)
    {
        NV_CHECK_OK_FAILED(LEVEL_WARNING, "Aliasing FbListMem", status);
        return status;
    }

    NV_PRINTF(LEVEL_INFO, "Allocating FbAlias: %x for size: %llx, kind: %x\n", pChannel->hFbAlias,
              pChannel->fbSize, physMemParams.format);

    return NV_OK;
}

NV_STATUS
memmgrMemUtilsCopyEngineInitialize_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS rmStatus = NV_OK;
    RM_API   *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    // Allocate the copy engine object
    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        _memUtilsAllocCe_GM107(pGpu,
                               pMemoryManager,
                               pChannel,
                               pChannel->hClient,
                               pChannel->deviceId,
                               pChannel->channelId,
                               pChannel->engineObjectId),
        exit_free);

    NV_CHECK_OK_OR_GOTO(
        rmStatus,
        LEVEL_ERROR,
        memmgrMemUtilsChannelSchedulingSetup(pGpu, pMemoryManager, pChannel), exit_free);

    return NV_OK;

exit_free:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    NV_PRINTF(LEVEL_INFO, "end  NV_STATUS=0x%08x\n", rmStatus);
    return rmStatus;
}

static NV_STATUS _memUtilsAllocCe_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvHandle       hCopyObjectId

)
{
    NVC0B5_ALLOCATION_PARAMETERS  createParams = {0};
    RM_API                       *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

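    // Target the logical copy engine selected for this channel (ceId).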
    createParams.version = NVC0B5_ALLOCATION_PARAMETERS_VERSION_1;
    createParams.engineType = NV2080_ENGINE_TYPE_COPY(pChannel->ceId);
    memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &pChannel->hTdCopyClass);
    pChannel->engineType = gpuGetRmEngineType(createParams.engineType);

    if (!pChannel->hTdCopyClass)
    {
        NV_PRINTF(LEVEL_ERROR, "Unable to determine CE's engine class.\n");
        return NV_ERR_GENERIC;
    }

    NV_CHECK_OK_OR_RETURN(
        LEVEL_ERROR,
        pRmApi->AllocWithHandle(pRmApi,
                                hClientId,
                                hChannelId,
                                hCopyObjectId,
                                pChannel->hTdCopyClass,
                                &createParams,
                                sizeof(createParams)));

    return NV_OK;
}

static NV_STATUS
_memUtilsMapUserd_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvBool         bUseRmApiForBar1
)
{
    //
    // The memTransfer API only works for client-allocated USERD
    // because otherwise we are calling MapToCpu using the channel
    // handle instead.
    //
    if (pChannel->bClientUserd && !bUseRmApiForBar1)
    {
        pChannel->pUserdMemdesc =
            memmgrMemUtilsGetMemDescFromHandle(pMemoryManager, hClientId, pChannel->hUserD);
        NV_ASSERT_OR_RETURN(pChannel->pUserdMemdesc != NULL, NV_ERR_GENERIC);

        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            //
            // GPFIFO access will not be set up, in order to facilitate
            // memTransfer APIs, which will use GSP-DMA/CE with shadow
            // buffers.
            //
            pChannel->pControlGPFifo = NULL;
        }
        else
        {
            pChannel->pControlGPFifo =
                (void *)memmgrMemDescBeginTransfer(pMemoryManager, pChannel->pUserdMemdesc,
                                                   TRANSFER_FLAGS_USE_BAR1);
            NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);
        }
    }
    else
    {
        NvU32   userdSize = 0;
        RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
        kfifoGetUserdSizeAlign_HAL(GPU_GET_KERNEL_FIFO(pGpu), &userdSize, NULL);

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            pRmApi->MapToCpu(pRmApi, hClientId, hDeviceId,
                             pChannel->bClientUserd ? pChannel->hUserD : hChannelId, 0,
                             userdSize, (void **)&pChannel->pControlGPFifo, 0));
    }
    return NV_OK;
}

static NV_STATUS
_memUtilsAllocateUserD
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    OBJCHANNEL    *pChannel
)
{
    NV_STATUS                    rmStatus = NV_OK;
    KernelFifo                  *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    RM_API                      *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NV_MEMORY_ALLOCATION_PARAMS  memAllocParams;
    NvU32                        userdMemClass = NV01_MEMORY_LOCAL_USER;

    // Ensure that call is not made with lock held
    LOCK_ASSERT_AND_RETURN(!rmGpuLockIsOwner());

    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));

    memAllocParams.owner = HEAP_OWNER_RM_CLIENT_GENERIC;
    kfifoGetUserdSizeAlign_HAL(pKernelFifo, (NvU32 *)&memAllocParams.size, NULL);
    memAllocParams.type  = NVOS32_TYPE_IMAGE;
    memAllocParams.internalflags = NVOS32_ALLOC_INTERNAL_FLAGS_SKIP_SCRUB;

    // Apply registry overrides to USERD.
    switch (DRF_VAL(_REG_STR_RM, _INST_LOC, _USERD, pGpu->instLocOverrides))
    {
        case NV_REG_STR_RM_INST_LOC_USERD_NCOH:
        case NV_REG_STR_RM_INST_LOC_USERD_COH:
            userdMemClass = NV01_MEMORY_SYSTEM;
            memAllocParams.attr = DRF_DEF(OS32, _ATTR, _LOCATION, _PCI);
            break;

        case NV_REG_STR_RM_INST_LOC_USERD_VID:
        case NV_REG_STR_RM_INST_LOC_USERD_DEFAULT:
            userdMemClass = NV01_MEMORY_LOCAL_USER;
            memAllocParams.attr = DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM);
            memAllocParams.flags |= NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
            if (!IS_MIG_IN_USE(pGpu))
            {
                memAllocParams.attr |= DRF_DEF(OS32, _ATTR, _ALLOCATE_FROM_RESERVED_HEAP, _YES);
            }
            break;
    }

    //
    // When APM is enabled, all RM internal allocations must go to
    // unprotected memory, irrespective of vidmem or sysmem.
    // When Hopper CC is enabled, RM internal sysmem allocations that need
    // to be accessed by the GPU should be in unprotected memory, but all
    // vidmem allocations must go to protected memory.
    //
    if (gpuIsApmFeatureEnabled(pGpu) ||
        FLD_TEST_DRF(OS32, _ATTR, _LOCATION, _PCI, memAllocParams.attr))
    {
        memAllocParams.attr2 |= DRF_DEF(OS32, _ATTR2, _MEMORY_PROTECTION,
                                        _UNPROTECTED);
    }

    NV_ASSERT_OK_OR_RETURN(pRmApi->AllocWithHandle(pRmApi, hClientId, hDeviceId,
                                                   pChannel->hUserD,
                                                   userdMemClass,
                                                   &memAllocParams,
                                                   sizeof(memAllocParams)));

    return rmStatus;
}

static NV_STATUS
_memUtilsAllocateChannel
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvHandle       hClientId,
    NvHandle       hDeviceId,
    NvHandle       hChannelId,
    NvHandle       hObjectError,
    NvHandle       hObjectBuffer,
    OBJCHANNEL    *pChannel
)
{
    NV_CHANNEL_ALLOC_PARAMS channelGPFIFOAllocParams;
    NV_STATUS               rmStatus =  NV_OK;
    NvU32                   hClass;
    RM_API                 *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NvBool                  bMIGInUse = IS_MIG_IN_USE(pGpu);
    NvU32                   flags = DRF_DEF(OS04, _FLAGS, _CHANNEL_SKIP_SCRUBBER, _TRUE);
    RM_ENGINE_TYPE          engineType = (pChannel->type == SWL_SCRUBBER_CHANNEL) ?
                                RM_ENGINE_TYPE_SEC2 : RM_ENGINE_TYPE_COPY(pChannel->ceId);

    if (pChannel->bSecure)
    {
        flags |= DRF_DEF(OS04, _FLAGS, _CC_SECURE, _TRUE);
    }

    portMemSet(&channelGPFIFOAllocParams, 0, sizeof(NV_CHANNEL_ALLOC_PARAMS));
    channelGPFIFOAllocParams.hObjectError  = hObjectError;
    channelGPFIFOAllocParams.hObjectBuffer = hObjectBuffer;
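    //
    // The GPFIFO ring is placed immediately after the pushbuffer region
    // within the same GPU virtual allocation.
    //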
    channelGPFIFOAllocParams.gpFifoOffset  = pChannel->pbGpuVA + pChannel->channelPbSize;
    channelGPFIFOAllocParams.gpFifoEntries = pChannel->channelNumGpFifioEntries;
    channelGPFIFOAllocParams.hContextShare = NV01_NULL_OBJECT;
    channelGPFIFOAllocParams.flags         = flags;
    channelGPFIFOAllocParams.hVASpace      = pChannel->hVASpaceId;

    //
    // Use GPU instance local Id if MIG is enabled
    // TODO: Maybe we need a VAS for each GPU instance?
    //
    if (bMIGInUse && (pChannel->pKernelMIGGpuInstance != NULL))
    {
        KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
        MIG_INSTANCE_REF ref;
        RM_ENGINE_TYPE localCe;
        RsClient *pClient;
        Device *pDevice;

        NV_ASSERT_OK_OR_RETURN(
            serverGetClientUnderLock(&g_resServ, hClientId, &pClient));

        NV_ASSERT_OK_OR_RETURN(
            deviceGetByHandle(pClient, hDeviceId, &pDevice));

        NV_ASSERT_OK_OR_RETURN(
            kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref));
        // Clear the Compute instance portion, if present
        ref = kmigmgrMakeGIReference(ref.pKernelMIGGpuInstance);
        NV_ASSERT_OK_OR_RETURN(
            kmigmgrGetGlobalToLocalEngineType(pGpu, pKernelMIGManager, ref,
                                              engineType,
                                              &localCe));
        channelGPFIFOAllocParams.engineType = gpuGetNv2080EngineType(localCe);
    }
    else
    {
        channelGPFIFOAllocParams.engineType = gpuGetNv2080EngineType(engineType);
    }

    hClass = kfifoGetChannelClassId(pGpu, GPU_GET_KERNEL_FIFO(pGpu));
    if (!hClass)
    {
        NV_PRINTF(LEVEL_ERROR, "Unable to determine CE's channel class.\n");
        return NV_ERR_GENERIC;
    }

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    if (pChannel->bClientUserd)
    {
        NV_ASSERT_OK_OR_GOTO(
            rmStatus,
            _memUtilsAllocateUserD(pGpu,
                                   pMemoryManager,
                                   hClientId,
                                   hDeviceId,
                                   pChannel),
            cleanup);

        SLI_LOOP_START(SLI_LOOP_FLAGS_BC_ONLY)
        channelGPFIFOAllocParams.hUserdMemory[gpumgrGetSubDeviceInstanceFromGpu(pGpu)] = pChannel->hUserD;
        channelGPFIFOAllocParams.userdOffset[gpumgrGetSubDeviceInstanceFromGpu(pGpu)] = 0;
        SLI_LOOP_END
    }

    NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(
        rmStatus,
        pRmApi->AllocWithHandle(pRmApi,
                                hClientId,
                                hDeviceId,
                                hChannelId,
                                hClass,
                                &channelGPFIFOAllocParams,
                                sizeof(channelGPFIFOAllocParams)));

cleanup:
    NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
                                        rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM));

    return rmStatus;
}

/*!
 * Do a non-blocking memset
 *
 * @param[in]     pChannel    OBJCHANNEL pointer
 * @param[in]     base        Offset in FB
 * @param[in]     size        Size to scrub
 * @param[in]     freeToken   Will be returned as a semaphore payload
 * @param[out]    pNumBlocks  Returns the number of blocks that were scrubbed
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsMemSet_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    RmPhysAddr     base,
    NvU64          size,
    NvU32          freeToken,
    NvU32         *pNumBlocks
)
{
    NvU32 blocksPushed = 0;

    if ((size % pChannel->minBlockSize) != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Size should be a multiple of %d\n",
                  pChannel->minBlockSize);
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);
    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);

    if (pChannel->isProgressChecked)
    {
        // if progress is checked, insert the semaphore with freeToken as payload
        pChannel->finishPayload = freeToken;
        _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                                     0, 0, 0,             // src parameters
                                     base, ADDR_FBMEM, 0, // dst parameters
                                     size,
                                     NV_FALSE,            // blocking
                                     NV_TRUE,             // insertFinishPayload
                                     NV_FALSE);           // memcopy
    }
    else
    {
        // issue a standard async scrub
        blocksPushed = _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                           0, 0, 0,             // src parameters
                           base, ADDR_FBMEM, 0, // dst parameters
                           size,
                           NV_FALSE,            // blocking
                           NV_FALSE,            // insertFinishPayload
                           NV_FALSE);           // memcopy
    }
    *pNumBlocks = blocksPushed;
    return NV_OK;
}

/*!
 * Do a blocking memset
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @param[in]     base       Offset in FB
 * @param[in]     size       Size to scrub
 * @returns NV_STATUS
 */

NV_STATUS
memmgrMemUtilsMemSetBlocking_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    RmPhysAddr     base,
    NvU64          size
)
{
    NvU32 blocksPushed = 0;

    if ((size % pChannel->minBlockSize) != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Size should be a multiple of %d\n",
                  pChannel->minBlockSize);
        DBG_BREAKPOINT();
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, NV_ERR_GENERIC);
    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);

    blocksPushed = _ceChannelScheduleWork_GM107(pGpu, pMemoryManager, pChannel,
                       0, 0, 0,              // src parameters
                       base, ADDR_FBMEM, 0,  // dst parameters
                       size,
                       NV_TRUE,              // blocking
                       NV_FALSE,             // insertFinishPayload
                       NV_FALSE);            // memcopy

    if (blocksPushed > 0)
    {
        NvU8     *semAddr       = pChannel->pbCpuVA + pChannel->semaOffset;
        NV_STATUS timeoutStatus = NV_OK;
        RMTIMEOUT timeout;

        gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

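        //
        // Poll the semaphore location until the CE releases the payload of
        // the last block pushed, or until the RM timeout expires.
        //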
1292         while (MEM_RD32(semAddr) != pChannel->lastPayloadPushed)
1293         {
1294             NV_PRINTF(LEVEL_INFO, "Semaphore Payload is 0x%x last is 0x%x\n",
1295                       MEM_RD32(semAddr), pChannel->lastPayloadPushed);
1296 
1297             if (timeoutStatus == NV_ERR_TIMEOUT)
1298             {
1299                 NV_PRINTF(LEVEL_ERROR,
1300                           "Timed Out waiting for CE semaphore\n");
1301 
1302                 NV_PRINTF(LEVEL_ERROR,
1303                           "GET=0x%x, PUT=0x%x, GPGET=0x%x, GPPUT=0x%x\n",
1304                           pChannel->pControlGPFifo->Get,
1305                           pChannel->pControlGPFifo->Put,
1306                           pChannel->pControlGPFifo->GPGet,
1307                           pChannel->pControlGPFifo->GPPut);
1308 
1309                 DBG_BREAKPOINT_REASON(NV_ERR_TIMEOUT);
1310                 return NV_ERR_GENERIC;
1311             }
1312 
1313             timeoutStatus = gpuCheckTimeout(pGpu, &timeout);
1314          }
1315     }
1316 
1317     return NV_OK;
1318 }
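
/*
 * Illustrative sketch (editorial, not driver code) of the completion wait
 * used above; names mirror the OBJCHANNEL fields:
 *
 *   gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
 *   while (MEM_RD32(semAddr) != pChannel->lastPayloadPushed)
 *   {
 *       if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT)
 *           return NV_ERR_GENERIC;   // log GET/PUT state and give up
 *   }
 *
 * The payload the CE releases is the PB offset pushed with the semaphore
 * methods, so equality means every method up to that offset has executed.
 */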

/*!
 * This function allocates the ECC scrubber
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsAllocateEccScrubber_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    NV_ASSERT_OK_OR_RETURN(channelAllocSubdevice(pGpu, pChannel));

    memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pChannel);

    memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pChannel);

    _memUtilsAllocateReductionSema(pGpu, pMemoryManager, pChannel);

    return NV_OK;
}

/*!
 * This function allocates the ECC allocation scrubber and
 * dupes the bitmap semaphore which is used for sync
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsAllocateEccAllocScrubber_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel
)
{
    OBJSCRUB                   *pEccTD           = &pMemoryManager->eccScrubberState;
    OBJCHANNEL                 *pEccSyncChannel  = &pEccTD->allocationScrubberState;
    OBJCHANNEL                 *pEccAsyncChannel = &pEccTD->tdHeapState;
    NV_MEMORY_ALLOCATION_PARAMS memAllocParams;
    NV_STATUS                   lockStatus;
    RM_API                     *pRmApi           = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    NV_ASSERT_OK_OR_RETURN(channelAllocSubdevice(pGpu, pChannel));

    memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pEccSyncChannel);
    memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pEccSyncChannel);

    rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);

    // dup the reduction sema bitmap object
    NV_ASSERT_OK(
        pRmApi->DupObject(pRmApi,
                          pEccSyncChannel->hClient,
                          pEccSyncChannel->deviceId,
                          &pEccSyncChannel->bitMapSemPhysId,
                          pEccAsyncChannel->hClient,
                          pEccAsyncChannel->bitMapSemPhysId,
                          0));

    // allocate virtual memory for the bitmap semaphore (one bit per block,
    // rounded up to whole 32-bit words)
    portMemSet(&memAllocParams, 0, sizeof(memAllocParams));
    memAllocParams.owner     = HEAP_OWNER_RM_CLIENT_GENERIC;
    memAllocParams.type      = NVOS32_TYPE_IMAGE;
    memAllocParams.size      = ((pEccSyncChannel->blockCount + 31) / 32) * 4;
    memAllocParams.attr      = NVOS32_ATTR_NONE;
    memAllocParams.attr2     = NVOS32_ATTR2_NONE;
    memAllocParams.flags     = 0;
    memAllocParams.flags    |= NVOS32_ALLOC_FLAGS_VIRTUAL;

    NV_ASSERT_OK(
        pRmApi->AllocWithHandle(pRmApi,
                                pEccSyncChannel->hClient,
                                pEccSyncChannel->deviceId,
                                pEccSyncChannel->bitMapSemVirtId,
                                NV50_MEMORY_VIRTUAL,
                                &memAllocParams,
                                sizeof(memAllocParams)));

    lockStatus = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_MEM);

    if (lockStatus != NV_OK)
    {
        NV_ASSERT_FAILED("Could not get back lock after allocating reduction sema");
        return NV_ERR_GENERIC;
    }

    NV_ASSERT_OK(
        pRmApi->Map(pRmApi,
                    pEccSyncChannel->hClient,
                    pEccSyncChannel->deviceId,
                    pEccSyncChannel->bitMapSemVirtId,
                    pEccSyncChannel->bitMapSemPhysId, // hMemory
                    0,
                    ((pEccSyncChannel->blockCount + 31) / 32) * 4,
                    NV04_MAP_MEMORY_FLAGS_NONE,
                    &pEccSyncChannel->pbGpuBitMapVA));

    pEccSyncChannel->pbBitMapVA = pEccAsyncChannel->pbBitMapVA;

    return NV_OK;
}
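
/*
 * Worked example (editorial): the bitmap semaphore carries one bit per block,
 * rounded up to whole 32-bit words. For blockCount = 100:
 *
 *   size = ((100 + 31) / 32) * 4 = 4 * 4 = 16 bytes
 *
 * i.e. four NvU32 words, enough for up to 128 blocks.
 */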

/*!
 * This function calculates the available space in the PB,
 * based on reading the semaphore that holds the previous
 * PUT pointer where methods were inserted
 *
 * @param[in]     pChannel  OBJCHANNEL pointer
 * @returns Available space in bytes
 */
static NvU32
_getSpaceInPb(OBJCHANNEL *pChannel)
{
    NvU32 filledSpace;
    NvU32 avlblSpace;

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, 0);

    if (pChannel->channelPutOffset >= MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset))
    {
        filledSpace = (pChannel->channelPutOffset - MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset));
        avlblSpace  = pChannel->channelPbSize - filledSpace;
    }
    else
    {
        avlblSpace = (MEM_RD32((NvU8*)pChannel->pbCpuVA + pChannel->semaOffset) - pChannel->channelPutOffset);
    }

    NV_PRINTF(LEVEL_INFO, "Space in PB is %d\n", avlblSpace);

    return avlblSpace;
}
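
/*
 * Worked example (editorial): the PB is a ring buffer, with the semaphore
 * holding the last PUT offset the CE has completed. With channelPbSize =
 * 0x10000, semaphore = 0x2000 and channelPutOffset = 0x6000:
 *
 *   filledSpace = 0x6000 - 0x2000  = 0x4000
 *   avlblSpace  = 0x10000 - 0x4000 = 0xC000
 *
 * If PUT has wrapped behind the semaphore (PUT = 0x1000, sema = 0x2000),
 * only the gap up to the semaphore is free:
 *
 *   avlblSpace = 0x2000 - 0x1000 = 0x1000
 *
 * so the producer never overwrites methods the CE has not yet fetched.
 */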

/*!
 * This function manages the push buffer.
 * It inserts methods into the PB, manages wrap-around,
 * and decides when to add non-stall interrupts
 * and extra token semaphores
 *
 * @param[in]     pChannel            OBJCHANNEL pointer
 * @param[in]     src                 Offset of src to copy from
 * @param[in]     srcAddressSpace     source surface address space type
 * @param[in]     srcCpuCacheAttrib   source surface address space attributes
 * @param[in]     dst                 Offset of dst to scrub/copy to
 * @param[in]     dstAddressSpace     destination surface address space type
 * @param[in]     dstCpuCacheAttrib   destination surface address space attributes
 * @param[in]     size                size to scrub/copy
 * @param[in]     blocking            blocking will not insert non-stall interrupts
 * @param[in]     insertFinishPayload insert a token for the last block submitted
 * @param[in]     bMemcopy            NV_TRUE for memory copy / NV_FALSE for scrubbing
 * @returns Number of blocks pushed
 */
static NvU32
_ceChannelScheduleWork_GM107
(
    OBJGPU          *pGpu,
    MemoryManager   *pMemoryManager,
    OBJCHANNEL      *pChannel,
    RmPhysAddr       src,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    RmPhysAddr       dst,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU64            size,
    NvBool           blocking,
    NvBool           insertFinishPayload,
    NvBool           bMemcopy
)
{
    RMTIMEOUT        timeout;
    NvU32            spaceInPb;
    NvU32            numBytes;
    NvU32           *ptr;
    NvU32            gpBase;
    NvU32            semaCount = 0;
    NvBool           addNonStallIntr = NV_FALSE;
    NvU32            blocksPushed = 0;
    NvBool           addFinishPayload;
    NvU32            blockSize = 0;

    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, 0);
    NV_ASSERT_OR_RETURN(pChannel->pControlGPFifo != NULL, 0);

    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);

    spaceInPb = _getSpaceInPb(pChannel);

    NV_PRINTF(LEVEL_INFO, "Space in PB is %d and starting fill at 0x%x\n",
              spaceInPb, pChannel->channelPutOffset);

    ptr = (NvU32 *)(pChannel->pbCpuVA + pChannel->channelPutOffset);
    gpBase = pChannel->channelPutOffset;
    numBytes = 0;
    do
    {
        // while we have space greater than one block
        while (spaceInPb > (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD))
        {
            // if inserting one more block would exceed the PB size then wrap around to the beginning
            if ((pChannel->channelPutOffset + (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD)) > pChannel->channelPbSize)
            {
                NV_PRINTF(LEVEL_INFO, "Wrap numBytes %d\n", numBytes);
                // submit to GPFIFO with numBytes and wrap around the put offset
                if (numBytes > 0)
                {
                    _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, gpBase, numBytes);
                }
                pChannel->channelPutOffset = 0;
                ptr = (NvU32 *)(pChannel->pbCpuVA + pChannel->channelPutOffset);
                gpBase = 0;
                numBytes = 0;
                // update the available space
                spaceInPb = _getSpaceInPb(pChannel);
                NV_PRINTF(LEVEL_INFO, "Wrapping PB around\n");
                continue;
            }

            blockSize  = (size > pChannel->maxBlockSize) ?
                         pChannel->maxBlockSize : (NvU32) size;

            // add a non-stall interrupt every eighth of the PB size, or when we insert the last block
            if ((semaCount > (pChannel->channelPbSize >> 3)) || (size <= pChannel->maxBlockSize))
            {
                addNonStallIntr = NV_TRUE;
                semaCount = 0;
            }
            else
            {
                addNonStallIntr = NV_FALSE;
            }
            // the finish payload corresponds to inserting a token for every call to scrub that finishes
            if (insertFinishPayload && (size <= pChannel->maxBlockSize))
            {
                addFinishPayload = NV_TRUE;
                NV_PRINTF(LEVEL_INFO, "Inserting finish payload\n");
            }
            else
            {
                addFinishPayload = NV_FALSE;
            }
            if (_checkSynchronization(pGpu, pMemoryManager, pChannel, BLOCK_INDEX_FROM_ADDR(dst, pChannel->blockShift)))
            {
                NvU32 bytesPushed = _ceChannelPushMethodsBlock_GM107(pGpu, pMemoryManager, pChannel,
                    src, srcAddressSpace, srcCpuCacheAttrib, // src parameters
                    dst, dstAddressSpace, dstCpuCacheAttrib, // dst parameters
                    blockSize, &ptr, NV_TRUE, (addNonStallIntr && !blocking),
                    addFinishPayload, bMemcopy);
                spaceInPb = spaceInPb - bytesPushed;
                numBytes  = numBytes + bytesPushed;
                semaCount = semaCount + bytesPushed;
                blocksPushed++;
            }

            dst += (NvU64) blockSize;
            if (bMemcopy)
                src += (NvU64) blockSize;
            size -= (NvU64) blockSize;

            // we are done pushing all methods
            if (size == 0)
            {
                _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, gpBase, numBytes);
                return blocksPushed;
            }
        }
        spaceInPb = _getSpaceInPb(pChannel);
        if (spaceInPb <= (pChannel->methodSizePerBlock + MAX_EXTRA_PAYLOAD))
        {
            // no space in the PB to push all blocks, so submit what we have and wait for space
            if (numBytes > 0)
            {
                _ceChannelUpdateGpFifo_GM107(pGpu, pMemoryManager, pChannel, gpBase, numBytes);
            }
            gpBase = pChannel->channelPutOffset;
            numBytes = 0;
        }
        if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT)
        {
            NV_ASSERT_FAILED("Timed out waiting for space in PB!");
            // this function returns a block count, not an NV_STATUS
            return blocksPushed;
        }
    } while (1);
}
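
/*
 * Worked example (editorial): a 10 MB scrub with maxBlockSize = 4 MB is
 * pushed as three blocks (4 MB, 4 MB, 2 MB); dst advances by blockSize after
 * each push (src too, for copies), and the GPFIFO entry covering the
 * accumulated methods is written either when size reaches zero or when the
 * PB is about to wrap.
 */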

/*!
 * This function checks if the block has already been submitted
 * or scrubbed, based on two bitmaps: a pending bitmap
 * updated by the CPU and a "finished" bitmap updated by
 * the GPU
 *
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @param[in]     block      block number
 *
 * @returns NV_TRUE if the block still needs to be scrubbed
 */
static NvBool
_checkSynchronization
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvU32          block
)
{
    NvU32 blockSema;

    if (!pChannel->isChannelSynchronized)
    {
        // synchronization is not required for this channel
        return NV_TRUE;
    }

    blockSema = MEM_RD32((NvU8*)pChannel->pbBitMapVA + ((block / 32) * 4));

    if ((blockSema & (1 << (block % 32))) == 0)
    {
        if (((pChannel->pBlockPendingState[block / 32] & (1 << (block % 32))) == 0) &&
            ((pChannel->pBlockDoneState[block / 32] & (1 << (block % 32))) == 0))
        {
            pChannel->pBlockPendingState[block / 32] |= (1 << (block % 32));
            return NV_TRUE;
        }
    }

    return NV_FALSE;
}
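
/*
 * Worked example (editorial): for block = 37 the bitmaps are indexed as
 *
 *   word offset = (37 / 32) * 4  = 4 bytes
 *   bit mask    = 1 << (37 % 32) = 1 << 5
 *
 * The block is scheduled only if the GPU bitmap, the pending bitmap and the
 * done bitmap all have that bit clear; the pending bit is then set so the
 * same block is never submitted twice.
 */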

/*!
 * Updates the GPFIFO with the methods in the PB for
 * the given channel
 * @param[in]     pChannel   OBJCHANNEL pointer
 * @param[in]     gpOffset   Offset in the PB
 * @param[in]     gpSize     Size of segment
 * @returns None
 */
static void
_ceChannelUpdateGpFifo_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    OBJCHANNEL    *pChannel,
    NvU32          gpOffset,
    NvU32          gpSize
)
{
    RMTIMEOUT        timeout;
    NvU32            GPPut;
    NvU32            GPGet;
    NvU64            get;
    NvU32            length;
    NvU32           *pGpEntry;
    NvU32            GpEntry0;
    NvU32            GpEntry1;
    NvU32            GPPutNext;
    NvU32            workSubmitToken = 0;
    KernelChannel   *pFifoKernelChannel;
    KernelFifo      *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);

    NV_ASSERT_OR_RETURN_VOID(pChannel->pbCpuVA != NULL);
    NV_ASSERT_OR_RETURN_VOID(pChannel->pControlGPFifo != NULL);

    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
    GPPut = MEM_RD32(&pChannel->pControlGPFifo->GPPut);
    GPGet = MEM_RD32(&pChannel->pControlGPFifo->GPGet);

    GPPutNext = (GPPut + 1) % pChannel->channelNumGpFifioEntries;

    NV_PRINTF(LEVEL_INFO, "Put %d Get %d PutNext %d\n", GPPut, GPGet,
              GPPutNext);

    NV_PRINTF(LEVEL_INFO, "gp Base 0x%x, Size %d\n", (NvU32)(gpOffset),
              gpSize);

    // if the size passed is zero do not update GPPut
    if (gpSize == 0)
        return;

    if (GPPut >= pChannel->channelNumGpFifioEntries)
    {
        // if the Put pointer is invalid, the GPU is likely inaccessible
        NV_PRINTF(LEVEL_INFO, "invalid Put %u >= %u\n", GPPut,
                  pChannel->channelNumGpFifioEntries);
        return;
    }

    while (GPPutNext == GPGet)
    {
        // need to wait for space
        GPGet = MEM_RD32(&pChannel->pControlGPFifo->GPGet);

        if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT)
        {
            NV_ASSERT_FAILED("Timed out waiting for space in GPFIFO!");
            return;
        }
        else if (GPGet >= pChannel->channelNumGpFifioEntries)
        {
            // if the Get pointer is invalid, the GPU is likely inaccessible
            NV_PRINTF(LEVEL_INFO, "invalid Get %u >= %u\n", GPGet,
                      pChannel->channelNumGpFifioEntries);
            return;
        }
    }

    get = pChannel->pbGpuVA + gpOffset;
    length = gpSize;

    GpEntry0 =
       DRF_DEF(906F, _GP_ENTRY0, _NO_CONTEXT_SWITCH, _FALSE) |
       DRF_NUM(906F, _GP_ENTRY0, _GET, NvU64_LO32(get) >> 2);
    GpEntry1 =
       DRF_NUM(906F, _GP_ENTRY1, _GET_HI, NvU64_HI32(get)) |
       DRF_NUM(906F, _GP_ENTRY1, _LENGTH, length >> 2) |
       DRF_DEF(906F, _GP_ENTRY1, _PRIV, _USER) |
       DRF_DEF(906F, _GP_ENTRY1, _LEVEL, _MAIN);

    pGpEntry = (NvU32 *)(((NvU8*)pChannel->pbCpuVA) + pChannel->channelPbSize +
        GPPut * NV906F_GP_ENTRY__SIZE);

    MEM_WR32(&pGpEntry[0], GpEntry0);
    MEM_WR32(&pGpEntry[1], GpEntry1);

    // need to flush the write-combine buffer
    osFlushCpuWriteCombineBuffer();

    // write GPPut
    MEM_WR32(&pChannel->pControlGPFifo->GPPut, GPPutNext);
    osFlushCpuWriteCombineBuffer();

    if (kfifoIsLiteModeEnabled_HAL(pGpu, pKernelFifo))
    {
        NV_ASSERT_OR_RETURN_VOID(0);
    }
    else
    {
        workSubmitToken = pChannel->workSubmitToken;
        NV_ASSERT_OR_RETURN_VOID(CliGetKernelChannelWithDevice(pChannel->pRsClient,
                                 pChannel->deviceId, pChannel->channelId,
                                 &pFifoKernelChannel) == NV_OK);
    }
    if (!kchannelIsRunlistSet(pGpu, pFifoKernelChannel))
    {
        NV_PRINTF(LEVEL_ERROR,
                  "FAILED Channel 0x%x is not assigned to a runlist yet\n",
                  kchannelGetDebugTag(pFifoKernelChannel));
        return;
    }
    // update the doorbell register
    kfifoUpdateUsermodeDoorbell_HAL(pGpu, pKernelFifo, workSubmitToken, kchannelGetRunlistId(pFifoKernelChannel));
}
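
/*
 * Worked example (editorial): a GP entry describes a PB segment by GPU VA
 * and length, both in 4-byte units. For get = 0x120345000 and gpSize = 0x100:
 *
 *   GP_ENTRY0 _GET    = NvU64_LO32(get) >> 2 = 0x20345000 >> 2 = 0x080D1400
 *   GP_ENTRY1 _GET_HI = NvU64_HI32(get)      = 0x1
 *   GP_ENTRY1 _LENGTH = 0x100 >> 2           = 0x40 words
 *
 * which is why both the low address bits and the length are shifted right
 * by 2 before being packed into the entry.
 */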

/*!
 * Inserts the source and destination aperture methods
 * into the push buffer for one block
 *
 * @param[in]     pChannel          OBJCHANNEL pointer
 * @param[in]     srcAddressSpace   source surface address space type
 * @param[in]     srcCpuCacheAttrib source surface address space attributes
 * @param[in]     dstAddressSpace   destination surface address space type
 * @param[in]     dstCpuCacheAttrib destination surface address space attributes
 * @param[in,out] pPtr              Double pointer to PB offset
 * @returns None
 */
static void
_ceChannelPushMethodAperture_GM107
(
    OBJCHANNEL      *pChannel,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU32          **pPtr
)
{
    NvU32 *ptr  = *pPtr;
    NvU32  data = 0;

    // Set source parameters
    data = ((srcAddressSpace == ADDR_FBMEM) ? DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _LOCAL_FB) :
        (srcCpuCacheAttrib == NV_MEMORY_CACHED) ? DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _COHERENT_SYSMEM) :
            DRF_DEF(B0B5, _SET_SRC_PHYS_MODE, _TARGET, _NONCOHERENT_SYSMEM));

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_SRC_PHYS_MODE, data);

    // Set destination parameters
    data = ((dstAddressSpace == ADDR_FBMEM) ? DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _LOCAL_FB) :
        (dstCpuCacheAttrib == NV_MEMORY_CACHED) ? DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _COHERENT_SYSMEM) :
            DRF_DEF(B0B5, _SET_DST_PHYS_MODE, _TARGET, _NONCOHERENT_SYSMEM));

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_DST_PHYS_MODE, data);

    *pPtr = ptr;
}
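
/*
 * Editorial note: the aperture selection above reduces to this table for
 * each of the source and destination surfaces:
 *
 *   address space   CPU cache attrib    PHYS_MODE target
 *   -------------   ----------------    -------------------
 *   ADDR_FBMEM      (any)               _LOCAL_FB
 *   other           NV_MEMORY_CACHED    _COHERENT_SYSMEM
 *   other           uncached            _NONCOHERENT_SYSMEM
 */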

/*!
 * Inserts methods into the push buffer for one block
 *
 * @param[in]     pChannel          OBJCHANNEL pointer
 * @param[in]     src               Offset of src to copy from
 * @param[in]     srcAddressSpace   source surface address space type
 * @param[in]     srcCpuCacheAttrib source surface address space attributes
 * @param[in]     dst               Offset of dst to scrub/copy to
 * @param[in]     dstAddressSpace   destination surface address space type
 * @param[in]     dstCpuCacheAttrib destination surface address space attributes
 * @param[in]     size              size of the region to scrub/copy
 * @param[in,out] pPtr              Double pointer to PB offset
 * @param[in]     addPayloadSema    Bool to add the default payload semaphore
 * @param[in]     addNonStallIntr   Bool to add a non-stall interrupt at the end
 * @param[in]     addFinishPayload  Bool to add an extra sema release for the token
 * @param[in]     bMemcopy          NV_TRUE for memcopy / NV_FALSE for scrubbing
 * @returns Length of methods inserted, in bytes
 */
static NvU32
_ceChannelPushMethodsBlock_GM107
(
    OBJGPU          *pGpu,
    MemoryManager   *pMemoryManager,
    OBJCHANNEL      *channel,
    RmPhysAddr       src,
    NV_ADDRESS_SPACE srcAddressSpace,
    NvU32            srcCpuCacheAttrib,
    RmPhysAddr       dst,
    NV_ADDRESS_SPACE dstAddressSpace,
    NvU32            dstCpuCacheAttrib,
    NvU64            size,
    NvU32          **pPtr,
    NvBool           addPayloadSema,
    NvBool           addNonStallIntr,
    NvBool           addFinishPayload,
    NvBool           bMemcopy
)
{
    NvU32  launchParams       = 0;
    NvU32 *ptr                = *pPtr;
    NvU32 *pStartPtr          = ptr;
    NvBool addReductionOp     = channel->isChannelSynchronized;
    NvBool bMemoryScrubEnable = NV_FALSE;
    NvU32  remapConstB        = 0;
    NvU32  remapComponentSize = 0;

    NV_PRINTF(LEVEL_INFO, "Base = 0x%llx, Size = 0x%llx, PB location = %p\n",
              dst, size, ptr);

    PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NV906F_SET_OBJECT, channel->classEngineID);

    if (size > 0)
    {
        NvU32 payLoad = channel->channelPutOffset + channel->methodSizePerBlock;

        if (addNonStallIntr)  payLoad = payLoad + NONSTALL_METHOD_SIZE;
        if (addReductionOp)   payLoad = payLoad + SEMAPHORE_ONLY_METHOD_SIZE;
        if (addFinishPayload) payLoad = payLoad + SEMAPHORE_ONLY_METHOD_SIZE;

        if (addPayloadSema)
        {
            memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
                NVA06F_SUBCHANNEL_COPY_ENGINE,
                channel->pbGpuVA + channel->semaOffset, payLoad, &ptr);

            NV_PRINTF(LEVEL_INFO, "Pushing semaphore payload 0x%x\n", payLoad);
            channel->lastPayloadPushed = payLoad;
        }

        if (IS_SIMULATION(pGpu))
        {
            //
            // fmodel CE is slow (compared to emulation) so we don't bother
            // scrubbing the whole block. Fmodel already scrubs memory via ramif
            // so we'll never get exceptions
            //
            size = NV_MIN(size, 0x20);
        }

        memmgrChannelPushAddressMethodsBlock_HAL(pMemoryManager, NV_FALSE,
            NVA06F_SUBCHANNEL_COPY_ENGINE, dst, &ptr);

        if (bMemcopy)
        {
            memmgrChannelPushAddressMethodsBlock_HAL(pMemoryManager, NV_TRUE,
                NVA06F_SUBCHANNEL_COPY_ENGINE, src, &ptr);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size));
        }
        else
        {
            bMemoryScrubEnable = memmgrMemUtilsCheckMemoryFastScrubEnable_HAL(pGpu,
                                                   pMemoryManager,
                                                   channel->hTdCopyClass,
                                                   channel->bUseVasForCeCopy,
                                                   dst,
                                                   NvU64_LO32(size),
                                                   dstAddressSpace);
            if (bMemoryScrubEnable)
            {
                NV_PRINTF(LEVEL_INFO, "Using fast memory scrubber\n");
                remapConstB        = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _DST_X, _CONST_B);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_CONST_B, 0x00000000);

                remapComponentSize = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _COMPONENT_SIZE, _ONE);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size));
            }
            else
            {
                remapComponentSize = DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _COMPONENT_SIZE, _FOUR);
                PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_LENGTH_IN, NvU64_LO32(size >> 2));
            }

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_COMPONENTS,
                                            DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _DST_X, _CONST_A)              |
                                            DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _NUM_SRC_COMPONENTS, _ONE)     |
                                            DRF_DEF(B0B5, _SET_REMAP_COMPONENTS, _NUM_DST_COMPONENTS, _ONE)     |
                                            remapComponentSize                                                  |
                                            remapConstB);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_SET_REMAP_CONST_A, 0x00000000);

            NV_ASSERT(srcAddressSpace == 0);
            NV_ASSERT(dstAddressSpace == ADDR_FBMEM);

            srcAddressSpace = ADDR_FBMEM;
        }

        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LINE_COUNT, 1);

        _ceChannelPushMethodAperture_GM107(channel, srcAddressSpace, srcCpuCacheAttrib, dstAddressSpace, dstCpuCacheAttrib, &ptr);

        launchParams = DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_TYPE, _PHYSICAL) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_TYPE, _PHYSICAL) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _PIPELINED);

        if (addPayloadSema)
        {
            launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                            DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE);
        }
        else
        {
            launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _NONE);
        }

        if (bMemoryScrubEnable)
        {
            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVC8B5_SET_MEMORY_SCRUB_PARAMETERS,
                          DRF_DEF(C8B5, _SET_MEMORY_SCRUB_PARAMETERS, _DISCARDABLE, _FALSE));

            launchParams |= DRF_DEF(C8B5, _LAUNCH_DMA, _MEMORY_SCRUB_ENABLE, _TRUE);
            launchParams |= DRF_DEF(C8B5, _LAUNCH_DMA, _REMAP_ENABLE, _FALSE);

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVC8B5_LAUNCH_DMA, launchParams);
        }
        else
        {
            if (!bMemcopy)
            {
                launchParams |= DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE);
            }

            PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
        }
    }

    if (addReductionOp)
    {
        NvU32 currentBlock = BLOCK_INDEX_FROM_ADDR((dst), channel->blockShift);
        NvU32 blockOffset;
        NvU32 bitFlip;

        blockOffset = (currentBlock / 32) * 4;
        bitFlip     = ((NvU32)1 << (currentBlock % 32));
        memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
            NVA06F_SUBCHANNEL_COPY_ENGINE,
            channel->pbGpuBitMapVA + blockOffset, bitFlip, &ptr);

        launchParams = DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION_ENABLE, _TRUE) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION_SIGN, _UNSIGNED) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_REDUCTION, _IOR) |
                       DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _NONE);
        // push only the second semaphore release
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
    }

    if (addFinishPayload)
    {
        memmgrChannelPushSemaphoreMethodsBlock_HAL(pMemoryManager,
                NVA06F_SUBCHANNEL_COPY_ENGINE,
                channel->pbGpuVA + channel->finishPayloadOffset,
                channel->finishPayload, &ptr);

        launchParams =  DRF_DEF(B0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE, _RELEASE_ONE_WORD_SEMAPHORE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _INTERRUPT_TYPE, _NONE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _FLUSH_ENABLE, _TRUE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _REMAP_ENABLE, _TRUE) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT, _PITCH) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT, _PITCH) |
                        DRF_DEF(B0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _NONE);
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NVB0B5_LAUNCH_DMA, launchParams);
        NV_PRINTF(LEVEL_INFO, "Pushing finishing semaphore payload 0x%x\n",
                  channel->finishPayload);
    }

    if (addNonStallIntr)
    {
        PUSH_PAIR(NVA06F_SUBCHANNEL_COPY_ENGINE, NV906F_NON_STALL_INTERRUPT, 0);
    }

    channel->channelPutOffset = (NvU32)((NvU8 *)ptr - (NvU8 *)channel->pbCpuVA);
    *pPtr = ptr;

    // return the length of the methods inserted
    return (NvU32)((NvU8*)ptr - (NvU8*)pStartPtr);
}
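
/*
 * Worked example (editorial): the semaphore payload pushed for a block is
 * the PB offset at which that block's methods end. With channelPutOffset =
 * 0x1000, methodSizePerBlock = 0x200, plus a non-stall interrupt (8 bytes)
 * and a finish token (32 bytes):
 *
 *   payLoad = 0x1000 + 0x200 + 8 + 32 = 0x1228
 *
 * _getSpaceInPb() later reads this value back to learn how far the CE has
 * progressed through the PB.
 */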

/*!
 * Gets the Copy Engine class
 *
 * @param[in]     pGpu         OBJGPU pointer
 * @param[out]    pClass       pointer to the returned class
 * @returns NV_STATUS
 */
NV_STATUS
memmgrMemUtilsGetCopyEngineClass_GM107
(
    OBJGPU        *pGpu,
    MemoryManager *pMemoryManager,
    NvU32         *pClass
)
{
    NV_STATUS status;
    NvU32 numClasses;
    NvU32 *pClassList = NULL;
    NvU32 i;
    NvU32 class = 0;
    NvU32 eng;

    //
    // Pascal+ chips can have any combination of the 6 CEs
    // available. Loop over all the CEs to get the CE class
    // for the first available CE instead of using ENG_CE(0)
    //
    for (eng = 0; eng < ENG_CE__SIZE_1; eng++)
    {
        NV_ASSERT_OK_OR_ELSE(
            status,
            gpuGetClassList(pGpu, &numClasses, NULL, ENG_CE(eng)),
            return status);

        if (numClasses > 0)
        {
            break;
        }
    }

    pClassList = portMemAllocNonPaged(sizeof(*pClassList) * numClasses);
    NV_ASSERT_OR_RETURN((pClassList != NULL), NV_ERR_NO_MEMORY);

    if (NV_OK == gpuGetClassList(pGpu, &numClasses, pClassList, ENG_CE(eng)))
    {
        for (i = 0; i < numClasses; i++)
        {
            class = NV_MAX(class, pClassList[i]);
        }
    }

    NV_ASSERT(class != 0);
    portMemFree(pClassList);
    *pClass = class;

    return NV_OK;
}
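
/*
 * Usage sketch (editorial; a hypothetical caller, assuming the usual _HAL
 * dispatch for this function):
 *
 *   NvU32 ceClass;
 *   NV_ASSERT_OK_OR_RETURN(
 *       memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &ceClass));
 *   // ceClass now holds the highest-numbered DMA copy class reported for
 *   // the first CE with a non-empty class list, e.g. MAXWELL_DMA_COPY_A.
 */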