/*
 * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


#include "core/core.h"
#include "gpu/ce/kernel_ce.h"
#include "gpu/bus/kern_bus.h"
#include "kernel/gpu/intr/intr.h"
#include "kernel/gpu/fifo/kernel_fifo.h"
#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "kernel/gpu/mem_mgr/channel_utils.h"
#include "rmapi/rs_utils.h"
#include "utils/nvassert.h"
#include "core/prelude.h"
#include "core/locks.h"
#include "gpu/mem_mgr/ce_utils.h"
#include "gpu/subdevice/subdevice.h"
#include "kernel/gpu/mem_mgr/ce_utils_sizes.h"

#include "class/clb0b5.h" // MAXWELL_DMA_COPY_A
#include "class/clc0b5.h" // PASCAL_DMA_COPY_A
#include "class/clc1b5.h" // PASCAL_DMA_COPY_B
#include "class/clc3b5.h" // VOLTA_DMA_COPY_A
#include "class/clc5b5.h" // TURING_DMA_COPY_A
#include "class/clc8b5.h" // HOPPER_DMA_COPY_A
#include "class/clc86f.h" // HOPPER_CHANNEL_GPFIFO_A

#include "class/cl0080.h"

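/*
 * ceutilsConstruct_IMPL
 *
 * Builds the CeUtils channel state: allocates an internal RM client (unless
 * the caller already supplied one via pCeUtils->hClient), sets up OBJCHANNEL
 * bookkeeping, selects a regular CE or fast-scrubber channel type, and
 * initializes the channel and copy engine. On failure, partially constructed
 * state is unwound through the free_channel/free_client/cleanup labels below.
 */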
NV_STATUS
ceutilsConstruct_IMPL
(
    CeUtils                      *pCeUtils,
    OBJGPU                       *pGpu,
    KERNEL_MIG_GPU_INSTANCE      *pKernelMIGGPUInstance,
    NV0050_ALLOCATION_PARAMETERS *pAllocParams
)
{
    NV_STATUS status = NV_OK;
    NvU64 allocFlags = pAllocParams->flags;
    NV_ASSERT_OR_RETURN(pGpu, NV_ERR_INVALID_STATE);

    NvBool bMIGInUse = IS_MIG_IN_USE(pGpu);
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

    // Allocate channel with RM internal client
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    RmClient *pClient;

    OBJCHANNEL *pChannel = (OBJCHANNEL *) portMemAllocNonPaged(sizeof(OBJCHANNEL));
    if (pChannel == NULL)
    {
        return NV_ERR_INSUFFICIENT_RESOURCES;
    }

    portMemSet(pChannel, 0, sizeof(OBJCHANNEL));

    if (pCeUtils->hClient == NV01_NULL_OBJECT)
    {
        // Allocate client
        status = pRmApi->AllocWithHandle(pRmApi, NV01_NULL_OBJECT, NV01_NULL_OBJECT,
                                         NV01_NULL_OBJECT, NV01_ROOT, &pCeUtils->hClient,
                                         sizeof(pCeUtils->hClient));
        NV_ASSERT_OR_GOTO(status == NV_OK, cleanup);
    }

    pChannel->hClient = pCeUtils->hClient;
    pClient = serverutilGetClientUnderLock(pChannel->hClient);
    NV_ASSERT_OR_GOTO(pClient != NULL, free_client);

    status = serverGetClientUnderLock(&g_resServ, pChannel->hClient, &pChannel->pRsClient);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_client);

    status = clientSetHandleGenerator(staticCast(pClient, RsClient), 1U, ~0U - 1U);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_client);

    pChannel->bClientAllocated = NV_TRUE;
    pChannel->pGpu = pGpu;

    pChannel->deviceId = pCeUtils->hDevice;
    pChannel->subdeviceId = pCeUtils->hSubdevice;

    pChannel->pKernelMIGGpuInstance = pKernelMIGGPUInstance;

    // We'll allocate a new VAS for now; sharing the client VAS will be added later
    pChannel->hVASpaceId = NV01_NULL_OBJECT;
    pChannel->bUseVasForCeCopy = FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _VIRTUAL_MODE, _TRUE, allocFlags);

    // Detect if we can enable fast scrub on this channel
    status = memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &pCeUtils->hTdCopyClass);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    if ((pCeUtils->hTdCopyClass == HOPPER_DMA_COPY_A) && !pChannel->bUseVasForCeCopy)
    {
        pChannel->type = FAST_SCRUBBER_CHANNEL;
        NV_PRINTF(LEVEL_INFO, "Enabled fast scrubber in construct.\n");
    }
    else
    {
        pChannel->type = CE_SCRUBBER_CHANNEL;
    }

    // Set up various channel resources
    status = channelSetupIDs(pChannel, pGpu, pChannel->bUseVasForCeCopy, bMIGInUse);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_client);

    channelSetupChannelBufferSizes(pChannel);

    status = memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pChannel);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    NV_PRINTF(LEVEL_INFO, "Channel alloc successful for ceUtils\n");
    pCeUtils->pChannel = pChannel;

    // Allocate CE states
    status = memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pChannel);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    pCeUtils->pGpu = pGpu;

    return status;

free_channel:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->channelId);

    if (pAllocParams->hVaspace != NV01_NULL_OBJECT)
    {
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hVASpaceId);
    }
free_client:
    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _FALSE, allocFlags))
    {
        // Only free the client if RM allocated it; an externally provided client is owned by the caller
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    }

cleanup:
    portMemFree(pChannel);
    return status;
}
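
/*
 * Illustrative sketch (not a call site in this file): internal RM code
 * typically instantiates CeUtils through objCreate(), which routes to the
 * constructor above. The parent object and pGpu below are assumed to be in
 * scope at the hypothetical call site:
 *
 *     NV0050_ALLOCATION_PARAMETERS allocParams = {0};
 *     CeUtils *pCeUtils = NULL;
 *
 *     // NULL MIG instance: the channel is not bound to a GPU instance
 *     NV_STATUS status = objCreate(&pCeUtils, pParent, CeUtils,
 *                                  pGpu, NULL, &allocParams);
 */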

void
ceutilsDestruct_IMPL
(
    CeUtils *pCeUtils
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    OBJGPU *pGpu = pCeUtils->pGpu;
    MemoryManager *pMemoryManager;
    RM_API *pRmApi;

    // Sanity checks
    if ((pGpu == NULL) || (pChannel == NULL))
    {
        NV_PRINTF(LEVEL_WARNING, "Possible double-free of CeUtils!\n");
        return;
    }
    else if (pGpu != pChannel->pGpu)
    {
        NV_PRINTF(LEVEL_ERROR, "Bad state during ceUtils teardown!\n");
        return;
    }

    // Only look these up once pGpu has been validated above
    pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    if ((pChannel->bClientUserd) && (pChannel->pControlGPFifo != NULL))
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            //
            // When PCIE is blocked, mappings should be created, used, and
            // torn down at the point of use
            //
            NV_PRINTF(LEVEL_ERROR, "Leaked USERD mapping from ceUtils!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pUserdMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pControlGPFifo = NULL;
        }
    }

    if (pChannel->pbCpuVA != NULL)
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            NV_PRINTF(LEVEL_ERROR, "Leaked pushbuffer mapping!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pbCpuVA = NULL;
        }
    }

    if (pChannel->pTokenFromNotifier != NULL)
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            NV_PRINTF(LEVEL_ERROR, "Leaked notifier mapping!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pErrNotifierMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pTokenFromNotifier = NULL;
        }
    }

    // Resource server makes sure no leak can occur
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    portMemFree(pChannel);
}

void
ceutilsServiceInterrupts_IMPL(CeUtils *pCeUtils)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;

    //
    // FIXME: Bug 2463959: objmemscrub is called with the rmDeviceGpuLock in the
    // heapFree_IMPL->_stdmemPmaFree->pmaFreePages->scrubSubmitPages path.
    // Yielding while holding the rmDeviceGpuLock can lead to deadlock. Instead,
    // if the lock is held, service any interrupts on the owned CE to make progress.
    // Bug 2527660 is filed to remove this change.
    //
    // pChannel is NULL when PMA scrub requests are handled in the vGPU plugin.
    // In that case the vGPU plugin allocates the scrubber channel in the PF
    // domain, so the deadlock mentioned above does not apply.
    //
    if ((pChannel != NULL) && (rmDeviceGpuLockIsOwner(pChannel->pGpu->gpuInstance)))
    {
        channelServiceScrubberInterrupts(pChannel);
    }
    else
    {
        osSchedule();
    }
}


static NvBool
_ceUtilsFastScrubEnabled
(
    POBJCHANNEL      pChannel,
    CHANNEL_PB_INFO *pChannelPbInfo
)
{
    OBJGPU *pGpu = pChannel->pGpu;
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

    if (!memmgrIsFastScrubberEnabled(pMemoryManager))
    {
        return NV_FALSE;
    }

    //
    // Enable memory fast scrubbing only when:
    //  - the channel was allocated as a fast scrub channel
    //  - the operation is a memset (not a memcopy)
    //  - the memset pattern is 0
    //  - DstPhysMode.target == LOCAL_FB
    //  - the address is 4KB aligned
    //  - the line length is 4KB aligned
    //

    return ((pChannel->type == FAST_SCRUBBER_CHANNEL) &&
            (!pChannelPbInfo->bCeMemcopy) &&
            (pChannelPbInfo->pattern == 0) &&
            (pChannelPbInfo->dstAddressSpace == ADDR_FBMEM) &&
            (NV_IS_ALIGNED64(pChannelPbInfo->dstAddr, MEMUTIL_SCRUB_OFFSET_ALIGNMENT)) &&
            (NV_IS_ALIGNED(pChannelPbInfo->size, MEMUTIL_SCRUB_LINE_LENGTH_ALIGNMENT)));
}
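
/*
 * Example of how the predicate above behaves (values illustrative): a
 * 0-pattern memset of dstAddr = 0x10000, size = 0x2000 in vidmem on a
 * FAST_SCRUBBER_CHANNEL satisfies every condition and takes the fast scrub
 * path; the same request with a nonzero pattern, or any memcopy, falls back
 * to the regular CE method stream.
 */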


//
// Helper to deal with CE_MAX_BYTES_PER_LINE
// This function may modify some fields in pChannelPbInfo
//
static NV_STATUS
_ceutilsSubmitPushBuffer
(
    POBJCHANNEL       pChannel,
    NvBool            bPipelined,
    NvBool            bInsertFinishPayload,
    CHANNEL_PB_INFO * pChannelPbInfo
)
{
    NV_STATUS status = NV_OK;
    NvU32 methodsLength, putIndex = 0;

    NV_ASSERT_OR_RETURN(pChannelPbInfo != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pChannel != NULL, NV_ERR_INVALID_ARGUMENT);

    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pChannel->pGpu);
    NvBool bReleaseMapping = NV_FALSE;

    //
    // Use BAR1 if CPU access is allowed, otherwise allocate and init shadow
    // buffer for DMA access
    //
    NvU32 transferFlags = (TRANSFER_FLAGS_USE_BAR1     |
                           TRANSFER_FLAGS_SHADOW_ALLOC |
                           TRANSFER_FLAGS_SHADOW_INIT_MEM);
    NV_PRINTF(LEVEL_INFO, "Actual size of copy to be pushed: %x\n", pChannelPbInfo->size);

    status = channelWaitForFreeEntry(pChannel, &putIndex);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Cannot get putIndex.\n");
        return status;
    }

    if (pChannel->pbCpuVA == NULL)
    {
        pChannel->pbCpuVA = memmgrMemDescBeginTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc,
                                                       transferFlags);
        bReleaseMapping = NV_TRUE;
    }
    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);

    if (_ceUtilsFastScrubEnabled(pChannel, pChannelPbInfo))
    {
        methodsLength = channelFillPbFastScrub(pChannel, putIndex, bPipelined, bInsertFinishPayload, pChannelPbInfo);
    }
    else
    {
        methodsLength = channelFillCePb(pChannel, putIndex, bPipelined, bInsertFinishPayload, pChannelPbInfo);
    }

    if (bReleaseMapping)
    {
        memmgrMemDescEndTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc, transferFlags);
        pChannel->pbCpuVA = NULL;
    }

    if (methodsLength == 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Cannot push methods to channel.\n");
        return NV_ERR_NO_FREE_FIFOS;
    }

    //
    // Pushbuffer can be written in a batch, but GPFIFO and doorbell require
    // careful ordering so we do each write one-by-one
    //
    status = channelFillGpFifo(pChannel, putIndex, methodsLength);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Channel operation failures during memcopy\n");
        return status;
    }

    pChannel->lastSubmittedEntry = putIndex;

    return status;
}


NV_STATUS
ceutilsMemset_IMPL
(
    CeUtils *pCeUtils,
    CEUTILS_MEMSET_PARAMS *pParams
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    NV_STATUS   status = NV_OK;

    NvU32 pteArraySize;
    NvU64 offset, memsetLength, size, pageGranularity;
    NvBool bContiguous;

    MEMORY_DESCRIPTOR *pMemDesc = pParams->pMemDesc;
    CHANNEL_PB_INFO channelPbInfo = {0};

    NvBool bPipelined = pParams->flags & NV0050_CTRL_MEMSET_FLAGS_PIPELINED;

    if (pMemDesc == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memdesc for CeUtils memset.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((memdescGetAddressSpace(pMemDesc) != ADDR_FBMEM) ||
        (pMemDesc->pGpu != pCeUtils->pChannel->pGpu))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memory descriptor passed.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    size = memdescGetSize(pMemDesc);
    pteArraySize = memdescGetPteArraySize(pMemDesc, AT_GPU);
    bContiguous = (pMemDesc->_flags & MEMDESC_FLAGS_PHYSICALLY_CONTIGUOUS) || (pteArraySize == 1);

    if (pParams->offset >= size)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the memdesc.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    NV_PRINTF(LEVEL_INFO, "CeUtils Args to memset - offset: %llx, length: %llx\n",
              pParams->offset, pParams->length);

    if ((pParams->length == 0) || (pParams->length > (size - pParams->offset)))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memset length passed.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    channelPbInfo.bCeMemcopy = NV_FALSE;
    channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
    pCeUtils->lastSubmittedPayload = channelPbInfo.payload;

    channelPbInfo.pattern = pParams->pattern;
    channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pMemDesc);
    channelPbInfo.dstCpuCacheAttrib = pMemDesc->_cpuCacheAttrib;

    pageGranularity = pMemDesc->pageArrayGranularity;
    memsetLength = pParams->length;
    offset = pParams->offset;

    do
    {
        NvU64 maxContigSize = bContiguous ? memsetLength : (pageGranularity - offset % pageGranularity);
        NvU32 memsetSizeContig = (NvU32)NV_MIN(NV_MIN(memsetLength, maxContigSize), CE_MAX_BYTES_PER_LINE);

        channelPbInfo.dstAddr = memdescGetPhysAddr(pMemDesc, AT_GPU, offset);

        NV_PRINTF(LEVEL_INFO, "CeUtils Memset dstAddr: %llx, size: %x\n",
                  channelPbInfo.dstAddr, memsetSizeContig);

        channelPbInfo.size = memsetSizeContig;
        status = _ceutilsSubmitPushBuffer(pChannel, bPipelined, memsetSizeContig == memsetLength, &channelPbInfo);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memset.\n");
            return status;
        }

        //
        // Allow _LAUNCH_DMA methods that belong to the same memset operation
        // to be pipelined after each other, as there are no dependencies
        //
        bPipelined = NV_TRUE;

        memsetLength -= memsetSizeContig;
        offset       += memsetSizeContig;
    } while (memsetLength != 0);

    if (pParams->flags & NV0050_CTRL_MEMSET_FLAGS_ASYNC)
    {
        NV_PRINTF(LEVEL_INFO, "Async memset payload returned: 0x%x\n", channelPbInfo.payload);
        pParams->submittedWorkId = channelPbInfo.payload;
    }
    else
    {
        // Check semaProgress and then timeout
        status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
        if (status == NV_OK)
        {
            NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
        }
    }

    return status;
}
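
/*
 * Illustrative caller-side sketch (assumed usage, not a call site in this
 * file): a synchronous scrub of the first 1MB of an FBMEM allocation, where
 * pMemDesc and pCeUtils are assumed to be in scope.
 *
 *     CEUTILS_MEMSET_PARAMS params = {0};
 *     params.pMemDesc = pMemDesc;   // must be an ADDR_FBMEM descriptor
 *     params.offset   = 0;
 *     params.length   = 0x100000;
 *     params.pattern  = 0;          // 0-pattern may use the fast scrub path
 *     params.flags    = 0;          // no _ASYNC: waits for the finish payload
 *
 *     NV_STATUS status = ceutilsMemset(pCeUtils, &params);
 */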

NV_STATUS
ceutilsMemcopy_IMPL
(
    CeUtils *pCeUtils,
    CEUTILS_MEMCOPY_PARAMS *pParams
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    NV_STATUS   status = NV_OK;

    NvU64  srcSize, dstSize, copyLength, srcPageGranularity, dstPageGranularity;
    NvBool bSrcContig, bDstContig;

    CHANNEL_PB_INFO channelPbInfo  = {0};
    MEMORY_DESCRIPTOR *pDstMemDesc = pParams->pDstMemDesc;
    MEMORY_DESCRIPTOR *pSrcMemDesc = pParams->pSrcMemDesc;

    NvU64 length = pParams->length;
    NvU64 srcOffset = pParams->srcOffset;
    NvU64 dstOffset = pParams->dstOffset;

    NvBool bPipelined = pParams->flags & NV0050_CTRL_MEMCOPY_FLAGS_PIPELINED;

    // Validate params
    if ((pSrcMemDesc == NULL) || (pDstMemDesc == NULL))
    {
        NV_PRINTF(LEVEL_ERROR, "Src/Dst memory descriptor should be valid.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((memdescGetAddressSpace(pSrcMemDesc) != ADDR_FBMEM) &&
        (memdescGetAddressSpace(pDstMemDesc) != ADDR_FBMEM))
    {
        NV_PRINTF(LEVEL_ERROR, "Either Dst or Src memory should be in vidmem.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((pSrcMemDesc->pGpu != pCeUtils->pChannel->pGpu) ||
        (pDstMemDesc->pGpu != pCeUtils->pChannel->pGpu))
    {
        NV_PRINTF(LEVEL_ERROR, "CeUtils does not support p2p copies right now.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    srcSize = memdescGetSize(pSrcMemDesc);
    dstSize = memdescGetSize(pDstMemDesc);

    if ((srcOffset >= srcSize) || (dstOffset >= dstSize))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the src/dst memdesc.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((length == 0) ||
        (srcOffset + length > srcSize) || (dstOffset + length > dstSize))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memcopy length.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    channelPbInfo.bCeMemcopy = NV_TRUE;
    channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
    pCeUtils->lastSubmittedPayload = channelPbInfo.payload;

    channelPbInfo.srcAddressSpace = memdescGetAddressSpace(pSrcMemDesc);
    channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pDstMemDesc);

    channelPbInfo.srcCpuCacheAttrib = pSrcMemDesc->_cpuCacheAttrib;
    channelPbInfo.dstCpuCacheAttrib = pDstMemDesc->_cpuCacheAttrib;

    srcPageGranularity = pSrcMemDesc->pageArrayGranularity;
    dstPageGranularity = pDstMemDesc->pageArrayGranularity;
    bSrcContig = memdescGetContiguity(pSrcMemDesc, AT_GPU);
    bDstContig = memdescGetContiguity(pDstMemDesc, AT_GPU);

    copyLength = length;

    do
    {
        //
        // This algorithm finds the maximum contiguous region from both src
        // and dst for each copy, iterating until the whole range has been
        // submitted to the CE
        //
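        //
        // Worked example (illustrative): with 4KB page granularity and a
        // non-contiguous src at srcOffset = 0x1800, at most
        // 0x1000 - 0x800 = 0x800 bytes can be copied before crossing a src
        // page boundary, so copySizeContig is capped at 0x800 this iteration.
        //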
        NvU64 maxContigSizeSrc = bSrcContig ? copyLength : (srcPageGranularity - srcOffset % srcPageGranularity);
        NvU64 maxContigSizeDst = bDstContig ? copyLength : (dstPageGranularity - dstOffset % dstPageGranularity);
        NvU32 copySizeContig = (NvU32)NV_MIN(NV_MIN(copyLength, NV_MIN(maxContigSizeSrc, maxContigSizeDst)), CE_MAX_BYTES_PER_LINE);

        channelPbInfo.srcAddr = memdescGetPhysAddr(pSrcMemDesc, AT_GPU, srcOffset);
        channelPbInfo.dstAddr = memdescGetPhysAddr(pDstMemDesc, AT_GPU, dstOffset);

        NV_PRINTF(LEVEL_INFO, "CeUtils Memcopy dstAddr: %llx, srcAddr: %llx, size: %x\n",
                  channelPbInfo.dstAddr, channelPbInfo.srcAddr, copySizeContig);

        channelPbInfo.size = copySizeContig;
        status = _ceutilsSubmitPushBuffer(pChannel, bPipelined, copySizeContig == copyLength, &channelPbInfo);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memcopy.\n");
            return status;
        }

        //
        // Allow _LAUNCH_DMA methods that belong to the same copy operation
        // to be pipelined after each other, as there are no dependencies
        //
        bPipelined = NV_TRUE;

        copyLength -= copySizeContig;
        srcOffset  += copySizeContig;
        dstOffset  += copySizeContig;
    } while (copyLength != 0);

    if (pParams->flags & NV0050_CTRL_MEMCOPY_FLAGS_ASYNC)
    {
        NV_PRINTF(LEVEL_INFO, "Async memcopy payload returned: 0x%x\n", channelPbInfo.payload);
        pParams->submittedWorkId = channelPbInfo.payload;
    }
    else
    {
        // Check semaProgress and then timeout
        status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
        if (status == NV_OK)
        {
            NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
        }
    }

    return status;
}
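
/*
 * Illustrative caller-side sketch (assumed usage): an asynchronous copy whose
 * completion is later polled via ceutilsUpdateProgress() below; the memory
 * descriptors and pCeUtils are assumed to be in scope.
 *
 *     CEUTILS_MEMCOPY_PARAMS params = {0};
 *     params.pSrcMemDesc = pSrcMemDesc;  // at least one descriptor must
 *     params.pDstMemDesc = pDstMemDesc;  // live in ADDR_FBMEM
 *     params.length      = memdescGetSize(pSrcMemDesc);
 *     params.flags       = NV0050_CTRL_MEMCOPY_FLAGS_ASYNC;
 *
 *     NV_STATUS status = ceutilsMemcopy(pCeUtils, &params);
 *
 *     // Done once the tracked payload catches up to the returned work id
 *     NvBool bDone = (status == NV_OK) &&
 *                    (ceutilsUpdateProgress(pCeUtils) >= params.submittedWorkId);
 */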


// This function updates pCeUtils->lastCompletedPayload and handles wrap-around
NvU64
ceutilsUpdateProgress_IMPL
(
    CeUtils *pCeUtils
)
{
    NV_ASSERT((pCeUtils != NULL) && (pCeUtils->pChannel != NULL));

    NvU32 hwCurrentCompletedPayload = 0;
    NvU64 swLastCompletedPayload = pCeUtils->lastCompletedPayload;

    //
    // CeUtils uses a 64-bit index to track submitted work, but HW supports
    // only a 32-bit semaphore. The currently completed id is calculated here
    // from the lastSubmittedPayload and the current HW semaphore value.
    //
    hwCurrentCompletedPayload = READ_CHANNEL_PAYLOAD_SEMA(pCeUtils->pChannel);

    // No work has been completed since we checked last time
    if (hwCurrentCompletedPayload == (NvU32)swLastCompletedPayload)
    {
        return swLastCompletedPayload;
    }

    // Check for the wrap-around case and increment the upper 32 bits
    if (hwCurrentCompletedPayload < (NvU32)swLastCompletedPayload)
    {
        swLastCompletedPayload += 0x100000000ULL;
    }

    // Update the lower 32 bits regardless of whether wrap-around happened
    swLastCompletedPayload &= 0xFFFFFFFF00000000ULL;
    swLastCompletedPayload |= (NvU64)hwCurrentCompletedPayload;

    pCeUtils->lastCompletedPayload = swLastCompletedPayload;
    return swLastCompletedPayload;
}
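
/*
 * Worked example: if lastCompletedPayload was 0x1FFFFFFFE and the 32-bit HW
 * semaphore now reads 0x00000003, the semaphore wrapped; the upper half is
 * incremented and the lower half replaced, yielding 0x200000003.
 */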

#if defined(DEBUG) || defined(DEVELOP)
NV_STATUS
ceutilsapiCtrlCmdCheckProgress_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_CHECK_PROGRESS_PARAMS *pParams
)
{
    if (pParams->submittedWorkId <= ceutilsUpdateProgress(pCeUtilsApi->pCeUtils))
    {
        pParams->result = NV0050_CTRL_CHECK_PROGRESS_RESULT_FINISHED;
    }

    return NV_OK;
}

NV_STATUS
ceutilsapiConstruct_IMPL
(
    CeUtilsApi                   *pCeUtilsApi,
    CALL_CONTEXT                 *pCallContext,
    RS_RES_ALLOC_PARAMS_INTERNAL *pParams
)
{
    NV0050_ALLOCATION_PARAMETERS *pAllocParams = pParams->pAllocParams;

    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _TRUE, pAllocParams->flags))
    {
        NV_PRINTF(LEVEL_ERROR, "CeUtils: unsupported flags = 0x%llx\n", pAllocParams->flags);
        return NV_ERR_NOT_SUPPORTED;
    }

    return objCreate(&pCeUtilsApi->pCeUtils, pCeUtilsApi, CeUtils, GPU_RES_GET_GPU(pCeUtilsApi), NULL, pAllocParams);
}

void
ceutilsapiDestruct_IMPL
(
    CeUtilsApi *pCeUtilsApi
)
{
    objDelete(pCeUtilsApi->pCeUtils);
}

NV_STATUS
ceutilsapiCtrlCmdMemset_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_MEMSET_PARAMS *pParams
)
{
    NV_STATUS          status = NV_OK;
    NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
    RsResourceRef     *pPhysmemRef;
    MEMORY_DESCRIPTOR *pMemDesc = NULL;
    CEUTILS_MEMSET_PARAMS internalParams = {0};

    if (pParams->hMemory == 0)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    status = serverutilGetResourceRef(hClient, pParams->hMemory, &pPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pMemDesc = (dynamicCast(pPhysmemRef->pResource, Memory))->pMemDesc;

    internalParams.pMemDesc = pMemDesc;
    internalParams.offset = pParams->offset;
    internalParams.length = pParams->length;
    internalParams.pattern = pParams->pattern;
    internalParams.flags = pParams->flags;

    status = ceutilsMemset(pCeUtilsApi->pCeUtils, &internalParams);
    if (status == NV_OK)
    {
        pParams->submittedWorkId = internalParams.submittedWorkId;
    }

    return status;
}

NV_STATUS
ceutilsapiCtrlCmdMemcopy_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_MEMCOPY_PARAMS *pParams
)
{
    NV_STATUS          status = NV_OK;
    NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
    RsResourceRef     *pSrcPhysmemRef;
    RsResourceRef     *pDstPhysmemRef;
    MEMORY_DESCRIPTOR *pSrcMemDesc = NULL;
    MEMORY_DESCRIPTOR *pDstMemDesc = NULL;
    CEUTILS_MEMCOPY_PARAMS internalParams = {0};

    if ((pParams->hSrcMemory == 0) || (pParams->hDstMemory == 0))
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    status = serverutilGetResourceRef(hClient, pParams->hDstMemory, &pDstPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pDstMemDesc = (dynamicCast(pDstPhysmemRef->pResource, Memory))->pMemDesc;

    status = serverutilGetResourceRef(hClient, pParams->hSrcMemory, &pSrcPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pSrcMemDesc = (dynamicCast(pSrcPhysmemRef->pResource, Memory))->pMemDesc;

    internalParams.pSrcMemDesc = pSrcMemDesc;
    internalParams.pDstMemDesc = pDstMemDesc;
    internalParams.srcOffset = pParams->srcOffset;
    internalParams.dstOffset = pParams->dstOffset;
    internalParams.length = pParams->length;
    internalParams.flags = pParams->flags;

    status = ceutilsMemcopy(pCeUtilsApi->pCeUtils, &internalParams);
    if (status == NV_OK)
    {
        pParams->submittedWorkId = internalParams.submittedWorkId;
    }

    return status;
}
#endif // defined(DEBUG) || defined(DEVELOP)