/*
 * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "core/core.h"
#include "gpu/ce/kernel_ce.h"
#include "gpu/bus/kern_bus.h"
#include "kernel/gpu/intr/intr.h"
#include "kernel/gpu/fifo/kernel_fifo.h"
#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "kernel/gpu/mem_mgr/channel_utils.h"
#include "rmapi/rs_utils.h"
#include "utils/nvassert.h"
#include "core/prelude.h"
#include "core/locks.h"
#include "gpu/mem_mgr/ce_utils.h"
#include "kernel/gpu/mem_mgr/ce_utils_sizes.h"
#include "vgpu/rpc_headers.h"

#include "class/clb0b5.h" // MAXWELL_DMA_COPY_A
#include "class/clc0b5.h" // PASCAL_DMA_COPY_A
#include "class/clc1b5.h" // PASCAL_DMA_COPY_B
#include "class/clc3b5.h" // VOLTA_DMA_COPY_A
#include "class/clc5b5.h" // TURING_DMA_COPY_A
#include "class/clc8b5.h" // HOPPER_DMA_COPY_A
#include "class/clc86f.h" // HOPPER_CHANNEL_GPFIFO_A

#include "class/cl0080.h"

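/*!
 * Construct a CeUtils instance.
 *
 * Allocates an RM-internal client (unless one is already set on pCeUtils),
 * an OBJCHANNEL, and the CE state used for memset/memcopy submissions.
 * FIFO-lite mode is not supported. On GPUs where the CE cannot access
 * physical vidmem, only virtual-mode copies or the fast scrubber are allowed.
 */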
NV_STATUS
ceutilsConstruct_IMPL
(
    CeUtils                      *pCeUtils,
    OBJGPU                       *pGpu,
    KERNEL_MIG_GPU_INSTANCE      *pKernelMIGGPUInstance,
    NV0050_ALLOCATION_PARAMETERS *pAllocParams
)
{
    NV_STATUS status = NV_OK;
    NvU64 allocFlags = pAllocParams->flags;
    NV_ASSERT_OR_RETURN(pGpu, NV_ERR_INVALID_STATE);

    NvBool bMIGInUse = IS_MIG_IN_USE(pGpu);
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

    pCeUtils->pGpu = pGpu;

    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _FIFO_LITE, _TRUE, allocFlags))
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    // Allocate the channel with an RM-internal client
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    RmClient *pClient;

    OBJCHANNEL *pChannel = (OBJCHANNEL *) portMemAllocNonPaged(sizeof(OBJCHANNEL));
    if (pChannel == NULL)
    {
        return NV_ERR_INSUFFICIENT_RESOURCES;
    }

    portMemSet(pChannel, 0, sizeof(OBJCHANNEL));

    if (pCeUtils->hClient == NV01_NULL_OBJECT)
    {
        // Allocate the internal client
        status = pRmApi->AllocWithHandle(pRmApi, NV01_NULL_OBJECT, NV01_NULL_OBJECT,
                                         NV01_NULL_OBJECT, NV01_ROOT, &pCeUtils->hClient,
                                         sizeof(pCeUtils->hClient));
        NV_ASSERT_OR_GOTO(status == NV_OK, cleanup);
    }

    pChannel->hClient = pCeUtils->hClient;
    pClient = serverutilGetClientUnderLock(pChannel->hClient);
    NV_ASSERT_OR_GOTO(pClient != NULL, free_client);

    status = serverGetClientUnderLock(&g_resServ, pChannel->hClient, &pChannel->pRsClient);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_client);

    if (IS_VIRTUAL(pGpu))
    {
        NV_ASSERT_OK_OR_GOTO(
            status,
            clientSetHandleGenerator(staticCast(pClient, RsClient), RS_UNIQUE_HANDLE_BASE,
                                     RS_UNIQUE_HANDLE_RANGE/2 - VGPU_RESERVED_HANDLE_RANGE),
            free_client);
    }
    else
    {
        NV_ASSERT_OK_OR_GOTO(
            status,
            clientSetHandleGenerator(staticCast(pClient, RsClient), 1U, ~0U - 1U),
            free_client);
    }

    pChannel->bClientAllocated = NV_TRUE;
    pChannel->pGpu = pGpu;

    pChannel->deviceId = pCeUtils->hDevice;
    pChannel->subdeviceId = pCeUtils->hSubdevice;

    pChannel->pKernelMIGGpuInstance = pKernelMIGGPUInstance;

    // Allocate a new VAS for now; sharing the client VAS can be added later
    pChannel->hVASpaceId = NV01_NULL_OBJECT;
    pChannel->bUseVasForCeCopy = FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _VIRTUAL_MODE, _TRUE, allocFlags);

    // Detect whether fast scrub can be enabled on this channel
    status = memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &pCeUtils->hTdCopyClass);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    if ((pCeUtils->hTdCopyClass == HOPPER_DMA_COPY_A) && !pChannel->bUseVasForCeCopy)
    {
        pChannel->type = FAST_SCRUBBER_CHANNEL;
        NV_PRINTF(LEVEL_INFO, "Enabled fast scrubber in construct.\n");
    }
    else
    {
        pChannel->type = CE_SCRUBBER_CHANNEL;
    }

    // Self-hosted Hopper supports only VA copies or the fast scrubber
    if (pMemoryManager->bCePhysicalVidmemAccessNotSupported)
    {
        if (!pChannel->bUseVasForCeCopy &&
            (pChannel->type != FAST_SCRUBBER_CHANNEL))
        {
            status = NV_ERR_NOT_SUPPORTED;
            goto free_channel;
        }
    }

    // Set up various channel resources
    status = channelSetupIDs(pChannel, pGpu, pChannel->bUseVasForCeCopy, bMIGInUse);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_client);

    channelSetupChannelBufferSizes(pChannel);

    status = memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pChannel);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    NV_PRINTF(LEVEL_INFO, "Channel alloc successful for ceUtils\n");
    pCeUtils->pChannel = pChannel;

    // Allocate CE states
    status = memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pChannel);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    return status;

free_channel:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->channelId);

    if (pAllocParams->hVaspace != NV01_NULL_OBJECT)
    {
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hVASpaceId);
    }
free_client:
    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _FALSE, allocFlags))
    {
        // Free the client only if RM allocated it; externally provided
        // clients are owned by the caller and must not be freed here
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    }

cleanup:
    portMemFree(pChannel);
    return status;
}

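/*!
 * Tear down a CeUtils instance.
 *
 * Releases any BAR1 mappings still held for USERD, the pushbuffer, and the
 * notifier (when BAR access is not blocked), then frees the internal RM
 * client, which frees all channel resources along with it.
 */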
void
ceutilsDestruct_IMPL
(
    CeUtils *pCeUtils
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    OBJGPU *pGpu = pCeUtils->pGpu;
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    if ((pChannel->bClientUserd) && (pChannel->pControlGPFifo != NULL))
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            //
            // When PCIe access is blocked, mappings must be created, used,
            // and torn down at their point of use, so none should remain here
            //
            NV_PRINTF(LEVEL_ERROR, "Leaked USERD mapping from ceUtils!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pUserdMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pControlGPFifo = NULL;
        }
    }

    if (pChannel->pbCpuVA != NULL)
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            NV_PRINTF(LEVEL_ERROR, "Leaked pushbuffer mapping!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pbCpuVA = NULL;
        }
    }

    if (pChannel->pTokenFromNotifier != NULL)
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            NV_PRINTF(LEVEL_ERROR, "Leaked notifier mapping!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pErrNotifierMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pTokenFromNotifier = NULL;
        }
    }

    // Freeing the client frees all of its children; Resource Server ensures nothing leaks
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    portMemFree(pChannel);
}

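/*!
 * Make forward progress on outstanding CE work: service the owned CE's
 * interrupts directly when this thread holds the GPU lock, otherwise yield
 * to the OS scheduler.
 */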
void
ceutilsServiceInterrupts_IMPL(CeUtils *pCeUtils)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;

    //
    // FIXME: Bug 2463959: objmemscrub is called with the rmDeviceGpuLock in the
    // heapFree_IMPL->_stdmemPmaFree->pmaFreePages->scrubSubmitPages path.
    // Yielding while holding the rmDeviceGpuLock can lead to deadlock. Instead,
    // if the lock is held, service any interrupts on the owned CE to make progress.
    // Bug 2527660 is filed to remove this change.
    //
    if (rmDeviceGpuLockIsOwner(pChannel->pGpu->gpuInstance))
    {
        channelServiceScrubberInterrupts(pChannel);
    }
    else
    {
        osSchedule();
    }
}


static NvBool
_ceUtilsFastScrubEnabled
(
    OBJCHANNEL      *pChannel,
    CHANNEL_PB_INFO *pChannelPbInfo
)
{
    OBJGPU *pGpu = pChannel->pGpu;
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

    if (!memmgrIsFastScrubberEnabled(pMemoryManager))
    {
        return NV_FALSE;
    }

    //
    // Enable memory fast scrubbing only when:
    // - the channel was allocated as a fast scrub channel
    // - the operation is a memset (not a memcopy)
    // - the memset pattern is 0
    // - DstPhysMode.target == LOCAL_FB
    // - the address is 4KB aligned
    // - the line length is 4KB aligned
    //
    return ((pChannel->type == FAST_SCRUBBER_CHANNEL) &&
            (!pChannelPbInfo->bCeMemcopy) &&
            (pChannelPbInfo->pattern == 0) &&
            (pChannelPbInfo->dstAddressSpace == ADDR_FBMEM) &&
            (NV_IS_ALIGNED64(pChannelPbInfo->dstAddr, MEMUTIL_SCRUB_OFFSET_ALIGNMENT)) &&
            (NV_IS_ALIGNED(pChannelPbInfo->size, MEMUTIL_SCRUB_LINE_LENGTH_ALIGNMENT)));
}


//
// Helper to split submissions around CE_MAX_BYTES_PER_LINE.
// This function may modify some fields in pChannelPbInfo.
//
static NV_STATUS
_ceutilsSubmitPushBuffer
(
    OBJCHANNEL       *pChannel,
    NvBool            bPipelined,
    NvBool            bInsertFinishPayload,
    CHANNEL_PB_INFO  *pChannelPbInfo
)
{
    NV_STATUS status = NV_OK;
    NvU32 methodsLength, putIndex = 0;

    NV_ASSERT_OR_RETURN(pChannelPbInfo != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pChannel != NULL, NV_ERR_INVALID_ARGUMENT);

    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pChannel->pGpu);
    NvBool bReleaseMapping = NV_FALSE;

    //
    // Use BAR1 if CPU access is allowed; otherwise allocate and initialize a
    // shadow buffer for DMA access
    //
    NvU32 transferFlags = (TRANSFER_FLAGS_USE_BAR1     |
                           TRANSFER_FLAGS_SHADOW_ALLOC |
                           TRANSFER_FLAGS_SHADOW_INIT_MEM);
    NV_PRINTF(LEVEL_INFO, "Actual size of the copy to be pushed: %x\n", pChannelPbInfo->size);

    status = channelWaitForFreeEntry(pChannel, &putIndex);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Cannot get putIndex.\n");
        return status;
    }

    if (pChannel->pbCpuVA == NULL)
    {
        pChannel->pbCpuVA = memmgrMemDescBeginTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc,
                                                       transferFlags);
        bReleaseMapping = NV_TRUE;
    }
    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);

    if (_ceUtilsFastScrubEnabled(pChannel, pChannelPbInfo))
    {
        methodsLength = channelFillPbFastScrub(pChannel, putIndex, bPipelined, bInsertFinishPayload, pChannelPbInfo);
    }
    else
    {
        if (pMemoryManager->bCePhysicalVidmemAccessNotSupported)
        {
            // Self-hosted Hopper supports only VA copies or the fast scrubber
            NV_ASSERT_OR_RETURN(pChannel->bUseVasForCeCopy, NV_ERR_NOT_SUPPORTED);
        }

        methodsLength = channelFillCePb(pChannel, putIndex, bPipelined, bInsertFinishPayload, pChannelPbInfo);
    }

    if (bReleaseMapping)
    {
        memmgrMemDescEndTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc, transferFlags);
        pChannel->pbCpuVA = NULL;
    }

    if (methodsLength == 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Cannot push methods to channel.\n");
        return NV_ERR_NO_FREE_FIFOS;
    }

    //
    // The pushbuffer can be written in a batch, but the GPFIFO and doorbell
    // writes require careful ordering, so each one is done individually
    //
    status = channelFillGpFifo(pChannel, putIndex, methodsLength);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Channel operation failure during pushbuffer submission\n");
        return status;
    }

    pChannel->lastSubmittedEntry = putIndex;

    return status;
}


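//
// Minimal usage sketch (illustrative only; assumes a constructed pCeUtils and
// a caller-owned pMemDesc on the same GPU):
//
//     CEUTILS_MEMSET_PARAMS params = {0};
//     params.pMemDesc = pMemDesc;
//     params.offset   = 0;
//     params.length   = memdescGetSize(pMemDesc);
//     params.pattern  = 0;
//     params.flags    = NV0050_CTRL_MEMSET_FLAGS_ASYNC;
//     status = ceutilsMemset(pCeUtils, &params);
//     // With _ASYNC set, completion can later be checked by comparing
//     // params.submittedWorkId against ceutilsUpdateProgress(pCeUtils).
//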
NV_STATUS
ceutilsMemset_IMPL
(
    CeUtils *pCeUtils,
    CEUTILS_MEMSET_PARAMS *pParams
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    NV_STATUS   status = NV_OK;

    NvU32 pteArraySize;
    NvU64 offset, memsetLength, size, pageGranularity;
    NvBool bContiguous;

    MEMORY_DESCRIPTOR *pMemDesc = pParams->pMemDesc;
    CHANNEL_PB_INFO channelPbInfo = {0};

    NvBool bPipelined = pParams->flags & NV0050_CTRL_MEMSET_FLAGS_PIPELINED;

    if (pMemDesc == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memdesc for CeUtils memset.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (pMemDesc->pGpu != pCeUtils->pChannel->pGpu)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memory descriptor passed.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    size = memdescGetSize(pMemDesc);
    pteArraySize = memdescGetPteArraySize(pMemDesc, AT_GPU);
    bContiguous = (pMemDesc->_flags & MEMDESC_FLAGS_PHYSICALLY_CONTIGUOUS) || (pteArraySize == 1);

    if (pParams->offset >= size)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the memdesc.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    NV_PRINTF(LEVEL_INFO, "CeUtils args to memset - offset: %llx, length: %llx\n",
              pParams->offset, pParams->length);

    if ((pParams->length == 0) || (pParams->length > (size - pParams->offset)))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memset length passed.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    channelPbInfo.bCeMemcopy = NV_FALSE;
    channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
    pCeUtils->lastSubmittedPayload = channelPbInfo.payload;

    channelPbInfo.pattern = pParams->pattern;
    channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pMemDesc);
    channelPbInfo.dstCpuCacheAttrib = pMemDesc->_cpuCacheAttrib;

    pageGranularity = pMemDesc->pageArrayGranularity;
    memsetLength = pParams->length;
    offset = pParams->offset;

    do
    {
        NvU64 maxContigSize = bContiguous ? memsetLength : (pageGranularity - offset % pageGranularity);
        NvU32 memsetSizeContig = (NvU32)NV_MIN(NV_MIN(memsetLength, maxContigSize), CE_MAX_BYTES_PER_LINE);

        channelPbInfo.dstAddr = memdescGetPhysAddr(pMemDesc, AT_GPU, offset);

        NV_PRINTF(LEVEL_INFO, "CeUtils memset dstAddr: %llx, size: %x\n",
                  channelPbInfo.dstAddr, memsetSizeContig);

        channelPbInfo.size = memsetSizeContig;
        status = _ceutilsSubmitPushBuffer(pChannel, bPipelined, memsetSizeContig == memsetLength, &channelPbInfo);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memset.\n");
            return status;
        }

        //
        // Allow _LAUNCH_DMA methods that belong to the same memset operation
        // to be pipelined after each other, as there are no dependencies
        //
        bPipelined = NV_TRUE;

        memsetLength -= memsetSizeContig;
        offset       += memsetSizeContig;
    } while (memsetLength != 0);

    if (pParams->flags & NV0050_CTRL_MEMSET_FLAGS_ASYNC)
    {
        NV_PRINTF(LEVEL_INFO, "Async memset payload returned: 0x%x\n", channelPbInfo.payload);
        pParams->submittedWorkId = channelPbInfo.payload;
    }
    else
    {
        // Check semaphore progress, subject to the channel timeout
        status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
        if (status == NV_OK)
        {
            NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
        }
    }

    return status;
}

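//
// Minimal usage sketch (illustrative only; both descriptors must target the
// same GPU, since peer-to-peer copies are rejected below). Without _ASYNC in
// flags, the call blocks until the copy completes:
//
//     CEUTILS_MEMCOPY_PARAMS params = {0};
//     params.pSrcMemDesc = pSrcMemDesc;
//     params.pDstMemDesc = pDstMemDesc;
//     params.srcOffset   = 0;
//     params.dstOffset   = 0;
//     params.length      = memdescGetSize(pSrcMemDesc);
//     status = ceutilsMemcopy(pCeUtils, &params);
//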
NV_STATUS
ceutilsMemcopy_IMPL
(
    CeUtils *pCeUtils,
    CEUTILS_MEMCOPY_PARAMS *pParams
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    NV_STATUS   status = NV_OK;

    NvU64  srcSize, dstSize, copyLength, srcPageGranularity, dstPageGranularity;
    NvBool bSrcContig, bDstContig;

    CHANNEL_PB_INFO channelPbInfo  = {0};
    MEMORY_DESCRIPTOR *pDstMemDesc = pParams->pDstMemDesc;
    MEMORY_DESCRIPTOR *pSrcMemDesc = pParams->pSrcMemDesc;

    NvU64 length = pParams->length;
    NvU64 srcOffset = pParams->srcOffset;
    NvU64 dstOffset = pParams->dstOffset;

    NvBool bPipelined = pParams->flags & NV0050_CTRL_MEMCOPY_FLAGS_PIPELINED;

    // Validate params
    if ((pSrcMemDesc == NULL) || (pDstMemDesc == NULL))
    {
        NV_PRINTF(LEVEL_ERROR, "Src/dst memory descriptors must be valid.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((pSrcMemDesc->pGpu != pCeUtils->pChannel->pGpu) ||
        (pDstMemDesc->pGpu != pCeUtils->pChannel->pGpu))
    {
        NV_PRINTF(LEVEL_ERROR, "CeUtils does not support peer-to-peer copies currently.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    srcSize = memdescGetSize(pSrcMemDesc);
    dstSize = memdescGetSize(pDstMemDesc);

    if ((srcOffset >= srcSize) || (dstOffset >= dstSize))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the src/dst memdesc.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((length == 0) ||
        (srcOffset + length > srcSize) || (dstOffset + length > dstSize))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memcopy length.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    channelPbInfo.bCeMemcopy = NV_TRUE;
    channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
    pCeUtils->lastSubmittedPayload = channelPbInfo.payload;

    channelPbInfo.srcAddressSpace = memdescGetAddressSpace(pSrcMemDesc);
    channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pDstMemDesc);

    channelPbInfo.srcCpuCacheAttrib = pSrcMemDesc->_cpuCacheAttrib;
    channelPbInfo.dstCpuCacheAttrib = pDstMemDesc->_cpuCacheAttrib;

    srcPageGranularity = pSrcMemDesc->pageArrayGranularity;
    dstPageGranularity = pDstMemDesc->pageArrayGranularity;
    bSrcContig = memdescGetContiguity(pSrcMemDesc, AT_GPU);
    bDstContig = memdescGetContiguity(pDstMemDesc, AT_GPU);

    copyLength = length;

    do
    {
        //
        // This algorithm finds the maximum contiguous region shared by src
        // and dst for each copy and iterates until the whole range has been
        // submitted to the CE
        //
        NvU64 maxContigSizeSrc = bSrcContig ? copyLength : (srcPageGranularity - srcOffset % srcPageGranularity);
        NvU64 maxContigSizeDst = bDstContig ? copyLength : (dstPageGranularity - dstOffset % dstPageGranularity);
        NvU32 copySizeContig = (NvU32)NV_MIN(NV_MIN(copyLength, NV_MIN(maxContigSizeSrc, maxContigSizeDst)), CE_MAX_BYTES_PER_LINE);

        channelPbInfo.srcAddr = memdescGetPhysAddr(pSrcMemDesc, AT_GPU, srcOffset);
        channelPbInfo.dstAddr = memdescGetPhysAddr(pDstMemDesc, AT_GPU, dstOffset);

        NV_PRINTF(LEVEL_INFO, "CeUtils memcopy dstAddr: %llx, srcAddr: %llx, size: %x\n",
                  channelPbInfo.dstAddr, channelPbInfo.srcAddr, copySizeContig);

        channelPbInfo.size = copySizeContig;
        status = _ceutilsSubmitPushBuffer(pChannel, bPipelined, copySizeContig == copyLength, &channelPbInfo);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memcopy.\n");
            return status;
        }

        //
        // Allow _LAUNCH_DMA methods that belong to the same copy operation
        // to be pipelined after each other, as there are no dependencies
        //
        bPipelined = NV_TRUE;

        copyLength -= copySizeContig;
        srcOffset  += copySizeContig;
        dstOffset  += copySizeContig;
    } while (copyLength != 0);

    if (pParams->flags & NV0050_CTRL_MEMCOPY_FLAGS_ASYNC)
    {
        NV_PRINTF(LEVEL_INFO, "Async memcopy payload returned: 0x%x\n", channelPbInfo.payload);
        pParams->submittedWorkId = channelPbInfo.payload;
    }
    else
    {
        // Check semaphore progress, subject to the channel timeout
        status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
        if (status == NV_OK)
        {
            NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
        }
    }

    return status;
}


// This function updates pCeUtils->lastCompletedPayload and handles wrap-around
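//
// Worked example (illustrative values): with lastCompletedPayload at
// 0x1FFFFFFF0, the low 32 bits are 0xFFFFFFF0. A HW semaphore readback of
// 0x00000010 is numerically smaller, so a wrap occurred; the upper 32 bits
// are incremented and the updated progress value becomes 0x200000010.
//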
NvU64
ceutilsUpdateProgress_IMPL
(
    CeUtils *pCeUtils
)
{
    NV_ASSERT((pCeUtils != NULL) && (pCeUtils->pChannel != NULL));

    NvU32 hwCurrentCompletedPayload = 0;
    NvU64 swLastCompletedPayload = pCeUtils->lastCompletedPayload;

    //
    // CeUtils uses a 64-bit index to track submitted work, but HW supports
    // only a 32-bit semaphore. The currently completed id is reconstructed
    // here from lastCompletedPayload and the current HW semaphore value.
    //
    hwCurrentCompletedPayload = READ_CHANNEL_PAYLOAD_SEMA(pCeUtils->pChannel);

    // No work has been completed since we last checked
    if (hwCurrentCompletedPayload == (NvU32)swLastCompletedPayload)
    {
        return swLastCompletedPayload;
    }

    // Check for the wrap-around case and increment the upper 32 bits
    if (hwCurrentCompletedPayload < (NvU32)swLastCompletedPayload)
    {
        swLastCompletedPayload += 0x100000000ULL;
    }

    // Update the lower 32 bits regardless of whether wrap-around happened
    swLastCompletedPayload &= 0xFFFFFFFF00000000ULL;
    swLastCompletedPayload |= (NvU64)hwCurrentCompletedPayload;

    pCeUtils->lastCompletedPayload = swLastCompletedPayload;
    return swLastCompletedPayload;
}

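/*!
 * NV0050 control: report whether a previously submitted work id has
 * completed. Sets pParams->result to _FINISHED once the tracked progress
 * reaches submittedWorkId; otherwise the result field is left untouched.
 */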
NV_STATUS
ceutilsapiCtrlCmdCheckProgress_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_CHECK_PROGRESS_PARAMS *pParams
)
{
    if (pParams->submittedWorkId <= ceutilsUpdateProgress(pCeUtilsApi->pCeUtils))
    {
        pParams->result = NV0050_CTRL_CHECK_PROGRESS_RESULT_FINISHED;
    }

    return NV_OK;
}

NV_STATUS
ceutilsapiConstruct_IMPL
(
    CeUtilsApi                   *pCeUtilsApi,
    CALL_CONTEXT                 *pCallContext,
    RS_RES_ALLOC_PARAMS_INTERNAL *pParams
)
{
    NV0050_ALLOCATION_PARAMETERS *pAllocParams = pParams->pAllocParams;

    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _TRUE, pAllocParams->flags))
    {
        NV_PRINTF(LEVEL_ERROR, "CeUtils: unsupported flags = 0x%llx\n", pAllocParams->flags);
        return NV_ERR_NOT_SUPPORTED;
    }

    return objCreate(&pCeUtilsApi->pCeUtils, pCeUtilsApi, CeUtils, GPU_RES_GET_GPU(pCeUtilsApi), NULL, pAllocParams);
}

void
ceutilsapiDestruct_IMPL
(
    CeUtilsApi *pCeUtilsApi
)
{
    objDelete(pCeUtilsApi->pCeUtils);
}

NV_STATUS
ceutilsapiCtrlCmdMemset_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_MEMSET_PARAMS *pParams
)
{
    NV_STATUS          status = NV_OK;
    NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
    RsResourceRef     *pPhysmemRef;
    MEMORY_DESCRIPTOR *pMemDesc = NULL;
    CEUTILS_MEMSET_PARAMS internalParams = {0};

    if (pParams->hMemory == 0)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    status = serverutilGetResourceRef(hClient, pParams->hMemory, &pPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pMemDesc = (dynamicCast(pPhysmemRef->pResource, Memory))->pMemDesc;

    internalParams.pMemDesc = pMemDesc;
    internalParams.offset = pParams->offset;
    internalParams.length = pParams->length;
    internalParams.pattern = pParams->pattern;
    internalParams.flags = pParams->flags;

    status = ceutilsMemset(pCeUtilsApi->pCeUtils, &internalParams);
    if (status == NV_OK)
    {
        pParams->submittedWorkId = internalParams.submittedWorkId;
    }

    return status;
}

NV_STATUS
ceutilsapiCtrlCmdMemcopy_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_MEMCOPY_PARAMS *pParams
)
{
    NV_STATUS          status = NV_OK;
    NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
    RsResourceRef     *pSrcPhysmemRef;
    RsResourceRef     *pDstPhysmemRef;
    MEMORY_DESCRIPTOR *pSrcMemDesc = NULL;
    MEMORY_DESCRIPTOR *pDstMemDesc = NULL;
    CEUTILS_MEMCOPY_PARAMS internalParams = {0};

    if ((pParams->hSrcMemory == 0) || (pParams->hDstMemory == 0))
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    status = serverutilGetResourceRef(hClient, pParams->hDstMemory, &pDstPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pDstMemDesc = (dynamicCast(pDstPhysmemRef->pResource, Memory))->pMemDesc;

    status = serverutilGetResourceRef(hClient, pParams->hSrcMemory, &pSrcPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pSrcMemDesc = (dynamicCast(pSrcPhysmemRef->pResource, Memory))->pMemDesc;

    internalParams.pSrcMemDesc = pSrcMemDesc;
    internalParams.pDstMemDesc = pDstMemDesc;
    internalParams.srcOffset = pParams->srcOffset;
    internalParams.dstOffset = pParams->dstOffset;
    internalParams.length = pParams->length;
    internalParams.flags = pParams->flags;

    status = ceutilsMemcopy(pCeUtilsApi->pCeUtils, &internalParams);
    if (status == NV_OK)
    {
        pParams->submittedWorkId = internalParams.submittedWorkId;
    }

    return status;
}