1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 
25 #include "core/core.h"
26 #include "gpu/ce/kernel_ce.h"
27 #include "gpu/bus/kern_bus.h"
28 #include "kernel/gpu/intr/intr.h"
29 #include "kernel/gpu/fifo/kernel_fifo.h"
30 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
31 #include "kernel/gpu/mem_mgr/channel_utils.h"
32 #include "rmapi/rs_utils.h"
33 #include "utils/nvassert.h"
34 #include "core/prelude.h"
35 #include "core/locks.h"
36 #include "gpu/mem_mgr/ce_utils.h"
37 #include "gpu/subdevice/subdevice.h"
38 #include "kernel/gpu/mem_mgr/ce_utils_sizes.h"
39 
40 #include "class/clb0b5.h" // MAXWELL_DMA_COPY_A
41 #include "class/clc0b5.h" // PASCAL_DMA_COPY_A
42 #include "class/clc1b5.h" // PASCAL_DMA_COPY_B
43 #include "class/clc3b5.h" // VOLTA_DMA_COPY_A
44 #include "class/clc5b5.h" // TURING_DMA_COPY_A
45 #include "class/clc8b5.h" // HOPPER_DMA_COPY_A
46 #include "class/clc86f.h" // HOPPER_CHANNEL_GPFIFO_A
47 
48 #include "class/cl0080.h"
49 
50 NV_STATUS
51 ceutilsConstruct_IMPL
52 (
53     CeUtils                      *pCeUtils,
54     OBJGPU                       *pGpu,
55     NV0050_ALLOCATION_PARAMETERS *pParams
56 )
57 {
58     NV_ASSERT_OR_RETURN(pGpu, NV_ERR_INVALID_STATE);
59     return ceutilsInitialize(pCeUtils, pGpu, pParams);
60 }
61 
62 
63 // This is used by internal callsites without resource server
64 NV_STATUS
65 ceutilsInitialize
66 (
67     CeUtils                      *pCeUtils,
68     OBJGPU                       *pGpu,
69     NV0050_ALLOCATION_PARAMETERS *pAllocParams
70 )
71 {
72     NV_STATUS status = NV_OK;
73     NvU64 allocFlags = pAllocParams->flags;
74     NV_ASSERT_OR_RETURN(pGpu, NV_ERR_INVALID_STATE);
75 
76     NvBool bMIGInUse = IS_MIG_IN_USE(pGpu);
77     MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
78 
79     // Allocate channel with RM internal client
80     RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
81     RmClient *pClient;
82 
83     OBJCHANNEL *pChannel = (OBJCHANNEL *) portMemAllocNonPaged(sizeof(OBJCHANNEL));
84     if (pChannel == NULL)
85     {
86         return NV_ERR_INSUFFICIENT_RESOURCES;
87     }
88 
89     portMemSet(pChannel, 0, sizeof(OBJCHANNEL));
90 
91     if (pCeUtils->hClient == NV01_NULL_OBJECT)
92     {
93         // Allocate client
94         status = pRmApi->AllocWithHandle(pRmApi, NV01_NULL_OBJECT, NV01_NULL_OBJECT,
95                                          NV01_NULL_OBJECT, NV01_ROOT, &pCeUtils->hClient,
96                                          sizeof(pCeUtils->hClient));
97         NV_ASSERT_OR_GOTO(status == NV_OK, cleanup);
98     }
99 
100     pChannel->hClient = pCeUtils->hClient;
101     pClient = serverutilGetClientUnderLock(pChannel->hClient);
102     NV_ASSERT_OR_GOTO(pClient != NULL, free_client);
103 
104     status = serverGetClientUnderLock(&g_resServ, pChannel->hClient, &pChannel->pRsClient);
105     NV_ASSERT_OR_GOTO(status == NV_OK, free_client);
106 
107     status = clientSetHandleGenerator(staticCast(pClient, RsClient), 1U, ~0U - 1U);
108     NV_ASSERT_OR_GOTO(status == NV_OK, free_client);
109 
110     pChannel->bClientAllocated = NV_TRUE;
111     pChannel->pGpu = pGpu;
112 
113     pChannel->deviceId = pCeUtils->hDevice;
114     pChannel->subdeviceId = pCeUtils->hSubdevice;
115 
116     // We'll allocate new VAS for now. Sharing client VAS will be added later
117     pChannel->hVASpaceId = NV01_NULL_OBJECT;
118     pChannel->bUseVasForCeCopy = FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _VIRTUAL_MODE, _TRUE, allocFlags);
119 
120     // Detect if we can enable fast scrub on this channel
121     status = memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &pCeUtils->hTdCopyClass);
122     NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);
123 
124     if (((pCeUtils->hTdCopyClass == HOPPER_DMA_COPY_A)
125         ) && !pChannel->bUseVasForCeCopy)
126     {
127         pChannel->type = FAST_SCRUBBER_CHANNEL;
128         NV_PRINTF(LEVEL_INFO, "Enabled fast scrubber in construct.\n");
129     }
130 
131     // Set up various channel resources
132     status = channelSetupIDs(pChannel, pGpu, pChannel->bUseVasForCeCopy, bMIGInUse);
133     NV_ASSERT_OR_GOTO(status == NV_OK, free_client);
134 
135     channelSetupChannelBufferSizes(pChannel);
136 
137     if (pCeUtils->pKernelMIGGPUInstance != NULL)
138     {
139         pChannel->pKernelMIGGpuInstance = pCeUtils->pKernelMIGGPUInstance;
140     }
141 
142     status = memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pChannel);
143     NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);
144 
145     NV_PRINTF(LEVEL_INFO, "Channel alloc successful for ceUtils\n");
146     pCeUtils->pChannel = pChannel;
147 
148     // Allocate CE states
149     status = memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pChannel);
150     NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);
151 
152     pCeUtils->pGpu = pGpu;
153 
154     return status;
155 
156 free_channel:
157     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->channelId);
158 
159     if (pAllocParams->hVaspace != NV01_NULL_OBJECT)
160     {
161         pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hVASpaceId);
162     }
163 free_client:
164     if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _FALSE, allocFlags))
165     {
166         // If client allocated client, we should not free it in RM
167         pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
168     }
169 
170 cleanup:
171     portMemFree(pChannel);
172     return status;
173 }
174 
175 void
176 ceutilsDeinit
177 (
178     CeUtils *pCeUtils
179 )
180 {
181     OBJCHANNEL *pChannel = pCeUtils->pChannel;
182     OBJGPU *pGpu = pCeUtils->pGpu;
183     RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
184 
185     // Sanity checks
186     if ((pGpu == NULL) || (pChannel == NULL))
187     {
188         NV_PRINTF(LEVEL_WARNING, "Possible double-free of CeUtils!\n");
189         return;
190     }
191     else if (pGpu != pChannel->pGpu)
192     {
193         NV_PRINTF(LEVEL_ERROR, "Bad state during ceUtils teardown!\n");
194         return;
195     }
196 
197     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->channelId);
198     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->errNotifierIdPhys);
199     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->pushBufferId);
200     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->errNotifierIdVirt);
201     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hVASpaceId);
202     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->deviceId);
203 
204     // Resource server makes sure no leak can occur
205     pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
206     portMemFree(pChannel);
207 }
208 
//
// CeUtils destructor — releases the channel and all RM resources via
// ceutilsDeinit().
//
void
ceutilsDestruct_IMPL
(
    CeUtils *pCeUtils
)
{
    ceutilsDeinit(pCeUtils);
}
217 
218 void
219 ceutilsServiceInterrupts(CeUtils *pCeUtils)
220 {
221     OBJCHANNEL *pChannel = pCeUtils->pChannel;
222 
223     //
224     // FIXME: Bug 2463959: objmemscrub is called with the rmDeviceGpuLock in the
225     // heapFree_IMPL->_stdmemPmaFree->pmaFreePages->scrubSubmitPages path.
226     // Yielding while holding the rmDeviceGpuLock can lead to deadlock. Instead,
227     // if the lock is held, service any interrupts on the owned CE to make progress.
228     // Bug 2527660 is filed to remove this change.
229     //
230     // pChannel is null when PMA scrub requests are handled in vGPU plugin.
231     // In this case vGpu plugin allocates scrubber channel in PF domain so
232     // above mention deadlock is not present here.
233     //
234     if ((pChannel != NULL) && (rmDeviceGpuLockIsOwner(pChannel->pGpu->gpuInstance)))
235     {
236         Intr *pIntr = GPU_GET_INTR(pChannel->pGpu);
237         intrServiceStallSingle_HAL(pChannel->pGpu, pIntr, MC_ENGINE_IDX_CE(pChannel->ceId), NV_FALSE);
238     }
239     else
240     {
241         osSchedule();
242     }
243 }
244 
245 
246 static NvBool
247 _ceUtilsFastScrubEnabled
248 (
249     POBJCHANNEL      pChannel,
250     CHANNEL_PB_INFO *pChannelPbInfo
251 )
252 {
253     OBJGPU *pGpu = pChannel->pGpu;
254     MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
255 
256     if (!memmgrIsFastScrubberEnabled(pMemoryManager))
257     {
258         return NV_FALSE;
259     }
260 
261     //
262     // Enable the  memory fast scrubbing only when
263     // Channel was allocated as fastScrub channel
264     // We are doing a memset operation
265     // Memset pattern is 0
266     // DstPhysMode.target == LOCAL_FB
267     // Address is 4KB aligned
268     // LineLength is 4KB aligned
269     //
270 
271     return ((pChannel->type == FAST_SCRUBBER_CHANNEL) &&
272             (!pChannelPbInfo->bCeMemcopy) &&
273             (pChannelPbInfo->pattern == 0) &&
274             (pChannelPbInfo->dstAddressSpace == ADDR_FBMEM) &&
275             (NV_IS_ALIGNED64(pChannelPbInfo->dstAddr, MEMUTIL_SCRUB_OFFSET_ALIGNMENT)) &&
276             (NV_IS_ALIGNED(pChannelPbInfo->size, MEMUTIL_SCRUB_LINE_LENGTH_ALIGNMENT)));
277 }
278 
279 
280 //
281 // Helper to deal with CE_MAX_BYTES_PER_LINE
282 // This function may modify some fileds in pChannelPbInfo
283 //
284 static NV_STATUS
285 _ceutilsSubmitPushBuffer
286 (
287     POBJCHANNEL       pChannel,
288     NvU64             opLength,
289     CHANNEL_PB_INFO * pChannelPbInfo
290 
291 )
292 {
293     NV_STATUS status = NV_OK;
294     NvBool bFirstIteration = NV_TRUE;
295     NvBool bInsertFinishPayload = NV_FALSE;
296     NvU32 methodsLength, tempSize, putIndex = 0;
297     NvU64 remainingLength = opLength;
298 
299     NV_ASSERT_OR_RETURN(pChannelPbInfo != NULL, NV_ERR_INVALID_ARGUMENT);
300     NV_ASSERT_OR_RETURN(pChannel != NULL, NV_ERR_INVALID_ARGUMENT);
301     NV_ASSERT_OR_RETURN(opLength != 0, NV_ERR_INVALID_ARGUMENT);
302 
303     do
304     {
305         tempSize = (NvU32)NV_MIN(remainingLength, CE_MAX_BYTES_PER_LINE);
306         pChannelPbInfo->size = tempSize;
307         bInsertFinishPayload = (remainingLength == tempSize);
308         NV_PRINTF(LEVEL_INFO, "Actual size of copying to be pushed: %x \n", tempSize);
309 
310         status = channelWaitForFreeEntry(pChannel, &putIndex);
311         if (status != NV_OK)
312         {
313             NV_PRINTF(LEVEL_ERROR, "Cannot get putIndex.\n");
314             return status;
315         }
316 
317         if (_ceUtilsFastScrubEnabled(pChannel, pChannelPbInfo))
318         {
319             methodsLength = channelFillPbFastScrub(pChannel, putIndex, bFirstIteration, bInsertFinishPayload, pChannelPbInfo);
320         }
321         else
322         {
323             methodsLength = channelFillPb(pChannel, putIndex, bFirstIteration, bInsertFinishPayload, pChannelPbInfo);
324         }
325         if (methodsLength == 0)
326         {
327             NV_PRINTF(LEVEL_ERROR, "Cannot push methods to channel.\n");
328             return NV_ERR_NO_FREE_FIFOS;
329         }
330 
331         status = channelFillGpFifo(pChannel, putIndex, methodsLength);
332         if (status != NV_OK)
333         {
334             NV_PRINTF(LEVEL_ERROR, "Channel operation failures during memcopy\n");
335             return status;
336         }
337 
338         pChannel->lastSubmittedEntry = putIndex;
339         remainingLength -= tempSize;
340 
341         pChannelPbInfo->dstAddr += tempSize;
342         pChannelPbInfo->srcAddr += tempSize;
343 
344         bFirstIteration = NV_FALSE;
345     } while (remainingLength > 0);
346 
347     return status;
348 }
349 
350 
//
// Submits a CE memset of pParams->length bytes starting at pParams->offset
// within pParams->pMemDesc (must be vidmem on this CeUtils' GPU), splitting
// the request into physically contiguous chunks. Synchronous unless
// NV0050_CTRL_MEMSET_FLAGS_ASYNC is set, in which case the submitted payload
// id is returned in pParams->submittedWorkId for later progress polling.
//
NV_STATUS
ceutilsMemset_IMPL
(
    CeUtils *pCeUtils,
    CEUTILS_MEMSET_PARAMS *pParams
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    NV_STATUS   status = NV_OK;

    NvU32 pteArraySize;
    NvU64 offset, memsetLength, size, pageGranularity;
    NvBool bContiguous;

    MEMORY_DESCRIPTOR *pMemDesc = pParams->pMemDesc;
    CHANNEL_PB_INFO channelPbInfo = {0};

    if (pMemDesc == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memdesc for CeUtils memset.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    // Only local FB memory on the channel's own GPU is supported
    if ((memdescGetAddressSpace(pMemDesc) != ADDR_FBMEM) ||
        (pMemDesc->pGpu != pCeUtils->pChannel->pGpu))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memory descriptor passed.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    size = memdescGetSize(pMemDesc);
    pteArraySize = memdescGetPteArraySize(pMemDesc, AT_GPU);
    //
    // A single-entry PTE array is treated as contiguous too.
    // NOTE(review): ceutilsMemcopy derives contiguity via
    // memdescGetContiguity() instead — confirm the two are equivalent here.
    //
    bContiguous = (pMemDesc->_flags & MEMDESC_FLAGS_PHYSICALLY_CONTIGUOUS) || (pteArraySize == 1);

    if (pParams->offset >= size)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the memdesc.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    NV_PRINTF(LEVEL_INFO, "CeUtils Args to memset - offset: %llx, size: %llx \n",
              pParams->offset, pParams->length);

    // Length must be non-zero and the [offset, offset+length) range in bounds
    if ((pParams->length == 0) || (pParams->length > (size - pParams->offset)))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memset length.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    // Payload ids increase monotonically with every submission
    channelPbInfo.bCeMemcopy = NV_FALSE;
    channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
    pCeUtils->lastSubmittedPayload = channelPbInfo.payload;

    channelPbInfo.pattern = pParams->pattern;
    channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pMemDesc);
    channelPbInfo.dstCpuCacheAttrib = pMemDesc->_cpuCacheAttrib;

    pageGranularity = pMemDesc->pageArrayGranularity;
    memsetLength = pParams->length;
    offset = pParams->offset;

    //
    // Walk the range one physically contiguous span at a time; for
    // non-contiguous memory each span ends at the next page boundary
    //
    do
    {
        NvU64 maxContigSize = bContiguous ? memsetLength : (pageGranularity - offset % pageGranularity);
        NvU64 memsetSizeContig = NV_MIN(memsetLength, maxContigSize);

        channelPbInfo.dstAddr = memdescGetPhysAddr(pMemDesc, AT_GPU, offset);

        NV_PRINTF(LEVEL_INFO, "CeUtils Memset dstAddr: %llx,  size: %llx\n",
                  channelPbInfo.dstAddr, memsetSizeContig);

        status = _ceutilsSubmitPushBuffer(pChannel, memsetSizeContig, &channelPbInfo);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memset.\n");
            return status;
        }

        memsetLength -= memsetSizeContig;
        offset       += memsetSizeContig;
    } while (memsetLength != 0);

    if (pParams->flags & NV0050_CTRL_MEMSET_FLAGS_ASYNC)
    {
        // Async: hand the payload id back to the caller for progress checks
        NV_PRINTF(LEVEL_INFO, "Async memset payload returned: 0x%x\n", channelPbInfo.payload);
        pParams->submittedWorkId = channelPbInfo.payload;
    }
    else
    {
        // Check semaProgress and then timeout
        status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
        if (status == NV_OK)
        {
            NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
        }
    }

    return status;
}
450 
451 NV_STATUS
452 ceutilsMemcopy_IMPL
453 (
454     CeUtils *pCeUtils,
455     CEUTILS_MEMCOPY_PARAMS *pParams
456 )
457 {
458     OBJCHANNEL *pChannel = pCeUtils->pChannel;
459     NV_STATUS   status = NV_OK;
460 
461     NvU64  srcSize, dstSize, copyLength, srcPageGranularity, dstPageGranularity;
462     NvBool bSrcContig, bDstContig;
463 
464     CHANNEL_PB_INFO channelPbInfo  = {0};
465     MEMORY_DESCRIPTOR *pDstMemDesc = pParams->pDstMemDesc;
466     MEMORY_DESCRIPTOR *pSrcMemDesc = pParams->pSrcMemDesc;
467 
468     NvU64 length = pParams->length;
469     NvU64 srcOffset = pParams->srcOffset;
470     NvU64 dstOffset = pParams->dstOffset;
471 
472     // Validate params
473     if ((pSrcMemDesc == NULL) || (pDstMemDesc == NULL))
474     {
475         NV_PRINTF(LEVEL_ERROR, "Src/Dst Memory descriptor should be valid.\n");
476         return NV_ERR_INVALID_ARGUMENT;
477     }
478 
479     if ((memdescGetAddressSpace(pSrcMemDesc) != ADDR_FBMEM) &&
480         (memdescGetAddressSpace(pDstMemDesc) != ADDR_FBMEM))
481     {
482         NV_PRINTF(LEVEL_ERROR, "Either Dst or Src memory should be in vidmem.\n");
483         return NV_ERR_INVALID_ARGUMENT;
484     }
485 
486     if ((pSrcMemDesc->pGpu != pCeUtils->pChannel->pGpu) ||
487         (pDstMemDesc->pGpu != pCeUtils->pChannel->pGpu))
488     {
489         NV_PRINTF(LEVEL_ERROR, "CeUtils does not support p2p copies right now. \n");
490         return NV_ERR_INVALID_ARGUMENT;
491     }
492 
493     srcSize = memdescGetSize(pSrcMemDesc);
494     dstSize = memdescGetSize(pDstMemDesc);
495 
496     if ((srcOffset >= srcSize) || (dstOffset >= dstSize))
497     {
498         NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the src/dst memdesc.\n");
499         return NV_ERR_INVALID_ARGUMENT;
500     }
501 
502     if ((length == 0) ||
503         (srcOffset + length > srcSize) || (dstOffset + length > dstSize))
504     {
505         NV_PRINTF(LEVEL_ERROR, "Invalid memcopy length.\n");
506         return NV_ERR_INVALID_ARGUMENT;
507     }
508 
509     channelPbInfo.bCeMemcopy = NV_TRUE;
510     channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
511     pCeUtils->lastSubmittedPayload = channelPbInfo.payload;
512 
513     channelPbInfo.srcAddressSpace = memdescGetAddressSpace(pSrcMemDesc);
514     channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pDstMemDesc);
515 
516     channelPbInfo.srcCpuCacheAttrib = pSrcMemDesc->_cpuCacheAttrib;
517     channelPbInfo.dstCpuCacheAttrib = pDstMemDesc->_cpuCacheAttrib;
518 
519     srcPageGranularity = pSrcMemDesc->pageArrayGranularity;
520     dstPageGranularity = pDstMemDesc->pageArrayGranularity;
521     bSrcContig = memdescGetContiguity(pSrcMemDesc, AT_GPU);
522     bDstContig = memdescGetContiguity(pDstMemDesc, AT_GPU);
523 
524     copyLength = length;
525 
526     do
527     {
528         //
529         // This algorithm finds the maximum contig region from both src and dst
530         // for each copy and iterate until we submitted the whole range to CE
531         //
532         NvU64 maxContigSizeSrc = bSrcContig ? copyLength : (srcPageGranularity - srcOffset % srcPageGranularity);
533         NvU64 maxContigSizeDst = bDstContig ? copyLength : (dstPageGranularity - dstOffset % dstPageGranularity);
534         NvU64 copySizeContig = NV_MIN(copyLength, NV_MIN(maxContigSizeSrc, maxContigSizeDst));
535 
536         channelPbInfo.srcAddr = memdescGetPhysAddr(pSrcMemDesc, AT_GPU, srcOffset);
537         channelPbInfo.dstAddr = memdescGetPhysAddr(pDstMemDesc, AT_GPU, dstOffset);
538 
539         NV_PRINTF(LEVEL_INFO, "CeUtils Memcopy dstAddr: %llx, srcAddr: %llx, size: %llx\n",
540                   channelPbInfo.dstAddr, channelPbInfo.srcAddr, copySizeContig);
541 
542         status = _ceutilsSubmitPushBuffer(pChannel, copySizeContig, &channelPbInfo);
543         if (status != NV_OK)
544         {
545             NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memcopy.\n");
546             return status;
547         }
548 
549         copyLength -= copySizeContig;
550         srcOffset  += copySizeContig;
551         dstOffset  += copySizeContig;
552     } while (copyLength != 0);
553 
554     if (pParams->flags & NV0050_CTRL_MEMSET_FLAGS_ASYNC)
555     {
556         NV_PRINTF(LEVEL_INFO, "Async memset payload returned: 0x%x\n", channelPbInfo.payload);
557         pParams->submittedWorkId = channelPbInfo.payload;
558     }
559     else
560     {
561         // Check semaProgress and then timeout
562         status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
563         if (status == NV_OK)
564         {
565             NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
566         }
567     }
568 
569     return status;
570 }
571 
572 
573 // This function updates pCeUtils->lastCompletedPayload and handles wrap-around
574 NvU64
575 ceutilsUpdateProgress_IMPL
576 (
577     CeUtils *pCeUtils
578 )
579 {
580     NV_ASSERT((pCeUtils != NULL) && (pCeUtils->pChannel != NULL));
581 
582     NvU32 hwCurrentCompletedPayload = 0;
583     NvU64 swLastCompletedPayload = pCeUtils->lastCompletedPayload;
584 
585     //
586     // CeUtils uses 64 bit index to track the work submitted. But HW supports
587     // only 32 bit semaphore. The current completed Id is calculated here, based
588     // on the lastSubmittedPayload and current HW semaphore value.
589     //
590     hwCurrentCompletedPayload = READ_CHANNEL_PAYLOAD_SEMA(pCeUtils->pChannel);
591 
592     // No work has been completed since we checked last time
593     if (hwCurrentCompletedPayload == (NvU32)swLastCompletedPayload)
594     {
595         return swLastCompletedPayload;
596     }
597 
598     // Check for wrap around case. Increment the upper 32 bits
599     if (hwCurrentCompletedPayload < (NvU32)swLastCompletedPayload)
600     {
601         swLastCompletedPayload += 0x100000000ULL;
602     }
603 
604     // Update lower 32 bits regardless if wrap-around happened
605     swLastCompletedPayload &= 0xFFFFFFFF00000000ULL;
606     swLastCompletedPayload |= (NvU64)hwCurrentCompletedPayload;
607 
608     pCeUtils->lastCompletedPayload = swLastCompletedPayload;
609     return swLastCompletedPayload;
610 }
611 
612 
//
// Records the MIG GPU instance this CeUtils object belongs to.
// ceutilsInitialize() copies this pointer into the channel it creates, so
// register the instance before initializing for it to take effect.
//
void
ceutilsRegisterGPUInstance
(
    CeUtils *pCeUtils,
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGPUInstance
)
{
    pCeUtils->pKernelMIGGPUInstance = pKernelMIGGPUInstance;
}
622 
623 #if defined(DEBUG) || defined (DEVELOP)
624 NV_STATUS
625 ceutilsapiCtrlCmdCheckProgress_IMPL
626 (
627     CeUtilsApi *pCeUtilsApi,
628     NV0050_CTRL_CHECK_PROGRESS_PARAMS *pParams
629 )
630 {
631     if (pParams->submittedWorkId <= ceutilsUpdateProgress(pCeUtilsApi->pCeUtils))
632     {
633         pParams->result = NV0050_CTRL_CHECK_PROGRESS_RESULT_FINISHED;
634     }
635 
636     return NV_OK;
637 }
638 
639 NV_STATUS
640 ceutilsapiConstruct_IMPL
641 (
642     CeUtilsApi                   *pCeUtilsApi,
643     CALL_CONTEXT                 *pCallContext,
644     RS_RES_ALLOC_PARAMS_INTERNAL *pParams
645 )
646 {
647     NV0050_ALLOCATION_PARAMETERS *pAllocParams = pParams->pAllocParams;
648 
649     if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _TRUE, pAllocParams->flags))
650     {
651         NV_PRINTF(LEVEL_ERROR, "CeUtils: unsupported flags = 0x%llx\n", pAllocParams->flags);
652         return NV_ERR_NOT_SUPPORTED;
653     }
654 
655     return objCreate(&pCeUtilsApi->pCeUtils, pCeUtilsApi, CeUtils, GPU_RES_GET_GPU(pCeUtilsApi), pAllocParams);
656 }
657 
//
// CeUtilsApi destructor — deletes the owned CeUtils object, which tears
// down the channel and its RM resources via its own destructor.
//
void
ceutilsapiDestruct_IMPL
(
    CeUtilsApi *pCeUtilsApi
)
{
    objDelete(pCeUtilsApi->pCeUtils);
}
666 
667 NV_STATUS
668 ceutilsapiCtrlCmdMemset_IMPL
669 (
670     CeUtilsApi *pCeUtilsApi,
671     NV0050_CTRL_MEMSET_PARAMS *pParams
672 )
673 {
674     NV_STATUS          status = NV_OK;
675     NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
676     RsResourceRef     *pPhysmemRef;
677     MEMORY_DESCRIPTOR *pMemDesc = NULL;
678     CEUTILS_MEMSET_PARAMS internalParams = {0};
679 
680     if (pParams->hMemory == 0)
681     {
682         return NV_ERR_INVALID_ARGUMENT;
683     }
684 
685     status = serverutilGetResourceRef(hClient, pParams->hMemory, &pPhysmemRef);
686     if (status != NV_OK)
687     {
688         NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
689         return status;
690     }
691     pMemDesc = (dynamicCast(pPhysmemRef->pResource, Memory))->pMemDesc;
692 
693     internalParams.pMemDesc = pMemDesc;
694     internalParams.offset = pParams->offset;
695     internalParams.length = pParams->length;
696     internalParams.pattern = pParams->pattern;
697     internalParams.flags = pParams->flags;
698 
699     status = ceutilsMemset(pCeUtilsApi->pCeUtils, &internalParams);
700     if (status == NV_OK)
701     {
702         pParams->submittedWorkId = internalParams.submittedWorkId;
703     }
704 
705     return status;
706 }
707 
708 NV_STATUS
709 ceutilsapiCtrlCmdMemcopy_IMPL
710 (
711     CeUtilsApi *pCeUtilsApi,
712     NV0050_CTRL_MEMCOPY_PARAMS *pParams
713 )
714 {
715     NV_STATUS          status = NV_OK;
716     NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
717     RsResourceRef     *pSrcPhysmemRef;
718     RsResourceRef     *pDstPhysmemRef;
719     MEMORY_DESCRIPTOR *pSrcMemDesc = NULL;
720     MEMORY_DESCRIPTOR *pDstMemDesc = NULL;
721     CEUTILS_MEMCOPY_PARAMS internalParams = {0};
722 
723     if ((pParams->hSrcMemory == 0) || (pParams->hDstMemory == 0))
724     {
725         return NV_ERR_INVALID_ARGUMENT;
726     }
727 
728     status = serverutilGetResourceRef(hClient, pParams->hDstMemory, &pDstPhysmemRef);
729     if (status != NV_OK)
730     {
731         NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
732         return status;
733     }
734     pDstMemDesc = (dynamicCast(pDstPhysmemRef->pResource, Memory))->pMemDesc;
735 
736     status = serverutilGetResourceRef(hClient, pParams->hSrcMemory, &pSrcPhysmemRef);
737     if (status != NV_OK)
738     {
739         NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
740         return status;
741     }
742     pSrcMemDesc = (dynamicCast(pSrcPhysmemRef->pResource, Memory))->pMemDesc;
743 
744     internalParams.pSrcMemDesc = pSrcMemDesc;
745     internalParams.pDstMemDesc = pDstMemDesc;
746     internalParams.srcOffset = pParams->srcOffset;
747     internalParams.dstOffset = pParams->dstOffset;
748     internalParams.length = pParams->length;
749     internalParams.flags = pParams->flags;
750 
751     status = ceutilsMemcopy(pCeUtilsApi->pCeUtils, &internalParams);
752     if (status == NV_OK)
753     {
754         pParams->submittedWorkId = internalParams.submittedWorkId;
755     }
756 
757     return status;
758 }
759 #endif // defined(DEBUG) || defined (DEVELOP)
760