/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


#include "core/core.h"
#include "gpu/ce/kernel_ce.h"
#include "gpu/bus/kern_bus.h"
#include "kernel/gpu/intr/intr.h"
#include "kernel/gpu/fifo/kernel_fifo.h"
#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "kernel/gpu/mem_mgr/channel_utils.h"
#include "rmapi/rs_utils.h"
#include "utils/nvassert.h"
#include "core/prelude.h"
#include "core/locks.h"
#include "gpu/mem_mgr/ce_utils.h"
#include "kernel/gpu/mem_mgr/ce_utils_sizes.h"
#include "vgpu/rpc_headers.h"
#include "gpu/device/device.h"

#include "class/clb0b5.h" // MAXWELL_DMA_COPY_A
#include "class/clc0b5.h" // PASCAL_DMA_COPY_A
#include "class/clc1b5.h" // PASCAL_DMA_COPY_B
#include "class/clc3b5.h" // VOLTA_DMA_COPY_A
#include "class/clc5b5.h" // TURING_DMA_COPY_A
#include "class/clc8b5.h" // HOPPER_DMA_COPY_A
#include "class/clc86f.h" // HOPPER_CHANNEL_GPFIFO_A

#include "class/cl0080.h"

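//
// Pick a copy engine instance for CeUtils use. With MIG enabled, this returns
// the scrubber CE assigned to the device's GPU instance; otherwise it walks
// the CE list and returns the first async CE (not a GRCE) that is present in
// the engine table.
//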
static NV_STATUS _memUtilsGetCe
(
    OBJGPU *pGpu,
    NvHandle hClient,
    NvHandle hDevice,
    NvU32 *pCeInstance
)
{
    if (IS_MIG_IN_USE(pGpu))
    {
        RsClient *pClient;
        Device *pDevice;

        NV_ASSERT_OK_OR_RETURN(
            serverGetClientUnderLock(&g_resServ, hClient, &pClient));

        NV_ASSERT_OK_OR_RETURN(
            deviceGetByHandle(pClient, hDevice, &pDevice));

        NV_ASSERT_OK_OR_RETURN(kmigmgrGetGPUInstanceScrubberCe(pGpu, GPU_GET_KERNEL_MIG_MANAGER(pGpu), pDevice, pCeInstance));
        return NV_OK;
    }
    else
    {
        KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu);

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, gpuUpdateEngineTable(pGpu));

        KernelCE  *pKCe = NULL;

        KCE_ITER_ALL_BEGIN(pGpu, pKCe, 0)
            if (kbusCheckEngine_HAL(pGpu, pKernelBus, ENG_CE(pKCe->publicID)) &&
               !ceIsCeGrce(pGpu, RM_ENGINE_TYPE_COPY(pKCe->publicID)) &&
               gpuCheckEngineTable(pGpu, RM_ENGINE_TYPE_COPY(pKCe->publicID)))
            {
                *pCeInstance = pKCe->publicID;
                return NV_OK;
            }
        KCE_ITER_END
    }

    return NV_ERR_INSUFFICIENT_RESOURCES;
}

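//
// Construct a CeUtils instance: allocate an internal RM client (unless one is
// already set), create a scrubber channel on the selected copy engine, and
// decide between the fast-scrubber and standard CE scrubber channel types.
//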
NV_STATUS
ceutilsConstruct_IMPL
(
    CeUtils                      *pCeUtils,
    OBJGPU                       *pGpu,
    KERNEL_MIG_GPU_INSTANCE      *pKernelMIGGPUInstance,
    NV0050_ALLOCATION_PARAMETERS *pAllocParams
)
{
    NV_STATUS status = NV_OK;
    NvU64 allocFlags = pAllocParams->flags;
    NvBool bForceCeId = FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _FORCE_CE_ID, _TRUE, allocFlags);
    NV_ASSERT_OR_RETURN(pGpu, NV_ERR_INVALID_STATE);

    NvBool bMIGInUse = IS_MIG_IN_USE(pGpu);
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

    pCeUtils->pGpu = pGpu;

    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _FIFO_LITE, _TRUE, allocFlags))
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    // Allocate channel with RM internal client
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    RmClient *pClient;

    OBJCHANNEL *pChannel = (OBJCHANNEL *) portMemAllocNonPaged(sizeof(OBJCHANNEL));
    if (pChannel == NULL)
    {
        return NV_ERR_INSUFFICIENT_RESOURCES;
    }

    portMemSet(pChannel, 0, sizeof(OBJCHANNEL));

    if (pCeUtils->hClient == NV01_NULL_OBJECT)
    {
        // Allocate client
        status = pRmApi->AllocWithHandle(pRmApi, NV01_NULL_OBJECT, NV01_NULL_OBJECT,
                                         NV01_NULL_OBJECT, NV01_ROOT, &pCeUtils->hClient,
                                         sizeof(pCeUtils->hClient));
        NV_ASSERT_OR_GOTO(status == NV_OK, cleanup);
    }

    pChannel->hClient = pCeUtils->hClient;
    pClient = serverutilGetClientUnderLock(pChannel->hClient);
    NV_ASSERT_OR_GOTO(pClient != NULL, free_client);

    status = serverGetClientUnderLock(&g_resServ, pChannel->hClient, &pChannel->pRsClient);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_client);

    if (IS_VIRTUAL(pGpu))
    {
        NV_ASSERT_OK_OR_GOTO(
            status,
            clientSetHandleGenerator(staticCast(pClient, RsClient), RS_UNIQUE_HANDLE_BASE,
                                     RS_UNIQUE_HANDLE_RANGE/2 - VGPU_RESERVED_HANDLE_RANGE),
            free_client);
    }
    else
    {
        NV_ASSERT_OK_OR_GOTO(
            status,
            clientSetHandleGenerator(staticCast(pClient, RsClient), 1U, ~0U - 1U),
            free_client);
    }

    pChannel->bClientAllocated = NV_TRUE;
    pChannel->pGpu = pGpu;
    pChannel->pKernelMIGGpuInstance = pKernelMIGGPUInstance;

    // We'll allocate new VAS for now. Sharing client VAS will be added later
    pChannel->hVASpaceId = NV01_NULL_OBJECT;
    pChannel->bUseVasForCeCopy = FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _VIRTUAL_MODE, _TRUE, allocFlags);

    pChannel->bSecure = FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _CC_SECURE, _TRUE, allocFlags);

    // Detect if we can enable fast scrub on this channel
    status = memmgrMemUtilsGetCopyEngineClass_HAL(pGpu, pMemoryManager, &pCeUtils->hTdCopyClass);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    if ((pCeUtils->hTdCopyClass == HOPPER_DMA_COPY_A) && !pChannel->bUseVasForCeCopy)
    {
        pChannel->type = FAST_SCRUBBER_CHANNEL;
        NV_PRINTF(LEVEL_INFO, "Enabled fast scrubber in construct.\n");
    }
    else
    {
        pChannel->type = CE_SCRUBBER_CHANNEL;
    }
    // For self-hosted Hopper, we can only use VA copy or the fast scrubber
    if (pMemoryManager->bCePhysicalVidmemAccessNotSupported)
    {
        if (!pChannel->bUseVasForCeCopy &&
            (pChannel->type != FAST_SCRUBBER_CHANNEL))
        {
            status = NV_ERR_NOT_SUPPORTED;
            goto free_channel;
        }
    }

    // Set up various channel resources
    status = channelSetupIDs(pChannel, pGpu, pChannel->bUseVasForCeCopy, bMIGInUse);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_client);

    channelSetupChannelBufferSizes(pChannel);

    NV_ASSERT_OK_OR_GOTO(status, channelAllocSubdevice(pGpu, pChannel), free_client);

    if (bForceCeId)
    {
        pChannel->ceId = pAllocParams->forceCeId;
    }
    else
    {
        NV_ASSERT_OK_OR_GOTO(status,
            _memUtilsGetCe(pGpu, pChannel->hClient, pChannel->deviceId, &pChannel->ceId),
            free_client);
    }

    status = memmgrMemUtilsChannelInitialize_HAL(pGpu, pMemoryManager, pChannel);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    NV_PRINTF(LEVEL_INFO, "Channel alloc successful for ceUtils\n");
    pCeUtils->pChannel = pChannel;

    // Allocate CE states
    status = memmgrMemUtilsCopyEngineInitialize_HAL(pGpu, pMemoryManager, pChannel);
    NV_ASSERT_OR_GOTO(status == NV_OK, free_channel);

    return status;

free_channel:
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->channelId);

    if (pAllocParams->hVaspace != NV01_NULL_OBJECT)
    {
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hVASpaceId);
    }
free_client:
    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _FALSE, allocFlags))
    {
        // Only free the client if RM allocated it internally; an externally
        // provided client must not be freed here
        pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    }

cleanup:
    portMemFree(pChannel);
    return status;
}

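//
// Tear down a CeUtils instance: release any BAR1 mappings still held for
// USERD, the pushbuffer, and the notifier (warning if BAR access is blocked
// and a mapping would leak), then free the RM client and the channel object.
//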
void
ceutilsDestruct_IMPL
(
    CeUtils *pCeUtils
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    OBJGPU *pGpu = pCeUtils->pGpu;
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

    if ((pChannel->bClientUserd) && (pChannel->pControlGPFifo != NULL))
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            //
            // When PCIE is blocked, mappings should be created, used, and torn
            // down at the point of use
            //
            NV_PRINTF(LEVEL_ERROR, "Leaked USERD mapping from ceUtils!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pUserdMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pControlGPFifo = NULL;
        }
    }

    if (pChannel->pbCpuVA != NULL)
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            NV_PRINTF(LEVEL_ERROR, "Leaked pushbuffer mapping!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pbCpuVA = NULL;
        }
    }

    if (pChannel->pTokenFromNotifier != NULL)
    {
        if (kbusIsBarAccessBlocked(GPU_GET_KERNEL_BUS(pGpu)))
        {
            NV_PRINTF(LEVEL_ERROR, "Leaked notifier mapping!\n");
        }
        else
        {
            memmgrMemDescEndTransfer(pMemoryManager, pChannel->pErrNotifierMemdesc, TRANSFER_FLAGS_USE_BAR1);
            pChannel->pTokenFromNotifier = NULL;
        }
    }

    // Resource server makes sure no leak can occur
    pRmApi->Free(pRmApi, pChannel->hClient, pChannel->hClient);
    portMemFree(pChannel);
}

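//
// Service pending interrupts on the CE owned by the CeUtils channel when the
// caller already holds the GPU lock; otherwise yield the CPU so other threads
// can make progress (see the FIXME below).
//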
void
ceutilsServiceInterrupts_IMPL(CeUtils *pCeUtils)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;

    //
    // FIXME: Bug 2463959: objmemscrub is called with the rmDeviceGpuLock in the
    // heapFree_IMPL->_stdmemPmaFree->pmaFreePages->scrubSubmitPages path.
    // Yielding while holding the rmDeviceGpuLock can lead to deadlock. Instead,
    // if the lock is held, service any interrupts on the owned CE to make progress.
    // Bug 2527660 is filed to remove this change.
    //
    if (rmDeviceGpuLockIsOwner(pChannel->pGpu->gpuInstance))
    {
        channelServiceScrubberInterrupts(pChannel);
    }
    else
    {
        osSchedule();
    }
}


static NvBool
_ceUtilsFastScrubEnabled
(
    OBJCHANNEL      *pChannel,
    CHANNEL_PB_INFO *pChannelPbInfo
)
{
    OBJGPU *pGpu = pChannel->pGpu;
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

    if (!memmgrIsFastScrubberEnabled(pMemoryManager))
    {
        return NV_FALSE;
    }

    //
    // Enable memory fast scrubbing only when:
    //  - the channel was allocated as a fast-scrub channel
    //  - the operation is a memset (not a memcopy)
    //  - the memset pattern is 0
    //  - DstPhysMode.target == LOCAL_FB
    //  - the address is 4KB aligned
    //  - the line length is 4KB aligned
    //

    return ((pChannel->type == FAST_SCRUBBER_CHANNEL) &&
            (!pChannelPbInfo->bCeMemcopy) &&
            (pChannelPbInfo->pattern == 0) &&
            (pChannelPbInfo->dstAddressSpace == ADDR_FBMEM) &&
            (NV_IS_ALIGNED64(pChannelPbInfo->dstAddr, MEMUTIL_SCRUB_OFFSET_ALIGNMENT)) &&
            (NV_IS_ALIGNED(pChannelPbInfo->size, MEMUTIL_SCRUB_LINE_LENGTH_ALIGNMENT)));
}


//
// Helper to deal with CE_MAX_BYTES_PER_LINE.
// This function may modify some fields in pChannelPbInfo.
//
static NV_STATUS
_ceutilsSubmitPushBuffer
(
    OBJCHANNEL       *pChannel,
    NvBool            bPipelined,
    NvBool            bInsertFinishPayload,
    CHANNEL_PB_INFO * pChannelPbInfo
)
{
    NV_STATUS status = NV_OK;
    NvU32 methodsLength, putIndex = 0;

    NV_ASSERT_OR_RETURN(pChannelPbInfo != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pChannel != NULL, NV_ERR_INVALID_ARGUMENT);

    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pChannel->pGpu);
    NvBool bReleaseMapping = NV_FALSE;

    //
    // Use BAR1 if CPU access is allowed, otherwise allocate and init shadow
    // buffer for DMA access
    //
    NvU32 transferFlags = (TRANSFER_FLAGS_USE_BAR1     |
                           TRANSFER_FLAGS_SHADOW_ALLOC |
                           TRANSFER_FLAGS_SHADOW_INIT_MEM);
    NV_PRINTF(LEVEL_INFO, "Actual size of the copy to be pushed: %x\n", pChannelPbInfo->size);

    status = channelWaitForFreeEntry(pChannel, &putIndex);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Cannot get putIndex.\n");
        return status;
    }

    if (pChannel->pbCpuVA == NULL)
    {
        pChannel->pbCpuVA = memmgrMemDescBeginTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc,
                                                       transferFlags);
        bReleaseMapping = NV_TRUE;
    }
    NV_ASSERT_OR_RETURN(pChannel->pbCpuVA != NULL, NV_ERR_GENERIC);

    if (_ceUtilsFastScrubEnabled(pChannel, pChannelPbInfo))
    {
        methodsLength = channelFillPbFastScrub(pChannel, putIndex, bPipelined, bInsertFinishPayload, pChannelPbInfo);
    }
    else
    {
        if (pMemoryManager->bCePhysicalVidmemAccessNotSupported)
        {
            // Self-hosted Hopper only supports VA copy or fast scrubber
            NV_ASSERT_OR_RETURN(pChannel->bUseVasForCeCopy, NV_ERR_NOT_SUPPORTED);
        }

        methodsLength = channelFillCePb(pChannel, putIndex, bPipelined, bInsertFinishPayload, pChannelPbInfo);
    }

    if (bReleaseMapping)
    {
        memmgrMemDescEndTransfer(pMemoryManager, pChannel->pChannelBufferMemdesc, transferFlags);
        pChannel->pbCpuVA = NULL;
    }

    if (methodsLength == 0)
    {
        NV_PRINTF(LEVEL_ERROR, "Cannot push methods to channel.\n");
        return NV_ERR_NO_FREE_FIFOS;
    }

    //
    // Pushbuffer can be written in a batch, but GPFIFO and doorbell require
    // careful ordering so we do each write one-by-one
    //
    status = channelFillGpFifo(pChannel, putIndex, methodsLength);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Channel operation failures during memcopy\n");
        return status;
    }

    pChannel->lastSubmittedEntry = putIndex;

    return status;
}


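//
// Submit a CE memset of pParams->length bytes starting at pParams->offset
// within pParams->pMemDesc, splitting the request into physically contiguous
// chunks of at most CE_MAX_BYTES_PER_LINE. When the _ASYNC flag is set, the
// submitted work id is returned in pParams->submittedWorkId; otherwise the
// call waits for the finish payload.
//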
NV_STATUS
ceutilsMemset_IMPL
(
    CeUtils *pCeUtils,
    CEUTILS_MEMSET_PARAMS *pParams
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    NV_STATUS   status = NV_OK;

    NvU32 pteArraySize;
    NvU64 offset, memsetLength, size, pageGranularity;
    NvBool bContiguous;

    MEMORY_DESCRIPTOR *pMemDesc = pParams->pMemDesc;
    CHANNEL_PB_INFO channelPbInfo = {0};

    NvBool bPipelined = pParams->flags & NV0050_CTRL_MEMSET_FLAGS_PIPELINED;

    if (pMemDesc == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memdesc for CeUtils memset.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (pMemDesc->pGpu != pCeUtils->pChannel->pGpu)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memory descriptor passed.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    size = memdescGetSize(pMemDesc);
    pteArraySize = memdescGetPteArraySize(pMemDesc, AT_GPU);
    bContiguous = (pMemDesc->_flags & MEMDESC_FLAGS_PHYSICALLY_CONTIGUOUS) || (pteArraySize == 1);

    if (pParams->offset >= size)
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the memdesc.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    NV_PRINTF(LEVEL_INFO, "CeUtils Args to memset - offset: %llx, size: %llx \n",
              pParams->offset, pParams->length);

    if ((pParams->length == 0) || (pParams->length > (size - pParams->offset)))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memset length passed.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    channelPbInfo.bCeMemcopy = NV_FALSE;
    channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
    pCeUtils->lastSubmittedPayload = channelPbInfo.payload;

    channelPbInfo.pattern = pParams->pattern;
    channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pMemDesc);
    channelPbInfo.dstCpuCacheAttrib = pMemDesc->_cpuCacheAttrib;

    pageGranularity = pMemDesc->pageArrayGranularity;
    memsetLength = pParams->length;
    offset = pParams->offset;

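    //
    // Iterate over the range, submitting one pushbuffer entry per physically
    // contiguous chunk, each capped at CE_MAX_BYTES_PER_LINE
    //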
    do
    {
        NvU64 maxContigSize = bContiguous ? memsetLength : (pageGranularity - offset % pageGranularity);
        NvU32 memsetSizeContig = (NvU32)NV_MIN(NV_MIN(memsetLength, maxContigSize), CE_MAX_BYTES_PER_LINE);

        channelPbInfo.dstAddr = memdescGetPhysAddr(pMemDesc, AT_GPU, offset);

        NV_PRINTF(LEVEL_INFO, "CeUtils Memset dstAddr: %llx,  size: %x\n",
                  channelPbInfo.dstAddr, memsetSizeContig);

        channelPbInfo.size = memsetSizeContig;
        status = _ceutilsSubmitPushBuffer(pChannel, bPipelined, memsetSizeContig == memsetLength, &channelPbInfo);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memset.\n");
            return status;
        }

        // Allow _LAUNCH_DMA methods that belong to the same memset operation to be pipelined after each other, as there are no dependencies
        bPipelined = NV_TRUE;

        memsetLength -= memsetSizeContig;
        offset       += memsetSizeContig;
    } while (memsetLength != 0);

    if (pParams->flags & NV0050_CTRL_MEMSET_FLAGS_ASYNC)
    {
        NV_PRINTF(LEVEL_INFO, "Async memset payload returned: 0x%x\n", channelPbInfo.payload);
        pParams->submittedWorkId = channelPbInfo.payload;
    }
    else
    {
        // Check semaProgress and then timeout
        status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
        if (status == NV_OK)
        {
            NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
        }
    }

    return status;
}

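//
// Submit a CE copy of pParams->length bytes from pSrcMemDesc/srcOffset to
// pDstMemDesc/dstOffset, splitting the request so each chunk is contiguous in
// both source and destination and no larger than CE_MAX_BYTES_PER_LINE. When
// the _ASYNC flag is set, the submitted work id is returned in
// pParams->submittedWorkId; otherwise the call waits for the finish payload.
//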
NV_STATUS
ceutilsMemcopy_IMPL
(
    CeUtils *pCeUtils,
    CEUTILS_MEMCOPY_PARAMS *pParams
)
{
    OBJCHANNEL *pChannel = pCeUtils->pChannel;
    NV_STATUS   status = NV_OK;

    NvU64  srcSize, dstSize, copyLength, srcPageGranularity, dstPageGranularity;
    NvBool bSrcContig, bDstContig;

    CHANNEL_PB_INFO channelPbInfo  = {0};
    MEMORY_DESCRIPTOR *pDstMemDesc = pParams->pDstMemDesc;
    MEMORY_DESCRIPTOR *pSrcMemDesc = pParams->pSrcMemDesc;

    NvU64 length = pParams->length;
    NvU64 srcOffset = pParams->srcOffset;
    NvU64 dstOffset = pParams->dstOffset;

    NvBool bPipelined = pParams->flags & NV0050_CTRL_MEMCOPY_FLAGS_PIPELINED;

    // Validate params
    if ((pSrcMemDesc == NULL) || (pDstMemDesc == NULL))
    {
        NV_PRINTF(LEVEL_ERROR, "Src/Dst Memory descriptor should be valid.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((pSrcMemDesc->pGpu != pCeUtils->pChannel->pGpu) ||
        (pDstMemDesc->pGpu != pCeUtils->pChannel->pGpu))
    {
        NV_PRINTF(LEVEL_ERROR, "CeUtils does not support p2p copies right now.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    srcSize = memdescGetSize(pSrcMemDesc);
    dstSize = memdescGetSize(pDstMemDesc);

    if ((srcOffset >= srcSize) || (dstOffset >= dstSize))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid offset passed for the src/dst memdesc.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    if ((length == 0) ||
        (srcOffset + length > srcSize) || (dstOffset + length > dstSize))
    {
        NV_PRINTF(LEVEL_ERROR, "Invalid memcopy length.\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    channelPbInfo.bCeMemcopy = NV_TRUE;
    channelPbInfo.payload = pCeUtils->lastSubmittedPayload + 1;
    pCeUtils->lastSubmittedPayload = channelPbInfo.payload;

    channelPbInfo.srcAddressSpace = memdescGetAddressSpace(pSrcMemDesc);
    channelPbInfo.dstAddressSpace = memdescGetAddressSpace(pDstMemDesc);

    channelPbInfo.srcCpuCacheAttrib = pSrcMemDesc->_cpuCacheAttrib;
    channelPbInfo.dstCpuCacheAttrib = pDstMemDesc->_cpuCacheAttrib;

    channelPbInfo.bSecureCopy = pParams->bSecureCopy;
    channelPbInfo.bEncrypt = pParams->bEncrypt;
    channelPbInfo.authTagAddr = pParams->authTagAddr;
    channelPbInfo.encryptIvAddr = pParams->encryptIvAddr;

    srcPageGranularity = pSrcMemDesc->pageArrayGranularity;
    dstPageGranularity = pDstMemDesc->pageArrayGranularity;
    bSrcContig = memdescGetContiguity(pSrcMemDesc, AT_GPU);
    bDstContig = memdescGetContiguity(pDstMemDesc, AT_GPU);

    copyLength = length;

    do
    {
        //
        // This algorithm finds the maximum contiguous region in both src and dst
        // for each chunk and iterates until the whole range has been submitted to CE
        //
        NvU64 maxContigSizeSrc = bSrcContig ? copyLength : (srcPageGranularity - srcOffset % srcPageGranularity);
        NvU64 maxContigSizeDst = bDstContig ? copyLength : (dstPageGranularity - dstOffset % dstPageGranularity);
        NvU32 copySizeContig = (NvU32)NV_MIN(NV_MIN(copyLength, NV_MIN(maxContigSizeSrc, maxContigSizeDst)), CE_MAX_BYTES_PER_LINE);

        channelPbInfo.srcAddr = memdescGetPhysAddr(pSrcMemDesc, AT_GPU, srcOffset);
        channelPbInfo.dstAddr = memdescGetPhysAddr(pDstMemDesc, AT_GPU, dstOffset);

        NV_PRINTF(LEVEL_INFO, "CeUtils Memcopy dstAddr: %llx, srcAddr: %llx, size: %x\n",
                  channelPbInfo.dstAddr, channelPbInfo.srcAddr, copySizeContig);

        channelPbInfo.size = copySizeContig;
        status = _ceutilsSubmitPushBuffer(pChannel, bPipelined, copySizeContig == copyLength, &channelPbInfo);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Cannot submit push buffer for memcopy.\n");
            return status;
        }

        // Allow _LAUNCH_DMA methods that belong to the same copy operation to be pipelined after each other, as there are no dependencies
        bPipelined = NV_TRUE;

        copyLength -= copySizeContig;
        srcOffset  += copySizeContig;
        dstOffset  += copySizeContig;
    } while (copyLength != 0);

    if (pParams->flags & NV0050_CTRL_MEMSET_FLAGS_ASYNC)
    {
        NV_PRINTF(LEVEL_INFO, "Async memcopy payload returned: 0x%x\n", channelPbInfo.payload);
        pParams->submittedWorkId = channelPbInfo.payload;
    }
    else
    {
        // Check semaProgress and then timeout
        status = channelWaitForFinishPayload(pChannel, channelPbInfo.payload);
        if (status == NV_OK)
        {
            NV_PRINTF(LEVEL_INFO, "Work was done from RM PoV lastSubmitted = 0x%x\n", channelPbInfo.payload);
        }
    }

    return status;
}


// This function updates pCeUtils->lastCompletedPayload and handles wrap-around
NvU64
ceutilsUpdateProgress_IMPL
(
    CeUtils *pCeUtils
)
{
    NV_ASSERT((pCeUtils != NULL) && (pCeUtils->pChannel != NULL));

    NvU32 hwCurrentCompletedPayload = 0;
    NvU64 swLastCompletedPayload = pCeUtils->lastCompletedPayload;

    //
    // CeUtils uses a 64-bit index to track the work submitted, but HW supports
    // only a 32-bit semaphore. The current completed id is calculated here,
    // based on lastCompletedPayload and the current HW semaphore value.
    //
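    // For example (illustrative values): if lastCompletedPayload is
    // 0x1FFFFFFF0 and the HW semaphore now reads 0x10, the lower 32 bits have
    // wrapped, so the upper 32 bits are incremented and the new value becomes
    // 0x200000010.
    //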
    hwCurrentCompletedPayload = READ_CHANNEL_PAYLOAD_SEMA(pCeUtils->pChannel);

    // No work has been completed since we checked last time
    if (hwCurrentCompletedPayload == (NvU32)swLastCompletedPayload)
    {
        return swLastCompletedPayload;
    }

    // Check for wrap around case. Increment the upper 32 bits
    if (hwCurrentCompletedPayload < (NvU32)swLastCompletedPayload)
    {
        swLastCompletedPayload += 0x100000000ULL;
    }

    // Update the lower 32 bits regardless of whether wrap-around happened
    swLastCompletedPayload &= 0xFFFFFFFF00000000ULL;
    swLastCompletedPayload |= (NvU64)hwCurrentCompletedPayload;

    pCeUtils->lastCompletedPayload = swLastCompletedPayload;
    return swLastCompletedPayload;
}

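//
// NV0050_CTRL_CMD_CHECK_PROGRESS: mark the result as FINISHED once the tracked
// completed payload has reached pParams->submittedWorkId, based on the latest
// CE semaphore value.
//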
NV_STATUS
ceutilsapiCtrlCmdCheckProgress_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_CHECK_PROGRESS_PARAMS *pParams
)
{
    if (pParams->submittedWorkId <= ceutilsUpdateProgress(pCeUtilsApi->pCeUtils))
    {
        pParams->result = NV0050_CTRL_CHECK_PROGRESS_RESULT_FINISHED;
    }

    return NV_OK;
}

NV_STATUS
ceutilsapiConstruct_IMPL
(
    CeUtilsApi                   *pCeUtilsApi,
    CALL_CONTEXT                 *pCallContext,
    RS_RES_ALLOC_PARAMS_INTERNAL *pParams
)
{
    NV0050_ALLOCATION_PARAMETERS *pAllocParams = pParams->pAllocParams;

    if (FLD_TEST_DRF(0050_CEUTILS, _FLAGS, _EXTERNAL, _TRUE, pAllocParams->flags))
    {
        NV_PRINTF(LEVEL_ERROR, "CeUtils: unsupported flags = 0x%llx\n", pAllocParams->flags);
        return NV_ERR_NOT_SUPPORTED;
    }

    return objCreate(&pCeUtilsApi->pCeUtils, pCeUtilsApi, CeUtils, GPU_RES_GET_GPU(pCeUtilsApi), NULL, pAllocParams);
}

void
ceutilsapiDestruct_IMPL
(
    CeUtilsApi *pCeUtilsApi
)
{
    objDelete(pCeUtilsApi->pCeUtils);
}

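//
// NV0050_CTRL_CMD_MEMSET: look up the memory descriptor for pParams->hMemory
// under the calling client and forward the request to ceutilsMemset.
//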
NV_STATUS
ceutilsapiCtrlCmdMemset_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_MEMSET_PARAMS *pParams
)
{
    NV_STATUS          status = NV_OK;
    NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
    RsResourceRef     *pPhysmemRef;
    MEMORY_DESCRIPTOR *pMemDesc = NULL;
    CEUTILS_MEMSET_PARAMS internalParams = {0};

    if (pParams->hMemory == 0)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    status = serverutilGetResourceRef(hClient, pParams->hMemory, &pPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pMemDesc = (dynamicCast(pPhysmemRef->pResource, Memory))->pMemDesc;

    internalParams.pMemDesc = pMemDesc;
    internalParams.offset = pParams->offset;
    internalParams.length = pParams->length;
    internalParams.pattern = pParams->pattern;
    internalParams.flags = pParams->flags;

    status = ceutilsMemset(pCeUtilsApi->pCeUtils, &internalParams);
    if (status == NV_OK)
    {
        pParams->submittedWorkId = internalParams.submittedWorkId;
    }

    return status;
}

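//
// NV0050_CTRL_CMD_MEMCOPY: look up the source and destination memory
// descriptors under the calling client and forward the request to
// ceutilsMemcopy.
//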
NV_STATUS
ceutilsapiCtrlCmdMemcopy_IMPL
(
    CeUtilsApi *pCeUtilsApi,
    NV0050_CTRL_MEMCOPY_PARAMS *pParams
)
{
    NV_STATUS          status = NV_OK;
    NvHandle           hClient = RES_GET_CLIENT_HANDLE(pCeUtilsApi);
    RsResourceRef     *pSrcPhysmemRef;
    RsResourceRef     *pDstPhysmemRef;
    MEMORY_DESCRIPTOR *pSrcMemDesc = NULL;
    MEMORY_DESCRIPTOR *pDstMemDesc = NULL;
    CEUTILS_MEMCOPY_PARAMS internalParams = {0};

    if ((pParams->hSrcMemory == 0) || (pParams->hDstMemory == 0))
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    status = serverutilGetResourceRef(hClient, pParams->hDstMemory, &pDstPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pDstMemDesc = (dynamicCast(pDstPhysmemRef->pResource, Memory))->pMemDesc;

    status = serverutilGetResourceRef(hClient, pParams->hSrcMemory, &pSrcPhysmemRef);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Failed to get resource in resource server for physical memory handle.\n");
        return status;
    }
    pSrcMemDesc = (dynamicCast(pSrcPhysmemRef->pResource, Memory))->pMemDesc;

    internalParams.pSrcMemDesc = pSrcMemDesc;
    internalParams.pDstMemDesc = pDstMemDesc;
    internalParams.srcOffset = pParams->srcOffset;
    internalParams.dstOffset = pParams->dstOffset;
    internalParams.length = pParams->length;
    internalParams.flags = pParams->flags;

    status = ceutilsMemcopy(pCeUtilsApi->pCeUtils, &internalParams);
    if (status == NV_OK)
    {
        pParams->submittedWorkId = internalParams.submittedWorkId;
    }

    return status;
}