1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 /*!
25  * GSP MESSAGE QUEUE - CPU-SIDE CODE
26  */
27 
28 #include "core/core.h"
29 #include "core/thread_state.h"
30 
31 
32 #include "os/os.h"
33 
34 #include "vgpu/rpc_headers.h"
35 #include "gpu/mem_mgr/virt_mem_allocator_common.h"
36 
37 #define RPC_STRUCTURES
38 #define RPC_GENERIC_UNION
39 #include "g_rpc-structures.h"
40 #undef RPC_STRUCTURES
41 #undef RPC_GENERIC_UNION
42 
43 #define RPC_MESSAGE_STRUCTURES
44 #define RPC_MESSAGE_GENERIC_UNION
45 #include "g_rpc-message-header.h"
46 #undef RPC_MESSAGE_STRUCTURES
47 #undef RPC_MESSAGE_GENERIC_UNION
48 
49 #include "gpu/gsp/message_queue.h"
50 #include "gpu/gsp/message_queue_priv.h"
51 #include "msgq/msgq_priv.h"
52 #include "gpu/gsp/kernel_gsp.h"
53 #include "nvrm_registry.h"
54 #include "gpu/conf_compute/ccsl.h"
55 #include "gpu/conf_compute/conf_compute.h"
56 
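//
// The header area reserved at the start of each queue must be large enough to
// hold both the msgq transmit and receive headers; the compile-time assert
// below guards that assumption.
//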
57 ct_assert(GSP_MSG_QUEUE_HEADER_SIZE > sizeof(msgqTxHeader) + sizeof(msgqRxHeader));
58 
59 static void _gspMsgQueueCleanup(MESSAGE_QUEUE_INFO *pMQI);
60 
61 static void
62 _getMsgQueueParams
63 (
64     OBJGPU *pGpu,
65     MESSAGE_QUEUE_COLLECTION *pMQCollection
66 )
67 {
68     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
69     NvLength queueSize;
70     MESSAGE_QUEUE_INFO *pRmQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
71     MESSAGE_QUEUE_INFO *pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];
72     NvU32 numPtes;
73     const NvLength defaultCommandQueueSize = 0x40000; // 256 KB
74     const NvLength defaultStatusQueueSize  = 0x40000; // 256 KB
75     NvU32 regStatusQueueSize;
76 
77     // RmQueue sizes
78     if (IS_SILICON(pGpu))
79     {
80         pRmQueueInfo->commandQueueSize = defaultCommandQueueSize;
81     }
82     else
83     {
84         //
85         // Pre-silicon platforms need a large command queue in order to send
86         // the VBIOS image via RPC.
87         //
88         pRmQueueInfo->commandQueueSize = defaultCommandQueueSize * 6;
89     }
90 
    // Check for a status queue size override
92     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_GSP_STATUS_QUEUE_SIZE, &regStatusQueueSize) == NV_OK)
93     {
94         regStatusQueueSize *= 1024; // to bytes
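        //
        // Clamp the override up to at least one maximum-size queue element and
        // round it up to the queue alignment so the resulting size is usable.
        //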
95         regStatusQueueSize = NV_MAX(GSP_MSG_QUEUE_ELEMENT_SIZE_MAX, regStatusQueueSize);
96         regStatusQueueSize = NV_ALIGN_UP(regStatusQueueSize, 1 << GSP_MSG_QUEUE_ALIGN);
97         pRmQueueInfo->statusQueueSize = regStatusQueueSize;
98     }
99     else
100     {
101         pRmQueueInfo->statusQueueSize = defaultStatusQueueSize;
102     }
103 
104     // TaskIsrQueue sizes
105     if (pKernelGsp->bIsTaskIsrQueueRequired)
106     {
107         pTaskIsrQueueInfo->commandQueueSize = defaultCommandQueueSize;
108         pTaskIsrQueueInfo->statusQueueSize = defaultStatusQueueSize;
109     }
110     else
111     {
112         pTaskIsrQueueInfo->commandQueueSize = 0;
113         pTaskIsrQueueInfo->statusQueueSize = 0;
114     }
115 
116     //
117     // Calculate the number of entries required to map both queues in addition
118     // to the page table itself.
119     //
120     queueSize = pRmQueueInfo->commandQueueSize      + pRmQueueInfo->statusQueueSize +
121                 pTaskIsrQueueInfo->commandQueueSize + pTaskIsrQueueInfo->statusQueueSize;
122     NV_ASSERT((queueSize & RM_PAGE_MASK) == 0);
123     numPtes = (queueSize >> RM_PAGE_SHIFT);
124 
125     // Account for the pages needed to store the PTEs
126     numPtes += NV_DIV_AND_CEIL(numPtes * sizeof(RmPhysAddr), RM_PAGE_SIZE);
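
    //
    // Worked example (assuming 4 KiB RM pages and 8-byte physical addresses):
    // with all four queues at the default 256 KiB, queueSize is 1 MiB, i.e.
    // 256 pages, so 256 PTEs are needed for the queues plus 1 more page to
    // hold the PTEs themselves, giving numPtes = 257 and a page table size
    // that rounds up to a single 4 KiB page below.
    //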
127 
128     //
129     // Align the page table size to RM_PAGE_SIZE, so that the command queue is
130     // aligned.
131     //
132     pMQCollection->pageTableSize = RM_PAGE_ALIGN_UP(numPtes * sizeof(RmPhysAddr));
133     pMQCollection->pageTableEntryCount = numPtes;
134 }
135 
136 static NV_STATUS
137 _gspMsgQueueInit
138 (
139     MESSAGE_QUEUE_INFO *pMQI
140 )
141 {
142     NvU32 workAreaSize;
143     NV_STATUS nvStatus = NV_OK;
144     int nRet;
145 
146     // Allocate work area.
147     workAreaSize = (1 << GSP_MSG_QUEUE_ELEMENT_ALIGN) +
148                    GSP_MSG_QUEUE_ELEMENT_SIZE_MAX + msgqGetMetaSize();
149     pMQI->pWorkArea = portMemAllocNonPaged(workAreaSize);
150     if (pMQI->pWorkArea == NULL)
151     {
152         NV_PRINTF(LEVEL_ERROR, "Error allocating pWorkArea.\n");
153         return NV_ERR_NO_MEMORY;
154     }
155 
156     portMemSet(pMQI->pWorkArea, 0, workAreaSize);
157 
158     pMQI->pCmdQueueElement = (GSP_MSG_QUEUE_ELEMENT *)
159         NV_ALIGN_UP((NvUPtr)pMQI->pWorkArea, 1 << GSP_MSG_QUEUE_ELEMENT_ALIGN);
160     pMQI->pMetaData = (void *)((NvUPtr)pMQI->pCmdQueueElement + GSP_MSG_QUEUE_ELEMENT_SIZE_MAX);
161 
162     nRet = msgqInit(&pMQI->hQueue, pMQI->pMetaData);
163     if (nRet < 0)
164     {
165         NV_PRINTF(LEVEL_ERROR, "msgqInit failed: %d\n", nRet);
166         nvStatus = NV_ERR_GENERIC;
167         goto error_ret;
168     }
169 
170     nRet = msgqTxCreate(pMQI->hQueue,
171                 pMQI->pCommandQueue,
172                 pMQI->commandQueueSize,
173                 GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
174                 GSP_MSG_QUEUE_HEADER_ALIGN,
175                 GSP_MSG_QUEUE_ELEMENT_ALIGN,
176                 MSGQ_FLAGS_SWAP_RX);
177     if (nRet < 0)
178     {
179         NV_PRINTF(LEVEL_ERROR, "msgqTxCreate failed: %d\n", nRet);
180         nvStatus = NV_ERR_GENERIC;
181         goto error_ret;
182     }
183 
184     pMQI->pRpcMsgBuf   = &pMQI->pCmdQueueElement->rpc;
185 
186     NV_PRINTF(LEVEL_INFO, "Created command queue.\n");
187     return nvStatus;
188 
189 error_ret:
190     _gspMsgQueueCleanup(pMQI);
191     return nvStatus;
192 }
193 
194 /*!
 * GspMsgQueuesInit
 *
 * Initialize the command queues on the CPU side.
198  * Must not be called before portInitialize.
199  */
200 NV_STATUS
201 GspMsgQueuesInit
202 (
203     OBJGPU                    *pGpu,
204     MESSAGE_QUEUE_COLLECTION **ppMQCollection
205 )
206 {
207     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
208     MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;
209     MESSAGE_QUEUE_INFO  *pRmQueueInfo = NULL;
210     MESSAGE_QUEUE_INFO  *pTaskIsrQueueInfo = NULL;
211     RmPhysAddr  *pPageTbl;
212     NvP64        pVaKernel;
213     NvP64        pPrivKernel;
214     NV_STATUS    nvStatus         = NV_OK;
215     NvLength     sharedBufSize;
216     NvP64        lastQueueVa;
217     NvLength     lastQueueSize;
218     NvU64 flags = MEMDESC_FLAGS_NONE;
219 
220     if (*ppMQCollection != NULL)
221     {
222         NV_PRINTF(LEVEL_ERROR, "GSP message queue was already initialized.\n");
223         return NV_ERR_INVALID_STATE;
224     }
225 
226     pMQCollection = portMemAllocNonPaged(sizeof *pMQCollection);
227     if (pMQCollection == NULL)
228     {
229         NV_PRINTF(LEVEL_ERROR, "Error allocating queue info area.\n");
230         nvStatus = NV_ERR_NO_MEMORY;
231         goto done;
232     }
233     portMemSet(pMQCollection, 0, sizeof *pMQCollection);
234 
235     _getMsgQueueParams(pGpu, pMQCollection);
236 
237     pRmQueueInfo      = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
238     pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];
239 
240     sharedBufSize = pMQCollection->pageTableSize +
241                     pRmQueueInfo->commandQueueSize +
242                     pRmQueueInfo->statusQueueSize +
243                     pTaskIsrQueueInfo->commandQueueSize +
244                     pTaskIsrQueueInfo->statusQueueSize;
245 
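    //
    // The queues are shared with GSP-RM, so the memdesc is flagged for
    // unprotected system memory; when Confidential Compute is enabled, the
    // RPC payloads are protected by CCSL encryption instead (see
    // GspMsgQueueSendCommand).
    //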
246     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
247 
248     //
249     // For now, put all shared queue memory in one block.
250     //
251     NV_ASSERT_OK_OR_GOTO(nvStatus,
252         memdescCreate(&pMQCollection->pSharedMemDesc, pGpu, sharedBufSize,
253             RM_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, NV_MEMORY_CACHED,
254             flags),
255         done);
256 
257     memdescSetFlag(pMQCollection->pSharedMemDesc, MEMDESC_FLAGS_KERNEL_MODE, NV_TRUE);
258 
259     memdescTagAlloc(nvStatus, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_58,
260                     pMQCollection->pSharedMemDesc);
261     NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_ret);
262 
263     // Create kernel mapping for command queue.
264     NV_ASSERT_OK_OR_GOTO(nvStatus,
265         memdescMap(pMQCollection->pSharedMemDesc, 0, sharedBufSize,
266             NV_TRUE, NV_PROTECT_WRITEABLE,
267             &pVaKernel, &pPrivKernel),
268         error_ret);
269 
270     memdescSetKernelMapping(pMQCollection->pSharedMemDesc, pVaKernel);
271     memdescSetKernelMappingPriv(pMQCollection->pSharedMemDesc, pPrivKernel);
272 
273     if (pVaKernel == NvP64_NULL)
274     {
275         NV_PRINTF(LEVEL_ERROR, "Error allocating message queue shared buffer\n");
276         nvStatus = NV_ERR_NO_MEMORY;
277         goto error_ret;
278     }
279 
280     portMemSet((void *)pVaKernel, 0, sharedBufSize);
281 
282     pPageTbl = pVaKernel;
283 
284     // Shared memory layout.
285     //
    // Each of the following is page-aligned:
287     //   Shared memory layout header (includes page table)
288     //   RM Command queue header
289     //   RM Command queue entries
290     //   RM Status queue header
291     //   RM Status queue entries
292     //   TASKISR Command queue header
293     //   TASKISR Command queue entries
294     //   TASKISR Status queue header
295     //   TASKISR Status queue entries
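    //
    // For example, assuming 4 KiB RM pages, the default 256 KiB queue sizes,
    // and the TASKISR queues enabled, the page table region occupies offsets
    // 0x0..0xFFF, the RM command queue starts at offset 0x1000, the RM status
    // queue at 0x41000, the TASKISR command queue at 0x81000, and the TASKISR
    // status queue at 0xC1000, for a total sharedBufSize of 0x101000.
    //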
296     memdescGetPhysAddrs(pMQCollection->pSharedMemDesc,
297                     AT_GPU,                     // addressTranslation
298                     0,                          // offset
299                     RM_PAGE_SIZE,               // stride
300                     pMQCollection->pageTableEntryCount,  // count
301                     pPageTbl);                  // physical address table
302 
303     pRmQueueInfo->pCommandQueue = NvP64_VALUE(
304         NvP64_PLUS_OFFSET(pVaKernel, pMQCollection->pageTableSize));
305 
306     pRmQueueInfo->pStatusQueue  = NvP64_VALUE(
307         NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pRmQueueInfo->pCommandQueue), pRmQueueInfo->commandQueueSize));
308 
309     lastQueueVa   = NV_PTR_TO_NvP64(pRmQueueInfo->pStatusQueue);
310     lastQueueSize = pRmQueueInfo->statusQueueSize;
311 
312     if (pKernelGsp->bIsTaskIsrQueueRequired)
313     {
314         pTaskIsrQueueInfo->pCommandQueue = NvP64_VALUE(
315             NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pRmQueueInfo->pStatusQueue), pRmQueueInfo->statusQueueSize));
316 
317         pTaskIsrQueueInfo->pStatusQueue  = NvP64_VALUE(
318             NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pTaskIsrQueueInfo->pCommandQueue), pTaskIsrQueueInfo->commandQueueSize));
319 
320         lastQueueVa   = NV_PTR_TO_NvP64(pTaskIsrQueueInfo->pStatusQueue);
321         lastQueueSize = pTaskIsrQueueInfo->statusQueueSize;
322     }
323 
324     // Assert that the last queue offset + size fits into the shared memory.
325     NV_ASSERT(NvP64_PLUS_OFFSET(pVaKernel, sharedBufSize) ==
326               NvP64_PLUS_OFFSET(lastQueueVa, lastQueueSize));
327 
328     NV_ASSERT_OK_OR_GOTO(nvStatus, _gspMsgQueueInit(pRmQueueInfo), error_ret);
329     pRmQueueInfo->queueIdx = RPC_TASK_RM_QUEUE_IDX;
330 
331     if (pKernelGsp->bIsTaskIsrQueueRequired)
332     {
333         NV_ASSERT_OK_OR_GOTO(nvStatus, _gspMsgQueueInit(pTaskIsrQueueInfo), error_ret);
334         pTaskIsrQueueInfo->queueIdx = RPC_TASK_ISR_QUEUE_IDX;
335     }
336 
337     *ppMQCollection             = pMQCollection;
338     pMQCollection->sharedMemPA  = pPageTbl[0];
339 
340 done:
341     return nvStatus;
342 
343 error_ret:
344     GspMsgQueuesCleanup(&pMQCollection);
345     return nvStatus;
346 }
347 
348 NV_STATUS GspStatusQueueInit(OBJGPU *pGpu, MESSAGE_QUEUE_INFO **ppMQI)
349 {
350     NV_STATUS  nvStatus = NV_ERR_GENERIC;
351     int        nRet = 0;
352     int        nRetries;
353     RMTIMEOUT  timeout;
354     NvU32      timeoutUs = 4000000;
355     NvU32      timeoutFlags = GPU_TIMEOUT_FLAGS_DEFAULT;
356     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
357 
    // GSP-RM in emulation/simulation environments is extremely slow
359     if (IS_EMULATION(pGpu) || IS_SIMULATION(pGpu))
360     {
361         //
362         // Scaling timeoutUs by GSP_SCALE_TIMEOUT_EMU_SIM overflows 32 bits,
363         // so just max it out instead.
364         //
365         timeoutUs = NV_U32_MAX;
366 
367         //
        // On slower emulators and in simulation, linking the status queue can
        // take longer than the thread state timeout, so bypass the thread
        // state timeout so that our longer timeout applies.
371         //
372         timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE;
373     }
374 
375     gpuSetTimeout(pGpu, timeoutUs, &timeout, timeoutFlags);
376 
    // Wait for the other end of the queue to run msgqInit.  Retry until the timeout expires.
378     for (nRetries = 0; ; nRetries++)
379     {
380         // Link in status queue
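        //
        // The full memory fence helps ensure we observe GSP-RM's latest
        // writes to the shared queue memory before msgqRxLink examines the
        // status queue header.
        //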
381         portAtomicMemoryFenceFull();
382 
383         nRet = msgqRxLink((*ppMQI)->hQueue, (*ppMQI)->pStatusQueue,
384             (*ppMQI)->statusQueueSize, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
385 
386         if (nRet == 0)
387         {
388             NV_PRINTF(LEVEL_INFO, "Status queue linked to command queue.\n");
389 
390             //
391             // If we've bypassed the thread state timeout check for slower
392             // environments, it will have lapsed by now, so reset it so that
393             // the next timeout check doesn't fail immediately.
394             //
395             if (timeoutFlags & GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE)
396                 threadStateResetTimeout(pGpu);
397 
398             return NV_OK;
399         }
400 
401         osSpinLoop();
402 
403         nvStatus = gpuCheckTimeout(pGpu, &timeout);
404         if (nvStatus != NV_OK)
405             break;
406 
407         kgspDumpGspLogs(pKernelGsp, NV_FALSE);
408         if (!kgspHealthCheck_HAL(pGpu, pKernelGsp))
409         {
410             nvStatus = NV_ERR_RESET_REQUIRED;
411             break;
412         }
413     }
414 
415     if (nRet < 0)
416     {
417         NV_PRINTF(LEVEL_ERROR,
418             "msgqRxLink failed: %d, nvStatus 0x%08x, retries: %d\n",
419             nRet, nvStatus, nRetries);
420         _gspMsgQueueCleanup(*ppMQI);
421     }
422 
423     return nvStatus;
424 }
425 
426 static void
427 _gspMsgQueueCleanup(MESSAGE_QUEUE_INFO *pMQI)
428 {
429     if (pMQI == NULL)
430     {
431         return;
432     }
433 
434     portMemFree(pMQI->pWorkArea);
435 
436     pMQI->pWorkArea        = NULL;
437     pMQI->pCmdQueueElement = NULL;
438     pMQI->pMetaData        = NULL;
439 }
440 
441 void GspMsgQueuesCleanup(MESSAGE_QUEUE_COLLECTION **ppMQCollection)
442 {
443     MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;
444     MESSAGE_QUEUE_INFO       *pRmQueueInfo  = NULL;
445     MESSAGE_QUEUE_INFO       *pTaskIsrQueueInfo = NULL;
446 
447     if ((ppMQCollection == NULL) || (*ppMQCollection == NULL))
448         return;
449 
450     pMQCollection     = *ppMQCollection;
451     pRmQueueInfo      = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
452     pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];
453 
454     _gspMsgQueueCleanup(pRmQueueInfo);
455     _gspMsgQueueCleanup(pTaskIsrQueueInfo);
456 
457     if (pMQCollection->pSharedMemDesc != NULL)
458     {
459         NvP64 pVaKernel   = memdescGetKernelMapping(pMQCollection->pSharedMemDesc);
460         NvP64 pPrivKernel = memdescGetKernelMappingPriv(pMQCollection->pSharedMemDesc);
461 
462         // Destroy kernel mapping for command queue.
463         if (pVaKernel != 0)
464         {
465             memdescUnmap(pMQCollection->pSharedMemDesc, NV_TRUE, osGetCurrentProcess(),
466                          pVaKernel, pPrivKernel);
467         }
468 
469         // Free command queue memory.
470         memdescFree(pMQCollection->pSharedMemDesc);
471         memdescDestroy(pMQCollection->pSharedMemDesc);
472         pMQCollection->pSharedMemDesc = NULL;
473     }
474 
475     portMemFree(pMQCollection);
476     *ppMQCollection = NULL;
477 }
478 
479 /*!
480  * Calculate 32-bit checksum
481  *
482  * This routine assumes that the data is padded out with zeros to the next
483  * 8-byte alignment, and it is OK to read past the end to the 8-byte alignment.
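 *
 * The sender stores the value returned here (computed with the element's
 * checkSum field zeroed); the receiver recomputes the fold over the same
 * range, now including the stored checksum, and expects 0.  Assuming the
 * 32-bit checkSum field is naturally aligned, the stored value contributes
 * exactly once to the XOR fold and cancels against itself.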
484  */
485 static NV_INLINE NvU32 _checkSum32(void *pData, NvU32 uLen)
486 {
487     NvU64 *p        = (NvU64 *)pData;
488     NvU64 *pEnd     = (NvU64 *)((NvUPtr)pData + uLen);
489     NvU64  checkSum = 0;
490 
491     while (p < pEnd)
492         checkSum ^= *p++;
493 
494     return NvU64_HI32(checkSum) ^ NvU64_LO32(checkSum);
495 }
496 
497 /*!
498  * GspMsgQueueSendCommand
499  *
500  * Move a command record from our staging area to the command queue.
501  *
502  * Returns
 *  NV_OK                       - Record successfully sent.
504  *  NV_ERR_INVALID_PARAM_STRUCT - Bad record length.
505  *  NV_ERR_BUSY_RETRY           - No space in the queue.
 *  NV_ERR_INVALID_STATE        - Something really bad happened.
507  */
508 NV_STATUS GspMsgQueueSendCommand(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
509 {
510     GSP_MSG_QUEUE_ELEMENT *pCQE = pMQI->pCmdQueueElement;
511     NvU8      *pSrc             = (NvU8 *)pCQE;
512     NvU8      *pNextElement     = NULL;
513     int        nRet;
514     NvU32      i;
515     RMTIMEOUT  timeout;
516     NV_STATUS  nvStatus         = NV_OK;
517     NvU32      uElementSize     = GSP_MSG_QUEUE_ELEMENT_HDR_SIZE +
518                                   pMQI->pCmdQueueElement->rpc.length;
519 
520     if ((uElementSize < sizeof(GSP_MSG_QUEUE_ELEMENT)) ||
521         (uElementSize > GSP_MSG_QUEUE_ELEMENT_SIZE_MAX))
522     {
523         NV_PRINTF(LEVEL_ERROR, "Incorrect length %u\n",
524             pMQI->pCmdQueueElement->rpc.length);
525         nvStatus = NV_ERR_INVALID_PARAM_STRUCT;
526         goto done;
527     }
528 
    // Make sure the queue element in our working space is zero-padded for the checksum.
530     if ((uElementSize & 7) != 0)
531         portMemSet(pSrc + uElementSize, 0, 8 - (uElementSize & 7));
532 
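    //
    // For example, assuming a 256-byte minimum element size, a 600-byte
    // record (header plus RPC payload) is sent as elemCount = 3 queue
    // elements.
    //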
533     pCQE->seqNum    = pMQI->txSeqNum;
534     pCQE->elemCount = GSP_MSG_QUEUE_BYTES_TO_ELEMENTS(uElementSize);
535     pCQE->checkSum  = 0;
536 
537     if (gpuIsCCFeatureEnabled(pGpu))
538     {
539         ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);
540 
541         // Use sequence number as AAD.
542         portMemCopy((NvU8*)pCQE->aadBuffer, sizeof(pCQE->aadBuffer), (NvU8 *)&pCQE->seqNum, sizeof(pCQE->seqNum));
543 
544         // We need to encrypt the full queue elements to obscure the data.
545         nvStatus = ccslEncrypt(pCC->pRpcCcslCtx,
546                                (pCQE->elemCount * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN) - GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
547                                pSrc + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
548                                (NvU8*)pCQE->aadBuffer,
549                                sizeof(pCQE->aadBuffer),
550                                pSrc + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
551                                pCQE->authTagBuffer);
552 
553         if (nvStatus != NV_OK)
554         {
555             // Do not re-try if encryption fails.
556             NV_PRINTF(LEVEL_ERROR, "Encryption failed with status = 0x%x.\n", nvStatus);
557             if (nvStatus == NV_ERR_INSUFFICIENT_RESOURCES)
558             {
559                 // We hit potential IV overflow, this is fatal.
560                 NV_PRINTF(LEVEL_ERROR, "Fatal error detected in RPC encrypt: IV overflow!\n");
561                 confComputeSetErrorState(pGpu, pCC);
562             }
563             return nvStatus;
564         }
565 
        // Now that encryption covers the elements completely, include them in the checksum.
567         pCQE->checkSum = _checkSum32(pSrc, pCQE->elemCount * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
568     }
569     else
570     {
571         pCQE->checkSum = _checkSum32(pSrc, uElementSize);
572     }
573 
574     for (i = 0; i < pCQE->elemCount; i++)
575     {
576         NvU32 timeoutFlags = 0;
577 
578         if (pMQI->txBufferFull)
579             timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_JOURNAL_LOG;
580 
581         // Set a timeout of 1 sec
582         gpuSetTimeout(pGpu, 1000000, &timeout, timeoutFlags);
583 
584         // Wait for space to put the next element.
585         while (NV_TRUE)
586         {
587             // Must get the buffers one at a time, since they could wrap.
588             pNextElement = (NvU8 *)msgqTxGetWriteBuffer(pMQI->hQueue, i);
589 
590             if (pNextElement != NULL)
591                 break;
592 
593             if (gpuCheckTimeout(pGpu, &timeout) != NV_OK)
594                 break;
595 
596             portAtomicMemoryFenceFull();
597 
598             osSpinLoop();
599         }
600 
601         if (pNextElement == NULL)
602         {
603             pMQI->txBufferFull++;
604             NV_PRINTF_COND(pMQI->txBufferFull == 1, LEVEL_ERROR, LEVEL_INFO,
605                            "buffer is full (waiting for %d free elements, got %d)\n",
606                            pCQE->elemCount, i);
607             nvStatus = NV_ERR_BUSY_RETRY;
608             goto done;
609         }
610         else
611         {
612             pMQI->txBufferFull = 0;
613         }
614 
615         portMemCopy(pNextElement, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
616                     pSrc,         GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
617         pSrc += GSP_MSG_QUEUE_ELEMENT_SIZE_MIN;
618     }
619 
620     //
    // If write-after-write (WAW) memory ordering is relaxed on a CPU, the
    // msgq update below could reach memory before the data copied by
    // portMemCopy above. GSP-RM would then read incorrect data because the
    // msgq state was updated first. This is the classic producer/consumer
    // memory-ordering problem, so a store fence is needed here.
627     //
628     portAtomicMemoryFenceStore();
629 
630     nRet = msgqTxSubmitBuffers(pMQI->hQueue, pCQE->elemCount);
631 
632     if (nRet != 0)
633     {
634         NV_PRINTF(LEVEL_ERROR, "msgqTxSubmitBuffers failed: %d\n", nRet);
635         nvStatus = NV_ERR_INVALID_STATE;
636         goto done;
637     }
638 
639     // Advance seq num only if we actually used it.
640     pMQI->txSeqNum++;
641 
642     nvStatus = NV_OK;
643 
644 done:
645     return nvStatus;
646 }
647 
648 /*!
649  * GspMsgQueueReceiveStatus
650  *
651  * Get a status record from the GSP and move it from the rx queue to our
652  * staging area.
653  *
654  * Returns
 *  NV_OK                       - Record successfully read.
656  *  NV_ERR_INVALID_PARAM_STRUCT - Bad record length.
657  *  NV_ERR_NOT_READY            - Partial read.
 *  NV_ERR_INVALID_STATE        - Something really bad happened.
659  */
660 NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
661 {
662     const NvU8 *pNextElement = NULL;
663     NvU8       *pTgt         = (NvU8 *)pMQI->pCmdQueueElement;
664     int         nRet;
665     NvU32       i;
666     NvU32       nRetries;
667     NvU32       nMaxRetries  = 3;
668     NvU32       nElements    = 1;  // Assume record fits in one queue element for now.
669     NvU32       uElementSize = 0;
670     NvU32       seqMismatchDiff = NV_U32_MAX;
671     NV_STATUS   nvStatus     = NV_OK;
672 
673     for (nRetries = 0; nRetries < nMaxRetries; nRetries++)
674     {
675         pTgt      = (NvU8 *)pMQI->pCmdQueueElement;
676         nvStatus  = NV_OK;
677         nElements = 1;  // Assume record fits in one queue element for now.
678 
679         for (i = 0; i < nElements; i++)
680         {
681             // Get the pointer to the next queue element.
682             pNextElement = msgqRxGetReadBuffer(pMQI->hQueue, i);
683             if (pNextElement == NULL)
684             {
685                 // Early exit if this is the first read and there is no data.
686                 if (i == 0)
687                     return NV_WARN_NOTHING_TO_DO;
688 
689                 //
690                 // We already successfully read part of the record, so we are here
691                 // because the data is in flight (no fence) or the length was wrong.
692                 //
693                 NV_PRINTF(LEVEL_ERROR, "Incomplete read.\n");
694                 nvStatus = NV_ERR_NOT_READY;
695                 break;
696             }
697 
698             // Copy the next element to our staging area.
699             portMemCopy(pTgt, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
700                         pNextElement, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
701             pTgt += GSP_MSG_QUEUE_ELEMENT_SIZE_MIN;
702 
703             if (i == 0)
704             {
705                 //
706                 // Special processing for first element of the record.
707                 // Pull out the element count. This adjusts the loop condition.
708                 //
709                 nElements = pMQI->pCmdQueueElement->elemCount;
710             }
711         }
712 
713         // Retry if there was an error.
714         if (nvStatus != NV_OK)
715             continue;
716 
717         // Retry if checksum fails.
718         if (gpuIsCCFeatureEnabled(pGpu))
719         {
            //
            // In the Confidential Compute scenario, the record length is
            // inside the encrypted payload and is not available before
            // decryption, so the checksum covers the complete element range.
            //
721             if (_checkSum32(pMQI->pCmdQueueElement, (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN)) != 0)
722             {
723                 NV_PRINTF(LEVEL_ERROR, "Bad checksum.\n");
724                 nvStatus = NV_ERR_INVALID_DATA;
725                 continue;
726             }
        }
        else if (_checkSum32(pMQI->pCmdQueueElement, uElementSize) != 0)
729         {
730             NV_PRINTF(LEVEL_ERROR, "Bad checksum.\n");
731             nvStatus = NV_ERR_INVALID_DATA;
732             continue;
733         }
734 
735         // Retry if sequence number is wrong.
736         if (pMQI->pCmdQueueElement->seqNum != pMQI->rxSeqNum)
737         {
738             NV_PRINTF(LEVEL_ERROR, "Bad sequence number.  Expected %u got %u. Possible memory corruption.\n",
739                 pMQI->rxSeqNum, pMQI->pCmdQueueElement->seqNum);
740 
            // If we read an old piece of data, try to ignore it and move on.
742             if (pMQI->pCmdQueueElement->seqNum < pMQI->rxSeqNum)
743             {
744                 // Make sure we're converging to the desired pMQI->rxSeqNum
745                 if ((pMQI->rxSeqNum - pMQI->pCmdQueueElement->seqNum) < seqMismatchDiff)
746                 {
747                     NV_PRINTF(LEVEL_ERROR, "Attempting recovery: ignoring old package with seqNum=%u of %u elements.\n",
748                         pMQI->pCmdQueueElement->seqNum, nElements);
749 
750                     seqMismatchDiff = pMQI->rxSeqNum - pMQI->pCmdQueueElement->seqNum;
751                     nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
752                     if (nRet < 0)
753                     {
754                         NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
755                     }
756                     nMaxRetries++;
757                 }
758             }
759 
760             nvStatus = NV_ERR_INVALID_DATA;
761             continue;
762         }
763 
764         // We have the whole record, so break out of the retry loop.
765         break;
766     }
767 
768     if (nRetries > 0)
769     {
770         if (nvStatus == NV_OK)
771         {
772             NV_PRINTF(LEVEL_ERROR, "Read succeeded with %d retries.\n", nRetries);
773         }
774         else
775         {
776             NV_PRINTF(LEVEL_ERROR, "Read failed after %d retries.\n", nRetries);
777             return nvStatus;
778         }
779     }
780 
781     if (gpuIsCCFeatureEnabled(pGpu))
782     {
783         ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);
784         nvStatus = ccslDecrypt(pCC->pRpcCcslCtx,
785                                (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN) - GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
786                                ((NvU8*)pMQI->pCmdQueueElement) + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
787                                NULL,
788                                (NvU8*)pMQI->pCmdQueueElement->aadBuffer,
789                                sizeof(pMQI->pCmdQueueElement->aadBuffer),
790                                ((NvU8*)pMQI->pCmdQueueElement) + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
791                                ((NvU8*)pMQI->pCmdQueueElement->authTagBuffer));
792 
793         if (nvStatus != NV_OK)
794         {
795             // Do not re-try if decryption failed. Decryption failure is considered fatal.
796             NV_PRINTF(LEVEL_ERROR, "Fatal error detected in RPC decrypt: 0x%x!\n", nvStatus);
797             confComputeSetErrorState(pGpu, pCC);
798             return nvStatus;
799         }
800     }
801 
802     // Sanity check for the given RPC length
803     uElementSize = GSP_MSG_QUEUE_ELEMENT_HDR_SIZE + pMQI->pCmdQueueElement->rpc.length;
804 
805     if ((uElementSize < sizeof(GSP_MSG_QUEUE_ELEMENT)) ||
806         (uElementSize > GSP_MSG_QUEUE_ELEMENT_SIZE_MAX))
807     {
808         // The length is not valid.  If we are running without a fence,
809         // this could mean that the data is still in flight from the CPU.
810         NV_PRINTF(LEVEL_ERROR, "Incorrect length %u\n",
811             pMQI->pCmdQueueElement->rpc.length);
812         nvStatus = NV_ERR_INVALID_PARAM_STRUCT;
813     }
814 
815     if (nvStatus == NV_OK)
816     {
817         pMQI->rxSeqNum++;
818 
819         nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
820         if (nRet < 0)
821         {
822             NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
823             nvStatus = NV_ERR_GENERIC;
824         }
825     }
826 
827     return nvStatus;
828 }
829 
830