/*
 * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*!
 * GSP MESSAGE QUEUE - CPU-SIDE CODE
 */

#include "core/core.h"
#include "core/thread_state.h"


#include "os/os.h"

#include "vgpu/rpc_headers.h"
#include "gpu/mem_mgr/virt_mem_allocator_common.h"

#define RPC_STRUCTURES
#define RPC_GENERIC_UNION
#include "g_rpc-structures.h"
#undef RPC_STRUCTURES
#undef RPC_GENERIC_UNION

#define RPC_MESSAGE_STRUCTURES
#define RPC_MESSAGE_GENERIC_UNION
#include "g_rpc-message-header.h"
#undef RPC_MESSAGE_STRUCTURES
#undef RPC_MESSAGE_GENERIC_UNION

#include "gpu/gsp/message_queue.h"
#include "gpu/gsp/message_queue_priv.h"
#include "msgq/msgq_priv.h"
#include "gpu/gsp/kernel_gsp.h"
#include "nvrm_registry.h"
#include "gpu/conf_compute/ccsl.h"
#include "gpu/conf_compute/conf_compute.h"

ct_assert(GSP_MSG_QUEUE_HEADER_SIZE > sizeof(msgqTxHeader) + sizeof(msgqRxHeader));

static void _gspMsgQueueCleanup(MESSAGE_QUEUE_INFO *pMQI);

static void
_getMsgQueueParams
(
    OBJGPU *pGpu,
    MESSAGE_QUEUE_COLLECTION *pMQCollection
)
{
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    NvLength queueSize;
    MESSAGE_QUEUE_INFO *pRmQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
    MESSAGE_QUEUE_INFO *pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];
    NvU32 numPtes;
    const NvLength defaultCommandQueueSize = 0x40000; // 256 KB
    const NvLength defaultStatusQueueSize = 0x40000; // 256 KB
    NvU32 regStatusQueueSize;

    // RmQueue sizes
    if (IS_SILICON(pGpu))
    {
        pRmQueueInfo->commandQueueSize = defaultCommandQueueSize;
    }
    else
    {
        //
        // Pre-silicon platforms need a large command queue in order to send
        // the VBIOS image via RPC.
        //
        pRmQueueInfo->commandQueueSize = defaultCommandQueueSize * 6;
    }

    // Check for a status queue size override
    if (osReadRegistryDword(pGpu, NV_REG_STR_RM_GSP_STATUS_QUEUE_SIZE, &regStatusQueueSize) == NV_OK)
    {
        regStatusQueueSize *= 1024; // to bytes
        regStatusQueueSize = NV_MAX(GSP_MSG_QUEUE_ELEMENT_SIZE_MAX, regStatusQueueSize);
        regStatusQueueSize = NV_ALIGN_UP(regStatusQueueSize, 1 << GSP_MSG_QUEUE_ALIGN);
        pRmQueueInfo->statusQueueSize = regStatusQueueSize;
    }
    else
    {
        pRmQueueInfo->statusQueueSize = defaultStatusQueueSize;
    }
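    //
    // Illustrative example of the override above (not an additional code
    // path): a regkey value of 1024 requests a 1024 KB status queue; it
    // becomes 1048576 bytes, is raised to at least
    // GSP_MSG_QUEUE_ELEMENT_SIZE_MAX so a maximum-sized element always fits,
    // and is then rounded up to a multiple of (1 << GSP_MSG_QUEUE_ALIGN) to
    // preserve the queue alignment.
    //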

    // TaskIsrQueue sizes
    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        pTaskIsrQueueInfo->commandQueueSize = defaultCommandQueueSize;
        pTaskIsrQueueInfo->statusQueueSize = defaultStatusQueueSize;
    }
    else
    {
        pTaskIsrQueueInfo->commandQueueSize = 0;
        pTaskIsrQueueInfo->statusQueueSize = 0;
    }

    //
    // Calculate the number of entries required to map both queues in addition
    // to the page table itself.
    //
    queueSize = pRmQueueInfo->commandQueueSize + pRmQueueInfo->statusQueueSize +
                pTaskIsrQueueInfo->commandQueueSize + pTaskIsrQueueInfo->statusQueueSize;
    NV_ASSERT((queueSize & RM_PAGE_MASK) == 0);
    numPtes = (queueSize >> RM_PAGE_SHIFT);

    // Account for the pages needed to store the PTEs
    numPtes += NV_DIV_AND_CEIL(numPtes * sizeof(RmPhysAddr), RM_PAGE_SIZE);

    //
    // Align the page table size to RM_PAGE_SIZE, so that the command queue is
    // aligned.
    //
    pMQCollection->pageTableSize = RM_PAGE_ALIGN_UP(numPtes * sizeof(RmPhysAddr));
    pMQCollection->pageTableEntryCount = numPtes;
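    //
    // Worked example (illustrative; assumes 4 KB RM pages and 8-byte
    // RmPhysAddr PTEs): with the default 256 KB sizes above and the task ISR
    // queue present, the four queues total 1 MB, i.e. 256 data PTEs. Those
    // 256 PTEs occupy 2 KB, which fits in one additional page, so numPtes
    // becomes 257 and pageTableSize rounds up to a single 4 KB page.
    //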
}

static NV_STATUS
_gspMsgQueueInit
(
    MESSAGE_QUEUE_INFO *pMQI
)
{
    NvU32 workAreaSize;
    NV_STATUS nvStatus = NV_OK;
    int nRet;

    // Allocate work area.
    workAreaSize = (1 << GSP_MSG_QUEUE_ELEMENT_ALIGN) +
                   GSP_MSG_QUEUE_ELEMENT_SIZE_MAX + msgqGetMetaSize();
    pMQI->pWorkArea = portMemAllocNonPaged(workAreaSize);
    if (pMQI->pWorkArea == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Error allocating pWorkArea.\n");
        return NV_ERR_NO_MEMORY;
    }

    portMemSet(pMQI->pWorkArea, 0, workAreaSize);

    pMQI->pCmdQueueElement = (GSP_MSG_QUEUE_ELEMENT *)
        NV_ALIGN_UP((NvUPtr)pMQI->pWorkArea, 1 << GSP_MSG_QUEUE_ELEMENT_ALIGN);
    pMQI->pMetaData = (void *)((NvUPtr)pMQI->pCmdQueueElement + GSP_MSG_QUEUE_ELEMENT_SIZE_MAX);
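    //
    // Resulting work area layout (sketch; the exact padding depends on the
    // allocation's natural alignment):
    //
    //   pWorkArea
    //     [0 .. (1 << GSP_MSG_QUEUE_ELEMENT_ALIGN) - 1 bytes of alignment padding]
    //   pCmdQueueElement   staging element, GSP_MSG_QUEUE_ELEMENT_SIZE_MAX bytes
    //   pMetaData          msgq metadata, msgqGetMetaSize() bytes
    //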

    nRet = msgqInit(&pMQI->hQueue, pMQI->pMetaData);
    if (nRet < 0)
    {
        NV_PRINTF(LEVEL_ERROR, "msgqInit failed: %d\n", nRet);
        nvStatus = NV_ERR_GENERIC;
        goto error_ret;
    }

    nRet = msgqTxCreate(pMQI->hQueue,
                        pMQI->pCommandQueue,
                        pMQI->commandQueueSize,
                        GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
                        GSP_MSG_QUEUE_HEADER_ALIGN,
                        GSP_MSG_QUEUE_ELEMENT_ALIGN,
                        MSGQ_FLAGS_SWAP_RX);
    if (nRet < 0)
    {
        NV_PRINTF(LEVEL_ERROR, "msgqTxCreate failed: %d\n", nRet);
        nvStatus = NV_ERR_GENERIC;
        goto error_ret;
    }

    pMQI->pRpcMsgBuf = &pMQI->pCmdQueueElement->rpc;

    NV_PRINTF(LEVEL_INFO, "Created command queue.\n");
    return nvStatus;

error_ret:
    _gspMsgQueueCleanup(pMQI);
    return nvStatus;
}

/*!
 * GspMsgQueuesInit
 *
 * Initialize the command queues for the CPU side.
 * Must not be called before portInitialize.
 */
NV_STATUS
GspMsgQueuesInit
(
    OBJGPU *pGpu,
    MESSAGE_QUEUE_COLLECTION **ppMQCollection
)
{
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;
    MESSAGE_QUEUE_INFO *pRmQueueInfo = NULL;
    MESSAGE_QUEUE_INFO *pTaskIsrQueueInfo = NULL;
    RmPhysAddr *pPageTbl;
    NvP64 pVaKernel;
    NvP64 pPrivKernel;
    NV_STATUS nvStatus = NV_OK;
    NvLength sharedBufSize;
    NvP64 lastQueueVa;
    NvLength lastQueueSize;
    NvU64 flags = MEMDESC_FLAGS_NONE;

    if (*ppMQCollection != NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "GSP message queue was already initialized.\n");
        return NV_ERR_INVALID_STATE;
    }

    pMQCollection = portMemAllocNonPaged(sizeof *pMQCollection);
    if (pMQCollection == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Error allocating queue info area.\n");
        nvStatus = NV_ERR_NO_MEMORY;
        goto done;
    }
    portMemSet(pMQCollection, 0, sizeof *pMQCollection);

    _getMsgQueueParams(pGpu, pMQCollection);

    pRmQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
    pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];

    sharedBufSize = pMQCollection->pageTableSize +
                    pRmQueueInfo->commandQueueSize +
                    pRmQueueInfo->statusQueueSize +
                    pTaskIsrQueueInfo->commandQueueSize +
                    pTaskIsrQueueInfo->statusQueueSize;

    flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;

    //
    // For now, put all shared queue memory in one block.
    //
    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescCreate(&pMQCollection->pSharedMemDesc, pGpu, sharedBufSize,
            RM_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, NV_MEMORY_CACHED,
            flags),
        done);

    memdescSetFlag(pMQCollection->pSharedMemDesc, MEMDESC_FLAGS_KERNEL_MODE, NV_TRUE);

    memdescTagAlloc(nvStatus, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_58,
                    pMQCollection->pSharedMemDesc);
    NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_ret);

    // Create kernel mapping for command queue.
    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescMap(pMQCollection->pSharedMemDesc, 0, sharedBufSize,
            NV_TRUE, NV_PROTECT_WRITEABLE,
            &pVaKernel, &pPrivKernel),
        error_ret);

    memdescSetKernelMapping(pMQCollection->pSharedMemDesc, pVaKernel);
    memdescSetKernelMappingPriv(pMQCollection->pSharedMemDesc, pPrivKernel);

    if (pVaKernel == NvP64_NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Error allocating message queue shared buffer\n");
        nvStatus = NV_ERR_NO_MEMORY;
        goto error_ret;
    }

    portMemSet((void *)pVaKernel, 0, sharedBufSize);

    pPageTbl = pVaKernel;

    // Shared memory layout.
    //
    // Each of the following is page aligned:
    //   Shared memory layout header (includes page table)
    //   RM Command queue header
    //   RM Command queue entries
    //   RM Status queue header
    //   RM Status queue entries
    //   TASKISR Command queue header
    //   TASKISR Command queue entries
    //   TASKISR Status queue header
    //   TASKISR Status queue entries
    memdescGetPhysAddrs(pMQCollection->pSharedMemDesc,
                        AT_GPU,                              // addressTranslation
                        0,                                   // offset
                        RM_PAGE_SIZE,                        // stride
                        pMQCollection->pageTableEntryCount,  // count
                        pPageTbl);                           // physical address table

    pRmQueueInfo->pCommandQueue = NvP64_VALUE(
        NvP64_PLUS_OFFSET(pVaKernel, pMQCollection->pageTableSize));

    pRmQueueInfo->pStatusQueue = NvP64_VALUE(
        NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pRmQueueInfo->pCommandQueue), pRmQueueInfo->commandQueueSize));

    lastQueueVa = NV_PTR_TO_NvP64(pRmQueueInfo->pStatusQueue);
    lastQueueSize = pRmQueueInfo->statusQueueSize;

    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        pTaskIsrQueueInfo->pCommandQueue = NvP64_VALUE(
            NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pRmQueueInfo->pStatusQueue), pRmQueueInfo->statusQueueSize));

        pTaskIsrQueueInfo->pStatusQueue = NvP64_VALUE(
            NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pTaskIsrQueueInfo->pCommandQueue), pTaskIsrQueueInfo->commandQueueSize));

        lastQueueVa = NV_PTR_TO_NvP64(pTaskIsrQueueInfo->pStatusQueue);
        lastQueueSize = pTaskIsrQueueInfo->statusQueueSize;
    }

    // Assert that the last queue offset + size fits into the shared memory.
    NV_ASSERT(NvP64_PLUS_OFFSET(pVaKernel, sharedBufSize) ==
              NvP64_PLUS_OFFSET(lastQueueVa, lastQueueSize));
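    //
    // Concrete layout example (illustrative; assumes the default 256 KB queue
    // sizes, a required task ISR queue, and the 4 KB page table computed in
    // _getMsgQueueParams):
    //
    //   offset 0x000000  page table            (0x1000 bytes)
    //   offset 0x001000  RM command queue      (0x40000 bytes)
    //   offset 0x041000  RM status queue       (0x40000 bytes)
    //   offset 0x081000  TASKISR command queue (0x40000 bytes)
    //   offset 0x0C1000  TASKISR status queue  (0x40000 bytes)
    //                    sharedBufSize = 0x101000 bytes
    //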

    NV_ASSERT_OK_OR_GOTO(nvStatus, _gspMsgQueueInit(pRmQueueInfo), error_ret);
    pRmQueueInfo->queueIdx = RPC_TASK_RM_QUEUE_IDX;

    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        NV_ASSERT_OK_OR_GOTO(nvStatus, _gspMsgQueueInit(pTaskIsrQueueInfo), error_ret);
        pTaskIsrQueueInfo->queueIdx = RPC_TASK_ISR_QUEUE_IDX;
    }

    *ppMQCollection = pMQCollection;
    pMQCollection->sharedMemPA = pPageTbl[0];

done:
    return nvStatus;

error_ret:
    GspMsgQueuesCleanup(&pMQCollection);
    return nvStatus;
}
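
//
// Typical call sequence (sketch only; the real callers live in the KernelGsp
// boot path and may differ in detail):
//
//   MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;
//
//   GspMsgQueuesInit(pGpu, &pMQCollection);   // before GSP-RM is booted
//   // ...boot GSP-RM, which is given the shared memory described by
//   // pMQCollection->sharedMemPA...
//   GspStatusQueueInit(pGpu, &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX]);
//   // ...GspMsgQueueSendCommand()/GspMsgQueueReceiveStatus() per RPC...
//   GspMsgQueuesCleanup(&pMQCollection);      // on teardown
//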

NV_STATUS GspStatusQueueInit(OBJGPU *pGpu, MESSAGE_QUEUE_INFO **ppMQI)
{
    NV_STATUS nvStatus = NV_ERR_GENERIC;
    int nRet = 0;
    int nRetries;
    RMTIMEOUT timeout;
    NvU32 timeoutUs = 4000000;
    NvU32 timeoutFlags = GPU_TIMEOUT_FLAGS_DEFAULT;
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);

    // GSP-RM in emulation/simulation environments is extremely slow
    if (IS_EMULATION(pGpu) || IS_SIMULATION(pGpu))
    {
        //
        // Scaling timeoutUs by GSP_SCALE_TIMEOUT_EMU_SIM overflows 32 bits,
        // so just max it out instead.
        //
        timeoutUs = NV_U32_MAX;

        //
        // On slower emulators and simulation, the time it takes to link the
        // status queue is longer than the thread state timeout, so bypass
        // the thread state so our longer timeout applies.
        //
        timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE;
    }

    gpuSetTimeout(pGpu, timeoutUs, &timeout, timeoutFlags);

    // Wait for the other end of the queue to run msgqInit. Retry until the timeout.
    for (nRetries = 0; ; nRetries++)
    {
        // Link in status queue
        portAtomicMemoryFenceFull();

        nRet = msgqRxLink((*ppMQI)->hQueue, (*ppMQI)->pStatusQueue,
                          (*ppMQI)->statusQueueSize, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);

        if (nRet == 0)
        {
            NV_PRINTF(LEVEL_INFO, "Status queue linked to command queue.\n");

            //
            // If we've bypassed the thread state timeout check for slower
            // environments, it will have lapsed by now, so reset it so that
            // the next timeout check doesn't fail immediately.
            //
            if (timeoutFlags & GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE)
                threadStateResetTimeout(pGpu);

            return NV_OK;
        }

        osSpinLoop();

        nvStatus = gpuCheckTimeout(pGpu, &timeout);
        if (nvStatus != NV_OK)
            break;

        kgspDumpGspLogs(pKernelGsp, NV_FALSE);
        if (!kgspHealthCheck_HAL(pGpu, pKernelGsp))
        {
            nvStatus = NV_ERR_RESET_REQUIRED;
            break;
        }
    }

    if (nRet < 0)
    {
        NV_PRINTF(LEVEL_ERROR,
                  "msgqRxLink failed: %d, nvStatus 0x%08x, retries: %d\n",
                  nRet, nvStatus, nRetries);
        _gspMsgQueueCleanup(*ppMQI);
    }

    return nvStatus;
}

static void
_gspMsgQueueCleanup(MESSAGE_QUEUE_INFO *pMQI)
{
    if (pMQI == NULL)
    {
        return;
    }

    portMemFree(pMQI->pWorkArea);

    pMQI->pWorkArea = NULL;
    pMQI->pCmdQueueElement = NULL;
    pMQI->pMetaData = NULL;
}

void GspMsgQueuesCleanup(MESSAGE_QUEUE_COLLECTION **ppMQCollection)
{
    MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;
    MESSAGE_QUEUE_INFO *pRmQueueInfo = NULL;
    MESSAGE_QUEUE_INFO *pTaskIsrQueueInfo = NULL;

    if ((ppMQCollection == NULL) || (*ppMQCollection == NULL))
        return;

    pMQCollection = *ppMQCollection;
    pRmQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
    pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];

    _gspMsgQueueCleanup(pRmQueueInfo);
    _gspMsgQueueCleanup(pTaskIsrQueueInfo);

    if (pMQCollection->pSharedMemDesc != NULL)
    {
        NvP64 pVaKernel = memdescGetKernelMapping(pMQCollection->pSharedMemDesc);
        NvP64 pPrivKernel = memdescGetKernelMappingPriv(pMQCollection->pSharedMemDesc);

        // Destroy kernel mapping for command queue.
        if (pVaKernel != 0)
        {
            memdescUnmap(pMQCollection->pSharedMemDesc, NV_TRUE, osGetCurrentProcess(),
                         pVaKernel, pPrivKernel);
        }

        // Free command queue memory.
        memdescFree(pMQCollection->pSharedMemDesc);
        memdescDestroy(pMQCollection->pSharedMemDesc);
        pMQCollection->pSharedMemDesc = NULL;
    }

    portMemFree(pMQCollection);
    *ppMQCollection = NULL;
}
/*!
 * GspMsgQueueSendCommand
 *
 * Move a command record from our staging area to the command queue.
 *
 * Returns
 *   NV_OK                       - Record successfully sent.
 *   NV_ERR_INVALID_PARAM_STRUCT - Bad record length.
 *   NV_ERR_BUSY_RETRY           - No space in the queue.
 *   NV_ERR_INVALID_STATE        - Something really bad happened.
 */
NV_STATUS GspMsgQueueSendCommand(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
{
    GSP_MSG_QUEUE_ELEMENT *pCQE = pMQI->pCmdQueueElement;
    NvU8 *pSrc = (NvU8 *)pCQE;
    NvU8 *pNextElement = NULL;
    int nRet;
    NvU32 i;
    RMTIMEOUT timeout;
    NV_STATUS nvStatus = NV_OK;
    NvU32 uElementSize = GSP_MSG_QUEUE_ELEMENT_HDR_SIZE +
                         pMQI->pCmdQueueElement->rpc.length;

    if ((uElementSize < sizeof(GSP_MSG_QUEUE_ELEMENT)) ||
        (uElementSize > GSP_MSG_QUEUE_ELEMENT_SIZE_MAX))
    {
        NV_PRINTF(LEVEL_ERROR, "Incorrect length %u\n",
                  pMQI->pCmdQueueElement->rpc.length);
        nvStatus = NV_ERR_INVALID_PARAM_STRUCT;
        goto done;
    }

    // Make sure the queue element in our working space is zero padded for checksum.
    if ((uElementSize & 7) != 0)
        portMemSet(pSrc + uElementSize, 0, 8 - (uElementSize & 7));
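    //
    // The padding above rounds the checksummed span up to an 8-byte boundary.
    // This assumes _checkSum32() folds the buffer in whole 8-byte words (which
    // the 8-byte rounding here suggests), so any tail bytes past the RPC
    // payload must be deterministically zero.
    //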

    pCQE->seqNum = pMQI->txSeqNum;
    pCQE->elemCount = GSP_MSG_QUEUE_BYTES_TO_ELEMENTS(uElementSize);
    pCQE->checkSum = 0; // The checkSum field is included in the checksum calculation, so zero it.

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);

        // Use sequence number as AAD.
        portMemCopy((NvU8*)pCQE->aadBuffer, sizeof(pCQE->aadBuffer), (NvU8 *)&pCQE->seqNum, sizeof(pCQE->seqNum));

        // We need to encrypt the full queue elements to obscure the data.
        nvStatus = ccslEncryptWithRotationChecks(pCC->pRpcCcslCtx,
                       (pCQE->elemCount * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN) - GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                       pSrc + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                       (NvU8*)pCQE->aadBuffer,
                       sizeof(pCQE->aadBuffer),
                       pSrc + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                       pCQE->authTagBuffer);

        if (nvStatus != NV_OK)
        {
            // Do not re-try if encryption fails.
            NV_PRINTF(LEVEL_ERROR, "Encryption failed with status = 0x%x.\n", nvStatus);
            if (nvStatus == NV_ERR_INSUFFICIENT_RESOURCES)
            {
                // We hit a potential IV overflow; this is fatal.
                NV_PRINTF(LEVEL_ERROR, "Fatal error detected in RPC encrypt: IV overflow!\n");
                confComputeSetErrorState(pGpu, pCC);
            }
            return nvStatus;
        }

        // Now that encryption covers the elements completely, include them in the checksum.
        pCQE->checkSum = _checkSum32(pSrc, pCQE->elemCount * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
    }
    else
    {
        pCQE->checkSum = _checkSum32(pSrc, uElementSize);
    }

    for (i = 0; i < pCQE->elemCount; i++)
    {
        NvU32 timeoutFlags = 0;

        if (pMQI->txBufferFull)
            timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_JOURNAL_LOG;

        // Set a timeout of 1 sec
        gpuSetTimeout(pGpu, 1000000, &timeout, timeoutFlags);

        // Wait for space to put the next element.
        while (NV_TRUE)
        {
            // Must get the buffers one at a time, since they could wrap.
            pNextElement = (NvU8 *)msgqTxGetWriteBuffer(pMQI->hQueue, i);

            if (pNextElement != NULL)
                break;

            if (gpuCheckTimeout(pGpu, &timeout) != NV_OK)
                break;

            portAtomicMemoryFenceFull();

            osSpinLoop();
        }

        if (pNextElement == NULL)
        {
            pMQI->txBufferFull++;
            NV_PRINTF_COND(pMQI->txBufferFull == 1, LEVEL_ERROR, LEVEL_INFO,
                           "buffer is full (waiting for %d free elements, got %d)\n",
                           pCQE->elemCount, i);
            nvStatus = NV_ERR_BUSY_RETRY;
            goto done;
        }
        else
        {
            pMQI->txBufferFull = 0;
        }

        portMemCopy(pNextElement, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
                    pSrc, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
        pSrc += GSP_MSG_QUEUE_ELEMENT_SIZE_MIN;
    }

    //
    // If write-after-write (WAW) memory ordering is relaxed on a CPU, it is
    // possible that the msgq update below reaches memory before the
    // portMemCopy data above. GSP-RM would then read incorrect data because
    // the msgq was updated first. This is a typical producer-consumer memory
    // ordering problem, so a store fence is needed here.
    //
    portAtomicMemoryFenceStore();

    nRet = msgqTxSubmitBuffers(pMQI->hQueue, pCQE->elemCount);

    if (nRet != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "msgqTxSubmitBuffers failed: %d\n", nRet);
        nvStatus = NV_ERR_INVALID_STATE;
        goto done;
    }

    // Advance the sequence number only if we actually used it.
    pMQI->txSeqNum++;

    nvStatus = NV_OK;

done:
    return nvStatus;
}

/*!
 * GspMsgQueueReceiveStatus
 *
 * Get a status record from the GSP and move it from the rx queue to our
 * staging area.
 *
 * Returns
 *   NV_OK                       - Record successfully read.
 *   NV_ERR_INVALID_PARAM_STRUCT - Bad record length.
 *   NV_ERR_NOT_READY            - Partial read.
 *   NV_ERR_INVALID_STATE        - Something really bad happened.
 */
NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
{
    const NvU8 *pNextElement = NULL;
    NvU8 *pTgt = (NvU8 *)pMQI->pCmdQueueElement;
    int nRet;
    NvU32 i;
    NvU32 nRetries;
    NvU32 nMaxRetries = 3;
    NvU32 nElements = 1; // Assume the record fits in one queue element for now.
    NvU32 uElementSize;
    NvU32 checkSum;
    NvU32 seqMismatchDiff = NV_U32_MAX;
    NV_STATUS nvStatus = NV_OK;

    for (nRetries = 0; nRetries < nMaxRetries; nRetries++)
    {
        pTgt = (NvU8 *)pMQI->pCmdQueueElement;
        nvStatus = NV_OK;
        nElements = 1; // Assume the record fits in one queue element for now.

        for (i = 0; i < nElements; i++)
        {
            // Get the pointer to the next queue element.
            pNextElement = msgqRxGetReadBuffer(pMQI->hQueue, i);
            if (pNextElement == NULL)
            {
                // Early exit if this is the first read and there is no data.
                if (i == 0)
                    return NV_WARN_NOTHING_TO_DO;

                //
                // We already successfully read part of the record, so we are here
                // because the data is in flight (no fence) or the length was wrong.
                //
                NV_PRINTF(LEVEL_ERROR, "Incomplete read.\n");
                nvStatus = NV_ERR_NOT_READY;
                break;
            }

            // Copy the next element to our staging area.
            portMemCopy(pTgt, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
                        pNextElement, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
            pTgt += GSP_MSG_QUEUE_ELEMENT_SIZE_MIN;

            if (i == 0)
            {
                //
                // Special processing for the first element of the record.
                // Pull out the element count. This adjusts the loop condition.
                //
                nElements = pMQI->pCmdQueueElement->elemCount;
            }
        }

        // Retry if there was an error.
        if (nvStatus != NV_OK)
            continue;

        // Retry if the checksum fails.
        if (gpuIsCCFeatureEnabled(pGpu))
        {
            //
            // In the Confidential Compute scenario, the actual message length
            // is inside the encrypted payload, and we can't access it before
            // decryption, therefore the checksum encompasses the whole element
            // range. This makes checksum verification significantly slower
            // because messages are typically much smaller than the element size.
            //
            checkSum = _checkSum32(pMQI->pCmdQueueElement,
                                   (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN));
        }
        else
        {
            checkSum = _checkSum32(pMQI->pCmdQueueElement,
                                   (GSP_MSG_QUEUE_ELEMENT_HDR_SIZE +
                                    pMQI->pCmdQueueElement->rpc.length));
        }
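        //
        // The sender computes the checksum with the checkSum field zeroed and
        // then stores the result in that field. Assuming _checkSum32() is an
        // XOR-style fold, recomputing it here over the same span (stored field
        // included) cancels out to zero for an intact element.
        //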

        if (checkSum != 0)
        {
            NV_PRINTF(LEVEL_ERROR, "Bad checksum.\n");
            nvStatus = NV_ERR_INVALID_DATA;
            continue;
        }

        // Retry if the sequence number is wrong.
        if (pMQI->pCmdQueueElement->seqNum != pMQI->rxSeqNum)
        {
            NV_PRINTF(LEVEL_ERROR, "Bad sequence number. Expected %u got %u. Possible memory corruption.\n",
                      pMQI->rxSeqNum, pMQI->pCmdQueueElement->seqNum);

            // If we read an old piece of data, try to ignore it and move on.
            if (pMQI->pCmdQueueElement->seqNum < pMQI->rxSeqNum)
            {
                // Make sure we're converging to the desired pMQI->rxSeqNum.
                if ((pMQI->rxSeqNum - pMQI->pCmdQueueElement->seqNum) < seqMismatchDiff)
                {
                    NV_PRINTF(LEVEL_ERROR, "Attempting recovery: ignoring old package with seqNum=%u of %u elements.\n",
                              pMQI->pCmdQueueElement->seqNum, nElements);

                    seqMismatchDiff = pMQI->rxSeqNum - pMQI->pCmdQueueElement->seqNum;
                    nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
                    if (nRet < 0)
                    {
                        NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
                    }
                    nMaxRetries++;
                }
            }

            nvStatus = NV_ERR_INVALID_DATA;
            continue;
        }

        // We have the whole record, so break out of the retry loop.
        break;
    }

    if (nRetries > 0)
    {
        if (nvStatus == NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Read succeeded with %d retries.\n", nRetries);
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "Read failed after %d retries.\n", nRetries);
            return nvStatus;
        }
    }

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);
        nvStatus = ccslDecryptWithRotationChecks(pCC->pRpcCcslCtx,
                       (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN) - GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                       ((NvU8*)pMQI->pCmdQueueElement) + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                       NULL,
                       (NvU8*)pMQI->pCmdQueueElement->aadBuffer,
                       sizeof(pMQI->pCmdQueueElement->aadBuffer),
                       ((NvU8*)pMQI->pCmdQueueElement) + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                       ((NvU8*)pMQI->pCmdQueueElement->authTagBuffer));

        if (nvStatus != NV_OK)
        {
            // Do not re-try if decryption failed. Decryption failure is considered fatal.
            NV_PRINTF(LEVEL_ERROR, "Fatal error detected in RPC decrypt: 0x%x!\n", nvStatus);
            confComputeSetErrorState(pGpu, pCC);
            return nvStatus;
        }
    }

    // Sanity check for the given RPC length.
    uElementSize = GSP_MSG_QUEUE_ELEMENT_HDR_SIZE + pMQI->pCmdQueueElement->rpc.length;

    if ((uElementSize < sizeof(GSP_MSG_QUEUE_ELEMENT)) ||
        (uElementSize > GSP_MSG_QUEUE_ELEMENT_SIZE_MAX))
    {
        //
        // The length is not valid. If we are running without a fence,
        // this could mean that the data is still in flight from the GSP.
        //
        NV_PRINTF(LEVEL_ERROR, "Incorrect length %u\n",
                  pMQI->pCmdQueueElement->rpc.length);
        nvStatus = NV_ERR_INVALID_PARAM_STRUCT;
    }

    if (nvStatus == NV_OK)
    {
        pMQI->rxSeqNum++;

        nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
        if (nRet < 0)
        {
            NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
            nvStatus = NV_ERR_GENERIC;
        }
    }

    return nvStatus;
}