/*
 * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*!
 * GSP MESSAGE QUEUE - CPU-SIDE CODE
 */

#include "core/core.h"
#include "core/thread_state.h"


#include "os/os.h"

#include "vgpu/rpc_headers.h"
#include "gpu/mem_mgr/virt_mem_allocator_common.h"

#define RPC_STRUCTURES
#define RPC_GENERIC_UNION
#include "g_rpc-structures.h"
#undef RPC_STRUCTURES
#undef RPC_GENERIC_UNION

#define RPC_MESSAGE_STRUCTURES
#define RPC_MESSAGE_GENERIC_UNION
#include "g_rpc-message-header.h"
#undef RPC_MESSAGE_STRUCTURES
#undef RPC_MESSAGE_GENERIC_UNION

#include "gpu/gsp/message_queue.h"
#include "gpu/gsp/message_queue_priv.h"
#include "msgq/msgq_priv.h"
#include "gpu/gsp/kernel_gsp.h"
#include "nvrm_registry.h"
#include "gpu/conf_compute/ccsl.h"
#include "gpu/conf_compute/conf_compute.h"

ct_assert(GSP_MSG_QUEUE_HEADER_SIZE > sizeof(msgqTxHeader) + sizeof(msgqRxHeader));

static void _gspMsgQueueCleanup(MESSAGE_QUEUE_INFO *pMQI);

static void
_getMsgQueueParams
(
    OBJGPU *pGpu,
    MESSAGE_QUEUE_COLLECTION *pMQCollection
)
{
    KernelGsp          *pKernelGsp        = GPU_GET_KERNEL_GSP(pGpu);
    NvLength            queueSize;
    MESSAGE_QUEUE_INFO *pRmQueueInfo      = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
    MESSAGE_QUEUE_INFO *pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];
    NvU32               numPtes;
    const NvLength      defaultCommandQueueSize = 0x40000; // 256 KB
    const NvLength      defaultStatusQueueSize  = 0x40000; // 256 KB
    NvU32               regStatusQueueSize;

    // RmQueue sizes
    if (IS_SILICON(pGpu))
    {
        pRmQueueInfo->commandQueueSize = defaultCommandQueueSize;
    }
    else
    {
        //
        // Pre-silicon platforms need a large command queue in order to send
        // the VBIOS image via RPC.
        //
        pRmQueueInfo->commandQueueSize = defaultCommandQueueSize * 6;
    }

    // Check for a status queue size override.
    if (osReadRegistryDword(pGpu, NV_REG_STR_RM_GSP_STATUS_QUEUE_SIZE, &regStatusQueueSize) == NV_OK)
    {
        regStatusQueueSize *= 1024; // to bytes
        regStatusQueueSize = NV_MAX(GSP_MSG_QUEUE_ELEMENT_SIZE_MAX, regStatusQueueSize);
        regStatusQueueSize = NV_ALIGN_UP(regStatusQueueSize, 1 << GSP_MSG_QUEUE_ALIGN);
        pRmQueueInfo->statusQueueSize = regStatusQueueSize;
    }
    else
    {
        pRmQueueInfo->statusQueueSize = defaultStatusQueueSize;
    }

    // TaskIsrQueue sizes
    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        pTaskIsrQueueInfo->commandQueueSize = defaultCommandQueueSize;
        pTaskIsrQueueInfo->statusQueueSize  = defaultStatusQueueSize;
    }
    else
    {
        pTaskIsrQueueInfo->commandQueueSize = 0;
        pTaskIsrQueueInfo->statusQueueSize  = 0;
    }

    //
    // Calculate the number of entries required to map both queues in addition
    // to the page table itself.
    //
    queueSize = pRmQueueInfo->commandQueueSize + pRmQueueInfo->statusQueueSize +
                pTaskIsrQueueInfo->commandQueueSize + pTaskIsrQueueInfo->statusQueueSize;
    NV_ASSERT((queueSize & RM_PAGE_MASK) == 0);
    numPtes = (queueSize >> RM_PAGE_SHIFT);

    // Account for the pages needed to store the PTEs.
    numPtes += NV_DIV_AND_CEIL(numPtes * sizeof(RmPhysAddr), RM_PAGE_SIZE);

    //
    // Align the page table size to RM_PAGE_SIZE, so that the command queue is
    // aligned.
    //
    pMQCollection->pageTableSize = RM_PAGE_ALIGN_UP(numPtes * sizeof(RmPhysAddr));
    pMQCollection->pageTableEntryCount = numPtes;
}

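//
// Worked example (added commentary, illustrative only; assumes silicon
// defaults with no registry override, the task ISR queue present,
// RM_PAGE_SIZE == 4 KiB, and sizeof(RmPhysAddr) == 8):
//   queueSize = 4 * 0x40000 = 1 MiB  ->  numPtes = 256 data pages.
//   Storing 256 PTEs takes 2 KiB, which NV_DIV_AND_CEIL rounds up to one
//   extra page, so numPtes = 257 and
//   pageTableSize = RM_PAGE_ALIGN_UP(257 * 8) = 4 KiB.
//   The shared buffer allocated by GspMsgQueuesInit() is then
//   4 KiB + 1 MiB = 0x101000 bytes, i.e. exactly 257 pages.
//
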
static NV_STATUS
_gspMsgQueueInit
(
    MESSAGE_QUEUE_INFO *pMQI
)
{
    NvU32     workAreaSize;
    NV_STATUS nvStatus = NV_OK;
    int       nRet;

    // Allocate work area.
    workAreaSize = (1 << GSP_MSG_QUEUE_ELEMENT_ALIGN) +
                   GSP_MSG_QUEUE_ELEMENT_SIZE_MAX + msgqGetMetaSize();
    pMQI->pWorkArea = portMemAllocNonPaged(workAreaSize);
    if (pMQI->pWorkArea == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Error allocating pWorkArea.\n");
        return NV_ERR_NO_MEMORY;
    }

    portMemSet(pMQI->pWorkArea, 0, workAreaSize);

    pMQI->pCmdQueueElement = (GSP_MSG_QUEUE_ELEMENT *)
        NV_ALIGN_UP((NvUPtr)pMQI->pWorkArea, 1 << GSP_MSG_QUEUE_ELEMENT_ALIGN);
    pMQI->pMetaData = (void *)((NvUPtr)pMQI->pCmdQueueElement + GSP_MSG_QUEUE_ELEMENT_SIZE_MAX);

    nRet = msgqInit(&pMQI->hQueue, pMQI->pMetaData);
    if (nRet < 0)
    {
        NV_PRINTF(LEVEL_ERROR, "msgqInit failed: %d\n", nRet);
        nvStatus = NV_ERR_GENERIC;
        goto error_ret;
    }

    nRet = msgqTxCreate(pMQI->hQueue,
                        pMQI->pCommandQueue,
                        pMQI->commandQueueSize,
                        GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
                        GSP_MSG_QUEUE_HEADER_ALIGN,
                        GSP_MSG_QUEUE_ELEMENT_ALIGN,
                        MSGQ_FLAGS_SWAP_RX);
    if (nRet < 0)
    {
        NV_PRINTF(LEVEL_ERROR, "msgqTxCreate failed: %d\n", nRet);
        nvStatus = NV_ERR_GENERIC;
        goto error_ret;
    }

    pMQI->pRpcMsgBuf = &pMQI->pCmdQueueElement->rpc;

    NV_PRINTF(LEVEL_INFO, "Created command queue.\n");
    return nvStatus;

error_ret:
    _gspMsgQueueCleanup(pMQI);
    return nvStatus;
}

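//
// Work-area layout sketch (added commentary, descriptive only): pWorkArea
// begins with up to (1 << GSP_MSG_QUEUE_ELEMENT_ALIGN) - 1 bytes of
// alignment padding, followed by the aligned staging element
// pCmdQueueElement (GSP_MSG_QUEUE_ELEMENT_SIZE_MAX bytes), followed by the
// msgq metadata (msgqGetMetaSize() bytes) that msgqInit() uses for hQueue
// bookkeeping.  workAreaSize above is the sum of these three pieces.
//
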
/*!
 * GspMsgQueuesInit
 *
 * Initialize the command queues for the CPU side.
 * Must not be called before portInitialize.
 */
NV_STATUS
GspMsgQueuesInit
(
    OBJGPU *pGpu,
    MESSAGE_QUEUE_COLLECTION **ppMQCollection
)
{
    KernelGsp                *pKernelGsp        = GPU_GET_KERNEL_GSP(pGpu);
    MESSAGE_QUEUE_COLLECTION *pMQCollection     = NULL;
    MESSAGE_QUEUE_INFO       *pRmQueueInfo      = NULL;
    MESSAGE_QUEUE_INFO       *pTaskIsrQueueInfo = NULL;
    RmPhysAddr               *pPageTbl;
    NvP64                     pVaKernel;
    NvP64                     pPrivKernel;
    NV_STATUS                 nvStatus          = NV_OK;
    NvLength                  sharedBufSize;
    NvP64                     lastQueueVa;
    NvLength                  lastQueueSize;
    NvU64                     flags             = MEMDESC_FLAGS_NONE;

    if (*ppMQCollection != NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "GSP message queue was already initialized.\n");
        return NV_ERR_INVALID_STATE;
    }

    pMQCollection = portMemAllocNonPaged(sizeof *pMQCollection);
    if (pMQCollection == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Error allocating queue info area.\n");
        nvStatus = NV_ERR_NO_MEMORY;
        goto done;
    }
    portMemSet(pMQCollection, 0, sizeof *pMQCollection);

    _getMsgQueueParams(pGpu, pMQCollection);

    pRmQueueInfo      = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
    pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];

    sharedBufSize = pMQCollection->pageTableSize +
                    pRmQueueInfo->commandQueueSize +
                    pRmQueueInfo->statusQueueSize +
                    pTaskIsrQueueInfo->commandQueueSize +
                    pTaskIsrQueueInfo->statusQueueSize;

    flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;

    //
    // For now, put all shared queue memory in one block.
    //
    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescCreate(&pMQCollection->pSharedMemDesc, pGpu, sharedBufSize,
            RM_PAGE_SIZE, NV_MEMORY_NONCONTIGUOUS, ADDR_SYSMEM, NV_MEMORY_CACHED,
            flags),
        done);

    memdescSetFlag(pMQCollection->pSharedMemDesc, MEMDESC_FLAGS_KERNEL_MODE, NV_TRUE);

    memdescTagAlloc(nvStatus, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_58,
                    pMQCollection->pSharedMemDesc);
    NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_ret);

    // Create kernel mapping for command queue.
    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescMap(pMQCollection->pSharedMemDesc, 0, sharedBufSize,
            NV_TRUE, NV_PROTECT_WRITEABLE,
            &pVaKernel, &pPrivKernel),
        error_ret);

    memdescSetKernelMapping(pMQCollection->pSharedMemDesc, pVaKernel);
    memdescSetKernelMappingPriv(pMQCollection->pSharedMemDesc, pPrivKernel);

    if (pVaKernel == NvP64_NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "Error allocating message queue shared buffer\n");
        nvStatus = NV_ERR_NO_MEMORY;
        goto error_ret;
    }

    portMemSet((void *)pVaKernel, 0, sharedBufSize);

    pPageTbl = pVaKernel;

    // Shared memory layout.
    //
    // Each of the following is page aligned:
    //   Shared memory layout header (includes page table)
    //   RM Command queue header
    //   RM Command queue entries
    //   RM Status queue header
    //   RM Status queue entries
    //   TASKISR Command queue header
    //   TASKISR Command queue entries
    //   TASKISR Status queue header
    //   TASKISR Status queue entries
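    //
    // Illustrative offsets (added commentary, not taken from the source:
    // assumes default 256 KB queues, the task ISR queue present, and 4 KiB
    // RM pages, so pageTableSize == 4 KiB):
    //   0x000000  page table (257 PTEs, one per page of this buffer,
    //             including the page-table page itself)
    //   0x001000  RM command queue (256 KB)
    //   0x041000  RM status queue (256 KB)
    //   0x081000  TASKISR command queue (256 KB)
    //   0x0C1000  TASKISR status queue (256 KB)
    //   0x101000  end of shared buffer (sharedBufSize)
    //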
    memdescGetPhysAddrs(pMQCollection->pSharedMemDesc,
                        AT_GPU,                              // addressTranslation
                        0,                                   // offset
                        RM_PAGE_SIZE,                        // stride
                        pMQCollection->pageTableEntryCount,  // count
                        pPageTbl);                           // physical address table

    pRmQueueInfo->pCommandQueue = NvP64_VALUE(
        NvP64_PLUS_OFFSET(pVaKernel, pMQCollection->pageTableSize));

    pRmQueueInfo->pStatusQueue = NvP64_VALUE(
        NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pRmQueueInfo->pCommandQueue), pRmQueueInfo->commandQueueSize));

    lastQueueVa   = NV_PTR_TO_NvP64(pRmQueueInfo->pStatusQueue);
    lastQueueSize = pRmQueueInfo->statusQueueSize;

    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        pTaskIsrQueueInfo->pCommandQueue = NvP64_VALUE(
            NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pRmQueueInfo->pStatusQueue), pRmQueueInfo->statusQueueSize));

        pTaskIsrQueueInfo->pStatusQueue = NvP64_VALUE(
            NvP64_PLUS_OFFSET(NV_PTR_TO_NvP64(pTaskIsrQueueInfo->pCommandQueue), pTaskIsrQueueInfo->commandQueueSize));

        lastQueueVa   = NV_PTR_TO_NvP64(pTaskIsrQueueInfo->pStatusQueue);
        lastQueueSize = pTaskIsrQueueInfo->statusQueueSize;
    }

    // Assert that the last queue offset + size fits into the shared memory.
    NV_ASSERT(NvP64_PLUS_OFFSET(pVaKernel, sharedBufSize) ==
              NvP64_PLUS_OFFSET(lastQueueVa, lastQueueSize));

    NV_ASSERT_OK_OR_GOTO(nvStatus, _gspMsgQueueInit(pRmQueueInfo), error_ret);
    pRmQueueInfo->queueIdx = RPC_TASK_RM_QUEUE_IDX;

    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        NV_ASSERT_OK_OR_GOTO(nvStatus, _gspMsgQueueInit(pTaskIsrQueueInfo), error_ret);
        pTaskIsrQueueInfo->queueIdx = RPC_TASK_ISR_QUEUE_IDX;
    }

    *ppMQCollection            = pMQCollection;
    pMQCollection->sharedMemPA = pPageTbl[0];

done:
    return nvStatus;

error_ret:
    GspMsgQueuesCleanup(&pMQCollection);
    return nvStatus;
}

NV_STATUS GspStatusQueueInit(OBJGPU *pGpu, MESSAGE_QUEUE_INFO **ppMQI)
{
    NV_STATUS  nvStatus     = NV_ERR_GENERIC;
    int        nRet         = 0;
    int        nRetries;
    RMTIMEOUT  timeout;
    NvU32      timeoutUs    = 4000000;
    NvU32      timeoutFlags = GPU_TIMEOUT_FLAGS_DEFAULT;
    KernelGsp *pKernelGsp   = GPU_GET_KERNEL_GSP(pGpu);

    // GSP-RM in an emulation/simulation environment is extremely slow.
    if (IS_EMULATION(pGpu) || IS_SIMULATION(pGpu))
    {
        //
        // Scaling timeoutUs by GSP_SCALE_TIMEOUT_EMU_SIM overflows 32 bits,
        // so just max it out instead.
        //
        timeoutUs = NV_U32_MAX;

        //
        // On slower emulators and simulation, the time it takes to link the
        // status queue is longer than the thread state timeout, so bypass
        // the thread state so our longer timeout applies.
        //
        timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE;
    }

    gpuSetTimeout(pGpu, timeoutUs, &timeout, timeoutFlags);

    // Wait for the other end of the queue to run msgqInit. Retry until the timeout.
    for (nRetries = 0; ; nRetries++)
    {
        // Link in status queue
        portAtomicMemoryFenceFull();

        nRet = msgqRxLink((*ppMQI)->hQueue, (*ppMQI)->pStatusQueue,
                          (*ppMQI)->statusQueueSize, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);

        if (nRet == 0)
        {
            NV_PRINTF(LEVEL_INFO, "Status queue linked to command queue.\n");

            //
            // If we've bypassed the thread state timeout check for slower
            // environments, it will have lapsed by now, so reset it so that
            // the next timeout check doesn't fail immediately.
            //
            if (timeoutFlags & GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE)
                threadStateResetTimeout(pGpu);

            return NV_OK;
        }

        osSpinLoop();

        nvStatus = gpuCheckTimeout(pGpu, &timeout);
        if (nvStatus != NV_OK)
            break;

        kgspDumpGspLogs(pKernelGsp, NV_FALSE);
        if (!kgspHealthCheck_HAL(pGpu, pKernelGsp))
        {
            nvStatus = NV_ERR_RESET_REQUIRED;
            break;
        }
    }

    if (nRet < 0)
    {
        NV_PRINTF(LEVEL_ERROR,
                  "msgqRxLink failed: %d, nvStatus 0x%08x, retries: %d\n",
                  nRet, nvStatus, nRetries);
        _gspMsgQueueCleanup(*ppMQI);
    }

    return nvStatus;
}

static void
_gspMsgQueueCleanup(MESSAGE_QUEUE_INFO *pMQI)
{
    if (pMQI == NULL)
    {
        return;
    }

    portMemFree(pMQI->pWorkArea);

    pMQI->pWorkArea        = NULL;
    pMQI->pCmdQueueElement = NULL;
    pMQI->pMetaData        = NULL;
}

void GspMsgQueuesCleanup(MESSAGE_QUEUE_COLLECTION **ppMQCollection)
{
    MESSAGE_QUEUE_COLLECTION *pMQCollection     = NULL;
    MESSAGE_QUEUE_INFO       *pRmQueueInfo      = NULL;
    MESSAGE_QUEUE_INFO       *pTaskIsrQueueInfo = NULL;

    if ((ppMQCollection == NULL) || (*ppMQCollection == NULL))
        return;

    pMQCollection     = *ppMQCollection;
    pRmQueueInfo      = &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX];
    pTaskIsrQueueInfo = &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX];

    _gspMsgQueueCleanup(pRmQueueInfo);
    _gspMsgQueueCleanup(pTaskIsrQueueInfo);

    if (pMQCollection->pSharedMemDesc != NULL)
    {
        NvP64 pVaKernel   = memdescGetKernelMapping(pMQCollection->pSharedMemDesc);
        NvP64 pPrivKernel = memdescGetKernelMappingPriv(pMQCollection->pSharedMemDesc);

        // Destroy kernel mapping for command queue.
        if (pVaKernel != 0)
        {
            memdescUnmap(pMQCollection->pSharedMemDesc, NV_TRUE, osGetCurrentProcess(),
                         pVaKernel, pPrivKernel);
        }

        // Free command queue memory.
        memdescFree(pMQCollection->pSharedMemDesc);
        memdescDestroy(pMQCollection->pSharedMemDesc);
        pMQCollection->pSharedMemDesc = NULL;
    }

    portMemFree(pMQCollection);
    *ppMQCollection = NULL;
}

/*!
 * Calculate 32-bit checksum
 *
 * This routine assumes that the data is padded out with zeros to the next
 * 8-byte alignment, and it is OK to read past the end to the 8-byte alignment.
 */
static NV_INLINE NvU32 _checkSum32(void *pData, NvU32 uLen)
{
    NvU64 *p        = (NvU64 *)pData;
    NvU64 *pEnd     = (NvU64 *)((NvUPtr)pData + uLen);
    NvU64  checkSum = 0;

    while (p < pEnd)
        checkSum ^= *p++;

    return NvU64_HI32(checkSum) ^ NvU64_LO32(checkSum);
}

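//
// Usage note (added commentary, not from the original source): the sender
// below zeroes pCQE->checkSum, folds the element bytes with _checkSum32(),
// and stores the result back into pCQE->checkSum.  Because the fold is a
// pure XOR and the stored 32-bit value lands in one lane of one NvU64,
// re-folding the same byte range on the receive side XORs that value back
// out, so a clean transfer always yields 0:
//
//     pCQE->checkSum = 0;
//     pCQE->checkSum = _checkSum32(pSrc, len);    // sender
//     ...
//     NV_ASSERT(_checkSum32(pSrc, len) == 0);     // receiver expectation
//
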
/*!
 * GspMsgQueueSendCommand
 *
 * Move a command record from our staging area to the command queue.
 *
 * Returns
 *  NV_OK                       - Record successfully sent.
 *  NV_ERR_INVALID_PARAM_STRUCT - Bad record length.
 *  NV_ERR_BUSY_RETRY           - No space in the queue.
 *  NV_ERR_INVALID_STATE        - Something really bad happened.
 */
NV_STATUS GspMsgQueueSendCommand(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
{
    GSP_MSG_QUEUE_ELEMENT *pCQE = pMQI->pCmdQueueElement;
    NvU8      *pSrc         = (NvU8 *)pCQE;
    NvU8      *pNextElement = NULL;
    int        nRet;
    NvU32      i;
    RMTIMEOUT  timeout;
    NV_STATUS  nvStatus     = NV_OK;
    NvU32      uElementSize = GSP_MSG_QUEUE_ELEMENT_HDR_SIZE +
                              pMQI->pCmdQueueElement->rpc.length;

    if ((uElementSize < sizeof(GSP_MSG_QUEUE_ELEMENT)) ||
        (uElementSize > GSP_MSG_QUEUE_ELEMENT_SIZE_MAX))
    {
        NV_PRINTF(LEVEL_ERROR, "Incorrect length %u\n",
                  pMQI->pCmdQueueElement->rpc.length);
        nvStatus = NV_ERR_INVALID_PARAM_STRUCT;
        goto done;
    }

    // Make sure the queue element in our working space is zero padded for checksum.
    if ((uElementSize & 7) != 0)
        portMemSet(pSrc + uElementSize, 0, 8 - (uElementSize & 7));

    pCQE->seqNum    = pMQI->txSeqNum;
    pCQE->elemCount = GSP_MSG_QUEUE_BYTES_TO_ELEMENTS(uElementSize);
    pCQE->checkSum  = 0;

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);

        // Use sequence number as AAD.
        portMemCopy((NvU8*)pCQE->aadBuffer, sizeof(pCQE->aadBuffer), (NvU8 *)&pCQE->seqNum, sizeof(pCQE->seqNum));

        // We need to encrypt the full queue elements to obscure the data.
        nvStatus = ccslEncrypt(pCC->pRpcCcslCtx,
                               (pCQE->elemCount * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN) - GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                               pSrc + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                               (NvU8*)pCQE->aadBuffer,
                               sizeof(pCQE->aadBuffer),
                               pSrc + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                               pCQE->authTagBuffer);

        if (nvStatus != NV_OK)
        {
            // Do not re-try if encryption fails.
            NV_PRINTF(LEVEL_ERROR, "Encryption failed with status = 0x%x.\n", nvStatus);
            if (nvStatus == NV_ERR_INSUFFICIENT_RESOURCES)
            {
                // We hit potential IV overflow, this is fatal.
                NV_PRINTF(LEVEL_ERROR, "Fatal error detected in RPC encrypt: IV overflow!\n");
                confComputeSetErrorState(pGpu, pCC);
            }
            return nvStatus;
        }

        // Now that encryption covers elements completely, include them in checksum.
        pCQE->checkSum = _checkSum32(pSrc, pCQE->elemCount * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
    }
    else
    {
        pCQE->checkSum = _checkSum32(pSrc, uElementSize);
    }

    for (i = 0; i < pCQE->elemCount; i++)
    {
        NvU32 timeoutFlags = 0;

        if (pMQI->txBufferFull)
            timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_JOURNAL_LOG;

        // Set a timeout of 1 sec.
        gpuSetTimeout(pGpu, 1000000, &timeout, timeoutFlags);

        // Wait for space to put the next element.
        while (NV_TRUE)
        {
            // Must get the buffers one at a time, since they could wrap.
            pNextElement = (NvU8 *)msgqTxGetWriteBuffer(pMQI->hQueue, i);

            if (pNextElement != NULL)
                break;

            if (gpuCheckTimeout(pGpu, &timeout) != NV_OK)
                break;

            portAtomicMemoryFenceFull();

            osSpinLoop();
        }

        if (pNextElement == NULL)
        {
            pMQI->txBufferFull++;
            NV_PRINTF_COND(pMQI->txBufferFull == 1, LEVEL_ERROR, LEVEL_INFO,
                           "buffer is full (waiting for %d free elements, got %d)\n",
                           pCQE->elemCount, i);
            nvStatus = NV_ERR_BUSY_RETRY;
            goto done;
        }
        else
        {
            pMQI->txBufferFull = 0;
        }

        portMemCopy(pNextElement, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
                    pSrc, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
        pSrc += GSP_MSG_QUEUE_ELEMENT_SIZE_MIN;
    }

    //
    // If write-after-write (WAW) memory ordering is relaxed in a CPU, then
    // it is possible that the msgq update below reaches memory before the
    // portMemCopy data above. This is an issue for GSP-RM, which would read
    // incorrect data because the msgq was updated first. This is a typical
    // producer-consumer memory ordering problem, hence a store fence is
    // needed here.
    //
    portAtomicMemoryFenceStore();

    nRet = msgqTxSubmitBuffers(pMQI->hQueue, pCQE->elemCount);

    if (nRet != 0)
    {
        NV_PRINTF(LEVEL_ERROR, "msgqTxSubmitBuffers failed: %d\n", nRet);
        nvStatus = NV_ERR_INVALID_STATE;
        goto done;
    }

    // Advance seq num only if we actually used it.
    pMQI->txSeqNum++;

    nvStatus = NV_OK;

done:
    return nvStatus;
}

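//
// Typical flow, as a hedged sketch based only on this file (the real callers
// live in the RPC layer and may differ): the caller builds an RPC message in
// pMQI->pRpcMsgBuf (the rpc member of the staging element), sets rpc.length,
// and calls GspMsgQueueSendCommand(); NV_ERR_BUSY_RETRY means the command
// queue had no free elements and the send can simply be retried.  Replies
// are then polled with GspMsgQueueReceiveStatus(), where
// NV_WARN_NOTHING_TO_DO means the status queue was empty.
//
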
/*!
 * GspMsgQueueReceiveStatus
 *
 * Get a status record from the GSP and move it from the rx queue to our
 * staging area.
 *
 * Returns
 *  NV_OK                       - Record successfully read.
 *  NV_ERR_INVALID_PARAM_STRUCT - Bad record length.
 *  NV_ERR_NOT_READY            - Partial read.
 *  NV_ERR_INVALID_STATE        - Something really bad happened.
 */
NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu)
{
    const NvU8 *pNextElement = NULL;
    NvU8       *pTgt         = (NvU8 *)pMQI->pCmdQueueElement;
    int         nRet;
    NvU32       i;
    NvU32       nRetries;
    NvU32       nMaxRetries  = 3;
    NvU32       nElements    = 1; // Assume record fits in one queue element for now.
    NvU32       uElementSize = 0;
    NvU32       seqMismatchDiff = NV_U32_MAX;
    NV_STATUS   nvStatus     = NV_OK;

    for (nRetries = 0; nRetries < nMaxRetries; nRetries++)
    {
        pTgt      = (NvU8 *)pMQI->pCmdQueueElement;
        nvStatus  = NV_OK;
        nElements = 1; // Assume record fits in one queue element for now.

        for (i = 0; i < nElements; i++)
        {
            // Get the pointer to the next queue element.
            pNextElement = msgqRxGetReadBuffer(pMQI->hQueue, i);
            if (pNextElement == NULL)
            {
                // Early exit if this is the first read and there is no data.
                if (i == 0)
                    return NV_WARN_NOTHING_TO_DO;

                //
                // We already successfully read part of the record, so we are here
                // because the data is in flight (no fence) or the length was wrong.
                //
                NV_PRINTF(LEVEL_ERROR, "Incomplete read.\n");
                nvStatus = NV_ERR_NOT_READY;
                break;
            }

            // Copy the next element to our staging area.
            portMemCopy(pTgt, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN,
                        pNextElement, GSP_MSG_QUEUE_ELEMENT_SIZE_MIN);
            pTgt += GSP_MSG_QUEUE_ELEMENT_SIZE_MIN;

            if (i == 0)
            {
                //
                // Special processing for the first element of the record.
                // Pull out the element count. This adjusts the loop condition.
                //
                nElements = pMQI->pCmdQueueElement->elemCount;
            }
        }

        // Retry if there was an error.
        if (nvStatus != NV_OK)
            continue;

        // Retry if the checksum fails.
        if (gpuIsCCFeatureEnabled(pGpu))
        {
            // In the Confidential Compute scenario, the checksum covers the complete element range.
            if (_checkSum32(pMQI->pCmdQueueElement, (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN)) != 0)
            {
                NV_PRINTF(LEVEL_ERROR, "Bad checksum.\n");
                nvStatus = NV_ERR_INVALID_DATA;
                continue;
            }
        }
        else if (_checkSum32(pMQI->pCmdQueueElement, uElementSize) != 0)
        {
            NV_PRINTF(LEVEL_ERROR, "Bad checksum.\n");
            nvStatus = NV_ERR_INVALID_DATA;
            continue;
        }

        // Retry if the sequence number is wrong.
        if (pMQI->pCmdQueueElement->seqNum != pMQI->rxSeqNum)
        {
            NV_PRINTF(LEVEL_ERROR, "Bad sequence number. Expected %u got %u. Possible memory corruption.\n",
                      pMQI->rxSeqNum, pMQI->pCmdQueueElement->seqNum);

            // If we read an old piece of data, try to ignore it and move on.
            if (pMQI->pCmdQueueElement->seqNum < pMQI->rxSeqNum)
            {
                // Make sure we're converging to the desired pMQI->rxSeqNum.
                if ((pMQI->rxSeqNum - pMQI->pCmdQueueElement->seqNum) < seqMismatchDiff)
                {
                    NV_PRINTF(LEVEL_ERROR, "Attempting recovery: ignoring old package with seqNum=%u of %u elements.\n",
                              pMQI->pCmdQueueElement->seqNum, nElements);

                    seqMismatchDiff = pMQI->rxSeqNum - pMQI->pCmdQueueElement->seqNum;
                    nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
                    if (nRet < 0)
                    {
                        NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
                    }
                    nMaxRetries++;
                }
            }

            nvStatus = NV_ERR_INVALID_DATA;
            continue;
        }

        // We have the whole record, so break out of the retry loop.
        break;
    }

    if (nRetries > 0)
    {
        if (nvStatus == NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Read succeeded with %d retries.\n", nRetries);
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "Read failed after %d retries.\n", nRetries);
            return nvStatus;
        }
    }

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);
        nvStatus = ccslDecrypt(pCC->pRpcCcslCtx,
                               (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN) - GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                               ((NvU8*)pMQI->pCmdQueueElement) + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                               NULL,
                               (NvU8*)pMQI->pCmdQueueElement->aadBuffer,
                               sizeof(pMQI->pCmdQueueElement->aadBuffer),
                               ((NvU8*)pMQI->pCmdQueueElement) + GSP_MSG_QUEUE_ELEMENT_HDR_SIZE,
                               ((NvU8*)pMQI->pCmdQueueElement->authTagBuffer));

        if (nvStatus != NV_OK)
        {
            // Do not re-try if decryption failed. Decryption failure is considered fatal.
            NV_PRINTF(LEVEL_ERROR, "Fatal error detected in RPC decrypt: 0x%x!\n", nvStatus);
            confComputeSetErrorState(pGpu, pCC);
            return nvStatus;
        }
    }

    // Sanity check for the given RPC length.
    uElementSize = GSP_MSG_QUEUE_ELEMENT_HDR_SIZE + pMQI->pCmdQueueElement->rpc.length;

    if ((uElementSize < sizeof(GSP_MSG_QUEUE_ELEMENT)) ||
        (uElementSize > GSP_MSG_QUEUE_ELEMENT_SIZE_MAX))
    {
        // The length is not valid. If we are running without a fence,
        // this could mean that the data is still in flight from the CPU.
        NV_PRINTF(LEVEL_ERROR, "Incorrect length %u\n",
                  pMQI->pCmdQueueElement->rpc.length);
        nvStatus = NV_ERR_INVALID_PARAM_STRUCT;
    }

    if (nvStatus == NV_OK)
    {
        pMQI->rxSeqNum++;

        nRet = msgqRxMarkConsumed(pMQI->hQueue, nElements);
        if (nRet < 0)
        {
            NV_PRINTF(LEVEL_ERROR, "msgqRxMarkConsumed failed: %d\n", nRet);
            nvStatus = NV_ERR_GENERIC;
        }
    }

    return nvStatus;
}