1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 * SPDX-License-Identifier: MIT 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 
22 */ 23 24 #include "resserv/rs_server.h" 25 26 #include "gpu/gsp/kernel_gsp.h" 27 28 #include "kernel/core/thread_state.h" 29 #include "kernel/core/locks.h" 30 #include "kernel/diagnostics/gpu_acct.h" 31 #include "kernel/diagnostics/journal.h" 32 #include "kernel/gpu/fifo/kernel_channel.h" 33 #include "kernel/gpu/gsp/gsp_trace_rats_macro.h" 34 #include "kernel/gpu/intr/engine_idx.h" 35 #include "kernel/gpu/mem_mgr/heap.h" 36 #include "kernel/gpu/mem_mgr/mem_mgr.h" 37 #include "kernel/gpu/mem_sys/kern_mem_sys.h" 38 #include "kernel/gpu/rc/kernel_rc.h" 39 #include "kernel/gpu/nvlink/kernel_nvlink.h" 40 #include "virtualization/hypervisor/hypervisor.h" 41 #include "virtualization/vgpuconfigapi.h" 42 #include "kernel/gpu/disp/kern_disp.h" 43 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h" 44 #include "kernel/gpu/device/device.h" 45 #include "gpu/external_device/external_device.h" 46 #include "kernel/platform/platform_request_handler.h" 47 #include "class/cl2080.h" // NV20_SUBDEVICE_0 48 #include "ctrl/ctrl2080/ctrl2080nvd.h" 49 #include "liblogdecode.h" 50 #include "libelf.h" 51 #include "nverror.h" 52 #include "nvrm_registry.h" 53 #include "nv-firmware.h" 54 #include "nv-firmware-chip-family-select.h" 55 #include "nvtypes.h" 56 #include "nvVer.h" 57 #include "objrpc.h" 58 #include "objtmr.h" 59 #include "os/os.h" 60 #include "rmgspseq.h" 61 #include "sweng/dispsw.h" 62 #include "kernel/gpu/timed_sema.h" 63 #include "vgpu/rpc.h" 64 #include "kernel/gpu/pmu/kern_pmu.h" 65 #include "gpu/perf/kern_perf.h" 66 #include "core/locks.h" 67 #include "kernel/gpu/intr/intr.h" 68 69 #define RPC_STRUCTURES 70 #define RPC_GENERIC_UNION 71 #include "g_rpc-structures.h" 72 #undef RPC_STRUCTURES 73 #undef RPC_GENERIC_UNION 74 75 #define RPC_MESSAGE_STRUCTURES 76 #define RPC_MESSAGE_GENERIC_UNION 77 #include "g_rpc-message-header.h" 78 #undef RPC_MESSAGE_STRUCTURES 79 #undef RPC_MESSAGE_GENERIC_UNION 80 81 #include "gpu/gsp/message_queue_priv.h" 82 83 #include 
"gpu/conf_compute/conf_compute.h"

// Convenience accessor for the RPC message header at the start of the current
// RPC message buffer (requires a local `pRpc` in scope at the expansion site).
#define RPC_HDR  ((rpc_message_header_v*)(pRpc->message_buffer))

//
// Parameters marshalled to the deferred work-item callback
// (_kgspRpcMigCiConfigUpdateCallback) that applies a MIG compute-instance
// config update on behalf of a vGPU plugin.
//
struct MIG_CI_UPDATE_CALLBACK_PARAMS
{
    NvU32  execPartCount;                                  // number of valid entries in execPartId[]
    NvU32  execPartId[NVC637_CTRL_MAX_EXEC_PARTITIONS];    // exec partition IDs to update
    NvU32  gfid;                                           // GFID of the requesting vGPU
    NvBool bDelete;                                        // NV_TRUE to delete, NV_FALSE to create/update
};

//
// RPC_PARAMS defines the rpc_params pointer and initializes it to the correct
// sub-structure.
//
// RPC_PARAMS intentionally assigns the latest version structure to the
// versioned rpc_params pointer. With the -Werror=incompatible-pointer-types
// compiler flag, this checks for mismatched structure versions at compile time.
//
// For example:
//  RPC_PARAMS(free, _v03_00);
// expands to
//  rpc_free_v03_00 *rpc_params = &RPC_HDR->rpc_message_data->free_v;
//
#define RPC_PARAMS(r, v) rpc_##r##v *rpc_params = &RPC_HDR->rpc_message_data->r##_v

// Forward declarations for RPC infrastructure helpers defined later in this file.
static NV_STATUS _kgspInitRpcInfrastructure(OBJGPU *, KernelGsp *);
static void _kgspFreeRpcInfrastructure(OBJGPU *, KernelGsp *);

static NV_STATUS _kgspConstructRpcObject(OBJGPU *, KernelGsp *, MESSAGE_QUEUE_INFO *, OBJRPC **);

static NV_STATUS _kgspRpcSendMessage(OBJGPU *, OBJRPC *);
static NV_STATUS _kgspRpcRecvPoll(OBJGPU *, OBJRPC *, NvU32);
static NV_STATUS _kgspRpcDrainEvents(OBJGPU *, KernelGsp *, NvU32, KernelGspRpcEventHandlerContext);
static void _kgspRpcIncrementTimeoutCountAndRateLimitPrints(OBJGPU *, OBJRPC *);

static NV_STATUS _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);
static void _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static NV_STATUS _kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);
static void _kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static void _kgspStopLogPolling(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static void _kgspFreeBootBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static NV_STATUS
_kgspPrepareGspRmBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw);

static NV_STATUS _kgspCreateSignatureMemdesc(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                             GSP_FIRMWARE *pGspFw);

static NV_STATUS _kgspFwContainerVerifyVersion(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                               const void *pElfData, NvU64 elfDataSize,
                                               const char *pNameInMsg);

static NV_STATUS _kgspFwContainerGetSection(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                            const void *pElfData, NvU64 elfDataSize,
                                            const char *pSectionName,
                                            const void **ppSectionData, NvU64 *pSectionSize);

static NV_STATUS _kgspGetSectionNameForPrefix(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                              char *pSectionNameBuf, NvLength sectionNameBufSize,
                                              const char *pSectionPrefix);

//
// Extract two per-message-type debug values from the RPC message currently in
// pRpc's message buffer, for recording in the RPC history ring
// (see _kgspAddRpcHistoryEntry). Unknown function codes yield 0/0.
//
static void
_kgspGetActiveRpcDebugData
(
    OBJRPC *pRpc,
    NvU32 function,
    NvU64 *data0,
    NvU64 *data1
)
{
    switch (function)
    {
        // Functions (CPU -> GSP)
        case NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL:
        {
            RPC_PARAMS(gsp_rm_control, _v03_00);
            *data0 = rpc_params->cmd;
            *data1 = rpc_params->paramsSize;
            break;
        }
        case NV_VGPU_MSG_FUNCTION_GSP_RM_ALLOC:
        {
            RPC_PARAMS(gsp_rm_alloc, _v03_00);
            *data0 = rpc_params->hClass;
            *data1 = rpc_params->paramsSize;
            break;
        }
        case NV_VGPU_MSG_FUNCTION_FREE:
        {
            RPC_PARAMS(free, _v03_00);
            *data0 = rpc_params->params.hObjectOld;
            *data1 = rpc_params->params.hObjectParent;
            break;
        }

        // Events (CPU <- GSP)
        case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
        {
            RPC_PARAMS(run_cpu_sequencer, _v17_00);
            *data0 = rpc_params->cmdIndex;
            *data1 = rpc_params->bufferSizeDWord;
            break;
        }
        case NV_VGPU_MSG_EVENT_POST_EVENT:
        {
            RPC_PARAMS(post_event, _v17_00);
            *data0 = rpc_params->notifyIndex;
            *data1 = rpc_params->data;
            break;
        }
        case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
        {
            RPC_PARAMS(rc_triggered, _v17_02);
            *data0 = rpc_params->nv2080EngineType;
            *data1 = rpc_params->exceptType;
            break;
        }
        case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
        {
            RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);
            *data0 = rpc_params->gfid;
            *data1 = rpc_params->notifyIndex;
            break;
        }
        case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
        {
            RPC_PARAMS(gsp_lockdown_notice, _v17_00);
            *data0 = rpc_params->bLockdownEngaging;
            *data1 = 0;
            break;
        }
        case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
        {
            RPC_PARAMS(gsp_post_nocat_record, _v01_00);
            const NV2080CtrlNocatJournalInsertRecord *pRecord =
                (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data;
            *data0 = pRecord->recType;
            *data1 = pRecord->errorCode;
            break;
        }

        default:
        {
            // No per-type debug data for this message type.
            *data0 = 0;
            *data1 = 0;
            break;
        }
    }
}

//
// Pre-flight checks before touching the GSP message queue: fail fast if GSP
// has crashed, or if the GPU is in reset, detached/lost, shut down, not at
// full power, or unable to access system memory. Returns NV_OK when it is
// safe to issue the RPC.
//
static NV_STATUS
_kgspRpcSanityCheck(OBJGPU *pGpu, KernelGsp *pKernelGsp, OBJRPC *pRpc)
{
    if (pKernelGsp->bFatalError)
    {
        NV_PRINTF(LEVEL_INFO, "GSP crashed, skipping RPC\n");
        //
        // In case of a fatal GSP error, if there was an outstanding RPC at the
        // time, we should have already printed the error for that, so this is a
        // new RPC call...from now on don't bother printing RPC errors anymore,
        // as it can be too noisy and overrun logs.
        //
        pRpc->bQuietPrints = NV_TRUE;
        return NV_ERR_RESET_REQUIRED;
    }
    if (API_GPU_IN_RESET_SANITY_CHECK(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU in reset, skipping RPC\n");
        return NV_ERR_GPU_IN_FULLCHIP_RESET;
    }
    if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu) ||
        pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
    {
        NV_PRINTF(LEVEL_INFO, "GPU lost, skipping RPC\n");
        return NV_ERR_GPU_IS_LOST;
    }
    if (osIsGpuShutdown(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU shutdown, skipping RPC\n");
        return NV_ERR_GPU_IS_LOST;
    }
    if (!gpuIsGpuFullPowerForPmResume(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU not full power, skipping RPC\n");
        return NV_ERR_GPU_NOT_FULL_POWER;
    }
    if (!gpuCheckSysmemAccess(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU has no sysmem access, skipping RPC\n");
        return NV_ERR_INVALID_ACCESS_TYPE;
    }
    return NV_OK;
}

//
// Record the RPC currently in pRpc's message buffer into the given history
// ring: advance *pCurrent (mod RPC_HISTORY_DEPTH), stamp the start time, and
// capture per-type debug data. ts_end is filled in later by
// _kgspCompleteRpcHistoryEntry.
//
static void
_kgspAddRpcHistoryEntry
(
    OBJRPC *pRpc,
    RpcHistoryEntry *pHistory,
    NvU32 *pCurrent
)
{
    NvU32 func = RPC_HDR->function;
    NvU32 entry;

    entry = *pCurrent = (*pCurrent + 1) % RPC_HISTORY_DEPTH;

    portMemSet(&pHistory[entry], 0, sizeof(pHistory[0]));
    pHistory[entry].function = func;
    pHistory[entry].ts_start = osGetTimestamp();

    _kgspGetActiveRpcDebugData(pRpc, func,
                               &pHistory[entry].data[0],
                               &pHistory[entry].data[1]);
}

//
// Stamp the end time on history entry `current`, and back-fill the same end
// timestamp onto any older started-but-uncompleted entries in the ring.
//
static void
_kgspCompleteRpcHistoryEntry
(
    RpcHistoryEntry *pHistory,
    NvU32 current
)
{
    NvU32 historyIndex;
    NvU32 historyEntry;

    pHistory[current].ts_end = osGetTimestamp();

    //
    // Complete any previous entries that aren't marked complete yet, using the same timestamp
    // (we may not have explicitly waited for them)
    //
    for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
    {
        // Walk backwards from `current` around the ring.
        historyEntry = (current + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
        if (pHistory[historyEntry].ts_start != 0 &&
            pHistory[historyEntry].ts_end   == 0)
        {
            pHistory[historyEntry].ts_end = pHistory[current].ts_end;
        }
    }
}

/*!
 * GSP client RM RPC send routine
 */
static NV_STATUS
_kgspRpcSendMessage
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    NV_STATUS nvStatus;
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    NvU32 gpuMaskUnused;

    // Caller must hold the subdevice GPU group lock.
    NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));

    NV_CHECK_OK_OR_RETURN(LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc));

    nvStatus = GspMsgQueueSendCommand(pRpc->pMessageQueueInfo, pGpu);
    if (nvStatus != NV_OK)
    {
        // Timeouts/busy-retries are rate-limit-counted rather than treated as fatal here.
        if (nvStatus == NV_ERR_TIMEOUT ||
            nvStatus == NV_ERR_BUSY_RETRY)
        {
            _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);
        }
        NV_PRINTF_COND(pRpc->bQuietPrints, LEVEL_INFO, LEVEL_ERROR,
                       "GspMsgQueueSendCommand failed on GPU%d: 0x%x\n",
                       gpuGetInstance(pGpu), nvStatus);
        return nvStatus;
    }

    // Notify GSP that a new command is in the queue.
    kgspSetCmdQueueHead_HAL(pGpu, pKernelGsp, pRpc->pMessageQueueInfo->queueIdx, 0);

    _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcHistory, &pRpc->rpcHistoryCurrent);

    return NV_OK;
}

//
// Event handler: GSP-RM asks CPU-RM to execute a CPU sequencer buffer
// (register programming steps GSP cannot perform itself).
//
static NV_STATUS
_kgspRpcRunCpuSequencer
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(run_cpu_sequencer, _v17_00);
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);

    return kgspExecuteSequencerBuffer(pGpu, pKernelGsp, rpc_params);
}

//
// On an ECC double-bit-error notification, store the failing physical address
// on the heap's pending blacklist (when dynamic page offlining is enabled).
//
static void
_kgspProcessEccNotifier
(
    OBJGPU *pGpu,
    void   *eventData
)
{
    NV_STATUS nvStatus = NV_OK;
    MemoryManager *pMemoryMgr = GPU_GET_MEMORY_MANAGER(pGpu);

    if (pMemoryMgr->bEnableDynamicPageOfflining)
    {
        Nv2080EccDbeNotification *pParams = (Nv2080EccDbeNotification*)eventData;
        // Single-page blacklist: start and end address are the same.
        if ((nvStatus = heapStorePendingBlackList(pGpu, GPU_GET_HEAP(pGpu), pParams->physAddress ,
                                                  pParams->physAddress)) != NV_OK)
        {
            if (nvStatus == NV_ERR_RESET_REQUIRED)
            {
                NV_PRINTF(LEVEL_INFO, "Since we hit the DED on the reserved region, nothing to handle in this code path... \n");
                NV_PRINTF(LEVEL_INFO, "Relying on FBHUB interrupt to kill all the channels and force reset the GPU..\n");
            }
            else
            {
                NV_PRINTF(LEVEL_INFO, "Dynamically blacklisting the DED page offset failed with, status: %x\n", nvStatus);
                DBG_BREAKPOINT();
            }
        }

    }
}

/*!
 * Receive an event notification from GSP-RM.
 *
 * When an event fires in GSP-RM, osNotifyEvent and osEventNotification check
 * whether the event was originally allocated from client-RM. If so, they post
 * it to the event queue and take no further action. Client RM picks up the
 * event here and handles it.
 */
static NV_STATUS
_kgspRpcPostEvent
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(post_event, _v17_00);
    PEVENTNOTIFICATION pNotifyList  = NULL;
    PEVENTNOTIFICATION pNotifyEvent = NULL;
    Event             *pEvent       = NULL;
    NV_STATUS          nvStatus     = NV_OK;

    // Get the notification list that contains this event.
    NV_ASSERT_OR_RETURN(CliGetEventInfo(rpc_params->hClient,
                                        rpc_params->hEvent, &pEvent), NV_ERR_OBJECT_NOT_FOUND);

    if (pEvent->pNotifierShare != NULL)
        pNotifyList = pEvent->pNotifierShare->pEventList;

    NV_ASSERT_OR_RETURN(pNotifyList != NULL, NV_ERR_INVALID_POINTER);

    // ECC DBE events get additional CPU-side processing before notification.
    switch (rpc_params->notifyIndex)
    {
        case NV2080_NOTIFIERS_ECC_DBE:
            _kgspProcessEccNotifier(pGpu, rpc_params->eventData);
            break;
    }

    // Send the event.
    if (rpc_params->bNotifyList)
    {
        // Send notification to all matching events on the list.
        nvStatus = osEventNotificationWithInfo(pGpu, pNotifyList, rpc_params->notifyIndex,
                                               rpc_params->data, rpc_params->info16,
                                               rpc_params->eventData, rpc_params->eventDataSize);
    }
    else
    {
        // Send event to a specific hEvent.  Find hEvent in the notification list.
        for (pNotifyEvent = pNotifyList; pNotifyEvent; pNotifyEvent = pNotifyEvent->Next)
        {
            if (pNotifyEvent->hEvent == rpc_params->hEvent)
            {
                nvStatus = osNotifyEvent(pGpu, pNotifyEvent, 0,
                                         rpc_params->data, rpc_params->status);
                break;
            }
        }
        NV_ASSERT_OR_RETURN(pNotifyEvent != NULL, NV_ERR_OBJECT_NOT_FOUND);
    }

    return nvStatus;
}

/*!
 * Receive RC notification from GSP-RM.
 *
 * RC error handling ("Channel Teardown sequence") is executed in GSP-RM.
 * Client notifications, OS interaction etc happen in CPU-RM (Kernel RM).
 */
static NV_STATUS
_kgspRpcRCTriggered
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(rc_triggered, _v17_02);

    KernelRc      *pKernelRc = GPU_GET_KERNEL_RC(pGpu);
    KernelChannel *pKernelChannel;
    KernelFifo    *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    CHID_MGR      *pChidMgr;
    NvU32          status = NV_OK;
    RM_ENGINE_TYPE rmEngineType = gpuGetRmEngineType(rpc_params->nv2080EngineType);
    NvBool         bIsCcEnabled = NV_FALSE;

    // check if there's a PCI-E error pending either in device status or in AER
    krcCheckBusError_HAL(pGpu, pKernelRc);

    //
    // If we have received a special msg from GSP then ack back immediately
    // that we are done writing notifiers since we would have already processed the
    // other RC msgs that trigger notifier writes before this one.
    //
    if (rpc_params->exceptType == ROBUST_CHANNEL_FAST_PATH_ERROR)
    {
        NV_RM_RPC_ECC_NOTIFIER_WRITE_ACK(pGpu, status);
        NV_ASSERT_OK(status);
        return status;
    }

    status = kfifoGetChidMgrFromType(pGpu, pKernelFifo,
                                     ENGINE_INFO_TYPE_RM_ENGINE_TYPE,
                                     (NvU32)rmEngineType,
                                     &pChidMgr);
    if (status != NV_OK)
        return status;

    pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
                                                  pChidMgr,
                                                  rpc_params->chid);
    NV_CHECK_OR_RETURN(LEVEL_ERROR,
                       pKernelChannel != NULL,
                       NV_ERR_INVALID_CHANNEL);

    // Add the RcDiag records we received from GSP-RM to our system wide journal
    {
        OBJSYS   *pSys  = SYS_GET_INSTANCE();
        Journal  *pRcDB = SYS_GET_RCDB(pSys);
        RmClient *pClient;

        NvU32 recordSize     = rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport);
        NvU32 rcDiagRecStart = pRcDB->RcErrRptNextIdx;
        NvU32 rcDiagRecEnd;
        NvU32 processId      = 0;
        NvU32 owner          = RCDB_RCDIAG_DEFAULT_OWNER;

        pClient = dynamicCast(RES_GET_CLIENT(pKernelChannel), RmClient);
        NV_ASSERT(pClient != NULL);
        if (pClient != NULL)
            processId = pClient->ProcID;

        // The journal buffer holds back-to-back (common header + RcDiag) records.
        for (NvU32 i = 0; i < rpc_params->rcJournalBufferSize / recordSize; i++)
        {
            RmRCCommonJournal_RECORD *pCommonRecord =
                (RmRCCommonJournal_RECORD *)((NvU8*)&rpc_params->rcJournalBuffer + i * recordSize);
            RmRcDiag_RECORD *pRcDiagRecord =
                (RmRcDiag_RECORD *)&pCommonRecord[1];

#if defined(DEBUG)
            NV_PRINTF(LEVEL_INFO, "%d: GPUTag=0x%x CPUTag=0x%llx timestamp=0x%llx stateMask=0x%llx\n",
                      i, pCommonRecord->GPUTag, pCommonRecord->CPUTag, pCommonRecord->timeStamp,
                      pCommonRecord->stateMask);
            NV_PRINTF(LEVEL_INFO, " idx=%d timeStamp=0x%x type=0x%x flags=0x%x count=%d owner=0x%x processId=0x%x\n",
                      pRcDiagRecord->idx, pRcDiagRecord->timeStamp, pRcDiagRecord->type, pRcDiagRecord->flags,
                      pRcDiagRecord->count, pRcDiagRecord->owner, processId);
            for (NvU32 j = 0; j < pRcDiagRecord->count; j++)
            {
                NV_PRINTF(LEVEL_INFO, " %d: offset=0x08%x tag=0x08%x value=0x08%x attribute=0x08%x\n",
                          j, pRcDiagRecord->data[j].offset, pRcDiagRecord->data[j].tag,
                          pRcDiagRecord->data[j].value, pRcDiagRecord->data[j].attribute);
            }
#endif
            if (rcdbAddRcDiagRecFromGsp(pGpu, pRcDB, pCommonRecord, pRcDiagRecord) == NULL)
            {
                NV_PRINTF(LEVEL_WARNING, "Lost RC diagnostic record coming from GPU%d GSP: type=0x%x stateMask=0x%llx\n",
                          gpuGetInstance(pGpu), pRcDiagRecord->type, pCommonRecord->stateMask);
            }
        }

        rcDiagRecEnd = pRcDB->RcErrRptNextIdx - 1;

        // Update records to have the correct PID associated with the channel
        if (rcDiagRecStart != rcDiagRecEnd)
        {
            rcdbUpdateRcDiagRecContext(pRcDB,
                                       rcDiagRecStart,
                                       rcDiagRecEnd,
                                       processId,
                                       owner);
        }
    }

    bIsCcEnabled = gpuIsCCFeatureEnabled(pGpu);

    // With CC enabled, CPU-RM needs to write error notifiers
    if (bIsCcEnabled)
    {
        NV_ASSERT_OK_OR_RETURN(krcErrorSetNotifier(pGpu, pKernelRc,
                                                   pKernelChannel,
                                                   rpc_params->exceptType,
                                                   rmEngineType,
                                                   rpc_params->scope));
    }

    return krcErrorSendEventNotifications_HAL(pGpu, pKernelRc,
        pKernelChannel,
        rmEngineType,                 // unused on kernel side
        rpc_params->exceptType,
        rpc_params->scope,
        rpc_params->partitionAttributionId);
}

/*!
 * Receive Xid notification from GSP-RM
 *
 * Passes Xid errors that are triggered on GSP-RM to nvErrorLog for OS interactions
 * (logging and OS notifications).
 */
static void
_kgspRpcOsErrorLog
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(os_error_log, _v17_00);

    KernelRc      *pKernelRc      = GPU_GET_KERNEL_RC(pGpu);
    KernelChannel *pKernelChannel = NULL;
    KernelFifo    *pKernelFifo    = GPU_GET_KERNEL_FIFO(pGpu);
    CHID_MGR      *pChidMgr;

    // Resolve the faulting channel, if the RPC identifies one.
    if (rpc_params->chid != INVALID_CHID)
    {
        pChidMgr = kfifoGetChidMgr(pGpu, pKernelFifo, rpc_params->runlistId);
        if (pChidMgr != NULL)
        {
            pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
                                                          pChidMgr,
                                                          rpc_params->chid);
        }
    }

    // Stash channel context only for the duration of the nvErrorLog call.
    pKernelRc->pPreviousChannelInError = pKernelChannel;
    nvErrorLog_va(pGpu, rpc_params->exceptType, "%s", rpc_params->errString);
    pKernelRc->pPreviousChannelInError = NULL;
}

/*!
 * Receives RPC events containing periodic perfmon utilization samples, passing them
 * to GPUACCT for processing.
 */
static void
_kgspRpcGpuacctPerfmonUtilSamples
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    GpuAccounting *pGpuAcct = SYS_GET_GPUACCT(pSys);
    GPUACCT_GPU_INSTANCE_INFO *pGpuInstanceInfo = &pGpuAcct->gpuInstanceInfo[pGpu->gpuInstance];
    RPC_PARAMS(gpuacct_perfmon_util_samples, _v1F_0E);

    NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS_v1F_0E *src = &rpc_params->params;
    NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS *dest;
    NvU32 i;

    dest = pGpuInstanceInfo->pSamplesParams;
    if (dest == NULL)
    {
        // This RPC event can be received even when the RM hasn't fully started.
        // For instance, CPU RM can take longer than usual to initialize,
        // but the GSP RM sampling timer (a 1 sec interval) is about to tick.
        // In that case, pSamplesParams can not even be allocated by that time.
        // Ignore this RPC event if pSamplesParams has not been allocated yet.
        // See GPUSWSEC-1543 for more info.
        return;
    }

    // Manually copy the versioned RPC payload into the unversioned params struct.
    portMemSet(dest, 0, sizeof(*dest));
    dest->type    = src->type;
    dest->bufSize = src->bufSize;
    dest->count   = src->count;
    dest->tracker = src->tracker;

    for (i = 0; i < NV2080_CTRL_PERF_GPUMON_SAMPLE_COUNT_PERFMON_UTIL_v1F_0E; i++)
    {
        dest->samples[i].base.timeStamp = src->samples[i].timeStamp;

        dest->samples[i].fb.util         = src->samples[i].fb.util;
        dest->samples[i].fb.procId       = src->samples[i].fb.procId;
        dest->samples[i].fb.subProcessID = src->samples[i].fb.subProcessID;

        dest->samples[i].gr.util         = src->samples[i].gr.util;
        dest->samples[i].gr.procId       = src->samples[i].gr.procId;
        dest->samples[i].gr.subProcessID = src->samples[i].gr.subProcessID;

        dest->samples[i].nvenc.util         = src->samples[i].nvenc.util;
        dest->samples[i].nvenc.procId       = src->samples[i].nvenc.procId;
        dest->samples[i].nvenc.subProcessID = src->samples[i].nvenc.subProcessID;

        dest->samples[i].nvdec.util         = src->samples[i].nvdec.util;
        dest->samples[i].nvdec.procId       = src->samples[i].nvdec.procId;
        dest->samples[i].nvdec.subProcessID = src->samples[i].nvdec.subProcessID;
    }

    gpuacctProcessGpuUtil(pGpuInstanceInfo, &dest->samples[0]);
}

/*!
 * Receives RPC events containing current GPU Boost synchronization limits
 * that should be cached and considered in the GPU Boost algorithm and runs
 * the algorithm.
 */
static void
_kgspRpcPerfGpuBoostSyncLimitsCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    KernelPerf *pKernelPerf = GPU_GET_KERNEL_PERF(pGpu);

    RPC_PARAMS(perf_gpu_boost_sync_limits_callback, _v17_00);

    NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS_v17_00 *src = &rpc_params->params;
    NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS         dest;
    NvU32 i;

    // Copy the versioned RPC payload to the unversioned control params.
    dest.flags       = src->flags;
    dest.bBridgeless = src->bBridgeless;

    for (i = 0; i < NV2080_CTRL_INTERNAL_PERF_SYNC_GPU_BOOST_LIMITS_NUM; i++)
    {
        dest.currLimits[i] = src->currLimits[i];
    }

    kperfDoSyncGpuBoostLimits(pGpu, pKernelPerf, &dest);

}

/*!
 * Receives RPC events containing latest change of bridgeless information
 */
static void
_kgspRpcPerfBridgelessInfoUpdate
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(perf_bridgeless_info_update, _v17_00);

    kPerfGpuBoostSyncBridgelessUpdateInfo(pGpu, rpc_params->bBridgeless);
}

// Event handler: forward an NVLink fault-up interrupt for a link to KernelNvlink.
static void
_kgspRpcNvlinkFaultUpCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_fault_up, _v17_00);

    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    knvlinkHandleFaultUpInterrupt_HAL(pGpu, pKernelNvlink, rpc_params->linkId);
}

// Event handler: dispatch a 256-byte NVLink inband message received by GSP.
static void
_kgspRpcNvlinkInbandReceivedData256Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_256, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_256_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    // NOTE(review): dispatcher call sits inside NV_ASSERT — confirm NV_ASSERT
    // always evaluates its expression in release builds of this codebase.
    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

// Event handler: dispatch a 512-byte NVLink inband message received by GSP.
static void
_kgspRpcNvlinkInbandReceivedData512Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_512, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_512_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

// Event handler: dispatch a 1024-byte NVLink inband message received by GSP.
static void
_kgspRpcNvlinkInbandReceivedData1024Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_1024, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_1024_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

// Event handler: dispatch a 2048-byte NVLink inband message received by GSP.
static void
_kgspRpcNvlinkInbandReceivedData2048Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_2048, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_2048_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

// Event handler: dispatch a 4096-byte NVLink inband message received by GSP.
static void
_kgspRpcNvlinkInbandReceivedData4096Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_4096, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_4096_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

/*!
 * CPU-RM: Receive GPU Degraded status from GSP
 */
static void
_kgspRpcEventIsGpuDegradedCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_is_gpu_degraded, _v17_00);
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
    NV2080_CTRL_NVLINK_IS_GPU_DEGRADED_PARAMS_v17_00 *dest = &rpc_params->params;

    if(dest->bIsGpuDegraded)
    {
        knvlinkSetDegradedMode(pGpu, pKernelNvlink, dest->linkId);
    }
}

// Event handler: start NVLink fatal-error recovery on behalf of GSP-RM.
static void
_kgspRpcNvlinkFatalErrorRecoveryCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
    NV_ASSERT_OK(knvlinkFatalErrorRecovery(pGpu, pKernelNvlink));
}

/*!
 * Receive MMU fault queue notification from GSP-RM.
 *
 * Non-replayable fault handling is split between GSP-RM and the UVM driver.
 * GSP-RM copies designated faults to the UVM driver's shadow buffer,
 * and sends a notification. CPU-RM, in turn, needs to notify the UVM
 * driver (schedule the UVM ISR to be run).
 */
static NV_STATUS
_kgspRpcMMUFaultQueued(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    osQueueMMUFaultHandler(pGpu);

    return NV_OK;
}

//
// Service a simulator escape-read request from GSP: read `count` bytes as
// 32-bit words starting at `index` into the shared sim access buffer, then
// bump the sequence number so GSP can observe completion. Simulation only.
//
static NV_STATUS
_kgspRpcSimRead
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(sim_read, _v1E_01);
    if (IS_SIMULATION(pGpu))
    {
        // One-past-the-end word index for the requested range.
        const NvU32 count = rpc_params->index + (rpc_params->count / sizeof(NvU32));
        NvU32 i;

        KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);

        NV_ASSERT_OR_RETURN(rpc_params->count <= sizeof(pKernelGsp->pSimAccessBuf->data), NV_ERR_BUFFER_TOO_SMALL);

        for (i = rpc_params->index; i < count; i++)
        {
            NvU32 data;
            gpuSimEscapeRead(pGpu, rpc_params->path, i, 4, &data);
            pKernelGsp->pSimAccessBuf->data[i] = data;
        }

        pKernelGsp->pSimAccessBuf->seq++;
        return NV_OK;
    }

    return NV_ERR_NOT_SUPPORTED;
}

//
// Service a simulator escape-write request from GSP, then bump the shared
// sim access buffer sequence number. Simulation only.
//
static NV_STATUS
_kgspRpcSimWrite
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(sim_write, _v1E_01);
    if (IS_SIMULATION(pGpu))
    {
        KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);

        gpuSimEscapeWrite(pGpu, rpc_params->path, rpc_params->index, rpc_params->count, rpc_params->data);
        pKernelGsp->pSimAccessBuf->seq++;
        return NV_OK;
    }

    return NV_ERR_NOT_SUPPORTED;
}

//
// Event handler: release a software-method semaphore and fill the notifier on
// behalf of GSP-RM (display SW engine path).
//
static NV_STATUS
_kgspRpcSemaphoreScheduleCallback(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(semaphore_schedule_callback, _v17_00);
    NV_STATUS status;
    RsClient *pClient;
    Device *pDevice;

    status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
    if (status != NV_OK)
        return status;

    // NOTE(review): the RPC's hEvent field is used as the device handle here —
    // confirm against the semaphore_schedule_callback RPC definition.
    status = deviceGetByHandle(pClient, rpc_params->hEvent, &pDevice);
    if (status != NV_OK)
        return status;

    return dispswReleaseSemaphoreAndNotifierFill(pGpu,
                                                 rpc_params->GPUVA,
                                                 rpc_params->hVASpace,
                                                 rpc_params->ReleaseValue,
                                                 rpc_params->Flags,
                                                 rpc_params->completionStatus,
                                                 pDevice);
}

// Event handler: release a timed semaphore on behalf of GSP-RM.
static NV_STATUS
_kgspRpcTimedSemaphoreRelease(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(timed_semaphore_release, _v01_00);
    NV_STATUS status;
    RsClient *pClient;
    Device *pDevice;

    status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
    if (status != NV_OK)
        return status;

    status = deviceGetByHandle(pClient, rpc_params->hDevice, &pDevice);
    if (status != NV_OK)
        return status;

    return tsemaRelease_HAL(pGpu,
                            rpc_params->semaphoreVA,
                            rpc_params->notifierVA,
                            rpc_params->hVASpace,
                            rpc_params->releaseValue,
                            rpc_params->completionStatus,
                            pDevice);
}


//
// Event handler: route a libos print buffer from a ucode (currently only PMU)
// to the corresponding kernel-side log sink.
//
static NV_STATUS
_kgspRpcUcodeLibosPrint
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(ucode_libos_print, _v1E_08);

    // Check ucodes registered with the libos print mechanism
    switch (rpc_params->ucodeEngDesc)
    {
        case ENG_PMU:
        {
            KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);
            NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelPmu != NULL, NV_ERR_OBJECT_NOT_FOUND);

            kpmuLogBuf(pGpu, pKernelPmu,
                       rpc_params->libosPrintBuf, rpc_params->libosPrintBufSize);

            return NV_OK;
        }
        default:
            NV_ASSERT_FAILED("Attempting to use libos prints with an unsupported ucode!\n");
            return NV_ERR_NOT_SUPPORTED;
    }
}

// Event handler: forward a vGPU GSP plugin event to the GPU's event machinery.
static NV_STATUS
_kgspRpcVgpuGspPluginTriggered
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);

    if (!IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
        return NV_ERR_NOT_SUPPORTED;

    gpuGspPluginTriggeredEvent(pGpu, rpc_params->gfid, rpc_params->notifyIndex);
    return NV_OK;
}

// Event handler: notify vGPU-config clients of a config event from GSP.
static NV_STATUS
_kgspRpcGspVgpuConfig
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(vgpu_config_event, _v17_00);

    // Bounds-check the notifier index before fanning out to clients.
    NV_ASSERT_OR_RETURN(rpc_params->notifyIndex < NVA081_NOTIFIERS_MAXCOUNT,
                        NV_ERR_INVALID_ARGUMENT);

    CliNotifyVgpuConfigEvent(pGpu, rpc_params->notifyIndex);

    return NV_OK;
}

// Event handler: service an external device (gsync) interrupt on GSP's behalf.
static NV_STATUS
_kgspRpcGspExtdevIntrService
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(extdev_intr_service, _v17_00);

    extdevGsyncService(pGpu, rpc_params->lossRegStatus, rpc_params->gainRegStatus, rpc_params->miscRegStatus, rpc_params->rmStatus);

    return NV_OK;
}

//
// Deferred work-item body for _kgspRpcMigCiConfigUpdate: applies the MIG
// compute-instance config update with the API/GPU locks held by the work-item
// framework. pArgs is a MIG_CI_UPDATE_CALLBACK_PARAMS allocated by the caller.
//
static void
_kgspRpcMigCiConfigUpdateCallback
(
    NvU32 gpuInstance,
    void *pArgs
)
{
    OBJGPU *pGpu = gpumgrGetGpu(gpuInstance);
    KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
    struct MIG_CI_UPDATE_CALLBACK_PARAMS * pParams = (struct MIG_CI_UPDATE_CALLBACK_PARAMS *)pArgs;

    kmigmgrUpdateCiConfigForVgpu(pGpu, pKernelMIGManager,
                                 pParams->execPartCount, pParams->execPartId,
                                 pParams->gfid, pParams->bDelete);

    return;
}

//
// Event handler: a vGPU plugin requested a MIG compute-instance config change.
// Validates and copies the RPC payload, then queues a work item (which takes
// API RW + all-GPUs RW locks) to apply it. The work-item framework owns and
// frees pParams on success; on queue failure it is freed here.
//
static NV_STATUS
_kgspRpcMigCiConfigUpdate
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    NV_STATUS status;
    struct MIG_CI_UPDATE_CALLBACK_PARAMS *pParams;

    RPC_PARAMS(vgpu_gsp_mig_ci_config, _v21_03);

    // Bounds-check before the partial array copy below.
    NV_ASSERT_OR_RETURN(rpc_params->execPartCount <= NVC637_CTRL_MAX_EXEC_PARTITIONS,
                        NV_ERR_INVALID_ARGUMENT);

    pParams = portMemAllocNonPaged(sizeof(struct MIG_CI_UPDATE_CALLBACK_PARAMS));
    if (pParams == NULL)
    {
        return NV_ERR_NO_MEMORY;
    }

    pParams->execPartCount = rpc_params->execPartCount;
    portMemCopy(pParams->execPartId, (sizeof(NvU32) * rpc_params->execPartCount),
                rpc_params->execPartId, (sizeof(NvU32) * rpc_params->execPartCount));
    pParams->gfid = rpc_params->gfid;
    pParams->bDelete = rpc_params->bDelete;
    status = osQueueWorkItemWithFlags(pGpu,
                                      _kgspRpcMigCiConfigUpdateCallback,
                                      (void *)pParams,
                                      OS_QUEUE_WORKITEM_FLAGS_LOCK_API_RW | OS_QUEUE_WORKITEM_FLAGS_LOCK_GPUS_RW);
    if (status != NV_OK)
    {
        portMemFree(pParams);
    }

    return status;
}

//
// Event handler: log a batch of GSP trace records into the RATS trace event
// buffer. Compiled out unless GSP RATS tracing is enabled.
//
static void
_kgspRpcGspUpdateTrace
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
#if KERNEL_GSP_TRACING_RATS_ENABLED
    RPC_PARAMS(update_gsp_trace, _v01_00);
    NvU32 i;
    NV_RATS_GSP_TRACE_RECORD *GspTraceRecords = (NV_RATS_GSP_TRACE_RECORD*) (&rpc_params->data);
    for (i = 0; i < rpc_params->records; i++)
    {
        gspTraceEventBufferLogRecord(pGpu, &GspTraceRecords[i]);
    }
#endif
}

//
// Event handler: insert a NOCAT journal record posted by GSP-RM into the
// system-wide journal and bump the RPC-insert activity counter.
//
static void
_kgspRpcGspPostNocatRecord
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    OBJSYS                 *pSys = SYS_GET_INSTANCE();
    Journal                *pRcdb = SYS_GET_RCDB(pSys);
    NOCAT_JOURNAL_PARAMS    newEntry;
    const NV2080CtrlNocatJournalInsertRecord *pRecord = NULL;
    RPC_PARAMS(gsp_post_nocat_record, _v01_00);

    // make a pointer to the record.
1160 pRecord = (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data; 1161 1162 portMemSet(&newEntry, 0, sizeof(newEntry)); 1163 newEntry.timestamp = pRecord->timestamp; 1164 newEntry.recType = pRecord->recType; 1165 newEntry.bugcheck = pRecord->bugcheck; 1166 newEntry.pSource = pRecord->source; 1167 newEntry.subsystem = pRecord->subsystem; 1168 newEntry.errorCode = pRecord->errorCode; 1169 newEntry.diagBufferLen = pRecord->diagBufferLen; 1170 newEntry.pDiagBuffer = pRecord->diagBuffer; 1171 newEntry.pFaultingEngine = pRecord->faultingEngine; 1172 newEntry.tdrReason = pRecord->tdrReason; 1173 1174 (void)rcdbNocatInsertNocatError(pGpu, &newEntry); 1175 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_RPC_INSERT_RECORDS_IDX]++; 1176 } 1177 1178 static NV_STATUS 1179 _kgspRpcRgLineIntr 1180 ( 1181 OBJGPU *pGpu, 1182 OBJRPC *pRpc 1183 ) 1184 { 1185 RPC_PARAMS(rg_line_intr, _v17_00); 1186 1187 KernelDisplay *pKernelDisplay = GPU_GET_KERNEL_DISPLAY(pGpu); 1188 NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelDisplay != NULL, NV_ERR_OBJECT_NOT_FOUND); 1189 1190 kdispInvokeRgLineCallback(pKernelDisplay, rpc_params->head, rpc_params->rgIntr, NV_FALSE); 1191 1192 return NV_OK; 1193 } 1194 1195 static NV_STATUS 1196 _kgspRpcEventPlatformRequestHandlerStateSyncCallback 1197 ( 1198 OBJGPU* pGpu, 1199 OBJRPC* pRpc 1200 ) 1201 { 1202 OBJSYS *pSys = SYS_GET_INSTANCE(); 1203 PlatformRequestHandler* pPlatformRequestHandler 1204 = SYS_GET_PFM_REQ_HNDLR(pSys); 1205 1206 RPC_PARAMS(pfm_req_hndlr_state_sync_callback, _v21_04); 1207 1208 NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS_v21_04 *src = &rpc_params->params; 1209 NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS dst = { 0 }; 1210 1211 dst.flags = src->flags; 1212 dst.syncData.type = src->syncData.type; 1213 1214 // Copy in the rpc data 1215 switch (src->syncData.type) 1216 { 1217 case NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_DATA_TYPE_SMBPBI: 1218 { 1219 
dst.syncData.data.smbpbi.sensorId = 1220 src->syncData.data.smbpbi.sensorId; 1221 dst.syncData.data.smbpbi.limit = 1222 src->syncData.data.smbpbi.limit; 1223 break; 1224 } 1225 default: 1226 { 1227 // Nothing for now 1228 break; 1229 } 1230 } 1231 1232 pfmreqhndlrStateSync(pPlatformRequestHandler, pGpu, &dst); 1233 return NV_OK; 1234 } 1235 1236 static void 1237 _kgspRpcGspLockdownNotice 1238 ( 1239 OBJGPU *pGpu, 1240 OBJRPC *pRpc 1241 ) 1242 { 1243 KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu); 1244 RPC_PARAMS(gsp_lockdown_notice, _v17_00); 1245 1246 // 1247 // While the GSP is in lockdown, we cannot access some of its registers, 1248 // including interrupt status and control. We shouldn't receive any more 1249 // SWGEN0 interrupts while the core is in lockdown. 1250 // 1251 pKernelGsp->bInLockdown = rpc_params->bLockdownEngaging; 1252 1253 NV_PRINTF(LEVEL_INFO, "GSP lockdown %s\n", 1254 pKernelGsp->bInLockdown ? "engaged" : "disengaged"); 1255 } 1256 1257 static 1258 const char *_getRpcName 1259 ( 1260 NvU32 id 1261 ) 1262 { 1263 static const char *rpcName[] = 1264 { 1265 #define X(UNIT, a, VAL) #a, 1266 #define E(a, VAL) #a, 1267 #undef _RPC_GLOBAL_ENUMS_H_ 1268 #include "vgpu/rpc_global_enums.h" 1269 #undef X 1270 #undef E 1271 }; 1272 1273 if (id < NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS) 1274 { 1275 return rpcName[id]; 1276 } 1277 else if ((id > NV_VGPU_MSG_EVENT_FIRST_EVENT) && (id < NV_VGPU_MSG_EVENT_NUM_EVENTS)) 1278 { 1279 NvU32 index = id - (NV_VGPU_MSG_EVENT_FIRST_EVENT - NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS) + 1; 1280 return rpcName[index]; 1281 } 1282 1283 return "Unknown"; 1284 } 1285 1286 /*! 
 * GSP client process RPC events
 */
static void
_kgspProcessRpcEvent
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    KernelGspRpcEventHandlerContext rpcHandlerContext
)
{
    rpc_message_header_v *pMsgHdr = RPC_HDR;
    NV_STATUS nvStatus = NV_OK;
    NvU32 event = pMsgHdr->function;

    NV_PRINTF(LEVEL_INFO, "received event from GPU%d: 0x%x (%s) status: 0x%x size: %d\n",
              gpuGetInstance(pGpu), event, _getRpcName(event), pMsgHdr->rpc_result, pMsgHdr->length);

    // Record the event before dispatch; completed (with timestamp) at 'done'.
    _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcEventHistory, &pRpc->rpcEventHistoryCurrent);

    /*
     * Shortlist of RPC's that have been manually screened to be safe without the API lock
     * that are called during GSP bootup
     */
    if ((rpcHandlerContext == KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP) &&
        (!rmapiLockIsOwner()))
    {
        switch(pMsgHdr->function)
        {
            case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
            case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
            case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
            case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
            case NV_VGPU_MSG_EVENT_GSP_INIT_DONE:
            case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
                break;
            default:
                NV_PRINTF(LEVEL_ERROR, "Attempted to process RPC event from GPU%d: 0x%x (%s) during bootup without API lock\n",
                          gpuGetInstance(pGpu), event, _getRpcName(event));
                NV_ASSERT(0);
                goto done;
        }
    }

    // Dispatch to the per-event handler. Handlers returning void are treated
    // as always-successful here; only status-returning ones set nvStatus.
    switch(event)
    {
        case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
            nvStatus = _kgspRpcRunCpuSequencer(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_POST_EVENT:
            nvStatus = _kgspRpcPostEvent(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
            nvStatus = _kgspRpcRCTriggered(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_MMU_FAULT_QUEUED:
            nvStatus = _kgspRpcMMUFaultQueued(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_SIM_READ:
            nvStatus = _kgspRpcSimRead(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_SIM_WRITE:
            nvStatus = _kgspRpcSimWrite(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
            _kgspRpcOsErrorLog(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GPUACCT_PERFMON_UTIL_SAMPLES:
            _kgspRpcGpuacctPerfmonUtilSamples(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_PERF_GPU_BOOST_SYNC_LIMITS_CALLBACK:
            _kgspRpcPerfGpuBoostSyncLimitsCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_PERF_BRIDGELESS_INFO_UPDATE:
            _kgspRpcPerfBridgelessInfoUpdate(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_SEMAPHORE_SCHEDULE_CALLBACK:
            _kgspRpcSemaphoreScheduleCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_TIMED_SEMAPHORE_RELEASE:
            _kgspRpcTimedSemaphoreRelease(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_FAULT_UP:
            _kgspRpcNvlinkFaultUpCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_256:
            _kgspRpcNvlinkInbandReceivedData256Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_512:
            _kgspRpcNvlinkInbandReceivedData512Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_1024:
            _kgspRpcNvlinkInbandReceivedData1024Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_2048:
            _kgspRpcNvlinkInbandReceivedData2048Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_4096:
            _kgspRpcNvlinkInbandReceivedData4096Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_FATAL_ERROR_RECOVERY:
            _kgspRpcNvlinkFatalErrorRecoveryCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_IS_GPU_DEGRADED :
            _kgspRpcEventIsGpuDegradedCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_RG_LINE_INTR:
            _kgspRpcRgLineIntr(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
            nvStatus = _kgspRpcUcodeLibosPrint(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
            nvStatus = _kgspRpcVgpuGspPluginTriggered(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_VGPU_CONFIG:
            nvStatus = _kgspRpcGspVgpuConfig(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_EXTDEV_INTR_SERVICE:
            nvStatus = _kgspRpcGspExtdevIntrService(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_PFM_REQ_HNDLR_STATE_SYNC_CALLBACK:
            nvStatus = _kgspRpcEventPlatformRequestHandlerStateSyncCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_MIG_CI_CONFIG_UPDATE:
            nvStatus = _kgspRpcMigCiConfigUpdate(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
            _kgspRpcGspLockdownNotice(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_UPDATE_GSP_TRACE:
            _kgspRpcGspUpdateTrace(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
            _kgspRpcGspPostNocatRecord(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GSP_INIT_DONE:   // Handled by _kgspRpcRecvPoll.
        default:
            //
            // Log, but otherwise ignore unexpected events.
            //
            // We will get here if the previous RPC timed out.  The response
            // eventually comes in as an unexpected event.  The error handling
            // for the timeout should have already happened.
            //
            NV_PRINTF(LEVEL_ERROR, "Unexpected RPC event from GPU%d: 0x%x (%s)\n",
                      gpuGetInstance(pGpu), event, _getRpcName(event));
            break;
    }

    if (nvStatus != NV_OK)
    {
        //
        // Failing to properly handle a specific event does not mean we should stop
        // processing events/RPCs, so print the error and soldier on.
        //
        NV_PRINTF(LEVEL_ERROR,
                  "Failed to process received event 0x%x (%s) from GPU%d: status=0x%x\n",
                  event, _getRpcName(event), gpuGetInstance(pGpu), nvStatus);
    }

done:
    _kgspCompleteRpcHistoryEntry(pRpc->rpcEventHistory, pRpc->rpcEventHistoryCurrent);
}

/*!
 * Handle a single RPC event from GSP unless the event is [an RPC return for] expectedFunc,
 * or there are no events available in the buffer.
 *
 * @return
 *   NV_OK                              if the event is successfully handled.
 *   NV_WARN_NOTHING_TO_DO              if there are no events available.
 *   NV_WARN_MORE_PROCESSING_REQUIRED   if the event is expectedFunc: it is unhandled and in the staging area.
 *   (Another status)                   if event reading fails.
 */
static NV_STATUS
_kgspRpcDrainOneEvent
(
    OBJGPU          *pGpu,
    OBJRPC          *pRpc,
    NvU32            expectedFunc,
    KernelGspRpcEventHandlerContext rpcHandlerContext
)
{
    NV_STATUS nvStatus;

    // Issue a memory barrier to ensure we see any queue updates.
    // Note: Without the fence, the CPU may get stuck in an infinite loop
    //       waiting for a message that has already arrived.
    portAtomicMemoryFenceFull();

    nvStatus = GspMsgQueueReceiveStatus(pRpc->pMessageQueueInfo, pGpu);

    if (nvStatus == NV_OK)
    {
        rpc_message_header_v *pMsgHdr = RPC_HDR;

        // The expected synchronous response is left unconsumed for the caller.
        if (pMsgHdr->function == expectedFunc)
            return NV_WARN_MORE_PROCESSING_REQUIRED;

        _kgspProcessRpcEvent(pGpu, pRpc, rpcHandlerContext);
    }

    //
    // We don't expect NV_WARN_MORE_PROCESSING_REQUIRED here.
    // If we get it we need to suppress it to avoid confusing our caller, for whom it has special meaning.
    //
    NV_ASSERT_OR_ELSE(nvStatus != NV_WARN_MORE_PROCESSING_REQUIRED,
                      nvStatus = NV_ERR_GENERIC);

    return nvStatus;
}

/*!
 * Handle RPC events from GSP until the event is [an RPC return for] expectedFunc,
 * or there are no events available in the buffer.
 *
 * Also dump GSP logs, and check for severe errors coming from GSP.
 *
 * @return
 *   NV_OK                              if one or more events are handled and there are none left.
 *   NV_WARN_MORE_PROCESSING_REQUIRED   if an expectedFunc event is found: it is unhandled and in the staging area.
 *                                      (Zero or more preceding events were successfully handled.)
 *   (Another status)                   if event reading or processing fails.
 */
static NV_STATUS
_kgspRpcDrainEvents
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp,
    NvU32      expectedFunc,
    KernelGspRpcEventHandlerContext rpcHandlerContext
)
{
    NV_STATUS nvStatus = NV_OK;
    OBJRPC *pRpc = GPU_GET_RPC(pGpu);

    // Keep draining until the queue is empty, expectedFunc shows up, or an
    // error occurs; flush GSP logs after every event.
    while (nvStatus == NV_OK)
    {
        nvStatus = _kgspRpcDrainOneEvent(pGpu, pRpc, expectedFunc, rpcHandlerContext);
        kgspDumpGspLogs(pKernelGsp, NV_FALSE);
    }

    // If GSP-RM has died, the GPU will need to be reset
    if (!kgspHealthCheck_HAL(pGpu, pKernelGsp))
        return NV_ERR_RESET_REQUIRED;

    // An empty queue is the normal termination condition, not an error.
    if (nvStatus == NV_WARN_NOTHING_TO_DO)
        nvStatus = NV_OK;

    return nvStatus;
}

/*!
 * Convert a timestamp delta (in osGetTimestamp ticks) to a human-readable
 * duration.
 *
 * @param[in]  duration            tick delta to convert
 * @param[out] pDurationUnitsChar  'u' (microseconds), 'm' (milliseconds),
 *                                 or ' ' (seconds; caller appends 's')
 *
 * @return duration scaled to the unit reported via pDurationUnitsChar,
 *         or 0 if the timestamp frequency is unusable.
 */
static NvU64
_tsDiffToDuration
(
    NvU64 duration,
    char *pDurationUnitsChar
)
{
    const NvU64 tsFreqUs = osGetTimestampFreq() / 1000000;

    *pDurationUnitsChar = 'u';

    NV_ASSERT_OR_RETURN(tsFreqUs > 0, 0);

    duration /= tsFreqUs;

    // 999999us then 1000ms
    if (duration >= 1000000)
    {
        duration /= 1000;
        *pDurationUnitsChar = 'm';
    }

    // 9999ms then 10s
    if (duration >= 10000)
    {
        duration /= 1000;
        *pDurationUnitsChar = ' '; // so caller can always just append 's'
    }

    return duration;
}

/*!
 * Check whether a timestamp falls inside any RPC recorded in the history
 * ring buffer.
 *
 * @param[in] pRpc                     RPC object whose history to scan
 * @param[in] timestamp                timestamp to test
 * @param[in] bCheckIncompleteRpcsOnly when NV_TRUE, only match RPCs that have
 *                                     not yet completed (ts_end == 0)
 */
static NvBool
_kgspIsTimestampDuringRecentRpc
(
    OBJRPC *pRpc,
    NvU64 timestamp,
    NvBool bCheckIncompleteRpcsOnly
)
{
    NvU32 historyIndex;
    NvU32 historyEntry;

    // Walk the ring newest-first, starting at rpcHistoryCurrent.
    for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
    {
        historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
        if (pRpc->rpcHistory[historyEntry].function != 0)
        {
            if ((timestamp >= pRpc->rpcHistory[historyEntry].ts_start) &&
                ((pRpc->rpcHistory[historyEntry].ts_end == 0) ||
                 (!bCheckIncompleteRpcsOnly && (timestamp <= pRpc->rpcHistory[historyEntry].ts_end))))
            {
                return NV_TRUE;
            }
        }
    }

    return NV_FALSE;
}

/*!
 * Emit one RPC history entry to the error log. Entries with function == 0
 * (never used) are skipped; entries without a positive duration are printed
 * without the duration column.
 */
static void
_kgspLogRpcHistoryEntry
(
    OBJGPU *pGpu,
    NvU32 errorNum,
    NvU32 historyIndex,
    RpcHistoryEntry *pEntry,
    NvBool lastColumnCondition
)
{
    NvU64 duration;
    char durationUnitsChar;

    if (pEntry->function != 0)
    {
        duration = (pEntry->ts_end > pEntry->ts_start) ? (pEntry->ts_end - pEntry->ts_start) : 0;
        if (duration)
        {
            duration = _tsDiffToDuration(duration, &durationUnitsChar);

            NV_ERROR_LOG_DATA(pGpu, errorNum,
                              "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx %6llu%cs %c\n",
                              ((historyIndex == 0) ? ' ' : '-'),
                              historyIndex,
                              pEntry->function,
                              _getRpcName(pEntry->function),
                              pEntry->data[0],
                              pEntry->data[1],
                              pEntry->ts_start,
                              pEntry->ts_end,
                              duration, durationUnitsChar,
                              (lastColumnCondition ? 'y' : ' '));
        }
        else
        {
            NV_ERROR_LOG_DATA(pGpu, errorNum,
                              "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx %c\n",
                              ((historyIndex == 0) ? ' ' : '-'),
                              historyIndex,
                              pEntry->function,
                              _getRpcName(pEntry->function),
                              pEntry->data[0],
                              pEntry->data[1],
                              pEntry->ts_start,
                              pEntry->ts_end,
                              (lastColumnCondition ? 'y' : ' '));
        }
    }
}

/*!
 * Dump full RPC debug state to the error log: the message currently in the
 * RPC buffer, the CPU->GSP RPC history, and the CPU<-GSP event history.
 */
void
kgspLogRpcDebugInfo
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    NvU32   errorNum,
    NvBool  bPollingForRpcResponse
)
{
    const rpc_message_header_v *pMsgHdr = RPC_HDR;
    NvU32  historyIndex;
    NvU32  historyEntry;
    NvU64  activeData[2];

    _kgspGetActiveRpcDebugData(pRpc, pMsgHdr->function,
                               &activeData[0], &activeData[1]);
    NV_ERROR_LOG_DATA(pGpu, errorNum,
                      "GPU%d GSP RPC buffer contains function %d (%s) and data 0x%016llx 0x%016llx.\n",
                      gpuGetInstance(pGpu),
                      pMsgHdr->function, _getRpcName(pMsgHdr->function),
                      activeData[0], activeData[1]);

    NV_ERROR_LOG_DATA(pGpu, errorNum,
                      "GPU%d RPC history (CPU -> GSP):\n",
                      gpuGetInstance(pGpu));
    NV_ERROR_LOG_DATA(pGpu, errorNum,
                      "    entry function                   data0              data1              ts_start           ts_end             duration actively_polling\n");
    for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
    {
        historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
        _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcHistory[historyEntry],
                                ((historyIndex == 0) && bPollingForRpcResponse));
    }

    NV_ERROR_LOG_DATA(pGpu, errorNum,
                      "GPU%d RPC event history (CPU <- GSP):\n",
                      gpuGetInstance(pGpu));
    NV_ERROR_LOG_DATA(pGpu, errorNum,
                      "    entry function                   data0              data1              ts_start           ts_end             duration during_incomplete_rpc\n");
    for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
    {
        historyEntry = (pRpc->rpcEventHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
        _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcEventHistory[historyEntry],
                                _kgspIsTimestampDuringRecentRpc(pRpc,
                                                                pRpc->rpcEventHistory[historyEntry].ts_start,
                                                                NV_TRUE/*bCheckIncompleteRpcsOnly*/));
    }
}

/*!
 * Log Xid 119 - GSP RPC Timeout
 */
static void
_kgspLogXid119
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    NvU32 expectedFunc
)
{
    RpcHistoryEntry *pHistoryEntry = &pRpc->rpcHistory[pRpc->rpcHistoryCurrent];
    NvU64 ts_end = osGetTimestamp();
    NvU64 duration;
    char durationUnitsChar;

    // Only the first timeout gets the full banner + debug dump; later ones
    // are reduced to the single NV_ERROR_LOG line (see rate limiting below).
    if (pRpc->timeoutCount == 1)
    {
        NV_PRINTF(LEVEL_ERROR,
                  "********************************* GSP Timeout **********************************\n");
        NV_PRINTF(LEVEL_ERROR,
                  "Note: Please also check logs above.\n");
    }

    NV_ASSERT(expectedFunc == pHistoryEntry->function);

    NV_ASSERT(ts_end > pHistoryEntry->ts_start);
    duration = _tsDiffToDuration(ts_end - pHistoryEntry->ts_start, &durationUnitsChar);

    NV_ERROR_LOG(pGpu, GSP_RPC_TIMEOUT,
                 "Timeout after %llus of waiting for RPC response from GPU%d GSP! Expected function %d (%s) (0x%x 0x%x).",
                 (durationUnitsChar == 'm' ? duration / 1000 : duration),
                 gpuGetInstance(pGpu),
                 expectedFunc,
                 _getRpcName(expectedFunc),
                 pHistoryEntry->data[0],
                 pHistoryEntry->data[1]);

    if (pRpc->timeoutCount == 1)
    {
        kgspLogRpcDebugInfo(pGpu, pRpc, GSP_RPC_TIMEOUT, NV_TRUE/*bPollingForRpcResponse*/);

        osAssertFailed();

        NV_PRINTF(LEVEL_ERROR,
                  "********************************************************************************\n");
    }
}

/*!
 * Bump the RPC timeout counter and recompute bQuietPrints, which rate-limits
 * timeout logging to 1 of every (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1)
 * occurrences once the threshold is crossed.
 */
static void
_kgspRpcIncrementTimeoutCountAndRateLimitPrints
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    pRpc->timeoutCount++;

    if ((pRpc->timeoutCount == (RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH + 1)) &&
        (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP > 0))
    {
        // make sure we warn Xid and NV_PRINTF/NVLOG consumers that we are rate limiting prints
        if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)
        {
            portDbgPrintf(
                "NVRM: Rate limiting GSP RPC error prints for GPU at PCI:%04x:%02x:%02x (printing 1 of every %d). The GPU likely needs to be reset.\n",
                gpuGetDomain(pGpu),
                gpuGetBus(pGpu),
                gpuGetDevice(pGpu),
                RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
        }
        NV_PRINTF(LEVEL_WARNING,
                  "Rate limiting GSP RPC error prints (printing 1 of every %d)\n",
                  RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
    }

    pRpc->bQuietPrints = ((pRpc->timeoutCount > RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH) &&
                          ((pRpc->timeoutCount % (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1)) != 0));
}

/*!
 * GSP client RM RPC poll routine
 */
static NV_STATUS
_kgspRpcRecvPoll
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    NvU32   expectedFunc
)
{
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    NV_STATUS  rpcStatus = NV_OK;
    NV_STATUS  timeoutStatus = NV_OK;
    RMTIMEOUT  timeout;
    NvU32      timeoutUs;
    NvU32      timeoutFlags;
    NvBool     bSlowGspRpc = IS_EMULATION(pGpu) || IS_SIMULATION(pGpu);
    NvU32      gpuMaskUnused;

    KernelGspRpcEventHandlerContext rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL;
    if (expectedFunc == NV_VGPU_MSG_EVENT_GSP_INIT_DONE)
    {
        // special case for bootup path without API lock
        rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP;
    }
    //
    // We do not allow recursive polling. This can happen if e.g.
    // 1. CPU-RM issues RPC-A to GSP and polls waiting for it to finish
    // 2. While servicing RPC-A, GSP emits an async event back to CPU-RM
    // 3. CPU-RM services the async event and sends another synchronous RPC-B
    // 4. RPC-A response will come first, but CPU-RM is now waiting on RPC-B
    //
    // We don't have a good way to handle this and should just be deferring the
    // second RPC until the first one is done, via e.g. osQueueWorkItem().
    // This assert is meant to catch and loudly fail such cases.
    //
    NV_ASSERT_OR_RETURN(!pKernelGsp->bPollingForRpcResponse, NV_ERR_INVALID_STATE);
    pKernelGsp->bPollingForRpcResponse = NV_TRUE;

    //
    // GSP-RM init in emulation/simulation environment is extremely slow,
    // so need to increment timeout.
    // Apply the timeout extension to other RPCs as well, mostly so that
    // we'll reset the thread state after each RPC, not just while waiting
    // for the INIT_DONE event.
    //
    if (bSlowGspRpc)
    {
        NvU32 timeoutResult;

        // On slow Apollo emulators, GSP-RM init could take more than an hour
        NV_ASSERT(portSafeMulU32(GSP_SCALE_TIMEOUT_EMU_SIM, 1500000, &timeoutResult));
        timeoutUs = timeoutResult;
    }
    else
    {
        NvU32 defaultus = pGpu->timeoutData.defaultus;

        if (IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
        {
            // Ensure at least 3.1s for vGPU-GSP before adding leeway (Bug 3928607)
            timeoutUs = NV_MAX(3100 * 1000, defaultus) + (defaultus / 2);
        }
        else
        {
            //
            // We should only ever timeout this when GSP is in really bad state, so if it just
            // happens to timeout on default timeout it should be OK for us to give it a little
            // more time - make this timeout 1.5 of the default to allow some leeway.
            //
            timeoutUs = defaultus + defaultus / 2;
        }
    }

    NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));

    timeoutFlags = GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE;
    if (pRpc->bQuietPrints)
        timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_JOURNAL_LOG;

    gpuSetTimeout(pGpu, timeoutUs, &timeout, timeoutFlags);

    for (;;)
    {
        //
        // Check for GPU timeout, save that information, and then verify if the RPC is completed.
        // Otherwise if the CPU thread goes to sleep immediately after the RPC check, it may result in hitting a timeout.
        //
        timeoutStatus = gpuCheckTimeout(pGpu, &timeout);

        rpcStatus = _kgspRpcDrainEvents(pGpu, pKernelGsp, expectedFunc, rpcHandlerContext);

        switch (rpcStatus) {
            case NV_WARN_MORE_PROCESSING_REQUIRED:
                // The synchronous RPC response we were waiting for is here
                _kgspCompleteRpcHistoryEntry(pRpc->rpcHistory, pRpc->rpcHistoryCurrent);
                rpcStatus = NV_OK;
                goto done;
            case NV_OK:
                // Check timeout and continue outer loop.
                break;
            default:
                goto done;
        }

        NV_CHECK_OK_OR_GOTO(rpcStatus, LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc), done);

        if (timeoutStatus == NV_ERR_TIMEOUT)
        {
            rpcStatus = timeoutStatus;

            _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);

            if (!pRpc->bQuietPrints)
            {
                _kgspLogXid119(pGpu, pRpc, expectedFunc);
            }

            goto done;
        }
        else if (timeoutStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "gpuCheckTimeout() returned unexpected error (0x%08x)\n",
                      timeoutStatus);
            rpcStatus = timeoutStatus;
            goto done;
        }

        osSpinLoop();
    }

    // NOTE(review): this statement is unreachable -- the for (;;) loop above
    // exits only via 'goto done', which jumps past it, so timeoutCount is
    // never reset here. Confirm whether the reset was intended to live on the
    // success path (NV_WARN_MORE_PROCESSING_REQUIRED) before moving it, as
    // that would change the print rate-limiting behavior.
    pRpc->timeoutCount = 0;

done:
    pKernelGsp->bPollingForRpcResponse = NV_FALSE;

    if (bSlowGspRpc)
    {
        // Avoid cumulative timeout due to slow RPC
        threadStateResetTimeout(pGpu);
    }

    return rpcStatus;
}

/*!
 * Initialize RPC objects required for interfacing with GSP.
 */
static NV_STATUS
_kgspInitRpcInfrastructure
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp
)
{
    NV_STATUS nvStatus = NV_OK;
    MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;

    nvStatus = GspMsgQueuesInit(pGpu, &pMQCollection);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "GspMsgQueueInit failed\n");
        goto done;
    }

    pKernelGsp->pMQCollection = pMQCollection;

    // Init RM RPC object
    nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
                                       &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX],
                                       &pKernelGsp->pRpc);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "init task RM RPC infrastructure failed\n");
        goto done;
    }

    // Init task_isr RPC object
    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
                                           &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX],
                                           &pKernelGsp->pLocklessRpc);
        if (nvStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "init task ISR RPC infrastructure failed\n");
            goto done;
        }
    }

done:
    if (nvStatus != NV_OK)
    {
        // Partial-failure cleanup tears down everything constructed above.
        _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
    }

    return nvStatus;
}


/*!
 * Initialize stripped down version of RPC infra init for GSP clients.
 */
static NV_STATUS
_kgspConstructRpcObject
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    MESSAGE_QUEUE_INFO *pMQI,
    OBJRPC **ppRpc
)
{
    OBJRPC *pRpc;

    NV_ASSERT_OR_RETURN(pMQI != NULL, NV_ERR_INVALID_ARGUMENT);

    pRpc = initRpcObject(pGpu);
    if (pRpc == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "initRpcObject failed\n");
        return NV_ERR_INSUFFICIENT_RESOURCES;
    }

    pRpc->pMessageQueueInfo = pMQI;

    // History rings start "full" at the last slot so the first entry lands
    // at index 0 after the pre-increment in _kgspAddRpcHistoryEntry.
    portMemSet(&pRpc->rpcHistory, 0, sizeof(pRpc->rpcHistory));
    pRpc->rpcHistoryCurrent = RPC_HISTORY_DEPTH - 1;
    portMemSet(&pRpc->rpcEventHistory, 0, sizeof(pRpc->rpcEventHistory));
    pRpc->rpcEventHistoryCurrent = RPC_HISTORY_DEPTH - 1;

    pRpc->message_buffer = (NvU32 *)pRpc->pMessageQueueInfo->pRpcMsgBuf;
    pRpc->maxRpcSize = GSP_MSG_QUEUE_RPC_SIZE_MAX;

    // Wire this RPC object to the GSP message-queue transport.
    rpcSendMessage_FNPTR(pRpc) = _kgspRpcSendMessage;
    rpcRecvPoll_FNPTR(pRpc) = _kgspRpcRecvPoll;

    *ppRpc = pRpc;

    return NV_OK;
}

/*!
 * Tear down both RPC objects (RM and lockless/ISR) and the message queue
 * collection. Safe to call on a partially constructed state.
 */
static void
_kgspFreeRpcInfrastructure
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    if (pKernelGsp->pRpc != NULL)
    {
        rpcDestroy(pGpu, pKernelGsp->pRpc);
        portMemFree(pKernelGsp->pRpc);
        pKernelGsp->pRpc = NULL;
    }
    if (pKernelGsp->pLocklessRpc != NULL)
    {
        rpcDestroy(pGpu, pKernelGsp->pLocklessRpc);
        portMemFree(pKernelGsp->pLocklessRpc);
        pKernelGsp->pLocklessRpc = NULL;
    }
    GspMsgQueuesCleanup(&pKernelGsp->pMQCollection);
}

/*!
 * Convert init arg name to 64bit id value.
 *
 * @param[in]     name    String representing name of init arg
 */
static NvU64
_kgspGenerateInitArgId(const char *name)
{
    NvU64 id = 0;
    NvU8 c;
    NvU32 i;

    // Convert at most 8 characters from name into id.
    for (i = 0; i < (sizeof(NvU64) / sizeof(NvU8)); ++i)
    {
        c = (NvU8)*name++;
        if (c == '\0')
        {
            break;
        }
        id = (id << 8) | c;
    }

    return id;
}

/*!
 * Unmap and release one LIBOS task log buffer (map + memdesc).
 * Idempotent: already-released fields are left NULL.
 */
static void
_kgspUnmapTaskLogBuf(OBJGPU *pGpu, RM_LIBOS_LOG_MEM *pLog)
{
    // release log memory for this task.
    if (pLog->pTaskLogBuffer != NULL)
    {
        memdescUnmapInternal(pGpu, pLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
        pLog->pTaskLogBuffer = NULL;
    }

    if (pLog->pTaskLogDescriptor != NULL)
    {
        memdescFree(pLog->pTaskLogDescriptor);
        memdescDestroy(pLog->pTaskLogDescriptor);
        pLog->pTaskLogDescriptor = NULL;
    }
}

/*!
 * Free vgpu partition LIBOS task logging structures
 */
static void
_kgspFreeLibosVgpuPartitionLoggingStructures
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 gfid
)
{
    RM_LIBOS_LOG_MEM *vgpuLogBuffers[] =
    {
        pKernelGsp->gspPluginInitTaskLogMem,
        pKernelGsp->gspPluginVgpuTaskLogMem
    };

    // gfid is 1-based here; per-partition arrays are indexed by gfid - 1.
    libosLogDestroy(&pKernelGsp->logDecodeVgpuPartition[gfid - 1]);

    // release all the vgpu tasks' log buffer memory
    for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(vgpuLogBuffers); ++i)
    {
        RM_LIBOS_LOG_MEM *pTaskLog = &vgpuLogBuffers[i][gfid - 1];
        _kgspUnmapTaskLogBuf(pGpu, pTaskLog);
    }
}

/*!
 * Free vgpu partition LIBOS task logging structures
 */
NV_STATUS
kgspFreeVgpuPartitionLogging_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 gfid
)
{
    if (gfid > MAX_PARTITIONS_WITH_GFID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    else
    {
        // Make sure there is no lingering debug output.
        kgspDumpGspLogs(pKernelGsp, NV_FALSE);

        _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);
        return NV_OK;
    }
}

/*!
 * Initialize vgpu partition LIBOS task logging structures
 */
NV_STATUS
kgspInitVgpuPartitionLogging_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 gfid,
    NvU64 initTaskLogBUffOffset,
    NvU64 initTaskLogBUffSize,
    NvU64 vgpuTaskLogBUffOffset,
    NvU64 vgpuTaskLogBuffSize
)
{
    // Table driving per-task log setup: one entry per vGPU plugin task.
    struct
    {
        const char *szMemoryId;
        const char *szPrefix;
        const char *elfSectionName;
        NvU64       bufOffset;
        NvU64       bufSize;
        RM_LIBOS_LOG_MEM *taskLogArr;
    } logInitValues[] =
    {
        {"LOGINIT", "INIT", ".fwlogging_init", initTaskLogBUffOffset, initTaskLogBUffSize, pKernelGsp->gspPluginInitTaskLogMem},
        {"LOGVGPU", "VGPU", ".fwlogging_vgpu", vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pKernelGsp->gspPluginVgpuTaskLogMem}
    };
    ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS);

    NV_STATUS nvStatus = NV_OK;
    RM_LIBOS_LOG_MEM *pTaskLog = NULL;
    char vm_string[8], sourceName[SOURCE_NAME_MAX_LENGTH];

    if (gfid > MAX_PARTITIONS_WITH_GFID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    // Serialize against the nvlog flush callback while (re)building buffers.
    if (pKernelGsp->pNvlogFlushMtx != NULL)
        portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx);

    // Source name is used to generate a tag that is a unique identifier for nvlog buffers.
    // As the source name 'GSP' is already in use, we will need a custom source name.
    nvDbgSnprintf(sourceName, SOURCE_NAME_MAX_LENGTH, "V%02d", gfid);
    libosLogCreateEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], sourceName);

    // Setup logging for each task in vgpu partition
    for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(logInitValues); ++i)
    {
        pTaskLog = &logInitValues[i].taskLogArr[gfid - 1];
        NvP64 pVa = NvP64_NULL;

        NV_ASSERT_OK_OR_GOTO(nvStatus,
            memdescCreate(&pTaskLog->pTaskLogDescriptor,
                          pGpu,
                          logInitValues[i].bufSize,
                          RM_PAGE_SIZE,
                          NV_TRUE, ADDR_FBMEM, NV_MEMORY_CACHED,
                          MEMDESC_FLAGS_NONE),
            error_cleanup);

        // The buffer lives at a fixed FB offset handed to us by the caller.
        memdescDescribe(pTaskLog->pTaskLogDescriptor, ADDR_FBMEM, logInitValues[i].bufOffset, logInitValues[i].bufSize);

        pVa = memdescMapInternal(pGpu, pTaskLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
        if (pVa != NvP64_NULL)
        {
            pTaskLog->pTaskLogBuffer = pVa;
            portMemSet(pTaskLog->pTaskLogBuffer, 0, logInitValues[i].bufSize);

            pTaskLog->id8 = _kgspGenerateInitArgId(logInitValues[i].szMemoryId);

            nvDbgSnprintf(vm_string, sizeof(vm_string), "%s%d", logInitValues[i].szPrefix, gfid);

            libosLogAddLogEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1],
                pTaskLog->pTaskLogBuffer,
                memdescGetSize(pTaskLog->pTaskLogDescriptor),
                pGpu->gpuInstance,
                (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT),
                gpuGetChipImpl(pGpu),
                vm_string,
                logInitValues[i].elfSectionName);
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "Failed to map memory for %s task log buffer for vGPU partition \n", logInitValues[i].szPrefix);
            nvStatus = NV_ERR_INSUFFICIENT_RESOURCES;
            goto error_cleanup;
        }
    }

    {
        libosLogInit(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], pKernelGsp->pLogElf, pKernelGsp->logElfDataSize);
        // nvlog buffers are now setup using the appropriate sourceName to avoid tag-value clash.
        // Now sourceName can be modified to preserve the 'GSP-VGPUx' logging convention.
        portStringCopy(pKernelGsp->logDecodeVgpuPartition[gfid - 1].sourceName,
                       SOURCE_NAME_MAX_LENGTH,
                       "GSP", SOURCE_NAME_MAX_LENGTH);
    }

    pKernelGsp->bHasVgpuLogs = NV_TRUE;

error_cleanup:
    if (pKernelGsp->pNvlogFlushMtx != NULL)
        portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx);

    if (nvStatus != NV_OK)
        _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);

    return nvStatus;
}

/*!
 * nvlog flush callback: dump GSP logs (synchronously) when the nvlog core
 * flushes. pKernelGsp is the KernelGsp registered with the callback.
 */
void kgspNvlogFlushCb(void *pKernelGsp)
{
    if (pKernelGsp != NULL)
        kgspDumpGspLogs((KernelGsp*)pKernelGsp, NV_TRUE);
}

/*!
 * Free LIBOS task logging structures
 */
static void
_kgspFreeLibosLoggingStructures
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NvU8 idx;

    _kgspStopLogPolling(pGpu, pKernelGsp);

    // Make sure there is no lingering debug output.
    kgspDumpGspLogs(pKernelGsp, NV_FALSE);

    if (pKernelGsp->pLogElf == NULL)
        nvlogDeregisterFlushCb(kgspNvlogFlushCb, pKernelGsp);

    if (pKernelGsp->pNvlogFlushMtx != NULL)
    {
        portSyncMutexDestroy(pKernelGsp->pNvlogFlushMtx);
        pKernelGsp->pNvlogFlushMtx = NULL;
    }

    libosLogDestroy(&pKernelGsp->logDecode);

    for (idx = 0; idx < LOGIDX_SIZE; idx++)
    {
        RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx];

        // release log memory for each task.
2319 if (pLog->pTaskLogBuffer != NULL) 2320 { 2321 memdescUnmap(pLog->pTaskLogDescriptor, 2322 NV_TRUE, osGetCurrentProcess(), 2323 (void *)pLog->pTaskLogBuffer, 2324 pLog->pTaskLogMappingPriv); 2325 pLog->pTaskLogBuffer = NULL; 2326 pLog->pTaskLogMappingPriv = NULL; 2327 } 2328 2329 if (pLog->pTaskLogDescriptor != NULL) 2330 { 2331 memdescFree(pLog->pTaskLogDescriptor); 2332 memdescDestroy(pLog->pTaskLogDescriptor); 2333 pLog->pTaskLogDescriptor = NULL; 2334 } 2335 } 2336 2337 portMemFree(pKernelGsp->pLogElf); 2338 pKernelGsp->pLogElf = NULL; 2339 } 2340 2341 /*! 2342 * Initialize LIBOS task logging structures 2343 */ 2344 static NV_STATUS 2345 _kgspInitLibosLoggingStructures 2346 ( 2347 OBJGPU *pGpu, 2348 KernelGsp *pKernelGsp 2349 ) 2350 { 2351 static const struct 2352 { 2353 const char *szMemoryId; 2354 const char *szPrefix; 2355 NvU32 size; 2356 const char *elfSectionName; 2357 } logInitValues[] = 2358 { 2359 {"LOGINIT", "INIT", 0x10000, ".fwlogging_init"}, // 64KB for stack traces 2360 #if defined(DEVELOP) || defined(DEBUG) 2361 // The interrupt task is in the rm elf, so they share the same logging elf too 2362 {"LOGINTR", "INTR", 0x40000, ".fwlogging_rm"}, // 256KB ISR debug log on develop/debug builds 2363 {"LOGRM", "RM", 0x40000, ".fwlogging_rm"} // 256KB RM debug log on develop/debug builds 2364 #else 2365 // The interrupt task is in the rm elf, so they share the same logging elf too 2366 {"LOGINTR", "INTR", 0x10000, ".fwlogging_rm"}, // 64KB ISR debug log on develop/debug builds 2367 {"LOGRM", "RM", 0x10000, ".fwlogging_rm"} // 64KB RM debug log on release builds 2368 #endif 2369 }; 2370 ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS); 2371 ct_assert(NV_ARRAY_ELEMENTS(logInitValues) == LOGIDX_SIZE); 2372 2373 NV_STATUS nvStatus = NV_OK; 2374 NvU8 idx; 2375 NvU64 flags = MEMDESC_FLAGS_NONE; 2376 2377 // Needed only on Unix where NV_ESC_RM_LOCKLESS_DIAGNOSTIC is supported 2378 if (RMCFG_FEATURE_PLATFORM_UNIX) 2379 { 2380 
pKernelGsp->pNvlogFlushMtx = portSyncMutexCreate(portMemAllocatorGetGlobalNonPaged()); 2381 if (pKernelGsp->pNvlogFlushMtx == NULL) 2382 { 2383 nvStatus = NV_ERR_INSUFFICIENT_RESOURCES; 2384 goto error_cleanup; 2385 } 2386 } 2387 2388 libosLogCreate(&pKernelGsp->logDecode); 2389 2390 flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY; 2391 2392 for (idx = 0; idx < LOGIDX_SIZE; idx++) 2393 { 2394 RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx]; 2395 NvP64 pVa = NvP64_NULL; 2396 NvP64 pPriv = NvP64_NULL; 2397 2398 // Setup logging memory for each task. 2399 NV_ASSERT_OK_OR_GOTO(nvStatus, 2400 memdescCreate(&pLog->pTaskLogDescriptor, 2401 pGpu, 2402 logInitValues[idx].size, 2403 RM_PAGE_SIZE, 2404 NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, 2405 flags), 2406 error_cleanup); 2407 2408 memdescTagAlloc(nvStatus, 2409 NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_12, pLog->pTaskLogDescriptor); 2410 NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, 2411 error_cleanup); 2412 2413 NV_ASSERT_OK_OR_GOTO(nvStatus, 2414 memdescMap(pLog->pTaskLogDescriptor, 0, 2415 memdescGetSize(pLog->pTaskLogDescriptor), 2416 NV_TRUE, NV_PROTECT_READ_WRITE, 2417 &pVa, &pPriv), 2418 error_cleanup); 2419 2420 pLog->pTaskLogBuffer = pVa; 2421 pLog->pTaskLogMappingPriv = pPriv; 2422 portMemSet(pLog->pTaskLogBuffer, 0, memdescGetSize(pLog->pTaskLogDescriptor)); 2423 2424 // Pass the PTE table for the log buffer in the log buffer, after the put pointer. 
2425 memdescGetPhysAddrs(pLog->pTaskLogDescriptor, 2426 AT_GPU, 2427 0, 2428 RM_PAGE_SIZE, 2429 NV_CEIL(memdescGetSize(pLog->pTaskLogDescriptor), RM_PAGE_SIZE), 2430 &pLog->pTaskLogBuffer[1]); 2431 2432 pLog->id8 = _kgspGenerateInitArgId(logInitValues[idx].szMemoryId); 2433 2434 libosLogAddLogEx(&pKernelGsp->logDecode, 2435 pLog->pTaskLogBuffer, 2436 memdescGetSize(pLog->pTaskLogDescriptor), 2437 pGpu->gpuInstance, 2438 (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT), 2439 gpuGetChipImpl(pGpu), 2440 logInitValues[idx].szPrefix, 2441 logInitValues[idx].elfSectionName); 2442 } 2443 2444 error_cleanup: 2445 if (nvStatus != NV_OK) 2446 _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp); 2447 2448 return nvStatus; 2449 } 2450 2451 static NV_STATUS 2452 _kgspInitLibosLogDecoder 2453 ( 2454 OBJGPU *pGpu, 2455 KernelGsp *pKernelGsp, 2456 GSP_FIRMWARE *pGspFw 2457 ) 2458 { 2459 // If there's no log ELF or it's already been wired, skip wiring it now 2460 if ((pGspFw->pLogElf == NULL) || (pKernelGsp->pLogElf != NULL)) 2461 return NV_OK; 2462 2463 // Setup symbol decoder 2464 const void *pLogData = NULL; 2465 NvU64 logSize = 0; 2466 2467 NV_ASSERT_OK_OR_RETURN( 2468 _kgspFwContainerVerifyVersion(pGpu, pKernelGsp, 2469 pGspFw->pLogElf, 2470 pGspFw->logElfSize, 2471 "GSP firmware log")); 2472 2473 NV_ASSERT_OK_OR_RETURN( 2474 _kgspFwContainerGetSection(pGpu, pKernelGsp, 2475 pGspFw->pLogElf, 2476 pGspFw->logElfSize, 2477 GSP_LOGGING_SECTION_NAME, 2478 &pLogData, 2479 &logSize)); 2480 2481 pKernelGsp->pLogElf = portMemAllocNonPaged(logSize); 2482 pKernelGsp->logElfDataSize = logSize; 2483 2484 NV_ASSERT_OR_RETURN(pKernelGsp->pLogElf != NULL, NV_ERR_NO_MEMORY); 2485 2486 portMemCopy(pKernelGsp->pLogElf, logSize, pLogData, logSize); 2487 libosLogInit(&pKernelGsp->logDecode, pKernelGsp->pLogElf, logSize); 2488 2489 return NV_OK; 2490 } 2491 2492 static NV_STATUS 2493 _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp) 2494 { 2495 NvP64 pVa = NvP64_NULL; 2496 NvP64 pPriv = 
NvP64_NULL; 2497 NV_STATUS nvStatus; 2498 2499 if (!IS_SIMULATION(pGpu)) 2500 { 2501 pKernelGsp->pMemDesc_simAccessBuf = NULL; 2502 pKernelGsp->pSimAccessBuf = NULL; 2503 pKernelGsp->pSimAccessBufPriv = NULL; 2504 return NV_ERR_NOT_SUPPORTED; 2505 } 2506 2507 NV_ASSERT_OK_OR_GOTO(nvStatus, 2508 memdescCreate(&pKernelGsp->pMemDesc_simAccessBuf, 2509 pGpu, 2510 sizeof(SimAccessBuffer), 2511 RM_PAGE_SIZE, 2512 NV_TRUE, ADDR_SYSMEM, NV_MEMORY_UNCACHED, 2513 MEMDESC_FLAGS_NONE), 2514 error_cleanup); 2515 2516 memdescTagAlloc(nvStatus, 2517 NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_13, pKernelGsp->pMemDesc_simAccessBuf); 2518 NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup); 2519 2520 NV_ASSERT_OK_OR_GOTO(nvStatus, 2521 memdescMap(pKernelGsp->pMemDesc_simAccessBuf, 0, 2522 memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf), 2523 NV_TRUE, NV_PROTECT_READ_WRITE, 2524 &pVa, &pPriv), 2525 error_cleanup); 2526 2527 pKernelGsp->pSimAccessBuf = (SimAccessBuffer*)pVa; 2528 pKernelGsp->pSimAccessBufPriv = pPriv; 2529 2530 portMemSet(pKernelGsp->pSimAccessBuf, 0, memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf)); 2531 2532 error_cleanup: 2533 if (nvStatus != NV_OK) 2534 _kgspFreeSimAccessBuffer(pGpu, pKernelGsp); 2535 2536 return nvStatus; 2537 } 2538 2539 static void 2540 _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp) 2541 { 2542 if (!IS_SIMULATION(pGpu)) 2543 { 2544 return; 2545 } 2546 2547 if (pKernelGsp->pMemDesc_simAccessBuf != NULL) 2548 { 2549 memdescFree(pKernelGsp->pMemDesc_simAccessBuf); 2550 memdescDestroy(pKernelGsp->pMemDesc_simAccessBuf); 2551 } 2552 2553 pKernelGsp->pMemDesc_simAccessBuf = NULL; 2554 pKernelGsp->pSimAccessBuf = NULL; 2555 pKernelGsp->pSimAccessBufPriv = NULL; 2556 } 2557 2558 static NV_STATUS 2559 _kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp) 2560 { 2561 NvP64 pVa = NvP64_NULL; 2562 NvP64 pPriv = NvP64_NULL; 2563 NV_STATUS nvStatus; 2564 NvU64 flags = MEMDESC_FLAGS_NONE; 2565 2566 // 2567 // On systems 
with SEV enabled, the fault buffer flush sequence memory should be allocated 2568 // in unprotected sysmem as GSP will be writing to this location to let the guest 2569 // know a the issued notify op has finished as well as the status of the operation. 2570 // 2571 flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY; 2572 2573 NV_ASSERT_OK_OR_GOTO(nvStatus, 2574 memdescCreate(&pKernelGsp->pNotifyOpSurfMemDesc, 2575 pGpu, 2576 sizeof(NotifyOpSharedSurface), 2577 RM_PAGE_SIZE, 2578 NV_FALSE, ADDR_SYSMEM, NV_MEMORY_UNCACHED, 2579 flags), 2580 error_cleanup); 2581 2582 memdescTagAlloc(nvStatus, 2583 NV_FB_ALLOC_RM_INTERNAL_OWNER_GSP_NOTIFY_OP_SURFACE, pKernelGsp->pNotifyOpSurfMemDesc); 2584 NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup); 2585 2586 NV_ASSERT_OK_OR_GOTO(nvStatus, 2587 memdescMap(pKernelGsp->pNotifyOpSurfMemDesc, 0, 2588 memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc), 2589 NV_TRUE, NV_PROTECT_READ_WRITE, 2590 &pVa, &pPriv), 2591 error_cleanup); 2592 2593 pKernelGsp->pNotifyOpSurf = (NotifyOpSharedSurface*)pVa; 2594 pKernelGsp->pNotifyOpSurfPriv = pPriv; 2595 2596 portMemSet(pKernelGsp->pNotifyOpSurf, 0, memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc)); 2597 2598 error_cleanup: 2599 if (nvStatus != NV_OK) 2600 _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp); 2601 2602 return nvStatus; 2603 } 2604 2605 static void 2606 _kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp) 2607 { 2608 if (pKernelGsp->pNotifyOpSurfMemDesc != NULL) 2609 { 2610 memdescFree(pKernelGsp->pNotifyOpSurfMemDesc); 2611 memdescDestroy(pKernelGsp->pNotifyOpSurfMemDesc); 2612 } 2613 2614 pKernelGsp->pNotifyOpSurfMemDesc = NULL; 2615 pKernelGsp->pNotifyOpSurf = NULL; 2616 pKernelGsp->pNotifyOpSurfPriv = NULL; 2617 } 2618 2619 /*! 
 * Create KernelGsp object and initialize RPC infrastructure
 *
 * @param[in] pGpu        GPU object pointer
 * @param[in] pKernelGsp  KernelGsp object pointer
 * @param[in] engDesc     Engine descriptor (unused here)
 *
 * @return NV_OK on success; NV_ERR_NOT_SUPPORTED when not a GSP client;
 *         appropriate NV_ERR_xxx value on allocation/init failure.
 */
NV_STATUS
kgspConstructEngine_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    ENGDESCRIPTOR engDesc
)
{
    NV_STATUS nvStatus = NV_OK;

    if (!IS_GSP_CLIENT(pGpu))
        return NV_ERR_NOT_SUPPORTED;

    kgspConfigureFalcon_HAL(pGpu, pKernelGsp);

    // Init RPC objects used to communicate with GSP.
    nvStatus = _kgspInitRpcInfrastructure(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "init RPC infrastructure failed\n");
        goto done;
    }

    // Init logging memory used by GSP
    nvStatus = _kgspInitLibosLoggingStructures(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "init libos logging structures failed: 0x%x\n", nvStatus);
        goto done;
    }

    // Clear out the gspStaticInfo. We will populate this once GSP-RM is up.
    portMemSet(&pKernelGsp->gspStaticInfo, 0,
               sizeof(pKernelGsp->gspStaticInfo));

    nvStatus = kgspAllocBootArgs_HAL(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "boot arg alloc failed: 0x%x\n", nvStatus);
        goto done;
    }

    if (IS_SIMULATION(pGpu))
    {
        nvStatus = _kgspAllocSimAccessBuffer(pGpu, pKernelGsp);
        if (nvStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "sim access buffer alloc failed: 0x%x\n", nvStatus);
            goto done;
        }
    }

    nvStatus = _kgspAllocNotifyOpSharedSurface(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "notify operation shared surface alloc failed: 0x%x\n", nvStatus);
        goto done;
    }

#if KERNEL_GSP_TRACING_RATS_ENABLED
    multimapInit(&pGpu->gspTraceEventBufferBindingsUid, portMemAllocatorGetGlobalNonPaged());
#endif

done:
    //
    // On failure, unwind everything allocated above. The free helpers are
    // safe to call on never-allocated state, so no per-step tracking needed.
    //
    if (nvStatus != NV_OK)
    {
        _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
        kgspFreeBootArgs_HAL(pGpu, pKernelGsp);
        _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
        _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
    }

    return nvStatus;
}

/*!
 * Convert VBIOS version containing Version and OemVersion packed together to
 * a string representation.
 *
 * Example:
 *   for Version 0x05400001, OemVersion 0x12
 *   input argument vbiosVersionCombined 0x0540000112
 *   output str "5.40.00.01.12"
 */
static void
_kgspVbiosVersionToStr(NvU64 vbiosVersionCombined, char *pVbiosVersionStr, NvU32 size)
{
    // NOTE(review): the first field uses "%2X" (space-padded), not "%02X" —
    // presumably intentional so a single-digit major prints without a leading
    // zero; confirm against consumers of vbiosVersionStr.
    nvDbgSnprintf(pVbiosVersionStr, size, "%2X.%02X.%02X.%02X.%02X",
                  (vbiosVersionCombined >> 32) & 0xff,
                  (vbiosVersionCombined >> 24) & 0xff,
                  (vbiosVersionCombined >> 16) & 0xff,
                  (vbiosVersionCombined >> 8) & 0xff,
                  (vbiosVersionCombined) & 0xff);
}

/*!
 * Allocate the Scrubber ucode image if the GFW pre-scrubbed region does not
 * cover the whole FB range reserved for GSP.
 */
static NV_STATUS
_kgspPrepareScrubberImageIfNeeded(OBJGPU *pGpu, KernelGsp *pKernelGsp)
{
    // Prepare Scrubber ucode image if pre-scrubbed memory is insufficient
    NvU64 neededSize = pKernelGsp->pWprMeta->fbSize - pKernelGsp->pWprMeta->gspFwRsvdStart;
    NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp);
    NV_PRINTF(LEVEL_INFO, "pre-scrubbed memory: 0x%llx bytes, needed: 0x%llx bytes\n",
              prescrubbedSize, neededSize);

    if (neededSize > prescrubbedSize)
        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            kgspAllocateScrubberUcodeImage(pGpu, pKernelGsp, &pKernelGsp->pScrubberUcode));

    return NV_OK;
}

/*!
 * Prepare and place RPCs in message queue that GSP-RM will process
 * in early boot before OBJGPU is created.
 *
 * @param[in]   pGpu        GPU object pointer
 * @param[in]   pKernelGsp  KernelGsp object pointer
 *
 * @return NV_OK if RPCs queued successfully.
 *         Appropriate NV_ERR_xxx value otherwise.
2742 */ 2743 NV_STATUS 2744 kgspQueueAsyncInitRpcs_IMPL 2745 ( 2746 OBJGPU *pGpu, 2747 KernelGsp *pKernelGsp 2748 ) 2749 { 2750 NV_STATUS status = NV_OK; 2751 2752 NV_RM_RPC_GSP_SET_SYSTEM_INFO(pGpu, status); 2753 if (status != NV_OK) 2754 { 2755 NV_ASSERT_OK_FAILED("NV_RM_RPC_GSP_SET_SYSTEM_INFO", status); 2756 return status; 2757 } 2758 2759 NV_RM_RPC_SET_REGISTRY(pGpu, status); 2760 if (status != NV_OK) 2761 { 2762 NV_ASSERT_OK_FAILED("NV_RM_RPC_SET_REGISTRY", status); 2763 return status; 2764 } 2765 2766 return NV_OK; 2767 } 2768 2769 static NvBool 2770 _kgspShouldRelaxGspInitLocking 2771 ( 2772 OBJGPU *pGpu 2773 ) 2774 { 2775 NvU32 relaxGspInitLockingReg; 2776 2777 if (!RMCFG_FEATURE_PLATFORM_UNIX) 2778 { 2779 return NV_FALSE; 2780 } 2781 2782 if (gpuIsCCFeatureEnabled(pGpu)) 2783 { 2784 return NV_FALSE; 2785 } 2786 2787 if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING, &relaxGspInitLockingReg) != NV_OK) 2788 { 2789 relaxGspInitLockingReg = NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT; 2790 } 2791 2792 // Due to bug 4399629, restrict which platforms have parallel init enabled by default 2793 if (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT) 2794 { 2795 NvU16 devId = (NvU16)(((pGpu->idInfo.PCIDeviceID) >> 16) & 0x0000FFFF); 2796 NvU32 i; 2797 2798 static const NvU16 defaultRelaxGspInitLockingGpus[] = { 2799 0x1EB8, // T4 2800 0x1EB9, // T4 2801 }; 2802 2803 if (IsHOPPER(pGpu) || IsADA(pGpu)) 2804 { 2805 return NV_TRUE; 2806 } 2807 2808 for (i = 0; i < NV_ARRAY_ELEMENTS(defaultRelaxGspInitLockingGpus); i++) 2809 { 2810 if (devId == defaultRelaxGspInitLockingGpus[i]) 2811 { 2812 return NV_TRUE; 2813 } 2814 } 2815 return NV_FALSE; 2816 } 2817 2818 return (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_ENABLE); 2819 return NV_FALSE; 2820 } 2821 2822 static NV_STATUS 2823 _kgspBootReacquireLocks(OBJGPU *pGpu, KernelGsp *pKernelGsp, GPU_MASK *pGpusLockedMask) 2824 { 2825 // 2826 // To follow lock 
order constraints, GPU lock needs to be released before acquiring API lock 2827 // As this path doesn't go through resource server, no client locks should be held at this point. 2828 // Note: we must not hold any client locks when re-acquiring the API per lock ordering 2829 // 2830 rmGpuGroupLockRelease(*pGpusLockedMask, GPUS_LOCK_FLAGS_NONE); 2831 *pGpusLockedMask = 0; 2832 2833 // 2834 // rmapiLockAcquire should never fail on Linux if the API lock and GPU locks are not held. 2835 // Failure to acquire the API lock means the cleanup sequence will skipped since it is 2836 // unsafe without the lock. 2837 // 2838 NV_ASSERT_OK_OR_RETURN(rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT)); 2839 2840 // 2841 // This should never fail on Linux due to locks in the Unix layer. 2842 // This will need to be revisited when parallel init is enabled on other platforms. 2843 // 2844 NV_ASSERT_OR_RETURN(gpumgrIsGpuPointerAttached(pGpu), NV_ERR_INVALID_DEVICE); 2845 2846 // Reqcquire the GPU lock released above. 
2847 NV_ASSERT_OK_OR_RETURN(rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, 2848 GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT, 2849 pGpusLockedMask)); 2850 2851 return NV_OK; 2852 } 2853 2854 static NV_STATUS 2855 _kgspBootGspRm(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw, GPU_MASK *pGpusLockedMask) 2856 { 2857 NV_STATUS status; 2858 2859 // Fail early if WPR2 is up 2860 if (kgspIsWpr2Up_HAL(pGpu, pKernelGsp)) 2861 { 2862 NV_PRINTF(LEVEL_ERROR, "unexpected WPR2 already up, cannot proceed with booting GSP\n"); 2863 NV_PRINTF(LEVEL_ERROR, "(the GPU is likely in a bad state and may need to be reset)\n"); 2864 return NV_ERR_INVALID_STATE; 2865 } 2866 2867 // Calculate FB layout (requires knowing FB size which depends on GFW_BOOT) 2868 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspCalculateFbLayout_HAL(pGpu, pKernelGsp, pGspFw)); 2869 2870 // If the new FB layout requires a scrubber ucode to scrub additional space, prepare it now 2871 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, _kgspPrepareScrubberImageIfNeeded(pGpu, pKernelGsp)); 2872 2873 // Setup arguments for bootstrapping GSP 2874 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspPrepareForBootstrap_HAL(pGpu, pKernelGsp, pGspFw)); 2875 2876 // Release the API lock if relaxed locking for parallel init is enabled 2877 NvBool bRelaxedLocking = _kgspShouldRelaxGspInitLocking(pGpu); 2878 if (bRelaxedLocking) 2879 rmapiLockRelease(); 2880 2881 // Proceed with GSP boot - if it fails, check for ECC errors 2882 status = kgspBootstrap_HAL(pGpu, pKernelGsp, pGspFw); 2883 if ((status != NV_OK) && gpuCheckEccCounts_HAL(pGpu)) 2884 status = NV_ERR_ECC_ERROR; 2885 2886 pKernelGsp->bootAttempts++; 2887 2888 // 2889 // The caller will check that both the API lock and the GPU lock will be held upon return from 2890 // this function, regardless of whether GSP bootstrap succeeded. 
2891 // 2892 if (bRelaxedLocking) 2893 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 2894 _kgspBootReacquireLocks(pGpu, pKernelGsp, pGpusLockedMask)); 2895 2896 return status; 2897 } 2898 2899 /*! 2900 * Initialize GSP-RM 2901 * 2902 * @param[in] pGpu GPU object pointer 2903 * @param[in] pKernelGsp KernelGsp object pointer 2904 * @param[in] pGspFw GSP firmware structure pointer 2905 * 2906 * @return NV_OK if GSP fw RM offload successfully initialized. 2907 * Appropriate NV_ERR_xxx value otherwise. 2908 */ 2909 NV_STATUS 2910 kgspInitRm_IMPL 2911 ( 2912 OBJGPU *pGpu, 2913 KernelGsp *pKernelGsp, 2914 GSP_FIRMWARE *pGspFw 2915 ) 2916 { 2917 NV_STATUS status = NV_OK; 2918 OBJTMR *pTmr = GPU_GET_TIMER(pGpu); 2919 GPU_MASK gpusLockedMask = 0; 2920 2921 if (!IS_GSP_CLIENT(pGpu)) 2922 return NV_OK; 2923 2924 if ((pGspFw == NULL) || (pGspFw->pBuf == NULL) || (pGspFw->size == 0)) 2925 { 2926 NV_PRINTF(LEVEL_ERROR, "need firmware to initialize GSP\n"); 2927 return NV_ERR_INVALID_ARGUMENT; 2928 } 2929 2930 pKernelGsp->bInInit = NV_TRUE; 2931 2932 // Need to hold the GPU instance lock in order to write to the RPC queue 2933 NV_ASSERT_OK_OR_GOTO(status, 2934 rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, 2935 GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT, &gpusLockedMask), 2936 done); 2937 2938 /* 2939 * For GSP-RM boot, we must trigger FRTS (if it exists for the chip) 2940 * before loading GSP-RM so that FRTS data and GSP-RM code/data/heap can coexist 2941 * in WPR2. FRTS is triggered by running a VBIOS-provided ucode called FWSEC. 2942 * 2943 * Here, we extract a VBIOS image from ROM, and parse it for FWSEC. 2944 */ 2945 if (pKernelGsp->pFwsecUcode == NULL) 2946 { 2947 KernelGspVbiosImg *pVbiosImg = NULL; 2948 2949 // Start VBIOS version string as "unknown" 2950 portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown")); 2951 2952 // Try and extract a VBIOS image. 
2953 status = kgspExtractVbiosFromRom_HAL(pGpu, pKernelGsp, &pVbiosImg); 2954 2955 if (status == NV_OK) 2956 { 2957 NvU64 vbiosVersionCombined = 0; 2958 2959 // Got a VBIOS image, now parse it for FWSEC. 2960 status = kgspParseFwsecUcodeFromVbiosImg(pGpu, pKernelGsp, pVbiosImg, 2961 &pKernelGsp->pFwsecUcode, &vbiosVersionCombined); 2962 kgspFreeVbiosImg(pVbiosImg); 2963 2964 if (vbiosVersionCombined > 0) 2965 { 2966 _kgspVbiosVersionToStr(vbiosVersionCombined, pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr)); 2967 } 2968 2969 if (status != NV_OK) 2970 { 2971 NV_PRINTF(LEVEL_ERROR, "failed to parse FWSEC ucode from VBIOS image (VBIOS version %s): 0x%x\n", 2972 pKernelGsp->vbiosVersionStr, status); 2973 goto done; 2974 } 2975 2976 NV_PRINTF(LEVEL_INFO, "parsed VBIOS version %s\n", pKernelGsp->vbiosVersionStr); 2977 } 2978 else if (status == NV_ERR_NOT_SUPPORTED) 2979 { 2980 // Extracting VBIOS image from ROM is not supported. 2981 status = NV_OK; 2982 } 2983 else 2984 { 2985 NV_PRINTF(LEVEL_ERROR, "failed to extract VBIOS image from ROM: 0x%x\n", 2986 status); 2987 goto done; 2988 } 2989 2990 } 2991 2992 /* 2993 * We use a set of Booter ucodes to boot GSP-RM as well as manage its lifecycle. 2994 * 2995 * Booter Load loads, verifies, and boots GSP-RM in WPR2. 2996 * Booter Unload tears down WPR2 for driver unload. 2997 * 2998 * Here we prepare the Booter ucode images in SYSMEM so they may be loaded onto 2999 * SEC2 (Load / Unload) and NVDEC0 (Unload). 3000 */ 3001 if (pKernelGsp->bPartitionedFmc) 3002 { 3003 // 3004 // The secure boot ucode is included in the partitioned FMC, no need for 3005 // separate Booter ucodes. 
3006 // 3007 } 3008 else 3009 { 3010 if (pKernelGsp->pBooterLoadUcode == NULL) 3011 { 3012 status = kgspAllocateBooterLoadUcodeImage(pGpu, pKernelGsp, 3013 &pKernelGsp->pBooterLoadUcode); 3014 if (status != NV_OK) 3015 { 3016 NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Load ucode: 0x%x\n", status); 3017 goto done; 3018 } 3019 } 3020 3021 if (pKernelGsp->pBooterUnloadUcode == NULL) 3022 { 3023 status = kgspAllocateBooterUnloadUcodeImage(pGpu, pKernelGsp, 3024 &pKernelGsp->pBooterUnloadUcode); 3025 if (status != NV_OK) 3026 { 3027 NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Unload ucode: 0x%x\n", status); 3028 goto done; 3029 } 3030 } 3031 } 3032 3033 // Prepare boot binary image. 3034 status = kgspPrepareBootBinaryImage(pGpu, pKernelGsp); 3035 if (status != NV_OK) 3036 { 3037 NV_PRINTF(LEVEL_ERROR, "Error preparing boot binary image\n"); 3038 goto done; 3039 } 3040 3041 // Prepare GSP-RM image. 3042 status = _kgspPrepareGspRmBinaryImage(pGpu, pKernelGsp, pGspFw); 3043 if (status != NV_OK) 3044 { 3045 NV_PRINTF(LEVEL_ERROR, "Error preparing GSP-RM image\n"); 3046 goto done; 3047 } 3048 3049 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, _kgspInitLibosLogDecoder(pGpu, pKernelGsp, pGspFw), done); 3050 3051 // 3052 // Do not register nvlog flush callback if: 3053 // 1. Live decoding is enabled, as logs will be printed to dmesg. 3054 // 2. NV_ESC_RM_LOCKLESS_DIAGNOSTIC is not supported on this platform, i.e. pNvlogFlushMtx=NULL. 
3055 // 3056 if (pKernelGsp->pLogElf == NULL && pKernelGsp->pNvlogFlushMtx != NULL) 3057 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, nvlogRegisterFlushCb(kgspNvlogFlushCb, pKernelGsp), done); 3058 3059 // Reset thread state timeout and wait for GFW_BOOT OK status 3060 threadStateResetTimeout(pGpu); 3061 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspWaitForGfwBootOk_HAL(pGpu, pKernelGsp), done); 3062 3063 // 3064 // Set the GPU time to the wall-clock time after GFW boot is complete 3065 // (to avoid PLM collisions) but before loading GSP-RM ucode (which 3066 // consumes the updated GPU time). 3067 // 3068 tmrSetCurrentTime_HAL(pGpu, pTmr); 3069 3070 // Initialize libos init args list 3071 kgspSetupLibosInitArgs(pGpu, pKernelGsp); 3072 3073 // Fill in the GSP-RM message queue init parameters 3074 kgspPopulateGspRmInitArgs(pGpu, pKernelGsp, NULL); 3075 3076 // 3077 // If ConfCompute is enabled, all RPC traffic must be encrypted. Since we 3078 // can't encrypt until GSP boots and session is established, we must send 3079 // these messages later (kgspBootstrap_HAL) in CC. 3080 // 3081 ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu); 3082 if (pCC == NULL || !pCC->getProperty(pCC, PDB_PROP_CONFCOMPUTE_CC_FEATURE_ENABLED)) 3083 { 3084 // 3085 // Stuff the message queue with async init messages that will be run 3086 // before OBJGPU is created. 3087 // 3088 status = kgspQueueAsyncInitRpcs(pGpu, pKernelGsp); 3089 if (status != NV_OK) 3090 { 3091 goto done; 3092 } 3093 } 3094 3095 // 3096 // Bring up ucode with RM offload task. 3097 // If an ECC error occurs which results in the failure of the bootstrap, try again. 3098 // Subsequent attempts will shift the GSP region of FB in an attempt to avoid the 3099 // unstable memory. 3100 // 3101 const NvU8 MAX_GSP_BOOT_ATTEMPTS = 4; 3102 do 3103 { 3104 // Reset the thread state timeout after failed attempts to prevent premature timeouts. 
3105 if (status != NV_OK) 3106 threadStateResetTimeout(pGpu); 3107 3108 // 3109 // _kgspBootGspRm() will return NV_ERR_ECC_ERROR if any unhandled ECC errors are 3110 // detected during a failed GSP boot attempt. Depending on where and when the 3111 // error occurred, we may not be able to try again, in which case a different 3112 // error code will be returned. 3113 // 3114 status = _kgspBootGspRm(pGpu, pKernelGsp, pGspFw, &gpusLockedMask); 3115 3116 // 3117 // _kgspBootGspRm() may temporarily release locks to facilitate parallel GSP bootstrap on 3118 // other GPUs. It is responsible for reacquiring them in the proper order. If there is a 3119 // failure to reacquire locks, it is unsafe to continue, regardless of the initialization 3120 // status - so we return immediately here, rather attempting cleanup. 3121 // 3122 // Note: _kgspBootGspRm() is structured such that gpusLockedMask will always be 0 (no GPU 3123 // locks held) if the API lock is not held upon return. 3124 // 3125 NV_ASSERT_OR_RETURN(rmapiLockIsOwner() && (gpusLockedMask != 0), 3126 NV_ERR_INVALID_LOCK_STATE); 3127 } while ((status == NV_ERR_ECC_ERROR) && (pKernelGsp->bootAttempts < MAX_GSP_BOOT_ATTEMPTS)); 3128 3129 if (status != NV_OK) 3130 { 3131 if (status == NV_ERR_INSUFFICIENT_POWER) 3132 { 3133 OBJSYS *pSys = SYS_GET_INSTANCE(); 3134 OBJGPUMGR *pGpuMgr = SYS_GET_GPUMGR(pSys); 3135 3136 pGpuMgr->powerDisconnectedGpuBus[pGpuMgr->powerDisconnectedGpuCount++] = gpuGetBus(pGpu); 3137 } 3138 3139 // 3140 // Ignore return value - a crash report may have already been consumed, 3141 // this is just here as a last attempt to report boot issues that might 3142 // have escaped prior checks. 
        //
        (void)kgspHealthCheck_HAL(pGpu, pKernelGsp);
        goto done;
    }

    // at this point we should be able to exchange RPCs with RM offload task
    NV_RM_RPC_SET_GUEST_SYSTEM_INFO(pGpu, status);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "SET_GUEST_SYSTEM_INFO failed: 0x%x\n", status);
        goto done;
    }

    NV_RM_RPC_GET_GSP_STATIC_INFO(pGpu, status);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "GET_GSP_STATIC_INFO failed: 0x%x\n", status);
        goto done;
    }

    // Start the 1 Hz log polling (a no-op build variant exists when libos log decode is disabled)
    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspStartLogPolling(pGpu, pKernelGsp), done);

done:
    // Init is over (successfully or not); log dumps are gated differently from here on
    pKernelGsp->bInInit = NV_FALSE;

    if (status != NV_OK)
    {
        KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);

        // Preserve any captured GSP-RM logs
        libosPreserveLogs(&pKernelGsp->logDecode);

        if (pKernelPmu != NULL)
        {
            // If PMU init fails, kgsp init will also fail
            libosPreserveLogs(&pKernelPmu->logDecode);
        }
    }

    if (gpusLockedMask != 0)
    {
        rmGpuGroupLockRelease(gpusLockedMask, GPUS_LOCK_FLAGS_NONE);
    }

    return status;
}

/*!
 * Unload GSP-RM
 *
 * Sends the UNLOADING_GUEST_DRIVER RPC, waits for the GSP processor to
 * suspend, then invokes the teardown ucodes (FWSEC-SB to restore PreOsApps,
 * and Booter Unload to tear down WPR2 when a partitioned FMC is not in use).
 * A failure of the initial RPC takes precedence over any later teardown
 * failure in the returned status.
 *
 * @param[in] pGpu       OBJGPU pointer
 * @param[in] pKernelGsp KernelGsp pointer
 *
 * @return NV_OK on success; otherwise the RPC or ucode failure status.
 */
NV_STATUS
kgspUnloadRm_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NV_STATUS rpcStatus = NV_OK;
    NV_STATUS status;
    KernelGspPreparedFwsecCmd preparedCmd;

    NV_PRINTF(LEVEL_INFO, "unloading GSP-RM\n");
    NV_RM_RPC_UNLOADING_GUEST_DRIVER(pGpu, rpcStatus, NV_FALSE, NV_FALSE, 0);

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        // FIPS: If CC enabled, we need to confirm GSP-RM was able to teardown CC state.
        kgspCheckGspRmCcCleanup_HAL(pGpu, pKernelGsp);
    }

    // Wait for GSP-RM processor to suspend
    kgspWaitForProcessorSuspend_HAL(pGpu, pKernelGsp);

    // Dump GSP-RM logs and reset before invoking FWSEC-SB
    kgspDumpGspLogs(pKernelGsp, NV_FALSE);

    //
    // Avoid cascading timeouts when attempting to invoke the below ucodes if
    // we are unloading due to a GSP-RM timeout.
    //
    threadStateResetTimeout(pGpu);

    // Because of COT, RM cannot reset GSP-RISCV and FSP has exclusive access to reset and reboot GSP for next run.
    if(!(pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_COT_ENABLED)))
    {
        kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
    }

    // Invoke FWSEC-SB to put back PreOsApps during driver unload
    status = kgspPrepareForFwsecSb_HAL(pGpu, pKernelGsp, pKernelGsp->pFwsecUcode, &preparedCmd);
    if (status == NV_ERR_NOT_SUPPORTED)
    {
        // skip FWSEC-SB during driver unload if unsupported (e.g. on Hopper+)
        status = NV_OK;
    }
    else if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "failed to prepare for FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
        NV_ASSERT(0);
    }
    else
    {
        status = kgspExecuteFwsec_HAL(pGpu, pKernelGsp, &preparedCmd);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "failed to execute FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
            NV_ASSERT(0);
        }
    }

    if (pKernelGsp->bPartitionedFmc)
    {
        //
        // GSP-RM invokes the partitioned FMC to unload directly as part of the
        // NV_RM_RPC_UNLOADING_GUEST_DRIVER call above.
        //
        status = rpcStatus;
    }
    else
    {
        // After instructing GSP-RM to unload itself, run Booter Unload to teardown WPR2
        status = kgspExecuteBooterUnloadIfNeeded_HAL(pGpu, pKernelGsp, 0);
    }

    //
    // To fix boot issue after GPU reset on ESXi config:
    // We still do not have root cause but looks like some sanity is failing during boot after reset is done.
    // As temp WAR, add delay of 250 ms after gsp rm unload is done.
    // Limit this to [VGPU-GSP] supported configs only and when we are in GPU RESET path.
    //
    if (API_GPU_IN_RESET_SANITY_CHECK(pGpu) &&
        gpuIsSriovEnabled(pGpu) &&
        IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
    {
        osDelay(250);
    }

    // A failed UNLOADING_GUEST_DRIVER RPC takes precedence over later teardown errors
    if (rpcStatus != NV_OK)
    {
        return rpcStatus;
    }

    return status;
}

/*!
 * Free RPC infrastructure and KernelGsp object
 */
void
kgspDestruct_IMPL
(
    KernelGsp *pKernelGsp
)
{
    OBJGPU *pGpu = ENG_GET_GPU(pKernelGsp);

    if (!IS_GSP_CLIENT(pGpu))
        return;

    // set VBIOS version string back to "unknown"
    portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown"));

    // Release each cached falcon ucode and clear the pointer so teardown is idempotent
    kgspFreeFlcnUcode(pKernelGsp->pFwsecUcode);
    pKernelGsp->pFwsecUcode = NULL;

    kgspFreeFlcnUcode(pKernelGsp->pBooterLoadUcode);
    pKernelGsp->pBooterLoadUcode = NULL;

    kgspFreeFlcnUcode(pKernelGsp->pBooterUnloadUcode);
    pKernelGsp->pBooterUnloadUcode = NULL;

    kgspFreeFlcnUcode(pKernelGsp->pScrubberUcode);
    pKernelGsp->pScrubberUcode = NULL;

    kgspFreeBootArgs_HAL(pGpu, pKernelGsp);

    _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
    _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
    _kgspFreeBootBinaryImage(pGpu, pKernelGsp);
    _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
    _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp);

kgspFreeSuspendResumeData_HAL(pGpu, pKernelGsp); 3326 3327 #if KERNEL_GSP_TRACING_RATS_ENABLED 3328 multimapDestroy(&pGpu->gspTraceEventBufferBindingsUid); 3329 #endif 3330 } 3331 3332 void 3333 kgspDumpGspLogsUnlocked_IMPL 3334 ( 3335 KernelGsp *pKernelGsp, 3336 NvBool bSyncNvLog 3337 ) 3338 { 3339 if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog) 3340 { 3341 libosExtractLogs(&pKernelGsp->logDecode, bSyncNvLog); 3342 3343 if (pKernelGsp->bHasVgpuLogs) 3344 { 3345 // Dump logs from vGPU partition 3346 for (NvU32 i = 0; i < MAX_PARTITIONS_WITH_GFID; i++) 3347 { 3348 libosExtractLogs(&pKernelGsp->logDecodeVgpuPartition[i], bSyncNvLog); 3349 } 3350 } 3351 } 3352 3353 } 3354 3355 /*! 3356 * Dump logs coming from GSP-RM 3357 * 3358 * @param[in] pKernelGsp KernelGsp pointer 3359 * @param[in] bSyncNvLog NV_TRUE: Copy a snapshot of the libos logs 3360 * into the nvLog wrap buffers. 3361 */ 3362 void 3363 kgspDumpGspLogs_IMPL 3364 ( 3365 KernelGsp *pKernelGsp, 3366 NvBool bSyncNvLog 3367 ) 3368 { 3369 if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog) 3370 { 3371 if (pKernelGsp->pNvlogFlushMtx != NULL) 3372 portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx); 3373 3374 kgspDumpGspLogsUnlocked(pKernelGsp, bSyncNvLog); 3375 3376 if (pKernelGsp->pNvlogFlushMtx != NULL) 3377 portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx); 3378 } 3379 } 3380 3381 /*! 3382 * Populate GSP-RM init arguments. 
3383 */ 3384 void 3385 kgspPopulateGspRmInitArgs_IMPL 3386 ( 3387 OBJGPU *pGpu, 3388 KernelGsp *pKernelGsp, 3389 GSP_SR_INIT_ARGUMENTS *pGspInitArgs 3390 ) 3391 { 3392 GSP_ARGUMENTS_CACHED *pGspArgs = pKernelGsp->pGspArgumentsCached; 3393 MESSAGE_QUEUE_INIT_ARGUMENTS *pMQInitArgs = &pGspArgs->messageQueueInitArguments; 3394 MESSAGE_QUEUE_COLLECTION *pMQCollection = pKernelGsp->pMQCollection; 3395 GSP_SR_INIT_ARGUMENTS *pSrInitArgs = &pGspArgs->srInitArguments; 3396 3397 // Setup the message queue arguments 3398 pMQInitArgs->sharedMemPhysAddr = pMQCollection->sharedMemPA; 3399 pMQInitArgs->pageTableEntryCount = pMQCollection->pageTableEntryCount; 3400 pMQInitArgs->cmdQueueOffset = pMQCollection->pageTableSize; 3401 pMQInitArgs->statQueueOffset = pMQInitArgs->cmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].commandQueueSize; 3402 if (pKernelGsp->bIsTaskIsrQueueRequired) 3403 { 3404 pMQInitArgs->locklessCmdQueueOffset = pMQInitArgs->statQueueOffset + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].statusQueueSize; 3405 pMQInitArgs->locklessStatQueueOffset = pMQInitArgs->locklessCmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX].commandQueueSize; 3406 } 3407 else 3408 { 3409 pMQInitArgs->locklessCmdQueueOffset = 0; 3410 pMQInitArgs->locklessStatQueueOffset = 0; 3411 } 3412 3413 if (pGspInitArgs == NULL) 3414 { 3415 pSrInitArgs->bInPMTransition = NV_FALSE; 3416 pSrInitArgs->oldLevel = 0; 3417 pSrInitArgs->flags = 0; 3418 } 3419 else 3420 { 3421 pSrInitArgs->bInPMTransition = NV_TRUE; 3422 pSrInitArgs->oldLevel = pGspInitArgs->oldLevel; 3423 pSrInitArgs->flags = pGspInitArgs->flags; 3424 } 3425 3426 pGspArgs->gpuInstance = pGpu->gpuInstance; 3427 3428 portMemSet(&pGspArgs->profilerArgs, 0, sizeof(pGspArgs->profilerArgs)); 3429 3430 if (pKernelGsp->pProfilerSamples != NULL && 3431 pKernelGsp->pProfilerSamplesMD != NULL) 3432 { 3433 pGspArgs->profilerArgs.pa = memdescGetPhysAddr(pKernelGsp->pProfilerSamplesMD, AT_GPU, 0); 3434 
pGspArgs->profilerArgs.size = memdescGetSize(pKernelGsp->pProfilerSamplesMD); 3435 } 3436 } 3437 3438 /*! 3439 * Prepare boot binary image for GSP-RM boot. 3440 * 3441 * @return NV_OK if boot binary image prepared successfully. 3442 * Appropriate NV_ERR_xxx value otherwise. 3443 */ 3444 NV_STATUS 3445 kgspPrepareBootBinaryImage_IMPL 3446 ( 3447 OBJGPU *pGpu, 3448 KernelGsp *pKernelGsp 3449 ) 3450 { 3451 NV_STATUS status; 3452 BINDATA_STORAGE *pBinStorageImage; 3453 BINDATA_STORAGE *pBinStorageDesc; 3454 NvU32 bufSize; 3455 NvU32 bufSizeAligned; 3456 RM_RISCV_UCODE_DESC *pDesc = NULL; 3457 NvP64 pVa = NvP64_NULL; 3458 NvP64 pPriv = NvP64_NULL; 3459 NvU64 flags = MEMDESC_FLAGS_NONE; 3460 3461 NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeImage == NULL, NV_ERR_INVALID_STATE); 3462 NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeDesc == NULL, NV_ERR_INVALID_STATE); 3463 3464 // get the bindata storage for the image/descriptor 3465 kgspGetGspRmBootUcodeStorage_HAL(pGpu, pKernelGsp, &pBinStorageImage, &pBinStorageDesc); 3466 3467 // copy the image to sysmem 3468 bufSize = bindataGetBufferSize(pBinStorageImage); 3469 bufSizeAligned = NV_ALIGN_UP(bufSize, 0x1000); 3470 3471 flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY; 3472 3473 NV_ASSERT_OK_OR_GOTO(status, 3474 memdescCreate(&pKernelGsp->pGspRmBootUcodeMemdesc, 3475 pGpu, 3476 bufSizeAligned, 3477 RM_PAGE_SIZE, 3478 NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, 3479 flags), 3480 fail); 3481 3482 memdescTagAlloc(status, 3483 NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_15, pKernelGsp->pGspRmBootUcodeMemdesc); 3484 NV_ASSERT_OK_OR_GOTO(status, status, fail); 3485 3486 NV_ASSERT_OK_OR_GOTO(status, 3487 memdescMap(pKernelGsp->pGspRmBootUcodeMemdesc, 0, 3488 memdescGetSize(pKernelGsp->pGspRmBootUcodeMemdesc), 3489 NV_TRUE, NV_PROTECT_READ_WRITE, 3490 &pVa, &pPriv), 3491 fail); 3492 3493 pKernelGsp->gspRmBootUcodeSize = bufSize; 3494 pKernelGsp->pGspRmBootUcodeImage = (NvU8 *)NvP64_VALUE(pVa);; 3495 
pKernelGsp->pGspRmBootUcodeMemdescPriv = pPriv; 3496 3497 NV_ASSERT_OK_OR_GOTO(status, 3498 bindataWriteToBuffer(pBinStorageImage, 3499 pKernelGsp->pGspRmBootUcodeImage, 3500 bufSize), 3501 fail); 3502 3503 // get the image descriptor 3504 NV_ASSERT_OK_OR_GOTO(status, 3505 bindataStorageAcquireData(pBinStorageDesc, (const void**)&pDesc), 3506 fail); 3507 pKernelGsp->pGspRmBootUcodeDesc = pDesc; 3508 3509 return status; 3510 3511 fail: 3512 _kgspFreeBootBinaryImage(pGpu, pKernelGsp); 3513 return status; 3514 } 3515 3516 static void 3517 _kgspFreeBootBinaryImage 3518 ( 3519 OBJGPU *pGpu, 3520 KernelGsp *pKernelGsp 3521 ) 3522 { 3523 bindataStorageReleaseData(pKernelGsp->pGspRmBootUcodeDesc); 3524 pKernelGsp->pGspRmBootUcodeDesc = NULL; 3525 3526 if (pKernelGsp->pGspRmBootUcodeImage != NULL) 3527 { 3528 memdescUnmap(pKernelGsp->pGspRmBootUcodeMemdesc, 3529 NV_TRUE, osGetCurrentProcess(), 3530 (void *)pKernelGsp->pGspRmBootUcodeImage, 3531 pKernelGsp->pGspRmBootUcodeMemdescPriv); 3532 pKernelGsp->pGspRmBootUcodeImage = NULL; 3533 pKernelGsp->pGspRmBootUcodeMemdescPriv = NULL; 3534 } 3535 if (pKernelGsp->pGspRmBootUcodeMemdesc != NULL) 3536 { 3537 memdescFree(pKernelGsp->pGspRmBootUcodeMemdesc); 3538 memdescDestroy(pKernelGsp->pGspRmBootUcodeMemdesc); 3539 pKernelGsp->pGspRmBootUcodeMemdesc = NULL; 3540 } 3541 3542 pKernelGsp->gspRmBootUcodeSize = 0; 3543 } 3544 3545 static NV_STATUS 3546 _kgspCreateSignatureMemdesc 3547 ( 3548 OBJGPU *pGpu, 3549 KernelGsp *pKernelGsp, 3550 GSP_FIRMWARE *pGspFw 3551 ) 3552 { 3553 NV_STATUS status = NV_OK; 3554 NvU8 *pSignatureVa = NULL; 3555 NvU64 flags = MEMDESC_FLAGS_NONE; 3556 3557 flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY; 3558 3559 // NOTE: align to 256 because that's the alignment needed for Booter DMA 3560 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 3561 memdescCreate(&pKernelGsp->pSignatureMemdesc, pGpu, 3562 NV_ALIGN_UP(pGspFw->signatureSize, 256), 256, 3563 NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags)); 3564 3565 
    memdescTagAlloc(status,
                    NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_16, pKernelGsp->pSignatureMemdesc);
    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, status, fail_create);

    // Map the allocation, copy the signature blob in, then drop the mapping
    pSignatureVa = memdescMapInternal(pGpu, pKernelGsp->pSignatureMemdesc, TRANSFER_FLAGS_NONE);
    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
                        (pSignatureVa != NULL) ? NV_OK : NV_ERR_INSUFFICIENT_RESOURCES,
                        fail_alloc);

    portMemCopy(pSignatureVa, memdescGetSize(pKernelGsp->pSignatureMemdesc),
                pGspFw->pSignatureData, pGspFw->signatureSize);

    memdescUnmapInternal(pGpu, pKernelGsp->pSignatureMemdesc, 0);
    pSignatureVa = NULL;

    return status;

fail_alloc:
    memdescFree(pKernelGsp->pSignatureMemdesc);

fail_create:
    memdescDestroy(pKernelGsp->pSignatureMemdesc);
    pKernelGsp->pSignatureMemdesc = NULL;

    return status;
}

/*!
 * Verify that the version embedded in the .fwversion section of the ELF given
 * by pElfData and elfDataSize matches our NV_VERSION_STRING.
 *
 * @param[in] pGpu        OBJGPU pointer
 * @param[in] pKernelGsp  KernelGsp pointer
 * @param[in] pElfData    Base of the firmware ELF image
 * @param[in] elfDataSize Size of the ELF image in bytes
 * @param[in] pNameInMsg  Human-readable image name used in error messages
 *
 * @return NV_OK if the versions match, NV_ERR_INVALID_DATA otherwise.
 */
static NV_STATUS
_kgspFwContainerVerifyVersion
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    const void *pElfData,
    NvU64 elfDataSize,
    const char *pNameInMsg
)
{
    const char *pFwversion;
    NvU64 fwversionSize;
    NvU64 expectedVersionLength = portStringLength(NV_VERSION_STRING);

    {
        const void *pFwversionRaw;

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            _kgspFwContainerGetSection(pGpu, pKernelGsp,
                                       pElfData,
                                       elfDataSize,
                                       GSP_VERSION_SECTION_NAME,
                                       &pFwversionRaw,
                                       &fwversionSize));

        pFwversion = (const char *) pFwversionRaw;
    }

    // Check that text in .fwversion section of ELF matches our NV_VERSION_STRING
    // (the section is expected to contain the version string plus its NUL terminator)
    if ((fwversionSize != expectedVersionLength + 1) ||
        (portStringCompare(pFwversion, NV_VERSION_STRING, expectedVersionLength) != 0))
    {
        // Sanity check .fwversion before attempting to print it in the error message
        if ((fwversionSize > 0) &&
            (fwversionSize < 64) &&
            (pFwversion[fwversionSize - 1] == '\0'))
        {
            NV_PRINTF(LEVEL_ERROR, "%s version mismatch: got version %s, expected version %s\n",
                      pNameInMsg, pFwversion, NV_VERSION_STRING);
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "%s version unknown or malformed, expected version %s\n",
                      pNameInMsg, NV_VERSION_STRING);
        }
        return NV_ERR_INVALID_DATA;
    }

    return NV_OK;
}

/*!
 * Get the name of the section corresponding to the given section name
 * prefix and the current chip.
 */
static NV_STATUS
_kgspGetSectionNameForPrefix
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    char *pSectionNameBuf, // out
    NvLength sectionNameBufSize,
    const char *pSectionPrefix
)
{
    NvLength sectionPrefixLength;

    nv_firmware_chip_family_t chipFamily;
    const char *pChipFamilyName;
    NvLength chipFamilyNameLength;

    NvLength totalSize;

    NV_ASSERT_OR_RETURN(pSectionNameBuf != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(sectionNameBufSize > 0, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pSectionPrefix != NULL, NV_ERR_INVALID_ARGUMENT);

    chipFamily = nv_firmware_get_chip_family(gpuGetChipArch(pGpu),
                                             gpuGetChipImpl(pGpu));
    NV_ASSERT_OR_RETURN(chipFamily != NV_FIRMWARE_CHIP_FAMILY_NULL,
                        NV_ERR_INVALID_STATE);

    pChipFamilyName = nv_firmware_chip_family_to_string(chipFamily);
    NV_ASSERT_OR_RETURN(pChipFamilyName != NULL, NV_ERR_INVALID_STATE);

    sectionPrefixLength = portStringLength(pSectionPrefix);
    chipFamilyNameLength = portStringLength(pChipFamilyName);

    // prefix + chip family name + NUL terminator must all fit in the caller's buffer
    totalSize = sectionPrefixLength + chipFamilyNameLength + 1;
    NV_ASSERT_OR_RETURN(sectionNameBufSize >= sectionPrefixLength + 1,
                        NV_ERR_BUFFER_TOO_SMALL);
    NV_ASSERT_OR_RETURN(sectionNameBufSize >= totalSize,
                        NV_ERR_BUFFER_TOO_SMALL);

    portStringCopy(pSectionNameBuf, sectionNameBufSize,
                   pSectionPrefix, sectionPrefixLength + 1);
    portStringCat(pSectionNameBuf, sectionNameBufSize,
                  pChipFamilyName, chipFamilyNameLength + 1);

    return NV_OK;
}

/*!
 * Extract the pieces of the GSP firmware container needed to boot GSP-RM:
 * verify the container's embedded version, locate the GSP-RM image and the
 * chip-specific signature section, wrap the signature in a memdesc, and
 * build the radix3 page tables over the GSP-RM image.
 */
static NV_STATUS
_kgspPrepareGspRmBinaryImage
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    GSP_FIRMWARE *pGspFw
)
{
    char signatureSectionName[32];

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspFwContainerVerifyVersion(pGpu, pKernelGsp,
                                      pGspFw->pBuf,
                                      pGspFw->size,
                                      "GSP firmware image"));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspFwContainerGetSection(pGpu, pKernelGsp,
                                   pGspFw->pBuf,
                                   pGspFw->size,
                                   GSP_IMAGE_SECTION_NAME,
                                   &pGspFw->pImageData,
                                   &pGspFw->imageSize));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspGetSectionNameForPrefix(pGpu, pKernelGsp,
                                     signatureSectionName, sizeof(signatureSectionName),
                                     kgspGetSignatureSectionNamePrefix_HAL(pGpu, pKernelGsp)));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspFwContainerGetSection(pGpu, pKernelGsp,
                                   pGspFw->pBuf,
                                   pGspFw->size,
                                   signatureSectionName,
                                   &pGspFw->pSignatureData,
                                   &pGspFw->signatureSize));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspCreateSignatureMemdesc(pGpu, pKernelGsp,
                                    pGspFw));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        kgspCreateRadix3(pGpu, pKernelGsp, &pKernelGsp->pGspUCodeRadix3Descriptor,
                         NULL, pGspFw->pImageData, pGspFw->imageSize));

    return NV_OK;
}

/*!
 * Build a 3-level radix page-table structure (plus optionally the backing
 * data itself) in unprotected sysmem, for consumption by GSP/libos.
 *
 * Exactly one of pMemdescData/pData may be supplied (or neither, in which
 * case size alone describes the data region).
 */
NV_STATUS
kgspCreateRadix3_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    MEMORY_DESCRIPTOR **ppMemdescRadix3,
    MEMORY_DESCRIPTOR *pMemdescData,
    const void *pData,
    NvU64 size
)
{
    const NvU64 entriesLog2 = LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2 - 3;
    NvU8 *pRadix3Buf;
    NvP64 pVaKernel;
    NvP64 pPrivKernel;
    NvU64 ptSize;
    NvU64 allocSize;
    NvU64 nPages = 0;
    NvU64 dataOffset = 0;
    NvU32 i;
    NV_STATUS status = NV_OK;
    NvU64 flags = MEMDESC_FLAGS_KERNEL_MODE;

    // radix3 working array.
    // Levels 0..2 are page-table levels; radix3[3] describes the data pages.
    struct
    {
        NvU64 nPages;
        NvU64 offset;
    } radix3[4];

    NV_ASSERT_OR_RETURN(ppMemdescRadix3 != NULL, NV_ERR_INVALID_PARAMETER);
    NV_ASSERT_OR_ELSE_STR(!((pMemdescData != NULL) && (pData != NULL)),
                          "Specify pMemdescData or pData, or none, but not both",
                          return NV_ERR_INVALID_PARAMETER);

    // If the size is not specified, get it from the memory descriptor.
    if ((size == 0) && (pMemdescData != NULL))
        size = memdescGetSize(pMemdescData);
    NV_ASSERT_OR_RETURN(size > 0, NV_ERR_OUT_OF_RANGE);

    // Clear working structure.
    portMemSet(radix3, 0, sizeof radix3);

    // Populate npages, high to low.
    // radix3[3] covers the data itself; each higher level needs one entry
    // (8 bytes, hence entriesLog2 = PAGE_LOG2 - 3) per page of the level below.
    i = NV_ARRAY_ELEMENTS(radix3) - 1;
    radix3[i].nPages = (size + LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE - 1) >>
                       LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    for (; i > 0; i--)
        radix3[i - 1].nPages = ((radix3[i].nPages - 1) >> entriesLog2) + 1;

    // Populate offset, low to high.
    for (i = 1; i < NV_ARRAY_ELEMENTS(radix3); i++)
    {
        nPages += radix3[i - 1].nPages;
        radix3[i].offset = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    }

    // The root level must collapse to exactly one page
    NV_ASSERT_OR_RETURN(radix3[0].nPages == 1, NV_ERR_OUT_OF_RANGE);

    // Allocate space for PTEs and PDEs.
    ptSize = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    allocSize = ptSize;

    if (pMemdescData == NULL)
    {
        //
        // We don't have a separate descriptor for the data. We need PTEs,
        // so include space for data in the new descriptor.
        //
        allocSize += radix3[3].nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    }

    flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;

    NV_ASSERT_OK_OR_GOTO(status,
        memdescCreate(ppMemdescRadix3, pGpu, allocSize,
                      LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE,
                      NV_MEMORY_NONCONTIGUOUS,
                      ADDR_SYSMEM,
                      NV_MEMORY_CACHED,
                      flags),
        done);

    memdescTagAlloc(status,
                    NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_17, (*ppMemdescRadix3));
    NV_ASSERT_OK_OR_GOTO(status, status, error_ret);

    // Create kernel mapping.
    NV_ASSERT_OK_OR_GOTO(status,
        memdescMap(*ppMemdescRadix3, 0, allocSize, NV_TRUE, NV_PROTECT_WRITEABLE,
                   &pVaKernel, &pPrivKernel),
        error_ret);

    if (pVaKernel == NvP64_NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "VA error for radix3 shared buffer\n");
        status = NV_ERR_NO_MEMORY;
        goto error_ret;
    }

    pRadix3Buf = KERNEL_POINTER_FROM_NvP64(NvU8 *, pVaKernel);

    // Zap out page table.
    portMemSet(pRadix3Buf, 0, ptSize);

    // Fill in PDEs (each level's table holds the physical addresses of the
    // next level's pages).
    for (i = 0; i < NV_ARRAY_ELEMENTS(radix3) - 2; i++)
    {
        memdescGetPhysAddrs(*ppMemdescRadix3,
                            AT_GPU,                     // addressTranslation
                            radix3[i + 1].offset,       // offset
                            RM_PAGE_SIZE,               // stride
                            radix3[i + 1].nPages,       // count
                            (RmPhysAddr *)(pRadix3Buf + radix3[i].offset)); // physical address table
    }

    dataOffset = radix3[3].offset;

    if (pData != NULL)
    {
        // Optionally copy data into the radix3 buffer.
        portMemCopy(pRadix3Buf + dataOffset, size, pData, size);

        // If we only have part of the last page, clear the rest.
        NvU32 clearSize = allocSize - dataOffset - size;
        if (clearSize != 0)
            portMemSet(pRadix3Buf + dataOffset + size, 0, clearSize);

        pMemdescData = *ppMemdescRadix3;
    }

    //
    // Fill the leaf-level PTEs for the data pages.
    // NOTE(review): these PTEs are read from *ppMemdescRadix3 at dataOffset
    // even when the caller supplied a separate pMemdescData; confirm that the
    // external-memdesc path is exercised/correct, since in that case the new
    // descriptor only covers the page tables (allocSize == ptSize).
    //
    memdescGetPhysAddrs(*ppMemdescRadix3,
                        AT_GPU,                     // addressTranslation
                        dataOffset,                 // offset
                        RM_PAGE_SIZE,               // stride
                        radix3[3].nPages,           // count
                        (RmPhysAddr *)(pRadix3Buf + radix3[2].offset)); // physical address table

    //
    // No reason to keep this memory mapped on the CPU side. Only GSP will
    // access it after this point.
3885 // 3886 memdescUnmap(*ppMemdescRadix3, NV_TRUE, osGetCurrentProcess(), 3887 pVaKernel, pPrivKernel); 3888 done: 3889 return status; 3890 3891 error_ret: 3892 if (*ppMemdescRadix3 != NULL) 3893 { 3894 memdescFree(*ppMemdescRadix3); 3895 memdescDestroy(*ppMemdescRadix3); 3896 *ppMemdescRadix3 = NULL; 3897 } 3898 3899 return status; 3900 } 3901 3902 static NV_STATUS 3903 _kgspFwContainerGetSection 3904 ( 3905 OBJGPU *pGpu, 3906 KernelGsp *pKernelGsp, 3907 const void *pElfData, 3908 NvU64 elfDataSize, 3909 const char *pSectionName, 3910 const void **ppSectionData, 3911 NvU64 *pSectionSize 3912 ) 3913 { 3914 const NvU8 *pGspBuf = pElfData; 3915 const LibosElf64Header *pElfHeader; 3916 const LibosElf64SectionHeader *pElfSectionHeader; 3917 NvU64 elfSectionHeaderTableLength; 3918 NvU64 elfSectionHeaderMaxIdx; 3919 NvU64 elfSectionNamesTableOffset; 3920 NvU64 elfSectionNamesTableSize; 3921 NvU64 elfSectionNamesTableMaxIdx; 3922 static const NvU32 elfMagicNumber = 0x464C457F; 3923 static const NvU8 elfClass64 = 0x2; 3924 static const NvU8 elfLittleEndian = 0x1; 3925 const char *pCurrentSectionName; 3926 NvLength sectionNameLength; 3927 NvS16 idx; 3928 3929 NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfData != NULL, NV_ERR_INVALID_ARGUMENT); 3930 NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize > 0, NV_ERR_INVALID_ARGUMENT); 3931 NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionName != NULL, NV_ERR_INVALID_ARGUMENT); 3932 NV_CHECK_OR_RETURN(LEVEL_ERROR, ppSectionData != NULL, NV_ERR_INVALID_ARGUMENT); 3933 NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionSize != NULL, NV_ERR_INVALID_ARGUMENT); 3934 NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= sizeof(LibosElf64Header), NV_ERR_INVALID_DATA); 3935 3936 sectionNameLength = portStringLength(pSectionName); 3937 3938 pElfHeader = (const LibosElf64Header*) pGspBuf; 3939 3940 // Check for the elf identifier at the beginning of the file 3941 NV_CHECK_OR_RETURN(LEVEL_ERROR, *(NvU32*)&pElfHeader->ident == elfMagicNumber, NV_ERR_INVALID_DATA); 3942 // Make sure 
the data is formatted as little endian 3943 NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[5] == elfLittleEndian, NV_ERR_INVALID_DATA); 3944 // Check the class type, only ELFCLASS64 is supported 3945 NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[4] == elfClass64, NV_ERR_INVALID_DATA); 3946 3947 // Make sure that the elf section header table is valid 3948 NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shentsize == sizeof(LibosElf64SectionHeader), NV_ERR_INVALID_DATA); 3949 NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeMulU64(pElfHeader->shentsize, pElfHeader->shnum, &elfSectionHeaderTableLength), NV_ERR_INVALID_DATA); 3950 NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfHeader->shoff, elfSectionHeaderTableLength - 1, &elfSectionHeaderMaxIdx), NV_ERR_INVALID_DATA); 3951 NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionHeaderMaxIdx, NV_ERR_INVALID_DATA); 3952 NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shstrndx <= pElfHeader->shnum, NV_ERR_INVALID_DATA); 3953 3954 // Get the offset and size of the table that holds the section names and make sure they are valid 3955 pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[pElfHeader->shoff + (pElfHeader->shstrndx * pElfHeader->shentsize)]; 3956 elfSectionNamesTableOffset = pElfSectionHeader->offset; 3957 elfSectionNamesTableSize = pElfSectionHeader->size; 3958 NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(elfSectionNamesTableOffset, elfSectionNamesTableSize - 1, &elfSectionNamesTableMaxIdx), NV_ERR_INVALID_DATA); 3959 NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionNamesTableMaxIdx, NV_ERR_INVALID_DATA); 3960 3961 // Iterate through all of the section headers to find the signatures 3962 pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[elfSectionHeaderMaxIdx + 1 - sizeof(*pElfSectionHeader)]; 3963 3964 for (idx = pElfHeader->shnum - 1; idx >= 0; idx--, pElfSectionHeader--) 3965 { 3966 NvU64 currentSectionNameMaxLength; 3967 NvU64 elfSectionMaxIdx; 3968 3969 // Make sure the header 
name index fits within the section names table 3970 NV_CHECK_OR_RETURN(LEVEL_ERROR, elfSectionNamesTableSize - 1 >= pElfSectionHeader->name, NV_ERR_INVALID_DATA); 3971 currentSectionNameMaxLength = elfSectionNamesTableSize - pElfSectionHeader->name - 1; 3972 pCurrentSectionName = (const char *) &pGspBuf[elfSectionNamesTableOffset + pElfSectionHeader->name]; 3973 3974 // Make sure the elf section size and offset are valid 3975 if (pElfSectionHeader->size > 0) 3976 { 3977 NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfSectionHeader->offset, pElfSectionHeader->size - 1, &elfSectionMaxIdx), NV_ERR_INVALID_DATA); 3978 } 3979 else 3980 { 3981 elfSectionMaxIdx = pElfSectionHeader->offset; 3982 } 3983 NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionMaxIdx, NV_ERR_INVALID_DATA); 3984 3985 // Check whether the section name matches the expected section name 3986 if ((sectionNameLength <= currentSectionNameMaxLength) && 3987 (portStringCompare(pCurrentSectionName, pSectionName, sectionNameLength) == 0) && 3988 (pCurrentSectionName[sectionNameLength] == '\0')) 3989 { 3990 *ppSectionData = &pGspBuf[pElfSectionHeader->offset]; 3991 *pSectionSize = pElfSectionHeader->size; 3992 3993 return NV_OK; 3994 } 3995 } 3996 3997 return NV_ERR_OBJECT_NOT_FOUND; 3998 } 3999 4000 /*! 4001 * Setup libos init arguments. 4002 */ 4003 void 4004 kgspSetupLibosInitArgs_IMPL 4005 ( 4006 OBJGPU *pGpu, 4007 KernelGsp *pKernelGsp 4008 ) 4009 { 4010 LibosMemoryRegionInitArgument *pLibosInitArgs = pKernelGsp->pLibosInitArgumentsCached; 4011 NvU8 idx; 4012 4013 portMemSet(pLibosInitArgs, 0, LIBOS_INIT_ARGUMENTS_SIZE); 4014 4015 // Add memory areas for logging each LIBOS task. 4016 // @note LOGINIT must be first for early init logging to work. 4017 // @note: These should be switched to radix regions to remove the need 4018 // for large apertures in the RM task for logging. 
    for (idx = 0; idx < LOGIDX_SIZE; idx++)
    {
        pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
        pLibosInitArgs[idx].loc = LIBOS_MEMORY_REGION_LOC_SYSMEM;
        pLibosInitArgs[idx].id8 = pKernelGsp->rmLibosLogMem[idx].id8;
        // NOTE(review): pa is taken from pTaskLogBuffer[1]; confirm the index
        // semantics against the log buffer setup code.
        pLibosInitArgs[idx].pa = pKernelGsp->rmLibosLogMem[idx].pTaskLogBuffer[1];
        pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->rmLibosLogMem[idx].pTaskLogDescriptor);
    }

    // insert GSP-RM ELF args address; id must match libos-config.py entry
    pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
    pLibosInitArgs[idx].loc = LIBOS_MEMORY_REGION_LOC_SYSMEM;
    pLibosInitArgs[idx].id8 = _kgspGenerateInitArgId("RMARGS");
    pLibosInitArgs[idx].pa = memdescGetPhysAddr(pKernelGsp->pGspArgumentsDescriptor, AT_GPU, 0);
    pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->pGspArgumentsDescriptor);

    // Publish the fully-written init arguments before GSP consumes them
    portAtomicMemoryFenceFull();
}

/*!
 * Receive and process RPC event from GSP-RM.
 *
 * This function is called from interrupt bottom-half handler (DPC) and
 * would race with normal RPC flow, _kgspRpcRecvPoll().
 * This race is currently avoided only because DPC is executed under
 * gpus lock, so RPC and Bottom-half handler are mutually exclusive
 * control flows.
 */
void
kgspRpcRecvEvents_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NvU32 gpuMaskUnused;
    NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
    //
    // We should never have an event with code NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS.
    // If we do the assert will fail on NV_WARN_MORE_PROCESSING_REQUIRED,
    // in addition to general error codes.
    //
    NV_ASSERT_OK(_kgspRpcDrainEvents(pGpu, pKernelGsp, NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS, KGSP_RPC_EVENT_HANDLER_CONTEXT_INTERRUPT));
}

/*!
 * Wait for GSP-RM initialization to complete.
 *
 * Polls for the GSP_INIT_DONE event and then validates the RPC result code
 * carried in the message header.
 *
 * @param[in] pGpu       OBJGPU pointer
 * @param[in] pKernelGsp KernelGsp pointer
 *
 * @return NV_OK once GSP-RM reports successful initialization.
 */
NV_STATUS
kgspWaitForRmInitDone_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    OBJRPC *pRpc = pKernelGsp->pRpc;

    //
    // Kernel RM can timeout when GSP-RM has an error condition. Give GSP-RM
    // a chance to report the error before we pull the rug out from under it.
    //
    threadStateResetTimeout(pGpu);

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_EVENT_GSP_INIT_DONE));

    //
    // Now check if RPC really succeeded (NV_VGPU_MSG_RESULT_* are defined to
    // equivalent NV_STATUS codes in RM).
    //
    NV_ASSERT_OK_OR_RETURN(RPC_HDR->rpc_result);

    pGpu->gspRmInitialized = NV_TRUE;
    if (hypervisorIsVgxHyper() && pGpu->getProperty(pGpu, PDB_PROP_GPU_EXTENDED_GSP_RM_INITIALIZATION_TIMEOUT_FOR_VGX))
    {
        // Decrease timeout values for VGX driver
        timeoutInitializeGpuDefault(&pGpu->timeoutData, pGpu);
    }

    return NV_OK;
}

/*!
 * Execute a sequencer buffer coming from GSP
 *
 * @param[in]      pGpu             GPU object pointer
 * @param[in]      pKernelGsp       KernelGsp object pointer
 * @param[in]      pRunCpuSeqParams Sequence buffer RPC parameters
 *
 * @return NV_OK if the GSP sequencer buffer has been executed successfully
 *         NV_ERR_INVALID_STATE if the sequencer buffer is not allocated
 *         NV_ERR_INVALID_DATA if the sequencer buffer is malformed
 */
NV_STATUS
kgspExecuteSequencerBuffer_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    void *pRunCpuSeqParams
)
{
    rpc_run_cpu_sequencer_v17_00 *pParams = (rpc_run_cpu_sequencer_v17_00 *)pRunCpuSeqParams;
    NvU32 *pCmd = pParams->commandBuffer;
    NvU32 buffer_end = pParams->cmdIndex;
    NvU32 current_cmd_index = 0;
    NV_STATUS nvStatus = NV_OK;
    NvU32 payloadSize;

    NV_ASSERT_OR_RETURN(IS_GSP_CLIENT(pGpu), NV_ERR_NOT_SUPPORTED);
    NV_ASSERT_OR_RETURN((pParams->bufferSizeDWord != 0), NV_ERR_INVALID_STATE);
    NV_ASSERT_OR_RETURN(buffer_end < pParams->bufferSizeDWord, NV_ERR_INVALID_DATA);

    // Each command is one opcode dword followed by an opcode-defined payload
    while (current_cmd_index < buffer_end)
    {
        NvU32 opCode = pCmd[current_cmd_index++];
        payloadSize = GSP_SEQUENCER_PAYLOAD_SIZE_DWORDS(opCode);

        // Reject commands whose payload would run past the end of the buffer
        NV_ASSERT_OR_RETURN(current_cmd_index + payloadSize <= buffer_end, NV_ERR_INVALID_DATA);

        //
        // Handling of sequencer commands is split between those commands
        // that are common to all architectures (handled directly here) and
        // those commands that are arch-specific and are handled via the
        // kgspExecuteSequencerCommand_HAL() call below.
        //
        switch (opCode)
        {
            // 2 arguments
            case GSP_SEQ_BUF_OPCODE_REG_WRITE:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_WRITE regWrite;
                portMemCopy(&regWrite, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE));

                GPU_REG_WR32(pGpu, regWrite.addr, regWrite.val);
                break;
            }

            // 3 arguments
            case GSP_SEQ_BUF_OPCODE_REG_MODIFY:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_MODIFY regModify;
                NvU32 regVal;

                portMemCopy(&regModify, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY));

                // Read-modify-write: clear the masked bits, then set the new value
                regVal = GPU_REG_RD32(pGpu, regModify.addr);
                regVal = regVal & ~regModify.mask;
                regVal = regVal | regModify.val;
                GPU_REG_WR32(pGpu, regModify.addr, regVal);
                break;
            }

            // 5 arguments
            case GSP_SEQ_BUF_OPCODE_REG_POLL:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_POLL regPoll;
                NvU32 regval;
                RMTIMEOUT timeout;

                portMemCopy(&regPoll, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL));

                regval = GPU_REG_RD32(pGpu, regPoll.addr);

                // Spin until the masked register value matches, or the timeout elapses
                gpuSetTimeout(pGpu, regPoll.timeout, &timeout, 0);
                while ((regval & regPoll.mask) != regPoll.val)
                {
                    nvStatus = gpuCheckTimeout(pGpu, &timeout);
                    if (nvStatus == NV_ERR_TIMEOUT)
                    {
                        NV_PRINTF(LEVEL_ERROR, "Timeout waiting for register to settle, value = 0x%x, err_code = 0x%x\n",
                                  regval, regPoll.error);
                        DBG_BREAKPOINT();
                        return nvStatus;
                    }
                    osSpinLoop();
                    regval = GPU_REG_RD32(pGpu, regPoll.addr);
                }
                break;
            }

            case GSP_SEQ_BUF_OPCODE_DELAY_US:
            {
                GSP_SEQ_BUF_PAYLOAD_DELAY_US delayUs;
                portMemCopy(&delayUs, sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US));

                osDelayUs(delayUs.val);
                break;
            }

            case GSP_SEQ_BUF_OPCODE_REG_STORE:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_STORE regStore;
                portMemCopy(&regStore, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE));

                // The save-area index comes from GSP; bound-check before storing
                NV_ASSERT_OR_RETURN(regStore.index < GSP_SEQ_BUF_REG_SAVE_SIZE, NV_ERR_INVALID_ARGUMENT);

                pParams->regSaveArea[regStore.index] = GPU_REG_RD32(pGpu, regStore.addr);
                break;
            }

            case GSP_SEQ_BUF_OPCODE_CORE_RESET:
            {
                NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);

                kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
                kflcnDisableCtxReq_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
                break;
            }

            case GSP_SEQ_BUF_OPCODE_CORE_START:
            {
                NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);

                kflcnStartCpu_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
                break;
            }

            case GSP_SEQ_BUF_OPCODE_CORE_WAIT_FOR_HALT:
            {
                NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);

                NV_ASSERT_OK_OR_RETURN(kflcnWaitForHalt_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon), GPU_TIMEOUT_DEFAULT, 0));
                break;
            }

            default:
                //
                // Route this command to the arch-specific handler.
                //
                NV_ASSERT_OK_OR_RETURN(kgspExecuteSequencerCommand_HAL(pGpu, pKernelGsp, opCode, &pCmd[current_cmd_index], payloadSize * sizeof (*pCmd)));
                break;
        }
        current_cmd_index += payloadSize;
    }

    return NV_OK;
}

#if LIBOS_LOG_DECODE_ENABLE
// 1 Hz callback that flushes GSP logs while live decode is available
static void
_kgspLogPollingCallback
(
    OBJGPU *pGpu,
    void *data
)
{
    //
    // Do not take any locks in kgspDumpGspLogs. As this callback only fires when kgspNvlogFlushCb
    // is not registered, there is no possibility of data race.
    //
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    kgspDumpGspLogsUnlocked(pKernelGsp, NV_FALSE);
}

/*!
 * Start the 1 Hz GSP log polling callback, if live log decode is possible.
 *
 * @returns NV_OK, or the status from osSchedule1HzCallback on failure.
 */
NV_STATUS
kgspStartLogPolling_IMPL
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp
)
{
    NV_STATUS status = NV_OK;

    //
    // Only enable the 1 Hz poll if we can live decode logs in dmesg. Else we'll
    // flush it on demand by nvidia-debugdump.
    //
    if (pKernelGsp->pLogElf != NULL)
    {
        status = osSchedule1HzCallback(pGpu,
                                       _kgspLogPollingCallback,
                                       NULL,
                                       NV_OS_1HZ_REPEAT);
    }
    return status;
}

// Stop the 1 Hz log polling callback (only registered when a log ELF is present).
static void
_kgspStopLogPolling
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp
)
{
    if (pKernelGsp->pLogElf != NULL)
    {
        osRemove1HzCallback(pGpu, _kgspLogPollingCallback, NULL);
    }
}

#else // LIBOS_LOG_DECODE_ENABLE

// Stub: log polling is a no-op when libos log decode is compiled out.
NV_STATUS
kgspStartLogPolling_IMPL
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp
)
{
    return NV_OK;
}

// Stub counterpart of _kgspStopLogPolling.
static void
_kgspStopLogPolling
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp
)
{
    return;
}
#endif // LIBOS_LOG_DECODE_ENABLE

/*!
 * Provides an opportunity to register some IntrService during intrStateInit.
 */
void
kgspRegisterIntrService_IMPL
(
    OBJGPU            *pGpu,
    KernelGsp         *pKernelGsp,
    IntrServiceRecord  pRecords[MC_ENGINE_IDX_MAX]
)
{
    NvU32 engineIdx = MC_ENGINE_IDX_GSP;

    // Only register when running as a GSP client (offload mode)
    if (!IS_GSP_CLIENT(pGpu))
        return;

    // The GSP engine slot must not already be claimed by another service
    NV_ASSERT(pRecords[engineIdx].pInterruptService == NULL);
    pRecords[engineIdx].pInterruptService = staticCast(pKernelGsp, IntrService);
}

/*!
 * Service GSP interrupts.
 *
 * @returns Zero, or any implementation-chosen nonzero value. If the same nonzero value is returned enough
 *          times the interrupt is considered stuck.
 */
NvU32
kgspServiceInterrupt_IMPL
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp,
    IntrServiceServiceInterruptArguments *pParams
)
{
    // Validate the dispatch arguments before forwarding to the HAL handler
    NV_ASSERT_OR_RETURN(pParams != NULL, 0);
    NV_ASSERT_OR_RETURN(pParams->engineIdx == MC_ENGINE_IDX_GSP, 0);

    return kgspService_HAL(pGpu, pKernelGsp);
}

/*!
 * Calculates the GSP FW heap size based on the GPU's resources.
 *
 * @param[in] maxGspFwHeapSizeMB  upper clamp on the computed heap size, in MB
 *
 * @returns heap size in bytes
 */
static NvU64
_kgspCalculateFwHeapSize
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp,
    NvU32      maxGspFwHeapSizeMB
)
{
    // For VGPU, use the static pre-calculated size
    if (pGpu->bVgpuGspPluginOffloadEnabled)
        return GSP_FW_HEAP_SIZE_VGPU_DEFAULT;

    //
    // The baremetal heap calculation is a function of the architecture, FB
    // size, and a chunk for backing client allocations (pre-calibrated for the
    // architecture through rough profiling).
    //
    KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
    NvU64 fbSize = 0;

    NV_ASSERT_OK(kmemsysGetUsableFbSize_HAL(pGpu, pKernelMemorySystem, &fbSize));
    // Round FB size up to whole GB for the per-GB scaling term below
    const NvU32 fbSizeGB = (NvU32)(NV_ALIGN_UP64(fbSize, 1 << 30) >> 30);

    //
    // Reclaimable binary data will end up padding the heap (in some cases,
    // significantly), but due to memory fragmentation we can't rely on it to
    // linearly reduce the amount needed in the primary heap, so it is not a
    // factor here. Instead, it's just extra margin to keep us from exhausting
    // the heap at runtime.
    //
    // Heap = OS carveout + arch base size + per-GB-of-FB term + client-alloc chunk,
    // with the variable terms rounded up to 1 MB granularity.
    NvU64 heapSize = kgspGetFwHeapParamOsCarveoutSize_HAL(pGpu, pKernelGsp) +
                     pKernelGsp->fwHeapParamBaseSize +
                     NV_ALIGN_UP(GSP_FW_HEAP_PARAM_SIZE_PER_GB_FB * fbSizeGB, 1 << 20) +
                     NV_ALIGN_UP(GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE, 1 << 20);

    // Clamp to the minimum, even if the calculations say we can do with less
    const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp);
    heapSize = NV_MAX(heapSize, (NvU64)minGspFwHeapSizeMB << 20);

    // Clamp to the maximum heap size, if necessary
    heapSize = NV_MIN(heapSize, (NvU64)maxGspFwHeapSizeMB << 20);

    NV_PRINTF(LEVEL_INFO, "GSP FW heap %lluMB of %uGB\n",
              heapSize >> 20, fbSizeGB);

    return heapSize;
}

/*!
 * Returns the size in bytes of the GSP FW heap:
 *  - the registry override, if present
 *  - otherwise, calculate the FW heap size for this GPU, limiting it to stay
 *    within the pre-scrubbed area at the end of FB, if needed
 *
 * @param[in] posteriorFbSize - size in bytes of the memory reserved between the
 *                              end of the GSP FW heap and the end of FB, or 0
 *                              to disable limiting of the heap range to within
 *                              the pre-scrubbed area at the end of FB
 */
NvU64
kgspGetFwHeapSize_IMPL
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp,
    NvU64      posteriorFbSize
)
{
    NvU32 maxScrubbedHeapSizeMB = NV_U32_MAX;
    NvU32 heapSizeMB = 0;

    //
    // The pre-scrubbed region at the end of FB may limit the heap size, if no
    // scrubber ucode is supported to unlock the rest of memory prior to booting
    // GSP-RM.
4449 // 4450 if (!pKernelGsp->bScrubberUcodeSupported && (posteriorFbSize != 0)) 4451 { 4452 const NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp); 4453 if (prescrubbedSize < NV_U64_MAX) 4454 maxScrubbedHeapSizeMB = (NvU32)((prescrubbedSize - posteriorFbSize) >> 20); 4455 } 4456 4457 // Get the heap size override from the registry, if any 4458 if ((osReadRegistryDword(pGpu, NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB, &heapSizeMB) == NV_OK) && 4459 (heapSizeMB != NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB_DEFAULT)) 4460 { 4461 const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp); 4462 const NvU32 maxGspFwHeapSizeMB = NV_MIN(kgspGetMaxWprHeapSizeMB_HAL(pGpu, pKernelGsp), 4463 maxScrubbedHeapSizeMB); 4464 4465 NV_ASSERT(minGspFwHeapSizeMB < maxGspFwHeapSizeMB); 4466 4467 if (heapSizeMB > maxGspFwHeapSizeMB) 4468 { 4469 NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to maximum (%uMB)\n", 4470 maxGspFwHeapSizeMB); 4471 heapSizeMB = maxGspFwHeapSizeMB; 4472 } 4473 else if (heapSizeMB < minGspFwHeapSizeMB) 4474 { 4475 NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to minimum (%uMB)\n", 4476 minGspFwHeapSizeMB); 4477 heapSizeMB = minGspFwHeapSizeMB; 4478 } 4479 else 4480 { 4481 NV_PRINTF(LEVEL_WARNING, "Firmware heap size overridden (%uMB)\n", 4482 heapSizeMB); 4483 } 4484 4485 return ((NvU64)heapSizeMB) << 20; 4486 } 4487 4488 return _kgspCalculateFwHeapSize(pGpu, pKernelGsp, maxScrubbedHeapSizeMB); 4489 } 4490 4491 NvU64 kgspGetWprEndMargin_IMPL(OBJGPU *pGpu, KernelGsp *pKernelGsp) 4492 { 4493 NvU64 wprEndMargin; 4494 NvU32 marginOverride = 0; 4495 GspFwWprMeta *pWprMeta = pKernelGsp->pWprMeta; 4496 4497 (void)osReadRegistryDword(pGpu, NV_REG_STR_RM_GSP_WPR_END_MARGIN, &marginOverride); 4498 4499 wprEndMargin = ((NvU64)DRF_VAL(_REG, _RM_GSP_WPR_END_MARGIN, _MB, marginOverride)) << 20; 4500 if (wprEndMargin == 0) 4501 { 4502 // Calculate the default margin size based on the WPR size 4503 const GspFwWprMeta *pWprMeta = 
pKernelGsp->pWprMeta; 4504 4505 // 4506 // This needs to be called after pWprMeta->sizeOfRadix3Elf has been initialized, 4507 // in order to estimate the default WPR size. 4508 // 4509 NV_ASSERT(pWprMeta->sizeOfRadix3Elf > 0); 4510 4511 // 4512 // If the bounds are encoded in GspFwWprMeta from a prior attempt, use them. 4513 // Otherwise, estimate the WPR size by the sizes of the elements in the layout 4514 // 4515 if (pWprMeta->gspFwWprEnd > pWprMeta->nonWprHeapOffset) 4516 { 4517 wprEndMargin = pWprMeta->gspFwWprEnd - pWprMeta->nonWprHeapOffset; 4518 } 4519 else 4520 { 4521 wprEndMargin += kgspGetFrtsSize_HAL(pGpu, pKernelGsp); 4522 wprEndMargin += pKernelGsp->gspRmBootUcodeSize; 4523 wprEndMargin += pWprMeta->sizeOfRadix3Elf; 4524 wprEndMargin += kgspGetFwHeapSize(pGpu, pKernelGsp, 0); 4525 wprEndMargin += kgspGetNonWprHeapSize(pGpu, pKernelGsp); 4526 } 4527 4528 if (pKernelGsp->bootAttempts > 0) 4529 wprEndMargin *= pKernelGsp->bootAttempts; 4530 } 4531 4532 if (FLD_TEST_DRF(_REG, _RM_GSP_WPR_END_MARGIN, _APPLY, _ALWAYS, marginOverride) || 4533 (pKernelGsp->bootAttempts > 0)) 4534 { 4535 NV_PRINTF(LEVEL_WARNING, "Adding margin of 0x%llx bytes after the end of WPR2\n", 4536 wprEndMargin); 4537 pWprMeta->flags |= GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT; 4538 return wprEndMargin; 4539 } 4540 4541 // Normal boot path 4542 pWprMeta->flags &= ~GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT; 4543 return 0; 4544 } 4545