/*
 * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "resserv/rs_server.h"

#include "gpu/gsp/kernel_gsp.h"

#include "kernel/core/thread_state.h"
#include "kernel/core/locks.h"
#include "kernel/diagnostics/gpu_acct.h"
#include "kernel/diagnostics/journal.h"
#include "kernel/gpu/fifo/kernel_channel.h"
#include "kernel/gpu/gsp/gsp_trace_rats_macro.h"
#include "kernel/gpu/intr/engine_idx.h"
#include "kernel/gpu/mem_mgr/heap.h"
#include "kernel/gpu/mem_mgr/mem_mgr.h"
#include "kernel/gpu/mem_sys/kern_mem_sys.h"
#include "kernel/gpu/rc/kernel_rc.h"
#include "kernel/gpu/nvlink/kernel_nvlink.h"
#include "virtualization/hypervisor/hypervisor.h"
#include "virtualization/vgpuconfigapi.h"
#include "kernel/gpu/disp/kern_disp.h"
#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "kernel/gpu/device/device.h"
#include "gpu/external_device/external_device.h"
#include "kernel/platform/platform_request_handler.h"
#include "class/cl2080.h" // NV20_SUBDEVICE_0
#include "ctrl/ctrl2080/ctrl2080nvd.h"
#include "liblogdecode.h"
#include "libelf.h"
#include "nverror.h"
#include "nvrm_registry.h"
#include "nv-firmware.h"
#include "nv-firmware-chip-family-select.h"
#include "nvtypes.h"
#include "nvVer.h"
#include "objrpc.h"
#include "objtmr.h"
#include "os/os.h"
#include "rmgspseq.h"
#include "sweng/dispsw.h"
#include "kernel/gpu/timed_sema.h"
#include "vgpu/rpc.h"
#include "kernel/gpu/pmu/kern_pmu.h"
#include "gpu/perf/kern_perf.h"
#include "core/locks.h"
#include "kernel/gpu/intr/intr.h"

#define RPC_STRUCTURES
#define RPC_GENERIC_UNION
#include "g_rpc-structures.h"
#undef RPC_STRUCTURES
#undef RPC_GENERIC_UNION

#define RPC_MESSAGE_STRUCTURES
#define RPC_MESSAGE_GENERIC_UNION
#include "g_rpc-message-header.h"
#undef RPC_MESSAGE_STRUCTURES
#undef RPC_MESSAGE_GENERIC_UNION

#include "gpu/gsp/message_queue_priv.h"

#include "gpu/conf_compute/conf_compute.h"

#define RPC_HDR ((rpc_message_header_v*)(pRpc->message_buffer))

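//
// Marshalled copy of a MIG compute-instance config update, so the RPC payload
// can outlive the event handler and be consumed by a deferred work item
// (see _kgspRpcMigCiConfigUpdate below).
//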
struct MIG_CI_UPDATE_CALLBACK_PARAMS
{
    NvU32 execPartCount;
    NvU32 execPartId[NVC637_CTRL_MAX_EXEC_PARTITIONS];
    NvU32 gfid;
    NvBool bDelete;
};

//
// RPC_PARAMS defines the rpc_params pointer and initializes it to the correct
// sub-structure.
//
// RPC_PARAMS intentionally assigns the latest version structure to the
// versioned rpc_params pointer. With the -Werror=incompatible-pointer-types
// compiler flag, this checks for mismatched structure versions at compile time.
//
// For example:
//   RPC_PARAMS(free, _v03_00);
// expands to
//   rpc_free_v03_00 *rpc_params = &RPC_HDR->rpc_message_data->free_v;
//
#define RPC_PARAMS(r, v) rpc_##r##v *rpc_params = &RPC_HDR->rpc_message_data->r##_v

static NV_STATUS _kgspInitRpcInfrastructure(OBJGPU *, KernelGsp *);
static void _kgspFreeRpcInfrastructure(OBJGPU *, KernelGsp *);

static NV_STATUS _kgspConstructRpcObject(OBJGPU *, KernelGsp *, MESSAGE_QUEUE_INFO *, OBJRPC **);

static NV_STATUS _kgspRpcSendMessage(OBJGPU *, OBJRPC *);
static NV_STATUS _kgspRpcRecvPoll(OBJGPU *, OBJRPC *, NvU32);
static NV_STATUS _kgspRpcDrainEvents(OBJGPU *, KernelGsp *, NvU32, KernelGspRpcEventHandlerContext);
static void _kgspRpcIncrementTimeoutCountAndRateLimitPrints(OBJGPU *, OBJRPC *);

static NV_STATUS _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);
static void _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static NV_STATUS _kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);
static void _kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static void _kgspStopLogPolling(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static void _kgspFreeBootBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp);

static NV_STATUS _kgspPrepareGspRmBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw);

static NV_STATUS _kgspCreateSignatureMemdesc(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                             GSP_FIRMWARE *pGspFw);

static NV_STATUS _kgspFwContainerVerifyVersion(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                               const void *pElfData, NvU64 elfDataSize,
                                               const char *pNameInMsg);

static NV_STATUS _kgspFwContainerGetSection(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                            const void *pElfData, NvU64 elfDataSize,
                                            const char *pSectionName,
                                            const void **ppSectionData, NvU64 *pSectionSize);

static NV_STATUS _kgspGetSectionNameForPrefix(OBJGPU *pGpu, KernelGsp *pKernelGsp,
                                              char *pSectionNameBuf, NvLength sectionNameBufSize,
                                              const char *pSectionPrefix);

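/*!
 * Extract two function-specific debug words from the RPC message currently in
 * the buffer, for inclusion in the RPC history and Xid logs.
 */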
static void
_kgspGetActiveRpcDebugData
(
    OBJRPC *pRpc,
    NvU32 function,
    NvU64 *data0,
    NvU64 *data1
)
{
    switch (function)
    {
        // Functions (CPU -> GSP)
        case NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL:
        {
            RPC_PARAMS(gsp_rm_control, _v03_00);
            *data0 = rpc_params->cmd;
            *data1 = rpc_params->paramsSize;
            break;
        }
        case NV_VGPU_MSG_FUNCTION_GSP_RM_ALLOC:
        {
            RPC_PARAMS(gsp_rm_alloc, _v03_00);
            *data0 = rpc_params->hClass;
            *data1 = rpc_params->paramsSize;
            break;
        }
        case NV_VGPU_MSG_FUNCTION_FREE:
        {
            RPC_PARAMS(free, _v03_00);
            *data0 = rpc_params->params.hObjectOld;
            *data1 = rpc_params->params.hObjectParent;
            break;
        }

        // Events (CPU <- GSP)
        case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
        {
            RPC_PARAMS(run_cpu_sequencer, _v17_00);
            *data0 = rpc_params->cmdIndex;
            *data1 = rpc_params->bufferSizeDWord;
            break;
        }
        case NV_VGPU_MSG_EVENT_POST_EVENT:
        {
            RPC_PARAMS(post_event, _v17_00);
            *data0 = rpc_params->notifyIndex;
            *data1 = rpc_params->data;
            break;
        }
        case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
        {
            RPC_PARAMS(rc_triggered, _v17_02);
            *data0 = rpc_params->nv2080EngineType;
            *data1 = rpc_params->exceptType;
            break;
        }
        case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
        {
            RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);
            *data0 = rpc_params->gfid;
            *data1 = rpc_params->notifyIndex;
            break;
        }
        case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
        {
            RPC_PARAMS(gsp_lockdown_notice, _v17_00);
            *data0 = rpc_params->bLockdownEngaging;
            *data1 = 0;
            break;
        }
        case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
        {
            RPC_PARAMS(gsp_post_nocat_record, _v01_00);
            const NV2080CtrlNocatJournalInsertRecord *pRecord =
                (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data;
            *data0 = pRecord->recType;
            *data1 = pRecord->errorCode;
            break;
        }

        default:
        {
            *data0 = 0;
            *data1 = 0;
            break;
        }
    }
}

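/*!
 * Pre-flight checks run before every RPC send: bail out early if GSP has
 * crashed, or if the GPU is in reset, lost, shut down, not at full power, or
 * unable to access sysmem.
 */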
static NV_STATUS
_kgspRpcSanityCheck(OBJGPU *pGpu, KernelGsp *pKernelGsp, OBJRPC *pRpc)
{
    if (pKernelGsp->bFatalError)
    {
        NV_PRINTF(LEVEL_INFO, "GSP crashed, skipping RPC\n");
        //
        // In case of a fatal GSP error, if there was an outstanding RPC at the
        // time, we should have already printed the error for that, so this is a
        // new RPC call...from now on don't bother printing RPC errors anymore,
        // as it can be too noisy and overrun logs.
        //
        pRpc->bQuietPrints = NV_TRUE;
        return NV_ERR_RESET_REQUIRED;
    }
    if (API_GPU_IN_RESET_SANITY_CHECK(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU in reset, skipping RPC\n");
        return NV_ERR_GPU_IN_FULLCHIP_RESET;
    }
    if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu) ||
        pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
    {
        NV_PRINTF(LEVEL_INFO, "GPU lost, skipping RPC\n");
        return NV_ERR_GPU_IS_LOST;
    }
    if (osIsGpuShutdown(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU shutdown, skipping RPC\n");
        return NV_ERR_GPU_IS_LOST;
    }
    if (!gpuIsGpuFullPowerForPmResume(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU not full power, skipping RPC\n");
        return NV_ERR_GPU_NOT_FULL_POWER;
    }
    if (!gpuCheckSysmemAccess(pGpu))
    {
        NV_PRINTF(LEVEL_INFO, "GPU has no sysmem access, skipping RPC\n");
        return NV_ERR_INVALID_ACCESS_TYPE;
    }
    return NV_OK;
}

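/*!
 * Record the RPC currently in the message buffer in the circular history
 * (depth RPC_HISTORY_DEPTH), capturing its function id, start timestamp, and
 * two function-specific debug words.
 */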
static void
_kgspAddRpcHistoryEntry
(
    OBJRPC *pRpc,
    RpcHistoryEntry *pHistory,
    NvU32 *pCurrent
)
{
    NvU32 func = RPC_HDR->function;
    NvU32 entry;

    entry = *pCurrent = (*pCurrent + 1) % RPC_HISTORY_DEPTH;

    portMemSet(&pHistory[entry], 0, sizeof(pHistory[0]));
    pHistory[entry].function = func;
    pHistory[entry].ts_start = osGetTimestamp();

    _kgspGetActiveRpcDebugData(pRpc, func,
                               &pHistory[entry].data[0],
                               &pHistory[entry].data[1]);
}

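/*!
 * Stamp the end timestamp on the given history entry, and backfill the same
 * end timestamp into any older entries that were never explicitly completed.
 */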
static void
_kgspCompleteRpcHistoryEntry
(
    RpcHistoryEntry *pHistory,
    NvU32 current
)
{
    NvU32 historyIndex;
    NvU32 historyEntry;

    pHistory[current].ts_end = osGetTimestamp();

    //
    // Complete any previous entries that aren't marked complete yet, using the same timestamp
    // (we may not have explicitly waited for them)
    //
    for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
    {
        historyEntry = (current + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
        if (pHistory[historyEntry].ts_start != 0 &&
            pHistory[historyEntry].ts_end == 0)
        {
            pHistory[historyEntry].ts_end = pHistory[current].ts_end;
        }
    }
}

/*!
 * GSP client RM RPC send routine
 */
static NV_STATUS
_kgspRpcSendMessage
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    NV_STATUS nvStatus;
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    NvU32 gpuMaskUnused;

    NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));

    NV_CHECK_OK_OR_RETURN(LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc));

    nvStatus = GspMsgQueueSendCommand(pRpc->pMessageQueueInfo, pGpu);
    if (nvStatus != NV_OK)
    {
        if (nvStatus == NV_ERR_TIMEOUT ||
            nvStatus == NV_ERR_BUSY_RETRY)
        {
            _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);
        }
        NV_PRINTF_COND(pRpc->bQuietPrints, LEVEL_INFO, LEVEL_ERROR,
                       "GspMsgQueueSendCommand failed on GPU%d: 0x%x\n",
                       gpuGetInstance(pGpu), nvStatus);
        return nvStatus;
    }

    kgspSetCmdQueueHead_HAL(pGpu, pKernelGsp, pRpc->pMessageQueueInfo->queueIdx, 0);

    _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcHistory, &pRpc->rpcHistoryCurrent);

    return NV_OK;
}

static NV_STATUS
_kgspRpcRunCpuSequencer
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(run_cpu_sequencer, _v17_00);
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);

    return kgspExecuteSequencerBuffer(pGpu, pKernelGsp, rpc_params);
}

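/*!
 * Handle an ECC DBE notifier: when dynamic page offlining is enabled, record
 * the faulting page in the heap's pending blacklist for retirement.
 */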
static void
_kgspProcessEccNotifier
(
    OBJGPU *pGpu,
    void *eventData
)
{
    NV_STATUS nvStatus = NV_OK;
    MemoryManager *pMemoryMgr = GPU_GET_MEMORY_MANAGER(pGpu);

    if (pMemoryMgr->bEnableDynamicPageOfflining)
    {
        Nv2080EccDbeNotification *pParams = (Nv2080EccDbeNotification*)eventData;
        if ((nvStatus = heapStorePendingBlackList(pGpu, GPU_GET_HEAP(pGpu), pParams->physAddress,
                                                  pParams->physAddress)) != NV_OK)
        {
            if (nvStatus == NV_ERR_RESET_REQUIRED)
            {
                NV_PRINTF(LEVEL_INFO, "DED hit on the reserved region; nothing to handle in this code path.\n");
                NV_PRINTF(LEVEL_INFO, "Relying on the FBHUB interrupt to kill all the channels and force reset the GPU.\n");
            }
            else
            {
                NV_PRINTF(LEVEL_INFO, "Dynamically blacklisting the DED page offset failed, status: %x\n", nvStatus);
                DBG_BREAKPOINT();
            }
        }
    }
}

/*!
 * Receive an event notification from GSP-RM.
 *
 * When an event fires in GSP-RM, osNotifyEvent and osEventNotification check
 * whether the event was originally allocated from client-RM. If so, they post
 * it to the event queue and take no further action. Client RM picks up the
 * event here and handles it.
 */
static NV_STATUS
_kgspRpcPostEvent
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(post_event, _v17_00);
    PEVENTNOTIFICATION pNotifyList  = NULL;
    PEVENTNOTIFICATION pNotifyEvent = NULL;
    Event             *pEvent       = NULL;
    NV_STATUS          nvStatus     = NV_OK;

    // Get the notification list that contains this event.
    NV_ASSERT_OR_RETURN(CliGetEventInfo(rpc_params->hClient,
                        rpc_params->hEvent, &pEvent), NV_ERR_OBJECT_NOT_FOUND);

    if (pEvent->pNotifierShare != NULL)
        pNotifyList = pEvent->pNotifierShare->pEventList;

    NV_ASSERT_OR_RETURN(pNotifyList != NULL, NV_ERR_INVALID_POINTER);

    switch (rpc_params->notifyIndex)
    {
        case NV2080_NOTIFIERS_ECC_DBE:
            _kgspProcessEccNotifier(pGpu, rpc_params->eventData);
            break;
    }

    // Send the event.
    if (rpc_params->bNotifyList)
    {
        // Send notification to all matching events on the list.
        nvStatus = osEventNotificationWithInfo(pGpu, pNotifyList, rpc_params->notifyIndex,
                                               rpc_params->data, rpc_params->info16,
                                               rpc_params->eventData, rpc_params->eventDataSize);
    }
    else
    {
        // Send event to a specific hEvent. Find hEvent in the notification list.
        for (pNotifyEvent = pNotifyList; pNotifyEvent; pNotifyEvent = pNotifyEvent->Next)
        {
            if (pNotifyEvent->hEvent == rpc_params->hEvent)
            {
                nvStatus = osNotifyEvent(pGpu, pNotifyEvent, 0,
                                         rpc_params->data, rpc_params->status);
                break;
            }
        }
        NV_ASSERT_OR_RETURN(pNotifyEvent != NULL, NV_ERR_OBJECT_NOT_FOUND);
    }

    return nvStatus;
}

/*!
 * Receive RC notification from GSP-RM.
 *
 * RC error handling ("Channel Teardown sequence") is executed in GSP-RM.
 * Client notifications, OS interaction, etc. happen in CPU-RM (Kernel RM).
 */
static NV_STATUS
_kgspRpcRCTriggered
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(rc_triggered, _v17_02);

    KernelRc      *pKernelRc   = GPU_GET_KERNEL_RC(pGpu);
    KernelChannel *pKernelChannel;
    KernelFifo    *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    CHID_MGR      *pChidMgr;
    NvU32          status = NV_OK;
    RM_ENGINE_TYPE rmEngineType = gpuGetRmEngineType(rpc_params->nv2080EngineType);
    NvBool         bIsCcEnabled = NV_FALSE;

    // Check if there's a PCI-E error pending either in device status or in AER
    krcCheckBusError_HAL(pGpu, pKernelRc);

    //
    // If we have received a special msg from GSP then ack back immediately
    // that we are done writing notifiers since we would have already processed the
    // other RC msgs that trigger notifier writes before this one.
    //
    if (rpc_params->exceptType == ROBUST_CHANNEL_FAST_PATH_ERROR)
    {
        NV_RM_RPC_ECC_NOTIFIER_WRITE_ACK(pGpu, status);
        NV_ASSERT_OK(status);
        return status;
    }

    status = kfifoGetChidMgrFromType(pGpu, pKernelFifo,
                                     ENGINE_INFO_TYPE_RM_ENGINE_TYPE,
                                     (NvU32)rmEngineType,
                                     &pChidMgr);
    if (status != NV_OK)
        return status;

    pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
                                                  pChidMgr,
                                                  rpc_params->chid);
    NV_CHECK_OR_RETURN(LEVEL_ERROR,
                       pKernelChannel != NULL,
                       NV_ERR_INVALID_CHANNEL);

    // Add the RcDiag records we received from GSP-RM to our system wide journal
    {
        OBJSYS   *pSys  = SYS_GET_INSTANCE();
        Journal  *pRcDB = SYS_GET_RCDB(pSys);
        RmClient *pClient;

        NvU32 recordSize     = rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport);
        NvU32 rcDiagRecStart = pRcDB->RcErrRptNextIdx;
        NvU32 rcDiagRecEnd;
        NvU32 processId      = 0;
        NvU32 owner          = RCDB_RCDIAG_DEFAULT_OWNER;

        pClient = dynamicCast(RES_GET_CLIENT(pKernelChannel), RmClient);
        NV_ASSERT(pClient != NULL);
        if (pClient != NULL)
            processId = pClient->ProcID;

        for (NvU32 i = 0; i < rpc_params->rcJournalBufferSize / recordSize; i++)
        {
            RmRCCommonJournal_RECORD *pCommonRecord =
                (RmRCCommonJournal_RECORD *)((NvU8*)&rpc_params->rcJournalBuffer + i * recordSize);
            RmRcDiag_RECORD *pRcDiagRecord =
                (RmRcDiag_RECORD *)&pCommonRecord[1];

#if defined(DEBUG)
            NV_PRINTF(LEVEL_INFO, "%d: GPUTag=0x%x CPUTag=0x%llx timestamp=0x%llx stateMask=0x%llx\n",
                      i, pCommonRecord->GPUTag, pCommonRecord->CPUTag, pCommonRecord->timeStamp,
                      pCommonRecord->stateMask);
            NV_PRINTF(LEVEL_INFO, "   idx=%d timeStamp=0x%x type=0x%x flags=0x%x count=%d owner=0x%x processId=0x%x\n",
                      pRcDiagRecord->idx, pRcDiagRecord->timeStamp, pRcDiagRecord->type, pRcDiagRecord->flags,
                      pRcDiagRecord->count, pRcDiagRecord->owner, processId);
            for (NvU32 j = 0; j < pRcDiagRecord->count; j++)
            {
                NV_PRINTF(LEVEL_INFO, "     %d: offset=0x%08x tag=0x%08x value=0x%08x attribute=0x%08x\n",
                          j, pRcDiagRecord->data[j].offset, pRcDiagRecord->data[j].tag,
                          pRcDiagRecord->data[j].value, pRcDiagRecord->data[j].attribute);
            }
#endif
            if (rcdbAddRcDiagRecFromGsp(pGpu, pRcDB, pCommonRecord, pRcDiagRecord) == NULL)
            {
                NV_PRINTF(LEVEL_WARNING, "Lost RC diagnostic record coming from GPU%d GSP: type=0x%x stateMask=0x%llx\n",
                          gpuGetInstance(pGpu), pRcDiagRecord->type, pCommonRecord->stateMask);
            }
        }

        rcDiagRecEnd = pRcDB->RcErrRptNextIdx - 1;

        // Update records to have the correct PID associated with the channel
        if (rcDiagRecStart != rcDiagRecEnd)
        {
            rcdbUpdateRcDiagRecContext(pRcDB,
                                       rcDiagRecStart,
                                       rcDiagRecEnd,
                                       processId,
                                       owner);
        }
    }

    bIsCcEnabled = gpuIsCCFeatureEnabled(pGpu);

    // With CC enabled, CPU-RM needs to write error notifiers
    if (bIsCcEnabled)
    {
        NV_ASSERT_OK_OR_RETURN(krcErrorSetNotifier(pGpu, pKernelRc,
                                                   pKernelChannel,
                                                   rpc_params->exceptType,
                                                   rmEngineType,
                                                   rpc_params->scope));
    }

    return krcErrorSendEventNotifications_HAL(pGpu, pKernelRc,
                                              pKernelChannel,
                                              rmEngineType, // unused on kernel side
                                              rpc_params->exceptType,
                                              rpc_params->scope,
                                              rpc_params->partitionAttributionId);
}

/*!
 * This function is called on a critical FW crash to RC and notify an error code
 * to all user mode channels, allowing the user mode apps to fail deterministically.
 *
 * @param[in] pGpu       GPU object pointer
 * @param[in] pKernelGsp KernelGsp object pointer
 * @param[in] exceptType Error code to send to the RC notifiers
 */
void
kgspRcAndNotifyAllUserChannels
(
    OBJGPU    *pGpu,
    KernelGsp *pKernelGsp,
    NvU32      exceptType
)
{
    KernelRc         *pKernelRc   = GPU_GET_KERNEL_RC(pGpu);
    KernelChannel    *pKernelChannel;
    KernelFifo       *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    CHANNEL_ITERATOR  chanIt;
    RMTIMEOUT         timeout;

    NV_PRINTF(LEVEL_ERROR, "RC all user channels for critical error %d.\n", exceptType);

    // Pass 1: halt all user channels.
    kfifoGetChannelIterator(pGpu, pKernelFifo, &chanIt, INVALID_RUNLIST_ID);
    while (kfifoGetNextKernelChannel(pGpu, pKernelFifo, &chanIt, &pKernelChannel) == NV_OK)
    {
        //
        // Kernel (uvm) channels are skipped to work around nvbug 4503046, where
        // uvm attributes all errors as global and fails operations on all GPUs,
        // in addition to the current failing GPU.
        //
        if (kchannelCheckIsKernel(pKernelChannel))
        {
            continue;
        }

        kfifoStartChannelHalt(pGpu, pKernelFifo, pKernelChannel);
    }

    //
    // Pass 2: Wait for the halts to complete, and RC notify the user channels.
    // The channel halts require a preemption, which may not be able to complete
    // since the GSP is no longer servicing interrupts. Wait for up to the
    // default GPU timeout value for the preemptions to complete.
    //
    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
    kfifoGetChannelIterator(pGpu, pKernelFifo, &chanIt, INVALID_RUNLIST_ID);
    while (kfifoGetNextKernelChannel(pGpu, pKernelFifo, &chanIt, &pKernelChannel) == NV_OK)
    {
        // Skip kernel (uvm) channels as only user channel halts are initiated above.
        if (kchannelCheckIsKernel(pKernelChannel))
        {
            continue;
        }

        kfifoCompleteChannelHalt(pGpu, pKernelFifo, pKernelChannel, &timeout);

        NV_ASSERT_OK(krcErrorSetNotifier(pGpu, pKernelRc,
                                         pKernelChannel,
                                         exceptType,
                                         kchannelGetEngineType(pKernelChannel),
                                         RC_NOTIFIER_SCOPE_CHANNEL));

        NV_ASSERT_OK(krcErrorSendEventNotifications_HAL(pGpu, pKernelRc,
                                                        pKernelChannel,
                                                        kchannelGetEngineType(pKernelChannel),
                                                        exceptType,
                                                        RC_NOTIFIER_SCOPE_CHANNEL,
                                                        0));
    }
}

/*!
 * Receive Xid notification from GSP-RM
 *
 * Passes Xid errors that are triggered on GSP-RM to nvErrorLog for OS interactions
 * (logging and OS notifications).
 */
static void
_kgspRpcOsErrorLog
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(os_error_log, _v17_00);

    KernelRc      *pKernelRc      = GPU_GET_KERNEL_RC(pGpu);
    KernelChannel *pKernelChannel = NULL;
    KernelFifo    *pKernelFifo    = GPU_GET_KERNEL_FIFO(pGpu);
    CHID_MGR      *pChidMgr;

    if (rpc_params->chid != INVALID_CHID)
    {
        pChidMgr = kfifoGetChidMgr(pGpu, pKernelFifo, rpc_params->runlistId);
        if (pChidMgr != NULL)
        {
            pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
                                                          pChidMgr,
                                                          rpc_params->chid);
        }
    }

    pKernelRc->pPreviousChannelInError = pKernelChannel;
    nvErrorLog_va(pGpu, rpc_params->exceptType, "%s", rpc_params->errString);
    pKernelRc->pPreviousChannelInError = NULL;
}

/*!
 * Receives RPC events containing periodic perfmon utilization samples, passing them
 * to GPUACCT for processing.
 */
static void
_kgspRpcGpuacctPerfmonUtilSamples
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    GpuAccounting *pGpuAcct = SYS_GET_GPUACCT(pSys);
    GPUACCT_GPU_INSTANCE_INFO *pGpuInstanceInfo = &pGpuAcct->gpuInstanceInfo[pGpu->gpuInstance];
    RPC_PARAMS(gpuacct_perfmon_util_samples, _v1F_0E);

    NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS_v1F_0E *src = &rpc_params->params;
    NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS *dest;
    NvU32 i;

    dest = pGpuInstanceInfo->pSamplesParams;
    if (dest == NULL)
    {
        // This RPC event can be received even when the RM hasn't fully started.
        // For instance, CPU RM can take longer than usual to initialize,
        // but the GSP RM sampling timer (a 1 sec interval) is about to tick.
        // In that case, pSamplesParams may not even be allocated by that time.
        // Ignore this RPC event if pSamplesParams has not been allocated yet.
        // See GPUSWSEC-1543 for more info.
        return;
    }

    portMemSet(dest, 0, sizeof(*dest));
    dest->type    = src->type;
    dest->bufSize = src->bufSize;
    dest->count   = src->count;
    dest->tracker = src->tracker;

    for (i = 0; i < NV2080_CTRL_PERF_GPUMON_SAMPLE_COUNT_PERFMON_UTIL_v1F_0E; i++)
    {
        dest->samples[i].base.timeStamp = src->samples[i].timeStamp;

        dest->samples[i].fb.util         = src->samples[i].fb.util;
        dest->samples[i].fb.procId       = src->samples[i].fb.procId;
        dest->samples[i].fb.subProcessID = src->samples[i].fb.subProcessID;

        dest->samples[i].gr.util         = src->samples[i].gr.util;
        dest->samples[i].gr.procId       = src->samples[i].gr.procId;
        dest->samples[i].gr.subProcessID = src->samples[i].gr.subProcessID;

        dest->samples[i].nvenc.util         = src->samples[i].nvenc.util;
        dest->samples[i].nvenc.procId       = src->samples[i].nvenc.procId;
        dest->samples[i].nvenc.subProcessID = src->samples[i].nvenc.subProcessID;

        dest->samples[i].nvdec.util         = src->samples[i].nvdec.util;
        dest->samples[i].nvdec.procId       = src->samples[i].nvdec.procId;
        dest->samples[i].nvdec.subProcessID = src->samples[i].nvdec.subProcessID;
    }

    gpuacctProcessGpuUtil(pGpuInstanceInfo, &dest->samples[0]);
}

/*!
 * Receives RPC events containing current GPU Boost synchronization limits
 * that should be cached and considered in the GPU Boost algorithm and runs
 * the algorithm.
 */
static void
_kgspRpcPerfGpuBoostSyncLimitsCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    KernelPerf *pKernelPerf = GPU_GET_KERNEL_PERF(pGpu);

    RPC_PARAMS(perf_gpu_boost_sync_limits_callback, _v17_00);

    NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS_v17_00 *src = &rpc_params->params;
    NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS         dest;
    NvU32 i;

    dest.flags       = src->flags;
    dest.bBridgeless = src->bBridgeless;

    for (i = 0; i < NV2080_CTRL_INTERNAL_PERF_SYNC_GPU_BOOST_LIMITS_NUM; i++)
    {
        dest.currLimits[i] = src->currLimits[i];
    }

    kperfDoSyncGpuBoostLimits(pGpu, pKernelPerf, &dest);
}

/*!
 * Receives RPC events containing the latest change of bridgeless information
 */
static void
_kgspRpcPerfBridgelessInfoUpdate
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(perf_bridgeless_info_update, _v17_00);

    kPerfGpuBoostSyncBridgelessUpdateInfo(pGpu, rpc_params->bBridgeless);
}

static void
_kgspRpcNvlinkFaultUpCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_fault_up, _v17_00);

    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    knvlinkHandleFaultUpInterrupt_HAL(pGpu, pKernelNvlink, rpc_params->linkId);
}

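//
// NVLink in-band message callbacks: one handler per supported payload size
// (256/512/1024/2048/4096 bytes). Each simply forwards the received data to
// the common in-band message dispatcher.
//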
static void
_kgspRpcNvlinkInbandReceivedData256Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_256, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_256_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

static void
_kgspRpcNvlinkInbandReceivedData512Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_512, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_512_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

static void
_kgspRpcNvlinkInbandReceivedData1024Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_1024, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_1024_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

static void
_kgspRpcNvlinkInbandReceivedData2048Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_2048, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_2048_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

static void
_kgspRpcNvlinkInbandReceivedData4096Callback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_inband_received_data_4096, _v17_00);

    NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_4096_PARAMS_v17_00 *dest = &rpc_params->params;
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);

    NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
}

/*!
 * CPU-RM: Receive GPU Degraded status from GSP
 */
static void
_kgspRpcEventIsGpuDegradedCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(nvlink_is_gpu_degraded, _v17_00);
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
    NV2080_CTRL_NVLINK_IS_GPU_DEGRADED_PARAMS_v17_00 *dest = &rpc_params->params;

    if (dest->bIsGpuDegraded)
    {
        knvlinkSetDegradedMode(pGpu, pKernelNvlink, dest->linkId);
    }
}

static void
_kgspRpcNvlinkFatalErrorRecoveryCallback
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
    NV_ASSERT_OK(knvlinkFatalErrorRecovery(pGpu, pKernelNvlink));
}

/*!
 * Receive MMU fault queue notification from GSP-RM.
 *
 * Non-replayable fault handling is split between GSP-RM and the UVM driver.
 * GSP-RM copies designated faults to the UVM driver's shadow buffer,
 * and sends a notification. CPU-RM, in turn, needs to notify the UVM
 * driver (schedule the UVM ISR to be run).
 */
static NV_STATUS
_kgspRpcMMUFaultQueued(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    osQueueMMUFaultHandler(pGpu);

    return NV_OK;
}

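//
// Simulation-only escape read/write handlers: GSP-RM proxies its simulator
// escape accesses to CPU-RM through these RPC events, with results returned
// via the shared sim access buffer and its sequence counter.
//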
static NV_STATUS
_kgspRpcSimRead
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(sim_read, _v1E_01);
    if (IS_SIMULATION(pGpu))
    {
        const NvU32 count = rpc_params->index + (rpc_params->count / sizeof(NvU32));
        NvU32 i;

        KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);

        NV_ASSERT_OR_RETURN(rpc_params->count <= sizeof(pKernelGsp->pSimAccessBuf->data), NV_ERR_BUFFER_TOO_SMALL);

        for (i = rpc_params->index; i < count; i++)
        {
            NvU32 data;
            gpuSimEscapeRead(pGpu, rpc_params->path, i, 4, &data);
            pKernelGsp->pSimAccessBuf->data[i] = data;
        }

        pKernelGsp->pSimAccessBuf->seq++;
        return NV_OK;
    }

    return NV_ERR_NOT_SUPPORTED;
}

static NV_STATUS
_kgspRpcSimWrite
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(sim_write, _v1E_01);
    if (IS_SIMULATION(pGpu))
    {
        KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);

        gpuSimEscapeWrite(pGpu, rpc_params->path, rpc_params->index, rpc_params->count, rpc_params->data);
        pKernelGsp->pSimAccessBuf->seq++;
        return NV_OK;
    }

    return NV_ERR_NOT_SUPPORTED;
}

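/*!
 * On a semaphore-schedule notification from GSP-RM, release the semaphore and
 * fill the associated notifier on behalf of the client's device.
 */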
static NV_STATUS
_kgspRpcSemaphoreScheduleCallback(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(semaphore_schedule_callback, _v17_00);
    NV_STATUS status;
    RsClient *pClient;
    Device   *pDevice;

    status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
    if (status != NV_OK)
        return status;

    status = deviceGetByHandle(pClient, rpc_params->hEvent, &pDevice);
    if (status != NV_OK)
        return status;

    return dispswReleaseSemaphoreAndNotifierFill(pGpu,
                                                 rpc_params->GPUVA,
                                                 rpc_params->hVASpace,
                                                 rpc_params->ReleaseValue,
                                                 rpc_params->Flags,
                                                 rpc_params->completionStatus,
                                                 pDevice);
}

static NV_STATUS
_kgspRpcTimedSemaphoreRelease(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(timed_semaphore_release, _v01_00);
    NV_STATUS status;
    RsClient *pClient;
    Device   *pDevice;

    status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
    if (status != NV_OK)
        return status;

    status = deviceGetByHandle(pClient, rpc_params->hDevice, &pDevice);
    if (status != NV_OK)
        return status;

    return tsemaRelease_HAL(pGpu,
                            rpc_params->semaphoreVA,
                            rpc_params->notifierVA,
                            rpc_params->hVASpace,
                            rpc_params->releaseValue,
                            rpc_params->completionStatus,
                            pDevice);
}

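/*!
 * Route a libos print buffer received from GSP-RM to the logging facility of
 * the ucode (engine) that produced it. Currently only PMU is registered.
 */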
static NV_STATUS
_kgspRpcUcodeLibosPrint
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(ucode_libos_print, _v1E_08);

    // Check ucodes registered with the libos print mechanism
    switch (rpc_params->ucodeEngDesc)
    {
        case ENG_PMU:
        {
            KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);
            NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelPmu != NULL, NV_ERR_OBJECT_NOT_FOUND);

            kpmuLogBuf(pGpu, pKernelPmu,
                       rpc_params->libosPrintBuf, rpc_params->libosPrintBufSize);

            return NV_OK;
        }
        default:
            NV_ASSERT_FAILED("Attempting to use libos prints with an unsupported ucode!\n");
            return NV_ERR_NOT_SUPPORTED;
    }
}

static NV_STATUS
_kgspRpcVgpuGspPluginTriggered
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);

    if (!IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
        return NV_ERR_NOT_SUPPORTED;

    gpuGspPluginTriggeredEvent(pGpu, rpc_params->gfid, rpc_params->notifyIndex);
    return NV_OK;
}

static NV_STATUS
_kgspRpcGspVgpuConfig
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(vgpu_config_event, _v17_00);

    NV_ASSERT_OR_RETURN(rpc_params->notifyIndex < NVA081_NOTIFIERS_MAXCOUNT,
                        NV_ERR_INVALID_ARGUMENT);

    CliNotifyVgpuConfigEvent(pGpu, rpc_params->notifyIndex);

    return NV_OK;
}

static NV_STATUS
_kgspRpcGspExtdevIntrService
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(extdev_intr_service, _v17_00);

    extdevGsyncService(pGpu, rpc_params->lossRegStatus, rpc_params->gainRegStatus, rpc_params->miscRegStatus, rpc_params->rmStatus);

    return NV_OK;
}

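//
// MIG compute-instance config updates from the GSP vGPU plugin cannot be
// applied directly in the RPC event handler: they need the API and GPU locks.
// The event handler below therefore copies the RPC payload and defers the
// update to a work item, which runs this callback with the proper locks held.
//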
static void
_kgspRpcMigCiConfigUpdateCallback
(
    NvU32 gpuInstance,
    void *pArgs
)
{
    OBJGPU *pGpu = gpumgrGetGpu(gpuInstance);
    KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
    struct MIG_CI_UPDATE_CALLBACK_PARAMS *pParams = (struct MIG_CI_UPDATE_CALLBACK_PARAMS *)pArgs;

    kmigmgrUpdateCiConfigForVgpu(pGpu, pKernelMIGManager,
                                 pParams->execPartCount, pParams->execPartId,
                                 pParams->gfid, pParams->bDelete);

    return;
}

static NV_STATUS
_kgspRpcMigCiConfigUpdate
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    NV_STATUS status;
    struct MIG_CI_UPDATE_CALLBACK_PARAMS *pParams;

    RPC_PARAMS(vgpu_gsp_mig_ci_config, _v21_03);

    NV_ASSERT_OR_RETURN(rpc_params->execPartCount <= NVC637_CTRL_MAX_EXEC_PARTITIONS,
                        NV_ERR_INVALID_ARGUMENT);

    pParams = portMemAllocNonPaged(sizeof(struct MIG_CI_UPDATE_CALLBACK_PARAMS));
    if (pParams == NULL)
    {
        return NV_ERR_NO_MEMORY;
    }

    pParams->execPartCount = rpc_params->execPartCount;
    portMemCopy(pParams->execPartId, (sizeof(NvU32) * rpc_params->execPartCount),
                rpc_params->execPartId, (sizeof(NvU32) * rpc_params->execPartCount));
    pParams->gfid = rpc_params->gfid;
    pParams->bDelete = rpc_params->bDelete;
    status = osQueueWorkItemWithFlags(pGpu,
                                      _kgspRpcMigCiConfigUpdateCallback,
                                      (void *)pParams,
                                      OS_QUEUE_WORKITEM_FLAGS_LOCK_API_RW | OS_QUEUE_WORKITEM_FLAGS_LOCK_GPUS_RW);
    if (status != NV_OK)
    {
        portMemFree(pParams);
    }

    return status;
}

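/*!
 * Forward a batch of GSP trace records to the RATS trace event buffer
 * (compiled in only when KERNEL_GSP_TRACING_RATS_ENABLED is set).
 */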
static void
_kgspRpcGspUpdateTrace
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
#if KERNEL_GSP_TRACING_RATS_ENABLED
    RPC_PARAMS(update_gsp_trace, _v01_00);
    NvU32 i;
    NV_RATS_GSP_TRACE_RECORD *GspTraceRecords = (NV_RATS_GSP_TRACE_RECORD*)(&rpc_params->data);
    for (i = 0; i < rpc_params->records; i++)
    {
        gspTraceEventBufferLogRecord(pGpu, &GspTraceRecords[i]);
    }
#endif
}

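/*!
 * Insert a NOCAT journal record received from GSP-RM into the system-wide
 * journal, and count the insertion for RPC activity reporting.
 */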
static void
_kgspRpcGspPostNocatRecord
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcdb = SYS_GET_RCDB(pSys);
    NOCAT_JOURNAL_PARAMS newEntry;
    const NV2080CtrlNocatJournalInsertRecord *pRecord = NULL;
    RPC_PARAMS(gsp_post_nocat_record, _v01_00);

    // Make a pointer to the record.
    pRecord = (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data;

    portMemSet(&newEntry, 0, sizeof(newEntry));
    newEntry.timestamp       = pRecord->timestamp;
    newEntry.recType         = pRecord->recType;
    newEntry.bugcheck        = pRecord->bugcheck;
    newEntry.pSource         = pRecord->source;
    newEntry.subsystem       = pRecord->subsystem;
    newEntry.errorCode       = pRecord->errorCode;
    newEntry.diagBufferLen   = pRecord->diagBufferLen;
    newEntry.pDiagBuffer     = pRecord->diagBuffer;
    newEntry.pFaultingEngine = pRecord->faultingEngine;
    newEntry.tdrReason       = pRecord->tdrReason;

    (void)rcdbNocatInsertNocatError(pGpu, &newEntry);
    pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_RPC_INSERT_RECORDS_IDX]++;
}

static NV_STATUS
_kgspRpcRgLineIntr
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    RPC_PARAMS(rg_line_intr, _v17_00);

    KernelDisplay *pKernelDisplay = GPU_GET_KERNEL_DISPLAY(pGpu);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelDisplay != NULL, NV_ERR_OBJECT_NOT_FOUND);

    kdispInvokeRgLineCallback(pKernelDisplay, rpc_params->head, rpc_params->rgIntr, NV_FALSE);

    return NV_OK;
}

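/*!
 * Copy a PlatformRequestHandler state-sync payload out of the RPC buffer into
 * a local parameter struct and hand it to pfmreqhndlrStateSync.
 */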
static NV_STATUS
_kgspRpcEventPlatformRequestHandlerStateSyncCallback
(
    OBJGPU* pGpu,
    OBJRPC* pRpc
)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    PlatformRequestHandler* pPlatformRequestHandler
        = SYS_GET_PFM_REQ_HNDLR(pSys);

    RPC_PARAMS(pfm_req_hndlr_state_sync_callback, _v21_04);

    NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS_v21_04 *src = &rpc_params->params;
    NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS         dst = { 0 };

    dst.flags         = src->flags;
    dst.syncData.type = src->syncData.type;

    // Copy in the rpc data
    switch (src->syncData.type)
    {
        case NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_DATA_TYPE_SMBPBI:
        {
            dst.syncData.data.smbpbi.sensorId =
                src->syncData.data.smbpbi.sensorId;
            dst.syncData.data.smbpbi.limit =
                src->syncData.data.smbpbi.limit;
            break;
        }
        default:
        {
            // Nothing for now
            break;
        }
    }

    pfmreqhndlrStateSync(pPlatformRequestHandler, pGpu, &dst);
    return NV_OK;
}

static void
_kgspRpcGspLockdownNotice
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    RPC_PARAMS(gsp_lockdown_notice, _v17_00);

    //
    // While the GSP is in lockdown, we cannot access some of its registers,
    // including interrupt status and control. We shouldn't receive any more
    // SWGEN0 interrupts while the core is in lockdown.
    //
    pKernelGsp->bInLockdown = rpc_params->bLockdownEngaging;

    NV_PRINTF(LEVEL_INFO, "GSP lockdown %s\n",
              pKernelGsp->bInLockdown ? "engaged" : "disengaged");
}

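//
// Map an RPC function/event id to its name for logging. The name table is
// built by including vgpu/rpc_global_enums.h with the X/E macros redefined to
// expand each enum entry to its stringized name, so the table stays in sync
// with the enum automatically.
//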
static
const char *_getRpcName
(
    NvU32 id
)
{
    static const char *rpcName[] =
    {
        #define X(UNIT, a, VAL) #a,
        #define E(a, VAL) #a,
        #undef _RPC_GLOBAL_ENUMS_H_
        #include "vgpu/rpc_global_enums.h"
        #undef X
        #undef E
    };

    if (id < NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS)
    {
        return rpcName[id];
    }
    else if ((id > NV_VGPU_MSG_EVENT_FIRST_EVENT) && (id < NV_VGPU_MSG_EVENT_NUM_EVENTS))
    {
        NvU32 index = id - (NV_VGPU_MSG_EVENT_FIRST_EVENT - NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS) + 1;
        return rpcName[index];
    }

    return "Unknown";
}

/*!
 * GSP client process RPC events
 */
static void
_kgspProcessRpcEvent
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    KernelGspRpcEventHandlerContext rpcHandlerContext
)
{
    rpc_message_header_v *pMsgHdr = RPC_HDR;
    NV_STATUS nvStatus = NV_OK;
    NvU32 event = pMsgHdr->function;

    NV_PRINTF(LEVEL_INFO, "received event from GPU%d: 0x%x (%s) status: 0x%x size: %d\n",
              gpuGetInstance(pGpu), event, _getRpcName(event), pMsgHdr->rpc_result, pMsgHdr->length);

    _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcEventHistory, &pRpc->rpcEventHistoryCurrent);

    /*
     * Shortlist of RPCs that are called during GSP bootup and that have been
     * manually screened to be safe to process without the API lock.
     */
    if ((rpcHandlerContext == KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP) &&
        (!rmapiLockIsOwner()))
    {
        switch (pMsgHdr->function)
        {
            case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
            case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
            case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
            case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
            case NV_VGPU_MSG_EVENT_GSP_INIT_DONE:
            case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
                break;
            default:
                NV_PRINTF(LEVEL_ERROR, "Attempted to process RPC event from GPU%d: 0x%x (%s) during bootup without API lock\n",
                          gpuGetInstance(pGpu), event, _getRpcName(event));
                NV_ASSERT(0);
                goto done;
        }
    }

    switch (event)
    {
        case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
            nvStatus = _kgspRpcRunCpuSequencer(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_POST_EVENT:
            nvStatus = _kgspRpcPostEvent(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
            nvStatus = _kgspRpcRCTriggered(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_MMU_FAULT_QUEUED:
            nvStatus = _kgspRpcMMUFaultQueued(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_SIM_READ:
            nvStatus = _kgspRpcSimRead(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_SIM_WRITE:
            nvStatus = _kgspRpcSimWrite(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
            _kgspRpcOsErrorLog(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GPUACCT_PERFMON_UTIL_SAMPLES:
            _kgspRpcGpuacctPerfmonUtilSamples(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_PERF_GPU_BOOST_SYNC_LIMITS_CALLBACK:
            _kgspRpcPerfGpuBoostSyncLimitsCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_PERF_BRIDGELESS_INFO_UPDATE:
            _kgspRpcPerfBridgelessInfoUpdate(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_SEMAPHORE_SCHEDULE_CALLBACK:
            _kgspRpcSemaphoreScheduleCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_TIMED_SEMAPHORE_RELEASE:
            _kgspRpcTimedSemaphoreRelease(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_FAULT_UP:
            _kgspRpcNvlinkFaultUpCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_256:
            _kgspRpcNvlinkInbandReceivedData256Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_512:
            _kgspRpcNvlinkInbandReceivedData512Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_1024:
            _kgspRpcNvlinkInbandReceivedData1024Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_2048:
            _kgspRpcNvlinkInbandReceivedData2048Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_4096:
            _kgspRpcNvlinkInbandReceivedData4096Callback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_FATAL_ERROR_RECOVERY:
            _kgspRpcNvlinkFatalErrorRecoveryCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_NVLINK_IS_GPU_DEGRADED:
            _kgspRpcEventIsGpuDegradedCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_RG_LINE_INTR:
            _kgspRpcRgLineIntr(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
            nvStatus = _kgspRpcUcodeLibosPrint(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
            nvStatus = _kgspRpcVgpuGspPluginTriggered(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_VGPU_CONFIG:
            nvStatus = _kgspRpcGspVgpuConfig(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_EXTDEV_INTR_SERVICE:
            nvStatus = _kgspRpcGspExtdevIntrService(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_PFM_REQ_HNDLR_STATE_SYNC_CALLBACK:
            nvStatus = _kgspRpcEventPlatformRequestHandlerStateSyncCallback(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_MIG_CI_CONFIG_UPDATE:
            nvStatus = _kgspRpcMigCiConfigUpdate(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
            _kgspRpcGspLockdownNotice(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_UPDATE_GSP_TRACE:
            _kgspRpcGspUpdateTrace(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
            _kgspRpcGspPostNocatRecord(pGpu, pRpc);
            break;

        case NV_VGPU_MSG_EVENT_GSP_INIT_DONE: // Handled by _kgspRpcRecvPoll.
        default:
            //
            // Log, but otherwise ignore unexpected events.
            //
            // We will get here if the previous RPC timed out. The response
            // eventually comes in as an unexpected event. The error handling
            // for the timeout should have already happened.
            //
            NV_PRINTF(LEVEL_ERROR, "Unexpected RPC event from GPU%d: 0x%x (%s)\n",
                      gpuGetInstance(pGpu), event, _getRpcName(event));
            break;
    }

    if (nvStatus != NV_OK)
    {
        //
        // Failing to properly handle a specific event does not mean we should stop
        // processing events/RPCs, so print the error and soldier on.
        //
        NV_PRINTF(LEVEL_ERROR,
                  "Failed to process received event 0x%x (%s) from GPU%d: status=0x%x\n",
                  event, _getRpcName(event), gpuGetInstance(pGpu), nvStatus);
    }

done:
    _kgspCompleteRpcHistoryEntry(pRpc->rpcEventHistory, pRpc->rpcEventHistoryCurrent);
}

/*!
 * Handle a single RPC event from GSP unless the event is [an RPC return for] expectedFunc,
 * or there are no events available in the buffer.
 *
 * @return
 *   NV_OK                            if the event is successfully handled.
 *   NV_WARN_NOTHING_TO_DO            if there are no events available.
 *   NV_WARN_MORE_PROCESSING_REQUIRED if the event is expectedFunc: it is unhandled and in the staging area.
 *   (Another status)                 if event reading fails.
 */
static NV_STATUS
_kgspRpcDrainOneEvent
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    NvU32 expectedFunc,
    KernelGspRpcEventHandlerContext rpcHandlerContext
)
{
    NV_STATUS nvStatus;

    // Issue a memory barrier to ensure we see any queue updates.
    // Note: Without the fence, the CPU may get stuck in an infinite loop
    // waiting for a message that has already arrived.
    portAtomicMemoryFenceFull();

    nvStatus = GspMsgQueueReceiveStatus(pRpc->pMessageQueueInfo, pGpu);

    if (nvStatus == NV_OK)
    {
        rpc_message_header_v *pMsgHdr = RPC_HDR;

        if (pMsgHdr->function == expectedFunc)
            return NV_WARN_MORE_PROCESSING_REQUIRED;

        _kgspProcessRpcEvent(pGpu, pRpc, rpcHandlerContext);
    }

    //
    // We don't expect NV_WARN_MORE_PROCESSING_REQUIRED here.
    // If we get it we need to suppress it to avoid confusing our caller, for whom it has special meaning.
    //
    NV_ASSERT_OR_ELSE(nvStatus != NV_WARN_MORE_PROCESSING_REQUIRED,
                      nvStatus = NV_ERR_GENERIC);

    return nvStatus;
}

/*!
 * Handle RPC events from GSP until the event is [an RPC return for] expectedFunc,
 * or there are no events available in the buffer.
 *
 * Also dump GSP logs, and check for severe errors coming from GSP.
 *
 * @return
 *   NV_OK                            if one or more events are handled and there are none left.
 *   NV_WARN_MORE_PROCESSING_REQUIRED if an expectedFunc event is found: it is unhandled and in the
 *                                    staging area. (Zero or more preceding events were successfully
 *                                    handled.)
 *   (Another status)                 if event reading or processing fails.
 */
static NV_STATUS
_kgspRpcDrainEvents
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 expectedFunc,
    KernelGspRpcEventHandlerContext rpcHandlerContext
)
{
    NV_STATUS nvStatus = NV_OK;
    OBJRPC *pRpc = GPU_GET_RPC(pGpu);

    while (nvStatus == NV_OK)
    {
        nvStatus = _kgspRpcDrainOneEvent(pGpu, pRpc, expectedFunc, rpcHandlerContext);
        kgspDumpGspLogs(pKernelGsp, NV_FALSE);
    }

    // If GSP-RM has died, the GPU will need to be reset
    if (!kgspHealthCheck_HAL(pGpu, pKernelGsp))
        return NV_ERR_RESET_REQUIRED;

    if (nvStatus == NV_WARN_NOTHING_TO_DO)
        nvStatus = NV_OK;

    return nvStatus;
}

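/*!
 * Convert a raw timestamp delta to a human-readable duration, scaling the
 * units to keep the value small: microseconds up to 999999us, then
 * milliseconds up to 9999ms, then seconds. Returns the scaled value and sets
 * *pDurationUnitsChar to 'u', 'm', or ' ' so the caller can always append 's'
 * to form "us", "ms", or " s".
 */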
static NvU64
_tsDiffToDuration
(
    NvU64 duration,
    char *pDurationUnitsChar
)
{
    const NvU64 tsFreqUs = osGetTimestampFreq() / 1000000;

    *pDurationUnitsChar = 'u';

    NV_ASSERT_OR_RETURN(tsFreqUs > 0, 0);

    duration /= tsFreqUs;

    // 999999us then 1000ms
    if (duration >= 1000000)
    {
        duration /= 1000;
        *pDurationUnitsChar = 'm';
    }

    // 9999ms then 10s
    if (duration >= 10000)
    {
        duration /= 1000;
        *pDurationUnitsChar = ' '; // so caller can always just append 's'
    }

    return duration;
}

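/*!
 * Check whether the given timestamp falls within any RPC recorded in the
 * history buffer. With bCheckIncompleteRpcsOnly set, only RPCs that are still
 * outstanding (no end timestamp) are considered.
 */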
static NvBool
_kgspIsTimestampDuringRecentRpc
(
    OBJRPC *pRpc,
    NvU64 timestamp,
    NvBool bCheckIncompleteRpcsOnly
)
{
    NvU32 historyIndex;
    NvU32 historyEntry;

    for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
    {
        historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
        if (pRpc->rpcHistory[historyEntry].function != 0)
        {
            if ((timestamp >= pRpc->rpcHistory[historyEntry].ts_start) &&
                ((pRpc->rpcHistory[historyEntry].ts_end == 0) ||
                 (!bCheckIncompleteRpcsOnly && (timestamp <= pRpc->rpcHistory[historyEntry].ts_end))))
            {
                return NV_TRUE;
            }
        }
    }

    return NV_FALSE;
}

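/*!
 * Log one RPC history entry as a row of the Xid history table, including the
 * computed duration when the entry has both start and end timestamps.
 */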
static void
_kgspLogRpcHistoryEntry
(
    OBJGPU *pGpu,
    NvU32 errorNum,
    NvU32 historyIndex,
    RpcHistoryEntry *pEntry,
    NvBool lastColumnCondition
)
{
    NvU64 duration;
    char durationUnitsChar;

    if (pEntry->function != 0)
    {
        duration = (pEntry->ts_end > pEntry->ts_start) ? (pEntry->ts_end - pEntry->ts_start) : 0;
        if (duration)
        {
            duration = _tsDiffToDuration(duration, &durationUnitsChar);

            NV_ERROR_LOG_DATA(pGpu, errorNum,
                              "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx %6llu%cs %c\n",
                              ((historyIndex == 0) ? ' ' : '-'),
                              historyIndex,
                              pEntry->function,
                              _getRpcName(pEntry->function),
                              pEntry->data[0],
                              pEntry->data[1],
                              pEntry->ts_start,
                              pEntry->ts_end,
                              duration, durationUnitsChar,
                              (lastColumnCondition ? 'y' : ' '));
        }
        else
        {
            NV_ERROR_LOG_DATA(pGpu, errorNum,
                              "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx          %c\n",
                              ((historyIndex == 0) ? ' ' : '-'),
                              historyIndex,
                              pEntry->function,
                              _getRpcName(pEntry->function),
                              pEntry->data[0],
                              pEntry->data[1],
                              pEntry->ts_start,
                              pEntry->ts_end,
                              (lastColumnCondition ? 'y' : ' '));
        }
    }
}

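/*!
 * Dump the current RPC buffer contents plus both RPC histories (CPU -> GSP
 * requests and CPU <- GSP events) to the error log, marking events that
 * landed during a still-incomplete RPC.
 */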
1754 void
kgspLogRpcDebugInfo(OBJGPU * pGpu,OBJRPC * pRpc,NvU32 errorNum,NvBool bPollingForRpcResponse)1755 kgspLogRpcDebugInfo
1756 (
1757 OBJGPU *pGpu,
1758 OBJRPC *pRpc,
1759 NvU32 errorNum,
1760 NvBool bPollingForRpcResponse
1761 )
1762 {
1763 const rpc_message_header_v *pMsgHdr = RPC_HDR;
1764 NvU32 historyIndex;
1765 NvU32 historyEntry;
1766 NvU64 activeData[2];
1767
1768 _kgspGetActiveRpcDebugData(pRpc, pMsgHdr->function,
1769 &activeData[0], &activeData[1]);
1770 NV_ERROR_LOG_DATA(pGpu, errorNum,
1771 "GPU%d GSP RPC buffer contains function %d (%s) and data 0x%016llx 0x%016llx.\n",
1772 gpuGetInstance(pGpu),
1773 pMsgHdr->function, _getRpcName(pMsgHdr->function),
1774 activeData[0], activeData[1]);
1775
1776 NV_ERROR_LOG_DATA(pGpu, errorNum,
1777 "GPU%d RPC history (CPU -> GSP):\n",
1778 gpuGetInstance(pGpu));
1779 NV_ERROR_LOG_DATA(pGpu, errorNum,
1780 " entry function data0 data1 ts_start ts_end duration actively_polling\n");
1781 for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1782 {
1783 historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1784 _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcHistory[historyEntry],
1785 ((historyIndex == 0) && bPollingForRpcResponse));
1786 }
1787
1788 NV_ERROR_LOG_DATA(pGpu, errorNum,
1789 "GPU%d RPC event history (CPU <- GSP):\n",
1790 gpuGetInstance(pGpu));
1791 NV_ERROR_LOG_DATA(pGpu, errorNum,
1792 " entry function data0 data1 ts_start ts_end duration during_incomplete_rpc\n");
1793 for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1794 {
1795 historyEntry = (pRpc->rpcEventHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1796 _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcEventHistory[historyEntry],
1797 _kgspIsTimestampDuringRecentRpc(pRpc,
1798 pRpc->rpcEventHistory[historyEntry].ts_start,
1799 NV_TRUE/*bCheckIncompleteRpcsOnly*/));
1800 }
1801 }

/*!
 * Log Xid 119 - GSP RPC Timeout
 */
static void
_kgspLogXid119
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    NvU32 expectedFunc
)
{
    RpcHistoryEntry *pHistoryEntry = &pRpc->rpcHistory[pRpc->rpcHistoryCurrent];
    NvU64 ts_end = osGetTimestamp();
    NvU64 duration;
    char durationUnitsChar;

    if (pRpc->timeoutCount == 1)
    {
        NV_PRINTF(LEVEL_ERROR,
                  "********************************* GSP Timeout **********************************\n");
        NV_PRINTF(LEVEL_ERROR,
                  "Note: Please also check logs above.\n");
    }

    NV_ASSERT(expectedFunc == pHistoryEntry->function);

    NV_ASSERT(ts_end > pHistoryEntry->ts_start);
    duration = _tsDiffToDuration(ts_end - pHistoryEntry->ts_start, &durationUnitsChar);

    NV_ERROR_LOG(pGpu, GSP_RPC_TIMEOUT,
                 "Timeout after %llus of waiting for RPC response from GPU%d GSP! Expected function %d (%s) (0x%llx 0x%llx).",
                 (durationUnitsChar == 'm' ? duration / 1000 : duration),
                 gpuGetInstance(pGpu),
                 expectedFunc,
                 _getRpcName(expectedFunc),
                 pHistoryEntry->data[0],
                 pHistoryEntry->data[1]);

    if (pRpc->timeoutCount == 1)
    {
        kgspLogRpcDebugInfo(pGpu, pRpc, GSP_RPC_TIMEOUT, NV_TRUE/*bPollingForRpcResponse*/);

        osAssertFailed();

        NV_PRINTF(LEVEL_ERROR,
                  "********************************************************************************\n");
    }
}

static void
_kgspRpcIncrementTimeoutCountAndRateLimitPrints
(
    OBJGPU *pGpu,
    OBJRPC *pRpc
)
{
    pRpc->timeoutCount++;

    if ((pRpc->timeoutCount == (RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH + 1)) &&
        (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP > 0))
    {
        // make sure we warn Xid and NV_PRINTF/NVLOG consumers that we are rate limiting prints
        if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)
        {
            portDbgPrintf(
                "NVRM: Rate limiting GSP RPC error prints for GPU at PCI:%04x:%02x:%02x (printing 1 of every %d). The GPU likely needs to be reset.\n",
                gpuGetDomain(pGpu),
                gpuGetBus(pGpu),
                gpuGetDevice(pGpu),
                RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
        }
        NV_PRINTF(LEVEL_WARNING,
                  "Rate limiting GSP RPC error prints (printing 1 of every %d)\n",
                  RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
    }

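    //
    // Once the timeout count passes the threshold, suppress prints except for
    // every (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1)-th occurrence; e.g. with a
    // skip of 9, only one in ten repeated timeouts is logged.
    //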
    pRpc->bQuietPrints = ((pRpc->timeoutCount > RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH) &&
                          ((pRpc->timeoutCount % (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1)) != 0));
}

/*!
 * GSP client RM RPC poll routine
 */
static NV_STATUS
_kgspRpcRecvPoll
(
    OBJGPU *pGpu,
    OBJRPC *pRpc,
    NvU32 expectedFunc
)
{
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    NV_STATUS rpcStatus = NV_OK;
    NV_STATUS timeoutStatus = NV_OK;
    RMTIMEOUT timeout;
    NvU32 timeoutUs;
    NvU32 timeoutFlags;
    NvBool bSlowGspRpc = IS_EMULATION(pGpu) || IS_SIMULATION(pGpu);
    NvU32 gpuMaskUnused;

    KernelGspRpcEventHandlerContext rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL;
    if (expectedFunc == NV_VGPU_MSG_EVENT_GSP_INIT_DONE)
    {
        // special case for bootup path without API lock
        rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP;
    }
    //
    // We do not allow recursive polling. This can happen if e.g.
    // 1. CPU-RM issues RPC-A to GSP and polls waiting for it to finish
    // 2. While servicing RPC-A, GSP emits an async event back to CPU-RM
    // 3. CPU-RM services the async event and sends another synchronous RPC-B
    // 4. RPC-A response will come first, but CPU-RM is now waiting on RPC-B
    //
    // We don't have a good way to handle this and should just be deferring the
    // second RPC until the first one is done, via e.g. osQueueWorkItem().
    // This assert is meant to catch and loudly fail such cases.
    //
    NV_ASSERT_OR_RETURN(!pKernelGsp->bPollingForRpcResponse, NV_ERR_INVALID_STATE);
    pKernelGsp->bPollingForRpcResponse = NV_TRUE;

    //
    // GSP-RM init in emulation/simulation environment is extremely slow,
    // so need to increment timeout.
    // Apply the timeout extension to other RPCs as well, mostly so that
    // we'll reset the thread state after each RPC, not just while waiting
    // for the INIT_DONE event.
    //
    if (bSlowGspRpc)
    {
        NvU32 timeoutResult;

        // On slow Apollo emulators, GSP-RM init could take more than an hour
        NV_ASSERT(portSafeMulU32(GSP_SCALE_TIMEOUT_EMU_SIM, 1500000, &timeoutResult));
        timeoutUs = timeoutResult;
    }
    else
    {
        NvU32 defaultus = pGpu->timeoutData.defaultus;

        if (IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
        {
            // Ensure at least 3.1s for vGPU-GSP before adding leeway (Bug 3928607)
            timeoutUs = NV_MAX(3100 * 1000, defaultus) + (defaultus / 2);
        }
        else
        {
            //
            // We should only ever timeout this when GSP is in really bad state, so if it just
            // happens to timeout on default timeout it should be OK for us to give it a little
            // more time - make this timeout 1.5 of the default to allow some leeway.
            //
            timeoutUs = defaultus + defaultus / 2;
        }
    }

    NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));

    timeoutFlags = GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE;
    if (pRpc->bQuietPrints)
        timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_JOURNAL_LOG;

    gpuSetTimeout(pGpu, timeoutUs, &timeout, timeoutFlags);

    for (;;)
    {
        //
        // Check for GPU timeout, save that information, and then verify if the RPC is completed.
        // Otherwise if the CPU thread goes to sleep immediately after the RPC check, it may result in hitting a timeout.
        //
        timeoutStatus = gpuCheckTimeout(pGpu, &timeout);

        rpcStatus = _kgspRpcDrainEvents(pGpu, pKernelGsp, expectedFunc, rpcHandlerContext);

        switch (rpcStatus) {
            case NV_WARN_MORE_PROCESSING_REQUIRED:
                // The synchronous RPC response we were waiting for is here
                _kgspCompleteRpcHistoryEntry(pRpc->rpcHistory, pRpc->rpcHistoryCurrent);
                rpcStatus = NV_OK;
                pRpc->timeoutCount = 0;
                goto done;
            case NV_OK:
                // Check timeout and continue outer loop.
                break;
            default:
                goto done;
        }

        NV_CHECK_OK_OR_GOTO(rpcStatus, LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc), done);

        if (timeoutStatus == NV_ERR_TIMEOUT)
        {
            rpcStatus = timeoutStatus;

            _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);

            if (!pRpc->bQuietPrints)
            {
                _kgspLogXid119(pGpu, pRpc, expectedFunc);
            }

            goto done;
        }
        else if (timeoutStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "gpuCheckTimeout() returned unexpected error (0x%08x)\n",
                      timeoutStatus);
            rpcStatus = timeoutStatus;
            goto done;
        }

        osSpinLoop();
    }

done:
    pKernelGsp->bPollingForRpcResponse = NV_FALSE;

    if (bSlowGspRpc)
    {
        // Avoid cumulative timeout due to slow RPC
        threadStateResetTimeout(pGpu);
    }

    return rpcStatus;
}

/*!
 * Initialize RPC objects required for interfacing with GSP.
 */
static NV_STATUS
_kgspInitRpcInfrastructure
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NV_STATUS nvStatus = NV_OK;
    MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;

    nvStatus = GspMsgQueuesInit(pGpu, &pMQCollection);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "GspMsgQueuesInit failed\n");
        goto done;
    }

    pKernelGsp->pMQCollection = pMQCollection;

    // Init RM RPC object
    nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
                                       &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX],
                                       &pKernelGsp->pRpc);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "init task RM RPC infrastructure failed\n");
        goto done;
    }

    // Init task_isr RPC object
    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
                                           &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX],
                                           &pKernelGsp->pLocklessRpc);
        if (nvStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "init task ISR RPC infrastructure failed\n");
            goto done;
        }
    }

done:
    if (nvStatus != NV_OK)
    {
        _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
    }

    return nvStatus;
}


/*!
 * Initialize stripped down version of RPC infra init for GSP clients.
 */
static NV_STATUS
_kgspConstructRpcObject
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    MESSAGE_QUEUE_INFO *pMQI,
    OBJRPC **ppRpc
)
{
    OBJRPC *pRpc;

    NV_ASSERT_OR_RETURN(pMQI != NULL, NV_ERR_INVALID_ARGUMENT);

    pRpc = initRpcObject(pGpu);
    if (pRpc == NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "initRpcObject failed\n");
        return NV_ERR_INSUFFICIENT_RESOURCES;
    }

    pRpc->pMessageQueueInfo = pMQI;

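    //
    // Start both history cursors at the last slot, presumably so that the
    // first recorded entry lands in slot 0 once the cursor is advanced for
    // the first write.
    //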
    portMemSet(&pRpc->rpcHistory, 0, sizeof(pRpc->rpcHistory));
    pRpc->rpcHistoryCurrent = RPC_HISTORY_DEPTH - 1;
    portMemSet(&pRpc->rpcEventHistory, 0, sizeof(pRpc->rpcEventHistory));
    pRpc->rpcEventHistoryCurrent = RPC_HISTORY_DEPTH - 1;

    pRpc->message_buffer = (NvU32 *)pRpc->pMessageQueueInfo->pRpcMsgBuf;
    pRpc->maxRpcSize = GSP_MSG_QUEUE_RPC_SIZE_MAX;

    rpcSendMessage_FNPTR(pRpc) = _kgspRpcSendMessage;
    rpcRecvPoll_FNPTR(pRpc) = _kgspRpcRecvPoll;

    *ppRpc = pRpc;

    return NV_OK;
}

static void
_kgspFreeRpcInfrastructure
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    if (pKernelGsp->pRpc != NULL)
    {
        rpcDestroy(pGpu, pKernelGsp->pRpc);
        portMemFree(pKernelGsp->pRpc);
        pKernelGsp->pRpc = NULL;
    }
    if (pKernelGsp->pLocklessRpc != NULL)
    {
        rpcDestroy(pGpu, pKernelGsp->pLocklessRpc);
        portMemFree(pKernelGsp->pLocklessRpc);
        pKernelGsp->pLocklessRpc = NULL;
    }
    GspMsgQueuesCleanup(&pKernelGsp->pMQCollection);
}

/*!
 * Convert init arg name to 64-bit id value.
 *
 * @param[in] name String representing name of init arg
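 *
 * Example: "LOGINIT" packs big-endian to 0x4C4F47494E4954.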
 */
static NvU64
_kgspGenerateInitArgId(const char *name)
{
    NvU64 id = 0;
    NvU8 c;
    NvU32 i;

    // Convert at most 8 characters from name into id.
    for (i = 0; i < (sizeof(NvU64) / sizeof(NvU8)); ++i)
    {
        c = (NvU8)*name++;
        if (c == '\0')
        {
            break;
        }
        id = (id << 8) | c;
    }

    return id;
}

static void
_kgspUnmapTaskLogBuf(OBJGPU *pGpu, RM_LIBOS_LOG_MEM *pLog)
{
    // release log memory for this task.
    if (pLog->pTaskLogBuffer != NULL)
    {
        memdescUnmapInternal(pGpu, pLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
        pLog->pTaskLogBuffer = NULL;
    }

    if (pLog->pTaskLogDescriptor != NULL)
    {
        memdescFree(pLog->pTaskLogDescriptor);
        memdescDestroy(pLog->pTaskLogDescriptor);
        pLog->pTaskLogDescriptor = NULL;
    }
}

/*!
 * Free vgpu partition LIBOS task logging structures
 */
static void
_kgspFreeLibosVgpuPartitionLoggingStructures
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 gfid
)
{
    RM_LIBOS_LOG_MEM *vgpuLogBuffers[] =
    {
        pKernelGsp->gspPluginInitTaskLogMem,
        pKernelGsp->gspPluginVgpuTaskLogMem
    };

    libosLogDestroy(&pKernelGsp->logDecodeVgpuPartition[gfid - 1]);

    // release all the vgpu tasks' log buffer memory
    for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(vgpuLogBuffers); ++i)
    {
        RM_LIBOS_LOG_MEM *pTaskLog = &vgpuLogBuffers[i][gfid - 1];
        _kgspUnmapTaskLogBuf(pGpu, pTaskLog);
    }
}

/*!
 * Free vgpu partition LIBOS task logging structures
 */
NV_STATUS
kgspFreeVgpuPartitionLogging_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 gfid
)
{
    if (gfid > MAX_PARTITIONS_WITH_GFID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    else
    {
        // Make sure there is no lingering debug output.
        kgspDumpGspLogs(pKernelGsp, NV_FALSE);

        _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);
        return NV_OK;
    }
}

/*!
 * Initialize vgpu partition LIBOS task logging structures
 */
NV_STATUS
kgspInitVgpuPartitionLogging_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 gfid,
    NvU64 initTaskLogBuffOffset,
    NvU64 initTaskLogBuffSize,
    NvU64 vgpuTaskLogBuffOffset,
    NvU64 vgpuTaskLogBuffSize
)
{
    struct
    {
        const char *szMemoryId;
        const char *szPrefix;
        const char *elfSectionName;
        NvU64 bufOffset;
        NvU64 bufSize;
        RM_LIBOS_LOG_MEM *taskLogArr;
    } logInitValues[] =
    {
        {"LOGINIT", "INIT", ".fwlogging_init", initTaskLogBuffOffset, initTaskLogBuffSize, pKernelGsp->gspPluginInitTaskLogMem},
        {"LOGVGPU", "VGPU", ".fwlogging_vgpu", vgpuTaskLogBuffOffset, vgpuTaskLogBuffSize, pKernelGsp->gspPluginVgpuTaskLogMem}
    };
    ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS);

    NV_STATUS nvStatus = NV_OK;
    RM_LIBOS_LOG_MEM *pTaskLog = NULL;
    char vm_string[8], sourceName[SOURCE_NAME_MAX_LENGTH];

    if (gfid > MAX_PARTITIONS_WITH_GFID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (pKernelGsp->pNvlogFlushMtx != NULL)
        portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx);

    // Source name is used to generate a tag that is a unique identifier for nvlog buffers.
    // As the source name 'GSP' is already in use, we will need a custom source name.
    nvDbgSnprintf(sourceName, SOURCE_NAME_MAX_LENGTH, "V%02d", gfid);
    libosLogCreateEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], sourceName);

    // Setup logging for each task in vgpu partition
    for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(logInitValues); ++i)
    {
        pTaskLog = &logInitValues[i].taskLogArr[gfid - 1];
        NvP64 pVa = NvP64_NULL;

        NV_ASSERT_OK_OR_GOTO(nvStatus,
            memdescCreate(&pTaskLog->pTaskLogDescriptor,
                          pGpu,
                          logInitValues[i].bufSize,
                          RM_PAGE_SIZE,
                          NV_TRUE, ADDR_FBMEM, NV_MEMORY_CACHED,
                          MEMDESC_FLAGS_NONE),
            error_cleanup);

        memdescDescribe(pTaskLog->pTaskLogDescriptor, ADDR_FBMEM, logInitValues[i].bufOffset, logInitValues[i].bufSize);

        pVa = memdescMapInternal(pGpu, pTaskLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
        if (pVa != NvP64_NULL)
        {
            pTaskLog->pTaskLogBuffer = pVa;
            portMemSet(pTaskLog->pTaskLogBuffer, 0, logInitValues[i].bufSize);

            pTaskLog->id8 = _kgspGenerateInitArgId(logInitValues[i].szMemoryId);

            nvDbgSnprintf(vm_string, sizeof(vm_string), "%s%d", logInitValues[i].szPrefix, gfid);

            libosLogAddLogEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1],
                             pTaskLog->pTaskLogBuffer,
                             memdescGetSize(pTaskLog->pTaskLogDescriptor),
                             pGpu->gpuInstance,
                             (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT),
                             gpuGetChipImpl(pGpu),
                             vm_string,
                             logInitValues[i].elfSectionName);
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "Failed to map memory for %s task log buffer for vGPU partition\n", logInitValues[i].szPrefix);
            nvStatus = NV_ERR_INSUFFICIENT_RESOURCES;
            goto error_cleanup;
        }
    }

    {
        libosLogInit(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], pKernelGsp->pLogElf, pKernelGsp->logElfDataSize);
        // nvlog buffers are now setup using the appropriate sourceName to avoid tag-value clash.
        // Now sourceName can be modified to preserve the 'GSP-VGPUx' logging convention.
        portStringCopy(pKernelGsp->logDecodeVgpuPartition[gfid - 1].sourceName,
                       SOURCE_NAME_MAX_LENGTH,
                       "GSP", SOURCE_NAME_MAX_LENGTH);
    }

    pKernelGsp->bHasVgpuLogs = NV_TRUE;

error_cleanup:
    if (pKernelGsp->pNvlogFlushMtx != NULL)
        portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx);

    if (nvStatus != NV_OK)
        _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);

    return nvStatus;
}

void kgspNvlogFlushCb(void *pKernelGsp)
{
    if (pKernelGsp != NULL)
        kgspDumpGspLogs((KernelGsp*)pKernelGsp, NV_TRUE);
}

/*!
 * Free LIBOS task logging structures
 */
static void
_kgspFreeLibosLoggingStructures
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NvU8 idx;

    _kgspStopLogPolling(pGpu, pKernelGsp);

    // Make sure there is no lingering debug output.
    kgspDumpGspLogs(pKernelGsp, NV_FALSE);

    if (pKernelGsp->pLogElf == NULL)
        nvlogDeregisterFlushCb(kgspNvlogFlushCb, pKernelGsp);

    if (pKernelGsp->pNvlogFlushMtx != NULL)
    {
        portSyncMutexDestroy(pKernelGsp->pNvlogFlushMtx);
        pKernelGsp->pNvlogFlushMtx = NULL;
    }

    libosLogDestroy(&pKernelGsp->logDecode);

    for (idx = 0; idx < LOGIDX_SIZE; idx++)
    {
        RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx];

        // release log memory for each task.
        if (pLog->pTaskLogBuffer != NULL)
        {
            memdescUnmap(pLog->pTaskLogDescriptor,
                         NV_TRUE, osGetCurrentProcess(),
                         (void *)pLog->pTaskLogBuffer,
                         pLog->pTaskLogMappingPriv);
            pLog->pTaskLogBuffer = NULL;
            pLog->pTaskLogMappingPriv = NULL;
        }

        if (pLog->pTaskLogDescriptor != NULL)
        {
            memdescFree(pLog->pTaskLogDescriptor);
            memdescDestroy(pLog->pTaskLogDescriptor);
            pLog->pTaskLogDescriptor = NULL;
        }
    }

    portMemFree(pKernelGsp->pLogElf);
    pKernelGsp->pLogElf = NULL;
}

/*!
 * Initialize LIBOS task logging structures
 */
static NV_STATUS
_kgspInitLibosLoggingStructures
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    static const struct
    {
        const char *szMemoryId;
        const char *szPrefix;
        NvU32 size;
        const char *elfSectionName;
    } logInitValues[] =
    {
        {"LOGINIT", "INIT", 0x10000, ".fwlogging_init"},  // 64KB for stack traces
#if defined(DEVELOP) || defined(DEBUG)
        // The interrupt task is in the rm elf, so they share the same logging elf too
        {"LOGINTR", "INTR", 0x40000, ".fwlogging_rm"},    // 256KB ISR debug log on develop/debug builds
        {"LOGRM",   "RM",   0x40000, ".fwlogging_rm"}     // 256KB RM debug log on develop/debug builds
#else
        // The interrupt task is in the rm elf, so they share the same logging elf too
        {"LOGINTR", "INTR", 0x10000, ".fwlogging_rm"},    // 64KB ISR debug log on release builds
        {"LOGRM",   "RM",   0x10000, ".fwlogging_rm"}     // 64KB RM debug log on release builds
#endif
    };
    ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS);
    ct_assert(NV_ARRAY_ELEMENTS(logInitValues) == LOGIDX_SIZE);

    NV_STATUS nvStatus = NV_OK;
    NvU8 idx;
    NvU64 flags = MEMDESC_FLAGS_NONE;

    // Needed only on Unix where NV_ESC_RM_LOCKLESS_DIAGNOSTIC is supported
    if (RMCFG_FEATURE_PLATFORM_UNIX)
    {
        pKernelGsp->pNvlogFlushMtx = portSyncMutexCreate(portMemAllocatorGetGlobalNonPaged());
        if (pKernelGsp->pNvlogFlushMtx == NULL)
        {
            nvStatus = NV_ERR_INSUFFICIENT_RESOURCES;
            goto error_cleanup;
        }
    }

    libosLogCreate(&pKernelGsp->logDecode);

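    //
    // GSP writes these log buffers, so on systems with protected memory (e.g.
    // SEV or Confidential Compute) they must be placed in unprotected sysmem,
    // as with the notify-op shared surface below.
    //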
    flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;

    for (idx = 0; idx < LOGIDX_SIZE; idx++)
    {
        RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx];
        NvP64 pVa = NvP64_NULL;
        NvP64 pPriv = NvP64_NULL;

        // Setup logging memory for each task.
        NV_ASSERT_OK_OR_GOTO(nvStatus,
            memdescCreate(&pLog->pTaskLogDescriptor,
                          pGpu,
                          logInitValues[idx].size,
                          RM_PAGE_SIZE,
                          NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
                          flags),
            error_cleanup);

        memdescTagAlloc(nvStatus,
                        NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_12, pLog->pTaskLogDescriptor);
        NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus,
            error_cleanup);

        NV_ASSERT_OK_OR_GOTO(nvStatus,
            memdescMap(pLog->pTaskLogDescriptor, 0,
                       memdescGetSize(pLog->pTaskLogDescriptor),
                       NV_TRUE, NV_PROTECT_READ_WRITE,
                       &pVa, &pPriv),
            error_cleanup);

        pLog->pTaskLogBuffer = pVa;
        pLog->pTaskLogMappingPriv = pPriv;
        portMemSet(pLog->pTaskLogBuffer, 0, memdescGetSize(pLog->pTaskLogDescriptor));

        // Pass the PTE table for the log buffer in the log buffer, after the put pointer.
        memdescGetPhysAddrs(pLog->pTaskLogDescriptor,
                            AT_GPU,
                            0,
                            RM_PAGE_SIZE,
                            NV_CEIL(memdescGetSize(pLog->pTaskLogDescriptor), RM_PAGE_SIZE),
                            &pLog->pTaskLogBuffer[1]);

        pLog->id8 = _kgspGenerateInitArgId(logInitValues[idx].szMemoryId);

        libosLogAddLogEx(&pKernelGsp->logDecode,
                         pLog->pTaskLogBuffer,
                         memdescGetSize(pLog->pTaskLogDescriptor),
                         pGpu->gpuInstance,
                         (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT),
                         gpuGetChipImpl(pGpu),
                         logInitValues[idx].szPrefix,
                         logInitValues[idx].elfSectionName);
    }

error_cleanup:
    if (nvStatus != NV_OK)
        _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);

    return nvStatus;
}

static NV_STATUS
_kgspInitLibosLogDecoder
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    GSP_FIRMWARE *pGspFw
)
{
    // If there's no log ELF or it's already been wired, skip wiring it now
    if ((pGspFw->pLogElf == NULL) || (pKernelGsp->pLogElf != NULL))
        return NV_OK;

    // Setup symbol decoder
    const void *pLogData = NULL;
    NvU64 logSize = 0;

    NV_ASSERT_OK_OR_RETURN(
        _kgspFwContainerVerifyVersion(pGpu, pKernelGsp,
                                      pGspFw->pLogElf,
                                      pGspFw->logElfSize,
                                      "GSP firmware log"));

    NV_ASSERT_OK_OR_RETURN(
        _kgspFwContainerGetSection(pGpu, pKernelGsp,
                                   pGspFw->pLogElf,
                                   pGspFw->logElfSize,
                                   GSP_LOGGING_SECTION_NAME,
                                   &pLogData,
                                   &logSize));

    pKernelGsp->pLogElf = portMemAllocNonPaged(logSize);
    pKernelGsp->logElfDataSize = logSize;

    NV_ASSERT_OR_RETURN(pKernelGsp->pLogElf != NULL, NV_ERR_NO_MEMORY);

    portMemCopy(pKernelGsp->pLogElf, logSize, pLogData, logSize);
    libosLogInit(&pKernelGsp->logDecode, pKernelGsp->pLogElf, logSize);

    return NV_OK;
}

static NV_STATUS
_kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp)
{
    NvP64 pVa = NvP64_NULL;
    NvP64 pPriv = NvP64_NULL;
    NV_STATUS nvStatus;

    if (!IS_SIMULATION(pGpu))
    {
        pKernelGsp->pMemDesc_simAccessBuf = NULL;
        pKernelGsp->pSimAccessBuf = NULL;
        pKernelGsp->pSimAccessBufPriv = NULL;
        return NV_ERR_NOT_SUPPORTED;
    }

    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescCreate(&pKernelGsp->pMemDesc_simAccessBuf,
                      pGpu,
                      sizeof(SimAccessBuffer),
                      RM_PAGE_SIZE,
                      NV_TRUE, ADDR_SYSMEM, NV_MEMORY_UNCACHED,
                      MEMDESC_FLAGS_NONE),
        error_cleanup);

    memdescTagAlloc(nvStatus,
                    NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_13, pKernelGsp->pMemDesc_simAccessBuf);
    NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup);

    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescMap(pKernelGsp->pMemDesc_simAccessBuf, 0,
                   memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf),
                   NV_TRUE, NV_PROTECT_READ_WRITE,
                   &pVa, &pPriv),
        error_cleanup);

    pKernelGsp->pSimAccessBuf = (SimAccessBuffer*)pVa;
    pKernelGsp->pSimAccessBufPriv = pPriv;

    portMemSet(pKernelGsp->pSimAccessBuf, 0, memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf));

error_cleanup:
    if (nvStatus != NV_OK)
        _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);

    return nvStatus;
}

static void
_kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp)
{
    if (!IS_SIMULATION(pGpu))
    {
        return;
    }

    if (pKernelGsp->pMemDesc_simAccessBuf != NULL)
    {
        memdescFree(pKernelGsp->pMemDesc_simAccessBuf);
        memdescDestroy(pKernelGsp->pMemDesc_simAccessBuf);
    }

    pKernelGsp->pMemDesc_simAccessBuf = NULL;
    pKernelGsp->pSimAccessBuf = NULL;
    pKernelGsp->pSimAccessBufPriv = NULL;
}

static NV_STATUS
_kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp)
{
    NvP64 pVa = NvP64_NULL;
    NvP64 pPriv = NvP64_NULL;
    NV_STATUS nvStatus;
    NvU64 flags = MEMDESC_FLAGS_NONE;

    //
    // On systems with SEV enabled, the fault buffer flush sequence memory should be allocated
    // in unprotected sysmem as GSP will be writing to this location to let the guest
    // know the issued notify op has finished, as well as the status of the operation.
    //
    flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;

    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescCreate(&pKernelGsp->pNotifyOpSurfMemDesc,
                      pGpu,
                      sizeof(NotifyOpSharedSurface),
                      RM_PAGE_SIZE,
                      NV_FALSE, ADDR_SYSMEM, NV_MEMORY_UNCACHED,
                      flags),
        error_cleanup);

    memdescTagAlloc(nvStatus,
                    NV_FB_ALLOC_RM_INTERNAL_OWNER_GSP_NOTIFY_OP_SURFACE, pKernelGsp->pNotifyOpSurfMemDesc);
    NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup);

    NV_ASSERT_OK_OR_GOTO(nvStatus,
        memdescMap(pKernelGsp->pNotifyOpSurfMemDesc, 0,
                   memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc),
                   NV_TRUE, NV_PROTECT_READ_WRITE,
                   &pVa, &pPriv),
        error_cleanup);

    pKernelGsp->pNotifyOpSurf = (NotifyOpSharedSurface*)pVa;
    pKernelGsp->pNotifyOpSurfPriv = pPriv;

    portMemSet(pKernelGsp->pNotifyOpSurf, 0, memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc));

error_cleanup:
    if (nvStatus != NV_OK)
        _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp);

    return nvStatus;
}

static void
_kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp)
{
    if (pKernelGsp->pNotifyOpSurfMemDesc != NULL)
    {
        memdescFree(pKernelGsp->pNotifyOpSurfMemDesc);
        memdescDestroy(pKernelGsp->pNotifyOpSurfMemDesc);
    }

    pKernelGsp->pNotifyOpSurfMemDesc = NULL;
    pKernelGsp->pNotifyOpSurf = NULL;
    pKernelGsp->pNotifyOpSurfPriv = NULL;
}

/*!
 * Create KernelGsp object and initialize RPC infrastructure
 */
NV_STATUS
kgspConstructEngine_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    ENGDESCRIPTOR engDesc
)
{
    NV_STATUS nvStatus = NV_OK;

    if (!IS_GSP_CLIENT(pGpu))
        return NV_ERR_NOT_SUPPORTED;

    kgspConfigureFalcon_HAL(pGpu, pKernelGsp);

    // Init RPC objects used to communicate with GSP.
    nvStatus = _kgspInitRpcInfrastructure(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "init RPC infrastructure failed\n");
        goto done;
    }

    // Init logging memory used by GSP
    nvStatus = _kgspInitLibosLoggingStructures(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "init libos logging structures failed: 0x%x\n", nvStatus);
        goto done;
    }

    // Clear out the gspStaticInfo. We will populate this once GSP-RM is up.
    portMemSet(&pKernelGsp->gspStaticInfo, 0,
               sizeof(pKernelGsp->gspStaticInfo));

    nvStatus = kgspAllocBootArgs_HAL(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "boot arg alloc failed: 0x%x\n", nvStatus);
        goto done;
    }

    if (IS_SIMULATION(pGpu))
    {
        nvStatus = _kgspAllocSimAccessBuffer(pGpu, pKernelGsp);
        if (nvStatus != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "sim access buffer alloc failed: 0x%x\n", nvStatus);
            goto done;
        }
    }

    nvStatus = _kgspAllocNotifyOpSharedSurface(pGpu, pKernelGsp);
    if (nvStatus != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "notify operation shared surface alloc failed: 0x%x\n", nvStatus);
        goto done;
    }

#if KERNEL_GSP_TRACING_RATS_ENABLED
    multimapInit(&pGpu->gspTraceEventBufferBindingsUid, portMemAllocatorGetGlobalNonPaged());
#endif

done:
    if (nvStatus != NV_OK)
    {
        _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
        kgspFreeBootArgs_HAL(pGpu, pKernelGsp);
        _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
        _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
    }

    return nvStatus;
}

/*!
 * Convert VBIOS version containing Version and OemVersion packed together to
 * a string representation.
 *
 * Example:
 *   for Version 0x05400001, OemVersion 0x12
 *   input argument vbiosVersionCombined 0x0540000112
 *   output str "5.40.00.01.12"
 */
static void
_kgspVbiosVersionToStr(NvU64 vbiosVersionCombined, char *pVbiosVersionStr, NvU32 size)
{
    nvDbgSnprintf(pVbiosVersionStr, size, "%2X.%02X.%02X.%02X.%02X",
                  (vbiosVersionCombined >> 32) & 0xff,
                  (vbiosVersionCombined >> 24) & 0xff,
                  (vbiosVersionCombined >> 16) & 0xff,
                  (vbiosVersionCombined >> 8) & 0xff,
                  (vbiosVersionCombined) & 0xff);
}

static NV_STATUS
_kgspPrepareScrubberImageIfNeeded(OBJGPU *pGpu, KernelGsp *pKernelGsp)
{
    // Prepare Scrubber ucode image if pre-scrubbed memory is insufficient
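    //
    // The GSP-RM reserved region occupies the top of FB, from gspFwRsvdStart
    // up to fbSize; compare its size against how much top-of-FB memory has
    // already been scrubbed.
    //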
    NvU64 neededSize = pKernelGsp->pWprMeta->fbSize - pKernelGsp->pWprMeta->gspFwRsvdStart;
    NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp);
    NV_PRINTF(LEVEL_INFO, "pre-scrubbed memory: 0x%llx bytes, needed: 0x%llx bytes\n",
              prescrubbedSize, neededSize);

    if (neededSize > prescrubbedSize)
        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            kgspAllocateScrubberUcodeImage(pGpu, pKernelGsp, &pKernelGsp->pScrubberUcode));

    return NV_OK;
}

/*!
 * Prepare and place RPCs in message queue that GSP-RM will process
 * in early boot before OBJGPU is created.
 *
 * @param[in] pGpu        GPU object pointer
 * @param[in] pKernelGsp  KernelGsp object pointer
 *
 * @return NV_OK if RPCs queued successfully.
 *         Appropriate NV_ERR_xxx value otherwise.
 */
NV_STATUS
kgspQueueAsyncInitRpcs_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NV_STATUS status = NV_OK;

    NV_RM_RPC_GSP_SET_SYSTEM_INFO(pGpu, status);
    if (status != NV_OK)
    {
        NV_ASSERT_OK_FAILED("NV_RM_RPC_GSP_SET_SYSTEM_INFO", status);
        return status;
    }

    NV_RM_RPC_SET_REGISTRY(pGpu, status);
    if (status != NV_OK)
    {
        NV_ASSERT_OK_FAILED("NV_RM_RPC_SET_REGISTRY", status);
        return status;
    }

    return NV_OK;
}

static NvBool
_kgspShouldRelaxGspInitLocking
(
    OBJGPU *pGpu
)
{
    NvU32 relaxGspInitLockingReg;

    if (!RMCFG_FEATURE_PLATFORM_UNIX)
    {
        return NV_FALSE;
    }

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        return NV_FALSE;
    }

    if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING, &relaxGspInitLockingReg) != NV_OK)
    {
        relaxGspInitLockingReg = NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT;
    }

    // Due to bug 4399629, restrict which platforms have parallel init enabled by default
    if (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT)
    {
        NvU16 devId = (NvU16)(((pGpu->idInfo.PCIDeviceID) >> 16) & 0x0000FFFF);
        NvU32 i;

        static const NvU16 defaultRelaxGspInitLockingGpus[] = {
            0x1EB8, // T4
            0x1EB9, // T4
        };

        if (IsHOPPER(pGpu) || IsADA(pGpu))
        {
            return NV_TRUE;
        }

        for (i = 0; i < NV_ARRAY_ELEMENTS(defaultRelaxGspInitLockingGpus); i++)
        {
            if (devId == defaultRelaxGspInitLockingGpus[i])
            {
                return NV_TRUE;
            }
        }
        return NV_FALSE;
    }

    return (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_ENABLE);
}

static NV_STATUS
_kgspBootReacquireLocks(OBJGPU *pGpu, KernelGsp *pKernelGsp, GPU_MASK *pGpusLockedMask)
{
    //
    // To follow lock order constraints, the GPU lock needs to be released before acquiring the API lock.
    // As this path doesn't go through resource server, no client locks should be held at this point.
    // Note: we must not hold any client locks when re-acquiring the API lock, per lock ordering.
    //
    rmGpuGroupLockRelease(*pGpusLockedMask, GPUS_LOCK_FLAGS_NONE);
    *pGpusLockedMask = 0;

    //
    // rmapiLockAcquire should never fail on Linux if the API lock and GPU locks are not held.
    // Failure to acquire the API lock means the cleanup sequence will be skipped since it is
    // unsafe without the lock.
    //
    NV_ASSERT_OK_OR_RETURN(rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT));

    //
    // This should never fail on Linux due to locks in the Unix layer.
    // This will need to be revisited when parallel init is enabled on other platforms.
    //
    NV_ASSERT_OR_RETURN(gpumgrIsGpuPointerAttached(pGpu), NV_ERR_INVALID_DEVICE);

    // Reacquire the GPU lock released above.
    NV_ASSERT_OK_OR_RETURN(rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
                                                 GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT,
                                                 pGpusLockedMask));

    return NV_OK;
}

static NV_STATUS
_kgspBootGspRm(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw, GPU_MASK *pGpusLockedMask)
{
    NV_STATUS status;

    // Fail early if WPR2 is up
    if (kgspIsWpr2Up_HAL(pGpu, pKernelGsp))
    {
        NV_PRINTF(LEVEL_ERROR, "unexpected WPR2 already up, cannot proceed with booting GSP\n");
        NV_PRINTF(LEVEL_ERROR, "(the GPU is likely in a bad state and may need to be reset)\n");
        return NV_ERR_INVALID_STATE;
    }

    // Calculate FB layout (requires knowing FB size which depends on GFW_BOOT)
    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspCalculateFbLayout_HAL(pGpu, pKernelGsp, pGspFw));

    // If the new FB layout requires a scrubber ucode to scrub additional space, prepare it now
    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, _kgspPrepareScrubberImageIfNeeded(pGpu, pKernelGsp));

    // Setup arguments for bootstrapping GSP
    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspPrepareForBootstrap_HAL(pGpu, pKernelGsp, pGspFw));

    // Release the API lock if relaxed locking for parallel init is enabled
    NvBool bRelaxedLocking = _kgspShouldRelaxGspInitLocking(pGpu);
    if (bRelaxedLocking)
        rmapiLockRelease();

    // Proceed with GSP boot - if it fails, check for ECC errors
    status = kgspBootstrap_HAL(pGpu, pKernelGsp, pGspFw);
    if ((status != NV_OK) && gpuCheckEccCounts_HAL(pGpu))
        status = NV_ERR_ECC_ERROR;

    pKernelGsp->bootAttempts++;

    //
    // The caller will check that both the API lock and the GPU lock will be held upon return from
    // this function, regardless of whether GSP bootstrap succeeded.
    //
    if (bRelaxedLocking)
        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
                              _kgspBootReacquireLocks(pGpu, pKernelGsp, pGpusLockedMask));

    return status;
}

/*!
 * Initialize GSP-RM
 *
 * @param[in] pGpu        GPU object pointer
 * @param[in] pKernelGsp  KernelGsp object pointer
 * @param[in] pGspFw      GSP firmware structure pointer
 *
 * @return NV_OK if GSP fw RM offload successfully initialized.
 *         Appropriate NV_ERR_xxx value otherwise.
 */
NV_STATUS
kgspInitRm_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    GSP_FIRMWARE *pGspFw
)
{
    NV_STATUS status = NV_OK;
    OBJTMR *pTmr = GPU_GET_TIMER(pGpu);
    GPU_MASK gpusLockedMask = 0;

    if (!IS_GSP_CLIENT(pGpu))
        return NV_OK;

    if ((pGspFw == NULL) || (pGspFw->pBuf == NULL) || (pGspFw->size == 0))
    {
        NV_PRINTF(LEVEL_ERROR, "need firmware to initialize GSP\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    pKernelGsp->bInInit = NV_TRUE;

    // Need to hold the GPU instance lock in order to write to the RPC queue
    NV_ASSERT_OK_OR_GOTO(status,
        rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
                              GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT, &gpusLockedMask),
        done);

    /*
     * For GSP-RM boot, we must trigger FRTS (if it exists for the chip)
     * before loading GSP-RM so that FRTS data and GSP-RM code/data/heap can coexist
     * in WPR2. FRTS is triggered by running a VBIOS-provided ucode called FWSEC.
     *
     * Here, we extract a VBIOS image from ROM, and parse it for FWSEC.
     */
    if (pKernelGsp->pFwsecUcode == NULL)
    {
        KernelGspVbiosImg *pVbiosImg = NULL;

        // Start VBIOS version string as "unknown"
        portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown"));

        // Try and extract a VBIOS image.
        status = kgspExtractVbiosFromRom_HAL(pGpu, pKernelGsp, &pVbiosImg);

        if (status == NV_OK)
        {
            NvU64 vbiosVersionCombined = 0;

            // Got a VBIOS image, now parse it for FWSEC.
            status = kgspParseFwsecUcodeFromVbiosImg(pGpu, pKernelGsp, pVbiosImg,
                                                     &pKernelGsp->pFwsecUcode, &vbiosVersionCombined);
            kgspFreeVbiosImg(pVbiosImg);

            if (vbiosVersionCombined > 0)
            {
                _kgspVbiosVersionToStr(vbiosVersionCombined, pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr));
            }

            if (status != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "failed to parse FWSEC ucode from VBIOS image (VBIOS version %s): 0x%x\n",
                          pKernelGsp->vbiosVersionStr, status);
                goto done;
            }

            NV_PRINTF(LEVEL_INFO, "parsed VBIOS version %s\n", pKernelGsp->vbiosVersionStr);
        }
        else if (status == NV_ERR_NOT_SUPPORTED)
        {
            // Extracting VBIOS image from ROM is not supported.
            status = NV_OK;
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "failed to extract VBIOS image from ROM: 0x%x\n",
                      status);
            goto done;
        }
    }

    /*
     * We use a set of Booter ucodes to boot GSP-RM as well as manage its lifecycle.
     *
     * Booter Load loads, verifies, and boots GSP-RM in WPR2.
     * Booter Unload tears down WPR2 for driver unload.
     *
     * Here we prepare the Booter ucode images in SYSMEM so they may be loaded onto
     * SEC2 (Load / Unload) and NVDEC0 (Unload).
     */
    if (pKernelGsp->bPartitionedFmc)
    {
        //
        // The secure boot ucode is included in the partitioned FMC, no need for
        // separate Booter ucodes.
        //
    }
    else
    {
        if (pKernelGsp->pBooterLoadUcode == NULL)
        {
            status = kgspAllocateBooterLoadUcodeImage(pGpu, pKernelGsp,
                                                      &pKernelGsp->pBooterLoadUcode);
            if (status != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Load ucode: 0x%x\n", status);
                goto done;
            }
        }

        if (pKernelGsp->pBooterUnloadUcode == NULL)
        {
            status = kgspAllocateBooterUnloadUcodeImage(pGpu, pKernelGsp,
                                                        &pKernelGsp->pBooterUnloadUcode);
            if (status != NV_OK)
            {
                NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Unload ucode: 0x%x\n", status);
                goto done;
            }
        }
    }

    // Prepare boot binary image.
    status = kgspPrepareBootBinaryImage(pGpu, pKernelGsp);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Error preparing boot binary image\n");
        goto done;
    }

    // Prepare GSP-RM image.
    status = _kgspPrepareGspRmBinaryImage(pGpu, pKernelGsp, pGspFw);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Error preparing GSP-RM image\n");
        goto done;
    }

    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, _kgspInitLibosLogDecoder(pGpu, pKernelGsp, pGspFw), done);

    //
    // Do not register nvlog flush callback if:
    // 1. Live decoding is enabled, as logs will be printed to dmesg.
    // 2. NV_ESC_RM_LOCKLESS_DIAGNOSTIC is not supported on this platform, i.e. pNvlogFlushMtx=NULL.
    //
    if (pKernelGsp->pLogElf == NULL && pKernelGsp->pNvlogFlushMtx != NULL)
        NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, nvlogRegisterFlushCb(kgspNvlogFlushCb, pKernelGsp), done);

    // Reset thread state timeout and wait for GFW_BOOT OK status
    threadStateResetTimeout(pGpu);
    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspWaitForGfwBootOk_HAL(pGpu, pKernelGsp), done);

    //
    // Set the GPU time to the wall-clock time after GFW boot is complete
    // (to avoid PLM collisions) but before loading GSP-RM ucode (which
    // consumes the updated GPU time).
    //
    tmrSetCurrentTime_HAL(pGpu, pTmr);

    // Initialize libos init args list
    kgspSetupLibosInitArgs(pGpu, pKernelGsp);

    // Fill in the GSP-RM message queue init parameters
    kgspPopulateGspRmInitArgs(pGpu, pKernelGsp, NULL);

    //
    // If ConfCompute is enabled, all RPC traffic must be encrypted. Since we
    // can't encrypt until GSP boots and session is established, we must send
    // these messages later (kgspBootstrap_HAL) in CC.
    //
    ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);
    if (pCC == NULL || !pCC->getProperty(pCC, PDB_PROP_CONFCOMPUTE_CC_FEATURE_ENABLED))
    {
        //
        // Stuff the message queue with async init messages that will be run
        // before OBJGPU is created.
        //
        status = kgspQueueAsyncInitRpcs(pGpu, pKernelGsp);
        if (status != NV_OK)
        {
            goto done;
        }
    }

    //
    // Bring up ucode with RM offload task.
    // If an ECC error occurs which results in the failure of the bootstrap, try again.
    // Subsequent attempts will shift the GSP region of FB in an attempt to avoid the
    // unstable memory.
    //
    const NvU8 MAX_GSP_BOOT_ATTEMPTS = 4;
    do
    {
        // Reset the thread state timeout after failed attempts to prevent premature timeouts.
        if (status != NV_OK)
            threadStateResetTimeout(pGpu);

        //
        // _kgspBootGspRm() will return NV_ERR_ECC_ERROR if any unhandled ECC errors are
        // detected during a failed GSP boot attempt. Depending on where and when the
        // error occurred, we may not be able to try again, in which case a different
        // error code will be returned.
        //
        status = _kgspBootGspRm(pGpu, pKernelGsp, pGspFw, &gpusLockedMask);

        //
        // _kgspBootGspRm() may temporarily release locks to facilitate parallel GSP bootstrap on
        // other GPUs. It is responsible for reacquiring them in the proper order. If there is a
        // failure to reacquire locks, it is unsafe to continue, regardless of the initialization
        // status - so we return immediately here, rather than attempting cleanup.
        //
        // Note: _kgspBootGspRm() is structured such that gpusLockedMask will always be 0 (no GPU
        // locks held) if the API lock is not held upon return.
        //
        NV_ASSERT_OR_RETURN(rmapiLockIsOwner() && (gpusLockedMask != 0),
                            NV_ERR_INVALID_LOCK_STATE);
    } while ((status == NV_ERR_ECC_ERROR) && (pKernelGsp->bootAttempts < MAX_GSP_BOOT_ATTEMPTS));

    if (status != NV_OK)
    {
        if (status == NV_ERR_INSUFFICIENT_POWER)
        {
            OBJSYS *pSys = SYS_GET_INSTANCE();
            OBJGPUMGR *pGpuMgr = SYS_GET_GPUMGR(pSys);

            pGpuMgr->powerDisconnectedGpuBus[pGpuMgr->powerDisconnectedGpuCount++] = gpuGetBus(pGpu);
        }

        //
        // Ignore return value - a crash report may have already been consumed,
        // this is just here as a last attempt to report boot issues that might
        // have escaped prior checks.
        //
        (void)kgspHealthCheck_HAL(pGpu, pKernelGsp);
        goto done;
    }

    // at this point we should be able to exchange RPCs with RM offload task
    NV_RM_RPC_SET_GUEST_SYSTEM_INFO(pGpu, status);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "SET_GUEST_SYSTEM_INFO failed: 0x%x\n", status);
        goto done;
    }

    NV_RM_RPC_GET_GSP_STATIC_INFO(pGpu, status);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "GET_GSP_STATIC_INFO failed: 0x%x\n", status);
        goto done;
    }

    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspStartLogPolling(pGpu, pKernelGsp), done);

done:
    pKernelGsp->bInInit = NV_FALSE;

    if (status != NV_OK)
    {
        KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);

        // Preserve any captured GSP-RM logs
        libosPreserveLogs(&pKernelGsp->logDecode);

        if (pKernelPmu != NULL)
        {
            // If PMU init fails, kgsp init will also fail
            libosPreserveLogs(&pKernelPmu->logDecode);
        }
    }

    if (gpusLockedMask != 0)
    {
        rmGpuGroupLockRelease(gpusLockedMask, GPUS_LOCK_FLAGS_NONE);
    }

    return status;
}

/*!
 * Unload GSP-RM
 */
NV_STATUS
kgspUnloadRm_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NV_STATUS rpcStatus = NV_OK;
    NV_STATUS status;
    KernelGspPreparedFwsecCmd preparedCmd;

    NV_PRINTF(LEVEL_INFO, "unloading GSP-RM\n");
    NV_RM_RPC_UNLOADING_GUEST_DRIVER(pGpu, rpcStatus, NV_FALSE, NV_FALSE, 0);

    if (gpuIsCCFeatureEnabled(pGpu))
    {
        // FIPS: If CC enabled, we need to confirm GSP-RM was able to teardown CC state.
        kgspCheckGspRmCcCleanup_HAL(pGpu, pKernelGsp);
    }

    // Wait for GSP-RM processor to suspend
    kgspWaitForProcessorSuspend_HAL(pGpu, pKernelGsp);

    // Dump GSP-RM logs and reset before invoking FWSEC-SB
    kgspDumpGspLogs(pKernelGsp, NV_FALSE);

    //
    // Avoid cascading timeouts when attempting to invoke the below ucodes if
    // we are unloading due to a GSP-RM timeout.
    //
    threadStateResetTimeout(pGpu);

    // Because of COT, RM cannot reset GSP-RISCV, and FSP has exclusive access to reset and reboot GSP for the next run.
    if (!(pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_COT_ENABLED)))
    {
        kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
    }

    // Invoke FWSEC-SB to put back PreOsApps during driver unload
    status = kgspPrepareForFwsecSb_HAL(pGpu, pKernelGsp, pKernelGsp->pFwsecUcode, &preparedCmd);
    if (status == NV_ERR_NOT_SUPPORTED)
    {
        // skip FWSEC-SB during driver unload if unsupported (e.g. on Hopper+)
        status = NV_OK;
    }
    else if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "failed to prepare for FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
        NV_ASSERT(0);
    }
    else
    {
        status = kgspExecuteFwsec_HAL(pGpu, pKernelGsp, &preparedCmd);
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "failed to execute FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
            NV_ASSERT(0);
        }
    }

    if (pKernelGsp->bPartitionedFmc)
    {
        //
        // GSP-RM invokes the partitioned FMC to unload directly as part of the
        // NV_RM_RPC_UNLOADING_GUEST_DRIVER call above.
        //
        status = rpcStatus;
    }
    else
    {
        // After instructing GSP-RM to unload itself, run Booter Unload to teardown WPR2
        status = kgspExecuteBooterUnloadIfNeeded_HAL(pGpu, pKernelGsp, 0);
    }

    //
    // To fix a boot issue after GPU reset on ESXi configs:
    // We still do not have a root cause, but some sanity check appears to fail during
    // boot after a reset. As a temporary WAR, add a delay of 250 ms after GSP-RM unload
    // is done. Limit this to [VGPU-GSP] supported configs only, and only when we are in
    // the GPU RESET path.
    //
    if (API_GPU_IN_RESET_SANITY_CHECK(pGpu) &&
        gpuIsSriovEnabled(pGpu) &&
        IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
    {
        osDelay(250);
    }

    if (rpcStatus != NV_OK)
    {
        return rpcStatus;
    }

    return status;
}

/*!
 * Free RPC infrastructure and KernelGsp object
 */
void
kgspDestruct_IMPL
(
    KernelGsp *pKernelGsp
)
{
    OBJGPU *pGpu = ENG_GET_GPU(pKernelGsp);

    if (!IS_GSP_CLIENT(pGpu))
        return;

    // set VBIOS version string back to "unknown"
    portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown"));

    kgspFreeFlcnUcode(pKernelGsp->pFwsecUcode);
    pKernelGsp->pFwsecUcode = NULL;

    kgspFreeFlcnUcode(pKernelGsp->pBooterLoadUcode);
    pKernelGsp->pBooterLoadUcode = NULL;

    kgspFreeFlcnUcode(pKernelGsp->pBooterUnloadUcode);
    pKernelGsp->pBooterUnloadUcode = NULL;

    kgspFreeFlcnUcode(pKernelGsp->pScrubberUcode);
    pKernelGsp->pScrubberUcode = NULL;

    kgspFreeBootArgs_HAL(pGpu, pKernelGsp);

    _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
    _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
    _kgspFreeBootBinaryImage(pGpu, pKernelGsp);
    _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
    _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp);

    kgspFreeSuspendResumeData_HAL(pGpu, pKernelGsp);

#if KERNEL_GSP_TRACING_RATS_ENABLED
    multimapDestroy(&pGpu->gspTraceEventBufferBindingsUid);
#endif
}

void
kgspDumpGspLogsUnlocked_IMPL
(
    KernelGsp *pKernelGsp,
    NvBool bSyncNvLog
)
{
    if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
    {
        libosExtractLogs(&pKernelGsp->logDecode, bSyncNvLog);

        if (pKernelGsp->bHasVgpuLogs)
        {
            // Dump logs from vGPU partition
            for (NvU32 i = 0; i < MAX_PARTITIONS_WITH_GFID; i++)
            {
                libosExtractLogs(&pKernelGsp->logDecodeVgpuPartition[i], bSyncNvLog);
            }
        }
    }
}

/*!
 * Dump logs coming from GSP-RM
 *
 * @param[in] pKernelGsp  KernelGsp pointer
 * @param[in] bSyncNvLog  NV_TRUE: Copy a snapshot of the libos logs
 *                        into the nvLog wrap buffers.
 */
void
kgspDumpGspLogs_IMPL
(
    KernelGsp *pKernelGsp,
    NvBool bSyncNvLog
)
{
    if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
    {
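        //
        // The flush mutex exists only on platforms where
        // NV_ESC_RM_LOCKLESS_DIAGNOSTIC is supported (see
        // _kgspInitLibosLoggingStructures), so take it conditionally.
        //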
        if (pKernelGsp->pNvlogFlushMtx != NULL)
            portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx);

        kgspDumpGspLogsUnlocked(pKernelGsp, bSyncNvLog);

        if (pKernelGsp->pNvlogFlushMtx != NULL)
            portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx);
    }
}

/*!
 * Populate GSP-RM init arguments.
 */
void
kgspPopulateGspRmInitArgs_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    GSP_SR_INIT_ARGUMENTS *pGspInitArgs
)
{
    GSP_ARGUMENTS_CACHED *pGspArgs = pKernelGsp->pGspArgumentsCached;
    MESSAGE_QUEUE_INIT_ARGUMENTS *pMQInitArgs = &pGspArgs->messageQueueInitArguments;
    MESSAGE_QUEUE_COLLECTION *pMQCollection = pKernelGsp->pMQCollection;
    GSP_SR_INIT_ARGUMENTS *pSrInitArgs = &pGspArgs->srInitArguments;

    // Setup the message queue arguments
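    //
    // The queues are laid out back-to-back in the shared memory region, so
    // each offset is the previous offset plus that queue's size:
    //   [ page table | RM cmd queue | RM stat queue | lockless cmd | lockless stat ]
    //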
    pMQInitArgs->sharedMemPhysAddr   = pMQCollection->sharedMemPA;
    pMQInitArgs->pageTableEntryCount = pMQCollection->pageTableEntryCount;
    pMQInitArgs->cmdQueueOffset      = pMQCollection->pageTableSize;
    pMQInitArgs->statQueueOffset     = pMQInitArgs->cmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].commandQueueSize;
    if (pKernelGsp->bIsTaskIsrQueueRequired)
    {
        pMQInitArgs->locklessCmdQueueOffset  = pMQInitArgs->statQueueOffset + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].statusQueueSize;
        pMQInitArgs->locklessStatQueueOffset = pMQInitArgs->locklessCmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX].commandQueueSize;
    }
    else
    {
        pMQInitArgs->locklessCmdQueueOffset  = 0;
        pMQInitArgs->locklessStatQueueOffset = 0;
    }

    if (pGspInitArgs == NULL)
    {
        pSrInitArgs->bInPMTransition = NV_FALSE;
        pSrInitArgs->oldLevel        = 0;
        pSrInitArgs->flags           = 0;
    }
    else
    {
        pSrInitArgs->bInPMTransition = NV_TRUE;
        pSrInitArgs->oldLevel        = pGspInitArgs->oldLevel;
        pSrInitArgs->flags           = pGspInitArgs->flags;
    }

    pGspArgs->gpuInstance = pGpu->gpuInstance;

    portMemSet(&pGspArgs->profilerArgs, 0, sizeof(pGspArgs->profilerArgs));

    if (pKernelGsp->pProfilerSamples != NULL &&
        pKernelGsp->pProfilerSamplesMD != NULL)
    {
        pGspArgs->profilerArgs.pa = memdescGetPhysAddr(pKernelGsp->pProfilerSamplesMD, AT_GPU, 0);
        pGspArgs->profilerArgs.size = memdescGetSize(pKernelGsp->pProfilerSamplesMD);
    }
}
3512
3513 /*!
3514 * Prepare boot binary image for GSP-RM boot.
3515 *
3516 * @return NV_OK if boot binary image prepared successfully.
3517 * Appropriate NV_ERR_xxx value otherwise.
3518 */
3519 NV_STATUS
kgspPrepareBootBinaryImage_IMPL(OBJGPU * pGpu,KernelGsp * pKernelGsp)3520 kgspPrepareBootBinaryImage_IMPL
3521 (
3522 OBJGPU *pGpu,
3523 KernelGsp *pKernelGsp
3524 )
3525 {
3526 NV_STATUS status;
3527 BINDATA_STORAGE *pBinStorageImage;
3528 BINDATA_STORAGE *pBinStorageDesc;
3529 NvU32 bufSize;
3530 NvU32 bufSizeAligned;
3531 RM_RISCV_UCODE_DESC *pDesc = NULL;
3532 NvP64 pVa = NvP64_NULL;
3533 NvP64 pPriv = NvP64_NULL;
3534 NvU64 flags = MEMDESC_FLAGS_NONE;
3535
3536 NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeImage == NULL, NV_ERR_INVALID_STATE);
3537 NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeDesc == NULL, NV_ERR_INVALID_STATE);
3538
3539 // get the bindata storage for the image/descriptor
3540 kgspGetGspRmBootUcodeStorage_HAL(pGpu, pKernelGsp, &pBinStorageImage, &pBinStorageDesc);
3541
3542 // copy the image to sysmem
3543 bufSize = bindataGetBufferSize(pBinStorageImage);
3544 bufSizeAligned = NV_ALIGN_UP(bufSize, 0x1000);
3545
3546 flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
3547
3548 NV_ASSERT_OK_OR_GOTO(status,
3549 memdescCreate(&pKernelGsp->pGspRmBootUcodeMemdesc,
3550 pGpu,
3551 bufSizeAligned,
3552 RM_PAGE_SIZE,
3553 NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
3554 flags),
3555 fail);
3556
3557 memdescTagAlloc(status,
3558 NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_15, pKernelGsp->pGspRmBootUcodeMemdesc);
3559 NV_ASSERT_OK_OR_GOTO(status, status, fail);
3560
3561 NV_ASSERT_OK_OR_GOTO(status,
3562 memdescMap(pKernelGsp->pGspRmBootUcodeMemdesc, 0,
3563 memdescGetSize(pKernelGsp->pGspRmBootUcodeMemdesc),
3564 NV_TRUE, NV_PROTECT_READ_WRITE,
3565 &pVa, &pPriv),
3566 fail);
3567
3568 pKernelGsp->gspRmBootUcodeSize = bufSize;
    pKernelGsp->pGspRmBootUcodeImage = (NvU8 *)NvP64_VALUE(pVa);
    pKernelGsp->pGspRmBootUcodeMemdescPriv = pPriv;

    NV_ASSERT_OK_OR_GOTO(status,
                         bindataWriteToBuffer(pBinStorageImage,
                                              pKernelGsp->pGspRmBootUcodeImage,
                                              bufSize),
                         fail);

    // get the image descriptor
    NV_ASSERT_OK_OR_GOTO(status,
                         bindataStorageAcquireData(pBinStorageDesc, (const void**)&pDesc),
                         fail);
    pKernelGsp->pGspRmBootUcodeDesc = pDesc;

    return status;

fail:
    _kgspFreeBootBinaryImage(pGpu, pKernelGsp);
    return status;
}

static void
_kgspFreeBootBinaryImage
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    bindataStorageReleaseData(pKernelGsp->pGspRmBootUcodeDesc);
    pKernelGsp->pGspRmBootUcodeDesc = NULL;

    if (pKernelGsp->pGspRmBootUcodeImage != NULL)
    {
        memdescUnmap(pKernelGsp->pGspRmBootUcodeMemdesc,
                     NV_TRUE, osGetCurrentProcess(),
                     (void *)pKernelGsp->pGspRmBootUcodeImage,
                     pKernelGsp->pGspRmBootUcodeMemdescPriv);
        pKernelGsp->pGspRmBootUcodeImage = NULL;
        pKernelGsp->pGspRmBootUcodeMemdescPriv = NULL;
    }
    if (pKernelGsp->pGspRmBootUcodeMemdesc != NULL)
    {
        memdescFree(pKernelGsp->pGspRmBootUcodeMemdesc);
        memdescDestroy(pKernelGsp->pGspRmBootUcodeMemdesc);
        pKernelGsp->pGspRmBootUcodeMemdesc = NULL;
    }

    pKernelGsp->gspRmBootUcodeSize = 0;
}

static NV_STATUS
_kgspCreateSignatureMemdesc
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    GSP_FIRMWARE *pGspFw
)
{
    NV_STATUS status = NV_OK;
    NvU8 *pSignatureVa = NULL;
    NvU64 flags = MEMDESC_FLAGS_NONE;

    flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;

    // NOTE: align to 256 because that's the alignment needed for Booter DMA
    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        memdescCreate(&pKernelGsp->pSignatureMemdesc, pGpu,
            NV_ALIGN_UP(pGspFw->signatureSize, 256), 256,
            NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags));

    memdescTagAlloc(status,
                    NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_16, pKernelGsp->pSignatureMemdesc);
    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, status, fail_create);

    pSignatureVa = memdescMapInternal(pGpu, pKernelGsp->pSignatureMemdesc, TRANSFER_FLAGS_NONE);
    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
        (pSignatureVa != NULL) ? NV_OK : NV_ERR_INSUFFICIENT_RESOURCES,
        fail_alloc);

    portMemCopy(pSignatureVa, memdescGetSize(pKernelGsp->pSignatureMemdesc),
        pGspFw->pSignatureData, pGspFw->signatureSize);

    memdescUnmapInternal(pGpu, pKernelGsp->pSignatureMemdesc, 0);
    pSignatureVa = NULL;

    return status;

fail_alloc:
    memdescFree(pKernelGsp->pSignatureMemdesc);

fail_create:
    memdescDestroy(pKernelGsp->pSignatureMemdesc);
    pKernelGsp->pSignatureMemdesc = NULL;

    return status;
}

/*!
 * Verify that the version embedded in the .fwversion section of the ELF given
 * by pElfData and elfDataSize matches our NV_VERSION_STRING.
 */
static NV_STATUS
_kgspFwContainerVerifyVersion
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    const void *pElfData,
    NvU64 elfDataSize,
    const char *pNameInMsg
)
{
    const char *pFwversion;
    NvU64 fwversionSize;
    NvU64 expectedVersionLength = portStringLength(NV_VERSION_STRING);

    {
        const void *pFwversionRaw;

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            _kgspFwContainerGetSection(pGpu, pKernelGsp,
                pElfData,
                elfDataSize,
                GSP_VERSION_SECTION_NAME,
                &pFwversionRaw,
                &fwversionSize));

        pFwversion = (const char *) pFwversionRaw;
    }

    // Check that text in .fwversion section of ELF matches our NV_VERSION_STRING
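    // (the section is expected to hold the string plus its terminating NUL,
    // hence the +1 in the size comparison below)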
    if ((fwversionSize != expectedVersionLength + 1) ||
        (portStringCompare(pFwversion, NV_VERSION_STRING, expectedVersionLength) != 0))
    {
        // Sanity check .fwversion before attempting to print it in the error message
        if ((fwversionSize > 0) &&
            (fwversionSize < 64) &&
            (pFwversion[fwversionSize - 1] == '\0'))
        {
            NV_PRINTF(LEVEL_ERROR, "%s version mismatch: got version %s, expected version %s\n",
                      pNameInMsg, pFwversion, NV_VERSION_STRING);
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "%s version unknown or malformed, expected version %s\n",
                      pNameInMsg, NV_VERSION_STRING);
        }
        return NV_ERR_INVALID_DATA;
    }

    return NV_OK;
}

/*!
 * Get the name of the section corresponding to the given section name
 * prefix and the current chip.
 */
static NV_STATUS
_kgspGetSectionNameForPrefix
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    char *pSectionNameBuf,  // out
    NvLength sectionNameBufSize,
    const char *pSectionPrefix
)
{
    NvLength sectionPrefixLength;

    nv_firmware_chip_family_t chipFamily;
    const char *pChipFamilyName;
    NvLength chipFamilyNameLength;

    NvLength totalSize;

    NV_ASSERT_OR_RETURN(pSectionNameBuf != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(sectionNameBufSize > 0, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pSectionPrefix != NULL, NV_ERR_INVALID_ARGUMENT);

    chipFamily = nv_firmware_get_chip_family(gpuGetChipArch(pGpu),
                                             gpuGetChipImpl(pGpu));
    NV_ASSERT_OR_RETURN(chipFamily != NV_FIRMWARE_CHIP_FAMILY_NULL,
                        NV_ERR_INVALID_STATE);

    pChipFamilyName = nv_firmware_chip_family_to_string(chipFamily);
    NV_ASSERT_OR_RETURN(pChipFamilyName != NULL, NV_ERR_INVALID_STATE);

    sectionPrefixLength = portStringLength(pSectionPrefix);
    chipFamilyNameLength = portStringLength(pChipFamilyName);

    totalSize = sectionPrefixLength + chipFamilyNameLength + 1;
    NV_ASSERT_OR_RETURN(sectionNameBufSize >= sectionPrefixLength + 1,
                        NV_ERR_BUFFER_TOO_SMALL);
    NV_ASSERT_OR_RETURN(sectionNameBufSize >= totalSize,
                        NV_ERR_BUFFER_TOO_SMALL);

    portStringCopy(pSectionNameBuf, sectionNameBufSize,
                   pSectionPrefix, sectionPrefixLength + 1);
    portStringCat(pSectionNameBuf, sectionNameBufSize,
                  pChipFamilyName, chipFamilyNameLength + 1);

    return NV_OK;
}

static NV_STATUS
_kgspPrepareGspRmBinaryImage
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    GSP_FIRMWARE *pGspFw
)
{
    char signatureSectionName[32];

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspFwContainerVerifyVersion(pGpu, pKernelGsp,
            pGspFw->pBuf,
            pGspFw->size,
            "GSP firmware image"));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspFwContainerGetSection(pGpu, pKernelGsp,
            pGspFw->pBuf,
            pGspFw->size,
            GSP_IMAGE_SECTION_NAME,
            &pGspFw->pImageData,
            &pGspFw->imageSize));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspGetSectionNameForPrefix(pGpu, pKernelGsp,
            signatureSectionName, sizeof(signatureSectionName),
            kgspGetSignatureSectionNamePrefix_HAL(pGpu, pKernelGsp)));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspFwContainerGetSection(pGpu, pKernelGsp,
            pGspFw->pBuf,
            pGspFw->size,
            signatureSectionName,
            &pGspFw->pSignatureData,
            &pGspFw->signatureSize));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        _kgspCreateSignatureMemdesc(pGpu, pKernelGsp,
            pGspFw));

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        kgspCreateRadix3(pGpu, pKernelGsp, &pKernelGsp->pGspUCodeRadix3Descriptor,
            NULL, pGspFw->pImageData, pGspFw->imageSize));

    return NV_OK;
}

NV_STATUS
kgspCreateRadix3_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    MEMORY_DESCRIPTOR **ppMemdescRadix3,
    MEMORY_DESCRIPTOR *pMemdescData,
    const void *pData,
    NvU64 size
)
{
    const NvU64 entriesLog2 = LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2 - 3;
    NvU8 *pRadix3Buf;
    NvP64 pVaKernel;
    NvP64 pPrivKernel;
    NvU64 ptSize;
    NvU64 allocSize;
    NvU64 nPages = 0;
    NvU64 dataOffset = 0;
    NvU32 i;
    NV_STATUS status = NV_OK;
    NvU64 flags = MEMDESC_FLAGS_KERNEL_MODE;

    // radix3 working array.
    struct
    {
        NvU64 nPages;
        NvU64 offset;
    } radix3[4];

    NV_ASSERT_OR_RETURN(ppMemdescRadix3 != NULL, NV_ERR_INVALID_PARAMETER);
    NV_ASSERT_OR_ELSE_STR(!((pMemdescData != NULL) && (pData != NULL)),
                          "Specify pMemdescData or pData, or none, but not both",
                          return NV_ERR_INVALID_PARAMETER);

    // If the size is not specified, get it from the memory descriptor.
    if ((size == 0) && (pMemdescData != NULL))
        size = memdescGetSize(pMemdescData);
    NV_ASSERT_OR_RETURN(size > 0, NV_ERR_OUT_OF_RANGE);

    // Clear working structure.
    portMemSet(radix3, 0, sizeof radix3);

    // Populate npages, high to low.
    i = NV_ARRAY_ELEMENTS(radix3) - 1;
    radix3[i].nPages = (size + LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE - 1) >>
                       LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    for (; i > 0; i--)
        radix3[i - 1].nPages = ((radix3[i].nPages - 1) >> entriesLog2) + 1;

    // Populate offset, low to high.
    for (i = 1; i < NV_ARRAY_ELEMENTS(radix3); i++)
    {
        nPages += radix3[i - 1].nPages;
        radix3[i].offset = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    }

    NV_ASSERT_OR_RETURN(radix3[0].nPages == 1, NV_ERR_OUT_OF_RANGE);
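
    //
    // Illustrative sizing (assuming 4KiB radix pages and 8-byte entries,
    // i.e. 512 entries per page): a 64MiB image needs 16384 data pages
    // (level 3), which need 32 pages of PTEs (level 2), one page of PDEs
    // (level 1), and one root page (level 0).
    //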

    // Allocate space for PTEs and PDEs.
    ptSize = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    allocSize = ptSize;

    if (pMemdescData == NULL)
    {
        // We don't have a separate descriptor for the data. We need PTEs,
        // so include space for data in the new descriptor.
        allocSize += radix3[3].nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
    }

    flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;

    NV_ASSERT_OK_OR_GOTO(status,
        memdescCreate(ppMemdescRadix3, pGpu, allocSize,
            LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE,
            NV_MEMORY_NONCONTIGUOUS,
            ADDR_SYSMEM,
            NV_MEMORY_CACHED,
            flags),
        done);

    memdescTagAlloc(status,
                    NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_17, (*ppMemdescRadix3));
    NV_ASSERT_OK_OR_GOTO(status, status, error_ret);

    // Create kernel mapping.
    NV_ASSERT_OK_OR_GOTO(status,
        memdescMap(*ppMemdescRadix3, 0, allocSize, NV_TRUE, NV_PROTECT_WRITEABLE,
            &pVaKernel, &pPrivKernel),
        error_ret);

    if (pVaKernel == NvP64_NULL)
    {
        NV_PRINTF(LEVEL_ERROR, "VA error for radix3 shared buffer\n");
        status = NV_ERR_NO_MEMORY;
        goto error_ret;
    }

    pRadix3Buf = KERNEL_POINTER_FROM_NvP64(NvU8 *, pVaKernel);

    // Zap out page table.
    portMemSet(pRadix3Buf, 0, ptSize);

    // Fill in PDEs.
    for (i = 0; i < NV_ARRAY_ELEMENTS(radix3) - 2; i++)
    {
        memdescGetPhysAddrs(*ppMemdescRadix3,
            AT_GPU,                     // addressTranslation
            radix3[i + 1].offset,       // offset
            RM_PAGE_SIZE,               // stride
            radix3[i + 1].nPages,       // count
            (RmPhysAddr *)(pRadix3Buf + radix3[i].offset)); // physical address table
    }

    dataOffset = radix3[3].offset;

    if (pData != NULL)
    {
        // Optionally copy data into the radix3 buffer.
        portMemCopy(pRadix3Buf + dataOffset, size, pData, size);

        // If we only have part of the last page, clear the rest.
        NvU32 clearSize = allocSize - dataOffset - size;
        if (clearSize != 0)
            portMemSet(pRadix3Buf + dataOffset + size, 0, clearSize);

        pMemdescData = *ppMemdescRadix3;
    }

    memdescGetPhysAddrs(*ppMemdescRadix3,
        AT_GPU,                     // addressTranslation
        dataOffset,                 // offset
        RM_PAGE_SIZE,               // stride
        radix3[3].nPages,           // count
        (RmPhysAddr *)(pRadix3Buf + radix3[2].offset)); // physical address table

    //
    // No reason to keep this memory mapped on the CPU side. Only GSP will
    // access it after this point.
    //
    memdescUnmap(*ppMemdescRadix3, NV_TRUE, osGetCurrentProcess(),
                 pVaKernel, pPrivKernel);
done:
    return status;

error_ret:
    if (*ppMemdescRadix3 != NULL)
    {
        memdescFree(*ppMemdescRadix3);
        memdescDestroy(*ppMemdescRadix3);
        *ppMemdescRadix3 = NULL;
    }

    return status;
}

static NV_STATUS
_kgspFwContainerGetSection
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    const void *pElfData,
    NvU64 elfDataSize,
    const char *pSectionName,
    const void **ppSectionData,
    NvU64 *pSectionSize
)
{
    const NvU8 *pGspBuf = pElfData;
    const LibosElf64Header *pElfHeader;
    const LibosElf64SectionHeader *pElfSectionHeader;
    NvU64 elfSectionHeaderTableLength;
    NvU64 elfSectionHeaderMaxIdx;
    NvU64 elfSectionNamesTableOffset;
    NvU64 elfSectionNamesTableSize;
    NvU64 elfSectionNamesTableMaxIdx;
    static const NvU32 elfMagicNumber = 0x464C457F;
    static const NvU8 elfClass64 = 0x2;
    static const NvU8 elfLittleEndian = 0x1;
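    //
    // 0x464C457F is "\x7FELF" read as a little-endian 32-bit word; class 2
    // and data 1 correspond to ELFCLASS64 and ELFDATA2LSB respectively.
    //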
    const char *pCurrentSectionName;
    NvLength sectionNameLength;
    NvS16 idx;

    NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfData != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize > 0, NV_ERR_INVALID_ARGUMENT);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionName != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, ppSectionData != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionSize != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= sizeof(LibosElf64Header), NV_ERR_INVALID_DATA);

    sectionNameLength = portStringLength(pSectionName);

    pElfHeader = (const LibosElf64Header*) pGspBuf;

    // Check for the elf identifier at the beginning of the file
    NV_CHECK_OR_RETURN(LEVEL_ERROR, *(NvU32*)&pElfHeader->ident == elfMagicNumber, NV_ERR_INVALID_DATA);
    // Make sure the data is formatted as little endian
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[5] == elfLittleEndian, NV_ERR_INVALID_DATA);
    // Check the class type, only ELFCLASS64 is supported
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[4] == elfClass64, NV_ERR_INVALID_DATA);

    // Make sure that the elf section header table is valid
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shentsize == sizeof(LibosElf64SectionHeader), NV_ERR_INVALID_DATA);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeMulU64(pElfHeader->shentsize, pElfHeader->shnum, &elfSectionHeaderTableLength), NV_ERR_INVALID_DATA);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfHeader->shoff, elfSectionHeaderTableLength - 1, &elfSectionHeaderMaxIdx), NV_ERR_INVALID_DATA);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionHeaderMaxIdx, NV_ERR_INVALID_DATA);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shstrndx <= pElfHeader->shnum, NV_ERR_INVALID_DATA);

    // Get the offset and size of the table that holds the section names and make sure they are valid
    pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[pElfHeader->shoff + (pElfHeader->shstrndx * pElfHeader->shentsize)];
    elfSectionNamesTableOffset = pElfSectionHeader->offset;
    elfSectionNamesTableSize = pElfSectionHeader->size;
    NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(elfSectionNamesTableOffset, elfSectionNamesTableSize - 1, &elfSectionNamesTableMaxIdx), NV_ERR_INVALID_DATA);
    NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionNamesTableMaxIdx, NV_ERR_INVALID_DATA);
    // Walk the section headers (last to first) to find the requested section
    pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[elfSectionHeaderMaxIdx + 1 - sizeof(*pElfSectionHeader)];

    for (idx = pElfHeader->shnum - 1; idx >= 0; idx--, pElfSectionHeader--)
    {
        NvU64 currentSectionNameMaxLength;
        NvU64 elfSectionMaxIdx;

        // Make sure the header name index fits within the section names table
        NV_CHECK_OR_RETURN(LEVEL_ERROR, elfSectionNamesTableSize - 1 >= pElfSectionHeader->name, NV_ERR_INVALID_DATA);
        currentSectionNameMaxLength = elfSectionNamesTableSize - pElfSectionHeader->name - 1;
        pCurrentSectionName = (const char *) &pGspBuf[elfSectionNamesTableOffset + pElfSectionHeader->name];

        // Make sure the elf section size and offset are valid
        if (pElfSectionHeader->size > 0)
        {
            NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfSectionHeader->offset, pElfSectionHeader->size - 1, &elfSectionMaxIdx), NV_ERR_INVALID_DATA);
        }
        else
        {
            elfSectionMaxIdx = pElfSectionHeader->offset;
        }
        NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionMaxIdx, NV_ERR_INVALID_DATA);

        // Check whether the section name matches the expected section name
        if ((sectionNameLength <= currentSectionNameMaxLength) &&
            (portStringCompare(pCurrentSectionName, pSectionName, sectionNameLength) == 0) &&
            (pCurrentSectionName[sectionNameLength] == '\0'))
        {
            *ppSectionData = &pGspBuf[pElfSectionHeader->offset];
            *pSectionSize = pElfSectionHeader->size;

            return NV_OK;
        }
    }

    return NV_ERR_OBJECT_NOT_FOUND;
}

/*!
 * Setup libos init arguments.
 */
void
kgspSetupLibosInitArgs_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    LibosMemoryRegionInitArgument *pLibosInitArgs = pKernelGsp->pLibosInitArgumentsCached;
    NvU8 idx;

    portMemSet(pLibosInitArgs, 0, LIBOS_INIT_ARGUMENTS_SIZE);

    // Add memory areas for logging each LIBOS task.
    // @note LOGINIT must be first for early init logging to work.
    // @note: These should be switched to radix regions to remove the need
    //        for large apertures in the RM task for logging.
    for (idx = 0; idx < LOGIDX_SIZE; idx++)
    {
        pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
        pLibosInitArgs[idx].loc  = LIBOS_MEMORY_REGION_LOC_SYSMEM;
        pLibosInitArgs[idx].id8  = pKernelGsp->rmLibosLogMem[idx].id8;
        pLibosInitArgs[idx].pa   = pKernelGsp->rmLibosLogMem[idx].pTaskLogBuffer[1];
        pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->rmLibosLogMem[idx].pTaskLogDescriptor);
    }

    // insert GSP-RM ELF args address; id must match libos-config.py entry
    pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
    pLibosInitArgs[idx].loc  = LIBOS_MEMORY_REGION_LOC_SYSMEM;
    pLibosInitArgs[idx].id8  = _kgspGenerateInitArgId("RMARGS");
    pLibosInitArgs[idx].pa   = memdescGetPhysAddr(pKernelGsp->pGspArgumentsDescriptor, AT_GPU, 0);
    pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->pGspArgumentsDescriptor);

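    // Make sure the cached init arguments written above are globally visible
    // before GSP-RM boots and reads them.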
    portAtomicMemoryFenceFull();
}

/*!
 * Receive and process RPC event from GSP-RM.
 *
 * This function is called from the interrupt bottom-half handler (DPC) and
 * could race with the normal RPC flow, _kgspRpcRecvPoll().
 * The race is currently avoided only because the DPC runs under the GPU
 * lock, so the RPC and bottom-half handler are mutually exclusive control
 * flows.
 */
void
kgspRpcRecvEvents_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NvU32 gpuMaskUnused;
    NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
    //
    // We should never have an event with code NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS.
    // If we do, the assert will fail on NV_WARN_MORE_PROCESSING_REQUIRED,
    // in addition to general error codes.
    //
    NV_ASSERT_OK(_kgspRpcDrainEvents(pGpu, pKernelGsp, NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS, KGSP_RPC_EVENT_HANDLER_CONTEXT_INTERRUPT));
}

/*!
 * Wait for GSP-RM initialization to complete.
 */
NV_STATUS
kgspWaitForRmInitDone_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    OBJRPC *pRpc = pKernelGsp->pRpc;

    //
    // Kernel RM can timeout when GSP-RM has an error condition. Give GSP-RM
    // a chance to report the error before we pull the rug out from under it.
    //
    threadStateResetTimeout(pGpu);

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_EVENT_GSP_INIT_DONE));

    //
    // Now check if RPC really succeeded (NV_VGPU_MSG_RESULT_* are defined to
    // equivalent NV_STATUS codes in RM).
    //
    NV_ASSERT_OK_OR_RETURN(RPC_HDR->rpc_result);

    pGpu->gspRmInitialized = NV_TRUE;
    if (hypervisorIsVgxHyper() && pGpu->getProperty(pGpu, PDB_PROP_GPU_EXTENDED_GSP_RM_INITIALIZATION_TIMEOUT_FOR_VGX))
    {
        // Decrease timeout values for VGX driver
        timeoutInitializeGpuDefault(&pGpu->timeoutData, pGpu);
    }

    return NV_OK;
}

/*!
 * Execute a sequencer buffer coming from GSP
 *
 * @param[in]  pGpu             GPU object pointer
 * @param[in]  pKernelGsp       KernelGsp object pointer
 * @param[in]  pRunCpuSeqParams Sequence buffer RPC parameters
 *
 * @return NV_OK if the GSP sequencer buffer has been executed successfully
 *         NV_ERR_INVALID_STATE if the sequencer buffer is not allocated
 *         NV_ERR_INVALID_DATA if the sequencer buffer is malformed
 */
NV_STATUS
kgspExecuteSequencerBuffer_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    void *pRunCpuSeqParams
)
{
    rpc_run_cpu_sequencer_v17_00 *pParams = (rpc_run_cpu_sequencer_v17_00 *)pRunCpuSeqParams;
    NvU32 *pCmd = pParams->commandBuffer;
    NvU32 buffer_end = pParams->cmdIndex;
    NvU32 current_cmd_index = 0;
    NV_STATUS nvStatus = NV_OK;
    NvU32 payloadSize;

    NV_ASSERT_OR_RETURN(IS_GSP_CLIENT(pGpu), NV_ERR_NOT_SUPPORTED);
    NV_ASSERT_OR_RETURN((pParams->bufferSizeDWord != 0), NV_ERR_INVALID_STATE);
    NV_ASSERT_OR_RETURN(buffer_end < pParams->bufferSizeDWord, NV_ERR_INVALID_DATA);

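    //
    // The command buffer is a packed DWORD stream: each opcode is followed
    // immediately by its payload, whose length is encoded in the opcode.
    // For illustration (values hypothetical), the stream
    //   { GSP_SEQ_BUF_OPCODE_REG_WRITE, 0x1000, 0x1, GSP_SEQ_BUF_OPCODE_DELAY_US, 10 }
    // writes 0x1 to register offset 0x1000 and then delays for 10us.
    //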
    while (current_cmd_index < buffer_end)
    {
        NvU32 opCode = pCmd[current_cmd_index++];
        payloadSize = GSP_SEQUENCER_PAYLOAD_SIZE_DWORDS(opCode);

        NV_ASSERT_OR_RETURN(current_cmd_index + payloadSize <= buffer_end, NV_ERR_INVALID_DATA);

        //
        // Handling of sequencer commands is split between those commands
        // that are common to all architectures (handled directly here) and
        // those commands that are arch-specific and are handled via the
        // kgspExecuteSequencerCommand_HAL() call below.
        //
        switch (opCode)
        {
            // 2 arguments
            case GSP_SEQ_BUF_OPCODE_REG_WRITE:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_WRITE regWrite;
                portMemCopy(&regWrite, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE));

                GPU_REG_WR32(pGpu, regWrite.addr, regWrite.val);
                break;
            }

            // 3 arguments
            case GSP_SEQ_BUF_OPCODE_REG_MODIFY:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_MODIFY regModify;
                NvU32 regVal;

                portMemCopy(&regModify, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY));

                regVal = GPU_REG_RD32(pGpu, regModify.addr);
                regVal = regVal & ~regModify.mask;
                regVal = regVal | regModify.val;
                GPU_REG_WR32(pGpu, regModify.addr, regVal);
                break;
            }

            // 5 arguments
            case GSP_SEQ_BUF_OPCODE_REG_POLL:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_POLL regPoll;
                NvU32 regval;
                RMTIMEOUT timeout;

                portMemCopy(&regPoll, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL));

                regval = GPU_REG_RD32(pGpu, regPoll.addr);

                gpuSetTimeout(pGpu, regPoll.timeout, &timeout, 0);
                while ((regval & regPoll.mask) != regPoll.val)
                {
                    nvStatus = gpuCheckTimeout(pGpu, &timeout);
                    if (nvStatus == NV_ERR_TIMEOUT)
                    {
                        NV_PRINTF(LEVEL_ERROR, "Timeout waiting for register to settle, value = 0x%x, err_code = 0x%x\n",
                                  regval, regPoll.error);
                        DBG_BREAKPOINT();
                        return nvStatus;
                    }
                    osSpinLoop();
                    regval = GPU_REG_RD32(pGpu, regPoll.addr);
                }
                break;
            }

            case GSP_SEQ_BUF_OPCODE_DELAY_US:
            {
                GSP_SEQ_BUF_PAYLOAD_DELAY_US delayUs;
                portMemCopy(&delayUs, sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US));

                osDelayUs(delayUs.val);
                break;
            }

            case GSP_SEQ_BUF_OPCODE_REG_STORE:
            {
                GSP_SEQ_BUF_PAYLOAD_REG_STORE regStore;
                portMemCopy(&regStore, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE));

                NV_ASSERT_OR_RETURN(regStore.index < GSP_SEQ_BUF_REG_SAVE_SIZE, NV_ERR_INVALID_ARGUMENT);

                pParams->regSaveArea[regStore.index] = GPU_REG_RD32(pGpu, regStore.addr);
                break;
            }

            case GSP_SEQ_BUF_OPCODE_CORE_RESET:
            {
                NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);

                kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
                kflcnDisableCtxReq_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
                break;
            }

            case GSP_SEQ_BUF_OPCODE_CORE_START:
            {
                NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);

                kflcnStartCpu_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
                break;
            }

            case GSP_SEQ_BUF_OPCODE_CORE_WAIT_FOR_HALT:
            {
                NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);

                NV_ASSERT_OK_OR_RETURN(kflcnWaitForHalt_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon), GPU_TIMEOUT_DEFAULT, 0));
                break;
            }

            default:
                //
                // Route this command to the arch-specific handler.
                //
                NV_ASSERT_OK_OR_RETURN(kgspExecuteSequencerCommand_HAL(pGpu, pKernelGsp, opCode, &pCmd[current_cmd_index], payloadSize * sizeof (*pCmd)));
                break;
        }
        current_cmd_index += payloadSize;
    }

    return NV_OK;
}

#if LIBOS_LOG_DECODE_ENABLE
static void
_kgspLogPollingCallback
(
    OBJGPU *pGpu,
    void *data
)
{
    //
    // Do not take any locks in kgspDumpGspLogs. As this callback only fires when kgspNvlogFlushCb
    // is not registered, there is no possibility of data race.
    //
    KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
    kgspDumpGspLogsUnlocked(pKernelGsp, NV_FALSE);
}

NV_STATUS
kgspStartLogPolling_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    NV_STATUS status = NV_OK;

    //
    // Only enable the 1 Hz poll if we can live-decode logs in dmesg. Otherwise,
    // logs are flushed on demand by nvidia-debugdump.
    //
    if (pKernelGsp->pLogElf != NULL)
    {
        status = osSchedule1HzCallback(pGpu,
                                       _kgspLogPollingCallback,
                                       NULL,
                                       NV_OS_1HZ_REPEAT);
    }
    return status;
}

static void
_kgspStopLogPolling
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    if (pKernelGsp->pLogElf != NULL)
    {
        osRemove1HzCallback(pGpu, _kgspLogPollingCallback, NULL);
    }
}

#else // LIBOS_LOG_DECODE_ENABLE

NV_STATUS
kgspStartLogPolling_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    return NV_OK;
}

static void
_kgspStopLogPolling
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp
)
{
    return;
}
#endif // LIBOS_LOG_DECODE_ENABLE

/*!
 * Provides an opportunity to register some IntrService during intrStateInit.
 */
void
kgspRegisterIntrService_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    IntrServiceRecord pRecords[MC_ENGINE_IDX_MAX]
)
{
    NvU32 engineIdx = MC_ENGINE_IDX_GSP;

    if (!IS_GSP_CLIENT(pGpu))
        return;

    NV_ASSERT(pRecords[engineIdx].pInterruptService == NULL);
    pRecords[engineIdx].pInterruptService = staticCast(pKernelGsp, IntrService);
}

/*!
 * Service GSP interrupts.
 *
 * @returns Zero, or any implementation-chosen nonzero value. If the same nonzero value is returned enough
 *          times the interrupt is considered stuck.
 */
NvU32
kgspServiceInterrupt_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    IntrServiceServiceInterruptArguments *pParams
)
{
    NV_ASSERT_OR_RETURN(pParams != NULL, 0);
    NV_ASSERT_OR_RETURN(pParams->engineIdx == MC_ENGINE_IDX_GSP, 0);

    return kgspService_HAL(pGpu, pKernelGsp);
}

/*!
 * Calculates the GSP FW heap size based on the GPU's resources.
 */
static NvU64
_kgspCalculateFwHeapSize
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU32 maxGspFwHeapSizeMB
)
{
    // For VGPU, use the static pre-calculated size
    if (pGpu->bVgpuGspPluginOffloadEnabled)
        return GSP_FW_HEAP_SIZE_VGPU_DEFAULT;

    //
    // The baremetal heap calculation is a function of the architecture, FB
    // size, and a chunk for backing client allocations (pre-calibrated for the
    // architecture through rough profiling).
    //
    KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
    NvU64 fbSize = 0;

    NV_ASSERT_OK(kmemsysGetUsableFbSize_HAL(pGpu, pKernelMemorySystem, &fbSize));
    const NvU32 fbSizeGB = (NvU32)(NV_ALIGN_UP64(fbSize, 1 << 30) >> 30);

    //
    // Reclaimable binary data will end up padding the heap (in some cases,
    // significantly), but due to memory fragmentation we can't rely on it to
    // linearly reduce the amount needed in the primary heap, so it is not a
    // factor here. Instead, it's just extra margin to keep us from exhausting
    // the heap at runtime.
    //
    NvU64 heapSize = kgspGetFwHeapParamOsCarveoutSize_HAL(pGpu, pKernelGsp) +
                     pKernelGsp->fwHeapParamBaseSize +
                     NV_ALIGN_UP(GSP_FW_HEAP_PARAM_SIZE_PER_GB_FB * fbSizeGB, 1 << 20) +
                     NV_ALIGN_UP(GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE, 1 << 20);
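
    //
    // Illustrative arithmetic (the parameter values are chip-specific and not
    // spelled out here): the heap grows linearly with FB size via the per-GB
    // term, with each variable term rounded up to a 1MB boundary, before the
    // min/max clamps below are applied.
    //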

    // Clamp to the minimum, even if the calculations say we can do with less
    const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp);
    heapSize = NV_MAX(heapSize, (NvU64)minGspFwHeapSizeMB << 20);

    // Clamp to the maximum heap size, if necessary
    heapSize = NV_MIN(heapSize, (NvU64)maxGspFwHeapSizeMB << 20);

    NV_PRINTF(LEVEL_INFO, "GSP FW heap %lluMB of %uGB\n",
              heapSize >> 20, fbSizeGB);

    return heapSize;
}

/*!
 * Returns the size in bytes of the GSP FW heap:
 *  - the registry override, if present
 *  - otherwise, calculate the FW heap size for this GPU, limiting it to stay
 *    within the pre-scrubbed area at the end of FB, if needed
 *
 * @param[in] posteriorFbSize - size in bytes of the memory reserved between the
 *                              end of the GSP FW heap and the end of FB, or 0
 *                              to disable limiting of the heap range to within
 *                              the pre-scrubbed area at the end of FB
 */
NvU64
kgspGetFwHeapSize_IMPL
(
    OBJGPU *pGpu,
    KernelGsp *pKernelGsp,
    NvU64 posteriorFbSize
)
{
    NvU32 maxScrubbedHeapSizeMB = NV_U32_MAX;
    NvU32 heapSizeMB = 0;

    //
    // The pre-scrubbed region at the end of FB may limit the heap size, if no
    // scrubber ucode is supported to unlock the rest of memory prior to booting
    // GSP-RM.
    //
    if (!pKernelGsp->bScrubberUcodeSupported && (posteriorFbSize != 0))
    {
        const NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp);
        if (prescrubbedSize < NV_U64_MAX)
            maxScrubbedHeapSizeMB = (NvU32)((prescrubbedSize - posteriorFbSize) >> 20);
    }

    // Get the heap size override from the registry, if any
    if ((osReadRegistryDword(pGpu, NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB, &heapSizeMB) == NV_OK) &&
        (heapSizeMB != NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB_DEFAULT))
    {
        const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp);
        const NvU32 maxGspFwHeapSizeMB = NV_MIN(kgspGetMaxWprHeapSizeMB_HAL(pGpu, pKernelGsp),
                                                maxScrubbedHeapSizeMB);

        NV_ASSERT(minGspFwHeapSizeMB < maxGspFwHeapSizeMB);

        if (heapSizeMB > maxGspFwHeapSizeMB)
        {
            NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to maximum (%uMB)\n",
                      maxGspFwHeapSizeMB);
            heapSizeMB = maxGspFwHeapSizeMB;
        }
        else if (heapSizeMB < minGspFwHeapSizeMB)
        {
            NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to minimum (%uMB)\n",
                      minGspFwHeapSizeMB);
            heapSizeMB = minGspFwHeapSizeMB;
        }
        else
        {
            NV_PRINTF(LEVEL_WARNING, "Firmware heap size overridden (%uMB)\n",
                      heapSizeMB);
        }

        return ((NvU64)heapSizeMB) << 20;
    }

    return _kgspCalculateFwHeapSize(pGpu, pKernelGsp, maxScrubbedHeapSizeMB);
}

NvU64 kgspGetWprEndMargin_IMPL(OBJGPU *pGpu, KernelGsp *pKernelGsp)
{
    NvU64 wprEndMargin;
    NvU32 marginOverride = 0;
    GspFwWprMeta *pWprMeta = pKernelGsp->pWprMeta;

    (void)osReadRegistryDword(pGpu, NV_REG_STR_RM_GSP_WPR_END_MARGIN, &marginOverride);

    wprEndMargin = ((NvU64)DRF_VAL(_REG, _RM_GSP_WPR_END_MARGIN, _MB, marginOverride)) << 20;
    if (wprEndMargin == 0)
    {
        // Calculate the default margin size based on the WPR size

        //
        // This needs to be called after pWprMeta->sizeOfRadix3Elf has been initialized,
        // in order to estimate the default WPR size.
        //
        NV_ASSERT(pWprMeta->sizeOfRadix3Elf > 0);

        //
        // If the bounds are encoded in GspFwWprMeta from a prior attempt, use them.
        // Otherwise, estimate the WPR size by the sizes of the elements in the layout
        //
        if (pWprMeta->gspFwWprEnd > pWprMeta->nonWprHeapOffset)
        {
            wprEndMargin = pWprMeta->gspFwWprEnd - pWprMeta->nonWprHeapOffset;
        }
        else
        {
            wprEndMargin += kgspGetFrtsSize_HAL(pGpu, pKernelGsp);
            wprEndMargin += pKernelGsp->gspRmBootUcodeSize;
            wprEndMargin += pWprMeta->sizeOfRadix3Elf;
            wprEndMargin += kgspGetFwHeapSize(pGpu, pKernelGsp, 0);
            wprEndMargin += kgspGetNonWprHeapSize(pGpu, pKernelGsp);
        }

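        // Scale the margin with the number of failed boot attempts so each
        // retry reserves progressively more headroom.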
        if (pKernelGsp->bootAttempts > 0)
            wprEndMargin *= pKernelGsp->bootAttempts;
    }

    if (FLD_TEST_DRF(_REG, _RM_GSP_WPR_END_MARGIN, _APPLY, _ALWAYS, marginOverride) ||
        (pKernelGsp->bootAttempts > 0))
    {
        NV_PRINTF(LEVEL_WARNING, "Adding margin of 0x%llx bytes after the end of WPR2\n",
                  wprEndMargin);
        pWprMeta->flags |= GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT;
        return wprEndMargin;
    }

    // Normal boot path
    pWprMeta->flags &= ~GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT;
    return 0;
}
