1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "resserv/rs_server.h"
25 
26 #include "gpu/gsp/kernel_gsp.h"
27 
28 #include "kernel/core/thread_state.h"
29 #include "kernel/core/locks.h"
30 #include "kernel/diagnostics/gpu_acct.h"
31 #include "kernel/diagnostics/journal.h"
32 #include "kernel/gpu/fifo/kernel_channel.h"
33 #include "kernel/gpu/gsp/gsp_trace_rats_macro.h"
34 #include "kernel/gpu/intr/engine_idx.h"
35 #include "kernel/gpu/mem_mgr/heap.h"
36 #include "kernel/gpu/mem_mgr/mem_mgr.h"
37 #include "kernel/gpu/mem_sys/kern_mem_sys.h"
38 #include "kernel/gpu/rc/kernel_rc.h"
39 #include "kernel/gpu/nvlink/kernel_nvlink.h"
40 #include "virtualization/hypervisor/hypervisor.h"
41 #include "virtualization/vgpuconfigapi.h"
42 #include "kernel/gpu/disp/kern_disp.h"
43 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
44 #include "kernel/gpu/device/device.h"
45 #include "gpu/external_device/external_device.h"
46 #include "kernel/platform/platform_request_handler.h"
47 #include "class/cl2080.h" // NV20_SUBDEVICE_0
48 #include "ctrl/ctrl2080/ctrl2080nvd.h"
49 #include "liblogdecode.h"
50 #include "libelf.h"
51 #include "nverror.h"
52 #include "nvrm_registry.h"
53 #include "nv-firmware.h"
54 #include "nv-firmware-chip-family-select.h"
55 #include "nvtypes.h"
56 #include "nvVer.h"
57 #include "objrpc.h"
58 #include "objtmr.h"
59 #include "os/os.h"
60 #include "rmgspseq.h"
61 #include "sweng/dispsw.h"
62 #include "kernel/gpu/timed_sema.h"
63 #include "vgpu/rpc.h"
64 #include "kernel/gpu/pmu/kern_pmu.h"
65 #include "gpu/perf/kern_perf.h"
66 #include "core/locks.h"
67 #include "kernel/gpu/intr/intr.h"
68 
69 #define RPC_STRUCTURES
70 #define RPC_GENERIC_UNION
71 #include "g_rpc-structures.h"
72 #undef RPC_STRUCTURES
73 #undef RPC_GENERIC_UNION
74 
75 #define RPC_MESSAGE_STRUCTURES
76 #define RPC_MESSAGE_GENERIC_UNION
77 #include "g_rpc-message-header.h"
78 #undef RPC_MESSAGE_STRUCTURES
79 #undef RPC_MESSAGE_GENERIC_UNION
80 
81 #include "gpu/gsp/message_queue_priv.h"
82 
83 #include "gpu/conf_compute/conf_compute.h"
84 
85 #define RPC_HDR  ((rpc_message_header_v*)(pRpc->message_buffer))
86 
87 struct MIG_CI_UPDATE_CALLBACK_PARAMS
88 {
89     NvU32 execPartCount;
90     NvU32 execPartId[NVC637_CTRL_MAX_EXEC_PARTITIONS];
91     NvU32 gfid;
92     NvBool bDelete;
93 };
94 
95 //
96 // RPC_PARAMS defines the rpc_params pointer and initializes it to the correct
97 // sub-structure.
98 //
99 // RPC_PARAMS intentionally assigns the latest version structure to the
100 // versioned rpc_params pointer.  With the -Werror=incompatible-pointer-types
101 // compiler flag, this checks for mismatched structure versions at compile time.
102 //
103 // For example:
104 //   RPC_PARAMS(free, _v03_00);
105 // expands to
106 //   rpc_free_v03_00 *rpc_params = &RPC_HDR->rpc_message_data->free_v;
107 //
108 #define RPC_PARAMS(r, v) rpc_##r##v *rpc_params = &RPC_HDR->rpc_message_data->r##_v
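
//
// Illustrative sketch only (not compiled into the driver): how a handler
// might use RPC_PARAMS.  The "example" RPC name, its fields, and the helper
// below are hypothetical; the real handlers in this file (e.g.
// _kgspRpcRunCpuSequencer) follow the same pattern.
//
#if 0
static NV_STATUS
_exampleRpcHandler(OBJGPU *pGpu, OBJRPC *pRpc)
{
    //
    // Expands to:
    //   rpc_example_v03_00 *rpc_params = &RPC_HDR->rpc_message_data->example_v;
    // If example_v is later bumped to a newer version, the initializer's type
    // no longer matches and -Werror=incompatible-pointer-types fails the build
    // right here, flagging the stale handler.
    //
    RPC_PARAMS(example, _v03_00);

    return _exampleDoWork(pGpu, rpc_params->someField);
}
#endif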
109 
110 static NV_STATUS _kgspInitRpcInfrastructure(OBJGPU *, KernelGsp *);
111 static void _kgspFreeRpcInfrastructure(OBJGPU *, KernelGsp *);
112 
113 static NV_STATUS _kgspConstructRpcObject(OBJGPU *, KernelGsp *, MESSAGE_QUEUE_INFO *, OBJRPC **);
114 
115 static NV_STATUS _kgspRpcSendMessage(OBJGPU *, OBJRPC *);
116 static NV_STATUS _kgspRpcRecvPoll(OBJGPU *, OBJRPC *, NvU32);
117 static NV_STATUS _kgspRpcDrainEvents(OBJGPU *, KernelGsp *, NvU32, KernelGspRpcEventHandlerContext);
118 static void      _kgspRpcIncrementTimeoutCountAndRateLimitPrints(OBJGPU *, OBJRPC *);
119 
120 static NV_STATUS _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);
121 static void _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);
122 
123 static NV_STATUS _kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);
124 static void _kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);
125 
126 static void _kgspStopLogPolling(OBJGPU *pGpu, KernelGsp *pKernelGsp);
127 
128 static void _kgspFreeBootBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp);
129 
130 static NV_STATUS _kgspPrepareGspRmBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw);
131 
132 static NV_STATUS _kgspCreateSignatureMemdesc(OBJGPU *pGpu, KernelGsp *pKernelGsp,
133                                              GSP_FIRMWARE *pGspFw);
134 
135 static NV_STATUS _kgspFwContainerVerifyVersion(OBJGPU *pGpu, KernelGsp *pKernelGsp,
136                                                const void *pElfData, NvU64 elfDataSize,
137                                                const char *pNameInMsg);
138 
139 static NV_STATUS _kgspFwContainerGetSection(OBJGPU *pGpu, KernelGsp *pKernelGsp,
140                                             const void *pElfData, NvU64 elfDataSize,
141                                             const char *pSectionName,
142                                             const void **ppSectionData, NvU64 *pSectionSize);
143 
144 static NV_STATUS _kgspGetSectionNameForPrefix(OBJGPU *pGpu, KernelGsp *pKernelGsp,
145                                               char *pSectionNameBuf, NvLength sectionNameBufSize,
146                                               const char *pSectionPrefix);
147 
148 static void
149 _kgspGetActiveRpcDebugData
150 (
151     OBJRPC *pRpc,
152     NvU32 function,
153     NvU64 *data0,
154     NvU64 *data1
155 )
156 {
157     switch (function)
158     {
159         // Functions (CPU -> GSP)
160         case NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL:
161         {
162             RPC_PARAMS(gsp_rm_control, _v03_00);
163             *data0 = rpc_params->cmd;
164             *data1 = rpc_params->paramsSize;
165             break;
166         }
167         case NV_VGPU_MSG_FUNCTION_GSP_RM_ALLOC:
168         {
169             RPC_PARAMS(gsp_rm_alloc, _v03_00);
170             *data0 = rpc_params->hClass;
171             *data1 = rpc_params->paramsSize;
172             break;
173         }
174         case NV_VGPU_MSG_FUNCTION_FREE:
175         {
176             RPC_PARAMS(free, _v03_00);
177             *data0 = rpc_params->params.hObjectOld;
178             *data1 = rpc_params->params.hObjectParent;
179             break;
180         }
181 
182         // Events (CPU <- GSP)
183         case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
184         {
185             RPC_PARAMS(run_cpu_sequencer, _v17_00);
186             *data0 = rpc_params->cmdIndex;
187             *data1 = rpc_params->bufferSizeDWord;
188             break;
189         }
190         case NV_VGPU_MSG_EVENT_POST_EVENT:
191         {
192             RPC_PARAMS(post_event, _v17_00);
193             *data0 = rpc_params->notifyIndex;
194             *data1 = rpc_params->data;
195             break;
196         }
197         case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
198         {
199             RPC_PARAMS(rc_triggered, _v17_02);
200             *data0 = rpc_params->nv2080EngineType;
201             *data1 = rpc_params->exceptType;
202             break;
203         }
204         case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
205         {
206             RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);
207             *data0 = rpc_params->gfid;
208             *data1 = rpc_params->notifyIndex;
209             break;
210         }
211         case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
212         {
213             RPC_PARAMS(gsp_lockdown_notice, _v17_00);
214             *data0 = rpc_params->bLockdownEngaging;
215             *data1 = 0;
216             break;
217         }
218         case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
219         {
220             RPC_PARAMS(gsp_post_nocat_record, _v01_00);
221             const NV2080CtrlNocatJournalInsertRecord *pRecord =
222                 (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data;
223             *data0 = pRecord->recType;
224             *data1 = pRecord->errorCode;
225             break;
226         }
227 
228         default:
229         {
230             *data0 = 0;
231             *data1 = 0;
232             break;
233         }
234     }
235 }
236 
237 static NV_STATUS
238 _kgspRpcSanityCheck(OBJGPU *pGpu, KernelGsp *pKernelGsp, OBJRPC *pRpc)
239 {
240     if (pKernelGsp->bFatalError)
241     {
242         NV_PRINTF(LEVEL_INFO, "GSP crashed, skipping RPC\n");
243         //
244         // In case of a fatal GSP error, if there was an outstanding RPC at the
245         // time, we should have already printed the error for that, so this is a
246     // new RPC call. From now on, don't bother printing RPC errors anymore,
247     // as they can be too noisy and overrun the logs.
248         //
249         pRpc->bQuietPrints = NV_TRUE;
250         return NV_ERR_RESET_REQUIRED;
251     }
252     if (API_GPU_IN_RESET_SANITY_CHECK(pGpu))
253     {
254         NV_PRINTF(LEVEL_INFO, "GPU in reset, skipping RPC\n");
255         return NV_ERR_GPU_IN_FULLCHIP_RESET;
256     }
257     if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu) ||
258         pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
259     {
260         NV_PRINTF(LEVEL_INFO, "GPU lost, skipping RPC\n");
261         return NV_ERR_GPU_IS_LOST;
262     }
263     if (osIsGpuShutdown(pGpu))
264     {
265         NV_PRINTF(LEVEL_INFO, "GPU shutdown, skipping RPC\n");
266         return NV_ERR_GPU_IS_LOST;
267     }
268     if (!gpuIsGpuFullPowerForPmResume(pGpu))
269     {
270         NV_PRINTF(LEVEL_INFO, "GPU not full power, skipping RPC\n");
271         return NV_ERR_GPU_NOT_FULL_POWER;
272     }
273     if (!gpuCheckSysmemAccess(pGpu))
274     {
275         NV_PRINTF(LEVEL_INFO, "GPU has no sysmem access, skipping RPC\n");
276         return NV_ERR_INVALID_ACCESS_TYPE;
277     }
278     return NV_OK;
279 }
280 
281 static void
282 _kgspAddRpcHistoryEntry
283 (
284     OBJRPC *pRpc,
285     RpcHistoryEntry *pHistory,
286     NvU32 *pCurrent
287 )
288 {
289     NvU32 func = RPC_HDR->function;
290     NvU32 entry;
291 
292     entry = *pCurrent = (*pCurrent + 1) % RPC_HISTORY_DEPTH;
293 
294     portMemSet(&pHistory[entry], 0, sizeof(pHistory[0]));
295     pHistory[entry].function = func;
296     pHistory[entry].ts_start = osGetTimestamp();
297 
298     _kgspGetActiveRpcDebugData(pRpc, func,
299                                &pHistory[entry].data[0],
300                                &pHistory[entry].data[1]);
301 }
302 
303 static void
304 _kgspCompleteRpcHistoryEntry
305 (
306     RpcHistoryEntry *pHistory,
307     NvU32 current
308 )
309 {
310     NvU32 historyIndex;
311     NvU32 historyEntry;
312 
313     pHistory[current].ts_end = osGetTimestamp();
314 
315     //
316     // Complete any previous entries that aren't marked complete yet, using the same timestamp
317     // (we may not have explicitly waited for them)
318     //
319     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
320     {
321         historyEntry = (current + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
322         if (pHistory[historyEntry].ts_start != 0 &&
323             pHistory[historyEntry].ts_end   == 0)
324         {
325             pHistory[historyEntry].ts_end = pHistory[current].ts_end;
326         }
327     }
328 }
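
//
// Worked example of the history ring-buffer arithmetic above (illustration
// only; a depth of 8 is assumed here purely for the example):
//   - _kgspAddRpcHistoryEntry advances the cursor with
//       *pCurrent = (*pCurrent + 1) % RPC_HISTORY_DEPTH
//     so after slot 7 is written, the next entry wraps around to slot 0.
//   - _kgspCompleteRpcHistoryEntry walks backwards from the current slot with
//       (current + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH
//     e.g. with current == 2 and depth 8 it visits 2, 1, 0, 7, 6, 5, 4, 3,
//     stamping ts_end on any started-but-unfinished entries along the way.
//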
329 
330 /*!
331  * GSP client RM RPC send routine
332  */
333 static NV_STATUS
334 _kgspRpcSendMessage
335 (
336     OBJGPU *pGpu,
337     OBJRPC *pRpc
338 )
339 {
340     NV_STATUS nvStatus;
341     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
342     NvU32 gpuMaskUnused;
343 
344     NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
345 
346     NV_CHECK_OK_OR_RETURN(LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc));
347 
348     nvStatus = GspMsgQueueSendCommand(pRpc->pMessageQueueInfo, pGpu);
349     if (nvStatus != NV_OK)
350     {
351         if (nvStatus == NV_ERR_TIMEOUT ||
352             nvStatus == NV_ERR_BUSY_RETRY)
353         {
354             _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);
355         }
356         NV_PRINTF_COND(pRpc->bQuietPrints, LEVEL_INFO, LEVEL_ERROR,
357                        "GspMsgQueueSendCommand failed on GPU%d: 0x%x\n",
358                        gpuGetInstance(pGpu), nvStatus);
359         return nvStatus;
360     }
361 
362     kgspSetCmdQueueHead_HAL(pGpu, pKernelGsp, pRpc->pMessageQueueInfo->queueIdx, 0);
363 
364     _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcHistory, &pRpc->rpcHistoryCurrent);
365 
366     return NV_OK;
367 }
368 
369 static NV_STATUS
370 _kgspRpcRunCpuSequencer
371 (
372     OBJGPU *pGpu,
373     OBJRPC *pRpc
374 )
375 {
376     RPC_PARAMS(run_cpu_sequencer, _v17_00);
377     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
378 
379     return kgspExecuteSequencerBuffer(pGpu, pKernelGsp, rpc_params);
380 }
381 
382 static void
383 _kgspProcessEccNotifier
384 (
385     OBJGPU *pGpu,
386     void   *eventData
387 )
388 {
389     NV_STATUS          nvStatus     = NV_OK;
390     MemoryManager     *pMemoryMgr   = GPU_GET_MEMORY_MANAGER(pGpu);
391 
392     if (pMemoryMgr->bEnableDynamicPageOfflining)
393     {
394         Nv2080EccDbeNotification *pParams = (Nv2080EccDbeNotification*)eventData;
395         if ((nvStatus = heapStorePendingBlackList(pGpu, GPU_GET_HEAP(pGpu), pParams->physAddress,
396                                                   pParams->physAddress)) != NV_OK)
397         {
398             if (nvStatus == NV_ERR_RESET_REQUIRED)
399             {
400                 NV_PRINTF(LEVEL_INFO, "Hit the DED on the reserved region; nothing to handle in this code path.\n");
401                 NV_PRINTF(LEVEL_INFO, "Relying on the FBHUB interrupt to kill all the channels and force reset the GPU.\n");
402             }
403             else
404             {
405                 NV_PRINTF(LEVEL_INFO, "Dynamically blacklisting the DED page offset failed, status: %x\n", nvStatus);
406                 DBG_BREAKPOINT();
407             }
408         }
409 
410     }
411 }
412 
413 /*!
414  * Receive an event notification from GSP-RM.
415  *
416  * When an event fires in GSP-RM, osNotifyEvent and osEventNotification check
417  * whether the event was originally allocated from client-RM.  If so, they post
418  * it to the event queue and take no further action.  Client RM picks up the
419  * event here and handles it.
420  */
421 static NV_STATUS
422 _kgspRpcPostEvent
423 (
424     OBJGPU *pGpu,
425     OBJRPC *pRpc
426 )
427 {
428     RPC_PARAMS(post_event, _v17_00);
429     PEVENTNOTIFICATION pNotifyList  = NULL;
430     PEVENTNOTIFICATION pNotifyEvent = NULL;
431     Event             *pEvent       = NULL;
432     NV_STATUS          nvStatus     = NV_OK;
433 
434     // Get the notification list that contains this event.
435     NV_ASSERT_OR_RETURN(CliGetEventInfo(rpc_params->hClient,
436         rpc_params->hEvent, &pEvent), NV_ERR_OBJECT_NOT_FOUND);
437 
438     if (pEvent->pNotifierShare != NULL)
439         pNotifyList = pEvent->pNotifierShare->pEventList;
440 
441     NV_ASSERT_OR_RETURN(pNotifyList != NULL, NV_ERR_INVALID_POINTER);
442 
443     switch (rpc_params->notifyIndex)
444     {
445         case NV2080_NOTIFIERS_ECC_DBE:
446             _kgspProcessEccNotifier(pGpu, rpc_params->eventData);
447             break;
448     }
449 
450     // Send the event.
451     if (rpc_params->bNotifyList)
452     {
453         // Send notification to all matching events on the list.
454         nvStatus = osEventNotificationWithInfo(pGpu, pNotifyList, rpc_params->notifyIndex,
455                        rpc_params->data, rpc_params->info16, rpc_params->eventData, rpc_params->eventDataSize);
456     }
457     else
458     {
459         // Send event to a specific hEvent.  Find hEvent in the notification list.
460         for (pNotifyEvent = pNotifyList; pNotifyEvent; pNotifyEvent = pNotifyEvent->Next)
461         {
462             if (pNotifyEvent->hEvent == rpc_params->hEvent)
463             {
464                 nvStatus = osNotifyEvent(pGpu, pNotifyEvent, 0,
465                                          rpc_params->data, rpc_params->status);
466                 break;
467             }
468         }
469         NV_ASSERT_OR_RETURN(pNotifyEvent != NULL, NV_ERR_OBJECT_NOT_FOUND);
470     }
471 
472     return nvStatus;
473 }
474 
475 /*!
476  * Receive RC notification from GSP-RM.
477  *
478  * RC error handling ("Channel Teardown sequence") is executed in GSP-RM.
479  * Client notifications, OS interaction, etc. happen in CPU-RM (Kernel RM).
480  */
481 static NV_STATUS
482 _kgspRpcRCTriggered
483 (
484     OBJGPU *pGpu,
485     OBJRPC *pRpc
486 )
487 {
488     RPC_PARAMS(rc_triggered, _v17_02);
489 
490     KernelRc      *pKernelRc = GPU_GET_KERNEL_RC(pGpu);
491     KernelChannel *pKernelChannel;
492     KernelFifo    *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
493     CHID_MGR      *pChidMgr;
494     NvU32          status = NV_OK;
495     RM_ENGINE_TYPE rmEngineType = gpuGetRmEngineType(rpc_params->nv2080EngineType);
496     NvBool         bIsCcEnabled = NV_FALSE;
497 
498     // check if there's a PCI-E error pending either in device status or in AER
499     krcCheckBusError_HAL(pGpu, pKernelRc);
500 
501     //
502     // If we have received a special msg from GSP, ack back immediately that we
503     // are done writing notifiers, since we would have already processed the
504     // other RC msgs that trigger notifier writes before this one.
505     //
506     if (rpc_params->exceptType == ROBUST_CHANNEL_FAST_PATH_ERROR)
507     {
508         NV_RM_RPC_ECC_NOTIFIER_WRITE_ACK(pGpu, status);
509         NV_ASSERT_OK(status);
510         return status;
511     }
512 
513     status = kfifoGetChidMgrFromType(pGpu, pKernelFifo,
514                                      ENGINE_INFO_TYPE_RM_ENGINE_TYPE,
515                                      (NvU32)rmEngineType,
516                                      &pChidMgr);
517     if (status != NV_OK)
518         return status;
519 
520     pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
521                                                   pChidMgr,
522                                                   rpc_params->chid);
523     NV_CHECK_OR_RETURN(LEVEL_ERROR,
524                        pKernelChannel != NULL,
525                        NV_ERR_INVALID_CHANNEL);
526 
527     // Add the RcDiag records we received from GSP-RM to our system wide journal
528     {
529         OBJSYS   *pSys = SYS_GET_INSTANCE();
530         Journal  *pRcDB = SYS_GET_RCDB(pSys);
531         RmClient *pClient;
532 
533         NvU32 recordSize = rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport);
534         NvU32 rcDiagRecStart = pRcDB->RcErrRptNextIdx;
535         NvU32 rcDiagRecEnd;
536         NvU32 processId = 0;
537         NvU32 owner = RCDB_RCDIAG_DEFAULT_OWNER;
538 
539         pClient = dynamicCast(RES_GET_CLIENT(pKernelChannel), RmClient);
540         NV_ASSERT(pClient != NULL);
541         if (pClient != NULL)
542             processId = pClient->ProcID;
543 
544         for (NvU32 i = 0; i < rpc_params->rcJournalBufferSize / recordSize; i++)
545         {
546             RmRCCommonJournal_RECORD *pCommonRecord =
547                 (RmRCCommonJournal_RECORD *)((NvU8*)&rpc_params->rcJournalBuffer + i * recordSize);
548             RmRcDiag_RECORD *pRcDiagRecord =
549                 (RmRcDiag_RECORD *)&pCommonRecord[1];
550 
551 #if defined(DEBUG)
552             NV_PRINTF(LEVEL_INFO, "%d: GPUTag=0x%x CPUTag=0x%llx timestamp=0x%llx stateMask=0x%llx\n",
553                       i, pCommonRecord->GPUTag, pCommonRecord->CPUTag, pCommonRecord->timeStamp,
554                       pCommonRecord->stateMask);
555             NV_PRINTF(LEVEL_INFO, "   idx=%d timeStamp=0x%x type=0x%x flags=0x%x count=%d owner=0x%x processId=0x%x\n",
556                       pRcDiagRecord->idx, pRcDiagRecord->timeStamp, pRcDiagRecord->type, pRcDiagRecord->flags,
557                       pRcDiagRecord->count, pRcDiagRecord->owner, processId);
558             for (NvU32 j = 0; j < pRcDiagRecord->count; j++)
559             {
560                 NV_PRINTF(LEVEL_INFO, "     %d: offset=0x%08x tag=0x%08x value=0x%08x attribute=0x%08x\n",
561                           j, pRcDiagRecord->data[j].offset, pRcDiagRecord->data[j].tag,
562                           pRcDiagRecord->data[j].value, pRcDiagRecord->data[j].attribute);
563             }
564 #endif
565             if (rcdbAddRcDiagRecFromGsp(pGpu, pRcDB, pCommonRecord, pRcDiagRecord) == NULL)
566             {
567                 NV_PRINTF(LEVEL_WARNING, "Lost RC diagnostic record coming from GPU%d GSP: type=0x%x stateMask=0x%llx\n",
568                           gpuGetInstance(pGpu), pRcDiagRecord->type, pCommonRecord->stateMask);
569             }
570         }
571 
572         rcDiagRecEnd = pRcDB->RcErrRptNextIdx - 1;
573 
574         // Update records to have the correct PID associated with the channel
575         if (rcDiagRecStart != rcDiagRecEnd)
576         {
577             rcdbUpdateRcDiagRecContext(pRcDB,
578                                        rcDiagRecStart,
579                                        rcDiagRecEnd,
580                                        processId,
581                                        owner);
582         }
583     }
584 
585     bIsCcEnabled = gpuIsCCFeatureEnabled(pGpu);
586 
587     // With CC enabled, CPU-RM needs to write error notifiers
588     if (bIsCcEnabled)
589     {
590         NV_ASSERT_OK_OR_RETURN(krcErrorSetNotifier(pGpu, pKernelRc,
591                                                    pKernelChannel,
592                                                    rpc_params->exceptType,
593                                                    rmEngineType,
594                                                    rpc_params->scope));
595     }
596 
597     return krcErrorSendEventNotifications_HAL(pGpu, pKernelRc,
598         pKernelChannel,
599         rmEngineType,           // unused on kernel side
600         rpc_params->exceptType,
601         rpc_params->scope,
602         rpc_params->partitionAttributionId);
603 }
604 
605 /*!
606  * This function is called on a critical FW crash to RC all user mode channels and
607  * notify them with an error code, allowing user mode apps to fail deterministically.
608  *
609  * @param[in] pGpu        GPU object pointer
610  * @param[in] pKernelGsp  KernelGsp object pointer
611  * @param[in] exceptType  Error code to send to the RC notifiers
612  *
613  */
614 void
615 kgspRcAndNotifyAllUserChannels
616 (
617     OBJGPU    *pGpu,
618     KernelGsp *pKernelGsp,
619     NvU32      exceptType
620 )
621 {
622     KernelRc         *pKernelRc = GPU_GET_KERNEL_RC(pGpu);
623     KernelChannel    *pKernelChannel;
624     KernelFifo       *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
625     CHANNEL_ITERATOR  chanIt;
626     RMTIMEOUT         timeout;
627 
628     NV_PRINTF(LEVEL_ERROR, "RC all user channels for critical error %d.\n", exceptType);
629 
630     // Pass 1: halt all user channels.
631     kfifoGetChannelIterator(pGpu, pKernelFifo, &chanIt, INVALID_RUNLIST_ID);
632     while (kfifoGetNextKernelChannel(pGpu, pKernelFifo, &chanIt, &pKernelChannel) == NV_OK)
633     {
634         //
635         // Kernel (uvm) channels are skipped to work around nvbug 4503046, where
636         // uvm attributes all errors as global and fails operations on all GPUs,
637         // in addition to the current failing GPU.
638         //
639         if (kchannelCheckIsKernel(pKernelChannel))
640         {
641             continue;
642         }
643 
644         kfifoStartChannelHalt(pGpu, pKernelFifo, pKernelChannel);
645     }
646 
647     //
648     // Pass 2: Wait for the halts to complete, and RC notify the user channels.
649     // The channel halts require a preemption, which may not be able to complete
650     // since the GSP is no longer servicing interrupts. Wait for up to the
651     // default GPU timeout value for the preemptions to complete.
652     //
653     gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
654     kfifoGetChannelIterator(pGpu, pKernelFifo, &chanIt, INVALID_RUNLIST_ID);
655     while (kfifoGetNextKernelChannel(pGpu, pKernelFifo, &chanIt, &pKernelChannel) == NV_OK)
656     {
657         // Skip kernel (uvm) channels as only user channel halts are initiated above.
658         if (kchannelCheckIsKernel(pKernelChannel))
659         {
660             continue;
661         }
662 
663         kfifoCompleteChannelHalt(pGpu, pKernelFifo, pKernelChannel, &timeout);
664 
665         NV_ASSERT_OK(krcErrorSetNotifier(pGpu, pKernelRc,
666                                          pKernelChannel,
667                                          exceptType,
668                                          kchannelGetEngineType(pKernelChannel),
669                                          RC_NOTIFIER_SCOPE_CHANNEL));
670 
671         NV_ASSERT_OK(krcErrorSendEventNotifications_HAL(pGpu, pKernelRc,
672             pKernelChannel,
673             kchannelGetEngineType(pKernelChannel),
674             exceptType,
675             RC_NOTIFIER_SCOPE_CHANNEL,
676             0));
677     }
678 }
679 
680 /*!
681  * Receive Xid notification from GSP-RM
682  *
683  * Passes Xid errors that are triggered on GSP-RM to nvErrorLog for OS interactions
684  * (logging and OS notifications).
685  */
686 static void
687 _kgspRpcOsErrorLog
688 (
689     OBJGPU *pGpu,
690     OBJRPC *pRpc
691 )
692 {
693     RPC_PARAMS(os_error_log, _v17_00);
694 
695     KernelRc      *pKernelRc = GPU_GET_KERNEL_RC(pGpu);
696     KernelChannel *pKernelChannel = NULL;
697     KernelFifo    *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
698     CHID_MGR      *pChidMgr;
699 
700     if (rpc_params->chid != INVALID_CHID)
701     {
702         pChidMgr = kfifoGetChidMgr(pGpu, pKernelFifo, rpc_params->runlistId);
703         if (pChidMgr != NULL)
704         {
705             pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
706                                                           pChidMgr,
707                                                           rpc_params->chid);
708         }
709     }
710 
711     pKernelRc->pPreviousChannelInError = pKernelChannel;
712     nvErrorLog_va(pGpu, rpc_params->exceptType, "%s", rpc_params->errString);
713     pKernelRc->pPreviousChannelInError = NULL;
714 }
715 
716 /*!
717  * Receives RPC events containing periodic perfmon utilization samples, passing them
718  * to GPUACCT for processing.
719  */
720 static void
721 _kgspRpcGpuacctPerfmonUtilSamples
722 (
723     OBJGPU *pGpu,
724     OBJRPC *pRpc
725 )
726 {
727     OBJSYS *pSys = SYS_GET_INSTANCE();
728     GpuAccounting *pGpuAcct = SYS_GET_GPUACCT(pSys);
729     GPUACCT_GPU_INSTANCE_INFO *pGpuInstanceInfo = &pGpuAcct->gpuInstanceInfo[pGpu->gpuInstance];
730     RPC_PARAMS(gpuacct_perfmon_util_samples, _v1F_0E);
731 
732     NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS_v1F_0E *src = &rpc_params->params;
733     NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS        *dest;
734     NvU32 i;
735 
736     dest = pGpuInstanceInfo->pSamplesParams;
737     if (dest == NULL)
738     {
739         // This RPC event can be received even when the RM hasn't fully started.
740         // For instance, CPU RM can take longer than usual to initialize while the
741         // GSP RM sampling timer (a 1 sec interval) is about to tick; in that case,
742         // pSamplesParams may not have been allocated yet.
743         // Ignore this RPC event if pSamplesParams has not been allocated yet.
744         // See GPUSWSEC-1543 for more info.
745         return;
746     }
747 
748     portMemSet(dest, 0, sizeof(*dest));
749     dest->type    = src->type;
750     dest->bufSize = src->bufSize;
751     dest->count   = src->count;
752     dest->tracker = src->tracker;
753 
754     for (i = 0; i < NV2080_CTRL_PERF_GPUMON_SAMPLE_COUNT_PERFMON_UTIL_v1F_0E; i++)
755     {
756         dest->samples[i].base.timeStamp     = src->samples[i].timeStamp;
757 
758         dest->samples[i].fb.util            = src->samples[i].fb.util;
759         dest->samples[i].fb.procId          = src->samples[i].fb.procId;
760         dest->samples[i].fb.subProcessID    = src->samples[i].fb.subProcessID;
761 
762         dest->samples[i].gr.util            = src->samples[i].gr.util;
763         dest->samples[i].gr.procId          = src->samples[i].gr.procId;
764         dest->samples[i].gr.subProcessID    = src->samples[i].gr.subProcessID;
765 
766         dest->samples[i].nvenc.util         = src->samples[i].nvenc.util;
767         dest->samples[i].nvenc.procId       = src->samples[i].nvenc.procId;
768         dest->samples[i].nvenc.subProcessID = src->samples[i].nvenc.subProcessID;
769 
770         dest->samples[i].nvdec.util         = src->samples[i].nvdec.util;
771         dest->samples[i].nvdec.procId       = src->samples[i].nvdec.procId;
772         dest->samples[i].nvdec.subProcessID = src->samples[i].nvdec.subProcessID;
773     }
774 
775     gpuacctProcessGpuUtil(pGpuInstanceInfo, &dest->samples[0]);
776 }
777 
778 /*!
779  * Receives RPC events containing current GPU Boost synchronization limits
780  * that should be cached and considered in the GPU Boost algorithm and runs
781  * the algorithm.
782  */
783 static void
784 _kgspRpcPerfGpuBoostSyncLimitsCallback
785 (
786     OBJGPU *pGpu,
787     OBJRPC *pRpc
788 )
789 {
790     KernelPerf *pKernelPerf = GPU_GET_KERNEL_PERF(pGpu);
791 
792     RPC_PARAMS(perf_gpu_boost_sync_limits_callback, _v17_00);
793 
794     NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS_v17_00  *src = &rpc_params->params;
795     NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS          dest;
796     NvU32 i;
797 
798     dest.flags        = src->flags;
799     dest.bBridgeless  = src->bBridgeless;
800 
801     for (i = 0; i < NV2080_CTRL_INTERNAL_PERF_SYNC_GPU_BOOST_LIMITS_NUM; i++)
802     {
803         dest.currLimits[i] = src->currLimits[i];
804     }
805 
806     kperfDoSyncGpuBoostLimits(pGpu, pKernelPerf, &dest);
807 
808 }
809 
810 /*!
811  * Receives RPC events containing the latest change of bridgeless information
812  */
813 static void
814 _kgspRpcPerfBridgelessInfoUpdate
815 (
816     OBJGPU  *pGpu,
817     OBJRPC  *pRpc
818 )
819 {
820     RPC_PARAMS(perf_bridgeless_info_update, _v17_00);
821 
822     kPerfGpuBoostSyncBridgelessUpdateInfo(pGpu, rpc_params->bBridgeless);
823 }
824 
825 static void
826 _kgspRpcNvlinkFaultUpCallback
827 (
828     OBJGPU  *pGpu,
829     OBJRPC  *pRpc
830 )
831 {
832     RPC_PARAMS(nvlink_fault_up, _v17_00);
833 
834     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
835 
836     knvlinkHandleFaultUpInterrupt_HAL(pGpu, pKernelNvlink, rpc_params->linkId);
837 }
838 
839 static void
840 _kgspRpcNvlinkInbandReceivedData256Callback
841 (
842     OBJGPU  *pGpu,
843     OBJRPC  *pRpc
844 )
845 {
846     RPC_PARAMS(nvlink_inband_received_data_256, _v17_00);
847 
848     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_256_PARAMS_v17_00 *dest = &rpc_params->params;
849     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
850 
851     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
852 }
853 
854 static void
855 _kgspRpcNvlinkInbandReceivedData512Callback
856 (
857     OBJGPU  *pGpu,
858     OBJRPC  *pRpc
859 )
860 {
861     RPC_PARAMS(nvlink_inband_received_data_512, _v17_00);
862 
863     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_512_PARAMS_v17_00 *dest = &rpc_params->params;
864     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
865 
866     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
867 }
868 
869 static void
870 _kgspRpcNvlinkInbandReceivedData1024Callback
871 (
872     OBJGPU  *pGpu,
873     OBJRPC  *pRpc
874 )
875 {
876     RPC_PARAMS(nvlink_inband_received_data_1024, _v17_00);
877 
878     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_1024_PARAMS_v17_00 *dest = &rpc_params->params;
879     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
880 
881     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
882 }
883 
884 static void
885 _kgspRpcNvlinkInbandReceivedData2048Callback
886 (
887     OBJGPU  *pGpu,
888     OBJRPC  *pRpc
889 )
890 {
891     RPC_PARAMS(nvlink_inband_received_data_2048, _v17_00);
892 
893     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_2048_PARAMS_v17_00 *dest = &rpc_params->params;
894     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
895 
896     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
897 }
898 
899 static void
900 _kgspRpcNvlinkInbandReceivedData4096Callback
901 (
902     OBJGPU  *pGpu,
903     OBJRPC  *pRpc
904 )
905 {
906     RPC_PARAMS(nvlink_inband_received_data_4096, _v17_00);
907 
908     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_4096_PARAMS_v17_00 *dest = &rpc_params->params;
909     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
910 
911     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
912 }
913 
914 /*!
915  * CPU-RM: Receive GPU Degraded status from GSP
916  */
917 static void
918 _kgspRpcEventIsGpuDegradedCallback
919 (
920     OBJGPU  *pGpu,
921     OBJRPC  *pRpc
922 )
923 {
924     RPC_PARAMS(nvlink_is_gpu_degraded, _v17_00);
925     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
926     NV2080_CTRL_NVLINK_IS_GPU_DEGRADED_PARAMS_v17_00 *dest = &rpc_params->params;
927 
928     if(dest->bIsGpuDegraded)
929     {
930         knvlinkSetDegradedMode(pGpu, pKernelNvlink, dest->linkId);
931     }
932 }
933 
934 static void
935 _kgspRpcNvlinkFatalErrorRecoveryCallback
936 (
937     OBJGPU  *pGpu,
938     OBJRPC  *pRpc
939 )
940 {
941     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
942     NV_ASSERT_OK(knvlinkFatalErrorRecovery(pGpu, pKernelNvlink));
943 }
944 
945 /*!
946  * Receive MMU fault queue notification from GSP-RM.
947  *
948  * Non-replayable fault handling is split between GSP-RM and the UVM driver.
949  * GSP-RM copies designated faults to the UVM driver's shadow buffer,
950  * and sends a notification.  CPU-RM, in turn, needs to notify the UVM
951  * driver (schedule the UVM ISR to be run).
952  */
953 static NV_STATUS
954 _kgspRpcMMUFaultQueued(
955     OBJGPU *pGpu,
956     OBJRPC *pRpc
957 )
958 {
959     osQueueMMUFaultHandler(pGpu);
960 
961     return NV_OK;
962 }
963 
964 static NV_STATUS
965 _kgspRpcSimRead
966 (
967     OBJGPU *pGpu,
968     OBJRPC *pRpc
969 )
970 {
971     RPC_PARAMS(sim_read, _v1E_01);
972     if (IS_SIMULATION(pGpu))
973     {
974         const NvU32 count = rpc_params->index + (rpc_params->count / sizeof(NvU32));
975         NvU32 i;
976 
977         KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
978 
979         NV_ASSERT_OR_RETURN(rpc_params->count <= sizeof(pKernelGsp->pSimAccessBuf->data), NV_ERR_BUFFER_TOO_SMALL);
980 
981         for (i = rpc_params->index; i < count; i++)
982         {
983             NvU32 data;
984             gpuSimEscapeRead(pGpu, rpc_params->path, i, 4, &data);
985             pKernelGsp->pSimAccessBuf->data[i] = data;
986         }
987 
988         pKernelGsp->pSimAccessBuf->seq++;
989         return NV_OK;
990     }
991 
992     return NV_ERR_NOT_SUPPORTED;
993 }
994 
995 static NV_STATUS
996 _kgspRpcSimWrite
997 (
998     OBJGPU *pGpu,
999     OBJRPC *pRpc
1000 )
1001 {
1002     RPC_PARAMS(sim_write, _v1E_01);
1003     if (IS_SIMULATION(pGpu))
1004     {
1005         KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
1006 
1007         gpuSimEscapeWrite(pGpu, rpc_params->path, rpc_params->index, rpc_params->count, rpc_params->data);
1008         pKernelGsp->pSimAccessBuf->seq++;
1009         return NV_OK;
1010     }
1011 
1012     return NV_ERR_NOT_SUPPORTED;
1013 }
1014 
1015 static NV_STATUS
1016 _kgspRpcSemaphoreScheduleCallback(
1017     OBJGPU *pGpu,
1018     OBJRPC *pRpc
1019 )
1020 {
1021     RPC_PARAMS(semaphore_schedule_callback, _v17_00);
1022     NV_STATUS status;
1023     RsClient *pClient;
1024     Device *pDevice;
1025 
1026     status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
1027     if (status != NV_OK)
1028         return status;
1029 
1030     status = deviceGetByHandle(pClient, rpc_params->hEvent, &pDevice);
1031     if (status != NV_OK)
1032         return status;
1033 
1034     return dispswReleaseSemaphoreAndNotifierFill(pGpu,
1035                                                  rpc_params->GPUVA,
1036                                                  rpc_params->hVASpace,
1037                                                  rpc_params->ReleaseValue,
1038                                                  rpc_params->Flags,
1039                                                  rpc_params->completionStatus,
1040                                                  pDevice);
1041 }
1042 
1043 static NV_STATUS
1044 _kgspRpcTimedSemaphoreRelease(
1045     OBJGPU *pGpu,
1046     OBJRPC *pRpc
1047 )
1048 {
1049     RPC_PARAMS(timed_semaphore_release, _v01_00);
1050     NV_STATUS status;
1051     RsClient *pClient;
1052     Device *pDevice;
1053 
1054     status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
1055     if (status != NV_OK)
1056         return status;
1057 
1058     status = deviceGetByHandle(pClient, rpc_params->hDevice, &pDevice);
1059     if (status != NV_OK)
1060         return status;
1061 
1062     return tsemaRelease_HAL(pGpu,
1063                             rpc_params->semaphoreVA,
1064                             rpc_params->notifierVA,
1065                             rpc_params->hVASpace,
1066                             rpc_params->releaseValue,
1067                             rpc_params->completionStatus,
1068                             pDevice);
1069 }
1070 
1071 
1072 static NV_STATUS
1073 _kgspRpcUcodeLibosPrint
1074 (
1075     OBJGPU *pGpu,
1076     OBJRPC *pRpc
1077 )
1078 {
1079     RPC_PARAMS(ucode_libos_print, _v1E_08);
1080 
1081     // Check ucodes registered with the libos print mechanism
1082     switch (rpc_params->ucodeEngDesc)
1083     {
1084         case ENG_PMU:
1085         {
1086             KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);
1087             NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelPmu != NULL, NV_ERR_OBJECT_NOT_FOUND);
1088 
1089             kpmuLogBuf(pGpu, pKernelPmu,
1090                        rpc_params->libosPrintBuf, rpc_params->libosPrintBufSize);
1091 
1092             return NV_OK;
1093         }
1094         default:
1095             NV_ASSERT_FAILED("Attempting to use libos prints with an unsupported ucode!\n");
1096             return NV_ERR_NOT_SUPPORTED;
1097     }
1098 }
1099 
1100 static NV_STATUS
1101 _kgspRpcVgpuGspPluginTriggered
1102 (
1103     OBJGPU *pGpu,
1104     OBJRPC *pRpc
1105 )
1106 {
1107     RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);
1108 
1109     if (!IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
1110         return NV_ERR_NOT_SUPPORTED;
1111 
1112     gpuGspPluginTriggeredEvent(pGpu, rpc_params->gfid, rpc_params->notifyIndex);
1113     return NV_OK;
1114 }
1115 
1116 static NV_STATUS
1117 _kgspRpcGspVgpuConfig
1118 (
1119     OBJGPU *pGpu,
1120     OBJRPC *pRpc
1121 )
1122 {
1123     RPC_PARAMS(vgpu_config_event, _v17_00);
1124 
1125     NV_ASSERT_OR_RETURN(rpc_params->notifyIndex < NVA081_NOTIFIERS_MAXCOUNT,
1126                         NV_ERR_INVALID_ARGUMENT);
1127 
1128     CliNotifyVgpuConfigEvent(pGpu, rpc_params->notifyIndex);
1129 
1130     return NV_OK;
1131 }
1132 
1133 static NV_STATUS
1134 _kgspRpcGspExtdevIntrService
1135 (
1136     OBJGPU *pGpu,
1137     OBJRPC *pRpc
1138 )
1139 {
1140     RPC_PARAMS(extdev_intr_service, _v17_00);
1141 
1142     extdevGsyncService(pGpu, rpc_params->lossRegStatus, rpc_params->gainRegStatus, rpc_params->miscRegStatus, rpc_params->rmStatus);
1143 
1144     return NV_OK;
1145 }
1146 
1147 static void
1148 _kgspRpcMigCiConfigUpdateCallback
1149 (
1150     NvU32 gpuInstance,
1151     void *pArgs
1152 )
1153 {
1154     OBJGPU *pGpu = gpumgrGetGpu(gpuInstance);
1155     KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
1156     struct MIG_CI_UPDATE_CALLBACK_PARAMS * pParams = (struct MIG_CI_UPDATE_CALLBACK_PARAMS *)pArgs;
1157 
1158     kmigmgrUpdateCiConfigForVgpu(pGpu, pKernelMIGManager,
1159                                  pParams->execPartCount, pParams->execPartId,
1160                                  pParams->gfid, pParams->bDelete);
1161 
1162     return;
1163 }
1164 
1165 static NV_STATUS
1166 _kgspRpcMigCiConfigUpdate
1167 (
1168     OBJGPU *pGpu,
1169     OBJRPC *pRpc
1170 )
1171 {
1172     NV_STATUS status;
1173     struct MIG_CI_UPDATE_CALLBACK_PARAMS *pParams;
1174 
1175     RPC_PARAMS(vgpu_gsp_mig_ci_config, _v21_03);
1176 
1177     NV_ASSERT_OR_RETURN(rpc_params->execPartCount <= NVC637_CTRL_MAX_EXEC_PARTITIONS,
1178                         NV_ERR_INVALID_ARGUMENT);
1179 
1180     pParams = portMemAllocNonPaged(sizeof(struct MIG_CI_UPDATE_CALLBACK_PARAMS));
1181     if (pParams == NULL)
1182     {
1183         return NV_ERR_NO_MEMORY;
1184     }
1185 
1186     pParams->execPartCount = rpc_params->execPartCount;
1187     portMemCopy(pParams->execPartId, (sizeof(NvU32) * rpc_params->execPartCount),
1188                 rpc_params->execPartId, (sizeof(NvU32) * rpc_params->execPartCount));
1189     pParams->gfid = rpc_params->gfid;
1190     pParams->bDelete = rpc_params->bDelete;
1191     status = osQueueWorkItemWithFlags(pGpu,
1192                                       _kgspRpcMigCiConfigUpdateCallback,
1193                                       (void *)pParams,
1194                                       OS_QUEUE_WORKITEM_FLAGS_LOCK_API_RW | OS_QUEUE_WORKITEM_FLAGS_LOCK_GPUS_RW);
1195     if (status != NV_OK)
1196     {
1197         portMemFree(pParams);
1198     }
1199 
1200     return status;
1201 }
1202 
1203 static void
1204 _kgspRpcGspUpdateTrace
1205 (
1206     OBJGPU *pGpu,
1207     OBJRPC *pRpc
1208 )
1209 {
1210 #if KERNEL_GSP_TRACING_RATS_ENABLED
1211     RPC_PARAMS(update_gsp_trace, _v01_00);
1212     NvU32 i;
1213     NV_RATS_GSP_TRACE_RECORD *GspTraceRecords = (NV_RATS_GSP_TRACE_RECORD*) (&rpc_params->data);
1214     for (i = 0; i < rpc_params->records; i++)
1215     {
1216         gspTraceEventBufferLogRecord(pGpu, &GspTraceRecords[i]);
1217     }
1218 #endif
1219 }
1220 
1221 static void
1222 _kgspRpcGspPostNocatRecord
1223 (
1224     OBJGPU *pGpu,
1225     OBJRPC *pRpc
1226 )
1227 {
1228     OBJSYS                  *pSys = SYS_GET_INSTANCE();
1229     Journal                 *pRcdb = SYS_GET_RCDB(pSys);
1230     NOCAT_JOURNAL_PARAMS    newEntry;
1231     const NV2080CtrlNocatJournalInsertRecord *pRecord = NULL;
1232     RPC_PARAMS(gsp_post_nocat_record, _v01_00);
1233 
1234     // make a pointer to the record.
1235     pRecord = (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data;
1236 
1237     portMemSet(&newEntry, 0, sizeof(newEntry));
1238     newEntry.timestamp          = pRecord->timestamp;
1239     newEntry.recType            = pRecord->recType;
1240     newEntry.bugcheck           = pRecord->bugcheck;
1241     newEntry.pSource            = pRecord->source;
1242     newEntry.subsystem          = pRecord->subsystem;
1243     newEntry.errorCode          = pRecord->errorCode;
1244     newEntry.diagBufferLen      = pRecord->diagBufferLen;
1245     newEntry.pDiagBuffer        = pRecord->diagBuffer;
1246     newEntry.pFaultingEngine    = pRecord->faultingEngine;
1247     newEntry.tdrReason          = pRecord->tdrReason;
1248 
1249     (void)rcdbNocatInsertNocatError(pGpu, &newEntry);
1250     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_RPC_INSERT_RECORDS_IDX]++;
1251 }
1252 
1253 static NV_STATUS
1254 _kgspRpcRgLineIntr
1255 (
1256     OBJGPU *pGpu,
1257     OBJRPC *pRpc
1258 )
1259 {
1260     RPC_PARAMS(rg_line_intr, _v17_00);
1261 
1262     KernelDisplay *pKernelDisplay = GPU_GET_KERNEL_DISPLAY(pGpu);
1263     NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelDisplay != NULL, NV_ERR_OBJECT_NOT_FOUND);
1264 
1265     kdispInvokeRgLineCallback(pKernelDisplay, rpc_params->head, rpc_params->rgIntr, NV_FALSE);
1266 
1267     return NV_OK;
1268 }
1269 
1270 static NV_STATUS
1271 _kgspRpcEventPlatformRequestHandlerStateSyncCallback
1272 (
1273     OBJGPU* pGpu,
1274     OBJRPC* pRpc
1275 )
1276 {
1277     OBJSYS *pSys = SYS_GET_INSTANCE();
1278     PlatformRequestHandler* pPlatformRequestHandler
1279                  = SYS_GET_PFM_REQ_HNDLR(pSys);
1280 
1281     RPC_PARAMS(pfm_req_hndlr_state_sync_callback, _v21_04);
1282 
1283     NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS_v21_04  *src = &rpc_params->params;
1284     NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS          dst = { 0 };
1285 
1286     dst.flags         = src->flags;
1287     dst.syncData.type = src->syncData.type;
1288 
1289     // Copy in the rpc data
1290     switch (src->syncData.type)
1291     {
1292         case NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_DATA_TYPE_SMBPBI:
1293         {
1294             dst.syncData.data.smbpbi.sensorId =
1295                 src->syncData.data.smbpbi.sensorId;
1296             dst.syncData.data.smbpbi.limit =
1297                 src->syncData.data.smbpbi.limit;
1298             break;
1299         }
1300         default:
1301         {
1302             // Nothing for now
1303             break;
1304         }
1305     }
1306 
1307     pfmreqhndlrStateSync(pPlatformRequestHandler, pGpu, &dst);
1308     return NV_OK;
1309 }
1310 
1311 static void
1312 _kgspRpcGspLockdownNotice
1313 (
1314     OBJGPU *pGpu,
1315     OBJRPC *pRpc
1316 )
1317 {
1318     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
1319     RPC_PARAMS(gsp_lockdown_notice, _v17_00);
1320 
1321     //
1322     // While the GSP is in lockdown, we cannot access some of its registers,
1323     // including interrupt status and control. We shouldn't receive any more
1324     // SWGEN0 interrupts while the core is in lockdown.
1325     //
1326     pKernelGsp->bInLockdown = rpc_params->bLockdownEngaging;
1327 
1328     NV_PRINTF(LEVEL_INFO, "GSP lockdown %s\n",
1329               pKernelGsp->bInLockdown ? "engaged" : "disengaged");
1330 }
1331 
1332 static
1333 const char *_getRpcName
1334 (
1335     NvU32 id
1336 )
1337 {
1338     static const char *rpcName[] =
1339         {
1340             #define X(UNIT, a, VAL) #a,
1341             #define E(a, VAL) #a,
1342             #undef _RPC_GLOBAL_ENUMS_H_
1343             #include "vgpu/rpc_global_enums.h"
1344             #undef X
1345             #undef E
1346         };
1347 
1348     if (id < NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS)
1349     {
1350         return rpcName[id];
1351     }
1352     else if ((id > NV_VGPU_MSG_EVENT_FIRST_EVENT) && (id < NV_VGPU_MSG_EVENT_NUM_EVENTS))
1353     {
1354         NvU32 index = id - (NV_VGPU_MSG_EVENT_FIRST_EVENT - NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS) + 1;
1355         return rpcName[index];
1356     }
1357 
1358     return "Unknown";
1359 }
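
//
// Sketch of the X-macro trick used above (illustration only; the actual
// entries live in vgpu/rpc_global_enums.h and the names/values shown here are
// hypothetical).  If the header contained lines such as:
//
//   X(RM, GSP_RM_CONTROL, <some value>)
//   E(POST_EVENT, <some value>)
//
// then, with the X/E definitions above, each line expands to just the quoted
// second argument, so rpcName[] is built as:
//
//   { ..., "GSP_RM_CONTROL", ..., "POST_EVENT", ... }
//
// Function ids index the table directly; event ids are remapped because their
// enum values start at NV_VGPU_MSG_EVENT_FIRST_EVENT while their name strings
// are stored immediately after the function names.
//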
1360 
1361 /*!
1362  * GSP client RM: process RPC events
1363  */
1364 static void
1365 _kgspProcessRpcEvent
1366 (
1367     OBJGPU *pGpu,
1368     OBJRPC *pRpc,
1369     KernelGspRpcEventHandlerContext rpcHandlerContext
1370 )
1371 {
1372     rpc_message_header_v *pMsgHdr = RPC_HDR;
1373     NV_STATUS nvStatus = NV_OK;
1374     NvU32 event = pMsgHdr->function;
1375 
1376     NV_PRINTF(LEVEL_INFO, "received event from GPU%d: 0x%x (%s) status: 0x%x size: %d\n",
1377               gpuGetInstance(pGpu), event, _getRpcName(event), pMsgHdr->rpc_result, pMsgHdr->length);
1378 
1379     _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcEventHistory, &pRpc->rpcEventHistoryCurrent);
1380 
1381     /*
1382      * Shortlist of RPCs called during GSP bootup that have been manually
1383      * screened to be safe to handle without the API lock
1384      */
1385     if ((rpcHandlerContext == KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP) &&
1386         (!rmapiLockIsOwner()))
1387     {
1388         switch(pMsgHdr->function)
1389         {
1390             case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
1391             case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
1392             case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
1393             case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
1394             case NV_VGPU_MSG_EVENT_GSP_INIT_DONE:
1395             case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
1396                 break;
1397             default:
1398                 NV_PRINTF(LEVEL_ERROR, "Attempted to process RPC event from GPU%d: 0x%x (%s) during bootup without API lock\n",
1399                         gpuGetInstance(pGpu), event, _getRpcName(event));
1400                 NV_ASSERT(0);
1401                 goto done;
1402         }
1403     }
1404 
1405     switch(event)
1406     {
1407         case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
1408             nvStatus = _kgspRpcRunCpuSequencer(pGpu, pRpc);
1409             break;
1410 
1411         case NV_VGPU_MSG_EVENT_POST_EVENT:
1412             nvStatus = _kgspRpcPostEvent(pGpu, pRpc);
1413             break;
1414 
1415         case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
1416             nvStatus = _kgspRpcRCTriggered(pGpu, pRpc);
1417             break;
1418 
1419         case NV_VGPU_MSG_EVENT_MMU_FAULT_QUEUED:
1420             nvStatus = _kgspRpcMMUFaultQueued(pGpu, pRpc);
1421             break;
1422 
1423         case NV_VGPU_MSG_EVENT_SIM_READ:
1424             nvStatus = _kgspRpcSimRead(pGpu, pRpc);
1425             break;
1426 
1427         case NV_VGPU_MSG_EVENT_SIM_WRITE:
1428             nvStatus = _kgspRpcSimWrite(pGpu, pRpc);
1429             break;
1430 
1431         case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
1432             _kgspRpcOsErrorLog(pGpu, pRpc);
1433             break;
1434 
1435         case NV_VGPU_MSG_EVENT_GPUACCT_PERFMON_UTIL_SAMPLES:
1436             _kgspRpcGpuacctPerfmonUtilSamples(pGpu, pRpc);
1437             break;
1438 
1439         case NV_VGPU_MSG_EVENT_PERF_GPU_BOOST_SYNC_LIMITS_CALLBACK:
1440             _kgspRpcPerfGpuBoostSyncLimitsCallback(pGpu, pRpc);
1441             break;
1442 
1443         case NV_VGPU_MSG_EVENT_PERF_BRIDGELESS_INFO_UPDATE:
1444             _kgspRpcPerfBridgelessInfoUpdate(pGpu, pRpc);
1445             break;
1446 
1447         case NV_VGPU_MSG_EVENT_SEMAPHORE_SCHEDULE_CALLBACK:
1448             _kgspRpcSemaphoreScheduleCallback(pGpu, pRpc);
1449             break;
1450 
1451         case NV_VGPU_MSG_EVENT_TIMED_SEMAPHORE_RELEASE:
1452             _kgspRpcTimedSemaphoreRelease(pGpu, pRpc);
1453             break;
1454 
1455         case NV_VGPU_MSG_EVENT_NVLINK_FAULT_UP:
1456             _kgspRpcNvlinkFaultUpCallback(pGpu, pRpc);
1457              break;
1458 
1459         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_256:
1460             _kgspRpcNvlinkInbandReceivedData256Callback(pGpu, pRpc);
1461             break;
1462 
1463         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_512:
1464             _kgspRpcNvlinkInbandReceivedData512Callback(pGpu, pRpc);
1465             break;
1466 
1467         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_1024:
1468             _kgspRpcNvlinkInbandReceivedData1024Callback(pGpu, pRpc);
1469             break;
1470 
1471         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_2048:
1472             _kgspRpcNvlinkInbandReceivedData2048Callback(pGpu, pRpc);
1473             break;
1474 
1475         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_4096:
1476             _kgspRpcNvlinkInbandReceivedData4096Callback(pGpu, pRpc);
1477             break;
1478 
1479         case NV_VGPU_MSG_EVENT_NVLINK_FATAL_ERROR_RECOVERY:
1480             _kgspRpcNvlinkFatalErrorRecoveryCallback(pGpu, pRpc);
1481             break;
1482 
1483         case NV_VGPU_MSG_EVENT_NVLINK_IS_GPU_DEGRADED :
1484             _kgspRpcEventIsGpuDegradedCallback(pGpu, pRpc);
1485             break;
1486 
1487         case NV_VGPU_MSG_EVENT_RG_LINE_INTR:
1488             _kgspRpcRgLineIntr(pGpu, pRpc);
1489             break;
1490 
1491         case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
1492             nvStatus = _kgspRpcUcodeLibosPrint(pGpu, pRpc);
1493             break;
1494 
1495         case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
1496             nvStatus = _kgspRpcVgpuGspPluginTriggered(pGpu, pRpc);
1497             break;
1498 
1499         case NV_VGPU_MSG_EVENT_VGPU_CONFIG:
1500             nvStatus = _kgspRpcGspVgpuConfig(pGpu, pRpc);
1501             break;
1502 
1503         case NV_VGPU_MSG_EVENT_EXTDEV_INTR_SERVICE:
1504             nvStatus = _kgspRpcGspExtdevIntrService(pGpu, pRpc);
1505             break;
1506 
1507         case NV_VGPU_MSG_EVENT_PFM_REQ_HNDLR_STATE_SYNC_CALLBACK:
1508             nvStatus = _kgspRpcEventPlatformRequestHandlerStateSyncCallback(pGpu, pRpc);
1509             break;
1510 
1511         case NV_VGPU_MSG_EVENT_MIG_CI_CONFIG_UPDATE:
1512             nvStatus = _kgspRpcMigCiConfigUpdate(pGpu, pRpc);
1513             break;
1514 
1515         case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
1516             _kgspRpcGspLockdownNotice(pGpu, pRpc);
1517             break;
1518 
1519         case NV_VGPU_MSG_EVENT_UPDATE_GSP_TRACE:
1520             _kgspRpcGspUpdateTrace(pGpu, pRpc);
1521             break;
1522 
1523         case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
1524             _kgspRpcGspPostNocatRecord(pGpu, pRpc);
1525             break;
1526 
1527         case NV_VGPU_MSG_EVENT_GSP_INIT_DONE:   // Handled by _kgspRpcRecvPoll.
1528         default:
1529             //
1530             // Log, but otherwise ignore unexpected events.
1531             //
1532             // We will get here if the previous RPC timed out.  The response
1533             // eventually comes in as an unexpected event.  The error handling
1534             // for the timeout should have already happened.
1535             //
1536             NV_PRINTF(LEVEL_ERROR, "Unexpected RPC event from GPU%d: 0x%x (%s)\n",
1537                       gpuGetInstance(pGpu), event, _getRpcName(event));
1538             break;
1539     }
1540 
1541     if (nvStatus != NV_OK)
1542     {
1543         //
1544         // Failing to properly handle a specific event does not mean we should stop
1545         // processing events/RPCs, so print the error and soldier on.
1546         //
1547         NV_PRINTF(LEVEL_ERROR,
1548                   "Failed to process received event 0x%x (%s) from GPU%d: status=0x%x\n",
1549                   event, _getRpcName(event), gpuGetInstance(pGpu), nvStatus);
1550     }
1551 
1552 done:
1553     _kgspCompleteRpcHistoryEntry(pRpc->rpcEventHistory, pRpc->rpcEventHistoryCurrent);
1554 }
1555 
1556 /*!
1557  * Handle a single RPC event from GSP unless the event is [an RPC return for] expectedFunc,
1558  * or there are no events available in the buffer.
1559  *
1560  * @return
1561  *   NV_OK                              if the event is successfully handled.
1562  *   NV_WARN_NOTHING_TO_DO              if there are no events available.
1563  *   NV_WARN_MORE_PROCESSING_REQUIRED   if the event is expectedFunc: it is unhandled and in the staging area.
1564  *   (Another status)                   if event reading fails.
1565  */
1566 static NV_STATUS
1567 _kgspRpcDrainOneEvent
1568 (
1569     OBJGPU          *pGpu,
1570     OBJRPC          *pRpc,
1571     NvU32            expectedFunc,
1572     KernelGspRpcEventHandlerContext rpcHandlerContext
1573 )
1574 {
1575     NV_STATUS nvStatus;
1576 
1577     // Issue a memory barrier to ensure we see any queue updates.
1578     // Note: Without the fence, the CPU may get stuck in an infinite loop
1579     //       waiting for a message that has already arrived.
1580     portAtomicMemoryFenceFull();
1581 
1582     nvStatus = GspMsgQueueReceiveStatus(pRpc->pMessageQueueInfo, pGpu);
1583 
1584     if (nvStatus == NV_OK)
1585     {
1586         rpc_message_header_v *pMsgHdr = RPC_HDR;
1587 
1588         if (pMsgHdr->function == expectedFunc)
1589             return NV_WARN_MORE_PROCESSING_REQUIRED;
1590 
1591         _kgspProcessRpcEvent(pGpu, pRpc, rpcHandlerContext);
1592     }
1593 
1594     //
1595     // We don't expect NV_WARN_MORE_PROCESSING_REQUIRED here.
1596     // If we get it we need to suppress it to avoid confusing our caller, for whom it has special meaning.
1597     //
1598     NV_ASSERT_OR_ELSE(nvStatus != NV_WARN_MORE_PROCESSING_REQUIRED,
1599         nvStatus = NV_ERR_GENERIC);
1600 
1601     return nvStatus;
1602 }
1603 
1604 /*!
1605  * Handle RPC events from GSP until the event is [an RPC return for] expectedFunc,
1606  * or there are no events available in the buffer.
1607  *
1608  * Also dump GSP logs, and check for severe errors coming from GSP.
1609  *
1610  * @return
1611  *   NV_OK                              if one or more events are handled and there are none left.
1612  *   NV_WARN_MORE_PROCESSING_REQUIRED   if an expectedFunc event is found: it is unhandled and in the staging area.
1613  *                                        (Zero or more preceding events were successfully handled.)
1614  *   (Another status)                   if event reading or processing fails.
1615  */
1616 static NV_STATUS
1617 _kgspRpcDrainEvents
1618 (
1619     OBJGPU    *pGpu,
1620     KernelGsp *pKernelGsp,
1621     NvU32      expectedFunc,
1622     KernelGspRpcEventHandlerContext rpcHandlerContext
1623 )
1624 {
1625     NV_STATUS nvStatus = NV_OK;
1626     OBJRPC *pRpc = GPU_GET_RPC(pGpu);
1627 
1628     while (nvStatus == NV_OK)
1629     {
1630         nvStatus = _kgspRpcDrainOneEvent(pGpu, pRpc, expectedFunc, rpcHandlerContext);
1631         kgspDumpGspLogs(pKernelGsp, NV_FALSE);
1632     }
1633 
1634     // If GSP-RM has died, the GPU will need to be reset
1635     if (!kgspHealthCheck_HAL(pGpu, pKernelGsp))
1636         return NV_ERR_RESET_REQUIRED;
1637 
1638     if (nvStatus == NV_WARN_NOTHING_TO_DO)
1639         nvStatus = NV_OK;
1640 
1641     return nvStatus;
1642 }
1643 
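/*!
 * Convert a raw timestamp delta into a printable duration.
 *
 * Returns the scaled value and sets *pDurationUnitsChar to 'u', 'm', or ' '
 * so the caller can append 's' to print "us", "ms", or " s".
 */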
1644 static NvU64
1645 _tsDiffToDuration
1646 (
1647     NvU64 duration,
1648     char *pDurationUnitsChar
1649 )
1650 {
1651     const NvU64 tsFreqUs = osGetTimestampFreq() / 1000000;
1652 
1653     *pDurationUnitsChar = 'u';
1654 
1655     NV_ASSERT_OR_RETURN(tsFreqUs > 0, 0);
1656 
1657     duration /= tsFreqUs;
1658 
1659     // 999999us then 1000ms
1660     if (duration >= 1000000)
1661     {
1662         duration /= 1000;
1663         *pDurationUnitsChar = 'm';
1664     }
1665 
1666     // 9999ms then 10s
1667     if ((*pDurationUnitsChar == 'm') && (duration >= 10000))
1668     {
1669         duration /= 1000;
1670         *pDurationUnitsChar = ' '; // so caller can always just append 's'
1671     }
1672 
1673     return duration;
1674 }
1675 
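/*!
 * Check whether a timestamp falls within one of the RPCs recorded in the
 * CPU -> GSP history. When bCheckIncompleteRpcsOnly is set, only RPCs that
 * have not yet completed (ts_end == 0) are considered.
 */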
1676 static NvBool
1677 _kgspIsTimestampDuringRecentRpc
1678 (
1679     OBJRPC *pRpc,
1680     NvU64 timestamp,
1681     NvBool bCheckIncompleteRpcsOnly
1682 )
1683 {
1684     NvU32 historyIndex;
1685     NvU32 historyEntry;
1686 
1687     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1688     {
1689         historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1690         if (pRpc->rpcHistory[historyEntry].function != 0)
1691         {
1692             if ((timestamp >= pRpc->rpcHistory[historyEntry].ts_start) &&
1693                 ((pRpc->rpcHistory[historyEntry].ts_end == 0) ||
1694                  (!bCheckIncompleteRpcsOnly && (timestamp <= pRpc->rpcHistory[historyEntry].ts_end))))
1695             {
1696                 return NV_TRUE;
1697             }
1698         }
1699     }
1700 
1701     return NV_FALSE;
1702 }
1703 
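/*!
 * Log a single non-empty RPC history entry as one row of the RPC history dump,
 * including its duration when both start and end timestamps are known.
 */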
1704 static void
1705 _kgspLogRpcHistoryEntry
1706 (
1707     OBJGPU *pGpu,
1708     NvU32 errorNum,
1709     NvU32 historyIndex,
1710     RpcHistoryEntry *pEntry,
1711     NvBool lastColumnCondition
1712 )
1713 {
1714     NvU64 duration;
1715     char  durationUnitsChar;
1716 
1717     if (pEntry->function != 0)
1718     {
1719         duration = (pEntry->ts_end > pEntry->ts_start) ? (pEntry->ts_end - pEntry->ts_start) : 0;
1720         if (duration)
1721         {
1722             duration = _tsDiffToDuration(duration, &durationUnitsChar);
1723 
1724             NV_ERROR_LOG_DATA(pGpu, errorNum,
1725                               "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx %6llu%cs %c\n",
1726                               ((historyIndex == 0) ? ' ' : '-'),
1727                               historyIndex,
1728                               pEntry->function,
1729                               _getRpcName(pEntry->function),
1730                               pEntry->data[0],
1731                               pEntry->data[1],
1732                               pEntry->ts_start,
1733                               pEntry->ts_end,
1734                               duration, durationUnitsChar,
1735                               (lastColumnCondition ? 'y' : ' '));
1736         }
1737         else
1738         {
1739             NV_ERROR_LOG_DATA(pGpu, errorNum,
1740                               "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx          %c\n",
1741                               ((historyIndex == 0) ? ' ' : '-'),
1742                               historyIndex,
1743                               pEntry->function,
1744                               _getRpcName(pEntry->function),
1745                               pEntry->data[0],
1746                               pEntry->data[1],
1747                               pEntry->ts_start,
1748                               pEntry->ts_end,
1749                               (lastColumnCondition ? 'y' : ' '));
1750         }
1751     }
1752 }
1753 
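/*!
 * Dump GSP RPC debug info: the function currently sitting in the RPC message
 * buffer, the recent CPU -> GSP RPC history, and the recent CPU <- GSP event
 * history.
 */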
1754 void
1755 kgspLogRpcDebugInfo
1756 (
1757     OBJGPU *pGpu,
1758     OBJRPC *pRpc,
1759     NvU32   errorNum,
1760     NvBool  bPollingForRpcResponse
1761 )
1762 {
1763     const rpc_message_header_v *pMsgHdr = RPC_HDR;
1764     NvU32  historyIndex;
1765     NvU32  historyEntry;
1766     NvU64  activeData[2];
1767 
1768     _kgspGetActiveRpcDebugData(pRpc, pMsgHdr->function,
1769                                &activeData[0], &activeData[1]);
1770     NV_ERROR_LOG_DATA(pGpu, errorNum,
1771                       "GPU%d GSP RPC buffer contains function %d (%s) and data 0x%016llx 0x%016llx.\n",
1772                       gpuGetInstance(pGpu),
1773                       pMsgHdr->function, _getRpcName(pMsgHdr->function),
1774                       activeData[0], activeData[1]);
1775 
1776     NV_ERROR_LOG_DATA(pGpu, errorNum,
1777                       "GPU%d RPC history (CPU -> GSP):\n",
1778                       gpuGetInstance(pGpu));
1779     NV_ERROR_LOG_DATA(pGpu, errorNum,
1780                       "    entry function                   data0              data1              ts_start           ts_end             duration actively_polling\n");
1781     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1782     {
1783         historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1784         _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcHistory[historyEntry],
1785                                 ((historyIndex == 0) && bPollingForRpcResponse));
1786     }
1787 
1788     NV_ERROR_LOG_DATA(pGpu, errorNum,
1789                       "GPU%d RPC event history (CPU <- GSP):\n",
1790                       gpuGetInstance(pGpu));
1791     NV_ERROR_LOG_DATA(pGpu, errorNum,
1792                       "    entry function                   data0              data1              ts_start           ts_end             duration during_incomplete_rpc\n");
1793     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1794     {
1795         historyEntry = (pRpc->rpcEventHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1796         _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcEventHistory[historyEntry],
1797                                 _kgspIsTimestampDuringRecentRpc(pRpc,
1798                                                                 pRpc->rpcEventHistory[historyEntry].ts_start,
1799                                                                 NV_TRUE/*bCheckIncompleteRpcsOnly*/));
1800     }
1801 }
1802 
1803 /*!
1804  * Log Xid 119 - GSP RPC Timeout
1805  */
1806 static void
1807 _kgspLogXid119
1808 (
1809     OBJGPU *pGpu,
1810     OBJRPC *pRpc,
1811     NvU32 expectedFunc
1812 )
1813 {
1814     RpcHistoryEntry *pHistoryEntry = &pRpc->rpcHistory[pRpc->rpcHistoryCurrent];
1815     NvU64 ts_end = osGetTimestamp();
1816     NvU64 duration;
1817     char  durationUnitsChar;
1818 
1819     if (pRpc->timeoutCount == 1)
1820     {
1821         NV_PRINTF(LEVEL_ERROR,
1822                   "********************************* GSP Timeout **********************************\n");
1823         NV_PRINTF(LEVEL_ERROR,
1824                   "Note: Please also check logs above.\n");
1825     }
1826 
1827     NV_ASSERT(expectedFunc == pHistoryEntry->function);
1828 
1829     NV_ASSERT(ts_end > pHistoryEntry->ts_start);
1830     duration = _tsDiffToDuration(ts_end - pHistoryEntry->ts_start, &durationUnitsChar);
1831 
1832     NV_ERROR_LOG(pGpu, GSP_RPC_TIMEOUT,
1833                  "Timeout after %llus of waiting for RPC response from GPU%d GSP! Expected function %d (%s) (0x%x 0x%x).",
1834                  (durationUnitsChar == 'm' ? duration / 1000 : duration),
1835                  gpuGetInstance(pGpu),
1836                  expectedFunc,
1837                  _getRpcName(expectedFunc),
1838                  pHistoryEntry->data[0],
1839                  pHistoryEntry->data[1]);
1840 
1841     if (pRpc->timeoutCount == 1)
1842     {
1843         kgspLogRpcDebugInfo(pGpu, pRpc, GSP_RPC_TIMEOUT, NV_TRUE/*bPollingForRpcResponse*/);
1844 
1845         osAssertFailed();
1846 
1847         NV_PRINTF(LEVEL_ERROR,
1848                   "********************************************************************************\n");
1849     }
1850 }
1851 
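/*!
 * Bump the RPC timeout counter and decide whether subsequent timeout prints
 * should be rate limited (1 of every RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1
 * once the threshold is exceeded).
 */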
1852 static void
1853 _kgspRpcIncrementTimeoutCountAndRateLimitPrints
1854 (
1855     OBJGPU *pGpu,
1856     OBJRPC *pRpc
1857 )
1858 {
1859     pRpc->timeoutCount++;
1860 
1861     if ((pRpc->timeoutCount == (RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH + 1)) &&
1862         (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP > 0))
1863     {
1864         // make sure we warn Xid and NV_PRINTF/NVLOG consumers that we are rate limiting prints
1865         if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)
1866         {
1867             portDbgPrintf(
1868                 "NVRM: Rate limiting GSP RPC error prints for GPU at PCI:%04x:%02x:%02x (printing 1 of every %d).  The GPU likely needs to be reset.\n",
1869                 gpuGetDomain(pGpu),
1870                 gpuGetBus(pGpu),
1871                 gpuGetDevice(pGpu),
1872                 RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
1873         }
1874         NV_PRINTF(LEVEL_WARNING,
1875                   "Rate limiting GSP RPC error prints (printing 1 of every %d)\n",
1876                   RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
1877     }
1878 
1879     pRpc->bQuietPrints = ((pRpc->timeoutCount > RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH) &&
1880                           ((pRpc->timeoutCount % (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1)) != 0));
1881 }
1882 
1883 /*!
1884  * GSP client RM RPC poll routine
1885  */
1886 static NV_STATUS
1887 _kgspRpcRecvPoll
1888 (
1889     OBJGPU *pGpu,
1890     OBJRPC *pRpc,
1891     NvU32   expectedFunc
1892 )
1893 {
1894     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
1895     NV_STATUS  rpcStatus = NV_OK;
1896     NV_STATUS  timeoutStatus = NV_OK;
1897     RMTIMEOUT  timeout;
1898     NvU32      timeoutUs;
1899     NvU32      timeoutFlags;
1900     NvBool     bSlowGspRpc = IS_EMULATION(pGpu) || IS_SIMULATION(pGpu);
1901     NvU32      gpuMaskUnused;
1902 
1903     KernelGspRpcEventHandlerContext rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL;
1904     if (expectedFunc == NV_VGPU_MSG_EVENT_GSP_INIT_DONE)
1905     {
1906         // special case for bootup path without API lock
1907         rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP;
1908     }
1909     //
1910     // We do not allow recursive polling. This can happen if e.g.
1911     //    1. CPU-RM issues RPC-A to GSP and polls waiting for it to finish
1912     //    2. While servicing RPC-A, GSP emits an async event back to CPU-RM
1913     //    3. CPU-RM services the async event and sends another synchronous RPC-B
1914     //    4. RPC-A response will come first, but CPU-RM is now waiting on RPC-B
1915     //
1916     // We don't have a good way to handle this and should just be deferring the
1917     // second RPC until the first one is done, via e.g. osQueueWorkItem().
1918     // This assert is meant to catch and loudly fail such cases.
1919     //
1920     NV_ASSERT_OR_RETURN(!pKernelGsp->bPollingForRpcResponse, NV_ERR_INVALID_STATE);
1921     pKernelGsp->bPollingForRpcResponse = NV_TRUE;
1922 
1923     //
1924     // GSP-RM init in emulation/simulation environments is extremely slow,
1925     // so we need to increase the timeout.
1926     // Apply the timeout extension to other RPCs as well, mostly so that
1927     // we'll reset the thread state after each RPC, not just while waiting
1928     // for the INIT_DONE event.
1929     //
1930     if (bSlowGspRpc)
1931     {
1932         NvU32 timeoutResult;
1933 
1934         // On slow Apollo emulators, GSP-RM init could take more than an hour
1935         NV_ASSERT(portSafeMulU32(GSP_SCALE_TIMEOUT_EMU_SIM, 1500000, &timeoutResult));
1936         timeoutUs = timeoutResult;
1937     }
1938     else
1939     {
1940         NvU32 defaultus = pGpu->timeoutData.defaultus;
1941 
1942         if (IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
1943         {
1944             // Ensure at least 3.1s for vGPU-GSP before adding leeway (Bug 3928607)
1945             timeoutUs = NV_MAX(3100 * 1000, defaultus) + (defaultus / 2);
1946         }
1947         else
1948         {
1949             //
1950         // We should only ever hit this timeout when GSP is in a really bad state, so if it
1951         // just happens to exceed the default timeout it should be OK to give it a little
1952         // more time - make this timeout 1.5x the default to allow some leeway.
1953             //
1954             timeoutUs = defaultus + defaultus / 2;
1955         }
1956     }
1957 
1958     NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
1959 
1960     timeoutFlags = GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE;
1961     if (pRpc->bQuietPrints)
1962         timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_JOURNAL_LOG;
1963 
1964     gpuSetTimeout(pGpu, timeoutUs, &timeout, timeoutFlags);
1965 
1966     for (;;)
1967     {
1968         //
1969         // Check for GPU timeout and save that result before checking whether the RPC completed.
1970         // Otherwise, if the CPU thread sleeps right after the RPC check, we may falsely report a timeout.
1971         //
1972         timeoutStatus = gpuCheckTimeout(pGpu, &timeout);
1973 
1974         rpcStatus = _kgspRpcDrainEvents(pGpu, pKernelGsp, expectedFunc, rpcHandlerContext);
1975 
1976         switch (rpcStatus) {
1977             case NV_WARN_MORE_PROCESSING_REQUIRED:
1978                 // The synchronous RPC response we were waiting for is here
1979                 _kgspCompleteRpcHistoryEntry(pRpc->rpcHistory, pRpc->rpcHistoryCurrent);
1980                 rpcStatus = NV_OK;
1981                 goto done;
1982             case NV_OK:
1983                 // Check timeout and continue outer loop.
1984                 break;
1985             default:
1986                 goto done;
1987         }
1988 
1989         NV_CHECK_OK_OR_GOTO(rpcStatus, LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc), done);
1990 
1991         if (timeoutStatus == NV_ERR_TIMEOUT)
1992         {
1993             rpcStatus = timeoutStatus;
1994 
1995             _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);
1996 
1997             if (!pRpc->bQuietPrints)
1998             {
1999                 _kgspLogXid119(pGpu, pRpc, expectedFunc);
2000             }
2001 
2002             goto done;
2003         }
2004         else if (timeoutStatus != NV_OK)
2005         {
2006             NV_PRINTF(LEVEL_ERROR, "gpuCheckTimeout() returned unexpected error (0x%08x)\n",
2007                       timeoutStatus);
2008             rpcStatus = timeoutStatus;
2009             goto done;
2010         }
2011 
2012         osSpinLoop();
2013     }
2014 
2015     pRpc->timeoutCount = 0;
2016 
2017 done:
2018     pKernelGsp->bPollingForRpcResponse = NV_FALSE;
2019 
2020     if (bSlowGspRpc)
2021     {
2022         // Avoid cumulative timeout due to slow RPC
2023         threadStateResetTimeout(pGpu);
2024     }
2025 
2026     return rpcStatus;
2027 }
2028 
2029 /*!
2030  * Initialize RPC objects required for interfacing with GSP.
2031  */
2032 static NV_STATUS
2033 _kgspInitRpcInfrastructure
2034 (
2035     OBJGPU    *pGpu,
2036     KernelGsp *pKernelGsp
2037 )
2038 {
2039     NV_STATUS nvStatus = NV_OK;
2040     MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;
2041 
2042     nvStatus = GspMsgQueuesInit(pGpu, &pMQCollection);
2043     if (nvStatus != NV_OK)
2044     {
2045         NV_PRINTF(LEVEL_ERROR, "GspMsgQueuesInit failed\n");
2046         goto done;
2047     }
2048 
2049     pKernelGsp->pMQCollection = pMQCollection;
2050 
2051     // Init RM RPC object
2052     nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
2053                                        &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX],
2054                                        &pKernelGsp->pRpc);
2055     if (nvStatus != NV_OK)
2056     {
2057         NV_PRINTF(LEVEL_ERROR, "init task RM RPC infrastructure failed\n");
2058         goto done;
2059     }
2060 
2061     // Init task_isr RPC object
2062     if (pKernelGsp->bIsTaskIsrQueueRequired)
2063     {
2064         nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
2065                                            &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX],
2066                                            &pKernelGsp->pLocklessRpc);
2067         if (nvStatus != NV_OK)
2068         {
2069             NV_PRINTF(LEVEL_ERROR, "init task ISR RPC infrastructure failed\n");
2070             goto done;
2071         }
2072     }
2073 
2074 done:
2075     if (nvStatus != NV_OK)
2076     {
2077         _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
2078     }
2079 
2080     return nvStatus;
2081 }
2082 
2083 
2084 /*!
2085  * Initialize a stripped-down version of the RPC infrastructure for GSP clients.
2086  */
2087 static NV_STATUS
2088 _kgspConstructRpcObject
2089 (
2090     OBJGPU *pGpu,
2091     KernelGsp *pKernelGsp,
2092     MESSAGE_QUEUE_INFO *pMQI,
2093     OBJRPC **ppRpc
2094 )
2095 {
2096     OBJRPC *pRpc;
2097 
2098     NV_ASSERT_OR_RETURN(pMQI != NULL, NV_ERR_INVALID_ARGUMENT);
2099 
2100     pRpc = initRpcObject(pGpu);
2101     if (pRpc == NULL)
2102     {
2103         NV_PRINTF(LEVEL_ERROR, "initRpcObject failed\n");
2104         return NV_ERR_INSUFFICIENT_RESOURCES;
2105     }
2106 
2107     pRpc->pMessageQueueInfo = pMQI;
2108 
2109     portMemSet(&pRpc->rpcHistory, 0, sizeof(pRpc->rpcHistory));
2110     pRpc->rpcHistoryCurrent = RPC_HISTORY_DEPTH - 1;
2111     portMemSet(&pRpc->rpcEventHistory, 0, sizeof(pRpc->rpcEventHistory));
2112     pRpc->rpcEventHistoryCurrent = RPC_HISTORY_DEPTH - 1;
2113 
2114     pRpc->message_buffer  = (NvU32 *)pRpc->pMessageQueueInfo->pRpcMsgBuf;
2115     pRpc->maxRpcSize      = GSP_MSG_QUEUE_RPC_SIZE_MAX;
2116 
2117     rpcSendMessage_FNPTR(pRpc) = _kgspRpcSendMessage;
2118     rpcRecvPoll_FNPTR(pRpc)    = _kgspRpcRecvPoll;
2119 
2120     *ppRpc = pRpc;
2121 
2122     return NV_OK;
2123 }
2124 
2125 static void
2126 _kgspFreeRpcInfrastructure
2127 (
2128     OBJGPU *pGpu,
2129     KernelGsp *pKernelGsp
2130 )
2131 {
2132     if (pKernelGsp->pRpc != NULL)
2133     {
2134         rpcDestroy(pGpu, pKernelGsp->pRpc);
2135         portMemFree(pKernelGsp->pRpc);
2136         pKernelGsp->pRpc = NULL;
2137     }
2138     if (pKernelGsp->pLocklessRpc != NULL)
2139     {
2140         rpcDestroy(pGpu, pKernelGsp->pLocklessRpc);
2141         portMemFree(pKernelGsp->pLocklessRpc);
2142         pKernelGsp->pLocklessRpc = NULL;
2143     }
2144     GspMsgQueuesCleanup(&pKernelGsp->pMQCollection);
2145 }
2146 
2147 /*!
2148  * Convert an init arg name to a 64-bit id value.
2149  *
2150  * @param[in]      name  String representing name of init arg
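 *
 * For example, "LOGINIT" maps to 0x4C4F47494E4954.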
2151  */
2152 static NvU64
2153 _kgspGenerateInitArgId(const char *name)
2154 {
2155     NvU64 id = 0;
2156     NvU8 c;
2157     NvU32 i;
2158 
2159     // Convert at most 8 characters from name into id.
2160     for (i = 0; i < (sizeof(NvU64) / sizeof(NvU8)); ++i)
2161     {
2162         c = (NvU8)*name++;
2163         if (c == '\0')
2164         {
2165             break;
2166         }
2167         id = (id << 8) | c;
2168     }
2169 
2170     return id;
2171 }
2172 
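/*!
 * Unmap and free the memory descriptor backing a single LIBOS task log buffer.
 */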
2173 static void
2174 _kgspUnmapTaskLogBuf(OBJGPU *pGpu, RM_LIBOS_LOG_MEM *pLog)
2175 {
2176     // release log memory for this task.
2177     if (pLog->pTaskLogBuffer != NULL)
2178     {
2179         memdescUnmapInternal(pGpu, pLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
2180         pLog->pTaskLogBuffer = NULL;
2181     }
2182 
2183     if (pLog->pTaskLogDescriptor != NULL)
2184     {
2185         memdescFree(pLog->pTaskLogDescriptor);
2186         memdescDestroy(pLog->pTaskLogDescriptor);
2187         pLog->pTaskLogDescriptor = NULL;
2188     }
2189 }
2190 
2191 /*!
2192  * Free vgpu partition LIBOS task logging structures
2193  */
2194 static void
2195 _kgspFreeLibosVgpuPartitionLoggingStructures
2196 (
2197     OBJGPU *pGpu,
2198     KernelGsp *pKernelGsp,
2199     NvU32 gfid
2200 )
2201 {
2202     RM_LIBOS_LOG_MEM *vgpuLogBuffers[] =
2203     {
2204         pKernelGsp->gspPluginInitTaskLogMem,
2205         pKernelGsp->gspPluginVgpuTaskLogMem
2206     };
2207 
2208     libosLogDestroy(&pKernelGsp->logDecodeVgpuPartition[gfid - 1]);
2209 
2210     // release all the vgpu tasks' log buffer memory
2211     for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(vgpuLogBuffers); ++i)
2212     {
2213         RM_LIBOS_LOG_MEM *pTaskLog = &vgpuLogBuffers[i][gfid - 1];
2214         _kgspUnmapTaskLogBuf(pGpu, pTaskLog);
2215     }
2216 }
2217 
2218 /*!
2219  * Free vgpu partition LIBOS task logging structures
2220  */
2221 NV_STATUS
2222 kgspFreeVgpuPartitionLogging_IMPL
2223 (
2224     OBJGPU *pGpu,
2225     KernelGsp *pKernelGsp,
2226     NvU32 gfid
2227 )
2228 {
2229     if (gfid > MAX_PARTITIONS_WITH_GFID)
2230     {
2231         return NV_ERR_INVALID_ARGUMENT;
2232     }
2233     else
2234     {
2235         // Make sure there is no lingering debug output.
2236         kgspDumpGspLogs(pKernelGsp, NV_FALSE);
2237 
2238         _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);
2239         return NV_OK;
2240     }
2241 }
2242 
2243 /*!
2244  * Initialize vgpu partition LIBOS task logging structures
2245  */
2246 NV_STATUS
2247 kgspInitVgpuPartitionLogging_IMPL
2248 (
2249     OBJGPU *pGpu,
2250     KernelGsp *pKernelGsp,
2251     NvU32 gfid,
2252     NvU64 initTaskLogBUffOffset,
2253     NvU64 initTaskLogBUffSize,
2254     NvU64 vgpuTaskLogBUffOffset,
2255     NvU64 vgpuTaskLogBuffSize
2256 )
2257 {
2258     struct
2259     {
2260         const char       *szMemoryId;
2261         const char       *szPrefix;
2262         const char       *elfSectionName;
2263         NvU64             bufOffset;
2264         NvU64             bufSize;
2265         RM_LIBOS_LOG_MEM *taskLogArr;
2266     } logInitValues[] =
2267     {
2268         {"LOGINIT", "INIT", ".fwlogging_init", initTaskLogBUffOffset, initTaskLogBUffSize, pKernelGsp->gspPluginInitTaskLogMem},
2269         {"LOGVGPU", "VGPU", ".fwlogging_vgpu", vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pKernelGsp->gspPluginVgpuTaskLogMem}
2270     };
2271     ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS);
2272 
2273     NV_STATUS nvStatus = NV_OK;
2274     RM_LIBOS_LOG_MEM *pTaskLog = NULL;
2275     char vm_string[8], sourceName[SOURCE_NAME_MAX_LENGTH];
2276 
2277     if (gfid > MAX_PARTITIONS_WITH_GFID)
2278     {
2279         return NV_ERR_INVALID_ARGUMENT;
2280     }
2281 
2282     if (pKernelGsp->pNvlogFlushMtx != NULL)
2283         portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx);
2284 
2285     // Source name is used to generate a tag that is a unique identifier for nvlog buffers.
2286     // As the source name 'GSP' is already in use, we will need a custom source name.
2287     nvDbgSnprintf(sourceName, SOURCE_NAME_MAX_LENGTH, "V%02d", gfid);
2288     libosLogCreateEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], sourceName);
2289 
2290     // Setup logging for each task in vgpu partition
2291     for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(logInitValues); ++i)
2292     {
2293         pTaskLog = &logInitValues[i].taskLogArr[gfid - 1];
2294         NvP64 pVa = NvP64_NULL;
2295 
2296         NV_ASSERT_OK_OR_GOTO(nvStatus,
2297             memdescCreate(&pTaskLog->pTaskLogDescriptor,
2298                           pGpu,
2299                           logInitValues[i].bufSize,
2300                           RM_PAGE_SIZE,
2301                           NV_TRUE, ADDR_FBMEM, NV_MEMORY_CACHED,
2302                           MEMDESC_FLAGS_NONE),
2303             error_cleanup);
2304 
2305         memdescDescribe(pTaskLog->pTaskLogDescriptor, ADDR_FBMEM, logInitValues[i].bufOffset,  logInitValues[i].bufSize);
2306 
2307         pVa = memdescMapInternal(pGpu, pTaskLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
2308         if (pVa != NvP64_NULL)
2309         {
2310             pTaskLog->pTaskLogBuffer = pVa;
2311             portMemSet(pTaskLog->pTaskLogBuffer, 0, logInitValues[i].bufSize);
2312 
2313             pTaskLog->id8 = _kgspGenerateInitArgId(logInitValues[i].szMemoryId);
2314 
2315             nvDbgSnprintf(vm_string, sizeof(vm_string), "%s%d", logInitValues[i].szPrefix, gfid);
2316 
2317             libosLogAddLogEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1],
2318                 pTaskLog->pTaskLogBuffer,
2319                 memdescGetSize(pTaskLog->pTaskLogDescriptor),
2320                 pGpu->gpuInstance,
2321                 (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT),
2322                 gpuGetChipImpl(pGpu),
2323                 vm_string,
2324                 logInitValues[i].elfSectionName);
2325         }
2326         else
2327         {
2328             NV_PRINTF(LEVEL_ERROR, "Failed to map memory for %s task log buffer for vGPU partition\n", logInitValues[i].szPrefix);
2329             nvStatus = NV_ERR_INSUFFICIENT_RESOURCES;
2330             goto error_cleanup;
2331         }
2332     }
2333 
2334     {
2335         libosLogInit(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], pKernelGsp->pLogElf, pKernelGsp->logElfDataSize);
2336         // nvlog buffers are now set up using the appropriate sourceName to avoid tag-value clashes.
2337         // Now sourceName can be modified to preserve the 'GSP-VGPUx' logging convention.
2338         portStringCopy(pKernelGsp->logDecodeVgpuPartition[gfid - 1].sourceName,
2339                        SOURCE_NAME_MAX_LENGTH,
2340                        "GSP", SOURCE_NAME_MAX_LENGTH);
2341     }
2342 
2343     pKernelGsp->bHasVgpuLogs = NV_TRUE;
2344 
2345 error_cleanup:
2346     if (pKernelGsp->pNvlogFlushMtx != NULL)
2347         portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx);
2348 
2349     if (nvStatus != NV_OK)
2350         _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);
2351 
2352     return nvStatus;
2353 }
2354 
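/*!
 * Nvlog flush callback: dump any pending GSP logs when the nvlog buffers are
 * flushed.
 */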
2355 void kgspNvlogFlushCb(void *pKernelGsp)
2356 {
2357     if (pKernelGsp != NULL)
2358         kgspDumpGspLogs((KernelGsp*)pKernelGsp, NV_TRUE);
2359 }
2360 
2361 /*!
2362  * Free LIBOS task logging structures
2363  */
2364 static void
2365 _kgspFreeLibosLoggingStructures
2366 (
2367     OBJGPU *pGpu,
2368     KernelGsp *pKernelGsp
2369 )
2370 {
2371     NvU8 idx;
2372 
2373     _kgspStopLogPolling(pGpu, pKernelGsp);
2374 
2375     // Make sure there is no lingering debug output.
2376     kgspDumpGspLogs(pKernelGsp, NV_FALSE);
2377 
2378     if (pKernelGsp->pLogElf == NULL)
2379         nvlogDeregisterFlushCb(kgspNvlogFlushCb, pKernelGsp);
2380 
2381     if (pKernelGsp->pNvlogFlushMtx != NULL)
2382     {
2383         portSyncMutexDestroy(pKernelGsp->pNvlogFlushMtx);
2384         pKernelGsp->pNvlogFlushMtx = NULL;
2385     }
2386 
2387     libosLogDestroy(&pKernelGsp->logDecode);
2388 
2389     for (idx = 0; idx < LOGIDX_SIZE; idx++)
2390     {
2391         RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx];
2392 
2393         // release log memory for each task.
2394         if (pLog->pTaskLogBuffer != NULL)
2395         {
2396             memdescUnmap(pLog->pTaskLogDescriptor,
2397                          NV_TRUE, osGetCurrentProcess(),
2398                          (void *)pLog->pTaskLogBuffer,
2399                          pLog->pTaskLogMappingPriv);
2400             pLog->pTaskLogBuffer = NULL;
2401             pLog->pTaskLogMappingPriv = NULL;
2402         }
2403 
2404         if (pLog->pTaskLogDescriptor != NULL)
2405         {
2406             memdescFree(pLog->pTaskLogDescriptor);
2407             memdescDestroy(pLog->pTaskLogDescriptor);
2408             pLog->pTaskLogDescriptor = NULL;
2409         }
2410     }
2411 
2412     portMemFree(pKernelGsp->pLogElf);
2413     pKernelGsp->pLogElf = NULL;
2414 }
2415 
2416 /*!
2417  * Initialize LIBOS task logging structures
2418  */
2419 static NV_STATUS
2420 _kgspInitLibosLoggingStructures
2421 (
2422     OBJGPU *pGpu,
2423     KernelGsp *pKernelGsp
2424 )
2425 {
2426     static const struct
2427     {
2428         const char *szMemoryId;
2429         const char *szPrefix;
2430         NvU32       size;
2431         const char *elfSectionName;
2432     } logInitValues[] =
2433     {
2434         {"LOGINIT", "INIT", 0x10000, ".fwlogging_init"},  // 64KB for stack traces
2435 #if defined(DEVELOP) || defined(DEBUG)
2436         // The interrupt task is in the rm elf, so they share the same logging elf too
2437         {"LOGINTR", "INTR", 0x40000, ".fwlogging_rm"},    // 256KB ISR debug log on develop/debug builds
2438         {"LOGRM",   "RM",   0x40000, ".fwlogging_rm"}     // 256KB RM debug log on develop/debug builds
2439 #else
2440         // The interrupt task is in the rm elf, so they share the same logging elf too
2441         {"LOGINTR", "INTR", 0x10000, ".fwlogging_rm"},    // 64KB ISR debug log on develop/debug builds
2442         {"LOGRM",   "RM",   0x10000, ".fwlogging_rm"}     // 64KB RM debug log on release builds
2443 #endif
2444     };
2445     ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS);
2446     ct_assert(NV_ARRAY_ELEMENTS(logInitValues) == LOGIDX_SIZE);
2447 
2448     NV_STATUS nvStatus = NV_OK;
2449     NvU8      idx;
2450     NvU64 flags = MEMDESC_FLAGS_NONE;
2451 
2452     // Needed only on Unix where NV_ESC_RM_LOCKLESS_DIAGNOSTIC is supported
2453     if (RMCFG_FEATURE_PLATFORM_UNIX)
2454     {
2455         pKernelGsp->pNvlogFlushMtx = portSyncMutexCreate(portMemAllocatorGetGlobalNonPaged());
2456         if (pKernelGsp->pNvlogFlushMtx == NULL)
2457         {
2458             nvStatus = NV_ERR_INSUFFICIENT_RESOURCES;
2459             goto error_cleanup;
2460         }
2461     }
2462 
2463     libosLogCreate(&pKernelGsp->logDecode);
2464 
2465     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
2466 
2467     for (idx = 0; idx < LOGIDX_SIZE; idx++)
2468     {
2469         RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx];
2470         NvP64 pVa = NvP64_NULL;
2471         NvP64 pPriv = NvP64_NULL;
2472 
2473         // Setup logging memory for each task.
2474         NV_ASSERT_OK_OR_GOTO(nvStatus,
2475             memdescCreate(&pLog->pTaskLogDescriptor,
2476                           pGpu,
2477                           logInitValues[idx].size,
2478                           RM_PAGE_SIZE,
2479                           NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
2480                           flags),
2481             error_cleanup);
2482 
2483         memdescTagAlloc(nvStatus,
2484                       NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_12, pLog->pTaskLogDescriptor);
2485         NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus,
2486             error_cleanup);
2487 
2488         NV_ASSERT_OK_OR_GOTO(nvStatus,
2489             memdescMap(pLog->pTaskLogDescriptor, 0,
2490                        memdescGetSize(pLog->pTaskLogDescriptor),
2491                        NV_TRUE, NV_PROTECT_READ_WRITE,
2492                        &pVa, &pPriv),
2493             error_cleanup);
2494 
2495         pLog->pTaskLogBuffer = pVa;
2496         pLog->pTaskLogMappingPriv = pPriv;
2497         portMemSet(pLog->pTaskLogBuffer, 0, memdescGetSize(pLog->pTaskLogDescriptor));
2498 
2499         // Pass the PTE table for the log buffer in the log buffer, after the put pointer.
2500         memdescGetPhysAddrs(pLog->pTaskLogDescriptor,
2501                             AT_GPU,
2502                             0,
2503                             RM_PAGE_SIZE,
2504                             NV_CEIL(memdescGetSize(pLog->pTaskLogDescriptor), RM_PAGE_SIZE),
2505                             &pLog->pTaskLogBuffer[1]);
2506 
2507         pLog->id8 = _kgspGenerateInitArgId(logInitValues[idx].szMemoryId);
2508 
2509         libosLogAddLogEx(&pKernelGsp->logDecode,
2510             pLog->pTaskLogBuffer,
2511             memdescGetSize(pLog->pTaskLogDescriptor),
2512             pGpu->gpuInstance,
2513             (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT),
2514             gpuGetChipImpl(pGpu),
2515             logInitValues[idx].szPrefix,
2516             logInitValues[idx].elfSectionName);
2517     }
2518 
2519 error_cleanup:
2520     if (nvStatus != NV_OK)
2521         _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
2522 
2523     return nvStatus;
2524 }
2525 
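/*!
 * Copy the logging section out of the GSP firmware log ELF (if present) and
 * initialize the LIBOS log decoder with it.
 */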
2526 static NV_STATUS
2527 _kgspInitLibosLogDecoder
2528 (
2529     OBJGPU *pGpu,
2530     KernelGsp *pKernelGsp,
2531     GSP_FIRMWARE *pGspFw
2532 )
2533 {
2534     // If there's no log ELF or it's already been wired, skip wiring it now
2535     if ((pGspFw->pLogElf == NULL) || (pKernelGsp->pLogElf != NULL))
2536         return NV_OK;
2537 
2538     // Setup symbol decoder
2539     const void *pLogData = NULL;
2540     NvU64 logSize = 0;
2541 
2542     NV_ASSERT_OK_OR_RETURN(
2543         _kgspFwContainerVerifyVersion(pGpu, pKernelGsp,
2544             pGspFw->pLogElf,
2545             pGspFw->logElfSize,
2546             "GSP firmware log"));
2547 
2548     NV_ASSERT_OK_OR_RETURN(
2549         _kgspFwContainerGetSection(pGpu, pKernelGsp,
2550             pGspFw->pLogElf,
2551             pGspFw->logElfSize,
2552             GSP_LOGGING_SECTION_NAME,
2553             &pLogData,
2554             &logSize));
2555 
2556     pKernelGsp->pLogElf = portMemAllocNonPaged(logSize);
2557     pKernelGsp->logElfDataSize = logSize;
2558 
2559     NV_ASSERT_OR_RETURN(pKernelGsp->pLogElf != NULL, NV_ERR_NO_MEMORY);
2560 
2561     portMemCopy(pKernelGsp->pLogElf, logSize, pLogData, logSize);
2562     libosLogInit(&pKernelGsp->logDecode, pKernelGsp->pLogElf, logSize);
2563 
2564     return NV_OK;
2565 }
2566 
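/*!
 * Allocate and map the shared sysmem buffer used for simulation-only accesses
 * from GSP. Returns NV_ERR_NOT_SUPPORTED when not running in simulation.
 */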
2567 static NV_STATUS
2568 _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2569 {
2570     NvP64 pVa   = NvP64_NULL;
2571     NvP64 pPriv = NvP64_NULL;
2572     NV_STATUS nvStatus;
2573 
2574     if (!IS_SIMULATION(pGpu))
2575     {
2576         pKernelGsp->pMemDesc_simAccessBuf = NULL;
2577         pKernelGsp->pSimAccessBuf         = NULL;
2578         pKernelGsp->pSimAccessBufPriv     = NULL;
2579         return NV_ERR_NOT_SUPPORTED;
2580     }
2581 
2582     NV_ASSERT_OK_OR_GOTO(nvStatus,
2583         memdescCreate(&pKernelGsp->pMemDesc_simAccessBuf,
2584                       pGpu,
2585                       sizeof(SimAccessBuffer),
2586                       RM_PAGE_SIZE,
2587                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_UNCACHED,
2588                       MEMDESC_FLAGS_NONE),
2589         error_cleanup);
2590 
2591     memdescTagAlloc(nvStatus,
2592                     NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_13, pKernelGsp->pMemDesc_simAccessBuf);
2593     NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup);
2594 
2595     NV_ASSERT_OK_OR_GOTO(nvStatus,
2596         memdescMap(pKernelGsp->pMemDesc_simAccessBuf, 0,
2597                    memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf),
2598                    NV_TRUE, NV_PROTECT_READ_WRITE,
2599                    &pVa, &pPriv),
2600         error_cleanup);
2601 
2602     pKernelGsp->pSimAccessBuf = (SimAccessBuffer*)pVa;
2603     pKernelGsp->pSimAccessBufPriv = pPriv;
2604 
2605     portMemSet(pKernelGsp->pSimAccessBuf, 0, memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf));
2606 
2607 error_cleanup:
2608     if (nvStatus != NV_OK)
2609         _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
2610 
2611     return nvStatus;
2612 }
2613 
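/*!
 * Free the simulation access buffer allocated by _kgspAllocSimAccessBuffer.
 */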
2614 static void
2615 _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2616 {
2617     if (!IS_SIMULATION(pGpu))
2618     {
2619         return;
2620     }
2621 
2622     if (pKernelGsp->pMemDesc_simAccessBuf != NULL)
2623     {
2624         memdescFree(pKernelGsp->pMemDesc_simAccessBuf);
2625         memdescDestroy(pKernelGsp->pMemDesc_simAccessBuf);
2626     }
2627 
2628     pKernelGsp->pMemDesc_simAccessBuf = NULL;
2629     pKernelGsp->pSimAccessBuf         = NULL;
2630     pKernelGsp->pSimAccessBufPriv     = NULL;
2631 }
2632 
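/*!
 * Allocate and map the shared surface GSP uses to report completion and status
 * of notify operations back to the CPU. Placed in unprotected sysmem so GSP can
 * write to it even when confidential compute is enabled.
 */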
2633 static NV_STATUS
2634 _kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2635 {
2636     NvP64 pVa   = NvP64_NULL;
2637     NvP64 pPriv = NvP64_NULL;
2638     NV_STATUS nvStatus;
2639     NvU64 flags = MEMDESC_FLAGS_NONE;
2640 
2641     //
2642     // On systems with SEV enabled, the fault buffer flush sequence memory should be allocated
2643     // in unprotected sysmem as GSP will be writing to this location to let the guest
2644     // know the issued notify op has finished as well as the status of the operation.
2645     //
2646     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
2647 
2648     NV_ASSERT_OK_OR_GOTO(nvStatus,
2649         memdescCreate(&pKernelGsp->pNotifyOpSurfMemDesc,
2650                       pGpu,
2651                       sizeof(NotifyOpSharedSurface),
2652                       RM_PAGE_SIZE,
2653                       NV_FALSE, ADDR_SYSMEM, NV_MEMORY_UNCACHED,
2654                       flags),
2655         error_cleanup);
2656 
2657     memdescTagAlloc(nvStatus,
2658                     NV_FB_ALLOC_RM_INTERNAL_OWNER_GSP_NOTIFY_OP_SURFACE, pKernelGsp->pNotifyOpSurfMemDesc);
2659     NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup);
2660 
2661     NV_ASSERT_OK_OR_GOTO(nvStatus,
2662         memdescMap(pKernelGsp->pNotifyOpSurfMemDesc, 0,
2663                    memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc),
2664                    NV_TRUE, NV_PROTECT_READ_WRITE,
2665                    &pVa, &pPriv),
2666         error_cleanup);
2667 
2668     pKernelGsp->pNotifyOpSurf = (NotifyOpSharedSurface*)pVa;
2669     pKernelGsp->pNotifyOpSurfPriv = pPriv;
2670 
2671     portMemSet(pKernelGsp->pNotifyOpSurf, 0, memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc));
2672 
2673 error_cleanup:
2674     if (nvStatus != NV_OK)
2675         _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp);
2676 
2677     return nvStatus;
2678 }
2679 
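/*!
 * Free the notify operation shared surface allocated by
 * _kgspAllocNotifyOpSharedSurface.
 */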
2680 static void
2681 _kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2682 {
2683     if (pKernelGsp->pNotifyOpSurfMemDesc != NULL)
2684     {
2685         memdescFree(pKernelGsp->pNotifyOpSurfMemDesc);
2686         memdescDestroy(pKernelGsp->pNotifyOpSurfMemDesc);
2687     }
2688 
2689     pKernelGsp->pNotifyOpSurfMemDesc = NULL;
2690     pKernelGsp->pNotifyOpSurf         = NULL;
2691     pKernelGsp->pNotifyOpSurfPriv     = NULL;
2692 }
2693 
2694 /*!
2695  * Create KernelGsp object and initialize RPC infrastructure
2696  */
2697 NV_STATUS
2698 kgspConstructEngine_IMPL
2699 (
2700     OBJGPU *pGpu,
2701     KernelGsp *pKernelGsp,
2702     ENGDESCRIPTOR engDesc
2703 )
2704 {
2705     NV_STATUS nvStatus = NV_OK;
2706 
2707     if (!IS_GSP_CLIENT(pGpu))
2708         return NV_ERR_NOT_SUPPORTED;
2709 
2710     kgspConfigureFalcon_HAL(pGpu, pKernelGsp);
2711 
2712     // Init RPC objects used to communicate with GSP.
2713     nvStatus = _kgspInitRpcInfrastructure(pGpu, pKernelGsp);
2714     if (nvStatus != NV_OK)
2715     {
2716         NV_PRINTF(LEVEL_ERROR, "init RPC infrastructure failed\n");
2717         goto done;
2718     }
2719 
2720     // Init logging memory used by GSP
2721     nvStatus = _kgspInitLibosLoggingStructures(pGpu, pKernelGsp);
2722     if (nvStatus != NV_OK)
2723     {
2724         NV_PRINTF(LEVEL_ERROR, "init libos logging structures failed: 0x%x\n", nvStatus);
2725         goto done;
2726     }
2727 
2728     // Clear out the gspStaticInfo. We will populate this once GSP-RM is up.
2729     portMemSet(&pKernelGsp->gspStaticInfo, 0,
2730                sizeof(pKernelGsp->gspStaticInfo));
2731 
2732     nvStatus = kgspAllocBootArgs_HAL(pGpu, pKernelGsp);
2733     if (nvStatus != NV_OK)
2734     {
2735         NV_PRINTF(LEVEL_ERROR, "boot arg alloc failed: 0x%x\n", nvStatus);
2736         goto done;
2737     }
2738 
2739     if (IS_SIMULATION(pGpu))
2740     {
2741         nvStatus = _kgspAllocSimAccessBuffer(pGpu, pKernelGsp);
2742         if (nvStatus != NV_OK)
2743         {
2744             NV_PRINTF(LEVEL_ERROR, "sim access buffer alloc failed: 0x%x\n", nvStatus);
2745             goto done;
2746         }
2747     }
2748 
2749     nvStatus = _kgspAllocNotifyOpSharedSurface(pGpu, pKernelGsp);
2750     if (nvStatus != NV_OK)
2751     {
2752         NV_PRINTF(LEVEL_ERROR, "notify operation shared surface alloc failed: 0x%x\n", nvStatus);
2753         goto done;
2754     }
2755 
2756 #if KERNEL_GSP_TRACING_RATS_ENABLED
2757     multimapInit(&pGpu->gspTraceEventBufferBindingsUid, portMemAllocatorGetGlobalNonPaged());
2758 #endif
2759 
2760 done:
2761     if (nvStatus != NV_OK)
2762     {
2763         _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
2764         kgspFreeBootArgs_HAL(pGpu, pKernelGsp);
2765         _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
2766         _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
2767     }
2768 
2769     return nvStatus;
2770 }
2771 
2772 /*!
2773  * Convert VBIOS version containing Version and OemVersion packed together to
2774  * a string representation.
2775  *
2776  * Example:
2777  *   for Version 0x05400001, OemVersion 0x12
2778  *   input argument vbiosVersionCombined 0x0540000112
2779  *   output str "5.40.00.01.12"
2780  */
2781 static void
2782 _kgspVbiosVersionToStr(NvU64 vbiosVersionCombined, char *pVbiosVersionStr, NvU32 size)
2783 {
2784     nvDbgSnprintf(pVbiosVersionStr, size, "%2X.%02X.%02X.%02X.%02X",
2785                   (vbiosVersionCombined >> 32) & 0xff,
2786                   (vbiosVersionCombined >> 24) & 0xff,
2787                   (vbiosVersionCombined >> 16) & 0xff,
2788                   (vbiosVersionCombined >> 8) & 0xff,
2789                   (vbiosVersionCombined) & 0xff);
2790 }
2791 
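/*!
 * Allocate the scrubber ucode image if the FB space reserved for GSP-RM exceeds
 * the pre-scrubbed region at the top of FB.
 */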
2792 static NV_STATUS
2793 _kgspPrepareScrubberImageIfNeeded(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2794 {
2795     // Prepare Scrubber ucode image if pre-scrubbed memory is insufficient
2796     NvU64 neededSize = pKernelGsp->pWprMeta->fbSize - pKernelGsp->pWprMeta->gspFwRsvdStart;
2797     NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp);
2798     NV_PRINTF(LEVEL_INFO, "pre-scrubbed memory: 0x%llx bytes, needed: 0x%llx bytes\n",
2799               prescrubbedSize, neededSize);
2800 
2801     if (neededSize > prescrubbedSize)
2802         NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
2803             kgspAllocateScrubberUcodeImage(pGpu, pKernelGsp, &pKernelGsp->pScrubberUcode));
2804 
2805     return NV_OK;
2806 }
2807 
2808 /*!
2809  * Prepare and place RPCs in message queue that GSP-RM will process
2810  * in early boot before OBJGPU is created.
2811  *
2812  * @param[in] pGpu        GPU object pointer
2813  * @param[in] pKernelGsp  KernelGsp object pointer
2814  *
2815  * @return NV_OK if RPCs queued successfully.
2816  *         Appropriate NV_ERR_xxx value otherwise.
2817  */
2818 NV_STATUS
2819 kgspQueueAsyncInitRpcs_IMPL
2820 (
2821     OBJGPU    *pGpu,
2822     KernelGsp *pKernelGsp
2823 )
2824 {
2825     NV_STATUS status = NV_OK;
2826 
2827     NV_RM_RPC_GSP_SET_SYSTEM_INFO(pGpu, status);
2828     if (status != NV_OK)
2829     {
2830         NV_ASSERT_OK_FAILED("NV_RM_RPC_GSP_SET_SYSTEM_INFO", status);
2831         return status;
2832     }
2833 
2834     NV_RM_RPC_SET_REGISTRY(pGpu, status);
2835     if (status != NV_OK)
2836     {
2837         NV_ASSERT_OK_FAILED("NV_RM_RPC_SET_REGISTRY", status);
2838         return status;
2839     }
2840 
2841     return NV_OK;
2842 }
2843 
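/*!
 * Decide whether the API lock may be dropped while GSP-RM boots, enabling
 * parallel per-GPU init. Controlled by a registry key, with a default
 * allowlist of GPUs due to bug 4399629.
 */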
2844 static NvBool
2845 _kgspShouldRelaxGspInitLocking
2846 (
2847     OBJGPU *pGpu
2848 )
2849 {
2850     NvU32 relaxGspInitLockingReg;
2851 
2852     if (!RMCFG_FEATURE_PLATFORM_UNIX)
2853     {
2854         return NV_FALSE;
2855     }
2856 
2857     if (gpuIsCCFeatureEnabled(pGpu))
2858     {
2859         return NV_FALSE;
2860     }
2861 
2862     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING, &relaxGspInitLockingReg) != NV_OK)
2863     {
2864         relaxGspInitLockingReg = NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT;
2865     }
2866 
2867     // Due to bug 4399629, restrict which platforms have parallel init enabled by default
2868     if (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT)
2869     {
2870         NvU16 devId = (NvU16)(((pGpu->idInfo.PCIDeviceID) >> 16) & 0x0000FFFF);
2871         NvU32 i;
2872 
2873         static const NvU16 defaultRelaxGspInitLockingGpus[] = {
2874             0x1EB8, // T4
2875             0x1EB9, // T4
2876         };
2877 
2878         if (IsHOPPER(pGpu) || IsADA(pGpu))
2879         {
2880             return NV_TRUE;
2881         }
2882 
2883         for (i = 0; i < NV_ARRAY_ELEMENTS(defaultRelaxGspInitLockingGpus); i++)
2884         {
2885             if (devId == defaultRelaxGspInitLockingGpus[i])
2886             {
2887                 return NV_TRUE;
2888             }
2889         }
2890         return NV_FALSE;
2891     }
2892 
2893     return (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_ENABLE);
2895 }
2896 
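/*!
 * Re-acquire the API lock and the per-GPU lock after a relaxed-locking GSP boot
 * attempt, observing the required lock ordering (API lock before GPU locks).
 */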
2897 static NV_STATUS
2898 _kgspBootReacquireLocks(OBJGPU *pGpu, KernelGsp *pKernelGsp, GPU_MASK *pGpusLockedMask)
2899 {
2900     //
2901     // To follow lock order constraints, the GPU lock needs to be released before acquiring the API lock.
2902     // As this path doesn't go through the resource server, no client locks should be held at this point.
2903     // Note: we must not hold any client locks when re-acquiring the API lock, per lock ordering.
2904     //
2905     rmGpuGroupLockRelease(*pGpusLockedMask, GPUS_LOCK_FLAGS_NONE);
2906     *pGpusLockedMask = 0;
2907 
2908     //
2909     // rmapiLockAcquire should never fail on Linux if the API lock and GPU locks are not held.
2910     // Failure to acquire the API lock means the cleanup sequence will be skipped since it is
2911     // unsafe without the lock.
2912     //
2913     NV_ASSERT_OK_OR_RETURN(rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT));
2914 
2915     //
2916     // This should never fail on Linux due to locks in the Unix layer.
2917     // This will need to be revisited when parallel init is enabled on other platforms.
2918     //
2919     NV_ASSERT_OR_RETURN(gpumgrIsGpuPointerAttached(pGpu), NV_ERR_INVALID_DEVICE);
2920 
2921     // Reacquire the GPU lock released above.
2922     NV_ASSERT_OK_OR_RETURN(rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
2923                                                  GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT,
2924                                                  pGpusLockedMask));
2925 
2926     return NV_OK;
2927 }
2928 
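/*!
 * Perform a single GSP-RM boot attempt: calculate the FB layout, prepare the
 * scrubber ucode if needed, set up bootstrap arguments, and bootstrap GSP,
 * optionally dropping the API lock while the boot is in flight.
 */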
2929 static NV_STATUS
2930 _kgspBootGspRm(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw, GPU_MASK *pGpusLockedMask)
2931 {
2932     NV_STATUS status;
2933 
2934     // Fail early if WPR2 is up
2935     if (kgspIsWpr2Up_HAL(pGpu, pKernelGsp))
2936     {
2937         NV_PRINTF(LEVEL_ERROR, "unexpected WPR2 already up, cannot proceed with booting GSP\n");
2938         NV_PRINTF(LEVEL_ERROR, "(the GPU is likely in a bad state and may need to be reset)\n");
2939         return NV_ERR_INVALID_STATE;
2940     }
2941 
2942     // Calculate FB layout (requires knowing FB size which depends on GFW_BOOT)
2943     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspCalculateFbLayout_HAL(pGpu, pKernelGsp, pGspFw));
2944 
2945     // If the new FB layout requires a scrubber ucode to scrub additional space, prepare it now
2946     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, _kgspPrepareScrubberImageIfNeeded(pGpu, pKernelGsp));
2947 
2948     // Setup arguments for bootstrapping GSP
2949     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspPrepareForBootstrap_HAL(pGpu, pKernelGsp, pGspFw));
2950 
2951     // Release the API lock if relaxed locking for parallel init is enabled
2952     NvBool bRelaxedLocking = _kgspShouldRelaxGspInitLocking(pGpu);
2953     if (bRelaxedLocking)
2954         rmapiLockRelease();
2955 
2956     // Proceed with GSP boot - if it fails, check for ECC errors
2957     status = kgspBootstrap_HAL(pGpu, pKernelGsp, pGspFw);
2958     if ((status != NV_OK) && gpuCheckEccCounts_HAL(pGpu))
2959         status = NV_ERR_ECC_ERROR;
2960 
2961     pKernelGsp->bootAttempts++;
2962 
2963     //
2964     // The caller will check that both the API lock and the GPU lock will be held upon return from
2965     // this function, regardless of whether GSP bootstrap succeeded.
2966     //
2967     if (bRelaxedLocking)
2968         NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
2969                               _kgspBootReacquireLocks(pGpu, pKernelGsp, pGpusLockedMask));
2970 
2971     return status;
2972 }
2973 
2974 /*!
2975  * Initialize GSP-RM
2976  *
2977  * @param[in]      pGpu          GPU object pointer
2978  * @param[in]      pKernelGsp    KernelGsp object pointer
2979  * @param[in]      pGspFw        GSP firmware structure pointer
2980  *
2981  * @return NV_OK if GSP fw RM offload successfully initialized.
2982  *         Appropriate NV_ERR_xxx value otherwise.
2983  */
2984 NV_STATUS
2985 kgspInitRm_IMPL
2986 (
2987     OBJGPU       *pGpu,
2988     KernelGsp    *pKernelGsp,
2989     GSP_FIRMWARE *pGspFw
2990 )
2991 {
2992     NV_STATUS  status = NV_OK;
2993     OBJTMR    *pTmr = GPU_GET_TIMER(pGpu);
2994     GPU_MASK   gpusLockedMask = 0;
2995 
2996     if (!IS_GSP_CLIENT(pGpu))
2997         return NV_OK;
2998 
2999     if ((pGspFw == NULL) || (pGspFw->pBuf == NULL) || (pGspFw->size == 0))
3000     {
3001         NV_PRINTF(LEVEL_ERROR, "need firmware to initialize GSP\n");
3002         return NV_ERR_INVALID_ARGUMENT;
3003     }
3004 
3005     pKernelGsp->bInInit = NV_TRUE;
3006 
3007     // Need to hold the GPU instance lock in order to write to the RPC queue
3008     NV_ASSERT_OK_OR_GOTO(status,
3009         rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
3010                               GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT, &gpusLockedMask),
3011         done);
3012 
3013     /*
3014      * For GSP-RM boot, we must trigger FRTS (if it exists for the chip)
3015      * before loading GSP-RM so that FRTS data and GSP-RM code/data/heap can coexist
3016      * in WPR2. FRTS is triggered by running a VBIOS-provided ucode called FWSEC.
3017      *
3018      * Here, we extract a VBIOS image from ROM, and parse it for FWSEC.
3019      */
3020     if (pKernelGsp->pFwsecUcode == NULL)
3021     {
3022         KernelGspVbiosImg *pVbiosImg = NULL;
3023 
3024         // Start VBIOS version string as "unknown"
3025         portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown"));
3026 
3027         // Try to extract a VBIOS image.
3028         status = kgspExtractVbiosFromRom_HAL(pGpu, pKernelGsp, &pVbiosImg);
3029 
3030         if (status == NV_OK)
3031         {
3032             NvU64 vbiosVersionCombined = 0;
3033 
3034             // Got a VBIOS image, now parse it for FWSEC.
3035             status = kgspParseFwsecUcodeFromVbiosImg(pGpu, pKernelGsp, pVbiosImg,
3036                                                      &pKernelGsp->pFwsecUcode, &vbiosVersionCombined);
3037             kgspFreeVbiosImg(pVbiosImg);
3038 
3039             if (vbiosVersionCombined > 0)
3040             {
3041                 _kgspVbiosVersionToStr(vbiosVersionCombined, pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr));
3042             }
3043 
3044             if (status != NV_OK)
3045             {
3046                 NV_PRINTF(LEVEL_ERROR, "failed to parse FWSEC ucode from VBIOS image (VBIOS version %s): 0x%x\n",
3047                           pKernelGsp->vbiosVersionStr, status);
3048                 goto done;
3049             }
3050 
3051             NV_PRINTF(LEVEL_INFO, "parsed VBIOS version %s\n", pKernelGsp->vbiosVersionStr);
3052         }
3053         else if (status == NV_ERR_NOT_SUPPORTED)
3054         {
3055             // Extracting VBIOS image from ROM is not supported.
3056             status = NV_OK;
3057         }
3058         else
3059         {
3060             NV_PRINTF(LEVEL_ERROR, "failed to extract VBIOS image from ROM: 0x%x\n",
3061                         status);
3062             goto done;
3063         }
3064 
3065     }
3066 
3067     /*
3068      * We use a set of Booter ucodes to boot GSP-RM as well as manage its lifecycle.
3069      *
3070      * Booter Load loads, verifies, and boots GSP-RM in WPR2.
3071      * Booter Unload tears down WPR2 for driver unload.
3072      *
3073      * Here we prepare the Booter ucode images in SYSMEM so they may be loaded onto
3074      * SEC2 (Load / Unload) and NVDEC0 (Unload).
3075      */
3076     if (pKernelGsp->bPartitionedFmc)
3077     {
3078         //
3079         // The secure boot ucode is included in the partitioned FMC, so there is no need
3080         // for separate Booter ucodes.
3081         //
3082     }
3083     else
3084     {
3085         if (pKernelGsp->pBooterLoadUcode == NULL)
3086         {
3087             status = kgspAllocateBooterLoadUcodeImage(pGpu, pKernelGsp,
3088                                                       &pKernelGsp->pBooterLoadUcode);
3089             if (status != NV_OK)
3090             {
3091                 NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Load ucode: 0x%x\n", status);
3092                 goto done;
3093             }
3094         }
3095 
3096         if (pKernelGsp->pBooterUnloadUcode == NULL)
3097         {
3098             status = kgspAllocateBooterUnloadUcodeImage(pGpu, pKernelGsp,
3099                                                         &pKernelGsp->pBooterUnloadUcode);
3100             if (status != NV_OK)
3101             {
3102                 NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Unload ucode: 0x%x\n", status);
3103                 goto done;
3104             }
3105         }
3106     }
3107 
3108     // Prepare boot binary image.
3109     status = kgspPrepareBootBinaryImage(pGpu, pKernelGsp);
3110     if (status != NV_OK)
3111     {
3112         NV_PRINTF(LEVEL_ERROR, "Error preparing boot binary image\n");
3113         goto done;
3114     }
3115 
3116     // Prepare GSP-RM image.
3117     status = _kgspPrepareGspRmBinaryImage(pGpu, pKernelGsp, pGspFw);
3118     if (status != NV_OK)
3119     {
3120         NV_PRINTF(LEVEL_ERROR, "Error preparing GSP-RM image\n");
3121         goto done;
3122     }
3123 
3124     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, _kgspInitLibosLogDecoder(pGpu, pKernelGsp, pGspFw), done);
3125 
3126     //
3127     // Do not register nvlog flush callback if:
3128     // 1. Live decoding is enabled, as logs will be printed to dmesg.
3129     // 2. NV_ESC_RM_LOCKLESS_DIAGNOSTIC is not supported on this platform, i.e. pNvlogFlushMtx=NULL.
3130     //
3131     if (pKernelGsp->pLogElf == NULL && pKernelGsp->pNvlogFlushMtx != NULL)
3132         NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, nvlogRegisterFlushCb(kgspNvlogFlushCb, pKernelGsp), done);
3133 
3134     // Reset thread state timeout and wait for GFW_BOOT OK status
3135     threadStateResetTimeout(pGpu);
3136     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspWaitForGfwBootOk_HAL(pGpu, pKernelGsp), done);
3137 
3138     //
3139     // Set the GPU time to the wall-clock time after GFW boot is complete
3140     // (to avoid PLM collisions) but before loading GSP-RM ucode (which
3141     // consumes the updated GPU time).
3142     //
3143     tmrSetCurrentTime_HAL(pGpu, pTmr);
3144 
3145     // Initialize libos init args list
3146     kgspSetupLibosInitArgs(pGpu, pKernelGsp);
3147 
3148     // Fill in the GSP-RM message queue init parameters
3149     kgspPopulateGspRmInitArgs(pGpu, pKernelGsp, NULL);
3150 
3151     //
3152     // If ConfCompute is enabled, all RPC traffic must be encrypted. Since we
3153     // can't encrypt until GSP boots and session is established, we must send
3154     // these messages later (kgspBootstrap_HAL) in CC.
3155     //
3156     ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);
3157     if (pCC == NULL || !pCC->getProperty(pCC, PDB_PROP_CONFCOMPUTE_CC_FEATURE_ENABLED))
3158     {
3159         //
3160         // Stuff the message queue with async init messages that will be run
3161         // before OBJGPU is created.
3162         //
3163         status = kgspQueueAsyncInitRpcs(pGpu, pKernelGsp);
3164         if (status != NV_OK)
3165         {
3166             goto done;
3167         }
3168     }
3169 
3170     //
3171     // Bring up ucode with RM offload task.
3172     // If an ECC error occurs which results in the failure of the bootstrap, try again.
3173     // Subsequent attempts will shift the GSP region of FB in an attempt to avoid the
3174     // unstable memory.
3175     //
3176     const NvU8 MAX_GSP_BOOT_ATTEMPTS = 4;
3177     do
3178     {
3179         // Reset the thread state timeout after failed attempts to prevent premature timeouts.
3180         if (status != NV_OK)
3181             threadStateResetTimeout(pGpu);
3182 
3183         //
3184         // _kgspBootGspRm() will return NV_ERR_ECC_ERROR if any unhandled ECC errors are
3185         // detected during a failed GSP boot attempt. Depending on where and when the
3186         // error occurred, we may not be able to try again, in which case a different
3187         // error code will be returned.
3188         //
3189         status = _kgspBootGspRm(pGpu, pKernelGsp, pGspFw, &gpusLockedMask);
3190 
3191         //
3192         // _kgspBootGspRm() may temporarily release locks to facilitate parallel GSP bootstrap on
3193         // other GPUs. It is responsible for reacquiring them in the proper order. If there is a
3194         // failure to reacquire locks, it is unsafe to continue, regardless of the initialization
3195         // status - so we return immediately here, rather than attempting cleanup.
3196         //
3197         // Note: _kgspBootGspRm() is structured such that gpusLockedMask will always be 0 (no GPU
3198         //       locks held) if the API lock is not held upon return.
3199         //
3200         NV_ASSERT_OR_RETURN(rmapiLockIsOwner() && (gpusLockedMask != 0),
3201                             NV_ERR_INVALID_LOCK_STATE);
3202     } while ((status == NV_ERR_ECC_ERROR) && (pKernelGsp->bootAttempts < MAX_GSP_BOOT_ATTEMPTS));
3203 
3204     if (status != NV_OK)
3205     {
3206         if (status == NV_ERR_INSUFFICIENT_POWER)
3207         {
3208             OBJSYS *pSys = SYS_GET_INSTANCE();
3209             OBJGPUMGR *pGpuMgr = SYS_GET_GPUMGR(pSys);
3210 
3211             pGpuMgr->powerDisconnectedGpuBus[pGpuMgr->powerDisconnectedGpuCount++] = gpuGetBus(pGpu);
3212         }
3213 
3214         //
3215         // Ignore return value - a crash report may have already been consumed,
3216         // this is just here as a last attempt to report boot issues that might
3217         // have escaped prior checks.
3218         //
3219         (void)kgspHealthCheck_HAL(pGpu, pKernelGsp);
3220         goto done;
3221     }
3222 
3223     // At this point we should be able to exchange RPCs with the RM offload task
3224     NV_RM_RPC_SET_GUEST_SYSTEM_INFO(pGpu, status);
3225     if (status != NV_OK)
3226     {
3227         NV_PRINTF(LEVEL_ERROR, "SET_GUEST_SYSTEM_INFO failed: 0x%x\n", status);
3228         goto done;
3229     }
3230 
3231     NV_RM_RPC_GET_GSP_STATIC_INFO(pGpu, status);
3232     if (status != NV_OK)
3233     {
3234         NV_PRINTF(LEVEL_ERROR, "GET_GSP_STATIC_INFO failed: 0x%x\n", status);
3235         goto done;
3236     }
3237 
3238     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspStartLogPolling(pGpu, pKernelGsp), done);
3239 
3240 done:
3241     pKernelGsp->bInInit = NV_FALSE;
3242 
3243     if (status != NV_OK)
3244     {
3245         KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);
3246 
3247         // Preserve any captured GSP-RM logs
3248         libosPreserveLogs(&pKernelGsp->logDecode);
3249 
3250         if (pKernelPmu != NULL)
3251         {
3252             // If PMU init fails, kgsp init will also fail
3253             libosPreserveLogs(&pKernelPmu->logDecode);
3254         }
3255     }
3256 
3257     if (gpusLockedMask != 0)
3258     {
3259         rmGpuGroupLockRelease(gpusLockedMask, GPUS_LOCK_FLAGS_NONE);
3260     }
3261 
3262     return status;
3263 }
3264 
3265 /*!
3266  * Unload GSP-RM
3267  */
3268 NV_STATUS
3269 kgspUnloadRm_IMPL
3270 (
3271     OBJGPU *pGpu,
3272     KernelGsp *pKernelGsp
3273 )
3274 {
3275     NV_STATUS rpcStatus = NV_OK;
3276     NV_STATUS status;
3277     KernelGspPreparedFwsecCmd preparedCmd;
3278 
3279     NV_PRINTF(LEVEL_INFO, "unloading GSP-RM\n");
3280     NV_RM_RPC_UNLOADING_GUEST_DRIVER(pGpu, rpcStatus, NV_FALSE, NV_FALSE, 0);
3281 
3282     if (gpuIsCCFeatureEnabled(pGpu))
3283     {
3284         // FIPS: If CC enabled, we need to confirm GSP-RM was able to teardown CC state.
3285         kgspCheckGspRmCcCleanup_HAL(pGpu, pKernelGsp);
3286     }
3287 
3288     // Wait for GSP-RM processor to suspend
3289     kgspWaitForProcessorSuspend_HAL(pGpu, pKernelGsp);
3290 
3291     // Dump GSP-RM logs and reset before invoking FWSEC-SB
3292     kgspDumpGspLogs(pKernelGsp, NV_FALSE);
3293 
3294     //
3295     // Avoid cascading timeouts when attempting to invoke the below ucodes if
3296     // we are unloading due to a GSP-RM timeout.
3297     //
3298     threadStateResetTimeout(pGpu);
3299 
3300     // Because of COT, RM cannot reset GSP-RISCV; FSP has exclusive access to reset and reboot GSP for the next run.
3301     if (!(pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_COT_ENABLED)))
3302     {
3303         kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
3304     }
3305 
3306     // Invoke FWSEC-SB to put back PreOsApps during driver unload
3307     status = kgspPrepareForFwsecSb_HAL(pGpu, pKernelGsp, pKernelGsp->pFwsecUcode, &preparedCmd);
3308     if (status == NV_ERR_NOT_SUPPORTED)
3309     {
3310         // skip FWSEC-SB during driver unload if unsupported (e.g. on Hopper+)
3311         status = NV_OK;
3312     }
3313     else if (status != NV_OK)
3314     {
3315         NV_PRINTF(LEVEL_ERROR, "failed to prepare for FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
3316         NV_ASSERT(0);
3317     }
3318     else
3319     {
3320         status = kgspExecuteFwsec_HAL(pGpu, pKernelGsp, &preparedCmd);
3321         if (status != NV_OK)
3322         {
3323             NV_PRINTF(LEVEL_ERROR, "failed to execute FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
3324             NV_ASSERT(0);
3325         }
3326     }
3327 
3328     if (pKernelGsp->bPartitionedFmc)
3329     {
3330         //
3331         // GSP-RM invokes the partitioned FMC to unload directly as part of the
3332         // NV_RM_RPC_UNLOADING_GUEST_DRIVER call above.
3333         //
3334         status = rpcStatus;
3335     }
3336     else
3337     {
3338         // After instructing GSP-RM to unload itself, run Booter Unload to teardown WPR2
3339         status = kgspExecuteBooterUnloadIfNeeded_HAL(pGpu, pKernelGsp, 0);
3340     }
3341 
3342     //
3343     // To fix boot issue after GPU reset on ESXi config:
3344     // The root cause is still unknown, but it looks like some sanity check fails during boot after the reset is done.
3345     // As a temporary WAR, add a delay of 250 ms after GSP-RM unload is done.
3346     // Limit this to [VGPU-GSP] supported configs only, and only when we are in the GPU RESET path.
3347     //
3348     if (API_GPU_IN_RESET_SANITY_CHECK(pGpu) &&
3349         gpuIsSriovEnabled(pGpu) &&
3350         IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
3351     {
3352         osDelay(250);
3353     }
3354 
3355     if (rpcStatus != NV_OK)
3356     {
3357         return rpcStatus;
3358     }
3359 
3360     return status;
3361 }
3362 
3363 /*!
3364  * Free RPC infrastructure and KernelGsp object
3365  */
3366 void
3367 kgspDestruct_IMPL
3368 (
3369     KernelGsp *pKernelGsp
3370 )
3371 {
3372     OBJGPU *pGpu = ENG_GET_GPU(pKernelGsp);
3373 
3374     if (!IS_GSP_CLIENT(pGpu))
3375         return;
3376 
3377     // set VBIOS version string back to "unknown"
3378     portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown"));
3379 
3380     kgspFreeFlcnUcode(pKernelGsp->pFwsecUcode);
3381     pKernelGsp->pFwsecUcode = NULL;
3382 
3383     kgspFreeFlcnUcode(pKernelGsp->pBooterLoadUcode);
3384     pKernelGsp->pBooterLoadUcode = NULL;
3385 
3386     kgspFreeFlcnUcode(pKernelGsp->pBooterUnloadUcode);
3387     pKernelGsp->pBooterUnloadUcode = NULL;
3388 
3389     kgspFreeFlcnUcode(pKernelGsp->pScrubberUcode);
3390     pKernelGsp->pScrubberUcode = NULL;
3391 
3392     kgspFreeBootArgs_HAL(pGpu, pKernelGsp);
3393 
3394     _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
3395     _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
3396     _kgspFreeBootBinaryImage(pGpu, pKernelGsp);
3397     _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
3398     _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp);
3399 
3400     kgspFreeSuspendResumeData_HAL(pGpu, pKernelGsp);
3401 
3402 #if KERNEL_GSP_TRACING_RATS_ENABLED
3403     multimapDestroy(&pGpu->gspTraceEventBufferBindingsUid);
3404 #endif
3405 }
3406 
3407 void
3408 kgspDumpGspLogsUnlocked_IMPL
3409 (
3410     KernelGsp *pKernelGsp,
3411     NvBool bSyncNvLog
3412 )
3413 {
3414     if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
3415     {
3416         libosExtractLogs(&pKernelGsp->logDecode, bSyncNvLog);
3417 
3418         if (pKernelGsp->bHasVgpuLogs)
3419         {
3420             // Dump logs from vGPU partition
3421             for (NvU32 i = 0; i < MAX_PARTITIONS_WITH_GFID; i++)
3422             {
3423                 libosExtractLogs(&pKernelGsp->logDecodeVgpuPartition[i], bSyncNvLog);
3424             }
3425         }
3426     }
3427 
3428 }
3429 
3430 /*!
3431  * Dump logs coming from GSP-RM
3432  *
3433  * @param[in] pKernelGsp    KernelGsp pointer
3434  * @param[in] bSyncNvLog    NV_TRUE: Copy a snapshot of the libos logs
3435  *                          into the nvLog wrap buffers.
3436  */
3437 void
3438 kgspDumpGspLogs_IMPL
3439 (
3440     KernelGsp *pKernelGsp,
3441     NvBool bSyncNvLog
3442 )
3443 {
3444     if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
3445     {
3446         if (pKernelGsp->pNvlogFlushMtx != NULL)
3447             portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx);
3448 
3449         kgspDumpGspLogsUnlocked(pKernelGsp, bSyncNvLog);
3450 
3451         if (pKernelGsp->pNvlogFlushMtx != NULL)
3452             portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx);
3453     }
3454 }
3455 
3456 /*!
3457  * Populate GSP-RM init arguments.
3458  */
3459 void
3460 kgspPopulateGspRmInitArgs_IMPL
3461 (
3462     OBJGPU    *pGpu,
3463     KernelGsp *pKernelGsp,
3464     GSP_SR_INIT_ARGUMENTS *pGspInitArgs
3465 )
3466 {
3467     GSP_ARGUMENTS_CACHED *pGspArgs = pKernelGsp->pGspArgumentsCached;
3468     MESSAGE_QUEUE_INIT_ARGUMENTS *pMQInitArgs = &pGspArgs->messageQueueInitArguments;
3469     MESSAGE_QUEUE_COLLECTION *pMQCollection   = pKernelGsp->pMQCollection;
3470     GSP_SR_INIT_ARGUMENTS *pSrInitArgs =  &pGspArgs->srInitArguments;
3471 
3472     // Setup the message queue arguments
3473     pMQInitArgs->sharedMemPhysAddr      = pMQCollection->sharedMemPA;
3474     pMQInitArgs->pageTableEntryCount    = pMQCollection->pageTableEntryCount;
3475     pMQInitArgs->cmdQueueOffset         = pMQCollection->pageTableSize;
3476     pMQInitArgs->statQueueOffset        = pMQInitArgs->cmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].commandQueueSize;
3477     if (pKernelGsp->bIsTaskIsrQueueRequired)
3478     {
3479         pMQInitArgs->locklessCmdQueueOffset  = pMQInitArgs->statQueueOffset        + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].statusQueueSize;
3480         pMQInitArgs->locklessStatQueueOffset = pMQInitArgs->locklessCmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX].commandQueueSize;
3481     }
3482     else
3483     {
3484         pMQInitArgs->locklessCmdQueueOffset  = 0;
3485         pMQInitArgs->locklessStatQueueOffset = 0;
3486     }
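
    //
    // Illustrative layout of the shared memory region described by the
    // offsets above (a sketch only; the actual sizes come from pMQCollection):
    //
    //   [ page table | RM cmd queue | RM stat queue |
    //     task ISR cmd queue | task ISR stat queue (only if required) ]
    //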
3487 
3488     if (pGspInitArgs == NULL)
3489     {
3490         pSrInitArgs->bInPMTransition     = NV_FALSE;
3491         pSrInitArgs->oldLevel            = 0;
3492         pSrInitArgs->flags               = 0;
3493     }
3494     else
3495     {
3496         pSrInitArgs->bInPMTransition     = NV_TRUE;
3497         pSrInitArgs->oldLevel            = pGspInitArgs->oldLevel;
3498         pSrInitArgs->flags               = pGspInitArgs->flags;
3499     }
3500 
3501     pGspArgs->gpuInstance = pGpu->gpuInstance;
3502 
3503     portMemSet(&pGspArgs->profilerArgs, 0, sizeof(pGspArgs->profilerArgs));
3504 
3505     if (pKernelGsp->pProfilerSamples != NULL &&
3506         pKernelGsp->pProfilerSamplesMD != NULL)
3507     {
3508         pGspArgs->profilerArgs.pa = memdescGetPhysAddr(pKernelGsp->pProfilerSamplesMD, AT_GPU, 0);
3509         pGspArgs->profilerArgs.size = memdescGetSize(pKernelGsp->pProfilerSamplesMD);
3510     }
3511 }
3512 
3513 /*!
3514  * Prepare boot binary image for GSP-RM boot.
3515  *
3516  * @return NV_OK if boot binary image prepared successfully.
3517  *         Appropriate NV_ERR_xxx value otherwise.
3518  */
3519 NV_STATUS
3520 kgspPrepareBootBinaryImage_IMPL
3521 (
3522     OBJGPU *pGpu,
3523     KernelGsp *pKernelGsp
3524 )
3525 {
3526     NV_STATUS status;
3527     BINDATA_STORAGE *pBinStorageImage;
3528     BINDATA_STORAGE *pBinStorageDesc;
3529     NvU32 bufSize;
3530     NvU32 bufSizeAligned;
3531     RM_RISCV_UCODE_DESC *pDesc = NULL;
3532     NvP64 pVa = NvP64_NULL;
3533     NvP64 pPriv = NvP64_NULL;
3534     NvU64 flags = MEMDESC_FLAGS_NONE;
3535 
3536     NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeImage == NULL, NV_ERR_INVALID_STATE);
3537     NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeDesc  == NULL, NV_ERR_INVALID_STATE);
3538 
3539     // get the bindata storage for the image/descriptor
3540     kgspGetGspRmBootUcodeStorage_HAL(pGpu, pKernelGsp, &pBinStorageImage, &pBinStorageDesc);
3541 
3542     // copy the image to sysmem
3543     bufSize = bindataGetBufferSize(pBinStorageImage);
3544     bufSizeAligned = NV_ALIGN_UP(bufSize, 0x1000);
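    // (Illustrative note: 0x1000 rounds the image size up to a 4 KiB page,
    // matching the RM_PAGE_SIZE alignment used for the memdesc below.)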
3545 
3546     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
3547 
3548     NV_ASSERT_OK_OR_GOTO(status,
3549                         memdescCreate(&pKernelGsp->pGspRmBootUcodeMemdesc,
3550                                 pGpu,
3551                                 bufSizeAligned,
3552                                 RM_PAGE_SIZE,
3553                                 NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
3554                                 flags),
3555                         fail);
3556 
3557     memdescTagAlloc(status,
3558             NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_15, pKernelGsp->pGspRmBootUcodeMemdesc);
3559     NV_ASSERT_OK_OR_GOTO(status, status, fail);
3560 
3561     NV_ASSERT_OK_OR_GOTO(status,
3562                         memdescMap(pKernelGsp->pGspRmBootUcodeMemdesc, 0,
3563                                 memdescGetSize(pKernelGsp->pGspRmBootUcodeMemdesc),
3564                                 NV_TRUE, NV_PROTECT_READ_WRITE,
3565                                 &pVa, &pPriv),
3566                         fail);
3567 
3568     pKernelGsp->gspRmBootUcodeSize   = bufSize;
3569     pKernelGsp->pGspRmBootUcodeImage = (NvU8 *)NvP64_VALUE(pVa);
3570     pKernelGsp->pGspRmBootUcodeMemdescPriv = pPriv;
3571 
3572     NV_ASSERT_OK_OR_GOTO(status,
3573                         bindataWriteToBuffer(pBinStorageImage,
3574                                pKernelGsp->pGspRmBootUcodeImage,
3575                                bufSize),
3576                         fail);
3577 
3578     // get the image descriptor
3579     NV_ASSERT_OK_OR_GOTO(status,
3580                          bindataStorageAcquireData(pBinStorageDesc, (const void**)&pDesc),
3581                          fail);
3582     pKernelGsp->pGspRmBootUcodeDesc = pDesc;
3583 
3584     return status;
3585 
3586 fail:
3587     _kgspFreeBootBinaryImage(pGpu, pKernelGsp);
3588     return status;
3589 }
3590 
3591 static void
3592 _kgspFreeBootBinaryImage
3593 (
3594     OBJGPU *pGpu,
3595     KernelGsp *pKernelGsp
3596 )
3597 {
3598     bindataStorageReleaseData(pKernelGsp->pGspRmBootUcodeDesc);
3599     pKernelGsp->pGspRmBootUcodeDesc  = NULL;
3600 
3601     if (pKernelGsp->pGspRmBootUcodeImage != NULL)
3602     {
3603         memdescUnmap(pKernelGsp->pGspRmBootUcodeMemdesc,
3604                      NV_TRUE, osGetCurrentProcess(),
3605                      (void *)pKernelGsp->pGspRmBootUcodeImage,
3606                      pKernelGsp->pGspRmBootUcodeMemdescPriv);
3607         pKernelGsp->pGspRmBootUcodeImage = NULL;
3608         pKernelGsp->pGspRmBootUcodeMemdescPriv = NULL;
3609     }
3610     if (pKernelGsp->pGspRmBootUcodeMemdesc != NULL)
3611     {
3612         memdescFree(pKernelGsp->pGspRmBootUcodeMemdesc);
3613         memdescDestroy(pKernelGsp->pGspRmBootUcodeMemdesc);
3614         pKernelGsp->pGspRmBootUcodeMemdesc = NULL;
3615     }
3616 
3617     pKernelGsp->gspRmBootUcodeSize   = 0;
3618 }
3619 
3620 static NV_STATUS
3621 _kgspCreateSignatureMemdesc
3622 (
3623     OBJGPU *pGpu,
3624     KernelGsp *pKernelGsp,
3625     GSP_FIRMWARE *pGspFw
3626 )
3627 {
3628     NV_STATUS status = NV_OK;
3629     NvU8 *pSignatureVa = NULL;
3630     NvU64 flags = MEMDESC_FLAGS_NONE;
3631 
3632     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
3633 
3634     // NOTE: align to 256 because that's the alignment needed for Booter DMA
3635     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3636         memdescCreate(&pKernelGsp->pSignatureMemdesc, pGpu,
3637             NV_ALIGN_UP(pGspFw->signatureSize, 256), 256,
3638             NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags));
3639 
3640     memdescTagAlloc(status,
3641             NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_16, pKernelGsp->pSignatureMemdesc);
3642     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, status, fail_create);
3643 
3644     pSignatureVa = memdescMapInternal(pGpu, pKernelGsp->pSignatureMemdesc, TRANSFER_FLAGS_NONE);
3645     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
3646         (pSignatureVa != NULL) ? NV_OK : NV_ERR_INSUFFICIENT_RESOURCES,
3647         fail_alloc);
3648 
3649     portMemCopy(pSignatureVa, memdescGetSize(pKernelGsp->pSignatureMemdesc),
3650         pGspFw->pSignatureData, pGspFw->signatureSize);
3651 
3652     memdescUnmapInternal(pGpu, pKernelGsp->pSignatureMemdesc, 0);
3653     pSignatureVa = NULL;
3654 
3655     return status;
3656 
3657 fail_alloc:
3658     memdescFree(pKernelGsp->pSignatureMemdesc);
3659 
3660 fail_create:
3661     memdescDestroy(pKernelGsp->pSignatureMemdesc);
3662     pKernelGsp->pSignatureMemdesc = NULL;
3663 
3664     return status;
3665 }
3666 
3667 /*!
3668  * Verify that the version embedded in the .fwversion section of the ELF given
3669  * by pElfData and elfDataSize matches our NV_VERSION_STRING.
3670  */
3671 static NV_STATUS
3672 _kgspFwContainerVerifyVersion
3673 (
3674     OBJGPU *pGpu,
3675     KernelGsp *pKernelGsp,
3676     const void *pElfData,
3677     NvU64 elfDataSize,
3678     const char *pNameInMsg
3679 )
3680 {
3681     const char *pFwversion;
3682     NvU64 fwversionSize;
3683     NvU64 expectedVersionLength = portStringLength(NV_VERSION_STRING);
3684 
3685     {
3686         const void *pFwversionRaw;
3687 
3688         NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3689             _kgspFwContainerGetSection(pGpu, pKernelGsp,
3690                 pElfData,
3691                 elfDataSize,
3692                 GSP_VERSION_SECTION_NAME,
3693                 &pFwversionRaw,
3694                 &fwversionSize));
3695 
3696         pFwversion = (const char *) pFwversionRaw;
3697     }
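
    //
    // The .fwversion section is expected to hold the version string plus its
    // NUL terminator, hence the "expectedVersionLength + 1" size check below.
    //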
3698 
3699     // Check that text in .fwversion section of ELF matches our NV_VERSION_STRING
3700     if ((fwversionSize != expectedVersionLength + 1) ||
3701         (portStringCompare(pFwversion, NV_VERSION_STRING, expectedVersionLength) != 0))
3702     {
3703         // Sanity check .fwversion before attempting to print it in the error message
3704         if ((fwversionSize > 0) &&
3705             (fwversionSize < 64) &&
3706             (pFwversion[fwversionSize - 1] == '\0'))
3707         {
3708             NV_PRINTF(LEVEL_ERROR, "%s version mismatch: got version %s, expected version %s\n",
3709                       pNameInMsg, pFwversion, NV_VERSION_STRING);
3710         }
3711         else
3712         {
3713             NV_PRINTF(LEVEL_ERROR, "%s version unknown or malformed, expected version %s\n",
3714                       pNameInMsg, NV_VERSION_STRING);
3715         }
3716         return NV_ERR_INVALID_DATA;
3717     }
3718 
3719     return NV_OK;
3720 }
3721 
3722 /*!
3723  * Get the name of the section corresponding to the given section name
3724  * prefix and the current chip.
3725  */
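//
// Example (illustrative, assuming a ".fwsignature_" prefix and a "ga10x"
// chip family string): the constructed section name would be
// ".fwsignature_ga10x".
//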
3726 static NV_STATUS
3727 _kgspGetSectionNameForPrefix
3728 (
3729     OBJGPU *pGpu,
3730     KernelGsp *pKernelGsp,
3731     char *pSectionNameBuf,  // out
3732     NvLength sectionNameBufSize,
3733     const char *pSectionPrefix
3734 )
3735 {
3736     NvLength sectionPrefixLength;
3737 
3738     nv_firmware_chip_family_t chipFamily;
3739     const char *pChipFamilyName;
3740     NvLength chipFamilyNameLength;
3741 
3742     NvLength totalSize;
3743 
3744     NV_ASSERT_OR_RETURN(pSectionNameBuf != NULL, NV_ERR_INVALID_ARGUMENT);
3745     NV_ASSERT_OR_RETURN(sectionNameBufSize > 0, NV_ERR_INVALID_ARGUMENT);
3746     NV_ASSERT_OR_RETURN(pSectionPrefix != NULL, NV_ERR_INVALID_ARGUMENT);
3747 
3748     chipFamily = nv_firmware_get_chip_family(gpuGetChipArch(pGpu),
3749                                              gpuGetChipImpl(pGpu));
3750     NV_ASSERT_OR_RETURN(chipFamily != NV_FIRMWARE_CHIP_FAMILY_NULL,
3751                         NV_ERR_INVALID_STATE);
3752 
3753     pChipFamilyName = nv_firmware_chip_family_to_string(chipFamily);
3754     NV_ASSERT_OR_RETURN(pChipFamilyName != NULL, NV_ERR_INVALID_STATE);
3755 
3756     sectionPrefixLength = portStringLength(pSectionPrefix);
3757     chipFamilyNameLength = portStringLength(pChipFamilyName);
3758 
3759     totalSize = sectionPrefixLength + chipFamilyNameLength + 1;
3760     NV_ASSERT_OR_RETURN(sectionNameBufSize >= sectionPrefixLength + 1,
3761                         NV_ERR_BUFFER_TOO_SMALL);
3762     NV_ASSERT_OR_RETURN(sectionNameBufSize >= totalSize,
3763                         NV_ERR_BUFFER_TOO_SMALL);
3764 
3765     portStringCopy(pSectionNameBuf, sectionNameBufSize,
3766                    pSectionPrefix, sectionPrefixLength + 1);
3767     portStringCat(pSectionNameBuf, sectionNameBufSize,
3768                   pChipFamilyName, chipFamilyNameLength + 1);
3769 
3770     return NV_OK;
3771 }
3772 
3773 static NV_STATUS
3774 _kgspPrepareGspRmBinaryImage
3775 (
3776     OBJGPU *pGpu,
3777     KernelGsp *pKernelGsp,
3778     GSP_FIRMWARE *pGspFw
3779 )
3780 {
3781     char signatureSectionName[32];
3782 
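    //
    // Descriptive summary of the steps below: (1) check .fwversion against
    // the driver version, (2) locate the GSP-RM image section, (3) build the
    // per-chip signature section name and locate that section, (4) stage the
    // signature in sysmem, and (5) build the radix3 tables used to fetch the
    // image.
    //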
3783     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3784         _kgspFwContainerVerifyVersion(pGpu, pKernelGsp,
3785             pGspFw->pBuf,
3786             pGspFw->size,
3787             "GSP firmware image"));
3788 
3789     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3790         _kgspFwContainerGetSection(pGpu, pKernelGsp,
3791             pGspFw->pBuf,
3792             pGspFw->size,
3793             GSP_IMAGE_SECTION_NAME,
3794             &pGspFw->pImageData,
3795             &pGspFw->imageSize));
3796 
3797     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3798         _kgspGetSectionNameForPrefix(pGpu, pKernelGsp,
3799             signatureSectionName, sizeof(signatureSectionName),
3800             kgspGetSignatureSectionNamePrefix_HAL(pGpu, pKernelGsp)));
3801 
3802     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3803         _kgspFwContainerGetSection(pGpu, pKernelGsp,
3804             pGspFw->pBuf,
3805             pGspFw->size,
3806             signatureSectionName,
3807             &pGspFw->pSignatureData,
3808             &pGspFw->signatureSize));
3809 
3810     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3811         _kgspCreateSignatureMemdesc(pGpu, pKernelGsp,
3812             pGspFw));
3813 
3814     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3815         kgspCreateRadix3(pGpu, pKernelGsp, &pKernelGsp->pGspUCodeRadix3Descriptor,
3816             NULL, pGspFw->pImageData, pGspFw->imageSize));
3817 
3818     return NV_OK;
3819 }
3820 
3821 NV_STATUS
3822 kgspCreateRadix3_IMPL
3823 (
3824     OBJGPU *pGpu,
3825     KernelGsp *pKernelGsp,
3826     MEMORY_DESCRIPTOR **ppMemdescRadix3,
3827     MEMORY_DESCRIPTOR *pMemdescData,
3828     const void *pData,
3829     NvU64 size
3830 )
3831 {
3832     const NvU64 entriesLog2 = LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2 - 3;
3833     NvU8 *pRadix3Buf;
3834     NvP64 pVaKernel;
3835     NvP64 pPrivKernel;
3836     NvU64 ptSize;
3837     NvU64 allocSize;
3838     NvU64 nPages = 0;
3839     NvU64 dataOffset = 0;
3840     NvU32 i;
3841     NV_STATUS status = NV_OK;
3842     NvU64 flags = MEMDESC_FLAGS_KERNEL_MODE;
3843 
3844     // radix3 working array.
3845     struct
3846     {
3847         NvU64  nPages;
3848         NvU64  offset;
3849     } radix3[4];
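
    //
    // Sketch of the layout being built (assuming 4 KiB radix pages and
    // 8-byte entries, i.e. 512 entries per page):
    //   radix3[0]: single top-level page of PDEs -> level-1 pages
    //   radix3[1]: level-1 PDE pages             -> level-2 pages
    //   radix3[2]: level-2 PTE pages             -> data pages
    //   radix3[3]: the data pages (allocated here only when no separate
    //              memdesc is supplied)
    //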
3850 
3851     NV_ASSERT_OR_RETURN(ppMemdescRadix3 != NULL, NV_ERR_INVALID_PARAMETER);
3852     NV_ASSERT_OR_ELSE_STR(!((pMemdescData != NULL) && (pData != NULL)),
3853                           "Specify pMemdescData or pData, or none, but not both",
3854                           return NV_ERR_INVALID_PARAMETER);
3855 
3856     // If the size is not specified, get it from the memory descriptor.
3857     if ((size == 0) && (pMemdescData != NULL))
3858         size = memdescGetSize(pMemdescData);
3859     NV_ASSERT_OR_RETURN(size > 0, NV_ERR_OUT_OF_RANGE);
3860 
3861     // Clear working structure.
3862     portMemSet(radix3, 0, sizeof radix3);
3863 
3864     // Populate npages, high to low.
3865     i = NV_ARRAY_ELEMENTS(radix3) - 1;
3866     radix3[i].nPages = (size + LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE - 1) >>
3867                        LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3868     for (; i > 0; i--)
3869         radix3[i - 1].nPages = ((radix3[i].nPages - 1) >> entriesLog2) + 1;
3870 
3871     // Populate offset, low to high.
3872     for (i = 1; i < NV_ARRAY_ELEMENTS(radix3); i++)
3873     {
3874         nPages += radix3[i - 1].nPages;
3875         radix3[i].offset = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3876     }
3877 
3878     NV_ASSERT_OR_RETURN(radix3[0].nPages == 1, NV_ERR_OUT_OF_RANGE);
3879 
3880     // Allocate space for PTEs and PDEs.
3881     ptSize = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3882     allocSize = ptSize;
3883 
3884     if (pMemdescData == NULL)
3885     {
3886         // We don't have a separate descriptor for the data.  We need PTEs,
3887         // so include space for data in the new descriptor.
3888         allocSize += radix3[3].nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3889     }
3890 
3891     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
3892 
3893     NV_ASSERT_OK_OR_GOTO(status,
3894         memdescCreate(ppMemdescRadix3, pGpu, allocSize,
3895             LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE,
3896             NV_MEMORY_NONCONTIGUOUS,
3897             ADDR_SYSMEM,
3898             NV_MEMORY_CACHED,
3899             flags),
3900         done);
3901 
3902     memdescTagAlloc(status,
3903             NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_17, (*ppMemdescRadix3));
3904     NV_ASSERT_OK_OR_GOTO(status, status, error_ret);
3905 
3906     // Create kernel mapping.
3907     NV_ASSERT_OK_OR_GOTO(status,
3908         memdescMap(*ppMemdescRadix3, 0, allocSize, NV_TRUE, NV_PROTECT_WRITEABLE,
3909             &pVaKernel, &pPrivKernel),
3910         error_ret);
3911 
3912     if (pVaKernel == NvP64_NULL)
3913     {
3914         NV_PRINTF(LEVEL_ERROR, "VA error for radix3 shared buffer\n");
3915         status = NV_ERR_NO_MEMORY;
3916         goto error_ret;
3917     }
3918 
3919     pRadix3Buf = KERNEL_POINTER_FROM_NvP64(NvU8 *, pVaKernel);
3920 
3921     // Zap out page table.
3922     portMemSet(pRadix3Buf, 0, ptSize);
3923 
3924     // Fill in PDEs.
3925     for (i = 0; i < NV_ARRAY_ELEMENTS(radix3) - 2; i++)
3926     {
3927         memdescGetPhysAddrs(*ppMemdescRadix3,
3928             AT_GPU,                     // addressTranslation
3929             radix3[i + 1].offset,       // offset
3930             RM_PAGE_SIZE,               // stride
3931             radix3[i + 1].nPages,       // count
3932             (RmPhysAddr *)(pRadix3Buf + radix3[i].offset)); // physical address table
3933     }
3934 
3935     dataOffset = radix3[3].offset;
3936 
3937     if (pData != NULL)
3938     {
3939         // Optionally copy data into the radix3 buffer.
3940         portMemCopy(pRadix3Buf + dataOffset, size, pData, size);
3941 
3942         // If we only have part of the last page, clear the rest.
3943         NvU32 clearSize = allocSize - dataOffset - size;
3944         if (clearSize != 0)
3945             portMemSet(pRadix3Buf + dataOffset + size, 0, clearSize);
3946 
3947         pMemdescData = *ppMemdescRadix3;
3948     }
3949 
3950     memdescGetPhysAddrs(*ppMemdescRadix3,
3951         AT_GPU,                     // addressTranslation
3952         dataOffset,                 // offset
3953         RM_PAGE_SIZE,               // stride
3954         radix3[3].nPages,           // count
3955         (RmPhysAddr *)(pRadix3Buf + radix3[2].offset));  // physical address table
3956 
3957     //
3958     // No reason to keep this memory mapped on the CPU side.  Only GSP will
3959     // access it after this point.
3960     //
3961     memdescUnmap(*ppMemdescRadix3, NV_TRUE, osGetCurrentProcess(),
3962                   pVaKernel, pPrivKernel);
3963 done:
3964     return status;
3965 
3966 error_ret:
3967     if (*ppMemdescRadix3 != NULL)
3968     {
3969         memdescFree(*ppMemdescRadix3);
3970         memdescDestroy(*ppMemdescRadix3);
3971         *ppMemdescRadix3 = NULL;
3972     }
3973 
3974     return status;
3975 }
3976 
3977 static NV_STATUS
3978 _kgspFwContainerGetSection
3979 (
3980     OBJGPU *pGpu,
3981     KernelGsp *pKernelGsp,
3982     const void *pElfData,
3983     NvU64 elfDataSize,
3984     const char *pSectionName,
3985     const void **ppSectionData,
3986     NvU64 *pSectionSize
3987 )
3988 {
3989     const NvU8 *pGspBuf = pElfData;
3990     const LibosElf64Header *pElfHeader;
3991     const LibosElf64SectionHeader *pElfSectionHeader;
3992     NvU64 elfSectionHeaderTableLength;
3993     NvU64 elfSectionHeaderMaxIdx;
3994     NvU64 elfSectionNamesTableOffset;
3995     NvU64 elfSectionNamesTableSize;
3996     NvU64 elfSectionNamesTableMaxIdx;
3997     static const NvU32 elfMagicNumber = 0x464C457F;
3998     static const NvU8 elfClass64 = 0x2;
3999     static const NvU8 elfLittleEndian = 0x1;
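    // Note: elfMagicNumber is the little-endian encoding of "\x7FELF";
    // elfClass64 and elfLittleEndian correspond to ELFCLASS64 and ELFDATA2LSB.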
4000     const char *pCurrentSectionName;
4001     NvLength sectionNameLength;
4002     NvS16 idx;
4003 
4004     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfData != NULL, NV_ERR_INVALID_ARGUMENT);
4005     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize > 0, NV_ERR_INVALID_ARGUMENT);
4006     NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionName != NULL, NV_ERR_INVALID_ARGUMENT);
4007     NV_CHECK_OR_RETURN(LEVEL_ERROR, ppSectionData != NULL, NV_ERR_INVALID_ARGUMENT);
4008     NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionSize != NULL, NV_ERR_INVALID_ARGUMENT);
4009     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= sizeof(LibosElf64Header), NV_ERR_INVALID_DATA);
4010 
4011     sectionNameLength = portStringLength(pSectionName);
4012 
4013     pElfHeader = (const LibosElf64Header*) pGspBuf;
4014 
4015     // Check for the elf identifier at the beginning of the file
4016     NV_CHECK_OR_RETURN(LEVEL_ERROR, *(NvU32*)&pElfHeader->ident == elfMagicNumber, NV_ERR_INVALID_DATA);
4017     // Make sure the data is formatted as little endian
4018     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[5] == elfLittleEndian, NV_ERR_INVALID_DATA);
4019     // Check the class type, only ELFCLASS64 is supported
4020     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[4] == elfClass64, NV_ERR_INVALID_DATA);
4021 
4022     // Make sure that the elf section header table is valid
4023     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shentsize == sizeof(LibosElf64SectionHeader), NV_ERR_INVALID_DATA);
4024     NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeMulU64(pElfHeader->shentsize, pElfHeader->shnum, &elfSectionHeaderTableLength), NV_ERR_INVALID_DATA);
4025     NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfHeader->shoff, elfSectionHeaderTableLength - 1, &elfSectionHeaderMaxIdx), NV_ERR_INVALID_DATA);
4026     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionHeaderMaxIdx, NV_ERR_INVALID_DATA);
4027     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shstrndx <= pElfHeader->shnum, NV_ERR_INVALID_DATA);
4028 
4029     // Get the offset and size of the table that holds the section names and make sure they are valid
4030     pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[pElfHeader->shoff + (pElfHeader->shstrndx * pElfHeader->shentsize)];
4031     elfSectionNamesTableOffset = pElfSectionHeader->offset;
4032     elfSectionNamesTableSize = pElfSectionHeader->size;
4033     NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(elfSectionNamesTableOffset, elfSectionNamesTableSize - 1, &elfSectionNamesTableMaxIdx), NV_ERR_INVALID_DATA);
4034     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionNamesTableMaxIdx, NV_ERR_INVALID_DATA);
4035 
4036     // Iterate through all of the section headers to find the requested section
4037     pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[elfSectionHeaderMaxIdx + 1 - sizeof(*pElfSectionHeader)];
4038 
4039     for (idx = pElfHeader->shnum - 1; idx >= 0; idx--, pElfSectionHeader--)
4040     {
4041         NvU64 currentSectionNameMaxLength;
4042         NvU64 elfSectionMaxIdx;
4043 
4044         // Make sure the header name index fits within the section names table
4045         NV_CHECK_OR_RETURN(LEVEL_ERROR, elfSectionNamesTableSize - 1 >= pElfSectionHeader->name, NV_ERR_INVALID_DATA);
4046         currentSectionNameMaxLength = elfSectionNamesTableSize - pElfSectionHeader->name - 1;
4047         pCurrentSectionName = (const char *) &pGspBuf[elfSectionNamesTableOffset + pElfSectionHeader->name];
4048 
4049         // Make sure the elf section size and offset are valid
4050         if (pElfSectionHeader->size > 0)
4051         {
4052             NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfSectionHeader->offset, pElfSectionHeader->size - 1, &elfSectionMaxIdx), NV_ERR_INVALID_DATA);
4053         }
4054         else
4055         {
4056             elfSectionMaxIdx = pElfSectionHeader->offset;
4057         }
4058         NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionMaxIdx, NV_ERR_INVALID_DATA);
4059 
4060         // Check whether the section name matches the expected section name
4061         if ((sectionNameLength <= currentSectionNameMaxLength) &&
4062             (portStringCompare(pCurrentSectionName, pSectionName, sectionNameLength) == 0) &&
4063             (pCurrentSectionName[sectionNameLength] == '\0'))
4064         {
4065             *ppSectionData = &pGspBuf[pElfSectionHeader->offset];
4066             *pSectionSize = pElfSectionHeader->size;
4067 
4068             return NV_OK;
4069         }
4070     }
4071 
4072     return NV_ERR_OBJECT_NOT_FOUND;
4073 }
4074 
4075 /*!
4076  * Setup libos init arguments.
4077  */
4078 void
4079 kgspSetupLibosInitArgs_IMPL
4080 (
4081     OBJGPU         *pGpu,
4082     KernelGsp *pKernelGsp
4083 )
4084 {
4085     LibosMemoryRegionInitArgument *pLibosInitArgs = pKernelGsp->pLibosInitArgumentsCached;
4086     NvU8 idx;
4087 
4088     portMemSet(pLibosInitArgs, 0, LIBOS_INIT_ARGUMENTS_SIZE);
4089 
4090     // Add memory areas for logging each LIBOS task.
4091     // @note LOGINIT must be first for early init logging to work.
4092     // @note: These should be switched to radix regions to remove the need
4093     //        for large apertures in the RM task for logging.
4094     for (idx = 0; idx < LOGIDX_SIZE; idx++)
4095     {
4096         pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
4097         pLibosInitArgs[idx].loc  = LIBOS_MEMORY_REGION_LOC_SYSMEM;
4098         pLibosInitArgs[idx].id8  = pKernelGsp->rmLibosLogMem[idx].id8;
4099         pLibosInitArgs[idx].pa   = pKernelGsp->rmLibosLogMem[idx].pTaskLogBuffer[1];
4100         pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->rmLibosLogMem[idx].pTaskLogDescriptor);
4101     }
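
    //
    // At this point entries 0..LOGIDX_SIZE-1 describe the log buffers; the
    // next entry, filled in below, points at the cached GSP-RM arguments.
    //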
4102 
4103     // insert GSP-RM ELF args address; id must match libos-config.py entry
4104     pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
4105     pLibosInitArgs[idx].loc  = LIBOS_MEMORY_REGION_LOC_SYSMEM;
4106     pLibosInitArgs[idx].id8  = _kgspGenerateInitArgId("RMARGS");
4107     pLibosInitArgs[idx].pa   = memdescGetPhysAddr(pKernelGsp->pGspArgumentsDescriptor, AT_GPU, 0);
4108     pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->pGspArgumentsDescriptor);
4109 
4110     portAtomicMemoryFenceFull();
4111 }
4112 
4113 /*!
4114  * Receive and process RPC event from GSP-RM.
4115  *
4116  * This function is called from the interrupt bottom-half handler (DPC) and
4117  * could race with the normal RPC flow, _kgspRpcRecvPoll().
4118  * This race is currently avoided only because the DPC is executed under
4119  * the GPUs lock, so the RPC and bottom-half handler are mutually exclusive
4120  * control flows.
4121  */
4122 void
4123 kgspRpcRecvEvents_IMPL
4124 (
4125     OBJGPU *pGpu,
4126     KernelGsp  *pKernelGsp
4127 )
4128 {
4129     NvU32 gpuMaskUnused;
4130     NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
4131     //
4132     // We should never have an event with code NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS.
4133     // If we do, the assert will fail on NV_WARN_MORE_PROCESSING_REQUIRED,
4134     // in addition to general error codes.
4135     //
4136     NV_ASSERT_OK(_kgspRpcDrainEvents(pGpu, pKernelGsp, NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS, KGSP_RPC_EVENT_HANDLER_CONTEXT_INTERRUPT));
4137 }
4138 
4139 /*!
4140  * Wait for GSP-RM initialization to complete.
4141  */
4142 NV_STATUS
4143 kgspWaitForRmInitDone_IMPL
4144 (
4145     OBJGPU *pGpu,
4146     KernelGsp *pKernelGsp
4147 )
4148 {
4149     OBJRPC *pRpc = pKernelGsp->pRpc;
4150 
4151     //
4152     // Kernel RM can time out when GSP-RM has an error condition.  Give GSP-RM
4153     // a chance to report the error before we pull the rug out from under it.
4154     //
4155     threadStateResetTimeout(pGpu);
4156 
4157     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
4158         rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_EVENT_GSP_INIT_DONE));
4159 
4160     //
4161     // Now check if RPC really succeeded (NV_VGPU_MSG_RESULT_* are defined to
4162     // equivalent NV_STATUS codes in RM).
4163     //
4164     NV_ASSERT_OK_OR_RETURN(RPC_HDR->rpc_result);
4165 
4166     pGpu->gspRmInitialized = NV_TRUE;
4167     if (hypervisorIsVgxHyper() && pGpu->getProperty(pGpu, PDB_PROP_GPU_EXTENDED_GSP_RM_INITIALIZATION_TIMEOUT_FOR_VGX))
4168     {
4169         // Decrease timeout values for VGX driver
4170         timeoutInitializeGpuDefault(&pGpu->timeoutData, pGpu);
4171     }
4172 
4173     return NV_OK;
4174 }
4175 
4176 /*!
4177  * Execute a sequencer buffer coming from GSP
4178  *
4179  * @param[in]      pGpu             GPU object pointer
4180  * @param[in]      pKernelGsp       KernelGsp object pointer
4181  * @param[in]      pRunCpuSeqParams Sequence buffer RPC parameters
4182  *
4183  * @return NV_OK if the GSP sequencer buffer has been executed successfully
4184  *         NV_ERR_INVALID_STATE if the sequencer buffer is not allocated
4185  *         NV_ERR_INVALID_DATA if the sequencer buffer is malformed
4186  */
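//
// Illustrative encoding of the command buffer (a sketch based on the handling
// below): each entry is one opcode DWORD followed by
// GSP_SEQUENCER_PAYLOAD_SIZE_DWORDS(opCode) payload DWORDs. For example, a
// GSP_SEQ_BUF_OPCODE_REG_WRITE entry occupies 3 DWORDs: { opcode, addr, val }.
//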
4187 NV_STATUS
4188 kgspExecuteSequencerBuffer_IMPL
4189 (
4190     OBJGPU    *pGpu,
4191     KernelGsp *pKernelGsp,
4192     void      *pRunCpuSeqParams
4193 )
4194 {
4195     rpc_run_cpu_sequencer_v17_00 *pParams = (rpc_run_cpu_sequencer_v17_00 *)pRunCpuSeqParams;
4196     NvU32 *pCmd = pParams->commandBuffer;
4197     NvU32 buffer_end = pParams->cmdIndex;
4198     NvU32 current_cmd_index = 0;
4199     NV_STATUS nvStatus = NV_OK;
4200     NvU32 payloadSize;
4201 
4202     NV_ASSERT_OR_RETURN(IS_GSP_CLIENT(pGpu), NV_ERR_NOT_SUPPORTED);
4203     NV_ASSERT_OR_RETURN((pParams->bufferSizeDWord != 0), NV_ERR_INVALID_STATE);
4204     NV_ASSERT_OR_RETURN(buffer_end < pParams->bufferSizeDWord, NV_ERR_INVALID_DATA);
4205 
4206     while (current_cmd_index < buffer_end)
4207     {
4208         NvU32 opCode = pCmd[current_cmd_index++];
4209         payloadSize = GSP_SEQUENCER_PAYLOAD_SIZE_DWORDS(opCode);
4210 
4211         NV_ASSERT_OR_RETURN(current_cmd_index + payloadSize <= buffer_end, NV_ERR_INVALID_DATA);
4212 
4213         //
4214         // Handling of sequencer commands is split between those commands
4215         // that are common to all architectures (handled directly here) and
4216         // those commands that are arch-specific and are handled via the
4217         // kgspExecuteSequencerCommand_HAL() call below.
4218         //
4219         switch (opCode)
4220         {
4221             // 2 arguments
4222             case GSP_SEQ_BUF_OPCODE_REG_WRITE:
4223             {
4224                 GSP_SEQ_BUF_PAYLOAD_REG_WRITE regWrite;
4225                 portMemCopy(&regWrite, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE));
4226 
4227                 GPU_REG_WR32(pGpu, regWrite.addr, regWrite.val);
4228                 break;
4229             }
4230 
4231             // 3 arguments
4232             case GSP_SEQ_BUF_OPCODE_REG_MODIFY:
4233             {
4234                 GSP_SEQ_BUF_PAYLOAD_REG_MODIFY regModify;
4235                 NvU32 regVal;
4236 
4237                 portMemCopy(&regModify, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY));
4238 
4239                 regVal = GPU_REG_RD32(pGpu, regModify.addr);
4240                 regVal = regVal & ~regModify.mask;
4241                 regVal = regVal | regModify.val;
4242                 GPU_REG_WR32(pGpu, regModify.addr, regVal);
4243                 break;
4244             }
4245 
4246             // 5 arguments
4247             case GSP_SEQ_BUF_OPCODE_REG_POLL:
4248             {
4249                 GSP_SEQ_BUF_PAYLOAD_REG_POLL regPoll;
4250                 NvU32 regval;
4251                 RMTIMEOUT timeout;
4252 
4253                 portMemCopy(&regPoll, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL));
4254 
4255                 regval = GPU_REG_RD32(pGpu, regPoll.addr);
4256 
4257                 gpuSetTimeout(pGpu, regPoll.timeout, &timeout, 0);
4258                 while ((regval & regPoll.mask) != regPoll.val)
4259                 {
4260                     nvStatus = gpuCheckTimeout(pGpu, &timeout);
4261                     if (nvStatus == NV_ERR_TIMEOUT)
4262                     {
4263                         NV_PRINTF(LEVEL_ERROR, "Timeout waiting for register to settle, value = 0x%x, err_code = 0x%x\n",
4264                             regval, regPoll.error);
4265                         DBG_BREAKPOINT();
4266                         return nvStatus;
4267                     }
4268                     osSpinLoop();
4269                     regval = GPU_REG_RD32(pGpu, regPoll.addr);
4270                 }
4271                 break;
4272             }
4273 
4274             case GSP_SEQ_BUF_OPCODE_DELAY_US:
4275             {
4276                 GSP_SEQ_BUF_PAYLOAD_DELAY_US delayUs;
4277                 portMemCopy(&delayUs, sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US));
4278 
4279                 osDelayUs(delayUs.val);
4280                 break;
4281             }
4282 
4283             case GSP_SEQ_BUF_OPCODE_REG_STORE:
4284             {
4285                 GSP_SEQ_BUF_PAYLOAD_REG_STORE regStore;
4286                 portMemCopy(&regStore, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE));
4287 
4288                 NV_ASSERT_OR_RETURN(regStore.index < GSP_SEQ_BUF_REG_SAVE_SIZE, NV_ERR_INVALID_ARGUMENT);
4289 
4290                 pParams->regSaveArea[regStore.index] = GPU_REG_RD32(pGpu, regStore.addr);
4291                 break;
4292             }
4293 
4294             case GSP_SEQ_BUF_OPCODE_CORE_RESET:
4295             {
4296                 NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);
4297 
4298                 kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
4299                 kflcnDisableCtxReq_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
4300                 break;
4301             }
4302 
4303             case GSP_SEQ_BUF_OPCODE_CORE_START:
4304             {
4305                 NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);
4306 
4307                 kflcnStartCpu_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
4308                 break;
4309             }
4310 
4311             case GSP_SEQ_BUF_OPCODE_CORE_WAIT_FOR_HALT:
4312             {
4313                 NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);
4314 
4315                 NV_ASSERT_OK_OR_RETURN(kflcnWaitForHalt_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon), GPU_TIMEOUT_DEFAULT, 0));
4316                 break;
4317             }
4318 
4319             default:
4320                 //
4321                 // Route this command to the arch-specific handler.
4322                 //
4323                 NV_ASSERT_OK_OR_RETURN(kgspExecuteSequencerCommand_HAL(pGpu, pKernelGsp, opCode, &pCmd[current_cmd_index], payloadSize * sizeof (*pCmd)));
4324                 break;
4325         }
4326         current_cmd_index += payloadSize;
4327     }
4328 
4329     return NV_OK;
4330 }
4331 
4332 #if LIBOS_LOG_DECODE_ENABLE
4333 static void
4334 _kgspLogPollingCallback
4335 (
4336     OBJGPU *pGpu,
4337     void   *data
4338 )
4339 {
4340     //
4341     // Do not take any locks in kgspDumpGspLogs. As this callback only fires when kgspNvlogFlushCb
4342     // is not registered, there is no possibility of data race.
4343     //
4344     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
4345     kgspDumpGspLogsUnlocked(pKernelGsp, NV_FALSE);
4346 }
4347 
4348 NV_STATUS
4349 kgspStartLogPolling_IMPL
4350 (
4351     OBJGPU    *pGpu,
4352     KernelGsp *pKernelGsp
4353 )
4354 {
4355     NV_STATUS status = NV_OK;
4356 
4357     //
4358     // Only enable the 1 Hz poll if we can live-decode logs to dmesg. Otherwise,
4359     // logs are flushed on demand by nvidia-debugdump.
4360     //
4361     if (pKernelGsp->pLogElf != NULL)
4362     {
4363         status = osSchedule1HzCallback(pGpu,
4364                                        _kgspLogPollingCallback,
4365                                        NULL,
4366                                        NV_OS_1HZ_REPEAT);
4367     }
4368     return status;
4369 }
4370 
4371 static void
4372 _kgspStopLogPolling
4373 (
4374     OBJGPU    *pGpu,
4375     KernelGsp *pKernelGsp
4376 )
4377 {
4378     if (pKernelGsp->pLogElf != NULL)
4379     {
4380         osRemove1HzCallback(pGpu, _kgspLogPollingCallback, NULL);
4381     }
4382 }
4383 
4384 #else // LIBOS_LOG_DECODE_ENABLE
4385 
4386 NV_STATUS
4387 kgspStartLogPolling_IMPL
4388 (
4389     OBJGPU    *pGpu,
4390     KernelGsp *pKernelGsp
4391 )
4392 {
4393     return NV_OK;
4394 }
4395 
4396 static void
4397 _kgspStopLogPolling
4398 (
4399     OBJGPU    *pGpu,
4400     KernelGsp *pKernelGsp
4401 )
4402 {
4403     return;
4404 }
4405 #endif // LIBOS_LOG_DECODE_ENABLE
4406 
4407 /*!
4408  * Provides an opportunity to register some IntrService during intrStateInit.
4409  */
4410 void
4411 kgspRegisterIntrService_IMPL
4412 (
4413     OBJGPU *pGpu,
4414     KernelGsp *pKernelGsp,
4415     IntrServiceRecord pRecords[MC_ENGINE_IDX_MAX]
4416 )
4417 {
4418     NvU32 engineIdx = MC_ENGINE_IDX_GSP;
4419 
4420     if (!IS_GSP_CLIENT(pGpu))
4421         return;
4422 
4423     NV_ASSERT(pRecords[engineIdx].pInterruptService == NULL);
4424     pRecords[engineIdx].pInterruptService = staticCast(pKernelGsp, IntrService);
4425 }
4426 
4427 /*!
4428  * Service GSP interrupts.
4429  *
4430  * @returns Zero, or any implementation-chosen nonzero value. If the same nonzero value is returned enough
4431  *          times the interrupt is considered stuck.
4432  */
4433 NvU32
4434 kgspServiceInterrupt_IMPL
4435 (
4436     OBJGPU *pGpu,
4437     KernelGsp *pKernelGsp,
4438     IntrServiceServiceInterruptArguments *pParams
4439 )
4440 {
4441     NV_ASSERT_OR_RETURN(pParams != NULL, 0);
4442     NV_ASSERT_OR_RETURN(pParams->engineIdx == MC_ENGINE_IDX_GSP, 0);
4443 
4444     return kgspService_HAL(pGpu, pKernelGsp);
4445 }
4446 
4447 /*!
4448  * Calculates the GSP FW heap size based on the GPU's resources.
4449  */
4450 static NvU64
4451 _kgspCalculateFwHeapSize
4452 (
4453     OBJGPU *pGpu,
4454     KernelGsp *pKernelGsp,
4455     NvU32 maxGspFwHeapSizeMB
4456 )
4457 {
4458     // For VGPU, use the static pre-calculated size
4459     if (pGpu->bVgpuGspPluginOffloadEnabled)
4460         return GSP_FW_HEAP_SIZE_VGPU_DEFAULT;
4461 
4462     //
4463     // The baremetal heap calculation is a function of the architecture, FB
4464     // size, and a chunk for backing client allocations (pre-calibrated for the
4465     // architecture through rough profiling).
4466     //
4467     KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
4468     NvU64 fbSize = 0;
4469 
4470     NV_ASSERT_OK(kmemsysGetUsableFbSize_HAL(pGpu, pKernelMemorySystem, &fbSize));
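    // Round the usable FB size up to a whole number of GB; this scales the
    // per-GB term in the heap size sum below.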
4471     const NvU32 fbSizeGB = (NvU32)(NV_ALIGN_UP64(fbSize, 1 << 30) >> 30);
4472 
4473     //
4474     // Reclaimable binary data will end up padding the heap (in some cases,
4475     // significantly), but due to memory fragmentation we can't rely on it to
4476     // linearly reduce the amount needed in the primary heap, so it is not a
4477     // factor here. Instead, it's just extra margin to keep us from exhausting
4478     // the heap at runtime.
4479     //
4480     NvU64 heapSize = kgspGetFwHeapParamOsCarveoutSize_HAL(pGpu, pKernelGsp) +
4481                      pKernelGsp->fwHeapParamBaseSize +
4482                      NV_ALIGN_UP(GSP_FW_HEAP_PARAM_SIZE_PER_GB_FB * fbSizeGB, 1 << 20) +
4483                      NV_ALIGN_UP(GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE, 1 << 20);
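    //
    // In other words: per-arch OS carveout + per-arch base size
    //               + a per-GB-of-FB term (rounded up to 1MB)
    //               + a fixed client-allocation chunk (rounded up to 1MB).
    //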
4484 
4485     // Clamp to the minimum, even if the calculations say we can do with less
4486     const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp);
4487     heapSize = NV_MAX(heapSize, (NvU64)minGspFwHeapSizeMB << 20);
4488 
4489     // Clamp to the maximum heap size, if necessary
4490     heapSize = NV_MIN(heapSize, (NvU64)maxGspFwHeapSizeMB << 20);
4491 
4492     NV_PRINTF(LEVEL_INFO, "GSP FW heap %lluMB of %uGB\n",
4493               heapSize >> 20, fbSizeGB);
4494 
4495     return heapSize;
4496 }
4497 
4498 /*!
4499  * Returns the size in bytes of the GSP FW heap:
4500  *  - the registry override, if present
4501  *  - otherwise, the FW heap size calculated for this GPU, limited to stay
4502  *    within the pre-scrubbed area at the end of FB, if needed
4503  *
4504  * @param[in] posteriorFbSize - size in bytes of the memory reserved between the
4505  *                              end of the GSP FW heap and the end of FB, or 0
4506  *                              to disable limiting of the heap range to within
4507  *                              the pre-scrubbed area at the end of FB
4508  */
4509 NvU64
4510 kgspGetFwHeapSize_IMPL
4511 (
4512     OBJGPU *pGpu,
4513     KernelGsp *pKernelGsp,
4514     NvU64 posteriorFbSize
4515 )
4516 {
4517     NvU32 maxScrubbedHeapSizeMB = NV_U32_MAX;
4518     NvU32 heapSizeMB = 0;
4519 
4520     //
4521     // The pre-scrubbed region at the end of FB may limit the heap size, if no
4522     // scrubber ucode is supported to unlock the rest of memory prior to booting
4523     // GSP-RM.
4524     //
4525     if (!pKernelGsp->bScrubberUcodeSupported && (posteriorFbSize != 0))
4526     {
4527         const NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp);
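        // A return of NV_U64_MAX means there is no pre-scrub limit to apply, so
        // maxScrubbedHeapSizeMB is left at NV_U32_MAX (unlimited).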
4528         if (prescrubbedSize < NV_U64_MAX)
4529             maxScrubbedHeapSizeMB = (NvU32)((prescrubbedSize - posteriorFbSize) >> 20);
4530     }
4531 
4532     // Get the heap size override from the registry, if any
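    // A value equal to NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB_DEFAULT is treated as
    // "not overridden" and falls through to the calculated size.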
4533     if ((osReadRegistryDword(pGpu, NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB, &heapSizeMB) == NV_OK) &&
4534         (heapSizeMB != NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB_DEFAULT))
4535     {
4536         const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp);
4537         const NvU32 maxGspFwHeapSizeMB = NV_MIN(kgspGetMaxWprHeapSizeMB_HAL(pGpu, pKernelGsp),
4538                                                 maxScrubbedHeapSizeMB);
4539 
4540         NV_ASSERT(minGspFwHeapSizeMB < maxGspFwHeapSizeMB);
4541 
4542         if (heapSizeMB > maxGspFwHeapSizeMB)
4543         {
4544             NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to maximum (%uMB)\n",
4545                       maxGspFwHeapSizeMB);
4546             heapSizeMB = maxGspFwHeapSizeMB;
4547         }
4548         else if (heapSizeMB < minGspFwHeapSizeMB)
4549         {
4550             NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to minimum (%uMB)\n",
4551                       minGspFwHeapSizeMB);
4552             heapSizeMB = minGspFwHeapSizeMB;
4553         }
4554         else
4555         {
4556             NV_PRINTF(LEVEL_WARNING, "Firmware heap size overridden (%uMB)\n",
4557                       heapSizeMB);
4558         }
4559 
4560         return ((NvU64)heapSizeMB) << 20;
4561     }
4562 
4563     return _kgspCalculateFwHeapSize(pGpu, pKernelGsp, maxScrubbedHeapSizeMB);
4564 }
4565 
4566 NvU64 kgspGetWprEndMargin_IMPL(OBJGPU *pGpu, KernelGsp *pKernelGsp)
4567 {
4568     NvU64 wprEndMargin;
4569     NvU32 marginOverride = 0;
4570     GspFwWprMeta *pWprMeta = pKernelGsp->pWprMeta;
4571 
4572     (void)osReadRegistryDword(pGpu, NV_REG_STR_RM_GSP_WPR_END_MARGIN, &marginOverride);
4573 
4574     wprEndMargin = ((NvU64)DRF_VAL(_REG, _RM_GSP_WPR_END_MARGIN, _MB, marginOverride)) << 20;
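    // A zero margin (registry key absent or explicitly 0MB) selects the
    // calculated default below.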
4575     if (wprEndMargin == 0)
4576     {
4577         // Calculate the default margin size based on the WPR size
4579 
4580         //
4581         // This needs to be called after pWprMeta->sizeOfRadix3Elf has been initialized,
4582         // in order to estimate the default WPR size.
4583         //
4584         NV_ASSERT(pWprMeta->sizeOfRadix3Elf > 0);
4585 
4586         //
4587         // If the bounds are encoded in GspFwWprMeta from a prior attempt, use them.
4588         // Otherwise, estimate the WPR size from the sizes of the elements in the layout.
4589         //
4590         if (pWprMeta->gspFwWprEnd > pWprMeta->nonWprHeapOffset)
4591         {
4592             wprEndMargin = pWprMeta->gspFwWprEnd - pWprMeta->nonWprHeapOffset;
4593         }
4594         else
4595         {
4596             wprEndMargin += kgspGetFrtsSize_HAL(pGpu, pKernelGsp);
4597             wprEndMargin += pKernelGsp->gspRmBootUcodeSize;
4598             wprEndMargin += pWprMeta->sizeOfRadix3Elf;
4599             wprEndMargin += kgspGetFwHeapSize(pGpu, pKernelGsp, 0);
4600             wprEndMargin += kgspGetNonWprHeapSize(pGpu, pKernelGsp);
4601         }
4602 
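        // On a retry after a failed boot, scale the default margin by the number
        // of boot attempts so each successive attempt reserves more space.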
4603         if (pKernelGsp->bootAttempts > 0)
4604             wprEndMargin *= pKernelGsp->bootAttempts;
4605     }
4606 
4607     if (FLD_TEST_DRF(_REG, _RM_GSP_WPR_END_MARGIN, _APPLY, _ALWAYS, marginOverride) ||
4608         (pKernelGsp->bootAttempts > 0))
4609     {
4610         NV_PRINTF(LEVEL_WARNING, "Adding margin of 0x%llx bytes after the end of WPR2\n",
4611                   wprEndMargin);
4612         pWprMeta->flags |= GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT;
4613         return wprEndMargin;
4614     }
4615 
4616     // Normal boot path
4617     pWprMeta->flags &= ~GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT;
4618     return 0;
4619 }
4620