1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "resserv/rs_server.h"
25 
26 #include "gpu/gsp/kernel_gsp.h"
27 
28 #include "kernel/core/thread_state.h"
29 #include "kernel/core/locks.h"
30 #include "kernel/diagnostics/gpu_acct.h"
31 #include "kernel/diagnostics/journal.h"
32 #include "kernel/gpu/fifo/kernel_channel.h"
33 #include "kernel/gpu/gsp/gsp_trace_rats_macro.h"
34 #include "kernel/gpu/intr/engine_idx.h"
35 #include "kernel/gpu/mem_mgr/heap.h"
36 #include "kernel/gpu/mem_mgr/mem_mgr.h"
37 #include "kernel/gpu/mem_sys/kern_mem_sys.h"
38 #include "kernel/gpu/rc/kernel_rc.h"
39 #include "kernel/gpu/nvlink/kernel_nvlink.h"
40 #include "virtualization/hypervisor/hypervisor.h"
41 #include "virtualization/vgpuconfigapi.h"
42 #include "kernel/gpu/disp/kern_disp.h"
43 #include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
44 #include "kernel/gpu/device/device.h"
45 #include "gpu/external_device/external_device.h"
46 #include "kernel/platform/platform_request_handler.h"
47 #include "class/cl2080.h" // NV20_SUBDEVICE_0
48 #include "ctrl/ctrl2080/ctrl2080nvd.h"
49 #include "liblogdecode.h"
50 #include "libelf.h"
51 #include "nverror.h"
52 #include "nvrm_registry.h"
53 #include "nv-firmware.h"
54 #include "nv-firmware-chip-family-select.h"
55 #include "nvtypes.h"
56 #include "nvVer.h"
57 #include "objrpc.h"
58 #include "objtmr.h"
59 #include "os/os.h"
60 #include "rmgspseq.h"
61 #include "sweng/dispsw.h"
62 #include "kernel/gpu/timed_sema.h"
63 #include "vgpu/rpc.h"
64 #include "kernel/gpu/pmu/kern_pmu.h"
65 #include "gpu/perf/kern_perf.h"
66 #include "core/locks.h"
67 #include "kernel/gpu/intr/intr.h"
68 
69 #define RPC_STRUCTURES
70 #define RPC_GENERIC_UNION
71 #include "g_rpc-structures.h"
72 #undef RPC_STRUCTURES
73 #undef RPC_GENERIC_UNION
74 
75 #define RPC_MESSAGE_STRUCTURES
76 #define RPC_MESSAGE_GENERIC_UNION
77 #include "g_rpc-message-header.h"
78 #undef RPC_MESSAGE_STRUCTURES
79 #undef RPC_MESSAGE_GENERIC_UNION
80 
81 #include "gpu/gsp/message_queue_priv.h"
82 
83 #include "gpu/conf_compute/conf_compute.h"
84 
85 #define RPC_HDR  ((rpc_message_header_v*)(pRpc->message_buffer))
86 
87 struct MIG_CI_UPDATE_CALLBACK_PARAMS
88 {
89     NvU32 execPartCount;
90     NvU32 execPartId[NVC637_CTRL_MAX_EXEC_PARTITIONS];
91     NvU32 gfid;
92     NvBool bDelete;
93 };
94 
95 //
96 // RPC_PARAMS defines the rpc_params pointer and initializes it to the correct
97 // sub-structure.
98 //
99 // RPC_PARAMS intentionally assigns the the latest version structure to the
100 // versioned rpc_params pointer.  With the -Werror=incompatible-pointer-types
101 // compiler flag, this checks for mismatched structure versions at compile time.
102 //
103 // For example:
104 //   RPC_PARAMS(free, _v03_00);
105 // expands to
106 //   rpc_free_v03_00 *rpc_params = &RPC_HDR->rpc_message_data->free_v;
107 //
108 #define RPC_PARAMS(r, v) rpc_##r##v *rpc_params = &RPC_HDR->rpc_message_data->r##_v
109 
110 static NV_STATUS _kgspInitRpcInfrastructure(OBJGPU *, KernelGsp *);
111 static void _kgspFreeRpcInfrastructure(OBJGPU *, KernelGsp *);
112 
113 static NV_STATUS _kgspConstructRpcObject(OBJGPU *, KernelGsp *, MESSAGE_QUEUE_INFO *, OBJRPC **);
114 
115 static NV_STATUS _kgspRpcSendMessage(OBJGPU *, OBJRPC *);
116 static NV_STATUS _kgspRpcRecvPoll(OBJGPU *, OBJRPC *, NvU32);
117 static NV_STATUS _kgspRpcDrainEvents(OBJGPU *, KernelGsp *, NvU32, KernelGspRpcEventHandlerContext);
118 static void      _kgspRpcIncrementTimeoutCountAndRateLimitPrints(OBJGPU *, OBJRPC *);
119 
120 static NV_STATUS _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);
121 static void _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp);
122 
123 static NV_STATUS _kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);
124 static void _kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp);
125 
126 static void _kgspStopLogPolling(OBJGPU *pGpu, KernelGsp *pKernelGsp);
127 
128 static void _kgspFreeBootBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp);
129 
130 static NV_STATUS _kgspPrepareGspRmBinaryImage(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw);
131 
132 static NV_STATUS _kgspCreateSignatureMemdesc(OBJGPU *pGpu, KernelGsp *pKernelGsp,
133                                              GSP_FIRMWARE *pGspFw);
134 
135 static NV_STATUS _kgspFwContainerVerifyVersion(OBJGPU *pGpu, KernelGsp *pKernelGsp,
136                                                const void *pElfData, NvU64 elfDataSize,
137                                                const char *pNameInMsg);
138 
139 static NV_STATUS _kgspFwContainerGetSection(OBJGPU *pGpu, KernelGsp *pKernelGsp,
140                                             const void *pElfData, NvU64 elfDataSize,
141                                             const char *pSectionName,
142                                             const void **ppSectionData, NvU64 *pSectionSize);
143 
144 static NV_STATUS _kgspGetSectionNameForPrefix(OBJGPU *pGpu, KernelGsp *pKernelGsp,
145                                               char *pSectionNameBuf, NvLength sectionNameBufSize,
146                                               const char *pSectionPrefix);
147 
148 static void
149 _kgspGetActiveRpcDebugData
150 (
151     OBJRPC *pRpc,
152     NvU32 function,
153     NvU64 *data0,
154     NvU64 *data1
155 )
156 {
157     switch (function)
158     {
159         // Functions (CPU -> GSP)
160         case NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL:
161         {
162             RPC_PARAMS(gsp_rm_control, _v03_00);
163             *data0 = rpc_params->cmd;
164             *data1 = rpc_params->paramsSize;
165             break;
166         }
167         case NV_VGPU_MSG_FUNCTION_GSP_RM_ALLOC:
168         {
169             RPC_PARAMS(gsp_rm_alloc, _v03_00);
170             *data0 = rpc_params->hClass;
171             *data1 = rpc_params->paramsSize;
172             break;
173         }
174         case NV_VGPU_MSG_FUNCTION_FREE:
175         {
176             RPC_PARAMS(free, _v03_00);
177             *data0 = rpc_params->params.hObjectOld;
178             *data1 = rpc_params->params.hObjectParent;
179             break;
180         }
181 
182         // Events (CPU <- GSP)
183         case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
184         {
185             RPC_PARAMS(run_cpu_sequencer, _v17_00);
186             *data0 = rpc_params->cmdIndex;
187             *data1 = rpc_params->bufferSizeDWord;
188             break;
189         }
190         case NV_VGPU_MSG_EVENT_POST_EVENT:
191         {
192             RPC_PARAMS(post_event, _v17_00);
193             *data0 = rpc_params->notifyIndex;
194             *data1 = rpc_params->data;
195             break;
196         }
197         case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
198         {
199             RPC_PARAMS(rc_triggered, _v17_02);
200             *data0 = rpc_params->nv2080EngineType;
201             *data1 = rpc_params->exceptType;
202             break;
203         }
204         case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
205         {
206             RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);
207             *data0 = rpc_params->gfid;
208             *data1 = rpc_params->notifyIndex;
209             break;
210         }
211         case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
212         {
213             RPC_PARAMS(gsp_lockdown_notice, _v17_00);
214             *data0 = rpc_params->bLockdownEngaging;
215             *data1 = 0;
216             break;
217         }
218         case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
219         {
220             RPC_PARAMS(gsp_post_nocat_record, _v01_00);
221             const NV2080CtrlNocatJournalInsertRecord *pRecord =
222                 (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data;
223             *data0 = pRecord->recType;
224             *data1 = pRecord->errorCode;
225             break;
226         }
227 
228         default:
229         {
230             *data0 = 0;
231             *data1 = 0;
232             break;
233         }
234     }
235 }
236 
237 static NV_STATUS
238 _kgspRpcSanityCheck(OBJGPU *pGpu, KernelGsp *pKernelGsp, OBJRPC *pRpc)
239 {
240     if (pKernelGsp->bFatalError)
241     {
242         NV_PRINTF(LEVEL_INFO, "GSP crashed, skipping RPC\n");
243         //
244         // In case of a fatal GSP error, if there was an outstanding RPC at the
245         // time, we should have already printed the error for that, so this is a
246         // new RPC call...from now on don't bother printing RPC errors anymore,
247         // as it can be too noisy and overrun logs.
248         //
249         pRpc->bQuietPrints = NV_TRUE;
250         return NV_ERR_RESET_REQUIRED;
251     }
252     if (API_GPU_IN_RESET_SANITY_CHECK(pGpu))
253     {
254         NV_PRINTF(LEVEL_INFO, "GPU in reset, skipping RPC\n");
255         return NV_ERR_GPU_IN_FULLCHIP_RESET;
256     }
257     if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu) ||
258         pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
259     {
260         NV_PRINTF(LEVEL_INFO, "GPU lost, skipping RPC\n");
261         return NV_ERR_GPU_IS_LOST;
262     }
263     if (osIsGpuShutdown(pGpu))
264     {
265         NV_PRINTF(LEVEL_INFO, "GPU shutdown, skipping RPC\n");
266         return NV_ERR_GPU_IS_LOST;
267     }
268     if (!gpuIsGpuFullPowerForPmResume(pGpu))
269     {
270         NV_PRINTF(LEVEL_INFO, "GPU not full power, skipping RPC\n");
271         return NV_ERR_GPU_NOT_FULL_POWER;
272     }
273     if (!gpuCheckSysmemAccess(pGpu))
274     {
275         NV_PRINTF(LEVEL_INFO, "GPU has no sysmem access, skipping RPC\n");
276         return NV_ERR_INVALID_ACCESS_TYPE;
277     }
278     return NV_OK;
279 }
280 
281 static void
282 _kgspAddRpcHistoryEntry
283 (
284     OBJRPC *pRpc,
285     RpcHistoryEntry *pHistory,
286     NvU32 *pCurrent
287 )
288 {
289     NvU32 func = RPC_HDR->function;
290     NvU32 entry;
291 
292     entry = *pCurrent = (*pCurrent + 1) % RPC_HISTORY_DEPTH;
293 
294     portMemSet(&pHistory[entry], 0, sizeof(pHistory[0]));
295     pHistory[entry].function = func;
296     pHistory[entry].ts_start = osGetTimestamp();
297 
298     _kgspGetActiveRpcDebugData(pRpc, func,
299                                &pHistory[entry].data[0],
300                                &pHistory[entry].data[1]);
301 }
302 
303 static void
304 _kgspCompleteRpcHistoryEntry
305 (
306     RpcHistoryEntry *pHistory,
307     NvU32 current
308 )
309 {
310     NvU32 historyIndex;
311     NvU32 historyEntry;
312 
313     pHistory[current].ts_end = osGetTimestamp();
314 
315     //
316     // Complete any previous entries that aren't marked complete yet, using the same timestamp
317     // (we may not have explicitly waited for them)
318     //
319     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
320     {
321         historyEntry = (current + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
322         if (pHistory[historyEntry].ts_start != 0 &&
323             pHistory[historyEntry].ts_end   == 0)
324         {
325             pHistory[historyEntry].ts_end = pHistory[current].ts_end;
326         }
327     }
328 }
329 
330 /*!
331  * GSP client RM RPC send routine
332  */
333 static NV_STATUS
334 _kgspRpcSendMessage
335 (
336     OBJGPU *pGpu,
337     OBJRPC *pRpc
338 )
339 {
340     NV_STATUS nvStatus;
341     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
342     NvU32 gpuMaskUnused;
343 
344     NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
345 
346     NV_CHECK_OK_OR_RETURN(LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc));
347 
348     nvStatus = GspMsgQueueSendCommand(pRpc->pMessageQueueInfo, pGpu);
349     if (nvStatus != NV_OK)
350     {
351         if (nvStatus == NV_ERR_TIMEOUT ||
352             nvStatus == NV_ERR_BUSY_RETRY)
353         {
354             _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);
355         }
356         NV_PRINTF_COND(pRpc->bQuietPrints, LEVEL_INFO, LEVEL_ERROR,
357                        "GspMsgQueueSendCommand failed on GPU%d: 0x%x\n",
358                        gpuGetInstance(pGpu), nvStatus);
359         return nvStatus;
360     }
361 
362     kgspSetCmdQueueHead_HAL(pGpu, pKernelGsp, pRpc->pMessageQueueInfo->queueIdx, 0);
363 
364     _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcHistory, &pRpc->rpcHistoryCurrent);
365 
366     return NV_OK;
367 }
368 
369 static NV_STATUS
370 _kgspRpcRunCpuSequencer
371 (
372     OBJGPU *pGpu,
373     OBJRPC *pRpc
374 )
375 {
376     RPC_PARAMS(run_cpu_sequencer, _v17_00);
377     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
378 
379     return kgspExecuteSequencerBuffer(pGpu, pKernelGsp, rpc_params);
380 }
381 
382 static void
383 _kgspProcessEccNotifier
384 (
385     OBJGPU *pGpu,
386     void   *eventData
387 )
388 {
389     NV_STATUS          nvStatus     = NV_OK;
390     MemoryManager     *pMemoryMgr   = GPU_GET_MEMORY_MANAGER(pGpu);
391 
392     if (pMemoryMgr->bEnableDynamicPageOfflining)
393     {
394         Nv2080EccDbeNotification *pParams = (Nv2080EccDbeNotification*)eventData;
395         if ((nvStatus = heapStorePendingBlackList(pGpu, GPU_GET_HEAP(pGpu), pParams->physAddress ,
396                                                   pParams->physAddress)) != NV_OK)
397         {
398             if (nvStatus == NV_ERR_RESET_REQUIRED)
399             {
400                 NV_PRINTF(LEVEL_INFO, "Since we hit the DED on the reserved region, nothing to handle in this code path... \n");
401                 NV_PRINTF(LEVEL_INFO, "Relying on FBHUB interrupt to kill all the channels and force reset the GPU..\n");
402             }
403             else
404             {
405                 NV_PRINTF(LEVEL_INFO, "Dynamically blacklisting the DED page offset failed with, status: %x\n", nvStatus);
406                 DBG_BREAKPOINT();
407             }
408         }
409 
410     }
411 }
412 
413 /*!
414  * Receive an event notification from GSP-RM.
415  *
416  * When an event fires in GSP-RM, osNotifyEvent and osEventNotification check
417  * whether the event was originally allocated from client-RM.  If so, they post
418  * it to the event queue and take no further action.  Client RM picks up the
419  * event here and handles it.
420  */
421 static NV_STATUS
422 _kgspRpcPostEvent
423 (
424     OBJGPU *pGpu,
425     OBJRPC *pRpc
426 )
427 {
428     RPC_PARAMS(post_event, _v17_00);
429     PEVENTNOTIFICATION pNotifyList  = NULL;
430     PEVENTNOTIFICATION pNotifyEvent = NULL;
431     Event             *pEvent       = NULL;
432     NV_STATUS          nvStatus     = NV_OK;
433 
434     // Get the notification list that contains this event.
435     NV_ASSERT_OR_RETURN(CliGetEventInfo(rpc_params->hClient,
436         rpc_params->hEvent, &pEvent), NV_ERR_OBJECT_NOT_FOUND);
437 
438     if (pEvent->pNotifierShare != NULL)
439         pNotifyList = pEvent->pNotifierShare->pEventList;
440 
441     NV_ASSERT_OR_RETURN(pNotifyList != NULL, NV_ERR_INVALID_POINTER);
442 
443     switch (rpc_params->notifyIndex)
444     {
445         case NV2080_NOTIFIERS_ECC_DBE:
446             _kgspProcessEccNotifier(pGpu, rpc_params->eventData);
447             break;
448     }
449 
450     // Send the event.
451     if (rpc_params->bNotifyList)
452     {
453         // Send notification to all matching events on the list.
454         nvStatus = osEventNotificationWithInfo(pGpu, pNotifyList, rpc_params->notifyIndex,
455                        rpc_params->data, rpc_params->info16, rpc_params->eventData, rpc_params->eventDataSize);
456     }
457     else
458     {
459         // Send event to a specific hEvent.  Find hEvent in the notification list.
460         for (pNotifyEvent = pNotifyList; pNotifyEvent; pNotifyEvent = pNotifyEvent->Next)
461         {
462             if (pNotifyEvent->hEvent == rpc_params->hEvent)
463             {
464                 nvStatus = osNotifyEvent(pGpu, pNotifyEvent, 0,
465                                          rpc_params->data, rpc_params->status);
466                 break;
467             }
468         }
469         NV_ASSERT_OR_RETURN(pNotifyEvent != NULL, NV_ERR_OBJECT_NOT_FOUND);
470     }
471 
472     return nvStatus;
473 }
474 
475 /*!
476  * Receive RC notification from GSP-RM.
477  *
478  * RC error handling ("Channel Teardown sequence") is executed in GSP-RM.
479  * Client notifications, OS interaction etc happen in CPU-RM (Kernel RM).
480  */
481 static NV_STATUS
482 _kgspRpcRCTriggered
483 (
484     OBJGPU *pGpu,
485     OBJRPC *pRpc
486 )
487 {
488     RPC_PARAMS(rc_triggered, _v17_02);
489 
490     KernelRc      *pKernelRc = GPU_GET_KERNEL_RC(pGpu);
491     KernelChannel *pKernelChannel;
492     KernelFifo    *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
493     CHID_MGR      *pChidMgr;
494     NvU32          status = NV_OK;
495     RM_ENGINE_TYPE rmEngineType = gpuGetRmEngineType(rpc_params->nv2080EngineType);
496     NvBool         bIsCcEnabled = NV_FALSE;
497 
498     // check if there's a PCI-E error pending either in device status or in AER
499     krcCheckBusError_HAL(pGpu, pKernelRc);
500 
501     //
502     // If we have received a special msg from GSP then ack back immediately
503     // that we are done writing notifiers since we would have already processed the
504     // other RC msgs that trigger notifier writes before this one.
505     //
506     if (rpc_params->exceptType == ROBUST_CHANNEL_FAST_PATH_ERROR)
507     {
508         NV_RM_RPC_ECC_NOTIFIER_WRITE_ACK(pGpu, status);
509         NV_ASSERT_OK(status);
510         return status;
511     }
512 
513     status = kfifoGetChidMgrFromType(pGpu, pKernelFifo,
514                                      ENGINE_INFO_TYPE_RM_ENGINE_TYPE,
515                                      (NvU32)rmEngineType,
516                                      &pChidMgr);
517     if (status != NV_OK)
518         return status;
519 
520     pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
521                                                   pChidMgr,
522                                                   rpc_params->chid);
523     NV_CHECK_OR_RETURN(LEVEL_ERROR,
524                        pKernelChannel != NULL,
525                        NV_ERR_INVALID_CHANNEL);
526 
527     // Add the RcDiag records we received from GSP-RM to our system wide journal
528     {
529         OBJSYS   *pSys = SYS_GET_INSTANCE();
530         Journal  *pRcDB = SYS_GET_RCDB(pSys);
531         RmClient *pClient;
532 
533         NvU32 recordSize = rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport);
534         NvU32 rcDiagRecStart = pRcDB->RcErrRptNextIdx;
535         NvU32 rcDiagRecEnd;
536         NvU32 processId = 0;
537         NvU32 owner = RCDB_RCDIAG_DEFAULT_OWNER;
538 
539         pClient = dynamicCast(RES_GET_CLIENT(pKernelChannel), RmClient);
540         NV_ASSERT(pClient != NULL);
541         if (pClient != NULL)
542             processId = pClient->ProcID;
543 
544         for (NvU32 i = 0; i < rpc_params->rcJournalBufferSize / recordSize; i++)
545         {
546             RmRCCommonJournal_RECORD *pCommonRecord =
547                 (RmRCCommonJournal_RECORD *)((NvU8*)&rpc_params->rcJournalBuffer + i * recordSize);
548             RmRcDiag_RECORD *pRcDiagRecord =
549                 (RmRcDiag_RECORD *)&pCommonRecord[1];
550 
551 #if defined(DEBUG)
552             NV_PRINTF(LEVEL_INFO, "%d: GPUTag=0x%x CPUTag=0x%llx timestamp=0x%llx stateMask=0x%llx\n",
553                       i, pCommonRecord->GPUTag, pCommonRecord->CPUTag, pCommonRecord->timeStamp,
554                       pCommonRecord->stateMask);
555             NV_PRINTF(LEVEL_INFO, "   idx=%d timeStamp=0x%x type=0x%x flags=0x%x count=%d owner=0x%x processId=0x%x\n",
556                       pRcDiagRecord->idx, pRcDiagRecord->timeStamp, pRcDiagRecord->type, pRcDiagRecord->flags,
557                       pRcDiagRecord->count, pRcDiagRecord->owner, processId);
558             for (NvU32 j = 0; j < pRcDiagRecord->count; j++)
559             {
560                 NV_PRINTF(LEVEL_INFO, "     %d: offset=0x08%x tag=0x08%x value=0x08%x attribute=0x08%x\n",
561                           j, pRcDiagRecord->data[j].offset, pRcDiagRecord->data[j].tag,
562                           pRcDiagRecord->data[j].value, pRcDiagRecord->data[j].attribute);
563             }
564 #endif
565             if (rcdbAddRcDiagRecFromGsp(pGpu, pRcDB, pCommonRecord, pRcDiagRecord) == NULL)
566             {
567                 NV_PRINTF(LEVEL_WARNING, "Lost RC diagnostic record coming from GPU%d GSP: type=0x%x stateMask=0x%llx\n",
568                           gpuGetInstance(pGpu), pRcDiagRecord->type, pCommonRecord->stateMask);
569             }
570         }
571 
572         rcDiagRecEnd = pRcDB->RcErrRptNextIdx - 1;
573 
574         // Update records to have the correct PID associated with the channel
575         if (rcDiagRecStart != rcDiagRecEnd)
576         {
577             rcdbUpdateRcDiagRecContext(pRcDB,
578                                        rcDiagRecStart,
579                                        rcDiagRecEnd,
580                                        processId,
581                                        owner);
582         }
583     }
584 
585     bIsCcEnabled = gpuIsCCFeatureEnabled(pGpu);
586 
587     // With CC enabled, CPU-RM needs to write error notifiers
588     if (bIsCcEnabled)
589     {
590         NV_ASSERT_OK_OR_RETURN(krcErrorSetNotifier(pGpu, pKernelRc,
591                                                    pKernelChannel,
592                                                    rpc_params->exceptType,
593                                                    rmEngineType,
594                                                    rpc_params->scope));
595     }
596 
597     return krcErrorSendEventNotifications_HAL(pGpu, pKernelRc,
598         pKernelChannel,
599         rmEngineType,           // unused on kernel side
600         rpc_params->exceptType,
601         rpc_params->scope,
602         rpc_params->partitionAttributionId);
603 }
604 
605 /*!
606  * Receive Xid notification from GSP-RM
607  *
608  * Passes Xid errors that are triggered on GSP-RM to nvErrorLog for OS interactions
609  * (logging and OS notifications).
610  */
611 static void
612 _kgspRpcOsErrorLog
613 (
614     OBJGPU *pGpu,
615     OBJRPC *pRpc
616 )
617 {
618     RPC_PARAMS(os_error_log, _v17_00);
619 
620     KernelRc      *pKernelRc = GPU_GET_KERNEL_RC(pGpu);
621     KernelChannel *pKernelChannel = NULL;
622     KernelFifo    *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
623     CHID_MGR      *pChidMgr;
624 
625     if (rpc_params->chid != INVALID_CHID)
626     {
627         pChidMgr = kfifoGetChidMgr(pGpu, pKernelFifo, rpc_params->runlistId);
628         if (pChidMgr != NULL)
629         {
630             pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo,
631                                                           pChidMgr,
632                                                           rpc_params->chid);
633         }
634     }
635 
636     pKernelRc->pPreviousChannelInError = pKernelChannel;
637     nvErrorLog_va(pGpu, rpc_params->exceptType, "%s", rpc_params->errString);
638     pKernelRc->pPreviousChannelInError = NULL;
639 }
640 
641 /*!
642  * Receives RPC events containing periodic perfmon utilization samples, passing them
643  * to GPUACCT for processing.
644  */
645 static void
646 _kgspRpcGpuacctPerfmonUtilSamples
647 (
648     OBJGPU *pGpu,
649     OBJRPC *pRpc
650 )
651 {
652     OBJSYS *pSys = SYS_GET_INSTANCE();
653     GpuAccounting *pGpuAcct = SYS_GET_GPUACCT(pSys);
654     GPUACCT_GPU_INSTANCE_INFO *pGpuInstanceInfo = &pGpuAcct->gpuInstanceInfo[pGpu->gpuInstance];
655     RPC_PARAMS(gpuacct_perfmon_util_samples, _v1F_0E);
656 
657     NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS_v1F_0E *src = &rpc_params->params;
658     NV2080_CTRL_PERF_GET_GPUMON_PERFMON_UTIL_SAMPLES_V2_PARAMS        *dest;
659     NvU32 i;
660 
661     dest = pGpuInstanceInfo->pSamplesParams;
662     if (dest == NULL)
663     {
664         // This RPC event can be received even when the RM hasn't fully started.
665         // For instance, CPU RM can take longer than usual to initialize,
666         // but the GSP RM sampling timer (a 1 sec interval) is about to tick.
667         // In that case, pSamplesParams can not even be allocated by that time.
668         // Ignore this RPC event if pSamplesParams has not been allocated yet.
669         // See GPUSWSEC-1543 for more info.
670         return;
671     }
672 
673     portMemSet(dest, 0, sizeof(*dest));
674     dest->type    = src->type;
675     dest->bufSize = src->bufSize;
676     dest->count   = src->count;
677     dest->tracker = src->tracker;
678 
679     for (i = 0; i < NV2080_CTRL_PERF_GPUMON_SAMPLE_COUNT_PERFMON_UTIL_v1F_0E; i++)
680     {
681         dest->samples[i].base.timeStamp     = src->samples[i].timeStamp;
682 
683         dest->samples[i].fb.util            = src->samples[i].fb.util;
684         dest->samples[i].fb.procId          = src->samples[i].fb.procId;
685         dest->samples[i].fb.subProcessID    = src->samples[i].fb.subProcessID;
686 
687         dest->samples[i].gr.util            = src->samples[i].gr.util;
688         dest->samples[i].gr.procId          = src->samples[i].gr.procId;
689         dest->samples[i].gr.subProcessID    = src->samples[i].gr.subProcessID;
690 
691         dest->samples[i].nvenc.util         = src->samples[i].nvenc.util;
692         dest->samples[i].nvenc.procId       = src->samples[i].nvenc.procId;
693         dest->samples[i].nvenc.subProcessID = src->samples[i].nvenc.subProcessID;
694 
695         dest->samples[i].nvdec.util         = src->samples[i].nvdec.util;
696         dest->samples[i].nvdec.procId       = src->samples[i].nvdec.procId;
697         dest->samples[i].nvdec.subProcessID = src->samples[i].nvdec.subProcessID;
698     }
699 
700     gpuacctProcessGpuUtil(pGpuInstanceInfo, &dest->samples[0]);
701 }
702 
703 /*!
704  * Receives RPC events containing current GPU Boost synchronization limits
705  * that should be cached and considered in the GPU Boost algorithm and runs
706  * the algorithm.
707  */
708 static void
709 _kgspRpcPerfGpuBoostSyncLimitsCallback
710 (
711     OBJGPU *pGpu,
712     OBJRPC *pRpc
713 )
714 {
715     KernelPerf *pKernelPerf = GPU_GET_KERNEL_PERF(pGpu);
716 
717     RPC_PARAMS(perf_gpu_boost_sync_limits_callback, _v17_00);
718 
719     NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS_v17_00  *src = &rpc_params->params;
720     NV2080_CTRL_INTERNAL_PERF_GPU_BOOST_SYNC_SET_LIMITS_PARAMS          dest;
721     NvU32 i;
722 
723     dest.flags        = src->flags;
724     dest.bBridgeless  = src->bBridgeless;
725 
726     for (i = 0; i < NV2080_CTRL_INTERNAL_PERF_SYNC_GPU_BOOST_LIMITS_NUM; i++)
727     {
728         dest.currLimits[i] = src->currLimits[i];
729     }
730 
731     kperfDoSyncGpuBoostLimits(pGpu, pKernelPerf, &dest);
732 
733 }
734 
735 /*!
736  * Recieves RPC events containing latest change of bridgeless information
737  */
738 static void
739 _kgspRpcPerfBridgelessInfoUpdate
740 (
741     OBJGPU  *pGpu,
742     OBJRPC  *pRpc
743 )
744 {
745     RPC_PARAMS(perf_bridgeless_info_update, _v17_00);
746 
747     kPerfGpuBoostSyncBridgelessUpdateInfo(pGpu, rpc_params->bBridgeless);
748 }
749 
750 static void
751 _kgspRpcNvlinkFaultUpCallback
752 (
753     OBJGPU  *pGpu,
754     OBJRPC  *pRpc
755 )
756 {
757     RPC_PARAMS(nvlink_fault_up, _v17_00);
758 
759     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
760 
761     knvlinkHandleFaultUpInterrupt_HAL(pGpu, pKernelNvlink, rpc_params->linkId);
762 }
763 
764 static void
765 _kgspRpcNvlinkInbandReceivedData256Callback
766 (
767     OBJGPU  *pGpu,
768     OBJRPC  *pRpc
769 )
770 {
771     RPC_PARAMS(nvlink_inband_received_data_256, _v17_00);
772 
773     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_256_PARAMS_v17_00 *dest = &rpc_params->params;
774     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
775 
776     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
777 }
778 
779 static void
780 _kgspRpcNvlinkInbandReceivedData512Callback
781 (
782     OBJGPU  *pGpu,
783     OBJRPC  *pRpc
784 )
785 {
786     RPC_PARAMS(nvlink_inband_received_data_512, _v17_00);
787 
788     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_512_PARAMS_v17_00 *dest = &rpc_params->params;
789     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
790 
791     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
792 }
793 
794 static void
795 _kgspRpcNvlinkInbandReceivedData1024Callback
796 (
797     OBJGPU  *pGpu,
798     OBJRPC  *pRpc
799 )
800 {
801     RPC_PARAMS(nvlink_inband_received_data_1024, _v17_00);
802 
803     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_1024_PARAMS_v17_00 *dest = &rpc_params->params;
804     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
805 
806     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
807 }
808 
809 static void
810 _kgspRpcNvlinkInbandReceivedData2048Callback
811 (
812     OBJGPU  *pGpu,
813     OBJRPC  *pRpc
814 )
815 {
816     RPC_PARAMS(nvlink_inband_received_data_2048, _v17_00);
817 
818     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_2048_PARAMS_v17_00 *dest = &rpc_params->params;
819     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
820 
821     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
822 }
823 
824 static void
825 _kgspRpcNvlinkInbandReceivedData4096Callback
826 (
827     OBJGPU  *pGpu,
828     OBJRPC  *pRpc
829 )
830 {
831     RPC_PARAMS(nvlink_inband_received_data_4096, _v17_00);
832 
833     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_4096_PARAMS_v17_00 *dest = &rpc_params->params;
834     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
835 
836     NV_ASSERT(NV_OK == knvlinkInbandMsgCallbackDispatcher(pGpu, pKernelNvlink, dest->dataSize, dest->data));
837 }
838 
839 /*!
840  * CPU-RM: Receive GPU Degraded status from GSP
841  */
842 static void
843 _kgspRpcEventIsGpuDegradedCallback
844 (
845     OBJGPU  *pGpu,
846     OBJRPC  *pRpc
847 )
848 {
849     RPC_PARAMS(nvlink_is_gpu_degraded, _v17_00);
850     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
851     NV2080_CTRL_NVLINK_IS_GPU_DEGRADED_PARAMS_v17_00 *dest = &rpc_params->params;
852 
853     if(dest->bIsGpuDegraded)
854     {
855         knvlinkSetDegradedMode(pGpu, pKernelNvlink, dest->linkId);
856     }
857 }
858 
859 static void
860 _kgspRpcNvlinkFatalErrorRecoveryCallback
861 (
862     OBJGPU  *pGpu,
863     OBJRPC  *pRpc
864 )
865 {
866     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
867     NV_ASSERT_OK(knvlinkFatalErrorRecovery(pGpu, pKernelNvlink));
868 }
869 
870 /*!
871  * Receive MMU fault queue notification from GSP-RM.
872  *
873  * Non-replayable fault handling is split between GSP-RM and the UVM driver.
874  * GSP-RM copies designated faults to the UVM driver's shadow buffer,
875  * and sends a notification.  CPU-RM, in turn, needs to notify the UVM
876  * driver (schedule the UVM ISR to be run).
877  */
878 static NV_STATUS
879 _kgspRpcMMUFaultQueued(
880     OBJGPU *pGpu,
881     OBJRPC *pRpc
882 )
883 {
884     osQueueMMUFaultHandler(pGpu);
885 
886     return NV_OK;
887 }
888 
889 static NV_STATUS
890 _kgspRpcSimRead
891 (
892     OBJGPU *pGpu,
893     OBJRPC *pRpc
894 )
895 {
896     RPC_PARAMS(sim_read, _v1E_01);
897     if (IS_SIMULATION(pGpu))
898     {
899         const NvU32 count = rpc_params->index + (rpc_params->count / sizeof(NvU32));
900         NvU32 i;
901 
902         KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
903 
904         NV_ASSERT_OR_RETURN(rpc_params->count <= sizeof(pKernelGsp->pSimAccessBuf->data), NV_ERR_BUFFER_TOO_SMALL);
905 
906         for (i = rpc_params->index; i < count; i++)
907         {
908             NvU32 data;
909             gpuSimEscapeRead(pGpu, rpc_params->path, i, 4, &data);
910             pKernelGsp->pSimAccessBuf->data[i] = data;
911         }
912 
913         pKernelGsp->pSimAccessBuf->seq++;
914         return NV_OK;
915     }
916 
917     return NV_ERR_NOT_SUPPORTED;
918 }
919 
920 static NV_STATUS
921 _kgspRpcSimWrite
922 (
923     OBJGPU *pGpu,
924     OBJRPC *pRpc
925 )
926 {
927     RPC_PARAMS(sim_write, _v1E_01);
928     if (IS_SIMULATION(pGpu))
929     {
930         KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
931 
932         gpuSimEscapeWrite(pGpu, rpc_params->path, rpc_params->index, rpc_params->count, rpc_params->data);
933         pKernelGsp->pSimAccessBuf->seq++;
934         return NV_OK;
935     }
936 
937     return NV_ERR_NOT_SUPPORTED;
938 }
939 
940 static NV_STATUS
941 _kgspRpcSemaphoreScheduleCallback(
942     OBJGPU *pGpu,
943     OBJRPC *pRpc
944 )
945 {
946     RPC_PARAMS(semaphore_schedule_callback, _v17_00);
947     NV_STATUS status;
948     RsClient *pClient;
949     Device *pDevice;
950 
951     status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
952     if (status != NV_OK)
953         return status;
954 
955     status = deviceGetByHandle(pClient, rpc_params->hEvent, &pDevice);
956     if (status != NV_OK)
957         return status;
958 
959     return dispswReleaseSemaphoreAndNotifierFill(pGpu,
960                                                  rpc_params->GPUVA,
961                                                  rpc_params->hVASpace,
962                                                  rpc_params->ReleaseValue,
963                                                  rpc_params->Flags,
964                                                  rpc_params->completionStatus,
965                                                  pDevice);
966 }
967 
968 static NV_STATUS
969 _kgspRpcTimedSemaphoreRelease(
970     OBJGPU *pGpu,
971     OBJRPC *pRpc
972 )
973 {
974     RPC_PARAMS(timed_semaphore_release, _v01_00);
975     NV_STATUS status;
976     RsClient *pClient;
977     Device *pDevice;
978 
979     status = serverGetClientUnderLock(&g_resServ, rpc_params->hClient, &pClient);
980     if (status != NV_OK)
981         return status;
982 
983     status = deviceGetByHandle(pClient, rpc_params->hDevice, &pDevice);
984     if (status != NV_OK)
985         return status;
986 
987     return tsemaRelease_HAL(pGpu,
988                             rpc_params->semaphoreVA,
989                             rpc_params->notifierVA,
990                             rpc_params->hVASpace,
991                             rpc_params->releaseValue,
992                             rpc_params->completionStatus,
993                             pDevice);
994 }
995 
996 
997 static NV_STATUS
998 _kgspRpcUcodeLibosPrint
999 (
1000     OBJGPU *pGpu,
1001     OBJRPC *pRpc
1002 )
1003 {
1004     RPC_PARAMS(ucode_libos_print, _v1E_08);
1005 
1006     // Check ucodes registered with the libos print mechanism
1007     switch (rpc_params->ucodeEngDesc)
1008     {
1009         case ENG_PMU:
1010         {
1011             KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);
1012             NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelPmu != NULL, NV_ERR_OBJECT_NOT_FOUND);
1013 
1014             kpmuLogBuf(pGpu, pKernelPmu,
1015                        rpc_params->libosPrintBuf, rpc_params->libosPrintBufSize);
1016 
1017             return NV_OK;
1018         }
1019         default:
1020             NV_ASSERT_FAILED("Attempting to use libos prints with an unsupported ucode!\n");
1021             return NV_ERR_NOT_SUPPORTED;
1022     }
1023 }
1024 
1025 static NV_STATUS
1026 _kgspRpcVgpuGspPluginTriggered
1027 (
1028     OBJGPU *pGpu,
1029     OBJRPC *pRpc
1030 )
1031 {
1032     RPC_PARAMS(vgpu_gsp_plugin_triggered, _v17_00);
1033 
1034     if (!IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
1035         return NV_ERR_NOT_SUPPORTED;
1036 
1037     gpuGspPluginTriggeredEvent(pGpu, rpc_params->gfid, rpc_params->notifyIndex);
1038     return NV_OK;
1039 }
1040 
1041 static NV_STATUS
1042 _kgspRpcGspVgpuConfig
1043 (
1044     OBJGPU *pGpu,
1045     OBJRPC *pRpc
1046 )
1047 {
1048     RPC_PARAMS(vgpu_config_event, _v17_00);
1049 
1050     NV_ASSERT_OR_RETURN(rpc_params->notifyIndex < NVA081_NOTIFIERS_MAXCOUNT,
1051                         NV_ERR_INVALID_ARGUMENT);
1052 
1053     CliNotifyVgpuConfigEvent(pGpu, rpc_params->notifyIndex);
1054 
1055     return NV_OK;
1056 }
1057 
1058 static NV_STATUS
1059 _kgspRpcGspExtdevIntrService
1060 (
1061     OBJGPU *pGpu,
1062     OBJRPC *pRpc
1063 )
1064 {
1065     RPC_PARAMS(extdev_intr_service, _v17_00);
1066 
1067     extdevGsyncService(pGpu, rpc_params->lossRegStatus, rpc_params->gainRegStatus, rpc_params->miscRegStatus, rpc_params->rmStatus);
1068 
1069     return NV_OK;
1070 }
1071 
1072 static void
1073 _kgspRpcMigCiConfigUpdateCallback
1074 (
1075     NvU32 gpuInstance,
1076     void *pArgs
1077 )
1078 {
1079     OBJGPU *pGpu = gpumgrGetGpu(gpuInstance);
1080     KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
1081     struct MIG_CI_UPDATE_CALLBACK_PARAMS * pParams = (struct MIG_CI_UPDATE_CALLBACK_PARAMS *)pArgs;
1082 
1083     kmigmgrUpdateCiConfigForVgpu(pGpu, pKernelMIGManager,
1084                                  pParams->execPartCount, pParams->execPartId,
1085                                  pParams->gfid, pParams->bDelete);
1086 
1087     return;
1088 }
1089 
1090 static NV_STATUS
1091 _kgspRpcMigCiConfigUpdate
1092 (
1093     OBJGPU *pGpu,
1094     OBJRPC *pRpc
1095 )
1096 {
1097     NV_STATUS status;
1098     struct MIG_CI_UPDATE_CALLBACK_PARAMS *pParams;
1099 
1100     RPC_PARAMS(vgpu_gsp_mig_ci_config, _v21_03);
1101 
1102     NV_ASSERT_OR_RETURN(rpc_params->execPartCount <= NVC637_CTRL_MAX_EXEC_PARTITIONS,
1103                         NV_ERR_INVALID_ARGUMENT);
1104 
1105     pParams = portMemAllocNonPaged(sizeof(struct MIG_CI_UPDATE_CALLBACK_PARAMS));
1106     if (pParams == NULL)
1107     {
1108         return NV_ERR_NO_MEMORY;
1109     }
1110 
1111     pParams->execPartCount = rpc_params->execPartCount;
1112     portMemCopy(pParams->execPartId, (sizeof(NvU32) * rpc_params->execPartCount),
1113                 rpc_params->execPartId, (sizeof(NvU32) * rpc_params->execPartCount));
1114     pParams->gfid = rpc_params->gfid;
1115     pParams->bDelete = rpc_params->bDelete;
1116     status = osQueueWorkItemWithFlags(pGpu,
1117                                       _kgspRpcMigCiConfigUpdateCallback,
1118                                       (void *)pParams,
1119                                       OS_QUEUE_WORKITEM_FLAGS_LOCK_API_RW | OS_QUEUE_WORKITEM_FLAGS_LOCK_GPUS_RW);
1120     if (status != NV_OK)
1121     {
1122         portMemFree(pParams);
1123     }
1124 
1125     return status;
1126 }
1127 
1128 static void
1129 _kgspRpcGspUpdateTrace
1130 (
1131     OBJGPU *pGpu,
1132     OBJRPC *pRpc
1133 )
1134 {
1135 #if KERNEL_GSP_TRACING_RATS_ENABLED
1136     RPC_PARAMS(update_gsp_trace, _v01_00);
1137     NvU32 i;
1138     NV_RATS_GSP_TRACE_RECORD *GspTraceRecords = (NV_RATS_GSP_TRACE_RECORD*) (&rpc_params->data);
1139     for (i = 0; i < rpc_params->records; i++)
1140     {
1141         gspTraceEventBufferLogRecord(pGpu, &GspTraceRecords[i]);
1142     }
1143 #endif
1144 }
1145 
1146 static void
1147 _kgspRpcGspPostNocatRecord
1148 (
1149     OBJGPU *pGpu,
1150     OBJRPC *pRpc
1151 )
1152 {
1153     OBJSYS                  *pSys = SYS_GET_INSTANCE();
1154     Journal                 *pRcdb = SYS_GET_RCDB(pSys);
1155     NOCAT_JOURNAL_PARAMS    newEntry;
1156     const NV2080CtrlNocatJournalInsertRecord *pRecord = NULL;
1157     RPC_PARAMS(gsp_post_nocat_record, _v01_00);
1158 
1159     // make a pointer to the record.
1160     pRecord = (const NV2080CtrlNocatJournalInsertRecord *)&rpc_params->data;
1161 
1162     portMemSet(&newEntry, 0, sizeof(newEntry));
1163     newEntry.timestamp          = pRecord->timestamp;
1164     newEntry.recType            = pRecord->recType;
1165     newEntry.bugcheck           = pRecord->bugcheck;
1166     newEntry.pSource            = pRecord->source;
1167     newEntry.subsystem          = pRecord->subsystem;
1168     newEntry.errorCode          = pRecord->errorCode;
1169     newEntry.diagBufferLen      = pRecord->diagBufferLen;
1170     newEntry.pDiagBuffer        = pRecord->diagBuffer;
1171     newEntry.pFaultingEngine    = pRecord->faultingEngine;
1172     newEntry.tdrReason          = pRecord->tdrReason;
1173 
1174     (void)rcdbNocatInsertNocatError(pGpu, &newEntry);
1175     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_RPC_INSERT_RECORDS_IDX]++;
1176 }
1177 
1178 static NV_STATUS
1179 _kgspRpcRgLineIntr
1180 (
1181     OBJGPU *pGpu,
1182     OBJRPC *pRpc
1183 )
1184 {
1185     RPC_PARAMS(rg_line_intr, _v17_00);
1186 
1187     KernelDisplay *pKernelDisplay = GPU_GET_KERNEL_DISPLAY(pGpu);
1188     NV_CHECK_OR_RETURN(LEVEL_ERROR, pKernelDisplay != NULL, NV_ERR_OBJECT_NOT_FOUND);
1189 
1190     kdispInvokeRgLineCallback(pKernelDisplay, rpc_params->head, rpc_params->rgIntr, NV_FALSE);
1191 
1192     return NV_OK;
1193 }
1194 
1195 static NV_STATUS
1196 _kgspRpcEventPlatformRequestHandlerStateSyncCallback
1197 (
1198     OBJGPU* pGpu,
1199     OBJRPC* pRpc
1200 )
1201 {
1202     OBJSYS *pSys = SYS_GET_INSTANCE();
1203     PlatformRequestHandler* pPlatformRequestHandler
1204                  = SYS_GET_PFM_REQ_HNDLR(pSys);
1205 
1206     RPC_PARAMS(pfm_req_hndlr_state_sync_callback, _v21_04);
1207 
1208     NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS_v21_04  *src = &rpc_params->params;
1209     NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_PARAMS          dst = { 0 };
1210 
1211     dst.flags         = src->flags;
1212     dst.syncData.type = src->syncData.type;
1213 
1214     // Copy in the rpc data
1215     switch (src->syncData.type)
1216     {
1217         case NV2080_CTRL_INTERNAL_PFM_REQ_HNDLR_STATE_SYNC_DATA_TYPE_SMBPBI:
1218         {
1219             dst.syncData.data.smbpbi.sensorId =
1220                 src->syncData.data.smbpbi.sensorId;
1221             dst.syncData.data.smbpbi.limit =
1222                 src->syncData.data.smbpbi.limit;
1223             break;
1224         }
1225         default:
1226         {
1227             // Nothing for now
1228             break;
1229         }
1230     }
1231 
1232     pfmreqhndlrStateSync(pPlatformRequestHandler, pGpu, &dst);
1233     return NV_OK;
1234 }
1235 
1236 static void
1237 _kgspRpcGspLockdownNotice
1238 (
1239     OBJGPU *pGpu,
1240     OBJRPC *pRpc
1241 )
1242 {
1243     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
1244     RPC_PARAMS(gsp_lockdown_notice, _v17_00);
1245 
1246     //
1247     // While the GSP is in lockdown, we cannot access some of its registers,
1248     // including interrupt status and control. We shouldn't receive any more
1249     // SWGEN0 interrupts while the core is in lockdown.
1250     //
1251     pKernelGsp->bInLockdown = rpc_params->bLockdownEngaging;
1252 
1253     NV_PRINTF(LEVEL_INFO, "GSP lockdown %s\n",
1254               pKernelGsp->bInLockdown ? "engaged" : "disengaged");
1255 }
1256 
1257 static
1258 const char *_getRpcName
1259 (
1260     NvU32 id
1261 )
1262 {
1263     static const char *rpcName[] =
1264         {
1265             #define X(UNIT, a, VAL) #a,
1266             #define E(a, VAL) #a,
1267             #undef _RPC_GLOBAL_ENUMS_H_
1268             #include "vgpu/rpc_global_enums.h"
1269             #undef X
1270             #undef E
1271         };
1272 
1273     if (id < NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS)
1274     {
1275         return rpcName[id];
1276     }
1277     else if ((id > NV_VGPU_MSG_EVENT_FIRST_EVENT) && (id < NV_VGPU_MSG_EVENT_NUM_EVENTS))
1278     {
1279         NvU32 index = id - (NV_VGPU_MSG_EVENT_FIRST_EVENT - NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS) + 1;
1280         return rpcName[index];
1281     }
1282 
1283     return "Unknown";
1284 }
1285 
1286 /*!
1287  * GSP client process RPC events
1288  */
1289 static void
1290 _kgspProcessRpcEvent
1291 (
1292     OBJGPU *pGpu,
1293     OBJRPC *pRpc,
1294     KernelGspRpcEventHandlerContext rpcHandlerContext
1295 )
1296 {
1297     rpc_message_header_v *pMsgHdr = RPC_HDR;
1298     NV_STATUS nvStatus = NV_OK;
1299     NvU32 event = pMsgHdr->function;
1300 
1301     NV_PRINTF(LEVEL_INFO, "received event from GPU%d: 0x%x (%s) status: 0x%x size: %d\n",
1302               gpuGetInstance(pGpu), event, _getRpcName(event), pMsgHdr->rpc_result, pMsgHdr->length);
1303 
1304     _kgspAddRpcHistoryEntry(pRpc, pRpc->rpcEventHistory, &pRpc->rpcEventHistoryCurrent);
1305 
1306     /*
1307      * Shortlist of RPC's that have been manually screened to be safe without the API lock
1308      * that are called during GSP bootup
1309      */
1310     if ((rpcHandlerContext == KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP) &&
1311         (!rmapiLockIsOwner()))
1312     {
1313         switch(pMsgHdr->function)
1314         {
1315             case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
1316             case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
1317             case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
1318             case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
1319             case NV_VGPU_MSG_EVENT_GSP_INIT_DONE:
1320             case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
1321                 break;
1322             default:
1323                 NV_PRINTF(LEVEL_ERROR, "Attempted to process RPC event from GPU%d: 0x%x (%s) during bootup without API lock\n",
1324                         gpuGetInstance(pGpu), event, _getRpcName(event));
1325                 NV_ASSERT(0);
1326                 goto done;
1327         }
1328     }
1329 
1330     switch(event)
1331     {
1332         case NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER:
1333             nvStatus = _kgspRpcRunCpuSequencer(pGpu, pRpc);
1334             break;
1335 
1336         case NV_VGPU_MSG_EVENT_POST_EVENT:
1337             nvStatus = _kgspRpcPostEvent(pGpu, pRpc);
1338             break;
1339 
1340         case NV_VGPU_MSG_EVENT_RC_TRIGGERED:
1341             nvStatus = _kgspRpcRCTriggered(pGpu, pRpc);
1342             break;
1343 
1344         case NV_VGPU_MSG_EVENT_MMU_FAULT_QUEUED:
1345             nvStatus = _kgspRpcMMUFaultQueued(pGpu, pRpc);
1346             break;
1347 
1348         case NV_VGPU_MSG_EVENT_SIM_READ:
1349             nvStatus = _kgspRpcSimRead(pGpu, pRpc);
1350             break;
1351 
1352         case NV_VGPU_MSG_EVENT_SIM_WRITE:
1353             nvStatus = _kgspRpcSimWrite(pGpu, pRpc);
1354             break;
1355 
1356         case NV_VGPU_MSG_EVENT_OS_ERROR_LOG:
1357             _kgspRpcOsErrorLog(pGpu, pRpc);
1358             break;
1359 
1360         case NV_VGPU_MSG_EVENT_GPUACCT_PERFMON_UTIL_SAMPLES:
1361             _kgspRpcGpuacctPerfmonUtilSamples(pGpu, pRpc);
1362             break;
1363 
1364         case NV_VGPU_MSG_EVENT_PERF_GPU_BOOST_SYNC_LIMITS_CALLBACK:
1365             _kgspRpcPerfGpuBoostSyncLimitsCallback(pGpu, pRpc);
1366             break;
1367 
1368         case NV_VGPU_MSG_EVENT_PERF_BRIDGELESS_INFO_UPDATE:
1369             _kgspRpcPerfBridgelessInfoUpdate(pGpu, pRpc);
1370             break;
1371 
1372         case NV_VGPU_MSG_EVENT_SEMAPHORE_SCHEDULE_CALLBACK:
1373             _kgspRpcSemaphoreScheduleCallback(pGpu, pRpc);
1374             break;
1375 
1376         case NV_VGPU_MSG_EVENT_TIMED_SEMAPHORE_RELEASE:
1377             _kgspRpcTimedSemaphoreRelease(pGpu, pRpc);
1378             break;
1379 
1380         case NV_VGPU_MSG_EVENT_NVLINK_FAULT_UP:
1381             _kgspRpcNvlinkFaultUpCallback(pGpu, pRpc);
1382              break;
1383 
1384         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_256:
1385             _kgspRpcNvlinkInbandReceivedData256Callback(pGpu, pRpc);
1386             break;
1387 
1388         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_512:
1389             _kgspRpcNvlinkInbandReceivedData512Callback(pGpu, pRpc);
1390             break;
1391 
1392         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_1024:
1393             _kgspRpcNvlinkInbandReceivedData1024Callback(pGpu, pRpc);
1394             break;
1395 
1396         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_2048:
1397             _kgspRpcNvlinkInbandReceivedData2048Callback(pGpu, pRpc);
1398             break;
1399 
1400         case NV_VGPU_MSG_EVENT_NVLINK_INBAND_RECEIVED_DATA_4096:
1401             _kgspRpcNvlinkInbandReceivedData4096Callback(pGpu, pRpc);
1402             break;
1403 
1404         case NV_VGPU_MSG_EVENT_NVLINK_FATAL_ERROR_RECOVERY:
1405             _kgspRpcNvlinkFatalErrorRecoveryCallback(pGpu, pRpc);
1406             break;
1407 
1408         case NV_VGPU_MSG_EVENT_NVLINK_IS_GPU_DEGRADED :
1409             _kgspRpcEventIsGpuDegradedCallback(pGpu, pRpc);
1410             break;
1411 
1412         case NV_VGPU_MSG_EVENT_RG_LINE_INTR:
1413             _kgspRpcRgLineIntr(pGpu, pRpc);
1414             break;
1415 
1416         case NV_VGPU_MSG_EVENT_UCODE_LIBOS_PRINT:
1417             nvStatus = _kgspRpcUcodeLibosPrint(pGpu, pRpc);
1418             break;
1419 
1420         case NV_VGPU_MSG_EVENT_VGPU_GSP_PLUGIN_TRIGGERED:
1421             nvStatus = _kgspRpcVgpuGspPluginTriggered(pGpu, pRpc);
1422             break;
1423 
1424         case NV_VGPU_MSG_EVENT_VGPU_CONFIG:
1425             nvStatus = _kgspRpcGspVgpuConfig(pGpu, pRpc);
1426             break;
1427 
1428         case NV_VGPU_MSG_EVENT_EXTDEV_INTR_SERVICE:
1429             nvStatus = _kgspRpcGspExtdevIntrService(pGpu, pRpc);
1430             break;
1431 
1432         case NV_VGPU_MSG_EVENT_PFM_REQ_HNDLR_STATE_SYNC_CALLBACK:
1433             nvStatus = _kgspRpcEventPlatformRequestHandlerStateSyncCallback(pGpu, pRpc);
1434             break;
1435 
1436         case NV_VGPU_MSG_EVENT_MIG_CI_CONFIG_UPDATE:
1437             nvStatus = _kgspRpcMigCiConfigUpdate(pGpu, pRpc);
1438             break;
1439 
1440         case NV_VGPU_MSG_EVENT_GSP_LOCKDOWN_NOTICE:
1441             _kgspRpcGspLockdownNotice(pGpu, pRpc);
1442             break;
1443 
1444         case NV_VGPU_MSG_EVENT_UPDATE_GSP_TRACE:
1445             _kgspRpcGspUpdateTrace(pGpu, pRpc);
1446             break;
1447 
1448         case NV_VGPU_MSG_EVENT_GSP_POST_NOCAT_RECORD:
1449             _kgspRpcGspPostNocatRecord(pGpu, pRpc);
1450             break;
1451 
1452         case NV_VGPU_MSG_EVENT_GSP_INIT_DONE:   // Handled by _kgspRpcRecvPoll.
1453         default:
1454             //
1455             // Log, but otherwise ignore unexpected events.
1456             //
1457             // We will get here if the previous RPC timed out.  The response
1458             // eventually comes in as an unexpected event.  The error handling
1459             // for the timeout should have already happened.
1460             //
1461             NV_PRINTF(LEVEL_ERROR, "Unexpected RPC event from GPU%d: 0x%x (%s)\n",
1462                       gpuGetInstance(pGpu), event, _getRpcName(event));
1463             break;
1464     }
1465 
1466     if (nvStatus != NV_OK)
1467     {
1468         //
1469         // Failing to properly handle a specific event does not mean we should stop
1470         // processing events/RPCs, so print the error and soldier on.
1471         //
1472         NV_PRINTF(LEVEL_ERROR,
1473                   "Failed to process received event 0x%x (%s) from GPU%d: status=0x%x\n",
1474                   event, _getRpcName(event), gpuGetInstance(pGpu), nvStatus);
1475     }
1476 
1477 done:
1478     _kgspCompleteRpcHistoryEntry(pRpc->rpcEventHistory, pRpc->rpcEventHistoryCurrent);
1479 }
1480 
1481 /*!
1482  * Handle a single RPC event from GSP unless the event is [an RPC return for] expectedFunc,
1483  * or there are no events available in the buffer.
1484  *
1485  * @return
1486  *   NV_OK                              if the event is successfully handled.
1487  *   NV_WARN_NOTHING_TO_DO              if there are no events available.
1488  *   NV_WARN_MORE_PROCESSING_REQUIRED   if the event is expectedFunc: it is unhandled and in the staging area.
1489  *   (Another status)                   if event reading fails.
1490  */
1491 static NV_STATUS
1492 _kgspRpcDrainOneEvent
1493 (
1494     OBJGPU          *pGpu,
1495     OBJRPC          *pRpc,
1496     NvU32            expectedFunc,
1497     KernelGspRpcEventHandlerContext rpcHandlerContext
1498 )
1499 {
1500     NV_STATUS nvStatus;
1501 
1502     // Issue a memory barrier to ensure we see any queue updates.
1503     // Note: Without the fence, the CPU may get stuck in an infinite loop
1504     //       waiting for a message that has already arrived.
1505     portAtomicMemoryFenceFull();
1506 
1507     nvStatus = GspMsgQueueReceiveStatus(pRpc->pMessageQueueInfo, pGpu);
1508 
1509     if (nvStatus == NV_OK)
1510     {
1511         rpc_message_header_v *pMsgHdr = RPC_HDR;
1512 
1513         if (pMsgHdr->function == expectedFunc)
1514             return NV_WARN_MORE_PROCESSING_REQUIRED;
1515 
1516         _kgspProcessRpcEvent(pGpu, pRpc, rpcHandlerContext);
1517     }
1518 
1519     //
1520     // We don't expect NV_WARN_MORE_PROCESSING_REQUIRED here.
1521     // If we get it we need to suppress it to avoid confusing our caller, for whom it has special meaning.
1522     //
1523     NV_ASSERT_OR_ELSE(nvStatus != NV_WARN_MORE_PROCESSING_REQUIRED,
1524         nvStatus = NV_ERR_GENERIC);
1525 
1526     return nvStatus;
1527 }
1528 
1529 /*!
1530  * Handle RPC events from GSP until the event is [an RPC return for] expectedFunc,
1531  * or there are no events available in the buffer.
1532  *
1533  * Also dump GSP logs, and check for severe errors coming from GSP.
1534  *
1535  * @return
1536  *   NV_OK                              if one or more events are handled and there are none left.
1537  *   NV_WARN_MORE_PROCESSING_REQUIRED   if an expectedFunc event is found: it is unhandled and in the staging area.
1538  *                                        (Zero or more preceding events were successfully handled.)
1539  *   (Another status)                   if event reading or processing fails.
1540  */
1541 static NV_STATUS
1542 _kgspRpcDrainEvents
1543 (
1544     OBJGPU    *pGpu,
1545     KernelGsp *pKernelGsp,
1546     NvU32      expectedFunc,
1547     KernelGspRpcEventHandlerContext rpcHandlerContext
1548 )
1549 {
1550     NV_STATUS nvStatus = NV_OK;
1551     OBJRPC *pRpc = GPU_GET_RPC(pGpu);
1552 
1553     while (nvStatus == NV_OK)
1554     {
1555         nvStatus = _kgspRpcDrainOneEvent(pGpu, pRpc, expectedFunc, rpcHandlerContext);
1556         kgspDumpGspLogs(pKernelGsp, NV_FALSE);
1557     }
1558 
1559     // If GSP-RM has died, the GPU will need to be reset
1560     if (!kgspHealthCheck_HAL(pGpu, pKernelGsp))
1561         return NV_ERR_RESET_REQUIRED;
1562 
1563     if (nvStatus == NV_WARN_NOTHING_TO_DO)
1564         nvStatus = NV_OK;
1565 
1566     return nvStatus;
1567 }
1568 
1569 static NvU64
1570 _tsDiffToDuration
1571 (
1572     NvU64 duration,
1573     char *pDurationUnitsChar
1574 )
1575 {
1576     const NvU64 tsFreqUs = osGetTimestampFreq() / 1000000;
1577 
1578     *pDurationUnitsChar = 'u';
1579 
1580     NV_ASSERT_OR_RETURN(tsFreqUs > 0, 0);
1581 
1582     duration /= tsFreqUs;
1583 
1584     // 999999us then 1000ms
1585     if (duration >= 1000000)
1586     {
1587         duration /= 1000;
1588         *pDurationUnitsChar = 'm';
1589     }
1590 
1591     // 9999ms then 10s
1592     if (duration >= 10000)
1593     {
1594         duration /= 1000;
1595         *pDurationUnitsChar = ' '; // so caller can always just append 's'
1596     }
1597 
1598     return duration;
1599 }
1600 
1601 static NvBool
1602 _kgspIsTimestampDuringRecentRpc
1603 (
1604     OBJRPC *pRpc,
1605     NvU64 timestamp,
1606     NvBool bCheckIncompleteRpcsOnly
1607 )
1608 {
1609     NvU32 historyIndex;
1610     NvU32 historyEntry;
1611 
1612     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1613     {
1614         historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1615         if (pRpc->rpcHistory[historyEntry].function != 0)
1616         {
1617             if ((timestamp >= pRpc->rpcHistory[historyEntry].ts_start) &&
1618                 ((pRpc->rpcHistory[historyEntry].ts_end == 0) ||
1619                  (!bCheckIncompleteRpcsOnly && (timestamp <= pRpc->rpcHistory[historyEntry].ts_end))))
1620             {
1621                 return NV_TRUE;
1622             }
1623         }
1624     }
1625 
1626     return NV_FALSE;
1627 }
1628 
1629 static void
1630 _kgspLogRpcHistoryEntry
1631 (
1632     OBJGPU *pGpu,
1633     NvU32 errorNum,
1634     NvU32 historyIndex,
1635     RpcHistoryEntry *pEntry,
1636     NvBool lastColumnCondition
1637 )
1638 {
1639     NvU64 duration;
1640     char  durationUnitsChar;
1641 
1642     if (pEntry->function != 0)
1643     {
1644         duration = (pEntry->ts_end > pEntry->ts_start) ? (pEntry->ts_end - pEntry->ts_start) : 0;
1645         if (duration)
1646         {
1647             duration = _tsDiffToDuration(duration, &durationUnitsChar);
1648 
1649             NV_ERROR_LOG_DATA(pGpu, errorNum,
1650                               "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx %6llu%cs %c\n",
1651                               ((historyIndex == 0) ? ' ' : '-'),
1652                               historyIndex,
1653                               pEntry->function,
1654                               _getRpcName(pEntry->function),
1655                               pEntry->data[0],
1656                               pEntry->data[1],
1657                               pEntry->ts_start,
1658                               pEntry->ts_end,
1659                               duration, durationUnitsChar,
1660                               (lastColumnCondition ? 'y' : ' '));
1661         }
1662         else
1663         {
1664             NV_ERROR_LOG_DATA(pGpu, errorNum,
1665                               "    %c%-4d %-4d %-21.21s 0x%016llx 0x%016llx 0x%016llx 0x%016llx          %c\n",
1666                               ((historyIndex == 0) ? ' ' : '-'),
1667                               historyIndex,
1668                               pEntry->function,
1669                               _getRpcName(pEntry->function),
1670                               pEntry->data[0],
1671                               pEntry->data[1],
1672                               pEntry->ts_start,
1673                               pEntry->ts_end,
1674                               (lastColumnCondition ? 'y' : ' '));
1675         }
1676     }
1677 }
1678 
1679 void
1680 kgspLogRpcDebugInfo
1681 (
1682     OBJGPU *pGpu,
1683     OBJRPC *pRpc,
1684     NvU32   errorNum,
1685     NvBool  bPollingForRpcResponse
1686 )
1687 {
1688     const rpc_message_header_v *pMsgHdr = RPC_HDR;
1689     NvU32  historyIndex;
1690     NvU32  historyEntry;
1691     NvU64  activeData[2];
1692 
1693     _kgspGetActiveRpcDebugData(pRpc, pMsgHdr->function,
1694                                &activeData[0], &activeData[1]);
1695     NV_ERROR_LOG_DATA(pGpu, errorNum,
1696                       "GPU%d GSP RPC buffer contains function %d (%s) and data 0x%016llx 0x%016llx.\n",
1697                       gpuGetInstance(pGpu),
1698                       pMsgHdr->function, _getRpcName(pMsgHdr->function),
1699                       activeData[0], activeData[1]);
1700 
1701     NV_ERROR_LOG_DATA(pGpu, errorNum,
1702                       "GPU%d RPC history (CPU -> GSP):\n",
1703                       gpuGetInstance(pGpu));
1704     NV_ERROR_LOG_DATA(pGpu, errorNum,
1705                       "    entry function                   data0              data1              ts_start           ts_end             duration actively_polling\n");
1706     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1707     {
1708         historyEntry = (pRpc->rpcHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1709         _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcHistory[historyEntry],
1710                                 ((historyIndex == 0) && bPollingForRpcResponse));
1711     }
1712 
1713     NV_ERROR_LOG_DATA(pGpu, errorNum,
1714                       "GPU%d RPC event history (CPU <- GSP):\n",
1715                       gpuGetInstance(pGpu));
1716     NV_ERROR_LOG_DATA(pGpu, errorNum,
1717                       "    entry function                   data0              data1              ts_start           ts_end             duration during_incomplete_rpc\n");
1718     for (historyIndex = 0; historyIndex < RPC_HISTORY_DEPTH; historyIndex++)
1719     {
1720         historyEntry = (pRpc->rpcEventHistoryCurrent + RPC_HISTORY_DEPTH - historyIndex) % RPC_HISTORY_DEPTH;
1721         _kgspLogRpcHistoryEntry(pGpu, errorNum, historyIndex, &pRpc->rpcEventHistory[historyEntry],
1722                                 _kgspIsTimestampDuringRecentRpc(pRpc,
1723                                                                 pRpc->rpcEventHistory[historyEntry].ts_start,
1724                                                                 NV_TRUE/*bCheckIncompleteRpcsOnly*/));
1725     }
1726 }
1727 
1728 /*!
1729  * Log Xid 119 - GSP RPC Timeout
1730  */
1731 static void
1732 _kgspLogXid119
1733 (
1734     OBJGPU *pGpu,
1735     OBJRPC *pRpc,
1736     NvU32 expectedFunc
1737 )
1738 {
1739     RpcHistoryEntry *pHistoryEntry = &pRpc->rpcHistory[pRpc->rpcHistoryCurrent];
1740     NvU64 ts_end = osGetTimestamp();
1741     NvU64 duration;
1742     char  durationUnitsChar;
1743 
1744     if (pRpc->timeoutCount == 1)
1745     {
1746         NV_PRINTF(LEVEL_ERROR,
1747                   "********************************* GSP Timeout **********************************\n");
1748         NV_PRINTF(LEVEL_ERROR,
1749                   "Note: Please also check logs above.\n");
1750     }
1751 
1752     NV_ASSERT(expectedFunc == pHistoryEntry->function);
1753 
1754     NV_ASSERT(ts_end > pHistoryEntry->ts_start);
1755     duration = _tsDiffToDuration(ts_end - pHistoryEntry->ts_start, &durationUnitsChar);
1756 
1757     NV_ERROR_LOG(pGpu, GSP_RPC_TIMEOUT,
1758                  "Timeout after %llus of waiting for RPC response from GPU%d GSP! Expected function %d (%s) (0x%x 0x%x).",
1759                  (durationUnitsChar == 'm' ? duration / 1000 : duration),
1760                  gpuGetInstance(pGpu),
1761                  expectedFunc,
1762                  _getRpcName(expectedFunc),
1763                  pHistoryEntry->data[0],
1764                  pHistoryEntry->data[1]);
1765 
1766     if (pRpc->timeoutCount == 1)
1767     {
1768         kgspLogRpcDebugInfo(pGpu, pRpc, GSP_RPC_TIMEOUT, NV_TRUE/*bPollingForRpcResponse*/);
1769 
1770         osAssertFailed();
1771 
1772         NV_PRINTF(LEVEL_ERROR,
1773                   "********************************************************************************\n");
1774     }
1775 }
1776 
1777 static void
1778 _kgspRpcIncrementTimeoutCountAndRateLimitPrints
1779 (
1780     OBJGPU *pGpu,
1781     OBJRPC *pRpc
1782 )
1783 {
1784     pRpc->timeoutCount++;
1785 
1786     if ((pRpc->timeoutCount == (RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH + 1)) &&
1787         (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP > 0))
1788     {
1789         // make sure we warn Xid and NV_PRINTF/NVLOG consumers that we are rate limiting prints
1790         if (GPU_GET_KERNEL_RC(pGpu)->bLogEvents)
1791         {
1792             portDbgPrintf(
1793                 "NVRM: Rate limiting GSP RPC error prints for GPU at PCI:%04x:%02x:%02x (printing 1 of every %d).  The GPU likely needs to be reset.\n",
1794                 gpuGetDomain(pGpu),
1795                 gpuGetBus(pGpu),
1796                 gpuGetDevice(pGpu),
1797                 RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
1798         }
1799         NV_PRINTF(LEVEL_WARNING,
1800                   "Rate limiting GSP RPC error prints (printing 1 of every %d)\n",
1801                   RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1);
1802     }
1803 
1804     pRpc->bQuietPrints = ((pRpc->timeoutCount > RPC_TIMEOUT_LIMIT_PRINT_RATE_THRESH) &&
1805                           ((pRpc->timeoutCount % (RPC_TIMEOUT_LIMIT_PRINT_RATE_SKIP + 1)) != 0));
1806 }
1807 
1808 /*!
1809  * GSP client RM RPC poll routine
1810  */
1811 static NV_STATUS
1812 _kgspRpcRecvPoll
1813 (
1814     OBJGPU *pGpu,
1815     OBJRPC *pRpc,
1816     NvU32   expectedFunc
1817 )
1818 {
1819     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
1820     NV_STATUS  rpcStatus = NV_OK;
1821     NV_STATUS  timeoutStatus = NV_OK;
1822     RMTIMEOUT  timeout;
1823     NvU32      timeoutUs;
1824     NvU32      timeoutFlags;
1825     NvBool     bSlowGspRpc = IS_EMULATION(pGpu) || IS_SIMULATION(pGpu);
1826     NvU32      gpuMaskUnused;
1827 
1828     KernelGspRpcEventHandlerContext rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL;
1829     if (expectedFunc == NV_VGPU_MSG_EVENT_GSP_INIT_DONE)
1830     {
1831         // special case for bootup path without API lock
1832         rpcHandlerContext = KGSP_RPC_EVENT_HANDLER_CONTEXT_POLL_BOOTUP;
1833     }
1834     //
1835     // We do not allow recursive polling. This can happen if e.g.
1836     //    1. CPU-RM issues RPC-A to GSP and polls waiting for it to finish
1837     //    2. While servicing RPC-A, GSP emits an async event back to CPU-RM
1838     //    3. CPU-RM services the async event and sends another synchronous RPC-B
1839     //    4. RPC-A response will come first, but CPU-RM is now waiting on RPC-B
1840     //
1841     // We don't have a good way to handle this and should just be deferring the
1842     // second RPC until the first one is done, via e.g. osQueueWorkItem().
1843     // This assert is meant to catch and loudly fail such cases.
1844     //
1845     NV_ASSERT_OR_RETURN(!pKernelGsp->bPollingForRpcResponse, NV_ERR_INVALID_STATE);
1846     pKernelGsp->bPollingForRpcResponse = NV_TRUE;
1847 
1848     //
1849     // GSP-RM init in emulation/simulation environment is extremely slow,
1850     // so need to increment timeout.
1851     // Apply the timeout extension to other RPCs as well, mostly so that
1852     // we'll reset the thread state after each RPC, not just while waiting
1853     // for the INIT_DONE event.
1854     //
1855     if (bSlowGspRpc)
1856     {
1857         NvU32 timeoutResult;
1858 
1859         // On slow Apollo emulators, GSP-RM init could take more than an hour
1860         NV_ASSERT(portSafeMulU32(GSP_SCALE_TIMEOUT_EMU_SIM, 1500000, &timeoutResult));
1861         timeoutUs = timeoutResult;
1862     }
1863     else
1864     {
1865         NvU32 defaultus = pGpu->timeoutData.defaultus;
1866 
1867         if (IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
1868         {
1869             // Ensure at least 3.1s for vGPU-GSP before adding leeway (Bug 3928607)
1870             timeoutUs = NV_MAX(3100 * 1000, defaultus) + (defaultus / 2);
1871         }
1872         else
1873         {
1874             //
1875             // We should only ever timeout this when GSP is in really bad state, so if it just
1876             // happens to timeout on default timeout it should be OK for us to give it a little
1877             // more time - make this timeout 1.5 of the default to allow some leeway.
1878             //
1879             timeoutUs = defaultus + defaultus / 2;
1880         }
1881     }
1882 
1883     NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
1884 
1885     timeoutFlags = GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE;
1886     if (pRpc->bQuietPrints)
1887         timeoutFlags |= GPU_TIMEOUT_FLAGS_BYPASS_JOURNAL_LOG;
1888 
1889     gpuSetTimeout(pGpu, timeoutUs, &timeout, timeoutFlags);
1890 
1891     for (;;)
1892     {
1893         //
1894         // Check for GPU timeout, save that information, and then verify if the RPC is completed.
1895         // Otherwise if the CPU thread goes to sleep immediately after the RPC check, it may result in hitting a timeout.
1896         //
1897         timeoutStatus = gpuCheckTimeout(pGpu, &timeout);
1898 
1899         rpcStatus = _kgspRpcDrainEvents(pGpu, pKernelGsp, expectedFunc, rpcHandlerContext);
1900 
1901         switch (rpcStatus) {
1902             case NV_WARN_MORE_PROCESSING_REQUIRED:
1903                 // The synchronous RPC response we were waiting for is here
1904                 _kgspCompleteRpcHistoryEntry(pRpc->rpcHistory, pRpc->rpcHistoryCurrent);
1905                 rpcStatus = NV_OK;
1906                 goto done;
1907             case NV_OK:
1908                 // Check timeout and continue outer loop.
1909                 break;
1910             default:
1911                 goto done;
1912         }
1913 
1914         NV_CHECK_OK_OR_GOTO(rpcStatus, LEVEL_SILENT, _kgspRpcSanityCheck(pGpu, pKernelGsp, pRpc), done);
1915 
1916         if (timeoutStatus == NV_ERR_TIMEOUT)
1917         {
1918             rpcStatus = timeoutStatus;
1919 
1920             _kgspRpcIncrementTimeoutCountAndRateLimitPrints(pGpu, pRpc);
1921 
1922             if (!pRpc->bQuietPrints)
1923             {
1924                 _kgspLogXid119(pGpu, pRpc, expectedFunc);
1925             }
1926 
1927             goto done;
1928         }
1929         else if (timeoutStatus != NV_OK)
1930         {
1931             NV_PRINTF(LEVEL_ERROR, "gpuCheckTimeout() returned unexpected error (0x%08x)\n",
1932                       timeoutStatus);
1933             rpcStatus = timeoutStatus;
1934             goto done;
1935         }
1936 
1937         osSpinLoop();
1938     }
1939 
1940     pRpc->timeoutCount = 0;
1941 
1942 done:
1943     pKernelGsp->bPollingForRpcResponse = NV_FALSE;
1944 
1945     if (bSlowGspRpc)
1946     {
1947         // Avoid cumulative timeout due to slow RPC
1948         threadStateResetTimeout(pGpu);
1949     }
1950 
1951     return rpcStatus;
1952 }
1953 
1954 /*!
1955  * Initialize RPC objects required for interfacing with GSP.
1956  */
1957 static NV_STATUS
1958 _kgspInitRpcInfrastructure
1959 (
1960     OBJGPU    *pGpu,
1961     KernelGsp *pKernelGsp
1962 )
1963 {
1964     NV_STATUS nvStatus = NV_OK;
1965     MESSAGE_QUEUE_COLLECTION *pMQCollection = NULL;
1966 
1967     nvStatus = GspMsgQueuesInit(pGpu, &pMQCollection);
1968     if (nvStatus != NV_OK)
1969     {
1970         NV_PRINTF(LEVEL_ERROR, "GspMsgQueueInit failed\n");
1971         goto done;
1972     }
1973 
1974     pKernelGsp->pMQCollection = pMQCollection;
1975 
1976     // Init RM RPC object
1977     nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
1978                                        &pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX],
1979                                        &pKernelGsp->pRpc);
1980     if (nvStatus != NV_OK)
1981     {
1982         NV_PRINTF(LEVEL_ERROR, "init task RM RPC infrastructure failed\n");
1983         goto done;
1984     }
1985 
1986     // Init task_isr RPC object
1987     if (pKernelGsp->bIsTaskIsrQueueRequired)
1988     {
1989         nvStatus = _kgspConstructRpcObject(pGpu, pKernelGsp,
1990                                            &pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX],
1991                                            &pKernelGsp->pLocklessRpc);
1992         if (nvStatus != NV_OK)
1993         {
1994             NV_PRINTF(LEVEL_ERROR, "init task ISR RPC infrastructure failed\n");
1995             goto done;
1996         }
1997     }
1998 
1999 done:
2000     if (nvStatus != NV_OK)
2001     {
2002         _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
2003     }
2004 
2005     return nvStatus;
2006 }
2007 
2008 
2009 /*!
2010  * Initialize stripped down version of RPC infra init for GSP clients.
2011  */
2012 static NV_STATUS
2013 _kgspConstructRpcObject
2014 (
2015     OBJGPU *pGpu,
2016     KernelGsp *pKernelGsp,
2017     MESSAGE_QUEUE_INFO *pMQI,
2018     OBJRPC **ppRpc
2019 )
2020 {
2021     OBJRPC *pRpc;
2022 
2023     NV_ASSERT_OR_RETURN(pMQI != NULL, NV_ERR_INVALID_ARGUMENT);
2024 
2025     pRpc = initRpcObject(pGpu);
2026     if (pRpc == NULL)
2027     {
2028         NV_PRINTF(LEVEL_ERROR, "initRpcObject failed\n");
2029         return NV_ERR_INSUFFICIENT_RESOURCES;
2030     }
2031 
2032     pRpc->pMessageQueueInfo = pMQI;
2033 
2034     portMemSet(&pRpc->rpcHistory, 0, sizeof(pRpc->rpcHistory));
2035     pRpc->rpcHistoryCurrent = RPC_HISTORY_DEPTH - 1;
2036     portMemSet(&pRpc->rpcEventHistory, 0, sizeof(pRpc->rpcEventHistory));
2037     pRpc->rpcEventHistoryCurrent = RPC_HISTORY_DEPTH - 1;
2038 
2039     pRpc->message_buffer  = (NvU32 *)pRpc->pMessageQueueInfo->pRpcMsgBuf;
2040     pRpc->maxRpcSize      = GSP_MSG_QUEUE_RPC_SIZE_MAX;
2041 
2042     rpcSendMessage_FNPTR(pRpc) = _kgspRpcSendMessage;
2043     rpcRecvPoll_FNPTR(pRpc)    = _kgspRpcRecvPoll;
2044 
2045     *ppRpc = pRpc;
2046 
2047     return NV_OK;
2048 }
2049 
2050 static void
2051 _kgspFreeRpcInfrastructure
2052 (
2053     OBJGPU *pGpu,
2054     KernelGsp *pKernelGsp
2055 )
2056 {
2057     if (pKernelGsp->pRpc != NULL)
2058     {
2059         rpcDestroy(pGpu, pKernelGsp->pRpc);
2060         portMemFree(pKernelGsp->pRpc);
2061         pKernelGsp->pRpc = NULL;
2062     }
2063     if (pKernelGsp->pLocklessRpc != NULL)
2064     {
2065         rpcDestroy(pGpu, pKernelGsp->pLocklessRpc);
2066         portMemFree(pKernelGsp->pLocklessRpc);
2067         pKernelGsp->pLocklessRpc = NULL;
2068     }
2069     GspMsgQueuesCleanup(&pKernelGsp->pMQCollection);
2070 }
2071 
2072 /*!
2073  * Convert init arg name to 64bit id value.
2074  *
2075  * @param[in]      name  String representing name of init arg
2076  */
2077 static NvU64
2078 _kgspGenerateInitArgId(const char *name)
2079 {
2080     NvU64 id = 0;
2081     NvU8 c;
2082     NvU32 i;
2083 
2084     // Convert at most 8 characters from name into id.
2085     for (i = 0; i < (sizeof(NvU64) / sizeof(NvU8)); ++i)
2086     {
2087         c = (NvU8)*name++;
2088         if (c == '\0')
2089         {
2090             break;
2091         }
2092         id = (id << 8) | c;
2093     }
2094 
2095     return id;
2096 }
2097 
2098 static void
2099 _kgspUnmapTaskLogBuf(OBJGPU *pGpu, RM_LIBOS_LOG_MEM *pLog)
2100 {
2101     // release log memory for this task.
2102     if (pLog->pTaskLogBuffer != NULL)
2103     {
2104         memdescUnmapInternal(pGpu, pLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
2105         pLog->pTaskLogBuffer = NULL;
2106     }
2107 
2108     if (pLog->pTaskLogDescriptor != NULL)
2109     {
2110         memdescFree(pLog->pTaskLogDescriptor);
2111         memdescDestroy(pLog->pTaskLogDescriptor);
2112         pLog->pTaskLogDescriptor = NULL;
2113     }
2114 }
2115 
2116 /*!
2117  * Free vgpu partition LIBOS task logging structures
2118  */
2119 static void
2120 _kgspFreeLibosVgpuPartitionLoggingStructures
2121 (
2122     OBJGPU *pGpu,
2123     KernelGsp *pKernelGsp,
2124     NvU32 gfid
2125 )
2126 {
2127     RM_LIBOS_LOG_MEM *vgpuLogBuffers[] =
2128     {
2129         pKernelGsp->gspPluginInitTaskLogMem,
2130         pKernelGsp->gspPluginVgpuTaskLogMem
2131     };
2132 
2133     libosLogDestroy(&pKernelGsp->logDecodeVgpuPartition[gfid - 1]);
2134 
2135     // release all the vgpu tasks' log buffer memory
2136     for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(vgpuLogBuffers); ++i)
2137     {
2138         RM_LIBOS_LOG_MEM *pTaskLog = &vgpuLogBuffers[i][gfid - 1];
2139         _kgspUnmapTaskLogBuf(pGpu, pTaskLog);
2140     }
2141 }
2142 
2143 /*!
2144  * Free vgpu partition LIBOS task logging structures
2145  */
2146 NV_STATUS
2147 kgspFreeVgpuPartitionLogging_IMPL
2148 (
2149     OBJGPU *pGpu,
2150     KernelGsp *pKernelGsp,
2151     NvU32 gfid
2152 )
2153 {
2154     if (gfid > MAX_PARTITIONS_WITH_GFID)
2155     {
2156         return NV_ERR_INVALID_ARGUMENT;
2157     }
2158     else
2159     {
2160         // Make sure there is no lingering debug output.
2161         kgspDumpGspLogs(pKernelGsp, NV_FALSE);
2162 
2163         _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);
2164         return NV_OK;
2165     }
2166 }
2167 
2168 /*!
2169  * Initialize vgpu partition LIBOS task logging structures
2170  */
2171 NV_STATUS
2172 kgspInitVgpuPartitionLogging_IMPL
2173 (
2174     OBJGPU *pGpu,
2175     KernelGsp *pKernelGsp,
2176     NvU32 gfid,
2177     NvU64 initTaskLogBUffOffset,
2178     NvU64 initTaskLogBUffSize,
2179     NvU64 vgpuTaskLogBUffOffset,
2180     NvU64 vgpuTaskLogBuffSize
2181 )
2182 {
2183     struct
2184     {
2185         const char       *szMemoryId;
2186         const char       *szPrefix;
2187         const char       *elfSectionName;
2188         NvU64             bufOffset;
2189         NvU64             bufSize;
2190         RM_LIBOS_LOG_MEM *taskLogArr;
2191     } logInitValues[] =
2192     {
2193         {"LOGINIT", "INIT", ".fwlogging_init", initTaskLogBUffOffset, initTaskLogBUffSize, pKernelGsp->gspPluginInitTaskLogMem},
2194         {"LOGVGPU", "VGPU", ".fwlogging_vgpu", vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pKernelGsp->gspPluginVgpuTaskLogMem}
2195     };
2196     ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS);
2197 
2198     NV_STATUS nvStatus = NV_OK;
2199     RM_LIBOS_LOG_MEM *pTaskLog = NULL;
2200     char vm_string[8], sourceName[SOURCE_NAME_MAX_LENGTH];
2201 
2202     if (gfid > MAX_PARTITIONS_WITH_GFID)
2203     {
2204         return NV_ERR_INVALID_ARGUMENT;
2205     }
2206 
2207     if (pKernelGsp->pNvlogFlushMtx != NULL)
2208         portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx);
2209 
2210     // Source name is used to generate a tag that is a unique identifier for nvlog buffers.
2211     // As the source name 'GSP' is already in use, we will need a custom source name.
2212     nvDbgSnprintf(sourceName, SOURCE_NAME_MAX_LENGTH, "V%02d", gfid);
2213     libosLogCreateEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], sourceName);
2214 
2215     // Setup logging for each task in vgpu partition
2216     for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(logInitValues); ++i)
2217     {
2218         pTaskLog = &logInitValues[i].taskLogArr[gfid - 1];
2219         NvP64 pVa = NvP64_NULL;
2220 
2221         NV_ASSERT_OK_OR_GOTO(nvStatus,
2222             memdescCreate(&pTaskLog->pTaskLogDescriptor,
2223                           pGpu,
2224                           logInitValues[i].bufSize,
2225                           RM_PAGE_SIZE,
2226                           NV_TRUE, ADDR_FBMEM, NV_MEMORY_CACHED,
2227                           MEMDESC_FLAGS_NONE),
2228             error_cleanup);
2229 
2230         memdescDescribe(pTaskLog->pTaskLogDescriptor, ADDR_FBMEM, logInitValues[i].bufOffset,  logInitValues[i].bufSize);
2231 
2232         pVa = memdescMapInternal(pGpu, pTaskLog->pTaskLogDescriptor, TRANSFER_FLAGS_NONE);
2233         if (pVa != NvP64_NULL)
2234         {
2235             pTaskLog->pTaskLogBuffer = pVa;
2236             portMemSet(pTaskLog->pTaskLogBuffer, 0, logInitValues[i].bufSize);
2237 
2238             pTaskLog->id8 = _kgspGenerateInitArgId(logInitValues[i].szMemoryId);
2239 
2240             nvDbgSnprintf(vm_string, sizeof(vm_string), "%s%d", logInitValues[i].szPrefix, gfid);
2241 
2242             libosLogAddLogEx(&pKernelGsp->logDecodeVgpuPartition[gfid - 1],
2243                 pTaskLog->pTaskLogBuffer,
2244                 memdescGetSize(pTaskLog->pTaskLogDescriptor),
2245                 pGpu->gpuInstance,
2246                 (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT),
2247                 gpuGetChipImpl(pGpu),
2248                 vm_string,
2249                 logInitValues[i].elfSectionName);
2250         }
2251         else
2252         {
2253             NV_PRINTF(LEVEL_ERROR, "Failed to map memory for %s task log buffer for vGPU partition \n", logInitValues[i].szPrefix);
2254             nvStatus = NV_ERR_INSUFFICIENT_RESOURCES;
2255             goto error_cleanup;
2256         }
2257     }
2258 
2259     {
2260         libosLogInit(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], pKernelGsp->pLogElf, pKernelGsp->logElfDataSize);
2261         // nvlog buffers are now setup using the appropriate sourceName to avoid tag-value clash.
2262         // Now sourceName can be modified to preserve the 'GSP-VGPUx' logging convention.
2263         portStringCopy(pKernelGsp->logDecodeVgpuPartition[gfid - 1].sourceName,
2264                        SOURCE_NAME_MAX_LENGTH,
2265                        "GSP", SOURCE_NAME_MAX_LENGTH);
2266     }
2267 
2268     pKernelGsp->bHasVgpuLogs = NV_TRUE;
2269 
2270 error_cleanup:
2271     if (pKernelGsp->pNvlogFlushMtx != NULL)
2272         portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx);
2273 
2274     if (nvStatus != NV_OK)
2275         _kgspFreeLibosVgpuPartitionLoggingStructures(pGpu, pKernelGsp, gfid);
2276 
2277     return nvStatus;
2278 }
2279 
2280 void kgspNvlogFlushCb(void *pKernelGsp)
2281 {
2282     if (pKernelGsp != NULL)
2283         kgspDumpGspLogs((KernelGsp*)pKernelGsp, NV_TRUE);
2284 }
2285 
2286 /*!
2287  * Free LIBOS task logging structures
2288  */
2289 static void
2290 _kgspFreeLibosLoggingStructures
2291 (
2292     OBJGPU *pGpu,
2293     KernelGsp *pKernelGsp
2294 )
2295 {
2296     NvU8 idx;
2297 
2298     _kgspStopLogPolling(pGpu, pKernelGsp);
2299 
2300     // Make sure there is no lingering debug output.
2301     kgspDumpGspLogs(pKernelGsp, NV_FALSE);
2302 
2303     if (pKernelGsp->pLogElf == NULL)
2304         nvlogDeregisterFlushCb(kgspNvlogFlushCb, pKernelGsp);
2305 
2306     if (pKernelGsp->pNvlogFlushMtx != NULL)
2307     {
2308         portSyncMutexDestroy(pKernelGsp->pNvlogFlushMtx);
2309         pKernelGsp->pNvlogFlushMtx = NULL;
2310     }
2311 
2312     libosLogDestroy(&pKernelGsp->logDecode);
2313 
2314     for (idx = 0; idx < LOGIDX_SIZE; idx++)
2315     {
2316         RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx];
2317 
2318         // release log memory for each task.
2319         if (pLog->pTaskLogBuffer != NULL)
2320         {
2321             memdescUnmap(pLog->pTaskLogDescriptor,
2322                          NV_TRUE, osGetCurrentProcess(),
2323                          (void *)pLog->pTaskLogBuffer,
2324                          pLog->pTaskLogMappingPriv);
2325             pLog->pTaskLogBuffer = NULL;
2326             pLog->pTaskLogMappingPriv = NULL;
2327         }
2328 
2329         if (pLog->pTaskLogDescriptor != NULL)
2330         {
2331             memdescFree(pLog->pTaskLogDescriptor);
2332             memdescDestroy(pLog->pTaskLogDescriptor);
2333             pLog->pTaskLogDescriptor = NULL;
2334         }
2335     }
2336 
2337     portMemFree(pKernelGsp->pLogElf);
2338     pKernelGsp->pLogElf = NULL;
2339 }
2340 
2341 /*!
2342  * Initialize LIBOS task logging structures
2343  */
2344 static NV_STATUS
2345 _kgspInitLibosLoggingStructures
2346 (
2347     OBJGPU *pGpu,
2348     KernelGsp *pKernelGsp
2349 )
2350 {
2351     static const struct
2352     {
2353         const char *szMemoryId;
2354         const char *szPrefix;
2355         NvU32       size;
2356         const char *elfSectionName;
2357     } logInitValues[] =
2358     {
2359         {"LOGINIT", "INIT", 0x10000, ".fwlogging_init"},  // 64KB for stack traces
2360 #if defined(DEVELOP) || defined(DEBUG)
2361         // The interrupt task is in the rm elf, so they share the same logging elf too
2362         {"LOGINTR", "INTR", 0x40000, ".fwlogging_rm"},    // 256KB ISR debug log on develop/debug builds
2363         {"LOGRM",   "RM",   0x40000, ".fwlogging_rm"}     // 256KB RM debug log on develop/debug builds
2364 #else
2365         // The interrupt task is in the rm elf, so they share the same logging elf too
2366         {"LOGINTR", "INTR", 0x10000, ".fwlogging_rm"},    // 64KB ISR debug log on develop/debug builds
2367         {"LOGRM",   "RM",   0x10000, ".fwlogging_rm"}     // 64KB RM debug log on release builds
2368 #endif
2369     };
2370     ct_assert(NV_ARRAY_ELEMENTS(logInitValues) <= LIBOS_LOG_MAX_LOGS);
2371     ct_assert(NV_ARRAY_ELEMENTS(logInitValues) == LOGIDX_SIZE);
2372 
2373     NV_STATUS nvStatus = NV_OK;
2374     NvU8      idx;
2375     NvU64 flags = MEMDESC_FLAGS_NONE;
2376 
2377     // Needed only on Unix where NV_ESC_RM_LOCKLESS_DIAGNOSTIC is supported
2378     if (RMCFG_FEATURE_PLATFORM_UNIX)
2379     {
2380         pKernelGsp->pNvlogFlushMtx = portSyncMutexCreate(portMemAllocatorGetGlobalNonPaged());
2381         if (pKernelGsp->pNvlogFlushMtx == NULL)
2382         {
2383             nvStatus = NV_ERR_INSUFFICIENT_RESOURCES;
2384             goto error_cleanup;
2385         }
2386     }
2387 
2388     libosLogCreate(&pKernelGsp->logDecode);
2389 
2390     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
2391 
2392     for (idx = 0; idx < LOGIDX_SIZE; idx++)
2393     {
2394         RM_LIBOS_LOG_MEM *pLog = &pKernelGsp->rmLibosLogMem[idx];
2395         NvP64 pVa = NvP64_NULL;
2396         NvP64 pPriv = NvP64_NULL;
2397 
2398         // Setup logging memory for each task.
2399         NV_ASSERT_OK_OR_GOTO(nvStatus,
2400             memdescCreate(&pLog->pTaskLogDescriptor,
2401                           pGpu,
2402                           logInitValues[idx].size,
2403                           RM_PAGE_SIZE,
2404                           NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
2405                           flags),
2406             error_cleanup);
2407 
2408         memdescTagAlloc(nvStatus,
2409                       NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_12, pLog->pTaskLogDescriptor);
2410         NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus,
2411             error_cleanup);
2412 
2413         NV_ASSERT_OK_OR_GOTO(nvStatus,
2414             memdescMap(pLog->pTaskLogDescriptor, 0,
2415                        memdescGetSize(pLog->pTaskLogDescriptor),
2416                        NV_TRUE, NV_PROTECT_READ_WRITE,
2417                        &pVa, &pPriv),
2418             error_cleanup);
2419 
2420         pLog->pTaskLogBuffer = pVa;
2421         pLog->pTaskLogMappingPriv = pPriv;
2422         portMemSet(pLog->pTaskLogBuffer, 0, memdescGetSize(pLog->pTaskLogDescriptor));
2423 
2424         // Pass the PTE table for the log buffer in the log buffer, after the put pointer.
2425         memdescGetPhysAddrs(pLog->pTaskLogDescriptor,
2426                             AT_GPU,
2427                             0,
2428                             RM_PAGE_SIZE,
2429                             NV_CEIL(memdescGetSize(pLog->pTaskLogDescriptor), RM_PAGE_SIZE),
2430                             &pLog->pTaskLogBuffer[1]);
2431 
2432         pLog->id8 = _kgspGenerateInitArgId(logInitValues[idx].szMemoryId);
2433 
2434         libosLogAddLogEx(&pKernelGsp->logDecode,
2435             pLog->pTaskLogBuffer,
2436             memdescGetSize(pLog->pTaskLogDescriptor),
2437             pGpu->gpuInstance,
2438             (gpuGetChipArch(pGpu) >> GPU_ARCH_SHIFT),
2439             gpuGetChipImpl(pGpu),
2440             logInitValues[idx].szPrefix,
2441             logInitValues[idx].elfSectionName);
2442     }
2443 
2444 error_cleanup:
2445     if (nvStatus != NV_OK)
2446         _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
2447 
2448     return nvStatus;
2449 }
2450 
2451 static NV_STATUS
2452 _kgspInitLibosLogDecoder
2453 (
2454     OBJGPU *pGpu,
2455     KernelGsp *pKernelGsp,
2456     GSP_FIRMWARE *pGspFw
2457 )
2458 {
2459     // If there's no log ELF or it's already been wired, skip wiring it now
2460     if ((pGspFw->pLogElf == NULL) || (pKernelGsp->pLogElf != NULL))
2461         return NV_OK;
2462 
2463     // Setup symbol decoder
2464     const void *pLogData = NULL;
2465     NvU64 logSize = 0;
2466 
2467     NV_ASSERT_OK_OR_RETURN(
2468         _kgspFwContainerVerifyVersion(pGpu, pKernelGsp,
2469             pGspFw->pLogElf,
2470             pGspFw->logElfSize,
2471             "GSP firmware log"));
2472 
2473     NV_ASSERT_OK_OR_RETURN(
2474         _kgspFwContainerGetSection(pGpu, pKernelGsp,
2475             pGspFw->pLogElf,
2476             pGspFw->logElfSize,
2477             GSP_LOGGING_SECTION_NAME,
2478             &pLogData,
2479             &logSize));
2480 
2481     pKernelGsp->pLogElf = portMemAllocNonPaged(logSize);
2482     pKernelGsp->logElfDataSize = logSize;
2483 
2484     NV_ASSERT_OR_RETURN(pKernelGsp->pLogElf != NULL, NV_ERR_NO_MEMORY);
2485 
2486     portMemCopy(pKernelGsp->pLogElf, logSize, pLogData, logSize);
2487     libosLogInit(&pKernelGsp->logDecode, pKernelGsp->pLogElf, logSize);
2488 
2489     return NV_OK;
2490 }
2491 
2492 static NV_STATUS
2493 _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2494 {
2495     NvP64 pVa   = NvP64_NULL;
2496     NvP64 pPriv = NvP64_NULL;
2497     NV_STATUS nvStatus;
2498 
2499     if (!IS_SIMULATION(pGpu))
2500     {
2501         pKernelGsp->pMemDesc_simAccessBuf = NULL;
2502         pKernelGsp->pSimAccessBuf         = NULL;
2503         pKernelGsp->pSimAccessBufPriv     = NULL;
2504         return NV_ERR_NOT_SUPPORTED;
2505     }
2506 
2507     NV_ASSERT_OK_OR_GOTO(nvStatus,
2508         memdescCreate(&pKernelGsp->pMemDesc_simAccessBuf,
2509                       pGpu,
2510                       sizeof(SimAccessBuffer),
2511                       RM_PAGE_SIZE,
2512                       NV_TRUE, ADDR_SYSMEM, NV_MEMORY_UNCACHED,
2513                       MEMDESC_FLAGS_NONE),
2514         error_cleanup);
2515 
2516         memdescTagAlloc(nvStatus,
2517                 NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_13, pKernelGsp->pMemDesc_simAccessBuf);
2518         NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup);
2519 
2520     NV_ASSERT_OK_OR_GOTO(nvStatus,
2521         memdescMap(pKernelGsp->pMemDesc_simAccessBuf, 0,
2522                    memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf),
2523                    NV_TRUE, NV_PROTECT_READ_WRITE,
2524                    &pVa, &pPriv),
2525         error_cleanup);
2526 
2527     pKernelGsp->pSimAccessBuf = (SimAccessBuffer*)pVa;
2528     pKernelGsp->pSimAccessBufPriv = pPriv;
2529 
2530     portMemSet(pKernelGsp->pSimAccessBuf, 0, memdescGetSize(pKernelGsp->pMemDesc_simAccessBuf));
2531 
2532 error_cleanup:
2533     if (nvStatus != NV_OK)
2534         _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
2535 
2536     return nvStatus;
2537 }
2538 
2539 static void
2540 _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2541 {
2542     if (!IS_SIMULATION(pGpu))
2543     {
2544         return;
2545     }
2546 
2547     if (pKernelGsp->pMemDesc_simAccessBuf != NULL)
2548     {
2549         memdescFree(pKernelGsp->pMemDesc_simAccessBuf);
2550         memdescDestroy(pKernelGsp->pMemDesc_simAccessBuf);
2551     }
2552 
2553     pKernelGsp->pMemDesc_simAccessBuf = NULL;
2554     pKernelGsp->pSimAccessBuf         = NULL;
2555     pKernelGsp->pSimAccessBufPriv     = NULL;
2556 }
2557 
2558 static NV_STATUS
2559 _kgspAllocNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2560 {
2561     NvP64 pVa   = NvP64_NULL;
2562     NvP64 pPriv = NvP64_NULL;
2563     NV_STATUS nvStatus;
2564     NvU64 flags = MEMDESC_FLAGS_NONE;
2565 
2566     //
2567     // On systems with SEV enabled, the fault buffer flush sequence memory should be allocated
2568     // in unprotected sysmem as GSP will be writing to this location to let the guest
2569     // know a the issued notify op has finished as well as the status of the operation.
2570     //
2571     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
2572 
2573     NV_ASSERT_OK_OR_GOTO(nvStatus,
2574         memdescCreate(&pKernelGsp->pNotifyOpSurfMemDesc,
2575                       pGpu,
2576                       sizeof(NotifyOpSharedSurface),
2577                       RM_PAGE_SIZE,
2578                       NV_FALSE, ADDR_SYSMEM, NV_MEMORY_UNCACHED,
2579                       flags),
2580         error_cleanup);
2581 
2582         memdescTagAlloc(nvStatus,
2583                 NV_FB_ALLOC_RM_INTERNAL_OWNER_GSP_NOTIFY_OP_SURFACE, pKernelGsp->pNotifyOpSurfMemDesc);
2584         NV_ASSERT_OK_OR_GOTO(nvStatus, nvStatus, error_cleanup);
2585 
2586     NV_ASSERT_OK_OR_GOTO(nvStatus,
2587         memdescMap(pKernelGsp->pNotifyOpSurfMemDesc, 0,
2588                    memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc),
2589                    NV_TRUE, NV_PROTECT_READ_WRITE,
2590                    &pVa, &pPriv),
2591         error_cleanup);
2592 
2593     pKernelGsp->pNotifyOpSurf = (NotifyOpSharedSurface*)pVa;
2594     pKernelGsp->pNotifyOpSurfPriv = pPriv;
2595 
2596     portMemSet(pKernelGsp->pNotifyOpSurf, 0, memdescGetSize(pKernelGsp->pNotifyOpSurfMemDesc));
2597 
2598 error_cleanup:
2599     if (nvStatus != NV_OK)
2600         _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp);
2601 
2602     return nvStatus;
2603 }
2604 
2605 static void
2606 _kgspFreeNotifyOpSharedSurface(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2607 {
2608     if (pKernelGsp->pNotifyOpSurfMemDesc != NULL)
2609     {
2610         memdescFree(pKernelGsp->pNotifyOpSurfMemDesc);
2611         memdescDestroy(pKernelGsp->pNotifyOpSurfMemDesc);
2612     }
2613 
2614     pKernelGsp->pNotifyOpSurfMemDesc = NULL;
2615     pKernelGsp->pNotifyOpSurf         = NULL;
2616     pKernelGsp->pNotifyOpSurfPriv     = NULL;
2617 }
2618 
2619 /*!
2620  * Create KernelGsp object and initialize RPC infrastructure
2621  */
2622 NV_STATUS
2623 kgspConstructEngine_IMPL
2624 (
2625     OBJGPU *pGpu,
2626     KernelGsp *pKernelGsp,
2627     ENGDESCRIPTOR engDesc
2628 )
2629 {
2630     NV_STATUS nvStatus = NV_OK;
2631 
2632     if (!IS_GSP_CLIENT(pGpu))
2633         return NV_ERR_NOT_SUPPORTED;
2634 
2635     kgspConfigureFalcon_HAL(pGpu, pKernelGsp);
2636 
2637     // Init RPC objects used to communicate with GSP.
2638     nvStatus = _kgspInitRpcInfrastructure(pGpu, pKernelGsp);
2639     if (nvStatus != NV_OK)
2640     {
2641         NV_PRINTF(LEVEL_ERROR, "init RPC infrastructure failed\n");
2642         goto done;
2643     }
2644 
2645     // Init logging memory used by GSP
2646     nvStatus = _kgspInitLibosLoggingStructures(pGpu, pKernelGsp);
2647     if (nvStatus != NV_OK)
2648     {
2649         NV_PRINTF(LEVEL_ERROR, "init libos logging structures failed: 0x%x\n", nvStatus);
2650         goto done;
2651     }
2652 
2653     // Clear out the gspStaticInfo. We will populate this once GSP-RM is up.
2654     portMemSet(&pKernelGsp->gspStaticInfo, 0,
2655                sizeof(pKernelGsp->gspStaticInfo));
2656 
2657     nvStatus = kgspAllocBootArgs_HAL(pGpu, pKernelGsp);
2658     if (nvStatus != NV_OK)
2659     {
2660         NV_PRINTF(LEVEL_ERROR, "boot arg alloc failed: 0x%x\n", nvStatus);
2661         goto done;
2662     }
2663 
2664     if (IS_SIMULATION(pGpu))
2665     {
2666         nvStatus = _kgspAllocSimAccessBuffer(pGpu, pKernelGsp);
2667         if (nvStatus != NV_OK)
2668         {
2669             NV_PRINTF(LEVEL_ERROR, "sim access buffer alloc failed: 0x%x\n", nvStatus);
2670             goto done;
2671         }
2672     }
2673 
2674     nvStatus = _kgspAllocNotifyOpSharedSurface(pGpu, pKernelGsp);
2675     if (nvStatus != NV_OK)
2676     {
2677         NV_PRINTF(LEVEL_ERROR, "notify operation shared surface alloc failed: 0x%x\n", nvStatus);
2678         goto done;
2679     }
2680 
2681 #if KERNEL_GSP_TRACING_RATS_ENABLED
2682     multimapInit(&pGpu->gspTraceEventBufferBindingsUid, portMemAllocatorGetGlobalNonPaged());
2683 #endif
2684 
2685 done:
2686     if (nvStatus != NV_OK)
2687     {
2688         _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
2689         kgspFreeBootArgs_HAL(pGpu, pKernelGsp);
2690         _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
2691         _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
2692     }
2693 
2694     return nvStatus;
2695 }
2696 
2697 /*!
2698  * Convert VBIOS version containing Version and OemVersion packed together to
2699  * a string representation.
2700  *
2701  * Example:
2702  *   for Version 0x05400001, OemVersion 0x12
2703  *   input argument vbiosVersionCombined 0x0540000112
2704  *   output str "5.40.00.01.12"
2705  */
2706 static void
2707 _kgspVbiosVersionToStr(NvU64 vbiosVersionCombined, char *pVbiosVersionStr, NvU32 size)
2708 {
2709     nvDbgSnprintf(pVbiosVersionStr, size, "%2X.%02X.%02X.%02X.%02X",
2710                   (vbiosVersionCombined >> 32) & 0xff,
2711                   (vbiosVersionCombined >> 24) & 0xff,
2712                   (vbiosVersionCombined >> 16) & 0xff,
2713                   (vbiosVersionCombined >> 8) & 0xff,
2714                   (vbiosVersionCombined) & 0xff);
2715 }
2716 
2717 static NV_STATUS
2718 _kgspPrepareScrubberImageIfNeeded(OBJGPU *pGpu, KernelGsp *pKernelGsp)
2719 {
2720     // Prepare Scrubber ucode image if pre-scrubbed memory is insufficient
2721     NvU64 neededSize = pKernelGsp->pWprMeta->fbSize - pKernelGsp->pWprMeta->gspFwRsvdStart;
2722     NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp);
2723     NV_PRINTF(LEVEL_INFO, "pre-scrubbed memory: 0x%llx bytes, needed: 0x%llx bytes\n",
2724               prescrubbedSize, neededSize);
2725 
2726     if (neededSize > prescrubbedSize)
2727         NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
2728             kgspAllocateScrubberUcodeImage(pGpu, pKernelGsp, &pKernelGsp->pScrubberUcode));
2729 
2730     return NV_OK;
2731 }
2732 
2733 /*!
2734  * Prepare and place RPCs in message queue that GSP-RM will process
2735  * in early boot before OBJGPU is created.
2736  *
2737  * @param[in] pGpu        GPU object pointer
2738  * @param[in] pKernelGsp  KernelGsp object pointer
2739  *
2740  * @return NV_OK if RPCs queued successfully.
2741  *         Appropriate NV_ERR_xxx value otherwise.
2742  */
2743 NV_STATUS
2744 kgspQueueAsyncInitRpcs_IMPL
2745 (
2746     OBJGPU    *pGpu,
2747     KernelGsp *pKernelGsp
2748 )
2749 {
2750     NV_STATUS status = NV_OK;
2751 
2752     NV_RM_RPC_GSP_SET_SYSTEM_INFO(pGpu, status);
2753     if (status != NV_OK)
2754     {
2755         NV_ASSERT_OK_FAILED("NV_RM_RPC_GSP_SET_SYSTEM_INFO", status);
2756         return status;
2757     }
2758 
2759     NV_RM_RPC_SET_REGISTRY(pGpu, status);
2760     if (status != NV_OK)
2761     {
2762         NV_ASSERT_OK_FAILED("NV_RM_RPC_SET_REGISTRY", status);
2763         return status;
2764     }
2765 
2766     return NV_OK;
2767 }
2768 
2769 static NvBool
2770 _kgspShouldRelaxGspInitLocking
2771 (
2772     OBJGPU *pGpu
2773 )
2774 {
2775     NvU32 relaxGspInitLockingReg;
2776 
2777     if (!RMCFG_FEATURE_PLATFORM_UNIX)
2778     {
2779         return NV_FALSE;
2780     }
2781 
2782     if (gpuIsCCFeatureEnabled(pGpu))
2783     {
2784         return NV_FALSE;
2785     }
2786 
2787     if (osReadRegistryDword(pGpu, NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING, &relaxGspInitLockingReg) != NV_OK)
2788     {
2789         relaxGspInitLockingReg = NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT;
2790     }
2791 
2792     // Due to bug 4399629, restrict which platforms have parallel init enabled by default
2793     if (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_DEFAULT)
2794     {
2795         NvU16 devId = (NvU16)(((pGpu->idInfo.PCIDeviceID) >> 16) & 0x0000FFFF);
2796         NvU32 i;
2797 
2798         static const NvU16 defaultRelaxGspInitLockingGpus[] = {
2799             0x1EB8, // T4
2800             0x1EB9, // T4
2801         };
2802 
2803         if (IsHOPPER(pGpu) || IsADA(pGpu))
2804         {
2805             return NV_TRUE;
2806         }
2807 
2808         for (i = 0; i < NV_ARRAY_ELEMENTS(defaultRelaxGspInitLockingGpus); i++)
2809         {
2810             if (devId == defaultRelaxGspInitLockingGpus[i])
2811             {
2812                 return NV_TRUE;
2813             }
2814         }
2815         return NV_FALSE;
2816     }
2817 
2818     return (relaxGspInitLockingReg == NV_REG_STR_RM_RELAXED_GSP_INIT_LOCKING_ENABLE);
2819     return NV_FALSE;
2820 }
2821 
2822 static NV_STATUS
2823 _kgspBootReacquireLocks(OBJGPU *pGpu, KernelGsp *pKernelGsp, GPU_MASK *pGpusLockedMask)
2824 {
2825     //
2826     // To follow lock order constraints, GPU lock needs to be released before acquiring API lock
2827     // As this path doesn't go through resource server, no client locks should be held at this point.
2828     // Note: we must not hold any client locks when re-acquiring the API per lock ordering
2829     //
2830     rmGpuGroupLockRelease(*pGpusLockedMask, GPUS_LOCK_FLAGS_NONE);
2831     *pGpusLockedMask = 0;
2832 
2833     //
2834     // rmapiLockAcquire should never fail on Linux if the API lock and GPU locks are not held.
2835     // Failure to acquire the API lock means the cleanup sequence will skipped since it is
2836     // unsafe without the lock.
2837     //
2838     NV_ASSERT_OK_OR_RETURN(rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT));
2839 
2840     //
2841     // This should never fail on Linux due to locks in the Unix layer.
2842     // This will need to be revisited when parallel init is enabled on other platforms.
2843     //
2844     NV_ASSERT_OR_RETURN(gpumgrIsGpuPointerAttached(pGpu), NV_ERR_INVALID_DEVICE);
2845 
2846     // Reqcquire the GPU lock released above.
2847     NV_ASSERT_OK_OR_RETURN(rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
2848                                                  GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT,
2849                                                  pGpusLockedMask));
2850 
2851     return NV_OK;
2852 }
2853 
2854 static NV_STATUS
2855 _kgspBootGspRm(OBJGPU *pGpu, KernelGsp *pKernelGsp, GSP_FIRMWARE *pGspFw, GPU_MASK *pGpusLockedMask)
2856 {
2857     NV_STATUS status;
2858 
2859     // Fail early if WPR2 is up
2860     if (kgspIsWpr2Up_HAL(pGpu, pKernelGsp))
2861     {
2862         NV_PRINTF(LEVEL_ERROR, "unexpected WPR2 already up, cannot proceed with booting GSP\n");
2863         NV_PRINTF(LEVEL_ERROR, "(the GPU is likely in a bad state and may need to be reset)\n");
2864         return NV_ERR_INVALID_STATE;
2865     }
2866 
2867     // Calculate FB layout (requires knowing FB size which depends on GFW_BOOT)
2868     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspCalculateFbLayout_HAL(pGpu, pKernelGsp, pGspFw));
2869 
2870     // If the new FB layout requires a scrubber ucode to scrub additional space, prepare it now
2871     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, _kgspPrepareScrubberImageIfNeeded(pGpu, pKernelGsp));
2872 
2873     // Setup arguments for bootstrapping GSP
2874     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kgspPrepareForBootstrap_HAL(pGpu, pKernelGsp, pGspFw));
2875 
2876     // Release the API lock if relaxed locking for parallel init is enabled
2877     NvBool bRelaxedLocking = _kgspShouldRelaxGspInitLocking(pGpu);
2878     if (bRelaxedLocking)
2879         rmapiLockRelease();
2880 
2881     // Proceed with GSP boot - if it fails, check for ECC errors
2882     status = kgspBootstrap_HAL(pGpu, pKernelGsp, pGspFw);
2883     if ((status != NV_OK) && gpuCheckEccCounts_HAL(pGpu))
2884         status = NV_ERR_ECC_ERROR;
2885 
2886     pKernelGsp->bootAttempts++;
2887 
2888     //
2889     // The caller will check that both the API lock and the GPU lock will be held upon return from
2890     // this function, regardless of whether GSP bootstrap succeeded.
2891     //
2892     if (bRelaxedLocking)
2893         NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
2894                               _kgspBootReacquireLocks(pGpu, pKernelGsp, pGpusLockedMask));
2895 
2896     return status;
2897 }
2898 
2899 /*!
2900  * Initialize GSP-RM
2901  *
2902  * @param[in]      pGpu          GPU object pointer
2903  * @param[in]      pKernelGsp    KernelGsp object pointer
2904  * @param[in]      pGspFw        GSP firmware structure pointer
2905  *
2906  * @return NV_OK if GSP fw RM offload successfully initialized.
2907  *         Appropriate NV_ERR_xxx value otherwise.
2908  */
2909 NV_STATUS
2910 kgspInitRm_IMPL
2911 (
2912     OBJGPU       *pGpu,
2913     KernelGsp    *pKernelGsp,
2914     GSP_FIRMWARE *pGspFw
2915 )
2916 {
2917     NV_STATUS  status = NV_OK;
2918     OBJTMR    *pTmr = GPU_GET_TIMER(pGpu);
2919     GPU_MASK   gpusLockedMask = 0;
2920 
2921     if (!IS_GSP_CLIENT(pGpu))
2922         return NV_OK;
2923 
2924     if ((pGspFw == NULL) || (pGspFw->pBuf == NULL) || (pGspFw->size == 0))
2925     {
2926         NV_PRINTF(LEVEL_ERROR, "need firmware to initialize GSP\n");
2927         return NV_ERR_INVALID_ARGUMENT;
2928     }
2929 
2930     pKernelGsp->bInInit = NV_TRUE;
2931 
2932     // Need to hold the GPU instance lock in order to write to the RPC queue
2933     NV_ASSERT_OK_OR_GOTO(status,
2934         rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
2935                               GPUS_LOCK_FLAGS_NONE, RM_LOCK_MODULES_INIT, &gpusLockedMask),
2936         done);
2937 
2938     /*
2939      * For GSP-RM boot, we must trigger FRTS (if it exists for the chip)
2940      * before loading GSP-RM so that FRTS data and GSP-RM code/data/heap can coexist
2941      * in WPR2. FRTS is triggered by running a VBIOS-provided ucode called FWSEC.
2942      *
2943      * Here, we extract a VBIOS image from ROM, and parse it for FWSEC.
2944      */
2945     if (pKernelGsp->pFwsecUcode == NULL)
2946     {
2947         KernelGspVbiosImg *pVbiosImg = NULL;
2948 
2949         // Start VBIOS version string as "unknown"
2950         portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown"));
2951 
2952         // Try and extract a VBIOS image.
2953         status = kgspExtractVbiosFromRom_HAL(pGpu, pKernelGsp, &pVbiosImg);
2954 
2955         if (status == NV_OK)
2956         {
2957             NvU64 vbiosVersionCombined = 0;
2958 
2959             // Got a VBIOS image, now parse it for FWSEC.
2960             status = kgspParseFwsecUcodeFromVbiosImg(pGpu, pKernelGsp, pVbiosImg,
2961                                                      &pKernelGsp->pFwsecUcode, &vbiosVersionCombined);
2962             kgspFreeVbiosImg(pVbiosImg);
2963 
2964             if (vbiosVersionCombined > 0)
2965             {
2966                 _kgspVbiosVersionToStr(vbiosVersionCombined, pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr));
2967             }
2968 
2969             if (status != NV_OK)
2970             {
2971                 NV_PRINTF(LEVEL_ERROR, "failed to parse FWSEC ucode from VBIOS image (VBIOS version %s): 0x%x\n",
2972                           pKernelGsp->vbiosVersionStr, status);
2973                 goto done;
2974             }
2975 
2976             NV_PRINTF(LEVEL_INFO, "parsed VBIOS version %s\n", pKernelGsp->vbiosVersionStr);
2977         }
2978         else if (status == NV_ERR_NOT_SUPPORTED)
2979         {
2980             // Extracting VBIOS image from ROM is not supported.
2981             status = NV_OK;
2982         }
2983         else
2984         {
2985             NV_PRINTF(LEVEL_ERROR, "failed to extract VBIOS image from ROM: 0x%x\n",
2986                         status);
2987             goto done;
2988         }
2989 
2990     }
2991 
2992     /*
2993      * We use a set of Booter ucodes to boot GSP-RM as well as manage its lifecycle.
2994      *
2995      * Booter Load loads, verifies, and boots GSP-RM in WPR2.
2996      * Booter Unload tears down WPR2 for driver unload.
2997      *
2998      * Here we prepare the Booter ucode images in SYSMEM so they may be loaded onto
2999      * SEC2 (Load / Unload) and NVDEC0 (Unload).
3000      */
3001     if (pKernelGsp->bPartitionedFmc)
3002     {
3003         //
3004         // The secure boot ucode is included in the partitioned FMC, no need for
3005         // separate Booter ucodes.
3006         //
3007     }
3008     else
3009     {
3010         if (pKernelGsp->pBooterLoadUcode == NULL)
3011         {
3012             status = kgspAllocateBooterLoadUcodeImage(pGpu, pKernelGsp,
3013                                                       &pKernelGsp->pBooterLoadUcode);
3014             if (status != NV_OK)
3015             {
3016                 NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Load ucode: 0x%x\n", status);
3017                 goto done;
3018             }
3019         }
3020 
3021         if (pKernelGsp->pBooterUnloadUcode == NULL)
3022         {
3023             status = kgspAllocateBooterUnloadUcodeImage(pGpu, pKernelGsp,
3024                                                         &pKernelGsp->pBooterUnloadUcode);
3025             if (status != NV_OK)
3026             {
3027                 NV_PRINTF(LEVEL_ERROR, "failed to allocate Booter Unload ucode: 0x%x\n", status);
3028                 goto done;
3029             }
3030         }
3031     }
3032 
3033     // Prepare boot binary image.
3034     status = kgspPrepareBootBinaryImage(pGpu, pKernelGsp);
3035     if (status != NV_OK)
3036     {
3037         NV_PRINTF(LEVEL_ERROR, "Error preparing boot binary image\n");
3038         goto done;
3039     }
3040 
3041     // Prepare GSP-RM image.
3042     status = _kgspPrepareGspRmBinaryImage(pGpu, pKernelGsp, pGspFw);
3043     if (status != NV_OK)
3044     {
3045         NV_PRINTF(LEVEL_ERROR, "Error preparing GSP-RM image\n");
3046         goto done;
3047     }
3048 
3049     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, _kgspInitLibosLogDecoder(pGpu, pKernelGsp, pGspFw), done);
3050 
3051     //
3052     // Do not register nvlog flush callback if:
3053     // 1. Live decoding is enabled, as logs will be printed to dmesg.
3054     // 2. NV_ESC_RM_LOCKLESS_DIAGNOSTIC is not supported on this platform, i.e. pNvlogFlushMtx=NULL.
3055     //
3056     if (pKernelGsp->pLogElf == NULL && pKernelGsp->pNvlogFlushMtx != NULL)
3057         NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, nvlogRegisterFlushCb(kgspNvlogFlushCb, pKernelGsp), done);
3058 
3059     // Reset thread state timeout and wait for GFW_BOOT OK status
3060     threadStateResetTimeout(pGpu);
3061     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspWaitForGfwBootOk_HAL(pGpu, pKernelGsp), done);
3062 
3063     //
3064     // Set the GPU time to the wall-clock time after GFW boot is complete
3065     // (to avoid PLM collisions) but before loading GSP-RM ucode (which
3066     // consumes the updated GPU time).
3067     //
3068     tmrSetCurrentTime_HAL(pGpu, pTmr);
3069 
3070     // Initialize libos init args list
3071     kgspSetupLibosInitArgs(pGpu, pKernelGsp);
3072 
3073     // Fill in the GSP-RM message queue init parameters
3074     kgspPopulateGspRmInitArgs(pGpu, pKernelGsp, NULL);
3075 
3076     //
3077     // If ConfCompute is enabled, all RPC traffic must be encrypted. Since we
3078     // can't encrypt until GSP boots and session is established, we must send
3079     // these messages later (kgspBootstrap_HAL) in CC.
3080     //
3081     ConfidentialCompute *pCC = GPU_GET_CONF_COMPUTE(pGpu);
3082     if (pCC == NULL || !pCC->getProperty(pCC, PDB_PROP_CONFCOMPUTE_CC_FEATURE_ENABLED))
3083     {
3084         //
3085         // Stuff the message queue with async init messages that will be run
3086         // before OBJGPU is created.
3087         //
3088         status = kgspQueueAsyncInitRpcs(pGpu, pKernelGsp);
3089         if (status != NV_OK)
3090         {
3091             goto done;
3092         }
3093     }
3094 
3095     //
3096     // Bring up ucode with RM offload task.
3097     // If an ECC error occurs which results in the failure of the bootstrap, try again.
3098     // Subsequent attempts will shift the GSP region of FB in an attempt to avoid the
3099     // unstable memory.
3100     //
3101     const NvU8 MAX_GSP_BOOT_ATTEMPTS = 4;
3102     do
3103     {
3104         // Reset the thread state timeout after failed attempts to prevent premature timeouts.
3105         if (status != NV_OK)
3106             threadStateResetTimeout(pGpu);
3107 
3108         //
3109         // _kgspBootGspRm() will return NV_ERR_ECC_ERROR if any unhandled ECC errors are
3110         // detected during a failed GSP boot attempt. Depending on where and when the
3111         // error occurred, we may not be able to try again, in which case a different
3112         // error code will be returned.
3113         //
3114         status = _kgspBootGspRm(pGpu, pKernelGsp, pGspFw, &gpusLockedMask);
3115 
3116         //
3117         // _kgspBootGspRm() may temporarily release locks to facilitate parallel GSP bootstrap on
3118         // other GPUs. It is responsible for reacquiring them in the proper order. If there is a
3119         // failure to reacquire locks, it is unsafe to continue, regardless of the initialization
3120         // status - so we return immediately here, rather attempting cleanup.
3121         //
3122         // Note: _kgspBootGspRm() is structured such that gpusLockedMask will always be 0 (no GPU
3123         //       locks held) if the API lock is not held upon return.
3124         //
3125         NV_ASSERT_OR_RETURN(rmapiLockIsOwner() && (gpusLockedMask != 0),
3126                             NV_ERR_INVALID_LOCK_STATE);
3127     } while ((status == NV_ERR_ECC_ERROR) && (pKernelGsp->bootAttempts < MAX_GSP_BOOT_ATTEMPTS));
3128 
3129     if (status != NV_OK)
3130     {
3131         if (status == NV_ERR_INSUFFICIENT_POWER)
3132         {
3133             OBJSYS *pSys = SYS_GET_INSTANCE();
3134             OBJGPUMGR *pGpuMgr = SYS_GET_GPUMGR(pSys);
3135 
3136             pGpuMgr->powerDisconnectedGpuBus[pGpuMgr->powerDisconnectedGpuCount++] = gpuGetBus(pGpu);
3137         }
3138 
3139         //
3140         // Ignore return value - a crash report may have already been consumed,
3141         // this is just here as a last attempt to report boot issues that might
3142         // have escaped prior checks.
3143         //
3144         (void)kgspHealthCheck_HAL(pGpu, pKernelGsp);
3145         goto done;
3146     }
3147 
3148     // at this point we should be able to exchange RPCs with RM offload task
3149     NV_RM_RPC_SET_GUEST_SYSTEM_INFO(pGpu, status);
3150     if (status != NV_OK)
3151     {
3152         NV_PRINTF(LEVEL_ERROR, "SET_GUEST_SYSTEM_INFO failed: 0x%x\n", status);
3153         goto done;
3154     }
3155 
3156     NV_RM_RPC_GET_GSP_STATIC_INFO(pGpu, status);
3157     if (status != NV_OK)
3158     {
3159         NV_PRINTF(LEVEL_ERROR, "GET_GSP_STATIC_INFO failed: 0x%x\n", status);
3160         goto done;
3161     }
3162 
3163     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspStartLogPolling(pGpu, pKernelGsp), done);
3164 
3165 done:
3166     pKernelGsp->bInInit = NV_FALSE;
3167 
3168     if (status != NV_OK)
3169     {
3170         KernelPmu *pKernelPmu = GPU_GET_KERNEL_PMU(pGpu);
3171 
3172         // Preserve any captured GSP-RM logs
3173         libosPreserveLogs(&pKernelGsp->logDecode);
3174 
3175         if (pKernelPmu != NULL)
3176         {
3177             // If PMU init fails, kgsp init will also fail
3178             libosPreserveLogs(&pKernelPmu->logDecode);
3179         }
3180     }
3181 
3182     if (gpusLockedMask != 0)
3183     {
3184         rmGpuGroupLockRelease(gpusLockedMask, GPUS_LOCK_FLAGS_NONE);
3185     }
3186 
3187     return status;
3188 }
3189 
3190 /*!
3191  * Unload GSP-RM
3192  */
3193 NV_STATUS
3194 kgspUnloadRm_IMPL
3195 (
3196     OBJGPU *pGpu,
3197     KernelGsp *pKernelGsp
3198 )
3199 {
3200     NV_STATUS rpcStatus = NV_OK;
3201     NV_STATUS status;
3202     KernelGspPreparedFwsecCmd preparedCmd;
3203 
3204     NV_PRINTF(LEVEL_INFO, "unloading GSP-RM\n");
3205     NV_RM_RPC_UNLOADING_GUEST_DRIVER(pGpu, rpcStatus, NV_FALSE, NV_FALSE, 0);
3206 
3207     if (gpuIsCCFeatureEnabled(pGpu))
3208     {
3209         // FIPS: If CC enabled, we need to confirm GSP-RM was able to teardown CC state.
3210         kgspCheckGspRmCcCleanup_HAL(pGpu, pKernelGsp);
3211     }
3212 
3213     // Wait for GSP-RM processor to suspend
3214     kgspWaitForProcessorSuspend_HAL(pGpu, pKernelGsp);
3215 
3216     // Dump GSP-RM logs and reset before invoking FWSEC-SB
3217     kgspDumpGspLogs(pKernelGsp, NV_FALSE);
3218 
3219     //
3220     // Avoid cascading timeouts when attempting to invoke the below ucodes if
3221     // we are unloading due to a GSP-RM timeout.
3222     //
3223     threadStateResetTimeout(pGpu);
3224 
3225     // Because of COT, RM cannot reset GSP-RISCV and FSP has exclusive access to reset and reboot GSP for next run.
3226     if(!(pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_COT_ENABLED)))
3227     {
3228         kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
3229     }
3230 
3231     // Invoke FWSEC-SB to put back PreOsApps during driver unload
3232     status = kgspPrepareForFwsecSb_HAL(pGpu, pKernelGsp, pKernelGsp->pFwsecUcode, &preparedCmd);
3233     if (status == NV_ERR_NOT_SUPPORTED)
3234     {
3235         // skip FWSEC-SB during driver unload if unsupported (e.g. on Hopper+)
3236         status = NV_OK;
3237     }
3238     else if (status != NV_OK)
3239     {
3240         NV_PRINTF(LEVEL_ERROR, "failed to prepare for FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
3241         NV_ASSERT(0);
3242     }
3243     else
3244     {
3245         status = kgspExecuteFwsec_HAL(pGpu, pKernelGsp, &preparedCmd);
3246         if (status != NV_OK)
3247         {
3248             NV_PRINTF(LEVEL_ERROR, "failed to execute FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status);
3249             NV_ASSERT(0);
3250         }
3251     }
3252 
3253     if (pKernelGsp->bPartitionedFmc)
3254     {
3255         //
3256         // GSP-RM invokes the partitioned FMC to unload directly as part of the
3257         // NV_RM_RPC_UNLOADING_GUEST_DRIVER call above.
3258         //
3259         status = rpcStatus;
3260     }
3261     else
3262     {
3263         // After instructing GSP-RM to unload itself, run Booter Unload to teardown WPR2
3264         status = kgspExecuteBooterUnloadIfNeeded_HAL(pGpu, pKernelGsp, 0);
3265     }
3266 
3267     //
3268     // To fix boot issue after GPU reset on ESXi config:
3269     // We still do not have root cause but looks like some sanity is failing during boot after reset is done.
3270     // As temp WAR, add delay of 250 ms after gsp rm unload is done.
3271     // Limit this to [VGPU-GSP] supported configs only and when we are in GPU RESET path.
3272     //
3273     if (API_GPU_IN_RESET_SANITY_CHECK(pGpu) &&
3274         gpuIsSriovEnabled(pGpu) &&
3275         IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
3276     {
3277         osDelay(250);
3278     }
3279 
3280     if (rpcStatus != NV_OK)
3281     {
3282         return rpcStatus;
3283     }
3284 
3285     return status;
3286 }
3287 
3288 /*!
3289  * Free RPC infrastructure and KernelGsp object
3290  */
3291 void
3292 kgspDestruct_IMPL
3293 (
3294     KernelGsp *pKernelGsp
3295 )
3296 {
3297     OBJGPU *pGpu = ENG_GET_GPU(pKernelGsp);
3298 
3299     if (!IS_GSP_CLIENT(pGpu))
3300         return;
3301 
3302     // set VBIOS version string back to "unknown"
3303     portStringCopy(pKernelGsp->vbiosVersionStr, sizeof(pKernelGsp->vbiosVersionStr), "unknown", sizeof("unknown"));
3304 
3305     kgspFreeFlcnUcode(pKernelGsp->pFwsecUcode);
3306     pKernelGsp->pFwsecUcode = NULL;
3307 
3308     kgspFreeFlcnUcode(pKernelGsp->pBooterLoadUcode);
3309     pKernelGsp->pBooterLoadUcode = NULL;
3310 
3311     kgspFreeFlcnUcode(pKernelGsp->pBooterUnloadUcode);
3312     pKernelGsp->pBooterUnloadUcode = NULL;
3313 
3314     kgspFreeFlcnUcode(pKernelGsp->pScrubberUcode);
3315     pKernelGsp->pScrubberUcode = NULL;
3316 
3317     kgspFreeBootArgs_HAL(pGpu, pKernelGsp);
3318 
3319     _kgspFreeLibosLoggingStructures(pGpu, pKernelGsp);
3320     _kgspFreeRpcInfrastructure(pGpu, pKernelGsp);
3321     _kgspFreeBootBinaryImage(pGpu, pKernelGsp);
3322     _kgspFreeSimAccessBuffer(pGpu, pKernelGsp);
3323     _kgspFreeNotifyOpSharedSurface(pGpu, pKernelGsp);
3324 
3325     kgspFreeSuspendResumeData_HAL(pGpu, pKernelGsp);
3326 
3327 #if KERNEL_GSP_TRACING_RATS_ENABLED
3328     multimapDestroy(&pGpu->gspTraceEventBufferBindingsUid);
3329 #endif
3330 }
3331 
3332 void
3333 kgspDumpGspLogsUnlocked_IMPL
3334 (
3335     KernelGsp *pKernelGsp,
3336     NvBool bSyncNvLog
3337 )
3338 {
3339     if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
3340     {
3341         libosExtractLogs(&pKernelGsp->logDecode, bSyncNvLog);
3342 
3343         if (pKernelGsp->bHasVgpuLogs)
3344         {
3345             // Dump logs from vGPU partition
3346             for (NvU32 i = 0; i < MAX_PARTITIONS_WITH_GFID; i++)
3347             {
3348                 libosExtractLogs(&pKernelGsp->logDecodeVgpuPartition[i], bSyncNvLog);
3349             }
3350         }
3351     }
3352 
3353 }
3354 
3355 /*!
3356  * Dump logs coming from GSP-RM
3357  *
3358  * @param[in] pKernelGsp    KernelGsp pointer
3359  * @param[in] bSyncNvLog    NV_TRUE: Copy a snapshot of the libos logs
3360  *                          into the nvLog wrap buffers.
3361  */
3362 void
3363 kgspDumpGspLogs_IMPL
3364 (
3365     KernelGsp *pKernelGsp,
3366     NvBool bSyncNvLog
3367 )
3368 {
3369     if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
3370     {
3371         if (pKernelGsp->pNvlogFlushMtx != NULL)
3372             portSyncMutexAcquire(pKernelGsp->pNvlogFlushMtx);
3373 
3374         kgspDumpGspLogsUnlocked(pKernelGsp, bSyncNvLog);
3375 
3376         if (pKernelGsp->pNvlogFlushMtx != NULL)
3377             portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx);
3378     }
3379 }
3380 
3381 /*!
3382  * Populate GSP-RM init arguments.
3383  */
3384 void
3385 kgspPopulateGspRmInitArgs_IMPL
3386 (
3387     OBJGPU    *pGpu,
3388     KernelGsp *pKernelGsp,
3389     GSP_SR_INIT_ARGUMENTS *pGspInitArgs
3390 )
3391 {
3392     GSP_ARGUMENTS_CACHED *pGspArgs = pKernelGsp->pGspArgumentsCached;
3393     MESSAGE_QUEUE_INIT_ARGUMENTS *pMQInitArgs = &pGspArgs->messageQueueInitArguments;
3394     MESSAGE_QUEUE_COLLECTION *pMQCollection   = pKernelGsp->pMQCollection;
3395     GSP_SR_INIT_ARGUMENTS *pSrInitArgs =  &pGspArgs->srInitArguments;
3396 
3397     // Setup the message queue arguments
3398     pMQInitArgs->sharedMemPhysAddr      = pMQCollection->sharedMemPA;
3399     pMQInitArgs->pageTableEntryCount    = pMQCollection->pageTableEntryCount;
3400     pMQInitArgs->cmdQueueOffset         = pMQCollection->pageTableSize;
3401     pMQInitArgs->statQueueOffset        = pMQInitArgs->cmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].commandQueueSize;
3402     if (pKernelGsp->bIsTaskIsrQueueRequired)
3403     {
3404         pMQInitArgs->locklessCmdQueueOffset  = pMQInitArgs->statQueueOffset        + pMQCollection->rpcQueues[RPC_TASK_RM_QUEUE_IDX].statusQueueSize;
3405         pMQInitArgs->locklessStatQueueOffset = pMQInitArgs->locklessCmdQueueOffset + pMQCollection->rpcQueues[RPC_TASK_ISR_QUEUE_IDX].commandQueueSize;
3406     }
3407     else
3408     {
3409         pMQInitArgs->locklessCmdQueueOffset  = 0;
3410         pMQInitArgs->locklessStatQueueOffset = 0;
3411     }
3412 
3413     if (pGspInitArgs == NULL)
3414     {
3415         pSrInitArgs->bInPMTransition     = NV_FALSE;
3416         pSrInitArgs->oldLevel            = 0;
3417         pSrInitArgs->flags               = 0;
3418     }
3419     else
3420     {
3421         pSrInitArgs->bInPMTransition     = NV_TRUE;
3422         pSrInitArgs->oldLevel            = pGspInitArgs->oldLevel;
3423         pSrInitArgs->flags               = pGspInitArgs->flags;
3424     }
3425 
3426     pGspArgs->gpuInstance = pGpu->gpuInstance;
3427 
3428     portMemSet(&pGspArgs->profilerArgs, 0, sizeof(pGspArgs->profilerArgs));
3429 
3430     if (pKernelGsp->pProfilerSamples != NULL &&
3431         pKernelGsp->pProfilerSamplesMD != NULL)
3432     {
3433         pGspArgs->profilerArgs.pa = memdescGetPhysAddr(pKernelGsp->pProfilerSamplesMD, AT_GPU, 0);
3434         pGspArgs->profilerArgs.size = memdescGetSize(pKernelGsp->pProfilerSamplesMD);
3435     }
3436 }
3437 
3438 /*!
3439  * Prepare boot binary image for GSP-RM boot.
3440  *
3441  * @return NV_OK if boot binary image prepared successfully.
3442  *         Appropriate NV_ERR_xxx value otherwise.
3443  */
3444 NV_STATUS
3445 kgspPrepareBootBinaryImage_IMPL
3446 (
3447     OBJGPU *pGpu,
3448     KernelGsp *pKernelGsp
3449 )
3450 {
3451     NV_STATUS status;
3452     BINDATA_STORAGE *pBinStorageImage;
3453     BINDATA_STORAGE *pBinStorageDesc;
3454     NvU32 bufSize;
3455     NvU32 bufSizeAligned;
3456     RM_RISCV_UCODE_DESC *pDesc = NULL;
3457     NvP64 pVa = NvP64_NULL;
3458     NvP64 pPriv = NvP64_NULL;
3459     NvU64 flags = MEMDESC_FLAGS_NONE;
3460 
3461     NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeImage == NULL, NV_ERR_INVALID_STATE);
3462     NV_ASSERT_OR_RETURN(pKernelGsp->pGspRmBootUcodeDesc  == NULL, NV_ERR_INVALID_STATE);
3463 
3464     // get the bindata storage for the image/descriptor
3465     kgspGetGspRmBootUcodeStorage_HAL(pGpu, pKernelGsp, &pBinStorageImage, &pBinStorageDesc);
3466 
3467     // copy the image to sysmem
3468     bufSize = bindataGetBufferSize(pBinStorageImage);
3469     bufSizeAligned = NV_ALIGN_UP(bufSize, 0x1000);
3470 
3471     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
3472 
3473     NV_ASSERT_OK_OR_GOTO(status,
3474                         memdescCreate(&pKernelGsp->pGspRmBootUcodeMemdesc,
3475                                 pGpu,
3476                                 bufSizeAligned,
3477                                 RM_PAGE_SIZE,
3478                                 NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED,
3479                                 flags),
3480                         fail);
3481 
3482     memdescTagAlloc(status,
3483             NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_15, pKernelGsp->pGspRmBootUcodeMemdesc);
3484     NV_ASSERT_OK_OR_GOTO(status, status, fail);
3485 
3486     NV_ASSERT_OK_OR_GOTO(status,
3487                         memdescMap(pKernelGsp->pGspRmBootUcodeMemdesc, 0,
3488                                 memdescGetSize(pKernelGsp->pGspRmBootUcodeMemdesc),
3489                                 NV_TRUE, NV_PROTECT_READ_WRITE,
3490                                 &pVa, &pPriv),
3491                         fail);
3492 
3493     pKernelGsp->gspRmBootUcodeSize   = bufSize;
3494     pKernelGsp->pGspRmBootUcodeImage = (NvU8 *)NvP64_VALUE(pVa);;
3495     pKernelGsp->pGspRmBootUcodeMemdescPriv = pPriv;
3496 
3497     NV_ASSERT_OK_OR_GOTO(status,
3498                         bindataWriteToBuffer(pBinStorageImage,
3499                                pKernelGsp->pGspRmBootUcodeImage,
3500                                bufSize),
3501                         fail);
3502 
3503     // get the image descriptor
3504     NV_ASSERT_OK_OR_GOTO(status,
3505                          bindataStorageAcquireData(pBinStorageDesc, (const void**)&pDesc),
3506                          fail);
3507     pKernelGsp->pGspRmBootUcodeDesc = pDesc;
3508 
3509     return status;
3510 
3511 fail:
3512     _kgspFreeBootBinaryImage(pGpu, pKernelGsp);
3513     return status;
3514 }
3515 
3516 static void
3517 _kgspFreeBootBinaryImage
3518 (
3519     OBJGPU *pGpu,
3520     KernelGsp *pKernelGsp
3521 )
3522 {
3523     bindataStorageReleaseData(pKernelGsp->pGspRmBootUcodeDesc);
3524     pKernelGsp->pGspRmBootUcodeDesc  = NULL;
3525 
3526     if (pKernelGsp->pGspRmBootUcodeImage != NULL)
3527     {
3528         memdescUnmap(pKernelGsp->pGspRmBootUcodeMemdesc,
3529                      NV_TRUE, osGetCurrentProcess(),
3530                      (void *)pKernelGsp->pGspRmBootUcodeImage,
3531                      pKernelGsp->pGspRmBootUcodeMemdescPriv);
3532         pKernelGsp->pGspRmBootUcodeImage = NULL;
3533         pKernelGsp->pGspRmBootUcodeMemdescPriv = NULL;
3534     }
3535     if (pKernelGsp->pGspRmBootUcodeMemdesc != NULL)
3536     {
3537         memdescFree(pKernelGsp->pGspRmBootUcodeMemdesc);
3538         memdescDestroy(pKernelGsp->pGspRmBootUcodeMemdesc);
3539         pKernelGsp->pGspRmBootUcodeMemdesc = NULL;
3540     }
3541 
3542     pKernelGsp->gspRmBootUcodeSize   = 0;
3543 }
3544 
3545 static NV_STATUS
3546 _kgspCreateSignatureMemdesc
3547 (
3548     OBJGPU *pGpu,
3549     KernelGsp *pKernelGsp,
3550     GSP_FIRMWARE *pGspFw
3551 )
3552 {
3553     NV_STATUS status = NV_OK;
3554     NvU8 *pSignatureVa = NULL;
3555     NvU64 flags = MEMDESC_FLAGS_NONE;
3556 
3557     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
3558 
3559     // NOTE: align to 256 because that's the alignment needed for Booter DMA
3560     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3561         memdescCreate(&pKernelGsp->pSignatureMemdesc, pGpu,
3562             NV_ALIGN_UP(pGspFw->signatureSize, 256), 256,
3563             NV_TRUE, ADDR_SYSMEM, NV_MEMORY_CACHED, flags));
3564 
3565     memdescTagAlloc(status,
3566             NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_16, pKernelGsp->pSignatureMemdesc);
3567     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, status, fail_create);
3568 
3569     pSignatureVa = memdescMapInternal(pGpu, pKernelGsp->pSignatureMemdesc, TRANSFER_FLAGS_NONE);
3570     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
3571         (pSignatureVa != NULL) ? NV_OK : NV_ERR_INSUFFICIENT_RESOURCES,
3572         fail_alloc);
3573 
3574     portMemCopy(pSignatureVa, memdescGetSize(pKernelGsp->pSignatureMemdesc),
3575         pGspFw->pSignatureData, pGspFw->signatureSize);
3576 
3577     memdescUnmapInternal(pGpu, pKernelGsp->pSignatureMemdesc, 0);
3578     pSignatureVa = NULL;
3579 
3580     return status;
3581 
3582 fail_alloc:
3583     memdescFree(pKernelGsp->pSignatureMemdesc);
3584 
3585 fail_create:
3586     memdescDestroy(pKernelGsp->pSignatureMemdesc);
3587     pKernelGsp->pSignatureMemdesc = NULL;
3588 
3589     return status;
3590 }
3591 
3592 /*!
3593  * Verify that the version embedded in the .fwversion section of the ELF given
3594  * by pElfData and elfDataSize matches our NV_VERSION_STRING.
3595  */
3596 static NV_STATUS
3597 _kgspFwContainerVerifyVersion
3598 (
3599     OBJGPU *pGpu,
3600     KernelGsp *pKernelGsp,
3601     const void *pElfData,
3602     NvU64 elfDataSize,
3603     const char *pNameInMsg
3604 )
3605 {
3606     const char *pFwversion;
3607     NvU64 fwversionSize;
3608     NvU64 expectedVersionLength = portStringLength(NV_VERSION_STRING);
3609 
3610     {
3611         const void *pFwversionRaw;
3612 
3613         NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3614             _kgspFwContainerGetSection(pGpu, pKernelGsp,
3615                 pElfData,
3616                 elfDataSize,
3617                 GSP_VERSION_SECTION_NAME,
3618                 &pFwversionRaw,
3619                 &fwversionSize));
3620 
3621         pFwversion = (const char *) pFwversionRaw;
3622     }
3623 
3624     // Check that text in .fwversion section of ELF matches our NV_VERSION_STRING
3625     if ((fwversionSize != expectedVersionLength + 1) ||
3626         (portStringCompare(pFwversion, NV_VERSION_STRING, expectedVersionLength) != 0))
3627     {
3628         // Sanity check .fwversion before attempting to print it in the error message
3629         if ((fwversionSize > 0) &&
3630             (fwversionSize < 64) &&
3631             (pFwversion[fwversionSize - 1] == '\0'))
3632         {
3633             NV_PRINTF(LEVEL_ERROR, "%s version mismatch: got version %s, expected version %s\n",
3634                       pNameInMsg, pFwversion, NV_VERSION_STRING);
3635         }
3636         else
3637         {
3638             NV_PRINTF(LEVEL_ERROR, "%s version unknown or malformed, expected version %s\n",
3639                       pNameInMsg, NV_VERSION_STRING);
3640         }
3641         return NV_ERR_INVALID_DATA;
3642     }
3643 
3644     return NV_OK;
3645 }
3646 
3647 /*!
3648  * Get the name of the section corresponding to the given section name
3649  * prefix and the current chip.
3650  */
3651 static NV_STATUS
3652 _kgspGetSectionNameForPrefix
3653 (
3654     OBJGPU *pGpu,
3655     KernelGsp *pKernelGsp,
3656     char *pSectionNameBuf,  // out
3657     NvLength sectionNameBufSize,
3658     const char *pSectionPrefix
3659 )
3660 {
3661     NvLength sectionPrefixLength;
3662 
3663     nv_firmware_chip_family_t chipFamily;
3664     const char *pChipFamilyName;
3665     NvLength chipFamilyNameLength;
3666 
3667     NvLength totalSize;
3668 
3669     NV_ASSERT_OR_RETURN(pSectionNameBuf != NULL, NV_ERR_INVALID_ARGUMENT);
3670     NV_ASSERT_OR_RETURN(sectionNameBufSize > 0, NV_ERR_INVALID_ARGUMENT);
3671     NV_ASSERT_OR_RETURN(pSectionPrefix != NULL, NV_ERR_INVALID_ARGUMENT);
3672 
3673     chipFamily = nv_firmware_get_chip_family(gpuGetChipArch(pGpu),
3674                                              gpuGetChipImpl(pGpu));
3675     NV_ASSERT_OR_RETURN(chipFamily != NV_FIRMWARE_CHIP_FAMILY_NULL,
3676                         NV_ERR_INVALID_STATE);
3677 
3678     pChipFamilyName = nv_firmware_chip_family_to_string(chipFamily);
3679     NV_ASSERT_OR_RETURN(pChipFamilyName != NULL, NV_ERR_INVALID_STATE);
3680 
3681     sectionPrefixLength = portStringLength(pSectionPrefix);
3682     chipFamilyNameLength = portStringLength(pChipFamilyName);
3683 
3684     totalSize = sectionPrefixLength + chipFamilyNameLength + 1;
3685     NV_ASSERT_OR_RETURN(sectionNameBufSize >= sectionPrefixLength + 1,
3686                         NV_ERR_BUFFER_TOO_SMALL);
3687     NV_ASSERT_OR_RETURN(sectionNameBufSize >= totalSize,
3688                         NV_ERR_BUFFER_TOO_SMALL);
3689 
3690     portStringCopy(pSectionNameBuf, sectionNameBufSize,
3691                    pSectionPrefix, sectionPrefixLength + 1);
3692     portStringCat(pSectionNameBuf, sectionNameBufSize,
3693                   pChipFamilyName, chipFamilyNameLength + 1);
3694 
3695     return NV_OK;
3696 }
3697 
3698 static NV_STATUS
3699 _kgspPrepareGspRmBinaryImage
3700 (
3701     OBJGPU *pGpu,
3702     KernelGsp *pKernelGsp,
3703     GSP_FIRMWARE *pGspFw
3704 )
3705 {
3706     char signatureSectionName[32];
3707 
3708     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3709         _kgspFwContainerVerifyVersion(pGpu, pKernelGsp,
3710             pGspFw->pBuf,
3711             pGspFw->size,
3712             "GSP firmware image"));
3713 
3714     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3715         _kgspFwContainerGetSection(pGpu, pKernelGsp,
3716             pGspFw->pBuf,
3717             pGspFw->size,
3718             GSP_IMAGE_SECTION_NAME,
3719             &pGspFw->pImageData,
3720             &pGspFw->imageSize));
3721 
3722     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3723         _kgspGetSectionNameForPrefix(pGpu, pKernelGsp,
3724             signatureSectionName, sizeof(signatureSectionName),
3725             kgspGetSignatureSectionNamePrefix_HAL(pGpu, pKernelGsp)));
3726 
3727     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3728         _kgspFwContainerGetSection(pGpu, pKernelGsp,
3729             pGspFw->pBuf,
3730             pGspFw->size,
3731             signatureSectionName,
3732             &pGspFw->pSignatureData,
3733             &pGspFw->signatureSize));
3734 
3735     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3736         _kgspCreateSignatureMemdesc(pGpu, pKernelGsp,
3737             pGspFw));
3738 
3739     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3740         kgspCreateRadix3(pGpu, pKernelGsp, &pKernelGsp->pGspUCodeRadix3Descriptor,
3741             NULL, pGspFw->pImageData, pGspFw->imageSize));
3742 
3743     return NV_OK;
3744 }
3745 
3746 NV_STATUS
3747 kgspCreateRadix3_IMPL
3748 (
3749     OBJGPU *pGpu,
3750     KernelGsp *pKernelGsp,
3751     MEMORY_DESCRIPTOR **ppMemdescRadix3,
3752     MEMORY_DESCRIPTOR *pMemdescData,
3753     const void *pData,
3754     NvU64 size
3755 )
3756 {
3757     const NvU64 entriesLog2 = LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2 - 3;
3758     NvU8 *pRadix3Buf;
3759     NvP64 pVaKernel;
3760     NvP64 pPrivKernel;
3761     NvU64 ptSize;
3762     NvU64 allocSize;
3763     NvU64 nPages = 0;
3764     NvU64 dataOffset = 0;
3765     NvU32 i;
3766     NV_STATUS status = NV_OK;
3767     NvU64 flags = MEMDESC_FLAGS_KERNEL_MODE;
3768 
3769     // radix3 working array.
3770     struct
3771     {
3772         NvU64  nPages;
3773         NvU64  offset;
3774     } radix3[4];
3775 
3776     NV_ASSERT_OR_RETURN(ppMemdescRadix3 != NULL, NV_ERR_INVALID_PARAMETER);
3777     NV_ASSERT_OR_ELSE_STR(!((pMemdescData != NULL) && (pData != NULL)),
3778                           "Specify pMemdescData or pData, or none, but not both",
3779                           return NV_ERR_INVALID_PARAMETER);
3780 
3781     // If the size is not specified, get it from the memory descriptor.
3782     if ((size == 0) && (pMemdescData != NULL))
3783         size = memdescGetSize(pMemdescData);
3784     NV_ASSERT_OR_RETURN(size > 0, NV_ERR_OUT_OF_RANGE);
3785 
3786     // Clear working structure.
3787     portMemSet(radix3, 0, sizeof radix3);
3788 
3789     // Populate npages, high to low.
3790     i = NV_ARRAY_ELEMENTS(radix3) - 1;
3791     radix3[i].nPages = (size + LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE - 1) >>
3792                        LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3793     for (; i > 0; i--)
3794         radix3[i - 1].nPages = ((radix3[i].nPages - 1) >> entriesLog2) + 1;
3795 
3796     // Populate offset, low to high.
3797     for (i = 1; i < NV_ARRAY_ELEMENTS(radix3); i++)
3798     {
3799         nPages += radix3[i - 1].nPages;
3800         radix3[i].offset = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3801     }
3802 
3803     NV_ASSERT_OR_RETURN(radix3[0].nPages == 1, NV_ERR_OUT_OF_RANGE);
3804 
3805     // Allocate space for PTEs and PDEs.
3806     ptSize = nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3807     allocSize = ptSize;
3808 
3809     if (pMemdescData == NULL)
3810     {
3811         // We don't have a separate descriptor for the data.  We need PTEs,
3812         // so include space for data in the new descriptor.
3813         allocSize += radix3[3].nPages << LIBOS_MEMORY_REGION_RADIX_PAGE_LOG2;
3814     }
3815 
3816     flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY;
3817 
3818     NV_ASSERT_OK_OR_GOTO(status,
3819         memdescCreate(ppMemdescRadix3, pGpu, allocSize,
3820             LIBOS_MEMORY_REGION_RADIX_PAGE_SIZE,
3821             NV_MEMORY_NONCONTIGUOUS,
3822             ADDR_SYSMEM,
3823             NV_MEMORY_CACHED,
3824             flags),
3825         done);
3826 
3827     memdescTagAlloc(status,
3828             NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_17, (*ppMemdescRadix3));
3829     NV_ASSERT_OK_OR_GOTO(status, status, error_ret);
3830 
3831     // Create kernel mapping.
3832     NV_ASSERT_OK_OR_GOTO(status,
3833         memdescMap(*ppMemdescRadix3, 0, allocSize, NV_TRUE, NV_PROTECT_WRITEABLE,
3834             &pVaKernel, &pPrivKernel),
3835         error_ret);
3836 
3837     if (pVaKernel == NvP64_NULL)
3838     {
3839         NV_PRINTF(LEVEL_ERROR, "VA error for radix3 shared buffer\n");
3840         status = NV_ERR_NO_MEMORY;
3841         goto error_ret;
3842     }
3843 
3844     pRadix3Buf = KERNEL_POINTER_FROM_NvP64(NvU8 *, pVaKernel);
3845 
3846     // Zap out page table.
3847     portMemSet(pRadix3Buf, 0, ptSize);
3848 
3849     // Fill in PDEs.
3850     for (i = 0; i < NV_ARRAY_ELEMENTS(radix3) - 2; i++)
3851     {
3852         memdescGetPhysAddrs(*ppMemdescRadix3,
3853             AT_GPU,                     // addressTranslation
3854             radix3[i + 1].offset,       // offset
3855             RM_PAGE_SIZE,               // stride
3856             radix3[i + 1].nPages,       // count
3857             (RmPhysAddr *)(pRadix3Buf + radix3[i].offset)); // physical address table
3858     }
3859 
3860     dataOffset = radix3[3].offset;
3861 
3862     if (pData != NULL)
3863     {
3864         // Optionally copy data into the radix3 buffer.
3865         portMemCopy(pRadix3Buf + dataOffset, size, pData, size);
3866 
3867         // If we only have part of the last page, clear the rest.
3868         NvU32 clearSize = allocSize - dataOffset - size;
3869         if (clearSize != 0)
3870             portMemSet(pRadix3Buf + dataOffset + size, 0, clearSize);
3871 
3872         pMemdescData = *ppMemdescRadix3;
3873     }
3874 
3875     memdescGetPhysAddrs(*ppMemdescRadix3,
3876         AT_GPU,                     // addressTranslation
3877         dataOffset,                 // offset
3878         RM_PAGE_SIZE,               // stride
3879         radix3[3].nPages,           // count
3880         (RmPhysAddr *)(pRadix3Buf + radix3[2].offset));  // physical address table
3881 
3882     //
3883     // No reason to keep this memory mapped on the CPU side.  Only GSP will
3884     // access it after this point.
3885     //
3886     memdescUnmap(*ppMemdescRadix3, NV_TRUE, osGetCurrentProcess(),
3887                   pVaKernel, pPrivKernel);
3888 done:
3889     return status;
3890 
3891 error_ret:
3892     if (*ppMemdescRadix3 != NULL)
3893     {
3894         memdescFree(*ppMemdescRadix3);
3895         memdescDestroy(*ppMemdescRadix3);
3896         *ppMemdescRadix3 = NULL;
3897     }
3898 
3899     return status;
3900 }
3901 
3902 static NV_STATUS
3903 _kgspFwContainerGetSection
3904 (
3905     OBJGPU *pGpu,
3906     KernelGsp *pKernelGsp,
3907     const void *pElfData,
3908     NvU64 elfDataSize,
3909     const char *pSectionName,
3910     const void **ppSectionData,
3911     NvU64 *pSectionSize
3912 )
3913 {
3914     const NvU8 *pGspBuf = pElfData;
3915     const LibosElf64Header *pElfHeader;
3916     const LibosElf64SectionHeader *pElfSectionHeader;
3917     NvU64 elfSectionHeaderTableLength;
3918     NvU64 elfSectionHeaderMaxIdx;
3919     NvU64 elfSectionNamesTableOffset;
3920     NvU64 elfSectionNamesTableSize;
3921     NvU64 elfSectionNamesTableMaxIdx;
3922     static const NvU32 elfMagicNumber = 0x464C457F;
3923     static const NvU8 elfClass64 = 0x2;
3924     static const NvU8 elfLittleEndian = 0x1;
3925     const char *pCurrentSectionName;
3926     NvLength sectionNameLength;
3927     NvS16 idx;
3928 
3929     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfData != NULL, NV_ERR_INVALID_ARGUMENT);
3930     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize > 0, NV_ERR_INVALID_ARGUMENT);
3931     NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionName != NULL, NV_ERR_INVALID_ARGUMENT);
3932     NV_CHECK_OR_RETURN(LEVEL_ERROR, ppSectionData != NULL, NV_ERR_INVALID_ARGUMENT);
3933     NV_CHECK_OR_RETURN(LEVEL_ERROR, pSectionSize != NULL, NV_ERR_INVALID_ARGUMENT);
3934     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= sizeof(LibosElf64Header), NV_ERR_INVALID_DATA);
3935 
3936     sectionNameLength = portStringLength(pSectionName);
3937 
3938     pElfHeader = (const LibosElf64Header*) pGspBuf;
3939 
3940     // Check for the elf identifier at the beginning of the file
3941     NV_CHECK_OR_RETURN(LEVEL_ERROR, *(NvU32*)&pElfHeader->ident == elfMagicNumber, NV_ERR_INVALID_DATA);
3942     // Make sure the data is formatted as little endian
3943     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[5] == elfLittleEndian, NV_ERR_INVALID_DATA);
3944     // Check the class type, only ELFCLASS64 is supported
3945     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->ident[4] == elfClass64, NV_ERR_INVALID_DATA);
3946 
3947     // Make sure that the elf section header table is valid
3948     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shentsize == sizeof(LibosElf64SectionHeader), NV_ERR_INVALID_DATA);
3949     NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeMulU64(pElfHeader->shentsize, pElfHeader->shnum, &elfSectionHeaderTableLength), NV_ERR_INVALID_DATA);
3950     NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfHeader->shoff, elfSectionHeaderTableLength - 1, &elfSectionHeaderMaxIdx), NV_ERR_INVALID_DATA);
3951     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionHeaderMaxIdx, NV_ERR_INVALID_DATA);
3952     NV_CHECK_OR_RETURN(LEVEL_ERROR, pElfHeader->shstrndx <= pElfHeader->shnum, NV_ERR_INVALID_DATA);
3953 
3954     // Get the offset and size of the table that holds the section names and make sure they are valid
3955     pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[pElfHeader->shoff + (pElfHeader->shstrndx * pElfHeader->shentsize)];
3956     elfSectionNamesTableOffset = pElfSectionHeader->offset;
3957     elfSectionNamesTableSize = pElfSectionHeader->size;
3958     NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(elfSectionNamesTableOffset, elfSectionNamesTableSize - 1, &elfSectionNamesTableMaxIdx), NV_ERR_INVALID_DATA);
3959     NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionNamesTableMaxIdx, NV_ERR_INVALID_DATA);
3960 
3961     // Iterate through all of the section headers to find the signatures
3962     pElfSectionHeader = (const LibosElf64SectionHeader*) &pGspBuf[elfSectionHeaderMaxIdx + 1 - sizeof(*pElfSectionHeader)];
3963 
3964     for (idx = pElfHeader->shnum - 1; idx >= 0; idx--, pElfSectionHeader--)
3965     {
3966         NvU64 currentSectionNameMaxLength;
3967         NvU64 elfSectionMaxIdx;
3968 
3969         // Make sure the header name index fits within the section names table
3970         NV_CHECK_OR_RETURN(LEVEL_ERROR, elfSectionNamesTableSize - 1 >= pElfSectionHeader->name, NV_ERR_INVALID_DATA);
3971         currentSectionNameMaxLength = elfSectionNamesTableSize - pElfSectionHeader->name - 1;
3972         pCurrentSectionName = (const char *) &pGspBuf[elfSectionNamesTableOffset + pElfSectionHeader->name];
3973 
3974         // Make sure the elf section size and offset are valid
3975         if (pElfSectionHeader->size > 0)
3976         {
3977             NV_CHECK_OR_RETURN(LEVEL_ERROR, portSafeAddU64(pElfSectionHeader->offset, pElfSectionHeader->size - 1, &elfSectionMaxIdx), NV_ERR_INVALID_DATA);
3978         }
3979         else
3980         {
3981             elfSectionMaxIdx = pElfSectionHeader->offset;
3982         }
3983         NV_CHECK_OR_RETURN(LEVEL_ERROR, elfDataSize >= elfSectionMaxIdx, NV_ERR_INVALID_DATA);
3984 
3985         // Check whether the section name matches the expected section name
3986         if ((sectionNameLength <= currentSectionNameMaxLength) &&
3987             (portStringCompare(pCurrentSectionName, pSectionName, sectionNameLength) == 0) &&
3988             (pCurrentSectionName[sectionNameLength] == '\0'))
3989         {
3990             *ppSectionData = &pGspBuf[pElfSectionHeader->offset];
3991             *pSectionSize = pElfSectionHeader->size;
3992 
3993             return NV_OK;
3994         }
3995     }
3996 
3997     return NV_ERR_OBJECT_NOT_FOUND;
3998 }
3999 
4000 /*!
4001  * Setup libos init arguments.
4002  */
4003 void
4004 kgspSetupLibosInitArgs_IMPL
4005 (
4006     OBJGPU         *pGpu,
4007     KernelGsp *pKernelGsp
4008 )
4009 {
4010     LibosMemoryRegionInitArgument *pLibosInitArgs = pKernelGsp->pLibosInitArgumentsCached;
4011     NvU8 idx;
4012 
4013     portMemSet(pLibosInitArgs, 0, LIBOS_INIT_ARGUMENTS_SIZE);
4014 
4015     // Add memory areas for logging each LIBOS task.
4016     // @note LOGINIT must be first for early init logging to work.
4017     // @note: These should be switched to radix regions to remove the need
4018     //        for large apertures in the RM task for logging.
4019     for (idx = 0; idx < LOGIDX_SIZE; idx++)
4020     {
4021         pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
4022         pLibosInitArgs[idx].loc  = LIBOS_MEMORY_REGION_LOC_SYSMEM;
4023         pLibosInitArgs[idx].id8  = pKernelGsp->rmLibosLogMem[idx].id8;
4024         pLibosInitArgs[idx].pa   = pKernelGsp->rmLibosLogMem[idx].pTaskLogBuffer[1];
4025         pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->rmLibosLogMem[idx].pTaskLogDescriptor);
4026     }
4027 
4028     // insert GSP-RM ELF args address; id must match libos-config.py entry
4029     pLibosInitArgs[idx].kind = LIBOS_MEMORY_REGION_CONTIGUOUS;
4030     pLibosInitArgs[idx].loc  = LIBOS_MEMORY_REGION_LOC_SYSMEM;
4031     pLibosInitArgs[idx].id8  = _kgspGenerateInitArgId("RMARGS");
4032     pLibosInitArgs[idx].pa   = memdescGetPhysAddr(pKernelGsp->pGspArgumentsDescriptor, AT_GPU, 0);
4033     pLibosInitArgs[idx].size = memdescGetSize(pKernelGsp->pGspArgumentsDescriptor);
4034 
4035     portAtomicMemoryFenceFull();
4036 }
4037 
4038 /*!
4039  * Receive and process RPC event from GSP-RM.
4040  *
4041  * This function is called from interrupt bottom-half handler (DPC) and
4042  * would race with normal RPC flow, _kgspRpcRecvPoll().
4043  * This race is currently avoided only because DPC is executed under
4044  * gpus lock, so RPC and Bottom-half handler are mutually exclusive
4045  * control flows.
4046  */
4047 void
4048 kgspRpcRecvEvents_IMPL
4049 (
4050     OBJGPU *pGpu,
4051     KernelGsp  *pKernelGsp
4052 )
4053 {
4054     NvU32 gpuMaskUnused;
4055     NV_ASSERT(rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE, &gpuMaskUnused));
4056     //
4057     // We should never have an event with code NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS.
4058     // If we do the assert will fail on NV_WARN_MORE_PROCESSING_REQUIRED,
4059     // in addition to general error codes.
4060     //
4061     NV_ASSERT_OK(_kgspRpcDrainEvents(pGpu, pKernelGsp, NV_VGPU_MSG_FUNCTION_NUM_FUNCTIONS, KGSP_RPC_EVENT_HANDLER_CONTEXT_INTERRUPT));
4062 }
4063 
4064 /*!
4065  * Wait for GSP-RM initialization to complete.
4066  */
4067 NV_STATUS
4068 kgspWaitForRmInitDone_IMPL
4069 (
4070     OBJGPU *pGpu,
4071     KernelGsp *pKernelGsp
4072 )
4073 {
4074     OBJRPC *pRpc = pKernelGsp->pRpc;
4075 
4076     //
4077     // Kernel RM can timeout when GSP-RM has an error condition.  Give GSP-RM
4078     // a chance to report the error before we pull the rug out from under it.
4079     //
4080     threadStateResetTimeout(pGpu);
4081 
4082     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
4083         rpcRecvPoll(pGpu, pRpc, NV_VGPU_MSG_EVENT_GSP_INIT_DONE));
4084 
4085     //
4086     // Now check if RPC really succeeded (NV_VGPU_MSG_RESULT_* are defined to
4087     // equivalent NV_STATUS codes in RM).
4088     //
4089     NV_ASSERT_OK_OR_RETURN(RPC_HDR->rpc_result);
4090 
4091     pGpu->gspRmInitialized = NV_TRUE;
4092     if (hypervisorIsVgxHyper() && pGpu->getProperty(pGpu, PDB_PROP_GPU_EXTENDED_GSP_RM_INITIALIZATION_TIMEOUT_FOR_VGX))
4093     {
4094         // Decrease timeout values for VGX driver
4095         timeoutInitializeGpuDefault(&pGpu->timeoutData, pGpu);
4096     }
4097 
4098     return NV_OK;
4099 }
4100 
4101 /*!
4102  * Execute a sequencer buffer coming from GSP
4103  *
4104  * @param[in]      pGpu             GPU object pointer
4105  * @param[in]      pKernelGsp       KernelGsp object pointer
4106  * @param[in]      pRunCpuSeqParams Sequence buffer RPC parameters
4107  *
4108  * @return NV_OK if the GSP sequencer buffer has been executed successfully
4109  *         NV_ERR_INVALID_STATE if the sequencer buffer is not allocated
4110  *         NV_ERR_INVALID_DATA is the sequencer buffer is malformed
4111  */
4112 NV_STATUS
4113 kgspExecuteSequencerBuffer_IMPL
4114 (
4115     OBJGPU    *pGpu,
4116     KernelGsp *pKernelGsp,
4117     void      *pRunCpuSeqParams
4118 )
4119 {
4120     rpc_run_cpu_sequencer_v17_00 *pParams = (rpc_run_cpu_sequencer_v17_00 *)pRunCpuSeqParams;
4121     NvU32 *pCmd = pParams->commandBuffer;
4122     NvU32 buffer_end = pParams->cmdIndex;
4123     NvU32 current_cmd_index = 0;
4124     NV_STATUS nvStatus = NV_OK;
4125     NvU32 payloadSize;
4126 
4127     NV_ASSERT_OR_RETURN(IS_GSP_CLIENT(pGpu), NV_ERR_NOT_SUPPORTED);
4128     NV_ASSERT_OR_RETURN((pParams->bufferSizeDWord != 0), NV_ERR_INVALID_STATE);
4129     NV_ASSERT_OR_RETURN(buffer_end < pParams->bufferSizeDWord, NV_ERR_INVALID_DATA);
4130 
4131     while (current_cmd_index < buffer_end)
4132     {
4133         NvU32 opCode = pCmd[current_cmd_index++];
4134         payloadSize = GSP_SEQUENCER_PAYLOAD_SIZE_DWORDS(opCode);
4135 
4136         NV_ASSERT_OR_RETURN(current_cmd_index + payloadSize <= buffer_end, NV_ERR_INVALID_DATA);
4137 
4138         //
4139         // Handling of sequencer commands is split between those commands
4140         // that are common to all architectures (handled directly here) and
4141         // those commands that are arch-specific and are handled via the
4142         // kgspExecuteSequencerCommand_HAL() call below.
4143         //
4144         switch (opCode)
4145         {
4146             // 2 arguments
4147             case GSP_SEQ_BUF_OPCODE_REG_WRITE:
4148             {
4149                 GSP_SEQ_BUF_PAYLOAD_REG_WRITE regWrite;
4150                 portMemCopy(&regWrite, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_WRITE));
4151 
4152                 GPU_REG_WR32(pGpu, regWrite.addr, regWrite.val);
4153                 break;
4154             }
4155 
4156             // 3 arguments
4157             case GSP_SEQ_BUF_OPCODE_REG_MODIFY:
4158             {
4159                 GSP_SEQ_BUF_PAYLOAD_REG_MODIFY regModify;
4160                 NvU32 regVal;
4161 
4162                 portMemCopy(&regModify, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_MODIFY));
4163 
4164                 regVal = GPU_REG_RD32(pGpu, regModify.addr);
4165                 regVal = regVal & ~regModify.mask;
4166                 regVal = regVal | regModify.val;
4167                 GPU_REG_WR32(pGpu, regModify.addr, regVal);
4168                 break;
4169             }
4170 
4171             // 5 arguments
4172             case GSP_SEQ_BUF_OPCODE_REG_POLL:
4173             {
4174                 GSP_SEQ_BUF_PAYLOAD_REG_POLL regPoll;
4175                 NvU32 regval;
4176                 RMTIMEOUT timeout;
4177 
4178                 portMemCopy(&regPoll, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_POLL));
4179 
4180                 regval = GPU_REG_RD32(pGpu, regPoll.addr);
4181 
4182                 gpuSetTimeout(pGpu, regPoll.timeout, &timeout, 0);
4183                 while ((regval & regPoll.mask) != regPoll.val)
4184                 {
4185                     nvStatus = gpuCheckTimeout(pGpu, &timeout);
4186                     if (nvStatus == NV_ERR_TIMEOUT)
4187                     {
4188                         NV_PRINTF(LEVEL_ERROR, "Timeout waiting for register to settle, value = 0x%x, err_code = 0x%x\n",
4189                             regval, regPoll.error);
4190                         DBG_BREAKPOINT();
4191                         return nvStatus;
4192                     }
4193                     osSpinLoop();
4194                     regval = GPU_REG_RD32(pGpu, regPoll.addr);
4195                 }
4196                 break;
4197             }
4198 
4199             case GSP_SEQ_BUF_OPCODE_DELAY_US:
4200             {
4201                 GSP_SEQ_BUF_PAYLOAD_DELAY_US delayUs;
4202                 portMemCopy(&delayUs, sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_DELAY_US));
4203 
4204                 osDelayUs(delayUs.val);
4205                 break;
4206             }
4207 
4208             case GSP_SEQ_BUF_OPCODE_REG_STORE:
4209             {
4210                 GSP_SEQ_BUF_PAYLOAD_REG_STORE regStore;
4211                 portMemCopy(&regStore, sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE), &pCmd[current_cmd_index], sizeof(GSP_SEQ_BUF_PAYLOAD_REG_STORE));
4212 
4213                 NV_ASSERT_OR_RETURN(regStore.index < GSP_SEQ_BUF_REG_SAVE_SIZE, NV_ERR_INVALID_ARGUMENT);
4214 
4215                 pParams->regSaveArea[regStore.index] = GPU_REG_RD32(pGpu, regStore.addr);
4216                 break;
4217             }
4218 
4219             case GSP_SEQ_BUF_OPCODE_CORE_RESET:
4220             {
4221                 NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);
4222 
4223                 kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
4224                 kflcnDisableCtxReq_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
4225                 break;
4226             }
4227 
4228             case GSP_SEQ_BUF_OPCODE_CORE_START:
4229             {
4230                 NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);
4231 
4232                 kflcnStartCpu_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon));
4233                 break;
4234             }
4235 
4236             case GSP_SEQ_BUF_OPCODE_CORE_WAIT_FOR_HALT:
4237             {
4238                 NV_ASSERT_OR_RETURN(payloadSize == 0, NV_ERR_INVALID_ARGUMENT);
4239 
4240                 NV_ASSERT_OK_OR_RETURN(kflcnWaitForHalt_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon), GPU_TIMEOUT_DEFAULT, 0));
4241                 break;
4242             }
4243 
4244             default:
4245                 //
4246                 // Route this command to the arch-specific handler.
4247                 //
4248                 NV_ASSERT_OK_OR_RETURN(kgspExecuteSequencerCommand_HAL(pGpu, pKernelGsp, opCode, &pCmd[current_cmd_index], payloadSize * sizeof (*pCmd)));
4249                 break;
4250         }
4251         current_cmd_index += payloadSize;
4252     }
4253 
4254     return NV_OK;
4255 }
4256 
4257 #if LIBOS_LOG_DECODE_ENABLE
4258 static void
4259 _kgspLogPollingCallback
4260 (
4261     OBJGPU *pGpu,
4262     void   *data
4263 )
4264 {
4265     //
4266     // Do not take any locks in kgspDumpGspLogs. As this callback only fires when kgspNvlogFlushCb
4267     // is not registered, there is no possibility of data race.
4268     //
4269     KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu);
4270     kgspDumpGspLogsUnlocked(pKernelGsp, NV_FALSE);
4271 }
4272 
4273 NV_STATUS
4274 kgspStartLogPolling_IMPL
4275 (
4276     OBJGPU    *pGpu,
4277     KernelGsp *pKernelGsp
4278 )
4279 {
4280     NV_STATUS status = NV_OK;
4281 
4282     //
4283     // Only enable the 1 Hz poll if we can live decode logs in dmesg. Else we'll
4284     // flush it on demand by nvidia-debugdump.
4285     //
4286     if (pKernelGsp->pLogElf != NULL)
4287     {
4288         status = osSchedule1HzCallback(pGpu,
4289                                        _kgspLogPollingCallback,
4290                                        NULL,
4291                                        NV_OS_1HZ_REPEAT);
4292     }
4293     return status;
4294 }
4295 
4296 static void
4297 _kgspStopLogPolling
4298 (
4299     OBJGPU    *pGpu,
4300     KernelGsp *pKernelGsp
4301 )
4302 {
4303     if (pKernelGsp->pLogElf != NULL)
4304     {
4305         osRemove1HzCallback(pGpu, _kgspLogPollingCallback, NULL);
4306     }
4307 }
4308 
4309 #else // LIBOS_LOG_DECODE_ENABLE
4310 
4311 NV_STATUS
4312 kgspStartLogPolling_IMPL
4313 (
4314     OBJGPU    *pGpu,
4315     KernelGsp *pKernelGsp
4316 )
4317 {
4318     return NV_OK;
4319 }
4320 
4321 static void
4322 _kgspStopLogPolling
4323 (
4324     OBJGPU    *pGpu,
4325     KernelGsp *pKernelGsp
4326 )
4327 {
4328     return;
4329 }
4330 #endif // LIBOS_LOG_DECODE_ENABLE
4331 
4332 /*!
4333  * Provides an opportunity to register some IntrService during intrStateInit.
4334  */
4335 void
4336 kgspRegisterIntrService_IMPL
4337 (
4338     OBJGPU *pGpu,
4339     KernelGsp *pKernelGsp,
4340     IntrServiceRecord pRecords[MC_ENGINE_IDX_MAX]
4341 )
4342 {
4343     NvU32 engineIdx = MC_ENGINE_IDX_GSP;
4344 
4345     if (!IS_GSP_CLIENT(pGpu))
4346         return;
4347 
4348     NV_ASSERT(pRecords[engineIdx].pInterruptService == NULL);
4349     pRecords[engineIdx].pInterruptService = staticCast(pKernelGsp, IntrService);
4350 }
4351 
4352 /*!
4353  * Service GSP interrupts.
4354  *
4355  * @returns Zero, or any implementation-chosen nonzero value. If the same nonzero value is returned enough
4356  *          times the interrupt is considered stuck.
4357  */
4358 NvU32
4359 kgspServiceInterrupt_IMPL
4360 (
4361     OBJGPU *pGpu,
4362     KernelGsp *pKernelGsp,
4363     IntrServiceServiceInterruptArguments *pParams
4364 )
4365 {
4366     NV_ASSERT_OR_RETURN(pParams != NULL, 0);
4367     NV_ASSERT_OR_RETURN(pParams->engineIdx == MC_ENGINE_IDX_GSP, 0);
4368 
4369     return kgspService_HAL(pGpu, pKernelGsp);
4370 }
4371 
4372 /*!
4373  * Calculates the GSP FW heap size based on the GPU's resources.
4374  */
4375 static NvU64
4376 _kgspCalculateFwHeapSize
4377 (
4378     OBJGPU *pGpu,
4379     KernelGsp *pKernelGsp,
4380     NvU32 maxGspFwHeapSizeMB
4381 )
4382 {
4383     // For VGPU, use the static pre-calculated size
4384     if (pGpu->bVgpuGspPluginOffloadEnabled)
4385         return GSP_FW_HEAP_SIZE_VGPU_DEFAULT;
4386 
4387     //
4388     // The baremetal heap calculation is a function of the architecture, FB
4389     // size, and a chunk for backing client allocations (pre-calibrated for the
4390     // architecture through rough profiling).
4391     //
4392     KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
4393     NvU64 fbSize = 0;
4394 
4395     NV_ASSERT_OK(kmemsysGetUsableFbSize_HAL(pGpu, pKernelMemorySystem, &fbSize));
4396     const NvU32 fbSizeGB = (NvU32)(NV_ALIGN_UP64(fbSize, 1 << 30) >> 30);
4397 
4398     //
4399     // Reclaimable binary data will end up padding the heap (in some cases,
4400     // significantly), but due to memory fragmentation we can't rely on it to
4401     // linearly reduce the amount needed in the primary heap, so it is not a
4402     // factor here. Instead, it's just extra margin to keep us from exhausting
4403     // the heap at runtime.
4404     //
4405     NvU64 heapSize = kgspGetFwHeapParamOsCarveoutSize_HAL(pGpu, pKernelGsp) +
4406                      pKernelGsp->fwHeapParamBaseSize +
4407                      NV_ALIGN_UP(GSP_FW_HEAP_PARAM_SIZE_PER_GB_FB * fbSizeGB, 1 << 20) +
4408                      NV_ALIGN_UP(GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE, 1 << 20);
4409 
4410     // Clamp to the minimum, even if the calculations say we can do with less
4411     const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp);
4412     heapSize = NV_MAX(heapSize, (NvU64)minGspFwHeapSizeMB << 20);
4413 
4414     // Clamp to the maximum heap size, if necessary
4415     heapSize = NV_MIN(heapSize, (NvU64)maxGspFwHeapSizeMB << 20);
4416 
4417     NV_PRINTF(LEVEL_INFO, "GSP FW heap %lluMB of %uGB\n",
4418               heapSize >> 20, fbSizeGB);
4419 
4420     return heapSize;
4421 }
4422 
4423 /*!
4424  * Returns the size in bytes of the GSP FW heap:
4425  *  - the registry override, if present
4426  *  - otherwise, calculate the FW heap size for this GPU, limiting it to stay
4427  *    within the pre-scrubbed area at the end of FB, if needed
4428  *
4429  * @param[in] posteriorFbSize - size in bytes of the memory reserved between the
4430  *                              end of the GSP FW heap and the end of FB, or 0
4431  *                              to disable limiting of the heap range to within
4432  *                              the pre-scrubbed area at the end of FB
4433  */
4434 NvU64
4435 kgspGetFwHeapSize_IMPL
4436 (
4437     OBJGPU *pGpu,
4438     KernelGsp *pKernelGsp,
4439     NvU64 posteriorFbSize
4440 )
4441 {
4442     NvU32 maxScrubbedHeapSizeMB = NV_U32_MAX;
4443     NvU32 heapSizeMB = 0;
4444 
4445     //
4446     // The pre-scrubbed region at the end of FB may limit the heap size, if no
4447     // scrubber ucode is supported to unlock the rest of memory prior to booting
4448     // GSP-RM.
4449     //
4450     if (!pKernelGsp->bScrubberUcodeSupported && (posteriorFbSize != 0))
4451     {
4452         const NvU64 prescrubbedSize = kgspGetPrescrubbedTopFbSize(pGpu, pKernelGsp);
4453         if (prescrubbedSize < NV_U64_MAX)
4454             maxScrubbedHeapSizeMB = (NvU32)((prescrubbedSize - posteriorFbSize) >> 20);
4455     }
4456 
4457     // Get the heap size override from the registry, if any
4458     if ((osReadRegistryDword(pGpu, NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB, &heapSizeMB) == NV_OK) &&
4459         (heapSizeMB != NV_REG_STR_GSP_FIRMWARE_HEAP_SIZE_MB_DEFAULT))
4460     {
4461         const NvU32 minGspFwHeapSizeMB = kgspGetMinWprHeapSizeMB_HAL(pGpu, pKernelGsp);
4462         const NvU32 maxGspFwHeapSizeMB = NV_MIN(kgspGetMaxWprHeapSizeMB_HAL(pGpu, pKernelGsp),
4463                                                 maxScrubbedHeapSizeMB);
4464 
4465         NV_ASSERT(minGspFwHeapSizeMB < maxGspFwHeapSizeMB);
4466 
4467         if (heapSizeMB > maxGspFwHeapSizeMB)
4468         {
4469             NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to maximum (%uMB)\n",
4470                       maxGspFwHeapSizeMB);
4471             heapSizeMB = maxGspFwHeapSizeMB;
4472         }
4473         else if (heapSizeMB < minGspFwHeapSizeMB)
4474         {
4475             NV_PRINTF(LEVEL_WARNING, "Firmware heap size clamped to minimum (%uMB)\n",
4476                       minGspFwHeapSizeMB);
4477             heapSizeMB = minGspFwHeapSizeMB;
4478         }
4479         else
4480         {
4481             NV_PRINTF(LEVEL_WARNING, "Firmware heap size overridden (%uMB)\n",
4482                       heapSizeMB);
4483         }
4484 
4485         return ((NvU64)heapSizeMB) << 20;
4486     }
4487 
4488     return _kgspCalculateFwHeapSize(pGpu, pKernelGsp, maxScrubbedHeapSizeMB);
4489 }
4490 
4491 NvU64 kgspGetWprEndMargin_IMPL(OBJGPU *pGpu, KernelGsp *pKernelGsp)
4492 {
4493     NvU64 wprEndMargin;
4494     NvU32 marginOverride = 0;
4495     GspFwWprMeta *pWprMeta = pKernelGsp->pWprMeta;
4496 
4497     (void)osReadRegistryDword(pGpu, NV_REG_STR_RM_GSP_WPR_END_MARGIN, &marginOverride);
4498 
4499     wprEndMargin = ((NvU64)DRF_VAL(_REG, _RM_GSP_WPR_END_MARGIN, _MB, marginOverride)) << 20;
4500     if (wprEndMargin == 0)
4501     {
4502         // Calculate the default margin size based on the WPR size
4503         const GspFwWprMeta *pWprMeta = pKernelGsp->pWprMeta;
4504 
4505         //
4506         // This needs to be called after pWprMeta->sizeOfRadix3Elf has been initialized,
4507         // in order to estimate the default WPR size.
4508         //
4509         NV_ASSERT(pWprMeta->sizeOfRadix3Elf > 0);
4510 
4511         //
4512         // If the bounds are encoded in GspFwWprMeta from a prior attempt, use them.
4513         // Otherwise, estimate the WPR size by the sizes of the elements in the layout
4514         //
4515         if (pWprMeta->gspFwWprEnd > pWprMeta->nonWprHeapOffset)
4516         {
4517             wprEndMargin = pWprMeta->gspFwWprEnd - pWprMeta->nonWprHeapOffset;
4518         }
4519         else
4520         {
4521             wprEndMargin += kgspGetFrtsSize_HAL(pGpu, pKernelGsp);
4522             wprEndMargin += pKernelGsp->gspRmBootUcodeSize;
4523             wprEndMargin += pWprMeta->sizeOfRadix3Elf;
4524             wprEndMargin += kgspGetFwHeapSize(pGpu, pKernelGsp, 0);
4525             wprEndMargin += kgspGetNonWprHeapSize(pGpu, pKernelGsp);
4526         }
4527 
4528         if (pKernelGsp->bootAttempts > 0)
4529             wprEndMargin *= pKernelGsp->bootAttempts;
4530     }
4531 
4532     if (FLD_TEST_DRF(_REG, _RM_GSP_WPR_END_MARGIN, _APPLY, _ALWAYS, marginOverride) ||
4533         (pKernelGsp->bootAttempts > 0))
4534     {
4535         NV_PRINTF(LEVEL_WARNING, "Adding margin of 0x%llx bytes after the end of WPR2\n",
4536                   wprEndMargin);
4537         pWprMeta->flags |= GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT;
4538         return wprEndMargin;
4539     }
4540 
4541     // Normal boot path
4542     pWprMeta->flags &= ~GSP_FW_FLAGS_RECOVERY_MARGIN_PRESENT;
4543     return 0;
4544 }
4545