1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #define NVOC_KERNEL_NVLINK_H_PRIVATE_ACCESS_ALLOWED
25 
26 // FIXME XXX
27 #define NVOC_KERNEL_IOCTRL_H_PRIVATE_ACCESS_ALLOWED
28 
29 #include "kernel/gpu/nvlink/kernel_nvlink.h"
30 #include "kernel/gpu/nvlink/kernel_ioctrl.h"
31 #include "kernel/gpu/mem_sys/kern_mem_sys.h"
32 #include "os/os.h"
33 
34 static NV_STATUS _knvlinkCreateIoctrl(OBJGPU *, KernelNvlink *, NvU32);
35 static NV_STATUS _knvlinkFilterDiscoveredLinks(OBJGPU *, KernelNvlink *);
36 static NV_STATUS _knvlinkFilterIoctrls(OBJGPU *, KernelNvlink *);
37 static NV_STATUS _knvlinkProcessSysmemLinks(OBJGPU *, KernelNvlink *, NvBool);
38 static NV_STATUS _knvlinkPurgeState(OBJGPU *, KernelNvlink *);
39 
40 /*!
41  * @brief Create an IOCTRL object
42  *
43  * @param[in]   pGpu          GPU object pointer
44  * @param[in]   pKernelNvlink KernelNvlink object pointer
45  * @param[in]   PublicId      The ID of the ioctrl to be created
46  *
47  * @return  NV_OK on success
48  */
49 static NV_STATUS
50 _knvlinkCreateIoctrl
51 (
52     OBJGPU       *pGpu,
53     KernelNvlink *pKernelNvlink,
54     NvU32         PublicId
55 )
56 {
57     KernelIoctrl *pKernelIoctrl = NULL;
58     NV_STATUS     status        = NV_OK;
59 
60     status = objCreate(&pKernelIoctrl, pKernelNvlink, KernelIoctrl);
61     NV_ASSERT_OR_RETURN(status == NV_OK, status);
62 
63     pKernelNvlink->pKernelIoctrl[PublicId] = pKernelIoctrl;
64 
65     status = kioctrlConstructEngine(pGpu, pKernelIoctrl, PublicId);
66 
67     return status;
68 }
69 
70 /*!
71  * @brief Filter the discovered links against disabled links
72  *
73  * @param[in]   pGpu          GPU object pointer
74  * @param[in]   pKernelNvlink KernelNvlink object pointer
75  *
76  * @return  NV_OK on success
77  */
78 static NV_STATUS
79 _knvlinkFilterDiscoveredLinks
80 (
81     OBJGPU       *pGpu,
82     KernelNvlink *pKernelNvlink
83 )
84 {
    // Ensure any vbios-disabled links are removed from the discovered mask
86     if (pKernelNvlink->vbiosDisabledLinkMask)
87     {
88         // Update the link mask if overridden through vbios
89         pKernelNvlink->discoveredLinks &= ~(pKernelNvlink->vbiosDisabledLinkMask);
90 
91         NV_PRINTF(LEVEL_INFO,
92                   "Links discovered after VBIOS overrides = 0x%x\n",
93                   pKernelNvlink->discoveredLinks);
94     }
95 
96     // Filter links that are disabled through regkey overrides
97     if (pKernelNvlink->regkeyDisabledLinksMask)
98     {
99         pKernelNvlink->discoveredLinks &= ~(pKernelNvlink->regkeyDisabledLinksMask);
100 
101         NV_PRINTF(LEVEL_INFO,
102                   "Links after applying disable links regkey = 0x%x\n",
103                   pKernelNvlink->discoveredLinks);
104     }
105 
106     return NV_OK;
107 }
108 
109 /*!
110  * @brief Filter the IOCTRLs which have no discovered links
111  *
112  * @param[in]   pGpu          GPU object pointer
113  * @param[in]   pKernelNvlink KernelNvlink object pointer
114  *
115  * @return  NV_OK on success
116  */
117 static NV_STATUS
118 _knvlinkFilterIoctrls
119 (
120     OBJGPU       *pGpu,
121     KernelNvlink *pKernelNvlink
122 )
123 {
124     KernelIoctrl *pKernelIoctrl;
125     NvU32         i;
126 
127     // Update local IOCTRL discovered masks after vbios and regkey overrides
128     FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->ioctrlMask)
129     {
130         pKernelIoctrl = KNVLINK_GET_IOCTRL(pKernelNvlink, i);
131 
        // If MINION force boot is enabled, don't remove the IOCTRL from the list
133         if (pKernelIoctrl->getProperty(pKernelIoctrl, PDB_PROP_KIOCTRL_MINION_FORCE_BOOT))
134         {
135             continue;
136         }
137 
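        // Keep only the locally discovered links that survived the global vbios/regkey filtering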
138         pKernelIoctrl->localDiscoveredLinks &=
139                  KIOCTRL_LINK_GLOBAL_TO_LOCAL_MASK(pKernelNvlink->discoveredLinks);
140 
141         // No need to handle the IOCTRL if no links are being enabled
142         if (pKernelIoctrl->localDiscoveredLinks == 0x0)
143         {
144             pKernelNvlink->ioctrlMask &= ~(NVBIT(pKernelIoctrl->PublicId));
145         }
146     }
147     FOR_EACH_INDEX_IN_MASK_END;
148 
149     return NV_OK;
150 }
151 
152 /*!
153  * @brief NVLINK ConstructEngine
154  *
155  * @param[in] pGpu          OBJGPU pointer
156  * @param[in] pKernelNvlink KernelNvlink pointer
157  * @param[in] engDesc       NVLink Engine descriptor
158  *
159  * @return  NV_OK on success
160  */
161 NV_STATUS
162 knvlinkConstructEngine_IMPL
163 (
164     OBJGPU        *pGpu,
165     KernelNvlink  *pKernelNvlink,
166     ENGDESCRIPTOR  engDesc
167 )
168 {
169     NV_STATUS status    = NV_OK;
170     NvU32     ioctrlIdx = 0;
171 
172     // Initialize the nvlink core library
173     knvlinkCoreDriverLoadWar(pGpu, pKernelNvlink);
174 
175     // Return early if nvlink core is not supported
176     status = knvlinkCoreIsDriverSupported(pGpu, pKernelNvlink);
177     if (status != NV_OK)
178     {
179         return status;
180     }
181 
182     //
183     // Apply NVLink regkey overrides for monolithic/CPU-RM.
184     // If NVLink is disabled, so is related functionality.
185     //
186     status = knvlinkApplyRegkeyOverrides(pGpu, pKernelNvlink);
187     if (status == NV_ERR_NOT_SUPPORTED)
188     {
189         return status;
190     }
191 
192     pKernelNvlink->bVerifTrainingEnable = NV_FALSE;
193     pKernelNvlink->bL2Entry             = NV_FALSE;
194 
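    // Perform chip-specific (HAL) NVLink construction; NV_ERR_NOT_SUPPORTED means NVLink is unavailable on this chip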
195     status = knvlinkConstructHal_HAL(pGpu, pKernelNvlink);
196     if (status == NV_ERR_NOT_SUPPORTED)
197     {
198         return status;
199     }
200 
201     //
    // Updated to NV_TRUE when GSP reports a link error on this GPU
204     //
205     pKernelNvlink->bIsGpuDegraded = NV_FALSE;
206 
207     //
208     // Create MAX KernelIoctrl objects.
    // Later, in knvlinkStatePreInitLocked_IMPL, we will remove the objects for
210     // IOCTRLs that do not exist in the HW.
211     //
212     // We must use this ordering because we should not touch GPU registers
213     // during object creation
214     //
215     for (ioctrlIdx = 0; ioctrlIdx < NVLINK_MAX_IOCTRLS_SW; ioctrlIdx++)
216     {
217         if (!pKernelNvlink->pKernelIoctrl[ioctrlIdx])
218         {
219             _knvlinkCreateIoctrl(pGpu, pKernelNvlink, ioctrlIdx);
220         }
221     }
222 
223     //
    // GFW boot is only possible on silicon or emulation, so clear the
    // property as early as possible on all other platforms
227     //
228     if (!(IS_SILICON(pGpu) || IS_EMULATION(pGpu)))
229     {
230         pKernelNvlink->setProperty(pKernelNvlink,
231             PDB_PROP_KNVLINK_MINION_GFW_BOOT, NV_FALSE);
232     }
233 
234     return NV_OK;
235 }
236 
237 /*!
238  * @brief Determine if the NVLink IP is present and usable
239  *        This includes:
240  *        1. Detecting IOCTRL in PTOP
241  *        2. Detecting IOCTRL Discovery integrity
242  *        3. Detecting at least 1 link exposed in IOCTRL Discovery
243  *
244  * @param[in]   pGpu          OBJGPU pointer
 * @param[in]   pKernelNvlink KernelNvlink pointer
 *
 * @return  NV_TRUE if the NVLink IP is present and usable, NV_FALSE otherwise
 */
247 NvBool
248 knvlinkIsPresent_IMPL
249 (
250     OBJGPU       *pGpu,
251     KernelNvlink *pKernelNvlink
252 )
253 {
254     NV_STATUS status = NV_OK;
255 
256     // Mark NVLINK as absent when HCC is enabled
257     if (gpuIsCCFeatureEnabled(pGpu))
258         return NV_FALSE;
259 
260     // On GSP clients, retrieve all device discovery info from GSP through RPC
261     status = knvlinkCopyNvlinkDeviceInfo(pGpu, pKernelNvlink);
262     if (status != NV_OK)
263         return NV_FALSE;
264 
265     status = knvlinkCopyIoctrlDeviceInfo(pGpu, pKernelNvlink);
266     if (status != NV_OK)
267         return NV_FALSE;
268 
269     return NV_TRUE;
270 }
271 
272 /*!
273  * @brief NVLINK State Pre-Init
274  *
275  * @param[in] pGpu           OBJGPU pointer
276  * @param[in] pKernelNvlink  KernelNvlink pointer
277  *
278  * @return  NV_OK on success
279  */
280 NV_STATUS
281 knvlinkStatePreInitLocked_IMPL
282 (
283     OBJGPU       *pGpu,
284     KernelNvlink *pKernelNvlink
285 )
286 {
287     return knvlinkRemoveMissingIoctrlObjects(pGpu, pKernelNvlink);
288 }
289 
290 /*!
291  * @brief Remove IOCTRL objects that are not present in the system
292  *
293  * @param[in] pGpu          OBJGPU pointer
294  * @param[in] pKernelNvlink KernelNvlink pointer
295  *
296  * @return  NV_OK on success
297  */
298 NV_STATUS
299 knvlinkRemoveMissingIoctrlObjects_IMPL
300 (
301     OBJGPU       *pGpu,
302     KernelNvlink *pKernelNvlink
303 )
304 {
305     NvU32         ioctrlIdx     = 0;
306     KernelIoctrl *pKernelIoctrl = NULL;
307     NV_STATUS     status        = NV_OK;
308 
309     // On GSP clients, retrieve all device discovery info from GSP
310     status = knvlinkCopyNvlinkDeviceInfo(pGpu, pKernelNvlink);
311     if (status != NV_OK)
312         return status;
313 
314     // Delete IOCTRL objects that are not present
315     for (ioctrlIdx = 0; ioctrlIdx < NVLINK_MAX_IOCTRLS_SW; ioctrlIdx++)
316     {
317         pKernelIoctrl = KNVLINK_GET_IOCTRL(pKernelNvlink, ioctrlIdx);
318         if (!(pKernelNvlink->ioctrlMask & NVBIT(ioctrlIdx)))
319         {
320             objDelete(pKernelIoctrl);
321             pKernelNvlink->pKernelIoctrl[ioctrlIdx] = NULL;
322         }
323     }
324 
325     return NV_OK;
326 }
327 
328 /*!
329  * @brief NVLINK StateLoad
330  *
331  * @param[in] pGpu          OBJGPU pointer
332  * @param[in] pKernelNvlink KernelNvlink pointer
333  * @param[in] flags         Flags
334  *
335  * @return  NV_OK on success
336  */
337 NV_STATUS
338 knvlinkStateLoad_IMPL
339 (
340     OBJGPU       *pGpu,
341     KernelNvlink *pKernelNvlink,
342     NvU32         flags
343 )
344 {
345     NV_STATUS         status = NV_OK;
346     OBJSYS           *pSys   = SYS_GET_INSTANCE();
347     KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
348     NvBool            bMIGNvLinkP2PDisabled = ((pKernelMIGManager != NULL) &&
349                                                !kmigmgrIsMIGNvlinkP2PSupported(pGpu, pKernelMIGManager));
350     NvU32             preInitializedLinks;
351     NvU32             i;
352     OBJTMR            *pTmr = GPU_GET_TIMER(pGpu);
353 
354     //
    // knvlinkIsPresent is not called on the resume path, but it is needed
    // to set up all the devices, so call it here.
357     //
358     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_RESUME_CODEPATH))
359     {
        // The return value is not important; executing the path is.
361         knvlinkIsPresent(pGpu, pKernelNvlink);
362     }
363 
    // For GSP clients, the link masks and VBIOS info need to be synchronized with GSP
365     status = knvlinkSyncLinkMasksAndVbiosInfo(pGpu, pKernelNvlink);
366     if (status != NV_OK)
367     {
368         NV_ASSERT(status == NV_OK);
369         goto knvlinkStateLoad_end;
370     }
371 
372     // Filter discovered links against disabled links (vbios or regkey)
373     status = _knvlinkFilterDiscoveredLinks(pGpu, pKernelNvlink);
374     if (status != NV_OK)
375     {
376         NV_ASSERT(status == NV_OK);
377         goto knvlinkStateLoad_end;
378     }
379 
380     // Filter IOCTRLs which have no discovered links (vbios or regkey)
381     status = _knvlinkFilterIoctrls(pGpu, pKernelNvlink);
382     if (status != NV_OK)
383     {
384         NV_ASSERT(status == NV_OK);
385         goto knvlinkStateLoad_end;
386     }
387 
388     // Update power features supported based on the NVLink IP
389     knvlinkSetPowerFeatures(pGpu, pKernelNvlink);
390 
391     if (!IS_RTLSIM(pGpu) || pKernelNvlink->bForceEnableCoreLibRtlsims)
392     {
393         // NvLink should not be registered with core-lib when MIG is enabled
394         if (!knvlinkPoweredUpForD3_HAL(pGpu, pKernelNvlink) &&
395             !bMIGNvLinkP2PDisabled)
396         {
397             // Set the link training mode to be used by the device
398             status = knvlinkIsAliSupported_HAL(pGpu, pKernelNvlink);
399             if (status != NV_OK)
400             {
401                 NV_PRINTF(LEVEL_ERROR, "Failed to get ALI status\n");
402                 goto knvlinkStateLoad_end;
403             }
404 
405             // Add the NVGPU device to the nvlink core
406             status = knvlinkCoreAddDevice(pGpu, pKernelNvlink);
407             if (status != NV_OK)
408             {
409                 NV_PRINTF(LEVEL_ERROR,
410                           "Failed to add GPU device to nvlink core\n");
411                 goto knvlinkStateLoad_end;
412             }
413         }
414         else
415         {
416             NV_PRINTF(LEVEL_INFO,
417                       "MIG Enabled or NVLink L2 is supported on chip. "
418                       "Skip device registration in RTD3/FGC6 exit\n");
419         }
420     }
421 
422     //
423     // Process the mask of init disabled links. Links can be init disabled
424     // by the hypervisor in a virtualized system for links that connect to
425     // GPUs that do not belong to the same guest
426     //
427     status = knvlinkProcessInitDisabledLinks(pGpu, pKernelNvlink);
428     if (status != NV_OK)
429     {
430         NV_ASSERT(status == NV_OK);
431         goto knvlinkStateLoad_end;
432     }
433 
434     // Remove the init disabled links from the discovered links mask
435     pKernelNvlink->discoveredLinks &= ~(pKernelNvlink->initDisabledLinksMask);
436 
    // Track unconnected links; initially assume all discovered links are connected.
438     pKernelNvlink->connectedLinksMask = pKernelNvlink->discoveredLinks;
439 
440     // Initialize initializedLinks to 0 (assume no links initialized)
441     pKernelNvlink->initializedLinks = 0;
442 
    // For GSP clients, the link masks and VBIOS info need to be synchronized with GSP
444     status = knvlinkSyncLinkMasksAndVbiosInfo(pGpu, pKernelNvlink);
445     if (status != NV_OK)
446     {
447         NV_ASSERT(status == NV_OK);
448         goto knvlinkStateLoad_end;
449     }
450 
451     //
452     // Save off the links that were previously initialized to be able to
    // optimize away a heavy flush later. This is needed on IBM systems.
454     //
455     preInitializedLinks = pKernelNvlink->initializedLinks;
456 
457     // Load link speed if forced from OS
458     status = knvlinkProgramLinkSpeed_HAL(pGpu, pKernelNvlink);
459     if (status != NV_OK)
460     {
461         NV_ASSERT(status == NV_OK);
462         goto knvlinkStateLoad_end;
463     }
464 
465     //
466     // At this point, the discovered links mask is filtered. If there are no
467     // discovered links, then we skip the rest of the steps.
468     //
469     if (pKernelNvlink->discoveredLinks == 0)
470     {
471         goto knvlinkStateLoad_end;
472     }
473 
474     //
475     // Override configuration for NVLink topology. This can be either
476     //     a. Legacy forced configurations
477     //     b. Chiplib forced configurations
478     //
479     status = knvlinkOverrideConfig_HAL(pGpu, pKernelNvlink, NVLINK_PHASE_STATE_LOAD);
480     if (status != NV_OK)
481     {
482         NV_ASSERT(status == NV_OK);
483         goto knvlinkStateLoad_end;
484     }
485 
486     //
487     // Finalize the enabledLinks mask. If the configuration is not forced
488     // (legacy or chiplib), this is same as the discovered links mask
489     //
490     if (pKernelNvlink->bRegistryLinkOverride)
491     {
492         pKernelNvlink->enabledLinks = pKernelNvlink->discoveredLinks &
493                                       pKernelNvlink->registryLinkMask;
494     }
495     else if (bMIGNvLinkP2PDisabled)
496     {
497         // NvLink is not supported with MIG
498         pKernelNvlink->enabledLinks = 0;
499     }
500     else
501     {
502         pKernelNvlink->enabledLinks = pKernelNvlink->discoveredLinks;
503     }
504 
505     // Sense NVLink bridge presence and remove links on missing bridges.
506     knvlinkFilterBridgeLinks_HAL(pGpu, pKernelNvlink);
507 
508     // Disconnected links mask tracks links whose remote ends are not discovered
509     pKernelNvlink->disconnectedLinkMask = pKernelNvlink->enabledLinks;
510 
511     if (!IS_RTLSIM(pGpu) || pKernelNvlink->bForceEnableCoreLibRtlsims)
512     {
513         if (!knvlinkPoweredUpForD3_HAL(pGpu, pKernelNvlink))
514         {
515             // Register links in the nvlink core library
516             FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
517             {
518                 status = knvlinkCoreAddLink(pGpu, pKernelNvlink, i);
519                 if (status != NV_OK)
520                 {
521                     NV_PRINTF(LEVEL_ERROR,
522                               "Failed to register Link%d in nvlink core\n", i);
523                     goto knvlinkStateLoad_end;
524                 }
525             }
526             FOR_EACH_INDEX_IN_MASK_END;
527         }
528         else
529         {
530             NV_PRINTF(LEVEL_INFO,
531                       "NVLink L2 is supported on the chip. "
532                       "Skip link registration in RTD3/FGC6 exit\n");
533         }
534     }
535 
536     // RPC to GSP-RM to perform pre-topology setup on mask of enabled links
537     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
538                                  NV2080_CTRL_CMD_NVLINK_ENABLE_LINKS,
539                                  NULL, 0);
540     if (status != NV_OK)
541     {
542         NV_PRINTF(LEVEL_ERROR,
543             "Failed to perform pre-topology setup on mask of enabled links\n");
544         return status;
545     }
546 
547     //
    // Check for NVSwitch proxy presence and update the RM state accordingly.
    // If this is the service VM, the NVSwitch fabric address should not have
    // been programmed yet. If this is a guest VM, the NVSwitch fabric address
    // should already have been programmed by the service VM.
552     //
553     knvlinkDetectNvswitchProxy(pGpu, pKernelNvlink);
554 
555     //
556     // Check for NVSwitch proxy to enable External Fabric Management and force
557     // init fabric manager state.
558     //
559     if (knvlinkIsNvswitchProxyPresent(pGpu, pKernelNvlink))
560     {
561         sysEnableExternalFabricMgmt(pSys);
562         sysForceInitFabricManagerState(pSys);
563     }
564     if (GPU_IS_NVSWITCH_DETECTED(pGpu))
565     {
566         sysEnableExternalFabricMgmt(pSys);
567     }
568 
569     //
570     // WAR Bug# 3261027: Sync-up External Fabric Management status with GSP-RM.
571     //     Long term once objsys state is made available to GSP, this WAR won't
572     //     be needed.
573     //
574     status = sysSyncExternalFabricMgmtWAR(pSys, pGpu);
575     if (status != NV_OK)
576     {
577         NV_ASSERT(status == NV_OK);
578         goto knvlinkStateLoad_end;
579     }
580 
581     //
    // If we are running on CPU-RM or monolithic RM, process SYSMEM links if they
    // are present on the system.
584     //
585 
586     status = _knvlinkProcessSysmemLinks(pGpu, pKernelNvlink,
587                         (preInitializedLinks != pKernelNvlink->initializedLinks));
588     if (status != NV_OK)
589     {
590         NV_ASSERT(status == NV_OK);
591         goto knvlinkStateLoad_end;
592     }
593 
594     //
    // FLA is supported only on NVLink-enabled systems.
    // Don't move this code path, since FLA relies on the property
597     // "PDB_PROP_SYS_FABRIC_IS_EXTERNALLY_MANAGED", which is set in this
598     // function.
599     //
600     if (!(flags & GPU_STATE_FLAGS_PRESERVING))
601     {
602         if ((status = kbusCheckFlaSupportedAndInit_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu), 0, 0)) != NV_OK)
603         {
604             NV_PRINTF(LEVEL_ERROR, "Init FLA failed, status:0x%x\n", status);
605             NV_ASSERT(status == NV_OK);
606         }
607     }
608 
609     //
    // If ALI or non-ALI training is forced, shut down the links and retrain them:
    // GFW will already have trained the links, and the intent is for the driver
    // to train them up itself.
613     //
614     if ((pKernelNvlink->getProperty(pKernelNvlink,
615                                     PDB_PROP_KNVLINK_MINION_FORCE_ALI_TRAINING)      ||
616          pKernelNvlink->getProperty(pKernelNvlink,
617                                     PDB_PROP_KNVLINK_MINION_FORCE_NON_ALI_TRAINING)) &&
618          pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_MINION_GFW_BOOT))
619     {
620         knvlinkCoreShutdownDeviceLinks(pGpu, pKernelNvlink, NV_FALSE);
621     }
622 
623     if (!knvlinkIsForcedConfig(pGpu, pKernelNvlink) && pKernelNvlink->bEnableAli &&
624         (!pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_MINION_GFW_BOOT) ||
625           pKernelNvlink->getProperty(pKernelNvlink,
626                                      PDB_PROP_KNVLINK_MINION_FORCE_ALI_TRAINING)))
627     {
628         status = knvlinkPreTrainLinksToActiveAli(pGpu, pKernelNvlink,
629                                                  pKernelNvlink->enabledLinks, NV_TRUE);
630         if (status != NV_OK)
631         {
632             goto knvlinkStateLoad_end;
633         }
634 
635         //
636         // For each link, request a change to active.
        // There is no need to wait for the request to finish, as the links
        // will be queried via DLSTAT for their status and training
        // progression.
640         //
641         FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
642         {
643             status = knvlinkTrainLinksToActiveAli(pGpu, pKernelNvlink, NVBIT(i), NV_FALSE);
644             if (status != NV_OK)
645             {
646                 NV_PRINTF(LEVEL_ERROR,
647                           "Failed to request Link %d to transition to active\n", i);
648             }
649 
650             //
651             // Bug 3550098: the sleep has to be removed eventually as it
652             // isn't POR for RM to be waiting on sending these requests.
653             // Bug 3292497 references this as a WAR for EMU in the short term to
654             // help prevent starvation on MINION linkstate requests
655             //
656             if (IS_EMULATION(pGpu))
657             {
658                 // Delay the next set of links by 8 seconds
659                 osDelayUs(8000000);
660             }
661         }
662         FOR_EACH_INDEX_IN_MASK_END;
663     }
664 
665     FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
666     {
667         status = tmrEventCreate(pTmr, &pKernelNvlink->nvlinkLinks[i].pTmrEvent,
668                             ioctrlFaultUpTmrHandler, NULL,
669                             TMR_FLAGS_NONE);
670         if (status != NV_OK)
671         {
672            NV_PRINTF(LEVEL_ERROR,
673                           "Failed to create TmrEvent for Link %d\n", i);
674         }
675     }
676     FOR_EACH_INDEX_IN_MASK_END;
677 
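    // Initialize the list used to track links awaiting fault-up handling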
678     listInit(&pKernelNvlink->faultUpLinks, portMemAllocatorGetGlobalNonPaged());
679 
680 knvlinkStateLoad_end:
681 
682     if (status != NV_OK)
683     {
684         _knvlinkPurgeState(pGpu, pKernelNvlink);
685     }
686 
687     return status;
688 }
689 
690 /*!
691  * @brief NVLINK StatePostLoad
692  *
693  * @param[in] pGpu          OBJGPU pointer
694  * @param[in] pKernelNvlink KernelNvlink pointer
695  * @param[in] flags         Flags
696  *
697  * @return  NV_OK on success
698  */
699 NV_STATUS
700 knvlinkStatePostLoad_IMPL
701 (
702     OBJGPU       *pGpu,
703     KernelNvlink *pKernelNvlink,
704     NvU32         flags
705 )
706 {
707     NV_STATUS  status              = NV_OK;
708     NV_STATUS  trainingStatus      = NV_OK;
709     OBJGPU    *pRemoteGpu          = NULL;
710     NvU32      linkTrainingTimeout = 15000000;
711     NvU32      gpuInstance;
712     NvU32      gpuMask;
713     RMTIMEOUT  timeout;
714 
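    // Refresh the GPU UUID registered with the nvlink core library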
715     knvlinkCoreUpdateDeviceUUID(pGpu, pKernelNvlink);
716 
717     if (!knvlinkIsForcedConfig(pGpu, pKernelNvlink))
718     {
719         //
720         // If link initialization to SAFE during driver load is force enabled
721         // through regkey, then trigger topology discovery now. This can't be
722         // done for ALI since topology discovery can only happen after
723         // verification training is complete
724         //
725         if ((!pKernelNvlink->bEnableAli) &&
726             (pKernelNvlink->bEnableSafeModeAtLoad || pKernelNvlink->bEnableTrainingAtLoad ||
727              pKernelNvlink->bVerifTrainingEnable))
728         {
729             knvlinkCoreGetRemoteDeviceInfo(pGpu, pKernelNvlink);
730         }
731 
732         //
733         // Bug# 1667991: Enabling link training to high speed
734         // at driver load for loopback or P2P links only as of now.
735         //
736         // Also train links on verif env like Emulation and Fmodel
737         //
738         if (pKernelNvlink->bEnableTrainingAtLoad || pKernelNvlink->bVerifTrainingEnable)
739         {
740             if (pKernelNvlink->bEnableAli &&
741                 knvlinkDiscoverPostRxDetLinks_HAL(pGpu, pKernelNvlink, pGpu) == NV_OK)
742             {
743                 gpuSetTimeout(pGpu, linkTrainingTimeout, &timeout, IS_SILICON(pGpu) ?
744                     (GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE | GPU_TIMEOUT_FLAGS_DEFAULT) : 0);
745                 do
746                 {
747 
748                     status = gpuCheckTimeout(pGpu, &timeout);
749                     trainingStatus = knvlinkCheckTrainingIsComplete(pGpu, pGpu, pKernelNvlink);
750                     if (trainingStatus == NV_OK)
751                     {
752                         break;
753                     }
754                     osSpinLoop();
755                 }
756                 while (status != NV_ERR_TIMEOUT);
757 
758                 if (status != NV_OK)
759                 {
760                     NV_PRINTF(LEVEL_ERROR,"Timedout while checking to see if training complete!\n");
761                 }
762 
                // Need to get the remote device info for ALI
764                 knvlinkCoreGetRemoteDeviceInfo(pGpu, pKernelNvlink);
765             }
766             else
767             {
768                 status = gpumgrGetGpuAttachInfo(NULL, &gpuMask);
769                 NV_ASSERT_OR_RETURN(status == NV_OK, status);
770                 gpuInstance = 0;
771                 while ((pRemoteGpu = gpumgrGetNextGpu(gpuMask, &gpuInstance)) != NULL)
772                 {
773                     knvlinkTrainP2pLinksToActive(pGpu, pRemoteGpu, pKernelNvlink);
774                 }
775             }
776         }
777     }
778 
779     status = knvlinkStatePostLoadHal_HAL(pGpu, pKernelNvlink);
780     if (status != NV_OK)
781     {
782         NV_PRINTF(LEVEL_ERROR," failed for GPU 0x%x\n", pGpu->gpuInstance);
783         return status;
784     }
785 
786     return NV_OK;
787 }
788 
789 /*!
790  * @brief NVLINK StateUnload
791  *
792  * @param[in] pGpu          OBJGPU pointer
793  * @param[in] pKernelNvlink KernelNvlink pointer
794  * @param[in] flags         Flags
795  *
796  * @return  NV_OK on success
797  */
798 NV_STATUS
799 knvlinkStateUnload_IMPL
800 (
801     OBJGPU       *pGpu,
802     KernelNvlink *pKernelNvlink,
803     NvU32         flags
804 )
805 {
806     //
807     // Don't tear down FLA when undergoing suspend/resume
808     // Enable this only for CPU-RM and monolithic RM
809     //
810     if (!(flags & GPU_STATE_FLAGS_PRESERVING))
811     {
812         kbusDestroyFla_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu));
813     }
814 
815     return NV_OK;
816 }
817 
818 /*!
819  * @brief NVLINK StatePostUnload
820  *
821  * @param[in] pGpu          OBJGPU pointer
822  * @param[in] pKernelNvlink KernelNvlink pointer
823  *
824  * @return  NV_OK on success
825  */
826 NV_STATUS
827 knvlinkStatePostUnload_IMPL
828 (
829     OBJGPU       *pGpu,
830     KernelNvlink *pKernelNvlink,
831     NvU32         flags
832 )
833 {
834     OBJSYS    *pSys   = SYS_GET_INSTANCE();
835     NV_STATUS  status = NV_OK;
836 #if defined(INCLUDE_NVLINK_LIB)
837     NvU32 linkId = 0;
838 #endif
839 
840     if ((knvlinkGetNumLinksToSystem(pGpu, pKernelNvlink) != 0) &&
841         pGpu->getProperty(pGpu, PDB_PROP_GPU_COHERENT_CPU_MAPPING))
842     {
843         //
844         // On GPU reset the CPU<->GPU NVLinks are reset, and leaving any GPU
845         // memory cached on the CPU leads to fatal errors when the CPU tries to
846         // flush it after the link is down.
847         //
848         // Handle this by flushing all of the CPU caches as part of destroying
849         // the mapping. Do it only if the GPU is being drained as that's an
850         // indication the GPU is going to be reset. Otherwise, the links stay
851         // up and it's unnecessary to flush the cache.
852         //
853         // Do the flush before the link is put into safe mode below as the
854         // flush is much slower (takes minutes) once that's done.
855         //
856         NvBool bFlush = pKernelNvlink->getProperty(pKernelNvlink,
857                                                PDB_PROP_KNVLINK_LANE_SHUTDOWN_ON_UNLOAD);
858         kmemsysTeardownCoherentCpuLink(pGpu, GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu), bFlush);
859     }
860 
861     //
862     // If GPU is in the D3 entry path and if NVLink L2 is supported and links are
863     // expected to be in L2 before D3 entry is triggered, skip lane shutdown
864     //
865     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH) &&
866         pKernelNvlink->getProperty(pKernelNvlink,
867                                    PDB_PROP_KNVLINK_L2_POWER_STATE_ENABLED) &&
868         pKernelNvlink->getProperty(pKernelNvlink,
869                                    PDB_PROP_KNVLINK_L2_POWER_STATE_FOR_LONG_IDLE))
870     {
871         goto knvlinkStatePostUnload_end;
872     }
873 
874     //
875     // Set HSHUB to init values.
876     //
877     // It is good to reset HSHUB when GPUs are going down. For example, a GPU
878     // can be torn down because it (or its peers) experienced an NVLink error.
    // In such cases, resetting HSHUB is a must. Otherwise, consecutive RmInitAdapter
    // on the GPU could fail if membars are emitted on the broken NVLinks.
881     //
882     // We do not set sysmem masks to init values because P9 systems are crashing
883     // for an unknown reason with an HMI exception during consecutive
884     // RmInitAdapter.
885     //
886     // TODO: For now, we are enabling this change for NVSwitch systems in r400_00
887     // to unblock DGX-2 release. In chips_a, the change will be enabled on all
888     // platforms (in discussion with ARCH for non-NVSwitch platforms).
889     //
890     if (pSys->getProperty(pSys, PDB_PROP_SYS_NVSWITCH_IS_PRESENT) ||
891         knvlinkIsNvswitchProxyPresent(pGpu, pKernelNvlink)        ||
892         (GPU_IS_NVSWITCH_DETECTED(pGpu)))
893     {
894         knvlinkRemoveMapping_HAL(pGpu, pKernelNvlink, NV_FALSE,
895                                  ((1 << NVLINK_MAX_PEERS_SW) - 1),
896                                  NV_FALSE /* bL2Entry */);
897     }
898 
899     //
900     // Check if lane disable and shutdown during driver unload has been forced
901     // using regkey override, or as a part of the external reset sequence.
902     //
903     if (pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ON_UNLOAD) &&
904                         !pSys->getProperty(pSys, PDB_PROP_SYS_FABRIC_IS_EXTERNALLY_MANAGED) &&
905                         !API_GPU_IN_RESET_SANITY_CHECK(pGpu))
906     {
907         NV2080_CTRL_NVLINK_DISABLE_DL_INTERRUPTS_PARAMS params;
908 
909         portMemSet(&params, 0, sizeof(params));
910         params.linkMask = pKernelNvlink->enabledLinks;
911 
912         // Disable all the DL interrupts
913         status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
914                                      NV2080_CTRL_CMD_NVLINK_DISABLE_DL_INTERRUPTS,
915                                      (void *)&params, sizeof(params));
916         if (status != NV_OK)
917         {
918             NV_PRINTF(LEVEL_ERROR, "Failed to disable DL interrupts for the links\n");
919             return status;
920         }
921 
922         // Shutdown all the links through pseudo-clean shutdown
923         status = knvlinkPrepareForXVEReset(pGpu, pKernelNvlink, NV_FALSE);
924         if (status != NV_OK)
925         {
926             NV_PRINTF(LEVEL_ERROR,
927                       "Failed to pseudo-clean shutdown the links for GPU%d\n",
928                       pGpu->gpuInstance);
929             return status;
930         }
931     }
932 
933 #if defined(INCLUDE_NVLINK_LIB)
934     FOR_EACH_INDEX_IN_MASK(32, linkId, pKernelNvlink->enabledLinks)
935     {
936         // Update remote GPU disconnectedLinkMasks
937         OBJGPU *pRemoteGpu = gpumgrGetGpuFromBusInfo(pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.domain,
938                                                      pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bus,
939                                                      pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.device);
940         if (!API_GPU_IN_RESET_SANITY_CHECK(pRemoteGpu))
941         {
942             KernelNvlink *pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
943 
944             pRemoteKernelNvlink->disconnectedLinkMask |= NVBIT(pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber);
945         }
946     }
947     FOR_EACH_INDEX_IN_MASK_END;
948 #endif
949 
950     listDestroy(&pKernelNvlink->faultUpLinks);
951 
952 knvlinkStatePostUnload_end:
953 
954     _knvlinkPurgeState(pGpu, pKernelNvlink);
955 
956     return status;
957 }
958 
959 /*!
960  * @brief Purge SW state
961  *
962  * @param[in] pGpu           OBJGPU pointer
963  * @param[in] pKernelNvlink  KernelNvlink pointer
964  *
965  * @return  NV_OK on success
966  */
967 static NV_STATUS
968 _knvlinkPurgeState
969 (
970     OBJGPU       *pGpu,
971     KernelNvlink *pKernelNvlink
972 )
973 {
974     KernelIoctrl *pKernelIoctrl = NULL;
975     NvU32         ioctrlIdx;
976 #if defined(INCLUDE_NVLINK_LIB)
977 
978     NvU32         linkId;
979     OBJTMR       *pTmr = GPU_GET_TIMER(pGpu);
980     KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
981     NvBool bMIGNvLinkP2PDisabled = ((pKernelMIGManager != NULL) &&
982                           !kmigmgrIsMIGNvlinkP2PSupported(pGpu, pKernelMIGManager));
983 
984     FOR_EACH_INDEX_IN_MASK(32, linkId, pKernelNvlink->enabledLinks)
985     {
986         if ((pKernelNvlink->nvlinkLinks[linkId].pTmrEvent != NULL) && (pTmr != NULL))
987         {
988             if (tmrEventOnList(pTmr, pKernelNvlink->nvlinkLinks[linkId].pTmrEvent))
989             {
990                  tmrEventCancel(pTmr, pKernelNvlink->nvlinkLinks[linkId].pTmrEvent);
991             }
992             tmrEventDestroy(pTmr, pKernelNvlink->nvlinkLinks[linkId].pTmrEvent);
993             pKernelNvlink->nvlinkLinks[linkId].pTmrEvent = NULL;
994         }
995     }
996     FOR_EACH_INDEX_IN_MASK_END;
997 
    // RM disables NVLink at runtime in Hopper, so device un-registration can't be skipped
999     if (!IsGH100orBetter(pGpu))
1000     {
        // With MIG enabled, NVLink registration with the core library was skipped
1002         if (bMIGNvLinkP2PDisabled)
1003         {
1004             NV_PRINTF(LEVEL_INFO,
1005                       "Skipping device/link un-registration in MIG enabled path\n");
1006             goto _knvlinkPurgeState_end;
1007         }
1008     }
1009 
1010     if (knvlinkPoweredUpForD3_HAL(pGpu, pKernelNvlink))
1011     {
1012         NV_PRINTF(LEVEL_INFO,
1013                   "Skipping device/link un-registration in RTD3 GC6 entry path\n");
1014         goto _knvlinkPurgeState_end;
1015     }
1016 
1017     if (!IS_RTLSIM(pGpu) || pKernelNvlink->bForceEnableCoreLibRtlsims)
1018     {
1019         if (pKernelNvlink->pNvlinkDev)
1020         {
1021             // Un-register the links from nvlink core library
1022             FOR_EACH_INDEX_IN_MASK(32, linkId, pKernelNvlink->enabledLinks)
1023             {
1024                 knvlinkCoreRemoveLink(pGpu, pKernelNvlink, linkId);
1025             }
1026             FOR_EACH_INDEX_IN_MASK_END;
1027 
1028             // Un-register the nvgpu device from nvlink core library
1029             knvlinkCoreRemoveDevice(pGpu, pKernelNvlink);
1030         }
1031     }
1032 
1033 _knvlinkPurgeState_end:
1034 
1035 #endif
1036 
1037     //
    // This GPU is being shut down, so clear the peerLinkMasks and the sysmem
    // link mask.
1040     //
1041     portMemSet(pKernelNvlink->peerLinkMasks, 0, sizeof(pKernelNvlink->peerLinkMasks));
1042     pKernelNvlink->sysmemLinkMask = 0;
1043 
1044     // Unload each IOCTRL object
1045     for (ioctrlIdx = 0; ioctrlIdx < pKernelNvlink->ioctrlNumEntries; ioctrlIdx++)
1046     {
1047         pKernelIoctrl = KNVLINK_GET_IOCTRL(pKernelNvlink, ioctrlIdx);
1048         if (pKernelIoctrl)
1049         {
1050             kioctrlDestructEngine(pKernelIoctrl);
1051             pKernelNvlink->ioctrlMask &= ~NVBIT(ioctrlIdx);
1052         }
1053     }
1054 
1055     // Destroy the chiplib configuration memory
1056     portMemFree(pKernelNvlink->pLinkConnection);
1057     pKernelNvlink->pLinkConnection = NULL;
1058 
1059     return NV_OK;
1060 }
1061 
1062 /*!
 * @brief Mark this GPU as degraded if the other end of linkId
 *        is not already degraded.
 *        Once degraded, all links on this GPU are shut down.
1066  *
1067  * @param[in] pGpu           OBJGPU pointer
1068  * @param[in] pKernelNvlink  KernelNvlink pointer
1069  * @param[in] linkId         linkId of the error link
1070  *
1071  */
1072 void
1073 knvlinkSetDegradedMode_IMPL
1074 (
1075     OBJGPU       *pGpu,
1076     KernelNvlink *pKernelNvlink,
1077     NvU32         linkId
1078 )
1079 {
    NV_STATUS     status = NV_ERR_GENERIC;
1081     NvU32         gpuInstance;
1082     OBJGPU       *pRemoteGpu = NULL;
1083     KernelNvlink *pRemoteKernelNvlink = NULL;
1084 
1085     if (!pKernelNvlink)
1086     {
1087         NV_PRINTF(LEVEL_ERROR,
1088                 "Failed to get Local Nvlink info for linkId %d to update Degraded GPU%d status\n",
1089                 linkId, pGpu->gpuInstance);
1090 
1091         return;
1092     }
1093 
    if (pKernelNvlink->bIsGpuDegraded)
1095     {
1096         return;
1097     }
1098 
    // Find the remote GPU/NVLink attached to this link, if any
1100     for (gpuInstance = 0; gpuInstance < NV_MAX_DEVICES; gpuInstance++)
1101     {
1102         if (pKernelNvlink->peerLinkMasks[gpuInstance] & NVBIT(linkId))
1103         {
1104             pRemoteGpu = gpumgrGetGpu(gpuInstance);
1105             break;
1106         }
1107     }
1108 
1109     if (pRemoteGpu == NULL)
1110     {
1111         NV_PRINTF(LEVEL_ERROR,
1112                 "Failed to get Remote GPU info for linkId %d to update Degraded GPU%d status\n",
1113                 linkId, pGpu->gpuInstance);
1114 
1115         return;
1116     }
1117 
1118     pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
1119     if (!pRemoteKernelNvlink)
1120     {
1121         NV_PRINTF(LEVEL_ERROR,
1122                 "Failed to get Remote Nvlink info for linkId %d to update Degraded GPU%d status\n",
1123                 linkId, pGpu->gpuInstance);
1124 
1125         return;
1126     }
1127 
1128     if (pRemoteKernelNvlink->bIsGpuDegraded == NV_FALSE)
1129     {
1130         pKernelNvlink->bIsGpuDegraded = NV_TRUE;
1131         NV_PRINTF(LEVEL_ERROR,
1132                 "GPU%d marked Degraded for error on linkId %d \n",
1133                 pGpu->gpuInstance, linkId);
1134 
        // Shut down all the links on this GPU
1136         status = knvlinkCoreShutdownDeviceLinks(pGpu, pKernelNvlink, NV_TRUE);
1137         if (status != NV_OK)
1138         {
1139            NV_PRINTF(LEVEL_ERROR,
1140                      "failed to shutdown links on degraded GPU%d\n", pGpu->gpuInstance);
1141         }
1142     }
1143 
1144     return;
1145 }
1146 
1147 void
1148 knvlinkDestruct_IMPL
1149 (
1150     KernelNvlink *pKernelNvlink
1151 )
1152 {
1153     OBJGPU       *pGpu          = ENG_GET_GPU(pKernelNvlink);
1154     KernelIoctrl *pKernelIoctrl = NULL;
1155     NvU32         ioctrlIdx;
1156 
1157     // Destroy the RM NVLink state
1158     _knvlinkPurgeState(pGpu, pKernelNvlink);
1159 
1160     // Free Ioctrls
1161     for (ioctrlIdx = 0; ioctrlIdx < NVLINK_MAX_IOCTRLS_SW; ioctrlIdx++)
1162     {
1163         pKernelIoctrl = KNVLINK_GET_IOCTRL(pKernelNvlink, ioctrlIdx);
1164         if (pKernelIoctrl)
1165         {
1166             objDelete(pKernelIoctrl);
1167             pKernelNvlink->pKernelIoctrl[ioctrlIdx] = NULL;
1168         }
1169     }
1170 
1171     // Unload the nvlink core library
1172     knvlinkCoreDriverUnloadWar(pGpu, pKernelNvlink);
1173 }
1174 
1175 /**
1176  * @brief Handle sysmem NVLink connections and ATS functionality
1177  *
1178  * @param[in] pGpu           OBJGPU pointer
 * @param[in] pKernelNvlink  KernelNvlink pointer
1180  * @param[in] bFlush         Whether the CPU cache of the GPU mapping
1181  *                           should be flushed
1182  *
1183  * @return  NV_OK on success
1184  */
static NV_STATUS
1186 _knvlinkProcessSysmemLinks
1187 (
1188     OBJGPU       *pGpu,
1189     KernelNvlink *pKernelNvlink,
1190     NvBool        bFlush
1191 )
1192 {
1193     NV_STATUS status = NV_OK;
1194 
1195 #if defined(NVCPU_PPC64LE) || defined(NVCPU_AARCH64)
1196     if (pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_SYSMEM_SUPPORT_ENABLED))
1197     {
1198         //
1199         // In case of IBM or Tegra, the sysmem links will already have
1200         // been registered in nvlink core library. In order to trigger
1201         // topology detection, call knvlinkCoreGetRemoteDeviceInfo
1202         //
1203         if (!knvlinkIsForcedConfig(pGpu, pKernelNvlink) && !pKernelNvlink->pLinkConnection)
1204         {
1205             //
1206             // Establish the current link topology and enable IBM CPU/SYSMEM links.
1207             // If any of the discovered links are CPU/SYSMEM, they will be trained,
1208             // post-enabled, and then enabled in HSHUB when the call has completed.
1209             //
1210             status = knvlinkCoreGetRemoteDeviceInfo(pGpu, pKernelNvlink);
1211             if (status != NV_OK)
1212             {
1213                 NV_PRINTF(LEVEL_ERROR,
1214                           "Failed call to get remote device info during IBM CPU/SYSMEM links "
1215                           "setup, failing NVLink StateLoad on GPU%d!!!\n\n",
1216                           pGpu->gpuInstance);
1217 
1218                 return status;
1219             }
1220         }
1221         else
1222         {
1223             // If the topology is forced, then just apply the settings
1224             knvlinkUpdateCurrentConfig(pGpu, pKernelNvlink);
1225         }
1226     }
1227 #else
1228     if (knvlinkIsForcedConfig(pGpu, pKernelNvlink) || !IS_SILICON(pGpu))
1229     {
1230         // Set up the current Nvlink configuration
1231         knvlinkUpdateCurrentConfig(pGpu, pKernelNvlink);
1232     }
1233 #endif
1234 
1235     if (knvlinkIsForcedConfig(pGpu, pKernelNvlink) || pKernelNvlink->pLinkConnection)
1236     {
1237         //
1238         // On Hopper+ chips we enable programming of MUX registers. However,
1239         // we need to follow a strict sequence between updating the MUX registers,
1240         // the CONFIG0 registers and setting buffer_rdy for the enabled links.
1241         // BUFFER_RDY should always be set only after *all* HSHUB registers needed
1242         // for traffic are programmed. Since we did not support this on pre-Hopper,
1243         // we need to change the sequence of where we set BUFFER_RDY relative to
1244         // the other HSHUB programming.
1245         //
1246         status = knvlinkPostSetupNvlinkPeer_HAL(pGpu, pKernelNvlink);
1247         if (status != NV_OK)
1248         {
1249             NV_PRINTF(LEVEL_ERROR,
1250                       "Failed to perform NvLink post setup!\n");
1251             return status;
1252         }
1253     }
1254 
1255     // Set Buffer ready for the sysmem links
1256     NV2080_CTRL_NVLINK_PROGRAM_BUFFERREADY_PARAMS programBufferRdyParams;
1257 
1258     portMemSet(&programBufferRdyParams, 0, sizeof(programBufferRdyParams));
1259     programBufferRdyParams.flags        = NV2080_CTRL_NVLINK_PROGRAM_BUFFERREADY_FLAGS_SET;
1260     programBufferRdyParams.bSysmem      = NV_TRUE;
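    // Target only sysmem links here; no peer links are programmed, so peerLinkMask stays 0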
1261     programBufferRdyParams.peerLinkMask = 0;
1262 
1263     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
1264                                  NV2080_CTRL_CMD_NVLINK_PROGRAM_BUFFERREADY,
1265                                  (void *)&programBufferRdyParams,
1266                                  sizeof(programBufferRdyParams));
1267     if (status != NV_OK)
1268     {
1269         NV_PRINTF(LEVEL_ERROR, "Failed to program bufferready for the sysmem nvlinks!\n");
1270         return status;
1271     }
1272 
1273     //
1274     // Enable functionality related to NVLink SYSMEM:
1275     // + ATS functionality if hardware support is available
1276     // + Apply the Bug 200279966 WAR
1277     //
1278     if (knvlinkGetNumLinksToSystem(pGpu, pKernelNvlink) != 0)
1279     {
1280         //
        // Configure sysmem atomics after the sysmem link is up.
        // Sysmem atomics are normally programmed from
        // memsysConfigureSysmemAtomics_HAL, but on PPC+GV100 the NVLink setup
        // has not yet happened when that call is made, which leaves sysmem
        // atomics unconfigured. Configure them here to cover PPC+GV100.
1287         //
1288 
1289         NV2080_CTRL_NVLINK_ENABLE_SYSMEM_NVLINK_ATS_PARAMS sysmemNvlinkAtsParams;
1290         portMemSet(&sysmemNvlinkAtsParams, 0, sizeof(sysmemNvlinkAtsParams));
1291 
1292         status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
1293                                      NV2080_CTRL_CMD_NVLINK_ENABLE_SYSMEM_NVLINK_ATS,
1294                                      (void *)&sysmemNvlinkAtsParams,
1295                                      sizeof(sysmemNvlinkAtsParams));
1296         if (status != NV_OK)
1297         {
1298             NV_PRINTF(LEVEL_ERROR, "Failed to snable ATS functionality for NVLink sysmem!\n");
1299             return status;
1300         }
1301 
1302         status = kmemsysSetupCoherentCpuLink(pGpu, GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu), bFlush);
1303         NV_ASSERT_OR_RETURN(status == NV_OK, status);
1304 
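        // Notify the OS layer that the NVLink sysmem link is now up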
1305         osSetNVLinkSysmemLinkState(pGpu, NV_TRUE);
1306     }
1307 
1308     return status;
1309 }
1310