11739a20eSAndy Ritger /*
2*e45d91deSBernhard Stoeckner  * SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
31739a20eSAndy Ritger  * SPDX-License-Identifier: MIT
41739a20eSAndy Ritger  *
51739a20eSAndy Ritger  * Permission is hereby granted, free of charge, to any person obtaining a
61739a20eSAndy Ritger  * copy of this software and associated documentation files (the "Software"),
71739a20eSAndy Ritger  * to deal in the Software without restriction, including without limitation
81739a20eSAndy Ritger  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
91739a20eSAndy Ritger  * and/or sell copies of the Software, and to permit persons to whom the
101739a20eSAndy Ritger  * Software is furnished to do so, subject to the following conditions:
111739a20eSAndy Ritger  *
121739a20eSAndy Ritger  * The above copyright notice and this permission notice shall be included in
131739a20eSAndy Ritger  * all copies or substantial portions of the Software.
141739a20eSAndy Ritger  *
151739a20eSAndy Ritger  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
161739a20eSAndy Ritger  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
171739a20eSAndy Ritger  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
181739a20eSAndy Ritger  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
191739a20eSAndy Ritger  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
201739a20eSAndy Ritger  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
211739a20eSAndy Ritger  * DEALINGS IN THE SOFTWARE.
221739a20eSAndy Ritger  */
231739a20eSAndy Ritger 
24eb5c7665SAndy Ritger #define NVOC_KERNEL_NVLINK_H_PRIVATE_ACCESS_ALLOWED
25eb5c7665SAndy Ritger 
26eb5c7665SAndy Ritger // FIXME XXX
27eb5c7665SAndy Ritger #define NVOC_KERNEL_IOCTRL_H_PRIVATE_ACCESS_ALLOWED
28eb5c7665SAndy Ritger 
291739a20eSAndy Ritger #include "os/os.h"
301739a20eSAndy Ritger #include "core/hal.h"
311739a20eSAndy Ritger #include "core/locks.h"
3291676d66SBernhard Stoeckner #include "gpu_mgr/gpu_mgr.h"
331739a20eSAndy Ritger #include "gpu/gpu.h"
341739a20eSAndy Ritger #include "kernel/gpu/nvlink/kernel_nvlink.h"
351739a20eSAndy Ritger #include "kernel/gpu/nvlink/kernel_ioctrl.h"
361739a20eSAndy Ritger #include "gpu/mem_mgr/mem_mgr.h"
371739a20eSAndy Ritger #include "gpu/mmu/kern_gmmu.h"
381739a20eSAndy Ritger #include "gpu/ce/kernel_ce.h"
3991676d66SBernhard Stoeckner #include "platform/sli/sli.h"
4091676d66SBernhard Stoeckner #include "gpu/gpu_fabric_probe.h"
4191676d66SBernhard Stoeckner #include "compute/imex_session_api.h"
4291676d66SBernhard Stoeckner #include "compute/fabric.h"
4391676d66SBernhard Stoeckner #include "mem_mgr/mem_multicast_fabric.h"
441739a20eSAndy Ritger 
451739a20eSAndy Ritger /*!
461739a20eSAndy Ritger  * @brief Is NVLINK topology forced? NVLink topology is considered
471739a20eSAndy Ritger  *        forced for both legacy forced config and chiplib configs
481739a20eSAndy Ritger  *
491739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU
501739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
511739a20eSAndy Ritger  *
521739a20eSAndy Ritger  * @return  NV_TRUE if topology is forced
531739a20eSAndy Ritger  */
541739a20eSAndy Ritger NvBool
knvlinkIsForcedConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)551739a20eSAndy Ritger knvlinkIsForcedConfig_IMPL
561739a20eSAndy Ritger (
571739a20eSAndy Ritger     OBJGPU       *pGpu,
581739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
591739a20eSAndy Ritger )
601739a20eSAndy Ritger {
611739a20eSAndy Ritger     return (pKernelNvlink->bChiplibConfig);
621739a20eSAndy Ritger }
631739a20eSAndy Ritger 
641739a20eSAndy Ritger /*!
651739a20eSAndy Ritger  * @brief Determine if NVLink is enabled or disabled by default
661739a20eSAndy Ritger  *
671739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
681739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
691739a20eSAndy Ritger  *
701739a20eSAndy Ritger  * @return  NV_TRUE if NVLink is enabled on the GPU/platform
711739a20eSAndy Ritger  */
721739a20eSAndy Ritger NvBool
knvlinkIsNvlinkDefaultEnabled_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)731739a20eSAndy Ritger knvlinkIsNvlinkDefaultEnabled_IMPL
741739a20eSAndy Ritger (
751739a20eSAndy Ritger     OBJGPU       *pGpu,
761739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
771739a20eSAndy Ritger )
781739a20eSAndy Ritger {
791739a20eSAndy Ritger     //
801739a20eSAndy Ritger     // Currently it is critical that the following lib check be present.
811739a20eSAndy Ritger     // Burying this in the hal below it may get lost as the stub is all
821739a20eSAndy Ritger     // thats required for POR (always true from the hals perspective)
831739a20eSAndy Ritger     //
841739a20eSAndy Ritger #if !defined(INCLUDE_NVLINK_LIB)
851739a20eSAndy Ritger 
861739a20eSAndy Ritger     return NV_FALSE;
871739a20eSAndy Ritger 
881739a20eSAndy Ritger #endif
891739a20eSAndy Ritger 
901739a20eSAndy Ritger     // Let the PDB handle the final decision.
911739a20eSAndy Ritger     return pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_ENABLED);
921739a20eSAndy Ritger }
931739a20eSAndy Ritger 
941739a20eSAndy Ritger /*!
951739a20eSAndy Ritger  * @brief Determine if P2P loopback over NVLink is supported for
961739a20eSAndy Ritger  *        the given GPU. This function returns true if any link
971739a20eSAndy Ritger  *        is connected in loopback mode.
981739a20eSAndy Ritger  *
991739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
1001739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
1011739a20eSAndy Ritger  *
1021739a20eSAndy Ritger  * @return  NV_TRUE if any link is in loopback mode
1031739a20eSAndy Ritger  */
1041739a20eSAndy Ritger NvBool
knvlinkIsP2pLoopbackSupported_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)1051739a20eSAndy Ritger knvlinkIsP2pLoopbackSupported_IMPL
1061739a20eSAndy Ritger (
1071739a20eSAndy Ritger     OBJGPU       *pGpu,
1081739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
1091739a20eSAndy Ritger )
1101739a20eSAndy Ritger {
1111739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
1121739a20eSAndy Ritger 
1131739a20eSAndy Ritger     NvU32 i;
1141739a20eSAndy Ritger 
1151739a20eSAndy Ritger     if ((pGpu == NULL) || (pKernelNvlink == NULL))
1161739a20eSAndy Ritger     {
1171739a20eSAndy Ritger         return NV_FALSE;
1181739a20eSAndy Ritger     }
1191739a20eSAndy Ritger 
1201739a20eSAndy Ritger     // Return false if P2P loopback is disabled through regkey
1211739a20eSAndy Ritger     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED))
1221739a20eSAndy Ritger     {
1231739a20eSAndy Ritger         return NV_FALSE;
1241739a20eSAndy Ritger     }
1251739a20eSAndy Ritger 
1261739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
1271739a20eSAndy Ritger     {
1281739a20eSAndy Ritger         if (knvlinkIsP2pLoopbackSupportedPerLink_IMPL(pGpu, pKernelNvlink, i))
1291739a20eSAndy Ritger             return NV_TRUE;
1301739a20eSAndy Ritger     }
1311739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK_END
1321739a20eSAndy Ritger 
1331739a20eSAndy Ritger #endif
1341739a20eSAndy Ritger 
1351739a20eSAndy Ritger     return NV_FALSE;
1361739a20eSAndy Ritger }
1371739a20eSAndy Ritger 
1381739a20eSAndy Ritger /*!
1391739a20eSAndy Ritger  * @brief Determine if P2P loopback over NVLink is supported for
1401739a20eSAndy Ritger  *        the given link. This function returns true if the link
1411739a20eSAndy Ritger  *        is connected in loopback mode.
1421739a20eSAndy Ritger  *
1431739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
1441739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
1451739a20eSAndy Ritger  * @param[in] link           Link ID
1461739a20eSAndy Ritger  *
1471739a20eSAndy Ritger  * @return  NV_TRUE if the link is in loopback mode
1481739a20eSAndy Ritger  */
1491739a20eSAndy Ritger NvBool
knvlinkIsP2pLoopbackSupportedPerLink_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 link)1501739a20eSAndy Ritger knvlinkIsP2pLoopbackSupportedPerLink_IMPL
1511739a20eSAndy Ritger (
1521739a20eSAndy Ritger     OBJGPU       *pGpu,
1531739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
1541739a20eSAndy Ritger     NvU32         link
1551739a20eSAndy Ritger )
1561739a20eSAndy Ritger {
1571739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
1581739a20eSAndy Ritger 
1591739a20eSAndy Ritger    if ((pGpu == NULL) || (pKernelNvlink == NULL))
1601739a20eSAndy Ritger     {
1611739a20eSAndy Ritger         return NV_FALSE;
1621739a20eSAndy Ritger     }
1631739a20eSAndy Ritger 
1641739a20eSAndy Ritger     // Return false if P2P loopback is disabled through regkey
1651739a20eSAndy Ritger     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED))
1661739a20eSAndy Ritger     {
1671739a20eSAndy Ritger         return NV_FALSE;
1681739a20eSAndy Ritger     }
1691739a20eSAndy Ritger 
1701739a20eSAndy Ritger     // Return false if the given link is disabled
1711739a20eSAndy Ritger     if (!(NVBIT(link) & pKernelNvlink->enabledLinks))
1721739a20eSAndy Ritger     {
1731739a20eSAndy Ritger         return NV_FALSE;
1741739a20eSAndy Ritger     }
1751739a20eSAndy Ritger 
1761739a20eSAndy Ritger     // Check the link connected to the same GPU (loopback)
1771739a20eSAndy Ritger     if (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.bConnected)
1781739a20eSAndy Ritger     {
179b5bf85a8SAndy Ritger         if (((pKernelNvlink->nvlinkLinks[link].remoteEndInfo.domain   == gpuGetDomain(pGpu)) &&
1801739a20eSAndy Ritger             (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.bus      == gpuGetBus(pGpu))    &&
1811739a20eSAndy Ritger             (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.device   == gpuGetDevice(pGpu)) &&
182b5bf85a8SAndy Ritger             (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.function == 0)) ||
183b5bf85a8SAndy Ritger                 pKernelNvlink->PDB_PROP_KNVLINK_FORCED_LOOPBACK_ON_SWITCH_MODE_ENABLED)
1841739a20eSAndy Ritger         {
1851739a20eSAndy Ritger             return NV_TRUE;
1861739a20eSAndy Ritger         }
1871739a20eSAndy Ritger     }
1881739a20eSAndy Ritger 
1891739a20eSAndy Ritger #endif
1901739a20eSAndy Ritger 
1911739a20eSAndy Ritger     return NV_FALSE;
1921739a20eSAndy Ritger }
1931739a20eSAndy Ritger 
1941739a20eSAndy Ritger /*!
1951739a20eSAndy Ritger  * @brief Determine if P2P over NVLINK is supported between 2 GPUs
1961739a20eSAndy Ritger  *
1971739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer for local GPU
1981739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
1991739a20eSAndy Ritger  * @param[in] pPeerGpu       OBJGPU pointer for remote GPU
2001739a20eSAndy Ritger  *
2011739a20eSAndy Ritger  * @return  NV_TRUE if P2P is supported between the 2 GPUs
2021739a20eSAndy Ritger  */
2031739a20eSAndy Ritger NvBool
knvlinkIsNvlinkP2pSupported_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)2041739a20eSAndy Ritger knvlinkIsNvlinkP2pSupported_IMPL
2051739a20eSAndy Ritger (
2061739a20eSAndy Ritger     OBJGPU       *pGpu,
2071739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
2081739a20eSAndy Ritger     OBJGPU       *pPeerGpu
2091739a20eSAndy Ritger )
2101739a20eSAndy Ritger {
2111739a20eSAndy Ritger     NV_STATUS status = NV_OK;
2121739a20eSAndy Ritger 
2131739a20eSAndy Ritger     if (pKernelNvlink == NULL)
2141739a20eSAndy Ritger     {
2151739a20eSAndy Ritger         return NV_FALSE;
2161739a20eSAndy Ritger     }
2171739a20eSAndy Ritger 
2184397463eSAndy Ritger     if (knvlinkIsBandwidthModeOff(pKernelNvlink))
2194397463eSAndy Ritger     {
2204397463eSAndy Ritger         return NV_FALSE;
2214397463eSAndy Ritger     }
2224397463eSAndy Ritger 
2231739a20eSAndy Ritger     // Get the Nvlink P2P connections from the core library
2241739a20eSAndy Ritger     status = knvlinkGetP2pConnectionStatus(pGpu, pKernelNvlink, pPeerGpu);
2251739a20eSAndy Ritger 
2261739a20eSAndy Ritger     if (status == NV_OK)
2271739a20eSAndy Ritger     {
2281739a20eSAndy Ritger         return NV_TRUE;
2291739a20eSAndy Ritger     }
2301739a20eSAndy Ritger 
2311739a20eSAndy Ritger     return NV_FALSE;
2321739a20eSAndy Ritger }
2331739a20eSAndy Ritger 
234b5bf85a8SAndy Ritger static NvBool
_knvlinkCheckFabricCliqueId(OBJGPU * pGpu,OBJGPU * pPeerGpu)235b5bf85a8SAndy Ritger _knvlinkCheckFabricCliqueId
236b5bf85a8SAndy Ritger (
237b5bf85a8SAndy Ritger     OBJGPU       *pGpu,
238b5bf85a8SAndy Ritger     OBJGPU       *pPeerGpu
239b5bf85a8SAndy Ritger )
240b5bf85a8SAndy Ritger {
241b5bf85a8SAndy Ritger     NvU32 cliqueId, peerCliqueId;
242b5bf85a8SAndy Ritger     NV_STATUS status;
243b5bf85a8SAndy Ritger 
244b5bf85a8SAndy Ritger     status = gpuFabricProbeGetFabricCliqueId(pGpu->pGpuFabricProbeInfoKernel,
245b5bf85a8SAndy Ritger                                              &cliqueId);
246b5bf85a8SAndy Ritger     if (status != NV_OK)
247b5bf85a8SAndy Ritger     {
248b5bf85a8SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "GPU %d failed to get fabric clique Id: 0x%x\n",
249b5bf85a8SAndy Ritger                                 gpuGetInstance(pGpu), status);
250b5bf85a8SAndy Ritger         return NV_FALSE;
251b5bf85a8SAndy Ritger     }
252b5bf85a8SAndy Ritger 
253b5bf85a8SAndy Ritger     status = gpuFabricProbeGetFabricCliqueId(pPeerGpu->pGpuFabricProbeInfoKernel,
254b5bf85a8SAndy Ritger                                              &peerCliqueId);
255b5bf85a8SAndy Ritger     if (status != NV_OK)
256b5bf85a8SAndy Ritger     {
257b5bf85a8SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "GPU %d failed to get fabric clique Id 0x%x\n",
258b5bf85a8SAndy Ritger                                 gpuGetInstance(pPeerGpu), status);
259b5bf85a8SAndy Ritger         return NV_FALSE;
260b5bf85a8SAndy Ritger     }
261b5bf85a8SAndy Ritger 
262b5bf85a8SAndy Ritger     if (cliqueId != peerCliqueId)
263b5bf85a8SAndy Ritger     {
264b5bf85a8SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "GPU %d and Peer GPU %d cliqueId doesn't match\n",
265b5bf85a8SAndy Ritger                   gpuGetInstance(pGpu), gpuGetInstance(pPeerGpu));
266b5bf85a8SAndy Ritger         return NV_FALSE;
267b5bf85a8SAndy Ritger     }
268b5bf85a8SAndy Ritger 
269b5bf85a8SAndy Ritger     return NV_TRUE;
270b5bf85a8SAndy Ritger }
271b5bf85a8SAndy Ritger 
2721739a20eSAndy Ritger /*!
273ea4c27faSBernhard Stoeckner  * @brief Checks whether EGM addresses are valid for P2P
274ea4c27faSBernhard Stoeckner  * when GPU is connected to NVSwitch
275ea4c27faSBernhard Stoeckner  *
276ea4c27faSBernhard Stoeckner  * @param[in] pGpu           OBJGPU pointer for local GPU
277ea4c27faSBernhard Stoeckner  * @param[in] pKernelNvlink  KernelNvlink pointer
278ea4c27faSBernhard Stoeckner  * @param[in] pPeerGpu       OBJGPU pointer for remote GPU
279ea4c27faSBernhard Stoeckner  *
280ea4c27faSBernhard Stoeckner  * @return  NV_TRUE if EGM addresses are valid
281ea4c27faSBernhard Stoeckner  */
282ea4c27faSBernhard Stoeckner static NvBool
_knvlinkCheckNvswitchEgmAddressSanity(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)283ea4c27faSBernhard Stoeckner _knvlinkCheckNvswitchEgmAddressSanity
284ea4c27faSBernhard Stoeckner (
285ea4c27faSBernhard Stoeckner     OBJGPU       *pGpu,
286ea4c27faSBernhard Stoeckner     KernelNvlink *pKernelNvlink,
287ea4c27faSBernhard Stoeckner     OBJGPU       *pPeerGpu
288ea4c27faSBernhard Stoeckner )
289ea4c27faSBernhard Stoeckner {
290ea4c27faSBernhard Stoeckner     NvU64 egmRangeStart = knvlinkGetUniqueFabricEgmBaseAddress(pGpu, pKernelNvlink);
291ea4c27faSBernhard Stoeckner 
292ea4c27faSBernhard Stoeckner     if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink))
293ea4c27faSBernhard Stoeckner     {
294ea4c27faSBernhard Stoeckner         if (gpuIsSriovEnabled(pGpu))
295ea4c27faSBernhard Stoeckner         {
296ea4c27faSBernhard Stoeckner             // currently vgpu + switch doesn't support GPA addressing.
297ea4c27faSBernhard Stoeckner             return NV_TRUE;
298ea4c27faSBernhard Stoeckner         }
299ea4c27faSBernhard Stoeckner 
300ea4c27faSBernhard Stoeckner         if (gpuFabricProbeIsSupported(pGpu) && gpuFabricProbeIsSupported(pPeerGpu))
301ea4c27faSBernhard Stoeckner         {
302ea4c27faSBernhard Stoeckner             if (!_knvlinkCheckFabricCliqueId(pGpu, pPeerGpu))
303ea4c27faSBernhard Stoeckner             {
304ea4c27faSBernhard Stoeckner                 return NV_FALSE;
305ea4c27faSBernhard Stoeckner             }
306ea4c27faSBernhard Stoeckner         }
307ea4c27faSBernhard Stoeckner 
308ea4c27faSBernhard Stoeckner         // Sanity checks for EGM address
309ea4c27faSBernhard Stoeckner         if (egmRangeStart == NVLINK_INVALID_FABRIC_ADDR)
310ea4c27faSBernhard Stoeckner         {
311ea4c27faSBernhard Stoeckner             NV_PRINTF(LEVEL_ERROR, "GPU %d doesn't have a EGM fabric address\n",
312ea4c27faSBernhard Stoeckner                       gpuGetInstance(pGpu));
313ea4c27faSBernhard Stoeckner 
314ea4c27faSBernhard Stoeckner             return NV_FALSE;
315ea4c27faSBernhard Stoeckner         }
316ea4c27faSBernhard Stoeckner     }
317ea4c27faSBernhard Stoeckner     else
318ea4c27faSBernhard Stoeckner     {
319ea4c27faSBernhard Stoeckner         // Sanity check for EGM address
320ea4c27faSBernhard Stoeckner         if (egmRangeStart != NVLINK_INVALID_FABRIC_ADDR)
321ea4c27faSBernhard Stoeckner         {
322ea4c27faSBernhard Stoeckner             NV_PRINTF(LEVEL_ERROR,
323ea4c27faSBernhard Stoeckner                       "non-NVSwitch GPU %d has a valid EGM fabric address\n",
324ea4c27faSBernhard Stoeckner                       gpuGetInstance(pGpu));
325ea4c27faSBernhard Stoeckner 
326ea4c27faSBernhard Stoeckner             return NV_FALSE;
327ea4c27faSBernhard Stoeckner         }
328ea4c27faSBernhard Stoeckner 
329ea4c27faSBernhard Stoeckner     }
330ea4c27faSBernhard Stoeckner 
331ea4c27faSBernhard Stoeckner     return NV_TRUE;
332ea4c27faSBernhard Stoeckner }
333ea4c27faSBernhard Stoeckner 
334ea4c27faSBernhard Stoeckner /*!
3351739a20eSAndy Ritger  * @brief Checks whether necessary the config setup is done to
3361739a20eSAndy Ritger  *        support P2P over NVSwitch
3371739a20eSAndy Ritger  *
3381739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer for local GPU
3391739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
3401739a20eSAndy Ritger  * @param[in] pPeerGpu       OBJGPU pointer for remote GPU
3411739a20eSAndy Ritger  *
3421739a20eSAndy Ritger  * @return  NV_TRUE if P2P over NVSwitch
3431739a20eSAndy Ritger  */
3441739a20eSAndy Ritger NvBool
knvlinkCheckNvswitchP2pConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)3451739a20eSAndy Ritger knvlinkCheckNvswitchP2pConfig_IMPL
3461739a20eSAndy Ritger (
3471739a20eSAndy Ritger     OBJGPU       *pGpu,
3481739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
3491739a20eSAndy Ritger     OBJGPU       *pPeerGpu
3501739a20eSAndy Ritger )
3511739a20eSAndy Ritger {
3521739a20eSAndy Ritger     MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
353ea4c27faSBernhard Stoeckner     NvU64          hbmRangeStart  = knvlinkGetUniqueFabricBaseAddress(pGpu, pKernelNvlink);
354ea4c27faSBernhard Stoeckner     NvU64          hbmRangeEnd    = hbmRangeStart + (pMemoryManager->Ram.fbTotalMemSizeMb << 20);
355ea4c27faSBernhard Stoeckner     NvU64          hbmPeerRangeStart = knvlinkGetUniqueFabricBaseAddress(pPeerGpu,
3561739a20eSAndy Ritger                                         GPU_GET_KERNEL_NVLINK(pPeerGpu));
3571739a20eSAndy Ritger 
3581739a20eSAndy Ritger     if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink))
3591739a20eSAndy Ritger     {
3601739a20eSAndy Ritger         if (gpuIsSriovEnabled(pGpu))
3611739a20eSAndy Ritger         {
3621739a20eSAndy Ritger             // currently vgpu + switch doesn't support GPA addresing.
3631739a20eSAndy Ritger             return NV_TRUE;
3641739a20eSAndy Ritger         }
3651739a20eSAndy Ritger 
366b5bf85a8SAndy Ritger         if (gpuFabricProbeIsSupported(pGpu) && gpuFabricProbeIsSupported(pPeerGpu))
367b5bf85a8SAndy Ritger         {
368b5bf85a8SAndy Ritger             if (!_knvlinkCheckFabricCliqueId(pGpu, pPeerGpu))
369b5bf85a8SAndy Ritger             {
370b5bf85a8SAndy Ritger                 return NV_FALSE;
371b5bf85a8SAndy Ritger             }
372b5bf85a8SAndy Ritger         }
373b5bf85a8SAndy Ritger 
374ea4c27faSBernhard Stoeckner         // Sanity checks for HBM addresses
375ea4c27faSBernhard Stoeckner         if (hbmRangeStart == NVLINK_INVALID_FABRIC_ADDR)
3761739a20eSAndy Ritger         {
3771739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR, "GPU %d doesn't have a fabric address\n",
3781739a20eSAndy Ritger                       gpuGetInstance(pGpu));
3791739a20eSAndy Ritger 
3801739a20eSAndy Ritger             return NV_FALSE;
3811739a20eSAndy Ritger         }
3821739a20eSAndy Ritger 
3831739a20eSAndy Ritger         if ((pGpu != pPeerGpu) &&
384ea4c27faSBernhard Stoeckner             ((hbmPeerRangeStart >= hbmRangeStart) && (hbmPeerRangeStart < hbmRangeEnd)))
3851739a20eSAndy Ritger         {
3861739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR,
3871739a20eSAndy Ritger                       "GPU %d doesn't have a unique fabric address\n",
3881739a20eSAndy Ritger                       gpuGetInstance(pGpu));
3891739a20eSAndy Ritger 
3901739a20eSAndy Ritger             return NV_FALSE;
3911739a20eSAndy Ritger         }
3921739a20eSAndy Ritger     }
3931739a20eSAndy Ritger     else
3941739a20eSAndy Ritger     {
395ea4c27faSBernhard Stoeckner         // Sanity check for HBM address
396ea4c27faSBernhard Stoeckner         if (hbmRangeStart != NVLINK_INVALID_FABRIC_ADDR)
3971739a20eSAndy Ritger         {
3981739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR,
3991739a20eSAndy Ritger                       "non-NVSwitch GPU %d has a valid fabric address\n",
4001739a20eSAndy Ritger                       gpuGetInstance(pGpu));
4011739a20eSAndy Ritger 
4021739a20eSAndy Ritger             return NV_FALSE;
4031739a20eSAndy Ritger         }
4041739a20eSAndy Ritger     }
4051739a20eSAndy Ritger 
406ea4c27faSBernhard Stoeckner     if (memmgrIsLocalEgmEnabled(pMemoryManager))
407ea4c27faSBernhard Stoeckner     {
408ea4c27faSBernhard Stoeckner         return _knvlinkCheckNvswitchEgmAddressSanity(pGpu, pKernelNvlink, pPeerGpu);
409ea4c27faSBernhard Stoeckner     }
410ea4c27faSBernhard Stoeckner 
4111739a20eSAndy Ritger     return NV_TRUE;
4121739a20eSAndy Ritger }
4131739a20eSAndy Ritger 
4141739a20eSAndy Ritger /*!
4151739a20eSAndy Ritger  * @brief Get Nvlink P2P connections between 2 GPUs
4161739a20eSAndy Ritger  *
4171739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer for local GPU
4181739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
4191739a20eSAndy Ritger  * @param[in] pPeerGpu       OBJGPU pointer for remote GPU
4201739a20eSAndy Ritger  *
4211739a20eSAndy Ritger  * @return  NV_OK if P2P connections are present
4221739a20eSAndy Ritger  */
4231739a20eSAndy Ritger NV_STATUS
knvlinkGetP2pConnectionStatus_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)4241739a20eSAndy Ritger knvlinkGetP2pConnectionStatus_IMPL
4251739a20eSAndy Ritger (
4261739a20eSAndy Ritger     OBJGPU       *pGpu,
4271739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
4281739a20eSAndy Ritger     OBJGPU       *pPeerGpu
4291739a20eSAndy Ritger )
4301739a20eSAndy Ritger {
4311739a20eSAndy Ritger     NV_STATUS     status         = NV_OK;
4321739a20eSAndy Ritger     OBJGPU       *pGpu0          = pGpu;
4331739a20eSAndy Ritger     OBJGPU       *pGpu1          = pPeerGpu;
4341739a20eSAndy Ritger     KernelNvlink *pKernelNvlink0 = pKernelNvlink;
4351739a20eSAndy Ritger     KernelNvlink *pKernelNvlink1 = NULL;
4361739a20eSAndy Ritger     NvU32         numPeerLinks   = 0;
4371739a20eSAndy Ritger 
4381739a20eSAndy Ritger     if (pGpu1 == NULL)
4391739a20eSAndy Ritger     {
440b5bf85a8SAndy Ritger         NV_PRINTF(LEVEL_INFO, "Invalid pPeerGpu.\n");
4411739a20eSAndy Ritger 
4421739a20eSAndy Ritger         return NV_ERR_INVALID_ARGUMENT;
4431739a20eSAndy Ritger     }
4441739a20eSAndy Ritger     else if ((pGpu0 == pGpu1) &&
4451739a20eSAndy Ritger              (pGpu0->getProperty(pGpu0, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED)))
4461739a20eSAndy Ritger     {
4471739a20eSAndy Ritger         // P2P over loopback links are disabled through regkey overrides
4481739a20eSAndy Ritger         NV_PRINTF(LEVEL_INFO, "loopback P2P on GPU%u disabled by regkey\n",
4491739a20eSAndy Ritger                   gpuGetInstance(pGpu0));
4501739a20eSAndy Ritger 
4511739a20eSAndy Ritger         return NV_ERR_NOT_SUPPORTED;
4521739a20eSAndy Ritger     }
4531739a20eSAndy Ritger     else
4541739a20eSAndy Ritger     {
4551739a20eSAndy Ritger         pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1);
4561739a20eSAndy Ritger     }
4571739a20eSAndy Ritger 
4581739a20eSAndy Ritger     if (pKernelNvlink1 == NULL)
4591739a20eSAndy Ritger     {
460b5bf85a8SAndy Ritger         NV_PRINTF(LEVEL_INFO,
4611739a20eSAndy Ritger                   "Input mask contains a GPU on which NVLink is disabled.\n");
4621739a20eSAndy Ritger 
4631739a20eSAndy Ritger         return NV_ERR_INVALID_ARGUMENT;
4641739a20eSAndy Ritger     }
4651739a20eSAndy Ritger 
466758b4ee8SAndy Ritger     if(pKernelNvlink0->bIsGpuDegraded)
467758b4ee8SAndy Ritger     {
468758b4ee8SAndy Ritger         NV_PRINTF(LEVEL_INFO,
469758b4ee8SAndy Ritger                   "NVLink P2P is NOT supported between GPU%d and GPU%d\n",
470758b4ee8SAndy Ritger                   gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
471758b4ee8SAndy Ritger 
472758b4ee8SAndy Ritger         return NV_ERR_NOT_SUPPORTED;
473758b4ee8SAndy Ritger     }
474758b4ee8SAndy Ritger 
475758b4ee8SAndy Ritger     if(pKernelNvlink1->bIsGpuDegraded)
476758b4ee8SAndy Ritger     {
477758b4ee8SAndy Ritger         NV_PRINTF(LEVEL_INFO,
478758b4ee8SAndy Ritger                   "NVLink P2P is NOT supported between GPU%d and GPU%d\n",
479758b4ee8SAndy Ritger                   gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
480758b4ee8SAndy Ritger 
481758b4ee8SAndy Ritger         return NV_ERR_NOT_SUPPORTED;
482758b4ee8SAndy Ritger     }
483758b4ee8SAndy Ritger 
4841739a20eSAndy Ritger     if ((IS_RTLSIM(pGpu0) && !pKernelNvlink0->bForceEnableCoreLibRtlsims) ||
4851739a20eSAndy Ritger         knvlinkIsForcedConfig(pGpu0, pKernelNvlink0))
4861739a20eSAndy Ritger     {
4871739a20eSAndy Ritger         // For non-legacy configs.
4881739a20eSAndy Ritger         if (pKernelNvlink0->bChiplibConfig)
4891739a20eSAndy Ritger         {
4901739a20eSAndy Ritger             NV_PRINTF(LEVEL_INFO,
4911739a20eSAndy Ritger                       "NVLink P2P is supported between GPU%d and GPU%d\n",
4921739a20eSAndy Ritger                       gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
4931739a20eSAndy Ritger 
4941739a20eSAndy Ritger             return NV_OK;
4951739a20eSAndy Ritger         }
4961739a20eSAndy Ritger     }
4971739a20eSAndy Ritger 
4981739a20eSAndy Ritger     // Get the remote ends of the links of local GPU from the nvlink core
4994397463eSAndy Ritger     status = knvlinkCoreGetRemoteDeviceInfo(pGpu0, pKernelNvlink0);
5004397463eSAndy Ritger     if (status != NV_OK)
5014397463eSAndy Ritger     {
5024397463eSAndy Ritger         return status;
5034397463eSAndy Ritger     }
5041739a20eSAndy Ritger 
5051739a20eSAndy Ritger     // Post topology link enable on links of local GPU
5061739a20eSAndy Ritger     status = knvlinkEnableLinksPostTopology_HAL(pGpu0, pKernelNvlink0,
5071739a20eSAndy Ritger                                                 pKernelNvlink0->enabledLinks);
5081739a20eSAndy Ritger     if (status != NV_OK)
5091739a20eSAndy Ritger     {
5101739a20eSAndy Ritger         return status;
5111739a20eSAndy Ritger     }
5121739a20eSAndy Ritger 
5131739a20eSAndy Ritger     numPeerLinks = knvlinkGetNumLinksToPeer(pGpu0, pKernelNvlink0, pGpu1);
514758b4ee8SAndy Ritger 
515758b4ee8SAndy Ritger     //
516758b4ee8SAndy Ritger     // Maybe knvlinkCoreGetRemoteDeviceInfo was never called on pGpu1.
517758b4ee8SAndy Ritger     // This can happen on systems where FM doesn't configure GPUs
518758b4ee8SAndy Ritger     // using RM control calls explicitly.
519758b4ee8SAndy Ritger     //
520758b4ee8SAndy Ritger     if ((numPeerLinks == 0) && gpuFabricProbeIsSupported(pGpu1))
521758b4ee8SAndy Ritger     {
522758b4ee8SAndy Ritger         knvlinkCoreGetRemoteDeviceInfo(pGpu1, pKernelNvlink1);
523758b4ee8SAndy Ritger 
524758b4ee8SAndy Ritger         // Post topology link enable on links of remote GPU
525758b4ee8SAndy Ritger         status = knvlinkEnableLinksPostTopology_HAL(pGpu1, pKernelNvlink1,
526758b4ee8SAndy Ritger                                                     pKernelNvlink1->enabledLinks);
527758b4ee8SAndy Ritger         if (status != NV_OK)
528758b4ee8SAndy Ritger         {
529758b4ee8SAndy Ritger             return status;
530758b4ee8SAndy Ritger         }
531758b4ee8SAndy Ritger 
532758b4ee8SAndy Ritger         numPeerLinks = knvlinkGetNumLinksToPeer(pGpu0, pKernelNvlink0, pGpu1);
533758b4ee8SAndy Ritger     }
534758b4ee8SAndy Ritger 
5351739a20eSAndy Ritger     if (numPeerLinks > 0)
5361739a20eSAndy Ritger     {
5371739a20eSAndy Ritger         if (knvlinkGetNumLinksToPeer(pGpu1, pKernelNvlink1, pGpu0) != numPeerLinks)
5381739a20eSAndy Ritger         {
5391739a20eSAndy Ritger             // Get the remote ends of the links of remote GPU from the nvlink core
5404397463eSAndy Ritger             status = knvlinkCoreGetRemoteDeviceInfo(pGpu1, pKernelNvlink1);
5414397463eSAndy Ritger             if (status != NV_OK)
5424397463eSAndy Ritger             {
5434397463eSAndy Ritger                 return status;
5444397463eSAndy Ritger             }
5451739a20eSAndy Ritger 
5461739a20eSAndy Ritger             // Post topology link enable on links of remote GPU
5471739a20eSAndy Ritger             status = knvlinkEnableLinksPostTopology_HAL(pGpu1, pKernelNvlink1,
5481739a20eSAndy Ritger                                                         pKernelNvlink1->enabledLinks);
5491739a20eSAndy Ritger             if (status != NV_OK)
5501739a20eSAndy Ritger             {
5511739a20eSAndy Ritger                 return status;
5521739a20eSAndy Ritger             }
5531739a20eSAndy Ritger         }
5541739a20eSAndy Ritger 
5551739a20eSAndy Ritger         // Peers should have the same number of links pointing back at us
556b5bf85a8SAndy Ritger         NV_CHECK_OR_RETURN(LEVEL_INFO,
557b5bf85a8SAndy Ritger             (knvlinkGetNumLinksToPeer(pGpu1, pKernelNvlink1, pGpu0) == numPeerLinks),
5581739a20eSAndy Ritger             NV_ERR_INVALID_STATE);
5591739a20eSAndy Ritger 
560b5bf85a8SAndy Ritger         NV_CHECK_OR_RETURN(LEVEL_INFO,
561b5bf85a8SAndy Ritger                 knvlinkCheckNvswitchP2pConfig(pGpu0, pKernelNvlink0, pGpu1),
562b5bf85a8SAndy Ritger                 NV_ERR_INVALID_STATE);
563b5bf85a8SAndy Ritger 
564b5bf85a8SAndy Ritger         NV_CHECK_OR_RETURN(LEVEL_INFO,
565b5bf85a8SAndy Ritger                 knvlinkCheckNvswitchP2pConfig(pGpu1, pKernelNvlink1, pGpu0),
5661739a20eSAndy Ritger                 NV_ERR_INVALID_STATE);
5671739a20eSAndy Ritger 
5681739a20eSAndy Ritger         NV_PRINTF(LEVEL_INFO,
5691739a20eSAndy Ritger                   "NVLink P2P is supported between GPU%d and GPU%d\n",
5701739a20eSAndy Ritger                   gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
5711739a20eSAndy Ritger 
5721739a20eSAndy Ritger         return NV_OK;
5731739a20eSAndy Ritger     }
5741739a20eSAndy Ritger 
5751739a20eSAndy Ritger     NV_PRINTF(LEVEL_INFO,
5761739a20eSAndy Ritger               "NVLink P2P is NOT supported between between GPU%d and GPU%d\n",
5771739a20eSAndy Ritger               pGpu->gpuInstance, pGpu1->gpuInstance);
5781739a20eSAndy Ritger 
5791739a20eSAndy Ritger     return NV_ERR_NOT_SUPPORTED;
5801739a20eSAndy Ritger }
5811739a20eSAndy Ritger 
5821739a20eSAndy Ritger /*!
5831739a20eSAndy Ritger  * @brief Update the settings for the current established NVLink
5841739a20eSAndy Ritger  *        topology. This is the top level function that should be
5851739a20eSAndy Ritger  *        called, instead of applying the settings individually,
5861739a20eSAndy Ritger  *        since it grabs the required locks
5871739a20eSAndy Ritger  *
5881739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
5891739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
5901739a20eSAndy Ritger  *
5911739a20eSAndy Ritger  * @return  NV_OK on success
5921739a20eSAndy Ritger  */
5931739a20eSAndy Ritger NV_STATUS
knvlinkUpdateCurrentConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)5941739a20eSAndy Ritger knvlinkUpdateCurrentConfig_IMPL
5951739a20eSAndy Ritger (
5961739a20eSAndy Ritger     OBJGPU       *pGpu,
5971739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
5981739a20eSAndy Ritger )
5991739a20eSAndy Ritger {
6001739a20eSAndy Ritger     OBJSYS    *pSys      = SYS_GET_INSTANCE();
6011739a20eSAndy Ritger     KernelCE  *pKCe      = NULL;
6021739a20eSAndy Ritger     NvBool     bOwnsLock = NV_FALSE;
6031739a20eSAndy Ritger     NV_STATUS  status    = NV_OK;
6041739a20eSAndy Ritger 
6051739a20eSAndy Ritger     if (osAcquireRmSema(pSys->pSema) == NV_OK)
6061739a20eSAndy Ritger     {
6071739a20eSAndy Ritger         //
6081739a20eSAndy Ritger         // XXX Bug 1795328: Fix P2P path to acquire locks for the GPU
6091739a20eSAndy Ritger         //  Due to platform differences in the P2P path, the GPU lock is not
6101739a20eSAndy Ritger         //  consistently held at this point in the call stack. This function
6111739a20eSAndy Ritger         //  requires exclusive access to RM/PMU data structures to update HSHUB,
6121739a20eSAndy Ritger         //  and therefore requires the GPU lock to be held at this point.
6131739a20eSAndy Ritger         //  This check should be removed once the P2P paths have been updated to
6141739a20eSAndy Ritger         //  acquire the GPU locks consistently for all platforms.
6151739a20eSAndy Ritger         //
6161739a20eSAndy Ritger         if (!rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
6171739a20eSAndy Ritger         {
6181739a20eSAndy Ritger             status = rmDeviceGpuLocksAcquire(pGpu, GPUS_LOCK_FLAGS_NONE,
6191739a20eSAndy Ritger                                              RM_LOCK_MODULES_NVLINK);
6201739a20eSAndy Ritger             if (status != NV_OK)
6211739a20eSAndy Ritger             {
6221739a20eSAndy Ritger                 NV_ASSERT(0);
6231739a20eSAndy Ritger                 goto fail;
6241739a20eSAndy Ritger             }
6251739a20eSAndy Ritger 
6261739a20eSAndy Ritger             bOwnsLock = NV_TRUE;
6271739a20eSAndy Ritger         }
6281739a20eSAndy Ritger 
6291739a20eSAndy Ritger         //
6301739a20eSAndy Ritger         // Links that have remote end detected should have passed RXDET
6311739a20eSAndy Ritger         // Update the mask of connected links and bridged links
6321739a20eSAndy Ritger         //
6331739a20eSAndy Ritger         knvlinkFilterBridgeLinks_HAL(pGpu, pKernelNvlink);
6341739a20eSAndy Ritger 
6351739a20eSAndy Ritger         NV2080_CTRL_NVLINK_UPDATE_CURRENT_CONFIG_PARAMS params;
6361739a20eSAndy Ritger         portMemSet(&params, 0, sizeof(params));
6371739a20eSAndy Ritger 
6381739a20eSAndy Ritger         // Reset timeout to clear any accumulated timeouts from link init
6391739a20eSAndy Ritger         if (IS_GSP_CLIENT(pGpu))
6401739a20eSAndy Ritger         {
6411739a20eSAndy Ritger             threadStateResetTimeout(pGpu);
6421739a20eSAndy Ritger         }
6431739a20eSAndy Ritger 
6441739a20eSAndy Ritger         //
6451739a20eSAndy Ritger         // RPC into GSP-RM for programming the HSHUB, CONNECTION_CFG and LTCS
6461739a20eSAndy Ritger         // registers.
6471739a20eSAndy Ritger         //
6481739a20eSAndy Ritger         status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
6491739a20eSAndy Ritger                                      NV2080_CTRL_CMD_NVLINK_UPDATE_CURRENT_CONFIG,
6501739a20eSAndy Ritger                                      (void *)&params, sizeof(params));
6511739a20eSAndy Ritger         if (status != NV_OK)
6521739a20eSAndy Ritger         {
6531739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR, "Updating current NVLink config failed\n");
6541739a20eSAndy Ritger             goto fail;
6551739a20eSAndy Ritger         }
6561739a20eSAndy Ritger 
6571739a20eSAndy Ritger         // Sync the GPU property for NVLINK over SYSMEM with GSP-RM
6581739a20eSAndy Ritger         pGpu->setProperty(pGpu, PDB_PROP_GPU_NVLINK_SYSMEM, params.bNvlinkSysmemEnabled);
6591739a20eSAndy Ritger 
6601739a20eSAndy Ritger         // Update the PCE-LCE mappings
661758b4ee8SAndy Ritger         status = kceFindFirstInstance(pGpu, &pKCe);
662758b4ee8SAndy Ritger         if (status == NV_OK)
6631739a20eSAndy Ritger         {
6641739a20eSAndy Ritger             status = kceTopLevelPceLceMappingsUpdate(pGpu, pKCe);
6651739a20eSAndy Ritger             if (status != NV_OK)
6661739a20eSAndy Ritger             {
6671739a20eSAndy Ritger                 NV_PRINTF(LEVEL_ERROR, "Failed to update PCE-LCE mappings\n");
6681739a20eSAndy Ritger             }
6691739a20eSAndy Ritger         }
6701739a20eSAndy Ritger 
6711739a20eSAndy Ritger fail:
6721739a20eSAndy Ritger         if (bOwnsLock)
6731739a20eSAndy Ritger         {
6741739a20eSAndy Ritger             rmDeviceGpuLocksRelease(pGpu, GPUS_LOCK_FLAGS_NONE, NULL);
6751739a20eSAndy Ritger         }
6761739a20eSAndy Ritger 
6771739a20eSAndy Ritger         osReleaseRmSema(pSys->pSema, NULL);
6781739a20eSAndy Ritger     }
6791739a20eSAndy Ritger 
6801739a20eSAndy Ritger     return status;
6811739a20eSAndy Ritger }
6821739a20eSAndy Ritger 
68391676d66SBernhard Stoeckner const static NVLINK_INBAND_MSG_CALLBACK nvlink_inband_callbacks[] =
684758b4ee8SAndy Ritger {
685758b4ee8SAndy Ritger     {
68691676d66SBernhard Stoeckner         .messageType = NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP,
68791676d66SBernhard Stoeckner         .pCallback = gpuFabricProbeReceiveKernelCallback,
68891676d66SBernhard Stoeckner         .wqItemFlags = OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
68991676d66SBernhard Stoeckner                        OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW
69091676d66SBernhard Stoeckner     },
691758b4ee8SAndy Ritger 
692758b4ee8SAndy Ritger     {
69391676d66SBernhard Stoeckner         .messageType = NVLINK_INBAND_MSG_TYPE_MC_TEAM_SETUP_RSP,
69491676d66SBernhard Stoeckner         .pCallback = memorymulticastfabricTeamSetupResponseCallback,
69591676d66SBernhard Stoeckner         .wqItemFlags = OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
69691676d66SBernhard Stoeckner                        OS_QUEUE_WORKITEM_FLAGS_LOCK_GPUS_RW
69791676d66SBernhard Stoeckner     },
698758b4ee8SAndy Ritger 
699758b4ee8SAndy Ritger     {
70091676d66SBernhard Stoeckner         .messageType = NVLINK_INBAND_MSG_TYPE_GPU_PROBE_UPDATE_REQ,
70191676d66SBernhard Stoeckner         .pCallback = gpuFabricProbeReceiveUpdateKernelCallback,
70291676d66SBernhard Stoeckner         .wqItemFlags = OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
70391676d66SBernhard Stoeckner                        OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW
704758b4ee8SAndy Ritger     }
70591676d66SBernhard Stoeckner };
706758b4ee8SAndy Ritger 
707758b4ee8SAndy Ritger void
knvlinkInbandMsgCallbackDispatcher_WORKITEM(NvU32 gpuInstance,void * pData)708758b4ee8SAndy Ritger knvlinkInbandMsgCallbackDispatcher_WORKITEM
709758b4ee8SAndy Ritger (
710758b4ee8SAndy Ritger     NvU32 gpuInstance,
711758b4ee8SAndy Ritger     void *pData
712758b4ee8SAndy Ritger )
713758b4ee8SAndy Ritger {
714758b4ee8SAndy Ritger     nvlink_inband_msg_header_t *pHeader;
715758b4ee8SAndy Ritger     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pMessage = pData;
71691676d66SBernhard Stoeckner     NvU8 i;
71791676d66SBernhard Stoeckner     const NVLINK_INBAND_MSG_CALLBACK *pCb = NULL;
718758b4ee8SAndy Ritger 
71991676d66SBernhard Stoeckner     // Dispatcher may not be called under GPU lock, so don't access pGpu.
720758b4ee8SAndy Ritger 
721758b4ee8SAndy Ritger     pHeader = (nvlink_inband_msg_header_t *)pMessage->data;
722758b4ee8SAndy Ritger 
72391676d66SBernhard Stoeckner     for (i = 0; i < NV_ARRAY_ELEMENTS(nvlink_inband_callbacks); i++)
72491676d66SBernhard Stoeckner     {
72591676d66SBernhard Stoeckner         if ((nvlink_inband_callbacks[i].messageType == pHeader->type) &&
72691676d66SBernhard Stoeckner             (nvlink_inband_callbacks[i].pCallback != NULL))
72791676d66SBernhard Stoeckner         {
72891676d66SBernhard Stoeckner             pCb = &nvlink_inband_callbacks[i];
72991676d66SBernhard Stoeckner             break;
73091676d66SBernhard Stoeckner         }
73191676d66SBernhard Stoeckner     }
73291676d66SBernhard Stoeckner 
73391676d66SBernhard Stoeckner     if (pCb == NULL)
734eb5c7665SAndy Ritger     {
735eb5c7665SAndy Ritger         NV_PRINTF(LEVEL_ERROR,
736eb5c7665SAndy Ritger                   "No Callback Registered for type %d. Dropping the msg\n",
737eb5c7665SAndy Ritger                   pHeader->type);
738eb5c7665SAndy Ritger         return;
739eb5c7665SAndy Ritger     }
740eb5c7665SAndy Ritger 
74191676d66SBernhard Stoeckner #if defined(DEBUG) || defined(DEVELOP)
74291676d66SBernhard Stoeckner     {
74391676d66SBernhard Stoeckner         NvU8 *pRsvd = NULL;
74491676d66SBernhard Stoeckner 
745758b4ee8SAndy Ritger         // Assert reserved in msgHdr are zero
746758b4ee8SAndy Ritger         pRsvd = &pHeader->reserved[0];
747758b4ee8SAndy Ritger         NV_ASSERT((pRsvd[0] == 0) && portMemCmp(pRsvd, pRsvd + 1,
748758b4ee8SAndy Ritger                   sizeof(pHeader->reserved) - 1) == 0);
74991676d66SBernhard Stoeckner     }
75091676d66SBernhard Stoeckner #endif
751758b4ee8SAndy Ritger 
75291676d66SBernhard Stoeckner     (void)pCb->pCallback(gpuInstance, NULL, pData);
753758b4ee8SAndy Ritger }
754758b4ee8SAndy Ritger 
755758b4ee8SAndy Ritger NV_STATUS
knvlinkInbandMsgCallbackDispatcher_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 dataSize,NvU8 * pMessage)756758b4ee8SAndy Ritger knvlinkInbandMsgCallbackDispatcher_IMPL
757758b4ee8SAndy Ritger (
758758b4ee8SAndy Ritger     OBJGPU *pGpu,
759758b4ee8SAndy Ritger     KernelNvlink *pKernelNvlink,
760758b4ee8SAndy Ritger     NvU32 dataSize,
761758b4ee8SAndy Ritger     NvU8  *pMessage
762758b4ee8SAndy Ritger )
763758b4ee8SAndy Ritger {
764758b4ee8SAndy Ritger     NV_STATUS status;
765758b4ee8SAndy Ritger     nvlink_inband_msg_header_t *pHeader;
766758b4ee8SAndy Ritger     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pData = NULL;
76791676d66SBernhard Stoeckner     const NVLINK_INBAND_MSG_CALLBACK *pCb = NULL;
76891676d66SBernhard Stoeckner     NvU8 i;
769758b4ee8SAndy Ritger 
770758b4ee8SAndy Ritger     pHeader = (nvlink_inband_msg_header_t *)pMessage;
771758b4ee8SAndy Ritger 
772758b4ee8SAndy Ritger     if (pHeader->type >= NVLINK_INBAND_MSG_TYPE_MAX)
773758b4ee8SAndy Ritger     {
774758b4ee8SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Message type received is Out of Bounds. Dropping  the msg\n");
775758b4ee8SAndy Ritger         return NV_ERR_INVALID_REQUEST;
776758b4ee8SAndy Ritger     }
777758b4ee8SAndy Ritger 
77891676d66SBernhard Stoeckner     for (i = 0; i < NV_ARRAY_ELEMENTS(nvlink_inband_callbacks); i++)
779758b4ee8SAndy Ritger     {
78091676d66SBernhard Stoeckner         if ((nvlink_inband_callbacks[i].messageType == pHeader->type) &&
78191676d66SBernhard Stoeckner             (nvlink_inband_callbacks[i].pCallback != NULL))
78291676d66SBernhard Stoeckner         {
78391676d66SBernhard Stoeckner             pCb = &nvlink_inband_callbacks[i];
78491676d66SBernhard Stoeckner             break;
78591676d66SBernhard Stoeckner         }
78691676d66SBernhard Stoeckner     }
78791676d66SBernhard Stoeckner 
78891676d66SBernhard Stoeckner     if (pCb == NULL)
78991676d66SBernhard Stoeckner     {
79091676d66SBernhard Stoeckner         NV_PRINTF(LEVEL_ERROR,
79191676d66SBernhard Stoeckner                   "No Callback Registered for type %d. Dropping the msg\n",
79291676d66SBernhard Stoeckner                   pHeader->type);
793758b4ee8SAndy Ritger         return NV_ERR_INVALID_REQUEST;
794758b4ee8SAndy Ritger     }
795758b4ee8SAndy Ritger 
796758b4ee8SAndy Ritger     pData = portMemAllocNonPaged(sizeof(NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS));
797758b4ee8SAndy Ritger     if (pData == NULL)
798758b4ee8SAndy Ritger     {
799758b4ee8SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Out of memory, Dropping message\n");
800758b4ee8SAndy Ritger         return NV_ERR_NO_MEMORY;
801758b4ee8SAndy Ritger     }
802758b4ee8SAndy Ritger 
803758b4ee8SAndy Ritger     pData->dataSize = dataSize;
804758b4ee8SAndy Ritger     portMemCopy(pData->data, pData->dataSize, pMessage, dataSize);
805758b4ee8SAndy Ritger 
806b5bf85a8SAndy Ritger     status = osQueueWorkItemWithFlags(pGpu, knvlinkInbandMsgCallbackDispatcher_WORKITEM, pData,
80791676d66SBernhard Stoeckner                                       pCb->wqItemFlags);
808758b4ee8SAndy Ritger      if (status != NV_OK)
809758b4ee8SAndy Ritger      {
810758b4ee8SAndy Ritger         portMemFree(pData);
811758b4ee8SAndy Ritger         return status;
812758b4ee8SAndy Ritger      }
813758b4ee8SAndy Ritger 
814758b4ee8SAndy Ritger      return NV_OK;
815758b4ee8SAndy Ritger }
816758b4ee8SAndy Ritger 
817758b4ee8SAndy Ritger NV_STATUS
knvlinkSendInbandData_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NV2080_CTRL_NVLINK_INBAND_SEND_DATA_PARAMS * pParams)818758b4ee8SAndy Ritger knvlinkSendInbandData_IMPL
819758b4ee8SAndy Ritger (
820758b4ee8SAndy Ritger     OBJGPU       *pGpu,
821758b4ee8SAndy Ritger     KernelNvlink *pKernelNvlink,
822758b4ee8SAndy Ritger     NV2080_CTRL_NVLINK_INBAND_SEND_DATA_PARAMS *pParams
823758b4ee8SAndy Ritger )
824758b4ee8SAndy Ritger {
825758b4ee8SAndy Ritger     NV_STATUS status;
826758b4ee8SAndy Ritger 
827758b4ee8SAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
828758b4ee8SAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_INBAND_SEND_DATA,
829758b4ee8SAndy Ritger                                  (void *)pParams,
830758b4ee8SAndy Ritger                                  sizeof(*pParams));
831758b4ee8SAndy Ritger 
832758b4ee8SAndy Ritger     return status;
833758b4ee8SAndy Ritger }
834758b4ee8SAndy Ritger /*!
8351739a20eSAndy Ritger  * @brief Return the mask of links enabled on the system
8361739a20eSAndy Ritger  *
8371739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
8381739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
8391739a20eSAndy Ritger  */
8401739a20eSAndy Ritger NvU32
knvlinkGetEnabledLinkMask_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)8411739a20eSAndy Ritger knvlinkGetEnabledLinkMask_IMPL
8421739a20eSAndy Ritger (
8431739a20eSAndy Ritger     OBJGPU       *pGpu,
8441739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
8451739a20eSAndy Ritger )
8461739a20eSAndy Ritger {
8471739a20eSAndy Ritger     return pKernelNvlink->enabledLinks;
8481739a20eSAndy Ritger }
8491739a20eSAndy Ritger 
8501739a20eSAndy Ritger /*!
8511739a20eSAndy Ritger  * @brief Return the mask of links discovered on the system
8521739a20eSAndy Ritger  *
8531739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
8541739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
8551739a20eSAndy Ritger  */
8561739a20eSAndy Ritger NvU32
knvlinkGetDiscoveredLinkMask_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)8571739a20eSAndy Ritger knvlinkGetDiscoveredLinkMask_IMPL
8581739a20eSAndy Ritger (
8591739a20eSAndy Ritger     OBJGPU       *pGpu,
8601739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
8611739a20eSAndy Ritger )
8621739a20eSAndy Ritger {
8631739a20eSAndy Ritger     return pKernelNvlink->discoveredLinks;
8641739a20eSAndy Ritger }
8651739a20eSAndy Ritger 
8661739a20eSAndy Ritger /*!
8671739a20eSAndy Ritger  * @brief Returns the number of sysmem links
8681739a20eSAndy Ritger  *
8691739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
8701739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
8711739a20eSAndy Ritger  *
8721739a20eSAndy Ritger  * @return  The #sysmem NVLinks
8731739a20eSAndy Ritger  */
8741739a20eSAndy Ritger NvU32
knvlinkGetNumLinksToSystem_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)8751739a20eSAndy Ritger knvlinkGetNumLinksToSystem_IMPL
8761739a20eSAndy Ritger (
8771739a20eSAndy Ritger     OBJGPU       *pGpu,
8781739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
8791739a20eSAndy Ritger )
8801739a20eSAndy Ritger {
8811739a20eSAndy Ritger     NvU32 numSysmemLinks = pKernelNvlink->sysmemLinkMask;
8821739a20eSAndy Ritger 
8831739a20eSAndy Ritger     if (numSysmemLinks != 0)
8841739a20eSAndy Ritger     {
8851739a20eSAndy Ritger         NUMSETBITS_32(numSysmemLinks);
8861739a20eSAndy Ritger     }
8871739a20eSAndy Ritger 
8881739a20eSAndy Ritger     return numSysmemLinks;
8891739a20eSAndy Ritger }
8901739a20eSAndy Ritger 
8911739a20eSAndy Ritger /*!
8921739a20eSAndy Ritger  * @brief Returns number of peer links to a remote GPU
8931739a20eSAndy Ritger  *
8941739a20eSAndy Ritger  * @param[in] pGpu             OBJGPU pointer of local GPU
8951739a20eSAndy Ritger  * @param[in] pKernelNvlink    KernelNvlink pointer
8961739a20eSAndy Ritger  * @param[in] pRemoteGpu       OBJGPU pointer of remote GPU
8971739a20eSAndy Ritger  *
8981739a20eSAndy Ritger  * @return  The #peer NVLinks to the remote GPU
8991739a20eSAndy Ritger  */
9001739a20eSAndy Ritger NvU32
knvlinkGetNumLinksToPeer_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pRemoteGpu)9011739a20eSAndy Ritger knvlinkGetNumLinksToPeer_IMPL
9021739a20eSAndy Ritger (
9031739a20eSAndy Ritger     OBJGPU       *pGpu,
9041739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
9051739a20eSAndy Ritger     OBJGPU       *pRemoteGpu
9061739a20eSAndy Ritger )
9071739a20eSAndy Ritger {
9081739a20eSAndy Ritger     NvU32 numPeerLinks =
9091739a20eSAndy Ritger         knvlinkGetLinkMaskToPeer(pGpu, pKernelNvlink, pRemoteGpu);
9101739a20eSAndy Ritger 
9111739a20eSAndy Ritger     if (numPeerLinks != 0)
9121739a20eSAndy Ritger     {
9131739a20eSAndy Ritger         NUMSETBITS_32(numPeerLinks);
9141739a20eSAndy Ritger     }
9151739a20eSAndy Ritger 
9161739a20eSAndy Ritger     return numPeerLinks;
9171739a20eSAndy Ritger }
9181739a20eSAndy Ritger 
9191739a20eSAndy Ritger /*!
9201739a20eSAndy Ritger  * @brief Gets the mask of peer links between the GPUs
9211739a20eSAndy Ritger  *
9221739a20eSAndy Ritger  * @param[in] pGpu0           OBJGPU pointer
9231739a20eSAndy Ritger  * @param[in] pKernelNvlink0  Nvlink pointer
9241739a20eSAndy Ritger  * @param[in] pGpu1           Remote OBJGPU pointer
9251739a20eSAndy Ritger  *
9261739a20eSAndy Ritger  * @return    Returns the mask of peer links between the GPUs
9271739a20eSAndy Ritger  */
9281739a20eSAndy Ritger NvU32
knvlinkGetLinkMaskToPeer_IMPL(OBJGPU * pGpu0,KernelNvlink * pKernelNvlink0,OBJGPU * pGpu1)9291739a20eSAndy Ritger knvlinkGetLinkMaskToPeer_IMPL
9301739a20eSAndy Ritger (
9311739a20eSAndy Ritger     OBJGPU       *pGpu0,
9321739a20eSAndy Ritger     KernelNvlink *pKernelNvlink0,
9331739a20eSAndy Ritger     OBJGPU       *pGpu1
9341739a20eSAndy Ritger )
9351739a20eSAndy Ritger {
9361739a20eSAndy Ritger     NvU32 peerLinkMask = 0;
937758b4ee8SAndy Ritger     KernelNvlink *pKernelNvlink1 = NULL;
938758b4ee8SAndy Ritger 
939758b4ee8SAndy Ritger     pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1);
940758b4ee8SAndy Ritger 
941758b4ee8SAndy Ritger     if (pKernelNvlink1 == NULL)
942758b4ee8SAndy Ritger     {
943b5bf85a8SAndy Ritger         NV_PRINTF(LEVEL_INFO,
944758b4ee8SAndy Ritger                   "on GPU%d NVLink is disabled.\n", gpuGetInstance(pGpu1));
945758b4ee8SAndy Ritger 
946758b4ee8SAndy Ritger         return 0;
947758b4ee8SAndy Ritger     }
948758b4ee8SAndy Ritger 
949758b4ee8SAndy Ritger     if(pKernelNvlink0->bIsGpuDegraded)
950758b4ee8SAndy Ritger     {
951758b4ee8SAndy Ritger         return peerLinkMask;
952758b4ee8SAndy Ritger     }
953758b4ee8SAndy Ritger 
954758b4ee8SAndy Ritger     if(pKernelNvlink1->bIsGpuDegraded)
955758b4ee8SAndy Ritger     {
956758b4ee8SAndy Ritger         return peerLinkMask;
957758b4ee8SAndy Ritger     }
9581739a20eSAndy Ritger 
9591739a20eSAndy Ritger     if (!knvlinkIsForcedConfig(pGpu0, pKernelNvlink0))
9601739a20eSAndy Ritger     {
9611739a20eSAndy Ritger         //
9621739a20eSAndy Ritger         // If nvlink topology is not forced, then the hshub registers
9631739a20eSAndy Ritger         // are updated only when a P2P object is allocated. So, return
9641739a20eSAndy Ritger         // the cached value of mask of links connected to a GPU
9651739a20eSAndy Ritger         //
9661739a20eSAndy Ritger         peerLinkMask = pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)];
9671739a20eSAndy Ritger     }
9681739a20eSAndy Ritger 
9691739a20eSAndy Ritger     return peerLinkMask;
9701739a20eSAndy Ritger }
9711739a20eSAndy Ritger 
9721739a20eSAndy Ritger /*!
9731739a20eSAndy Ritger  * @brief Sets the mask of peer links between the GPUs
9741739a20eSAndy Ritger  *
9751739a20eSAndy Ritger  * @param[in] pGpu0           OBJGPU pointer
9761739a20eSAndy Ritger  * @param[in] pKernelNvlink0  Nvlink pointer
9771739a20eSAndy Ritger  * @param[in] pGpu1           Remote OBJGPU pointer
9781739a20eSAndy Ritger  * @param[in] peerLinkMask    Mask of links to the peer GPU
9791739a20eSAndy Ritger  *
9801739a20eSAndy Ritger  * @return    NV_OK on success
9811739a20eSAndy Ritger  */
9821739a20eSAndy Ritger NV_STATUS
knvlinkSetLinkMaskToPeer_IMPL(OBJGPU * pGpu0,KernelNvlink * pKernelNvlink0,OBJGPU * pGpu1,NvU32 peerLinkMask)9831739a20eSAndy Ritger knvlinkSetLinkMaskToPeer_IMPL
9841739a20eSAndy Ritger (
9851739a20eSAndy Ritger     OBJGPU       *pGpu0,
9861739a20eSAndy Ritger     KernelNvlink *pKernelNvlink0,
9871739a20eSAndy Ritger     OBJGPU       *pGpu1,
9881739a20eSAndy Ritger     NvU32         peerLinkMask
9891739a20eSAndy Ritger )
9901739a20eSAndy Ritger {
9911739a20eSAndy Ritger     NV_STATUS status = NV_OK;
9921739a20eSAndy Ritger 
9931739a20eSAndy Ritger     // Return early if no update needed to the peer link mask
9941739a20eSAndy Ritger     if (pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)] == peerLinkMask)
9951739a20eSAndy Ritger         return NV_OK;
9961739a20eSAndy Ritger 
9971739a20eSAndy Ritger     pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)] = peerLinkMask;
9981739a20eSAndy Ritger 
9991739a20eSAndy Ritger     NV2080_CTRL_NVLINK_UPDATE_PEER_LINK_MASK_PARAMS params;
10001739a20eSAndy Ritger 
10011739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
10021739a20eSAndy Ritger     params.gpuInst      = gpuGetInstance(pGpu1);
10031739a20eSAndy Ritger     params.peerLinkMask = peerLinkMask;
10041739a20eSAndy Ritger 
10051739a20eSAndy Ritger     // Reset timeout to clear any accumulated timeouts from link init
10061739a20eSAndy Ritger     if (IS_GSP_CLIENT(pGpu0))
10071739a20eSAndy Ritger     {
10081739a20eSAndy Ritger         threadStateResetTimeout(pGpu0);
10091739a20eSAndy Ritger     }
10101739a20eSAndy Ritger 
10111739a20eSAndy Ritger     // Sync the peerLinkMask with GSP-RM
10121739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu0, pKernelNvlink0,
10131739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_UPDATE_PEER_LINK_MASK,
10141739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
10151739a20eSAndy Ritger     if (status != NV_OK)
10161739a20eSAndy Ritger     {
10171739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
10181739a20eSAndy Ritger                   "Failed to sync peerLinksMask from GPU%d to GPU%d\n",
10191739a20eSAndy Ritger                   gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
10201739a20eSAndy Ritger         return status;
10211739a20eSAndy Ritger     }
10221739a20eSAndy Ritger 
10231739a20eSAndy Ritger     return NV_OK;
10241739a20eSAndy Ritger }
10251739a20eSAndy Ritger 
10261739a20eSAndy Ritger /*!
10271739a20eSAndy Ritger  * @brief Get the mask of links that are peer links
10281739a20eSAndy Ritger  *
10291739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
10301739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
10311739a20eSAndy Ritger  */
10321739a20eSAndy Ritger NvU32
knvlinkGetPeersNvlinkMaskFromHshub_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)10331739a20eSAndy Ritger knvlinkGetPeersNvlinkMaskFromHshub_IMPL
10341739a20eSAndy Ritger (
10351739a20eSAndy Ritger     OBJGPU       *pGpu,
10361739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
10371739a20eSAndy Ritger )
10381739a20eSAndy Ritger {
10391739a20eSAndy Ritger     NV_STATUS status       = NV_OK;
10401739a20eSAndy Ritger     NvU32     peerLinkMask = 0;
10411739a20eSAndy Ritger     NvU32     i;
10421739a20eSAndy Ritger 
10431739a20eSAndy Ritger     NV2080_CTRL_NVLINK_GET_LINK_AND_CLOCK_INFO_PARAMS params;
10441739a20eSAndy Ritger 
10451739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
10461739a20eSAndy Ritger     params.linkMask = pKernelNvlink->enabledLinks;
1047*e45d91deSBernhard Stoeckner     params.bSublinkStateInst = NV_TRUE;
10481739a20eSAndy Ritger 
10491739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
10501739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_GET_LINK_AND_CLOCK_INFO,
10511739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
10521739a20eSAndy Ritger     if (status != NV_OK)
10531739a20eSAndy Ritger         return 0;
10541739a20eSAndy Ritger 
10551739a20eSAndy Ritger     // Scan enabled links for peer connections
10561739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
10571739a20eSAndy Ritger     {
10581739a20eSAndy Ritger         if (params.linkInfo[i].bLinkConnectedToPeer)
10591739a20eSAndy Ritger             peerLinkMask |= NVBIT(i);
10601739a20eSAndy Ritger     }
10611739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK_END;
10621739a20eSAndy Ritger 
10631739a20eSAndy Ritger     return peerLinkMask;
10641739a20eSAndy Ritger }
10651739a20eSAndy Ritger 
10661739a20eSAndy Ritger /*!
10671739a20eSAndy Ritger  * @brief Prepare a GPU's NVLink engine for reset by removing mappings
10681739a20eSAndy Ritger  *        to it from other GPUs.
10691739a20eSAndy Ritger  *
10701739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer
10711739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
10721739a20eSAndy Ritger  *
10731739a20eSAndy Ritger  * return  NV_OK on success
10741739a20eSAndy Ritger  */
10751739a20eSAndy Ritger NV_STATUS
knvlinkPrepareForXVEReset_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvBool bForceShutdown)10761739a20eSAndy Ritger knvlinkPrepareForXVEReset_IMPL
10771739a20eSAndy Ritger (
10781739a20eSAndy Ritger     OBJGPU       *pGpu,
107990eb1077SAndy Ritger     KernelNvlink *pKernelNvlink,
108090eb1077SAndy Ritger     NvBool        bForceShutdown
10811739a20eSAndy Ritger )
10821739a20eSAndy Ritger {
10831739a20eSAndy Ritger     OBJSYS    *pSys      = SYS_GET_INSTANCE();
10841739a20eSAndy Ritger     NV_STATUS  retStatus = NV_OK;
10851739a20eSAndy Ritger     OBJGPU    *pRemoteGpu;
10861739a20eSAndy Ritger     NV_STATUS  status;
10871739a20eSAndy Ritger     NvU32      gpuInstance;
10881739a20eSAndy Ritger     NvU32      gpuMask;
10891739a20eSAndy Ritger 
10901739a20eSAndy Ritger     // This is not supported on forced configs
10911739a20eSAndy Ritger     if (knvlinkIsForcedConfig(pGpu, pKernelNvlink))
10921739a20eSAndy Ritger     {
10931739a20eSAndy Ritger         return NV_OK;
10941739a20eSAndy Ritger     }
10951739a20eSAndy Ritger 
10961739a20eSAndy Ritger     //
10971739a20eSAndy Ritger     // Let fabric manager handle link shutdown/reset if the fabric is managed
10981739a20eSAndy Ritger     // externally.
10991739a20eSAndy Ritger     //
11005f40a5aeSAndy Ritger     if (pKernelNvlink->ipVerNvlink < NVLINK_VERSION_40 &&
11015f40a5aeSAndy Ritger         pSys->getProperty(pSys, PDB_PROP_SYS_FABRIC_IS_EXTERNALLY_MANAGED))
11021739a20eSAndy Ritger     {
11031739a20eSAndy Ritger         NV_PRINTF(LEVEL_INFO,
11041739a20eSAndy Ritger                   "NVLink fabric is externally managed, skipping\n");
11051739a20eSAndy Ritger         return NV_OK;
11061739a20eSAndy Ritger     }
11071739a20eSAndy Ritger 
11081739a20eSAndy Ritger     status = gpumgrGetGpuAttachInfo(NULL, &gpuMask);
11091739a20eSAndy Ritger     NV_ASSERT_OR_RETURN(status == NV_OK, status);
11101739a20eSAndy Ritger 
11111739a20eSAndy Ritger     gpuInstance = 0;
11121739a20eSAndy Ritger     while ((pRemoteGpu = gpumgrGetNextGpu(gpuMask, &gpuInstance)) != NULL)
11131739a20eSAndy Ritger     {
11141739a20eSAndy Ritger         KernelNvlink *pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
11151739a20eSAndy Ritger 
11161739a20eSAndy Ritger         if ((pRemoteGpu == pGpu) || (pRemoteKernelNvlink == NULL) ||
11171739a20eSAndy Ritger             (knvlinkGetNumLinksToPeer(pRemoteGpu, pRemoteKernelNvlink, pGpu) == 0) ||
111890eb1077SAndy Ritger             API_GPU_IN_RESET_SANITY_CHECK(pRemoteGpu) ||
111990eb1077SAndy Ritger             pRemoteGpu->getProperty(pRemoteGpu, PDB_PROP_GPU_IS_LOST))
11201739a20eSAndy Ritger         {
11211739a20eSAndy Ritger             continue;
11221739a20eSAndy Ritger         }
11231739a20eSAndy Ritger 
11241739a20eSAndy Ritger         //
11251739a20eSAndy Ritger         // Reset the peer masks in HSHUB of the remote GPU. Partial resets
11261739a20eSAndy Ritger         // (only removing the links connected to the GPU being reset) don't
11271739a20eSAndy Ritger         // appear to be sufficient. The reset will work fine, but the next
11281739a20eSAndy Ritger         // time we attempt to initialize this GPU, the copy engines will time
11291739a20eSAndy Ritger         // out while scrubbing FB and a GPU sysmembar (NV_UFLUSH_FB_FLUSH) will
11301739a20eSAndy Ritger         // fail to complete.
11311739a20eSAndy Ritger         //
11321739a20eSAndy Ritger         // The above symptoms haven't been root-caused (yet), but the current
11331739a20eSAndy Ritger         // POR for GPU reset is that once one GPU is reset, the others
11341739a20eSAndy Ritger         // connected to it over NVLink must also be reset before using NVLink
11351739a20eSAndy Ritger         // for peer traffic, so just use the big hammer and squash all HSHUB
11361739a20eSAndy Ritger         // configs on GPU reset.
11371739a20eSAndy Ritger         //
11381739a20eSAndy Ritger         // This allows us to reset the GPUs one by one, with GPU
11391739a20eSAndy Ritger         // initializations in between, without hanging up the GPU trying to
11401739a20eSAndy Ritger         // flush data over links that aren't available anymore.
11411739a20eSAndy Ritger         //
114212c07393SBernhard Stoeckner         // Starting from Ampere single GPU reset is supported and hence remove
114312c07393SBernhard Stoeckner         // only the nvlink's of the remote GPU's which are connected to the
114412c07393SBernhard Stoeckner         // current GPU.
114512c07393SBernhard Stoeckner         //
114612c07393SBernhard Stoeckner 
114712c07393SBernhard Stoeckner         if (IsAMPEREorBetter(pGpu))
114812c07393SBernhard Stoeckner         {
114912c07393SBernhard Stoeckner             NvU32 remPeerId = kbusGetPeerId_HAL(pRemoteGpu, GPU_GET_KERNEL_BUS(pRemoteGpu), pGpu);
115012c07393SBernhard Stoeckner             if (remPeerId != BUS_INVALID_PEER)
115112c07393SBernhard Stoeckner                 status = knvlinkRemoveMapping_HAL(pRemoteGpu, pRemoteKernelNvlink, NV_FALSE,
115212c07393SBernhard Stoeckner                                                   NVBIT(remPeerId),
115312c07393SBernhard Stoeckner                                                   NV_FALSE /* bL2Entry */);
115412c07393SBernhard Stoeckner         }
115512c07393SBernhard Stoeckner         else
115612c07393SBernhard Stoeckner         {
11571739a20eSAndy Ritger             status = knvlinkRemoveMapping_HAL(pRemoteGpu, pRemoteKernelNvlink, NV_FALSE,
11581739a20eSAndy Ritger                                               ((1 << NVLINK_MAX_PEERS_SW) - 1),
11591739a20eSAndy Ritger                                               NV_FALSE /* bL2Entry */);
116012c07393SBernhard Stoeckner         }
11611739a20eSAndy Ritger         if (status != NV_OK)
11621739a20eSAndy Ritger         {
11631739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR,
11641739a20eSAndy Ritger                       "failed to reset HSHUB on GPU%u while preparing for GPU%u XVE reset (0x%x)\n",
11651739a20eSAndy Ritger                       gpuGetInstance(pRemoteGpu), gpuGetInstance(pGpu),
11661739a20eSAndy Ritger                       status);
11671739a20eSAndy Ritger 
11681739a20eSAndy Ritger             retStatus = (retStatus == NV_OK) ? status : retStatus;
11691739a20eSAndy Ritger         }
11701739a20eSAndy Ritger     }
11711739a20eSAndy Ritger 
11721739a20eSAndy Ritger     // Remove all NVLink mappings in HSHUB config registers to init values
117390eb1077SAndy Ritger     if (!API_GPU_IN_RESET_SANITY_CHECK(pGpu) && !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
11741739a20eSAndy Ritger     status = knvlinkRemoveMapping_HAL(pGpu, pKernelNvlink, NV_TRUE, ((1 << NVLINK_MAX_PEERS_SW) - 1),
11751739a20eSAndy Ritger                                       NV_FALSE /* bL2Entry */);
11761739a20eSAndy Ritger     if (status != NV_OK)
11771739a20eSAndy Ritger     {
11781739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
11791739a20eSAndy Ritger                   "failed to reset HSHUB on GPU%u while preparing XVE reset: %s (0x%x)\n",
11801739a20eSAndy Ritger                   gpuGetInstance(pGpu), nvstatusToString(status), status);
11811739a20eSAndy Ritger 
11821739a20eSAndy Ritger         retStatus = (retStatus == NV_OK) ? status : retStatus;
11831739a20eSAndy Ritger     }
11841739a20eSAndy Ritger 
118590eb1077SAndy Ritger     //
118690eb1077SAndy Ritger     // If GFW is booted and running through link-training, then no need to tear-down the
118790eb1077SAndy Ritger     // links to reset. Exit out early from the function
118890eb1077SAndy Ritger     //
1189eb5c7665SAndy Ritger     if (!bForceShutdown && pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_MINION_GFW_BOOT))
119090eb1077SAndy Ritger     {
119190eb1077SAndy Ritger         return NV_OK;
119290eb1077SAndy Ritger     }
119390eb1077SAndy Ritger 
11941739a20eSAndy Ritger     // Pseudo-clean  shutdown the links from this GPU
119590eb1077SAndy Ritger     status = knvlinkCoreShutdownDeviceLinks(pGpu, pKernelNvlink, bForceShutdown);
11961739a20eSAndy Ritger     if (status != NV_OK)
11971739a20eSAndy Ritger     {
11981739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
11991739a20eSAndy Ritger                   "failed to shutdown links on GPU%u while preparing XVE reset: %s (0x%x)\n",
12001739a20eSAndy Ritger                   gpuGetInstance(pGpu), nvstatusToString(status), status);
12011739a20eSAndy Ritger 
12021739a20eSAndy Ritger         retStatus = (retStatus == NV_OK) ? status : retStatus;
12031739a20eSAndy Ritger     }
12041739a20eSAndy Ritger 
12051739a20eSAndy Ritger     //
12061739a20eSAndy Ritger     // Reset links related to this device and its peers (see Bug 2346447)
12071739a20eSAndy Ritger     // The property is disabled on Pascal, since the path hasn't been verified
12081739a20eSAndy Ritger     // and link reset after pseudo-clean shutdown results in DL and TL errors.
12091739a20eSAndy Ritger     //
12101739a20eSAndy Ritger     if (pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LINKRESET_AFTER_SHUTDOWN))
12111739a20eSAndy Ritger     {
12121739a20eSAndy Ritger         status = knvlinkCoreResetDeviceLinks(pGpu, pKernelNvlink);
12131739a20eSAndy Ritger         if (status != NV_OK)
12141739a20eSAndy Ritger         {
12151739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR,
12161739a20eSAndy Ritger                       "failed to reset links on GPU%u while preparing XVE reset: %s (0x%x)\n",
12171739a20eSAndy Ritger                       gpuGetInstance(pGpu), nvstatusToString(status), status);
12181739a20eSAndy Ritger 
12191739a20eSAndy Ritger             retStatus = (retStatus == NV_OK) ? status : retStatus;
12201739a20eSAndy Ritger         }
1221dac2350cSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
1222dac2350cSAndy Ritger         else
1223dac2350cSAndy Ritger         {
1224dac2350cSAndy Ritger             NvU32 linkId;
1225dac2350cSAndy Ritger 
1226dac2350cSAndy Ritger             //
1227dac2350cSAndy Ritger             // The connections have been successfully reset, update connected and disconnected
1228dac2350cSAndy Ritger             // links masks on both the devices
1229dac2350cSAndy Ritger             //
1230dac2350cSAndy Ritger             FOR_EACH_INDEX_IN_MASK(32, linkId, pKernelNvlink->enabledLinks)
1231dac2350cSAndy Ritger             {
1232dac2350cSAndy Ritger                 pKernelNvlink->disconnectedLinkMask |=  NVBIT(linkId);
1233dac2350cSAndy Ritger                 pKernelNvlink->connectedLinksMask   &= ~NVBIT(linkId);
1234dac2350cSAndy Ritger 
1235dac2350cSAndy Ritger                 if (pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.deviceType !=
1236dac2350cSAndy Ritger                                               NV2080_CTRL_NVLINK_DEVICE_INFO_DEVICE_TYPE_GPU)
1237dac2350cSAndy Ritger                 {
1238dac2350cSAndy Ritger                     continue;
1239dac2350cSAndy Ritger                 }
1240dac2350cSAndy Ritger 
1241dac2350cSAndy Ritger                 OBJGPU *pRemoteGpu = gpumgrGetGpuFromBusInfo(
1242dac2350cSAndy Ritger                                             pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.domain,
1243dac2350cSAndy Ritger                                             pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bus,
1244dac2350cSAndy Ritger                                             pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.device);
1245dac2350cSAndy Ritger 
1246dac2350cSAndy Ritger                 if (!API_GPU_IN_RESET_SANITY_CHECK(pRemoteGpu))
1247dac2350cSAndy Ritger                 {
1248dac2350cSAndy Ritger                     KernelNvlink *pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
1249dac2350cSAndy Ritger                     NvU32 remoteLinkId = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber;
1250dac2350cSAndy Ritger 
1251dac2350cSAndy Ritger                     pRemoteKernelNvlink->disconnectedLinkMask |=  NVBIT(remoteLinkId);
1252dac2350cSAndy Ritger                     pRemoteKernelNvlink->connectedLinksMask   &= ~NVBIT(remoteLinkId);
1253dac2350cSAndy Ritger                 }
1254dac2350cSAndy Ritger             }
1255dac2350cSAndy Ritger             FOR_EACH_INDEX_IN_MASK_END;
1256dac2350cSAndy Ritger         }
1257dac2350cSAndy Ritger #endif
12581739a20eSAndy Ritger 
12591739a20eSAndy Ritger         //
12601739a20eSAndy Ritger         // knvlinkCoreResetDeviceLinks() only resets the links which have
12611739a20eSAndy Ritger         // connectivity.
12621739a20eSAndy Ritger         // Pre-Ampere, we may run into a situation where the PLL
12631739a20eSAndy Ritger         // sharing partner links (both) may not be reset due to no connectivity.
12641739a20eSAndy Ritger         //
12651739a20eSAndy Ritger         // Hence, (re-)reset all the links to recover them after shutdown (pre-Ampere)
12661739a20eSAndy Ritger         //
12671739a20eSAndy Ritger         NV2080_CTRL_NVLINK_RESET_LINKS_PARAMS resetLinksparams;
12681739a20eSAndy Ritger 
12691739a20eSAndy Ritger         portMemSet(&resetLinksparams, 0, sizeof(resetLinksparams));
12701739a20eSAndy Ritger         resetLinksparams.linkMask = pKernelNvlink->enabledLinks;
12711739a20eSAndy Ritger         resetLinksparams.flags    = NV2080_CTRL_NVLINK_RESET_FLAGS_TOGGLE;
12721739a20eSAndy Ritger 
12731739a20eSAndy Ritger         status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
12741739a20eSAndy Ritger                                      NV2080_CTRL_CMD_NVLINK_RESET_LINKS,
12751739a20eSAndy Ritger                                      (void *)&resetLinksparams, sizeof(resetLinksparams));
12761739a20eSAndy Ritger 
12771739a20eSAndy Ritger         retStatus = (retStatus == NV_OK) ? status : retStatus;
12781739a20eSAndy Ritger     }
12791739a20eSAndy Ritger 
12801739a20eSAndy Ritger     return retStatus;
12811739a20eSAndy Ritger }
12821739a20eSAndy Ritger 
12831739a20eSAndy Ritger /*!
12841739a20eSAndy Ritger  * @brief Set the power features supported on this NVLink IP
12851739a20eSAndy Ritger  *
12861739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer
12871739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
12881739a20eSAndy Ritger  */
12891739a20eSAndy Ritger void
knvlinkSetPowerFeatures_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)12901739a20eSAndy Ritger knvlinkSetPowerFeatures_IMPL
12911739a20eSAndy Ritger (
12921739a20eSAndy Ritger     OBJGPU       *pGpu,
12931739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
12941739a20eSAndy Ritger )
12951739a20eSAndy Ritger {
12961739a20eSAndy Ritger     // Get the Ip Verion from the First available IOCTRL.
12971739a20eSAndy Ritger     switch (pKernelNvlink->ipVerNvlink)
12981739a20eSAndy Ritger     {
12991739a20eSAndy Ritger         case NVLINK_VERSION_22:
13001739a20eSAndy Ritger         {
13011739a20eSAndy Ritger             // NVLink L2 is supported only on MODS and Windows LDDM
130291676d66SBernhard Stoeckner             if (RMCFG_FEATURE_PLATFORM_WINDOWS || RMCFG_FEATURE_MODS_FEATURES)
13031739a20eSAndy Ritger             {
13041739a20eSAndy Ritger                 pKernelNvlink->setProperty(pKernelNvlink, PDB_PROP_KNVLINK_L2_POWER_STATE_ENABLED,
13051739a20eSAndy Ritger                                            (pKernelNvlink->bDisableL2Mode ? NV_FALSE : NV_TRUE));
13061739a20eSAndy Ritger             }
13071739a20eSAndy Ritger 
13081739a20eSAndy Ritger             break;
13091739a20eSAndy Ritger         }
13101739a20eSAndy Ritger         default:
13111739a20eSAndy Ritger             break;
13121739a20eSAndy Ritger     }
13131739a20eSAndy Ritger }
13141739a20eSAndy Ritger 
13151739a20eSAndy Ritger /*!
13161739a20eSAndy Ritger  * @brief Checks if NVSWITCH_FABRIC_ADDR field is valid.
13171739a20eSAndy Ritger  *
13181739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer
13191739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
13201739a20eSAndy Ritger  */
13211739a20eSAndy Ritger void
knvlinkDetectNvswitchProxy_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)13221739a20eSAndy Ritger knvlinkDetectNvswitchProxy_IMPL
13231739a20eSAndy Ritger (
13241739a20eSAndy Ritger     OBJGPU       *pGpu,
13251739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
13261739a20eSAndy Ritger )
13271739a20eSAndy Ritger {
13281739a20eSAndy Ritger     OBJSYS    *pSys   = SYS_GET_INSTANCE();
13291739a20eSAndy Ritger     NV_STATUS  status = NV_OK;
13301739a20eSAndy Ritger     NvU32      i;
13311739a20eSAndy Ritger 
13321739a20eSAndy Ritger     // Initialize fabricBaseAddr to NVLINK_INVALID_FABRIC_ADDR
13331739a20eSAndy Ritger     pKernelNvlink->fabricBaseAddr = NVLINK_INVALID_FABRIC_ADDR;
13341739a20eSAndy Ritger 
13351739a20eSAndy Ritger     if (pSys->getProperty(pSys, PDB_PROP_SYS_NVSWITCH_IS_PRESENT) ||
1336758b4ee8SAndy Ritger         pSys->getProperty(pSys, PDB_PROP_SYS_FABRIC_MANAGER_IS_REGISTERED) ||
1337758b4ee8SAndy Ritger         GPU_IS_NVSWITCH_DETECTED(pGpu))
13381739a20eSAndy Ritger     {
13391739a20eSAndy Ritger         return;
13401739a20eSAndy Ritger     }
13411739a20eSAndy Ritger 
13421739a20eSAndy Ritger     if (pKernelNvlink->discoveredLinks == 0)
13431739a20eSAndy Ritger     {
13441739a20eSAndy Ritger         return;
13451739a20eSAndy Ritger     }
13461739a20eSAndy Ritger 
13471739a20eSAndy Ritger     // Get the link train status for the enabled link masks
13481739a20eSAndy Ritger     NV2080_CTRL_NVLINK_ARE_LINKS_TRAINED_PARAMS linkTrainedParams;
13491739a20eSAndy Ritger 
13501739a20eSAndy Ritger     portMemSet(&linkTrainedParams, 0, sizeof(linkTrainedParams));
13511739a20eSAndy Ritger     linkTrainedParams.linkMask    = pKernelNvlink->enabledLinks;
13521739a20eSAndy Ritger     linkTrainedParams.bActiveOnly = NV_FALSE;
13531739a20eSAndy Ritger 
13541739a20eSAndy Ritger     // Reset timeout to clear any accumulated timeouts from link init
13551739a20eSAndy Ritger     if (IS_GSP_CLIENT(pGpu))
13561739a20eSAndy Ritger     {
13571739a20eSAndy Ritger         threadStateResetTimeout(pGpu);
13581739a20eSAndy Ritger     }
13591739a20eSAndy Ritger 
13601739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
13611739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_ARE_LINKS_TRAINED,
13621739a20eSAndy Ritger                                  (void *)&linkTrainedParams, sizeof(linkTrainedParams));
13631739a20eSAndy Ritger     if (status != NV_OK)
13641739a20eSAndy Ritger     {
13651739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to get the link train status for links\n");
13661739a20eSAndy Ritger         return;
13671739a20eSAndy Ritger     }
13681739a20eSAndy Ritger 
13691739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
13701739a20eSAndy Ritger     {
13711739a20eSAndy Ritger         if (!linkTrainedParams.bIsLinkActive[i])
13721739a20eSAndy Ritger         {
13731739a20eSAndy Ritger             return;
13741739a20eSAndy Ritger         }
13751739a20eSAndy Ritger     }
13761739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK_END;
13771739a20eSAndy Ritger 
13781739a20eSAndy Ritger     NV2080_CTRL_INTERNAL_NVLINK_GET_SET_NVSWITCH_FABRIC_ADDR_PARAMS params;
13791739a20eSAndy Ritger 
13801739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
13811739a20eSAndy Ritger     params.bGet = NV_TRUE;
13821739a20eSAndy Ritger     params.addr = NVLINK_INVALID_FABRIC_ADDR;
13831739a20eSAndy Ritger 
13841739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
13851739a20eSAndy Ritger                                  NV2080_CTRL_CMD_INTERNAL_NVLINK_GET_SET_NVSWITCH_FABRIC_ADDR,
13861739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
13871739a20eSAndy Ritger     if (status != NV_OK)
13881739a20eSAndy Ritger     {
13891739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to get fabric address for GPU %x\n",
13901739a20eSAndy Ritger                   pGpu->gpuInstance);
13911739a20eSAndy Ritger         return;
13921739a20eSAndy Ritger     }
13931739a20eSAndy Ritger 
13941739a20eSAndy Ritger     if (params.addr != NVLINK_INVALID_FABRIC_ADDR)
13951739a20eSAndy Ritger     {
13961739a20eSAndy Ritger         pKernelNvlink->fabricBaseAddr = params.addr;
13971739a20eSAndy Ritger         pKernelNvlink->bNvswitchProxy = NV_TRUE;
13981739a20eSAndy Ritger     }
13991739a20eSAndy Ritger }
14001739a20eSAndy Ritger 
14011739a20eSAndy Ritger /*!
14021739a20eSAndy Ritger  * @brief Sets NVSWITCH_FLA_ADDR field in the scratch register.
14031739a20eSAndy Ritger  *
14041739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
14051739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
14061739a20eSAndy Ritger  * @param[in] addr           FLA addr
14071739a20eSAndy Ritger  *
14081739a20eSAndy Ritger  * @return  Returns NV_OK upon success.
14091739a20eSAndy Ritger  *          Otherwise, returns NV_ERR_XXX.
14101739a20eSAndy Ritger  */
14111739a20eSAndy Ritger NV_STATUS
knvlinkSetNvswitchFlaAddr_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU64 addr)14121739a20eSAndy Ritger knvlinkSetNvswitchFlaAddr_IMPL
14131739a20eSAndy Ritger (
14141739a20eSAndy Ritger     OBJGPU       *pGpu,
14151739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
14161739a20eSAndy Ritger     NvU64         addr
14171739a20eSAndy Ritger )
14181739a20eSAndy Ritger {
14191739a20eSAndy Ritger     return NV_OK;
14201739a20eSAndy Ritger }
14211739a20eSAndy Ritger 
14221739a20eSAndy Ritger /*!
14231739a20eSAndy Ritger  * @brief Gets NVSWITCH_FLA_ADDR field from the scratch register.
14241739a20eSAndy Ritger  *
14251739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer
14261739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
14271739a20eSAndy Ritger  *
14281739a20eSAndy Ritger  * @return  Returns the stashed FLA starting address.
14291739a20eSAndy Ritger  */
14301739a20eSAndy Ritger NvU64
knvlinkGetNvswitchFlaAddr_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)14311739a20eSAndy Ritger knvlinkGetNvswitchFlaAddr_IMPL
14321739a20eSAndy Ritger (
14331739a20eSAndy Ritger     OBJGPU       *pGpu,
14341739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
14351739a20eSAndy Ritger )
14361739a20eSAndy Ritger {
14371739a20eSAndy Ritger     return 0;
14381739a20eSAndy Ritger }
14391739a20eSAndy Ritger 
14401739a20eSAndy Ritger /*!
14411739a20eSAndy Ritger  * @brief Checks if fabricBaseAddr is valid.
14421739a20eSAndy Ritger  *
14431739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer
14441739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
14451739a20eSAndy Ritger  *
14461739a20eSAndy Ritger  * @return  Returns true if the fabricBaseAddr is valid.
14471739a20eSAndy Ritger  */
14481739a20eSAndy Ritger NvBool
knvlinkIsNvswitchProxyPresent_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)14491739a20eSAndy Ritger knvlinkIsNvswitchProxyPresent_IMPL
14501739a20eSAndy Ritger (
14511739a20eSAndy Ritger     OBJGPU       *pGpu,
14521739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
14531739a20eSAndy Ritger )
14541739a20eSAndy Ritger {
14551739a20eSAndy Ritger     return pKernelNvlink->bNvswitchProxy;
14561739a20eSAndy Ritger }
14571739a20eSAndy Ritger 
14581739a20eSAndy Ritger 
14591739a20eSAndy Ritger /*!
14601739a20eSAndy Ritger  * @brief   Set unique FLA base address for NVSwitch enabled systems.
14611739a20eSAndy Ritger  *          Validates FLA base address and programs the base address
14621739a20eSAndy Ritger  *          in switch scratch registers for guest VM to pick it up.
14631739a20eSAndy Ritger  *
14641739a20eSAndy Ritger  * @param[in]   pGpu               OBJGPU pointer
14651739a20eSAndy Ritger  * @param[in]   pKernelNvlink      KernelNvlink pointer
14661739a20eSAndy Ritger  * @param[in]   flaBaseAddr        NvU64  base address
14671739a20eSAndy Ritger  *
14681739a20eSAndy Ritger  * @returns On success, sets unique FLA base address and returns NV_OK.
14691739a20eSAndy Ritger  *          On failure, returns NV_ERR_XXX.
14701739a20eSAndy Ritger  */
14711739a20eSAndy Ritger NV_STATUS
knvlinkSetUniqueFlaBaseAddress_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU64 flaBaseAddr)14721739a20eSAndy Ritger knvlinkSetUniqueFlaBaseAddress_IMPL
14731739a20eSAndy Ritger (
14741739a20eSAndy Ritger     OBJGPU       *pGpu,
14751739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
14761739a20eSAndy Ritger     NvU64         flaBaseAddr
14771739a20eSAndy Ritger )
14781739a20eSAndy Ritger {
14791739a20eSAndy Ritger     NV_STATUS  status     = NV_OK;
14801739a20eSAndy Ritger     KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu);
14811739a20eSAndy Ritger 
14821739a20eSAndy Ritger     NV2080_CTRL_NVLINK_GET_SET_NVSWITCH_FLA_ADDR_PARAMS params;
14831739a20eSAndy Ritger 
14841739a20eSAndy Ritger     if (!knvlinkIsForcedConfig(pGpu, pKernelNvlink))
14851739a20eSAndy Ritger     {
14861739a20eSAndy Ritger         knvlinkCoreGetRemoteDeviceInfo(pGpu, pKernelNvlink);
14871739a20eSAndy Ritger 
14881739a20eSAndy Ritger         status = knvlinkEnableLinksPostTopology_HAL(pGpu, pKernelNvlink,
14891739a20eSAndy Ritger                                                     pKernelNvlink->enabledLinks);
14901739a20eSAndy Ritger         if (status != NV_OK)
14911739a20eSAndy Ritger         {
14921739a20eSAndy Ritger             return status;
14931739a20eSAndy Ritger         }
14941739a20eSAndy Ritger     }
14951739a20eSAndy Ritger 
14961739a20eSAndy Ritger     status = kbusValidateFlaBaseAddress_HAL(pGpu, pKernelBus, flaBaseAddr);
14971739a20eSAndy Ritger     if (status != NV_OK)
14981739a20eSAndy Ritger     {
14991739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "FLA base addr validation failed for GPU %x\n",
15001739a20eSAndy Ritger                   pGpu->gpuInstance);
15011739a20eSAndy Ritger         return status;
15021739a20eSAndy Ritger     }
15031739a20eSAndy Ritger 
15041739a20eSAndy Ritger     if (IsSLIEnabled(pGpu))
15051739a20eSAndy Ritger     {
15061739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR,
15071739a20eSAndy Ritger                   "Operation is unsupported on SLI enabled GPU %x\n",
15081739a20eSAndy Ritger                   pGpu->gpuInstance);
15091739a20eSAndy Ritger         return NV_ERR_NOT_SUPPORTED;
15101739a20eSAndy Ritger     }
15111739a20eSAndy Ritger 
15121739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
15131739a20eSAndy Ritger     params.bGet = NV_FALSE;
15141739a20eSAndy Ritger     params.addr = flaBaseAddr;
15151739a20eSAndy Ritger 
15161739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
15171739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_GET_SET_NVSWITCH_FLA_ADDR,
15181739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
15191739a20eSAndy Ritger     if (status != NV_OK)
15201739a20eSAndy Ritger     {
15211739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to stash fla base address for GPU %x\n",
15221739a20eSAndy Ritger                   pGpu->gpuInstance);
15231739a20eSAndy Ritger         return status;
15241739a20eSAndy Ritger     }
15251739a20eSAndy Ritger 
15261739a20eSAndy Ritger     NV_PRINTF(LEVEL_INFO, "FLA base addr %llx is assigned to GPU %x\n",
15271739a20eSAndy Ritger               flaBaseAddr, pGpu->gpuInstance);
15281739a20eSAndy Ritger 
15291739a20eSAndy Ritger     return NV_OK;
15301739a20eSAndy Ritger }
15311739a20eSAndy Ritger 
15321739a20eSAndy Ritger /*!
15331739a20eSAndy Ritger  * @brief Synchronize the link masks and vbios defined properties
15341739a20eSAndy Ritger  *        between CPU and GSP-RMs
15351739a20eSAndy Ritger  *
15361739a20eSAndy Ritger  * @param[in]   pGpu           OBJGPU pointer
15371739a20eSAndy Ritger  * @param[in]   pKernelNvlink  KernelNvlink pointer
15381739a20eSAndy Ritger  */
15391739a20eSAndy Ritger NV_STATUS
knvlinkSyncLinkMasksAndVbiosInfo_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)15401739a20eSAndy Ritger knvlinkSyncLinkMasksAndVbiosInfo_IMPL
15411739a20eSAndy Ritger (
15421739a20eSAndy Ritger     OBJGPU       *pGpu,
15431739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
15441739a20eSAndy Ritger )
15451739a20eSAndy Ritger {
15461739a20eSAndy Ritger     NV_STATUS status = NV_OK;
15471739a20eSAndy Ritger 
15481739a20eSAndy Ritger     NV2080_CTRL_NVLINK_SYNC_LINK_MASKS_AND_VBIOS_INFO_PARAMS params;
15491739a20eSAndy Ritger 
15501739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
15511739a20eSAndy Ritger 
15521739a20eSAndy Ritger     params.discoveredLinks     = pKernelNvlink->discoveredLinks;
15531739a20eSAndy Ritger     params.connectedLinksMask  = pKernelNvlink->connectedLinksMask;
15541739a20eSAndy Ritger     params.bridgeSensableLinks = pKernelNvlink->bridgeSensableLinks;
15551739a20eSAndy Ritger     params.bridgedLinks        = pKernelNvlink->bridgedLinks;
15561739a20eSAndy Ritger 
15571739a20eSAndy Ritger     // Reset timeout to clear any accumulated timeouts from link init
15581739a20eSAndy Ritger     if (IS_GSP_CLIENT(pGpu))
15591739a20eSAndy Ritger     {
15601739a20eSAndy Ritger         threadStateResetTimeout(pGpu);
15611739a20eSAndy Ritger     }
15621739a20eSAndy Ritger 
15631739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
15641739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_SYNC_LINK_MASKS_AND_VBIOS_INFO,
15651739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
15661739a20eSAndy Ritger 
15671739a20eSAndy Ritger     pKernelNvlink->vbiosDisabledLinkMask = params.vbiosDisabledLinkMask;
15681739a20eSAndy Ritger     pKernelNvlink->initializedLinks      = params.initializedLinks;
15691739a20eSAndy Ritger     pKernelNvlink->initDisabledLinksMask = params.initDisabledLinksMask;
15701739a20eSAndy Ritger     pKernelNvlink->bEnableSafeModeAtLoad = params.bEnableSafeModeAtLoad;
15711739a20eSAndy Ritger     pKernelNvlink->bEnableTrainingAtLoad = params.bEnableTrainingAtLoad;
15721739a20eSAndy Ritger 
15731739a20eSAndy Ritger     return status;
15741739a20eSAndy Ritger }
15751739a20eSAndy Ritger 
15761739a20eSAndy Ritger /*!
15771739a20eSAndy Ritger  * @brief Update link connection status.
15781739a20eSAndy Ritger  *
15791739a20eSAndy Ritger  * @param[in]   pGpu           OBJGPU pointer
15801739a20eSAndy Ritger  * @param[in]   pKernelNvlink  KernelNvlink pointer
15811739a20eSAndy Ritger  * @param[in]   linkId         Target link Id
15821739a20eSAndy Ritger  */
15831739a20eSAndy Ritger NV_STATUS
knvlinkUpdateLinkConnectionStatus_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 linkId)15841739a20eSAndy Ritger knvlinkUpdateLinkConnectionStatus_IMPL
15851739a20eSAndy Ritger (
15861739a20eSAndy Ritger     OBJGPU       *pGpu,
15871739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
15881739a20eSAndy Ritger     NvU32         linkId
15891739a20eSAndy Ritger )
15901739a20eSAndy Ritger {
15911739a20eSAndy Ritger     NV_STATUS status = NV_OK;
15921739a20eSAndy Ritger 
15931739a20eSAndy Ritger     NV2080_CTRL_NVLINK_UPDATE_LINK_CONNECTION_PARAMS params;
15941739a20eSAndy Ritger 
15951739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
15961739a20eSAndy Ritger 
15971739a20eSAndy Ritger     params.linkId = linkId;
15981739a20eSAndy Ritger 
15991739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
16001739a20eSAndy Ritger 
16011739a20eSAndy Ritger     params.bConnected = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bConnected;
16021739a20eSAndy Ritger     params.remoteDeviceType = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.deviceType;
16031739a20eSAndy Ritger     params.remoteLinkNumber = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber;
1604758b4ee8SAndy Ritger     params.remoteChipSid = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.chipSid;
1605758b4ee8SAndy Ritger     params.remoteDomain = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.domain;
1606758b4ee8SAndy Ritger     params.remoteBus = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bus;
1607758b4ee8SAndy Ritger     params.remoteDevice = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.device;
1608758b4ee8SAndy Ritger     params.remoteFunction = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.function;
1609758b4ee8SAndy Ritger     params.remotePciDeviceId = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.pciDeviceId;
1610758b4ee8SAndy Ritger     params.laneRxdetStatusMask = pKernelNvlink->nvlinkLinks[linkId].laneRxdetStatusMask;
16111739a20eSAndy Ritger 
16121739a20eSAndy Ritger #endif
16131739a20eSAndy Ritger 
16141739a20eSAndy Ritger     // Reset timeout to clear any accumulated timeouts from link init
16151739a20eSAndy Ritger     if (IS_GSP_CLIENT(pGpu))
16161739a20eSAndy Ritger     {
16171739a20eSAndy Ritger         threadStateResetTimeout(pGpu);
16181739a20eSAndy Ritger     }
16191739a20eSAndy Ritger 
16201739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
16211739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_UPDATE_LINK_CONNECTION,
16221739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
16231739a20eSAndy Ritger     if (status != NV_OK)
16241739a20eSAndy Ritger     {
16251739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to update Link connection status!\n");
16261739a20eSAndy Ritger         return status;
16271739a20eSAndy Ritger     }
16281739a20eSAndy Ritger 
16291739a20eSAndy Ritger     return NV_OK;
16301739a20eSAndy Ritger }
16311739a20eSAndy Ritger 
16321739a20eSAndy Ritger /*!
163390eb1077SAndy Ritger  * @brief Execute initial steps to Train links for ALI.
163490eb1077SAndy Ritger  *
163590eb1077SAndy Ritger  * @param[in] pGpu           OBJGPU pointer for local GPU
163690eb1077SAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
163790eb1077SAndy Ritger  * @param[in] linkMask       Masks of links to enable
163890eb1077SAndy Ritger  * @param[in] bSync          Input sync boolean
163990eb1077SAndy Ritger  *
164090eb1077SAndy Ritger  */
164190eb1077SAndy Ritger NV_STATUS
knvlinkPreTrainLinksToActiveAli_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 linkMask,NvBool bSync)164290eb1077SAndy Ritger knvlinkPreTrainLinksToActiveAli_IMPL
164390eb1077SAndy Ritger (
164490eb1077SAndy Ritger     OBJGPU       *pGpu,
164590eb1077SAndy Ritger     KernelNvlink *pKernelNvlink,
164690eb1077SAndy Ritger     NvU32         linkMask,
164790eb1077SAndy Ritger     NvBool        bSync
164890eb1077SAndy Ritger )
164990eb1077SAndy Ritger {
165090eb1077SAndy Ritger     NV_STATUS status = NV_OK;
165190eb1077SAndy Ritger 
165290eb1077SAndy Ritger     NV2080_CTRL_NVLINK_PRE_LINK_TRAIN_ALI_PARAMS params;
165390eb1077SAndy Ritger 
165490eb1077SAndy Ritger     portMemSet(&params, 0, sizeof(params));
165590eb1077SAndy Ritger 
165690eb1077SAndy Ritger     params.linkMask = linkMask;
165790eb1077SAndy Ritger     params.bSync    = bSync;
165890eb1077SAndy Ritger 
165990eb1077SAndy Ritger     // Reset timeout to clear any accumulated timeouts from link init
166090eb1077SAndy Ritger     if (IS_GSP_CLIENT(pGpu))
166190eb1077SAndy Ritger     {
166290eb1077SAndy Ritger         threadStateResetTimeout(pGpu);
166390eb1077SAndy Ritger     }
166490eb1077SAndy Ritger 
166590eb1077SAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
166690eb1077SAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_PRE_LINK_TRAIN_ALI,
166790eb1077SAndy Ritger                                  (void *)&params, sizeof(params));
166890eb1077SAndy Ritger     if (status != NV_OK)
166990eb1077SAndy Ritger     {
167090eb1077SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to execute Pre Link Training ALI steps!\n");
167190eb1077SAndy Ritger         return status;
167290eb1077SAndy Ritger     }
167390eb1077SAndy Ritger 
167490eb1077SAndy Ritger     return NV_OK;
167590eb1077SAndy Ritger }
167690eb1077SAndy Ritger 
167790eb1077SAndy Ritger /*!
167890eb1077SAndy Ritger  * @brief Train links to active for ALI.
167990eb1077SAndy Ritger  *
168090eb1077SAndy Ritger  * @param[in] pGpu           OBJGPU pointer for local GPU
168190eb1077SAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
168290eb1077SAndy Ritger  * @param[in] linkMask       Masks of links to enable
168390eb1077SAndy Ritger  * @param[in] bSync          Input sync boolean
168490eb1077SAndy Ritger  *
168590eb1077SAndy Ritger  */
168690eb1077SAndy Ritger NV_STATUS
knvlinkTrainLinksToActiveAli_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 linkMask,NvBool bSync)168790eb1077SAndy Ritger knvlinkTrainLinksToActiveAli_IMPL
168890eb1077SAndy Ritger (
168990eb1077SAndy Ritger     OBJGPU       *pGpu,
169090eb1077SAndy Ritger     KernelNvlink *pKernelNvlink,
169190eb1077SAndy Ritger     NvU32         linkMask,
169290eb1077SAndy Ritger     NvBool        bSync
169390eb1077SAndy Ritger )
169490eb1077SAndy Ritger {
169590eb1077SAndy Ritger     NV_STATUS status = NV_OK;
169690eb1077SAndy Ritger 
169790eb1077SAndy Ritger     NV2080_CTRL_NVLINK_PRE_LINK_TRAIN_ALI_PARAMS params;
169890eb1077SAndy Ritger 
169990eb1077SAndy Ritger     portMemSet(&params, 0, sizeof(params));
170090eb1077SAndy Ritger 
170190eb1077SAndy Ritger     params.linkMask = linkMask;
170290eb1077SAndy Ritger     params.bSync    = bSync;
170390eb1077SAndy Ritger 
170490eb1077SAndy Ritger     // Reset timeout to clear any accumulated timeouts from link init
170590eb1077SAndy Ritger     if (IS_GSP_CLIENT(pGpu))
170690eb1077SAndy Ritger     {
170790eb1077SAndy Ritger         threadStateResetTimeout(pGpu);
170890eb1077SAndy Ritger     }
170990eb1077SAndy Ritger 
171090eb1077SAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
171190eb1077SAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_LINK_TRAIN_ALI,
171290eb1077SAndy Ritger                                  (void *)&params, sizeof(params));
171390eb1077SAndy Ritger     if (status != NV_OK)
171490eb1077SAndy Ritger     {
171590eb1077SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to change ALI Links to active!\n");
171690eb1077SAndy Ritger         return status;
171790eb1077SAndy Ritger     }
171890eb1077SAndy Ritger 
171990eb1077SAndy Ritger     return NV_OK;
172090eb1077SAndy Ritger }
172190eb1077SAndy Ritger 
172290eb1077SAndy Ritger /*!
17231739a20eSAndy Ritger  * @brief Update the post Rx Detect link mask.
17241739a20eSAndy Ritger  *
17251739a20eSAndy Ritger  * @param[in] pGpu           OBJGPU pointer for local GPU
17261739a20eSAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
17271739a20eSAndy Ritger  *
17281739a20eSAndy Ritger  */
17291739a20eSAndy Ritger NV_STATUS
knvlinkUpdatePostRxDetectLinkMask_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)17301739a20eSAndy Ritger knvlinkUpdatePostRxDetectLinkMask_IMPL
17311739a20eSAndy Ritger (
17321739a20eSAndy Ritger     OBJGPU       *pGpu,
17331739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
17341739a20eSAndy Ritger )
17351739a20eSAndy Ritger {
17361739a20eSAndy Ritger     NV_STATUS status = NV_OK;
17371739a20eSAndy Ritger 
17381739a20eSAndy Ritger     NV2080_CTRL_NVLINK_GET_LINK_MASK_POST_RX_DET_PARAMS params;
17391739a20eSAndy Ritger 
17401739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
17411739a20eSAndy Ritger 
17421739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
17431739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_GET_LINK_MASK_POST_RX_DET,
17441739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
17451739a20eSAndy Ritger     if (status != NV_OK)
17461739a20eSAndy Ritger     {
17471739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to update Rx Detect Link mask!\n");
17481739a20eSAndy Ritger         return status;
17491739a20eSAndy Ritger     }
17501739a20eSAndy Ritger 
17511739a20eSAndy Ritger     pKernelNvlink->postRxDetLinkMask = params.postRxDetLinkMask;
17521739a20eSAndy Ritger 
17531739a20eSAndy Ritger     return NV_OK;
17541739a20eSAndy Ritger }
17551739a20eSAndy Ritger 
17561739a20eSAndy Ritger /*!
17571739a20eSAndy Ritger  * @brief Copy over the NVLink devices information from GSP-RM.
17581739a20eSAndy Ritger  *
17591739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer for local GPU
17601739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
17611739a20eSAndy Ritger  */
17621739a20eSAndy Ritger NV_STATUS
knvlinkCopyNvlinkDeviceInfo_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)17631739a20eSAndy Ritger knvlinkCopyNvlinkDeviceInfo_IMPL
17641739a20eSAndy Ritger (
17651739a20eSAndy Ritger     OBJGPU       *pGpu,
17661739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
17671739a20eSAndy Ritger )
17681739a20eSAndy Ritger {
17691739a20eSAndy Ritger     NV_STATUS status = NV_OK;
17701739a20eSAndy Ritger     NvU32     i;
17711739a20eSAndy Ritger 
17721739a20eSAndy Ritger     NV2080_CTRL_NVLINK_GET_NVLINK_DEVICE_INFO_PARAMS nvlinkInfoParams;
17731739a20eSAndy Ritger 
17741739a20eSAndy Ritger     portMemSet(&nvlinkInfoParams, 0, sizeof(nvlinkInfoParams));
17751739a20eSAndy Ritger 
17761739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
17771739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_GET_NVLINK_DEVICE_INFO,
17781739a20eSAndy Ritger                                  (void *)&nvlinkInfoParams, sizeof(nvlinkInfoParams));
17791739a20eSAndy Ritger 
17801739a20eSAndy Ritger     if (status == NV_ERR_NOT_SUPPORTED)
17811739a20eSAndy Ritger     {
17821739a20eSAndy Ritger         NV_PRINTF(LEVEL_WARNING, "NVLink is unavailable\n");
17831739a20eSAndy Ritger         return status;
17841739a20eSAndy Ritger     }
17851739a20eSAndy Ritger     else if (status != NV_OK)
17861739a20eSAndy Ritger     {
17871739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to retrieve all nvlink device info!\n");
17881739a20eSAndy Ritger         return status;
17891739a20eSAndy Ritger     }
17901739a20eSAndy Ritger 
17911739a20eSAndy Ritger     // Update CPU-RM's NVLink state with the information received from GSP-RM RPC
17921739a20eSAndy Ritger     pKernelNvlink->ioctrlMask       = nvlinkInfoParams.ioctrlMask;
17931739a20eSAndy Ritger     pKernelNvlink->ioctrlNumEntries = nvlinkInfoParams.ioctrlNumEntries;
17941739a20eSAndy Ritger     pKernelNvlink->ioctrlSize       = nvlinkInfoParams.ioctrlSize;
17951739a20eSAndy Ritger     pKernelNvlink->discoveredLinks  = nvlinkInfoParams.discoveredLinks;
17961739a20eSAndy Ritger     pKernelNvlink->ipVerNvlink      = nvlinkInfoParams.ipVerNvlink;
17971739a20eSAndy Ritger 
17981739a20eSAndy Ritger     for (i = 0; i < NVLINK_MAX_LINKS_SW; i++)
17991739a20eSAndy Ritger     {
18001739a20eSAndy Ritger         pKernelNvlink->nvlinkLinks[i].pGpu     = pGpu;
18011739a20eSAndy Ritger         pKernelNvlink->nvlinkLinks[i].bValid   = nvlinkInfoParams.linkInfo[i].bValid;
18021739a20eSAndy Ritger         pKernelNvlink->nvlinkLinks[i].linkId   = nvlinkInfoParams.linkInfo[i].linkId;
18031739a20eSAndy Ritger         pKernelNvlink->nvlinkLinks[i].ioctrlId = nvlinkInfoParams.linkInfo[i].ioctrlId;
18041739a20eSAndy Ritger 
18051739a20eSAndy Ritger         // Copy over the link PLL master and slave relationship for each link
18061739a20eSAndy Ritger         pKernelNvlink->nvlinkLinks[i].pllMasterLinkId = nvlinkInfoParams.linkInfo[i].pllMasterLinkId;
18071739a20eSAndy Ritger         pKernelNvlink->nvlinkLinks[i].pllSlaveLinkId  = nvlinkInfoParams.linkInfo[i].pllSlaveLinkId;
18081739a20eSAndy Ritger 
18091739a20eSAndy Ritger         // Copy over the ip versions for DLPL devices discovered
18101739a20eSAndy Ritger         pKernelNvlink->nvlinkLinks[i].ipVerDlPl = nvlinkInfoParams.linkInfo[i].ipVerDlPl;
18111739a20eSAndy Ritger     }
18121739a20eSAndy Ritger 
18131739a20eSAndy Ritger     return NV_OK;
18141739a20eSAndy Ritger }
18151739a20eSAndy Ritger 
18161739a20eSAndy Ritger /*!
18171739a20eSAndy Ritger  * @brief Copy over the Ioctrl devices information from GSP-RM.
18181739a20eSAndy Ritger  *
18191739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer for local GPU
18201739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
18211739a20eSAndy Ritger  */
18221739a20eSAndy Ritger NV_STATUS
knvlinkCopyIoctrlDeviceInfo_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)18231739a20eSAndy Ritger knvlinkCopyIoctrlDeviceInfo_IMPL
18241739a20eSAndy Ritger (
18251739a20eSAndy Ritger     OBJGPU       *pGpu,
18261739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
18271739a20eSAndy Ritger )
18281739a20eSAndy Ritger {
18291739a20eSAndy Ritger     KernelIoctrl *pKernelIoctrl = NULL;
18301739a20eSAndy Ritger     NV_STATUS     status        = NV_OK;
18311739a20eSAndy Ritger     NvU32         ioctrlIdx;
18321739a20eSAndy Ritger 
18331739a20eSAndy Ritger     NV2080_CTRL_NVLINK_GET_IOCTRL_DEVICE_INFO_PARAMS ioctrlInfoParams;
18341739a20eSAndy Ritger 
18351739a20eSAndy Ritger     // Query the IOCTRL information for each of the IOCTRLs discovered
18361739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK(32, ioctrlIdx, pKernelNvlink->ioctrlMask)
18371739a20eSAndy Ritger     {
18381739a20eSAndy Ritger         portMemSet(&ioctrlInfoParams, 0, sizeof(ioctrlInfoParams));
18391739a20eSAndy Ritger 
18401739a20eSAndy Ritger         ioctrlInfoParams.ioctrlIdx = ioctrlIdx;
18411739a20eSAndy Ritger 
18421739a20eSAndy Ritger         status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
18431739a20eSAndy Ritger                                      NV2080_CTRL_CMD_NVLINK_GET_IOCTRL_DEVICE_INFO,
18441739a20eSAndy Ritger                                      (void *)&ioctrlInfoParams, sizeof(ioctrlInfoParams));
18451739a20eSAndy Ritger 
18461739a20eSAndy Ritger         if (status == NV_ERR_NOT_SUPPORTED)
18471739a20eSAndy Ritger         {
18481739a20eSAndy Ritger             NV_PRINTF(LEVEL_WARNING, "NVLink is unavailable\n");
18491739a20eSAndy Ritger             return status;
18501739a20eSAndy Ritger         }
18511739a20eSAndy Ritger         else if (status != NV_OK)
18521739a20eSAndy Ritger         {
18531739a20eSAndy Ritger             NV_PRINTF(LEVEL_ERROR, "Failed to retrieve device info for IOCTRL %d!\n", ioctrlIdx);
18541739a20eSAndy Ritger             return status;
18551739a20eSAndy Ritger         }
18561739a20eSAndy Ritger 
18571739a20eSAndy Ritger         pKernelIoctrl = KNVLINK_GET_IOCTRL(pKernelNvlink, ioctrlIdx);
18581739a20eSAndy Ritger 
18591739a20eSAndy Ritger         // Update CPU-RM's NVLink state with the information received from GSP-RM RPC
18601739a20eSAndy Ritger         pKernelIoctrl->PublicId              = ioctrlInfoParams.PublicId;
18611739a20eSAndy Ritger         pKernelIoctrl->localDiscoveredLinks  = ioctrlInfoParams.localDiscoveredLinks;
18621739a20eSAndy Ritger         pKernelIoctrl->localGlobalLinkOffset = ioctrlInfoParams.localGlobalLinkOffset;
18631739a20eSAndy Ritger         pKernelIoctrl->ioctrlDiscoverySize   = ioctrlInfoParams.ioctrlDiscoverySize;
18641739a20eSAndy Ritger         pKernelIoctrl->numDevices            = ioctrlInfoParams.numDevices;
18651739a20eSAndy Ritger 
18661739a20eSAndy Ritger         // Copy over the ip versions for the ioctrl and minion devices discovered
18671739a20eSAndy Ritger         pKernelIoctrl->ipVerIoctrl = ioctrlInfoParams.ipRevisions.ipVerIoctrl;
18681739a20eSAndy Ritger         pKernelIoctrl->ipVerMinion = ioctrlInfoParams.ipRevisions.ipVerMinion;
18691739a20eSAndy Ritger 
18701739a20eSAndy Ritger         if (pKernelIoctrl->ipVerMinion == 0)
18711739a20eSAndy Ritger         {
18721739a20eSAndy Ritger             pKernelIoctrl->setProperty(pKernelIoctrl, PDB_PROP_KIOCTRL_MINION_AVAILABLE, NV_FALSE);
18731739a20eSAndy Ritger         }
18741739a20eSAndy Ritger     }
18751739a20eSAndy Ritger     FOR_EACH_INDEX_IN_MASK_END;
18761739a20eSAndy Ritger 
18771739a20eSAndy Ritger     return NV_OK;
18781739a20eSAndy Ritger }
18791739a20eSAndy Ritger 
18801739a20eSAndy Ritger /**
18811739a20eSAndy Ritger  * @brief Setup topology information for the forced nvlink configurations
18821739a20eSAndy Ritger  *
18831739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer for local GPU
18841739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
18851739a20eSAndy Ritger  */
18861739a20eSAndy Ritger NV_STATUS
knvlinkSetupTopologyForForcedConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)18871739a20eSAndy Ritger knvlinkSetupTopologyForForcedConfig_IMPL
18881739a20eSAndy Ritger (
18891739a20eSAndy Ritger     OBJGPU       *pGpu,
18901739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
18911739a20eSAndy Ritger )
18921739a20eSAndy Ritger {
18931739a20eSAndy Ritger     NV_STATUS status  = NV_OK;
18941739a20eSAndy Ritger     NvU32     i, physLink;
18951739a20eSAndy Ritger 
18961739a20eSAndy Ritger     // Start with all links disabled and no forced config in effect
18971739a20eSAndy Ritger     pKernelNvlink->bRegistryLinkOverride = NV_TRUE;
18981739a20eSAndy Ritger     pKernelNvlink->registryLinkMask      = 0;
18991739a20eSAndy Ritger     pKernelNvlink->bChiplibConfig        = NV_FALSE;
19001739a20eSAndy Ritger 
19011739a20eSAndy Ritger     for (i = 0; i < NVLINK_MAX_LINKS_SW; i++)
19021739a20eSAndy Ritger     {
19031739a20eSAndy Ritger         // Filter against the links discovered from IOCTRL
19041739a20eSAndy Ritger         if (!(pKernelNvlink->discoveredLinks & NVBIT(i)))
19051739a20eSAndy Ritger             continue;
19061739a20eSAndy Ritger 
19071739a20eSAndy Ritger         // The physical link is guaranteed valid in all cases
19081739a20eSAndy Ritger         physLink = DRF_VAL(_NVLINK, _ARCH_CONNECTION, _PHYSICAL_LINK, pKernelNvlink->pLinkConnection[i]);
19091739a20eSAndy Ritger 
19101739a20eSAndy Ritger         // Update link tracking
19111739a20eSAndy Ritger         if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _ENABLED, pKernelNvlink->pLinkConnection[i]))
19121739a20eSAndy Ritger         {
19131739a20eSAndy Ritger             NV_PRINTF(LEVEL_INFO,
19141739a20eSAndy Ritger                       "ARCH_CONNECTION info from chiplib: ENABLED Logical link %d (Physical "
19151739a20eSAndy Ritger                       "link %d) = 0x%X\n", i, physLink,
19161739a20eSAndy Ritger                       pKernelNvlink->pLinkConnection[i]);
19171739a20eSAndy Ritger 
19181739a20eSAndy Ritger             //
19191739a20eSAndy Ritger             // This "link" should be ENABLED. We use the physical link since RM only deals with
19201739a20eSAndy Ritger             // physical links.
19211739a20eSAndy Ritger             //
19221739a20eSAndy Ritger             pKernelNvlink->registryLinkMask |= NVBIT(physLink);
19231739a20eSAndy Ritger 
19241739a20eSAndy Ritger             // Config is forced (at least one link requested)
19251739a20eSAndy Ritger             pKernelNvlink->bChiplibConfig = NV_TRUE;
19261739a20eSAndy Ritger         }
19271739a20eSAndy Ritger         else
19281739a20eSAndy Ritger         {
19291739a20eSAndy Ritger             NV_PRINTF(LEVEL_INFO,
19301739a20eSAndy Ritger                       "ARCH_CONNECTION info from chiplib: DISABLED Logical link %d (Physical "
19311739a20eSAndy Ritger                       "link %d) = 0x%X\n", i, physLink,
19321739a20eSAndy Ritger                       pKernelNvlink->pLinkConnection[i]);
19331739a20eSAndy Ritger         }
19341739a20eSAndy Ritger 
19351739a20eSAndy Ritger         // Accumulate any PEER links
19361739a20eSAndy Ritger         if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _PEER_MASK, pKernelNvlink->pLinkConnection[i]))
19371739a20eSAndy Ritger         {
19381739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
19391739a20eSAndy Ritger             // Ensure reginit has the info it needs for the remote side
19401739a20eSAndy Ritger             pKernelNvlink->nvlinkLinks[i].remoteEndInfo.bConnected = NV_TRUE;
19411739a20eSAndy Ritger             pKernelNvlink->nvlinkLinks[i].remoteEndInfo.deviceType =
19421739a20eSAndy Ritger                                                     NV2080_CTRL_NVLINK_DEVICE_INFO_DEVICE_TYPE_GPU;
19431739a20eSAndy Ritger 
19441739a20eSAndy Ritger #endif
19451739a20eSAndy Ritger         }
19461739a20eSAndy Ritger 
19471739a20eSAndy Ritger         // Accumulate any CPU links
19481739a20eSAndy Ritger         if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _CPU, pKernelNvlink->pLinkConnection[i]))
19491739a20eSAndy Ritger         {
19501739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
19511739a20eSAndy Ritger             // Ensure reginit has the info it needs for the remote side
19521739a20eSAndy Ritger             pKernelNvlink->nvlinkLinks[i].remoteEndInfo.bConnected = NV_TRUE;
19531739a20eSAndy Ritger             pKernelNvlink->nvlinkLinks[i].remoteEndInfo.deviceType = pKernelNvlink->forcedSysmemDeviceType;
19541739a20eSAndy Ritger #endif
19551739a20eSAndy Ritger         }
19561739a20eSAndy Ritger 
19571739a20eSAndy Ritger         // RPC into GSP-RM to update the link remote connection status
19581739a20eSAndy Ritger         status = knvlinkUpdateLinkConnectionStatus(pGpu, pKernelNvlink, i);
19591739a20eSAndy Ritger         if (status != NV_OK)
19601739a20eSAndy Ritger         {
19611739a20eSAndy Ritger             return status;
19621739a20eSAndy Ritger         }
19631739a20eSAndy Ritger     }
19641739a20eSAndy Ritger 
19651739a20eSAndy Ritger     // Update enabledLinks mask with the mask of forced link configurations
19661739a20eSAndy Ritger     pKernelNvlink->enabledLinks = pKernelNvlink->discoveredLinks & pKernelNvlink->registryLinkMask;
19671739a20eSAndy Ritger 
19681739a20eSAndy Ritger     return NV_OK;
19691739a20eSAndy Ritger }
19701739a20eSAndy Ritger 
19711739a20eSAndy Ritger /*!
19721739a20eSAndy Ritger  * @brief Sync the lane shutdown properties with GSP-RM
19731739a20eSAndy Ritger  *
19741739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer
19751739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
19761739a20eSAndy Ritger  */
19771739a20eSAndy Ritger NV_STATUS
knvlinkSyncLaneShutdownProps_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)19781739a20eSAndy Ritger knvlinkSyncLaneShutdownProps_IMPL
19791739a20eSAndy Ritger (
19801739a20eSAndy Ritger     OBJGPU       *pGpu,
19811739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
19821739a20eSAndy Ritger )
19831739a20eSAndy Ritger {
19841739a20eSAndy Ritger     NV_STATUS status = NV_OK;
19851739a20eSAndy Ritger 
19861739a20eSAndy Ritger     NV2080_CTRL_NVLINK_SYNC_NVLINK_SHUTDOWN_PROPS_PARAMS params;
19871739a20eSAndy Ritger 
19881739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
19891739a20eSAndy Ritger 
19901739a20eSAndy Ritger     params.bLaneShutdownEnabled  =
19911739a20eSAndy Ritger         pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ENABLED);
19921739a20eSAndy Ritger     params.bLaneShutdownOnUnload =
19931739a20eSAndy Ritger         pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ON_UNLOAD);
19941739a20eSAndy Ritger 
19951739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
19961739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_SYNC_NVLINK_SHUTDOWN_PROPS,
19971739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
19981739a20eSAndy Ritger     if (status != NV_OK)
19991739a20eSAndy Ritger     {
20001739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to sync NVLink shutdown properties with GSP!\n");
20011739a20eSAndy Ritger         return status;
20021739a20eSAndy Ritger     }
20031739a20eSAndy Ritger 
20041739a20eSAndy Ritger     return NV_OK;
20051739a20eSAndy Ritger }
20061739a20eSAndy Ritger 
20071739a20eSAndy Ritger /*!
200890eb1077SAndy Ritger  * @brief   Get the number of active links allowed per IOCTRL
200990eb1077SAndy Ritger  *
201090eb1077SAndy Ritger  * @param[in] pGpu           OBJGPU pointer
201190eb1077SAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
201290eb1077SAndy Ritger  *
201390eb1077SAndy Ritger  * @returns On success, returns the number of active links per IOCTRL.
201490eb1077SAndy Ritger  *          On failure, returns 0.
201590eb1077SAndy Ritger  */
201690eb1077SAndy Ritger NvU32
knvlinkGetNumActiveLinksPerIoctrl_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)201790eb1077SAndy Ritger knvlinkGetNumActiveLinksPerIoctrl_IMPL
201890eb1077SAndy Ritger (
201990eb1077SAndy Ritger     OBJGPU       *pGpu,
202090eb1077SAndy Ritger     KernelNvlink *pKernelNvlink
202190eb1077SAndy Ritger )
202290eb1077SAndy Ritger {
202390eb1077SAndy Ritger     NV_STATUS status;
202490eb1077SAndy Ritger     NV2080_CTRL_INTERNAL_NVLINK_GET_NUM_ACTIVE_LINK_PER_IOCTRL_PARAMS params;
202590eb1077SAndy Ritger     portMemSet(&params, 0, sizeof(params));
202690eb1077SAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
202790eb1077SAndy Ritger                                  NV2080_CTRL_INTERNAL_NVLINK_GET_NUM_ACTIVE_LINK_PER_IOCTRL,
202890eb1077SAndy Ritger                                  (void *)&params, sizeof(params));
202990eb1077SAndy Ritger     if (status != NV_OK)
203090eb1077SAndy Ritger     {
203190eb1077SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to get the number of active links per IOCTRL\n");
203290eb1077SAndy Ritger         return 0;
203390eb1077SAndy Ritger     }
203490eb1077SAndy Ritger     return params.numActiveLinksPerIoctrl;
203590eb1077SAndy Ritger }
203690eb1077SAndy Ritger 
203790eb1077SAndy Ritger /*!
203890eb1077SAndy Ritger  * @brief   Get the number of total links  per IOCTRL
203990eb1077SAndy Ritger  *
204090eb1077SAndy Ritger  * @param[in] pGpu           OBJGPU pointer
204190eb1077SAndy Ritger  * @param[in] pKernelNvlink  KernelNvlink pointer
204290eb1077SAndy Ritger  *
204390eb1077SAndy Ritger  * @returns On success, returns the number of total links per IOCTRL.
204490eb1077SAndy Ritger  *          On failure, returns 0.
204590eb1077SAndy Ritger  */
204690eb1077SAndy Ritger NvU32
knvlinkGetTotalNumLinksPerIoctrl_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)204790eb1077SAndy Ritger knvlinkGetTotalNumLinksPerIoctrl_IMPL
204890eb1077SAndy Ritger (
204990eb1077SAndy Ritger     OBJGPU       *pGpu,
205090eb1077SAndy Ritger     KernelNvlink *pKernelNvlink
205190eb1077SAndy Ritger )
205290eb1077SAndy Ritger {
205390eb1077SAndy Ritger     NV_STATUS status;
205490eb1077SAndy Ritger     NV2080_CTRL_INTERNAL_NVLINK_GET_TOTAL_NUM_LINK_PER_IOCTRL_PARAMS params;
205590eb1077SAndy Ritger     portMemSet(&params, 0, sizeof(params));
205690eb1077SAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
205790eb1077SAndy Ritger                                  NV2080_CTRL_INTERNAL_NVLINK_GET_TOTAL_NUM_LINK_PER_IOCTRL,
205890eb1077SAndy Ritger                                  (void *)&params, sizeof(params));
205990eb1077SAndy Ritger     if (status != NV_OK)
206090eb1077SAndy Ritger     {
206190eb1077SAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to get the total number of links per IOCTRL\n");
206290eb1077SAndy Ritger         return 0;
206390eb1077SAndy Ritger     }
206490eb1077SAndy Ritger     return params.numLinksPerIoctrl;
206590eb1077SAndy Ritger }
206690eb1077SAndy Ritger 
20671739a20eSAndy Ritger /**
20681739a20eSAndy Ritger  * @brief Process the mask of init disabled links
20691739a20eSAndy Ritger  *
20701739a20eSAndy Ritger  * @param[in] pGpu          OBJGPU pointer
20711739a20eSAndy Ritger  * @param[in] pKernelNvlink KernelNvlink pointer
20721739a20eSAndy Ritger  */
20731739a20eSAndy Ritger NV_STATUS
knvlinkProcessInitDisabledLinks_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)20741739a20eSAndy Ritger knvlinkProcessInitDisabledLinks_IMPL
20751739a20eSAndy Ritger (
20761739a20eSAndy Ritger     OBJGPU       *pGpu,
20771739a20eSAndy Ritger     KernelNvlink *pKernelNvlink
20781739a20eSAndy Ritger )
20791739a20eSAndy Ritger {
20801739a20eSAndy Ritger     NvU32     mask                 = 0;
20811739a20eSAndy Ritger     NvBool    bSkipHwNvlinkDisable = 0;
20821739a20eSAndy Ritger     NV_STATUS status               = NV_OK;
20831739a20eSAndy Ritger 
20841739a20eSAndy Ritger     NV2080_CTRL_NVLINK_PROCESS_INIT_DISABLED_LINKS_PARAMS params;
20851739a20eSAndy Ritger 
20861739a20eSAndy Ritger     status = gpumgrGetGpuInitDisabledNvlinks(pGpu->gpuId, &mask, &bSkipHwNvlinkDisable);
20871739a20eSAndy Ritger     if (status != NV_OK)
20881739a20eSAndy Ritger     {
20891739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to get init disabled links from gpumgr\n");
20901739a20eSAndy Ritger         return status;
20911739a20eSAndy Ritger     }
20921739a20eSAndy Ritger 
20931739a20eSAndy Ritger     portMemSet(&params, 0, sizeof(params));
20941739a20eSAndy Ritger 
20951739a20eSAndy Ritger     params.initDisabledLinksMask = mask;
20961739a20eSAndy Ritger     params.bSkipHwNvlinkDisable  = bSkipHwNvlinkDisable;
20971739a20eSAndy Ritger 
20981739a20eSAndy Ritger     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
20991739a20eSAndy Ritger                                  NV2080_CTRL_CMD_NVLINK_PROCESS_INIT_DISABLED_LINKS,
21001739a20eSAndy Ritger                                  (void *)&params, sizeof(params));
21011739a20eSAndy Ritger     if (status != NV_OK)
21021739a20eSAndy Ritger     {
21031739a20eSAndy Ritger         NV_PRINTF(LEVEL_ERROR, "Failed to process init disabled links in GSP\n");
21041739a20eSAndy Ritger         return status;
21051739a20eSAndy Ritger     }
21061739a20eSAndy Ritger 
21071739a20eSAndy Ritger     pKernelNvlink->initDisabledLinksMask = params.initDisabledLinksMask;
21081739a20eSAndy Ritger 
21091739a20eSAndy Ritger     return NV_OK;
21101739a20eSAndy Ritger }
21111739a20eSAndy Ritger 
211291676d66SBernhard Stoeckner void
knvlinkFatalErrorRecovery_WORKITEM(NvU32 gpuInstance,void * pArgs)211391676d66SBernhard Stoeckner knvlinkFatalErrorRecovery_WORKITEM
211491676d66SBernhard Stoeckner (
211591676d66SBernhard Stoeckner     NvU32 gpuInstance,
211691676d66SBernhard Stoeckner     void  *pArgs
211791676d66SBernhard Stoeckner )
211891676d66SBernhard Stoeckner {
211991676d66SBernhard Stoeckner     OBJGPU *pGpu = gpumgrGetGpu(gpuInstance);
212091676d66SBernhard Stoeckner     rcAndDisableOutstandingClientsWithImportedMemory(pGpu, NV_FABRIC_INVALID_NODE_ID);
212191676d66SBernhard Stoeckner }
212291676d66SBernhard Stoeckner 
212391676d66SBernhard Stoeckner NV_STATUS
knvlinkFatalErrorRecovery_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)212491676d66SBernhard Stoeckner knvlinkFatalErrorRecovery_IMPL
212591676d66SBernhard Stoeckner (
212691676d66SBernhard Stoeckner     OBJGPU *pGpu,
212791676d66SBernhard Stoeckner     KernelNvlink *pKernelNvlink
212891676d66SBernhard Stoeckner )
212991676d66SBernhard Stoeckner {
213091676d66SBernhard Stoeckner     NV_STATUS status;
213191676d66SBernhard Stoeckner 
213291676d66SBernhard Stoeckner     status = osQueueWorkItemWithFlags(pGpu, knvlinkFatalErrorRecovery_WORKITEM, NULL,
213391676d66SBernhard Stoeckner                                       (OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
213491676d66SBernhard Stoeckner                                         OS_QUEUE_WORKITEM_FLAGS_LOCK_API_RW |
213591676d66SBernhard Stoeckner                                         OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW));
213691676d66SBernhard Stoeckner 
213791676d66SBernhard Stoeckner      return status;
213891676d66SBernhard Stoeckner }
213991676d66SBernhard Stoeckner 
21401739a20eSAndy Ritger // Grab GPU locks before RPCing into GSP-RM for NVLink RPCs
21411739a20eSAndy Ritger NV_STATUS
knvlinkExecGspRmRpc_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 cmd,void * paramAddr,NvU32 paramSize)21421739a20eSAndy Ritger knvlinkExecGspRmRpc_IMPL
21431739a20eSAndy Ritger (
21441739a20eSAndy Ritger     OBJGPU       *pGpu,
21451739a20eSAndy Ritger     KernelNvlink *pKernelNvlink,
21461739a20eSAndy Ritger     NvU32         cmd,
21471739a20eSAndy Ritger     void         *paramAddr,
21481739a20eSAndy Ritger     NvU32         paramSize
21491739a20eSAndy Ritger )
21501739a20eSAndy Ritger {
21511739a20eSAndy Ritger     NvU32     gpuMaskRelease = 0;
21521739a20eSAndy Ritger     NvU32     gpuMaskInitial = rmGpuLocksGetOwnedMask();
21531739a20eSAndy Ritger     NvU32     gpuMask        = gpuMaskInitial | NVBIT(pGpu->gpuInstance);
21541739a20eSAndy Ritger     NV_STATUS status         = NV_OK;
21551739a20eSAndy Ritger 
21561739a20eSAndy Ritger     if (IS_GSP_CLIENT(pGpu))
21571739a20eSAndy Ritger     {
21581739a20eSAndy Ritger         if (!rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_MASK, &gpuMask))
21591739a20eSAndy Ritger         {
21601739a20eSAndy Ritger             status = rmGpuGroupLockAcquire(pGpu->gpuInstance,
21611739a20eSAndy Ritger                                            GPU_LOCK_GRP_MASK,
21621739a20eSAndy Ritger                                            GPU_LOCK_FLAGS_SAFE_LOCK_UPGRADE,
21631739a20eSAndy Ritger                                            RM_LOCK_MODULES_NVLINK,
21641739a20eSAndy Ritger                                            &gpuMask);
21651739a20eSAndy Ritger             if (status != NV_OK)
21661739a20eSAndy Ritger             {
21671739a20eSAndy Ritger                 NV_PRINTF(LEVEL_ERROR, "Failed to acquire locks for gpumask 0x%x\n", gpuMask);
21681739a20eSAndy Ritger                 return status;
21691739a20eSAndy Ritger             }
21701739a20eSAndy Ritger 
21711739a20eSAndy Ritger             gpuMaskRelease = (gpuMask & (~gpuMaskInitial));
21721739a20eSAndy Ritger         }
21731739a20eSAndy Ritger     }
21741739a20eSAndy Ritger 
21751739a20eSAndy Ritger     RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
21761739a20eSAndy Ritger     status = pRmApi->Control(pRmApi,
21771739a20eSAndy Ritger                              pGpu->hInternalClient,
21781739a20eSAndy Ritger                              pGpu->hInternalSubdevice,
21791739a20eSAndy Ritger                              cmd, paramAddr, paramSize);
21801739a20eSAndy Ritger     if (gpuMaskRelease)
21811739a20eSAndy Ritger     {
21821739a20eSAndy Ritger         rmGpuGroupLockRelease(gpuMaskRelease, GPUS_LOCK_FLAGS_NONE);
21831739a20eSAndy Ritger     }
21841739a20eSAndy Ritger 
21851739a20eSAndy Ritger     return status;
21861739a20eSAndy Ritger }
21871739a20eSAndy Ritger 
21881739a20eSAndy Ritger void
knvlinkUtoa(NvU8 * str,NvU64 length,NvU64 val)21891739a20eSAndy Ritger knvlinkUtoa(NvU8 *str, NvU64 length, NvU64 val)
21901739a20eSAndy Ritger {
21911739a20eSAndy Ritger     NvU8  temp[NV2080_GPU_MAX_NAME_STRING_LENGTH];
21921739a20eSAndy Ritger     NvU8 *ptr = temp;
21931739a20eSAndy Ritger     NvU64 i = 0;
21941739a20eSAndy Ritger 
21951739a20eSAndy Ritger     NV_ASSERT(str != NULL);
21961739a20eSAndy Ritger 
21971739a20eSAndy Ritger     do
21981739a20eSAndy Ritger     {
21991739a20eSAndy Ritger         i   = val % 10;
22001739a20eSAndy Ritger         val = val / 10;
22011739a20eSAndy Ritger         *ptr++ = (NvU8)(i + '0');
22021739a20eSAndy Ritger     } while(val);
22031739a20eSAndy Ritger 
22041739a20eSAndy Ritger     NV_ASSERT(length > (NvU64) (ptr - temp));
22051739a20eSAndy Ritger 
22061739a20eSAndy Ritger     while (ptr > temp)
22071739a20eSAndy Ritger         *str++ = *--ptr;
22081739a20eSAndy Ritger 
22091739a20eSAndy Ritger     *str = '\0';
22101739a20eSAndy Ritger }
2211