11739a20eSAndy Ritger /*
2*e45d91deSBernhard Stoeckner * SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
31739a20eSAndy Ritger * SPDX-License-Identifier: MIT
41739a20eSAndy Ritger *
51739a20eSAndy Ritger * Permission is hereby granted, free of charge, to any person obtaining a
61739a20eSAndy Ritger * copy of this software and associated documentation files (the "Software"),
71739a20eSAndy Ritger * to deal in the Software without restriction, including without limitation
81739a20eSAndy Ritger * the rights to use, copy, modify, merge, publish, distribute, sublicense,
91739a20eSAndy Ritger * and/or sell copies of the Software, and to permit persons to whom the
101739a20eSAndy Ritger * Software is furnished to do so, subject to the following conditions:
111739a20eSAndy Ritger *
121739a20eSAndy Ritger * The above copyright notice and this permission notice shall be included in
131739a20eSAndy Ritger * all copies or substantial portions of the Software.
141739a20eSAndy Ritger *
151739a20eSAndy Ritger * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
161739a20eSAndy Ritger * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
171739a20eSAndy Ritger * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
181739a20eSAndy Ritger * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
191739a20eSAndy Ritger * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
201739a20eSAndy Ritger * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
211739a20eSAndy Ritger * DEALINGS IN THE SOFTWARE.
221739a20eSAndy Ritger */
231739a20eSAndy Ritger
// Defined ahead of the #include of kernel/gpu/nvlink/kernel_nvlink.h below.
// NOTE(review): presumably unlocks the NVOC-private members of KernelNvlink
// for this translation unit - confirm against the generated header.
24eb5c7665SAndy Ritger #define NVOC_KERNEL_NVLINK_H_PRIVATE_ACCESS_ALLOWED
25eb5c7665SAndy Ritger
26eb5c7665SAndy Ritger // FIXME: this file also requests private access to the KernelIoctrl header;
// NOTE(review): the original marker gave no reason - presumably this should be
// replaced by public accessors. Confirm before removing.
27eb5c7665SAndy Ritger #define NVOC_KERNEL_IOCTRL_H_PRIVATE_ACCESS_ALLOWED
28eb5c7665SAndy Ritger
291739a20eSAndy Ritger #include "os/os.h"
301739a20eSAndy Ritger #include "core/hal.h"
311739a20eSAndy Ritger #include "core/locks.h"
3291676d66SBernhard Stoeckner #include "gpu_mgr/gpu_mgr.h"
331739a20eSAndy Ritger #include "gpu/gpu.h"
341739a20eSAndy Ritger #include "kernel/gpu/nvlink/kernel_nvlink.h"
351739a20eSAndy Ritger #include "kernel/gpu/nvlink/kernel_ioctrl.h"
361739a20eSAndy Ritger #include "gpu/mem_mgr/mem_mgr.h"
371739a20eSAndy Ritger #include "gpu/mmu/kern_gmmu.h"
381739a20eSAndy Ritger #include "gpu/ce/kernel_ce.h"
3991676d66SBernhard Stoeckner #include "platform/sli/sli.h"
4091676d66SBernhard Stoeckner #include "gpu/gpu_fabric_probe.h"
4191676d66SBernhard Stoeckner #include "compute/imex_session_api.h"
4291676d66SBernhard Stoeckner #include "compute/fabric.h"
4391676d66SBernhard Stoeckner #include "mem_mgr/mem_multicast_fabric.h"
441739a20eSAndy Ritger
451739a20eSAndy Ritger /*!
461739a20eSAndy Ritger * @brief Is NVLINK topology forced? NVLink topology is considered
471739a20eSAndy Ritger * forced for both legacy forced config and chiplib configs
481739a20eSAndy Ritger *
491739a20eSAndy Ritger * @param[in] pGpu OBJGPU
501739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
511739a20eSAndy Ritger *
521739a20eSAndy Ritger * @return NV_TRUE if topology is forced
531739a20eSAndy Ritger */
541739a20eSAndy Ritger NvBool
knvlinkIsForcedConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)551739a20eSAndy Ritger knvlinkIsForcedConfig_IMPL
561739a20eSAndy Ritger (
571739a20eSAndy Ritger OBJGPU *pGpu,
581739a20eSAndy Ritger KernelNvlink *pKernelNvlink
591739a20eSAndy Ritger )
601739a20eSAndy Ritger {
611739a20eSAndy Ritger return (pKernelNvlink->bChiplibConfig);
621739a20eSAndy Ritger }
631739a20eSAndy Ritger
641739a20eSAndy Ritger /*!
651739a20eSAndy Ritger * @brief Determine if NVLink is enabled or disabled by default
661739a20eSAndy Ritger *
671739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
681739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
691739a20eSAndy Ritger *
701739a20eSAndy Ritger * @return NV_TRUE if NVLink is enabled on the GPU/platform
711739a20eSAndy Ritger */
721739a20eSAndy Ritger NvBool
knvlinkIsNvlinkDefaultEnabled_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)731739a20eSAndy Ritger knvlinkIsNvlinkDefaultEnabled_IMPL
741739a20eSAndy Ritger (
751739a20eSAndy Ritger OBJGPU *pGpu,
761739a20eSAndy Ritger KernelNvlink *pKernelNvlink
771739a20eSAndy Ritger )
781739a20eSAndy Ritger {
791739a20eSAndy Ritger //
801739a20eSAndy Ritger // Currently it is critical that the following lib check be present.
811739a20eSAndy Ritger // Burying this in the hal below it may get lost as the stub is all
821739a20eSAndy Ritger // thats required for POR (always true from the hals perspective)
831739a20eSAndy Ritger //
841739a20eSAndy Ritger #if !defined(INCLUDE_NVLINK_LIB)
851739a20eSAndy Ritger
861739a20eSAndy Ritger return NV_FALSE;
871739a20eSAndy Ritger
881739a20eSAndy Ritger #endif
891739a20eSAndy Ritger
901739a20eSAndy Ritger // Let the PDB handle the final decision.
911739a20eSAndy Ritger return pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_ENABLED);
921739a20eSAndy Ritger }
931739a20eSAndy Ritger
941739a20eSAndy Ritger /*!
951739a20eSAndy Ritger * @brief Determine if P2P loopback over NVLink is supported for
961739a20eSAndy Ritger * the given GPU. This function returns true if any link
971739a20eSAndy Ritger * is connected in loopback mode.
981739a20eSAndy Ritger *
991739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
1001739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
1011739a20eSAndy Ritger *
1021739a20eSAndy Ritger * @return NV_TRUE if any link is in loopback mode
1031739a20eSAndy Ritger */
1041739a20eSAndy Ritger NvBool
knvlinkIsP2pLoopbackSupported_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)1051739a20eSAndy Ritger knvlinkIsP2pLoopbackSupported_IMPL
1061739a20eSAndy Ritger (
1071739a20eSAndy Ritger OBJGPU *pGpu,
1081739a20eSAndy Ritger KernelNvlink *pKernelNvlink
1091739a20eSAndy Ritger )
1101739a20eSAndy Ritger {
1111739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
1121739a20eSAndy Ritger
1131739a20eSAndy Ritger NvU32 i;
1141739a20eSAndy Ritger
1151739a20eSAndy Ritger if ((pGpu == NULL) || (pKernelNvlink == NULL))
1161739a20eSAndy Ritger {
1171739a20eSAndy Ritger return NV_FALSE;
1181739a20eSAndy Ritger }
1191739a20eSAndy Ritger
1201739a20eSAndy Ritger // Return false if P2P loopback is disabled through regkey
1211739a20eSAndy Ritger if (pGpu->getProperty(pGpu, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED))
1221739a20eSAndy Ritger {
1231739a20eSAndy Ritger return NV_FALSE;
1241739a20eSAndy Ritger }
1251739a20eSAndy Ritger
1261739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
1271739a20eSAndy Ritger {
1281739a20eSAndy Ritger if (knvlinkIsP2pLoopbackSupportedPerLink_IMPL(pGpu, pKernelNvlink, i))
1291739a20eSAndy Ritger return NV_TRUE;
1301739a20eSAndy Ritger }
1311739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END
1321739a20eSAndy Ritger
1331739a20eSAndy Ritger #endif
1341739a20eSAndy Ritger
1351739a20eSAndy Ritger return NV_FALSE;
1361739a20eSAndy Ritger }
1371739a20eSAndy Ritger
1381739a20eSAndy Ritger /*!
1391739a20eSAndy Ritger * @brief Determine if P2P loopback over NVLink is supported for
1401739a20eSAndy Ritger * the given link. This function returns true if the link
1411739a20eSAndy Ritger * is connected in loopback mode.
1421739a20eSAndy Ritger *
1431739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
1441739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
1451739a20eSAndy Ritger * @param[in] link Link ID
1461739a20eSAndy Ritger *
1471739a20eSAndy Ritger * @return NV_TRUE if the link is in loopback mode
1481739a20eSAndy Ritger */
1491739a20eSAndy Ritger NvBool
knvlinkIsP2pLoopbackSupportedPerLink_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 link)1501739a20eSAndy Ritger knvlinkIsP2pLoopbackSupportedPerLink_IMPL
1511739a20eSAndy Ritger (
1521739a20eSAndy Ritger OBJGPU *pGpu,
1531739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
1541739a20eSAndy Ritger NvU32 link
1551739a20eSAndy Ritger )
1561739a20eSAndy Ritger {
1571739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
1581739a20eSAndy Ritger
1591739a20eSAndy Ritger if ((pGpu == NULL) || (pKernelNvlink == NULL))
1601739a20eSAndy Ritger {
1611739a20eSAndy Ritger return NV_FALSE;
1621739a20eSAndy Ritger }
1631739a20eSAndy Ritger
1641739a20eSAndy Ritger // Return false if P2P loopback is disabled through regkey
1651739a20eSAndy Ritger if (pGpu->getProperty(pGpu, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED))
1661739a20eSAndy Ritger {
1671739a20eSAndy Ritger return NV_FALSE;
1681739a20eSAndy Ritger }
1691739a20eSAndy Ritger
1701739a20eSAndy Ritger // Return false if the given link is disabled
1711739a20eSAndy Ritger if (!(NVBIT(link) & pKernelNvlink->enabledLinks))
1721739a20eSAndy Ritger {
1731739a20eSAndy Ritger return NV_FALSE;
1741739a20eSAndy Ritger }
1751739a20eSAndy Ritger
1761739a20eSAndy Ritger // Check the link connected to the same GPU (loopback)
1771739a20eSAndy Ritger if (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.bConnected)
1781739a20eSAndy Ritger {
179b5bf85a8SAndy Ritger if (((pKernelNvlink->nvlinkLinks[link].remoteEndInfo.domain == gpuGetDomain(pGpu)) &&
1801739a20eSAndy Ritger (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.bus == gpuGetBus(pGpu)) &&
1811739a20eSAndy Ritger (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.device == gpuGetDevice(pGpu)) &&
182b5bf85a8SAndy Ritger (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.function == 0)) ||
183b5bf85a8SAndy Ritger pKernelNvlink->PDB_PROP_KNVLINK_FORCED_LOOPBACK_ON_SWITCH_MODE_ENABLED)
1841739a20eSAndy Ritger {
1851739a20eSAndy Ritger return NV_TRUE;
1861739a20eSAndy Ritger }
1871739a20eSAndy Ritger }
1881739a20eSAndy Ritger
1891739a20eSAndy Ritger #endif
1901739a20eSAndy Ritger
1911739a20eSAndy Ritger return NV_FALSE;
1921739a20eSAndy Ritger }
1931739a20eSAndy Ritger
1941739a20eSAndy Ritger /*!
1951739a20eSAndy Ritger * @brief Determine if P2P over NVLINK is supported between 2 GPUs
1961739a20eSAndy Ritger *
1971739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
1981739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
1991739a20eSAndy Ritger * @param[in] pPeerGpu OBJGPU pointer for remote GPU
2001739a20eSAndy Ritger *
2011739a20eSAndy Ritger * @return NV_TRUE if P2P is supported between the 2 GPUs
2021739a20eSAndy Ritger */
2031739a20eSAndy Ritger NvBool
knvlinkIsNvlinkP2pSupported_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)2041739a20eSAndy Ritger knvlinkIsNvlinkP2pSupported_IMPL
2051739a20eSAndy Ritger (
2061739a20eSAndy Ritger OBJGPU *pGpu,
2071739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
2081739a20eSAndy Ritger OBJGPU *pPeerGpu
2091739a20eSAndy Ritger )
2101739a20eSAndy Ritger {
2111739a20eSAndy Ritger NV_STATUS status = NV_OK;
2121739a20eSAndy Ritger
2131739a20eSAndy Ritger if (pKernelNvlink == NULL)
2141739a20eSAndy Ritger {
2151739a20eSAndy Ritger return NV_FALSE;
2161739a20eSAndy Ritger }
2171739a20eSAndy Ritger
2184397463eSAndy Ritger if (knvlinkIsBandwidthModeOff(pKernelNvlink))
2194397463eSAndy Ritger {
2204397463eSAndy Ritger return NV_FALSE;
2214397463eSAndy Ritger }
2224397463eSAndy Ritger
2231739a20eSAndy Ritger // Get the Nvlink P2P connections from the core library
2241739a20eSAndy Ritger status = knvlinkGetP2pConnectionStatus(pGpu, pKernelNvlink, pPeerGpu);
2251739a20eSAndy Ritger
2261739a20eSAndy Ritger if (status == NV_OK)
2271739a20eSAndy Ritger {
2281739a20eSAndy Ritger return NV_TRUE;
2291739a20eSAndy Ritger }
2301739a20eSAndy Ritger
2311739a20eSAndy Ritger return NV_FALSE;
2321739a20eSAndy Ritger }
2331739a20eSAndy Ritger
234b5bf85a8SAndy Ritger static NvBool
_knvlinkCheckFabricCliqueId(OBJGPU * pGpu,OBJGPU * pPeerGpu)235b5bf85a8SAndy Ritger _knvlinkCheckFabricCliqueId
236b5bf85a8SAndy Ritger (
237b5bf85a8SAndy Ritger OBJGPU *pGpu,
238b5bf85a8SAndy Ritger OBJGPU *pPeerGpu
239b5bf85a8SAndy Ritger )
240b5bf85a8SAndy Ritger {
241b5bf85a8SAndy Ritger NvU32 cliqueId, peerCliqueId;
242b5bf85a8SAndy Ritger NV_STATUS status;
243b5bf85a8SAndy Ritger
244b5bf85a8SAndy Ritger status = gpuFabricProbeGetFabricCliqueId(pGpu->pGpuFabricProbeInfoKernel,
245b5bf85a8SAndy Ritger &cliqueId);
246b5bf85a8SAndy Ritger if (status != NV_OK)
247b5bf85a8SAndy Ritger {
248b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d failed to get fabric clique Id: 0x%x\n",
249b5bf85a8SAndy Ritger gpuGetInstance(pGpu), status);
250b5bf85a8SAndy Ritger return NV_FALSE;
251b5bf85a8SAndy Ritger }
252b5bf85a8SAndy Ritger
253b5bf85a8SAndy Ritger status = gpuFabricProbeGetFabricCliqueId(pPeerGpu->pGpuFabricProbeInfoKernel,
254b5bf85a8SAndy Ritger &peerCliqueId);
255b5bf85a8SAndy Ritger if (status != NV_OK)
256b5bf85a8SAndy Ritger {
257b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d failed to get fabric clique Id 0x%x\n",
258b5bf85a8SAndy Ritger gpuGetInstance(pPeerGpu), status);
259b5bf85a8SAndy Ritger return NV_FALSE;
260b5bf85a8SAndy Ritger }
261b5bf85a8SAndy Ritger
262b5bf85a8SAndy Ritger if (cliqueId != peerCliqueId)
263b5bf85a8SAndy Ritger {
264b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d and Peer GPU %d cliqueId doesn't match\n",
265b5bf85a8SAndy Ritger gpuGetInstance(pGpu), gpuGetInstance(pPeerGpu));
266b5bf85a8SAndy Ritger return NV_FALSE;
267b5bf85a8SAndy Ritger }
268b5bf85a8SAndy Ritger
269b5bf85a8SAndy Ritger return NV_TRUE;
270b5bf85a8SAndy Ritger }
271b5bf85a8SAndy Ritger
2721739a20eSAndy Ritger /*!
273ea4c27faSBernhard Stoeckner * @brief Checks whether EGM addresses are valid for P2P
274ea4c27faSBernhard Stoeckner * when GPU is connected to NVSwitch
275ea4c27faSBernhard Stoeckner *
276ea4c27faSBernhard Stoeckner * @param[in] pGpu OBJGPU pointer for local GPU
277ea4c27faSBernhard Stoeckner * @param[in] pKernelNvlink KernelNvlink pointer
278ea4c27faSBernhard Stoeckner * @param[in] pPeerGpu OBJGPU pointer for remote GPU
279ea4c27faSBernhard Stoeckner *
280ea4c27faSBernhard Stoeckner * @return NV_TRUE if EGM addresses are valid
281ea4c27faSBernhard Stoeckner */
282ea4c27faSBernhard Stoeckner static NvBool
_knvlinkCheckNvswitchEgmAddressSanity(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)283ea4c27faSBernhard Stoeckner _knvlinkCheckNvswitchEgmAddressSanity
284ea4c27faSBernhard Stoeckner (
285ea4c27faSBernhard Stoeckner OBJGPU *pGpu,
286ea4c27faSBernhard Stoeckner KernelNvlink *pKernelNvlink,
287ea4c27faSBernhard Stoeckner OBJGPU *pPeerGpu
288ea4c27faSBernhard Stoeckner )
289ea4c27faSBernhard Stoeckner {
290ea4c27faSBernhard Stoeckner NvU64 egmRangeStart = knvlinkGetUniqueFabricEgmBaseAddress(pGpu, pKernelNvlink);
291ea4c27faSBernhard Stoeckner
292ea4c27faSBernhard Stoeckner if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink))
293ea4c27faSBernhard Stoeckner {
294ea4c27faSBernhard Stoeckner if (gpuIsSriovEnabled(pGpu))
295ea4c27faSBernhard Stoeckner {
296ea4c27faSBernhard Stoeckner // currently vgpu + switch doesn't support GPA addressing.
297ea4c27faSBernhard Stoeckner return NV_TRUE;
298ea4c27faSBernhard Stoeckner }
299ea4c27faSBernhard Stoeckner
300ea4c27faSBernhard Stoeckner if (gpuFabricProbeIsSupported(pGpu) && gpuFabricProbeIsSupported(pPeerGpu))
301ea4c27faSBernhard Stoeckner {
302ea4c27faSBernhard Stoeckner if (!_knvlinkCheckFabricCliqueId(pGpu, pPeerGpu))
303ea4c27faSBernhard Stoeckner {
304ea4c27faSBernhard Stoeckner return NV_FALSE;
305ea4c27faSBernhard Stoeckner }
306ea4c27faSBernhard Stoeckner }
307ea4c27faSBernhard Stoeckner
308ea4c27faSBernhard Stoeckner // Sanity checks for EGM address
309ea4c27faSBernhard Stoeckner if (egmRangeStart == NVLINK_INVALID_FABRIC_ADDR)
310ea4c27faSBernhard Stoeckner {
311ea4c27faSBernhard Stoeckner NV_PRINTF(LEVEL_ERROR, "GPU %d doesn't have a EGM fabric address\n",
312ea4c27faSBernhard Stoeckner gpuGetInstance(pGpu));
313ea4c27faSBernhard Stoeckner
314ea4c27faSBernhard Stoeckner return NV_FALSE;
315ea4c27faSBernhard Stoeckner }
316ea4c27faSBernhard Stoeckner }
317ea4c27faSBernhard Stoeckner else
318ea4c27faSBernhard Stoeckner {
319ea4c27faSBernhard Stoeckner // Sanity check for EGM address
320ea4c27faSBernhard Stoeckner if (egmRangeStart != NVLINK_INVALID_FABRIC_ADDR)
321ea4c27faSBernhard Stoeckner {
322ea4c27faSBernhard Stoeckner NV_PRINTF(LEVEL_ERROR,
323ea4c27faSBernhard Stoeckner "non-NVSwitch GPU %d has a valid EGM fabric address\n",
324ea4c27faSBernhard Stoeckner gpuGetInstance(pGpu));
325ea4c27faSBernhard Stoeckner
326ea4c27faSBernhard Stoeckner return NV_FALSE;
327ea4c27faSBernhard Stoeckner }
328ea4c27faSBernhard Stoeckner
329ea4c27faSBernhard Stoeckner }
330ea4c27faSBernhard Stoeckner
331ea4c27faSBernhard Stoeckner return NV_TRUE;
332ea4c27faSBernhard Stoeckner }
333ea4c27faSBernhard Stoeckner
334ea4c27faSBernhard Stoeckner /*!
3351739a20eSAndy Ritger * @brief Checks whether necessary the config setup is done to
3361739a20eSAndy Ritger * support P2P over NVSwitch
3371739a20eSAndy Ritger *
3381739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
3391739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
3401739a20eSAndy Ritger * @param[in] pPeerGpu OBJGPU pointer for remote GPU
3411739a20eSAndy Ritger *
3421739a20eSAndy Ritger * @return NV_TRUE if P2P over NVSwitch
3431739a20eSAndy Ritger */
3441739a20eSAndy Ritger NvBool
knvlinkCheckNvswitchP2pConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)3451739a20eSAndy Ritger knvlinkCheckNvswitchP2pConfig_IMPL
3461739a20eSAndy Ritger (
3471739a20eSAndy Ritger OBJGPU *pGpu,
3481739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
3491739a20eSAndy Ritger OBJGPU *pPeerGpu
3501739a20eSAndy Ritger )
3511739a20eSAndy Ritger {
3521739a20eSAndy Ritger MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
353ea4c27faSBernhard Stoeckner NvU64 hbmRangeStart = knvlinkGetUniqueFabricBaseAddress(pGpu, pKernelNvlink);
354ea4c27faSBernhard Stoeckner NvU64 hbmRangeEnd = hbmRangeStart + (pMemoryManager->Ram.fbTotalMemSizeMb << 20);
355ea4c27faSBernhard Stoeckner NvU64 hbmPeerRangeStart = knvlinkGetUniqueFabricBaseAddress(pPeerGpu,
3561739a20eSAndy Ritger GPU_GET_KERNEL_NVLINK(pPeerGpu));
3571739a20eSAndy Ritger
3581739a20eSAndy Ritger if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink))
3591739a20eSAndy Ritger {
3601739a20eSAndy Ritger if (gpuIsSriovEnabled(pGpu))
3611739a20eSAndy Ritger {
3621739a20eSAndy Ritger // currently vgpu + switch doesn't support GPA addresing.
3631739a20eSAndy Ritger return NV_TRUE;
3641739a20eSAndy Ritger }
3651739a20eSAndy Ritger
366b5bf85a8SAndy Ritger if (gpuFabricProbeIsSupported(pGpu) && gpuFabricProbeIsSupported(pPeerGpu))
367b5bf85a8SAndy Ritger {
368b5bf85a8SAndy Ritger if (!_knvlinkCheckFabricCliqueId(pGpu, pPeerGpu))
369b5bf85a8SAndy Ritger {
370b5bf85a8SAndy Ritger return NV_FALSE;
371b5bf85a8SAndy Ritger }
372b5bf85a8SAndy Ritger }
373b5bf85a8SAndy Ritger
374ea4c27faSBernhard Stoeckner // Sanity checks for HBM addresses
375ea4c27faSBernhard Stoeckner if (hbmRangeStart == NVLINK_INVALID_FABRIC_ADDR)
3761739a20eSAndy Ritger {
3771739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d doesn't have a fabric address\n",
3781739a20eSAndy Ritger gpuGetInstance(pGpu));
3791739a20eSAndy Ritger
3801739a20eSAndy Ritger return NV_FALSE;
3811739a20eSAndy Ritger }
3821739a20eSAndy Ritger
3831739a20eSAndy Ritger if ((pGpu != pPeerGpu) &&
384ea4c27faSBernhard Stoeckner ((hbmPeerRangeStart >= hbmRangeStart) && (hbmPeerRangeStart < hbmRangeEnd)))
3851739a20eSAndy Ritger {
3861739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
3871739a20eSAndy Ritger "GPU %d doesn't have a unique fabric address\n",
3881739a20eSAndy Ritger gpuGetInstance(pGpu));
3891739a20eSAndy Ritger
3901739a20eSAndy Ritger return NV_FALSE;
3911739a20eSAndy Ritger }
3921739a20eSAndy Ritger }
3931739a20eSAndy Ritger else
3941739a20eSAndy Ritger {
395ea4c27faSBernhard Stoeckner // Sanity check for HBM address
396ea4c27faSBernhard Stoeckner if (hbmRangeStart != NVLINK_INVALID_FABRIC_ADDR)
3971739a20eSAndy Ritger {
3981739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
3991739a20eSAndy Ritger "non-NVSwitch GPU %d has a valid fabric address\n",
4001739a20eSAndy Ritger gpuGetInstance(pGpu));
4011739a20eSAndy Ritger
4021739a20eSAndy Ritger return NV_FALSE;
4031739a20eSAndy Ritger }
4041739a20eSAndy Ritger }
4051739a20eSAndy Ritger
406ea4c27faSBernhard Stoeckner if (memmgrIsLocalEgmEnabled(pMemoryManager))
407ea4c27faSBernhard Stoeckner {
408ea4c27faSBernhard Stoeckner return _knvlinkCheckNvswitchEgmAddressSanity(pGpu, pKernelNvlink, pPeerGpu);
409ea4c27faSBernhard Stoeckner }
410ea4c27faSBernhard Stoeckner
4111739a20eSAndy Ritger return NV_TRUE;
4121739a20eSAndy Ritger }
4131739a20eSAndy Ritger
4141739a20eSAndy Ritger /*!
4151739a20eSAndy Ritger * @brief Get Nvlink P2P connections between 2 GPUs
4161739a20eSAndy Ritger *
4171739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
4181739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
4191739a20eSAndy Ritger * @param[in] pPeerGpu OBJGPU pointer for remote GPU
4201739a20eSAndy Ritger *
4211739a20eSAndy Ritger * @return NV_OK if P2P connections are present
4221739a20eSAndy Ritger */
4231739a20eSAndy Ritger NV_STATUS
knvlinkGetP2pConnectionStatus_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pPeerGpu)4241739a20eSAndy Ritger knvlinkGetP2pConnectionStatus_IMPL
4251739a20eSAndy Ritger (
4261739a20eSAndy Ritger OBJGPU *pGpu,
4271739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
4281739a20eSAndy Ritger OBJGPU *pPeerGpu
4291739a20eSAndy Ritger )
4301739a20eSAndy Ritger {
4311739a20eSAndy Ritger NV_STATUS status = NV_OK;
4321739a20eSAndy Ritger OBJGPU *pGpu0 = pGpu;
4331739a20eSAndy Ritger OBJGPU *pGpu1 = pPeerGpu;
4341739a20eSAndy Ritger KernelNvlink *pKernelNvlink0 = pKernelNvlink;
4351739a20eSAndy Ritger KernelNvlink *pKernelNvlink1 = NULL;
4361739a20eSAndy Ritger NvU32 numPeerLinks = 0;
4371739a20eSAndy Ritger
4381739a20eSAndy Ritger if (pGpu1 == NULL)
4391739a20eSAndy Ritger {
440b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_INFO, "Invalid pPeerGpu.\n");
4411739a20eSAndy Ritger
4421739a20eSAndy Ritger return NV_ERR_INVALID_ARGUMENT;
4431739a20eSAndy Ritger }
4441739a20eSAndy Ritger else if ((pGpu0 == pGpu1) &&
4451739a20eSAndy Ritger (pGpu0->getProperty(pGpu0, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED)))
4461739a20eSAndy Ritger {
4471739a20eSAndy Ritger // P2P over loopback links are disabled through regkey overrides
4481739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, "loopback P2P on GPU%u disabled by regkey\n",
4491739a20eSAndy Ritger gpuGetInstance(pGpu0));
4501739a20eSAndy Ritger
4511739a20eSAndy Ritger return NV_ERR_NOT_SUPPORTED;
4521739a20eSAndy Ritger }
4531739a20eSAndy Ritger else
4541739a20eSAndy Ritger {
4551739a20eSAndy Ritger pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1);
4561739a20eSAndy Ritger }
4571739a20eSAndy Ritger
4581739a20eSAndy Ritger if (pKernelNvlink1 == NULL)
4591739a20eSAndy Ritger {
460b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_INFO,
4611739a20eSAndy Ritger "Input mask contains a GPU on which NVLink is disabled.\n");
4621739a20eSAndy Ritger
4631739a20eSAndy Ritger return NV_ERR_INVALID_ARGUMENT;
4641739a20eSAndy Ritger }
4651739a20eSAndy Ritger
466758b4ee8SAndy Ritger if(pKernelNvlink0->bIsGpuDegraded)
467758b4ee8SAndy Ritger {
468758b4ee8SAndy Ritger NV_PRINTF(LEVEL_INFO,
469758b4ee8SAndy Ritger "NVLink P2P is NOT supported between GPU%d and GPU%d\n",
470758b4ee8SAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
471758b4ee8SAndy Ritger
472758b4ee8SAndy Ritger return NV_ERR_NOT_SUPPORTED;
473758b4ee8SAndy Ritger }
474758b4ee8SAndy Ritger
475758b4ee8SAndy Ritger if(pKernelNvlink1->bIsGpuDegraded)
476758b4ee8SAndy Ritger {
477758b4ee8SAndy Ritger NV_PRINTF(LEVEL_INFO,
478758b4ee8SAndy Ritger "NVLink P2P is NOT supported between GPU%d and GPU%d\n",
479758b4ee8SAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
480758b4ee8SAndy Ritger
481758b4ee8SAndy Ritger return NV_ERR_NOT_SUPPORTED;
482758b4ee8SAndy Ritger }
483758b4ee8SAndy Ritger
4841739a20eSAndy Ritger if ((IS_RTLSIM(pGpu0) && !pKernelNvlink0->bForceEnableCoreLibRtlsims) ||
4851739a20eSAndy Ritger knvlinkIsForcedConfig(pGpu0, pKernelNvlink0))
4861739a20eSAndy Ritger {
4871739a20eSAndy Ritger // For non-legacy configs.
4881739a20eSAndy Ritger if (pKernelNvlink0->bChiplibConfig)
4891739a20eSAndy Ritger {
4901739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO,
4911739a20eSAndy Ritger "NVLink P2P is supported between GPU%d and GPU%d\n",
4921739a20eSAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
4931739a20eSAndy Ritger
4941739a20eSAndy Ritger return NV_OK;
4951739a20eSAndy Ritger }
4961739a20eSAndy Ritger }
4971739a20eSAndy Ritger
4981739a20eSAndy Ritger // Get the remote ends of the links of local GPU from the nvlink core
4994397463eSAndy Ritger status = knvlinkCoreGetRemoteDeviceInfo(pGpu0, pKernelNvlink0);
5004397463eSAndy Ritger if (status != NV_OK)
5014397463eSAndy Ritger {
5024397463eSAndy Ritger return status;
5034397463eSAndy Ritger }
5041739a20eSAndy Ritger
5051739a20eSAndy Ritger // Post topology link enable on links of local GPU
5061739a20eSAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu0, pKernelNvlink0,
5071739a20eSAndy Ritger pKernelNvlink0->enabledLinks);
5081739a20eSAndy Ritger if (status != NV_OK)
5091739a20eSAndy Ritger {
5101739a20eSAndy Ritger return status;
5111739a20eSAndy Ritger }
5121739a20eSAndy Ritger
5131739a20eSAndy Ritger numPeerLinks = knvlinkGetNumLinksToPeer(pGpu0, pKernelNvlink0, pGpu1);
514758b4ee8SAndy Ritger
515758b4ee8SAndy Ritger //
516758b4ee8SAndy Ritger // Maybe knvlinkCoreGetRemoteDeviceInfo was never called on pGpu1.
517758b4ee8SAndy Ritger // This can happen on systems where FM doesn't configure GPUs
518758b4ee8SAndy Ritger // using RM control calls explicitly.
519758b4ee8SAndy Ritger //
520758b4ee8SAndy Ritger if ((numPeerLinks == 0) && gpuFabricProbeIsSupported(pGpu1))
521758b4ee8SAndy Ritger {
522758b4ee8SAndy Ritger knvlinkCoreGetRemoteDeviceInfo(pGpu1, pKernelNvlink1);
523758b4ee8SAndy Ritger
524758b4ee8SAndy Ritger // Post topology link enable on links of remote GPU
525758b4ee8SAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu1, pKernelNvlink1,
526758b4ee8SAndy Ritger pKernelNvlink1->enabledLinks);
527758b4ee8SAndy Ritger if (status != NV_OK)
528758b4ee8SAndy Ritger {
529758b4ee8SAndy Ritger return status;
530758b4ee8SAndy Ritger }
531758b4ee8SAndy Ritger
532758b4ee8SAndy Ritger numPeerLinks = knvlinkGetNumLinksToPeer(pGpu0, pKernelNvlink0, pGpu1);
533758b4ee8SAndy Ritger }
534758b4ee8SAndy Ritger
5351739a20eSAndy Ritger if (numPeerLinks > 0)
5361739a20eSAndy Ritger {
5371739a20eSAndy Ritger if (knvlinkGetNumLinksToPeer(pGpu1, pKernelNvlink1, pGpu0) != numPeerLinks)
5381739a20eSAndy Ritger {
5391739a20eSAndy Ritger // Get the remote ends of the links of remote GPU from the nvlink core
5404397463eSAndy Ritger status = knvlinkCoreGetRemoteDeviceInfo(pGpu1, pKernelNvlink1);
5414397463eSAndy Ritger if (status != NV_OK)
5424397463eSAndy Ritger {
5434397463eSAndy Ritger return status;
5444397463eSAndy Ritger }
5451739a20eSAndy Ritger
5461739a20eSAndy Ritger // Post topology link enable on links of remote GPU
5471739a20eSAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu1, pKernelNvlink1,
5481739a20eSAndy Ritger pKernelNvlink1->enabledLinks);
5491739a20eSAndy Ritger if (status != NV_OK)
5501739a20eSAndy Ritger {
5511739a20eSAndy Ritger return status;
5521739a20eSAndy Ritger }
5531739a20eSAndy Ritger }
5541739a20eSAndy Ritger
5551739a20eSAndy Ritger // Peers should have the same number of links pointing back at us
556b5bf85a8SAndy Ritger NV_CHECK_OR_RETURN(LEVEL_INFO,
557b5bf85a8SAndy Ritger (knvlinkGetNumLinksToPeer(pGpu1, pKernelNvlink1, pGpu0) == numPeerLinks),
5581739a20eSAndy Ritger NV_ERR_INVALID_STATE);
5591739a20eSAndy Ritger
560b5bf85a8SAndy Ritger NV_CHECK_OR_RETURN(LEVEL_INFO,
561b5bf85a8SAndy Ritger knvlinkCheckNvswitchP2pConfig(pGpu0, pKernelNvlink0, pGpu1),
562b5bf85a8SAndy Ritger NV_ERR_INVALID_STATE);
563b5bf85a8SAndy Ritger
564b5bf85a8SAndy Ritger NV_CHECK_OR_RETURN(LEVEL_INFO,
565b5bf85a8SAndy Ritger knvlinkCheckNvswitchP2pConfig(pGpu1, pKernelNvlink1, pGpu0),
5661739a20eSAndy Ritger NV_ERR_INVALID_STATE);
5671739a20eSAndy Ritger
5681739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO,
5691739a20eSAndy Ritger "NVLink P2P is supported between GPU%d and GPU%d\n",
5701739a20eSAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
5711739a20eSAndy Ritger
5721739a20eSAndy Ritger return NV_OK;
5731739a20eSAndy Ritger }
5741739a20eSAndy Ritger
5751739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO,
5761739a20eSAndy Ritger "NVLink P2P is NOT supported between between GPU%d and GPU%d\n",
5771739a20eSAndy Ritger pGpu->gpuInstance, pGpu1->gpuInstance);
5781739a20eSAndy Ritger
5791739a20eSAndy Ritger return NV_ERR_NOT_SUPPORTED;
5801739a20eSAndy Ritger }
5811739a20eSAndy Ritger
5821739a20eSAndy Ritger /*!
5831739a20eSAndy Ritger * @brief Update the settings for the current established NVLink
5841739a20eSAndy Ritger * topology. This is the top level function that should be
5851739a20eSAndy Ritger * called, instead of applying the settings individually,
5861739a20eSAndy Ritger * since it grabs the required locks
5871739a20eSAndy Ritger *
5881739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
5891739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
5901739a20eSAndy Ritger *
5911739a20eSAndy Ritger * @return NV_OK on success
5921739a20eSAndy Ritger */
5931739a20eSAndy Ritger NV_STATUS
knvlinkUpdateCurrentConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)5941739a20eSAndy Ritger knvlinkUpdateCurrentConfig_IMPL
5951739a20eSAndy Ritger (
5961739a20eSAndy Ritger OBJGPU *pGpu,
5971739a20eSAndy Ritger KernelNvlink *pKernelNvlink
5981739a20eSAndy Ritger )
5991739a20eSAndy Ritger {
6001739a20eSAndy Ritger OBJSYS *pSys = SYS_GET_INSTANCE();
6011739a20eSAndy Ritger KernelCE *pKCe = NULL;
6021739a20eSAndy Ritger NvBool bOwnsLock = NV_FALSE;
6031739a20eSAndy Ritger NV_STATUS status = NV_OK;
6041739a20eSAndy Ritger
6051739a20eSAndy Ritger if (osAcquireRmSema(pSys->pSema) == NV_OK)
6061739a20eSAndy Ritger {
6071739a20eSAndy Ritger //
6081739a20eSAndy Ritger // XXX Bug 1795328: Fix P2P path to acquire locks for the GPU
6091739a20eSAndy Ritger // Due to platform differences in the P2P path, the GPU lock is not
6101739a20eSAndy Ritger // consistently held at this point in the call stack. This function
6111739a20eSAndy Ritger // requires exclusive access to RM/PMU data structures to update HSHUB,
6121739a20eSAndy Ritger // and therefore requires the GPU lock to be held at this point.
6131739a20eSAndy Ritger // This check should be removed once the P2P paths have been updated to
6141739a20eSAndy Ritger // acquire the GPU locks consistently for all platforms.
6151739a20eSAndy Ritger //
6161739a20eSAndy Ritger if (!rmDeviceGpuLockIsOwner(pGpu->gpuInstance))
6171739a20eSAndy Ritger {
6181739a20eSAndy Ritger status = rmDeviceGpuLocksAcquire(pGpu, GPUS_LOCK_FLAGS_NONE,
6191739a20eSAndy Ritger RM_LOCK_MODULES_NVLINK);
6201739a20eSAndy Ritger if (status != NV_OK)
6211739a20eSAndy Ritger {
6221739a20eSAndy Ritger NV_ASSERT(0);
6231739a20eSAndy Ritger goto fail;
6241739a20eSAndy Ritger }
6251739a20eSAndy Ritger
6261739a20eSAndy Ritger bOwnsLock = NV_TRUE;
6271739a20eSAndy Ritger }
6281739a20eSAndy Ritger
6291739a20eSAndy Ritger //
6301739a20eSAndy Ritger // Links that have remote end detected should have passed RXDET
6311739a20eSAndy Ritger // Update the mask of connected links and bridged links
6321739a20eSAndy Ritger //
6331739a20eSAndy Ritger knvlinkFilterBridgeLinks_HAL(pGpu, pKernelNvlink);
6341739a20eSAndy Ritger
6351739a20eSAndy Ritger NV2080_CTRL_NVLINK_UPDATE_CURRENT_CONFIG_PARAMS params;
6361739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
6371739a20eSAndy Ritger
6381739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init
6391739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu))
6401739a20eSAndy Ritger {
6411739a20eSAndy Ritger threadStateResetTimeout(pGpu);
6421739a20eSAndy Ritger }
6431739a20eSAndy Ritger
6441739a20eSAndy Ritger //
6451739a20eSAndy Ritger // RPC into GSP-RM for programming the HSHUB, CONNECTION_CFG and LTCS
6461739a20eSAndy Ritger // registers.
6471739a20eSAndy Ritger //
6481739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
6491739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_UPDATE_CURRENT_CONFIG,
6501739a20eSAndy Ritger (void *)¶ms, sizeof(params));
6511739a20eSAndy Ritger if (status != NV_OK)
6521739a20eSAndy Ritger {
6531739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Updating current NVLink config failed\n");
6541739a20eSAndy Ritger goto fail;
6551739a20eSAndy Ritger }
6561739a20eSAndy Ritger
6571739a20eSAndy Ritger // Sync the GPU property for NVLINK over SYSMEM with GSP-RM
6581739a20eSAndy Ritger pGpu->setProperty(pGpu, PDB_PROP_GPU_NVLINK_SYSMEM, params.bNvlinkSysmemEnabled);
6591739a20eSAndy Ritger
6601739a20eSAndy Ritger // Update the PCE-LCE mappings
661758b4ee8SAndy Ritger status = kceFindFirstInstance(pGpu, &pKCe);
662758b4ee8SAndy Ritger if (status == NV_OK)
6631739a20eSAndy Ritger {
6641739a20eSAndy Ritger status = kceTopLevelPceLceMappingsUpdate(pGpu, pKCe);
6651739a20eSAndy Ritger if (status != NV_OK)
6661739a20eSAndy Ritger {
6671739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to update PCE-LCE mappings\n");
6681739a20eSAndy Ritger }
6691739a20eSAndy Ritger }
6701739a20eSAndy Ritger
6711739a20eSAndy Ritger fail:
6721739a20eSAndy Ritger if (bOwnsLock)
6731739a20eSAndy Ritger {
6741739a20eSAndy Ritger rmDeviceGpuLocksRelease(pGpu, GPUS_LOCK_FLAGS_NONE, NULL);
6751739a20eSAndy Ritger }
6761739a20eSAndy Ritger
6771739a20eSAndy Ritger osReleaseRmSema(pSys->pSema, NULL);
6781739a20eSAndy Ritger }
6791739a20eSAndy Ritger
6801739a20eSAndy Ritger return status;
6811739a20eSAndy Ritger }
6821739a20eSAndy Ritger
68391676d66SBernhard Stoeckner const static NVLINK_INBAND_MSG_CALLBACK nvlink_inband_callbacks[] =
684758b4ee8SAndy Ritger {
685758b4ee8SAndy Ritger {
68691676d66SBernhard Stoeckner .messageType = NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP,
68791676d66SBernhard Stoeckner .pCallback = gpuFabricProbeReceiveKernelCallback,
68891676d66SBernhard Stoeckner .wqItemFlags = OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
68991676d66SBernhard Stoeckner OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW
69091676d66SBernhard Stoeckner },
691758b4ee8SAndy Ritger
692758b4ee8SAndy Ritger {
69391676d66SBernhard Stoeckner .messageType = NVLINK_INBAND_MSG_TYPE_MC_TEAM_SETUP_RSP,
69491676d66SBernhard Stoeckner .pCallback = memorymulticastfabricTeamSetupResponseCallback,
69591676d66SBernhard Stoeckner .wqItemFlags = OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
69691676d66SBernhard Stoeckner OS_QUEUE_WORKITEM_FLAGS_LOCK_GPUS_RW
69791676d66SBernhard Stoeckner },
698758b4ee8SAndy Ritger
699758b4ee8SAndy Ritger {
70091676d66SBernhard Stoeckner .messageType = NVLINK_INBAND_MSG_TYPE_GPU_PROBE_UPDATE_REQ,
70191676d66SBernhard Stoeckner .pCallback = gpuFabricProbeReceiveUpdateKernelCallback,
70291676d66SBernhard Stoeckner .wqItemFlags = OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
70391676d66SBernhard Stoeckner OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW
704758b4ee8SAndy Ritger }
70591676d66SBernhard Stoeckner };
706758b4ee8SAndy Ritger
707758b4ee8SAndy Ritger void
knvlinkInbandMsgCallbackDispatcher_WORKITEM(NvU32 gpuInstance,void * pData)708758b4ee8SAndy Ritger knvlinkInbandMsgCallbackDispatcher_WORKITEM
709758b4ee8SAndy Ritger (
710758b4ee8SAndy Ritger NvU32 gpuInstance,
711758b4ee8SAndy Ritger void *pData
712758b4ee8SAndy Ritger )
713758b4ee8SAndy Ritger {
714758b4ee8SAndy Ritger nvlink_inband_msg_header_t *pHeader;
715758b4ee8SAndy Ritger NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pMessage = pData;
71691676d66SBernhard Stoeckner NvU8 i;
71791676d66SBernhard Stoeckner const NVLINK_INBAND_MSG_CALLBACK *pCb = NULL;
718758b4ee8SAndy Ritger
71991676d66SBernhard Stoeckner // Dispatcher may not be called under GPU lock, so don't access pGpu.
720758b4ee8SAndy Ritger
721758b4ee8SAndy Ritger pHeader = (nvlink_inband_msg_header_t *)pMessage->data;
722758b4ee8SAndy Ritger
72391676d66SBernhard Stoeckner for (i = 0; i < NV_ARRAY_ELEMENTS(nvlink_inband_callbacks); i++)
72491676d66SBernhard Stoeckner {
72591676d66SBernhard Stoeckner if ((nvlink_inband_callbacks[i].messageType == pHeader->type) &&
72691676d66SBernhard Stoeckner (nvlink_inband_callbacks[i].pCallback != NULL))
72791676d66SBernhard Stoeckner {
72891676d66SBernhard Stoeckner pCb = &nvlink_inband_callbacks[i];
72991676d66SBernhard Stoeckner break;
73091676d66SBernhard Stoeckner }
73191676d66SBernhard Stoeckner }
73291676d66SBernhard Stoeckner
73391676d66SBernhard Stoeckner if (pCb == NULL)
734eb5c7665SAndy Ritger {
735eb5c7665SAndy Ritger NV_PRINTF(LEVEL_ERROR,
736eb5c7665SAndy Ritger "No Callback Registered for type %d. Dropping the msg\n",
737eb5c7665SAndy Ritger pHeader->type);
738eb5c7665SAndy Ritger return;
739eb5c7665SAndy Ritger }
740eb5c7665SAndy Ritger
74191676d66SBernhard Stoeckner #if defined(DEBUG) || defined(DEVELOP)
74291676d66SBernhard Stoeckner {
74391676d66SBernhard Stoeckner NvU8 *pRsvd = NULL;
74491676d66SBernhard Stoeckner
745758b4ee8SAndy Ritger // Assert reserved in msgHdr are zero
746758b4ee8SAndy Ritger pRsvd = &pHeader->reserved[0];
747758b4ee8SAndy Ritger NV_ASSERT((pRsvd[0] == 0) && portMemCmp(pRsvd, pRsvd + 1,
748758b4ee8SAndy Ritger sizeof(pHeader->reserved) - 1) == 0);
74991676d66SBernhard Stoeckner }
75091676d66SBernhard Stoeckner #endif
751758b4ee8SAndy Ritger
75291676d66SBernhard Stoeckner (void)pCb->pCallback(gpuInstance, NULL, pData);
753758b4ee8SAndy Ritger }
754758b4ee8SAndy Ritger
755758b4ee8SAndy Ritger NV_STATUS
knvlinkInbandMsgCallbackDispatcher_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 dataSize,NvU8 * pMessage)756758b4ee8SAndy Ritger knvlinkInbandMsgCallbackDispatcher_IMPL
757758b4ee8SAndy Ritger (
758758b4ee8SAndy Ritger OBJGPU *pGpu,
759758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink,
760758b4ee8SAndy Ritger NvU32 dataSize,
761758b4ee8SAndy Ritger NvU8 *pMessage
762758b4ee8SAndy Ritger )
763758b4ee8SAndy Ritger {
764758b4ee8SAndy Ritger NV_STATUS status;
765758b4ee8SAndy Ritger nvlink_inband_msg_header_t *pHeader;
766758b4ee8SAndy Ritger NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pData = NULL;
76791676d66SBernhard Stoeckner const NVLINK_INBAND_MSG_CALLBACK *pCb = NULL;
76891676d66SBernhard Stoeckner NvU8 i;
769758b4ee8SAndy Ritger
770758b4ee8SAndy Ritger pHeader = (nvlink_inband_msg_header_t *)pMessage;
771758b4ee8SAndy Ritger
772758b4ee8SAndy Ritger if (pHeader->type >= NVLINK_INBAND_MSG_TYPE_MAX)
773758b4ee8SAndy Ritger {
774758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Message type received is Out of Bounds. Dropping the msg\n");
775758b4ee8SAndy Ritger return NV_ERR_INVALID_REQUEST;
776758b4ee8SAndy Ritger }
777758b4ee8SAndy Ritger
77891676d66SBernhard Stoeckner for (i = 0; i < NV_ARRAY_ELEMENTS(nvlink_inband_callbacks); i++)
779758b4ee8SAndy Ritger {
78091676d66SBernhard Stoeckner if ((nvlink_inband_callbacks[i].messageType == pHeader->type) &&
78191676d66SBernhard Stoeckner (nvlink_inband_callbacks[i].pCallback != NULL))
78291676d66SBernhard Stoeckner {
78391676d66SBernhard Stoeckner pCb = &nvlink_inband_callbacks[i];
78491676d66SBernhard Stoeckner break;
78591676d66SBernhard Stoeckner }
78691676d66SBernhard Stoeckner }
78791676d66SBernhard Stoeckner
78891676d66SBernhard Stoeckner if (pCb == NULL)
78991676d66SBernhard Stoeckner {
79091676d66SBernhard Stoeckner NV_PRINTF(LEVEL_ERROR,
79191676d66SBernhard Stoeckner "No Callback Registered for type %d. Dropping the msg\n",
79291676d66SBernhard Stoeckner pHeader->type);
793758b4ee8SAndy Ritger return NV_ERR_INVALID_REQUEST;
794758b4ee8SAndy Ritger }
795758b4ee8SAndy Ritger
796758b4ee8SAndy Ritger pData = portMemAllocNonPaged(sizeof(NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS));
797758b4ee8SAndy Ritger if (pData == NULL)
798758b4ee8SAndy Ritger {
799758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Out of memory, Dropping message\n");
800758b4ee8SAndy Ritger return NV_ERR_NO_MEMORY;
801758b4ee8SAndy Ritger }
802758b4ee8SAndy Ritger
803758b4ee8SAndy Ritger pData->dataSize = dataSize;
804758b4ee8SAndy Ritger portMemCopy(pData->data, pData->dataSize, pMessage, dataSize);
805758b4ee8SAndy Ritger
806b5bf85a8SAndy Ritger status = osQueueWorkItemWithFlags(pGpu, knvlinkInbandMsgCallbackDispatcher_WORKITEM, pData,
80791676d66SBernhard Stoeckner pCb->wqItemFlags);
808758b4ee8SAndy Ritger if (status != NV_OK)
809758b4ee8SAndy Ritger {
810758b4ee8SAndy Ritger portMemFree(pData);
811758b4ee8SAndy Ritger return status;
812758b4ee8SAndy Ritger }
813758b4ee8SAndy Ritger
814758b4ee8SAndy Ritger return NV_OK;
815758b4ee8SAndy Ritger }
816758b4ee8SAndy Ritger
817758b4ee8SAndy Ritger NV_STATUS
knvlinkSendInbandData_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NV2080_CTRL_NVLINK_INBAND_SEND_DATA_PARAMS * pParams)818758b4ee8SAndy Ritger knvlinkSendInbandData_IMPL
819758b4ee8SAndy Ritger (
820758b4ee8SAndy Ritger OBJGPU *pGpu,
821758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink,
822758b4ee8SAndy Ritger NV2080_CTRL_NVLINK_INBAND_SEND_DATA_PARAMS *pParams
823758b4ee8SAndy Ritger )
824758b4ee8SAndy Ritger {
825758b4ee8SAndy Ritger NV_STATUS status;
826758b4ee8SAndy Ritger
827758b4ee8SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
828758b4ee8SAndy Ritger NV2080_CTRL_CMD_NVLINK_INBAND_SEND_DATA,
829758b4ee8SAndy Ritger (void *)pParams,
830758b4ee8SAndy Ritger sizeof(*pParams));
831758b4ee8SAndy Ritger
832758b4ee8SAndy Ritger return status;
833758b4ee8SAndy Ritger }
834758b4ee8SAndy Ritger /*!
8351739a20eSAndy Ritger * @brief Return the mask of links enabled on the system
8361739a20eSAndy Ritger *
8371739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
8381739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
8391739a20eSAndy Ritger */
8401739a20eSAndy Ritger NvU32
knvlinkGetEnabledLinkMask_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)8411739a20eSAndy Ritger knvlinkGetEnabledLinkMask_IMPL
8421739a20eSAndy Ritger (
8431739a20eSAndy Ritger OBJGPU *pGpu,
8441739a20eSAndy Ritger KernelNvlink *pKernelNvlink
8451739a20eSAndy Ritger )
8461739a20eSAndy Ritger {
8471739a20eSAndy Ritger return pKernelNvlink->enabledLinks;
8481739a20eSAndy Ritger }
8491739a20eSAndy Ritger
8501739a20eSAndy Ritger /*!
8511739a20eSAndy Ritger * @brief Return the mask of links discovered on the system
8521739a20eSAndy Ritger *
8531739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
8541739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
8551739a20eSAndy Ritger */
8561739a20eSAndy Ritger NvU32
knvlinkGetDiscoveredLinkMask_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)8571739a20eSAndy Ritger knvlinkGetDiscoveredLinkMask_IMPL
8581739a20eSAndy Ritger (
8591739a20eSAndy Ritger OBJGPU *pGpu,
8601739a20eSAndy Ritger KernelNvlink *pKernelNvlink
8611739a20eSAndy Ritger )
8621739a20eSAndy Ritger {
8631739a20eSAndy Ritger return pKernelNvlink->discoveredLinks;
8641739a20eSAndy Ritger }
8651739a20eSAndy Ritger
8661739a20eSAndy Ritger /*!
8671739a20eSAndy Ritger * @brief Returns the number of sysmem links
8681739a20eSAndy Ritger *
8691739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
8701739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
8711739a20eSAndy Ritger *
8721739a20eSAndy Ritger * @return The #sysmem NVLinks
8731739a20eSAndy Ritger */
8741739a20eSAndy Ritger NvU32
knvlinkGetNumLinksToSystem_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)8751739a20eSAndy Ritger knvlinkGetNumLinksToSystem_IMPL
8761739a20eSAndy Ritger (
8771739a20eSAndy Ritger OBJGPU *pGpu,
8781739a20eSAndy Ritger KernelNvlink *pKernelNvlink
8791739a20eSAndy Ritger )
8801739a20eSAndy Ritger {
8811739a20eSAndy Ritger NvU32 numSysmemLinks = pKernelNvlink->sysmemLinkMask;
8821739a20eSAndy Ritger
8831739a20eSAndy Ritger if (numSysmemLinks != 0)
8841739a20eSAndy Ritger {
8851739a20eSAndy Ritger NUMSETBITS_32(numSysmemLinks);
8861739a20eSAndy Ritger }
8871739a20eSAndy Ritger
8881739a20eSAndy Ritger return numSysmemLinks;
8891739a20eSAndy Ritger }
8901739a20eSAndy Ritger
8911739a20eSAndy Ritger /*!
8921739a20eSAndy Ritger * @brief Returns number of peer links to a remote GPU
8931739a20eSAndy Ritger *
8941739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer of local GPU
8951739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
8961739a20eSAndy Ritger * @param[in] pRemoteGpu OBJGPU pointer of remote GPU
8971739a20eSAndy Ritger *
8981739a20eSAndy Ritger * @return The #peer NVLinks to the remote GPU
8991739a20eSAndy Ritger */
9001739a20eSAndy Ritger NvU32
knvlinkGetNumLinksToPeer_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,OBJGPU * pRemoteGpu)9011739a20eSAndy Ritger knvlinkGetNumLinksToPeer_IMPL
9021739a20eSAndy Ritger (
9031739a20eSAndy Ritger OBJGPU *pGpu,
9041739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
9051739a20eSAndy Ritger OBJGPU *pRemoteGpu
9061739a20eSAndy Ritger )
9071739a20eSAndy Ritger {
9081739a20eSAndy Ritger NvU32 numPeerLinks =
9091739a20eSAndy Ritger knvlinkGetLinkMaskToPeer(pGpu, pKernelNvlink, pRemoteGpu);
9101739a20eSAndy Ritger
9111739a20eSAndy Ritger if (numPeerLinks != 0)
9121739a20eSAndy Ritger {
9131739a20eSAndy Ritger NUMSETBITS_32(numPeerLinks);
9141739a20eSAndy Ritger }
9151739a20eSAndy Ritger
9161739a20eSAndy Ritger return numPeerLinks;
9171739a20eSAndy Ritger }
9181739a20eSAndy Ritger
9191739a20eSAndy Ritger /*!
9201739a20eSAndy Ritger * @brief Gets the mask of peer links between the GPUs
9211739a20eSAndy Ritger *
9221739a20eSAndy Ritger * @param[in] pGpu0 OBJGPU pointer
9231739a20eSAndy Ritger * @param[in] pKernelNvlink0 Nvlink pointer
9241739a20eSAndy Ritger * @param[in] pGpu1 Remote OBJGPU pointer
9251739a20eSAndy Ritger *
9261739a20eSAndy Ritger * @return Returns the mask of peer links between the GPUs
9271739a20eSAndy Ritger */
9281739a20eSAndy Ritger NvU32
knvlinkGetLinkMaskToPeer_IMPL(OBJGPU * pGpu0,KernelNvlink * pKernelNvlink0,OBJGPU * pGpu1)9291739a20eSAndy Ritger knvlinkGetLinkMaskToPeer_IMPL
9301739a20eSAndy Ritger (
9311739a20eSAndy Ritger OBJGPU *pGpu0,
9321739a20eSAndy Ritger KernelNvlink *pKernelNvlink0,
9331739a20eSAndy Ritger OBJGPU *pGpu1
9341739a20eSAndy Ritger )
9351739a20eSAndy Ritger {
9361739a20eSAndy Ritger NvU32 peerLinkMask = 0;
937758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink1 = NULL;
938758b4ee8SAndy Ritger
939758b4ee8SAndy Ritger pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1);
940758b4ee8SAndy Ritger
941758b4ee8SAndy Ritger if (pKernelNvlink1 == NULL)
942758b4ee8SAndy Ritger {
943b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_INFO,
944758b4ee8SAndy Ritger "on GPU%d NVLink is disabled.\n", gpuGetInstance(pGpu1));
945758b4ee8SAndy Ritger
946758b4ee8SAndy Ritger return 0;
947758b4ee8SAndy Ritger }
948758b4ee8SAndy Ritger
949758b4ee8SAndy Ritger if(pKernelNvlink0->bIsGpuDegraded)
950758b4ee8SAndy Ritger {
951758b4ee8SAndy Ritger return peerLinkMask;
952758b4ee8SAndy Ritger }
953758b4ee8SAndy Ritger
954758b4ee8SAndy Ritger if(pKernelNvlink1->bIsGpuDegraded)
955758b4ee8SAndy Ritger {
956758b4ee8SAndy Ritger return peerLinkMask;
957758b4ee8SAndy Ritger }
9581739a20eSAndy Ritger
9591739a20eSAndy Ritger if (!knvlinkIsForcedConfig(pGpu0, pKernelNvlink0))
9601739a20eSAndy Ritger {
9611739a20eSAndy Ritger //
9621739a20eSAndy Ritger // If nvlink topology is not forced, then the hshub registers
9631739a20eSAndy Ritger // are updated only when a P2P object is allocated. So, return
9641739a20eSAndy Ritger // the cached value of mask of links connected to a GPU
9651739a20eSAndy Ritger //
9661739a20eSAndy Ritger peerLinkMask = pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)];
9671739a20eSAndy Ritger }
9681739a20eSAndy Ritger
9691739a20eSAndy Ritger return peerLinkMask;
9701739a20eSAndy Ritger }
9711739a20eSAndy Ritger
9721739a20eSAndy Ritger /*!
9731739a20eSAndy Ritger * @brief Sets the mask of peer links between the GPUs
9741739a20eSAndy Ritger *
9751739a20eSAndy Ritger * @param[in] pGpu0 OBJGPU pointer
9761739a20eSAndy Ritger * @param[in] pKernelNvlink0 Nvlink pointer
9771739a20eSAndy Ritger * @param[in] pGpu1 Remote OBJGPU pointer
9781739a20eSAndy Ritger * @param[in] peerLinkMask Mask of links to the peer GPU
9791739a20eSAndy Ritger *
9801739a20eSAndy Ritger * @return NV_OK on success
9811739a20eSAndy Ritger */
9821739a20eSAndy Ritger NV_STATUS
knvlinkSetLinkMaskToPeer_IMPL(OBJGPU * pGpu0,KernelNvlink * pKernelNvlink0,OBJGPU * pGpu1,NvU32 peerLinkMask)9831739a20eSAndy Ritger knvlinkSetLinkMaskToPeer_IMPL
9841739a20eSAndy Ritger (
9851739a20eSAndy Ritger OBJGPU *pGpu0,
9861739a20eSAndy Ritger KernelNvlink *pKernelNvlink0,
9871739a20eSAndy Ritger OBJGPU *pGpu1,
9881739a20eSAndy Ritger NvU32 peerLinkMask
9891739a20eSAndy Ritger )
9901739a20eSAndy Ritger {
9911739a20eSAndy Ritger NV_STATUS status = NV_OK;
9921739a20eSAndy Ritger
9931739a20eSAndy Ritger // Return early if no update needed to the peer link mask
9941739a20eSAndy Ritger if (pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)] == peerLinkMask)
9951739a20eSAndy Ritger return NV_OK;
9961739a20eSAndy Ritger
9971739a20eSAndy Ritger pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)] = peerLinkMask;
9981739a20eSAndy Ritger
9991739a20eSAndy Ritger NV2080_CTRL_NVLINK_UPDATE_PEER_LINK_MASK_PARAMS params;
10001739a20eSAndy Ritger
10011739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
10021739a20eSAndy Ritger params.gpuInst = gpuGetInstance(pGpu1);
10031739a20eSAndy Ritger params.peerLinkMask = peerLinkMask;
10041739a20eSAndy Ritger
10051739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init
10061739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu0))
10071739a20eSAndy Ritger {
10081739a20eSAndy Ritger threadStateResetTimeout(pGpu0);
10091739a20eSAndy Ritger }
10101739a20eSAndy Ritger
10111739a20eSAndy Ritger // Sync the peerLinkMask with GSP-RM
10121739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu0, pKernelNvlink0,
10131739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_UPDATE_PEER_LINK_MASK,
10141739a20eSAndy Ritger (void *)¶ms, sizeof(params));
10151739a20eSAndy Ritger if (status != NV_OK)
10161739a20eSAndy Ritger {
10171739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
10181739a20eSAndy Ritger "Failed to sync peerLinksMask from GPU%d to GPU%d\n",
10191739a20eSAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1));
10201739a20eSAndy Ritger return status;
10211739a20eSAndy Ritger }
10221739a20eSAndy Ritger
10231739a20eSAndy Ritger return NV_OK;
10241739a20eSAndy Ritger }
10251739a20eSAndy Ritger
10261739a20eSAndy Ritger /*!
10271739a20eSAndy Ritger * @brief Get the mask of links that are peer links
10281739a20eSAndy Ritger *
10291739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
10301739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
10311739a20eSAndy Ritger */
10321739a20eSAndy Ritger NvU32
knvlinkGetPeersNvlinkMaskFromHshub_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)10331739a20eSAndy Ritger knvlinkGetPeersNvlinkMaskFromHshub_IMPL
10341739a20eSAndy Ritger (
10351739a20eSAndy Ritger OBJGPU *pGpu,
10361739a20eSAndy Ritger KernelNvlink *pKernelNvlink
10371739a20eSAndy Ritger )
10381739a20eSAndy Ritger {
10391739a20eSAndy Ritger NV_STATUS status = NV_OK;
10401739a20eSAndy Ritger NvU32 peerLinkMask = 0;
10411739a20eSAndy Ritger NvU32 i;
10421739a20eSAndy Ritger
10431739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_LINK_AND_CLOCK_INFO_PARAMS params;
10441739a20eSAndy Ritger
10451739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
10461739a20eSAndy Ritger params.linkMask = pKernelNvlink->enabledLinks;
1047*e45d91deSBernhard Stoeckner params.bSublinkStateInst = NV_TRUE;
10481739a20eSAndy Ritger
10491739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
10501739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_LINK_AND_CLOCK_INFO,
10511739a20eSAndy Ritger (void *)¶ms, sizeof(params));
10521739a20eSAndy Ritger if (status != NV_OK)
10531739a20eSAndy Ritger return 0;
10541739a20eSAndy Ritger
10551739a20eSAndy Ritger // Scan enabled links for peer connections
10561739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
10571739a20eSAndy Ritger {
10581739a20eSAndy Ritger if (params.linkInfo[i].bLinkConnectedToPeer)
10591739a20eSAndy Ritger peerLinkMask |= NVBIT(i);
10601739a20eSAndy Ritger }
10611739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END;
10621739a20eSAndy Ritger
10631739a20eSAndy Ritger return peerLinkMask;
10641739a20eSAndy Ritger }
10651739a20eSAndy Ritger
10661739a20eSAndy Ritger /*!
10671739a20eSAndy Ritger * @brief Prepare a GPU's NVLink engine for reset by removing mappings
10681739a20eSAndy Ritger * to it from other GPUs.
10691739a20eSAndy Ritger *
10701739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
10711739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
10721739a20eSAndy Ritger *
10731739a20eSAndy Ritger * return NV_OK on success
10741739a20eSAndy Ritger */
10751739a20eSAndy Ritger NV_STATUS
knvlinkPrepareForXVEReset_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvBool bForceShutdown)10761739a20eSAndy Ritger knvlinkPrepareForXVEReset_IMPL
10771739a20eSAndy Ritger (
10781739a20eSAndy Ritger OBJGPU *pGpu,
107990eb1077SAndy Ritger KernelNvlink *pKernelNvlink,
108090eb1077SAndy Ritger NvBool bForceShutdown
10811739a20eSAndy Ritger )
10821739a20eSAndy Ritger {
10831739a20eSAndy Ritger OBJSYS *pSys = SYS_GET_INSTANCE();
10841739a20eSAndy Ritger NV_STATUS retStatus = NV_OK;
10851739a20eSAndy Ritger OBJGPU *pRemoteGpu;
10861739a20eSAndy Ritger NV_STATUS status;
10871739a20eSAndy Ritger NvU32 gpuInstance;
10881739a20eSAndy Ritger NvU32 gpuMask;
10891739a20eSAndy Ritger
10901739a20eSAndy Ritger // This is not supported on forced configs
10911739a20eSAndy Ritger if (knvlinkIsForcedConfig(pGpu, pKernelNvlink))
10921739a20eSAndy Ritger {
10931739a20eSAndy Ritger return NV_OK;
10941739a20eSAndy Ritger }
10951739a20eSAndy Ritger
10961739a20eSAndy Ritger //
10971739a20eSAndy Ritger // Let fabric manager handle link shutdown/reset if the fabric is managed
10981739a20eSAndy Ritger // externally.
10991739a20eSAndy Ritger //
11005f40a5aeSAndy Ritger if (pKernelNvlink->ipVerNvlink < NVLINK_VERSION_40 &&
11015f40a5aeSAndy Ritger pSys->getProperty(pSys, PDB_PROP_SYS_FABRIC_IS_EXTERNALLY_MANAGED))
11021739a20eSAndy Ritger {
11031739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO,
11041739a20eSAndy Ritger "NVLink fabric is externally managed, skipping\n");
11051739a20eSAndy Ritger return NV_OK;
11061739a20eSAndy Ritger }
11071739a20eSAndy Ritger
11081739a20eSAndy Ritger status = gpumgrGetGpuAttachInfo(NULL, &gpuMask);
11091739a20eSAndy Ritger NV_ASSERT_OR_RETURN(status == NV_OK, status);
11101739a20eSAndy Ritger
11111739a20eSAndy Ritger gpuInstance = 0;
11121739a20eSAndy Ritger while ((pRemoteGpu = gpumgrGetNextGpu(gpuMask, &gpuInstance)) != NULL)
11131739a20eSAndy Ritger {
11141739a20eSAndy Ritger KernelNvlink *pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
11151739a20eSAndy Ritger
11161739a20eSAndy Ritger if ((pRemoteGpu == pGpu) || (pRemoteKernelNvlink == NULL) ||
11171739a20eSAndy Ritger (knvlinkGetNumLinksToPeer(pRemoteGpu, pRemoteKernelNvlink, pGpu) == 0) ||
111890eb1077SAndy Ritger API_GPU_IN_RESET_SANITY_CHECK(pRemoteGpu) ||
111990eb1077SAndy Ritger pRemoteGpu->getProperty(pRemoteGpu, PDB_PROP_GPU_IS_LOST))
11201739a20eSAndy Ritger {
11211739a20eSAndy Ritger continue;
11221739a20eSAndy Ritger }
11231739a20eSAndy Ritger
11241739a20eSAndy Ritger //
11251739a20eSAndy Ritger // Reset the peer masks in HSHUB of the remote GPU. Partial resets
11261739a20eSAndy Ritger // (only removing the links connected to the GPU being reset) don't
11271739a20eSAndy Ritger // appear to be sufficient. The reset will work fine, but the next
11281739a20eSAndy Ritger // time we attempt to initialize this GPU, the copy engines will time
11291739a20eSAndy Ritger // out while scrubbing FB and a GPU sysmembar (NV_UFLUSH_FB_FLUSH) will
11301739a20eSAndy Ritger // fail to complete.
11311739a20eSAndy Ritger //
11321739a20eSAndy Ritger // The above symptoms haven't been root-caused (yet), but the current
11331739a20eSAndy Ritger // POR for GPU reset is that once one GPU is reset, the others
11341739a20eSAndy Ritger // connected to it over NVLink must also be reset before using NVLink
11351739a20eSAndy Ritger // for peer traffic, so just use the big hammer and squash all HSHUB
11361739a20eSAndy Ritger // configs on GPU reset.
11371739a20eSAndy Ritger //
11381739a20eSAndy Ritger // This allows us to reset the GPUs one by one, with GPU
11391739a20eSAndy Ritger // initializations in between, without hanging up the GPU trying to
11401739a20eSAndy Ritger // flush data over links that aren't available anymore.
11411739a20eSAndy Ritger //
114212c07393SBernhard Stoeckner // Starting from Ampere single GPU reset is supported and hence remove
114312c07393SBernhard Stoeckner // only the nvlink's of the remote GPU's which are connected to the
114412c07393SBernhard Stoeckner // current GPU.
114512c07393SBernhard Stoeckner //
114612c07393SBernhard Stoeckner
114712c07393SBernhard Stoeckner if (IsAMPEREorBetter(pGpu))
114812c07393SBernhard Stoeckner {
114912c07393SBernhard Stoeckner NvU32 remPeerId = kbusGetPeerId_HAL(pRemoteGpu, GPU_GET_KERNEL_BUS(pRemoteGpu), pGpu);
115012c07393SBernhard Stoeckner if (remPeerId != BUS_INVALID_PEER)
115112c07393SBernhard Stoeckner status = knvlinkRemoveMapping_HAL(pRemoteGpu, pRemoteKernelNvlink, NV_FALSE,
115212c07393SBernhard Stoeckner NVBIT(remPeerId),
115312c07393SBernhard Stoeckner NV_FALSE /* bL2Entry */);
115412c07393SBernhard Stoeckner }
115512c07393SBernhard Stoeckner else
115612c07393SBernhard Stoeckner {
11571739a20eSAndy Ritger status = knvlinkRemoveMapping_HAL(pRemoteGpu, pRemoteKernelNvlink, NV_FALSE,
11581739a20eSAndy Ritger ((1 << NVLINK_MAX_PEERS_SW) - 1),
11591739a20eSAndy Ritger NV_FALSE /* bL2Entry */);
116012c07393SBernhard Stoeckner }
11611739a20eSAndy Ritger if (status != NV_OK)
11621739a20eSAndy Ritger {
11631739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
11641739a20eSAndy Ritger "failed to reset HSHUB on GPU%u while preparing for GPU%u XVE reset (0x%x)\n",
11651739a20eSAndy Ritger gpuGetInstance(pRemoteGpu), gpuGetInstance(pGpu),
11661739a20eSAndy Ritger status);
11671739a20eSAndy Ritger
11681739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus;
11691739a20eSAndy Ritger }
11701739a20eSAndy Ritger }
11711739a20eSAndy Ritger
11721739a20eSAndy Ritger // Remove all NVLink mappings in HSHUB config registers to init values
117390eb1077SAndy Ritger if (!API_GPU_IN_RESET_SANITY_CHECK(pGpu) && !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
11741739a20eSAndy Ritger status = knvlinkRemoveMapping_HAL(pGpu, pKernelNvlink, NV_TRUE, ((1 << NVLINK_MAX_PEERS_SW) - 1),
11751739a20eSAndy Ritger NV_FALSE /* bL2Entry */);
11761739a20eSAndy Ritger if (status != NV_OK)
11771739a20eSAndy Ritger {
11781739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
11791739a20eSAndy Ritger "failed to reset HSHUB on GPU%u while preparing XVE reset: %s (0x%x)\n",
11801739a20eSAndy Ritger gpuGetInstance(pGpu), nvstatusToString(status), status);
11811739a20eSAndy Ritger
11821739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus;
11831739a20eSAndy Ritger }
11841739a20eSAndy Ritger
118590eb1077SAndy Ritger //
118690eb1077SAndy Ritger // If GFW is booted and running through link-training, then no need to tear-down the
118790eb1077SAndy Ritger // links to reset. Exit out early from the function
118890eb1077SAndy Ritger //
1189eb5c7665SAndy Ritger if (!bForceShutdown && pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_MINION_GFW_BOOT))
119090eb1077SAndy Ritger {
119190eb1077SAndy Ritger return NV_OK;
119290eb1077SAndy Ritger }
119390eb1077SAndy Ritger
11941739a20eSAndy Ritger // Pseudo-clean shutdown the links from this GPU
119590eb1077SAndy Ritger status = knvlinkCoreShutdownDeviceLinks(pGpu, pKernelNvlink, bForceShutdown);
11961739a20eSAndy Ritger if (status != NV_OK)
11971739a20eSAndy Ritger {
11981739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
11991739a20eSAndy Ritger "failed to shutdown links on GPU%u while preparing XVE reset: %s (0x%x)\n",
12001739a20eSAndy Ritger gpuGetInstance(pGpu), nvstatusToString(status), status);
12011739a20eSAndy Ritger
12021739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus;
12031739a20eSAndy Ritger }
12041739a20eSAndy Ritger
12051739a20eSAndy Ritger //
12061739a20eSAndy Ritger // Reset links related to this device and its peers (see Bug 2346447)
12071739a20eSAndy Ritger // The property is disabled on Pascal, since the path hasn't been verified
12081739a20eSAndy Ritger // and link reset after pseudo-clean shutdown results in DL and TL errors.
12091739a20eSAndy Ritger //
12101739a20eSAndy Ritger if (pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LINKRESET_AFTER_SHUTDOWN))
12111739a20eSAndy Ritger {
12121739a20eSAndy Ritger status = knvlinkCoreResetDeviceLinks(pGpu, pKernelNvlink);
12131739a20eSAndy Ritger if (status != NV_OK)
12141739a20eSAndy Ritger {
12151739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
12161739a20eSAndy Ritger "failed to reset links on GPU%u while preparing XVE reset: %s (0x%x)\n",
12171739a20eSAndy Ritger gpuGetInstance(pGpu), nvstatusToString(status), status);
12181739a20eSAndy Ritger
12191739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus;
12201739a20eSAndy Ritger }
1221dac2350cSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
1222dac2350cSAndy Ritger else
1223dac2350cSAndy Ritger {
1224dac2350cSAndy Ritger NvU32 linkId;
1225dac2350cSAndy Ritger
1226dac2350cSAndy Ritger //
1227dac2350cSAndy Ritger // The connections have been successfully reset, update connected and disconnected
1228dac2350cSAndy Ritger // links masks on both the devices
1229dac2350cSAndy Ritger //
1230dac2350cSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, linkId, pKernelNvlink->enabledLinks)
1231dac2350cSAndy Ritger {
1232dac2350cSAndy Ritger pKernelNvlink->disconnectedLinkMask |= NVBIT(linkId);
1233dac2350cSAndy Ritger pKernelNvlink->connectedLinksMask &= ~NVBIT(linkId);
1234dac2350cSAndy Ritger
1235dac2350cSAndy Ritger if (pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.deviceType !=
1236dac2350cSAndy Ritger NV2080_CTRL_NVLINK_DEVICE_INFO_DEVICE_TYPE_GPU)
1237dac2350cSAndy Ritger {
1238dac2350cSAndy Ritger continue;
1239dac2350cSAndy Ritger }
1240dac2350cSAndy Ritger
1241dac2350cSAndy Ritger OBJGPU *pRemoteGpu = gpumgrGetGpuFromBusInfo(
1242dac2350cSAndy Ritger pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.domain,
1243dac2350cSAndy Ritger pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bus,
1244dac2350cSAndy Ritger pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.device);
1245dac2350cSAndy Ritger
1246dac2350cSAndy Ritger if (!API_GPU_IN_RESET_SANITY_CHECK(pRemoteGpu))
1247dac2350cSAndy Ritger {
1248dac2350cSAndy Ritger KernelNvlink *pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
1249dac2350cSAndy Ritger NvU32 remoteLinkId = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber;
1250dac2350cSAndy Ritger
1251dac2350cSAndy Ritger pRemoteKernelNvlink->disconnectedLinkMask |= NVBIT(remoteLinkId);
1252dac2350cSAndy Ritger pRemoteKernelNvlink->connectedLinksMask &= ~NVBIT(remoteLinkId);
1253dac2350cSAndy Ritger }
1254dac2350cSAndy Ritger }
1255dac2350cSAndy Ritger FOR_EACH_INDEX_IN_MASK_END;
1256dac2350cSAndy Ritger }
1257dac2350cSAndy Ritger #endif
12581739a20eSAndy Ritger
12591739a20eSAndy Ritger //
12601739a20eSAndy Ritger // knvlinkCoreResetDeviceLinks() only resets the links which have
12611739a20eSAndy Ritger // connectivity.
12621739a20eSAndy Ritger // Pre-Ampere, we may run into a situation where the PLL
12631739a20eSAndy Ritger // sharing partner links (both) may not be reset due to no connectivity.
12641739a20eSAndy Ritger //
12651739a20eSAndy Ritger // Hence, (re-)reset all the links to recover them after shutdown (pre-Ampere)
12661739a20eSAndy Ritger //
12671739a20eSAndy Ritger NV2080_CTRL_NVLINK_RESET_LINKS_PARAMS resetLinksparams;
12681739a20eSAndy Ritger
12691739a20eSAndy Ritger portMemSet(&resetLinksparams, 0, sizeof(resetLinksparams));
12701739a20eSAndy Ritger resetLinksparams.linkMask = pKernelNvlink->enabledLinks;
12711739a20eSAndy Ritger resetLinksparams.flags = NV2080_CTRL_NVLINK_RESET_FLAGS_TOGGLE;
12721739a20eSAndy Ritger
12731739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
12741739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_RESET_LINKS,
12751739a20eSAndy Ritger (void *)&resetLinksparams, sizeof(resetLinksparams));
12761739a20eSAndy Ritger
12771739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus;
12781739a20eSAndy Ritger }
12791739a20eSAndy Ritger
12801739a20eSAndy Ritger return retStatus;
12811739a20eSAndy Ritger }
12821739a20eSAndy Ritger
12831739a20eSAndy Ritger /*!
12841739a20eSAndy Ritger * @brief Set the power features supported on this NVLink IP
12851739a20eSAndy Ritger *
12861739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
12871739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
12881739a20eSAndy Ritger */
12891739a20eSAndy Ritger void
knvlinkSetPowerFeatures_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)12901739a20eSAndy Ritger knvlinkSetPowerFeatures_IMPL
12911739a20eSAndy Ritger (
12921739a20eSAndy Ritger OBJGPU *pGpu,
12931739a20eSAndy Ritger KernelNvlink *pKernelNvlink
12941739a20eSAndy Ritger )
12951739a20eSAndy Ritger {
12961739a20eSAndy Ritger // Get the Ip Verion from the First available IOCTRL.
12971739a20eSAndy Ritger switch (pKernelNvlink->ipVerNvlink)
12981739a20eSAndy Ritger {
12991739a20eSAndy Ritger case NVLINK_VERSION_22:
13001739a20eSAndy Ritger {
13011739a20eSAndy Ritger // NVLink L2 is supported only on MODS and Windows LDDM
130291676d66SBernhard Stoeckner if (RMCFG_FEATURE_PLATFORM_WINDOWS || RMCFG_FEATURE_MODS_FEATURES)
13031739a20eSAndy Ritger {
13041739a20eSAndy Ritger pKernelNvlink->setProperty(pKernelNvlink, PDB_PROP_KNVLINK_L2_POWER_STATE_ENABLED,
13051739a20eSAndy Ritger (pKernelNvlink->bDisableL2Mode ? NV_FALSE : NV_TRUE));
13061739a20eSAndy Ritger }
13071739a20eSAndy Ritger
13081739a20eSAndy Ritger break;
13091739a20eSAndy Ritger }
13101739a20eSAndy Ritger default:
13111739a20eSAndy Ritger break;
13121739a20eSAndy Ritger }
13131739a20eSAndy Ritger }
13141739a20eSAndy Ritger
13151739a20eSAndy Ritger /*!
13161739a20eSAndy Ritger * @brief Checks if NVSWITCH_FABRIC_ADDR field is valid.
13171739a20eSAndy Ritger *
13181739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
13191739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
13201739a20eSAndy Ritger */
13211739a20eSAndy Ritger void
knvlinkDetectNvswitchProxy_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)13221739a20eSAndy Ritger knvlinkDetectNvswitchProxy_IMPL
13231739a20eSAndy Ritger (
13241739a20eSAndy Ritger OBJGPU *pGpu,
13251739a20eSAndy Ritger KernelNvlink *pKernelNvlink
13261739a20eSAndy Ritger )
13271739a20eSAndy Ritger {
13281739a20eSAndy Ritger OBJSYS *pSys = SYS_GET_INSTANCE();
13291739a20eSAndy Ritger NV_STATUS status = NV_OK;
13301739a20eSAndy Ritger NvU32 i;
13311739a20eSAndy Ritger
13321739a20eSAndy Ritger // Initialize fabricBaseAddr to NVLINK_INVALID_FABRIC_ADDR
13331739a20eSAndy Ritger pKernelNvlink->fabricBaseAddr = NVLINK_INVALID_FABRIC_ADDR;
13341739a20eSAndy Ritger
13351739a20eSAndy Ritger if (pSys->getProperty(pSys, PDB_PROP_SYS_NVSWITCH_IS_PRESENT) ||
1336758b4ee8SAndy Ritger pSys->getProperty(pSys, PDB_PROP_SYS_FABRIC_MANAGER_IS_REGISTERED) ||
1337758b4ee8SAndy Ritger GPU_IS_NVSWITCH_DETECTED(pGpu))
13381739a20eSAndy Ritger {
13391739a20eSAndy Ritger return;
13401739a20eSAndy Ritger }
13411739a20eSAndy Ritger
13421739a20eSAndy Ritger if (pKernelNvlink->discoveredLinks == 0)
13431739a20eSAndy Ritger {
13441739a20eSAndy Ritger return;
13451739a20eSAndy Ritger }
13461739a20eSAndy Ritger
13471739a20eSAndy Ritger // Get the link train status for the enabled link masks
13481739a20eSAndy Ritger NV2080_CTRL_NVLINK_ARE_LINKS_TRAINED_PARAMS linkTrainedParams;
13491739a20eSAndy Ritger
13501739a20eSAndy Ritger portMemSet(&linkTrainedParams, 0, sizeof(linkTrainedParams));
13511739a20eSAndy Ritger linkTrainedParams.linkMask = pKernelNvlink->enabledLinks;
13521739a20eSAndy Ritger linkTrainedParams.bActiveOnly = NV_FALSE;
13531739a20eSAndy Ritger
13541739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init
13551739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu))
13561739a20eSAndy Ritger {
13571739a20eSAndy Ritger threadStateResetTimeout(pGpu);
13581739a20eSAndy Ritger }
13591739a20eSAndy Ritger
13601739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
13611739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_ARE_LINKS_TRAINED,
13621739a20eSAndy Ritger (void *)&linkTrainedParams, sizeof(linkTrainedParams));
13631739a20eSAndy Ritger if (status != NV_OK)
13641739a20eSAndy Ritger {
13651739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get the link train status for links\n");
13661739a20eSAndy Ritger return;
13671739a20eSAndy Ritger }
13681739a20eSAndy Ritger
13691739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks)
13701739a20eSAndy Ritger {
13711739a20eSAndy Ritger if (!linkTrainedParams.bIsLinkActive[i])
13721739a20eSAndy Ritger {
13731739a20eSAndy Ritger return;
13741739a20eSAndy Ritger }
13751739a20eSAndy Ritger }
13761739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END;
13771739a20eSAndy Ritger
13781739a20eSAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_SET_NVSWITCH_FABRIC_ADDR_PARAMS params;
13791739a20eSAndy Ritger
13801739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
13811739a20eSAndy Ritger params.bGet = NV_TRUE;
13821739a20eSAndy Ritger params.addr = NVLINK_INVALID_FABRIC_ADDR;
13831739a20eSAndy Ritger
13841739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
13851739a20eSAndy Ritger NV2080_CTRL_CMD_INTERNAL_NVLINK_GET_SET_NVSWITCH_FABRIC_ADDR,
13861739a20eSAndy Ritger (void *)¶ms, sizeof(params));
13871739a20eSAndy Ritger if (status != NV_OK)
13881739a20eSAndy Ritger {
13891739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get fabric address for GPU %x\n",
13901739a20eSAndy Ritger pGpu->gpuInstance);
13911739a20eSAndy Ritger return;
13921739a20eSAndy Ritger }
13931739a20eSAndy Ritger
13941739a20eSAndy Ritger if (params.addr != NVLINK_INVALID_FABRIC_ADDR)
13951739a20eSAndy Ritger {
13961739a20eSAndy Ritger pKernelNvlink->fabricBaseAddr = params.addr;
13971739a20eSAndy Ritger pKernelNvlink->bNvswitchProxy = NV_TRUE;
13981739a20eSAndy Ritger }
13991739a20eSAndy Ritger }
14001739a20eSAndy Ritger
14011739a20eSAndy Ritger /*!
14021739a20eSAndy Ritger * @brief Sets NVSWITCH_FLA_ADDR field in the scratch register.
14031739a20eSAndy Ritger *
14041739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
14051739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
14061739a20eSAndy Ritger * @param[in] addr FLA addr
14071739a20eSAndy Ritger *
14081739a20eSAndy Ritger * @return Returns NV_OK upon success.
14091739a20eSAndy Ritger * Otherwise, returns NV_ERR_XXX.
14101739a20eSAndy Ritger */
14111739a20eSAndy Ritger NV_STATUS
knvlinkSetNvswitchFlaAddr_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU64 addr)14121739a20eSAndy Ritger knvlinkSetNvswitchFlaAddr_IMPL
14131739a20eSAndy Ritger (
14141739a20eSAndy Ritger OBJGPU *pGpu,
14151739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
14161739a20eSAndy Ritger NvU64 addr
14171739a20eSAndy Ritger )
14181739a20eSAndy Ritger {
14191739a20eSAndy Ritger return NV_OK;
14201739a20eSAndy Ritger }
14211739a20eSAndy Ritger
14221739a20eSAndy Ritger /*!
14231739a20eSAndy Ritger * @brief Gets NVSWITCH_FLA_ADDR field from the scratch register.
14241739a20eSAndy Ritger *
14251739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
14261739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
14271739a20eSAndy Ritger *
14281739a20eSAndy Ritger * @return Returns the stashed FLA starting address.
14291739a20eSAndy Ritger */
14301739a20eSAndy Ritger NvU64
knvlinkGetNvswitchFlaAddr_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)14311739a20eSAndy Ritger knvlinkGetNvswitchFlaAddr_IMPL
14321739a20eSAndy Ritger (
14331739a20eSAndy Ritger OBJGPU *pGpu,
14341739a20eSAndy Ritger KernelNvlink *pKernelNvlink
14351739a20eSAndy Ritger )
14361739a20eSAndy Ritger {
14371739a20eSAndy Ritger return 0;
14381739a20eSAndy Ritger }
14391739a20eSAndy Ritger
14401739a20eSAndy Ritger /*!
14411739a20eSAndy Ritger * @brief Checks if fabricBaseAddr is valid.
14421739a20eSAndy Ritger *
14431739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
14441739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
14451739a20eSAndy Ritger *
14461739a20eSAndy Ritger * @return Returns true if the fabricBaseAddr is valid.
14471739a20eSAndy Ritger */
14481739a20eSAndy Ritger NvBool
knvlinkIsNvswitchProxyPresent_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)14491739a20eSAndy Ritger knvlinkIsNvswitchProxyPresent_IMPL
14501739a20eSAndy Ritger (
14511739a20eSAndy Ritger OBJGPU *pGpu,
14521739a20eSAndy Ritger KernelNvlink *pKernelNvlink
14531739a20eSAndy Ritger )
14541739a20eSAndy Ritger {
14551739a20eSAndy Ritger return pKernelNvlink->bNvswitchProxy;
14561739a20eSAndy Ritger }
14571739a20eSAndy Ritger
14581739a20eSAndy Ritger
14591739a20eSAndy Ritger /*!
14601739a20eSAndy Ritger * @brief Set unique FLA base address for NVSwitch enabled systems.
14611739a20eSAndy Ritger * Validates FLA base address and programs the base address
14621739a20eSAndy Ritger * in switch scratch registers for guest VM to pick it up.
14631739a20eSAndy Ritger *
14641739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
14651739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
14661739a20eSAndy Ritger * @param[in] flaBaseAddr NvU64 base address
14671739a20eSAndy Ritger *
14681739a20eSAndy Ritger * @returns On success, sets unique FLA base address and returns NV_OK.
14691739a20eSAndy Ritger * On failure, returns NV_ERR_XXX.
14701739a20eSAndy Ritger */
14711739a20eSAndy Ritger NV_STATUS
knvlinkSetUniqueFlaBaseAddress_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU64 flaBaseAddr)14721739a20eSAndy Ritger knvlinkSetUniqueFlaBaseAddress_IMPL
14731739a20eSAndy Ritger (
14741739a20eSAndy Ritger OBJGPU *pGpu,
14751739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
14761739a20eSAndy Ritger NvU64 flaBaseAddr
14771739a20eSAndy Ritger )
14781739a20eSAndy Ritger {
14791739a20eSAndy Ritger NV_STATUS status = NV_OK;
14801739a20eSAndy Ritger KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu);
14811739a20eSAndy Ritger
14821739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_SET_NVSWITCH_FLA_ADDR_PARAMS params;
14831739a20eSAndy Ritger
14841739a20eSAndy Ritger if (!knvlinkIsForcedConfig(pGpu, pKernelNvlink))
14851739a20eSAndy Ritger {
14861739a20eSAndy Ritger knvlinkCoreGetRemoteDeviceInfo(pGpu, pKernelNvlink);
14871739a20eSAndy Ritger
14881739a20eSAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu, pKernelNvlink,
14891739a20eSAndy Ritger pKernelNvlink->enabledLinks);
14901739a20eSAndy Ritger if (status != NV_OK)
14911739a20eSAndy Ritger {
14921739a20eSAndy Ritger return status;
14931739a20eSAndy Ritger }
14941739a20eSAndy Ritger }
14951739a20eSAndy Ritger
14961739a20eSAndy Ritger status = kbusValidateFlaBaseAddress_HAL(pGpu, pKernelBus, flaBaseAddr);
14971739a20eSAndy Ritger if (status != NV_OK)
14981739a20eSAndy Ritger {
14991739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "FLA base addr validation failed for GPU %x\n",
15001739a20eSAndy Ritger pGpu->gpuInstance);
15011739a20eSAndy Ritger return status;
15021739a20eSAndy Ritger }
15031739a20eSAndy Ritger
15041739a20eSAndy Ritger if (IsSLIEnabled(pGpu))
15051739a20eSAndy Ritger {
15061739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR,
15071739a20eSAndy Ritger "Operation is unsupported on SLI enabled GPU %x\n",
15081739a20eSAndy Ritger pGpu->gpuInstance);
15091739a20eSAndy Ritger return NV_ERR_NOT_SUPPORTED;
15101739a20eSAndy Ritger }
15111739a20eSAndy Ritger
15121739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
15131739a20eSAndy Ritger params.bGet = NV_FALSE;
15141739a20eSAndy Ritger params.addr = flaBaseAddr;
15151739a20eSAndy Ritger
15161739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
15171739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_SET_NVSWITCH_FLA_ADDR,
15181739a20eSAndy Ritger (void *)¶ms, sizeof(params));
15191739a20eSAndy Ritger if (status != NV_OK)
15201739a20eSAndy Ritger {
15211739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to stash fla base address for GPU %x\n",
15221739a20eSAndy Ritger pGpu->gpuInstance);
15231739a20eSAndy Ritger return status;
15241739a20eSAndy Ritger }
15251739a20eSAndy Ritger
15261739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, "FLA base addr %llx is assigned to GPU %x\n",
15271739a20eSAndy Ritger flaBaseAddr, pGpu->gpuInstance);
15281739a20eSAndy Ritger
15291739a20eSAndy Ritger return NV_OK;
15301739a20eSAndy Ritger }
15311739a20eSAndy Ritger
15321739a20eSAndy Ritger /*!
15331739a20eSAndy Ritger * @brief Synchronize the link masks and vbios defined properties
15341739a20eSAndy Ritger * between CPU and GSP-RMs
15351739a20eSAndy Ritger *
15361739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
15371739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
15381739a20eSAndy Ritger */
15391739a20eSAndy Ritger NV_STATUS
knvlinkSyncLinkMasksAndVbiosInfo_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)15401739a20eSAndy Ritger knvlinkSyncLinkMasksAndVbiosInfo_IMPL
15411739a20eSAndy Ritger (
15421739a20eSAndy Ritger OBJGPU *pGpu,
15431739a20eSAndy Ritger KernelNvlink *pKernelNvlink
15441739a20eSAndy Ritger )
15451739a20eSAndy Ritger {
15461739a20eSAndy Ritger NV_STATUS status = NV_OK;
15471739a20eSAndy Ritger
15481739a20eSAndy Ritger NV2080_CTRL_NVLINK_SYNC_LINK_MASKS_AND_VBIOS_INFO_PARAMS params;
15491739a20eSAndy Ritger
15501739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
15511739a20eSAndy Ritger
15521739a20eSAndy Ritger params.discoveredLinks = pKernelNvlink->discoveredLinks;
15531739a20eSAndy Ritger params.connectedLinksMask = pKernelNvlink->connectedLinksMask;
15541739a20eSAndy Ritger params.bridgeSensableLinks = pKernelNvlink->bridgeSensableLinks;
15551739a20eSAndy Ritger params.bridgedLinks = pKernelNvlink->bridgedLinks;
15561739a20eSAndy Ritger
15571739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init
15581739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu))
15591739a20eSAndy Ritger {
15601739a20eSAndy Ritger threadStateResetTimeout(pGpu);
15611739a20eSAndy Ritger }
15621739a20eSAndy Ritger
15631739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
15641739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_SYNC_LINK_MASKS_AND_VBIOS_INFO,
15651739a20eSAndy Ritger (void *)¶ms, sizeof(params));
15661739a20eSAndy Ritger
15671739a20eSAndy Ritger pKernelNvlink->vbiosDisabledLinkMask = params.vbiosDisabledLinkMask;
15681739a20eSAndy Ritger pKernelNvlink->initializedLinks = params.initializedLinks;
15691739a20eSAndy Ritger pKernelNvlink->initDisabledLinksMask = params.initDisabledLinksMask;
15701739a20eSAndy Ritger pKernelNvlink->bEnableSafeModeAtLoad = params.bEnableSafeModeAtLoad;
15711739a20eSAndy Ritger pKernelNvlink->bEnableTrainingAtLoad = params.bEnableTrainingAtLoad;
15721739a20eSAndy Ritger
15731739a20eSAndy Ritger return status;
15741739a20eSAndy Ritger }
15751739a20eSAndy Ritger
15761739a20eSAndy Ritger /*!
15771739a20eSAndy Ritger * @brief Update link connection status.
15781739a20eSAndy Ritger *
15791739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
15801739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
15811739a20eSAndy Ritger * @param[in] linkId Target link Id
15821739a20eSAndy Ritger */
15831739a20eSAndy Ritger NV_STATUS
knvlinkUpdateLinkConnectionStatus_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 linkId)15841739a20eSAndy Ritger knvlinkUpdateLinkConnectionStatus_IMPL
15851739a20eSAndy Ritger (
15861739a20eSAndy Ritger OBJGPU *pGpu,
15871739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
15881739a20eSAndy Ritger NvU32 linkId
15891739a20eSAndy Ritger )
15901739a20eSAndy Ritger {
15911739a20eSAndy Ritger NV_STATUS status = NV_OK;
15921739a20eSAndy Ritger
15931739a20eSAndy Ritger NV2080_CTRL_NVLINK_UPDATE_LINK_CONNECTION_PARAMS params;
15941739a20eSAndy Ritger
15951739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
15961739a20eSAndy Ritger
15971739a20eSAndy Ritger params.linkId = linkId;
15981739a20eSAndy Ritger
15991739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
16001739a20eSAndy Ritger
16011739a20eSAndy Ritger params.bConnected = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bConnected;
16021739a20eSAndy Ritger params.remoteDeviceType = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.deviceType;
16031739a20eSAndy Ritger params.remoteLinkNumber = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber;
1604758b4ee8SAndy Ritger params.remoteChipSid = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.chipSid;
1605758b4ee8SAndy Ritger params.remoteDomain = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.domain;
1606758b4ee8SAndy Ritger params.remoteBus = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bus;
1607758b4ee8SAndy Ritger params.remoteDevice = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.device;
1608758b4ee8SAndy Ritger params.remoteFunction = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.function;
1609758b4ee8SAndy Ritger params.remotePciDeviceId = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.pciDeviceId;
1610758b4ee8SAndy Ritger params.laneRxdetStatusMask = pKernelNvlink->nvlinkLinks[linkId].laneRxdetStatusMask;
16111739a20eSAndy Ritger
16121739a20eSAndy Ritger #endif
16131739a20eSAndy Ritger
16141739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init
16151739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu))
16161739a20eSAndy Ritger {
16171739a20eSAndy Ritger threadStateResetTimeout(pGpu);
16181739a20eSAndy Ritger }
16191739a20eSAndy Ritger
16201739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
16211739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_UPDATE_LINK_CONNECTION,
16221739a20eSAndy Ritger (void *)¶ms, sizeof(params));
16231739a20eSAndy Ritger if (status != NV_OK)
16241739a20eSAndy Ritger {
16251739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to update Link connection status!\n");
16261739a20eSAndy Ritger return status;
16271739a20eSAndy Ritger }
16281739a20eSAndy Ritger
16291739a20eSAndy Ritger return NV_OK;
16301739a20eSAndy Ritger }
16311739a20eSAndy Ritger
16321739a20eSAndy Ritger /*!
163390eb1077SAndy Ritger * @brief Execute initial steps to Train links for ALI.
163490eb1077SAndy Ritger *
163590eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
163690eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
163790eb1077SAndy Ritger * @param[in] linkMask Masks of links to enable
163890eb1077SAndy Ritger * @param[in] bSync Input sync boolean
163990eb1077SAndy Ritger *
164090eb1077SAndy Ritger */
164190eb1077SAndy Ritger NV_STATUS
knvlinkPreTrainLinksToActiveAli_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 linkMask,NvBool bSync)164290eb1077SAndy Ritger knvlinkPreTrainLinksToActiveAli_IMPL
164390eb1077SAndy Ritger (
164490eb1077SAndy Ritger OBJGPU *pGpu,
164590eb1077SAndy Ritger KernelNvlink *pKernelNvlink,
164690eb1077SAndy Ritger NvU32 linkMask,
164790eb1077SAndy Ritger NvBool bSync
164890eb1077SAndy Ritger )
164990eb1077SAndy Ritger {
165090eb1077SAndy Ritger NV_STATUS status = NV_OK;
165190eb1077SAndy Ritger
165290eb1077SAndy Ritger NV2080_CTRL_NVLINK_PRE_LINK_TRAIN_ALI_PARAMS params;
165390eb1077SAndy Ritger
165490eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params));
165590eb1077SAndy Ritger
165690eb1077SAndy Ritger params.linkMask = linkMask;
165790eb1077SAndy Ritger params.bSync = bSync;
165890eb1077SAndy Ritger
165990eb1077SAndy Ritger // Reset timeout to clear any accumulated timeouts from link init
166090eb1077SAndy Ritger if (IS_GSP_CLIENT(pGpu))
166190eb1077SAndy Ritger {
166290eb1077SAndy Ritger threadStateResetTimeout(pGpu);
166390eb1077SAndy Ritger }
166490eb1077SAndy Ritger
166590eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
166690eb1077SAndy Ritger NV2080_CTRL_CMD_NVLINK_PRE_LINK_TRAIN_ALI,
166790eb1077SAndy Ritger (void *)¶ms, sizeof(params));
166890eb1077SAndy Ritger if (status != NV_OK)
166990eb1077SAndy Ritger {
167090eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to execute Pre Link Training ALI steps!\n");
167190eb1077SAndy Ritger return status;
167290eb1077SAndy Ritger }
167390eb1077SAndy Ritger
167490eb1077SAndy Ritger return NV_OK;
167590eb1077SAndy Ritger }
167690eb1077SAndy Ritger
167790eb1077SAndy Ritger /*!
167890eb1077SAndy Ritger * @brief Train links to active for ALI.
167990eb1077SAndy Ritger *
168090eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
168190eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
168290eb1077SAndy Ritger * @param[in] linkMask Masks of links to enable
168390eb1077SAndy Ritger * @param[in] bSync Input sync boolean
168490eb1077SAndy Ritger *
168590eb1077SAndy Ritger */
168690eb1077SAndy Ritger NV_STATUS
knvlinkTrainLinksToActiveAli_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 linkMask,NvBool bSync)168790eb1077SAndy Ritger knvlinkTrainLinksToActiveAli_IMPL
168890eb1077SAndy Ritger (
168990eb1077SAndy Ritger OBJGPU *pGpu,
169090eb1077SAndy Ritger KernelNvlink *pKernelNvlink,
169190eb1077SAndy Ritger NvU32 linkMask,
169290eb1077SAndy Ritger NvBool bSync
169390eb1077SAndy Ritger )
169490eb1077SAndy Ritger {
169590eb1077SAndy Ritger NV_STATUS status = NV_OK;
169690eb1077SAndy Ritger
169790eb1077SAndy Ritger NV2080_CTRL_NVLINK_PRE_LINK_TRAIN_ALI_PARAMS params;
169890eb1077SAndy Ritger
169990eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params));
170090eb1077SAndy Ritger
170190eb1077SAndy Ritger params.linkMask = linkMask;
170290eb1077SAndy Ritger params.bSync = bSync;
170390eb1077SAndy Ritger
170490eb1077SAndy Ritger // Reset timeout to clear any accumulated timeouts from link init
170590eb1077SAndy Ritger if (IS_GSP_CLIENT(pGpu))
170690eb1077SAndy Ritger {
170790eb1077SAndy Ritger threadStateResetTimeout(pGpu);
170890eb1077SAndy Ritger }
170990eb1077SAndy Ritger
171090eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
171190eb1077SAndy Ritger NV2080_CTRL_CMD_NVLINK_LINK_TRAIN_ALI,
171290eb1077SAndy Ritger (void *)¶ms, sizeof(params));
171390eb1077SAndy Ritger if (status != NV_OK)
171490eb1077SAndy Ritger {
171590eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to change ALI Links to active!\n");
171690eb1077SAndy Ritger return status;
171790eb1077SAndy Ritger }
171890eb1077SAndy Ritger
171990eb1077SAndy Ritger return NV_OK;
172090eb1077SAndy Ritger }
172190eb1077SAndy Ritger
172290eb1077SAndy Ritger /*!
17231739a20eSAndy Ritger * @brief Update the post Rx Detect link mask.
17241739a20eSAndy Ritger *
17251739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
17261739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
17271739a20eSAndy Ritger *
17281739a20eSAndy Ritger */
17291739a20eSAndy Ritger NV_STATUS
knvlinkUpdatePostRxDetectLinkMask_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)17301739a20eSAndy Ritger knvlinkUpdatePostRxDetectLinkMask_IMPL
17311739a20eSAndy Ritger (
17321739a20eSAndy Ritger OBJGPU *pGpu,
17331739a20eSAndy Ritger KernelNvlink *pKernelNvlink
17341739a20eSAndy Ritger )
17351739a20eSAndy Ritger {
17361739a20eSAndy Ritger NV_STATUS status = NV_OK;
17371739a20eSAndy Ritger
17381739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_LINK_MASK_POST_RX_DET_PARAMS params;
17391739a20eSAndy Ritger
17401739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
17411739a20eSAndy Ritger
17421739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
17431739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_LINK_MASK_POST_RX_DET,
17441739a20eSAndy Ritger (void *)¶ms, sizeof(params));
17451739a20eSAndy Ritger if (status != NV_OK)
17461739a20eSAndy Ritger {
17471739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to update Rx Detect Link mask!\n");
17481739a20eSAndy Ritger return status;
17491739a20eSAndy Ritger }
17501739a20eSAndy Ritger
17511739a20eSAndy Ritger pKernelNvlink->postRxDetLinkMask = params.postRxDetLinkMask;
17521739a20eSAndy Ritger
17531739a20eSAndy Ritger return NV_OK;
17541739a20eSAndy Ritger }
17551739a20eSAndy Ritger
17561739a20eSAndy Ritger /*!
17571739a20eSAndy Ritger * @brief Copy over the NVLink devices information from GSP-RM.
17581739a20eSAndy Ritger *
17591739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
17601739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
17611739a20eSAndy Ritger */
17621739a20eSAndy Ritger NV_STATUS
knvlinkCopyNvlinkDeviceInfo_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)17631739a20eSAndy Ritger knvlinkCopyNvlinkDeviceInfo_IMPL
17641739a20eSAndy Ritger (
17651739a20eSAndy Ritger OBJGPU *pGpu,
17661739a20eSAndy Ritger KernelNvlink *pKernelNvlink
17671739a20eSAndy Ritger )
17681739a20eSAndy Ritger {
17691739a20eSAndy Ritger NV_STATUS status = NV_OK;
17701739a20eSAndy Ritger NvU32 i;
17711739a20eSAndy Ritger
17721739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_NVLINK_DEVICE_INFO_PARAMS nvlinkInfoParams;
17731739a20eSAndy Ritger
17741739a20eSAndy Ritger portMemSet(&nvlinkInfoParams, 0, sizeof(nvlinkInfoParams));
17751739a20eSAndy Ritger
17761739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
17771739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_NVLINK_DEVICE_INFO,
17781739a20eSAndy Ritger (void *)&nvlinkInfoParams, sizeof(nvlinkInfoParams));
17791739a20eSAndy Ritger
17801739a20eSAndy Ritger if (status == NV_ERR_NOT_SUPPORTED)
17811739a20eSAndy Ritger {
17821739a20eSAndy Ritger NV_PRINTF(LEVEL_WARNING, "NVLink is unavailable\n");
17831739a20eSAndy Ritger return status;
17841739a20eSAndy Ritger }
17851739a20eSAndy Ritger else if (status != NV_OK)
17861739a20eSAndy Ritger {
17871739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to retrieve all nvlink device info!\n");
17881739a20eSAndy Ritger return status;
17891739a20eSAndy Ritger }
17901739a20eSAndy Ritger
17911739a20eSAndy Ritger // Update CPU-RM's NVLink state with the information received from GSP-RM RPC
17921739a20eSAndy Ritger pKernelNvlink->ioctrlMask = nvlinkInfoParams.ioctrlMask;
17931739a20eSAndy Ritger pKernelNvlink->ioctrlNumEntries = nvlinkInfoParams.ioctrlNumEntries;
17941739a20eSAndy Ritger pKernelNvlink->ioctrlSize = nvlinkInfoParams.ioctrlSize;
17951739a20eSAndy Ritger pKernelNvlink->discoveredLinks = nvlinkInfoParams.discoveredLinks;
17961739a20eSAndy Ritger pKernelNvlink->ipVerNvlink = nvlinkInfoParams.ipVerNvlink;
17971739a20eSAndy Ritger
17981739a20eSAndy Ritger for (i = 0; i < NVLINK_MAX_LINKS_SW; i++)
17991739a20eSAndy Ritger {
18001739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].pGpu = pGpu;
18011739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].bValid = nvlinkInfoParams.linkInfo[i].bValid;
18021739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].linkId = nvlinkInfoParams.linkInfo[i].linkId;
18031739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].ioctrlId = nvlinkInfoParams.linkInfo[i].ioctrlId;
18041739a20eSAndy Ritger
18051739a20eSAndy Ritger // Copy over the link PLL master and slave relationship for each link
18061739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].pllMasterLinkId = nvlinkInfoParams.linkInfo[i].pllMasterLinkId;
18071739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].pllSlaveLinkId = nvlinkInfoParams.linkInfo[i].pllSlaveLinkId;
18081739a20eSAndy Ritger
18091739a20eSAndy Ritger // Copy over the ip versions for DLPL devices discovered
18101739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].ipVerDlPl = nvlinkInfoParams.linkInfo[i].ipVerDlPl;
18111739a20eSAndy Ritger }
18121739a20eSAndy Ritger
18131739a20eSAndy Ritger return NV_OK;
18141739a20eSAndy Ritger }
18151739a20eSAndy Ritger
18161739a20eSAndy Ritger /*!
18171739a20eSAndy Ritger * @brief Copy over the Ioctrl devices information from GSP-RM.
18181739a20eSAndy Ritger *
18191739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
18201739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
18211739a20eSAndy Ritger */
18221739a20eSAndy Ritger NV_STATUS
knvlinkCopyIoctrlDeviceInfo_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)18231739a20eSAndy Ritger knvlinkCopyIoctrlDeviceInfo_IMPL
18241739a20eSAndy Ritger (
18251739a20eSAndy Ritger OBJGPU *pGpu,
18261739a20eSAndy Ritger KernelNvlink *pKernelNvlink
18271739a20eSAndy Ritger )
18281739a20eSAndy Ritger {
18291739a20eSAndy Ritger KernelIoctrl *pKernelIoctrl = NULL;
18301739a20eSAndy Ritger NV_STATUS status = NV_OK;
18311739a20eSAndy Ritger NvU32 ioctrlIdx;
18321739a20eSAndy Ritger
18331739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_IOCTRL_DEVICE_INFO_PARAMS ioctrlInfoParams;
18341739a20eSAndy Ritger
18351739a20eSAndy Ritger // Query the IOCTRL information for each of the IOCTRLs discovered
18361739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, ioctrlIdx, pKernelNvlink->ioctrlMask)
18371739a20eSAndy Ritger {
18381739a20eSAndy Ritger portMemSet(&ioctrlInfoParams, 0, sizeof(ioctrlInfoParams));
18391739a20eSAndy Ritger
18401739a20eSAndy Ritger ioctrlInfoParams.ioctrlIdx = ioctrlIdx;
18411739a20eSAndy Ritger
18421739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
18431739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_IOCTRL_DEVICE_INFO,
18441739a20eSAndy Ritger (void *)&ioctrlInfoParams, sizeof(ioctrlInfoParams));
18451739a20eSAndy Ritger
18461739a20eSAndy Ritger if (status == NV_ERR_NOT_SUPPORTED)
18471739a20eSAndy Ritger {
18481739a20eSAndy Ritger NV_PRINTF(LEVEL_WARNING, "NVLink is unavailable\n");
18491739a20eSAndy Ritger return status;
18501739a20eSAndy Ritger }
18511739a20eSAndy Ritger else if (status != NV_OK)
18521739a20eSAndy Ritger {
18531739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to retrieve device info for IOCTRL %d!\n", ioctrlIdx);
18541739a20eSAndy Ritger return status;
18551739a20eSAndy Ritger }
18561739a20eSAndy Ritger
18571739a20eSAndy Ritger pKernelIoctrl = KNVLINK_GET_IOCTRL(pKernelNvlink, ioctrlIdx);
18581739a20eSAndy Ritger
18591739a20eSAndy Ritger // Update CPU-RM's NVLink state with the information received from GSP-RM RPC
18601739a20eSAndy Ritger pKernelIoctrl->PublicId = ioctrlInfoParams.PublicId;
18611739a20eSAndy Ritger pKernelIoctrl->localDiscoveredLinks = ioctrlInfoParams.localDiscoveredLinks;
18621739a20eSAndy Ritger pKernelIoctrl->localGlobalLinkOffset = ioctrlInfoParams.localGlobalLinkOffset;
18631739a20eSAndy Ritger pKernelIoctrl->ioctrlDiscoverySize = ioctrlInfoParams.ioctrlDiscoverySize;
18641739a20eSAndy Ritger pKernelIoctrl->numDevices = ioctrlInfoParams.numDevices;
18651739a20eSAndy Ritger
18661739a20eSAndy Ritger // Copy over the ip versions for the ioctrl and minion devices discovered
18671739a20eSAndy Ritger pKernelIoctrl->ipVerIoctrl = ioctrlInfoParams.ipRevisions.ipVerIoctrl;
18681739a20eSAndy Ritger pKernelIoctrl->ipVerMinion = ioctrlInfoParams.ipRevisions.ipVerMinion;
18691739a20eSAndy Ritger
18701739a20eSAndy Ritger if (pKernelIoctrl->ipVerMinion == 0)
18711739a20eSAndy Ritger {
18721739a20eSAndy Ritger pKernelIoctrl->setProperty(pKernelIoctrl, PDB_PROP_KIOCTRL_MINION_AVAILABLE, NV_FALSE);
18731739a20eSAndy Ritger }
18741739a20eSAndy Ritger }
18751739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END;
18761739a20eSAndy Ritger
18771739a20eSAndy Ritger return NV_OK;
18781739a20eSAndy Ritger }
18791739a20eSAndy Ritger
18801739a20eSAndy Ritger /**
18811739a20eSAndy Ritger * @brief Setup topology information for the forced nvlink configurations
18821739a20eSAndy Ritger *
18831739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU
18841739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
18851739a20eSAndy Ritger */
18861739a20eSAndy Ritger NV_STATUS
knvlinkSetupTopologyForForcedConfig_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)18871739a20eSAndy Ritger knvlinkSetupTopologyForForcedConfig_IMPL
18881739a20eSAndy Ritger (
18891739a20eSAndy Ritger OBJGPU *pGpu,
18901739a20eSAndy Ritger KernelNvlink *pKernelNvlink
18911739a20eSAndy Ritger )
18921739a20eSAndy Ritger {
18931739a20eSAndy Ritger NV_STATUS status = NV_OK;
18941739a20eSAndy Ritger NvU32 i, physLink;
18951739a20eSAndy Ritger
18961739a20eSAndy Ritger // Start with all links disabled and no forced config in effect
18971739a20eSAndy Ritger pKernelNvlink->bRegistryLinkOverride = NV_TRUE;
18981739a20eSAndy Ritger pKernelNvlink->registryLinkMask = 0;
18991739a20eSAndy Ritger pKernelNvlink->bChiplibConfig = NV_FALSE;
19001739a20eSAndy Ritger
19011739a20eSAndy Ritger for (i = 0; i < NVLINK_MAX_LINKS_SW; i++)
19021739a20eSAndy Ritger {
19031739a20eSAndy Ritger // Filter against the links discovered from IOCTRL
19041739a20eSAndy Ritger if (!(pKernelNvlink->discoveredLinks & NVBIT(i)))
19051739a20eSAndy Ritger continue;
19061739a20eSAndy Ritger
19071739a20eSAndy Ritger // The physical link is guaranteed valid in all cases
19081739a20eSAndy Ritger physLink = DRF_VAL(_NVLINK, _ARCH_CONNECTION, _PHYSICAL_LINK, pKernelNvlink->pLinkConnection[i]);
19091739a20eSAndy Ritger
19101739a20eSAndy Ritger // Update link tracking
19111739a20eSAndy Ritger if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _ENABLED, pKernelNvlink->pLinkConnection[i]))
19121739a20eSAndy Ritger {
19131739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO,
19141739a20eSAndy Ritger "ARCH_CONNECTION info from chiplib: ENABLED Logical link %d (Physical "
19151739a20eSAndy Ritger "link %d) = 0x%X\n", i, physLink,
19161739a20eSAndy Ritger pKernelNvlink->pLinkConnection[i]);
19171739a20eSAndy Ritger
19181739a20eSAndy Ritger //
19191739a20eSAndy Ritger // This "link" should be ENABLED. We use the physical link since RM only deals with
19201739a20eSAndy Ritger // physical links.
19211739a20eSAndy Ritger //
19221739a20eSAndy Ritger pKernelNvlink->registryLinkMask |= NVBIT(physLink);
19231739a20eSAndy Ritger
19241739a20eSAndy Ritger // Config is forced (at least one link requested)
19251739a20eSAndy Ritger pKernelNvlink->bChiplibConfig = NV_TRUE;
19261739a20eSAndy Ritger }
19271739a20eSAndy Ritger else
19281739a20eSAndy Ritger {
19291739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO,
19301739a20eSAndy Ritger "ARCH_CONNECTION info from chiplib: DISABLED Logical link %d (Physical "
19311739a20eSAndy Ritger "link %d) = 0x%X\n", i, physLink,
19321739a20eSAndy Ritger pKernelNvlink->pLinkConnection[i]);
19331739a20eSAndy Ritger }
19341739a20eSAndy Ritger
19351739a20eSAndy Ritger // Accumulate any PEER links
19361739a20eSAndy Ritger if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _PEER_MASK, pKernelNvlink->pLinkConnection[i]))
19371739a20eSAndy Ritger {
19381739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
19391739a20eSAndy Ritger // Ensure reginit has the info it needs for the remote side
19401739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.bConnected = NV_TRUE;
19411739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.deviceType =
19421739a20eSAndy Ritger NV2080_CTRL_NVLINK_DEVICE_INFO_DEVICE_TYPE_GPU;
19431739a20eSAndy Ritger
19441739a20eSAndy Ritger #endif
19451739a20eSAndy Ritger }
19461739a20eSAndy Ritger
19471739a20eSAndy Ritger // Accumulate any CPU links
19481739a20eSAndy Ritger if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _CPU, pKernelNvlink->pLinkConnection[i]))
19491739a20eSAndy Ritger {
19501739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB)
19511739a20eSAndy Ritger // Ensure reginit has the info it needs for the remote side
19521739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.bConnected = NV_TRUE;
19531739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.deviceType = pKernelNvlink->forcedSysmemDeviceType;
19541739a20eSAndy Ritger #endif
19551739a20eSAndy Ritger }
19561739a20eSAndy Ritger
19571739a20eSAndy Ritger // RPC into GSP-RM to update the link remote connection status
19581739a20eSAndy Ritger status = knvlinkUpdateLinkConnectionStatus(pGpu, pKernelNvlink, i);
19591739a20eSAndy Ritger if (status != NV_OK)
19601739a20eSAndy Ritger {
19611739a20eSAndy Ritger return status;
19621739a20eSAndy Ritger }
19631739a20eSAndy Ritger }
19641739a20eSAndy Ritger
19651739a20eSAndy Ritger // Update enabledLinks mask with the mask of forced link configurations
19661739a20eSAndy Ritger pKernelNvlink->enabledLinks = pKernelNvlink->discoveredLinks & pKernelNvlink->registryLinkMask;
19671739a20eSAndy Ritger
19681739a20eSAndy Ritger return NV_OK;
19691739a20eSAndy Ritger }
19701739a20eSAndy Ritger
19711739a20eSAndy Ritger /*!
19721739a20eSAndy Ritger * @brief Sync the lane shutdown properties with GSP-RM
19731739a20eSAndy Ritger *
19741739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
19751739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
19761739a20eSAndy Ritger */
19771739a20eSAndy Ritger NV_STATUS
knvlinkSyncLaneShutdownProps_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)19781739a20eSAndy Ritger knvlinkSyncLaneShutdownProps_IMPL
19791739a20eSAndy Ritger (
19801739a20eSAndy Ritger OBJGPU *pGpu,
19811739a20eSAndy Ritger KernelNvlink *pKernelNvlink
19821739a20eSAndy Ritger )
19831739a20eSAndy Ritger {
19841739a20eSAndy Ritger NV_STATUS status = NV_OK;
19851739a20eSAndy Ritger
19861739a20eSAndy Ritger NV2080_CTRL_NVLINK_SYNC_NVLINK_SHUTDOWN_PROPS_PARAMS params;
19871739a20eSAndy Ritger
19881739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
19891739a20eSAndy Ritger
19901739a20eSAndy Ritger params.bLaneShutdownEnabled =
19911739a20eSAndy Ritger pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ENABLED);
19921739a20eSAndy Ritger params.bLaneShutdownOnUnload =
19931739a20eSAndy Ritger pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ON_UNLOAD);
19941739a20eSAndy Ritger
19951739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
19961739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_SYNC_NVLINK_SHUTDOWN_PROPS,
19971739a20eSAndy Ritger (void *)¶ms, sizeof(params));
19981739a20eSAndy Ritger if (status != NV_OK)
19991739a20eSAndy Ritger {
20001739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to sync NVLink shutdown properties with GSP!\n");
20011739a20eSAndy Ritger return status;
20021739a20eSAndy Ritger }
20031739a20eSAndy Ritger
20041739a20eSAndy Ritger return NV_OK;
20051739a20eSAndy Ritger }
20061739a20eSAndy Ritger
20071739a20eSAndy Ritger /*!
200890eb1077SAndy Ritger * @brief Get the number of active links allowed per IOCTRL
200990eb1077SAndy Ritger *
201090eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer
201190eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
201290eb1077SAndy Ritger *
201390eb1077SAndy Ritger * @returns On success, returns the number of active links per IOCTRL.
201490eb1077SAndy Ritger * On failure, returns 0.
201590eb1077SAndy Ritger */
201690eb1077SAndy Ritger NvU32
knvlinkGetNumActiveLinksPerIoctrl_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)201790eb1077SAndy Ritger knvlinkGetNumActiveLinksPerIoctrl_IMPL
201890eb1077SAndy Ritger (
201990eb1077SAndy Ritger OBJGPU *pGpu,
202090eb1077SAndy Ritger KernelNvlink *pKernelNvlink
202190eb1077SAndy Ritger )
202290eb1077SAndy Ritger {
202390eb1077SAndy Ritger NV_STATUS status;
202490eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_NUM_ACTIVE_LINK_PER_IOCTRL_PARAMS params;
202590eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params));
202690eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
202790eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_NUM_ACTIVE_LINK_PER_IOCTRL,
202890eb1077SAndy Ritger (void *)¶ms, sizeof(params));
202990eb1077SAndy Ritger if (status != NV_OK)
203090eb1077SAndy Ritger {
203190eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get the number of active links per IOCTRL\n");
203290eb1077SAndy Ritger return 0;
203390eb1077SAndy Ritger }
203490eb1077SAndy Ritger return params.numActiveLinksPerIoctrl;
203590eb1077SAndy Ritger }
203690eb1077SAndy Ritger
203790eb1077SAndy Ritger /*!
203890eb1077SAndy Ritger * @brief Get the number of total links per IOCTRL
203990eb1077SAndy Ritger *
204090eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer
204190eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
204290eb1077SAndy Ritger *
204390eb1077SAndy Ritger * @returns On success, returns the number of total links per IOCTRL.
204490eb1077SAndy Ritger * On failure, returns 0.
204590eb1077SAndy Ritger */
204690eb1077SAndy Ritger NvU32
knvlinkGetTotalNumLinksPerIoctrl_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)204790eb1077SAndy Ritger knvlinkGetTotalNumLinksPerIoctrl_IMPL
204890eb1077SAndy Ritger (
204990eb1077SAndy Ritger OBJGPU *pGpu,
205090eb1077SAndy Ritger KernelNvlink *pKernelNvlink
205190eb1077SAndy Ritger )
205290eb1077SAndy Ritger {
205390eb1077SAndy Ritger NV_STATUS status;
205490eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_TOTAL_NUM_LINK_PER_IOCTRL_PARAMS params;
205590eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params));
205690eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
205790eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_TOTAL_NUM_LINK_PER_IOCTRL,
205890eb1077SAndy Ritger (void *)¶ms, sizeof(params));
205990eb1077SAndy Ritger if (status != NV_OK)
206090eb1077SAndy Ritger {
206190eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get the total number of links per IOCTRL\n");
206290eb1077SAndy Ritger return 0;
206390eb1077SAndy Ritger }
206490eb1077SAndy Ritger return params.numLinksPerIoctrl;
206590eb1077SAndy Ritger }
206690eb1077SAndy Ritger
20671739a20eSAndy Ritger /**
20681739a20eSAndy Ritger * @brief Process the mask of init disabled links
20691739a20eSAndy Ritger *
20701739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer
20711739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer
20721739a20eSAndy Ritger */
20731739a20eSAndy Ritger NV_STATUS
knvlinkProcessInitDisabledLinks_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)20741739a20eSAndy Ritger knvlinkProcessInitDisabledLinks_IMPL
20751739a20eSAndy Ritger (
20761739a20eSAndy Ritger OBJGPU *pGpu,
20771739a20eSAndy Ritger KernelNvlink *pKernelNvlink
20781739a20eSAndy Ritger )
20791739a20eSAndy Ritger {
20801739a20eSAndy Ritger NvU32 mask = 0;
20811739a20eSAndy Ritger NvBool bSkipHwNvlinkDisable = 0;
20821739a20eSAndy Ritger NV_STATUS status = NV_OK;
20831739a20eSAndy Ritger
20841739a20eSAndy Ritger NV2080_CTRL_NVLINK_PROCESS_INIT_DISABLED_LINKS_PARAMS params;
20851739a20eSAndy Ritger
20861739a20eSAndy Ritger status = gpumgrGetGpuInitDisabledNvlinks(pGpu->gpuId, &mask, &bSkipHwNvlinkDisable);
20871739a20eSAndy Ritger if (status != NV_OK)
20881739a20eSAndy Ritger {
20891739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get init disabled links from gpumgr\n");
20901739a20eSAndy Ritger return status;
20911739a20eSAndy Ritger }
20921739a20eSAndy Ritger
20931739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params));
20941739a20eSAndy Ritger
20951739a20eSAndy Ritger params.initDisabledLinksMask = mask;
20961739a20eSAndy Ritger params.bSkipHwNvlinkDisable = bSkipHwNvlinkDisable;
20971739a20eSAndy Ritger
20981739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
20991739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_PROCESS_INIT_DISABLED_LINKS,
21001739a20eSAndy Ritger (void *)¶ms, sizeof(params));
21011739a20eSAndy Ritger if (status != NV_OK)
21021739a20eSAndy Ritger {
21031739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to process init disabled links in GSP\n");
21041739a20eSAndy Ritger return status;
21051739a20eSAndy Ritger }
21061739a20eSAndy Ritger
21071739a20eSAndy Ritger pKernelNvlink->initDisabledLinksMask = params.initDisabledLinksMask;
21081739a20eSAndy Ritger
21091739a20eSAndy Ritger return NV_OK;
21101739a20eSAndy Ritger }
21111739a20eSAndy Ritger
211291676d66SBernhard Stoeckner void
knvlinkFatalErrorRecovery_WORKITEM(NvU32 gpuInstance,void * pArgs)211391676d66SBernhard Stoeckner knvlinkFatalErrorRecovery_WORKITEM
211491676d66SBernhard Stoeckner (
211591676d66SBernhard Stoeckner NvU32 gpuInstance,
211691676d66SBernhard Stoeckner void *pArgs
211791676d66SBernhard Stoeckner )
211891676d66SBernhard Stoeckner {
211991676d66SBernhard Stoeckner OBJGPU *pGpu = gpumgrGetGpu(gpuInstance);
212091676d66SBernhard Stoeckner rcAndDisableOutstandingClientsWithImportedMemory(pGpu, NV_FABRIC_INVALID_NODE_ID);
212191676d66SBernhard Stoeckner }
212291676d66SBernhard Stoeckner
212391676d66SBernhard Stoeckner NV_STATUS
knvlinkFatalErrorRecovery_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink)212491676d66SBernhard Stoeckner knvlinkFatalErrorRecovery_IMPL
212591676d66SBernhard Stoeckner (
212691676d66SBernhard Stoeckner OBJGPU *pGpu,
212791676d66SBernhard Stoeckner KernelNvlink *pKernelNvlink
212891676d66SBernhard Stoeckner )
212991676d66SBernhard Stoeckner {
213091676d66SBernhard Stoeckner NV_STATUS status;
213191676d66SBernhard Stoeckner
213291676d66SBernhard Stoeckner status = osQueueWorkItemWithFlags(pGpu, knvlinkFatalErrorRecovery_WORKITEM, NULL,
213391676d66SBernhard Stoeckner (OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
213491676d66SBernhard Stoeckner OS_QUEUE_WORKITEM_FLAGS_LOCK_API_RW |
213591676d66SBernhard Stoeckner OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW));
213691676d66SBernhard Stoeckner
213791676d66SBernhard Stoeckner return status;
213891676d66SBernhard Stoeckner }
213991676d66SBernhard Stoeckner
21401739a20eSAndy Ritger // Grab GPU locks before RPCing into GSP-RM for NVLink RPCs
21411739a20eSAndy Ritger NV_STATUS
knvlinkExecGspRmRpc_IMPL(OBJGPU * pGpu,KernelNvlink * pKernelNvlink,NvU32 cmd,void * paramAddr,NvU32 paramSize)21421739a20eSAndy Ritger knvlinkExecGspRmRpc_IMPL
21431739a20eSAndy Ritger (
21441739a20eSAndy Ritger OBJGPU *pGpu,
21451739a20eSAndy Ritger KernelNvlink *pKernelNvlink,
21461739a20eSAndy Ritger NvU32 cmd,
21471739a20eSAndy Ritger void *paramAddr,
21481739a20eSAndy Ritger NvU32 paramSize
21491739a20eSAndy Ritger )
21501739a20eSAndy Ritger {
21511739a20eSAndy Ritger NvU32 gpuMaskRelease = 0;
21521739a20eSAndy Ritger NvU32 gpuMaskInitial = rmGpuLocksGetOwnedMask();
21531739a20eSAndy Ritger NvU32 gpuMask = gpuMaskInitial | NVBIT(pGpu->gpuInstance);
21541739a20eSAndy Ritger NV_STATUS status = NV_OK;
21551739a20eSAndy Ritger
21561739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu))
21571739a20eSAndy Ritger {
21581739a20eSAndy Ritger if (!rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_MASK, &gpuMask))
21591739a20eSAndy Ritger {
21601739a20eSAndy Ritger status = rmGpuGroupLockAcquire(pGpu->gpuInstance,
21611739a20eSAndy Ritger GPU_LOCK_GRP_MASK,
21621739a20eSAndy Ritger GPU_LOCK_FLAGS_SAFE_LOCK_UPGRADE,
21631739a20eSAndy Ritger RM_LOCK_MODULES_NVLINK,
21641739a20eSAndy Ritger &gpuMask);
21651739a20eSAndy Ritger if (status != NV_OK)
21661739a20eSAndy Ritger {
21671739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to acquire locks for gpumask 0x%x\n", gpuMask);
21681739a20eSAndy Ritger return status;
21691739a20eSAndy Ritger }
21701739a20eSAndy Ritger
21711739a20eSAndy Ritger gpuMaskRelease = (gpuMask & (~gpuMaskInitial));
21721739a20eSAndy Ritger }
21731739a20eSAndy Ritger }
21741739a20eSAndy Ritger
21751739a20eSAndy Ritger RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
21761739a20eSAndy Ritger status = pRmApi->Control(pRmApi,
21771739a20eSAndy Ritger pGpu->hInternalClient,
21781739a20eSAndy Ritger pGpu->hInternalSubdevice,
21791739a20eSAndy Ritger cmd, paramAddr, paramSize);
21801739a20eSAndy Ritger if (gpuMaskRelease)
21811739a20eSAndy Ritger {
21821739a20eSAndy Ritger rmGpuGroupLockRelease(gpuMaskRelease, GPUS_LOCK_FLAGS_NONE);
21831739a20eSAndy Ritger }
21841739a20eSAndy Ritger
21851739a20eSAndy Ritger return status;
21861739a20eSAndy Ritger }
21871739a20eSAndy Ritger
21881739a20eSAndy Ritger void
knvlinkUtoa(NvU8 * str,NvU64 length,NvU64 val)21891739a20eSAndy Ritger knvlinkUtoa(NvU8 *str, NvU64 length, NvU64 val)
21901739a20eSAndy Ritger {
21911739a20eSAndy Ritger NvU8 temp[NV2080_GPU_MAX_NAME_STRING_LENGTH];
21921739a20eSAndy Ritger NvU8 *ptr = temp;
21931739a20eSAndy Ritger NvU64 i = 0;
21941739a20eSAndy Ritger
21951739a20eSAndy Ritger NV_ASSERT(str != NULL);
21961739a20eSAndy Ritger
21971739a20eSAndy Ritger do
21981739a20eSAndy Ritger {
21991739a20eSAndy Ritger i = val % 10;
22001739a20eSAndy Ritger val = val / 10;
22011739a20eSAndy Ritger *ptr++ = (NvU8)(i + '0');
22021739a20eSAndy Ritger } while(val);
22031739a20eSAndy Ritger
22041739a20eSAndy Ritger NV_ASSERT(length > (NvU64) (ptr - temp));
22051739a20eSAndy Ritger
22061739a20eSAndy Ritger while (ptr > temp)
22071739a20eSAndy Ritger *str++ = *--ptr;
22081739a20eSAndy Ritger
22091739a20eSAndy Ritger *str = '\0';
22101739a20eSAndy Ritger }
2211