11739a20eSAndy Ritger /* 2eb5c7665SAndy Ritger * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 31739a20eSAndy Ritger * SPDX-License-Identifier: MIT 41739a20eSAndy Ritger * 51739a20eSAndy Ritger * Permission is hereby granted, free of charge, to any person obtaining a 61739a20eSAndy Ritger * copy of this software and associated documentation files (the "Software"), 71739a20eSAndy Ritger * to deal in the Software without restriction, including without limitation 81739a20eSAndy Ritger * the rights to use, copy, modify, merge, publish, distribute, sublicense, 91739a20eSAndy Ritger * and/or sell copies of the Software, and to permit persons to whom the 101739a20eSAndy Ritger * Software is furnished to do so, subject to the following conditions: 111739a20eSAndy Ritger * 121739a20eSAndy Ritger * The above copyright notice and this permission notice shall be included in 131739a20eSAndy Ritger * all copies or substantial portions of the Software. 141739a20eSAndy Ritger * 151739a20eSAndy Ritger * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 161739a20eSAndy Ritger * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 171739a20eSAndy Ritger * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 181739a20eSAndy Ritger * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 191739a20eSAndy Ritger * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 201739a20eSAndy Ritger * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 211739a20eSAndy Ritger * DEALINGS IN THE SOFTWARE. 221739a20eSAndy Ritger */ 231739a20eSAndy Ritger 24eb5c7665SAndy Ritger #define NVOC_KERNEL_NVLINK_H_PRIVATE_ACCESS_ALLOWED 25eb5c7665SAndy Ritger 26eb5c7665SAndy Ritger // FIXME XXX 27eb5c7665SAndy Ritger #define NVOC_KERNEL_IOCTRL_H_PRIVATE_ACCESS_ALLOWED 28eb5c7665SAndy Ritger 291739a20eSAndy Ritger #include "os/os.h" 301739a20eSAndy Ritger #include "core/hal.h" 311739a20eSAndy Ritger #include "core/info_block.h" 321739a20eSAndy Ritger #include "core/locks.h" 331739a20eSAndy Ritger #include "gpu/gpu.h" 341739a20eSAndy Ritger #include "kernel/gpu/nvlink/kernel_nvlink.h" 351739a20eSAndy Ritger #include "kernel/gpu/nvlink/kernel_ioctrl.h" 361739a20eSAndy Ritger #include "gpu/mem_mgr/mem_mgr.h" 371739a20eSAndy Ritger #include "gpu/mmu/kern_gmmu.h" 381739a20eSAndy Ritger #include "gpu/ce/kernel_ce.h" 391739a20eSAndy Ritger 401739a20eSAndy Ritger /*! 411739a20eSAndy Ritger * @brief Is NVLINK topology forced? NVLink topology is considered 421739a20eSAndy Ritger * forced for both legacy forced config and chiplib configs 431739a20eSAndy Ritger * 441739a20eSAndy Ritger * @param[in] pGpu OBJGPU 451739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 461739a20eSAndy Ritger * 471739a20eSAndy Ritger * @return NV_TRUE if topology is forced 481739a20eSAndy Ritger */ 491739a20eSAndy Ritger NvBool 501739a20eSAndy Ritger knvlinkIsForcedConfig_IMPL 511739a20eSAndy Ritger ( 521739a20eSAndy Ritger OBJGPU *pGpu, 531739a20eSAndy Ritger KernelNvlink *pKernelNvlink 541739a20eSAndy Ritger ) 551739a20eSAndy Ritger { 561739a20eSAndy Ritger return (pKernelNvlink->bChiplibConfig); 571739a20eSAndy Ritger } 581739a20eSAndy Ritger 591739a20eSAndy Ritger /*! 601739a20eSAndy Ritger * @brief Determine if NVLink is enabled or disabled by default 611739a20eSAndy Ritger * 621739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 631739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 641739a20eSAndy Ritger * 651739a20eSAndy Ritger * @return NV_TRUE if NVLink is enabled on the GPU/platform 661739a20eSAndy Ritger */ 671739a20eSAndy Ritger NvBool 681739a20eSAndy Ritger knvlinkIsNvlinkDefaultEnabled_IMPL 691739a20eSAndy Ritger ( 701739a20eSAndy Ritger OBJGPU *pGpu, 711739a20eSAndy Ritger KernelNvlink *pKernelNvlink 721739a20eSAndy Ritger ) 731739a20eSAndy Ritger { 741739a20eSAndy Ritger // 751739a20eSAndy Ritger // Currently it is critical that the following lib check be present. 761739a20eSAndy Ritger // Burying this in the hal below it may get lost as the stub is all 771739a20eSAndy Ritger // thats required for POR (always true from the hals perspective) 781739a20eSAndy Ritger // 791739a20eSAndy Ritger #if !defined(INCLUDE_NVLINK_LIB) 801739a20eSAndy Ritger 811739a20eSAndy Ritger return NV_FALSE; 821739a20eSAndy Ritger 831739a20eSAndy Ritger #endif 841739a20eSAndy Ritger 851739a20eSAndy Ritger // Let the PDB handle the final decision. 861739a20eSAndy Ritger return pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_ENABLED); 871739a20eSAndy Ritger } 881739a20eSAndy Ritger 891739a20eSAndy Ritger /*! 901739a20eSAndy Ritger * @brief Determine if P2P loopback over NVLink is supported for 911739a20eSAndy Ritger * the given GPU. This function returns true if any link 921739a20eSAndy Ritger * is connected in loopback mode. 931739a20eSAndy Ritger * 941739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 951739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 961739a20eSAndy Ritger * 971739a20eSAndy Ritger * @return NV_TRUE if any link is in loopback mode 981739a20eSAndy Ritger */ 991739a20eSAndy Ritger NvBool 1001739a20eSAndy Ritger knvlinkIsP2pLoopbackSupported_IMPL 1011739a20eSAndy Ritger ( 1021739a20eSAndy Ritger OBJGPU *pGpu, 1031739a20eSAndy Ritger KernelNvlink *pKernelNvlink 1041739a20eSAndy Ritger ) 1051739a20eSAndy Ritger { 1061739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB) 1071739a20eSAndy Ritger 1081739a20eSAndy Ritger NvU32 i; 1091739a20eSAndy Ritger 1101739a20eSAndy Ritger if ((pGpu == NULL) || (pKernelNvlink == NULL)) 1111739a20eSAndy Ritger { 1121739a20eSAndy Ritger return NV_FALSE; 1131739a20eSAndy Ritger } 1141739a20eSAndy Ritger 1151739a20eSAndy Ritger // Return false if P2P loopback is disabled through regkey 1161739a20eSAndy Ritger if (pGpu->getProperty(pGpu, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED)) 1171739a20eSAndy Ritger { 1181739a20eSAndy Ritger return NV_FALSE; 1191739a20eSAndy Ritger } 1201739a20eSAndy Ritger 1211739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks) 1221739a20eSAndy Ritger { 1231739a20eSAndy Ritger if (knvlinkIsP2pLoopbackSupportedPerLink_IMPL(pGpu, pKernelNvlink, i)) 1241739a20eSAndy Ritger return NV_TRUE; 1251739a20eSAndy Ritger } 1261739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END 1271739a20eSAndy Ritger 1281739a20eSAndy Ritger #endif 1291739a20eSAndy Ritger 1301739a20eSAndy Ritger return NV_FALSE; 1311739a20eSAndy Ritger } 1321739a20eSAndy Ritger 1331739a20eSAndy Ritger /*! 1341739a20eSAndy Ritger * @brief Determine if P2P loopback over NVLink is supported for 1351739a20eSAndy Ritger * the given link. This function returns true if the link 1361739a20eSAndy Ritger * is connected in loopback mode. 1371739a20eSAndy Ritger * 1381739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 1391739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 1401739a20eSAndy Ritger * @param[in] link Link ID 1411739a20eSAndy Ritger * 1421739a20eSAndy Ritger * @return NV_TRUE if the link is in loopback mode 1431739a20eSAndy Ritger */ 1441739a20eSAndy Ritger NvBool 1451739a20eSAndy Ritger knvlinkIsP2pLoopbackSupportedPerLink_IMPL 1461739a20eSAndy Ritger ( 1471739a20eSAndy Ritger OBJGPU *pGpu, 1481739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 1491739a20eSAndy Ritger NvU32 link 1501739a20eSAndy Ritger ) 1511739a20eSAndy Ritger { 1521739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB) 1531739a20eSAndy Ritger 1541739a20eSAndy Ritger if ((pGpu == NULL) || (pKernelNvlink == NULL)) 1551739a20eSAndy Ritger { 1561739a20eSAndy Ritger return NV_FALSE; 1571739a20eSAndy Ritger } 1581739a20eSAndy Ritger 1591739a20eSAndy Ritger // Return false if P2P loopback is disabled through regkey 1601739a20eSAndy Ritger if (pGpu->getProperty(pGpu, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED)) 1611739a20eSAndy Ritger { 1621739a20eSAndy Ritger return NV_FALSE; 1631739a20eSAndy Ritger } 1641739a20eSAndy Ritger 1651739a20eSAndy Ritger // Return false if the given link is disabled 1661739a20eSAndy Ritger if (!(NVBIT(link) & pKernelNvlink->enabledLinks)) 1671739a20eSAndy Ritger { 1681739a20eSAndy Ritger return NV_FALSE; 1691739a20eSAndy Ritger } 1701739a20eSAndy Ritger 1711739a20eSAndy Ritger // Check the link connected to the same GPU (loopback) 1721739a20eSAndy Ritger if (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.bConnected) 1731739a20eSAndy Ritger { 174*b5bf85a8SAndy Ritger if (((pKernelNvlink->nvlinkLinks[link].remoteEndInfo.domain == gpuGetDomain(pGpu)) && 1751739a20eSAndy Ritger (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.bus == gpuGetBus(pGpu)) && 1761739a20eSAndy Ritger (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.device == gpuGetDevice(pGpu)) && 177*b5bf85a8SAndy Ritger (pKernelNvlink->nvlinkLinks[link].remoteEndInfo.function == 0)) || 178*b5bf85a8SAndy Ritger pKernelNvlink->PDB_PROP_KNVLINK_FORCED_LOOPBACK_ON_SWITCH_MODE_ENABLED) 1791739a20eSAndy Ritger { 1801739a20eSAndy Ritger return NV_TRUE; 1811739a20eSAndy Ritger } 1821739a20eSAndy Ritger } 1831739a20eSAndy Ritger 1841739a20eSAndy Ritger #endif 1851739a20eSAndy Ritger 1861739a20eSAndy Ritger return NV_FALSE; 1871739a20eSAndy Ritger } 1881739a20eSAndy Ritger 1891739a20eSAndy Ritger /*! 1901739a20eSAndy Ritger * @brief Determine if P2P over NVLINK is supported between 2 GPUs 1911739a20eSAndy Ritger * 1921739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 1931739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 1941739a20eSAndy Ritger * @param[in] pPeerGpu OBJGPU pointer for remote GPU 1951739a20eSAndy Ritger * 1961739a20eSAndy Ritger * @return NV_TRUE if P2P is supported between the 2 GPUs 1971739a20eSAndy Ritger */ 1981739a20eSAndy Ritger NvBool 1991739a20eSAndy Ritger knvlinkIsNvlinkP2pSupported_IMPL 2001739a20eSAndy Ritger ( 2011739a20eSAndy Ritger OBJGPU *pGpu, 2021739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 2031739a20eSAndy Ritger OBJGPU *pPeerGpu 2041739a20eSAndy Ritger ) 2051739a20eSAndy Ritger { 2061739a20eSAndy Ritger NV_STATUS status = NV_OK; 2071739a20eSAndy Ritger 2081739a20eSAndy Ritger if (pKernelNvlink == NULL) 2091739a20eSAndy Ritger { 2101739a20eSAndy Ritger return NV_FALSE; 2111739a20eSAndy Ritger } 2121739a20eSAndy Ritger 2134397463eSAndy Ritger if (knvlinkIsBandwidthModeOff(pKernelNvlink)) 2144397463eSAndy Ritger { 2154397463eSAndy Ritger return NV_FALSE; 2164397463eSAndy Ritger } 2174397463eSAndy Ritger 2181739a20eSAndy Ritger // Get the Nvlink P2P connections from the core library 2191739a20eSAndy Ritger status = knvlinkGetP2pConnectionStatus(pGpu, pKernelNvlink, pPeerGpu); 2201739a20eSAndy Ritger 2211739a20eSAndy Ritger if (status == NV_OK) 2221739a20eSAndy Ritger { 2231739a20eSAndy Ritger return NV_TRUE; 2241739a20eSAndy Ritger } 2251739a20eSAndy Ritger 2261739a20eSAndy Ritger return NV_FALSE; 2271739a20eSAndy Ritger } 2281739a20eSAndy Ritger 229*b5bf85a8SAndy Ritger static NvBool 230*b5bf85a8SAndy Ritger _knvlinkCheckFabricCliqueId 231*b5bf85a8SAndy Ritger ( 232*b5bf85a8SAndy Ritger OBJGPU *pGpu, 233*b5bf85a8SAndy Ritger OBJGPU *pPeerGpu 234*b5bf85a8SAndy Ritger ) 235*b5bf85a8SAndy Ritger { 236*b5bf85a8SAndy Ritger NvU32 cliqueId, peerCliqueId; 237*b5bf85a8SAndy Ritger NV_STATUS status; 238*b5bf85a8SAndy Ritger 239*b5bf85a8SAndy Ritger status = gpuFabricProbeGetFabricCliqueId(pGpu->pGpuFabricProbeInfoKernel, 240*b5bf85a8SAndy Ritger &cliqueId); 241*b5bf85a8SAndy Ritger if (status != NV_OK) 242*b5bf85a8SAndy Ritger { 243*b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d failed to get fabric clique Id: 0x%x\n", 244*b5bf85a8SAndy Ritger gpuGetInstance(pGpu), status); 245*b5bf85a8SAndy Ritger return NV_FALSE; 246*b5bf85a8SAndy Ritger } 247*b5bf85a8SAndy Ritger 248*b5bf85a8SAndy Ritger status = gpuFabricProbeGetFabricCliqueId(pPeerGpu->pGpuFabricProbeInfoKernel, 249*b5bf85a8SAndy Ritger &peerCliqueId); 250*b5bf85a8SAndy Ritger if (status != NV_OK) 251*b5bf85a8SAndy Ritger { 252*b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d failed to get fabric clique Id 0x%x\n", 253*b5bf85a8SAndy Ritger gpuGetInstance(pPeerGpu), status); 254*b5bf85a8SAndy Ritger return NV_FALSE; 255*b5bf85a8SAndy Ritger } 256*b5bf85a8SAndy Ritger 257*b5bf85a8SAndy Ritger if (cliqueId != peerCliqueId) 258*b5bf85a8SAndy Ritger { 259*b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d and Peer GPU %d cliqueId doesn't match\n", 260*b5bf85a8SAndy Ritger gpuGetInstance(pGpu), gpuGetInstance(pPeerGpu)); 261*b5bf85a8SAndy Ritger return NV_FALSE; 262*b5bf85a8SAndy Ritger } 263*b5bf85a8SAndy Ritger 264*b5bf85a8SAndy Ritger return NV_TRUE; 265*b5bf85a8SAndy Ritger } 266*b5bf85a8SAndy Ritger 2671739a20eSAndy Ritger /*! 2681739a20eSAndy Ritger * @brief Checks whether necessary the config setup is done to 2691739a20eSAndy Ritger * support P2P over NVSwitch 2701739a20eSAndy Ritger * 2711739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 2721739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 2731739a20eSAndy Ritger * @param[in] pPeerGpu OBJGPU pointer for remote GPU 2741739a20eSAndy Ritger * 2751739a20eSAndy Ritger * @return NV_TRUE if P2P over NVSwitch 2761739a20eSAndy Ritger */ 2771739a20eSAndy Ritger NvBool 2781739a20eSAndy Ritger knvlinkCheckNvswitchP2pConfig_IMPL 2791739a20eSAndy Ritger ( 2801739a20eSAndy Ritger OBJGPU *pGpu, 2811739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 2821739a20eSAndy Ritger OBJGPU *pPeerGpu 2831739a20eSAndy Ritger ) 2841739a20eSAndy Ritger { 2851739a20eSAndy Ritger MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu); 2861739a20eSAndy Ritger NvU64 rangeStart = knvlinkGetUniqueFabricBaseAddress(pGpu, pKernelNvlink); 2871739a20eSAndy Ritger NvU64 rangeEnd = rangeStart + (pMemoryManager->Ram.fbTotalMemSizeMb << 20); 2881739a20eSAndy Ritger NvU64 peerRangeStart = knvlinkGetUniqueFabricBaseAddress(pPeerGpu, 2891739a20eSAndy Ritger GPU_GET_KERNEL_NVLINK(pPeerGpu)); 2901739a20eSAndy Ritger 2911739a20eSAndy Ritger if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink)) 2921739a20eSAndy Ritger { 2931739a20eSAndy Ritger if (gpuIsSriovEnabled(pGpu)) 2941739a20eSAndy Ritger { 2951739a20eSAndy Ritger // currently vgpu + switch doesn't support GPA addresing. 2961739a20eSAndy Ritger return NV_TRUE; 2971739a20eSAndy Ritger } 2981739a20eSAndy Ritger 299*b5bf85a8SAndy Ritger if (gpuFabricProbeIsSupported(pGpu) && gpuFabricProbeIsSupported(pPeerGpu)) 300*b5bf85a8SAndy Ritger { 301*b5bf85a8SAndy Ritger if (!_knvlinkCheckFabricCliqueId(pGpu, pPeerGpu)) 302*b5bf85a8SAndy Ritger { 303*b5bf85a8SAndy Ritger return NV_FALSE; 304*b5bf85a8SAndy Ritger } 305*b5bf85a8SAndy Ritger } 306*b5bf85a8SAndy Ritger 3071739a20eSAndy Ritger if (knvlinkGetUniqueFabricBaseAddress(pGpu, pKernelNvlink) == 3081739a20eSAndy Ritger NVLINK_INVALID_FABRIC_ADDR) 3091739a20eSAndy Ritger { 3101739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "GPU %d doesn't have a fabric address\n", 3111739a20eSAndy Ritger gpuGetInstance(pGpu)); 3121739a20eSAndy Ritger 3131739a20eSAndy Ritger return NV_FALSE; 3141739a20eSAndy Ritger } 3151739a20eSAndy Ritger 3161739a20eSAndy Ritger if ((pGpu != pPeerGpu) && 3171739a20eSAndy Ritger ((peerRangeStart >= rangeStart) && (peerRangeStart < rangeEnd))) 3181739a20eSAndy Ritger { 3191739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 3201739a20eSAndy Ritger "GPU %d doesn't have a unique fabric address\n", 3211739a20eSAndy Ritger gpuGetInstance(pGpu)); 3221739a20eSAndy Ritger 3231739a20eSAndy Ritger return NV_FALSE; 3241739a20eSAndy Ritger } 3251739a20eSAndy Ritger } 3261739a20eSAndy Ritger else 3271739a20eSAndy Ritger { 3281739a20eSAndy Ritger if (knvlinkGetUniqueFabricBaseAddress(pGpu, pKernelNvlink) != 3291739a20eSAndy Ritger NVLINK_INVALID_FABRIC_ADDR) 3301739a20eSAndy Ritger { 3311739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 3321739a20eSAndy Ritger "non-NVSwitch GPU %d has a valid fabric address\n", 3331739a20eSAndy Ritger gpuGetInstance(pGpu)); 3341739a20eSAndy Ritger 3351739a20eSAndy Ritger return NV_FALSE; 3361739a20eSAndy Ritger } 3371739a20eSAndy Ritger } 3381739a20eSAndy Ritger 3391739a20eSAndy Ritger return NV_TRUE; 3401739a20eSAndy Ritger } 3411739a20eSAndy Ritger 3421739a20eSAndy Ritger /*! 3431739a20eSAndy Ritger * @brief Get Nvlink P2P connections between 2 GPUs 3441739a20eSAndy Ritger * 3451739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 3461739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 3471739a20eSAndy Ritger * @param[in] pPeerGpu OBJGPU pointer for remote GPU 3481739a20eSAndy Ritger * 3491739a20eSAndy Ritger * @return NV_OK if P2P connections are present 3501739a20eSAndy Ritger */ 3511739a20eSAndy Ritger NV_STATUS 3521739a20eSAndy Ritger knvlinkGetP2pConnectionStatus_IMPL 3531739a20eSAndy Ritger ( 3541739a20eSAndy Ritger OBJGPU *pGpu, 3551739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 3561739a20eSAndy Ritger OBJGPU *pPeerGpu 3571739a20eSAndy Ritger ) 3581739a20eSAndy Ritger { 3591739a20eSAndy Ritger NV_STATUS status = NV_OK; 3601739a20eSAndy Ritger OBJGPU *pGpu0 = pGpu; 3611739a20eSAndy Ritger OBJGPU *pGpu1 = pPeerGpu; 3621739a20eSAndy Ritger KernelNvlink *pKernelNvlink0 = pKernelNvlink; 3631739a20eSAndy Ritger KernelNvlink *pKernelNvlink1 = NULL; 3641739a20eSAndy Ritger NvU32 numPeerLinks = 0; 3651739a20eSAndy Ritger 3661739a20eSAndy Ritger if (pGpu1 == NULL) 3671739a20eSAndy Ritger { 368*b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_INFO, "Invalid pPeerGpu.\n"); 3691739a20eSAndy Ritger 3701739a20eSAndy Ritger return NV_ERR_INVALID_ARGUMENT; 3711739a20eSAndy Ritger } 3721739a20eSAndy Ritger else if ((pGpu0 == pGpu1) && 3731739a20eSAndy Ritger (pGpu0->getProperty(pGpu0, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED))) 3741739a20eSAndy Ritger { 3751739a20eSAndy Ritger // P2P over loopback links are disabled through regkey overrides 3761739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, "loopback P2P on GPU%u disabled by regkey\n", 3771739a20eSAndy Ritger gpuGetInstance(pGpu0)); 3781739a20eSAndy Ritger 3791739a20eSAndy Ritger return NV_ERR_NOT_SUPPORTED; 3801739a20eSAndy Ritger } 3811739a20eSAndy Ritger else 3821739a20eSAndy Ritger { 3831739a20eSAndy Ritger pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1); 3841739a20eSAndy Ritger } 3851739a20eSAndy Ritger 3861739a20eSAndy Ritger if (pKernelNvlink1 == NULL) 3871739a20eSAndy Ritger { 388*b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_INFO, 3891739a20eSAndy Ritger "Input mask contains a GPU on which NVLink is disabled.\n"); 3901739a20eSAndy Ritger 3911739a20eSAndy Ritger return NV_ERR_INVALID_ARGUMENT; 3921739a20eSAndy Ritger } 3931739a20eSAndy Ritger 394758b4ee8SAndy Ritger if(pKernelNvlink0->bIsGpuDegraded) 395758b4ee8SAndy Ritger { 396758b4ee8SAndy Ritger NV_PRINTF(LEVEL_INFO, 397758b4ee8SAndy Ritger "NVLink P2P is NOT supported between GPU%d and GPU%d\n", 398758b4ee8SAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1)); 399758b4ee8SAndy Ritger 400758b4ee8SAndy Ritger return NV_ERR_NOT_SUPPORTED; 401758b4ee8SAndy Ritger } 402758b4ee8SAndy Ritger 403758b4ee8SAndy Ritger if(pKernelNvlink1->bIsGpuDegraded) 404758b4ee8SAndy Ritger { 405758b4ee8SAndy Ritger NV_PRINTF(LEVEL_INFO, 406758b4ee8SAndy Ritger "NVLink P2P is NOT supported between GPU%d and GPU%d\n", 407758b4ee8SAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1)); 408758b4ee8SAndy Ritger 409758b4ee8SAndy Ritger return NV_ERR_NOT_SUPPORTED; 410758b4ee8SAndy Ritger } 411758b4ee8SAndy Ritger 4121739a20eSAndy Ritger if ((IS_RTLSIM(pGpu0) && !pKernelNvlink0->bForceEnableCoreLibRtlsims) || 4131739a20eSAndy Ritger knvlinkIsForcedConfig(pGpu0, pKernelNvlink0)) 4141739a20eSAndy Ritger { 4151739a20eSAndy Ritger // For non-legacy configs. 4161739a20eSAndy Ritger if (pKernelNvlink0->bChiplibConfig) 4171739a20eSAndy Ritger { 4181739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, 4191739a20eSAndy Ritger "NVLink P2P is supported between GPU%d and GPU%d\n", 4201739a20eSAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1)); 4211739a20eSAndy Ritger 4221739a20eSAndy Ritger return NV_OK; 4231739a20eSAndy Ritger } 4241739a20eSAndy Ritger } 4251739a20eSAndy Ritger 4261739a20eSAndy Ritger // Get the remote ends of the links of local GPU from the nvlink core 4274397463eSAndy Ritger status = knvlinkCoreGetRemoteDeviceInfo(pGpu0, pKernelNvlink0); 4284397463eSAndy Ritger if (status != NV_OK) 4294397463eSAndy Ritger { 4304397463eSAndy Ritger return status; 4314397463eSAndy Ritger } 4321739a20eSAndy Ritger 4331739a20eSAndy Ritger // Post topology link enable on links of local GPU 4341739a20eSAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu0, pKernelNvlink0, 4351739a20eSAndy Ritger pKernelNvlink0->enabledLinks); 4361739a20eSAndy Ritger if (status != NV_OK) 4371739a20eSAndy Ritger { 4381739a20eSAndy Ritger return status; 4391739a20eSAndy Ritger } 4401739a20eSAndy Ritger 4411739a20eSAndy Ritger numPeerLinks = knvlinkGetNumLinksToPeer(pGpu0, pKernelNvlink0, pGpu1); 442758b4ee8SAndy Ritger 443758b4ee8SAndy Ritger // 444758b4ee8SAndy Ritger // Maybe knvlinkCoreGetRemoteDeviceInfo was never called on pGpu1. 445758b4ee8SAndy Ritger // This can happen on systems where FM doesn't configure GPUs 446758b4ee8SAndy Ritger // using RM control calls explicitly. 447758b4ee8SAndy Ritger // 448758b4ee8SAndy Ritger if ((numPeerLinks == 0) && gpuFabricProbeIsSupported(pGpu1)) 449758b4ee8SAndy Ritger { 450758b4ee8SAndy Ritger knvlinkCoreGetRemoteDeviceInfo(pGpu1, pKernelNvlink1); 451758b4ee8SAndy Ritger 452758b4ee8SAndy Ritger // Post topology link enable on links of remote GPU 453758b4ee8SAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu1, pKernelNvlink1, 454758b4ee8SAndy Ritger pKernelNvlink1->enabledLinks); 455758b4ee8SAndy Ritger if (status != NV_OK) 456758b4ee8SAndy Ritger { 457758b4ee8SAndy Ritger return status; 458758b4ee8SAndy Ritger } 459758b4ee8SAndy Ritger 460758b4ee8SAndy Ritger numPeerLinks = knvlinkGetNumLinksToPeer(pGpu0, pKernelNvlink0, pGpu1); 461758b4ee8SAndy Ritger } 462758b4ee8SAndy Ritger 4631739a20eSAndy Ritger if (numPeerLinks > 0) 4641739a20eSAndy Ritger { 4651739a20eSAndy Ritger if (knvlinkGetNumLinksToPeer(pGpu1, pKernelNvlink1, pGpu0) != numPeerLinks) 4661739a20eSAndy Ritger { 4671739a20eSAndy Ritger // Get the remote ends of the links of remote GPU from the nvlink core 4684397463eSAndy Ritger status = knvlinkCoreGetRemoteDeviceInfo(pGpu1, pKernelNvlink1); 4694397463eSAndy Ritger if (status != NV_OK) 4704397463eSAndy Ritger { 4714397463eSAndy Ritger return status; 4724397463eSAndy Ritger } 4731739a20eSAndy Ritger 4741739a20eSAndy Ritger // Post topology link enable on links of remote GPU 4751739a20eSAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu1, pKernelNvlink1, 4761739a20eSAndy Ritger pKernelNvlink1->enabledLinks); 4771739a20eSAndy Ritger if (status != NV_OK) 4781739a20eSAndy Ritger { 4791739a20eSAndy Ritger return status; 4801739a20eSAndy Ritger } 4811739a20eSAndy Ritger } 4821739a20eSAndy Ritger 4831739a20eSAndy Ritger // Peers should have the same number of links pointing back at us 484*b5bf85a8SAndy Ritger NV_CHECK_OR_RETURN(LEVEL_INFO, 485*b5bf85a8SAndy Ritger (knvlinkGetNumLinksToPeer(pGpu1, pKernelNvlink1, pGpu0) == numPeerLinks), 4861739a20eSAndy Ritger NV_ERR_INVALID_STATE); 4871739a20eSAndy Ritger 488*b5bf85a8SAndy Ritger NV_CHECK_OR_RETURN(LEVEL_INFO, 489*b5bf85a8SAndy Ritger knvlinkCheckNvswitchP2pConfig(pGpu0, pKernelNvlink0, pGpu1), 490*b5bf85a8SAndy Ritger NV_ERR_INVALID_STATE); 491*b5bf85a8SAndy Ritger 492*b5bf85a8SAndy Ritger NV_CHECK_OR_RETURN(LEVEL_INFO, 493*b5bf85a8SAndy Ritger knvlinkCheckNvswitchP2pConfig(pGpu1, pKernelNvlink1, pGpu0), 4941739a20eSAndy Ritger NV_ERR_INVALID_STATE); 4951739a20eSAndy Ritger 4961739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, 4971739a20eSAndy Ritger "NVLink P2P is supported between GPU%d and GPU%d\n", 4981739a20eSAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1)); 4991739a20eSAndy Ritger 5001739a20eSAndy Ritger return NV_OK; 5011739a20eSAndy Ritger } 5021739a20eSAndy Ritger 5031739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, 5041739a20eSAndy Ritger "NVLink P2P is NOT supported between between GPU%d and GPU%d\n", 5051739a20eSAndy Ritger pGpu->gpuInstance, pGpu1->gpuInstance); 5061739a20eSAndy Ritger 5071739a20eSAndy Ritger return NV_ERR_NOT_SUPPORTED; 5081739a20eSAndy Ritger } 5091739a20eSAndy Ritger 5101739a20eSAndy Ritger /*! 5111739a20eSAndy Ritger * @brief Update the settings for the current established NVLink 5121739a20eSAndy Ritger * topology. This is the top level function that should be 5131739a20eSAndy Ritger * called, instead of applying the settings individually, 5141739a20eSAndy Ritger * since it grabs the required locks 5151739a20eSAndy Ritger * 5161739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 5171739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 5181739a20eSAndy Ritger * 5191739a20eSAndy Ritger * @return NV_OK on success 5201739a20eSAndy Ritger */ 5211739a20eSAndy Ritger NV_STATUS 5221739a20eSAndy Ritger knvlinkUpdateCurrentConfig_IMPL 5231739a20eSAndy Ritger ( 5241739a20eSAndy Ritger OBJGPU *pGpu, 5251739a20eSAndy Ritger KernelNvlink *pKernelNvlink 5261739a20eSAndy Ritger ) 5271739a20eSAndy Ritger { 5281739a20eSAndy Ritger OBJSYS *pSys = SYS_GET_INSTANCE(); 5291739a20eSAndy Ritger KernelCE *pKCe = NULL; 5301739a20eSAndy Ritger NvBool bOwnsLock = NV_FALSE; 5311739a20eSAndy Ritger NV_STATUS status = NV_OK; 5321739a20eSAndy Ritger 5331739a20eSAndy Ritger if (osAcquireRmSema(pSys->pSema) == NV_OK) 5341739a20eSAndy Ritger { 5351739a20eSAndy Ritger // 5361739a20eSAndy Ritger // XXX Bug 1795328: Fix P2P path to acquire locks for the GPU 5371739a20eSAndy Ritger // Due to platform differences in the P2P path, the GPU lock is not 5381739a20eSAndy Ritger // consistently held at this point in the call stack. This function 5391739a20eSAndy Ritger // requires exclusive access to RM/PMU data structures to update HSHUB, 5401739a20eSAndy Ritger // and therefore requires the GPU lock to be held at this point. 5411739a20eSAndy Ritger // This check should be removed once the P2P paths have been updated to 5421739a20eSAndy Ritger // acquire the GPU locks consistently for all platforms. 5431739a20eSAndy Ritger // 5441739a20eSAndy Ritger if (!rmDeviceGpuLockIsOwner(pGpu->gpuInstance)) 5451739a20eSAndy Ritger { 5461739a20eSAndy Ritger status = rmDeviceGpuLocksAcquire(pGpu, GPUS_LOCK_FLAGS_NONE, 5471739a20eSAndy Ritger RM_LOCK_MODULES_NVLINK); 5481739a20eSAndy Ritger if (status != NV_OK) 5491739a20eSAndy Ritger { 5501739a20eSAndy Ritger NV_ASSERT(0); 5511739a20eSAndy Ritger goto fail; 5521739a20eSAndy Ritger } 5531739a20eSAndy Ritger 5541739a20eSAndy Ritger bOwnsLock = NV_TRUE; 5551739a20eSAndy Ritger } 5561739a20eSAndy Ritger 5571739a20eSAndy Ritger // 5581739a20eSAndy Ritger // Links that have remote end detected should have passed RXDET 5591739a20eSAndy Ritger // Update the mask of connected links and bridged links 5601739a20eSAndy Ritger // 5611739a20eSAndy Ritger knvlinkFilterBridgeLinks_HAL(pGpu, pKernelNvlink); 5621739a20eSAndy Ritger 5631739a20eSAndy Ritger NV2080_CTRL_NVLINK_UPDATE_CURRENT_CONFIG_PARAMS params; 5641739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 5651739a20eSAndy Ritger 5661739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init 5671739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu)) 5681739a20eSAndy Ritger { 5691739a20eSAndy Ritger threadStateResetTimeout(pGpu); 5701739a20eSAndy Ritger } 5711739a20eSAndy Ritger 5721739a20eSAndy Ritger // 5731739a20eSAndy Ritger // RPC into GSP-RM for programming the HSHUB, CONNECTION_CFG and LTCS 5741739a20eSAndy Ritger // registers. 5751739a20eSAndy Ritger // 5761739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 5771739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_UPDATE_CURRENT_CONFIG, 5781739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 5791739a20eSAndy Ritger if (status != NV_OK) 5801739a20eSAndy Ritger { 5811739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Updating current NVLink config failed\n"); 5821739a20eSAndy Ritger goto fail; 5831739a20eSAndy Ritger } 5841739a20eSAndy Ritger 5851739a20eSAndy Ritger // Sync the GPU property for NVLINK over SYSMEM with GSP-RM 5861739a20eSAndy Ritger pGpu->setProperty(pGpu, PDB_PROP_GPU_NVLINK_SYSMEM, params.bNvlinkSysmemEnabled); 5871739a20eSAndy Ritger 5881739a20eSAndy Ritger // Update the PCE-LCE mappings 589758b4ee8SAndy Ritger status = kceFindFirstInstance(pGpu, &pKCe); 590758b4ee8SAndy Ritger if (status == NV_OK) 5911739a20eSAndy Ritger { 5921739a20eSAndy Ritger status = kceTopLevelPceLceMappingsUpdate(pGpu, pKCe); 5931739a20eSAndy Ritger if (status != NV_OK) 5941739a20eSAndy Ritger { 5951739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to update PCE-LCE mappings\n"); 5961739a20eSAndy Ritger } 5971739a20eSAndy Ritger } 5981739a20eSAndy Ritger 5991739a20eSAndy Ritger fail: 6001739a20eSAndy Ritger if (bOwnsLock) 6011739a20eSAndy Ritger { 6021739a20eSAndy Ritger rmDeviceGpuLocksRelease(pGpu, GPUS_LOCK_FLAGS_NONE, NULL); 6031739a20eSAndy Ritger } 6041739a20eSAndy Ritger 6051739a20eSAndy Ritger osReleaseRmSema(pSys->pSema, NULL); 6061739a20eSAndy Ritger } 6071739a20eSAndy Ritger 6081739a20eSAndy Ritger return status; 6091739a20eSAndy Ritger } 6101739a20eSAndy Ritger 6111739a20eSAndy Ritger /*! 612758b4ee8SAndy Ritger * @brief Clients to register their callback functions for inband data 613758b4ee8SAndy Ritger * 614758b4ee8SAndy Ritger * @param[in] pGpu OBJGPU pointer 615758b4ee8SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 616758b4ee8SAndy Ritger * @param[in] params callback functions 617758b4ee8SAndy Ritger */ 618758b4ee8SAndy Ritger NV_STATUS 619758b4ee8SAndy Ritger knvlinkRegisterInbandCallback_IMPL 620758b4ee8SAndy Ritger ( 621758b4ee8SAndy Ritger OBJGPU *pGpu, 622758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink, 623758b4ee8SAndy Ritger NVLINK_INBAND_MSG_CALLBACK *params 624758b4ee8SAndy Ritger ) 625758b4ee8SAndy Ritger { 626758b4ee8SAndy Ritger if (params->messageType >= NVLINK_INBAND_MSG_TYPE_MAX) 627758b4ee8SAndy Ritger { 628758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Wrong msgType. Not registering\n"); 629758b4ee8SAndy Ritger return NV_ERR_INVALID_PARAMETER; 630758b4ee8SAndy Ritger } 631758b4ee8SAndy Ritger 632758b4ee8SAndy Ritger if (pKernelNvlink->inbandCallback[params->messageType].pCallback != NULL) 633758b4ee8SAndy Ritger { 634758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Callback has been already registered" 635758b4ee8SAndy Ritger "for msgType %d\n", params->messageType); 636758b4ee8SAndy Ritger return NV_ERR_IN_USE; 637758b4ee8SAndy Ritger } 638758b4ee8SAndy Ritger 639758b4ee8SAndy Ritger pKernelNvlink->inbandCallback[params->messageType].pCallback = params->pCallback; 640758b4ee8SAndy Ritger pKernelNvlink->inbandCallback[params->messageType].wqItemFlags = params->wqItemFlags; 641758b4ee8SAndy Ritger 642758b4ee8SAndy Ritger return NV_OK; 643758b4ee8SAndy Ritger } 644758b4ee8SAndy Ritger 645758b4ee8SAndy Ritger /*! 646758b4ee8SAndy Ritger * @brief Clients to unregister their callback functions for inband data 647758b4ee8SAndy Ritger * 648758b4ee8SAndy Ritger * @param[in] pGpu OBJGPU pointer 649758b4ee8SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 650758b4ee8SAndy Ritger * @param[in] msgType Inband Message type 651758b4ee8SAndy Ritger */ 652758b4ee8SAndy Ritger NV_STATUS 653758b4ee8SAndy Ritger knvlinkUnregisterInbandCallback_IMPL 654758b4ee8SAndy Ritger ( 655758b4ee8SAndy Ritger OBJGPU *pGpu, 656758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink, 657758b4ee8SAndy Ritger NvU16 msgType 658758b4ee8SAndy Ritger ) 659758b4ee8SAndy Ritger { 660758b4ee8SAndy Ritger if (msgType >= NVLINK_INBAND_MSG_TYPE_MAX) 661758b4ee8SAndy Ritger { 662758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Wrong msgType. Not unregistering\n"); 663758b4ee8SAndy Ritger return NV_ERR_INVALID_PARAMETER; 664758b4ee8SAndy Ritger } 665758b4ee8SAndy Ritger 666758b4ee8SAndy Ritger pKernelNvlink->inbandCallback[msgType].pCallback = NULL; 667758b4ee8SAndy Ritger pKernelNvlink->inbandCallback[msgType].wqItemFlags = 0; 668758b4ee8SAndy Ritger 669758b4ee8SAndy Ritger return NV_OK; 670758b4ee8SAndy Ritger } 671758b4ee8SAndy Ritger 672758b4ee8SAndy Ritger void 673758b4ee8SAndy Ritger knvlinkInbandMsgCallbackDispatcher_WORKITEM 674758b4ee8SAndy Ritger ( 675758b4ee8SAndy Ritger NvU32 gpuInstance, 676758b4ee8SAndy Ritger void *pData 677758b4ee8SAndy Ritger ) 678758b4ee8SAndy Ritger { 679758b4ee8SAndy Ritger OBJGPU *pGpu = NULL; 680758b4ee8SAndy Ritger nvlink_inband_msg_header_t *pHeader; 681758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink; 682758b4ee8SAndy Ritger NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pMessage = pData; 683758b4ee8SAndy Ritger NvU8 *pRsvd = NULL; 684758b4ee8SAndy Ritger 685758b4ee8SAndy Ritger pGpu = gpumgrGetGpu(gpuInstance); 686758b4ee8SAndy Ritger if (pGpu == NULL) 687758b4ee8SAndy Ritger { 688758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Invalid GPU\n"); 689758b4ee8SAndy Ritger return; 690758b4ee8SAndy Ritger } 691758b4ee8SAndy Ritger 692758b4ee8SAndy Ritger pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu); 693758b4ee8SAndy Ritger if (pKernelNvlink == NULL) 694758b4ee8SAndy Ritger { 695758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Invalid NVLink state\n"); 696758b4ee8SAndy Ritger return; 697758b4ee8SAndy Ritger } 698758b4ee8SAndy Ritger 699758b4ee8SAndy Ritger pHeader = (nvlink_inband_msg_header_t *)pMessage->data; 700758b4ee8SAndy Ritger 701eb5c7665SAndy Ritger if (pKernelNvlink->inbandCallback[pHeader->type].pCallback == NULL) 702eb5c7665SAndy Ritger { 703eb5c7665SAndy Ritger NV_PRINTF(LEVEL_ERROR, 704eb5c7665SAndy Ritger "No Callback Registered for type %d. Dropping the msg\n", 705eb5c7665SAndy Ritger pHeader->type); 706eb5c7665SAndy Ritger return; 707eb5c7665SAndy Ritger } 708eb5c7665SAndy Ritger 709758b4ee8SAndy Ritger // Assert reserved in msgHdr are zero 710758b4ee8SAndy Ritger pRsvd = &pHeader->reserved[0]; 711758b4ee8SAndy Ritger NV_ASSERT((pRsvd[0] == 0) && portMemCmp(pRsvd, pRsvd + 1, 712758b4ee8SAndy Ritger sizeof(pHeader->reserved) - 1) == 0); 713758b4ee8SAndy Ritger 714eb5c7665SAndy Ritger (void)pKernelNvlink->inbandCallback[pHeader->type].pCallback(gpuInstance, pData); 715758b4ee8SAndy Ritger } 716758b4ee8SAndy Ritger 717758b4ee8SAndy Ritger NV_STATUS 718758b4ee8SAndy Ritger knvlinkInbandMsgCallbackDispatcher_IMPL 719758b4ee8SAndy Ritger ( 720758b4ee8SAndy Ritger OBJGPU *pGpu, 721758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink, 722758b4ee8SAndy Ritger NvU32 dataSize, 723758b4ee8SAndy Ritger NvU8 *pMessage 724758b4ee8SAndy Ritger ) 725758b4ee8SAndy Ritger { 726758b4ee8SAndy Ritger NV_STATUS status; 727758b4ee8SAndy Ritger nvlink_inband_msg_header_t *pHeader; 728758b4ee8SAndy Ritger NVLINK_INBAND_MSG_CALLBACK *pParams; 729758b4ee8SAndy Ritger NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pData = NULL; 730758b4ee8SAndy Ritger 731758b4ee8SAndy Ritger pHeader = (nvlink_inband_msg_header_t *)pMessage; 732758b4ee8SAndy Ritger 733758b4ee8SAndy Ritger if (pHeader->type >= NVLINK_INBAND_MSG_TYPE_MAX) 734758b4ee8SAndy Ritger { 735758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Message type received is Out of Bounds. Dropping the msg\n"); 736758b4ee8SAndy Ritger return NV_ERR_INVALID_REQUEST; 737758b4ee8SAndy Ritger } 738758b4ee8SAndy Ritger 739758b4ee8SAndy Ritger pParams = &pKernelNvlink->inbandCallback[pHeader->type]; 740758b4ee8SAndy Ritger if (pParams->pCallback == NULL) 741758b4ee8SAndy Ritger { 742758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Callback not registered for the message type %d\n", pHeader->type); 743758b4ee8SAndy Ritger return NV_ERR_INVALID_REQUEST; 744758b4ee8SAndy Ritger } 745758b4ee8SAndy Ritger 746758b4ee8SAndy Ritger pData = portMemAllocNonPaged(sizeof(NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS)); 747758b4ee8SAndy Ritger if (pData == NULL) 748758b4ee8SAndy Ritger { 749758b4ee8SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Out of memory, Dropping message\n"); 750758b4ee8SAndy Ritger return NV_ERR_NO_MEMORY; 751758b4ee8SAndy Ritger } 752758b4ee8SAndy Ritger 753758b4ee8SAndy Ritger pData->dataSize = dataSize; 754758b4ee8SAndy Ritger portMemCopy(pData->data, pData->dataSize, pMessage, dataSize); 755758b4ee8SAndy Ritger 756*b5bf85a8SAndy Ritger status = osQueueWorkItemWithFlags(pGpu, knvlinkInbandMsgCallbackDispatcher_WORKITEM, pData, 757758b4ee8SAndy Ritger pParams->wqItemFlags); 758758b4ee8SAndy Ritger if (status != NV_OK) 759758b4ee8SAndy Ritger { 760758b4ee8SAndy Ritger portMemFree(pData); 761758b4ee8SAndy Ritger return status; 762758b4ee8SAndy Ritger } 763758b4ee8SAndy Ritger 764758b4ee8SAndy Ritger return NV_OK; 765758b4ee8SAndy Ritger } 766758b4ee8SAndy Ritger 767758b4ee8SAndy Ritger NV_STATUS 768758b4ee8SAndy Ritger knvlinkSendInbandData_IMPL 769758b4ee8SAndy Ritger ( 770758b4ee8SAndy Ritger OBJGPU *pGpu, 771758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink, 772758b4ee8SAndy Ritger NV2080_CTRL_NVLINK_INBAND_SEND_DATA_PARAMS *pParams 773758b4ee8SAndy Ritger ) 774758b4ee8SAndy Ritger { 775758b4ee8SAndy Ritger NV_STATUS status; 776758b4ee8SAndy Ritger 777758b4ee8SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 778758b4ee8SAndy Ritger NV2080_CTRL_CMD_NVLINK_INBAND_SEND_DATA, 779758b4ee8SAndy Ritger (void *)pParams, 780758b4ee8SAndy Ritger sizeof(*pParams)); 781758b4ee8SAndy Ritger 782758b4ee8SAndy Ritger return status; 783758b4ee8SAndy Ritger } 784758b4ee8SAndy Ritger /*! 7851739a20eSAndy Ritger * @brief Return the mask of links enabled on the system 7861739a20eSAndy Ritger * 7871739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 7881739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 7891739a20eSAndy Ritger */ 7901739a20eSAndy Ritger NvU32 7911739a20eSAndy Ritger knvlinkGetEnabledLinkMask_IMPL 7921739a20eSAndy Ritger ( 7931739a20eSAndy Ritger OBJGPU *pGpu, 7941739a20eSAndy Ritger KernelNvlink *pKernelNvlink 7951739a20eSAndy Ritger ) 7961739a20eSAndy Ritger { 7971739a20eSAndy Ritger return pKernelNvlink->enabledLinks; 7981739a20eSAndy Ritger } 7991739a20eSAndy Ritger 8001739a20eSAndy Ritger /*! 8011739a20eSAndy Ritger * @brief Return the mask of links discovered on the system 8021739a20eSAndy Ritger * 8031739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 8041739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 8051739a20eSAndy Ritger */ 8061739a20eSAndy Ritger NvU32 8071739a20eSAndy Ritger knvlinkGetDiscoveredLinkMask_IMPL 8081739a20eSAndy Ritger ( 8091739a20eSAndy Ritger OBJGPU *pGpu, 8101739a20eSAndy Ritger KernelNvlink *pKernelNvlink 8111739a20eSAndy Ritger ) 8121739a20eSAndy Ritger { 8131739a20eSAndy Ritger return pKernelNvlink->discoveredLinks; 8141739a20eSAndy Ritger } 8151739a20eSAndy Ritger 8161739a20eSAndy Ritger /*! 8171739a20eSAndy Ritger * @brief Returns the number of sysmem links 8181739a20eSAndy Ritger * 8191739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 8201739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 8211739a20eSAndy Ritger * 8221739a20eSAndy Ritger * @return The #sysmem NVLinks 8231739a20eSAndy Ritger */ 8241739a20eSAndy Ritger NvU32 8251739a20eSAndy Ritger knvlinkGetNumLinksToSystem_IMPL 8261739a20eSAndy Ritger ( 8271739a20eSAndy Ritger OBJGPU *pGpu, 8281739a20eSAndy Ritger KernelNvlink *pKernelNvlink 8291739a20eSAndy Ritger ) 8301739a20eSAndy Ritger { 8311739a20eSAndy Ritger NvU32 numSysmemLinks = pKernelNvlink->sysmemLinkMask; 8321739a20eSAndy Ritger 8331739a20eSAndy Ritger if (numSysmemLinks != 0) 8341739a20eSAndy Ritger { 8351739a20eSAndy Ritger NUMSETBITS_32(numSysmemLinks); 8361739a20eSAndy Ritger } 8371739a20eSAndy Ritger 8381739a20eSAndy Ritger return numSysmemLinks; 8391739a20eSAndy Ritger } 8401739a20eSAndy Ritger 8411739a20eSAndy Ritger /*! 8421739a20eSAndy Ritger * @brief Returns number of peer links to a remote GPU 8431739a20eSAndy Ritger * 8441739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer of local GPU 8451739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 8461739a20eSAndy Ritger * @param[in] pRemoteGpu OBJGPU pointer of remote GPU 8471739a20eSAndy Ritger * 8481739a20eSAndy Ritger * @return The #peer NVLinks to the remote GPU 8491739a20eSAndy Ritger */ 8501739a20eSAndy Ritger NvU32 8511739a20eSAndy Ritger knvlinkGetNumLinksToPeer_IMPL 8521739a20eSAndy Ritger ( 8531739a20eSAndy Ritger OBJGPU *pGpu, 8541739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 8551739a20eSAndy Ritger OBJGPU *pRemoteGpu 8561739a20eSAndy Ritger ) 8571739a20eSAndy Ritger { 8581739a20eSAndy Ritger NvU32 numPeerLinks = 8591739a20eSAndy Ritger knvlinkGetLinkMaskToPeer(pGpu, pKernelNvlink, pRemoteGpu); 8601739a20eSAndy Ritger 8611739a20eSAndy Ritger if (numPeerLinks != 0) 8621739a20eSAndy Ritger { 8631739a20eSAndy Ritger NUMSETBITS_32(numPeerLinks); 8641739a20eSAndy Ritger } 8651739a20eSAndy Ritger 8661739a20eSAndy Ritger return numPeerLinks; 8671739a20eSAndy Ritger } 8681739a20eSAndy Ritger 8691739a20eSAndy Ritger /*! 8701739a20eSAndy Ritger * @brief Gets the mask of peer links between the GPUs 8711739a20eSAndy Ritger * 8721739a20eSAndy Ritger * @param[in] pGpu0 OBJGPU pointer 8731739a20eSAndy Ritger * @param[in] pKernelNvlink0 Nvlink pointer 8741739a20eSAndy Ritger * @param[in] pGpu1 Remote OBJGPU pointer 8751739a20eSAndy Ritger * 8761739a20eSAndy Ritger * @return Returns the mask of peer links between the GPUs 8771739a20eSAndy Ritger */ 8781739a20eSAndy Ritger NvU32 8791739a20eSAndy Ritger knvlinkGetLinkMaskToPeer_IMPL 8801739a20eSAndy Ritger ( 8811739a20eSAndy Ritger OBJGPU *pGpu0, 8821739a20eSAndy Ritger KernelNvlink *pKernelNvlink0, 8831739a20eSAndy Ritger OBJGPU *pGpu1 8841739a20eSAndy Ritger ) 8851739a20eSAndy Ritger { 8861739a20eSAndy Ritger NvU32 peerLinkMask = 0; 887758b4ee8SAndy Ritger KernelNvlink *pKernelNvlink1 = NULL; 888758b4ee8SAndy Ritger 889758b4ee8SAndy Ritger pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1); 890758b4ee8SAndy Ritger 891758b4ee8SAndy Ritger if (pKernelNvlink1 == NULL) 892758b4ee8SAndy Ritger { 893*b5bf85a8SAndy Ritger NV_PRINTF(LEVEL_INFO, 894758b4ee8SAndy Ritger "on GPU%d NVLink is disabled.\n", gpuGetInstance(pGpu1)); 895758b4ee8SAndy Ritger 896758b4ee8SAndy Ritger return 0; 897758b4ee8SAndy Ritger } 898758b4ee8SAndy Ritger 899758b4ee8SAndy Ritger if(pKernelNvlink0->bIsGpuDegraded) 900758b4ee8SAndy Ritger { 901758b4ee8SAndy Ritger return peerLinkMask; 902758b4ee8SAndy Ritger } 903758b4ee8SAndy Ritger 904758b4ee8SAndy Ritger if(pKernelNvlink1->bIsGpuDegraded) 905758b4ee8SAndy Ritger { 906758b4ee8SAndy Ritger return peerLinkMask; 907758b4ee8SAndy Ritger } 9081739a20eSAndy Ritger 9091739a20eSAndy Ritger if (!knvlinkIsForcedConfig(pGpu0, pKernelNvlink0)) 9101739a20eSAndy Ritger { 9111739a20eSAndy Ritger // 9121739a20eSAndy Ritger // If nvlink topology is not forced, then the hshub registers 9131739a20eSAndy Ritger // are updated only when a P2P object is allocated. So, return 9141739a20eSAndy Ritger // the cached value of mask of links connected to a GPU 9151739a20eSAndy Ritger // 9161739a20eSAndy Ritger peerLinkMask = pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)]; 9171739a20eSAndy Ritger } 9181739a20eSAndy Ritger 9191739a20eSAndy Ritger return peerLinkMask; 9201739a20eSAndy Ritger } 9211739a20eSAndy Ritger 9221739a20eSAndy Ritger /*! 9231739a20eSAndy Ritger * @brief Sets the mask of peer links between the GPUs 9241739a20eSAndy Ritger * 9251739a20eSAndy Ritger * @param[in] pGpu0 OBJGPU pointer 9261739a20eSAndy Ritger * @param[in] pKernelNvlink0 Nvlink pointer 9271739a20eSAndy Ritger * @param[in] pGpu1 Remote OBJGPU pointer 9281739a20eSAndy Ritger * @param[in] peerLinkMask Mask of links to the peer GPU 9291739a20eSAndy Ritger * 9301739a20eSAndy Ritger * @return NV_OK on success 9311739a20eSAndy Ritger */ 9321739a20eSAndy Ritger NV_STATUS 9331739a20eSAndy Ritger knvlinkSetLinkMaskToPeer_IMPL 9341739a20eSAndy Ritger ( 9351739a20eSAndy Ritger OBJGPU *pGpu0, 9361739a20eSAndy Ritger KernelNvlink *pKernelNvlink0, 9371739a20eSAndy Ritger OBJGPU *pGpu1, 9381739a20eSAndy Ritger NvU32 peerLinkMask 9391739a20eSAndy Ritger ) 9401739a20eSAndy Ritger { 9411739a20eSAndy Ritger NV_STATUS status = NV_OK; 9421739a20eSAndy Ritger 9431739a20eSAndy Ritger // Return early if no update needed to the peer link mask 9441739a20eSAndy Ritger if (pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)] == peerLinkMask) 9451739a20eSAndy Ritger return NV_OK; 9461739a20eSAndy Ritger 9471739a20eSAndy Ritger pKernelNvlink0->peerLinkMasks[gpuGetInstance(pGpu1)] = peerLinkMask; 9481739a20eSAndy Ritger 9491739a20eSAndy Ritger NV2080_CTRL_NVLINK_UPDATE_PEER_LINK_MASK_PARAMS params; 9501739a20eSAndy Ritger 9511739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 9521739a20eSAndy Ritger params.gpuInst = gpuGetInstance(pGpu1); 9531739a20eSAndy Ritger params.peerLinkMask = peerLinkMask; 9541739a20eSAndy Ritger 9551739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init 9561739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu0)) 9571739a20eSAndy Ritger { 9581739a20eSAndy Ritger threadStateResetTimeout(pGpu0); 9591739a20eSAndy Ritger } 9601739a20eSAndy Ritger 9611739a20eSAndy Ritger // Sync the peerLinkMask with GSP-RM 9621739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu0, pKernelNvlink0, 9631739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_UPDATE_PEER_LINK_MASK, 9641739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 9651739a20eSAndy Ritger if (status != NV_OK) 9661739a20eSAndy Ritger { 9671739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 9681739a20eSAndy Ritger "Failed to sync peerLinksMask from GPU%d to GPU%d\n", 9691739a20eSAndy Ritger gpuGetInstance(pGpu0), gpuGetInstance(pGpu1)); 9701739a20eSAndy Ritger return status; 9711739a20eSAndy Ritger } 9721739a20eSAndy Ritger 9731739a20eSAndy Ritger return NV_OK; 9741739a20eSAndy Ritger } 9751739a20eSAndy Ritger 9761739a20eSAndy Ritger /*! 9771739a20eSAndy Ritger * @brief Get the mask of links that are peer links 9781739a20eSAndy Ritger * 9791739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 9801739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 9811739a20eSAndy Ritger */ 9821739a20eSAndy Ritger NvU32 9831739a20eSAndy Ritger knvlinkGetPeersNvlinkMaskFromHshub_IMPL 9841739a20eSAndy Ritger ( 9851739a20eSAndy Ritger OBJGPU *pGpu, 9861739a20eSAndy Ritger KernelNvlink *pKernelNvlink 9871739a20eSAndy Ritger ) 9881739a20eSAndy Ritger { 9891739a20eSAndy Ritger NV_STATUS status = NV_OK; 9901739a20eSAndy Ritger NvU32 peerLinkMask = 0; 9911739a20eSAndy Ritger NvU32 i; 9921739a20eSAndy Ritger 9931739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_LINK_AND_CLOCK_INFO_PARAMS params; 9941739a20eSAndy Ritger 9951739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 9961739a20eSAndy Ritger params.linkMask = pKernelNvlink->enabledLinks; 9971739a20eSAndy Ritger 9981739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 9991739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_LINK_AND_CLOCK_INFO, 10001739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 10011739a20eSAndy Ritger if (status != NV_OK) 10021739a20eSAndy Ritger return 0; 10031739a20eSAndy Ritger 10041739a20eSAndy Ritger // Scan enabled links for peer connections 10051739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks) 10061739a20eSAndy Ritger { 10071739a20eSAndy Ritger if (params.linkInfo[i].bLinkConnectedToPeer) 10081739a20eSAndy Ritger peerLinkMask |= NVBIT(i); 10091739a20eSAndy Ritger } 10101739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END; 10111739a20eSAndy Ritger 10121739a20eSAndy Ritger return peerLinkMask; 10131739a20eSAndy Ritger } 10141739a20eSAndy Ritger 10151739a20eSAndy Ritger /*! 10161739a20eSAndy Ritger * @brief Prepare a GPU's NVLink engine for reset by removing mappings 10171739a20eSAndy Ritger * to it from other GPUs. 10181739a20eSAndy Ritger * 10191739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 10201739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 10211739a20eSAndy Ritger * 10221739a20eSAndy Ritger * return NV_OK on success 10231739a20eSAndy Ritger */ 10241739a20eSAndy Ritger NV_STATUS 10251739a20eSAndy Ritger knvlinkPrepareForXVEReset_IMPL 10261739a20eSAndy Ritger ( 10271739a20eSAndy Ritger OBJGPU *pGpu, 102890eb1077SAndy Ritger KernelNvlink *pKernelNvlink, 102990eb1077SAndy Ritger NvBool bForceShutdown 10301739a20eSAndy Ritger ) 10311739a20eSAndy Ritger { 10321739a20eSAndy Ritger OBJSYS *pSys = SYS_GET_INSTANCE(); 10331739a20eSAndy Ritger NV_STATUS retStatus = NV_OK; 10341739a20eSAndy Ritger OBJGPU *pRemoteGpu; 10351739a20eSAndy Ritger NV_STATUS status; 10361739a20eSAndy Ritger NvU32 gpuInstance; 10371739a20eSAndy Ritger NvU32 gpuMask; 10381739a20eSAndy Ritger 10391739a20eSAndy Ritger // This is not supported on forced configs 10401739a20eSAndy Ritger if (knvlinkIsForcedConfig(pGpu, pKernelNvlink)) 10411739a20eSAndy Ritger { 10421739a20eSAndy Ritger return NV_OK; 10431739a20eSAndy Ritger } 10441739a20eSAndy Ritger 10451739a20eSAndy Ritger // 10461739a20eSAndy Ritger // Let fabric manager handle link shutdown/reset if the fabric is managed 10471739a20eSAndy Ritger // externally. 10481739a20eSAndy Ritger // 10495f40a5aeSAndy Ritger if (pKernelNvlink->ipVerNvlink < NVLINK_VERSION_40 && 10505f40a5aeSAndy Ritger pSys->getProperty(pSys, PDB_PROP_SYS_FABRIC_IS_EXTERNALLY_MANAGED)) 10511739a20eSAndy Ritger { 10521739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, 10531739a20eSAndy Ritger "NVLink fabric is externally managed, skipping\n"); 10541739a20eSAndy Ritger return NV_OK; 10551739a20eSAndy Ritger } 10561739a20eSAndy Ritger 10571739a20eSAndy Ritger status = gpumgrGetGpuAttachInfo(NULL, &gpuMask); 10581739a20eSAndy Ritger NV_ASSERT_OR_RETURN(status == NV_OK, status); 10591739a20eSAndy Ritger 10601739a20eSAndy Ritger gpuInstance = 0; 10611739a20eSAndy Ritger while ((pRemoteGpu = gpumgrGetNextGpu(gpuMask, &gpuInstance)) != NULL) 10621739a20eSAndy Ritger { 10631739a20eSAndy Ritger KernelNvlink *pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu); 10641739a20eSAndy Ritger 10651739a20eSAndy Ritger if ((pRemoteGpu == pGpu) || (pRemoteKernelNvlink == NULL) || 10661739a20eSAndy Ritger (knvlinkGetNumLinksToPeer(pRemoteGpu, pRemoteKernelNvlink, pGpu) == 0) || 106790eb1077SAndy Ritger API_GPU_IN_RESET_SANITY_CHECK(pRemoteGpu) || 106890eb1077SAndy Ritger pRemoteGpu->getProperty(pRemoteGpu, PDB_PROP_GPU_IS_LOST)) 10691739a20eSAndy Ritger { 10701739a20eSAndy Ritger continue; 10711739a20eSAndy Ritger } 10721739a20eSAndy Ritger 10731739a20eSAndy Ritger // 10741739a20eSAndy Ritger // Reset the peer masks in HSHUB of the remote GPU. Partial resets 10751739a20eSAndy Ritger // (only removing the links connected to the GPU being reset) don't 10761739a20eSAndy Ritger // appear to be sufficient. The reset will work fine, but the next 10771739a20eSAndy Ritger // time we attempt to initialize this GPU, the copy engines will time 10781739a20eSAndy Ritger // out while scrubbing FB and a GPU sysmembar (NV_UFLUSH_FB_FLUSH) will 10791739a20eSAndy Ritger // fail to complete. 10801739a20eSAndy Ritger // 10811739a20eSAndy Ritger // The above symptoms haven't been root-caused (yet), but the current 10821739a20eSAndy Ritger // POR for GPU reset is that once one GPU is reset, the others 10831739a20eSAndy Ritger // connected to it over NVLink must also be reset before using NVLink 10841739a20eSAndy Ritger // for peer traffic, so just use the big hammer and squash all HSHUB 10851739a20eSAndy Ritger // configs on GPU reset. 10861739a20eSAndy Ritger // 10871739a20eSAndy Ritger // This allows us to reset the GPUs one by one, with GPU 10881739a20eSAndy Ritger // initializations in between, without hanging up the GPU trying to 10891739a20eSAndy Ritger // flush data over links that aren't available anymore. 10901739a20eSAndy Ritger // 109112c07393SBernhard Stoeckner // Starting from Ampere single GPU reset is supported and hence remove 109212c07393SBernhard Stoeckner // only the nvlink's of the remote GPU's which are connected to the 109312c07393SBernhard Stoeckner // current GPU. 109412c07393SBernhard Stoeckner // 109512c07393SBernhard Stoeckner 109612c07393SBernhard Stoeckner if (IsAMPEREorBetter(pGpu)) 109712c07393SBernhard Stoeckner { 109812c07393SBernhard Stoeckner NvU32 remPeerId = kbusGetPeerId_HAL(pRemoteGpu, GPU_GET_KERNEL_BUS(pRemoteGpu), pGpu); 109912c07393SBernhard Stoeckner if (remPeerId != BUS_INVALID_PEER) 110012c07393SBernhard Stoeckner status = knvlinkRemoveMapping_HAL(pRemoteGpu, pRemoteKernelNvlink, NV_FALSE, 110112c07393SBernhard Stoeckner NVBIT(remPeerId), 110212c07393SBernhard Stoeckner NV_FALSE /* bL2Entry */); 110312c07393SBernhard Stoeckner } 110412c07393SBernhard Stoeckner else 110512c07393SBernhard Stoeckner { 11061739a20eSAndy Ritger status = knvlinkRemoveMapping_HAL(pRemoteGpu, pRemoteKernelNvlink, NV_FALSE, 11071739a20eSAndy Ritger ((1 << NVLINK_MAX_PEERS_SW) - 1), 11081739a20eSAndy Ritger NV_FALSE /* bL2Entry */); 110912c07393SBernhard Stoeckner } 11101739a20eSAndy Ritger if (status != NV_OK) 11111739a20eSAndy Ritger { 11121739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 11131739a20eSAndy Ritger "failed to reset HSHUB on GPU%u while preparing for GPU%u XVE reset (0x%x)\n", 11141739a20eSAndy Ritger gpuGetInstance(pRemoteGpu), gpuGetInstance(pGpu), 11151739a20eSAndy Ritger status); 11161739a20eSAndy Ritger 11171739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus; 11181739a20eSAndy Ritger } 11191739a20eSAndy Ritger } 11201739a20eSAndy Ritger 11211739a20eSAndy Ritger // Remove all NVLink mappings in HSHUB config registers to init values 112290eb1077SAndy Ritger if (!API_GPU_IN_RESET_SANITY_CHECK(pGpu) && !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) 11231739a20eSAndy Ritger status = knvlinkRemoveMapping_HAL(pGpu, pKernelNvlink, NV_TRUE, ((1 << NVLINK_MAX_PEERS_SW) - 1), 11241739a20eSAndy Ritger NV_FALSE /* bL2Entry */); 11251739a20eSAndy Ritger if (status != NV_OK) 11261739a20eSAndy Ritger { 11271739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 11281739a20eSAndy Ritger "failed to reset HSHUB on GPU%u while preparing XVE reset: %s (0x%x)\n", 11291739a20eSAndy Ritger gpuGetInstance(pGpu), nvstatusToString(status), status); 11301739a20eSAndy Ritger 11311739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus; 11321739a20eSAndy Ritger } 11331739a20eSAndy Ritger 113490eb1077SAndy Ritger // 113590eb1077SAndy Ritger // If GFW is booted and running through link-training, then no need to tear-down the 113690eb1077SAndy Ritger // links to reset. Exit out early from the function 113790eb1077SAndy Ritger // 1138eb5c7665SAndy Ritger if (!bForceShutdown && pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_MINION_GFW_BOOT)) 113990eb1077SAndy Ritger { 114090eb1077SAndy Ritger return NV_OK; 114190eb1077SAndy Ritger } 114290eb1077SAndy Ritger 11431739a20eSAndy Ritger // Pseudo-clean shutdown the links from this GPU 114490eb1077SAndy Ritger status = knvlinkCoreShutdownDeviceLinks(pGpu, pKernelNvlink, bForceShutdown); 11451739a20eSAndy Ritger if (status != NV_OK) 11461739a20eSAndy Ritger { 11471739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 11481739a20eSAndy Ritger "failed to shutdown links on GPU%u while preparing XVE reset: %s (0x%x)\n", 11491739a20eSAndy Ritger gpuGetInstance(pGpu), nvstatusToString(status), status); 11501739a20eSAndy Ritger 11511739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus; 11521739a20eSAndy Ritger } 11531739a20eSAndy Ritger 11541739a20eSAndy Ritger // 11551739a20eSAndy Ritger // Reset links related to this device and its peers (see Bug 2346447) 11561739a20eSAndy Ritger // The property is disabled on Pascal, since the path hasn't been verified 11571739a20eSAndy Ritger // and link reset after pseudo-clean shutdown results in DL and TL errors. 11581739a20eSAndy Ritger // 11591739a20eSAndy Ritger if (pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LINKRESET_AFTER_SHUTDOWN)) 11601739a20eSAndy Ritger { 11611739a20eSAndy Ritger status = knvlinkCoreResetDeviceLinks(pGpu, pKernelNvlink); 11621739a20eSAndy Ritger if (status != NV_OK) 11631739a20eSAndy Ritger { 11641739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 11651739a20eSAndy Ritger "failed to reset links on GPU%u while preparing XVE reset: %s (0x%x)\n", 11661739a20eSAndy Ritger gpuGetInstance(pGpu), nvstatusToString(status), status); 11671739a20eSAndy Ritger 11681739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus; 11691739a20eSAndy Ritger } 1170dac2350cSAndy Ritger #if defined(INCLUDE_NVLINK_LIB) 1171dac2350cSAndy Ritger else 1172dac2350cSAndy Ritger { 1173dac2350cSAndy Ritger NvU32 linkId; 1174dac2350cSAndy Ritger 1175dac2350cSAndy Ritger // 1176dac2350cSAndy Ritger // The connections have been successfully reset, update connected and disconnected 1177dac2350cSAndy Ritger // links masks on both the devices 1178dac2350cSAndy Ritger // 1179dac2350cSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, linkId, pKernelNvlink->enabledLinks) 1180dac2350cSAndy Ritger { 1181dac2350cSAndy Ritger pKernelNvlink->disconnectedLinkMask |= NVBIT(linkId); 1182dac2350cSAndy Ritger pKernelNvlink->connectedLinksMask &= ~NVBIT(linkId); 1183dac2350cSAndy Ritger 1184dac2350cSAndy Ritger if (pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.deviceType != 1185dac2350cSAndy Ritger NV2080_CTRL_NVLINK_DEVICE_INFO_DEVICE_TYPE_GPU) 1186dac2350cSAndy Ritger { 1187dac2350cSAndy Ritger continue; 1188dac2350cSAndy Ritger } 1189dac2350cSAndy Ritger 1190dac2350cSAndy Ritger OBJGPU *pRemoteGpu = gpumgrGetGpuFromBusInfo( 1191dac2350cSAndy Ritger pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.domain, 1192dac2350cSAndy Ritger pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bus, 1193dac2350cSAndy Ritger pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.device); 1194dac2350cSAndy Ritger 1195dac2350cSAndy Ritger if (!API_GPU_IN_RESET_SANITY_CHECK(pRemoteGpu)) 1196dac2350cSAndy Ritger { 1197dac2350cSAndy Ritger KernelNvlink *pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu); 1198dac2350cSAndy Ritger NvU32 remoteLinkId = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber; 1199dac2350cSAndy Ritger 1200dac2350cSAndy Ritger pRemoteKernelNvlink->disconnectedLinkMask |= NVBIT(remoteLinkId); 1201dac2350cSAndy Ritger pRemoteKernelNvlink->connectedLinksMask &= ~NVBIT(remoteLinkId); 1202dac2350cSAndy Ritger } 1203dac2350cSAndy Ritger } 1204dac2350cSAndy Ritger FOR_EACH_INDEX_IN_MASK_END; 1205dac2350cSAndy Ritger } 1206dac2350cSAndy Ritger #endif 12071739a20eSAndy Ritger 12081739a20eSAndy Ritger // 12091739a20eSAndy Ritger // knvlinkCoreResetDeviceLinks() only resets the links which have 12101739a20eSAndy Ritger // connectivity. 12111739a20eSAndy Ritger // Pre-Ampere, we may run into a situation where the PLL 12121739a20eSAndy Ritger // sharing partner links (both) may not be reset due to no connectivity. 12131739a20eSAndy Ritger // 12141739a20eSAndy Ritger // Hence, (re-)reset all the links to recover them after shutdown (pre-Ampere) 12151739a20eSAndy Ritger // 12161739a20eSAndy Ritger NV2080_CTRL_NVLINK_RESET_LINKS_PARAMS resetLinksparams; 12171739a20eSAndy Ritger 12181739a20eSAndy Ritger portMemSet(&resetLinksparams, 0, sizeof(resetLinksparams)); 12191739a20eSAndy Ritger resetLinksparams.linkMask = pKernelNvlink->enabledLinks; 12201739a20eSAndy Ritger resetLinksparams.flags = NV2080_CTRL_NVLINK_RESET_FLAGS_TOGGLE; 12211739a20eSAndy Ritger 12221739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 12231739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_RESET_LINKS, 12241739a20eSAndy Ritger (void *)&resetLinksparams, sizeof(resetLinksparams)); 12251739a20eSAndy Ritger 12261739a20eSAndy Ritger retStatus = (retStatus == NV_OK) ? status : retStatus; 12271739a20eSAndy Ritger } 12281739a20eSAndy Ritger 12291739a20eSAndy Ritger return retStatus; 12301739a20eSAndy Ritger } 12311739a20eSAndy Ritger 12321739a20eSAndy Ritger /*! 12331739a20eSAndy Ritger * @brief Set the power features supported on this NVLink IP 12341739a20eSAndy Ritger * 12351739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 12361739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 12371739a20eSAndy Ritger */ 12381739a20eSAndy Ritger void 12391739a20eSAndy Ritger knvlinkSetPowerFeatures_IMPL 12401739a20eSAndy Ritger ( 12411739a20eSAndy Ritger OBJGPU *pGpu, 12421739a20eSAndy Ritger KernelNvlink *pKernelNvlink 12431739a20eSAndy Ritger ) 12441739a20eSAndy Ritger { 12451739a20eSAndy Ritger // Get the Ip Verion from the First available IOCTRL. 12461739a20eSAndy Ritger switch (pKernelNvlink->ipVerNvlink) 12471739a20eSAndy Ritger { 12481739a20eSAndy Ritger case NVLINK_VERSION_22: 12491739a20eSAndy Ritger { 12501739a20eSAndy Ritger // NVLink L2 is supported only on MODS and Windows LDDM 1251758b4ee8SAndy Ritger if (RMCFG_FEATURE_PLATFORM_WINDOWS_LDDM || RMCFG_FEATURE_MODS_FEATURES) 12521739a20eSAndy Ritger { 12531739a20eSAndy Ritger pKernelNvlink->setProperty(pKernelNvlink, PDB_PROP_KNVLINK_L2_POWER_STATE_ENABLED, 12541739a20eSAndy Ritger (pKernelNvlink->bDisableL2Mode ? NV_FALSE : NV_TRUE)); 12551739a20eSAndy Ritger } 12561739a20eSAndy Ritger 12571739a20eSAndy Ritger break; 12581739a20eSAndy Ritger } 12591739a20eSAndy Ritger default: 12601739a20eSAndy Ritger break; 12611739a20eSAndy Ritger } 12621739a20eSAndy Ritger } 12631739a20eSAndy Ritger 12641739a20eSAndy Ritger /*! 12651739a20eSAndy Ritger * @brief Checks if NVSWITCH_FABRIC_ADDR field is valid. 12661739a20eSAndy Ritger * 12671739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 12681739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 12691739a20eSAndy Ritger */ 12701739a20eSAndy Ritger void 12711739a20eSAndy Ritger knvlinkDetectNvswitchProxy_IMPL 12721739a20eSAndy Ritger ( 12731739a20eSAndy Ritger OBJGPU *pGpu, 12741739a20eSAndy Ritger KernelNvlink *pKernelNvlink 12751739a20eSAndy Ritger ) 12761739a20eSAndy Ritger { 12771739a20eSAndy Ritger OBJSYS *pSys = SYS_GET_INSTANCE(); 12781739a20eSAndy Ritger NV_STATUS status = NV_OK; 12791739a20eSAndy Ritger NvU32 i; 12801739a20eSAndy Ritger 12811739a20eSAndy Ritger // Initialize fabricBaseAddr to NVLINK_INVALID_FABRIC_ADDR 12821739a20eSAndy Ritger pKernelNvlink->fabricBaseAddr = NVLINK_INVALID_FABRIC_ADDR; 12831739a20eSAndy Ritger 12841739a20eSAndy Ritger if (pSys->getProperty(pSys, PDB_PROP_SYS_NVSWITCH_IS_PRESENT) || 1285758b4ee8SAndy Ritger pSys->getProperty(pSys, PDB_PROP_SYS_FABRIC_MANAGER_IS_REGISTERED) || 1286758b4ee8SAndy Ritger GPU_IS_NVSWITCH_DETECTED(pGpu)) 12871739a20eSAndy Ritger { 12881739a20eSAndy Ritger return; 12891739a20eSAndy Ritger } 12901739a20eSAndy Ritger 12911739a20eSAndy Ritger if (pKernelNvlink->discoveredLinks == 0) 12921739a20eSAndy Ritger { 12931739a20eSAndy Ritger return; 12941739a20eSAndy Ritger } 12951739a20eSAndy Ritger 12961739a20eSAndy Ritger // Get the link train status for the enabled link masks 12971739a20eSAndy Ritger NV2080_CTRL_NVLINK_ARE_LINKS_TRAINED_PARAMS linkTrainedParams; 12981739a20eSAndy Ritger 12991739a20eSAndy Ritger portMemSet(&linkTrainedParams, 0, sizeof(linkTrainedParams)); 13001739a20eSAndy Ritger linkTrainedParams.linkMask = pKernelNvlink->enabledLinks; 13011739a20eSAndy Ritger linkTrainedParams.bActiveOnly = NV_FALSE; 13021739a20eSAndy Ritger 13031739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init 13041739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu)) 13051739a20eSAndy Ritger { 13061739a20eSAndy Ritger threadStateResetTimeout(pGpu); 13071739a20eSAndy Ritger } 13081739a20eSAndy Ritger 13091739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 13101739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_ARE_LINKS_TRAINED, 13111739a20eSAndy Ritger (void *)&linkTrainedParams, sizeof(linkTrainedParams)); 13121739a20eSAndy Ritger if (status != NV_OK) 13131739a20eSAndy Ritger { 13141739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get the link train status for links\n"); 13151739a20eSAndy Ritger return; 13161739a20eSAndy Ritger } 13171739a20eSAndy Ritger 13181739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->enabledLinks) 13191739a20eSAndy Ritger { 13201739a20eSAndy Ritger if (!linkTrainedParams.bIsLinkActive[i]) 13211739a20eSAndy Ritger { 13221739a20eSAndy Ritger return; 13231739a20eSAndy Ritger } 13241739a20eSAndy Ritger } 13251739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END; 13261739a20eSAndy Ritger 13271739a20eSAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_SET_NVSWITCH_FABRIC_ADDR_PARAMS params; 13281739a20eSAndy Ritger 13291739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 13301739a20eSAndy Ritger params.bGet = NV_TRUE; 13311739a20eSAndy Ritger params.addr = NVLINK_INVALID_FABRIC_ADDR; 13321739a20eSAndy Ritger 13331739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 13341739a20eSAndy Ritger NV2080_CTRL_CMD_INTERNAL_NVLINK_GET_SET_NVSWITCH_FABRIC_ADDR, 13351739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 13361739a20eSAndy Ritger if (status != NV_OK) 13371739a20eSAndy Ritger { 13381739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get fabric address for GPU %x\n", 13391739a20eSAndy Ritger pGpu->gpuInstance); 13401739a20eSAndy Ritger return; 13411739a20eSAndy Ritger } 13421739a20eSAndy Ritger 13431739a20eSAndy Ritger if (params.addr != NVLINK_INVALID_FABRIC_ADDR) 13441739a20eSAndy Ritger { 13451739a20eSAndy Ritger pKernelNvlink->fabricBaseAddr = params.addr; 13461739a20eSAndy Ritger pKernelNvlink->bNvswitchProxy = NV_TRUE; 13471739a20eSAndy Ritger } 13481739a20eSAndy Ritger } 13491739a20eSAndy Ritger 13501739a20eSAndy Ritger /*! 13511739a20eSAndy Ritger * @brief Sets NVSWITCH_FLA_ADDR field in the scratch register. 13521739a20eSAndy Ritger * 13531739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 13541739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 13551739a20eSAndy Ritger * @param[in] addr FLA addr 13561739a20eSAndy Ritger * 13571739a20eSAndy Ritger * @return Returns NV_OK upon success. 13581739a20eSAndy Ritger * Otherwise, returns NV_ERR_XXX. 13591739a20eSAndy Ritger */ 13601739a20eSAndy Ritger NV_STATUS 13611739a20eSAndy Ritger knvlinkSetNvswitchFlaAddr_IMPL 13621739a20eSAndy Ritger ( 13631739a20eSAndy Ritger OBJGPU *pGpu, 13641739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 13651739a20eSAndy Ritger NvU64 addr 13661739a20eSAndy Ritger ) 13671739a20eSAndy Ritger { 13681739a20eSAndy Ritger return NV_OK; 13691739a20eSAndy Ritger } 13701739a20eSAndy Ritger 13711739a20eSAndy Ritger /*! 13721739a20eSAndy Ritger * @brief Gets NVSWITCH_FLA_ADDR field from the scratch register. 13731739a20eSAndy Ritger * 13741739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 13751739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 13761739a20eSAndy Ritger * 13771739a20eSAndy Ritger * @return Returns the stashed FLA starting address. 13781739a20eSAndy Ritger */ 13791739a20eSAndy Ritger NvU64 13801739a20eSAndy Ritger knvlinkGetNvswitchFlaAddr_IMPL 13811739a20eSAndy Ritger ( 13821739a20eSAndy Ritger OBJGPU *pGpu, 13831739a20eSAndy Ritger KernelNvlink *pKernelNvlink 13841739a20eSAndy Ritger ) 13851739a20eSAndy Ritger { 13861739a20eSAndy Ritger return 0; 13871739a20eSAndy Ritger } 13881739a20eSAndy Ritger 13891739a20eSAndy Ritger /*! 13901739a20eSAndy Ritger * @brief Checks if fabricBaseAddr is valid. 13911739a20eSAndy Ritger * 13921739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 13931739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 13941739a20eSAndy Ritger * 13951739a20eSAndy Ritger * @return Returns true if the fabricBaseAddr is valid. 13961739a20eSAndy Ritger */ 13971739a20eSAndy Ritger NvBool 13981739a20eSAndy Ritger knvlinkIsNvswitchProxyPresent_IMPL 13991739a20eSAndy Ritger ( 14001739a20eSAndy Ritger OBJGPU *pGpu, 14011739a20eSAndy Ritger KernelNvlink *pKernelNvlink 14021739a20eSAndy Ritger ) 14031739a20eSAndy Ritger { 14041739a20eSAndy Ritger return pKernelNvlink->bNvswitchProxy; 14051739a20eSAndy Ritger } 14061739a20eSAndy Ritger 14071739a20eSAndy Ritger 14081739a20eSAndy Ritger /*! 14091739a20eSAndy Ritger * @brief Set unique FLA base address for NVSwitch enabled systems. 14101739a20eSAndy Ritger * Validates FLA base address and programs the base address 14111739a20eSAndy Ritger * in switch scratch registers for guest VM to pick it up. 14121739a20eSAndy Ritger * 14131739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 14141739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 14151739a20eSAndy Ritger * @param[in] flaBaseAddr NvU64 base address 14161739a20eSAndy Ritger * 14171739a20eSAndy Ritger * @returns On success, sets unique FLA base address and returns NV_OK. 14181739a20eSAndy Ritger * On failure, returns NV_ERR_XXX. 14191739a20eSAndy Ritger */ 14201739a20eSAndy Ritger NV_STATUS 14211739a20eSAndy Ritger knvlinkSetUniqueFlaBaseAddress_IMPL 14221739a20eSAndy Ritger ( 14231739a20eSAndy Ritger OBJGPU *pGpu, 14241739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 14251739a20eSAndy Ritger NvU64 flaBaseAddr 14261739a20eSAndy Ritger ) 14271739a20eSAndy Ritger { 14281739a20eSAndy Ritger NV_STATUS status = NV_OK; 14291739a20eSAndy Ritger KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu); 14301739a20eSAndy Ritger 14311739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_SET_NVSWITCH_FLA_ADDR_PARAMS params; 14321739a20eSAndy Ritger 14331739a20eSAndy Ritger if (!knvlinkIsForcedConfig(pGpu, pKernelNvlink)) 14341739a20eSAndy Ritger { 14351739a20eSAndy Ritger knvlinkCoreGetRemoteDeviceInfo(pGpu, pKernelNvlink); 14361739a20eSAndy Ritger 14371739a20eSAndy Ritger status = knvlinkEnableLinksPostTopology_HAL(pGpu, pKernelNvlink, 14381739a20eSAndy Ritger pKernelNvlink->enabledLinks); 14391739a20eSAndy Ritger if (status != NV_OK) 14401739a20eSAndy Ritger { 14411739a20eSAndy Ritger return status; 14421739a20eSAndy Ritger } 14431739a20eSAndy Ritger } 14441739a20eSAndy Ritger 14451739a20eSAndy Ritger status = kbusValidateFlaBaseAddress_HAL(pGpu, pKernelBus, flaBaseAddr); 14461739a20eSAndy Ritger if (status != NV_OK) 14471739a20eSAndy Ritger { 14481739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "FLA base addr validation failed for GPU %x\n", 14491739a20eSAndy Ritger pGpu->gpuInstance); 14501739a20eSAndy Ritger return status; 14511739a20eSAndy Ritger } 14521739a20eSAndy Ritger 14531739a20eSAndy Ritger if (IsSLIEnabled(pGpu)) 14541739a20eSAndy Ritger { 14551739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, 14561739a20eSAndy Ritger "Operation is unsupported on SLI enabled GPU %x\n", 14571739a20eSAndy Ritger pGpu->gpuInstance); 14581739a20eSAndy Ritger return NV_ERR_NOT_SUPPORTED; 14591739a20eSAndy Ritger } 14601739a20eSAndy Ritger 14611739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 14621739a20eSAndy Ritger params.bGet = NV_FALSE; 14631739a20eSAndy Ritger params.addr = flaBaseAddr; 14641739a20eSAndy Ritger 14651739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 14661739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_SET_NVSWITCH_FLA_ADDR, 14671739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 14681739a20eSAndy Ritger if (status != NV_OK) 14691739a20eSAndy Ritger { 14701739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to stash fla base address for GPU %x\n", 14711739a20eSAndy Ritger pGpu->gpuInstance); 14721739a20eSAndy Ritger return status; 14731739a20eSAndy Ritger } 14741739a20eSAndy Ritger 14751739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, "FLA base addr %llx is assigned to GPU %x\n", 14761739a20eSAndy Ritger flaBaseAddr, pGpu->gpuInstance); 14771739a20eSAndy Ritger 14781739a20eSAndy Ritger return NV_OK; 14791739a20eSAndy Ritger } 14801739a20eSAndy Ritger 14811739a20eSAndy Ritger /*! 14821739a20eSAndy Ritger * @brief Synchronize the link masks and vbios defined properties 14831739a20eSAndy Ritger * between CPU and GSP-RMs 14841739a20eSAndy Ritger * 14851739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 14861739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 14871739a20eSAndy Ritger */ 14881739a20eSAndy Ritger NV_STATUS 14891739a20eSAndy Ritger knvlinkSyncLinkMasksAndVbiosInfo_IMPL 14901739a20eSAndy Ritger ( 14911739a20eSAndy Ritger OBJGPU *pGpu, 14921739a20eSAndy Ritger KernelNvlink *pKernelNvlink 14931739a20eSAndy Ritger ) 14941739a20eSAndy Ritger { 14951739a20eSAndy Ritger NV_STATUS status = NV_OK; 14961739a20eSAndy Ritger 14971739a20eSAndy Ritger NV2080_CTRL_NVLINK_SYNC_LINK_MASKS_AND_VBIOS_INFO_PARAMS params; 14981739a20eSAndy Ritger 14991739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 15001739a20eSAndy Ritger 15011739a20eSAndy Ritger params.discoveredLinks = pKernelNvlink->discoveredLinks; 15021739a20eSAndy Ritger params.connectedLinksMask = pKernelNvlink->connectedLinksMask; 15031739a20eSAndy Ritger params.bridgeSensableLinks = pKernelNvlink->bridgeSensableLinks; 15041739a20eSAndy Ritger params.bridgedLinks = pKernelNvlink->bridgedLinks; 15051739a20eSAndy Ritger 15061739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init 15071739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu)) 15081739a20eSAndy Ritger { 15091739a20eSAndy Ritger threadStateResetTimeout(pGpu); 15101739a20eSAndy Ritger } 15111739a20eSAndy Ritger 15121739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 15131739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_SYNC_LINK_MASKS_AND_VBIOS_INFO, 15141739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 15151739a20eSAndy Ritger 15161739a20eSAndy Ritger pKernelNvlink->vbiosDisabledLinkMask = params.vbiosDisabledLinkMask; 15171739a20eSAndy Ritger pKernelNvlink->initializedLinks = params.initializedLinks; 15181739a20eSAndy Ritger pKernelNvlink->initDisabledLinksMask = params.initDisabledLinksMask; 15191739a20eSAndy Ritger pKernelNvlink->bEnableSafeModeAtLoad = params.bEnableSafeModeAtLoad; 15201739a20eSAndy Ritger pKernelNvlink->bEnableTrainingAtLoad = params.bEnableTrainingAtLoad; 15211739a20eSAndy Ritger 15221739a20eSAndy Ritger return status; 15231739a20eSAndy Ritger } 15241739a20eSAndy Ritger 15251739a20eSAndy Ritger /*! 15261739a20eSAndy Ritger * @brief Update link connection status. 15271739a20eSAndy Ritger * 15281739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 15291739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 15301739a20eSAndy Ritger * @param[in] linkId Target link Id 15311739a20eSAndy Ritger */ 15321739a20eSAndy Ritger NV_STATUS 15331739a20eSAndy Ritger knvlinkUpdateLinkConnectionStatus_IMPL 15341739a20eSAndy Ritger ( 15351739a20eSAndy Ritger OBJGPU *pGpu, 15361739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 15371739a20eSAndy Ritger NvU32 linkId 15381739a20eSAndy Ritger ) 15391739a20eSAndy Ritger { 15401739a20eSAndy Ritger NV_STATUS status = NV_OK; 15411739a20eSAndy Ritger 15421739a20eSAndy Ritger NV2080_CTRL_NVLINK_UPDATE_LINK_CONNECTION_PARAMS params; 15431739a20eSAndy Ritger 15441739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 15451739a20eSAndy Ritger 15461739a20eSAndy Ritger params.linkId = linkId; 15471739a20eSAndy Ritger 15481739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB) 15491739a20eSAndy Ritger 15501739a20eSAndy Ritger params.bConnected = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bConnected; 15511739a20eSAndy Ritger params.remoteDeviceType = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.deviceType; 15521739a20eSAndy Ritger params.remoteLinkNumber = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber; 1553758b4ee8SAndy Ritger params.remoteChipSid = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.chipSid; 1554758b4ee8SAndy Ritger params.remoteDomain = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.domain; 1555758b4ee8SAndy Ritger params.remoteBus = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.bus; 1556758b4ee8SAndy Ritger params.remoteDevice = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.device; 1557758b4ee8SAndy Ritger params.remoteFunction = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.function; 1558758b4ee8SAndy Ritger params.remotePciDeviceId = pKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.pciDeviceId; 1559758b4ee8SAndy Ritger params.laneRxdetStatusMask = pKernelNvlink->nvlinkLinks[linkId].laneRxdetStatusMask; 15601739a20eSAndy Ritger 15611739a20eSAndy Ritger #endif 15621739a20eSAndy Ritger 15631739a20eSAndy Ritger // Reset timeout to clear any accumulated timeouts from link init 15641739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu)) 15651739a20eSAndy Ritger { 15661739a20eSAndy Ritger threadStateResetTimeout(pGpu); 15671739a20eSAndy Ritger } 15681739a20eSAndy Ritger 15691739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 15701739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_UPDATE_LINK_CONNECTION, 15711739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 15721739a20eSAndy Ritger if (status != NV_OK) 15731739a20eSAndy Ritger { 15741739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to update Link connection status!\n"); 15751739a20eSAndy Ritger return status; 15761739a20eSAndy Ritger } 15771739a20eSAndy Ritger 15781739a20eSAndy Ritger return NV_OK; 15791739a20eSAndy Ritger } 15801739a20eSAndy Ritger 15811739a20eSAndy Ritger /*! 158290eb1077SAndy Ritger * @brief Execute initial steps to Train links for ALI. 158390eb1077SAndy Ritger * 158490eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 158590eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 158690eb1077SAndy Ritger * @param[in] linkMask Masks of links to enable 158790eb1077SAndy Ritger * @param[in] bSync Input sync boolean 158890eb1077SAndy Ritger * 158990eb1077SAndy Ritger */ 159090eb1077SAndy Ritger NV_STATUS 159190eb1077SAndy Ritger knvlinkPreTrainLinksToActiveAli_IMPL 159290eb1077SAndy Ritger ( 159390eb1077SAndy Ritger OBJGPU *pGpu, 159490eb1077SAndy Ritger KernelNvlink *pKernelNvlink, 159590eb1077SAndy Ritger NvU32 linkMask, 159690eb1077SAndy Ritger NvBool bSync 159790eb1077SAndy Ritger ) 159890eb1077SAndy Ritger { 159990eb1077SAndy Ritger NV_STATUS status = NV_OK; 160090eb1077SAndy Ritger 160190eb1077SAndy Ritger NV2080_CTRL_NVLINK_PRE_LINK_TRAIN_ALI_PARAMS params; 160290eb1077SAndy Ritger 160390eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 160490eb1077SAndy Ritger 160590eb1077SAndy Ritger params.linkMask = linkMask; 160690eb1077SAndy Ritger params.bSync = bSync; 160790eb1077SAndy Ritger 160890eb1077SAndy Ritger // Reset timeout to clear any accumulated timeouts from link init 160990eb1077SAndy Ritger if (IS_GSP_CLIENT(pGpu)) 161090eb1077SAndy Ritger { 161190eb1077SAndy Ritger threadStateResetTimeout(pGpu); 161290eb1077SAndy Ritger } 161390eb1077SAndy Ritger 161490eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 161590eb1077SAndy Ritger NV2080_CTRL_CMD_NVLINK_PRE_LINK_TRAIN_ALI, 161690eb1077SAndy Ritger (void *)¶ms, sizeof(params)); 161790eb1077SAndy Ritger if (status != NV_OK) 161890eb1077SAndy Ritger { 161990eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to execute Pre Link Training ALI steps!\n"); 162090eb1077SAndy Ritger return status; 162190eb1077SAndy Ritger } 162290eb1077SAndy Ritger 162390eb1077SAndy Ritger return NV_OK; 162490eb1077SAndy Ritger } 162590eb1077SAndy Ritger 162690eb1077SAndy Ritger /*! 162790eb1077SAndy Ritger * @brief Train links to active for ALI. 162890eb1077SAndy Ritger * 162990eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 163090eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 163190eb1077SAndy Ritger * @param[in] linkMask Masks of links to enable 163290eb1077SAndy Ritger * @param[in] bSync Input sync boolean 163390eb1077SAndy Ritger * 163490eb1077SAndy Ritger */ 163590eb1077SAndy Ritger NV_STATUS 163690eb1077SAndy Ritger knvlinkTrainLinksToActiveAli_IMPL 163790eb1077SAndy Ritger ( 163890eb1077SAndy Ritger OBJGPU *pGpu, 163990eb1077SAndy Ritger KernelNvlink *pKernelNvlink, 164090eb1077SAndy Ritger NvU32 linkMask, 164190eb1077SAndy Ritger NvBool bSync 164290eb1077SAndy Ritger ) 164390eb1077SAndy Ritger { 164490eb1077SAndy Ritger NV_STATUS status = NV_OK; 164590eb1077SAndy Ritger 164690eb1077SAndy Ritger NV2080_CTRL_NVLINK_PRE_LINK_TRAIN_ALI_PARAMS params; 164790eb1077SAndy Ritger 164890eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 164990eb1077SAndy Ritger 165090eb1077SAndy Ritger params.linkMask = linkMask; 165190eb1077SAndy Ritger params.bSync = bSync; 165290eb1077SAndy Ritger 165390eb1077SAndy Ritger // Reset timeout to clear any accumulated timeouts from link init 165490eb1077SAndy Ritger if (IS_GSP_CLIENT(pGpu)) 165590eb1077SAndy Ritger { 165690eb1077SAndy Ritger threadStateResetTimeout(pGpu); 165790eb1077SAndy Ritger } 165890eb1077SAndy Ritger 165990eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 166090eb1077SAndy Ritger NV2080_CTRL_CMD_NVLINK_LINK_TRAIN_ALI, 166190eb1077SAndy Ritger (void *)¶ms, sizeof(params)); 166290eb1077SAndy Ritger if (status != NV_OK) 166390eb1077SAndy Ritger { 166490eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to change ALI Links to active!\n"); 166590eb1077SAndy Ritger return status; 166690eb1077SAndy Ritger } 166790eb1077SAndy Ritger 166890eb1077SAndy Ritger return NV_OK; 166990eb1077SAndy Ritger } 167090eb1077SAndy Ritger 167190eb1077SAndy Ritger /*! 16721739a20eSAndy Ritger * @brief Update the post Rx Detect link mask. 16731739a20eSAndy Ritger * 16741739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 16751739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 16761739a20eSAndy Ritger * 16771739a20eSAndy Ritger */ 16781739a20eSAndy Ritger NV_STATUS 16791739a20eSAndy Ritger knvlinkUpdatePostRxDetectLinkMask_IMPL 16801739a20eSAndy Ritger ( 16811739a20eSAndy Ritger OBJGPU *pGpu, 16821739a20eSAndy Ritger KernelNvlink *pKernelNvlink 16831739a20eSAndy Ritger ) 16841739a20eSAndy Ritger { 16851739a20eSAndy Ritger NV_STATUS status = NV_OK; 16861739a20eSAndy Ritger 16871739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_LINK_MASK_POST_RX_DET_PARAMS params; 16881739a20eSAndy Ritger 16891739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 16901739a20eSAndy Ritger 16911739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 16921739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_LINK_MASK_POST_RX_DET, 16931739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 16941739a20eSAndy Ritger if (status != NV_OK) 16951739a20eSAndy Ritger { 16961739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to update Rx Detect Link mask!\n"); 16971739a20eSAndy Ritger return status; 16981739a20eSAndy Ritger } 16991739a20eSAndy Ritger 17001739a20eSAndy Ritger pKernelNvlink->postRxDetLinkMask = params.postRxDetLinkMask; 17011739a20eSAndy Ritger 17021739a20eSAndy Ritger return NV_OK; 17031739a20eSAndy Ritger } 17041739a20eSAndy Ritger 17051739a20eSAndy Ritger /*! 17061739a20eSAndy Ritger * @brief Copy over the NVLink devices information from GSP-RM. 17071739a20eSAndy Ritger * 17081739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 17091739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 17101739a20eSAndy Ritger */ 17111739a20eSAndy Ritger NV_STATUS 17121739a20eSAndy Ritger knvlinkCopyNvlinkDeviceInfo_IMPL 17131739a20eSAndy Ritger ( 17141739a20eSAndy Ritger OBJGPU *pGpu, 17151739a20eSAndy Ritger KernelNvlink *pKernelNvlink 17161739a20eSAndy Ritger ) 17171739a20eSAndy Ritger { 17181739a20eSAndy Ritger NV_STATUS status = NV_OK; 17191739a20eSAndy Ritger NvU32 i; 17201739a20eSAndy Ritger 17211739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_NVLINK_DEVICE_INFO_PARAMS nvlinkInfoParams; 17221739a20eSAndy Ritger 17231739a20eSAndy Ritger portMemSet(&nvlinkInfoParams, 0, sizeof(nvlinkInfoParams)); 17241739a20eSAndy Ritger 17251739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 17261739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_NVLINK_DEVICE_INFO, 17271739a20eSAndy Ritger (void *)&nvlinkInfoParams, sizeof(nvlinkInfoParams)); 17281739a20eSAndy Ritger 17291739a20eSAndy Ritger if (status == NV_ERR_NOT_SUPPORTED) 17301739a20eSAndy Ritger { 17311739a20eSAndy Ritger NV_PRINTF(LEVEL_WARNING, "NVLink is unavailable\n"); 17321739a20eSAndy Ritger return status; 17331739a20eSAndy Ritger } 17341739a20eSAndy Ritger else if (status != NV_OK) 17351739a20eSAndy Ritger { 17361739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to retrieve all nvlink device info!\n"); 17371739a20eSAndy Ritger return status; 17381739a20eSAndy Ritger } 17391739a20eSAndy Ritger 17401739a20eSAndy Ritger // Update CPU-RM's NVLink state with the information received from GSP-RM RPC 17411739a20eSAndy Ritger pKernelNvlink->ioctrlMask = nvlinkInfoParams.ioctrlMask; 17421739a20eSAndy Ritger pKernelNvlink->ioctrlNumEntries = nvlinkInfoParams.ioctrlNumEntries; 17431739a20eSAndy Ritger pKernelNvlink->ioctrlSize = nvlinkInfoParams.ioctrlSize; 17441739a20eSAndy Ritger pKernelNvlink->discoveredLinks = nvlinkInfoParams.discoveredLinks; 17451739a20eSAndy Ritger pKernelNvlink->ipVerNvlink = nvlinkInfoParams.ipVerNvlink; 17461739a20eSAndy Ritger 17471739a20eSAndy Ritger for (i = 0; i < NVLINK_MAX_LINKS_SW; i++) 17481739a20eSAndy Ritger { 17491739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].pGpu = pGpu; 17501739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].bValid = nvlinkInfoParams.linkInfo[i].bValid; 17511739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].linkId = nvlinkInfoParams.linkInfo[i].linkId; 17521739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].ioctrlId = nvlinkInfoParams.linkInfo[i].ioctrlId; 17531739a20eSAndy Ritger 17541739a20eSAndy Ritger // Copy over the link PLL master and slave relationship for each link 17551739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].pllMasterLinkId = nvlinkInfoParams.linkInfo[i].pllMasterLinkId; 17561739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].pllSlaveLinkId = nvlinkInfoParams.linkInfo[i].pllSlaveLinkId; 17571739a20eSAndy Ritger 17581739a20eSAndy Ritger // Copy over the ip versions for DLPL devices discovered 17591739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].ipVerDlPl = nvlinkInfoParams.linkInfo[i].ipVerDlPl; 17601739a20eSAndy Ritger } 17611739a20eSAndy Ritger 17621739a20eSAndy Ritger return NV_OK; 17631739a20eSAndy Ritger } 17641739a20eSAndy Ritger 17651739a20eSAndy Ritger /*! 17661739a20eSAndy Ritger * @brief Copy over the Ioctrl devices information from GSP-RM. 17671739a20eSAndy Ritger * 17681739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 17691739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 17701739a20eSAndy Ritger */ 17711739a20eSAndy Ritger NV_STATUS 17721739a20eSAndy Ritger knvlinkCopyIoctrlDeviceInfo_IMPL 17731739a20eSAndy Ritger ( 17741739a20eSAndy Ritger OBJGPU *pGpu, 17751739a20eSAndy Ritger KernelNvlink *pKernelNvlink 17761739a20eSAndy Ritger ) 17771739a20eSAndy Ritger { 17781739a20eSAndy Ritger KernelIoctrl *pKernelIoctrl = NULL; 17791739a20eSAndy Ritger NV_STATUS status = NV_OK; 17801739a20eSAndy Ritger NvU32 ioctrlIdx; 17811739a20eSAndy Ritger 17821739a20eSAndy Ritger NV2080_CTRL_NVLINK_GET_IOCTRL_DEVICE_INFO_PARAMS ioctrlInfoParams; 17831739a20eSAndy Ritger 17841739a20eSAndy Ritger // Query the IOCTRL information for each of the IOCTRLs discovered 17851739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK(32, ioctrlIdx, pKernelNvlink->ioctrlMask) 17861739a20eSAndy Ritger { 17871739a20eSAndy Ritger portMemSet(&ioctrlInfoParams, 0, sizeof(ioctrlInfoParams)); 17881739a20eSAndy Ritger 17891739a20eSAndy Ritger ioctrlInfoParams.ioctrlIdx = ioctrlIdx; 17901739a20eSAndy Ritger 17911739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 17921739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_GET_IOCTRL_DEVICE_INFO, 17931739a20eSAndy Ritger (void *)&ioctrlInfoParams, sizeof(ioctrlInfoParams)); 17941739a20eSAndy Ritger 17951739a20eSAndy Ritger if (status == NV_ERR_NOT_SUPPORTED) 17961739a20eSAndy Ritger { 17971739a20eSAndy Ritger NV_PRINTF(LEVEL_WARNING, "NVLink is unavailable\n"); 17981739a20eSAndy Ritger return status; 17991739a20eSAndy Ritger } 18001739a20eSAndy Ritger else if (status != NV_OK) 18011739a20eSAndy Ritger { 18021739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to retrieve device info for IOCTRL %d!\n", ioctrlIdx); 18031739a20eSAndy Ritger return status; 18041739a20eSAndy Ritger } 18051739a20eSAndy Ritger 18061739a20eSAndy Ritger pKernelIoctrl = KNVLINK_GET_IOCTRL(pKernelNvlink, ioctrlIdx); 18071739a20eSAndy Ritger 18081739a20eSAndy Ritger // Update CPU-RM's NVLink state with the information received from GSP-RM RPC 18091739a20eSAndy Ritger pKernelIoctrl->PublicId = ioctrlInfoParams.PublicId; 18101739a20eSAndy Ritger pKernelIoctrl->localDiscoveredLinks = ioctrlInfoParams.localDiscoveredLinks; 18111739a20eSAndy Ritger pKernelIoctrl->localGlobalLinkOffset = ioctrlInfoParams.localGlobalLinkOffset; 18121739a20eSAndy Ritger pKernelIoctrl->ioctrlDiscoverySize = ioctrlInfoParams.ioctrlDiscoverySize; 18131739a20eSAndy Ritger pKernelIoctrl->numDevices = ioctrlInfoParams.numDevices; 18141739a20eSAndy Ritger 18151739a20eSAndy Ritger // Copy over the ip versions for the ioctrl and minion devices discovered 18161739a20eSAndy Ritger pKernelIoctrl->ipVerIoctrl = ioctrlInfoParams.ipRevisions.ipVerIoctrl; 18171739a20eSAndy Ritger pKernelIoctrl->ipVerMinion = ioctrlInfoParams.ipRevisions.ipVerMinion; 18181739a20eSAndy Ritger 18191739a20eSAndy Ritger if (pKernelIoctrl->ipVerMinion == 0) 18201739a20eSAndy Ritger { 18211739a20eSAndy Ritger pKernelIoctrl->setProperty(pKernelIoctrl, PDB_PROP_KIOCTRL_MINION_AVAILABLE, NV_FALSE); 18221739a20eSAndy Ritger } 18231739a20eSAndy Ritger } 18241739a20eSAndy Ritger FOR_EACH_INDEX_IN_MASK_END; 18251739a20eSAndy Ritger 18261739a20eSAndy Ritger return NV_OK; 18271739a20eSAndy Ritger } 18281739a20eSAndy Ritger 18291739a20eSAndy Ritger /** 18301739a20eSAndy Ritger * @brief Setup topology information for the forced nvlink configurations 18311739a20eSAndy Ritger * 18321739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer for local GPU 18331739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 18341739a20eSAndy Ritger */ 18351739a20eSAndy Ritger NV_STATUS 18361739a20eSAndy Ritger knvlinkSetupTopologyForForcedConfig_IMPL 18371739a20eSAndy Ritger ( 18381739a20eSAndy Ritger OBJGPU *pGpu, 18391739a20eSAndy Ritger KernelNvlink *pKernelNvlink 18401739a20eSAndy Ritger ) 18411739a20eSAndy Ritger { 18421739a20eSAndy Ritger NV_STATUS status = NV_OK; 18431739a20eSAndy Ritger NvU32 i, physLink; 18441739a20eSAndy Ritger 18451739a20eSAndy Ritger // Start with all links disabled and no forced config in effect 18461739a20eSAndy Ritger pKernelNvlink->bRegistryLinkOverride = NV_TRUE; 18471739a20eSAndy Ritger pKernelNvlink->registryLinkMask = 0; 18481739a20eSAndy Ritger pKernelNvlink->bChiplibConfig = NV_FALSE; 18491739a20eSAndy Ritger 18501739a20eSAndy Ritger for (i = 0; i < NVLINK_MAX_LINKS_SW; i++) 18511739a20eSAndy Ritger { 18521739a20eSAndy Ritger // Filter against the links discovered from IOCTRL 18531739a20eSAndy Ritger if (!(pKernelNvlink->discoveredLinks & NVBIT(i))) 18541739a20eSAndy Ritger continue; 18551739a20eSAndy Ritger 18561739a20eSAndy Ritger // The physical link is guaranteed valid in all cases 18571739a20eSAndy Ritger physLink = DRF_VAL(_NVLINK, _ARCH_CONNECTION, _PHYSICAL_LINK, pKernelNvlink->pLinkConnection[i]); 18581739a20eSAndy Ritger 18591739a20eSAndy Ritger // Update link tracking 18601739a20eSAndy Ritger if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _ENABLED, pKernelNvlink->pLinkConnection[i])) 18611739a20eSAndy Ritger { 18621739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, 18631739a20eSAndy Ritger "ARCH_CONNECTION info from chiplib: ENABLED Logical link %d (Physical " 18641739a20eSAndy Ritger "link %d) = 0x%X\n", i, physLink, 18651739a20eSAndy Ritger pKernelNvlink->pLinkConnection[i]); 18661739a20eSAndy Ritger 18671739a20eSAndy Ritger // 18681739a20eSAndy Ritger // This "link" should be ENABLED. We use the physical link since RM only deals with 18691739a20eSAndy Ritger // physical links. 18701739a20eSAndy Ritger // 18711739a20eSAndy Ritger pKernelNvlink->registryLinkMask |= NVBIT(physLink); 18721739a20eSAndy Ritger 18731739a20eSAndy Ritger // Config is forced (at least one link requested) 18741739a20eSAndy Ritger pKernelNvlink->bChiplibConfig = NV_TRUE; 18751739a20eSAndy Ritger } 18761739a20eSAndy Ritger else 18771739a20eSAndy Ritger { 18781739a20eSAndy Ritger NV_PRINTF(LEVEL_INFO, 18791739a20eSAndy Ritger "ARCH_CONNECTION info from chiplib: DISABLED Logical link %d (Physical " 18801739a20eSAndy Ritger "link %d) = 0x%X\n", i, physLink, 18811739a20eSAndy Ritger pKernelNvlink->pLinkConnection[i]); 18821739a20eSAndy Ritger } 18831739a20eSAndy Ritger 18841739a20eSAndy Ritger // Accumulate any PEER links 18851739a20eSAndy Ritger if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _PEER_MASK, pKernelNvlink->pLinkConnection[i])) 18861739a20eSAndy Ritger { 18871739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB) 18881739a20eSAndy Ritger // Ensure reginit has the info it needs for the remote side 18891739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.bConnected = NV_TRUE; 18901739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.deviceType = 18911739a20eSAndy Ritger NV2080_CTRL_NVLINK_DEVICE_INFO_DEVICE_TYPE_GPU; 18921739a20eSAndy Ritger 18931739a20eSAndy Ritger #endif 18941739a20eSAndy Ritger } 18951739a20eSAndy Ritger 18961739a20eSAndy Ritger // Accumulate any CPU links 18971739a20eSAndy Ritger if (DRF_VAL(_NVLINK, _ARCH_CONNECTION, _CPU, pKernelNvlink->pLinkConnection[i])) 18981739a20eSAndy Ritger { 18991739a20eSAndy Ritger #if defined(INCLUDE_NVLINK_LIB) 19001739a20eSAndy Ritger // Ensure reginit has the info it needs for the remote side 19011739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.bConnected = NV_TRUE; 19021739a20eSAndy Ritger pKernelNvlink->nvlinkLinks[i].remoteEndInfo.deviceType = pKernelNvlink->forcedSysmemDeviceType; 19031739a20eSAndy Ritger #endif 19041739a20eSAndy Ritger } 19051739a20eSAndy Ritger 19061739a20eSAndy Ritger // RPC into GSP-RM to update the link remote connection status 19071739a20eSAndy Ritger status = knvlinkUpdateLinkConnectionStatus(pGpu, pKernelNvlink, i); 19081739a20eSAndy Ritger if (status != NV_OK) 19091739a20eSAndy Ritger { 19101739a20eSAndy Ritger return status; 19111739a20eSAndy Ritger } 19121739a20eSAndy Ritger } 19131739a20eSAndy Ritger 19141739a20eSAndy Ritger // Update enabledLinks mask with the mask of forced link configurations 19151739a20eSAndy Ritger pKernelNvlink->enabledLinks = pKernelNvlink->discoveredLinks & pKernelNvlink->registryLinkMask; 19161739a20eSAndy Ritger 19171739a20eSAndy Ritger return NV_OK; 19181739a20eSAndy Ritger } 19191739a20eSAndy Ritger 19201739a20eSAndy Ritger /*! 19211739a20eSAndy Ritger * @brief Sync the lane shutdown properties with GSP-RM 19221739a20eSAndy Ritger * 19231739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 19241739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 19251739a20eSAndy Ritger */ 19261739a20eSAndy Ritger NV_STATUS 19271739a20eSAndy Ritger knvlinkSyncLaneShutdownProps_IMPL 19281739a20eSAndy Ritger ( 19291739a20eSAndy Ritger OBJGPU *pGpu, 19301739a20eSAndy Ritger KernelNvlink *pKernelNvlink 19311739a20eSAndy Ritger ) 19321739a20eSAndy Ritger { 19331739a20eSAndy Ritger NV_STATUS status = NV_OK; 19341739a20eSAndy Ritger 19351739a20eSAndy Ritger NV2080_CTRL_NVLINK_SYNC_NVLINK_SHUTDOWN_PROPS_PARAMS params; 19361739a20eSAndy Ritger 19371739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 19381739a20eSAndy Ritger 19391739a20eSAndy Ritger params.bLaneShutdownEnabled = 19401739a20eSAndy Ritger pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ENABLED); 19411739a20eSAndy Ritger params.bLaneShutdownOnUnload = 19421739a20eSAndy Ritger pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ON_UNLOAD); 19431739a20eSAndy Ritger 19441739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 19451739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_SYNC_NVLINK_SHUTDOWN_PROPS, 19461739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 19471739a20eSAndy Ritger if (status != NV_OK) 19481739a20eSAndy Ritger { 19491739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to sync NVLink shutdown properties with GSP!\n"); 19501739a20eSAndy Ritger return status; 19511739a20eSAndy Ritger } 19521739a20eSAndy Ritger 19531739a20eSAndy Ritger return NV_OK; 19541739a20eSAndy Ritger } 19551739a20eSAndy Ritger 19561739a20eSAndy Ritger /*! 195790eb1077SAndy Ritger * @brief Get the number of active links allowed per IOCTRL 195890eb1077SAndy Ritger * 195990eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer 196090eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 196190eb1077SAndy Ritger * 196290eb1077SAndy Ritger * @returns On success, returns the number of active links per IOCTRL. 196390eb1077SAndy Ritger * On failure, returns 0. 196490eb1077SAndy Ritger */ 196590eb1077SAndy Ritger NvU32 196690eb1077SAndy Ritger knvlinkGetNumActiveLinksPerIoctrl_IMPL 196790eb1077SAndy Ritger ( 196890eb1077SAndy Ritger OBJGPU *pGpu, 196990eb1077SAndy Ritger KernelNvlink *pKernelNvlink 197090eb1077SAndy Ritger ) 197190eb1077SAndy Ritger { 197290eb1077SAndy Ritger NV_STATUS status; 197390eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_NUM_ACTIVE_LINK_PER_IOCTRL_PARAMS params; 197490eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 197590eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 197690eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_NUM_ACTIVE_LINK_PER_IOCTRL, 197790eb1077SAndy Ritger (void *)¶ms, sizeof(params)); 197890eb1077SAndy Ritger if (status != NV_OK) 197990eb1077SAndy Ritger { 198090eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get the number of active links per IOCTRL\n"); 198190eb1077SAndy Ritger return 0; 198290eb1077SAndy Ritger } 198390eb1077SAndy Ritger return params.numActiveLinksPerIoctrl; 198490eb1077SAndy Ritger } 198590eb1077SAndy Ritger 198690eb1077SAndy Ritger /*! 198790eb1077SAndy Ritger * @brief Get the number of total links per IOCTRL 198890eb1077SAndy Ritger * 198990eb1077SAndy Ritger * @param[in] pGpu OBJGPU pointer 199090eb1077SAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 199190eb1077SAndy Ritger * 199290eb1077SAndy Ritger * @returns On success, returns the number of total links per IOCTRL. 199390eb1077SAndy Ritger * On failure, returns 0. 199490eb1077SAndy Ritger */ 199590eb1077SAndy Ritger NvU32 199690eb1077SAndy Ritger knvlinkGetTotalNumLinksPerIoctrl_IMPL 199790eb1077SAndy Ritger ( 199890eb1077SAndy Ritger OBJGPU *pGpu, 199990eb1077SAndy Ritger KernelNvlink *pKernelNvlink 200090eb1077SAndy Ritger ) 200190eb1077SAndy Ritger { 200290eb1077SAndy Ritger NV_STATUS status; 200390eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_TOTAL_NUM_LINK_PER_IOCTRL_PARAMS params; 200490eb1077SAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 200590eb1077SAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 200690eb1077SAndy Ritger NV2080_CTRL_INTERNAL_NVLINK_GET_TOTAL_NUM_LINK_PER_IOCTRL, 200790eb1077SAndy Ritger (void *)¶ms, sizeof(params)); 200890eb1077SAndy Ritger if (status != NV_OK) 200990eb1077SAndy Ritger { 201090eb1077SAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get the total number of links per IOCTRL\n"); 201190eb1077SAndy Ritger return 0; 201290eb1077SAndy Ritger } 201390eb1077SAndy Ritger return params.numLinksPerIoctrl; 201490eb1077SAndy Ritger } 201590eb1077SAndy Ritger 20161739a20eSAndy Ritger /** 20171739a20eSAndy Ritger * @brief Process the mask of init disabled links 20181739a20eSAndy Ritger * 20191739a20eSAndy Ritger * @param[in] pGpu OBJGPU pointer 20201739a20eSAndy Ritger * @param[in] pKernelNvlink KernelNvlink pointer 20211739a20eSAndy Ritger */ 20221739a20eSAndy Ritger NV_STATUS 20231739a20eSAndy Ritger knvlinkProcessInitDisabledLinks_IMPL 20241739a20eSAndy Ritger ( 20251739a20eSAndy Ritger OBJGPU *pGpu, 20261739a20eSAndy Ritger KernelNvlink *pKernelNvlink 20271739a20eSAndy Ritger ) 20281739a20eSAndy Ritger { 20291739a20eSAndy Ritger NvU32 mask = 0; 20301739a20eSAndy Ritger NvBool bSkipHwNvlinkDisable = 0; 20311739a20eSAndy Ritger NV_STATUS status = NV_OK; 20321739a20eSAndy Ritger 20331739a20eSAndy Ritger NV2080_CTRL_NVLINK_PROCESS_INIT_DISABLED_LINKS_PARAMS params; 20341739a20eSAndy Ritger 20351739a20eSAndy Ritger status = gpumgrGetGpuInitDisabledNvlinks(pGpu->gpuId, &mask, &bSkipHwNvlinkDisable); 20361739a20eSAndy Ritger if (status != NV_OK) 20371739a20eSAndy Ritger { 20381739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to get init disabled links from gpumgr\n"); 20391739a20eSAndy Ritger return status; 20401739a20eSAndy Ritger } 20411739a20eSAndy Ritger 20421739a20eSAndy Ritger portMemSet(¶ms, 0, sizeof(params)); 20431739a20eSAndy Ritger 20441739a20eSAndy Ritger params.initDisabledLinksMask = mask; 20451739a20eSAndy Ritger params.bSkipHwNvlinkDisable = bSkipHwNvlinkDisable; 20461739a20eSAndy Ritger 20471739a20eSAndy Ritger status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 20481739a20eSAndy Ritger NV2080_CTRL_CMD_NVLINK_PROCESS_INIT_DISABLED_LINKS, 20491739a20eSAndy Ritger (void *)¶ms, sizeof(params)); 20501739a20eSAndy Ritger if (status != NV_OK) 20511739a20eSAndy Ritger { 20521739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to process init disabled links in GSP\n"); 20531739a20eSAndy Ritger return status; 20541739a20eSAndy Ritger } 20551739a20eSAndy Ritger 20561739a20eSAndy Ritger pKernelNvlink->initDisabledLinksMask = params.initDisabledLinksMask; 20571739a20eSAndy Ritger 20581739a20eSAndy Ritger return NV_OK; 20591739a20eSAndy Ritger } 20601739a20eSAndy Ritger 20611739a20eSAndy Ritger // Grab GPU locks before RPCing into GSP-RM for NVLink RPCs 20621739a20eSAndy Ritger NV_STATUS 20631739a20eSAndy Ritger knvlinkExecGspRmRpc_IMPL 20641739a20eSAndy Ritger ( 20651739a20eSAndy Ritger OBJGPU *pGpu, 20661739a20eSAndy Ritger KernelNvlink *pKernelNvlink, 20671739a20eSAndy Ritger NvU32 cmd, 20681739a20eSAndy Ritger void *paramAddr, 20691739a20eSAndy Ritger NvU32 paramSize 20701739a20eSAndy Ritger ) 20711739a20eSAndy Ritger { 20721739a20eSAndy Ritger NvU32 gpuMaskRelease = 0; 20731739a20eSAndy Ritger NvU32 gpuMaskInitial = rmGpuLocksGetOwnedMask(); 20741739a20eSAndy Ritger NvU32 gpuMask = gpuMaskInitial | NVBIT(pGpu->gpuInstance); 20751739a20eSAndy Ritger NV_STATUS status = NV_OK; 20761739a20eSAndy Ritger 20771739a20eSAndy Ritger if (IS_GSP_CLIENT(pGpu)) 20781739a20eSAndy Ritger { 20791739a20eSAndy Ritger if (!rmGpuGroupLockIsOwner(pGpu->gpuInstance, GPU_LOCK_GRP_MASK, &gpuMask)) 20801739a20eSAndy Ritger { 20811739a20eSAndy Ritger status = rmGpuGroupLockAcquire(pGpu->gpuInstance, 20821739a20eSAndy Ritger GPU_LOCK_GRP_MASK, 20831739a20eSAndy Ritger GPU_LOCK_FLAGS_SAFE_LOCK_UPGRADE, 20841739a20eSAndy Ritger RM_LOCK_MODULES_NVLINK, 20851739a20eSAndy Ritger &gpuMask); 20861739a20eSAndy Ritger if (status != NV_OK) 20871739a20eSAndy Ritger { 20881739a20eSAndy Ritger NV_PRINTF(LEVEL_ERROR, "Failed to acquire locks for gpumask 0x%x\n", gpuMask); 20891739a20eSAndy Ritger return status; 20901739a20eSAndy Ritger } 20911739a20eSAndy Ritger 20921739a20eSAndy Ritger gpuMaskRelease = (gpuMask & (~gpuMaskInitial)); 20931739a20eSAndy Ritger } 20941739a20eSAndy Ritger } 20951739a20eSAndy Ritger 20961739a20eSAndy Ritger RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); 20971739a20eSAndy Ritger status = pRmApi->Control(pRmApi, 20981739a20eSAndy Ritger pGpu->hInternalClient, 20991739a20eSAndy Ritger pGpu->hInternalSubdevice, 21001739a20eSAndy Ritger cmd, paramAddr, paramSize); 21011739a20eSAndy Ritger if (gpuMaskRelease) 21021739a20eSAndy Ritger { 21031739a20eSAndy Ritger rmGpuGroupLockRelease(gpuMaskRelease, GPUS_LOCK_FLAGS_NONE); 21041739a20eSAndy Ritger } 21051739a20eSAndy Ritger 21061739a20eSAndy Ritger return status; 21071739a20eSAndy Ritger } 21081739a20eSAndy Ritger 21091739a20eSAndy Ritger void 21101739a20eSAndy Ritger knvlinkUtoa(NvU8 *str, NvU64 length, NvU64 val) 21111739a20eSAndy Ritger { 21121739a20eSAndy Ritger NvU8 temp[NV2080_GPU_MAX_NAME_STRING_LENGTH]; 21131739a20eSAndy Ritger NvU8 *ptr = temp; 21141739a20eSAndy Ritger NvU64 i = 0; 21151739a20eSAndy Ritger 21161739a20eSAndy Ritger NV_ASSERT(str != NULL); 21171739a20eSAndy Ritger 21181739a20eSAndy Ritger do 21191739a20eSAndy Ritger { 21201739a20eSAndy Ritger i = val % 10; 21211739a20eSAndy Ritger val = val / 10; 21221739a20eSAndy Ritger *ptr++ = (NvU8)(i + '0'); 21231739a20eSAndy Ritger } while(val); 21241739a20eSAndy Ritger 21251739a20eSAndy Ritger NV_ASSERT(length > (NvU64) (ptr - temp)); 21261739a20eSAndy Ritger 21271739a20eSAndy Ritger while (ptr > temp) 21281739a20eSAndy Ritger *str++ = *--ptr; 21291739a20eSAndy Ritger 21301739a20eSAndy Ritger *str = '\0'; 21311739a20eSAndy Ritger } 2132