1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 * SPDX-License-Identifier: MIT 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include "os/os.h" 25 #include "kernel/gpu/nvlink/kernel_nvlink.h" 26 #include "kernel/gpu/nvlink/kernel_ioctrl.h" 27 #include "gpu/gpu.h" 28 #include "gpu/mem_mgr/mem_mgr.h" 29 #include "nverror.h" 30 #include "objtmr.h" 31 #include "gpu_mgr/gpu_mgr.h" 32 33 /*! 
34 * @brief Check if ALI is supported for the given device 35 * 36 * @param[in] pGpu OBJGPU pointer 37 * @param[in] pKernelNvlink KernelNvlink pointer 38 */ 39 NV_STATUS 40 knvlinkIsAliSupported_GH100 41 ( 42 OBJGPU *pGpu, 43 KernelNvlink *pKernelNvlink 44 ) 45 { 46 NvU32 status = NV_OK; 47 48 NV2080_CTRL_NVLINK_GET_ALI_ENABLED_PARAMS params; 49 50 portMemSet(¶ms, 0, sizeof(params)); 51 52 // Initialize to default settings 53 params.bEnableAli = NV_FALSE; 54 55 status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 56 NV2080_CTRL_CMD_NVLINK_GET_ALI_ENABLED, 57 (void *)¶ms, sizeof(params)); 58 if (status != NV_OK) 59 { 60 NV_PRINTF(LEVEL_ERROR, "Failed to get ALI enablement status!\n"); 61 return status; 62 } 63 64 pKernelNvlink->bEnableAli = params.bEnableAli; 65 66 return status; 67 } 68 69 /*! 70 * @brief Validates fabric base address. 71 * 72 * @param[in] pGpu OBJGPU pointer 73 * @param[in] pKernelNvlink KernelNvlink pointer 74 * @param[in] fabricBaseAddr Address to be validated 75 * 76 * @returns On success, NV_OK. 77 * On failure, returns NV_ERR_XXX. 78 */ 79 NV_STATUS 80 knvlinkValidateFabricBaseAddress_GH100 81 ( 82 OBJGPU *pGpu, 83 KernelNvlink *pKernelNvlink, 84 NvU64 fabricBaseAddr 85 ) 86 { 87 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu); 88 NvU64 fbSizeBytes; 89 90 fbSizeBytes = pMemoryManager->Ram.fbTotalMemSizeMb << 20; 91 92 // 93 // Hopper SKUs will be paired with NVSwitches (Limerock-next) supporting 2K 94 // mapslots that can cover 512GB each. Make sure that the fabric base 95 // address being used is valid to cover whole frame buffer. 96 // 97 98 // Check if fabric address is aligned to mapslot size. 99 if (fabricBaseAddr & (NVBIT64(39) - 1)) 100 { 101 return NV_ERR_INVALID_ARGUMENT; 102 } 103 104 // Align fbSize to mapslot size. 105 fbSizeBytes = RM_ALIGN_UP(fbSizeBytes, NVBIT64(39)); 106 107 return NV_OK; 108 } 109 110 /*! 
111 * @brief Do post setup on nvlink peers 112 * 113 * @param[in] pGpu OBJGPU pointer 114 * @param[in] pKernelNvlink KernelNvlink pointer 115 */ 116 NV_STATUS 117 knvlinkPostSetupNvlinkPeer_GH100 118 ( 119 OBJGPU *pGpu, 120 KernelNvlink *pKernelNvlink 121 ) 122 { 123 NvU32 status = NV_OK; 124 NV2080_CTRL_NVLINK_POST_SETUP_NVLINK_PEER_PARAMS postSetupNvlinkPeerParams; 125 126 portMemSet(&postSetupNvlinkPeerParams, 0, sizeof(postSetupNvlinkPeerParams)); 127 128 postSetupNvlinkPeerParams.peerMask = (1 << NVLINK_MAX_PEERS_SW) - 1; 129 130 status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 131 NV2080_CTRL_CMD_NVLINK_POST_SETUP_NVLINK_PEER, 132 (void *)&postSetupNvlinkPeerParams, 133 sizeof(postSetupNvlinkPeerParams)); 134 if (status != NV_OK) 135 { 136 NV_PRINTF(LEVEL_ERROR, 137 "Failed to program post active settings and bufferready!\n"); 138 return status; 139 } 140 141 return status; 142 } 143 144 /*! 145 * @brief Discover all links that are training or have been 146 * trained on both GPUs 147 * 148 * @param[in] pGpu OBJGPU pointer for local GPU 149 * @param[in] pKernelNvlink KernelNvlink pointer 150 * @param[in] pPeerGpu OBJGPU pointer for remote GPU 151 * 152 * @return NV_OK if links are detected to be training 153 */ 154 NV_STATUS 155 knvlinkDiscoverPostRxDetLinks_GH100 156 ( 157 OBJGPU *pGpu, 158 KernelNvlink *pKernelNvlink, 159 OBJGPU *pPeerGpu 160 ) 161 { 162 NV_STATUS status = NV_ERR_NOT_SUPPORTED; 163 164 #if defined(INCLUDE_NVLINK_LIB) 165 166 OBJGPU *pGpu0 = pGpu; 167 OBJGPU *pGpu1 = pPeerGpu; 168 KernelNvlink *pKernelNvlink0 = GPU_GET_KERNEL_NVLINK(pGpu0); 169 KernelNvlink *pKernelNvlink1 = NULL; 170 171 if (pGpu1 == NULL) 172 { 173 NV_PRINTF(LEVEL_ERROR, "Invalid pPeerGpu.\n"); 174 175 return NV_ERR_INVALID_ARGUMENT; 176 } 177 else if ((pGpu0 == pGpu1) && 178 (pGpu0->getProperty(pGpu0, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED))) 179 { 180 // P2P over loopback links are disabled through regkey overrides 181 NV_PRINTF(LEVEL_INFO, "loopback P2P on GPU%u 
disabled by regkey\n", 182 gpuGetInstance(pGpu0)); 183 184 return NV_ERR_NOT_SUPPORTED; 185 } 186 else 187 { 188 pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1); 189 } 190 191 if (pKernelNvlink1 == NULL) 192 { 193 NV_PRINTF(LEVEL_ERROR, 194 "Input mask contains a GPU on which NVLink is disabled.\n"); 195 196 return NV_ERR_INVALID_ARGUMENT; 197 } 198 199 if ((IS_RTLSIM(pGpu) && !pKernelNvlink0->bForceEnableCoreLibRtlsims) || 200 (pKernelNvlink0->pNvlinkDev == NULL) || 201 !pKernelNvlink0->bEnableAli || 202 (pKernelNvlink1->pNvlinkDev == NULL) || 203 !pKernelNvlink1->bEnableAli) 204 { 205 NV_PRINTF(LEVEL_INFO, 206 "Not in ALI, checking PostRxDetLinks not supported.\n"); 207 return NV_ERR_NOT_SUPPORTED; 208 } 209 210 // 211 // Initialize Mask of links that have made it past RxDet to 0 then 212 // request to get all links from the given GPU that have gotted past RxDet 213 // 214 pKernelNvlink0->postRxDetLinkMask = 0; 215 status = knvlinkUpdatePostRxDetectLinkMask(pGpu0, pKernelNvlink0); 216 if(status != NV_OK) 217 { 218 NV_PRINTF(LEVEL_ERROR, 219 "Getting peer0's postRxDetLinkMask failed!\n"); 220 return NV_ERR_INVALID_STATE; 221 } 222 223 // Only query if we are not in loopback 224 if (pKernelNvlink0 != pKernelNvlink1) 225 { 226 pKernelNvlink1->postRxDetLinkMask = 0; 227 status = knvlinkUpdatePostRxDetectLinkMask(pGpu1, pKernelNvlink1); 228 if(status != NV_OK) 229 { 230 NV_PRINTF(LEVEL_ERROR, 231 "Getting peer1's postRxDetLinkMask failed!\n"); 232 return NV_ERR_INVALID_STATE; 233 } 234 } 235 236 // 237 // If the current gpu has no actively training or trained link OR 238 // if the peer gpu has no actively training or trained links then 239 // return an error. If either side has 0 links passed RxDet then 240 // there is no chance that we will find links connecting the devices 241 // further into discovery. 
242 // 243 if(pKernelNvlink0->postRxDetLinkMask == 0 || 244 pKernelNvlink1->postRxDetLinkMask == 0) 245 { 246 NV_PRINTF(LEVEL_ERROR, "Got 0 post RxDet Links!"); 247 return NV_ERR_NOT_READY; 248 } 249 250 #endif 251 252 return status; 253 } 254 255 NV_STATUS 256 ioctrlFaultUpTmrHandler 257 ( 258 OBJGPU *pGpu, 259 OBJTMR *pTmr, 260 TMR_EVENT *pEvent 261 ) 262 { 263 //NvU32 linkId = *(NvU32*)pData; 264 NV_STATUS status = NV_OK; 265 KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu); 266 NV2080_CTRL_NVLINK_POST_FAULT_UP_PARAMS *nvlinkPostFaultUpParams 267 = portMemAllocNonPaged(sizeof(NV2080_CTRL_NVLINK_POST_FAULT_UP_PARAMS)); 268 PNVLINK_ID pFaultLink; 269 pFaultLink = listHead(&pKernelNvlink->faultUpLinks); 270 271 nvlinkPostFaultUpParams->linkId = pFaultLink->linkId; 272 status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 273 NV2080_CTRL_CMD_NVLINK_POST_FAULT_UP, 274 (void *)nvlinkPostFaultUpParams, 275 sizeof(NV2080_CTRL_NVLINK_POST_FAULT_UP_PARAMS)); 276 277 if (status != NV_OK) 278 { 279 NV_PRINTF(LEVEL_ERROR, "Failed to send Faultup RPC\n"); 280 } 281 282 listRemove(&pKernelNvlink->faultUpLinks, pFaultLink); 283 portMemFree(nvlinkPostFaultUpParams); 284 285 return status; 286 } 287 288 NV_STATUS 289 knvlinkHandleFaultUpInterrupt_GH100 290 ( 291 OBJGPU *pGpu, 292 KernelNvlink *pKernelNvlink, 293 NvU32 linkId 294 ) 295 { 296 OBJTMR *pTmr = GPU_GET_TIMER(pGpu); 297 PNVLINK_ID pFaultLink; 298 NV_STATUS status = NV_OK; 299 300 pFaultLink = listAppendNew(&pKernelNvlink->faultUpLinks); 301 NV_ASSERT_OR_RETURN(pFaultLink != NULL, NV_ERR_GENERIC); 302 pFaultLink->linkId = linkId; 303 304 status = tmrEventScheduleRel(pTmr, pKernelNvlink->nvlinkLinks[linkId].pTmrEvent, NVLINK_RETRAIN_TIME); 305 if (status != NV_OK) 306 { 307 NV_PRINTF(LEVEL_ERROR, "GPU (ID: %d) tmrEventScheduleRel failed for linkid %d\n", 308 gpuGetInstance(pGpu), linkId); 309 return NV_ERR_GENERIC; 310 } 311 312 return status; 313 } 314 315 NV_STATUS 316 knvlinkLogAliDebugMessages_GH100 317 ( 318 
OBJGPU *pGpu, 319 KernelNvlink *pKernelNvlink 320 ) 321 { 322 NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS *nvlinkErrInfoParams = portMemAllocNonPaged(sizeof(NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS)); 323 portMemSet(nvlinkErrInfoParams, 0, sizeof(NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS)); 324 nvlinkErrInfoParams->ErrInfoFlags |= NV2080_CTRL_NVLINK_ERR_INFO_FLAGS_ALI_STATUS; 325 NvU32 i; 326 // This is a Physical, Hopper specific HAL for debug purposes. 327 NV_STATUS status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 328 NV2080_CTRL_CMD_NVLINK_GET_ERR_INFO, 329 (void *)nvlinkErrInfoParams, 330 sizeof(NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS)); 331 if (status != NV_OK) 332 { 333 NV_PRINTF(LEVEL_ERROR, "Error getting debug info for link training!\n"); 334 portMemFree(nvlinkErrInfoParams); 335 return status; 336 } 337 338 FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->postRxDetLinkMask) 339 { 340 nvErrorLog_va((void *)pGpu, ALI_TRAINING_FAIL, 341 "NVLink: Link training failed for link %u", 342 "(0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)", 343 i, 344 nvlinkErrInfoParams->linkErrInfo[i].NVLIPTLnkCtrlLinkStateRequest, 345 nvlinkErrInfoParams->linkErrInfo[i].NVLDLRxSlsmErrCntl, 346 nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopLinkState, 347 nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopIntr, 348 nvlinkErrInfoParams->linkErrInfo[i].DLStatMN00, 349 nvlinkErrInfoParams->linkErrInfo[i].DLStatUC01, 350 nvlinkErrInfoParams->linkErrInfo[i].MinionNvlinkLinkIntr); 351 352 if (pKernelNvlink->bLinkTrainingDebugSpew) 353 NV_PRINTF(LEVEL_ERROR,"ALI Error for GPU %d::linkId %d:" 354 "\nNVLIPT:\n\tCTRL_LINK_STATE_REQUEST_STATUS = %X\n" 355 "\nNVLDL :\n\tNV_NVLDL_RXSLSM_ERR_CNTL = %X\n" 356 "\n\tNV_NVLDL_TOP_LINK_STATE = %X\n" 357 "\n\tNV_NVLDL_TOP_INTR = %X\n" 358 "\nMINION DLSTAT:\n\tDLSTAT MN00 = %X\n" 359 "\n\tDLSTAT UC01 = %X\n" 360 "\n\tNV_MINION_NVLINK_LINK_INTR = %X\n", 361 pGpu->gpuInstance, i, 362 nvlinkErrInfoParams->linkErrInfo[i].NVLIPTLnkCtrlLinkStateRequest, 363 
nvlinkErrInfoParams->linkErrInfo[i].NVLDLRxSlsmErrCntl, 364 nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopLinkState, 365 nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopIntr, 366 nvlinkErrInfoParams->linkErrInfo[i].DLStatMN00, 367 nvlinkErrInfoParams->linkErrInfo[i].DLStatUC01, 368 nvlinkErrInfoParams->linkErrInfo[i].MinionNvlinkLinkIntr); 369 } 370 FOR_EACH_INDEX_IN_MASK_END; 371 portMemFree(nvlinkErrInfoParams); 372 return NV_OK; 373 } 374 375 /** 376 * @brief Check if the nvlink bandwidth setting is OFF 377 * 378 * @param[in] pKernelNvlink reference of KernelNvlink 379 */ 380 NvBool 381 knvlinkIsBandwidthModeOff_GH100 382 ( 383 KernelNvlink *pKernelNvlink 384 ) 385 { 386 return (gpumgrGetGpuNvlinkBwMode() == GPU_NVLINK_BW_MODE_OFF); 387 } 388 389 /** 390 * @brief Calculate the number of active nvlinks needs to be reduced 391 * for direct connect GPU system 392 * 393 * @param[in] pKernelNvlink reference of KernelNvlink 394 */ 395 NvU32 396 knvlinkGetNumLinksToBeReducedPerIoctrl_GH100 397 ( 398 KernelNvlink *pKernelNvlink 399 ) 400 { 401 NvU32 numlinks = 0; 402 NvU8 mode; 403 404 #if defined(INCLUDE_NVLINK_LIB) 405 numlinks = pKernelNvlink->pNvlinkDev->numActiveLinksPerIoctrl; 406 #endif 407 408 if (numlinks == 0) 409 goto out; 410 411 mode = gpumgrGetGpuNvlinkBwMode(); 412 413 switch (mode) 414 { 415 case GPU_NVLINK_BW_MODE_OFF: 416 NV_PRINTF(LEVEL_ERROR, "Cannot reach here %s %d mode=%d\n", 417 __func__, __LINE__, mode); 418 NV_ASSERT(0); 419 break; 420 case GPU_NVLINK_BW_MODE_MIN: 421 numlinks = numlinks - 1; // At least one is ative at this point. 
            break;
        case GPU_NVLINK_BW_MODE_HALF:
            // Drop half of the active links.
            numlinks = numlinks / 2;
            break;
        case GPU_NVLINK_BW_MODE_3QUARTER:
            // Keep three quarters: drop one quarter of the active links.
            numlinks = numlinks / 4;
            break;
        default: // Treat as GPU_NVLINK_BW_MODE_FULL
            numlinks = 0;
            break;
    }

out:
    return numlinks;
}

/**
 * @brief Calculate the effective peer link mask for HS_HUB configuration
 *
 * Trims the per-peer link mask according to the current bandwidth mode,
 * removing the same number of links from each IOCTRL on both endpoints.
 * Direct-connect only; NVSwitch systems are left untouched.
 *
 * @param[in]     pGpu           OBJGPU pointer of local GPU
 * @param[in]     pKernelNvlink  reference of KernelNvlink
 * @param[in]     pRemoteGpu     OBJGPU pointer of remote GPU
 * @param[in/out] pPeerLinkMask  reference of peerLinkMask
 */
void
knvlinkGetEffectivePeerLinkMask_GH100
(
    OBJGPU       *pGpu,
    KernelNvlink *pKernelNvlink,
    OBJGPU       *pRemoteGpu,
    NvU32        *pPeerLinkMask
)
{
    NvU32 peerLinkMask, remotePeerLinkMask, effectivePeerLinkMask, peerLinkMaskPerIoctrl;
    NvU32 gpuInstance, remoteGpuInstance;
    NvU32 numLinksPerIoctrl, numIoctrls;
    KernelNvlink *pRemoteKernelNvlink;
    NvU32 numLinksToBeReduced;
    NvU32 linkId, count, i;

    gpuInstance       = gpuGetInstance(pGpu);
    remoteGpuInstance = gpuGetInstance(pRemoteGpu);

    // Do not support NVSwitch systems for now.
    if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink))
    {
        return;
    }

    // No links to this peer: nothing to trim.
    peerLinkMask = pKernelNvlink->peerLinkMasks[remoteGpuInstance];
    if (peerLinkMask == 0)
    {
        return;
    }

    //
    // No need to check if remotePeerLinkMask and peerLinkMask are equal because
    // RM will not enable P2P otherwise. Given that we have reached here means
    // the masks must be equal.
    //
    pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
    remotePeerLinkMask = pRemoteKernelNvlink->peerLinkMasks[gpuInstance];
    NV_ASSERT(nvPopCount32(remotePeerLinkMask) == nvPopCount32(peerLinkMask));

    // Find out number of active NVLinks between the two GPUs.
    numLinksToBeReduced = knvlinkGetNumLinksToBeReducedPerIoctrl_HAL(pKernelNvlink);
    effectivePeerLinkMask = peerLinkMask;

    // Full bandwidth requested: leave the mask as-is.
    if (numLinksToBeReduced == 0)
    {
        return;
    }

    // Start reducing effectivePeerLinkMask...

    //
    // To have deterministic approach, if local GPU ID is less than remote GPU
    // ID, always trim peerLinkMask from the perspective of local GPU.
    // Otherwise, use remote GPU for the same.
    //
#if defined(INCLUDE_NVLINK_LIB)
    numIoctrls        = pKernelNvlink->pNvlinkDev->numIoctrls;
    numLinksPerIoctrl = pKernelNvlink->pNvlinkDev->numLinksPerIoctrl;
#else
    numIoctrls        = 0;
    numLinksPerIoctrl = 0;
#endif

    if (pGpu->gpuId < pRemoteGpu->gpuId)
    {
        // Local perspective: clear the first numLinksToBeReduced links of
        // each IOCTRL's slice of peerLinkMask.
        for (i = 0; i < numIoctrls; i++)
        {
            count = 0;
            peerLinkMaskPerIoctrl = peerLinkMask &
                (((1 << numLinksPerIoctrl) - 1) << (i * numLinksPerIoctrl));

            FOR_EACH_INDEX_IN_MASK(32, linkId, peerLinkMaskPerIoctrl)
            {
                if (count == numLinksToBeReduced)
                {
                    break;
                }

                effectivePeerLinkMask &= (~NVBIT(linkId));
                count++;
            }
            FOR_EACH_INDEX_IN_MASK_END;
        }
    }
    else
    {
        // Remote perspective: walk the remote mask, but clear the link
        // numbers as seen from the local end of each connection.
        for (i = 0; i < numIoctrls; i++)
        {
            count = 0;
            peerLinkMaskPerIoctrl = remotePeerLinkMask &
                (((1 << numLinksPerIoctrl) - 1) << (i * numLinksPerIoctrl));

            FOR_EACH_INDEX_IN_MASK(32, linkId, peerLinkMaskPerIoctrl)
            {
                if (count == numLinksToBeReduced)
                {
                    break;
                }

#if defined(INCLUDE_NVLINK_LIB)
                effectivePeerLinkMask &=
                    (~NVBIT(pRemoteKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber));
#endif
                count++;
            }
            FOR_EACH_INDEX_IN_MASK_END;
        }
    }

    //
    // effectivePeerLinkMask can never be zero, otherwise we create inconsistent
    // HW/SW state, where we say that NVLink P2P is supported, but we don't
    // program HSHUB.
    //
    // So, if not enough NVLinks are present, then drop effectivePeerLinkMask.
    //
    *pPeerLinkMask = (effectivePeerLinkMask > 0) ? effectivePeerLinkMask : peerLinkMask;
}

/*!
 * @brief Set unique fabric address for NVSwitch enabled systems.
 *
 * @param[in] pGpu            OBJGPU pointer
 * @param[in] pKernelNvlink   KernelNvlink pointer
 * @param[in] fabricBaseAddr  Fabric Address to set
 *
 * @returns On success, sets unique fabric address and returns NV_OK.
 *          On failure, returns NV_ERR_XXX.
 */
NV_STATUS
knvlinkSetUniqueFabricBaseAddress_GH100
(
    OBJGPU       *pGpu,
    KernelNvlink *pKernelNvlink,
    NvU64         fabricBaseAddr
)
{
    NV_STATUS status = NV_OK;

    // Reject misaligned or out-of-range addresses up front.
    status = knvlinkValidateFabricBaseAddress_HAL(pGpu, pKernelNvlink,
                                                  fabricBaseAddr);
    if (status != NV_OK)
    {
        NV_PRINTF(LEVEL_ERROR, "Fabric addr validation failed for GPU %x\n",
                  pGpu->gpuInstance);
        return status;
    }

    if (IsSLIEnabled(pGpu))
    {
        NV_PRINTF(LEVEL_ERROR,
                  "Operation is unsupported on SLI enabled GPU %x\n",
                  pGpu->gpuInstance);
        return NV_ERR_NOT_SUPPORTED;
    }

    // Re-assigning the identical address is an idempotent no-op.
    if (pKernelNvlink->fabricBaseAddr == fabricBaseAddr)
    {
        NV_PRINTF(LEVEL_INFO,
                  "The same fabric addr is being re-assigned to GPU %x\n",
                  pGpu->gpuInstance);
        return NV_OK;
    }

    // A different address is already set; it cannot be changed while in use.
    if (pKernelNvlink->fabricBaseAddr != NVLINK_INVALID_FABRIC_ADDR)
    {
        NV_PRINTF(LEVEL_ERROR, "Fabric addr is already assigned to GPU %x\n",
                  pGpu->gpuInstance);
        return NV_ERR_STATE_IN_USE;
    }

    pKernelNvlink->fabricBaseAddr = fabricBaseAddr;

    NV_PRINTF(LEVEL_INFO, "Fabric base addr %llx is assigned to GPU %x\n",
              pKernelNvlink->fabricBaseAddr, pGpu->gpuInstance);

    return NV_OK;
}

/*!
627 * @brief Check if system has enough active NVLinks and 628 * enough NVLink bridges 629 * 630 * @param[in] pGpu OBJGPU pointer 631 * @param[in] pKernelNvlink KernelNvlink pointer 632 * 633 */ 634 void 635 knvlinkDirectConnectCheck_GH100 636 ( 637 OBJGPU *pGpu, 638 KernelNvlink *pKernelNvlink 639 ) 640 { 641 NV2080_CTRL_NVLINK_DIRECT_CONNECT_CHECK_PARAMS params = {0}; 642 643 knvlinkExecGspRmRpc(pGpu, pKernelNvlink, 644 NV2080_CTRL_CMD_NVLINK_DIRECT_CONNECT_CHECK, 645 (void *)¶ms, 646 sizeof(params)); 647 } 648 649 /*! 650 * @brief Check if floorsweeping is needed for this particular chip 651 * 652 * @param[in] pGpu OBJGPU pointer 653 * @param[in] pKernelNvlink KernelNvlink pointer 654 * 655 * @returns On success, sets unique fabric address and returns NV_OK. 656 * On failure, returns NV_ERR_XXX. 657 */ 658 NvBool 659 knvlinkIsFloorSweepingNeeded_GH100 660 ( 661 OBJGPU *pGpu, 662 KernelNvlink *pKernelNvlink, 663 NvU32 numActiveLinksPerIoctrl, 664 NvU32 numLinksPerIoctrl 665 ) 666 { 667 668 // 669 // Only floorsweep down the given GPU if the following conditions are met: 670 // 1. if the number of links for the IP is > 0 671 // 672 // 2. The number of active links allowed for the IOCTRL is less then the 673 // total number of links for the IOCTRL. No reason to spend time in code 674 // if the exectution of it will be a NOP 675 // 676 // 3. If the GPU has never been floorswept. An optimization to make sure RM 677 // doesn't burn cycles repeatedly running running code that will be a NOP 678 // 679 // 4. (temporary) Run only on Silicon chips. Fmodel currently doesn't support 680 // this feature 681 // 682 683 if ((numLinksPerIoctrl > 0 && numActiveLinksPerIoctrl > 0) && 684 numActiveLinksPerIoctrl < numLinksPerIoctrl && 685 !pKernelNvlink->bFloorSwept && 686 IS_SILICON(pGpu)) 687 { 688 return NV_TRUE; 689 } 690 691 return NV_FALSE; 692 } 693 694