1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "os/os.h"
25 #include "kernel/gpu/nvlink/kernel_nvlink.h"
26 #include "kernel/gpu/nvlink/kernel_ioctrl.h"
27 #include "gpu/gpu.h"
28 #include "gpu/mem_mgr/mem_mgr.h"
29 #include "nverror.h"
30 #include "objtmr.h"
31 #include "gpu_mgr/gpu_mgr.h"
32 
33 /*!
34  * @brief Check if ALI is supported for the given device
35  *
36  * @param[in]  pGpu           OBJGPU pointer
37  * @param[in]  pKernelNvlink  KernelNvlink pointer
38  */
39 NV_STATUS
40 knvlinkIsAliSupported_GH100
41 (
42     OBJGPU       *pGpu,
43     KernelNvlink *pKernelNvlink
44 )
45 {
46     NvU32 status = NV_OK;
47 
48     NV2080_CTRL_NVLINK_GET_ALI_ENABLED_PARAMS params;
49 
50     portMemSet(&params, 0, sizeof(params));
51 
52     // Initialize to default settings
53     params.bEnableAli = NV_FALSE;
54 
55     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
56                                  NV2080_CTRL_CMD_NVLINK_GET_ALI_ENABLED,
57                                  (void *)&params, sizeof(params));
58     if (status != NV_OK)
59     {
60         NV_PRINTF(LEVEL_ERROR, "Failed to get ALI enablement status!\n");
61         return status;
62     }
63 
64     pKernelNvlink->bEnableAli = params.bEnableAli;
65 
66     return status;
67 }
68 
69 /*!
70  * @brief   Validates fabric base address.
71  *
72  * @param[in]  pGpu           OBJGPU pointer
73  * @param[in]  pKernelNvlink  KernelNvlink pointer
74  * @param[in]  fabricBaseAddr Address to be validated
75  *
76  * @returns On success, NV_OK.
77  *          On failure, returns NV_ERR_XXX.
78  */
79 NV_STATUS
80 knvlinkValidateFabricBaseAddress_GH100
81 (
82     OBJGPU       *pGpu,
83     KernelNvlink *pKernelNvlink,
84     NvU64         fabricBaseAddr
85 )
86 {
87     MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
88     NvU64          fbSizeBytes;
89 
90     fbSizeBytes = pMemoryManager->Ram.fbTotalMemSizeMb << 20;
91 
92     //
93     // Hopper SKUs will be paired with NVSwitches (Limerock-next) supporting 2K
94     // mapslots that can cover 512GB each. Make sure that the fabric base
95     // address being used is valid to cover whole frame buffer.
96     //
97 
98     // Check if fabric address is aligned to mapslot size.
99     if (fabricBaseAddr & (NVBIT64(39) - 1))
100     {
101         return NV_ERR_INVALID_ARGUMENT;
102     }
103 
104     // Align fbSize to mapslot size.
105     fbSizeBytes = RM_ALIGN_UP(fbSizeBytes, NVBIT64(39));
106 
107     return NV_OK;
108 }
109 
110 /*!
111  * @brief Do post setup on nvlink peers
112  *
113  * @param[in] pGpu           OBJGPU pointer
114  * @param[in] pKernelNvlink  KernelNvlink pointer
115  */
116 NV_STATUS
117 knvlinkPostSetupNvlinkPeer_GH100
118 (
119     OBJGPU       *pGpu,
120     KernelNvlink *pKernelNvlink
121 )
122 {
123     NvU32 status = NV_OK;
124     NV2080_CTRL_NVLINK_POST_SETUP_NVLINK_PEER_PARAMS postSetupNvlinkPeerParams;
125 
126     portMemSet(&postSetupNvlinkPeerParams, 0, sizeof(postSetupNvlinkPeerParams));
127 
128     postSetupNvlinkPeerParams.peerMask = (1 << NVLINK_MAX_PEERS_SW) - 1;
129 
130     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
131                                  NV2080_CTRL_CMD_NVLINK_POST_SETUP_NVLINK_PEER,
132                                  (void *)&postSetupNvlinkPeerParams,
133                                  sizeof(postSetupNvlinkPeerParams));
134     if (status != NV_OK)
135     {
136         NV_PRINTF(LEVEL_ERROR,
137                   "Failed to program post active settings and bufferready!\n");
138         return status;
139     }
140 
141     return status;
142 }
143 
144 /*!
145  * @brief Discover all links that are training or have been
146  *        trained on both GPUs
147  *
148  * @param[in] pGpu           OBJGPU pointer for local GPU
149  * @param[in] pKernelNvlink  KernelNvlink pointer
150  * @param[in] pPeerGpu       OBJGPU pointer for remote GPU
151  *
152  * @return  NV_OK if links are detected to be training
153  */
154 NV_STATUS
155 knvlinkDiscoverPostRxDetLinks_GH100
156 (
157     OBJGPU       *pGpu,
158     KernelNvlink *pKernelNvlink,
159     OBJGPU       *pPeerGpu
160 )
161 {
162     NV_STATUS status = NV_ERR_NOT_SUPPORTED;
163 
164 #if defined(INCLUDE_NVLINK_LIB)
165 
166     OBJGPU       *pGpu0          = pGpu;
167     OBJGPU       *pGpu1          = pPeerGpu;
168     KernelNvlink *pKernelNvlink0 = GPU_GET_KERNEL_NVLINK(pGpu0);
169     KernelNvlink *pKernelNvlink1 = NULL;
170 
171     if (pGpu1 == NULL)
172     {
173         NV_PRINTF(LEVEL_ERROR, "Invalid pPeerGpu.\n");
174 
175         return NV_ERR_INVALID_ARGUMENT;
176     }
177     else if ((pGpu0 == pGpu1) &&
178              (pGpu0->getProperty(pGpu0, PDB_PROP_GPU_NVLINK_P2P_LOOPBACK_DISABLED)))
179     {
180         // P2P over loopback links are disabled through regkey overrides
181         NV_PRINTF(LEVEL_INFO, "loopback P2P on GPU%u disabled by regkey\n",
182                   gpuGetInstance(pGpu0));
183 
184         return NV_ERR_NOT_SUPPORTED;
185     }
186     else
187     {
188         pKernelNvlink1 = GPU_GET_KERNEL_NVLINK(pGpu1);
189     }
190 
191     if (pKernelNvlink1 == NULL)
192     {
193         NV_PRINTF(LEVEL_ERROR,
194                   "Input mask contains a GPU on which NVLink is disabled.\n");
195 
196         return NV_ERR_INVALID_ARGUMENT;
197     }
198 
199     if ((IS_RTLSIM(pGpu) && !pKernelNvlink0->bForceEnableCoreLibRtlsims) ||
200         (pKernelNvlink0->pNvlinkDev == NULL)                             ||
201         !pKernelNvlink0->bEnableAli                                      ||
202         (pKernelNvlink1->pNvlinkDev == NULL)                             ||
203         !pKernelNvlink1->bEnableAli)
204     {
205         NV_PRINTF(LEVEL_INFO,
206                 "Not in ALI, checking PostRxDetLinks not supported.\n");
207         return NV_ERR_NOT_SUPPORTED;
208     }
209 
210     //
211     // Initialize Mask of links that have made it past RxDet to 0 then
212     // request to get all links from the given GPU that have gotted past RxDet
213     //
214     pKernelNvlink0->postRxDetLinkMask = 0;
215     status = knvlinkUpdatePostRxDetectLinkMask(pGpu0, pKernelNvlink0);
216     if(status != NV_OK)
217     {
218         NV_PRINTF(LEVEL_ERROR,
219                   "Getting peer0's postRxDetLinkMask failed!\n");
220         return NV_ERR_INVALID_STATE;
221     }
222 
223     // Only query if we are not in loopback
224     if (pKernelNvlink0 != pKernelNvlink1)
225     {
226         pKernelNvlink1->postRxDetLinkMask = 0;
227         status = knvlinkUpdatePostRxDetectLinkMask(pGpu1, pKernelNvlink1);
228         if(status != NV_OK)
229         {
230             NV_PRINTF(LEVEL_ERROR,
231                       "Getting peer1's postRxDetLinkMask failed!\n");
232             return NV_ERR_INVALID_STATE;
233         }
234     }
235 
236     //
237     // If the current gpu has no actively training or trained link OR
238     // if the peer gpu has no actively training or trained links then
239     // return an error. If either side has 0 links passed RxDet then
240     // there is no chance that we will find links connecting the devices
241     // further into discovery.
242     //
243     if(pKernelNvlink0->postRxDetLinkMask == 0 ||
244        pKernelNvlink1->postRxDetLinkMask == 0)
245     {
246         NV_PRINTF(LEVEL_ERROR, "Got 0 post RxDet Links!");
247         return NV_ERR_NOT_READY;
248     }
249 
250 #endif
251 
252     return status;
253 }
254 
255 NV_STATUS
256 ioctrlFaultUpTmrHandler
257 (
258     OBJGPU *pGpu,
259     OBJTMR *pTmr,
260     TMR_EVENT *pEvent
261 )
262 {
263     //NvU32 linkId = *(NvU32*)pData;
264     NV_STATUS    status = NV_OK;
265     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
266     NV2080_CTRL_NVLINK_POST_FAULT_UP_PARAMS *nvlinkPostFaultUpParams
267                  = portMemAllocNonPaged(sizeof(NV2080_CTRL_NVLINK_POST_FAULT_UP_PARAMS));
268     PNVLINK_ID   pFaultLink;
269     pFaultLink = listHead(&pKernelNvlink->faultUpLinks);
270 
271     nvlinkPostFaultUpParams->linkId = pFaultLink->linkId;
272     status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
273                         NV2080_CTRL_CMD_NVLINK_POST_FAULT_UP,
274                         (void *)nvlinkPostFaultUpParams,
275                         sizeof(NV2080_CTRL_NVLINK_POST_FAULT_UP_PARAMS));
276 
277     if (status != NV_OK)
278     {
279         NV_PRINTF(LEVEL_ERROR, "Failed to send Faultup RPC\n");
280     }
281 
282     listRemove(&pKernelNvlink->faultUpLinks, pFaultLink);
283     portMemFree(nvlinkPostFaultUpParams);
284 
285     return status;
286 }
287 
288 NV_STATUS
289 knvlinkHandleFaultUpInterrupt_GH100
290 (
291     OBJGPU       *pGpu,
292     KernelNvlink *pKernelNvlink,
293     NvU32        linkId
294 )
295 {
296     OBJTMR    *pTmr = GPU_GET_TIMER(pGpu);
297     PNVLINK_ID pFaultLink;
298     NV_STATUS status = NV_OK;
299 
300     pFaultLink = listAppendNew(&pKernelNvlink->faultUpLinks);
301     NV_ASSERT_OR_RETURN(pFaultLink != NULL, NV_ERR_GENERIC);
302     pFaultLink->linkId = linkId;
303 
304     status = tmrEventScheduleRel(pTmr, pKernelNvlink->nvlinkLinks[linkId].pTmrEvent, NVLINK_RETRAIN_TIME);
305     if (status != NV_OK)
306     {
307         NV_PRINTF(LEVEL_ERROR, "GPU (ID: %d) tmrEventScheduleRel failed for linkid %d\n",
308                   gpuGetInstance(pGpu), linkId);
309         return NV_ERR_GENERIC;
310     }
311 
312     return status;
313 }
314 
315 NV_STATUS
316 knvlinkLogAliDebugMessages_GH100
317 (
318     OBJGPU       *pGpu,
319     KernelNvlink *pKernelNvlink
320 )
321 {
322     NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS *nvlinkErrInfoParams = portMemAllocNonPaged(sizeof(NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS));
323     portMemSet(nvlinkErrInfoParams, 0, sizeof(NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS));
324     nvlinkErrInfoParams->ErrInfoFlags |= NV2080_CTRL_NVLINK_ERR_INFO_FLAGS_ALI_STATUS;
325     NvU32         i;
326     // This is a Physical, Hopper specific HAL for debug purposes.
327     NV_STATUS status = knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
328                         NV2080_CTRL_CMD_NVLINK_GET_ERR_INFO,
329                         (void *)nvlinkErrInfoParams,
330                         sizeof(NV2080_CTRL_NVLINK_GET_ERR_INFO_PARAMS));
331     if (status != NV_OK)
332     {
333         NV_PRINTF(LEVEL_ERROR, "Error getting debug info for link training!\n");
334         portMemFree(nvlinkErrInfoParams);
335         return status;
336     }
337 
338     FOR_EACH_INDEX_IN_MASK(32, i, pKernelNvlink->postRxDetLinkMask)
339     {
340         nvErrorLog_va((void *)pGpu, ALI_TRAINING_FAIL,
341                 "NVLink: Link training failed for link %u",
342                 "(0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)",
343                 i,
344                 nvlinkErrInfoParams->linkErrInfo[i].NVLIPTLnkCtrlLinkStateRequest,
345                 nvlinkErrInfoParams->linkErrInfo[i].NVLDLRxSlsmErrCntl,
346                 nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopLinkState,
347                 nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopIntr,
348                 nvlinkErrInfoParams->linkErrInfo[i].DLStatMN00,
349                 nvlinkErrInfoParams->linkErrInfo[i].DLStatUC01,
350                 nvlinkErrInfoParams->linkErrInfo[i].MinionNvlinkLinkIntr);
351 
352         if (pKernelNvlink->bLinkTrainingDebugSpew)
353             NV_PRINTF(LEVEL_ERROR,"ALI Error for GPU %d::linkId %d:"
354                     "\nNVLIPT:\n\tCTRL_LINK_STATE_REQUEST_STATUS = %X\n"
355                     "\nNVLDL :\n\tNV_NVLDL_RXSLSM_ERR_CNTL = %X\n"
356                     "\n\tNV_NVLDL_TOP_LINK_STATE = %X\n"
357                     "\n\tNV_NVLDL_TOP_INTR = %X\n"
358                     "\nMINION DLSTAT:\n\tDLSTAT MN00 = %X\n"
359                     "\n\tDLSTAT UC01 = %X\n"
360                     "\n\tNV_MINION_NVLINK_LINK_INTR = %X\n",
361                     pGpu->gpuInstance, i,
362                     nvlinkErrInfoParams->linkErrInfo[i].NVLIPTLnkCtrlLinkStateRequest,
363                     nvlinkErrInfoParams->linkErrInfo[i].NVLDLRxSlsmErrCntl,
364                     nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopLinkState,
365                     nvlinkErrInfoParams->linkErrInfo[i].NVLDLTopIntr,
366                     nvlinkErrInfoParams->linkErrInfo[i].DLStatMN00,
367                     nvlinkErrInfoParams->linkErrInfo[i].DLStatUC01,
368                     nvlinkErrInfoParams->linkErrInfo[i].MinionNvlinkLinkIntr);
369     }
370     FOR_EACH_INDEX_IN_MASK_END;
371     portMemFree(nvlinkErrInfoParams);
372     return NV_OK;
373 }
374 
375 /**
376  * @brief Check if the nvlink bandwidth setting is OFF
377  *
378  * @param[in]   pKernelNvlink         reference of KernelNvlink
379  */
380 NvBool
381 knvlinkIsBandwidthModeOff_GH100
382 (
383     KernelNvlink *pKernelNvlink
384 )
385 {
386     return (gpumgrGetGpuNvlinkBwMode() == GPU_NVLINK_BW_MODE_OFF);
387 }
388 
389 /**
390  * @brief Calculate the number of active nvlinks needs to be reduced
391  *        for direct connect GPU system
392  *
393  * @param[in]   pKernelNvlink         reference of KernelNvlink
394  */
395 NvU32
396 knvlinkGetNumLinksToBeReducedPerIoctrl_GH100
397 (
398     KernelNvlink *pKernelNvlink
399 )
400 {
401     NvU32 numlinks = 0;
402     NvU8 mode;
403 
404 #if defined(INCLUDE_NVLINK_LIB)
405     numlinks = pKernelNvlink->pNvlinkDev->numActiveLinksPerIoctrl;
406 #endif
407 
408     if (numlinks == 0)
409         goto out;
410 
411     mode = gpumgrGetGpuNvlinkBwMode();
412 
413     switch (mode)
414     {
415         case GPU_NVLINK_BW_MODE_OFF:
416             NV_PRINTF(LEVEL_ERROR, "Cannot reach here %s %d mode=%d\n",
417                       __func__, __LINE__, mode);
418             NV_ASSERT(0);
419             break;
420         case GPU_NVLINK_BW_MODE_MIN:
421             numlinks = numlinks - 1; // At least one is ative at this point.
422             break;
423         case GPU_NVLINK_BW_MODE_HALF:
424             numlinks = numlinks / 2;
425             break;
426         case GPU_NVLINK_BW_MODE_3QUARTER:
427             numlinks = numlinks / 4;
428             break;
429         default: // Treat as GPU_NVLINK_BW_MODE_FULL
430             numlinks = 0;
431             break;
432     }
433 
434 out:
435     return numlinks;
436 }
437 
438 /**
439  * @brief Calculate the effective peer link mask for HS_HUB configuration
440  *
441  * @param[in]   pGpu               OBJGPU pointer of local GPU
442  * @param[in]   pKernelNvlink      reference of KernelNvlink
443  * @param[in]   pRemoteGpu         OBJGPU pointer of remote GPU
444  * @param[in/out] pPeerLinkMask    reference of peerLinkMask
445  */
446 void
447 knvlinkGetEffectivePeerLinkMask_GH100
448 (
449     OBJGPU *pGpu,
450     KernelNvlink *pKernelNvlink,
451     OBJGPU *pRemoteGpu,
452     NvU32  *pPeerLinkMask
453 )
454 {
455     NvU32 peerLinkMask, remotePeerLinkMask, effectivePeerLinkMask, peerLinkMaskPerIoctrl;
456     NvU32 gpuInstance, remoteGpuInstance;
457     NvU32 numLinksPerIoctrl, numIoctrls;
458     KernelNvlink *pRemoteKernelNvlink;
459     NvU32 numLinksToBeReduced;
460     NvU32 linkId, count, i;
461 
462     gpuInstance = gpuGetInstance(pGpu);
463     remoteGpuInstance = gpuGetInstance(pRemoteGpu);
464 
465     // Do not support NVSwitch systems for now.
466     if (knvlinkIsGpuConnectedToNvswitch(pGpu, pKernelNvlink))
467     {
468         return;
469     }
470 
471     peerLinkMask = pKernelNvlink->peerLinkMasks[remoteGpuInstance];
472     if (peerLinkMask == 0)
473     {
474         return;
475     }
476 
477     //
478     // No need to check if remotePeerLinkMask and peerLinkMask are equal because
479     // RM will not enable P2P otherwise. Given that we have reached here means
480     // the masks must be equal.
481     //
482     pRemoteKernelNvlink = GPU_GET_KERNEL_NVLINK(pRemoteGpu);
483     remotePeerLinkMask = pRemoteKernelNvlink->peerLinkMasks[gpuInstance];
484     NV_ASSERT(nvPopCount32(remotePeerLinkMask) == nvPopCount32(peerLinkMask));
485 
486     // Find out number of active NVLinks between the two GPUs.
487     numLinksToBeReduced = knvlinkGetNumLinksToBeReducedPerIoctrl_HAL(pKernelNvlink);
488     effectivePeerLinkMask = peerLinkMask;
489 
490     if (numLinksToBeReduced == 0)
491     {
492         return;
493     }
494 
495     // Start reducing effectivePeerLinkMask...
496 
497     //
498     // To have deterministic approach, if local GPU ID is less than remote GPU
499     // ID, always trim peerLinkMask from the perspective of local GPU.
500     // Otherwise, use remote GPU for the same.
501     //
502 #if defined(INCLUDE_NVLINK_LIB)
503     numIoctrls = pKernelNvlink->pNvlinkDev->numIoctrls;
504     numLinksPerIoctrl = pKernelNvlink->pNvlinkDev->numLinksPerIoctrl;
505 #else
506     numIoctrls = 0;
507     numLinksPerIoctrl = 0;
508 #endif
509 
510     if (pGpu->gpuId < pRemoteGpu->gpuId)
511     {
512         for (i = 0; i < numIoctrls; i++)
513         {
514             count = 0;
515             peerLinkMaskPerIoctrl = peerLinkMask &
516                 (((1 << numLinksPerIoctrl) - 1) << (i * numLinksPerIoctrl));
517 
518             FOR_EACH_INDEX_IN_MASK(32, linkId, peerLinkMaskPerIoctrl)
519             {
520                 if (count == numLinksToBeReduced)
521                 {
522                     break;
523                 }
524 
525                 effectivePeerLinkMask &= (~NVBIT(linkId));
526                 count++;
527             }
528             FOR_EACH_INDEX_IN_MASK_END;
529         }
530     }
531     else
532     {
533         for (i = 0; i < numIoctrls; i++)
534         {
535             count = 0;
536             peerLinkMaskPerIoctrl = remotePeerLinkMask &
537                 (((1 << numLinksPerIoctrl) - 1) << (i * numLinksPerIoctrl));
538 
539             FOR_EACH_INDEX_IN_MASK(32, linkId, peerLinkMaskPerIoctrl)
540             {
541                 if (count == numLinksToBeReduced)
542                 {
543                     break;
544                 }
545 
546 #if defined(INCLUDE_NVLINK_LIB)
547                 effectivePeerLinkMask &=
548                     (~NVBIT(pRemoteKernelNvlink->nvlinkLinks[linkId].remoteEndInfo.linkNumber));
549 #endif
550                 count++;
551             }
552             FOR_EACH_INDEX_IN_MASK_END;
553         }
554     }
555 
556     //
557     // effectivePeerLinkMask can never be zero, otherwise we create inconsistent
558     // HW/SW state, where we say that NVLink P2P is supported, but we don't
559     // program HSHUB.
560     //
561     // So, if not enough NVLinks are present, then drop effectivePeerLinkMask.
562     //
563     *pPeerLinkMask = (effectivePeerLinkMask > 0) ? effectivePeerLinkMask : peerLinkMask;
564 }
565 
566 /*!
567  * @brief   Set unique fabric address for NVSwitch enabled systems.
568  *
569  * @param[in] pGpu           OBJGPU pointer
570  * @param[in] pKernelNvlink  KernelNvlink pointer
571  * @param[in] fabricBaseAddr Fabric Address to set
572  *
573  * @returns On success, sets unique fabric address and returns NV_OK.
574  *          On failure, returns NV_ERR_XXX.
575  */
576 NV_STATUS
577 knvlinkSetUniqueFabricBaseAddress_GH100
578 (
579     OBJGPU       *pGpu,
580     KernelNvlink *pKernelNvlink,
581     NvU64         fabricBaseAddr
582 )
583 {
584     NV_STATUS status = NV_OK;
585 
586     status = knvlinkValidateFabricBaseAddress_HAL(pGpu, pKernelNvlink,
587                                                   fabricBaseAddr);
588     if (status != NV_OK)
589     {
590         NV_PRINTF(LEVEL_ERROR, "Fabric addr validation failed for GPU %x\n",
591                   pGpu->gpuInstance);
592         return status;
593     }
594 
595     if (IsSLIEnabled(pGpu))
596     {
597         NV_PRINTF(LEVEL_ERROR,
598                   "Operation is unsupported on SLI enabled GPU %x\n",
599                   pGpu->gpuInstance);
600         return NV_ERR_NOT_SUPPORTED;
601     }
602 
603     if (pKernelNvlink->fabricBaseAddr == fabricBaseAddr)
604     {
605         NV_PRINTF(LEVEL_INFO,
606                   "The same fabric addr is being re-assigned to GPU %x\n",
607                   pGpu->gpuInstance);
608         return NV_OK;
609     }
610 
611     if (pKernelNvlink->fabricBaseAddr != NVLINK_INVALID_FABRIC_ADDR)
612     {
613         NV_PRINTF(LEVEL_ERROR, "Fabric addr is already assigned to GPU %x\n",
614                   pGpu->gpuInstance);
615         return NV_ERR_STATE_IN_USE;
616     }
617 
618     pKernelNvlink->fabricBaseAddr = fabricBaseAddr;
619 
620     NV_PRINTF(LEVEL_INFO, "Fabric base addr %llx is assigned to GPU %x\n",
621               pKernelNvlink->fabricBaseAddr, pGpu->gpuInstance);
622 
623     return NV_OK;
624 }
625 
626 /*!
627  * @brief   Check if system has enough active NVLinks and
628  *          enough NVLink bridges
629  *
630  * @param[in] pGpu           OBJGPU pointer
631  * @param[in] pKernelNvlink  KernelNvlink pointer
632  *
633  */
634 void
635 knvlinkDirectConnectCheck_GH100
636 (
637     OBJGPU       *pGpu,
638     KernelNvlink *pKernelNvlink
639 )
640 {
641     NV2080_CTRL_NVLINK_DIRECT_CONNECT_CHECK_PARAMS params = {0};
642 
643     knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
644                         NV2080_CTRL_CMD_NVLINK_DIRECT_CONNECT_CHECK,
645                         (void *)&params,
646                         sizeof(params));
647 }
648 
649 /*!
650  * @brief  Check if floorsweeping is needed for this particular chip
651  *
652  * @param[in]  pGpu            OBJGPU pointer
653  * @param[in]  pKernelNvlink   KernelNvlink pointer
654  *
655  * @returns On success, sets unique fabric address and returns NV_OK.
656  *          On failure, returns NV_ERR_XXX.
657  */
658 NvBool
659 knvlinkIsFloorSweepingNeeded_GH100
660 (
661     OBJGPU       *pGpu,
662     KernelNvlink *pKernelNvlink,
663     NvU32         numActiveLinksPerIoctrl,
664     NvU32         numLinksPerIoctrl
665 )
666 {
667 
668     //
669     // Only floorsweep down the given GPU if the following conditions are met:
670     // 1. if the number of links for the IP is > 0
671     //
672     // 2. The number of active links allowed for the IOCTRL is less then the
673     //    total number of links for the IOCTRL. No reason to spend time in code
674     //    if the exectution of it will be a NOP
675     //
676     // 3. If the GPU has never been floorswept. An optimization to make sure RM
677     //    doesn't burn cycles repeatedly running running code that will be a NOP
678     //
679     // 4. (temporary) Run only on Silicon chips. Fmodel currently doesn't support
680     //    this feature
681     //
682 
683     if ((numLinksPerIoctrl > 0 && numActiveLinksPerIoctrl > 0) &&
684         numActiveLinksPerIoctrl < numLinksPerIoctrl            &&
685         !pKernelNvlink->bFloorSwept                            &&
686         IS_SILICON(pGpu))
687     {
688         return NV_TRUE;
689     }
690 
691     return NV_FALSE;
692 }
693 
694