1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 /*!
25  * @file
26  * @brief GPU Fabric Probe handling
27  */
28 
29 
30 #include "gpu/gpu.h"
31 #include "os/os.h"
32 #include "utils/nvprintf.h"
33 #include "kernel/gpu/nvlink/kernel_nvlink.h"
34 #include "gpu/gpu_fabric_probe.h"
35 #include "nvlink_inband_msg.h"
36 #include "kernel/mem_mgr/fabric_vaspace.h"
37 #include "ctrl/ctrl2080/ctrl2080internal.h"
38 
39 // Structure to hold gpu probe information
40 typedef struct GPU_FABRIC_PROBE_INFO_KERNEL
41 {
42     NvBool bProbeRespRcvd;
43     NvU8 bwMode;
44 
45     OBJGPU *pGpu;
46 
47     nvlink_inband_gpu_probe_rsp_msg_t probeResponseMsg;
48 
49 } GPU_FABRIC_PROBE_INFO_KERNEL;
50 
51 static NV_STATUS
52 _gpuFabricProbeFullSanityCheck
53 (
54     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
55 )
56 {
57     if (pGpuFabricProbeInfoKernel == NULL)
58     {
59         return NV_ERR_NOT_SUPPORTED;
60     }
61 
62     LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner(
63                            gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu)));
64 
65     if (!gpuFabricProbeIsReceived(pGpuFabricProbeInfoKernel))
66     {
67         return NV_ERR_NOT_READY;
68     }
69 
70     if (!gpuFabricProbeIsSuccess(pGpuFabricProbeInfoKernel))
71     {
72         return pGpuFabricProbeInfoKernel->probeResponseMsg.msgHdr.status;
73     }
74 
75     return NV_OK;
76 }
77 
78 NV_STATUS
79 gpuFabricProbeGetGpuFabricHandle
80 (
81     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
82     NvU64 *pHandle
83 )
84 {
85     NV_STATUS status;
86 
87     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
88 
89     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
90 
91     *pHandle = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gpuHandle;
92 
93     return status;
94 }
95 
96 NV_STATUS
97 gpuFabricProbeGetGfId
98 (
99     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
100     NvU32 *pGfId
101 )
102 {
103     NV_STATUS status;
104 
105     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
106 
107     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
108 
109     *pGfId = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gfId;
110 
111     return status;
112 }
113 
114 NV_STATUS
115 gpuFabricProbeGetfmCaps
116 (
117     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
118     NvU64 *pFmCaps
119 )
120 {
121     NV_STATUS status;
122 
123     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
124 
125     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
126 
127     *pFmCaps = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.fmCaps;
128 
129     return status;
130 }
131 
132 NV_STATUS
133 gpuFabricProbeGetClusterUuid
134 (
135     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
136     NvUuid *pClusterUuid
137 )
138 {
139     NV_STATUS status;
140 
141     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
142 
143     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
144 
145     portMemCopy(&pClusterUuid->uuid[0],
146                 sizeof(pClusterUuid->uuid),
147                 &pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.clusterUuid.uuid[0],
148                 sizeof(pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.clusterUuid.uuid));
149 
150     return status;
151 }
152 
153 NV_STATUS
154 gpuFabricProbeGetFabricPartitionId
155 (
156     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
157     NvU16 *pFabricPartitionId
158 )
159 {
160     NV_STATUS status;
161 
162     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
163 
164     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
165 
166     *pFabricPartitionId = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.fabricPartitionId;
167 
168     return status;
169 }
170 
171 NV_STATUS
172 gpuFabricProbeGetGpaAddress
173 (
174     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
175     NvU64 *pGpaAddress
176 )
177 {
178     NV_STATUS status;
179 
180     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
181 
182     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
183 
184     *pGpaAddress = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gpaAddress;
185 
186     return status;
187 }
188 
189 NV_STATUS
190 gpuFabricProbeGetGpaAddressRange
191 (
192     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
193     NvU64 *pGpaAddressRange
194 )
195 {
196     NV_STATUS status;
197 
198     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
199 
200     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
201 
202     *pGpaAddressRange = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gpaAddressRange;
203 
204     return status;
205 }
206 
207 NV_STATUS
208 gpuFabricProbeGetFlaAddress
209 (
210     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
211     NvU64 *pFlaAddress
212 )
213 {
214     NV_STATUS status;
215 
216     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
217 
218     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
219 
220     *pFlaAddress = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.flaAddress;
221 
222     return status;
223 }
224 
225 NV_STATUS
226 gpuFabricProbeGetFlaAddressRange
227 (
228     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
229     NvU64 *pFlaAddressRange
230 )
231 {
232     NV_STATUS status;
233 
234     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
235 
236     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
237 
238     *pFlaAddressRange = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.flaAddressRange;
239 
240     return status;
241 }
242 
243 NV_STATUS
244 gpuFabricProbeGetNumProbeReqs
245 (
246     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
247     NvU64 *numProbes
248 )
249 {
250     NV2080_CTRL_CMD_INTERNAL_GET_GPU_FABRIC_PROBE_INFO_PARAMS params = { 0 };
251     RM_API *pRmApi;
252     OBJGPU *pGpu;
253 
254     if (pGpuFabricProbeInfoKernel == NULL)
255     {
256         return NV_ERR_NOT_SUPPORTED;
257     }
258 
259     pGpu = pGpuFabricProbeInfoKernel->pGpu;
260     pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
261 
262     LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner(
263                            gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu)));
264 
265     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
266           pRmApi->Control(pRmApi,
267                           pGpu->hInternalClient,
268                           pGpu->hInternalSubdevice,
269                           NV2080_CTRL_CMD_INTERNAL_GPU_GET_FABRIC_PROBE_INFO,
270                           &params,
271                           sizeof(params)));
272 
273     *numProbes = params.numProbes;
274 
275     return NV_OK;
276 }
277 
278 NvBool
279 gpuFabricProbeIsReceived
280 (
281     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
282 )
283 {
284     if (pGpuFabricProbeInfoKernel == NULL)
285     {
286         return NV_FALSE;
287     }
288 
289     LOCK_ASSERT_AND_RETURN_BOOL(rmDeviceGpuLockIsOwner(
290                                 gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu)),
291                                 NV_FALSE);
292 
293     return pGpuFabricProbeInfoKernel->bProbeRespRcvd;
294 }
295 
296 NvBool
297 gpuFabricProbeIsSuccess
298 (
299     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
300 )
301 {
302     nvlink_inband_gpu_probe_rsp_msg_t *pProbeResponseMsg;
303     nvlink_inband_msg_header_t *pProbeRespMsgHdr;
304 
305     if (pGpuFabricProbeInfoKernel == NULL)
306     {
307         return NV_FALSE;
308     }
309 
310     LOCK_ASSERT_AND_RETURN_BOOL(rmDeviceGpuLockIsOwner(gpuGetInstance(
311                                 pGpuFabricProbeInfoKernel->pGpu)),
312                                 NV_FALSE);
313 
314     pProbeResponseMsg = &pGpuFabricProbeInfoKernel->probeResponseMsg;
315     pProbeRespMsgHdr = &pProbeResponseMsg->msgHdr;
316 
317     return pProbeRespMsgHdr->status == NV_OK;
318 }
319 
320 NV_STATUS
321 gpuFabricProbeGetFmStatus
322 (
323     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
324 )
325 {
326     if (pGpuFabricProbeInfoKernel == NULL)
327     {
328         return NV_ERR_NOT_SUPPORTED;
329     }
330 
331     LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner(
332                            gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu)));
333 
334     return pGpuFabricProbeInfoKernel->probeResponseMsg.msgHdr.status;
335 }
336 
337 static void
338 _gpuFabricProbeSetupGpaRange
339 (
340     OBJGPU                *pGpu,
341     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
342 )
343 {
344     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
345     // setup GPA based system
346     if (pKernelNvlink != NULL)
347     {
348         NvU64 gpaAddress;
349         NvU64 gpaAddressSize;
350 
351         NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR,
352                     gpuFabricProbeGetGpaAddress(pGpuFabricProbeInfoKernel,
353                                                 &gpaAddress) == NV_OK);
354 
355         NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR,
356                     gpuFabricProbeGetGpaAddressRange(pGpuFabricProbeInfoKernel,
357                                                     &gpaAddressSize) == NV_OK);
358 
359         NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR,
360                     knvlinkSetUniqueFabricBaseAddress_HAL(pGpu, pKernelNvlink,
361                                                         gpaAddress) == NV_OK);
362     }
363 }
364 
365 static void
366 _gpuFabricProbeSetupFlaRange
367 (
368     OBJGPU                  *pGpu,
369     GPU_FABRIC_PROBE_INFO_KERNEL   *pGpuFabricProbeInfoKernel
370 )
371 {
372     if (pGpu->pFabricVAS != NULL)
373     {
374         NvU64 flaBaseAddress;
375         NvU64 flaSize;
376 
377         NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR,
378             gpuFabricProbeGetFlaAddress(pGpuFabricProbeInfoKernel,
379                                         &flaBaseAddress) == NV_OK);
380 
381         NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR,
382             gpuFabricProbeGetFlaAddressRange(pGpuFabricProbeInfoKernel,
383                                              &flaSize) == NV_OK);
384 
385         NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR,
386             fabricvaspaceInitUCRange(dynamicCast(pGpu->pFabricVAS, FABRIC_VASPACE),
387                                      pGpu, flaBaseAddress, flaSize) == NV_OK);
388     }
389 }
390 
391 static NV_STATUS
392 _gpuFabricProbeReceiveKernel
393 (
394     NvU32 gpuInstance,
395     NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pInbandRcvParams
396 )
397 {
398     OBJGPU *pGpu;
399     NvU32 gpuMaskUnused;
400     nvlink_inband_gpu_probe_rsp_msg_t *pProbeRespMsg;
401     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel;
402     NV_STATUS status;
403 
404     if ((pGpu = gpumgrGetGpu(gpuInstance)) == NULL)
405     {
406         NV_ASSERT_FAILED("Invalid GPU instance");
407         return NV_ERR_INVALID_ARGUMENT;
408     }
409 
410     //
411     // There is a scenario where _gpuFabricProbeStart fails in the GSP
412     // and returns failure to kernel ctrl call to start probe.
413     // This will set the pGpuFabricProbeInfoKernel to NULL.
414     // GSP also sends a probe response with failure error code.
415     // Handling this response causes kernel driver to crash since
416     // pGpuFabricProbeInfoKernel is already cleared in the kernel.
417     // This check is added to handle this scenario.
418     //
419     NV_CHECK_OR_RETURN(LEVEL_ERROR, pGpu->pGpuFabricProbeInfoKernel != NULL, NV_OK);
420 
421     NV_ASSERT(rmGpuGroupLockIsOwner(gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
422                                     &gpuMaskUnused));
423 
424     NV_ASSERT(pInbandRcvParams != NULL);
425 
426     pGpuFabricProbeInfoKernel = pGpu->pGpuFabricProbeInfoKernel;
427 
428     pProbeRespMsg = \
429         (nvlink_inband_gpu_probe_rsp_msg_t *)&pInbandRcvParams->data[0];
430 
431     portMemCopy(&pGpuFabricProbeInfoKernel->probeResponseMsg,
432                 sizeof(pGpuFabricProbeInfoKernel->probeResponseMsg),
433                 pProbeRespMsg,
434                 sizeof(*pProbeRespMsg));
435 
436     //
437     // TODO - Add additional check with versioning to continue with the
438     // timer and send lower version requests
439     //
440     pGpuFabricProbeInfoKernel->bProbeRespRcvd = NV_TRUE;
441 
442     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
443     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
444 
445     _gpuFabricProbeSetupGpaRange(pGpu, pGpuFabricProbeInfoKernel);
446     _gpuFabricProbeSetupFlaRange(pGpu, pGpuFabricProbeInfoKernel);
447 
448     return NV_OK;
449 }
450 
451 void
452 gpuFabricProbeSuspend
453 (
454     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
455 )
456 {
457     OBJGPU *pGpu;
458     RM_API *pRmApi;
459     NV_STATUS status;
460 
461     if (pGpuFabricProbeInfoKernel == NULL)
462     {
463         return;
464     }
465 
466     pGpu = pGpuFabricProbeInfoKernel->pGpu;
467     pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
468 
469     NV_ASSERT(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu)));
470 
471     NV_CHECK_OK(status, LEVEL_ERROR,
472             pRmApi->Control(pRmApi,
473                             pGpu->hInternalClient,
474                             pGpu->hInternalSubdevice,
475                             NV2080_CTRL_CMD_INTERNAL_GPU_SUSPEND_FABRIC_PROBE,
476                             NULL, 0));
477 }
478 
479 NV_STATUS
480 gpuFabricProbeResume
481 (
482     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
483 )
484 {
485     OBJGPU *pGpu;
486     RM_API *pRmApi;
487     NV2080_CTRL_CMD_INTERNAL_RESUME_GPU_FABRIC_PROBE_INFO_PARAMS params = { 0 };
488 
489     if (pGpuFabricProbeInfoKernel == NULL)
490     {
491         return NV_ERR_NOT_SUPPORTED;
492     }
493 
494     pGpu = pGpuFabricProbeInfoKernel->pGpu;
495     pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
496 
497     NV_ASSERT(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu)));
498 
499     params.bwMode = pGpuFabricProbeInfoKernel->bwMode;
500 
501     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
502               pRmApi->Control(pRmApi,
503                               pGpu->hInternalClient,
504                               pGpu->hInternalSubdevice,
505                               NV2080_CTRL_CMD_INTERNAL_GPU_RESUME_FABRIC_PROBE,
506                               &params, sizeof(params)));
507 
508     return NV_OK;
509 }
510 
511 NV_STATUS
512 gpuFabricProbeStart
513 (
514     OBJGPU *pGpu,
515     GPU_FABRIC_PROBE_INFO_KERNEL **ppGpuFabricProbeInfoKernel
516 )
517 {
518     NV_STATUS status;
519     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel;
520     NVLINK_INBAND_MSG_CALLBACK inbandMsgCbParams;
521     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
522     RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
523     NV2080_CTRL_CMD_INTERNAL_START_GPU_FABRIC_PROBE_INFO_PARAMS params = { 0 };
524 
525     LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu)));
526 
527     // Check if NVSwitch based system. If not return without doing anything
528     if (!gpuFabricProbeIsSupported(pGpu))
529     {
530         return NV_OK;
531     }
532 
533     *ppGpuFabricProbeInfoKernel = portMemAllocNonPaged(sizeof(*pGpuFabricProbeInfoKernel));
534     NV_ASSERT_OR_RETURN(*ppGpuFabricProbeInfoKernel != NULL, NV_ERR_NO_MEMORY);
535 
536     pGpuFabricProbeInfoKernel = *ppGpuFabricProbeInfoKernel;
537 
538     portMemSet(pGpuFabricProbeInfoKernel, 0, sizeof(*pGpuFabricProbeInfoKernel));
539 
540     pGpuFabricProbeInfoKernel->pGpu = pGpu;
541 
542     // Register the receive callback
543     inbandMsgCbParams.messageType = NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP;
544     inbandMsgCbParams.pCallback = _gpuFabricProbeReceiveKernel;
545     inbandMsgCbParams.wqItemFlags = (OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA |
546                             OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW);
547     status = knvlinkRegisterInbandCallback(pGpu,
548                                            pKernelNvlink,
549                                            &inbandMsgCbParams);
550     if (status != NV_OK)
551     {
552         NV_PRINTF(LEVEL_ERROR, "GPU%u Registering Inband Cb failed\n",
553                   gpuGetInstance(pGpu));
554         goto fail;
555     }
556 
557     pGpuFabricProbeInfoKernel->bwMode = gpumgrGetGpuNvlinkBwMode();
558     params.bwMode = pGpuFabricProbeInfoKernel->bwMode;
559 
560     // Send IOCTL to start probe
561     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
562             pRmApi->Control(pRmApi,
563                             pGpu->hInternalClient,
564                             pGpu->hInternalSubdevice,
565                             NV2080_CTRL_CMD_INTERNAL_GPU_START_FABRIC_PROBE,
566                             &params, sizeof(params)),
567             fail);
568 
569     return NV_OK;
570 
571 fail:
572     portMemFree(pGpuFabricProbeInfoKernel);
573     pGpu->pGpuFabricProbeInfoKernel = NULL;
574 
575     return status;
576 }
577 
578 void
579 gpuFabricProbeStop
580 (
581     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel
582 )
583 {
584     OBJGPU *pGpu;
585     KernelNvlink *pKernelNvlink;
586     RM_API *pRmApi;
587 
588     if (pGpuFabricProbeInfoKernel == NULL)
589     {
590         return;
591     }
592 
593     pGpu = pGpuFabricProbeInfoKernel->pGpu;
594 
595     NV_ASSERT_OR_RETURN_VOID(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu)));
596 
597     pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
598     NV_ASSERT(pKernelNvlink != NULL);
599 
600     // Unregister the receive callback
601     NV_ASSERT_OK(knvlinkUnregisterInbandCallback(pGpu, pKernelNvlink,
602                  NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP));
603 
604     pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
605 
606     NV_ASSERT_OK(pRmApi->Control(pRmApi,
607                                  pGpu->hInternalClient,
608                                  pGpu->hInternalSubdevice,
609                                  NV2080_CTRL_CMD_INTERNAL_GPU_STOP_FABRIC_PROBE,
610                                  NULL, 0));
611 
612     portMemFree(pGpuFabricProbeInfoKernel);
613     pGpu->pGpuFabricProbeInfoKernel = NULL;
614 }
615 
616 NvBool
617 gpuFabricProbeIsSupported
618 (
619     OBJGPU *pGpu
620 )
621 {
622     if (pGpu->fabricProbeRetryDelay == 0)
623     {
624         NV_PRINTF(LEVEL_INFO, "GPU%u Probe handling is disabled\n",
625                   gpuGetInstance(pGpu));
626         return NV_FALSE;
627     }
628 
629     if (GPU_GET_KERNEL_NVLINK(pGpu) == NULL)
630     {
631         return NV_FALSE;
632     }
633 
634     return NV_TRUE;
635 }
636 
637 static void
638 _gpuFabricProbeInvalidate
639 (
640     OBJGPU *pGpu
641 )
642 {
643     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel =
644                                     pGpu->pGpuFabricProbeInfoKernel;
645     KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
646     FABRIC_VASPACE *pFabricVAS = dynamicCast(pGpu->pFabricVAS, FABRIC_VASPACE);
647 
648     pGpuFabricProbeInfoKernel->bProbeRespRcvd = NV_FALSE;
649 
650     if (pKernelNvlink != NULL)
651         knvlinkClearUniqueFabricBaseAddress_HAL(pGpu, pKernelNvlink);
652 
653     if (pFabricVAS != NULL)
654         fabricvaspaceClearUCRange(pFabricVAS);
655 }
656 
//
// Returns NV_ERR_NOT_SUPPORTED from the *enclosing* function when the
// NVLINK_INBAND_FM_CAPS_BW_MODE_<mode> bit is not set in fmCaps; otherwise
// falls through. fmCaps is parenthesized so that an expression argument is
// evaluated as a whole before being masked.
//
#define GPU_FABRIC_CHECK_BW_MODE(fmCaps, mode)                      \
    do                                                              \
    {                                                               \
        if (((fmCaps) & NVLINK_INBAND_FM_CAPS_BW_MODE_##mode) == 0) \
        {                                                           \
            return NV_ERR_NOT_SUPPORTED;                            \
        }                                                           \
    } while (0)
663 
664 static NV_STATUS
665 _gpuFabricProbeUpdateBwMode
666 (
667     OBJGPU *pGpu,
668     NvU8 mode
669 )
670 {
671     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel =
672                                                 pGpu->pGpuFabricProbeInfoKernel;
673     RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
674 
675     pGpuFabricProbeInfoKernel->bwMode = mode;
676 
677     gpuFabricProbeSuspend(pGpuFabricProbeInfoKernel);
678 
679     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
680           pRmApi->Control(pRmApi,
681                           pGpu->hInternalClient,
682                           pGpu->hInternalSubdevice,
683                           NV2080_CTRL_CMD_INTERNAL_GPU_INVALIDATE_FABRIC_PROBE,
684                           NULL, 0));
685 
686     _gpuFabricProbeInvalidate(pGpu);
687 
688     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, gpuFabricProbeResume(pGpuFabricProbeInfoKernel));
689 
690     return NV_OK;
691 }
692 
693 NV_STATUS
694 gpuFabricProbeSetBwMode
695 (
696     NvU8 mode
697 )
698 {
699     NvU32 attachedGpuCount;
700     NvU32 attachedGpuMask;
701     NV_STATUS status;
702     NvU32 gpuIndex;
703     OBJGPU *pGpu;
704 
705     status = gpumgrGetGpuAttachInfo(&attachedGpuCount, &attachedGpuMask);
706     if (status != NV_OK)
707     {
708         return NV_ERR_INVALID_STATE;
709     }
710 
711     // Check if all GPUs belong to NvSwitch
712     gpuIndex = 0;
713     for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex);
714         pGpu != NULL;
715         pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex))
716     {
717         if (!gpuFabricProbeIsSupported(pGpu))
718         {
719             // For directed connected system
720             return NV_OK;
721         }
722     }
723 
724     //
725     // Check if all GPUs received fabric probe and
726     //       if the mode is supported on all GPUs.
727     //
728     gpuIndex = 0;
729     for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex);
730         pGpu != NULL;
731         pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex))
732     {
733         if (!gpuFabricProbeIsReceived(pGpu->pGpuFabricProbeInfoKernel) ||
734             !gpuFabricProbeIsSuccess(pGpu->pGpuFabricProbeInfoKernel))
735         {
736             return NV_ERR_NOT_READY;
737         }
738 
739         NvU64 fmCaps = pGpu->pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.fmCaps;
740         switch(mode)
741         {
742             case GPU_NVLINK_BW_MODE_MIN:
743                 GPU_FABRIC_CHECK_BW_MODE(fmCaps, MIN);
744                 break;
745             case GPU_NVLINK_BW_MODE_HALF:
746                 GPU_FABRIC_CHECK_BW_MODE(fmCaps, HALF);
747                 break;
748             case GPU_NVLINK_BW_MODE_3QUARTER:
749                 GPU_FABRIC_CHECK_BW_MODE(fmCaps, 3QUARTER);
750                 break;
751             case GPU_NVLINK_BW_MODE_OFF:
752                 return NV_OK; // Don't need to ask FM
753             default:
754                 break;
755         }
756     }
757 
758     gpuIndex = 0;
759     for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex);
760         pGpu != NULL;
761         pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex))
762     {
763         FABRIC_VASPACE *pFabricVAS = dynamicCast(pGpu->pFabricVAS,
764                                                  FABRIC_VASPACE);
765         if (pFabricVAS == NULL)
766         {
767             continue;
768         }
769 
770         if (fabricvaspaceIsInUse(pFabricVAS))
771         {
772             return NV_ERR_STATE_IN_USE;
773         }
774     }
775 
776     gpuIndex = 0;
777     for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex);
778         pGpu != NULL;
779         pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex))
780     {
781         status = _gpuFabricProbeUpdateBwMode(pGpu, mode);
782         if (status != NV_OK)
783         {
784             return status;
785         }
786     }
787 
788     return NV_OK;
789 }
790 
791 NV_STATUS
792 gpuFabricProbeGetlinkMaskToBeReduced
793 (
794     GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel,
795     NvU32 *linkMaskToBeReduced
796 )
797 {
798     NV_STATUS status;
799 
800     status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel);
801     NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status);
802 
803     *linkMaskToBeReduced = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.linkMaskToBeReduced;
804     return NV_OK;
805 }
806